Skip to content

Commit cdda69d

Browse files
committed
GLSL: Implement SPV_NV_cooperative_matrix2, SPV_NV_tensor_addressing.
1 parent 5b82b65 commit cdda69d

File tree

9 files changed

+659
-37
lines changed

9 files changed

+659
-37
lines changed

reference/shaders-no-opt/vulkan/comp/nv-coopmat-2.vk.nocompat.spv16.comp.vk

Lines changed: 111 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,27 @@
77
#error No extension available for FP16.
88
#endif
99
#extension GL_EXT_shader_16bit_storage : require
10+
#extension GL_EXT_buffer_reference2 : require
1011
#extension GL_KHR_cooperative_matrix : require
1112
#extension GL_KHR_memory_scope_semantics : require
1213
#extension GL_NV_cooperative_matrix2 : require
1314
#extension GL_EXT_float_e4m3 : require
1415
layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
1516

17+
layout(buffer_reference) buffer A_buffer_ref;
18+
19+
layout(constant_id = 0) const uint Clamp = 0u;
20+
21+
layout(buffer_reference, std430) buffer A_buffer_ref
22+
{
23+
float16_t data_a[];
24+
};
25+
26+
layout(set = 0, binding = 0, std430) buffer A_buffer
27+
{
28+
float16_t data_a[];
29+
} _157;
30+
1631
void accum_to_a_cast()
1732
{
1833
coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseAccumulator> Accum = coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseAccumulator>(float16_t(0.0));
@@ -34,9 +49,99 @@ void value_cast()
3449
void saturated_cast()
3550
{
3651
coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseAccumulator> Accum = coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseAccumulator>(float16_t(0.0));
37-
coopmat<floate4m3_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseB> _52;
38-
saturatedConvertEXT(_52, Accum);
39-
coopmat<floate4m3_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseB> B = _52;
52+
coopmat<floate4m3_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseB> _107;
53+
saturatedConvertEXT(_107, Accum);
54+
coopmat<floate4m3_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseB> B = _107;
55+
}
56+
57+
void tensor_layouts()
58+
{
59+
tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> layout1 = createTensorLayoutNV(2u, gl_CooperativeMatrixClampModeConstantNV);
60+
tensorLayoutNV<2, (Clamp)> layout2 = createTensorLayoutNV(2u, (Clamp));
61+
tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> layout3 = setTensorLayoutClampValueNV(layout1, 42u);
62+
layout1 = setTensorLayoutBlockSizeNV(layout1, 1u, 16u);
63+
layout1 = setTensorLayoutBlockSizeNV(layout1, 1u, 16u);
64+
layout1 = setTensorLayoutDimensionNV(layout1, 128u, 128u);
65+
layout1 = setTensorLayoutDimensionNV(layout1, 128u, 128u);
66+
layout1 = setTensorLayoutDimensionNV(layout1, 128u, 128u);
67+
layout1 = setTensorLayoutStrideNV(layout1, 1u, 1u);
68+
}
69+
70+
float16_t decodeLoad(const in A_buffer_ref buf, const in uint blockCoord[2], const in uint coordInBlock[2])
71+
{
72+
return buf.data_a[0];
73+
}
74+
75+
void load_stores()
76+
{
77+
uint offset = 17u;
78+
tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> layout1 = createTensorLayoutNV(2u, gl_CooperativeMatrixClampModeConstantNV);
79+
tensorViewNV<2u, false, 0u, 1u> view = createTensorViewNV(2u, false, 0u, 1u);
80+
coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseA> A;
81+
coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseA> _163;
82+
coopMatLoadTensorNV(_163, _157.data_a, offset, layout1);
83+
A = _163;
84+
coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseA> _169;
85+
coopMatLoadTensorNV(_169, _157.data_a, offset, sliceTensorLayoutNV(layout1, 16u, 16u, 0u, 16u));
86+
A = _169;
87+
coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseA> _175;
88+
coopMatLoadTensorNV(_175, _157.data_a, offset, layout1, view);
89+
A = _175;
90+
coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseA> _180;
91+
coopMatLoadTensorNV(_180, _157.data_a, offset, layout1, decodeLoad);
92+
A = _180;
93+
coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseA> _186;
94+
coopMatLoadTensorNV(_186, _157.data_a, offset, layout1, view, decodeLoad);
95+
A = _186;
96+
coopMatStoreTensorNV(A, _157.data_a, offset, sliceTensorLayoutNV(layout1, 16u, 16u, 0u, 16u));
97+
coopMatStoreTensorNV(A, _157.data_a, offset, sliceTensorLayoutNV(layout1, 16u, 16u, 0u, 16u), view);
98+
}
99+
100+
float16_t maxReduce(const in float16_t x, const in float16_t y)
101+
{
102+
return max(x, y);
103+
}
104+
105+
float16_t maxReduceIndirect(const in float16_t x, const in float16_t y)
106+
{
107+
return maxReduce(x, y);
108+
}
109+
110+
float16_t Exp(const in uint row, const in uint col, const in float16_t elem)
111+
{
112+
return exp(elem);
113+
}
114+
115+
float16_t ExpWithArg(const in uint row, const in uint col, const in float16_t elem, const in bool maybe)
116+
{
117+
if (maybe)
118+
{
119+
return exp(elem);
120+
}
121+
else
122+
{
123+
return elem;
124+
}
125+
}
126+
127+
void callback_functions()
128+
{
129+
coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseAccumulator> A;
130+
coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseAccumulator> _201;
131+
coopMatReduceNV(_201, A, gl_CooperativeMatrixReduceRowNV, maxReduce);
132+
coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseAccumulator> reduced = _201;
133+
coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseAccumulator> _204;
134+
coopMatReduceNV(_204, reduced, gl_CooperativeMatrixReduceRowAndColumnNV, maxReduce);
135+
reduced = _204;
136+
coopmat<float16_t, gl_ScopeSubgroup, 8u, 8u, gl_MatrixUseAccumulator> _211;
137+
coopMatReduceNV(_211, A, gl_CooperativeMatrixReduce2x2NV, maxReduceIndirect);
138+
coopmat<float16_t, gl_ScopeSubgroup, 8u, 8u, gl_MatrixUseAccumulator> B = _211;
139+
coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseAccumulator> _213;
140+
coopMatPerElementNV(_213, A, Exp);
141+
A = _213;
142+
coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseAccumulator> _216;
143+
coopMatPerElementNV(_216, A, ExpWithArg, true);
144+
A = _216;
40145
}
41146

42147
void main()
@@ -45,5 +150,8 @@ void main()
45150
accum_to_b_cast();
46151
value_cast();
47152
saturated_cast();
153+
tensor_layouts();
154+
load_stores();
155+
callback_functions();
48156
}
49157

shaders-no-opt/vulkan/comp/nv-coopmat-2.vk.nocompat.spv16.comp

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,38 @@
22
#extension GL_KHR_memory_scope_semantics : require
33
#extension GL_KHR_cooperative_matrix : require
44
#extension GL_NV_cooperative_matrix2 : require
5+
#extension GL_EXT_buffer_reference : require
56
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
7+
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
68
#extension GL_EXT_float_e5m2 : require
79
#extension GL_EXT_float_e4m3 : require
810

911
layout(local_size_x = 1) in;
12+
layout(constant_id = 0) const uint32_t Clamp = 0;
13+
layout (binding = 0) buffer A_buffer {float16_t data_a[];};
14+
layout(buffer_reference, std430, buffer_reference_align = 8) buffer A_buffer_ref {float16_t data_a[];};
15+
16+
float16_t Exp(const in uint32_t row, const in uint32_t col, const in float16_t elem)
17+
{
18+
return exp(elem);
19+
}
20+
21+
float16_t ExpWithArg(const in uint32_t row, const in uint32_t col, const in float16_t elem, const in bool maybe)
22+
{
23+
if(maybe) {
24+
return exp(elem);
25+
} else {
26+
return elem;
27+
}
28+
}
29+
30+
float16_t maxReduce(const in float16_t x, const in float16_t y) {
31+
return max(x, y);
32+
}
33+
34+
float16_t maxReduceIndirect(const in float16_t x, const in float16_t y) {
35+
return maxReduce(x, y);
36+
}
1037

1138
void accum_to_a_cast()
1239
{
@@ -41,10 +68,72 @@ void saturated_cast()
4168
saturatedConvertEXT(B, Accum);
4269
}
4370

71+
void tensor_layouts() {
72+
tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> layout1 = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
73+
tensorLayoutNV<2, Clamp> layout2 = createTensorLayoutNV(2, Clamp);
74+
tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> layout3 = setTensorLayoutClampValueNV(layout1, 42);
75+
76+
const uint32_t BLOCK_SIZE = 16;
77+
layout1 = setTensorLayoutBlockSizeNV(layout1, 1, BLOCK_SIZE);
78+
layout1 = setTensorLayoutBlockSizeNV(layout1, 1, 16);
79+
80+
const int N = 128;
81+
const int D = 128;
82+
const int KV = 128;
83+
layout1 = setTensorLayoutDimensionNV(layout1, N, D);
84+
layout1 = setTensorLayoutDimensionNV(layout1, KV, D);
85+
layout1 = setTensorLayoutDimensionNV(layout1, KV, D);
86+
layout1 = setTensorLayoutStrideNV(layout1, 1, 1);
87+
}
88+
89+
90+
void tensor_views() {
91+
tensorViewNV<2, false> view1 = createTensorViewNV(2, false);
92+
tensorViewNV<2, false, 0, 1> view2 = createTensorViewNV(2, false);
93+
tensorViewNV<2, false, 1, 0> viewTransposed = createTensorViewNV(2, false, 1, 0);
94+
view1 = setTensorViewClipNV(view1, 0, 16, 0, 16);
95+
view1 = setTensorViewDimensionsNV(view1, 256, 256);
96+
view1 = setTensorViewStrideNV(view1, 2, 1);
97+
}
98+
99+
float16_t decodeLoad(const in A_buffer_ref buf, const in uint32_t blockCoord[2], const in uint32_t coordInBlock[2]) {
100+
return buf.data_a[0];
101+
}
102+
103+
void load_stores() {
104+
uint32_t offset = 17;
105+
coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A;
106+
tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> layout1 = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
107+
tensorViewNV<2, false, 0, 1> view = createTensorViewNV(2, false, 0, 1);
108+
109+
coopMatLoadTensorNV(A, data_a, offset, layout1);
110+
coopMatLoadTensorNV(A, data_a, offset, sliceTensorLayoutNV(layout1, 16, 16, 0, 16));
111+
coopMatLoadTensorNV(A, data_a, offset, layout1, view);
112+
coopMatLoadTensorNV(A, data_a, offset, layout1, decodeLoad);
113+
coopMatLoadTensorNV(A, data_a, offset, layout1, view, decodeLoad);
114+
coopMatStoreTensorNV(A, data_a, offset, sliceTensorLayoutNV(layout1, 16, 16, 0, 16));
115+
coopMatStoreTensorNV(A, data_a, offset, sliceTensorLayoutNV(layout1, 16, 16, 0, 16), view);
116+
}
117+
118+
void callback_functions() {
119+
coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> A;
120+
coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> reduced;
121+
coopMatReduceNV(reduced, A, gl_CooperativeMatrixReduceRowNV, maxReduce);
122+
coopMatReduceNV(reduced, reduced, gl_CooperativeMatrixReduceRowAndColumnNV, maxReduce);
123+
coopmat<float16_t, gl_ScopeSubgroup, 8, 8, gl_MatrixUseAccumulator> B;
124+
coopMatReduceNV(B, A, gl_CooperativeMatrixReduce2x2NV, maxReduceIndirect);
125+
126+
coopMatPerElementNV(A, A, Exp);
127+
coopMatPerElementNV(A, A, ExpWithArg, true);
128+
}
129+
44130
void main()
45131
{
46132
accum_to_a_cast();
47133
accum_to_b_cast();
48134
value_cast();
49135
saturated_cast();
136+
tensor_layouts();
137+
load_stores();
138+
callback_functions();
50139
}

spirv_common.hpp

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,9 @@ void join_helper(StringStream<> &stream, T &&t, Ts &&... ts)
7676
}
7777
} // namespace inner
7878

79+
// From spec on Dims: "The value must be greater than zero and less than or equal to 5."
80+
static constexpr const size_t TENSOR_VIEW_NV_MAX_DIMS = 5;
81+
7982
class Bitset
8083
{
8184
public:
@@ -584,6 +587,8 @@ struct SPIRType : IVariant
584587
AccelerationStructure,
585588
RayQuery,
586589
CoopVecNV,
590+
TensorLayoutNv,
591+
TensorViewNv,
587592

588593
// Keep internal types at the end.
589594
ControlPointArray,
@@ -642,6 +647,19 @@ struct SPIRType : IVariant
642647
uint32_t rank;
643648
uint32_t shape;
644649
} tensor;
650+
651+
struct
652+
{
653+
uint32_t dim_id;
654+
uint32_t clamp_mode_id;
655+
} tensorLayoutNv;
656+
657+
struct
658+
{
659+
uint32_t dim_id;
660+
uint32_t has_dimensions_id;
661+
int32_t dim_ids[TENSOR_VIEW_NV_MAX_DIMS];
662+
} tensorViewNv;
645663
} ext;
646664

647665
spv::StorageClass storage = spv::StorageClassGeneric;
@@ -1010,6 +1028,8 @@ struct SPIRFunction : IVariant
10101028
// read and write counts as access to the function arguments
10111029
// is not local to the function in question.
10121030
bool alias_global_variable;
1031+
spv::StorageClass storage;
1032+
bool force_const;
10131033
};
10141034

10151035
// When calling a function, and we're remapping separate image samplers,
@@ -1057,7 +1077,7 @@ struct SPIRFunction : IVariant
10571077
void add_parameter(TypeID parameter_type, ID id, bool alias_global_variable = false)
10581078
{
10591079
// Arguments are read-only until proven otherwise.
1060-
arguments.push_back({ parameter_type, id, 0u, 0u, alias_global_variable });
1080+
arguments.push_back({ parameter_type, id, 0u, 0u, alias_global_variable, spv::StorageClassGeneric, false });
10611081
}
10621082

10631083
// Hooks to be run when the function returns.

0 commit comments

Comments
 (0)