Skip to content

Commit 1824631

Browse files
committed
glsl: implement SPV_NV_cooperative_vector
Specification: https://github.khronos.org/SPIRV-Registry/extensions/NV/SPV_NV_cooperative_vector.html
The implementation follows the existing code for SPV_KHR_cooperative_matrix.
In a follow-up, the extension could be mapped to the HLSL long vector proposal: https://github.com/microsoft/hlsl-specs/blob/main/proposals/0026-hlsl-long-vector-type.md
1 parent d8e3e2b commit 1824631

11 files changed

+373
-56
lines changed
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
#version 450
2+
#if defined(GL_AMD_gpu_shader_half_float)
3+
#extension GL_AMD_gpu_shader_half_float : require
4+
#elif defined(GL_EXT_shader_explicit_arithmetic_types_float16)
5+
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
6+
#else
7+
#error No extension available for FP16.
8+
#endif
9+
#extension GL_EXT_shader_16bit_storage : require
10+
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
11+
#extension GL_EXT_shader_8bit_storage : require
12+
#extension GL_NV_cooperative_vector : require
13+
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
14+
15+
layout(set = 0, binding = 0, std430) buffer Q
16+
{
17+
float16_t data_q[];
18+
} _15;
19+
20+
void main()
21+
{
22+
coopvecNV<float16_t, 16u> _20;
23+
coopVecLoadNV(_20, _15.data_q, 16u);
24+
coopvecNV<float16_t, 16u> tempArg = _20;
25+
coopvecNV<float16_t, 16u> inVec = tempArg;
26+
coopVecStoreNV(inVec, _15.data_q, 16u);
27+
coopvecNV<float16_t, 16u> _33;
28+
coopVecMatMulAddNV(_33, inVec, gl_ComponentTypeFloat16NV, _15.data_q, 16u, gl_ComponentTypeFloat16NV, _15.data_q, 16u, gl_ComponentTypeFloat16NV, 16u, 16u, 4, false, 2u);
29+
coopvecNV<float16_t, 16u> tempArg_1 = _33;
30+
inVec = tempArg_1;
31+
coopvecNV<float16_t, 16u> _38;
32+
coopVecMatMulNV(_38, inVec, gl_ComponentTypeFloat16NV, _15.data_q, 16u, gl_ComponentTypeFloat16NV, 16u, 16u, 4, false, 2u);
33+
coopvecNV<float16_t, 16u> tempArg_2 = _38;
34+
inVec = tempArg_2;
35+
coopvecNV<float16_t, 8u> a = coopvecNV<float16_t, 8u>(float16_t(0.0));
36+
inVec = max(inVec, inVec);
37+
coopVecOuterProductAccumulateNV(inVec, inVec, _15.data_q, 0u, 0u, 4, gl_ComponentTypeFloat16NV);
38+
coopVecReduceSumAccumulateNV(inVec, _15.data_q, 0u);
39+
coopvecNV<int8_t, 8u> a_8bit = coopvecNV<int8_t, 8u>(int8_t(0));
40+
coopvecNV<int, 8u> b_32bit = coopvecNV<int, 8u>(0);
41+
b_32bit = (max((coopvecNV<int, 8u>(a_8bit)), b_32bit));
42+
}
43+

reference/opt/shaders/vulkan/comp/replicated-composites.spv16.vk.nocompat.comp.vk

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,12 @@
99
#extension GL_EXT_shader_16bit_storage : require
1010
#extension GL_KHR_cooperative_matrix : require
1111
#extension GL_KHR_memory_scope_semantics : require
12+
#extension GL_NV_cooperative_vector : require
1213
layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
1314

1415
layout(constant_id = 0) const float spec_const = 0.0;
15-
const vec4 _29 = vec4(spec_const);
16-
float _53;
16+
const vec4 _33 = vec4(spec_const);
17+
float _62;
1718

1819
layout(set = 0, binding = 0, std140) uniform UBO
1920
{
@@ -25,11 +26,13 @@ void main()
2526
vec4 a = vec4(0.0);
2627
mat4 b = mat4(vec4(1.0), vec4(1.0), vec4(1.0), vec4(1.0));
2728
coopmat<float16_t, gl_ScopeWorkgroup, 16u, 16u, gl_MatrixUseAccumulator> matrix = coopmat<float16_t, gl_ScopeWorkgroup, 16u, 16u, gl_MatrixUseAccumulator>(float16_t(0.0));
28-
vec4 c = _29;
29-
vec4 _44 = vec4(ubo.uniform_float);
30-
vec4 d = _44;
31-
mat4 e = mat4(_44, _44, _44, _44);
32-
float16_t _51 = float16_t(ubo.uniform_float);
33-
coopmat<float16_t, gl_ScopeWorkgroup, 16u, 16u, gl_MatrixUseAccumulator> matrix2 = coopmat<float16_t, gl_ScopeWorkgroup, 16u, 16u, gl_MatrixUseAccumulator>(_51);
29+
coopvecNV<float16_t, 16u> vec = coopvecNV<float16_t, 16u>(float16_t(0.0));
30+
vec4 c = _33;
31+
vec4 _48 = vec4(ubo.uniform_float);
32+
vec4 d = _48;
33+
mat4 e = mat4(_48, _48, _48, _48);
34+
float16_t _55 = float16_t(ubo.uniform_float);
35+
coopmat<float16_t, gl_ScopeWorkgroup, 16u, 16u, gl_MatrixUseAccumulator> matrix2 = coopmat<float16_t, gl_ScopeWorkgroup, 16u, 16u, gl_MatrixUseAccumulator>(_55);
36+
coopvecNV<float16_t, 16u> vec_dynamic = coopvecNV<float16_t, 16u>(_55);
3437
}
3538

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
#version 450
2+
#if defined(GL_AMD_gpu_shader_half_float)
3+
#extension GL_AMD_gpu_shader_half_float : require
4+
#elif defined(GL_EXT_shader_explicit_arithmetic_types_float16)
5+
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
6+
#else
7+
#error No extension available for FP16.
8+
#endif
9+
#extension GL_EXT_shader_16bit_storage : require
10+
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
11+
#extension GL_EXT_shader_8bit_storage : require
12+
#extension GL_NV_cooperative_vector : require
13+
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
14+
15+
layout(set = 0, binding = 0, std430) buffer Q
16+
{
17+
float16_t data_q[];
18+
} _15;
19+
20+
void main()
21+
{
22+
coopvecNV<float16_t, 16u> _20;
23+
coopVecLoadNV(_20, _15.data_q, 16u);
24+
coopvecNV<float16_t, 16u> tempArg = _20;
25+
coopvecNV<float16_t, 16u> inVec = tempArg;
26+
coopVecStoreNV(inVec, _15.data_q, 16u);
27+
coopvecNV<float16_t, 16u> _33;
28+
coopVecMatMulAddNV(_33, inVec, gl_ComponentTypeFloat16NV, _15.data_q, 16u, gl_ComponentTypeFloat16NV, _15.data_q, 16u, gl_ComponentTypeFloat16NV, 16u, 16u, 4, false, 2u);
29+
coopvecNV<float16_t, 16u> tempArg_1 = _33;
30+
inVec = tempArg_1;
31+
coopvecNV<float16_t, 16u> _38;
32+
coopVecMatMulNV(_38, inVec, gl_ComponentTypeFloat16NV, _15.data_q, 16u, gl_ComponentTypeFloat16NV, 16u, 16u, 4, false, 2u);
33+
coopvecNV<float16_t, 16u> tempArg_2 = _38;
34+
inVec = tempArg_2;
35+
coopvecNV<float16_t, 8u> a = coopvecNV<float16_t, 8u>(float16_t(0.0));
36+
inVec = max(inVec, inVec);
37+
coopVecOuterProductAccumulateNV(inVec, inVec, _15.data_q, 0u, 0u, 4, gl_ComponentTypeFloat16NV);
38+
coopVecReduceSumAccumulateNV(inVec, _15.data_q, 0u);
39+
coopvecNV<int8_t, 8u> a_8bit = coopvecNV<int8_t, 8u>(int8_t(0));
40+
coopvecNV<int, 8u> b_32bit = coopvecNV<int, 8u>(0);
41+
b_32bit = (max((coopvecNV<int, 8u>(a_8bit)), b_32bit));
42+
}
43+

reference/shaders/vulkan/comp/replicated-composites.spv16.vk.nocompat.comp.vk

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,12 @@
99
#extension GL_EXT_shader_16bit_storage : require
1010
#extension GL_KHR_cooperative_matrix : require
1111
#extension GL_KHR_memory_scope_semantics : require
12+
#extension GL_NV_cooperative_vector : require
1213
layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
1314

1415
layout(constant_id = 0) const float spec_const = 0.0;
15-
const vec4 _29 = vec4(spec_const);
16-
const float _34[8] = float[](1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0);
16+
const vec4 _33 = vec4(spec_const);
17+
const float _38[8] = float[](1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0);
1718

1819
layout(set = 0, binding = 0, std140) uniform UBO
1920
{
@@ -25,10 +26,13 @@ void main()
2526
vec4 a = vec4(0.0);
2627
mat4 b = mat4(vec4(1.0), vec4(1.0), vec4(1.0), vec4(1.0));
2728
coopmat<float16_t, gl_ScopeWorkgroup, 16u, 16u, gl_MatrixUseAccumulator> matrix = coopmat<float16_t, gl_ScopeWorkgroup, 16u, 16u, gl_MatrixUseAccumulator>(float16_t(0.0));
28-
vec4 c = _29;
29+
coopvecNV<float16_t, 16u> vec = coopvecNV<float16_t, 16u>(float16_t(0.0));
30+
vec4 c = _33;
2931
vec4 d = vec4(ubo.uniform_float);
3032
mat4 e = mat4(d, d, d, d);
31-
float16_t _51 = float16_t(ubo.uniform_float);
32-
coopmat<float16_t, gl_ScopeWorkgroup, 16u, 16u, gl_MatrixUseAccumulator> matrix2 = coopmat<float16_t, gl_ScopeWorkgroup, 16u, 16u, gl_MatrixUseAccumulator>(_51);
33+
float16_t _55 = float16_t(ubo.uniform_float);
34+
coopmat<float16_t, gl_ScopeWorkgroup, 16u, 16u, gl_MatrixUseAccumulator> matrix2 = coopmat<float16_t, gl_ScopeWorkgroup, 16u, 16u, gl_MatrixUseAccumulator>(_55);
35+
float16_t _60 = float16_t(ubo.uniform_float);
36+
coopvecNV<float16_t, 16u> vec_dynamic = coopvecNV<float16_t, 16u>(_60);
3337
}
3438

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
#version 450
2+
3+
#extension GL_EXT_shader_16bit_storage : require
4+
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
5+
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
6+
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
7+
8+
#extension GL_KHR_memory_scope_semantics : enable
9+
#extension GL_NV_cooperative_vector : enable
10+
#extension GL_EXT_buffer_reference : enable
11+
#extension GL_EXT_null_initializer : enable
12+
13+
layout(local_size_x = 64) in;
14+
15+
layout (binding = 0) buffer Q {float16_t data_q[];};
16+
17+
18+
void main()
19+
{
20+
coopvecNV<float16_t, 16> inVec;
21+
coopvecNV<float16_t, 16> outVec;
22+
23+
24+
coopVecLoadNV(inVec, data_q, 16);
25+
coopVecStoreNV(inVec, data_q, 16);
26+
27+
const int matrixLayout = 4;
28+
const int interpretation = gl_ComponentTypeFloat16NV;
29+
coopVecMatMulAddNV(inVec,
30+
inVec,
31+
interpretation,
32+
data_q,
33+
16,
34+
interpretation,
35+
data_q,
36+
16,
37+
interpretation,
38+
16, 16,
39+
matrixLayout,
40+
false,
41+
2);
42+
coopVecMatMulNV(inVec,
43+
inVec,
44+
interpretation,
45+
data_q,
46+
16,
47+
interpretation,
48+
16, 16,
49+
matrixLayout,
50+
false,
51+
2);
52+
coopvecNV<float16_t, 8> a = coopvecNV<float16_t, 8>(0);
53+
inVec = max(inVec, inVec);
54+
55+
coopVecOuterProductAccumulateNV(inVec, inVec, data_q, 0, 0, matrixLayout, gl_ComponentTypeFloat16NV);
56+
coopVecReduceSumAccumulateNV(inVec, data_q, 0);
57+
58+
coopvecNV<int8_t, 8> a_8bit = coopvecNV<int8_t, 8>(0);
59+
coopvecNV<int32_t, 8> b_32bit = coopvecNV<int32_t, 8>(0);
60+
b_32bit = max(coopvecNV<int32_t, 8>(a_8bit), b_32bit);
61+
}

shaders/vulkan/comp/replicated-composites.spv16.vk.nocompat.comp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
#extension GL_KHR_memory_scope_semantics : enable
77
#extension GL_KHR_cooperative_matrix : enable
8-
//#extension GL_NV_cooperative_vector : enable
8+
#extension GL_NV_cooperative_vector : enable
99
layout (constant_id = 0) const float spec_const = 0;
1010

1111
layout (set = 0, binding = 0) uniform UBO {
@@ -17,12 +17,13 @@ void main() {
1717
vec4 a = vec4(0.0);
1818
mat4 b = mat4(vec4(1.0), vec4(1.0), vec4(1.0), vec4(1.0));
1919
coopmat<float16_t, gl_ScopeWorkgroup, 16, 16, gl_MatrixUseAccumulator> matrix = coopmat<float16_t, gl_ScopeWorkgroup, 16, 16, gl_MatrixUseAccumulator>(0);
20-
//coopvecNV<float16_t, 16> vec = coopvecNV<float16_t, 16>(0);
20+
coopvecNV<float16_t, 16> vec = coopvecNV<float16_t, 16>(0);
2121
vec4 c = vec4(spec_const);
2222
float array[] = float[](1.0, 1.0, 1.0, 1.0, 1.0, 1.0f, 1.0, 1.0f);
2323

2424
// runtime variables
2525
vec4 d = vec4(ubo.uniform_float);
2626
mat4 e = mat4(d, d, d, d);
2727
coopmat<float16_t, gl_ScopeWorkgroup, 16, 16, gl_MatrixUseAccumulator> matrix2 = coopmat<float16_t, gl_ScopeWorkgroup, 16, 16, gl_MatrixUseAccumulator>(ubo.uniform_float);
28+
coopvecNV<float16_t, 16> vec_dynamic = coopvecNV<float16_t, 16>(ubo.uniform_float);
2829
}

spirv_common.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -574,6 +574,7 @@ struct SPIRType : IVariant
574574
Sampler,
575575
AccelerationStructure,
576576
RayQuery,
577+
CoopVecNv,
577578

578579
// Keep internal types at the end.
579580
ControlPointArray,
@@ -618,6 +619,12 @@ struct SPIRType : IVariant
618619
uint32_t scope_id = 0;
619620
} cooperative;
620621

622+
struct
623+
{
624+
uint32_t component_type_id = 0;
625+
uint32_t component_count_id = 0;
626+
} coopVecNv;
627+
621628
spv::StorageClass storage = spv::StorageClassGeneric;
622629

623630
SmallVector<TypeID> member_types;

spirv_cross.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -376,6 +376,7 @@ void Compiler::register_global_read_dependencies(const SPIRBlock &block, uint32_
376376

377377
case OpLoad:
378378
case OpCooperativeMatrixLoadKHR:
379+
case OpCooperativeVectorLoadNV:
379380
case OpImageRead:
380381
{
381382
// If we're in a storage class which does not get invalidated, adding dependencies here is no big deal.
@@ -4346,6 +4347,7 @@ bool Compiler::may_read_undefined_variable_in_block(const SPIRBlock &block, uint
43464347

43474348
case OpCopyObject:
43484349
case OpLoad:
4350+
case OpCooperativeVectorLoadNV:
43494351
case OpCooperativeMatrixLoadKHR:
43504352
if (ops[2] == var)
43514353
return true;
@@ -5481,6 +5483,7 @@ bool Compiler::InterlockedResourceAccessHandler::handle(Op opcode, const uint32_
54815483
{
54825484
case OpLoad:
54835485
case OpCooperativeMatrixLoadKHR:
5486+
case OpCooperativeVectorLoadNV:
54845487
{
54855488
if (length < 3)
54865489
return false;
@@ -5559,6 +5562,7 @@ bool Compiler::InterlockedResourceAccessHandler::handle(Op opcode, const uint32_
55595562
case OpImageWrite:
55605563
case OpAtomicStore:
55615564
case OpCooperativeMatrixStoreKHR:
5565+
case OpCooperativeVectorStoreNV:
55625566
{
55635567
if (length < 1)
55645568
return false;

0 commit comments

Comments
 (0)