glsl: implement SPV_NV_cooperative_vector

theHamsta · theHamsta · commit 1824631311ea · 2025-07-03T20:07:06.000+02:00
Specification: https://github.khronos.org/SPIRV-Registry/extensions/NV/SPV_NV_cooperative_vector.html — The implementation tries to follow the existing code for SPV_KHR_cooperative_matrix. In a follow-up, the extension could be mapped to the following HLSL proposal: https://github.com/microsoft/hlsl-specs/blob/main/proposals/0026-hlsl-long-vector-type.md
diff --git a/reference/opt/shaders/comp/cooperative-vec-nv.spv16.vk.nocompat.comp.vk b/reference/opt/shaders/comp/cooperative-vec-nv.spv16.vk.nocompat.comp.vk
@@ -0,0 +1,43 @@
+#version 450
+#if defined(GL_AMD_gpu_shader_half_float)
+#extension GL_AMD_gpu_shader_half_float : require
+#elif defined(GL_EXT_shader_explicit_arithmetic_types_float16)
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+#else
+#error No extension available for FP16.
+#endif
+#extension GL_EXT_shader_16bit_storage : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
+#extension GL_EXT_shader_8bit_storage : require
+#extension GL_NV_cooperative_vector : require
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+
+layout(set = 0, binding = 0, std430) buffer Q
+{
+    float16_t data_q[];
+} _15;
+
+void main()
+{
+    coopvecNV<float16_t, 16u> _20;
+    coopVecLoadNV(_20, _15.data_q, 16u);
+    coopvecNV<float16_t, 16u> tempArg = _20;
+    coopvecNV<float16_t, 16u> inVec = tempArg;
+    coopVecStoreNV(inVec, _15.data_q, 16u);
+    coopvecNV<float16_t, 16u> _33;
+    coopVecMatMulAddNV(_33, inVec, gl_ComponentTypeFloat16NV, _15.data_q, 16u, gl_ComponentTypeFloat16NV, _15.data_q, 16u, gl_ComponentTypeFloat16NV, 16u, 16u, 4, false, 2u);
+    coopvecNV<float16_t, 16u> tempArg_1 = _33;
+    inVec = tempArg_1;
+    coopvecNV<float16_t, 16u> _38;
+    coopVecMatMulNV(_38, inVec, gl_ComponentTypeFloat16NV, _15.data_q, 16u, gl_ComponentTypeFloat16NV, 16u, 16u, 4, false, 2u);
+    coopvecNV<float16_t, 16u> tempArg_2 = _38;
+    inVec = tempArg_2;
+    coopvecNV<float16_t, 8u> a = coopvecNV<float16_t, 8u>(float16_t(0.0));
+    inVec = max(inVec, inVec);
+    coopVecOuterProductAccumulateNV(inVec, inVec, _15.data_q, 0u, 0u, 4, gl_ComponentTypeFloat16NV);
+    coopVecReduceSumAccumulateNV(inVec, _15.data_q, 0u);
+    coopvecNV<int8_t, 8u> a_8bit = coopvecNV<int8_t, 8u>(int8_t(0));
+    coopvecNV<int, 8u> b_32bit = coopvecNV<int, 8u>(0);
+    b_32bit = (max((coopvecNV<int, 8u>(a_8bit)), b_32bit));
+}
+
diff --git a/reference/opt/shaders/vulkan/comp/replicated-composites.spv16.vk.nocompat.comp.vk b/reference/opt/shaders/vulkan/comp/replicated-composites.spv16.vk.nocompat.comp.vk
@@ -9,11 +9,12 @@
 #extension GL_EXT_shader_16bit_storage : require
 #extension GL_KHR_cooperative_matrix : require
 #extension GL_KHR_memory_scope_semantics : require
+#extension GL_NV_cooperative_vector : require
 layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
 
 layout(constant_id = 0) const float spec_const = 0.0;
-const vec4 _29 = vec4(spec_const);
-float _53;
+const vec4 _33 = vec4(spec_const);
+float _62;
 
 layout(set = 0, binding = 0, std140) uniform UBO
 {
@@ -25,11 +26,13 @@ void main()
     vec4 a = vec4(0.0);
     mat4 b = mat4(vec4(1.0), vec4(1.0), vec4(1.0), vec4(1.0));
     coopmat<float16_t, gl_ScopeWorkgroup, 16u, 16u, gl_MatrixUseAccumulator> matrix = coopmat<float16_t, gl_ScopeWorkgroup, 16u, 16u, gl_MatrixUseAccumulator>(float16_t(0.0));
-    vec4 c = _29;
-    vec4 _44 = vec4(ubo.uniform_float);
-    vec4 d = _44;
-    mat4 e = mat4(_44, _44, _44, _44);
-    float16_t _51 = float16_t(ubo.uniform_float);
-    coopmat<float16_t, gl_ScopeWorkgroup, 16u, 16u, gl_MatrixUseAccumulator> matrix2 = coopmat<float16_t, gl_ScopeWorkgroup, 16u, 16u, gl_MatrixUseAccumulator>(_51);
+    coopvecNV<float16_t, 16u> vec = coopvecNV<float16_t, 16u>(float16_t(0.0));
+    vec4 c = _33;
+    vec4 _48 = vec4(ubo.uniform_float);
+    vec4 d = _48;
+    mat4 e = mat4(_48, _48, _48, _48);
+    float16_t _55 = float16_t(ubo.uniform_float);
+    coopmat<float16_t, gl_ScopeWorkgroup, 16u, 16u, gl_MatrixUseAccumulator> matrix2 = coopmat<float16_t, gl_ScopeWorkgroup, 16u, 16u, gl_MatrixUseAccumulator>(_55);
+    coopvecNV<float16_t, 16u> vec_dynamic = coopvecNV<float16_t, 16u>(_55);
 }
 
diff --git a/reference/shaders/comp/cooperative-vec-nv.spv16.vk.nocompat.comp.vk b/reference/shaders/comp/cooperative-vec-nv.spv16.vk.nocompat.comp.vk
@@ -0,0 +1,43 @@
+#version 450
+#if defined(GL_AMD_gpu_shader_half_float)
+#extension GL_AMD_gpu_shader_half_float : require
+#elif defined(GL_EXT_shader_explicit_arithmetic_types_float16)
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+#else
+#error No extension available for FP16.
+#endif
+#extension GL_EXT_shader_16bit_storage : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
+#extension GL_EXT_shader_8bit_storage : require
+#extension GL_NV_cooperative_vector : require
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+
+layout(set = 0, binding = 0, std430) buffer Q
+{
+    float16_t data_q[];
+} _15;
+
+void main()
+{
+    coopvecNV<float16_t, 16u> _20;
+    coopVecLoadNV(_20, _15.data_q, 16u);
+    coopvecNV<float16_t, 16u> tempArg = _20;
+    coopvecNV<float16_t, 16u> inVec = tempArg;
+    coopVecStoreNV(inVec, _15.data_q, 16u);
+    coopvecNV<float16_t, 16u> _33;
+    coopVecMatMulAddNV(_33, inVec, gl_ComponentTypeFloat16NV, _15.data_q, 16u, gl_ComponentTypeFloat16NV, _15.data_q, 16u, gl_ComponentTypeFloat16NV, 16u, 16u, 4, false, 2u);
+    coopvecNV<float16_t, 16u> tempArg_1 = _33;
+    inVec = tempArg_1;
+    coopvecNV<float16_t, 16u> _38;
+    coopVecMatMulNV(_38, inVec, gl_ComponentTypeFloat16NV, _15.data_q, 16u, gl_ComponentTypeFloat16NV, 16u, 16u, 4, false, 2u);
+    coopvecNV<float16_t, 16u> tempArg_2 = _38;
+    inVec = tempArg_2;
+    coopvecNV<float16_t, 8u> a = coopvecNV<float16_t, 8u>(float16_t(0.0));
+    inVec = max(inVec, inVec);
+    coopVecOuterProductAccumulateNV(inVec, inVec, _15.data_q, 0u, 0u, 4, gl_ComponentTypeFloat16NV);
+    coopVecReduceSumAccumulateNV(inVec, _15.data_q, 0u);
+    coopvecNV<int8_t, 8u> a_8bit = coopvecNV<int8_t, 8u>(int8_t(0));
+    coopvecNV<int, 8u> b_32bit = coopvecNV<int, 8u>(0);
+    b_32bit = (max((coopvecNV<int, 8u>(a_8bit)), b_32bit));
+}
+
diff --git a/reference/shaders/vulkan/comp/replicated-composites.spv16.vk.nocompat.comp.vk b/reference/shaders/vulkan/comp/replicated-composites.spv16.vk.nocompat.comp.vk
@@ -9,11 +9,12 @@
 #extension GL_EXT_shader_16bit_storage : require
 #extension GL_KHR_cooperative_matrix : require
 #extension GL_KHR_memory_scope_semantics : require
+#extension GL_NV_cooperative_vector : require
 layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
 
 layout(constant_id = 0) const float spec_const = 0.0;
-const vec4 _29 = vec4(spec_const);
-const float _34[8] = float[](1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0);
+const vec4 _33 = vec4(spec_const);
+const float _38[8] = float[](1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0);
 
 layout(set = 0, binding = 0, std140) uniform UBO
 {
@@ -25,10 +26,13 @@ void main()
     vec4 a = vec4(0.0);
     mat4 b = mat4(vec4(1.0), vec4(1.0), vec4(1.0), vec4(1.0));
     coopmat<float16_t, gl_ScopeWorkgroup, 16u, 16u, gl_MatrixUseAccumulator> matrix = coopmat<float16_t, gl_ScopeWorkgroup, 16u, 16u, gl_MatrixUseAccumulator>(float16_t(0.0));
-    vec4 c = _29;
+    coopvecNV<float16_t, 16u> vec = coopvecNV<float16_t, 16u>(float16_t(0.0));
+    vec4 c = _33;
     vec4 d = vec4(ubo.uniform_float);
     mat4 e = mat4(d, d, d, d);
-    float16_t _51 = float16_t(ubo.uniform_float);
-    coopmat<float16_t, gl_ScopeWorkgroup, 16u, 16u, gl_MatrixUseAccumulator> matrix2 = coopmat<float16_t, gl_ScopeWorkgroup, 16u, 16u, gl_MatrixUseAccumulator>(_51);
+    float16_t _55 = float16_t(ubo.uniform_float);
+    coopmat<float16_t, gl_ScopeWorkgroup, 16u, 16u, gl_MatrixUseAccumulator> matrix2 = coopmat<float16_t, gl_ScopeWorkgroup, 16u, 16u, gl_MatrixUseAccumulator>(_55);
+    float16_t _60 = float16_t(ubo.uniform_float);
+    coopvecNV<float16_t, 16u> vec_dynamic = coopvecNV<float16_t, 16u>(_60);
 }
 
diff --git a/shaders/comp/cooperative-vec-nv.spv16.vk.nocompat.comp b/shaders/comp/cooperative-vec-nv.spv16.vk.nocompat.comp
@@ -0,0 +1,61 @@
+#version 450
+
+#extension GL_EXT_shader_16bit_storage : require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
+#extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
+
+#extension GL_KHR_memory_scope_semantics : enable
+#extension GL_NV_cooperative_vector : enable
+#extension GL_EXT_buffer_reference : enable
+#extension GL_EXT_null_initializer : enable
+
+layout(local_size_x = 64) in;
+
+layout (binding = 0) buffer Q {float16_t data_q[];};
+
+
+void main()
+{
+    coopvecNV<float16_t, 16> inVec;
+    coopvecNV<float16_t, 16> outVec;
+
+
+    coopVecLoadNV(inVec, data_q, 16);
+    coopVecStoreNV(inVec, data_q, 16);
+    
+    const int matrixLayout = 4;
+    const int interpretation = gl_ComponentTypeFloat16NV;
+    coopVecMatMulAddNV(inVec,
+                       inVec,
+                       interpretation,
+                       data_q,
+                       16,
+                       interpretation,
+                       data_q,
+                       16,
+                       interpretation,
+                       16, 16,
+                       matrixLayout,
+                       false,
+                       2);
+    coopVecMatMulNV(inVec,
+                    inVec,
+                    interpretation,
+                    data_q,
+                    16,
+                    interpretation,
+                    16, 16,
+                    matrixLayout,
+                    false,
+                    2);
+    coopvecNV<float16_t, 8> a = coopvecNV<float16_t, 8>(0);
+    inVec = max(inVec, inVec);
+
+    coopVecOuterProductAccumulateNV(inVec, inVec, data_q, 0, 0, matrixLayout, gl_ComponentTypeFloat16NV);
+    coopVecReduceSumAccumulateNV(inVec,  data_q, 0);
+
+    coopvecNV<int8_t, 8> a_8bit = coopvecNV<int8_t, 8>(0);
+    coopvecNV<int32_t, 8> b_32bit = coopvecNV<int32_t, 8>(0);
+    b_32bit = max(coopvecNV<int32_t, 8>(a_8bit), b_32bit);
+}
diff --git a/shaders/vulkan/comp/replicated-composites.spv16.vk.nocompat.comp b/shaders/vulkan/comp/replicated-composites.spv16.vk.nocompat.comp
@@ -5,7 +5,7 @@
 
 #extension GL_KHR_memory_scope_semantics : enable
 #extension GL_KHR_cooperative_matrix : enable
-//#extension GL_NV_cooperative_vector : enable
+#extension GL_NV_cooperative_vector : enable
 layout (constant_id = 0) const float spec_const = 0;
 
 layout (set = 0, binding = 0) uniform UBO {
@@ -17,12 +17,13 @@ void main() {
     vec4 a = vec4(0.0);
     mat4 b = mat4(vec4(1.0), vec4(1.0), vec4(1.0), vec4(1.0));
     coopmat<float16_t, gl_ScopeWorkgroup, 16, 16, gl_MatrixUseAccumulator> matrix = coopmat<float16_t, gl_ScopeWorkgroup, 16, 16, gl_MatrixUseAccumulator>(0);
-    //coopvecNV<float16_t, 16> vec = coopvecNV<float16_t, 16>(0);
+    coopvecNV<float16_t, 16> vec = coopvecNV<float16_t, 16>(0);
     vec4 c = vec4(spec_const);
     float array[] = float[](1.0, 1.0, 1.0, 1.0, 1.0, 1.0f, 1.0, 1.0f);
 
     // runtime variables
     vec4 d = vec4(ubo.uniform_float);
     mat4 e = mat4(d, d, d, d);
     coopmat<float16_t, gl_ScopeWorkgroup, 16, 16, gl_MatrixUseAccumulator> matrix2 = coopmat<float16_t, gl_ScopeWorkgroup, 16, 16, gl_MatrixUseAccumulator>(ubo.uniform_float);
+    coopvecNV<float16_t, 16> vec_dynamic = coopvecNV<float16_t, 16>(ubo.uniform_float);
 }
diff --git a/spirv_common.hpp b/spirv_common.hpp
@@ -574,6 +574,7 @@ struct SPIRType : IVariant
 		Sampler,
 		AccelerationStructure,
 		RayQuery,
+		CoopVecNv,
 
 		// Keep internal types at the end.
 		ControlPointArray,
@@ -618,6 +619,12 @@ struct SPIRType : IVariant
 		uint32_t scope_id = 0;
 	} cooperative;
 
+	struct
+	{
+		uint32_t component_type_id = 0;
+		uint32_t component_count_id = 0;
+	} coopVecNv;
+
 	spv::StorageClass storage = spv::StorageClassGeneric;
 
 	SmallVector<TypeID> member_types;
diff --git a/spirv_cross.cpp b/spirv_cross.cpp
@@ -376,6 +376,7 @@ void Compiler::register_global_read_dependencies(const SPIRBlock &block, uint32_
 
 		case OpLoad:
 		case OpCooperativeMatrixLoadKHR:
+		case OpCooperativeVectorLoadNV:
 		case OpImageRead:
 		{
 			// If we're in a storage class which does not get invalidated, adding dependencies here is no big deal.
@@ -4346,6 +4347,7 @@ bool Compiler::may_read_undefined_variable_in_block(const SPIRBlock &block, uint
 
 		case OpCopyObject:
 		case OpLoad:
+		case OpCooperativeVectorLoadNV:
 		case OpCooperativeMatrixLoadKHR:
 			if (ops[2] == var)
 				return true;
@@ -5481,6 +5483,7 @@ bool Compiler::InterlockedResourceAccessHandler::handle(Op opcode, const uint32_
 	{
 	case OpLoad:
 	case OpCooperativeMatrixLoadKHR:
+	case OpCooperativeVectorLoadNV:
 	{
 		if (length < 3)
 			return false;
@@ -5559,6 +5562,7 @@ bool Compiler::InterlockedResourceAccessHandler::handle(Op opcode, const uint32_
 	case OpImageWrite:
 	case OpAtomicStore:
 	case OpCooperativeMatrixStoreKHR:
+	case OpCooperativeVectorStoreNV:
 	{
 		if (length < 1)
 			return false;
diff --git a/spirv_glsl.cpp b/spirv_glsl.cpp
diff --git a/spirv_glsl.hpp b/spirv_glsl.hpp
diff --git a/spirv_parser.cpp b/spirv_parser.cpp