From 3414a0c8986f08689e310050e5f527f1adc81f27 Mon Sep 17 00:00:00 2001 From: "Singh, Nitin" Date: Mon, 10 Nov 2025 09:47:50 +0000 Subject: [PATCH] s8s32 UT with new mma and copy atoms --- test/unit/gemm/device/CMakeLists.txt | 12 ++- .../device/default_gemm_configuration.hpp | 76 ++++++++++++++- test/unit/gemm/device/gemm_testbed_3x.hpp | 24 ++++- .../xe_gemm_s8_s8_s32_tensor_op_s32.cpp | 7 +- ...xe_gemm_s8_s8_s32_tensor_op_s32_legacy.cpp | 95 +++++++++++++++++++ 5 files changed, 205 insertions(+), 9 deletions(-) create mode 100644 test/unit/gemm/device/xe_gemm_s8_s8_s32_tensor_op_s32_legacy.cpp diff --git a/test/unit/gemm/device/CMakeLists.txt b/test/unit/gemm/device/CMakeLists.txt index 279ff2828e..366fe30853 100644 --- a/test/unit/gemm/device/CMakeLists.txt +++ b/test/unit/gemm/device/CMakeLists.txt @@ -30,7 +30,7 @@ if(CUTLASS_ENABLE_SYCL) if(SYCL_INTEL_TARGET) cutlass_test_unit_add_executable( - cutlass_test_unit_gemm_device_tensorop_xe + cutlass_test_unit_gemm_device_tensorop_xe_legacy xe_gemm_bf16_bf16_bf16_tensor_op_bf16.cpp xe_gemm_fp16_fp16_fp16_tensor_op_fp16.cpp xe_gemm_bf16_bf16_bf16_tensor_op_fp32.cpp @@ -38,13 +38,19 @@ if(CUTLASS_ENABLE_SYCL) xe_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp xe_gemm_fp16_fp16_fp16_tensor_op_fp32.cpp xe_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp - xe_gemm_s8_s8_s32_tensor_op_s32.cpp + xe_gemm_s8_s8_s32_tensor_op_s32_legacy.cpp xe_gemm_tf32_tf32_fp32_tensor_op_fp32.cpp xe_gemm_f8_f8_fp32_tensor_op_fp32.cpp xe_gemm_fp16_s8_fp32_tensor_op_fp32.cpp gemm_universal_bf16n_bf16t_f32n_tensor_op_f32_xe.cpp ) + # TODO :- Port remaining legacy tests after enabling new atoms + cutlass_test_unit_add_executable( + cutlass_test_unit_gemm_device_tensorop_xe + xe_gemm_s8_s8_s32_tensor_op_s32.cpp + ) + cutlass_test_unit_add_executable( cutlass_test_unit_gemm_device_tensorop_cooperative_xe xe_gemm_bf16_bf16_fp32_tensor_op_fp32_cooperative.cpp @@ -93,7 +99,7 @@ if(CUTLASS_ENABLE_SYCL) cutlass_test_unit_gemm_device_mixed_input_tensorop_xe cutlass_test_unit_gemm_device_tensorop_xe_group_gemm cutlass_test_unit_gemm_device_mixed_dtype_tensorop_xe_group_gemm - cutlass_test_unit_gemm_device_tensorop_xe + cutlass_test_unit_gemm_device_tensorop_xe_legacy ) add_custom_target( diff --git a/test/unit/gemm/device/default_gemm_configuration.hpp b/test/unit/gemm/device/default_gemm_configuration.hpp index 9b786027fc..219be39e60 100644 --- a/test/unit/gemm/device/default_gemm_configuration.hpp +++ b/test/unit/gemm/device/default_gemm_configuration.hpp @@ -62,6 +62,17 @@ struct DefaultGemmConfigurationToCutlass3Types { static_assert(sizeof(ElementA) == 0, "No valid DefaultGemmConfigurationToCutlass3Types configuration exists."); }; +template< + class OperatorClass, class ArchTag, + class ElementA, class LayoutA, + class ElementB, class LayoutB, + class ElementC, class LayoutC, + class ElementAccumulator> +struct XeLegacyGemmConfigurationToCutlass3Types { + static_assert(sizeof(ElementA) == 0, "No valid DefaultGemmConfigurationToCutlass3Types configuration exists."); +}; + + // This type is only intended to demonstrate porting 2.x kernels to 3.0 template< class OperatorClass, class ArchTag, @@ -1901,9 +1912,9 @@ struct DefaultGemmConfigurationToCutlass3Types< /////////////////////////////////////////////////////////////////////////////// -// Intel XE MMA S32S8 +// Intel XE MMA S32S8 Legacy template -struct DefaultGemmConfigurationToCutlass3Types< +struct XeLegacyGemmConfigurationToCutlass3Types< arch::OpClassTensorOp, arch::IntelXe, int8_t, LayoutA, int8_t, LayoutB, @@ -1961,6 +1972,64 @@ struct DefaultGemmConfigurationToCutlass3Types< /////////////////////////////////////////////////////////////////////////////// +// Intel XE MMA S32S8 +template +struct DefaultGemmConfigurationToCutlass3Types< + arch::OpClassTensorOp, arch::IntelXe, + int8_t, LayoutA, + int8_t, LayoutB, + int32_t, LayoutC, + int32_t> +{ + using TileShape = Shape<_256, _256, _32>; + + using GEMMDispatchPolicy = gemm::MainloopXeL1Staged<3>; + + using TiledMma = + typename TiledMMAHelper< + MMA_Atom>, + Layout, + Layout, Stride<_4, _1, _0>> + >::TiledMMA; + + using GmemTiledCopyA = void; + using GmemTiledCopyB = void; + + // Mainloop + using CollectiveMainloop = collective::CollectiveMma< + GEMMDispatchPolicy, TileShape, + int8_t, TagToStrideA_t, + int8_t, TagToStrideB_t, + TiledMma, + GmemTiledCopyA, void, void, cute::identity, // A + GmemTiledCopyB, void, void, cute::identity // B + >; + + using EpilogueDispatchPolicy = epilogue::IntelXeGeneric; + using EpilogueOp = epilogue::fusion::LinearCombination; + + using FusionCallBacks = cutlass::epilogue::fusion::FusionCallbacks< + EpilogueDispatchPolicy, + EpilogueOp, + TileShape, + decltype(tile_shape(TiledMma())) + >; + + using GmemTiledCopyC = XE_LOAD_2D<32, 8, 16>; + using GmemTiledCopyD = XE_STORE_2D<32, 8, 16>; + + using CollectiveEpilogue = cutlass::epilogue::collective::CollectiveEpilogue< + EpilogueDispatchPolicy, + TileShape, + int32_t, TagToStrideC_t, + int32_t, TagToStrideC_t, + FusionCallBacks, + GmemTiledCopyC, void, void, + GmemTiledCopyD, void, void>; +}; + +/////////////////////////////////////////////////////////////////////////////// + namespace detail { // @@ -2002,7 +2071,7 @@ struct DefaultGemm_TensorOpXe_OperandB struct DefaultGemmConfigurationToCutlass3Types< arch::OpClassTensorOp, arch::IntelXe, @@ -2158,6 +2227,7 @@ struct DefaultGemmConfigurationToCutlass3Types< XE_2D_U32x8x16_ST_N, void, void>; }; +// Intel XE MMA FP32FP16 template struct DefaultGemmConfigurationToCutlass3Types< arch::OpClassTensorOp, arch::IntelXe, diff --git a/test/unit/gemm/device/gemm_testbed_3x.hpp b/test/unit/gemm/device/gemm_testbed_3x.hpp index fb2e3dc6bd..e4eb642e38 100644 --- a/test/unit/gemm/device/gemm_testbed_3x.hpp +++ b/test/unit/gemm/device/gemm_testbed_3x.hpp @@ -70,6 +70,8 @@ #include "cute/layout.hpp" #include "cute/numeric/int.hpp" +#include "cutlass/util/GPU_Clock.hpp" + namespace test { namespace gemm { namespace device { @@ -3041,7 +3043,7 @@ struct TestbedImpl { if (status != cutlass::Status::kSuccess) { #if defined(CUTLASS_ENABLE_SYCL) - std::cerr << "This test is not supported." << "\n"; + std::cerr << "This test is not supported. - gemm_op can_implement failed" << "\n"; return true; #else cudaError_t error = cudaGetLastError(); @@ -3069,7 +3071,7 @@ struct TestbedImpl { status = gemm_op.initialize(arguments, workspace.get()); if (status != cutlass::Status::kSuccess) { #if defined(CUTLASS_ENABLE_SYCL) - std::cerr << "This test is not supported." << "\n"; + std::cerr << "This test is not supported. - gemm_op initialize failed" << "\n"; #else cudaError_t error = cudaGetLastError(); const auto error_str = cudaGetErrorString(error); @@ -3079,10 +3081,28 @@ struct TestbedImpl { #if (CUTLASS_DEBUG_TRACE_LEVEL > 1) CUTLASS_TRACE_HOST("TestbedImpl::run: Calling gemm_op.run"); #endif + GPU_Clock timer; + if (profiling) + timer.start(); status = gemm_op.run(); #if defined(CUTLASS_ENABLE_SYCL) try { compat::wait_and_throw(); + if (profiling) { + double time = timer.seconds(); + auto m = cute::get<0>(problem_size); + auto n = cute::get<1>(problem_size); + auto k = cute::get<2>(problem_size); + auto l = cute::get<3>(problem_size); + double tops = (2.0 * m * n * k * l) * 1e-12; + printf( + "[Perf] M=%d N=%d K=%d L=%d | " + "-> [%4.3f] Tops/s (%.4f ms)\n", + m, n, k, l, + tops / time, + time * 1000 + ); + } } catch (std::exception const &e) { ADD_FAILURE() << "Error at Kernel Sync."; return false; diff --git a/test/unit/gemm/device/xe_gemm_s8_s8_s32_tensor_op_s32.cpp b/test/unit/gemm/device/xe_gemm_s8_s8_s32_tensor_op_s32.cpp index e1e1145163..666ceb43af 100644 --- a/test/unit/gemm/device/xe_gemm_s8_s8_s32_tensor_op_s32.cpp +++ b/test/unit/gemm/device/xe_gemm_s8_s8_s32_tensor_op_s32.cpp @@ -58,7 +58,12 @@ struct XE_Device_Gemm_s8_s8_s32_tensor_op_s32 { typename Config::CollectiveMainloop, typename Config::CollectiveEpilogue>; - using Gemm = gemm::device::GemmUniversalAdapter; + struct Gemm : public gemm::device::GemmUniversalAdapter { + static constexpr int kAlignmentA = 16; + static constexpr int kAlignmentB = 16; + static constexpr int kAlignmentC = 4; + static constexpr int kAlignmentD = 4; + }; }; TEST(XE_Device_Gemm_s8t_s8t_s32t_tensor_op_s32, 256x256x32) { diff --git a/test/unit/gemm/device/xe_gemm_s8_s8_s32_tensor_op_s32_legacy.cpp b/test/unit/gemm/device/xe_gemm_s8_s8_s32_tensor_op_s32_legacy.cpp new file mode 100644 index 0000000000..0c66550d5f --- /dev/null +++ b/test/unit/gemm/device/xe_gemm_s8_s8_s32_tensor_op_s32_legacy.cpp @@ -0,0 +1,95 @@ +/*************************************************************************************************** + * Copyright (c) 2025 - 2025 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Tests for Xe s8_s8_s32 +*/ + + +#include "cutlass/cutlass.h" + +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "default_gemm_configuration.hpp" + +#include "gemm_testbed_3x.hpp" + +namespace cutlass { +namespace { +template +struct XE_Device_Gemm_s8_s8_s32_tensor_op_s32 { + using Config = gemm::device::XeLegacyGemmConfigurationToCutlass3Types< + arch::OpClassTensorOp, arch::IntelXe, + int8_t, LayoutA, + int8_t, LayoutB, + int32_t, layout::RowMajor, + int32_t>; + + using GemmKernel = gemm::kernel::GemmUniversal< + cute::Shape, + typename Config::CollectiveMainloop, + typename Config::CollectiveEpilogue>; + + using Gemm = gemm::device::GemmUniversalAdapter; +}; + +TEST(XE_Device_Gemm_s8t_s8t_s32t_tensor_op_s32, 256x256x32) { + using LayoutA = layout::RowMajor; + using LayoutB = layout::RowMajor; + using Gemm = XE_Device_Gemm_s8_s8_s32_tensor_op_s32::Gemm; + EXPECT_TRUE(test::gemm::device::TestXe()); +} + +// TODO(Codeplay): Test on XE2 because the copy function is not available in the IGC driver for PVC +TEST(XE2_Device_Gemm_s8n_s8t_s32t_tensor_op_s32, 64x128x32) { + using LayoutA = layout::ColumnMajor; + using LayoutB = layout::RowMajor; + using Gemm = XE_Device_Gemm_s8_s8_s32_tensor_op_s32::Gemm; + EXPECT_TRUE(test::gemm::device::TestXe()); +} + +TEST(XE_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 64x128x32) { + using LayoutA = layout::RowMajor; + using LayoutB = layout::ColumnMajor; + using Gemm = XE_Device_Gemm_s8_s8_s32_tensor_op_s32::Gemm; + EXPECT_TRUE(test::gemm::device::TestXe()); +} + +// TODO(Codeplay): Test on XE2 because the copy function is not available in the IGC driver for PVC +TEST(XE2_Device_Gemm_s8n_s8n_s32t_tensor_op_s32, 64x128x32) { + using LayoutA = layout::ColumnMajor; + using LayoutB = layout::ColumnMajor; + using Gemm = XE_Device_Gemm_s8_s8_s32_tensor_op_s32::Gemm; + EXPECT_TRUE(test::gemm::device::TestXe()); +} + +} +} // namespace cutlass