Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions test/unit/gemm/device/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,21 +30,27 @@
if(CUTLASS_ENABLE_SYCL)
if(SYCL_INTEL_TARGET)
cutlass_test_unit_add_executable(
cutlass_test_unit_gemm_device_tensorop_xe
cutlass_test_unit_gemm_device_tensorop_xe_legacy
xe_gemm_bf16_bf16_bf16_tensor_op_bf16.cpp
xe_gemm_fp16_fp16_fp16_tensor_op_fp16.cpp
xe_gemm_bf16_bf16_bf16_tensor_op_fp32.cpp
xe_gemm_bf16_bf16_fp32_tensor_op_bf16.cpp
xe_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp
xe_gemm_fp16_fp16_fp16_tensor_op_fp32.cpp
xe_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp
xe_gemm_s8_s8_s32_tensor_op_s32.cpp
xe_gemm_s8_s8_s32_tensor_op_s32_legacy.cpp
xe_gemm_tf32_tf32_fp32_tensor_op_fp32.cpp
xe_gemm_f8_f8_fp32_tensor_op_fp32.cpp
xe_gemm_fp16_s8_fp32_tensor_op_fp32.cpp
gemm_universal_bf16n_bf16t_f32n_tensor_op_f32_xe.cpp
)

# TODO: Port the remaining legacy tests once the new copy/MMA atoms are enabled.
cutlass_test_unit_add_executable(
cutlass_test_unit_gemm_device_tensorop_xe
xe_gemm_s8_s8_s32_tensor_op_s32.cpp
)

cutlass_test_unit_add_executable(
cutlass_test_unit_gemm_device_tensorop_cooperative_xe
xe_gemm_bf16_bf16_fp32_tensor_op_fp32_cooperative.cpp
Expand Down Expand Up @@ -93,7 +99,7 @@ if(CUTLASS_ENABLE_SYCL)
cutlass_test_unit_gemm_device_mixed_input_tensorop_xe
cutlass_test_unit_gemm_device_tensorop_xe_group_gemm
cutlass_test_unit_gemm_device_mixed_dtype_tensorop_xe_group_gemm
cutlass_test_unit_gemm_device_tensorop_xe
cutlass_test_unit_gemm_device_tensorop_xe_legacy
)

add_custom_target(
Expand Down
76 changes: 73 additions & 3 deletions test/unit/gemm/device/default_gemm_configuration.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,17 @@ struct DefaultGemmConfigurationToCutlass3Types {
static_assert(sizeof(ElementA) == 0, "No valid DefaultGemmConfigurationToCutlass3Types configuration exists.");
};

// Primary template for legacy Intel Xe GEMM configurations (kernels not yet
// ported to the new copy/MMA atoms). Only the explicit specializations below
// are usable; the dependent static_assert (sizeof(ElementA) == 0 can never
// hold for a complete type) fires only when an unsupported combination is
// instantiated.
template<
class OperatorClass, class ArchTag,
class ElementA, class LayoutA,
class ElementB, class LayoutB,
class ElementC, class LayoutC,
class ElementAccumulator>
struct XeLegacyGemmConfigurationToCutlass3Types {
// Fixed: diagnostic previously named DefaultGemmConfigurationToCutlass3Types,
// which would misdirect users to the wrong template.
static_assert(sizeof(ElementA) == 0, "No valid XeLegacyGemmConfigurationToCutlass3Types configuration exists.");
};


// This type is only intended to demonstrate porting 2.x kernels to 3.0
template<
class OperatorClass, class ArchTag,
Expand Down Expand Up @@ -1901,9 +1912,9 @@ struct DefaultGemmConfigurationToCutlass3Types<

///////////////////////////////////////////////////////////////////////////////

// Intel XE MMA S32S8
// Intel XE MMA S32S8 Legacy
template <typename LayoutA, typename LayoutB, typename LayoutC>
struct DefaultGemmConfigurationToCutlass3Types<
struct XeLegacyGemmConfigurationToCutlass3Types<
arch::OpClassTensorOp, arch::IntelXe,
int8_t, LayoutA,
int8_t, LayoutB,
Expand Down Expand Up @@ -1961,6 +1972,64 @@ struct DefaultGemmConfigurationToCutlass3Types<

///////////////////////////////////////////////////////////////////////////////

// Intel XE MMA S32S8
// Default CUTLASS 3.x configuration for int8 x int8 -> int32 GEMM on Intel Xe
// tensor ops, using the new L1-staged mainloop (contrast with the legacy
// copy-atom variant kept in XeLegacyGemmConfigurationToCutlass3Types).
template <typename LayoutA, typename LayoutB, typename LayoutC>
struct DefaultGemmConfigurationToCutlass3Types<
arch::OpClassTensorOp, arch::IntelXe,
int8_t, LayoutA,
int8_t, LayoutB,
int32_t, LayoutC,
int32_t>
{
// Workgroup-level tile: 256 x 256 output tile over 32-wide K slices.
using TileShape = Shape<_256, _256, _32>;

// L1-staged mainloop; the <3> presumably selects a 3-stage pipeline —
// confirm against MainloopXeL1Staged's definition.
using GEMMDispatchPolicy = gemm::MainloopXeL1Staged<3>;

// DPAS tensor-op atom over int8 operands with int32 accumulation, tiled
// with an 8x4x1 shape / (4,1,0) stride layout (exact subgroup mapping is
// defined by TiledMMAHelper).
using TiledMma =
typename TiledMMAHelper<
MMA_Atom<XE_DPAS_TT<8, int32_t, int8_t>>,
Layout<TileShape>,
Layout<Shape<_8, _4, _1>, Stride<_4, _1, _0>>
>::TiledMMA;

// void lets the collective pick its default global-memory copies for A/B.
using GmemTiledCopyA = void;
using GmemTiledCopyB = void;

// Mainloop
using CollectiveMainloop = collective::CollectiveMma<
GEMMDispatchPolicy, TileShape,
int8_t, TagToStrideA_t<LayoutA>,
int8_t, TagToStrideB_t<LayoutB>,
TiledMma,
GmemTiledCopyA, void, void, cute::identity, // A
GmemTiledCopyB, void, void, cute::identity // B
>;

// Epilogue: plain linear combination D = alpha * acc + beta * C in int32.
using EpilogueDispatchPolicy = epilogue::IntelXeGeneric;
using EpilogueOp = epilogue::fusion::LinearCombination<int32_t, int32_t>;

using FusionCallBacks = cutlass::epilogue::fusion::FusionCallbacks<
EpilogueDispatchPolicy,
EpilogueOp,
TileShape,
decltype(tile_shape(TiledMma()))
>;

// 2D block copies for the 32-bit C/D tensors. NOTE(review): the <32, 8, 16>
// parameters presumably encode element bits and block height/width —
// confirm against the XE_LOAD_2D / XE_STORE_2D atom definitions.
using GmemTiledCopyC = XE_LOAD_2D<32, 8, 16>;
using GmemTiledCopyD = XE_STORE_2D<32, 8, 16>;

using CollectiveEpilogue = cutlass::epilogue::collective::CollectiveEpilogue<
EpilogueDispatchPolicy,
TileShape,
int32_t, TagToStrideC_t<LayoutC>,
int32_t, TagToStrideC_t<LayoutC>,
FusionCallBacks,
GmemTiledCopyC, void, void,
GmemTiledCopyD, void, void>;
};

///////////////////////////////////////////////////////////////////////////////

namespace detail {

//
Expand Down Expand Up @@ -2002,7 +2071,7 @@ struct DefaultGemm_TensorOpXe_OperandB<tfloat32_t, layout::ColumnMajor, 32, Size

///////////////////////////////////////////////////////////////////////////////

// Intel XE MMA S32S8
// Intel XE MMA F32TF32
template <typename LayoutA, typename LayoutB, typename LayoutC>
struct DefaultGemmConfigurationToCutlass3Types<
arch::OpClassTensorOp, arch::IntelXe,
Expand Down Expand Up @@ -2158,6 +2227,7 @@ struct DefaultGemmConfigurationToCutlass3Types<
XE_2D_U32x8x16_ST_N, void, void>;
};

// Intel XE MMA FP32FP16
template <typename LayoutA, typename LayoutB, typename LayoutC>
struct DefaultGemmConfigurationToCutlass3Types<
arch::OpClassTensorOp, arch::IntelXe,
Expand Down
24 changes: 22 additions & 2 deletions test/unit/gemm/device/gemm_testbed_3x.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@
#include "cute/layout.hpp"
#include "cute/numeric/int.hpp"

#include "cutlass/util/GPU_Clock.hpp"

namespace test {
namespace gemm {
namespace device {
Expand Down Expand Up @@ -3041,7 +3043,7 @@ struct TestbedImpl {

if (status != cutlass::Status::kSuccess) {
#if defined(CUTLASS_ENABLE_SYCL)
std::cerr << "This test is not supported." << "\n";
std::cerr << "This test is not supported. - gemm_op can_implement failed" << "\n";
return true;
#else
cudaError_t error = cudaGetLastError();
Expand Down Expand Up @@ -3069,7 +3071,7 @@ struct TestbedImpl {
status = gemm_op.initialize(arguments, workspace.get());
if (status != cutlass::Status::kSuccess) {
#if defined(CUTLASS_ENABLE_SYCL)
std::cerr << "This test is not supported." << "\n";
std::cerr << "This test is not supported. - gemm_op initialize failed" << "\n";
#else
cudaError_t error = cudaGetLastError();
const auto error_str = cudaGetErrorString(error);
Expand All @@ -3079,10 +3081,28 @@ struct TestbedImpl {
#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
CUTLASS_TRACE_HOST("TestbedImpl::run: Calling gemm_op.run");
#endif
GPU_Clock timer;
if (profiling)
timer.start();
status = gemm_op.run();
#if defined(CUTLASS_ENABLE_SYCL)
try {
compat::wait_and_throw();
if (profiling) {
double time = timer.seconds();
auto m = cute::get<0>(problem_size);
auto n = cute::get<1>(problem_size);
auto k = cute::get<2>(problem_size);
auto l = cute::get<3>(problem_size);
double tops = (2.0 * m * n * k * l) * 1e-12;
printf(
"[Perf] M=%d N=%d K=%d L=%d | "
"-> [%4.3f] Tops/s (%.4f ms)\n",
m, n, k, l,
tops / time,
time * 1000
);
}
} catch (std::exception const &e) {
ADD_FAILURE() << "Error at Kernel Sync.";
return false;
Expand Down
7 changes: 6 additions & 1 deletion test/unit/gemm/device/xe_gemm_s8_s8_s32_tensor_op_s32.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,12 @@ struct XE_Device_Gemm_s8_s8_s32_tensor_op_s32 {
typename Config::CollectiveMainloop,
typename Config::CollectiveEpilogue>;

using Gemm = gemm::device::GemmUniversalAdapter<GemmKernel>;
// Thin wrapper over the adapter that additionally exposes 2.x-style
// alignment constants. NOTE(review): presumably element-count alignment
// requirements (16 for the int8 A/B operands, 4 for int32 C/D) consumed by
// the testbed — confirm against gemm_testbed_3x.hpp before relying on them.
struct Gemm : public gemm::device::GemmUniversalAdapter<GemmKernel> {
static constexpr int kAlignmentA = 16;
static constexpr int kAlignmentB = 16;
static constexpr int kAlignmentC = 4;
static constexpr int kAlignmentD = 4;
};
};

TEST(XE_Device_Gemm_s8t_s8t_s32t_tensor_op_s32, 256x256x32) {
Expand Down
95 changes: 95 additions & 0 deletions test/unit/gemm/device/xe_gemm_s8_s8_s32_tensor_op_s32_legacy.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
/***************************************************************************************************
* Copyright (c) 2025 - 2025 Codeplay Software Ltd. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/

/*! \file
\brief Tests for the legacy Intel Xe s8 x s8 -> s32 tensor-op GEMM configuration
*/


#include "cutlass/cutlass.h"

#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "default_gemm_configuration.hpp"

#include "gemm_testbed_3x.hpp"

namespace cutlass {
namespace {
// Assembles the legacy int8 x int8 -> int32 Xe GEMM under test for a given
// A/B layout pair. C/D are always row-major with int32 accumulation.
template <typename LayoutA, typename LayoutB>
struct XE_Device_Gemm_s8_s8_s32_tensor_op_s32 {
// Legacy (pre new-atom) collective configuration for this type combination.
using Config = gemm::device::XeLegacyGemmConfigurationToCutlass3Types<
arch::OpClassTensorOp, arch::IntelXe,
int8_t, LayoutA,
int8_t, LayoutB,
int32_t, layout::RowMajor,
int32_t>;

// Kernel over a rank-4 integer problem shape (presumably M, N, K, batch L).
using GemmKernel = gemm::kernel::GemmUniversal<
cute::Shape<int, int, int, int>,
typename Config::CollectiveMainloop,
typename Config::CollectiveEpilogue>;

// Device-level handle the tests instantiate and run.
using Gemm = gemm::device::GemmUniversalAdapter<GemmKernel>;
};

// Row-major A x row-major B (t/t) against the 256x256x32 tile configuration.
TEST(XE_Device_Gemm_s8t_s8t_s32t_tensor_op_s32, 256x256x32) {
  using Fixture = XE_Device_Gemm_s8_s8_s32_tensor_op_s32<layout::RowMajor, layout::RowMajor>;
  EXPECT_TRUE(test::gemm::device::TestXe<Fixture::Gemm>());
}

// TODO(Codeplay): Test on XE2 because the copy function is not available in the IGC driver for PVC
// Column-major A x row-major B (n/t); XE2-only per the driver note above.
TEST(XE2_Device_Gemm_s8n_s8t_s32t_tensor_op_s32, 64x128x32) {
  using Fixture = XE_Device_Gemm_s8_s8_s32_tensor_op_s32<layout::ColumnMajor, layout::RowMajor>;
  EXPECT_TRUE(test::gemm::device::TestXe<Fixture::Gemm>());
}

// Row-major A x column-major B (t/n).
TEST(XE_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 64x128x32) {
  using Fixture = XE_Device_Gemm_s8_s8_s32_tensor_op_s32<layout::RowMajor, layout::ColumnMajor>;
  EXPECT_TRUE(test::gemm::device::TestXe<Fixture::Gemm>());
}

// TODO(Codeplay): Test on XE2 because the copy function is not available in the IGC driver for PVC
// Column-major A x column-major B (n/n); XE2-only per the driver note above.
TEST(XE2_Device_Gemm_s8n_s8n_s32t_tensor_op_s32, 64x128x32) {
  using Fixture = XE_Device_Gemm_s8_s8_s32_tensor_op_s32<layout::ColumnMajor, layout::ColumnMajor>;
  EXPECT_TRUE(test::gemm::device::TestXe<Fixture::Gemm>());
}

}
} // namespace cutlass
Loading