diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
index f9d15e1cca..67ccd1d07f 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.yml
+++ b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -1,5 +1,5 @@
name: Bug Report
-description: Create a bug report to help us improve CUTLASS
+description: Create a bug report to help us improve SYCL*TLA
title: "[BUG] "
labels: ["? - Needs Triage", "bug"]
assignees: []
@@ -10,8 +10,9 @@ body:
attributes:
label: Which component has the problem?
options:
- - CuTe DSL
- - CUTLASS C++
+ - CuTe APIs
+ - CUTLASS APIs
+ - Python (APIs or PyPI package)
validations:
required: true
- type: textarea
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
deleted file mode 100644
index 4572ae1b98..0000000000
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ /dev/null
@@ -1,5 +0,0 @@
-blank_issues_enabled: true
-contact_links:
- - name: CUTLASS Discord
- url: https://discord.gg/nvidiadeveloper
- about: Come chat about using and contributing to CUTLASS!
diff --git a/.github/ISSUE_TEMPLATE/documentation_request.md b/.github/ISSUE_TEMPLATE/documentation_request.md
index c9fa21fac9..9a3f2f0bab 100644
--- a/.github/ISSUE_TEMPLATE/documentation_request.md
+++ b/.github/ISSUE_TEMPLATE/documentation_request.md
@@ -1,6 +1,6 @@
---
name: Documentation request
-about: Report incorrect or needed documentation to improve CUTLASS
+about: Report incorrect or needed documentation to improve SYCL*TLA
title: "[DOC]"
labels: "? - Needs Triage, documentation"
assignees: ''
diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml
index 4f1d616576..ca4c4880f1 100644
--- a/.github/ISSUE_TEMPLATE/feature_request.yml
+++ b/.github/ISSUE_TEMPLATE/feature_request.yml
@@ -1,5 +1,5 @@
name: Feature Request
-description: Suggest an idea for CUTLASS
+description: Suggest an idea for SYCL*TLA
title: "[FEA] "
labels: ["? - Needs Triage", "feature request"]
assignees: []
@@ -10,8 +10,9 @@ body:
attributes:
label: Which component requires the feature?
options:
- - CuTe DSL
- - CUTLASS C++
+ - CuTe APIs
+ - CUTLASS APIs
+ - Python (APIs or PyPI package)
validations:
required: true
- type: textarea
@@ -21,7 +22,7 @@ body:
description: Please fill out all sections below
value: |
**Is your feature request related to a problem? Please describe.**
- A clear and concise description of what the problem is. Ex. I wish I could use CUTLASS to do [...]
+ A clear and concise description of what the problem is. Ex. I wish I could use SYCL*TLA to do [...]
**Describe the solution you'd like**
A clear and concise description of what you want to happen.
diff --git a/.github/ISSUE_TEMPLATE/submit_question.md b/.github/ISSUE_TEMPLATE/submit_question.md
index 5aa2a672d2..efd4322849 100644
--- a/.github/ISSUE_TEMPLATE/submit_question.md
+++ b/.github/ISSUE_TEMPLATE/submit_question.md
@@ -1,6 +1,6 @@
---
name: Submit question
-about: Ask a general question about CUTLASS
+about: Ask a general question about SYCL*TLA
title: "[QST]"
labels: "? - Needs Triage, question"
assignees: ''
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
new file mode 100644
index 0000000000..ac43cf77b9
--- /dev/null
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1,19 @@
+## Description
+
+
+## Type
+- [ ] Bug - [ ] Feature - [ ] Performance - [ ] Refactor
+
+## Testing
+- [ ] Tests pass - [ ] Xe12 - [ ] Xe20
+
+## Performance
+| Metric | Before | After |
+|--------|--------|-------|
+| | | |
+
+## References
+Fixes #
+
+## Checklist
+- [ ] Copyright - [ ] Copilot Review - [ ] Deprecated APIs not used
diff --git a/.github/PULL_REQUEST_TEMPLATE/bug_fix.md b/.github/PULL_REQUEST_TEMPLATE/bug_fix.md
new file mode 100644
index 0000000000..ce123ee50e
--- /dev/null
+++ b/.github/PULL_REQUEST_TEMPLATE/bug_fix.md
@@ -0,0 +1,20 @@
+## Bug
+
+
+Severity:
+
+## Root Cause
+
+
+## Fix
+
+
+## Verification
+Before:
+After:
+
+## Testing
+- [ ] Regression/Units test
+- [ ] Tests pass
+
+## Details
diff --git a/.github/PULL_REQUEST_TEMPLATE/feature.md b/.github/PULL_REQUEST_TEMPLATE/feature.md
new file mode 100644
index 0000000000..42e6f25d9f
--- /dev/null
+++ b/.github/PULL_REQUEST_TEMPLATE/feature.md
@@ -0,0 +1,23 @@
+## Feature
+
+
+## Use Case
+
+
+## API
+```cpp
+// Signature
+```
+
+## Example
+```cpp
+// Usage
+```
+
+## Testing
+- [ ] Tests - [ ] Example - [ ] Docs
+
+## ToDo
+- [ ] Implement A
+- [ ] Implement B
+- [ ] Document
diff --git a/.github/PULL_REQUEST_TEMPLATE/performance.md b/.github/PULL_REQUEST_TEMPLATE/performance.md
new file mode 100644
index 0000000000..92a18959db
--- /dev/null
+++ b/.github/PULL_REQUEST_TEMPLATE/performance.md
@@ -0,0 +1,19 @@
+## Optimization
+
+
+## Profiling
+Tool:
+Bottleneck:
+
+## Results
+| Case | Before | After | Gain |
+|------|--------|-------|------|
+| | | | |
+
+## Changes
+
+
+## Testing
+- [ ] Tests pass - [ ] Xe12 - [ ] Xe20
+
+Related: #
diff --git a/.github/PULL_REQUEST_TEMPLATE/refactoring.md b/.github/PULL_REQUEST_TEMPLATE/refactoring.md
new file mode 100644
index 0000000000..63ead276f9
--- /dev/null
+++ b/.github/PULL_REQUEST_TEMPLATE/refactoring.md
@@ -0,0 +1,23 @@
+## Refactoring
+
+
+## Why
+
+
+## Changes
+
+
+## Preservation
+- [ ] Tests unchanged
+- [ ] Perf unchanged
+
+## Quality
+| Metric | Before | After |
+|--------|--------|-------|
+| LOC | | |
+| Performance | | |
+
+## ToDo
+- [ ] Implement A
+- [ ] Implement B
+- [ ] Document
diff --git a/.github/actions/install-intel-graphics/action.yml b/.github/actions/install-intel-graphics/action.yml
index 93aa9c3cdc..2d3a71b35d 100644
--- a/.github/actions/install-intel-graphics/action.yml
+++ b/.github/actions/install-intel-graphics/action.yml
@@ -17,20 +17,22 @@ runs:
run: |
shopt -s expand_aliases
which sudo || alias sudo=""
- if [[ "${{ inputs.GPU }}" == "BMG" ]]; then
+ if [[ "${{ inputs.GPU }}" == "BMG" || "${{ inputs.GPU }}" == "PVC" ]]; then
sudo add-apt-repository ppa:kobuk-team/intel-graphics
sudo apt update
else
- . /etc/os-release
- wget https://repositories.intel.com/gpu/ubuntu/dists/${VERSION_CODENAME}/intel-gpu-ubuntu-${VERSION_CODENAME}.run
- chmod +x intel-gpu-ubuntu-${VERSION_CODENAME}.run
- sudo ./intel-gpu-ubuntu-${VERSION_CODENAME}.run
- sudo apt install -y \
- intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
- libegl-mesa0 libegl1-mesa-dev libgl1-mesa-dev \
- libgles2-mesa-dev libigdgmm12 libxatracker2 mesa-va-drivers \
- mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo \
- libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev hwinfo
+ # LTS PVC drivers
+ # . /etc/os-release
+ # wget https://repositories.intel.com/gpu/ubuntu/dists/${VERSION_CODENAME}/intel-gpu-ubuntu-${VERSION_CODENAME}.run
+ # chmod +x intel-gpu-ubuntu-${VERSION_CODENAME}.run
+ # sudo ./intel-gpu-ubuntu-${VERSION_CODENAME}.run
+ # sudo apt install -y \
+ # intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
+ # libegl-mesa0 libegl1-mesa-dev libgl1-mesa-dev \
+ # libgles2-mesa-dev libigdgmm12 libxatracker2 mesa-va-drivers \
+ # mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo \
+ # libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev hwinfo
+ exit 1
fi
sudo apt-get install -y libze-intel-gpu1 libze-dev intel-metrics-discovery \
intel-opencl-icd ocl-icd-opencl-dev clinfo intel-gsc intel-ocloc g++
diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
new file mode 100644
index 0000000000..8ce2a9079d
--- /dev/null
+++ b/.github/copilot-instructions.md
@@ -0,0 +1,167 @@
+# Copilot Coding Agent Onboarding — SYCL*TLA
+
+Purpose
+-------
+This file is a short, focused onboarding guide so the Copilot coding agent can make correct, CI-safe changes to the SYCL*TLA repository without long exploratory searches. Keep edits conservative: prefer small, well-tested changes and follow the PR checklist described below.
+
+Top-level constraints (read first)
+---------------------------------
+- **Intel copyright headers**: Many files carry dual NVIDIA/Intel copyright headers. Do not remove or alter copyright headers on modified files.
+- **Intel Xe APIs**: The codebase uses new Intel "Xe" APIs (Xe12 for PVC, Xe20 for BMG) and Intel oneAPI toolchain conventions; prefer SYCL-compatible code and avoid adding CUDA-only code paths without explicit gating.
+- **CI Requirements**: Changes must build and pass CI workflows in `.github/workflows/*` (notably `intel_test.yml`, `intel_test_gpp_host.yml`, `sycl_python_test.yml`).
+- **Test Coverage**: Check for test coverage before making changes. C++ tests are in `test/unit/`, Python tests in `test/python/`.
+- **PR Descriptions**: Must include: what changed, why, local build/test steps performed, and expected CI/benchmark impact (see PR templates in `.github/PULL_REQUEST_TEMPLATE/`).
+
+Quick actions to always run locally before creating a PR
+------------------------------------------------------
+1. **ALWAYS source Intel environment first** (required for builds that target Intel compilers; if not available, CMake configure will still catch syntax errors but linking will fail):
+
+```bash
+source /opt/intel/oneapi/setvars.sh
+export CXX=icpx
+export CC=icx
+```
+
+2. **ALWAYS create a clean build directory** and configure for SYCL:
+
+```bash
+rm -rf build && mkdir build && cd build
+cmake .. -G Ninja \
+ -DCUTLASS_ENABLE_SYCL=ON \
+ -DDPCPP_SYCL_TARGET=intel_gpu_bmg_g21 \
+ -DCUTLASS_SYCL_RUNNING_CI=ON
+ninja
+```
+
+**Critical Notes:**
+- `-DDPCPP_SYCL_TARGET` must match your hardware: `intel_gpu_bmg_g21` for BMG (Arc B580), `intel_gpu_pvc` for PVC (Data Center Max). This affects intrinsic availability.
+- Build time: ~10-20 minutes for full build on 8-core machine.
+- If Intel oneAPI is not installed, CMake configure will still catch syntax errors but linking and target-specific checks will fail.
+- **NEVER commit without running a full build locally first**.
+
+Build / Test / Lint summary
+---------------------------
+- **Bootstrap**: No special bootstrap required. Python dependencies in `pyproject.toml` (`networkx`, `numpy`, `pydot`, `scipy`, `treelib`) are needed for Python tests. Install with `pip install -e .` in project root.
+- **Build**: Use CMake 3.22+ and Ninja (see commands above). **ALWAYS** run from clean build directory to avoid stale state.
+- **C++ Unit Tests**: After build, run `cmake --build . --target test_unit` (runs all unit tests in `test/unit/`).
+- **C++ Examples**: `cmake --build . --target test_examples` (builds and validates examples in `examples/`).
+- **Python Tests**:
+ ```bash
+ cd python
+ python3 -m pytest -q
+ ```
+  CI runs specific tests such as `test/python/cutlass/gemm/gemm_bf16_pvc.py`. **ALWAYS** set `export CUTLASS_USE_SYCL=1` before running Python tests.
+- **Linting**: No automated linter. Follow existing code style and ensure `-Werror` flag passes (set in CI).
+
+**Environment Variables Required for Runtime:**
+```bash
+export ONEAPI_DEVICE_SELECTOR=level_zero:gpu
+export IGC_ExtraOCLOptions="-cl-intel-256-GRF-per-thread"
+export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file -gline-tables-only"
+export IGC_VectorAliasBBThreshold=100000000000
+```
+These are set in CI workflows and should be set locally for accurate testing.
+
+Common failure modes & mitigations
+---------------------------------
+- **Missing Intel environment**: builds fail at link time or report unknown compilers. Mitigation: source `/opt/intel/oneapi/setvars.sh`, or unset `CXX`/`CC` to use system compilers for syntax-only checks.
+- **Wrong SYCL target**: some intrinsics are target-specific (e.g., 2D block prefetch intrinsics). Match the CI target or use conservative code paths.
+- **Layout constraints in Intel Xe epilogues** (ColumnMajor/RowMajor): prefer to reuse existing epilogue code and tests to avoid violating layout constraints. If making changes, run the affected tests locally.
+- **Missing libraries in LD_LIBRARY_PATH** for runtime: set `LD_LIBRARY_PATH` to include `build/tools/library` when running `python` tests that load `.so` wrappers.
+- **CMake cache issues**: If you see unexpected build behavior, **ALWAYS** delete `build/` completely and reconfigure. Stale CMake cache causes many hard-to-debug issues.
+- **Python import errors**: If Python tests fail with import errors, run `pip install -e .` from project root first.
+
+CI and validation pipelines (what will run)
+-------------------------------------------
+- See `.github/workflows/` for exact pipelines. Most important:
+ - `intel_test.yml` — primary CI build for Intel targets
+  - `intel_test_gpp_host.yml` — g++ host compiler builds
+ - `sycl_python_test.yml` — Python test workflow
+ - `nvidia_test.yml` / `cuda_test.yml` — CUDA-targeted tests (keep changes SYCL-first unless explicitly modifying CUDA paths)
+
+How the agent should validate its changes
+-----------------------------------------
+1. Run a local CMake configure and build (fast smoke test):
+
+```bash
+rm -rf build && mkdir build && cd build
+cmake .. -G Ninja -DCUTLASS_ENABLE_SYCL=ON -DDPCPP_SYCL_TARGET=intel_gpu_bmg_g21 -DCUTLASS_SYCL_RUNNING_CI=ON
+ninja -k 0
+```
+
+2. Run the Python test subset that touches modified components (or all Python tests if the change is cross-cutting):
+
+```bash
+cd python
+python3 -m pytest -q
+```
+
+3. For C++ kernel changes, run unit tests: `cmake --build . --target test_unit -j 8`
+
+4. For examples changes, run: `cmake --build . --target test_examples -j 1`
+
+What to include in the PR description
+--------------------------------------------------
+- Short summary of change and the files modified.
+- Build steps executed locally (CMake + Ninja commands, environment variables set).
+- Tests run and their results (include pytest subset names and pass/fail counts).
+- If the change affects performance or kernel selection, include expected performance impact and a short benchmark (size and results).
+- State whether the Intel oneAPI environment was required to fully validate the change.
+
+Project layout (quick map)
+--------------------------
+- Root files: `CMakeLists.txt`, `README.md`, `CHANGELOG-SYCL.md`, `SYCL.cmake`, `pyproject.toml`
+- Major directories:
+ - `include/` — core headers and kernel templates
+ - `python/` — Python wrapper, generator, tests
+ - `examples/` — usage examples, e.g., `11_xe20_cutlass_library`
+ - `test/` — C++ tests and validation kernels
+ - `tools/` — build/test utilities
+ - `media/` — documentation and architecture notes (search `media/docs/cpp/xe_rearchitecture.md`, `media/docs/python/xe_cutlass_library.md`)
+
+Files the agent should inspect when making changes
+--------------------------------------------------
+- `python/cutlass_library/generator.py` — kernel generation and filtering logic
+- `python/cutlass_library/arch_constants.py` — architecture detection and constants
+- `include/cutlass/gemm/kernel/*` — GEMM kernel implementations
+- `.github/workflows/*` — CI steps; ensure changes don't break these workflows
+
+Search tips
+-----------
+- Use `grep -R "HACK\|TODO\|WORKAROUND\|FIXME"` to find fragile areas.
+- Search for `intel` and `Xe` keywords to find Intel-specific code paths.
+
+Testing coverage
+----------------
+- The repo contains Python tests in `python/` and C++ tests under `test/`.
+- Before assuming full coverage, run the test suite locally and include failing tests in your PR notes.
+
+Special rules for changes
+-------------------------
+- Keep changes minimal and well-scoped. If modifying kernel selection or architecture constants, include tests or fallbacks.
+- Preserve Intel copyright headers.
+- Avoid introducing CUDA-only code paths in SYCL code.
+
+When to run a wider search
+--------------------------
+Trust these instructions first. Only perform a broad code search if:
+- The instructions are clearly missing information for the requested change, or
+- A test or build step fails unexpectedly after following these steps.
+
+Where to look for help
+----------------------
+- `README.md` and `media/docs/*` for architecture details
+- `.github/workflows/` for CI expectations
+- Open issues and PR templates in `.github/ISSUE_TEMPLATE` and `.github/PULL_REQUEST_TEMPLATE`
+
+Short checklist before opening a PR
+-----------------------------------
+- [ ] Build configured and compiled locally (or syntax-checked if environment unavailable)
+- [ ] Relevant tests run locally and passed
+- [ ] PR description includes steps and validation results
+- [ ] No removal of Intel copyright headers
+
+If anything here proves inaccurate
+----------------------------------
+Run the minimal searches you need, then update this file with the corrected steps so future agents benefit.
+
+---
+This file is intentionally short (<= 2 pages). For deeper onboarding, consult `README.md`, `media/docs/*`, and the workflows in `.github/workflows/`.
diff --git a/.github/instructions/copilot.instructions.md b/.github/instructions/copilot.instructions.md
new file mode 100644
index 0000000000..2ae263a078
--- /dev/null
+++ b/.github/instructions/copilot.instructions.md
@@ -0,0 +1,167 @@
+# GitHub Copilot Instructions for SYCL*TLA
+
+## Project Overview
+
+SYCL*TLA (SYCL Templates for Linear Algebra) is a high-performance C++ template library for tensor linear algebra operations on Intel GPUs using SYCL. This repository is forked from NVIDIA CUTLASS and extends CUTLASS and CuTe API support to Intel GPUs through SYCL enablement.
+
+## Key Technologies and Frameworks
+
+- **SYCL**: Primary parallel programming model for Intel GPU acceleration
+- **Intel oneAPI**: Compiler toolchain (icpx/icx) and runtime environment
+- **CUTLASS 3.x**: Template-based GEMM library architecture
+- **CuTe**: Compile-time tensor abstraction library
+- **Intel Xe Architecture**: Target hardware (BMG arch=20, PVC arch=12)
+- **CMake/Ninja**: Build system configuration
+- **Python**: High-level interface and testing framework
+
+## Architecture-Specific Guidelines
+
+### Intel Xe GPU Support
+- **Primary Targets**: Intel Data Center GPU Max/Flex Series (PVC) and Intel Arc B580 (BMG)
+- **Architecture Constants**: Use BMG (arch=20) for consumer GPUs, PVC (arch=12) for data center
+- **Compilation Targets**:
+ - BMG: `intel_gpu_bmg_g21`
+ - PVC: `intel_gpu_pvc`
+- **DPAS Operations**: Intel's Dot Product Accumulate Systolic instruction support
+
+### Layout Constraints
+- **Intel Xe Requirements**: Specific layout constraints for epilogue operations
+- **Memory Access Patterns**: 2D block operations with Intel GPU intrinsics
+- **Stride Requirements**: Use CuTe stride descriptors with int64_t types (see the sketch after this list)
+
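+The stride guidance above can be illustrated with a minimal, generic CuTe sketch (assumed usage of the public CuTe layout API, not code copied from SYCL*TLA kernels):
+
+```cpp
+#include <cute/tensor.hpp>  // CuTe layout/stride abstractions
+
+// Minimal sketch: describe an M x N row-major matrix with int64_t strides.
+// The exact layout types used inside SYCL*TLA kernels may differ.
+inline auto describe_layout(int64_t M, int64_t N) {
+  using namespace cute;
+  auto shape  = make_shape(M, N);
+  auto stride = make_stride(N, Int<1>{});  // leading dimension held as int64_t
+  return make_layout(shape, stride);       // row-major layout
+}
+```
+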
+## Code Style and Conventions
+
+### C++ Templates
+- Follow CUTLASS 3.x template parameter conventions
+- Use CuTe abstractions for tensor operations
+- Maintain compile-time optimization patterns
+- Template specializations for Intel Xe architecture features
+
+### SYCL-Specific Patterns
+- Use `sycl::queue` for device operations
+- Prefer `sycl::buffer` and `sycl::accessor` for memory management (see the sketch after this list)
+- Follow Intel GPU subgroup programming model
+- Use Intel GPU intrinsics for optimized operations
+
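+A minimal generic SYCL 2020 sketch of the queue/buffer/accessor pattern above (illustrative only, not code taken from SYCL*TLA):
+
+```cpp
+#include <sycl/sycl.hpp>
+#include <vector>
+
+// Minimal sketch: submit a kernel through a queue using buffer/accessor memory management.
+int main() {
+  sycl::queue q{sycl::gpu_selector_v};
+  std::vector<float> host(1024, 1.0f);
+  {
+    sycl::buffer<float, 1> buf{host.data(), sycl::range<1>{host.size()}};
+    q.submit([&](sycl::handler &cgh) {
+      sycl::accessor acc{buf, cgh, sycl::read_write};
+      cgh.parallel_for(sycl::range<1>{host.size()},
+                       [=](sycl::id<1> i) { acc[i] *= 2.0f; });
+    });
+  }  // buffer destruction synchronizes and copies results back to host
+  return 0;
+}
+```
+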
+### File Organization
+- **Headers**: `include/cutlass/` for core templates
+- **Kernels**: Architecture-specific implementations in `include/cutlass/gemm/kernel/`
+- **Examples**: `examples/` with Intel Xe demonstrations
+- **Python**: `python/cutlass_library/` for high-level interface
+- **Tests**: `test/` with Intel GPU validation
+
+## Development Workflow
+
+### Environment Setup
+```bash
+# Source Intel oneAPI environment
+source /opt/intel/oneapi/setvars.sh
+
+# Configure for Intel GPU compilation
+export CXX=icpx
+export CC=icx
+```
+
+### Build Configuration
+- Use CMake with `-DCUTLASS_ENABLE_SYCL=ON`
+- Specify target architecture: `-DDPCPP_SYCL_TARGET=intel_gpu_bmg_g21`
+- Enable CI mode for testing: `-DCUTLASS_SYCL_RUNNING_CI=ON`
+
+### Testing Patterns
+- Validate on both BMG and PVC architectures when possible
+- Include mixed-precision test cases (FP16, BF16, FP8)
+- Test epilogue fusion operations
+- Verify against reference implementations
+
+## Common Anti-Patterns to Avoid
+
+### Performance Issues
+- **Avoid**: Unnecessary template instantiations
+- **Avoid**: Non-coalesced memory access patterns
+- **Avoid**: Suboptimal tile sizes for Intel Xe
+- **Avoid**: Missing architecture-specific optimizations
+
+### SYCL-Specific Issues
+- **Avoid**: CUDA-specific code paths in SYCL builds
+- **Avoid**: Hardcoded NVIDIA architecture assumptions
+- **Avoid**: Missing Intel GPU intrinsic availability checks
+- **Avoid**: Incorrect compilation target specifications
+
+### Template Code Issues
+- **Avoid**: Template parameter naming conflicts with CUTLASS conventions
+- **Avoid**: Missing SFINAE constraints for Intel Xe specializations
+- **Avoid**: Breaking CuTe compile-time optimization patterns
+
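+A hypothetical sketch of the SFINAE point above; `XeArchTag` and `XeOnlyKernel` are placeholder names for illustration, not actual SYCL*TLA types:
+
+```cpp
+#include <type_traits>
+
+// Placeholder architecture tag, for illustration only.
+struct XeArchTag {};
+
+// Constrain a specialization so it only participates for the intended architecture tag.
+template <class ArchTag,
+          class = std::enable_if_t<std::is_same_v<ArchTag, XeArchTag>>>
+struct XeOnlyKernel {
+  // Xe-specific implementation would live here.
+};
+```
+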
+## Key Files and Components
+
+### Core Templates
+- `include/cutlass/gemm/kernel/xe_gemm_universal.hpp`: Intel Xe GEMM kernels
+- `include/cutlass/epilogue/threadblock/xe_epilogue.hpp`: Intel Xe epilogue operations
+- `include/cutlass/arch/arch.h`: Architecture detection and constants
+
+### Python Interface
+- `python/cutlass_library/generator.py`: Kernel generation logic
+- `python/cutlass_library/arch_constants.py`: Architecture configuration
+- `python/cutlass_library/cutlass_test_wrapper_source.cpp`: C++ wrapper
+
+### Build System
+- `CMakeLists.txt`: Main build configuration
+- `SYCL.cmake`: SYCL-specific build rules
+- `.github/workflows/intel_test.yml`: CI configuration
+
+## Documentation Standards
+
+### Code Comments
+- Document Intel Xe-specific optimizations and constraints
+- Explain template parameter purposes and valid ranges
+- Include usage examples for complex template patterns
+- Reference Intel GPU programming guides where applicable
+
+### Commit Messages
+- Use conventional commit format: `type(scope): description`
+- Common types: `feat`, `fix`, `perf`, `refactor`, `test`, `docs`
+- Include architecture scope when relevant: `feat(xe20): add BMG GEMM support`
+
+### Pull Request Guidelines
+- Include performance impact analysis for kernel changes
+- Test on multiple Intel GPU architectures when possible
+- Document any breaking changes to template interfaces
+- Provide before/after benchmarks for optimizations
+
+## Intel GPU-Specific Considerations
+
+### Memory Hierarchy
+- L3 cache optimization for Intel Xe
+- Shared local memory usage patterns (see the sketch after this list)
+- Global memory coalescing requirements
+
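+A minimal generic SYCL 2020 sketch of the shared local memory (SLM) pattern above; sizes are illustrative and not tuned for Xe, and `in` is assumed to be USM device memory:
+
+```cpp
+#include <sycl/sycl.hpp>
+
+// Minimal sketch: stage data in shared local memory via a local_accessor.
+// Assumes n is a multiple of the work-group size and `in` points to USM device memory.
+void slm_stage(sycl::queue &q, const float *in, size_t n) {
+  constexpr size_t wg = 64;  // work-group size, illustrative only
+  q.submit([&](sycl::handler &cgh) {
+    sycl::local_accessor<float, 1> slm{sycl::range<1>{wg}, cgh};
+    cgh.parallel_for(sycl::nd_range<1>{sycl::range<1>{n}, sycl::range<1>{wg}},
+                     [=](sycl::nd_item<1> it) {
+                       slm[it.get_local_id(0)] = in[it.get_global_id(0)];
+                       sycl::group_barrier(it.get_group());
+                       // the tile held in SLM can now be reused by the work-group
+                     });
+  });
+}
+```
+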
+### Compute Capabilities
+- DPAS instruction utilization
+- Subgroup size considerations (16 or 32; see the sketch after this list)
+- Matrix engine programming patterns
+
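+The subgroup-size consideration above can be pinned explicitly in generic SYCL 2020 code (a sketch under assumed sizes, not a SYCL*TLA kernel):
+
+```cpp
+#include <sycl/sycl.hpp>
+
+// Minimal sketch: request a fixed subgroup size for a kernel.
+void launch(sycl::queue &q) {
+  q.parallel_for(
+      sycl::nd_range<1>{sycl::range<1>{128}, sycl::range<1>{32}},
+      [=](sycl::nd_item<1> it) [[sycl::reqd_sub_group_size(16)]] {
+        auto sg = it.get_sub_group();
+        // subgroup-scope work (broadcast, reduction, MMA) goes here
+        (void)sg;
+      });
+}
+```
+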
+### Debugging and Profiling
+- Use Intel VTune for performance analysis
+- Intel Graphics Compiler optimization flags
+- SYCL profiling and debugging tools
+
+## Integration Points
+
+### Python Ecosystem
+- PyTorch XPU backend compatibility
+- NumPy array interface support
+- dpctl device management integration
+
+### Intel oneAPI Ecosystem
+- oneMKL integration for reference implementations
+- Level Zero runtime compatibility
+- Intel GPU driver requirements
+
+## Version and Compatibility
+
+- **SYCL Version**: Follow Intel SYCL implementation standards
+- **Intel oneAPI**: Support latest LTS and current releases
+- **Python**: Maintain compatibility with Python 3.8+
+- **CMake**: Minimum version 3.18 for SYCL support
+
+When reviewing or contributing code, always consider the Intel GPU architecture implications, SYCL best practices, and template library design patterns that maintain the high-performance characteristics of the CUTLASS foundation while enabling optimal execution on Intel hardware.
\ No newline at end of file
diff --git a/.github/workflows/intel_test.yml b/.github/workflows/intel_test.yml
index 09bcd640ab..e32c8128e3 100644
--- a/.github/workflows/intel_test.yml
+++ b/.github/workflows/intel_test.yml
@@ -43,16 +43,22 @@ jobs:
gpu: BMG
intel_graphics: ROLLING
sycl_target: intel_gpu_bmg_g21
+ igc_version_major: 2
+ igc_version_minor: 18
runner: bmg108629-01
- compiler: RELEASE
gpu: PVC
intel_graphics: ROLLING
sycl_target: intel_gpu_pvc
+ igc_version_major: 2
+ igc_version_minor: 11
runner: pvc146162-01
- compiler: NIGHTLY
gpu: PVC
intel_graphics: ROLLING
sycl_target: intel_gpu_pvc
+ igc_version_major: 2
+ igc_version_minor: 11
runner: pvc146162-01
name: Run Intel ${{ matrix.compiler }} tests on ${{ matrix.gpu }} with intel-graphics ${{ matrix.intel_graphics }}
@@ -103,6 +109,8 @@ jobs:
cmake -G Ninja \
-DCUTLASS_ENABLE_SYCL=ON \
-DDPCPP_SYCL_TARGET=${{ matrix.sycl_target }} \
+ -DIGC_VERSION_MAJOR=${{ matrix.igc_version_major }} \
+ -DIGC_VERSION_MINOR=${{ matrix.igc_version_minor }} \
-DCMAKE_CXX_FLAGS="-Werror" \
-DCUTLASS_SYCL_RUNNING_CI=ON
cmake --build .
diff --git a/.github/workflows/intel_test_gpp_host.yml b/.github/workflows/intel_test_gpp_host.yml
index c75c9ad508..62c253324e 100644
--- a/.github/workflows/intel_test_gpp_host.yml
+++ b/.github/workflows/intel_test_gpp_host.yml
@@ -28,11 +28,15 @@ jobs:
gpu: BMG
intel_graphics: ROLLING
sycl_target: intel_gpu_bmg_g21
+ igc_version_major: 2
+ igc_version_minor: 18
runner: bmg108629-01
- compiler: RELEASE
gpu: PVC
intel_graphics: ROLLING
sycl_target: intel_gpu_pvc
+ igc_version_major: 2
+ igc_version_minor: 11
runner: pvc146162-01
@@ -83,9 +87,12 @@ jobs:
cmake -G Ninja \
-DCUTLASS_ENABLE_SYCL=ON \
-DDPCPP_SYCL_TARGET=${{ matrix.sycl_target }} \
+ -DIGC_VERSION_MAJOR=${{ matrix.igc_version_major }} \
+ -DIGC_VERSION_MINOR=${{ matrix.igc_version_minor }} \
-DCUTLASS_SYCL_RUNNING_CI=ON \
-DCMAKE_CXX_FLAGS="-Werror" \
- -DDPCPP_HOST_COMPILER=g++-13
+ -DDPCPP_HOST_COMPILER=g++-13 \
+ -DDPCPP_HOST_COMPILER_OPTIONS="-Werror"
cmake --build .
- name: Unit test
diff --git a/CHANGELOG-SYCL.md b/CHANGELOG-SYCL.md
index df341ceb58..08ad066fc3 100644
--- a/CHANGELOG-SYCL.md
+++ b/CHANGELOG-SYCL.md
@@ -1,6 +1,46 @@
-# CUTLASS SYCL Changelog
+# SYCL*TLA (previously referred to as cutlass-sycl) Changelog
-## [CUTLASS SYCL 0.5](https://github.com/intel/cutlass-sycl/releases/tag/v0.5) (2025-09-26)
+## [SYCL*TLA 0.6](https://github.com/intel/sycl-tla/releases/tag/v0.6) (2025-11-03)
+### Major Architecture Changes
+- **Flash Attention Reimplementation ([#d02c58b](https://github.com/intel/sycl-tla/commit/d02c58b4))**: Complete rewrite of Flash Attention using new Xe atoms
+ - Enhanced performance with optimized memory access patterns
+ - Better integration with Intel Xe hardware capabilities
+- **CUTLASS Library Generation ([#578](https://github.com/intel/sycl-tla/pull/578))**: Full support for CUTLASS library generation and operations
+ - New Xe architecture support in library generation pipeline
+ - Automated kernel instantiation and compilation support
+
+### Enhancements
+- **Python Operations Support ([#595](https://github.com/intel/sycl-tla/pull/595))**: Enhanced Python bindings with comprehensive test coverage
+ - Improved Python API stability and usability
+ - Enhanced test framework for Python operations
+- **CuTe Subgroup Extensions**: New subgroup-scope operations for Intel Xe
+ - Subgroup broadcast and reduction operations ([#9a6aa27](https://github.com/intel/sycl-tla/commit/9a6aa27c))
+ - `make_subgroup_tensor` helpers for improved tensor manipulation ([#21fb89a](https://github.com/intel/sycl-tla/commit/21fb89a8))
+- **Enhanced 2D Copy Operations**: Extended block 2D copy functionality
+ - New `make_block_2d_copy_{C,D}` variants with subtiling support ([#48d82e8](https://github.com/intel/sycl-tla/commit/48d82e87))
+ - Support for size-1 fragments in block 2D copies ([#2212f1b](https://github.com/intel/sycl-tla/commit/2212f1b9))
+- **4-bit VNNI Reorders ([#593](https://github.com/intel/sycl-tla/pull/593))**: New 4-bit unit stride to VNNI reorder operations
+- **Batch GEMM with new APIs ([#540](https://github.com/intel/sycl-tla/pull/540))**: Enhanced Batch GEMM with new streamlined APIs
+- **Grouped GEMM with new APIs ([#574](https://github.com/intel/sycl-tla/pull/574))**: Enhanced grouped GEMM with new streamlined APIs
+
+### Test Improvements
+- **Python Test Coverage**: Comprehensive test suite improvements for Python operations
+- **CI Infrastructure**: Enhanced continuous integration with PVC driver updates ([#575](https://github.com/intel/sycl-tla/pull/575))
+- **Code Reorganization**: Renamed `python/cutlass` to `python/cutlass_cppgen` for clarity ([#587](https://github.com/intel/sycl-tla/pull/587))
+
+### Bug Fixes
+- **Epilogue Data Type Fixes**:
+ - Fixed trD compute type in Xe Epilogue ([#580](https://github.com/intel/sycl-tla/pull/580))
+ - Resolved epilogue data type mismatches ([#563](https://github.com/intel/sycl-tla/pull/563))
+- **CuTe Copy(new APIs) Improvements**: Multiple fixes for Xe copy operations ([#dec36a9](https://github.com/intel/sycl-tla/commit/dec36a9e))
+- **Split Barrier Refactoring**: Improved split barrier functionality for better reliability ([#521dfcd](https://github.com/intel/sycl-tla/commit/521dfcd4))
+
+### Notes and Known Issues
+- Python operations for FP8 and INT8 are not generated for the CUTLASS library in this release.
+- Unit tests and benchmark tests are not yet migrated to the newly re-architected CuTe APIs.
+
+
+## [SYCL*TLA 0.5](https://github.com/intel/cutlass-sycl/releases/tag/v0.5) (2025-09-26)
### Major Architecture Changes
- **Xe Rearchitecture ([#477](https://github.com/intel/cutlass-sycl/pull/477))**: Complete redesign of Xe CuTe atoms with new architecture
- New MMA atoms for improved performance
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7a897f95bd..4d4e8e7599 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,34 +2,93 @@
# CUTLASS 4.x
-## [4.2.0](https://github.com/NVIDIA/cutlass/tree/main) (2025-08-21)
+## [4.2.1](https://github.com/NVIDIA/cutlass/releases/tag/v4.2.1) (2025-09-22)
### CuTe DSL
-* We will likely be skipping 4.2.dev release and directly target 4.2.
-* CuTeDSL version remains at 4.1.0 till then.
+* Bug fixes and improvements
+ - Fixed an issue when running DSL codes with cuda-python 13.0
+ - Fixed an issue when running inductor with DSL codes
+ - Fixed an issue with unexpected logging when running DSL codes in FlashInfer
+ - Fixed the issue reported in https://github.com/NVIDIA/cutlass/issues/2647
+  - Fixed an issue with conditional definition of variables outside of dynamic control flow
### CUTLASS C++
-* Add K major scale factor support for Hopper SM90 blockwise kernels.
+* Bypass EVT for nosmem blockwise kernels on Blackwell.
+* Rename cutlass/python/cutlass directory to cutlass/python/cutlass_cppgen.
+
+## [4.2.0](https://github.com/NVIDIA/cutlass/releases/tag/v4.2.0) (2025-09-15)
+
+### CuTe DSL
+* More Python versions are now supported for both x86-64 and aarch64, including
+ - Python 3.10, 3.11, 3.12, and 3.13
+* Added new example and updated notebook to get started with CuTe DSL
+ - [Call kernels with dlpack bypassed](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/ampere/call_bypass_dlpack.py)
+ - Updates on [TensorSSA demonstration](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/notebooks/tensorssa.ipynb)
+ + Added a section for introducing the broadcast
+* API updates
+ - Please refer to [DSL API changelog](https://docs.nvidia.com/cutlass/media/docs/pythonDSL/cute_dsl_api/changelog.html) for details
+* Bug fixes and improvements
+ - Fixed ``cute.print_tensor`` for coordinate tensor
+ - Fixed `cute.print` for tuple of layouts
+ - Fixed frozen object is not properly updated after fully assigned in dynamic control flow
+ - Fixed assign tuple/list element in a dynamic control flow may cause compilation failure
+ - Improved error message when CUDA context is not initialized
+ - Improved docstring of congruent and weakly_congruent
+
+### CUTLASS C++
+* Support for Blackwell SM103 kernels for B300 GPUs.
+ - Collective mainloop codes: [Blockscaled datatypes with support for dense GEMM mainloop](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/collective/sm103_blockscaled_mma_warpspecialized.hpp)
+ - New [GEMM](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/dispatch_policy.hpp) and [epilogue](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/dispatch_policy.hpp) dispatch policies for collectives, kernel layers, and builders.
+ - Kernel codes: [Blockscaled datatypes with support for dense GEMM kernel](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/kernel/sm103_blockscaled_gemm_tma_warpspecialized.hpp).
+* Set of examples that demonstrate the usage of the 3.x API for targeting Blackwell SM103 architecture:
+ - [Blockscaled ultra fp4 dense GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/89_sm103_fp4_ultra_gemm/).
+ - [Blockscaled ultra fp4 dense grouped GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/90_sm103_fp4_ultra_grouped_gemm).
+* Set of unit tests that demonstrate the usage of Blackwell SM103 blockscaled GEMM
+ - Unit test files with prefix name of `sm103_` under [GEMM device unit tests](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/).
+* Support for Blackwell SM121 kernels for DGX Spark GPUs.
+ - Share the major codes with Blackwell SM120 kernels.
+* Add support for heuristics-based kernel filtering and autotuning using `nvidia-matmul-heuristics` to find the best kernels for a given scenario.
+ - Details please refer to [heuristics doc](https://github.com/NVIDIA/cutlass/tree/main/media/docs/cpp/heuristics.md).
* Further enhance Blackwell SM100 Attention kernels in [example 77](https://github.com/NVIDIA/cutlass/tree/main/examples/77_blackwell_fmha/).
- Add fused reduction kernel support for cutlass MLA.
+ - Add softmax skip correction.
+ - Support for GQA in FMHA backward kernel.
- Fix an issue where `get_unmasked_trip_count` may return a negative value.
- Fix an issue where mbarriers are initialized with a zero arrival count.
-* Add Blackwell SM120 blockwise gemm kernel example: [example 87](https://github.com/NVIDIA/cutlass/tree/main/87_blackwell_geforce_gemm_blockwise/).
-* Support for Blackwell SM100 cpasync kernel.
- - Collective mainloop codes: [cpasync mainloop](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/collective/sm100_mma_cpasync_warpspecialized.hpp).
- - Kernel codes: [cpasync kernel](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/kernel/sm100_gemm_cpasync_warpspecialized.hpp).
-* Support for Blackwell SM121 kernels for DGX Spark GPUs.
- - Share the major codes with Blackwell SM120 kernels.
+ - Fix a corner case issue where the sequence length of q is not a multiple of tile_q.
+ - Remove tma padding for forward kernel inputs.
+* Add Blackwell SM100 kernels for MoEs (focusing on Low-Latency inference performance): [example 92](https://github.com/NVIDIA/cutlass/tree/main/examples/92_blackwell_moe_gemm/). It uses TMA (for weights) and CPASYNC (for tokens) to load input matrices and allow only one problem dimension to vary across groups/experts, unlike general Grouped GEMMs. Note: further API simplifications and kernel improvements are upcoming. Any feedback on API is welcome.
+* Further enhance blockwise and groupwise GEMMs on Hopper and Blackwell
+ - On Blackwell SM120, a blockwise gemm kernel is added: [example 87](https://github.com/NVIDIA/cutlass/tree/main/examples/87_blackwell_geforce_gemm_blockwise/).
+ - On Hopper, add K major scale factor support for SM90 blockwise kernels.
+ - On Hopper, relax the restriction that the k dimension of the problem size has to be the multiple of the k dimension of the tile size.
+ - On Hopper, grouped version supports the case when k = 0.
+* Support for Blackwell SM100 fp4 gemv kernels.
+ - Kernel codes: [Gemv kernel](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/kernel/gemv_blockscaled.h).
+ - Example codes: [example 91](https://github.com/NVIDIA/cutlass/tree/main/examples/91_fp4_gemv/)
* Support for Blackwell SM100 legacy mixed input GEMM kernels.
- Collective mainloop codes: [Mixed input mainloop](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/collective/sm100_mma_warpspecialized_mixed_input.hpp).
- Kernel codes: [Mixed input kernel](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized_mixed_input_transform.hpp).
- Example codes: [example 86](https://github.com/NVIDIA/cutlass/tree/main/examples/86_blackwell_mixed_dtype_gemm/).
-* Support for Blackwell SM100 fp4 gemv kernels.
- - Kernel codes: [Gemv kernel](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/kernel/gemv_blockscaled.h).
- - Example codes: [example 91](https://github.com/NVIDIA/cutlass/tree/main/examples/91_fp4_gemv/)
+* Support for Blackwell SM100 cpasync kernel.
+ - Collective mainloop codes: [cpasync mainloop](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/collective/sm100_mma_cpasync_warpspecialized.hpp).
+ - Kernel codes: [cpasync kernel](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/kernel/sm100_gemm_cpasync_warpspecialized.hpp).
+* Support Blackwell SM120 mixed input blockscaled grouped GEMM.
+* Instantiating more Blackwell kernels in profiler.
+ - Blackwell SM100 and SM103 kernels support `CUTLASS_LIBRARY_INSTANTIATION_LEVEL` to instantiate all possible combinations.
+ - To use this feature, `CUTLASS_LIBRARY_KERNELS` must be non-empty. Profiler will combine `CUTLASS_LIBRARY_KERNELS` and `CUTLASS_LIBRARY_INSTANTIATION_LEVEL` to instantiate specific kernels.
+ - Details please check [Profiler Doc](https://github.com/NVIDIA/cutlass/tree/main/media/docs/cpp/profiler.md).
+* Fix some profiler issues:
+  - Modify default cluster callback values to non-zero to avoid profiler failure when these values are not set on the command line.
+ - Fix some no output and timeout issues.
+ - Fix Pingpong Blockwise Hopper library generation.
* From CUDA 13.0, the Blackwell SM101 for Thor GPUs is renamed to SM110.
- For CUDA toolkit version < 13.0, SM101 is still used for Thor GPUs.
- For CUDA toolkit version >= 13.0, SM110 is used for Thor GPUs and SM101 is no longer valid.
+* Rename legacy Python API package from `cutlass` to `cutlass_cppgen` and add Blackwell EVT support to legacy Python interface.
+ - Restructuring the C++ Blackwell SM100 Collective Epilogue Builder to work with the Python interface's `EpilogueDescriptors`.
+ - Added Blackwell SM100 EVT Emitter on the Python side and routed most emission through Hopper SM90 Emitter.
+ - Added some support for running SM100 kernels via the Python interface.
* CuTe changes:
- Fix inaccurate GridDim calculation under [CuTe tutorial](https://github.com/NVIDIA/cutlass/tree/main/examples/cute/tutorial/blackwell/).
- Add [movmatrix](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-movmatrix) support.
@@ -38,18 +97,15 @@
- Shorten `nullspace` implementation.
- Isolate and comment on `cosize` hacks.
- Important documentation correction: `E<0,1> == 1@0@1`.
-* Add support for heuristics-based kernel filtering and autotuning using `nvidia-matmul-heuristics`.
- - Details please refer to [heuristics doc](https://github.com/NVIDIA/cutlass/tree/main/media/docs/cpp/heuristics.md).
-* Rename legacy Python API package from `cutlass` to `cutlass_cppgen`.
-* Fix some profiler issues:
- - Modify default cluster callback values to none 0 to avoid profiler failure when these values are not set in command line.
- - Fix some no output and timeout issues.
+* Fix some kernel issues:
+ - Fix Hopper SM90 group gemm kernel to only use the commit group and wait group instead of also waiting on mbarriers.
+ - Fix a tiny bug when K is large for Blackwell SM103 fp4 grouped GEMM kernel.
* Add following unit tests:
- [fp16 accmulator for sm89 fp8 mma](https://github.com/NVIDIA/cutlass/tree/main/test/unit/cute/ampere/cooperative_gemm.cu)
- [movmatrix test](https://github.com/NVIDIA/cutlass/tree/main/test/unit/cute/turing/movm.cu)
- [fp8 narrow mma n](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm100_tensorop_gemm/f16_f16_void_f32_narrow_mma_n.cu) and [fp16 narrow mma n](test/unit/gemm/device/sm100_tensorop_gemm/f8_f8_void_bf16_narrow_mma_n.cu)
* Various improvements and fixes from the community and CUTLASS team. Thanks to everyone who submitted PRs!
-* Optimal code generation with CUDA toolkit versions 13.0.
+* Optimal code generation with CUDA toolkit versions 13.0U1.
## [4.1.0](https://github.com/NVIDIA/cutlass/releases/tag/v4.1.0) (2025-07-16)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f0110c4d71..c9f974cac2 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -125,24 +125,50 @@ option(CUTLASS_SYCL_BUILTIN_ENABLE "Enable this option to use builtin functions
if (CUTLASS_ENABLE_SYCL)
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
- if(DPCPP_SYCL_TARGET STREQUAL "nvptx64-nvidia-cuda")
+ string(REPLACE "," ";" DPCPP_SYCL_TARGET_LIST "${DPCPP_SYCL_TARGET}")
+ message(STATUS "DPCPP_SYCL_TARGET as list ${DPCPP_SYCL_TARGET_LIST}")
+
+ if("nvptx64-nvidia-cuda" IN_LIST DPCPP_SYCL_TARGET_LIST)
set(SYCL_NVIDIA_TARGET ON)
add_compile_definitions(SYCL_NVIDIA_TARGET)
- elseif(DPCPP_SYCL_TARGET STREQUAL "intel_gpu_pvc" OR
- DPCPP_SYCL_TARGET STREQUAL "spir64" OR
- DPCPP_SYCL_TARGET STREQUAL "intel_gpu_bmg_g21")
- set(SYCL_INTEL_TARGET ON)
- add_compile_definitions(SYCL_INTEL_TARGET)
- add_compile_options(-Wall
- -Wno-unused-variable
- -Wno-unused-local-typedef
- -Wno-unused-but-set-variable
- -Wno-uninitialized
- -Wno-reorder-ctor
- -Wno-logical-op-parentheses
- -Wno-unused-function
- -Wno-unknown-pragmas
- )
+ list(REMOVE_ITEM DPCPP_SYCL_TARGET_LIST "nvptx64-nvidia-cuda")
+ endif()
+
+ set(INTEL_SYCL_TARGETS
+ intel_gpu_pvc
+ spir64
+ intel_gpu_bmg_g21
+ bmg
+ pvc
+ )
+
+ foreach(ITEM IN LISTS DPCPP_SYCL_TARGET_LIST)
+ if(ITEM IN_LIST INTEL_SYCL_TARGETS)
+ set(SYCL_INTEL_TARGET ON)
+ else()
+ message(FATAL_ERROR "In SYCL target list '${DPCPP_SYCL_TARGET}' ${ITEM} is invalid.")
+ endif()
+ endforeach()
+
+ if (SYCL_NVIDIA_TARGET AND SYCL_INTEL_TARGET)
+ message(FATAL_ERROR "Both SYCL_NVIDIA_TARGET and SYCL_INTEL_TARGET are set. Only one is allowed.")
+ elseif (NOT SYCL_NVIDIA_TARGET AND NOT SYCL_INTEL_TARGET)
+ message(FATAL_ERROR "Both SYCL_NVIDIA_TARGET and SYCL_INTEL_TARGET are unset.")
+ endif()
+
+ if(SYCL_INTEL_TARGET)
+ add_compile_definitions(SYCL_INTEL_TARGET)
+ add_compile_options(
+ -Wall
+ -Wno-unused-variable
+ -Wno-unused-local-typedef
+ -Wno-unused-but-set-variable
+ -Wno-uninitialized
+ -Wno-reorder-ctor
+ -Wno-logical-op-parentheses
+ -Wno-unused-function
+ -Wno-unknown-pragmas
+ )
endif()
add_compile_definitions(CUTLASS_ENABLE_SYCL)
@@ -158,7 +184,6 @@ if (CUTLASS_ENABLE_SYCL)
add_compile_definitions(CUTLASS_SYCL_BUILTIN_ENABLE)
endif()
- include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/onemkl.cmake)
endif()
find_package(Doxygen QUIET)
@@ -201,7 +226,7 @@ set(CUTLASS_ENABLE_HEADERS_ONLY OFF CACHE BOOL "Enable only the header library")
if(CUTLASS_ENABLE_HEADERS_ONLY)
set(CUTLASS_ENABLE_EXAMPLES_INIT OFF)
- set(CUTLASS_ENABLE_TOOLS_INIT ON)
+ set(CUTLASS_ENABLE_TOOLS_INIT OFF)
set(CUTLASS_ENABLE_LIBRARY_INIT OFF)
set(CUTLASS_ENABLE_TESTS_INIT OFF)
else()
@@ -213,6 +238,9 @@ else()
else()
set(CUTLASS_ENABLE_TESTS_INIT OFF)
endif()
+
+ # Not include MKL when headers only
+ include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/onemkl.cmake)
endif()
set(CUTLASS_TEST_UNIT_ENABLE_WARNINGS OFF CACHE BOOL "Enable warnings on waived unit tests.")
@@ -417,7 +445,7 @@ set(CUTLASS_LIBRARY_OPERATIONS "all" CACHE STRING "Comma-delimited list of opera
set(CUTLASS_LIBRARY_KERNELS ${CUTLASS_LIBRARY_KERNELS_INIT} CACHE STRING "Comma-delimited list of kernel name filters. If unspecified, only the largest tile size is enabled. If the string 'all' is specified, all kernels are enabled.")
set(CUTLASS_LIBRARY_IGNORE_KERNELS "" CACHE STRING "Comma-delimited list of kernels to exclude from build. This option ONLY takes effect if CUTLASS_LIBRARY_KERNELS is set.")
set(CUTLASS_LIBRARY_EXCLUDE_KERNELS "" CACHE STRING "Comma-delimited list of kernels to exclude from build. This option always takes effect, whether or not CUTLASS_LIBRARY_KERNELS is set. It also can exclude kernels from the filter file (see KERNEL_FILTER_FILE).")
-set(CUTLASS_LIBRARY_INSTANTIATION_LEVEL "" CACHE STRING "Instantiation level for SM90 kernels. Set to `max` and make sure CUTLASS_LIBRARY_KERNELS is non-empty to stamp all possible kernel configurations.")
+set(CUTLASS_LIBRARY_INSTANTIATION_LEVEL "" CACHE STRING "Instantiation level for SM90 and SM100 kernels. Set to `max` and make sure CUTLASS_LIBRARY_KERNELS is non-empty to stamp all possible kernel configurations.")
if(CUTLASS_LIBRARY_INSTANTIATION_LEVEL OR CUTLASS_LIBRARY_HEURISTICS_PROBLEMS_FILE)
message(STATUS "Enable extended SM90 WGMMA instruction shapes for instantiation levels")
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index e6715dd490..0c725d3248 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -2,7 +2,7 @@
[README](./README.md#documentation) > **Contributors**
-# CUTLASS SYCL Developers **
+# SYCL*TLA Developers **
Alejandro Acosta
Amit Singh Chandel
diff --git a/README.md b/README.md
index 0f2e74ed4d..a4544af34a 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,15 @@

-# CUTLASS SYCL 0.5
+# SYCL\* Templates for Linear Algebra (SYCL\*TLA)
-**This repository fast-follows NVIDIA CUTLASS repository adding SYCL support for Intel GPUs.**
+**This repository is forked from the NVIDIA CUTLASS repository and extends CUTLASS and CuTe API support to Intel GPUs through SYCL enablement.**
+*This project was previously referred to as CUTLASS-SYCL; you may still see references to CUTLASS-SYCL in the code and documentation.*
+*For SYCL support instructions, refer to the [SYCL build documentation](./media/docs/cpp/build/building_with_sycl_support.md)*
-**For SYCL support instructions, refer to the [SYCL build documentation](./media/docs/cpp/build/building_with_sycl_support.md)**
+*SYCL is a trademark of the Khronos Group Inc. Other names and brands may be claimed as the property of others.*
+[](https://scorecard.dev/viewer/?uri=github.com/intel/sycl-tla)
-[](https://scorecard.dev/viewer/?uri=github.com/intel/cutlass-sycl)
-
-CUTLASS‑SYCL is a modular, header‑only C++ template framework for high‑performance
+SYCL\*TLA is a modular, header‑only C++ template framework for high‑performance
GEMM, and fused epilogue kernels. It applies hierarchical tiling, composable policy
abstractions, and efficient data‑movement primitives to build flexible, reusable
building blocks for dense linear algebra. The SYCL implementation brings those
@@ -16,13 +17,13 @@ optimizations to Intel GPUs with tuned kernels for modern execution units and me
hierarchies. It adds mixed‑precision and epilogue fusion pathways designed to
simplify integrating advanced quantization and post‑processing into custom pipelines.
-To support a wide variety of applications, CUTLASS-SYCL provides extensive
+To support a wide variety of applications, SYCL\*TLA provides extensive
support for mixed-precision computations on Intel hardware, providing
specialized data-movement and multiply-accumulate abstractions for FP64, FP32,
FP16, BF16, 8b floating point types (E5M2 and E4M3 for FP8), narrow integer
types (4 and 8b signed and unsigned integers with support for zero-point
quantization), and mixed-precision operations with tensor-wise, channel-wise,
-and group-wise quantization support. CUTLASS-SYCL demonstrates optimal matrix
+and group-wise quantization support. SYCL\*TLA demonstrates optimal matrix
multiply operations targeting Intel's programmable, high-throughput execution
units implemented in Intel Data Center GPU Max/Flex Series (Intel Xe
architecture, codename: Ponte-Vecchio) and Intel Arc B580 GPUs.
@@ -33,41 +34,48 @@ See the [functionality docs](./media/docs/cpp/functionality.md) for a more compr
list of kernel level features, data types, instructions, and minimum supported by CUTLASS on each GPU
architecture.
-Base NVIDIA CUTLASS Versions for CUTLASS-SYCL releases:
-| CUTLASS SYCL | NVIDIA CUTLASS |
+This project fast-follows NVIDIA CUTLASS releases to ensure parity of APIs and features.
+
+Base NVIDIA CUTLASS Versions for SYCL*TLA releases:
+| SYCL*TLA | NVIDIA CUTLASS |
|-----------------|----------|
|0.1| 3.9|
|0.2 | 3.9.2 |
|0.3 | 3.9.2 |
|0.5 | 4.2.0 |
+|0.6 | 4.2.0 |
-# What's New in CUTLASS SYCL 0.5
+# What's New in SYCL*TLA 0.6
+## [SYCL*TLA 0.6](https://github.com/intel/sycl-tla/releases/tag/v0.6) (2025-11-03)
### Major Architecture Changes
-- **Xe Rearchitecture ([#477](https://github.com/intel/cutlass-sycl/pull/477))**: Complete redesign of Xe CuTe atoms with new architecture
- - New MMA atoms for improved performance
- - Enhanced 2D copy atoms (loads, stores, prefetch with VNNI/transpose support)
- - New 2D copy helpers (low-level `make_block_2d_copy` and high-level `make_block_2d_copy_{A,B,C}`)
- - Generic and optimized reorder atoms for {int4, uint4, int8, uint8, e2m1, e4m3, e5m2} -> {half, bfloat16}
- - Requires IGC version [v2.18.5](https://github.com/intel/intel-graphics-compiler/releases/tag/v2.18.5) or later
-
-### New Features
-- **G++ Host Compiler Support ([#490](https://github.com/intel/cutlass-sycl/pull/490))**: Support for G++ 13 as host compiler
-- Migrated `syclcompat` to this repository as `cutlasscompat` for better compatibility
- - Fixed compilation issues when using G++ instead of clang++
- - Added new CI workflow for testing G++ host compiler builds
- - Enhanced build system to support `-DDPCPP_HOST_COMPILER=g++` option
-- **Grouped GEMM for Mixed Dtype ([#457](https://github.com/intel/cutlass-sycl/pull/457))**: Extended grouped GEMM support to mixed precision operations
- - Added support for BF16 + S8 mixed dtype grouped GEMM
- - Added support for FP16 + U4 mixed dtype grouped GEMM
- - New examples: `10_bmg_grouped_gemm_bf16_f16_s8.cpp` and `10_bmg_grouped_gemm_f16_u4.cpp`
+- **Flash Attention Reimplementation ([#d02c58b](https://github.com/intel/sycl-tla/commit/d02c58b4))**: Complete rewrite of Flash Attention using new Xe atoms
+ - Enhanced performance with optimized memory access patterns
+ - Better integration with Intel Xe hardware capabilities
+- **CUTLASS Library Generation ([#578](https://github.com/intel/sycl-tla/pull/578))**: Full support for CUTLASS library generation and operations
+ - New Xe architecture support in library generation pipeline
+ - Automated kernel instantiation and compilation support
+
+### Enhancements
+- **Python Operations Support ([#595](https://github.com/intel/sycl-tla/pull/595))**: Enhanced Python bindings with comprehensive test coverage
+ - Improved Python API stability and usability
+ - Enhanced test framework for Python operations
+- **CuTe Subgroup Extensions**: New subgroup-scope operations for Intel Xe
+ - Subgroup broadcast and reduction operations ([#9a6aa27](https://github.com/intel/sycl-tla/commit/9a6aa27c))
+ - `make_subgroup_tensor` helpers for improved tensor manipulation ([#21fb89a](https://github.com/intel/sycl-tla/commit/21fb89a8))
+- **Enhanced 2D Copy Operations**: Extended block 2D copy functionality
+ - New `make_block_2d_copy_{C,D}` variants with subtiling support ([#48d82e8](https://github.com/intel/sycl-tla/commit/48d82e87))
+ - Support for size-1 fragments in block 2D copies ([#2212f1b](https://github.com/intel/sycl-tla/commit/2212f1b9))
+- **4-bit VNNI Reorders ([#593](https://github.com/intel/sycl-tla/pull/593))**: New 4-bit unit stride to VNNI reorder operations
+- **Batch GEMM with new APIs ([#540](https://github.com/intel/sycl-tla/pull/540))**: Enhanced Batch GEMM with new streamlined APIs
+- **Grouped GEMM with new APIs ([#574](https://github.com/intel/sycl-tla/pull/574))**: Enhanced grouped GEMM with new streamlined APIs
**See the [CHANGELOG](CHANGELOG-SYCL.md) for details of all past releases and updates.**
# CuTe
-CUTLASS-SYCL supports the newly introducted core library, CuTe, to describe and manipulate tensors of threads and data.
-CuTe in CUTLASS-SYCL is a collection of C++ SYCL template abstractions for
+SYCL\*TLA supports the newly introduced core library, CuTe, to describe and manipulate tensors of threads and data.
+CuTe in SYCL\*TLA is a collection of C++ SYCL template abstractions for
defining and operating on hierarchically multidimensional layouts of threads and data.
CuTe provides `Layout` and `Tensor` objects that compactly package the type,
shape, memory space, and layout of data, while performing the complicated indexing for the user.
@@ -81,7 +89,7 @@ The representation of layouts is powerful enough to represent nearly
everything we need to implement efficient dense linear algebra.
Layouts can also be combined and manipulated via functional composition, on which we build a large set of common operations such as tiling and partitioning.
-CUTLASS-SYCL and beyond adopts CuTe throughout the GEMM hierarchy in its templates.
+SYCL\*TLA adopts CuTe throughout the GEMM hierarchy in its templates.
This greatly simplifies the design and improves code composability and readability.
More documentation specific to CuTe can be found in its
[dedicated documentation directory](./media/docs/cpp/cute/00_quickstart.md).
@@ -97,7 +105,7 @@ Minimum requirements:
## Hardware Support
-CUTLASS-SYCL runs successfully on the following Intel GPUs.
+SYCL*TLA runs successfully on the following Intel GPUs.
|**GPU**|**Intel GPU Architecture**
|---|---|
@@ -119,7 +127,7 @@ We are regularly testing following setup in CI.
## Target Architecture
-The target architecture information is passed on to CUTLASS-SYCL via the cmake flag
+The target architecture information is passed on to SYCL*TLA via the cmake flag
`DPCPP_SYCL_TARGET`.
```
@@ -156,13 +164,13 @@ CUTLASS is described in the following documents and the accompanying
# Resources
-# Building CUTLASS-SYCL
+# Building SYCL*TLA
-CUTLASS-SYCL is a header-only template library and does not need to be built to be used by other
-projects. Client applications should target CUTLASS-SYCL's `include/` directory in their include
+SYCL*TLA is a header-only template library and does not need to be built to be used by other
+projects. Client applications should target SYCL*TLA's `include/` directory in their include
paths.
-CUTLASS-SYCL unit tests, examples, and utilities can be built with CMake.
+SYCL*TLA unit tests, examples, and utilities can be built with CMake.
The minimum version of CMake is given in the [Quickstart guide](./media/docs/cpp/quickstart.md).
Make sure you have Intel oneAPI DPC++ compiler installed and the environment is properly set up.
@@ -170,7 +178,7 @@ Make sure you have Intel oneAPI DPC++ compiler installed and the environment is
$ source /opt/intel/oneapi/setvars.sh
```
-Create a build directory within the CUTLASS-SYCL project, then run CMake. You need to specify
+Create a build directory within the SYCL*TLA project, then run CMake. You need to specify
the target Intel GPU architecture using the `DPCPP_SYCL_TARGET` flag.
For Intel Data Center GPU Max Series (Ponte Vecchio), use `intel_gpu_pvc`.
For Intel Arc GPU B580 Graphics, use `intel_gpu_bmg_g21`.
@@ -193,9 +201,9 @@ To compile with G++ as host compiler, add the flag `-DDPCPP_HOST_COMPILER=g++-13
$ CC=icx CXX=icpx cmake .. -G Ninja -DCUTLASS_ENABLE_SYCL=ON -DDPCPP_HOST_COMPILER=g++-13 -DDPCPP_SYCL_TARGET="intel_gpu_bmg_g21" # compiles for Intel Arc GPU B580 Graphics with G++ as host compiler
```
-From the `build/` directory, compile and run the CUTLASS-SYCL unit tests by building the target `test_unit` with make.
+From the `build/` directory, compile and run the SYCL*TLA unit tests by building the target `test_unit` with make.
-The unit tests are organized as several binaries mirroring the top-level namespaces of CUTLASS-SYCL,
+The unit tests are organized as several binaries mirroring the top-level namespaces of SYCL*TLA,
and they may be executed in parallel via make's `-j` command line argument.
```bash
@@ -213,12 +221,12 @@ All tests should pass on supported Intel GPU platforms, though the exact number
# Project Structure
-CUTLASS-SYCL is arranged as a header-only library along with Utilities, Tools, Examples, and unit tests.
+SYCL*TLA is arranged as a header-only library along with Utilities, Tools, Examples, and unit tests.
-A detailed explanation of the source code organization may be found in the
-[CUTLASS-SYCL documentation](./media/docs/cpp/code_organization.md), but several main components are summarized below.
+A detailed explanation of the source code organization may be found in the
+[SYCL*TLA documentation](./media/docs/cpp/code_organization.md), but several main components are summarized below.
-## CUTLASS-SYCL Template Library
+## SYCL*TLA
```
include/ # client applications should target this directory in their build's include paths
@@ -263,23 +271,23 @@ include/ # client applications should target this directory
```
-### CUTLASS SDK Examples
+### SYCL*TLA Examples
-[CUTLASS SDK examples](./examples) apply CUTLASS templates to implement basic computations.
+[SYCL*TLA examples](./examples) apply SYCL*TLA templates to implement basic computations.
### Tools
```
tools/
- library/ # CUTLASS-SYCL Instance Library - contains instantiations of all supported CUTLASS-SYCL templates
+ library/ # SYCL*TLA Instance Library - contains instantiations of all supported SYCL*TLA templates
include/
cutlass/
library/
- profiler/ # CUTLASS Profiler - SYCL support not yet available
+ profiler/ # Profiler - SYCL support not yet available
# (command-line utility for executing operations)
- util/ # CUTLASS-SYCL Utilities - contains numerous helper classes for
+ util/ # Utilities - contains numerous helper classes for
include/ # managing tensors in Intel GPU device memory, reference
cutlass/ # implementations for SYCL GEMM, random initialization
util/ # of tensors, and I/O for Intel GPU environments.
@@ -294,12 +302,40 @@ Instructions for building and running the Unit tests are described in the [Quick
# About
-CUTLASS-SYCL is released by INTEL Corporation as Open Source software under the
+SYCL*TLA is released by Intel Corporation as Open Source software under the
[3-clause "New" BSD license](LICENSE.txt).
# Contributors
-The official list of CUTLASS-SYCL developers and contributors is available here: [CONTRIBUTORS](CONTRIBUTORS.md).
+The official list of SYCL*TLA developers and contributors is available here: [CONTRIBUTORS](CONTRIBUTORS.md).
+
+# Contributing
+
+## Pull Request Templates
+
+We provide concise pull request (PR) templates to streamline PR documentation:
+
+### Quick Start
+
+**GitHub CLI:**
+```bash
+gh pr create --template .github/PULL_REQUEST_TEMPLATE/bug_fix.md
+gh pr create --template .github/PULL_REQUEST_TEMPLATE/performance.md
+gh pr create --template .github/PULL_REQUEST_TEMPLATE/feature.md
+gh pr create --template .github/PULL_REQUEST_TEMPLATE/refactoring.md
+```
+
+**GitHub Web:** Add `?template=<template_name>.md` to the PR URL (e.g., `?template=bug_fix.md`)
+
+### Which Template?
+
+- 🐛 **Bug fixes** → `bug_fix.md` - Root cause + verification
+- ⚡ **Performance** → `performance.md` - Profiling data + benchmarks
+- ✨ **Features** → `feature.md` - API design + examples
+- 🔨 **Refactoring** → `refactoring.md` - Refactored/Redesigned code
+- 📝 **Mixed/Other** → Default template
+
+See [`.github/PULL_REQUEST_TEMPLATE/README.md`](.github/PULL_REQUEST_TEMPLATE/README.md) for details.
# Copyright
diff --git a/SECURITY.md b/SECURITY.md
index 87f1750e7f..56b64f8994 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -2,7 +2,4 @@
## Reporting a Vulnerability
-To report a vulnerability or a security issue please fill the security
-advisories form [here](../../security/advisories/new), send an email to
-security@codeplay.com or contact us using the [contact form on our web
-page](https://codeplay.com/company/contact/?q=Report%20Security%20Issue).
+Please report any security vulnerabilities in this project utilizing the guidelines [here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html).
diff --git a/applications/flash_attention_v2/collective/copy_block_slm.hpp b/applications/flash_attention_v2/collective/copy_block_slm.hpp
new file mode 100644
index 0000000000..41b34a2140
--- /dev/null
+++ b/applications/flash_attention_v2/collective/copy_block_slm.hpp
@@ -0,0 +1,169 @@
+/***************************************************************************************************
+ * Copyright (C) 2025 Intel Corporation, All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+namespace cute {
+
+/* Flat copies */
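+// copy_block_r2s / copy_block_s2r move a subgroup's register fragment to/from
+// shared local memory (SLM) as one contiguous 1D block; on the SLM side,
+// consecutive elements of a work-item are spaced a subgroup width (16) apart.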
+template
+CUTE_HOST_DEVICE
+void
+copy_block_r2s(Tensor const& src,
+ Tensor & dst)
+{
+ static_assert(is_rmem_v && is_smem_v, "Expected rmem->smem copy");
+
+ auto atom_r2s = Copy_Atom, float>{}; // TODO: larger block messages
+
+ auto atom_shape = make_shape(_1{}, size(src));
+ auto src_v = src.compose(make_layout(atom_shape));
+ auto dst_v = dst.compose(make_layout(atom_shape, Stride<_0, _16>{}));
+
+ copy(atom_r2s, src_v, dst_v);
+}
+
+template
+CUTE_HOST_DEVICE
+void
+copy_block_s2r(Tensor const& src,
+ Tensor & dst)
+{
+ static_assert(is_smem_v && is_rmem_v, "Expected smem->rmem copy");
+
+ auto atom_s2r = Copy_Atom, float>{};
+
+ auto atom_shape = make_shape(_1{}, size(dst));
+ auto src_v = src.compose(make_layout(atom_shape, Stride<_0, _16>{}));
+ auto dst_v = dst.compose(make_layout(atom_shape));
+
+ copy(atom_s2r, src_v, dst_v);
+}
+
+/* Coordinate-aware copies */
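+// These overloads additionally take coordinate layouts describing how the
+// register fragment and the SLM tile are logically indexed, and remap the
+// access through right_inverse() so the two sides may use different layouts.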
+template
+CUTE_HOST_DEVICE
+void
+copy_block_r2s(SubgroupTensor const& src,
+ Tensor & dst,
+ DstCoordLayout const& dst_c)
+{
+ using _SG = intel::_SGSize;
+
+ static_assert(is_rmem_v && is_smem_v, "Expected rmem->smem copy");
+ static_assert(sizeof_bits_v == 32, "Only 32-bit data supported");
+
+ auto atom_r2s = Copy_Atom, float>{}; // TODO: larger block messages
+
+ auto atom_shape = make_shape(_1{}, size(SrcLayout{}));
+
+ auto src_c_wi0 = composition(project_strides(SrcCoordLayout{}), make_layout(atom_shape, Stride<_0, _SG>{}));
+ auto rlayout = composition(right_inverse(project_strides(dst_c)), src_c_wi0);
+
+ auto src_v = src.compose(make_layout(atom_shape));
+ auto dst_v = dst.compose(rlayout);
+
+ copy(atom_r2s, src_v, dst_v);
+}
+
+template
+CUTE_HOST_DEVICE
+void
+copy_block_s2r(Tensor const& src,
+ SrcCoordLayout const& src_c,
+ SubgroupTensor & dst)
+{
+ using _SG = intel::_SGSize;
+
+ static_assert(is_smem_v && is_rmem_v, "Expected smem->rmem copy");
+ static_assert(sizeof_bits_v == 32, "Only 32-bit data supported");
+
+ auto atom_s2r = Copy_Atom, float>{};
+
+ auto atom_shape = make_shape(_1{}, size(DstLayout{}));
+
+ auto dst_c_wi0 = composition(project_strides(DstCoordLayout{}), make_layout(atom_shape, Stride<_0, _SG>{}));
+ auto rlayout = composition(right_inverse(project_strides(src_c)), dst_c_wi0);
+
+ auto src_v = src.compose(rlayout);
+ auto dst_v = dst.compose(make_layout(atom_shape));
+
+ copy(atom_s2r, src_v, dst_v);
+}
+
+/* Variants accepting rvalue dst */
+template
+CUTE_HOST_DEVICE
+void
+copy_block_r2s(Tensor const& src,
+ Tensor && dst)
+{
+ return copy_block_r2s(src, dst);
+}
+
+template
+CUTE_HOST_DEVICE
+void
+copy_block_s2r(Tensor const& src,
+ Tensor && dst)
+{
+ return copy_block_s2r(src, dst);
+}
+
+template
+CUTE_HOST_DEVICE
+void
+copy_block_r2s(SubgroupTensor const& src,
+ Tensor && dst,
+ DstCoordLayout const& dst_c)
+{
+ return copy_block_r2s(src, dst, dst_c);
+}
+
+template
+CUTE_HOST_DEVICE
+void
+copy_block_s2r(Tensor const& src,
+ SrcCoordLayout const& src_c,
+ SubgroupTensor && dst)
+{
+  return copy_block_s2r(src, src_c, dst);
+}
+
+} /* namespace cute */
\ No newline at end of file
diff --git a/applications/flash_attention_v2/collective/xe_fmha_fwd_epilogue.hpp b/applications/flash_attention_v2/collective/xe_fmha_fwd_epilogue.hpp
new file mode 100644
index 0000000000..efa54931d3
--- /dev/null
+++ b/applications/flash_attention_v2/collective/xe_fmha_fwd_epilogue.hpp
@@ -0,0 +1,294 @@
+/***************************************************************************************************
+ * Copyright (C) 2025 Intel Corporation, All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include
+#include "cutlass/cutlass.h"
+#include "cutlass/epilogue/dispatch_policy.hpp"
+#include "cutlass/epilogue/collective/collective_epilogue.hpp"
+#include "cutlass/epilogue/collective/detail.hpp"
+#include "cutlass/detail/layout.hpp"
+
+#include "cute/algorithm/subgroup_algorithms.hpp"
+#include "cute/algorithm/tensor_algorithms.hpp"
+
+#include "copy_block_slm.hpp"
+
+namespace cutlass::fmha::collective {
+
+using namespace cute;
+
+template // Optional TiledCopy for loading O
+class FMHAFwdEpilogue {
+
+public:
+ //
+ // Type Aliases
+ //
+ using TiledMMAPV = typename CollectiveMainloop::TiledMMAPV;
+ using TileShapePV = decltype(TiledMMAPV{}.tile_mnk());
+ using TileShapeO = TileShapeO_;
+ using SGPerWG = decltype(product(take<1,4>(shape(typename TiledMMAPV::ThrLayoutVMNK{}))));
+
+ using TensorO = TensorO_;
+ using TensorO2D = decltype(TensorO_{}(append>(make_coord(_,_),0)));
+ using ElementO = typename TensorO_::value_type;
+
+ using FragA = typename CollectiveMainloop::FragA;
+ using FragARow = typename CollectiveMainloop::FragARow;
+ using ElementA = typename FragA::value_type;
+
+ // Split k-reduced tiles between participating subgroups.
+ // Assumption: the A tile is contiguous.
+ using ReduceK = decltype(size<3>(typename TiledMMAPV::ThrLayoutVMNK{}));
+
+ static auto reduce_sg_v_helper() {
+ constexpr auto v_total_sg = get<1>(SGTileShapeA{}) / intel::_SGSize{};
+ constexpr auto v_avail_sg = ReduceK{} / ReduceSGQ{};
+ return Int<(v_total_sg > v_avail_sg) ? cute::gcd(v_total_sg, v_avail_sg) : v_total_sg>{};
+ }
+
+ using SGTileShapeA = decltype(atuple_coshape(FragA{}.tv_layout()));
+ using ReduceSGQ = decltype(cute::gcd(get<0>(SGTileShapeA{}), ReduceK{}));
+ using ReduceSGV = decltype(reduce_sg_v_helper());
+ using ReduceSGLayout = decltype(make_identity_layout(Shape{}));
+
+ using SGTileShapeO = decltype(shape_div(take<0,2>(SGTileShapeA{}), shape(ReduceSGLayout{})));
+
+ using ReduceFragA = decltype(make_subgroup_tensor(
+ make_layout(select<1,0>(SGTileShapeO{}),
+ Stride, E<0>>{})
+ ));
+ using ReduceFragARow = decltype(reduce<1>(ReduceFragA{}, sycl::plus{}));
+
+ static auto default_tiled_copy_O_helper() {
+ if constexpr (ReduceK{} == _1{})
+ return make_block_2d_copy_D(TiledMMAPV{}, TensorO2D{});
+ else
+ return make_block_2d_copy_D_subtiled(TiledMMAPV{}, ReduceFragA{}.tv_layout(), ReduceSGLayout{}, TensorO2D{});
+ }
+
+ using DefaultTiledCopyO = decltype(default_tiled_copy_O_helper());
+ using TiledCopyO = conditional_t, DefaultTiledCopyO, TiledCopyO_>;
+
+ // Stateless design -- no arguments or parameters.
+ struct Arguments {};
+ struct Params {};
+
+ // Shared memory storage
+ // Note sum/max tiles are padded to 16 elements, due to limitations in CuTe block load infrastructure.
+ using AlignedSGTileA_Q = C<((size<0>(SGTileShapeA{}) + intel::sg_size - 1) / intel::sg_size) * intel::sg_size>;
+
+ struct SharedStorageNone {};
+ struct SharedStorageReduceK {
+ cute::array a_data;
+ cute::array a_sum_data, a_max_data;
+ };
+
+ using SharedStorage = conditional_t<(ReduceK{} > _1{}), SharedStorageReduceK, SharedStorageNone>;
+
+private:
+ SharedStorage &shared;
+
+public:
+ static constexpr
+ Params to_underlying_arguments(Arguments const &args, void * /* workspace */) {
+ return {};
+ }
+
+ CUTLASS_HOST_DEVICE static bool can_implement(Arguments const&) {
+ return true;
+ }
+
+ CUTLASS_HOST_DEVICE
+ FMHAFwdEpilogue(Params const&, SharedStorage& shared_) : shared(shared_) {}
+
+ template
+ CUTLASS_DEVICE
+ void
+ operator()(TensorO2D const& O, // Global O tensor: (q,v)
+ FragA & tArA, // O accumulator: (q,v)
+ FragARow & tA_max, // Softmax row-wise max accumulator
+ FragARow & tA_sum, // Softmax row-wise sum accumulator
+ QVCoord blk_qv, // WG tile indices: (q,v)
+ int thr_id) { // Work-item ID
+
+ using namespace cute;
+ using ElementA = typename FragA::element_type;
+
+ // Reduce k-blocks of A and A_sum across WG, if needed.
+ auto [rA, rA_sum, active] = reduce_A(tArA, tA_max, tA_sum, thr_id);
+
+ /* Some subgroups may not have any work to do; if so, quit early. */
+ if (!active) return;
+
+ /* Complete softmax, dividing out sums. */
+ CUTLASS_PRAGMA_UNROLL
+ for (int i = 0; i < rA_sum.size(); i++)
+ rA_sum(i) = ElementA(1) / rA_sum(i);
+
+ CUTLASS_PRAGMA_UNROLL
+ for (int i = 0; i < rA.size(); i++)
+ rA(i) *= broadcast<0>(rA_sum, rA, i);
+
+ /* Tile output */
+ Tensor cO = make_identity_tensor(O.shape()); // (q,v)
+ Tensor gO = local_tile(cO, TileShapeO{}, blk_qv); // (q,v)
+
+ /* Prepare slices */
+ TiledCopyO copy_o{O};
+ auto thr_copy_o = copy_o.get_slice(thr_id);
+
+ auto tOrO = thr_copy_o.partition_sg_fragment_S(gO);
+ auto tOgO = thr_copy_o.partition_D(gO);
+
+ /* Reorder tile and write out */
+ reorder(rA, tOrO);
+ copy(copy_o, tOrO, tOgO);
+ }
+
+ // Reduce k-blocks of A and A_sum across WG, if needed.
+ // Note that each k block has its own scale factor based on A_max,
+ // so A/A_sum contributions need to be rescaled to match.
+ template
+ CUTLASS_DEVICE
+ decltype(auto)
+ reduce_A(FragA & tArA, // O accumulator: (q,v)
+ FragARow & tA_max, // Softmax row-wise max accumulator
+ FragARow & tA_sum, // Softmax row-wise sum accumulator
+ int thr_id) { // Work-item ID
+
+ using namespace sycl::ext::oneapi::this_work_item;
+
+ if constexpr (ReduceK{} == _1{}) {
+ return std::make_tuple(tArA, tA_sum, true);
+ } else {
+ /* Identify A tile ID and k block for this subgroup. */
+ auto thr_vak = group<1,3>(TiledMMAPV{}.get_thr_layout_vmnk()).get_flat_coord(assert_uniform(thr_id));
+ auto a_tile = get<1>(thr_vak);
+ auto k_blk = get<2>(thr_vak);
+
+ /* Set up SLM tensors and partition A tiles among participating subgroups */
+ auto shape_A = append(append(SGTileShapeA{}, ReduceK{}), SGPerWG{}/ReduceK{});
+ auto shape_A_row = make_shape(get<0>(SGTileShapeO{}), shape(ReduceSGLayout{}), ReduceK{}, SGPerWG{}/ReduceK{});
+
+ /* Physical layouts, with subtile modes broken out */
+ auto sA_layout = group<2,4>(flat_divide(make_ordered_layout(shape_A, Step<_1,_0,_2,_3>{}), SGTileShapeO{}));
+ auto sA_row_stride = make_stride(_1{}, make_stride(get<0>(shape_A_row), _0{}),
+ AlignedSGTileA_Q{}, AlignedSGTileA_Q{} * ReduceK{});
+ auto sA_row_layout = make_layout(shape_A_row, sA_row_stride);
+
+ /* Coordinate layouts, with subtile modes broken out */
+ auto basis2 = make_basis_like(SGTileShapeO{});
+ auto sA_coords = make_layout(append(SGTileShapeO{}, shape(ReduceSGLayout{})),
+ append(basis2, product_each(zip(SGTileShapeO{}, basis2))));
+
+ auto sA = make_tensor(make_smem_ptr(&shared.a_data), sA_layout); // (q,v,rblk_dst,rblk_src,a_tile)
+ auto sA_max = make_tensor(make_smem_ptr(&shared.a_max_data), sA_row_layout); // (q,rblk_dst,rblk_src,a_tile)
+ auto sA_sum = make_tensor(make_smem_ptr(&shared.a_sum_data), sA_row_layout); // (q,rblk_dst,rblk_src,a_tile)
+
+ /* Write my contributions to SLM. */
+ copy_block_r2s(tA_max, sA_max(_,_,k_blk,a_tile));
+ barrier_arrive(ScopeWorkgroup, SemanticsRelease | SemanticsWGMemory);
+ copy_block_r2s(tA_sum, sA_sum(_,_,k_blk,a_tile));
+ copy_block_r2s(tArA, sA(_,_,_,k_blk,a_tile), sA_coords);
+
+ bool active = (k_blk < size(ReduceSGLayout{}))
+ || (ReduceK{} == size(ReduceSGLayout{})); // help compiler out
+
+ /* Wait for maxima to be available, signal other data available */
+ barrier_wait(ScopeWorkgroup, SemanticsAcquire | SemanticsWGMemory);
+ barrier_arrive(ScopeWorkgroup, SemanticsRelease | SemanticsWGMemory);
+
+ ReduceFragA rA;
+ ReduceFragARow rA_sum, rA_max, rA_kmax[ReduceK{}];
+
+ if (active) {
+ /* Read A_max back from SLM and reduce. */
+ CUTLASS_PRAGMA_UNROLL
+ for (int kr = 0; kr < ReduceK{}; kr++) {
+ copy_block_s2r(sA_max(_,k_blk,kr,a_tile), rA_kmax[kr]);
+ }
+
+ rA_max = rA_kmax[0];
+ for (int kr = 1; kr < ReduceK{}; kr++)
+ cute::transform(rA_max, rA_kmax[kr], rA_max, cute::max_fn{});
+
+ /* Calculate scale factors for aligning per-block maxima. */
+ for (int kr = 0; kr < ReduceK{}; kr++) {
+ cute::transform(rA_max, rA_kmax[kr], rA_kmax[kr], [](auto gmax, auto kmax) {
+ return sycl::native::exp2(kmax - gmax);
+ });
+ }
+ }
+
+ /* Wait for A/A_sum data to be available */
+ barrier_wait(ScopeWorkgroup, SemanticsAcquire | SemanticsWGMemory);
+
+ if (active) {
+ /* Read A/A_sum back from SLM, align scaling to new maxima, and reduce. */
+ clear(rA_sum);
+
+ CUTLASS_PRAGMA_UNROLL
+ for (int kr = 0; kr < ReduceK{}; kr++) {
+ ReduceFragARow rA_sum_read;
+ copy_block_s2r(sA_sum(_,k_blk,kr,a_tile), rA_sum_read);
+
+ CUTLASS_PRAGMA_UNROLL
+ for (int i = 0; i < rA_sum_read.size(); i++) {
+ rA_sum(i) += rA_sum_read(i) * rA_kmax[kr](i);
+ }
+ }
+
+ clear(rA);
+
+ CUTLASS_PRAGMA_UNROLL
+ for (int kr = 0; kr < ReduceK{}; kr++) {
+ ReduceFragA rA_read;
+ copy_block_s2r(sA(_,_,k_blk,kr,a_tile), sA_coords(_,_,0), rA_read);
+
+ CUTLASS_PRAGMA_UNROLL
+ for (int i = 0; i < rA_read.size(); i++) {
+ rA(i) += rA_read(i) * broadcast<0>(rA_kmax[kr], rA, i);
+ }
+ }
+ }
+ return std::make_tuple(rA, rA_sum, active);
+ }
+ }
+};
+
+
+} // namespace cutlass::fmha::collective
diff --git a/applications/flash_attention_v2/collective/xe_fmha_fwd_mainloop.hpp b/applications/flash_attention_v2/collective/xe_fmha_fwd_mainloop.hpp
new file mode 100644
index 0000000000..b7c400a63a
--- /dev/null
+++ b/applications/flash_attention_v2/collective/xe_fmha_fwd_mainloop.hpp
@@ -0,0 +1,428 @@
+/***************************************************************************************************
+ * Copyright (C) 2025 Intel Corporation, All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+
+#include "cute/algorithm/functional.hpp"
+#include "cute/algorithm/gemm.hpp"
+#include "cute/algorithm/subgroup_algorithms.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "fmha_fusion.hpp"
+
+namespace cutlass::fmha {
+
+template class XeDefault {}; // Default FMHA mainloop, P in registers.
+
+} // namespace cutlass::fmha
+
+namespace cutlass::fmha::collective {
+
+using namespace cute;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template // Optional TiledCopy for loading V
+struct FMHAFwdMainloop {
+ static_assert(cutlass::detail::dependent_false, "Could not find a mainloop specialization.");
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template
+struct FMHAFwdMainloop, CausalMask_,
+ TiledMMAQK_, TiledMMAPV_, VTiles_,
+ TensorQ_, TensorK_, TensorV_,
+ TiledCopyQ_, TiledCopyK_, TiledCopyV_> {
+ //
+ // Type Aliases
+ //
+ using TiledMMAQK = TiledMMAQK_;
+ using TiledMMAPV = TiledMMAPV_;
+ using TileShapeQK = decltype(TiledMMAQK{}.tile_mnk());
+ using TileShapePV = decltype(TiledMMAPV{}.tile_mnk());
+ static constexpr int VTiles = VTiles_;
+ using SubgroupLayoutQK = decltype(TiledMMAQK{}.get_atom_layout_mnk());
+ using SGPerWG = decltype(product(take<1,4>(shape(typename TiledMMAQK::ThrLayoutVMNK{}))));
+
+ using TensorQ = TensorQ_;
+ using TensorK = TensorK_;
+ using TensorV = TensorV_;
+
+ using TensorQ2D = decltype(TensorQ_{}(append>(make_coord(_,_),0)));
+ using TensorK2D = decltype(TensorK_{}(append>(make_coord(_,_),0)));
+ using TensorV2D = decltype(TensorV_{}(append>(make_coord(_,_),0)));
+
+ using TiledCopyQ = conditional_t, decltype(make_block_2d_copy_A(TiledMMAQK{}, TensorQ2D{})), TiledCopyQ_>;
+ using TiledCopyK = conditional_t, decltype(make_block_2d_copy_B(TiledMMAQK{}, TensorK2D{})), TiledCopyK_>;
+ using TiledCopyV = conditional_t, decltype(make_block_2d_copy_B(TiledMMAPV{}, TensorV2D{})), TiledCopyV_>;
+
+ // TODO: static_asserts on TiledMMAPV here...
+
+ //
+ // Accumulator types
+ //
+ // FragS: accumulator for Q*K MMA
+ // FragO: accumulator for P*V MMAs.
+ // Note: v mode may be split into multiple pieces
+ // to reduce register pressure.
+ // Frag*Row types are reductions of the corresponding Frag* types
+ // over rows.
+ //
+  template <class TiledMMA>
+ using FragC = decltype(TiledMMA{}.get_slice(0).partition_sg_fragment_C(
+ make_identity_tensor(select<0,1>(TiledMMA{}.tile_mnk()))));
+
+  using FragS = FragC<TiledMMAQK>;
+ using FragSRow = decltype(reduce<1>(FragS{}, sycl::plus{}));
+ using ElementS = typename TiledMMAQK::ValTypeD;
+
+  using SingleFragA = FragC<TiledMMAPV>;  // (atom val,q',v')
+ using FragA = expand_sg_fragment_t; // (atom val,q',v',VV)
+ using FragARow = decltype(reduce<1>(FragA{}, sycl::plus{}));
+ using ElementA = typename TiledMMAPV::ValTypeD;
+
+ static constexpr bool CausalMask = CausalMask_;
+
+ // User-facing arguments
+ struct Arguments {
+ ElementS const scale;
+ };
+
+ // Kernel-facing parameters
+ using Params = Arguments;
+
+ // SLM data
+ struct SharedStorage {};
+
+ Params params;
+
+ //
+ // Methods
+ //
+
+ FMHAFwdMainloop(Params const& params_, SharedStorage&) : params(params_) {}
+
+ static constexpr
+ Params to_underlying_arguments(Arguments const &args, void * /* workspace */) {
+ constexpr double kLog2e = 1.4426950408889634074; // log_2(e)
+    ElementS val = args.scale * static_cast<ElementS>(kLog2e);
+ return Params{val};
+ }
+
+ CUTLASS_HOST_DEVICE static
+ bool can_implement(Arguments const&) {
+ return true;
+ }
+
+ template
+ CUTLASS_DEVICE
+ void
+ operator()(TensorQ2D const& Q_2D, // (q,d)
+ TensorK2D const& K_2D, // (k,d)
+ TensorV2D const& V_2D, // (d,k)
+ FragA & tArA, // Output accumulator (q,v)
+ FragARow & tA_max, // Softmax row-wise max accumulator
+ FragARow & tA_sum, // Softmax row-wise sum accumulator
+ QVCoord blk_qv, // WG tile indices: (Q,V)
+ int blk_k0, // K block range: [K0,K1)
+ int blk_k1,
+ int total_blk, // Total # of K blocks
+ int thr_id,
+ int seq_len,
+ int full_tile_offset,
+ int discard_seq_coord) {
+ using namespace sycl::ext::oneapi::this_work_item;
+
+ // Short dimension names:
+ // q = sequence len dimension for Q
+ // k = sequence len dimension for K
+ // d = head size dimension for K/Q
+ // v = head size dimension for V
+ // VV = MMA tile indices for V
+ // Capital letters (Q, K, ...) refer to WG block indices.
+ // Primed letters (q', k', ...) refer to atom block indices.
+
+ auto tile_shape_v = make_shape(get<1>(TileShapePV{}) * C{}, get<2>(TileShapePV{}));
+
+ /* Create proxy coordinate tensors for Q/K/P/V */
+ Tensor cQ = make_identity_tensor(Q_2D.shape()); // (q,d)
+ Tensor cK = make_identity_tensor(K_2D.shape()); // (k,d)
+ Tensor cV = make_identity_tensor(V_2D.shape()); // (v,k)
+ Tensor cP = make_identity_tensor(take<0,2>(TileShapeQK{})); // (q,k)
+
+ /* Partition global tensors into workgroup tiles */
+ Tensor gQ = local_tile(cQ, TileShapeQK{}, append(blk_qv,_), Step<_1,X,_1>{}); // (q,d,D)
+ Tensor gK = local_tile(cK, TileShapeQK{}, make_coord(_,_,_), Step{}); // (k,d,K,D)
+ Tensor gV = local_tile(cV, tile_shape_v, make_coord(get<1>(blk_qv),_)); // (v,k,K)
+ Tensor gV_split = local_tile(gV, TileShapePV{}, make_coord(_,_,0), Step{}); // (v,k,VV,K)
+
+ /* Create global -> register copies */
+ TiledCopyQ copy_q{Q_2D};
+ TiledCopyK copy_k{K_2D};
+ TiledCopyV copy_v{V_2D};
+
+ /* Create MMAs */
+ TiledMMAQK mma_qk{};
+ TiledMMAPV mma_pv{};
+
+ /* Slice TiledCopy/TiledMMA operations down to to work-item level */
+ auto thr_copy_q = copy_q.get_slice(thr_id);
+ auto thr_copy_k = copy_k.get_slice(thr_id);
+ auto thr_copy_v = copy_v.get_slice(thr_id);
+ auto thr_mma_qk = mma_qk.get_slice(thr_id);
+ auto thr_mma_pv = mma_pv.get_slice(thr_id);
+
+ /* Partition coordinate tensors for copy */
+ auto tQgQ = thr_copy_q.partition_S(gQ); // (atom_val,q',d',D)
+ auto tKgK = thr_copy_k.partition_S(gK); // (atom_val,k',d',K,D)
+ auto tVgV = thr_copy_v.partition_S(gV_split); // (atom_val,v',k',VV,K)
+
+ /* Create register fragments for MMA and copies */
+ auto tQrQ = thr_copy_q.partition_sg_fragment_D(gQ(_,_,0));
+ auto tSrQ = thr_mma_qk.partition_sg_fragment_A(gQ(_,_,0));
+
+ auto tKrK = thr_copy_k.partition_sg_fragment_D(gK(_,_,0,0));
+ auto tSrK = thr_mma_qk.partition_sg_fragment_B(gK(_,_,0,0));
+
+ auto tSrS = thr_mma_qk.partition_sg_fragment_C(cP);
+ auto tArP = thr_mma_pv.partition_sg_fragment_A(cP);
+
+ auto tVrV = thr_copy_v.partition_sg_fragment_D(gV_split(_,_,0,0));
+ auto tArV = thr_mma_pv.partition_sg_fragment_B(gV_split(_,_,0,0));
+
+ /* Create TiledCopy objects for prefetches */
+ auto prefetch_q = make_block_2d_prefetch(copy_q);
+ auto prefetch_k = make_block_2d_prefetch(copy_k);
+ auto prefetch_v = make_block_2d_prefetch(tile_shape_v, V_2D);
+
+ /* Partition global tensors for prefetch */
+ auto pQgQ = prefetch_q.get_slice(thr_id).partition_S(gQ);
+ auto pKgK = prefetch_k.get_slice(thr_id).partition_S(gK);
+ auto pVgV = prefetch_v.get_slice(thr_id).partition_S(gV);
+
+ // ------
+ // Kernel
+ // ------
+
+ /* Initialization steps for first block: Q/K prefetch, O init */
+ /* TODO: limit D prefetch for large head size, and reorder K prefetches */
+ if (blk_k0 == 0) {
+ for (int D = 0; D < size<3>(pQgQ); D++) {
+ prefetch(prefetch_q, pQgQ(_,_,_,D));
+ }
+
+ for (int D = 0; D < size<4>(pKgK); D++) {
+ CUTLASS_PRAGMA_UNROLL
+ for (int K = 0; K < Stages; K++) {
+ prefetch(prefetch_k, pKgK(_,_,_,K,D));
+ }
+ }
+
+ clear(tArA);
+      fill(tA_max, cutlass::platform::numeric_limits<ElementA>::lowest());
+ clear(tA_sum);
+ }
+
+    /* Check whether the last k block is a partial (remainder) tile that needs masking */
+ bool check_remainder_k = (seq_len % get<1>(TileShapeQK{}) != 0);
+
+ /* Main loop, blocked in k. */
+ for (int K = blk_k0; K < blk_k1; K++) {
+ /* Split barrier to keep threads together */
+ barrier_arrive(ScopeWorkgroup);
+
+ /* GEMM 1: S = K * Q */
+ clear(tSrS); /* TODO: fuse w/ initial gemm call */
+ for (int D = 0; D < size<4>(tKgK); D++) {
+ copy(copy_q, tQgQ(_,_,_,D), tQrQ);
+ copy(copy_k, tKgK(_,_,_,K,D), tKrK);
+
+ reorder(tQrQ, tSrQ);
+ reorder(tKrK, tSrK);
+
+ cute::gemm(mma_qk, tSrQ, tSrK, tSrS);
+ }
+
+ /* V prefetch for GEMM 2 */
+ prefetch(prefetch_v, pVgV(_,_,_,K));
+
+ /* Causal masking */
+ if constexpr (CausalMask) {
+ if (K == blk_k1 - 1) {
+ // Need to get global col and row indices to mask the elements
+ Tensor cPgP = make_identity_tensor(make_shape(seq_len, seq_len));
+ Tensor gP = local_tile(cPgP, take<0,2>(TileShapeQK{}), make_coord(get<0>(blk_qv), K));
+ auto cS_thread = thr_mma_qk.partition_C(gP);
+ CUTLASS_PRAGMA_UNROLL
+ for (int i = 0; i < tSrS.size(); ++i) {
+ int row_idx = get<0>(cS_thread(i));
+ int col_idx = get<1>(cS_thread(i));
+ if (col_idx - full_tile_offset > row_idx - discard_seq_coord) {
+ tSrS(i) = ElementS(-INFINITY);
+ }
+ }
+ }
+ }
+ /* k masking for remainder tiles */
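+      // Columns at or beyond seq_len are forced to -INFINITY; in-range columns
+      // use NaN so that sycl::fmin() below leaves the original score unchanged.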
+ if (check_remainder_k && K == total_blk - 1) {
+ FragSRow k_rem_mask;
+ int k = get<0>(tKgK(0,0,0,K,0)) + get_sub_group().get_local_id()[0];
+ CUTLASS_PRAGMA_UNROLL
+ for (int i = 0; i < k_rem_mask.size(); i++, k += intel::sg_size) {
+ k_rem_mask(i) = (k < seq_len) ? ElementS(sycl::nan(0u)) : ElementS(-INFINITY);
+ }
+ CUTLASS_PRAGMA_UNROLL
+ for (int i = 0; i < tSrS.size(); i++) {
+ tSrS(i) = sycl::fmin(tSrS(i), broadcast<1>(k_rem_mask, tSrS, i));
+ }
+ }
+
+ /* Apply softmax and scaling */
+ softmax(K == 0, tSrS, tA_max, tA_sum, tArA);
+ reorder(tSrS, tArP);
+
+ /* GEMM 2: A += P * V, split in v dimension */
+ CUTLASS_PRAGMA_UNROLL
+ for (int VV = 0; VV < VTiles; VV++) {
+ copy(copy_v, tVgV(_,_,_,VV,K), tVrV);
+ reorder(tVrV, tArV);
+ cute::gemm(mma_pv, tArP, tArV, tArA(_,_,_,VV));
+ }
+
+ /* K prefetch */
+ for (int D = 0; D < size<4>(pKgK); D++) {
+ prefetch(prefetch_k, pKgK(_,_,_,K+Stages,D));
+ }
+
+ barrier_wait(ScopeWorkgroup);
+ }
+ }
+
+ // Single step of blocked softmax.
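+  // Online softmax update: keep a running row-wise max (tS_max) and sum (tS_sum);
+  // when the max grows, rescale the previous sum and the O accumulator by
+  // exp2(old_max - new_max) so earlier contributions stay correctly normalized.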
+ CUTLASS_DEVICE
+ void
+ softmax(bool first_block, // First softmax block?
+ FragS & tS, // Softmax src/dst block
+ FragSRow & tS_max, // Softmax row-wise max accumulator
+ FragSRow & tS_sum, // Softmax row-wise sum accumulator
+ FragA & tA) { // O accumulator (for rescaling)
+
+ /* Compute row-wise maxima for this block */
+ auto tS_bmax = reduce<1>(tS, sycl::maximum{});
+
+ /* Update (scaled) maxima */
+ auto tS_prev_max = tS_max;
+ CUTLASS_PRAGMA_UNROLL
+ for (int i = 0; i < tS_max.size(); i++) {
+ tS_max(i) = sycl::max(tS_max(i), params.scale * tS_bmax(i));
+ }
+
+ /* Scale S and subtract maxima, then exponentiate */
+ CUTLASS_PRAGMA_UNROLL
+ for (int i = 0; i < tS.size(); i++)
+ tS(i) = sycl::native::exp2(params.scale * tS(i) - broadcast<0>(tS_max, tS, i));
+
+ /* Rescale existing S sums and O accumulator */
+ if (!first_block) {
+ FragSRow rescale;
+
+ CUTLASS_PRAGMA_UNROLL
+ for (int i = 0; i < tS_max.size(); i++) {
+ rescale(i) = sycl::native::exp2(tS_prev_max(i) - tS_max(i));
+ tS_sum(i) *= rescale(i);
+ }
+
+ CUTLASS_PRAGMA_UNROLL
+ for (int i = 0; i < tA.size(); i++)
+ tA(i) *= broadcast<0>(rescale, tA, i);
+ }
+
+ /* Update sums */
+ auto tS_bsum = reduce<1>(tS, sycl::plus{});
+ for (int i = 0; i < tS_sum.size(); i++)
+ tS_sum(i) += tS_bsum(i);
+ }
+};
+
+
+template <class SGLayoutQK>
+CUTLASS_HOST_DEVICE
+constexpr auto
+get_sg_layout_pv(SGLayoutQK const&)
+{
+ return make_layout(
+ get<0>(SGLayoutQK{}),
+ Layout<_1, _0>{},
+ get<1>(SGLayoutQK{})
+ );
+}
+
+// Get a P*V TiledMMA given K*Q tile size and SG configuration, for mainloops
+// not supporting S data interchange among subgroups (e.g. XeDefault).
+template <class MMAOp, class WGTileQK, class SGLayoutQK, class TileV>
+CUTLASS_HOST_DEVICE
+constexpr auto
+get_tiled_mma_pv(MMAOp const&, WGTileQK const& wg_tile_qk, SGLayoutQK const& sg_layout_qk, TileV const&) {
+ using TileQ = decltype(get<0>(wg_tile_qk));
+ using TileK = decltype(get<1>(wg_tile_qk));
+
+ using WGTilePV = Shape;
+ using SGLayoutPV = decltype(get_sg_layout_pv(sg_layout_qk));
+
+ static_assert(size(SGLayoutPV{}) == size(SGLayoutQK{}),
+ "Q*K cannot be parallelized in the head size dimension");
+
+ return TiledMMAHelper{};
+}
+
+} // namespace cutlass::fmha::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/applications/flash_attention_v2/kernel/xe_fhma_fwd_kernel.hpp b/applications/flash_attention_v2/kernel/xe_fhma_fwd_kernel.hpp
new file mode 100644
index 0000000000..f5905f746a
--- /dev/null
+++ b/applications/flash_attention_v2/kernel/xe_fhma_fwd_kernel.hpp
@@ -0,0 +1,700 @@
+/***************************************************************************************************
+ * Copyright (C) 2025 Intel Corporation, All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/kernel_hardware_info.hpp"
+
+#include "flash_attention_v2/collective/xe_fmha_fwd_mainloop.hpp"
+#include "flash_attention_v2/collective/xe_fmha_fwd_epilogue.hpp"
+#include "cute/util/type_traits.hpp"
+#include "flash_attention_v2/collective/fmha_fusion.hpp"
+#include "flash_attention_v2/kernel/xe_tile_scheduler.hpp"
+
+namespace cutlass::fmha::kernel {
+
+using namespace cute;
+
+///////////////////////////////////////////////////////////////////////////////
+template
+struct FMHAProblemShape {
+ using SeqLenType = cute::conditional_t;
+ int batch;
+ int num_heads_q, num_heads_kv;
+ SeqLenType seq_len_qo, seq_len_kv;
+ int head_size_qk, head_size_vo;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+template
+class XeFMHAFwdKernel {
+
+public:
+ //
+ // Type Aliases
+ //
+ using ProblemShape = ProblemShape_;
+ using VariableLength = cutlass::fmha::collective::VariableLength;
+ static constexpr bool is_var_len = cutlass::fmha::collective::is_variable_length_v;
+ // Mainloop derived types
+ using CollectiveMainloop = CollectiveMainloop_;
+ using MainloopArguments = typename CollectiveMainloop::Arguments;
+ using MainloopParams = typename CollectiveMainloop::Params;
+
+ using TiledMMAQK = typename CollectiveMainloop::TiledMMAQK;
+ using TiledMMAPV = typename CollectiveMainloop::TiledMMAPV;
+ using TileShapeQK = typename CollectiveMainloop::TileShapeQK;
+ using TileShapePV = typename CollectiveMainloop::TileShapePV;
+ using SubgroupLayoutQK = typename CollectiveMainloop::SubgroupLayoutQK;
+ using ElementQ = typename CollectiveMainloop::TensorQ::element_type;
+ using ElementK = typename CollectiveMainloop::TensorK::element_type;
+ using ElementV = typename CollectiveMainloop::TensorV::element_type;
+
+ using StrideQ = decltype(stride(typename CollectiveMainloop::TensorQ{}));
+ using StrideK = decltype(stride(typename CollectiveMainloop::TensorK{}));
+ using StrideV = decltype(stride(typename CollectiveMainloop::TensorV{}));
+
+ using SGPerWG = typename CollectiveMainloop::SGPerWG;
+
+ using FragA = typename CollectiveMainloop::FragA;
+ using FragARow = typename CollectiveMainloop::FragARow;
+
+ // Tile scheduler derived types
+ using TileScheduler = TileScheduler_;
+ using TileSchedulerParams = typename TileScheduler::Params;
+
+ // Epilogue derived types
+ using CollectiveEpilogue = CollectiveEpilogue_;
+ using EpilogueArguments = typename CollectiveEpilogue::Arguments;
+ using EpilogueParams = typename CollectiveEpilogue::Params;
+
+ using TileShapeO = typename CollectiveEpilogue::TileShapeO;
+ using ElementO = typename CollectiveEpilogue::TensorO::element_type;
+ using StrideO = decltype(stride(typename CollectiveEpilogue::TensorO{}));
+
+ // Kernel level shared memory storage
+ using MainloopSharedStorage = typename CollectiveMainloop::SharedStorage;
+ using EpilogueSharedStorage = typename CollectiveEpilogue::SharedStorage;
+ union SharedStorage {
+ MainloopSharedStorage mainloop;
+ EpilogueSharedStorage epilogue;
+ };
+
+  static constexpr int SharedStorageSize = is_empty_v<SharedStorage> ? size_t(0)
+ : sizeof(SharedStorage);
+
+ // Device side arguments
+ struct KernelArguments {
+ ProblemShape shape;
+ const ElementQ *Q;
+ StrideQ dQ;
+ const ElementK *K;
+ StrideK dK;
+ const ElementV *V;
+ StrideV dV;
+ ElementO *O;
+ StrideO dO;
+ };
+ using KernelParams = KernelArguments;
+
+ struct Arguments {
+ KernelArguments kernel{};
+ MainloopArguments mainloop{};
+ EpilogueArguments epilogue{};
+ KernelHardwareInfo hw_info{};
+ };
+
+ // Kernel entry point API
+ struct Params {
+ KernelParams kernel;
+ MainloopParams mainloop;
+ EpilogueParams epilogue;
+ TileSchedulerParams scheduler;
+ };
+
+ //
+ // Methods
+ //
+
+ static Params to_underlying_arguments(Arguments const &args, void *workspace) {
+ return {args.kernel,
+ CollectiveMainloop::to_underlying_arguments(args.mainloop, workspace),
+ CollectiveEpilogue::to_underlying_arguments(args.epilogue, workspace),
+ TileScheduler::to_underlying_arguments(args.kernel.shape, args.hw_info, TileShapeO{})};
+ }
+
+ static bool can_implement(Arguments const &args) {
+ return CollectiveMainloop::can_implement(args.mainloop)
+ && CollectiveEpilogue::can_implement(args.epilogue);
+ }
+
+ static int get_workspace_size(Arguments const &args) { return 0; }
+
+ static cutlass::Status initialize_workspace(Arguments const &args, void *workspace = nullptr,
+ cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr) {
+ return Status::kSuccess;
+ }
+
+ static dim3 get_grid_shape(Params const ¶ms) {
+ return TileScheduler::template get_grid_shape(params.scheduler);
+ }
+
+ static dim3 get_block_shape() { return dim3(SGPerWG::value * intel::sg_size, 1, 1); }
+
+ CUTLASS_DEVICE
+ Shape get_sequence_length_shape(ProblemShape const& problem_shape, int const& batch) {
+ if constexpr (is_var_len) {
+ return cutlass::fmha::collective::apply_variable_length(Shape{problem_shape.seq_len_qo, problem_shape.seq_len_kv}, batch);
+ } else {
+ return Shape{problem_shape.seq_len_qo, problem_shape.seq_len_kv};
+ }
+ }
+
+ CUTLASS_DEVICE
+ void operator()(Params const ¶ms, char *smem_buf)
+ {
+ using namespace sycl::ext::oneapi::this_work_item;
+
+    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+
+ auto &p = params.kernel;
+ ProblemShape const& s = p.shape;
+ int head_group_q = s.num_heads_q / s.num_heads_kv;
+
+ int thr_id = int(ThreadIdxX());
+ int sub_group_id = thr_id / intel::sg_size;
+ int q_sg_tile = get<0>(shape_div(TileShapeQK{}, shape(SubgroupLayoutQK{})));
+
+ auto cS = make_identity_tensor(take<0,2>(TiledMMAQK{}.tile_mnk()));
+ auto tScS = TiledMMAQK{}.get_slice(thr_id).partition_C(cS);
+ auto q_offset_wi = get<0>(tScS(0));
+ auto q_offset_sg = group_broadcast(sycl::ext::oneapi::this_work_item::get_sub_group(), q_offset_wi, 0);
+
+ TileScheduler tile_scheduler{params.scheduler};
+
+ CUTLASS_PRAGMA_NO_UNROLL
+ for (; tile_scheduler.is_valid(); ++tile_scheduler) {
+ auto [blk_q, blk_v, head_q, idx_b] = tile_scheduler.get_block_coord(); // (Q,V,h,b)
+ auto blk_qv = make_coord(blk_q, blk_v);
+ int head = head_q / head_group_q;
+
+ auto sequence_length_shape = get_sequence_length_shape(s, idx_b);
+ auto [seq_len_qo, seq_len_kv] = sequence_length_shape;
+ if (blk_q * get<0>(TileShapeQK{}) >= seq_len_qo) continue;
+
+ auto offset = cute::min(seq_len_qo, seq_len_kv);
+ auto discard_seq_coord = seq_len_qo - offset;
+ auto full_tile_offset = seq_len_kv - offset;
+ int seq_coord = cute::min(seq_len_qo, (blk_q * get<0>(TileShapeQK{}) + q_offset_sg));
+
+ if (CollectiveMainloop::CausalMask && seq_coord < discard_seq_coord) continue;
+ const int seq_len = CollectiveMainloop::CausalMask ? full_tile_offset + cute::min(seq_len_kv, seq_coord - discard_seq_coord) + q_sg_tile : seq_len_kv;
+ const int k_blocks = cute::ceil_div(seq_len, get<1>(TileShapeQK{}));
+
+ int offset_q = 0, offset_k = 0, offset_v = 0, offset_o = 0;
+ if constexpr (is_var_len) {
+ int group_heads_q = s.num_heads_q / s.num_heads_kv;
+ auto qo_cumulative = s.seq_len_qo.cumulative_length;
+ auto kv_cumulative = s.seq_len_kv.cumulative_length;
+ offset_q = s.num_heads_q * s.head_size_qk * qo_cumulative[idx_b];
+ offset_k = s.num_heads_kv * s.head_size_qk * kv_cumulative[idx_b];
+ offset_v = s.num_heads_kv * s.head_size_vo * kv_cumulative[idx_b];
+ offset_o = s.num_heads_q * s.head_size_vo * qo_cumulative[idx_b];
+ }
+
+ auto batch_dim = is_var_len ? 1 : s.batch;
+ auto shape_Q = make_shape(seq_len_qo, s.head_size_qk, s.num_heads_q, batch_dim);
+ auto shape_K = make_shape(seq_len_kv, s.head_size_qk, s.num_heads_kv, batch_dim);
+ auto shape_V = make_shape(s.head_size_vo, seq_len_kv, s.num_heads_kv, batch_dim);
+ auto shape_O = make_shape(seq_len_qo, s.head_size_vo, s.num_heads_kv, batch_dim);
+
+      auto dcQ = const_cast<ElementQ*>(p.Q + offset_q);
+      auto dcK = const_cast<ElementK*>(p.K + offset_k);
+      auto dcV = const_cast<ElementV*>(p.V + offset_v);
+ auto ptrO = p.O + offset_o;
+
+ auto stride_q = is_var_len ? cutlass::make_cute_packed_stride(StrideQ{}, shape_Q) : p.dQ;
+ auto stride_k = is_var_len ? cutlass::make_cute_packed_stride(StrideK{}, shape_K) : p.dK;
+ auto stride_v = is_var_len ? cutlass::make_cute_packed_stride(StrideV{}, shape_V) : p.dV;
+ auto stride_o = is_var_len ? cutlass::make_cute_packed_stride(StrideO{}, shape_O) : p.dO;
+
+ Tensor Q = make_tensor(make_gmem_ptr(dcQ), make_layout(shape_Q, stride_q));
+ Tensor K = make_tensor(make_gmem_ptr(dcK), make_layout(shape_K, stride_k));
+ Tensor V = make_tensor(make_gmem_ptr(dcV), make_layout(shape_V, stride_v));
+ Tensor O = make_tensor(make_gmem_ptr(ptrO), make_layout(shape_O, stride_o));
+
+
+ // O accumulator types
+ FragA tArA;
+ FragARow tA_max, tA_sum;
+
+ // Main loop
+ int l_coord = is_var_len ? 0 : idx_b;
+ CollectiveMainloop mainloop(params.mainloop, shared_storage.mainloop);
+ mainloop(Q(_,_,head_q,l_coord),
+ K(_,_,head,l_coord),
+ V(_,_,head,l_coord),
+ tArA, tA_max, tA_sum,
+ blk_qv, 0, k_blocks, k_blocks,
+ thr_id, seq_len,
+ full_tile_offset, discard_seq_coord);
+ if constexpr (!is_empty_v && !is_empty_v) {
+ sycl::group_barrier(get_work_group<3>());
+ }
+
+ // Epilogue
+ CollectiveEpilogue epilogue{params.epilogue, shared_storage.epilogue};
+ epilogue(O(_,_,head_q,l_coord),
+ tArA, tA_max, tA_sum,
+ blk_qv, thr_id);
+ }
+ }
+};
+
+template
+class XeFMHAFwdDynamicSplitKernel {
+
+public:
+ //
+ // Type Aliases
+ //
+ using ProblemShape = ProblemShape_;
+
+ // Mainloop derived types
+ using CollectiveMainloop = CollectiveMainloop_;
+ using MainloopArguments = typename CollectiveMainloop::Arguments;
+ using MainloopParams = typename CollectiveMainloop::Params;
+
+ using TiledMMAQK = typename CollectiveMainloop::TiledMMAQK;
+ using TiledMMAPV = typename CollectiveMainloop::TiledMMAPV;
+ using TileShapeQK = typename CollectiveMainloop::TileShapeQK;
+ using TileShapePV = typename CollectiveMainloop::TileShapePV;
+
+ using ElementQ = typename CollectiveMainloop::TensorQ::element_type;
+ using ElementK = typename CollectiveMainloop::TensorK::element_type;
+ using ElementV = typename CollectiveMainloop::TensorV::element_type;
+
+ using StrideQ = decltype(stride(typename CollectiveMainloop::TensorQ{}));
+ using StrideK = decltype(stride(typename CollectiveMainloop::TensorK{}));
+ using StrideV = decltype(stride(typename CollectiveMainloop::TensorV{}));
+
+ using SGPerWG = typename CollectiveMainloop::SGPerWG;
+
+ using FragA = typename CollectiveMainloop::FragA;
+ using SingleFragA = typename CollectiveMainloop::SingleFragA;
+ using FragARow = typename CollectiveMainloop::FragARow;
+ // element dtype for MmaPV results
+ using ElementA = typename CollectiveMainloop::ElementA;
+
+ // Tile scheduler derived types
+ static_assert(is_same_v);
+ using TileScheduler = TileScheduler_;
+ using TileSchedulerParams = typename TileScheduler::Params;
+
+ // Epilogue derived types
+ using CollectiveEpilogue = CollectiveEpilogue_;
+ using EpilogueArguments = typename CollectiveEpilogue::Arguments;
+ using EpilogueParams = typename CollectiveEpilogue::Params;
+
+ using TileShapeO = typename CollectiveEpilogue::TileShapeO;
+ using ElementO = typename CollectiveEpilogue::TensorO::element_type;
+ using StrideO = decltype(stride(typename CollectiveEpilogue::TensorO{}));
+
+ // Kernel level shared memory storage
+ using MainloopSharedStorage = typename CollectiveMainloop::SharedStorage;
+ using EpilogueSharedStorage = typename CollectiveEpilogue::SharedStorage;
+ union SharedStorage {
+ MainloopSharedStorage mainloop;
+ EpilogueSharedStorage epilogue;
+ };
+
+  static constexpr int SharedStorageSize = is_empty_v<SharedStorage> ? size_t(0)
+ : sizeof(SharedStorage);
+
+  // Important: make sure each per-thread copy covers a multiple of 16 elements;
+ // this is for storing partial results from different KV partitions
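+  // per-thread layout: all FragA values first, then interleaved (max, sum) pairs,
+  // rounded up to a multiple of 16 elements.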
+ static constexpr int num_elem_per_thread = (size(FragA{}.shape()) + 2 * size(FragARow{}.shape()) + 15) / 16 * 16;
+ static const int max_num_partitions = 8;
+
+ // Device side arguments
+ struct KernelArguments {
+ ProblemShape shape;
+ const ElementQ *Q;
+ StrideQ dQ;
+ const ElementK *K;
+ StrideK dK;
+ const ElementV *V;
+ StrideV dV;
+ ElementO *O;
+ StrideO dO;
+ };
+ using KernelParams = KernelArguments;
+
+ struct Arguments {
+ KernelArguments kernel{};
+ MainloopArguments mainloop{};
+ EpilogueArguments epilogue{};
+ KernelHardwareInfo hw_info{};
+ };
+
+ // Kernel entry point API
+ struct Params {
+ KernelParams kernel;
+ MainloopParams mainloop;
+ EpilogueParams epilogue;
+ TileSchedulerParams scheduler;
+ // workspace for storing partial results of different KV partitions
+ ElementA *partial_results_ptr = nullptr;
+ // for atomic add
+ int32_t *atomic_reduce_cnt_ptr = nullptr;
+ };
+
+ //
+ // Methods
+ //
+
+ static Params to_underlying_arguments(Arguments const &args, void *workspace) {
+ int num_batch_heads = args.kernel.shape.batch * args.kernel.shape.num_heads_q;
+    int32_t *atomic_reduce_cnt_ptr = reinterpret_cast<int32_t*>(workspace);
+    ElementA *partial_results_ptr = reinterpret_cast<ElementA*>(atomic_reduce_cnt_ptr + num_batch_heads);
+ return {args.kernel,
+ CollectiveMainloop::to_underlying_arguments(args.mainloop, workspace),
+ CollectiveEpilogue::to_underlying_arguments(args.epilogue, workspace),
+ TileScheduler::to_underlying_arguments(args.kernel.shape, args.hw_info, TileShapeO{}),
+ partial_results_ptr, atomic_reduce_cnt_ptr
+ };
+ }
+
+ static bool can_implement(Arguments const &args) {
+    // current kernel only supports decode (seq_len_qo == 1)
+ if (args.kernel.shape.seq_len_qo > 1) {
+ return false;
+ }
+    // current kernel only supports batch * num_heads_q no larger than the total XeCore count
+ if (args.kernel.shape.batch * args.kernel.shape.num_heads_q > args.hw_info.sm_count) {
+ return false;
+ }
+ return CollectiveMainloop::can_implement(args.mainloop)
+ && CollectiveEpilogue::can_implement(args.epilogue);
+ }
+
+ static int get_workspace_size(Arguments const &args) {
+ int ws_size = 0;
+ int num_batch_heads = args.kernel.shape.batch * args.kernel.shape.num_heads_q;
+ const int wg_size = SGPerWG::value * intel::sg_size;
+
+ // partial attn outputs, exp sum and max logits
+ ws_size += (max_num_partitions * num_batch_heads) * wg_size * num_elem_per_thread * sizeof(ElementA);
+ // atomic counter
+ ws_size += num_batch_heads * sizeof(int32_t);
+ return ws_size;
+ }
+
+ static cutlass::Status initialize_workspace(Arguments const &args, void *workspace = nullptr,
+ cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr) {
+ int num_batch_heads = args.kernel.shape.batch * args.kernel.shape.num_heads_q;
+    compat::fill(reinterpret_cast<int32_t*>(workspace), (int32_t)0, num_batch_heads);
+ auto partial_ws_count = (get_workspace_size(args) - num_batch_heads * sizeof(int32_t)) / sizeof(ElementA);
+    auto* partial_results_ptr = reinterpret_cast<ElementA*>(reinterpret_cast<int32_t*>(workspace) + num_batch_heads);
+ compat::fill(partial_results_ptr, (ElementA)0, partial_ws_count);
+ return Status::kSuccess;
+ }
+
+ static dim3 get_grid_shape(Params const ¶ms) {
+ return TileScheduler::template get_grid_shape(params.scheduler);
+ }
+
+ static dim3 get_block_shape() { return dim3(SGPerWG::value * intel::sg_size, 1, 1); }
+
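+  // Index of this workgroup's partition within the K blocks owned by batch_head_id.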
+ CUTLASS_DEVICE
+ int get_partition_id(const int cur_wg_id, const int batch_head_id, const int num_blocks_per_wg, const int local_k_blocks) {
+ int partition_id = 0;
+ if (batch_head_id == 0) {
+ return cur_wg_id;
+ }
+ int start_wg_id = batch_head_id * local_k_blocks / num_blocks_per_wg;
+ partition_id = cur_wg_id - start_wg_id;
+ return partition_id;
+ }
+
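+  // Number of workgroups contributing partial results for batch_head_id.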
+ CUTLASS_DEVICE
+ int get_num_partitions(const int batch_head_id, const int num_blocks_per_wg, const int local_k_blocks) {
+ int num_partitions = 1;
+ int start_wg_id = batch_head_id * local_k_blocks / num_blocks_per_wg;
+ int end_wg_id = (batch_head_id + 1) * local_k_blocks / num_blocks_per_wg;
+ num_partitions = end_wg_id - start_wg_id + 1;
+ // end_wg_id is the starting wg id of next batch head id
+ if (((batch_head_id + 1) * local_k_blocks) % num_blocks_per_wg == 0) {
+ num_partitions -= 1;
+ }
+ return num_partitions;
+ }
+
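+  // Merge two partial results (out, max, exp_sum) from different KV partitions:
+  // take the elementwise max of the row maxima, rescale each side by
+  // exp2(old_max - new_max), then add the rescaled sums and outputs.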
+ template
+ CUTLASS_DEVICE
+ void reduce_split2(const Params ¶ms, FragA &out1, FragARow& max_val1, FragARow& exp_sum_val1, FragA &out2, FragARow& max_val2, FragARow& exp_sum_val2) {
+ // global max value
+ FragARow max_prev1 = max_val1;
+ FragARow max_prev2 = max_val2;
+
+ auto scale = params.mainloop.scale;
+
+ CUTLASS_PRAGMA_UNROLL
+ for (int i = 0; i < max_val1.size(); i++) {
+ max_val1(i) = sycl::max(max_val1(i), max_val2(i));
+ }
+
+ FragARow rescale1, rescale2;
+ CUTLASS_PRAGMA_UNROLL
+ for (int i = 0; i < max_val1.size(); i++) {
+ rescale1(i) = sycl::native::exp2(max_prev1(i) - max_val1(i));
+ rescale2(i) = sycl::native::exp2(max_prev2(i) - max_val1(i));
+ }
+
+ CUTLASS_PRAGMA_UNROLL
+ for (int i = 0; i < exp_sum_val1.size(); i++) {
+ exp_sum_val1(i) = exp_sum_val1(i) * rescale1(i) + exp_sum_val2(i) * rescale2(i);
+ }
+
+ CUTLASS_PRAGMA_UNROLL
+ for (int i = 0; i < out1.size(); i++)
+ out1(i) = out1(i) * broadcast<0>(rescale1, out1, i) + out2(i) * broadcast<0>(rescale2, out2, i);
+ }
+
+ CUTLASS_DEVICE
+ void operator()(Params const ¶ms, char *smem_buf)
+ {
+ using namespace sycl::ext::oneapi::this_work_item;
+
+    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+
+ auto &p = params.kernel;
+ ProblemShape const& s = p.shape;
+ int head_group_q = s.num_heads_q / s.num_heads_kv;
+
+ int thr_id = int(ThreadIdxX());
+ int wg_id = int(BlockIdxZ());
+
+ int sg_id = thr_id / intel::sg_size;
+ int tid_in_sg = thr_id % intel::sg_size;
+ int num_batch_heads = s.batch * s.num_heads_q;
+
+ int local_k_blocks = cute::ceil_div(s.seq_len_kv, get<1>(TileShapeQK{}));
+    // total number of blocks that need to be processed across all wgs
+ int total_k_blocks = local_k_blocks * num_batch_heads;
+    // to guarantee all wgs process a similar number of KV blocks
+ int num_blocks_per_wg = cute::ceil_div(total_k_blocks, GridDimZ());
+
+ TileScheduler tile_scheduler{params.scheduler, get<1>(TileShapeQK{}), local_k_blocks, num_batch_heads};
+
+ CUTLASS_PRAGMA_NO_UNROLL
+ for (; tile_scheduler.is_valid(); ++tile_scheduler) {
+ auto [blk_q, blk_v, start_batch_head_id] = tile_scheduler.get_block_coord(); // (Q,V, batch_head_idx)
+ auto blk_qv = make_coord(blk_q, blk_v);
+
+ auto shape_Q = make_shape(s.seq_len_qo, s.head_size_qk, s.num_heads_q, s.batch);
+ auto shape_K = make_shape(s.seq_len_kv, s.head_size_qk, s.num_heads_kv, s.batch);
+ auto shape_V = make_shape(s.head_size_vo, s.seq_len_kv, s.num_heads_kv, s.batch);
+ auto shape_O = make_shape(s.seq_len_qo, s.head_size_vo, s.num_heads_kv, s.batch);
+
+      auto dcQ = const_cast<ElementQ*>(p.Q); // de-const these for uniformity
+      auto dcK = const_cast<ElementK*>(p.K);
+      auto dcV = const_cast<ElementV*>(p.V);
+
+ Tensor Q = make_tensor(make_gmem_ptr(dcQ), make_layout(shape_Q, p.dQ)); // (q,d,h,b)
+ Tensor K = make_tensor(make_gmem_ptr(dcK), make_layout(shape_K, p.dK)); // (k,d,h,b)
+ Tensor V = make_tensor(make_gmem_ptr(dcV), make_layout(shape_V, p.dV)); // (v,k,h,b)
+ Tensor O = make_tensor(make_gmem_ptr(p.O), make_layout(shape_O, p.dO)); // (q,v,h,b)
+
+ // O accumulator types
+ FragA tArA;
+ FragARow tA_max, tA_sum;
+
+ // compute num computed blocks for start batch head id
+ int num_computed_blocks = (start_batch_head_id == 0) ? (wg_id * num_blocks_per_wg) : (wg_id * num_blocks_per_wg - start_batch_head_id * local_k_blocks);
+ int start_blk, end_blk, head_q, idx_b, head_kv;
+      // the leader wg is also responsible for reducing partial results, while
+      // other worker wgs only compute partial results
+ bool is_leader_wg = wg_id < num_batch_heads;
+
+ if (thr_id == 0 && is_leader_wg) {
+ // reset atomic counter before computation
+ *(params.atomic_reduce_cnt_ptr + wg_id) = 0;
+ }
+
+ // Main loop
+ CollectiveMainloop mainloop(params.mainloop, shared_storage.mainloop);
+
+      // remaining block budget for this wg
+ int block_budget_remained = num_blocks_per_wg;
+ int batch_head_id = start_batch_head_id;
+ bool is_update_batch_head_id = false;
+ while (block_budget_remained > 0) {
+ int num_new_blocks = local_k_blocks - num_computed_blocks;
+ if (num_new_blocks <= block_budget_remained) {
+ // finished current batch head id
+ start_blk = num_computed_blocks;
+ end_blk = start_blk + num_new_blocks;
+
+ // update states
+ num_computed_blocks = 0;
+ block_budget_remained -= num_new_blocks;
+ is_update_batch_head_id = true;
+ } else {
+ // budget cannot afford finishing current batch head id
+ start_blk = num_computed_blocks;
+ end_blk = start_blk + block_budget_remained;
+
+ block_budget_remained = 0;
+ is_update_batch_head_id = false;
+ }
+
+ head_q = batch_head_id % s.num_heads_q;
+ idx_b = batch_head_id / s.num_heads_q;
+ head_kv = head_q / head_group_q;
+ // mainloop
+ mainloop(Q(_,_,head_q,idx_b),
+ K(_,_,head_kv,idx_b),
+ V(_,_,head_kv,idx_b),
+ tArA, tA_max, tA_sum,
+ blk_qv, start_blk, end_blk, local_k_blocks,
+ thr_id, s.seq_len_kv, /*for causal*/0, 0);
+
+ // partition id of start batch head id in current wg
+ int partition_id = get_partition_id(wg_id, batch_head_id, num_blocks_per_wg, local_k_blocks);
+
+ // store partial result: tArA, tA_max and tA_sum
+ int offset = batch_head_id * max_num_partitions * num_elem_per_thread * SGPerWG::value * intel::sg_size
+ + partition_id * num_elem_per_thread * SGPerWG::value * intel::sg_size
+ + sg_id * intel::sg_size * num_elem_per_thread
+ + tid_in_sg * num_elem_per_thread;
+ Tensor tPartial = make_tensor(params.partial_results_ptr + offset, make_shape(Int{}));
+ Tensor merged_res = make_tensor(Int{});
+
+ CUTLASS_PRAGMA_UNROLL
+ for(int i = 0; i < size(FragA{}.shape()); ++i) {
+ merged_res(i) = tArA(i);
+ }
+ CUTLASS_PRAGMA_UNROLL
+ for (int i = 0; i < size(FragARow{}.shape()); ++i) {
+ merged_res(2 * i + size(FragA{}.shape())) = tA_max(i);
+ merged_res(2 * i + 1 + size(FragA{}.shape())) = tA_sum(i);
+ }
+ copy(merged_res, tPartial);
+
+ // after the store, increment the atomic counter
+ if (thr_id == 0) {
+ atomicAdd(params.atomic_reduce_cnt_ptr + batch_head_id, 1);
+ }
+
+ // advance to next batch head id
+ if (is_update_batch_head_id) {
+ batch_head_id += 1;
+ if (batch_head_id >= num_batch_heads) {
+ break;
+ }
+ }
+ }
+
+ if (is_leader_wg) {
+ int num_partitions = get_num_partitions(wg_id, num_blocks_per_wg, local_k_blocks);
+
+ // spin on the atomic counter until all partial results are ready
+ while(atomicLoad(params.atomic_reduce_cnt_ptr + wg_id) != num_partitions) {}
+
+ clear(tArA);
+ clear(tA_max);
+ clear(tA_sum);
+
+ for (int i = 0; i < num_partitions; ++i) {
+ int offset = wg_id * max_num_partitions * SGPerWG::value * intel::sg_size * num_elem_per_thread
+ + i * SGPerWG::value * intel::sg_size * num_elem_per_thread
+ + sg_id * intel::sg_size * num_elem_per_thread
+ + tid_in_sg * num_elem_per_thread;
+ Tensor tPartial = make_tensor(params.partial_results_ptr + offset, make_shape(Int{}));
+ Tensor merged_res = make_tensor(Int{});
+ copy(tPartial, merged_res);
+
+ if (i == 0) {
+ CUTLASS_PRAGMA_UNROLL
+ for(int i = 0; i < size(FragA{}.shape()); ++i) {
+ tArA(i) = merged_res(i);
+ }
+
+ CUTLASS_PRAGMA_UNROLL
+ for (int i = 0; i < size(FragARow{}.shape()); ++i) {
+ tA_max(i) = merged_res(2 * i + size(FragA{}.shape()));
+ tA_sum(i) = merged_res(2 * i + 1 + size(FragA{}.shape()));
+ }
+
+ continue;
+ }
+
+ FragA tArA_2;
+ FragARow tA_max_2, tA_sum_2;
+ CUTLASS_PRAGMA_UNROLL
+ for(int i = 0; i < size(FragA{}.shape()); ++i) {
+ tArA_2(i) = merged_res(i);
+ }
+
+ CUTLASS_PRAGMA_UNROLL
+ for (int i = 0; i < size(FragARow{}.shape()); ++i) {
+ tA_max_2(i) = merged_res(2 * i + size(FragA{}.shape()));
+ tA_sum_2(i) = merged_res(2 * i + 1 + size(FragA{}.shape()));
+ }
+
+ reduce_split2(params, tArA, tA_max, tA_sum, tArA_2, tA_max_2, tA_sum_2);
+ }
+
+ // a group barrier is required if SLM is used
+ if constexpr (!is_empty_v && !is_empty_v) {
+ sycl::group_barrier(get_work_group<3>());
+ }
+
+ head_q = wg_id % s.num_heads_q;
+ idx_b = wg_id / s.num_heads_q;
+ head_kv = head_q / head_group_q;
+
+ // Epilogue
+ CollectiveEpilogue epilogue{params.epilogue, shared_storage.epilogue};
+ epilogue(O(_,_,head_q,idx_b),
+ tArA, tA_max, tA_sum,
+ blk_qv, thr_id);
+ }
+ }
+ }
+};
+
+} // namespace cutlass::fmha::kernel
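
Note on the partial-results store above: the flat `offset` is ordinary row-major indexing over (batch_head, partition, sub-group, lane, element), where each work-item writes `num_elem_per_thread` values (its `FragA` fragment followed by the interleaved row-max/row-sum pairs). A minimal host-side sketch of that indexing, with invented sizes and simplified names (it is not the kernel code):

```
// Illustrative sketch only -- sizes are invented; the kernel's real values
// come from FragA/FragARow, SGPerWG::value and intel::sg_size.
#include <cassert>
#include <cstdio>

int main() {
  int max_num_partitions  = 4;   // partitions a batch-head can be split into
  int sgs_per_wg          = 8;   // sub-groups per work-group (SGPerWG::value)
  int sg_size             = 16;  // work-items per sub-group (intel::sg_size)
  int num_elem_per_thread = 6;   // FragA elements + 2 * FragARow elements

  // Same arithmetic as the kernel's offset computation, written as nested
  // row-major indexing over (batch_head, partition, sub_group, lane, element).
  auto offset = [&](int batch_head, int partition, int sg, int lane) {
    return ((batch_head * max_num_partitions + partition) * sgs_per_wg + sg)
               * sg_size * num_elem_per_thread
           + lane * num_elem_per_thread;
  };

  // Consecutive lanes of the same sub-group own adjacent, non-overlapping
  // num_elem_per_thread-sized slices of the buffer.
  assert(offset(0, 0, 0, 1) - offset(0, 0, 0, 0) == num_elem_per_thread);
  assert(offset(0, 0, 1, 0) - offset(0, 0, 0, 0) == sg_size * num_elem_per_thread);
  std::printf("offset(1, 2, 3, 4) = %d\n", offset(1, 2, 3, 4));
  return 0;
}
```

Because consecutive lanes own adjacent, disjoint slices, the leader wg can later reload each partition's slice with a plain `copy` and combine it with `reduce_split2`, as in the reduction loop above.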
diff --git a/applications/flash_attention_v2/kernel/xe_tile_scheduler.hpp b/applications/flash_attention_v2/kernel/xe_tile_scheduler.hpp
new file mode 100644
index 0000000000..24a686993c
--- /dev/null
+++ b/applications/flash_attention_v2/kernel/xe_tile_scheduler.hpp
@@ -0,0 +1,163 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved.
+ * Copyright (C) 2025 Intel Corporation, All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+
+#pragma once
+
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/kernel_hardware_info.h"
+
+namespace cutlass::fmha::kernel {
+
+struct XeFHMAIndividualTileScheduler {
+
+ struct Params {
+ dim3 grid;
+ FastDivmod divmod_num_heads;
+ };
+
+ bool valid_ = true;
+ Params params;
+
+ CUTLASS_DEVICE
+ XeFHMAIndividualTileScheduler(Params const& params) : params(params) {}
+
+ template <class ProblemShape, class TileShape>
+ static Params to_underlying_arguments(
+ ProblemShape const& shape, KernelHardwareInfo hw_info,
+ TileShape const& tile_shape)
+ {
+ using namespace cute;
+
+ dim3 grid(size(ceil_div(shape.head_size_vo, get<1>(tile_shape))), // V
+ size(ceil_div(shape.seq_len_qo, get<0>(tile_shape))), // Q
+ size(shape.batch * shape.num_heads_q)); // (h,b) -- split later
+ return Params{grid, {shape.num_heads_q}};
+ }
+
+ template
+ static dim3 get_grid_shape(Params const& params) {
+ return params.grid;
+ }
+
+ CUTLASS_DEVICE
+ bool is_valid() {
+ return valid_;
+ }
+
+ CUTLASS_DEVICE
+ auto get_block_coord() {
+ using namespace cute;
+ int idx_b = BlockIdxZ();
+ int head;
+ params.divmod_num_heads(idx_b, head, idx_b);
+ return make_coord(BlockIdxY(), BlockIdxX(), head, idx_b);
+ }
+
+ CUTLASS_DEVICE
+ XeFHMAIndividualTileScheduler& operator++() {
+ valid_ = false;
+ return *this;
+ }
+};
+
+struct XeFHMAIndividualPersistentTileScheduler {
+
+ struct Params {
+ dim3 grid;
+ FastDivmod divmod_num_heads;
+ };
+
+ bool valid_ = true;
+ Params params;
+ int kv_tile_size_;
+ // number of KV blocks for each head
+ int local_num_kv_blocks_;
+ int num_batch_heads_;
+
+ CUTLASS_DEVICE
+ XeFHMAIndividualPersistentTileScheduler(Params const& params, int kv_tile_size,
+ int local_num_kv_blocks, int num_batch_heads)
+ : params(params), kv_tile_size_(kv_tile_size), local_num_kv_blocks_(local_num_kv_blocks), num_batch_heads_(num_batch_heads) {}
+
+ template <class ProblemShape, class TileShape>
+ static Params to_underlying_arguments(
+ ProblemShape const& shape, KernelHardwareInfo hw_info,
+ TileShape const& tile_shape)
+ {
+ using namespace cute;
+
+ dim3 grid(size(ceil_div(shape.head_size_vo, get<1>(tile_shape))), // V
+ size(ceil_div(shape.seq_len_qo, get<0>(tile_shape))), // Q
+ size(shape.batch * shape.num_heads_q)); // (h,b) -- split later
+ int num_heads = shape.num_heads_q;
+ grid.z = hw_info.sm_count;
+
+ return Params{grid, {num_heads}};
+ }
+
+ template
+ static dim3 get_grid_shape(Params const& params) {
+ return params.grid;
+ }
+
+ CUTLASS_DEVICE
+ bool is_valid() {
+ return valid_;
+ }
+
+ CUTLASS_DEVICE
+ auto get_block_coord() {
+ using namespace cute;
+ int wg_id = BlockIdxZ();
+
+ // total number of KV blocks that need to be processed across all wgs
+ int total_num_kv_blocks = local_num_kv_blocks_ * num_batch_heads_;
+ // ensure all wgs process a similar number of KV blocks (load balance)
+ int num_blocks_per_wg = cute::ceil_div(total_num_kv_blocks, GridDimZ());
+
+ // compute the starting batch head id for the current wg
+ int start_batch_head_id = wg_id * num_blocks_per_wg / local_num_kv_blocks_;
+
+ return make_coord(BlockIdxY(), BlockIdxX(), start_batch_head_id);
+ }
+
+ CUTLASS_DEVICE
+ XeFHMAIndividualPersistentTileScheduler& operator++() {
+ valid_ = false;
+ return *this;
+ }
+};
+
+} // namespace cutlass::fmha::kernel
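
The persistent scheduler above only hands each work-group its starting batch-head id; the kernel's budget loop earlier in this patch then walks forward until `num_blocks_per_wg` KV blocks have been consumed, possibly finishing one batch-head partway through and continuing into the next. A small host-side sketch of that split, with invented sizes and simplified control flow (not the kernel code):

```
// Sketch only: simulates which (batch_head, KV-block range) pairs each
// persistent work-group processes. Sizes are invented; names mirror the
// kernel/scheduler but the control flow is simplified.
#include <algorithm>
#include <cstdio>

int main() {
  int local_kv_blocks = 5;   // KV blocks per (batch, head)
  int num_batch_heads = 4;   // batch * num_heads_q
  int grid_z          = 6;   // persistent work-groups (hw_info.sm_count)

  int total_kv_blocks   = local_kv_blocks * num_batch_heads;
  int num_blocks_per_wg = (total_kv_blocks + grid_z - 1) / grid_z;  // ceil_div

  for (int wg_id = 0; wg_id < grid_z; ++wg_id) {
    // Same arithmetic as XeFHMAIndividualPersistentTileScheduler::get_block_coord
    int start_batch_head_id = wg_id * num_blocks_per_wg / local_kv_blocks;
    // Blocks of that batch-head already assigned to preceding work-groups
    int num_computed_blocks =
        wg_id * num_blocks_per_wg - start_batch_head_id * local_kv_blocks;

    int budget        = num_blocks_per_wg;
    int batch_head_id = start_batch_head_id;
    while (budget > 0 && batch_head_id < num_batch_heads) {
      int remaining = local_kv_blocks - num_computed_blocks;
      int take      = std::min(remaining, budget);
      std::printf("wg %d: batch_head %d, KV blocks [%d, %d)\n",
                  wg_id, batch_head_id, num_computed_blocks, num_computed_blocks + take);
      budget -= take;
      if (take == remaining) {  // finished this batch-head; move to the next one
        num_computed_blocks = 0;
        ++batch_head_id;
      } else {                  // budget exhausted mid batch-head
        num_computed_blocks += take;
      }
    }
  }
  return 0;
}
```

For example, with 5 KV blocks per batch-head, 4 batch-heads and 6 work-groups, each work-group gets a budget of 4 blocks, so wg 1 finishes the last block of batch-head 0 and then takes the first 3 blocks of batch-head 1.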
diff --git a/benchmarks/gemm/benchmark_runner.hpp b/benchmarks/gemm/benchmark_runner.hpp
index 69dc80a741..7fbca3a76b 100644
--- a/benchmarks/gemm/benchmark_runner.hpp
+++ b/benchmarks/gemm/benchmark_runner.hpp
@@ -172,7 +172,7 @@ struct BenchmarkRunnerGemm {
using ElementA = typename Gemm::ElementA;
using ElementB = typename Gemm::ElementB;
- using ElementAcc = typename Gemm::ElementAccumulator;
+ using ElementAccumulator = typename Gemm::ElementAccumulator;
using CollectiveMainloop = typename Gemm::GemmKernel::CollectiveMainloop;
using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy;
@@ -187,7 +187,6 @@ struct BenchmarkRunnerGemm {
using ElementC = typename Gemm::ElementC;
using ElementOutput = typename CollectiveEpilogue::ElementOutput;
using ElementCompute = typename CollectiveEpilogue::ElementCompute;
- using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator;
using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape;
@@ -660,7 +659,7 @@ struct BenchmarkRunnerGemm {
stride_S, block_zero.get(), stride_Z, 128};
}
- arguments.epilogue = {{ElementAcc(options.alpha), ElementAcc(options.beta)}, block_C[0].get(), stride_C, block_D.get(), stride_D};
+ arguments.epilogue = {{ElementAccumulator(options.alpha), ElementAccumulator(options.beta)}, block_C[0].get(), stride_C, block_D.get(), stride_D};
arguments.hw_info = hw_info;
if constexpr(epi_is_deeltactmul){
@@ -748,7 +747,7 @@ struct BenchmarkRunnerGemm {
gemm::GemmUniversalMode::kGemm,
problem_size,
{block_A[input_num].get(), stride_A, block_B[input_num].get(), stride_B},
- {{ElementAcc(options.alpha), ElementAcc(options.beta)}, block_C[input_num].get(), stride_C, block_D.get(), stride_D},
+ {{ElementAccumulator(options.alpha), ElementAccumulator(options.beta)}, block_C[input_num].get(), stride_C, block_D.get(), stride_D},
hw_info
};
if constexpr (is_mixed_dtype) {
diff --git a/cmake/FindDPCPP.cmake b/cmake/FindDPCPP.cmake
index d92070c72c..3013c2302f 100644
--- a/cmake/FindDPCPP.cmake
+++ b/cmake/FindDPCPP.cmake
@@ -41,41 +41,65 @@ add_library(DPCPP::DPCPP INTERFACE IMPORTED)
set(DPCPP_FLAGS "-fsycl;")
if(DPCPP_HOST_COMPILER)
list(APPEND DPCPP_FLAGS "-fsycl-host-compiler=${DPCPP_HOST_COMPILER}")
- list(APPEND DPCPP_FLAGS "-fsycl-host-compiler-options=-Wno-changes-meaning -D$, -D> -I$, -I>")
+ set(_host_opts "-Wno-changes-meaning $<$>:-fPIC> -D$, -D> -I$, -I>")
+ if(DEFINED DPCPP_HOST_COMPILER_OPTIONS AND NOT "${DPCPP_HOST_COMPILER_OPTIONS}" STREQUAL "")
+ set(_host_opts "${DPCPP_HOST_COMPILER_OPTIONS} ${_host_opts}")
+ string(STRIP "${_host_opts}" _host_opts)
+ endif()
+ list(APPEND DPCPP_FLAGS "-fsycl-host-compiler-options=${_host_opts}")
endif()
set(DPCPP_COMPILE_ONLY_FLAGS "")
set(DPCPP_LINK_ONLY_FLAGS "")
-if(NOT "${DPCPP_SYCL_TARGET}" STREQUAL "")
- list(APPEND DPCPP_FLAGS "-fsycl-targets=${DPCPP_SYCL_TARGET};")
-endif()
-
option(DPCPP_DISABLE_ITT_FOR_CUTLASS "Disables linking of the Instrumentation and Tracing Technology (ITT) device libraries for VTune" ON)
if(NOT "${DPCPP_USER_FLAGS}" STREQUAL "")
list(APPEND DPCPP_FLAGS "${DPCPP_USER_FLAGS};")
endif()
+string(REPLACE "," ";" DPCPP_SYCL_TARGET_LIST "${DPCPP_SYCL_TARGET}")
+
if(NOT "${DPCPP_SYCL_ARCH}" STREQUAL "")
- if("${DPCPP_SYCL_TARGET}" STREQUAL "nvptx64-nvidia-cuda")
+ if(SYCL_NVIDIA_TARGET)
+ list(APPEND DPCPP_FLAGS "-fsycl-targets=nvptx64-nvidia-cuda;")
list(APPEND DPCPP_FLAGS "-Xsycl-target-backend")
list(APPEND DPCPP_FLAGS "--cuda-gpu-arch=${DPCPP_SYCL_ARCH}")
list(APPEND DPCPP_COMPILE_ONLY_FLAGS; "-mllvm;-enable-global-offset=false;")
endif()
endif()
-if("${DPCPP_SYCL_TARGET}" STREQUAL "intel_gpu_pvc" OR
- "${DPCPP_SYCL_TARGET}" STREQUAL "spir64" OR
- "${DPCPP_SYCL_TARGET}" STREQUAL "intel_gpu_bmg_g21")
- if ((CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 2025.2) OR CUTLASS_SYCL_BUILTIN_ENABLE)
- list(APPEND DPCPP_LINK_ONLY_FLAGS "-Xspirv-translator;-spirv-ext=+SPV_INTEL_split_barrier")
- else()
- list(APPEND DPCPP_LINK_ONLY_FLAGS "-Xspirv-translator;-spirv-ext=+SPV_INTEL_split_barrier,+SPV_INTEL_2d_block_io,+SPV_INTEL_subgroup_matrix_multiply_accumulate")
- endif()
+if (SYCL_INTEL_TARGET)
if(DPCPP_DISABLE_ITT_FOR_CUTLASS)
list(APPEND DPCPP_FLAGS "-fno-sycl-instrument-device-code")
endif()
+ set(SYCL_DEVICES)
+
+ foreach(TGT IN LISTS DPCPP_SYCL_TARGET_LIST)
+ if(TGT STREQUAL "intel_gpu_bmg_g21" OR TGT STREQUAL "bmg")
+ list(APPEND SYCL_DEVICES "bmg_g21")
+ elseif(TGT STREQUAL "intel_gpu_pvc" OR TGT STREQUAL "pvc")
+ list(APPEND SYCL_DEVICES "pvc")
+ endif()
+ endforeach()
+
+ list(REMOVE_DUPLICATES SYCL_DEVICES)
+
+ string(JOIN "," SYCL_DEVICES_STR ${SYCL_DEVICES})
+
+ list(APPEND DPCPP_LINK_ONLY_FLAGS "-fsycl-targets=spir64")
+ list(APPEND DPCPP_LINK_ONLY_FLAGS "-Xs;-device ${SYCL_DEVICES_STR}")
+
+ list(APPEND DPCPP_LINK_ONLY_FLAGS "-Xspirv-translator")
+
+ if((CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM" AND
+ CMAKE_CXX_COMPILER_VERSION VERSION_LESS 2025.2) OR CUTLASS_SYCL_BUILTIN_ENABLE)
+ set(SPIRV_EXT "+SPV_INTEL_split_barrier")
+ else()
+ set(SPIRV_EXT "+SPV_INTEL_split_barrier,+SPV_INTEL_2d_block_io,+SPV_INTEL_subgroup_matrix_multiply_accumulate")
+ endif()
+ list(APPEND DPCPP_LINK_ONLY_FLAGS "-spirv-ext=${SPIRV_EXT}")
+
endif()
if(UNIX)
diff --git a/examples/00_bmg_gemm/00_bmg_gemm.cpp b/examples/00_bmg_gemm/00_bmg_gemm.cpp
index 7e9291227e..b9c7738872 100644
--- a/examples/00_bmg_gemm/00_bmg_gemm.cpp
+++ b/examples/00_bmg_gemm/00_bmg_gemm.cpp
@@ -154,13 +154,12 @@ struct ExampleRunner {
using ElementA = typename Gemm::ElementA;
using ElementB = typename Gemm::ElementB;
- using ElementAcc = typename Gemm::ElementAccumulator;
+ using ElementAccumulator = typename Gemm::ElementAccumulator;
using CollectiveEpilogue = typename Gemm::CollectiveEpilogue;
using ElementC = typename Gemm::ElementC;
using ElementOutput = typename CollectiveEpilogue::ElementOutput;
using ElementCompute = typename CollectiveEpilogue::ElementCompute;
- using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator;
using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape;
@@ -277,9 +276,7 @@ struct ExampleRunner {
bool passed = verify(problem_size, options.alpha, options.beta);
std::cout << "Disposition: " << (passed ? "Passed" : "Failed") << std::endl;
- if(!passed) return cutlass::Status::kErrorInternal;
-
- if (options.iterations > 0) {
+ if (passed && options.iterations > 0) {
GPU_Clock timer;
timer.start();
for (int i = 0; i < options.iterations; ++i) {
@@ -345,9 +342,13 @@ int main(int argc, const char** argv)
using LayoutC = cutlass::layout::RowMajor;
using LayoutD = cutlass::layout::RowMajor;
- // The 2D block copy operations used for the A and B matrices
- using GmemTiledCopyA = XE_2D_U16x32x32_LD_N;
- using GmemTiledCopyB = XE_2D_U16x32x32_LD_V;
+ // [New Copy Atom] When left unspecified (void), MainloopXeL1Staged automatically selects
+ // appropriate 2D block copy operations for matrices A and B. Alternatively, you can
+ // explicitly specify new copy atom operations such as XE_LOAD_2D, XE_LOAD_2D_VNNI,
+ // or XE_LOAD_2D_TRANSPOSE.
+ // Refer https://github.com/intel/sycl-tla/blob/main/media/docs/cpp/xe_rearchitecture.md
+ using GmemTiledCopyA = void; //XE_LOAD_2D<16, 32, 32>;
+ using GmemTiledCopyB = void; //XE_LOAD_2D_VNNI<16, 32, 32>;
// Workgroup-level tile
using TileShape = Shape<_256, _256, _32>;
@@ -355,21 +356,21 @@ int main(int argc, const char** argv)
// A TiledMMA struct defines a tiling of an MMA atom over M, N and K, combining both additional
// hardware (sub-groups for Intel BMG) and iterations by each sub-group.
//
- // The TiledMMAHelper struct defines a specific TiledMMA for a given MMA atom
- // (XE_8x16x16_F32BF16BF16F32_TT), TileShape (<256, 256, 32>) and sub-group layout (8x4x1). The
- // TiledMMA constructed using TiledMMAHelper has the property that each sub-group operates on a
+ // The TiledMMAHelper struct defines a specific TiledMMA for a given MMA atom. This example uses
+ // the XE_DPAS_TT<8, float, cute::bfloat16_t> atom, which represents an 8x16x16 DPAS operation with
+ //float32 accumulation and bfloat16 inputs, TileShape (<256, 256, 32>) and sub-group layout (8x4x1).
+ // The TiledMMA constructed using TiledMMAHelper has the property that each sub-group operates on a
// single contiguous chunk of the work-group TileShape. For this configuration, this implies that
// each sub-group operates on a contiguous 32x64x32 chunk (4x4x2 iterations). See
// 0t_mma_atom.md#TiledMMAs for more info. Sub-groups are arranged row-major (stride 4,1,0) for
// performance reasons.
- using TiledMma = // M=8,N=16,K=16, D=f32,A=bf16,B=bf16,C=f32
- typename TiledMMAHelper, Layout,
- Layout, Stride<_4, _1, _0>>>::TiledMMA;
+ using TiledMma = typename TiledMMAHelper<MMA_Atom<XE_DPAS_TT<8, float, cute::bfloat16_t>>, Layout<TileShape>, Layout<Shape<_8, _4, _1>, Stride<_4, _1, _0>>>::TiledMMA;
// For Intel BMG, PipelineStages defines how many k-blocks ahead to prefetch from A and B.
constexpr int PipelineStages = 2;
- using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelXeXMX16;
- using EpilogueDispatchPolicy = cutlass::epilogue::IntelXeXMX16;
+ // For older version of copy/mma atom, use cutlass::gemm::MainloopIntelXeXMX16 as dispatch policy
+ using GEMMDispatchPolicy = cutlass::gemm::MainloopXeL1Staged;
+ using EpilogueDispatchPolicy = cutlass::epilogue::IntelXeGeneric;
// This is the 'default' epilogue operation (Linear Combination) which performs everything in:
// (D = alpha * (A*B) + beta * C)
@@ -380,22 +381,22 @@ int main(int argc, const char** argv)
// FusionCallbacks ties the EpilogueOp to an implementation (based on the dispatch
// policy/architecture) and defines the epilogue arguments.
- using FusionCallBacks = cutlass::epilogue::fusion::FusionCallbacks;
+
// GEMM Epilogue - loads & stores C/D matrices, performs epilogue operations & load/stores any
// auxiliary data required
using CollectiveEpilogue = cutlass::epilogue::collective::CollectiveEpilogue<
EpilogueDispatchPolicy,
TileShape,
+ void, // Epilogue tile (void = automatic)
ElementAccumulator,
cutlass::gemm::TagToStrideC_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation
ElementOutput,
cutlass::gemm::TagToStrideC_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation
- FusionCallBacks,
- XE_2D_U32x8x16_LD_N, // The copy atom used to load matrix C
- void, void,
- XE_2D_U32x8x16_ST_N, // The copy atom used to store matrix D
- void, void>;
+ FusionCallbacks,
+ void, // The copy atom used to load matrix C (void = automatic)
+ void>; // The copy atom used to store matrix D (void = automatic)
// GEMM Mainloop - iteration over blocks in K dimension
using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma<
@@ -412,9 +413,9 @@ int main(int argc, const char** argv)
// Define the whole kernel (mainloop and epilogue)
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
- Shape, // Defer global problem shape definition to runtime
- CollectiveMainloop,
- CollectiveEpilogue
+ Shape, // Defer global problem shape definition to runtime
+ CollectiveMainloop,
+ CollectiveEpilogue
>;
// The GemmUniversalAdapter wraps the defined GEMM kernel and handles the launch, and e.g.
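
If you prefer to pin the copy atoms instead of relying on the automatic (`void`) selection, the commented-out hints above spell out the explicit form; a sketch of that alternative follows. The template arguments are copied verbatim from those comments, and our reading of them (element bits, block height, block width) is an assumption rather than something stated in this patch:

```
// Explicit alternative to the `void` (automatic) selection shown in the diff.
// XE_LOAD_2D<16, 32, 32> / XE_LOAD_2D_VNNI<16, 32, 32> are taken from the
// commented-out hints; the parameter interpretation is an assumption.
using GmemTiledCopyA = XE_LOAD_2D<16, 32, 32>;       // 2D block load for A
using GmemTiledCopyB = XE_LOAD_2D_VNNI<16, 32, 32>;  // VNNI-layout load for B
```

Everything else in the kernel assembly stays the same; per the comments, `MainloopXeL1Staged` accepts either explicit atoms or `void`.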
diff --git a/examples/00_bmg_gemm/00_bmg_gemm_padded.cpp b/examples/00_bmg_gemm/00_bmg_gemm_padded.cpp
index b231825fe7..4e1905e58d 100644
--- a/examples/00_bmg_gemm/00_bmg_gemm_padded.cpp
+++ b/examples/00_bmg_gemm/00_bmg_gemm_padded.cpp
@@ -39,7 +39,7 @@
This example makes use of BMGs subgroup cooperative 2d-block copy operations and DPAS instructions.
To support more input shapes using these instructions, rows of the input/output matrices are padded
- to a multiple of 16 and each matrix in batch is padded to a multiple of 64, as required by these
+ to a multiple of 16 and each matrix in batch is padded to a multiple of 64, as required by these
instructions.
The shapes of the A and B matrix are defined at runtime by `options.m`, `.n` and `.k`, and the
@@ -161,14 +161,14 @@ struct ExampleRunner {
using ElementA = typename Gemm::ElementA;
using ElementB = typename Gemm::ElementB;
- using ElementAcc = typename Gemm::ElementAccumulator;
+ using ElementAccumulator = typename Gemm::ElementAccumulator;
using CollectiveEpilogue = typename Gemm::CollectiveEpilogue;
using ElementC = typename Gemm::ElementC;
using ElementD = typename Gemm::ElementD;
using ElementOutput = typename CollectiveEpilogue::ElementOutput;
using ElementCompute = typename CollectiveEpilogue::ElementCompute;
- using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator;
+
using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape;
@@ -200,7 +200,7 @@ struct ExampleRunner {
bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) {
auto [M, N, K, L] = problem_size;
-
+
// Padded values
// The inner dimension is padded. Since this example is all RowMajor,
// we require the following:
@@ -208,7 +208,7 @@ struct ExampleRunner {
int N_C = cute::round_up(N, AlignElemC);
int N_D = cute::round_up(N, AlignElemD);
int K_A = cute::round_up(K, AlignElemA);
-
+
int AlignmentOuter = AlignmentPtr / AlignmentInner;
int M_ACD = cute::round_up(M, AlignmentOuter);
int K_B = cute::round_up(K, AlignmentOuter);
@@ -383,9 +383,13 @@ int main(int argc, const char** argv)
using LayoutC = cutlass::layout::RowMajor;
using LayoutD = cutlass::layout::RowMajor;
- // The 2D block copy operations used for the A and B matrices
- using GmemTiledCopyA = XE_2D_U16x32x32_LD_N;
- using GmemTiledCopyB = XE_2D_U16x32x32_LD_V;
+ // [New Copy Atom] When left unspecified (void), MainloopXeL1Staged automatically selects
+ // appropriate 2D block copy operations for matrices A and B. Alternatively, you can
+ // explicitly specify new copy atom operations such as XE_LOAD_2D, XE_LOAD_2D_VNNI,
+ // or XE_LOAD_2D_TRANSPOSE.
+ // Refer https://github.com/intel/sycl-tla/blob/main/media/docs/cpp/xe_rearchitecture.md
+ using GmemTiledCopyA = void; //XE_LOAD_2D<16, 32, 32>;
+ using GmemTiledCopyB = void; //XE_LOAD_2D_VNNI<16, 32, 32>;
// Workgroup-level tile
using TileShape = Shape<_256, _256, _32>;
@@ -393,21 +397,21 @@ int main(int argc, const char** argv)
// A TiledMMA struct defines a tiling of an MMA atom over M, N and K, combining both additional
// hardware (sub-groups for Intel BMG) and iterations by each sub-group.
//
- // The TiledMMAHelper struct defines a specific TiledMMA for a given MMA atom
- // (XE_8x16x16_F32BF16BF16F32_TT), TileShape (<256, 256, 32>) and sub-group layout (8x4x1). The
- // TiledMMA constructed using TiledMMAHelper has the property that each sub-group operates on a
+ // The TiledMMAHelper struct defines a specific TiledMMA for a given MMA atom. This example uses
+ // the XE_DPAS_TT<8, float, cute::bfloat16_t> atom, which represents an 8x16x16 DPAS operation with
+ // float32 accumulation and bfloat16 inputs, TileShape (<256, 256, 32>) and sub-group layout (8x4x1).
+ // The TiledMMA constructed using TiledMMAHelper has the property that each sub-group operates on a
// single contiguous chunk of the work-group TileShape. For this configuration, this implies that
// each sub-group operates on a contiguous 32x64x32 chunk (4x4x2 iterations). See
// 0t_mma_atom.md#TiledMMAs for more info. Sub-groups are arranged row-major (stride 4,1,0) for
// performance reasons.
- using TiledMma = // M=8,N=16,K=16, D=f32,A=bf16,B=bf16,C=f32
- typename TiledMMAHelper, Layout,
- Layout, Stride<_4, _1, _0>>>::TiledMMA;
+ using TiledMma = typename TiledMMAHelper<MMA_Atom<XE_DPAS_TT<8, float, cute::bfloat16_t>>, Layout<TileShape>, Layout<Shape<_8, _4, _1>, Stride<_4, _1, _0>>>::TiledMMA;
// For Intel BMG, PipelineStages defines how many k-blocks ahead to prefetch from A and B.
constexpr int PipelineStages = 2;
- using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelXeXMX16;
- using EpilogueDispatchPolicy = cutlass::epilogue::IntelXeXMX16;
+ // For older version of copy/mma atom, use cutlass::gemm::MainloopIntelXeXMX16 as dispatch policy
+ using GEMMDispatchPolicy = cutlass::gemm::MainloopXeL1Staged;
+ using EpilogueDispatchPolicy = cutlass::epilogue::IntelXeGeneric;
// This is the 'default' epilogue operation (Linear Combination) which performs everything in:
// (D = alpha * (A*B) + beta * C)
@@ -418,22 +422,21 @@ int main(int argc, const char** argv)
// FusionCallbacks ties the EpilogueOp to an implementation (based on the dispatch
// policy/architecture) and defines the epilogue arguments.
- using FusionCallBacks = cutlass::epilogue::fusion::FusionCallbacks;
// GEMM Epilogue - loads & stores C/D matrices, performs epilogue operations & load/stores any
// auxiliary data required
using CollectiveEpilogue = cutlass::epilogue::collective::CollectiveEpilogue<
EpilogueDispatchPolicy,
TileShape,
+ void, // Epilogue tile (void = automatic)
ElementAccumulator,
cutlass::gemm::TagToStrideC_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation
ElementOutput,
cutlass::gemm::TagToStrideC_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation
- FusionCallBacks,
- XE_2D_U32x8x16_LD_N, // The copy atom used to load matrix C
- void, void,
- XE_2D_U32x8x16_ST_N, // The copy atom used to store matrix D
- void, void>;
+ FusionCallbacks,
+ void, // The copy atom used to load matrix C (void = automatic)
+ void>; // The copy atom used to store matrix D (void = automatic)
// GEMM Mainloop - iteration over blocks in K dimension
using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma<
@@ -464,4 +467,4 @@ int main(int argc, const char** argv)
CUTLASS_CHECK(runner.run(options, hw_info));
return 0;
-}
+}
\ No newline at end of file
diff --git a/examples/00_bmg_gemm/00_bmg_gemm_with_sycl_queue.cpp b/examples/00_bmg_gemm/00_bmg_gemm_with_sycl_queue.cpp
index 67e1193e75..4e42e48e4c 100644
--- a/examples/00_bmg_gemm/00_bmg_gemm_with_sycl_queue.cpp
+++ b/examples/00_bmg_gemm/00_bmg_gemm_with_sycl_queue.cpp
@@ -136,13 +136,12 @@ struct ExampleRunner {
using ElementA = typename Gemm::ElementA;
using ElementB = typename Gemm::ElementB;
- using ElementAcc = typename Gemm::ElementAccumulator;
+ using ElementAccumulator = typename Gemm::ElementAccumulator;
using CollectiveEpilogue = typename Gemm::CollectiveEpilogue;
using ElementC = typename Gemm::ElementC;
using ElementOutput = typename CollectiveEpilogue::ElementOutput;
using ElementCompute = typename CollectiveEpilogue::ElementCompute;
- using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator;
using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape;
@@ -348,42 +347,50 @@ int main(int argc, const char** argv)
using LayoutC = cutlass::layout::RowMajor;
using LayoutD = cutlass::layout::RowMajor;
- using GmemTiledCopyA = XE_2D_U16x32x32_LD_N;
- using GmemTiledCopyB = XE_2D_U16x32x32_LD_V;
+ // [New Copy Atom] When left unspecified (void), MainloopXeL1Staged automatically selects
+ // appropriate 2D block copy operations for matrices A and B. Alternatively, you can
+ // explicitly specify new copy atom operations such as XE_LOAD_2D, XE_LOAD_2D_VNNI,
+ // or XE_LOAD_2D_TRANSPOSE.
+ // Refer https://github.com/intel/sycl-tla/blob/main/media/docs/cpp/xe_rearchitecture.md
+ using GmemTiledCopyA = void; //XE_LOAD_2D<16, 32, 32>;
+ using GmemTiledCopyB = void; //XE_LOAD_2D_VNNI<16, 32, 32>;
// Workgroup-level tile
using TileShape = Shape<_256, _256, _32>;
- // The Tile of this layout describes how 8x4x1 sub-groups tile the TileShape of <256, 256, 32>.
- // This permutation (which can be thought of as a scatter operation on the default tiling)
- // ensures that each sub-group operates on a contiguous 32x64x32 chunk (4x4x2 iterations)
- // See 0t_mma_atom.md#TiledMMAs for more info.
- // Sub-groups are arranged row-major (stride 4,1,0) for performance reasons.
- using TiledMma =
- typename TiledMMAHelper, Layout,
- Layout, Stride<_4, _1, _0>>>::TiledMMA;
+ // A TiledMMA struct defines a tiling of an MMA atom over M, N and K, combining both additional
+ // hardware (sub-groups for Intel BMG) and iterations by each sub-group.
+ //
+ // The TiledMMAHelper struct defines a specific TiledMMA for a given MMA atom. This example uses
+ // the XE_DPAS_TT<8, float, cute::bfloat16_t> atom, which represents an 8x16x16 DPAS operation with
+ //float32 accumulation and bfloat16 inputs, TileShape (<256, 256, 32>) and sub-group layout (8x4x1).
+ // The TiledMMA constructed using TiledMMAHelper has the property that each sub-group operates on a
+ // single contiguous chunk of the work-group TileShape. For this configuration, this implies that
+ // each sub-group operates on a contiguous 32x64x32 chunk (4x4x2 iterations). See
+ // 0t_mma_atom.md#TiledMMAs for more info. Sub-groups are arranged row-major (stride 4,1,0) for
+ // performance reasons.
+ using TiledMma = typename TiledMMAHelper<MMA_Atom<XE_DPAS_TT<8, float, cute::bfloat16_t>>, Layout<TileShape>, Layout<Shape<_8, _4, _1>, Stride<_4, _1, _0>>>::TiledMMA;
constexpr int PipelineStages = 2;
- using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelXeXMX16;
- using EpilogueDispatchPolicy = cutlass::epilogue::IntelXeXMX16;
+ using GEMMDispatchPolicy = cutlass::gemm::MainloopXeL1Staged;
+ using EpilogueDispatchPolicy = cutlass::epilogue::IntelXeGeneric;
using EpilogueOp = cutlass::epilogue::fusion::LinearCombination;
- using FusionCallBacks = cutlass::epilogue::fusion::FusionCallbacks;
using CollectiveEpilogue = cutlass::epilogue::collective::CollectiveEpilogue<
EpilogueDispatchPolicy,
TileShape,
+ void,
ElementAccumulator,
cutlass::gemm::TagToStrideC_t,
ElementOutput,
cutlass::gemm::TagToStrideC_t,
- FusionCallBacks,
- XE_2D_U32x8x16_LD_N,
- void, void,
- XE_2D_U32x8x16_ST_N,
- void, void>;
+ FusionCallbacks,
+ void,
+ void>;
// Mainloop
using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma<
@@ -411,4 +418,4 @@ int main(int argc, const char** argv)
CUTLASS_CHECK(runner.run(options, hw_info));
return 0;
-}
+}
\ No newline at end of file
diff --git a/examples/00_bmg_gemm/legacy/00_bmg_gemm.cpp b/examples/00_bmg_gemm/legacy/00_bmg_gemm.cpp
new file mode 100644
index 0000000000..91139cf0d6
--- /dev/null
+++ b/examples/00_bmg_gemm/legacy/00_bmg_gemm.cpp
@@ -0,0 +1,429 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
+ * Copyright (C) 2025 Intel Corporation, All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+ \brief CUTLASS Intel BMG Gemm Example.
+
+ This example constructs and executes a simple CUTLASS GEMM kernel on Intel BMG hardware, and
+ verifies its correctness with a reference implementation
+ (cutlass::reference::device::GemmComplex). The example also provides a performance measurement
+ for the GEMM in TFLOPS.
+
+ This example makes use of BMG's subgroup cooperative 2d-block copy operations and DPAS instructions.
+
+ The shapes of the A and B matrix are defined at runtime by `options.m`, `.n` and `.k`, and the
+ batch size is defined by `options.l`. The tile shape, which defines how much work is executed by
+ a single work-group, is defined at compile time by:
+ ```
+ using TileShape = Shape<_256, _256, _32>;
+ ```
+ That is, each work-group processes a tile of M=256, N=256, and iterates over `options.k` in
+ blocks of K=32.
+
+ Performance of GEMM on BMG is heavily dependent on prefetching the A and B matrices. That is,
+ executing Intel specific prefetch instructions for future iterations to ensure that the required
+ blocks of A and B are resident in cache before they are needed.
+
+ To build & run this example (from your build dir):
+
+ $ ninja 00_bmg_gemm
+ $ ./examples/sycl/00_bmg_gemm/00_bmg_gemm
+
+ Call with `--help` for information about available options
+*/
+
+#include "cutlass/epilogue/collective/default_epilogue.hpp"
+#include "cutlass/epilogue/collective/xe_epilogue.hpp"
+#include "cutlass/epilogue/fusion/xe_callbacks.hpp"
+#include "cutlass/gemm/device/gemm_universal.h"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/collective/collective_mma.hpp"
+#include "cutlass/util/GPU_Clock.hpp"
+
+#include
+#include
+
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/device_memory.h"
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass/util/reference/device/gemm_complex.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+#include "sycl_common.hpp"
+#include "helper.h"
+
+using namespace cute;
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Command line options parsing
+struct Options {
+
+ bool help;
+ bool error;
+
+ int m, n, k, l, iterations;
+ float alpha, beta;
+
+ Options():
+ help(false),
+ error(false),
+ m(5120), n(4096), k(4096), l(1), iterations(20),
+ alpha(1.f), beta(0.f)
+ { }
+
+ // Parses the command line
+ void parse(int argc, char const **args) {
+ cutlass::CommandLine cmd(argc, args);
+
+ if (cmd.check_cmd_line_flag("help")) {
+ help = true;
+ return;
+ }
+
+ cmd.get_cmd_line_argument("m", m, 5120);
+ cmd.get_cmd_line_argument("n", n, 4096);
+ cmd.get_cmd_line_argument("k", k, 4096);
+ cmd.get_cmd_line_argument("l", l, 1);
+ cmd.get_cmd_line_argument("alpha", alpha, 1.f);
+ cmd.get_cmd_line_argument("beta", beta, 0.f);
+ cmd.get_cmd_line_argument("iterations", iterations, 100);
+ }
+
+ /// Prints the usage statement.
+ std::ostream & print_usage(std::ostream &out) const {
+
+ out << "BMG GEMM Example\n\n"
+ << "Options:\n\n"
+ << " --help If specified, displays this usage statement\n\n"
+ << " --m= Sets the M extent of the GEMM\n"
+ << " --n= Sets the N extent of the GEMM\n"
+ << " --k= Sets the K extent of the GEMM\n"
+ << " --l= Sets the L extent (batch count) of the GEMM\n"
+ << " --alpha= Epilogue scalar alpha\n"
+ << " --beta= Epilogue scalar beta\n\n"
+ << " --iterations= Iterations\n\n";
+
+ return out;
+ }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+ class Gemm
+>
+struct ExampleRunner {
+
+ using StrideA = typename Gemm::GemmKernel::StrideA;
+ using StrideB = typename Gemm::GemmKernel::StrideB;
+ using StrideC = typename Gemm::GemmKernel::StrideC;
+ using StrideD = typename Gemm::GemmKernel::StrideD;
+
+ using LayoutA = typename Gemm::LayoutA;
+ using LayoutB = typename Gemm::LayoutB;
+ using LayoutC = typename Gemm::LayoutC;
+ using LayoutD = typename Gemm::LayoutD;
+
+ using ElementA = typename Gemm::ElementA;
+ using ElementB = typename Gemm::ElementB;
+ using ElementAcc = typename Gemm::ElementAccumulator;
+
+ using CollectiveEpilogue = typename Gemm::CollectiveEpilogue;
+ using ElementC = typename Gemm::ElementC;
+ using ElementOutput = typename CollectiveEpilogue::ElementOutput;
+ using ElementCompute = typename CollectiveEpilogue::ElementCompute;
+ using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator;
+
+ using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape;
+
+ //
+ // Data members
+ //
+
+ /// Initialization
+ StrideA stride_A;
+ StrideB stride_B;
+ StrideC stride_C;
+ StrideD stride_D;
+ uint64_t seed = 0;
+
+ cutlass::DeviceAllocation<ElementA> block_A;
+ cutlass::DeviceAllocation<ElementB> block_B;
+ cutlass::DeviceAllocation<ElementC> block_C;
+ cutlass::DeviceAllocation<ElementOutput> block_D;
+ cutlass::DeviceAllocation<ElementOutput> block_ref_D; // Reference GEMM result for verification
+
+ //
+ // Methods
+ //
+
+ bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) {
+ auto [M, N, K, L] = problem_size;
+
+ cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K}));
+ cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N}));
+ cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N}));
+ cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N}));
+
+ cutlass::reference::device::GemmComplex(
+ {M, N, K},
+ alpha,
+ ref_A,
+ cutlass::ComplexTransform::kNone,
+ ref_B,
+ cutlass::ComplexTransform::kNone,
+ beta,
+ ref_C,
+ ref_D,
+ ElementAccumulator(0),
+ L, // batch_count
+ M * K, // batch_stride_A
+ K * N, // batch_stride_B
+ M * N, // batch_stride_C
+ M * N // batch_stride_D
+ );
+
+ // CUTLASS on SYCL uses the compatibility library compat for e.g. default in-order queue
+ compat::wait();
+
+ // Check if output from CUTLASS kernel and reference kernel are equal or not
+ bool passed = cutlass::reference::device::BlockCompareEqual(
+ block_ref_D.get(), block_D.get(), block_D.size());
+
+ return passed;
+ }
+
+ /// Initialize operands to be used in the GEMM and reference GEMM
+ void initialize(const ProblemShapeType& problem_size) {
+ auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
+ auto [M, N, K, L] = problem_shape_MNKL;
+
+ // Complete the stride by combining static layout info (StrideA) with runtime size info (M,K,L)
+ stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L));
+ stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L));
+ stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
+ stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
+
+ block_A.reset(static_cast<std::size_t>(M) * K * L);
+ block_B.reset(static_cast<std::size_t>(K) * N * L);
+ block_C.reset(static_cast<std::size_t>(M) * N * L);
+ block_D.reset(static_cast<std::size_t>(M) * N * L);
+ block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
+
+ initialize_block(block_A, seed + 2023);
+ initialize_block(block_B, seed + 2022);
+ initialize_block(block_C, seed + 2021);
+ }
+
+ cutlass::Status run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) {
+ ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l};
+
+ initialize(problem_size);
+
+ typename Gemm::GemmKernel::Arguments arguments{
+ cutlass::gemm::GemmUniversalMode::kGemm,
+ problem_size,
+ {block_A.get(), stride_A, block_B.get(), stride_B},
+ {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D},
+ hw_info
+ };
+
+ Gemm gemm_op;
+
+ size_t workspace_size = Gemm::get_workspace_size(arguments);
+ cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+ if (gemm_op.can_implement(arguments) != cutlass::Status::kSuccess){
+ std::cout << "Invalid Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl;
+ std::exit(1);
+ }
+
+ CUTLASS_CHECK(gemm_op.initialize(arguments, workspace.get()));
+
+ // Run the GEMM
+ CUTLASS_CHECK(gemm_op.run());
+
+ compat::wait();
+
+ // Verify that the result is correct
+ bool passed = verify(problem_size, options.alpha, options.beta);
+ std::cout << "Disposition: " << (passed ? "Passed" : "Failed") << std::endl;
+
+ if(!passed) return cutlass::Status::kErrorInternal;
+
+ if (options.iterations > 0) {
+ GPU_Clock timer;
+ timer.start();
+ for (int i = 0; i < options.iterations; ++i) {
+ gemm_op.run();
+ }
+ compat::wait();
+
+ float cute_time = timer.seconds() / options.iterations;
+ double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12;
+ std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl;
+ printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000);
+ }
+
+ return cutlass::Status::kSuccess;
+ }
+
+};
+
+int main(int argc, const char** argv)
+{
+ //
+ // Parse options
+ //
+
+ Options options;
+
+ options.parse(argc, argv);
+
+ if (options.help) {
+ options.print_usage(std::cout) << std::endl;
+ return 0;
+ }
+
+ if (options.error) {
+ std::cerr << "Aborting execution." << std::endl;
+ return -1;
+ }
+
+ //
+ // Run examples
+ //
+
+ // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This
+ // information is used by the underlying kernel.
+ cutlass::KernelHardwareInfo hw_info;
+
+ // Change device_id to another value if you are running on a machine with multiple GPUs and wish
+ // to use a GPU other than that with device ID 0.
+ hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
+
+ bool passed;
+
+ // The code section below describes datatype for input, output matrices and computation between
+ // elements in input matrices.
+ using ElementAccumulator = float; // <- data type of accumulator
+ using ElementComputeEpilogue = float; // <- data type of epilogue operations
+ using ElementInputA = bfloat16_t; // <- data type of elements in input matrix A
+ using ElementInputB = bfloat16_t; // <- data type of elements in input matrix B
+ using ElementOutput = float; // <- data type of elements in output matrix D
+
+ using LayoutA = cutlass::layout::RowMajor;
+ using LayoutB = cutlass::layout::RowMajor;
+ using LayoutC = cutlass::layout::RowMajor;
+ using LayoutD = cutlass::layout::RowMajor;
+
+ // The 2D block copy operations used for the A and B matrices
+ using GmemTiledCopyA = XE_2D_U16x32x32_LD_N;
+ using GmemTiledCopyB = XE_2D_U16x32x32_LD_V;
+
+ // Workgroup-level tile
+ using TileShape = Shape<_256, _256, _32>;
+
+ // A TiledMMA struct defines a tiling of an MMA atom over M, N and K, combining both additional
+ // hardware (sub-groups for Intel BMG) and iterations by each sub-group.
+ //
+ // The TiledMMAHelper struct defines a specific TiledMMA for a given MMA atom
+ // (XE_8x16x16_F32BF16BF16F32_TT), TileShape (<256, 256, 32>) and sub-group layout (8x4x1). The
+ // TiledMMA constructed using TiledMMAHelper has the property that each sub-group operates on a
+ // single contiguous chunk of the work-group TileShape. For this configuration, this implies that
+ // each sub-group operates on a contiguous 32x64x32 chunk (4x4x2 iterations). See
+ // 0t_mma_atom.md#TiledMMAs for more info. Sub-groups are arranged row-major (stride 4,1,0) for
+ // performance reasons.
+ using TiledMma = // M=8,N=16,K=16, D=f32,A=bf16,B=bf16,C=f32
+ typename TiledMMAHelper<MMA_Atom<XE_8x16x16_F32BF16BF16F32_TT>, Layout<TileShape>,
+ Layout<Shape<_8, _4, _1>, Stride<_4, _1, _0>>>::TiledMMA;
+
+ // For Intel BMG, PipelineStages defines how many k-blocks ahead to prefetch from A and B.
+ constexpr int PipelineStages = 2;
+ using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelXeXMX16<PipelineStages>;
+ using EpilogueDispatchPolicy = cutlass::epilogue::IntelXeXMX16;
+
+ // This is the 'default' epilogue operation (Linear Combination) which performs everything in:
+ // (D = alpha * (A*B) + beta * C)
+ // aside from the (A*B), which is handled by the GEMM. See 05_bmg_gemm_with_epilogues for more
+ // complex epilogue examples.
+ using EpilogueOp = cutlass::epilogue::fusion::LinearCombination<ElementOutput, ElementComputeEpilogue>;
+
+ // FusionCallbacks ties the EpilogueOp to an implementation (based on the dispatch
+ // policy/architecture) and defines the epilogue arguments.
+ using FusionCallBacks = cutlass::epilogue::fusion::FusionCallbacks<EpilogueDispatchPolicy, EpilogueOp, TileShape, decltype(tile_shape(TiledMma()))>;
+ // GEMM Epilogue - loads & stores C/D matrices, performs epilogue operations & load/stores any
+ // auxiliary data required
+ using CollectiveEpilogue = cutlass::epilogue::collective::CollectiveEpilogue<
+ EpilogueDispatchPolicy,
+ TileShape,
+ ElementAccumulator,
+ cutlass::gemm::TagToStrideC_t<LayoutC>, // Converts CUTLASS 2.x to CUTLASS 3.x representation
+ ElementOutput,
+ cutlass::gemm::TagToStrideC_t<LayoutD>, // Converts CUTLASS 2.x to CUTLASS 3.x representation
+ FusionCallBacks,
+ XE_2D_U32x8x16_LD_N, // The copy atom used to load matrix C
+ void, void,
+ XE_2D_U32x8x16_ST_N, // The copy atom used to store matrix D
+ void, void>;
+
+ // GEMM Mainloop - iteration over blocks in K dimension
+ using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma<
+ GEMMDispatchPolicy,
+ TileShape,
+ ElementInputA,
+ cutlass::gemm::TagToStrideA_t<LayoutA>, // Converts CUTLASS 2.x to CUTLASS 3.x representation
+ ElementInputB,
+ cutlass::gemm::TagToStrideB_t<LayoutB>, // Converts CUTLASS 2.x to CUTLASS 3.x representation
+ TiledMma,
+ GmemTiledCopyA, void, void, cute::identity, // A
+ GmemTiledCopyB, void, void, cute::identity // B
+ >;
+
+ // Define the whole kernel (mainloop and epilogue)
+ using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+ Shape<int, int, int, int>, // Defer global problem shape definition to runtime
+ CollectiveMainloop,
+ CollectiveEpilogue
+ >;
+
+ // The GemmUniversalAdapter wraps the defined GEMM kernel and handles the launch, and e.g.
+ // persistent scratch memory if required.
+ using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+
+ ExampleRunner<Gemm> runner;
+
+ CUTLASS_CHECK(runner.run(options, hw_info));
+
+ return 0;
+}
\ No newline at end of file
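
As a sanity check on the numbers this legacy example reports, the performance math is just FLOPs = 2·M·N·K·L divided by the averaged runtime, and the `<256, 256, 32>` TileShape fixes how many work-group tiles cover the problem. A standalone sketch for the default 5120x4096x4096 shape follows; the 1.2 ms runtime in it is an invented placeholder, not a measured result:

```
// Standalone sketch: reproduces the example's TFLOP/s arithmetic and the
// work-group tile count for the default problem size. The 1.2 ms runtime is
// an invented placeholder, not a measured result.
#include <cstdio>

int main() {
  int m = 5120, n = 4096, k = 4096, l = 1;

  // Same formula as the example: 2*M*N*K*L (multiply-adds counted as 2 FLOPs).
  double tflop = 2.0 * m * n * k * l * 1e-12;           // ~0.172 TFLOP of work

  double assumed_seconds = 1.2e-3;                      // placeholder runtime
  std::printf("%.3f TFLOP / %.4f ms -> %.1f TFLOP/s\n",
              tflop, assumed_seconds * 1e3, tflop / assumed_seconds);

  // With TileShape = <256, 256, 32>, each work-group owns a 256x256 tile of D
  // and loops over K in steps of 32.
  int tiles_m = (m + 255) / 256;                        // 20
  int tiles_n = (n + 255) / 256;                        // 16
  int k_steps = (k + 31) / 32;                          // 128
  std::printf("%d x %d = %d work-group tiles, %d K iterations each\n",
              tiles_m, tiles_n, tiles_m * tiles_n, k_steps);
  return 0;
}
```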
diff --git a/examples/00_bmg_gemm/legacy/00_bmg_gemm_padded.cpp b/examples/00_bmg_gemm/legacy/00_bmg_gemm_padded.cpp
new file mode 100644
index 0000000000..32a7285e71
--- /dev/null
+++ b/examples/00_bmg_gemm/legacy/00_bmg_gemm_padded.cpp
@@ -0,0 +1,467 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
+ * Copyright (C) 2025 Intel Corporation, All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+ \brief CUTLASS Intel BMG Gemm Example.
+
+ This example constructs and executes a simple CUTLASS GEMM kernel on Intel BMG hardware, and
+ verifies its correctness with a reference implementation
+ (cutlass::reference::device::GemmComplex). The example also provides a performance measurement
+ for the GEMM in TFLOPS.
+
+ This example makes use of BMG's subgroup cooperative 2d-block copy operations and DPAS instructions.
+ To support more input shapes using these instructions, rows of the input/output matrices are padded
+ to a multiple of 16 and each matrix in batch is padded to a multiple of 64, as required by these
+ instructions.
+
+ The shapes of the A and B matrix are defined at runtime by `options.m`, `.n` and `.k`, and the
+ batch size is defined by `options.l`. The tile shape, which defines how much work is executed by
+ a single work-group, is defined at compile time by:
+ ```
+ using TileShape = Shape<_256, _256, _32>;
+ ```
+ That is, each work-group processes a tile of M=256, N=256, and iterates over `options.k` in
+ blocks of K=32.
+
+ Performance of GEMM on BMG is heavily dependent on prefetching the A and B matrices. That is,
+ executing Intel specific prefetch instructions for future iterations to ensure that the required
+ blocks of A and B are resident in cache before they are needed.
+
+ To build & run this example (from your build dir):
+
+ $ ninja 00_bmg_gemm
+ $ ./examples/sycl/00_bmg_gemm/00_bmg_gemm
+
+ Call with `--help` for information about available options
+*/
+
+#include "cutlass/epilogue/collective/default_epilogue.hpp"
+#include "cutlass/epilogue/collective/xe_epilogue.hpp"
+#include "cutlass/epilogue/fusion/xe_callbacks.hpp"
+#include "cutlass/gemm/device/gemm_universal.h"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/collective/collective_mma.hpp"
+#include "cutlass/util/GPU_Clock.hpp"
+
+#include
+
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/device_memory.h"
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass/util/reference/device/gemm_complex.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+#include "sycl_common.hpp"
+#include "helper.h"
+
+using namespace cute;
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// The alignment requirement in bytes on the inner dimension that will work for both PVC and BMG
+constexpr int AlignmentInner = 16;
+// The alignment requirement in bytes on the outer dimension that will work for both PVC and BMG
+constexpr int AlignmentPtr = 64;
+
+// Command line options parsing
+struct Options {
+
+ bool help;
+ bool error;
+
+ int m, n, k, l, iterations;
+ float alpha, beta;
+
+ Options():
+ help(false),
+ error(false),
+ m(5120), n(4096), k(4096), l(1), iterations(20),
+ alpha(1.f), beta(0.f)
+ { }
+
+ // Parses the command line
+ void parse(int argc, char const **args) {
+ cutlass::CommandLine cmd(argc, args);
+
+ if (cmd.check_cmd_line_flag("help")) {
+ help = true;
+ return;
+ }
+
+ cmd.get_cmd_line_argument("m", m, 5120);
+ cmd.get_cmd_line_argument("n", n, 4096);
+ cmd.get_cmd_line_argument("k", k, 4096);
+ cmd.get_cmd_line_argument("l", l, 1);
+ cmd.get_cmd_line_argument("alpha", alpha, 1.f);
+ cmd.get_cmd_line_argument("beta", beta, 0.f);
+ cmd.get_cmd_line_argument("iterations", iterations, 100);
+ }
+
+ /// Prints the usage statement.
+ std::ostream & print_usage(std::ostream &out) const {
+
+ out << "BMG GEMM Example\n\n"
+ << "Options:\n\n"
+ << " --help If specified, displays this usage statement\n\n"
+ << " --m= Sets the M extent of the GEMM\n"
+ << " --n= Sets the N extent of the GEMM\n"
+ << " --k= Sets the K extent of the GEMM\n"
+ << " --l= Sets the L extent (batch count) of the GEMM\n"
+ << " --alpha= Epilogue scalar alpha\n"
+ << " --beta= Epilogue scalar beta\n\n"
+ << " --iterations= Iterations\n\n";
+
+ return out;
+ }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+ class Gemm
+>
+struct ExampleRunner {
+
+ using StrideA = typename Gemm::GemmKernel::StrideA;
+ using StrideB = typename Gemm::GemmKernel::StrideB;
+ using StrideC = typename Gemm::GemmKernel::StrideC;
+ using StrideD = typename Gemm::GemmKernel::StrideD;
+
+ using LayoutA = typename Gemm::LayoutA;
+ using LayoutB = typename Gemm::LayoutB;
+ using LayoutC = typename Gemm::LayoutC;
+ using LayoutD = typename Gemm::LayoutD;
+
+ using ElementA = typename Gemm::ElementA;
+ using ElementB = typename Gemm::ElementB;
+ using ElementAcc = typename Gemm::ElementAccumulator;
+
+ using CollectiveEpilogue = typename Gemm::CollectiveEpilogue;
+ using ElementC = typename Gemm::ElementC;
+ using ElementD = typename Gemm::ElementD;
+ using ElementOutput = typename CollectiveEpilogue::ElementOutput;
+ using ElementCompute = typename CollectiveEpilogue::ElementCompute;
+ using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator;
+
+ using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape;
+
+ static constexpr int AlignElemA = AlignmentInner / sizeof(ElementA);
+ static constexpr int AlignElemB = AlignmentInner / sizeof(ElementB);
+ static constexpr int AlignElemC = AlignmentInner / sizeof(ElementC);
+ static constexpr int AlignElemD = AlignmentInner / sizeof(ElementD);
+
+ //
+ // Data members
+ //
+
+ /// Initialization
+ StrideA stride_A;
+ StrideB stride_B;
+ StrideC stride_C;
+ StrideD stride_D;
+ uint64_t seed = 0;
+
+ cutlass::DeviceAllocation block_A;
+ cutlass::DeviceAllocation block_B;
+ cutlass::DeviceAllocation block_C;
+ cutlass::DeviceAllocation block_D;
+ cutlass::DeviceAllocation block_ref_D; // Reference GEMM result for verification
+
+ //
+ // Methods
+ //
+
+ bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) {
+ auto [M, N, K, L] = problem_size;
+
+ // Padded values
+ // The inner dimension is padded. Since this example is all RowMajor,
+ // we require the following:
+ int N_B = cute::round_up(N, AlignElemB);
+ int N_C = cute::round_up(N, AlignElemC);
+ int N_D = cute::round_up(N, AlignElemD);
+ int K_A = cute::round_up(K, AlignElemA);
+
+ int AlignmentOuter = AlignmentPtr / AlignmentInner;
+ int M_ACD = cute::round_up(M, AlignmentOuter);
+ int K_B = cute::round_up(K, AlignmentOuter);
+
+ cutlass::TensorRef ref_A(block_A.get(), LayoutA(K_A));
+ cutlass::TensorRef ref_B(block_B.get(), LayoutB(N_B));
+ cutlass::TensorRef ref_C(block_C.get(), LayoutC(N_C));
+ cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD(N_D));
+
+ cutlass::reference::device::GemmComplex(
+ {M, N, K},
+ alpha,
+ ref_A,
+ cutlass::ComplexTransform::kNone,
+ ref_B,
+ cutlass::ComplexTransform::kNone,
+ beta,
+ ref_C,
+ ref_D,
+ ElementAccumulator(0),
+ L, // batch_count
+ M_ACD * K_A, // batch_stride_A
+ K_B * N_B, // batch_stride_B
+ M_ACD * N_C, // batch_stride_C
+ M_ACD * N_D // batch_stride_D
+ );
+
+ // CUTLASS on SYCL uses the compatibility library compat for e.g. default in-order queue
+ compat::wait();
+
+ // Check if output from CUTLASS kernel and reference kernel are equal or not
+ bool passed = cutlass::reference::device::BlockCompareEqual(
+ block_ref_D.get(), block_D.get(), block_D.size());
+
+ return passed;
+ }
+
+ /// Initialize operands to be used in the GEMM and reference GEMM
+ void initialize(const ProblemShapeType& problem_size) {
+ auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
+ auto [M, N, K, L] = problem_shape_MNKL;
+
+ // Padded values
+ int N_B = cute::round_up(N, AlignElemB);
+ int N_C = cute::round_up(N, AlignElemC);
+ int N_D = cute::round_up(N, AlignElemD);
+ int K_A = cute::round_up(K, AlignElemA);
+
+ int AlignmentOuter = AlignmentPtr / AlignmentInner;
+ int M_ACD = cute::round_up(M, AlignmentOuter);
+ int K_B = cute::round_up(K, AlignmentOuter);
+
+ // Complete the stride by combining static layout info (StrideA) with runtime size info (M,K,L)
+ stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M_ACD, K_A, L));
+ stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N_B, K_B, L));
+ stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M_ACD, N_C, L));
+ stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M_ACD, N_D, L));
+
+ block_A.reset(M_ACD * K_A * L);
+ block_B.reset(K_B * N_B * L);
+ block_C.reset(M_ACD * N_C * L);
+ block_D.reset(M_ACD * N_D * L);
+ block_ref_D.reset(M_ACD * N_D * L);
+
+ initialize_block(block_A, seed + 2023);
+ initialize_block(block_B, seed + 2022);
+ initialize_block(block_C, seed + 2021);
+ }
+
+ cutlass::Status run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) {
+ ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l};
+
+ initialize(problem_size);
+
+ typename Gemm::GemmKernel::Arguments arguments{
+ cutlass::gemm::GemmUniversalMode::kGemm,
+ problem_size,
+ {block_A.get(), stride_A, block_B.get(), stride_B},
+ {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D},
+ hw_info
+ };
+
+ Gemm gemm_op;
+
+ size_t workspace_size = Gemm::get_workspace_size(arguments);
+    cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+ if (gemm_op.can_implement(arguments) != cutlass::Status::kSuccess) {
+ std::cout << "Warning: Invalid problem size: "
+ << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l
+ << ".\nThis size is not directly supported by the selected kernel.\n"
+ << "However, this example applies padding as needed, so it will still run correctly."
+ << std::endl;
+ }
+
+ CUTLASS_CHECK(gemm_op.initialize(arguments, workspace.get()));
+
+ // Run the GEMM
+ CUTLASS_CHECK(gemm_op.run());
+
+ compat::wait();
+
+ // Verify that the result is correct
+ bool passed = verify(problem_size, options.alpha, options.beta);
+ std::cout << "Disposition: " << (passed ? "Passed" : "Failed") << std::endl;
+
+ if(!passed) return cutlass::Status::kErrorInternal;
+
+ if (options.iterations > 0) {
+ GPU_Clock timer;
+ timer.start();
+ for (int i = 0; i < options.iterations; ++i) {
+ gemm_op.run();
+ }
+ compat::wait();
+
+ float cute_time = timer.seconds() / options.iterations;
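+      // Work per run: 2 * M * N * K multiply-add FLOPs per batch, times L batches (1e-12 converts
+      // to TFLOP); dividing by the average kernel time (cute_time) gives TFLOP/s.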
+ double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12;
+ std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl;
+ printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000);
+ }
+
+ return cutlass::Status::kSuccess;
+ }
+
+};
+
+int main(int argc, const char** argv)
+{
+ //
+ // Parse options
+ //
+
+ Options options;
+
+ options.parse(argc, argv);
+
+ if (options.help) {
+ options.print_usage(std::cout) << std::endl;
+ return 0;
+ }
+
+ if (options.error) {
+ std::cerr << "Aborting execution." << std::endl;
+ return -1;
+ }
+
+ //
+ // Run examples
+ //
+
+ // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This
+ // information is used by the underlying kernel.
+ cutlass::KernelHardwareInfo hw_info;
+
+ // Change device_id to another value if you are running on a machine with multiple GPUs and wish
+ // to use a GPU other than that with device ID 0.
+ hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
+
+ bool passed;
+
+ // The code section below describes datatype for input, output matrices and computation between
+ // elements in input matrices.
+ using ElementAccumulator = float; // <- data type of accumulator
+ using ElementComputeEpilogue = float; // <- data type of epilogue operations
+ using ElementInputA = bfloat16_t; // <- data type of elements in input matrix A
+ using ElementInputB = bfloat16_t; // <- data type of elements in input matrix B
+ using ElementOutput = float; // <- data type of elements in output matrix D
+
+ using LayoutA = cutlass::layout::RowMajor;
+ using LayoutB = cutlass::layout::RowMajor;
+ using LayoutC = cutlass::layout::RowMajor;
+ using LayoutD = cutlass::layout::RowMajor;
+
+ // The 2D block copy operations used for the A and B matrices
+ using GmemTiledCopyA = XE_2D_U16x32x32_LD_N;
+ using GmemTiledCopyB = XE_2D_U16x32x32_LD_V;
+
+ // Workgroup-level tile
+ using TileShape = Shape<_256, _256, _32>;
+
+ // A TiledMMA struct defines a tiling of an MMA atom over M, N and K, combining both additional
+ // hardware (sub-groups for Intel BMG) and iterations by each sub-group.
+ //
+ // The TiledMMAHelper struct defines a specific TiledMMA for a given MMA atom
+ // (XE_8x16x16_F32BF16BF16F32_TT), TileShape (<256, 256, 32>) and sub-group layout (8x4x1). The
+ // TiledMMA constructed using TiledMMAHelper has the property that each sub-group operates on a
+ // single contiguous chunk of the work-group TileShape. For this configuration, this implies that
+ // each sub-group operates on a contiguous 32x64x32 chunk (4x4x2 iterations). See
+ // 0t_mma_atom.md#TiledMMAs for more info. Sub-groups are arranged row-major (stride 4,1,0) for
+ // performance reasons.
+  using TiledMma =                    // M=8,N=16,K=16, D=f32,A=bf16,B=bf16,C=f32
+      typename TiledMMAHelper<MMA_Atom<XE_8x16x16_F32BF16BF16F32_TT>, Layout<TileShape>,
+                              Layout<Shape<_8, _4, _1>, Stride<_4, _1, _0>>>::TiledMMA;
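+  // With this arrangement, each of the 8x4 sub-groups covers (256/8) x (256/4) x 32 = 32x64x32
+  // elements of the work-group tile, i.e. 4x4x2 iterations of the 8x16x16 MMA atom.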
+
+ // For Intel BMG, PipelineStages defines how many k-blocks ahead to prefetch from A and B.
+ constexpr int PipelineStages = 2;
+  using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelXeXMX16<PipelineStages>;
+ using EpilogueDispatchPolicy = cutlass::epilogue::IntelXeXMX16;
+
+ // This is the 'default' epilogue operation (Linear Combination) which performs everything in:
+ // (D = alpha * (A*B) + beta * C)
+ // aside from the (A*B), which is handled by the GEMM. See 05_bmg_gemm_with_epilogues for more
+ // complex epilogue examples.
+  using EpilogueOp = cutlass::epilogue::fusion::LinearCombination<ElementOutput, ElementComputeEpilogue,
+          ElementAccumulator, ElementAccumulator, cutlass::FloatRoundStyle::round_to_nearest>;
+
+ // FusionCallbacks ties the EpilogueOp to an implementation (based on the dispatch
+ // policy/architecture) and defines the epilogue arguments.
+  using FusionCallBacks = cutlass::epilogue::fusion::FusionCallbacks<EpilogueDispatchPolicy, EpilogueOp,
+          TileShape, decltype(tile_shape(TiledMma()))>;
+ // GEMM Epilogue - loads & stores C/D matrices, performs epilogue operations & load/stores any
+ // auxiliary data required
+ using CollectiveEpilogue = cutlass::epilogue::collective::CollectiveEpilogue<
+ EpilogueDispatchPolicy,
+ TileShape,
+ ElementAccumulator,
+          cutlass::gemm::TagToStrideC_t<LayoutC>, // Converts CUTLASS 2.x to CUTLASS 3.x representation
+ ElementOutput,
+          cutlass::gemm::TagToStrideC_t<LayoutD>, // Converts CUTLASS 2.x to CUTLASS 3.x representation
+ FusionCallBacks,
+ XE_2D_U32x8x16_LD_N, // The copy atom used to load matrix C
+ void, void,
+ XE_2D_U32x8x16_ST_N, // The copy atom used to store matrix D
+ void, void>;
+
+ // GEMM Mainloop - iteration over blocks in K dimension
+ using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma<
+ GEMMDispatchPolicy,
+ TileShape,
+ ElementInputA,
+          cutlass::gemm::TagToStrideA_t<LayoutA>, // Converts CUTLASS 2.x to CUTLASS 3.x representation
+ ElementInputB,
+          cutlass::gemm::TagToStrideB_t<LayoutB>, // Converts CUTLASS 2.x to CUTLASS 3.x representation
+ TiledMma,
+ GmemTiledCopyA, void, void, cute::identity, // A
+ GmemTiledCopyB, void, void, cute::identity // B
+ >;
+
+ // Define the whole kernel (mainloop and epilogue)
+ using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>, // Defer global problem shape definition to runtime
+ CollectiveMainloop,
+ CollectiveEpilogue
+ >;
+
+ // The GemmUniversalAdapter wraps the defined GEMM kernel and handles the launch, and e.g.
+ // persistent scratch memory if required.
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+
+  ExampleRunner<Gemm> runner;
+
+ CUTLASS_CHECK(runner.run(options, hw_info));
+
+ return 0;
+}
\ No newline at end of file
diff --git a/examples/00_bmg_gemm/legacy/00_bmg_gemm_with_sycl_queue.cpp b/examples/00_bmg_gemm/legacy/00_bmg_gemm_with_sycl_queue.cpp
new file mode 100644
index 0000000000..3a4e0fa704
--- /dev/null
+++ b/examples/00_bmg_gemm/legacy/00_bmg_gemm_with_sycl_queue.cpp
@@ -0,0 +1,414 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
+ * Copyright (C) 2025 Intel Corporation, All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+ \brief CUTLASS Intel BMG Gemm Example with non-default SYCL queue.
+    This example modifies 00_bmg_gemm to use a non-default queue. The main changes are passing the
+    queue to gemm_op.initialize and gemm_op.run; the remaining changes allocate memory on that same
+    queue.
+
+ To build & run this example (from your build dir):
+ $ ninja 00_bmg_gemm_with_sycl_queue
+ $ ./examples/sycl/00_bmg_gemm_with_sycl_queue/00_bmg_gemm_with_sycl_queue
+ Call with `--help` for information about available options
+*/
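+
+// In outline, the pattern used below (a sketch of this example's own calls, not additional API):
+//
+//   sycl::queue q = ...;                             // user-created, non-default queue
+//   ... allocate A/B/C/D with sycl::malloc_device<T>(count, q) ...
+//   gemm_op.initialize(arguments, /*workspace*/ nullptr, &q);
+//   gemm_op.run(&q);
+//   q.wait_and_throw();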
+
+#include "cutlass/epilogue/collective/default_epilogue.hpp"
+#include "cutlass/epilogue/collective/xe_epilogue.hpp"
+#include "cutlass/epilogue/fusion/xe_callbacks.hpp"
+#include "cutlass/gemm/device/gemm_universal.h"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/collective/collective_mma.hpp"
+#include "cutlass/util/GPU_Clock.hpp"
+
+#include <cute/tensor.hpp>
+#include <random>
+
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/device_memory.h"
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass/util/reference/device/gemm_complex.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+#include "sycl_common.hpp"
+#include "helper.h"
+
+using namespace cute;
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Command line options parsing
+struct Options {
+
+ bool help;
+ bool error;
+
+ int m, n, k, l, iterations;
+ float alpha, beta;
+
+ Options():
+ help(false),
+ error(false),
+ m(5120), n(4096), k(4096), l(1), iterations(20),
+ alpha(1.f), beta(0.f)
+ { }
+
+ // Parses the command line
+ void parse(int argc, char const **args) {
+ cutlass::CommandLine cmd(argc, args);
+
+ if (cmd.check_cmd_line_flag("help")) {
+ help = true;
+ return;
+ }
+
+ cmd.get_cmd_line_argument("m", m, 5120);
+ cmd.get_cmd_line_argument("n", n, 4096);
+ cmd.get_cmd_line_argument("k", k, 4096);
+ cmd.get_cmd_line_argument("l", l, 1);
+ cmd.get_cmd_line_argument("alpha", alpha, 1.f);
+ cmd.get_cmd_line_argument("beta", beta, 0.f);
+ cmd.get_cmd_line_argument("iterations", iterations, 100);
+ }
+
+ /// Prints the usage statement.
+ std::ostream & print_usage(std::ostream &out) const {
+
+ out << "BMG GEMM Example\n\n"
+ << "Options:\n\n"
+ << " --help If specified, displays this usage statement\n\n"
+ << " --m= Sets the M extent of the GEMM\n"
+ << " --n= Sets the N extent of the GEMM\n"
+ << " --k= Sets the K extent of the GEMM\n"
+ << " --l= Sets the L extent (batch count) of the GEMM\n"
+ << " --alpha= Epilogue scalar alpha\n"
+ << " --beta= Epilogue scalar beta\n\n"
+ << " --iterations= Iterations\n\n";
+
+ return out;
+ }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+ class Gemm
+>
+struct ExampleRunner {
+
+ using StrideA = typename Gemm::GemmKernel::StrideA;
+ using StrideB = typename Gemm::GemmKernel::StrideB;
+ using StrideC = typename Gemm::GemmKernel::StrideC;
+ using StrideD = typename Gemm::GemmKernel::StrideD;
+
+ using LayoutA = typename Gemm::LayoutA;
+ using LayoutB = typename Gemm::LayoutB;
+ using LayoutC = typename Gemm::LayoutC;
+ using LayoutD = typename Gemm::LayoutD;
+
+ using ElementA = typename Gemm::ElementA;
+ using ElementB = typename Gemm::ElementB;
+ using ElementAcc = typename Gemm::ElementAccumulator;
+
+ using CollectiveEpilogue = typename Gemm::CollectiveEpilogue;
+ using ElementC = typename Gemm::ElementC;
+ using ElementOutput = typename CollectiveEpilogue::ElementOutput;
+ using ElementCompute = typename CollectiveEpilogue::ElementCompute;
+ using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator;
+
+ using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape;
+
+ //
+ // Data members
+ //
+
+ /// Initialization
+ StrideA stride_A;
+ StrideB stride_B;
+ StrideC stride_C;
+ StrideD stride_D;
+ uint64_t seed = 0;
+
+ struct Memory {
+ ElementA* block_A;
+ ElementB* block_B;
+ ElementC* block_C;
+ ElementOutput* block_D;
+ ElementOutput* block_ref_D;
+ sycl::queue q;
+
+ Memory(sycl::queue q, ProblemShapeType problem_shape_MNKL) : q(q) {
+ auto [M, N, K, L] = problem_shape_MNKL;
+      block_A = sycl::malloc_device<ElementA>(static_cast<std::size_t>(M) * K * L, q);
+      block_B = sycl::malloc_device<ElementB>(static_cast<std::size_t>(N) * K * L, q);
+      block_C = sycl::malloc_device<ElementC>(static_cast<std::size_t>(M) * N * L, q);
+      block_D = sycl::malloc_device<ElementOutput>(static_cast<std::size_t>(M) * N * L, q);
+      block_ref_D = sycl::malloc_device<ElementOutput>(static_cast<std::size_t>(M) * N * L, q);
+ }
+
+ ~Memory() {
+ sycl::free(block_A, q);
+ sycl::free(block_B, q);
+ sycl::free(block_C, q);
+ sycl::free(block_D, q);
+ sycl::free(block_ref_D, q);
+ }
+
+    // Delete copy/move constructors and assignment so the allocations have a single owner and
+    // cannot be freed twice
+ Memory(const Memory&) = delete;
+ Memory(Memory&&) noexcept = delete;
+ Memory& operator=(const Memory&) = delete;
+ Memory& operator=(Memory&&) noexcept = delete;
+ };
+
+ //
+ // Methods
+ //
+
+ bool verify(Memory& mem, const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) {
+ auto [M, N, K, L] = problem_size;
+
+ cutlass::TensorRef ref_A(mem.block_A, LayoutA::packed({M, K}));
+ cutlass::TensorRef ref_B(mem.block_B, LayoutB::packed({K, N}));
+ cutlass::TensorRef ref_C(mem.block_C, LayoutC::packed({M, N}));
+ cutlass::TensorRef ref_D(mem.block_ref_D, LayoutD::packed({M, N}));
+
+ cutlass::reference::device::GemmComplex(
+ {M, N, K},
+ alpha,
+ ref_A,
+ cutlass::ComplexTransform::kNone,
+ ref_B,
+ cutlass::ComplexTransform::kNone,
+ beta,
+ ref_C,
+ ref_D,
+ ElementAccumulator(0),
+ L, // batch_count
+ M * K, // batch_stride_A
+ K * N, // batch_stride_B
+ M * N, // batch_stride_C
+ M * N // batch_stride_D
+ );
+
+ // Check if output from CUTLASS kernel and reference kernel are equal or not
+ bool passed = cutlass::reference::device::BlockCompareEqual(
+ mem.block_ref_D, mem.block_D, M * N * L);
+
+ return passed;
+ }
+
+ /// Initialize operands to be used in the GEMM and reference GEMM
+ void initialize(const ProblemShapeType& problem_size, Memory& mem) {
+ auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
+ auto [M, N, K, L] = problem_shape_MNKL;
+
+ stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L));
+ stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L));
+ stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
+ stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
+
+ cutlass::initialize_block(mem.block_A, M * K * L, seed + 2023);
+ cutlass::initialize_block(mem.block_B, N * K * L, seed + 2022);
+ cutlass::initialize_block(mem.block_C, M * N * L, seed + 2021);
+ }
+
+ cutlass::Status run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) {
+ ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l};
+
+ auto q = compat::create_queue();
+ Memory mem(q, problem_size);
+ initialize(problem_size, mem);
+
+ typename Gemm::GemmKernel::Arguments arguments{
+ cutlass::gemm::GemmUniversalMode::kGemm,
+ problem_size,
+ {mem.block_A, stride_A, mem.block_B, stride_B},
+ {{options.alpha, options.beta}, mem.block_C, stride_C, mem.block_D, stride_D},
+ hw_info
+ };
+
+ Gemm gemm_op;
+
+ size_t workspace_size = Gemm::get_workspace_size(arguments);
+ if (workspace_size != 0) {
+ return cutlass::Status::kErrorInternal;
+ }
+
+ if (gemm_op.can_implement(arguments) != cutlass::Status::kSuccess){
+ std::cout << "Invalid Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl;
+ std::exit(1);
+ }
+
+ CUTLASS_CHECK(gemm_op.initialize(arguments, nullptr, &q));
+
+ // Run the GEMM
+ CUTLASS_CHECK(gemm_op.run(&q));
+
+ q.wait_and_throw();
+
+ // Verify that the result is correct
+ bool passed = verify(mem, problem_size, options.alpha, options.beta);
+ std::cout << "Disposition: " << (passed ? "Passed" : "Failed") << std::endl;
+
+ if(!passed) return cutlass::Status::kErrorInternal;
+
+ if (options.iterations > 0) {
+ GPU_Clock timer;
+ timer.start();
+ for (int i = 0; i < options.iterations; ++i) {
+ gemm_op.run(&q);
+ }
+
+ q.wait_and_throw();
+
+ float cute_time = timer.seconds() / options.iterations;
+ double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12;
+ std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl;
+ printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000);
+ }
+
+ return cutlass::Status::kSuccess;
+ }
+
+};
+
+int main(int argc, const char** argv)
+{
+ //
+ // Parse options
+ //
+
+ Options options;
+
+ options.parse(argc, argv);
+
+ if (options.help) {
+ options.print_usage(std::cout) << std::endl;
+ return 0;
+ }
+
+ if (options.error) {
+ std::cerr << "Aborting execution." << std::endl;
+ return -1;
+ }
+
+ //
+ // Run examples
+ //
+
+ // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This
+ // information is used by the underlying kernel.
+ cutlass::KernelHardwareInfo hw_info;
+
+ // Change device_id to another value if you are running on a machine with multiple GPUs and wish
+ // to use a GPU other than that with device ID 0.
+ hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
+
+ bool passed;
+
+ // The code section below describes datatype for input, output matrices and computation between
+ // elements in input matrices.
+ using ElementAccumulator = float; // <- data type of accumulator
+ using ElementComputeEpilogue = float; // <- data type of epilogue operations
+ using ElementInputA = bfloat16_t; // <- data type of elements in input matrix A
+ using ElementInputB = bfloat16_t; // <- data type of elements in input matrix B
+ using ElementOutput = float; // <- data type of elements in output matrix D
+
+ using LayoutA = cutlass::layout::RowMajor;
+ using LayoutB = cutlass::layout::RowMajor;
+ using LayoutC = cutlass::layout::RowMajor;
+ using LayoutD = cutlass::layout::RowMajor;
+
+ using GmemTiledCopyA = XE_2D_U16x32x32_LD_N;
+ using GmemTiledCopyB = XE_2D_U16x32x32_LD_V;
+
+ // Workgroup-level tile
+ using TileShape = Shape<_256, _256, _32>;
+
+ // The Tile of this layout describes how 8x4x1 sub-groups tile the TileShape of <256, 256, 32>.
+ // This permutation (which can be thought of as a scatter operation on the default tiling)
+ // ensures that each sub-group operates on a contiguous 32x64x32 chunk (4x4x2 iterations)
+ // See 0t_mma_atom.md#TiledMMAs for more info.
+ // Sub-groups are arranged row-major (stride 4,1,0) for performance reasons.
+  using TiledMma =
+      typename TiledMMAHelper<MMA_Atom<XE_8x16x16_F32BF16BF16F32_TT>, Layout<TileShape>,
+                              Layout<Shape<_8, _4, _1>, Stride<_4, _1, _0>>>::TiledMMA;
+
+ constexpr int PipelineStages = 2;
+  using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelXeXMX16<PipelineStages>;
+ using EpilogueDispatchPolicy = cutlass::epilogue::IntelXeXMX16;
+
+  using EpilogueOp = cutlass::epilogue::fusion::LinearCombination<ElementOutput, ElementComputeEpilogue,
+          ElementAccumulator, ElementAccumulator, cutlass::FloatRoundStyle::round_to_nearest>;
+
+  using FusionCallBacks = cutlass::epilogue::fusion::FusionCallbacks<EpilogueDispatchPolicy, EpilogueOp,
+          TileShape, decltype(tile_shape(TiledMma()))>;
+ using CollectiveEpilogue = cutlass::epilogue::collective::CollectiveEpilogue<
+ EpilogueDispatchPolicy,
+ TileShape,
+ ElementAccumulator,
+          cutlass::gemm::TagToStrideC_t<LayoutC>,
+ ElementOutput,
+          cutlass::gemm::TagToStrideC_t<LayoutD>,
+ FusionCallBacks,
+ XE_2D_U32x8x16_LD_N,
+ void, void,
+ XE_2D_U32x8x16_ST_N,
+ void, void>;
+
+ // Mainloop
+ using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma<
+ GEMMDispatchPolicy,
+ TileShape,
+ ElementInputA,
+          cutlass::gemm::TagToStrideA_t<LayoutA>,
+ ElementInputB,
+          cutlass::gemm::TagToStrideB_t<LayoutB>