diff --git a/Jenkinsfile b/Jenkinsfile index e8ce97780d7..3f4b0f2e422 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1623,7 +1623,7 @@ pipeline { -D GEMM_PRESHUFFLE_LAYOUT="rcr" \ -D GEMM_PRESHUFFLE_CONFIG_FILE="default_ci_config.json" .. && \ ninja -j${nthreads()} benchmark_gemm_universal_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all && \ - python3 ../tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \ + python3 ../tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \ python3 ../tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \ python3 ../tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """ } @@ -1664,7 +1664,7 @@ pipeline { -D GEMM_PRESHUFFLE_DATATYPE="fp16;fp8;bf16;bf8" \ -D GEMM_PRESHUFFLE_LAYOUT="rcr" .. && \ ninja -j${nthreads()} benchmark_gemm_universal_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all benchmark_gemm_streamk_all && \ - python3 ../tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \ + python3 ../tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \ python3 ../tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \ python3 ../tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """ } @@ -1689,7 +1689,7 @@ pipeline { -D GEMM_UNIVERSAL_DATATYPE="fp16" \ -D GEMM_UNIVERSAL_LAYOUT="rcr;rrr;crr;ccr" .. && \ ninja -j${nthreads()} benchmark_gemm_universal_all && \ - python3 ../tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """ + python3 ../tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py . 
--problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """ } steps{ buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) diff --git a/test/ck_tile/gemm_tile_engine/CMakeLists.txt b/test/ck_tile/gemm_tile_engine/CMakeLists.txt index 33effcc1206..dc148d45e78 100644 --- a/test/ck_tile/gemm_tile_engine/CMakeLists.txt +++ b/test/ck_tile/gemm_tile_engine/CMakeLists.txt @@ -10,7 +10,7 @@ # ============================================================================ # Locate tile_engine GEMM scripts directory -set(TILE_ENGINE_GEMM_DIR "${PROJECT_SOURCE_DIR}/tile_engine/ops/gemm") +set(TILE_ENGINE_GEMM_DIR "${PROJECT_SOURCE_DIR}/tile_engine/ops/gemm/gemm_universal") if(NOT EXISTS ${TILE_ENGINE_GEMM_DIR}) message(WARNING "Tile engine directory not found: ${TILE_ENGINE_GEMM_DIR}") @@ -32,11 +32,11 @@ endif() # config_json - Full path to JSON configuration file # ============================================================================ function(create_individual_gemm_test_target datatype layout config_name trait tile_config config_json) - set(target_name "test_gemm_tile_engine_${datatype}_${layout}_${config_name}_${trait}_${tile_config}") + set(target_name "test_gemm_universal_tile_engine_${datatype}_${layout}_${config_name}_${trait}_${tile_config}") set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${layout}/${config_name}") # Generated header path (already created during cmake configuration) - set(test_header "${working_path}/gemm_single_${datatype}_${layout}_${trait}_${tile_config}.hpp") + set(test_header "${working_path}/gemm_universal_single_${datatype}_${layout}_${trait}_${tile_config}.hpp") set(test_params_header "${working_path}/test_params.hpp") # Verify header exists (should have been generated during cmake configuration) @@ -118,7 +118,7 @@ function(build_gemm_test_targets datatype layout config_name) # STEP 1: Discovery phase - list all valid kernel configurations execute_process( - COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_instance_builder.py + COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_universal_instance_builder.py --working_path ${working_path} --datatype ${datatype} --layout ${layout} @@ -178,7 +178,7 @@ function(build_gemm_test_targets datatype layout config_name) # Generate header using --gen_single execute_process( - COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_instance_builder.py + COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_universal_instance_builder.py --working_path ${working_path} --gpu_target "${GEMM_TEST_GPU_TARGETS}" --datatype ${datatype} diff --git a/tile_engine/CMakeLists.txt b/tile_engine/CMakeLists.txt index b9dc3201282..0bb885bc35f 100644 --- a/tile_engine/CMakeLists.txt +++ b/tile_engine/CMakeLists.txt @@ -3,6 +3,7 @@ include_directories(BEFORE ${CMAKE_CURRENT_LIST_DIR}/include + ${CMAKE_CURRENT_LIST_DIR}/ops ) add_subdirectory(ops/gemm) diff --git a/tile_engine/ops/common/__init__.py b/tile_engine/ops/common/__init__.py new file mode 100644 index 00000000000..1df48571843 --- /dev/null +++ b/tile_engine/ops/common/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+# SPDX-License-Identifier: MIT diff --git a/tile_engine/ops/common/benchmark_utils.py b/tile_engine/ops/common/benchmark_utils.py new file mode 100644 index 00000000000..0c158fa48cd --- /dev/null +++ b/tile_engine/ops/common/benchmark_utils.py @@ -0,0 +1,285 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +import sys +import json +import subprocess +import argparse +import csv +import time +from pathlib import Path +from typing import List, Dict, Tuple, Optional + + +def run_kernel(build_dir: Path, kernel_path: Path, params: Dict[str, str], verbose: bool = False) -> Optional[Dict]: + """Run a single kernel with given parameters and save output to individual JSON file""" + # Create results directory + results_dir = build_dir / "results" + results_dir.mkdir(exist_ok=True) + + # Generate unique JSON filename for this kernel + json_file = results_dir / f"{kernel_path.stem}.json" + + cmd = [str(kernel_path)] + + # Add parameters + for key, value in params.items(): + cmd.append(f"-{key}={value}") + + # Add JSON output flag for clean JSON output + cmd.append("-json_output=true") + + if verbose: + print(f"Running: {' '.join(cmd)}") + + try: + result = subprocess.run(cmd, capture_output=True, text=True, timeout=60) + + if result.returncode != 0: + print(f"Error running {kernel_path.name}: {result.stderr}") + return None + + # Save raw output to individual JSON file + output = result.stdout.strip() + if output: + with open(json_file, "w") as f: + f.write(output) + + # Parse the JSON file + return parse_json_file(json_file, verbose=verbose) + else: + print(f"No output from {kernel_path.name}") + return None + + except subprocess.TimeoutExpired: + print(f"Timeout running {kernel_path.name}") + return None + except Exception as e: + print(f"Error running {kernel_path.name}: {e}") + return None + +def parse_json_file(json_file: Path, verbose: bool = False) -> Optional[Dict]: + """Parse JSON data from individual kernel output file""" + try: + with open(json_file, "r") as f: + content = f.read().strip() + + # Parse the JSON directly since executables produce clean JSON + data = json.loads(content) + + # Return the complete JSON data as-is, just add some convenience fields + result = data.copy() + if "perf_result" in data: + perf = data["perf_result"] + # Add convenience fields for backward compatibility + result["time_ms"] = perf.get("latency(ms)", 0) + result["tflops"] = perf.get("tflops(TFlops)", 0) + result["bandwidth_gb_s"] = perf.get("bandwidth(GB/s)", 0) + + return result + + except json.JSONDecodeError as e: + if verbose: + print(f"Failed to parse JSON from {json_file}: {e}") + return None + except Exception as e: + if verbose: + print(f"Error reading JSON file {json_file}: {e}") + return None + +def find_best_kernel( + results: List[Dict], metric: str = "tflops" +) -> Optional[Dict]: + """Find the best performing kernel based on metric""" + if not results: + return None + + if metric == "tflops": + return max(results, key=lambda x: x.get("tflops", 0)) + elif metric == "time_ms": + return min(results, key=lambda x: x.get("time_ms", float("inf"))) + elif metric == "bandwidth_gb_s": + return max(results, key=lambda x: x.get("bandwidth_gb_s", 0)) + else: + raise ValueError(f"Unknown metric: {metric}") + + +def export_csv(results: List[Dict], filename: str, verbose: bool = False): + """Export all results to CSV""" + if not results: + print("No results to export") + return + + # Get all unique keys from results + all_keys = 
set() + for result in results: + all_keys.update(result.keys()) + + # Sort keys for consistent output + fieldnames = sorted(all_keys) + + with open(filename, "w", newline="") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(results) + + print(f"Results exported to {filename}") + +def export_best_kernels( best_kernels: Dict, filename: str, verbose: bool = False): + """Export best kernel selections to file""" + with open(filename, "w") as f: + f.write("# Best kernel selections\n") + f.write( + "# Format: problem_size -> kernel_name (TFLOPS, bandwidth, latency)\n\n" + ) + + for key, kernel in sorted(best_kernels.items()): + f.write( + f"{key}: {kernel['name']} ({kernel['tflops']:.2f} TFLOPS, {kernel['bandwidth_gb_s']:.2f} GB/s, {kernel['time_ms']:.2f}ms)\n" + ) + + print(f"Best kernels exported to {filename}") + +def export_json(results: List[Dict], filename: str, best_kernels: Dict = None, verbose: bool = False): + """Export all results and best kernels to JSON with comprehensive metadata""" + from datetime import datetime + + # Calculate comprehensive summary statistics for all metrics + successful_results = [r for r in results if r.get("tflops", 0) > 0] + + tflops_values = [r.get("tflops", 0) for r in successful_results] + bandwidth_values = [r.get("bandwidth_gb_s", 0) for r in successful_results] + latency_values = [ + r.get("time_ms", 0) for r in successful_results if r.get("time_ms", 0) > 0 + ] + + # Performance breakdown by kernel type + pipeline_stats = {} + scheduler_stats = {} + data_type_stats = {} + + for result in successful_results: + # Get config info from the new structure + config = result.get("config", {}) + + # Pipeline statistics + pipeline = config.get("pipeline", "unknown") + if pipeline not in pipeline_stats: + pipeline_stats[pipeline] = { + "count": 0, + "avg_tflops": 0, + "best_tflops": 0, + } + pipeline_stats[pipeline]["count"] += 1 + pipeline_stats[pipeline]["best_tflops"] = max( + pipeline_stats[pipeline]["best_tflops"], result.get("tflops", 0) + ) + + # Scheduler statistics + scheduler = config.get("scheduler", "unknown") + if scheduler not in scheduler_stats: + scheduler_stats[scheduler] = { + "count": 0, + "avg_tflops": 0, + "best_tflops": 0, + } + scheduler_stats[scheduler]["count"] += 1 + scheduler_stats[scheduler]["best_tflops"] = max( + scheduler_stats[scheduler]["best_tflops"], result.get("tflops", 0) + ) + + # Data type statistics + data_type = config.get("data_type", "unknown") + if data_type not in data_type_stats: + data_type_stats[data_type] = { + "count": 0, + "avg_tflops": 0, + "best_tflops": 0, + } + data_type_stats[data_type]["count"] += 1 + data_type_stats[data_type]["best_tflops"] = max( + data_type_stats[data_type]["best_tflops"], result.get("tflops", 0) + ) + + # Calculate averages for breakdown stats + for stats_dict, field_name in [ + (pipeline_stats, "pipeline"), + (scheduler_stats, "scheduler"), + (data_type_stats, "data_type"), + ]: + for key in stats_dict: + relevant_results = [ + r + for r in successful_results + if r.get("config", {}).get(field_name, "unknown") == key + ] + if relevant_results: + stats_dict[key]["avg_tflops"] = sum( + r.get("tflops", 0) for r in relevant_results + ) / len(relevant_results) + + output_data = { + "benchmark_metadata": { + "timestamp": datetime.now().isoformat(), + "total_kernels_tested": len(results), + "unique_kernels": len( + set(r.get("name", "unknown") for r in results) + ), + "successful_runs": len(successful_results), + "failed_runs": 
len(results) - len(successful_results), + }, + "performance_summary": { + "tflops_stats": { + "best": max(tflops_values, default=0), + "average": sum(tflops_values) / len(tflops_values) + if tflops_values + else 0, + "min": min(tflops_values, default=0), + "median": sorted(tflops_values)[len(tflops_values) // 2] + if tflops_values + else 0, + }, + "bandwidth_stats": { + "best_gb_s": max(bandwidth_values, default=0), + "average_gb_s": sum(bandwidth_values) / len(bandwidth_values) + if bandwidth_values + else 0, + "min_gb_s": min(bandwidth_values, default=0), + "median_gb_s": sorted(bandwidth_values)[len(bandwidth_values) // 2] + if bandwidth_values + else 0, + }, + "latency_stats": { + "best_ms": min(latency_values, default=0), + "average_ms": sum(latency_values) / len(latency_values) + if latency_values + else 0, + "max_ms": max(latency_values, default=0), + "median_ms": sorted(latency_values)[len(latency_values) // 2] + if latency_values + else 0, + }, + "kernel_type_breakdown": { + "by_pipeline": pipeline_stats, + "by_scheduler": scheduler_stats, + "by_data_type": data_type_stats, + }, + "total_problem_configurations": len(best_kernels) + if best_kernels + else 0, + }, + "kernel_results": results, + "best_kernels_by_problem": best_kernels or {}, + } + + with open(filename, "w") as f: + json.dump(output_data, f, indent=2) + + print(f"JSON results exported to {filename}") + print(f" - Total kernels: {len(results)}") + print(f" - Successful runs: {len(successful_results)}") + print(f" - Best TFLOPS: {max(tflops_values, default=0):.2f}") + print(f" - Best bandwidth: {max(bandwidth_values, default=0):.2f} GB/s") + print(f" - Best latency: {min(latency_values, default=0):.2f}ms") + diff --git a/tile_engine/ops/common/utils.hpp b/tile_engine/ops/common/utils.hpp new file mode 100644 index 00000000000..56bfbde5a07 --- /dev/null +++ b/tile_engine/ops/common/utils.hpp @@ -0,0 +1,171 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" + +// Helper function to determine if a layout is row-major +template +constexpr auto is_row_major(Layout) +{ + return ck_tile::bool_constant>{}; +} + +enum class Metric +{ + LATENCY = 0, + TFLOPS = 1, + BANDWIDTH = 2 +}; + +inline constexpr auto get_metric_name(Metric m) +{ + switch(m) + { + case Metric::LATENCY: return "latency"; + case Metric::TFLOPS: return "tflops"; + case Metric::BANDWIDTH: return "bandwidth"; + default: throw std::invalid_argument("Unsupported metric type"); + } +} + +struct PerformanceResult +{ + double latency_; + double tflops_; + double bandwidth_; + + static bool compare(const PerformanceResult& a, const PerformanceResult& b, Metric m) + { + switch(m) + { + case Metric::LATENCY: return a.latency_ < b.latency_; + case Metric::TFLOPS: return a.tflops_ > b.tflops_; + case Metric::BANDWIDTH: return a.bandwidth_ > b.bandwidth_; + default: throw std::invalid_argument("Unsupported metric type"); + } + } + + friend std::ostream& operator<<(std::ostream& os, const PerformanceResult& result) + { + os << "{\n" + << " \"latency(ms)\": " << std::fixed << std::setprecision(2) << result.latency_ + << ",\n" + << " \"tflops(TFlops)\": " << result.tflops_ << ",\n" + << " \"bandwidth(GB/s)\": " << result.bandwidth_ << "\n" + << "}"; + return os; + } +}; + +template +struct KernelInstance +{ + std::string name_; + Problem problem_; + PerformanceResult perf_result_; + + static bool compare(const KernelInstance& a, const KernelInstance& b, Metric m) + { + return PerformanceResult::compare(a.perf_result_, b.perf_result_, m); + } + + friend std::ostream& operator<<(std::ostream& os, const KernelInstance& obj) + { + os << "{\n" + << " \"name\": \"" << obj.name_ << "\",\n" + << " \"problem\": " << obj.problem_ << ",\n" + << " \"perf_result\": " << obj.perf_result_ << "\n" + << "}"; + return os; + } +}; + +struct Setting +{ + int n_warmup_; + int n_repeat_; + bool is_gpu_timer_; + int verify_; + int init_method_; + bool log_; + std::string csv_filename_; + bool flush_cache_; + int rotating_count_; + bool json_output_; +}; + +inline std::string get_rocm_version() +{ + std::ifstream version_file("/opt/rocm/.info/version"); + if(version_file.is_open()) + { + std::string version; + std::getline(version_file, version); + return version; + } + return "Unknown"; +} + +template +auto calculate_rtol_atol(const ck_tile::index_t K, + const ck_tile::index_t kbatch, + const float max_accumulated_value) +{ + using ComputeType = + std::conditional_t; + // Calculate thresholds + const auto rtol = ck_tile::get_relative_threshold( + ck_tile::integer_divide_ceil(K, kbatch)); + const auto atol = ck_tile::get_absolute_threshold( + max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch)); + // Calculate error due to split_k accumulation + const auto rtol_split_k = + ck_tile::get_relative_threshold(kbatch); + const auto atol_split_k = ck_tile::get_absolute_threshold( + max_accumulated_value, kbatch); + // Use higher threshold + return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k)); +} + +template +auto calculate_rtol_atol(const ck_tile::index_t K, + const ck_tile::index_t kbatch, + const float max_accumulated_value) +{ + using ComputeTypeAB = + std::conditional_t; + + using ComputeType = + std::conditional_t; + + // Calculate thresholds + const auto rtol = 
ck_tile::get_relative_threshold( + ck_tile::integer_divide_ceil(K, kbatch)); + + const auto atol = ck_tile::get_absolute_threshold( + max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch)); + + // Calculate error due to split_k accumulation + const auto rtol_split_k = + ck_tile::get_relative_threshold(kbatch); + + const auto atol_split_k = ck_tile::get_absolute_threshold( + max_accumulated_value, kbatch); + + // Use higher threshold + return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k)); +} diff --git a/tile_engine/ops/gemm/README.md b/tile_engine/ops/gemm/README.md new file mode 100644 index 00000000000..5e0bae70806 --- /dev/null +++ b/tile_engine/ops/gemm/README.md @@ -0,0 +1,442 @@ +# CK Tile Engine GEMM Operations + +## Overview + +The CK Tile Engine GEMM module provides a comprehensive system for generating, building, and benchmarking GEMM (General Matrix Multiplication) kernels with various configurations. It supports multiple data types, layouts, and optimization strategies. The system has evolved from a monolithic build approach (where all kernels compile into a single executable) to a more flexible individual kernel compilation system, providing better build parallelism and targeted testing capabilities. + +## Table of Contents + +1. [Build System Architecture](#build-system-architecture) +2. [Build Instructions](#build-instructions) +3. [Running Benchmarks](#running-benchmarks) +4. [Configuration System](#configuration-system) +5. [Scripts and Tools](#scripts-and-tools) +6. [Command Line Options](#command-line-options) +7. [Understanding Kernel Names](#understanding-kernel-names) +8. [Troubleshooting](#troubleshooting) +9. [Performance Tips](#performance-tips) + +## Build System Architecture + +### Individual Kernel Compilation (New Approach) + +The new tile engine benchmark system compiles each kernel configuration into a separate executable. This provides: +- Better build parallelism +- Faster incremental builds +- More targeted testing +- Easier debugging of specific configurations + +Each benchmark executable follows the naming pattern: +``` +benchmark_gemm____ +``` + +### Monolithic Build (Legacy Approach) + +The original system compiles all kernels into a single executable (`benchmark_gemm_[Datatype]_[Layout]`), which can then be filtered at runtime using command-line arguments. + +## Build Instructions + +### Prerequisites +- ROCm installation +- CMake 3.16 or higher +- C++17 compatible compiler + +### Basic Build + +```bash +# In the root of composable kernel, create build directory +mkdir build && cd build + +# Configure with specific datatypes and layouts +# Replace [Arch] with your GPU architecture (e.g., gfx90a, gfx942) +# Replace [Datatype1;Datatype2;...] with datatypes (fp8, bf8, int8, fp16, bf16, fp32, fp64) +# Replace [Layout1;Layout2;...] with layouts (rcr, rrr, crr, ccr) +../script/cmake-ck-dev.sh ../ [Arch] -DGEMM_DATATYPE="[Datatype1;Datatype2]" -DGEMM_LAYOUT="[Layout1;Layout2]" + +# Build specific benchmarks +make benchmark_gemm_[Datatype1]_[Layout1] -j +``` + +### Configuration Options + +The build system supports several configuration options: + +#### Using Custom Config Files +```bash +# Method 1: CMake variable (config file must be in configs/ directory) +cmake -DGEMM_CONFIG_FILE=my_custom_config.json ... + +# Method 2: Environment variable (takes precedence over CMake variable) +export GEMM_CONFIG_FILE=my_custom_config.json +cmake ... +``` + +#### Config File Priority Order +1. 
**Environment variable** `GEMM_CONFIG_FILE` (highest priority) +2. **CMake variable** `GEMM_CONFIG_FILE` +3. **Default config** (default_config.json for all layouts) + +**Note**: All custom config files must be placed in the `tile_engine/ops/gemm/configs/` directory. + +### Example Build Commands + +```bash +# Build for gfx942 with fp8 and fp16 datatypes, rcr layout +mkdir build && cd build +../script/cmake-ck-dev.sh ../ gfx942 -DGEMM_DATATYPE="fp8;fp16" -DGEMM_LAYOUT="rcr;ccr;rrr;crr" +make benchmark_gemm_universal_fp8_rcr -j +make benchmark_gemm_universal_fp16_rcr -j +``` + +### Building Individual Kernels + +```bash +# Build a specific kernel configuration +make benchmark_gemm_universal_fp8_rcr_compv4_default_intrawave_False_False_False_False_256x256x32_1x4x1_32x32x32 + +# Build all fp16 benchmarks in parallel +make -j$(nproc) $(make help | grep benchmark_gemm_fp16 | awk '{print $2}') +``` + +### Rebuilding After Configuration Changes + +If you modify the configuration file, you must rebuild: +```bash +rm -rf tile_engine/ && make benchmark_gemm_universal_[Datatype]_[Layout] -j +``` + +## Running Benchmarks + +### Individual Kernel Execution + +```bash +cd /path/to/build/directory +./bin/benchmark_gemm_universal_fp16_rcr_compv3_default_intrawave_False_False_False_False_256x128x32_4x1x1_32x32x16 \ + -m=512 -n=512 -k=512 -verify=1 +``` + +### Monolithic Executable (Legacy) + +```bash +# Run specific pipeline/scheduler/epilogue combination +./bin/benchmark_gemm_universal_[Datatype]_[Layout] -pipeline=compv3 -scheduler=intrawave -epilogue=default +``` + +### Automated Testing + +Use the provided test script to run multiple benchmarks: +```bash +cd /path/to/composable_kernel/tile_engine/ops/gemm +./test_benchmark.sh [build_directory] +``` + +## Configuration System + +### Configuration Files + +The system uses JSON configuration files to specify kernel parameters: + +- `configs/default_config.json` - Default configurations for various datatypes +- `configs/user_provided_config.json` - User-customizable configurations + +### Configuration Structure + +```json +{ + "tile_config": { + "tile_m": {"values": [256, 128]}, + "tile_n": {"values": [256, 128]}, + "tile_k": {"values": [64, 32]}, + "warp_m": {"values": [2, 4]}, + "warp_n": {"values": [2, 1]}, + "warp_k": {"values": [1]}, + "warp_tile_m": {"values": [32, 16]}, + "warp_tile_n": {"values": [32, 16]}, + "warp_tile_k": {"values": [16, 32]} + }, + "trait_config": { + "pipeline": {"values": ["compv3", "compv4", "mem"]}, + "scheduler": {"values": ["intrawave", "interwave"]}, + "epilogue": {"values": ["default", "cshuffle"]}, + "pad_m": {"values": [false]}, + "pad_n": {"values": [false]}, + "pad_k": {"values": [false]}, + "persistent": {"values": [false]} + } +} +``` + +## Scripts and Tools + +### Python Scripts + +#### gemm_universal_instance_builder.py +**Purpose**: Main kernel instance generation script that creates C++ kernel implementations based on configuration files. 
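+
+As a rough sketch of the end-to-end flow (the concrete names below are taken from the examples elsewhere in this README and from the test CMake integration; treat them as illustrative, not exhaustive): each valid combination of datatype, layout, trait, and tile configuration from the JSON config expands into one generated header and one benchmark target.
+
+```bash
+# Generated header, following the pattern used by the test CMake integration:
+#   gemm_universal_single_<datatype>_<layout>_<trait>_<tile_config>.hpp
+#
+# Matching benchmark executable for one hypothetical configuration
+# (fp16, rcr, compv3 pipeline, default epilogue, intrawave scheduler,
+#  no padding, non-persistent, 256x128x32 block tile, 4x1x1 warps, 32x32x16 warp tile):
+make benchmark_gemm_universal_fp16_rcr_compv3_default_intrawave_False_False_False_False_256x128x32_4x1x1_32x32x16 -j
+```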
+ +**Key Features**: +- Generates individual kernel header files for separate compilation +- Supports multiple data types (fp16, fp8, bf16, fp32, fp64) +- Validates tile configurations for correctness +- Creates CMake integration files + +**Usage**: +```bash +python gemm_universal_instance_builder.py \ + --working_path ./generated \ + --datatype fp16 \ + --layout rcr \ + --config_json configs/user_provided_config.json \ + --gen_all_individual +``` + +#### gemm_instance_builder_parallel.py +**Purpose**: Parallel version of the instance builder for faster generation of multiple kernel configurations. + +**Features**: +- Multi-threaded kernel generation +- Improved performance for large configuration spaces + +#### validation_utils.py +**Purpose**: Provides comprehensive validation functions for kernel configurations. + +**Key Functions**: +- `is_tile_config_valid()` - Validates tile dimensions and alignments +- `is_trait_combination_valid()` - Checks if pipeline/epilogue/scheduler combinations are supported +- `validate_warp_tile_combination()` - GPU-specific warp tile validation +- `validate_lds_capacity()` - Ensures configurations fit in LDS memory + +**Validation Checks**: +- Dimension alignment (tile dimensions must be divisible by warp dimensions) +- LDS capacity constraints +- GPU-specific warp tile support +- Unsupported trait combinations + +#### test_validation.py +**Purpose**: Test suite for the validation logic to ensure correctness. + +**Usage**: +```bash +python test_validation.py +``` + +**Tests**: +- Warp tile combination validation +- Trait combination validation +- Full tile configuration validation + +#### gemm_universal_benchmark.py +**Purpose**: Python script for running and analyzing GEMM benchmarks. + +**Features**: +- Automated benchmark execution +- Performance data collection +- Result analysis and reporting + +#### json_config.py +**Purpose**: Configuration file parsing and management. + +**Features**: +- JSON configuration loading +- Default configuration handling +- Configuration validation + +#### codegen_utils.py +**Purpose**: Utility functions for code generation. + +**Features**: +- Template processing +- Code formatting utilities +- File generation helpers + +### Shell Scripts + +#### test_benchmark.sh +**Purpose**: Automated benchmark testing script that finds and runs all built benchmark executables. + +**Features**: +- Automatic build directory detection +- Batch execution of multiple benchmarks +- CSV result collection +- Colored output for easy reading +- Example command generation + +**Usage**: +```bash +# Auto-detect build directory +./test_benchmark.sh + +# Specify build directory +./test_benchmark.sh /path/to/build/directory +``` + +**What it does**: +1. Finds all benchmark executables in the build directory +2. Runs each with multiple problem sizes (512, 1024, 2048) +3. Performs GPU verification +4. Saves results to timestamped CSV file +5. 
Provides summary statistics + +## Command Line Options + +All benchmark executables support the following options: + +### Matrix Dimensions +- `-m=` - M dimension (default: 3840) +- `-n=` - N dimension (default: 4096) +- `-k=` - K dimension (default: 2048) + +### Strides +- `-stride_a=` - Stride for matrix A (default: 0, auto-calculated) +- `-stride_b=` - Stride for matrix B (default: 0, auto-calculated) +- `-stride_c=` - Stride for matrix C (default: 0, auto-calculated) + +### Verification +- `-verify=<0|1|2>` - Verification mode + - 0: No verification (default) + - 1: CPU verification + - 2: GPU verification + +### Performance Testing +- `-warmup=` - Warmup iterations (default: 50) +- `-repeat=` - Benchmark iterations (default: 100) +- `-timer=` - Use GPU timer (default: true) +- `-flush_cache=` - Flush cache between runs (default: true) +- `-rotating_count=` - Cache rotation count (default: 1000) + +### Initialization +- `-init=<0|1|2>` - Tensor initialization method + - 0: Random values [-1, 1] (default) + - 1: Linear sequence (i % 17) + - 2: Constant value (1.0) + +### Output Options +- `-log=` - Enable verbose logging (default: false) +- `-metric=<0|1|2>` - Performance metric + - 0: Latency in ms (default) + - 1: TFLOPS + - 2: Bandwidth in GB/s +- `-json_output=` - JSON format output (default: false) +- `-csv_filename=` - Save results to CSV +- `-csv_format=` - CSV format (default: comprehensive) + +### Advanced Options +- `-split_k=` - Split-K factor (default: 1) +- `-structured_sparsity=` - Enable structured sparsity (default: false) +- `-pipeline=` - Pipeline type (default: compv3) +- `-scheduler=` - Scheduler type (default: intrawave) +- `-epilogue=` - Epilogue type (default: cshuffle) +- `-pad_m=` - Pad M dimension (default: false) +- `-pad_n=` - Pad N dimension (default: false) +- `-pad_k=` - Pad K dimension (default: false) +- `-persistent=` - Use persistent kernel (default: false) + +## Understanding Kernel Names + +The kernel naming convention encodes the configuration: + +``` +benchmark_gemm_universal_fp16_rcr_compv3_default_intrawave_False_False_False_False_256x128x32_4x1x1_32x32x16 + ^^^^ ^^^ ^^^^^^ ^^^^^^^ ^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^ ^^^^^^^ ^^^^^^^^^ + | | | | | | | | | + | | | | | Padding & flags | | Warp tile + | | | | Scheduler | Thread tile + | | | Epilogue Block tile + | | Pipeline + | Layout (Row-Column-Row) + Data type +``` + +### Components: +- **Data type**: fp16, fp32, bf16, fp8, bf8, int8 +- **Layout**: rcr (Row-Column-Row), rrr, crr, ccr +- **Pipeline**: mem, compv3, compv4 +- **Epilogue**: default, cshuffle +- **Scheduler**: intrawave, interwave +- **Flags**: pad_m, pad_n, pad_k, persistent (4 boolean flags) +- **Tile sizes**: BlockTile x ThreadTile x WarpTile + +## Troubleshooting + +### Common Issues + +1. **Kernel not found** + - Ensure the specific benchmark executable is built + - Check the build directory bin/ folder + +2. **Verification failures** + - Try GPU verification (-verify=2) which may be more accurate + - Check data type compatibility + - Verify stride calculations + +3. **Build failures** + - Check GPU architecture compatibility + - Ensure ROCm is properly installed + - Verify configuration file syntax + +4. **Performance variations** + - Increase warmup iterations + - Disable CPU frequency scaling + - Use GPU timer for accurate measurements + +### Debug Options + +Enable verbose logging: +```bash +./bin/benchmark_gemm_... 
-log=true -verify=1 +``` + +Test validation logic: +```bash +python test_validation.py +``` + +## Performance Tips + +1. **Optimal Problem Sizes**: Use sizes that are multiples of tile dimensions +2. **Warmup**: Use at least 50-100 warmup iterations +3. **GPU Timer**: Always use `-timer=true` for accurate measurements +4. **Cache Management**: Enable cache flushing for consistent results +5. **Thread Affinity**: Set CPU affinity to reduce variation + +## Integration Examples + +### Python Integration + +```python +import subprocess +import json + +# Run benchmark with JSON output +result = subprocess.run([ + './bin/benchmark_gemm_universal_fp16_rcr_...', + '-m=1024', '-n=1024', '-k=1024', + '-json_output=true' +], capture_output=True, text=True) + +# Parse results +data = json.loads(result.stdout) +print(f"Performance: {data['tflops']} TFLOPS") +``` + +### Batch Testing Script + +```bash +#!/bin/bash +SIZES="512 1024 2048 4096" +for size in $SIZES; do + echo "Testing ${size}x${size}x${size}" + ./bin/benchmark_gemm_... -m=$size -n=$size -k=$size \ + -verify=2 -csv_filename=results.csv +done +``` + +## Contributing + +When adding new features or configurations: +1. Update validation logic in `validation_utils.py` +2. Add tests to `test_validation.py` +3. Update configuration examples +4. Document new command-line options + +For more information about the Composable Kernel project, visit the main repository documentation. diff --git a/tile_engine/ops/gemm/gemm_benchmark.hpp b/tile_engine/ops/gemm/gemm_benchmark.hpp new file mode 100644 index 00000000000..7439264a391 --- /dev/null +++ b/tile_engine/ops/gemm/gemm_benchmark.hpp @@ -0,0 +1,116 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "common/utils.hpp" + +// Data types and Layouts are defined by the generated kernel headers +// No hardcoded type definitions here to avoid conflicts +struct GemmProblem +{ + int split_k_; + int m_, n_, k_; + int stride_a_, stride_b_, stride_c_; + + std::string dtype_a_, dtype_b_, dtype_acc_, dtype_c_; + std::string layout_a_, layout_b_, layout_c_; + + bool structured_sparsity_; + + friend std::ostream& operator<<(std::ostream& os, const GemmProblem& problem) + { + os << "{\n" + << " \"split_k\":" << problem.split_k_ << ",\n" + << " \"m\":" << problem.m_ << ",\n" + << " \"n\":" << problem.n_ << ",\n" + << " \"k\":" << problem.k_ << ",\n" + << " \"stride_a\":" << problem.stride_a_ << ",\n" + << " \"stride_b\":" << problem.stride_b_ << ",\n" + << " \"stride_c\":" << problem.stride_c_ << ",\n" + << " \"dtype_a\":\"" << problem.dtype_a_ << "\",\n" + << " \"dtype_b\":\"" << problem.dtype_b_ << "\",\n" + << " \"dtype_acc\":\"" << problem.dtype_acc_ << "\",\n" + << " \"dtype_c\":\"" << problem.dtype_c_ << "\",\n" + << " \"layout_a\":\"" << problem.layout_a_ << "\",\n" + << " \"layout_b\":\"" << problem.layout_b_ << "\",\n" + << " \"layout_c\":\"" << problem.layout_c_ << "\",\n" + << " \"structured_sparsity\":" << (problem.structured_sparsity_ ? 
"true" : "false") + << "\n" + << "}"; + return os; + } +}; + +// Detect Problem::DsDataType, default to void when absent +template +struct get_DsDataType +{ + using type = void; +}; + +template +struct get_DsDataType> +{ + using type = typename T::DsDataType; +}; + +// Detect Problem::D0DataType, default to void when absent +template +struct get_D0DataType +{ + using type = void; +}; + +template +struct get_D0DataType> +{ + using type = typename T::D0DataType; +}; + +/// @brief Function to compare the results of the device and host computations +template +bool compare(std::string instanceName, + ck_tile::index_t K, + ck_tile::index_t kbatch, + ck_tile::HostTensor& c_m_n_dev_result, + ck_tile::HostTensor& c_m_n_host_result) +{ + using DDataType = typename get_D0DataType::type; + const float max_accumulated_value = + *std::max_element(c_m_n_host_result.mData.begin(), c_m_n_host_result.mData.end()); + // const auto rtol_atol = calculate_rtol_atol( + // K, kbatch, max_accumulated_value); + auto rtol_atol = [&] { + if constexpr(std::is_void_v) + { + return calculate_rtol_atol( + K, kbatch, max_accumulated_value); + } + else + { + return calculate_rtol_atol( + K, kbatch, max_accumulated_value); + } + }(); + bool pass = ck_tile::check_err(c_m_n_dev_result, + c_m_n_host_result, + "Error: Incorrect results!", + rtol_atol.at(ck_tile::number<0>{}), + rtol_atol.at(ck_tile::number<1>{})); + + std::cout << "For " << instanceName << " Relative error threshold is " + << rtol_atol.at(ck_tile::number<0>{}) << " Absolute error threshold is " + << rtol_atol.at(ck_tile::number<1>{}) << std::endl; + std::cout << "The verification result is:" << (pass ? "correct" : "fail") << std::endl; + + return pass; +} diff --git a/tile_engine/ops/gemm/gemm_benchmark.py b/tile_engine/ops/gemm/gemm_benchmark.py new file mode 100644 index 00000000000..c229b142338 --- /dev/null +++ b/tile_engine/ops/gemm/gemm_benchmark.py @@ -0,0 +1,331 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+# SPDX-License-Identifier: MIT + +import os +import sys +import json +import subprocess +import argparse +import csv +import time +import importlib.util +from pathlib import Path +from typing import List, Dict, Tuple, Optional + +# TODO: explore modularizing tile engine to avoid accessing imports like this +def _import_benchmark_utils(): + """Import benchmark utilities from commons directory.""" + current_dir = os.path.dirname(os.path.abspath(__file__)) + parent_dir = os.path.dirname(current_dir) + + # Load the module dynamically + spec = importlib.util.spec_from_file_location( + "benchmark_utils", + os.path.join(parent_dir, "common", "benchmark_utils.py"), + ) + benchmark_utils = importlib.util.module_from_spec(spec) + spec.loader.exec_module(benchmark_utils) + + return benchmark_utils + +benchmark_utils = _import_benchmark_utils() + +class GemmBenchmark: + def __init__(self, build_dir: str, verbose: bool = False, name: str = "benchmark_gemm_"): + self.build_dir = Path(build_dir) + self.verbose = verbose + self.results = [] + self.name = name + + def discover_kernels(self) -> List[Path]: + """Find all benchmark_gemm_* executables in the build directory""" + bin_dir = self.build_dir / "bin" + if not bin_dir.exists(): + print(f"Error: Binary directory {bin_dir} does not exist") + return [] + + glob_name = f"{self.name}*" + kernels = list(bin_dir.glob(glob_name)) + if self.verbose: + print(f"Found {len(kernels)} kernel executables") + for k in kernels: + print(f" - {k.name}") + return kernels + + def extract_kernel_info(self, kernel_path: Path) -> Dict[str, str]: + """Extract comprehensive kernel information from filename""" + name = kernel_path.stem + if name.startswith(self.name): + args = name[len(self.name):] + else: + args = name + + # Initialize with basic info + info = { + "executable": str(kernel_path), + "name": name, + "data_type": "unknown", + "layout": "unknown", + "pipeline": "unknown", + "scheduler": "unknown", + "epilogue": "unknown", + } + + # Parse the kernel name pattern: + # benchmark_gemm_fp16_rcr_mem_default_intrawave_False_False_False_False_False_256x256x32_2x2x1_4x64x16 + parts = args.split("_") + + if len(parts) >= 5: + info["data_type"] = parts[0] + info["layout"] = parts[1] + info["pipeline"] = parts[2] + info["epilogue"] = parts[3] + info["scheduler"] = parts[4] + + # Extract detailed configuration from the end of the name + config_info = self.parse_detailed_config(name) + info.update(config_info) + + # Generate config ID + info["config_id"] = self.generate_config_id(info) + + return info + + def parse_detailed_config(self, kernel_name: str) -> Dict: + """Parse detailed configuration from kernel name""" + config = { + "tile_sizes": {"tile_m": 0, "tile_n": 0, "tile_k": 0}, + "warp_config": {"warp_m": 0, "warp_n": 0, "warp_k": 0}, + "warp_tile": {"warp_tile_m": 0, "warp_tile_n": 0, "warp_tile_k": 0}, + "optimization_flags": { + "pad_m": False, + "pad_n": False, + "pad_k": False, + "persistent": False, + }, + } + + # Split by underscore and look for patterns + parts = kernel_name.split("_") + + # Look for boolean flags (sequence of True/False values) + bool_sequence = [] + for i, part in enumerate(parts): + if part in ["True", "False"]: + bool_sequence.append(part == "True") + # Continue collecting consecutive boolean values + j = i + 1 + while j < len(parts) and parts[j] in ["True", "False"]: + bool_sequence.append(parts[j] == "True") + j += 1 + break + + # Assign boolean flags if we found them + # Order: pad_m, pad_n, pad_k, persistent (4 flags total) + if 
len(bool_sequence) >= 4: + config["optimization_flags"]["pad_m"] = bool_sequence[0] + config["optimization_flags"]["pad_n"] = bool_sequence[1] + config["optimization_flags"]["pad_k"] = bool_sequence[2] + config["optimization_flags"]["persistent"] = bool_sequence[3] + + # Look for tile size patterns (e.g., 256x256x32_2x2x1_4x64x16) + # The pattern is: tile_sizes_warp_config_warp_tile + dimension_groups = [] + for part in parts: + if "x" in part and len(part.split("x")) == 3: + try: + dims = [int(x) for x in part.split("x")] + if all(d > 0 for d in dims): + dimension_groups.append(dims) + except ValueError: + continue + + # Assign dimensions based on order and magnitude + if len(dimension_groups) >= 3: + # Sort by magnitude to identify: largest=tile_sizes, smallest=warp_config, middle=warp_tile + sorted_groups = sorted(dimension_groups, key=lambda x: max(x), reverse=True) + + # Largest dimensions = tile sizes + config["tile_sizes"]["tile_m"] = sorted_groups[0][0] + config["tile_sizes"]["tile_n"] = sorted_groups[0][1] + config["tile_sizes"]["tile_k"] = sorted_groups[0][2] + + # Smallest dimensions = warp config + config["warp_config"]["warp_m"] = sorted_groups[2][0] + config["warp_config"]["warp_n"] = sorted_groups[2][1] + config["warp_config"]["warp_k"] = sorted_groups[2][2] + + # Middle dimensions = warp tile + config["warp_tile"]["warp_tile_m"] = sorted_groups[1][0] + config["warp_tile"]["warp_tile_n"] = sorted_groups[1][1] + config["warp_tile"]["warp_tile_k"] = sorted_groups[1][2] + elif len(dimension_groups) == 2: + # If only 2 groups, assign based on magnitude + sorted_groups = sorted(dimension_groups, key=lambda x: max(x), reverse=True) + + # Larger = tile sizes + config["tile_sizes"]["tile_m"] = sorted_groups[0][0] + config["tile_sizes"]["tile_n"] = sorted_groups[0][1] + config["tile_sizes"]["tile_k"] = sorted_groups[0][2] + + # Smaller = warp config + config["warp_config"]["warp_m"] = sorted_groups[1][0] + config["warp_config"]["warp_n"] = sorted_groups[1][1] + config["warp_config"]["warp_k"] = sorted_groups[1][2] + elif len(dimension_groups) == 1: + # Only one group - assume it's tile sizes + config["tile_sizes"]["tile_m"] = dimension_groups[0][0] + config["tile_sizes"]["tile_n"] = dimension_groups[0][1] + config["tile_sizes"]["tile_k"] = dimension_groups[0][2] + + return config + + def generate_config_id(self, info: Dict) -> str: + """Generate a compact config ID from kernel info""" + # Create a compact identifier + parts = [ + info.get("data_type", "unk"), + info.get("layout", "unk"), + info.get("pipeline", "unk"), + info.get("scheduler", "unk"), + ] + + # Add tile configuration if available + tile_sizes = info.get("tile_sizes", {}) + if tile_sizes.get("tile_m", 0) > 0: + tile_str = ( + f"{tile_sizes['tile_m']}x{tile_sizes['tile_n']}x{tile_sizes['tile_k']}" + ) + parts.append(tile_str) + + # Add warp config if available + warp_config = info.get("warp_config", {}) + if warp_config.get("warp_m", 0) > 0: + warp_str = f"w{warp_config['warp_m']}x{warp_config['warp_n']}x{warp_config['warp_k']}" + parts.append(warp_str) + + # Add warp tile if available + warp_tile = info.get("warp_tile", {}) + if warp_tile.get("warp_tile_m", 0) > 0: + warp_tile_str = f"wt{warp_tile['warp_tile_m']}x{warp_tile['warp_tile_n']}x{warp_tile['warp_tile_k']}" + parts.append(warp_tile_str) + + return "_".join(parts) + + def benchmark_problem_size( + self, + kernels: List[Path], + m: int, + n: int, + k: int, + split_k: int = 1, + verify: int = 0, + warmup: int = 50, + repeat: int = 100, + flush_cache: bool = True, 
+ rotating_count: int = 1000, + ) -> List[Dict]: + """Benchmark all kernels for a specific problem size""" + results = [] + + params = { + "m": m, + "n": n, + "k": k, + "split_k": split_k, + "verify": verify, + "warmup": warmup, + "repeat": repeat, + "flush_cache": str(flush_cache).lower(), + "rotating_count": rotating_count, + } + + print(f"\nBenchmarking M={m}, N={n}, K={k}, split_k={split_k}") + + for kernel_path in kernels: + kernel_info = self.extract_kernel_info(kernel_path) + result = benchmark_utils.run_kernel(self.build_dir, kernel_path, params, verbose=self.verbose) + if result: + # Create new structured result format + structured_result = { + "name": kernel_info["name"], # Add name field for compatibility + "config_id": kernel_info["config_id"], + "problem": result.get("problem", {}), + "perf_result": result.get("perf_result", {}), + "config": { + "data_type": kernel_info["data_type"], + "layout": kernel_info["layout"], + "pipeline": kernel_info["pipeline"], + "scheduler": kernel_info["scheduler"], + "epilogue": kernel_info["epilogue"], + "tile_sizes": kernel_info.get("tile_sizes", {}), + "warp_config": kernel_info.get("warp_config", {}), + "warp_tile": kernel_info.get("warp_tile", {}), + "optimization_flags": kernel_info.get("optimization_flags", {}), + }, + "executable": kernel_info["executable"], + # Keep backward compatibility fields + "time_ms": result.get("time_ms", 0), + "tflops": result.get("tflops", 0), + "bandwidth_gb_s": result.get("bandwidth_gb_s", 0), + } + + results.append(structured_result) + + if self.verbose: + print( + f" {kernel_info['config_id']}: {structured_result['tflops']:.2f} TFLOPS, {structured_result['bandwidth_gb_s']:.2f} GB/s, {structured_result['time_ms']:.2f}ms" + ) + + return results + + def benchmark_sweep( + self, + problem_sizes: List[Tuple[int, int, int]], + split_k_values: List[int] = [1], + verify: bool = False, + warmup: int = 50, + repeat: int = 100, + flush_cache: bool = True, + rotating_count: int = 1000, + ) -> Dict: + """Run comprehensive benchmark sweep""" + kernels = self.discover_kernels() + if not kernels: + print("No kernels found!") + return {} + + all_results = [] + best_kernels = {} + + for m, n, k in problem_sizes: + for split_k in split_k_values: + results = self.benchmark_problem_size( + kernels, + m, + n, + k, + split_k, + verify=2 if verify else 0, + warmup=warmup, + repeat=repeat, + flush_cache=flush_cache, + rotating_count=rotating_count, + ) + + all_results.extend(results) + + # Find best kernel for this configuration + best = benchmark_utils.find_best_kernel(results) + if best: + key = f"m{m}_n{n}_k{k}_splitk{split_k}" + best_kernels[key] = best + print( + f"Best for {key}: {best['name']} ({best['tflops']:.2f} TFLOPS, {best['bandwidth_gb_s']:.2f} GB/s, {best['time_ms']:.2f}ms)" + ) + + self.results = all_results + return best_kernels + + diff --git a/tile_engine/ops/gemm/gemm_common.hpp b/tile_engine/ops/gemm/gemm_common.hpp new file mode 100644 index 00000000000..3a9aed2bc6d --- /dev/null +++ b/tile_engine/ops/gemm/gemm_common.hpp @@ -0,0 +1,96 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT
+
+#pragma once
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+
+// Structure to hold kernel traits for dispatcher
+struct KernelTraits
+{
+    std::string pipeline;  // compv3, compv4, mem
+    std::string scheduler; // intrawave, interwave
+    std::string epilogue;  // cshuffle, default
+    bool pad_m;
+    bool pad_n;
+    bool pad_k;
+    bool persistent;
+
+    // Constructor with defaults
+    KernelTraits()
+        : pipeline("compv3"),
+          scheduler("intrawave"),
+          epilogue("cshuffle"),
+          pad_m(false),
+          pad_n(false),
+          pad_k(false),
+          persistent(false)
+    {
+    }
+};
+
+// Create argument parser
+inline auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.")
+        .insert("n", "4096", "The value for n dimension. Default is 4096.")
+        .insert("k", "2048", "The value for k dimension. Default is 2048.")
+        .insert("stride_a", "0", "The stride value for tensor A. Default is 0.")
+        .insert("stride_b", "0", "The stride value for tensor B. Default is 0.")
+        .insert("stride_ds", "0", "The stride value for tensor Ds. Default is 0.")
+        .insert("stride_c", "0", "The stride value for tensor C. Default is 0.")
+        .insert("split_k", "1", "The split value for k dimension. Default is 1.")
+        .insert("verify",
+                "2",
+                "The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 "
+                "for validation on GPU. Default is 2, GPU validation.")
+        .insert("log",
+                "false",
+                "Whether to output kernel instance information. Possible values are true or "
+                "false. Default is false.")
+        .insert("warmup",
+                "50",
+                "The number of warmup iterations before benchmarking the kernel. Default is 50.")
+        .insert(
+            "repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.")
+        .insert("timer",
+                "true",
+                "Whether the timer is a GPU timer. Possible values are false or true. "
+                "Default is true.")
+        .insert("init",
+                "0",
+                "The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 "
+                "for constant(1). Default is 0, random.")
+        .insert("flush_cache",
+                "true",
+                "Whether to flush the cache between runs. Possible values are true or false. "
+                "Default is true.")
+        .insert("rotating_count",
+                "1000",
+                "The number of iterations to rotate the cache. Default is 1000.")
+        .insert("metric",
+                "0",
+                "Metric with which to measure kernel performance. Set to 0 for latency, 1 for "
+                "tflops, or 2 for bandwidth. Default is 0, latency.")
+        .insert("csv_filename",
+                "",
+                "The filename of the benchmark result. Default is empty (no CSV output).")
+        .insert("structured_sparsity",
+                "false",
+                "Whether to use the sparsity kernel. Possible values are true or false. "
+                "Default is false.")
+        .insert("json_output",
+                "false",
+                "Whether to output results in JSON format only. Possible values are true or false. "
" + "Default is " + "false"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} diff --git a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.hpp b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.hpp index f8c196e32af..23efbfc1682 100644 --- a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.hpp +++ b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.hpp @@ -11,37 +11,15 @@ #include "ck_tile/core.hpp" #include "ck_tile/host.hpp" -#include "gemm_multi_d_common.hpp" +#include "gemm/gemm_benchmark.hpp" // Data types and Layouts are defined by the generated kernel headers // No hardcoded type definitions here to avoid conflicts - -enum class Metric -{ - LATENCY = 0, - TFLOPS = 1, - BANDWIDTH = 2 -}; - -inline constexpr auto get_metric_name(Metric m) +struct GemmMultiDProblem : GemmProblem { - switch(m) - { - case Metric::LATENCY: return "latency"; - case Metric::TFLOPS: return "tflops"; - case Metric::BANDWIDTH: return "bandwidth"; - default: throw std::invalid_argument("Unsupported metric type"); - } -} - -struct GemmMultiDProblem -{ - int split_k_; - int m_, n_, k_; - int stride_a_, stride_b_, stride_d0_, stride_d1_, stride_c_; - - std::string dtype_a_, dtype_b_, dtype_d0_, dtype_d1_, dtype_acc_, dtype_c_; - std::string layout_a_, layout_b_, layout_d0_, layout_d1_, layout_c_; + int stride_d0_, stride_d1_; + std::string dtype_d0_, dtype_d1_; + std::string layout_d0_, layout_d1_; friend std::ostream& operator<<(std::ostream& os, const GemmMultiDProblem& problem) { @@ -71,144 +49,6 @@ struct GemmMultiDProblem } }; -struct PerformanceResult -{ - double latency_; - double tflops_; - double bandwidth_; - - static bool compare(const PerformanceResult& a, const PerformanceResult& b, Metric m) - { - switch(m) - { - case Metric::LATENCY: return a.latency_ < b.latency_; - case Metric::TFLOPS: return a.tflops_ > b.tflops_; - case Metric::BANDWIDTH: return a.bandwidth_ > b.bandwidth_; - default: throw std::invalid_argument("Unsupported metric type"); - } - } - - friend std::ostream& operator<<(std::ostream& os, const PerformanceResult& result) - { - os << "{\n" - << " \"latency(ms)\": " << std::fixed << std::setprecision(2) << result.latency_ - << ",\n" - << " \"tflops(TFlops)\": " << result.tflops_ << ",\n" - << " \"bandwidth(GB/s)\": " << result.bandwidth_ << "\n" - << "}"; - return os; - } -}; - -struct KernelInstance -{ - std::string name_; - GemmMultiDProblem problem_; - PerformanceResult perf_result_; - - static bool compare(const KernelInstance& a, const KernelInstance& b, Metric m) - { - return PerformanceResult::compare(a.perf_result_, b.perf_result_, m); - } - - friend std::ostream& operator<<(std::ostream& os, const KernelInstance& obj) - { - os << "{\n" - << " \"name\": \"" << obj.name_ << "\",\n" - << " \"problem\": " << obj.problem_ << ",\n" - << " \"perf_result\": " << obj.perf_result_ << "\n" - << "}"; - return os; - } -}; - -struct Setting -{ - int n_warmup_; - int n_repeat_; - bool is_gpu_timer_; - int verify_; - int init_method_; - bool log_; - std::string csv_filename_; - bool flush_cache_; - int rotating_count_; - bool json_output_; -}; - -inline std::string get_rocm_version() -{ - std::ifstream version_file("/opt/rocm/.info/version"); - if(version_file.is_open()) - { - std::string version; - std::getline(version_file, version); - return version; - } - return "Unknown"; -} - -template -auto calculate_rtol_atol(const ck_tile::index_t K, - const ck_tile::index_t kbatch, - const float max_accumulated_value) 
-{ - using ComputeTypeAB = - std::conditional_t; - - using ComputeType = - std::conditional_t; - - // Calculate thresholds - const auto rtol = ck_tile::get_relative_threshold( - ck_tile::integer_divide_ceil(K, kbatch)); - - const auto atol = ck_tile::get_absolute_threshold( - max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch)); - - // Calculate error due to split_k accumulation - const auto rtol_split_k = - ck_tile::get_relative_threshold(kbatch); - - const auto atol_split_k = ck_tile::get_absolute_threshold( - max_accumulated_value, kbatch); - - // Use higher threshold - return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k)); -} - -/// @brief Function to compare the results of the device and host computations -bool compare(std::string instanceName, - ck_tile::index_t K, - ck_tile::index_t kbatch, - ck_tile::HostTensor& c_m_n_dev_result, - ck_tile::HostTensor& c_m_n_host_result) -{ - const float max_accumulated_value = - *std::max_element(c_m_n_host_result.mData.begin(), c_m_n_host_result.mData.end()); - - const auto rtol_atol = - calculate_rtol_atol( - K, kbatch, max_accumulated_value); - - bool pass = ck_tile::check_err(c_m_n_dev_result, - c_m_n_host_result, - "Error: Incorrect results!", - rtol_atol.at(ck_tile::number<0>{}), - rtol_atol.at(ck_tile::number<1>{})); - - std::cout << "For " << instanceName << " Relative error threshold is " - << rtol_atol.at(ck_tile::number<0>{}) << " Absolute error threshold is " - << rtol_atol.at(ck_tile::number<1>{}) << std::endl; - std::cout << "The verification result is:" << (pass ? "correct" : "fail") << std::endl; - - return pass; -} - /// @brief Function to get the kernel output with reference implementation on CPU/GPU void gemm_multi_d_host_reference(int verify, ck_tile::HostTensor& a_m_k, diff --git a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py index faf04a7de0d..2e313c3fed6 100644 --- a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py +++ b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py @@ -1,587 +1,53 @@ # Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
# SPDX-License-Identifier: MIT +import os import sys import json import subprocess import argparse import csv import time +import importlib.util from pathlib import Path from typing import List, Dict, Tuple, Optional +def _import_gemm_benchmark(): + """Import validation utilities from commons directory.""" + current_dir = os.path.dirname(os.path.abspath(__file__)) + parent_dir = os.path.dirname(current_dir) -class GemmMultiDBenchmark: - def __init__(self, build_dir: str, verbose: bool = False): - self.build_dir = Path(build_dir) - self.verbose = verbose - self.results = [] - - def discover_kernels(self) -> List[Path]: - """Find all benchmark_gemm_multi_d_* executables in the build directory""" - bin_dir = self.build_dir / "bin" - if not bin_dir.exists(): - print(f"Error: Binary directory {bin_dir} does not exist") - return [] - - kernels = list(bin_dir.glob("benchmark_gemm_multi_d_*")) - if self.verbose: - print(f"Found {len(kernels)} kernel executables") - for k in kernels: - print(f" - {k.name}") - return kernels - - def extract_kernel_info(self, kernel_path: Path) -> Dict[str, str]: - """Extract comprehensive kernel information from filename""" - name = kernel_path.stem - - # Initialize with basic info - info = { - "executable": str(kernel_path), - "name": name, - "data_type": "unknown", - "layout": "unknown", - "pipeline": "unknown", - "scheduler": "unknown", - "epilogue": "unknown", - } - - # Parse the kernel name pattern: - # benchmark_gemm_multi_d_fp16_rcr_mem_default_intrawave_False_False_False_False_False_256x256x32_2x2x1_4x64x16 - parts = name.split("_") - - if len(parts) >= 5: - # Extract data type (3rd part after benchmark_gemm_) - info["data_type"] = parts[4] if len(parts) > 4 else "unknown" - - # Extract layout (4th part) - info["layout"] = parts[5] if len(parts) > 5 else "unknown" - - # Extract pipeline (5th part) - info["pipeline"] = parts[6] if len(parts) > 6 else "unknown" - - # Extract epilogue (6th part) - info["epilogue"] = parts[7] if len(parts) > 7 else "unknown" - - # Extract scheduler (7th part) - info["scheduler"] = parts[8] if len(parts) > 8 else "unknown" - - # Extract detailed configuration from the end of the name - config_info = self.parse_detailed_config(name) - info.update(config_info) - - # Generate config ID - info["config_id"] = self.generate_config_id(info) - - return info - - def parse_detailed_config(self, kernel_name: str) -> Dict: - """Parse detailed configuration from kernel name""" - config = { - "tile_sizes": {"tile_m": 0, "tile_n": 0, "tile_k": 0}, - "warp_config": {"warp_m": 0, "warp_n": 0, "warp_k": 0}, - "warp_tile": {"warp_tile_m": 0, "warp_tile_n": 0, "warp_tile_k": 0}, - "optimization_flags": { - "pad_m": False, - "pad_n": False, - "pad_k": False, - "persistent": False, - }, - } - - # Split by underscore and look for patterns - parts = kernel_name.split("_") - - # Look for boolean flags (sequence of True/False values) - bool_sequence = [] - for i, part in enumerate(parts): - if part in ["True", "False"]: - bool_sequence.append(part == "True") - # Continue collecting consecutive boolean values - j = i + 1 - while j < len(parts) and parts[j] in ["True", "False"]: - bool_sequence.append(parts[j] == "True") - j += 1 - break - - # Assign boolean flags if we found them - # Order: pad_m, pad_n, pad_k, persistent (4 flags total) - if len(bool_sequence) >= 4: - config["optimization_flags"]["pad_m"] = bool_sequence[0] - config["optimization_flags"]["pad_n"] = bool_sequence[1] - config["optimization_flags"]["pad_k"] = bool_sequence[2] - 
config["optimization_flags"]["persistent"] = bool_sequence[3] - - # Look for tile size patterns (e.g., 256x256x32_2x2x1_4x64x16) - # The pattern is: tile_sizes_warp_config_warp_tile - dimension_groups = [] - for part in parts: - if "x" in part and len(part.split("x")) == 3: - try: - dims = [int(x) for x in part.split("x")] - if all(d > 0 for d in dims): - dimension_groups.append(dims) - except ValueError: - continue - - # Assign dimensions based on order and magnitude - if len(dimension_groups) >= 3: - # Sort by magnitude to identify: largest=tile_sizes, smallest=warp_config, middle=warp_tile - sorted_groups = sorted(dimension_groups, key=max, reverse=True) - - # Largest dimensions = tile sizes - config["tile_sizes"]["tile_m"] = sorted_groups[0][0] - config["tile_sizes"]["tile_n"] = sorted_groups[0][1] - config["tile_sizes"]["tile_k"] = sorted_groups[0][2] - - # Smallest dimensions = warp config - config["warp_config"]["warp_m"] = sorted_groups[2][0] - config["warp_config"]["warp_n"] = sorted_groups[2][1] - config["warp_config"]["warp_k"] = sorted_groups[2][2] - - # Middle dimensions = warp tile - config["warp_tile"]["warp_tile_m"] = sorted_groups[1][0] - config["warp_tile"]["warp_tile_n"] = sorted_groups[1][1] - config["warp_tile"]["warp_tile_k"] = sorted_groups[1][2] - elif len(dimension_groups) == 2: - # If only 2 groups, assign based on magnitude - sorted_groups = sorted(dimension_groups, key=max, reverse=True) - - # Larger = tile sizes - config["tile_sizes"]["tile_m"] = sorted_groups[0][0] - config["tile_sizes"]["tile_n"] = sorted_groups[0][1] - config["tile_sizes"]["tile_k"] = sorted_groups[0][2] - - # Smaller = warp config - config["warp_config"]["warp_m"] = sorted_groups[1][0] - config["warp_config"]["warp_n"] = sorted_groups[1][1] - config["warp_config"]["warp_k"] = sorted_groups[1][2] - elif len(dimension_groups) == 1: - # Only one group - assume it's tile sizes - config["tile_sizes"]["tile_m"] = dimension_groups[0][0] - config["tile_sizes"]["tile_n"] = dimension_groups[0][1] - config["tile_sizes"]["tile_k"] = dimension_groups[0][2] - - return config - - def generate_config_id(self, info: Dict) -> str: - """Generate a compact config ID from kernel info""" - # Create a compact identifier - parts = [ - info.get("data_type", "unk"), - info.get("layout", "unk"), - info.get("pipeline", "unk"), - info.get("scheduler", "unk"), - ] - - # Add tile configuration if available - tile_sizes = info.get("tile_sizes", {}) - if tile_sizes.get("tile_m", 0) > 0: - tile_str = ( - f"{tile_sizes['tile_m']}x{tile_sizes['tile_n']}x{tile_sizes['tile_k']}" - ) - parts.append(tile_str) - - # Add warp config if available - warp_config = info.get("warp_config", {}) - if warp_config.get("warp_m", 0) > 0: - warp_str = f"w{warp_config['warp_m']}x{warp_config['warp_n']}x{warp_config['warp_k']}" - parts.append(warp_str) - - # Add warp tile if available - warp_tile = info.get("warp_tile", {}) - if warp_tile.get("warp_tile_m", 0) > 0: - warp_tile_str = f"wt{warp_tile['warp_tile_m']}x{warp_tile['warp_tile_n']}x{warp_tile['warp_tile_k']}" - parts.append(warp_tile_str) - - return "_".join(parts) - - def run_kernel(self, kernel_path: Path, params: Dict[str, str]) -> Optional[Dict]: - """Run a single kernel with given parameters and save output to individual JSON file""" - # Create results directory - results_dir = self.build_dir / "results" - results_dir.mkdir(exist_ok=True) - - # Generate unique JSON filename for this kernel - json_file = results_dir / f"{kernel_path.stem}.json" - - cmd = [str(kernel_path)] - - # Add 
parameters - for key, value in params.items(): - cmd.append(f"-{key}={value}") - - # Add JSON output flag for clean JSON output - cmd.append("-json_output=true") - - if self.verbose: - print(f"Running: {' '.join(cmd)}") - - try: - result = subprocess.run(cmd, capture_output=True, text=True, timeout=60) - - if result.returncode != 0: - print(f"Error running {kernel_path.name}: {result.stderr}") - return None - - # Save raw output to individual JSON file - output = result.stdout.strip() - if output: - with open(json_file, "w") as f: - f.write(output) - - # Parse the JSON file - return self.parse_json_file(json_file) - else: - print(f"No output from {kernel_path.name}") - return None - - except subprocess.TimeoutExpired: - print(f"Timeout running {kernel_path.name}") - return None - except Exception as e: - print(f"Error running {kernel_path.name}: {e}") - return None - - def parse_json_file(self, json_file: Path) -> Optional[Dict]: - """Parse JSON data from individual kernel output file""" - try: - with open(json_file, "r") as f: - content = f.read().strip() - - # Parse the JSON directly since executables produce clean JSON - data = json.loads(content) - - # Return the complete JSON data as-is, just add some convenience fields - result = data.copy() - if "perf_result" in data: - perf = data["perf_result"] - # Add convenience fields for backward compatibility - result["time_ms"] = perf.get("latency(ms)", 0) - result["tflops"] = perf.get("tflops(TFlops)", 0) - result["bandwidth_gb_s"] = perf.get("bandwidth(GB/s)", 0) - - return result - - except json.JSONDecodeError as e: - if self.verbose: - print(f"Failed to parse JSON from {json_file}: {e}") - return None - except Exception as e: - if self.verbose: - print(f"Error reading JSON file {json_file}: {e}") - return None - - def benchmark_problem_size( - self, - kernels: List[Path], - m: int, - n: int, - k: int, - split_k: int = 1, - verify: int = 0, - warmup: int = 50, - repeat: int = 100, - flush_cache: bool = True, - rotating_count: int = 1000, - ) -> List[Dict]: - """Benchmark all kernels for a specific problem size""" - results = [] - - params = { - "m": m, - "n": n, - "k": k, - "split_k": split_k, - "verify": verify, - "warmup": warmup, - "repeat": repeat, - "flush_cache": str(flush_cache).lower(), - "rotating_count": rotating_count, - } - - print(f"\nBenchmarking M={m}, N={n}, K={k}, split_k={split_k}") - - for kernel_path in kernels: - kernel_info = self.extract_kernel_info(kernel_path) - result = self.run_kernel(kernel_path, params) - - if result: - # Create new structured result format - structured_result = { - "name": kernel_info["name"], # Add name field for compatibility - "config_id": kernel_info["config_id"], - "problem": result.get("problem", {}), - "perf_result": result.get("perf_result", {}), - "config": { - "data_type": kernel_info["data_type"], - "layout": kernel_info["layout"], - "pipeline": kernel_info["pipeline"], - "scheduler": kernel_info["scheduler"], - "epilogue": kernel_info["epilogue"], - "tile_sizes": kernel_info.get("tile_sizes", {}), - "warp_config": kernel_info.get("warp_config", {}), - "warp_tile": kernel_info.get("warp_tile", {}), - "optimization_flags": kernel_info.get("optimization_flags", {}), - }, - "executable": kernel_info["executable"], - # Keep backward compatibility fields - "time_ms": result.get("time_ms", 0), - "tflops": result.get("tflops", 0), - "bandwidth_gb_s": result.get("bandwidth_gb_s", 0), - } - - results.append(structured_result) - - if self.verbose: - print( - f" {kernel_info['config_id']}: 
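The per-kernel launcher removed here follows a simple contract that the shared implementation appears to keep: pass every parameter as `-key=value`, force `-json_output=true` so stdout is machine-readable, persist it to `results/<kernel>.json`, and bail out on a non-zero exit code or a timeout. A trimmed-down sketch of that contract (timeout and directory layout as in the original):

```python
import subprocess
from pathlib import Path
from typing import Dict, Optional

def run_kernel(kernel_path: Path, params: Dict[str, str], results_dir: Path) -> Optional[Path]:
    """Run one benchmark executable and persist its JSON output; return None on failure."""
    results_dir.mkdir(exist_ok=True)
    json_file = results_dir / f"{kernel_path.stem}.json"
    cmd = [str(kernel_path)] + [f"-{k}={v}" for k, v in params.items()] + ["-json_output=true"]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
    except subprocess.TimeoutExpired:
        print(f"Timeout running {kernel_path.name}")
        return None
    if result.returncode != 0 or not result.stdout.strip():
        print(f"Error running {kernel_path.name}: {result.stderr}")
        return None
    json_file.write_text(result.stdout.strip())
    return json_file
```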
{structured_result['tflops']:.2f} TFLOPS, {structured_result['bandwidth_gb_s']:.2f} GB/s, {structured_result['time_ms']:.2f}ms" - ) - - return results - - def find_best_kernel( - self, results: List[Dict], metric: str = "tflops" - ) -> Optional[Dict]: - """Find the best performing kernel based on metric""" - if not results: - return None - - if metric == "tflops": - return max(results, key=lambda x: x.get("tflops", 0)) - elif metric == "time_ms": - return min(results, key=lambda x: x.get("time_ms", float("inf"))) - elif metric == "bandwidth_gb_s": - return max(results, key=lambda x: x.get("bandwidth_gb_s", 0)) - else: - raise ValueError(f"Unknown metric: {metric}") - - def benchmark_sweep( - self, - problem_sizes: List[Tuple[int, int, int]], - split_k_values: List[int] = [1], - verify: bool = False, - warmup: int = 50, - repeat: int = 100, - flush_cache: bool = True, - rotating_count: int = 1000, - ) -> Dict: - """Run comprehensive benchmark sweep""" - kernels = self.discover_kernels() - if not kernels: - print("No kernels found!") - return {} - - all_results = [] - best_kernels = {} - - for m, n, k in problem_sizes: - for split_k in split_k_values: - results = self.benchmark_problem_size( - kernels, - m, - n, - k, - split_k, - verify=2 if verify else 0, - warmup=warmup, - repeat=repeat, - flush_cache=flush_cache, - rotating_count=rotating_count, - ) - - all_results.extend(results) - - # Find best kernel for this configuration - best = self.find_best_kernel(results) - if best: - key = f"m{m}_n{n}_k{k}_splitk{split_k}" - best_kernels[key] = best - print( - f"Best for {key}: {best['name']} ({best['tflops']:.2f} TFLOPS, {best['bandwidth_gb_s']:.2f} GB/s, {best['time_ms']:.2f}ms)" - ) - - self.results = all_results - return best_kernels - - def export_csv(self, filename: str): - """Export all results to CSV""" - if not self.results: - print("No results to export") - return - - # Get all unique keys from results - all_keys = set() - for result in self.results: - all_keys.update(result.keys()) - - # Sort keys for consistent output - fieldnames = sorted(all_keys) - - with open(filename, "w", newline="") as csvfile: - writer = csv.DictWriter(csvfile, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(self.results) - - print(f"Results exported to {filename}") - - def export_best_kernels(self, best_kernels: Dict, filename: str): - """Export best kernel selections to file""" - with open(filename, "w") as f: - f.write("# Best kernel selections\n") - f.write( - "# Format: problem_size -> kernel_name (TFLOPS, bandwidth, latency)\n\n" - ) - - for key, kernel in sorted(best_kernels.items()): - f.write( - f"{key}: {kernel['name']} ({kernel['tflops']:.2f} TFLOPS, {kernel['bandwidth_gb_s']:.2f} GB/s, {kernel['time_ms']:.2f}ms)\n" - ) - - print(f"Best kernels exported to {filename}") - - def export_json(self, filename: str, best_kernels: Dict = None): - """Export all results and best kernels to JSON with comprehensive metadata""" - from datetime import datetime - - # Calculate comprehensive summary statistics for all metrics - successful_results = [r for r in self.results if r.get("tflops", 0) > 0] - - tflops_values = [r.get("tflops", 0) for r in successful_results] - bandwidth_values = [r.get("bandwidth_gb_s", 0) for r in successful_results] - latency_values = [ - r.get("time_ms", 0) for r in successful_results if r.get("time_ms", 0) > 0 - ] - - # Performance breakdown by kernel type - pipeline_stats = {} - scheduler_stats = {} - data_type_stats = {} - - for result in successful_results: - # 
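Best-kernel selection per problem size, as removed above, is a one-line max/min over the convenience fields; a sketch mirroring the three supported metrics:

```python
from typing import Dict, List, Optional

def find_best_kernel(results: List[Dict], metric: str = "tflops") -> Optional[Dict]:
    """Pick the best result: higher is better for tflops/bandwidth, lower for latency."""
    if not results:
        return None
    if metric == "tflops":
        return max(results, key=lambda r: r.get("tflops", 0))
    if metric == "time_ms":
        return min(results, key=lambda r: r.get("time_ms", float("inf")))
    if metric == "bandwidth_gb_s":
        return max(results, key=lambda r: r.get("bandwidth_gb_s", 0))
    raise ValueError(f"Unknown metric: {metric}")

best = find_best_kernel([{"tflops": 310.0}, {"tflops": 402.5}])
print(best)  # {'tflops': 402.5}
```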
Get config info from the new structure - config = result.get("config", {}) - - # Pipeline statistics - pipeline = config.get("pipeline", "unknown") - if pipeline not in pipeline_stats: - pipeline_stats[pipeline] = { - "count": 0, - "avg_tflops": 0, - "best_tflops": 0, - } - pipeline_stats[pipeline]["count"] += 1 - pipeline_stats[pipeline]["best_tflops"] = max( - pipeline_stats[pipeline]["best_tflops"], result.get("tflops", 0) - ) - - # Scheduler statistics - scheduler = config.get("scheduler", "unknown") - if scheduler not in scheduler_stats: - scheduler_stats[scheduler] = { - "count": 0, - "avg_tflops": 0, - "best_tflops": 0, - } - scheduler_stats[scheduler]["count"] += 1 - scheduler_stats[scheduler]["best_tflops"] = max( - scheduler_stats[scheduler]["best_tflops"], result.get("tflops", 0) - ) + # Load the module dynamically + spec = importlib.util.spec_from_file_location( + "gemm_benchmark", + os.path.join(parent_dir, "gemm_benchmark.py"), + ) + gemm_benchmark_module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(gemm_benchmark_module) - # Data type statistics - data_type = config.get("data_type", "unknown") - if data_type not in data_type_stats: - data_type_stats[data_type] = { - "count": 0, - "avg_tflops": 0, - "best_tflops": 0, - } - data_type_stats[data_type]["count"] += 1 - data_type_stats[data_type]["best_tflops"] = max( - data_type_stats[data_type]["best_tflops"], result.get("tflops", 0) - ) + return gemm_benchmark_module.GemmBenchmark - # Calculate averages for breakdown stats - for stats_dict, field_name in [ - (pipeline_stats, "pipeline"), - (scheduler_stats, "scheduler"), - (data_type_stats, "data_type"), - ]: - for key in stats_dict: - relevant_results = [ - r - for r in successful_results - if r.get("config", {}).get(field_name, "unknown") == key - ] - if relevant_results: - stats_dict[key]["avg_tflops"] = sum( - r.get("tflops", 0) for r in relevant_results - ) / len(relevant_results) +def _import_benchmark_utils(): + """Import benchmark utilities from commons directory.""" + current_dir = os.path.dirname(os.path.abspath(__file__)) + parent_dir = os.path.dirname(os.path.dirname(current_dir)) - output_data = { - "benchmark_metadata": { - "timestamp": datetime.now().isoformat(), - "total_kernels_tested": len(self.results), - "unique_kernels": len( - set(r.get("name", "unknown") for r in self.results) - ), - "successful_runs": len(successful_results), - "failed_runs": len(self.results) - len(successful_results), - }, - "performance_summary": { - "tflops_stats": { - "best": max(tflops_values, default=0), - "average": sum(tflops_values) / len(tflops_values) - if tflops_values - else 0, - "min": min(tflops_values, default=0), - "median": sorted(tflops_values)[len(tflops_values) // 2] - if tflops_values - else 0, - }, - "bandwidth_stats": { - "best_gb_s": max(bandwidth_values, default=0), - "average_gb_s": sum(bandwidth_values) / len(bandwidth_values) - if bandwidth_values - else 0, - "min_gb_s": min(bandwidth_values, default=0), - "median_gb_s": sorted(bandwidth_values)[len(bandwidth_values) // 2] - if bandwidth_values - else 0, - }, - "latency_stats": { - "best_ms": min(latency_values, default=0), - "average_ms": sum(latency_values) / len(latency_values) - if latency_values - else 0, - "max_ms": max(latency_values, default=0), - "median_ms": sorted(latency_values)[len(latency_values) // 2] - if latency_values - else 0, - }, - "kernel_type_breakdown": { - "by_pipeline": pipeline_stats, - "by_scheduler": scheduler_stats, - "by_data_type": data_type_stats, - }, - 
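The new `_import_*` helpers introduced in this file sidestep package imports by loading sibling modules straight from their file paths with `importlib`. A self-contained illustration of the same pattern; the `helpers.py` file referenced in the usage comment is hypothetical and only stands in for `gemm_benchmark.py` / `common/benchmark_utils.py`:

```python
import importlib.util
import os

def load_module_from_path(module_name: str, file_path: str):
    """Load a Python module directly from a file path, bypassing sys.path packaging."""
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load {module_name} from {file_path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module

# Hypothetical usage (assumes a helpers.py next to this script):
#   current_dir = os.path.dirname(os.path.abspath(__file__))
#   helpers = load_module_from_path("helpers", os.path.join(current_dir, "helpers.py"))
```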
"total_problem_configurations": len(best_kernels) - if best_kernels - else 0, - }, - "kernel_results": self.results, - "best_kernels_by_problem": best_kernels or {}, - } + # Load the module dynamically + spec = importlib.util.spec_from_file_location( + "benchmark_utils", + os.path.join(parent_dir, "common", "benchmark_utils.py"), + ) + benchmark_utils = importlib.util.module_from_spec(spec) + spec.loader.exec_module(benchmark_utils) - with open(filename, "w") as f: - json.dump(output_data, f, indent=2) + return benchmark_utils - print(f"JSON results exported to {filename}") - print(f" - Total kernels: {len(self.results)}") - print(f" - Successful runs: {len(successful_results)}") - print(f" - Best TFLOPS: {max(tflops_values, default=0):.2f}") - print(f" - Best bandwidth: {max(bandwidth_values, default=0):.2f} GB/s") - print(f" - Best latency: {min(latency_values, default=0):.2f}ms") +GemmBenchmark = _import_gemm_benchmark() +benchmark_utils = _import_benchmark_utils() +class GemmMultiDBenchmark(GemmBenchmark): + def __init__(self, build_dir: str, verbose: bool = False): + super().__init__(build_dir, verbose, name="benchmark_gemm_multi_d_") def main(): parser = argparse.ArgumentParser( @@ -668,12 +134,12 @@ def main(): print(f"\nBenchmark completed in {elapsed_time:.2f} seconds") # Export results - benchmark.export_csv(args.csv) - benchmark.export_best_kernels(best_kernels, args.best) + benchmark_utils.export_csv(benchmark.results, args.csv) + benchmark_utils.export_best_kernels(best_kernels, args.best) # Export JSON if requested if args.json: - benchmark.export_json(args.json, best_kernels) + benchmark_utils.export_json(benchmark.results, args.json, best_kernels) return 0 diff --git a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark_single.cpp b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark_single.cpp index 41d2f736e1f..767e8eda6ef 100644 --- a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark_single.cpp +++ b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark_single.cpp @@ -11,81 +11,22 @@ #include "ck_tile/core.hpp" #include "ck_tile/host.hpp" +#include "gemm/gemm_common.hpp" #include "gemm_multi_d_profiler.hpp" -#include "gemm_multi_d_common.hpp" // The kernel header is included via the compile command line with -include flag // It defines SelectedKernel struct and KERNEL_NAME -// DataTypeTraits are now defined in gemm_multi_d_common.hpp - -// Create argument parser -inline auto create_args(int argc, char* argv[]) -{ - ck_tile::ArgParser arg_parser; - arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.") - .insert("n", "4096", "The value for n dimension. Default is 4096.") - .insert("k", "2048", "The value for k dimension. Default is 2048.") - .insert("stride_a", "0", "The stride value for tensor A. Default is 0.") - .insert("stride_b", "0", "The stride value for tensor B. Default is 0.") - .insert("stride_ds", "0", "The stride value for tensor Ds . Default is 0.") - .insert("stride_c", "0", "The stride value for tensor C. Default is 0.") - .insert("split_k", "1", "The split value for k dimension. Default is 1.") - .insert("verify", - "1", - "for validation on GPU. Default is 1, validation on CPU, as validation on GPU is " - "not supported.") - .insert("log", - "false", - "Whether output kernel instance information or not. Possible values are true or " - "false. Default is false") - .insert( - "warmup", "50", "The number of iterations before benchmark the kernel. 
Default is 50.") - .insert( - "repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.") - .insert("timer", - "true", - "Whether if the timer is gpu timer or not. Possible values are false or true. " - "Default is true.") - .insert("init", - "0", - "The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 " - "for constant(1). Default is 0, random.") - .insert("flush_cache", - "true", - "To flush cache, possible values are true or false. " - "Default is false.") - .insert("rotating_count", "1000", "number of iterations to rotate the cache. default is 5.") - .insert("metric", - "0", - "Metric with which to measure kernel performance. Set to 0 for latency, 1 for " - "tflops, or 2 for bandwidth. Default is 0, latency.") - .insert("csv_filename", - "", - "The filename of benchmark result. Default is empty (no CSV output).") - .insert("structured_sparsity", - "false", - "Whether use sparsity kernel or not. Possible values are true or false. Default is " - "false") - .insert("json_output", - "false", - "Whether to output results in JSON format only. Possible values are true or false. " - "Default is " - "false"); - - bool result = arg_parser.parse(argc, argv); - return std::make_tuple(result, arg_parser); -} void benchmark_single(const ck_tile::ArgParser& arg_parser) { // Use DataTypeTraits to get the actual type names from the generated header // The generated header defines ADataType, BDataType, AccDataType, CDataType - std::string dtype_a = DataTypeTraits::name; - std::string dtype_b = DataTypeTraits::name; - std::string dtype_acc = DataTypeTraits::name; - std::string dtype_c = DataTypeTraits::name; - std::string dtype_d0 = DataTypeTraits::name; - std::string dtype_d1 = DataTypeTraits::name; + std::string dtype_a = ck_tile::DataTypeTraits::name; + std::string dtype_b = ck_tile::DataTypeTraits::name; + std::string dtype_acc = ck_tile::DataTypeTraits::name; + std::string dtype_c = ck_tile::DataTypeTraits::name; + std::string dtype_d0 = ck_tile::DataTypeTraits::name; + std::string dtype_d1 = ck_tile::DataTypeTraits::name; // Layout names from the layout types std::string layout_a = ALayout::name; @@ -95,26 +36,27 @@ void benchmark_single(const ck_tile::ArgParser& arg_parser) std::string layout_d1 = D1Layout::name; // Create GemmMultiDProblem struct - GemmMultiDProblem gemm_multi_d_problem{arg_parser.get_int("split_k"), - arg_parser.get_int("m"), - arg_parser.get_int("n"), - arg_parser.get_int("k"), - arg_parser.get_int("stride_a"), - arg_parser.get_int("stride_b"), + GemmMultiDProblem gemm_multi_d_problem{GemmProblem{arg_parser.get_int("split_k"), + arg_parser.get_int("m"), + arg_parser.get_int("n"), + arg_parser.get_int("k"), + arg_parser.get_int("stride_a"), + arg_parser.get_int("stride_b"), + arg_parser.get_int("stride_c"), + dtype_a, + dtype_b, + dtype_acc, + dtype_c, + layout_a, + layout_b, + layout_c, + arg_parser.get_bool("structured_sparsity")}, arg_parser.get_int("stride_ds"), arg_parser.get_int("stride_ds"), - arg_parser.get_int("stride_c"), - dtype_a, - dtype_b, dtype_d0, dtype_d1, - dtype_acc, - dtype_c, - layout_a, - layout_b, layout_d0, - layout_d1, - layout_c}; + layout_d1}; // Create Setting struct Setting setting{arg_parser.get_int("warmup"), diff --git a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_common.hpp b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_common.hpp deleted file mode 100644 index 899221547f6..00000000000 --- a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_common.hpp +++ /dev/null @@ -1,100 +0,0 @@ -// 
Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#pragma once - -#include -#include "ck_tile/core.hpp" -#include "ck_tile/host.hpp" -#include "ck_tile/core/numeric/integer.hpp" -#include "ck_tile/core/numeric/pk_int4.hpp" - -//[TODO] This can be moved to commons -// DataTypeTraits for all supported types -template -struct DataTypeTraits; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp32"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp64"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp16"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "bf16"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp8"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "bf8"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "int8"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "int32"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "pk_int4_t"; -}; - -// Helper function to determine if a layout is row-major -template -constexpr auto is_row_major(Layout) -{ - return ck_tile::bool_constant>{}; -} - -// Structure to hold kernel traits for dispatcher -struct KernelTraits -{ - std::string pipeline; // compv3, compv4, mem - std::string scheduler; // intrawave, interwave - std::string epilogue; // cshuffle, default - bool pad_m; - bool pad_n; - bool pad_k; - bool persistent; - - // Constructor with defaults - KernelTraits() - : pipeline("compv3"), - scheduler("intrawave"), - epilogue("cshuffle"), - pad_m(false), - pad_n(false), - pad_k(false), - persistent(false) - { - } -}; diff --git a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_profiler.hpp b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_profiler.hpp index 3a2cdc71fe6..aeac6c984dc 100644 --- a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_profiler.hpp +++ b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_profiler.hpp @@ -6,44 +6,39 @@ #include #include #include +#include +#include +#include +#include #include "ck_tile/host/device_prop.hpp" #include "ck_tile/ops/gemm.hpp" +#include "gemm/gemm_profiler.hpp" +#include "common/utils.hpp" #include "gemm_multi_d_benchmark.hpp" -class GemmMultiDProfiler +class GemmMultiDProfiler : public GemmProfiler> { public: - static GemmMultiDProfiler& instance(Setting setting) + using BaseGemm = GemmProfiler>; + using BaseGemm::benchmark; + + GemmMultiDProfiler(Setting setting) + : GemmProfiler>(setting) { - static GemmMultiDProfiler instance{setting}; - return instance; - } - - // Overload for single kernel benchmarking - void benchmark(GemmMultiDProblem& gemm_multi_d_problem, - std::function&, - const ck_tile::stream_config&)> kernel_func) - { - // Create a vector with a single callable that returns both name and time - std::vector( - ck_tile::GemmMultiDHostArgs&, const ck_tile::stream_config&)>> - callables; - - callables.push_back([kernel_func](ck_tile::GemmMultiDHostArgs& args, - const ck_tile::stream_config& stream) { - float time = kernel_func(args, stream); - return std::make_tuple(std::string(KERNEL_NAME), time); - }); - - benchmark(gemm_multi_d_problem, callables); } void benchmark( GemmMultiDProblem& gemm_multi_d_problem, std::vector( ck_tile::GemmMultiDHostArgs&, const ck_tile::stream_config&)>>& - callables) + callables) override { const ALayout 
layout_a = ALayout{}; const BLayout layout_b = BLayout{}; @@ -165,143 +160,4 @@ class GemmMultiDProfiler kernel_run_result); } } - - void process_result(const GemmMultiDProblem& gemm_multi_d_problem, - ck_tile::DeviceMem& c_m_n_dev_buf, - ck_tile::HostTensor& c_m_n_host_result, - ck_tile::HostTensor& c_m_n_dev_result, - const std::tuple& kernel_run_result) - { - auto [name, avg_time] = kernel_run_result; - - KernelInstance kernel_instance{name, gemm_multi_d_problem, {-1.0f, -1.0f, -1.0f}}; - - // compute performance metric - std::size_t flop = std::size_t(2) * gemm_multi_d_problem.m_ * gemm_multi_d_problem.n_ * - gemm_multi_d_problem.k_; - std::size_t num_byte = - sizeof(ADataType) * gemm_multi_d_problem.m_ * gemm_multi_d_problem.k_ + - sizeof(BDataType) * gemm_multi_d_problem.n_ * gemm_multi_d_problem.k_ + - sizeof(CDataType) * gemm_multi_d_problem.m_ * gemm_multi_d_problem.n_; - - // Dth Dimension Updates - ck_tile::static_for<0, DsDataType::size(), 1>{}([&](auto i) { - num_byte += sizeof(ck_tile::remove_cvref_t>) * - gemm_multi_d_problem.m_ * gemm_multi_d_problem.n_; - flop += sizeof(ck_tile::remove_cvref_t>) * - gemm_multi_d_problem.m_ * gemm_multi_d_problem.n_; - }); - - // update - kernel_instance.perf_result_.latency_ = avg_time; - kernel_instance.perf_result_.tflops_ = static_cast(flop) / 1.E9 / avg_time; - kernel_instance.perf_result_.bandwidth_ = num_byte / 1.E6 / avg_time; - - if(setting_.log_ > 0 && !setting_.json_output_) - { - std::cout << kernel_instance << std::endl; - } - - // verify result - c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); - bool verified_correct = - !setting_.verify_ || compare(name, - gemm_multi_d_problem.k_, - 1, // Multi d currently supports only k_batch = 1 - c_m_n_dev_result, - c_m_n_host_result); - - if(verified_correct) - { - kernel_instances_.emplace_back(kernel_instance); - } - else - { - std::cout << "Verification failed, skip kernel: " << name << std::endl; - } - - // clear tensor - c_m_n_dev_buf.SetZero(); - c_m_n_dev_result.SetZero(); - } - - KernelInstance select_best_instance(Metric metric) - { - if(kernel_instances_.empty()) - throw std::runtime_error("Empty instances"); - - auto kernel_instance = *std::max_element(kernel_instances_.begin(), - kernel_instances_.end(), - [metric](const auto& a, const auto& b) { - return PerformanceResult::compare( - b.perf_result_, a.perf_result_, metric); - }); - - if(setting_.json_output_) - { - // Output clean JSON only - std::cout << kernel_instance << std::endl; - } - else - { - std::cout << "**********************************" << std::endl; - std::cout << "According to given metrics: " << get_metric_name(metric) << "\n" - << "Current kernel performance is: " << kernel_instance << std::endl; - std::cout << "**********************************" << std::endl; - } - - if(!setting_.csv_filename_.empty()) - { - std::ofstream file(setting_.csv_filename_ + ".csv", std::ios::app); - - if(!file.is_open()) - { - std::cerr << "Warning: Failed to open CSV file for writing." 
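The removed `process_result` derived its metrics from the measured average latency in milliseconds: 2·M·N·K flops and the summed byte footprint of the A, B, C (and D) tensors, so `flop / 1e9 / latency_ms` lands in TFLOP/s and `bytes / 1e6 / latency_ms` in GB/s. A small sketch of that arithmetic, with element sizes passed in explicitly rather than taken from the C++ data types:

```python
def gemm_perf_metrics(m: int, n: int, k: int, latency_ms: float,
                      bytes_a: int = 2, bytes_b: int = 2, bytes_c: int = 2,
                      extra_d_bytes: int = 0) -> dict:
    """TFLOP/s and GB/s from an average latency in milliseconds (element sizes in bytes)."""
    flop = 2 * m * n * k
    num_bytes = bytes_a * m * k + bytes_b * n * k + bytes_c * m * n + extra_d_bytes * m * n
    return {
        "latency_ms": latency_ms,
        "tflops": flop / 1.0e9 / latency_ms,                # GFLOP per ms == TFLOP per s
        "bandwidth_gb_s": num_bytes / 1.0e6 / latency_ms,   # MB per ms == GB per s
    }

print(gemm_perf_metrics(3840, 4096, 2048, latency_ms=0.35))
```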
<< std::endl; - } - else - { - if(file.tellp() == 0) - { - file << "rocm_version,device_name," - << "split_k,m,n,k,stride_a,stride_b,stride_c," - << "dtype_a,dtype_b,dtype_acc,dtype_c," << "layout_a,layout_b,layout_c," - << "structured_sparsity," << "name," - << "latency(ms),tflops(TFlops),bandwidth(GB/s),metric\n"; - } - - const auto& problem = kernel_instance.problem_; - const auto& name = kernel_instance.name_; - const auto& perf = kernel_instance.perf_result_; - - file << get_rocm_version() << "," << ck_tile::get_device_name() << "," - << problem.split_k_ << "," << problem.m_ << "," << problem.n_ << "," - << problem.k_ << "," << problem.stride_a_ << "," << problem.stride_b_ << "," - << problem.stride_c_ << "," << problem.dtype_a_ << "," << problem.dtype_b_ - << "," << problem.dtype_acc_ << "," << problem.dtype_c_ << "," - << problem.layout_a_ << "," << problem.layout_b_ << "," << problem.layout_c_ - << "," << name << "," << std::fixed << std::setprecision(4) << perf.latency_ - << "," << std::fixed << std::setprecision(4) << perf.tflops_ << "," - << std::fixed << std::setprecision(4) << perf.bandwidth_ << "," - << get_metric_name(metric) << "\n"; - - if(!file) - { - std::cerr << "Warning: Error occurred while writing to CSV file." << std::endl; - } - } - } - - return kernel_instance; - } - - GemmMultiDProfiler(const GemmMultiDProfiler&) = delete; - GemmMultiDProfiler& operator=(const GemmMultiDProfiler&) = delete; - - private: - ~GemmMultiDProfiler() { kernel_instances_.clear(); } - GemmMultiDProfiler(Setting setting) : setting_(setting) {} - - Setting setting_; - - std::vector kernel_instances_; }; diff --git a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.hpp b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.hpp index 748fe581d35..817ff8e99ec 100644 --- a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.hpp +++ b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.hpp @@ -6,192 +6,25 @@ #include "ck_tile/core.hpp" #include "ck_tile/host.hpp" #include "gemm_preshuffle_common.hpp" - -//[TODO] Move parts of this File to commons -enum class Metric -{ - LATENCY = 0, - TFLOPS = 1, - BANDWIDTH = 2 -}; - -inline constexpr auto get_metric_name(Metric m) -{ - switch(m) - { - case Metric::LATENCY: return "latency"; - case Metric::TFLOPS: return "tflops"; - case Metric::BANDWIDTH: return "bandwidth"; - default: throw std::invalid_argument("Unsupported metric type"); - } -} +#include "gemm/gemm_benchmark.hpp" struct KernelConfig { - std::tuple tile_dims; - std::tuple warp_dims; - std::tuple warp_tile_dims; - bool permuteN; -}; - -struct GemmProblem -{ - int split_k_; - int m_, n_, k_; - int stride_a_, stride_b_, stride_c_; - - std::string dtype_a_, dtype_b_, dtype_acc_, dtype_c_; - std::string layout_a_, layout_b_, layout_c_; + ck_tile::index_t M_Tile; + ck_tile::index_t N_Tile; + ck_tile::index_t K_Tile; - bool structured_sparsity_; + ck_tile::index_t M_Warp; + ck_tile::index_t N_Warp; + ck_tile::index_t K_Warp; - friend std::ostream& operator<<(std::ostream& os, const GemmProblem& problem) - { - os << "{\n" - << " \"split_k\":" << problem.split_k_ << ",\n" - << " \"m\":" << problem.m_ << ",\n" - << " \"n\":" << problem.n_ << ",\n" - << " \"k\":" << problem.k_ << ",\n" - << " \"stride_a\":" << problem.stride_a_ << ",\n" - << " \"stride_b\":" << problem.stride_b_ << ",\n" - << " \"stride_c\":" << problem.stride_c_ << ",\n" - << " \"dtype_a\":\"" << problem.dtype_a_ << "\",\n" - << " \"dtype_b\":\"" << problem.dtype_b_ << "\",\n" - << " 
\"dtype_acc\":\"" << problem.dtype_acc_ << "\",\n" - << " \"dtype_c\":\"" << problem.dtype_c_ << "\",\n" - << " \"layout_a\":\"" << problem.layout_a_ << "\",\n" - << " \"layout_b\":\"" << problem.layout_b_ << "\",\n" - << " \"layout_c\":\"" << problem.layout_c_ << "\",\n" - << " \"structured_sparsity\":" << (problem.structured_sparsity_ ? "true" : "false") - << "\n" - << "}"; - return os; - } -}; - -struct PerformanceResult -{ - double latency_; - double tflops_; - double bandwidth_; - - static bool compare(const PerformanceResult& a, const PerformanceResult& b, Metric m) - { - switch(m) - { - case Metric::LATENCY: return a.latency_ < b.latency_; - case Metric::TFLOPS: return a.tflops_ > b.tflops_; - case Metric::BANDWIDTH: return a.bandwidth_ > b.bandwidth_; - default: throw std::invalid_argument("Unsupported metric type"); - } - } - - friend std::ostream& operator<<(std::ostream& os, const PerformanceResult& result) - { - os << "{\n" - << " \"latency(ms)\": " << std::fixed << std::setprecision(2) << result.latency_ - << ",\n" - << " \"tflops(TFlops)\": " << result.tflops_ << ",\n" - << " \"bandwidth(GB/s)\": " << result.bandwidth_ << "\n" - << "}"; - return os; - } -}; + ck_tile::index_t M_Warp_Tile; + ck_tile::index_t N_Warp_Tile; + ck_tile::index_t K_Warp_Tile; -struct KernelInstance -{ - std::string name_; - GemmProblem problem_; - PerformanceResult perf_result_; - - static bool compare(const KernelInstance& a, const KernelInstance& b, Metric m) - { - return PerformanceResult::compare(a.perf_result_, b.perf_result_, m); - } - - friend std::ostream& operator<<(std::ostream& os, const KernelInstance& obj) - { - os << "{\n" - << " \"name\": \"" << obj.name_ << "\",\n" - << " \"problem\": " << obj.problem_ << ",\n" - << " \"perf_result\": " << obj.perf_result_ << "\n" - << "}"; - return os; - } -}; - -struct Setting -{ - int n_warmup_; - int n_repeat_; - bool is_gpu_timer_; - int verify_; - int init_method_; - bool log_; - std::string csv_filename_; - bool flush_cache_; - int rotating_count_; - bool json_output_; + bool permuteN; }; -inline std::string get_rocm_version() -{ - std::ifstream version_file("/opt/rocm/.info/version"); - if(version_file.is_open()) - { - std::string version; - std::getline(version_file, version); - return version; - } - return "Unknown"; -} - -template -auto calculate_rtol_atol(const ck_tile::index_t K, - const ck_tile::index_t kbatch, - const float max_accumulated_value) -{ - using ComputeType = - std::conditional_t; - // Calculate thresholds - const auto rtol = ck_tile::get_relative_threshold( - ck_tile::integer_divide_ceil(K, kbatch)); - const auto atol = ck_tile::get_absolute_threshold( - max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch)); - // Calculate error due to split_k accumulation - const auto rtol_split_k = - ck_tile::get_relative_threshold(kbatch); - const auto atol_split_k = ck_tile::get_absolute_threshold( - max_accumulated_value, kbatch); - // Use higher threshold - return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k)); -} - -/// @brief Function to compare the results of the device and host computations -bool compare(std::string instanceName, - ck_tile::index_t K, - ck_tile::index_t kbatch, - ck_tile::HostTensor& c_m_n_dev_result, - ck_tile::HostTensor& c_m_n_ref) -{ - const float max_accumulated_value = - *std::max_element(c_m_n_ref.mData.begin(), c_m_n_ref.mData.end()); - const auto rtol_atol = calculate_rtol_atol( - K, kbatch, max_accumulated_value); - bool pass = 
ck_tile::check_err(c_m_n_dev_result, - c_m_n_ref, - "Error: Incorrect results!", - rtol_atol.at(ck_tile::number<0>{}), - rtol_atol.at(ck_tile::number<1>{})); - - std::cout << "For " << instanceName << " Relative error threshold is " - << rtol_atol.at(ck_tile::number<0>{}) << " Absolute error threshold is " - << rtol_atol.at(ck_tile::number<1>{}) << std::endl; - std::cout << "The verification result is:" << (pass ? "correct" : "fail") << std::endl; - - return pass; -} - /// @brief Function to get the kernel output with reference implementation on CPU/GPU void gemm_host_reference(int verify, ck_tile::HostTensor& a_m_k, diff --git a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py index 53ae6336faf..935de186d66 100644 --- a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py +++ b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py @@ -1,588 +1,54 @@ # Copyright (c) Advanced Micro Devices, Inc., or its affiliates. # SPDX-License-Identifier: MIT +import os import sys import json import subprocess import argparse import csv import time +import importlib.util from pathlib import Path from typing import List, Dict, Tuple, Optional -class GemmPreshuffleBenchmark: - def __init__(self, build_dir: str, verbose: bool = False): - self.build_dir = Path(build_dir) - self.verbose = verbose - self.results = [] - - def discover_kernels(self) -> List[Path]: - """Find all benchmark_gemm_preshuffle* executables in the build directory""" - bin_dir = self.build_dir / "bin" - if not bin_dir.exists(): - print(f"Error: Binary directory {bin_dir} does not exist") - return [] - - kernels = list(bin_dir.glob("benchmark_gemm_preshuffle*")) - if self.verbose: - print(f"Found {len(kernels)} kernel executables") - for k in kernels: - print(f" - {k.name}") - return kernels - - def extract_kernel_info(self, kernel_path: Path) -> Dict[str, str]: - """Extract comprehensive kernel information from filename""" - name = kernel_path.stem - - # Initialize with basic info - info = { - "executable": str(kernel_path), - "name": name, - "data_type": "unknown", - "layout": "unknown", - "pipeline": "unknown", - "scheduler": "unknown", - "epilogue": "unknown", - } - - # Parse the kernel name pattern: - # benchmark_gemm_preshuffle_fp16_rcr_mem_default_intrawave_False_False_False_False_False_256x256x32_2x2x1_4x64x16 - parts = name.split("_") - - if len(parts) >= 4: - # Extract data type (4rd part after benchmark_gemm_preshuffle_) - info["data_type"] = parts[3] if len(parts) > 2 else "unknown" - - # Extract layout (5th part) - info["layout"] = parts[4] if len(parts) > 3 else "unknown" - - # Extract pipeline (6th part) - info["pipeline"] = parts[5] if len(parts) > 4 else "unknown" - - # Extract epilogue (7th part) - info["epilogue"] = parts[6] if len(parts) > 5 else "unknown" - - # Extract scheduler (8th part) - info["scheduler"] = parts[7] if len(parts) > 6 else "unknown" - - # Extract detailed configuration from the end of the name - config_info = self.parse_detailed_config(name) - info.update(config_info) - - # Generate config ID - info["config_id"] = self.generate_config_id(info) - - return info - - def parse_detailed_config(self, kernel_name: str) -> Dict: - """Parse detailed configuration from kernel name""" - config = { - "tile_sizes": {"tile_m": 0, "tile_n": 0, "tile_k": 0}, - "warp_config": {"warp_m": 0, "warp_n": 0, "warp_k": 0}, - "warp_tile": {"warp_tile_m": 0, "warp_tile_n": 0, "warp_tile_k": 0}, - 
"optimization_flags": { - "pad_m": False, - "pad_n": False, - "pad_k": False, - "persistent": False, - }, - } - - # Split by underscore and look for patterns - parts = kernel_name.split("_") - - # Look for boolean flags (sequence of True/False values) - bool_sequence = [] - for i, part in enumerate(parts): - if part in ["True", "False"]: - bool_sequence.append(part == "True") - # Continue collecting consecutive boolean values - j = i + 1 - while j < len(parts) and parts[j] in ["True", "False"]: - bool_sequence.append(parts[j] == "True") - j += 1 - break - - # Assign boolean flags if we found them - # Order: pad_m, pad_n, pad_k, persistent (4 flags total) - if len(bool_sequence) >= 4: - config["optimization_flags"]["pad_m"] = bool_sequence[0] - config["optimization_flags"]["pad_n"] = bool_sequence[1] - config["optimization_flags"]["pad_k"] = bool_sequence[2] - config["optimization_flags"]["persistent"] = bool_sequence[3] - - # Look for tile size patterns (e.g., 256x256x32_2x2x1_4x64x16) - # The pattern is: tile_sizes_warp_config_warp_tile - dimension_groups = [] - for part in parts: - if "x" in part and len(part.split("x")) == 3: - try: - dims = [int(x) for x in part.split("x")] - if all(d > 0 for d in dims): - dimension_groups.append(dims) - except ValueError: - continue - - # Assign dimensions based on order and magnitude - if len(dimension_groups) >= 3: - # Sort by magnitude to identify: largest=tile_sizes, smallest=warp_config, middle=warp_tile - sorted_groups = sorted(dimension_groups, key=lambda x: max(x), reverse=True) - - # Largest dimensions = tile sizes - config["tile_sizes"]["tile_m"] = sorted_groups[0][0] - config["tile_sizes"]["tile_n"] = sorted_groups[0][1] - config["tile_sizes"]["tile_k"] = sorted_groups[0][2] - - # Smallest dimensions = warp config - config["warp_config"]["warp_m"] = sorted_groups[2][0] - config["warp_config"]["warp_n"] = sorted_groups[2][1] - config["warp_config"]["warp_k"] = sorted_groups[2][2] - - # Middle dimensions = warp tile - config["warp_tile"]["warp_tile_m"] = sorted_groups[1][0] - config["warp_tile"]["warp_tile_n"] = sorted_groups[1][1] - config["warp_tile"]["warp_tile_k"] = sorted_groups[1][2] - elif len(dimension_groups) == 2: - # If only 2 groups, assign based on magnitude - sorted_groups = sorted(dimension_groups, key=lambda x: max(x), reverse=True) - - # Larger = tile sizes - config["tile_sizes"]["tile_m"] = sorted_groups[0][0] - config["tile_sizes"]["tile_n"] = sorted_groups[0][1] - config["tile_sizes"]["tile_k"] = sorted_groups[0][2] - - # Smaller = warp config - config["warp_config"]["warp_m"] = sorted_groups[1][0] - config["warp_config"]["warp_n"] = sorted_groups[1][1] - config["warp_config"]["warp_k"] = sorted_groups[1][2] - elif len(dimension_groups) == 1: - # Only one group - assume it's tile sizes - config["tile_sizes"]["tile_m"] = dimension_groups[0][0] - config["tile_sizes"]["tile_n"] = dimension_groups[0][1] - config["tile_sizes"]["tile_k"] = dimension_groups[0][2] - - return config - - def generate_config_id(self, info: Dict) -> str: - """Generate a compact config ID from kernel info""" - # Create a compact identifier - parts = [ - info.get("data_type", "unk"), - info.get("layout", "unk"), - info.get("pipeline", "unk"), - info.get("scheduler", "unk"), - ] - - # Add tile configuration if available - tile_sizes = info.get("tile_sizes", {}) - if tile_sizes.get("tile_m", 0) > 0: - tile_str = ( - f"{tile_sizes['tile_m']}x{tile_sizes['tile_n']}x{tile_sizes['tile_k']}" - ) - parts.append(tile_str) - - # Add warp config if available - 
warp_config = info.get("warp_config", {}) - if warp_config.get("warp_m", 0) > 0: - warp_str = f"w{warp_config['warp_m']}x{warp_config['warp_n']}x{warp_config['warp_k']}" - parts.append(warp_str) - - # Add warp tile if available - warp_tile = info.get("warp_tile", {}) - if warp_tile.get("warp_tile_m", 0) > 0: - warp_tile_str = f"wt{warp_tile['warp_tile_m']}x{warp_tile['warp_tile_n']}x{warp_tile['warp_tile_k']}" - parts.append(warp_tile_str) - - return "_".join(parts) - - def run_kernel(self, kernel_path: Path, params: Dict[str, str]) -> Optional[Dict]: - """Run a single kernel with given parameters and save output to individual JSON file""" - # Create results directory - results_dir = self.build_dir / "results" - results_dir.mkdir(exist_ok=True) - - # Generate unique JSON filename for this kernel - json_file = results_dir / f"{kernel_path.stem}.json" - - cmd = [str(kernel_path)] - - # Add parameters - for key, value in params.items(): - cmd.append(f"-{key}={value}") - - # Add JSON output flag for clean JSON output - cmd.append("-json_output=true") - - if self.verbose: - print(f"Running: {' '.join(cmd)}") - - try: - result = subprocess.run(cmd, capture_output=True, text=True, timeout=60) - - if result.returncode != 0: - print(f"Error running {kernel_path.name}: {result.stderr}") - return None - - # Save raw output to individual JSON file - output = result.stdout.strip() - - if output: - with open(json_file, "w") as f: - f.write(output) +def _import_gemm_benchmark(): + """Import validation utilities from commons directory.""" + current_dir = os.path.dirname(os.path.abspath(__file__)) + parent_dir = os.path.dirname(current_dir) - # Parse the JSON file - return self.parse_json_file(json_file) - else: - print(f"No output from {kernel_path.name}") - return None - - except subprocess.TimeoutExpired: - print(f"Timeout running {kernel_path.name}") - return None - except Exception as e: - print(f"Error running {kernel_path.name}: {e}") - return None - - def parse_json_file(self, json_file: Path) -> Optional[Dict]: - """Parse JSON data from individual kernel output file""" - try: - with open(json_file, "r") as f: - content = f.read().strip() - - # Parse the JSON directly since executables produce clean JSON - data = json.loads(content) - - # Return the complete JSON data as-is, just add some convenience fields - result = data.copy() - if "perf_result" in data: - perf = data["perf_result"] - # Add convenience fields for backward compatibility - result["time_ms"] = perf.get("latency(ms)", 0) - result["tflops"] = perf.get("tflops(TFlops)", 0) - result["bandwidth_gb_s"] = perf.get("bandwidth(GB/s)", 0) - - return result - - except json.JSONDecodeError as e: - if self.verbose: - print(f"Failed to parse JSON from {json_file}: {e}") - return None - except Exception as e: - if self.verbose: - print(f"Error reading JSON file {json_file}: {e}") - return None - - def benchmark_problem_size( - self, - kernels: List[Path], - m: int, - n: int, - k: int, - split_k: int = 1, - verify: int = 0, - warmup: int = 50, - repeat: int = 100, - flush_cache: bool = True, - rotating_count: int = 1000, - ) -> List[Dict]: - """Benchmark all kernels for a specific problem size""" - results = [] - - params = { - "m": m, - "n": n, - "k": k, - "split_k": split_k, - "verify": verify, - "warmup": warmup, - "repeat": repeat, - "flush_cache": str(flush_cache).lower(), - "rotating_count": rotating_count, - } - - print(f"\nBenchmarking M={m}, N={n}, K={k}, split_k={split_k}") - - for kernel_path in kernels: - kernel_info = 
self.extract_kernel_info(kernel_path) - result = self.run_kernel(kernel_path, params) - - if result: - # Create new structured result format - structured_result = { - "name": kernel_info["name"], # Add name field for compatibility - "config_id": kernel_info["config_id"], - "problem": result.get("problem", {}), - "perf_result": result.get("perf_result", {}), - "config": { - "data_type": kernel_info["data_type"], - "layout": kernel_info["layout"], - "pipeline": kernel_info["pipeline"], - "scheduler": kernel_info["scheduler"], - "epilogue": kernel_info["epilogue"], - "tile_sizes": kernel_info.get("tile_sizes", {}), - "warp_config": kernel_info.get("warp_config", {}), - "warp_tile": kernel_info.get("warp_tile", {}), - "optimization_flags": kernel_info.get("optimization_flags", {}), - }, - "executable": kernel_info["executable"], - # Keep backward compatibility fields - "time_ms": result.get("time_ms", 0), - "tflops": result.get("tflops", 0), - "bandwidth_gb_s": result.get("bandwidth_gb_s", 0), - } - - results.append(structured_result) - - if self.verbose: - print( - f" {kernel_info['config_id']}: {structured_result['tflops']:.2f} TFLOPS, {structured_result['bandwidth_gb_s']:.2f} GB/s, {structured_result['time_ms']:.2f}ms" - ) - - return results - - def find_best_kernel( - self, results: List[Dict], metric: str = "tflops" - ) -> Optional[Dict]: - """Find the best performing kernel based on metric""" - if not results: - return None - - if metric == "tflops": - return max(results, key=lambda x: x.get("tflops", 0)) - elif metric == "time_ms": - return min(results, key=lambda x: x.get("time_ms", float("inf"))) - elif metric == "bandwidth_gb_s": - return max(results, key=lambda x: x.get("bandwidth_gb_s", 0)) - else: - raise ValueError(f"Unknown metric: {metric}") - - def benchmark_sweep( - self, - problem_sizes: List[Tuple[int, int, int]], - split_k_values: List[int] = [1], - verify: bool = False, - warmup: int = 50, - repeat: int = 100, - flush_cache: bool = True, - rotating_count: int = 1000, - ) -> Dict: - """Run comprehensive benchmark sweep""" - kernels = self.discover_kernels() - if not kernels: - print("No kernels found!") - return {} - - all_results = [] - best_kernels = {} - - for m, n, k in problem_sizes: - for split_k in split_k_values: - results = self.benchmark_problem_size( - kernels, - m, - n, - k, - split_k, - verify=2 if verify else 0, - warmup=warmup, - repeat=repeat, - flush_cache=flush_cache, - rotating_count=rotating_count, - ) - - all_results.extend(results) - - # Find best kernel for this configuration - best = self.find_best_kernel(results) - if best: - key = f"m{m}_n{n}_k{k}_splitk{split_k}" - best_kernels[key] = best - print( - f"Best for {key}: {best['name']} ({best['tflops']:.2f} TFLOPS, {best['bandwidth_gb_s']:.2f} GB/s, {best['time_ms']:.2f}ms)" - ) - - self.results = all_results - return best_kernels - - def export_csv(self, filename: str): - """Export all results to CSV""" - if not self.results: - print("No results to export") - return - - # Get all unique keys from results - all_keys = set() - for result in self.results: - all_keys.update(result.keys()) - - # Sort keys for consistent output - fieldnames = sorted(all_keys) - - with open(filename, "w", newline="") as csvfile: - writer = csv.DictWriter(csvfile, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(self.results) - - print(f"Results exported to {filename}") - - def export_best_kernels(self, best_kernels: Dict, filename: str): - """Export best kernel selections to file""" - with open(filename, 
"w") as f: - f.write("# Best kernel selections\n") - f.write( - "# Format: problem_size -> kernel_name (TFLOPS, bandwidth, latency)\n\n" - ) - - for key, kernel in sorted(best_kernels.items()): - f.write( - f"{key}: {kernel['name']} ({kernel['tflops']:.2f} TFLOPS, {kernel['bandwidth_gb_s']:.2f} GB/s, {kernel['time_ms']:.2f}ms)\n" - ) - - print(f"Best kernels exported to {filename}") - - def export_json(self, filename: str, best_kernels: Dict = None): - """Export all results and best kernels to JSON with comprehensive metadata""" - from datetime import datetime - - # Calculate comprehensive summary statistics for all metrics - successful_results = [r for r in self.results if r.get("tflops", 0) > 0] - - tflops_values = [r.get("tflops", 0) for r in successful_results] - bandwidth_values = [r.get("bandwidth_gb_s", 0) for r in successful_results] - latency_values = [ - r.get("time_ms", 0) for r in successful_results if r.get("time_ms", 0) > 0 - ] - - # Performance breakdown by kernel type - pipeline_stats = {} - scheduler_stats = {} - data_type_stats = {} - - for result in successful_results: - # Get config info from the new structure - config = result.get("config", {}) - - # Pipeline statistics - pipeline = config.get("pipeline", "unknown") - if pipeline not in pipeline_stats: - pipeline_stats[pipeline] = { - "count": 0, - "avg_tflops": 0, - "best_tflops": 0, - } - pipeline_stats[pipeline]["count"] += 1 - pipeline_stats[pipeline]["best_tflops"] = max( - pipeline_stats[pipeline]["best_tflops"], result.get("tflops", 0) - ) - - # Scheduler statistics - scheduler = config.get("scheduler", "unknown") - if scheduler not in scheduler_stats: - scheduler_stats[scheduler] = { - "count": 0, - "avg_tflops": 0, - "best_tflops": 0, - } - scheduler_stats[scheduler]["count"] += 1 - scheduler_stats[scheduler]["best_tflops"] = max( - scheduler_stats[scheduler]["best_tflops"], result.get("tflops", 0) - ) + # Load the module dynamically + spec = importlib.util.spec_from_file_location( + "gemm_benchmark", + os.path.join(parent_dir, "gemm_benchmark.py"), + ) + gemm_benchmark_module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(gemm_benchmark_module) - # Data type statistics - data_type = config.get("data_type", "unknown") - if data_type not in data_type_stats: - data_type_stats[data_type] = { - "count": 0, - "avg_tflops": 0, - "best_tflops": 0, - } - data_type_stats[data_type]["count"] += 1 - data_type_stats[data_type]["best_tflops"] = max( - data_type_stats[data_type]["best_tflops"], result.get("tflops", 0) - ) + return gemm_benchmark_module.GemmBenchmark - # Calculate averages for breakdown stats - for stats_dict, field_name in [ - (pipeline_stats, "pipeline"), - (scheduler_stats, "scheduler"), - (data_type_stats, "data_type"), - ]: - for key in stats_dict: - relevant_results = [ - r - for r in successful_results - if r.get("config", {}).get(field_name, "unknown") == key - ] - if relevant_results: - stats_dict[key]["avg_tflops"] = sum( - r.get("tflops", 0) for r in relevant_results - ) / len(relevant_results) +def _import_benchmark_utils(): + """Import benchmark utilities from commons directory.""" + current_dir = os.path.dirname(os.path.abspath(__file__)) + parent_dir = os.path.dirname(os.path.dirname(current_dir)) - output_data = { - "benchmark_metadata": { - "timestamp": datetime.now().isoformat(), - "total_kernels_tested": len(self.results), - "unique_kernels": len( - set(r.get("name", "unknown") for r in self.results) - ), - "successful_runs": len(successful_results), - "failed_runs": 
len(self.results) - len(successful_results), - }, - "performance_summary": { - "tflops_stats": { - "best": max(tflops_values, default=0), - "average": sum(tflops_values) / len(tflops_values) - if tflops_values - else 0, - "min": min(tflops_values, default=0), - "median": sorted(tflops_values)[len(tflops_values) // 2] - if tflops_values - else 0, - }, - "bandwidth_stats": { - "best_gb_s": max(bandwidth_values, default=0), - "average_gb_s": sum(bandwidth_values) / len(bandwidth_values) - if bandwidth_values - else 0, - "min_gb_s": min(bandwidth_values, default=0), - "median_gb_s": sorted(bandwidth_values)[len(bandwidth_values) // 2] - if bandwidth_values - else 0, - }, - "latency_stats": { - "best_ms": min(latency_values, default=0), - "average_ms": sum(latency_values) / len(latency_values) - if latency_values - else 0, - "max_ms": max(latency_values, default=0), - "median_ms": sorted(latency_values)[len(latency_values) // 2] - if latency_values - else 0, - }, - "kernel_type_breakdown": { - "by_pipeline": pipeline_stats, - "by_scheduler": scheduler_stats, - "by_data_type": data_type_stats, - }, - "total_problem_configurations": len(best_kernels) - if best_kernels - else 0, - }, - "kernel_results": self.results, - "best_kernels_by_problem": best_kernels or {}, - } + # Load the module dynamically + spec = importlib.util.spec_from_file_location( + "benchmark_utils", + os.path.join(parent_dir, "common", "benchmark_utils.py"), + ) + benchmark_utils = importlib.util.module_from_spec(spec) + spec.loader.exec_module(benchmark_utils) - with open(filename, "w") as f: - json.dump(output_data, f, indent=2) + return benchmark_utils - print(f"JSON results exported to {filename}") - print(f" - Total kernels: {len(self.results)}") - print(f" - Successful runs: {len(successful_results)}") - print(f" - Best TFLOPS: {max(tflops_values, default=0):.2f}") - print(f" - Best bandwidth: {max(bandwidth_values, default=0):.2f} GB/s") - print(f" - Best latency: {min(latency_values, default=0):.2f}ms") +GemmBenchmark = _import_gemm_benchmark() +benchmark_utils = _import_benchmark_utils() +class GemmPreshuffleBenchmark(GemmBenchmark): + def __init__(self, build_dir: str, verbose: bool = False): + super().__init__(build_dir, verbose, name="benchmark_gemm_preshuffle_") def main(): parser = argparse.ArgumentParser( @@ -669,12 +135,12 @@ def main(): print(f"\nBenchmark completed in {elapsed_time:.2f} seconds") # Export results - benchmark.export_csv(args.csv) - benchmark.export_best_kernels(best_kernels, args.best) + benchmark_utils.export_csv(benchmark.results, args.csv) + benchmark_utils.export_best_kernels(best_kernels, args.best) # Export JSON if requested if args.json: - benchmark.export_json(args.json, best_kernels) + benchmark_utils.export_json(benchmark.results, args.json, best_kernels) return 0 diff --git a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark_single.cpp b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark_single.cpp index 4fbb25f0c90..d03b35f2b46 100644 --- a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark_single.cpp +++ b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark_single.cpp @@ -11,78 +11,21 @@ #include "ck_tile/core.hpp" #include "ck_tile/host.hpp" +#include "gemm/gemm_common.hpp" #include "gemm_preshuffle_profiler.hpp" #include "gemm_preshuffle_common.hpp" // The kernel header is included via the compile command line with -include flag // It defines SelectedKernel struct and KERNEL_NAME -// DataTypeTraits are now defined in gemm_common.hpp 
- -// Create argument parser -inline auto create_args(int argc, char* argv[]) -{ - ck_tile::ArgParser arg_parser; - arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.") - .insert("n", "4096", "The value for n dimension. Default is 4096.") - .insert("k", "2048", "The value for k dimension. Default is 2048.") - .insert("stride_a", "0", "The stride value for tensor A. Default is 0.") - .insert("stride_b", "0", "The stride value for tensor B. Default is 0.") - .insert("stride_c", "0", "The stride value for tensor C. Default is 0.") - .insert("split_k", "1", "The split value for k dimension. Default is 1.") - .insert("verify", - "2", - "The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 " - "for validation on GPU. Default is 0, no validation.") - .insert("log", - "false", - "Whether output kernel instance information or not. Possible values are true or " - "false. Default is false") - .insert( - "warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.") - .insert( - "repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.") - .insert("timer", - "true", - "Whether if the timer is gpu timer or not. Possible values are false or true. " - "Default is true.") - .insert("init", - "0", - "The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 " - "for constant(1). Default is 0, random.") - .insert("flush_cache", - "true", - "To flush cache, possible values are true or false. " - "Default is false.") - .insert("rotating_count", "1000", "number of iterations to rotate the cache. default is 5.") - .insert("metric", - "0", - "Metric with which to measure kernel performance. Set to 0 for latency, 1 for " - "tflops, or 2 for bandwidth. Default is 0, latency.") - .insert("csv_filename", - "", - "The filename of benchmark result. Default is empty (no CSV output).") - .insert("structured_sparsity", - "false", - "Whether use sparsity kernel or not. Possible values are true or false. Default is " - "false") - .insert("json_output", - "false", - "Whether to output results in JSON format only. Possible values are true or false. 
" - "Default is " - "false"); - - bool result = arg_parser.parse(argc, argv); - return std::make_tuple(result, arg_parser); -} void benchmark_single(const ck_tile::ArgParser& arg_parser) { // Use DataTypeTraits to get the actual type names from the generated header // The generated header defines ADataType, BDataType, AccDataType, CDataType - std::string dtype_a = DataTypeTraits::name; - std::string dtype_b = DataTypeTraits::name; - std::string dtype_acc = DataTypeTraits::name; - std::string dtype_c = DataTypeTraits::name; + std::string dtype_a = ck_tile::DataTypeTraits::name; + std::string dtype_b = ck_tile::DataTypeTraits::name; + std::string dtype_acc = ck_tile::DataTypeTraits::name; + std::string dtype_c = ck_tile::DataTypeTraits::name; // Layout names from the layout types std::string layout_a = ALayout::name; @@ -119,29 +62,17 @@ void benchmark_single(const ck_tile::ArgParser& arg_parser) arg_parser.get_bool("json_output")}; // Get the profiler instance - auto& profiler = GemmProfiler::instance(setting); + auto& profiler = GemmPreshuffleProfiler::instance(setting); try { - // Create a lambda that wraps the kernel launch - std::tuple warp_tile_dims = std::make_tuple( - SelectedKernel::WarpTileM, SelectedKernel::WarpTileN, SelectedKernel::WarpTileK); - std::tuple tile_dims = - std::make_tuple(SelectedKernel::TileM, SelectedKernel::TileN, SelectedKernel::TileK); - std::tuple warp_dims = std::make_tuple(SelectedKernel::WarpPerBlock_M, - SelectedKernel::WarpPerBlock_N, - SelectedKernel::WarpPerBlock_K); - bool permuteN = SelectedKernel::PermuteN; - - KernelConfig config{tile_dims, warp_dims, warp_tile_dims, permuteN}; - auto kernel_func = [](const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& stream) { return SelectedKernel::launch(args, stream); }; // Benchmark the kernel - profiler.benchmark(gemm_problem, kernel_func, config); + profiler.benchmark(gemm_problem, kernel_func); // Select best instance based on metric profiler.select_best_instance(static_cast(arg_parser.get_int("metric"))); diff --git a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_common.hpp b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_common.hpp index 1b2cfe37350..21cda28f754 100644 --- a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_common.hpp +++ b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_common.hpp @@ -8,101 +8,20 @@ #include "ck_tile/host.hpp" #include "ck_tile/core/numeric/integer.hpp" #include "ck_tile/core/numeric/pk_int4.hpp" - -//[TODO] This can be moved to commons -// DataTypeTraits for all supported types -template -struct DataTypeTraits; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp32"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp64"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp16"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "bf16"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp8"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "bf8"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "int8"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "int32"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "pk_int4_t"; -}; - -// Helper function to determine if a layout is row-major -template -constexpr auto is_row_major(Layout) -{ - return 
ck_tile::bool_constant>{}; -} +#include "gemm/gemm_common.hpp" // Structure to hold kernel traits for dispatcher -struct KernelTraits +struct PreshuffleKernelTraits : KernelTraits { - std::string pipeline; // preshufflev2 - std::string scheduler; // intrawave, interwave, default - std::string epilogue; // cshuffle, default - bool pad_m; - bool pad_n; - bool pad_k; - bool persistent; // Constructor with defaults - KernelTraits() - : pipeline("preshufflev2"), - scheduler("default"), - epilogue("default"), - pad_m(false), - pad_n(false), - pad_k(false), - persistent(false) - { - } + PreshuffleKernelTraits() : KernelTraits() { this->pipeline = "preshufflev2"; } }; // Helper to extract traits from kernel name -inline KernelTraits extract_traits_from_name(const std::string& kernel_name) +inline PreshuffleKernelTraits extract_traits_from_name(const std::string& kernel_name) { - KernelTraits traits; + PreshuffleKernelTraits traits; // Extract pipeline if(kernel_name.find("preshufflev2") != std::string::npos) @@ -140,42 +59,3 @@ inline KernelTraits extract_traits_from_name(const std::string& kernel_name) return traits; } - -template -auto shuffle_b(const ck_tile::HostTensor& t, - ck_tile::index_t N_Warp_Tile, - ck_tile::index_t K_Warp_Tile) -{ - assert(t.get_lengths().size() == 2); - int n_ = t.get_lengths()[1]; - int k_ = t.get_lengths()[0]; - int divisor = N_Warp_Tile == 32 ? 2 : 4; - ck_tile::HostTensor t_view( - {n_ / N_Warp_Tile, N_Warp_Tile, k_ / K_Warp_Tile, divisor, K_Warp_Tile / divisor}); - std::copy(t.begin(), t.end(), t_view.begin()); - return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4}); -} - -template -auto shuffle_b_permuteN(const ck_tile::HostTensor& t, - ck_tile::index_t N_Warp_Tile, - ck_tile::index_t K_Warp_Tile, - ck_tile::index_t N_Tile, - ck_tile::index_t N_Warp) -{ - assert(t.get_lengths().size() == 2); - - int n_ = t.get_lengths()[1]; - int k_ = t.get_lengths()[0]; - int divisor = N_Warp_Tile == 32 ? 
2 : 4; - int NRepeat = N_Tile / N_Warp_Tile / N_Warp; - ck_tile::HostTensor t_view({n_ / N_Tile, - N_Warp, - N_Warp_Tile, - NRepeat, - k_ / K_Warp_Tile, - divisor, - K_Warp_Tile / divisor}); - std::copy(t.begin(), t.end(), t_view.begin()); - return ck_tile::reference_permute(t_view, {0, 3, 1, 4, 5, 2, 6}); -} diff --git a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_profiler.hpp b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_profiler.hpp index 739bd7e677a..e7af0738776 100644 --- a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_profiler.hpp +++ b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_profiler.hpp @@ -4,42 +4,26 @@ #pragma once #include "ck_tile/host/device_prop.hpp" +#include "ck_tile/host/tensor_shuffle_utils.hpp" #include "ck_tile/ops/gemm.hpp" +#include "gemm/gemm_profiler.hpp" #include "gemm_preshuffle_benchmark.hpp" -class GemmProfiler +class GemmPreshuffleProfiler + : public GemmProfiler { public: - static GemmProfiler& instance(Setting setting) - { - static GemmProfiler instance{setting}; - return instance; - } + using BaseGemm = GemmProfiler; + using BaseGemm::benchmark; - // Overload for single kernel benchmarking - void benchmark(GemmProblem& gemm_problem, - std::function - kernel_func, - KernelConfig& config) + GemmPreshuffleProfiler(Setting setting) + : GemmProfiler(setting) { - // Create a vector with a single callable that returns both name and time - std::vector(ck_tile::GemmHostArgs&, - const ck_tile::stream_config&)>> - callables; - - callables.push_back( - [kernel_func](ck_tile::GemmHostArgs& args, const ck_tile::stream_config& stream) { - float time = kernel_func(args, stream); - return std::make_tuple(std::string(KERNEL_NAME), time); - }); - - benchmark(gemm_problem, callables, config); } void benchmark(GemmProblem& gemm_problem, std::vector( - ck_tile::GemmHostArgs&, const ck_tile::stream_config&)>>& callables, - KernelConfig& config) + ck_tile::GemmHostArgs&, const ck_tile::stream_config&)>>& callables) override { const ALayout layout_a = ALayout{}; const BLayout layout_b = BLayout{}; @@ -111,21 +95,28 @@ class GemmProfiler c_m_n_dev_buf.SetZero(); c_m_n_dev_result.SetZero(); + // Create a lambda that wraps the kernel launch + KernelConfig config{SelectedKernel::WarpTileM, + SelectedKernel::WarpTileN, + SelectedKernel::WarpTileK, + SelectedKernel::TileM, + SelectedKernel::TileN, + SelectedKernel::TileK, + SelectedKernel::WarpPerBlock_M, + SelectedKernel::WarpPerBlock_N, + SelectedKernel::WarpPerBlock_K, + SelectedKernel::PermuteN}; + for(const auto& callable : callables) { - ck_tile::index_t N_Warp_Tile = std::get<1>(config.warp_tile_dims); - ck_tile::index_t K_Warp_Tile = std::get<2>(config.warp_tile_dims); - ck_tile::index_t N_Tile = std::get<1>(config.tile_dims); - ck_tile::index_t N_Warp = std::get<1>(config.warp_dims); - ck_tile::HostTensor b_shuffle_host = [&]() { if(config.permuteN) { - return shuffle_b_permuteN(b_k_n, N_Warp_Tile, K_Warp_Tile, N_Tile, N_Warp); + return ck_tile::shuffle_b_permuteN(b_k_n, config); } else { - return shuffle_b(b_k_n, N_Warp_Tile, K_Warp_Tile); + return ck_tile::shuffle_b(b_k_n, config); } }(); @@ -158,132 +149,4 @@ class GemmProfiler gemm_problem, c_m_n_dev_buf, c_m_n_ref, c_m_n_dev_result, kernel_run_result); } } - - void process_result(const GemmProblem& gemm_problem, - ck_tile::DeviceMem& c_m_n_dev_buf, - ck_tile::HostTensor& c_m_n_ref, - ck_tile::HostTensor& c_m_n_dev_result, - const std::tuple& kernel_run_result) - { - auto [name, avg_time] = kernel_run_result; - - KernelInstance 
kernel_instance{name, gemm_problem, {-1.0f, -1.0f, -1.0f}}; - - // compute performance metric - std::size_t flop = std::size_t(2) * gemm_problem.m_ * gemm_problem.n_ * gemm_problem.k_; - std::size_t num_byte = sizeof(ADataType) * gemm_problem.m_ * gemm_problem.k_ + - sizeof(BDataType) * gemm_problem.n_ * gemm_problem.k_ + - sizeof(CDataType) * gemm_problem.m_ * gemm_problem.n_; - - // update - kernel_instance.perf_result_.latency_ = avg_time; - kernel_instance.perf_result_.tflops_ = static_cast(flop) / 1.E9 / avg_time; - kernel_instance.perf_result_.bandwidth_ = num_byte / 1.E6 / avg_time; - - if(setting_.log_ > 0 && !setting_.json_output_) - { - std::cout << kernel_instance << std::endl; - } - - // verify result - c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); - - bool verified_correct = - !setting_.verify_ || - compare(name, gemm_problem.k_, gemm_problem.split_k_, c_m_n_dev_result, c_m_n_ref); - - if(verified_correct) - { - kernel_instances_.emplace_back(kernel_instance); - } - else - { - std::cout << "Verification failed, skip kernel: " << name << std::endl; - } - - // clear tensor - c_m_n_dev_buf.SetZero(); - c_m_n_dev_result.SetZero(); - } - - KernelInstance select_best_instance(Metric metric) - { - if(kernel_instances_.empty()) - throw std::runtime_error("Empty instances"); - - auto kernel_instance = *std::max_element(kernel_instances_.begin(), - kernel_instances_.end(), - [metric](const auto& a, const auto& b) { - return PerformanceResult::compare( - b.perf_result_, a.perf_result_, metric); - }); - - if(setting_.json_output_) - { - // Output clean JSON only - std::cout << kernel_instance << std::endl; - } - else - { - std::cout << "**********************************" << std::endl; - std::cout << "According to given metrics: " << get_metric_name(metric) << "\n" - << "Current kernel performance is: " << kernel_instance << std::endl; - std::cout << "**********************************" << std::endl; - } - - if(!setting_.csv_filename_.empty()) - { - std::ofstream file(setting_.csv_filename_ + ".csv", std::ios::app); - - if(!file.is_open()) - { - std::cerr << "Warning: Failed to open CSV file for writing." << std::endl; - } - else - { - if(file.tellp() == 0) - { - file << "rocm_version,device_name," - << "split_k,m,n,k,stride_a,stride_b,stride_c," - << "dtype_a,dtype_b,dtype_acc,dtype_c," << "layout_a,layout_b,layout_c," - << "structured_sparsity," << "name," - << "latency(ms),tflops(TFlops),bandwidth(GB/s),metric\n"; - } - - const auto& problem = kernel_instance.problem_; - const auto& name = kernel_instance.name_; - const auto& perf = kernel_instance.perf_result_; - - file << get_rocm_version() << "," << ck_tile::get_device_name() << "," - << problem.split_k_ << "," << problem.m_ << "," << problem.n_ << "," - << problem.k_ << "," << problem.stride_a_ << "," << problem.stride_b_ << "," - << problem.stride_c_ << "," << problem.dtype_a_ << "," << problem.dtype_b_ - << "," << problem.dtype_acc_ << "," << problem.dtype_c_ << "," - << problem.layout_a_ << "," << problem.layout_b_ << "," << problem.layout_c_ - << "," << problem.structured_sparsity_ << "," << name << "," << std::fixed - << std::setprecision(4) << perf.latency_ << "," << std::fixed - << std::setprecision(4) << perf.tflops_ << "," << std::fixed - << std::setprecision(4) << perf.bandwidth_ << "," << get_metric_name(metric) - << "\n"; - - if(!file) - { - std::cerr << "Warning: Error occurred while writing to CSV file." 
<< std::endl; - } - } - } - - return kernel_instance; - } - - GemmProfiler(const GemmProfiler&) = delete; - GemmProfiler& operator=(const GemmProfiler&) = delete; - - private: - ~GemmProfiler() { kernel_instances_.clear(); } - GemmProfiler(Setting setting) : setting_(setting) {} - - Setting setting_; - - std::vector kernel_instances_; }; diff --git a/tile_engine/ops/gemm/gemm_profiler.hpp b/tile_engine/ops/gemm/gemm_profiler.hpp new file mode 100644 index 00000000000..ab62b0616f1 --- /dev/null +++ b/tile_engine/ops/gemm/gemm_profiler.hpp @@ -0,0 +1,190 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "ck_tile/host/device_prop.hpp" +#include "ck_tile/ops/gemm.hpp" +#include "gemm_benchmark.hpp" + +template +class GemmProfiler +{ + public: + static Gemm& instance(Setting setting) + { + static Gemm instance{setting}; + return instance; + } + + // Overload for single kernel benchmarking + void benchmark(Problem& gemm_problem, + std::function kernel_func) + { + // Create a vector with a single callable that returns both name and time + std::vector< + std::function(GemmArgs&, const ck_tile::stream_config&)>> + callables; + + callables.push_back([kernel_func](GemmArgs& args, const ck_tile::stream_config& stream) { + float time = kernel_func(args, stream); + return std::make_tuple(std::string(KERNEL_NAME), time); + }); + + benchmark(gemm_problem, callables); + } + + virtual void benchmark(Problem& gemm_problem, + std::vector( + GemmArgs&, const ck_tile::stream_config&)>>& callables) = 0; + + void process_result(const Problem& gemm_problem, + ck_tile::DeviceMem& c_m_n_dev_buf, + ck_tile::HostTensor& c_m_n_host_result, + ck_tile::HostTensor& c_m_n_dev_result, + const std::tuple& kernel_run_result) + { + auto [name, avg_time] = kernel_run_result; + using DDataType = typename get_DsDataType::type; + + KernelInstance kernel_instance{name, gemm_problem, {-1.0f, -1.0f, -1.0f}}; + + // compute performance metric + std::size_t flop = std::size_t(2) * gemm_problem.m_ * gemm_problem.n_ * gemm_problem.k_; + std::size_t num_byte = sizeof(ADataType) * gemm_problem.m_ * gemm_problem.k_ + + sizeof(BDataType) * gemm_problem.n_ * gemm_problem.k_ + + sizeof(CDataType) * gemm_problem.m_ * gemm_problem.n_; + + if constexpr(!std::is_void_v) + { + ck_tile::static_for<0, DDataType::size(), 1>{}([&](auto i) { + using DType = ck_tile::remove_cvref_t>; + num_byte += sizeof(DType) * gemm_problem.m_ * gemm_problem.n_; + flop += sizeof(DType) * gemm_problem.m_ * gemm_problem.n_; + }); + } + + // update + kernel_instance.perf_result_.latency_ = avg_time; + kernel_instance.perf_result_.tflops_ = static_cast(flop) / 1.E9 / avg_time; + kernel_instance.perf_result_.bandwidth_ = num_byte / 1.E6 / avg_time; + + if(setting_.log_ > 0 && !setting_.json_output_) + { + std::cout << kernel_instance << std::endl; + } + + // verify result + c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); + int split_k = 1; + if constexpr(std::is_same_v) + { + split_k = gemm_problem.split_k_; + } + bool verified_correct = + !setting_.verify_ || + compare(name, gemm_problem.k_, split_k, c_m_n_dev_result, c_m_n_host_result); + + if(verified_correct) + { + kernel_instances_.emplace_back(kernel_instance); + } + else + { + std::cout << "Verification failed, skip kernel: " << name << std::endl; + } + + // clear tensor + c_m_n_dev_buf.SetZero(); + c_m_n_dev_result.SetZero(); + } + + KernelInstance 
select_best_instance(Metric metric) + { + if(kernel_instances_.empty()) + throw std::runtime_error("Empty instances"); + + auto kernel_instance = *std::max_element(kernel_instances_.begin(), + kernel_instances_.end(), + [metric](const auto& a, const auto& b) { + return PerformanceResult::compare( + b.perf_result_, a.perf_result_, metric); + }); + + if(setting_.json_output_) + { + // Output clean JSON only + std::cout << kernel_instance << std::endl; + } + else + { + std::cout << "**********************************" << std::endl; + std::cout << "According to given metrics: " << get_metric_name(metric) << "\n" + << "Current kernel performance is: " << kernel_instance << std::endl; + std::cout << "**********************************" << std::endl; + } + + if(!setting_.csv_filename_.empty()) + { + std::ofstream file(setting_.csv_filename_ + ".csv", std::ios::app); + + if(!file.is_open()) + { + std::cerr << "Warning: Failed to open CSV file for writing." << std::endl; + } + else + { + if(file.tellp() == 0) + { + file << "rocm_version,device_name," + << "split_k,m,n,k,stride_a,stride_b,stride_c," + << "dtype_a,dtype_b,dtype_acc,dtype_c," << "layout_a,layout_b,layout_c," + << "structured_sparsity," << "name," + << "latency(ms),tflops(TFlops),bandwidth(GB/s),metric\n"; + } + + const auto& problem = kernel_instance.problem_; + const auto& name = kernel_instance.name_; + const auto& perf = kernel_instance.perf_result_; + + file << get_rocm_version() << "," << ck_tile::get_device_name() << "," + << problem.split_k_ << "," << problem.m_ << "," << problem.n_ << "," + << problem.k_ << "," << problem.stride_a_ << "," << problem.stride_b_ << "," + << problem.stride_c_ << "," << problem.dtype_a_ << "," << problem.dtype_b_ + << "," << problem.dtype_acc_ << "," << problem.dtype_c_ << "," + << problem.layout_a_ << "," << problem.layout_b_ << "," << problem.layout_c_ + << "," << problem.structured_sparsity_ << "," << name << "," << std::fixed + << std::setprecision(4) << perf.latency_ << "," << std::fixed + << std::setprecision(4) << perf.tflops_ << "," << std::fixed + << std::setprecision(4) << perf.bandwidth_ << "," << get_metric_name(metric) + << "\n"; + + if(!file) + { + std::cerr << "Warning: Error occurred while writing to CSV file." << std::endl; + } + } + } + + return kernel_instance; + } + + GemmProfiler(const GemmProfiler&) = delete; + GemmProfiler& operator=(const GemmProfiler&) = delete; + + protected: + virtual ~GemmProfiler() { kernel_instances_.clear(); } + GemmProfiler(Setting setting) : setting_(setting) {} + + Setting setting_; + + std::vector> kernel_instances_; +}; diff --git a/tile_engine/ops/gemm/gemm_universal/CMakeLists.txt b/tile_engine/ops/gemm/gemm_universal/CMakeLists.txt index 7505fcd6d04..7f8048b5945 100644 --- a/tile_engine/ops/gemm/gemm_universal/CMakeLists.txt +++ b/tile_engine/ops/gemm/gemm_universal/CMakeLists.txt @@ -68,7 +68,7 @@ function(create_individual_gemm_universal_target datatype layout trait tile_conf # Create the executable add_executable(${target_name} EXCLUDE_FROM_ALL - ${GEMM_UNIVERSAL_SOURCE_DIR}/gemm_benchmark_single.cpp + ${GEMM_UNIVERSAL_SOURCE_DIR}/gemm_universal_benchmark_single.cpp ${instance_header} ) diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_benchmark.hpp b/tile_engine/ops/gemm/gemm_universal/gemm_benchmark.hpp deleted file mode 100644 index 7c8df32ad89..00000000000 --- a/tile_engine/ops/gemm/gemm_universal/gemm_benchmark.hpp +++ /dev/null @@ -1,242 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
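The new tile_engine/ops/gemm/gemm_profiler.hpp above centralizes what the per-op profilers used to duplicate: result bookkeeping, metric-based selection and CSV export sit behind a templated base with a per-derived-type singleton and a pure-virtual benchmark hook. A stripped-down sketch of that shape, with placeholder types rather than the real ck_tile signatures:

    #include <functional>
    #include <string>
    #include <tuple>
    #include <vector>

    struct Setting { int n_warmup_ = 50; int n_repeat_ = 100; };  // trimmed
    struct Problem { int m_ = 0, n_ = 0, k_ = 0; };               // trimmed
    struct Args {};        // stands in for the op's host-args type
    struct StreamCfg {};   // stands in for ck_tile::stream_config

    template <typename Derived>
    class ProfilerBase
    {
    public:
        // One instance per derived profiler type; the Setting is captured on first use.
        static Derived& instance(Setting setting)
        {
            static Derived inst{setting};
            return inst;
        }

        // Convenience overload: wrap a single launch function, tag it with its name,
        // then defer to the derived class's batched benchmark(). Derived classes keep this
        // overload visible with `using ProfilerBase::benchmark;`.
        void benchmark(Problem& problem, std::function<float(Args&, const StreamCfg&)> launch)
        {
            std::vector<std::function<std::tuple<std::string, float>(Args&, const StreamCfg&)>> callables;
            callables.push_back([launch](Args& args, const StreamCfg& stream) {
                return std::make_tuple(std::string("kernel_name"), launch(args, stream));
            });
            benchmark(problem, callables);
        }

        // Each op (universal, preshuffle, ...) supplies its tensor setup and launches here.
        virtual void benchmark(Problem& problem,
                               std::vector<std::function<std::tuple<std::string, float>(
                                   Args&, const StreamCfg&)>>& callables) = 0;

    protected:
        explicit ProfilerBase(Setting setting) : setting_(setting) {}
        virtual ~ProfilerBase() = default;
        Setting setting_;
    };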
-// SPDX-License-Identifier: MIT - -#pragma once - -#include -#include -#include -#include -#include - -#include "ck_tile/core.hpp" -#include "ck_tile/host.hpp" -#include "gemm_common.hpp" - -// Data types and Layouts are defined by the generated kernel headers -// No hardcoded type definitions here to avoid conflicts - -enum class Metric -{ - LATENCY = 0, - TFLOPS = 1, - BANDWIDTH = 2 -}; - -inline constexpr auto get_metric_name(Metric m) -{ - switch(m) - { - case Metric::LATENCY: return "latency"; - case Metric::TFLOPS: return "tflops"; - case Metric::BANDWIDTH: return "bandwidth"; - default: throw std::invalid_argument("Unsupported metric type"); - } -} - -struct GemmProblem -{ - int split_k_; - int m_, n_, k_; - int stride_a_, stride_b_, stride_c_; - - std::string dtype_a_, dtype_b_, dtype_acc_, dtype_c_; - std::string layout_a_, layout_b_, layout_c_; - - bool structured_sparsity_; - - friend std::ostream& operator<<(std::ostream& os, const GemmProblem& problem) - { - os << "{\n" - << " \"split_k\":" << problem.split_k_ << ",\n" - << " \"m\":" << problem.m_ << ",\n" - << " \"n\":" << problem.n_ << ",\n" - << " \"k\":" << problem.k_ << ",\n" - << " \"stride_a\":" << problem.stride_a_ << ",\n" - << " \"stride_b\":" << problem.stride_b_ << ",\n" - << " \"stride_c\":" << problem.stride_c_ << ",\n" - << " \"dtype_a\":\"" << problem.dtype_a_ << "\",\n" - << " \"dtype_b\":\"" << problem.dtype_b_ << "\",\n" - << " \"dtype_acc\":\"" << problem.dtype_acc_ << "\",\n" - << " \"dtype_c\":\"" << problem.dtype_c_ << "\",\n" - << " \"layout_a\":\"" << problem.layout_a_ << "\",\n" - << " \"layout_b\":\"" << problem.layout_b_ << "\",\n" - << " \"layout_c\":\"" << problem.layout_c_ << "\",\n" - << " \"structured_sparsity\":" << (problem.structured_sparsity_ ? "true" : "false") - << "\n" - << "}"; - return os; - } -}; - -struct PerformanceResult -{ - double latency_; - double tflops_; - double bandwidth_; - - static bool compare(const PerformanceResult& a, const PerformanceResult& b, Metric m) - { - switch(m) - { - case Metric::LATENCY: return a.latency_ < b.latency_; - case Metric::TFLOPS: return a.tflops_ > b.tflops_; - case Metric::BANDWIDTH: return a.bandwidth_ > b.bandwidth_; - default: throw std::invalid_argument("Unsupported metric type"); - } - } - - friend std::ostream& operator<<(std::ostream& os, const PerformanceResult& result) - { - os << "{\n" - << " \"latency(ms)\": " << std::fixed << std::setprecision(2) << result.latency_ - << ",\n" - << " \"tflops(TFlops)\": " << result.tflops_ << ",\n" - << " \"bandwidth(GB/s)\": " << result.bandwidth_ << "\n" - << "}"; - return os; - } -}; - -struct KernelInstance -{ - std::string name_; - GemmProblem problem_; - PerformanceResult perf_result_; - - static bool compare(const KernelInstance& a, const KernelInstance& b, Metric m) - { - return PerformanceResult::compare(a.perf_result_, b.perf_result_, m); - } - - friend std::ostream& operator<<(std::ostream& os, const KernelInstance& obj) - { - os << "{\n" - << " \"name\": \"" << obj.name_ << "\",\n" - << " \"problem\": " << obj.problem_ << ",\n" - << " \"perf_result\": " << obj.perf_result_ << "\n" - << "}"; - return os; - } -}; - -struct Setting -{ - int n_warmup_; - int n_repeat_; - bool is_gpu_timer_; - int verify_; - int init_method_; - bool log_; - std::string csv_filename_; - bool flush_cache_; - int rotating_count_; - bool json_output_; -}; - -inline std::string get_rocm_version() -{ - std::ifstream version_file("/opt/rocm/.info/version"); - if(version_file.is_open()) - { - std::string version; - 
std::getline(version_file, version); - return version; - } - return "Unknown"; -} - -template -auto calculate_rtol_atol(const ck_tile::index_t K, - const ck_tile::index_t kbatch, - const float max_accumulated_value) -{ - using ComputeType = - std::conditional_t; - // Calculate thresholds - const auto rtol = ck_tile::get_relative_threshold( - ck_tile::integer_divide_ceil(K, kbatch)); - const auto atol = ck_tile::get_absolute_threshold( - max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch)); - // Calculate error due to split_k accumulation - const auto rtol_split_k = - ck_tile::get_relative_threshold(kbatch); - const auto atol_split_k = ck_tile::get_absolute_threshold( - max_accumulated_value, kbatch); - // Use higher threshold - return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k)); -} - -/// @brief Function to compare the results of the device and host computations -bool compare(std::string instanceName, - ck_tile::index_t K, - ck_tile::index_t kbatch, - ck_tile::HostTensor& c_m_n_dev_result, - ck_tile::HostTensor& c_m_n_host_result) -{ - const float max_accumulated_value = - *std::max_element(c_m_n_host_result.mData.begin(), c_m_n_host_result.mData.end()); - const auto rtol_atol = calculate_rtol_atol( - K, kbatch, max_accumulated_value); - bool pass = ck_tile::check_err(c_m_n_dev_result, - c_m_n_host_result, - "Error: Incorrect results!", - rtol_atol.at(ck_tile::number<0>{}), - rtol_atol.at(ck_tile::number<1>{})); - - std::cout << "For " << instanceName << " Relative error threshold is " - << rtol_atol.at(ck_tile::number<0>{}) << " Absolute error threshold is " - << rtol_atol.at(ck_tile::number<1>{}) << std::endl; - std::cout << "The verification result is:" << (pass ? "correct" : "fail") << std::endl; - - return pass; -} - -/// @brief Function to get the kernel output with reference implementation on CPU/GPU -void gemm_host_reference(int verify, - ck_tile::HostTensor& a_m_k, - ck_tile::HostTensor& b_k_n, - ck_tile::HostTensor& c_m_n_host_result, - ck_tile::DeviceMem& a_m_k_dev_buf, - ck_tile::DeviceMem& b_k_n_dev_buf, - ck_tile::index_t M, - ck_tile::index_t N, - ck_tile::index_t K, - ck_tile::index_t stride_A, - ck_tile::index_t stride_B, - ck_tile::index_t stride_C) -{ - if(verify == 1) - { - c_m_n_host_result.SetZero(); - - ck_tile::reference_gemm( - a_m_k, b_k_n, c_m_n_host_result); - } - else if(verify == 2) - { - if constexpr(std::is_same_v) - { - // Restore input for B for gpu reference - b_k_n_dev_buf.ToDevice(b_k_n.data()); - } - - ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_host_result.get_element_space_size_in_bytes()); - c_m_n_host_result.SetZero(); - c_m_n_gpu_buf_ref.SetZero(); - - ADataType* d_A = static_cast(a_m_k_dev_buf.GetDeviceBuffer()); - BDataType* d_B = static_cast(b_k_n_dev_buf.GetDeviceBuffer()); - CDataType* d_C = static_cast(c_m_n_gpu_buf_ref.GetDeviceBuffer()); - - ck_tile::reference_gemm_gpu(d_A, d_B, d_C, M, N, K, stride_A, stride_B, stride_C); - - c_m_n_gpu_buf_ref.FromDevice(c_m_n_host_result.data()); - } -} diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py b/tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py deleted file mode 100644 index b7424c6d1da..00000000000 --- a/tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py +++ /dev/null @@ -1,678 +0,0 @@ -# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
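PerformanceResult::compare above orders two results under a chosen metric, where "better" means smaller latency but larger TFLOPS or bandwidth, and every select_best_instance in this patch feeds it to std::max_element with the arguments swapped so that "better" sorts as "greater". A self-contained sketch of that comparator trick (the data values are made up):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    enum class Metric { LATENCY, TFLOPS };

    struct Perf { double latency_ms; double tflops; };

    // true when `a` is better than `b` under the chosen metric
    bool better(const Perf& a, const Perf& b, Metric m)
    {
        return m == Metric::LATENCY ? a.latency_ms < b.latency_ms : a.tflops > b.tflops;
    }

    int main()
    {
        std::vector<Perf> results{{0.30, 90.0}, {0.21, 120.0}, {0.25, 105.0}};

        // Swapping the arguments turns "is better" into "is greater", so std::max_element
        // returns the best entry even for latency, where smaller numbers win.
        const auto best = *std::max_element(results.begin(), results.end(),
                                            [](const Perf& a, const Perf& b) {
                                                return better(b, a, Metric::LATENCY);
                                            });
        std::printf("best: %.2f ms, %.2f TFLOPS\n", best.latency_ms, best.tflops);
        return 0;
    }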
-# SPDX-License-Identifier: MIT - -import sys -import json -import subprocess -import argparse -import csv -import time -from pathlib import Path -from typing import List, Dict, Tuple, Optional - - -class GemmBenchmark: - def __init__(self, build_dir: str, verbose: bool = False): - self.build_dir = Path(build_dir) - self.verbose = verbose - self.results = [] - - def discover_kernels(self) -> List[Path]: - """Find all benchmark_gemm_* executables in the build directory""" - bin_dir = self.build_dir / "bin" - if not bin_dir.exists(): - print(f"Error: Binary directory {bin_dir} does not exist") - return [] - - kernels = list(bin_dir.glob("benchmark_gemm_*")) - if self.verbose: - print(f"Found {len(kernels)} kernel executables") - for k in kernels: - print(f" - {k.name}") - return kernels - - def extract_kernel_info(self, kernel_path: Path) -> Dict[str, str]: - """Extract comprehensive kernel information from filename""" - name = kernel_path.stem - - # Initialize with basic info - info = { - "executable": str(kernel_path), - "name": name, - "data_type": "unknown", - "layout": "unknown", - "pipeline": "unknown", - "scheduler": "unknown", - "epilogue": "unknown", - } - - # Parse the kernel name pattern: - # benchmark_gemm_fp16_rcr_mem_default_intrawave_False_False_False_False_False_256x256x32_2x2x1_4x64x16 - parts = name.split("_") - - if len(parts) >= 3: - # Extract data type (3rd part after benchmark_gemm_) - info["data_type"] = parts[2] if len(parts) > 2 else "unknown" - - # Extract layout (4th part) - info["layout"] = parts[3] if len(parts) > 3 else "unknown" - - # Extract pipeline (5th part) - info["pipeline"] = parts[4] if len(parts) > 4 else "unknown" - - # Extract epilogue (6th part) - info["epilogue"] = parts[5] if len(parts) > 5 else "unknown" - - # Extract scheduler (7th part) - info["scheduler"] = parts[6] if len(parts) > 6 else "unknown" - - # Extract detailed configuration from the end of the name - config_info = self.parse_detailed_config(name) - info.update(config_info) - - # Generate config ID - info["config_id"] = self.generate_config_id(info) - - return info - - def parse_detailed_config(self, kernel_name: str) -> Dict: - """Parse detailed configuration from kernel name""" - config = { - "tile_sizes": {"tile_m": 0, "tile_n": 0, "tile_k": 0}, - "warp_config": {"warp_m": 0, "warp_n": 0, "warp_k": 0}, - "warp_tile": {"warp_tile_m": 0, "warp_tile_n": 0, "warp_tile_k": 0}, - "optimization_flags": { - "pad_m": False, - "pad_n": False, - "pad_k": False, - "persistent": False, - }, - } - - # Split by underscore and look for patterns - parts = kernel_name.split("_") - - # Look for boolean flags (sequence of True/False values) - bool_sequence = [] - for i, part in enumerate(parts): - if part in ["True", "False"]: - bool_sequence.append(part == "True") - # Continue collecting consecutive boolean values - j = i + 1 - while j < len(parts) and parts[j] in ["True", "False"]: - bool_sequence.append(parts[j] == "True") - j += 1 - break - - # Assign boolean flags if we found them - # Order: pad_m, pad_n, pad_k, persistent (4 flags total) - if len(bool_sequence) >= 4: - config["optimization_flags"]["pad_m"] = bool_sequence[0] - config["optimization_flags"]["pad_n"] = bool_sequence[1] - config["optimization_flags"]["pad_k"] = bool_sequence[2] - config["optimization_flags"]["persistent"] = bool_sequence[3] - - # Look for tile size patterns (e.g., 256x256x32_2x2x1_4x64x16) - # The pattern is: tile_sizes_warp_config_warp_tile - dimension_groups = [] - for part in parts: - if "x" in part and 
len(part.split("x")) == 3: - try: - dims = [int(x) for x in part.split("x")] - if all(d > 0 for d in dims): - dimension_groups.append(dims) - except ValueError: - continue - - # Assign dimensions based on order and magnitude - if len(dimension_groups) >= 3: - # Sort by magnitude to identify: largest=tile_sizes, smallest=warp_config, middle=warp_tile - sorted_groups = sorted(dimension_groups, key=lambda x: max(x), reverse=True) - - # Largest dimensions = tile sizes - config["tile_sizes"]["tile_m"] = sorted_groups[0][0] - config["tile_sizes"]["tile_n"] = sorted_groups[0][1] - config["tile_sizes"]["tile_k"] = sorted_groups[0][2] - - # Smallest dimensions = warp config - config["warp_config"]["warp_m"] = sorted_groups[2][0] - config["warp_config"]["warp_n"] = sorted_groups[2][1] - config["warp_config"]["warp_k"] = sorted_groups[2][2] - - # Middle dimensions = warp tile - config["warp_tile"]["warp_tile_m"] = sorted_groups[1][0] - config["warp_tile"]["warp_tile_n"] = sorted_groups[1][1] - config["warp_tile"]["warp_tile_k"] = sorted_groups[1][2] - elif len(dimension_groups) == 2: - # If only 2 groups, assign based on magnitude - sorted_groups = sorted(dimension_groups, key=lambda x: max(x), reverse=True) - - # Larger = tile sizes - config["tile_sizes"]["tile_m"] = sorted_groups[0][0] - config["tile_sizes"]["tile_n"] = sorted_groups[0][1] - config["tile_sizes"]["tile_k"] = sorted_groups[0][2] - - # Smaller = warp config - config["warp_config"]["warp_m"] = sorted_groups[1][0] - config["warp_config"]["warp_n"] = sorted_groups[1][1] - config["warp_config"]["warp_k"] = sorted_groups[1][2] - elif len(dimension_groups) == 1: - # Only one group - assume it's tile sizes - config["tile_sizes"]["tile_m"] = dimension_groups[0][0] - config["tile_sizes"]["tile_n"] = dimension_groups[0][1] - config["tile_sizes"]["tile_k"] = dimension_groups[0][2] - - return config - - def generate_config_id(self, info: Dict) -> str: - """Generate a compact config ID from kernel info""" - # Create a compact identifier - parts = [ - info.get("data_type", "unk"), - info.get("layout", "unk"), - info.get("pipeline", "unk"), - info.get("scheduler", "unk"), - ] - - # Add tile configuration if available - tile_sizes = info.get("tile_sizes", {}) - if tile_sizes.get("tile_m", 0) > 0: - tile_str = ( - f"{tile_sizes['tile_m']}x{tile_sizes['tile_n']}x{tile_sizes['tile_k']}" - ) - parts.append(tile_str) - - # Add warp config if available - warp_config = info.get("warp_config", {}) - if warp_config.get("warp_m", 0) > 0: - warp_str = f"w{warp_config['warp_m']}x{warp_config['warp_n']}x{warp_config['warp_k']}" - parts.append(warp_str) - - # Add warp tile if available - warp_tile = info.get("warp_tile", {}) - if warp_tile.get("warp_tile_m", 0) > 0: - warp_tile_str = f"wt{warp_tile['warp_tile_m']}x{warp_tile['warp_tile_n']}x{warp_tile['warp_tile_k']}" - parts.append(warp_tile_str) - - return "_".join(parts) - - def run_kernel(self, kernel_path: Path, params: Dict[str, str]) -> Optional[Dict]: - """Run a single kernel with given parameters and save output to individual JSON file""" - # Create results directory - results_dir = self.build_dir / "results" - results_dir.mkdir(exist_ok=True) - - # Generate unique JSON filename for this kernel - json_file = results_dir / f"{kernel_path.stem}.json" - - cmd = [str(kernel_path)] - - # Add parameters - for key, value in params.items(): - cmd.append(f"-{key}={value}") - - # Add JSON output flag for clean JSON output - cmd.append("-json_output=true") - - if self.verbose: - print(f"Running: {' '.join(cmd)}") - 
- try: - result = subprocess.run(cmd, capture_output=True, text=True, timeout=60) - - if result.returncode != 0: - print(f"Error running {kernel_path.name}: {result.stderr}") - return None - - # Save raw output to individual JSON file - output = result.stdout.strip() - if output: - with open(json_file, "w") as f: - f.write(output) - - # Parse the JSON file - return self.parse_json_file(json_file) - else: - print(f"No output from {kernel_path.name}") - return None - - except subprocess.TimeoutExpired: - print(f"Timeout running {kernel_path.name}") - return None - except Exception as e: - print(f"Error running {kernel_path.name}: {e}") - return None - - def parse_json_file(self, json_file: Path) -> Optional[Dict]: - """Parse JSON data from individual kernel output file""" - try: - with open(json_file, "r") as f: - content = f.read().strip() - - # Parse the JSON directly since executables produce clean JSON - data = json.loads(content) - - # Return the complete JSON data as-is, just add some convenience fields - result = data.copy() - if "perf_result" in data: - perf = data["perf_result"] - # Add convenience fields for backward compatibility - result["time_ms"] = perf.get("latency(ms)", 0) - result["tflops"] = perf.get("tflops(TFlops)", 0) - result["bandwidth_gb_s"] = perf.get("bandwidth(GB/s)", 0) - - return result - - except json.JSONDecodeError as e: - if self.verbose: - print(f"Failed to parse JSON from {json_file}: {e}") - return None - except Exception as e: - if self.verbose: - print(f"Error reading JSON file {json_file}: {e}") - return None - - def benchmark_problem_size( - self, - kernels: List[Path], - m: int, - n: int, - k: int, - split_k: int = 1, - verify: int = 0, - warmup: int = 50, - repeat: int = 100, - flush_cache: bool = True, - rotating_count: int = 1000, - ) -> List[Dict]: - """Benchmark all kernels for a specific problem size""" - results = [] - - params = { - "m": m, - "n": n, - "k": k, - "split_k": split_k, - "verify": verify, - "warmup": warmup, - "repeat": repeat, - "flush_cache": str(flush_cache).lower(), - "rotating_count": rotating_count, - } - - print(f"\nBenchmarking M={m}, N={n}, K={k}, split_k={split_k}") - - for kernel_path in kernels: - kernel_info = self.extract_kernel_info(kernel_path) - result = self.run_kernel(kernel_path, params) - - if result: - # Create new structured result format - structured_result = { - "name": kernel_info["name"], # Add name field for compatibility - "config_id": kernel_info["config_id"], - "problem": result.get("problem", {}), - "perf_result": result.get("perf_result", {}), - "config": { - "data_type": kernel_info["data_type"], - "layout": kernel_info["layout"], - "pipeline": kernel_info["pipeline"], - "scheduler": kernel_info["scheduler"], - "epilogue": kernel_info["epilogue"], - "tile_sizes": kernel_info.get("tile_sizes", {}), - "warp_config": kernel_info.get("warp_config", {}), - "warp_tile": kernel_info.get("warp_tile", {}), - "optimization_flags": kernel_info.get("optimization_flags", {}), - }, - "executable": kernel_info["executable"], - # Keep backward compatibility fields - "time_ms": result.get("time_ms", 0), - "tflops": result.get("tflops", 0), - "bandwidth_gb_s": result.get("bandwidth_gb_s", 0), - } - - results.append(structured_result) - - if self.verbose: - print( - f" {kernel_info['config_id']}: {structured_result['tflops']:.2f} TFLOPS, {structured_result['bandwidth_gb_s']:.2f} GB/s, {structured_result['time_ms']:.2f}ms" - ) - - return results - - def find_best_kernel( - self, results: List[Dict], metric: str = 
"tflops" - ) -> Optional[Dict]: - """Find the best performing kernel based on metric""" - if not results: - return None - - if metric == "tflops": - return max(results, key=lambda x: x.get("tflops", 0)) - elif metric == "time_ms": - return min(results, key=lambda x: x.get("time_ms", float("inf"))) - elif metric == "bandwidth_gb_s": - return max(results, key=lambda x: x.get("bandwidth_gb_s", 0)) - else: - raise ValueError(f"Unknown metric: {metric}") - - def benchmark_sweep( - self, - problem_sizes: List[Tuple[int, int, int]], - split_k_values: List[int] = [1], - verify: bool = False, - warmup: int = 50, - repeat: int = 100, - flush_cache: bool = True, - rotating_count: int = 1000, - ) -> Dict: - """Run comprehensive benchmark sweep""" - kernels = self.discover_kernels() - if not kernels: - print("No kernels found!") - return {} - - all_results = [] - best_kernels = {} - - for m, n, k in problem_sizes: - for split_k in split_k_values: - results = self.benchmark_problem_size( - kernels, - m, - n, - k, - split_k, - verify=2 if verify else 0, - warmup=warmup, - repeat=repeat, - flush_cache=flush_cache, - rotating_count=rotating_count, - ) - - all_results.extend(results) - - # Find best kernel for this configuration - best = self.find_best_kernel(results) - if best: - key = f"m{m}_n{n}_k{k}_splitk{split_k}" - best_kernels[key] = best - print( - f"Best for {key}: {best['name']} ({best['tflops']:.2f} TFLOPS, {best['bandwidth_gb_s']:.2f} GB/s, {best['time_ms']:.2f}ms)" - ) - - self.results = all_results - return best_kernels - - def export_csv(self, filename: str): - """Export all results to CSV""" - if not self.results: - print("No results to export") - return - - # Get all unique keys from results - all_keys = set() - for result in self.results: - all_keys.update(result.keys()) - - # Sort keys for consistent output - fieldnames = sorted(all_keys) - - with open(filename, "w", newline="") as csvfile: - writer = csv.DictWriter(csvfile, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(self.results) - - print(f"Results exported to {filename}") - - def export_best_kernels(self, best_kernels: Dict, filename: str): - """Export best kernel selections to file""" - with open(filename, "w") as f: - f.write("# Best kernel selections\n") - f.write( - "# Format: problem_size -> kernel_name (TFLOPS, bandwidth, latency)\n\n" - ) - - for key, kernel in sorted(best_kernels.items()): - f.write( - f"{key}: {kernel['name']} ({kernel['tflops']:.2f} TFLOPS, {kernel['bandwidth_gb_s']:.2f} GB/s, {kernel['time_ms']:.2f}ms)\n" - ) - - print(f"Best kernels exported to {filename}") - - def export_json(self, filename: str, best_kernels: Dict = None): - """Export all results and best kernels to JSON with comprehensive metadata""" - from datetime import datetime - - # Calculate comprehensive summary statistics for all metrics - successful_results = [r for r in self.results if r.get("tflops", 0) > 0] - - tflops_values = [r.get("tflops", 0) for r in successful_results] - bandwidth_values = [r.get("bandwidth_gb_s", 0) for r in successful_results] - latency_values = [ - r.get("time_ms", 0) for r in successful_results if r.get("time_ms", 0) > 0 - ] - - # Performance breakdown by kernel type - pipeline_stats = {} - scheduler_stats = {} - data_type_stats = {} - - for result in successful_results: - # Get config info from the new structure - config = result.get("config", {}) - - # Pipeline statistics - pipeline = config.get("pipeline", "unknown") - if pipeline not in pipeline_stats: - pipeline_stats[pipeline] = { - 
"count": 0, - "avg_tflops": 0, - "best_tflops": 0, - } - pipeline_stats[pipeline]["count"] += 1 - pipeline_stats[pipeline]["best_tflops"] = max( - pipeline_stats[pipeline]["best_tflops"], result.get("tflops", 0) - ) - - # Scheduler statistics - scheduler = config.get("scheduler", "unknown") - if scheduler not in scheduler_stats: - scheduler_stats[scheduler] = { - "count": 0, - "avg_tflops": 0, - "best_tflops": 0, - } - scheduler_stats[scheduler]["count"] += 1 - scheduler_stats[scheduler]["best_tflops"] = max( - scheduler_stats[scheduler]["best_tflops"], result.get("tflops", 0) - ) - - # Data type statistics - data_type = config.get("data_type", "unknown") - if data_type not in data_type_stats: - data_type_stats[data_type] = { - "count": 0, - "avg_tflops": 0, - "best_tflops": 0, - } - data_type_stats[data_type]["count"] += 1 - data_type_stats[data_type]["best_tflops"] = max( - data_type_stats[data_type]["best_tflops"], result.get("tflops", 0) - ) - - # Calculate averages for breakdown stats - for stats_dict, field_name in [ - (pipeline_stats, "pipeline"), - (scheduler_stats, "scheduler"), - (data_type_stats, "data_type"), - ]: - for key in stats_dict: - relevant_results = [ - r - for r in successful_results - if r.get("config", {}).get(field_name, "unknown") == key - ] - if relevant_results: - stats_dict[key]["avg_tflops"] = sum( - r.get("tflops", 0) for r in relevant_results - ) / len(relevant_results) - - output_data = { - "benchmark_metadata": { - "timestamp": datetime.now().isoformat(), - "total_kernels_tested": len(self.results), - "unique_kernels": len( - set(r.get("name", "unknown") for r in self.results) - ), - "successful_runs": len(successful_results), - "failed_runs": len(self.results) - len(successful_results), - }, - "performance_summary": { - "tflops_stats": { - "best": max(tflops_values, default=0), - "average": sum(tflops_values) / len(tflops_values) - if tflops_values - else 0, - "min": min(tflops_values, default=0), - "median": sorted(tflops_values)[len(tflops_values) // 2] - if tflops_values - else 0, - }, - "bandwidth_stats": { - "best_gb_s": max(bandwidth_values, default=0), - "average_gb_s": sum(bandwidth_values) / len(bandwidth_values) - if bandwidth_values - else 0, - "min_gb_s": min(bandwidth_values, default=0), - "median_gb_s": sorted(bandwidth_values)[len(bandwidth_values) // 2] - if bandwidth_values - else 0, - }, - "latency_stats": { - "best_ms": min(latency_values, default=0), - "average_ms": sum(latency_values) / len(latency_values) - if latency_values - else 0, - "max_ms": max(latency_values, default=0), - "median_ms": sorted(latency_values)[len(latency_values) // 2] - if latency_values - else 0, - }, - "kernel_type_breakdown": { - "by_pipeline": pipeline_stats, - "by_scheduler": scheduler_stats, - "by_data_type": data_type_stats, - }, - "total_problem_configurations": len(best_kernels) - if best_kernels - else 0, - }, - "kernel_results": self.results, - "best_kernels_by_problem": best_kernels or {}, - } - - with open(filename, "w") as f: - json.dump(output_data, f, indent=2) - - print(f"JSON results exported to {filename}") - print(f" - Total kernels: {len(self.results)}") - print(f" - Successful runs: {len(successful_results)}") - print(f" - Best TFLOPS: {max(tflops_values, default=0):.2f}") - print(f" - Best bandwidth: {max(bandwidth_values, default=0):.2f} GB/s") - print(f" - Best latency: {min(latency_values, default=0):.2f}ms") - - -def main(): - parser = argparse.ArgumentParser(description="GEMM Kernel Benchmarking Tool") - parser.add_argument( - 
"build_dir", help="Build directory containing kernel executables" - ) - parser.add_argument( - "--problem-sizes", - nargs="+", - default=["1024,1024,1024", "2048,2048,2048", "4096,4096,4096"], - help="Problem sizes as M,N,K tuples", - ) - parser.add_argument( - "--split-k", nargs="+", type=int, default=[1], help="Split-K values to test" - ) - parser.add_argument("--verify", action="store_true", help="Enable verification") - parser.add_argument( - "--csv", default="gemm_benchmark_results.csv", help="CSV output filename" - ) - parser.add_argument( - "--best", default="best_kernels.txt", help="Best kernels output filename" - ) - parser.add_argument("--verbose", action="store_true", help="Verbose output") - parser.add_argument( - "--warmup", - type=int, - default=50, - help="Number of warmup iterations (default: 50)", - ) - parser.add_argument( - "--repeat", - type=int, - default=100, - help="Number of benchmark iterations (default: 100)", - ) - parser.add_argument( - "--flush-cache", - action="store_true", - default=True, - help="Enable cache flushing (default: True)", - ) - parser.add_argument( - "--rotating-count", - type=int, - default=1000, - help="Number of iterations to rotate cache (default: 1000)", - ) - parser.add_argument("--json", help="JSON output filename (optional)") - - args = parser.parse_args() - - # Parse problem sizes - problem_sizes = [] - for size_str in args.problem_sizes: - try: - m, n, k = map(int, size_str.split(",")) - problem_sizes.append((m, n, k)) - except ValueError: - print(f"Invalid problem size: {size_str}") - return 1 - - # Create benchmark instance - benchmark = GemmBenchmark(args.build_dir, verbose=args.verbose) - - # Run benchmark sweep - print("Starting GEMM kernel benchmark sweep...") - start_time = time.time() - - best_kernels = benchmark.benchmark_sweep( - problem_sizes=problem_sizes, - split_k_values=args.split_k, - verify=args.verify, - warmup=args.warmup, - repeat=args.repeat, - flush_cache=args.flush_cache, - rotating_count=args.rotating_count, - ) - - elapsed_time = time.time() - start_time - print(f"\nBenchmark completed in {elapsed_time:.2f} seconds") - - # Export results - benchmark.export_csv(args.csv) - benchmark.export_best_kernels(best_kernels, args.best) - - # Export JSON if requested - if args.json: - benchmark.export_json(args.json, best_kernels) - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_benchmark_single.cpp b/tile_engine/ops/gemm/gemm_universal/gemm_benchmark_single.cpp deleted file mode 100644 index 6323c066a1a..00000000000 --- a/tile_engine/ops/gemm/gemm_universal/gemm_benchmark_single.cpp +++ /dev/null @@ -1,160 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#include -#include -#include -#include -#include -#include -#include - -#include "ck_tile/core.hpp" -#include "ck_tile/host.hpp" -#include "gemm_profiler.hpp" -#include "gemm_common.hpp" - -// The kernel header is included via the compile command line with -include flag -// It defines SelectedKernel struct and KERNEL_NAME -// DataTypeTraits are now defined in gemm_common.hpp - -// Create argument parser -inline auto create_args(int argc, char* argv[]) -{ - ck_tile::ArgParser arg_parser; - arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.") - .insert("n", "4096", "The value for n dimension. Default is 4096.") - .insert("k", "2048", "The value for k dimension. 
Default is 2048.") - .insert("stride_a", "0", "The stride value for tensor A. Default is 0.") - .insert("stride_b", "0", "The stride value for tensor B. Default is 0.") - .insert("stride_c", "0", "The stride value for tensor C. Default is 0.") - .insert("split_k", "1", "The split value for k dimension. Default is 1.") - .insert("verify", - "2", - "The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 " - "for validation on GPU. Default is 2, GPU validation.") - .insert("log", - "false", - "Whether output kernel instance information or not. Possible values are true or " - "false. Default is false") - .insert( - "warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.") - .insert( - "repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.") - .insert("timer", - "true", - "Whether if the timer is gpu timer or not. Possible values are false or true. " - "Default is true.") - .insert("init", - "0", - "The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 " - "for constant(1). Default is 0, random.") - .insert("flush_cache", - "true", - "To flush cache, possible values are true or false. " - "Default is false.") - .insert("rotating_count", "1000", "number of iterations to rotate the cache. default is 5.") - .insert("metric", - "0", - "Metric with which to measure kernel performance. Set to 0 for latency, 1 for " - "tflops, or 2 for bandwidth. Default is 0, latency.") - .insert("csv_filename", - "", - "The filename of benchmark result. Default is empty (no CSV output).") - .insert("structured_sparsity", - "false", - "Whether use sparsity kernel or not. Possible values are true or false. Default is " - "false") - .insert("json_output", - "false", - "Whether to output results in JSON format only. Possible values are true or false. 
" - "Default is " - "false"); - - bool result = arg_parser.parse(argc, argv); - return std::make_tuple(result, arg_parser); -} - -void benchmark_single(const ck_tile::ArgParser& arg_parser) -{ - // Use DataTypeTraits to get the actual type names from the generated header - // The generated header defines ADataType, BDataType, AccDataType, CDataType - std::string dtype_a = DataTypeTraits::name; - std::string dtype_b = DataTypeTraits::name; - std::string dtype_acc = DataTypeTraits::name; - std::string dtype_c = DataTypeTraits::name; - - // Layout names from the layout types - std::string layout_a = ALayout::name; - std::string layout_b = BLayout::name; - std::string layout_c = CLayout::name; - - // Create GemmProblem struct - GemmProblem gemm_problem{arg_parser.get_int("split_k"), - arg_parser.get_int("m"), - arg_parser.get_int("n"), - arg_parser.get_int("k"), - arg_parser.get_int("stride_a"), - arg_parser.get_int("stride_b"), - arg_parser.get_int("stride_c"), - dtype_a, - dtype_b, - dtype_acc, - dtype_c, - layout_a, - layout_b, - layout_c, - arg_parser.get_bool("structured_sparsity")}; - - // Create Setting struct - Setting setting{arg_parser.get_int("warmup"), - arg_parser.get_int("repeat"), - arg_parser.get_bool("timer"), - arg_parser.get_int("verify"), - arg_parser.get_int("init"), - arg_parser.get_bool("log"), - arg_parser.get_str("csv_filename"), - arg_parser.get_bool("flush_cache"), - arg_parser.get_int("rotating_count"), - arg_parser.get_bool("json_output")}; - - // Get the profiler instance - auto& profiler = GemmProfiler::instance(setting); - - try - { - // Create a lambda that wraps the kernel launch - auto kernel_func = [](const ck_tile::GemmHostArgs& args, - const ck_tile::stream_config& stream) { - return SelectedKernel::launch(args, stream); - }; - - // Benchmark the kernel - profiler.benchmark(gemm_problem, kernel_func); - - // Select best instance based on metric - profiler.select_best_instance(static_cast(arg_parser.get_int("metric"))); - } - catch(const std::exception& e) - { - std::cerr << "Benchmark failed: " << e.what() << std::endl; - } -} - -int main(int argc, char* argv[]) -{ - try - { - auto [result, parser] = create_args(argc, argv); - if(!result) - return EXIT_FAILURE; - - benchmark_single(parser); - return 0; - } - catch(const std::exception& e) - { - std::cerr << "Error: " << e.what() << "\n"; - return EXIT_FAILURE; - } -} diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_common.hpp b/tile_engine/ops/gemm/gemm_universal/gemm_common.hpp deleted file mode 100644 index 899221547f6..00000000000 --- a/tile_engine/ops/gemm/gemm_universal/gemm_common.hpp +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
-// SPDX-License-Identifier: MIT - -#pragma once - -#include -#include "ck_tile/core.hpp" -#include "ck_tile/host.hpp" -#include "ck_tile/core/numeric/integer.hpp" -#include "ck_tile/core/numeric/pk_int4.hpp" - -//[TODO] This can be moved to commons -// DataTypeTraits for all supported types -template -struct DataTypeTraits; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp32"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp64"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp16"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "bf16"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp8"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "bf8"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "int8"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "int32"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "pk_int4_t"; -}; - -// Helper function to determine if a layout is row-major -template -constexpr auto is_row_major(Layout) -{ - return ck_tile::bool_constant>{}; -} - -// Structure to hold kernel traits for dispatcher -struct KernelTraits -{ - std::string pipeline; // compv3, compv4, mem - std::string scheduler; // intrawave, interwave - std::string epilogue; // cshuffle, default - bool pad_m; - bool pad_n; - bool pad_k; - bool persistent; - - // Constructor with defaults - KernelTraits() - : pipeline("compv3"), - scheduler("intrawave"), - epilogue("cshuffle"), - pad_m(false), - pad_n(false), - pad_k(false), - persistent(false) - { - } -}; diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_profiler.hpp b/tile_engine/ops/gemm/gemm_universal/gemm_profiler.hpp deleted file mode 100644 index 3c6bbc34d3d..00000000000 --- a/tile_engine/ops/gemm/gemm_universal/gemm_profiler.hpp +++ /dev/null @@ -1,289 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
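The DataTypeTraits block removed just above from the universal gemm_common.hpp (and earlier from the preshuffle common header) is the usual undefined-primary-template name map, which this patch replaces with ck_tile::DataTypeTraits from the shared gemm/gemm_common.hpp include. A reduced, standalone sketch of the pattern with stand-in element types:

    #include <cstdio>

    struct half_t {};  // stand-ins for the real ck_tile element types
    struct bf16_t {};

    template <typename T>
    struct DataTypeTraits;  // undefined primary template: unsupported types fail to compile

    template <>
    struct DataTypeTraits<half_t> { static constexpr const char* name = "fp16"; };
    template <>
    struct DataTypeTraits<bf16_t> { static constexpr const char* name = "bf16"; };

    int main()
    {
        // A generated kernel header typedefs ADataType/BDataType/AccDataType/CDataType to
        // concrete element types; the benchmark then stringifies them like this.
        using ADataType = half_t;
        std::printf("dtype_a = %s\n", DataTypeTraits<ADataType>::name);
        return 0;
    }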
-// SPDX-License-Identifier: MIT - -#pragma once - -#include -#include -#include - -#include "ck_tile/host/device_prop.hpp" -#include "ck_tile/ops/gemm.hpp" -#include "gemm_benchmark.hpp" - -class GemmProfiler -{ - public: - static GemmProfiler& instance(Setting setting) - { - static GemmProfiler instance{setting}; - return instance; - } - - // Overload for single kernel benchmarking - void benchmark(GemmProblem& gemm_problem, - std::function - kernel_func) - { - // Create a vector with a single callable that returns both name and time - std::vector(ck_tile::GemmHostArgs&, - const ck_tile::stream_config&)>> - callables; - - callables.push_back( - [kernel_func](ck_tile::GemmHostArgs& args, const ck_tile::stream_config& stream) { - float time = kernel_func(args, stream); - return std::make_tuple(std::string(KERNEL_NAME), time); - }); - - benchmark(gemm_problem, callables); - } - - void benchmark(GemmProblem& gemm_problem, - std::vector( - ck_tile::GemmHostArgs&, const ck_tile::stream_config&)>>& callables) - { - const ALayout layout_a = ALayout{}; - const BLayout layout_b = BLayout{}; - const CLayout layout_c = CLayout{}; - - gemm_problem.stride_a_ = ck_tile::get_default_stride( - gemm_problem.m_, gemm_problem.k_, gemm_problem.stride_a_, is_row_major(layout_a)); - gemm_problem.stride_b_ = ck_tile::get_default_stride( - gemm_problem.k_, gemm_problem.n_, gemm_problem.stride_b_, is_row_major(layout_b)); - gemm_problem.stride_c_ = ck_tile::get_default_stride( - gemm_problem.m_, gemm_problem.n_, gemm_problem.stride_c_, is_row_major(layout_c)); - - ck_tile::HostTensor a_m_k(ck_tile::host_tensor_descriptor( - gemm_problem.m_, gemm_problem.k_, gemm_problem.stride_a_, is_row_major(layout_a))); - ck_tile::HostTensor b_k_n(ck_tile::host_tensor_descriptor( - gemm_problem.k_, gemm_problem.n_, gemm_problem.stride_b_, is_row_major(layout_b))); - ck_tile::HostTensor c_m_n_dev_result(ck_tile::host_tensor_descriptor( - gemm_problem.m_, gemm_problem.n_, gemm_problem.stride_c_, is_row_major(layout_c))); - - if(setting_.init_method_ == 0) - { - ck_tile::FillUniformDistribution{-1.f, 1.f}(a_m_k); - ck_tile::FillUniformDistribution{-1.f, 1.f}(b_k_n); - } - else if(setting_.init_method_ == 1) - { - ck_tile::FillMonotonicSeq{}(a_m_k); - ck_tile::FillMonotonicSeq{}(b_k_n); - } - else if(setting_.init_method_ == 2) - { - ck_tile::FillConstant{static_cast(1)}(a_m_k); - ck_tile::FillConstant{static_cast(1)}(b_k_n); - } - else - { - a_m_k.SetZero(); - b_k_n.SetZero(); - } - - if(gemm_problem.structured_sparsity_) - { - ck_tile::AdjustToStructuredSparsity{}(a_m_k); - } - - ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes()); - ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes()); - ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes()); - - if constexpr(std::is_same_v) - { - // Permute vector pk_i4x4 data for device implementation - ck_tile::HostTensor b_k_n_dev = b_k_n; - // permute_tensor_b(b_k_n_dev); - ck_tile::permute_vectors_i4x4_b(b_k_n_dev); - b_k_n_dev_buf.ToDevice(b_k_n_dev.data()); - } - else - { - b_k_n_dev_buf.ToDevice(b_k_n.data()); - } - - a_m_k_dev_buf.ToDevice(a_m_k.data()); - c_m_n_dev_buf.SetZero(); - c_m_n_dev_result.SetZero(); - - ck_tile::GemmHostArgs gemm_args = { - a_m_k_dev_buf.GetDeviceBuffer(), - b_k_n_dev_buf.GetDeviceBuffer(), - c_m_n_dev_buf.GetDeviceBuffer(), - gemm_problem.split_k_, - gemm_problem.m_, - gemm_problem.n_, - gemm_problem.k_, - gemm_problem.stride_a_, - gemm_problem.stride_b_, - gemm_problem.stride_c_, - 
}; - - ck_tile::HostTensor c_m_n_host_result(ck_tile::host_tensor_descriptor( - gemm_problem.m_, gemm_problem.n_, gemm_problem.stride_c_, is_row_major(layout_c))); - - if(setting_.verify_) - { - gemm_host_reference(setting_.verify_, - a_m_k, - b_k_n, - c_m_n_host_result, - a_m_k_dev_buf, - b_k_n_dev_buf, - gemm_problem.m_, - gemm_problem.n_, - gemm_problem.k_, - gemm_problem.stride_a_, - gemm_problem.stride_b_, - gemm_problem.stride_c_); - } - - for(auto& callable : callables) - { - auto kernel_run_result = callable(gemm_args, - ck_tile::stream_config{nullptr, - true, - setting_.log_, - setting_.n_warmup_, - setting_.n_repeat_, - setting_.is_gpu_timer_, - setting_.flush_cache_, - setting_.rotating_count_}); - process_result(gemm_problem, - c_m_n_dev_buf, - c_m_n_host_result, - c_m_n_dev_result, - kernel_run_result); - } - } - - void process_result(const GemmProblem& gemm_problem, - ck_tile::DeviceMem& c_m_n_dev_buf, - ck_tile::HostTensor& c_m_n_host_result, - ck_tile::HostTensor& c_m_n_dev_result, - const std::tuple& kernel_run_result) - { - auto [name, avg_time] = kernel_run_result; - - KernelInstance kernel_instance{name, gemm_problem, {-1.0f, -1.0f, -1.0f}}; - - // compute performance metric - std::size_t flop = std::size_t(2) * gemm_problem.m_ * gemm_problem.n_ * gemm_problem.k_; - std::size_t num_byte = sizeof(ADataType) * gemm_problem.m_ * gemm_problem.k_ + - sizeof(BDataType) * gemm_problem.n_ * gemm_problem.k_ + - sizeof(CDataType) * gemm_problem.m_ * gemm_problem.n_; - - // update - kernel_instance.perf_result_.latency_ = avg_time; - kernel_instance.perf_result_.tflops_ = static_cast(flop) / 1.E9 / avg_time; - kernel_instance.perf_result_.bandwidth_ = num_byte / 1.E6 / avg_time; - - if(setting_.log_ > 0 && !setting_.json_output_) - { - std::cout << kernel_instance << std::endl; - } - - // verify result - c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); - bool verified_correct = - !setting_.verify_ || - compare( - name, gemm_problem.k_, gemm_problem.split_k_, c_m_n_dev_result, c_m_n_host_result); - - if(verified_correct) - { - kernel_instances_.emplace_back(kernel_instance); - } - else - { - std::cout << "Verification failed, skip kernel: " << name << std::endl; - } - - // clear tensor - c_m_n_dev_buf.SetZero(); - c_m_n_dev_result.SetZero(); - } - - KernelInstance select_best_instance(Metric metric) - { - if(kernel_instances_.empty()) - throw std::runtime_error("Empty instances"); - - auto kernel_instance = *std::max_element(kernel_instances_.begin(), - kernel_instances_.end(), - [metric](const auto& a, const auto& b) { - return PerformanceResult::compare( - b.perf_result_, a.perf_result_, metric); - }); - - if(setting_.json_output_) - { - // Output clean JSON only - std::cout << kernel_instance << std::endl; - } - else - { - std::cout << "**********************************" << std::endl; - std::cout << "According to given metrics: " << get_metric_name(metric) << "\n" - << "Current kernel performance is: " << kernel_instance << std::endl; - std::cout << "**********************************" << std::endl; - } - - if(!setting_.csv_filename_.empty()) - { - std::ofstream file(setting_.csv_filename_ + ".csv", std::ios::app); - - if(!file.is_open()) - { - std::cerr << "Warning: Failed to open CSV file for writing." 
<< std::endl; - } - else - { - if(file.tellp() == 0) - { - file << "rocm_version,device_name," - << "split_k,m,n,k,stride_a,stride_b,stride_c," - << "dtype_a,dtype_b,dtype_acc,dtype_c," << "layout_a,layout_b,layout_c," - << "structured_sparsity," << "name," - << "latency(ms),tflops(TFlops),bandwidth(GB/s),metric\n"; - } - - const auto& problem = kernel_instance.problem_; - const auto& name = kernel_instance.name_; - const auto& perf = kernel_instance.perf_result_; - - file << get_rocm_version() << "," << ck_tile::get_device_name() << "," - << problem.split_k_ << "," << problem.m_ << "," << problem.n_ << "," - << problem.k_ << "," << problem.stride_a_ << "," << problem.stride_b_ << "," - << problem.stride_c_ << "," << problem.dtype_a_ << "," << problem.dtype_b_ - << "," << problem.dtype_acc_ << "," << problem.dtype_c_ << "," - << problem.layout_a_ << "," << problem.layout_b_ << "," << problem.layout_c_ - << "," << problem.structured_sparsity_ << "," << name << "," << std::fixed - << std::setprecision(4) << perf.latency_ << "," << std::fixed - << std::setprecision(4) << perf.tflops_ << "," << std::fixed - << std::setprecision(4) << perf.bandwidth_ << "," << get_metric_name(metric) - << "\n"; - - if(!file) - { - std::cerr << "Warning: Error occurred while writing to CSV file." << std::endl; - } - } - } - - return kernel_instance; - } - - GemmProfiler(const GemmProfiler&) = delete; - GemmProfiler& operator=(const GemmProfiler&) = delete; - - private: - ~GemmProfiler() { kernel_instances_.clear(); } - GemmProfiler(Setting setting) : setting_(setting) {} - - Setting setting_; - - std::vector kernel_instances_; -}; diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.hpp b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.hpp new file mode 100644 index 00000000000..4cf272a3c82 --- /dev/null +++ b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.hpp @@ -0,0 +1,66 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
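The select_best_instance step shown above reduces all verified kernel instances to a single winner for the requested metric; note the reversed argument order in the comparator, which turns a "better-than" comparison into the strict "less-than" predicate std::max_element expects. A minimal standalone sketch of that selection step, with PerfResult and Metric as simplified stand-ins for the project's types:

#include <algorithm>
#include <cstdio>
#include <vector>

enum class Metric { Latency, Tflops };

struct PerfResult
{
    const char* name;
    float latency_ms;
    float tflops;
};

// Returns true when `a` is better than `b` under the chosen metric
// (lower latency wins, higher throughput wins).
bool better(const PerfResult& a, const PerfResult& b, Metric metric)
{
    return metric == Metric::Latency ? a.latency_ms < b.latency_ms : a.tflops > b.tflops;
}

int main()
{
    std::vector<PerfResult> results = {{"kernel_a", 0.42f, 95.0f}, {"kernel_b", 0.37f, 88.0f}};

    // Passing better(b, a, ...) as the "less-than" predicate mirrors the reversed
    // argument order used with PerformanceResult::compare above, so max_element
    // yields the best instance for the metric.
    auto best = *std::max_element(results.begin(), results.end(), [](const auto& a, const auto& b) {
        return better(b, a, Metric::Tflops);
    });

    std::printf("best by tflops: %s (%.1f TFLOPS)\n", best.name, best.tflops);
    return 0;
}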
+// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "gemm/gemm_benchmark.hpp" + +// Data types and Layouts are defined by the generated kernel headers +// No hardcoded type definitions here to avoid conflicts + +/// @brief Function to get the kernel output with reference implementation on CPU/GPU +void gemm_host_reference(int verify, + ck_tile::HostTensor& a_m_k, + ck_tile::HostTensor& b_k_n, + ck_tile::HostTensor& c_m_n_host_result, + ck_tile::DeviceMem& a_m_k_dev_buf, + ck_tile::DeviceMem& b_k_n_dev_buf, + ck_tile::index_t M, + ck_tile::index_t N, + ck_tile::index_t K, + ck_tile::index_t stride_A, + ck_tile::index_t stride_B, + ck_tile::index_t stride_C) +{ + if(verify == 1) + { + c_m_n_host_result.SetZero(); + + ck_tile::reference_gemm( + a_m_k, b_k_n, c_m_n_host_result); + } + else if(verify == 2) + { + if constexpr(std::is_same_v) + { + // Restore input for B for gpu reference + b_k_n_dev_buf.ToDevice(b_k_n.data()); + } + + ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_host_result.get_element_space_size_in_bytes()); + c_m_n_host_result.SetZero(); + c_m_n_gpu_buf_ref.SetZero(); + + ADataType* d_A = static_cast(a_m_k_dev_buf.GetDeviceBuffer()); + BDataType* d_B = static_cast(b_k_n_dev_buf.GetDeviceBuffer()); + CDataType* d_C = static_cast(c_m_n_gpu_buf_ref.GetDeviceBuffer()); + + ck_tile::reference_gemm_gpu(d_A, d_B, d_C, M, N, K, stride_A, stride_B, stride_C); + + c_m_n_gpu_buf_ref.FromDevice(c_m_n_host_result.data()); + } +} diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py new file mode 100755 index 00000000000..4fa5d5dee0e --- /dev/null +++ b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
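Conceptually, the verify==1 path in gemm_host_reference above checks the device result against a plain host GEMM. The sketch below is that naive definition only (row-major A, B and C, contiguous, float accumulation), as an illustration of what is being compared; it is not the ck_tile::reference_gemm implementation and ignores strides and mixed layouts:

#include <cstdio>
#include <vector>

// C[m][n] = sum_k A[m][k] * B[k][n]
void naive_gemm(const std::vector<float>& A, const std::vector<float>& B, std::vector<float>& C,
                int M, int N, int K)
{
    for(int m = 0; m < M; ++m)
        for(int n = 0; n < N; ++n)
        {
            float acc = 0.f; // accumulate in the AccDataType (float here)
            for(int k = 0; k < K; ++k)
                acc += A[m * K + k] * B[k * N + n];
            C[m * N + n] = acc;
        }
}

int main()
{
    const int M = 2, N = 2, K = 3;
    std::vector<float> A = {1, 2, 3, 4, 5, 6}; // 2x3
    std::vector<float> B = {1, 0, 0, 1, 1, 1}; // 3x2
    std::vector<float> C(M * N, 0.f);
    naive_gemm(A, B, C, M, N, K);
    std::printf("C = [%g %g; %g %g]\n", C[0], C[1], C[2], C[3]); // [4 5; 10 11]
    return 0;
}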
+# SPDX-License-Identifier: MIT + +import os +import sys +import json +import subprocess +import argparse +import csv +import time +import importlib.util +from pathlib import Path +from typing import List, Dict, Tuple, Optional + +def _import_gemm_benchmark(): + """Import validation utilities from commons directory.""" + current_dir = os.path.dirname(os.path.abspath(__file__)) + parent_dir = os.path.dirname(current_dir) + + # Load the module dynamically + spec = importlib.util.spec_from_file_location( + "gemm_benchmark", + os.path.join(parent_dir, "gemm_benchmark.py"), + ) + gemm_benchmark_module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(gemm_benchmark_module) + + return gemm_benchmark_module.GemmBenchmark + +def _import_benchmark_utils(): + """Import benchmark utilities from commons directory.""" + current_dir = os.path.dirname(os.path.abspath(__file__)) + parent_dir = os.path.dirname(os.path.dirname(current_dir)) + + # Load the module dynamically + spec = importlib.util.spec_from_file_location( + "benchmark_utils", + os.path.join(parent_dir, "common", "benchmark_utils.py"), + ) + benchmark_utils = importlib.util.module_from_spec(spec) + spec.loader.exec_module(benchmark_utils) + + return benchmark_utils + +GemmBenchmark = _import_gemm_benchmark() +benchmark_utils = _import_benchmark_utils() + +class GemmUniversalBenchmark(GemmBenchmark): + def __init__(self, build_dir: str, verbose: bool = False): + super().__init__(build_dir, verbose, name="benchmark_gemm_") + + +def main(): + parser = argparse.ArgumentParser(description="GEMM Kernel Benchmarking Tool") + parser.add_argument( + "build_dir", help="Build directory containing kernel executables" + ) + parser.add_argument( + "--problem-sizes", + nargs="+", + default=["1024,1024,1024", "2048,2048,2048", "4096,4096,4096"], + help="Problem sizes as M,N,K tuples", + ) + parser.add_argument( + "--split-k", nargs="+", type=int, default=[1], help="Split-K values to test" + ) + parser.add_argument("--verify", action="store_true", help="Enable verification") + parser.add_argument( + "--csv", default="gemm_benchmark_results.csv", help="CSV output filename" + ) + parser.add_argument( + "--best", default="best_kernels.txt", help="Best kernels output filename" + ) + parser.add_argument("--verbose", action="store_true", help="Verbose output") + parser.add_argument( + "--warmup", + type=int, + default=50, + help="Number of warmup iterations (default: 50)", + ) + parser.add_argument( + "--repeat", + type=int, + default=100, + help="Number of benchmark iterations (default: 100)", + ) + parser.add_argument( + "--flush-cache", + action="store_true", + default=True, + help="Enable cache flushing (default: True)", + ) + parser.add_argument( + "--rotating-count", + type=int, + default=1000, + help="Number of iterations to rotate cache (default: 1000)", + ) + parser.add_argument("--json", help="JSON output filename (optional)") + + args = parser.parse_args() + + # Parse problem sizes + problem_sizes = [] + for size_str in args.problem_sizes: + try: + m, n, k = map(int, size_str.split(",")) + problem_sizes.append((m, n, k)) + except ValueError: + print(f"Invalid problem size: {size_str}") + return 1 + + # Create benchmark instance + benchmark = GemmUniversalBenchmark(args.build_dir, verbose=args.verbose) + + # Run benchmark sweep + print("Starting GEMM kernel benchmark sweep...") + start_time = time.time() + + best_kernels = benchmark.benchmark_sweep( + problem_sizes=problem_sizes, + split_k_values=args.split_k, + verify=args.verify, + 
warmup=args.warmup, + repeat=args.repeat, + flush_cache=args.flush_cache, + rotating_count=args.rotating_count, + ) + + elapsed_time = time.time() - start_time + print(f"\nBenchmark completed in {elapsed_time:.2f} seconds") + + # Export results + benchmark_utils.export_csv(benchmark.results, args.csv) + benchmark_utils.export_best_kernels(best_kernels, args.best) + + # Export JSON if requested + if args.json: + benchmark_utils.export_json(benchmark.results, args.json, best_kernels) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark_single.cpp b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark_single.cpp new file mode 100644 index 00000000000..b2015f8571d --- /dev/null +++ b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark_single.cpp @@ -0,0 +1,102 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include +#include +#include +#include +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "gemm/gemm_common.hpp" +#include "gemm_universal_profiler.hpp" + +// The kernel header is included via the compile command line with -include flag +// It defines SelectedKernel struct and KERNEL_NAME + +void benchmark_single(const ck_tile::ArgParser& arg_parser) +{ + // Use DataTypeTraits to get the actual type names from the generated header + // The generated header defines ADataType, BDataType, AccDataType, CDataType + std::string dtype_a = ck_tile::DataTypeTraits::name; + std::string dtype_b = ck_tile::DataTypeTraits::name; + std::string dtype_acc = ck_tile::DataTypeTraits::name; + std::string dtype_c = ck_tile::DataTypeTraits::name; + + // Layout names from the layout types + std::string layout_a = ALayout::name; + std::string layout_b = BLayout::name; + std::string layout_c = CLayout::name; + + // Create GemmProblem struct + GemmProblem gemm_problem{arg_parser.get_int("split_k"), + arg_parser.get_int("m"), + arg_parser.get_int("n"), + arg_parser.get_int("k"), + arg_parser.get_int("stride_a"), + arg_parser.get_int("stride_b"), + arg_parser.get_int("stride_c"), + dtype_a, + dtype_b, + dtype_acc, + dtype_c, + layout_a, + layout_b, + layout_c, + arg_parser.get_bool("structured_sparsity")}; + + // Create Setting struct + Setting setting{arg_parser.get_int("warmup"), + arg_parser.get_int("repeat"), + arg_parser.get_bool("timer"), + arg_parser.get_int("verify"), + arg_parser.get_int("init"), + arg_parser.get_bool("log"), + arg_parser.get_str("csv_filename"), + arg_parser.get_bool("flush_cache"), + arg_parser.get_int("rotating_count"), + arg_parser.get_bool("json_output")}; + + // Get the profiler instance + auto& profiler = UniversalGemmProfiler::GemmProfiler::instance(setting); + + try + { + // Create a lambda that wraps the kernel launch + auto kernel_func = [](const ck_tile::GemmHostArgs& args, + const ck_tile::stream_config& stream) { + return SelectedKernel::launch(args, stream); + }; + + // Benchmark the kernel + profiler.benchmark(gemm_problem, kernel_func); + + // Select best instance based on metric + profiler.select_best_instance(static_cast(arg_parser.get_int("metric"))); + } + catch(const std::exception& e) + { + std::cerr << "Benchmark failed: " << e.what() << std::endl; + } +} + +int main(int argc, char* argv[]) +{ + try + { + auto [result, parser] = create_args(argc, argv); + if(!result) + return EXIT_FAILURE; + + benchmark_single(parser); + return 0; + } + catch(const 
std::exception& e) + { + std::cerr << "Error: " << e.what() << "\n"; + return EXIT_FAILURE; + } +} diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_universal_profiler.hpp b/tile_engine/ops/gemm/gemm_universal/gemm_universal_profiler.hpp new file mode 100644 index 00000000000..6cfdcab8009 --- /dev/null +++ b/tile_engine/ops/gemm/gemm_universal/gemm_universal_profiler.hpp @@ -0,0 +1,147 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include + +#include "ck_tile/host/device_prop.hpp" +#include "ck_tile/ops/gemm.hpp" +#include "gemm/gemm_benchmark.hpp" +#include "gemm/gemm_profiler.hpp" +#include "gemm_universal_benchmark.hpp" + +class UniversalGemmProfiler + : public GemmProfiler +{ + public: + using BaseGemm = GemmProfiler; + using BaseGemm::benchmark; + + UniversalGemmProfiler(Setting setting) + : GemmProfiler(setting) + { + } + + void benchmark(GemmProblem& gemm_problem, + std::vector( + ck_tile::GemmHostArgs&, const ck_tile::stream_config&)>>& callables) override + { + const ALayout layout_a = ALayout{}; + const BLayout layout_b = BLayout{}; + const CLayout layout_c = CLayout{}; + + gemm_problem.stride_a_ = ck_tile::get_default_stride( + gemm_problem.m_, gemm_problem.k_, gemm_problem.stride_a_, is_row_major(layout_a)); + gemm_problem.stride_b_ = ck_tile::get_default_stride( + gemm_problem.k_, gemm_problem.n_, gemm_problem.stride_b_, is_row_major(layout_b)); + gemm_problem.stride_c_ = ck_tile::get_default_stride( + gemm_problem.m_, gemm_problem.n_, gemm_problem.stride_c_, is_row_major(layout_c)); + + ck_tile::HostTensor a_m_k(ck_tile::host_tensor_descriptor( + gemm_problem.m_, gemm_problem.k_, gemm_problem.stride_a_, is_row_major(layout_a))); + ck_tile::HostTensor b_k_n(ck_tile::host_tensor_descriptor( + gemm_problem.k_, gemm_problem.n_, gemm_problem.stride_b_, is_row_major(layout_b))); + ck_tile::HostTensor c_m_n_dev_result(ck_tile::host_tensor_descriptor( + gemm_problem.m_, gemm_problem.n_, gemm_problem.stride_c_, is_row_major(layout_c))); + + if(setting_.init_method_ == 0) + { + ck_tile::FillUniformDistribution{-1.f, 1.f}(a_m_k); + ck_tile::FillUniformDistribution{-1.f, 1.f}(b_k_n); + } + else if(setting_.init_method_ == 1) + { + ck_tile::FillMonotonicSeq{}(a_m_k); + ck_tile::FillMonotonicSeq{}(b_k_n); + } + else if(setting_.init_method_ == 2) + { + ck_tile::FillConstant{static_cast(1)}(a_m_k); + ck_tile::FillConstant{static_cast(1)}(b_k_n); + } + else + { + a_m_k.SetZero(); + b_k_n.SetZero(); + } + + if(gemm_problem.structured_sparsity_) + { + ck_tile::AdjustToStructuredSparsity{}(a_m_k); + } + + ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes()); + ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes()); + ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes()); + + if constexpr(std::is_same_v) + { + // Permute vector pk_i4x4 data for device implementation + ck_tile::HostTensor b_k_n_dev = b_k_n; + // permute_tensor_b(b_k_n_dev); + ck_tile::permute_vectors_i4x4_b(b_k_n_dev); + b_k_n_dev_buf.ToDevice(b_k_n_dev.data()); + } + else + { + b_k_n_dev_buf.ToDevice(b_k_n.data()); + } + + a_m_k_dev_buf.ToDevice(a_m_k.data()); + c_m_n_dev_buf.SetZero(); + c_m_n_dev_result.SetZero(); + + ck_tile::GemmHostArgs gemm_args = { + a_m_k_dev_buf.GetDeviceBuffer(), + b_k_n_dev_buf.GetDeviceBuffer(), + c_m_n_dev_buf.GetDeviceBuffer(), + gemm_problem.split_k_, + gemm_problem.m_, + gemm_problem.n_, + gemm_problem.k_, + 
gemm_problem.stride_a_,
+            gemm_problem.stride_b_,
+            gemm_problem.stride_c_,
+        };
+
+        ck_tile::HostTensor<CDataType> c_m_n_host_result(ck_tile::host_tensor_descriptor(
+            gemm_problem.m_, gemm_problem.n_, gemm_problem.stride_c_, is_row_major(layout_c)));
+
+        if(setting_.verify_)
+        {
+            gemm_host_reference(setting_.verify_,
+                                a_m_k,
+                                b_k_n,
+                                c_m_n_host_result,
+                                a_m_k_dev_buf,
+                                b_k_n_dev_buf,
+                                gemm_problem.m_,
+                                gemm_problem.n_,
+                                gemm_problem.k_,
+                                gemm_problem.stride_a_,
+                                gemm_problem.stride_b_,
+                                gemm_problem.stride_c_);
+        }
+
+        for(auto& callable : callables)
+        {
+            auto kernel_run_result = callable(gemm_args,
+                                              ck_tile::stream_config{nullptr,
+                                                                     true,
+                                                                     setting_.log_,
+                                                                     setting_.n_warmup_,
+                                                                     setting_.n_repeat_,
+                                                                     setting_.is_gpu_timer_,
+                                                                     setting_.flush_cache_,
+                                                                     setting_.rotating_count_});
+            process_result(gemm_problem,
+                           c_m_n_dev_buf,
+                           c_m_n_host_result,
+                           c_m_n_dev_result,
+                           kernel_run_result);
+        }
+    }
+};
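The derived profiler above overrides only the vector-of-callables benchmark() and re-exposes the base class's single-kernel convenience overload with `using BaseGemm::benchmark;`, since an override in the derived class would otherwise hide every base overload. A minimal standalone sketch of that name-hiding issue and the using-declaration fix, with simplified stand-in types rather than the project's GemmProfiler:

#include <cstdio>
#include <functional>
#include <utility>
#include <vector>

struct Problem { int m, n, k; };
using Callable = std::function<float(const Problem&)>;

class BaseProfiler
{
    public:
    virtual ~BaseProfiler() = default;

    // Convenience overload: wraps a single kernel launcher into the vector form.
    void benchmark(const Problem& p, Callable kernel)
    {
        std::vector<Callable> callables{std::move(kernel)};
        benchmark(p, callables); // dispatches to the derived override
    }

    virtual void benchmark(const Problem& p, std::vector<Callable>& callables) = 0;
};

class UniversalProfiler : public BaseProfiler
{
    public:
    using BaseProfiler::benchmark; // keep the single-kernel overload visible

    void benchmark(const Problem& p, std::vector<Callable>& callables) override
    {
        for(auto& c : callables)
            std::printf("m=%d n=%d k=%d -> %.3f ms\n", p.m, p.n, p.k, c(p));
    }
};

int main()
{
    UniversalProfiler profiler;
    // Without the using-declaration this call would not compile: the derived
    // override hides the base-class benchmark() overloads.
    profiler.benchmark(Problem{1024, 1024, 1024}, [](const Problem&) { return 0.42f; });
    return 0;
}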