diff --git a/Jenkinsfile b/Jenkinsfile index e8ce97780d7..3f4b0f2e422 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1623,7 +1623,7 @@ pipeline { -D GEMM_PRESHUFFLE_LAYOUT="rcr" \ -D GEMM_PRESHUFFLE_CONFIG_FILE="default_ci_config.json" .. && \ ninja -j${nthreads()} benchmark_gemm_universal_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all && \ - python3 ../tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \ + python3 ../tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \ python3 ../tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \ python3 ../tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """ } @@ -1664,7 +1664,7 @@ pipeline { -D GEMM_PRESHUFFLE_DATATYPE="fp16;fp8;bf16;bf8" \ -D GEMM_PRESHUFFLE_LAYOUT="rcr" .. && \ ninja -j${nthreads()} benchmark_gemm_universal_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all benchmark_gemm_streamk_all && \ - python3 ../tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \ + python3 ../tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \ python3 ../tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \ python3 ../tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """ } @@ -1689,7 +1689,7 @@ pipeline { -D GEMM_UNIVERSAL_DATATYPE="fp16" \ -D GEMM_UNIVERSAL_LAYOUT="rcr;rrr;crr;ccr" .. && \ ninja -j${nthreads()} benchmark_gemm_universal_all && \ - python3 ../tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """ + python3 ../tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py . 
--problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """ } steps{ buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) diff --git a/test/ck_tile/gemm_tile_engine/CMakeLists.txt b/test/ck_tile/gemm_tile_engine/CMakeLists.txt index 33effcc1206..dc148d45e78 100644 --- a/test/ck_tile/gemm_tile_engine/CMakeLists.txt +++ b/test/ck_tile/gemm_tile_engine/CMakeLists.txt @@ -10,7 +10,7 @@ # ============================================================================ # Locate tile_engine GEMM scripts directory -set(TILE_ENGINE_GEMM_DIR "${PROJECT_SOURCE_DIR}/tile_engine/ops/gemm") +set(TILE_ENGINE_GEMM_DIR "${PROJECT_SOURCE_DIR}/tile_engine/ops/gemm/gemm_universal") if(NOT EXISTS ${TILE_ENGINE_GEMM_DIR}) message(WARNING "Tile engine directory not found: ${TILE_ENGINE_GEMM_DIR}") @@ -32,11 +32,11 @@ endif() # config_json - Full path to JSON configuration file # ============================================================================ function(create_individual_gemm_test_target datatype layout config_name trait tile_config config_json) - set(target_name "test_gemm_tile_engine_${datatype}_${layout}_${config_name}_${trait}_${tile_config}") + set(target_name "test_gemm_universal_tile_engine_${datatype}_${layout}_${config_name}_${trait}_${tile_config}") set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${layout}/${config_name}") # Generated header path (already created during cmake configuration) - set(test_header "${working_path}/gemm_single_${datatype}_${layout}_${trait}_${tile_config}.hpp") + set(test_header "${working_path}/gemm_universal_single_${datatype}_${layout}_${trait}_${tile_config}.hpp") set(test_params_header "${working_path}/test_params.hpp") # Verify header exists (should have been generated during cmake configuration) @@ -118,7 +118,7 @@ function(build_gemm_test_targets datatype layout config_name) # STEP 1: Discovery phase - list all valid kernel configurations execute_process( - COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_instance_builder.py + COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_universal_instance_builder.py --working_path ${working_path} --datatype ${datatype} --layout ${layout} @@ -178,7 +178,7 @@ function(build_gemm_test_targets datatype layout config_name) # Generate header using --gen_single execute_process( - COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_instance_builder.py + COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_universal_instance_builder.py --working_path ${working_path} --gpu_target "${GEMM_TEST_GPU_TARGETS}" --datatype ${datatype} diff --git a/tile_engine/CMakeLists.txt b/tile_engine/CMakeLists.txt index b9dc3201282..0bb885bc35f 100644 --- a/tile_engine/CMakeLists.txt +++ b/tile_engine/CMakeLists.txt @@ -3,6 +3,7 @@ include_directories(BEFORE ${CMAKE_CURRENT_LIST_DIR}/include + ${CMAKE_CURRENT_LIST_DIR}/ops ) add_subdirectory(ops/gemm) diff --git a/tile_engine/ops/common/__init__.py b/tile_engine/ops/common/__init__.py new file mode 100644 index 00000000000..1df48571843 --- /dev/null +++ b/tile_engine/ops/common/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+# SPDX-License-Identifier: MIT diff --git a/tile_engine/ops/common/benchmark_utils.py b/tile_engine/ops/common/benchmark_utils.py new file mode 100644 index 00000000000..0c158fa48cd --- /dev/null +++ b/tile_engine/ops/common/benchmark_utils.py @@ -0,0 +1,285 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +import sys +import json +import subprocess +import argparse +import csv +import time +from pathlib import Path +from typing import List, Dict, Tuple, Optional + + +def run_kernel(build_dir: Path, kernel_path: Path, params: Dict[str, str], verbose: bool = False) -> Optional[Dict]: + """Run a single kernel with given parameters and save output to individual JSON file""" + # Create results directory + results_dir = build_dir / "results" + results_dir.mkdir(exist_ok=True) + + # Generate unique JSON filename for this kernel + json_file = results_dir / f"{kernel_path.stem}.json" + + cmd = [str(kernel_path)] + + # Add parameters + for key, value in params.items(): + cmd.append(f"-{key}={value}") + + # Add JSON output flag for clean JSON output + cmd.append("-json_output=true") + + if verbose: + print(f"Running: {' '.join(cmd)}") + + try: + result = subprocess.run(cmd, capture_output=True, text=True, timeout=60) + + if result.returncode != 0: + print(f"Error running {kernel_path.name}: {result.stderr}") + return None + + # Save raw output to individual JSON file + output = result.stdout.strip() + if output: + with open(json_file, "w") as f: + f.write(output) + + # Parse the JSON file + return parse_json_file(json_file, verbose=verbose) + else: + print(f"No output from {kernel_path.name}") + return None + + except subprocess.TimeoutExpired: + print(f"Timeout running {kernel_path.name}") + return None + except Exception as e: + print(f"Error running {kernel_path.name}: {e}") + return None + +def parse_json_file(json_file: Path, verbose: bool = False) -> Optional[Dict]: + """Parse JSON data from individual kernel output file""" + try: + with open(json_file, "r") as f: + content = f.read().strip() + + # Parse the JSON directly since executables produce clean JSON + data = json.loads(content) + + # Return the complete JSON data as-is, just add some convenience fields + result = data.copy() + if "perf_result" in data: + perf = data["perf_result"] + # Add convenience fields for backward compatibility + result["time_ms"] = perf.get("latency(ms)", 0) + result["tflops"] = perf.get("tflops(TFlops)", 0) + result["bandwidth_gb_s"] = perf.get("bandwidth(GB/s)", 0) + + return result + + except json.JSONDecodeError as e: + if verbose: + print(f"Failed to parse JSON from {json_file}: {e}") + return None + except Exception as e: + if verbose: + print(f"Error reading JSON file {json_file}: {e}") + return None + +def find_best_kernel( + results: List[Dict], metric: str = "tflops" +) -> Optional[Dict]: + """Find the best performing kernel based on metric""" + if not results: + return None + + if metric == "tflops": + return max(results, key=lambda x: x.get("tflops", 0)) + elif metric == "time_ms": + return min(results, key=lambda x: x.get("time_ms", float("inf"))) + elif metric == "bandwidth_gb_s": + return max(results, key=lambda x: x.get("bandwidth_gb_s", 0)) + else: + raise ValueError(f"Unknown metric: {metric}") + + +def export_csv(results: List[Dict], filename: str, verbose: bool = False): + """Export all results to CSV""" + if not results: + print("No results to export") + return + + # Get all unique keys from results + all_keys = 
set() + for result in results: + all_keys.update(result.keys()) + + # Sort keys for consistent output + fieldnames = sorted(all_keys) + + with open(filename, "w", newline="") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(results) + + print(f"Results exported to {filename}") + +def export_best_kernels( best_kernels: Dict, filename: str, verbose: bool = False): + """Export best kernel selections to file""" + with open(filename, "w") as f: + f.write("# Best kernel selections\n") + f.write( + "# Format: problem_size -> kernel_name (TFLOPS, bandwidth, latency)\n\n" + ) + + for key, kernel in sorted(best_kernels.items()): + f.write( + f"{key}: {kernel['name']} ({kernel['tflops']:.2f} TFLOPS, {kernel['bandwidth_gb_s']:.2f} GB/s, {kernel['time_ms']:.2f}ms)\n" + ) + + print(f"Best kernels exported to {filename}") + +def export_json(results: List[Dict], filename: str, best_kernels: Dict = None, verbose: bool = False): + """Export all results and best kernels to JSON with comprehensive metadata""" + from datetime import datetime + + # Calculate comprehensive summary statistics for all metrics + successful_results = [r for r in results if r.get("tflops", 0) > 0] + + tflops_values = [r.get("tflops", 0) for r in successful_results] + bandwidth_values = [r.get("bandwidth_gb_s", 0) for r in successful_results] + latency_values = [ + r.get("time_ms", 0) for r in successful_results if r.get("time_ms", 0) > 0 + ] + + # Performance breakdown by kernel type + pipeline_stats = {} + scheduler_stats = {} + data_type_stats = {} + + for result in successful_results: + # Get config info from the new structure + config = result.get("config", {}) + + # Pipeline statistics + pipeline = config.get("pipeline", "unknown") + if pipeline not in pipeline_stats: + pipeline_stats[pipeline] = { + "count": 0, + "avg_tflops": 0, + "best_tflops": 0, + } + pipeline_stats[pipeline]["count"] += 1 + pipeline_stats[pipeline]["best_tflops"] = max( + pipeline_stats[pipeline]["best_tflops"], result.get("tflops", 0) + ) + + # Scheduler statistics + scheduler = config.get("scheduler", "unknown") + if scheduler not in scheduler_stats: + scheduler_stats[scheduler] = { + "count": 0, + "avg_tflops": 0, + "best_tflops": 0, + } + scheduler_stats[scheduler]["count"] += 1 + scheduler_stats[scheduler]["best_tflops"] = max( + scheduler_stats[scheduler]["best_tflops"], result.get("tflops", 0) + ) + + # Data type statistics + data_type = config.get("data_type", "unknown") + if data_type not in data_type_stats: + data_type_stats[data_type] = { + "count": 0, + "avg_tflops": 0, + "best_tflops": 0, + } + data_type_stats[data_type]["count"] += 1 + data_type_stats[data_type]["best_tflops"] = max( + data_type_stats[data_type]["best_tflops"], result.get("tflops", 0) + ) + + # Calculate averages for breakdown stats + for stats_dict, field_name in [ + (pipeline_stats, "pipeline"), + (scheduler_stats, "scheduler"), + (data_type_stats, "data_type"), + ]: + for key in stats_dict: + relevant_results = [ + r + for r in successful_results + if r.get("config", {}).get(field_name, "unknown") == key + ] + if relevant_results: + stats_dict[key]["avg_tflops"] = sum( + r.get("tflops", 0) for r in relevant_results + ) / len(relevant_results) + + output_data = { + "benchmark_metadata": { + "timestamp": datetime.now().isoformat(), + "total_kernels_tested": len(results), + "unique_kernels": len( + set(r.get("name", "unknown") for r in results) + ), + "successful_runs": len(successful_results), + "failed_runs": 
len(results) - len(successful_results), + }, + "performance_summary": { + "tflops_stats": { + "best": max(tflops_values, default=0), + "average": sum(tflops_values) / len(tflops_values) + if tflops_values + else 0, + "min": min(tflops_values, default=0), + "median": sorted(tflops_values)[len(tflops_values) // 2] + if tflops_values + else 0, + }, + "bandwidth_stats": { + "best_gb_s": max(bandwidth_values, default=0), + "average_gb_s": sum(bandwidth_values) / len(bandwidth_values) + if bandwidth_values + else 0, + "min_gb_s": min(bandwidth_values, default=0), + "median_gb_s": sorted(bandwidth_values)[len(bandwidth_values) // 2] + if bandwidth_values + else 0, + }, + "latency_stats": { + "best_ms": min(latency_values, default=0), + "average_ms": sum(latency_values) / len(latency_values) + if latency_values + else 0, + "max_ms": max(latency_values, default=0), + "median_ms": sorted(latency_values)[len(latency_values) // 2] + if latency_values + else 0, + }, + "kernel_type_breakdown": { + "by_pipeline": pipeline_stats, + "by_scheduler": scheduler_stats, + "by_data_type": data_type_stats, + }, + "total_problem_configurations": len(best_kernels) + if best_kernels + else 0, + }, + "kernel_results": results, + "best_kernels_by_problem": best_kernels or {}, + } + + with open(filename, "w") as f: + json.dump(output_data, f, indent=2) + + print(f"JSON results exported to {filename}") + print(f" - Total kernels: {len(results)}") + print(f" - Successful runs: {len(successful_results)}") + print(f" - Best TFLOPS: {max(tflops_values, default=0):.2f}") + print(f" - Best bandwidth: {max(bandwidth_values, default=0):.2f} GB/s") + print(f" - Best latency: {min(latency_values, default=0):.2f}ms") + diff --git a/tile_engine/ops/common/utils.hpp b/tile_engine/ops/common/utils.hpp new file mode 100644 index 00000000000..56bfbde5a07 --- /dev/null +++ b/tile_engine/ops/common/utils.hpp @@ -0,0 +1,171 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" + +// Helper function to determine if a layout is row-major +template +constexpr auto is_row_major(Layout) +{ + return ck_tile::bool_constant>{}; +} + +enum class Metric +{ + LATENCY = 0, + TFLOPS = 1, + BANDWIDTH = 2 +}; + +inline constexpr auto get_metric_name(Metric m) +{ + switch(m) + { + case Metric::LATENCY: return "latency"; + case Metric::TFLOPS: return "tflops"; + case Metric::BANDWIDTH: return "bandwidth"; + default: throw std::invalid_argument("Unsupported metric type"); + } +} + +struct PerformanceResult +{ + double latency_; + double tflops_; + double bandwidth_; + + static bool compare(const PerformanceResult& a, const PerformanceResult& b, Metric m) + { + switch(m) + { + case Metric::LATENCY: return a.latency_ < b.latency_; + case Metric::TFLOPS: return a.tflops_ > b.tflops_; + case Metric::BANDWIDTH: return a.bandwidth_ > b.bandwidth_; + default: throw std::invalid_argument("Unsupported metric type"); + } + } + + friend std::ostream& operator<<(std::ostream& os, const PerformanceResult& result) + { + os << "{\n" + << " \"latency(ms)\": " << std::fixed << std::setprecision(2) << result.latency_ + << ",\n" + << " \"tflops(TFlops)\": " << result.tflops_ << ",\n" + << " \"bandwidth(GB/s)\": " << result.bandwidth_ << "\n" + << "}"; + return os; + } +}; + +template +struct KernelInstance +{ + std::string name_; + Problem problem_; + PerformanceResult perf_result_; + + static bool compare(const KernelInstance& a, const KernelInstance& b, Metric m) + { + return PerformanceResult::compare(a.perf_result_, b.perf_result_, m); + } + + friend std::ostream& operator<<(std::ostream& os, const KernelInstance& obj) + { + os << "{\n" + << " \"name\": \"" << obj.name_ << "\",\n" + << " \"problem\": " << obj.problem_ << ",\n" + << " \"perf_result\": " << obj.perf_result_ << "\n" + << "}"; + return os; + } +}; + +struct Setting +{ + int n_warmup_; + int n_repeat_; + bool is_gpu_timer_; + int verify_; + int init_method_; + bool log_; + std::string csv_filename_; + bool flush_cache_; + int rotating_count_; + bool json_output_; +}; + +inline std::string get_rocm_version() +{ + std::ifstream version_file("/opt/rocm/.info/version"); + if(version_file.is_open()) + { + std::string version; + std::getline(version_file, version); + return version; + } + return "Unknown"; +} + +template +auto calculate_rtol_atol(const ck_tile::index_t K, + const ck_tile::index_t kbatch, + const float max_accumulated_value) +{ + using ComputeType = + std::conditional_t; + // Calculate thresholds + const auto rtol = ck_tile::get_relative_threshold( + ck_tile::integer_divide_ceil(K, kbatch)); + const auto atol = ck_tile::get_absolute_threshold( + max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch)); + // Calculate error due to split_k accumulation + const auto rtol_split_k = + ck_tile::get_relative_threshold(kbatch); + const auto atol_split_k = ck_tile::get_absolute_threshold( + max_accumulated_value, kbatch); + // Use higher threshold + return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k)); +} + +template +auto calculate_rtol_atol(const ck_tile::index_t K, + const ck_tile::index_t kbatch, + const float max_accumulated_value) +{ + using ComputeTypeAB = + std::conditional_t; + + using ComputeType = + std::conditional_t; + + // Calculate thresholds + const auto rtol = 
ck_tile::get_relative_threshold( + ck_tile::integer_divide_ceil(K, kbatch)); + + const auto atol = ck_tile::get_absolute_threshold( + max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch)); + + // Calculate error due to split_k accumulation + const auto rtol_split_k = + ck_tile::get_relative_threshold(kbatch); + + const auto atol_split_k = ck_tile::get_absolute_threshold( + max_accumulated_value, kbatch); + + // Use higher threshold + return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k)); +} diff --git a/tile_engine/ops/gemm/README.md b/tile_engine/ops/gemm/README.md new file mode 100644 index 00000000000..5e0bae70806 --- /dev/null +++ b/tile_engine/ops/gemm/README.md @@ -0,0 +1,442 @@ +# CK Tile Engine GEMM Operations + +## Overview + +The CK Tile Engine GEMM module provides a comprehensive system for generating, building, and benchmarking GEMM (General Matrix Multiplication) kernels with various configurations. It supports multiple data types, layouts, and optimization strategies. The system has evolved from a monolithic build approach (where all kernels compile into a single executable) to a more flexible individual kernel compilation system, providing better build parallelism and targeted testing capabilities. + +## Table of Contents + +1. [Build System Architecture](#build-system-architecture) +2. [Build Instructions](#build-instructions) +3. [Running Benchmarks](#running-benchmarks) +4. [Configuration System](#configuration-system) +5. [Scripts and Tools](#scripts-and-tools) +6. [Command Line Options](#command-line-options) +7. [Understanding Kernel Names](#understanding-kernel-names) +8. [Troubleshooting](#troubleshooting) +9. [Performance Tips](#performance-tips) + +## Build System Architecture + +### Individual Kernel Compilation (New Approach) + +The new tile engine benchmark system compiles each kernel configuration into a separate executable. This provides: +- Better build parallelism +- Faster incremental builds +- More targeted testing +- Easier debugging of specific configurations + +Each benchmark executable follows the naming pattern: +``` +benchmark_gemm____ +``` + +### Monolithic Build (Legacy Approach) + +The original system compiles all kernels into a single executable (`benchmark_gemm_[Datatype]_[Layout]`), which can then be filtered at runtime using command-line arguments. + +## Build Instructions + +### Prerequisites +- ROCm installation +- CMake 3.16 or higher +- C++17 compatible compiler + +### Basic Build + +```bash +# In the root of composable kernel, create build directory +mkdir build && cd build + +# Configure with specific datatypes and layouts +# Replace [Arch] with your GPU architecture (e.g., gfx90a, gfx942) +# Replace [Datatype1;Datatype2;...] with datatypes (fp8, bf8, int8, fp16, bf16, fp32, fp64) +# Replace [Layout1;Layout2;...] with layouts (rcr, rrr, crr, ccr) +../script/cmake-ck-dev.sh ../ [Arch] -DGEMM_DATATYPE="[Datatype1;Datatype2]" -DGEMM_LAYOUT="[Layout1;Layout2]" + +# Build specific benchmarks +make benchmark_gemm_[Datatype1]_[Layout1] -j +``` + +### Configuration Options + +The build system supports several configuration options: + +#### Using Custom Config Files +```bash +# Method 1: CMake variable (config file must be in configs/ directory) +cmake -DGEMM_CONFIG_FILE=my_custom_config.json ... + +# Method 2: Environment variable (takes precedence over CMake variable) +export GEMM_CONFIG_FILE=my_custom_config.json +cmake ... +``` + +#### Config File Priority Order +1. 
**Environment variable** `GEMM_CONFIG_FILE` (highest priority) +2. **CMake variable** `GEMM_CONFIG_FILE` +3. **Default config** (default_config.json for all layouts) + +**Note**: All custom config files must be placed in the `tile_engine/ops/gemm/configs/` directory. + +### Example Build Commands + +```bash +# Build for gfx942 with fp8 and fp16 datatypes, rcr layout +mkdir build && cd build +../script/cmake-ck-dev.sh ../ gfx942 -DGEMM_DATATYPE="fp8;fp16" -DGEMM_LAYOUT="rcr;ccr;rrr;crr" +make benchmark_gemm_universal_fp8_rcr -j +make benchmark_gemm_universal_fp16_rcr -j +``` + +### Building Individual Kernels + +```bash +# Build a specific kernel configuration +make benchmark_gemm_universal_fp8_rcr_compv4_default_intrawave_False_False_False_False_256x256x32_1x4x1_32x32x32 + +# Build all fp16 benchmarks in parallel +make -j$(nproc) $(make help | grep benchmark_gemm_fp16 | awk '{print $2}') +``` + +### Rebuilding After Configuration Changes + +If you modify the configuration file, you must rebuild: +```bash +rm -rf tile_engine/ && make benchmark_gemm_universal_[Datatype]_[Layout] -j +``` + +## Running Benchmarks + +### Individual Kernel Execution + +```bash +cd /path/to/build/directory +./bin/benchmark_gemm_universal_fp16_rcr_compv3_default_intrawave_False_False_False_False_256x128x32_4x1x1_32x32x16 \ + -m=512 -n=512 -k=512 -verify=1 +``` + +### Monolithic Executable (Legacy) + +```bash +# Run specific pipeline/scheduler/epilogue combination +./bin/benchmark_gemm_universal_[Datatype]_[Layout] -pipeline=compv3 -scheduler=intrawave -epilogue=default +``` + +### Automated Testing + +Use the provided test script to run multiple benchmarks: +```bash +cd /path/to/composable_kernel/tile_engine/ops/gemm +./test_benchmark.sh [build_directory] +``` + +## Configuration System + +### Configuration Files + +The system uses JSON configuration files to specify kernel parameters: + +- `configs/default_config.json` - Default configurations for various datatypes +- `configs/user_provided_config.json` - User-customizable configurations + +### Configuration Structure + +```json +{ + "tile_config": { + "tile_m": {"values": [256, 128]}, + "tile_n": {"values": [256, 128]}, + "tile_k": {"values": [64, 32]}, + "warp_m": {"values": [2, 4]}, + "warp_n": {"values": [2, 1]}, + "warp_k": {"values": [1]}, + "warp_tile_m": {"values": [32, 16]}, + "warp_tile_n": {"values": [32, 16]}, + "warp_tile_k": {"values": [16, 32]} + }, + "trait_config": { + "pipeline": {"values": ["compv3", "compv4", "mem"]}, + "scheduler": {"values": ["intrawave", "interwave"]}, + "epilogue": {"values": ["default", "cshuffle"]}, + "pad_m": {"values": [false]}, + "pad_n": {"values": [false]}, + "pad_k": {"values": [false]}, + "persistent": {"values": [false]} + } +} +``` + +## Scripts and Tools + +### Python Scripts + +#### gemm_universal_instance_builder.py +**Purpose**: Main kernel instance generation script that creates C++ kernel implementations based on configuration files. 
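+
+As a rough sketch of the end-to-end flow (the concrete names below are taken from the examples elsewhere in this README and from the test CMake integration; treat them as illustrative, not exhaustive): each valid combination of datatype, layout, trait, and tile configuration from the JSON config expands into one generated header and one benchmark target.
+
+```bash
+# Generated header, following the pattern used by the test CMake integration:
+#   gemm_universal_single_<datatype>_<layout>_<trait>_<tile_config>.hpp
+#
+# Matching benchmark executable for one hypothetical configuration
+# (fp16, rcr, compv3 pipeline, default epilogue, intrawave scheduler,
+#  no padding, non-persistent, 256x128x32 block tile, 4x1x1 warps, 32x32x16 warp tile):
+make benchmark_gemm_universal_fp16_rcr_compv3_default_intrawave_False_False_False_False_256x128x32_4x1x1_32x32x16 -j
+```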
+ +**Key Features**: +- Generates individual kernel header files for separate compilation +- Supports multiple data types (fp16, fp8, bf16, fp32, fp64) +- Validates tile configurations for correctness +- Creates CMake integration files + +**Usage**: +```bash +python gemm_universal_instance_builder.py \ + --working_path ./generated \ + --datatype fp16 \ + --layout rcr \ + --config_json configs/user_provided_config.json \ + --gen_all_individual +``` + +#### gemm_instance_builder_parallel.py +**Purpose**: Parallel version of the instance builder for faster generation of multiple kernel configurations. + +**Features**: +- Multi-threaded kernel generation +- Improved performance for large configuration spaces + +#### validation_utils.py +**Purpose**: Provides comprehensive validation functions for kernel configurations. + +**Key Functions**: +- `is_tile_config_valid()` - Validates tile dimensions and alignments +- `is_trait_combination_valid()` - Checks if pipeline/epilogue/scheduler combinations are supported +- `validate_warp_tile_combination()` - GPU-specific warp tile validation +- `validate_lds_capacity()` - Ensures configurations fit in LDS memory + +**Validation Checks**: +- Dimension alignment (tile dimensions must be divisible by warp dimensions) +- LDS capacity constraints +- GPU-specific warp tile support +- Unsupported trait combinations + +#### test_validation.py +**Purpose**: Test suite for the validation logic to ensure correctness. + +**Usage**: +```bash +python test_validation.py +``` + +**Tests**: +- Warp tile combination validation +- Trait combination validation +- Full tile configuration validation + +#### gemm_universal_benchmark.py +**Purpose**: Python script for running and analyzing GEMM benchmarks. + +**Features**: +- Automated benchmark execution +- Performance data collection +- Result analysis and reporting + +#### json_config.py +**Purpose**: Configuration file parsing and management. + +**Features**: +- JSON configuration loading +- Default configuration handling +- Configuration validation + +#### codegen_utils.py +**Purpose**: Utility functions for code generation. + +**Features**: +- Template processing +- Code formatting utilities +- File generation helpers + +### Shell Scripts + +#### test_benchmark.sh +**Purpose**: Automated benchmark testing script that finds and runs all built benchmark executables. + +**Features**: +- Automatic build directory detection +- Batch execution of multiple benchmarks +- CSV result collection +- Colored output for easy reading +- Example command generation + +**Usage**: +```bash +# Auto-detect build directory +./test_benchmark.sh + +# Specify build directory +./test_benchmark.sh /path/to/build/directory +``` + +**What it does**: +1. Finds all benchmark executables in the build directory +2. Runs each with multiple problem sizes (512, 1024, 2048) +3. Performs GPU verification +4. Saves results to timestamped CSV file +5. 
Provides summary statistics + +## Command Line Options + +All benchmark executables support the following options: + +### Matrix Dimensions +- `-m=` - M dimension (default: 3840) +- `-n=` - N dimension (default: 4096) +- `-k=` - K dimension (default: 2048) + +### Strides +- `-stride_a=` - Stride for matrix A (default: 0, auto-calculated) +- `-stride_b=` - Stride for matrix B (default: 0, auto-calculated) +- `-stride_c=` - Stride for matrix C (default: 0, auto-calculated) + +### Verification +- `-verify=<0|1|2>` - Verification mode + - 0: No verification (default) + - 1: CPU verification + - 2: GPU verification + +### Performance Testing +- `-warmup=` - Warmup iterations (default: 50) +- `-repeat=` - Benchmark iterations (default: 100) +- `-timer=` - Use GPU timer (default: true) +- `-flush_cache=` - Flush cache between runs (default: true) +- `-rotating_count=` - Cache rotation count (default: 1000) + +### Initialization +- `-init=<0|1|2>` - Tensor initialization method + - 0: Random values [-1, 1] (default) + - 1: Linear sequence (i % 17) + - 2: Constant value (1.0) + +### Output Options +- `-log=` - Enable verbose logging (default: false) +- `-metric=<0|1|2>` - Performance metric + - 0: Latency in ms (default) + - 1: TFLOPS + - 2: Bandwidth in GB/s +- `-json_output=` - JSON format output (default: false) +- `-csv_filename=` - Save results to CSV +- `-csv_format=` - CSV format (default: comprehensive) + +### Advanced Options +- `-split_k=` - Split-K factor (default: 1) +- `-structured_sparsity=` - Enable structured sparsity (default: false) +- `-pipeline=` - Pipeline type (default: compv3) +- `-scheduler=` - Scheduler type (default: intrawave) +- `-epilogue=` - Epilogue type (default: cshuffle) +- `-pad_m=` - Pad M dimension (default: false) +- `-pad_n=` - Pad N dimension (default: false) +- `-pad_k=` - Pad K dimension (default: false) +- `-persistent=` - Use persistent kernel (default: false) + +## Understanding Kernel Names + +The kernel naming convention encodes the configuration: + +``` +benchmark_gemm_universal_fp16_rcr_compv3_default_intrawave_False_False_False_False_256x128x32_4x1x1_32x32x16 + ^^^^ ^^^ ^^^^^^ ^^^^^^^ ^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^ ^^^^^^^ ^^^^^^^^^ + | | | | | | | | | + | | | | | Padding & flags | | Warp tile + | | | | Scheduler | Thread tile + | | | Epilogue Block tile + | | Pipeline + | Layout (Row-Column-Row) + Data type +``` + +### Components: +- **Data type**: fp16, fp32, bf16, fp8, bf8, int8 +- **Layout**: rcr (Row-Column-Row), rrr, crr, ccr +- **Pipeline**: mem, compv3, compv4 +- **Epilogue**: default, cshuffle +- **Scheduler**: intrawave, interwave +- **Flags**: pad_m, pad_n, pad_k, persistent (4 boolean flags) +- **Tile sizes**: BlockTile x ThreadTile x WarpTile + +## Troubleshooting + +### Common Issues + +1. **Kernel not found** + - Ensure the specific benchmark executable is built + - Check the build directory bin/ folder + +2. **Verification failures** + - Try GPU verification (-verify=2) which may be more accurate + - Check data type compatibility + - Verify stride calculations + +3. **Build failures** + - Check GPU architecture compatibility + - Ensure ROCm is properly installed + - Verify configuration file syntax + +4. **Performance variations** + - Increase warmup iterations + - Disable CPU frequency scaling + - Use GPU timer for accurate measurements + +### Debug Options + +Enable verbose logging: +```bash +./bin/benchmark_gemm_... 
-log=true -verify=1 +``` + +Test validation logic: +```bash +python test_validation.py +``` + +## Performance Tips + +1. **Optimal Problem Sizes**: Use sizes that are multiples of tile dimensions +2. **Warmup**: Use at least 50-100 warmup iterations +3. **GPU Timer**: Always use `-timer=true` for accurate measurements +4. **Cache Management**: Enable cache flushing for consistent results +5. **Thread Affinity**: Set CPU affinity to reduce variation + +## Integration Examples + +### Python Integration + +```python +import subprocess +import json + +# Run benchmark with JSON output +result = subprocess.run([ + './bin/benchmark_gemm_universal_fp16_rcr_...', + '-m=1024', '-n=1024', '-k=1024', + '-json_output=true' +], capture_output=True, text=True) + +# Parse results +data = json.loads(result.stdout) +print(f"Performance: {data['tflops']} TFLOPS") +``` + +### Batch Testing Script + +```bash +#!/bin/bash +SIZES="512 1024 2048 4096" +for size in $SIZES; do + echo "Testing ${size}x${size}x${size}" + ./bin/benchmark_gemm_... -m=$size -n=$size -k=$size \ + -verify=2 -csv_filename=results.csv +done +``` + +## Contributing + +When adding new features or configurations: +1. Update validation logic in `validation_utils.py` +2. Add tests to `test_validation.py` +3. Update configuration examples +4. Document new command-line options + +For more information about the Composable Kernel project, visit the main repository documentation. diff --git a/tile_engine/ops/gemm/gemm_benchmark.hpp b/tile_engine/ops/gemm/gemm_benchmark.hpp new file mode 100644 index 00000000000..7439264a391 --- /dev/null +++ b/tile_engine/ops/gemm/gemm_benchmark.hpp @@ -0,0 +1,116 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "common/utils.hpp" + +// Data types and Layouts are defined by the generated kernel headers +// No hardcoded type definitions here to avoid conflicts +struct GemmProblem +{ + int split_k_; + int m_, n_, k_; + int stride_a_, stride_b_, stride_c_; + + std::string dtype_a_, dtype_b_, dtype_acc_, dtype_c_; + std::string layout_a_, layout_b_, layout_c_; + + bool structured_sparsity_; + + friend std::ostream& operator<<(std::ostream& os, const GemmProblem& problem) + { + os << "{\n" + << " \"split_k\":" << problem.split_k_ << ",\n" + << " \"m\":" << problem.m_ << ",\n" + << " \"n\":" << problem.n_ << ",\n" + << " \"k\":" << problem.k_ << ",\n" + << " \"stride_a\":" << problem.stride_a_ << ",\n" + << " \"stride_b\":" << problem.stride_b_ << ",\n" + << " \"stride_c\":" << problem.stride_c_ << ",\n" + << " \"dtype_a\":\"" << problem.dtype_a_ << "\",\n" + << " \"dtype_b\":\"" << problem.dtype_b_ << "\",\n" + << " \"dtype_acc\":\"" << problem.dtype_acc_ << "\",\n" + << " \"dtype_c\":\"" << problem.dtype_c_ << "\",\n" + << " \"layout_a\":\"" << problem.layout_a_ << "\",\n" + << " \"layout_b\":\"" << problem.layout_b_ << "\",\n" + << " \"layout_c\":\"" << problem.layout_c_ << "\",\n" + << " \"structured_sparsity\":" << (problem.structured_sparsity_ ? 
"true" : "false") + << "\n" + << "}"; + return os; + } +}; + +// Detect Problem::DsDataType, default to void when absent +template +struct get_DsDataType +{ + using type = void; +}; + +template +struct get_DsDataType> +{ + using type = typename T::DsDataType; +}; + +// Detect Problem::D0DataType, default to void when absent +template +struct get_D0DataType +{ + using type = void; +}; + +template +struct get_D0DataType> +{ + using type = typename T::D0DataType; +}; + +/// @brief Function to compare the results of the device and host computations +template +bool compare(std::string instanceName, + ck_tile::index_t K, + ck_tile::index_t kbatch, + ck_tile::HostTensor& c_m_n_dev_result, + ck_tile::HostTensor& c_m_n_host_result) +{ + using DDataType = typename get_D0DataType::type; + const float max_accumulated_value = + *std::max_element(c_m_n_host_result.mData.begin(), c_m_n_host_result.mData.end()); + // const auto rtol_atol = calculate_rtol_atol( + // K, kbatch, max_accumulated_value); + auto rtol_atol = [&] { + if constexpr(std::is_void_v) + { + return calculate_rtol_atol( + K, kbatch, max_accumulated_value); + } + else + { + return calculate_rtol_atol( + K, kbatch, max_accumulated_value); + } + }(); + bool pass = ck_tile::check_err(c_m_n_dev_result, + c_m_n_host_result, + "Error: Incorrect results!", + rtol_atol.at(ck_tile::number<0>{}), + rtol_atol.at(ck_tile::number<1>{})); + + std::cout << "For " << instanceName << " Relative error threshold is " + << rtol_atol.at(ck_tile::number<0>{}) << " Absolute error threshold is " + << rtol_atol.at(ck_tile::number<1>{}) << std::endl; + std::cout << "The verification result is:" << (pass ? "correct" : "fail") << std::endl; + + return pass; +} diff --git a/tile_engine/ops/gemm/gemm_benchmark.py b/tile_engine/ops/gemm/gemm_benchmark.py new file mode 100644 index 00000000000..c229b142338 --- /dev/null +++ b/tile_engine/ops/gemm/gemm_benchmark.py @@ -0,0 +1,331 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+# SPDX-License-Identifier: MIT + +import os +import sys +import json +import subprocess +import argparse +import csv +import time +import importlib.util +from pathlib import Path +from typing import List, Dict, Tuple, Optional + +# TODO: explore modularizing tile engine to avoid accessing imports like this +def _import_benchmark_utils(): + """Import benchmark utilities from commons directory.""" + current_dir = os.path.dirname(os.path.abspath(__file__)) + parent_dir = os.path.dirname(current_dir) + + # Load the module dynamically + spec = importlib.util.spec_from_file_location( + "benchmark_utils", + os.path.join(parent_dir, "common", "benchmark_utils.py"), + ) + benchmark_utils = importlib.util.module_from_spec(spec) + spec.loader.exec_module(benchmark_utils) + + return benchmark_utils + +benchmark_utils = _import_benchmark_utils() + +class GemmBenchmark: + def __init__(self, build_dir: str, verbose: bool = False, name: str = "benchmark_gemm_"): + self.build_dir = Path(build_dir) + self.verbose = verbose + self.results = [] + self.name = name + + def discover_kernels(self) -> List[Path]: + """Find all benchmark_gemm_* executables in the build directory""" + bin_dir = self.build_dir / "bin" + if not bin_dir.exists(): + print(f"Error: Binary directory {bin_dir} does not exist") + return [] + + glob_name = f"{self.name}*" + kernels = list(bin_dir.glob(glob_name)) + if self.verbose: + print(f"Found {len(kernels)} kernel executables") + for k in kernels: + print(f" - {k.name}") + return kernels + + def extract_kernel_info(self, kernel_path: Path) -> Dict[str, str]: + """Extract comprehensive kernel information from filename""" + name = kernel_path.stem + if name.startswith(self.name): + args = name[len(self.name):] + else: + args = name + + # Initialize with basic info + info = { + "executable": str(kernel_path), + "name": name, + "data_type": "unknown", + "layout": "unknown", + "pipeline": "unknown", + "scheduler": "unknown", + "epilogue": "unknown", + } + + # Parse the kernel name pattern: + # benchmark_gemm_fp16_rcr_mem_default_intrawave_False_False_False_False_False_256x256x32_2x2x1_4x64x16 + parts = args.split("_") + + if len(parts) >= 5: + info["data_type"] = parts[0] + info["layout"] = parts[1] + info["pipeline"] = parts[2] + info["epilogue"] = parts[3] + info["scheduler"] = parts[4] + + # Extract detailed configuration from the end of the name + config_info = self.parse_detailed_config(name) + info.update(config_info) + + # Generate config ID + info["config_id"] = self.generate_config_id(info) + + return info + + def parse_detailed_config(self, kernel_name: str) -> Dict: + """Parse detailed configuration from kernel name""" + config = { + "tile_sizes": {"tile_m": 0, "tile_n": 0, "tile_k": 0}, + "warp_config": {"warp_m": 0, "warp_n": 0, "warp_k": 0}, + "warp_tile": {"warp_tile_m": 0, "warp_tile_n": 0, "warp_tile_k": 0}, + "optimization_flags": { + "pad_m": False, + "pad_n": False, + "pad_k": False, + "persistent": False, + }, + } + + # Split by underscore and look for patterns + parts = kernel_name.split("_") + + # Look for boolean flags (sequence of True/False values) + bool_sequence = [] + for i, part in enumerate(parts): + if part in ["True", "False"]: + bool_sequence.append(part == "True") + # Continue collecting consecutive boolean values + j = i + 1 + while j < len(parts) and parts[j] in ["True", "False"]: + bool_sequence.append(parts[j] == "True") + j += 1 + break + + # Assign boolean flags if we found them + # Order: pad_m, pad_n, pad_k, persistent (4 flags total) + if 
len(bool_sequence) >= 4: + config["optimization_flags"]["pad_m"] = bool_sequence[0] + config["optimization_flags"]["pad_n"] = bool_sequence[1] + config["optimization_flags"]["pad_k"] = bool_sequence[2] + config["optimization_flags"]["persistent"] = bool_sequence[3] + + # Look for tile size patterns (e.g., 256x256x32_2x2x1_4x64x16) + # The pattern is: tile_sizes_warp_config_warp_tile + dimension_groups = [] + for part in parts: + if "x" in part and len(part.split("x")) == 3: + try: + dims = [int(x) for x in part.split("x")] + if all(d > 0 for d in dims): + dimension_groups.append(dims) + except ValueError: + continue + + # Assign dimensions based on order and magnitude + if len(dimension_groups) >= 3: + # Sort by magnitude to identify: largest=tile_sizes, smallest=warp_config, middle=warp_tile + sorted_groups = sorted(dimension_groups, key=lambda x: max(x), reverse=True) + + # Largest dimensions = tile sizes + config["tile_sizes"]["tile_m"] = sorted_groups[0][0] + config["tile_sizes"]["tile_n"] = sorted_groups[0][1] + config["tile_sizes"]["tile_k"] = sorted_groups[0][2] + + # Smallest dimensions = warp config + config["warp_config"]["warp_m"] = sorted_groups[2][0] + config["warp_config"]["warp_n"] = sorted_groups[2][1] + config["warp_config"]["warp_k"] = sorted_groups[2][2] + + # Middle dimensions = warp tile + config["warp_tile"]["warp_tile_m"] = sorted_groups[1][0] + config["warp_tile"]["warp_tile_n"] = sorted_groups[1][1] + config["warp_tile"]["warp_tile_k"] = sorted_groups[1][2] + elif len(dimension_groups) == 2: + # If only 2 groups, assign based on magnitude + sorted_groups = sorted(dimension_groups, key=lambda x: max(x), reverse=True) + + # Larger = tile sizes + config["tile_sizes"]["tile_m"] = sorted_groups[0][0] + config["tile_sizes"]["tile_n"] = sorted_groups[0][1] + config["tile_sizes"]["tile_k"] = sorted_groups[0][2] + + # Smaller = warp config + config["warp_config"]["warp_m"] = sorted_groups[1][0] + config["warp_config"]["warp_n"] = sorted_groups[1][1] + config["warp_config"]["warp_k"] = sorted_groups[1][2] + elif len(dimension_groups) == 1: + # Only one group - assume it's tile sizes + config["tile_sizes"]["tile_m"] = dimension_groups[0][0] + config["tile_sizes"]["tile_n"] = dimension_groups[0][1] + config["tile_sizes"]["tile_k"] = dimension_groups[0][2] + + return config + + def generate_config_id(self, info: Dict) -> str: + """Generate a compact config ID from kernel info""" + # Create a compact identifier + parts = [ + info.get("data_type", "unk"), + info.get("layout", "unk"), + info.get("pipeline", "unk"), + info.get("scheduler", "unk"), + ] + + # Add tile configuration if available + tile_sizes = info.get("tile_sizes", {}) + if tile_sizes.get("tile_m", 0) > 0: + tile_str = ( + f"{tile_sizes['tile_m']}x{tile_sizes['tile_n']}x{tile_sizes['tile_k']}" + ) + parts.append(tile_str) + + # Add warp config if available + warp_config = info.get("warp_config", {}) + if warp_config.get("warp_m", 0) > 0: + warp_str = f"w{warp_config['warp_m']}x{warp_config['warp_n']}x{warp_config['warp_k']}" + parts.append(warp_str) + + # Add warp tile if available + warp_tile = info.get("warp_tile", {}) + if warp_tile.get("warp_tile_m", 0) > 0: + warp_tile_str = f"wt{warp_tile['warp_tile_m']}x{warp_tile['warp_tile_n']}x{warp_tile['warp_tile_k']}" + parts.append(warp_tile_str) + + return "_".join(parts) + + def benchmark_problem_size( + self, + kernels: List[Path], + m: int, + n: int, + k: int, + split_k: int = 1, + verify: int = 0, + warmup: int = 50, + repeat: int = 100, + flush_cache: bool = True, 
+ rotating_count: int = 1000, + ) -> List[Dict]: + """Benchmark all kernels for a specific problem size""" + results = [] + + params = { + "m": m, + "n": n, + "k": k, + "split_k": split_k, + "verify": verify, + "warmup": warmup, + "repeat": repeat, + "flush_cache": str(flush_cache).lower(), + "rotating_count": rotating_count, + } + + print(f"\nBenchmarking M={m}, N={n}, K={k}, split_k={split_k}") + + for kernel_path in kernels: + kernel_info = self.extract_kernel_info(kernel_path) + result = benchmark_utils.run_kernel(self.build_dir, kernel_path, params, verbose=self.verbose) + if result: + # Create new structured result format + structured_result = { + "name": kernel_info["name"], # Add name field for compatibility + "config_id": kernel_info["config_id"], + "problem": result.get("problem", {}), + "perf_result": result.get("perf_result", {}), + "config": { + "data_type": kernel_info["data_type"], + "layout": kernel_info["layout"], + "pipeline": kernel_info["pipeline"], + "scheduler": kernel_info["scheduler"], + "epilogue": kernel_info["epilogue"], + "tile_sizes": kernel_info.get("tile_sizes", {}), + "warp_config": kernel_info.get("warp_config", {}), + "warp_tile": kernel_info.get("warp_tile", {}), + "optimization_flags": kernel_info.get("optimization_flags", {}), + }, + "executable": kernel_info["executable"], + # Keep backward compatibility fields + "time_ms": result.get("time_ms", 0), + "tflops": result.get("tflops", 0), + "bandwidth_gb_s": result.get("bandwidth_gb_s", 0), + } + + results.append(structured_result) + + if self.verbose: + print( + f" {kernel_info['config_id']}: {structured_result['tflops']:.2f} TFLOPS, {structured_result['bandwidth_gb_s']:.2f} GB/s, {structured_result['time_ms']:.2f}ms" + ) + + return results + + def benchmark_sweep( + self, + problem_sizes: List[Tuple[int, int, int]], + split_k_values: List[int] = [1], + verify: bool = False, + warmup: int = 50, + repeat: int = 100, + flush_cache: bool = True, + rotating_count: int = 1000, + ) -> Dict: + """Run comprehensive benchmark sweep""" + kernels = self.discover_kernels() + if not kernels: + print("No kernels found!") + return {} + + all_results = [] + best_kernels = {} + + for m, n, k in problem_sizes: + for split_k in split_k_values: + results = self.benchmark_problem_size( + kernels, + m, + n, + k, + split_k, + verify=2 if verify else 0, + warmup=warmup, + repeat=repeat, + flush_cache=flush_cache, + rotating_count=rotating_count, + ) + + all_results.extend(results) + + # Find best kernel for this configuration + best = benchmark_utils.find_best_kernel(results) + if best: + key = f"m{m}_n{n}_k{k}_splitk{split_k}" + best_kernels[key] = best + print( + f"Best for {key}: {best['name']} ({best['tflops']:.2f} TFLOPS, {best['bandwidth_gb_s']:.2f} GB/s, {best['time_ms']:.2f}ms)" + ) + + self.results = all_results + return best_kernels + + diff --git a/tile_engine/ops/gemm/gemm_common.hpp b/tile_engine/ops/gemm/gemm_common.hpp new file mode 100644 index 00000000000..3a9aed2bc6d --- /dev/null +++ b/tile_engine/ops/gemm/gemm_common.hpp @@ -0,0 +1,96 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT
+
+#pragma once
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+
+// Structure to hold kernel traits for dispatcher
+struct KernelTraits
+{
+    std::string pipeline;  // compv3, compv4, mem
+    std::string scheduler; // intrawave, interwave
+    std::string epilogue;  // cshuffle, default
+    bool pad_m;
+    bool pad_n;
+    bool pad_k;
+    bool persistent;
+
+    // Constructor with defaults
+    KernelTraits()
+        : pipeline("compv3"),
+          scheduler("intrawave"),
+          epilogue("cshuffle"),
+          pad_m(false),
+          pad_n(false),
+          pad_k(false),
+          persistent(false)
+    {
+    }
+};
+
+// Create argument parser
+inline auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.")
+        .insert("n", "4096", "The value for n dimension. Default is 4096.")
+        .insert("k", "2048", "The value for k dimension. Default is 2048.")
+        .insert("stride_a", "0", "The stride value for tensor A. Default is 0.")
+        .insert("stride_b", "0", "The stride value for tensor B. Default is 0.")
+        .insert("stride_ds", "0", "The stride value for tensor Ds. Default is 0.")
+        .insert("stride_c", "0", "The stride value for tensor C. Default is 0.")
+        .insert("split_k", "1", "The split value for k dimension. Default is 1.")
+        .insert("verify",
+                "2",
+                "The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 "
+                "for validation on GPU. Default is 2, GPU validation.")
+        .insert("log",
+                "false",
+                "Whether to output kernel instance information. Possible values are true or "
+                "false. Default is false.")
+        .insert("warmup",
+                "50",
+                "The number of warmup iterations before benchmarking the kernel. Default is 50.")
+        .insert(
+            "repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.")
+        .insert("timer",
+                "true",
+                "Whether the timer is a GPU timer. Possible values are false or true. "
+                "Default is true.")
+        .insert("init",
+                "0",
+                "The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 "
+                "for constant(1). Default is 0, random.")
+        .insert("flush_cache",
+                "true",
+                "Whether to flush the cache between runs. Possible values are true or false. "
+                "Default is true.")
+        .insert("rotating_count",
+                "1000",
+                "The number of iterations to rotate the cache. Default is 1000.")
+        .insert("metric",
+                "0",
+                "Metric with which to measure kernel performance. Set to 0 for latency, 1 for "
+                "tflops, or 2 for bandwidth. Default is 0, latency.")
+        .insert("csv_filename",
+                "",
+                "The filename of the benchmark result. Default is empty (no CSV output).")
+        .insert("structured_sparsity",
+                "false",
+                "Whether to use the sparsity kernel. Possible values are true or false. "
+                "Default is false.")
+        .insert("json_output",
+                "false",
+                "Whether to output results in JSON format only. Possible values are true or false. "
" + "Default is " + "false"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} diff --git a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.hpp b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.hpp index f8c196e32af..23efbfc1682 100644 --- a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.hpp +++ b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.hpp @@ -11,37 +11,15 @@ #include "ck_tile/core.hpp" #include "ck_tile/host.hpp" -#include "gemm_multi_d_common.hpp" +#include "gemm/gemm_benchmark.hpp" // Data types and Layouts are defined by the generated kernel headers // No hardcoded type definitions here to avoid conflicts - -enum class Metric -{ - LATENCY = 0, - TFLOPS = 1, - BANDWIDTH = 2 -}; - -inline constexpr auto get_metric_name(Metric m) +struct GemmMultiDProblem : GemmProblem { - switch(m) - { - case Metric::LATENCY: return "latency"; - case Metric::TFLOPS: return "tflops"; - case Metric::BANDWIDTH: return "bandwidth"; - default: throw std::invalid_argument("Unsupported metric type"); - } -} - -struct GemmMultiDProblem -{ - int split_k_; - int m_, n_, k_; - int stride_a_, stride_b_, stride_d0_, stride_d1_, stride_c_; - - std::string dtype_a_, dtype_b_, dtype_d0_, dtype_d1_, dtype_acc_, dtype_c_; - std::string layout_a_, layout_b_, layout_d0_, layout_d1_, layout_c_; + int stride_d0_, stride_d1_; + std::string dtype_d0_, dtype_d1_; + std::string layout_d0_, layout_d1_; friend std::ostream& operator<<(std::ostream& os, const GemmMultiDProblem& problem) { @@ -71,144 +49,6 @@ struct GemmMultiDProblem } }; -struct PerformanceResult -{ - double latency_; - double tflops_; - double bandwidth_; - - static bool compare(const PerformanceResult& a, const PerformanceResult& b, Metric m) - { - switch(m) - { - case Metric::LATENCY: return a.latency_ < b.latency_; - case Metric::TFLOPS: return a.tflops_ > b.tflops_; - case Metric::BANDWIDTH: return a.bandwidth_ > b.bandwidth_; - default: throw std::invalid_argument("Unsupported metric type"); - } - } - - friend std::ostream& operator<<(std::ostream& os, const PerformanceResult& result) - { - os << "{\n" - << " \"latency(ms)\": " << std::fixed << std::setprecision(2) << result.latency_ - << ",\n" - << " \"tflops(TFlops)\": " << result.tflops_ << ",\n" - << " \"bandwidth(GB/s)\": " << result.bandwidth_ << "\n" - << "}"; - return os; - } -}; - -struct KernelInstance -{ - std::string name_; - GemmMultiDProblem problem_; - PerformanceResult perf_result_; - - static bool compare(const KernelInstance& a, const KernelInstance& b, Metric m) - { - return PerformanceResult::compare(a.perf_result_, b.perf_result_, m); - } - - friend std::ostream& operator<<(std::ostream& os, const KernelInstance& obj) - { - os << "{\n" - << " \"name\": \"" << obj.name_ << "\",\n" - << " \"problem\": " << obj.problem_ << ",\n" - << " \"perf_result\": " << obj.perf_result_ << "\n" - << "}"; - return os; - } -}; - -struct Setting -{ - int n_warmup_; - int n_repeat_; - bool is_gpu_timer_; - int verify_; - int init_method_; - bool log_; - std::string csv_filename_; - bool flush_cache_; - int rotating_count_; - bool json_output_; -}; - -inline std::string get_rocm_version() -{ - std::ifstream version_file("/opt/rocm/.info/version"); - if(version_file.is_open()) - { - std::string version; - std::getline(version_file, version); - return version; - } - return "Unknown"; -} - -template -auto calculate_rtol_atol(const ck_tile::index_t K, - const ck_tile::index_t kbatch, - const float max_accumulated_value) 
-{ - using ComputeTypeAB = - std::conditional_t; - - using ComputeType = - std::conditional_t; - - // Calculate thresholds - const auto rtol = ck_tile::get_relative_threshold( - ck_tile::integer_divide_ceil(K, kbatch)); - - const auto atol = ck_tile::get_absolute_threshold( - max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch)); - - // Calculate error due to split_k accumulation - const auto rtol_split_k = - ck_tile::get_relative_threshold(kbatch); - - const auto atol_split_k = ck_tile::get_absolute_threshold( - max_accumulated_value, kbatch); - - // Use higher threshold - return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k)); -} - -/// @brief Function to compare the results of the device and host computations -bool compare(std::string instanceName, - ck_tile::index_t K, - ck_tile::index_t kbatch, - ck_tile::HostTensor& c_m_n_dev_result, - ck_tile::HostTensor& c_m_n_host_result) -{ - const float max_accumulated_value = - *std::max_element(c_m_n_host_result.mData.begin(), c_m_n_host_result.mData.end()); - - const auto rtol_atol = - calculate_rtol_atol( - K, kbatch, max_accumulated_value); - - bool pass = ck_tile::check_err(c_m_n_dev_result, - c_m_n_host_result, - "Error: Incorrect results!", - rtol_atol.at(ck_tile::number<0>{}), - rtol_atol.at(ck_tile::number<1>{})); - - std::cout << "For " << instanceName << " Relative error threshold is " - << rtol_atol.at(ck_tile::number<0>{}) << " Absolute error threshold is " - << rtol_atol.at(ck_tile::number<1>{}) << std::endl; - std::cout << "The verification result is:" << (pass ? "correct" : "fail") << std::endl; - - return pass; -} - /// @brief Function to get the kernel output with reference implementation on CPU/GPU void gemm_multi_d_host_reference(int verify, ck_tile::HostTensor& a_m_k, diff --git a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py index faf04a7de0d..2e313c3fed6 100644 --- a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py +++ b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py @@ -1,587 +1,53 @@ # Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
# SPDX-License-Identifier: MIT +import os import sys import json import subprocess import argparse import csv import time +import importlib.util from pathlib import Path from typing import List, Dict, Tuple, Optional +def _import_gemm_benchmark(): + """Import validation utilities from commons directory.""" + current_dir = os.path.dirname(os.path.abspath(__file__)) + parent_dir = os.path.dirname(current_dir) -class GemmMultiDBenchmark: - def __init__(self, build_dir: str, verbose: bool = False): - self.build_dir = Path(build_dir) - self.verbose = verbose - self.results = [] - - def discover_kernels(self) -> List[Path]: - """Find all benchmark_gemm_multi_d_* executables in the build directory""" - bin_dir = self.build_dir / "bin" - if not bin_dir.exists(): - print(f"Error: Binary directory {bin_dir} does not exist") - return [] - - kernels = list(bin_dir.glob("benchmark_gemm_multi_d_*")) - if self.verbose: - print(f"Found {len(kernels)} kernel executables") - for k in kernels: - print(f" - {k.name}") - return kernels - - def extract_kernel_info(self, kernel_path: Path) -> Dict[str, str]: - """Extract comprehensive kernel information from filename""" - name = kernel_path.stem - - # Initialize with basic info - info = { - "executable": str(kernel_path), - "name": name, - "data_type": "unknown", - "layout": "unknown", - "pipeline": "unknown", - "scheduler": "unknown", - "epilogue": "unknown", - } - - # Parse the kernel name pattern: - # benchmark_gemm_multi_d_fp16_rcr_mem_default_intrawave_False_False_False_False_False_256x256x32_2x2x1_4x64x16 - parts = name.split("_") - - if len(parts) >= 5: - # Extract data type (3rd part after benchmark_gemm_) - info["data_type"] = parts[4] if len(parts) > 4 else "unknown" - - # Extract layout (4th part) - info["layout"] = parts[5] if len(parts) > 5 else "unknown" - - # Extract pipeline (5th part) - info["pipeline"] = parts[6] if len(parts) > 6 else "unknown" - - # Extract epilogue (6th part) - info["epilogue"] = parts[7] if len(parts) > 7 else "unknown" - - # Extract scheduler (7th part) - info["scheduler"] = parts[8] if len(parts) > 8 else "unknown" - - # Extract detailed configuration from the end of the name - config_info = self.parse_detailed_config(name) - info.update(config_info) - - # Generate config ID - info["config_id"] = self.generate_config_id(info) - - return info - - def parse_detailed_config(self, kernel_name: str) -> Dict: - """Parse detailed configuration from kernel name""" - config = { - "tile_sizes": {"tile_m": 0, "tile_n": 0, "tile_k": 0}, - "warp_config": {"warp_m": 0, "warp_n": 0, "warp_k": 0}, - "warp_tile": {"warp_tile_m": 0, "warp_tile_n": 0, "warp_tile_k": 0}, - "optimization_flags": { - "pad_m": False, - "pad_n": False, - "pad_k": False, - "persistent": False, - }, - } - - # Split by underscore and look for patterns - parts = kernel_name.split("_") - - # Look for boolean flags (sequence of True/False values) - bool_sequence = [] - for i, part in enumerate(parts): - if part in ["True", "False"]: - bool_sequence.append(part == "True") - # Continue collecting consecutive boolean values - j = i + 1 - while j < len(parts) and parts[j] in ["True", "False"]: - bool_sequence.append(parts[j] == "True") - j += 1 - break - - # Assign boolean flags if we found them - # Order: pad_m, pad_n, pad_k, persistent (4 flags total) - if len(bool_sequence) >= 4: - config["optimization_flags"]["pad_m"] = bool_sequence[0] - config["optimization_flags"]["pad_n"] = bool_sequence[1] - config["optimization_flags"]["pad_k"] = bool_sequence[2] - 
config["optimization_flags"]["persistent"] = bool_sequence[3] - - # Look for tile size patterns (e.g., 256x256x32_2x2x1_4x64x16) - # The pattern is: tile_sizes_warp_config_warp_tile - dimension_groups = [] - for part in parts: - if "x" in part and len(part.split("x")) == 3: - try: - dims = [int(x) for x in part.split("x")] - if all(d > 0 for d in dims): - dimension_groups.append(dims) - except ValueError: - continue - - # Assign dimensions based on order and magnitude - if len(dimension_groups) >= 3: - # Sort by magnitude to identify: largest=tile_sizes, smallest=warp_config, middle=warp_tile - sorted_groups = sorted(dimension_groups, key=max, reverse=True) - - # Largest dimensions = tile sizes - config["tile_sizes"]["tile_m"] = sorted_groups[0][0] - config["tile_sizes"]["tile_n"] = sorted_groups[0][1] - config["tile_sizes"]["tile_k"] = sorted_groups[0][2] - - # Smallest dimensions = warp config - config["warp_config"]["warp_m"] = sorted_groups[2][0] - config["warp_config"]["warp_n"] = sorted_groups[2][1] - config["warp_config"]["warp_k"] = sorted_groups[2][2] - - # Middle dimensions = warp tile - config["warp_tile"]["warp_tile_m"] = sorted_groups[1][0] - config["warp_tile"]["warp_tile_n"] = sorted_groups[1][1] - config["warp_tile"]["warp_tile_k"] = sorted_groups[1][2] - elif len(dimension_groups) == 2: - # If only 2 groups, assign based on magnitude - sorted_groups = sorted(dimension_groups, key=max, reverse=True) - - # Larger = tile sizes - config["tile_sizes"]["tile_m"] = sorted_groups[0][0] - config["tile_sizes"]["tile_n"] = sorted_groups[0][1] - config["tile_sizes"]["tile_k"] = sorted_groups[0][2] - - # Smaller = warp config - config["warp_config"]["warp_m"] = sorted_groups[1][0] - config["warp_config"]["warp_n"] = sorted_groups[1][1] - config["warp_config"]["warp_k"] = sorted_groups[1][2] - elif len(dimension_groups) == 1: - # Only one group - assume it's tile sizes - config["tile_sizes"]["tile_m"] = dimension_groups[0][0] - config["tile_sizes"]["tile_n"] = dimension_groups[0][1] - config["tile_sizes"]["tile_k"] = dimension_groups[0][2] - - return config - - def generate_config_id(self, info: Dict) -> str: - """Generate a compact config ID from kernel info""" - # Create a compact identifier - parts = [ - info.get("data_type", "unk"), - info.get("layout", "unk"), - info.get("pipeline", "unk"), - info.get("scheduler", "unk"), - ] - - # Add tile configuration if available - tile_sizes = info.get("tile_sizes", {}) - if tile_sizes.get("tile_m", 0) > 0: - tile_str = ( - f"{tile_sizes['tile_m']}x{tile_sizes['tile_n']}x{tile_sizes['tile_k']}" - ) - parts.append(tile_str) - - # Add warp config if available - warp_config = info.get("warp_config", {}) - if warp_config.get("warp_m", 0) > 0: - warp_str = f"w{warp_config['warp_m']}x{warp_config['warp_n']}x{warp_config['warp_k']}" - parts.append(warp_str) - - # Add warp tile if available - warp_tile = info.get("warp_tile", {}) - if warp_tile.get("warp_tile_m", 0) > 0: - warp_tile_str = f"wt{warp_tile['warp_tile_m']}x{warp_tile['warp_tile_n']}x{warp_tile['warp_tile_k']}" - parts.append(warp_tile_str) - - return "_".join(parts) - - def run_kernel(self, kernel_path: Path, params: Dict[str, str]) -> Optional[Dict]: - """Run a single kernel with given parameters and save output to individual JSON file""" - # Create results directory - results_dir = self.build_dir / "results" - results_dir.mkdir(exist_ok=True) - - # Generate unique JSON filename for this kernel - json_file = results_dir / f"{kernel_path.stem}.json" - - cmd = [str(kernel_path)] - - # Add 
parameters - for key, value in params.items(): - cmd.append(f"-{key}={value}") - - # Add JSON output flag for clean JSON output - cmd.append("-json_output=true") - - if self.verbose: - print(f"Running: {' '.join(cmd)}") - - try: - result = subprocess.run(cmd, capture_output=True, text=True, timeout=60) - - if result.returncode != 0: - print(f"Error running {kernel_path.name}: {result.stderr}") - return None - - # Save raw output to individual JSON file - output = result.stdout.strip() - if output: - with open(json_file, "w") as f: - f.write(output) - - # Parse the JSON file - return self.parse_json_file(json_file) - else: - print(f"No output from {kernel_path.name}") - return None - - except subprocess.TimeoutExpired: - print(f"Timeout running {kernel_path.name}") - return None - except Exception as e: - print(f"Error running {kernel_path.name}: {e}") - return None - - def parse_json_file(self, json_file: Path) -> Optional[Dict]: - """Parse JSON data from individual kernel output file""" - try: - with open(json_file, "r") as f: - content = f.read().strip() - - # Parse the JSON directly since executables produce clean JSON - data = json.loads(content) - - # Return the complete JSON data as-is, just add some convenience fields - result = data.copy() - if "perf_result" in data: - perf = data["perf_result"] - # Add convenience fields for backward compatibility - result["time_ms"] = perf.get("latency(ms)", 0) - result["tflops"] = perf.get("tflops(TFlops)", 0) - result["bandwidth_gb_s"] = perf.get("bandwidth(GB/s)", 0) - - return result - - except json.JSONDecodeError as e: - if self.verbose: - print(f"Failed to parse JSON from {json_file}: {e}") - return None - except Exception as e: - if self.verbose: - print(f"Error reading JSON file {json_file}: {e}") - return None - - def benchmark_problem_size( - self, - kernels: List[Path], - m: int, - n: int, - k: int, - split_k: int = 1, - verify: int = 0, - warmup: int = 50, - repeat: int = 100, - flush_cache: bool = True, - rotating_count: int = 1000, - ) -> List[Dict]: - """Benchmark all kernels for a specific problem size""" - results = [] - - params = { - "m": m, - "n": n, - "k": k, - "split_k": split_k, - "verify": verify, - "warmup": warmup, - "repeat": repeat, - "flush_cache": str(flush_cache).lower(), - "rotating_count": rotating_count, - } - - print(f"\nBenchmarking M={m}, N={n}, K={k}, split_k={split_k}") - - for kernel_path in kernels: - kernel_info = self.extract_kernel_info(kernel_path) - result = self.run_kernel(kernel_path, params) - - if result: - # Create new structured result format - structured_result = { - "name": kernel_info["name"], # Add name field for compatibility - "config_id": kernel_info["config_id"], - "problem": result.get("problem", {}), - "perf_result": result.get("perf_result", {}), - "config": { - "data_type": kernel_info["data_type"], - "layout": kernel_info["layout"], - "pipeline": kernel_info["pipeline"], - "scheduler": kernel_info["scheduler"], - "epilogue": kernel_info["epilogue"], - "tile_sizes": kernel_info.get("tile_sizes", {}), - "warp_config": kernel_info.get("warp_config", {}), - "warp_tile": kernel_info.get("warp_tile", {}), - "optimization_flags": kernel_info.get("optimization_flags", {}), - }, - "executable": kernel_info["executable"], - # Keep backward compatibility fields - "time_ms": result.get("time_ms", 0), - "tflops": result.get("tflops", 0), - "bandwidth_gb_s": result.get("bandwidth_gb_s", 0), - } - - results.append(structured_result) - - if self.verbose: - print( - f" {kernel_info['config_id']}: 
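The per-kernel launcher removed here follows a simple contract that the shared implementation appears to keep: pass every parameter as `-key=value`, force `-json_output=true` so stdout is machine-readable, persist it to `results/<kernel>.json`, and bail out on a non-zero exit code or a timeout. A trimmed-down sketch of that contract (timeout and directory layout as in the original):

```python
import subprocess
from pathlib import Path
from typing import Dict, Optional

def run_kernel(kernel_path: Path, params: Dict[str, str], results_dir: Path) -> Optional[Path]:
    """Run one benchmark executable and persist its JSON output; return None on failure."""
    results_dir.mkdir(exist_ok=True)
    json_file = results_dir / f"{kernel_path.stem}.json"
    cmd = [str(kernel_path)] + [f"-{k}={v}" for k, v in params.items()] + ["-json_output=true"]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
    except subprocess.TimeoutExpired:
        print(f"Timeout running {kernel_path.name}")
        return None
    if result.returncode != 0 or not result.stdout.strip():
        print(f"Error running {kernel_path.name}: {result.stderr}")
        return None
    json_file.write_text(result.stdout.strip())
    return json_file
```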
{structured_result['tflops']:.2f} TFLOPS, {structured_result['bandwidth_gb_s']:.2f} GB/s, {structured_result['time_ms']:.2f}ms" - ) - - return results - - def find_best_kernel( - self, results: List[Dict], metric: str = "tflops" - ) -> Optional[Dict]: - """Find the best performing kernel based on metric""" - if not results: - return None - - if metric == "tflops": - return max(results, key=lambda x: x.get("tflops", 0)) - elif metric == "time_ms": - return min(results, key=lambda x: x.get("time_ms", float("inf"))) - elif metric == "bandwidth_gb_s": - return max(results, key=lambda x: x.get("bandwidth_gb_s", 0)) - else: - raise ValueError(f"Unknown metric: {metric}") - - def benchmark_sweep( - self, - problem_sizes: List[Tuple[int, int, int]], - split_k_values: List[int] = [1], - verify: bool = False, - warmup: int = 50, - repeat: int = 100, - flush_cache: bool = True, - rotating_count: int = 1000, - ) -> Dict: - """Run comprehensive benchmark sweep""" - kernels = self.discover_kernels() - if not kernels: - print("No kernels found!") - return {} - - all_results = [] - best_kernels = {} - - for m, n, k in problem_sizes: - for split_k in split_k_values: - results = self.benchmark_problem_size( - kernels, - m, - n, - k, - split_k, - verify=2 if verify else 0, - warmup=warmup, - repeat=repeat, - flush_cache=flush_cache, - rotating_count=rotating_count, - ) - - all_results.extend(results) - - # Find best kernel for this configuration - best = self.find_best_kernel(results) - if best: - key = f"m{m}_n{n}_k{k}_splitk{split_k}" - best_kernels[key] = best - print( - f"Best for {key}: {best['name']} ({best['tflops']:.2f} TFLOPS, {best['bandwidth_gb_s']:.2f} GB/s, {best['time_ms']:.2f}ms)" - ) - - self.results = all_results - return best_kernels - - def export_csv(self, filename: str): - """Export all results to CSV""" - if not self.results: - print("No results to export") - return - - # Get all unique keys from results - all_keys = set() - for result in self.results: - all_keys.update(result.keys()) - - # Sort keys for consistent output - fieldnames = sorted(all_keys) - - with open(filename, "w", newline="") as csvfile: - writer = csv.DictWriter(csvfile, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(self.results) - - print(f"Results exported to {filename}") - - def export_best_kernels(self, best_kernels: Dict, filename: str): - """Export best kernel selections to file""" - with open(filename, "w") as f: - f.write("# Best kernel selections\n") - f.write( - "# Format: problem_size -> kernel_name (TFLOPS, bandwidth, latency)\n\n" - ) - - for key, kernel in sorted(best_kernels.items()): - f.write( - f"{key}: {kernel['name']} ({kernel['tflops']:.2f} TFLOPS, {kernel['bandwidth_gb_s']:.2f} GB/s, {kernel['time_ms']:.2f}ms)\n" - ) - - print(f"Best kernels exported to {filename}") - - def export_json(self, filename: str, best_kernels: Dict = None): - """Export all results and best kernels to JSON with comprehensive metadata""" - from datetime import datetime - - # Calculate comprehensive summary statistics for all metrics - successful_results = [r for r in self.results if r.get("tflops", 0) > 0] - - tflops_values = [r.get("tflops", 0) for r in successful_results] - bandwidth_values = [r.get("bandwidth_gb_s", 0) for r in successful_results] - latency_values = [ - r.get("time_ms", 0) for r in successful_results if r.get("time_ms", 0) > 0 - ] - - # Performance breakdown by kernel type - pipeline_stats = {} - scheduler_stats = {} - data_type_stats = {} - - for result in successful_results: - # 
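Best-kernel selection per problem size, as removed above, is a one-line max/min over the convenience fields; a sketch mirroring the three supported metrics:

```python
from typing import Dict, List, Optional

def find_best_kernel(results: List[Dict], metric: str = "tflops") -> Optional[Dict]:
    """Pick the best result: higher is better for tflops/bandwidth, lower for latency."""
    if not results:
        return None
    if metric == "tflops":
        return max(results, key=lambda r: r.get("tflops", 0))
    if metric == "time_ms":
        return min(results, key=lambda r: r.get("time_ms", float("inf")))
    if metric == "bandwidth_gb_s":
        return max(results, key=lambda r: r.get("bandwidth_gb_s", 0))
    raise ValueError(f"Unknown metric: {metric}")

best = find_best_kernel([{"tflops": 310.0}, {"tflops": 402.5}])
print(best)  # {'tflops': 402.5}
```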
Get config info from the new structure - config = result.get("config", {}) - - # Pipeline statistics - pipeline = config.get("pipeline", "unknown") - if pipeline not in pipeline_stats: - pipeline_stats[pipeline] = { - "count": 0, - "avg_tflops": 0, - "best_tflops": 0, - } - pipeline_stats[pipeline]["count"] += 1 - pipeline_stats[pipeline]["best_tflops"] = max( - pipeline_stats[pipeline]["best_tflops"], result.get("tflops", 0) - ) - - # Scheduler statistics - scheduler = config.get("scheduler", "unknown") - if scheduler not in scheduler_stats: - scheduler_stats[scheduler] = { - "count": 0, - "avg_tflops": 0, - "best_tflops": 0, - } - scheduler_stats[scheduler]["count"] += 1 - scheduler_stats[scheduler]["best_tflops"] = max( - scheduler_stats[scheduler]["best_tflops"], result.get("tflops", 0) - ) + # Load the module dynamically + spec = importlib.util.spec_from_file_location( + "gemm_benchmark", + os.path.join(parent_dir, "gemm_benchmark.py"), + ) + gemm_benchmark_module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(gemm_benchmark_module) - # Data type statistics - data_type = config.get("data_type", "unknown") - if data_type not in data_type_stats: - data_type_stats[data_type] = { - "count": 0, - "avg_tflops": 0, - "best_tflops": 0, - } - data_type_stats[data_type]["count"] += 1 - data_type_stats[data_type]["best_tflops"] = max( - data_type_stats[data_type]["best_tflops"], result.get("tflops", 0) - ) + return gemm_benchmark_module.GemmBenchmark - # Calculate averages for breakdown stats - for stats_dict, field_name in [ - (pipeline_stats, "pipeline"), - (scheduler_stats, "scheduler"), - (data_type_stats, "data_type"), - ]: - for key in stats_dict: - relevant_results = [ - r - for r in successful_results - if r.get("config", {}).get(field_name, "unknown") == key - ] - if relevant_results: - stats_dict[key]["avg_tflops"] = sum( - r.get("tflops", 0) for r in relevant_results - ) / len(relevant_results) +def _import_benchmark_utils(): + """Import benchmark utilities from commons directory.""" + current_dir = os.path.dirname(os.path.abspath(__file__)) + parent_dir = os.path.dirname(os.path.dirname(current_dir)) - output_data = { - "benchmark_metadata": { - "timestamp": datetime.now().isoformat(), - "total_kernels_tested": len(self.results), - "unique_kernels": len( - set(r.get("name", "unknown") for r in self.results) - ), - "successful_runs": len(successful_results), - "failed_runs": len(self.results) - len(successful_results), - }, - "performance_summary": { - "tflops_stats": { - "best": max(tflops_values, default=0), - "average": sum(tflops_values) / len(tflops_values) - if tflops_values - else 0, - "min": min(tflops_values, default=0), - "median": sorted(tflops_values)[len(tflops_values) // 2] - if tflops_values - else 0, - }, - "bandwidth_stats": { - "best_gb_s": max(bandwidth_values, default=0), - "average_gb_s": sum(bandwidth_values) / len(bandwidth_values) - if bandwidth_values - else 0, - "min_gb_s": min(bandwidth_values, default=0), - "median_gb_s": sorted(bandwidth_values)[len(bandwidth_values) // 2] - if bandwidth_values - else 0, - }, - "latency_stats": { - "best_ms": min(latency_values, default=0), - "average_ms": sum(latency_values) / len(latency_values) - if latency_values - else 0, - "max_ms": max(latency_values, default=0), - "median_ms": sorted(latency_values)[len(latency_values) // 2] - if latency_values - else 0, - }, - "kernel_type_breakdown": { - "by_pipeline": pipeline_stats, - "by_scheduler": scheduler_stats, - "by_data_type": data_type_stats, - }, - 
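The new `_import_*` helpers introduced in this file sidestep package imports by loading sibling modules straight from their file paths with `importlib`. A self-contained illustration of the same pattern; the `helpers.py` file referenced in the usage comment is hypothetical and only stands in for `gemm_benchmark.py` / `common/benchmark_utils.py`:

```python
import importlib.util
import os

def load_module_from_path(module_name: str, file_path: str):
    """Load a Python module directly from a file path, bypassing sys.path packaging."""
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load {module_name} from {file_path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module

# Hypothetical usage (assumes a helpers.py next to this script):
#   current_dir = os.path.dirname(os.path.abspath(__file__))
#   helpers = load_module_from_path("helpers", os.path.join(current_dir, "helpers.py"))
```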
"total_problem_configurations": len(best_kernels) - if best_kernels - else 0, - }, - "kernel_results": self.results, - "best_kernels_by_problem": best_kernels or {}, - } + # Load the module dynamically + spec = importlib.util.spec_from_file_location( + "benchmark_utils", + os.path.join(parent_dir, "common", "benchmark_utils.py"), + ) + benchmark_utils = importlib.util.module_from_spec(spec) + spec.loader.exec_module(benchmark_utils) - with open(filename, "w") as f: - json.dump(output_data, f, indent=2) + return benchmark_utils - print(f"JSON results exported to {filename}") - print(f" - Total kernels: {len(self.results)}") - print(f" - Successful runs: {len(successful_results)}") - print(f" - Best TFLOPS: {max(tflops_values, default=0):.2f}") - print(f" - Best bandwidth: {max(bandwidth_values, default=0):.2f} GB/s") - print(f" - Best latency: {min(latency_values, default=0):.2f}ms") +GemmBenchmark = _import_gemm_benchmark() +benchmark_utils = _import_benchmark_utils() +class GemmMultiDBenchmark(GemmBenchmark): + def __init__(self, build_dir: str, verbose: bool = False): + super().__init__(build_dir, verbose, name="benchmark_gemm_multi_d_") def main(): parser = argparse.ArgumentParser( @@ -668,12 +134,12 @@ def main(): print(f"\nBenchmark completed in {elapsed_time:.2f} seconds") # Export results - benchmark.export_csv(args.csv) - benchmark.export_best_kernels(best_kernels, args.best) + benchmark_utils.export_csv(benchmark.results, args.csv) + benchmark_utils.export_best_kernels(best_kernels, args.best) # Export JSON if requested if args.json: - benchmark.export_json(args.json, best_kernels) + benchmark_utils.export_json(benchmark.results, args.json, best_kernels) return 0 diff --git a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark_single.cpp b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark_single.cpp index 41d2f736e1f..767e8eda6ef 100644 --- a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark_single.cpp +++ b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark_single.cpp @@ -11,81 +11,22 @@ #include "ck_tile/core.hpp" #include "ck_tile/host.hpp" +#include "gemm/gemm_common.hpp" #include "gemm_multi_d_profiler.hpp" -#include "gemm_multi_d_common.hpp" // The kernel header is included via the compile command line with -include flag // It defines SelectedKernel struct and KERNEL_NAME -// DataTypeTraits are now defined in gemm_multi_d_common.hpp - -// Create argument parser -inline auto create_args(int argc, char* argv[]) -{ - ck_tile::ArgParser arg_parser; - arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.") - .insert("n", "4096", "The value for n dimension. Default is 4096.") - .insert("k", "2048", "The value for k dimension. Default is 2048.") - .insert("stride_a", "0", "The stride value for tensor A. Default is 0.") - .insert("stride_b", "0", "The stride value for tensor B. Default is 0.") - .insert("stride_ds", "0", "The stride value for tensor Ds . Default is 0.") - .insert("stride_c", "0", "The stride value for tensor C. Default is 0.") - .insert("split_k", "1", "The split value for k dimension. Default is 1.") - .insert("verify", - "1", - "for validation on GPU. Default is 1, validation on CPU, as validation on GPU is " - "not supported.") - .insert("log", - "false", - "Whether output kernel instance information or not. Possible values are true or " - "false. Default is false") - .insert( - "warmup", "50", "The number of iterations before benchmark the kernel. 
Default is 50.") - .insert( - "repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.") - .insert("timer", - "true", - "Whether if the timer is gpu timer or not. Possible values are false or true. " - "Default is true.") - .insert("init", - "0", - "The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 " - "for constant(1). Default is 0, random.") - .insert("flush_cache", - "true", - "To flush cache, possible values are true or false. " - "Default is false.") - .insert("rotating_count", "1000", "number of iterations to rotate the cache. default is 5.") - .insert("metric", - "0", - "Metric with which to measure kernel performance. Set to 0 for latency, 1 for " - "tflops, or 2 for bandwidth. Default is 0, latency.") - .insert("csv_filename", - "", - "The filename of benchmark result. Default is empty (no CSV output).") - .insert("structured_sparsity", - "false", - "Whether use sparsity kernel or not. Possible values are true or false. Default is " - "false") - .insert("json_output", - "false", - "Whether to output results in JSON format only. Possible values are true or false. " - "Default is " - "false"); - - bool result = arg_parser.parse(argc, argv); - return std::make_tuple(result, arg_parser); -} void benchmark_single(const ck_tile::ArgParser& arg_parser) { // Use DataTypeTraits to get the actual type names from the generated header // The generated header defines ADataType, BDataType, AccDataType, CDataType - std::string dtype_a = DataTypeTraits::name; - std::string dtype_b = DataTypeTraits::name; - std::string dtype_acc = DataTypeTraits::name; - std::string dtype_c = DataTypeTraits::name; - std::string dtype_d0 = DataTypeTraits::name; - std::string dtype_d1 = DataTypeTraits::name; + std::string dtype_a = ck_tile::DataTypeTraits::name; + std::string dtype_b = ck_tile::DataTypeTraits::name; + std::string dtype_acc = ck_tile::DataTypeTraits::name; + std::string dtype_c = ck_tile::DataTypeTraits::name; + std::string dtype_d0 = ck_tile::DataTypeTraits::name; + std::string dtype_d1 = ck_tile::DataTypeTraits::name; // Layout names from the layout types std::string layout_a = ALayout::name; @@ -95,26 +36,27 @@ void benchmark_single(const ck_tile::ArgParser& arg_parser) std::string layout_d1 = D1Layout::name; // Create GemmMultiDProblem struct - GemmMultiDProblem gemm_multi_d_problem{arg_parser.get_int("split_k"), - arg_parser.get_int("m"), - arg_parser.get_int("n"), - arg_parser.get_int("k"), - arg_parser.get_int("stride_a"), - arg_parser.get_int("stride_b"), + GemmMultiDProblem gemm_multi_d_problem{GemmProblem{arg_parser.get_int("split_k"), + arg_parser.get_int("m"), + arg_parser.get_int("n"), + arg_parser.get_int("k"), + arg_parser.get_int("stride_a"), + arg_parser.get_int("stride_b"), + arg_parser.get_int("stride_c"), + dtype_a, + dtype_b, + dtype_acc, + dtype_c, + layout_a, + layout_b, + layout_c, + arg_parser.get_bool("structured_sparsity")}, arg_parser.get_int("stride_ds"), arg_parser.get_int("stride_ds"), - arg_parser.get_int("stride_c"), - dtype_a, - dtype_b, dtype_d0, dtype_d1, - dtype_acc, - dtype_c, - layout_a, - layout_b, layout_d0, - layout_d1, - layout_c}; + layout_d1}; // Create Setting struct Setting setting{arg_parser.get_int("warmup"), diff --git a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_common.hpp b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_common.hpp deleted file mode 100644 index 899221547f6..00000000000 --- a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_common.hpp +++ /dev/null @@ -1,100 +0,0 @@ -// 
Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#pragma once - -#include -#include "ck_tile/core.hpp" -#include "ck_tile/host.hpp" -#include "ck_tile/core/numeric/integer.hpp" -#include "ck_tile/core/numeric/pk_int4.hpp" - -//[TODO] This can be moved to commons -// DataTypeTraits for all supported types -template -struct DataTypeTraits; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp32"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp64"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp16"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "bf16"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp8"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "bf8"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "int8"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "int32"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "pk_int4_t"; -}; - -// Helper function to determine if a layout is row-major -template -constexpr auto is_row_major(Layout) -{ - return ck_tile::bool_constant>{}; -} - -// Structure to hold kernel traits for dispatcher -struct KernelTraits -{ - std::string pipeline; // compv3, compv4, mem - std::string scheduler; // intrawave, interwave - std::string epilogue; // cshuffle, default - bool pad_m; - bool pad_n; - bool pad_k; - bool persistent; - - // Constructor with defaults - KernelTraits() - : pipeline("compv3"), - scheduler("intrawave"), - epilogue("cshuffle"), - pad_m(false), - pad_n(false), - pad_k(false), - persistent(false) - { - } -}; diff --git a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_profiler.hpp b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_profiler.hpp index 3a2cdc71fe6..aeac6c984dc 100644 --- a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_profiler.hpp +++ b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_profiler.hpp @@ -6,44 +6,39 @@ #include #include #include +#include +#include +#include +#include #include "ck_tile/host/device_prop.hpp" #include "ck_tile/ops/gemm.hpp" +#include "gemm/gemm_profiler.hpp" +#include "common/utils.hpp" #include "gemm_multi_d_benchmark.hpp" -class GemmMultiDProfiler +class GemmMultiDProfiler : public GemmProfiler> { public: - static GemmMultiDProfiler& instance(Setting setting) + using BaseGemm = GemmProfiler>; + using BaseGemm::benchmark; + + GemmMultiDProfiler(Setting setting) + : GemmProfiler>(setting) { - static GemmMultiDProfiler instance{setting}; - return instance; - } - - // Overload for single kernel benchmarking - void benchmark(GemmMultiDProblem& gemm_multi_d_problem, - std::function&, - const ck_tile::stream_config&)> kernel_func) - { - // Create a vector with a single callable that returns both name and time - std::vector( - ck_tile::GemmMultiDHostArgs&, const ck_tile::stream_config&)>> - callables; - - callables.push_back([kernel_func](ck_tile::GemmMultiDHostArgs& args, - const ck_tile::stream_config& stream) { - float time = kernel_func(args, stream); - return std::make_tuple(std::string(KERNEL_NAME), time); - }); - - benchmark(gemm_multi_d_problem, callables); } void benchmark( GemmMultiDProblem& gemm_multi_d_problem, std::vector( ck_tile::GemmMultiDHostArgs&, const ck_tile::stream_config&)>>& - callables) + callables) override { const ALayout 
layout_a = ALayout{}; const BLayout layout_b = BLayout{}; @@ -165,143 +160,4 @@ class GemmMultiDProfiler kernel_run_result); } } - - void process_result(const GemmMultiDProblem& gemm_multi_d_problem, - ck_tile::DeviceMem& c_m_n_dev_buf, - ck_tile::HostTensor& c_m_n_host_result, - ck_tile::HostTensor& c_m_n_dev_result, - const std::tuple& kernel_run_result) - { - auto [name, avg_time] = kernel_run_result; - - KernelInstance kernel_instance{name, gemm_multi_d_problem, {-1.0f, -1.0f, -1.0f}}; - - // compute performance metric - std::size_t flop = std::size_t(2) * gemm_multi_d_problem.m_ * gemm_multi_d_problem.n_ * - gemm_multi_d_problem.k_; - std::size_t num_byte = - sizeof(ADataType) * gemm_multi_d_problem.m_ * gemm_multi_d_problem.k_ + - sizeof(BDataType) * gemm_multi_d_problem.n_ * gemm_multi_d_problem.k_ + - sizeof(CDataType) * gemm_multi_d_problem.m_ * gemm_multi_d_problem.n_; - - // Dth Dimension Updates - ck_tile::static_for<0, DsDataType::size(), 1>{}([&](auto i) { - num_byte += sizeof(ck_tile::remove_cvref_t>) * - gemm_multi_d_problem.m_ * gemm_multi_d_problem.n_; - flop += sizeof(ck_tile::remove_cvref_t>) * - gemm_multi_d_problem.m_ * gemm_multi_d_problem.n_; - }); - - // update - kernel_instance.perf_result_.latency_ = avg_time; - kernel_instance.perf_result_.tflops_ = static_cast(flop) / 1.E9 / avg_time; - kernel_instance.perf_result_.bandwidth_ = num_byte / 1.E6 / avg_time; - - if(setting_.log_ > 0 && !setting_.json_output_) - { - std::cout << kernel_instance << std::endl; - } - - // verify result - c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); - bool verified_correct = - !setting_.verify_ || compare(name, - gemm_multi_d_problem.k_, - 1, // Multi d currently supports only k_batch = 1 - c_m_n_dev_result, - c_m_n_host_result); - - if(verified_correct) - { - kernel_instances_.emplace_back(kernel_instance); - } - else - { - std::cout << "Verification failed, skip kernel: " << name << std::endl; - } - - // clear tensor - c_m_n_dev_buf.SetZero(); - c_m_n_dev_result.SetZero(); - } - - KernelInstance select_best_instance(Metric metric) - { - if(kernel_instances_.empty()) - throw std::runtime_error("Empty instances"); - - auto kernel_instance = *std::max_element(kernel_instances_.begin(), - kernel_instances_.end(), - [metric](const auto& a, const auto& b) { - return PerformanceResult::compare( - b.perf_result_, a.perf_result_, metric); - }); - - if(setting_.json_output_) - { - // Output clean JSON only - std::cout << kernel_instance << std::endl; - } - else - { - std::cout << "**********************************" << std::endl; - std::cout << "According to given metrics: " << get_metric_name(metric) << "\n" - << "Current kernel performance is: " << kernel_instance << std::endl; - std::cout << "**********************************" << std::endl; - } - - if(!setting_.csv_filename_.empty()) - { - std::ofstream file(setting_.csv_filename_ + ".csv", std::ios::app); - - if(!file.is_open()) - { - std::cerr << "Warning: Failed to open CSV file for writing." 
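The removed `process_result` derived its metrics from the measured average latency in milliseconds: 2·M·N·K flops and the summed byte footprint of the A, B, C (and D) tensors, so `flop / 1e9 / latency_ms` lands in TFLOP/s and `bytes / 1e6 / latency_ms` in GB/s. A small sketch of that arithmetic, with element sizes passed in explicitly rather than taken from the C++ data types:

```python
def gemm_perf_metrics(m: int, n: int, k: int, latency_ms: float,
                      bytes_a: int = 2, bytes_b: int = 2, bytes_c: int = 2,
                      extra_d_bytes: int = 0) -> dict:
    """TFLOP/s and GB/s from an average latency in milliseconds (element sizes in bytes)."""
    flop = 2 * m * n * k
    num_bytes = bytes_a * m * k + bytes_b * n * k + bytes_c * m * n + extra_d_bytes * m * n
    return {
        "latency_ms": latency_ms,
        "tflops": flop / 1.0e9 / latency_ms,                # GFLOP per ms == TFLOP per s
        "bandwidth_gb_s": num_bytes / 1.0e6 / latency_ms,   # MB per ms == GB per s
    }

print(gemm_perf_metrics(3840, 4096, 2048, latency_ms=0.35))
```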
<< std::endl; - } - else - { - if(file.tellp() == 0) - { - file << "rocm_version,device_name," - << "split_k,m,n,k,stride_a,stride_b,stride_c," - << "dtype_a,dtype_b,dtype_acc,dtype_c," << "layout_a,layout_b,layout_c," - << "structured_sparsity," << "name," - << "latency(ms),tflops(TFlops),bandwidth(GB/s),metric\n"; - } - - const auto& problem = kernel_instance.problem_; - const auto& name = kernel_instance.name_; - const auto& perf = kernel_instance.perf_result_; - - file << get_rocm_version() << "," << ck_tile::get_device_name() << "," - << problem.split_k_ << "," << problem.m_ << "," << problem.n_ << "," - << problem.k_ << "," << problem.stride_a_ << "," << problem.stride_b_ << "," - << problem.stride_c_ << "," << problem.dtype_a_ << "," << problem.dtype_b_ - << "," << problem.dtype_acc_ << "," << problem.dtype_c_ << "," - << problem.layout_a_ << "," << problem.layout_b_ << "," << problem.layout_c_ - << "," << name << "," << std::fixed << std::setprecision(4) << perf.latency_ - << "," << std::fixed << std::setprecision(4) << perf.tflops_ << "," - << std::fixed << std::setprecision(4) << perf.bandwidth_ << "," - << get_metric_name(metric) << "\n"; - - if(!file) - { - std::cerr << "Warning: Error occurred while writing to CSV file." << std::endl; - } - } - } - - return kernel_instance; - } - - GemmMultiDProfiler(const GemmMultiDProfiler&) = delete; - GemmMultiDProfiler& operator=(const GemmMultiDProfiler&) = delete; - - private: - ~GemmMultiDProfiler() { kernel_instances_.clear(); } - GemmMultiDProfiler(Setting setting) : setting_(setting) {} - - Setting setting_; - - std::vector kernel_instances_; }; diff --git a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.hpp b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.hpp index 748fe581d35..817ff8e99ec 100644 --- a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.hpp +++ b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.hpp @@ -6,192 +6,25 @@ #include "ck_tile/core.hpp" #include "ck_tile/host.hpp" #include "gemm_preshuffle_common.hpp" - -//[TODO] Move parts of this File to commons -enum class Metric -{ - LATENCY = 0, - TFLOPS = 1, - BANDWIDTH = 2 -}; - -inline constexpr auto get_metric_name(Metric m) -{ - switch(m) - { - case Metric::LATENCY: return "latency"; - case Metric::TFLOPS: return "tflops"; - case Metric::BANDWIDTH: return "bandwidth"; - default: throw std::invalid_argument("Unsupported metric type"); - } -} +#include "gemm/gemm_benchmark.hpp" struct KernelConfig { - std::tuple tile_dims; - std::tuple warp_dims; - std::tuple warp_tile_dims; - bool permuteN; -}; - -struct GemmProblem -{ - int split_k_; - int m_, n_, k_; - int stride_a_, stride_b_, stride_c_; - - std::string dtype_a_, dtype_b_, dtype_acc_, dtype_c_; - std::string layout_a_, layout_b_, layout_c_; + ck_tile::index_t M_Tile; + ck_tile::index_t N_Tile; + ck_tile::index_t K_Tile; - bool structured_sparsity_; + ck_tile::index_t M_Warp; + ck_tile::index_t N_Warp; + ck_tile::index_t K_Warp; - friend std::ostream& operator<<(std::ostream& os, const GemmProblem& problem) - { - os << "{\n" - << " \"split_k\":" << problem.split_k_ << ",\n" - << " \"m\":" << problem.m_ << ",\n" - << " \"n\":" << problem.n_ << ",\n" - << " \"k\":" << problem.k_ << ",\n" - << " \"stride_a\":" << problem.stride_a_ << ",\n" - << " \"stride_b\":" << problem.stride_b_ << ",\n" - << " \"stride_c\":" << problem.stride_c_ << ",\n" - << " \"dtype_a\":\"" << problem.dtype_a_ << "\",\n" - << " \"dtype_b\":\"" << problem.dtype_b_ << "\",\n" - << " 
\"dtype_acc\":\"" << problem.dtype_acc_ << "\",\n" - << " \"dtype_c\":\"" << problem.dtype_c_ << "\",\n" - << " \"layout_a\":\"" << problem.layout_a_ << "\",\n" - << " \"layout_b\":\"" << problem.layout_b_ << "\",\n" - << " \"layout_c\":\"" << problem.layout_c_ << "\",\n" - << " \"structured_sparsity\":" << (problem.structured_sparsity_ ? "true" : "false") - << "\n" - << "}"; - return os; - } -}; - -struct PerformanceResult -{ - double latency_; - double tflops_; - double bandwidth_; - - static bool compare(const PerformanceResult& a, const PerformanceResult& b, Metric m) - { - switch(m) - { - case Metric::LATENCY: return a.latency_ < b.latency_; - case Metric::TFLOPS: return a.tflops_ > b.tflops_; - case Metric::BANDWIDTH: return a.bandwidth_ > b.bandwidth_; - default: throw std::invalid_argument("Unsupported metric type"); - } - } - - friend std::ostream& operator<<(std::ostream& os, const PerformanceResult& result) - { - os << "{\n" - << " \"latency(ms)\": " << std::fixed << std::setprecision(2) << result.latency_ - << ",\n" - << " \"tflops(TFlops)\": " << result.tflops_ << ",\n" - << " \"bandwidth(GB/s)\": " << result.bandwidth_ << "\n" - << "}"; - return os; - } -}; + ck_tile::index_t M_Warp_Tile; + ck_tile::index_t N_Warp_Tile; + ck_tile::index_t K_Warp_Tile; -struct KernelInstance -{ - std::string name_; - GemmProblem problem_; - PerformanceResult perf_result_; - - static bool compare(const KernelInstance& a, const KernelInstance& b, Metric m) - { - return PerformanceResult::compare(a.perf_result_, b.perf_result_, m); - } - - friend std::ostream& operator<<(std::ostream& os, const KernelInstance& obj) - { - os << "{\n" - << " \"name\": \"" << obj.name_ << "\",\n" - << " \"problem\": " << obj.problem_ << ",\n" - << " \"perf_result\": " << obj.perf_result_ << "\n" - << "}"; - return os; - } -}; - -struct Setting -{ - int n_warmup_; - int n_repeat_; - bool is_gpu_timer_; - int verify_; - int init_method_; - bool log_; - std::string csv_filename_; - bool flush_cache_; - int rotating_count_; - bool json_output_; + bool permuteN; }; -inline std::string get_rocm_version() -{ - std::ifstream version_file("/opt/rocm/.info/version"); - if(version_file.is_open()) - { - std::string version; - std::getline(version_file, version); - return version; - } - return "Unknown"; -} - -template -auto calculate_rtol_atol(const ck_tile::index_t K, - const ck_tile::index_t kbatch, - const float max_accumulated_value) -{ - using ComputeType = - std::conditional_t; - // Calculate thresholds - const auto rtol = ck_tile::get_relative_threshold( - ck_tile::integer_divide_ceil(K, kbatch)); - const auto atol = ck_tile::get_absolute_threshold( - max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch)); - // Calculate error due to split_k accumulation - const auto rtol_split_k = - ck_tile::get_relative_threshold(kbatch); - const auto atol_split_k = ck_tile::get_absolute_threshold( - max_accumulated_value, kbatch); - // Use higher threshold - return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k)); -} - -/// @brief Function to compare the results of the device and host computations -bool compare(std::string instanceName, - ck_tile::index_t K, - ck_tile::index_t kbatch, - ck_tile::HostTensor& c_m_n_dev_result, - ck_tile::HostTensor& c_m_n_ref) -{ - const float max_accumulated_value = - *std::max_element(c_m_n_ref.mData.begin(), c_m_n_ref.mData.end()); - const auto rtol_atol = calculate_rtol_atol( - K, kbatch, max_accumulated_value); - bool pass = 
ck_tile::check_err(c_m_n_dev_result, - c_m_n_ref, - "Error: Incorrect results!", - rtol_atol.at(ck_tile::number<0>{}), - rtol_atol.at(ck_tile::number<1>{})); - - std::cout << "For " << instanceName << " Relative error threshold is " - << rtol_atol.at(ck_tile::number<0>{}) << " Absolute error threshold is " - << rtol_atol.at(ck_tile::number<1>{}) << std::endl; - std::cout << "The verification result is:" << (pass ? "correct" : "fail") << std::endl; - - return pass; -} - /// @brief Function to get the kernel output with reference implementation on CPU/GPU void gemm_host_reference(int verify, ck_tile::HostTensor& a_m_k, diff --git a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py index 53ae6336faf..935de186d66 100644 --- a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py +++ b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py @@ -1,588 +1,54 @@ # Copyright (c) Advanced Micro Devices, Inc., or its affiliates. # SPDX-License-Identifier: MIT +import os import sys import json import subprocess import argparse import csv import time +import importlib.util from pathlib import Path from typing import List, Dict, Tuple, Optional -class GemmPreshuffleBenchmark: - def __init__(self, build_dir: str, verbose: bool = False): - self.build_dir = Path(build_dir) - self.verbose = verbose - self.results = [] - - def discover_kernels(self) -> List[Path]: - """Find all benchmark_gemm_preshuffle* executables in the build directory""" - bin_dir = self.build_dir / "bin" - if not bin_dir.exists(): - print(f"Error: Binary directory {bin_dir} does not exist") - return [] - - kernels = list(bin_dir.glob("benchmark_gemm_preshuffle*")) - if self.verbose: - print(f"Found {len(kernels)} kernel executables") - for k in kernels: - print(f" - {k.name}") - return kernels - - def extract_kernel_info(self, kernel_path: Path) -> Dict[str, str]: - """Extract comprehensive kernel information from filename""" - name = kernel_path.stem - - # Initialize with basic info - info = { - "executable": str(kernel_path), - "name": name, - "data_type": "unknown", - "layout": "unknown", - "pipeline": "unknown", - "scheduler": "unknown", - "epilogue": "unknown", - } - - # Parse the kernel name pattern: - # benchmark_gemm_preshuffle_fp16_rcr_mem_default_intrawave_False_False_False_False_False_256x256x32_2x2x1_4x64x16 - parts = name.split("_") - - if len(parts) >= 4: - # Extract data type (4rd part after benchmark_gemm_preshuffle_) - info["data_type"] = parts[3] if len(parts) > 2 else "unknown" - - # Extract layout (5th part) - info["layout"] = parts[4] if len(parts) > 3 else "unknown" - - # Extract pipeline (6th part) - info["pipeline"] = parts[5] if len(parts) > 4 else "unknown" - - # Extract epilogue (7th part) - info["epilogue"] = parts[6] if len(parts) > 5 else "unknown" - - # Extract scheduler (8th part) - info["scheduler"] = parts[7] if len(parts) > 6 else "unknown" - - # Extract detailed configuration from the end of the name - config_info = self.parse_detailed_config(name) - info.update(config_info) - - # Generate config ID - info["config_id"] = self.generate_config_id(info) - - return info - - def parse_detailed_config(self, kernel_name: str) -> Dict: - """Parse detailed configuration from kernel name""" - config = { - "tile_sizes": {"tile_m": 0, "tile_n": 0, "tile_k": 0}, - "warp_config": {"warp_m": 0, "warp_n": 0, "warp_k": 0}, - "warp_tile": {"warp_tile_m": 0, "warp_tile_n": 0, "warp_tile_k": 0}, - 
"optimization_flags": { - "pad_m": False, - "pad_n": False, - "pad_k": False, - "persistent": False, - }, - } - - # Split by underscore and look for patterns - parts = kernel_name.split("_") - - # Look for boolean flags (sequence of True/False values) - bool_sequence = [] - for i, part in enumerate(parts): - if part in ["True", "False"]: - bool_sequence.append(part == "True") - # Continue collecting consecutive boolean values - j = i + 1 - while j < len(parts) and parts[j] in ["True", "False"]: - bool_sequence.append(parts[j] == "True") - j += 1 - break - - # Assign boolean flags if we found them - # Order: pad_m, pad_n, pad_k, persistent (4 flags total) - if len(bool_sequence) >= 4: - config["optimization_flags"]["pad_m"] = bool_sequence[0] - config["optimization_flags"]["pad_n"] = bool_sequence[1] - config["optimization_flags"]["pad_k"] = bool_sequence[2] - config["optimization_flags"]["persistent"] = bool_sequence[3] - - # Look for tile size patterns (e.g., 256x256x32_2x2x1_4x64x16) - # The pattern is: tile_sizes_warp_config_warp_tile - dimension_groups = [] - for part in parts: - if "x" in part and len(part.split("x")) == 3: - try: - dims = [int(x) for x in part.split("x")] - if all(d > 0 for d in dims): - dimension_groups.append(dims) - except ValueError: - continue - - # Assign dimensions based on order and magnitude - if len(dimension_groups) >= 3: - # Sort by magnitude to identify: largest=tile_sizes, smallest=warp_config, middle=warp_tile - sorted_groups = sorted(dimension_groups, key=lambda x: max(x), reverse=True) - - # Largest dimensions = tile sizes - config["tile_sizes"]["tile_m"] = sorted_groups[0][0] - config["tile_sizes"]["tile_n"] = sorted_groups[0][1] - config["tile_sizes"]["tile_k"] = sorted_groups[0][2] - - # Smallest dimensions = warp config - config["warp_config"]["warp_m"] = sorted_groups[2][0] - config["warp_config"]["warp_n"] = sorted_groups[2][1] - config["warp_config"]["warp_k"] = sorted_groups[2][2] - - # Middle dimensions = warp tile - config["warp_tile"]["warp_tile_m"] = sorted_groups[1][0] - config["warp_tile"]["warp_tile_n"] = sorted_groups[1][1] - config["warp_tile"]["warp_tile_k"] = sorted_groups[1][2] - elif len(dimension_groups) == 2: - # If only 2 groups, assign based on magnitude - sorted_groups = sorted(dimension_groups, key=lambda x: max(x), reverse=True) - - # Larger = tile sizes - config["tile_sizes"]["tile_m"] = sorted_groups[0][0] - config["tile_sizes"]["tile_n"] = sorted_groups[0][1] - config["tile_sizes"]["tile_k"] = sorted_groups[0][2] - - # Smaller = warp config - config["warp_config"]["warp_m"] = sorted_groups[1][0] - config["warp_config"]["warp_n"] = sorted_groups[1][1] - config["warp_config"]["warp_k"] = sorted_groups[1][2] - elif len(dimension_groups) == 1: - # Only one group - assume it's tile sizes - config["tile_sizes"]["tile_m"] = dimension_groups[0][0] - config["tile_sizes"]["tile_n"] = dimension_groups[0][1] - config["tile_sizes"]["tile_k"] = dimension_groups[0][2] - - return config - - def generate_config_id(self, info: Dict) -> str: - """Generate a compact config ID from kernel info""" - # Create a compact identifier - parts = [ - info.get("data_type", "unk"), - info.get("layout", "unk"), - info.get("pipeline", "unk"), - info.get("scheduler", "unk"), - ] - - # Add tile configuration if available - tile_sizes = info.get("tile_sizes", {}) - if tile_sizes.get("tile_m", 0) > 0: - tile_str = ( - f"{tile_sizes['tile_m']}x{tile_sizes['tile_n']}x{tile_sizes['tile_k']}" - ) - parts.append(tile_str) - - # Add warp config if available - 
warp_config = info.get("warp_config", {}) - if warp_config.get("warp_m", 0) > 0: - warp_str = f"w{warp_config['warp_m']}x{warp_config['warp_n']}x{warp_config['warp_k']}" - parts.append(warp_str) - - # Add warp tile if available - warp_tile = info.get("warp_tile", {}) - if warp_tile.get("warp_tile_m", 0) > 0: - warp_tile_str = f"wt{warp_tile['warp_tile_m']}x{warp_tile['warp_tile_n']}x{warp_tile['warp_tile_k']}" - parts.append(warp_tile_str) - - return "_".join(parts) - - def run_kernel(self, kernel_path: Path, params: Dict[str, str]) -> Optional[Dict]: - """Run a single kernel with given parameters and save output to individual JSON file""" - # Create results directory - results_dir = self.build_dir / "results" - results_dir.mkdir(exist_ok=True) - - # Generate unique JSON filename for this kernel - json_file = results_dir / f"{kernel_path.stem}.json" - - cmd = [str(kernel_path)] - - # Add parameters - for key, value in params.items(): - cmd.append(f"-{key}={value}") - - # Add JSON output flag for clean JSON output - cmd.append("-json_output=true") - - if self.verbose: - print(f"Running: {' '.join(cmd)}") - - try: - result = subprocess.run(cmd, capture_output=True, text=True, timeout=60) - - if result.returncode != 0: - print(f"Error running {kernel_path.name}: {result.stderr}") - return None - - # Save raw output to individual JSON file - output = result.stdout.strip() - - if output: - with open(json_file, "w") as f: - f.write(output) +def _import_gemm_benchmark(): + """Import validation utilities from commons directory.""" + current_dir = os.path.dirname(os.path.abspath(__file__)) + parent_dir = os.path.dirname(current_dir) - # Parse the JSON file - return self.parse_json_file(json_file) - else: - print(f"No output from {kernel_path.name}") - return None - - except subprocess.TimeoutExpired: - print(f"Timeout running {kernel_path.name}") - return None - except Exception as e: - print(f"Error running {kernel_path.name}: {e}") - return None - - def parse_json_file(self, json_file: Path) -> Optional[Dict]: - """Parse JSON data from individual kernel output file""" - try: - with open(json_file, "r") as f: - content = f.read().strip() - - # Parse the JSON directly since executables produce clean JSON - data = json.loads(content) - - # Return the complete JSON data as-is, just add some convenience fields - result = data.copy() - if "perf_result" in data: - perf = data["perf_result"] - # Add convenience fields for backward compatibility - result["time_ms"] = perf.get("latency(ms)", 0) - result["tflops"] = perf.get("tflops(TFlops)", 0) - result["bandwidth_gb_s"] = perf.get("bandwidth(GB/s)", 0) - - return result - - except json.JSONDecodeError as e: - if self.verbose: - print(f"Failed to parse JSON from {json_file}: {e}") - return None - except Exception as e: - if self.verbose: - print(f"Error reading JSON file {json_file}: {e}") - return None - - def benchmark_problem_size( - self, - kernels: List[Path], - m: int, - n: int, - k: int, - split_k: int = 1, - verify: int = 0, - warmup: int = 50, - repeat: int = 100, - flush_cache: bool = True, - rotating_count: int = 1000, - ) -> List[Dict]: - """Benchmark all kernels for a specific problem size""" - results = [] - - params = { - "m": m, - "n": n, - "k": k, - "split_k": split_k, - "verify": verify, - "warmup": warmup, - "repeat": repeat, - "flush_cache": str(flush_cache).lower(), - "rotating_count": rotating_count, - } - - print(f"\nBenchmarking M={m}, N={n}, K={k}, split_k={split_k}") - - for kernel_path in kernels: - kernel_info = 
self.extract_kernel_info(kernel_path) - result = self.run_kernel(kernel_path, params) - - if result: - # Create new structured result format - structured_result = { - "name": kernel_info["name"], # Add name field for compatibility - "config_id": kernel_info["config_id"], - "problem": result.get("problem", {}), - "perf_result": result.get("perf_result", {}), - "config": { - "data_type": kernel_info["data_type"], - "layout": kernel_info["layout"], - "pipeline": kernel_info["pipeline"], - "scheduler": kernel_info["scheduler"], - "epilogue": kernel_info["epilogue"], - "tile_sizes": kernel_info.get("tile_sizes", {}), - "warp_config": kernel_info.get("warp_config", {}), - "warp_tile": kernel_info.get("warp_tile", {}), - "optimization_flags": kernel_info.get("optimization_flags", {}), - }, - "executable": kernel_info["executable"], - # Keep backward compatibility fields - "time_ms": result.get("time_ms", 0), - "tflops": result.get("tflops", 0), - "bandwidth_gb_s": result.get("bandwidth_gb_s", 0), - } - - results.append(structured_result) - - if self.verbose: - print( - f" {kernel_info['config_id']}: {structured_result['tflops']:.2f} TFLOPS, {structured_result['bandwidth_gb_s']:.2f} GB/s, {structured_result['time_ms']:.2f}ms" - ) - - return results - - def find_best_kernel( - self, results: List[Dict], metric: str = "tflops" - ) -> Optional[Dict]: - """Find the best performing kernel based on metric""" - if not results: - return None - - if metric == "tflops": - return max(results, key=lambda x: x.get("tflops", 0)) - elif metric == "time_ms": - return min(results, key=lambda x: x.get("time_ms", float("inf"))) - elif metric == "bandwidth_gb_s": - return max(results, key=lambda x: x.get("bandwidth_gb_s", 0)) - else: - raise ValueError(f"Unknown metric: {metric}") - - def benchmark_sweep( - self, - problem_sizes: List[Tuple[int, int, int]], - split_k_values: List[int] = [1], - verify: bool = False, - warmup: int = 50, - repeat: int = 100, - flush_cache: bool = True, - rotating_count: int = 1000, - ) -> Dict: - """Run comprehensive benchmark sweep""" - kernels = self.discover_kernels() - if not kernels: - print("No kernels found!") - return {} - - all_results = [] - best_kernels = {} - - for m, n, k in problem_sizes: - for split_k in split_k_values: - results = self.benchmark_problem_size( - kernels, - m, - n, - k, - split_k, - verify=2 if verify else 0, - warmup=warmup, - repeat=repeat, - flush_cache=flush_cache, - rotating_count=rotating_count, - ) - - all_results.extend(results) - - # Find best kernel for this configuration - best = self.find_best_kernel(results) - if best: - key = f"m{m}_n{n}_k{k}_splitk{split_k}" - best_kernels[key] = best - print( - f"Best for {key}: {best['name']} ({best['tflops']:.2f} TFLOPS, {best['bandwidth_gb_s']:.2f} GB/s, {best['time_ms']:.2f}ms)" - ) - - self.results = all_results - return best_kernels - - def export_csv(self, filename: str): - """Export all results to CSV""" - if not self.results: - print("No results to export") - return - - # Get all unique keys from results - all_keys = set() - for result in self.results: - all_keys.update(result.keys()) - - # Sort keys for consistent output - fieldnames = sorted(all_keys) - - with open(filename, "w", newline="") as csvfile: - writer = csv.DictWriter(csvfile, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(self.results) - - print(f"Results exported to {filename}") - - def export_best_kernels(self, best_kernels: Dict, filename: str): - """Export best kernel selections to file""" - with open(filename, 
"w") as f: - f.write("# Best kernel selections\n") - f.write( - "# Format: problem_size -> kernel_name (TFLOPS, bandwidth, latency)\n\n" - ) - - for key, kernel in sorted(best_kernels.items()): - f.write( - f"{key}: {kernel['name']} ({kernel['tflops']:.2f} TFLOPS, {kernel['bandwidth_gb_s']:.2f} GB/s, {kernel['time_ms']:.2f}ms)\n" - ) - - print(f"Best kernels exported to {filename}") - - def export_json(self, filename: str, best_kernels: Dict = None): - """Export all results and best kernels to JSON with comprehensive metadata""" - from datetime import datetime - - # Calculate comprehensive summary statistics for all metrics - successful_results = [r for r in self.results if r.get("tflops", 0) > 0] - - tflops_values = [r.get("tflops", 0) for r in successful_results] - bandwidth_values = [r.get("bandwidth_gb_s", 0) for r in successful_results] - latency_values = [ - r.get("time_ms", 0) for r in successful_results if r.get("time_ms", 0) > 0 - ] - - # Performance breakdown by kernel type - pipeline_stats = {} - scheduler_stats = {} - data_type_stats = {} - - for result in successful_results: - # Get config info from the new structure - config = result.get("config", {}) - - # Pipeline statistics - pipeline = config.get("pipeline", "unknown") - if pipeline not in pipeline_stats: - pipeline_stats[pipeline] = { - "count": 0, - "avg_tflops": 0, - "best_tflops": 0, - } - pipeline_stats[pipeline]["count"] += 1 - pipeline_stats[pipeline]["best_tflops"] = max( - pipeline_stats[pipeline]["best_tflops"], result.get("tflops", 0) - ) - - # Scheduler statistics - scheduler = config.get("scheduler", "unknown") - if scheduler not in scheduler_stats: - scheduler_stats[scheduler] = { - "count": 0, - "avg_tflops": 0, - "best_tflops": 0, - } - scheduler_stats[scheduler]["count"] += 1 - scheduler_stats[scheduler]["best_tflops"] = max( - scheduler_stats[scheduler]["best_tflops"], result.get("tflops", 0) - ) + # Load the module dynamically + spec = importlib.util.spec_from_file_location( + "gemm_benchmark", + os.path.join(parent_dir, "gemm_benchmark.py"), + ) + gemm_benchmark_module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(gemm_benchmark_module) - # Data type statistics - data_type = config.get("data_type", "unknown") - if data_type not in data_type_stats: - data_type_stats[data_type] = { - "count": 0, - "avg_tflops": 0, - "best_tflops": 0, - } - data_type_stats[data_type]["count"] += 1 - data_type_stats[data_type]["best_tflops"] = max( - data_type_stats[data_type]["best_tflops"], result.get("tflops", 0) - ) + return gemm_benchmark_module.GemmBenchmark - # Calculate averages for breakdown stats - for stats_dict, field_name in [ - (pipeline_stats, "pipeline"), - (scheduler_stats, "scheduler"), - (data_type_stats, "data_type"), - ]: - for key in stats_dict: - relevant_results = [ - r - for r in successful_results - if r.get("config", {}).get(field_name, "unknown") == key - ] - if relevant_results: - stats_dict[key]["avg_tflops"] = sum( - r.get("tflops", 0) for r in relevant_results - ) / len(relevant_results) +def _import_benchmark_utils(): + """Import benchmark utilities from commons directory.""" + current_dir = os.path.dirname(os.path.abspath(__file__)) + parent_dir = os.path.dirname(os.path.dirname(current_dir)) - output_data = { - "benchmark_metadata": { - "timestamp": datetime.now().isoformat(), - "total_kernels_tested": len(self.results), - "unique_kernels": len( - set(r.get("name", "unknown") for r in self.results) - ), - "successful_runs": len(successful_results), - "failed_runs": 
len(self.results) - len(successful_results), - }, - "performance_summary": { - "tflops_stats": { - "best": max(tflops_values, default=0), - "average": sum(tflops_values) / len(tflops_values) - if tflops_values - else 0, - "min": min(tflops_values, default=0), - "median": sorted(tflops_values)[len(tflops_values) // 2] - if tflops_values - else 0, - }, - "bandwidth_stats": { - "best_gb_s": max(bandwidth_values, default=0), - "average_gb_s": sum(bandwidth_values) / len(bandwidth_values) - if bandwidth_values - else 0, - "min_gb_s": min(bandwidth_values, default=0), - "median_gb_s": sorted(bandwidth_values)[len(bandwidth_values) // 2] - if bandwidth_values - else 0, - }, - "latency_stats": { - "best_ms": min(latency_values, default=0), - "average_ms": sum(latency_values) / len(latency_values) - if latency_values - else 0, - "max_ms": max(latency_values, default=0), - "median_ms": sorted(latency_values)[len(latency_values) // 2] - if latency_values - else 0, - }, - "kernel_type_breakdown": { - "by_pipeline": pipeline_stats, - "by_scheduler": scheduler_stats, - "by_data_type": data_type_stats, - }, - "total_problem_configurations": len(best_kernels) - if best_kernels - else 0, - }, - "kernel_results": self.results, - "best_kernels_by_problem": best_kernels or {}, - } + # Load the module dynamically + spec = importlib.util.spec_from_file_location( + "benchmark_utils", + os.path.join(parent_dir, "common", "benchmark_utils.py"), + ) + benchmark_utils = importlib.util.module_from_spec(spec) + spec.loader.exec_module(benchmark_utils) - with open(filename, "w") as f: - json.dump(output_data, f, indent=2) + return benchmark_utils - print(f"JSON results exported to {filename}") - print(f" - Total kernels: {len(self.results)}") - print(f" - Successful runs: {len(successful_results)}") - print(f" - Best TFLOPS: {max(tflops_values, default=0):.2f}") - print(f" - Best bandwidth: {max(bandwidth_values, default=0):.2f} GB/s") - print(f" - Best latency: {min(latency_values, default=0):.2f}ms") +GemmBenchmark = _import_gemm_benchmark() +benchmark_utils = _import_benchmark_utils() +class GemmPreshuffleBenchmark(GemmBenchmark): + def __init__(self, build_dir: str, verbose: bool = False): + super().__init__(build_dir, verbose, name="benchmark_gemm_preshuffle_") def main(): parser = argparse.ArgumentParser( @@ -669,12 +135,12 @@ def main(): print(f"\nBenchmark completed in {elapsed_time:.2f} seconds") # Export results - benchmark.export_csv(args.csv) - benchmark.export_best_kernels(best_kernels, args.best) + benchmark_utils.export_csv(benchmark.results, args.csv) + benchmark_utils.export_best_kernels(best_kernels, args.best) # Export JSON if requested if args.json: - benchmark.export_json(args.json, best_kernels) + benchmark_utils.export_json(benchmark.results, args.json, best_kernels) return 0 diff --git a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark_single.cpp b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark_single.cpp index 4fbb25f0c90..d03b35f2b46 100644 --- a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark_single.cpp +++ b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark_single.cpp @@ -11,78 +11,21 @@ #include "ck_tile/core.hpp" #include "ck_tile/host.hpp" +#include "gemm/gemm_common.hpp" #include "gemm_preshuffle_profiler.hpp" #include "gemm_preshuffle_common.hpp" // The kernel header is included via the compile command line with -include flag // It defines SelectedKernel struct and KERNEL_NAME -// DataTypeTraits are now defined in gemm_common.hpp 
- -// Create argument parser -inline auto create_args(int argc, char* argv[]) -{ - ck_tile::ArgParser arg_parser; - arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.") - .insert("n", "4096", "The value for n dimension. Default is 4096.") - .insert("k", "2048", "The value for k dimension. Default is 2048.") - .insert("stride_a", "0", "The stride value for tensor A. Default is 0.") - .insert("stride_b", "0", "The stride value for tensor B. Default is 0.") - .insert("stride_c", "0", "The stride value for tensor C. Default is 0.") - .insert("split_k", "1", "The split value for k dimension. Default is 1.") - .insert("verify", - "2", - "The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 " - "for validation on GPU. Default is 0, no validation.") - .insert("log", - "false", - "Whether output kernel instance information or not. Possible values are true or " - "false. Default is false") - .insert( - "warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.") - .insert( - "repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.") - .insert("timer", - "true", - "Whether if the timer is gpu timer or not. Possible values are false or true. " - "Default is true.") - .insert("init", - "0", - "The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 " - "for constant(1). Default is 0, random.") - .insert("flush_cache", - "true", - "To flush cache, possible values are true or false. " - "Default is false.") - .insert("rotating_count", "1000", "number of iterations to rotate the cache. default is 5.") - .insert("metric", - "0", - "Metric with which to measure kernel performance. Set to 0 for latency, 1 for " - "tflops, or 2 for bandwidth. Default is 0, latency.") - .insert("csv_filename", - "", - "The filename of benchmark result. Default is empty (no CSV output).") - .insert("structured_sparsity", - "false", - "Whether use sparsity kernel or not. Possible values are true or false. Default is " - "false") - .insert("json_output", - "false", - "Whether to output results in JSON format only. Possible values are true or false. 
" - "Default is " - "false"); - - bool result = arg_parser.parse(argc, argv); - return std::make_tuple(result, arg_parser); -} void benchmark_single(const ck_tile::ArgParser& arg_parser) { // Use DataTypeTraits to get the actual type names from the generated header // The generated header defines ADataType, BDataType, AccDataType, CDataType - std::string dtype_a = DataTypeTraits::name; - std::string dtype_b = DataTypeTraits::name; - std::string dtype_acc = DataTypeTraits::name; - std::string dtype_c = DataTypeTraits::name; + std::string dtype_a = ck_tile::DataTypeTraits::name; + std::string dtype_b = ck_tile::DataTypeTraits::name; + std::string dtype_acc = ck_tile::DataTypeTraits::name; + std::string dtype_c = ck_tile::DataTypeTraits::name; // Layout names from the layout types std::string layout_a = ALayout::name; @@ -119,29 +62,17 @@ void benchmark_single(const ck_tile::ArgParser& arg_parser) arg_parser.get_bool("json_output")}; // Get the profiler instance - auto& profiler = GemmProfiler::instance(setting); + auto& profiler = GemmPreshuffleProfiler::instance(setting); try { - // Create a lambda that wraps the kernel launch - std::tuple warp_tile_dims = std::make_tuple( - SelectedKernel::WarpTileM, SelectedKernel::WarpTileN, SelectedKernel::WarpTileK); - std::tuple tile_dims = - std::make_tuple(SelectedKernel::TileM, SelectedKernel::TileN, SelectedKernel::TileK); - std::tuple warp_dims = std::make_tuple(SelectedKernel::WarpPerBlock_M, - SelectedKernel::WarpPerBlock_N, - SelectedKernel::WarpPerBlock_K); - bool permuteN = SelectedKernel::PermuteN; - - KernelConfig config{tile_dims, warp_dims, warp_tile_dims, permuteN}; - auto kernel_func = [](const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& stream) { return SelectedKernel::launch(args, stream); }; // Benchmark the kernel - profiler.benchmark(gemm_problem, kernel_func, config); + profiler.benchmark(gemm_problem, kernel_func); // Select best instance based on metric profiler.select_best_instance(static_cast(arg_parser.get_int("metric"))); diff --git a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_common.hpp b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_common.hpp index 1b2cfe37350..21cda28f754 100644 --- a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_common.hpp +++ b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_common.hpp @@ -8,101 +8,20 @@ #include "ck_tile/host.hpp" #include "ck_tile/core/numeric/integer.hpp" #include "ck_tile/core/numeric/pk_int4.hpp" - -//[TODO] This can be moved to commons -// DataTypeTraits for all supported types -template -struct DataTypeTraits; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp32"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp64"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp16"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "bf16"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp8"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "bf8"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "int8"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "int32"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "pk_int4_t"; -}; - -// Helper function to determine if a layout is row-major -template -constexpr auto is_row_major(Layout) -{ - return 
ck_tile::bool_constant>{}; -} +#include "gemm/gemm_common.hpp" // Structure to hold kernel traits for dispatcher -struct KernelTraits +struct PreshuffleKernelTraits : KernelTraits { - std::string pipeline; // preshufflev2 - std::string scheduler; // intrawave, interwave, default - std::string epilogue; // cshuffle, default - bool pad_m; - bool pad_n; - bool pad_k; - bool persistent; // Constructor with defaults - KernelTraits() - : pipeline("preshufflev2"), - scheduler("default"), - epilogue("default"), - pad_m(false), - pad_n(false), - pad_k(false), - persistent(false) - { - } + PreshuffleKernelTraits() : KernelTraits() { this->pipeline = "preshufflev2"; } }; // Helper to extract traits from kernel name -inline KernelTraits extract_traits_from_name(const std::string& kernel_name) +inline PreshuffleKernelTraits extract_traits_from_name(const std::string& kernel_name) { - KernelTraits traits; + PreshuffleKernelTraits traits; // Extract pipeline if(kernel_name.find("preshufflev2") != std::string::npos) @@ -140,42 +59,3 @@ inline KernelTraits extract_traits_from_name(const std::string& kernel_name) return traits; } - -template -auto shuffle_b(const ck_tile::HostTensor& t, - ck_tile::index_t N_Warp_Tile, - ck_tile::index_t K_Warp_Tile) -{ - assert(t.get_lengths().size() == 2); - int n_ = t.get_lengths()[1]; - int k_ = t.get_lengths()[0]; - int divisor = N_Warp_Tile == 32 ? 2 : 4; - ck_tile::HostTensor t_view( - {n_ / N_Warp_Tile, N_Warp_Tile, k_ / K_Warp_Tile, divisor, K_Warp_Tile / divisor}); - std::copy(t.begin(), t.end(), t_view.begin()); - return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4}); -} - -template -auto shuffle_b_permuteN(const ck_tile::HostTensor& t, - ck_tile::index_t N_Warp_Tile, - ck_tile::index_t K_Warp_Tile, - ck_tile::index_t N_Tile, - ck_tile::index_t N_Warp) -{ - assert(t.get_lengths().size() == 2); - - int n_ = t.get_lengths()[1]; - int k_ = t.get_lengths()[0]; - int divisor = N_Warp_Tile == 32 ? 
2 : 4; - int NRepeat = N_Tile / N_Warp_Tile / N_Warp; - ck_tile::HostTensor t_view({n_ / N_Tile, - N_Warp, - N_Warp_Tile, - NRepeat, - k_ / K_Warp_Tile, - divisor, - K_Warp_Tile / divisor}); - std::copy(t.begin(), t.end(), t_view.begin()); - return ck_tile::reference_permute(t_view, {0, 3, 1, 4, 5, 2, 6}); -} diff --git a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_profiler.hpp b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_profiler.hpp index 739bd7e677a..e7af0738776 100644 --- a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_profiler.hpp +++ b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_profiler.hpp @@ -4,42 +4,26 @@ #pragma once #include "ck_tile/host/device_prop.hpp" +#include "ck_tile/host/tensor_shuffle_utils.hpp" #include "ck_tile/ops/gemm.hpp" +#include "gemm/gemm_profiler.hpp" #include "gemm_preshuffle_benchmark.hpp" -class GemmProfiler +class GemmPreshuffleProfiler + : public GemmProfiler { public: - static GemmProfiler& instance(Setting setting) - { - static GemmProfiler instance{setting}; - return instance; - } + using BaseGemm = GemmProfiler; + using BaseGemm::benchmark; - // Overload for single kernel benchmarking - void benchmark(GemmProblem& gemm_problem, - std::function - kernel_func, - KernelConfig& config) + GemmPreshuffleProfiler(Setting setting) + : GemmProfiler(setting) { - // Create a vector with a single callable that returns both name and time - std::vector(ck_tile::GemmHostArgs&, - const ck_tile::stream_config&)>> - callables; - - callables.push_back( - [kernel_func](ck_tile::GemmHostArgs& args, const ck_tile::stream_config& stream) { - float time = kernel_func(args, stream); - return std::make_tuple(std::string(KERNEL_NAME), time); - }); - - benchmark(gemm_problem, callables, config); } void benchmark(GemmProblem& gemm_problem, std::vector( - ck_tile::GemmHostArgs&, const ck_tile::stream_config&)>>& callables, - KernelConfig& config) + ck_tile::GemmHostArgs&, const ck_tile::stream_config&)>>& callables) override { const ALayout layout_a = ALayout{}; const BLayout layout_b = BLayout{}; @@ -111,21 +95,28 @@ class GemmProfiler c_m_n_dev_buf.SetZero(); c_m_n_dev_result.SetZero(); + // Create a lambda that wraps the kernel launch + KernelConfig config{SelectedKernel::WarpTileM, + SelectedKernel::WarpTileN, + SelectedKernel::WarpTileK, + SelectedKernel::TileM, + SelectedKernel::TileN, + SelectedKernel::TileK, + SelectedKernel::WarpPerBlock_M, + SelectedKernel::WarpPerBlock_N, + SelectedKernel::WarpPerBlock_K, + SelectedKernel::PermuteN}; + for(const auto& callable : callables) { - ck_tile::index_t N_Warp_Tile = std::get<1>(config.warp_tile_dims); - ck_tile::index_t K_Warp_Tile = std::get<2>(config.warp_tile_dims); - ck_tile::index_t N_Tile = std::get<1>(config.tile_dims); - ck_tile::index_t N_Warp = std::get<1>(config.warp_dims); - ck_tile::HostTensor b_shuffle_host = [&]() { if(config.permuteN) { - return shuffle_b_permuteN(b_k_n, N_Warp_Tile, K_Warp_Tile, N_Tile, N_Warp); + return ck_tile::shuffle_b_permuteN(b_k_n, config); } else { - return shuffle_b(b_k_n, N_Warp_Tile, K_Warp_Tile); + return ck_tile::shuffle_b(b_k_n, config); } }(); @@ -158,132 +149,4 @@ class GemmProfiler gemm_problem, c_m_n_dev_buf, c_m_n_ref, c_m_n_dev_result, kernel_run_result); } } - - void process_result(const GemmProblem& gemm_problem, - ck_tile::DeviceMem& c_m_n_dev_buf, - ck_tile::HostTensor& c_m_n_ref, - ck_tile::HostTensor& c_m_n_dev_result, - const std::tuple& kernel_run_result) - { - auto [name, avg_time] = kernel_run_result; - - KernelInstance 
kernel_instance{name, gemm_problem, {-1.0f, -1.0f, -1.0f}}; - - // compute performance metric - std::size_t flop = std::size_t(2) * gemm_problem.m_ * gemm_problem.n_ * gemm_problem.k_; - std::size_t num_byte = sizeof(ADataType) * gemm_problem.m_ * gemm_problem.k_ + - sizeof(BDataType) * gemm_problem.n_ * gemm_problem.k_ + - sizeof(CDataType) * gemm_problem.m_ * gemm_problem.n_; - - // update - kernel_instance.perf_result_.latency_ = avg_time; - kernel_instance.perf_result_.tflops_ = static_cast(flop) / 1.E9 / avg_time; - kernel_instance.perf_result_.bandwidth_ = num_byte / 1.E6 / avg_time; - - if(setting_.log_ > 0 && !setting_.json_output_) - { - std::cout << kernel_instance << std::endl; - } - - // verify result - c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); - - bool verified_correct = - !setting_.verify_ || - compare(name, gemm_problem.k_, gemm_problem.split_k_, c_m_n_dev_result, c_m_n_ref); - - if(verified_correct) - { - kernel_instances_.emplace_back(kernel_instance); - } - else - { - std::cout << "Verification failed, skip kernel: " << name << std::endl; - } - - // clear tensor - c_m_n_dev_buf.SetZero(); - c_m_n_dev_result.SetZero(); - } - - KernelInstance select_best_instance(Metric metric) - { - if(kernel_instances_.empty()) - throw std::runtime_error("Empty instances"); - - auto kernel_instance = *std::max_element(kernel_instances_.begin(), - kernel_instances_.end(), - [metric](const auto& a, const auto& b) { - return PerformanceResult::compare( - b.perf_result_, a.perf_result_, metric); - }); - - if(setting_.json_output_) - { - // Output clean JSON only - std::cout << kernel_instance << std::endl; - } - else - { - std::cout << "**********************************" << std::endl; - std::cout << "According to given metrics: " << get_metric_name(metric) << "\n" - << "Current kernel performance is: " << kernel_instance << std::endl; - std::cout << "**********************************" << std::endl; - } - - if(!setting_.csv_filename_.empty()) - { - std::ofstream file(setting_.csv_filename_ + ".csv", std::ios::app); - - if(!file.is_open()) - { - std::cerr << "Warning: Failed to open CSV file for writing." << std::endl; - } - else - { - if(file.tellp() == 0) - { - file << "rocm_version,device_name," - << "split_k,m,n,k,stride_a,stride_b,stride_c," - << "dtype_a,dtype_b,dtype_acc,dtype_c," << "layout_a,layout_b,layout_c," - << "structured_sparsity," << "name," - << "latency(ms),tflops(TFlops),bandwidth(GB/s),metric\n"; - } - - const auto& problem = kernel_instance.problem_; - const auto& name = kernel_instance.name_; - const auto& perf = kernel_instance.perf_result_; - - file << get_rocm_version() << "," << ck_tile::get_device_name() << "," - << problem.split_k_ << "," << problem.m_ << "," << problem.n_ << "," - << problem.k_ << "," << problem.stride_a_ << "," << problem.stride_b_ << "," - << problem.stride_c_ << "," << problem.dtype_a_ << "," << problem.dtype_b_ - << "," << problem.dtype_acc_ << "," << problem.dtype_c_ << "," - << problem.layout_a_ << "," << problem.layout_b_ << "," << problem.layout_c_ - << "," << problem.structured_sparsity_ << "," << name << "," << std::fixed - << std::setprecision(4) << perf.latency_ << "," << std::fixed - << std::setprecision(4) << perf.tflops_ << "," << std::fixed - << std::setprecision(4) << perf.bandwidth_ << "," << get_metric_name(metric) - << "\n"; - - if(!file) - { - std::cerr << "Warning: Error occurred while writing to CSV file." 
<< std::endl; - } - } - } - - return kernel_instance; - } - - GemmProfiler(const GemmProfiler&) = delete; - GemmProfiler& operator=(const GemmProfiler&) = delete; - - private: - ~GemmProfiler() { kernel_instances_.clear(); } - GemmProfiler(Setting setting) : setting_(setting) {} - - Setting setting_; - - std::vector kernel_instances_; }; diff --git a/tile_engine/ops/gemm/gemm_profiler.hpp b/tile_engine/ops/gemm/gemm_profiler.hpp new file mode 100644 index 00000000000..ab62b0616f1 --- /dev/null +++ b/tile_engine/ops/gemm/gemm_profiler.hpp @@ -0,0 +1,190 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "ck_tile/host/device_prop.hpp" +#include "ck_tile/ops/gemm.hpp" +#include "gemm_benchmark.hpp" + +template +class GemmProfiler +{ + public: + static Gemm& instance(Setting setting) + { + static Gemm instance{setting}; + return instance; + } + + // Overload for single kernel benchmarking + void benchmark(Problem& gemm_problem, + std::function kernel_func) + { + // Create a vector with a single callable that returns both name and time + std::vector< + std::function(GemmArgs&, const ck_tile::stream_config&)>> + callables; + + callables.push_back([kernel_func](GemmArgs& args, const ck_tile::stream_config& stream) { + float time = kernel_func(args, stream); + return std::make_tuple(std::string(KERNEL_NAME), time); + }); + + benchmark(gemm_problem, callables); + } + + virtual void benchmark(Problem& gemm_problem, + std::vector( + GemmArgs&, const ck_tile::stream_config&)>>& callables) = 0; + + void process_result(const Problem& gemm_problem, + ck_tile::DeviceMem& c_m_n_dev_buf, + ck_tile::HostTensor& c_m_n_host_result, + ck_tile::HostTensor& c_m_n_dev_result, + const std::tuple& kernel_run_result) + { + auto [name, avg_time] = kernel_run_result; + using DDataType = typename get_DsDataType::type; + + KernelInstance kernel_instance{name, gemm_problem, {-1.0f, -1.0f, -1.0f}}; + + // compute performance metric + std::size_t flop = std::size_t(2) * gemm_problem.m_ * gemm_problem.n_ * gemm_problem.k_; + std::size_t num_byte = sizeof(ADataType) * gemm_problem.m_ * gemm_problem.k_ + + sizeof(BDataType) * gemm_problem.n_ * gemm_problem.k_ + + sizeof(CDataType) * gemm_problem.m_ * gemm_problem.n_; + + if constexpr(!std::is_void_v) + { + ck_tile::static_for<0, DDataType::size(), 1>{}([&](auto i) { + using DType = ck_tile::remove_cvref_t>; + num_byte += sizeof(DType) * gemm_problem.m_ * gemm_problem.n_; + flop += sizeof(DType) * gemm_problem.m_ * gemm_problem.n_; + }); + } + + // update + kernel_instance.perf_result_.latency_ = avg_time; + kernel_instance.perf_result_.tflops_ = static_cast(flop) / 1.E9 / avg_time; + kernel_instance.perf_result_.bandwidth_ = num_byte / 1.E6 / avg_time; + + if(setting_.log_ > 0 && !setting_.json_output_) + { + std::cout << kernel_instance << std::endl; + } + + // verify result + c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); + int split_k = 1; + if constexpr(std::is_same_v) + { + split_k = gemm_problem.split_k_; + } + bool verified_correct = + !setting_.verify_ || + compare(name, gemm_problem.k_, split_k, c_m_n_dev_result, c_m_n_host_result); + + if(verified_correct) + { + kernel_instances_.emplace_back(kernel_instance); + } + else + { + std::cout << "Verification failed, skip kernel: " << name << std::endl; + } + + // clear tensor + c_m_n_dev_buf.SetZero(); + c_m_n_dev_result.SetZero(); + } + + KernelInstance 
select_best_instance(Metric metric) + { + if(kernel_instances_.empty()) + throw std::runtime_error("Empty instances"); + + auto kernel_instance = *std::max_element(kernel_instances_.begin(), + kernel_instances_.end(), + [metric](const auto& a, const auto& b) { + return PerformanceResult::compare( + b.perf_result_, a.perf_result_, metric); + }); + + if(setting_.json_output_) + { + // Output clean JSON only + std::cout << kernel_instance << std::endl; + } + else + { + std::cout << "**********************************" << std::endl; + std::cout << "According to given metrics: " << get_metric_name(metric) << "\n" + << "Current kernel performance is: " << kernel_instance << std::endl; + std::cout << "**********************************" << std::endl; + } + + if(!setting_.csv_filename_.empty()) + { + std::ofstream file(setting_.csv_filename_ + ".csv", std::ios::app); + + if(!file.is_open()) + { + std::cerr << "Warning: Failed to open CSV file for writing." << std::endl; + } + else + { + if(file.tellp() == 0) + { + file << "rocm_version,device_name," + << "split_k,m,n,k,stride_a,stride_b,stride_c," + << "dtype_a,dtype_b,dtype_acc,dtype_c," << "layout_a,layout_b,layout_c," + << "structured_sparsity," << "name," + << "latency(ms),tflops(TFlops),bandwidth(GB/s),metric\n"; + } + + const auto& problem = kernel_instance.problem_; + const auto& name = kernel_instance.name_; + const auto& perf = kernel_instance.perf_result_; + + file << get_rocm_version() << "," << ck_tile::get_device_name() << "," + << problem.split_k_ << "," << problem.m_ << "," << problem.n_ << "," + << problem.k_ << "," << problem.stride_a_ << "," << problem.stride_b_ << "," + << problem.stride_c_ << "," << problem.dtype_a_ << "," << problem.dtype_b_ + << "," << problem.dtype_acc_ << "," << problem.dtype_c_ << "," + << problem.layout_a_ << "," << problem.layout_b_ << "," << problem.layout_c_ + << "," << problem.structured_sparsity_ << "," << name << "," << std::fixed + << std::setprecision(4) << perf.latency_ << "," << std::fixed + << std::setprecision(4) << perf.tflops_ << "," << std::fixed + << std::setprecision(4) << perf.bandwidth_ << "," << get_metric_name(metric) + << "\n"; + + if(!file) + { + std::cerr << "Warning: Error occurred while writing to CSV file." << std::endl; + } + } + } + + return kernel_instance; + } + + GemmProfiler(const GemmProfiler&) = delete; + GemmProfiler& operator=(const GemmProfiler&) = delete; + + protected: + virtual ~GemmProfiler() { kernel_instances_.clear(); } + GemmProfiler(Setting setting) : setting_(setting) {} + + Setting setting_; + + std::vector> kernel_instances_; +}; diff --git a/tile_engine/ops/gemm/gemm_universal/CMakeLists.txt b/tile_engine/ops/gemm/gemm_universal/CMakeLists.txt index 7505fcd6d04..7f8048b5945 100644 --- a/tile_engine/ops/gemm/gemm_universal/CMakeLists.txt +++ b/tile_engine/ops/gemm/gemm_universal/CMakeLists.txt @@ -68,7 +68,7 @@ function(create_individual_gemm_universal_target datatype layout trait tile_conf # Create the executable add_executable(${target_name} EXCLUDE_FROM_ALL - ${GEMM_UNIVERSAL_SOURCE_DIR}/gemm_benchmark_single.cpp + ${GEMM_UNIVERSAL_SOURCE_DIR}/gemm_universal_benchmark_single.cpp ${instance_header} ) diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_benchmark.hpp b/tile_engine/ops/gemm/gemm_universal/gemm_benchmark.hpp deleted file mode 100644 index 7c8df32ad89..00000000000 --- a/tile_engine/ops/gemm/gemm_universal/gemm_benchmark.hpp +++ /dev/null @@ -1,242 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
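The new tile_engine/ops/gemm/gemm_profiler.hpp above centralizes what the per-op profilers used to duplicate: result bookkeeping, metric-based selection and CSV export sit behind a templated base with a per-derived-type singleton and a pure-virtual benchmark hook. A stripped-down sketch of that shape, with placeholder types rather than the real ck_tile signatures:

    #include <functional>
    #include <string>
    #include <tuple>
    #include <vector>

    struct Setting { int n_warmup_ = 50; int n_repeat_ = 100; };  // trimmed
    struct Problem { int m_ = 0, n_ = 0, k_ = 0; };               // trimmed
    struct Args {};        // stands in for the op's host-args type
    struct StreamCfg {};   // stands in for ck_tile::stream_config

    template <typename Derived>
    class ProfilerBase
    {
    public:
        // One instance per derived profiler type; the Setting is captured on first use.
        static Derived& instance(Setting setting)
        {
            static Derived inst{setting};
            return inst;
        }

        // Convenience overload: wrap a single launch function, tag it with its name,
        // then defer to the derived class's batched benchmark(). Derived classes keep this
        // overload visible with `using ProfilerBase::benchmark;`.
        void benchmark(Problem& problem, std::function<float(Args&, const StreamCfg&)> launch)
        {
            std::vector<std::function<std::tuple<std::string, float>(Args&, const StreamCfg&)>> callables;
            callables.push_back([launch](Args& args, const StreamCfg& stream) {
                return std::make_tuple(std::string("kernel_name"), launch(args, stream));
            });
            benchmark(problem, callables);
        }

        // Each op (universal, preshuffle, ...) supplies its tensor setup and launches here.
        virtual void benchmark(Problem& problem,
                               std::vector<std::function<std::tuple<std::string, float>(
                                   Args&, const StreamCfg&)>>& callables) = 0;

    protected:
        explicit ProfilerBase(Setting setting) : setting_(setting) {}
        virtual ~ProfilerBase() = default;
        Setting setting_;
    };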
-// SPDX-License-Identifier: MIT - -#pragma once - -#include -#include -#include -#include -#include - -#include "ck_tile/core.hpp" -#include "ck_tile/host.hpp" -#include "gemm_common.hpp" - -// Data types and Layouts are defined by the generated kernel headers -// No hardcoded type definitions here to avoid conflicts - -enum class Metric -{ - LATENCY = 0, - TFLOPS = 1, - BANDWIDTH = 2 -}; - -inline constexpr auto get_metric_name(Metric m) -{ - switch(m) - { - case Metric::LATENCY: return "latency"; - case Metric::TFLOPS: return "tflops"; - case Metric::BANDWIDTH: return "bandwidth"; - default: throw std::invalid_argument("Unsupported metric type"); - } -} - -struct GemmProblem -{ - int split_k_; - int m_, n_, k_; - int stride_a_, stride_b_, stride_c_; - - std::string dtype_a_, dtype_b_, dtype_acc_, dtype_c_; - std::string layout_a_, layout_b_, layout_c_; - - bool structured_sparsity_; - - friend std::ostream& operator<<(std::ostream& os, const GemmProblem& problem) - { - os << "{\n" - << " \"split_k\":" << problem.split_k_ << ",\n" - << " \"m\":" << problem.m_ << ",\n" - << " \"n\":" << problem.n_ << ",\n" - << " \"k\":" << problem.k_ << ",\n" - << " \"stride_a\":" << problem.stride_a_ << ",\n" - << " \"stride_b\":" << problem.stride_b_ << ",\n" - << " \"stride_c\":" << problem.stride_c_ << ",\n" - << " \"dtype_a\":\"" << problem.dtype_a_ << "\",\n" - << " \"dtype_b\":\"" << problem.dtype_b_ << "\",\n" - << " \"dtype_acc\":\"" << problem.dtype_acc_ << "\",\n" - << " \"dtype_c\":\"" << problem.dtype_c_ << "\",\n" - << " \"layout_a\":\"" << problem.layout_a_ << "\",\n" - << " \"layout_b\":\"" << problem.layout_b_ << "\",\n" - << " \"layout_c\":\"" << problem.layout_c_ << "\",\n" - << " \"structured_sparsity\":" << (problem.structured_sparsity_ ? "true" : "false") - << "\n" - << "}"; - return os; - } -}; - -struct PerformanceResult -{ - double latency_; - double tflops_; - double bandwidth_; - - static bool compare(const PerformanceResult& a, const PerformanceResult& b, Metric m) - { - switch(m) - { - case Metric::LATENCY: return a.latency_ < b.latency_; - case Metric::TFLOPS: return a.tflops_ > b.tflops_; - case Metric::BANDWIDTH: return a.bandwidth_ > b.bandwidth_; - default: throw std::invalid_argument("Unsupported metric type"); - } - } - - friend std::ostream& operator<<(std::ostream& os, const PerformanceResult& result) - { - os << "{\n" - << " \"latency(ms)\": " << std::fixed << std::setprecision(2) << result.latency_ - << ",\n" - << " \"tflops(TFlops)\": " << result.tflops_ << ",\n" - << " \"bandwidth(GB/s)\": " << result.bandwidth_ << "\n" - << "}"; - return os; - } -}; - -struct KernelInstance -{ - std::string name_; - GemmProblem problem_; - PerformanceResult perf_result_; - - static bool compare(const KernelInstance& a, const KernelInstance& b, Metric m) - { - return PerformanceResult::compare(a.perf_result_, b.perf_result_, m); - } - - friend std::ostream& operator<<(std::ostream& os, const KernelInstance& obj) - { - os << "{\n" - << " \"name\": \"" << obj.name_ << "\",\n" - << " \"problem\": " << obj.problem_ << ",\n" - << " \"perf_result\": " << obj.perf_result_ << "\n" - << "}"; - return os; - } -}; - -struct Setting -{ - int n_warmup_; - int n_repeat_; - bool is_gpu_timer_; - int verify_; - int init_method_; - bool log_; - std::string csv_filename_; - bool flush_cache_; - int rotating_count_; - bool json_output_; -}; - -inline std::string get_rocm_version() -{ - std::ifstream version_file("/opt/rocm/.info/version"); - if(version_file.is_open()) - { - std::string version; - 
std::getline(version_file, version); - return version; - } - return "Unknown"; -} - -template -auto calculate_rtol_atol(const ck_tile::index_t K, - const ck_tile::index_t kbatch, - const float max_accumulated_value) -{ - using ComputeType = - std::conditional_t; - // Calculate thresholds - const auto rtol = ck_tile::get_relative_threshold( - ck_tile::integer_divide_ceil(K, kbatch)); - const auto atol = ck_tile::get_absolute_threshold( - max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch)); - // Calculate error due to split_k accumulation - const auto rtol_split_k = - ck_tile::get_relative_threshold(kbatch); - const auto atol_split_k = ck_tile::get_absolute_threshold( - max_accumulated_value, kbatch); - // Use higher threshold - return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k)); -} - -/// @brief Function to compare the results of the device and host computations -bool compare(std::string instanceName, - ck_tile::index_t K, - ck_tile::index_t kbatch, - ck_tile::HostTensor& c_m_n_dev_result, - ck_tile::HostTensor& c_m_n_host_result) -{ - const float max_accumulated_value = - *std::max_element(c_m_n_host_result.mData.begin(), c_m_n_host_result.mData.end()); - const auto rtol_atol = calculate_rtol_atol( - K, kbatch, max_accumulated_value); - bool pass = ck_tile::check_err(c_m_n_dev_result, - c_m_n_host_result, - "Error: Incorrect results!", - rtol_atol.at(ck_tile::number<0>{}), - rtol_atol.at(ck_tile::number<1>{})); - - std::cout << "For " << instanceName << " Relative error threshold is " - << rtol_atol.at(ck_tile::number<0>{}) << " Absolute error threshold is " - << rtol_atol.at(ck_tile::number<1>{}) << std::endl; - std::cout << "The verification result is:" << (pass ? "correct" : "fail") << std::endl; - - return pass; -} - -/// @brief Function to get the kernel output with reference implementation on CPU/GPU -void gemm_host_reference(int verify, - ck_tile::HostTensor& a_m_k, - ck_tile::HostTensor& b_k_n, - ck_tile::HostTensor& c_m_n_host_result, - ck_tile::DeviceMem& a_m_k_dev_buf, - ck_tile::DeviceMem& b_k_n_dev_buf, - ck_tile::index_t M, - ck_tile::index_t N, - ck_tile::index_t K, - ck_tile::index_t stride_A, - ck_tile::index_t stride_B, - ck_tile::index_t stride_C) -{ - if(verify == 1) - { - c_m_n_host_result.SetZero(); - - ck_tile::reference_gemm( - a_m_k, b_k_n, c_m_n_host_result); - } - else if(verify == 2) - { - if constexpr(std::is_same_v) - { - // Restore input for B for gpu reference - b_k_n_dev_buf.ToDevice(b_k_n.data()); - } - - ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_host_result.get_element_space_size_in_bytes()); - c_m_n_host_result.SetZero(); - c_m_n_gpu_buf_ref.SetZero(); - - ADataType* d_A = static_cast(a_m_k_dev_buf.GetDeviceBuffer()); - BDataType* d_B = static_cast(b_k_n_dev_buf.GetDeviceBuffer()); - CDataType* d_C = static_cast(c_m_n_gpu_buf_ref.GetDeviceBuffer()); - - ck_tile::reference_gemm_gpu(d_A, d_B, d_C, M, N, K, stride_A, stride_B, stride_C); - - c_m_n_gpu_buf_ref.FromDevice(c_m_n_host_result.data()); - } -} diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py b/tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py deleted file mode 100644 index b7424c6d1da..00000000000 --- a/tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py +++ /dev/null @@ -1,678 +0,0 @@ -# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
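PerformanceResult::compare above orders two results under a chosen metric, where "better" means smaller latency but larger TFLOPS or bandwidth, and every select_best_instance in this patch feeds it to std::max_element with the arguments swapped so that "better" sorts as "greater". A self-contained sketch of that comparator trick (the data values are made up):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    enum class Metric { LATENCY, TFLOPS };

    struct Perf { double latency_ms; double tflops; };

    // true when `a` is better than `b` under the chosen metric
    bool better(const Perf& a, const Perf& b, Metric m)
    {
        return m == Metric::LATENCY ? a.latency_ms < b.latency_ms : a.tflops > b.tflops;
    }

    int main()
    {
        std::vector<Perf> results{{0.30, 90.0}, {0.21, 120.0}, {0.25, 105.0}};

        // Swapping the arguments turns "is better" into "is greater", so std::max_element
        // returns the best entry even for latency, where smaller numbers win.
        const auto best = *std::max_element(results.begin(), results.end(),
                                            [](const Perf& a, const Perf& b) {
                                                return better(b, a, Metric::LATENCY);
                                            });
        std::printf("best: %.2f ms, %.2f TFLOPS\n", best.latency_ms, best.tflops);
        return 0;
    }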
-# SPDX-License-Identifier: MIT - -import sys -import json -import subprocess -import argparse -import csv -import time -from pathlib import Path -from typing import List, Dict, Tuple, Optional - - -class GemmBenchmark: - def __init__(self, build_dir: str, verbose: bool = False): - self.build_dir = Path(build_dir) - self.verbose = verbose - self.results = [] - - def discover_kernels(self) -> List[Path]: - """Find all benchmark_gemm_* executables in the build directory""" - bin_dir = self.build_dir / "bin" - if not bin_dir.exists(): - print(f"Error: Binary directory {bin_dir} does not exist") - return [] - - kernels = list(bin_dir.glob("benchmark_gemm_*")) - if self.verbose: - print(f"Found {len(kernels)} kernel executables") - for k in kernels: - print(f" - {k.name}") - return kernels - - def extract_kernel_info(self, kernel_path: Path) -> Dict[str, str]: - """Extract comprehensive kernel information from filename""" - name = kernel_path.stem - - # Initialize with basic info - info = { - "executable": str(kernel_path), - "name": name, - "data_type": "unknown", - "layout": "unknown", - "pipeline": "unknown", - "scheduler": "unknown", - "epilogue": "unknown", - } - - # Parse the kernel name pattern: - # benchmark_gemm_fp16_rcr_mem_default_intrawave_False_False_False_False_False_256x256x32_2x2x1_4x64x16 - parts = name.split("_") - - if len(parts) >= 3: - # Extract data type (3rd part after benchmark_gemm_) - info["data_type"] = parts[2] if len(parts) > 2 else "unknown" - - # Extract layout (4th part) - info["layout"] = parts[3] if len(parts) > 3 else "unknown" - - # Extract pipeline (5th part) - info["pipeline"] = parts[4] if len(parts) > 4 else "unknown" - - # Extract epilogue (6th part) - info["epilogue"] = parts[5] if len(parts) > 5 else "unknown" - - # Extract scheduler (7th part) - info["scheduler"] = parts[6] if len(parts) > 6 else "unknown" - - # Extract detailed configuration from the end of the name - config_info = self.parse_detailed_config(name) - info.update(config_info) - - # Generate config ID - info["config_id"] = self.generate_config_id(info) - - return info - - def parse_detailed_config(self, kernel_name: str) -> Dict: - """Parse detailed configuration from kernel name""" - config = { - "tile_sizes": {"tile_m": 0, "tile_n": 0, "tile_k": 0}, - "warp_config": {"warp_m": 0, "warp_n": 0, "warp_k": 0}, - "warp_tile": {"warp_tile_m": 0, "warp_tile_n": 0, "warp_tile_k": 0}, - "optimization_flags": { - "pad_m": False, - "pad_n": False, - "pad_k": False, - "persistent": False, - }, - } - - # Split by underscore and look for patterns - parts = kernel_name.split("_") - - # Look for boolean flags (sequence of True/False values) - bool_sequence = [] - for i, part in enumerate(parts): - if part in ["True", "False"]: - bool_sequence.append(part == "True") - # Continue collecting consecutive boolean values - j = i + 1 - while j < len(parts) and parts[j] in ["True", "False"]: - bool_sequence.append(parts[j] == "True") - j += 1 - break - - # Assign boolean flags if we found them - # Order: pad_m, pad_n, pad_k, persistent (4 flags total) - if len(bool_sequence) >= 4: - config["optimization_flags"]["pad_m"] = bool_sequence[0] - config["optimization_flags"]["pad_n"] = bool_sequence[1] - config["optimization_flags"]["pad_k"] = bool_sequence[2] - config["optimization_flags"]["persistent"] = bool_sequence[3] - - # Look for tile size patterns (e.g., 256x256x32_2x2x1_4x64x16) - # The pattern is: tile_sizes_warp_config_warp_tile - dimension_groups = [] - for part in parts: - if "x" in part and 
len(part.split("x")) == 3: - try: - dims = [int(x) for x in part.split("x")] - if all(d > 0 for d in dims): - dimension_groups.append(dims) - except ValueError: - continue - - # Assign dimensions based on order and magnitude - if len(dimension_groups) >= 3: - # Sort by magnitude to identify: largest=tile_sizes, smallest=warp_config, middle=warp_tile - sorted_groups = sorted(dimension_groups, key=lambda x: max(x), reverse=True) - - # Largest dimensions = tile sizes - config["tile_sizes"]["tile_m"] = sorted_groups[0][0] - config["tile_sizes"]["tile_n"] = sorted_groups[0][1] - config["tile_sizes"]["tile_k"] = sorted_groups[0][2] - - # Smallest dimensions = warp config - config["warp_config"]["warp_m"] = sorted_groups[2][0] - config["warp_config"]["warp_n"] = sorted_groups[2][1] - config["warp_config"]["warp_k"] = sorted_groups[2][2] - - # Middle dimensions = warp tile - config["warp_tile"]["warp_tile_m"] = sorted_groups[1][0] - config["warp_tile"]["warp_tile_n"] = sorted_groups[1][1] - config["warp_tile"]["warp_tile_k"] = sorted_groups[1][2] - elif len(dimension_groups) == 2: - # If only 2 groups, assign based on magnitude - sorted_groups = sorted(dimension_groups, key=lambda x: max(x), reverse=True) - - # Larger = tile sizes - config["tile_sizes"]["tile_m"] = sorted_groups[0][0] - config["tile_sizes"]["tile_n"] = sorted_groups[0][1] - config["tile_sizes"]["tile_k"] = sorted_groups[0][2] - - # Smaller = warp config - config["warp_config"]["warp_m"] = sorted_groups[1][0] - config["warp_config"]["warp_n"] = sorted_groups[1][1] - config["warp_config"]["warp_k"] = sorted_groups[1][2] - elif len(dimension_groups) == 1: - # Only one group - assume it's tile sizes - config["tile_sizes"]["tile_m"] = dimension_groups[0][0] - config["tile_sizes"]["tile_n"] = dimension_groups[0][1] - config["tile_sizes"]["tile_k"] = dimension_groups[0][2] - - return config - - def generate_config_id(self, info: Dict) -> str: - """Generate a compact config ID from kernel info""" - # Create a compact identifier - parts = [ - info.get("data_type", "unk"), - info.get("layout", "unk"), - info.get("pipeline", "unk"), - info.get("scheduler", "unk"), - ] - - # Add tile configuration if available - tile_sizes = info.get("tile_sizes", {}) - if tile_sizes.get("tile_m", 0) > 0: - tile_str = ( - f"{tile_sizes['tile_m']}x{tile_sizes['tile_n']}x{tile_sizes['tile_k']}" - ) - parts.append(tile_str) - - # Add warp config if available - warp_config = info.get("warp_config", {}) - if warp_config.get("warp_m", 0) > 0: - warp_str = f"w{warp_config['warp_m']}x{warp_config['warp_n']}x{warp_config['warp_k']}" - parts.append(warp_str) - - # Add warp tile if available - warp_tile = info.get("warp_tile", {}) - if warp_tile.get("warp_tile_m", 0) > 0: - warp_tile_str = f"wt{warp_tile['warp_tile_m']}x{warp_tile['warp_tile_n']}x{warp_tile['warp_tile_k']}" - parts.append(warp_tile_str) - - return "_".join(parts) - - def run_kernel(self, kernel_path: Path, params: Dict[str, str]) -> Optional[Dict]: - """Run a single kernel with given parameters and save output to individual JSON file""" - # Create results directory - results_dir = self.build_dir / "results" - results_dir.mkdir(exist_ok=True) - - # Generate unique JSON filename for this kernel - json_file = results_dir / f"{kernel_path.stem}.json" - - cmd = [str(kernel_path)] - - # Add parameters - for key, value in params.items(): - cmd.append(f"-{key}={value}") - - # Add JSON output flag for clean JSON output - cmd.append("-json_output=true") - - if self.verbose: - print(f"Running: {' '.join(cmd)}") - 
- try: - result = subprocess.run(cmd, capture_output=True, text=True, timeout=60) - - if result.returncode != 0: - print(f"Error running {kernel_path.name}: {result.stderr}") - return None - - # Save raw output to individual JSON file - output = result.stdout.strip() - if output: - with open(json_file, "w") as f: - f.write(output) - - # Parse the JSON file - return self.parse_json_file(json_file) - else: - print(f"No output from {kernel_path.name}") - return None - - except subprocess.TimeoutExpired: - print(f"Timeout running {kernel_path.name}") - return None - except Exception as e: - print(f"Error running {kernel_path.name}: {e}") - return None - - def parse_json_file(self, json_file: Path) -> Optional[Dict]: - """Parse JSON data from individual kernel output file""" - try: - with open(json_file, "r") as f: - content = f.read().strip() - - # Parse the JSON directly since executables produce clean JSON - data = json.loads(content) - - # Return the complete JSON data as-is, just add some convenience fields - result = data.copy() - if "perf_result" in data: - perf = data["perf_result"] - # Add convenience fields for backward compatibility - result["time_ms"] = perf.get("latency(ms)", 0) - result["tflops"] = perf.get("tflops(TFlops)", 0) - result["bandwidth_gb_s"] = perf.get("bandwidth(GB/s)", 0) - - return result - - except json.JSONDecodeError as e: - if self.verbose: - print(f"Failed to parse JSON from {json_file}: {e}") - return None - except Exception as e: - if self.verbose: - print(f"Error reading JSON file {json_file}: {e}") - return None - - def benchmark_problem_size( - self, - kernels: List[Path], - m: int, - n: int, - k: int, - split_k: int = 1, - verify: int = 0, - warmup: int = 50, - repeat: int = 100, - flush_cache: bool = True, - rotating_count: int = 1000, - ) -> List[Dict]: - """Benchmark all kernels for a specific problem size""" - results = [] - - params = { - "m": m, - "n": n, - "k": k, - "split_k": split_k, - "verify": verify, - "warmup": warmup, - "repeat": repeat, - "flush_cache": str(flush_cache).lower(), - "rotating_count": rotating_count, - } - - print(f"\nBenchmarking M={m}, N={n}, K={k}, split_k={split_k}") - - for kernel_path in kernels: - kernel_info = self.extract_kernel_info(kernel_path) - result = self.run_kernel(kernel_path, params) - - if result: - # Create new structured result format - structured_result = { - "name": kernel_info["name"], # Add name field for compatibility - "config_id": kernel_info["config_id"], - "problem": result.get("problem", {}), - "perf_result": result.get("perf_result", {}), - "config": { - "data_type": kernel_info["data_type"], - "layout": kernel_info["layout"], - "pipeline": kernel_info["pipeline"], - "scheduler": kernel_info["scheduler"], - "epilogue": kernel_info["epilogue"], - "tile_sizes": kernel_info.get("tile_sizes", {}), - "warp_config": kernel_info.get("warp_config", {}), - "warp_tile": kernel_info.get("warp_tile", {}), - "optimization_flags": kernel_info.get("optimization_flags", {}), - }, - "executable": kernel_info["executable"], - # Keep backward compatibility fields - "time_ms": result.get("time_ms", 0), - "tflops": result.get("tflops", 0), - "bandwidth_gb_s": result.get("bandwidth_gb_s", 0), - } - - results.append(structured_result) - - if self.verbose: - print( - f" {kernel_info['config_id']}: {structured_result['tflops']:.2f} TFLOPS, {structured_result['bandwidth_gb_s']:.2f} GB/s, {structured_result['time_ms']:.2f}ms" - ) - - return results - - def find_best_kernel( - self, results: List[Dict], metric: str = 
"tflops" - ) -> Optional[Dict]: - """Find the best performing kernel based on metric""" - if not results: - return None - - if metric == "tflops": - return max(results, key=lambda x: x.get("tflops", 0)) - elif metric == "time_ms": - return min(results, key=lambda x: x.get("time_ms", float("inf"))) - elif metric == "bandwidth_gb_s": - return max(results, key=lambda x: x.get("bandwidth_gb_s", 0)) - else: - raise ValueError(f"Unknown metric: {metric}") - - def benchmark_sweep( - self, - problem_sizes: List[Tuple[int, int, int]], - split_k_values: List[int] = [1], - verify: bool = False, - warmup: int = 50, - repeat: int = 100, - flush_cache: bool = True, - rotating_count: int = 1000, - ) -> Dict: - """Run comprehensive benchmark sweep""" - kernels = self.discover_kernels() - if not kernels: - print("No kernels found!") - return {} - - all_results = [] - best_kernels = {} - - for m, n, k in problem_sizes: - for split_k in split_k_values: - results = self.benchmark_problem_size( - kernels, - m, - n, - k, - split_k, - verify=2 if verify else 0, - warmup=warmup, - repeat=repeat, - flush_cache=flush_cache, - rotating_count=rotating_count, - ) - - all_results.extend(results) - - # Find best kernel for this configuration - best = self.find_best_kernel(results) - if best: - key = f"m{m}_n{n}_k{k}_splitk{split_k}" - best_kernels[key] = best - print( - f"Best for {key}: {best['name']} ({best['tflops']:.2f} TFLOPS, {best['bandwidth_gb_s']:.2f} GB/s, {best['time_ms']:.2f}ms)" - ) - - self.results = all_results - return best_kernels - - def export_csv(self, filename: str): - """Export all results to CSV""" - if not self.results: - print("No results to export") - return - - # Get all unique keys from results - all_keys = set() - for result in self.results: - all_keys.update(result.keys()) - - # Sort keys for consistent output - fieldnames = sorted(all_keys) - - with open(filename, "w", newline="") as csvfile: - writer = csv.DictWriter(csvfile, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(self.results) - - print(f"Results exported to {filename}") - - def export_best_kernels(self, best_kernels: Dict, filename: str): - """Export best kernel selections to file""" - with open(filename, "w") as f: - f.write("# Best kernel selections\n") - f.write( - "# Format: problem_size -> kernel_name (TFLOPS, bandwidth, latency)\n\n" - ) - - for key, kernel in sorted(best_kernels.items()): - f.write( - f"{key}: {kernel['name']} ({kernel['tflops']:.2f} TFLOPS, {kernel['bandwidth_gb_s']:.2f} GB/s, {kernel['time_ms']:.2f}ms)\n" - ) - - print(f"Best kernels exported to {filename}") - - def export_json(self, filename: str, best_kernels: Dict = None): - """Export all results and best kernels to JSON with comprehensive metadata""" - from datetime import datetime - - # Calculate comprehensive summary statistics for all metrics - successful_results = [r for r in self.results if r.get("tflops", 0) > 0] - - tflops_values = [r.get("tflops", 0) for r in successful_results] - bandwidth_values = [r.get("bandwidth_gb_s", 0) for r in successful_results] - latency_values = [ - r.get("time_ms", 0) for r in successful_results if r.get("time_ms", 0) > 0 - ] - - # Performance breakdown by kernel type - pipeline_stats = {} - scheduler_stats = {} - data_type_stats = {} - - for result in successful_results: - # Get config info from the new structure - config = result.get("config", {}) - - # Pipeline statistics - pipeline = config.get("pipeline", "unknown") - if pipeline not in pipeline_stats: - pipeline_stats[pipeline] = { - 
"count": 0, - "avg_tflops": 0, - "best_tflops": 0, - } - pipeline_stats[pipeline]["count"] += 1 - pipeline_stats[pipeline]["best_tflops"] = max( - pipeline_stats[pipeline]["best_tflops"], result.get("tflops", 0) - ) - - # Scheduler statistics - scheduler = config.get("scheduler", "unknown") - if scheduler not in scheduler_stats: - scheduler_stats[scheduler] = { - "count": 0, - "avg_tflops": 0, - "best_tflops": 0, - } - scheduler_stats[scheduler]["count"] += 1 - scheduler_stats[scheduler]["best_tflops"] = max( - scheduler_stats[scheduler]["best_tflops"], result.get("tflops", 0) - ) - - # Data type statistics - data_type = config.get("data_type", "unknown") - if data_type not in data_type_stats: - data_type_stats[data_type] = { - "count": 0, - "avg_tflops": 0, - "best_tflops": 0, - } - data_type_stats[data_type]["count"] += 1 - data_type_stats[data_type]["best_tflops"] = max( - data_type_stats[data_type]["best_tflops"], result.get("tflops", 0) - ) - - # Calculate averages for breakdown stats - for stats_dict, field_name in [ - (pipeline_stats, "pipeline"), - (scheduler_stats, "scheduler"), - (data_type_stats, "data_type"), - ]: - for key in stats_dict: - relevant_results = [ - r - for r in successful_results - if r.get("config", {}).get(field_name, "unknown") == key - ] - if relevant_results: - stats_dict[key]["avg_tflops"] = sum( - r.get("tflops", 0) for r in relevant_results - ) / len(relevant_results) - - output_data = { - "benchmark_metadata": { - "timestamp": datetime.now().isoformat(), - "total_kernels_tested": len(self.results), - "unique_kernels": len( - set(r.get("name", "unknown") for r in self.results) - ), - "successful_runs": len(successful_results), - "failed_runs": len(self.results) - len(successful_results), - }, - "performance_summary": { - "tflops_stats": { - "best": max(tflops_values, default=0), - "average": sum(tflops_values) / len(tflops_values) - if tflops_values - else 0, - "min": min(tflops_values, default=0), - "median": sorted(tflops_values)[len(tflops_values) // 2] - if tflops_values - else 0, - }, - "bandwidth_stats": { - "best_gb_s": max(bandwidth_values, default=0), - "average_gb_s": sum(bandwidth_values) / len(bandwidth_values) - if bandwidth_values - else 0, - "min_gb_s": min(bandwidth_values, default=0), - "median_gb_s": sorted(bandwidth_values)[len(bandwidth_values) // 2] - if bandwidth_values - else 0, - }, - "latency_stats": { - "best_ms": min(latency_values, default=0), - "average_ms": sum(latency_values) / len(latency_values) - if latency_values - else 0, - "max_ms": max(latency_values, default=0), - "median_ms": sorted(latency_values)[len(latency_values) // 2] - if latency_values - else 0, - }, - "kernel_type_breakdown": { - "by_pipeline": pipeline_stats, - "by_scheduler": scheduler_stats, - "by_data_type": data_type_stats, - }, - "total_problem_configurations": len(best_kernels) - if best_kernels - else 0, - }, - "kernel_results": self.results, - "best_kernels_by_problem": best_kernels or {}, - } - - with open(filename, "w") as f: - json.dump(output_data, f, indent=2) - - print(f"JSON results exported to {filename}") - print(f" - Total kernels: {len(self.results)}") - print(f" - Successful runs: {len(successful_results)}") - print(f" - Best TFLOPS: {max(tflops_values, default=0):.2f}") - print(f" - Best bandwidth: {max(bandwidth_values, default=0):.2f} GB/s") - print(f" - Best latency: {min(latency_values, default=0):.2f}ms") - - -def main(): - parser = argparse.ArgumentParser(description="GEMM Kernel Benchmarking Tool") - parser.add_argument( - 
"build_dir", help="Build directory containing kernel executables" - ) - parser.add_argument( - "--problem-sizes", - nargs="+", - default=["1024,1024,1024", "2048,2048,2048", "4096,4096,4096"], - help="Problem sizes as M,N,K tuples", - ) - parser.add_argument( - "--split-k", nargs="+", type=int, default=[1], help="Split-K values to test" - ) - parser.add_argument("--verify", action="store_true", help="Enable verification") - parser.add_argument( - "--csv", default="gemm_benchmark_results.csv", help="CSV output filename" - ) - parser.add_argument( - "--best", default="best_kernels.txt", help="Best kernels output filename" - ) - parser.add_argument("--verbose", action="store_true", help="Verbose output") - parser.add_argument( - "--warmup", - type=int, - default=50, - help="Number of warmup iterations (default: 50)", - ) - parser.add_argument( - "--repeat", - type=int, - default=100, - help="Number of benchmark iterations (default: 100)", - ) - parser.add_argument( - "--flush-cache", - action="store_true", - default=True, - help="Enable cache flushing (default: True)", - ) - parser.add_argument( - "--rotating-count", - type=int, - default=1000, - help="Number of iterations to rotate cache (default: 1000)", - ) - parser.add_argument("--json", help="JSON output filename (optional)") - - args = parser.parse_args() - - # Parse problem sizes - problem_sizes = [] - for size_str in args.problem_sizes: - try: - m, n, k = map(int, size_str.split(",")) - problem_sizes.append((m, n, k)) - except ValueError: - print(f"Invalid problem size: {size_str}") - return 1 - - # Create benchmark instance - benchmark = GemmBenchmark(args.build_dir, verbose=args.verbose) - - # Run benchmark sweep - print("Starting GEMM kernel benchmark sweep...") - start_time = time.time() - - best_kernels = benchmark.benchmark_sweep( - problem_sizes=problem_sizes, - split_k_values=args.split_k, - verify=args.verify, - warmup=args.warmup, - repeat=args.repeat, - flush_cache=args.flush_cache, - rotating_count=args.rotating_count, - ) - - elapsed_time = time.time() - start_time - print(f"\nBenchmark completed in {elapsed_time:.2f} seconds") - - # Export results - benchmark.export_csv(args.csv) - benchmark.export_best_kernels(best_kernels, args.best) - - # Export JSON if requested - if args.json: - benchmark.export_json(args.json, best_kernels) - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_benchmark_single.cpp b/tile_engine/ops/gemm/gemm_universal/gemm_benchmark_single.cpp deleted file mode 100644 index 6323c066a1a..00000000000 --- a/tile_engine/ops/gemm/gemm_universal/gemm_benchmark_single.cpp +++ /dev/null @@ -1,160 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#include -#include -#include -#include -#include -#include -#include - -#include "ck_tile/core.hpp" -#include "ck_tile/host.hpp" -#include "gemm_profiler.hpp" -#include "gemm_common.hpp" - -// The kernel header is included via the compile command line with -include flag -// It defines SelectedKernel struct and KERNEL_NAME -// DataTypeTraits are now defined in gemm_common.hpp - -// Create argument parser -inline auto create_args(int argc, char* argv[]) -{ - ck_tile::ArgParser arg_parser; - arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.") - .insert("n", "4096", "The value for n dimension. Default is 4096.") - .insert("k", "2048", "The value for k dimension. 
Default is 2048.") - .insert("stride_a", "0", "The stride value for tensor A. Default is 0.") - .insert("stride_b", "0", "The stride value for tensor B. Default is 0.") - .insert("stride_c", "0", "The stride value for tensor C. Default is 0.") - .insert("split_k", "1", "The split value for k dimension. Default is 1.") - .insert("verify", - "2", - "The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 " - "for validation on GPU. Default is 2, GPU validation.") - .insert("log", - "false", - "Whether output kernel instance information or not. Possible values are true or " - "false. Default is false") - .insert( - "warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.") - .insert( - "repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.") - .insert("timer", - "true", - "Whether if the timer is gpu timer or not. Possible values are false or true. " - "Default is true.") - .insert("init", - "0", - "The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 " - "for constant(1). Default is 0, random.") - .insert("flush_cache", - "true", - "To flush cache, possible values are true or false. " - "Default is false.") - .insert("rotating_count", "1000", "number of iterations to rotate the cache. default is 5.") - .insert("metric", - "0", - "Metric with which to measure kernel performance. Set to 0 for latency, 1 for " - "tflops, or 2 for bandwidth. Default is 0, latency.") - .insert("csv_filename", - "", - "The filename of benchmark result. Default is empty (no CSV output).") - .insert("structured_sparsity", - "false", - "Whether use sparsity kernel or not. Possible values are true or false. Default is " - "false") - .insert("json_output", - "false", - "Whether to output results in JSON format only. Possible values are true or false. 
" - "Default is " - "false"); - - bool result = arg_parser.parse(argc, argv); - return std::make_tuple(result, arg_parser); -} - -void benchmark_single(const ck_tile::ArgParser& arg_parser) -{ - // Use DataTypeTraits to get the actual type names from the generated header - // The generated header defines ADataType, BDataType, AccDataType, CDataType - std::string dtype_a = DataTypeTraits::name; - std::string dtype_b = DataTypeTraits::name; - std::string dtype_acc = DataTypeTraits::name; - std::string dtype_c = DataTypeTraits::name; - - // Layout names from the layout types - std::string layout_a = ALayout::name; - std::string layout_b = BLayout::name; - std::string layout_c = CLayout::name; - - // Create GemmProblem struct - GemmProblem gemm_problem{arg_parser.get_int("split_k"), - arg_parser.get_int("m"), - arg_parser.get_int("n"), - arg_parser.get_int("k"), - arg_parser.get_int("stride_a"), - arg_parser.get_int("stride_b"), - arg_parser.get_int("stride_c"), - dtype_a, - dtype_b, - dtype_acc, - dtype_c, - layout_a, - layout_b, - layout_c, - arg_parser.get_bool("structured_sparsity")}; - - // Create Setting struct - Setting setting{arg_parser.get_int("warmup"), - arg_parser.get_int("repeat"), - arg_parser.get_bool("timer"), - arg_parser.get_int("verify"), - arg_parser.get_int("init"), - arg_parser.get_bool("log"), - arg_parser.get_str("csv_filename"), - arg_parser.get_bool("flush_cache"), - arg_parser.get_int("rotating_count"), - arg_parser.get_bool("json_output")}; - - // Get the profiler instance - auto& profiler = GemmProfiler::instance(setting); - - try - { - // Create a lambda that wraps the kernel launch - auto kernel_func = [](const ck_tile::GemmHostArgs& args, - const ck_tile::stream_config& stream) { - return SelectedKernel::launch(args, stream); - }; - - // Benchmark the kernel - profiler.benchmark(gemm_problem, kernel_func); - - // Select best instance based on metric - profiler.select_best_instance(static_cast(arg_parser.get_int("metric"))); - } - catch(const std::exception& e) - { - std::cerr << "Benchmark failed: " << e.what() << std::endl; - } -} - -int main(int argc, char* argv[]) -{ - try - { - auto [result, parser] = create_args(argc, argv); - if(!result) - return EXIT_FAILURE; - - benchmark_single(parser); - return 0; - } - catch(const std::exception& e) - { - std::cerr << "Error: " << e.what() << "\n"; - return EXIT_FAILURE; - } -} diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_common.hpp b/tile_engine/ops/gemm/gemm_universal/gemm_common.hpp deleted file mode 100644 index 899221547f6..00000000000 --- a/tile_engine/ops/gemm/gemm_universal/gemm_common.hpp +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
-// SPDX-License-Identifier: MIT - -#pragma once - -#include -#include "ck_tile/core.hpp" -#include "ck_tile/host.hpp" -#include "ck_tile/core/numeric/integer.hpp" -#include "ck_tile/core/numeric/pk_int4.hpp" - -//[TODO] This can be moved to commons -// DataTypeTraits for all supported types -template -struct DataTypeTraits; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp32"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp64"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp16"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "bf16"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp8"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "bf8"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "int8"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "int32"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "pk_int4_t"; -}; - -// Helper function to determine if a layout is row-major -template -constexpr auto is_row_major(Layout) -{ - return ck_tile::bool_constant>{}; -} - -// Structure to hold kernel traits for dispatcher -struct KernelTraits -{ - std::string pipeline; // compv3, compv4, mem - std::string scheduler; // intrawave, interwave - std::string epilogue; // cshuffle, default - bool pad_m; - bool pad_n; - bool pad_k; - bool persistent; - - // Constructor with defaults - KernelTraits() - : pipeline("compv3"), - scheduler("intrawave"), - epilogue("cshuffle"), - pad_m(false), - pad_n(false), - pad_k(false), - persistent(false) - { - } -}; diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_profiler.hpp b/tile_engine/ops/gemm/gemm_universal/gemm_profiler.hpp deleted file mode 100644 index 3c6bbc34d3d..00000000000 --- a/tile_engine/ops/gemm/gemm_universal/gemm_profiler.hpp +++ /dev/null @@ -1,289 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
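The DataTypeTraits block removed just above from the universal gemm_common.hpp (and earlier from the preshuffle common header) is the usual undefined-primary-template name map, which this patch replaces with ck_tile::DataTypeTraits from the shared gemm/gemm_common.hpp include. A reduced, standalone sketch of the pattern with stand-in element types:

    #include <cstdio>

    struct half_t {};  // stand-ins for the real ck_tile element types
    struct bf16_t {};

    template <typename T>
    struct DataTypeTraits;  // undefined primary template: unsupported types fail to compile

    template <>
    struct DataTypeTraits<half_t> { static constexpr const char* name = "fp16"; };
    template <>
    struct DataTypeTraits<bf16_t> { static constexpr const char* name = "bf16"; };

    int main()
    {
        // A generated kernel header typedefs ADataType/BDataType/AccDataType/CDataType to
        // concrete element types; the benchmark then stringifies them like this.
        using ADataType = half_t;
        std::printf("dtype_a = %s\n", DataTypeTraits<ADataType>::name);
        return 0;
    }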
-// SPDX-License-Identifier: MIT - -#pragma once - -#include -#include -#include - -#include "ck_tile/host/device_prop.hpp" -#include "ck_tile/ops/gemm.hpp" -#include "gemm_benchmark.hpp" - -class GemmProfiler -{ - public: - static GemmProfiler& instance(Setting setting) - { - static GemmProfiler instance{setting}; - return instance; - } - - // Overload for single kernel benchmarking - void benchmark(GemmProblem& gemm_problem, - std::function - kernel_func) - { - // Create a vector with a single callable that returns both name and time - std::vector(ck_tile::GemmHostArgs&, - const ck_tile::stream_config&)>> - callables; - - callables.push_back( - [kernel_func](ck_tile::GemmHostArgs& args, const ck_tile::stream_config& stream) { - float time = kernel_func(args, stream); - return std::make_tuple(std::string(KERNEL_NAME), time); - }); - - benchmark(gemm_problem, callables); - } - - void benchmark(GemmProblem& gemm_problem, - std::vector( - ck_tile::GemmHostArgs&, const ck_tile::stream_config&)>>& callables) - { - const ALayout layout_a = ALayout{}; - const BLayout layout_b = BLayout{}; - const CLayout layout_c = CLayout{}; - - gemm_problem.stride_a_ = ck_tile::get_default_stride( - gemm_problem.m_, gemm_problem.k_, gemm_problem.stride_a_, is_row_major(layout_a)); - gemm_problem.stride_b_ = ck_tile::get_default_stride( - gemm_problem.k_, gemm_problem.n_, gemm_problem.stride_b_, is_row_major(layout_b)); - gemm_problem.stride_c_ = ck_tile::get_default_stride( - gemm_problem.m_, gemm_problem.n_, gemm_problem.stride_c_, is_row_major(layout_c)); - - ck_tile::HostTensor a_m_k(ck_tile::host_tensor_descriptor( - gemm_problem.m_, gemm_problem.k_, gemm_problem.stride_a_, is_row_major(layout_a))); - ck_tile::HostTensor b_k_n(ck_tile::host_tensor_descriptor( - gemm_problem.k_, gemm_problem.n_, gemm_problem.stride_b_, is_row_major(layout_b))); - ck_tile::HostTensor c_m_n_dev_result(ck_tile::host_tensor_descriptor( - gemm_problem.m_, gemm_problem.n_, gemm_problem.stride_c_, is_row_major(layout_c))); - - if(setting_.init_method_ == 0) - { - ck_tile::FillUniformDistribution{-1.f, 1.f}(a_m_k); - ck_tile::FillUniformDistribution{-1.f, 1.f}(b_k_n); - } - else if(setting_.init_method_ == 1) - { - ck_tile::FillMonotonicSeq{}(a_m_k); - ck_tile::FillMonotonicSeq{}(b_k_n); - } - else if(setting_.init_method_ == 2) - { - ck_tile::FillConstant{static_cast(1)}(a_m_k); - ck_tile::FillConstant{static_cast(1)}(b_k_n); - } - else - { - a_m_k.SetZero(); - b_k_n.SetZero(); - } - - if(gemm_problem.structured_sparsity_) - { - ck_tile::AdjustToStructuredSparsity{}(a_m_k); - } - - ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes()); - ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes()); - ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes()); - - if constexpr(std::is_same_v) - { - // Permute vector pk_i4x4 data for device implementation - ck_tile::HostTensor b_k_n_dev = b_k_n; - // permute_tensor_b(b_k_n_dev); - ck_tile::permute_vectors_i4x4_b(b_k_n_dev); - b_k_n_dev_buf.ToDevice(b_k_n_dev.data()); - } - else - { - b_k_n_dev_buf.ToDevice(b_k_n.data()); - } - - a_m_k_dev_buf.ToDevice(a_m_k.data()); - c_m_n_dev_buf.SetZero(); - c_m_n_dev_result.SetZero(); - - ck_tile::GemmHostArgs gemm_args = { - a_m_k_dev_buf.GetDeviceBuffer(), - b_k_n_dev_buf.GetDeviceBuffer(), - c_m_n_dev_buf.GetDeviceBuffer(), - gemm_problem.split_k_, - gemm_problem.m_, - gemm_problem.n_, - gemm_problem.k_, - gemm_problem.stride_a_, - gemm_problem.stride_b_, - gemm_problem.stride_c_, - 
}; - - ck_tile::HostTensor c_m_n_host_result(ck_tile::host_tensor_descriptor( - gemm_problem.m_, gemm_problem.n_, gemm_problem.stride_c_, is_row_major(layout_c))); - - if(setting_.verify_) - { - gemm_host_reference(setting_.verify_, - a_m_k, - b_k_n, - c_m_n_host_result, - a_m_k_dev_buf, - b_k_n_dev_buf, - gemm_problem.m_, - gemm_problem.n_, - gemm_problem.k_, - gemm_problem.stride_a_, - gemm_problem.stride_b_, - gemm_problem.stride_c_); - } - - for(auto& callable : callables) - { - auto kernel_run_result = callable(gemm_args, - ck_tile::stream_config{nullptr, - true, - setting_.log_, - setting_.n_warmup_, - setting_.n_repeat_, - setting_.is_gpu_timer_, - setting_.flush_cache_, - setting_.rotating_count_}); - process_result(gemm_problem, - c_m_n_dev_buf, - c_m_n_host_result, - c_m_n_dev_result, - kernel_run_result); - } - } - - void process_result(const GemmProblem& gemm_problem, - ck_tile::DeviceMem& c_m_n_dev_buf, - ck_tile::HostTensor& c_m_n_host_result, - ck_tile::HostTensor& c_m_n_dev_result, - const std::tuple& kernel_run_result) - { - auto [name, avg_time] = kernel_run_result; - - KernelInstance kernel_instance{name, gemm_problem, {-1.0f, -1.0f, -1.0f}}; - - // compute performance metric - std::size_t flop = std::size_t(2) * gemm_problem.m_ * gemm_problem.n_ * gemm_problem.k_; - std::size_t num_byte = sizeof(ADataType) * gemm_problem.m_ * gemm_problem.k_ + - sizeof(BDataType) * gemm_problem.n_ * gemm_problem.k_ + - sizeof(CDataType) * gemm_problem.m_ * gemm_problem.n_; - - // update - kernel_instance.perf_result_.latency_ = avg_time; - kernel_instance.perf_result_.tflops_ = static_cast(flop) / 1.E9 / avg_time; - kernel_instance.perf_result_.bandwidth_ = num_byte / 1.E6 / avg_time; - - if(setting_.log_ > 0 && !setting_.json_output_) - { - std::cout << kernel_instance << std::endl; - } - - // verify result - c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); - bool verified_correct = - !setting_.verify_ || - compare( - name, gemm_problem.k_, gemm_problem.split_k_, c_m_n_dev_result, c_m_n_host_result); - - if(verified_correct) - { - kernel_instances_.emplace_back(kernel_instance); - } - else - { - std::cout << "Verification failed, skip kernel: " << name << std::endl; - } - - // clear tensor - c_m_n_dev_buf.SetZero(); - c_m_n_dev_result.SetZero(); - } - - KernelInstance select_best_instance(Metric metric) - { - if(kernel_instances_.empty()) - throw std::runtime_error("Empty instances"); - - auto kernel_instance = *std::max_element(kernel_instances_.begin(), - kernel_instances_.end(), - [metric](const auto& a, const auto& b) { - return PerformanceResult::compare( - b.perf_result_, a.perf_result_, metric); - }); - - if(setting_.json_output_) - { - // Output clean JSON only - std::cout << kernel_instance << std::endl; - } - else - { - std::cout << "**********************************" << std::endl; - std::cout << "According to given metrics: " << get_metric_name(metric) << "\n" - << "Current kernel performance is: " << kernel_instance << std::endl; - std::cout << "**********************************" << std::endl; - } - - if(!setting_.csv_filename_.empty()) - { - std::ofstream file(setting_.csv_filename_ + ".csv", std::ios::app); - - if(!file.is_open()) - { - std::cerr << "Warning: Failed to open CSV file for writing." 
<< std::endl; - } - else - { - if(file.tellp() == 0) - { - file << "rocm_version,device_name," - << "split_k,m,n,k,stride_a,stride_b,stride_c," - << "dtype_a,dtype_b,dtype_acc,dtype_c," << "layout_a,layout_b,layout_c," - << "structured_sparsity," << "name," - << "latency(ms),tflops(TFlops),bandwidth(GB/s),metric\n"; - } - - const auto& problem = kernel_instance.problem_; - const auto& name = kernel_instance.name_; - const auto& perf = kernel_instance.perf_result_; - - file << get_rocm_version() << "," << ck_tile::get_device_name() << "," - << problem.split_k_ << "," << problem.m_ << "," << problem.n_ << "," - << problem.k_ << "," << problem.stride_a_ << "," << problem.stride_b_ << "," - << problem.stride_c_ << "," << problem.dtype_a_ << "," << problem.dtype_b_ - << "," << problem.dtype_acc_ << "," << problem.dtype_c_ << "," - << problem.layout_a_ << "," << problem.layout_b_ << "," << problem.layout_c_ - << "," << problem.structured_sparsity_ << "," << name << "," << std::fixed - << std::setprecision(4) << perf.latency_ << "," << std::fixed - << std::setprecision(4) << perf.tflops_ << "," << std::fixed - << std::setprecision(4) << perf.bandwidth_ << "," << get_metric_name(metric) - << "\n"; - - if(!file) - { - std::cerr << "Warning: Error occurred while writing to CSV file." << std::endl; - } - } - } - - return kernel_instance; - } - - GemmProfiler(const GemmProfiler&) = delete; - GemmProfiler& operator=(const GemmProfiler&) = delete; - - private: - ~GemmProfiler() { kernel_instances_.clear(); } - GemmProfiler(Setting setting) : setting_(setting) {} - - Setting setting_; - - std::vector kernel_instances_; -}; diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.hpp b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.hpp new file mode 100644 index 00000000000..4cf272a3c82 --- /dev/null +++ b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.hpp @@ -0,0 +1,66 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
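The select_best_instance step shown above reduces all verified kernel instances to a single winner for the requested metric; note the reversed argument order in the comparator, which turns a "better-than" comparison into the strict "less-than" predicate std::max_element expects. A minimal standalone sketch of that selection step, with PerfResult and Metric as simplified stand-ins for the project's types:

#include <algorithm>
#include <cstdio>
#include <vector>

enum class Metric { Latency, Tflops };

struct PerfResult
{
    const char* name;
    float latency_ms;
    float tflops;
};

// Returns true when `a` is better than `b` under the chosen metric
// (lower latency wins, higher throughput wins).
bool better(const PerfResult& a, const PerfResult& b, Metric metric)
{
    return metric == Metric::Latency ? a.latency_ms < b.latency_ms : a.tflops > b.tflops;
}

int main()
{
    std::vector<PerfResult> results = {{"kernel_a", 0.42f, 95.0f}, {"kernel_b", 0.37f, 88.0f}};

    // Passing better(b, a, ...) as the "less-than" predicate mirrors the reversed
    // argument order used with PerformanceResult::compare above, so max_element
    // yields the best instance for the metric.
    auto best = *std::max_element(results.begin(), results.end(), [](const auto& a, const auto& b) {
        return better(b, a, Metric::Tflops);
    });

    std::printf("best by tflops: %s (%.1f TFLOPS)\n", best.name, best.tflops);
    return 0;
}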
+// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "gemm/gemm_benchmark.hpp" + +// Data types and Layouts are defined by the generated kernel headers +// No hardcoded type definitions here to avoid conflicts + +/// @brief Function to get the kernel output with reference implementation on CPU/GPU +void gemm_host_reference(int verify, + ck_tile::HostTensor& a_m_k, + ck_tile::HostTensor& b_k_n, + ck_tile::HostTensor& c_m_n_host_result, + ck_tile::DeviceMem& a_m_k_dev_buf, + ck_tile::DeviceMem& b_k_n_dev_buf, + ck_tile::index_t M, + ck_tile::index_t N, + ck_tile::index_t K, + ck_tile::index_t stride_A, + ck_tile::index_t stride_B, + ck_tile::index_t stride_C) +{ + if(verify == 1) + { + c_m_n_host_result.SetZero(); + + ck_tile::reference_gemm( + a_m_k, b_k_n, c_m_n_host_result); + } + else if(verify == 2) + { + if constexpr(std::is_same_v) + { + // Restore input for B for gpu reference + b_k_n_dev_buf.ToDevice(b_k_n.data()); + } + + ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_host_result.get_element_space_size_in_bytes()); + c_m_n_host_result.SetZero(); + c_m_n_gpu_buf_ref.SetZero(); + + ADataType* d_A = static_cast(a_m_k_dev_buf.GetDeviceBuffer()); + BDataType* d_B = static_cast(b_k_n_dev_buf.GetDeviceBuffer()); + CDataType* d_C = static_cast(c_m_n_gpu_buf_ref.GetDeviceBuffer()); + + ck_tile::reference_gemm_gpu(d_A, d_B, d_C, M, N, K, stride_A, stride_B, stride_C); + + c_m_n_gpu_buf_ref.FromDevice(c_m_n_host_result.data()); + } +} diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py new file mode 100755 index 00000000000..4fa5d5dee0e --- /dev/null +++ b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
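Conceptually, the verify==1 path in gemm_host_reference above checks the device result against a plain host GEMM. The sketch below is that naive definition only (row-major A, B and C, contiguous, float accumulation), as an illustration of what is being compared; it is not the ck_tile::reference_gemm implementation and ignores strides and mixed layouts:

#include <cstdio>
#include <vector>

// C[m][n] = sum_k A[m][k] * B[k][n]
void naive_gemm(const std::vector<float>& A, const std::vector<float>& B, std::vector<float>& C,
                int M, int N, int K)
{
    for(int m = 0; m < M; ++m)
        for(int n = 0; n < N; ++n)
        {
            float acc = 0.f; // accumulate in the AccDataType (float here)
            for(int k = 0; k < K; ++k)
                acc += A[m * K + k] * B[k * N + n];
            C[m * N + n] = acc;
        }
}

int main()
{
    const int M = 2, N = 2, K = 3;
    std::vector<float> A = {1, 2, 3, 4, 5, 6}; // 2x3
    std::vector<float> B = {1, 0, 0, 1, 1, 1}; // 3x2
    std::vector<float> C(M * N, 0.f);
    naive_gemm(A, B, C, M, N, K);
    std::printf("C = [%g %g; %g %g]\n", C[0], C[1], C[2], C[3]); // [4 5; 10 11]
    return 0;
}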
+# SPDX-License-Identifier: MIT + +import os +import sys +import json +import subprocess +import argparse +import csv +import time +import importlib.util +from pathlib import Path +from typing import List, Dict, Tuple, Optional + +def _import_gemm_benchmark(): + """Import validation utilities from commons directory.""" + current_dir = os.path.dirname(os.path.abspath(__file__)) + parent_dir = os.path.dirname(current_dir) + + # Load the module dynamically + spec = importlib.util.spec_from_file_location( + "gemm_benchmark", + os.path.join(parent_dir, "gemm_benchmark.py"), + ) + gemm_benchmark_module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(gemm_benchmark_module) + + return gemm_benchmark_module.GemmBenchmark + +def _import_benchmark_utils(): + """Import benchmark utilities from commons directory.""" + current_dir = os.path.dirname(os.path.abspath(__file__)) + parent_dir = os.path.dirname(os.path.dirname(current_dir)) + + # Load the module dynamically + spec = importlib.util.spec_from_file_location( + "benchmark_utils", + os.path.join(parent_dir, "common", "benchmark_utils.py"), + ) + benchmark_utils = importlib.util.module_from_spec(spec) + spec.loader.exec_module(benchmark_utils) + + return benchmark_utils + +GemmBenchmark = _import_gemm_benchmark() +benchmark_utils = _import_benchmark_utils() + +class GemmUniversalBenchmark(GemmBenchmark): + def __init__(self, build_dir: str, verbose: bool = False): + super().__init__(build_dir, verbose, name="benchmark_gemm_") + + +def main(): + parser = argparse.ArgumentParser(description="GEMM Kernel Benchmarking Tool") + parser.add_argument( + "build_dir", help="Build directory containing kernel executables" + ) + parser.add_argument( + "--problem-sizes", + nargs="+", + default=["1024,1024,1024", "2048,2048,2048", "4096,4096,4096"], + help="Problem sizes as M,N,K tuples", + ) + parser.add_argument( + "--split-k", nargs="+", type=int, default=[1], help="Split-K values to test" + ) + parser.add_argument("--verify", action="store_true", help="Enable verification") + parser.add_argument( + "--csv", default="gemm_benchmark_results.csv", help="CSV output filename" + ) + parser.add_argument( + "--best", default="best_kernels.txt", help="Best kernels output filename" + ) + parser.add_argument("--verbose", action="store_true", help="Verbose output") + parser.add_argument( + "--warmup", + type=int, + default=50, + help="Number of warmup iterations (default: 50)", + ) + parser.add_argument( + "--repeat", + type=int, + default=100, + help="Number of benchmark iterations (default: 100)", + ) + parser.add_argument( + "--flush-cache", + action="store_true", + default=True, + help="Enable cache flushing (default: True)", + ) + parser.add_argument( + "--rotating-count", + type=int, + default=1000, + help="Number of iterations to rotate cache (default: 1000)", + ) + parser.add_argument("--json", help="JSON output filename (optional)") + + args = parser.parse_args() + + # Parse problem sizes + problem_sizes = [] + for size_str in args.problem_sizes: + try: + m, n, k = map(int, size_str.split(",")) + problem_sizes.append((m, n, k)) + except ValueError: + print(f"Invalid problem size: {size_str}") + return 1 + + # Create benchmark instance + benchmark = GemmUniversalBenchmark(args.build_dir, verbose=args.verbose) + + # Run benchmark sweep + print("Starting GEMM kernel benchmark sweep...") + start_time = time.time() + + best_kernels = benchmark.benchmark_sweep( + problem_sizes=problem_sizes, + split_k_values=args.split_k, + verify=args.verify, + 
warmup=args.warmup, + repeat=args.repeat, + flush_cache=args.flush_cache, + rotating_count=args.rotating_count, + ) + + elapsed_time = time.time() - start_time + print(f"\nBenchmark completed in {elapsed_time:.2f} seconds") + + # Export results + benchmark_utils.export_csv(benchmark.results, args.csv) + benchmark_utils.export_best_kernels(best_kernels, args.best) + + # Export JSON if requested + if args.json: + benchmark_utils.export_json(benchmark.results, args.json, best_kernels) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark_single.cpp b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark_single.cpp new file mode 100644 index 00000000000..b2015f8571d --- /dev/null +++ b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark_single.cpp @@ -0,0 +1,102 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include +#include +#include +#include +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "gemm/gemm_common.hpp" +#include "gemm_universal_profiler.hpp" + +// The kernel header is included via the compile command line with -include flag +// It defines SelectedKernel struct and KERNEL_NAME + +void benchmark_single(const ck_tile::ArgParser& arg_parser) +{ + // Use DataTypeTraits to get the actual type names from the generated header + // The generated header defines ADataType, BDataType, AccDataType, CDataType + std::string dtype_a = ck_tile::DataTypeTraits::name; + std::string dtype_b = ck_tile::DataTypeTraits::name; + std::string dtype_acc = ck_tile::DataTypeTraits::name; + std::string dtype_c = ck_tile::DataTypeTraits::name; + + // Layout names from the layout types + std::string layout_a = ALayout::name; + std::string layout_b = BLayout::name; + std::string layout_c = CLayout::name; + + // Create GemmProblem struct + GemmProblem gemm_problem{arg_parser.get_int("split_k"), + arg_parser.get_int("m"), + arg_parser.get_int("n"), + arg_parser.get_int("k"), + arg_parser.get_int("stride_a"), + arg_parser.get_int("stride_b"), + arg_parser.get_int("stride_c"), + dtype_a, + dtype_b, + dtype_acc, + dtype_c, + layout_a, + layout_b, + layout_c, + arg_parser.get_bool("structured_sparsity")}; + + // Create Setting struct + Setting setting{arg_parser.get_int("warmup"), + arg_parser.get_int("repeat"), + arg_parser.get_bool("timer"), + arg_parser.get_int("verify"), + arg_parser.get_int("init"), + arg_parser.get_bool("log"), + arg_parser.get_str("csv_filename"), + arg_parser.get_bool("flush_cache"), + arg_parser.get_int("rotating_count"), + arg_parser.get_bool("json_output")}; + + // Get the profiler instance + auto& profiler = UniversalGemmProfiler::GemmProfiler::instance(setting); + + try + { + // Create a lambda that wraps the kernel launch + auto kernel_func = [](const ck_tile::GemmHostArgs& args, + const ck_tile::stream_config& stream) { + return SelectedKernel::launch(args, stream); + }; + + // Benchmark the kernel + profiler.benchmark(gemm_problem, kernel_func); + + // Select best instance based on metric + profiler.select_best_instance(static_cast(arg_parser.get_int("metric"))); + } + catch(const std::exception& e) + { + std::cerr << "Benchmark failed: " << e.what() << std::endl; + } +} + +int main(int argc, char* argv[]) +{ + try + { + auto [result, parser] = create_args(argc, argv); + if(!result) + return EXIT_FAILURE; + + benchmark_single(parser); + return 0; + } + catch(const 
std::exception& e) + { + std::cerr << "Error: " << e.what() << "\n"; + return EXIT_FAILURE; + } +} diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_universal_profiler.hpp b/tile_engine/ops/gemm/gemm_universal/gemm_universal_profiler.hpp new file mode 100644 index 00000000000..6cfdcab8009 --- /dev/null +++ b/tile_engine/ops/gemm/gemm_universal/gemm_universal_profiler.hpp @@ -0,0 +1,147 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include + +#include "ck_tile/host/device_prop.hpp" +#include "ck_tile/ops/gemm.hpp" +#include "gemm/gemm_benchmark.hpp" +#include "gemm/gemm_profiler.hpp" +#include "gemm_universal_benchmark.hpp" + +class UniversalGemmProfiler + : public GemmProfiler +{ + public: + using BaseGemm = GemmProfiler; + using BaseGemm::benchmark; + + UniversalGemmProfiler(Setting setting) + : GemmProfiler(setting) + { + } + + void benchmark(GemmProblem& gemm_problem, + std::vector( + ck_tile::GemmHostArgs&, const ck_tile::stream_config&)>>& callables) override + { + const ALayout layout_a = ALayout{}; + const BLayout layout_b = BLayout{}; + const CLayout layout_c = CLayout{}; + + gemm_problem.stride_a_ = ck_tile::get_default_stride( + gemm_problem.m_, gemm_problem.k_, gemm_problem.stride_a_, is_row_major(layout_a)); + gemm_problem.stride_b_ = ck_tile::get_default_stride( + gemm_problem.k_, gemm_problem.n_, gemm_problem.stride_b_, is_row_major(layout_b)); + gemm_problem.stride_c_ = ck_tile::get_default_stride( + gemm_problem.m_, gemm_problem.n_, gemm_problem.stride_c_, is_row_major(layout_c)); + + ck_tile::HostTensor a_m_k(ck_tile::host_tensor_descriptor( + gemm_problem.m_, gemm_problem.k_, gemm_problem.stride_a_, is_row_major(layout_a))); + ck_tile::HostTensor b_k_n(ck_tile::host_tensor_descriptor( + gemm_problem.k_, gemm_problem.n_, gemm_problem.stride_b_, is_row_major(layout_b))); + ck_tile::HostTensor c_m_n_dev_result(ck_tile::host_tensor_descriptor( + gemm_problem.m_, gemm_problem.n_, gemm_problem.stride_c_, is_row_major(layout_c))); + + if(setting_.init_method_ == 0) + { + ck_tile::FillUniformDistribution{-1.f, 1.f}(a_m_k); + ck_tile::FillUniformDistribution{-1.f, 1.f}(b_k_n); + } + else if(setting_.init_method_ == 1) + { + ck_tile::FillMonotonicSeq{}(a_m_k); + ck_tile::FillMonotonicSeq{}(b_k_n); + } + else if(setting_.init_method_ == 2) + { + ck_tile::FillConstant{static_cast(1)}(a_m_k); + ck_tile::FillConstant{static_cast(1)}(b_k_n); + } + else + { + a_m_k.SetZero(); + b_k_n.SetZero(); + } + + if(gemm_problem.structured_sparsity_) + { + ck_tile::AdjustToStructuredSparsity{}(a_m_k); + } + + ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes()); + ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes()); + ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes()); + + if constexpr(std::is_same_v) + { + // Permute vector pk_i4x4 data for device implementation + ck_tile::HostTensor b_k_n_dev = b_k_n; + // permute_tensor_b(b_k_n_dev); + ck_tile::permute_vectors_i4x4_b(b_k_n_dev); + b_k_n_dev_buf.ToDevice(b_k_n_dev.data()); + } + else + { + b_k_n_dev_buf.ToDevice(b_k_n.data()); + } + + a_m_k_dev_buf.ToDevice(a_m_k.data()); + c_m_n_dev_buf.SetZero(); + c_m_n_dev_result.SetZero(); + + ck_tile::GemmHostArgs gemm_args = { + a_m_k_dev_buf.GetDeviceBuffer(), + b_k_n_dev_buf.GetDeviceBuffer(), + c_m_n_dev_buf.GetDeviceBuffer(), + gemm_problem.split_k_, + gemm_problem.m_, + gemm_problem.n_, + gemm_problem.k_, + 
gemm_problem.stride_a_,
+            gemm_problem.stride_b_,
+            gemm_problem.stride_c_,
+        };
+
+        ck_tile::HostTensor<CDataType> c_m_n_host_result(ck_tile::host_tensor_descriptor(
+            gemm_problem.m_, gemm_problem.n_, gemm_problem.stride_c_, is_row_major(layout_c)));
+
+        if(setting_.verify_)
+        {
+            gemm_host_reference(setting_.verify_,
+                                a_m_k,
+                                b_k_n,
+                                c_m_n_host_result,
+                                a_m_k_dev_buf,
+                                b_k_n_dev_buf,
+                                gemm_problem.m_,
+                                gemm_problem.n_,
+                                gemm_problem.k_,
+                                gemm_problem.stride_a_,
+                                gemm_problem.stride_b_,
+                                gemm_problem.stride_c_);
+        }
+
+        for(auto& callable : callables)
+        {
+            auto kernel_run_result = callable(gemm_args,
+                                              ck_tile::stream_config{nullptr,
+                                                                     true,
+                                                                     setting_.log_,
+                                                                     setting_.n_warmup_,
+                                                                     setting_.n_repeat_,
+                                                                     setting_.is_gpu_timer_,
+                                                                     setting_.flush_cache_,
+                                                                     setting_.rotating_count_});
+            process_result(gemm_problem,
+                           c_m_n_dev_buf,
+                           c_m_n_host_result,
+                           c_m_n_dev_result,
+                           kernel_run_result);
+        }
+    }
+};
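The derived profiler above overrides only the vector-of-callables benchmark() and re-exposes the base class's single-kernel convenience overload with `using BaseGemm::benchmark;`, since an override in the derived class would otherwise hide every base overload. A minimal standalone sketch of that name-hiding issue and the using-declaration fix, with simplified stand-in types rather than the project's GemmProfiler:

#include <cstdio>
#include <functional>
#include <utility>
#include <vector>

struct Problem { int m, n, k; };
using Callable = std::function<float(const Problem&)>;

class BaseProfiler
{
    public:
    virtual ~BaseProfiler() = default;

    // Convenience overload: wraps a single kernel launcher into the vector form.
    void benchmark(const Problem& p, Callable kernel)
    {
        std::vector<Callable> callables{std::move(kernel)};
        benchmark(p, callables); // dispatches to the derived override
    }

    virtual void benchmark(const Problem& p, std::vector<Callable>& callables) = 0;
};

class UniversalProfiler : public BaseProfiler
{
    public:
    using BaseProfiler::benchmark; // keep the single-kernel overload visible

    void benchmark(const Problem& p, std::vector<Callable>& callables) override
    {
        for(auto& c : callables)
            std::printf("m=%d n=%d k=%d -> %.3f ms\n", p.m, p.n, p.k, c(p));
    }
};

int main()
{
    UniversalProfiler profiler;
    // Without the using-declaration this call would not compile: the derived
    // override hides the base-class benchmark() overloads.
    profiler.benchmark(Problem{1024, 1024, 1024}, [](const Problem&) { return 0.42f; });
    return 0;
}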