@@ -2325,9 +2325,46 @@ static void theoretic_tops_cuda( //
23252325 state.counters [" TOP" ] = benchmark::Counter (tops_per_gpu * state.iterations (), benchmark::Counter::kIsRate );
23262326}
23272327
2328+ extern __global__ void tops_f32f32_sm60fma_16x16x16_loop128_cuda_kernel ();
2329+ extern __global__ void tops_f64f64_sm60fma_16x16x16_loop128_cuda_kernel ();
2330+ extern __global__ void tops_i32i32_sm60fma_16x16x16_loop128_cuda_kernel ();
2331+ extern __global__ void tops_i64i64_sm60fma_16x16x16_loop128_cuda_kernel ();
2332+ extern __global__ void tops_u8u32_sm60fma_16x16x64_loop128_cuda_kernel ();
2333+ extern __global__ void tops_u24u32_sm60fma_16x16x16_loop128_cuda_kernel ();
2334+
2335+ BENCHMARK_CAPTURE ( //
2336+ theoretic_tops_cuda, f32f32_sm60fma, tops_f32f32_sm60fma_16x16x16_loop128_cuda_kernel, //
2337+ 16 , 16 , 16 , 60 , 128 , tensor_core_scale_t ::single_k)
2338+ ->MinTime(10 );
2339+ BENCHMARK_CAPTURE ( //
2340+ theoretic_tops_cuda, f64f64_sm60fma, tops_f64f64_sm60fma_16x16x16_loop128_cuda_kernel, //
2341+ 16 , 16 , 16 , 60 , 128 , tensor_core_scale_t ::single_k)
2342+ ->MinTime(10 );
2343+ BENCHMARK_CAPTURE ( //
2344+ theoretic_tops_cuda, i32i32_sm60fma, tops_i32i32_sm60fma_16x16x16_loop128_cuda_kernel, //
2345+ 16 , 16 , 16 , 60 , 128 , tensor_core_scale_t ::single_k)
2346+ ->MinTime(10 );
2347+ BENCHMARK_CAPTURE ( //
2348+ theoretic_tops_cuda, i64i64_sm60fma, tops_i64i64_sm60fma_16x16x16_loop128_cuda_kernel, //
2349+ 16 , 16 , 16 , 60 , 128 , tensor_core_scale_t ::single_k)
2350+ ->MinTime(10 );
2351+ BENCHMARK_CAPTURE ( //
2352+ theoretic_tops_cuda, u8u32_sm60fma, tops_u8u32_sm60fma_16x16x64_loop128_cuda_kernel, //
2353+ 16 , 16 , 64 , 60 , 128 , tensor_core_scale_t ::single_k)
2354+ ->MinTime(10 );
2355+ BENCHMARK_CAPTURE ( //
2356+ theoretic_tops_cuda, u24u32_sm60fma, tops_u24u32_sm60fma_16x16x16_loop128_cuda_kernel, //
2357+ 16 , 16 , 16 , 60 , 128 , tensor_core_scale_t ::single_k)
2358+ ->MinTime(10 );
2359+
2360+ extern __global__ void tops_f16f16_sm70fma_16x16x16_loop128_cuda_kernel ();
23282361extern __global__ void tops_f16f16_sm70wmma_16x16x16_loop128_cuda_kernel ();
23292362extern __global__ void tops_f16f32_sm70wmma_16x16x16_loop128_cuda_kernel ();
23302363
2364+ BENCHMARK_CAPTURE ( //
2365+ theoretic_tops_cuda, f16f16_sm60fma, tops_f16f16_sm70fma_16x16x16_loop128_cuda_kernel, //
2366+ 16 , 16 , 16 , 70 , 128 , tensor_core_scale_t ::single_k)
2367+ ->MinTime(10 );
23312368BENCHMARK_CAPTURE ( //
23322369 theoretic_tops_cuda, f16f16_sm70wmma, tops_f16f16_sm70wmma_16x16x16_loop128_cuda_kernel, //
23332370 16 , 16 , 16 , 70 , 128 , tensor_core_scale_t ::warp_k)
@@ -2354,11 +2391,16 @@ BENCHMARK_CAPTURE(
23542391 8 , 8 , 128 , 75 , 128 , tensor_core_scale_t ::warp_k)
23552392 ->MinTime(10 );
23562393
2394+ extern __global__ void tops_bf16bf16_sm80fma_16x16x16_loop128_cuda_kernel ();
23572395extern __global__ void tops_bf16f32_sm80wmma_16x16x16_loop128_cuda_kernel ();
23582396extern __global__ void tops_tf32f32_sm80wmma_16x16x8_loop128_cuda_kernel ();
23592397extern __global__ void tops_f64f64_sm80wmma_8x8x4_loop128_cuda_kernel ();
23602398extern __global__ void tops_b1i32and_sm80wmma_8x8x128_loop128_cuda_kernel ();
23612399
2400+ BENCHMARK_CAPTURE ( //
2401+ theoretic_tops_cuda, bf16bf16_sm60fma, tops_bf16bf16_sm80fma_16x16x16_loop128_cuda_kernel, //
2402+ 16 , 16 , 16 , 75 , 128 , tensor_core_scale_t ::single_k)
2403+ ->MinTime(10 );
23622404BENCHMARK_CAPTURE ( //
23632405 theoretic_tops_cuda, bf16f32_sm80wmma, tops_bf16f32_sm80wmma_16x16x16_loop128_cuda_kernel, //
23642406 16 , 16 , 16 , 80 , 128 , tensor_core_scale_t ::warp_k)
@@ -2393,6 +2435,23 @@ BENCHMARK_CAPTURE(
23932435 64 , 256 , 8 , 90 , 128 , tensor_core_scale_t ::warpgroup_k)
23942436 ->MinTime(10 );
23952437
2438+ extern __global__ void tops_u16u32_sm90dpx_16x16x32_loop128_floyd_warshall_cuda_kernel ();
2439+ extern __global__ void tops_i16i32_sm90dpx_16x16x32_loop128_needleman_wunsch_cuda_kernel ();
2440+ extern __global__ void tops_i32i32_sm90dpx_16x16x16_loop128_smith_waterman_cuda_kernel ();
2441+
2442+ BENCHMARK_CAPTURE ( //
2443+ theoretic_tops_cuda, u16u32_sm90dpx, tops_u16u32_sm90dpx_16x16x32_loop128_floyd_warshall_cuda_kernel, //
2444+ 16 , 16 , 32 , 90 , 128 , tensor_core_scale_t ::single_k)
2445+ ->MinTime(10 );
2446+ BENCHMARK_CAPTURE ( //
2447+ theoretic_tops_cuda, i16i32_sm90dpx, tops_i16i32_sm90dpx_16x16x32_loop128_needleman_wunsch_cuda_kernel, //
2448+ 16 , 16 , 32 , 90 , 128 , tensor_core_scale_t ::single_k)
2449+ ->MinTime(10 );
2450+ BENCHMARK_CAPTURE ( //
2451+ theoretic_tops_cuda, i32i32_sm90dpx, tops_i32i32_sm90dpx_16x16x16_loop128_smith_waterman_cuda_kernel, //
2452+ 16 , 16 , 16 , 90 , 128 , tensor_core_scale_t ::single_k)
2453+ ->MinTime(10 );
2454+
23962455#include < filesystem> // `std::filesystem::absolute` to locate PTX IR file
23972456
23982457/* *
0 commit comments