Skip to content

Commit 4445bf3

Browse files
authored
Float FMA vs Integer DP4A & DPX Instructions ☣️
2 parents cccf5fc + 9accccd commit 4445bf3

File tree

3 files changed

+285
-44
lines changed

3 files changed

+285
-44
lines changed

.vscode/settings.json

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
"accum",
66
"Adelstein",
77
"Andreas",
8+
"APSP",
89
"ashvardanian",
910
"ASIO",
1011
"asynchrony",
@@ -33,6 +34,7 @@
3334
"cuBLAS",
3435
"cuBLASLt",
3536
"CUDA",
37+
"cuobjdump",
3638
"denormal",
3739
"DOTPROD",
3840
"DPDK",
@@ -81,6 +83,7 @@
8183
"MSVC",
8284
"Müller",
8385
"multishot",
86+
"Needleman",
8487
"Neoverse",
8588
"Niebler",
8689
"Niels",
@@ -104,6 +107,7 @@
104107
"quadpair",
105108
"RDMA",
106109
"reorderable",
110+
"semiring",
107111
"Shankhdhar",
108112
"simdjson",
109113
"sinf",
@@ -120,6 +124,7 @@
120124
"Threadblock",
121125
"TMUL",
122126
"Trettner",
127+
"uchar",
123128
"Unbundling",
124129
"Unif",
125130
"unifex",
@@ -132,11 +137,13 @@
132137
"VHDL",
133138
"VNNI",
134139
"VPCLMULQDQ",
135-
"warpgroup",
140+
"WarpGroup",
141+
"Warshall",
136142
"Weis",
137143
"WGMMA",
138144
"wmma",
139145
"Worklog",
146+
"Wunsch",
140147
"XCOMP",
141148
"XFEATURE",
142149
"XTILE",

less_slow.cpp

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2325,9 +2325,46 @@ static void theoretic_tops_cuda( //
23252325
state.counters["TOP"] = benchmark::Counter(tops_per_gpu * state.iterations(), benchmark::Counter::kIsRate);
23262326
}
23272327

2328+
extern __global__ void tops_f32f32_sm60fma_16x16x16_loop128_cuda_kernel();
2329+
extern __global__ void tops_f64f64_sm60fma_16x16x16_loop128_cuda_kernel();
2330+
extern __global__ void tops_i32i32_sm60fma_16x16x16_loop128_cuda_kernel();
2331+
extern __global__ void tops_i64i64_sm60fma_16x16x16_loop128_cuda_kernel();
2332+
extern __global__ void tops_u8u32_sm60fma_16x16x64_loop128_cuda_kernel();
2333+
extern __global__ void tops_u24u32_sm60fma_16x16x16_loop128_cuda_kernel();
2334+
2335+
BENCHMARK_CAPTURE( //
2336+
theoretic_tops_cuda, f32f32_sm60fma, tops_f32f32_sm60fma_16x16x16_loop128_cuda_kernel, //
2337+
16, 16, 16, 60, 128, tensor_core_scale_t::single_k)
2338+
->MinTime(10);
2339+
BENCHMARK_CAPTURE( //
2340+
theoretic_tops_cuda, f64f64_sm60fma, tops_f64f64_sm60fma_16x16x16_loop128_cuda_kernel, //
2341+
16, 16, 16, 60, 128, tensor_core_scale_t::single_k)
2342+
->MinTime(10);
2343+
BENCHMARK_CAPTURE( //
2344+
theoretic_tops_cuda, i32i32_sm60fma, tops_i32i32_sm60fma_16x16x16_loop128_cuda_kernel, //
2345+
16, 16, 16, 60, 128, tensor_core_scale_t::single_k)
2346+
->MinTime(10);
2347+
BENCHMARK_CAPTURE( //
2348+
theoretic_tops_cuda, i64i64_sm60fma, tops_i64i64_sm60fma_16x16x16_loop128_cuda_kernel, //
2349+
16, 16, 16, 60, 128, tensor_core_scale_t::single_k)
2350+
->MinTime(10);
2351+
BENCHMARK_CAPTURE( //
2352+
theoretic_tops_cuda, u8u32_sm60fma, tops_u8u32_sm60fma_16x16x64_loop128_cuda_kernel, //
2353+
16, 16, 64, 60, 128, tensor_core_scale_t::single_k)
2354+
->MinTime(10);
2355+
BENCHMARK_CAPTURE( //
2356+
theoretic_tops_cuda, u24u32_sm60fma, tops_u24u32_sm60fma_16x16x16_loop128_cuda_kernel, //
2357+
16, 16, 16, 60, 128, tensor_core_scale_t::single_k)
2358+
->MinTime(10);
2359+
2360+
extern __global__ void tops_f16f16_sm70fma_16x16x16_loop128_cuda_kernel();
23282361
extern __global__ void tops_f16f16_sm70wmma_16x16x16_loop128_cuda_kernel();
23292362
extern __global__ void tops_f16f32_sm70wmma_16x16x16_loop128_cuda_kernel();
23302363

2364+
BENCHMARK_CAPTURE( //
2365+
theoretic_tops_cuda, f16f16_sm60fma, tops_f16f16_sm70fma_16x16x16_loop128_cuda_kernel, //
2366+
16, 16, 16, 70, 128, tensor_core_scale_t::single_k)
2367+
->MinTime(10);
23312368
BENCHMARK_CAPTURE( //
23322369
theoretic_tops_cuda, f16f16_sm70wmma, tops_f16f16_sm70wmma_16x16x16_loop128_cuda_kernel, //
23332370
16, 16, 16, 70, 128, tensor_core_scale_t::warp_k)
@@ -2354,11 +2391,16 @@ BENCHMARK_CAPTURE(
23542391
8, 8, 128, 75, 128, tensor_core_scale_t::warp_k)
23552392
->MinTime(10);
23562393

2394+
extern __global__ void tops_bf16bf16_sm80fma_16x16x16_loop128_cuda_kernel();
23572395
extern __global__ void tops_bf16f32_sm80wmma_16x16x16_loop128_cuda_kernel();
23582396
extern __global__ void tops_tf32f32_sm80wmma_16x16x8_loop128_cuda_kernel();
23592397
extern __global__ void tops_f64f64_sm80wmma_8x8x4_loop128_cuda_kernel();
23602398
extern __global__ void tops_b1i32and_sm80wmma_8x8x128_loop128_cuda_kernel();
23612399

2400+
BENCHMARK_CAPTURE( //
2401+
theoretic_tops_cuda, bf16bf16_sm60fma, tops_bf16bf16_sm80fma_16x16x16_loop128_cuda_kernel, //
2402+
16, 16, 16, 75, 128, tensor_core_scale_t::single_k)
2403+
->MinTime(10);
23622404
BENCHMARK_CAPTURE( //
23632405
theoretic_tops_cuda, bf16f32_sm80wmma, tops_bf16f32_sm80wmma_16x16x16_loop128_cuda_kernel, //
23642406
16, 16, 16, 80, 128, tensor_core_scale_t::warp_k)
@@ -2393,6 +2435,23 @@ BENCHMARK_CAPTURE(
23932435
64, 256, 8, 90, 128, tensor_core_scale_t::warpgroup_k)
23942436
->MinTime(10);
23952437

2438+
extern __global__ void tops_u16u32_sm90dpx_16x16x32_loop128_floyd_warshall_cuda_kernel();
2439+
extern __global__ void tops_i16i32_sm90dpx_16x16x32_loop128_needleman_wunsch_cuda_kernel();
2440+
extern __global__ void tops_i32i32_sm90dpx_16x16x16_loop128_smith_waterman_cuda_kernel();
2441+
2442+
BENCHMARK_CAPTURE( //
2443+
theoretic_tops_cuda, u16u32_sm90dpx, tops_u16u32_sm90dpx_16x16x32_loop128_floyd_warshall_cuda_kernel, //
2444+
16, 16, 32, 90, 128, tensor_core_scale_t::single_k)
2445+
->MinTime(10);
2446+
BENCHMARK_CAPTURE( //
2447+
theoretic_tops_cuda, i16i32_sm90dpx, tops_i16i32_sm90dpx_16x16x32_loop128_needleman_wunsch_cuda_kernel, //
2448+
16, 16, 32, 90, 128, tensor_core_scale_t::single_k)
2449+
->MinTime(10);
2450+
BENCHMARK_CAPTURE( //
2451+
theoretic_tops_cuda, i32i32_sm90dpx, tops_i32i32_sm90dpx_16x16x16_loop128_smith_waterman_cuda_kernel, //
2452+
16, 16, 16, 90, 128, tensor_core_scale_t::single_k)
2453+
->MinTime(10);
2454+
23962455
#include <filesystem> // `std::filesystem::absolute` to locate PTX IR file
23972456

23982457
/**

0 commit comments

Comments
 (0)