Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
378 changes: 192 additions & 186 deletions flash-attn3/build.toml
Original file line number Diff line number Diff line change
@@ -1,17 +1,205 @@
# Package-level metadata for the flash-attn3 kernel build.
[general]
name = "flash-attn3"
license = "BSD-3-Clause"
version = 1
backends = ["cuda"]

[general.cuda]
# NOTE(review): `minver`/`maxver` were each assigned twice in this table
# ("12.8"/"13" and then "12.6"/"13.0"), which TOML rejects. The later pair,
# which carries the ptxas rationale, is kept; confirm this is the intended range.
minver = "12.6"
# ptxas fails on 13.2, but the 13.0 build is backwards compatible.
maxver = "13.0"

[general.hub]
repo-id = "kernels-community/flash-attn3"

# NOTE(review): [torch-noarch] carries no keys and is immediately followed by
# [torch]; confirm that both tables are meant to be present.
[torch-noarch]

# Torch extension settings: a single C++ source file plus the `stable-abi`
# value (presumably the Torch stable-ABI version targeted; confirm).
[torch]
stable-abi = "2.9"
src = ["flash-attn/flash_api_stable.cpp"]

# CUDA kernel target limited to compute capability 8.0. Its translation units
# are the `*_sm80.cu` files under flash-attn/instantiations; the headers are
# listed alongside them in `src`.
[kernel.flash_attn_sm80]
backend = "cuda"
cuda-capabilities = ["8.0"]
# Same flag set as [kernel.flash_attn]; the SM90-named defines are passed to
# this 8.0-only target as well.
cuda-flags = [
    "-O3",
    "-std=c++17",
    "--ftemplate-backtrace-limit=0",
    "--use_fast_math",
    "-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED",
    "-DCUTLASS_ENABLE_GDC_FOR_SM90",
    "-DCUTLASS_DEBUG_TRACE_LEVEL=0",
    "-DNDEBUG",
]
depends = ["torch", "cutlass_4_0"]
# Presumably the include search path for the headers below; confirm.
include = ["flash-attn"]
src = [
    # Headers (.h / .hpp).
    "flash-attn/block.h",
    "flash-attn/copy_sm90_bulk_reduce.hpp",
    "flash-attn/epilogue_bwd.hpp",
    "flash-attn/epilogue_fwd.hpp",
    "flash-attn/flash.h",
    "flash-attn/flash_bwd_kernel_sm80.h",
    "flash-attn/flash_bwd_kernel_sm90.h",
    "flash-attn/flash_bwd_launch_template.h",
    "flash-attn/flash_bwd_postprocess_kernel.h",
    "flash-attn/flash_bwd_preprocess_kernel.h",
    "flash-attn/flash_fwd_launch_template.h",
    "flash-attn/flash_fwd_kernel_sm80.h",
    "flash-attn/flash_fwd_kernel_sm90.h",
    "flash-attn/heuristics.h",
    "flash-attn/mainloop_bwd_sm80.hpp",
    "flash-attn/mainloop_fwd_sm80.hpp",
    "flash-attn/mainloop_bwd_sm90_tma_gmma_ws.hpp",
    "flash-attn/mainloop_fwd_sm90_tma_gmma_ws.hpp",
    "flash-attn/mask.h",
    "flash-attn/named_barrier.hpp",
    "flash-attn/pack_gqa.h",
    "flash-attn/paged_kv.h",
    "flash-attn/rotary.h",
    "flash-attn/sm90_pipeline_no_cluster.hpp",
    "flash-attn/softmax.h",
    "flash-attn/tile_size.h",
    "flash-attn/tile_scheduler.hpp",
    # flash_bwd_* instantiations: hdim{128,192,256,64,96} x {bf16,fp16},
    # each with and without the `softcap` suffix.
    "flash-attn/instantiations/flash_bwd_hdim128_bf16_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim128_bf16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim128_fp16_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim128_fp16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim192_bf16_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim192_bf16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim192_fp16_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim192_fp16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim256_bf16_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim256_bf16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim256_fp16_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim256_fp16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim64_bf16_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim64_bf16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim64_fp16_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim64_fp16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim96_bf16_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim96_bf16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim96_fp16_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim96_fp16_softcap_sm80.cu",
    # flash_fwd_* instantiations: the same hdim/dtype grid, each combined with
    # the `paged`, `split` and `softcap` suffix variants.
    "flash-attn/instantiations/flash_fwd_hdim128_bf16_paged_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_bf16_paged_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_bf16_paged_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_bf16_paged_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_bf16_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_bf16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_bf16_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_bf16_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_fp16_paged_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_fp16_paged_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_fp16_paged_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_fp16_paged_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_fp16_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_fp16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_fp16_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_fp16_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_bf16_paged_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_bf16_paged_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_bf16_paged_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_bf16_paged_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_bf16_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_bf16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_bf16_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_bf16_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_fp16_paged_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_fp16_paged_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_fp16_paged_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_fp16_paged_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_fp16_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_fp16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_fp16_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_fp16_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_bf16_paged_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_bf16_paged_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_bf16_paged_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_bf16_paged_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_bf16_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_bf16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_bf16_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_bf16_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_fp16_paged_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_fp16_paged_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_fp16_paged_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_fp16_paged_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_fp16_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_fp16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_fp16_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_fp16_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_bf16_paged_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_bf16_paged_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_bf16_paged_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_bf16_paged_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_bf16_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_bf16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_bf16_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_bf16_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_fp16_paged_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_fp16_paged_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_fp16_paged_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_fp16_paged_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_fp16_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_fp16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_fp16_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_fp16_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_bf16_paged_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_bf16_paged_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_bf16_paged_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_bf16_paged_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_bf16_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_bf16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_bf16_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_bf16_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_fp16_paged_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_fp16_paged_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_fp16_paged_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_fp16_paged_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_fp16_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_fp16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_fp16_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_fp16_split_softcap_sm80.cu",
]

# CUDA kernel target built for compute capabilities 8.0 and 9.0a. Its `.cu`
# sources are the forward-combine and prepare-scheduler files; the remaining
# `src` entries are headers.
# NOTE(review): unlike [kernel.flash_attn_sm80], no `include` key is set here;
# confirm that is intentional.
[kernel.flash_attn]
backend = "cuda"
cuda-capabilities = ["8.0", "9.0a"]
cuda-flags = [
    "-O3",
    "-std=c++17",
    "--ftemplate-backtrace-limit=0",
    "--use_fast_math",
    "-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED",
    "-DCUTLASS_ENABLE_GDC_FOR_SM90",
    "-DCUTLASS_DEBUG_TRACE_LEVEL=0",
    "-DNDEBUG",
]
depends = ["torch", "cutlass_4_0"]
src = [
    "flash-attn/cuda_check.h",
    "flash-attn/flash_fwd_combine.cu",
    "flash-attn/flash_fwd_combine_kernel.h",
    "flash-attn/flash_fwd_combine_launch_template.h",
    "flash-attn/flash.h",
    "flash-attn/flash_prepare_scheduler.cu",
    "flash-attn/heuristics.h",
    "flash-attn/seqlen.h",
    "flash-attn/static_switch.h",
    "flash-attn/tile_size.h",
    "flash-attn/utils.h",
]

[kernel.flash_attn_sm90]
backend = "cuda"
Expand Down Expand Up @@ -360,185 +548,3 @@ src = [
"flash-attn/instantiations/flash_fwd_hdimdiff_fp16_split_sm90.cu",
"flash-attn/instantiations/flash_fwd_hdimdiff_fp16_split_softcap_sm90.cu",
]

# [kernel.flash_attn_sm80] is defined once, earlier in this file (ahead of
# [kernel.flash_attn_sm90]). The identical second definition that stood here
# is dropped because TOML rejects a table that is defined more than once.

# [kernel.flash_attn] is defined once, earlier in this file (ahead of
# [kernel.flash_attn_sm90]). The identical second definition that stood here
# is dropped because TOML rejects a table that is defined more than once.