From d050e38a56d54ab2f617b0c40593b0517772b026 Mon Sep 17 00:00:00 2001 From: Linjun-AMD Date: Thu, 15 Jan 2026 21:30:02 -0600 Subject: [PATCH 1/3] add new tile size for async Signed-off-by: Linjun-AMD --- example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py index dd65c0298b3..ef3b2e9545e 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py @@ -315,7 +315,7 @@ def scheck(self) -> str: assert False def seqtune(self, max_bm0: int) -> str: - if self.bm0 == max_bm0: + if self.bm0 == max_bm0 or self.bm0 == 64: return "true/*fall back to largest tile*/" else: return f"a.seqlen_q <= {self.bm0}" @@ -847,6 +847,11 @@ def check_hdim_tile( (problem_ctx.hdim, problem_ctx.hdim_v) != (128, 128) and kernel_ctx.tile.F_bm0 != 128 ) + or ( + (problem_ctx.hdim, problem_ctx.hdim_v) == (128, 128) + and kernel_ctx.pipeline.tag != "qr_async" + and kernel_ctx.tile.F_bk0 == 64 + ) ): # non qr_async_trload only support km0=128 tile size when hdim is not 128 # non qr_async only support kn0=128 tile size when hdim is 128 @@ -942,6 +947,7 @@ def get_hdim_tile_size_dict(cls, dtype: str) -> Optional[dict]: ( 96, 128) : [FmhaFwdTileSize(128, 128, 32, 128, 32, 96, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], (128, 128) : [FmhaFwdTileSize( 16, 32, 64, 128, 32, 128, 1, 1, 1, 1, 1, 1, 16, 16, 32, 16, 16, 32, -1), FmhaFwdTileSize( 32, 32, 128, 128, 32, 128, 1, 1, 1, 1, 1, 1, 32, 32, 16, 32, 32, 16, -1), + FmhaFwdTileSize( 64, 128, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 16, 16, 32, 16, 16, 16, -1,CppConstraint('get_num_blocks(64) <= num_cus')), FmhaFwdTileSize(128, 64, 32, 128, 16, 128, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1), FmhaFwdTileSize(128, 128, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], # (160, 160) : [FmhaFwdTileSize(128, 128 , 32, 160, 32, 
160, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, 1)], From 4b6dd360cc0429135041c6be09d2d0a3863ad1a2 Mon Sep 17 00:00:00 2001 From: Linjun-AMD Date: Fri, 16 Jan 2026 11:36:24 +0800 Subject: [PATCH 2/3] Update example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py index ef3b2e9545e..81c7b067d33 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py @@ -947,7 +947,7 @@ def get_hdim_tile_size_dict(cls, dtype: str) -> Optional[dict]: ( 96, 128) : [FmhaFwdTileSize(128, 128, 32, 128, 32, 96, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], (128, 128) : [FmhaFwdTileSize( 16, 32, 64, 128, 32, 128, 1, 1, 1, 1, 1, 1, 16, 16, 32, 16, 16, 32, -1), FmhaFwdTileSize( 32, 32, 128, 128, 32, 128, 1, 1, 1, 1, 1, 1, 32, 32, 16, 32, 32, 16, -1), - FmhaFwdTileSize( 64, 128, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 16, 16, 32, 16, 16, 16, -1,CppConstraint('get_num_blocks(64) <= num_cus')), + FmhaFwdTileSize( 64, 128, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 16, 16, 32, 16, 16, 16, -1, CppConstraint('get_num_blocks(64) <= num_cus')), FmhaFwdTileSize(128, 64, 32, 128, 16, 128, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1), FmhaFwdTileSize(128, 128, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], # (160, 160) : [FmhaFwdTileSize(128, 128 , 32, 160, 32, 160, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, 1)], From 8dae63917217860fc63ea218881257e34a7d23f8 Mon Sep 17 00:00:00 2001 From: Linjun-AMD Date: Sun, 18 Jan 2026 21:36:56 -0600 Subject: [PATCH 3/3] fix lse error Signed-off-by: Linjun-AMD --- .../ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp index 7224ed3a708..e30d4215d68 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp @@ -321,6 +321,8 @@ struct BlockFmhaPipelineQRKSVSAsync { if(num_total_loop <= 0) { + buffer_load_fence(0); // rocm-7.1.1: if the whole tile is masked out, fence(0) is needed here; + // otherwise the computed result is wrong (possibly a compiler bug) if constexpr(kStoreLSE) { auto lse =