From b3387886d5cd4957067d03e96c7db9ca4c607c3b Mon Sep 17 00:00:00 2001 From: Arist12 Date: Thu, 4 Jun 2026 15:57:25 +0000 Subject: [PATCH] enh(hotspot_analyzer): add --kernel filter for CSV metadata matching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The existing CSV row-selection heuristic matches by comparing the dispatch directory basename against Kernel_Name in the kernel trace CSV. This works for rocprofv3's timestamped output (e.g. 20240101_120000_pa_decode_kernel), but fails completely for the ui_output_agent__dispatch_ layout produced by rocprofv3's ATT decode step — the basename carries no kernel name, only agent and dispatch numbers. When metadata lookup fails the analyzer falls back to ISA-estimated register counts and prints a warning, silently under-reporting VGPR, SGPR, LDS, and occupancy for every ui_output_agent_* trace. Fix by adding a --kernel SUBSTR option that enables an explicit row-selection path: 1. Substrings-matches Kernel_Name against the supplied filter. 2. If the CSV has a Dispatch_Id column and the directory name encodes dispatch_, also requires the row's Dispatch_Id to match — avoiding false matches when a PyTorch reference kernel shares the same name prefix. 3. Falls back gracefully to kernel-name-only matching when Dispatch_Id is absent from the CSV. The legacy heuristic is unchanged and still used when --kernel is not given, so existing timestamped-dir workflows are unaffected. Update the "not matched" warning to tell users about --kernel so the fix is discoverable without reading source. Example: python hotspot_analyzer.py ui_output_agent_15249_dispatch_223 \ --topk 8 --mode src --detail \ --kernel pa_mqa_logits_fp4_kernel_0 --- .../scripts/hotspot_analyzer.py | 74 ++++++++++++++----- 1 file changed, 57 insertions(+), 17 deletions(-) diff --git a/.claude/skills/kernel-trace-analysis/scripts/hotspot_analyzer.py b/.claude/skills/kernel-trace-analysis/scripts/hotspot_analyzer.py index 2df937b18..2fd7babba 100644 --- a/.claude/skills/kernel-trace-analysis/scripts/hotspot_analyzer.py +++ b/.claude/skills/kernel-trace-analysis/scripts/hotspot_analyzer.py @@ -238,7 +238,7 @@ def print_source_detail(hotspot, source_cache, context=3): print(f" stall={fmt_cycles(inst.stall_cycles):>7} type={inst.stall_type:<12} {inst.asm}") -def read_kernel_metadata(dispatch_dir): +def read_kernel_metadata(dispatch_dir, kernel_filter=""): """Read authoritative resource counts from ``out_kernel_trace.csv`` if present. The ATT ``code.json`` only contains the (possibly single-CU, possibly @@ -246,10 +246,25 @@ def read_kernel_metadata(dispatch_dir): workgroup size. The kernel-trace CSV carries the real launch metadata. Searches the dispatch dir and its parent (staging often copies the CSV next to the ui_output_agent_* dir). Returns {} if not found. + + Row selection priority: + 1. ``kernel_filter`` substring matched against Kernel_Name, optionally + narrowed by Dispatch_Id when the dir name encodes ``dispatch_`` + (rocprofv3 ``ui_output_agent_*_dispatch_`` layout). Dispatch_Id + matching avoids false matches when a PyTorch reference kernel shares + the same name substring. + 2. Bidirectional name heuristic against the directory basename (legacy + path for timestamped dirs like ``20240101_120000_pa_decode_kernel``). """ candidates = [] for base in (dispatch_dir, os.path.dirname(os.path.abspath(dispatch_dir))): candidates += glob.glob(os.path.join(base, "*kernel_trace*.csv")) + + dir_name = os.path.basename(os.path.abspath(dispatch_dir)) + # Extract the dispatch id from rocprofv3's ui_output_agent__dispatch_ layout. + _dispatch_id_m = re.search(r"dispatch_(\d+)$", dir_name) + dispatch_id = _dispatch_id_m.group(1) if _dispatch_id_m else None + for path in candidates: try: with open(path) as f: @@ -258,24 +273,35 @@ def read_kernel_metadata(dispatch_dir): continue if not rows or "Accum_VGPR_Count" not in rows[0]: continue - # Pick the row whose kernel matches the dispatch dir name. The dir is - # usually staged as "_" while the CSV - # Kernel_Name has a trailing index (e.g. dir ".._pa_decode_ps_kernel" - # vs kernel "pa_decode_ps_kernel_0"), so match bidirectionally on the - # timestamp-stripped short name. - dir_name = os.path.basename(os.path.abspath(dispatch_dir)) - short = re.sub(r"^\d{8}_\d{6}_", "", dir_name) # strip YYYYMMDD_HHMMSS_ - - def _matches(kn): - if not kn: - return False - return kn in dir_name or short in kn or kn.startswith(short) or short.startswith(kn) + + has_dispatch_col = "Dispatch_Id" in rows[0] chosen = None - for r in rows: - if _matches(r.get("Kernel_Name", "")): + if kernel_filter: + # Explicit filter: kernel name substring, narrowed by Dispatch_Id when available. + for r in rows: + if kernel_filter not in r.get("Kernel_Name", ""): + continue + if dispatch_id and has_dispatch_col: + if str(r.get("Dispatch_Id", "")).strip() != dispatch_id: + continue chosen = r break + else: + # Legacy heuristic: bidirectional substring match against the dir basename. + # Works for timestamped dirs like ``20240101_120000_pa_decode_kernel``. + short = re.sub(r"^\d{8}_\d{6}_", "", dir_name) # strip YYYYMMDD_HHMMSS_ + + def _matches(kn): + if not kn: + return False + return kn in dir_name or short in kn or kn.startswith(short) or short.startswith(kn) + + for r in rows: + if _matches(r.get("Kernel_Name", "")): + chosen = r + break + if chosen is None: continue # no matching row in this CSV — try the next candidate @@ -457,7 +483,10 @@ def print_reg_pressure(reg_info): print_header("Register Pressure & Occupancy") print(f" Architecture: {reg_info['arch']}") if not reg_info["has_meta"]: - print(" (no kernel_trace CSV found — accum/LDS/SGPR estimated from ISA only)") + print( + " (kernel_trace CSV not matched — accum/LDS/SGPR estimated from ISA only; " + "pass --kernel to enable CSV metadata lookup)" + ) if reg_info["is_vgpr_form"]: print(f" arch_vgpr: {reg_info['arch_vgpr']} (MFMA vgpr-form: accumulators in arch file, no AGPR)") else: @@ -496,6 +525,17 @@ def main(): "--detail", action="store_true", help="Show source snippet + instruction breakdown under each source hotspot" ) parser.add_argument("--context", type=int, default=3, help="Source lines of context around hotspot (default: 3)") + parser.add_argument( + "--kernel", + default="", + metavar="SUBSTR", + help="Kernel name substring for CSV metadata lookup " + "(e.g. 'pa_mqa_logits_fp4_kernel_0'). " + "Required when the dispatch dir name does not encode the kernel name, " + "as with rocprofv3 ui_output_agent_*_dispatch_ directories. " + "Combined with the dispatch id from the dir name when a Dispatch_Id " + "column is present in the CSV.", + ) args = parser.parse_args() if not os.path.isdir(args.dispatch_dir): @@ -515,7 +555,7 @@ def main(): print(f" Total cycles: {fmt_cycles(total_cycles)}") print(f" Total stalls: {fmt_cycles(total_stall)} ({100*total_stall/total_cycles:.1f}% of total cycles)") - meta = read_kernel_metadata(args.dispatch_dir) + meta = read_kernel_metadata(args.dispatch_dir, kernel_filter=args.kernel) reg_info = detect_arch_and_reg_pressure(instructions, meta) print_reg_pressure(reg_info)