From 4c572540068aefeb50743bb1ae76de23a0dd37bb Mon Sep 17 00:00:00 2001
From: Arist12 <ykzhang@cs.wisc.edu>
Date: Thu, 4 Jun 2026 15:54:47 +0000
Subject: [PATCH] enh(test_common): add profiler-safe HIP-event timing path to
 run_perftest

Add an opt-in FLYDSL_PERFTEST_USE_EVENTS=1 environment variable that times
each benchmark iteration with a pair of HIP events rather than
torch.profiler.  When set, each iteration is bracketed by two
Event.record() calls (start and end), followed by Event.synchronize() on
the end event; the mean latency is returned as usual, but torch.profiler
is never entered in that timing path.

This is necessary when running benchmarks under an external rocprofv3
session: nesting torch.profiler (ROCTracer) inside rocprofv3 produces
duplicate-flow warnings and can perturb timing.  With the events path the
benchmark command line stays identical; only the internal timing backend
changes.

Import torch.profiler lazily inside the default path, replacing the
module-level import, so that importing test_common no longer pulls in
ROCTracer on every test collection.  The testGraph path gets the same
lazy import.

FLYDSL_PERFTEST_USE_EVENTS is not set in any test; default behavior is
unchanged.
---
 tests/test_common.py | 41 +++++++++++++++++++++++++++++++----------
 1 file changed, 31 insertions(+), 10 deletions(-)

diff --git a/tests/test_common.py b/tests/test_common.py
index 6edd251ed..ba04bf6d0 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -8,7 +8,6 @@
 import numpy as np
 import pandas as pd
 import torch
-import torch.profiler as tpf
 
 logger = logging.getLogger("flydsl")
 
@@ -59,18 +58,40 @@ def wrapper(*args, **kwargs):
                     latencies.append(start_event.elapsed_time(end_event))
                 avg = np.mean(latencies) * 1000
                 logger.info(f"avg: {avg} us/iter from cuda.Event")
-            with tpf.profile(
-                activities=[tpf.ProfilerActivity.CPU, tpf.ProfilerActivity.CUDA],
-                profile_memory=False,
-                with_stack=False,
-                with_modules=True,
-            ) as prof:
-                data = run_iters_rotate(num_iters, func, rotate_args)
-                torch.cuda.synchronize()
+            if int(os.environ.get("FLYDSL_PERFTEST_USE_EVENTS", 0)):
+                # Profiler-safe timing path: avoids nesting torch.profiler under an
+                # external rocprofv3 session.  Each iteration is timed with its own
+                # pair of HIP events; elapsed_time() is in ms, so avg is in us/iter.
+                data = None
+                latencies = []
+                for iter_idx in range(num_iters):
+                    start_event = torch.cuda.Event(enable_timing=True)
+                    end_event = torch.cuda.Event(enable_timing=True)
+                    args_i, kwargs_i = rotate_args[iter_idx % len(rotate_args)]
+                    start_event.record()
+                    data = func(*args_i, **kwargs_i)
+                    end_event.record()
+                    end_event.synchronize()
+                    latencies.append(start_event.elapsed_time(end_event))
                 torch.cuda.empty_cache()
-            avg = get_trace_perf(prof, num_iters)
+                avg = np.mean(latencies) * 1000
+            else:
+                import torch.profiler as tpf
+
+                with tpf.profile(
+                    activities=[tpf.ProfilerActivity.CPU, tpf.ProfilerActivity.CUDA],
+                    profile_memory=False,
+                    with_stack=False,
+                    with_modules=True,
+                ) as prof:
+                    data = run_iters_rotate(num_iters, func, rotate_args)
+                    torch.cuda.synchronize()
+                    torch.cuda.empty_cache()
+                avg = get_trace_perf(prof, num_iters)
 
             if testGraph:
+                import torch.profiler as tpf
+
                 graph = torch.cuda.CUDAGraph()
                 with torch.cuda.graph(graph):
                     data = run_iters_rotate(num_iters, func, rotate_args)