From 4c572540068aefeb50743bb1ae76de23a0dd37bb Mon Sep 17 00:00:00 2001
From: Arist12
Date: Thu, 4 Jun 2026 15:54:47 +0000
Subject: [PATCH] enh(test_common): add profiler-safe HIP-event timing path to run_perftest

Add FLYDSL_PERFTEST_USE_EVENTS=1 to time benchmark iterations with a
pair of HIP events rather than torch.profiler. When set, each iteration
is bracketed by Event.record() / Event.synchronize() and the mean
latency is returned as usual, but torch.profiler is never entered.

This is necessary when running benchmarks under an external rocprofv3
session: nesting torch.profiler (ROCTracer) inside rocprofv3 produces
duplicate-flow warnings and can perturb timing. With the events path
the benchmark command line stays identical; only the internal timing
backend changes.

Lazy-import torch.profiler in the default path so the module-level
import no longer pulls in ROCTracer on every test collection. The
testGraph path gets the same lazy import.

FLYDSL_PERFTEST_USE_EVENTS is not set in any test; default behavior is
unchanged.
--- tests/test_common.py | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/tests/test_common.py b/tests/test_common.py index 6edd251ed..ba04bf6d0 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -8,7 +8,6 @@ import numpy as np import pandas as pd import torch -import torch.profiler as tpf logger = logging.getLogger("flydsl") @@ -59,18 +58,40 @@ def wrapper(*args, **kwargs): latencies.append(start_event.elapsed_time(end_event)) avg = np.mean(latencies) * 1000 logger.info(f"avg: {avg} us/iter from cuda.Event") - with tpf.profile( - activities=[tpf.ProfilerActivity.CPU, tpf.ProfilerActivity.CUDA], - profile_memory=False, - with_stack=False, - with_modules=True, - ) as prof: - data = run_iters_rotate(num_iters, func, rotate_args) - torch.cuda.synchronize() + if int(os.environ.get("FLYDSL_PERFTEST_USE_EVENTS", 0)): + # Profiler-safe timing path: avoids nesting torch.profiler under an + # external rocprofv3 session. Each iteration is timed with a pair of + # HIP events; the reported average matches rocprofv3 dispatch timing. 
+ data = None + latencies = [] + for iter_idx in range(num_iters): + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + args_i, kwargs_i = rotate_args[iter_idx % len(rotate_args)] + start_event.record() + data = func(*args_i, **kwargs_i) + end_event.record() + end_event.synchronize() + latencies.append(start_event.elapsed_time(end_event)) torch.cuda.empty_cache() - avg = get_trace_perf(prof, num_iters) + avg = np.mean(latencies) * 1000 + else: + import torch.profiler as tpf + + with tpf.profile( + activities=[tpf.ProfilerActivity.CPU, tpf.ProfilerActivity.CUDA], + profile_memory=False, + with_stack=False, + with_modules=True, + ) as prof: + data = run_iters_rotate(num_iters, func, rotate_args) + torch.cuda.synchronize() + torch.cuda.empty_cache() + avg = get_trace_perf(prof, num_iters) if testGraph: + import torch.profiler as tpf + graph = torch.cuda.CUDAGraph() with torch.cuda.graph(graph): data = run_iters_rotate(num_iters, func, rotate_args)