diff --git a/tests/test_common.py b/tests/test_common.py index 6edd251e..ba04bf6d 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -8,7 +8,6 @@ import numpy as np import pandas as pd import torch -import torch.profiler as tpf logger = logging.getLogger("flydsl") @@ -59,18 +58,40 @@ def wrapper(*args, **kwargs): latencies.append(start_event.elapsed_time(end_event)) avg = np.mean(latencies) * 1000 logger.info(f"avg: {avg} us/iter from cuda.Event") - with tpf.profile( - activities=[tpf.ProfilerActivity.CPU, tpf.ProfilerActivity.CUDA], - profile_memory=False, - with_stack=False, - with_modules=True, - ) as prof: - data = run_iters_rotate(num_iters, func, rotate_args) - torch.cuda.synchronize() + if int(os.environ.get("FLYDSL_PERFTEST_USE_EVENTS", 0)): + # Profiler-safe timing path: avoids nesting torch.profiler under an + # external rocprofv3 session. Each iteration is timed with a pair of + # HIP events; the reported average matches rocprofv3 dispatch timing. + data = None + latencies = [] + for iter_idx in range(num_iters): + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + args_i, kwargs_i = rotate_args[iter_idx % len(rotate_args)] + start_event.record() + data = func(*args_i, **kwargs_i) + end_event.record() + end_event.synchronize() + latencies.append(start_event.elapsed_time(end_event)) torch.cuda.empty_cache() - avg = get_trace_perf(prof, num_iters) + avg = np.mean(latencies) * 1000 + else: + import torch.profiler as tpf + + with tpf.profile( + activities=[tpf.ProfilerActivity.CPU, tpf.ProfilerActivity.CUDA], + profile_memory=False, + with_stack=False, + with_modules=True, + ) as prof: + data = run_iters_rotate(num_iters, func, rotate_args) + torch.cuda.synchronize() + torch.cuda.empty_cache() + avg = get_trace_perf(prof, num_iters) if testGraph: + import torch.profiler as tpf + graph = torch.cuda.CUDAGraph() with torch.cuda.graph(graph): data = run_iters_rotate(num_iters, func, rotate_args)