diff --git a/tests/test_common.py b/tests/test_common.py
index 6edd251e..ba04bf6d 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -8,7 +8,6 @@
 import numpy as np
 import pandas as pd
 import torch
-import torch.profiler as tpf
 
 logger = logging.getLogger("flydsl")
 
@@ -59,18 +58,40 @@ def wrapper(*args, **kwargs):
                     latencies.append(start_event.elapsed_time(end_event))
                 avg = np.mean(latencies) * 1000
                 logger.info(f"avg: {avg} us/iter from cuda.Event")
-            with tpf.profile(
-                activities=[tpf.ProfilerActivity.CPU, tpf.ProfilerActivity.CUDA],
-                profile_memory=False,
-                with_stack=False,
-                with_modules=True,
-            ) as prof:
-                data = run_iters_rotate(num_iters, func, rotate_args)
-                torch.cuda.synchronize()
+            if int(os.environ.get("FLYDSL_PERFTEST_USE_EVENTS", 0)):
+                # Profiler-safe timing path: avoids nesting torch.profiler under an
+                # external rocprofv3 session.  Each iteration is timed with a pair of
+                # HIP events; the reported average matches rocprofv3 dispatch timing.
+                data = None
+                latencies = []
+                for iter_idx in range(num_iters):
+                    start_event = torch.cuda.Event(enable_timing=True)
+                    end_event = torch.cuda.Event(enable_timing=True)
+                    args_i, kwargs_i = rotate_args[iter_idx % len(rotate_args)]
+                    start_event.record()
+                    data = func(*args_i, **kwargs_i)
+                    end_event.record()
+                    end_event.synchronize()
+                    latencies.append(start_event.elapsed_time(end_event))
                 torch.cuda.empty_cache()
-            avg = get_trace_perf(prof, num_iters)
+                avg = np.mean(latencies) * 1000
+            else:
+                import torch.profiler as tpf
+
+                with tpf.profile(
+                    activities=[tpf.ProfilerActivity.CPU, tpf.ProfilerActivity.CUDA],
+                    profile_memory=False,
+                    with_stack=False,
+                    with_modules=True,
+                ) as prof:
+                    data = run_iters_rotate(num_iters, func, rotate_args)
+                    torch.cuda.synchronize()
+                    torch.cuda.empty_cache()
+                avg = get_trace_perf(prof, num_iters)
 
             if testGraph:
+                import torch.profiler as tpf
+
                 graph = torch.cuda.CUDAGraph()
                 with torch.cuda.graph(graph):
                     data = run_iters_rotate(num_iters, func, rotate_args)