Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
GraphViz = "f526b714-d49f-11e8-06ff-31ed36ee7ee0"
JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
LinuxPerf = "b4c46c6c-4fb0-484d-a11a-41bc3392d094"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
OpenCL = "08131aa3-fb12-5dee-8b74-c09406e224a2"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
Expand All @@ -54,6 +55,7 @@ GraphVizExt = "GraphViz"
GraphVizSimpleExt = "Colors"
IntelExt = "oneAPI"
JSON3Ext = "JSON3"
LinuxPerfExt = "LinuxPerf"
MetalExt = "Metal"
OpenCLExt = "OpenCL"
PlotsExt = ["DataFrames", "Plots"]
Expand All @@ -76,6 +78,7 @@ GraphViz = "0.2"
Graphs = "1"
JSON3 = "1"
KernelAbstractions = "0.9"
LinuxPerf = "0.4.2"
MacroTools = "0.5"
MemPool = "0.4.12"
Metal = "1.1"
Expand Down
109 changes: 109 additions & 0 deletions ext/LinuxPerfExt.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
module LinuxPerfExt

import Dagger
import LinuxPerf

import TimespanLogging
import TimespanLogging: Event, init_similar

# Default perf metric specification: a comma-separated set of common software
# counters, used when no explicit spec string is supplied.
const DEFAULT_METRIC_SPEC =
    "cpu-clock, page-faults, context-switches, cpu-migrations, minor-faults, major-faults"

"""
    LinuxPerfMetrics

Event-callback metric collector that reads Linux `perf` counters around each
TimespanLogging event. Constructed from a perf-style metric spec string; the
underlying `PerfBench` is built lazily on first use (see `_ensure_bench!`).
"""
mutable struct LinuxPerfMetrics
    # Perf-style grouping string, e.g. "(cpu-cycles,instructions), page-faults"
    metric_spec::String
    # Lazily constructed bench; metric names are read from bench.groups[*].event_types
    bench::Union{Nothing, LinuxPerf.PerfBench}
    # One atomic refcount per EventGroup: enable on 0->1, disable on 1->0
    refcounts::Vector{Threads.Atomic{Int}}
    # Baseline counter snapshots per active event, keyed by (category, id)
    active_baselines::Dict{Any, Vector{UInt64}}
end

# Construct from a spec string; the bench/refcounts are populated lazily.
LinuxPerfMetrics(spec::String) =
    LinuxPerfMetrics(spec, nothing, Threads.Atomic{Int}[], Dict{Any,Vector{UInt64}}())
# Default construction uses DEFAULT_METRIC_SPEC.
LinuxPerfMetrics() = LinuxPerfMetrics(DEFAULT_METRIC_SPEC)
# TimespanLogging hook: independent collectors share only the spec string.
init_similar(m::LinuxPerfMetrics) = LinuxPerfMetrics(m.metric_spec)

# Build the PerfBench from the metric spec string on first use.
# Lazily build the PerfBench (and one zeroed refcount per EventGroup) from the
# metric spec string; subsequent calls return the cached bench unchanged.
function _ensure_bench!(metrics::LinuxPerfMetrics)
    bench = metrics.bench
    if bench === nothing
        specs = LinuxPerf.parse_groups(metrics.metric_spec)
        groups = LinuxPerf.EventGroup[LinuxPerf.EventGroup(spec) for spec in specs]
        bench = LinuxPerf.PerfBench(0, groups)
        metrics.bench = bench
        metrics.refcounts = [Threads.Atomic{Int}(0) for _ in groups]
    end
    return bench
end

# Read raw counter values from every group, ordered group-by-group.
# Read format per group: [nr, time_enabled, time_running, value_1, ..., value_nr]
# Snapshot raw counter values from every successfully-opened group, in
# group-by-group order. Group read layout (per leader fd):
#   [nr, time_enabled, time_running, value_1, ..., value_nr]
function _read_raw_counters(bench::LinuxPerf.PerfBench)
    counters = UInt64[]
    for group in bench.groups
        # Skip groups whose perf fds failed to open
        group.leader_fd == -1 && continue
        n = length(group)
        buf = read!(group.leader_io, Vector{UInt64}(undef, n + 3))
        # The first three words are metadata; the rest are the counter values
        append!(counters, @view buf[4:end])
    end
    return counters
end

# Subtract baseline from current snapshot; build name->delta dict using
# metric names sourced from the bench's EventGroup event_types.
# Compute per-counter deltas against `baseline` and label them with metric
# names taken from each group's event_types (falling back to the raw event
# type's string form when no friendly name is registered).
function _delta_to_dict(bench::LinuxPerf.PerfBench, baseline::Vector{UInt64})
    snapshot = _read_raw_counters(bench)
    deltas = Dict{String, Int64}()
    pos = 0
    for group in bench.groups
        # Match the ordering/skipping used by _read_raw_counters
        group.leader_fd == -1 && continue
        for event_type in group.event_types
            pos += 1
            label = get(LinuxPerf.EVENT_TO_NAME, event_type, string(event_type))
            deltas[label] = Int64(snapshot[pos] - baseline[pos])
        end
    end
    return deltas
end

# On event start: enable counting (refcounted per group) and record a baseline
# counter snapshot for this (category, id) pair.
function (metrics::LinuxPerfMetrics)(ev::Event{:start})
    bench = _ensure_bench!(metrics)
    isempty(bench.groups) && return nothing

    # Enable each group on the 0->1 transition only. Skip groups whose perf
    # fds failed to open (leader_fd == -1) — _read_raw_counters already skips
    # them, and reset!/enable! would ioctl on an invalid fd.
    for (group, rc) in zip(bench.groups, metrics.refcounts)
        group.leader_fd == -1 && continue
        old = Threads.atomic_add!(rc, 1)
        if old == 0
            LinuxPerf.reset!(group)
            LinuxPerf.enable!(group)
        end
    end

    # Baseline snapshot for the matching :finish handler to diff against
    metrics.active_baselines[(ev.category, ev.id)] = _read_raw_counters(bench)
    nothing
end

# On event finish: compute name->delta counter values since the matching
# :start, then disable counting (refcounted per group). Returns `nothing` if
# no matching start was recorded.
function (metrics::LinuxPerfMetrics)(ev::Event{:finish})
    bench = metrics.bench
    bench === nothing && return nothing

    baseline = pop!(metrics.active_baselines, (ev.category, ev.id), nothing)
    baseline === nothing && return nothing

    # Read delta before potentially disabling any group
    result = _delta_to_dict(bench, baseline)

    # Disable each group on the 1->0 transition only. Skip groups whose perf
    # fds failed to open (leader_fd == -1), mirroring the :start handler —
    # their refcounts were never incremented and disable! would ioctl on an
    # invalid fd.
    for (group, rc) in zip(bench.groups, metrics.refcounts)
        group.leader_fd == -1 && continue
        old = Threads.atomic_sub!(rc, 1)
        if old == 1
            LinuxPerf.disable!(group)
        end
    end

    return result
end

end # module LinuxPerfExt
90 changes: 90 additions & 0 deletions src/utils/logging-events.jl
Original file line number Diff line number Diff line change
Expand Up @@ -276,4 +276,94 @@ function (::TaskToChunk)(ev::Event{:finish})
return
end

"""
GCStats

Tracks GC allocations (`Base.GC_Diff.allocd`) between the start and finish of
each event, using the `gc_num` field already captured in each `Event`.
"""
mutable struct GCStats
active_starts::Dict{Any, Base.GC_Num}
end
GCStats() = GCStats(Dict{Any, Base.GC_Num}())
init_similar(::GCStats) = GCStats()

# Record the event's GC_Num snapshot so the :finish handler can diff it.
function (gs::GCStats)(ev::Event{:start})
    key = (ev.category, ev.id)
    gs.active_starts[key] = ev.gc_num
    return nothing
end
# Return bytes allocated between start and finish, or `nothing` when no
# matching :start was seen.
function (gs::GCStats)(ev::Event{:finish})
    key = (ev.category, ev.id)
    haskey(gs.active_starts, key) || return nothing
    start_gc = pop!(gs.active_starts, key)
    return Base.GC_Diff(ev.gc_num, start_gc).allocd
end

"""
LockContentionMetrics

Tracks the number of lock contention events between the start and finish of
each event. Uses a global atomic refcount to enable/disable
`Base.Threads.lock_profiling` only while at least one event is in-flight.
"""
const _lock_contention_refcount = Threads.Atomic{Int}(0)

mutable struct LockContentionMetrics
active_starts::Dict{Any, Int64}
end
LockContentionMetrics() = LockContentionMetrics(Dict{Any, Int64}())
init_similar(::LockContentionMetrics) = LockContentionMetrics()

# Enable process-wide lock profiling on the first in-flight event (0 -> 1),
# then record the current conflict count as this event's baseline.
function (lc::LockContentionMetrics)(ev::Event{:start})
    if Threads.atomic_add!(_lock_contention_refcount, 1) == 0
        Base.Threads.lock_profiling(true)
    end
    key = (ev.category, ev.id)
    lc.active_starts[key] = Int64(Base.Threads.LOCK_CONFLICT_COUNT[])
    return nothing
end
# Return lock conflicts observed between start and finish, disabling
# profiling on the last in-flight event (1 -> 0). Returns `nothing` when no
# matching :start was seen.
function (lc::LockContentionMetrics)(ev::Event{:finish})
    key = (ev.category, ev.id)
    haskey(lc.active_starts, key) || return nothing
    start_count = pop!(lc.active_starts, key)
    # Snapshot before potentially turning profiling off
    finish_count = Int64(Base.Threads.LOCK_CONFLICT_COUNT[])
    if Threads.atomic_sub!(_lock_contention_refcount, 1) == 1
        Base.Threads.lock_profiling(false)
    end
    return finish_count - start_count
end

"""
CompileTimeMetrics

Tracks cumulative Julia compile time (via `Base.cumulative_compile_time_ns`)
between the start and finish of each event. Uses a global atomic refcount to
enable/disable timing only while at least one event is in-flight.
"""
const _compile_timing_refcount = Threads.Atomic{Int}(0)

mutable struct CompileTimeMetrics
active_starts::Dict{Any, Int64}
end
CompileTimeMetrics() = CompileTimeMetrics(Dict{Any, Int64}())
init_similar(::CompileTimeMetrics) = CompileTimeMetrics()

# Enable process-wide compile-time accounting on the first in-flight event
# (0 -> 1), then record the current cumulative compile time as the baseline.
function (ct::CompileTimeMetrics)(ev::Event{:start})
    if Threads.atomic_add!(_compile_timing_refcount, 1) == 0
        Base.cumulative_compile_timing(true)
    end
    # cumulative_compile_time_ns() returns a tuple; index 1 is compile time
    key = (ev.category, ev.id)
    ct.active_starts[key] = Int64(Base.cumulative_compile_time_ns()[1])
    return nothing
end
# Return nanoseconds of compilation between start and finish, disabling
# accounting on the last in-flight event (1 -> 0). Returns `nothing` when no
# matching :start was seen.
function (ct::CompileTimeMetrics)(ev::Event{:finish})
    key = (ev.category, ev.id)
    haskey(ct.active_starts, key) || return nothing
    start_ns = pop!(ct.active_starts, key)
    # Snapshot before potentially turning accounting off
    finish_ns = Int64(Base.cumulative_compile_time_ns()[1])
    if Threads.atomic_sub!(_compile_timing_refcount, 1) == 1
        Base.cumulative_compile_timing(false)
    end
    return finish_ns - start_ns
end

end # module Events
36 changes: 35 additions & 1 deletion src/utils/logging.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ Extra events:
- `taskuidtotid::Bool`: Enables reporting of task UID-to-TID mappings
- `tasktochunk::Bool`: Enables reporting of DTask-to-Chunk mappings
- `profile::Bool`: Enables profiling of task execution; not currently recommended, as it adds significant overhead
- `linuxperf::Bool`: Enables Linux perf event collection (requires LinuxPerf.jl to be loaded)
- `linuxperf_metrics::String`: Perf-style metric specification string used when `linuxperf` is enabled; an empty string selects the default metric set
- `gc_stats::Bool`: Enables GC allocation tracking per event
- `lock_contend::Bool`: Enables lock contention counting per event
- `compile_time::Bool`: Enables Julia compile-time tracking per event
"""
function enable_logging!(;metrics::Bool=false,
timeline::Bool=false,
Expand All @@ -30,7 +34,12 @@ function enable_logging!(;metrics::Bool=false,
taskresult::Bool=false,
taskuidtotid::Bool=false,
tasktochunk::Bool=false,
profile::Bool=false)
profile::Bool=false,
linuxperf::Bool=false,
linuxperf_metrics::String="",
gc_stats::Bool=false,
lock_contend::Bool=false,
compile_time::Bool=false)
ml = TimespanLogging.MultiEventLog()
ml[:core] = TimespanLogging.Events.CoreMetrics()
ml[:id] = TimespanLogging.Events.IDMetrics()
Expand Down Expand Up @@ -81,10 +90,35 @@ function enable_logging!(;metrics::Bool=false,
ml[:esat] = TimespanLogging.Events.EventSaturation()
ml[:psat] = Dagger.Events.ProcessorSaturation()
end
if linuxperf
lp = _make_linuxperf_metrics(linuxperf_metrics)
if lp !== nothing
ml[:linuxperf] = lp
end
end
if gc_stats
ml[:gc_stats] = Dagger.Events.GCStats()
end
if lock_contend
ml[:lock_contend] = Dagger.Events.LockContentionMetrics()
end
if compile_time
ml[:compile_time] = Dagger.Events.CompileTimeMetrics()
end
Dagger.Sch.eager_context().log_sink = ml
return
end

# Build a LinuxPerfMetrics collector via the LinuxPerfExt package extension.
# Throws when LinuxPerf.jl has not been loaded (so the extension is absent);
# an empty spec string selects the extension's default metric set.
function _make_linuxperf_metrics(linuxperf_metrics::String)
    ext = Base.get_extension(Dagger, :LinuxPerfExt)
    if ext === nothing
        throw(ErrorException("LinuxPerf.jl is not loaded"))
    end
    return isempty(linuxperf_metrics) ? ext.LinuxPerfMetrics() :
           ext.LinuxPerfMetrics(linuxperf_metrics)
end

"""
disable_logging!()

Expand Down
Loading
Loading