Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
GraphViz = "f526b714-d49f-11e8-06ff-31ed36ee7ee0"
JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
LinuxPerf = "b4c46c6c-4fb0-484d-a11a-41bc3392d094"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
OpenCL = "08131aa3-fb12-5dee-8b74-c09406e224a2"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
Expand All @@ -54,6 +55,7 @@ GraphVizExt = "GraphViz"
GraphVizSimpleExt = "Colors"
IntelExt = "oneAPI"
JSON3Ext = "JSON3"
LinuxPerfExt = "LinuxPerf"
MetalExt = "Metal"
OpenCLExt = "OpenCL"
PlotsExt = ["DataFrames", "Plots"]
Expand All @@ -76,6 +78,7 @@ GraphViz = "0.2"
Graphs = "1"
JSON3 = "1"
KernelAbstractions = "0.9"
LinuxPerf = "0.4.2"
MacroTools = "0.5"
MemPool = "0.4.12"
Metal = "1.1"
Expand Down
109 changes: 109 additions & 0 deletions ext/LinuxPerfExt.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
module LinuxPerfExt

import Dagger
import LinuxPerf

import TimespanLogging
import TimespanLogging: Event, init_similar

# Default perf metric specification: a comma-separated set of common software
# counters, used when no explicit spec string is supplied.
const DEFAULT_METRIC_SPEC =
    "cpu-clock, page-faults, context-switches, cpu-migrations, minor-faults, major-faults"

"""
    LinuxPerfMetrics

Event-callback metric collector that reads Linux `perf` counters around each
TimespanLogging event. Constructed from a perf-style metric spec string; the
underlying `PerfBench` is built lazily on first use (see `_ensure_bench!`).
"""
mutable struct LinuxPerfMetrics
    # Perf-style grouping string, e.g. "(cpu-cycles,instructions), page-faults"
    metric_spec::String
    # Lazily constructed bench; metric names are read from bench.groups[*].event_types
    bench::Union{Nothing, LinuxPerf.PerfBench}
    # One atomic refcount per EventGroup: enable on 0->1, disable on 1->0
    refcounts::Vector{Threads.Atomic{Int}}
    # Baseline counter snapshots per active event, keyed by (category, id)
    active_baselines::Dict{Any, Vector{UInt64}}
end

# Construct from a spec string; the bench/refcounts are populated lazily.
LinuxPerfMetrics(spec::String) =
    LinuxPerfMetrics(spec, nothing, Threads.Atomic{Int}[], Dict{Any,Vector{UInt64}}())
# Default construction uses DEFAULT_METRIC_SPEC.
LinuxPerfMetrics() = LinuxPerfMetrics(DEFAULT_METRIC_SPEC)
# TimespanLogging hook: independent collectors share only the spec string.
init_similar(m::LinuxPerfMetrics) = LinuxPerfMetrics(m.metric_spec)

# Build the PerfBench from the metric spec string on first use.
# Lazily build the PerfBench (and one zeroed refcount per EventGroup) from the
# metric spec string; subsequent calls return the cached bench unchanged.
function _ensure_bench!(metrics::LinuxPerfMetrics)
    bench = metrics.bench
    if bench === nothing
        specs = LinuxPerf.parse_groups(metrics.metric_spec)
        groups = LinuxPerf.EventGroup[LinuxPerf.EventGroup(spec) for spec in specs]
        bench = LinuxPerf.PerfBench(0, groups)
        metrics.bench = bench
        metrics.refcounts = [Threads.Atomic{Int}(0) for _ in groups]
    end
    return bench
end

# Read raw counter values from every group, ordered group-by-group.
# Read format per group: [nr, time_enabled, time_running, value_1, ..., value_nr]
# Snapshot raw counter values from every successfully-opened group, in
# group-by-group order. Group read layout (per leader fd):
#   [nr, time_enabled, time_running, value_1, ..., value_nr]
function _read_raw_counters(bench::LinuxPerf.PerfBench)
    counters = UInt64[]
    for group in bench.groups
        # Skip groups whose perf fds failed to open
        group.leader_fd == -1 && continue
        n = length(group)
        buf = read!(group.leader_io, Vector{UInt64}(undef, n + 3))
        # The first three words are metadata; the rest are the counter values
        append!(counters, @view buf[4:end])
    end
    return counters
end

# Subtract baseline from current snapshot; build name->delta dict using
# metric names sourced from the bench's EventGroup event_types.
# Compute per-counter deltas against `baseline` and label them with metric
# names taken from each group's event_types (falling back to the raw event
# type's string form when no friendly name is registered).
function _delta_to_dict(bench::LinuxPerf.PerfBench, baseline::Vector{UInt64})
    snapshot = _read_raw_counters(bench)
    deltas = Dict{String, Int64}()
    pos = 0
    for group in bench.groups
        # Match the ordering/skipping used by _read_raw_counters
        group.leader_fd == -1 && continue
        for event_type in group.event_types
            pos += 1
            label = get(LinuxPerf.EVENT_TO_NAME, event_type, string(event_type))
            deltas[label] = Int64(snapshot[pos] - baseline[pos])
        end
    end
    return deltas
end

# On event start: enable counting (refcounted per group) and record a baseline
# counter snapshot for this (category, id) pair.
function (metrics::LinuxPerfMetrics)(ev::Event{:start})
    bench = _ensure_bench!(metrics)
    isempty(bench.groups) && return nothing

    # Enable each group on the 0->1 transition only. Skip groups whose perf
    # fds failed to open (leader_fd == -1) — _read_raw_counters already skips
    # them, and reset!/enable! would ioctl on an invalid fd.
    for (group, rc) in zip(bench.groups, metrics.refcounts)
        group.leader_fd == -1 && continue
        old = Threads.atomic_add!(rc, 1)
        if old == 0
            LinuxPerf.reset!(group)
            LinuxPerf.enable!(group)
        end
    end

    # Baseline snapshot for the matching :finish handler to diff against
    metrics.active_baselines[(ev.category, ev.id)] = _read_raw_counters(bench)
    nothing
end

# On event finish: compute name->delta counter values since the matching
# :start, then disable counting (refcounted per group). Returns `nothing` if
# no matching start was recorded.
function (metrics::LinuxPerfMetrics)(ev::Event{:finish})
    bench = metrics.bench
    bench === nothing && return nothing

    baseline = pop!(metrics.active_baselines, (ev.category, ev.id), nothing)
    baseline === nothing && return nothing

    # Read delta before potentially disabling any group
    result = _delta_to_dict(bench, baseline)

    # Disable each group on the 1->0 transition only. Skip groups whose perf
    # fds failed to open (leader_fd == -1), mirroring the :start handler —
    # their refcounts were never incremented and disable! would ioctl on an
    # invalid fd.
    for (group, rc) in zip(bench.groups, metrics.refcounts)
        group.leader_fd == -1 && continue
        old = Threads.atomic_sub!(rc, 1)
        if old == 1
            LinuxPerf.disable!(group)
        end
    end

    return result
end

end # module LinuxPerfExt
90 changes: 90 additions & 0 deletions src/utils/logging-events.jl
Original file line number Diff line number Diff line change
Expand Up @@ -276,4 +276,94 @@ function (::TaskToChunk)(ev::Event{:finish})
return
end

"""
GCStats

Tracks GC allocations (`Base.GC_Diff.allocd`) between the start and finish of
each event, using the `gc_num` field already captured in each `Event`.
"""
mutable struct GCStats
active_starts::Dict{Any, Base.GC_Num}
end
GCStats() = GCStats(Dict{Any, Base.GC_Num}())
init_similar(::GCStats) = GCStats()

# Record the event's GC_Num snapshot so the :finish handler can diff it.
function (gs::GCStats)(ev::Event{:start})
    key = (ev.category, ev.id)
    gs.active_starts[key] = ev.gc_num
    return nothing
end
# Return bytes allocated between start and finish, or `nothing` when no
# matching :start was seen.
function (gs::GCStats)(ev::Event{:finish})
    key = (ev.category, ev.id)
    haskey(gs.active_starts, key) || return nothing
    start_gc = pop!(gs.active_starts, key)
    return Base.GC_Diff(ev.gc_num, start_gc).allocd
end

"""
LockContentionMetrics

Tracks the number of lock contention events between the start and finish of
each event. Uses a global atomic refcount to enable/disable
`Base.Threads.lock_profiling` only while at least one event is in-flight.
"""
const _lock_contention_refcount = Threads.Atomic{Int}(0)

mutable struct LockContentionMetrics
active_starts::Dict{Any, Int64}
end
LockContentionMetrics() = LockContentionMetrics(Dict{Any, Int64}())
init_similar(::LockContentionMetrics) = LockContentionMetrics()

# Enable process-wide lock profiling on the first in-flight event (0 -> 1),
# then record the current conflict count as this event's baseline.
function (lc::LockContentionMetrics)(ev::Event{:start})
    if Threads.atomic_add!(_lock_contention_refcount, 1) == 0
        Base.Threads.lock_profiling(true)
    end
    key = (ev.category, ev.id)
    lc.active_starts[key] = Int64(Base.Threads.LOCK_CONFLICT_COUNT[])
    return nothing
end
# Return lock conflicts observed between start and finish, disabling
# profiling on the last in-flight event (1 -> 0). Returns `nothing` when no
# matching :start was seen.
function (lc::LockContentionMetrics)(ev::Event{:finish})
    key = (ev.category, ev.id)
    haskey(lc.active_starts, key) || return nothing
    start_count = pop!(lc.active_starts, key)
    # Snapshot before potentially turning profiling off
    finish_count = Int64(Base.Threads.LOCK_CONFLICT_COUNT[])
    if Threads.atomic_sub!(_lock_contention_refcount, 1) == 1
        Base.Threads.lock_profiling(false)
    end
    return finish_count - start_count
end

"""
CompileTimeMetrics

Tracks cumulative Julia compile time (via `Base.cumulative_compile_time_ns`)
between the start and finish of each event. Uses a global atomic refcount to
enable/disable timing only while at least one event is in-flight.
"""
const _compile_timing_refcount = Threads.Atomic{Int}(0)

mutable struct CompileTimeMetrics
active_starts::Dict{Any, Int64}
end
CompileTimeMetrics() = CompileTimeMetrics(Dict{Any, Int64}())
init_similar(::CompileTimeMetrics) = CompileTimeMetrics()

# Enable process-wide compile-time accounting on the first in-flight event
# (0 -> 1), then record the current cumulative compile time as the baseline.
function (ct::CompileTimeMetrics)(ev::Event{:start})
    if Threads.atomic_add!(_compile_timing_refcount, 1) == 0
        Base.cumulative_compile_timing(true)
    end
    # cumulative_compile_time_ns() returns a tuple; index 1 is compile time
    key = (ev.category, ev.id)
    ct.active_starts[key] = Int64(Base.cumulative_compile_time_ns()[1])
    return nothing
end
# Return nanoseconds of compilation between start and finish, disabling
# accounting on the last in-flight event (1 -> 0). Returns `nothing` when no
# matching :start was seen.
function (ct::CompileTimeMetrics)(ev::Event{:finish})
    key = (ev.category, ev.id)
    haskey(ct.active_starts, key) || return nothing
    start_ns = pop!(ct.active_starts, key)
    # Snapshot before potentially turning accounting off
    finish_ns = Int64(Base.cumulative_compile_time_ns()[1])
    if Threads.atomic_sub!(_compile_timing_refcount, 1) == 1
        Base.cumulative_compile_timing(false)
    end
    return finish_ns - start_ns
end

end # module Events
36 changes: 35 additions & 1 deletion src/utils/logging.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ Extra events:
- `taskuidtotid::Bool`: Enables reporting of task UID-to-TID mappings
- `tasktochunk::Bool`: Enables reporting of DTask-to-Chunk mappings
- `profile::Bool`: Enables profiling of task execution; not currently recommended, as it adds significant overhead
- `linuxperf::Bool`: Enables Linux perf event collection (requires LinuxPerf.jl to be loaded)
- `linuxperf_metrics::String`: Perf-style metric specification string used when `linuxperf` is enabled; an empty string selects the default metric set
- `gc_stats::Bool`: Enables GC allocation tracking per event
- `lock_contend::Bool`: Enables lock contention counting per event
- `compile_time::Bool`: Enables Julia compile-time tracking per event
"""
function enable_logging!(;metrics::Bool=false,
timeline::Bool=false,
Expand All @@ -30,7 +34,12 @@ function enable_logging!(;metrics::Bool=false,
taskresult::Bool=false,
taskuidtotid::Bool=false,
tasktochunk::Bool=false,
profile::Bool=false)
profile::Bool=false,
linuxperf::Bool=false,
linuxperf_metrics::String="",
gc_stats::Bool=false,
lock_contend::Bool=false,
compile_time::Bool=false)
ml = TimespanLogging.MultiEventLog()
ml[:core] = TimespanLogging.Events.CoreMetrics()
ml[:id] = TimespanLogging.Events.IDMetrics()
Expand Down Expand Up @@ -81,10 +90,35 @@ function enable_logging!(;metrics::Bool=false,
ml[:esat] = TimespanLogging.Events.EventSaturation()
ml[:psat] = Dagger.Events.ProcessorSaturation()
end
if linuxperf
lp = _make_linuxperf_metrics(linuxperf_metrics)
if lp !== nothing
ml[:linuxperf] = lp
end
end
if gc_stats
ml[:gc_stats] = Dagger.Events.GCStats()
end
if lock_contend
ml[:lock_contend] = Dagger.Events.LockContentionMetrics()
end
if compile_time
ml[:compile_time] = Dagger.Events.CompileTimeMetrics()
end
Dagger.Sch.eager_context().log_sink = ml
return
end

# Build a LinuxPerfMetrics collector via the LinuxPerfExt package extension.
# Throws when LinuxPerf.jl has not been loaded (so the extension is absent);
# an empty spec string selects the extension's default metric set.
function _make_linuxperf_metrics(linuxperf_metrics::String)
    ext = Base.get_extension(Dagger, :LinuxPerfExt)
    if ext === nothing
        throw(ErrorException("LinuxPerf.jl is not loaded"))
    end
    return isempty(linuxperf_metrics) ? ext.LinuxPerfMetrics() :
           ext.LinuxPerfMetrics(linuxperf_metrics)
end

"""
disable_logging!()

Expand Down
Loading
Loading