It seems that the nested-loop example should get the following implementation for Threads:
using Dagger, Random, Distributions, StatsBase, DataFrames
"""
    f(dist, len, reps, σ)

Fill a length-`len` buffer from `dist` with `rand!` `reps` times, take the mean of the
buffer after each fill, and return the largest of those means divided by `σ`.
"""
function f(dist, len, reps, σ)
    # One buffer is allocated up front and overwritten on every repetition.
    buffer = Vector{Float64}(undef, len)
    sample_means = (mean(rand!(dist, buffer)) for _ in 1:reps)
    largest = maximum(sample_means)
    return largest / σ
end
"""
    experiments_threads(dists, lens, K=1000)

Run `f` for every combination of a distribution type in `dists` and a sample length in
`lens`, scheduling the work with `Threads.@spawn`.

For each `T` in `dists` the distribution `T()` is instantiated and `std(dist)` is computed
in its own task. Every `f(dist, L, K, σ)` call is spawned as a further task that fetches
that standard deviation. `K` is passed to `f` as its `reps` argument.

Returns a `DataFrame` with one row per `(T, L)` pair and the columns `T`, `σ`, `L` and `z`,
where `σ` and `z` hold the fetched task results. If `dists` or `lens` is empty, no row is
pushed and the empty `DataFrame` is returned unchanged.
"""
function experiments_threads(dists, lens, K=1000)
    res = DataFrame()
    @sync for T in dists
        dist = T()
        σ = Threads.@spawn std(dist)
        for L in lens
            # `fetch(σ)` is part of the spawned expression, so the wait for `σ` happens
            # inside the new task rather than in this loop.
            z = Threads.@spawn f(dist, L, K, fetch(σ))
            push!(res, (; T, σ, L, z))
        end
    end
    # Without any pushed row the columns `z` and `σ` do not exist and accessing them
    # would throw, so return the empty frame directly.
    isempty(res) && return res
    # Replace the task handles stored in the frame by their results.
    res.z = fetch.(res.z)
    res.σ = fetch.(res.σ)
    return res
end
"""
    experiments_dagger(dists, lens, K=1000)

Run `f` for every combination of a distribution type in `dists` and a sample length in
`lens`, scheduling the work with `Dagger.@spawn`.

For each `T` in `dists` the distribution `T()` is instantiated and `std(dist)` is spawned
as a Dagger task. Every `f(dist, L, K, σ)` call is spawned as a further Dagger task that
receives the handle `σ` as an argument. `K` is passed to `f` as its `reps` argument.

Returns a `DataFrame` with one row per `(T, L)` pair and the columns `T`, `σ`, `L` and `z`,
where `σ` and `z` hold the fetched task results. If `dists` or `lens` is empty, no row is
pushed and the empty `DataFrame` is returned unchanged.
"""
function experiments_dagger(dists, lens, K=1000)
    res = DataFrame()
    @sync for T in dists
        dist = T()
        σ = Dagger.@spawn std(dist)
        for L in lens
            # `σ` is handed over without an explicit `fetch`; Dagger is expected to
            # resolve the task result before `f` runs (see the Dagger task-spawning docs).
            z = Dagger.@spawn f(dist, L, K, σ)
            push!(res, (; T, σ, L, z))
        end
    end
    # Without any pushed row the columns `z` and `σ` do not exist and accessing them
    # would throw, so return the empty frame directly.
    isempty(res) && return res
    # Replace the task handles stored in the frame by their results.
    res.z = fetch.(res.z)
    res.σ = fetch.(res.σ)
    return res
end
# Distribution types to compare; each one is instantiated with its zero-argument
# constructor inside the experiment functions.
dists = [
    Cosine, Epanechnikov, Laplace, Logistic, Normal,
    NormalCanon, PGeneralizedGaussian, SkewNormal,
    SkewedExponentialPower, SymTriangularDist,
]
# Sample lengths; each is passed to `f` as its `len` argument.
lens = [10, 20, 50, 100, 200, 500]
using BenchmarkTools
# `dists` and `lens` are non-constant globals, so they are interpolated with `$` to keep
# global-variable access out of the measurement. The timings in the trailing comments are
# the originally reported figures.
@btime experiments_dagger($dists, $lens) # slightly slower, for 6 Threads, 574.444 ms (9740771 allocations: 271.22 MiB)
@btime experiments_threads($dists, $lens) # slightly faster, for 6 Threads, 543.696 ms (9681150 allocations: 268.68 MiB)
The difference in timing might be pure noise in this case.
However, what is even more confusing: if I add additional processes up front (after a clean restart of Julia)
using Distributed
# Start two additional worker processes, each launched with the `--threads=3` flag.
Distributed.addprocs(2; exeflags=`--threads=3`)
and then run the previous code, `@btime experiments_dagger(dists, lens)` is not twice as fast (even though we added another 6 threads in total), but stays at about the same speed.
It seems that the nested-loop example should get the following implementation for Threads:
The difference in timing might be pure noise in this case.
However, what is even more confusing: if I add additional processes up front (after a clean restart of Julia)
and then run the previous code,
`@btime experiments_dagger(dists, lens)` is not twice as fast (even though we added another 6 threads in total), but stays at about the same speed.