Suppose we have a slow function to produce data and another slow function to process the data, as follows:
# some slow function
function prime(i)
    sleep(2)
    println("processed $i")
    i
end

function slow_process(x)
    sleep(2)
    println("slow processed $x")
end
function each(rng)
    function _iter()
        for i ∈ rng
            @time d = prime(i)
            produce(d)
        end
    end
    return Task(_iter)
end

@time for x ∈ each(1000:1002)
    slow_process(x)
end
Output:
% julia test-task.jl
processed 1000
2.063938 seconds (37.84 k allocations: 1.605 MB)
slow processed 1000
processed 1001
2.003115 seconds (17 allocations: 800 bytes)
slow processed 1001
processed 1002
2.001798 seconds (17 allocations: 800 bytes)
slow processed 1002
12.166475 seconds (88.08 k allocations: 3.640 MB)
Is there some way to fetch and cache the data in a parallel task using @async and feed it to the slow_process function?
Edit: I updated the example to clarify the problem. Ideally, the example should take 2 + 6 = 8 seconds instead of 12 seconds.
Edit 2: This is my attempt at using @sync and @async, but I got the error ERROR (unhandled task failure): no process with id 2 exists
macro swap(x, y)
    quote
        local tmp = $(esc(x))
        $(esc(x)) = $(esc(y))
        $(esc(y)) = tmp
    end
end

# some slow function
function prime(i)
    sleep(2)
    println("processed $i")
    i
end

function slow_process(x)
    sleep(2)
    println("slow processed $x")
end

function each(rng)
    @assert length(rng) > 1
    rng = collect(rng)
    a = b = nothing
    function _iter()
        for i ∈ 1:length(rng)
            if a == nothing
                a = @async remotecall_fetch(prime, 2, rng[i])
                b = @async remotecall_fetch(prime, 2, rng[i+1])
            else
                if i < length(rng)
                    a = @async remotecall_fetch(prime, 2, rng[i+1])
                end
                @swap(a, b)
            end
            @sync d = a
            produce(d)
        end
    end
    return Task(_iter)
end

@time for x ∈ each(1000:1002)
    slow_process(x)
end
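(A note on that error: no process with id 2 exists means the remotecall_fetch calls target worker 2, but no worker processes were started. Julia has to be launched with workers, e.g. julia -p 2, or they have to be added with addprocs(2) first; the -p 2 flag in the working run below supplies them.)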
OK, I have the working solution below:
macro swap(x, y)
    quote
        local tmp = $(esc(x))
        $(esc(x)) = $(esc(y))
        $(esc(y)) = tmp
    end
end

# some slow function
@everywhere function prime(i)
    sleep(2)
    println("prime $i")
    i
end

function slow_process(x)
    sleep(2)
    println("slow_process $x")
end

function each(rng)
    @assert length(rng) > 1
    rng = collect(rng)
    a = b = nothing
    function _iter()
        for i ∈ 1:length(rng)
            if a == nothing
                a = remotecall(prime, 2, rng[i])
                b = remotecall(prime, 2, rng[i+1])
            else
                if i < length(rng)
                    a = remotecall(prime, 2, rng[i+1])
                end
                @swap(a, b)
            end
            d = fetch(a)
            produce(d)
        end
    end
    return Task(_iter)
end

@time for x ∈ each(1000:1002)
    slow_process(x)
end
And the timing:
% julia -p 2 test-task.jl
8.354102 seconds (148.00 k allocations: 6.204 MB)
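For readers on current Julia: produce/consume and Task-based iteration were removed in Julia 0.6, so here is a rough equivalent of the pipeline sketched with a buffered Channel (my adaptation, not part of the original thread; assumes Julia >= 1.3 and the same prime/slow_process as above). Because sleep yields to the scheduler, prime for the next item overlaps with slow_process for the current one, giving roughly 2 + 6 = 8 seconds:

# A Channel with one buffer slot acts as the prefetch cache: the producer
# task computes prime(i+1) while the consumer is still in slow_process(x).
function each(rng)
    return Channel{Int}(1) do ch
        for i in rng
            put!(ch, prime(i))
        end
    end
end

@time for x in each(1000:1002)
    slow_process(x)
end

For genuinely CPU-bound work (rather than sleep), the producer would need its own thread or worker, e.g. Channel{Int}(1; spawn=true) in a multi-threaded session.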
I'm working with Julia 1.8.2 on Advent of Code day 6 and noticed a strange performance difference between while and for loops.
I had written an implementation with a for loop, then realized I did not need to visit every index and could skip ahead in certain cases. But when I rewrote the code with a while loop, it took ~10x as long to run. Both the for and the while loop versions give the correct answer.
Then I added a small, basic while-vs-for test to see whether it was my code or the loops themselves, and the results were even more dramatic: the while test took ~0.5 s while the for test completed almost instantly.
My full code is given below, see AoC day6 for the data.
My question is: why is the while loop so much slower? Does the Julia compiler have a hard time optimizing while loops for some reason?
using BenchmarkTools

function parse_data()
    open(joinpath(dirname(@__FILE__), "data/day6.txt")) do f
        while !eof(f)
            line = readline(f)
            return line
        end
    end
end

function test_for(data)
    tot = 0
    for i = 1:length(data) * 100
        tot += 1
    end
    return tot
end

function test_while(data)
    tot = 0
    i = 1
    while i <= length(data) * 100
        tot += 1
        i += 1
    end
    return tot
end

function solve_problem_for(data)
    marker_length = 14
    for i = 1:length(data)
        repeat = false
        for (j, item) in enumerate(view(data, i:i+marker_length-1))
            repeat = repeat || occursin(item, view(data, i+j:i+marker_length-1))
            if repeat
                break
            end
        end
        if !repeat
            return i + marker_length - 1
        end
    end
end

function solve_problem_while(data)
    marker_length = 14
    i = 1
    while i <= length(data)
        repeat = false
        for (j, item) in enumerate(view(data, i:i+marker_length-1))
            repeat = repeat || occursin(item, view(data, i+j:i+marker_length-1))
            if repeat
                i += j - 1
                break
            end
        end
        if !repeat
            return i + marker_length - 1
        end
        i += 1
    end
end

function main()
    data = parse_data()
    @time sol = solve_problem_while(data)
    @time sol = solve_problem_while(data)
    println(sol)
    @time sol = test_while(data)
    @time sol = test_while(data)
    @time sol = solve_problem_for(data)
    @time sol = solve_problem_for(data)
    println(sol)
    @time sol = test_for(data)
    @time sol = test_for(data)
end

main()
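One hypothesis worth checking here (my suggestion, not a confirmed diagnosis): parse_data can fall through and return nothing, so data is inferred as a small Union type, and while the for loop evaluates length(data) * 100 only once when it constructs the range, the while loop re-evaluates the condition, including length(data), on every iteration. A variant that hoists the bound would isolate that effect; @code_warntype parse_data() would show the inferred return type:

# Sketch: same as test_while, but the bound is computed once, mirroring what
# the for loop's range construction does implicitly.
function test_while_hoisted(data)
    tot = 0
    i = 1
    n = length(data) * 100 # hoisted out of the loop condition
    while i <= n
        tot += 1
        i += 1
    end
    return tot
end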
I want to use shared-memory multi-threading in Julia. As the Threads.@threads macro does, I can use ccall(:jl_threading_run, ...) to do this. And whilst my code now runs in parallel, I don't get the speedup I expected.
The following code is intended as a minimal example of the approach I'm taking and the performance problem I'm having: [EDIT: see below for an even more minimal example]
nthreads = Threads.nthreads()
test_size = 1000000
println("STARTED with ", nthreads, " thread(s) and test size of ", test_size, ".")

# Something to be processed:
objects = rand(test_size)

# Somewhere for our results
results = zeros(nthreads)
counts = zeros(nthreads)

# A function to do some work.
function worker_fn()
    # each thread starts at its own index and strides by nthreads
    work_idx = Threads.threadid()
    my_result = results[Threads.threadid()]
    while work_idx > 0
        my_result += objects[work_idx]
        work_idx += nthreads
        if work_idx > test_size
            break
        end
        counts[Threads.threadid()] += 1
    end
end

# Call our worker function using jl_threading_run
@time ccall(:jl_threading_run, Ref{Cvoid}, (Any,), worker_fn)

# Verify that we made as many calls as we think we did.
println("\nCOUNTS:")
println("\tPer thread:\t", counts)
println("\tSum:\t\t", sum(counts))
On an i7-7700, a typical single-threaded result is:
STARTED with 1 thread(s) and test size of 1000000.
0.134606 seconds (5.00 M allocations: 76.563 MiB, 1.79% gc time)
COUNTS:
Per thread: [999999.0]
Sum: 999999.0
And with 4 threads:
STARTED with 4 thread(s) and test size of 1000000.
0.140378 seconds (1.81 M allocations: 25.661 MiB)
COUNTS:
Per thread: [249999.0, 249999.0, 249999.0, 249999.0]
Sum: 999996.0
Multi-threading slows things down! Why?
EDIT: A better minimal example can be created using the @threads macro itself.
a = zeros(Threads.nthreads())
b = rand(test_size)
calls = zeros(Threads.nthreads())
@time Threads.@threads for i = 1:test_size
    a[Threads.threadid()] += b[i]
    calls[Threads.threadid()] += 1
end
I falsely assumed that the @threads macro's inclusion in Julia would mean that there was a benefit to be had.
The problem you have is most probably false sharing.
You can solve it by spacing out the areas you write to far enough, like this (a "quick and dirty" implementation to show the essence of the change):
julia> function f(spacing)
           test_size = 1000000
           a = zeros(Threads.nthreads() * spacing)
           b = rand(test_size)
           calls = zeros(Threads.nthreads() * spacing)
           Threads.@threads for i = 1:test_size
               @inbounds begin
                   a[Threads.threadid() * spacing] += b[i]
                   calls[Threads.threadid() * spacing] += 1
               end
           end
           a, calls
       end
f (generic function with 1 method)

julia> @btime f(1);
  41.525 ms (35 allocations: 7.63 MiB)

julia> @btime f(8);
  2.189 ms (35 allocations: 7.63 MiB)
or by doing per-thread accumulation in a local variable, like this (the preferred approach, as it should be uniformly faster):
function getrange(n)
    tid = Threads.threadid()
    nt = Threads.nthreads()
    d, r = divrem(n, nt)
    from = (tid - 1) * d + min(r, tid - 1) + 1
    to = from + d - 1 + (tid ≤ r ? 1 : 0)
    from:to
end

function f()
    test_size = 10^8
    a = zeros(Threads.nthreads())
    b = rand(test_size)
    calls = zeros(Threads.nthreads())
    Threads.@threads for k = 1:Threads.nthreads()
        local_a = 0.0
        local_c = 0.0
        for i in getrange(test_size)
            for j in 1:10
                local_a += b[i]
                local_c += 1
            end
        end
        a[Threads.threadid()] = local_a
        calls[Threads.threadid()] = local_c
    end
    a, calls
end
Also note that you are probably using 4 threads on a machine with 2 physical cores (and only 4 virtual cores), so the gains from threading will not be linear.
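A quick way to check the logical vs. physical core count (using Hwloc.jl here is my suggestion, not something from the thread):

println(Sys.CPU_THREADS) # logical cores, hyperthreads included

using Hwloc
println(Hwloc.num_physical_cores()) # physical cores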
I have a function like this:
@everywhere function bellman_operator!(rbc::RBC)
    ...
    @sync @parallel for i = 1:m
        ...
        for j = 1:n
            vmax = -1000.0
            ...
            for l = Next:n
                ...
                if v > vmax
                    vmax = v
                    Next = l
                else
                    break
                end
            end
            f_v[j, i] = vmax
            f_p[j, i] = k
        end
    end
end
f_v and f_p are SharedArrays. I want to give each worker its own array for its results; I saw some examples but couldn't adapt them. How can I use a separate array for each worker's results and combine them at the end, instead of using SharedArrays?
Is this what you want?
Example 1. Combining results using +:
a = @parallel (+) for i in 1:1000
    rand(10, 10)
end
Example 2. Just collecting the results without combining them:
x = Future[]
for i in 1:1000
    push!(x, @spawn rand(10, 10))
end
y = fetch.(x)
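If the collected results should then be combined after the fact, a one-line follow-up (my addition) reproduces Example 1's reduction:

total = reduce(+, y) # sum the 1000 fetched 10x10 matrices, as (+) did in Example 1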
What are the best practices in Julia for computing statistics of an array along a given dimension in parallel? I have many large arrays and am looking for something like mean(array, 1), but parallel (and returning a quantile). I cannot simply process the arrays themselves in parallel because I don't have enough RAM.
I coded up a crude benchmark that also illustrates the approaches I've tried so far: mapslices and @parallel loops over SharedArrays and DArrays (see below). The parallelization does not seem to speed things up much: adding 7 workers and using SharedArrays yields a 1.8x speedup; using DArrays yields a 2.3x speedup. I'm pretty new to Julia. Is this to be expected? Am I doing something wrong?
Thanks for your help. Below is the output of my script followed by the script itself.
Script output:
WARNING: replacing module DistributedArrays
WARNING: replacing module DistributedArrays
WARNING: replacing module DistributedArrays
WARNING: replacing module DistributedArrays
WARNING: replacing module DistributedArrays
WARNING: replacing module DistributedArrays
WARNING: replacing module DistributedArrays
mapslices on Array
38.152894 seconds (218.71 M allocations: 14.435 GB, 3.33% gc time)
37.985577 seconds (218.10 M allocations: 14.406 GB, 3.23% gc time)
loop over Array using CartesianRange
9.161392 seconds (25.27 M allocations: 9.005 GB, 4.41% gc time)
9.118627 seconds (25.17 M allocations: 9.000 GB, 4.40% gc time)
@parallel loop over SharedArray
9.092477 seconds (322.23 k allocations: 14.190 MB, 0.05% gc time)
4.945648 seconds (18.90 k allocations: 1.405 MB)
@parallel loop over DArray
5.615429 seconds (496.26 k allocations: 21.535 MB, 0.08% gc time)
3.932704 seconds (15.63 k allocations: 1.178 MB)
Script:
procs_added = addprocs(CPU_CORES - 1)
@everywhere using DistributedArrays

function benchmark_array(dtype, dims)
    data = rand(dtype, dims...)
    println("mapslices on Array")
    @time out = mapslices(f->quantile(f, 0.2), data, 1)
    @time out = mapslices(f->quantile(f, 0.2), data, 1)
    println("loop over Array using CartesianRange")
    out = Array(Float32, size(data)[2:end])
    @time loop_over_array!(out, data)
    @time loop_over_array!(out, data)
end

function loop_over_array!(out::Array, data::Array)
    for I in CartesianRange(size(out))
        # explicit indexing, since [:, I...] didn't work
        out[I] = quantile(data[:, I[1], I[2], I[3]], 0.2)
    end
end

function benchmark_shared_array(dtype, dims)
    data = SharedArray(dtype, (dims...), pids=workers())
    println("@parallel loop over SharedArray")
    out = SharedArray(Float32, size(data)[2:end], pids=workers())
    @time parallel_loop_over_shared_array!(out, data)
    @time parallel_loop_over_shared_array!(out, data)
end

function parallel_loop_over_shared_array!(out::SharedArray, data::SharedArray)
    # @parallel for I in CartesianRange(size(out)) does not seem to work
    @sync @parallel for i in 1:size(out)[end]
        for I in CartesianRange(size(out)[1:end-1])
            out[I[1], I[2], i] = quantile(data[:, I[1], I[2], i], 0.2)
        end
    end
end

function benchmark_distributed_array(dtype, dims)
    data = drand(dtype, (dims...), workers(),
                 [i == length(dims) ? nworkers() : 1 for i in 1:length(dims)])
    println("@parallel loop over DArray")
    out = dzeros(Float32, size(data)[2:end], workers(),
                 [i == ndims(data) ? nworkers() : 1 for i in 2:ndims(data)])
    @time parallel_loop_over_distributed_array!(out, data)
    @time parallel_loop_over_distributed_array!(out, data)
end

function parallel_loop_over_distributed_array!(out::DArray, data::DArray)
    @sync for pid in workers()
        @spawnat pid begin
            inchunk = localpart(data)
            outchunk = localpart(out)
            for I in CartesianRange(size(outchunk))
                outchunk[I] = quantile(inchunk[:, I[1], I[2], I[3]], 0.2)
            end
        end
    end
end

function benchmark_all(dtype, dims)
    benchmark_array(dtype, dims)
    benchmark_shared_array(dtype, dims)
    benchmark_distributed_array(dtype, dims)
end

const dtype = Int
const dims = [128, 256, 256, 64]

benchmark_all(dtype, dims)
Using DistributedArrays in cases where the worker only needs to store unshared data seems overly complicated. I would like to do
r = remotecall(2, a = Float64[])
remotecall(2, setindex!, a, 5, 10) # Error
or
r = remotecall(2, zeros, 10)
remotecall(2, setindex!, r, 5, 10) # Error.
I would like to do this for each worker and then access the arrays in an async context, perform some computations, and then fetch the results. I am not sure if this is possible because of the let behavior of @async.
Below is a simplified example, for which I modified the pmap example from the docs.
times = linspace(0.1, 2.0, 10) # times in secs representing different difficult computations
sort!(times, rev=true)
np = nprocs()
n = length(times)

# create local variables
for p = 1:np
    if p != myid() || np == 1
        remotecall(p, stack = Float64[]) # does not work
    end
end

@everywhere function fun(s)
    mid = myid()
    sleep(s)
    # s represents some computation; save it to the local stack
    push!(stack, s)
end

# asynchronously do the computations
@everywhere i = 1
function nextidx()
    global i
    idx = i
    i += 1
    return idx
end

@sync begin
    for p = 1:np
        if p != myid() || np == 1
            @async begin
                j = 1
                res = zeros(40)
                while true
                    idx = nextidx()
                    if idx > n
                        break
                    end
                    remotecall(p, fun, times[idx])
                end
            end
        end
    end
end

# collect the results of the computations
for p = 1:np
    if p != myid() || np == 1
        tmpStack = fetch(p, stack)
        # do something with the results
    end
end
By using global when you modify a worker's global variable (e.g., one set by @everywhere a = 3), you may be able to resolve your problem. Check out the example code below.
@everywhere a = 0
remotecall_fetch(2, () -> a) # prints 0

@everywhere function change_a(b)
    global a
    a = b
end

b = 10
remotecall_fetch(2, change_a, b)
remotecall_fetch(2, () -> a) # prints 10
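Applied to the stack from the question, the same pattern looks like this (a sketch in the thread's remotecall_fetch(pid, f, args...) calling convention):

@everywhere stack = Float64[] # every worker gets its own stack

@everywhere function fun(s)
    # note: global would only be required if stack were reassigned;
    # push! merely mutates the worker-local global in place
    sleep(s) # stands in for the real computation
    push!(stack, s)
end

remotecall_fetch(2, fun, 0.5)
remotecall_fetch(2, () -> stack) # returns [0.5] from worker 2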