julia, why does memory allocation happen for a loop inside a function? - performance

In the memory allocation report of julia --track-allocation=user the maximum of allocation is in this function:
- function fuzzy_dot_square( v::Array{Int64, 1} )
- dot_prod = zero(Int64)
7063056168 for i::Int64 in 2:28
0 dot_prod += v[i]*(v[i] + v[i-1] + v[i+1] + v[i+28])# / 4 # no "top" pixel
- end
0 for i in 29:(28*27) # compiler should literate 28*27
0 dot_prod += v[i]*(v[i] + v[i-1] + v[i+1] + v[i-28] + v[i+28])# / 5 # all pixels
- end
0 for i in (28*27):(28*28 - 1)
0 dot_prod += v[i]*(v[i] + v[i-1] + v[i+1] + v[i-28])# / 4 # no "bottom" pixel
- end
-
0 return dot_prod
- end
-- it is a "fuzzy dot product" square of a vector, representing pixel image 28 by 28 (the known MNIST dataset of digit images).
Why does the allocation happen there?
As far as I understand, the dot_prod is the only thing to be allocated.
But the report points at the first for..
Also I tried reproducing it in repl with:
v = Array{Int64,1}(1:100)
dot_prod = zero(Int64)
@allocated for i in 2:28
dot_prod += v[i]
end
-- and I get the following error at @allocated for ...:
ERROR: UndefVarError: dot_prod not defined
in macro expansion at ./REPL[3]:2 [inlined]
in (::##1#f#1)() at ./util.jl:256
The @time macro works fine, so is there perhaps a bug in @allocated? I am using Julia 0.5.0.

This is a limitation of --track-allocation=user. There's no type instability and there are no allocations.
julia> function fuzzy_dot_square(v)
dot_prod = zero(eltype(v))
for i in 2:28
dot_prod += v[i]*(v[i] + v[i-1] + v[i+1] + v[i+28])# / 4 # no "top" pixel
end
for i in 29:(28*27) # compiler should literate 28*27
dot_prod += v[i]*(v[i] + v[i-1] + v[i+1] + v[i-28] + v[i+28])# / 5 # all pixels
end
for i in (28*27):(28*28 - 1)
dot_prod += v[i]*(v[i] + v[i-1] + v[i+1] + v[i-28])# / 4 # no "bottom" pixel
end
return dot_prod
end
fuzzy_dot_square (generic function with 1 method)
julia> const xs = [1:28^2;];
julia> @allocated fuzzy_dot_square(xs)
0
See also this passage from the Julia documentation:
In interpreting the results, there are a few important details. Under the user setting, the first line of any function directly called from the REPL will exhibit allocation due to events that happen in the REPL code itself. More significantly, JIT-compilation also adds to allocation counts, because much of Julia’s compiler is written in Julia (and compilation usually requires memory allocation). The recommended procedure is to force compilation by executing all the commands you want to analyze, then call Profile.clear_malloc_data() to reset all allocation counters. Finally, execute the desired commands and quit Julia to trigger the generation of the .mem files.
And for further information, see this Julia issue.

Related

Ruby : unexpected ',', expecting '.' or &. or :: or '['

I'm currently trying to implement a mathematic method to approximate
f(x) = 0. I've already implemented it in some languages and I want to do it in ruby now just for training.
But I get this error, which I really don't understand.
Here is my code
def fonction (x)
return (x ** 3) + 4 * (x ** 2) - 10
end
def derive (x)
return 3 * (x ** 2) + 8 * x
end
def newton(f, fPrime, n, u0)
if n == 0 then
return u0
else
uN = newton (f, fPrime, (n - 1), u0)
return uN - f(uN) / fPrime(uN)
end
end
for i in 0..6
puts (newton (fonction, derive, i, 2))
end
I think the problem is the space after newton in the method call:
uN = newton (f, fPrime, (n - 1), u0) # there is space after newton
also in this one
for i in 0..6
puts (newton (fonction, derive, i, 2)) # there is space after newton
end
Try removing it; I guess you will then see another error. I tried it in a REPL.

Ruby algorithms loops codewars

I got stuck with below task and spent about 3 hours trying to figure it out.
Task description: A man has a rather old car being worth $2000. He saw a secondhand car being worth $8000. He wants to keep his old car until he can buy the secondhand one.
He thinks he can save $1000 each month but the prices of his old car and of the new one decrease of 1.5 percent per month. Furthermore this percent of loss increases by 0.5 percent at the end of every two months. Our man finds it difficult to make all these calculations.
How many months will it take him to save up enough money to buy the car he wants, and how much money will he have left over?
My code so far:
def nbMonths(startPriceOld, startPriceNew, savingperMonth, percentLossByMonth)
dep_value_old = startPriceOld
mth_count = 0
total_savings = 0
dep_value_new = startPriceNew
mth_count_new = 0
while startPriceOld != startPriceNew do
if startPriceOld >= startPriceNew
return mth_count = 0, startPriceOld - startPriceNew
end
dep_value_new = dep_value_new - (dep_value_new * percentLossByMonth / 100)
mth_count_new += 1
if mth_count_new % 2 == 0
dep_value_new = dep_value_new - (dep_value_new * 0.5) / 100
end
dep_value_old = dep_value_old - (dep_value_old * percentLossByMonth / 100)
mth_count += 1
total_savings += savingperMonth
if mth_count % 2 == 0
dep_value_old = dep_value_old - (dep_value_old * 0.5) / 100
end
affordability = total_savings + dep_value_old
if affordability >= dep_value_new
return mth_count, affordability - dep_value_new
end
end
end
print nbMonths(2000, 8000, 1000, 1.5) # Expected result[6, 766])
The data are as follows.
op = 2000.0 # current old car value
np = 8000.0 # current new car price
sv = 1000.0 # monthly savings
dr = 0.015 # monthly depreciation, both cars (1.5%)
cr = 0.005 # additional depreciation every two months, both cars (0.5%)
After n >= 0 months the man's (let's call him "Rufus") savings plus the value of his car equal
sv*n + op*(1 - n*dr - (cr + 2*cr + 3*cr +...+ (n/2)*cr))
where n/2 is integer division. As
cr + 2*cr + 3*cr +...+ (n/2)*cr = cr*((1+2+..+n)/2) = cr*(1+n/2)*(n/2)
the expression becomes
sv*n + op*(1 - n*dr - cr*(1+(n/2))*(n/2))
Similarly, after n months the cost of the car he wants to purchase will fall to
np * (1 - n*dr - cr*(1+(n/2))*(n/2))
If we set these two expressions equal we obtain the following.
sv*n + op - op*dr*n - op*cr*(n/2) - op*cr*(n/2)**2 =
np - np*dr*n - np*cr*(n/2) - np*cr*(n/2)**2
which reduces to
cr*(np-op)*(n/2)**2 + (sv + dr*(np-op))*n + cr*(np-op)*(n/2) - (np-op) = 0
or
cr*(n/2)**2 + (sv/(np-op) + dr)*n + cr*(n/2) - 1 = 0
If we momentarily treat (n/2) as a float division, this expression reduces to a quadratic.
(cr/4)*n**2 + (sv/(np-op) + dr + cr/2)*n - 1 = 0
= a*n**2 + b*n + c = 0
where
a = cr/4 = 0.005/4 = 0.00125
b = sv/(np-op) + dr + cr/2 = 1000.0/(8000-2000) + 0.015 + 0.005/2 = 0.18417
c = -1
Incidentally, Rufus doesn't have a computer, but he does have an HP 12c calculator his grandfather gave him when he was a kid, which is perfectly adequate for these simple calculations.
The roots are computed as follows.
(-b + Math.sqrt(b**2 - 4*a*c))/(2*a) #=> 5.24
(-b - Math.sqrt(b**2 - 4*a*c))/(2*a) #=> -152.58
It appears that Rufus can purchase the new vehicle (if it's still for sale) in six months. Had we been able to solve the above equation for n/2 using integer division it might have turned out that Rufus would have had to wait longer. That's because for a given n both cars would have depreciated less (or at least not more), and because the car to be purchased is more expensive than the current car, the difference in values would be greater than that obtained with the float approximation for n/2. We need to check that, however. After n months, Rufus' savings and the value of his beater will equal
sv*n + op*(1 - dr*n - cr*(1+(n/2))*(n/2))
= 1000*n + 2000*(1 - 0.015*n - 0.005*(1+(n/2))*(n/2))
For n = 6 this equals
1000*6 + 2000*(1 - 0.015*6 - 0.005*(1+(6/2))*(6/2))
= 1000*6 + 2000*(1 - 0.015*6 - 0.005*(1+3)*3)
= 1000*6 + 2000*0.85
= 7700
The cost of Rufus' dream car after n months will be
np * (1 - dr*n - cr*(1+(n/2))*(n/2))
= 8000 * (1 - 0.015*n - 0.005*(1+(n/2))*(n/2))
For n=6 this becomes
8000 * (1 - 0.015*6 - 0.005*(1+(6/2))*(6/2))
= 8000*0.85
= 6800
(Notice that the factor 0.85 is the same in both calculations.)
Yes, Rufus will be able to buy the car in 6 months.
# Simulates the purchase month by month and returns [months, leftover].
#
# old     - current value of the car being traded in
# new     - current price of the car to buy
# savings - amount put aside each month
# percent - monthly depreciation in percent, applied to both cars; it grows
#           by 0.5 percentage points after every odd-numbered month, so the
#           higher rate first applies in months 2, 4, 6, ...
#
# Returns a two-element array: the number of months waited and the money left
# over after the purchase, rounded to the nearest integer.
def nbMonths(old, new, savings, percent)
  rate = percent.fdiv(100)
  old_value = old
  new_value = new
  banked = 0
  elapsed = 0

  # Keep saving until savings plus trade-in value cover the asking price.
  until banked + old_value >= new_value
    banked += savings
    old_value -= old_value * rate
    new_value -= new_value * rate
    elapsed += 1
    rate += 0.005 if elapsed.odd?
  end

  [elapsed, (banked + old_value - new_value).round]
end

Julia: why doesn't shared memory multi-threading give me a speedup?

I want to use shared memory multi-threading in Julia. As done by the Threads.@threads macro, I can use ccall(:jl_threading_run ...) to do this. And whilst my code now runs in parallel, I don't get the speedup I expected.
The following code is intended as a minimal example of the approach I'm taking and the performance problem I'm having: [EDIT: See later for even more minimal example]
nthreads = Threads.nthreads()
test_size = 1000000
println("STARTED with ", nthreads, " thread(s) and test size of ", test_size, ".")
# Something to be processed:
objects = rand(test_size)
# Somewhere for our results
results = zeros(nthreads)
counts = zeros(nthreads)
# A function to do some work.
# Worker passed to jl_threading_run by the ccall later in this script.
# Reads the globals `objects`, `results`, `counts`, `nthreads` and `test_size`,
# and increments this thread's slot of `counts` once per processed element.
function worker_fn()
# NOTE(review): every invocation starts at index 1 and steps by `nthreads`,
# so all threads walk the same indices rather than disjoint ones - confirm
# whether `Threads.threadid()` was meant as the starting index.
work_idx = 1
# Local copy of this thread's slot; nothing below writes it back to `results`.
my_result = results[Threads.threadid()]
# work_idx only grows from 1, so the loop is left through the `break` below.
while work_idx > 0
my_result += objects[work_idx]
work_idx += nthreads
if work_idx > test_size
break
end
counts[Threads.threadid()] += 1
end
end
# Call our worker function using jl_threading_run
@time ccall(:jl_threading_run, Ref{Cvoid}, (Any,), worker_fn)
# Verify that we made as many calls as we think we did.
println("\nCOUNTS:")
println("\tPer thread:\t", counts)
println("\tSum:\t\t", sum(counts))
On an i7-7700, a typical single threaded result is:
STARTED with 1 thread(s) and test size of 1000000.
0.134606 seconds (5.00 M allocations: 76.563 MiB, 1.79% gc time)
COUNTS:
Per thread: [999999.0]
Sum: 999999.0
And with 4 threads:
STARTED with 4 thread(s) and test size of 1000000.
0.140378 seconds (1.81 M allocations: 25.661 MiB)
COUNTS:
Per thread: [249999.0, 249999.0, 249999.0, 249999.0]
Sum: 999996.0
Multi-threading slows things down! Why?
EDIT: A better minimal example can be created using the @threads macro itself.
a = zeros(Threads.nthreads())
b = rand(test_size)
calls = zeros(Threads.nthreads())
@time Threads.@threads for i = 1 : test_size
a[Threads.threadid()] += b[i]
calls[Threads.threadid()] += 1
end
I falsely assumed that the @threads macro's inclusion in Julia would mean that there was a benefit to be had.
The problem you have is most probably false sharing.
You can solve it by separating the areas you write to far enough like this (here is a "quick and dirty" implementation to show the essence of the change):
julia> function f(spacing)
test_size = 1000000
a = zeros(Threads.nthreads()*spacing)
b = rand(test_size)
calls = zeros(Threads.nthreads()*spacing)
Threads.@threads for i = 1 : test_size
@inbounds begin
a[Threads.threadid()*spacing] += b[i]
calls[Threads.threadid()*spacing] += 1
end
end
a, calls
end
f (generic function with 1 method)
julia> @btime f(1);
41.525 ms (35 allocations: 7.63 MiB)
julia> @btime f(8);
2.189 ms (35 allocations: 7.63 MiB)
or doing per-thread accumulation on a local variable like this (this is a preferred approach as it should be uniformly faster):
# Return the contiguous slice of 1:n assigned to the calling thread.
#
# 1:n is split into Threads.nthreads() consecutive chunks whose lengths differ
# by at most one; the first `n % nthreads` chunks get the extra element. The
# chunk is selected by Threads.threadid().
function getrange(n)
    me = Threads.threadid()
    chunk, extra = divrem(n, Threads.nthreads())
    earlier = me - 1                       # number of chunks before this one
    start = earlier * chunk + min(extra, earlier) + 1
    len = me <= extra ? chunk + 1 : chunk
    return start:(start + len - 1)
end
# Per-thread accumulation benchmark: each thread sums its own slice of `b`
# (ten passes per element) into local variables and stores the totals once at
# the end, so the shared arrays `a` and `calls` are written only once per thread.
#
# Returns the tuple `(a, calls)` of per-thread sums and per-thread counts.
function f()
    test_size = 10^8
    a = zeros(Threads.nthreads())
    b = rand(test_size)
    calls = zeros(Threads.nthreads())
    # One loop iteration per thread.
    # NOTE(review): `getrange` and the two stores below key off
    # Threads.threadid() rather than `k`, so this relies on every iteration
    # running on a distinct thread for its whole duration - confirm this holds
    # for the Julia version / scheduling policy in use.
    Threads.@threads for k = 1 : Threads.nthreads()
        local_a = 0.0
        local_c = 0.0
        for i in getrange(test_size)
            for j in 1:10
                local_a += b[i]
                local_c += 1
            end
        end
        a[Threads.threadid()] = local_a
        calls[Threads.threadid()] = local_c
    end
    a, calls
end
Also note that you are probably using 4 threads on a machine with 2 physical cores (and only 4 virtual cores) so the gains from threading will not be linear.

Loop optimisation

I am trying to understand what cache or other optimizations could be done in the source code to get this loop faster. I think it is quite cache friendly but, are there any experts out there that could squeeze a bit more performance tuning this code?
DO K = 1, NZ
DO J = 1, NY
DO I = 1, NX
SIDEBACK = STEN(I-1,J-1,K-1) + STEN(I-1,J,K-1) + STEN(I-1,J+1,K-1) + &
STEN(I ,J-1,K-1) + STEN(I ,J,K-1) + STEN(I ,J+1,K-1) + &
STEN(I+1,J-1,K-1) + STEN(I+1,J,K-1) + STEN(I+1,J+1,K-1)
SIDEOWN = STEN(I-1,J-1,K) + STEN(I-1,J,K) + STEN(I-1,J+1,K) + &
STEN(I ,J-1,K) + STEN(I ,J,K) + STEN(I ,J+1,K) + &
STEN(I+1,J-1,K) + STEN(I+1,J,K) + STEN(I+1,J+1,K)
SIDEFRONT = STEN(I-1,J-1,K+1) + STEN(I-1,J,K+1) + STEN(I-1,J+1,K+1) + &
STEN(I ,J-1,K+1) + STEN(I ,J,K+1) + STEN(I ,J+1,K+1) + &
STEN(I+1,J-1,K+1) + STEN(I+1,J,K+1) + STEN(I+1,J+1,K+1)
RES(I,J,K) = ( SIDEBACK + SIDEOWN + SIDEFRONT ) / 27.0
END DO
END DO
END DO
Ok, I think I've tried everything I reasonably could, and my conclusion unfortunately is that there is not too much room for optimizations, unless you are willing to go into parallelization. Let's see why, let's see what you can and can't do.
Compiler optimizations
Compilers nowadays are extremely good at optimizing code, much much more than humans are. Relying on the optimizations done by the compilers also have the added benefit that they don't ruin the readability of your source code. Whatever you do, (when optimizing for speed) always try it with every reasonable combination of compiler flags. You can even go as far as to try multiple compilers. Personally I only used gfortran (included in GCC) (OS is 64-bit Windows), which I trust to have efficient and correct optimization techniques.
-O2 almost always improves the speed drastically, but even -O3 is a safe bet (among others, it includes delicious loop unrolling). For this problem, I also tried -ffast-math and -fexpensive-optimizations, they didn't have any measurable effect, but -march=corei7 (CPU architecture-specific tuning, specific to Core i7) had, so I did the measurements with -O3 -march=corei7.
So how fast it actually is?
I wrote the following code to test your solution and compiled it with -O3 -march=corei7. Usually it ran in 0.78-0.82 seconds.
program benchmark
implicit none
real :: start, finish
integer :: I, J, K
real :: SIDEBACK, SIDEOWN, SIDEFRONT
integer, parameter :: NX = 600
integer, parameter :: NY = 600
integer, parameter :: NZ = 600
real, dimension (0 : NX + 2, 0 : NY + 2, 0 : NZ + 2) :: STEN
real, dimension (0 : NX + 2, 0 : NY + 2, 0 : NZ + 2) :: RES
call random_number(STEN)
call cpu_time(start)
DO K = 1, NZ
DO J = 1, NY
DO I = 1, NX
SIDEBACK = STEN(I-1,J-1,K-1) + STEN(I-1,J,K-1) + STEN(I-1,J+1,K-1) + &
STEN(I ,J-1,K-1) + STEN(I ,J,K-1) + STEN(I ,J+1,K-1) + &
STEN(I+1,J-1,K-1) + STEN(I+1,J,K-1) + STEN(I+1,J+1,K-1)
SIDEOWN = STEN(I-1,J-1,K) + STEN(I-1,J,K) + STEN(I-1,J+1,K) + &
STEN(I ,J-1,K) + STEN(I ,J,K) + STEN(I ,J+1,K) + &
STEN(I+1,J-1,K) + STEN(I+1,J,K) + STEN(I+1,J+1,K)
SIDEFRONT = STEN(I-1,J-1,K+1) + STEN(I-1,J,K+1) + STEN(I-1,J+1,K+1) + &
STEN(I ,J-1,K+1) + STEN(I ,J,K+1) + STEN(I ,J+1,K+1) + &
STEN(I+1,J-1,K+1) + STEN(I+1,J,K+1) + STEN(I+1,J+1,K+1)
RES(I,J,K) = ( SIDEBACK + SIDEOWN + SIDEFRONT ) / 27.0
END DO
END DO
END DO
call cpu_time(finish)
!Use the calculated value, so the compiler doesn't optimize away everything.
!Print the original value as well, because one can never be too paranoid.
print *, STEN(1,1,1), RES(1,1,1)
print '(f6.3," seconds.")',finish-start
end program
Ok, so this is as far as the compiler can take us. What's next?
Store intermediate results?
As you might suspect from the question mark, this one didn't really work. Sorry. But let's not rush that forward.
As mentioned in the comments, your current code calculates every partial sum multiple times, meaning one iteration's STEN(I+1,J-1,K-1) + STEN(I+1,J,K-1) + STEN(I+1,J+1,K-1) will be the next iteration's STEN(I,J-1,K-1) + STEN(I,J,K-1) + STEN(I,J+1,K-1), so no need to fetch and calculate again, you can store those partial results.
The problem is, that we cannot store too many partial results. As you said, your code is already quite cache-friendly, every partial sum you store means one less array element you can store in L1 cache. We could store a few values, from the last few iterations of I (values for index I-2, I-3, etc.), but the compiler almost certainly does that already. I have 2 proofs for this suspicion. First, my manual loop unrolling made the program slower, by about 5%
DO K = 1, NZ
DO J = 1, NY
DO I = 1, NX, 8
SIDEBACK(0) = STEN(I-1,J-1,K-1) + STEN(I-1,J,K-1) + STEN(I-1,J+1,K-1)
SIDEBACK(1) = STEN(I ,J-1,K-1) + STEN(I ,J,K-1) + STEN(I ,J+1,K-1)
SIDEBACK(2) = STEN(I+1,J-1,K-1) + STEN(I+1,J,K-1) + STEN(I+1,J+1,K-1)
SIDEBACK(3) = STEN(I+2,J-1,K-1) + STEN(I+2,J,K-1) + STEN(I+2,J+1,K-1)
SIDEBACK(4) = STEN(I+3,J-1,K-1) + STEN(I+3,J,K-1) + STEN(I+3,J+1,K-1)
SIDEBACK(5) = STEN(I+4,J-1,K-1) + STEN(I+4,J,K-1) + STEN(I+4,J+1,K-1)
SIDEBACK(6) = STEN(I+5,J-1,K-1) + STEN(I+5,J,K-1) + STEN(I+5,J+1,K-1)
SIDEBACK(7) = STEN(I+6,J-1,K-1) + STEN(I+6,J,K-1) + STEN(I+6,J+1,K-1)
SIDEBACK(8) = STEN(I+7,J-1,K-1) + STEN(I+7,J,K-1) + STEN(I+7,J+1,K-1)
SIDEBACK(9) = STEN(I+8,J-1,K-1) + STEN(I+8,J,K-1) + STEN(I+8,J+1,K-1)
SIDEOWN(0) = STEN(I-1,J-1,K) + STEN(I-1,J,K) + STEN(I-1,J+1,K)
SIDEOWN(1) = STEN(I ,J-1,K) + STEN(I ,J,K) + STEN(I ,J+1,K)
SIDEOWN(2) = STEN(I+1,J-1,K) + STEN(I+1,J,K) + STEN(I+1,J+1,K)
SIDEOWN(3) = STEN(I+2,J-1,K) + STEN(I+2,J,K) + STEN(I+2,J+1,K)
SIDEOWN(4) = STEN(I+3,J-1,K) + STEN(I+3,J,K) + STEN(I+3,J+1,K)
SIDEOWN(5) = STEN(I+4,J-1,K) + STEN(I+4,J,K) + STEN(I+4,J+1,K)
SIDEOWN(6) = STEN(I+5,J-1,K) + STEN(I+5,J,K) + STEN(I+5,J+1,K)
SIDEOWN(7) = STEN(I+6,J-1,K) + STEN(I+6,J,K) + STEN(I+6,J+1,K)
SIDEOWN(8) = STEN(I+7,J-1,K) + STEN(I+7,J,K) + STEN(I+7,J+1,K)
SIDEOWN(9) = STEN(I+8,J-1,K) + STEN(I+8,J,K) + STEN(I+8,J+1,K)
SIDEFRONT(0) = STEN(I-1,J-1,K+1) + STEN(I-1,J,K+1) + STEN(I-1,J+1,K+1)
SIDEFRONT(1) = STEN(I ,J-1,K+1) + STEN(I ,J,K+1) + STEN(I ,J+1,K+1)
SIDEFRONT(2) = STEN(I+1,J-1,K+1) + STEN(I+1,J,K+1) + STEN(I+1,J+1,K+1)
SIDEFRONT(3) = STEN(I+2,J-1,K+1) + STEN(I+2,J,K+1) + STEN(I+2,J+1,K+1)
SIDEFRONT(4) = STEN(I+3,J-1,K+1) + STEN(I+3,J,K+1) + STEN(I+3,J+1,K+1)
SIDEFRONT(5) = STEN(I+4,J-1,K+1) + STEN(I+4,J,K+1) + STEN(I+4,J+1,K+1)
SIDEFRONT(6) = STEN(I+5,J-1,K+1) + STEN(I+5,J,K+1) + STEN(I+5,J+1,K+1)
SIDEFRONT(7) = STEN(I+6,J-1,K+1) + STEN(I+6,J,K+1) + STEN(I+6,J+1,K+1)
SIDEFRONT(8) = STEN(I+7,J-1,K+1) + STEN(I+7,J,K+1) + STEN(I+7,J+1,K+1)
SIDEFRONT(9) = STEN(I+8,J-1,K+1) + STEN(I+8,J,K+1) + STEN(I+8,J+1,K+1)
RES(I ,J,K) = ( SIDEBACK(0) + SIDEOWN(0) + SIDEFRONT(0) + &
SIDEBACK(1) + SIDEOWN(1) + SIDEFRONT(1) + &
SIDEBACK(2) + SIDEOWN(2) + SIDEFRONT(2) ) / 27.0
RES(I + 1,J,K) = ( SIDEBACK(1) + SIDEOWN(1) + SIDEFRONT(1) + &
SIDEBACK(2) + SIDEOWN(2) + SIDEFRONT(2) + &
SIDEBACK(3) + SIDEOWN(3) + SIDEFRONT(3) ) / 27.0
RES(I + 2,J,K) = ( SIDEBACK(2) + SIDEOWN(2) + SIDEFRONT(2) + &
SIDEBACK(3) + SIDEOWN(3) + SIDEFRONT(3) + &
SIDEBACK(4) + SIDEOWN(4) + SIDEFRONT(4) ) / 27.0
RES(I + 3,J,K) = ( SIDEBACK(3) + SIDEOWN(3) + SIDEFRONT(3) + &
SIDEBACK(4) + SIDEOWN(4) + SIDEFRONT(4) + &
SIDEBACK(5) + SIDEOWN(5) + SIDEFRONT(5) ) / 27.0
RES(I + 4,J,K) = ( SIDEBACK(4) + SIDEOWN(4) + SIDEFRONT(4) + &
SIDEBACK(5) + SIDEOWN(5) + SIDEFRONT(5) + &
SIDEBACK(6) + SIDEOWN(6) + SIDEFRONT(6) ) / 27.0
RES(I + 5,J,K) = ( SIDEBACK(5) + SIDEOWN(5) + SIDEFRONT(5) + &
SIDEBACK(6) + SIDEOWN(6) + SIDEFRONT(6) + &
SIDEBACK(7) + SIDEOWN(7) + SIDEFRONT(7) ) / 27.0
RES(I + 6,J,K) = ( SIDEBACK(6) + SIDEOWN(6) + SIDEFRONT(6) + &
SIDEBACK(7) + SIDEOWN(7) + SIDEFRONT(7) + &
SIDEBACK(8) + SIDEOWN(8) + SIDEFRONT(8) ) / 27.0
RES(I + 7,J,K) = ( SIDEBACK(7) + SIDEOWN(7) + SIDEFRONT(7) + &
SIDEBACK(8) + SIDEOWN(8) + SIDEFRONT(8) + &
SIDEBACK(9) + SIDEOWN(9) + SIDEFRONT(9) ) / 27.0
END DO
END DO
END DO
And what's worse, it's easy to show we are already pretty close to the theoretical minimum possible execution time. In order to calculate all these averages, the absolute minimum we need to do is access every element at least once and divide by 27.0. So you can never get faster than the following code, which executes in 0.48-0.5 seconds on my machine.
program benchmark
implicit none
real :: start, finish
integer :: I, J, K
integer, parameter :: NX = 600
integer, parameter :: NY = 600
integer, parameter :: NZ = 600
real, dimension (0 : NX + 2, 0 : NY + 2, 0 : NZ + 2) :: STEN
real, dimension (0 : NX + 2, 0 : NY + 2, 0 : NZ + 2) :: RES
call random_number(STEN)
call cpu_time(start)
DO K = 1, NZ
DO J = 1, NY
DO I = 1, NX
!This of course does not do what you want to do,
!this is just an example of a speed limit we can never surpass.
RES(I, J, K) = STEN(I, J, K) / 27.0
END DO
END DO
END DO
call cpu_time(finish)
!Use the calculated value, so the compiler doesn't optimize away everything.
print *, STEN(1,1,1), RES(1,1,1)
print '(f6.3," seconds.")',finish-start
end program
But hey, even a negative result is a result. If just accessing every element once (and dividing by 27.0) takes up more than half of the execution time, that just means memory access is the bottleneck. Then maybe you can optimize that.
Less data
If you don't need the full precision of 64-bit doubles, you can declare your array with a type of real(kind=4). But maybe your reals are already 4 bytes. In that case, I believe some Fortran implementations support non-standard 16-bit doubles, or depending on your data you can just use integers (maybe floats multiplied by a number then rounded to integer). The smaller your base type is, the more elements you can fit into the cache. The most ideal would be integer(kind=1), of course, it caused more than a 2x speed up on my machine, compared to real(kind=4). But it depends on the precision you need.
Better locality
Column major arrays are slow when you need data from neighbouring column, and row major ones are slow for neighbouring rows.
Fortunately there is a funky way to store data, called a Z-order curve, which does have applications similar to your use case in computer graphics.
I can't promise it will help, maybe it will be terribly counterproductive, but maybe not. Sorry, I didn't feel like implementing it myself, to be honest.
Parallelization
Speaking of computer graphics, this problem is trivially and extremely well parallelizable, maybe even on a GPU, but if you don't want to go that far, you can just use a normal multicore CPU. The Fortran Wiki seems like a good place to search for Fortran parallelization libraries.

Filling a matrix using parallel processing in Julia

I'm trying to speed up the solution time for a dynamic programming problem in Julia (v. 0.5.0), via parallel processing. The problem involves choosing the optimal values for every element of a 1073 x 19 matrix at every iteration, until successive matrix differences fall within a tolerance. I thought that, within each iteration, filling in the values for each element of the matrix could be parallelized. However, I'm seeing a huge performance degradation using SharedArray, and I'm wondering if there's a better way to approach parallel processing for this problem.
I construct the arguments for the function below:
est_params = [.788,.288,.0034,.1519,.1615,.0041,.0077,.2,0.005,.7196]
r = 0.015
tau = 0.35
rho =est_params[1]
sigma =est_params[2]
delta = 0.15
gamma =est_params[3]
a_capital =est_params[4]
lambda1 =est_params[5]
lambda2 =est_params[6]
s =est_params[7]
theta =est_params[8]
mu =est_params[9]
p_bar_k_ss =est_params[10]
beta = (1+r)^(-1)
sigma_range = 4
gz = 19
gp = 29
gk = 37
lnz=collect(linspace(-sigma_range*sigma,sigma_range*sigma,gz))
z=exp(lnz)
gk_m = fld(gk,2)
# Need to add mu somewhere to k_ss
k_ss = (theta*(1-tau)/(r+delta))^(1/(1-theta))
k=cat(1,map(i->k_ss*((1-delta)^i),collect(1:gk_m)),map(i->k_ss/((1-delta)^i),collect(1:gk_m)))
insert!(k,gk_m+1,k_ss)
sort!(k)
p_bar=p_bar_k_ss*k_ss
p = collect(linspace(-p_bar/2,p_bar,gp))
#Tauchen
N = length(z)
Z = zeros(N,1)
Zprob = zeros(Float32,N,N)
Z[N] = lnz[length(z)]
Z[1] = lnz[1]
zstep = (Z[N] - Z[1]) / (N - 1)
for i=2:(N-1)
Z[i] = Z[1] + zstep * (i - 1)
end
for a = 1 : N
for b = 1 : N
if b == 1
Zprob[a,b] = 0.5*erfc(-((Z[1] - mu - rho * Z[a] + zstep / 2) / sigma)/sqrt(2))
elseif b == N
Zprob[a,b] = 1 - 0.5*erfc(-((Z[N] - mu - rho * Z[a] - zstep / 2) / sigma)/sqrt(2))
else
Zprob[a,b] = 0.5*erfc(-((Z[b] - mu - rho * Z[a] + zstep / 2) / sigma)/sqrt(2)) -
0.5*erfc(-((Z[b] - mu - rho * Z[a] - zstep / 2) / sigma)/sqrt(2))
end
end
end
# Collecting tauchen results in a 2 element array of linspace and array; [2] gets array
# Zprob=collect(tauchen(gz, rho, sigma, mu, sigma_range))[2]
Zcumprob=zeros(Float32,gz,gz)
# 2 in cumsum! denotes the 2nd dimension, i.e. columns
cumsum!(Zcumprob, Zprob,2)
gm = gk * gp
control=zeros(gm,2)
for i=1:gk
control[(1+gp*(i-1)):(gp*i),1]=fill(k[i],(gp,1))
control[(1+gp*(i-1)):(gp*i),2]=p
end
endog=copy(control)
E=Array(Float32,gm,gm,gz)
for h=1:gm
for m=1:gm
for j=1:gz
# set the nonzero net debt indicator
if endog[h,2]<0
p_ind=1
else
p_ind=0
end
# set the investment indicator
if (control[m,1]-(1-delta)*endog[h,1])!=0
i_ind=1
else
i_ind=0
end
E[m,h,j] = (1-tau)*z[j]*(endog[h,1]^theta) + control[m,2]-endog[h,2]*(1+r*(1-tau)) +
delta*endog[h,1]*tau-(control[m,1]-(1-delta)*endog[h,1]) -
(i_ind*gamma*endog[h,1]+endog[h,1]*(a_capital/2)*(((control[m,1]-(1-delta)*endog[h,1])/endog[h,1])^2)) +
s*endog[h,2]*p_ind
elem = E[m,h,j]
if E[m,h,j]<0
E[m,h,j]=elem+lambda1*elem-.5*lambda2*elem^2
else
E[m,h,j]=elem
end
end
end
end
I then constructed the function with serial processing. The two for loops iterate through each element to find the largest value in a 1073-sized (=the gm scalar argument in the function) array:
function dynam_serial(E,gm,gz,beta,Zprob)
v = Array(Float32,gm,gz )
fill!(v,E[cld(gm,2),cld(gm,2),cld(gz,2)])
Tv = Array(Float32,gm,gz)
# Set parameters for the loop
convcrit = 0.0001 # chosen convergence criterion
diff = 1 # arbitrary initial value greater than convcrit
while diff>convcrit
exp_v=v*Zprob'
for h=1:gm
for j=1:gz
Tv[h,j]=findmax(E[:,h,j] + beta*exp_v[:,j])[1]
end
end
diff = maxabs(Tv - v)
v=copy(Tv)
end
end
Timing this, I get:
@time dynam_serial(E,gm,gz,beta,Zprob)
> 106.880008 seconds (91.70 M allocations: 203.233 GB, 15.22% gc time)
Now, I try using Shared Arrays to benefit from parallel processing. Note that I reconfigured the iteration so that I only have one for loop, rather than two. I also use v=deepcopy(Tv); otherwise, v is copied as an Array object, rather than a SharedArray:
# Parallel variant of the value-function iteration (Julia 0.5 API): iterates
# until the largest absolute change between successive `gm` x `gz` value
# matrices drops to `convcrit` or below, filling each new matrix `Tv` through a
# single flattened loop over all (h, j) cells.
function dynam_parallel(E,gm,gz,beta,Zprob)
v = SharedArray(Float32,(gm,gz),init = S -> S[Base.localindexes(S)] = myid() )
fill!(v,E[cld(gm,2),cld(gm,2),cld(gz,2)])
# Set parameters for the loop
convcrit = 0.0001 # chosen convergence criterion
diff = 1 # arbitrary initial value greater than convcrit
while diff>convcrit
exp_v=v*Zprob'
Tv = SharedArray(Float32,gm,gz,init = S -> S[Base.localindexes(S)] = myid() )
# NOTE(review): the next line reads like `@sync @parallel for hj=1:(gm*gz)`
# with the `@` sigils lost; as written it is a comment, so `hj` is never
# defined and the `end` closing the loop is unbalanced.
#sync #parallel for hj=1:(gm*gz)
# Recover the (h, j) cell from the flattened index hj.
j=cld(hj,gm)
h=mod(hj,gm)
if h==0;h=gm;end;
# NOTE(review): likewise presumably `@async Tv[h,j]=...`; as written the
# assignment into Tv is commented out.
#async Tv[h,j]=findmax(E[:,h,j] + beta*exp_v[:,j])[1]
end
diff = maxabs(Tv - v)
v=deepcopy(Tv)
end
end
Timing the parallel version; and using a 4-core 2.5 GHz I7 processor with 16GB of memory, I get:
addprocs(3)
@time dynam_parallel(E,gm,gz,beta,Zprob)
> 164.237208 seconds (2.64 M allocations: 201.812 MB, 0.04% gc time)
Am I doing something incorrect here? Or is there a better way to approach parallel processing in Julia for this particular problem? I've considered using Distributed Arrays, but it's difficult for me to see how to apply them to the present problem.
UPDATE:
Per @DanGetz and his helpful comments, I turned instead to trying to speed up the serial processing version. I was able to get performance down to 53.469780 seconds (67.36 M allocations: 103.419 GiB, 19.12% gc time) through:
1) Upgrading to 0.6.0 (saved about 25 seconds), which includes the helpful @views macro.
2) Preallocating the main array I'm trying to fill in (Tv), per the section on Preallocating Outputs in the Julia Performance Tips: https://docs.julialang.org/en/latest/manual/performance-tips/. (saved another 25 or so seconds)
The biggest remaining slow-down seems to be coming from the add_vecs function, which sums together subarrays of two larger matrices. I've tried devectorizing and using BLAS functions, but haven't been able to produce better performance.
In any event, the improved code for dynam_serial is below:
# Return the vector E[:,h,j] + beta*exp_v[:,j].
#
# `@views` makes the two slices views instead of copies.
# NOTE(review): the assignment rebinds the local name `r` to the freshly
# allocated sum; the buffer passed in as `r` is not written to.
function add_vecs(r::Array{Float32},h::Int,j::Int,E::Array{Float32},exp_v::Array{Float32},beta::Float32)
    @views r=E[:,h,j] + beta*exp_v[:,j]
    return r
end
# Serial value-function iteration (Julia 0.6 array constructors).
#
# Starting from a constant matrix, repeatedly sets
#     Tv[h,j] = maximum over i of E[i,h,j] + beta*exp_v[i,j]
# with exp_v = v*Zprob', until the largest absolute change between successive
# matrices is no greater than `convcrit`. Returns the final `gm` x `gz` matrix.
function dynam_serial(E::Array{Float32},gm::Int,gz::Int,beta::Float32,Zprob::Array{Float32})
    v = Array{Float32}(gm,gz)
    fill!(v,E[cld(gm,2),cld(gm,2),cld(gz,2)])
    Tv = Array{Float32}(gm,gz)
    r = Array{Float32}(gm)
    # Set parameters for the loop
    convcrit = 0.0001 # chosen convergence criterion
    diff = 1 # arbitrary initial value greater than convcrit
    while diff>convcrit
        exp_v=v*Zprob'
        for h=1:gm
            for j=1:gz
                @views Tv[h,j]=findmax(add_vecs(r,h,j,E,exp_v,beta))[1]
            end
        end
        diff = maximum(abs,Tv - v)
        v=copy(Tv)
    end
    return Tv
end
If add_vecs seems to be the critical function, writing an explicit for loop could offer more optimization. How does the following benchmark:
# Fill `r` in place with E[i,h,j] + beta*exp_v[i,j] for i in 1:size(E,1) and
# return `r`.
#
# Bounds checks are disabled, so the caller must make sure `r` and the first
# dimension of `exp_v` hold at least size(E,1) elements.
function add_vecs!(r::Array{Float32},h::Int,j::Int,E::Array{Float32},
                   exp_v::Array{Float32},beta::Float32)
    @inbounds for i=1:size(E,1)
        r[i]=E[i,h,j] + beta*exp_v[i,j]
    end
    return r
end
UPDATE
To continue optimizing dynam_serial I have tried to remove more allocations. The result is:
# Fill r[1:gm] in place with E[i,h,j] + beta*exp_v[i,j] and return the largest
# value found by findmax over the whole of `r`.
#
# Bounds checks are disabled in the fill loop, so the caller must make sure
# `r`, `E` and `exp_v` have at least `gm` rows.
function add_vecs_and_max!(gm::Int,r::Array{Float64},h::Int,j::Int,E::Array{Float64},
                           exp_v::Array{Float64},beta::Float64)
    @inbounds for i=1:gm
        r[i] = E[i,h,j]+beta*exp_v[i,j]
    end
    return findmax(r)[1]
end
# In-place value-function iteration (Julia 0.6 API).
#
# Each sweep computes the expected values once with A_mul_Bt!, then overwrites
# every cell of the value matrix with the maximum returned by
# add_vecs_and_max!, tracking the largest absolute change seen during the
# sweep. Iteration stops once that change is no greater than the tolerance.
# Returns the final `gm` x `gz` value matrix.
function dynam_serial(E::Array{Float64},gm::Int,gz::Int,
                      beta::Float64,Zprob::Array{Float64})
    value = Array{Float64}(gm,gz)
    fill!(value, E[cld(gm,2),cld(gm,2),cld(gz,2)])
    scratch = Array{Float64}(gm)
    expected = Array{Float64}(gm,gz)
    tol = 0.0001   # convergence tolerance
    gap = 1.0      # any starting value above the tolerance
    while gap > tol
        A_mul_Bt!(expected, value, Zprob)
        gap = -Inf
        for h in 1:gm, j in 1:gz
            previous = value[h,j]
            updated = add_vecs_and_max!(gm, scratch, h, j, E, expected, beta)
            value[h,j] = updated
            gap = max(gap, previous-updated, updated-previous)
        end
    end
    return value
end
Switching the functions to use Float64 should increase speed (as CPUs are inherently optimized for 64-bit word lengths). Also, using the mutating A_mul_Bt! directly saves another allocation, and the copy(...) is avoided by switching the arrays v and Tv.
How do these optimizations improve your running time?
2nd UPDATE
Updated the code in the UPDATE section to use findmax. Also, changed dynam_serial to use v without Tv, as there was no need to save the old version except for the diff calculation, which is now done inside the loop.
Here's the code I copied-and-pasted, provided by Dan Getz above. I include the array and scalar definitions exactly as I ran them. Performance was: 39.507005 seconds (11 allocations: 486.891 KiB) when running @time dynam_serial(E,gm,gz,beta,Zprob).
using SpecialFunctions
est_params = [.788,.288,.0034,.1519,.1615,.0041,.0077,.2,0.005,.7196]
r = 0.015
tau = 0.35
rho =est_params[1]
sigma =est_params[2]
delta = 0.15
gamma =est_params[3]
a_capital =est_params[4]
lambda1 =est_params[5]
lambda2 =est_params[6]
s =est_params[7]
theta =est_params[8]
mu =est_params[9]
p_bar_k_ss =est_params[10]
beta = (1+r)^(-1)
sigma_range = 4
gz = 19 #15 #19
gp = 29 #19 #29
gk = 37 #25 #37
lnz=collect(linspace(-sigma_range*sigma,sigma_range*sigma,gz))
z=exp.(lnz)
gk_m = fld(gk,2)
# Need to add mu somewhere to k_ss
k_ss = (theta*(1-tau)/(r+delta))^(1/(1-theta))
k=cat(1,map(i->k_ss*((1-delta)^i),collect(1:gk_m)),map(i->k_ss/((1-delta)^i),collect(1:gk_m)))
insert!(k,gk_m+1,k_ss)
sort!(k)
p_bar=p_bar_k_ss*k_ss
p = collect(linspace(-p_bar/2,p_bar,gp))
# Tauchen: build the N-point grid Z over [lnz[1], lnz[N]] and the N-by-N matrix Zprob.
N = length(z)
Z = zeros(N, 1)
Z[1] = lnz[1]
Z[N] = lnz[N]
zstep = (Z[N] - Z[1]) / (N - 1)
for i in 2:(N - 1)
    Z[i] = Z[1] + zstep * (i - 1)
end

Zprob = zeros(Float64, N, N)
let
    # 0.5*erfc(-(x/sigma)/sqrt(2)) is the normal CDF with standard deviation sigma at x.
    cdf(x) = 0.5*erfc(-(x / sigma)/sqrt(2))
    for a in 1:N, b in 1:N
        # Grid point b measured relative to mu + rho*Z[a].
        offset = Z[b] - mu - rho * Z[a]
        Zprob[a,b] = if b == 1
            # First column: all mass below the upper edge of the first cell.
            cdf(offset + zstep / 2)
        elseif b == N
            # Last column: all mass above the lower edge of the last cell.
            1 - cdf(offset - zstep / 2)
        else
            # Interior column: mass between the two cell edges.
            cdf(offset + zstep / 2) - cdf(offset - zstep / 2)
        end
    end
end

# Unused alternative kept from the original script:
# Zprob=collect(tauchen(gz, rho, sigma, mu, sigma_range))[2]

# Cumulative sums of Zprob along dimension 2 (i.e. across the columns of each row).
Zcumprob = zeros(Float64, gz, gz)
cumsum!(Zcumprob, Zprob, 2)
# Stack the k and p grids into a gm-by-2 table: column 1 holds each k[i] repeated gp times,
# column 2 holds the whole p grid repeated once per k value.
gm = gk * gp
control = zeros(gm, 2)
for block in 1:gk
    rows = (gp*(block - 1) + 1):(gp*block)
    control[rows, 1] = fill(k[block], gp)
    control[rows, 2] = p
end
# endog starts out as an independent copy of the same table.
endog = copy(control)
# E[m,h,j]: value for choice row m of `control`, state row h of `endog`, and shock z[j].
# Allocated with the Array{T}(dims...) form used by the solver functions in this file,
# replacing the older Array(T, dims...) call.
E = Array{Float64}(gm, gm, gz)
for h in 1:gm
    # Nonzero net debt indicator; it depends only on h, so compute it once per state.
    p_ind = endog[h,2] < 0 ? 1 : 0
    for m in 1:gm
        # Difference between the chosen k and the depreciated current k.
        invest = control[m,1] - (1-delta)*endog[h,1]
        # Investment indicator: 1 whenever that difference is nonzero.
        i_ind = invest != 0 ? 1 : 0
        for j in 1:gz
            E[m,h,j] = (1-tau)*z[j]*(endog[h,1]^theta) + control[m,2] -
                endog[h,2]*(1+r*(1-tau)) +
                delta*endog[h,1]*tau - invest -
                (i_ind*gamma*endog[h,1] +
                 endog[h,1]*(a_capital/2)*((invest/endog[h,1])^2)) +
                s*endog[h,2]*p_ind
            # Negative entries are replaced by elem + lambda1*elem - 0.5*lambda2*elem^2;
            # non-negative entries are left as computed.
            elem = E[m,h,j]
            if elem < 0
                E[m,h,j] = elem + lambda1*elem - .5*lambda2*elem^2
            end
        end
    end
end
"""
    add_vecs_and_max!(gm, r, h, j, E, exp_v, beta)

Overwrite `r[i]` with `E[i,h,j] + beta*exp_v[i,j]` for `i = 1:gm` and return the largest
value written. Returns `-Inf` when `gm < 1`, since nothing is written in that case.

Bounds checking is disabled with `@inbounds`: the caller must guarantee that `r`, the
first dimension of `E`, and the first dimension of `exp_v` each hold at least `gm` entries,
and that `h` and `j` are valid indices.
"""
function add_vecs_and_max!(gm::Int,r::Array{Float64},h::Int,j::Int,E::Array{Float64},
                           exp_v::Array{Float64},beta::Float64)
    maxr = -Inf
    # The macro sigil must be `@`; with `#` the whole `for` header becomes a comment.
    @inbounds for i = 1:gm
        r[i] = E[i,h,j] + beta*exp_v[i,j]
        maxr = max(r[i], maxr)
    end
    return maxr
end
"""
    dynam_serial(E, gm, gz, beta, Zprob)

Iterate on a `gm`-by-`gz` array `v` until the largest absolute change between two
successive iterates is no greater than `1e-4`, and return the final iterate.

Each sweep first forms `exp_v = v * Zprob'` in place with `A_mul_Bt!`, then sets every
entry `(h, j)` to the value returned by `add_vecs_and_max!(gm, r, h, j, E, exp_v, beta)`.
The starting array is filled with the single entry of `E` at the `cld(·, 2)` index of each
dimension.
"""
function dynam_serial(E::Array{Float64},gm::Int,gz::Int,
                      beta::Float64,Zprob::Array{Float64})
    tol = 0.0001    # chosen convergence criterion

    # Two iterate buffers that trade roles after every sweep, so no copy is needed.
    current = Array{Float64}(gm, gz)
    fill!(current, E[cld(gm,2), cld(gm,2), cld(gz,2)])
    updated = Array{Float64}(gm, gz)

    # Work arrays reused across sweeps.
    scratch = Array{Float64}(gm)
    expected = Array{Float64}(gm, gz)

    gap = 1.0       # arbitrary initial value greater than tol
    while gap > tol
        A_mul_Bt!(expected, current, Zprob)
        gap = -Inf
        for h in 1:gm, j in 1:gz
            best = add_vecs_and_max!(gm, scratch, h, j, E, expected, beta)
            updated[h,j] = best
            gap = max(abs(best - current[h,j]), gap)
        end
        current, updated = updated, current
    end
    return current
end
Now, here's another version of the algorithm and inputs. The functions are similar to what Dan Getz suggested, except that I use findmax rather than an iterated max function to find the array maximum. In the input construction, I am using both Float32 and mixing different bit-types together. However, I've consistently achieved better performance this way: 24.905569 seconds (1.81 k allocations: 46.829 MiB, 0.01% gc time). But it's not clear at all why.
using SpecialFunctions
# Estimated parameters; unpacked positionally (entries 1 through 10) on the next line.
est_params = [.788,.288,.0034,.1519,.1615,.0041,.0077,.2,0.005,.7196]
rho, sigma, gamma, a_capital, lambda1, lambda2, s, theta, mu, p_bar_k_ss = est_params

# Parameters set directly rather than taken from est_params.
r = 0.015
tau = 0.35
delta = 0.15
# beta is stored as Float32 in this variant; the other scalars stay Float64.
beta = Float32((1+r)^(-1))

# Half-width of the log-z grid in multiples of sigma, and the three grid sizes.
sigma_range = 4
gz = 19
gp = 29
gk = 37

# gz evenly spaced points for log z on [-sigma_range*sigma, sigma_range*sigma].
lnz = collect(linspace(-sigma_range*sigma, sigma_range*sigma, gz))
# Element-wise exponential: `exp` must be broadcast over the vector with dot syntax,
# exactly as the Float64 variant of this script does.
z = exp.(lnz)

# Capital grid: k_ss*(1-delta)^i for i = -gk_m, ..., gk_m, stored in ascending order.
gk_m = fld(gk,2)
# TODO (from the original author): need to add mu somewhere to k_ss
k_ss = (theta*(1-tau)/(r+delta))^(1/(1-theta))
k = cat(1, map(i -> k_ss*((1-delta)^i), 1:gk_m), map(i -> k_ss/((1-delta)^i), 1:gk_m))
insert!(k, gk_m+1, k_ss)
sort!(k)

# gp evenly spaced points for p on [-p_bar/2, p_bar].
p_bar = p_bar_k_ss*k_ss
p = collect(linspace(-p_bar/2, p_bar, gp))
# Tauchen: build the N-point grid Z over [lnz[1], lnz[N]] and the N-by-N matrix Zprob.
N = length(z)
Z = zeros(N, 1)
Z[1] = lnz[1]
Z[N] = lnz[N]
zstep = (Z[N] - Z[1]) / (N - 1)
for i in 2:(N - 1)
    Z[i] = Z[1] + zstep * (i - 1)
end

# Probabilities are computed from the Float64 inputs and converted to Float32 on storage.
Zprob = zeros(Float32, N, N)
let
    # 0.5*erfc(-(x/sigma)/sqrt(2)) is the normal CDF with standard deviation sigma at x.
    cdf(x) = 0.5*erfc(-(x / sigma)/sqrt(2))
    for a in 1:N, b in 1:N
        # Grid point b measured relative to mu + rho*Z[a].
        offset = Z[b] - mu - rho * Z[a]
        Zprob[a,b] = if b == 1
            # First column: all mass below the upper edge of the first cell.
            cdf(offset + zstep / 2)
        elseif b == N
            # Last column: all mass above the lower edge of the last cell.
            1 - cdf(offset - zstep / 2)
        else
            # Interior column: mass between the two cell edges.
            cdf(offset + zstep / 2) - cdf(offset - zstep / 2)
        end
    end
end

# Unused alternative kept from the original script:
# Zprob=collect(tauchen(gz, rho, sigma, mu, sigma_range))[2]

# Cumulative sums of Zprob along dimension 2 (i.e. across the columns of each row).
Zcumprob = zeros(Float32, gz, gz)
cumsum!(Zcumprob, Zprob, 2)
# Stack the k and p grids into a gm-by-2 table: column 1 holds each k[i] repeated gp times,
# column 2 holds the whole p grid repeated once per k value.
gm = gk * gp
control = zeros(gm, 2)
for block in 1:gk
    rows = (gp*(block - 1) + 1):(gp*block)
    control[rows, 1] = fill(k[block], gp)
    control[rows, 2] = p
end
# endog starts out as an independent copy of the same table.
endog = copy(control)
# E[m,h,j]: value for choice row m of `control`, state row h of `endog`, and shock z[j].
# Allocated with the Array{T}(dims...) form used by the solver functions in this file,
# replacing the older Array(T, dims...) call. Entries are computed from the Float64
# inputs and converted to Float32 when stored.
E = Array{Float32}(gm, gm, gz)
for h in 1:gm
    # Nonzero net debt indicator; it depends only on h, so compute it once per state.
    p_ind = endog[h,2] < 0 ? 1 : 0
    for m in 1:gm
        # Difference between the chosen k and the depreciated current k.
        invest = control[m,1] - (1-delta)*endog[h,1]
        # Investment indicator: 1 whenever that difference is nonzero.
        i_ind = invest != 0 ? 1 : 0
        for j in 1:gz
            E[m,h,j] = (1-tau)*z[j]*(endog[h,1]^theta) + control[m,2] -
                endog[h,2]*(1+r*(1-tau)) +
                delta*endog[h,1]*tau - invest -
                (i_ind*gamma*endog[h,1] +
                 endog[h,1]*(a_capital/2)*((invest/endog[h,1])^2)) +
                s*endog[h,2]*p_ind
            # Read the entry back so the adjustment below starts from the stored
            # (Float32-rounded) value, exactly as before.
            elem = E[m,h,j]
            # Negative entries are replaced by elem + lambda1*elem - 0.5*lambda2*elem^2;
            # non-negative entries are left as stored.
            if elem < 0
                E[m,h,j] = elem + lambda1*elem - .5*lambda2*elem^2
            end
        end
    end
end
"""
    add_vecs!(gm, r, h, j, E, exp_v, beta)

Overwrite `r[i]` with `E[i,h,j] + beta*exp_v[i,j]` for `i = 1:gm` and return `r` itself.

Bounds checking is disabled with `@inbounds`: the caller must guarantee that `r`, the
first dimension of `E`, and the first dimension of `exp_v` each hold at least `gm` entries,
and that `h` and `j` are valid indices.
"""
function add_vecs!(gm::Int,r::Array{Float32},h::Int,j::Int,E::Array{Float32},
                   exp_v::Array{Float32},beta::Float32)
    # The macro sigil must be `@`; with `#` the whole `for` header becomes a comment.
    # `@views` is not needed: the loop body only performs scalar indexing, never slicing.
    @inbounds for i = 1:gm
        r[i] = E[i,h,j] + beta*exp_v[i,j]
    end
    return r
end
"""
    dynam_serial(E, gm, gz, beta, Zprob)

Float32 variant. Iterate on a `gm`-by-`gz` array `v` until the largest absolute change
between two successive iterates is no greater than `1e-4`, and return the final iterate.

Each sweep first forms `exp_v = v * Zprob'` in place with `A_mul_Bt!`, then sets every
entry `(h, j)` to the maximum of the vector filled by
`add_vecs!(gm, r, h, j, E, exp_v, beta)`. The starting array is filled with the single
entry of `E` at the `cld(·, 2)` index of each dimension.
"""
function dynam_serial(E::Array{Float32},gm::Int,gz::Int,beta::Float32,Zprob::Array{Float32})
    v = Array{Float32}(gm,gz)
    fill!(v,E[cld(gm,2),cld(gm,2),cld(gz,2)])
    Tv = Array{Float32}(gm,gz)
    # Work arrays reused across sweeps.
    exp_v = Array{Float32}(gm,gz)
    r = Array{Float32}(gm)
    # Set parameters for the loop
    convcrit = 0.0001    # chosen convergence criterion
    # Arbitrary initial value greater than convcrit. Kept as Float32 so that `diff` has
    # one type for the whole function.
    diff = one(Float32)
    while diff > convcrit
        A_mul_Bt!(exp_v,v,Zprob)
        # Track the largest change while sweeping instead of allocating `Tv - v`
        # after every sweep.
        diff = -Inf32
        for h = 1:gm
            for j = 1:gz
                Tv[h,j] = findmax(add_vecs!(gm,r,h,j,E,exp_v,beta))[1]
                diff = max(diff, abs(Tv[h,j] - v[h,j]))
            end
        end
        # Swap roles so the new iterate becomes `v` without copying.
        (v,Tv) = (Tv,v)
    end
    return v
end

Resources