z-score to probability and vice verse in ruby - ruby

How can I convert z-score to probability using ruby?
Example:
z_score = 0
probability should be 0.5
z_score = 1.76
probability should be 0.039204

According to this https://stackoverflow.com/a/16197404/1062711 post, here is the function that give you the p proba from the z score
def getPercent(z)
return 0 if z < -6.5
return 1 if z > 6.5
factk = 1
sum = 0
term = 1
k = 0
loopStop = Math.exp(-23)
while term.abs > loopStop do
term = 0.3989422804 * ((-1)**k) * (z**k) / (2*k+1) / (2**k) * (z**(k+1)) /factk
sum += term
k += 1
factk *= k
end
sum += 0.5
1-sum
end
puts getPercent(1.76)

Related

Filling a matrix using parallel processing in Julia

I'm trying to speed up the solution time for a dynamic programming problem in Julia (v. 0.5.0), via parallel processing. The problem involves choosing the optimal values for every element of a 1073 x 19 matrix at every iteration, until successive matrix differences fall within a tolerance. I thought that, within each iteration, filling in the values for each element of the matrix could be parallelized. However, I'm seeing a huge performance degradation using SharedArray, and I'm wondering if there's a better way to approach parallel processing for this problem.
I construct the arguments for the function below:
est_params = [.788,.288,.0034,.1519,.1615,.0041,.0077,.2,0.005,.7196]
r = 0.015
tau = 0.35
rho =est_params[1]
sigma =est_params[2]
delta = 0.15
gamma =est_params[3]
a_capital =est_params[4]
lambda1 =est_params[5]
lambda2 =est_params[6]
s =est_params[7]
theta =est_params[8]
mu =est_params[9]
p_bar_k_ss =est_params[10]
beta = (1+r)^(-1)
sigma_range = 4
gz = 19
gp = 29
gk = 37
lnz=collect(linspace(-sigma_range*sigma,sigma_range*sigma,gz))
z=exp(lnz)
gk_m = fld(gk,2)
# Need to add mu somewhere to k_ss
k_ss = (theta*(1-tau)/(r+delta))^(1/(1-theta))
k=cat(1,map(i->k_ss*((1-delta)^i),collect(1:gk_m)),map(i->k_ss/((1-delta)^i),collect(1:gk_m)))
insert!(k,gk_m+1,k_ss)
sort!(k)
p_bar=p_bar_k_ss*k_ss
p = collect(linspace(-p_bar/2,p_bar,gp))
#Tauchen
N = length(z)
Z = zeros(N,1)
Zprob = zeros(Float32,N,N)
Z[N] = lnz[length(z)]
Z[1] = lnz[1]
zstep = (Z[N] - Z[1]) / (N - 1)
for i=2:(N-1)
Z[i] = Z[1] + zstep * (i - 1)
end
for a = 1 : N
for b = 1 : N
if b == 1
Zprob[a,b] = 0.5*erfc(-((Z[1] - mu - rho * Z[a] + zstep / 2) / sigma)/sqrt(2))
elseif b == N
Zprob[a,b] = 1 - 0.5*erfc(-((Z[N] - mu - rho * Z[a] - zstep / 2) / sigma)/sqrt(2))
else
Zprob[a,b] = 0.5*erfc(-((Z[b] - mu - rho * Z[a] + zstep / 2) / sigma)/sqrt(2)) -
0.5*erfc(-((Z[b] - mu - rho * Z[a] - zstep / 2) / sigma)/sqrt(2))
end
end
end
# Collecting tauchen results in a 2 element array of linspace and array; [2] gets array
# Zprob=collect(tauchen(gz, rho, sigma, mu, sigma_range))[2]
Zcumprob=zeros(Float32,gz,gz)
# 2 in cumsum! denotes the 2nd dimension, i.e. columns
cumsum!(Zcumprob, Zprob,2)
gm = gk * gp
control=zeros(gm,2)
for i=1:gk
control[(1+gp*(i-1)):(gp*i),1]=fill(k[i],(gp,1))
control[(1+gp*(i-1)):(gp*i),2]=p
end
endog=copy(control)
E=Array(Float32,gm,gm,gz)
for h=1:gm
for m=1:gm
for j=1:gz
# set the nonzero net debt indicator
if endog[h,2]<0
p_ind=1
else
p_ind=0
end
# set the investment indicator
if (control[m,1]-(1-delta)*endog[h,1])!=0
i_ind=1
else
i_ind=0
end
E[m,h,j] = (1-tau)*z[j]*(endog[h,1]^theta) + control[m,2]-endog[h,2]*(1+r*(1-tau)) +
delta*endog[h,1]*tau-(control[m,1]-(1-delta)*endog[h,1]) -
(i_ind*gamma*endog[h,1]+endog[h,1]*(a_capital/2)*(((control[m,1]-(1-delta)*endog[h,1])/endog[h,1])^2)) +
s*endog[h,2]*p_ind
elem = E[m,h,j]
if E[m,h,j]<0
E[m,h,j]=elem+lambda1*elem-.5*lambda2*elem^2
else
E[m,h,j]=elem
end
end
end
end
I then constructed the function with serial processing. The two for loops iterate through each element to find the largest value in a 1072-sized (=the gm scalar argument in the function) array:
function dynam_serial(E,gm,gz,beta,Zprob)
v = Array(Float32,gm,gz )
fill!(v,E[cld(gm,2),cld(gm,2),cld(gz,2)])
Tv = Array(Float32,gm,gz)
# Set parameters for the loop
convcrit = 0.0001 # chosen convergence criterion
diff = 1 # arbitrary initial value greater than convcrit
while diff>convcrit
exp_v=v*Zprob'
for h=1:gm
for j=1:gz
Tv[h,j]=findmax(E[:,h,j] + beta*exp_v[:,j])[1]
end
end
diff = maxabs(Tv - v)
v=copy(Tv)
end
end
Timing this, I get:
#time dynam_serial(E,gm,gz,beta,Zprob)
> 106.880008 seconds (91.70 M allocations: 203.233 GB, 15.22% gc time)
Now, I try using Shared Arrays to benefit from parallel processing. Note that I reconfigured the iteration so that I only have one for loop, rather than two. I also use v=deepcopy(Tv); otherwise, v is copied as an Array object, rather than a SharedArray:
function dynam_parallel(E,gm,gz,beta,Zprob)
v = SharedArray(Float32,(gm,gz),init = S -> S[Base.localindexes(S)] = myid() )
fill!(v,E[cld(gm,2),cld(gm,2),cld(gz,2)])
# Set parameters for the loop
convcrit = 0.0001 # chosen convergence criterion
diff = 1 # arbitrary initial value greater than convcrit
while diff>convcrit
exp_v=v*Zprob'
Tv = SharedArray(Float32,gm,gz,init = S -> S[Base.localindexes(S)] = myid() )
#sync #parallel for hj=1:(gm*gz)
j=cld(hj,gm)
h=mod(hj,gm)
if h==0;h=gm;end;
#async Tv[h,j]=findmax(E[:,h,j] + beta*exp_v[:,j])[1]
end
diff = maxabs(Tv - v)
v=deepcopy(Tv)
end
end
Timing the parallel version; and using a 4-core 2.5 GHz I7 processor with 16GB of memory, I get:
addprocs(3)
#time dynam_parallel(E,gm,gz,beta,Zprob)
> 164.237208 seconds (2.64 M allocations: 201.812 MB, 0.04% gc time)
Am I doing something incorrect here? Or is there a better way to approach parallel processing in Julia for this particular problem? I've considered using Distributed Arrays, but it's difficult for me to see how to apply them to the present problem.
UPDATE:
Per #DanGetz and his helpful comments, I turned instead to trying to speed up the serial processing version. I was able to get performance down to 53.469780 seconds (67.36 M allocations: 103.419 GiB, 19.12% gc time) through:
1) Upgrading to 0.6.0 (saved about 25 seconds), which includes the helpful #views macro.
2) Preallocating the main array I'm trying to fill in (Tv), per the section on Preallocating Outputs in the Julia Performance Tips: https://docs.julialang.org/en/latest/manual/performance-tips/. (saved another 25 or so seconds)
The biggest remaining slow-down seems to be coming from the add_vecs function, which sums together subarrays of two larger matrices. I've tried devectorizing and using BLAS functions, but haven't been able to produce better performance.
In any event, the improved code for dynam_serial is below:
function add_vecs(r::Array{Float32},h::Int,j::Int,E::Array{Float32},exp_v::Array{Float32},beta::Float32)
#views r=E[:,h,j] + beta*exp_v[:,j]
return r
end
function dynam_serial(E::Array{Float32},gm::Int,gz::Int,beta::Float32,Zprob::Array{Float32})
v = Array{Float32}(gm,gz)
fill!(v,E[cld(gm,2),cld(gm,2),cld(gz,2)])
Tv = Array{Float32}(gm,gz)
r = Array{Float32}(gm)
# Set parameters for the loop
convcrit = 0.0001 # chosen convergence criterion
diff = 1 # arbitrary initial value greater than convcrit
while diff>convcrit
exp_v=v*Zprob'
for h=1:gm
for j=1:gz
#views Tv[h,j]=findmax(add_vecs(r,h,j,E,exp_v,beta))[1]
end
end
diff = maximum(abs,Tv - v)
v=copy(Tv)
end
return Tv
end
If add_vecs seems to be the critical function, writing an explicit for loop could offer more optimization. How does the following benchmark:
function add_vecs!(r::Array{Float32},h::Int,j::Int,E::Array{Float32},
exp_v::Array{Float32},beta::Float32)
#inbounds for i=1:size(E,1)
r[i]=E[i,h,j] + beta*exp_v[i,j]
end
return r
end
UPDATE
To continue optimizing dynam_serial I have tried to remove more allocations. The result is:
function add_vecs_and_max!(gm::Int,r::Array{Float64},h::Int,j::Int,E::Array{Float64},
exp_v::Array{Float64},beta::Float64)
#inbounds for i=1:gm
r[i] = E[i,h,j]+beta*exp_v[i,j]
end
return findmax(r)[1]
end
function dynam_serial(E::Array{Float64},gm::Int,gz::Int,
beta::Float64,Zprob::Array{Float64})
v = Array{Float64}(gm,gz)
fill!(v,E[cld(gm,2),cld(gm,2),cld(gz,2)])
r = Array{Float64}(gm)
exp_v = Array{Float64}(gm,gz)
# Set parameters for the loop
convcrit = 0.0001 # chosen convergence criterion
diff = 1.0 # arbitrary initial value greater than convcrit
while diff>convcrit
A_mul_Bt!(exp_v,v,Zprob)
diff = -Inf
for h=1:gm
for j=1:gz
oldv = v[h,j]
newv = add_vecs_and_max!(gm,r,h,j,E,exp_v,beta)
v[h,j]= newv
diff = max(diff, oldv-newv, newv-oldv)
end
end
end
return v
end
Switching the functions to use Float64 should increase speed (as CPUs are inherently optimized for 64-bit word lengths). Also, using the mutating A_mul_Bt! directly saves another allocation. Avoiding the copy(...) by switching the arrays v and Tv.
How do these optimizations improve your running time?
2nd UPDATE
Updated the code in the UPDATE section to use findmax. Also, changed dynam_serial to use v without Tv, as there was no need to save the old version except for the diff calculation, which is now done inside the loop.
Here's the code I copied-and-pasted, provided by Dan Getz above. I include the array and scalar definitions exactly as I ran them. Performance was: 39.507005 seconds (11 allocations: 486.891 KiB) when running #time dynam_serial(E,gm,gz,beta,Zprob).
using SpecialFunctions
est_params = [.788,.288,.0034,.1519,.1615,.0041,.0077,.2,0.005,.7196]
r = 0.015
tau = 0.35
rho =est_params[1]
sigma =est_params[2]
delta = 0.15
gamma =est_params[3]
a_capital =est_params[4]
lambda1 =est_params[5]
lambda2 =est_params[6]
s =est_params[7]
theta =est_params[8]
mu =est_params[9]
p_bar_k_ss =est_params[10]
beta = (1+r)^(-1)
sigma_range = 4
gz = 19 #15 #19
gp = 29 #19 #29
gk = 37 #25 #37
lnz=collect(linspace(-sigma_range*sigma,sigma_range*sigma,gz))
z=exp.(lnz)
gk_m = fld(gk,2)
# Need to add mu somewhere to k_ss
k_ss = (theta*(1-tau)/(r+delta))^(1/(1-theta))
k=cat(1,map(i->k_ss*((1-delta)^i),collect(1:gk_m)),map(i->k_ss/((1-delta)^i),collect(1:gk_m)))
insert!(k,gk_m+1,k_ss)
sort!(k)
p_bar=p_bar_k_ss*k_ss
p = collect(linspace(-p_bar/2,p_bar,gp))
#Tauchen
N = length(z)
Z = zeros(N,1)
Zprob = zeros(Float64,N,N)
Z[N] = lnz[length(z)]
Z[1] = lnz[1]
zstep = (Z[N] - Z[1]) / (N - 1)
for i=2:(N-1)
Z[i] = Z[1] + zstep * (i - 1)
end
for a = 1 : N
for b = 1 : N
if b == 1
Zprob[a,b] = 0.5*erfc(-((Z[1] - mu - rho * Z[a] + zstep / 2) / sigma)/sqrt(2))
elseif b == N
Zprob[a,b] = 1 - 0.5*erfc(-((Z[N] - mu - rho * Z[a] - zstep / 2) / sigma)/sqrt(2))
else
Zprob[a,b] = 0.5*erfc(-((Z[b] - mu - rho * Z[a] + zstep / 2) / sigma)/sqrt(2)) -
0.5*erfc(-((Z[b] - mu - rho * Z[a] - zstep / 2) / sigma)/sqrt(2))
end
end
end
# Collecting tauchen results in a 2 element array of linspace and array; [2] gets array
# Zprob=collect(tauchen(gz, rho, sigma, mu, sigma_range))[2]
Zcumprob=zeros(Float64,gz,gz)
# 2 in cumsum! denotes the 2nd dimension, i.e. columns
cumsum!(Zcumprob, Zprob,2)
gm = gk * gp
control=zeros(gm,2)
for i=1:gk
control[(1+gp*(i-1)):(gp*i),1]=fill(k[i],(gp,1))
control[(1+gp*(i-1)):(gp*i),2]=p
end
endog=copy(control)
E=Array(Float64,gm,gm,gz)
for h=1:gm
for m=1:gm
for j=1:gz
# set the nonzero net debt indicator
if endog[h,2]<0
p_ind=1
else
p_ind=0
end
# set the investment indicator
if (control[m,1]-(1-delta)*endog[h,1])!=0
i_ind=1
else
i_ind=0
end
E[m,h,j] = (1-tau)*z[j]*(endog[h,1]^theta) + control[m,2]-endog[h,2]*(1+r*(1-tau)) +
delta*endog[h,1]*tau-(control[m,1]-(1-delta)*endog[h,1]) -
(i_ind*gamma*endog[h,1]+endog[h,1]*(a_capital/2)*(((control[m,1]-(1-delta)*endog[h,1])/endog[h,1])^2)) +
s*endog[h,2]*p_ind
elem = E[m,h,j]
if E[m,h,j]<0
E[m,h,j]=elem+lambda1*elem-.5*lambda2*elem^2
else
E[m,h,j]=elem
end
end
end
end
function add_vecs_and_max!(gm::Int,r::Array{Float64},h::Int,j::Int,E::Array{Float64},
exp_v::Array{Float64},beta::Float64)
maxr = -Inf
#inbounds for i=1:gm r[i] = E[i,h,j]+beta*exp_v[i,j]
maxr = max(r[i],maxr)
end
return maxr
end
function dynam_serial(E::Array{Float64},gm::Int,gz::Int,
beta::Float64,Zprob::Array{Float64})
v = Array{Float64}(gm,gz)
fill!(v,E[cld(gm,2),cld(gm,2),cld(gz,2)])
Tv = Array{Float64}(gm,gz)
r = Array{Float64}(gm)
exp_v = Array{Float64}(gm,gz)
# Set parameters for the loop
convcrit = 0.0001 # chosen convergence criterion
diff = 1.0 # arbitrary initial value greater than convcrit
while diff>convcrit
A_mul_Bt!(exp_v,v,Zprob)
diff = -Inf
for h=1:gm
for j=1:gz
Tv[h,j]=add_vecs_and_max!(gm,r,h,j,E,exp_v,beta)
diff = max(abs(Tv[h,j]-v[h,j]),diff)
end
end
(v,Tv)=(Tv,v)
end
return v
end
Now, here's another version of the algorithm and inputs. The functions are similar to what Dan Getz suggested, except that I use findmax rather than an iterated max function to find the array maximum. In the input construction, I am using both Float32 and mixing different bit-types together. However, I've consistently achieved better performance this way: 24.905569 seconds (1.81 k allocations: 46.829 MiB, 0.01% gc time). But it's not clear at all why.
using SpecialFunctions
est_params = [.788,.288,.0034,.1519,.1615,.0041,.0077,.2,0.005,.7196]
r = 0.015
tau = 0.35
rho =est_params[1]
sigma =est_params[2]
delta = 0.15
gamma =est_params[3]
a_capital =est_params[4]
lambda1 =est_params[5]
lambda2 =est_params[6]
s =est_params[7]
theta =est_params[8]
mu =est_params[9]
p_bar_k_ss =est_params[10]
beta = Float32((1+r)^(-1))
sigma_range = 4
gz = 19
gp = 29
gk = 37
lnz=collect(linspace(-sigma_range*sigma,sigma_range*sigma,gz))
z=exp(lnz)
gk_m = fld(gk,2)
# Need to add mu somewhere to k_ss
k_ss = (theta*(1-tau)/(r+delta))^(1/(1-theta))
k=cat(1,map(i->k_ss*((1-delta)^i),collect(1:gk_m)),map(i->k_ss/((1-delta)^i),collect(1:gk_m)))
insert!(k,gk_m+1,k_ss)
sort!(k)
p_bar=p_bar_k_ss*k_ss
p = collect(linspace(-p_bar/2,p_bar,gp))
#Tauchen
N = length(z)
Z = zeros(N,1)
Zprob = zeros(Float32,N,N)
Z[N] = lnz[length(z)]
Z[1] = lnz[1]
zstep = (Z[N] - Z[1]) / (N - 1)
for i=2:(N-1)
Z[i] = Z[1] + zstep * (i - 1)
end
for a = 1 : N
for b = 1 : N
if b == 1
Zprob[a,b] = 0.5*erfc(-((Z[1] - mu - rho * Z[a] + zstep / 2) / sigma)/sqrt(2))
elseif b == N
Zprob[a,b] = 1 - 0.5*erfc(-((Z[N] - mu - rho * Z[a] - zstep / 2) / sigma)/sqrt(2))
else
Zprob[a,b] = 0.5*erfc(-((Z[b] - mu - rho * Z[a] + zstep / 2) / sigma)/sqrt(2)) -
0.5*erfc(-((Z[b] - mu - rho * Z[a] - zstep / 2) / sigma)/sqrt(2))
end
end
end
# Collecting tauchen results in a 2 element array of linspace and array; [2] gets array
# Zprob=collect(tauchen(gz, rho, sigma, mu, sigma_range))[2]
Zcumprob=zeros(Float32,gz,gz)
# 2 in cumsum! denotes the 2nd dimension, i.e. columns
cumsum!(Zcumprob, Zprob,2)
gm = gk * gp
control=zeros(gm,2)
for i=1:gk
control[(1+gp*(i-1)):(gp*i),1]=fill(k[i],(gp,1))
control[(1+gp*(i-1)):(gp*i),2]=p
end
endog=copy(control)
E=Array(Float32,gm,gm,gz)
for h=1:gm
for m=1:gm
for j=1:gz
# set the nonzero net debt indicator
if endog[h,2]<0
p_ind=1
else
p_ind=0
end
# set the investment indicator
if (control[m,1]-(1-delta)*endog[h,1])!=0
i_ind=1
else
i_ind=0
end
E[m,h,j] = (1-tau)*z[j]*(endog[h,1]^theta) + control[m,2]-endog[h,2]*(1+r*(1-tau)) +
delta*endog[h,1]*tau-(control[m,1]-(1-delta)*endog[h,1]) -
(i_ind*gamma*endog[h,1]+endog[h,1]*(a_capital/2)*(((control[m,1]-(1-delta)*endog[h,1])/endog[h,1])^2)) +
s*endog[h,2]*p_ind
elem = E[m,h,j]
if E[m,h,j]<0
E[m,h,j]=elem+lambda1*elem-.5*lambda2*elem^2
else
E[m,h,j]=elem
end
end
end
end
function add_vecs!(gm::Int,r::Array{Float32},h::Int,j::Int,E::Array{Float32},
exp_v::Array{Float32},beta::Float32)
#inbounds #views for i=1:gm
r[i]=E[i,h,j] + beta*exp_v[i,j]
end
return r
end
function dynam_serial(E::Array{Float32},gm::Int,gz::Int,beta::Float32,Zprob::Array{Float32})
v = Array{Float32}(gm,gz)
fill!(v,E[cld(gm,2),cld(gm,2),cld(gz,2)])
Tv = Array{Float32}(gm,gz)
# Set parameters for the loop
convcrit = 0.0001 # chosen convergence criterion
diff = 1.00000 # arbitrary initial value greater than convcrit
iter=0
exp_v=Array{Float32}(gm,gz)
r=Array{Float32}(gm)
while diff>convcrit
A_mul_Bt!(exp_v,v,Zprob)
for h=1:gm
for j=1:gz
Tv[h,j]=findmax(add_vecs!(gm,r,h,j,E,exp_v,beta))[1]
end
end
diff = maximum(abs,Tv - v)
(v,Tv)=(Tv,v)
end
return v
end

Vectorized code slower than loops? MATLAB

In the problem Im working on there is such a part of code, as shown below. The definition part is just to show you the sizes of arrays. Below I pasted vectorized version - and it is >2x slower. Why it happens so? I know that i happens if vectorization requiers large temporary variables, but (it seems) it is not true here.
And generally, what (other than parfor, with I already use) can I do to speed up this code?
maxN = 100;
levels = maxN+1;
xElements = 101;
umn = complex(zeros(levels, levels));
umn2 = umn;
bessels = ones(xElements, xElements, levels); % 1.09 GB
posMcontainer = ones(xElements, xElements, maxN);
tic
for j = 1 : xElements
for i = 1 : xElements
for n = 1 : 2 : maxN
nn = n + 1;
mm = 1;
for m = 1 : 2 : n
umn(nn, mm) = bessels(i, j, nn) * posMcontainer(i, j, m);
mm = mm + 1;
end
end
end
end
toc % 0.520594 seconds
tic
for j = 1 : xElements
for i = 1 : xElements
for n = 1 : 2 : maxN
nn = n + 1;
m = 1:2:n;
numOfEl = ceil(n/2);
umn2(nn, 1:numOfEl) = bessels(i, j, nn) * posMcontainer(i, j, m);
end
end
end
toc % 1.275926 seconds
sum(sum(umn-umn2)) % veryfying, if all done right
Best regards,
Alex
From the profiler:
Edit:
In reply to #Jason answer, this alternative takes the same time:
for n = 1:2:maxN
nn(n) = n + 1;
numOfEl(n) = ceil(n/2);
end
for j = 1 : xElements
for i = 1 : xElements
for n = 1 : 2 : maxN
umn2(nn(n), 1:numOfEl(n)) = bessels(i, j, nn(n)) * posMcontainer(i, j, 1:2:n);
end
end
end
Edit2:
In reply to #EBH :
The point is to do the following:
parfor i = 1 : xElements
for j = 1 : xElements
umn = complex(zeros(levels, levels)); % cleaning
for n = 0:maxN
mm = 1;
for m = -n:2:n
nn = n + 1; % for indexing
if m < 0
umn(nn, mm) = bessels(i, j, nn) * negMcontainer(i, j, abs(m));
end
if m > 0
umn(nn, mm) = bessels(i, j, nn) * posMcontainer(i, j, m);
end
if m == 0
umn(nn, mm) = bessels(i, j, nn);
end
mm = mm + 1; % for indexing
end % m
end % n
beta1 = sum(sum(Aj1.*umn));
betaSumSq1(i, j) = abs(beta1).^2;
beta2 = sum(sum(Aj2.*umn));
betaSumSq2(i, j) = abs(beta2).^2;
end % j
end % i
I speeded it up as much, as I was able to. What you have written is taking only the last bessels and posMcontainer values, so it does not produce the same result. In the real code, those two containers are filled not with 1, but with some precalculated values.
After your edit, I can see that umn is just a temporary variable for another calculation. It still can be mostly vectorizable:
betaSumSq1 = zeros(xElements); % preallocating
betaSumSq2 = zeros(xElements); % preallocating
% an index matrix to fetch the right values from negMcontainer and
% posMcontainer:
indmat = tril(repmat([0 1;1 0],ceil((maxN+1)/2),floor(levels/2)));
indmat(end,:) = [];
% an index matrix to fetch the values in correct order for umn:
b_ind = repmat([1;0],ceil((maxN+1)/2),1);
b_ind(end) = [];
tempind = logical([fliplr(indmat) b_ind indmat+triu(ones(size(indmat)))]);
% permute the arrays to prevent squeeze:
PM = permute(posMcontainer,[3 1 2]);
NM = permute(negMcontainer,[3 1 2]);
B = permute(bessels,[3 1 2]);
for k = 1 : maxN+1 % third dim
for jj = 1 : xElements % columns
b = B(:,jj,k); % get one vector of B
% perform b*NM for every row of NM*indmat, than flip the result:
neg = fliplr(bsxfun(#times,bsxfun(#times,indmat,NM(:,jj,k).'),b));
% perform b*PM for every row of PM*indmat:
pos = bsxfun(#times,bsxfun(#times,indmat,PM(:,jj,k).'),b);
temp = [neg mod(1:levels,2).'.*b pos].'; % concat neg and pos
% assign them to the right place in umn:
umn = reshape(temp(tempind.'),[levels levels]).';
beta1 = Aj1.*umn;
betaSumSq1(jj,k) = abs(sum(beta1(:))).^2;
beta2 = Aj2.*umn;
betaSumSq2(jj,k) = abs(sum(beta2(:))).^2;
end
end
This reduce running time from ~95 seconds to less 3 seconds (both without parfor), so it improves in almost 97%.
I would suspect it is memory allocation. You are re-allocating the m array in a 3 deep loop.
try rearranging the code:
tic
for n = 1 : 2 : maxN
nn = n + 1;
m = 1:2:n;
numOfEl = ceil(n/2);
for j = 1 : xElements
for i = 1 : xElements
umn2(nn, 1:numOfEl) = bessels(i, j, nn) * posMcontainer(i, j, m);
end
end
end
toc % 1.275926 seconds
I was trying this in Igor pro, which a similar language, but with different optimizations. So the direct translations don't time the same way as Matlab (vectorized was slightly faster in Igor). But reordering the loops did speed up the vectorized form.
In your second part of the code, that is setting umn2, inside the loops, you have:
nn = n + 1;
m = 1:2:n;
numOfEl = ceil(n/2);
Those 3 lines don't require any input from the i and j loops, they only use the n loop. So reordering the loops such that i and j are inside the n loop will mean that those 3 lines are done xElements^2 (100^2) times less often. I suspect it is that m = 1:2:n line that takes time, since that is allocating an array.

Gamma function approximation

I'm trying to write a three-parameter method that approximates the gamma function over a certain interval. The approximation should be a right-endpoint Riemann sum.
The gamma function is given by:
GAMMA(s) =
inf
INT x^(s-1) * exp(-x) dx
0
The right-endpoint Riemann sum approximation over the interval (0, m) should therefore be:
GAMMA(s) ~
m
SUM ((m/n)*i)^(s-1) * exp(-(m/n)*i) * delta_x where delta_x = (m/n)
i=1
My code is as follows:
def gamma(x = 4.0, n = 100000, m = 2500)
array = *(1..n)
result = array.inject(0) {|sum, i| sum + ((((m/n)*i)**(x-1))*((2.7183)**(-(m/n)*i))*(m/n))}
end
puts gamma
The code should return an approximation for 3! = 6, but instead it returns 0.0. Any ideas where I may be going wrong?
The problem is that when you do m/n
you are doing an integer division (eg 3/4 = 0) when you expect a float division (3/4 = 0.75)
you need to define your n and m as floats.
You can rewrite it as
def gamma(x = 4.0, n = 100000, m = 2500)
n = n.to_f
m = m.to_f
(1..n).to_a.inject(0) do |sum, i|
sum + ((((m/n)*i)**(x-1))*((Math::E)**(-(m/n)*i))*(m/n))
end
end
PS: also you do not need the array and result variables.
PS2: consider using Math::E instead of 2.7183
Your problem was identified by #xlembouras. You might consider writing the method as follows.
Code
def gamma(x = 4.0, n = 100000, m = 2500)
ratio = m.to_f/n
xm1 = x-1.0
ratio * (1..m).inject(0) do |sum,i|
ixratio = i*ratio
sum + ixratio**xm1 * Math.exp(-ixratio)
end
end
Examples
gamma(x=4.0, n= 40, m =10).round(6) #=> 1.616233
gamma.round(6) #=> 6.0
Please confirm these calculations are correct.

split rectangle in cells / random coordinates / store in array in FORTRAN

I would like to split a rectangle in cells. In each cell it should be create a random coordinate (y, z).
The wide and height of the rectangle are known (initialW / initalH).
The size of the cells are calculated (dy / dz).
The numbers, in how many cells the rectangle to be part, are known. (numberCellsY / numberCellsZ)
Here my Code in Fortran to split the rectangle in Cells:
yRVEMin = 0.0
yRVEMax = initialW
dy = ( yRVEMax - yRVEMin ) / numberCellsY
zRVEMin = 0.0
zRVEMax = initialH
dz = ( zRVEMax - zRVEMin ) / numberCellsZ
do i = 1, numberCellsY
yMin(i) = (i-1)*dy
yMax(i) = i*dy
end do
do j = 1, numberCellsZ
zMin(j) = (j-1)*dz
zMax(j) = j*dz
end do
Now I would like to produce a random coordinate in each cell. The problem for me is, to store the coodinates in an array. It does not necessarily all be stored in one array, but as least as possible.
To fill the cells with coordinates it should start at the bottom left cell, go through the rows (y-direction), and after the last cell (numberCellsY) jump a column higher (z-dicrection) and start again by the first cell of the new row at left side. That should be made so long until a prescribed number (nfibers) is reached.
Here a deplorable try to do it:
call random_seed
l = 0
do k = 1 , nfibers
if (l < numberCellsY) then
l = l + 1
else
l = 1
end if
call random_number(y)
fiberCoordY(k) = yMin(l) + y * (yMax(l) - yMin(l))
end do
n = 0
do m = 1 , nfibers
if (n < numberCellsZ) then
n = n + 1
else
n = 1
end if
call random_number(z)
fiberCoordZ(m) = zMin(n) + z * (zMax(n) - zMin(n))
end do
The output is not what I want! fiberCoordZ should be stay on (zMin(1) / zMax(1) as long as numberCellsY-steps are reached.
The output for following settings:
nfibers = 9
numberCellsY = 3
numberCellsZ = 3
initialW = 9.0
initialH = 9.0
My random output for fiberCoordY is:
1.768946 3.362770 8.667685 1.898700 5.796713 8.770239 2.463412 3.546694 7.074708
and for fiberCoordZ is:
2.234807 5.213032 6.762228 2.948657 5.937295 8.649946 0.6795220 4.340364 8.352566
In this case the first 3 numbers of fiberCoordz should have a value between 0.0 and 3.0. Than number 4 - 6 a value between 3.0 and 6.0. And number 7 - 9 a value bewtween 6.0 - 9.0.
How can I solve this? If somebody has a solution with a better approach, please post it!
Thanks
Looking at
n = 0
do m = 1 , nfibers
if (n < numberCellsZ) then
n = n + 1
else
n = 1
end if
call random_number(z)
fiberCoordZ(m) = zMin(n) + z * (zMax(n) - zMin(n))
end do
we see that the z coordinate offset (the bottom cell boundary of interest) is being incremented inappropriately: for each consecutive nfibers/numberCellsZ coordinates n should be constant.
n should be incremented only every numberCellsY iterations, so perhaps a condition like
if (MOD(m, numberCellsY).eq.1) n=n+1
would be better.
Thanks francescalus! It works fine.
I added a little more for the case that nfibers > numberCellsY*numberCellsZ
n=0
do m = 1 , nfibers
if (MOD(m, numberCellsY).eq.1 .and. (n < numberCellsY)) then
n=n+1
end if
if (MOD(m, numberCellsY*numberCellsZ).eq.1 ) then
n = 1
end if
call random_number(z)
fiberCoordZ(m) = zMin(n) + z * (zMax(n) - zMin(n))
end do

How to accelerate matlab code?

I'm using matlab to implement a multilayer neural network. In the code I represent
the value of each node AS netValue{k}
the weight between layer k and k + 1 AS weight{k}
etc.
Since these data is three-dimensional, I have to use cell to hold a 2-D matrix to enable matrix multiply.
So it becomes really really slow to train the model, which I expect to have resulted from the usage of cell.
Can anyone tell me how to accelerate this code? Thanks
clc;
close all;
clear all;
input = [-2 : 0.4 : 2;-2:0.4:2];
ican = 4;
depth = 4; % total layer - 1, by convension
[featureNum , sampleNum] = size(input);
levelNum(1) = featureNum;
levelNum(2) = 5;
levelNum(3) = 5;
levelNum(4) = 5;
levelNum(5) = 2;
weight = cell(0);
for k = 1 : depth
weight{k} = rand(levelNum(k+1), levelNum(k)) - 2 * rand(levelNum(k+1) , levelNum(k));
threshold{k} = rand(levelNum(k+1) , 1) - 2 * rand(levelNum(k+1) , 1);
end
runCount = 0;
sumMSE = 1; % init MSE
minError = 1e-5;
afa = 0.1; % step of "gradient ascendence"
% training loop
while(runCount < 100000 & sumMSE > minError)
sumMSE = 0; % sum of MSE
for i = 1 : sampleNum % sample loop
netValue{1} = input(:,i);
for k = 2 : depth
netValue{k} = weight{k-1} * netValue{k-1} + threshold{k-1}; %calculate each layer
netValue{k} = 1 ./ (1 + exp(-netValue{k})); %apply logistic function
end
netValue{depth+1} = weight{depth} * netValue{depth} + threshold{depth}; %output layer
e = 1 + sin((pi / 4) * ican * netValue{1}) - netValue{depth + 1}; %calc error
assistS{depth} = diag(ones(size(netValue{depth+1})));
s{depth} = -2 * assistS{depth} * e;
for k = depth - 1 : -1 : 1
assistS{k} = diag((1-netValue{k+1}).*netValue{k+1});
s{k} = assistS{k} * weight{k+1}' * s{k+1};
end
for k = 1 : depth
weight{k} = weight{k} - afa * s{k} * netValue{k}';
threshold{k} = threshold{k} - afa * s{k};
end
sumMSE = sumMSE + e' * e;
end
sumMSE = sqrt(sumMSE) / sampleNum;
runCount = runCount + 1;
end
x = [-2 : 0.1 : 2;-2:0.1:2];
y = zeros(size(x));
z = 1 + sin((pi / 4) * ican .* x);
% test
for i = 1 : length(x)
netValue{1} = x(:,i);
for k = 2 : depth
netValue{k} = weight{k-1} * netValue{k-1} + threshold{k-1};
netValue{k} = 1 ./ ( 1 + exp(-netValue{k}));
end
y(:, i) = weight{depth} * netValue{depth} + threshold{depth};
end
plot(x(1,:) , y(1,:) , 'r');
hold on;
plot(x(1,:) , z(1,:) , 'g');
hold off;
Have you used the profiler to find out what functions are actually slowing down your code? It shows what lines take the most time to execute.

Resources