```
@jit(nopython=True, parallel=False, fastmath=True)
def func(q, H, N):
    q_H = q * (2 * H + 1)
    q_N = q * N
    array_3d = np.zeros((q_H, q_H, q_H), dtype=np.float64)
    vect = np.random.random(q_N)
    M = np.random.random((q_N, q_H))
    invM = M.T  # the original referenced an undefined G1; M.T is assumed here
    for l in range(q_H):
        yl = M[:, l]
        for m in range(l, q_H):
            ym = M[:, m]
            calc_vec = yl * ym * vect
            array_3d[:, l, m] = invM @ calc_vec
            array_3d[:, m, l] = array_3d[:, l, m]
    return array_3d
```
q : int (1,2,...,9)
H : int (3,5,10,15,20,25,30)
N : int (512, 1024, 2048, 4096)
I am trying to optimize the computation time of this function. Thanks for your help.
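One direction worth trying (a sketch, not a drop-in Numba replacement, and assuming `invM = M.T` as in the code above): since `array_3d[i, l, m] = sum_n M[n, i] * M[n, l] * M[n, m] * vect[n]`, the whole `(i, m)` slab for a fixed `l` is a single matrix product, so the inner double loop can be handed to BLAS:

```
import numpy as np

def func_blas(q, H, N):
    # Same quantities as in the question's function.
    q_H = q * (2 * H + 1)
    q_N = q * N
    vect = np.random.random(q_N)
    M = np.random.random((q_N, q_H))
    array_3d = np.empty((q_H, q_H, q_H))
    for l in range(q_H):
        # One GEMM per l fills array_3d[:, l, :]; the symmetry in (l, m)
        # is reproduced automatically because M appears twice.
        w = M[:, l] * vect
        array_3d[:, l, :] = M.T @ (M * w[:, None])
    return array_3d
```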
I don't know which R commands can be used to generate datasets with combinations of parameters (dimension, cluster proportions, model). I use these commands:
```
library(mvtnorm)  # provides rmvnorm

formmatrix = function(a, b, c, d, e, f, p){
  m1 = c(0, 8, rep(0, p-2))
  m2 = c(8, 0, rep(0, p-2))
  m3 = c(-8, -8, rep(0, p-2))
  sig1 = diag(c(1, a, rep(1, p-2)))
  sig2 = diag(c(b, c, rep(1, p-2)))
  sig3 = matrix(c(d, e, e, f), 2, 2, byrow = T)
  sig3 = cbind(sig3, matrix(rep(0, 2*(p-2)), 2))
  sig3 = rbind(sig3, cbind(t(matrix(rep(0, 2*(p-2)), 2)), diag(rep(1, p-2))))
  return(list(m1=m1, m2=m2, m3=m3, sig1=sig1, sig2=sig2, sig3=sig3))
}

p = 2
sim1 = formmatrix(1, 1, 1, 1, 0, 1, p)
sim2 = formmatrix(5, 1, 5, 1, 0, 5, p)
sim3 = formmatrix(5, 5, 1, 3, -2, 3, p)
sim4 = formmatrix(1, 20, 5, 15, -10, 15, p)
sim5 = formmatrix(1, 45, 30, 15, -10, 15, p)

mixt1 <- rbind(rmvnorm(200, mean = sim1$m1, sigma = sim1$sig1),
               rmvnorm(300, mean = sim1$m2, sigma = sim1$sig2),
               rmvnorm(500, mean = sim1$m3, sigma = sim1$sig3))
```
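To run the simulation over every combination of settings, the missing piece is enumerating the parameter grid. A sketch of the idea in Python (in R, `expand.grid` plays the same role; all values below are placeholders, not from the post):

```
from itertools import product

# Hypothetical parameter grid: dimensions, cluster sizes, and models.
dimensions = [2, 5, 10]
cluster_sizes = [(200, 300, 500), (333, 333, 334)]
models = ["sim1", "sim2", "sim3"]

for p, sizes, model in product(dimensions, cluster_sizes, models):
    # Generate one dataset per combination here.
    print(p, sizes, model)
```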
I am thinking about using distributed computation in a problem I face. Suppose I have an index k which increases from 1 to 800 (for instance), and for each k I have a pool p of large size with many numbers stored in it. I want to build the k-th pool recursively. The protocol is: if I know the (k-1)-th pool, I can randomly choose two values z1, z2 from it and get a new value through a function f, like z = f(z1, z2). I store z in the k-th pool and repeat this many times until the pool is full, and then I build the (k+1)-th pool from the k-th pool.
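In language-agnostic terms, the recursion has this shape (a minimal Python sketch; `f` and `pool_size` are placeholders for the real update rule and size):

```
import random

def next_pool(prev_pool, f, pool_size):
    """Build pool k from pool k-1: combine two random draws until full."""
    new_pool = []
    while len(new_pool) < pool_size:
        z1 = random.choice(prev_pool)
        z2 = random.choice(prev_pool)
        new_pool.append(f(z1, z2))
    return new_pool

# pools[k] = next_pool(pools[k-1], f, pool_size) for k = 1, ..., 800
```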
Due to the large size of the pool, I am trying to use parallel computation to speed up my Julia code: pmap, with a SharedArray as my (k-1)-th pool within each k. So I wrote the following code:
```
using Distributed
addprocs(10)

@everywhere using LinearAlgebra
@everywhere using StatsBase
@everywhere using Statistics
@everywhere using DoubleFloats
@everywhere using StaticArrays
@everywhere using SharedArrays
@everywhere using JLD
@everywhere using Dates
@everywhere using Random
@everywhere using Printf

# Haar-random n x n unitary: QR of a complex Gaussian times random phases.
@everywhere function rand_haar2(::Val{n}) where n
    M = @SMatrix randn(ComplexDF64, n, n)
    q = qr(M).Q
    L = cispi.(2 .* @SVector(rand(Double64, n)))
    return q * diagm(L)
end

@everywhere function pool_calc(theta, pool::SharedArray, Np)
    Random.seed!(myid())
    pool_store = zeros(Double64, Np)
    Kup = @SMatrix[Double64(cos(theta)) 0; 0 Double64(sin(theta))]
    Kdown = @SMatrix[Double64(sin(theta)) 0; 0 Double64(cos(theta))]
    P2up = kron(@SMatrix[Double64(1.) 0.; 0. 1.], @SMatrix[1 0; 0 0])
    P2down = kron(@SMatrix[Double64(1) 0; 0 1], @SMatrix[0 0; 0 1])
    poolcount = 0
    poolsize = length(pool)
    while poolcount < Np
        # Draw two entries of the previous pool and combine them.
        z1 = pool[rand(1:poolsize)]
        rho1 = diagm(@SVector[z1, 1-z1])
        z2 = pool[rand(1:poolsize)]
        rho2 = diagm(@SVector[z2, 1-z2])
        u1 = rand_haar2_slower(Val{2}())
        u2 = rand_haar2_slower(Val{2}())
        K1up = u1*Kup*u1'
        K1down = u1*Kdown*u1'
        K2up = u2*Kup*u2'
        K2down = u2*Kdown*u2'
        rho1p = K1up*rho1*K1up'
        rho2p = K2up*rho2*K2up'
        p1 = real(tr(rho1p+rho1p'))/2
        p2 = real(tr(rho2p+rho2p'))/2
        if rand() < p1
            rho1p = (rho1p+rho1p')/(2*p1)
        else
            rho1p = K1down*rho1*K1down'/(1-p1)
        end
        if rand() < p2
            rho2p = (rho2p+rho2p')/(2*p2)
        else
            rho2p = K2down*rho2*K2down'/(1-p2)
        end
        rho = kron(rho1p, rho2p)
        U = rand_haar2_slower(Val{4}())
        rho_p = P2up*U*rho*U'*P2up'
        p = real(tr(rho_p+rho_p'))/2
        if rand() < p
            temp = (rho_p+rho_p')/2
            rho_f = @SMatrix[temp[1,1]+temp[2,2] temp[1,3]+temp[2,4]; temp[3,1]+temp[4,2] temp[3,3]+temp[4,4]]/p
        else
            temp = P2down*U*rho*U'*P2down'
            rho_f = @SMatrix[temp[1,1]+temp[2,2] temp[1,3]+temp[2,4]; temp[3,1]+temp[4,2] temp[3,3]+temp[4,4]]/(1-p)
        end
        rho_f = (rho_f+rho_f')/2
        t = abs(tr(rho_f*rho_f))
        z = (1-t)/(1+abs(sqrt(2*t-1)))
        if !iszero(abs(z))
            poolcount = poolcount + 1
            pool_store[poolcount] = abs(z)
        end
    end
    return pool_store
end

function main()
    theta = parse(Double64, ARGS[1])
    Nk = parse(Int, ARGS[2])
    S_curve = zeros(Double64, Nk)
    S_var = zeros(Double64, Nk)
    Npool = Int(floor(10^6))
    pool = SharedArray{Double64}(Npool)
    pool_sample = zeros(Double64, Npool)
    spool = zeros(Double64, Npool)
    pool .= 0.5
    for k = 1:800
        ret = pmap(Np->pool_calc(theta = theta, pool = pool, Np = Np), fill(10^5, 10))
        pool_target = reduce(vcat, [ret[i][1] for i = 1:10])
        spool .= -pool_target .* log.(pool_target) .- (1.0 .- pool_target) .* log1p.(-pool_target)
        S_curve[k] = mean(spool)
        S_var[k] = (std(spool)/sqrt(Npool))^2
        pool = pool_target
    end
    label = @sprintf "%.3f" Float32(theta)
    save("entropy_real_128p_$(label)_ps6.jld", "s", S_curve, "t", S_var)
end

main();
```
But I faced an error
How can I solve this problem?
Thanks
In the problem I'm working on there is a part of the code shown below. The definition part is just there to show you the sizes of the arrays. Below it I pasted the vectorized version, and it is more than 2x slower. Why does that happen? I know it happens when vectorization requires large temporary variables, but (it seems) that is not the case here.
And generally, what (other than parfor, which I already use) can I do to speed up this code?
```
maxN = 100;
levels = maxN + 1;
xElements = 101;
umn = complex(zeros(levels, levels));
umn2 = umn;
bessels = ones(xElements, xElements, levels); % 1.09 GB
posMcontainer = ones(xElements, xElements, maxN);

tic
for j = 1 : xElements
    for i = 1 : xElements
        for n = 1 : 2 : maxN
            nn = n + 1;
            mm = 1;
            for m = 1 : 2 : n
                umn(nn, mm) = bessels(i, j, nn) * posMcontainer(i, j, m);
                mm = mm + 1;
            end
        end
    end
end
toc % 0.520594 seconds

tic
for j = 1 : xElements
    for i = 1 : xElements
        for n = 1 : 2 : maxN
            nn = n + 1;
            m = 1:2:n;
            numOfEl = ceil(n/2);
            umn2(nn, 1:numOfEl) = bessels(i, j, nn) * posMcontainer(i, j, m);
        end
    end
end
toc % 1.275926 seconds

sum(sum(umn - umn2)) % verifying that all was done right
```
Best regards,
Alex
From the profiler:
Edit:
In reply to @Jason's answer, this alternative takes the same time:
```
for n = 1:2:maxN
    nn(n) = n + 1;
    numOfEl(n) = ceil(n/2);
end

for j = 1 : xElements
    for i = 1 : xElements
        for n = 1 : 2 : maxN
            umn2(nn(n), 1:numOfEl(n)) = bessels(i, j, nn(n)) * posMcontainer(i, j, 1:2:n);
        end
    end
end
```
Edit2:
In reply to @EBH:
The point is to do the following:
```
parfor i = 1 : xElements
    for j = 1 : xElements
        umn = complex(zeros(levels, levels)); % cleaning
        for n = 0:maxN
            mm = 1;
            for m = -n:2:n
                nn = n + 1; % for indexing
                if m < 0
                    umn(nn, mm) = bessels(i, j, nn) * negMcontainer(i, j, abs(m));
                end
                if m > 0
                    umn(nn, mm) = bessels(i, j, nn) * posMcontainer(i, j, m);
                end
                if m == 0
                    umn(nn, mm) = bessels(i, j, nn);
                end
                mm = mm + 1; % for indexing
            end % m
        end % n
        beta1 = sum(sum(Aj1.*umn));
        betaSumSq1(i, j) = abs(beta1).^2;
        beta2 = sum(sum(Aj2.*umn));
        betaSumSq2(i, j) = abs(beta2).^2;
    end % j
end % i
```
I sped it up as much as I was able to. What you have written takes only the last bessels and posMcontainer values, so it does not produce the same result. In the real code, those two containers are filled not with ones but with precalculated values.
After your edit, I can see that umn is just a temporary variable for another calculation. It can still be mostly vectorized:
```
betaSumSq1 = zeros(xElements); % preallocating
betaSumSq2 = zeros(xElements); % preallocating

% an index matrix to fetch the right values from negMcontainer and
% posMcontainer:
indmat = tril(repmat([0 1;1 0],ceil((maxN+1)/2),floor(levels/2)));
indmat(end,:) = [];

% an index matrix to fetch the values in correct order for umn:
b_ind = repmat([1;0],ceil((maxN+1)/2),1);
b_ind(end) = [];
tempind = logical([fliplr(indmat) b_ind indmat+triu(ones(size(indmat)))]);

% permute the arrays to prevent squeeze:
PM = permute(posMcontainer,[3 1 2]);
NM = permute(negMcontainer,[3 1 2]);
B = permute(bessels,[3 1 2]);

for k = 1 : maxN+1 % third dim
    for jj = 1 : xElements % columns
        b = B(:,jj,k); % get one vector of B
        % perform b*NM for every row of NM*indmat, then flip the result:
        neg = fliplr(bsxfun(@times,bsxfun(@times,indmat,NM(:,jj,k).'),b));
        % perform b*PM for every row of PM*indmat:
        pos = bsxfun(@times,bsxfun(@times,indmat,PM(:,jj,k).'),b);
        temp = [neg mod(1:levels,2).'.*b pos].'; % concat neg and pos
        % assign them to the right place in umn:
        umn = reshape(temp(tempind.'),[levels levels]).';
        beta1 = Aj1.*umn;
        betaSumSq1(jj,k) = abs(sum(beta1(:))).^2;
        beta2 = Aj2.*umn;
        betaSumSq2(jj,k) = abs(sum(beta2(:))).^2;
    end
end
```
This reduces the running time from ~95 seconds to less than 3 seconds (both without parfor), an improvement of almost 97%.
I would suspect it is memory allocation. You are re-allocating the m array inside a triply nested loop. Try rearranging the code:
```
tic
for n = 1 : 2 : maxN
    nn = n + 1;
    m = 1:2:n;
    numOfEl = ceil(n/2);
    for j = 1 : xElements
        for i = 1 : xElements
            umn2(nn, 1:numOfEl) = bessels(i, j, nn) * posMcontainer(i, j, m);
        end
    end
end
toc % 1.275926 seconds
```
I was trying this in Igor Pro, which is a similar language but with different optimizations, so the direct translations don't time the same way as MATLAB (vectorized was slightly faster in Igor). But reordering the loops did speed up the vectorized form.
In the second part of your code, the one that sets umn2, inside the loops you have:
```
nn = n + 1;
m = 1:2:n;
numOfEl = ceil(n/2);
```
Those 3 lines don't require any input from the i and j loops; they only use the n loop. So reordering the loops such that i and j are inside the n loop means that those 3 lines are done xElements^2 (100^2) times less often. I suspect it is the m = 1:2:n line that takes the time, since it allocates an array. The same effect can be reproduced in any language; see the sketch below.
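A hypothetical Python micro-benchmark of that reordering (names and sizes are illustrative, not taken from the post):

```
import time

xElements, maxN = 101, 100

def invariant_inside():
    s = 0
    for j in range(xElements):
        for i in range(xElements):
            for n in range(1, maxN + 1, 2):
                m = list(range(1, n + 1, 2))  # rebuilt xElements^2 times per n
                s += m[-1]
    return s

def invariant_hoisted():
    s = 0
    for n in range(1, maxN + 1, 2):
        m = list(range(1, n + 1, 2))          # built once per n
        for j in range(xElements):
            for i in range(xElements):
                s += m[-1]
    return s

for fn in (invariant_inside, invariant_hoisted):
    t0 = time.perf_counter()
    fn()
    print(fn.__name__, time.perf_counter() - t0)
```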
I'm trying to develop the adaptive unsharp algorithm described by Polesel et al. in the article "Image Enhancement via Adaptive Unsharp Masking" (link to the article). The core of the algorithm is the minimization of a cost function defined as:
J(m,n) = E[e(m,n)^2] = E[(gd(m,n)-gy(m,n))^2]
where E[] is the statistical expectation and gy(m,n) is:
gy(m,n) = gx(m,n) + lambda1(m,n)*gzx(m,n) + lambda2(m,n)*gzy(m,n);
I want to find lambda1 and lambda2 for each pixel in order to minimize the cost function in each pixel.
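As a sanity check on whatever the optimizer returns (this derivation is mine, not from the post or the paper): since gy is linear in the lambdas, minimizing J at each pixel is an ordinary least-squares problem. Writing $e_0 = g_d - g_x$ and setting the partial derivatives of J to zero gives the 2x2 normal equations

$$
\begin{pmatrix} E[g_{zx}^2] & E[g_{zx} g_{zy}] \\ E[g_{zx} g_{zy}] & E[g_{zy}^2] \end{pmatrix}
\begin{pmatrix} \lambda_1 \\ \lambda_2 \end{pmatrix}
=
\begin{pmatrix} E[e_0\, g_{zx}] \\ E[e_0\, g_{zy}] \end{pmatrix},
$$

which can be solved in closed form per pixel, with the expectations approximated over a local window.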
Here is the code that I wrote so far:
```
function [ o_sharpened_image ] = AdaptativeUnsharpMask( i_image, t1, t2 )
%ADAPTATIVEUNSHARPMASK Summary of this function goes here
%   Detailed explanation goes here
    if isa(i_image, 'dip_image')
        i_image = dip_array(i_image);
    end

    if ~isfloat(i_image)
        i_image = im2double(i_image);
    end

    adh = 4;
    adl = 3;
    g = [-1 -1 -1; -1 8 -1; -1 -1 -1];
    dim = size(i_image);

    lambda_x = 0.5*ones(dim);
    lambda_y = 0.5*ones(dim);

    z_x = conv2(i_image, [-1 2 -1], 'same');
    z_y = conv2(i_image, [-1; 2; -1], 'same');
    g_x = conv2(i_image, g, 'same');
    g_zx = conv2(z_x, g, 'same');
    g_zy = conv2(z_y, g, 'same');

    a = ones(dim);
    variance_map = colfilt(i_image, [3 3], 'sliding', @var);
    a(variance_map >= t1 & variance_map < t2) = adh;
    a(variance_map >= t2) = adl;

    g_d = a.*g_x;

    lambda = [lambda_x lambda_y];
    lambda0 = lambda;
    lambda_min = lsqnonlin(@(lambda) UnsharpCostFunction(lambda, g_d, g_zx, g_zy), lambda0);
    o_sharpened_image = i_image + lambda_min(:, 1:size(i_image,2)).*z_x + lambda_min(:, size(i_image,2)+1:end).*z_y;
end
```
Here is the code of the cost function:
```
function [ J ] = UnsharpCostFunction( i_lambda, i_gd, i_gzx, i_gzy )
%UNSHARPCOSTFUNCTION Summary of this function goes here
    gy = i_gd + i_lambda(:, 1:size(i_gd,2)).*i_gzx + i_lambda(:, size(i_gd,2)+1:end).*i_gzy;
    J = mean((i_gd(:) - gy(:)).^2);
end
```
At each iteration I print the value of the J function to the command window, and it is always the same. What am I doing wrong?
Thank you.
I have a problem that I can't seem to solve. I want a query to determine whether a given value lies within a predefined range, but my loop is very slow for big datasets. Is there a more efficient way?
```
clear all
close all

Regression(1,1) = 1.001415645694801;
Regression(1,2) = 0.043822386790753;

FF_Value(:,1) = [24.24 30.77 31.37 29.05 29.20 29.53 29.67 27.78];
FF_Value(:,2) = [24.16 30.54 31.15 29.53 29.39 29.34 29.53 28.17];

FF_Distance = FF_Value(:,2) - (Regression(1,2) + Regression(1,1)*FF_Value(:,1));
FF_Distance_Positiv = sort(FF_Distance(FF_Distance > 0));
FF_Distance_Positiv(FF_Distance_Positiv == 0) = [];
FF_Distance_Negativ = sort(FF_Distance(FF_Distance < 0), 'descend');
FF_Distance_Negativ(FF_Distance_Negativ == 0) = [];

A = repmat(FF_Distance_Positiv, length(FF_Distance_Negativ), 1);
B = repmat(FF_Distance_Negativ', length(FF_Distance_Positiv), 1);
C = reshape(B, [length(FF_Distance_Positiv)*length(FF_Distance_Negativ), 1]);

Recognition(:,1) = A;
Recognition(:,2) = C;
FF_Recognition = zeros(length(FF_Value), 1);

for i = 1:length(Recognition)
    for j = 1:length(FF_Value)
        if (Regression(1,2) + Recognition(i,1)) + Regression(1,1)*FF_Value(j,1) >= FF_Value(j,2) && ...
           (Regression(1,2) + Recognition(i,2)) + Regression(1,1)*FF_Value(j,1) <= FF_Value(j,2)
            FF_Recognition(j,1) = 1;
        end
    end
end
```
Welcome to the world of bsxfun, replacing your world of repmat:
```
%------------ Original code -----------------------------------------
FF_Distance = FF_Value(:,2) - (Regression(1,2) + Regression(1,1)*FF_Value(:,1));
FF_Distance_Positiv = sort(FF_Distance(FF_Distance > 0));
FF_Distance_Positiv(FF_Distance_Positiv == 0) = [];

%// Note for performance: if the number of elements satisfying
%// `FF_Distance_Positiv == 0` is large, consider doing this instead -
%// `FF_Distance_Positiv = FF_Distance_Positiv(FF_Distance_Positiv~=0)`.
%// Follow this strategy for `FF_Distance_Negativ` too.

FF_Distance_Negativ = sort(FF_Distance(FF_Distance < 0), 'descend');
FF_Distance_Negativ(FF_Distance_Negativ == 0) = [];

%------- Added vectorization replacing `repmat`s and nested loops ------------
mult = Regression(1,1)*FF_Value(:,1);

y1 = bsxfun(@plus, Regression(1,2), FF_Distance_Positiv);
y2 = bsxfun(@plus, y1.', mult);
mc1 = bsxfun(@ge, y2, FF_Value(:,2));

z1 = bsxfun(@plus, Regression(1,2), FF_Distance_Negativ);
z2 = bsxfun(@plus, z1.', mult);
mc2 = bsxfun(@le, z2, FF_Value(:,2));

FF_Recognition = all([any(mc1,2) any(mc2,2)], 2);
```
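For readers coming from other array languages, the repmat-to-bsxfun move is the generic tiling-to-broadcasting move; a minimal NumPy sketch of the same idea (illustrative only, not a port of the code above):

```
import numpy as np

a = np.random.rand(5, 1)   # column vector
b = np.random.rand(1, 7)   # row vector

# repmat-style: materialize both full 5x7 arrays, then add.
tiled = np.tile(a, (1, 7)) + np.tile(b, (5, 1))

# bsxfun-style: broadcasting expands virtually, with no large temporaries.
broadcast = a + b

assert np.allclose(tiled, broadcast)
```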