Why is 1 added to runnable_avg_period (runnable_avg_period + 1)?

/*
 * Recompute a scheduling entity's load contribution from its runnable
 * averages: runnable_avg_sum * weight / (runnable_avg_period + 1).
 * NOTE(review): the braces of the function body are missing in this
 * excerpt, and the result of the second computation (scaled by
 * NICE_0_LOAD) is not stored anywhere in the visible lines.
 */
static inline void __update_task_entity_contrib(struct sched_entity *se)
u32 contrib;
/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
/* the +1 makes the divisor 1 rather than 0 when runnable_avg_period is 0 */
contrib /= (se->avg.runnable_avg_period + 1);
se->avg.load_avg_contrib = scale_load(contrib);
trace_sched_task_load_contrib(task_of(se), se->avg.load_avg_contrib);
/* same ratio again, scaled by NICE_0_LOAD instead of the entity's weight */
contrib = se->avg.runnable_avg_sum * scale_load_down(NICE_0_LOAD);
contrib /= (se->avg.runnable_avg_period + 1);
This code calculates the equations below.
$$\text{load\_avg\_contrib} = \frac{\text{runnable\_avg\_sum}}{\text{runnable\_avg\_period} + 1} \times \text{weight}$$
$$\text{load\_avg\_ratio} = \frac{\text{runnable\_avg\_sum}}{\text{runnable\_avg\_period} + 1} \times \text{NICE\_0\_LOAD}$$
Why should we use runnable_avg_period + 1 in this equation?
Why can't we just use runnable_avg_period?


How to calculate the mean of 3D matrices in an image without NaN?

I need to calculate the mean of a 3D matrix (the last step in the code). However, the (diff_dataframe./dataframe_vor) calculation produces many NaNs, so when I use this code some results are NaN. How can I calculate the mean of this matrix while ignoring the NaNs? The code is below.
% Test-pattern script: for each block size, draw random +/-1 blocks into
% channels 2 and 3 of a YUV frame, convert to RGB with S.yuv2rgb, add and
% subtract the pattern (scaled by each amplitude) from a constant frame,
% write both frames as JPEGs at each quality, and store one mean relative
% error per (block size, amplitude, quality) combination.
% NOTE(review): the closing `end` statements, S.quality and diff_dataframe
% do not appear in this excerpt.
S.amplitude = 1:20;%:20;
S.blocksize = [1 2 3 4 5 6 8 10 12 15 20];
S.frameWidth = 1920;
S.frameHeight = 1080;
% constant frame with every sample equal to 127
image = 127*ones(S.frameHeight,S.frameWidth,3);
% 3x3 matrix used below to turn the YUV planes into RGB planes
S.yuv2rgb = [1 0 1.28033; 1 -0.21482 -0.38059; 1 2.12798 0];
% i_bs, i_a and i_q are the running indices into error_mean
i_bs = 0;
for BS = S.blocksize
i_bs = i_bs + 1;
hblocks = S.frameWidth / BS;
vblocks = S.frameHeight / BS;
i_a = 0;
% one random 0/1 value per block and per chroma plane
dataU = randi([0 1],vblocks,hblocks);
dataV = randi([0 1],vblocks,hblocks);
dataframe_yuv = zeros(S.frameHeight, S.frameWidth, 3);
% fill each BS-by-BS block with -1 or +1 (the 0/1 value mapped through *2-1)
for x = 1 : hblocks
for y = 1 : vblocks
dataframe_yuv((y-1)*BS+1:y*BS, ...
(x-1)*BS+1:x*BS, 2) = dataU(y,x) * 2 - 1;
dataframe_yuv((y-1)*BS+1:y*BS, ...
(x-1)*BS+1:x*BS, 3) = dataV(y,x) * 2 - 1;
% per-plane matrix product: rgb(:,:,k) = sum over j of yuv2rgb(k,j) * yuv(:,:,j)
dataframe_rgb(:,:,1) = S.yuv2rgb(1,1) * dataframe_yuv(:,:,1) + ...
S.yuv2rgb(1,2) * dataframe_yuv(:,:,2) + ...
S.yuv2rgb(1,3) * dataframe_yuv(:,:,3);
dataframe_rgb(:,:,2) = S.yuv2rgb(2,1) * dataframe_yuv(:,:,1) + ...
S.yuv2rgb(2,2) * dataframe_yuv(:,:,2) + ...
S.yuv2rgb(2,3) * dataframe_yuv(:,:,3);
dataframe_rgb(:,:,3) = S.yuv2rgb(3,1) * dataframe_yuv(:,:,1) + ...
S.yuv2rgb(3,2) * dataframe_yuv(:,:,2) + ...
S.yuv2rgb(3,3) * dataframe_yuv(:,:,3);
for A = S.amplitude
i_a = i_a + 1;
i_q = 0;
% pattern added to / subtracted from the constant frame, rounded to integers
image1p = round(image + dataframe_rgb * A);
image1n = round(image - dataframe_rgb * A);
% half the difference of the two frames, scaled by 1/255; this is 0 wherever
% rounding leaves image1p equal to image1n
dataframe_vor = ((image1p-image1n)/2)/255;
for Q = S.quality
i_q = i_q + 1;
% output file names encode block size, amplitude and quality
namestrp = ['greyjpegs/Img_BS' num2str(BS) '_A' num2str(A) '_Q' num2str(Q) '_1p.jpg'];
namestrn = ['greyjpegs/Img_BS' num2str(BS) '_A' num2str(A) '_Q' num2str(Q) '_1n.jpg'];
imwrite(image1p/255,namestrp,'jpg', 'Quality', Q);
imwrite(image1n/255,namestrn,'jpg', 'Quality', Q);
% mean of |diff_dataframe ./ dataframe_vor| over all elements; a zero in
% dataframe_vor yields NaN (0/0) or Inf in the quotient, and a single NaN
% makes the whole mean NaN
error_mean(i_bs, i_a, i_q) = mean2((abs(diff_dataframe./dataframe_vor)));
mean2 is a convenience function from the Image Processing Toolbox that computes the mean of all elements of a 2D region; it has no special handling for NaN. In that case, simply remove all values that are NaN and average what remains. Note that removing the NaN values by logical indexing unrolls the 2D region into a 1D vector, so we can simply use mean. As an additional check against division by zero, also remove any Inf values.
Therefore, replace this line:
error_mean(i_bs, i_a, i_q) = mean2((abs(diff_dataframe./dataframe_vor)));
... with:
% Element-wise relative error; 0/0 produces NaN and x/0 produces Inf.
tmp = abs(diff_dataframe ./ dataframe_vor);
% Keep an entry only if it is neither NaN nor Inf. The two tests must be
% combined with AND: with OR every entry passes, because no value is both
% NaN and Inf at the same time.
mask = ~isnan(tmp) & ~isinf(tmp);
% Logical indexing returns the kept entries as a vector.
tmp = tmp(mask);
if isempty(tmp)
    % Nothing valid is left; the mean of an empty vector would be NaN.
    error_mean(i_bs, i_a, i_q) = 0;
else
    error_mean(i_bs, i_a, i_q) = mean(tmp);
end
We first assign the desired operation to a temporary variable, use isnan and isinf to remove the offending values, and then find the average of the rest. One intricacy is that if your entire region is NaN or Inf, removing all of these entries leaves an empty vector, and the mean of an empty vector is undefined. A separate check handles this case: if the vector is empty, simply assign the value 0 instead.

Colour Difference DeltaE 2000

I am trying to calculate the CIE colour difference DeltaE 2000 based on the DE2000 formula. I have followed the formula provided on the website, but I am getting strange delta E values. I am confused about where I have gone wrong: I have checked many times but I am not able to find the mistake. Can someone tell me which part of my code has the problem?
function DE_2K = CIEDE2000(Lab1,Lab2)
% CIEDE2000  Colour difference between two colours (CIE DE2000 formula).
%   DE_2K = CIEDE2000(Lab1,Lab2) returns the value computed on the last line.
%   NOTE(review): L1, a1, b1, L2, a2, b2 and L_bar_dash are used below but
%   are never assigned in this excerpt; presumably they are unpacked from
%   Lab1 and Lab2 -- confirm. The `end` of every if/else is also missing.
% Definition for CIE DE2000
% magnitude of (a, b) for each input, and the mean of the two
C1 = sqrt((a1)^2+(b1)^2)
C2 = sqrt((a2)^2+(b2)^2)
C_bar = (C1+C2)/2
% NOTE(review): as written only the sqrt term is divided by 2, i.e. this is
% 1 - sqrt(...)/2; the formula may intend (1 - sqrt(...))/2 -- confirm
G = (1 -sqrt(((C_bar)^7)/((C_bar)^7+(25)^7))/2)
% a values rescaled by (1+G), then the magnitudes recomputed with them
a1_dash = a1*(1+G)
a2_dash = a2*(1+G)
C1_dash = sqrt((a1_dash)^2+(b1)^2)
C2_dash = sqrt((a2_dash)^2+(b2)^2)
C_bar_dash = (C1_dash + C2_dash)/2
% angle of (a', b) in degrees; 360 is added when the angle is negative
% NOTE(review): atan(b/a) only returns angles in (-90, 90) degrees and cannot
% tell opposite quadrants apart; atan2(b, a) may be intended -- confirm
if (radtodeg(atan(b1/a1_dash)) >= 0 ) h1_dash = radtodeg(atan(b1/a1_dash))
else h1_dash = radtodeg(atan(b1/a1_dash)) + radtodeg(2*pi)
if (radtodeg(atan(b2/a2_dash)) >= 0 ) h2_dash = radtodeg(atan(b2/a2_dash))
else h2_dash = radtodeg(atan(b2/a2_dash)) + radtodeg(2*pi)
% mean of the two angles, shifted by half a turn when they differ by more than 180
if ((h1_dash - h2_dash) > radtodeg(pi)) H_bar_dash = (h1_dash + h2_dash + radtodeg(2*pi))/2
else H_bar_dash = (h1_dash + h2_dash)/2
% NOTE(review): radtodeg is applied to the results of cos, and the arguments
% passed to cos are degree values -- check the units of every term
T = 1 - 0.17*radtodeg(cos(H_bar_dash-radtodeg(pi/6)))+0.24*radtodeg(cos(2*H_bar_dash))+0.32*radtodeg(cos(3*H_bar_dash + radtodeg(pi/30)))- 0.20*radtodeg(cos(4*H_bar_dash + 63))
% angle difference wrapped by +/-360 when its magnitude exceeds 180
if ((abs(h2_dash - h1_dash)) <= radtodeg(pi)) DE_h_dash = h2_dash - h1_dash
elseif(abs(h2_dash - h1_dash) > radtodeg(pi) && h2_dash <= h1_dash) DE_h_dash = h2_dash - h1_dash + radtodeg(2*pi)
else DE_h_dash = h2_dash - h1_dash - radtodeg(2*pi)
% the three difference terms
DE_L_dash = L2 - L1
DE_C_dash = C2_dash - C1_dash
% NOTE(review): radtodeg is applied to the result of sin here as well
DE_H_dash = 2 * sqrt(C1_dash * C2_dash) * radtodeg(sin(DE_h_dash/2))
% weighting functions for the three difference terms
S_L = 1 + ((0.015 * (L_bar_dash - 50)^2)/sqrt(20 + (L_bar_dash - 50)^2))
S_C = 1 + (0.045 * C_bar_dash)
S_H = 1 + (0.015 * C_bar_dash * T)
% cross-term factor R_T built from DE_angle and R_C
DE_angle = 30 * exp( - ((H_bar_dash - 275)/25)^2)
R_C = 2 * sqrt((C_bar_dash)^7/((C_bar_dash)^7 + (25)^7))
R_T = - R_C * radtodeg(sin(2 * DE_angle))
% all three K factors are fixed to 1
K_L = 1
K_C = 1
K_H = 1
% square root of the three squared, weighted differences plus the R_T cross term
DE_2K = sqrt( (DE_L_dash/(K_L * S_L))^2 + (DE_C_dash/(K_C * S_C))^2 + (DE_H_dash/(K_H * S_H))^2 + (R_T * (DE_C_dash/(K_C * S_C)) * (DE_H_dash/(K_H * S_H))))
There are some problems in your calculations:
a) if ((h1_dash - h2_dash) > radtodeg(pi)) : don't you need to take the abs of this?
b) 20*radtodeg(cos(4*H_bar_dash + 63) : you need -63 here
c) I assume your if-else structure correctly handles the three cases; you may need to check that:
....else DE_h_dash = h2_dash - h1_dash - radtodeg(2*pi)
d) The result of sin is a plain number, neither in degrees nor in radians, so there is no need to convert it here: radtodeg(sin(DE_h_dash/2))
e) same here: radtodeg(sin(2 * DE_angle))
f) I assume cos/sin take degrees; you may need to double-check what is in degrees and what is in radians everywhere.

splitting trapezoid in given proportion

I need to split a trapezoid into two parts of given areas with a line parallel to its bases, and I need to find the height h1 of the new trapezoid.
For example I have trapezoid of area S and I want to split it in 2 trapezoids of areas S1 and S2.
S1 = aS; S2 = (1-a)S;
S1 = (a+z)*(h1)/2;
S2 = (b+z)*(1-h1)/2;
S1/S2 = KS;
To get the new h1 I compare a and b: if a != b, I solve a quadratic equation, and if a == b I treat the trapezoid as a rectangle. But sometimes I get wrong results because of rounding (for example, solving analytically gives a = b, but the program thinks a > b). How can I handle this? Or is there a better way to split a trapezoid?
Here is the simplified code:
// Solve a_x*h^2 + b_x*h + c_x = 0 for h_tmp (presumably the height at which
// the trapezoid is split), with the area ratio KS = S1 / S2.
// b_t is always the larger of base / base_prev and a_t the smaller one.
// NOTE(review): several closing braces are missing in this excerpt, and the
// second branch uses h without assigning it in the visible lines.
if (base > base_prev) {
b_t = base; // base of trapezoid
h = H; //height of trapezoid
a_t = base_prev; //another base of trapezoid
KS = S1 / S2;
// quadratic coefficients; a_x shrinks towards 0 as the two bases get closer,
// which makes the division by (2 * a_x) below ill-conditioned
a_x = (a_t - b_t) * (1 + KS) / h;
b_x = 2 * KS * b_t + 2 * b_t;
c_x = -(a_t * h + b_t * h);
// try the "+" root first ...
h_tmp = (-b_x + sqrt(b_x * b_x - 4 * a_x * c_x)) / (2 * a_x);
// ... and fall back to the "-" root when it lies outside [0, h]
if (h_tmp > h || h_tmp < 0)
h_tmp = (-b_x - sqrt(b_x * b_x - 4 * a_x * c_x)) / (2 * a_x);
} else if (base < base_prev) {
// same computation with the roles of base and base_prev swapped
b_t = base_prev;
a_t = base;
KS = S1 / S2;
a_x = (a_t - b_t) * (1 + KS) / h;
b_x = 2 * KS * b_t + 2 * b_t;
c_x = -(a_t * h + b_t * h);
h_tmp = (-b_x + sqrt(b_x * b_x - 4 * a_x * c_x)) / (2 * a_x);
if (h_tmp > h || h_tmp < 0)
h_tmp = (-b_x - sqrt(b_x * b_x - 4 * a_x * c_x)) / (2 * a_x);
else {
// bases compare exactly equal: no quadratic, h_tmp is h scaled by KS
KS = S1 / S2;
h_tmp = h * KS;
If you're dealing with catastrophic cancellation, one approach, dating back to a classic article by Forsythe, is to use the alternative solution form x = 2c/(-b -+ sqrt(b^2 - 4ac)) for the quadratic equation ax^2 + bx + c = 0. One way to write the two roots, good for b < 0, is
x = (-b + sqrt(b^2 - 4ac))/(2a)
x = 2c/(-b + sqrt(b^2 - 4ac)),
and another, good for b >= 0, is
x = 2c/(-b - sqrt(b^2 - 4ac))
x = (-b - sqrt(b^2 - 4ac))/(2a).
Alternatively, you could use the bisection method to obtain a reasonably good guess and polish it with Newton's method.

How to accelerate matlab code?

I'm using MATLAB to implement a multilayer neural network. In the code I represent
the value of each node as netValue{k}
the weights between layer k and layer k + 1 as weight{k}
Since this data is three-dimensional, I have to use a cell array holding one 2-D matrix per layer so that I can use matrix multiplication.
Training the model is really slow, which I suspect is caused by the use of cell arrays.
Can anyone tell me how to accelerate this code? Thanks
% Trains a small fully connected network sample by sample to reproduce
% 1 + sin((pi/4) * ican * x), then evaluates it on a finer grid and plots
% the network output (red) against the target (green).
% NOTE(review): the `end` of every for/while loop is missing in this excerpt.
close all;
clear all;
% 2-by-11 input: both rows hold the same samples -2:0.4:2
input = [-2 : 0.4 : 2;-2:0.4:2];
ican = 4;
depth = 4; % total layers - 1, by convention
[featureNum , sampleNum] = size(input);
% levelNum(k) is the number of nodes in layer k
levelNum(1) = featureNum;
levelNum(2) = 5;
levelNum(3) = 5;
levelNum(4) = 5;
levelNum(5) = 2;
weight = cell(0);
% random initial weights and thresholds; rand - 2*rand lies in (-2, 1)
for k = 1 : depth
weight{k} = rand(levelNum(k+1), levelNum(k)) - 2 * rand(levelNum(k+1) , levelNum(k));
threshold{k} = rand(levelNum(k+1) , 1) - 2 * rand(levelNum(k+1) , 1);
runCount = 0;
sumMSE = 1; % init MSE
minError = 1e-5;
afa = 0.1; % step size multiplying every weight/threshold update
% training loop
while(runCount < 100000 & sumMSE > minError)
sumMSE = 0; % sum of MSE
for i = 1 : sampleNum % sample loop
% forward pass: layer 1 is the input column, layers 2..depth use the logistic
netValue{1} = input(:,i);
for k = 2 : depth
netValue{k} = weight{k-1} * netValue{k-1} + threshold{k-1}; %calculate each layer
netValue{k} = 1 ./ (1 + exp(-netValue{k})); %apply logistic function
netValue{depth+1} = weight{depth} * netValue{depth} + threshold{depth}; %output layer
e = 1 + sin((pi / 4) * ican * netValue{1}) - netValue{depth + 1}; %calc error
% backward pass: assistS{depth} is an identity matrix (no logistic applied to
% the output layer)
assistS{depth} = diag(ones(size(netValue{depth+1})));
s{depth} = -2 * assistS{depth} * e;
for k = depth - 1 : -1 : 1
% diagonal matrix of a.*(1-a), the logistic derivative at layer k+1
assistS{k} = diag((1-netValue{k+1}).*netValue{k+1});
s{k} = assistS{k} * weight{k+1}' * s{k+1};
% update step: subtract afa times the sensitivity term from weights and thresholds
for k = 1 : depth
weight{k} = weight{k} - afa * s{k} * netValue{k}';
threshold{k} = threshold{k} - afa * s{k};
sumMSE = sumMSE + e' * e;
sumMSE = sqrt(sumMSE) / sampleNum;
runCount = runCount + 1;
% evaluation grid (finer than the training grid) and the target values on it
x = [-2 : 0.1 : 2;-2:0.1:2];
y = zeros(size(x));
z = 1 + sin((pi / 4) * ican .* x);
% test
for i = 1 : length(x)
netValue{1} = x(:,i);
for k = 2 : depth
netValue{k} = weight{k-1} * netValue{k-1} + threshold{k-1};
netValue{k} = 1 ./ ( 1 + exp(-netValue{k}));
y(:, i) = weight{depth} * netValue{depth} + threshold{depth};
% first row only: network output in red, target in green
plot(x(1,:) , y(1,:) , 'r');
hold on;
plot(x(1,:) , z(1,:) , 'g');
hold off;
Have you used the profiler to find out what functions are actually slowing down your code? It shows what lines take the most time to execute.

How to have different OpenMP threads execute different tasks

I am using OpenMP to speed up the flux calculation in my program. I basically want OpenMP to carry out the left and right flux calculations in parallel. However, the following code takes even more time with the #pragma directives than without them. What should I modify to get it right?
// Computes the two flux vectors FluxP (from the l-suffixed state variables)
// and FluxM (from the r-suffixed ones), choosing a formula by comparing the
// cell Mach number with +1 and -1. Each computation sits in its own `single`
// region inside one two-thread parallel region.
// NOTE(review): per the OpenMP specification a `single` region is executed by
// one thread of the team and, without `nowait`, ends with an implicit barrier,
// so the second region cannot start before the first one has finished.
// NOTE(review): the braces of the parallel region and several closing braces
// are not present in this excerpt.
#pragma omp parallel num_threads(2)
#pragma omp single
{//first condition
//cerr<<"Executed thread 0"<<endl;
// Mach >= 1 (within EPSILON): FluxP is the full flux of the l state
if ( (fabs(lcellMach-1.0)<EPSILON) || ( (lcellMach-1.0) > 0.0 ) ){//purpose of Epsilon!!!!
FluxP[0] = rhol * vnl;
FluxP[1] = rhol * ul * vnl + Pl*nx;
FluxP[2] = rhol * vl * vnl + Pl*ny;
FluxP[3] = rhol * wl * vnl + Pl*nz;
FluxP[4] = rhol * ((GAMMA * Pl / (rhol * (GAMMA-1.0))) + ((ul*ul + vl*vl + wl*wl)/2.0)) * vnl;
}else if ( (fabs(lcellMach+1.0)<EPSILON) || ( (lcellMach+1.0) < 0.0 ) ){
FluxP[0] = FluxP[1] = FluxP[2] = FluxP[3] = FluxP[4] = 0.0;// If flow direction is opposite the Flux + is zero
}else {
// -1 < Mach < 1: every component is FluxP[0] times a per-component factor
double ql = (ul*ul + vl*vl + wl*wl);// how did this come
FluxP[0] = rhol * lcell_a * (lcellMach+1.0)*(lcellMach+1.0) / 4.0;
FluxP[1] = FluxP[0] * ( ul + (nx*(0.0-vnl + 2.0*lcell_a)/GAMMA) );
FluxP[2] = FluxP[0] * ( vl + (ny*(0.0-vnl + 2.0*lcell_a)/GAMMA) );
FluxP[3] = FluxP[0] * ( wl + (nz*(0.0-vnl + 2.0*lcell_a)/GAMMA) );
FluxP[4] = FluxP[0] * ( ((ql - vnl*vnl)/2.0) + (((GAMMA-1.0)*vnl + 2.0*lcell_a)*((GAMMA-1.0)*vnl + 2.0*lcell_a) / (2.0*(GAMMA*GAMMA-1.0))) );
}//end of 1st
#pragma omp single
{//second condition
//cerr<<"Executed thread 1"<<endl;
// Mach <= -1 (within EPSILON): FluxM is the full flux of the r state
if ((fabs(rcellMach+1.0)<EPSILON) || ((rcellMach+1.0) < 0.0)) {
FluxM[0] = rhor * vnr;
FluxM[1] = rhor * ur * vnr + Pr*nx;
FluxM[2] = rhor * vr * vnr + Pr*ny;
FluxM[3] = rhor * wr * vnr + Pr*nz;
FluxM[4] = rhor * ((GAMMA * Pr / (rhor * (GAMMA-1.0))) + ((ur*ur + vr*vr + wr*wr)/2.0)) * vnr;
}else if ((fabs(rcellMach-1.0)<EPSILON) || ((rcellMach-1.0) > 0.0)) {
FluxM[0] = FluxM[1] = FluxM[2] = FluxM[3] = FluxM[4] = 0.0;
}else {
// -1 < Mach < 1: FluxM = full flux (tempFlux) minus the plus part (tempFluxP)
tempFlux[0] = rhor * vnr;
tempFlux[1] = rhor * ur * vnr + Pr*nx;
tempFlux[2] = rhor * vr * vnr + Pr*ny;
tempFlux[3] = rhor * wr * vnr + Pr*nz;
tempFlux[4] = rhor * ((GAMMA * Pr / (rhor * (GAMMA-1.0))) + ((ur*ur + vr*vr + wr*wr)/2.0)) * vnr;
double qr = (ur*ur + vr*vr + wr*wr);
tempFluxP[0] = rhor * rcell_a * (rcellMach+1.0)*(rcellMach+1.0) / 4.0;
tempFluxP[1] = tempFluxP[0] * ( ur + (nx*(0.0-vnr + 2.0*rcell_a)/GAMMA) );
tempFluxP[2] = tempFluxP[0] * ( vr + (ny*(0.0-vnr + 2.0*rcell_a)/GAMMA) );
tempFluxP[3] = tempFluxP[0] * ( wr + (nz*(0.0-vnr + 2.0*rcell_a)/GAMMA) );
tempFluxP[4] = tempFluxP[0] * ( ((qr - vnr*vnr)/2.0) + (((GAMMA-1.0)*vnr + 2.0*rcell_a)*((GAMMA-1.0)*vnr + 2.0*rcell_a) / (2.0*(GAMMA*GAMMA-1.0))) );
// NOTE(review): the loop bound is the identifier O (capital letter), which is
// not defined in this excerpt; the arrays are indexed 0..4 above -- confirm
for (int j=0; j<O; j++) FluxM[j] = tempFlux[j] - tempFluxP[j];
Urgent help required. Thanks.
What you need is the sections construct:
#pragma omp parallel sections num_threads(2)
#pragma omp section
... code that updates FluxP ...
#pragma omp section
... code that updates FluxM ...
But your code doesn't seem like it would take much time to do the calculations (no big for loops inside for example) so the overhead that OpenMP will put onto it will most likely be more time consuming than the saving in computation time and hence the parallel version will most likely execute slower than the serial.
