How to have different OpenMP threads execute different tasks - openmp

I am using open MP to speed up the flux calculation in my program. I basically want OpenMP to carry out both of these left and right flux calculations in parallel. But on the contrary, the following code takes even more time with the #pragma directives. What do i modify to get it right?
#pragma omp parallel num_threads(2)
{
#pragma omp single
{//first condition
//cerr<<"Executed thread 0"<<endl;
if ( (fabs(lcellMach-1.0)<EPSILON) || ( (lcellMach-1.0) > 0.0 ) ){//purpose of Epsilon!!!!
FluxP[0] = rhol * vnl;
FluxP[1] = rhol * ul * vnl + Pl*nx;
FluxP[2] = rhol * vl * vnl + Pl*ny;
FluxP[3] = rhol * wl * vnl + Pl*nz;
FluxP[4] = rhol * ((GAMMA * Pl / (rhol * (GAMMA-1.0))) + ((ul*ul + vl*vl + wl*wl)/2.0)) * vnl;
}else if ( (fabs(lcellMach+1.0)<EPSILON) || ( (lcellMach+1.0) < 0.0 ) ){
FluxP[0] = FluxP[1] = FluxP[2] = FluxP[3] = FluxP[4] = 0.0;// If flow direction is opposite the Flux + is zero
}else {
double ql = (ul*ul + vl*vl + wl*wl);// how did this come
FluxP[0] = rhol * lcell_a * (lcellMach+1.0)*(lcellMach+1.0) / 4.0;
FluxP[1] = FluxP[0] * ( ul + (nx*(0.0-vnl + 2.0*lcell_a)/GAMMA) );
FluxP[2] = FluxP[0] * ( vl + (ny*(0.0-vnl + 2.0*lcell_a)/GAMMA) );
FluxP[3] = FluxP[0] * ( wl + (nz*(0.0-vnl + 2.0*lcell_a)/GAMMA) );
FluxP[4] = FluxP[0] * ( ((ql - vnl*vnl)/2.0) + (((GAMMA-1.0)*vnl + 2.0*lcell_a)*((GAMMA-1.0)*vnl + 2.0*lcell_a) / (2.0*(GAMMA*GAMMA-1.0))) );
}
}//end of 1st
#pragma omp single
{//second condition
//cerr<<"Executed thread 1"<<endl;
if ((fabs(rcellMach+1.0)<EPSILON) || ((rcellMach+1.0) < 0.0)) {
FluxM[0] = rhor * vnr;
FluxM[1] = rhor * ur * vnr + Pr*nx;
FluxM[2] = rhor * vr * vnr + Pr*ny;
FluxM[3] = rhor * wr * vnr + Pr*nz;
FluxM[4] = rhor * ((GAMMA * Pr / (rhor * (GAMMA-1.0))) + ((ur*ur + vr*vr + wr*wr)/2.0)) * vnr;
}else if ((fabs(rcellMach-1.0)<EPSILON) || ((rcellMach-1.0) > 0.0)) {
FluxM[0] = FluxM[1] = FluxM[2] = FluxM[3] = FluxM[4] = 0.0;
}else {
tempFlux[0] = rhor * vnr;
tempFlux[1] = rhor * ur * vnr + Pr*nx;
tempFlux[2] = rhor * vr * vnr + Pr*ny;
tempFlux[3] = rhor * wr * vnr + Pr*nz;
tempFlux[4] = rhor * ((GAMMA * Pr / (rhor * (GAMMA-1.0))) + ((ur*ur + vr*vr + wr*wr)/2.0)) * vnr;
double qr = (ur*ur + vr*vr + wr*wr);
tempFluxP[0] = rhor * rcell_a * (rcellMach+1.0)*(rcellMach+1.0) / 4.0;
tempFluxP[1] = tempFluxP[0] * ( ur + (nx*(0.0-vnr + 2.0*rcell_a)/GAMMA) );
tempFluxP[2] = tempFluxP[0] * ( vr + (ny*(0.0-vnr + 2.0*rcell_a)/GAMMA) );
tempFluxP[3] = tempFluxP[0] * ( wr + (nz*(0.0-vnr + 2.0*rcell_a)/GAMMA) );
tempFluxP[4] = tempFluxP[0] * ( ((qr - vnr*vnr)/2.0) + (((GAMMA-1.0)*vnr + 2.0*rcell_a)*((GAMMA-1.0)*vnr + 2.0*rcell_a) / (2.0*(GAMMA*GAMMA-1.0))) );
for (int j=0; j<O; j++) FluxM[j] = tempFlux[j] - tempFluxP[j];
}
}
}//pragma
Urgent help required. Thanks.

What you need is the sections construct:
#pragma omp parallel sections num_threads(2)
{
#pragma omp section
{
... code that updates FluxP ...
}
#pragma omp section
{
... code that updates FluxM ...
}
}
But your code doesn't seem like it would take much time to do the calculations (no big for loops inside for example) so the overhead that OpenMP will put onto it will most likely be more time consuming than the saving in computation time and hence the parallel version will most likely execute slower than the serial.

Related

Colour Difference DeltaE 2000

I am trying to Calculate the CIE Colour Difference DeltaE 2000 based on DE2000 Formula. I have done as per the formula provided in the website, but I am getting strange delta E values. I am confused where I have gone wrong. I have checked manytimes but I am not able to find the mistake.Can someone tell me which part of my code has problem.
function DE_2K = CIEDE2000(Lab1,Lab2)
labuno=Lab1
labdos=Lab2
L1=labuno(1)
a1=labuno(2)
b1=labuno(3)
L2=labdos(1)
a2=labdos(2)
b2=labdos(3)
%*******************************************************************
% Definition for CIE DE2000
%*******************************************************************
L_bar_dash=(L1+L2)/2;
C1 = sqrt((a1)^2+(b1)^2)
C2 = sqrt((a2)^2+(b2)^2)
C_bar = (C1+C2)/2
G = (1 -sqrt(((C_bar)^7)/((C_bar)^7+(25)^7))/2)
a1_dash = a1*(1+G)
a2_dash = a2*(1+G)
C1_dash = sqrt((a1_dash)^2+(b1)^2)
C2_dash = sqrt((a2_dash)^2+(b2)^2)
C_bar_dash = (C1_dash + C2_dash)/2
if (radtodeg(atan(b1/a1_dash)) >= 0 ) h1_dash = radtodeg(atan(b1/a1_dash))
else h1_dash = radtodeg(atan(b1/a1_dash)) + radtodeg(2*pi)
end
if (radtodeg(atan(b2/a2_dash)) >= 0 ) h2_dash = radtodeg(atan(b2/a2_dash))
else h2_dash = radtodeg(atan(b2/a2_dash)) + radtodeg(2*pi)
end
if ((h1_dash - h2_dash) > radtodeg(pi)) H_bar_dash = (h1_dash + h2_dash + radtodeg(2*pi))/2
else H_bar_dash = (h1_dash + h2_dash)/2
end
T = 1 - 0.17*radtodeg(cos(H_bar_dash-radtodeg(pi/6)))+0.24*radtodeg(cos(2*H_bar_dash))+0.32*radtodeg(cos(3*H_bar_dash + radtodeg(pi/30)))- 0.20*radtodeg(cos(4*H_bar_dash + 63))
if ((abs(h2_dash - h1_dash)) <= radtodeg(pi)) DE_h_dash = h2_dash - h1_dash
elseif(abs(h2_dash - h1_dash) > radtodeg(pi) && h2_dash <= h1_dash) DE_h_dash = h2_dash - h1_dash + radtodeg(2*pi)
else DE_h_dash = h2_dash - h1_dash - radtodeg(2*pi)
end
DE_L_dash = L2 - L1
DE_C_dash = C2_dash - C1_dash
DE_H_dash = 2 * sqrt(C1_dash * C2_dash) * radtodeg(sin(DE_h_dash/2))
S_L = 1 + ((0.015 * (L_bar_dash - 50)^2)/sqrt(20 + (L_bar_dash - 50)^2))
S_C = 1 + (0.045 * C_bar_dash)
S_H = 1 + (0.015 * C_bar_dash * T)
DE_angle = 30 * exp( - ((H_bar_dash - 275)/25)^2)
R_C = 2 * sqrt((C_bar_dash)^7/((C_bar_dash)^7 + (25)^7))
R_T = - R_C * radtodeg(sin(2 * DE_angle))
K_L = 1
K_C = 1
K_H = 1
DE_2K = sqrt( (DE_L_dash/(K_L * S_L))^2 + (DE_C_dash/(K_C * S_C))^2 + (DE_H_dash/(K_H * S_H))^2 + (R_T * (DE_C_dash/(K_C * S_C)) * (DE_H_dash/(K_H * S_H))))
end
There are some problems in your calculations:
a) if ((h1_dash - h2_dash) > radtodeg(pi)) : don't you need to take the abs of this?
b) 20*radtodeg(cos(4*H_bar_dash + 63) : you need -63 here
c) I assume your if-else structure correctly handles the three cases; you may need to check that:
....else DE_h_dash = h2_dash - h1_dash - radtodeg(2*pi)
d) sin is a number not in degrees, not in radians so no need to convert here:
radtodeg(sin(DE_h_dash/2))
e) same here: radtodeg(sin(2 * DE_angle))
f) I assume cos/sin take degrees; you many need to double check what is degrees what is radians everywhere.

Particle system running slowly

here is update function. As soon as i turn update on my program gets slower. I'm not even able to render 25000 particles at a time. Voxels is a 3 dimensional array. How to i change my update function so that the calculations is done faster. i want to able to render at least 100000 particles.
function update(){
newTime = Date.now();
elapsedTime = newTime - oldTime;
oldTime = newTime;
for(var index =0 ; index < particles.vertices.length; index++){
//particle's old position
var oldPosition = particles.vertices[index];
//making sure particles do not og out of boundary
if (oldPosition.x > screenSquareLength || oldPosition.x < -screenSquareLength){
oldPosition.x = 2 * screenSquareLength * Math.random() - screenSquareLength;
}
if (oldPosition.y > screenSquareLength || oldPosition.y < -screenSquareLength){
oldPosition.y = 2 * screenSquareLength * Math.random() - screenSquareLength;
}
if (oldPosition.z > screenSquareDepth/2 || oldPosition.z < -screenSquareDepth/2){
oldPosition.z = screenSquareDepth * Math.random() - screenSquareDepth/2;
}
var oldVelocity = particlesExtraInfo[index].velocity;
var fieldVelocity;
var xIndex, yIndex, zIndex;
try{
//calculating index of voxel
xIndex = Math.floor(( oldPosition.x + screenSquareLength ) / voxelSize);
yIndex = Math.floor(( oldPosition.y + screenSquareLength ) / voxelSize);
zIndex = Math.floor(( screenSquareDepth / 2 - oldPosition.z) / voxelSize);
//getting velocity, color for particle and if voxel is
fieldVelocity = voxels[zIndex][xIndex][yIndex].userData["velocity"];
particleColor = voxels[zIndex][xIndex][yIndex].userData["color"];
activeVoxel = voxels[zIndex][xIndex][yIndex].userData["visible"];
}catch (e){
console.log("indexX = "+xIndex + " \t Yindex = "+ yIndex+" \t zIndex = "+ zIndex);
}
var particleColor;
var activeVoxel;
try{
var vx = ((oldVelocity.x + fieldVelocity.x) * elapsedTime);
var vy = ((oldVelocity.y + fieldVelocity.y) * elapsedTime);
var vz = ((oldVelocity.z + fieldVelocity.z) * elapsedTime);
var magnitude = Math.abs(vx) + Math.abs(vy) + Math.abs(vz); //Math.sqrt(vx*vx + vy*vy+ vz*vz);
var normalized = new THREE.Vector3(vx / magnitude, vy / magnitude, vz / magnitude);
if((particles.vertices[index].x < 0.1 && particles.vertices[index].x > -0.1) && (particles.vertices[index].y < 0.1 && particles.vertices[index].y > -0.1) && (particles.vertices[index].z < 0.1 && particles.vertices[index].z > -0.1) ){
particles.vertices[index].x = 2 * screenSquareLength * Math.random() - screenSquareLength;;
particles.vertices[index].y = 2 * screenSquareLength * Math.random() - screenSquareLength;;
particles.vertices[index].z = 2 * screenSquareLength * Math.random() - screenSquareLength;;
}
//if voxel is not part of the model update particle postion and velocity
if( activeVoxel == 0){
particles.colors[index] = new THREE.Color(particleColor);//new THREE.Color(0, 0, 1);
particles.colorsNeedUpdate = true;
particles.vertices[index].x += normalized.x/slowingFactor;
particles.vertices[index].y += normalized.y/slowingFactor;
particles.vertices[index].z += normalized.z/slowingFactor;
particles.verticesNeedUpdate = true;
particlesExtraInfo[index].velocity = normalized;
}else{
//voxel is part of particle so update color property of particle
particles.colors[index] = new THREE.Color(0, 0, 1);
particles.colorsNeedUpdate = true;
particles.vertices[index].x += normalized.x/(slowingFactor * 200);
particles.vertices[index].y += normalized.y/(slowingFactor * 200);
particles.vertices[index].z += normalized.z/(slowingFactor * 200);
particles.verticesNeedUpdate = true;
particlesExtraInfo[index].velocity = new THREE.Vector3( normalized.x/slowingFactor, normalized.y/slowingFactor, normalized.z/slowingFactor );
}
}catch(e){
}
}
}
I don't know much about what exactly happens when you update a buffer like this, but I know that it can be slow.
While 25k may be a lot for what you're trying to do (i experimented with 5k and had trouble) there is no reason why you can't optimize your JS before trying to move everything to the gpu (for example).
var foo = 0;
foo+= normalized.x / someFactor;
//better done this way:
var invSomeFactor = 1/someFactor;
// now you avoid dividing the same thing many times in your loop
foo += normalized.x * invSomeFactor;
Math.random() is pretty expensive, you could make a look up table (a large one) and fetch these precomputed values from it.
var myLookupTable = [];
var MAX_VALUES = 2048;
for ( var i = 0 ; i < MAX_VALUES ; i ++ ){
myLookupTable.push(Math.random());
}
//and then you can have a stride for example
var RAND_STRIDE = 0;
//and in the loop
someVec.x += something.x * myLookupTable[ RAND_STRIDE ++ ];
RAND_STRIDE %= MAX_VALUES; //read from the beginning
Finally, you can write a fragment shader, that would read from a buffer, and write into another buffer doing all this logic in the process. Each fragment is your particle and once you run this pass and compute your positions, you need to be able to read the buffer in your particle vertex shader and just assign those positions.

in runnable_avg_period+1, why should I add 1?

kernel/sched/fair.c
static inline void __update_task_entity_contrib(struct sched_entity *se)
{
u32 contrib;
/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
contrib /= (se->avg.runnable_avg_period + 1);
se->avg.load_avg_contrib = scale_load(contrib);
trace_sched_task_load_contrib(task_of(se), se->avg.load_avg_contrib);
contrib = se->avg.runnable_avg_sum * scale_load_down(NICE_0_LOAD);
contrib /= (se->avg.runnable_avg_period + 1);
.....
this code calculates the below equation.
runnable_avg_sum
load_avg_contrib = ----------------------------- * weight
runnable_avg_period + 1
and:
runnable_avg_sum
load_avg_ratio = ----------------------------- * NICE_0_LOAD
runnable_avg_period + 1
Why should we use runnable_avg_period + 1 in this equation?
Why can't we just use runnable_avg_period?

How to make a vertical wave with using canvas?

I changed the script by adding text instead of an image.
I want to wave was vertically downwards. I know that a little editing but I tried different options and it did not work.
http://jsfiddle.net/7ynn4/3/
var options = {
period:100,
squeeze:0,
wavelength:40,
amplitude:30,
shading:300,
fps:30
}
var ca = document.getElementById('canvas');
var ctx = ca.getContext('2d');
ctx.canvas.width = 400;
ctx.canvas.height = 150;
ctx.font = 'bold 45pt Arial';
ctx.textAlign = 'center';
ctx.fillStyle = 'blue';
ctx.fillText('Hello World', 170, 60);
w = canvas.width,
h = canvas.height,
od = ctx.getImageData( 0, 0, w, h ).data;
setInterval(function() {
var id = ctx.getImageData( 0, 0, w, h ),
d = id.data,
now = ( new Date() )/options.period,
y,
x,
lastO,
shade,
sq = ( y - h/2 ) * options.squeeze,
px,
pct,
o,
y2,
opx;
for ( y = 0; y < h; y += 1 ) {
lastO = 0;
shade = 0;
sq = ( y - h/2 ) * options.squeeze;
for ( x = 0; x < w; x += 1 ) {
px = ( y * w + x ) * 4;
pct = x/w;
o = Math.sin( x/options.wavelength - now ) * options.amplitude * pct;
y2 = y + ( o + sq * pct ) << 0;
opx = ( y2 * w + x ) * 4;
shade = (o-lastO) * options.shading;
d[px ] = od[opx ]+shade;
d[px+1] = od[opx+1]+shade;
d[px+2] = od[opx+2]+shade;
d[px+3] = od[opx+3];
lastO = o;
}
}
ctx.putImageData( id, 0, 0 );
},
1000/options.fps
);
You just flip the values around so that x is affected instead of y -
... vars cut, but replace y2 with x2 ...
/// reversed from here
for (x = 0; x < w; x += 1) {
lastO = 0;
shade = 0;
sq = (x - w * 0.5) * options.squeeze;
for (y = 0; y < h; y += 1) {
px = (y * w + x) * 4;
pct = y / h;
o = Math.sin(y/options.wavelength-now) * options.amplitude * pct;
/// the important one: you might need to compensate here (-5)
x2 = x - 5 + (o + sq * pct) | 0;
opx = (x2 + y * w) * 4;
shade = (o - lastO) * options.shading;
d[px] = od[opx] + shade;
d[px + 1] = od[opx + 1] + shade;
d[px + 2] = od[opx + 2] + shade;
d[px + 3] = od[opx + 3];
lastO = o;
}
}
ctx.putImageData(id, 0, 0);
MODIFIED FIDDLE HERE

How to accelerate matlab code?

I'm using matlab to implement a multilayer neural network. In the code I represent
the value of each node AS netValue{k}
the weight between layer k and k + 1 AS weight{k}
etc.
Since these data is three-dimensional, I have to use cell to hold a 2-D matrix to enable matrix multiply.
So it becomes really really slow to train the model, which I expect to have resulted from the usage of cell.
Can anyone tell me how to accelerate this code? Thanks
clc;
close all;
clear all;
input = [-2 : 0.4 : 2;-2:0.4:2];
ican = 4;
depth = 4; % total layer - 1, by convension
[featureNum , sampleNum] = size(input);
levelNum(1) = featureNum;
levelNum(2) = 5;
levelNum(3) = 5;
levelNum(4) = 5;
levelNum(5) = 2;
weight = cell(0);
for k = 1 : depth
weight{k} = rand(levelNum(k+1), levelNum(k)) - 2 * rand(levelNum(k+1) , levelNum(k));
threshold{k} = rand(levelNum(k+1) , 1) - 2 * rand(levelNum(k+1) , 1);
end
runCount = 0;
sumMSE = 1; % init MSE
minError = 1e-5;
afa = 0.1; % step of "gradient ascendence"
% training loop
while(runCount < 100000 & sumMSE > minError)
sumMSE = 0; % sum of MSE
for i = 1 : sampleNum % sample loop
netValue{1} = input(:,i);
for k = 2 : depth
netValue{k} = weight{k-1} * netValue{k-1} + threshold{k-1}; %calculate each layer
netValue{k} = 1 ./ (1 + exp(-netValue{k})); %apply logistic function
end
netValue{depth+1} = weight{depth} * netValue{depth} + threshold{depth}; %output layer
e = 1 + sin((pi / 4) * ican * netValue{1}) - netValue{depth + 1}; %calc error
assistS{depth} = diag(ones(size(netValue{depth+1})));
s{depth} = -2 * assistS{depth} * e;
for k = depth - 1 : -1 : 1
assistS{k} = diag((1-netValue{k+1}).*netValue{k+1});
s{k} = assistS{k} * weight{k+1}' * s{k+1};
end
for k = 1 : depth
weight{k} = weight{k} - afa * s{k} * netValue{k}';
threshold{k} = threshold{k} - afa * s{k};
end
sumMSE = sumMSE + e' * e;
end
sumMSE = sqrt(sumMSE) / sampleNum;
runCount = runCount + 1;
end
x = [-2 : 0.1 : 2;-2:0.1:2];
y = zeros(size(x));
z = 1 + sin((pi / 4) * ican .* x);
% test
for i = 1 : length(x)
netValue{1} = x(:,i);
for k = 2 : depth
netValue{k} = weight{k-1} * netValue{k-1} + threshold{k-1};
netValue{k} = 1 ./ ( 1 + exp(-netValue{k}));
end
y(:, i) = weight{depth} * netValue{depth} + threshold{depth};
end
plot(x(1,:) , y(1,:) , 'r');
hold on;
plot(x(1,:) , z(1,:) , 'g');
hold off;
Have you used the profiler to find out what functions are actually slowing down your code? It shows what lines take the most time to execute.

Resources