OpenACC call to cuMemFreeHost returned error - openacc

I'm using the PGI C compiler (pgcc 16.10.0, 64-bit) to learn how to program with OpenACC.
Here is my code to simulate the process of particle transport:
typedef struct {
double position;
double direction;
double weight;
int cell;
int group;
int alive;
} Particle;
int size = 100000; // number of particles to be simulated
int tot = (int) (1.3 * size); // this variable limits the maximum of next generation particles
int capacity = 0; // this variable indicates the actual number of next generation particles
/* particles to be simulated */
Particle *par = (Particle *) malloc(size * sizeof(Particle));
/* next generation particles produced */
Particle *next = (Particle *) malloc(tot * sizeof(Particle));
/* initialization */
for (int i = 0; i < size; i++){
par[i].position = rand1() * 100.0; // random number between 0.0~1.0
par[i].direction = rand2(); // random number between -1.0~1.0
par[i].weight = 1.0;
par[i].cell = 2;
par[i].group = rand1() > 0.5 ? 1 : 2;
par[i].alive = 1;
}
/* some parameters used in simulation */
double keff = 1.0;
double tracklength, collision, absorption;
/* start simulating */
int generation;
for (generation = 1; generation <= 100; generation++){
int CellID, MatID, GroupID;
int k; // k-th particle to be simulated
#pragma acc parallel copy(capacity) copyin(par[0:size],size, keff) copyout(next[0:tot])
#pragma acc loop reduction(+:tracklength, collision, absorption)
for (k = 0; k < size; k++){
/* do some calculating with par[k] */
/* secondary particle produced under certain circumstances */
if (condition){
next[capacity].position = par[k].position;
next[capacity].direction = rand2();
next[capacity].weight = 1.0;
next[capacity].cell = par[k].cell;
next[capacity].group = rand1() < 0.9 ? 1 : 2;
next[capacity].alive = 1;
capacity++;
}
}
/* after simulation of current generation, update the parameters */
keff = ........ // one formula to update keff
size = capacity;
capacity = 0;
tot = (int) (1.3 * size);
free(par);
par = next;
next = (Particle *) malloc(tot * sizeof(Particle));
}
free(par);
free(next);
I compiled the code with
pgcc -acc -Minfo=accel -ta=tesla:cc30,time -O0 main.c -o test
and got the information below:
Loop carried dependence of par->alive, par->cell, par->direction, par->position, par->weight, par->group prevents parallelization
Loop carried dependence of par->direction, par->group, par->position prevents vectorization
Loop carried reuse of next->position prevents parallelization
I then ran the executable ./test
and an error occurred:
call to cuMemFreeHost returned error 700: Illegal address during kernel execution
I have no idea how to work it out.
BTW, the code runs well and returns the correct result when compiled with gcc, which ignores the #pragma directives.
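For reference, the loop-carried dependence messages above point at the shared counter capacity and the writes to next[]. Below is a minimal sketch of how that pattern is sometimes written with an OpenACC atomic capture; it assumes the order of entries in next[] does not matter, that condition and rand1()/rand2() stand in for device-callable code, and it is an illustration of the clause syntax rather than a verified fix for the error above.
/* Sketch only: same loop as above, with the shared counter updated atomically. */
#pragma acc parallel loop copy(capacity) copyin(par[0:size]) copyout(next[0:tot]) \
            reduction(+:tracklength, collision, absorption)
for (k = 0; k < size; k++){
/* ... transport calculation with par[k] ... */
if (condition){
int idx;
#pragma acc atomic capture
{ idx = capacity; capacity++; } /* reserve one slot of next[] */
next[idx].position = par[k].position;
next[idx].direction = rand2();
next[idx].weight = 1.0;
next[idx].cell = par[k].cell;
next[idx].group = rand1() < 0.9 ? 1 : 2;
next[idx].alive = 1;
}
}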

Related

When is declaring a static array as "private" before a parallel loop perfectly equivalent to declaring the array inside the loop?

I've encountered a situation where the code generates different results when the arrays are defined inside the loop over index i (case #1) versus declared outside the i loop and listed in a private clause (case #2).
Case #2 generates the same results as the code running on the CPU only.
Case #1
#pragma acc parallel loop
for (j = jbeg; j <= jend; j++){
#pragma acc loop
for (i = ibeg; i <= iend; i++){
double Rc[NFLX][NFLX];
double eta[NFLX], um[NFLX], dv[NFLX];
double lambda[NFLX], alambda[NFLX];
double fL[NFLX], fR[NFLX];
.
.
.
}
}}
Case #2
#pragma acc parallel loop
for (j = jbeg; j <= jend; j++){
double Rc[NFLX][NFLX];
double eta[NFLX], um[NFLX], dv[NFLX];
double lambda[NFLX], alambda[NFLX];
double fL[NFLX], fR[NFLX];
#pragma acc loop private(Rc[:NFLX][:NFLX], eta[:NFLX], \
um[:NFLX], lambda[:NFLX], alambda[:NFLX], \
dv[:NFLX], fL[:NFLX], fR[:NFLX])
for (i = ibeg; i <= iend; i++){
.
.
.
}
}}
I have the following values:
NFLX = 8;
jbeg = 3, jend = 258;
ibeg = 3, iend = 1026;
In which cases are the two techniques equivalent, and when is it better to choose one over the other?
This is what I see with -Minfo=accel:
case #1:
71, Local memory used for Rc,dv,fR,um,lambda,alambda,fL,eta
case #2:
71, Local memory used for Rc,dv,fR,lambda,alambda,fL,eta
CUDA shared memory used for Rc,eta
Local memory used for um
CUDA shared memory used for um,lambda,alambda,dv,fL,fR
function:
/* ********************************************************************* */
void Roe_Solver (Data *d, timeStep *Dts, Grid *grid, RBox *box)
/*
* Solve the Riemann problem between L/R states using a
* Rusanov-Lax Friedrichs flux.
*********************************************************************** */
{
int i, j, k;
int ibeg = *(box->nbeg)-1, iend = *(box->nend);
int jbeg = *(box->tbeg), jend = *(box->tend);
int kbeg = *(box->bbeg), kend = *(box->bend);
int VXn = VX1, VXt = VX2, VXb = VX3;
int MXn = MX1, MXt = MX2, MXb = MX3;
int ni, nj;
double gmm = GAMMA_EOS;
double gmm1 = gmm - 1.0;
double gmm1_inv = 1.0/gmm1;
double delta = 1.e-7;
double delta_inv = 1.0/delta;
ARRAY_OFFSET (grid, ni, nj);
INDEX_CYCLE (grid->dir, VXn, VXt, VXb);
INDEX_CYCLE (grid->dir, MXn, MXt, MXb);
#pragma acc parallel loop collapse(2) present(d, Dts, grid)
for (k = kbeg; k <= kend; k++){
for (j = jbeg; j <= jend; j++){
long int offset = ni*(j + nj*k);
double * __restrict__ cmax = &Dts->cmax [offset];
double * __restrict__ SL = &d->sweep.SL [offset];
double * __restrict__ SR = &d->sweep.SR [offset];
double um[NFLX];
double fL[NFLX], fR[NFLX];
#pragma acc loop private(um[:NFLX], fL[:NFLX], fR[:NFLX])
for (i = ibeg; i <= iend; i++){
int nv;
double scrh, vel2;
double a2, a, h;
double alambda, lambda, eta;
double s, c, hl, hr;
double bmin, bmax, scrh1;
double pL, pR;
double * __restrict__ vL = d->sweep.vL [offset + i];
double * __restrict__ vR = d->sweep.vR [offset + i];
double * __restrict__ uL = d->sweep.uL [offset + i];
double * __restrict__ uR = d->sweep.uR [offset + i];
double * __restrict__ flux = d->sweep.flux[offset + i];
double a2L = SoundSpeed2 (vL);
double a2R = SoundSpeed2 (vR);
PrimToCons (vL, uL);
PrimToCons (vR, uR);
Flux (vL, uL, fL, grid->dir);
Flux (vR, uR, fR, grid->dir);
pL = vL[PRS];
pR = vR[PRS];
s = sqrt(vR[RHO]/vL[RHO]);
um[RHO] = vL[RHO]*s;
s = 1.0/(1.0 + s);
c = 1.0 - s;
um[VX1] = s*vL[VX1] + c*vR[VX1];
um[VX2] = s*vL[VX2] + c*vR[VX2];
um[VX3] = s*vL[VX3] + c*vR[VX3];
vel2 = um[VX1]*um[VX1] + um[VX2]*um[VX2] + um[VX3]*um[VX3];
hl = 0.5*(vL[VX1]*vL[VX1] + vL[VX2]*vL[VX2] + vL[VX3]*vL[VX3]);
hl += a2L*gmm1_inv;
hr = 0.5*(vR[VX1]*vR[VX1] + vR[VX2]*vR[VX2] + vR[VX3]*vR[VX3]);
hr += a2R*gmm1_inv;
h = s*hl + c*hr;
/* ----------------------------------------------------
1. the following should be equivalent to
scrh = dv[VX1]*dv[VX1] + dv[VX2]*dv[VX2] + dv[VX3]*dv[VX3];
a2 = s*a2L + c*a2R + 0.5*gmm1*s*c*scrh;
and therefore always positive.
---------------------------------------------------- */
a2 = gmm1*(h - 0.5*vel2);
a = sqrt(a2);
/* ----------------------------------------------------------------
2. define non-zero components of conservative eigenvectors Rc,
eigenvalues (lambda) and wave strength eta = L.du
---------------------------------------------------------------- */
#pragma acc loop seq
NFLX_LOOP(nv) flux[nv] = 0.5*(fL[nv] + fR[nv]);
/* ---- (u - c_s) ---- */
SL[i] = um[VXn] - a;
/* ---- (u + c_s) ---- */
SR[i] = um[VXn] + a;
/* ---- get max eigenvalue ---- */
cmax[i] = fabs(um[VXn]) + a;
NFLX_LOOP(nv) flux[nv] = 0.5*(fL[nv] + fR[nv]) - 0.5*cmax[i]*(uR[nv] - uL[nv]);
#if DIMENSIONS > 1
/* ---------------------------------------------
3. use the HLL flux function if the interface
lies within a strong shock.
The effect of this switch is visible
in the Mach reflection test.
--------------------------------------------- */
scrh = fabs(vL[PRS] - vR[PRS]);
scrh /= MIN(vL[PRS],vR[PRS]);
if (scrh > 0.5 && (vR[VXn] < vL[VXn])){ /* -- tunable parameter -- */
bmin = MIN(0.0, SL[i]);
bmax = MAX(0.0, SR[i]);
scrh1 = 1.0/(bmax - bmin);
#pragma acc loop seq
for (nv = 0; nv < NFLX; nv++){
flux[nv] = bmin*bmax*(uR[nv] - uL[nv])
+ bmax*fL[nv] - bmin*fR[nv];
flux[nv] *= scrh1;
}
}
#endif /* DIMENSIONS > 1 */
} /* End loop on i */
}} /* End loop on j,k */
}
Technically they are equivalent, but in practice different. What's happening is that the compiler will hoist the declaration of these arrays outside of the loops. This is standard practice for the compiler and happens before the OpenACC directives are applied. What should then happen is that these arrays are implicitly privatized within the scoping unit in which they are declared. However, the compiler doesn't currently track this, so the arrays are implicitly copied into the compute region as shared arrays. If you add the flag "-Minfo=accel", you'll see the compiler feedback messages indicating the implicit copies.
I have an open issue report requesting this support, TPR #31360, but it has been a challenge to implement, so it is not in a released compiler as of yet. Hence, until/if we can fix the behavior, you'll need to manually hoist the declarations of these arrays and then add them to a "private" clause, as in case #2 above.
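To make the hoisting concrete, here is a rough sketch (an illustration of the explanation above, not actual compiler output) of what case #1 effectively becomes: the declarations end up outside the loops and, with no private clause, the arrays are implicitly copied in and shared by all iterations, which is why case #2 restores the CPU results.
double Rc[NFLX][NFLX]; /* hoisted out of the loops by the compiler ... */
double eta[NFLX], um[NFLX], dv[NFLX];
double lambda[NFLX], alambda[NFLX];
double fL[NFLX], fR[NFLX];
#pragma acc parallel loop /* ... and, lacking a private() clause, implicitly copied in as shared */
for (j = jbeg; j <= jend; j++){
#pragma acc loop
for (i = ibeg; i <= iend; i++){
.
.
.
}
}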

C code is running too slow from nested for loops

My C program is running too slow (right now it takes around 40 seconds without parallelization). I have tried using OpenMP, which brought the timing down significantly, but I am looking for simple and natural ways to make my code run faster other than using parallel for loops. The basic structure of the code is that it takes some command line arguments as inputs and saves them as variables. Then it recursively computes a variable called Rplus1 using the math.h and complex.h libraries. The problem with the code, and where it spends most of its time, is at the bottom where there are nested for loops. My goal is to get the whole code running in under 5 seconds, but as of now it runs in about 40 seconds without using parallel for loops. Please help!
#include "time.h"
#include "stdio.h"
#include "stdlib.h"
#include "complex.h"
#include "math.h"
#include "string.h"
#include "unistd.h"
#include "omp.h"
#define PI 3.14159265
int main (int argc, char *argv[]){
if(argc >= 8){
double start1 = omp_get_wtime();
// command line arguments are aligned in the following order: [theta] [number of layers in superlattice] [material_1] [lat const_1] [number of unit cells_1] [material_2] [lat const_2] [number of unit cells_2] .... [material_N] [lat const_N] [number of unit cells_N] [Log/Linear] [number of repeating superlattice layers] [yes/no]
int N;
sscanf(argv[2],"%d",&N); // Number of layers in superlattice specified by second input argument
if(strcmp(argv[argc-1],"yes") == 0) //If the substrate is included then add one more layer to the N variable
{
N = N+1;
}
int total;
sscanf(argv[argc-2],"%d",&total); // Number of repeating superlattice layers specified by second to last argument
double layers[N][6], horizangle[1001], vertangle[1001];
double complex (*F_hkl)[1001][1001] = malloc(N*1001*1001*sizeof(complex double)), (*F_0)[1001][1001] = malloc(N*1001*1001*sizeof(complex double)), (*g)[1001][1001] = malloc(N*1001*1001*sizeof(complex double)), (*g_0)[1001][1001] = malloc(N*1001*1001*sizeof(complex double)),SF_table[10];// this array will hold the unit cell structure factors for all of the materials selected for each wavevector in the beam spectrum
double real, real2, lam, c_light = 299792458, h_pl = 4.135667516e-15,E = 10e3, r_0 = 2.818e-15, Lccd = 1.013;// just a few variables to hold values through calculations and constants, speed of light, plancks const, photon energy, and detector distance from sample
double angle;
double complex z;// just a variable to hold complex numbers throughout calculations
int i,j,m,n,t; // integers to index through arrays
lam = (h_pl*c_light)/E;
sscanf(argv[1],"%lf",&angle); //first argument is the angle of incidence, read it
angle = angle*(PI/180.0);
double angle2 = -angle;
double (*table)[10] = malloc(10*9*sizeof(double)); // this array holds all the coefficients to calculate the atomic scattering factor below
double (*table2)[10] = malloc(10*2*sizeof(double));
FILE*datfile1 = fopen("/home/vhosts/xraydev.engr.wisc.edu/data/coef_table.bin","rb"); // read the binary file containg all the coefficients
fread(table,sizeof(double),90,datfile1);
fclose(datfile1);
FILE*datfile2 = fopen("/home/vhosts/xraydev.engr.wisc.edu/data/dispersioncs.bin","rb");
fread(table2,sizeof(double),20,datfile2);
fclose(datfile2);
// Calculate scattering factors for all elements
double a,b;
double k_z = (sin(angle)/lam)*1e-10; // incorporate angular dependence of SF but neglect 0.24 degree divergence because of approximation
for(i = 0;i<10;i++) // for each element...
{
SF_table[i] = 0;
for(j = 0;j<4;j++) // summation
{
a = table[2*j][i];
b = table[2*j+1][i];
SF_table[i] = SF_table[i] + a * exp(-b*k_z*k_z);
}
SF_table[i] = SF_table[i] + table[8][i] + table2[0][i] + table2[1][i]*I;
}
free(table);
double mm = 4.0, (*phi)[1001][1001] = malloc(N*1001*1001*sizeof(double));
for(i = 1; i < N+1; i++) // for each layer of material...
{
sscanf(argv[i*3+1],"%lf",&layers[i-1][1]); // get out of plane lattice constant
sscanf(argv[i*3+2],"%lf",&layers[i-1][2]); // get the number of unit cells in the layer
layers[i-1][1] = layers[i-1][1]*1e-10; // convert lat const input to meters
// Define reciprocal space positions at the incident angle h, k, l
layers[i-1][3] = 0; // h
layers[i-1][4] = 0; // k
double l; // l calculated for each wavevector in the spectrum because l changes with angle of incidence
for (m = 0; m < 1001; m++)
{
for (n = 0; n <1001; n++)
{
l = 4;
phi[i-1][m][n] = 2*PI*layers[i-1][1]*sin(angle)/lam; // Caculate phi for each layer
if(strcmp(argv[i*3],"GaAs") == 0)
{
F_hkl[i-1][m][n] = (2+2*cexp(I*PI*l))*(SF_table[2]+SF_table[3]*cexp(I*PI*l/2));
F_0[i-1][m][n] = 0.5*8.0*(31 + table2[0][2] + table2[1][2]*I) + 0.5*8.0*(33 + table2[0][3] + table2[1][3]*I);
g[i-1][m][n] = 2*r_0*F_hkl[i-1][m][n]/mm/layers[i-1][1]*cos(2*angle);
g_0[i-1][m][n] = 2*r_0*F_0[i-1][m][n]/mm/layers[i-1][1];
}
if(strcmp(argv[i*3],"AlGaAs") == 0)
{
F_hkl[i-1][m][n] = (2+2*cexp(I*PI*l))*((0.76*SF_table[2]+ 0.24*SF_table[4])+SF_table[3]*cexp(I*PI*l/2));
F_0[i-1][m][n] = 0.24*4.0*(13 + table2[0][4] + table2[1][4]*I) + 0.76*4.0*(31 + table2[0][2] + table2[1][2]*I) + 4.0*(33 + table2[0][3] + table2[1][3]*I);
g[i-1][m][n] = 2*r_0*F_hkl[i-1][m][n]/mm/layers[i-1][1]*cos(2*angle);
g_0[i-1][m][n] = 2*r_0*F_0[i-1][m][n]/mm/layers[i-1][1];
}
}
}
}
double complex (*Rplus1)[1001] = malloc(1001*1001*sizeof(double complex));
for (m = 0; m < 1001; m++)
{
for (n = 0; n <1001; n++)
{
Rplus1[m][n] = 0.0;
}
}
double stop1 = omp_get_wtime();
for(i=1;i<N;i++) // For each layer of the film
{
for(j=0;j<layers[i][2];j++) // For each unit cell
{
for (m = 0; m < 1001; m++) // For each row of the diffraction pattern
{
for (n = 0; n <1001; n++) // For each column of the diffraction pattern
{
Rplus1[m][n] = -I*g[i][m][n] + ((1-I*g_0[i][m][n])*(1-I*g_0[i][m][n]))/(I*g[i][m][n] + (cos(-2*phi[i][m][n])+I*sin(-2*phi[i][m][n]))/Rplus1[m][n]);
}
}
}
}
double stop2 = omp_get_wtime();
double elapsed1 = (double)(stop1 - start1);// time spent up to the diffraction loops
double elapsed2 = (double)(stop2 - start1);// time spent including the diffraction loops
printf("main() through before diffraction function took %f seconds to run\n\n",elapsed1);
printf("main() through after diffraction function took %f seconds to run\n\n",elapsed2);
}
}
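One restructuring that follows from the loop above, sketched here only as an illustration and not a verified drop-in replacement: everything indexed by [i][m][n] in the recurrence is independent of j, so the complex exponential and the other subexpressions can be computed once per (i, m, n) and only the one-line recurrence kept in the innermost loop over unit cells.
for(i = 1; i < N; i++) // For each layer of the film
{
int cells = (int) layers[i][2];
for (m = 0; m < 1001; m++)
{
for (n = 0; n < 1001; n++)
{
double complex a = -I*g[i][m][n]; // j-invariant pieces hoisted out of the unit-cell loop
double complex b = (1 - I*g_0[i][m][n])*(1 - I*g_0[i][m][n]);
double complex c = I*g[i][m][n];
double complex e = cexp(-2*I*phi[i][m][n]); // cos(-2*phi) + I*sin(-2*phi), computed once
double complex R = Rplus1[m][n];
for (j = 0; j < cells; j++) // the recurrence itself stays serial in j
R = a + b/(c + e/R);
Rplus1[m][n] = R;
}
}
}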

Mandelbrot optimization in openmp

Well, I have to parallelize the Mandelbrot program in C. I think I have done it well, but I can't get better times. My question is whether someone has an idea to improve the code; I've been thinking perhaps of nested parallel regions between the outer and inner for loops...
Also, I have doubts about whether it's more elegant or recommended to put all the pragmas in a single line, or to write separate pragmas (one for omp parallel with the shared and private variables and a conditional, and another pragma with omp for and schedule dynamic).
I also wonder whether constants can be used as private variables, because I think it is cleaner to have constants instead of defined variables.
Also, I have written a conditional (if numcpu > 1): if there is only one CPU it makes no sense to use a parallel region, and a normal sequential execution is done instead.
Finally, as I have read, the dynamic chunk size depends on the hardware and your system configuration, so I have left it as a constant that can be easily changed.
I also adapt the number of threads to the number of processors available.
int main(int argc, char *argv[])
{
omp_set_dynamic(1);
int xactual, yactual;
//each iteration, it calculates: newz = oldz*oldz + p, where p is the current pixel, and oldz starts at the origin
double pr, pi; //real and imaginary part of the pixel p
double newRe, newIm, oldRe, oldIm; //real and imaginary parts of new and old z
double zoom = 1, moveX = -0.5, moveY = 0; //you can change these to zoom and change position
pixel_t *pixels = malloc(sizeof(pixel_t)*IMAGEHEIGHT*IMAGEWIDTH);
clock_t begin, end;
double time_spent;
begin=clock();
int numcpu;
numcpu = omp_get_num_procs();
//FILE * fp;
printf("El número de procesadores que utilizaremos es: %d", numcpu);
omp_set_num_threads(numcpu);
#pragma omp parallel shared(pixels, moveX, moveY, zoom) private(xactual, yactual, pr, pi, newRe, newIm, oldRe, oldIm) if(numcpu > 1)
{
//int xactual=0;
// int yactual=0;
#pragma omp for schedule(dynamic, CHUNK)
//loop through every pixel
for(yactual = 0; yactual < IMAGEHEIGHT; yactual++)
for(xactual = 0; xactual < IMAGEWIDTH; xactual++)
{
//calculate the initial real and imaginary part of z, based on the pixel location and zoom and position values
pr = 1.5 * (xactual - IMAGEWIDTH / 2) / (0.5 * zoom * IMAGEWIDTH) + moveX;
pi = (yactual - IMAGEHEIGHT / 2) / (0.5 * zoom * IMAGEHEIGHT) + moveY;
newRe = newIm = oldRe = oldIm = 0; //these should start at 0,0
//"i" will represent the number of iterations
int i;
//start the iteration process
for(i = 0; i < ITERATIONS; i++)
{
//remember value of previous iteration
oldRe = newRe;
oldIm = newIm;
//the actual iteration, the real and imaginary part are calculated
newRe = oldRe * oldRe - oldIm * oldIm + pr;
newIm = 2 * oldRe * oldIm + pi;
//if the point is outside the circle with radius 2: stop
if((newRe * newRe + newIm * newIm) > 4) break;
}
// color(i % 256, 255, 255 * (i < maxIterations));
if(i == ITERATIONS)
{
//color(0, 0, 0); // black
pixels[yactual*IMAGEWIDTH+xactual][0] = 0;
pixels[yactual*IMAGEWIDTH+xactual][1] = 0;
pixels[yactual*IMAGEWIDTH+xactual][2] = 0;
}
else
{
double z = sqrt(newRe * newRe + newIm * newIm);
int brightness = 256 * log2(1.75 + i - log2(log2(z))) / log2((double)ITERATIONS);
//color(brightness, brightness, 255)
pixels[yactual*IMAGEWIDTH+xactual][0] = brightness;
pixels[yactual*IMAGEWIDTH+xactual][1] = brightness;
pixels[yactual*IMAGEWIDTH+xactual][2] = 255;
}
}
} //end of parallel region
end= clock();
time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
fprintf(stderr, "Elapsed time: %.2lf seconds.\n", time_spent);
You could extend the implementation to leverage SIMD extensions. As far as I know the latest OpenMP standard includes vector constructs. Check out this article that describes the new capabilities.
This whitepaper explains how SSE3 can be used when calculating the Mandelbrot set.
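As a rough sketch of those OpenMP SIMD constructs (illustrative only, reusing the ITERATIONS, CHUNK, zoom, moveX and moveY names from the code above): the per-pixel escape count can be factored into a function marked with declare simd, and the inner x loop annotated with omp simd on top of the existing omp for over rows.
#pragma omp declare simd
static int escape_iters(double pr, double pi)
{
double re = 0.0, im = 0.0;
int i;
for (i = 0; i < ITERATIONS; i++)
{
double re2 = re * re - im * im + pr;
im = 2.0 * re * im + pi;
re = re2;
if (re * re + im * im > 4.0) break; // escaped the radius-2 circle
}
return i;
}
/* ... then, inside the existing parallel region, the loops become: */
#pragma omp for schedule(dynamic, CHUNK)
for (yactual = 0; yactual < IMAGEHEIGHT; yactual++)
{
#pragma omp simd private(pr, pi)
for (xactual = 0; xactual < IMAGEWIDTH; xactual++)
{
pr = 1.5 * (xactual - IMAGEWIDTH / 2) / (0.5 * zoom * IMAGEWIDTH) + moveX;
pi = (yactual - IMAGEHEIGHT / 2) / (0.5 * zoom * IMAGEHEIGHT) + moveY;
int iters = escape_iters(pr, pi);
// ... colour pixels[yactual*IMAGEWIDTH+xactual] from iters as in the original code ...
}
}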

CUDA: Why accessing the same device array is not coalesced?

I am posting drilled-down code for review. I believe it should compile and execute without any problems, but since I excluded all the irrelevant parts, I might have made some mistakes.
struct Users {
double A[96];
double B[32];
double C[32];
};
This is my Users structure with fixed-length arrays. The main function is given below.
int main(int argc, char **argv) {
int numUsers = 10;
Users *users = new Users[numUsers];
double Step[96];
for (int i = 0; i < 32; i++) {
Step[i] = 0.8;
Step[i + 32] = 0.8;
Step[i + 64] = 0.8;
}
for (int usr = 0; usr < numUsers; usr++) {
for (int i = 0; i < 32; i++) {
users[usr].A[i] = 10;
users[usr].A[i + 32] = 20;
users[usr].A[i + 64] = 30;
}
memset(users[usr].B, 0, sizeof(double) * 32);
memset(users[usr].C, 0, sizeof(double) * 32);
}
double *d_Step;
cudaMalloc((void**)&d_Step, sizeof(double) * 96);
cudaMemcpy(d_Step, Step, sizeof(double) * 96, cudaMemcpyHostToDevice);
Users *deviceUsers;
cudaMalloc((void**)&deviceUsers, sizeof(Users) * numUsers);
cudaMemcpy(deviceUsers, users, sizeof(Users) * numUsers, cudaMemcpyHostToDevice);
dim3 grid;
dim3 block;
grid.x = 1;
grid.y = 1;
grid.z = 1;
block.x = 32;
block.y = 10;
block.z = 1;
calc<<<grid, block >>> (deviceUsers, d_Step, numUsers);
delete[] users;
return 0;
}
Please note that Step is a 1D array with 96 bins and that I am spanning 10 warps (32 threads in the x direction, with 10 of these in my block). Each warp will access the same Step array, as can be seen below in the kernel.
__global__ void calc(Users *users, double *Step, int numUsers) {
int tId = threadIdx.x + blockIdx.x * blockDim.x;
int uId = threadIdx.y;
while (uId < numUsers) {
double mean00 = users[uId].A[tId] * Step[tId];
double mean01 = users[uId].A[tId + 32] * Step[tId + 32];
double mean02 = users[uId].A[tId + 64] * Step[tId + 64];
users[uId].A[tId] = (mean00 == 0? 0 : 1 / mean00);
users[uId].A[tId + 32] = (mean01 == 0? 0 : 1 / mean01);
users[uId].A[tId + 64] = (mean02 == 0? 0 : 1 / mean02);
uId += 10;
}
}
Now, when I use the NVIDIA Visual Profiler, the coalesced retrievals are 47%. I investigated further and found out that the Step array, which is being accessed by each warp, causes this problem. If I replace it with some constant, the accesses are 100% coalesced.
Q1) As I understand it, coalesced accesses are tied to memory lines, i.e. the byte lines have to be multiples of 32, whether the elements are integers or doubles. Why am I not getting coalesced accesses?
To my knowledge, whenever CUDA assigns a memory block in device global memory, it assigns an even (aligned) address to it. Thus, as long as the starting point and the next 32 locations are accessed by a warp, the access should be coalesced. Am I correct?
Hardware
Geforce GTX 470, Compute Capability 2.0
Your kernel reads Step 10 times from global memory. Although the L1 cache can reduce the actual accesses to global memory, it is still treated as an inefficient access pattern by the profiler.
My profiler calls it 'global load efficiency'. It doesn't say whether the access is coalesced or not.
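One way to take the repeated Step reads off the global-memory path, sketched below under the assumptions carried over from the code above (same Users struct, same 32x10 single-block launch; calc_shared is a made-up name) and not verified against the profiler numbers: stage the 96-entry array in shared memory once per block and read it from there.
__global__ void calc_shared(Users *users, const double *Step, int numUsers) {
__shared__ double sStep[96]; // staged copy of Step, read from global memory once per block
int flat = threadIdx.y * blockDim.x + threadIdx.x; // 32*10 = 320 threads easily cover 96 entries
if (flat < 96) sStep[flat] = Step[flat];
__syncthreads();
int tId = threadIdx.x; // single block in x, as in the original launch
int uId = threadIdx.y;
while (uId < numUsers) {
double mean00 = users[uId].A[tId] * sStep[tId];
double mean01 = users[uId].A[tId + 32] * sStep[tId + 32];
double mean02 = users[uId].A[tId + 64] * sStep[tId + 64];
users[uId].A[tId] = (mean00 == 0 ? 0 : 1 / mean00);
users[uId].A[tId + 32] = (mean01 == 0 ? 0 : 1 / mean01);
users[uId].A[tId + 64] = (mean02 == 0 ? 0 : 1 / mean02);
uId += blockDim.y;
}
}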

Cuda thrust global memory writing very slow

I am currently writing code that calculates an integral histogram on the GPU using the NVIDIA Thrust library.
For this I allocate a contiguous block of device memory which I keep updating with a custom functor.
The problem is that the writes to device memory are very slow, but the reads are actually OK.
The basic setup is the following:
struct HistogramCreation
{
HistogramCreation(
...
// pointer to memory
...
){}
/// The actual summation operator
__device__ void operator()(int index){
.. do the calculations ..
for(int j=0;j<30;j++){
(1) *_memoryPointer = values (also using reads to such locations) ;
}
}
}
void foo(){
cudaMalloc(_pointer,size);
HistogramCreation initialCreation( ... _pointer ...);
thrust::for_each(
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(_imageSize),
initialCreation);
}
If I change the write in (1) to the following:
unsigned int val = values;
the performance is much better. This is the only global memory write I have.
Using the memory write I get about 2 s for HD footage;
using the local variable it takes about 50 ms, so about a factor of 40 less.
Why is this so slow? How could I improve it?
Just as @OlegTitov said, frequent loads/stores to global memory should be avoided as much as possible. Where that is inevitable, coalesced memory access can help the execution not to get too slow; however, in most cases histogram calculation is pretty tough to realize with coalesced access.
While most of the above is basically just restating @OlegTitov's answer, I'd just like to share an investigation I did about computing a summation with NVIDIA CUDA. Actually the result is pretty interesting and I hope it will be helpful information for other CUDA developers.
The experiment was basically to run a speed test of computing a summation with various memory access patterns: using global memory (1 thread), L2 cache (atomic ops, 128 threads), and L1 cache (shared memory, 128 threads).
This experiment used:
Kepler GTX 680,
1546 cores @ 1.06GHz
GDDR5 256-bit @ 3GHz
Here are the kernels:
__global__
void glob(float *h) {
float* hist = h;
uint sd = SEEDRND;
uint random;
for (int i = 0; i < NUMLOOP; i++) {
if (i%NTHREADS==0) random = rnd(sd);
int rind = random % NBIN;
float randval = (float)(random % 10)*1.0f ;
hist[rind] += randval;
}
}
__global__
void atom(float *h) {
float* hist = h;
uint sd = SEEDRND;
for (int i = threadIdx.x; i < NUMLOOP; i+=NTHREADS) {
uint random = rnd(sd);
int rind = random % NBIN;
float randval = (float)(random % 10)*1.0f ;
atomicAdd(&hist[rind], randval);
}
}
__global__
void shm(float *h) {
int lid = threadIdx.x;
uint sd = SEEDRND;
__shared__ float shm[NTHREADS][NBIN];
for (int i = 0; i < NBIN; i++) shm[lid][i] = h[i];
for (int i = lid; i < NUMLOOP; i+=NTHREADS) {
uint random = rnd(sd);
int rind = random % NBIN;
float randval = (float)(random % 10)*1.0f ;
shm[lid][rind] += randval;
}
/* reduction here */
for (int i = 0; i < NBIN; i++) {
__syncthreads();
if (threadIdx.x < 64) {
shm[threadIdx.x][i] += shm[threadIdx.x+64][i];
}
__syncthreads();
if (threadIdx.x < 32) {
shm[threadIdx.x][i] += shm[threadIdx.x+32][i];
}
__syncthreads();
if (threadIdx.x < 16) {
shm[threadIdx.x][i] += shm[threadIdx.x+16][i];
}
__syncthreads();
if (threadIdx.x < 8) {
shm[threadIdx.x][i] += shm[threadIdx.x+8][i];
}
__syncthreads();
if (threadIdx.x < 4) {
shm[threadIdx.x][i] += shm[threadIdx.x+4][i];
}
__syncthreads();
if (threadIdx.x < 2) {
shm[threadIdx.x][i] += shm[threadIdx.x+2][i];
}
__syncthreads();
if (threadIdx.x == 0) {
shm[0][i] += shm[1][i];
}
}
for (int i = 0; i < NBIN; i++) h[i] = shm[0][i];
}
OUTPUT
atom: 102656.00 shm: 102656.00 glob: 102656.00
atom: 122240.00 shm: 122240.00 glob: 122240.00
... blah blah blah ...
One Thread: 126.3919 msec
Atomic: 7.5459 msec
Sh_mem: 2.2207 msec
The ratio between these kernels is 57 : 17 : 1. Many things can be analyzed here, and it truly does not mean that using the L1 or L2 memory spaces will always give you more than a 10x speedup of the whole program.
And here are main and the other functions:
#include <iostream>
#include <cstdlib>
#include <cstdio>
using namespace std;
#define NUMLOOP 1000000
#define NBIN 36
#define SEEDRND 1
#define NTHREADS 128
#define NBLOCKS 1
__device__ uint rnd(uint & seed) {
#if LONG_MAX > (16807*2147483647)
int const a = 16807;
int const m = 2147483647;
seed = (long(seed * a))%m;
return seed;
#else
double const a = 16807;
double const m = 2147483647;
double temp = seed * a;
seed = (int) (temp - m * floor(temp/m));
return seed;
#endif
}
... the above kernels ...
int main()
{
float *h_hist, *h_hist2, *h_hist3, *d_hist, *d_hist2,
*d_hist3;
h_hist = (float*)malloc(NBIN * sizeof(float));
h_hist2 = (float*)malloc(NBIN * sizeof(float));
h_hist3 = (float*)malloc(NBIN * sizeof(float));
cudaMalloc((void**)&d_hist, NBIN * sizeof(float));
cudaMalloc((void**)&d_hist2, NBIN * sizeof(float));
cudaMalloc((void**)&d_hist3, NBIN * sizeof(float));
for (int i = 0; i < NBIN; i++) h_hist[i] = 0.0f;
cudaMemcpy(d_hist, h_hist, NBIN * sizeof(float),
cudaMemcpyHostToDevice);
cudaMemcpy(d_hist2, h_hist, NBIN * sizeof(float),
cudaMemcpyHostToDevice);
cudaMemcpy(d_hist3, h_hist, NBIN * sizeof(float),
cudaMemcpyHostToDevice);
cudaEvent_t start, end;
float elapsed = 0, elapsed2 = 0, elapsed3;
cudaEventCreate(&start);
cudaEventCreate(&end);
cudaEventRecord(start, 0);
atom<<<NBLOCKS, NTHREADS>>>(d_hist);
cudaThreadSynchronize();
cudaEventRecord(end, 0);
cudaEventSynchronize(start);
cudaEventSynchronize(end);
cudaEventElapsedTime(&elapsed, start, end);
cudaEventRecord(start, 0);
shm<<<NBLOCKS, NTHREADS>>>(d_hist2);
cudaThreadSynchronize();
cudaEventRecord(end, 0);
cudaEventSynchronize(start);
cudaEventSynchronize(end);
cudaEventElapsedTime(&elapsed2, start, end);
cudaEventRecord(start, 0);
glob<<<1, 1>>>(d_hist3);
cudaThreadSynchronize();
cudaEventRecord(end, 0);
cudaEventSynchronize(start);
cudaEventSynchronize(end);
cudaEventElapsedTime(&elapsed3, start, end);
cudaMemcpy(h_hist, d_hist, NBIN * sizeof(float),
cudaMemcpyDeviceToHost);
cudaMemcpy(h_hist2, d_hist2, NBIN * sizeof(float),
cudaMemcpyDeviceToHost);
cudaMemcpy(h_hist3, d_hist3, NBIN * sizeof(float),
cudaMemcpyDeviceToHost);
/* print output */
for (int i = 0; i < NBIN; i++) {
printf("atom: %10.2f shm: %10.2f glob:
%10.2f¥n",h_hist[i],h_hist2[i],h_hist3[i]);
}
printf("%12s: %8.4f msec¥n", "One Thread", elapsed3);
printf("%12s: %8.4f msec¥n", "Atomic", elapsed);
printf("%12s: %8.4f msec¥n", "Sh_mem", elapsed2);
return 0;
}
When writing GPU code you should avoid reading from and writing to global memory. Global memory is very slow on the GPU; that's a hardware characteristic. The only thing you can do is to make neighboring threads read/write neighboring addresses in global memory. This causes coalescing and speeds up the process. But in general: read your data once, process it, and write it out once.
Note that NVCC might optimize out a lot of your code after you make the modification: it detects that no write to global memory is made and just removes the "unneeded" code. So this speedup may not be coming from the global write per se.
I would recommend using the profiler on your actual code (the one with the global write) to see if there's anything like unaligned access or another performance problem.
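As a small, generic illustration of the "neighboring threads, neighboring addresses" point (a sketch, not tied to the histogram code above): the same element-wise copy written with a coalesced and with a strided indexing pattern.
// Coalesced: consecutive threads of a warp touch consecutive elements,
// so each warp is served by a minimal number of memory transactions.
__global__ void copy_coalesced(const float *in, float *out, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) out[i] = in[i];
}
// Strided: the same warp is spread over many memory segments, so the
// hardware must issue far more transactions for the same data volume.
__global__ void copy_strided(const float *in, float *out, int n, int stride) {
long j = (long)(blockIdx.x * blockDim.x + threadIdx.x) * stride;
if (j < n) out[j] = in[j];
}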

Resources