Generating random variables with OpenMP in C++

How can I generate random variables in parallel (is it possible? is it efficient?) with my linear congruential generator below:
double* uniform(long N)
{
    long i, j;
    long a = 16807;
    long long m = (((long long)1) << 31) - 1;
    long I[N];
    double *U;
    #pragma omp parallel for firstprivate(i)
    for (j = 0; j < N; j++)
    {
        if (i == 0)
        {
            int y = omp_get_thread_num(); // undefined ref error here
            I[y];
            i++;
        }
        else
        {
            I[j] = (a*I[j-1]) % m;
        }
    }
    #pragma omp parallel for
    for (i = 0; i < N; i++)
        U[i] = (double)I[i] / (m + 1.0);
    return U;
}
My goal is to generate two uniform samples and use them in another function (the Box-Muller method):
double* gauss(long int N)
{
    double *X, *Y, *U;
    X = generator(N/2);
    Y = generator(N/2);
    #pragma omp parallel for
    for (i = 0; i < N/2; i++)
    {
        U[2*i]   = sqrt(-2 * log(X[i])) * sin(Y[i]*2*3.14);
        U[2*i+1] = sqrt(-2 * log(X[i])) * cos(Y[i]*2*3.14);
    }
    return U;
}
I want to know how I can get different seeds when generating uniform variables with the function uniform.
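One common approach, shown as a rough sketch below rather than a drop-in fix (lcg_next and uniform_parallel are hypothetical names, and simply offsetting the seed by the thread id does not guarantee non-overlapping streams; a leapfrog scheme or one <random> engine per thread would be more rigorous), is to give each thread its own generator state seeded from its thread number:

#include <omp.h>
#include <vector>

// Hypothetical helper: one LCG step, using the same a and m as in the question.
inline long long lcg_next(long long x)
{
    const long long a = 16807;
    const long long m = (1LL << 31) - 1;
    return (a * x) % m;
}

std::vector<double> uniform_parallel(long N, long long base_seed)
{
    std::vector<double> U(N);
    const long long m = (1LL << 31) - 1;

    #pragma omp parallel
    {
        // Each thread derives its own starting state from the thread id,
        // so the per-thread streams begin in different places.
        long long state = lcg_next(base_seed + omp_get_thread_num() + 1);

        #pragma omp for
        for (long i = 0; i < N; i++)
        {
            state = lcg_next(state);
            U[i] = (double)state / (m + 1.0);
        }
    }
    return U;
}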

Related

Difference between mutual exclusion like atomic and reduction in OpenMP

I'm following Tim Mattson's video lectures on OpenMP, and one exercise was to find the errors in provided code that computes the area of the Mandelbrot set. Here is the solution that was provided:
#include <stdio.h>
#include <omp.h>

#define NPOINTS 1000
#define MAXITER 1000

struct d_complex {
    double r;
    double i;
};

void testpoint(struct d_complex);

struct d_complex c;
int numoutside = 0;

int main() {
    int i, j;
    double area, error, eps = 1.0e-5;

    #pragma omp parallel for default(shared) private(c,j) firstprivate(eps)
    for (i = 0; i < NPOINTS; i++) {
        for (j = 0; j < NPOINTS; j++) {
            c.r = -2.0 + 2.5*(double)(i)/(double)(NPOINTS) + eps;
            c.i = 1.125*(double)(j)/(double)(NPOINTS) + eps;
            testpoint(c);
        }
    }

    area  = 2.0*2.5*1.125*(double)(NPOINTS*NPOINTS - numoutside)/(double)(NPOINTS*NPOINTS);
    error = area/(double)NPOINTS;

    printf("Area of Mandelbrot set = %12.8f +/- %12.8f\n", area, error);
    printf("Correct answer should be around 1.510659\n");
}

void testpoint(struct d_complex c) {
    // Does the iteration z = z*z + c until |z| > 2, when the point is known to be outside the set.
    // If the loop count reaches MAXITER, the point is considered to be inside the set.
    struct d_complex z;
    int iter;
    double temp;

    z = c;
    for (iter = 0; iter < MAXITER; iter++) {
        temp = (z.r*z.r) - (z.i*z.i) + c.r;
        z.i = z.r*z.i*2 + c.i;
        z.r = temp;
        if ((z.r*z.r + z.i*z.i) > 4.0) {
            #pragma omp atomic
            numoutside++;
            break;
        }
    }
}
The question I have is: could we use a reduction on the variable numoutside in the #pragma omp parallel for, like this:
#pragma omp parallel for default(shared) private(c,j) firstprivate(eps) reduction(+:numoutside)
without the atomic construct in the testpoint function?
I tested the function without atomic, and the result was different from the one I got originally. Why does that happen? And while I understand the concept of mutual exclusion and its use against race conditions, isn't reduction just another way of solving that problem with private variables?
Thank you in advance.
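For what it's worth, one common way to make a reduction apply here (a sketch of mine, not the lecture's solution; testpoint_ret is a hypothetical renamed helper) is to have the routine return its verdict and accumulate it inside the loop body, so the reduced variable is actually referenced inside the construct that carries the reduction clause rather than inside a separately called function:

// Hypothetical variant: testpoint_ret reports whether the point escapes
// instead of incrementing the global counter itself.
int testpoint_ret(struct d_complex c) {
    struct d_complex z = c;
    for (int iter = 0; iter < MAXITER; iter++) {
        double temp = z.r*z.r - z.i*z.i + c.r;
        z.i = 2.0*z.r*z.i + c.i;
        z.r = temp;
        if (z.r*z.r + z.i*z.i > 4.0) return 1;   // outside the set
    }
    return 0;                                    // considered inside the set
}

// The reduction now sees every update, because numoutside is incremented
// inside the loop that the reduction clause is attached to.
#pragma omp parallel for default(shared) private(c,j) firstprivate(eps) reduction(+:numoutside)
for (i = 0; i < NPOINTS; i++) {
    for (j = 0; j < NPOINTS; j++) {
        c.r = -2.0 + 2.5*(double)(i)/(double)(NPOINTS) + eps;
        c.i = 1.125*(double)(j)/(double)(NPOINTS) + eps;
        numoutside += testpoint_ret(c);
    }
}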

Matrix Multiplication OpenMP Counter-Intuitive Results

I am currently porting some code over to OpenMP at my place of work. One of the tasks I am doing is figuring out how to speed up matrix multiplication for one of our applications.
The matrices are stored in row-major format, so A[i*cols +j] gives the A_i_j element of the matrix A.
The code looks like this (uncommenting the pragma parallelises the code):
#include <omp.h>
#include <iostream>
#include <iomanip>
#include <stdio.h>

#define NUM_THREADS 8
#define size 500
#define num_iter 10

int main (int argc, char *argv[])
{
    // omp_set_num_threads(NUM_THREADS);
    int *A = new int [size*size];
    int *B = new int [size*size];
    int *C = new int [size*size];

    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
        {
            A[i*size+j] = j*1;
            B[i*size+j] = i*j+2;
            C[i*size+j] = 0;
        }
    }

    double total_time = 0;
    double start = 0;
    for (int t = 0; t < num_iter; t++)
    {
        start = omp_get_wtime();
        int i, k;
        // #pragma omp parallel for num_threads(10) private(i, k) collapse(2) schedule(dynamic)
        for (int j = 0; j < size; j++)
        {
            for (i = 0; i < size; i++)
            {
                for (k = 0; k < size; k++)
                {
                    C[i*size+j] += A[i*size+k] * B[k*size+j];
                }
            }
        }
        total_time += omp_get_wtime() - start;
    }

    std::cout << std::setprecision(5) << total_time/num_iter << std::endl;

    delete[] A;
    delete[] B;
    delete[] C;
    return 0;
}
What is confusing me is the following: why is dynamic scheduling faster than static scheduling for this task? Timing the runs and taking an average shows that static scheduling is slower, which to me is a bit counterintuitive since each thread is doing the same amount of work.
Also, am I correctly speeding up my matrix multiplication code?
Parallel matrix multiplication is non-trivial (have you even considered cache-blocking?). Your best bet is likely to use a BLAS library for this rather than writing it yourself. (Remember, "The best code is the code I do not have to write".)
Wikipedia: Basic Linear Algebra Subprograms points to many implementations, a lot of which (including Intel Math Kernel Library) have free licenses.
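If you do want to keep experimenting with the hand-written loops, one cheap change to try (a sketch under the question's row-major layout and zero-initialized C, not a tuned kernel and not a substitute for a BLAS call) is reordering the loops to i-k-j so the innermost index walks contiguous memory in both B and C:

// Sketch: i-k-j ordering. A[i*size+k] is loop-invariant in the inner loop,
// and both B[k*size+j] and C[i*size+j] are accessed contiguously over j.
#pragma omp parallel for
for (int i = 0; i < size; i++)
{
    for (int k = 0; k < size; k++)
    {
        const int a_ik = A[i*size+k];
        for (int j = 0; j < size; j++)
        {
            C[i*size+j] += a_ik * B[k*size+j];
        }
    }
}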

Execution time: work sharing vs. SPMD

I'm writing two similar programs to compare the execution time of SPMD vs. work sharing.
To my surprise, the work-sharing version takes more time, while the SPMD version takes considerably less.
What did I do wrong to get that result?
Here are my codes:
SPMD code:
#define N 1000
float A[N][N], B[N][N], C[N][N]; // declaring matrices of NxN size

int main() {
    /* DECLARING VARIABLES */
    int i, j, k; // indices for matrix multiplication

    /* FILLING MATRICES WITH RANDOM NUMBERS */
    srand ( time(NULL) );
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            A[i][j] = (rand()%10);
            B[i][j] = (rand()%10);
        }
    }

    /* MATRIX MULTIPLICATION */
    omp_set_num_threads(4);
    int id;
    int m;
    int nthrds;
    #pragma omp parallel // num_threads(4)
    {
        id = omp_get_thread_num();
        nthrds = omp_get_num_threads();
        printf("number of threads %d", nthrds);

        double wtime = omp_get_wtime();
        for (i = id; i < N; i = i + nthrds) {
            //dummy=i+nthrds;
            for (j = id; j < N; j = j + nthrds) {
                C[i][j] = 0;
                for (k = id; k < N; k = k + nthrds) {
                    C[i][j] += A[i][k]*B[k][j];
                }
                // printf("%d ", C[i][j]);
            }
            //printf("\n");
        }
        wtime = omp_get_wtime() - wtime;
        printf( "Time taken is %f\n", wtime );
        getchar();
    }
}
The work-sharing code is:
.......
int main() {
    /* DECLARING VARIABLES */
    int i, j, k; // indices for matrix multiplication

    /* FILLING MATRICES WITH RANDOM NUMBERS */
    srand ( time(NULL) );
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            A[i][j] = (rand()%10);
            B[i][j] = (rand()%10);
        }
    }

    /* MATRIX MULTIPLICATION */
    //printf("Max number of threads: %i \n",omp_get_max_threads());
    double wtime = omp_get_wtime();
    #pragma omp parallel private(j,k) num_threads(8)
    {
        #pragma omp for
        for (i = 0; i < N; i++) {
            for (j = 0; j < N; j++) {
                C[i][j] = 0;
                for (k = 0; k < N; k++) {
                    C[i][j] += A[i][k]*B[k][j];
                }
                // printf("%d ", C[i][j]);
            }
            //printf("\n");
        }
        double etime = omp_get_wtime() - wtime;
        printf( "Time taken is %f\n", etime );
        getchar();
    }
}
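One thing worth checking before comparing the timings (a sketch of my own, not a reviewed answer): in the SPMD version, i, j, k, id and nthrds are declared outside the parallel region, so they are shared and raced on, and the nthrds stride is applied to the j and k loops as well, so much of C is never computed and each computed entry sums only a fraction of its dot product. The SPMD version finishes sooner partly because it does less (and incorrect) work. A row-wise SPMD split with per-thread indices looks roughly like this:

/* Sketch: SPMD decomposition that splits only the i loop.
   Every variable the threads modify is declared inside the region. */
#pragma omp parallel num_threads(4)
{
    int id     = omp_get_thread_num();
    int nthrds = omp_get_num_threads();

    for (int i = id; i < N; i += nthrds) {      /* rows split across threads     */
        for (int j = 0; j < N; j++) {           /* every thread does all j and k */
            C[i][j] = 0;
            for (int k = 0; k < N; k++) {
                C[i][j] += A[i][k] * B[k][j];
            }
        }
    }
}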

Parallelizing in OpenMP

I have the following code that I want to parallelize using OpenMP:
for (m = 0; m < r_c; m++)
{
    for (n = 0; n < c_c; n++)
    {
        double value = 0.0;
        for (j = 0; j < r_b; j++)
            for (k = 0; k < c_b; k++)
            {
                double a;
                if ((m-j) < 0 || (n-k) < 0 || (m-j) > r_a || (n-k) > c_a)
                    a = 0.0;
                else
                    a = h_a[((m-j)*c_a) + (n-k)];
                //printf("%lf\t", a);
                value += h_b[(j*c_b) + k] * a;
            }
        h_c[m*c_c + n] = value;
        //printf("%lf\t", h_c[m*c_c + n]);
    }
    //cout<<"row "<<m<<" completed"<<endl;
}
In this, I want every thread to work on the "for j" and "for k" loops simultaneously.
I tried putting #pragma omp parallel for before the "for m" loop, but I am not getting the correct result.
How can I do this in an optimized manner? Thanks in advance.
Depending on exactly which loop you want to parallelize, you have three options:
#pragma omp parallel
{
    #pragma omp for // Option #1
    for (m = 0; m < r_c; m++)
    {
        for (n = 0; n < c_c; n++)
        {
            double value = 0.0;
            #pragma omp for // Option #2
            for (j = 0; j < r_b; j++)
                for (k = 0; k < c_b; k++)
                {
                    double a;
                    if ((m-j) < 0 || (n-k) < 0 || (m-j) > r_a || (n-k) > c_a)
                        a = 0.0;
                    else
                        a = h_a[((m-j)*c_a) + (n-k)];
                    //printf("%lf\t", a);
                    value += h_b[(j*c_b) + k] * a;
                }
            h_c[m*c_c + n] = value;
            //printf("%lf\t", h_c[m*c_c + n]);
        }
        //cout<<"row "<<m<<" completed"<<endl;
    }
}

//////////////////////////////////////////////////////////////////////////
// Option #3
for (m = 0; m < r_c; m++)
{
    for (n = 0; n < c_c; n++)
    {
        #pragma omp parallel
        {
            double value = 0.0;
            #pragma omp for
            for (j = 0; j < r_b; j++)
                for (k = 0; k < c_b; k++)
                {
                    double a;
                    if ((m-j) < 0 || (n-k) < 0 || (m-j) > r_a || (n-k) > c_a)
                        a = 0.0;
                    else
                        a = h_a[((m-j)*c_a) + (n-k)];
                    //printf("%lf\t", a);
                    value += h_b[(j*c_b) + k] * a;
                }
            h_c[m*c_c + n] = value;
            //printf("%lf\t", h_c[m*c_c + n]);
        }
    }
    //cout<<"row "<<m<<" completed"<<endl;
}
Test and profile each. You might find that option #1 is fastest if there isn't a lot of work for each thread, or you may find that with optimizations on, there is no difference (or even a slowdown) when enabling OMP.
Edit
I've adapted the MCVE supplied in the comments as follows:
#include <iostream>
#include <chrono>
#include <omp.h>
#include <algorithm>
#include <vector>

#define W_OMP

int main(int argc, char *argv[])
{
    std::vector<double> h_a(9);
    std::generate(h_a.begin(), h_a.end(), std::rand);

    int r_b = 500;
    int c_b = r_b;
    std::vector<double> h_b(r_b * c_b);
    std::generate(h_b.begin(), h_b.end(), std::rand);

    int r_c = 500;
    int c_c = r_c;
    int r_a = 3, c_a = 3;
    std::vector<double> h_c(r_c * c_c);

    auto start = std::chrono::system_clock::now();
#ifdef W_OMP
    #pragma omp parallel
    {
#endif
        int m, n, j, k;
#ifdef W_OMP
        #pragma omp for
#endif
        for (m = 0; m < r_c; m++)
        {
            for (n = 0; n < c_c; n++)
            {
                double value = 0.0, a;
                for (j = 0; j < r_b; j++)
                {
                    for (k = 0; k < c_b; k++)
                    {
                        if ((m-j) < 0 || (n-k) < 0 || (m-j) > r_a || (n-k) > c_a)
                            a = 0.0;
                        else a = h_a[((m-j)*c_a) + (n-k)];
                        value += h_b[(j*c_b) + k] * a;
                    }
                }
                h_c[m*c_c + n] = value;
            }
        }
#ifdef W_OMP
    }
#endif
    auto end = std::chrono::system_clock::now();
    auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
    std::cout << elapsed.count() << "ms"
#ifdef W_OMP
        "\t with OMP"
#else
        "\t without OMP"
#endif
        "\n";
    return 0;
}
As a reference, I'm using VS2012 (OMP 2.0, grrr). I'm not sure when collapse was introduced, but apparently after 2.0. Optimizations were /O2 and compiled in Release x64.
Benchmarks
Using the original sizes of the loops (7,7,5,5) and therefore arrays, the results were 0ms without OMP and 1ms with. Verdict: optimizations were better, and the added overhead wasn't worth it. Also, the measurements are not reliable (too short).
Using slightly larger loop sizes (100, 100, 100, 100) and therefore arrays, the results were about equal at roughly 108ms. Verdict: still not worth the naive effort; tweaking OMP parameters might tip the scale. Definitely not the x4 speedup I would hope for.
Using even larger loop sizes (500, 500, 500, 500) and therefore arrays, OMP started to pull ahead. Without OMP 74.3ms, with 15ms. Verdict: worth it. Weird. I got an x5 speedup with four threads and four cores on an i5. I'm not going to try and figure out how that happened.
Summary
As has been stated in countless answers here on SO, it's not always a good idea to parallelize every for loop you come across. Things that can screw up your desired xN speedup:
Not enough work per thread to justify the overhead of creating the additional threads
The work itself is memory bound. This means that the CPU can be running at 1petaHz and you still won't see a speedup.
Memory access patterns. I'm not going to go there. Feel free to edit in the relevant info if you want it.
OMP parameters. The best choice of parameters will often be a result of this entire list (not including this item, to avoid recursion issues).
SIMD operations. Depending on what and how you're doing, the compiler may vectorize your operations. I have no idea if OMP will usurp the SIMD operations, but it is possible. Check your assembly (foreign language to me) to confirm.

Ways to parallelize this using OpenMP?

Can anybody suggest the best way to parallelize this using OpenMP? The program aborts when I run this code.
void grayerode(int **img, int height, int width, int filterheight,
               int filterwidth, int iterations, int pixrange)
{
    int maxlabel = 0;
    int fh, fw, iters, pixval = 0, i, j, s, k;
    int fhlimit = filterheight/2;
    int fwlimit = filterwidth/2;
    int **smoothedlabels;
    allocate_2D_int_matrix ( &smoothedlabels, height, width );

    #pragma omp parallel for shared(smoothedlabels,height,width,k)
    for (i = 0; i < height; i++)
        for (j = 0; j < width; j++)
            smoothedlabels[i][j] = img[i][j];

    int *labeltemp = (int *)malloc(pixrange*sizeof(int));
    for (s = 0; s < pixrange; s++)
        labeltemp[s] = 0;

    for (iters = 0; iters < iterations; iters++) {
        #pragma omp parallel for private(i,j,labeltemp)
        for (i = fhlimit; i < height-fhlimit; i++) {
            for (j = fwlimit; j < width-fwlimit; j++) {
                for (fh = -fhlimit; fh <= fhlimit; fh++)
                    for (fw = -fwlimit; fw <= fwlimit; fw++) {
                        labeltemp[img[i+fh][j+fw]]++;
                    }
                for (s = 0; s < pixrange; s++) {
                    if (labeltemp[s] > maxlabel) {
                        maxlabel = labeltemp[s];
                        pixval = s;
                    }
                }
                smoothedlabels[i][j] = pixval;
                for (s = 0; s < pixrange; s++)
                    labeltemp[s] = 0;
                maxlabel = 0;
            }
        }
    }

    for (i = 0; i < height; i++)
        for (j = 0; j < width; j++)
            img[i][j] = smoothedlabels[i][j];

    free_2D_int_matrix ( &smoothedlabels );
    free(labeltemp);
    return;
}
A few things:
You are not declaring private variables correctly. One example of doing it the correct way in your code:
#pragma omp parallel for private(i,j) shared(smoothedlabels, img, width, height)
for (i = 0; i < height; i++)
    for (j = 0; j < width; j++)
        smoothedlabels[i][j] = img[i][j];
It is important that j remains private, or each thread will try to change its value, giving you unexpected behaviour. (Note: i is actually implicitly declared private when you declare the pragma statement, but I always prefer to state it explicitly for better readability.)
Try avoiding 2D arrays because they restrict your ability to parallelize. In the same example you could do the following:
#pragma omp parallel for private(i) shared(width, height, smoothedlabels, img)
for (i = 0; i < width * height; i++)
    smoothedlabels[i] = img[i];
This will parallelize the entire loop for you rather than just the outer loop. You can order your 1D array either column wise or row wise.
Same thing goes for the rest of the loops - just apply the same concept.
Later in your code for example, you have the following:
for (fh = -fhlimit; fh <= fhlimit; fh++)
    for (fw = -fwlimit; fw <= fwlimit; fw++) {
        labeltemp[img[i+fh][j+fw]]++;
    }
If you do not declare fh and fw private, you will get unexpected behaviour for the same reason that not declaring j private would.
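To make that concrete, here is a sketch (my own, not the answerer's code, and it assumes the surrounding declarations in grayerode are unchanged) of the second parallel loop with all the inner indices private and a per-thread histogram. Privatizing the labeltemp pointer in the original pragma leaves each thread with an uninitialized copy, and the shared maxlabel/pixval would also race, so each thread gets its own working storage here:

#pragma omp parallel private(i, j, fh, fw, s)
{
    /* per-thread working storage instead of the shared labeltemp/maxlabel/pixval */
    int *myhist = (int *)calloc(pixrange, sizeof(int));
    int mymax = 0, mypix = 0;

    #pragma omp for
    for (i = fhlimit; i < height - fhlimit; i++) {
        for (j = fwlimit; j < width - fwlimit; j++) {
            for (fh = -fhlimit; fh <= fhlimit; fh++)
                for (fw = -fwlimit; fw <= fwlimit; fw++)
                    myhist[img[i+fh][j+fw]]++;

            for (s = 0; s < pixrange; s++)
                if (myhist[s] > mymax) { mymax = myhist[s]; mypix = s; }

            smoothedlabels[i][j] = mypix;

            /* reset for the next pixel */
            for (s = 0; s < pixrange; s++)
                myhist[s] = 0;
            mymax = 0;
        }
    }
    free(myhist);
}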
