openmp ping pong breaks when using optimization - openmp

I have the following openmp program, compiled with mpicc -fopenmp -O0 ping_pong.c. On my machine executing ./a.out -N 10000000 typically gives "done in 1.22125 secs, m: 10000001". If I increase the level of optimization, the program hangs. Is there a way to 1) decrease the execution time while preserving the ping pong functionality? 2) make the code tolerant (no hang, not slower) of optimization?
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int main(int argc, char **argv) {
int num_threads = 2;
int N = 1000000;
for (int i = 1; i < argc; i++) {
if (strcmp(argv[i], "-N") == 0) {
N = atoi(argv[++i]);
}
}
omp_set_num_threads(num_threads);
int m = 0;
double t0 = omp_get_wtime();
#pragma omp parallel
{
int id = omp_get_thread_num();
while (m < N) {
if (id == 0) {
if (m % 2 == 0) m++;
}
if (id == 1) {
if (m % 2 == 1) m++;
}
}
}
double t = omp_get_wtime() - t0;
printf("done in %g secs, m: %d\n", t, m);
}

Faster and fully optimizable, flush variable m before each if statement.
// to compile: gcc -fopenmp -O* ping_pong.c
// * can be 0, 1, 2, 3, or fast
// to run: ./a.out -N 10000000
#include <assert.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int main(int argc, char **argv) {
int num_threads = 2;
int N = 1000000;
for (int i = 1; i < argc; i++) {
if (strcmp(argv[i], "-N") == 0) {
N = atoi(argv[++i]);
}
}
omp_set_num_threads(num_threads);
int m = 0;
int count0 = 0;
int count1 = 0;
int *arr0 = (int *)calloc(N / 2 + 2, sizeof(int));
int *arr1 = (int *)calloc(N / 2 + 2, sizeof(int));
double t0 = omp_get_wtime();
#pragma omp parallel
{
int id = omp_get_thread_num();
if (id == 0) {
printf("id %d reporting for duty!\n", id);
while (m < N) {
#pragma omp flush (m)
if (m % 2 == 0) {
arr0[count0] = m;
m++;
count0++;
}
}
}
else if (id == 1) {
printf("id %d reporting for duty!\n", id);
while (m < N) {
#pragma omp flush (m)
if (m % 2 == 1) {
arr1[count1] = m;
m++;
count1++;
}
}
}
}
double t = omp_get_wtime() - t0;
printf("done in %g secs, m: %d, count0: %d, count1: %d\n", t, m, count0, count1);
for (int i = 1; i < N / 2; i++) {
if (arr0[i] != arr0[i - 1] + 2) {
printf("arr0[%d] = %d, arr0[%d] = %d\n", i, arr0[i], i - 1, arr0[i - 1]);
assert(0);
}
}
for (int i = 1; i < N / 2; i++) {
if (arr1[i] != arr1[i - 1] + 2) {
printf("arr1[%d] = %d, arr1[%d] = %d\n", i, arr1[i], i - 1, arr1[i - 1]);
assert(0);
}
}
printf("Both arrays are correctly formed.\n");
free(arr0);
free(arr1);
return 0;
}

OpenMP's memory model allows for different threads to have temporarily diverging views of shared variables. On cache-coherent architectures such as x86, the most frequent reason for diverging views are register optimisations.
This is very much compiler-dependent, but with -O0 most compilers don't do register optimisation, so both if (m % 2 == 0) and m++ result in code that reads or writes the actual memory location of m. With -O1 and higher, m is optimised to a register variable and the result is written back to memory only at the exit from the while loop. In the latter case, after one iteration, the register value of m in thread 0 becomes odd and the thread gets stuck. Similarly, the initial value of m in thread 1 is even (0) and that thread is already stuck.
Preventing register optimisation (and speculative execution / instruction reordering) from screwing the coherent view of shared variables is what the OpenMP flush construct is for. You need a bunch of #pragma omp flush(m) lines to make sure that both threads see the latest value of m.
You can also declare m as volatile int m = 0. The volatile modifier prevents register optimisation of m, so you'll get code similar to what -O0 produces. This is not the same as using the OpenMP flush directive, since on x86 flush performs a memory fence too.

Related

Value of sum from thrust::reduce not correct

I have been trying to implement some code requiring to call reduce on thrust::device_ptr, and the results are not consistent with CPU implementation while dealing with large values. I have to deal with large values. So is there a way around:
My code:
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <iostream>
#define NZ 412//
#define NX 402//
using namespace std;
using real =double;
void allocate_array_2d(real**& preal, const int dim1, const int dim2) {
// Contiguous allocation of 2D arrays
preal = new real * [dim1];
preal[0] = new real[dim1 * dim2];
for (int i = 1; i < dim1; i++) preal[i] = preal[i - 1] + dim2;
for (int i = 0; i < dim1; i++) {
for (int j = 0; j < dim2; j++) {
preal[i][j] = 0;
}
}
}
#define cudaCheckError(code) \
{ \
if ((code) != cudaSuccess) { \
fprintf(stderr, "Cuda failure %s:%d: '%s' \n", __FILE__, __LINE__, \
cudaGetErrorString(code)); \
} \
}
int main()
{
real** a;
std::cout.precision(30);
allocate_array_2d(a, NZ, NX);//input array
for (int i = 0; i < NZ; i++) {
for (int j = 0; j < NX; j++) {
a[i][j] = 2.14748e+09;
}
}
real* da;
cudaCheckError(cudaMalloc(&da, NZ * NX * sizeof(real)));
cudaCheckError(cudaMemcpy(da,a[0], NZ * NX * sizeof(real),cudaMemcpyHostToDevice));
///************************
//CUDA KERNELS ARE HERE
// REMOVED FOR CLEAR QUESTION
///*************************
real sum1=0;
thrust::device_ptr<real> dev_ptr = thrust::device_pointer_cast(da);
sum1 = thrust::reduce(dev_ptr, dev_ptr+NZ*NX, 0, thrust::plus<real>());
cout<<" \nsum gpu "<< sum1<<"\n";
real sum2=0;
////////CPU PART DOING SAME THING//////
for (int i = 0; i < NZ; i++) {
for (int j = 0; j < NX; j++) {
sum2 += a[i][j];
}
}
cout<<"\nsum cpu "<< sum2<<"\n";
if((sum2-sum1)<0.001)
std::cout << "\nSUCESS "<< "\n";
else
std::cout << "\nFailure & by "<<sum2-sum1<< "\n";
}
The compiler that I am using is nvcc and my graphics card is nvidia 1650 with compute capability 7.5.
According to the documentation, thrust expects the type for summation to be reflected in the init value:
sum1 = thrust::reduce(dev_ptr, dev_ptr+NZ*NX, 0, thrust::plus<real>());
^
The type of that constant you have is an integral type. If you change that to a double-precision constant:
sum1 = thrust::reduce(dev_ptr, dev_ptr+NZ*NX, 0.0, thrust::plus<real>());
you get matching results, between CPU and GPU, according to my testing. (You could alternatively cast your constant to real type: (real)0 and use that, and there are other ways to address this as well, such as dropping the use of the init value and the binary op.)

Why my rcpp code is not much too fast using openmp parallelization

I try to use openmp to parallelise my loop to be faster. The problem is that the parallelised version is not faster than the sequential version
#include <Rcpp.h>
#include <iostream>
// [[Rcpp::plugins(openmp)]]
#include <omp.h>
#include "test.h"
using namespace std;
// [[Rcpp::export]]
std::vector<double> parallel_random_sum(int n, int ncores) {
std::vector<double> res(n);
#pragma omp parallel num_threads(ncores)
{
#pragma omp for
for (int j = 0; j < n; ++j) {
double lres(0);
// cout << "j = "<<j <<" test = " << lres<<endl;
lres += j;
res[j] = lres / n;
}
}
return res;
}
// [[Rcpp::export]]
std::vector<double> not_parallel_random_sum(int n) {
std::vector<double> res(n);
for (int j = 0; j < n; ++j) {
double lres(0);
// cout << "j = "<<j <<" test = " << lres<<endl;
lres += j;
res[j] = lres / n;
}
return res;
}
/*** R
microbenchmark::microbenchmark(
parallel_random_sum(1e7, 8),
not_parallel_random_sum(1e7),
times = 20
)
*/
result ==>
parallel_random_sum(1e+07,8) 62.02360 milliseconds
not_parallel_random_sum(1e+07) 65.56082 milliseconds
The code you are trying to parallelize is just not expensive enough, making the overhead of the parallelization comparable to the gain. If you add some artificial workload in the loop by sleeping for a short amount of time, you can see the performance gain:
#include <chrono>
#include <thread>
#include <Rcpp.h>
// [[Rcpp::plugins(openmp)]]
#include <omp.h>
// [[Rcpp::depends(RcppParallel)]]
#include <RcppParallel.h>
// [[Rcpp::export]]
Rcpp::NumericVector parallel_sleep(int n, int ncores) {
Rcpp::NumericVector res_(n);
RcppParallel::RVector<double> res(res_);
#pragma omp parallel num_threads(ncores)
{
#pragma omp for
for (int j = 0; j < n; ++j) {
double lres(0);
std::this_thread::sleep_for(std::chrono::microseconds(10));
lres += j;
res[j] = lres / n;
}
}
return res_;
}
// [[Rcpp::export]]
Rcpp::NumericVector not_parallel_sleep(int n) {
Rcpp::NumericVector res(n);
for (int j = 0; j < n; ++j) {
double lres(0);
std::this_thread::sleep_for(std::chrono::microseconds(10));
lres += j;
res[j] = lres / n;
}
return res;
}
/*** R
N <- 1e4
bench::mark(
parallel_sleep(N, 8),
not_parallel_sleep(N)
)
*/
Result:
# A tibble: 2 x 14
expression min mean median max `itr/sec` mem_alloc n_gc n_itr total_time result memory time gc
<chr> <bch:tm> <bch:tm> <bch:tm> <bch> <dbl> <bch:byt> <dbl> <int> <bch:tm> <list> <list> <lis> <list>
1 parallel_sle… 73.2ms 81.3ms 82.3ms 87ms 12.3 80.7KB 0 7 569ms <dbl [1… <Rprofme… <bch… <tibbl…
2 not_parallel… 667.8ms 667.8ms 667.8ms 668ms 1.50 80.7KB 0 1 668ms <dbl [1… <Rprofme… <bch… <tibbl…
Note that I am also using data structures from RcppParallel to avoid the need of a deep copy when returning the data (c.f. comment by #coatless).

MPI_Scatterv submatrix with MPI_Type_struct

I'm currently working on a MPI-program and I'm trying to send blocks of a matrix with scatterv to all processes.
Process description
The matrix is given as an array.
First I produce a datatype with MPI_Type_vector to create the necessary block out of the original array.
Second I create a MPI_Type_struct that should hold rows of blocks.
#include <math.h>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#define n 16
int main(int argc, char *argv[])
{
MPI_Init(&argc, &argv);
MPI_Comm comm = MPI_COMM_WORLD;
int p,r;
MPI_Comm_size(comm, &p);
MPI_Comm_rank(comm, &r);
int *arr;
arr = NULL;
if (r == 0){
arr = (int *) malloc(n * n * sizeof(int));
for (int i = 0; i < n * n; i++) arr[i] = i;
for (int i = 0; i < n; i++){
printf("\n");
for (int j = 0; j < n; j++)
printf("%4d", arr[i * n + j]);
}
}
printf("\n");
int ps = sqrt(p);
int ns = n / ps;
if (r == 0) {
printf("ps: %d ns: %d\n", ps, ns);
}
/* create datatype */
MPI_Datatype block;
MPI_Type_vector(ns, ns, n, MPI_INT, &block);
int blocks[ps];
MPI_Aint displs[ps];
for (int i = 0; i < ps; i++) {
blocks[i] = 1;
displs[i] = i * sizeof(int);
}
MPI_Datatype types[ps];
//for (int i = 0; i < ps - 1; i++) types[i] = block;
//types[ps - 1] = MPI_UB;
types[0] = block;
for (int i = 1; i < ps; i++) types[i] = MPI_UB;
//types[0] = block;
//types[1] = MPI_UB;
if (r == 0) {
printf("displs:\n");
for(int i = 0; i < ps; i++) printf("%3ld", displs[i]);
printf("\n");
}
MPI_Datatype row;
MPI_Type_struct(ps, blocks, displs, types, &row);
MPI_Type_commit(&row);
/* prepare scatter */
int sdispl[p]; int sendcounts[p];
for (int i = 0; i < p; i++) {
sdispl[i] = (i % ps) + (i / ps) * (ns * ps);
sendcounts[i] = 1;
}
if (r == 0) {
printf("sdispl: \n");
for (int i = 0; i < 4; i++) printf("%3d", sdispl[i]);
printf("\n");
}
int rcv[ns * ns];
MPI_Scatterv(arr, sendcounts, sdispl, row, rcv, ns * ns, MPI_INT, 0, comm);
int result = 1;
if (r == result) {
printf("result for %d:\n", result);
for (int i = 0; i < ns * ns; i++) {
printf("%4d", rcv[i]);
if ((i+1) % ns == 0) printf("\n");
}
}
if (arr != NULL) free(arr);
MPI_Finalize();
return 0;
}
So far the structure of the blocks is correct.
The problem
The block, that was sent to process r = 1 starts with 3 instead of 4. The block for process r = 2 also starts with 6 and the one for process r = 3 starts with 9.
For r == 4 it jumps to 48.
What it should do
r start
0 0
1 4
2 8
3 12
4 64
5 68
6 ...
15 204
The help I would need
I think, that I'm making some mistake with displ and sdispl.
Compiling and Running the example
The code is compiled with the folowing command:
mpicc -o main main.c -lm
I run the code with:
mpirun -np 16 ./main
Thanks for any help in advance!
With the hint of Zulan I was able to solve my problem.
The following code is based on the excellent answer to subarrays.
#include <math.h>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#define n 8
void print_arr(int *arr, int x) {
printf("\n");
for (int i = 0; i < x*x; i++){
if (i % x == 0) printf("\n");
printf("%4d", arr[i]);
}
printf("\n");
}
int main(int argc, char *argv[])
{
MPI_Init(&argc, &argv);
MPI_Comm comm = MPI_COMM_WORLD;
int p, r;
MPI_Comm_size(comm, &p);
MPI_Comm_rank(comm, &r);
/* number of proceses in dim x and dim y */
int ps = sqrt(p);
/* number of elements in dim x and dim y in sarr */
int ns = n/ps;
/* array of data - distributed by process 0 */
int *arr = NULL;
if (r==0) {
arr = (int *) malloc(n * n * sizeof(int));
for (int i = 0; i < n*n; i++) arr[i] = i;
print_arr(arr, n);
}
MPI_Datatype type, resizedtype;
int sizes[2] = {n,n};
int subsizes[2] = {ns,ns};
int starts[2] = {0,0};
MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_INT, &type);
MPI_Type_create_resized(type, 0, ns*sizeof(int), &resizedtype);
MPI_Type_commit(&resizedtype);
int counts[p];
for (int i = 0; i < p; i++) counts[i] = 1;
int displs[p];
for (int i = 0; i < p; i++) displs[i] = i%ps + i/ps * ns * ps;
/* subarray to store distributed data */
int sarr[ns * ns];
/* send submatrices to all processes */
MPI_Scatterv(arr, counts, displs, resizedtype, sarr, ns*ns, MPI_INT, 0, comm);
/* print received data for process pr */
int pr = 3;
if (r == pr)
print_arr(sarr, ns);
/* free arr */
if (arr != NULL) free(arr);
MPI_Finalize();
return 0;
}
You can compile the example with
mpicc -o main main.c
and run it with
mpirun -np 4 ./main

openacc create data while running inside a kernels

I'm having a task that is to be accelerated by OpenACC. I need to do dynamic memory allocation within a kernel computation. I've built a simpler demo for it as following.
#include <iostream>
using namespace std;
#pragma acc routine seq
int *routine(int init) {
int *ptr;
#pragma acc data create(ptr[:10])
for (int i = 0; i < 10; ++i) {
ptr[i] = init + i;
}
return ptr;
}
void print_array(int *arr) {
for (int i = 0; i < 10; ++i) {
cout << arr[i] << " ";
}
cout << endl;
}
int main(void) {
int *arrs[5];
#pragma acc kernels
for (int i = 0; i < 5; ++i) {
arrs[i] = routine(i);
}
for (int i = 0; i < 5; ++i) {
print_array(arrs[i]);
}
return 0;
}
In this demo, I'm trying to call the routine while running inside a kernel construct. The routine procedure wants to create some data within the GPU and put some values into it.
While I can compile the code, but it reports runtime problems as following.
lisanhu#lisanhu-XPS-15-9550:create_and_copyout$ pgc++ -o test main.cc -acc -Minfo=accel
routine(int):
6, Generating acc routine seq
main:
23, Generating implicit copyout(arrs[:])
26, Accelerator restriction: size of the GPU copy of arrs is unknown
Loop is parallelizable
Generating implicit copy(arrs[:][:])
Accelerator kernel generated
Generating Tesla code
26, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
lisanhu#lisanhu-XPS-15-9550:create_and_copyout$ ./test
call to cuStreamSynchronize returned error 715: Illegal instruction
I'm wondering what I should do to accomplish this task (dynamically allocating memory within processing of a kernel construct). Really appreciate it if you could help.
This is untested, and probably very slow, but this might do what you need it to.
int main() {
const int num = 20;
int a[x] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 0};
int* sizes = (int *)malloc(num * sizeof(int));
int *ptrs[num];
int* temp, *temp2;
int sum;
int* finished = (int *)malloc(num * sizeof(int));
for (int x = 0; x < num; ++x){
finished[x] = 0;
}
#pragma acc kernels copyin(a[0:10]) copyout(ptrs[:num][:1]) async(num*2+1)
{
#pragma acc loop private(temp)
for (int i = 0; i < num; ++i){
#pragma acc loop seq async(i)
for (int j = 0; j < 1; ++j){
temp = ptrs[x];
sizes[i] = ...
}
while (ptrs[x] != x);
ptrs[x] = routine(a, sizes[i]);
}
}
while (true){
sum = 0;
for (int x = 0; x < num; ++x){
sum += finished[x];
}
if (sum == num){
break;
}
for (int x = 0; x < num; ++x){
if (acc_async_test(x) != 0 && finished[x] == 0){
finished[x] = 1;
#pragma acc update host(sizes[x:1])
temp = (int *)malloc(size[x] * sizeof(int));
#pragma acc enter data copyin(temp[0:x])
temp2 = acc_deviceptr(temp);
ptrs[x] = temp2;
#pragma acc update device(ptrs[x:1][0:1])
}
}
}
}

gcc openmp thread reuse

I am using gcc's implementation of openmp to try to parallelize a program. Basically the assignment is to add omp pragmas to obtain speedup on a program that finds amicable numbers.
The original serial program was given(shown below except for the 3 lines I added with comments at the end). We have to parallize first just the outer loop, then just the inner loop. The outer loop was easy and I get close to ideal speedup for a given number of processors. For the inner loop, I get much worse performance than the original serial program. Basically what I am trying to do is a reduction on the sum variable.
Looking at the cpu usage, I am only using ~30% per core. What could be causing this? Is the program continually making new threads everytime it hits the omp parallel for clause? Is there just so much more overhead in doing a barrier for the reduction? Or could it be memory access issue(eg cache thrashing)? From what I read with most implementations of openmp threads get reused overtime(eg pooled), so I am not so sure the first problem is what is wrong.
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#include <omp.h>
#define numThread 2
int main(int argc, char* argv[]) {
int ser[29], end, i, j, a, limit, als;
als = atoi(argv[1]);
limit = atoi(argv[2]);
for (i = 2; i < limit; i++) {
ser[0] = i;
for (a = 1; a <= als; a++) {
ser[a] = 1;
int prev = ser[a-1];
if ((prev > i) || (a == 1)) {
end = sqrt(prev);
int sum = 0;//added this
#pragma omp parallel for reduction(+:sum) num_threads(numThread)//added this
for (j = 2; j <= end; j++) {
if (prev % j == 0) {
sum += j;
sum += prev / j;
}
}
ser[a] = sum + 1;//added this
}
}
if (ser[als] == i) {
printf("%d", i);
for (j = 1; j < als; j++) {
printf(", %d", ser[j]);
}
printf("\n");
}
}
}
OpenMP thread teams are instantiated on entering the parallel section. This means, indeed, that the thread creation is repeated every time the inner loop is starting.
To enable reuse of threads, use a larger parallel section (to control the lifetime of the team) and specificly control the parallellism for the outer/inner loops, like so:
Execution time for test.exe 1 1000000 has gone down from 43s to 22s using this fix (and the number of threads reflects the numThreads defined value + 1
PS Perhaps stating the obvious, it would not appear that parallelizing the inner loop is a sound performance measure. But that is likely the whole point to this exercise, and I won't critique the question for that.
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#include <omp.h>
#define numThread 2
int main(int argc, char* argv[]) {
int ser[29], end, i, j, a, limit, als;
als = atoi(argv[1]);
limit = atoi(argv[2]);
#pragma omp parallel num_threads(numThread)
{
#pragma omp single
for (i = 2; i < limit; i++) {
ser[0] = i;
for (a = 1; a <= als; a++) {
ser[a] = 1;
int prev = ser[a-1];
if ((prev > i) || (a == 1)) {
end = sqrt(prev);
int sum = 0;//added this
#pragma omp parallel for reduction(+:sum) //added this
for (j = 2; j <= end; j++) {
if (prev % j == 0) {
sum += j;
sum += prev / j;
}
}
ser[a] = sum + 1;//added this
}
}
if (ser[als] == i) {
printf("%d", i);
for (j = 1; j < als; j++) {
printf(", %d", ser[j]);
}
printf("\n");
}
}
}
}

Resources