Why my rcpp code is not much too fast using openmp parallelization - openmp

I try to use openmp to parallelise my loop to be faster. The problem is that the parallelised version is not faster than the sequential version
#include <Rcpp.h>
#include <iostream>
// [[Rcpp::plugins(openmp)]]
#include <omp.h>
#include "test.h"
using namespace std;
// [[Rcpp::export]]
std::vector<double> parallel_random_sum(int n, int ncores) {
std::vector<double> res(n);
#pragma omp parallel num_threads(ncores)
{
#pragma omp for
for (int j = 0; j < n; ++j) {
double lres(0);
// cout << "j = "<<j <<" test = " << lres<<endl;
lres += j;
res[j] = lres / n;
}
}
return res;
}
// [[Rcpp::export]]
std::vector<double> not_parallel_random_sum(int n) {
std::vector<double> res(n);
for (int j = 0; j < n; ++j) {
double lres(0);
// cout << "j = "<<j <<" test = " << lres<<endl;
lres += j;
res[j] = lres / n;
}
return res;
}
/*** R
microbenchmark::microbenchmark(
parallel_random_sum(1e7, 8),
not_parallel_random_sum(1e7),
times = 20
)
*/
result ==>
parallel_random_sum(1e+07,8) 62.02360 milliseconds
not_parallel_random_sum(1e+07) 65.56082 milliseconds

The code you are trying to parallelize is just not expensive enough, making the overhead of the parallelization comparable to the gain. If you add some artificial workload in the loop by sleeping for a short amount of time, you can see the performance gain:
#include <chrono>
#include <thread>
#include <Rcpp.h>
// [[Rcpp::plugins(openmp)]]
#include <omp.h>
// [[Rcpp::depends(RcppParallel)]]
#include <RcppParallel.h>
// [[Rcpp::export]]
Rcpp::NumericVector parallel_sleep(int n, int ncores) {
Rcpp::NumericVector res_(n);
RcppParallel::RVector<double> res(res_);
#pragma omp parallel num_threads(ncores)
{
#pragma omp for
for (int j = 0; j < n; ++j) {
double lres(0);
std::this_thread::sleep_for(std::chrono::microseconds(10));
lres += j;
res[j] = lres / n;
}
}
return res_;
}
// [[Rcpp::export]]
Rcpp::NumericVector not_parallel_sleep(int n) {
Rcpp::NumericVector res(n);
for (int j = 0; j < n; ++j) {
double lres(0);
std::this_thread::sleep_for(std::chrono::microseconds(10));
lres += j;
res[j] = lres / n;
}
return res;
}
/*** R
N <- 1e4
bench::mark(
parallel_sleep(N, 8),
not_parallel_sleep(N)
)
*/
Result:
# A tibble: 2 x 14
expression min mean median max `itr/sec` mem_alloc n_gc n_itr total_time result memory time gc
<chr> <bch:tm> <bch:tm> <bch:tm> <bch> <dbl> <bch:byt> <dbl> <int> <bch:tm> <list> <list> <lis> <list>
1 parallel_sle… 73.2ms 81.3ms 82.3ms 87ms 12.3 80.7KB 0 7 569ms <dbl [1… <Rprofme… <bch… <tibbl…
2 not_parallel… 667.8ms 667.8ms 667.8ms 668ms 1.50 80.7KB 0 1 668ms <dbl [1… <Rprofme… <bch… <tibbl…
Note that I am also using data structures from RcppParallel to avoid the need of a deep copy when returning the data (c.f. comment by #coatless).

Related

Value of sum from thrust::reduce not correct

I have been trying to implement some code requiring to call reduce on thrust::device_ptr, and the results are not consistent with CPU implementation while dealing with large values. I have to deal with large values. So is there a way around:
My code:
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <iostream>
#define NZ 412//
#define NX 402//
using namespace std;
using real =double;
void allocate_array_2d(real**& preal, const int dim1, const int dim2) {
// Contiguous allocation of 2D arrays
preal = new real * [dim1];
preal[0] = new real[dim1 * dim2];
for (int i = 1; i < dim1; i++) preal[i] = preal[i - 1] + dim2;
for (int i = 0; i < dim1; i++) {
for (int j = 0; j < dim2; j++) {
preal[i][j] = 0;
}
}
}
#define cudaCheckError(code) \
{ \
if ((code) != cudaSuccess) { \
fprintf(stderr, "Cuda failure %s:%d: '%s' \n", __FILE__, __LINE__, \
cudaGetErrorString(code)); \
} \
}
int main()
{
real** a;
std::cout.precision(30);
allocate_array_2d(a, NZ, NX);//input array
for (int i = 0; i < NZ; i++) {
for (int j = 0; j < NX; j++) {
a[i][j] = 2.14748e+09;
}
}
real* da;
cudaCheckError(cudaMalloc(&da, NZ * NX * sizeof(real)));
cudaCheckError(cudaMemcpy(da,a[0], NZ * NX * sizeof(real),cudaMemcpyHostToDevice));
///************************
//CUDA KERNELS ARE HERE
// REMOVED FOR CLEAR QUESTION
///*************************
real sum1=0;
thrust::device_ptr<real> dev_ptr = thrust::device_pointer_cast(da);
sum1 = thrust::reduce(dev_ptr, dev_ptr+NZ*NX, 0, thrust::plus<real>());
cout<<" \nsum gpu "<< sum1<<"\n";
real sum2=0;
////////CPU PART DOING SAME THING//////
for (int i = 0; i < NZ; i++) {
for (int j = 0; j < NX; j++) {
sum2 += a[i][j];
}
}
cout<<"\nsum cpu "<< sum2<<"\n";
if((sum2-sum1)<0.001)
std::cout << "\nSUCESS "<< "\n";
else
std::cout << "\nFailure & by "<<sum2-sum1<< "\n";
}
The compiler that I am using is nvcc and my graphics card is nvidia 1650 with compute capability 7.5.
According to the documentation, thrust expects the type for summation to be reflected in the init value:
sum1 = thrust::reduce(dev_ptr, dev_ptr+NZ*NX, 0, thrust::plus<real>());
^
The type of that constant you have is an integral type. If you change that to a double-precision constant:
sum1 = thrust::reduce(dev_ptr, dev_ptr+NZ*NX, 0.0, thrust::plus<real>());
you get matching results, between CPU and GPU, according to my testing. (You could alternatively cast your constant to real type: (real)0 and use that, and there are other ways to address this as well, such as dropping the use of the init value and the binary op.)

openmp ping pong breaks when using optimization

I have the following openmp program, compiled with mpicc -fopenmp -O0 ping_pong.c. On my machine executing ./a.out -N 10000000 typically gives "done in 1.22125 secs, m: 10000001". If I increase the level of optimization, the program hangs. Is there a way to 1) decrease the execution time while preserving the ping pong functionality? 2) make the code tolerant (no hang, not slower) of optimization?
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int main(int argc, char **argv) {
int num_threads = 2;
int N = 1000000;
for (int i = 1; i < argc; i++) {
if (strcmp(argv[i], "-N") == 0) {
N = atoi(argv[++i]);
}
}
omp_set_num_threads(num_threads);
int m = 0;
double t0 = omp_get_wtime();
#pragma omp parallel
{
int id = omp_get_thread_num();
while (m < N) {
if (id == 0) {
if (m % 2 == 0) m++;
}
if (id == 1) {
if (m % 2 == 1) m++;
}
}
}
double t = omp_get_wtime() - t0;
printf("done in %g secs, m: %d\n", t, m);
}
Faster and fully optimizable, flush variable m before each if statement.
// to compile: gcc -fopenmp -O* ping_pong.c
// * can be 0, 1, 2, 3, or fast
// to run: ./a.out -N 10000000
#include <assert.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int main(int argc, char **argv) {
int num_threads = 2;
int N = 1000000;
for (int i = 1; i < argc; i++) {
if (strcmp(argv[i], "-N") == 0) {
N = atoi(argv[++i]);
}
}
omp_set_num_threads(num_threads);
int m = 0;
int count0 = 0;
int count1 = 0;
int *arr0 = (int *)calloc(N / 2 + 2, sizeof(int));
int *arr1 = (int *)calloc(N / 2 + 2, sizeof(int));
double t0 = omp_get_wtime();
#pragma omp parallel
{
int id = omp_get_thread_num();
if (id == 0) {
printf("id %d reporting for duty!\n", id);
while (m < N) {
#pragma omp flush (m)
if (m % 2 == 0) {
arr0[count0] = m;
m++;
count0++;
}
}
}
else if (id == 1) {
printf("id %d reporting for duty!\n", id);
while (m < N) {
#pragma omp flush (m)
if (m % 2 == 1) {
arr1[count1] = m;
m++;
count1++;
}
}
}
}
double t = omp_get_wtime() - t0;
printf("done in %g secs, m: %d, count0: %d, count1: %d\n", t, m, count0, count1);
for (int i = 1; i < N / 2; i++) {
if (arr0[i] != arr0[i - 1] + 2) {
printf("arr0[%d] = %d, arr0[%d] = %d\n", i, arr0[i], i - 1, arr0[i - 1]);
assert(0);
}
}
for (int i = 1; i < N / 2; i++) {
if (arr1[i] != arr1[i - 1] + 2) {
printf("arr1[%d] = %d, arr1[%d] = %d\n", i, arr1[i], i - 1, arr1[i - 1]);
assert(0);
}
}
printf("Both arrays are correctly formed.\n");
free(arr0);
free(arr1);
return 0;
}
OpenMP's memory model allows for different threads to have temporarily diverging views of shared variables. On cache-coherent architectures such as x86, the most frequent reason for diverging views are register optimisations.
This is very much compiler-dependent, but with -O0 most compilers don't do register optimisation, so both if (m % 2 == 0) and m++ result in code that reads or writes the actual memory location of m. With -O1 and higher, m is optimised to a register variable and the result is written back to memory only at the exit from the while loop. In the latter case, after one iteration, the register value of m in thread 0 becomes odd and the thread gets stuck. Similarly, the initial value of m in thread 1 is even (0) and that thread is already stuck.
Preventing register optimisation (and speculative execution / instruction reordering) from screwing the coherent view of shared variables is what the OpenMP flush construct is for. You need a bunch of #pragma omp flush(m) lines to make sure that both threads see the latest value of m.
You can also declare m as volatile int m = 0. The volatile modifier prevents register optimisation of m, so you'll get code similar to what -O0 produces. This is not the same as using the OpenMP flush directive, since on x86 flush performs a memory fence too.

Finding maximum difference b/w index in an array, with constraint a[i]<=a[j] where i<j

Here is my code, showing the wrong answer on a few test cases, can anyone tell me where it's failing.
I am not able to figure it out even after multiple attempts.
#include <iostream>
using namespace std;
int main() {
//code
int t,n;
cin >> t;
while(t--)
{
cin >> n;
long long int a[n],max=0;
for(int i=0;i<n;i++)
cin >> a[i];
int i=0,j=n-1;
while(i<=j)
{
if(a[j]>=a[i]){
max=j-i; break;}
else if(a[j-1]>=a[i] || a[j]>=a[i+1])
{ max=j-i-1; break;}
else
i++;
j--;
}
cout << max<<"\n";
}
return 0;
}
There is a solution in O(n log n):
Create a vector of index = 0 1 2 ... n-1
Sort (in a stable way) the indices i, j such that a[i] < a[i]
Determine the max_index values
max_index[i]= max (index[j], j >= i)
This can be calculated in a recursive way O(n)
For each index[i], determine index_max[i+1] - ind[i]); and determine the max of them
The maximum we obtained is the value we are looking for.
#include <iostream>
#include <vector>
#include <numeric>
#include <algorithm>
int diff_max (const std::vector<long long int> &a) {
int n = a.size();
std::vector<int> index(n), index_max(n);
int dmax = 0;
std::iota (index.begin(), index.end(), 0);
std::stable_sort (index.begin(), index.end(), [&a] (int i, int j) {return a[i] < a[j];});
index_max[n-1] = index[n-1];
for (int i = n-2; i >= 0; --i) {
index_max[i] = std::max (index_max[i+1], index[i]);
}
for (int i = 0; i < n-1; ++i) {
dmax = std::max (dmax, index_max[i+1] - index[i]);
}
return dmax;
}
int main() {
int t, n;
std::cin >> t;
while(t--) {
std::cin >> n;
std::vector<long long int> a(n);
for (int i = 0; i < n; ++i)
std::cin >> a[i];
auto max = diff_max (a);
std::cout << max << "\n";
}
return 0;
}
One known case where the algorithm fails:
1
5
5 7 6 2 3
The output, in this case, is 0, but it should be 2.
If the first two if conditions are not satisfied then you are incrementing i, here you are only comparing i with j and j-1, but there can be some other value of k such that k < j-1 and (i,j) is the answer.

How to implement parallel prefix sum algo in c?

Here is my code,
#include <stdio.h>
#include <omp.h>
#include <math.h>
int main(int argc, char const *argv[])
{
int i,s=0,j,c=8;
int a[]={3,1,0,4,2,1,2,3};
int l=ceil(log10(c)/log10(2));
#pragma omp parallel for
for (i = 1; i < c-1; ++i)
{
for (j = 0; j <l ; j++)
{
if(i-pow(2,j)>=0){
a[i]+=a[(i-(int)pow(2,j))];//printf("%i %i %i %i\n", i,j,a[i],a[(i-(int)pow(2,j))]);
}
}
}
for (i = 0; i < c; ++i)
{
printf("%i\n",a[i]);
}
return 0;
}
but i am getting wrong answer,what i found is that i am getting wrong answer because the pragma loop is working sequentially, the array a is getting updated before all iteration gets completed.
i used this algo
for all Pi where 1<=i<=n-1
for j = 1 to ceil(log n) -1
if(i- pow(2,j)>=0)
a[i ] = a[i] + a[i - 2j]

The following code for "prime sieve" is not working with Release mode ,,but works perfectly with Debug mode

The following code for "prime sieve" is not working with Release mode ,,but works perfectly with Debug mode.. I cannot figure out why ,, am using Microsoft Visual c++ 2010 express
#include <iostream>
#include <fstream>
#include <cmath>
#include <time.h>
#include <vector>
using namespace std;
void main(){
clock_t start= clock();
int n = 10000000;
bool* primes= new bool[n];
primes[0]=0;
int g = (int)sqrt(n*1.0) +1;
for (int i = 2 ;i <g ; i++){
if(primes[i]){
for (__int64 j = i*i ; j <n ; j+=i)
primes[j]=0;
}
}
printf("Time elapsed: %f\n", ((double)clock() - start) / CLOCKS_PER_SEC);
int* p = new int[n/2];
int c = 0;
for (int x = 0 ; x<n ; x++)
if (primes[x]){
p[c]=x;
c++;
}
cout<<p[481516]<<endl;
system("pause");
}
You haven't initialized the rest of the values in your primes array before you start accessing them. Try adding
for (int i=1; i<n; i++)
primes[i] = 1;
before your calculation of g.

Resources