How to implement parallel prefix sum algo in c? - parallel-processing

Here is my code,
#include <stdio.h>
#include <omp.h>
#include <math.h>
int main(int argc, char const *argv[])
{
int i,s=0,j,c=8;
int a[]={3,1,0,4,2,1,2,3};
int l=ceil(log10(c)/log10(2));
#pragma omp parallel for
for (i = 1; i < c-1; ++i)
{
for (j = 0; j <l ; j++)
{
if(i-pow(2,j)>=0){
a[i]+=a[(i-(int)pow(2,j))];//printf("%i %i %i %i\n", i,j,a[i],a[(i-(int)pow(2,j))]);
}
}
}
for (i = 0; i < c; ++i)
{
printf("%i\n",a[i]);
}
return 0;
}
but i am getting wrong answer,what i found is that i am getting wrong answer because the pragma loop is working sequentially, the array a is getting updated before all iteration gets completed.
i used this algo
for all Pi where 1<=i<=n-1
for j = 1 to ceil(log n) -1
if(i- pow(2,j)>=0)
a[i ] = a[i] + a[i - 2j]

Related

openmp ping pong breaks when using optimization

I have the following openmp program, compiled with mpicc -fopenmp -O0 ping_pong.c. On my machine executing ./a.out -N 10000000 typically gives "done in 1.22125 secs, m: 10000001". If I increase the level of optimization, the program hangs. Is there a way to 1) decrease the execution time while preserving the ping pong functionality? 2) make the code tolerant (no hang, not slower) of optimization?
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int main(int argc, char **argv) {
int num_threads = 2;
int N = 1000000;
for (int i = 1; i < argc; i++) {
if (strcmp(argv[i], "-N") == 0) {
N = atoi(argv[++i]);
}
}
omp_set_num_threads(num_threads);
int m = 0;
double t0 = omp_get_wtime();
#pragma omp parallel
{
int id = omp_get_thread_num();
while (m < N) {
if (id == 0) {
if (m % 2 == 0) m++;
}
if (id == 1) {
if (m % 2 == 1) m++;
}
}
}
double t = omp_get_wtime() - t0;
printf("done in %g secs, m: %d\n", t, m);
}
Faster and fully optimizable, flush variable m before each if statement.
// to compile: gcc -fopenmp -O* ping_pong.c
// * can be 0, 1, 2, 3, or fast
// to run: ./a.out -N 10000000
#include <assert.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int main(int argc, char **argv) {
int num_threads = 2;
int N = 1000000;
for (int i = 1; i < argc; i++) {
if (strcmp(argv[i], "-N") == 0) {
N = atoi(argv[++i]);
}
}
omp_set_num_threads(num_threads);
int m = 0;
int count0 = 0;
int count1 = 0;
int *arr0 = (int *)calloc(N / 2 + 2, sizeof(int));
int *arr1 = (int *)calloc(N / 2 + 2, sizeof(int));
double t0 = omp_get_wtime();
#pragma omp parallel
{
int id = omp_get_thread_num();
if (id == 0) {
printf("id %d reporting for duty!\n", id);
while (m < N) {
#pragma omp flush (m)
if (m % 2 == 0) {
arr0[count0] = m;
m++;
count0++;
}
}
}
else if (id == 1) {
printf("id %d reporting for duty!\n", id);
while (m < N) {
#pragma omp flush (m)
if (m % 2 == 1) {
arr1[count1] = m;
m++;
count1++;
}
}
}
}
double t = omp_get_wtime() - t0;
printf("done in %g secs, m: %d, count0: %d, count1: %d\n", t, m, count0, count1);
for (int i = 1; i < N / 2; i++) {
if (arr0[i] != arr0[i - 1] + 2) {
printf("arr0[%d] = %d, arr0[%d] = %d\n", i, arr0[i], i - 1, arr0[i - 1]);
assert(0);
}
}
for (int i = 1; i < N / 2; i++) {
if (arr1[i] != arr1[i - 1] + 2) {
printf("arr1[%d] = %d, arr1[%d] = %d\n", i, arr1[i], i - 1, arr1[i - 1]);
assert(0);
}
}
printf("Both arrays are correctly formed.\n");
free(arr0);
free(arr1);
return 0;
}
OpenMP's memory model allows for different threads to have temporarily diverging views of shared variables. On cache-coherent architectures such as x86, the most frequent reason for diverging views are register optimisations.
This is very much compiler-dependent, but with -O0 most compilers don't do register optimisation, so both if (m % 2 == 0) and m++ result in code that reads or writes the actual memory location of m. With -O1 and higher, m is optimised to a register variable and the result is written back to memory only at the exit from the while loop. In the latter case, after one iteration, the register value of m in thread 0 becomes odd and the thread gets stuck. Similarly, the initial value of m in thread 1 is even (0) and that thread is already stuck.
Preventing register optimisation (and speculative execution / instruction reordering) from screwing the coherent view of shared variables is what the OpenMP flush construct is for. You need a bunch of #pragma omp flush(m) lines to make sure that both threads see the latest value of m.
You can also declare m as volatile int m = 0. The volatile modifier prevents register optimisation of m, so you'll get code similar to what -O0 produces. This is not the same as using the OpenMP flush directive, since on x86 flush performs a memory fence too.

Finding maximum difference b/w index in an array, with constraint a[i]<=a[j] where i<j

Here is my code, showing the wrong answer on a few test cases, can anyone tell me where it's failing.
I am not able to figure it out even after multiple attempts.
#include <iostream>
using namespace std;
int main() {
//code
int t,n;
cin >> t;
while(t--)
{
cin >> n;
long long int a[n],max=0;
for(int i=0;i<n;i++)
cin >> a[i];
int i=0,j=n-1;
while(i<=j)
{
if(a[j]>=a[i]){
max=j-i; break;}
else if(a[j-1]>=a[i] || a[j]>=a[i+1])
{ max=j-i-1; break;}
else
i++;
j--;
}
cout << max<<"\n";
}
return 0;
}
There is a solution in O(n log n):
Create a vector of index = 0 1 2 ... n-1
Sort (in a stable way) the indices i, j such that a[i] < a[i]
Determine the max_index values
max_index[i]= max (index[j], j >= i)
This can be calculated in a recursive way O(n)
For each index[i], determine index_max[i+1] - ind[i]); and determine the max of them
The maximum we obtained is the value we are looking for.
#include <iostream>
#include <vector>
#include <numeric>
#include <algorithm>
int diff_max (const std::vector<long long int> &a) {
int n = a.size();
std::vector<int> index(n), index_max(n);
int dmax = 0;
std::iota (index.begin(), index.end(), 0);
std::stable_sort (index.begin(), index.end(), [&a] (int i, int j) {return a[i] < a[j];});
index_max[n-1] = index[n-1];
for (int i = n-2; i >= 0; --i) {
index_max[i] = std::max (index_max[i+1], index[i]);
}
for (int i = 0; i < n-1; ++i) {
dmax = std::max (dmax, index_max[i+1] - index[i]);
}
return dmax;
}
int main() {
int t, n;
std::cin >> t;
while(t--) {
std::cin >> n;
std::vector<long long int> a(n);
for (int i = 0; i < n; ++i)
std::cin >> a[i];
auto max = diff_max (a);
std::cout << max << "\n";
}
return 0;
}
One known case where the algorithm fails:
1
5
5 7 6 2 3
The output, in this case, is 0, but it should be 2.
If the first two if conditions are not satisfied then you are incrementing i, here you are only comparing i with j and j-1, but there can be some other value of k such that k < j-1 and (i,j) is the answer.

Why my rcpp code is not much too fast using openmp parallelization

I try to use openmp to parallelise my loop to be faster. The problem is that the parallelised version is not faster than the sequential version
#include <Rcpp.h>
#include <iostream>
// [[Rcpp::plugins(openmp)]]
#include <omp.h>
#include "test.h"
using namespace std;
// [[Rcpp::export]]
std::vector<double> parallel_random_sum(int n, int ncores) {
std::vector<double> res(n);
#pragma omp parallel num_threads(ncores)
{
#pragma omp for
for (int j = 0; j < n; ++j) {
double lres(0);
// cout << "j = "<<j <<" test = " << lres<<endl;
lres += j;
res[j] = lres / n;
}
}
return res;
}
// [[Rcpp::export]]
std::vector<double> not_parallel_random_sum(int n) {
std::vector<double> res(n);
for (int j = 0; j < n; ++j) {
double lres(0);
// cout << "j = "<<j <<" test = " << lres<<endl;
lres += j;
res[j] = lres / n;
}
return res;
}
/*** R
microbenchmark::microbenchmark(
parallel_random_sum(1e7, 8),
not_parallel_random_sum(1e7),
times = 20
)
*/
result ==>
parallel_random_sum(1e+07,8) 62.02360 milliseconds
not_parallel_random_sum(1e+07) 65.56082 milliseconds
The code you are trying to parallelize is just not expensive enough, making the overhead of the parallelization comparable to the gain. If you add some artificial workload in the loop by sleeping for a short amount of time, you can see the performance gain:
#include <chrono>
#include <thread>
#include <Rcpp.h>
// [[Rcpp::plugins(openmp)]]
#include <omp.h>
// [[Rcpp::depends(RcppParallel)]]
#include <RcppParallel.h>
// [[Rcpp::export]]
Rcpp::NumericVector parallel_sleep(int n, int ncores) {
Rcpp::NumericVector res_(n);
RcppParallel::RVector<double> res(res_);
#pragma omp parallel num_threads(ncores)
{
#pragma omp for
for (int j = 0; j < n; ++j) {
double lres(0);
std::this_thread::sleep_for(std::chrono::microseconds(10));
lres += j;
res[j] = lres / n;
}
}
return res_;
}
// [[Rcpp::export]]
Rcpp::NumericVector not_parallel_sleep(int n) {
Rcpp::NumericVector res(n);
for (int j = 0; j < n; ++j) {
double lres(0);
std::this_thread::sleep_for(std::chrono::microseconds(10));
lres += j;
res[j] = lres / n;
}
return res;
}
/*** R
N <- 1e4
bench::mark(
parallel_sleep(N, 8),
not_parallel_sleep(N)
)
*/
Result:
# A tibble: 2 x 14
expression min mean median max `itr/sec` mem_alloc n_gc n_itr total_time result memory time gc
<chr> <bch:tm> <bch:tm> <bch:tm> <bch> <dbl> <bch:byt> <dbl> <int> <bch:tm> <list> <list> <lis> <list>
1 parallel_sle… 73.2ms 81.3ms 82.3ms 87ms 12.3 80.7KB 0 7 569ms <dbl [1… <Rprofme… <bch… <tibbl…
2 not_parallel… 667.8ms 667.8ms 667.8ms 668ms 1.50 80.7KB 0 1 668ms <dbl [1… <Rprofme… <bch… <tibbl…
Note that I am also using data structures from RcppParallel to avoid the need of a deep copy when returning the data (c.f. comment by #coatless).

Generating random numbers into an array which prints to a Txt file. Before hitting the text file, the numbers need to be sorted

so my objective is to basically print out random numbers from 40,000 to 1,000,000 to a txt file, sorted using the heap method. I can print to the text file just fine with random numbers, but I am a bit stuck at sorting them using a heap method. I started a method and got a bit lost half way through after looking at some tutorials. Any thoughts/ helpful comments? Im literally new to stack overflow (posting wise) so forgive me if I didnt place this here correctly. Thank you!
//Heapsort algorithm
#include <iostream>
#include <string>
#include <fstream>
#include <ctime>
#include <stdio.h>
#include <stdlib.h>
using namespace std;
/*class generateDataSet {
public: static const int MAX_VALUE = 1000000;
public: static const bool SORTED = false;
public: static const bool REVERSE_SORTED = false;
};
*/
int main()
{
const int array= 16;
int arrayNum[array];
srand(time(NULL));
int upB = 1000000;
int loB = 40000;
int temp;
ofstream inputFile;
inputFile.open("randomData.txt");
if (inputFile.is_open())
{
for (int i = 0; i < array; i++)
{
arrayNum[i] = (rand()% (upB - loB + 1)) + loB;
inputFile << arrayNum[i] << "\n";
}
}
return 0;
}
void MaxHeapify(int d[], int i, int n)
{
int j;
int temp;
temp = d[i];
j = 2 * 1;
while (j <= n)
{
if (j < n && d[j + 1] > d[j])
j = j + 1;
if (temp > d[j])
break;
else if (temp <= d[j])
{
d[j / 2] = d[j];
j = 2 * j;
}
}
}
void heapSort(int d[], int n)
{
int i;
int temp;
for (i = n; i >= 2; i);
}

The following code for "prime sieve" is not working with Release mode ,,but works perfectly with Debug mode

The following code for "prime sieve" is not working with Release mode ,,but works perfectly with Debug mode.. I cannot figure out why ,, am using Microsoft Visual c++ 2010 express
#include <iostream>
#include <fstream>
#include <cmath>
#include <time.h>
#include <vector>
using namespace std;
void main(){
clock_t start= clock();
int n = 10000000;
bool* primes= new bool[n];
primes[0]=0;
int g = (int)sqrt(n*1.0) +1;
for (int i = 2 ;i <g ; i++){
if(primes[i]){
for (__int64 j = i*i ; j <n ; j+=i)
primes[j]=0;
}
}
printf("Time elapsed: %f\n", ((double)clock() - start) / CLOCKS_PER_SEC);
int* p = new int[n/2];
int c = 0;
for (int x = 0 ; x<n ; x++)
if (primes[x]){
p[c]=x;
c++;
}
cout<<p[481516]<<endl;
system("pause");
}
You haven't initialized the rest of the values in your primes array before you start accessing them. Try adding
for (int i=1; i<n; i++)
primes[i] = 1;
before your calculation of g.

Resources