Different results when using OpenMP and FFTW - openmp

I am trying to parallelize the following loop:
#pragma omp parallel for private(j,i,mxy) firstprivate(in,out,p)
for(int j = 0; j < Ny; j++) {
// #pragma omp parallel for private(i,mxy) firstprivate(in,my,j)
for(int i = 0; i < Nx; i++){
mxy = i + j*Nx;
in[i+1] = b_2D[mxy] + I*0.0 ;
for(int i = 0; i < Nx; i++){
mxy = i + j*Nx;
b_2D[mxy] = cimag(out[i+1]) ;
I do get a small speed up, but I keep getting a different result regardless of what variables I set to private and firstprivate. I believe this is correct how I have done it, but why am I getting a different result than when I run this in series?
I have tried the following:
fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
#pragma omp parallel private(j,i,mxy) firstprivate(in,out)
fftw_plan p = fftw_plan_dft_1d(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
for( j = 0; j < N; j++)
in[j] = 0.0;
#pragma omp for
for( j = 0; j < Ny; j++) {
for( i = 0; i < Nx; i++)
in[i+1] = b_2D[i + j*Nx] + I*0.0;
for( i = 0; i < Nx; i++)
b_2D[i + j*Nx] = cimag(out[i+1]) ;
This give me the error: "Segmentation fault: 11"
If I run this:
#pragma omp parallel private(j,i,mxy)
fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
fftw_plan p = fftw_plan_dft_1d(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
for( j = 0; j < N; j++)
in[j] = 0.0;
#pragma omp for
for( j = 0; j < Ny; j++) {
for( i = 0; i < Nx; i++)
in[i+1] = b_2D[i + j*Nx] + I*0.0;
for( i = 0; i < Nx; i++)
b_2D[i + j*Nx] = cimag(out[i+1]) ;
I get this error again: "Segmentation fault: 11"
but I run again and it says:
solver(9674,0x7fff74e22000) malloc: *** error for object 0x7f8d70f00410: double free
*** set a breakpoint in malloc_error_break to debug
Abort trap: 6

You are calling FFTW with the same plan p in all threads. Since the plan includes the location of the input and output buffers (the ones supplied to the fftw_plan_dft_whatever plan constructor), all concurrent calls to fftw_execute will utilise those same buffers and not the private copies. The solution is to construct a separate plan for each thread:
#pragma omp parallel private(j,i,mxy) firstprivate(in,out)
// The following OpenMP construct enforces thread-safety
// Remove if the plan constructor is thread-safe
#pragma omp critical (plan_ops)
fftw_plan my_p = fftw_plan_dft_whatever(..., in, out, ...);
// my_p now refers the private in and out arrays
#pragma omp for
for(int j = 0; j < Ny; j++) {
for(int i = 0; i < Nx; i++){
mxy = i + j*Nx;
in[i+1] = b_2D[mxy] + I*0.0 ;
for(int i = 0; i < Nx; i++){
mxy = i + j*Nx;
b_2D[mxy] = cimag(out[i+1]) ;
// See comment above for the constructor operation
#pragma omp critical (plan_ops)

The root cause should be this patch isn't backported to fftw-3.3.5 version, and I think you should merge the patch yourself. You can also refer the discussion here.


Thrust's exclusive_scan_by_key function takes the same amount of time as a sequential implementation?

I'm relatively new to Thrust and I'm trying to perform a segmented scan. Here is my code, which you should be able to run as-is:
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <chrono>
// Sequential scan for CPU
float* test_seqScan(float* in, int s, int m) {
float* out = new float[s * m];
for (unsigned int i = 0; i < s; i++) {
out[i * m] = 0;
for (unsigned int i = 0; i < s; i++) {
for (unsigned int j = 1; j < m; j++) {
out[i * m + j] = out[i * m + j - 1] + in[i * m + j - 1];
return out;
void test_sumScan(thrust::device_vector<float> dev_in, thrust::device_vector<int> dev_keys, int s, int m) {
// Allocate device memory for output
thrust::device_vector<float> dev_out(s * m);
thrust::exclusive_scan_by_key(thrust::device, dev_keys.begin(), dev_keys.end(), dev_in.begin(), dev_out.begin());
int main(){
int s = 100;
int m = 100000;
float* seq_in = new float[s * m];
for (int i = 0; i < s; i++) {
for (int j = 0; j < m; j++) {
seq_in[i * m + j] = j + 1;
thrust::host_vector<float> par_in(s * m);
for (int i = 0; i < s; i++) {
for (int j = 0; j < m; j++) {
par_in[i * m + j] = j + 1;
thrust::host_vector<int> keys(s * m);
for (int i = 0; i < s; i++) {
for (int j = 0; j < m; j++) {
keys[i * m + j] = i;
thrust::device_vector<float> dev_in = par_in;
thrust::device_vector<int> dev_keys = keys;
auto t1 = std::chrono::high_resolution_clock::now();
test_seqScan(seq_in, s, m);
auto t2 = std::chrono::high_resolution_clock::now();
auto duration1 = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
std::cout << "Sequential duration: " << duration1 << "\n\n";
auto t3 = std::chrono::high_resolution_clock::now();
test_sumScan(dev_in, dev_keys, s, m);
auto t4 = std::chrono::high_resolution_clock::now();
auto duration2 = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
std::cout << "Parallel duration: " << duration2 << "\n\n";
My issue is that both these snippets of code take exactly the same amount of time to run regardless of how small or large I set s and m. I assume that I'm doing something wrong, but I don't know what; can anyone point out the issue?

Why doesn't this parallel for loop give me speedup?

int main()
for (int i = 1; i < 24; i++){
float total_time = 0;
for (int j = 0; j < 5; j++){
vector<int> A(100000000,2);
double start_time = omp_get_wtime();
#pragma omp parallel for
for (int i = 0; i < 100000000; i++){
A[i] *= 2;
double time = omp_get_wtime() - start_time;
total_time += time;
total_time /= 5;
std::cout << "Number of threads: " << i << " Time(ms): " << total_time * 1000 << std::endl;
return 0;
The above code, which just doubles the entries in a vector of integers, has been parallelized by varying the number of threads. On my four-core machine, I observe no speedup. Given the simple nature of this loop, I expect to see at least some speedup. What's the issue here? How can I change it to get speedup?

openacc create data while running inside a kernels

I'm having a task that is to be accelerated by OpenACC. I need to do dynamic memory allocation within a kernel computation. I've built a simpler demo for it as following.
#include <iostream>
using namespace std;
#pragma acc routine seq
int *routine(int init) {
int *ptr;
#pragma acc data create(ptr[:10])
for (int i = 0; i < 10; ++i) {
ptr[i] = init + i;
return ptr;
void print_array(int *arr) {
for (int i = 0; i < 10; ++i) {
cout << arr[i] << " ";
cout << endl;
int main(void) {
int *arrs[5];
#pragma acc kernels
for (int i = 0; i < 5; ++i) {
arrs[i] = routine(i);
for (int i = 0; i < 5; ++i) {
return 0;
In this demo, I'm trying to call the routine while running inside a kernel construct. The routine procedure wants to create some data within the GPU and put some values into it.
While I can compile the code, but it reports runtime problems as following.
lisanhu#lisanhu-XPS-15-9550:create_and_copyout$ pgc++ -o test main.cc -acc -Minfo=accel
6, Generating acc routine seq
23, Generating implicit copyout(arrs[:])
26, Accelerator restriction: size of the GPU copy of arrs is unknown
Loop is parallelizable
Generating implicit copy(arrs[:][:])
Accelerator kernel generated
Generating Tesla code
26, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
lisanhu#lisanhu-XPS-15-9550:create_and_copyout$ ./test
call to cuStreamSynchronize returned error 715: Illegal instruction
I'm wondering what I should do to accomplish this task (dynamically allocating memory within processing of a kernel construct). Really appreciate it if you could help.
This is untested, and probably very slow, but this might do what you need it to.
int main() {
const int num = 20;
int a[x] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 0};
int* sizes = (int *)malloc(num * sizeof(int));
int *ptrs[num];
int* temp, *temp2;
int sum;
int* finished = (int *)malloc(num * sizeof(int));
for (int x = 0; x < num; ++x){
finished[x] = 0;
#pragma acc kernels copyin(a[0:10]) copyout(ptrs[:num][:1]) async(num*2+1)
#pragma acc loop private(temp)
for (int i = 0; i < num; ++i){
#pragma acc loop seq async(i)
for (int j = 0; j < 1; ++j){
temp = ptrs[x];
sizes[i] = ...
while (ptrs[x] != x);
ptrs[x] = routine(a, sizes[i]);
while (true){
sum = 0;
for (int x = 0; x < num; ++x){
sum += finished[x];
if (sum == num){
for (int x = 0; x < num; ++x){
if (acc_async_test(x) != 0 && finished[x] == 0){
finished[x] = 1;
#pragma acc update host(sizes[x:1])
temp = (int *)malloc(size[x] * sizeof(int));
#pragma acc enter data copyin(temp[0:x])
temp2 = acc_deviceptr(temp);
ptrs[x] = temp2;
#pragma acc update device(ptrs[x:1][0:1])

openmp, for loop parallelization and critical zone error

I am new to OpenMP and I am using it to implement the Sieve of Eratosthenes, My code are:
int check_eratothenes(int *p, int pn, int n)
int count = 0;
bool* out = new bool[int(pow(pn, 2))];
memset(out, 0, pow(pn, 2));
#pragma omp parallel
for (int i = 0; i < n; i ++)
int j = floor((pn + 1) / p[i]) * p[i];
#pragma omp critical
while (j <= pow(pn, 2))
out[j] = 1;
j += p[i];
#pragma omp parallel
for (int i = pn+1; i < pow(pn, 2); i ++)
#pragma omp critical
if (out[i] == 0)
//cout << i << " ";
count ++;
return count;
But, the above OpenMP pragma is wrong. It can be complied but when it runs, it takes a lot of time to get the result, so it press CTRL + C to stop. And I felt at a loss on how to solve it . Since there are many loops and if statements.
Thanks in advance.

Gaussian Elimination in OpenMP

Gaussian Elimination in OpenMP. I am new to openmp and wondering if I used my pragmas and barrier at correct places. my x values are different each time. Are they supposed to be the same??
#include <stdio.h>
int num;
double mm[6][7];
void gaussElimination();
int main() {
int i, j;
int k, s;
FILE *f = fopen("matrix.in", "r");
fscanf(f, "%d", &num);
for (i=0; i<num; ++i)
for (j=0; j<num+1; ++j)
fscanf(f, "%f", &mm[i][j]);
for (i=0; i < num; i++)
for(j=0; j <num; j++);
for(k=0; k < num; ++k) {
for(s = 0; s < num+1; ++s)
printf("%3.2f\t", mm[k][s]);
return 0;
void gaussElimination() {
int i, j, k, max;
double R;
// #pragma omp parallel for private (i, j)
for( i=0; i < num; ++i) {
max = i;
for(j= i+1; j < num; ++j)
if(mm[j][i] > mm[max][i])
max =j;
for(j=0; j < num+1; ++j) {
R = mm[max][j];
mm[max][j] = mm[i][j];
mm[i][j] = R;
#pragma omp parallel for private ( i, j)
for(j=num; j>= i; --j)
for(k=i+1; k <num; ++k)
mm[k][j] -= mm[k][i]/mm[i][i] * mm[i][j];
#pragma omp barrier
for(i = num-1; i >=0; --i) {
mm[i][num] = mm[i][num] / mm[i][i];
mm[i][i] = 1;
#pragma omp barrier
for(j= i - 1; j >= 0; --j) {
mm[j][num] -= mm[j][i] * mm[i][num];
mm[j][i] = 0;
#pragma omp barrier
With the current code, you have placed the OpenMP pragam on the the j and k loops. However, you have a private(i,j), which makes the variables i and j private (with no initial values). This should be private(j,k), because the j and k loop variables need to be private and i needs to be shared (since it is the loop bound for the j loop). The OpenMP barriers are not doing anything.
