Output is wrong when I use OpenACC (data clauses) - openacc

I want to optimize this code with OpenACC, but the output values are all zero. I would be grateful for any guidance that helps me solve this problem.
Kind regards,
Mohammadi
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <assert.h>
#include <openacc.h>
#include <time.h>
#include <string.h>
#include <malloc.h>

#define NX 4
#define NY 4
#define NZ 4

int main(void)
{
    int i, j, p, k, m;
    static double A[NX][NY][NZ] = {0.}, B[NX][NY][NZ] = {0.}, C[NX][NY][NZ] = {0.}, D[NX][NY][NZ] = {0.};
    FILE *file;
    file = fopen("B-and-A.csv", "w");

    #pragma acc data copyin(C,D),copy(A,B)
    {
        for (p = 0; p <= 5; p++) {
            #pragma acc kernels
            for (i = 1; i < NX - 1; i++) {
                for (j = 0; j < NY - 1; j++) {
                    for (k = 0; k < NZ - 1; k++) {
                        A[i][j][k] = A[i][j][k] + 1.*( B[i][j+1][k] + D[i][j][k] );
                    }
                }
            }

            #pragma acc kernels
            for (i = 1; i < NX - 1; i++) {
                for (j = 0; j < NY - 1; j++) {
                    for (k = 0; k < NZ - 1; k++) {
                        B[i][j][k] = B[i][j][k] + 1.*( A[i][j+1][k] + D[i][j][k] );
                    }
                }
            }

            for (m = 0; m < NZ - 1; m++) {
                A[0][m][m] = -25.;
                A[2][m][m] = 52.;
                B[0][m][m] = 15.;
                B[2][m][m] = -55.;
            }

            #pragma acc update self(B)
            fprintf(file, "%e\n", B[2][2][2]);
            printf("%e\n", B[2][2][2]);
        }
    }
    fclose(file);
}

When you perform an "update self(B)", you are copying the device values of B to the host copy of B. Any changes you have made to the host copy before this point will be lost.
For this code, you will need to update A and B on the host before the loop that executes on the host, and then update the device copies after the host loop. Alternatively, you can offload the loop to the GPU so all computation is done on the device.
Option #1:
// Update the host copies before executing on the host
#pragma acc update self(A,B)
// This loop is executed on the host
for (m = 0; m < NZ - 1; m++) {
    A[0][m][m] = -25.;
    A[2][m][m] = 52.;
    B[0][m][m] = 15.;
    B[2][m][m] = -55.;
}
// To keep the device and host copies coherent, update the device copies
#pragma acc update device(A,B)
Option #2:
// Or offload the loop to the device
#pragma acc kernels
for (m = 0; m < NZ - 1; m++) {
    A[0][m][m] = -25.;
    A[2][m][m] = 52.;
    B[0][m][m] = 15.;
    B[2][m][m] = -55.;
}
#pragma acc update self(B)
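For completeness, here is a sketch of how Option #2 slots into the original p loop; everything else in the program stays the same, and B is brought back to the host only for the output statements:

    for (p = 0; p <= 5; p++) {
        /* ... the two existing "#pragma acc kernels" loop nests that update A and B ... */

        /* The boundary assignments now run on the device, so the device copies
           of A and B stay current without extra update directives. */
        #pragma acc kernels
        for (m = 0; m < NZ - 1; m++) {
            A[0][m][m] = -25.;
            A[2][m][m] = 52.;
            B[0][m][m] = 15.;
            B[2][m][m] = -55.;
        }

        /* Copy the device values of B to the host just before printing. */
        #pragma acc update self(B)
        fprintf(file, "%e\n", B[2][2][2]);
        printf("%e\n", B[2][2][2]);
    }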

Related

Value of sum from thrust::reduce not correct

I have been trying to implement some code that calls reduce on a thrust::device_ptr, and the results are not consistent with the CPU implementation when dealing with large values. Since I have to deal with large values, is there a way around this?
My code:
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <iostream>

#define NZ 412
#define NX 402

using namespace std;
using real = double;

void allocate_array_2d(real**& preal, const int dim1, const int dim2) {
    // Contiguous allocation of 2D arrays
    preal = new real * [dim1];
    preal[0] = new real[dim1 * dim2];
    for (int i = 1; i < dim1; i++) preal[i] = preal[i - 1] + dim2;
    for (int i = 0; i < dim1; i++) {
        for (int j = 0; j < dim2; j++) {
            preal[i][j] = 0;
        }
    }
}

#define cudaCheckError(code)                                             \
  {                                                                      \
    if ((code) != cudaSuccess) {                                         \
      fprintf(stderr, "Cuda failure %s:%d: '%s' \n", __FILE__, __LINE__, \
              cudaGetErrorString(code));                                 \
    }                                                                    \
  }

int main()
{
    real** a;
    std::cout.precision(30);
    allocate_array_2d(a, NZ, NX); // input array
    for (int i = 0; i < NZ; i++) {
        for (int j = 0; j < NX; j++) {
            a[i][j] = 2.14748e+09;
        }
    }

    real* da;
    cudaCheckError(cudaMalloc(&da, NZ * NX * sizeof(real)));
    cudaCheckError(cudaMemcpy(da, a[0], NZ * NX * sizeof(real), cudaMemcpyHostToDevice));

    ///************************
    // CUDA KERNELS ARE HERE
    // REMOVED FOR CLEAR QUESTION
    ///*************************

    real sum1 = 0;
    thrust::device_ptr<real> dev_ptr = thrust::device_pointer_cast(da);
    sum1 = thrust::reduce(dev_ptr, dev_ptr + NZ * NX, 0, thrust::plus<real>());
    cout << " \nsum gpu " << sum1 << "\n";

    real sum2 = 0;
    //////// CPU PART DOING SAME THING //////
    for (int i = 0; i < NZ; i++) {
        for (int j = 0; j < NX; j++) {
            sum2 += a[i][j];
        }
    }
    cout << "\nsum cpu " << sum2 << "\n";

    if ((sum2 - sum1) < 0.001)
        std::cout << "\nSUCESS " << "\n";
    else
        std::cout << "\nFailure & by " << sum2 - sum1 << "\n";
}
The compiler I am using is nvcc and my graphics card is an NVIDIA 1650 with compute capability 7.5.
According to the documentation, thrust expects the type for summation to be reflected in the init value:
sum1 = thrust::reduce(dev_ptr, dev_ptr+NZ*NX, 0, thrust::plus<real>());
                                              ^
The type of that constant you have is an integral type. If you change that to a double-precision constant:
sum1 = thrust::reduce(dev_ptr, dev_ptr+NZ*NX, 0.0, thrust::plus<real>());
you get matching results between CPU and GPU, according to my testing. (You could alternatively cast your constant to the real type, (real)0, and use that; there are other ways to address this as well, such as dropping the init value and the binary op.)
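For illustration, here are the two alternatives mentioned above written out; both make the accumulation happen in double:

    // Cast the init value to the element type (real == double here).
    sum1 = thrust::reduce(dev_ptr, dev_ptr + NZ * NX, (real)0, thrust::plus<real>());

    // Or omit the init value and binary op entirely; thrust then derives the
    // accumulation type from the iterator's value_type.
    sum1 = thrust::reduce(dev_ptr, dev_ptr + NZ * NX);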

openmp ping pong breaks when using optimization

I have the following OpenMP program, compiled with mpicc -fopenmp -O0 ping_pong.c. On my machine, executing ./a.out -N 10000000 typically gives "done in 1.22125 secs, m: 10000001". If I increase the optimization level, the program hangs. Is there a way to 1) decrease the execution time while preserving the ping-pong functionality, and 2) make the code tolerant of optimization (no hang, not slower)?
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(int argc, char **argv) {
    int num_threads = 2;
    int N = 1000000;
    for (int i = 1; i < argc; i++) {
        if (strcmp(argv[i], "-N") == 0) {
            N = atoi(argv[++i]);
        }
    }
    omp_set_num_threads(num_threads);
    int m = 0;
    double t0 = omp_get_wtime();
    #pragma omp parallel
    {
        int id = omp_get_thread_num();
        while (m < N) {
            if (id == 0) {
                if (m % 2 == 0) m++;
            }
            if (id == 1) {
                if (m % 2 == 1) m++;
            }
        }
    }
    double t = omp_get_wtime() - t0;
    printf("done in %g secs, m: %d\n", t, m);
}
Faster and fully optimizable: flush the variable m before each if statement.
// to compile: gcc -fopenmp -O* ping_pong.c
// * can be 0, 1, 2, 3, or fast
// to run: ./a.out -N 10000000
#include <assert.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(int argc, char **argv) {
    int num_threads = 2;
    int N = 1000000;
    for (int i = 1; i < argc; i++) {
        if (strcmp(argv[i], "-N") == 0) {
            N = atoi(argv[++i]);
        }
    }
    omp_set_num_threads(num_threads);
    int m = 0;
    int count0 = 0;
    int count1 = 0;
    int *arr0 = (int *)calloc(N / 2 + 2, sizeof(int));
    int *arr1 = (int *)calloc(N / 2 + 2, sizeof(int));
    double t0 = omp_get_wtime();
    #pragma omp parallel
    {
        int id = omp_get_thread_num();
        if (id == 0) {
            printf("id %d reporting for duty!\n", id);
            while (m < N) {
                #pragma omp flush (m)
                if (m % 2 == 0) {
                    arr0[count0] = m;
                    m++;
                    count0++;
                }
            }
        }
        else if (id == 1) {
            printf("id %d reporting for duty!\n", id);
            while (m < N) {
                #pragma omp flush (m)
                if (m % 2 == 1) {
                    arr1[count1] = m;
                    m++;
                    count1++;
                }
            }
        }
    }
    double t = omp_get_wtime() - t0;
    printf("done in %g secs, m: %d, count0: %d, count1: %d\n", t, m, count0, count1);
    for (int i = 1; i < N / 2; i++) {
        if (arr0[i] != arr0[i - 1] + 2) {
            printf("arr0[%d] = %d, arr0[%d] = %d\n", i, arr0[i], i - 1, arr0[i - 1]);
            assert(0);
        }
    }
    for (int i = 1; i < N / 2; i++) {
        if (arr1[i] != arr1[i - 1] + 2) {
            printf("arr1[%d] = %d, arr1[%d] = %d\n", i, arr1[i], i - 1, arr1[i - 1]);
            assert(0);
        }
    }
    printf("Both arrays are correctly formed.\n");
    free(arr0);
    free(arr1);
    return 0;
}
OpenMP's memory model allows for different threads to have temporarily diverging views of shared variables. On cache-coherent architectures such as x86, the most frequent reason for diverging views are register optimisations.
This is very much compiler-dependent, but with -O0 most compilers don't do register optimisation, so both if (m % 2 == 0) and m++ result in code that reads or writes the actual memory location of m. With -O1 and higher, m is optimised to a register variable and the result is written back to memory only at the exit from the while loop. In the latter case, after one iteration, the register value of m in thread 0 becomes odd and the thread gets stuck. Similarly, the initial value of m in thread 1 is even (0) and that thread is already stuck.
Preventing register optimisation (and speculative execution / instruction reordering) from screwing the coherent view of shared variables is what the OpenMP flush construct is for. You need a bunch of #pragma omp flush(m) lines to make sure that both threads see the latest value of m.
You can also declare m as volatile int m = 0. The volatile modifier prevents register optimisation of m, so you'll get code similar to what -O0 produces. This is not the same as using the OpenMP flush directive, since on x86 flush performs a memory fence too.
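As an illustration of that volatile alternative, here is a minimal sketch of the original ping-pong loop with m declared volatile (with the caveat above that, unlike flush, volatile does not imply a memory fence):

    #include <omp.h>
    #include <stdio.h>

    int main(void) {
        const int N = 10000000;
        /* volatile forces m to be re-read from memory on every access,
           so neither thread can spin on a stale register copy. */
        volatile int m = 0;
        omp_set_num_threads(2);
        #pragma omp parallel
        {
            int id = omp_get_thread_num();
            while (m < N) {
                if (id == 0 && m % 2 == 0) m++;
                if (id == 1 && m % 2 == 1) m++;
            }
        }
        printf("m: %d\n", m);
        return 0;
    }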

Topcoder - grafixMask, Implementing DFS

I have been stuck on the problem grafixMask for a day now. This is the code I wrote following the pseudocode in the DFS tutorial. I think my code is not respecting the condition that decides which cells to include, resulting in a wrong answer, but I can't figure out how to fix it.
#include <iostream>
#include <vector>
#include <stack>
#include <algorithm>
#include <sstream>
#include <string>
using namespace std;

const int ROWS = 400;
const int COLUMNS = 600;

class grafixMask {
public:
    bool visited[ROWS][COLUMNS];
    vector<int> result;

    vector<int> sortedAreas(vector<string> rectangles) {
        // initialize graph
        for (int row = 0; row < ROWS; row++)
            for (int column = 0; column < COLUMNS; column++)
                visited[row][column] = false;

        for (string rec : rectangles) {
            int r1, c1, r2, c2;
            istringstream ss(rec);
            ss >> r1 >> c1 >> r2 >> c2;
            // set rectangular masks
            for (int i = r1; i <= r2; i++)
                for (int j = c1; j <= c2; j++)
                    visited[i][j] = true;

            for (int row = 0; row < ROWS; row++)
                for (int column = 0; column < COLUMNS; column++)
                    if (!visited[row][column])
                        result.push_back(doFill(row, column)); // find all connected points enclosed by masks
        }
        sort(result.begin(), result.end());
        return result;
    }

    int doFill(int row, int column) {
        int res = 0;
        stack<pair<int, int> > s;
        s.push(make_pair(row, column));
        while (!s.empty()) {
            pair<int, int> p = s.top();
            int r = p.first;
            int c = p.second;
            s.pop();
            if (r < 0 || r >= 400 || c < 0 || c >= 600 || visited[r][c]) continue;
            visited[r][c] = true;
            res++; // we covered additional area
            s.push(make_pair(r - 1, c));
            s.push(make_pair(r + 1, c));
            s.push(make_pair(r, c - 1));
            s.push(make_pair(r, c + 1));
        }
        return res;
    }
};
Going through the code an infinite number of times, I finally spotted what I did wrong:
Look at the code where I read the input rectangles. I accidentally nested the loop that finds all the connected components of the grid inside that input loop. The correct code is:
#include <algorithm>
#include <iostream>
#include <sstream>
#include <stack>
#include <string>
#include <vector>
using namespace std;

const int ROWS = 400;
const int COLUMNS = 600;
bool visited[400][600] = {false};

class grafixMask {
public:
    vector<int> result;

    vector<int> sortedAreas(vector<string> rectangles) {
        for (auto rec : rectangles) {
            istringstream ss(rec);
            int r1, c1, r2, c2;
            ss >> r1 >> c1 >> r2 >> c2;
            for (int i = r1; i <= r2; i++)
                for (int j = c1; j <= c2; j++) visited[i][j] = true;
        }
        for (int row = 0; row < ROWS; row++)
            for (int column = 0; column < COLUMNS; column++)
                if (!visited[row][column]) {
                    result.push_back(doFill(row, column));
                }
        sort(result.begin(), result.end());
        return result;
    }

    int doFill(int row, int column) {
        int res = 0;
        stack<pair<int, int> > s;
        s.push(make_pair(row, column));
        while (s.empty() == false) {
            pair<int, int> p = s.top();
            int r = p.first;
            int c = p.second;
            s.pop();
            if (r < 0 || r >= 400 || c < 0 || c >= 600 || visited[r][c])
                continue;
            visited[r][c] = true;
            res++; // we covered additional area
            int dirRow[] = {1, -1, 0, 0};
            int dirCol[] = {0, 0, 1, -1};
            for (int i = 0; i < 4; i++) {
                int newRow = r + dirRow[i];
                int newCol = c + dirCol[i];
                if (newRow >= 0 && newRow < 400 && newCol >= 0 && newCol < 600 &&
                    !visited[newRow][newCol]) {
                    s.push(make_pair(newRow, newCol));
                }
            }
        }
        return res;
    }
};
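For anyone who wants to exercise the class outside the Topcoder harness, here is a hypothetical usage sketch; the rectangle strings follow the "r1 c1 r2 c2" format parsed above, and the particular input values are made up:

    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
        grafixMask solver;
        // Two masked rectangles: a vertical strip and a horizontal strip.
        std::vector<std::string> rectangles = {"0 192 399 207", "192 0 207 599"};
        std::vector<int> areas = solver.sortedAreas(rectangles);
        // Prints the areas of the unmasked connected regions in ascending order.
        for (int a : areas) std::cout << a << " ";
        std::cout << "\n";
        return 0;
    }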

MPI_Scatterv submatrix with MPI_Type_struct

I'm currently working on an MPI program and I'm trying to send blocks of a matrix with scatterv to all processes.
Process description
The matrix is given as an array.
First I produce a datatype with MPI_Type_vector to create the necessary block out of the original array.
Second I create a MPI_Type_struct that should hold rows of blocks.
#include <math.h>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

#define n 16

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);
    MPI_Comm comm = MPI_COMM_WORLD;
    int p, r;
    MPI_Comm_size(comm, &p);
    MPI_Comm_rank(comm, &r);

    int *arr;
    arr = NULL;
    if (r == 0) {
        arr = (int *) malloc(n * n * sizeof(int));
        for (int i = 0; i < n * n; i++) arr[i] = i;
        for (int i = 0; i < n; i++) {
            printf("\n");
            for (int j = 0; j < n; j++)
                printf("%4d", arr[i * n + j]);
        }
    }
    printf("\n");

    int ps = sqrt(p);
    int ns = n / ps;
    if (r == 0) {
        printf("ps: %d ns: %d\n", ps, ns);
    }

    /* create datatype */
    MPI_Datatype block;
    MPI_Type_vector(ns, ns, n, MPI_INT, &block);

    int blocks[ps];
    MPI_Aint displs[ps];
    for (int i = 0; i < ps; i++) {
        blocks[i] = 1;
        displs[i] = i * sizeof(int);
    }

    MPI_Datatype types[ps];
    //for (int i = 0; i < ps - 1; i++) types[i] = block;
    //types[ps - 1] = MPI_UB;
    types[0] = block;
    for (int i = 1; i < ps; i++) types[i] = MPI_UB;
    //types[0] = block;
    //types[1] = MPI_UB;

    if (r == 0) {
        printf("displs:\n");
        for (int i = 0; i < ps; i++) printf("%3ld", displs[i]);
        printf("\n");
    }

    MPI_Datatype row;
    MPI_Type_struct(ps, blocks, displs, types, &row);
    MPI_Type_commit(&row);

    /* prepare scatter */
    int sdispl[p]; int sendcounts[p];
    for (int i = 0; i < p; i++) {
        sdispl[i] = (i % ps) + (i / ps) * (ns * ps);
        sendcounts[i] = 1;
    }
    if (r == 0) {
        printf("sdispl: \n");
        for (int i = 0; i < 4; i++) printf("%3d", sdispl[i]);
        printf("\n");
    }

    int rcv[ns * ns];
    MPI_Scatterv(arr, sendcounts, sdispl, row, rcv, ns * ns, MPI_INT, 0, comm);

    int result = 1;
    if (r == result) {
        printf("result for %d:\n", result);
        for (int i = 0; i < ns * ns; i++) {
            printf("%4d", rcv[i]);
            if ((i + 1) % ns == 0) printf("\n");
        }
    }
    if (arr != NULL) free(arr);
    MPI_Finalize();
    return 0;
}
So far the structure of the blocks is correct.
The problem
The block that was sent to process r = 1 starts with 3 instead of 4. The block for process r = 2 starts with 6, and the one for process r = 3 starts with 9.
For r == 4 it jumps to 48.
What it should do
r start
0 0
1 4
2 8
3 12
4 64
5 68
6 ...
15 204
The help I would need
I think I'm making some mistake with displs and sdispl.
Compiling and Running the example
The code is compiled with the following command:
mpicc -o main main.c -lm
I run the code with:
mpirun -np 16 ./main
Thanks for any help in advance!
With the hint from Zulan I was able to solve my problem.
The following code is based on the excellent answer about subarrays.
#include <math.h>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

#define n 8

void print_arr(int *arr, int x) {
    printf("\n");
    for (int i = 0; i < x * x; i++) {
        if (i % x == 0) printf("\n");
        printf("%4d", arr[i]);
    }
    printf("\n");
}

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);
    MPI_Comm comm = MPI_COMM_WORLD;
    int p, r;
    MPI_Comm_size(comm, &p);
    MPI_Comm_rank(comm, &r);

    /* number of processes in dim x and dim y */
    int ps = sqrt(p);
    /* number of elements in dim x and dim y in sarr */
    int ns = n / ps;

    /* array of data - distributed by process 0 */
    int *arr = NULL;
    if (r == 0) {
        arr = (int *) malloc(n * n * sizeof(int));
        for (int i = 0; i < n * n; i++) arr[i] = i;
        print_arr(arr, n);
    }

    MPI_Datatype type, resizedtype;
    int sizes[2]    = {n, n};
    int subsizes[2] = {ns, ns};
    int starts[2]   = {0, 0};
    MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_INT, &type);
    MPI_Type_create_resized(type, 0, ns * sizeof(int), &resizedtype);
    MPI_Type_commit(&resizedtype);

    int counts[p];
    for (int i = 0; i < p; i++) counts[i] = 1;
    int displs[p];
    for (int i = 0; i < p; i++) displs[i] = i % ps + i / ps * ns * ps;

    /* subarray to store distributed data */
    int sarr[ns * ns];
    /* send submatrices to all processes */
    MPI_Scatterv(arr, counts, displs, resizedtype, sarr, ns * ns, MPI_INT, 0, comm);

    /* print received data for process pr */
    int pr = 3;
    if (r == pr)
        print_arr(sarr, ns);

    /* free arr */
    if (arr != NULL) free(arr);
    MPI_Finalize();
    return 0;
}
You can compile the example with
mpicc -o main main.c
and run it with
mpirun -np 4 ./main
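As a side note on why the displacement formula works: Scatterv counts the displacements in units of the resized extent, which is ns ints. A small illustrative snippet, assuming the configuration above (n = 8, -np 4, so ps = 2 and ns = 4):

    /* For p = 4, ps = 2, ns = 4 the formula gives displs = {0, 1, 8, 9}.
     * Multiplying by the extent (ns ints) yields element offsets {0, 4, 32, 36},
     * i.e. the upper-left corners of the four 4x4 blocks of the 8x8 matrix:
     *   rank 0 -> row 0, col 0    rank 1 -> row 0, col 4
     *   rank 2 -> row 4, col 0    rank 3 -> row 4, col 4
     */
    for (int i = 0; i < p; i++)
        printf("rank %d: displ %d -> element offset %d\n",
               i, i % ps + i / ps * ns * ps, (i % ps + i / ps * ns * ps) * ns);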

openacc create data while running inside a kernels

I have a task that is to be accelerated with OpenACC, and I need to do dynamic memory allocation within a kernel computation. I've built a simpler demo of it, shown below.
#include <iostream>
using namespace std;

#pragma acc routine seq
int *routine(int init) {
    int *ptr;
    #pragma acc data create(ptr[:10])
    for (int i = 0; i < 10; ++i) {
        ptr[i] = init + i;
    }
    return ptr;
}

void print_array(int *arr) {
    for (int i = 0; i < 10; ++i) {
        cout << arr[i] << " ";
    }
    cout << endl;
}

int main(void) {
    int *arrs[5];
    #pragma acc kernels
    for (int i = 0; i < 5; ++i) {
        arrs[i] = routine(i);
    }
    for (int i = 0; i < 5; ++i) {
        print_array(arrs[i]);
    }
    return 0;
}
In this demo, I'm trying to call the routine while running inside a kernels construct. The routine is supposed to create some data on the GPU and put some values into it.
The code compiles, but it reports the following runtime problem.
lisanhu#lisanhu-XPS-15-9550:create_and_copyout$ pgc++ -o test main.cc -acc -Minfo=accel
routine(int):
6, Generating acc routine seq
main:
23, Generating implicit copyout(arrs[:])
26, Accelerator restriction: size of the GPU copy of arrs is unknown
Loop is parallelizable
Generating implicit copy(arrs[:][:])
Accelerator kernel generated
Generating Tesla code
26, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
lisanhu#lisanhu-XPS-15-9550:create_and_copyout$ ./test
call to cuStreamSynchronize returned error 715: Illegal instruction
I'm wondering what I should do to accomplish this task (dynamically allocating memory within a kernels region). I'd really appreciate it if you could help.
This is untested, and probably very slow, but this might do what you need it to.
// Sketch (untested): each device iteration records the size it needs, the host
// polls the async queues, allocates a buffer of that size, and writes the new
// device pointer back; routine() is assumed to be modified to take the input
// array and a size.
#include <openacc.h>
#include <stdlib.h>

int main() {
    const int num = 20;
    int a[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 0};
    int *sizes = (int *)malloc(num * sizeof(int));
    int *ptrs[num];
    int *temp, *temp2;
    int sum;
    int *finished = (int *)malloc(num * sizeof(int));
    for (int x = 0; x < num; ++x) {
        finished[x] = 0;
    }

    #pragma acc kernels copyin(a[0:10]) copyout(ptrs[:num][:1]) async(num*2+1)
    {
        #pragma acc loop private(temp)
        for (int i = 0; i < num; ++i) {
            #pragma acc loop seq async(i)
            for (int j = 0; j < 1; ++j) {
                temp = ptrs[i];       // remember the old pointer value
                sizes[i] = ...        // record how much memory this iteration needs
            }
            // spin until the host has installed a fresh device pointer
            while (ptrs[i] == temp);
            ptrs[i] = routine(a, sizes[i]);
        }
    }

    while (true) {
        sum = 0;
        for (int x = 0; x < num; ++x) {
            sum += finished[x];
        }
        if (sum == num) {
            break;
        }
        for (int x = 0; x < num; ++x) {
            if (acc_async_test(x) != 0 && finished[x] == 0) {
                finished[x] = 1;
                #pragma acc update host(sizes[x:1])
                temp = (int *)malloc(sizes[x] * sizeof(int));
                #pragma acc enter data copyin(temp[0:sizes[x]])
                temp2 = (int *)acc_deviceptr(temp);
                ptrs[x] = temp2;
                #pragma acc update device(ptrs[x:1][0:1])
            }
        }
    }
}
