I'm dealing with the acceleration of this subroutine.
void PTC3D(double ****V, double ****U)
{
int i,j,k, nv;
static double **u, **v;
if (u == NULL){
u = ARRAY_2D(NMAX_POINT, NVAR, double);
v = ARRAY_2D(NMAX_POINT, NVAR, double);
#pragma acc enter data copyin(v[:NMAX_POINT][:NVAR], u[:NMAX_POINT][:NVAR])
}
#pragma acc parallel loop collapse(2) private(v[:NMAX_POINT][:NVAR], u[:NMAX_POINT][:NVAR])
for (k = KBEG; k <= KEND; k++){
for (j = JBEG; j <= JEND; j++){
#pragma acc loop collapse(2)
for (i = IBEG; i <= IEND; i++){
for (nv = 0; nv < NVAR; nv++) v[i][nv] = V[nv][k][j][i];
}
PTC (v, u, IBEG, IEND);
#pragma acc loop collapse(2)
for (i = IBEG; i <= IEND; i++){
for (nv = 0; nv < NVAR; nv++) U[k][j][i][nv] = u[i][nv];
}
}}
}
I marked PTC with #pragma acc routine vector but inside of it I use an external variable gamma and hence the compiler generates: External and Static variables are not supported in acc routine.
I read that an alternative to passing gamma as function argument is the use of #pragma acc declare. I've no idea how it works.
I tried with #pragma acc declare copyin(gamma) at the beginning of the application, because gamma is defined at the beginning and never changed, but I get some error regarding the scope.
I'd need gamma on the device and to use it on every subroutine.
Related
I'm currently trying to get my matrix-vector multiplication function to compare favorably with BLAS by combining #pragma omp for with #pragma omp simd, but it's not getting any speedup improvement than if I were to just use the for construct. How do I properly vectorize the inner loop with OpenMP's SIMD construct?
vector dot(const matrix& A, const vector& x)
{
assert(A.shape(1) == x.size());
vector y = xt::zeros<double>({A.shape(0)});
int i, j;
#pragma omp parallel shared(A, x, y) private(i, j)
{
#pragma omp for // schedule(static)
for (i = 0; i < y.size(); i++) { // row major
#pragma omp simd
for (j = 0; j < x.size(); j++) {
y(i) += A(i, j) * x(j);
}
}
}
return y;
}
Your directive is incorrect because there would introduce in a race condition (on y(i)). You should use a reduction in this case. Here is an example:
vector dot(const matrix& A, const vector& x)
{
assert(A.shape(1) == x.size());
vector y = xt::zeros<double>({A.shape(0)});
int i, j;
#pragma omp parallel shared(A, x, y) private(i, j)
{
#pragma omp for // schedule(static)
for (i = 0; i < y.size(); i++) { // row major
decltype(y(0)) sum = 0;
#pragma omp simd reduction(+:sum)
for (j = 0; j < x.size(); j++) {
sum += A(i, j) * x(j);
}
y(i) += sum;
}
}
return y;
}
Note that it may not be necessary faster because some compilers are able to automatically vectorize the code (ICC for example). GCC and Clang often fail to perform (advanced) SIMD reductions automatically and such a directive help them a bit. You can check the assembly code to check how the code is vectorized or enable vectorization reports (see here for GCC).
I have been trying to parallelize with this loop with OpenMP
#define AX(i,j,k) (Ax[((k)*n+(j))*n+(i)])
for (int k = k1; k < k2; ++k) {
for (int j = j1; j < j2; ++j) {
for (int i = i1; i < i2; ++i) {
double xx = AX(i,j,k);
double xn = (i > 0) ? AX(i-1,j,k) : 0;
double xe = (j > 0) ? AX(i,j-1,k) : 0;
double xu = (k > 0) ? AX(i,j,k-1) : 0;
AX(i,j,k) = (xx+xn+xe+xu)/6*w;
}
}
}
#undef AX
I put this at the top of this code:
#pragma omp parallel for private (k,j,i) shared(Ax)
I noticed, however, that the #pragma is not working, since my function is simultaneously faster but generates more inconsistent results (probably due to data dependencies).
I probably have to put another clause or try to change something in the code, but I don't have any idea as to what.
EDIT :
Okay thank you I understand why it is not working but I tried why you said, and unfortunetaly it is still not working. Yet, I know the problem but I don't know how to solve it.
void ssor_forward_sweep(int n, int i1, int i2, int j1, int j2, int k1, int k2,
double* restrict Ax, double w)
{
int k,j,i;
double* AxL=malloc(n*sizeof(double));
for (int a=0; a < n;a++){
AxL[a]=Ax[a];
}
#define AX(i,j,k) (Ax[((k)*n+(j))*n+(i)])
#define AXL(i,j,k) (AxL[((k)*n+(j))*n+(i)])
#pragma omp parallel for private (k,j,i) shared(Ax)
for (k = k1; k < k2; ++k) {
for (j = j1; j < j2; ++j) {
for (i = i1; i < i2; ++i) {
double xx = AXL(i,j,k);
double xn = (i > 0) ? AXL(i-1,j,k) : 0;
double xe = (j > 0) ? AXL(i,j-1,k) : 0;
double xu = (k > 0) ? AXL(i,j,k-1) : 0;
AX(i,j,k) = (xx+xn+xe+xu)/6*w;
//AXL(i,j,k) = (xx+xn+xe+xu)/6*w;
}
}
}
#undef AX
#undef AXL
I know that there is still a problem with data dependencies but I don't know how to solve it ; indeed modified values aren't taking in account for the new ones. It also may have a problem when I am copying data.
When I am saying it is not working I don't have any output (no error and no output), it is just directly crashing.
Hope someone can help me !
Thank you so much for the help !
Best regards,
I am new to OpenMP and I am using it to implement the Sieve of Eratosthenes, My code are:
int check_eratothenes(int *p, int pn, int n)
{
int count = 0;
bool* out = new bool[int(pow(pn, 2))];
memset(out, 0, pow(pn, 2));
#pragma omp parallel
for (int i = 0; i < n; i ++)
{
int j = floor((pn + 1) / p[i]) * p[i];
#pragma omp critical
while (j <= pow(pn, 2))
{
out[j] = 1;
j += p[i];
}
}
#pragma omp parallel
for (int i = pn+1; i < pow(pn, 2); i ++)
{
#pragma omp critical
if (out[i] == 0)
{
//cout << i << " ";
count ++;
}
}
return count;
}
But, the above OpenMP pragma is wrong. It can be complied but when it runs, it takes a lot of time to get the result, so it press CTRL + C to stop. And I felt at a loss on how to solve it . Since there are many loops and if statements.
Thanks in advance.
I am using gcc's implementation of openmp to try to parallelize a program. Basically the assignment is to add omp pragmas to obtain speedup on a program that finds amicable numbers.
The original serial program was given(shown below except for the 3 lines I added with comments at the end). We have to parallize first just the outer loop, then just the inner loop. The outer loop was easy and I get close to ideal speedup for a given number of processors. For the inner loop, I get much worse performance than the original serial program. Basically what I am trying to do is a reduction on the sum variable.
Looking at the cpu usage, I am only using ~30% per core. What could be causing this? Is the program continually making new threads everytime it hits the omp parallel for clause? Is there just so much more overhead in doing a barrier for the reduction? Or could it be memory access issue(eg cache thrashing)? From what I read with most implementations of openmp threads get reused overtime(eg pooled), so I am not so sure the first problem is what is wrong.
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#include <omp.h>
#define numThread 2
int main(int argc, char* argv[]) {
int ser[29], end, i, j, a, limit, als;
als = atoi(argv[1]);
limit = atoi(argv[2]);
for (i = 2; i < limit; i++) {
ser[0] = i;
for (a = 1; a <= als; a++) {
ser[a] = 1;
int prev = ser[a-1];
if ((prev > i) || (a == 1)) {
end = sqrt(prev);
int sum = 0;//added this
#pragma omp parallel for reduction(+:sum) num_threads(numThread)//added this
for (j = 2; j <= end; j++) {
if (prev % j == 0) {
sum += j;
sum += prev / j;
}
}
ser[a] = sum + 1;//added this
}
}
if (ser[als] == i) {
printf("%d", i);
for (j = 1; j < als; j++) {
printf(", %d", ser[j]);
}
printf("\n");
}
}
}
OpenMP thread teams are instantiated on entering the parallel section. This means, indeed, that the thread creation is repeated every time the inner loop is starting.
To enable reuse of threads, use a larger parallel section (to control the lifetime of the team) and specificly control the parallellism for the outer/inner loops, like so:
Execution time for test.exe 1 1000000 has gone down from 43s to 22s using this fix (and the number of threads reflects the numThreads defined value + 1
PS Perhaps stating the obvious, it would not appear that parallelizing the inner loop is a sound performance measure. But that is likely the whole point to this exercise, and I won't critique the question for that.
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#include <omp.h>
#define numThread 2
int main(int argc, char* argv[]) {
int ser[29], end, i, j, a, limit, als;
als = atoi(argv[1]);
limit = atoi(argv[2]);
#pragma omp parallel num_threads(numThread)
{
#pragma omp single
for (i = 2; i < limit; i++) {
ser[0] = i;
for (a = 1; a <= als; a++) {
ser[a] = 1;
int prev = ser[a-1];
if ((prev > i) || (a == 1)) {
end = sqrt(prev);
int sum = 0;//added this
#pragma omp parallel for reduction(+:sum) //added this
for (j = 2; j <= end; j++) {
if (prev % j == 0) {
sum += j;
sum += prev / j;
}
}
ser[a] = sum + 1;//added this
}
}
if (ser[als] == i) {
printf("%d", i);
for (j = 1; j < als; j++) {
printf(", %d", ser[j]);
}
printf("\n");
}
}
}
}
Gaussian Elimination in OpenMP. I am new to openmp and wondering if I used my pragmas and barrier at correct places. my x values are different each time. Are they supposed to be the same??
#include <stdio.h>
int num;
double mm[6][7];
void gaussElimination();
int main() {
int i, j;
int k, s;
FILE *f = fopen("matrix.in", "r");
fscanf(f, "%d", &num);
for (i=0; i<num; ++i)
for (j=0; j<num+1; ++j)
fscanf(f, "%f", &mm[i][j]);
fclose(f);
for (i=0; i < num; i++)
for(j=0; j <num; j++);
gaussElimination();
for(k=0; k < num; ++k) {
for(s = 0; s < num+1; ++s)
printf("%3.2f\t", mm[k][s]);
printf("\n");
}
return 0;
}
void gaussElimination() {
int i, j, k, max;
double R;
// #pragma omp parallel for private (i, j)
for( i=0; i < num; ++i) {
max = i;
for(j= i+1; j < num; ++j)
if(mm[j][i] > mm[max][i])
max =j;
for(j=0; j < num+1; ++j) {
R = mm[max][j];
mm[max][j] = mm[i][j];
mm[i][j] = R;
}
#pragma omp parallel for private ( i, j)
for(j=num; j>= i; --j)
for(k=i+1; k <num; ++k)
mm[k][j] -= mm[k][i]/mm[i][i] * mm[i][j];
}
#pragma omp barrier
for(i = num-1; i >=0; --i) {
mm[i][num] = mm[i][num] / mm[i][i];
mm[i][i] = 1;
#pragma omp barrier
for(j= i - 1; j >= 0; --j) {
mm[j][num] -= mm[j][i] * mm[i][num];
mm[j][i] = 0;
}
#pragma omp barrier
}
}
With the current code, you have placed the OpenMP pragam on the the j and k loops. However, you have a private(i,j), which makes the variables i and j private (with no initial values). This should be private(j,k), because the j and k loop variables need to be private and i needs to be shared (since it is the loop bound for the j loop). The OpenMP barriers are not doing anything.