Gaussian Elimination in OpenMP

Gaussian Elimination in OpenMP - openmp

Gaussian Elimination in OpenMP. I am new to openmp and wondering if I used my pragmas and barrier at correct places. my x values are different each time. Are they supposed to be the same??
#include <stdio.h>
int num;
double mm[6][7];
void gaussElimination();
int main() {
int i, j;
int k, s;
FILE *f = fopen("matrix.in", "r");
fscanf(f, "%d", &num);
for (i=0; i<num; ++i)
for (j=0; j<num+1; ++j)
fscanf(f, "%f", &mm[i][j]);
fclose(f);
for (i=0; i < num; i++)
for(j=0; j <num; j++);
gaussElimination();
for(k=0; k < num; ++k) {
for(s = 0; s < num+1; ++s)
printf("%3.2f\t", mm[k][s]);
printf("\n");
}
return 0;
}
void gaussElimination() {
int i, j, k, max;
double R;
// #pragma omp parallel for private (i, j)
for( i=0; i < num; ++i) {
max = i;
for(j= i+1; j < num; ++j)
if(mm[j][i] > mm[max][i])
max =j;
for(j=0; j < num+1; ++j) {
R = mm[max][j];
mm[max][j] = mm[i][j];
mm[i][j] = R;
}
#pragma omp parallel for private ( i, j)
for(j=num; j>= i; --j)
for(k=i+1; k <num; ++k)
mm[k][j] -= mm[k][i]/mm[i][i] * mm[i][j];
}
#pragma omp barrier
for(i = num-1; i >=0; --i) {
mm[i][num] = mm[i][num] / mm[i][i];
mm[i][i] = 1;
#pragma omp barrier
for(j= i - 1; j >= 0; --j) {
mm[j][num] -= mm[j][i] * mm[i][num];
mm[j][i] = 0;
}
#pragma omp barrier
}
}

With the current code, you have placed the OpenMP pragam on the the j and k loops. However, you have a private(i,j), which makes the variables i and j private (with no initial values). This should be private(j,k), because the j and k loop variables need to be private and i needs to be shared (since it is the loop bound for the j loop). The OpenMP barriers are not doing anything.

Related

matrix multiplication using malloc without user input

I am trying to use Malloc function to dynamically allocate memory but I also want to specify my data entry for operation rather than taking the user input.
I have found this code here which works fine, but I am working with a large data set and taking user input is not an option, so I want to keep using MALLOC but also define the data set.
like instead of following,
//Input Matrix1
for (i = 0; i < r1; i++)
for (j = 0; j < c1; j++)
scanf_s("%d", &mat1[i][j]);
I want something like
//mat1[2][2] = { {1,2},{2,3} }
to be inputed in the code
What would be the way to do it? I would really appreciate some advice. Thanks
#include<stdio.h>
#include<stdlib.h>
int main() {
int **mat1, **mat2, **res, i, j,k, r1, c1, r2, c2;
printf("\nEnter the Order of the First matrix...\n");
scanf_s("%d %d", &r1, &c1);
printf("\nEnter the Order of the Second matrix...\n");
scanf_s("%d %d", &r2, &c2);
if (c1 != r2) {
printf("Invalid Order of matrix");
exit(EXIT_SUCCESS);
}
mat1 = (int**)malloc(r1 * sizeof(int*));
for (i = 0; i < c1; i++)
mat1[i] = (int*)malloc(c1 * sizeof(int));
mat2 = (int**)malloc(r2 * sizeof(int*));
for (i = 0; i < c2; i++)
mat2[i] = (int*)malloc(c2 * sizeof(int));
res = (int**)calloc(r1, sizeof(int*));
for (i = 0; i < c2; i++)
res[i] = (int*)calloc(c2, sizeof(int));
/**/
//Input Matrix1
for (i = 0; i < r1; i++)
for (j = 0; j < c1; j++)
scanf_s("%d", &mat1[i][j]);
//Input Matrix2
for (i = 0; i < r2; i++)
for (j = 0; j < c2; j++)
scanf_s("%d", &mat2[i][j]);
//Printing Input Matrix 1 and 2
printf("\n Entered Matrix 1: \n");
for (i = 0; i < r1; i++) {
for (j = 0; j < c1; j++)
printf("%d ", mat1[i][j]);
printf("\n");
}
printf("\n Entered Matrix 2: \n");
for (i = 0; i < r2; i++) {
for (j = 0; j < c2; j++)
printf("%d ", mat2[i][j]);
printf("\n");
}
//int mat1[2][2] = { {1,2},{2,3} };
//int mat2[2][2] = { {1,3},{2,4} };
//Computation
//Multiplication
for (i = 0; i < r1; i++) {
for (j = 0; j < c2; j++) {
res[i][j] = 0;
for (k = 0; k < c1; k++)
res[i][j] += mat1[i][k] * mat2[k][j];
}
printf("\n");
}
printf("\nThe Multiplication of two matrix is\n");
for (i = 0; i < r1; i++) {
printf("\n");
for (j = 0; j < c2; j++)
printf("%d\t", res[i][j]);
}
printf("\n");
/* Addition
for(i=0;i<r1;i++)
for(j=0;j<c2;j++)
res[i][j]=mat1[i][j]+mat2[i][j];
printf("\nThe Addition of two matrix is\n");
for(i=0;i<r1;i++){
printf("\n");
for(j=0;j<c2;j++)
printf("%d\t",res[i][j]);
}
*/
return 0;
}

Please specify the format of your input data. Is it a csv file?
You can only specify data in the format int b[4] = {1, 2, 3, 4} when the size of b is fixed, i.e. known at compile time. But if all you matrices are known at compile time anyway, why bother doing dynamic allocation?
Also I cleaned up your code a bit:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define index(x, y, r) (x+r*y)
struct matrix {
int cols;
int rows;
double * data;
};
void print_mat(struct matrix * mat) {
int i, j;
for (i = 0; i < mat->rows; i++) {
for (j = 0; j < mat->cols; j++) {
printf("%f \t", mat->data[index(i, j, mat->rows)]);
}
printf("\n");
}
}
int mat_alloc(struct matrix *mat) {
mat->data = (double*)malloc(mat->rows*mat->cols*sizeof(double));
}
int read_mat(struct matrix *mat) {
int i, j;
for (i = 0; i < mat->rows; i++) {
for (j = 0; j < mat->cols; j++) {
scanf("%lf", &(mat->data[index(i, j, mat->rows)]));
}
}
}
int multiply(struct matrix * a, struct matrix * b, struct matrix * res) {
if(a->cols != b->rows){
printf("Matrix dimensions do not match!\n");
return 0;
}
res->rows = a->rows;
res->cols = b->cols;
mat_alloc(res);
memset(res->data, 0, res->cols*res->rows*sizeof(double));
int i, j, k;
for (i = 0; i < res->rows; i++) {
for (j = 0; j < res->cols; j++) {
for (k = 0; k < a->cols; k++) {
res->data[index(i, j, res->rows)] += a->data[index(i, k, a->rows)] * b->data[index(k, j, b->rows)];
}
}
}
}
int main() {
struct matrix mat1, mat2, res;
printf("\nEnter the Order of the First matrix...\n");
scanf("%d %d", &mat1.rows, &mat1.cols);
printf("\nEnter the Order of the Second matrix...\n");
scanf("%d %d", &mat2.rows, &mat2.cols);
mat_alloc(&mat1);
mat_alloc(&mat2);
read_mat(&mat1);
read_mat(&mat2);
printf("Scanned matrices: \n");
print_mat(&mat1);
printf("\n");
print_mat(&mat2);
printf("\n");
multiply(&mat1, &mat2, &res);
printf("Calculated result: \n");
print_mat(&res);
return 0;
}

openacc create data while running inside a kernels

I'm having a task that is to be accelerated by OpenACC. I need to do dynamic memory allocation within a kernel computation. I've built a simpler demo for it as following.
#include <iostream>
using namespace std;
#pragma acc routine seq
int *routine(int init) {
int *ptr;
#pragma acc data create(ptr[:10])
for (int i = 0; i < 10; ++i) {
ptr[i] = init + i;
}
return ptr;
}
void print_array(int *arr) {
for (int i = 0; i < 10; ++i) {
cout << arr[i] << " ";
}
cout << endl;
}
int main(void) {
int *arrs[5];
#pragma acc kernels
for (int i = 0; i < 5; ++i) {
arrs[i] = routine(i);
}
for (int i = 0; i < 5; ++i) {
print_array(arrs[i]);
}
return 0;
}
In this demo, I'm trying to call the routine while running inside a kernel construct. The routine procedure wants to create some data within the GPU and put some values into it.
While I can compile the code, but it reports runtime problems as following.
lisanhu#lisanhu-XPS-15-9550:create_and_copyout$ pgc++ -o test main.cc -acc -Minfo=accel
routine(int):
6, Generating acc routine seq
main:
23, Generating implicit copyout(arrs[:])
26, Accelerator restriction: size of the GPU copy of arrs is unknown
Loop is parallelizable
Generating implicit copy(arrs[:][:])
Accelerator kernel generated
Generating Tesla code
26, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
lisanhu#lisanhu-XPS-15-9550:create_and_copyout$ ./test
call to cuStreamSynchronize returned error 715: Illegal instruction
I'm wondering what I should do to accomplish this task (dynamically allocating memory within processing of a kernel construct). Really appreciate it if you could help.

This is untested, and probably very slow, but this might do what you need it to.
int main() {
const int num = 20;
int a[x] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 0};
int* sizes = (int *)malloc(num * sizeof(int));
int *ptrs[num];
int* temp, *temp2;
int sum;
int* finished = (int *)malloc(num * sizeof(int));
for (int x = 0; x < num; ++x){
finished[x] = 0;
}
#pragma acc kernels copyin(a[0:10]) copyout(ptrs[:num][:1]) async(num*2+1)
{
#pragma acc loop private(temp)
for (int i = 0; i < num; ++i){
#pragma acc loop seq async(i)
for (int j = 0; j < 1; ++j){
temp = ptrs[x];
sizes[i] = ...
}
while (ptrs[x] != x);
ptrs[x] = routine(a, sizes[i]);
}
}
while (true){
sum = 0;
for (int x = 0; x < num; ++x){
sum += finished[x];
}
if (sum == num){
break;
}
for (int x = 0; x < num; ++x){
if (acc_async_test(x) != 0 && finished[x] == 0){
finished[x] = 1;
#pragma acc update host(sizes[x:1])
temp = (int *)malloc(size[x] * sizeof(int));
#pragma acc enter data copyin(temp[0:x])
temp2 = acc_deviceptr(temp);
ptrs[x] = temp2;
#pragma acc update device(ptrs[x:1][0:1])
}
}
}
}

Different results when using OpenMP and FFTW

I am trying to parallelize the following loop:
#pragma omp parallel for private(j,i,mxy) firstprivate(in,out,p)
for(int j = 0; j < Ny; j++) {
// #pragma omp parallel for private(i,mxy) firstprivate(in,my,j)
for(int i = 0; i < Nx; i++){
mxy = i + j*Nx;
in[i+1] = b_2D[mxy] + I*0.0 ;
}
fftw_execute(p);
for(int i = 0; i < Nx; i++){
mxy = i + j*Nx;
b_2D[mxy] = cimag(out[i+1]) ;
}
}
I do get a small speed up, but I keep getting a different result regardless of what variables I set to private and firstprivate. I believe this is correct how I have done it, but why am I getting a different result than when I run this in series?
I have tried the following:
fftw_make_planner_thread_safe();
fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
#pragma omp parallel private(j,i,mxy) firstprivate(in,out)
{
fftw_plan p = fftw_plan_dft_1d(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
for( j = 0; j < N; j++)
in[j] = 0.0;
#pragma omp for
for( j = 0; j < Ny; j++) {
for( i = 0; i < Nx; i++)
in[i+1] = b_2D[i + j*Nx] + I*0.0;
fftw_execute(p);
for( i = 0; i < Nx; i++)
b_2D[i + j*Nx] = cimag(out[i+1]) ;
}
fftw_destroy_plan(p);
}
fftw_free(in);
fftw_free(out);
This give me the error: "Segmentation fault: 11"
If I run this:
fftw_make_planner_thread_safe();
#pragma omp parallel private(j,i,mxy)
{
fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
fftw_plan p = fftw_plan_dft_1d(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
for( j = 0; j < N; j++)
in[j] = 0.0;
#pragma omp for
for( j = 0; j < Ny; j++) {
for( i = 0; i < Nx; i++)
in[i+1] = b_2D[i + j*Nx] + I*0.0;
fftw_execute(p);
for( i = 0; i < Nx; i++)
b_2D[i + j*Nx] = cimag(out[i+1]) ;
}
fftw_destroy_plan(p);
fftw_free(in);
fftw_free(out);
}
I get this error again: "Segmentation fault: 11"
but I run again and it says:
solver(9674,0x7fff74e22000) malloc: *** error for object 0x7f8d70f00410: double free
*** set a breakpoint in malloc_error_break to debug
Abort trap: 6

You are calling FFTW with the same plan p in all threads. Since the plan includes the location of the input and output buffers (the ones supplied to the fftw_plan_dft_whatever plan constructor), all concurrent calls to fftw_execute will utilise those same buffers and not the private copies. The solution is to construct a separate plan for each thread:
#pragma omp parallel private(j,i,mxy) firstprivate(in,out)
{
// The following OpenMP construct enforces thread-safety
// Remove if the plan constructor is thread-safe
#pragma omp critical (plan_ops)
fftw_plan my_p = fftw_plan_dft_whatever(..., in, out, ...);
// my_p now refers the private in and out arrays
#pragma omp for
for(int j = 0; j < Ny; j++) {
for(int i = 0; i < Nx; i++){
mxy = i + j*Nx;
in[i+1] = b_2D[mxy] + I*0.0 ;
}
fftw_execute(my_p);
for(int i = 0; i < Nx; i++){
mxy = i + j*Nx;
b_2D[mxy] = cimag(out[i+1]) ;
}
}
// See comment above for the constructor operation
#pragma omp critical (plan_ops)
fftw_destroy_plan(my_p);
}

The root cause should be this patch isn't backported to fftw-3.3.5 version, and I think you should merge the patch yourself. You can also refer the discussion here.

openmp, for loop parallelization and critical zone error

I am new to OpenMP and I am using it to implement the Sieve of Eratosthenes, My code are:
int check_eratothenes(int *p, int pn, int n)
{
int count = 0;
bool* out = new bool[int(pow(pn, 2))];
memset(out, 0, pow(pn, 2));
#pragma omp parallel
for (int i = 0; i < n; i ++)
{
int j = floor((pn + 1) / p[i]) * p[i];
#pragma omp critical
while (j <= pow(pn, 2))
{
out[j] = 1;
j += p[i];
}
}
#pragma omp parallel
for (int i = pn+1; i < pow(pn, 2); i ++)
{
#pragma omp critical
if (out[i] == 0)
{
//cout << i << " ";
count ++;
}
}
return count;
}
But, the above OpenMP pragma is wrong. It can be complied but when it runs, it takes a lot of time to get the result, so it press CTRL + C to stop. And I felt at a loss on how to solve it . Since there are many loops and if statements.
Thanks in advance.

Why am I getting a SIGABRT here?

I am getting a SIGABRT error here. I don't understand why. Please help me out with this. It runs fine on my computer but on submitting it, it gives a SIGABRT on the very first test case. The memory alloted is to the order of 10^5 only and I have used vectors, instead of arrays, even then it gives this error.
#include <bits/stdc++.h>
using namespace std;
int main(){
int T;
scanf("%d", &T);
while(T--){
string s;
cin>>s;
int N;
string str;
scanf("%d", &N);
vector <string> f;
for (int i=0; i<N; i++){
cin>>str;
f.push_back(str);
}
vector<int> op;
op.resize(10, 0);
for (int i=0; i<N; i++){
for (int j=0;j <10; j++){
op[i] *= 2;
op[i] += ((f[i][j] == '+')? 1: 0);
}
}
int sint=0;
for (int i=0; i<10; i++){
sint*=2;
sint += ((s[i] == 'b')? 1: 0);
}
int allb = 1023;
int tgt = allb ^ sint;
vector <long long int> prev, curr;
prev.resize(1027, 0);
curr.resize(1027, 0);
prev[op[0]] = 1;
prev[0] = 1;
curr[0] = 1;
curr[op[0]] = 1;
if (op[0] == 0){
curr[op[0]]++;
prev[op[0]]++;
}
for (int i=1; i<N; i++){
for (int j=0; j<=1023; j++){
curr[j] = (prev[j] + prev[j ^ op[i]])%1000000007;
}
for (int j=0; j<=1023; j++)
prev[j] = curr[j];
}
cout<<curr[tgt]<<endl;
}
}

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio

Gaussian Elimination in OpenMP - openmp

Related

matrix multiplication using malloc without user input

openacc create data while running inside a kernels

Different results when using OpenMP and FFTW

openmp, for loop parallelization and critical zone error

Why am I getting a SIGABRT here?

Categories

Resources