I'm having a problem with my matrix-multiplication code, which uses both MPI and OpenMP. The code compiles without errors but gives wrong results: the values in matrix c (inside the matmul function) are too big, and matrix C (in main) does not even receive the results computed by matmul. If anyone knows how to fix it, please help.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <omp.h>
#include <mpi.h>
/*
 * File-level state shared by matmul() and main():
 *   offset, rows  - slice start and row count exchanged between rank 0 and the other ranks
 *   br_elemenata  - per-rank count, N / ukupno rounded up (computed in matmul)
 *   cvor_id       - loop index over ranks in matmul
 *   cvor, ukupno  - this process's rank and the communicator size (set in main)
 */
int offset,rows,br_elemenata,cvor_id,cvor,ukupno;
/* Status object passed to every MPI_Recv in matmul. */
MPI_Status status;
/*
 * Return the current wall-clock time in seconds, with microsecond
 * resolution, as reported by gettimeofday().
 */
double gettime(void) {
    struct timeval tv;

    gettimeofday(&tv, NULL);
    return (double)tv.tv_sec + 1e-6 * (double)tv.tv_usec;
}
/*
 * Set every element of the N x N row-major matrix mat to val.
 * mat must point to at least N*N doubles; nothing is written when N <= 0.
 */
void matfill(long N, double *mat, double val) {
    for (long i = 0; i < N; i++) {
        for (long j = 0; j < N; j++) {
            mat[i * N + j] = val;
        }
    }
}
/*
 * Version from the question (reported there to give wrong results).
 * Intended to multiply the N x N matrices a and b into c: rank 0 sends a
 * slice of a and all of b to every other rank, each rank runs the triple
 * loop under an OpenMP parallel for, and the slices of c are sent back to
 * rank 0. Uses the file-level globals offset, rows, br_elemenata, cvor_id,
 * cvor, ukupno and status.
 */
void matmul(long N, double *a, double *b, double *c) {
long i, j, k;
br_elemenata = N / ukupno; // determine the number of elements per node
if (N % ukupno != 0) br_elemenata++; // increment the per-node count so that none are left out
if (cvor == 0){
for (cvor_id=1;cvor_id<ukupno;cvor_id++){
offset = cvor_id * br_elemenata;
rows = N - offset;
if (rows > br_elemenata)
rows = br_elemenata;
// send data from node 0 to the other nodes
MPI_Send(&offset, 1, MPI_INT, cvor_id, 0, MPI_COMM_WORLD);
MPI_Send(&rows, 1, MPI_INT, cvor_id, 0, MPI_COMM_WORLD);
// NOTE(review): a+offset advances by offset doubles while rows*N doubles are sent; the answer below uses a+(offset*N)
MPI_Send(a+offset, rows*N, MPI_DOUBLE, cvor_id, 0, MPI_COMM_WORLD);
MPI_Send(b, N*N, MPI_DOUBLE, cvor_id, 0, MPI_COMM_WORLD);
// rank 0 keeps the slice starting at 0 for itself
offset = 0;
rows = br_elemenata;
} else {
// Receive data from node 0
MPI_Recv(&offset, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
MPI_Recv(&rows, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
MPI_Recv(a+offset, rows*N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &status);
MPI_Recv(b, N*N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &status);
#pragma omp parallel for shared(a,b,c) private(i,j,k)
for (i = offset; i < offset + rows; i ++)
for (j = 0; j < N; j ++)
for (k = 0; k < N; k ++)
// NOTE(review): i + j and i + k omit the row stride N; the answer below uses i*N + j and i*N + k
c[i + j] += a[i + k] * b[k * N + j];
printf("Clan: %5.2f\n",c[i]);
// collect the results: every other rank sends offset, rows and its part of c to rank 0 with tag 1
if (cvor == 0) {
for (cvor_id = 1; cvor_id < ukupno; cvor_id++) {
MPI_Recv(&offset, 1, MPI_INT, cvor_id, 1, MPI_COMM_WORLD, &status);
MPI_Recv(&rows, 1, MPI_INT, cvor_id, 1, MPI_COMM_WORLD, &status);
MPI_Recv(c+offset, rows*N, MPI_DOUBLE, cvor_id, 1, MPI_COMM_WORLD, &status);
} else {
MPI_Send(&offset, 1, MPI_INT, 0, 1, MPI_COMM_WORLD);
MPI_Send(&rows, 1, MPI_INT, 0, 1, MPI_COMM_WORLD);
MPI_Send(c+offset, rows*N, MPI_DOUBLE, 0, 1, MPI_COMM_WORLD);
/*
 * Program entry (question version): takes the matrix dimension N from
 * argv[1], allocates three N x N matrices on every rank, fills A with 1.0,
 * B with 2.0 and C with 0.0, times one call to matmul() and prints the result.
 */
int main(int argc, char **argv) {
long N;
double *A, *B, *C, t;
MPI_Init(&argc,&argv); // Initialize MPI
MPI_Comm_size(MPI_COMM_WORLD,&ukupno); // determine the total number of nodes
MPI_Comm_rank(MPI_COMM_WORLD,&cvor); // determine this node's rank, the way each node is identified in communication
if (argc!=2) {
if (cvor==0) printf("Morate unijeti dimenziju matrice!");
MPI_Finalize(); // if no argument was given on the command line, end the program
return 1;
N = atoi(argv[1]);
A = (double *) malloc(N * N * sizeof(double));
B = (double *) malloc(N * N * sizeof(double));
C = (double *) malloc(N * N * sizeof(double));
matfill(N, A, 1.0);
matfill(N, B, 2.0);
matfill(N, C, 0.0);
// time the distributed multiplication
t = gettime();
matmul(N, A, B, C);
t = gettime() - t;
// if (cvor == 0){
// prints N, the elapsed seconds and (2N-1)*N*N divided by the elapsed time;
// with the rank check commented out, every rank prints
fprintf(stdout, "%ld\t%le\t%le\n", N, t, (2 * N - 1) * N * N / t);
printf("Clan: %f\n",C[6]);
// }

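The main issue is the offset used in the communication operations. The matrices are stored row-major with N doubles per row and offset counts rows, so the pointer offset should be offset*N. The same applies to the indexing in the multiplication loop (c[i*N + j] and a[i*N + k] instead of c[i + j] and a[i + k]).
Corrected code: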
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <omp.h>
#include <mpi.h>
/*
 * File-level state shared by matmul() and main():
 *   offset, rows  - first row and number of rows of the slice a rank works on
 *   br_elemenata  - rows assigned per rank, N / ukupno rounded up (computed in matmul)
 *   cvor_id       - loop index over ranks in matmul
 *   cvor, ukupno  - this process's rank and the communicator size (set in main)
 */
int offset,rows,br_elemenata,cvor_id,cvor,ukupno;
/* Status object passed to every MPI_Recv in matmul. */
MPI_Status status;
/*
 * Wall-clock timestamp in seconds (microsecond resolution), taken from
 * gettimeofday(). Used by main() to time the multiplication.
 */
double gettime(void) {
    struct timeval now;

    gettimeofday(&now, NULL);
    return (double)now.tv_sec + 1e-6 * (double)now.tv_usec;
}
/*
 * Fill the N x N row-major matrix mat with the constant val.
 * mat must point to at least N*N doubles; nothing is written when N <= 0.
 */
void matfill(long N, double *mat, double val) {
    long i, j;

    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            mat[i * N + j] = val;
        }
    }
}
/*
 * Print the N x N row-major matrix mat to stdout: each element is written
 * with "%g " and every row is terminated by a newline so that rows are
 * visually separated.
 */
void matprint(long N, double *mat) {
    long i, j;

    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            printf("%g ", mat[i * N + j]);
        }
        printf("\n");
    }
}
/*
 * Overwrite the N x N row-major matrix mat with a diagonal matrix whose
 * i-th diagonal entry is (double)i; every off-diagonal entry is set to 0.
 *
 * val is accepted for interface compatibility but is not used: the values
 * written depend only on the row index.
 */
void matdiag(long N, double *mat, double val) {
    long i, j;

    (void)val;
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            /* Both assignments stay inside the loops so no write lands past the N*N elements. */
            if (i == j) {
                mat[i * N + j] = (double)i;
            } else {
                mat[i * N + j] = 0;
            }
        }
    }
}
/*
 * Accumulate the product a * b into c for N x N row-major matrices,
 * splitting the rows of a (and c) across the MPI ranks and running the
 * row loop of each slice under an OpenMP parallel for.
 *
 * Uses the file-level globals: cvor (this rank) and ukupno (number of
 * ranks) must already be set; offset, rows, br_elemenata and cvor_id are
 * overwritten.
 *
 * Every rank must pass buffers of N*N doubles. Rank 0 supplies a and b and
 * sends each other rank its slice of a plus all of b. c must be zeroed
 * beforehand because products are added to it. On return rank 0 holds all
 * rows of c; the other ranks hold only the rows they computed.
 */
void matmul(long N, double *a, double *b, double *c) {
    long i, j, k;

    /* Rows per rank, rounded up so that no row is left out. */
    br_elemenata = N / ukupno;
    if (N % ukupno != 0) br_elemenata++;

    if (cvor == 0) {
        for (cvor_id = 1; cvor_id < ukupno; cvor_id++) {
            offset = cvor_id * br_elemenata;
            /* Because of the rounding up, trailing ranks may start past the
             * last row; hand them an empty slice instead of a negative count. */
            if (offset > N) offset = (int)N;
            rows = (int)(N - offset);
            if (rows > br_elemenata) rows = br_elemenata;

            /* Send the slice description, the slice of a and all of b. */
            MPI_Send(&offset, 1, MPI_INT, cvor_id, 0, MPI_COMM_WORLD);
            MPI_Send(&rows, 1, MPI_INT, cvor_id, 1, MPI_COMM_WORLD);
            MPI_Send(a + (offset * N), (int)(rows * N), MPI_DOUBLE, cvor_id, 2, MPI_COMM_WORLD);
            MPI_Send(b, (int)(N * N), MPI_DOUBLE, cvor_id, 3, MPI_COMM_WORLD);
        }
        /* Rank 0 keeps the first slice for itself. */
        offset = 0;
        rows = br_elemenata;
    } else {
        /* Receive the slice description and the data from rank 0. */
        MPI_Recv(&offset, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
        MPI_Recv(&rows, 1, MPI_INT, 0, 1, MPI_COMM_WORLD, &status);
        MPI_Recv(a + (offset * N), (int)(rows * N), MPI_DOUBLE, 0, 2, MPI_COMM_WORLD, &status);
        MPI_Recv(b, (int)(N * N), MPI_DOUBLE, 0, 3, MPI_COMM_WORLD, &status);
    }

    /* Each thread handles whole rows, so no two threads write the same c element. */
#pragma omp parallel for shared(a,b,c) private(i,j,k)
    for (i = offset; i < offset + rows; i++) {
        for (j = 0; j < N; j++) {
            for (k = 0; k < N; k++) {
                c[i * N + j] += a[i * N + k] * b[k * N + j];
            }
        }
    }

    /* Debug output: first element of this rank's slice. The loop index i is
     * private to the parallel loop and must not be used for indexing here. */
    if (rows > 0) printf("Clan: %5.2f\n", c[offset * N]);

    if (cvor == 0) {
        /* Collect every other rank's slice of c in its place. */
        for (cvor_id = 1; cvor_id < ukupno; cvor_id++) {
            MPI_Recv(&offset, 1, MPI_INT, cvor_id, 4, MPI_COMM_WORLD, &status);
            MPI_Recv(&rows, 1, MPI_INT, cvor_id, 5, MPI_COMM_WORLD, &status);
            MPI_Recv(c + (N * offset), (int)(rows * N), MPI_DOUBLE, cvor_id, 6, MPI_COMM_WORLD, &status);
        }
    } else {
        MPI_Send(&offset, 1, MPI_INT, 0, 4, MPI_COMM_WORLD);
        MPI_Send(&rows, 1, MPI_INT, 0, 5, MPI_COMM_WORLD);
        MPI_Send(c + (N * offset), (int)(rows * N), MPI_DOUBLE, 0, 6, MPI_COMM_WORLD);
    }
}
/*
 * Program entry: takes the matrix dimension N from argv[1], builds A, B and
 * C on every rank, times one distributed multiplication with matmul() and,
 * on rank 0, prints the timing line followed by the three matrices.
 *
 * Returns 1 if the dimension argument is missing or not a positive integer
 * or if an allocation fails, and 0 otherwise.
 */
int main(int argc, char **argv) {
    long N;
    double *A, *B, *C, t;
    char *end;

    MPI_Init(&argc, &argv);                   /* initialise MPI */
    MPI_Comm_size(MPI_COMM_WORLD, &ukupno);   /* total number of ranks */
    MPI_Comm_rank(MPI_COMM_WORLD, &cvor);     /* rank identifying this process */

    /* Without a usable dimension argument there is nothing to do. */
    if (argc != 2) {
        if (cvor == 0) printf("Morate unijeti dimenziju matrice!");
        MPI_Finalize();
        return 1;
    }
    N = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || N <= 0) {
        if (cvor == 0) printf("Morate unijeti dimenziju matrice!");
        MPI_Finalize();
        return 1;
    }

    A = malloc((size_t)N * (size_t)N * sizeof *A);
    B = malloc((size_t)N * (size_t)N * sizeof *B);
    C = malloc((size_t)N * (size_t)N * sizeof *C);
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "rank %d: cannot allocate matrices for N = %ld\n", cvor, N);
        MPI_Abort(MPI_COMM_WORLD, 1);
        return 1;
    }

    matfill(N, A, 1.0);
    matfill(N, B, 2.0);
    matfill(N, C, 0.0);   /* matmul accumulates into C, so it must start at zero */
    matdiag(N, A, 1);

    t = gettime();
    matmul(N, A, B, C);
    t = gettime() - t;

    /* Only rank 0 holds the complete result. */
    if (cvor == 0) {
        fprintf(stdout, "%ld\t%le\t%le\n", N, t, (2 * N - 1) * N * N / t);
        /* C[6] exists only when the matrix has at least 7 elements. */
        if (N * N > 6) printf("Clan: %f\n", C[6]);
        matprint(N, A);
        matprint(N, B);
        matprint(N, C);
    }

    free(A);
    free(B);
    free(C);
    MPI_Finalize();
    return 0;
}
To compile: mpicc main.c -o main (with GCC, add -fopenmp so that the OpenMP pragma takes effect). To run: mpirun -np 4 main
If you wish to go further, you may be interested in the MPI_Bcast() function, which sends the same data to every process. MPI_Scatter() and MPI_Gather() are helpful for distributing a matrix across processes or collecting it back on a given process.
Moreover, the dgemm() function of BLAS may be used to speed up the computation on each process.
To reduce the memory footprint, the allocated size of A and C may be reduced to br_elemenata rows (except on process 0)... and the offsets will have to change... again!


Use MPI gather to get data from tasks into master array

I'm learning how to use MPI to calculate the 4-point stencil of a 2D hot plate. My issue is that when I write my results to an output file, the file only contains the results of one of the tasks (I believe it's the task that finishes first). What I want to do is gather the data back to the master after performing all the iterations in the worker tasks, but I am unsure how to do that. I want to use MPI_Gather, or believe I need to use it, but I don't quite understand how to do it from the tutorials.
Alternatively, am I returning the data from my worker tasks to the master incorrectly?
Code here:
#define BEGIN 1 /* message tag */
#define LTAG 2 /* message tag */
#define RTAG 3 /* message tag */
#define NONE 0 /* indicates no neighbor */
#define DONE 4 /* message tag */
#define MASTER 0 /* taskid of first process */
#include <iostream>
#include <math.h>
#include <fstream>
#include <string>
#include <cstring>
#include <iomanip>
#include <chrono>
#include <mpi.h>
#include <chrono>
using namespace std::chrono;
using namespace std;
/*
 * Apply one 4-point-stencil step to rows start..end (inclusive) of a grid
 * with ny columns stored row-major in u1.
 *
 * For every interior column (1 .. ny-2) the average of the four neighbours
 * in u1 is written to u2; afterwards the same cells are copied from u2 back
 * into u1. The first and last column are never written. Rows start-1 and
 * end+1 of u1 are read, so both must be valid rows of the buffer.
 */
void update(int start, int end, int ny, double* u1, double* u2)
{
    int ix, iy;

    for (ix = start; ix <= end; ix++)
    {
        for (iy = 1; iy <= ny - 2; iy++)
        {
            //calculate stencil
            *(u2 + ix * ny + iy) = .25 * (
                *(u1 + (ix + 1) * ny + iy) +
                *(u1 + (ix - 1) * ny + iy) +
                *(u1 + ix * ny + (iy + 1)) +
                *(u1 + ix * ny + (iy - 1)));
        }
    }

    //pass into buffer
    for (ix = start; ix <= end; ix++)
    {
        for (iy = 1; iy <= ny - 2; iy++)
        {
            *(u1 + ix * ny + iy) = *(u2 + ix * ny + iy);
        }
    }
}
/*
 * Return true when number is a non-empty, NUL-terminated string made up of
 * decimal digits only. A leading '-' (negative number), any other
 * non-digit character, or an empty string yields false.
 */
bool isNumber(char number[])
{
    int i = 0;

    //checking for negative numbers and for an empty argument
    if (number[0] == '-' || number[0] == 0)
        return false;

    for (; number[i] != 0; i++)
    {
        // isdigit requires a value representable as unsigned char (or EOF)
        if (!isdigit(static_cast<unsigned char>(number[i])))
            return false;
    }
    return true;
}
/*
 * Program entry of the hot-plate solver from the question. As visible here it:
 *   - reads the grid size N and the iteration count I from "-n <N>" / "-I <I>" arguments,
 *   - allocates two (N+2) x (N+2) grids, phi[0] and phi[1], and initialises the boundary temperatures,
 *   - on task 0 distributes row blocks to the worker tasks and receives them back,
 *   - on the workers exchanges border rows with the neighbours and calls update() each iteration,
 *   - writes phi[0] to a CSV file.
 * NOTE(review): several statements appear cut off in this listing (trailing
 * call arguments, braces and else keywords are missing), so the exact
 * control flow cannot be confirmed from the text alone.
 */
int main(int argc, char* argv[])
//variable init
int N = 0;
int I = 0;
int taskid, /* this task's unique id */
numworkers, /* number of worker processes */
numtasks, /* number of tasks */
averow, rows, offset, extra, /* for sending rows of data */
dest, source, /* to - from for message send-receive */
left, right, /* neighbor tasks */
msgtype, /* for message types */
rc, start, end, /* misc */
i, ix, iy, iz, it; /* loop variables */
MPI_Status status;
//There should be five arguments, no more no less
if (argc != 5)
std::cout << "Invalid parameters, please check your values." << std::endl;
if (!isNumber(argv[2]) || !isNumber(argv[4]))
std::cout << "Invalid parameters, please check your values." << std::endl;
//check the n flag in either valid position.
if (!strcmp(argv[1], "-n"))
N = atoi(argv[2]);
else if (!strcmp(argv[3], "-n"))
N = atoi(argv[4]);
std::cout << "Invalid flags, please check your values." << std::endl;
//check the I flag in either valid position
if (!strcmp(argv[1], "-I"))
I = atoi(argv[2]);
else if (!strcmp(argv[3], "-I"))
I = atoi(argv[4]);
std::cout << "Invalid flags, please check your values." << std::endl;
if (N == 0 || I == 0)
std::cout << "Invalid parameters, please check your values." << std::endl;
int n_x = N + 2; //Based on the piazza post, increase x and y dimension by two for the edges
int n_y = N + 2;
// NOTE(review): every grid row below is a separate new double[n_y] allocation, while the
// MPI calls further down transfer rows * n_y doubles starting at &phi[0][offset][0] and
// update() is handed &phi[iz][0][0] as one flat buffer; that only works if the rows are
// contiguous in memory - confirm
double** phi[2];
double** resultArray;
phi[0] = new double* [n_x];
phi[1] = new double* [n_x];
resultArray = new double* [n_x];
for (int i = 0; i < n_x; i++)
phi[0][i] = new double[n_y];
phi[1][i] = new double[n_y];
resultArray[i] = new double[n_y];
int iterationMax = I;
//Initialize the plate temperatures
for (int j = 0; j < n_x; j++)
for (int i = 0; i < n_x; i++)
if ((i == 0) || (i == n_x - 1) || (j == 0) || (j == n_y - 1))
phi[0][i][j] = 20.0;
phi[1][i][j] = 20.0;
phi[0][i][j] = 0.0;
phi[1][i][j] = 0.0;
// columns strictly between 0.3 * n_x and 0.7 * n_x of row 0 are set to 100.0
for (int j = 0; j < n_x; j++)
if (j > (int)(0.3 * n_x) && j < (int)(0.7 * n_x))
phi[0][0][j] = 100.0;
phi[1][0][j] = 100.0;
/* First, find out my taskid and how many tasks are running */
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
numworkers = numtasks - 1;
if (taskid == 0) {
auto start = high_resolution_clock::now();
/************************* master code *******************************/
/* Check if numworkers is within range - quit if not */
//printf("Starting mpi_heat2D with %d worker tasks.\n", numworkers);
/* Initialize grid */
//printf("Grid size: X= %d Y= %d Time steps= %d\n", n_x, n_y, I);
/* Distribute work to workers. Must first figure out how many rows to */
/* send and what to do with extra rows. */
averow = n_x / numworkers;
extra = n_x % numworkers;
offset = 0;
for (i = 1; i <= numworkers; i++)
rows = (i <= extra) ? averow + 1 : averow;
/* Tell each worker who its neighbors are, since they must exchange */
/* data with each other. */
if (i == 1)
left = 0;
left = i - 1;
if (i == numworkers)
right = 0;
right = i + 1;
/* Now send startup information to each worker */
dest = i;
MPI_Send(&offset, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);
MPI_Send(&rows, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);
MPI_Send(&left, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);
MPI_Send(&right, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);
MPI_Send(&phi[0][offset][0], rows * n_y, MPI_DOUBLE, dest, 1,
/* printf("Sent to task %d: rows= %d offset= %d ", dest, rows, offset);
printf("left= %d right= %d\n", left, right);*/
offset = offset + rows;
/* Now wait for results from all worker tasks */
for (i = 1; i <= numworkers; i++)
source = i;
msgtype = DONE;
MPI_Recv(&offset, 1, MPI_INT, source, msgtype, MPI_COMM_WORLD,
MPI_Recv(&rows, 1, MPI_INT, source, msgtype, MPI_COMM_WORLD, &status);
MPI_Recv(&phi[0][offset][0], rows * n_y, MPI_DOUBLE, source,
msgtype, MPI_COMM_WORLD, &status);
/* Write final output, call X graph and finalize MPI */
auto stop = high_resolution_clock::now();
auto duration = duration_cast<milliseconds>(stop - start);
// output the time to the console
std::cout << "Time taken to solve diffusion plate: " << duration.count() << "ms" << std::fixed << std::endl;
} /* End of master code */
/************************* workers code **********************************/
if (taskid != MASTER)
/* Receive my offset, rows, neighbors and grid partition from master */
source = MASTER;
msgtype = BEGIN;
MPI_Recv(&offset, 1, MPI_INT, source, msgtype, MPI_COMM_WORLD, &status);
MPI_Recv(&rows, 1, MPI_INT, source, msgtype, MPI_COMM_WORLD, &status);
MPI_Recv(&left, 1, MPI_INT, source, msgtype, MPI_COMM_WORLD, &status);
MPI_Recv(&right, 1, MPI_INT, source, msgtype, MPI_COMM_WORLD, &status);
MPI_Recv(&phi[0][offset][0], rows * n_y, MPI_DOUBLE, source, msgtype,
MPI_COMM_WORLD, &status);
/* Determine border elements. Need to consider first and last columns. */
/* Obviously, row 0 can't exchange with row 0-1. Likewise, the last */
/* row can't exchange with last+1. */
start = offset;
end = offset + rows - 1;
if (offset == 0)
start = 1;
if ((offset + rows) == n_x)
// printf("task=%d start=%d end=%d\n", taskid, start, end);
/* Begin doing STEPS iterations. Must communicate border rows with */
/* neighbors. If I have the first or last grid row, then I only need */
/* to communicate with one neighbor */
//printf("Task %d received work. Beginning time steps...\n", taskid);
// iz selects which of the two grids holds the current values; it is flipped after every update() call
iz = 0;
for (it = 1; it <= iterationMax; it++)
if (left != NONE)
MPI_Send(&phi[iz][offset][0], n_y, MPI_DOUBLE, left,
source = left;
msgtype = LTAG;
MPI_Recv(&phi[iz][offset - 1][0], n_y, MPI_DOUBLE, source,
msgtype, MPI_COMM_WORLD, &status);
if (right != NONE)
MPI_Send(&phi[iz][offset + rows - 1][0], n_y, MPI_DOUBLE, right,
source = right;
msgtype = RTAG;
MPI_Recv(&phi[iz][offset + rows][0], n_y, MPI_DOUBLE, source, msgtype,
MPI_COMM_WORLD, &status);
/* Now call update to update the value of grid points */
update(start, end, n_y, &phi[iz][0][0], &phi[1-iz][0][0]);
iz = 1 - iz;
/* Finally, send my portion of final results back to master */
MPI_Send(&phi[iz][offset][0], rows * n_y, MPI_DOUBLE, MASTER, DONE,
//printf("Task %d finished work...\n", taskid);
//This is probably not correct
/*double gsize;
int root, myrank;
double* mybuf;
MPI_Gather(&mybuf, 1, MPI_DOUBLE, phi, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);*/
// NOTE(review): no taskid check is visible around the file output below, so every task
// appears to write the same file from its own copy of phi[0] - confirm against the full source
std::ofstream outJob;"finalTemperatures.csv");
double* printAddr = &phi[0][0][0];
for (int j = 0; j < n_y; j++) //file IO
for (int i = 0; i < n_x; i++)
outJob << phi[0][j][i] << "," << std::fixed;
outJob << std::endl;
return 0;

Error when using multiple MPI_Bcast calls

I have a problem with three MPI_Bcast calls and one MPI_Scatter: my program does not work properly, MPI_Scatter does not work, and globalparcsr is not scattered between the nodes. When I delete the second and third MPI_Bcast, MPI_Scatter works well. I want to broadcast a, globalindividual and globalfitness, and then scatter globalparcsr. Part of my code is below:
/*
 * Allocate an n x m matrix of int as ONE contiguous block of n*m elements
 * plus an array of n row pointers into that block, and store the row-pointer
 * array in *array. Because the data is contiguous, &(*array)[0][0] can be
 * handed to routines that expect a flat buffer of n*m ints.
 *
 * Returns 0 on success and -1 on failure (negative dimension or allocation
 * failure). On failure nothing stays allocated. To release a matrix, free
 * (*array)[0] (the data block) and then *array (the row pointers).
 */
int malloc2dint(int ***array, int n, int m) {
    if (n < 0 || m < 0) return -1;

    /* allocate the n*m contiguous items */
    int *p = (int *)malloc((size_t)n * (size_t)m * sizeof(int));
    if (!p) return -1;

    /* allocate the row pointers into the memory */
    (*array) = (int **)malloc((size_t)n * sizeof(int *));
    if (!(*array)) {
        free(p);   /* do not leak the data block when the second allocation fails */
        return -1;
    }

    /* set up the pointers into the contiguous memory */
    for (int i = 0; i < n; i++)
        (*array)[i] = &(p[(size_t)i * (size_t)m]);

    return 0;
}
/*
 * Excerpt of the program's main() from the question ("part of my code").
 * As visible here it initialises MPI, allocates and fills the population
 * on the root rank, allocates a local population on each rank, and then
 * issues three MPI_Bcast calls from rank 0.
 * NOTE(review): braces are missing in this listing, so which statements
 * belong to the "if (rank == root)" and the loop bodies cannot be confirmed.
 */
int main(int argc, char *argv[]) {
int size, rank, divided_pop_size, sum = 0, root = 0, procgridsize, sum2 = 0,generation=0;
int **globalindividual, **localindividual;
int *globalfitness, *localfitness;
int *globalparcsr, *localparcsr;
int **recbuf;
int *sendcounts, *parsendcount; //specifying the number of elements to send to each processor
int *displs, *pardispls; //Entry i specifies the displacement
MPI_Status status;
int offset, rows;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
divided_pop_size = n_initial_pop / size;
// NOTE(review): if the three allocations below are all guarded by this test, the
// global buffers exist on the root rank only, yet they are the receive buffers of
// the broadcasts further down on every rank - confirm
if (rank == root)
malloc2dint(&globalindividual, n_initial_pop, num_vertices);
globalfitness = (int*)malloc(n_initial_pop * sizeof(int));
globalparcsr = (int*)malloc(n_initial_pop * sizeof(int));
globalindividual = initial_population(globalindividual, n_initial_pop);
for (int i = 0; i < n_initial_pop; i++) {
for (int j = 0; j < num_vertices; j++)
printf("%d", globalindividual[i][j]);
for (int p = 0; p < size; p++) {
if (rank == p) {
malloc2dint(&localindividual,n_initial_pop + 2, num_vertices);
localindividual = initial_population(localindividual, divided_pop_size + 2);
MPI_Bcast(&a[0][0], 5000 * 5000, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&globalindividual[0][0], n_initial_pop*num_vertices, MPI_INT, 0, MPI_COMM_WORLD);
// NOTE(review): globalfitness is declared as int *, so &globalfitness is the address of the
// pointer variable itself rather than of the n_initial_pop ints it points to - confirm intent
MPI_Bcast(&globalfitness, n_initial_pop, MPI_INT, 0, MPI_COMM_WORLD);
return 0;
a is a 2D array, globalindividual is a 2D array with 12 rows and 8 columns, and globalfitness is a 1D array of size 12.
Please help me.

Collect all computed by mpi rows of matrix to root matrix

I have been trying all day to implement matrix multiplication with MPI; none of the examples from the Internet worked for me (I don't know why: they compile and run, but do not compute anything). Here is what I'm doing:
From bash:
mpirun -n 2 out/lb8
It reads a 2x4 matrix (1 row per process) and starts computing.
The problem is in the MPI_Sendrecv block (or, more generally, in collecting the data).
/*
 * Version from the question: each process computes rows_per_process rows of
 * the product of matrix_a (a_rows x a_cols) and matrix_b into a freshly
 * allocated a_rows x a_rows buffer, and the buffer is returned via *matrix_c.
 * NOTE(review): braces and the body of the divisibility check are missing in
 * this listing, so the exact nesting cannot be confirmed.
 */
void Matrix_MPY(double **matrix_a, double **matrix_b, double ***matrix_c, int a_rows, int a_cols) {
int i, j;
int process_rank, process_count;
MPI_Comm_rank(MPI_COMM_WORLD, &process_rank);
MPI_Comm_size(MPI_COMM_WORLD, &process_count);
if (a_rows % process_count != 0) {
/* rows handled by this process: current_row .. current_row + rows_per_process - 1 */
int rows_per_process = a_rows / process_count;
int current_row = rows_per_process * process_rank;
double **temp;
temp = (double **) malloc(sizeof(double *) * a_rows);
for (i = 0; i < a_rows; ++i){
/* NOTE(review): malloc does not zero the row, but the loop below accumulates with += */
temp[i] = (double *) malloc(sizeof(double) * a_rows);
for (i = current_row; i < current_row + rows_per_process; ++i) {
for (j = 0; j < a_rows; ++j)
int k;
for(k = 0; k < a_cols; ++k){
temp[i][j] += matrix_a[i][k] * matrix_b[k][j];
/* sends the row to ROOT while receiving into the same buffer from this process's own rank */
MPI_Sendrecv(temp[i], a_rows, MPI_DOUBLE, ROOT, TAG, temp[i], a_rows, MPI_DOUBLE, process_rank, TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
*matrix_c = temp;
This solution worked for me:
/* Replacement for the MPI_Sendrecv call: every non-root process sends row i to ROOT, using the row index as the message tag. */
if (process_rank != ROOT)
MPI_Send(temp[i], a_rows, MPI_DOUBLE, ROOT, i, MPI_COMM_WORLD);
/* ROOT then walks over the rows owned by each other process. */
/* NOTE(review): the body of the inner loop (presumably the matching receive of row j) is missing from this listing. */
if (process_rank == ROOT) {
for (i = 1; i < process_count; ++i)
for (j = i * rows_per_process; j < i * rows_per_process + rows_per_process; ++j)
*matrix_c = temp;

MPI Gather returning incorrect values

I've come to seek help with my issue.
The code below seems to return proper values on the root process, but incorrect values such as -1.#IND00 on all other processes. The barriers do not seem to work either: some of the processes run ahead before I have generated the arrays and broadcast them.
The main idea is to give each process a different part of the vector and then to glue the parts back into one variable with MPI_Gather.
I have no idea where I have gone wrong.
I'll be grateful for any help given.
/*
 * Fragment from the question: iterative solver loop that runs until delta
 * drops to granica or below. Each rank computes entries mystart..myend-1 of
 * the new iterate into vec, the pieces are combined into xNowe on every rank,
 * and rank 0 computes delta and broadcasts it.
 * NOTE(review): braces are missing in this listing; the per-element update
 * looks like (b[i] - sum over j != i of A[i][j] * x0[j]) / A[i][i] - confirm.
 */
double *xNowe = calloc(n, sizeof(double));
/* NOTE(review): n/size truncates, so this assumes n is divisible by size - confirm */
double *vec = calloc(n/size, sizeof(double));
while(delta > granica)
for(i = mystart; i < myend; i++)
vec[i - mystart] = b[i];
for(j = 0; j < n; j++)
if(i != j)
vec[i - mystart] -= A[i][j] * x0[j];
vec[i - mystart] = vec[i - mystart] / A[i][i];
if(rank > 0)
printf("\n%f", vec[i - mystart]);
/* NOTE(review): xNowe holds doubles but is printed with %d, and the format has seven conversions for six arguments */
printf("1: %d, 10: %d, 50: %d, 110: %d, 200: %d, 300: %d, 400: %d",xNowe[1],xNowe[10],xNowe[110],xNowe[200],xNowe[300],xNowe[400]);
/* every rank contributes n/size values and receives the concatenation of all contributions */
MPI_Allgather(vec, n/size, MPI_DOUBLE, xNowe, n/size, MPI_DOUBLE, MPI_COMM_WORLD);
if(rank == 0)
/* delta = sum of absolute differences between the new and the previous iterate */
delta = 0;
for(i = 0; i < n; i++)
delta = delta + ((xNowe[i] - x0[i] > 0) ? (xNowe[i] - x0[i]) : (-(xNowe[i] - x0[i])));
//x0 = xNowe; did not work
for(i = 0; i < n; i++)
x0[i] = xNowe[i];
MPI_Bcast(&delta, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
Update: The loop crashes at the 2nd iteration, with the values calculated at certain indices of xNowe printed as:
1: 1204749721, 10: -1085549499, 50: -1034011523, 110: 1063725393, 200: -17690801
07, 300: -1083408896, 400: -5847835510
1: 0, 10: -524288, 50: 0, 110: -524288, 200: 0, 300: -524288, 400: 0
MPI_Gather() gathers the values on the root process only. If you wish to gather the values on every process, use MPI_Allgather().

MPI matrix multiplication compile error: undeclared

I wrote an MPI matrix multiplication program that uses scanf("%d", &size) to read the matrix size, and then defines int matrix[size*size]. When I compiled it, the compiler reported that matrix is undeclared. Please tell me why, or what my problem is!
Following Ed's suggestion, I moved the matrix definition into the if(myid == 0) block, but got the same error! I am now posting my code; please help me find out where I made mistakes. Thank you!
#include "mpi.h"
#include <stdio.h>
#include <math.h>
#include <time.h>
int size;
/*
 * Program from the question (reported to fail to compile with an
 * "undeclared" error). Rank 0 reads the matrix size, sends matrix2 and a
 * block of rows of matrix1 to every other rank and collects the result rows
 * into matrix3; the other ranks receive their block, multiply and send it back.
 * The last rank additionally handles the size % numprocs leftover rows.
 * NOTE(review): braces are missing in this listing, so block boundaries
 * cannot be confirmed from the text alone.
 */
int main(int argc, char* argv[])
int myid, numprocs;
int *p;
MPI_Status status;
int i,j,k;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
/* NOTE(review): no call that assigns myid (e.g. MPI_Comm_rank) is visible in this listing */
if(myid == 0)
scanf("%d", &size);
int matrix1[size*size];
int matrix2[size*size];
int matrix3[size*size];
int section = size/numprocs;
int tail = size % numprocs;
for( i=0; i<size; i++)
for( j=0; j<size; j++)
matrix3[i*size+j]= 0;
printf("Matrix1 is: \n");
for( i=0; i<size; i++)
for( j=0; j<size; j++)
printf("%3d", matrix1[i*size+j]);
printf("Matrix2 is: \n");
for( i=0; i<size; i++)
for( j=0; j<size; j++)
printf("%3d", matrix2[i*size+j]);
//MPI_BCAST(matrix1, size*size, MPI_INT, 0, MPI_COMM_WORLD, );
for( i=1; i<numprocs; i++)
MPI_Send(&size, 1, MPI_INT, i, 0, MPI_COMM_WORLD);
MPI_Send(&section, 1, MPI_INT, i, 0, MPI_COMM_WORLD);
MPI_Send(&tail, 1, MPI_INT, i, 0, MPI_COMM_WORLD);
/* NOTE(review): "maxtrix2" is spelled differently from the matrix2 declared above; no identifier with this spelling is declared in the listing */
MPI_Send(maxtrix2, size*size, MPI_INT, i, 0, MPI_COMM_WORLD);
j = 0;
for( i=1; i<numprocs-1; i++)
p = &matrix1[size*section*j++];
/* NOTE(review): the row block is sent with tag 1, while the receiving branches below use tag 0 for matrix1 */
MPI_Send(p, size*section, MPI_INT, i, 1, MPI_COMM_WORLD);
p = &matrix1[size*section*j];
MPI_Send(p, size*section+size*tail, MPI_INT, numprocs-1, 1, MPI_COMM_WORLD);
p = matrix3;
for( i=1; i<numprocs-1; i++)
MPI_Recv(p, size*section, MPI_INT, i, 1, MPI_COMM_WORLD, &status);
p = &matrix3[size*section*i];
MPI_Recv(p, size*section+size*tail, MPI_INT, numprocs-1, 1, MPI_COMM_WORLD, &status);
printf("Matrix3 is: \n");
for( i=0; i<size; i++)
for( j=0; j<size; j++)
printf("%2d ", matrix3[i*size+j]);
/* ranks 1 .. numprocs-2: receive the parameters, all of matrix2 and "section" rows of matrix1 */
else if (myid > 0 && myid<numprocs-1 )
MPI_Recv(&size, 1, MPI_INT, 0, 0,MPI_COMM_WORLD, &status);
MPI_Recv(&section, 1, MPI_INT, 0, 0,MPI_COMM_WORLD, &status);
MPI_Recv(&tail, 1, MPI_INT, 0, 0,MPI_COMM_WORLD, &status);
int matrix1[size*size];
int matrix2[size*size];
int matrix3[size*size];
MPI_Recv(matrix2, size*size, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
MPI_Recv(matrix1, size*section, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
for( i=0; i<section; i++)
for( j=0; j<size; j++)
for( k=0; k<size; k++)
/* NOTE(review): the product is assigned (not accumulated) and written into matrix1, which is also the input being read */
matrix1[i*size+j] = matrix1[i*size+k]*matrix2[k*size+j];
MPI_Send(matrix1, size*section, MPI_INT, 0, 1, MPI_COMM_WORLD);
/* last rank: same as above, but with section + tail rows */
else if (myid > 0 && myid == numprocs-1)
MPI_Recv(&size, 1, MPI_INT, 0, 0,MPI_COMM_WORLD, &status);
MPI_Recv(&section, 1, MPI_INT, 0, 0,MPI_COMM_WORLD, &status);
MPI_Recv(&tail, 1, MPI_INT, 0, 0,MPI_COMM_WORLD, &status);
int matrix1[size*size];
int matrix2[size*size];
int matrix3[size*size];
MPI_Recv(matrix2, size*size, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
MPI_Recv(matrix1, size*section+size*tail, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
for( i=0; i<section+tail; i++)
for( j=0; j<size; j++)
for( k=0; k<size; k++)
matrix1[i*size+j] = matrix1[i*size+k]*matrix2[k*size+j];
MPI_Send(matrix1, size*section+size*tail, MPI_INT, 0, 1, MPI_COMM_WORLD);
return 0;
It may be that you call scanf() on only one process before the size of the matrix is set: the size variable exists in every process, but the scanf() is not executed in all of them.
If that is the case, you will have to scanf() the size of the matrix on the main process and then send it to every other process (via MPI_Bcast() or some other method) so that the matrix can be defined correctly everywhere.
Of course, this is just a guess, because you have provided far too little information for an informed answer, so I'm going for the most likely explanation.
OK, here are some changes that will make it compile. (Some of them you may already have made; your code came out a bit oddly formatted when you pasted it in, and there may be other problems I have missed.)
MPI_Send(maxtrix2, size*size, MPI_INT, i, 0, MPI_COMM_WORLD);
should be
MPI_Send(&matrix2, size*size, MPI_INT, i, 0, MPI_COMM_WORLD);
int section = size/numprocs;
int tail = size % numprocs;
These need to be declared before the first if statement so that they are visible in the later branches; declare them right at the start of main without assigning them. (Otherwise they do not exist when your other processes try to use them.)
Sorry, but I don't have time to work through the code and actually get it to do what you want; that should at least give you runnable code that you can debug.
The value of "size" is not known at compile time. Hence the error.
If you are new to coding, it may seem logical to read the value of size and then allocate an array of that size. This does, in fact, work in interpreted languages like Python. But your code is in C, and C programs need to be compiled. When the compiler looks at your code, it does not know the value of the variable "size", and in the next statement you use that variable as an array dimension. So you are attempting to use a variable whose value is not yet known, and that is what the compiler is complaining about. (Note that C99 and later compilers do accept variable-length arrays such as int matrix[size*size] inside a function; with such a compiler an "undeclared" message is more likely caused by an identifier that is misspelled or out of scope.)
Two ways to solve this:
1) Declare a sufficiently large matrix, say 1000 x 1000, and decide at run time how much of it you want to use. Do not use sizes larger than what you hard-coded in the source, i.e. 1000 x 1000. What you are doing here is telling the compiler to allocate memory for 1000 x 1000 items, of which you may or may not use the entire space. This wastes memory and is not an efficient way to do it.
2) Use dynamic allocation (malloc()/free()). However, given the nature of this question, this may be too advanced for you at the moment.
