Use MPI gather to get data from tasks into master array - parallel-processing

I'm learning how to use MPI to calculate the 4 point stencil of a 2d hot plate. But my issue is that when writing my results to an output file, the output file only contains the results of one of the tasks (I believe its the task that finishes first). What I want to do is gather the data after performing all the iterations in the worker tasks back to the master, but I am unsure how to do that. I want to use MPI_Gather, or believe I need to use it, but I don't quite understand how to do it from the tutorials.
Alternatively, am I misallocating how to return data back to the master in my worker threads?
Code here:
#define BEGIN 1 /* message tag */
#define LTAG 2 /* message tag */
#define RTAG 3 /* message tag */
#define NONE 0 /* indicates no neighbor */
#define DONE 4 /* message tag */
#define MASTER 0 /* taskid of first process */
#include <iostream>
#include <math.h>
#include <fstream>
#include <string>
#include <cstring>
#include <iomanip>
#include <chrono>
#include <mpi.h>
#include <chrono>
using namespace std::chrono;
using namespace std;
void update(int start, int end, int ny, double* u1, double* u2)
{
int ix, iy;
for (ix = start; ix <= end; ix++)
{
for (iy = 1; iy <= ny - 2; iy++)
{
//calculate stencil
*(u2 + ix * ny + iy) = .25 * (
*(u1 + (ix + 1) * ny + iy) +
*(u1 + (ix - 1) * ny + iy) +
*(u1 + ix * ny + (iy + 1)) +
*(u1 + ix * ny + (iy - 1)));
}
}
//pass into buffer
for (ix = start; ix <= end; ix++)
for (iy = 1; iy <= ny - 2; iy++)
*(u1 + ix * ny + iy) = *(u2 + ix * ny + iy);
}
bool isNumber(char number[])
{
int i = 0;
//checking for negative numbers
if (number[0] == '-')
return false;
for (; number[i] != 0; i++)
{
//if (number[i] > '9' || number[i] < '0')
if (!isdigit(number[i]))
return false;
}
return true;
}
int main(int argc, char* argv[])
{
//variable init
int N = 0;
int I = 0;
int taskid, /* this task's unique id */
numworkers, /* number of worker processes */
numtasks, /* number of tasks */
averow, rows, offset, extra, /* for sending rows of data */
dest, source, /* to - from for message send-receive */
left, right, /* neighbor tasks */
msgtype, /* for message types */
rc, start, end, /* misc */
i, ix, iy, iz, it; /* loop variables */
MPI_Status status;
//There should be five arguments, no more no less
if (argc != 5)
{
std::cout << "Invalid parameters, please check your values." << std::endl;
exit(1);
}
if (!isNumber(argv[2]) || !isNumber(argv[4]))
{
std::cout << "Invalid parameters, please check your values." << std::endl;
exit(1);
}
//check the n flag in either valid position.
if (!strcmp(argv[1], "-n"))
{
N = atoi(argv[2]);
}
else if (!strcmp(argv[3], "-n"))
{
N = atoi(argv[4]);
}
else
{
std::cout << "Invalid flags, please check your values." << std::endl;
exit(1);
}
//check the I flag in either valid position
if (!strcmp(argv[1], "-I"))
{
I = atoi(argv[2]);
}
else if (!strcmp(argv[3], "-I"))
{
I = atoi(argv[4]);
}
else
{
std::cout << "Invalid flags, please check your values." << std::endl;
exit(1);
}
if (N == 0 || I == 0)
{
std::cout << "Invalid parameters, please check your values." << std::endl;
exit(1);
}
int n_x = N + 2; //Based on the piazza post, increase x and y dimension by two for the edges
int n_y = N + 2;
double** phi[2];
double** resultArray;
phi[0] = new double* [n_x];
phi[1] = new double* [n_x];
resultArray = new double* [n_x];
for (int i = 0; i < n_x; i++)
{
phi[0][i] = new double[n_y];
phi[1][i] = new double[n_y];
resultArray[i] = new double[n_y];
}
int iterationMax = I;
//Initialize the plate temperatures
for (int j = 0; j < n_x; j++)
{
for (int i = 0; i < n_x; i++)
{
if ((i == 0) || (i == n_x - 1) || (j == 0) || (j == n_y - 1))
{
phi[0][i][j] = 20.0;
phi[1][i][j] = 20.0;
}
else
{
phi[0][i][j] = 0.0;
phi[1][i][j] = 0.0;
}
}
}
for (int j = 0; j < n_x; j++)
{
if (j > (int)(0.3 * n_x) && j < (int)(0.7 * n_x))
{
phi[0][0][j] = 100.0;
phi[1][0][j] = 100.0;
}
}
/* First, find out my taskid and how many tasks are running */
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
numworkers = numtasks - 1;
if (taskid == 0) {
auto start = high_resolution_clock::now();
/************************* master code *******************************/
/* Check if numworkers is within range - quit if not */
//printf("Starting mpi_heat2D with %d worker tasks.\n", numworkers);
/* Initialize grid */
//printf("Grid size: X= %d Y= %d Time steps= %d\n", n_x, n_y, I);
/* Distribute work to workers. Must first figure out how many rows to */
/* send and what to do with extra rows. */
averow = n_x / numworkers;
extra = n_x % numworkers;
offset = 0;
for (i = 1; i <= numworkers; i++)
{
rows = (i <= extra) ? averow + 1 : averow;
/* Tell each worker who its neighbors are, since they must exchange */
/* data with each other. */
if (i == 1)
left = 0;
else
left = i - 1;
if (i == numworkers)
right = 0;
else
right = i + 1;
/* Now send startup information to each worker */
dest = i;
MPI_Send(&offset, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);
MPI_Send(&rows, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);
MPI_Send(&left, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);
MPI_Send(&right, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);
MPI_Send(&phi[0][offset][0], rows * n_y, MPI_DOUBLE, dest, 1,
MPI_COMM_WORLD);
/* printf("Sent to task %d: rows= %d offset= %d ", dest, rows, offset);
printf("left= %d right= %d\n", left, right);*/
offset = offset + rows;
}
/* Now wait for results from all worker tasks */
for (i = 1; i <= numworkers; i++)
{
source = i;
msgtype = DONE;
MPI_Recv(&offset, 1, MPI_INT, source, msgtype, MPI_COMM_WORLD,
&status);
MPI_Recv(&rows, 1, MPI_INT, source, msgtype, MPI_COMM_WORLD, &status);
MPI_Recv(&phi[0][offset][0], rows * n_y, MPI_DOUBLE, source,
msgtype, MPI_COMM_WORLD, &status);
}
/* Write final output, call X graph and finalize MPI */
auto stop = high_resolution_clock::now();
auto duration = duration_cast<milliseconds>(stop - start);
std::cout.precision(5);
// output the time to the console
std::cout << "Time taken to solve diffusion plate: " << duration.count() << "ms" << std::fixed << std::endl;
} /* End of master code */
/************************* workers code **********************************/
if (taskid != MASTER)
{
/* Receive my offset, rows, neighbors and grid partition from master */
source = MASTER;
msgtype = BEGIN;
MPI_Recv(&offset, 1, MPI_INT, source, msgtype, MPI_COMM_WORLD, &status);
MPI_Recv(&rows, 1, MPI_INT, source, msgtype, MPI_COMM_WORLD, &status);
MPI_Recv(&left, 1, MPI_INT, source, msgtype, MPI_COMM_WORLD, &status);
MPI_Recv(&right, 1, MPI_INT, source, msgtype, MPI_COMM_WORLD, &status);
MPI_Recv(&phi[0][offset][0], rows * n_y, MPI_DOUBLE, source, msgtype,
MPI_COMM_WORLD, &status);
/* Determine border elements. Need to consider first and last columns. */
/* Obviously, row 0 can't exchange with row 0-1. Likewise, the last */
/* row can't exchange with last+1. */
start = offset;
end = offset + rows - 1;
if (offset == 0)
start = 1;
if ((offset + rows) == n_x)
end--;
// printf("task=%d start=%d end=%d\n", taskid, start, end);
/* Begin doing STEPS iterations. Must communicate border rows with */
/* neighbors. If I have the first or last grid row, then I only need */
/* to communicate with one neighbor */
//printf("Task %d received work. Beginning time steps...\n", taskid);
iz = 0;
for (it = 1; it <= iterationMax; it++)
{
if (left != NONE)
{
MPI_Send(&phi[iz][offset][0], n_y, MPI_DOUBLE, left,
RTAG, MPI_COMM_WORLD);
source = left;
msgtype = LTAG;
MPI_Recv(&phi[iz][offset - 1][0], n_y, MPI_DOUBLE, source,
msgtype, MPI_COMM_WORLD, &status);
}
if (right != NONE)
{
MPI_Send(&phi[iz][offset + rows - 1][0], n_y, MPI_DOUBLE, right,
LTAG, MPI_COMM_WORLD);
source = right;
msgtype = RTAG;
MPI_Recv(&phi[iz][offset + rows][0], n_y, MPI_DOUBLE, source, msgtype,
MPI_COMM_WORLD, &status);
}
/* Now call update to update the value of grid points */
update(start, end, n_y, &phi[iz][0][0], &phi[1-iz][0][0]);
iz = 1 - iz;
}
/* Finally, send my portion of final results back to master */
MPI_Send(&offset, 1, MPI_INT, MASTER, DONE, MPI_COMM_WORLD);
MPI_Send(&rows, 1, MPI_INT, MASTER, DONE, MPI_COMM_WORLD);
MPI_Send(&phi[iz][offset][0], rows * n_y, MPI_DOUBLE, MASTER, DONE,
MPI_COMM_WORLD);
//printf("Task %d finished work...\n", taskid);
MPI_Finalize();
}
//This is probably not correct
/*double gsize;
int root, myrank;
double* mybuf;
MPI_Gather(&mybuf, 1, MPI_DOUBLE, phi, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);*/
std::ofstream outJob;
outJob.open("finalTemperatures.csv");
std::cout.precision(15);
double* printAddr = &phi[0][0][0];
for (int j = 0; j < n_y; j++) //file IO
{
for (int i = 0; i < n_x; i++)
{
outJob << phi[0][j][i] << "," << std::fixed;
}
outJob << std::endl;
}
outJob.close();
return 0;
}
}

Related

How fix it. Unhandled exception at 0x0099B514 in ConsoleApplication15.exe: 0xC0000094: Integer division by zero

I am trying to solve problem with c++: Find all unique elements of a two-dimensional array of integers using MPI_Scatter and MPI_Comm_split to distribute the array's rows among a set of processes, so that the set of processes is split into three groups.
Got the code
#include <iostream>
#include <unordered_set>
#include <mpi.h>
using namespace std;
int main(int argc, char* argv[])
{
int rank, size;
int rows = 0, columns = 0;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
int* matrix = nullptr;
if (rank == 0)
{
cout << "Enter the number of rows: ";
cin >> rows;
cout << "Enter the number of columns: ";
cin >> columns;
matrix = new int[rows * columns];
cout << "Enter the elements of the matrix: " << endl;
for (int i = 0; i < rows; i++)
{
for (int j = 0; j < columns; j++)
{
cin >> matrix[i * columns + j];
}
}
}
MPI_Bcast(&rows, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&columns, 1, MPI_INT, 0, MPI_COMM_WORLD);
int sub_size = rows / size;
int* local_matrix = new int[sub_size * columns];
MPI_Scatter(matrix, sub_size * columns, MPI_INT, local_matrix, sub_size * columns, MPI_INT, 0, MPI_COMM_WORLD);
unordered_set<int> local_set;
for (int i = 0; i < sub_size; i++)
{
for (int j = 0; j < columns; j++)
{
local_set.insert(local_matrix[i * columns + j]);
}
}
MPI_Comm sub_comm;
MPI_Comm_split(MPI_COMM_WORLD, rank / (size / 3), rank, &sub_comm);
int sub_rank, new_sub_size;
MPI_Comm_rank(sub_comm, &sub_rank);
MPI_Comm_size(sub_comm, &new_sub_size);
unordered_set<int>* global_set = nullptr;
if (sub_rank == 0)
{
global_set = new unordered_set<int>[new_sub_size];
}
MPI_Gather(&local_set, sizeof(unordered_set<int>), MPI_BYTE, global_set, sizeof(unordered_set<int>), MPI_BYTE, 0, sub_comm);
if (sub_rank == 0)
{
unordered_set<int> final_set;
for (int i = 0; i < new_sub_size; i++)
{
for (auto it = global_set[i].begin(); it != global_set[i].end(); it++) {
final_set.insert(*it);
}
}
cout << "The unique elements in the matrix are: ";
for (auto it = final_set.begin(); it != final_set.end(); it++) {
cout << *it << " ";
}
cout << endl;
delete[] global_set;
}
delete[] local_matrix;
if (rank == 0) {
delete[] matrix;
}
MPI_Finalize();
return 0;
}
After compile and input data microsoft visual studio 2019 gives an error message
Unhandled exception at 0x0099B514 in ConsoleApplication15.exe: 0xC0000094: Integer division by zero.
to this line
MPI_Comm_split(MPI_COMM_WORLD, rank / (size / 3), rank, &sub_comm);
How fix it?

Poor speed up when running code in parallel using MPI and openMP [duplicate]

I have the following implementation:
int main(int argc, char **argv)
{
int n_runs = 100; // Number of runs
int seed = 1;
int arraySize = 400;
/////////////////////////////////////////////////////////////////////
// initialise the random number generator using a fixed seed for reproducibility
srand(seed);
MPI_Init(nullptr, nullptr);
int rank, n_procs;
MPI_Comm_size(MPI_COMM_WORLD, &n_procs);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
// Initialise the probability step and results vectors.
// We have 21 probabilities between 0 and 1 (inclusive).
double prob_step = 0.05;
std::vector<double> avg_steps_over_p(21,0);
std::vector<double> trans_avg_steps_over_p(21,0);
std::vector<int> min_steps_over_p(21,0);
std::vector<int> trans_min_steps_over_p(21,0);
std::vector<int> max_steps_over_p(21,0);
std::vector<int> trans_max_steps_over_p(21,0);
std::vector<double> prob_reached_end(21,0);
std::vector<double> trans_prob_reached_end(21,0);
// Loop over probabilities and compute the number of steps before the model burns out,
// averaged over n_runs.
for (int i = rank; i < 21; i+=n_procs)
{
double prob = i*prob_step;
int min_steps = std::numeric_limits<int>::max();
int max_steps = 0;
for (int i_run = 0; i_run < n_runs; ++i_run)
{
Results result = forest_fire(arraySize, prob);
avg_steps_over_p[i] += result.stepCount;
if (result.fireReachedEnd) ++prob_reached_end[i];
if (result.stepCount < min_steps) min_steps = result.stepCount;
if (result.stepCount > max_steps) max_steps = result.stepCount;
}
avg_steps_over_p[i] /= n_runs;
min_steps_over_p[i] = min_steps;
max_steps_over_p[i] = max_steps;
prob_reached_end[i] = 1.0*prob_reached_end[i] / n_runs;
}
// Worker processes communicate their results to the master process.
if (rank > 0)
{
MPI_Send(&avg_steps_over_p[0], 21, MPI_DOUBLE, 0, rank, MPI_COMM_WORLD);
MPI_Send(&min_steps_over_p[0], 21, MPI_INT, 0, rank, MPI_COMM_WORLD);
MPI_Send(&max_steps_over_p[0], 21, MPI_INT, 0, rank, MPI_COMM_WORLD);
MPI_Send(&prob_reached_end[0], 21, MPI_DOUBLE, 0, rank, MPI_COMM_WORLD);
} else
{
for (int i = 1; i < n_procs; ++i)
{
MPI_Status status;
MPI_Recv(&trans_avg_steps_over_p[0], 21, MPI_DOUBLE, i, i, MPI_COMM_WORLD, &status);
for (int j = i; j < 21; j += n_procs) {
avg_steps_over_p[j] = trans_avg_steps_over_p[j];
}
MPI_Recv(&trans_min_steps_over_p[0], 21, MPI_INT, i, i, MPI_COMM_WORLD, &status);
for (int j = i; j < 21; j += n_procs) {
min_steps_over_p[j] = trans_min_steps_over_p[j];
}
MPI_Recv(&trans_max_steps_over_p[0], 21, MPI_INT, i, i, MPI_COMM_WORLD, &status);
for (int j = i; j < 21; j += n_procs) {
max_steps_over_p[j] = trans_max_steps_over_p[j];
}
MPI_Recv(&trans_prob_reached_end[0], 21, MPI_DOUBLE, i, i, MPI_COMM_WORLD, &status);
for (int j = i; j < 21; j += n_procs) {
prob_reached_end[j] = trans_prob_reached_end[j];
}
}
// Master process outputs the final result.
std::cout << "Probability, Avg. Steps, Min. Steps, Max Steps" << std::endl;
for (int i = 0; i < 21; ++i)
{
double prob = i * prob_step;
std::cout << prob << "," << avg_steps_over_p[i]
<< "," << min_steps_over_p[i] << ","
<< max_steps_over_p[i] << ","
<< prob_reached_end[i] << std::endl;
}
}
MPI_Finalize();
return 0;
}
I have tried the following parameters: scaling analysis
I'm new to parallelisation and HPC so forgive me if I'm wrong, but I was expecting a speed-up ratio of greater than 3 when increasing the tasks per node and CPUs per task. I haven't yet tried all the possibilities but I believe the behaviour here is odd, especially when keeping CPUs per task at 1 and increasing tasks per node from 2->3->4. I know it's not as simple a case as greater core usage = greater speed up, but from what I've gathered these should speed-up.
Is there a possible inefficiency in my code that is leading to this, or is this expected behaviour? My full code is here, which includes the openMP parallelisation: https://www.codedump.xyz/cpp/Y5Rr68L8Mncmx1Sd.
Many thanks.
I don't know how many operations are in the forest_fire routine but it had better be a couple of tens of thousands otherwise you don't have enough work to overcome the parallelization overhead.
Rank 0 handles all processes sequentially. You should use MPI_Irecv. And I wonder if a collective operation would not be preferable.
You are indexing with [i] which is a strided operation. That is space-wasting as I pointed out in another question you posted. Every process should only allocate as much space as is needed on that process.

error when use multiple mpi_bcast

I have a problem with 3 mpi_bcast and one mpi_scatter, my program don't work well ,mpi_scatter don't work and globalparcsr don't scatter between nodes. when i delete second and third mpi_bcast ,mpi_scatter work well. I want broadcast a and globalindividual and globalfitness and then scatter globalparcsr, part of my code as bellow:
int malloc2dint(int ***array, int n, int m) {
/* allocate the n*m contiguous items */
int *p = (int *)malloc(n*m * sizeof(int));
if (!p) return -1;
/* allocate the row pointers into the memory */
(*array) = (int **)malloc(n * sizeof(int*));
if (!(*array)) {
free(p);
return -1;
}
/* set up the pointers into the contiguous memory */
for (int i = 0; i<n; i++)
(*array)[i] = &(p[i*m]);
return 0;
}
int main(int argc, char *argv[]) {
int size, rank, divided_pop_size, sum = 0, root = 0, procgridsize, sum2 = 0,generation=0;
int **globalindividual, **localindividual;
int *globalfitness, *localfitness;
int *globalparcsr, *localparcsr;
int **recbuf;
int *sendcounts, *parsendcount; //specifying the number of elements to send to each processor
int *displs, *pardispls; //Entry i specifies the displacement
MPI_Status status;
int offset, rows;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
divided_pop_size = n_initial_pop / size;
if (rank == root)
{
malloc2dint(&globalindividual, n_initial_pop, num_vertices);
read_graph();
globalfitness = (int*)malloc(n_initial_pop * sizeof(int));
globalparcsr = (int*)malloc(n_initial_pop * sizeof(int));
globalindividual = initial_population(globalindividual, n_initial_pop);
for (int i = 0; i < n_initial_pop; i++) {
printf("\n");
for (int j = 0; j < num_vertices; j++)
printf("%d", globalindividual[i][j]);
}
}
for (int p = 0; p < size; p++) {
if (rank == p) {
malloc2dint(&localindividual,n_initial_pop + 2, num_vertices);
localindividual = initial_population(localindividual, divided_pop_size + 2);
}
}
MPI_Bcast(&a[0][0], 5000 * 5000, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
MPI_Bcast(&globalindividual[0][0], n_initial_pop*num_vertices, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&globalfitness, n_initial_pop, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Finalize();
return 0;
}
a is a 2d array and globalindividual is a 2d array with 12 rows and 8 columns and globalfitness is 1d array with size 12
please help me.

Collect all computed by mpi rows of matrix to root matrix

I'm trying hard all day to implement matrix multiply with help of MPI, all examples from the Internet didn't work for me (I don't know why, it compiles, run but not computing). Here is what I'm doing:
From bash:
mpirun -n 2 out/lb8
It reading matrix 2x4 (1 row per process) and starting to compute.
The problem is in SendRecv block (or generally in collecting data)
void Matrix_MPY(double **matrix_a, double **matrix_b, double ***matrix_c, int a_rows, int a_cols) {
int i, j;
int process_rank, process_count;
MPI_Comm_rank(MPI_COMM_WORLD, &process_rank);
MPI_Comm_size(MPI_COMM_WORLD, &process_count);
if (a_rows % process_count != 0) {
error_code = NOT_DEVIDED_BY_RANK_EXCEPTION;
return;
}
int rows_per_process = a_rows / process_count;
int current_row = rows_per_process * process_rank;
double **temp;
temp = (double **) malloc(sizeof(double *) * a_rows);
for (i = 0; i < a_rows; ++i){
temp[i] = (double *) malloc(sizeof(double) * a_rows);
}
for (i = current_row; i < current_row + rows_per_process; ++i) {
for (j = 0; j < a_rows; ++j)
{
int k;
for(k = 0; k < a_cols; ++k){
temp[i][j] += matrix_a[i][k] * matrix_b[k][j];
}
}
MPI_Sendrecv(temp[i], a_rows, MPI_DOUBLE, ROOT, TAG, temp[i], a_rows, MPI_DOUBLE, process_rank, TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
*matrix_c = temp;
}
This solution is worked for me
....
if (process_rank != ROOT)
MPI_Send(temp[i], a_rows, MPI_DOUBLE, ROOT, i, MPI_COMM_WORLD);
}
if (process_rank == ROOT) {
for (i = 1; i < process_count; ++i)
{
for (j = i * rows_per_process; j < i * rows_per_process + rows_per_process; ++j)
{
MPI_Recv(temp[j], a_rows, MPI_DOUBLE, i, j, MPI_COMM_WORLD, MPI_STATUSES_IGNORE);
}
}
}
*matrix_c = temp;

Matrix Multiplication MPI + OMP

I'm having a problem with my code for matrix multiplication using both MPI and OMP. Code is correctly compiled but it give me wrong result,values in matrix c(in matmul function) are to big and matrix C(in main) doesn't even get results from function matmul. If anyone knows how to fix it,please help.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <omp.h>
#include <mpi.h>
int offset,rows,br_elemenata,cvor_id,cvor,ukupno;
MPI_Status status;
double gettime(void) {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec + 1e-6 * tv.tv_usec;
}
void matfill(long N, double *mat, double val) {
long i, j;
for(i = 0; i < N; i ++)
for(j = 0; j < N; j ++)
mat[i * N + j] = val;
}
void matmul(long N, double *a, double *b, double *c) {
long i, j, k;
br_elemenata = N / ukupno; //odredjujemo broj elemenata po cvoru
if (N % ukupno != 0) br_elemenata++; //inkrementujemo broj elemenata po cvoru kako ne bismo neki izostavili
if (cvor == 0){
for (cvor_id=1;cvor_id<ukupno;cvor_id++){
offset = cvor_id * br_elemenata;
rows = N - offset;
if (rows > br_elemenata)
rows = br_elemenata;
// slanje podataka sa cvora 0 na ostale cvorove
MPI_Send(&offset, 1, MPI_INT, cvor_id, 0, MPI_COMM_WORLD);
MPI_Send(&rows, 1, MPI_INT, cvor_id, 0, MPI_COMM_WORLD);
MPI_Send(a+offset, rows*N, MPI_DOUBLE, cvor_id, 0, MPI_COMM_WORLD);
MPI_Send(b, N*N, MPI_DOUBLE, cvor_id, 0, MPI_COMM_WORLD);
}
offset = 0;
rows = br_elemenata;
} else {
// Primanje podataka sa cvora 0
MPI_Recv(&offset, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
MPI_Recv(&rows, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
MPI_Recv(a+offset, rows*N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &status);
MPI_Recv(b, N*N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &status);
}
MPI_Barrier(MPI_COMM_WORLD);
#pragma omp parallel for shared(a,b,c) private(i,j,k)
for (i = offset; i < offset + rows; i ++)
for (j = 0; j < N; j ++)
for (k = 0; k < N; k ++)
c[i + j] += a[i + k] * b[k * N + j];
printf("Clan: %5.2f\n",c[i]);
if (cvor == 0) {
for (cvor_id = 1; cvor_id < ukupno; cvor_id++) {
MPI_Recv(&offset, 1, MPI_INT, cvor_id, 1, MPI_COMM_WORLD, &status);
MPI_Recv(&rows, 1, MPI_INT, cvor_id, 1, MPI_COMM_WORLD, &status);
MPI_Recv(c+offset, rows*N, MPI_DOUBLE, cvor_id, 1, MPI_COMM_WORLD, &status);
}
} else {
MPI_Send(&offset, 1, MPI_INT, 0, 1, MPI_COMM_WORLD);
MPI_Send(&rows, 1, MPI_INT, 0, 1, MPI_COMM_WORLD);
MPI_Send(c+offset, rows*N, MPI_DOUBLE, 0, 1, MPI_COMM_WORLD);
}
}
int main(int argc, char **argv) {
long N;
double *A, *B, *C, t;
MPI_Init(&argc,&argv); //Inicijalizacija MPI
MPI_Comm_size(MPI_COMM_WORLD,&ukupno); //odredjujemo ukupan broj cvorova
MPI_Comm_rank(MPI_COMM_WORLD,&cvor); //odredjuje redni broj cvora, nacin da se svaki cvor identifikuje u komunikaciji
if (argc!=2) {
if (cvor==0) printf("Morate unijeti dimenziju matrice!");
MPI_Finalize(); // ako ne postoji argument pri pozivu funkcije, zavrsiti program
return 1;
}
N = atoi(argv[1]);
A = (double *) malloc(N * N * sizeof(double));
B = (double *) malloc(N * N * sizeof(double));
C = (double *) malloc(N * N * sizeof(double));
matfill(N, A, 1.0);
matfill(N, B, 2.0);
matfill(N, C, 0.0);
t = gettime();
matmul(N, A, B, C);
t = gettime() - t;
// if (cvor == 0){
fprintf(stdout, "%ld\t%le\t%le\n", N, t, (2 * N - 1) * N * N / t);
fflush(stdout);
printf("Clan: %f\n",C[6]);
// }
free(A);
free(B);
free(C);
return EXIT_SUCCESS;
}
The main issue is the offset during communication operations. It should be offset*N.
Corrected code :
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <omp.h>
#include <mpi.h>
int offset,rows,br_elemenata,cvor_id,cvor,ukupno;
MPI_Status status;
double gettime(void) {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec + 1e-6 * tv.tv_usec;
}
void matfill(long N, double *mat, double val) {
long i, j;
for(i = 0; i < N; i ++)
for(j = 0; j < N; j ++)
mat[i * N + j] = val;
}
void matprint(long N, double *mat) {
long i, j;
for(i = 0; i < N; i ++){
for(j = 0; j < N; j ++){
printf("%g ",mat[i*N+j]);
}
printf("\n");
}
}
void matdiag(long N, double *mat, double val) {
long i, j;
for(i = 0; i < N; i ++)
for(j = 0; j < N; j ++)
if(i==j){
mat[i * N + j] = (double)i;
}else{
mat[i * N + j] =0;
}
}
void matmul(long N, double *a, double *b, double *c) {
long i, j, k;
br_elemenata = N / ukupno; //odredjujemo broj elemenata po cvoru
if (N % ukupno != 0) br_elemenata++; //inkrementujemo broj elemenata po cvoru kako ne bismo neki izostavili
if (cvor == 0){
for (cvor_id=1;cvor_id<ukupno;cvor_id++){
offset = cvor_id * br_elemenata;
rows = N - offset;
if (rows > br_elemenata)
rows = br_elemenata;
// slanje podataka sa cvora 0 na ostale cvorove
MPI_Send(&offset, 1, MPI_INT, cvor_id, 0, MPI_COMM_WORLD);
MPI_Send(&rows, 1, MPI_INT, cvor_id, 1, MPI_COMM_WORLD);
MPI_Send(a+(offset*N), rows*N, MPI_DOUBLE, cvor_id, 2, MPI_COMM_WORLD);
MPI_Send(b, N*N, MPI_DOUBLE, cvor_id, 3, MPI_COMM_WORLD);
}
offset = 0;
rows = br_elemenata;
} else {
// Primanje podataka sa cvora 0
MPI_Recv(&offset, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
MPI_Recv(&rows, 1, MPI_INT, 0, 1, MPI_COMM_WORLD, &status);
MPI_Recv(a+(offset*N), rows*N, MPI_DOUBLE, 0, 2, MPI_COMM_WORLD, &status);
MPI_Recv(b, N*N, MPI_DOUBLE, 0, 3, MPI_COMM_WORLD, &status);
}
MPI_Barrier(MPI_COMM_WORLD);
#pragma omp parallel for shared(a,b,c) private(i,j,k)
for (i = offset; i < offset + rows; i ++)
for (j = 0; j < N; j ++)
for (k = 0; k < N; k ++)
c[i*N + j] += a[i*N + k] * b[k * N + j];
printf("Clan: %5.2f\n",c[i]);
if (cvor == 0) {
for (cvor_id = 1; cvor_id < ukupno; cvor_id++) {
MPI_Recv(&offset, 1, MPI_INT, cvor_id, 4, MPI_COMM_WORLD, &status);
MPI_Recv(&rows, 1, MPI_INT, cvor_id, 5, MPI_COMM_WORLD, &status);
MPI_Recv(c+(N*offset), rows*N, MPI_DOUBLE, cvor_id, 6, MPI_COMM_WORLD, &status);
}
} else {
MPI_Send(&offset, 1, MPI_INT, 0, 4, MPI_COMM_WORLD);
MPI_Send(&rows, 1, MPI_INT, 0, 5, MPI_COMM_WORLD);
MPI_Send(c+(N*offset), rows*N, MPI_DOUBLE, 0, 6, MPI_COMM_WORLD);
}
}
int main(int argc, char **argv) {
long N;
double *A, *B, *C, t;
MPI_Init(&argc,&argv); //Inicijalizacija MPI
MPI_Comm_size(MPI_COMM_WORLD,&ukupno); //odredjujemo ukupan broj cvorova
MPI_Comm_rank(MPI_COMM_WORLD,&cvor); //odredjuje redni broj cvora, nacin da se svaki cvor identifikuje u komunikaciji
if (argc!=2) {
if (cvor==0) printf("Morate unijeti dimenziju matrice!");
MPI_Finalize(); // ako ne postoji argument pri pozivu funkcije, zavrsiti program
return 1;
}
N = atoi(argv[1]);
A = (double *) malloc(N * N * sizeof(double));
B = (double *) malloc(N * N * sizeof(double));
C = (double *) malloc(N * N * sizeof(double));
matfill(N, A, 1.0);
matfill(N, B, 2.0);
matfill(N, C, 0.0);
matdiag(N,A, 1) ;
t = gettime();
matmul(N, A, B, C);
t = gettime() - t;
if (cvor == 0){
fprintf(stdout, "%ld\t%le\t%le\n", N, t, (2 * N - 1) * N * N / t);
fflush(stdout);
printf("Clan: %f\n",C[6]);
printf("A\n");
matprint(N, A) ;
printf("B\n");
matprint(N, B) ;
printf("C\n");
matprint(N, C) ;
}
free(A);
free(B);
free(C);
MPI_Finalize();
return EXIT_SUCCESS;
}
To compile : mpicc main.c -o main To run : mpirun -np 4 main
If you wish to go further, you will be interested by the MPI_Bcast() function, which sends the same thing to everyone. MPI_Scatter() and MPI_Gather() are helpful to distribute matrices or get it back on a given process.
Moreover, the dgemm() function of BLAS may be used to speed up the computation on a given process.
To reduce the memory footprint, the allocated size of A and C may be decreased to account for br_elemenata (except on process 0)...and offsets will have to change...again !

Resources