How to fix: Unhandled exception at 0x0099B514 in ConsoleApplication15.exe: 0xC0000094: Integer division by zero - C++11

I am trying to solve a problem with C++: find all unique elements of a two-dimensional array of integers, using MPI_Scatter and MPI_Comm_split to distribute the array's rows among a set of processes, so that the set of processes is split into three groups.
Here is the code:
#include <iostream>
#include <unordered_set>
#include <mpi.h>

using namespace std;

int main(int argc, char* argv[])
{
    int rank, size;
    int rows = 0, columns = 0;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    int* matrix = nullptr;
    if (rank == 0)
    {
        cout << "Enter the number of rows: ";
        cin >> rows;
        cout << "Enter the number of columns: ";
        cin >> columns;
        matrix = new int[rows * columns];
        cout << "Enter the elements of the matrix: " << endl;
        for (int i = 0; i < rows; i++)
        {
            for (int j = 0; j < columns; j++)
            {
                cin >> matrix[i * columns + j];
            }
        }
    }
    MPI_Bcast(&rows, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&columns, 1, MPI_INT, 0, MPI_COMM_WORLD);
    int sub_size = rows / size;
    int* local_matrix = new int[sub_size * columns];
    MPI_Scatter(matrix, sub_size * columns, MPI_INT, local_matrix, sub_size * columns, MPI_INT, 0, MPI_COMM_WORLD);
    unordered_set<int> local_set;
    for (int i = 0; i < sub_size; i++)
    {
        for (int j = 0; j < columns; j++)
        {
            local_set.insert(local_matrix[i * columns + j]);
        }
    }
    MPI_Comm sub_comm;
    MPI_Comm_split(MPI_COMM_WORLD, rank / (size / 3), rank, &sub_comm);
    int sub_rank, new_sub_size;
    MPI_Comm_rank(sub_comm, &sub_rank);
    MPI_Comm_size(sub_comm, &new_sub_size);
    unordered_set<int>* global_set = nullptr;
    if (sub_rank == 0)
    {
        global_set = new unordered_set<int>[new_sub_size];
    }
    MPI_Gather(&local_set, sizeof(unordered_set<int>), MPI_BYTE, global_set, sizeof(unordered_set<int>), MPI_BYTE, 0, sub_comm);
    if (sub_rank == 0)
    {
        unordered_set<int> final_set;
        for (int i = 0; i < new_sub_size; i++)
        {
            for (auto it = global_set[i].begin(); it != global_set[i].end(); it++) {
                final_set.insert(*it);
            }
        }
        cout << "The unique elements in the matrix are: ";
        for (auto it = final_set.begin(); it != final_set.end(); it++) {
            cout << *it << " ";
        }
        cout << endl;
        delete[] global_set;
    }
    delete[] local_matrix;
    if (rank == 0) {
        delete[] matrix;
    }
    MPI_Finalize();
    return 0;
}
After compiling and entering the input data, Microsoft Visual Studio 2019 reports
Unhandled exception at 0x0099B514 in ConsoleApplication15.exe: 0xC0000094: Integer division by zero.
on this line:
MPI_Comm_split(MPI_COMM_WORLD, rank / (size / 3), rank, &sub_comm);
How do I fix it?
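The crash comes from the color argument: size / 3 is integer division, so when the program is launched with fewer than three processes it evaluates to 0 and rank / 0 faults. Either run with at least three processes (e.g. mpiexec -n 3 ConsoleApplication15.exe), or compute the color in a way that cannot divide by zero. A minimal sketch of the latter, assuming a split into at most three near-equal groups is acceptable:

// Sketch: (3 * rank) / size yields a color in {0, 1, 2} and never
// divides by zero, since MPI guarantees size >= 1. With fewer than
// three ranks it simply produces fewer groups instead of crashing.
int color = (3 * rank) / size;
MPI_Comm sub_comm;
MPI_Comm_split(MPI_COMM_WORLD, color, rank, &sub_comm);

Note that even with the split fixed, the MPI_Gather of unordered_set objects as raw MPI_BYTE buffers is undefined behavior: an unordered_set is not trivially copyable, so its contents cannot be shipped between processes that way. Serializing each set into a vector<int> and gathering with MPI_Gatherv would be the usual approach.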

Related

Use MPI_Gather to get data from tasks into a master array

I'm learning how to use MPI to calculate the 4-point stencil of a 2D hot plate. My issue is that when writing my results to an output file, the file only contains the results of one of the tasks (I believe it's the task that finishes first). What I want to do is gather the data back to the master after the worker tasks have performed all their iterations, but I am unsure how to do that. I want to use MPI_Gather, or believe I need to use it, but I don't quite understand how from the tutorials.
Alternatively, am I going about returning data from my worker tasks to the master in the wrong way?
Code here:
#define BEGIN 1  /* message tag */
#define LTAG 2   /* message tag */
#define RTAG 3   /* message tag */
#define NONE 0   /* indicates no neighbor */
#define DONE 4   /* message tag */
#define MASTER 0 /* taskid of first process */

#include <iostream>
#include <math.h>
#include <fstream>
#include <string>
#include <cstring>
#include <iomanip>
#include <chrono>
#include <mpi.h>

using namespace std::chrono;
using namespace std;
void update(int start, int end, int ny, double* u1, double* u2)
{
    int ix, iy;
    for (ix = start; ix <= end; ix++)
    {
        for (iy = 1; iy <= ny - 2; iy++)
        {
            // calculate stencil
            *(u2 + ix * ny + iy) = .25 * (
                *(u1 + (ix + 1) * ny + iy) +
                *(u1 + (ix - 1) * ny + iy) +
                *(u1 + ix * ny + (iy + 1)) +
                *(u1 + ix * ny + (iy - 1)));
        }
    }
    // pass into buffer
    for (ix = start; ix <= end; ix++)
        for (iy = 1; iy <= ny - 2; iy++)
            *(u1 + ix * ny + iy) = *(u2 + ix * ny + iy);
}

bool isNumber(char number[])
{
    int i = 0;
    // checking for negative numbers
    if (number[0] == '-')
        return false;
    for (; number[i] != 0; i++)
    {
        //if (number[i] > '9' || number[i] < '0')
        if (!isdigit(number[i]))
            return false;
    }
    return true;
}
int main(int argc, char* argv[])
{
    // variable init
    int N = 0;
    int I = 0;
    int taskid,                      /* this task's unique id */
        numworkers,                  /* number of worker processes */
        numtasks,                    /* number of tasks */
        averow, rows, offset, extra, /* for sending rows of data */
        dest, source,                /* to - from for message send-receive */
        left, right,                 /* neighbor tasks */
        msgtype,                     /* for message types */
        rc, start, end,              /* misc */
        i, ix, iy, iz, it;           /* loop variables */
    MPI_Status status;
    // There should be five arguments, no more no less
    if (argc != 5)
    {
        std::cout << "Invalid parameters, please check your values." << std::endl;
        exit(1);
    }
    if (!isNumber(argv[2]) || !isNumber(argv[4]))
    {
        std::cout << "Invalid parameters, please check your values." << std::endl;
        exit(1);
    }
    // check the n flag in either valid position.
    if (!strcmp(argv[1], "-n"))
    {
        N = atoi(argv[2]);
    }
    else if (!strcmp(argv[3], "-n"))
    {
        N = atoi(argv[4]);
    }
    else
    {
        std::cout << "Invalid flags, please check your values." << std::endl;
        exit(1);
    }
    // check the I flag in either valid position
    if (!strcmp(argv[1], "-I"))
    {
        I = atoi(argv[2]);
    }
    else if (!strcmp(argv[3], "-I"))
    {
        I = atoi(argv[4]);
    }
    else
    {
        std::cout << "Invalid flags, please check your values." << std::endl;
        exit(1);
    }
    if (N == 0 || I == 0)
    {
        std::cout << "Invalid parameters, please check your values." << std::endl;
        exit(1);
    }
    int n_x = N + 2; // Based on the piazza post, increase x and y dimension by two for the edges
    int n_y = N + 2;
    double** phi[2];
    double** resultArray;
    phi[0] = new double* [n_x];
    phi[1] = new double* [n_x];
    resultArray = new double* [n_x];
    for (int i = 0; i < n_x; i++)
    {
        phi[0][i] = new double[n_y];
        phi[1][i] = new double[n_y];
        resultArray[i] = new double[n_y];
    }
    int iterationMax = I;
    // Initialize the plate temperatures
    for (int j = 0; j < n_x; j++)
    {
        for (int i = 0; i < n_x; i++)
        {
            if ((i == 0) || (i == n_x - 1) || (j == 0) || (j == n_y - 1))
            {
                phi[0][i][j] = 20.0;
                phi[1][i][j] = 20.0;
            }
            else
            {
                phi[0][i][j] = 0.0;
                phi[1][i][j] = 0.0;
            }
        }
    }
    for (int j = 0; j < n_x; j++)
    {
        if (j > (int)(0.3 * n_x) && j < (int)(0.7 * n_x))
        {
            phi[0][0][j] = 100.0;
            phi[1][0][j] = 100.0;
        }
    }
    /* First, find out my taskid and how many tasks are running */
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
    MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
    numworkers = numtasks - 1;
    if (taskid == 0) {
        auto start = high_resolution_clock::now();
        /************************* master code *******************************/
        /* Check if numworkers is within range - quit if not */
        //printf("Starting mpi_heat2D with %d worker tasks.\n", numworkers);
        /* Initialize grid */
        //printf("Grid size: X= %d Y= %d Time steps= %d\n", n_x, n_y, I);
        /* Distribute work to workers. Must first figure out how many rows to */
        /* send and what to do with extra rows. */
        averow = n_x / numworkers;
        extra = n_x % numworkers;
        offset = 0;
        for (i = 1; i <= numworkers; i++)
        {
            rows = (i <= extra) ? averow + 1 : averow;
            /* Tell each worker who its neighbors are, since they must exchange */
            /* data with each other. */
            if (i == 1)
                left = 0;
            else
                left = i - 1;
            if (i == numworkers)
                right = 0;
            else
                right = i + 1;
            /* Now send startup information to each worker */
            dest = i;
            MPI_Send(&offset, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);
            MPI_Send(&rows, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);
            MPI_Send(&left, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);
            MPI_Send(&right, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);
            MPI_Send(&phi[0][offset][0], rows * n_y, MPI_DOUBLE, dest, 1,
                MPI_COMM_WORLD);
            /* printf("Sent to task %d: rows= %d offset= %d ", dest, rows, offset);
            printf("left= %d right= %d\n", left, right);*/
            offset = offset + rows;
        }
        /* Now wait for results from all worker tasks */
        for (i = 1; i <= numworkers; i++)
        {
            source = i;
            msgtype = DONE;
            MPI_Recv(&offset, 1, MPI_INT, source, msgtype, MPI_COMM_WORLD,
                &status);
            MPI_Recv(&rows, 1, MPI_INT, source, msgtype, MPI_COMM_WORLD, &status);
            MPI_Recv(&phi[0][offset][0], rows * n_y, MPI_DOUBLE, source,
                msgtype, MPI_COMM_WORLD, &status);
        }
        /* Write final output, call X graph and finalize MPI */
        auto stop = high_resolution_clock::now();
        auto duration = duration_cast<milliseconds>(stop - start);
        std::cout.precision(5);
        // output the time to the console
        std::cout << "Time taken to solve diffusion plate: " << duration.count() << "ms" << std::fixed << std::endl;
    } /* End of master code */
    /************************* workers code **********************************/
    if (taskid != MASTER)
    {
        /* Receive my offset, rows, neighbors and grid partition from master */
        source = MASTER;
        msgtype = BEGIN;
        MPI_Recv(&offset, 1, MPI_INT, source, msgtype, MPI_COMM_WORLD, &status);
        MPI_Recv(&rows, 1, MPI_INT, source, msgtype, MPI_COMM_WORLD, &status);
        MPI_Recv(&left, 1, MPI_INT, source, msgtype, MPI_COMM_WORLD, &status);
        MPI_Recv(&right, 1, MPI_INT, source, msgtype, MPI_COMM_WORLD, &status);
        MPI_Recv(&phi[0][offset][0], rows * n_y, MPI_DOUBLE, source, msgtype,
            MPI_COMM_WORLD, &status);
        /* Determine border elements. Need to consider first and last columns. */
        /* Obviously, row 0 can't exchange with row 0-1. Likewise, the last */
        /* row can't exchange with last+1. */
        start = offset;
        end = offset + rows - 1;
        if (offset == 0)
            start = 1;
        if ((offset + rows) == n_x)
            end--;
        // printf("task=%d start=%d end=%d\n", taskid, start, end);
        /* Begin doing STEPS iterations. Must communicate border rows with */
        /* neighbors. If I have the first or last grid row, then I only need */
        /* to communicate with one neighbor */
        //printf("Task %d received work. Beginning time steps...\n", taskid);
        iz = 0;
        for (it = 1; it <= iterationMax; it++)
        {
            if (left != NONE)
            {
                MPI_Send(&phi[iz][offset][0], n_y, MPI_DOUBLE, left,
                    RTAG, MPI_COMM_WORLD);
                source = left;
                msgtype = LTAG;
                MPI_Recv(&phi[iz][offset - 1][0], n_y, MPI_DOUBLE, source,
                    msgtype, MPI_COMM_WORLD, &status);
            }
            if (right != NONE)
            {
                MPI_Send(&phi[iz][offset + rows - 1][0], n_y, MPI_DOUBLE, right,
                    LTAG, MPI_COMM_WORLD);
                source = right;
                msgtype = RTAG;
                MPI_Recv(&phi[iz][offset + rows][0], n_y, MPI_DOUBLE, source, msgtype,
                    MPI_COMM_WORLD, &status);
            }
            /* Now call update to update the value of grid points */
            update(start, end, n_y, &phi[iz][0][0], &phi[1 - iz][0][0]);
            iz = 1 - iz;
        }
        /* Finally, send my portion of final results back to master */
        MPI_Send(&offset, 1, MPI_INT, MASTER, DONE, MPI_COMM_WORLD);
        MPI_Send(&rows, 1, MPI_INT, MASTER, DONE, MPI_COMM_WORLD);
        MPI_Send(&phi[iz][offset][0], rows * n_y, MPI_DOUBLE, MASTER, DONE,
            MPI_COMM_WORLD);
        //printf("Task %d finished work...\n", taskid);
        MPI_Finalize();
    }
    // This is probably not correct
    /*double gsize;
    int root, myrank;
    double* mybuf;
    MPI_Gather(&mybuf, 1, MPI_DOUBLE, phi, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);*/
    std::ofstream outJob;
    outJob.open("finalTemperatures.csv");
    std::cout.precision(15);
    double* printAddr = &phi[0][0][0];
    for (int j = 0; j < n_y; j++) // file IO
    {
        for (int i = 0; i < n_x; i++)
        {
            outJob << phi[0][j][i] << "," << std::fixed;
        }
        outJob << std::endl;
    }
    outJob.close();
    return 0;
}
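One way to collect the final rows on the master is MPI_Gatherv, which, unlike MPI_Gather, lets each rank contribute a different number of elements. The sketch below is a minimal, standalone illustration of the pattern, not a drop-in patch for the code above: the names local_rows, local_grid, and NY are illustrative stand-ins, and each rank's block is assumed to be one contiguous allocation.

// Hedged sketch (standalone): gather variable-sized row blocks to
// rank 0 with MPI_Gatherv. Names are illustrative, not from the post.
#include <mpi.h>
#include <vector>
#include <iostream>

int main(int argc, char* argv[])
{
    const int NY = 8;                       // row width (illustrative)
    MPI_Init(&argc, &argv);
    int nprocs, me;
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &me);

    int local_rows = me + 1;                // a different row count per rank
    std::vector<double> local_grid(local_rows * NY, (double)me);
    int mycount = local_rows * NY;          // elements this rank sends

    // master learns how much each rank will send
    std::vector<int> counts(nprocs), displs(nprocs);
    MPI_Gather(&mycount, 1, MPI_INT, counts.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);

    std::vector<double> full;               // only sized on the master
    if (me == 0) {
        displs[0] = 0;
        for (int p = 1; p < nprocs; p++)
            displs[p] = displs[p - 1] + counts[p - 1];
        full.resize(displs[nprocs - 1] + counts[nprocs - 1]);
    }
    MPI_Gatherv(local_grid.data(), mycount, MPI_DOUBLE,
                full.data(), counts.data(), displs.data(), MPI_DOUBLE,
                0, MPI_COMM_WORLD);

    if (me == 0)                            // master now holds every block
        std::cout << "gathered " << full.size() << " values\n";
    MPI_Finalize();
    return 0;
}

For this to apply to the hot-plate code, the grid also needs to be one contiguous allocation (a single new double[n_x * n_y] rather than per-row new double[n_y]); sending rows * n_y doubles from &phi[0][offset][0] assumes a contiguity the per-row allocations do not provide. Note also that MPI_Finalize is called inside the worker branch while the master never calls it, and every rank falls through to the file-writing code; only the master should write the file, after MPI_Finalize at the end of main.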

Abort signal for Rat in a maze problem (SIGABRT)

I was trying to solve the rat in a maze problem (https://practice.geeksforgeeks.org/problems/rat-in-a-maze-problem/1), but I am getting SIGABRT at the commented line below. I have been thinking for quite a while about why it is happening. Can you please help? Thank you.
#include <bits/stdc++.h>
using namespace std;

class Solution {
public:
    vector<string> check(int i, int j, vector<vector<int>> &m, string str, vector<string> soln, int n)
    {
        if (i == n - 1 && j == n - 1) { soln.push_back(str); return soln; }
        m[i][j] = 0;
        if (i + 1 <= n - 1 && m[i + 1][j] != 0) { str += "D"; check(i + 1, j, m, str, soln, n); }
        else if (j + 1 <= n - 1 && m[i][j + 1] != 0)
        {
            str += "R";
            check(i, j + 1, m, str, soln, n); // This is causing the SIGABRT when str = "DDRDRR"
        }
        else if (i - 1 >= 0 && m[i - 1][j] != 0) { str += "U"; check(i - 1, j, m, str, soln, n); }
        else if (j - 1 >= 0 && m[i][j - 1] != 0) { str += "L"; check(i, j - 1, m, str, soln, n); }
    }
    vector<string> findPath(vector<vector<int>> &m, int n) {
        vector<string> paths;
        vector<string> soln;
        if (m[0][0] == 0) return paths;
        paths = check(0, 0, m, "", soln, n);
        return paths;
    }
};

int main() {
    int t;
    t = 1;
    while (t--) {
        int n;
        n = 4;
        vector<vector<int>> m{{1, 0, 0, 0}, {1, 1, 0, 1}, {1, 1, 0, 0}, {0, 1, 1, 1}};
        Solution obj;
        vector<string> result = obj.findPath(m, n);
        sort(result.begin(), result.end());
        if (result.size() == 0)
            cout << -1;
        else
            for (int i = 0; i < result.size(); i++) cout << result[i] << " ";
        cout << endl;
    }
    return 0;
}
If none of the conditions match, there is no return statement at all: control flows off the end of check, which is undefined behavior for a function declared to return vector<string>, and that is what aborts here.
To fix it, add return soln; to the end of the check function.
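A minimal sketch of that fix is below. Only the final return is the fix for the SIGABRT; capturing the recursive results (soln = check(...)) is an added assumption about intent, since the original discards each recursive call's return value and would otherwise never propagate the found path.

// Sketch of the fix: every control path now returns a value.
vector<string> check(int i, int j, vector<vector<int>> &m, string str, vector<string> soln, int n)
{
    if (i == n - 1 && j == n - 1) { soln.push_back(str); return soln; }
    m[i][j] = 0;
    if (i + 1 <= n - 1 && m[i + 1][j] != 0)      soln = check(i + 1, j, m, str + "D", soln, n);
    else if (j + 1 <= n - 1 && m[i][j + 1] != 0) soln = check(i, j + 1, m, str + "R", soln, n);
    else if (i - 1 >= 0 && m[i - 1][j] != 0)     soln = check(i - 1, j, m, str + "U", soln, n);
    else if (j - 1 >= 0 && m[i][j - 1] != 0)     soln = check(i, j - 1, m, str + "L", soln, n);
    return soln; // the missing return that triggered the SIGABRT
}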

Why won't my code populate the matrix?

My code tries to find the beginning and ending indices of a section of a matrix whose elements, added together, equal 20. For each instance where this occurs, it should populate a matrix with the beginning and end indices in the format {beginning row index, beginning column index, ending row index, ending column index}, one row per instance. It works fine for one instance, but with more than one instance it stops populating the matrix. Please help.
#include <cstddef> // size_t
#include <iostream>
using namespace std;

// Populates matrix
void filler(int bIndr, int bIndc, int eIndr, int eIndc, size_t** matrix, const size_t kIndices_size2, const size_t kIndices_size) {
    int t = 0;
    int matrix2[4] = {0, 0, 0, 0};
    for (int i = 0; i < kIndices_size2; i++) {
        for (int j = 0; j < 2; j++) {
            for (int ii = t; ii < kIndices_size; ii++) {
                if (j == 0) {
                    matrix2[ii] = bIndr;
                    matrix2[ii + 1] = bIndc;
                    cout << matrix2[ii + 1] << endl;
                    break;
                }
                if (j == 1) {
                    matrix2[ii] = eIndr;
                    matrix2[ii + 1] = eIndc;
                    cout << matrix2[ii + 1] << endl;
                    break;
                }
            }
            t = 2;
        }
    }
    for (int i = 0; i < kIndices_size; i++) {
        matrix[kIndices_size2 - 1][i] = matrix2[i];
    }
}

int main()
{
    int goal = 20;
    int array[2][8] = {{10,0,0,10,0,0,1,0},{0,0,10,0,0,0,10,0}};
    int inst = 0;
    int t = 0;
    int bIndr = 0;
    int bIndc = 0;
    int eIndr = 0;
    int eIndc = 0;
    const size_t kIndices_size = 4;
    size_t** matrix;
    for (int ii = 0; ii < 2; ii++) {
        bIndc = 0;
        for (int j = bIndc; j < 8; j++) {
            t = 0;
            bIndr = ii;
            bIndc = j;
            for (int i = j; i < 8; i++) {
                t += array[ii][i];
                if ((goal - t) == 0) {
                    inst++;
                    eIndc = i;
                    eIndr = ii;
                    matrix = new size_t*[inst];
                    matrix[inst - 1] = new size_t[kIndices_size];
                    cout << bIndr << bIndc << eIndr << eIndc << endl;
                    filler(bIndr, bIndc, eIndr, eIndc, matrix, inst, kIndices_size);
                    break;
                }
            }
        }
    }
    size_t actual_size = static_cast<size_t>(-1);
    cout << actual_size << endl;
    size_t* sums_found = &actual_size;
    *sums_found = inst;
    cout << actual_size << endl;
    cout << matrix[0][0] << endl;
    for (int i = 0; i < inst; i++) {
        for (int ii = 0; ii < kIndices_size; ii++) {
            cout << matrix[i][ii] << " ";
        }
        cout << endl;
    }
    return 0;
}
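A likely cause, judging from the allocation inside the search loop: matrix = new size_t*[inst] runs again for every instance found, so each hit replaces the entire array of row pointers and only the newest row (matrix[inst - 1]) points at allocated memory; the rows recorded earlier are leaked, and reading them at the end is undefined behavior. A minimal sketch of the usual fix, using a std::vector so the container grows without discarding earlier rows (names here are illustrative):

// Hedged sketch: grow a vector of index records instead of
// re-newing the pointer array on every hit.
#include <array>
#include <iostream>
#include <vector>

int main() {
    const int goal = 20;
    int array2d[2][8] = {{10,0,0,10,0,0,1,0}, {0,0,10,0,0,0,10,0}};
    // each record: {begin row, begin col, end row, end col}
    std::vector<std::array<int, 4>> hits;
    for (int r = 0; r < 2; r++) {
        for (int c0 = 0; c0 < 8; c0++) {
            int sum = 0;
            for (int c1 = c0; c1 < 8; c1++) {
                sum += array2d[r][c1];
                if (sum == goal) {               // found an instance
                    hits.push_back({r, c0, r, c1});
                    break;
                }
            }
        }
    }
    for (const auto& h : hits)                   // one row per instance
        std::cout << h[0] << " " << h[1] << " " << h[2] << " " << h[3] << "\n";
    return 0;
}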

Access a matrix as its transpose in tiled matrix multiplication in CUDA

I'm currently experimenting with CUDA and I came across this kernel from an answer for matrix multiplication: https://stackoverflow.com/a/18856054/7867026
Instead of computing A*B, I want to compute A_Transpose*A, but without storing A_Transpose (only matrix A as an input to the kernel). I have to set the indices properly, but I'm confused by this matrix representation. Any help would be appreciated.
Most of what you need is here and here.
The first link points out that AxAT involves taking inner products of rows of matrix A, and similarly ATxA involves taking inner products of columns of matrix A. Also note the symmetry statement. In the second link (scroll down a bit from that point in the programming guide) you will find a complete tiled matrix multiply. You just need to index into both tiles by column.
Here is a worked example, using the code from the SO answer you linked:
$ cat t1654.cu
#include <iostream>
#include <cstdio>
#include <cstdlib>

const int TILE_DIM = 32;

template <typename T>
__global__ void ATA(const T * __restrict__ A, T * __restrict__ C, int ARows, int ACols)
{
    T CValue = 0;
    int Row = blockIdx.y*TILE_DIM + threadIdx.y;
    int Col = blockIdx.x*TILE_DIM + threadIdx.x;
    __shared__ T As[TILE_DIM][TILE_DIM];
    __shared__ T Bs[TILE_DIM][TILE_DIM];
    for (int k = 0; k < (TILE_DIM + ARows - 1)/TILE_DIM; k++) {
        if (k*TILE_DIM + threadIdx.y < ARows && blockIdx.y*blockDim.y + threadIdx.x < ACols)
            As[threadIdx.y][threadIdx.x] = A[(k*TILE_DIM + threadIdx.y)*ACols + blockIdx.y*blockDim.y + threadIdx.x];
        else
            As[threadIdx.y][threadIdx.x] = 0.0;
        if (k*TILE_DIM + threadIdx.y < ARows && Col < ACols)
            Bs[threadIdx.y][threadIdx.x] = A[(k*TILE_DIM + threadIdx.y)*ACols + Col];
        else
            Bs[threadIdx.y][threadIdx.x] = 0.0;
        __syncthreads();
        for (int n = 0; n < TILE_DIM; ++n)
            CValue += As[n][threadIdx.y] * Bs[n][threadIdx.x];
        __syncthreads();
    }
    if (Row < ACols && Col < ACols)
        C[((blockIdx.y * blockDim.y + threadIdx.y)*ACols) +
          (blockIdx.x * blockDim.x) + threadIdx.x] = CValue;
}

template <typename T>
__global__ void transpose_naive(const T * __restrict__ in, T * __restrict__ out, const int dim){
    int col = threadIdx.x + blockDim.x*blockIdx.x;
    int row = threadIdx.y + blockDim.y*blockIdx.y;
    if ((col < dim) && (row < dim)) out[col*dim+row] = in[row*dim+col];
}

template <typename T>
__global__ void mm_naive(const T * __restrict__ A, const T * __restrict__ B, T * __restrict__ C, const int rowA, const int colA, const int colB){
    int col = threadIdx.x + blockDim.x*blockIdx.x;
    int row = threadIdx.y + blockDim.y*blockIdx.y;
    if ((row < rowA) && (col < colB)){
        T Cval = 0;
        for (int i = 0; i < colA; i++) Cval += A[row*colA+i]*B[i*colB+col];
        C[row*colB+col] = Cval;}
}

typedef float mt;

int main(){
    mt *d_A, *d_B, *d_C, *h_A, *h_C, *h_C1;
    int m = 64;
    int n = 64;
    h_A = new mt[m*n];
    h_C = new mt[n*n];
    h_C1 = new mt[n*n];
    cudaMalloc(&d_A, m*n*sizeof(d_A[0]));
    cudaMalloc(&d_B, m*n*sizeof(d_A[0]));
    cudaMalloc(&d_C, n*n*sizeof(d_C[0]));
    // test 1
    for (int i = 0; i < m; i++)
        for (int j = 0; j < n; j++)
            h_A[i*n+j] = (i==j)?1.0f:0.0f;
    cudaMemcpy(d_A, h_A, m*n*sizeof(d_A[0]), cudaMemcpyHostToDevice);
    dim3 block(TILE_DIM, TILE_DIM);
    dim3 grid((n+block.x-1)/block.x, (n+block.y-1)/block.y);
    ATA<<<grid,block>>>(d_A, d_C, m, n);
    cudaMemcpy(h_C, d_C, n*n*sizeof(d_C[0]), cudaMemcpyDeviceToHost);
#ifdef DEBUG
    for (int i = 0; i < n; i++){
        for (int j = 0; j < n; j++)
            std::cout << h_C[i*n+j] << " ";
        std::cout << std::endl;}
    std::cout << std::endl;
#endif
    // test 2
    for (int i = 0; i < m; i++)
        for (int j = 0; j < n; j++)
            h_A[i*n+j] = rand()%10;
    cudaMemcpy(d_A, h_A, m*n*sizeof(d_A[0]), cudaMemcpyHostToDevice);
    ATA<<<grid,block>>>(d_A, d_C, m, n);
    cudaMemcpy(h_C, d_C, n*n*sizeof(d_C[0]), cudaMemcpyDeviceToHost);
#ifdef DEBUG
    for (int i = 0; i < n; i++){
        for (int j = 0; j < n; j++)
            std::cout << h_C[i*n+j] << " ";
        std::cout << std::endl;}
    std::cout << std::endl;
#endif
    transpose_naive<<<grid,block>>>(d_A, d_B, n);
    mm_naive<<<grid,block>>>(d_B, d_A, d_C, n, n, n);
    cudaMemcpy(h_C1, d_C, n*n*sizeof(d_C[0]), cudaMemcpyDeviceToHost);
#ifdef DEBUG
    for (int i = 0; i < n; i++){
        for (int j = 0; j < n; j++)
            std::cout << h_C1[i*n+j] << " ";
        std::cout << std::endl;}
    std::cout << std::endl;
#endif
    for (int i = 0; i < n*n; i++) if (h_C[i] != h_C1[i]) {std::cout << "mismatch at: " << i << " was: " << h_C[i] << " should be: " << h_C1[i] << std::endl; return 0;}
}
$ nvcc -o t1654 t1654.cu
$ cuda-memcheck ./t1654
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
Note that loading the Bs tile is identical in both cases. The main changes are in loading the As tile; also note the indexing change when computing CValue. These changes are necessary to index into both tiles by column.
There may still be bugs. I have not tested the non-square case, nor the case where the matrix size is not a multiple of the block size. Furthermore, I've taken no advantage of the symmetry in the output. However, this should help with the indexing.
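On that last point, here is a sketch of how the symmetry could be exploited. Since (ATA)^T = ATA, each output element equals its mirror, so a kernel can skip the tile blocks below the diagonal and write each computed value to both mirrored positions. This is an untested variation on the ATA kernel above, not part of the original answer:

// Hedged sketch: exploit C = C^T by computing only tiles with
// blockIdx.x >= blockIdx.y and mirroring each result. Threads in
// diagonal blocks may write some elements twice, with the same value.
template <typename T>
__global__ void ATA_sym(const T * __restrict__ A, T * __restrict__ C, int ARows, int ACols)
{
    if (blockIdx.x < blockIdx.y) return;   // lower-triangle blocks are mirrored instead
    T CValue = 0;
    int Row = blockIdx.y*TILE_DIM + threadIdx.y;
    int Col = blockIdx.x*TILE_DIM + threadIdx.x;
    __shared__ T As[TILE_DIM][TILE_DIM];
    __shared__ T Bs[TILE_DIM][TILE_DIM];
    for (int k = 0; k < (TILE_DIM + ARows - 1)/TILE_DIM; k++) {
        // tile loads identical to the ATA kernel above
        if (k*TILE_DIM + threadIdx.y < ARows && blockIdx.y*blockDim.y + threadIdx.x < ACols)
            As[threadIdx.y][threadIdx.x] = A[(k*TILE_DIM + threadIdx.y)*ACols + blockIdx.y*blockDim.y + threadIdx.x];
        else
            As[threadIdx.y][threadIdx.x] = 0.0;
        if (k*TILE_DIM + threadIdx.y < ARows && Col < ACols)
            Bs[threadIdx.y][threadIdx.x] = A[(k*TILE_DIM + threadIdx.y)*ACols + Col];
        else
            Bs[threadIdx.y][threadIdx.x] = 0.0;
        __syncthreads();
        for (int n = 0; n < TILE_DIM; ++n)
            CValue += As[n][threadIdx.y] * Bs[n][threadIdx.x];
        __syncthreads();
    }
    if (Row < ACols && Col < ACols) {
        C[Row*ACols + Col] = CValue;       // upper-triangle element
        C[Col*ACols + Row] = CValue;       // mirrored element (equal by symmetry)
    }
}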

Error when using multiple MPI_Bcast calls

I have a problem with three MPI_Bcast calls and one MPI_Scatter; my program doesn't work well: MPI_Scatter does nothing, and globalparcsr is not scattered among the nodes. When I delete the second and third MPI_Bcast, MPI_Scatter works well. I want to broadcast a, globalindividual, and globalfitness, and then scatter globalparcsr. Part of my code is below:
int malloc2dint(int ***array, int n, int m) {
    /* allocate the n*m contiguous items */
    int *p = (int *)malloc(n*m * sizeof(int));
    if (!p) return -1;
    /* allocate the row pointers into the memory */
    (*array) = (int **)malloc(n * sizeof(int*));
    if (!(*array)) {
        free(p);
        return -1;
    }
    /* set up the pointers into the contiguous memory */
    for (int i = 0; i < n; i++)
        (*array)[i] = &(p[i*m]);
    return 0;
}

int main(int argc, char *argv[]) {
    int size, rank, divided_pop_size, sum = 0, root = 0, procgridsize, sum2 = 0, generation = 0;
    int **globalindividual, **localindividual;
    int *globalfitness, *localfitness;
    int *globalparcsr, *localparcsr;
    int **recbuf;
    int *sendcounts, *parsendcount; // specifying the number of elements to send to each processor
    int *displs, *pardispls;        // entry i specifies the displacement
    MPI_Status status;
    int offset, rows;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    divided_pop_size = n_initial_pop / size;
    if (rank == root)
    {
        malloc2dint(&globalindividual, n_initial_pop, num_vertices);
        read_graph();
        globalfitness = (int*)malloc(n_initial_pop * sizeof(int));
        globalparcsr = (int*)malloc(n_initial_pop * sizeof(int));
        globalindividual = initial_population(globalindividual, n_initial_pop);
        for (int i = 0; i < n_initial_pop; i++) {
            printf("\n");
            for (int j = 0; j < num_vertices; j++)
                printf("%d", globalindividual[i][j]);
        }
    }
    for (int p = 0; p < size; p++) {
        if (rank == p) {
            malloc2dint(&localindividual, n_initial_pop + 2, num_vertices);
            localindividual = initial_population(localindividual, divided_pop_size + 2);
        }
    }
    MPI_Bcast(&a[0][0], 5000 * 5000, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Bcast(&globalindividual[0][0], n_initial_pop*num_vertices, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&globalfitness, n_initial_pop, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Finalize();
    return 0;
}
a is a 2D array; globalindividual is a 2D array with 12 rows and 8 columns; globalfitness is a 1D array of size 12.
Please help me.
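Two problems are visible in the snippet shown. First, MPI_Bcast is collective: every rank must pass a valid buffer, but here only the root ever allocates globalindividual and globalfitness, so on all other ranks &globalindividual[0][0] dereferences an uninitialized pointer. Second, MPI_Bcast(&globalfitness, ...) passes the address of the pointer variable rather than the buffer it points to; it should be MPI_Bcast(globalfitness, ...). A minimal sketch of the corrected pattern, reusing malloc2dint and the names from the post:

// Sketch: every rank must own valid buffers before the collective call.
if (rank != root) {
    malloc2dint(&globalindividual, n_initial_pop, num_vertices); // contiguous, like the root's
    globalfitness = (int*)malloc(n_initial_pop * sizeof(int));
}
MPI_Bcast(&globalindividual[0][0], n_initial_pop * num_vertices, MPI_INT, root, MPI_COMM_WORLD);
// pass the buffer itself, not &globalfitness (the address of the pointer)
MPI_Bcast(globalfitness, n_initial_pop, MPI_INT, root, MPI_COMM_WORLD);

The same requirement applies to a: it must be allocated, and contiguous, on every rank before the first MPI_Bcast.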
