I am using MPI_Reduce and MPI_Scatter function to scatter an array of integers in "N" processors and printing the partial and accumulate sum of the array. I am using Microsoft MPI (MSMPI) on visual studio 2010. but each time at execution it gives an exception " _CrtlsValidHeapPointer(PuserData)" along the title "Debug Assertion Failed" The code is as under
enter code here
#include <mpi.h>
#include<iostream>
using namespace std;
int main(int argc, char *argv[]) {
int size;
int rank;
int partialsum=0;
int root =0;
int accum=0;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
int *globaldata = NULL;
int *localdata = new int(4);
if (rank == root) {
globaldata = new int(size*4);
for (int i=0; i<(size*4); i++)
globaldata[i] = 2*i+1;
cout<<"Processor"<<rank<<" has global data: ";
for (int i=0; i<(size*4); i++)
cout<<globaldata[i]<<" ";
cout<<"\n";
}
MPI_Scatter(globaldata, 4, MPI_INT, localdata, 4, MPI_INT, root, MPI_COMM_WORLD);
cout<<"Processor "<<rank<<"has local data";
for(int i=0; i<4;i++)
cout<<" "<<localdata[i];
cout<<endl;
for(int k=0;k<4;k++)
partialsum += localdata[k];
cout<<"Processor "<<rank<<" Partial Sum = "<<partialsum<<"\n";
MPI_Reduce(&partialsum,&accum,1,MPI_INT,MPI_SUM, root,MPI_COMM_WORLD);
if (rank == 0) {
cout<<"Processor "<<rank<<" Accumulated Sum = "<<accum;
}
MPI_Finalize();
return 0;
}
The error is very simple and lies here:
globaldata = new int(size*4);
The syntax to allocate dynamic arrays with the new operator is new type[size]:
globaldata = new int[size*4];
In your case a space for a single int is allocated and set to size*4 instead and the initialisation code that immediately follows the allocation of memory at the root overwrites past the end of the allocated memory, thus destroying the heap structure.
Related
I have a dynamically allocated array that is sent by rank 0 to other ranks using MPI_Send()
On the receiving side, a dynamic array is allocated memory using malloc()
MPI_Recv() happens on the other ranks. At this receive function, I get invalid Buffer Pointer error.
Code is conceptually similar to this:
struct graph{
int count;
int * array;
} a_graph;
int x = 10;
MPI_Status status;
//ONLY 2 RANKS ARE PRESENT. RANK 0 SENDS MSG TO RANK 1
if (rank == 0){
a_graph * my_graph = malloc(sizeof(my_graph))
my_graph->count = x;
my_graph->array = malloc(sizeof(int)*my_graph->count);
for(int i =0; i < my_graph->count; i++)
my_graph->array[i] = i;
MPI_Send(my_graph->array,my_graph->count,int,1,0,MPI_COMM_WORLD);
free(my_graph->array);
free(my_graph);
}
else if (rank == 1){
a_graph * my_graph = malloc(sizeof(my_graph))
my_graph->count = x;
my_graph->array = malloc(sizeof(int)*my_graph->count);
MPI_Recv(my_graph->array,my_graph->count,int,0,0,MPI_COMM_WORLD,&status) // MPI INVALID BUFFER POINTER ERROR HAPPENS AT THIS RECV
}
I dont understand why this happens since memory is allocated in both sender and receiver ranks
Below is a minimal, working, and verifiable (MWVE) example which Zulan suggested you to make. Please provide MWVE in your future questions. Anyway, you need to use MPI datatype MPI_INT instead of int for sending and receiving.
#include <mpi.h>
#include <stdlib.h>
#include <stdio.h>
typedef struct graph{
int count;
int * array;
} a_graph;
int main()
{
MPI_Init(NULL, NULL);
int rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
int x = 10;
MPI_Status status;
//ONLY 2 RANKS ARE PRESENT. RANK 0 SENDS MSG TO RANK 1
if (rank == 0){
a_graph * my_graph = malloc(sizeof(a_graph));
my_graph->count = x;
my_graph->array = malloc(sizeof(int)*my_graph->count);
for(int i =0; i < my_graph->count; i++)
my_graph->array[i] = i;
MPI_Send(my_graph->array,my_graph->count,MPI_INT,1,0,MPI_COMM_WORLD);
free(my_graph->array);
free(my_graph);
}
else if (rank == 1){
a_graph * my_graph = malloc(sizeof(a_graph));
my_graph->count = x;
my_graph->array = malloc(sizeof(int)*my_graph->count);
MPI_Recv(my_graph->array,my_graph->count,MPI_INT,0,0,MPI_COMM_WORLD,&status);
for (int i=0; i<my_graph->count; ++i)
{
printf("%i\n", my_graph->array[i]);
}
}
MPI_Finalize();
return 0;
}
I have 4 filepaths in the global_filetable and I am trying to scatter 2 pilepaths to each process.
The process 0 have proper 2 paths, but there is something strange in the process 1 (null)...
EDIT:
Here's the full code:
#include <stdio.h>
#include <limits.h> // PATH_MAX
#include <mpi.h>
int main(int argc, char *argv[])
{
char** global_filetable = (char**)malloc(4 * PATH_MAX * sizeof(char));
for(int i = 0; i < 4; ++i) {
global_filetable[i] = (char*)malloc(PATH_MAX *sizeof(char));
strncpy (filetable[i], "/path/", PATH_MAX);
}
/*for(int i = 0; i < 4; ++i) {
printf("%s\n", global_filetable[i]);
}*/
int rank, size;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
char** local_filetable = (char**)malloc(2 * PATH_MAX * sizeof(char));
MPI_Scatter(global_filetable, 2*PATH_MAX, MPI_CHAR, local_filetable, 2*PATH_MAX , MPI_CHAR, 0, MPI_COMM_WORLD);
{
/* now all processors print their local data: */
for (int p = 0; p < size; ++p) {
if (rank == p) {
printf("Local process on rank %d is:\n", rank);
for (int i = 0; i < 2; i++) {
printf("path: %s\n", local_filetable[i]);
}
}
MPI_Barrier(MPI_COMM_WORLD);
}
}
MPI_Finalize();
return 0;
}
Output:
Local process on rank 0 is:
path: /path/
path: /path/
Local process on rank 1 is:
path: (null)
path: (null)
Do you have any idea why I am having those nulls?
First, your allocation is inconsistent:
char** local_filetable = (char**)malloc(2 * PATH_MAX * sizeof(char));
The type char** indicates an array of char*, but you allocate a contiguous memory block, which would indicate a char*.
The easiest way would be to use the contiguous memory as char* for both global and local filetables. Depending on what get_filetable() actually does, you may have to convert. You can then index it like this:
char* entry = &filetable[i * PATH_MAX]
You can then simply scatter like this:
MPI_Scatter(global_filetable, 2 * PATH_MAX, MPI_CHAR,
local_filetable, 2 * PATH_MAX, MPI_CHAR, 0, MPI_COMM_WORLD);
Note that there is no more displacement, every rank just gets an equal sized chunk of the contiguous memory.
The next step would be to define a C and MPI struct encapsulating PATH_MAX characters so you can get rid of the constant usage of PATH_MAX and crude indexing.
I think this is much nicer (less complex, less memory management) than using actual char**. You would only need that if memory waste or redundant data transfer becomes an issue.
P.S. Make sure to never put in more than PATH_MAX - 1 characters in an filetable entry to keep space for the tailing \0.
Okay, I'm stupid.
char global_filetable[NUMBER_OF_STRINGS][PATH_MAX];
for(int i = 0; i < 4; ++i) {
strcpy (filetable[i], "/path/");
}
char local_filetable[2][PATH_MAX];
Now it works!
I have implemented a 2d array Mpi scatter which works well. I mean that the master processor can scatter 2d parts of the initial big array. The problem is when I use as input the 2d image file dynamically allocated it doesn't work. I suppose that there must be something wrong with the memory. Is there any way of obtaining 2d parts of a big 2d array dynamically.
I had a similar problem, but it was one-dimensional vector with dynamically allocated.
Solved my problem as follows:
#include <stdio.h>
#include "mpi.h"
main(int argc, char** argv) {
/* .......Variables Initialisation ......*/
int Numprocs, MyRank, Root = 0;
int index;
int *InputBuffer, *RecvBuffer;
int Scatter_DataSize;
int DataSize;
MPI_Status status;
/* ........MPI Initialisation .......*/
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &MyRank);
MPI_Comm_size(MPI_COMM_WORLD, &Numprocs);
if (MyRank == Root) {
DataSize = 80000;
/* ...Allocate memory.....*/
InputBuffer = (int*) malloc(DataSize * sizeof(int));
for (index = 0; index < DataSize; index++)
InputBuffer[index] = index;
}
MPI_Bcast(&DataSize, 1, MPI_INT, Root, MPI_COMM_WORLD);
if (DataSize % Numprocs != 0) {
if (MyRank == Root)
printf("Input is not evenly divisible by Number of Processes\n");
MPI_Finalize();
exit(-1);
}
Scatter_DataSize = DataSize / Numprocs;
RecvBuffer = (int *) malloc(Scatter_DataSize * sizeof(int));
MPI_Scatter(InputBuffer, Scatter_DataSize, MPI_INT, RecvBuffer,
Scatter_DataSize, MPI_INT, Root, MPI_COMM_WORLD);
for (index = 0; index < Scatter_DataSize; ++index)
printf("MyRank = %d, RecvBuffer[%d] = %d \n", MyRank, index,
RecvBuffer[index]);
MPI_Finalize();
}
This link has examples that have helped me:
http://www.cse.iitd.ernet.in/~dheerajb/MPI/Document/hos_cont.html
Hope this helps.
Using visual studios 2010. Win 7. Nsight 2.1
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// incrementArray.cu
#include <stdio.h>
#include <assert.h>
void incrementArrayOnHost(float *a, int N)
{
int i;
for (i=0; i < N; i++) a[i] = a[i]+1.f;
}
__global__ void incrementArrayOnDevice(float *a, int N)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
int j = idx;
int i = 2;
i = i+j; //->breakpoint here
if (idx<N) a[idx] = a[idx]+1.f; //->breakpoint here
}
int main(void)
{
float *a_h, *b_h; // pointers to host memory
float *a_d; // pointer to device memory
int i, N = 10;
size_t size = N*sizeof(float);
// allocate arrays on host
a_h = (float *)malloc(size);
b_h = (float *)malloc(size);
// allocate array on device
cudaMalloc((void **) &a_d, size);
// initialization of host data
for (i=0; i<N; i++) a_h[i] = (float)i;
// copy data from host to device
cudaMemcpy(a_d, a_h, sizeof(float)*N, cudaMemcpyHostToDevice);
// do calculation on host
incrementArrayOnHost(a_h, N);
// do calculation on device:
// Part 1 of 2. Compute execution configuration
int blockSize = 4;
int nBlocks = N/blockSize + (N%blockSize == 0?0:1);
// Part 2 of 2. Call incrementArrayOnDevice kernel
incrementArrayOnDevice <<< nBlocks, blockSize >>> (a_d, N);
// Retrieve result from device and store in b_h
cudaMemcpy(b_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
// check results
for (i=0; i<N; i++) assert(a_h[i] == b_h[i]);
// cleanup
free(a_h); free(b_h); cudaFree(a_d);
return 0;
}
I've tried inserting breakpoints as listed above inside my global void incrementArrayOnDevice(float *a, int N) but they're not hitting.
When I run debug (f5) in visual studios, I tried to step into incrementArrayOnDevice <<< nBlocks, blockSize >>> (a_d, N); but they would skip the entire kernel code section.
tried to add a watch on the variables i and j but there was an error "CXX0017: Error: symbol "i" not found."
Is this issue normal? Can someone please try on their pc and let me know if they can hit the breakpoints? If you can, what possible problem could mine be? Please help! :(
Nsight debugging is different from VS debugging . You need to use Nsight debugging to hit the kernel breakpoints. However, for this you need 2 GPU cards. Do you have 2 cards in the first place? Please check
You can debug on a single GPU but on the following conditions:
You have to be using 5.0 toolkit
You have to be programming on a GPU that suports 303.xx NForceWare or higher
I coded a mpi matrix multification program, which use scanf("%d", &size), designate matrix size, then I defined int matrix[size*size], but when I complied it, it reported that matrix is undeclared. Please tell me why, or what my problem is!
According Ed's suggestion, I changed the matrix definition to if(myid == 0) block, but got the same err! Now I post my code, please help me find out where I made mistakes! thank you!
int size;
int main(int argc, char* argv[]) {
int myid, numprocs;
int *p;
MPI_Status status;
int i,j,k;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD,&myid);
MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
if(myid == 0)
{
scanf("%d", &size);
int matrix1[size*size];
int matrix2[size*size];
int matrix3[size*size];
int section = size/numprocs;
int tail = size % numprocs;
srand((unsigned)time(NULL));
for( i=0; i<size; i++)
for( j=0; j<size; j++)
{
matrix1[i*size+j]=rand()%9;
matrix3[i*size+j]= 0;
matrix2[i*size+j]=rand()%9;
}
printf("Matrix1 is: \n");
for( i=0; i<size; i++)
{
for( j=0; j<size; j++)
{
printf("%3d", matrix1[i*size+j]);
}
printf("\n");
}
printf("\n");
printf("Matrix2 is: \n");
Reformatted code would be nice...
One problem is that you haven't declared the size variable. Another problem is that the [size] notation for declaring arrays is only good for sizes that are known at compile time. You want to use malloc() instead.
You don't actually need to define a MAX_SIZE if you use dynamic memory allocation.
#include <stdio.h>
#include <stdlib.h>
...
scanf("%d", &size);
int *matrix1 = (int *) malloc(size*size*sizeof(int));
int *matrix2 = (int *) malloc(size*size*sizeof(int));
int *matrix3 = (int *) malloc(size*size*sizeof(int));
...