MPI_Recv() invalid buffer pointer - parallel-processing

I have a dynamically allocated array that rank 0 sends to the other ranks using MPI_Send().
On the receiving side, a dynamic array is allocated with malloc() and MPI_Recv() is called on the other ranks. At this receive call I get an invalid buffer pointer error.
The code is conceptually similar to this:
struct graph{
    int count;
    int * array;
} a_graph;

int x = 10;
MPI_Status status;

//ONLY 2 RANKS ARE PRESENT. RANK 0 SENDS MSG TO RANK 1
if (rank == 0){
    a_graph * my_graph = malloc(sizeof(my_graph))
    my_graph->count = x;
    my_graph->array = malloc(sizeof(int)*my_graph->count);
    for(int i = 0; i < my_graph->count; i++)
        my_graph->array[i] = i;
    MPI_Send(my_graph->array,my_graph->count,int,1,0,MPI_COMM_WORLD);
    free(my_graph->array);
    free(my_graph);
}
else if (rank == 1){
    a_graph * my_graph = malloc(sizeof(my_graph))
    my_graph->count = x;
    my_graph->array = malloc(sizeof(int)*my_graph->count);
    MPI_Recv(my_graph->array,my_graph->count,int,0,0,MPI_COMM_WORLD,&status) // MPI INVALID BUFFER POINTER ERROR HAPPENS AT THIS RECV
}
I don't understand why this happens, since memory is allocated in both the sender and receiver ranks.

Below is the minimal, working, and verifiable example (MWVE) that Zulan suggested you make; please provide an MWVE in future questions. The actual fix: you need to use the MPI datatype MPI_INT instead of int for sending and receiving.
#include <mpi.h>
#include <stdlib.h>
#include <stdio.h>

typedef struct graph{
    int count;
    int * array;
} a_graph;

int main()
{
    MPI_Init(NULL, NULL);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int x = 10;
    MPI_Status status;

    //ONLY 2 RANKS ARE PRESENT. RANK 0 SENDS MSG TO RANK 1
    if (rank == 0){
        a_graph * my_graph = malloc(sizeof(a_graph));
        my_graph->count = x;
        my_graph->array = malloc(sizeof(int)*my_graph->count);
        for(int i = 0; i < my_graph->count; i++)
            my_graph->array[i] = i;
        MPI_Send(my_graph->array,my_graph->count,MPI_INT,1,0,MPI_COMM_WORLD);
        free(my_graph->array);
        free(my_graph);
    }
    else if (rank == 1){
        a_graph * my_graph = malloc(sizeof(a_graph));
        my_graph->count = x;
        my_graph->array = malloc(sizeof(int)*my_graph->count);
        MPI_Recv(my_graph->array,my_graph->count,MPI_INT,0,0,MPI_COMM_WORLD,&status);
        for (int i = 0; i < my_graph->count; ++i)
        {
            printf("%i\n", my_graph->array[i]);
        }
    }

    MPI_Finalize();
    return 0;
}
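One caveat the answer does not spell out: the example silently assumes it is launched with at least two ranks (for example, mpirun -np 2). A minimal guard, as a sketch of my own (not part of the original answer), could be placed right after the MPI_Comm_rank call in the program above:

    // Hypothetical guard, not in the original answer: abort early if fewer
    // than two ranks were launched, since rank 1 does not exist otherwise.
    int world_size;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    if (world_size < 2) {
        if (rank == 0)
            fprintf(stderr, "This example needs at least 2 ranks\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }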

Related

Load balancing MPI multithreading for variable-complexity tasks or variable-speed nodes?

I've written an MPI code that currently parallelizes by sending an equal number of elements from each array to a different process to do work (thus, for 6 workers, the array is broken into 6 equal parts). What I would like to do is send small chunks only if a worker is ready to receive, and receive completed chunks without blocking future sends; that way, if one chunk takes 10 seconds but the other chunks take 1 second, other data can be processed while waiting for the long chunk to complete.
Here's some skeleton code I've put together:
#include <mpi.h>
#include <cstdio>
#include <iostream>
#include <vector>
#include <cmath>

struct crazytaxi
{
    double a = 10.0;
    double b = 25.2;
    double c = 222.222;
};

int main(int argc, char** argv)
{
    //Initial and temp kanno vectors
    std::vector<crazytaxi> kanno;
    std::vector<crazytaxi> kanno_tmp;

    //init MPI
    MPI_Init(NULL,NULL);

    //allocate vector
    int SZ = 4200;
    kanno.resize(SZ);

    int world_size;
    MPI_Comm_size(MPI_COMM_WORLD,&world_size);
    int world_rank;
    MPI_Comm_rank(MPI_COMM_WORLD,&world_rank);

    if (world_rank == 0)
    {
        for (int i = 0; i < SZ; i++) {
            kanno[i].a = 1.0*i;
            kanno[i].b = 10.0/(i+1);
        }
    }

    for (int j = 0; j < 10; j++) {
        //Make sure all processes have same kanno vector;
        if (world_rank == 0) {
            for (int i = 1; i < world_size; i++)
                MPI_Send(&kanno[0],sizeof(crazytaxi)*kanno.size(),MPI_BYTE,i,3,MPI_COMM_WORLD);
        } else {
            MPI_Recv(&kanno[0],sizeof(crazytaxi)*kanno.size(),MPI_BYTE,0,3,MPI_COMM_WORLD,MPI_STATUS_IGNORE);
        }
        //copy to tmp vector
        kanno_tmp = kanno;
        MPI_Barrier(MPI_COMM_WORLD);

        //the sender
        if (world_rank == 0) {
            unsigned p1 = 0;
            unsigned segment = 10;
            unsigned p2 = segment;
            while (p1 < SZ) {
                for (int i = 0; i < world_size; i++) {
                    //if (process #i is ready to receive)
                    //    Send data in chunks of 10 to i
                    //else
                    //    continue
                }
            }
        }
        if (world_rank != 0) {
            //Receive data to be processed
            unsigned p1 = 0, p2 = 0; //bounds of the received chunk (would come with the message)
            //do some math
            for (unsigned i = p1; i < p2; i++)
                kanno_tmp[i].a = std::sqrt(kanno[i].a)/((double)i+1.0);
            //Send processed data to 0 and wait to receive new data.
        }
        //copy temp vector to kanno
        kanno = kanno_tmp;
    }

    //print some of the results;
    if (world_rank == 0)
    {
        for (int i = 0; i < SZ; i += 40)
            printf("Line %d: %lg,%lg\n",i,kanno[i].a,kanno[i].b);
    }
    MPI_Finalize();
}
I can 90% turn this into what I want, except that my MPI_Send and MPI_Recv calls will block, or the 'master' process won't know that the 'slave' processes are ready to receive data.
Is there a way in MPI to do something like
unsigned Datapointer = [some_array_index];
while (Datapointer < array_size) {
    if (world_rank == 0) {
        for (int i = 1; i < world_size; i++)
        {
            if (<process i is ready to receive>) {
                MPI_Send([...]);
                Datapointer += 10;
            }
            if (<process i has sent data>)
                MPI_Recv([...]);
            if (Datapointer > array_size) {
                MPI_Bcast([killswitch]);
                break;
            }
        }
    }
}
MPI_Barrier(MPI_COMM_WORLD);
or is there a more efficient way to structure this for variable-complexity chunks or variable-speed nodes?
As @Gilles Gouaillardet pointed out, the keyword in such a scenario is MPI_ANY_SOURCE. Using it, a process can receive a message from any source. To find out which process sent that message, you can read status.MPI_SOURCE from the status of the receive call.
MPI_Status status;
int buf[32];   // work buffer (placeholder)
int tag = 0;   // placeholder tag

if (rank == 0) {
    //send initial work to all processes
    while (true) {
        MPI_Recv(buf, 32, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
        // do the distribution logic
        MPI_Send(buf, 32, MPI_INT, status.MPI_SOURCE, tag, MPI_COMM_WORLD);
        // break out of the loop once the work is over, and send all the processes
        // a message to stop waiting for work
    }
}
else {
    while (true) {
        // receive work from rank 0
        MPI_Recv(buf, 32, MPI_INT, 0, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
        // Perform computation and send back the result
        MPI_Send(buf, 32, MPI_INT, 0, tag, MPI_COMM_WORLD);
        // break out of this loop when asked by master 0 via some kind of special message
    }
}
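To make the termination ("killswitch") part concrete, here is a small self-contained sketch of that master/worker pattern. The TAG_WORK/TAG_STOP convention, the one-int payload, and the task count are my own assumptions for illustration, not something from the answer above:

    // Master/worker sketch: rank 0 hands out tasks to whichever worker
    // reports back first, and sends a stop tag once the work runs out.
    #include <mpi.h>
    #include <stdio.h>

    #define TAG_WORK 1
    #define TAG_STOP 2

    int main(int argc, char **argv)
    {
        MPI_Init(&argc, &argv);

        int rank, size;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &size);

        const int ntasks = 100;          /* pretend work items 0..99 */

        if (rank == 0) {
            int next = 0, result, active = size - 1;
            MPI_Status status;

            /* Prime every worker with either a first task or an immediate stop. */
            for (int w = 1; w < size; ++w) {
                if (next < ntasks) {
                    MPI_Send(&next, 1, MPI_INT, w, TAG_WORK, MPI_COMM_WORLD);
                    ++next;
                } else {
                    MPI_Send(&next, 1, MPI_INT, w, TAG_STOP, MPI_COMM_WORLD);
                    --active;
                }
            }

            /* Hand the remaining tasks to whichever worker reports back first. */
            while (active > 0) {
                MPI_Recv(&result, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG,
                         MPI_COMM_WORLD, &status);
                /* ... store `result`, remembering which task it belongs to ... */
                if (next < ntasks) {
                    MPI_Send(&next, 1, MPI_INT, status.MPI_SOURCE, TAG_WORK, MPI_COMM_WORLD);
                    ++next;
                } else {
                    MPI_Send(&next, 1, MPI_INT, status.MPI_SOURCE, TAG_STOP, MPI_COMM_WORLD);
                    --active;
                }
            }
        } else {
            int task, result;
            MPI_Status status;
            while (1) {
                MPI_Recv(&task, 1, MPI_INT, 0, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
                if (status.MPI_TAG == TAG_STOP)
                    break;
                result = task * task;    /* stand-in for the real per-chunk work */
                MPI_Send(&result, 1, MPI_INT, 0, TAG_WORK, MPI_COMM_WORLD);
            }
        }

        MPI_Finalize();
        return 0;
    }

The same shape scales up to real chunks: replace the single int with a chunk of the array (plus its offset) and the squared value with the actual computation. The pattern only pays off when the per-chunk cost varies enough to outweigh the extra messaging.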

MPI - scattering filepaths to processes

I have 4 filepaths in the global_filetable and I am trying to scatter 2 filepaths to each process.
Process 0 has the proper 2 paths, but there is something strange in process 1 (null)...
EDIT:
Here's the full code:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <limits.h> // PATH_MAX
#include <mpi.h>

int main(int argc, char *argv[])
{
    char** global_filetable = (char**)malloc(4 * PATH_MAX * sizeof(char));
    for(int i = 0; i < 4; ++i) {
        global_filetable[i] = (char*)malloc(PATH_MAX * sizeof(char));
        strncpy(global_filetable[i], "/path/", PATH_MAX);
    }
    /*for(int i = 0; i < 4; ++i) {
        printf("%s\n", global_filetable[i]);
    }*/

    int rank, size;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    char** local_filetable = (char**)malloc(2 * PATH_MAX * sizeof(char));

    MPI_Scatter(global_filetable, 2*PATH_MAX, MPI_CHAR, local_filetable, 2*PATH_MAX, MPI_CHAR, 0, MPI_COMM_WORLD);

    {
        /* now all processors print their local data: */
        for (int p = 0; p < size; ++p) {
            if (rank == p) {
                printf("Local process on rank %d is:\n", rank);
                for (int i = 0; i < 2; i++) {
                    printf("path: %s\n", local_filetable[i]);
                }
            }
            MPI_Barrier(MPI_COMM_WORLD);
        }
    }
    MPI_Finalize();
    return 0;
}
Output:
Local process on rank 0 is:
path: /path/
path: /path/
Local process on rank 1 is:
path: (null)
path: (null)
Do you have any idea why I am getting those nulls?
First, your allocation is inconsistent:
char** local_filetable = (char**)malloc(2 * PATH_MAX * sizeof(char));
The type char** indicates an array of char*, but you allocate a contiguous memory block, which would indicate a char*.
The easiest way would be to use the contiguous memory as char* for both global and local filetables. Depending on what get_filetable() actually does, you may have to convert. You can then index it like this:
char* entry = &filetable[i * PATH_MAX];
You can then simply scatter like this:
MPI_Scatter(global_filetable, 2 * PATH_MAX, MPI_CHAR,
local_filetable, 2 * PATH_MAX, MPI_CHAR, 0, MPI_COMM_WORLD);
Note that there is no more displacement, every rank just gets an equal sized chunk of the contiguous memory.
The next step would be to define a C and MPI struct encapsulating PATH_MAX characters so you can get rid of the constant usage of PATH_MAX and crude indexing.
I think this is much nicer (less complex, less memory management) than using actual char**. You would only need that if memory waste or redundant data transfer becomes an issue.
P.S. Make sure never to put more than PATH_MAX - 1 characters in a filetable entry, to keep space for the trailing \0.
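To make the contiguous-buffer approach concrete, here is a minimal sketch of my own (the 4 paths, 2 ranks, and hard-coded "/path/N" strings are assumptions purely for illustration; run it with exactly 2 ranks):

    // One flat char buffer with fixed-size PATH_MAX slots, scattered in
    // units of 2*PATH_MAX chars per rank.
    #include <stdio.h>
    #include <string.h>
    #include <stdlib.h>
    #include <limits.h> // PATH_MAX
    #include <mpi.h>

    int main(int argc, char *argv[])
    {
        MPI_Init(&argc, &argv);

        int rank;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        /* One flat buffer: 4 fixed-size slots of PATH_MAX chars each. */
        char *global_filetable = NULL;
        if (rank == 0) {
            global_filetable = malloc(4 * PATH_MAX);
            for (int i = 0; i < 4; ++i)
                snprintf(&global_filetable[i * PATH_MAX], PATH_MAX, "/path/%d", i);
        }

        /* Each rank receives 2 slots of the flat buffer. */
        char *local_filetable = malloc(2 * PATH_MAX);
        MPI_Scatter(global_filetable, 2 * PATH_MAX, MPI_CHAR,
                    local_filetable,  2 * PATH_MAX, MPI_CHAR, 0, MPI_COMM_WORLD);

        for (int i = 0; i < 2; ++i)
            printf("rank %d path %d: %s\n", rank, i, &local_filetable[i * PATH_MAX]);

        free(global_filetable);
        free(local_filetable);
        MPI_Finalize();
        return 0;
    }

The "struct encapsulating PATH_MAX characters" mentioned above would roughly correspond to an MPI_Type_contiguous(PATH_MAX, MPI_CHAR, &path_type) datatype (committed with MPI_Type_commit), which lets you scatter in units of whole paths rather than characters.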
Okay, I'm stupid.
char global_filetable[NUMBER_OF_STRINGS][PATH_MAX];
for(int i = 0; i < 4; ++i) {
    strcpy(global_filetable[i], "/path/");
}
char local_filetable[2][PATH_MAX];
Now it works!

_CrtIsValidHeapPointer(pUserData), Debug Assertion Failed Visual C++ (MPI)

I am using the MPI_Reduce and MPI_Scatter functions to scatter an array of integers across "N" processors and to print the partial and accumulated sums of the array. I am using Microsoft MPI (MS-MPI) on Visual Studio 2010, but every execution raises the assertion "_CrtIsValidHeapPointer(pUserData)" under the title "Debug Assertion Failed". The code is as under:
#include <mpi.h>
#include <iostream>
using namespace std;

int main(int argc, char *argv[]) {
    int size;
    int rank;
    int partialsum = 0;
    int root = 0;
    int accum = 0;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int *globaldata = NULL;
    int *localdata = new int(4);

    if (rank == root) {
        globaldata = new int(size*4);
        for (int i=0; i<(size*4); i++)
            globaldata[i] = 2*i+1;

        cout<<"Processor"<<rank<<" has global data: ";
        for (int i=0; i<(size*4); i++)
            cout<<globaldata[i]<<" ";
        cout<<"\n";
    }

    MPI_Scatter(globaldata, 4, MPI_INT, localdata, 4, MPI_INT, root, MPI_COMM_WORLD);

    cout<<"Processor "<<rank<<"has local data";
    for(int i=0; i<4;i++)
        cout<<" "<<localdata[i];
    cout<<endl;

    for(int k=0;k<4;k++)
        partialsum += localdata[k];
    cout<<"Processor "<<rank<<" Partial Sum = "<<partialsum<<"\n";

    MPI_Reduce(&partialsum,&accum,1,MPI_INT,MPI_SUM, root,MPI_COMM_WORLD);

    if (rank == 0) {
        cout<<"Processor "<<rank<<" Accumulated Sum = "<<accum;
    }

    MPI_Finalize();
    return 0;
}
The error is very simple and lies here:
globaldata = new int(size*4);
The syntax to allocate dynamic arrays with the new operator is new type[size]:
globaldata = new int[size*4];
In your case, space for a single int initialised to size*4 is allocated instead, and the initialisation loop that immediately follows the allocation at the root writes past the end of the allocated memory, thus destroying the heap structure. Note that localdata = new int(4) has the same problem: it allocates a single int initialised to 4, so the receive buffer for MPI_Scatter should be new int[4] as well.

Parallel hashing using openmp

I have a piece of code for parallel hashing; the insert code is as follows:
int main(int argc, char** argv){
    .....
    Entry* table; //hash table
    for(size_t i=0;i<N;i++){
        keys[i] = i;
        values[i] = rand(); //random key-value pairs
    }
    int omp_p = omp_get_max_threads();
    #pragma omp parallel for
    for(int p=0;p<omp_p;p++){
        size_t start = p*N/omp_p;
        size_t end = (p+1)*N/omp_p; //each thread gets contiguous chunks of the arrays
        for(size_t i=start;i<end;i++){
            size_t key = keys[i];
            size_t value = values[i];
            if(insert(table,key,value) == 0){
                printf("Failure!\n");
            }
        }
    }
    ....
    return 0;
}
int insert(Entry* table,size_t key, size_t value){
    Entry entry = (((Entry)key) << 32)+value; //Coalesce key and value into an entry
    /*Use cuckoo hashing*/
    size_t location = hash_function_1(key);
    for(size_t its=0;its<MAX_ITERATIONS;its++){
        entry = __sync_lock_test_and_set(&table[location],entry);
        key = get_key(entry);
        if(key == KEY_EMPTY)
            return 1;
        /*We have replaced a valid key, try to hash it using next available hash function*/
        size_t location_1 = hash_function_1(key);
        size_t location_2 = hash_function_2(key);
        size_t location_3 = hash_function_3(key);
        if(location == location_1) location = location_2;
        else if(location == location_2) location = location_3;
        else location = location_1;
    }
    return 0;
}
The insert code doesn't scale at all. If I use a single thread for, say, 10M keys, I complete in about 170 ms, whereas using 16 threads takes > 500 ms. My suspicion is that the cache lines holding the table[] array are being bounced between threads during the write operation (__sync_lock_test_and_set(...)), and the resulting invalidations cause the slowdown.
For example, if I modify the insert code to just:
int insert(Entry* table,size_t key, size_t value){
    Entry entry = (((Entry)key) << 32)+value; //Coalesce key and value into an entry
    size_t location = hash_function_1(key);
    table[location] = entry;
    return 1;
}
I still get the same bad performance. Since this is hashing, I cannot control where a particular element hashes to. Any suggestions? Also, if this isn't the right reason, any other pointers as to what might be going wrong? I have tried it with 1M to 100M keys, but single-threaded performance is always better.
I have a few suggestions. First, since the run time of your insert function is not constant, you should use schedule(dynamic). Second, you should let OpenMP divide the iterations rather than doing it yourself (one reason, though not the main one, is that the way you have it now N has to be a multiple of omp_p). If you want some control over how the iterations are divided, try changing the chunk size like this: schedule(dynamic,n), where n is the chunk size.
#pragma omp parallel for schedule(dynamic)
for(size_t i=0;i<N;i++){
    size_t key = keys[i];
    size_t value = values[i];
    if(insert(table,key,value) == 0){
        printf("Failure!\n");
    }
}
I would try experimenting with a strategy based on locks, like this simple snippet shows:
#include <omp.h>

#define NHASHES 4
#define NTABLE  1000000

typedef size_t (*hash_f)(size_t);

int main(int argc, char** argv) {
    Entry      table [NTABLE ];
    hash_f     hashes[NHASHES];
    omp_lock_t locks [NTABLE ];
    /* ... */
    for(size_t ii = 0; ii < N; ii++) {
        keys  [ii] = ii;
        values[ii] = rand();
    }
    for(size_t ii = 0; ii < NTABLE; ii++) {
        omp_init_lock(&locks[ii]);
    }
    #pragma omp parallel
    {
        #pragma omp for schedule(static)
        for(int ii = 0; ii < N; ii++) {
            size_t key   = keys  [ii];
            size_t value = values[ii];
            Entry  entry = (((Entry)key) << 32) + value;
            for (size_t jj = 0; jj < NHASHES; jj++) {
                size_t location = hashes[jj](key);    // I assume this is the computationally demanding part
                omp_set_lock(&locks[location]);       // Locks the hash table location before working on it
                if ( get_key(table[location]) == KEY_EMPTY ) {
                    table[location] = entry;
                    omp_unset_lock(&locks[location]); // Release the lock before leaving the loop
                    break;
                }
                omp_unset_lock(&locks[location]);     // Unlocks the hash table location
            }
            // Handle failures here
        }
    } /* pragma omp parallel */
    for(size_t ii = 0; ii < NTABLE; ii++) {
        omp_destroy_lock(&locks[ii]);
    }
    /* ... */
    return 0;
}
With a little more machinery you can handle a variable number of locks ranging from 1 (equivalent to a critical section) to NTABLE (equivalent to an atomic construct) and see if the granularity in-between provides some benefit.
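As a sketch of that "variable number of locks" idea (my own illustration, not part of the answer above): keep NLOCKS locks and map each table slot onto one of them, so NLOCKS = 1 degenerates into a single critical section and NLOCKS = NTABLE recovers one lock per slot. Entry, get_key, and KEY_EMPTY here are the definitions from the question's code:

    // Illustrative only: striped locking with NLOCKS locks guarding NTABLE slots.
    #include <omp.h>

    #define NTABLE 1000000
    #define NLOCKS 4096                      /* tune anywhere between 1 and NTABLE */

    static omp_lock_t locks[NLOCKS];

    /* Map a table slot to the lock guarding it; many slots share one lock. */
    static omp_lock_t *lock_for(size_t location) {
        return &locks[location % NLOCKS];
    }

    /* Insert into one slot under its stripe lock; returns 1 on success. */
    int guarded_insert(Entry *table, size_t location, Entry entry) {
        int ok = 0;
        omp_lock_t *l = lock_for(location);
        omp_set_lock(l);
        if (get_key(table[location]) == KEY_EMPTY) {
            table[location] = entry;
            ok = 1;
        }
        omp_unset_lock(l);
        return ok;
    }

Initialisation and destruction of the NLOCKS locks would mirror the omp_init_lock/omp_destroy_lock loops in the snippet above, just over NLOCKS instead of NTABLE.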

How to reduce page faults in this program?

I'm getting more than 1000 page faults in this program.
Can I reduce them to some smaller value, or even to zero?
Or are there any other changes that could speed up the execution?
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char* argv[])
{
    register unsigned int u, v, i;
    register unsigned int arr_size = 0;
    register unsigned int b_size = 0;
    register unsigned int c;
    register unsigned int *b;
    FILE *file;
    register unsigned int *arr;

    file = fopen(argv[1], "r");
    arr = (unsigned int *)malloc(4*10000000);
    while(!feof(file)){
        ++arr_size;
        fscanf(file, "%u\n", &arr[arr_size-1]);
    }
    fclose(file);

    b = (unsigned int *)malloc(arr_size*4);
    if (arr_size != 0)
    {
        ++b_size;
        b[b_size-1] = 0;
        for (i = 1; i < arr_size; ++i)
        {
            if (arr[b[b_size-1]] < arr[i])
            {
                ++b_size;
                b[b_size-1] = i;
                continue;
            }
            for (u = 0, v = b_size-1; u < v;)
            {
                c = (u + v) / 2;
                if (arr[b[c]] < arr[i]) u = c+1; else v = c;
            }
            if (arr[i] < arr[b[u]])
            {
                b[u] = i;
            }
            if (i > arr_size) break;
        }
    }
    free(arr);
    free(b);
    printf("%u\n", b_size);
    return 0;
}
The line:
arr = (unsigned int *)malloc(4*10000000);
is not good programming style. Are you sure your file really contains 40 MB worth of numbers? Try not to allocate all of that memory up front in the first lines of your program.
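One way to follow that advice (my own sketch, not part of the answer) is to start with a small buffer and grow it geometrically with realloc as values are read, so the allocation tracks the actual file size:

    // Sketch: grow the input array on demand instead of reserving 40 MB up front.
    #include <stdio.h>
    #include <stdlib.h>

    int main(int argc, char* argv[])
    {
        FILE *file = fopen(argv[1], "r");
        if (!file) return 1;

        size_t capacity = 1024;                 /* start small */
        size_t arr_size = 0;
        unsigned int *arr = malloc(capacity * sizeof *arr);

        unsigned int value;
        while (fscanf(file, "%u", &value) == 1) {
            if (arr_size == capacity) {         /* double when full */
                capacity *= 2;
                unsigned int *tmp = realloc(arr, capacity * sizeof *arr);
                if (!tmp) { free(arr); fclose(file); return 1; }
                arr = tmp;
            }
            arr[arr_size++] = value;
        }
        fclose(file);

        printf("read %zu values\n", arr_size);
        free(arr);
        return 0;
    }

Driving the loop with fscanf's return value also avoids the classic while(!feof(file)) pitfall, where the last value can end up being processed twice.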
