I have a problem with 3 mpi_bcast and one mpi_scatter, my program don't work well ,mpi_scatter don't work and globalparcsr don't scatter between nodes. when i delete second and third mpi_bcast ,mpi_scatter work well. I want broadcast a and globalindividual and globalfitness and then scatter globalparcsr, part of my code as bellow:
int malloc2dint(int ***array, int n, int m) {
/* allocate the n*m contiguous items */
int *p = (int *)malloc(n*m * sizeof(int));
if (!p) return -1;
/* allocate the row pointers into the memory */
(*array) = (int **)malloc(n * sizeof(int*));
if (!(*array)) {
free(p);
return -1;
}
/* set up the pointers into the contiguous memory */
for (int i = 0; i<n; i++)
(*array)[i] = &(p[i*m]);
return 0;
}
int main(int argc, char *argv[]) {
int size, rank, divided_pop_size, sum = 0, root = 0, procgridsize, sum2 = 0,generation=0;
int **globalindividual, **localindividual;
int *globalfitness, *localfitness;
int *globalparcsr, *localparcsr;
int **recbuf;
int *sendcounts, *parsendcount; //specifying the number of elements to send to each processor
int *displs, *pardispls; //Entry i specifies the displacement
MPI_Status status;
int offset, rows;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
divided_pop_size = n_initial_pop / size;
if (rank == root)
{
malloc2dint(&globalindividual, n_initial_pop, num_vertices);
read_graph();
globalfitness = (int*)malloc(n_initial_pop * sizeof(int));
globalparcsr = (int*)malloc(n_initial_pop * sizeof(int));
globalindividual = initial_population(globalindividual, n_initial_pop);
for (int i = 0; i < n_initial_pop; i++) {
printf("\n");
for (int j = 0; j < num_vertices; j++)
printf("%d", globalindividual[i][j]);
}
}
for (int p = 0; p < size; p++) {
if (rank == p) {
malloc2dint(&localindividual,n_initial_pop + 2, num_vertices);
localindividual = initial_population(localindividual, divided_pop_size + 2);
}
}
MPI_Bcast(&a[0][0], 5000 * 5000, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
MPI_Bcast(&globalindividual[0][0], n_initial_pop*num_vertices, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&globalfitness, n_initial_pop, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Finalize();
return 0;
}
a is a 2d array and globalindividual is a 2d array with 12 rows and 8 columns and globalfitness is 1d array with size 12
please help me.
Related
I am trying to solve problem with c++: Find all unique elements of a two-dimensional array of integers using MPI_Scatter and MPI_Comm_split to distribute the array's rows among a set of processes, so that the set of processes is split into three groups.
Got the code
#include <iostream>
#include <unordered_set>
#include <mpi.h>
using namespace std;
int main(int argc, char* argv[])
{
int rank, size;
int rows = 0, columns = 0;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
int* matrix = nullptr;
if (rank == 0)
{
cout << "Enter the number of rows: ";
cin >> rows;
cout << "Enter the number of columns: ";
cin >> columns;
matrix = new int[rows * columns];
cout << "Enter the elements of the matrix: " << endl;
for (int i = 0; i < rows; i++)
{
for (int j = 0; j < columns; j++)
{
cin >> matrix[i * columns + j];
}
}
}
MPI_Bcast(&rows, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&columns, 1, MPI_INT, 0, MPI_COMM_WORLD);
int sub_size = rows / size;
int* local_matrix = new int[sub_size * columns];
MPI_Scatter(matrix, sub_size * columns, MPI_INT, local_matrix, sub_size * columns, MPI_INT, 0, MPI_COMM_WORLD);
unordered_set<int> local_set;
for (int i = 0; i < sub_size; i++)
{
for (int j = 0; j < columns; j++)
{
local_set.insert(local_matrix[i * columns + j]);
}
}
MPI_Comm sub_comm;
MPI_Comm_split(MPI_COMM_WORLD, rank / (size / 3), rank, &sub_comm);
int sub_rank, new_sub_size;
MPI_Comm_rank(sub_comm, &sub_rank);
MPI_Comm_size(sub_comm, &new_sub_size);
unordered_set<int>* global_set = nullptr;
if (sub_rank == 0)
{
global_set = new unordered_set<int>[new_sub_size];
}
MPI_Gather(&local_set, sizeof(unordered_set<int>), MPI_BYTE, global_set, sizeof(unordered_set<int>), MPI_BYTE, 0, sub_comm);
if (sub_rank == 0)
{
unordered_set<int> final_set;
for (int i = 0; i < new_sub_size; i++)
{
for (auto it = global_set[i].begin(); it != global_set[i].end(); it++) {
final_set.insert(*it);
}
}
cout << "The unique elements in the matrix are: ";
for (auto it = final_set.begin(); it != final_set.end(); it++) {
cout << *it << " ";
}
cout << endl;
delete[] global_set;
}
delete[] local_matrix;
if (rank == 0) {
delete[] matrix;
}
MPI_Finalize();
return 0;
}
After compile and input data microsoft visual studio 2019 gives an error message
Unhandled exception at 0x0099B514 in ConsoleApplication15.exe: 0xC0000094: Integer division by zero.
to this line
MPI_Comm_split(MPI_COMM_WORLD, rank / (size / 3), rank, &sub_comm);
How fix it?
I am trying to implement a parallel reduction sum of even and odd number Separately in CUDA.
I'm new in CUDA programming and I'm trying so hard but I can't find a solution.
I have for example the array : [5, 8, 0, -6, 2]. And the result need to be [4, 5] (Even : 8+0-6+2=4, Odd : 5=5).
But the result of my following code is [8, 5].
I think that my problem is in the notion of "shared" but I do not understand why.
__global__ void sumEvenOdd(int *a, int *b, int N){
int column = blockIdx.x * blockIdx.x + threadIdx.x;
__shared__ int s_data[2];
if (column < N){
if (a[column] % 2 == 0){
s_data[0] += a[column];
}
else{
s_data[1] += a[column];
}
__syncthreads();
b[0] = s_data[0];
b[1] = s_data[1];
}
}
void initArray(int *a, int N){
for (unsigned int i = 0; i < N; i++){
a[i] = rand() % 100;
}
}
void verify_result(int *a, int *b, int N){
int *verify_b;
verify_b = (int*)malloc(2 * sizeof(int));
verify_b[0] = 0;
verify_b[1] = 0;
for (unsigned int i = 0; i < N; i++){
if (a[i] % 2 == 0){
verify_b[0] += a[i];
}
else{
verify_b[1] += a[i];
}
}
for (unsigned int i = 0; i < 2; i++){
assert(verify_b[i] == b[i]);
}
}
void printResult(int *a, int *b, int N){
printf("\n");
for (unsigned int i = 0; i < N; i++){
printf("%d, ", a[i]);
}
printf("\n");
for (unsigned int i = 0; i < 2; i++){
printf("%d, ", b[i]);
}
}
int main(){
//Array sizes;
int N = 5;
//Size (in bytes) of matrix
size_t bytes = N * sizeof(int);
//Host pointers
int *a, *b;
// Allocate host memory
a = (int*)malloc(bytes);
b = (int*)malloc(2 * sizeof(int));
// Initialize array
initArray(a, N);
// Device pointers
int *d_a, *d_b;
// Allocated device memory
cudaMalloc(&d_a, bytes);
cudaMalloc(&d_b, 2 * sizeof(int));
// Copy data to the device
cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice);
//Number of threads
int THREADS = 128;
//Number of blocks
int BLOCKS = (N + THREADS - 1) / THREADS;
// Launch kernel
sumEvenOdd<<<BLOCKS, THREADS>>>(d_a, d_b, N);
cudaDeviceSynchronize();
// Copy back to the host
cudaMemcpy(b, d_b, 2 * sizeof(int), cudaMemcpyDeviceToHost);
// Check result
verify_result(a, b, N);
printResult(a, b, N);
return 0;
}
you cannot just use
s_data[1] += a[column];
remember all units are going to execute this line at the same time, and store in the same position, so all threads are storing into s_data at the same time.
instead you should use atomic add
atomicAdd(&s_data[1], a[column]);
and you should also be initializing s_data to zeros.
I'm currently working on a MPI-program and I'm trying to send blocks of a matrix with scatterv to all processes.
Process description
The matrix is given as an array.
First I produce a datatype with MPI_Type_vector to create the necessary block out of the original array.
Second I create a MPI_Type_struct that should hold rows of blocks.
#include <math.h>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#define n 16
int main(int argc, char *argv[])
{
MPI_Init(&argc, &argv);
MPI_Comm comm = MPI_COMM_WORLD;
int p,r;
MPI_Comm_size(comm, &p);
MPI_Comm_rank(comm, &r);
int *arr;
arr = NULL;
if (r == 0){
arr = (int *) malloc(n * n * sizeof(int));
for (int i = 0; i < n * n; i++) arr[i] = i;
for (int i = 0; i < n; i++){
printf("\n");
for (int j = 0; j < n; j++)
printf("%4d", arr[i * n + j]);
}
}
printf("\n");
int ps = sqrt(p);
int ns = n / ps;
if (r == 0) {
printf("ps: %d ns: %d\n", ps, ns);
}
/* create datatype */
MPI_Datatype block;
MPI_Type_vector(ns, ns, n, MPI_INT, &block);
int blocks[ps];
MPI_Aint displs[ps];
for (int i = 0; i < ps; i++) {
blocks[i] = 1;
displs[i] = i * sizeof(int);
}
MPI_Datatype types[ps];
//for (int i = 0; i < ps - 1; i++) types[i] = block;
//types[ps - 1] = MPI_UB;
types[0] = block;
for (int i = 1; i < ps; i++) types[i] = MPI_UB;
//types[0] = block;
//types[1] = MPI_UB;
if (r == 0) {
printf("displs:\n");
for(int i = 0; i < ps; i++) printf("%3ld", displs[i]);
printf("\n");
}
MPI_Datatype row;
MPI_Type_struct(ps, blocks, displs, types, &row);
MPI_Type_commit(&row);
/* prepare scatter */
int sdispl[p]; int sendcounts[p];
for (int i = 0; i < p; i++) {
sdispl[i] = (i % ps) + (i / ps) * (ns * ps);
sendcounts[i] = 1;
}
if (r == 0) {
printf("sdispl: \n");
for (int i = 0; i < 4; i++) printf("%3d", sdispl[i]);
printf("\n");
}
int rcv[ns * ns];
MPI_Scatterv(arr, sendcounts, sdispl, row, rcv, ns * ns, MPI_INT, 0, comm);
int result = 1;
if (r == result) {
printf("result for %d:\n", result);
for (int i = 0; i < ns * ns; i++) {
printf("%4d", rcv[i]);
if ((i+1) % ns == 0) printf("\n");
}
}
if (arr != NULL) free(arr);
MPI_Finalize();
return 0;
}
So far the structure of the blocks is correct.
The problem
The block, that was sent to process r = 1 starts with 3 instead of 4. The block for process r = 2 also starts with 6 and the one for process r = 3 starts with 9.
For r == 4 it jumps to 48.
What it should do
r start
0 0
1 4
2 8
3 12
4 64
5 68
6 ...
15 204
The help I would need
I think, that I'm making some mistake with displ and sdispl.
Compiling and Running the example
The code is compiled with the folowing command:
mpicc -o main main.c -lm
I run the code with:
mpirun -np 16 ./main
Thanks for any help in advance!
With the hint of Zulan I was able to solve my problem.
The following code is based on the excellent answer to subarrays.
#include <math.h>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#define n 8
void print_arr(int *arr, int x) {
printf("\n");
for (int i = 0; i < x*x; i++){
if (i % x == 0) printf("\n");
printf("%4d", arr[i]);
}
printf("\n");
}
int main(int argc, char *argv[])
{
MPI_Init(&argc, &argv);
MPI_Comm comm = MPI_COMM_WORLD;
int p, r;
MPI_Comm_size(comm, &p);
MPI_Comm_rank(comm, &r);
/* number of proceses in dim x and dim y */
int ps = sqrt(p);
/* number of elements in dim x and dim y in sarr */
int ns = n/ps;
/* array of data - distributed by process 0 */
int *arr = NULL;
if (r==0) {
arr = (int *) malloc(n * n * sizeof(int));
for (int i = 0; i < n*n; i++) arr[i] = i;
print_arr(arr, n);
}
MPI_Datatype type, resizedtype;
int sizes[2] = {n,n};
int subsizes[2] = {ns,ns};
int starts[2] = {0,0};
MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_INT, &type);
MPI_Type_create_resized(type, 0, ns*sizeof(int), &resizedtype);
MPI_Type_commit(&resizedtype);
int counts[p];
for (int i = 0; i < p; i++) counts[i] = 1;
int displs[p];
for (int i = 0; i < p; i++) displs[i] = i%ps + i/ps * ns * ps;
/* subarray to store distributed data */
int sarr[ns * ns];
/* send submatrices to all processes */
MPI_Scatterv(arr, counts, displs, resizedtype, sarr, ns*ns, MPI_INT, 0, comm);
/* print received data for process pr */
int pr = 3;
if (r == pr)
print_arr(sarr, ns);
/* free arr */
if (arr != NULL) free(arr);
MPI_Finalize();
return 0;
}
You can compile the example with
mpicc -o main main.c
and run it with
mpirun -np 4 ./main
I'm trying hard all day to implement matrix multiply with help of MPI, all examples from the Internet didn't work for me (I don't know why, it compiles, run but not computing). Here is what I'm doing:
From bash:
mpirun -n 2 out/lb8
It reading matrix 2x4 (1 row per process) and starting to compute.
The problem is in SendRecv block (or generally in collecting data)
void Matrix_MPY(double **matrix_a, double **matrix_b, double ***matrix_c, int a_rows, int a_cols) {
int i, j;
int process_rank, process_count;
MPI_Comm_rank(MPI_COMM_WORLD, &process_rank);
MPI_Comm_size(MPI_COMM_WORLD, &process_count);
if (a_rows % process_count != 0) {
error_code = NOT_DEVIDED_BY_RANK_EXCEPTION;
return;
}
int rows_per_process = a_rows / process_count;
int current_row = rows_per_process * process_rank;
double **temp;
temp = (double **) malloc(sizeof(double *) * a_rows);
for (i = 0; i < a_rows; ++i){
temp[i] = (double *) malloc(sizeof(double) * a_rows);
}
for (i = current_row; i < current_row + rows_per_process; ++i) {
for (j = 0; j < a_rows; ++j)
{
int k;
for(k = 0; k < a_cols; ++k){
temp[i][j] += matrix_a[i][k] * matrix_b[k][j];
}
}
MPI_Sendrecv(temp[i], a_rows, MPI_DOUBLE, ROOT, TAG, temp[i], a_rows, MPI_DOUBLE, process_rank, TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
*matrix_c = temp;
}
This solution is worked for me
....
if (process_rank != ROOT)
MPI_Send(temp[i], a_rows, MPI_DOUBLE, ROOT, i, MPI_COMM_WORLD);
}
if (process_rank == ROOT) {
for (i = 1; i < process_count; ++i)
{
for (j = i * rows_per_process; j < i * rows_per_process + rows_per_process; ++j)
{
MPI_Recv(temp[j], a_rows, MPI_DOUBLE, i, j, MPI_COMM_WORLD, MPI_STATUSES_IGNORE);
}
}
}
*matrix_c = temp;
I have a laptop with an NVIDIA GT750M 4Gb (compute capability 3.0) graphics card. I need to sort an array of structures on CUDA (about 3 × 10^7 elements). But I cannot figure out how, since I do not have enough experience in CUDA. When using thrust::sort I get strange results (it takes a few tens of minutes, while std::sort takes 1 minute).
struct MyStruct
{
float key;
float a;
float b;
int c;
int d;
int e;
int f;
bool flag;
}
bool minCompare(const MyStruct lhs, const MyStruct rhs)
{
return lhs.key < rhs.key;
}
As Robert Crovella has pointed out in his comment, tents of minutes most likely means that you are doing something wrong. I'm providing an example below in which I compare the performance of sorting an Array of Structures (AoS) and a Structure of Arrays (SoA) using thrust::sort and thrust::sort_by_key. I'm running on a laptop GeForce GT 540M and compiling with CUDA 5.5, so you have a more powerful card than mine. For 100000 elements the execution time is of the order of seconds in both cases. As I pointed out in my comment, the first case is more demanding in terms of computation time (1675ms) than the second (668.9ms).
#include <thrust\device_vector.h>
#include <thrust\sort.h>
struct MyStruct1
{
int key;
int value1;
int value2;
};
struct MyStruct2
{
int N;
int* key;
int* value1;
int* value2;
MyStruct2(int N_) {
N = N_;
cudaMalloc((void**)&key,N*sizeof(int));
cudaMalloc((void**)&value1,N*sizeof(int));
cudaMalloc((void**)&value2,N*sizeof(int));
}
};
__host__ __device__ bool operator<(const MyStruct1 &lhs, const MyStruct1 &rhs) { return (lhs.key < rhs.key); };
void main(void)
{
const int N = 10000;
float time;
cudaEvent_t start, stop;
/*******************************/
/* SORTING ARRAY OF STRUCTURES */
/*******************************/
thrust::host_vector<MyStruct1> h_struct1(N);
for (int i = 0; i<N; i++)
{
MyStruct1 s;
s.key = rand()*255;
s.value1 = rand()*255;
s.value2 = rand()*255;
h_struct1[i] = s;
}
thrust::device_vector<MyStruct1> d_struct(h_struct1);
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
thrust::sort(d_struct.begin(), d_struct.end());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Sorting array of structs - elapsed time: %3.1f ms \n", time);
h_struct1 = d_struct;
//for (int i = 0; i<N; i++)
//{
// MyStruct1 s = h_struct1[i];
// printf("key %i value1 %i value2 %i\n",s.key,s.value1,s.value2);
//}
//printf("\n\n");
/*******************************/
/* SORTING STRUCTURES OF ARRAYS*/
/*******************************/
MyStruct2 d_struct2(N);
thrust::host_vector<int> h_temp_key(N);
thrust::host_vector<int> h_temp_value1(N);
thrust::host_vector<int> h_temp_value2(N);
//for (int i = 0; i<N; i++)
//{
// h_temp_key[i] = rand()*255;
// h_temp_value1[i] = rand()*255;
// h_temp_value2[i] = rand()*255;
// printf("Original data - key %i value1 %i value2 %i\n",h_temp_key[i],h_temp_value1[i],h_temp_value2[i]);
//}
//printf("\n\n");
cudaMemcpy(d_struct2.key,h_temp_key.data(),N*sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(d_struct2.value1,h_temp_value1.data(),N*sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(d_struct2.value2,h_temp_value2.data(),N*sizeof(int),cudaMemcpyHostToDevice);
// wrap raw pointers with device pointers
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
thrust::device_ptr<int> dev_ptr_key = thrust::device_pointer_cast(d_struct2.key);
thrust::device_ptr<int> dev_ptr_value1 = thrust::device_pointer_cast(d_struct2.value1);
thrust::device_ptr<int> dev_ptr_value2 = thrust::device_pointer_cast(d_struct2.value2);
thrust::device_vector<int> d_indices(N);
thrust::sequence(d_indices.begin(), d_indices.end(), 0, 1);
// first sort the keys and indices by the keys
thrust::sort_by_key(dev_ptr_key, dev_ptr_key + N, d_indices.begin());
// Now reorder the ID arrays using the sorted indices
thrust::gather(d_indices.begin(), d_indices.end(), dev_ptr_value1, dev_ptr_value1);
thrust::gather(d_indices.begin(), d_indices.end(), dev_ptr_value2, dev_ptr_value2);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Sorting struct of arrays - elapsed time: %3.1f ms \n", time);
cudaMemcpy(h_temp_key.data(),d_struct2.key,N*sizeof(int),cudaMemcpyDeviceToHost);
cudaMemcpy(h_temp_value1.data(),d_struct2.value1,N*sizeof(int),cudaMemcpyDeviceToHost);
cudaMemcpy(h_temp_value2.data(),d_struct2.value2,N*sizeof(int),cudaMemcpyDeviceToHost);
//for (int i = 0; i<N; i++) printf("Ordered data - key %i value1 %i value2 %i\n",h_temp_key[i],h_temp_value1[i],h_temp_value2[i]);
//printf("\n\n");
getchar();
}
For the sake of simplicity, I have skipped adding proper CUDA error check in the sense of What is the canonical way to check for errors using the CUDA runtime API?.
Just in case anyone's still looking, if you decide to use Thrust libraries, you can make a zip iterator of tuples from the structure of arrays that you want to sort, and pass this iterator to thrust::sort_by_key() like so:
void sort() {
const int N = 6;
int *keys = new int[N]; // = { 1, 4, 2, 8, 5, 7};
char *vals = new char[N]; // = {'a', 'b', 'c', 'd', 'e', 'f'};
int *addr = new int[N]; // = { 1, 2, 3, 4, 5, 6};
keys[0]=1; keys[1]=4; keys[2]=2; keys[3]=8; keys[4]=5; keys[5]=7;
vals[0]='a'; vals[1]='b'; vals[2]='c'; vals[3]='d'; vals[4]='e'; vals[5]='f';
addr[0]=1; addr[1]=2; addr[2]=3; addr[3]=4; addr[4]=5; addr[5]=6;
int *d_keys, *d_addr;
char *d_vals;
CUDA_SAFE(cudaMalloc((void **)&d_keys, N * sizeof(int)));
CUDA_SAFE(cudaMalloc((void **)&d_addr, N * sizeof(int)));
CUDA_SAFE(cudaMalloc((void **)&d_vals, N * sizeof(char)));
CUDA_SAFE(cudaMemcpy(d_keys, keys, N * sizeof(int), cudaMemcpyHostToDevice));
CUDA_SAFE(cudaMemcpy(d_vals, vals, N * sizeof(char), cudaMemcpyHostToDevice));
CUDA_SAFE(cudaMemcpy(d_addr, addr, N * sizeof(int), cudaMemcpyHostToDevice));
auto it = thrust::make_zip_iterator(thrust::make_tuple(d_vals, d_addr));
thrust::sort_by_key(thrust::device, d_keys, d_keys+N, it);
CUDA_SAFE(cudaMemcpy(keys, d_keys, N * sizeof(int), cudaMemcpyDeviceToHost));
CUDA_SAFE(cudaMemcpy(vals, d_vals, N * sizeof(char), cudaMemcpyDeviceToHost));
CUDA_SAFE(cudaMemcpy(addr, d_addr, N * sizeof(int), cudaMemcpyDeviceToHost));
printf("Keys: "); for (int i = 0; i < N; i++) printf("%d ", keys[i]); printf("\n");
printf("Vals: "); for (int i = 0; i < N; i++) printf("%c ", vals[i]); printf("\n");
printf("Addr: "); for (int i = 0; i < N; i++) printf("%d ", addr[i]); printf("\n");
}
I have excluded the headers and checks for brevity.