CUDA: Matrix + Matrix, segmentation fault when printing solution matrix in host

I'm trying to do a simple operation of adding one matrix to another in CUDA, but I get a segmentation fault when I try to check the result. Here's the code:
/* Includes, system */
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#define N 15000
/* DEVICE CODE */
__global__ void sumaMatrices(int *d_matrix1, int *d_matrix2, int *d_matrixSolucion){
int idThread = blockIdx.x*blockDim.x + threadIdx.x;
if (idThread < N)
{
d_matrixSolucion[idThread] = d_matrix1[idThread] + d_matrix2[idThread];
}
}
__host__ void printMatrix(int **matrix)
{
int i, j;
//only 4 so the file is not too big
for (i = 0; i < 4; i++)
{
for (j = 0; j < 4; j++)
{
printf("%d", matrix[i][j]);
printf(" ");
}
printf("\n");
}
printf("\n");
}
/* HOST CODE*/
int main(int argc, char** argv)
{
int i;
int **h_matrix1;
int **h_matrix2;
int **h_matrixSolucion;
int *d_matrix1;
int *d_matrix2;
int *d_matrixSolucion;
h_matrix1 = (int**)malloc(N * sizeof(int*));
for (i = 0; i < N; i++)
{
h_matrix1[i] = (int*)malloc(N * sizeof(int*));
}
h_matrix2 = (int**)malloc(N * sizeof(int*));
for (i = 0; i < N; i++)
{
h_matrix2[i] = (int*)malloc(N * sizeof(int*));
}
h_matrixSolucion = (int**)malloc(N * sizeof(int*));
for (i = 0; i < N; i++)
{
h_matrixSolucion[i] = (int*)malloc(N * sizeof(int*));
}
cudaMalloc((void**)& d_matrix1,N*N*sizeof(int));
cudaMalloc((void**)& d_matrix2,N*N*sizeof(int));
cudaMalloc((void**)& d_matrixSolucion,N*N*sizeof(int));
fillMatrix(h_matrix1);
fillMatrix(h_matrix2);
fillMatrixTo0(h_matrixSolucion);
for(i = 0; i < N; i++)
{
cudaMemcpy(&d_matrix1[i*N], h_matrix1[i], N*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(&d_matrix2[i*N], h_matrix2[i], N*sizeof(int), cudaMemcpyHostToDevice);
}
int tamBloque = 256;
int tamGrid = N/tamBloque + 1;
sumaMatrices<<<tamGrid, tamBloque>>>(d_matrix1, d_matrix2, d_matrixSolucion);
//bring the data back from the device
cudaThreadSynchronize();
for(i = 0; i < N; i++)
{
cudaMemcpy(h_matrixSolucion[i], &d_matrixSolucion[i*N],tamGrid*sizeof(h_matrixSolucion[0]),cudaMemcpyDeviceToHost);
}
printMatrix(h_matrix1);
printMatrix(h_matrix2);
printMatrix(h_matrixSolucion);
}
If I comment out that last line the program doesn't give any error.
My guess is that the problem is that I don't store the information properly in the kernel (this line: d_matrixSolucion[idThread] = d_matrix1[idThread] + d_matrix2[idThread];), but I am pretty new to CUDA and I don't really know how to solve it.
EDIT: Now that I've changed the way I get the information back from the device, this is what it prints:
0 1 2 3
1 2 3 4
2 3 4 5
3 4 5 6
2 3 4 5
3 4 5 6
4 5 6 7
5 6 7 8
2 4 6 8
0 0 0 0
0 0 0 0
0 0 0 0
The first two matrices are the ones with the input data and the last one is the solution, but it only has one line filled.

There are a variety of errors in your code.
There was no definition for fillMatrix (or fillMatrixTo0).
Your underlying host allocations performed with malloc are not guaranteed to be contiguous, so you cannot transfer the data back in a single cudaMemcpy operation; you must use a loop, like the one you used to transfer data to the GPU.
Your host allocations aren't quite right, but they don't present an actual problem. This:
h_matrix1[i] = (int*)malloc(N * sizeof(int*));
should be this:
h_matrix1[i] = (int*)malloc(N * sizeof(int));
and likewise for the other similar instances.
Your grid (total number of threads) sizing is not correct. Your kernel uses one thread to perform one elementwise addition. Therefore, for an NxN matrix, you need NxN threads, not just N as you are creating and testing against.
The following code has these issues fixed and seems to work correctly for me:
$ cat t2.cu
/* Includes, system */
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#define N 15000
/* DEVICE CODE */
__global__ void sumaMatrices(int *d_matrix1, int *d_matrix2, int *d_matrixSolucion){
int idThread = blockIdx.x*blockDim.x + threadIdx.x;
if (idThread < N*N)
{
d_matrixSolucion[idThread] = d_matrix1[idThread] + d_matrix2[idThread];
}
}
__host__ void printMatrix(int **matrix)
{
int i, j;
//only 4 so the file is not too big
for (i = 0; i < 4; i++)
{
for (j = 0; j < 4; j++)
{
printf("%d", matrix[i][j]);
printf(" ");
}
printf("\n");
}
printf("\n");
}
/* HOST CODE*/
int main(int argc, char** argv)
{
int i;
int **h_matrix1;
int **h_matrix2;
int **h_matrixSolucion;
int *d_matrix1;
int *d_matrix2;
int *d_matrixSolucion;
h_matrix1 = (int**)malloc(N * sizeof(int*));
for (i = 0; i < N; i++)
{
h_matrix1[i] = (int*)malloc(N * sizeof(int));
for (int j = 0; j < N; j++) h_matrix1[i][j] = 1;
}
h_matrix2 = (int**)malloc(N * sizeof(int*));
for (i = 0; i < N; i++)
{
h_matrix2[i] = (int*)malloc(N * sizeof(int));
for (int j = 0; j < N; j++) h_matrix2[i][j] = 2;
}
h_matrixSolucion = (int**)malloc(N * sizeof(int*));
for (i = 0; i < N; i++)
{
h_matrixSolucion[i] = (int*)malloc(N * sizeof(int));
for (int j = 0; j < N; j++) h_matrixSolucion[i][j] = 0;
}
cudaMalloc((void**)& d_matrix1,N*N*sizeof(int));
cudaMalloc((void**)& d_matrix2,N*N*sizeof(int));
cudaMalloc((void**)& d_matrixSolucion,N*N*sizeof(int));
for(i = 0; i < N; i++)
{
cudaMemcpy(&d_matrix1[i*N], h_matrix1[i], N*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(&d_matrix2[i*N], h_matrix2[i], N*sizeof(int), cudaMemcpyHostToDevice);
}
int tamBloque = 256;
int tamGrid = (N*N)/tamBloque + 1;
sumaMatrices<<<tamGrid, tamBloque>>>(d_matrix1, d_matrix2, d_matrixSolucion);
cudaThreadSynchronize();
for(i = 0; i < N; i++)
{
cudaMemcpy(h_matrixSolucion[i],&d_matrixSolucion[i*N],N*sizeof(int),cudaMemcpyDeviceToHost);
}
printMatrix(h_matrix1);
printMatrix(h_matrix2);
printMatrix(h_matrixSolucion);
}
$ nvcc -o t2 t2.cu
$ cuda-memcheck ./t2
========= CUDA-MEMCHECK
1 1 1 1
1 1 1 1
1 1 1 1
1 1 1 1
2 2 2 2
2 2 2 2
2 2 2 2
2 2 2 2
3 3 3 3
3 3 3 3
3 3 3 3
3 3 3 3
========= ERROR SUMMARY: 0 errors
$
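A side note on the contiguity point above: if the host matrices are instead allocated as single contiguous blocks of N*N ints (indexed manually as i*N + j), each per-row copy loop collapses into one cudaMemcpy. A minimal sketch of that allocation pattern, not part of the answer above:
/* contiguous host allocation: one cudaMemcpy per matrix suffices */
int *h_matrix1 = (int*)malloc((size_t)N * N * sizeof(int));
int *d_matrix1;
cudaMalloc((void**)&d_matrix1, (size_t)N * N * sizeof(int));
for (int i = 0; i < N; i++)
    for (int j = 0; j < N; j++)
        h_matrix1[i * N + j] = 1;   /* fill on the host, row-major */
/* one transfer instead of a per-row loop */
cudaMemcpy(d_matrix1, h_matrix1, (size_t)N * N * sizeof(int), cudaMemcpyHostToDevice);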

Related

Schedule round robin matches

How can I implement a round-robin schedule for an array of 4 elements [1,2,3,4]? For each element, the algorithm should display the list of players it will face, in chronological order:
(1: 4,2,3)
(2: 3,1,4)
(3: 2,4,1)
(4: 1,3,2)
Line 1: 4,2,3 means that player (1) will face, in order, players (4), (2) and (3).
In the same way, line 2: 3,1,4 indicates that player (2) will face, in order, players (3), (1) and (4).
We have implemented the code below, but we hit a bug as soon as we start filling in the players' names. Do you have any idea about this problem?
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define NAME_MAX_LENGTH 20
#define NUM_MIN_PLAYERS 2
#define NUM_MAX_PLAYERS 20
enum Style
{
STYLE_COMPACT,
STYLE_TABLE
};
enum Format
{
FORMAT_ID,
FORMAT_NAME
};
struct PlayerList
{
unsigned int num_players;
char name[NUM_MAX_PLAYERS][NAME_MAX_LENGTH + 1];
};
struct Grid
{
unsigned int num_players;
unsigned int day[NUM_MAX_PLAYERS]
[NUM_MAX_PLAYERS];
};
void printList(struct PlayerList *list)
{
for (int i = 0; i < list->num_players; i++)
{
printf("%d:%s\n", i + 1, list->name[i]);
}
}
struct Grid calculer_berger(struct PlayerList *list)
{
struct Grid grid;
// algorithm to fill in the grid
grid.num_players = list->num_players;
int i, j;
for (i = 0; i < list->num_players - 1; i++)
{
for (j = 0; j < list->num_players - 1; j++)
{
if (i == j)
{
/* edge cases */
grid.day[i][list->num_players - 1] = ((i + j) + (i + j) / list->num_players) % list->num_players;
grid.day[list->num_players - 1][j] = ((i + j) + (i + j) / list->num_players) % list->num_players;
grid.day[i][j] = 0;
}
else
{
grid.day[i][j] = ((i + j) + (i + j) / list->num_players) % list->num_players;
}
}
}
grid.day[0][list->num_players - 1] = list->num_players - 1;
grid.day[list->num_players - 1][list->num_players - 1] = 0;
grid.day[list->num_players - 1][0] = list->num_players - 1;
return grid;
}
void permuter(struct Grid *grid)
{
int tmp;
for (int i = 0; i < grid->num_players; i++)
{
for (int j = 1; j <= grid->num_players / 2; j++)
{
tmp = grid->day[i][j];
grid->day[i][j] = grid->day[i][grid->num_players - j];
grid->day[i][grid->num_players - j] = tmp;
}
}
}
void print_grid(struct Grid *grid, struct PlayerList *list)
{
for (int i = 0; i < grid->num_players; i++)
{
for (int j = 0; j < grid->num_players; j++)
{
if (j == 0)
{
printf("%d:", grid->day[i][j] + 1);
}
else
{
printf("%d", grid->day[i][j] + 1);
if (j < grid->num_players - 1)
{
printf(",");
}
}
}
printf("\n");
}
}
int main(int argc, char **argv)
{
struct PlayerList playerList;
char nom[NAME_MAX_LENGTH + 1];
int nbCharLu = 0;
while ((nbCharLu = fscanf(stdin, "%s", nom)) != -1)
{
strcpy(playerList.name[playerList.num_players], nom);
playerList.num_players++;
}
struct Grid myGrid = calculer_berger(&playerList);
printList(&playerList);
print_grid(&myGrid, &playerList);
printf("Apres la permut\n");
permuter(&myGrid);
print_grid(&myGrid, &playerList);
return 0;
}
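Regarding the reported bug: playerList.num_players is never initialized before main uses it as an index and increments it, so the first strcpy writes through an indeterminate offset and the read loop has no upper bound. A minimal sketch of a fix for that input loop, assuming the rest of main stays as written:
struct PlayerList playerList = {0};   /* start with num_players == 0 */
char nom[NAME_MAX_LENGTH + 1];
/* the 20 in "%20s" mirrors NAME_MAX_LENGTH to guard the name buffer */
while (playerList.num_players < NUM_MAX_PLAYERS &&
       fscanf(stdin, "%20s", nom) == 1)
{
    strcpy(playerList.name[playerList.num_players], nom);
    playerList.num_players++;
}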
Assuming you are storing the elements in an integer array and just want to display the results, here is one implementation. The code accommodates N values because it uses sizeof; feel free to customize it further.
#include <stdio.h>
int main() {
int i,j;
int array[] = {1,2,3,4};
for(i = 0; i < sizeof(array)/sizeof(int);++i){
printf("(%d :",array[i]);
for(j = 0; j < sizeof(array)/sizeof(int);++j){
if(j == i)
continue;
printf("%d ",array[j]);
}
printf(")\n");
}
}
#include <stdio.h>
void main() {
int mid;
int num;
int j, temp;
int k = 0;
int num1;
int data[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14};
num = sizeof(data)/sizeof(int);
mid = (sizeof(data)/sizeof(int))/2;
while(k < num - 1){
printf("Round %d ( ",k+1);
num1 = num;
for(int i = 0;i < mid;i++,num1--) /*pairing the competitors in each round*/
printf("%d:%d ",data[i],data[num1-1]);
for(int i = 0,j = num-1; i < num -2;i++,j--){ /* fixing the first competitor and rotating the others clockwise*/
temp = data[j];
data[j] = data[j-1];
data[j-1] = temp;
}
printf(")\n");
k++;
}
}

Can separate, forked processes jointly access shared, dynamically allocated pointer memory?

I am trying to parallelize the addition of two simple 4x4 matrices. The child process adds only the odd rows and the parent adds the even ones. However, I can't seem to get the processes to work on shared pointer memory, and the output always comes out in halves, as shown beneath the code:
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
int main() {
int A[4][4] = {{1,2,3,4},{6,7,8,9},{11,12,13,14},{16,17,18,19}};
int B[4][4] = {{1,2,3,4},{6,7,8,9},{11,12,13,14},{16,17,18,19}};
int** C = (int**)malloc(sizeof(int)*4);
for (int z = 0; z<4; z++) {
C[z] = (int*)malloc(sizeof(int)*4);
}
pid_t cp;
printf("Before fork\n");
cp = fork();
if (cp == 0) {
for(int i = 1; i < 4; i+=2) {
for(int j = 0; j < 4; j++) {
printf("child process\n");
printf("We are adding cell %d,%d\n",i,j);
C[i][j] = A[i][j] + B[i][j];
sleep(1);
}
}
} else {
for (int k = 0; k < 4; k+=2) {
for(int l = 0; l < 4; l++) {
printf("parent process\n");
printf("We are adding cell %d,%d\n",k,l);
C[k][l] = A[k][l] + B[k][l];
sleep(1);
}
}
}
sleep(10);
printf("We are printing C here\n");
for (int m = 0; m < 4; m++) {
for(int n = 0; n < 4; n++) {
printf("%d ",C[m][n]);
}
printf("\n");
}
}
This is the output of the final for loop in the above code:
We are printing C here
0 0 0 0
12 14 16 18
0 0 0 0
32 34 36 38
We are printing C here
2 4 6 8
0 0 0 0
22 24 26 28
0 0 0 0
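To answer the title question: memory obtained with malloc is not shared across fork; each process works on its own copy-on-write copy of C, which is why each process prints only the half it computed. One way to get memory that both processes genuinely share is an anonymous shared mapping. A minimal sketch, assuming a POSIX system where MAP_ANONYMOUS is available, and using a flat 4x4 layout with stand-in values instead of the pointer-to-pointer matrices above:
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/wait.h>
int main(void) {
    /* one flat 4x4 block, visible to both parent and child after fork */
    int *C = mmap(NULL, 16 * sizeof(int), PROT_READ | PROT_WRITE,
                  MAP_SHARED | MAP_ANONYMOUS, -1, 0);
    if (C == MAP_FAILED) { perror("mmap"); return 1; }
    pid_t cp = fork();
    if (cp == 0) {                        /* child fills the odd rows */
        for (int i = 1; i < 4; i += 2)
            for (int j = 0; j < 4; j++)
                C[i * 4 + j] = 2 * (10 * i + j);   /* stand-in for A[i][j] + B[i][j] */
        _exit(0);
    }
    for (int k = 0; k < 4; k += 2)        /* parent fills the even rows */
        for (int l = 0; l < 4; l++)
            C[k * 4 + l] = 2 * (10 * k + l);
    wait(NULL);                           /* wait for the child before printing */
    for (int m = 0; m < 4; m++) {
        for (int n = 0; n < 4; n++)
            printf("%d ", C[m * 4 + n]);
        printf("\n");
    }
    munmap(C, 16 * sizeof(int));
    return 0;
}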

MPI_Scatterv submatrix with MPI_Type_struct

I'm currently working on a MPI-program and I'm trying to send blocks of a matrix with scatterv to all processes.
Process description
The matrix is given as an array.
First I produce a datatype with MPI_Type_vector to create the necessary block out of the original array.
Second I create a MPI_Type_struct that should hold rows of blocks.
#include <math.h>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#define n 16
int main(int argc, char *argv[])
{
MPI_Init(&argc, &argv);
MPI_Comm comm = MPI_COMM_WORLD;
int p,r;
MPI_Comm_size(comm, &p);
MPI_Comm_rank(comm, &r);
int *arr;
arr = NULL;
if (r == 0){
arr = (int *) malloc(n * n * sizeof(int));
for (int i = 0; i < n * n; i++) arr[i] = i;
for (int i = 0; i < n; i++){
printf("\n");
for (int j = 0; j < n; j++)
printf("%4d", arr[i * n + j]);
}
}
printf("\n");
int ps = sqrt(p);
int ns = n / ps;
if (r == 0) {
printf("ps: %d ns: %d\n", ps, ns);
}
/* create datatype */
MPI_Datatype block;
MPI_Type_vector(ns, ns, n, MPI_INT, &block);
int blocks[ps];
MPI_Aint displs[ps];
for (int i = 0; i < ps; i++) {
blocks[i] = 1;
displs[i] = i * sizeof(int);
}
MPI_Datatype types[ps];
//for (int i = 0; i < ps - 1; i++) types[i] = block;
//types[ps - 1] = MPI_UB;
types[0] = block;
for (int i = 1; i < ps; i++) types[i] = MPI_UB;
//types[0] = block;
//types[1] = MPI_UB;
if (r == 0) {
printf("displs:\n");
for(int i = 0; i < ps; i++) printf("%3ld", displs[i]);
printf("\n");
}
MPI_Datatype row;
MPI_Type_struct(ps, blocks, displs, types, &row);
MPI_Type_commit(&row);
/* prepare scatter */
int sdispl[p]; int sendcounts[p];
for (int i = 0; i < p; i++) {
sdispl[i] = (i % ps) + (i / ps) * (ns * ps);
sendcounts[i] = 1;
}
if (r == 0) {
printf("sdispl: \n");
for (int i = 0; i < 4; i++) printf("%3d", sdispl[i]);
printf("\n");
}
int rcv[ns * ns];
MPI_Scatterv(arr, sendcounts, sdispl, row, rcv, ns * ns, MPI_INT, 0, comm);
int result = 1;
if (r == result) {
printf("result for %d:\n", result);
for (int i = 0; i < ns * ns; i++) {
printf("%4d", rcv[i]);
if ((i+1) % ns == 0) printf("\n");
}
}
if (arr != NULL) free(arr);
MPI_Finalize();
return 0;
}
So far the structure of the blocks is correct.
The problem
The block that was sent to process r = 1 starts with 3 instead of 4. The block for process r = 2 also starts with 6, and the one for process r = 3 starts with 9.
For r == 4 it jumps to 48.
What it should do
r start
0 0
1 4
2 8
3 12
4 64
5 68
6 ...
15 204
The help I would need
I think I'm making some mistake with displs and sdispl.
Compiling and Running the example
The code is compiled with the following command:
mpicc -o main main.c -lm
I run the code with:
mpirun -np 16 ./main
Thanks for any help in advance!
With the hint from Zulan I was able to solve my problem.
The following code is based on an excellent answer about subarrays.
#include <math.h>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#define n 8
void print_arr(int *arr, int x) {
printf("\n");
for (int i = 0; i < x*x; i++){
if (i % x == 0) printf("\n");
printf("%4d", arr[i]);
}
printf("\n");
}
int main(int argc, char *argv[])
{
MPI_Init(&argc, &argv);
MPI_Comm comm = MPI_COMM_WORLD;
int p, r;
MPI_Comm_size(comm, &p);
MPI_Comm_rank(comm, &r);
/* number of proceses in dim x and dim y */
int ps = sqrt(p);
/* number of elements in dim x and dim y in sarr */
int ns = n/ps;
/* array of data - distributed by process 0 */
int *arr = NULL;
if (r==0) {
arr = (int *) malloc(n * n * sizeof(int));
for (int i = 0; i < n*n; i++) arr[i] = i;
print_arr(arr, n);
}
MPI_Datatype type, resizedtype;
int sizes[2] = {n,n};
int subsizes[2] = {ns,ns};
int starts[2] = {0,0};
MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_INT, &type);
MPI_Type_create_resized(type, 0, ns*sizeof(int), &resizedtype);
MPI_Type_commit(&resizedtype);
int counts[p];
for (int i = 0; i < p; i++) counts[i] = 1;
int displs[p];
for (int i = 0; i < p; i++) displs[i] = i%ps + i/ps * ns * ps;
/* subarray to store distributed data */
int sarr[ns * ns];
/* send submatrices to all processes */
MPI_Scatterv(arr, counts, displs, resizedtype, sarr, ns*ns, MPI_INT, 0, comm);
/* print received data for process pr */
int pr = 3;
if (r == pr)
print_arr(sarr, ns);
/* free arr */
if (arr != NULL) free(arr);
MPI_Finalize();
return 0;
}
You can compile the example with
mpicc -o main main.c
and run it with
mpirun -np 4 ./main
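To make the displacement arithmetic concrete (a worked example based on the code above): with n = 8 and 4 processes, ps = 2 and ns = 4, so displs comes out as {0, 1, 8, 9}. Because resizedtype has been resized to an extent of ns*sizeof(int) = 16 bytes, a displacement of 8 means 8 extents = 32 ints into arr, which is exactly element 32 of the 8x8 matrix: the top-left corner of the lower-left 4x4 block. The resize is what lets consecutive ranks start one block-column apart instead of one full block apart.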

How Should I Implement Cilk Parallelism with a Recursive Scan Algorithm?

I implemented a recursive scan (prefix sum) algorithm, which I've included below. Here, I simply generate random lists whose sizes are powers of two, up to the twenty-seventh power, checking against a simple sequential scan for accuracy. It works.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <mkl.h>
int *pscan(int *x, int n, int z, int chunk_size);
int reduce(int *x, int n);
int main(int argc, char **argv)
{
int n;
int i, j, k;
int *x, *seq, *r;
double begin, end;
srand48(time(0));
/* Randomly generate array of size n. */
for (k = 2; k < 28; k++) {
n = (int) pow(2, k);
seq = (int *) malloc(sizeof(int) * n);
x = (int *) malloc(sizeof(int) * n);
for (i = 0; i < n; i++) {
x[i] = lrand48() % 100 - 50;
seq[i] = x[i];
}
/* Parallel scan. */
begin = dsecnd();
r = pscan(x, n, 0, 2);
end = dsecnd();
printf("%d %lf\n", n, end - begin);
/* Sequential check. */
for (i = 1; i < n; i++) {
seq[i] = seq[i - 1] + seq[i];
}
for (i = 0; i < n; i++) {
if (r[i] != seq[i]) {
fprintf(stderr, "AGGGHHH!!! ERROR. Found with vector: \n");
for (j = 0; j < n; j++) {
printf("%d ", x[i]);
}
printf("\n");
exit(1);
}
}
free(r);
free(x);
free(seq);
}
return 0;
}
/* Perform parallel scan. */
int *pscan(int *x, int n, int z, int chunk_size)
{
int i, j;
int *sums, *sumscan, *scan, **fsum, *rv;
/* Base case, serially scan a chunk. */
if (n <= chunk_size) {
scan = (int *) malloc(sizeof(int) * n);
scan[0] = x[0] + z;
for (i = 1; i < n; i++) {
scan[i] = x[i] + scan[i - 1];
}
return scan;
}
sums = (int *) malloc(sizeof(int) * (n / chunk_size));
/* Reduce each chunk of the array. */
for (i = 0; i < n / chunk_size; i++) {
sums[i] = reduce(&x[i * chunk_size], chunk_size);
}
/* Perform a scan on the sums. */
sumscan = pscan(sums, n / chunk_size, 0, chunk_size);
free(sums);
fsum = (int **) malloc(sizeof(int *) * (n / chunk_size));
/* Perform a recursive scan on each chunk, using
the appropriate offset from the sums scan. */
for (i = 0; i < n / chunk_size; i++) {
if (i > 0) {
fsum[i] = pscan(&x[i * chunk_size], chunk_size, sumscan[i - 1], chunk_size);
} else {
fsum[i] = pscan(&x[i * chunk_size], chunk_size, 0, chunk_size);
}
}
free(sumscan);
rv = (int *) malloc(sizeof(int) * n);
/* Join the arrays. */
for (i = 0; i < n / chunk_size; i++) {
for (j = 0; j < chunk_size; j++) {
rv[i * chunk_size + j] = fsum[i][j];
}
}
for (i = 0; i < n / chunk_size; i++) {
free(fsum[i]);
}
free(fsum);
return rv;
}
/* Serial reduction. */
int reduce(int *x, int n)
{
int i;
int sum;
sum = 0;
for (i = 0; i < n; i++) {
sum += x[i];
}
return sum;
}
Now, I'd like to parallelize it. Because I'm feeling a little hipster-ish, I've hacked up a Cilk implementation. I just replace the two main for loops to parallelize 1) the reduction and 2) the recursive scan of each chunk, using the appropriate scan of the chunk reductions as an offset. It looks like so.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <cilk/cilk.h>
#include <mkl.h>
int *pscan(int *x, int n, int z, int chunk_size);
int reduce(int *x, int n);
int main(int argc, char **argv)
{
int n;
int i, j, k;
int *x, *seq, *r;
double begin, end;
srand48(time(0));
/* Randomly generate array of size n. */
for (k = 2; k < 28; k++) {
n = (int) pow(2, k);
seq = (int *) malloc(sizeof(int) * n);
x = (int *) malloc(sizeof(int) * n);
for (i = 0; i < n; i++) {
x[i] = lrand48() % 100 - 50;
seq[i] = x[i];
}
/* Parallel scan. */
begin = dsecnd();
r = pscan(x, n, 0, 2);
end = dsecnd();
printf("%d %lf\n", n, end - begin);
/* Sequential check. */
for (i = 1; i < n; i++) {
seq[i] = seq[i - 1] + seq[i];
}
for (i = 0; i < n; i++) {
if (r[i] != seq[i]) {
fprintf(stderr, "AGGGHHH!!! ERROR. Found with vector: \n");
for (j = 0; j < n; j++) {
printf("%d ", x[i]);
}
printf("\n");
exit(1);
}
}
free(r);
free(x);
free(seq);
}
return 0;
}
/* Perform parallel scan. */
int *pscan(int *x, int n, int z, int chunk_size)
{
int i, j;
int *sums, *sumscan, *scan, **fsum, *rv;
/* Base case, serially scan a chunk. */
if (n <= chunk_size) {
scan = (int *) malloc(sizeof(int) * n);
scan[0] = x[0] + z;
for (i = 1; i < n; i++) {
scan[i] = x[i] + scan[i - 1];
}
return scan;
}
sums = (int *) malloc(sizeof(int) * (n / chunk_size));
/* Reduce each chunk of the array. */
cilk_for (i = 0; i < n / chunk_size; i++) {
sums[i] = reduce(&x[i * chunk_size], chunk_size);
}
/* Perform a scan on the sums. */
sumscan = pscan(sums, n / chunk_size, 0, chunk_size);
free(sums);
fsum = (int **) malloc(sizeof(int *) * (n / chunk_size));
/* Perform a recursive scan on each chunk, using
the appropriate offset from the sums scan. */
cilk_for (i = 0; i < n / chunk_size; i++) {
if (i > 0) {
fsum[i] = pscan(&x[i * chunk_size], chunk_size, sumscan[i - 1], chunk_size);
} else {
fsum[i] = pscan(&x[i * chunk_size], chunk_size, 0, chunk_size);
}
}
free(sumscan);
rv = (int *) malloc(sizeof(int) * n);
/* Join the arrays. */
for (i = 0; i < n / chunk_size; i++) {
for (j = 0; j < chunk_size; j++) {
rv[i * chunk_size + j] = fsum[i][j];
}
}
for (i = 0; i < n / chunk_size; i++) {
free(fsum[i]);
}
free(fsum);
return rv;
}
/* Serial reduction. */
int reduce(int *x, int n)
{
int i;
int sum;
sum = 0;
for (i = 0; i < n; i++) {
sum += x[i];
}
return sum;
}
And it works! Well, it returns correct results. It doesn't achieve the performance I had hoped for, though. The original performance was
4 0.000004
8 0.000001
16 0.000002
32 0.000003
64 0.000005
128 0.000011
256 0.000019
512 0.000035
1024 0.000068
2048 0.000130
4096 0.000257
8192 0.000512
16384 0.001129
32768 0.002262
65536 0.004519
131072 0.009065
262144 0.018297
524288 0.037416
1048576 0.078307
2097152 0.157448
4194304 0.313855
8388608 0.625689
16777216 1.251949
33554432 2.589439
67108864 5.084731
134217728 10.402186
for the single-threaded application, but the Cilk version performed worse, with the following runtimes:
4 0.005383
8 0.000011
16 0.000009
32 0.000111
64 0.000055
128 0.000579
256 0.000339
512 0.000544
1024 0.000701
2048 0.001086
4096 0.001265
8192 0.001742
16384 0.002283
32768 0.003891
65536 0.005398
131072 0.009255
262144 0.020736
524288 0.058156
1048576 0.103893
2097152 0.215460
4194304 0.419988
8388608 0.749368
16777216 1.650938
33554432 2.960451
67108864 5.799836
134217728 11.294398
I have a 24-core machine, so we're obviously not seeing the speed-up we would hope for here. My first thought was that Cilk is mishandling the recursion, causing oversubscription, but Cilk is specifically supposed to handle recursion well. Any tips on how to implement this properly? I tried adding cilk_for to the bottom for loop (freeing everything) and the inner for-loop of the penultimate set of loops (joining the array), but that slowed performance down even more.
Any advice is well-appreciated.
However, please don't tell me to switch to Blelloch's parallel scan algorithm discussed here. I already implemented that in Cilk, and it worked quite well. I'd like to see if I can match its performance with this recursive solution.
I fixed my performance problems by finding the optimal chunk size for each problem. At that chunk size, the (same) parallel version performs better than the sequential version.
In summary, there were a few things wrong with both my general approach and, in particular, my chunk size of two:
My benchmarking approach. In a code with a tuning parameter, it doesn't make much sense to plot runtime vs. problem size using the same value for the tuning parameter because the optimal value is dependent on problem size.
A chunk size of two was likely problematic because, while it maximizes parallelism, it also maximizes the number of levels of recursion and, likewise, the overhead that comes along with it.
A chunk size of two prevents vectorization.
As Leeor suggested, a chunk size of two probably also leads to false sharing in the cache.
Props to Leeor for leading me in the right direction.
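A minimal sketch of how one might derive the chunk size from the problem size and worker count at the top level instead of hard-coding 2. This assumes the Intel Cilk Plus runtime API in cilk/cilk_api.h, and the constants are illustrative placeholders rather than tuned values:
#include <cilk/cilk_api.h>
/* Aim for roughly 8 chunks per worker at the top level, but never drop below
   a serial threshold that keeps recursion overhead and false sharing in check. */
static int choose_chunk_size(int n)
{
    int workers = __cilkrts_get_nworkers();
    int chunk = n / (8 * workers);
    if (chunk < 1024)
        chunk = 1024;
    return chunk;
}
/* usage: r = pscan(x, n, 0, choose_chunk_size(n)); */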

CUDA kernel gives different result even though input is the same

I want to write a CUDA kernel that will multiply two matrices of size NxN. I did manage to do it, but without thread cooperation... Now I want to do it with thread cooperation, and I followed the code provided in the SDK. But for some reason the kernel returns a different result. So here is the .cu file:
#include<stdio.h>
#include<cuda.h>
#include<cuda_runtime.h>
#include<cuda_runtime_api.h>
#include<device_functions.h>
static void HandleError(cudaError_t err, const char *file, int line)
{
if(err!=cudaSuccess){
printf("%s in %s file at line %s\n", cudaGetErrorString(err), file, line);
exit(EXIT_FAILURE);
}
}
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
#ifndef _MATRIXMUL_KERNEL_H_
#define _MATRIXMUL_KERNEL_H_
#define ORDER 4
__global__ void matrixMul( int* A, int* B, int* C, int wA, int wB)
{
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
int aBegin = wA * ORDER * by;
int aEnd = aBegin + wA - 1;
int aStep = ORDER;
int bBegin = ORDER * bx;
int bStep = ORDER * wB;
int Csub=0;
for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
{
__shared__ int As[ORDER][ORDER];
__shared__ int Bs[ORDER][ORDER];
As[ty][tx] = A[a + wA * ty + tx];
Bs[ty][tx] = B[b + wB * ty + tx];
__syncthreads();
#pragma unroll
for (int k = 0; k < ORDER; ++k)
Csub += As[ty][k] * Bs[k][tx];
__syncthreads();
}
int c = wB * ORDER * by + ORDER * bx;
C[c + wB * ty + tx] = Csub;
}
#endif
int main()
{
int *a=(int*)malloc(ORDER*ORDER*sizeof(int));
int *b=(int*)malloc(ORDER*ORDER*sizeof(int));
int *c=(int*)malloc(ORDER*ORDER*sizeof(int));
int *dev_a, *dev_b, *dev_c;
HANDLE_ERROR(cudaMalloc((void**)&dev_a, ORDER*ORDER*sizeof(int*)));
HANDLE_ERROR(cudaMalloc((void**)&dev_b, ORDER*ORDER*sizeof(int*)));
HANDLE_ERROR(cudaMalloc((void**)&dev_c, ORDER*ORDER*sizeof(int*)));
for(int i=0; i<ORDER*ORDER; i++)
{
a[i]=1;
b[i]=2;
}
HANDLE_ERROR(cudaMemcpy(dev_a, a, ORDER*ORDER*sizeof(int), cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(dev_b, b, ORDER*ORDER*sizeof(int), cudaMemcpyHostToDevice));
matrixMul<<<ORDER, ORDER>>>(dev_a, dev_b, dev_c, ORDER, ORDER);
HANDLE_ERROR(cudaMemcpy(c, dev_c, ORDER*ORDER*sizeof(int), cudaMemcpyDeviceToHost));
for(int i=0; i<ORDER*ORDER; i++)
{
if((i%ORDER)==0)
printf("\n\n");
printf("%d\t", a[i]);
}
for(int i=0; i<ORDER*ORDER; i++)
{
if((i%ORDER)==0)
printf("\n\n");
printf("%d\t", b[i]);
}
for(int i=0; i<ORDER*ORDER; i++)
{
if((i%ORDER)==0)
printf("\n\n");
printf("%d\t", c[i]);
}
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return 0;
}
Yes, I know that there is no "real" question... But if anyone could point me in the right direction I would be grateful. Thank you!
If you need more code example, let me know and I'll edit the question.
EDIT #1: I forgot to mention... I haven't been able to integrate nvcc with Visual Studio 2010, so I'm unable to use the debugger. Any suggestions about that?
EDIT #2: Updated question so it shows both CUDA kernel and main.
Your kernel seems right, if your thread-geometry is BLOCKSIZE x BLOCKSIZE. Is that the case?
If that isn't your problem:
Since you said you got it working without thread synchronization, you probably got the memory allocation correct.
Try testing with a thread-geometry of 4x4 and the following two matrices:
1 1 1 1 1 0 0 0
2 2 2 2 0 1 0 0
3 3 3 3 0 0 1 0
5 5 5 5 0 0 0 1
The output should give you a hint as to what might be going wrong.
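For reference, the kernel above indexes both threadIdx.x and threadIdx.y, so it expects a 2D, ORDER x ORDER thread block; the launch in the question, matrixMul<<<ORDER, ORDER>>>, creates a 1D geometry instead. A hedged sketch of the 2D launch it seems to expect, where wA and wB stand for the widths passed to the kernel (both ORDER in the question's main):
dim3 threads(ORDER, ORDER);          /* one thread per element of an ORDERxORDER tile */
dim3 grid(wB / ORDER, wA / ORDER);   /* 1x1 here, since wA == wB == ORDER */
matrixMul<<<grid, threads>>>(dev_a, dev_b, dev_c, wA, wB);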
