CUDA kernel gives different result even though input is the same - matrix

I want to write a CUDA kernel that multiplies two NxN matrices. I managed to do it, but without thread cooperation... Now I want to do it with thread cooperation, and I followed the code provided in the SDK. But for some reason the kernel returns a different result. Here is the .cu file:
#include<stdio.h>
#include<cuda.h>
#include<cuda_runtime.h>
#include<cuda_runtime_api.h>
#include<device_functions.h>
static void HandleError(cudaError_t err, const char *file, int line)
{
if(err!=cudaSuccess){
printf("%s in %s file at line %s\n", cudaGetErrorString(err), file, line);
exit(EXIT_FAILURE);
}
}
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
#ifndef _MATRIXMUL_KERNEL_H_
#define _MATRIXMUL_KERNEL_H_
#define ORDER 4
__global__ void matrixMul( int* A, int* B, int* C, int wA, int wB)
{
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
int aBegin = wA * ORDER * by;
int aEnd = aBegin + wA - 1;
int aStep = ORDER;
int bBegin = ORDER * bx;
int bStep = ORDER * wB;
int Csub=0;
for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
{
__shared__ int As[ORDER][ORDER];
__shared__ int Bs[ORDER][ORDER];
As[ty][tx] = A[a + wA * ty + tx];
Bs[ty][tx] = B[b + wB * ty + tx];
__syncthreads();
#pragma unroll
for (int k = 0; k < ORDER; ++k)
Csub += As[ty][k] * Bs[k][tx];
__syncthreads();
}
int c = wB * ORDER * by + ORDER * bx;
C[c + wB * ty + tx] = Csub;
}
#endif
int main()
{
int *a=(int*)malloc(ORDER*ORDER*sizeof(int));
int *b=(int*)malloc(ORDER*ORDER*sizeof(int));
int *c=(int*)malloc(ORDER*ORDER*sizeof(int));
int *dev_a, *dev_b, *dev_c;
HANDLE_ERROR(cudaMalloc((void**)&dev_a, ORDER*ORDER*sizeof(int*)));
HANDLE_ERROR(cudaMalloc((void**)&dev_b, ORDER*ORDER*sizeof(int*)));
HANDLE_ERROR(cudaMalloc((void**)&dev_c, ORDER*ORDER*sizeof(int*)));
for(int i=0; i<ORDER*ORDER; i++)
{
a[i]=1;
b[i]=2;
}
HANDLE_ERROR(cudaMemcpy(dev_a, a, ORDER*ORDER*sizeof(int), cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(dev_b, b, ORDER*ORDER*sizeof(int), cudaMemcpyHostToDevice));
matrixMul<<<ORDER, ORDER>>>(dev_a, dev_b, dev_c, ORDER, ORDER);
HANDLE_ERROR(cudaMemcpy(c, dev_c, ORDER*ORDER*sizeof(int), cudaMemcpyDeviceToHost));
for(int i=0; i<ORDER*ORDER; i++)
{
if((i%ORDER)==0)
printf("\n\n");
printf("%d\t", a[i]);
}
for(int i=0; i<ORDER*ORDER; i++)
{
if((i%ORDER)==0)
printf("\n\n");
printf("%d\t", b[i]);
}
for(int i=0; i<ORDER*ORDER; i++)
{
if((i%ORDER)==0)
printf("\n\n");
printf("%d\t", c[i]);
}
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return 0;
}
Yes, I know that there is no "real" question... But if anyone could point me in the right direction I would be grateful. Thank you!
If you need more code example, let me know and I'll edit the question.
EDIT #1: I forgot to mention... I haven't been able to get nvcc working in Visual Studio 2010, so I'm unable to use the debugger. Any suggestions about that?
EDIT #2: Updated question so it shows both CUDA kernel and main.

Your kernel seems right if your thread geometry is BLOCKSIZE x BLOCKSIZE. Is that the case?
If that isn't your problem:
Since you said you got it working without thread synchronization, you probably got the memory allocation correct.
Try testing with a thread-geometry of 4x4 and the following two matrices:
1 1 1 1 1 0 0 0
2 2 2 2 0 1 0 0
3 3 3 3 0 0 1 0
5 5 5 5 0 0 0 1
The output should give you a hint as to what might be going wrong.
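For reference, here is a minimal sketch of the launch configuration such a thread geometry implies; this is an assumption about the intended fix, not code from the question (the <<<ORDER, ORDER>>> launch above creates ORDER one-dimensional blocks of ORDER threads each, so threadIdx.y is always 0 inside the kernel):
dim3 threads(ORDER, ORDER);   // one ORDER x ORDER block covers the whole 4x4 product
dim3 grid(1, 1);              // a larger square matrix would need grid(wB/ORDER, wA/ORDER)
matrixMul<<<grid, threads>>>(dev_a, dev_b, dev_c, ORDER, ORDER);
HANDLE_ERROR(cudaGetLastError());        // catch launch-configuration errors
HANDLE_ERROR(cudaDeviceSynchronize());   // catch execution errors before copying back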

Related

CUDA - Parallel Reduction Sum of Even and Odd Number Separately

I am trying to implement a parallel reduction sum of even and odd numbers separately in CUDA.
I'm new to CUDA programming and I'm trying hard, but I can't find a solution.
For example, I have the array [5, 8, 0, -6, 2], and the result needs to be [4, 5] (even: 8+0-6+2=4, odd: 5=5).
But the result of my following code is [8, 5].
I think my problem is with the notion of "shared" memory, but I do not understand why.
__global__ void sumEvenOdd(int *a, int *b, int N){
int column = blockIdx.x * blockIdx.x + threadIdx.x;
__shared__ int s_data[2];
if (column < N){
if (a[column] % 2 == 0){
s_data[0] += a[column];
}
else{
s_data[1] += a[column];
}
__syncthreads();
b[0] = s_data[0];
b[1] = s_data[1];
}
}
void initArray(int *a, int N){
for (unsigned int i = 0; i < N; i++){
a[i] = rand() % 100;
}
}
void verify_result(int *a, int *b, int N){
int *verify_b;
verify_b = (int*)malloc(2 * sizeof(int));
verify_b[0] = 0;
verify_b[1] = 0;
for (unsigned int i = 0; i < N; i++){
if (a[i] % 2 == 0){
verify_b[0] += a[i];
}
else{
verify_b[1] += a[i];
}
}
for (unsigned int i = 0; i < 2; i++){
assert(verify_b[i] == b[i]);
}
}
void printResult(int *a, int *b, int N){
printf("\n");
for (unsigned int i = 0; i < N; i++){
printf("%d, ", a[i]);
}
printf("\n");
for (unsigned int i = 0; i < 2; i++){
printf("%d, ", b[i]);
}
}
int main(){
//Array sizes;
int N = 5;
//Size (in bytes) of matrix
size_t bytes = N * sizeof(int);
//Host pointers
int *a, *b;
// Allocate host memory
a = (int*)malloc(bytes);
b = (int*)malloc(2 * sizeof(int));
// Initialize array
initArray(a, N);
// Device pointers
int *d_a, *d_b;
// Allocated device memory
cudaMalloc(&d_a, bytes);
cudaMalloc(&d_b, 2 * sizeof(int));
// Copy data to the device
cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice);
//Number of threads
int THREADS = 128;
//Number of blocks
int BLOCKS = (N + THREADS - 1) / THREADS;
// Launch kernel
sumEvenOdd<<<BLOCKS, THREADS>>>(d_a, d_b, N);
cudaDeviceSynchronize();
// Copy back to the host
cudaMemcpy(b, d_b, 2 * sizeof(int), cudaMemcpyDeviceToHost);
// Check result
verify_result(a, b, N);
printResult(a, b, N);
return 0;
}
You cannot just use
s_data[1] += a[column];
Remember that all threads are going to execute this line at the same time and store to the same position, so they all write to s_data concurrently.
Instead you should use an atomic add:
atomicAdd(&s_data[1], a[column]);
You should also initialize s_data to zeros.
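Putting those fixes together, a minimal sketch of the kernel could look like the following. This is a sketch, not the poster's code: the index is written as blockIdx.x * blockDim.x + threadIdx.x (presumably what blockIdx.x * blockIdx.x was meant to be), and it assumes d_b is zeroed before the launch, e.g. with cudaMemset(d_b, 0, 2 * sizeof(int)):
__global__ void sumEvenOdd(int *a, int *b, int N){
    int column = blockIdx.x * blockDim.x + threadIdx.x;
    __shared__ int s_data[2];
    if (threadIdx.x == 0){        // zero the shared accumulators once per block
        s_data[0] = 0;
        s_data[1] = 0;
    }
    __syncthreads();
    if (column < N){
        if (a[column] % 2 == 0)
            atomicAdd(&s_data[0], a[column]);   // serialize the concurrent updates
        else
            atomicAdd(&s_data[1], a[column]);
    }
    __syncthreads();
    if (threadIdx.x == 0){        // one thread per block adds the block totals to the result
        atomicAdd(&b[0], s_data[0]);
        atomicAdd(&b[1], s_data[1]);
    }
}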

CUDA: Matrix + Matrix, segmentation fault when printing solution matrix in host

I'm trying to do a simple operation of adding one matrix to another in CUDA, but I get a segmentation fault when I try to check the result. Here's the code:
/* Includes, system */
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#define N 15000
/* DEVICE CODE */
__global__ void sumaMatrices(int *d_matrix1, int *d_matrix2, int *d_matrixSolucion){
int idThread = blockIdx.x*blockDim.x + threadIdx.x;
if (idThread < N)
{
d_matrixSolucion[idThread] = d_matrix1[idThread] + d_matrix2[idThread];
}
}
__host__ void printMatrix(int **matrix)
{
int i, j;
//only 4 so the file is not too big
for (i = 0; i < 4; i++)
{
for (j = 0; j < 4; j++)
{
printf("%d", matrix[i][j]);
printf(" ");
}
printf("\n");
}
printf("\n");
}
/* HOST CODE*/
int main(int argc, char** argv)
{
int i;
int **h_matrix1;
int **h_matrix2;
int **h_matrixSolucion;
int *d_matrix1;
int *d_matrix2;
int *d_matrixSolucion;
h_matrix1 = (int**)malloc(N * sizeof(int*));
for (i = 0; i < N; i++)
{
h_matrix1[i] = (int*)malloc(N * sizeof(int*));
}
h_matrix2 = (int**)malloc(N * sizeof(int*));
for (i = 0; i < N; i++)
{
h_matrix2[i] = (int*)malloc(N * sizeof(int*));
}
h_matrixSolucion = (int**)malloc(N * sizeof(int*));
for (i = 0; i < N; i++)
{
h_matrixSolucion[i] = (int*)malloc(N * sizeof(int*));
}
cudaMalloc((void**)& d_matrix1,N*N*sizeof(int));
cudaMalloc((void**)& d_matrix2,N*N*sizeof(int));
cudaMalloc((void**)& d_matrixSolucion,N*N*sizeof(int));
fillMatrix(h_matrix1);
fillMatrix(h_matrix2);
fillMatrixTo0(h_matrixSolucion);
for(i = 0; i < N; i++)
{
cudaMemcpy(&d_matrix1[i*N], h_matrix1[i], N*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(&d_matrix2[i*N], h_matrix2[i], N*sizeof(int), cudaMemcpyHostToDevice);
}
int tamBloque = 256;
int tamGrid = N/tamBloque + 1;
sumaMatrices<<<tamGrid, tamBloque>>>(d_matrix1, d_matrix2, d_matrixSolucion);
//nos traemos la informaciĆ³n del device
cudaThreadSynchronize();
for(i = 0; i < N; i++)
{
cudaMemcpy(h_matrixSolucion[i], &d_matrixSolucion[i*N],tamGrid*sizeof(h_matrixSolucion[0]),cudaMemcpyDeviceToHost);
}
printMatrix(h_matrix1);
printMatrix(h_matrix2);
printMatrix(h_matrixSolucion);
}
If I comment out that last line, the program doesn't give any error.
I guess the problem is that I don't store the information properly in the kernel (this line: d_matrixSolucion[idThread] = d_matrix1[idThread] + d_matrix2[idThread];), but I am pretty new to CUDA and I don't really know how to solve it.
EDIT: Now that I've changed the way I get the information back from the device, this is what it prints:
0 1 2 3
1 2 3 4
2 3 4 5
3 4 5 6
2 3 4 5
3 4 5 6
4 5 6 7
5 6 7 8
2 4 6 8
0 0 0 0
0 0 0 0
0 0 0 0
The first two matrices are the ones with the data and the last one is the solution, but only its first row is filled.
There are a variety of errors in your code.
There was no definition for fillMatrix.
Your underlying host allocations performed with malloc are not guaranteed to be contiguous, so you cannot transfer the data back in a single cudaMemcpy operation; you must use a loop, like the one you used to transfer the data to the GPU.
Your host allocations aren't quite right, although they don't present an actual problem. This:
h_matrix1[i] = (int*)malloc(N * sizeof(int*));
should be this:
h_matrix1[i] = (int*)malloc(N * sizeof(int));
and likewise for the other similar instances.
Your grid (total number of threads) sizing is not correct. Your kernel uses one thread to perform one elementwise addition. Therefore, for an NxN matrix, you need NxN threads, not just N as you are creating and testing against.
The following code has these issues fixed and seems to work correctly for me:
$ cat t2.cu
/* Includes, system */
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#define N 15000
/* DEVICE CODE */
__global__ void sumaMatrices(int *d_matrix1, int *d_matrix2, int *d_matrixSolucion){
int idThread = blockIdx.x*blockDim.x + threadIdx.x;
if (idThread < N*N)
{
d_matrixSolucion[idThread] = d_matrix1[idThread] + d_matrix2[idThread];
}
}
__host__ void printMatrix(int **matrix)
{
int i, j;
//only 4 so the file is not too big
for (i = 0; i < 4; i++)
{
for (j = 0; j < 4; j++)
{
printf("%d", matrix[i][j]);
printf(" ");
}
printf("\n");
}
printf("\n");
}
/* HOST CODE*/
int main(int argc, char** argv)
{
int i;
int **h_matrix1;
int **h_matrix2;
int **h_matrixSolucion;
int *d_matrix1;
int *d_matrix2;
int *d_matrixSolucion;
h_matrix1 = (int**)malloc(N * sizeof(int*));
for (i = 0; i < N; i++)
{
h_matrix1[i] = (int*)malloc(N * sizeof(int));
for (int j = 0; j < N; j++) h_matrix1[i][j] = 1;
}
h_matrix2 = (int**)malloc(N * sizeof(int*));
for (i = 0; i < N; i++)
{
h_matrix2[i] = (int*)malloc(N * sizeof(int));
for (int j = 0; j < N; j++) h_matrix2[i][j] = 2;
}
h_matrixSolucion = (int**)malloc(N * sizeof(int*));
for (i = 0; i < N; i++)
{
h_matrixSolucion[i] = (int*)malloc(N * sizeof(int));
for (int j = 0; j < N; j++) h_matrixSolucion[i][j] = 0;
}
cudaMalloc((void**)& d_matrix1,N*N*sizeof(int));
cudaMalloc((void**)& d_matrix2,N*N*sizeof(int));
cudaMalloc((void**)& d_matrixSolucion,N*N*sizeof(int));
for(i = 0; i < N; i++)
{
cudaMemcpy(&d_matrix1[i*N], h_matrix1[i], N*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(&d_matrix2[i*N], h_matrix2[i], N*sizeof(int), cudaMemcpyHostToDevice);
}
int tamBloque = 256;
int tamGrid = (N*N)/tamBloque + 1;
sumaMatrices<<<tamGrid, tamBloque>>>(d_matrix1, d_matrix2, d_matrixSolucion);
cudaThreadSynchronize();
for(i = 0; i < N; i++)
{
cudaMemcpy(h_matrixSolucion[i],&d_matrixSolucion[i*N],N*sizeof(int),cudaMemcpyDeviceToHost);
}
printMatrix(h_matrix1);
printMatrix(h_matrix2);
printMatrix(h_matrixSolucion);
}
$ nvcc -o t2 t2.cu
$ cuda-memcheck ./t2
========= CUDA-MEMCHECK
1 1 1 1
1 1 1 1
1 1 1 1
1 1 1 1
2 2 2 2
2 2 2 2
2 2 2 2
2 2 2 2
3 3 3 3
3 3 3 3
3 3 3 3
3 3 3 3
========= ERROR SUMMARY: 0 errors
$

MPI_Scatterv submatrix with MPI_Type_struct

I'm currently working on an MPI program and I'm trying to send blocks of a matrix with MPI_Scatterv to all processes.
Process description
The matrix is given as an array.
First, I create a datatype with MPI_Type_vector to carve the necessary block out of the original array.
Second, I create an MPI_Type_struct that should hold rows of blocks.
#include <math.h>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#define n 16
int main(int argc, char *argv[])
{
MPI_Init(&argc, &argv);
MPI_Comm comm = MPI_COMM_WORLD;
int p,r;
MPI_Comm_size(comm, &p);
MPI_Comm_rank(comm, &r);
int *arr;
arr = NULL;
if (r == 0){
arr = (int *) malloc(n * n * sizeof(int));
for (int i = 0; i < n * n; i++) arr[i] = i;
for (int i = 0; i < n; i++){
printf("\n");
for (int j = 0; j < n; j++)
printf("%4d", arr[i * n + j]);
}
}
printf("\n");
int ps = sqrt(p);
int ns = n / ps;
if (r == 0) {
printf("ps: %d ns: %d\n", ps, ns);
}
/* create datatype */
MPI_Datatype block;
MPI_Type_vector(ns, ns, n, MPI_INT, &block);
int blocks[ps];
MPI_Aint displs[ps];
for (int i = 0; i < ps; i++) {
blocks[i] = 1;
displs[i] = i * sizeof(int);
}
MPI_Datatype types[ps];
//for (int i = 0; i < ps - 1; i++) types[i] = block;
//types[ps - 1] = MPI_UB;
types[0] = block;
for (int i = 1; i < ps; i++) types[i] = MPI_UB;
//types[0] = block;
//types[1] = MPI_UB;
if (r == 0) {
printf("displs:\n");
for(int i = 0; i < ps; i++) printf("%3ld", displs[i]);
printf("\n");
}
MPI_Datatype row;
MPI_Type_struct(ps, blocks, displs, types, &row);
MPI_Type_commit(&row);
/* prepare scatter */
int sdispl[p]; int sendcounts[p];
for (int i = 0; i < p; i++) {
sdispl[i] = (i % ps) + (i / ps) * (ns * ps);
sendcounts[i] = 1;
}
if (r == 0) {
printf("sdispl: \n");
for (int i = 0; i < 4; i++) printf("%3d", sdispl[i]);
printf("\n");
}
int rcv[ns * ns];
MPI_Scatterv(arr, sendcounts, sdispl, row, rcv, ns * ns, MPI_INT, 0, comm);
int result = 1;
if (r == result) {
printf("result for %d:\n", result);
for (int i = 0; i < ns * ns; i++) {
printf("%4d", rcv[i]);
if ((i+1) % ns == 0) printf("\n");
}
}
if (arr != NULL) free(arr);
MPI_Finalize();
return 0;
}
So far the structure of the blocks is correct.
The problem
The block that was sent to process r = 1 starts with 3 instead of 4. The block for process r = 2 starts with 6 and the one for process r = 3 starts with 9.
For r == 4 it jumps to 48.
What it should do
r start
0 0
1 4
2 8
3 12
4 64
5 68
6 ...
15 204
The help I would need
I think I'm making some mistake with displs and sdispl.
Compiling and Running the example
The code is compiled with the following command:
mpicc -o main main.c -lm
I run the code with:
mpirun -np 16 ./main
Thanks for any help in advance!
With the hint from Zulan I was able to solve my problem.
The following code is based on the excellent answer about subarrays.
#include <math.h>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#define n 8
void print_arr(int *arr, int x) {
printf("\n");
for (int i = 0; i < x*x; i++){
if (i % x == 0) printf("\n");
printf("%4d", arr[i]);
}
printf("\n");
}
int main(int argc, char *argv[])
{
MPI_Init(&argc, &argv);
MPI_Comm comm = MPI_COMM_WORLD;
int p, r;
MPI_Comm_size(comm, &p);
MPI_Comm_rank(comm, &r);
/* number of proceses in dim x and dim y */
int ps = sqrt(p);
/* number of elements in dim x and dim y in sarr */
int ns = n/ps;
/* array of data - distributed by process 0 */
int *arr = NULL;
if (r==0) {
arr = (int *) malloc(n * n * sizeof(int));
for (int i = 0; i < n*n; i++) arr[i] = i;
print_arr(arr, n);
}
MPI_Datatype type, resizedtype;
int sizes[2] = {n,n};
int subsizes[2] = {ns,ns};
int starts[2] = {0,0};
MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_INT, &type);
MPI_Type_create_resized(type, 0, ns*sizeof(int), &resizedtype);
MPI_Type_commit(&resizedtype);
int counts[p];
for (int i = 0; i < p; i++) counts[i] = 1;
int displs[p];
for (int i = 0; i < p; i++) displs[i] = i%ps + i/ps * ns * ps;
/* subarray to store distributed data */
int sarr[ns * ns];
/* send submatrices to all processes */
MPI_Scatterv(arr, counts, displs, resizedtype, sarr, ns*ns, MPI_INT, 0, comm);
/* print received data for process pr */
int pr = 3;
if (r == pr)
print_arr(sarr, ns);
/* free arr */
if (arr != NULL) free(arr);
MPI_Finalize();
return 0;
}
You can compile the example with
mpicc -o main main.c
and run it with
mpirun -np 4 ./main
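As a quick sanity check of the displacements: with p = 4 processes (ps = 2, ns = 4) the formula gives displs = {0, 1, 8, 9}. Because the type's extent was resized to ns*sizeof(int), MPI_Scatterv scales each displacement by 4 ints, so the blocks start at elements 0, 4, 32 and 36 of the 8x8 array, which are exactly the top-left corners of the four 4x4 submatrices.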

Kernel crashes while trying to do a simple value assignment

I am learning CUDA and am still at the very beginner level. I am trying a simple assignment, but my code crashes when I run it and I am not sure why. Any help would be appreciated.
EDIT: It crashes on cudaMemcpy, and in the Image structure pixelVal is of type int**. Is that the cause?
Original C++ code:
void Image::reflectImage(bool flag, Image& oldImage)
/*Reflects the Image based on users input*/
{
int rows = oldImage.N;
int cols = oldImage.M;
Image tempImage(oldImage);
for(int i = 0; i < rows; i++)
{
for(int j = 0; j < cols; j++)
tempImage.pixelVal[rows - (i + 1)][j] = oldImage.pixelVal[i][j];
}
oldImage = tempImage;
}
My CUDA kernel & code:
#define NTPB 512
__global__ void fliph(int* a, int* b, int r, int c)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i >= r || j >= c)
return;
a[(r - i * c) + j] = b[i * c + j];
}
void Image::reflectImage(bool flag, Image& oldImage)
/*Reflects the Image based on users input*/
{
int rows = oldImage.N;
int cols = oldImage.M;
Image tempImage(oldImage);
if(flag == true) //horizontal reflection
{
//Allocate device memory
int* dpixels;
int* oldPixels;
int n = rows * cols;
cudaMalloc((void**)&dpixels, n * sizeof(int));
cudaMalloc((void**)&oldPixels, n * sizeof(int));
cudaMemcpy(dpixels, tempImage.pixelVal, n * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(oldPixels, oldImage.pixelVal, n * sizeof(int), cudaMemcpyHostToDevice);
int nblks = (n + NTPB - 1) / NTPB;
fliph<<<nblks, NTPB>>>(dpixels, oldPixels, rows, cols);
cudaMemcpy(tempImage.pixelVal, dpixels, n * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(dpixels);
cudaFree(oldPixels);
}
oldImage = tempImage;
}
You have to create a 2D Grid in order to process the image using 2D indices i and j. In the current case, the kernel is processing only the first row of the image.
To create a 2D indexing mechanism, create a 2D block and 2D grid like this:
const int BLOCK_DIM = 16;
dim3 Block(BLOCK_DIM,BLOCK_DIM);
dim3 Grid;
Grid.x = (cols + Block.x - 1)/Block.x;
Grid.y = (rows + Block.y - 1)/Block.y;
fliph<<<Grid, Block>>>(dpixels, oldPixels, rows, cols);
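For this launch to reproduce the original CPU loop, the kernel body also has to take i as the row (y dimension) and j as the column (x dimension) and mirror the expression rows - (i + 1). A sketch of what that could look like, based on the C++ version in the question rather than on the answer above:
__global__ void fliph(int* a, int* b, int r, int c)
{
    int i = blockIdx.y * blockDim.y + threadIdx.y;   // row index
    int j = blockIdx.x * blockDim.x + threadIdx.x;   // column index
    if (i >= r || j >= c)
        return;
    // mirrors tempImage.pixelVal[rows - (i + 1)][j] = oldImage.pixelVal[i][j]
    a[(r - i - 1) * c + j] = b[i * c + j];
}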

Unusual performance gap between OpenCL and CUDA

I have coded a simple tiled matrix multiplication in CUDA. It's like this:
__global__ void matrixMultiplyShared(float * A, float * B, float * C,
int numARows, int numAColumns,
int numBRows, int numBColumns,
int numCRows, int numCColumns) {
__shared__ float ds_A[TILE_WIDTH][TILE_WIDTH];
__shared__ float ds_B[TILE_WIDTH][TILE_WIDTH];
int bx = blockIdx.x; int by = blockIdx.y;
int tx = threadIdx.x; int ty = threadIdx.y;
int row = by * TILE_WIDTH + ty;
int col = bx * TILE_WIDTH + tx;
float Cvalue = 0.0;
// Loop over the M and N tiles required to compute the Pd element
for (int m = 0; m < (numAColumns-1)/TILE_WIDTH+1; ++m) {
if(row<numARows && m*TILE_WIDTH+tx < numAColumns){
ds_A[ty][tx] = A[row*numAColumns + m*TILE_WIDTH+tx];
} else {
ds_A[ty][tx] = 0;
}
if(m*TILE_WIDTH+ty < numBRows && col < numBColumns){
ds_B[ty][tx] = B[(m*TILE_WIDTH+ty)*numBColumns+col];
} else {
ds_B[ty][tx] = 0;
}
__syncthreads();
if(row < numCRows && col < numCColumns){
for (int k = 0; k < TILE_WIDTH; ++k)
Cvalue += ds_A[ty][k] * ds_B[k][tx];
}
__syncthreads();
}
if(row < numCRows && col < numCColumns)
C[row*numCColumns+col] = Cvalue;
}
After that, I used the same kernel above (with some minor changes) in an OpenCL version to compare the performance of CUDA and OpenCL. But the result was far beyond my expectations: OpenCL was 6-7 times faster than CUDA. Is that valid?
The output of Nsight is as follows (the CUDA and OpenCL timeline screenshots are not reproduced here):
You can see a large gap between starting the app and executing the kernel. Why does that happen?
My GPU: GTX 580
Kernel execution time (CUDA): 3.78 s
Kernel execution time (OpenCL): 0.53 s
CUDA Code: http://pastebin.com/VQMp3Hba
OpenCL Host Code: http://pastebin.com/cjGYSLQf
OpenCL Kernel Code: http://pastebin.com/KKw3Ayz7
You can try inserting explicit timers in the code instead of trusting the output from the tool. It may be the case that the tool is wrong.
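A minimal sketch of what explicit timing with CUDA events could look like around the kernel launch; grid, block and the device pointers below are placeholders, not names from the linked host code:
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
matrixMultiplyShared<<<grid, block>>>(dA, dB, dC,
                                      numARows, numAColumns,
                                      numBRows, numBColumns,
                                      numCRows, numCColumns);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);              // block the host until the kernel has finished
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);  // elapsed time in milliseconds
printf("kernel time: %.3f ms\n", ms);
cudaEventDestroy(start);
cudaEventDestroy(stop);
The OpenCL side can be timed comparably by creating the command queue with CL_QUEUE_PROFILING_ENABLE and calling clGetEventProfilingInfo on the event returned by clEnqueueNDRangeKernel.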
