CUDA matrix multiplication -- not working for some non-square matrices

I am experimenting with CUDA programming at the moment. As part of this I am attempting to develop a matrix multiplication algorithm to run on the GPU. The algorithm works for square matrices but fails for non-square matrices.
Here is my kernel:
float* multiply_gpu(float* matrix1 , float* matrix2);
__global__ void mult(int rowsA , int columnsA, int rowsB,int columnsB, float *a,
float *b, float *result) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int result_size = rowsA*columnsB;
int value = 0;//the final result
//indices of values from input matrices
if (index < result_size) {
int index1 = (index/rowsA)*rowsA; //get nearest row
int index2 = index%columnsB; //get start column
int k = 0;
while (k<columnsA) { //columnsA == rowsB
value += a[index1]*b[index2]; //v = sum a_ik * b_kj
index1 ++;
index2 += columnsB;
k++;
}
result[index] = value;
}
}
After doing a brief sanity check with my supervisor, he did not see any issues with it.
I believe the problem lies in this function:
float* multiply_gpu(float* matrix1 , float* matrix2) {
//the dimensions of the matrices
size_t available, total;
cudaError_t error;
cudaError err = cudaMemGetInfo(&available, &total);
if(err != cudaSuccess){
printf("There was an error: %s\n", cudaGetErrorString(err));
}
int height1 = matrix1[0];
int width1 = matrix1[1];
int height2 = matrix2[0];
int width2 = matrix2[1];
if (width1!=height2) {
return NULL;
}
//this array contains the result of the operation
float* result = (float *) malloc(height1*width2*sizeof(float));
//pointers for device matrices
float *d_matrix1;
float *d_matrix2;
float *d_result;
//allocate memory for matrices
error = cudaMalloc((void **)&d_matrix1,(size_t)height1*width1*sizeof(float));
if (error != cudaSuccess) {
fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
error = cudaMalloc((void **)&d_matrix2,height2*width2*sizeof(float));
if (error != cudaSuccess) {
fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
error = cudaMalloc((void **)&d_result,height1*width2*sizeof(float));
if (error != cudaSuccess) {
fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
//now copy matrices onto device -- note the offset of 2
error = cudaMemcpy(d_matrix1 , matrix1+2 , height1*width1*sizeof(float), cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
error = cudaMemcpy(d_matrix2 , matrix2+2 , height2*width2*sizeof(float), cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
//launch multiplication kernel
//note I have tried adjusting the kernel values between <<< , >>> to no avail
mult<<<height1,width2>>>(height1,width1,height2,width2,d_matrix1,d_matrix2,d_result);
printf("%d %d %d %d\n",height1,width1,height2,width2);
//make the host block until mult is finished running
//printf("finished multiplying\n");
cudaDeviceSynchronize();
//copy result back
error = cudaMemcpy(result,d_result,height1*width2*sizeof(float),cudaMemcpyDeviceToHost);
if (error != cudaSuccess) {
fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
//free now unneeded cuda memory
cudaFree(d_matrix1);
cudaFree(d_matrix2);
cudaFree(d_result);
printf("GOT RESULT\n");
for (int i=0;i<height1*width2;i++) {
printf("%f ",result[i]);
}
printf("\n");
//result ready to be returned
return result;
}
Note that the matrices which are parameters to multiply_gpu have their height at index 0 and width at index 1. The result matrix does not have this information.
An example of incorrect computation:
When I feed the following arrays into multiply_gpu -- {2,3,1,2,3,4,5,6} and {3,2,1,2,3,4,5,6} -- the answer should be {22,28,49,64}, but instead my unit tests generate {22,28,40,52}. So close! Note that for the dot product of (1,2,3)*(1,2,3) (which is not square) the algorithm is happy... What could be the error here? Thanks for any assistance. I will post a solution if I find one independently.

This line is wrong:
int index1 = (index/rowsA)*rowsA; //get nearest row
It should be something like this:
int index1 = (index/columnsB)*columnsA; //get nearest row
Why is this formulation correct? index1 is used to step through the elements of the row in A that corresponds to the output matrix position we are computing. The output matrix position is just the thread index. If we (integer) divide the thread index by the number of columns in the output matrix, i.e. C, we get the row number in question. Then, to find the first element of that row in A, we multiply by the number of columns in A. This correctly indexes us to the first element of the relevant row in A.
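To see the arithmetic concretely, here is a small host-side check I added (hypothetical, not part of the original code) that maps each thread index to the row start in A and column start in B for the 2x3 * 3x2 case from the question:

// Hypothetical host-side check of the index arithmetic for C = A*B,
// with A 2x3 and B 3x2, so C is rowsA x columnsB = 2x2.
#include <stdio.h>

int main() {
    int rowsA = 2, columnsA = 3, columnsB = 2;
    for (int index = 0; index < rowsA * columnsB; index++) {
        int row    = index / columnsB;      // row of C this thread computes
        int col    = index % columnsB;      // column of C this thread computes
        int index1 = row * columnsA;        // offset of the first element of that row in A
        int index2 = col;                   // offset of the first element of that column in B
        printf("thread %d -> C(%d,%d): A row starts at %d, B column starts at %d\n",
               index, row, col, index1, index2);
    }
    return 0;
}

For thread 2, for example, the original formula (index/rowsA)*rowsA gives 2, so the kernel dots {3,4,5} instead of {4,5,6} with the first column of B, which is exactly where the 40 in {22,28,40,52} comes from; the corrected formula gives 3.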
Here's a complete app along with my test cases - the only change I made to your code was the change indicated above.
$ cat t290.cu
#include <stdio.h>
__global__ void mult(int rowsA , int columnsA, int rowsB,int columnsB, float *a, float *b, float *result) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int result_size = rowsA*columnsB;
int value = 0;//the final result
//indices of values from input matrices
if (index < result_size) {
int index1 = (index/columnsB)*columnsA; //get nearest row
int index2 = index%columnsB; //get start column
int k = 0;
while (k<columnsA) { //columnsA == rowsB
value += a[index1]*b[index2]; //v = sum a_ik * b_kj
index1 ++;
index2 += columnsB;
k++;
}
result[index] = value;
}
}
float* multiply_gpu(float* matrix1 , float* matrix2) {
//the dimensions of the matrices
size_t available, total;
cudaError_t error;
cudaError err = cudaMemGetInfo(&available, &total);
if(err != cudaSuccess){
printf("There was an error: %s\n", cudaGetErrorString(err));
}
int height1 = matrix1[0];
int width1 = matrix1[1];
int height2 = matrix2[0];
int width2 = matrix2[1];
if (width1!=height2) {
printf("fail!\n");
return NULL;
}
//this array contains the result of the operation
float* result = (float *) malloc(height1*width2*sizeof(float));
//pointers for device matrices
float *d_matrix1;
float *d_matrix2;
float *d_result;
//allocate memory for matrices
error = cudaMalloc((void **)&d_matrix1,(size_t)height1*width1*sizeof(float));
if (error != cudaSuccess) {
fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
error = cudaMalloc((void **)&d_matrix2,height2*width2*sizeof(float));
if (error != cudaSuccess) {
fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
error = cudaMalloc((void **)&d_result,height1*width2*sizeof(float));
if (error != cudaSuccess) {
fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
//now copy matrices onto device -- note the offset of 2
error = cudaMemcpy(d_matrix1 , matrix1+2 , height1*width1*sizeof(float), cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
error = cudaMemcpy(d_matrix2 , matrix2+2 , height2*width2*sizeof(float), cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
//launch multiplication kernel
//note I have tried adjusting the kernel values between <<< , >>> to no avail
mult<<<height1,width2>>>(height1,width1,height2,width2,d_matrix1,d_matrix2,d_result);
printf("%d %d %d %d\n",height1,width1,height2,width2);
error = cudaGetLastError();
if (error != cudaSuccess) {
fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
//make the host block until mult is finished running
//printf("finished multiplying\n");
error = cudaDeviceSynchronize();
if (error != cudaSuccess) {
fprintf(stderr, "kernel fail (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
//copy result back
error = cudaMemcpy(result,d_result,height1*width2*sizeof(float),cudaMemcpyDeviceToHost);
if (error != cudaSuccess) {
fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
//free now unneeded cuda memory
cudaFree(d_matrix1);
cudaFree(d_matrix2);
cudaFree(d_result);
printf("GOT RESULT\n");
for (int i=0;i<height1*width2;i++) {
printf("%f ",result[i]);
}
printf("\n");
//result ready to be returned
return result;
}
int main(){
float m1[8] = {2.0, 3.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
float m2[6] = {2.0, 2.0, 1.0, 1.0, 2.0, 2.0};
float *my_result1 = multiply_gpu(m2, m1);
float m3[8] = {2,3,1,2,3,4,5,6};
float m4[8] = {3,2,1,2,3,4,5,6};
float *my_result2 = multiply_gpu(m3, m4);
float *my_result3 = multiply_gpu(m4, m3);
float m5[12] = {2,5,1,1,1,1,1,1,1,1,1,1};
float m6[22] = {5,4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
float *my_result4 = multiply_gpu(m5, m6);
return 0;
}
$ nvcc -arch=sm_20 -o t290 t290.cu
t290.cu: In function 'float* multiply_gpu(float*, float*)':
t290.cu:30: warning: converting to 'int' from 'float'
t290.cu:31: warning: converting to 'int' from 'float'
t290.cu:32: warning: converting to 'int' from 'float'
t290.cu:33: warning: converting to 'int' from 'float'
$ cuda-memcheck ./t290
========= CUDA-MEMCHECK
2 2 2 3
GOT RESULT
5.000000 7.000000 9.000000 10.000000 14.000000 18.000000
2 3 3 2
GOT RESULT
22.000000 28.000000 49.000000 64.000000
3 2 2 3
GOT RESULT
9.000000 12.000000 15.000000 19.000000 26.000000 33.000000 29.000000 40.000000 51.000000
2 5 5 4
GOT RESULT
5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000
========= ERROR SUMMARY: 0 errors
$

So after carefully reviewing my matrix code I discovered a simple problem with
the mathematics of my operation.
It is true that this line was wrong:
int index1 = (index/rowsA)*rowsA; //get nearest row
I note that since my matrix is row-ordered, the formula for computing the index of the element at (i,j) is
index = i*rowLength + j
Therefore the assignment to index1 should be
int index1 = (index/columnsB)*columnsA;
Why? To navigate to the start of row n we have to move by n row lengths (a row length being the number of columns in the matrix), and n itself is the thread index divided by the number of columns in the result, i.e. columnsB. My code worked for square matrices but not for other rectangular ones, because in those matrices the number of columns does not match the number of rows.
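As a quick illustration of that formula (my own sketch, using the first input matrix from the failing test case, i.e. the 2x3 matrix whose data is {1,2,3,4,5,6}):

// My own illustration: row-major layout of the 2x3 matrix {1,2,3,4,5,6},
// showing index = i*rowLength + j with rowLength equal to the number of columns.
#include <stdio.h>

int main() {
    float A[2 * 3] = {1, 2, 3, 4, 5, 6};
    int rowLength = 3;
    printf("A(0,0)=%g A(0,2)=%g A(1,0)=%g A(1,2)=%g\n",
           A[0 * rowLength + 0], A[0 * rowLength + 2],
           A[1 * rowLength + 0], A[1 * rowLength + 2]);   // prints 1 3 4 6
    return 0;
}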

Related

When I try to use multidimensional blocks or grids, part of my resulting array contains zeros

I slightly modified the CUDA 10.1 Runtime sample project to get acquainted with multidimensional blocks and grids. I use Visual Studio 2015 and an NVIDIA Quadro P400 video card. But in the resulting array, zero values follow after some correct results. What is wrong in the following program? It uses multidimensional blocks; the same happens with grids.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);
const int arraySize = 448 * 1024;
int a[arraySize];
int b[arraySize];
int c[arraySize] = { 0 };
__global__ void addKernel(int *c, const int *a, const int *b)
{
int i = 256*(blockIdx.y*blockDim.x + threadIdx.y) + blockIdx.x*blockDim.x + threadIdx.x;
c[i] = a[i] + b[i];
}
int main()
{
int i;
for(i = 0; i < arraySize; i++)
{
a[i] = i;
b[i] = i;
}
// Add vectors in parallel.
cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCuda failed!");
return 1;
}
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
return 0;
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
int *dev_a = 0;
int *dev_b = 0;
int *dev_c = 0;
dim3 threads(256, 4, 1);
dim3 blocks(size >> 10, 1, 1);
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
// Launch a kernel on the GPU with one thread for each element.
addKernel << < blocks, threads >> >(dev_c, dev_a, dev_b);
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}
This calculation is incorrect:
int i = 256*(blockIdx.y*blockDim.x + threadIdx.y) + blockIdx.x*blockDim.x + threadIdx.x;
I'm not sure how you came up with that.
The calculation should be to take your thread x index and add to it the grid width in x, times the row in y.
It should be:
int i = (blockIdx.y*blockDim.y+threadIdx.y)*(gridDim.x*blockDim.x) + blockIdx.x*blockDim.x + threadIdx.x;
// the row in y * grid width in x + thread index in x
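To convince yourself that the corrected formula covers the whole array exactly once, here is a small host-side check (hypothetical, not part of the original program) that replays the question's launch configuration, blocks(448,1,1) and threads(256,4,1) for arraySize = 448*1024:

// Hypothetical host-side replay of the corrected index calculation.
#include <stdio.h>

int main() {
    const int gridDimX = 448, blockDimX = 256, blockDimY = 4;   // as in addWithCuda
    const int arraySize = 448 * 1024;
    static char seen[448 * 1024] = {0};
    for (int bx = 0; bx < gridDimX; bx++)
        for (int ty = 0; ty < blockDimY; ty++)
            for (int tx = 0; tx < blockDimX; tx++) {
                // corrected index: row in y * grid width in x + thread index in x
                int i = ty * (gridDimX * blockDimX) + bx * blockDimX + tx;
                seen[i]++;
            }
    int covered = 0;
    for (int i = 0; i < arraySize; i++) covered += (seen[i] == 1);
    printf("%d of %d elements written exactly once\n", covered, arraySize);
    return 0;
}

(blockIdx.y is always 0 here because the grid is one block tall, so the row in y reduces to threadIdx.y.)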

Matrix Multiplication : Performance decreases after coalescing global memory access in CUDA

I have recently started working with GPUs using CUDA. As a starter program I tried to implement a simple matrix multiplication efficiently:
C = AB
Starting with the naive matrix multiplication (each thread loads all the elements of A and B needed for one element of C), the tiled implementation (threads collaboratively load a tile of elements from A and B into shared memory to reduce global memory traffic) provides a good speedup.
However, in the tiled implementation too, the access to global memory is not in coalesced order. So, to increase performance, it is better to transpose matrix B and then multiply. Below is my code:
#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>
#include <time.h>
#include <sys/time.h>
void querydeviceprop();
void allocate_matrix(float *h_a, float *h_b, int matDim);
void verify(float *h_c, float *h_c_check, int matDim);
void print_matrix(float *ha, int m,int n);
void transpose_matrix(float *ha, int matDim);
void mat_mul();
#define TILE_WIDTH 16 //should be equal to numThread for tiling implementation
__global__ void MatrixMult_tiling(float *d_a,float *d_b,float *d_c, int dim){
__shared__ float ta[TILE_WIDTH][TILE_WIDTH]; //to load one tile of A
__shared__ float tb[TILE_WIDTH][TILE_WIDTH]; //to load one tile of A
int bx,by,tx,ty,i,j;
float res;
int row, col;
bx=blockIdx.x; by=blockIdx.y;
tx=threadIdx.x; ty=threadIdx.y;
row=by*TILE_WIDTH+ty;
col=bx*TILE_WIDTH+tx;
res=0;
for(i=0;i<dim/TILE_WIDTH;i++){
//collaboratively load the elements. Each thread loads a single element.
ta[ty][tx]=d_a[row*dim+TILE_WIDTH*i+tx];
tb[ty][tx]=d_b[(ty+i*TILE_WIDTH)*dim+col];
__syncthreads();
for(j=0;j<TILE_WIDTH;j++){
res=res+ta[ty][j]*tb[j][tx];
}
__syncthreads();
}
d_c[row*dim+col]=res;
}
__global__ void MatrixMult_tiling_coalesced(float *d_a,float *d_b,float *d_c, int dim){
__shared__ float ta[TILE_WIDTH][TILE_WIDTH]; //to load one tile of A
__shared__ float tb[TILE_WIDTH][TILE_WIDTH]; //to load one tile of A
int bx,by,tx,ty,i,j;
float res;
int row, col;
bx=blockIdx.x; by=blockIdx.y;
tx=threadIdx.x; ty=threadIdx.y;
row=by*TILE_WIDTH+ty;
col=bx*TILE_WIDTH+tx;
res=0;
for(i=0;i<dim/TILE_WIDTH;i++){
//collaboratively load the elements. Each thread loads a single element.
ta[ty][tx]=d_a[row*dim+TILE_WIDTH*i+tx];
tb[ty][tx]=d_b[bx*TILE_WIDTH*dim + TILE_WIDTH*i+ty*dim+tx];
__syncthreads();
for(j=0;j<TILE_WIDTH;j++){
res=res+ta[ty][j]*tb[tx][j];
}
__syncthreads();
}
d_c[row*dim+col]=res;
}
__global__ void MatrixMult_naive(float *d_a,float *d_b,float *d_c, int dim){
int row,col,i;
col=blockIdx.y*blockDim.y+threadIdx.y;
row=blockIdx.x*blockDim.x+threadIdx.x;
float res;
if(row<dim && col<dim){
res=0;
for(i=0;i<dim;i++){
res=res+(d_a[row*dim+i]*d_b[i*dim+col]);
}
d_c[row*dim+col]=res;
}
}
int main(){
mat_mul();
return 0;
}
void mat_mul(){
cudaSetDevice(0);
time_t t;
cudaError_t err = cudaSuccess;
srand((unsigned) time(&t));
cudaEvent_t start, stop;
float milliseconds=0;
cudaEventCreate(&start);
cudaEventCreate(&stop);
int matDim = 8192;
float *h_a, *h_b, *h_c, *h_c_check;
/*declare the host memories*/
h_a=(float *)malloc(matDim*matDim*sizeof(float));
h_b=(float *)malloc(matDim*matDim*sizeof(float));
h_c=(float *)malloc(matDim*matDim*sizeof(float));
h_c_check=(float *)malloc(matDim*matDim*sizeof(float));
// Verify that allocations succeeded
if (h_a == NULL || h_b == NULL || h_c == NULL || h_c_check ==NULL)
{
fprintf(stderr, "Failed to allocate host vectors!\n");
exit(EXIT_FAILURE);
}
allocate_matrix(h_a,h_b,matDim); // allocate memory to hold the matrix
//allocate cuda memory
float *d_a=NULL;
float *d_b=NULL;
float *d_c=NULL;
err=cudaMalloc((void **)&d_a, matDim*matDim*sizeof(float));
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate device matrix A (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err=cudaMalloc((void **)&d_b, matDim*matDim*sizeof(float));
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate device matrix A (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err=cudaMalloc((void **)&d_c, matDim*matDim*sizeof(float));
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate device matrix A (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
printf("Matrix dimension is : %d\n",matDim);
// Copy the host input matrix A and B in host memory to the device matrix in device memory
//printf("Copy input data from the host memory to the CUDA device\n");
cudaEventRecord(start);
err = cudaMemcpy(d_a, h_a, matDim*matDim*sizeof(float), cudaMemcpyHostToDevice);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
//printf("GPU memcpy matrix A %10.10f ms\n",milliseconds);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
cudaEventRecord(start);
err = cudaMemcpy(d_b, h_b, matDim*matDim*sizeof(float), cudaMemcpyHostToDevice);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
//printf("GPU memcpy matrix B %10.10f ms\n",milliseconds);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
/*constants for kernel launch*/
int numThread=16; //number of threads per Block axis
int numBlocks=matDim/numThread;
if(matDim%numThread)
numBlocks++;
dim3 dimGrid(numBlocks,numBlocks);
dim3 dimBlock(numThread,numThread);
//-------------------------------------------------------------
//-------transpose and copy to GPU-------
transpose_matrix(h_b, matDim);//transpose first the b matrix
err = cudaMemcpy(d_b, h_b, matDim*matDim*sizeof(float), cudaMemcpyHostToDevice);
cudaEventSynchronize(stop);
if (err != cudaSuccess){
fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
//--------transpose and copy ends-------------
cudaEventRecord(start);
MatrixMult_tiling_coalesced<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, matDim);
cudaEventRecord(stop);
err = cudaGetLastError();
if (err != cudaSuccess){
fprintf(stderr, "Failed to launch vector matrix kernel (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
printf("GPU time tiled & coalesced %10.10f ms\n",milliseconds);
//printf("Copy output data from the CUDA device to the host memory\n");
cudaEventRecord(start);
err = cudaMemcpy(h_c_check, d_c, matDim*matDim*sizeof(float), cudaMemcpyDeviceToHost);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
//printf("GPU memcpy time tiled & coalesced %10.10f ms\n",milliseconds);
//------------transpose back the original B matrix----------------
transpose_matrix(h_b, matDim);//transpose first the b matrix
err = cudaMemcpy(d_b, h_b, matDim*matDim*sizeof(float), cudaMemcpyHostToDevice);
cudaEventSynchronize(stop);
if (err != cudaSuccess){
fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
//------------transpose back the original matrix ends-------------
//-------------------------------------------------------------
cudaEventRecord(start);
MatrixMult_tiling<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, matDim);
cudaEventRecord(stop);
err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to launch vector matrix kernel (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
printf("GPU time tiled %10.10f ms\n",milliseconds);
//printf("Copy output data from the CUDA device to the host memory\n");
cudaEventRecord(start);
err = cudaMemcpy(h_c, d_c, matDim*matDim*sizeof(float), cudaMemcpyDeviceToHost);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
//printf("GPU memcpy time tiled %10.10f ms\n",milliseconds);
//-------------------------------------------------------------
/*
cudaEventRecord(start);
MatrixMult_naive<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, matDim);
cudaEventRecord(stop);
err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to launch vector matrix kernel (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
printf("GPU time naive %10.10f ms\n",milliseconds);
printf("Copy output data from the CUDA device to the host memory\n");
cudaEventRecord(start);
err = cudaMemcpy(h_c, d_c, matDim*matDim*sizeof(float), cudaMemcpyDeviceToHost);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
printf("GPU memcpy time naive %10.10f ms\n",milliseconds);
*/
//-------------------------------------------------------------
verify(h_c, h_c_check, matDim);
// Free device global memory
err = cudaFree(d_a);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaFree(d_b);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaFree(d_c);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Free host memory
free(h_a);
free(h_b);
free(h_c);
printf("Done\n");
}
void allocate_matrix(float *h_a, float *h_b, int matDim){
int i,j;
// Initialize the host input vectors
for (i = 0; i < matDim; i++)
{
for(j=0;j< matDim;j++){
h_a[i*matDim+j] = rand()%10;
h_b[i*matDim+j] = rand()%10;
}
}
}
void print_matrix(float *ha, int m,int n){
int i, j;
for(i=0;i<m;i++){
for(j=0;j<n;j++){
printf(" %.1f ",ha[i*m+j]);
}
printf("\n");
}
}
void transpose_matrix(float *h_a, int matDim){
int i, j;
int temp;
for(i=0;i<matDim;i++)
{
for(j=0;j<i;j++)
{
temp=h_a[i*matDim+j];
h_a[i*matDim+j]=h_a[j*matDim+i];
h_a[j*matDim+i]=temp;
}
}
}
void verify(float *h_c, float *h_c_check, int matDim){
int i,j;
//check the code
for (i = 0; i < matDim; i++)
{
for(j=0;j<matDim;j++){
if (fabs(h_c[i*matDim+j] - h_c_check[i*matDim+j]) > 1e-5)
{
printf("cpu : %f , gpu : %f\t",h_c[i*matDim+j],h_c_check[i*matDim+j]);
fprintf(stderr, "Result verification failed at element %d,%d !\n\n", i,j);
exit(EXIT_FAILURE);
}
}
}
printf("Test PASSED\n");
}
MatrixMult_tiling_coalesced and MatrixMult_tiling are the functions with and without coalesced memory access to the elements of B, respectively.
Now, the problem is that the time taken by MatrixMult_tiling_coalesced is almost double the time taken by MatrixMult_tiling.
I understand that in MatrixMult_tiling the elements are loaded into the tiles in a coalesced manner (i.e. in row-major order) for each tile, but the tiles are arranged along a column, whereas the tiles in MatrixMult_tiling_coalesced are arranged along a row, so the function MatrixMult_tiling_coalesced should be faster than the other one.
But in practice I can see the opposite is true.
I would appreciate it if someone could point out the reason.
Thanks in advance.
EDIT 1:
After Robert's answer (see below) I understood that the problem was in the load operation during the elementwise multiplication. I changed
tb[ty][tx]=d_b[bx*TILE_WIDTH*dim + TILE_WIDTH*i+ty*dim+tx];
to
tb[tx][ty]=d_b[bx*TILE_WIDTH*dim + TILE_WIDTH*i+ty*dim+tx];
and
res=res+ta[ty][j]*tb[tx][j];
to
res=res+ta[ty][j]*tb[j][tx];
This increased the performance of the MatrixMult_tiling_coalesced function from 1500 ms to 1000 ms. However, the MatrixMult_tiling function takes only 879 ms, so the coalesced routine is still slower. I don't understand where the problem is now.
EDIT 2 :
I realized that in EDIT 1 I had just moved the bank conflict problem from the elementwise multiplication to the element loading section. The following changes to the code have no bank conflicts:
tb[tx][ty]=d_b[bx*TILE_WIDTH*dim + TILE_WIDTH*i+ty*dim+tx];
to
tb[ty][tx]=d_b[bx*TILE_WIDTH*dim + TILE_WIDTH*i+ty*dim+tx];
And
res=res+ta[ty][j]*tb[j][tx];
to
res=res+ta[ty][j]*tb[ty][j];
But it is still slightly slower than the MatrixMult_tiling function: MatrixMult_tiling_coalesced takes 982 ms vs. 870 ms for MatrixMult_tiling. It should be at least comparable to MatrixMult_tiling, if not faster.
FINAL EDIT :
Edit 2 does not produce a correct result, so the code with Edit 1 is the optimum. Transposing one of the multiplicand matrices is probably not a good idea. :-(
Thanks all for helping.
B certainly isn't the matrix I would transpose in C=AB. But that is neither here nor there.
I'm not sure why you think:
in the tiled implementation too the access to the global memory is not in coalesced order
I don't see any lines of code in your MatrixMult_tiling that result in uncoalesced access.
Just to make sure we don't trip over terminology, "coalesced" or "uncoalesced" are terms that we apply to access patterns to global memory (not shared memory). Your global memory access patterns are in these lines in your tiled kernel:
ta[ty][tx]=d_a[row*dim+TILE_WIDTH*i+tx];
tb[ty][tx]=d_b[(ty+i*TILE_WIDTH)*dim+col];
...
d_c[row*dim+col]=res;
and none of those patterns to global memory are uncoalesced. In each of the generated indices into d_a, d_b and d_c, if you perform the substitutions, you will find that the threadIdx.x variable is present in all of them and is not multiplied by any value, constant or otherwise. Therefore these patterns will all coalesce (nicely).
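Spelling out those substitutions (my own working, not part of the original answer), using row = by*TILE_WIDTH+ty and col = bx*TILE_WIDTH+tx from the kernel:

// d_a index: row*dim + TILE_WIDTH*i + tx  = (by*TILE_WIDTH + ty)*dim + TILE_WIDTH*i + tx
// d_b index: (ty + i*TILE_WIDTH)*dim + col = (ty + i*TILE_WIDTH)*dim + bx*TILE_WIDTH + tx
// d_c index: row*dim + col                 = (by*TILE_WIDTH + ty)*dim + bx*TILE_WIDTH + tx
//
// In each expression tx appears with a coefficient of 1, so adjacent threads in a warp
// (consecutive tx) touch adjacent floats, and each warp's access maps to contiguous
// segments of global memory, i.e. it is coalesced.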
I will appreciate if someone can point out the reason.
You have done something bad when it comes to shared memory.
In your tiled kernel, your multiplication operation looks like this:
res=res+ta[ty][j]*tb[j][tx];
For this case:
ta[ty][j]
we have a situation where all threads in the warp (which have linearly increasing tx values but the same ty value) are reading the same location in shared memory. This is an "optimal" access pattern - it does not present any bank conflicts, and will be serviced in the shortest possible time.
For this case:
tb[j][tx]
we have a situation where adjacent threads in the warp are reading adjacent locations in shared memory. This is also an "optimal", un-bank-conflicted pattern, and will be serviced in the shortest possible time.
However in your MatrixMult_tiling_coalesced kernel, the corresponding multiplication operation is:
res=res+ta[ty][j]*tb[tx][j];
Again, with this case:
ta[ty][j]
we have a shared memory "broadcast" pattern (all threads in a warp read from the same location) which is optimal and fast. But in this case:
tb[tx][j]
you have actually created columnar access into shared memory. This is the worst possible access pattern for shared memory, and it will result in a 32-way serialization (or possibly 16-way serialization, in the case of your 16x16 threadblocks) of the load process, and definitely worse performance. Why? Remember that for a given load, j is constant across the warp, and tx increases linearly across the warp. Therefore, let's say j is 1 on a particular loop iteration. The threads in warp 0 will read:
tb[0][1], tb[1][1], tb[2][1], tb[3][1], ...
and these locations all belong to a particular "column" of shared memory, i.e. they all belong to the same shared memory bank. This is the worst-case pattern for shared memory.
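As an aside, a common way to remove this kind of columnar bank conflict is to pad the shared tile by one column. This is not part of the answer, just a sketch of that standard technique applied to the kernel as posted (the kernel name is mine; TILE_WIDTH is the 16 from the question, and d_b still holds the pre-transposed B):

// Hypothetical padded variant of MatrixMult_tiling_coalesced.
__global__ void MatrixMult_tiling_coalesced_padded(float *d_a, float *d_b, float *d_c, int dim) {
    __shared__ float ta[TILE_WIDTH][TILE_WIDTH];
    __shared__ float tb[TILE_WIDTH][TILE_WIDTH + 1];   // +1 column: row stride becomes 17 floats
    int bx = blockIdx.x, by = blockIdx.y;
    int tx = threadIdx.x, ty = threadIdx.y;
    int row = by * TILE_WIDTH + ty;
    int col = bx * TILE_WIDTH + tx;
    float res = 0;
    for (int i = 0; i < dim / TILE_WIDTH; i++) {
        ta[ty][tx] = d_a[row * dim + TILE_WIDTH * i + tx];
        tb[ty][tx] = d_b[bx * TILE_WIDTH * dim + TILE_WIDTH * i + ty * dim + tx];
        __syncthreads();
        for (int j = 0; j < TILE_WIDTH; j++)
            res += ta[ty][j] * tb[tx][j];   // tb[0][j], tb[1][j], ... now fall in different banks
        __syncthreads();
    }
    d_c[row * dim + col] = res;
}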
For completeness, I claim that all of your global memory access patterns in your MatrixMult_tiling_coalesced kernel are also coalesced:
ta[ty][tx]=d_a[row*dim+TILE_WIDTH*i+tx];
tb[ty][tx]=d_b[bx*TILE_WIDTH*dim + TILE_WIDTH*i+ty*dim+tx];
...
d_c[row*dim+col]=res;
so there should be no major difference in the global memory access pattern/activity/efficiency, between your two kernel implementations.
As a side note, I assume this is all a learning exercise. If you are interested in high-performance matrix multiply on the GPU, I would encourage you to consider using CUBLAS.
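If you do go the CUBLAS route, a minimal sketch might look like this (my own sketch, assuming square dim x dim matrices already resident in d_a, d_b, d_c as in the question, and linking with -lcublas; cuBLAS is column-major, hence the swapped operand order for row-major data):

#include <cublas_v2.h>

// Row-major C = A*B via column-major cuBLAS: passing the operands in reverse order
// computes (A*B)^T in column-major terms, which reads back as A*B in row-major.
void matmul_cublas(const float *d_a, const float *d_b, float *d_c, int dim) {
    cublasHandle_t handle;
    cublasCreate(&handle);
    const float alpha = 1.0f, beta = 0.0f;
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                dim, dim, dim,
                &alpha, d_b, dim, d_a, dim,
                &beta,  d_c, dim);
    cublasDestroy(handle);
}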

OpenCL APIs take almost the same time irrespective of sample size

I've been trying to profile OpenCL host code for FIR filtering on macOS, Ubuntu and other platforms. My host code and kernel are below.
The issue is that irrespective of the number of samples that I provide to the FIR filter, clEnqueueNDRangeKernel ends up taking the same amount of time. I've profiled clEnqueueReadBuffer and clEnqueueWriteBuffer as well, and somehow they also end up taking the same amount of time. On macOS I'm profiling with Mach timers as well as OpenCL events; on Ubuntu I'm profiling with PAPI. I'm unable to understand why this is happening; ideally, with an increase in the number of samples, clEnqueueReadBuffer and clEnqueueWriteBuffer should take more time, and so should the kernel execution.
Kernel:
__kernel void fir4(
__global float* input,
__global float* output)
{
int i = get_global_id(0);
int j = 0;
int coeff[4] = {5,7,5,7};
/*for(j=0;j<4;j++)
{
output[i] += coeff[j]*(input[i+4-j-1]);
}*/
//unrolled
output[i] += coeff[0]*(input[i+4-0-1]);
output[i] += coeff[1]*(input[i+4-1-1]);
output[i] += coeff[2]*(input[i+4-2-1]);
output[i] += coeff[3]*(input[i+4-3-1]);
}
__kernel void fir8(
__global float* input,
__global float* output)
{
int i = get_global_id(0);
int j = 0;
int coeff[8] = {5,7,5,7,5,7,5,7};
for(j=0;j<8;j++)
{
output[i] += coeff[j]*(input[i+8-j-1]);
}
}
__kernel void fir12(
__global float* input,
__global float* output)
{
int i = get_global_id(0);
int j = 0;
int coeff[12] = {5,7,5,7,5,7,5,7,5,7,5,7};
for(j=0;j<12;j++)
{
output[i] += coeff[j]*(input[i+12-j-1]);
}
}
Host code:
// Use a static data size for simplicity
//
#define DATA_SIZE (48000)
#define NUM_COEFF (4)
int main(int argc, char** argv)
{
uint64_t start;
uint64_t end;
uint64_t elapsed;
double elapsedmilli;
int err; // error code returned from api calls
float data[DATA_SIZE]; // original data set given to device
float coeff[NUM_COEFF];
float results_host[DATA_SIZE] = {};
float results[DATA_SIZE]; // results returned from device
unsigned int correct; // number of correct results returned
size_t global; // global domain size for our calculation
size_t local; // local domain size for our calculation
cl_event event; //Linking event to kernel for profiling
cl_platform_id platform_id = NULL; // compute device platform id
cl_device_id device_id; // compute device id
cl_context context; // compute context
cl_command_queue commands; // compute command queue
cl_program program; // compute program
cl_kernel kernel; // compute kernel
cl_mem input; // device memory used for the input array
cl_mem output; // device memory used for the output array
// Fill our data set with random float values
//
int i,j = 0;
unsigned int count = DATA_SIZE;
unsigned int taps = NUM_COEFF;
for(i = 0; i < count; i++)
data[i] = rand() / (float)RAND_MAX;
for(i=0; i < taps; i++)
{
if(!(i%2))
coeff[i] = 5;
else
coeff[i] = 7;
}
//Connect to a platform on device
err = clGetPlatformIDs(1, &platform_id, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to locate opencl platform!\n");
return EXIT_FAILURE;
}
// Connect to a compute device
//
int gpu = 0;
err = clGetDeviceIDs(platform_id, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to create a device group!\n");
return EXIT_FAILURE;
}
// Create a compute context
//
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if (!context)
{
printf("Error: Failed to create a compute context!\n");
return EXIT_FAILURE;
}
// Create a command commands
//
commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
if (!commands)
{
printf("Error: Failed to create a command commands!\n");
return EXIT_FAILURE;
}
//Use function and load the kernel source from .cl files in the same folder
//
char *KernelSource = load_program_source("fir.cl");
// Create the compute program from the source buffer
//
program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
if (!program)
{
printf("Error: Failed to create compute program!\n");
return EXIT_FAILURE;
}
// Build the program executable
//
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
size_t len;
char buffer[2048];
printf("Error: Failed to build program executable!\n");
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
exit(1);
}
// Create the compute kernel in the program we wish to run
//
switch(taps)
{
case(4):
{
kernel = clCreateKernel(program, "fir4", &err);
break;
}
case(8):
{
kernel = clCreateKernel(program, "fir8", &err);
break;
}
case(12):
{
kernel = clCreateKernel(program, "fir12", &err);
break;
}
default:
{
kernel = clCreateKernel(program, "fir4", &err);
break;
}
}
if (!kernel || err != CL_SUCCESS)
{
printf("Error: Failed to create compute kernel! - %d\n",err);
exit(1);
}
// Create the input and output arrays in device memory for our calculation
//
input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
if (!input || !output)
{
printf("Error: Failed to allocate device memory!\n");
exit(1);
}
// Write our data set into the input array in device memory
//
err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to write to source array!\n");
exit(1);
}
// Set the arguments to our compute kernel
//
err = 0;
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
if (err != CL_SUCCESS)
{
printf("Error: Failed to set kernel arguments! %d\n", err);
exit(1);
}
// Get the maximum work group size for executing the kernel on the device
//
err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to retrieve kernel work group info! %d\n", err);
exit(1);
}
// Execute the kernel over the entire range of our 1d input data set
// using the maximum number of work group items for this device
//
global = count;
local = 48;
start = mach_absolute_time();
err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, &event);
if (err)
{
printf("Error: Failed to execute kernel!-%d\n",err);
return EXIT_FAILURE;
}
// Wait for the command commands to get serviced before reading back results
//
clWaitForEvents(1, &event);
clFinish(commands);
end = mach_absolute_time();
cl_ulong time_start, time_end;
double total_time;
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
total_time = time_end - time_start;
printf("cl:main timing:opencl clEnqueueNDRangeKernel %0.3f us\n", total_time / 1000.0);
elapsed = end - start;
struct mach_timebase_info info;
mach_timebase_info(&info);
double t = 1e-9 * (elapsed) * info.numer / info.denom;
elapsedmilli = 1e-6 * (elapsed) * info.numer / info.denom;
printf("cl:main timing:MACH clEnqueueNDRangeKernel %f ms, %d elapsed\n",elapsedmilli,elapsed);
// Read back the results from the device to verify the output
//
err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );
if (err != CL_SUCCESS)
{
printf("Error: Failed to read output array! %d\n", err);
exit(1);
}
// Validate our results
//
correct = 0;
for(i=0; i<DATA_SIZE; i++)
{
for(j=0;j<NUM_COEFF;j++)
{
results_host[i]+=coeff[j]*(data[i+NUM_COEFF-j-1]);
}
//printf("Host Output[%d]-%f\n",i,results_host[i]);
}
for(i = 0; i < count; i++)
{
if(results[i] == results_host[i])
correct++;
//printf("CL Output[%d]-%f\n",i,results[i]);
}
// Print a brief summary detailing the results
//
printf("Computed '%d/%d' correct values! Samples-%d,Taps-%d\n", correct, count, DATA_SIZE, NUM_COEFF);
// Shutdown and cleanup
//
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(commands);
clReleaseContext(context);
return 0;
}
Adding just 10-20 multiplications and additions per work item is not comparable to the kernel overhead time. Try with a 100- or 1000-wide coefficient array.
Using more input elements per item in that way just increases the number (and ratio) of cache hits, because more threads read from the same locations.
If DATA_SIZE were several million, then all the data could not fit in cache and execution would slow down linearly with its length. 48000 samples means less than 200 kB. An HD 5850, for example, has 512 kB of L2 cache (3x the bandwidth of memory) and 8 kB of L1 per compute unit (very fast).

Why do I get "Unspecified Launch failure" in CUDA program, multiplying 2 matrices

I am new to CUDA. When I multiply 1024x1024 matrices and launch a kernel with:
multiplyKernel << <dim3(32,32, 1), dim3(32, 32, 1) >> >(dev_c, dev_a, dev_b, size);
everything works fine. But when I multiply 2048 x 2048 matrices, with
dim3(64,64,1)
I get this error:
cudaDeviceSynchronize returned error code 4 after launching addKernel!
unspecified launch failure
From tinkering with the code, I think that the error is in this statement:
result += a[row * size + ind] * b[col + size * ind];
in the part
b[col+size*ind]
If I take that out, I don't get a kernel launch error (just the wrong answer, obviously). I cannot figure out what's wrong. Any suggestions would be most appreciated.
I am using Visual Studio 2013. I am using the debugger, but this does not help me find the error.
This seems to be a similar problem:
cudaDeviceSynchronize returned error code 4 after launching
Many thanks. Here is the code:
cudaError_t multiplyWithCuda(int *c, const int *a, const int *b, unsigned int size);
__global__ void multiplyKernel(int *c, const int *a, const int *b, unsigned int size)
{
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if (row > size || col > size) return;
// target field in 1-D
int z = row * size + col;
int result = 0;
for (int ind = 0; ind < size ; ++ind) {
result += a[row * size + ind] * b[col + size * ind];
}
c[z] = result;
}
int main(){
const int sizeMatrix = 2048;
int* a = new int[sizeMatrix * sizeMatrix];
int* b = new int[sizeMatrix * sizeMatrix];
int* c = new int[sizeMatrix * sizeMatrix];
for (int i = 0; i < sizeMatrix * sizeMatrix; i++) {
a[i] = rand() % 2;
b[i] = rand() % 2;
}
cudaError_t cudaStatus = multiplyWithCuda(c, a, b, sizeMatrix);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCuda failed!");
return 1;
}
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
return 0;
}
cudaError_t multiplyWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
int *dev_a ;
int *dev_b;
int *dev_c;
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
fprintf(stdout, "device set");
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_c, size * size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
fprintf(stdout, "buffer for c allocated \n");
cudaStatus = cudaMalloc((void**)&dev_a, size * size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
fprintf(stdout, "buffer for a allocated \n");
cudaStatus = cudaMalloc((void**)&dev_b, size * size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
fprintf(stdout, "buffer for b allocated \n");
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, size * size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
fprintf(stdout, "cudaMemcpy a done \n");
cudaStatus = cudaMemcpy(dev_b, b, size * size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
fprintf(stdout, "cudaMemcpy b done\n");
fprintf(stdout, "about to launch kernel \n");
// Launch a kernel on the GPU with one thread for each element.
multiplyKernel << <dim3(64,64, 1), dim3(32, 32, 1) >> >(dev_c, dev_a, dev_b, size);
fprintf(stdout, "kernel launched\n");
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
; fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
fprintf(stderr, " %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}
On Windows, I right-clicked the Nsight Monitor icon in the system tray and chose Options > General. There is a WDDM TDR delay setting; it was at 2, and I increased it to 10. Then I ran my program again, and it worked fine.
This was according to Robert's link (see above):
http://http.developer.nvidia.com/NsightVisualStudio/2.2/Documentation/UserGuide/HTML/Content/Timeout_Detection_Recovery.htm

CUDA matrix multiplication - yet again

I feel a bit bad making a forum thread when there are already 10 with the same name, but after checking them all, along with most of the guides around, I still can't figure out the problem.
I have a char array [40090][11], and I want to perform a custom operation on each possible combination of two of its elements (I consider the whole 11-byte bunch as one element). I understand that this is a kind of matrix multiplication, the matrices being one-column and one-row.
Following the SDK manual I am thinking of having 1 thread per output element. Since 40090=19*2110, I am using:
dim3 threadsperblock(19,19);
dim3 blocksingrid(2110,2110);
xkernel<<<blocksingrid, threadsperblock>>>(dev_b2);
Question 1: Is this fine?
Alright, then, I THINK I am following the SDK manual's example faithfully (not the one using shared memory). Whenever I attempt even a portion of my intended operations on the data, though, I get a massively unhelpful error 30 returned: unknown error. So, Question 2: What am I doing wrong? Note: disregard the fact that the kernel doesn't save anything anywhere.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdlib>
#include <iostream>
#include <fstream>
#include <iomanip>
#include <ctime>
#include <stdio.h>
using namespace std;
cudaError_t cudafunct(void);
__global__ void xkernel(char * dev_b2);
__device__ unsigned char typecheck(unsigned char type1,unsigned char type2);
#define b2c 40090
unsigned char block2[b2c][11];//
//unsigned int i,b1,b2,counter=0;//Block(2),Piece,Rotation,Type(of block2),InterconnectinTriangle
//unsigned char *block4,type=0;
ofstream ofile;
int main()
{
ifstream block2file("2.blk",ios::binary);
block2file.read((char*)(&block2),b2c*11);
block2file.close();
//block4=new unsigned char[200000000];//200MB will do, better than doing constant reallocs
cudaError_t cudaStatus = cudafunct();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudafunct failed!");
system("PAUSE");
return 1;
}
/*
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}*/
cout<<"Sequence end. Saving to file...\n";
//ofile.open("blk4.et2",ios::binary);
//ofile.write((char*)block4,17*counter);
//ofile.close();
int t=clock();
//cout<<"\nFound a total of "<<counter<<" block4s.\nTime elapsed: "<<t<<" clocks / "<<(double)t/(double)CLOCKS_PER_SEC<<" seconds\n";
system("PAUSE");
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t cudafunct(void)
{
char *dev_b2 = 0;
cudaError_t cudaStatus;
cudaStatus = cudaMalloc((void**)&dev_b2, sizeof(block2));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_b2, block2, sizeof(block2), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
dim3 threadsperblock(19,19);
dim3 blocksingrid(2110,2110);
xkernel<<<blocksingrid, threadsperblock>>>(dev_b2);
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching xkernel!\n", cudaStatus);
goto Error;
}
/*
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}*/
Error:
cudaFree(dev_b2);
return cudaStatus;
}
__global__ void xkernel(char *dev_b2)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
/*for(int k=0;k<11;k++)
{
lb2[0][k]=dev_b2[i*b2c+k];
lb2[1][k]=dev_b2[j*b2c+k];
}*/
int b00;
b00=dev_b2[i*b2c];
//int type=typecheck(dev_b2[i*b2c+4],dev_b2[j*b2c+4]);
//if(!j && !(i % 100))cout<<setw(6)<<i<<" / "<<jc<<" ("<<setw(10)<<(float)100*i/jc<<" % )"<<endl;
/*if(
(dev_b2[i*b2c+7]!=dev_b2[j*b2c+9])||//SW~NW
(dev_b2[i*b2c+6]!=dev_b2[j*b2c+10])//SE~NE
) return;
if( (type=typecheck(dev_b2[i*b2c+4],dev_b2[j*b2c+4]) ) ==255) return;*/
/*if(
(dev_b2[i*b2c+0]==dev_b2[j*b2c+0])||//1st=3rd
(dev_b2[i*b2c+0]==dev_b2[j*b2c+2])||//1st=4th
(dev_b2[i*b2c+2]==dev_b2[j*b2c+0])||//2nd=3rd
(dev_b2[i*b2c+2]==dev_b2[j*b2c+2])//2nd=4th
) return;*/
/*
*(block4+counter*17+0)=b2[i][0];//1st piece
*(block4+counter*17+1)=b2[i][1];//1st rotation
*(block4+counter*17+2)=b2[i][2];//2nd piece
*(block4+counter*17+3)=b2[i][3];//2nd rotation
*(block4+counter*17+4)=b2[j][0];//3rd piece
*(block4+counter*17+5)=b2[j][1];//3rd rotation
*(block4+counter*17+6)=b2[j][2];//4th piece
*(block4+counter*17+7)=b2[j][3];//4th rotation
*(block4+counter*17+8)=type;
*(block4+counter*17+9)=b2[i][5];//Right frame colours, down->up
*(block4+counter*17+10)=b2[j][5];
*(block4+counter*17+11)=b2[j][6];//Up frame colours, right->left
*(block4+counter*17+12)=b2[j][7];
*(block4+counter*17+13)=b2[j][8];//Left frame colours, up->down
*(block4+counter*17+14)=b2[i][8];
*(block4+counter*17+15)=b2[i][9];//Down frame colours, left->right
*(block4+counter++*17+16)=b2[i][10];*/
}
__device__ unsigned char typecheck(unsigned char type1,unsigned char type2)
{//Warning! Previous error! First partenthesis is t*2* = upper piece!
if( (type1==4) && (type2==0) ) return 0;
if( (type1==6) && (type2==1) ) return 1;
if( (type1==2) && (type2==6) ) return 2;
if( (type1==3) && (type2==4) ) return 3;
if( (type1==4) && (type2==4) ) return 4;
if( (type1==8) && (type2==5) ) return 5;
if( (type1==6) && (type2==6) ) return 6;
if( (type1==7) && (type2==8) ) return 7;
if( (type1==8) && (type2==8) ) return 8;
if( (type1==9) && (type2==8) ) return 9;
if( (type1==10) && (type2==8) ) return 10;
if( (type1==8) && (type2==11) ) return 11;
if( (type1==8) && (type2==12) ) return 12;
if( (type1==8) && (type2==13) ) return 13;
return 255;
}
I have a feeling you read out-of-bounds from your dev_b2 array.
blockIdx.x is in the range [0..2109] and blockDim.x is 19, so the variable i is in the range [0..40089]. But then you multiply it by b2c.
As a result, the highest address you read from will be 40089*b2c = 1,607,168,010.
But dev_b2 only has a size of b2c*11 = 440,990 bytes.
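If the intent is to index whole 11-byte records, a sketch of the corrected indexing might look like this (my own guess at the fix; the names B2C, REC_LEN and xkernel_fixed are mine, not from the question):

// Hypothetical corrected indexing: each record of block2 is 11 bytes laid out
// contiguously, so record i starts at byte i*11, not i*b2c.
#define B2C 40090       // number of 11-byte records, as in the question
#define REC_LEN 11

__global__ void xkernel_fixed(char *dev_b2)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;   // 0 .. 40089 with 19 threads/block x 2110 blocks
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i >= B2C || j >= B2C) return;                // 19*2110 == 40090, so this is just a safety net
    char first_of_i = dev_b2[i * REC_LEN];           // stays inside the 440990-byte buffer
    char first_of_j = dev_b2[j * REC_LEN];
    // ... combine the two 11-byte records here ...
    (void)first_of_i; (void)first_of_j;
}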
