CUDA matrix multiplication - yet again - matrix

I feel a bit bad making a forum thread that has already 10 of the same name, but after checking them all, along with most of the guides around, I still can't figure the problem.
I have a char array [40090][11], and I want to make a custom operation on each possible combination of two of its elements (I consider the whole 11-byte bunch as an element). I understand that is a kind of mmatrix multiplication, the matrices being one-column and one-row.
Following the SDK manual I am thinking of having 1 thread per output element. Since 40090=19*2110, I am using:
dim3 threadsperblock(19,19);
dim3 blocksingrid(2110,2110);
xkernel<<<blocksingrid, threadsperblock>>>(dev_b2);
Question 1: Is this fine?
Alright, then, I THINK I am following the SDK's maunal example faaithfully (not the one using shared memory). Whenever I dare make a portion of my wanted operations on the data, though, I get a massively unhelpful error 30 returned: Unknown error. So, Question 2: What am I doing wrong? Note: Disregard the kernel's not saving anything anywhere.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdlib>
#include <iostream>
#include <fstream>
#include <iomanip>
#include <ctime>
#include <stdio.h>
using namespace std;
cudaError_t cudafunct(void);
__global__ void xkernel(char * dev_b2);
__device__ unsigned char typecheck(unsigned char type1,unsigned char type2);
#define b2c 40090
unsigned char block2[b2c][11];//
//unsigned int i,b1,b2,counter=0;//Block(2),Piece,Rotation,Type(of block2),InterconnectinTriangle
//unsigned char *block4,type=0;
ofstream ofile;
int main()
{
ifstream block2file("2.blk",ios::binary);
block2file.read((char*)(&block2),b2c*11);
block2file.close();
//block4=new unsigned char[200000000];//200MB will do, better than doing constant reallocs
cudaError_t cudaStatus = cudafunct();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudafunct failed!");
system("PAUSE");
return 1;
}
/*
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}*/
cout<<"Sequence end. Saving to file...\n";
//ofile.open("blk4.et2",ios::binary);
//ofile.write((char*)block4,17*counter);
//ofile.close();
int t=clock();
//cout<<"\nFound a total of "<<counter<<" block4s.\nTime elapsed: "<<t<<" clocks / "<<(double)t/(double)CLOCKS_PER_SEC<<" seconds\n";
system("PAUSE");
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t cudafunct(void)
{
char *dev_b2 = 0;
cudaError_t cudaStatus;
cudaStatus = cudaMalloc((void**)&dev_b2, sizeof(block2));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_b2, block2, sizeof(block2), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
dim3 threadsperblock(19,19);
dim3 blocksingrid(2110,2110);
xkernel<<<blocksingrid, threadsperblock>>>(dev_b2);
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching xkernel!\n", cudaStatus);
goto Error;
}
/*
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}*/
Error:
cudaFree(dev_b2);
return cudaStatus;
}
__global__ void xkernel(char *dev_b2)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
/*for(int k=0;k<11;k++)
{
lb2[0][k]=dev_b2[i*b2c+k];
lb2[1][k]=dev_b2[j*b2c+k];
}*/
int b00;
b00=dev_b2[i*b2c];
//int type=typecheck(dev_b2[i*b2c+4],dev_b2[j*b2c+4]);
//if(!j && !(i % 100))cout<<setw(6)<<i<<" / "<<jc<<" ("<<setw(10)<<(float)100*i/jc<<" % )"<<endl;
/*if(
(dev_b2[i*b2c+7]!=dev_b2[j*b2c+9])||//SW~NW
(dev_b2[i*b2c+6]!=dev_b2[j*b2c+10])//SE~NE
) return;
if( (type=typecheck(dev_b2[i*b2c+4],dev_b2[j*b2c+4]) ) ==255) return;*/
/*if(
(dev_b2[i*b2c+0]==dev_b2[j*b2c+0])||//1st=3rd
(dev_b2[i*b2c+0]==dev_b2[j*b2c+2])||//1st=4th
(dev_b2[i*b2c+2]==dev_b2[j*b2c+0])||//2nd=3rd
(dev_b2[i*b2c+2]==dev_b2[j*b2c+2])//2nd=4th
) return;*/
/*
*(block4+counter*17+0)=b2[i][0];//1st piece
*(block4+counter*17+1)=b2[i][1];//1st rotation
*(block4+counter*17+2)=b2[i][2];//2nd piece
*(block4+counter*17+3)=b2[i][3];//2nd rotation
*(block4+counter*17+4)=b2[j][0];//3rd piece
*(block4+counter*17+5)=b2[j][1];//3rd rotation
*(block4+counter*17+6)=b2[j][2];//4th piece
*(block4+counter*17+7)=b2[j][3];//4th rotation
*(block4+counter*17+8)=type;
*(block4+counter*17+9)=b2[i][5];//Right frame colours, down->up
*(block4+counter*17+10)=b2[j][5];
*(block4+counter*17+11)=b2[j][6];//Up frame colours, right->left
*(block4+counter*17+12)=b2[j][7];
*(block4+counter*17+13)=b2[j][8];//Left frame colours, up->down
*(block4+counter*17+14)=b2[i][8];
*(block4+counter*17+15)=b2[i][9];//Down frame colours, left->right
*(block4+counter++*17+16)=b2[i][10];*/
}
__device__ unsigned char typecheck(unsigned char type1,unsigned char type2)
{//Warning! Previous error! First partenthesis is t*2* = upper piece!
if( (type1==4) && (type2==0) ) return 0;
if( (type1==6) && (type2==1) ) return 1;
if( (type1==2) && (type2==6) ) return 2;
if( (type1==3) && (type2==4) ) return 3;
if( (type1==4) && (type2==4) ) return 4;
if( (type1==8) && (type2==5) ) return 5;
if( (type1==6) && (type2==6) ) return 6;
if( (type1==7) && (type2==8) ) return 7;
if( (type1==8) && (type2==8) ) return 8;
if( (type1==9) && (type2==8) ) return 9;
if( (type1==10) && (type2==8) ) return 10;
if( (type1==8) && (type2==11) ) return 11;
if( (type1==8) && (type2==12) ) return 12;
if( (type1==8) && (type2==13) ) return 13;
return 255;
}

I have a feeling you read out-of-bounds from your dev_b2 array.
blockIdx.x is in range of [0..2110], so the variable i is in range of [0..23210]. But then you multiply it with b2c.
As a result the highest address you read from will be b2c*23210 = 930488900.
But dev_b2 has only the size of b2c*11 = 440990.

Related

When I try to use multidimensional blocks or grids part of my resulted array occurs zeros

I slightly modify CUDA 10.1 Runtime project to acquaint with multidimensional blocks and grids. I use Visual Studio 2015 and NVIDA Quatro P400 video card. But in resulting array after some correct results follow zero values. What is wrong in following program? It uses meltidimensional blocks. The same is with grids.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);
const int arraySize = 448 * 1024;
int a[arraySize];
int b[arraySize];
int c[arraySize] = { 0 };
__global__ void addKernel(int *c, const int *a, const int *b)
{
int i = 256*(blockIdx.y*blockDim.x + threadIdx.y) + blockIdx.x*blockDim.x + threadIdx.x;
c[i] = a[i] + b[i];
}
int main()
{
int i;
for(i = 0; i < arraySize; i++)
{
a[i] = i;
b[i] = i;
}
// Add vectors in parallel.
cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCuda failed!");
return 1;
}
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
return 0;
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
int *dev_a = 0;
int *dev_b = 0;
int *dev_c = 0;
dim3 threads(256, 4, 1);
dim3 blocks(size >> 10, 1, 1);
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
// Launch a kernel on the GPU with one thread for each element.
addKernel << < blocks, threads >> >(dev_c, dev_a, dev_b);
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}
This calculation is incorrect:
int i = 256*(blockIdx.y*blockDim.x + threadIdx.y) + blockIdx.x*blockDim.x + threadIdx.x;
I'm not sure how you came up with that.
The calculation should be to take your thread x index and add to it the grid width in x, times the row in y.
It should be:
int i = (blockIdx.y*blockDim.y+threadIdx.y)*(gridDim.x*blockDim.x) + blockIdx.x*blockDim.x + threadIdx.x;
// the row in y * grid width in x + thread index in x

Accessing dynamically allocated arrays on device (without passing them as kernel arguments)

How can an array of structs that has been dynamically allocated on the host be used by a kernel, without passing the array of structs as a kernel argument? This seems like a common procedure with a good amount of documentation online, yet it doesn't work on the following program.
Note: Please note that the following questions have been studied before posting this question:
1) copying host memory to cuda __device__ variable 2) Global variable in CUDA 3) Is there any way to dynamically allocate constant memory? CUDA
So far, unsuccessful attempts have been made to:
Dynamically allocate array of structs with cudaMalloc(), then
Use cudaMemcpyToSymbol() with the pointer returned from cudaMalloc() to copy to a __device__ variable which can be used by the kernel.
Code attempt:
NBody.cu (error checking using cudaStatus has mostly been omitted for better readability, and function to read data from file into dynamic array removed):
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define BLOCK 256
struct nbody {
float x, y, vx, vy, m;
};
typedef struct nbody nbody;
// Global declarations
nbody* particle;
// Device variables
__device__ unsigned int d_N; // Kernel can successfully access this
__device__ nbody d_particle; // Update: part of problem was here with (*)
// Aim of kernel: to print contents of array of structs without using kernel argument
__global__ void step_cuda_v1() {
int i = threadIdx.x + blockDim.x * blockIdx.x;
if (i < d_N) {
printf("%.f\n", d_particle.x);
}
}
int main() {
unsigned int N = 10;
unsigned int I = 1;
cudaMallocHost((void**)&particle, N * sizeof(nbody)); // Host allocation
cudaError_t cudaStatus;
for (int i = 0; i < N; i++) particle[i].x = i;
nbody* particle_buf; // device buffer
cudaSetDevice(0);
cudaMalloc((void**)&particle_buf, N * sizeof(nbody)); // Allocate device mem
cudaMemcpy(particle_buf, particle, N * sizeof(nbody), cudaMemcpyHostToDevice); // Copy data into device mem
cudaMemcpyToSymbol(d_particle, &particle_buf, sizeof(nbody*)); // Copy pointer to data into __device__ var
cudaMemcpyToSymbol(d_N, &N, sizeof(unsigned int)); // This works fine
int NThreadBlock = (N + BLOCK - 1) / BLOCK;
for (int iteration = 0; iteration <= I; iteration++) {
step_cuda_v1 << <NThreadBlock, BLOCK >> > ();
//step_cuda_v1 << <1, 5 >> > (particle_buf);
cudaDeviceSynchronize();
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "ERROR: %s\n", cudaGetErrorString(cudaStatus));
exit(-1);
}
}
return 0;
}
OUTPUT:
"ERROR: kernel launch failed."
Summary:
How can I print the contents of the array of structs from the kernel, without passing it as a kernel argument?
Coding in C using VS2019 with CUDA 10.2
With the help of #Robert Crovella and #talonmies, here is the solution that outputs a sequence that cycles from 0 to 9 repeatedly.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define BLOCK 256
//#include "Nbody.h"
struct nbody {
float x, y, vx, vy, m;
};
typedef struct nbody nbody;
// Global declarations
nbody* particle;
// Device variables
__device__ unsigned int d_N; // Kernel can successfully access this
__device__ nbody* d_particle;
//__device__ nbody d_particle; // Update: part of problem was here with (*)
// Aim of kernel: to print contents of array of structs without using kernel argument
__global__ void step_cuda_v1() {
int i = threadIdx.x + blockDim.x * blockIdx.x;
if (i < d_N) {
printf("%.f\n", d_particle[i].x);
}
}
int main() {
unsigned int N = 10;
unsigned int I = 1;
cudaMallocHost((void**)&particle, N * sizeof(nbody)); // Host allocation
cudaError_t cudaStatus;
for (int i = 0; i < N; i++) particle[i].x = i;
nbody* particle_buf; // device buffer
cudaSetDevice(0);
cudaMalloc((void**)&particle_buf, N * sizeof(nbody)); // Allocate device mem
cudaMemcpy(particle_buf, particle, N * sizeof(nbody), cudaMemcpyHostToDevice); // Copy data into device mem
cudaMemcpyToSymbol(d_particle, &particle_buf, sizeof(nbody*)); // Copy pointer to data into __device__ var
cudaMemcpyToSymbol(d_N, &N, sizeof(unsigned int)); // This works fine
int NThreadBlock = (N + BLOCK - 1) / BLOCK;
for (int iteration = 0; iteration <= I; iteration++) {
step_cuda_v1 << <NThreadBlock, BLOCK >> > ();
//step_cuda_v1 << <1, 5 >> > (particle_buf);
cudaDeviceSynchronize();
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "ERROR: %s\n", cudaGetErrorString(cudaStatus));
exit(-1);
}
}
return 0;
}

OPENCL API's take almost same time irrespective of sample size

I've been trying to profile an OpenCL host code for FIR filtering on MAC, Ubuntu and other platforms. My Host code and kernel are as below.
The issue is that irrespective of the number of samples that I provide for the FIR filter, the clenquendrangelernel ends up taking the same amount of time. Also I've profiled the clEnqueueReadBuffer and clEnqueueWriteBuffer as well and somehow they also end up taking the same amount of time. In mac I'm profiling with mach as well as using OpenCL events, in ubuntu, I'm profiling with PAPI. Im unable to understand why this is happening, ideally with increase in the number of samples, the clEnqueueReadBuffer and clEnqueueWriteBuffer should take more time and so should kernel execution.
Kernel:-
__kernel void fir4(
__global float* input,
__global float* output)
{
int i = get_global_id(0);
int j = 0;
int coeff[4] = {5,7,5,7};
/*for(j=0;j<4;j++)
{
output[i] += coeff[j]*(input[i+4-j-1]);
}*/
//unrolled
output[i] += coeff[0]*(input[i+4-0-1]);
output[i] += coeff[1]*(input[i+4-1-1]);
output[i] += coeff[2]*(input[i+4-2-1]);
output[i] += coeff[3]*(input[i+4-3-1]);
}
__kernel void fir8(
__global float* input,
__global float* output)
{
int i = get_global_id(0);
int j = 0;
int coeff[8] = {5,7,5,7,5,7,5,7};
for(j=0;j<8;j++)
{
output[i] += coeff[j]*(input[i+8-j-1]);
}
}
__kernel void fir12(
__global float* input,
__global float* output)
{
int i = get_global_id(0);
int j = 0;
int coeff[12] = {5,7,5,7,5,7,5,7,5,7,5,7};
for(j=0;j<12;j++)
{
output[i] += coeff[j]*(input[i+12-j-1]);
}
}
Host Code:-
// Use a static data size for simplicity
//
#define DATA_SIZE (48000)
#define NUM_COEFF (4)
int main(int argc, char** argv)
{
uint64_t start;
uint64_t end;
uint64_t elapsed;
double elapsedmilli;
int err; // error code returned from api calls
float data[DATA_SIZE]; // original data set given to device
float coeff[NUM_COEFF];
float results_host[DATA_SIZE] = {};
float results[DATA_SIZE]; // results returned from device
unsigned int correct; // number of correct results returned
size_t global; // global domain size for our calculation
size_t local; // local domain size for our calculation
cl_event event; //Linking event to kernel for profiling
cl_platform_id platform_id = NULL; // compute device platform id
cl_device_id device_id; // compute device id
cl_context context; // compute context
cl_command_queue commands; // compute command queue
cl_program program; // compute program
cl_kernel kernel; // compute kernel
cl_mem input; // device memory used for the input array
cl_mem output; // device memory used for the output array
// Fill our data set with random float values
//
int i,j = 0;
unsigned int count = DATA_SIZE;
unsigned int taps = NUM_COEFF;
for(i = 0; i < count; i++)
data[i] = rand() / (float)RAND_MAX;
for(i=0; i < taps; i++)
{
if(!(i%2))
coeff[i] = 5;
else
coeff[i] = 7;
}
//Connect to a platform on device
err = clGetPlatformIDs(1, &platform_id, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to locate opencl platform!\n");
return EXIT_FAILURE;
}
// Connect to a compute device
//
int gpu = 0;
err = clGetDeviceIDs(platform_id, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to create a device group!\n");
return EXIT_FAILURE;
}
// Create a compute context
//
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if (!context)
{
printf("Error: Failed to create a compute context!\n");
return EXIT_FAILURE;
}
// Create a command commands
//
commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
if (!commands)
{
printf("Error: Failed to create a command commands!\n");
return EXIT_FAILURE;
}
//Use function and load the kernel source from .cl files in the same folder
//
char *KernelSource = load_program_source("fir.cl");
// Create the compute program from the source buffer
//
program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
if (!program)
{
printf("Error: Failed to create compute program!\n");
return EXIT_FAILURE;
}
// Build the program executable
//
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
size_t len;
char buffer[2048];
printf("Error: Failed to build program executable!\n");
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
exit(1);
}
// Create the compute kernel in the program we wish to run
//
switch(taps)
{
case(4):
{
kernel = clCreateKernel(program, "fir4", &err);
break;
}
case(8):
{
kernel = clCreateKernel(program, "fir8", &err);
break;
}
case(12):
{
kernel = clCreateKernel(program, "fir12", &err);
break;
}
default:
{
kernel = clCreateKernel(program, "fir4", &err);
break;
}
}
if (!kernel || err != CL_SUCCESS)
{
printf("Error: Failed to create compute kernel! - %d\n",err);
exit(1);
}
// Create the input and output arrays in device memory for our calculation
//
input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
if (!input || !output)
{
printf("Error: Failed to allocate device memory!\n");
exit(1);
}
// Write our data set into the input array in device memory
//
err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to write to source array!\n");
exit(1);
}
// Set the arguments to our compute kernel
//
err = 0;
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
if (err != CL_SUCCESS)
{
printf("Error: Failed to set kernel arguments! %d\n", err);
exit(1);
}
// Get the maximum work group size for executing the kernel on the device
//
err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to retrieve kernel work group info! %d\n", err);
exit(1);
}
// Execute the kernel over the entire range of our 1d input data set
// using the maximum number of work group items for this device
//
global = count;
local = 48;
start = mach_absolute_time();
err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, &event);
if (err)
{
printf("Error: Failed to execute kernel!-%d\n",err);
return EXIT_FAILURE;
}
// Wait for the command commands to get serviced before reading back results
//
clWaitForEvents(1, &event);
clFinish(commands);
end = mach_absolute_time();
cl_ulong time_start, time_end;
double total_time;
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
total_time = time_end - time_start;
printf("cl:main timing:opencl clEnqueueNDRangeKernel %0.3f us\n", total_time / 1000.0);
elapsed = end - start;
struct mach_timebase_info info;
mach_timebase_info(&info);
double t = 1e-9 * (elapsed) * info.numer / info.denom;
elapsedmilli = 1e-6 * (elapsed) * info.numer / info.denom;
printf("cl:main timing:MACH clEnqueueNDRangeKernel %f ms, %d elapsed\n",elapsedmilli,elapsed);
// Read back the results from the device to verify the output
//
err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );
if (err != CL_SUCCESS)
{
printf("Error: Failed to read output array! %d\n", err);
exit(1);
}
// Validate our results
//
correct = 0;
for(i=0; i<DATA_SIZE; i++)
{
for(j=0;j<NUM_COEFF;j++)
{
results_host[i]+=coeff[j]*(data[i+NUM_COEFF-j-1]);
}
//printf("Host Output[%d]-%f\n",i,results_host[i]);
}
for(i = 0; i < count; i++)
{
if(results[i] == results_host[i])
correct++;
//printf("CL Output[%d]-%f\n",i,results[i]);
}
// Print a brief summary detailing the results
//
printf("Computed '%d/%d' correct values! Samples-%d,Taps-%d\n", correct, count, DATA_SIZE, NUM_COEFF);
// Shutdown and cleanup
//
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(commands);
clReleaseContext(context);
return 0;
}
Adding just 10-20 multiplications and additions per item is not comparable to kernel overhead time. Try with 100 or 1000-wide coefficients array.
Using more input elements per item with that way, just increases cache hit numbers(also ratio) because more threads read from same locations.
If DATA_SIZE is several millions, then all data could not fit in cache and become slower linearly with its length. 48000 means less than 200kB. A HD5850 has 512 k L2 cache(3x bandwidth of memory) and 8kB L1 per compute unit(too fast) for example.

Why do I get "Unspecified Launch failure" in CUDA program, multiplying 2 matrices

I am new to CUDA. When I multiply the 1024x1024 matrix, and launch a kernel with:
multiplyKernel << <dim3(32,32, 1), dim3(32, 32, 1) >> >(dev_c, dev_a, dev_b, size);
But when I multiply a 2048 x 2048 matrix, with
dim3(64,64,1)
I get this error:
cudaDeviceSynchronize returned error code 4 after launching addKernel!
unspecified launch failure
From tinkering with the code, I think that the error is in this statement
result += a[row * size + ind] * b[col + size * ind];
in the part
b[col+size*ind]
If I take that out, I don't get a kernel launch error (just the wrong answer, obviously). I cannot figure out what's wrong. Any suggestions would be most appreciated.
I am using Visual Studio 2013. I am using the debugger, but this does not help me find the error.
This seems to be a similar problem:
cudaDeviceSynchronize returned error code 4 after launching
many thanks, here is the code:
cudaError_t multiplyWithCuda(int *c, const int *a, const int *b, unsigned int size);
__global__ void multiplyKernel(int *c, const int *a, const int *b, unsigned int size)
{
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if (row > size || col > size) return;
// target field in 1-D
int z = row * size + col;
int result = 0;
for (int ind = 0; ind < size ; ++ind) {
result += a[row * size + ind] * b[col + size * ind];
}
c[z] = result;
}
int main(){
const int sizeMatrix = 2048;
int* a = new int[sizeMatrix * sizeMatrix];
int* b = new int[sizeMatrix * sizeMatrix];
int* c = new int[sizeMatrix * sizeMatrix];
for (int i = 0; i < sizeMatrix * sizeMatrix; i++) {
a[i] = rand() % 2;
b[i] = rand() % 2;
}
cudaError_t cudaStatus = multiplyWithCuda(c, a, b, sizeMatrix);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCuda failed!");
return 1;
}
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
return 0;
}
cudaError_t multiplyWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
int *dev_a ;
int *dev_b;
int *dev_c;
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
fprintf(stdout, "device set");
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_c, size * size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
fprintf(stdout, "buffer for c allocated \n");
cudaStatus = cudaMalloc((void**)&dev_a, size * size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
fprintf(stdout, "buffer for a allocated \n");
cudaStatus = cudaMalloc((void**)&dev_b, size * size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
fprintf(stdout, "buffer for b allocated \n");
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, size * size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
fprintf(stdout, "cudaMemcpy a done \n");
cudaStatus = cudaMemcpy(dev_b, b, size * size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
fprintf(stdout, "cudaMemcpy b done\n");
fprintf(stdout, "about to launch kernel \n");
// Launch a kernel on the GPU with one thread for each element.
multiplyKernel << <dim3(64,64, 1), dim3(32, 32, 1) >> >(dev_c, dev_a, dev_b, size);
fprintf(stdout, "kernel launched\n");
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
; fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
fprintf(stderr, " %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}
On Windows, I right clicked the NSight monitor icon in the system tray. There I chose Options>General. We see WDDM TDR delay. It was at 2, and I increased it to 10. Then, I ran my program again, and it worked fine.
This was according to Robert's link (see above)
http://http.developer.nvidia.com/NsightVisualStudio/2.2/Documentation/UserGuide/HTML/Content/Timeout_Detection_Recovery.htm

Applying Sobel Edge Detection with CUDA and OpenCV on a grayscale jpg image

This question has already been asked before, but the asker didn't provide enough information and left unanswered and I am curious about the program.
Original Question Link
I'm trying to do a sobel edge detection using both opencv and cuda library,
the sobel kernel for X direction is
-1 0 1
-2 0 2
-1 0 1
I have 3 files in my project
main.cpp
CudaKernel.cu
CudaKernel.h
main.cpp
#include <stdlib.h>
#include <iostream>
#include <string.h>
#include <Windows.h>
#include <opencv2\core\core.hpp>
#include <opencv2\highgui\highgui.hpp>
#include <opencv2\gpu\gpu.hpp>
#include <cuda_runtime.h>
#include <cuda_gl_interop.h>
#include "CudaKernel.h"
using namespace cv;
using namespace std;
int main(int argc, char** argv)
{
IplImage* image;
try
{
image = cvLoadImage("4555472_460s.jpg", CV_LOAD_IMAGE_GRAYSCALE);
gpu::DeviceInfo info = gpu::getDevice();
cout << info.name() << endl;
cout << "Stream Processor : "<< info.multiProcessorCount() << endl;
cout << "Total Graphic Memory :" << info.totalMemory()/1048576 << " MB" << endl;
}
catch (const cv::Exception* ex)
{
cout << "Error: " << ex->what() << endl;
}
if(!image )
{
cout << "Could not open or find the image" << std::endl ;
return -1;
}
IplImage* image2=cvCreateImage(cvGetSize(image),IPL_DEPTH_32F,image->nChannels);
IplImage* image3=cvCreateImage(cvGetSize(image),IPL_DEPTH_32F,image->nChannels);
unsigned char * pseudo_input=(unsigned char *)image->imageData;
float *output=(float*)image2->imageData;
float *input=(float*)image3->imageData;
int s=image->widthStep/sizeof(float);
for(int w=0;w<=(image->height);w++)
for(int h=0;h<(image->width*image->nChannels);h++)
{
input[w*s+h]= pseudo_input[w*s+h];
}
Pixel *fagget = (unsigned char*) image->imageData;
kernelcall(input, output, image->width,image->height, image->widthStep);
// cv::namedWindow( "Display window", CV_WINDOW_AUTOSIZE );// Create a window for display.
cvShowImage( "Original Image", image ); // Show our image inside it.
cvShowImage("Sobeled Image", image2);
waitKey(0); // Wait for a keystroke in the window
return 0;
}
CudaKernel.cu
#include<cuda.h>
#include<iostream>
#include "CudaKernel.h"
using namespace std;
#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
#define CudaCheckError() __cudaCheckError( __FILE__, __LINE__ )
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
texture <float,2,cudaReadModeElementType> tex1;
texture<unsigned char, 2> tex;
static cudaArray *array = NULL;
static cudaArray *cuArray = NULL;
//Kernel for x direction sobel
__global__ void implement_x_sobel(float* garbage,float* output,int width,int height,int widthStep)
{
int x=blockIdx.x*blockDim.x+threadIdx.x;
int y=blockIdx.y*blockDim.y+threadIdx.y;
float output_value=((0*tex2D(tex1,x,y))+(2*tex2D(tex1,x+1,y))+(-2*tex2D(tex1,x- 1,y))+(0*tex2D(tex1,x,y+1))+(1*tex2D(tex1,x+1,y+1))+(-1*tex2D(tex1,x-1,y+1))+ (1*tex2D(tex1,x+1,y-1))+(0*tex2D(tex1,x,y-1))+(-1*tex2D(tex1,x-1,y-1)));
output[y*widthStep+x]=output_value;
}
inline void __checkCudaErrors( cudaError err, const char *file, const int line )
{
if( cudaSuccess != err) {
fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",
file, line, (int)err, cudaGetErrorString( err ) );
exit(-1);
}
}
//Host Code
inline void __cudaSafeCall( cudaError err, const char *file, const int line )
{
#ifdef CUDA_ERROR_CHECK
if ( cudaSuccess != err )
{
printf("cudaSafeCall() failed at %s:%i : %s\n",
file, line, cudaGetErrorString( err ) );
exit( -1 );
}
#endif
return;
}
inline void __cudaCheckError( const char *file, const int line )
{
#ifdef CUDA_ERROR_CHECK
cudaError err = cudaGetLastError();
if ( cudaSuccess != err )
{
printf("cudaCheckError() failed at %s:%i : %s\n",
file, line, cudaGetErrorString( err ) );
exit( -1 );
}
#endif
return;
}
void kernelcall(float* input,float* output,int width,int height,int widthStep){
//cudaChannelFormatDesc channelDesc=cudaCreateChannelDesc(32,32,0,0,cudaChannelFormatKindFloat);
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
//cudaArray *cuArray;
CudaSafeCall(cudaMallocArray(&cuArray,&channelDesc,width,height));
cudaMemcpyToArray(cuArray,0,0,input,widthStep*height,cudaMemcpyHostToDevice);
tex1.addressMode[0]=cudaAddressModeClamp;
tex1.addressMode[1]=cudaAddressModeClamp;
tex1.filterMode=cudaFilterModeLinear;
cudaBindTextureToArray(tex1,cuArray,channelDesc);
tex1.normalized=false;
float * D_output_x;
float * garbage=NULL;
CudaSafeCall(cudaMalloc(&D_output_x,widthStep*height));
dim3 blocksize(16,16);
dim3 gridsize;
gridsize.x=(width+blocksize.x-1)/blocksize.x;
gridsize.y=(height+blocksize.y-1)/blocksize.y;
implement_x_sobel<<<gridsize,blocksize>>>(garbage,D_output_x,width,height,widthStep/sizeof(float));
cudaThreadSynchronize();
CudaCheckError();
CudaSafeCall(cudaMemcpy(output,D_output_x,height*widthStep,cudaMemcpyDeviceToHost));
cudaFree(D_output_x);
cudaFree(garbage);
cudaFreeArray(cuArray);
}
the result is really messed up, it didnt look like the original image at all
Result:
I changed some line of the code to
float *pseudo_input=(float *)image->imageData;
float *output=(float*)image2->imageData;
float *input=(float*)image3->imageData;
float *inputnormalized=(float *)image4->imageData;
int s=image->widthStep/sizeof(float);
for(int w=0;w<=(image->height);w++)
for(int h=0;h<(image->width*image->nChannels);h++)
{
input[w*s+h]= pseudo_input[w*s+h];
}
kernelcall(input, output, image->width,image->height, image->widthStep);
cvNormalize(input,inputnormalized,0,255,NORM_MINMAX, CV_8UC1);
cvShowImage( "Original Image", image ); // Show our image inside it.
cvShowImage("Sobeled Image", image2);
But now I get an unhandled exception error.
OpenCV rule number 1:
Never access the image data directly through the underlying data pointer unless
absolutely necessary, e.g copying data to GPU. Reference (Me :p)
Errors/Recommendations:
Instead of converting the image by looping through the image data
pointer, use cvConvert to change image data type. Looping is very
much prone to error.
When calling the function named kernelcall, you are passing the
data pointer of float images, but passing the widthStep of the
original 8 bit image. This is the main cause of erronous results as
it will result in incorrect indexing inside the kernel.
When performing memory copy between 2 pitched pointers which have
different widthSteps, ALWAYS use 2D memory copy functions available
in CUDA Runtime, e.g. cudaMemcpy2D, cudaMemcpy2DToArray etc. In your case, the cuArray has unknown widthstep internally, and the input IplImage has different widthStep than that of cuArray .
Avoid unnecessary headers, assignments and identifier declaration.
Add bound checks inside the CUDA kernel, so that only those threads perform memory read/write which fall inside the image. It may cause a little divergence, but its better than invalid memory read/writes.
Revised Code (Tested):
Main.cpp
#include <iostream>
#include <opencv2/opencv.hpp>
#include "CudaKernel.h"
using namespace cv;
using namespace std;
int main(int argc, char** argv)
{
IplImage* image;
image = cvLoadImage("4555472_460s.jpg", CV_LOAD_IMAGE_GRAYSCALE);
if(!image )
{
cout << "Could not open or find the image" << std::endl;
return -1;
}
IplImage* image2 = cvCreateImage(cvGetSize(image),IPL_DEPTH_32F,image->nChannels);
IplImage* image3 = cvCreateImage(cvGetSize(image),IPL_DEPTH_32F,image->nChannels);
//Convert the input image to float
cvConvert(image,image3);
float *output = (float*)image2->imageData;
float *input = (float*)image3->imageData;
kernelcall(input, output, image->width,image->height, image3->widthStep);
//Normalize the output values from 0.0 to 1.0
cvScale(image2,image2,1.0/255.0);
cvShowImage("Original Image", image );
cvShowImage("Sobeled Image", image2);
cvWaitKey(0);
return 0;
}
CudaKernel.cu
#include<cuda.h>
#include<iostream>
#include "CudaKernel.h"
using namespace std;
#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
#define CudaCheckError() __cudaCheckError( __FILE__, __LINE__ )
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
texture <float,2,cudaReadModeElementType> tex1;
static cudaArray *cuArray = NULL;
//Kernel for x direction sobel
__global__ void implement_x_sobel(float* output,int width,int height,int widthStep)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
//Make sure that thread is inside image bounds
if(x<width && y<height)
{
float output_value = (-1*tex2D(tex1,x-1,y-1)) + (0*tex2D(tex1,x,y-1)) + (1*tex2D(tex1,x+1,y-1))
+ (-2*tex2D(tex1,x-1,y)) + (0*tex2D(tex1,x,y)) + (2*tex2D(tex1,x+1,y))
+ (-1*tex2D(tex1,x-1,y+1)) + (0*tex2D(tex1,x,y+1)) + (1*tex2D(tex1,x+1,y+1));
output[y*widthStep+x]=output_value;
}
}
inline void __checkCudaErrors( cudaError err, const char *file, const int line )
{
if( cudaSuccess != err) {
fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",
file, line, (int)err, cudaGetErrorString( err ) );
exit(-1);
}
}
//Host Code
inline void __cudaSafeCall( cudaError err, const char *file, const int line )
{
#ifdef CUDA_ERROR_CHECK
if ( cudaSuccess != err )
{
printf("cudaSafeCall() failed at %s:%i : %s\n",
file, line, cudaGetErrorString( err ) );
exit( -1 );
}
#endif
return;
}
inline void __cudaCheckError( const char *file, const int line )
{
#ifdef CUDA_ERROR_CHECK
cudaError err = cudaGetLastError();
if ( cudaSuccess != err )
{
printf("cudaCheckError() failed at %s:%i : %s\n",
file, line, cudaGetErrorString( err ) );
exit( -1 );
}
#endif
return;
}
void kernelcall(float* input,float* output,int width,int height,int widthStep)
{
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
CudaSafeCall(cudaMallocArray(&cuArray,&channelDesc,width,height));
//Never use 1D memory copy if host and device pointers have different widthStep.
// You don't know the width step of CUDA array, so its better to use cudaMemcpy2D...
cudaMemcpy2DToArray(cuArray,0,0,input,widthStep,width * sizeof(float),height,cudaMemcpyHostToDevice);
cudaBindTextureToArray(tex1,cuArray,channelDesc);
float * D_output_x;
CudaSafeCall(cudaMalloc(&D_output_x,widthStep*height));
dim3 blocksize(16,16);
dim3 gridsize;
gridsize.x=(width+blocksize.x-1)/blocksize.x;
gridsize.y=(height+blocksize.y-1)/blocksize.y;
implement_x_sobel<<<gridsize,blocksize>>>(D_output_x,width,height,widthStep/sizeof(float));
cudaThreadSynchronize();
CudaCheckError();
//Don't forget to unbind the texture
cudaUnbindTexture(tex1);
CudaSafeCall(cudaMemcpy(output,D_output_x,height*widthStep,cudaMemcpyDeviceToHost));
cudaFree(D_output_x);
cudaFreeArray(cuArray);
}
Here:-
unsigned char * pseudo_input=(unsigned char *)image->imageData;
float *output=(float*)image2->imageData;
float *input=(float*)image3->imageData;
int s=image->widthStep/sizeof(float);
for(int w=0;w<=(image->height);w++)
for(int h=0;h<(image->width*image->nChannels);h++)
{
input[w*s+h]= pseudo_input[w*s+h];
}
input is float* and pseudo_input is uchar* . convert everything to float and then process. In the end normalize between 0 ans 255 using cvNormalize with NORM_MINMAX to get proper results.

Resources