Two matrix multiplications in OpenCL - performance

Sorry if I'm asking a really simple question, but I couldn't find any related answer on the net.
I'm trying to perform two matrix multiplications on the GPU using OpenCL, like R = ABC. The way I do it is by performing a matrix multiplication on A and B, storing the intermediate result (I) in the GPU's DRAM, and then doing another matrix multiplication between I and C. My current kernel can also handle batches of matrices.
Here is the code for my MatMul kernel (it is just a basic version, without optimizations):
__kernel void MatrixMultiplication2 (const __global float* A,
const __global float* B,
__global float* C,
const int A_height, const int A_width,
const int B_height, const int B_width,
const int C_height, const int C_width) {
const int row = get_global_id (0);
const int col = get_global_id (1);
const int imgID = get_global_id (2);
const int offsetA = A_height * A_width * imgID;
const int offsetB = B_height * B_width * imgID;
const int offsetC = C_height * C_width * imgID;
float acc = 0.0f;
for (int k = 0; k < A_width; k++) {
acc += A[row*A_width + k + offsetA] * B[k*B_width + col + offsetB];
}
C[row*C_width + col + offsetC] = acc;
}
Here is also the relevant part of my host code for dispatching the matrix multiplications:
const size_t GWS1[3] = {A_height, B_width, batch_size};
const size_t LWS1[3] = {32, 32, 1};
const size_t GWS2[3] = {A_height, C_width, batch_size};
const size_t LWS2[3] = {32, 32, 1};
Event evKernel1 ("Kernel1");
Event evKernel2 ("Kernel2");
clFinish (queue);
high_resolution_clock::time_point t1 = high_resolution_clock::now();
err = clEnqueueNDRangeKernel (queue,
kernel1,
3,
NULL,
GWS1,
LWS1,
0,
NULL,
&evKernel1.CLEvent());
clFinish (queue);
CL_CHECK_ERROR (err);
err = clFinish (queue);
CL_CHECK_ERROR (err);
err = clWaitForEvents (1, &evKernel1.CLEvent());
CL_CHECK_ERROR (err);
evKernel1.FillTimingInfo ();
err = clEnqueueNDRangeKernel (queue,
kernel2,
3,
NULL,
GWS2,
LWS2,
0,
NULL,
&evKernel2.CLEvent());
clFinish (queue);
CL_CHECK_ERROR (err);
err = clFinish (queue);
CL_CHECK_ERROR (err);
err = clWaitForEvents (1, &evKernel2.CLEvent());
CL_CHECK_ERROR (err);
high_resolution_clock::time_point t2 = high_resolution_clock::now();
evKernel2.FillTimingInfo ();
readbackMemObject (queue, &mem_R, (int) sizeof (T), RSize, hostMem_R);
Is this the right way to do multi-stage matrix multiplication on the GPU? I deploy the first stage, wait until it finishes, and then deploy the second one. The performance is really low in this scenario, and when I increase the batch size it decreases even further. I just want to know whether this is the normal way of doing sequential matrix multiplications or not.
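For reference, a minimal sketch of the same two-stage dispatch without the host-side waits between the stages, relying on the fact that an in-order command queue (the default) serializes the kernels on the device; profiling and error handling omitted:
// Stage 1: I = A * B
err = clEnqueueNDRangeKernel(queue, kernel1, 3, NULL, GWS1, LWS1, 0, NULL, &evKernel1.CLEvent());
// Stage 2: R = I * C -- no clFinish/clWaitForEvents in between; the in-order queue
// guarantees kernel2 does not start before kernel1 has completed.
err = clEnqueueNDRangeKernel(queue, kernel2, 3, NULL, GWS2, LWS2, 0, NULL, &evKernel2.CLEvent());
// Single synchronization point before timing and readback.
err = clFinish(queue);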
Thanks,

Related

Dealing with matrices in CUDA: understanding basic concepts

I'm building a CUDA kernel to compute the numerical N*N jacobian of a function, using finite differences; in the example I provided, it is the square function (each entry of the vector is squared). The host code allocates linear memory, while I'm using 2-dimensional indexing in the kernel.
My issue is that I haven't found a way to sum on the diagonal of the cudaMalloc'ed matrices. My attempt has been to use the condition threadIdx.x == blockIdx.x for the diagonal, but it evaluates to true only when both are 0.
Here is the kernel. EDIT: I posted the whole code as an answer, based on the suggestions in the comments (the main() is basically the same, while the kernel is not):
template <typename T>
__global__ void jacobian_kernel (
T * J,
const T t0,
const T tn,
const T h,
const T * u0,
const T * un,
const T * un_old)
{
T cgamma = 2 - sqrtf(2);
const unsigned int t = threadIdx.x;
const unsigned int b = blockIdx.x;
const unsigned int tid = t + b * blockDim.x;
/*__shared__*/ T temp_sx[BLOCK_SIZE][BLOCK_SIZE];
/*__shared__*/ T temp_dx[BLOCK_SIZE][BLOCK_SIZE];
__shared__ T sm_temp_du[BLOCK_SIZE];
T* temp_du = &sm_temp_du[0];
if (tid < N )
{
temp_sx[b][t] = un[t];
temp_dx[b][t] = un[t];
if ( t == b )
{
if ( tn == t0 )
{
temp_du[t] = u0[t]*0.001;
temp_sx[b][t] += temp_du[t]; //(*)
temp_dx[b][t] -= temp_du[t];
temp_sx[b][t] += ( abs( temp_sx[b][t] ) < 10e-6 ? 0.1 : 0 );
temp_dx[b][t] += ( abs( temp_dx[b][t] ) < 10e-6 ? 0.1 : 0 );
temp_sx[b][t] = ( temp_sx[b][t] == 0 ? 0.1 : temp_sx[b][t] );
temp_dx[b][t] = ( temp_dx[b][t] == 0 ? 0.1 : temp_dx[b][t] );
}
else
{
temp_du[t] = MAX( un[t] - un_old[t], 10e-6 );
temp_sx[b][t] += temp_du[t];
temp_dx[b][t] -= temp_du[t];
}
}
__syncthreads();
//J = f(tn, un + du)
d_func(tn, (temp_sx[b]), (temp_sx[b]), 1.f);
d_func(tn, (temp_dx[b]), (temp_dx[b]), 1.f);
__syncthreads();
J[tid] = (temp_sx[b][t] - temp_dx[b][t]) * powf((2 * temp_du[t]), -1);
//J[tid]*= - h*cgamma/2;
//J[tid]+= ( t == b ? 1 : 0);
//J[tid] = temp_J[tid];
}
}
The general procedure for computing the jacobian is
Copy un into every row of temp_sx and temp_dx
Compute du as a small perturbation derived from u0
Add du to the diagonal of temp_sx and subtract du from the diagonal of temp_dx
Compute the square function on each entry of temp_sx and temp_dx
Subtract them and divide every entry by 2*du
This procedure can be summarized as (f(un + du*e_i) - f(un - du*e_i)) / (2*du).
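As a plain CPU reference of the same central-difference procedure (a sketch that assumes f(u)_i = u_i^2, i.e. the square function, with a hypothetical helper name):
#include <algorithm>
#include <cmath>
#include <vector>

// CPU reference: central-difference Jacobian of f(u)_r = u_r*u_r, stored row-major in J (N x N).
void jacobian_cpu(const float* un, float* J, int N)
{
    std::vector<float> up(un, un + N), um(un, un + N);
    for (int i = 0; i < N; ++i) {                         // one column of J per perturbed component
        float du = std::max(std::fabs(un[i]) * 0.001f, 1e-6f);
        up[i] = un[i] + du;
        um[i] = un[i] - du;
        for (int r = 0; r < N; ++r)                       // (f(u + du*e_i) - f(u - du*e_i)) / (2*du)
            J[r * N + i] = (up[r] * up[r] - um[r] * um[r]) / (2.0f * du);
        up[i] = un[i];                                    // restore before moving to the next column
        um[i] = un[i];
    }
}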
My problem is summing du onto the diagonals of temp_sx and temp_dx, as I tried at (*). How can I achieve that?
EDIT: I'm now launching 1D blocks and threads; in fact, the .y axis wasn't used at all in the kernel. I'm calling the kernel with a fixed amount of shared memory.
Note that in int main() I'm calling the kernel with
#define REAL sizeof(float)
#define N 32
#define BLOCK_SIZE 16
#define NUM_BLOCKS ((N*N + BLOCK_SIZE - 1)/ BLOCK_SIZE)
...
dim3 dimGrid(NUM_BLOCKS);
dim3 dimBlock(BLOCK_SIZE);
size_t shm_size = N*N*REAL;
jacobian_kernel <<< dimGrid, dimBlock, shm_size >>> (...);
This is how I attempt to deal with splitting the work into blocks. In the kernel, to sum on the diagonal, I used if(threadIdx.x == blockIdx.x){...}. Why isn't this correct? I'm asking because, while debugging and printing from inside that branch, it only evaluates to true when both are 0. Thus du[0] is the only numerical value and the matrix becomes nan. Note that this approach worked with the first version of the code, where I instead called the kernel with
jacobian_kernel <<< N, N >>> (...)
So that when threadIdx.x == blockIdx.x the element is on the diagonal. This approach doesn't fit anymore though, since now I need to deal with larger N (possibly larger than 1024, which is the maximum number of threads per block).
What statement should I put there that works even if the matrices are split into blocks and threads?
Let me know if I should share some other info.
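For reference, a block-size-independent way to identify a diagonal element is to recover the 2D indices from the flat global index; a minimal sketch, with N the matrix dimension as defined above:
int tid = threadIdx.x + blockIdx.x * blockDim.x;  // flat index into the N*N matrix
if (tid < N * N) {
    int row = tid / N;
    int col = tid % N;
    if (row == col) {
        // this thread owns the diagonal element (row, row)
    }
}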
Here is how I managed to solve my problem, based on the suggestions in the comments on the answer. The example is compilable, provided you put helper_cuda.h and helper_string.h in the same directory, or add a -I directive for the CUDA samples' include path (installed along with the CUDA toolkit). The relevant changes are only in the kernel; there's a minor change in main() though, since I was requesting double the resources to execute the kernel: the .y axis of the grid of thread blocks wasn't used at all, so it didn't generate any error.
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <math.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "helper_cuda.h"
#include "helper_string.h"
#include <fstream>
#ifndef MAX
#define MAX(a,b) ((a > b) ? a : b)
#endif
#define REAL sizeof(float)
#define N 128
#define BLOCK_SIZE 128
#define NUM_BLOCKS ((N*N + BLOCK_SIZE - 1)/ BLOCK_SIZE)
template <typename T>
inline void printmatrix( T mat, int rows, int cols);
template <typename T>
__global__ void jacobian_kernel ( const T * A, T * J, const T t0, const T tn, const T h, const T * u0, const T * un, const T * un_old);
template<typename T>
__device__ void d_func(const T t, const T u[], T res[], const T h = 1);
int main ()
{
float t0 = 0.; //float tn = 0.;
float h = 0.1;
float* u0 = (float*)malloc(REAL*N); for(int i = 0; i < N; ++i){u0[i] = i+1;}
float* un = (float*)malloc(REAL*N); memcpy(un, u0, REAL*N);
float* un_old = (float*)malloc(REAL*N); memcpy(un_old, u0, REAL*N);
float* J = (float*)malloc(REAL*N*N);
float* A = (float*)malloc(REAL*N*N); host_heat_matrix(A);
float *d_u0;
float *d_un;
float *d_un_old;
float *d_J;
float *d_A;
checkCudaErrors(cudaMalloc((void**)&d_u0, REAL*N)); //printf("1: %p\n", d_u0);
checkCudaErrors(cudaMalloc((void**)&d_un, REAL*N)); //printf("2: %p\n", d_un);
checkCudaErrors(cudaMalloc((void**)&d_un_old, REAL*N)); //printf("3: %p\n", d_un_old);
checkCudaErrors(cudaMalloc((void**)&d_J, REAL*N*N)); //printf("4: %p\n", d_J);
checkCudaErrors(cudaMalloc((void**)&d_A, REAL*N*N)); //printf("4: %p\n", d_J);
checkCudaErrors(cudaMemcpy(d_u0, u0, REAL*N, cudaMemcpyHostToDevice)); assert(d_u0 != NULL);
checkCudaErrors(cudaMemcpy(d_un, un, REAL*N, cudaMemcpyHostToDevice)); assert(d_un != NULL);
checkCudaErrors(cudaMemcpy(d_un_old, un_old, REAL*N, cudaMemcpyHostToDevice)); assert(d_un_old != NULL);
checkCudaErrors(cudaMemcpy(d_J, J, REAL*N*N, cudaMemcpyHostToDevice)); assert(d_J != NULL);
checkCudaErrors(cudaMemcpy(d_A, A, REAL*N*N, cudaMemcpyHostToDevice)); assert(d_A != NULL);
dim3 dimGrid(NUM_BLOCKS); std::cout << "NUM_BLOCKS \t" << dimGrid.x << "\n";
dim3 dimBlock(BLOCK_SIZE); std::cout << "BLOCK_SIZE \t" << dimBlock.x << "\n";
size_t shm_size = N*REAL; //std::cout << shm_size << "\n";
//HERE IS A RELEVANT CHANGE OF THE MAIN, SINCE I WAS CALLING
//THE KERNEL WITH A 2D GRID BUT WITHOUT USING THE .y AXIS,
//WHILE NOW THE GRID IS 1D
jacobian_kernel <<< dimGrid, dimBlock, shm_size >>> (d_A, d_J, t0, t0, h, d_u0, d_un, d_un_old);
checkCudaErrors(cudaMemcpy(J, d_J, REAL*N*N, cudaMemcpyDeviceToHost)); //printf("4: %p\n", d_J);
printmatrix( J, N, N);
checkCudaErrors(cudaDeviceReset());
free(u0);
free(un);
free(un_old);
free(J);
}
template <typename T>
__global__ void jacobian_kernel (
const T * A,
T * J,
const T t0,
const T tn,
const T h,
const T * u0,
const T * un,
const T * un_old)
{
T cgamma = 2 - sqrtf(2);
const unsigned int t = threadIdx.x;
const unsigned int b = blockIdx.x;
const unsigned int tid = t + b * blockDim.x;
/*__shared__*/ T temp_sx[BLOCK_SIZE][BLOCK_SIZE];
/*__shared__*/ T temp_dx[BLOCK_SIZE][BLOCK_SIZE];
__shared__ T sm_temp_du;
T* temp_du = &sm_temp_du;
//HERE IS A RELEVANT CHANGE (*)
if ( t < BLOCK_SIZE && b < NUM_BLOCKS )
{
temp_sx[b][t] = un[t]; //printf("temp_sx[%d] = %f\n", t,(temp_sx[b][t]));
temp_dx[b][t] = un[t];
//printf("t = %d, b = %d, t + b * blockDim.x = %d \n",t, b, tid);
//HERE IS A NOTE (**)
if ( t == b )
{
//printf("t = %d, b = %d \n",t, b);
if ( tn == t0 )
{
*temp_du = u0[t]*0.001;
temp_sx[b][t] += *temp_du;
temp_dx[b][t] -= *temp_du;
temp_sx[b][t] += ( abs( temp_sx[b][t] ) < 10e-6 ? 0.1 : 0 );
temp_dx[b][t] += ( abs( temp_dx[b][t] ) < 10e-6 ? 0.1 : 0 );
temp_sx[b][t] = ( temp_sx[b][t] == 0 ? 0.1 : temp_sx[b][t] );
temp_dx[b][t] = ( temp_dx[b][t] == 0 ? 0.1 : temp_dx[b][t] );
}
else
{
*temp_du = MAX( un[t] - un_old[t], 10e-6 );
temp_sx[b][t] += *temp_du;
temp_dx[b][t] -= *temp_du;
}
;
}
//printf("du[%d] %f\n", tid, (*temp_du));
__syncthreads();
//printf("temp_sx[%d][%d] = %f\n", b, t, temp_sx[b][t]);
//printf("temp_dx[%d][%d] = %f\n", b, t, temp_dx[b][t]);
//d_func(tn, (temp_sx[b]), (temp_sx[b]), 1.f);
//d_func(tn, (temp_dx[b]), (temp_dx[b]), 1.f);
matvec_dev( tn, A, (temp_sx[b]), (temp_sx[b]), N, N, 1.f );
matvec_dev( tn, A, (temp_dx[b]), (temp_dx[b]), N, N, 1.f );
__syncthreads();
//printf("temp_sx_later[%d][%d] = %f\n", b, t, (temp_sx[b][t]));
//printf("temp_sx_later[%d][%d] - temp_dx_later[%d][%d] = %f\n", b,t,b,t, (temp_sx[b][t] - temp_dx[b][t]) / 2 * *temp_du);
//if (t == b ) printf( "2du[%d]^-1 = %f\n",t, powf((2 * *temp_du), -1));
J[tid] = (temp_sx[b][t] - temp_dx[b][t]) / (2 * *temp_du);
}
}
template<typename T>
__device__ void d_func(const T t, const T u[], T res[], const T h )
{
__shared__ float temp_u;
temp_u = u[threadIdx.x];
res[threadIdx.x] = h*powf( (temp_u), 2);
}
template <typename T>
inline void printmatrix( T mat, int rows, int cols)
{
std::ofstream matrix_out;
matrix_out.open( "heat_matrix.txt", std::ofstream::out);
for( int i = 0; i < rows; i++)
{
for( int j = 0; j <cols; j++)
{
double next = mat[i + N*j];
matrix_out << ( (next >= 0) ? " " : "") << next << " ";
}
matrix_out << "\n";
}
}
The relevant change is at (*). Before, I used if (tid < N), which has two downsides:
First, it is wrong, since it should be tid < N*N: my data is 2D, while tid is a flat global index over all of it.
Second, even if I wrote tid < N*N, since I'm splitting the work into blocks, the condition t < BLOCK_SIZE && b < NUM_BLOCKS seems clearer to me, given how the indexing is arranged in the code.
Moreover, the condition t == b at (**) is actually the right one for operating on the diagonal elements of the matrix. The fact that it evaluated to true only at 0 was because of the error described above.
Thanks for the suggestions!

Simple OpenCL program not working

This program is a simple parallel program which adds the elements of 2 vectors.
The program compiled successfully without errors, but the results are not right.
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <iomanip>
#include <array>
#include <fstream>
#include <sstream>
#include <string>
#include <algorithm>
#include <iterator>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#include <time.h>
#endif
#define MAX_SOURCE_SIZE (0x100000)
// number of points in Both A and B files (number of rows)
const int number_of_points = 11;
// number of points axis in Both A and B files (number of Columns)
const int number_of_axis = 3;
using namespace std;
int main(int argc, char *argv[]) {
clock_t tStart = clock();
// Create the two input vectors
// working variables
int i;
ifstream input_fileA, input_fileB; // input files
string line; // transfer row from file to array
float x; // transfer word from file to array
int row = 0; // number of rows of file A,B (= array)
int col = 0; // number of columns of file A,B (= array)
// working arrays
// array contains file A data
float arrayA[number_of_points][number_of_axis]={{0}};
// array contains file B data
float arrayB[number_of_points][number_of_axis]={{0}};
// float X1[number_of_points]; // X values of file A points
float Y1[number_of_points]; // Y values of file A points
// float X2[number_of_points]; // X values of file B points
float Y2[number_of_points]; // Y values of file B points
float *X1 = (float*)malloc(sizeof(float)*number_of_points);
float *X2 = (float*)malloc(sizeof(float)*number_of_points);
// import input files
input_fileA.open(argv[1]);
input_fileB.open(argv[2]);
// transfer input files data to array
// input file A to arrayA
row = 0;
while (getline(input_fileA, line))
{
istringstream streamA(line);
col = 0;
while(streamA >> x){
arrayA[row][col] = x;
col++;
}
row++;
}
// input file B to arrayB
row = 0;
while (getline(input_fileB, line))
{
istringstream streamB(line);
col = 0;
while(streamB >> x){
arrayB[row][col] = x;
col++;
}
row++;
}
// put Xs of points in X vectors and Ys of points in Y vectors
// input file A
for (int i = 0; i<number_of_points; i++){
X1[i] = arrayA[i][1];
Y1[i] = arrayA[i][2];
}
// input file B
for (int i = 0; i<number_of_points; i++){
X2[i] = arrayB[i][1];
Y2[i] = arrayB[i][2];
}
// int i;
// const int LIST_SIZE = 50;
// int *A = (int*)malloc(sizeof(int)*number_of_points);
// int *B = (int*)malloc(sizeof(int)*number_of_points);
// for(i = 0; i < number_of_points; i++) {
// A[i] = X1[i];
// B[i] = X2[i];
// }
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("vector_add_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_ALL, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context =
clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue =
clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem x1_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
number_of_points * sizeof(float), NULL, &ret);
cl_mem x2_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
number_of_points * sizeof(float), NULL, &ret);
cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
number_of_points * sizeof(float), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, x1_mem_obj, CL_TRUE, 0,
number_of_points * sizeof(float), X1, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, x2_mem_obj, CL_TRUE, 0,
number_of_points * sizeof(float), X2, 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&x1_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&x2_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
// Execute the OpenCL kernel on the list
size_t global_item_size = number_of_points; // Process the entire lists
size_t local_item_size = 64; // Process in groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer C on the device to the local variable C
// int *C = (int*)malloc(sizeof(int)*number_of_points);
float *C = (float*)malloc(sizeof(float)*number_of_points);
ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
number_of_points * sizeof(float), C, 0, NULL, NULL);
// Display the result to the screen
for(i = 0; i < number_of_points; i++)
printf("%f + %f = %f\n", X1[i], X2[i], C[i]);
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(x1_mem_obj);
ret = clReleaseMemObject(x2_mem_obj);
ret = clReleaseMemObject(c_mem_obj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(X1);
free(X2);
free(C);
printf("ALL Time taken: %.2fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);
return 0;
}
and the kernel file
__kernel void vector_add(__global float *X1,
__global float *X2,
__global float *C) {
// Get the index of the current element
int i = get_global_id(0);
// Do the operation
C[i] = X1[i] + X2[i];
}
The result was
0.000000 + 0.000000 = 0.000000
1.000000 + 1.000000 = 0.000000
2.000000 + 2.000000 = 0.000000
3.000000 + 3.000000 = 0.000000
4.000000 + 4.000000 = 0.000000
5.000000 + 5.000000 = 0.000000
6.000000 + 6.000000 = 0.000000
7.000000 + 7.000000 = 0.000000
8.000000 + 8.000000 = 0.000000
9.000000 + 9.000000 = 0.000000
1.000000 + 1.000000 = 0.000000
ALL Time taken: 0.07s
You've committed one of the cardinal sins of OpenCL programming, in that you are not checking the error codes from any of your OpenCL API calls! You should always check the return code from every single OpenCL API call. If you did this, it would point you towards the problem very quickly.
The problem is in your kernel enqueue call. If you check the error code, you'll see that you are getting -54 back, which corresponds to CL_INVALID_WORK_GROUP_SIZE. Specifically, kernel invocations have the requirement that the work-group size (local size) exactly divides the global size. You are asking for a work-group size of 64 and a global size of 11, which does not fulfil this requirement.
You can also pass NULL as the work-group size parameter, and the OpenCL implementation will pick a work-group size that will definitely work on your behalf.
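For example, a minimal sketch of both fixes (CHECK is a hypothetical helper macro; variable names as in the code above):
#define CHECK(err) do { if ((err) != CL_SUCCESS) { \
    fprintf(stderr, "OpenCL error %d at line %d\n", (int)(err), __LINE__); exit(1); } } while (0)

// Option 1: let the implementation pick the work-group size (no divisibility constraint to satisfy).
CHECK(clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
                             &global_item_size, NULL, 0, NULL, NULL));

// Option 2: keep local_item_size = 64, but round the global size up to a multiple of it and
// guard the kernel body with `if (i < n)` (which needs the element count as an extra argument):
// size_t padded = ((global_item_size + local_item_size - 1) / local_item_size) * local_item_size;
// CHECK(clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
//                              &padded, &local_item_size, 0, NULL, NULL));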

Concurrently initializing many arrays with random numbers using Curand and CUDA kernel

I am trying to initialize 100 elements of each of these parallel arrays with randomly generated numbers, concurrently on the GPU. However, my routine is not producing a variety of random numbers: when I debug the code in Visual Studio, I see the same number for every element in the array. The goal of this code is to optimize the CImg FilledTriangles routine to use the GPU where it can.
What am I doing wrong and how can I fix it? Here is my code:
__global__ void initCurand(curandState* state, unsigned long seed){
int idx = threadIdx.x + blockIdx.x * blockDim.x;
curand_init(seed, idx, 0, &state[idx]);
__syncthreads();
}
/*
* CUDA kernel that will execute 100 threads in parallel
*/
__global__ void initializeArrays(float* posx, float* posy,float* rayon, float* veloc, float* opacity
,float * angle, unsigned char** color, int height, int width, curandState* state){
int idx = threadIdx.x + blockIdx.x * blockDim.x;
curandState localState = state[idx];
__syncthreads();
posx[idx] = (float)(curand_uniform(&localState)*width);
posy[idx] = (float)(curand_uniform(&localState)*height);
rayon[idx] = (float)(10 + curand_uniform(&localState)*50);
angle[idx] = (float)(curand_uniform(&localState)*360);
veloc[idx] = (float)(curand_uniform(&localState)*20 - 10);
color[idx][0] = (unsigned char)(curand_uniform(&localState)*255);
color[idx][1] = (unsigned char)(curand_uniform(&localState)*255);
color[idx][2] = (unsigned char)(curand_uniform(&localState)*255);
opacity[idx] = (float)(0.3 + 1.5*curand_uniform(&localState));
}
Here is the host code that prepares and calls these kernels: I am trying to create 100 threads (for each element) on one block in a grid.
// launch grid of threads
dim3 dimBlock(100);
dim3 dimGrid(1);
initCurand<<<dimBlock,dimGrid>>>(devState, unsigned(time(nullptr)));
// synchronize the device and the host
cudaDeviceSynchronize();
initializeArrays<<<dimBlock, dimGrid>>>(d_posx, d_posy, d_rayon, d_veloc, d_opacity, d_angle,d_color, img0.height(), img0.width(), devState);
Preliminaries:
// Define random properties (pos, size, colors, ..) for all triangles that will be displayed.
float posx[100], posy[100], rayon[100], angle[100], veloc[100], opacity[100];
// Define the same properties but for the device
float* d_posx;
float* d_posy;
float* d_rayon;
float* d_angle;
float* d_veloc;
float* d_opacity;
//unsigned char d_color[100][3];
unsigned char** d_color;
curandState* devState;
cudaError_t err;
// allocate memory on the device for the device arrays
err = cudaMalloc((void**)&d_posx, 100 * sizeof(float));
err = cudaMalloc((void**)&d_posy, 100 * sizeof(float));
err = cudaMalloc((void**)&d_rayon, 100 * sizeof(float));
err = cudaMalloc((void**)&d_angle, 100 * sizeof(float));
err = cudaMalloc((void**)&d_veloc, 100 * sizeof(float));
err = cudaMalloc((void**)&d_opacity, 100 * sizeof(float));
err = cudaMalloc((void**)&devState, 100*sizeof(curandState));
errCheck(err);
size_t pitch;
//allocated the device memory for source array
err = cudaMallocPitch(&d_color, &pitch, 3 * sizeof(unsigned char),100);
Getting the results:
// get the populated arrays back to the host for use
err = cudaMemcpy(posx,d_posx, 100 * sizeof(float), cudaMemcpyDeviceToHost);
err = cudaMemcpy(posy,d_posy, 100 * sizeof(float), cudaMemcpyDeviceToHost);
err = cudaMemcpy(rayon,d_rayon, 100 * sizeof(float), cudaMemcpyDeviceToHost);
err = cudaMemcpy(veloc,d_veloc, 100 * sizeof(float), cudaMemcpyDeviceToHost);
err = cudaMemcpy(opacity,d_opacity, 100 * sizeof(float), cudaMemcpyDeviceToHost);
err = cudaMemcpy(angle,d_angle, 100 * sizeof(float), cudaMemcpyDeviceToHost);
err = cudaMemcpy2D(color,pitch,d_color,100, 100 *sizeof(unsigned char),3, cudaMemcpyDeviceToHost);
You will definitely need to make a change from this:
err = cudaMalloc((void**)&devState, 100*sizeof(float));
to this:
err = cudaMalloc((void**)&devState, 100*sizeof(curandState));
If you ran your code through cuda-memcheck, you would have discovered this. Your initCurand kernel had plenty of out-of-bounds accesses due to this.
You should also be doing error checking on all cuda calls and all kernel launches. I believe your second kernel call is failing due to a messed up operation on your color[][] array.
Normally when we create an array with cudaMallocPitch, we need to access it using the pitch parameter. C doubly-subscripted arrays by themselves won't work, because C has no inherent knowledge of the actual array width.
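For reference, the usual way to index a pitched allocation inside a kernel is to offset the base pointer by rowIndex * pitch bytes; applied to the color array it looks like this (a sketch, equivalent to the indexing used in the fixed kernel below):
// each logical row of the pitched allocation starts idx * pitch bytes past the base pointer
unsigned char* row = (unsigned char*)((char*)color + idx * pitch);
row[0] = (unsigned char)(curand_uniform(&localState)*255);
row[1] = (unsigned char)(curand_uniform(&localState)*255);
row[2] = (unsigned char)(curand_uniform(&localState)*255);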
I was able to fix it by making the following changes:
__global__ void initializeArrays(float* posx, float* posy,float* rayon, float* veloc, float* opacity,float * angle, unsigned char* color, int height, int width, curandState* state, size_t pitch){
int idx = threadIdx.x + blockIdx.x * blockDim.x;
curandState localState = state[idx];
__syncthreads();
posx[idx] = (float)(curand_uniform(&localState)*width);
posy[idx] = (float)(curand_uniform(&localState)*height);
rayon[idx] = (float)(10 + curand_uniform(&localState)*50);
angle[idx] = (float)(curand_uniform(&localState)*360);
veloc[idx] = (float)(curand_uniform(&localState)*20 - 10);
color[idx*pitch] = (unsigned char)(curand_uniform(&localState)*255);
color[(idx*pitch)+1] = (unsigned char)(curand_uniform(&localState)*255);
color[(idx*pitch)+2] = (unsigned char)(curand_uniform(&localState)*255);
opacity[idx] = (float)(0.3 + 1.5*curand_uniform(&localState));
}
and
initializeArrays<<<dimBlock, dimGrid>>>(d_posx, d_posy, d_rayon, d_veloc, d_opacity, d_angle,d_color, img0.height(), img0.width(), devState, pitch);
and
unsigned char* d_color;
With those changes, I was able to eliminate the errors I found, and the code spat out a variety of random values. I haven't inspected all the values, but that should get you started.

Piecemeal processing of a matrix - CUDA

OK, so let's say I have an (N x N) matrix that I would like to process. This matrix is quite large for my computer, and if I try to send it to the device all at once I get an 'out of memory' error.
So is there a way to send sections of the matrix to the device? One way I can see to do it is to copy portions of the matrix on the host, send these manageable portions from the host to the device, and then put them back together at the end.
Here is something I have tried, but the cudaMemcpy in the for loop returns error code 11, 'invalid argument.'
int h_N = 10000;
size_t h_size_m = h_N*sizeof(float);
h_A = (float*)malloc(h_size_m*h_size_m);
int d_N = 2500;
size_t d_size_m = d_N*sizeof(float);
InitializeMatrices(h_N);
int i;
int iterations = (h_N*h_N)/(d_N*d_N);
for( i = 0; i < iterations; i++ )
{
float* h_array_ref = h_A+(i*d_N*d_N);
cudasafe( cudaMemcpy(d_A, h_array_ref, d_size_m*d_size_m, cudaMemcpyHostToDevice), "cudaMemcpy");
cudasafe( cudaFree(d_A), "cudaFree(d_A)" );
}
What I'm trying to accomplish with the above code is this: instead of sending the entire matrix to the device, I simply send a pointer to a place within that matrix and reserve enough space on the device to do the work; then, with the next iteration of the loop, I move the pointer forward within the matrix, and so on.
Not only can you do this (assuming your problem is easily decomposed this way into sub-arrays), it can be a very useful thing to do for performance; once you get the basic approach you've described working, you can start using asynchronous memory copies and double-buffering to overlap some of the memory transfer time with the time spent computing what is already on-card.
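A rough sketch of what that overlap could look like, under the assumptions that x and ycuda are allocated as pinned memory with cudaMallocHost and that xd and yd are replaced by two device buffers each (xd[0]/xd[1], yd[0]/yd[1]); error checking omitted:
cudaStream_t stream[2];
cudaStreamCreate(&stream[0]);
cudaStreamCreate(&stream[1]);

int nbatches = 0;
for (int nstart = 0; nstart < n; nstart += batchsize, nbatches++) {
    int size = batchsize;
    if ((nstart + batchsize) > n) size = n - nstart;
    int buf = nbatches % 2;                      /* ping-pong between the two buffers */

    /* upload, compute and download for this batch all go into the same stream;    */
    /* work queued in the other stream for the previous batch can overlap with it. */
    cudaMemcpyAsync(xd[buf], &(x[nstart]), size*sizeof(float),
                    cudaMemcpyHostToDevice, stream[buf]);
    int blocksize = (size+nblocks-1)/nblocks;
    cuda_saxpb<<<nblocks, blocksize, 0, stream[buf]>>>(xd[buf], a, b, yd[buf], size);
    cudaMemcpyAsync(&(ycuda[nstart]), yd[buf], size*sizeof(float),
                    cudaMemcpyDeviceToHost, stream[buf]);
}
cudaDeviceSynchronize();
cudaStreamDestroy(stream[0]);
cudaStreamDestroy(stream[1]);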
But first one gets the simple thing working. Below is a 1d example (multiplying a vector by a scalar and adding another scalar) but using a linearized 2d array would be the same; the key part is
CHK_CUDA( cudaMalloc(&xd, batchsize*sizeof(float)) );
CHK_CUDA( cudaMalloc(&yd, batchsize*sizeof(float)) );
tick(&gputimer);
int nbatches = 0;
for (int nstart=0; nstart < n; nstart+=batchsize) {
int size=batchsize;
if ((nstart + batchsize) > n) size = n - nstart;
CHK_CUDA( cudaMemcpy(xd, &(x[nstart]), size*sizeof(float), cudaMemcpyHostToDevice) );
blocksize = (size+nblocks-1)/nblocks;
cuda_saxpb<<<nblocks, blocksize>>>(xd, a, b, yd, size);
CHK_CUDA( cudaMemcpy(&(ycuda[nstart]), yd, size*sizeof(float), cudaMemcpyDeviceToHost) );
nbatches++;
}
gputime = tock(&gputimer);
CHK_CUDA( cudaFree(xd) );
CHK_CUDA( cudaFree(yd) );
You allocate the buffers at the start, and then loop through until you're done, each time doing the copy, starting the kernel, and then copying back. You free at the end.
The full code is
#include <stdio.h>
#include <stdlib.h>
#include <getopt.h>
#include <cuda.h>
#include <sys/time.h>
#include <math.h>
#define CHK_CUDA(e) {if (e != cudaSuccess) {fprintf(stderr,"Error: %s\n", cudaGetErrorString(e)); exit(-1);}}
__global__ void cuda_saxpb(const float *xd, const float a, const float b,
float *yd, const int n) {
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i<n) {
yd[i] = a*xd[i]+b;
}
return;
}
void cpu_saxpb(const float *x, float a, float b, float *y, int n) {
int i;
for (i=0;i<n;i++) {
y[i] = a*x[i]+b;
}
return;
}
int get_options(int argc, char **argv, int *n, int *s, int *nb, float *a, float *b);
void tick(struct timeval *timer);
double tock(struct timeval *timer);
int main(int argc, char **argv) {
int n=1000;
int nblocks=10;
int batchsize=100;
float a = 5.;
float b = -1.;
int err;
float *x, *y, *ycuda;
float *xd, *yd;
double abserr;
int blocksize;
int i;
struct timeval cputimer;
struct timeval gputimer;
double cputime, gputime;
err = get_options(argc, argv, &n, &batchsize, &nblocks, &a, &b);
if (batchsize > n) {
fprintf(stderr, "Resetting batchsize to size of vector, %d\n", n);
batchsize = n;
}
if (err) return 0;
x = (float *)malloc(n*sizeof(float));
if (!x) return 1;
y = (float *)malloc(n*sizeof(float));
if (!y) {free(x); return 1;}
ycuda = (float *)malloc(n*sizeof(float));
if (!ycuda) {free(y); free(x); return 1;}
/* run CPU code */
tick(&cputimer);
cpu_saxpb(x, a, b, y, n);
cputime = tock(&cputimer);
/* run GPU code */
/* only have to allocate once */
CHK_CUDA( cudaMalloc(&xd, batchsize*sizeof(float)) );
CHK_CUDA( cudaMalloc(&yd, batchsize*sizeof(float)) );
tick(&gputimer);
int nbatches = 0;
for (int nstart=0; nstart < n; nstart+=batchsize) {
int size=batchsize;
if ((nstart + batchsize) > n) size = n - nstart;
CHK_CUDA( cudaMemcpy(xd, &(x[nstart]), size*sizeof(float), cudaMemcpyHostToDevice) );
blocksize = (size+nblocks-1)/nblocks;
cuda_saxpb<<<nblocks, blocksize>>>(xd, a, b, yd, size);
CHK_CUDA( cudaMemcpy(&(ycuda[nstart]), yd, size*sizeof(float), cudaMemcpyDeviceToHost) );
nbatches++;
}
gputime = tock(&gputimer);
CHK_CUDA( cudaFree(xd) );
CHK_CUDA( cudaFree(yd) );
abserr = 0.;
for (i=0;i<n;i++) {
abserr += fabs(ycuda[i] - y[i]);
}
printf("Y = a*X + b, problemsize = %d\n", n);
printf("CPU time = %lg millisec.\n", cputime*1000.);
printf("GPU time = %lg millisec (done with %d batches of %d).\n",
gputime*1000., nbatches, batchsize);
printf("CUDA and CPU results differ by %lf\n", abserr);
free(x);
free(y);
free(ycuda);
return 0;
}
int get_options(int argc, char **argv, int *n, int *s, int *nb, float *a, float *b) {
const struct option long_options[] = {
{"nvals" , required_argument, 0, 'n'},
{"nblocks" , required_argument, 0, 'B'},
{"batchsize" , required_argument, 0, 's'},
{"a", required_argument, 0, 'a'},
{"b", required_argument, 0, 'b'},
{"help", no_argument, 0, 'h'},
{0, 0, 0, 0}};
char c;
int option_index;
int tempint;
while (1) {
c = getopt_long(argc, argv, "n:B:a:b:s:h", long_options, &option_index);
if (c == -1) break;
switch(c) {
case 'n': tempint = atoi(optarg);
if (tempint < 1 || tempint > 500000) {
fprintf(stderr,"%s: Cannot use number of points %s;\n Using %d\n", argv[0], optarg, *n);
} else {
*n = tempint;
}
break;
case 's': tempint = atoi(optarg);
if (tempint < 1 || tempint > 50000) {
fprintf(stderr,"%s: Cannot use number of points %s;\n Using %d\n", argv[0], optarg, *s);
} else {
*s = tempint;
}
break;
case 'B': tempint = atoi(optarg);
if (tempint < 1 || tempint > 1000 || tempint > *n) {
fprintf(stderr,"%s: Cannot use number of blocks %s;\n Using %d\n", argv[0], optarg, *nb);
} else {
*nb = tempint;
}
break;
case 'a': *a = atof(optarg);
break;
case 'b': *b = atof(optarg);
break;
case 'h':
puts("Calculates y[i] = a*x[i] + b on the GPU.");
puts("Options: ");
puts(" --nvals=N (-n N): Set the number of values in y,x.");
puts(" --batchsize=N (-s N): Set the number of values to transfer at a time.");
puts(" --nblocks=N (-B N): Set the number of blocks used.");
puts(" --a=X (-a X): Set the parameter a.");
puts(" --b=X (-b X): Set the parameter b.");
puts(" --niters=N (-I X): Set number of iterations to calculate.");
puts("");
return +1;
}
}
return 0;
}
void tick(struct timeval *timer) {
gettimeofday(timer, NULL);
}
double tock(struct timeval *timer) {
struct timeval now;
gettimeofday(&now, NULL);
return (now.tv_usec-timer->tv_usec)/1.0e6 + (now.tv_sec - timer->tv_sec);
}
Running this one gets:
$ ./batched-saxpb --nvals=10240 --batchsize=10240 --nblocks=20
Y = a*X + b, problemsize = 10240
CPU time = 0.072 millisec.
GPU time = 0.117 millisec (done with 1 batches of 10240).
CUDA and CPU results differ by 0.000000
$ ./batched-saxpb --nvals=10240 --batchsize=5120 --nblocks=20
Y = a*X + b, problemsize = 10240
CPU time = 0.066 millisec.
GPU time = 0.133 millisec (done with 2 batches of 5120).
CUDA and CPU results differ by 0.000000
$ ./batched-saxpb --nvals=10240 --batchsize=2560 --nblocks=20
Y = a*X + b, problemsize = 10240
CPU time = 0.067 millisec.
GPU time = 0.167 millisec (done with 4 batches of 2560).
CUDA and CPU results differ by 0.000000
The GPU time goes up in this case (we're doing more memory copies) but the answers stay the same.
Edited: The original version of this code had an option for running multiple iterations of the kernel for timing purposes, but that's unnecessarily confusing in this context so it's removed.

opencl program build failed

The following is the code I wrote for adding two 2D arrays. It compiles, but when I try to run it it shows 'Error: Failed to build program executable!' and a runtime of 0.000000.
Why is it that the program isn't built?
And also, why isn't the build log that I queried getting displayed?
Since I am just initialising the arrays, I store them directly in 1D arrays and have not shown the conversion from 2D to 1D.
code:
# include <stdio.h>
#include <stdlib.h>
#ifdef APPLE
#include<OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define order 1000
#define MAX_SOURCE_SIZE (0x100000)
int main(int argc, char **argv)
{
float *A;
float *B;
float *C;
int n,m,p;
int err;
int szA, szB,szC;
cl_device_id device_id;
cl_context context;
cl_command_queue commands;
cl_program program;
cl_kernel kernel;
cl_uint nd;
cl_mem a_in;
cl_mem b_in;
cl_mem c_out;
int i,j;
n=order;
m=order;
p=order;
size_t global[2];
nd=1;
cl_uint numPlatforms;
cl_platform_id firstPlatformId;
szA=n*p;
szB=p*m;
szC=n*m;
A=(float *)malloc(sizeof(float)*szA);
B=(float *)malloc(sizeof(float)*szB);
C=(float *)malloc(sizeof(float)*szC);
for(i=0; i<order; i++)
for(j=0; j<order; j++)
{
A[i*m+j]=i;
B[i*m+j]=i;
}
FILE *fp;
char fileName[] = "./array_add_kernel.cl";
char *source_str;
size_t source_size;
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
err=clGetPlatformIDs(1, &firstPlatformId, &numPlatforms);
err=clGetDeviceIDs(firstPlatformId, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
cl_context_properties conpro[]={ CL_CONTEXT_PLATFORM,(cl_context_properties) firstPlatformId, 0};
context=clCreateContext(conpro, 1, &device_id, NULL, NULL, &err);
commands=clCreateCommandQueue(context, device_id,CL_QUEUE_PROFILING_ENABLE, &err);
a_in= clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*szA, NULL, NULL);
b_in= clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*szB, NULL, NULL);
c_out= clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float)*szC, NULL, NULL);
program= clCreateProgramWithSource(context, 1, (const char**)&source_str,(const size_t *)&source_size, &err);
err= clBuildProgram(program,0, NULL, NULL, NULL, NULL );
if(err!= CL_SUCCESS)
{
size_t len;
char buffer[2048];
printf("Error:Failed to build program executable!");
clGetProgramBuildInfo(program,device_id,CL_PROGRAM_BUILD_LOG,sizeof(buffer),buffer,&len);
printf("%s \n",buffer);
}
kernel= clCreateKernel(program, "array_add_kernel", &err);
err= 0;
err= clSetKernelArg(kernel, 0, sizeof(int), &n);
err|= clSetKernelArg(kernel, 1, sizeof(int), &p);
err|= clSetKernelArg(kernel, 2, sizeof(int), &m);
err|= clSetKernelArg(kernel, 3, sizeof(cl_mem), &a_in);
err|= clSetKernelArg(kernel, 4, sizeof(cl_mem), &b_in);
err|= clSetKernelArg(kernel, 5, sizeof(cl_mem), &c_out);
err=clEnqueueWriteBuffer(commands, a_in, CL_TRUE, 0, sizeof(float)*szA, A, 0, NULL, NULL);
err= clEnqueueWriteBuffer(commands, a_in, CL_TRUE, 0, sizeof(float)*szB, B, 0, NULL, NULL);
cl_event prof_event;
global[0]= (size_t)n;
global[1]=(size_t)m;
err=clEnqueueNDRangeKernel(commands, kernel, nd, NULL, global, NULL, 0, NULL, &prof_event);
clFinish(commands);
cl_ulong ev_start_time=(cl_ulong)0;
cl_ulong ev_end_time=(cl_ulong)0;
size_t ret_size;
err= clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ev_start_time, NULL);
err= clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ev_end_time, NULL);
err=clEnqueueReadBuffer(commands,c_out,CL_TRUE,0,sizeof(float)*szC,C,0,NULL,NULL);
cl_float runtime=(ev_end_time-ev_start_time)*1.0e-9;
printf("Runtime:%f ",runtime);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseMemObject(a_in);
clReleaseMemObject(b_in);
clReleaseMemObject(c_out);
clReleaseCommandQueue(commands);
clReleaseContext(context);
}
kernel:
kernel void array_add_kernel(
const int n, const int m, const p, _global const float * A, _global const float * B, , _global float * C )
{
int i= get_global_id(0);
int j= get_global_id(1);
C[i*m + j] = A[i*m + j] + B[i*m + j];
}
Fix your kernel. It's filled with errors.
kernel void array_add_kernel(
const int n,
const int m,
const p, // No type specifier
_global const float * A, // Should be global, not _global
_global const float * B, , // Double comma
_global float * C )
{
int i= get_global_id(0);
int j= get_global_id(1);
C[i*m + j] = A[i*m + j] + B[i*m + j];
}
This is the working kernel.
kernel void array_add_kernel(const int n, const int m, global const float * A, global const float * B, global float * C )
{
int i= get_global_id(0);
int j= get_global_id(1);
C[i*m + j] = A[i*m + j] + B[i*m + j];
}
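Note that if the unused p parameter is dropped as in the working kernel above, the host-side clSetKernelArg indices have to be shifted to match the new parameter order; a sketch of the adjusted calls:
err = clSetKernelArg(kernel, 0, sizeof(int), &n);
err|= clSetKernelArg(kernel, 1, sizeof(int), &m);
err|= clSetKernelArg(kernel, 2, sizeof(cl_mem), &a_in);
err|= clSetKernelArg(kernel, 3, sizeof(cl_mem), &b_in);
err|= clSetKernelArg(kernel, 4, sizeof(cl_mem), &c_out);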
