Sorry if I'm asking a really simple question, but I couldn't find any related answer on the net.
I'm trying to perform two matrix multiplications on the GPU using OpenCL, like R = ABC. The way I do it is by multiplying A and B, storing the intermediate result (I) in the GPU's DRAM, and then doing another matrix multiplication between I and C. My current kernel can also handle batches of matrices.
Here is the code for my MatMul kernel (it is just a basic version, without optimizations):
__kernel void MatrixMultiplication2 (const __global float* A,
                                     const __global float* B,
                                     __global float* C,
                                     const int A_height, const int A_width,
                                     const int B_height, const int B_width,
                                     const int C_height, const int C_width) {
    const int row = get_global_id (0);
    const int col = get_global_id (1);
    const int imgID = get_global_id (2);
    const int offsetA = A_height * A_width * imgID;
    const int offsetB = B_height * B_width * imgID;
    const int offsetC = C_height * C_width * imgID;
    float acc = 0.0f;
    for (int k = 0; k < A_width; k++) {
        acc += A[row*A_width + k + offsetA] * B[k*B_width + col + offsetB];
    }
    C[row*C_width + col + offsetC] = acc;
}
Here is also the relevant part of my host code for launching the matrix multiplications:
const size_t GWS1[3] = {A_height, B_width, batch_size};
const size_t LWS1[3] = {32, 32, 1};
const size_t GWS2[3] = {A_height, C_width, batch_size};
const size_t LWS2[3] = {32, 32, 1};
Event evKernel1 ("Kernel1");
Event evKernel2 ("Kernel2");
clFinish (queue);
high_resolution_clock::time_point t1 = high_resolution_clock::now();
err = clEnqueueNDRangeKernel (queue,
kernel1,
3,
NULL,
GWS1,
LWS1,
0,
NULL,
&evKernel1.CLEvent());
clFinish (queue);
CL_CHECK_ERROR (err);
err = clFinish (queue);
CL_CHECK_ERROR (err);
err = clWaitForEvents (1, &evKernel1.CLEvent());
CL_CHECK_ERROR (err);
evKernel1.FillTimingInfo ();
err = clEnqueueNDRangeKernel (queue,
kernel2,
3,
NULL,
GWS2,
LWS2,
0,
NULL,
&evKernel2.CLEvent());
clFinish (queue);
CL_CHECK_ERROR (err);
err = clFinish (queue);
CL_CHECK_ERROR (err);
err = clWaitForEvents (1, &evKernel2.CLEvent());
CL_CHECK_ERROR (err);
high_resolution_clock::time_point t2 = high_resolution_clock::now();
evKernel2.FillTimingInfo ();
readbackMemObject (queue, &mem_R, (int) sizeof (T), RSize, hostMem_R);
Is this the right way to do multi-stage matrix multiplication on the GPU? I deploy the first stage, wait until it finishes, and then deploy the second one. The performance is really low in this scenario, and when I increase the batch size it drops even further. I just want to know whether this is the normal way of doing sequential matrix multiplications or not.
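For completeness, this is roughly how the two stages share the intermediate buffer through the kernel arguments (the buffer names below are placeholders, not my exact identifiers; both kernels are instances of the MatrixMultiplication2 kernel above):
// Sketch of the argument setup: kernel1 computes I = A*B, kernel2 computes
// R = I*C, with mem_I staying in device memory between the two stages.
clSetKernelArg (kernel1, 0, sizeof (cl_mem), &mem_A);
clSetKernelArg (kernel1, 1, sizeof (cl_mem), &mem_B);
clSetKernelArg (kernel1, 2, sizeof (cl_mem), &mem_I);   // intermediate result I = A*B
// ... dimension arguments 3-8 for the A*B shapes ...
clSetKernelArg (kernel2, 0, sizeof (cl_mem), &mem_I);
clSetKernelArg (kernel2, 1, sizeof (cl_mem), &mem_C);
clSetKernelArg (kernel2, 2, sizeof (cl_mem), &mem_R);   // final result R = I*C
// ... dimension arguments 3-8 for the I*C shapes ...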
Thanks,
I created a basic snippet:
Kernel:
__kernel void
kernel1(__global int* a, __global int* b, __global int* c, int size)
{
    int idx = get_global_id(0);
    if (idx >= 0 && idx < size){
        c[idx] = a[idx] + b[idx];
    }
}
Code:
#include <CL/cl.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAX_FILE_SIZE 1024000
#include <sys/stat.h>
#include <sys/types.h>
typedef enum ocl_type_e_t {
OCL_TYPE_NULL = 0,
OCL_TYPE_CPU = 1,
OCL_TYPE_GPU = 2,
OCL_TYPE_IGPU = 3,
OCL_TYPE_ACC = 4
} ocl_type_e_t;
const char*
cl_device_type_to_str(cl_device_type type)
{
static char* strings[] = {
"(invalid)", // invalid
"CL_DEVICE_TYPE_CPU",
"CL_DEVICE_TYPE_GPU",
"CL_DEVICE_TYPE_ACCELERATOR",
"CL_DEVICE_TYPE_CUSTOM",
"CL_DEVICE_TYPE_DEFAULT",
"CL_DEVICE_TYPE_ALL",
};
char* ret;
switch (type) {
case CL_DEVICE_TYPE_CPU:
ret = strings[1];
break;
case CL_DEVICE_TYPE_GPU:
ret = strings[2];
break;
case CL_DEVICE_TYPE_ACCELERATOR:
ret = strings[3];
break;
case CL_DEVICE_TYPE_CUSTOM:
ret = strings[4];
break;
case CL_DEVICE_TYPE_DEFAULT:
ret = strings[5];
break;
case CL_DEVICE_TYPE_ALL:
ret = strings[6];
break;
default:
ret = strings[0];
break;
}
return ret;
}
const char*
file_read(char* const path)
{
struct stat st;
/* st = (struct stat*)malloc(sizeof(stat)); */
int error = stat(path, &st);
if (error != 0) {
printf("Invalid file %s\n", path);
exit(EXIT_FAILURE);
}
int size_file = st.st_size;
if (size_file > MAX_FILE_SIZE) {
printf("File %s is bigger than the max allowed size (%d > %d bytes)\n",
path, size_file, MAX_FILE_SIZE);
exit(EXIT_FAILURE);
}
FILE* fp = fopen(path, "r");
if (fp == NULL) {
printf("Error opening the file %s\n", path);
exit(EXIT_FAILURE);
}
// allocate one extra byte so the source can be NUL-terminated
// (clCreateProgramWithSource is later called with NULL lengths)
char* const buf = (char* const)malloc(size_file + 1);
if (buf == NULL) {
printf("Error allocating %d bytes for the contents of the file %s\n",
size_file, path);
exit(EXIT_FAILURE);
}
int size_read;
while ((size_read = fread(buf, sizeof(char), size_file, fp)) > 0) {
;
}
buf[size_file] = '\0';
fclose(fp);
return buf;
}
cl_event clb_events_waiting[100];
int clb_events_waiting_device[100];
int clb_events_init_read[100];
int clb_num_events_waiting = 0;
void
clbWaitEvents(int * c)
{
if (clb_num_events_waiting > 0){
printf("About to wait events: %d\n", clb_num_events_waiting);
int i;
int waiting = 0;
cl_event ev_waiting[100];
printf("%d = CL_QUEUED, %d = CL_COMPLETE, %d = CL_SUBMITTED, %d = CL_RUNNING\n", CL_QUEUED, CL_COMPLETE, CL_SUBMITTED, CL_RUNNING);
for (i=0; i<clb_num_events_waiting; i++){
cl_int ret;
clGetEventInfo(clb_events_waiting[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &ret, NULL);
int dev = clb_events_waiting_device[i];
int init = clb_events_init_read[i] / sizeof(int);
printf("cl_event %s init %6d [%d] = status %d (ref %p)\n", dev == 0 ? "CPU" : (dev == 1 ? "GPU" : "ACC"), init, i, ret, (void*)clb_events_waiting[i]);
if (ret != CL_COMPLETE){
ev_waiting[waiting] = clb_events_waiting[i];
waiting++;
}
}
for (i=0; i<clb_num_events_waiting; i++){
int dev = clb_events_waiting_device[i];
int init = clb_events_init_read[i] / sizeof(int);
printf("%s [%d] = %d, [%d] = %d, [%d] = %d\n", dev == 0 ? "CPU" : (dev == 1 ? "GPU" : "ACC"), init, c[init], init + 1, c[init + 1], init + 2, c[init + 2]);
}
if (waiting > 0){
printf("about to wait %d events\n", waiting);
clWaitForEvents(waiting, ev_waiting);
printf("wait events finished\n");
}
/* clWaitForEvents(clb_num_events_waiting, clb_events_waiting); */
}
}
typedef struct callback_data
{
cl_command_queue* queue;
cl_mem* buf_c;
int* c_v;
uint size;
cl_event* end;
bool nested_callbacks;
bool blocking;
} callback_data;
void CL_CALLBACK callback_read_fn(cl_event event, cl_int ev_status,
void* user_data);
void CL_CALLBACK callback_kernel_fn(cl_event event, cl_int ev_status,
void* user_data);
int
main(int argc, char* argv[])
{
bool use_callbacks = true;
bool use_nested_callbacks = true;
bool use_blocking = false;
int numSelPlatform = 0;
int numSelDevice = 0;
int doUseCallbacks = 0;
int doUseNestedCallbacks = 0;
int doUseBlocking = 0;
int use_type = 0;
if (argc != 7) {
printf("./%s (platform) (device) (type cpu 0|gpu 1|igpu 2|acc 3) (use "
"callbacks) (use nested callbacks) (use blocking)\n",
argv[0]);
exit(EXIT_FAILURE);
} else {
numSelPlatform = atoi(argv[1]);
numSelDevice = atoi(argv[2]);
use_type = atoi(argv[3]);
doUseCallbacks = atoi(argv[4]);
doUseNestedCallbacks = atoi(argv[5]);
doUseBlocking = atoi(argv[6]);
}
cl_event end;
uint size = 1024;
int* a_v = (int*)malloc(size * sizeof(int));
int* b_v = (int*)malloc(size * sizeof(int));
int* c_v = (int*)malloc(size * sizeof(int));
for (size_t i = 0; i < size; i++) {
a_v[i] = i;
b_v[i] = i + 1;
c_v[i] = 0;
}
const char* kernel_str = file_read("src/kernel.cl");
use_callbacks = doUseCallbacks;
use_nested_callbacks = doUseNestedCallbacks;
use_blocking = doUseBlocking ? CL_TRUE : CL_FALSE;
cl_int st;
cl_int err;
int len = 256;
char buflog[len];
cl_uint numPlatforms = 0;
st = clGetPlatformIDs(0, NULL, &numPlatforms);
cl_platform_id* platforms = NULL;
platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));
st = clGetPlatformIDs(numPlatforms, platforms, NULL);
printf("platforms: %d (%d)\n", numPlatforms, st);
cl_uint selPlatform = numSelPlatform; // 1;
numPlatforms = 1;
cl_platform_id platform = platforms[selPlatform];
clGetPlatformInfo(platform, CL_PLATFORM_NAME, len, &buflog, NULL);
if (buflog != NULL) {
printf("platform name: %s\n", buflog);
}
cl_uint numDevices = 0;
st = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
printf("num devices: %d (%d)\n", numDevices, st);
if (st != CL_SUCCESS) {
/* printf("explain error: %s\n", clErrorString(st)); */
printf("error: %d\n", st);
}
cl_device_id* devices = NULL;
devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
st = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
printf("devices: %d (%d)\n", numDevices, st);
// Context
cl_context context;
context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &err);
printf("context (%d)\n", err);
// Select device
cl_uint selDevice = numSelDevice; // 0;
numDevices = 1; // clBuildProgram
cl_device_id device = devices[selDevice];
// Device Info
clGetDeviceInfo(device, CL_DEVICE_NAME, len, &buflog, NULL);
if (buflog != NULL) {
printf("device name: %s\n", buflog);
}
cl_device_type type;
clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL);
printf("device type: %s\n", cl_device_type_to_str(type));
// events
cl_event ev_kernel;
// CommandQueue
/* cl_command_queue_properties props; */
cl_command_queue queue;
queue = clCreateCommandQueue(context, device, 0, &err);
printf("command queue (%d)\n", err);
// CreateBuffer
cl_mem buf_a;
cl_mem buf_b;
cl_mem buf_c;
ocl_type_e_t ocl_type;
if (use_type == 0) {
ocl_type = OCL_TYPE_CPU;
printf("mode CPU\n");
} else if (use_type == 1) {
ocl_type = OCL_TYPE_GPU;
printf("mode GPU\n");
} else if (use_type == 2) {
ocl_type = OCL_TYPE_IGPU;
printf("mode IGPU\n");
} else if (use_type == 3) {
ocl_type = OCL_TYPE_ACC;
printf("mode ACC\n");
}
/* cl_mem buf_x; */
switch (ocl_type) {
case OCL_TYPE_IGPU:
buf_a = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, size * sizeof(int),
a_v, &err);
/* buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE |
* CL_MEM_COPY_HOST_PTR, n * n * sizeof(int), */
/* Acpy, &err); */
break;
case OCL_TYPE_GPU:
buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(int),
a_v, &err);
break;
case OCL_TYPE_ACC:
buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
size * sizeof(int), a_v, &err);
break;
case OCL_TYPE_CPU:
buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
size * sizeof(int), a_v, &err);
break;
default:
printf("no ocl_type defined\n");
exit(EXIT_FAILURE);
break;
}
printf("create buffer a (%d)\n", err);
if (err != CL_SUCCESS) {
/* printf("create buffer error: %s\n", clErrorString(err)); */
printf("create buffer error: %d\n", err);
}
switch (ocl_type) {
case OCL_TYPE_IGPU:
buf_b = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, size * sizeof(int),
b_v, &err);
break;
case OCL_TYPE_GPU:
buf_b = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(int),
b_v, &err);
break;
case OCL_TYPE_ACC:
buf_b = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
size * sizeof(int), b_v, &err);
break;
case OCL_TYPE_CPU:
buf_b = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
size * sizeof(int), b_v, &err);
break;
default:
printf("no ocl_type defined\n");
exit(EXIT_FAILURE);
break;
}
printf("create buffer b (%d)\n", err);
if (err != CL_SUCCESS) {
printf("create buffer error: %d\n", err);
/* printf("create buffer error: %s\n", clErrorString(err)); */
}
switch (ocl_type) {
case OCL_TYPE_IGPU:
buf_c = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, size * sizeof(int),
c_v, &err);
/* buf_c = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, c_rows * c_cols *
* sizeof(int), */
/* c_v, &err); */
/* buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE |
* CL_MEM_COPY_HOST_PTR, n * n * sizeof(int), */
/* Acpy, &err); */
break;
case OCL_TYPE_GPU:
buf_c = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(int),
c_v, &err);
break;
case OCL_TYPE_ACC:
buf_c = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
size * sizeof(int), c_v, &err);
break;
case OCL_TYPE_CPU:
buf_c = clCreateBuffer(context, CL_MEM_READ_WRITE |
CL_MEM_USE_HOST_PTR,
/* buf_c = */
/* clCreateBuffer(context, CL_MEM_USE_HOST_PTR, */
/* buf_c = clCreateBuffer(context, CL_MEM_READ_WRITE, */
size * sizeof(int), c_v, &err);
break;
default:
printf("no ocl_type defined\n");
exit(EXIT_FAILURE);
break;
}
printf("create buffer c (%d)\n", err);
if (err != CL_SUCCESS) {
/* printf("create buffer error: %s\n", clErrorString(err)); */
printf("create buffer error: %d\n", err);
}
/* b_x = clCreateBuffer(context, CL_MEM_WRITE_ONLY, n * sizeof(float), x,
* &err); */
/* printf("create buffer x (%d)\n", err); */
// WriteBuffer
/* st = clEnqueueWriteBuffer(queue, b_a, CL_FALSE, 0, n * n * sizeof(float),
*/
/* Acpy, 0, NULL, NULL); */
/* printf("write buffer Acpy - b_a (%d)\n", st); */
/* st = clEnqueueWriteBuffer(queue, b_b, CL_FALSE, 0, n * sizeof(float), bcpy,
* 0, */
/* NULL, NULL); */
/* printf("write buffer bcpy - b_b (%d)\n", st); */
// Create Program
cl_program program;
program = clCreateProgramWithSource(context, 1, (const char**)&kernel_str,
NULL, &err);
printf("create program (%d)\n", err);
// Build Program
/* st = clBuildProgram(program, numDevices, (cl_device_id*)&device, NULL,
* NULL, */
/* NULL); */
char* opts = "-Werror";
st = clBuildProgram(program, numDevices, (cl_device_id*)&device, opts, NULL,
NULL);
printf("build program (%d)\n", st);
if (st != CL_SUCCESS) {
/* printf("build status: %s\n", clErrorString(st)); */
printf("build status: %d\n", st);
char log[512];
st = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 512, &log,
NULL);
printf("build info (%d)\n", st);
if (st == CL_SUCCESS) {
printf("%s\n", log);
}
}
// Create Kernel
cl_kernel kernel1;
kernel1 = clCreateKernel(program, "kernel1", &st);
printf("create kernel1 (%d)\n", st);
/* cl_kernel kernel2; */
/* kernel2 = clCreateKernel(program, "ocl1_2", &st); */
/* printf("create kernel2 (%d)\n", st); */
// workgroup size
size_t dims = 1;
size_t gws[] = { 1, 1, 1 };
/* size_t gws[dims]; */
gws[0] = size; // a_rows;
/* gws[0] = 32; */
/* size_t* lws = NULL; */
/* size_t lws[dims]; */
/* size_t lws[dims]; */
/* size_t lws[dims] = NULL; */
/* size_t lws[] = {0, 0, 0}; */
size_t lws[] = { 128, 1, 1 };
printf("gws {%lu, %lu, %lu}\n", gws[0], gws[1], gws[2]);
if (lws != NULL) {
printf("lws {%lu, %lu, %lu}\n", lws[0], lws[1], lws[2]);
} else {
printf("lws unspecified\n");
}
// Set Kernel Args
st = clSetKernelArg(kernel1, 0, sizeof(cl_mem), &buf_a);
printf("set arg %d (%d)\n", 0, st);
st = clSetKernelArg(kernel1, 1, sizeof(cl_mem), &buf_b);
printf("set arg %d (%d)\n", 1, st);
/* printf("set kernel1 arg: %d (%d)\n", 0, st); */
st = clSetKernelArg(kernel1, 2, sizeof(cl_mem), &buf_c);
printf("set arg %d (%d)\n", 2, st);
st = clSetKernelArg(kernel1, 3, sizeof(int), (int*)&size);
printf("set arg %d (%d)\n", 3, st);
// Execute kernel
st = clEnqueueNDRangeKernel(queue, kernel1, dims, NULL, (const size_t*)gws,
(const size_t*)lws, 0, NULL, &ev_kernel);
/* (const size_t*)lws, 0, NULL, NULL); */
/* printf("nd range kernel1 (%d %s)\n", st, clErrorString(st)); */
printf("nd range kernel1 (%d)\n", st);
end = clCreateUserEvent(context, &st);
printf("create user event (%d)\n", st);
callback_data* user_data = (callback_data*)malloc(sizeof(callback_data));
printf("c_v %p\n", (void*)c_v);
user_data->queue = &queue;
user_data->buf_c = &buf_c;
user_data->c_v = c_v;
user_data->size = size;
user_data->end = &end;
user_data->nested_callbacks = use_nested_callbacks;
user_data->blocking = use_blocking;
if (use_callbacks) {
st =
clSetEventCallback(ev_kernel, CL_COMPLETE, callback_kernel_fn, user_data);
printf("set event callback (%d)\n", st);
}
/* printf("first: %2.5f\n", c_v[0]); */
/* print_matrix_float_s_t("c", c); */
// ReadBuffer
/* float* ptr = (float*)clEnqueueMapBuffer(queue, buf_c, CL_TRUE, CL_MAP_READ,
* 0, c_rows * c_cols * sizeof(float), 0, NULL, NULL, &st); */
/* printf("read buffer c_v - buf_c (%d)\n", st); */
/* printf("finish queue\n"); */
/* clFinish(queue); */
/* printf("finished queue\n"); */
if (use_callbacks) {
/* clWaitForCompletion(context); */
printf("waiting for events\n");
/* /\* cl_event events[] = {ev_kernel}; *\/ */
cl_event events[] = { end };
clWaitForEvents(1, events); // ev_kernel);
printf("waited for events\n");
clbWaitEvents(c_v);
} else {
printf("about to read the c buffer\n");
st = clEnqueueReadBuffer(queue, buf_c, use_blocking, 0, size * sizeof(int),
c_v, 0, NULL, NULL);
printf("read buffer c_v - buf_c (%d)\n", st);
}
/* print_matrix("c_v", c_v, c_rows, c_cols); */
/* printf("first: %2.5f\n", c_v[0]); */
/* print_matrix_float_s_t("c", c); */
free(user_data);
clReleaseKernel(kernel1);
/* clReleaseKernel(kernel2); */
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseMemObject(buf_a);
clReleaseMemObject(buf_b);
clReleaseMemObject(buf_c);
/* clReleaseMemObject(b_x); */
clReleaseContext(context);
free(devices);
free(platforms);
#define THRESHOLD 0
// check
printf("about to check (first: %d)\n", c_v[0]);
for (size_t i = 0; i < size; i++) {
if (abs(c_v[i] - (a_v[i] + b_v[i])) > THRESHOLD) {
printf("Wrong checking: a_v[%ld] = %d, b_v[%ld] = %d, c_v[%ld] = %d\n", i,
a_v[i], i, b_v[i], i, c_v[i]);
exit(EXIT_FAILURE);
}
}
return EXIT_SUCCESS;
}
void CL_CALLBACK
callback_read_fn(cl_event event, cl_int ev_status, void* user_data)
{
printf("-- BEGIN callback read executed (%d)\n", ev_status);
callback_data* cb_data = (callback_data*)user_data;
/* cl_command_queue queue = *(cb_data->queue); */
/* cl_mem buf_c = *(cb_data->buf_c); */
int* c_v = cb_data->c_v;
cl_event end = *(cb_data->end);
/* int size = cb_data->size; */
cl_int st;
printf("c_v %p\n", (void*)c_v);
printf("c_v[0] = %d\n", c_v[0]);
/* c_v[1] = 1; */
st = clSetUserEventStatus(end, CL_COMPLETE);
printf("set user event status (%d)\n", st);
// make the host's wait (clWaitForEvents) return
printf("-- END\n");
}
cl_event ev_read;
void CL_CALLBACK
callback_kernel_fn(cl_event event, cl_int ev_status, void* user_data)
{
printf("-- BEGIN callback kernel executed (%d)\n", ev_status);
callback_data* cb_data = (callback_data*)user_data;
cl_command_queue queue = *(cb_data->queue);
cl_mem buf_c = *(cb_data->buf_c);
int* c_v = cb_data->c_v;
int size = cb_data->size;
bool nested_callbacks = cb_data->nested_callbacks;
bool blocking = cb_data->blocking;
cl_event end = *(cb_data->end);
printf("c_v %p\n", (void*)c_v);
printf("c_v[0] = %d\n", c_v[0]);
cl_int st;
/* printf("about to flush\n"); */
/* clFlush(queue); */
/* printf("flushed\n"); */
size_t offset = 0;
/* size = size + 4; */
printf("about to read the c buffer\n");
printf("blocking %d\n", blocking);
clb_events_waiting_device[clb_num_events_waiting] = 0;
clb_events_init_read[clb_num_events_waiting] = 0;
/* why it does not work? (blocking CL_TRUE) */
st = clEnqueueReadBuffer(queue, buf_c, blocking, offset, size * sizeof(int),
c_v, 0, NULL, &clb_events_waiting[clb_num_events_waiting++]);
ev_read = clb_events_waiting[clb_num_events_waiting - 1];
printf("enqueue read buffer (%d)\n", st);
/* size * sizeof(int), c_v, 0, NULL, NULL); */
if (nested_callbacks) {
st = clSetEventCallback(ev_read, CL_COMPLETE, callback_read_fn, user_data);
printf("set event callback (%d)\n", st);
/* st = clSetUserEventStatus(end, CL_COMPLETE); */
/* printf("set user event status (%d)\n", st); */
}
/* c_v[1] = 1; */
/* st = clGetEventInfo(ev_read, CL_EVENT_COMMAND_TYPE, ); */
/* printf("event info (%d)\n", st); */
/* int len = 512; */
/* char buflog[len]; */
/* cl_command_type; */
/* clGetEventInfo(ev_read, CL_EVENT_COMMAND_TYPE, len, &buflog, NULL); */
/* if (buflog != NULL) { */
/* printf("- event: %s\n", buflog); */
/* } */
if (!nested_callbacks) {
st = clSetUserEventStatus(end, CL_COMPLETE);
printf("set user event status (%d)\n", st);
/* printf("read buffer c_v - buf_c (%d)\n", st); */
}
printf("-- END\n");
}
And now, if I select the Intel CPU as device:
./callback 0 1 0 1 1 0
It works:
platforms: 1 (0)
platform name: Intel(R) OpenCL
num devices: 2 (0)
devices: 2 (0)
context (0)
device name: Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz
device type: CL_DEVICE_TYPE_CPU
command queue (0)
mode CPU
create buffer a (0)
create buffer b (0)
create buffer c (0)
create program (0)
build program (0)
create kernel1 (0)
gws {1024, 1, 1}
lws {128, 1, 1}
set arg 0 (0)
set arg 1 (0)
set arg 2 (0)
set arg 3 (0)
nd range kernel1 (0)
create user event (0)
c_v 0x1420030
set event callback (0)
waiting for events
-- BEGIN callback kernel executed (0)
c_v 0x1420030
c_v[0] = 0
about to read the c buffer
blocking 0
enqueue read buffer (0)
set event callback (0)
-- END
-- BEGIN callback read executed (0)
c_v 0x1420030
c_v[0] = 1
set user event status (0)
-- END
waited for events
About to wait events: 1
3 = CL_QUEUED, 0 = CL_COMPLETE, 2 = CL_SUBMITTED, 1 = CL_RUNNING
cl_event CPU init 0 [0] = status 0 (ref 0x7f7568000a90)
CPU [0] = 1, [1] = 3, [2] = 5
about to check (first: 1)
Now, if I select the Intel IGPU (Intel Integrated GPU):
./callback 0 0 2 1 1 0
It freezes / hangs:
platforms: 1 (0)
platform name: Intel(R) OpenCL
num devices: 2 (0)
devices: 2 (0)
context (0)
device name: Intel(R) HD Graphics
device type: CL_DEVICE_TYPE_GPU
command queue (0)
mode IGPU
create buffer a (0)
create buffer b (0)
create buffer c (0)
create program (0)
build program (0)
create kernel1 (0)
gws {1024, 1, 1}
lws {128, 1, 1}
set arg 0 (0)
set arg 1 (0)
set arg 2 (0)
set arg 3 (0)
nd range kernel1 (0)
create user event (0)
c_v 0x18b7030
set event callback (0)
waiting for events
-- BEGIN callback kernel executed (0)
c_v 0x18b7030
c_v[0] = 0
about to read the c buffer
blocking 0
enqueue read buffer (0)
set event callback (0)
-- END
If I run the same test under gdb and press C-c, I can see:
(gdb) r 0 0 2 1 1 0
Starting program: /callbacks/build/callback 0 0 2 1 1 0
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/usr/lib/libthread_db.so.1".
[New Thread 0x7ffff4cd9700 (LWP 21291)]
platforms: 1 (0)
platform name: Intel(R) OpenCL
num devices: 2 (0)
devices: 2 (0)
[New Thread 0x7fffeede2700 (LWP 21292)]
[New Thread 0x7fffee5e0700 (LWP 21293)]
[New Thread 0x7fffee9e1700 (LWP 21294)]
context (0)
device name: Intel(R) HD Graphics
device type: CL_DEVICE_TYPE_GPU
command queue (0)
mode IGPU
create buffer a (0)
create buffer b (0)
create buffer c (0)
create program (0)
build program (0)
create kernel1 (0)
gws {1024, 1, 1}
lws {128, 1, 1}
set arg 0 (0)
set arg 1 (0)
set arg 2 (0)
set arg 3 (0)
nd range kernel1 (0)
create user event (0)
c_v 0x607030
[New Thread 0x7fffec827700 (LWP 21295)]
set event callback (0)
waiting for events
-- BEGIN callback kernel executed (0)
c_v 0x607030
c_v[0] = 0
about to read the c buffer
blocking 0
enqueue read buffer (0)
set event callback (0)
-- END
^C
Thread 1 "callback" received signal SIGINT, Interrupt.
0x00007ffff730a756 in pthread_cond_wait@@GLIBC_2.3.2 () from /usr/lib/libpthread.so.0
(gdb) bt
#0 0x00007ffff730a756 in pthread_cond_wait@@GLIBC_2.3.2 () from /usr/lib/libpthread.so.0
#1 0x00007ffff64c635b in ?? () from /opt/intel/opencl/libintelocl.so
#2 0x00007ffff648c63a in ?? () from /opt/intel/opencl/libintelocl.so
#3 0x00007ffff647b5d1 in ?? () from /opt/intel/opencl/libintelocl.so
#4 0x00007ffff63f3e75 in clWaitForEvents () from /opt/intel/opencl/libintelocl.so
#5 0x00007ffff6edca43 in ?? () from /opt/intel/opencl/libIntelOpenCL.so
#6 0x000000000040237e in main (argc=7, argv=0x7fffffffdc58) at ./src/callback.c:532
As you can see, in the first execution example (CPU) the two callbacks appear (two BEGIN/END pairs). In the case of the HD Graphics GPU it hangs after the first callback (only one BEGIN/END pair).
Why?
(gdb shows that it is frozen in the pthread_cond_wait of the Intel OpenCL driver.)
Can anyone explain what the actual behavior of callbacks/events with respect to the host thread is? (best practices, how to avoid deadlocks)
I need fine-grained control and the fastest performance, and callbacks look like the way to get it, but they show this weird behavior...
Expected behavior (which only occurs on the CPU, not on the IGPU):
1. The host creates a user event. Then the host calls clEnqueueNDRangeKernel (vector addition) and waits for the user event (clWaitForEvents). When the kernel finishes, it triggers the callback "callback_kernel".
2. "callback_kernel" issues a non-blocking clEnqueueReadBuffer, which on completion triggers the callback "callback_read".
3. "callback_read" sets the user event to CL_COMPLETE.
4. The host continues past clWaitForEvents with the buffer contents filled in (buffer read).
Your problem is the following line:
/* why it does not work? (blocking CL_TRUE) */
st = clEnqueueReadBuffer(queue, buf_c, blocking, offset, size * sizeof(int),c_v, 0, NULL, &clb_events_waiting[clb_num_events_waiting++]);
Inside the callback function you are trying to issue a blocking call to clEnqueueReadBuffer, which is not allowed in OpenCL. Check the specification notes on which functions are not allowed, at the following link:
https://www.khronos.org/registry/OpenCL/sdk/1.1/docs/man/xhtml/clSetEventCallback.html
I also recommend reading the whole callback section of the specification version your driver supports; here is the corresponding section of the latest OpenCL 2.2 spec:
https://www.khronos.org/registry/OpenCL/specs/opencl-2.2.pdf#page=197
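As a rough illustration of a callback that respects those restrictions, here is a sketch that reuses the question's callback_data struct and callback_read_fn. The clFlush at the end is an assumption on my part, not something mandated by the links above:
/* Minimal sketch (not the original code): from inside an event callback,
 * only enqueue non-blocking work and chain the next callback. Blocking
 * calls (CL_TRUE reads, clFinish, clWaitForEvents) are what the spec
 * disallows here. The host thread is blocked in clWaitForEvents, so
 * flushing from the callback helps ensure the read actually gets submitted. */
void CL_CALLBACK kernel_done_cb(cl_event event, cl_int status, void* user_data)
{
    callback_data* d = (callback_data*)user_data;
    cl_event read_ev;
    /* non-blocking read: always CL_FALSE from inside a callback */
    clEnqueueReadBuffer(*(d->queue), *(d->buf_c), CL_FALSE, 0,
                        d->size * sizeof(int), d->c_v, 0, NULL, &read_ev);
    /* callback_read_fn (from the question) sets the user event to CL_COMPLETE */
    clSetEventCallback(read_ev, CL_COMPLETE, callback_read_fn, d);
    clFlush(*(d->queue));
}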
This is a simple parallel program which adds the elements of two vectors.
The program compiled without errors, but the results are not right.
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <iomanip>
#include <array>
#include <fstream>
#include <sstream>
#include <string>
#include <algorithm>
#include <iterator>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#include <time.h>
#endif
#define MAX_SOURCE_SIZE (0x100000)
// number of points in Both A and B files (number of rows)
const int number_of_points = 11;
// number of points axis in Both A and B files (number of Columns)
const int number_of_axis = 3;
using namespace std;
int main(int argc, char *argv[]) {
clock_t tStart = clock();
// Create the two input vectors
// working variables
int i;
ifstream input_fileA, input_fileB; // input files
string line; // transfer row from file to array
float x; // transfer word from file to array
int row = 0; // number of rows of file A,B (= array)
int col = 0; // number of rows of file A,B (= array)
// working arrays
// array contains file A data
float arrayA[number_of_points][number_of_axis]={{0}};
// array contains file B data
float arrayB[number_of_points][number_of_axis]={{0}};
// float X1[number_of_points]; // X values of file A points
float Y1[number_of_points]; // Y values of file A points
// float X2[number_of_points]; // X values of file B points
float Y2[number_of_points]; // Y values of file B points
float *X1 = (float*)malloc(sizeof(float)*number_of_points);
float *X2 = (float*)malloc(sizeof(float)*number_of_points);
// import input files
input_fileA.open(argv[1]);
input_fileB.open(argv[2]);
// transfer input files data to array
// input file A to arrayA
row = 0;
while (getline(input_fileA, line))
{
istringstream streamA(line);
col = 0;
while(streamA >> x){
arrayA[row][col] = x;
col++;
}
row++;
}
// input file B to arrayB
row = 0;
while (getline(input_fileB, line))
{
istringstream streamB(line);
col = 0;
while(streamB >> x){
arrayB[row][col] = x;
col++;
}
row++;
}
// put Xs of points in X vectors and Ys of points in Y vectors
// input file A
for (int i = 0; i<number_of_points; i++){
X1[i] = arrayA[i][1];
Y1[i] = arrayA[i][2];
}
// input file B
for (int i = 0; i<number_of_points; i++){
X2[i] = arrayB[i][1];
Y2[i] = arrayB[i][2];
}
// int i;
// const int LIST_SIZE = 50;
// int *A = (int*)malloc(sizeof(int)*number_of_points);
// int *B = (int*)malloc(sizeof(int)*number_of_points);
// for(i = 0; i < number_of_points; i++) {
// A[i] = X1[i];
// B[i] = X2[i];
// }
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("vector_add_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_ALL, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context =
clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue =
clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem x1_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
number_of_points * sizeof(float), NULL, &ret);
cl_mem x2_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
number_of_points * sizeof(float), NULL, &ret);
cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
number_of_points * sizeof(float), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, x1_mem_obj, CL_TRUE, 0,
number_of_points * sizeof(float), X1, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, x2_mem_obj, CL_TRUE, 0,
number_of_points * sizeof(float), X2, 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&x1_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&x2_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
// Execute the OpenCL kernel on the list
size_t global_item_size = number_of_points; // Process the entire lists
size_t local_item_size = 64; // Process in groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer C on the device to the local variable C
// int *C = (int*)malloc(sizeof(int)*number_of_points);
float *C = (float*)malloc(sizeof(float)*number_of_points);
ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
number_of_points * sizeof(float), C, 0, NULL, NULL);
// Display the result to the screen
for(i = 0; i < number_of_points; i++)
printf("%f + %f = %f\n", X1[i], X2[i], C[i]);
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(x1_mem_obj);
ret = clReleaseMemObject(x2_mem_obj);
ret = clReleaseMemObject(c_mem_obj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(X1);
free(X2);
free(C);
printf("ALL Time taken: %.2fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);
return 0;
}
and the kernel file
__kernel void vector_add(__global float *X1,
                         __global float *X2,
                         __global float *C) {
    // Get the index of the current element
    int i = get_global_id(0);
    // Do the operation
    C[i] = X1[i] + X2[i];
}
The result was
0.000000 + 0.000000 = 0.000000
1.000000 + 1.000000 = 0.000000
2.000000 + 2.000000 = 0.000000
3.000000 + 3.000000 = 0.000000
4.000000 + 4.000000 = 0.000000
5.000000 + 5.000000 = 0.000000
6.000000 + 6.000000 = 0.000000
7.000000 + 7.000000 = 0.000000
8.000000 + 8.000000 = 0.000000
9.000000 + 9.000000 = 0.000000
1.000000 + 1.000000 = 0.000000
ALL Time taken: 0.07s
You've committed one of the cardinal sins of OpenCL programming, in that you are not checking the error codes from any of your OpenCL API calls! You should always check the return code from every single OpenCL API call. If you did this, it would point you towards the problem very quickly.
The problem is in your kernel enqueue call. If you check the error code, you'll see that you are getting -54 back, which corresponds to CL_INVALID_WORK_GROUP_SIZE. Specifically, kernel invocations have the requirement that the work-group size (local size) exactly divides the global size. You are asking for a work-group size of 64 and a global size of 11, which does not fulfil this requirement.
You can also pass NULL as the work-group size parameter, and the OpenCL implementation will pick a work-group size that will definitely work on your behalf.
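For illustration, here is a sketch of both fixes applied to the enqueue call from the question (the round-up variant is a common alternative not spelled out in the answer above, and it requires adding a bounds check to the kernel):
// Option 1: pass NULL for the local size and let the implementation choose it.
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
                             &global_item_size, NULL, 0, NULL, NULL);
if (ret != CL_SUCCESS) { fprintf(stderr, "clEnqueueNDRangeKernel failed: %d\n", ret); exit(1); }

// Option 2: round the global size up to a multiple of the local size.
// Extra work-items are then launched, so the kernel must receive the real
// element count as an extra argument and guard with `if (i < count)` before
// writing (that guard is not in the posted kernel and would have to be added).
size_t local_item_size  = 64;
size_t global_item_size = ((number_of_points + local_item_size - 1)
                           / local_item_size) * local_item_size;
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
                             &global_item_size, &local_item_size, 0, NULL, NULL);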
OK, so let's say I have an (N x N) matrix that I would like to process. This matrix is quite large for my computer, and if I try to send it to the device all at once I get an 'out of memory' error.
So is there a way to send sections of the matrix to the device? One way I can see to do it is to copy portions of the matrix on the host, send these manageable portions from the host to the device, and then put them back together at the end.
Here is something I have tried, but the cudaMemcpy in the for loop returns error code 11, 'invalid argument.'
int h_N = 10000;
size_t h_size_m = h_N*sizeof(float);
h_A = (float*)malloc(h_size_m*h_size_m);
int d_N = 2500;
size_t d_size_m = d_N*sizeof(float);
InitializeMatrices(h_N);
int i;
int iterations = (h_N*h_N)/(d_N*d_N);
for( i = 0; i < iterations; i++ )
{
float* h_array_ref = h_A+(i*d_N*d_N);
cudasafe( cudaMemcpy(d_A, h_array_ref, d_size_m*d_size_m, cudaMemcpyHostToDevice), "cudaMemcpy");
cudasafe( cudaFree(d_A), "cudaFree(d_A)" );
}
What I'm trying to accomplish with the above code is this: instead of sending the entire matrix to the device, I simply send a pointer to a place within that matrix, reserve enough space on the device to do the work, and then with the next iteration of the loop move the pointer forward within the matrix, and so on.
Not only can you do this (assuming your problem is easily decomposed this way into sub-arrays), it can be a very useful thing to do for performance; once you get the basic approach you've described working, you can start using asynchronous memory copies and double-buffering to overlap some of the memory transfer time with the time spent computing what is already on-card.
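(For reference, the asynchronous, double-buffered variant that paragraph alludes to could be sketched roughly as below, reusing cuda_saxpb and CHK_CUDA from the full listing; the two streams and the pinned-memory requirement are assumptions of this sketch, not part of the code that follows.)
// Sketch: double-buffered batching with two CUDA streams, so the copies for one
// batch can overlap with the kernel working on the other batch.
float *xd2[2], *yd2[2];
cudaStream_t stream[2];
for (int s = 0; s < 2; s++) {
    CHK_CUDA( cudaMalloc(&xd2[s], batchsize*sizeof(float)) );
    CHK_CUDA( cudaMalloc(&yd2[s], batchsize*sizeof(float)) );
    CHK_CUDA( cudaStreamCreate(&stream[s]) );
}
// Note: x and ycuda must be pinned (cudaMallocHost / cudaHostRegister) for
// cudaMemcpyAsync to actually overlap with kernel execution.
int batch = 0;
for (int nstart = 0; nstart < n; nstart += batchsize, batch++) {
    int s = batch % 2;                      // alternate between the two buffer sets
    int size = batchsize;
    if ((nstart + batchsize) > n) size = n - nstart;
    CHK_CUDA( cudaMemcpyAsync(xd2[s], &(x[nstart]), size*sizeof(float),
                              cudaMemcpyHostToDevice, stream[s]) );
    int blocksize = (size + nblocks - 1) / nblocks;
    cuda_saxpb<<<nblocks, blocksize, 0, stream[s]>>>(xd2[s], a, b, yd2[s], size);
    CHK_CUDA( cudaMemcpyAsync(&(ycuda[nstart]), yd2[s], size*sizeof(float),
                              cudaMemcpyDeviceToHost, stream[s]) );
}
CHK_CUDA( cudaDeviceSynchronize() );
for (int s = 0; s < 2; s++) {
    CHK_CUDA( cudaFree(xd2[s]) );
    CHK_CUDA( cudaFree(yd2[s]) );
    CHK_CUDA( cudaStreamDestroy(stream[s]) );
}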
But first, get the simple thing working. Below is a 1D example (multiplying a vector by a scalar and adding another scalar), but using a linearized 2D array would be the same; the key part is:
CHK_CUDA( cudaMalloc(&xd, batchsize*sizeof(float)) );
CHK_CUDA( cudaMalloc(&yd, batchsize*sizeof(float)) );
tick(&gputimer);
int nbatches = 0;
for (int nstart=0; nstart < n; nstart+=batchsize) {
int size=batchsize;
if ((nstart + batchsize) > n) size = n - nstart;
CHK_CUDA( cudaMemcpy(xd, &(x[nstart]), size*sizeof(float), cudaMemcpyHostToDevice) );
blocksize = (size+nblocks-1)/nblocks;
cuda_saxpb<<<nblocks, blocksize>>>(xd, a, b, yd, size);
CHK_CUDA( cudaMemcpy(&(ycuda[nstart]), yd, size*sizeof(float), cudaMemcpyDeviceToHost) );
nbatches++;
}
gputime = tock(&gputimer);
CHK_CUDA( cudaFree(xd) );
CHK_CUDA( cudaFree(yd) );
You allocate the buffers at the start, and then loop through until you're done, each time doing the copy, starting the kernel, and then copying back. You free at the end.
The full code is
#include <stdio.h>
#include <stdlib.h>
#include <getopt.h>
#include <cuda.h>
#include <sys/time.h>
#include <math.h>
#define CHK_CUDA(e) {if (e != cudaSuccess) {fprintf(stderr,"Error: %s\n", cudaGetErrorString(e)); exit(-1);}}
__global__ void cuda_saxpb(const float *xd, const float a, const float b,
float *yd, const int n) {
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i<n) {
yd[i] = a*xd[i]+b;
}
return;
}
void cpu_saxpb(const float *x, float a, float b, float *y, int n) {
int i;
for (i=0;i<n;i++) {
y[i] = a*x[i]+b;
}
return;
}
int get_options(int argc, char **argv, int *n, int *s, int *nb, float *a, float *b);
void tick(struct timeval *timer);
double tock(struct timeval *timer);
int main(int argc, char **argv) {
int n=1000;
int nblocks=10;
int batchsize=100;
float a = 5.;
float b = -1.;
int err;
float *x, *y, *ycuda;
float *xd, *yd;
double abserr;
int blocksize;
int i;
struct timeval cputimer;
struct timeval gputimer;
double cputime, gputime;
err = get_options(argc, argv, &n, &batchsize, &nblocks, &a, &b);
if (batchsize > n) {
fprintf(stderr, "Resetting batchsize to size of vector, %d\n", n);
batchsize = n;
}
if (err) return 0;
x = (float *)malloc(n*sizeof(float));
if (!x) return 1;
y = (float *)malloc(n*sizeof(float));
if (!y) {free(x); return 1;}
ycuda = (float *)malloc(n*sizeof(float));
if (!ycuda) {free(y); free(x); return 1;}
/* run CPU code */
tick(&cputimer);
cpu_saxpb(x, a, b, y, n);
cputime = tock(&cputimer);
/* run GPU code */
/* only have to allocate once */
CHK_CUDA( cudaMalloc(&xd, batchsize*sizeof(float)) );
CHK_CUDA( cudaMalloc(&yd, batchsize*sizeof(float)) );
tick(&gputimer);
int nbatches = 0;
for (int nstart=0; nstart < n; nstart+=batchsize) {
int size=batchsize;
if ((nstart + batchsize) > n) size = n - nstart;
CHK_CUDA( cudaMemcpy(xd, &(x[nstart]), size*sizeof(float), cudaMemcpyHostToDevice) );
blocksize = (size+nblocks-1)/nblocks;
cuda_saxpb<<<nblocks, blocksize>>>(xd, a, b, yd, size);
CHK_CUDA( cudaMemcpy(&(ycuda[nstart]), yd, size*sizeof(float), cudaMemcpyDeviceToHost) );
nbatches++;
}
gputime = tock(&gputimer);
CHK_CUDA( cudaFree(xd) );
CHK_CUDA( cudaFree(yd) );
abserr = 0.;
for (i=0;i<n;i++) {
abserr += fabs(ycuda[i] - y[i]);
}
printf("Y = a*X + b, problemsize = %d\n", n);
printf("CPU time = %lg millisec.\n", cputime*1000.);
printf("GPU time = %lg millisec (done with %d batches of %d).\n",
gputime*1000., nbatches, batchsize);
printf("CUDA and CPU results differ by %lf\n", abserr);
free(x);
free(y);
free(ycuda);
return 0;
}
int get_options(int argc, char **argv, int *n, int *s, int *nb, float *a, float *b) {
const struct option long_options[] = {
{"nvals" , required_argument, 0, 'n'},
{"nblocks" , required_argument, 0, 'B'},
{"batchsize" , required_argument, 0, 's'},
{"a", required_argument, 0, 'a'},
{"b", required_argument, 0, 'b'},
{"help", no_argument, 0, 'h'},
{0, 0, 0, 0}};
char c;
int option_index;
int tempint;
while (1) {
c = getopt_long(argc, argv, "n:B:a:b:s:h", long_options, &option_index);
if (c == -1) break;
switch(c) {
case 'n': tempint = atoi(optarg);
if (tempint < 1 || tempint > 500000) {
fprintf(stderr,"%s: Cannot use number of points %s;\n Using %d\n", argv[0], optarg, *n);
} else {
*n = tempint;
}
break;
case 's': tempint = atoi(optarg);
if (tempint < 1 || tempint > 50000) {
fprintf(stderr,"%s: Cannot use number of points %s;\n Using %d\n", argv[0], optarg, *s);
} else {
*s = tempint;
}
break;
case 'B': tempint = atoi(optarg);
if (tempint < 1 || tempint > 1000 || tempint > *n) {
fprintf(stderr,"%s: Cannot use number of blocks %s;\n Using %d\n", argv[0], optarg, *nb);
} else {
*nb = tempint;
}
break;
case 'a': *a = atof(optarg);
break;
case 'b': *b = atof(optarg);
break;
case 'h':
puts("Calculates y[i] = a*x[i] + b on the GPU.");
puts("Options: ");
puts(" --nvals=N (-n N): Set the number of values in y,x.");
puts(" --batchsize=N (-s N): Set the number of values to transfer at a time.");
puts(" --nblocks=N (-B N): Set the number of blocks used.");
puts(" --a=X (-a X): Set the parameter a.");
puts(" --b=X (-b X): Set the parameter b.");
puts(" --niters=N (-I X): Set number of iterations to calculate.");
puts("");
return +1;
}
}
return 0;
}
void tick(struct timeval *timer) {
gettimeofday(timer, NULL);
}
double tock(struct timeval *timer) {
struct timeval now;
gettimeofday(&now, NULL);
return (now.tv_usec-timer->tv_usec)/1.0e6 + (now.tv_sec - timer->tv_sec);
}
Running this one gets:
$ ./batched-saxpb --nvals=10240 --batchsize=10240 --nblocks=20
Y = a*X + b, problemsize = 10240
CPU time = 0.072 millisec.
GPU time = 0.117 millisec (done with 1 batches of 10240).
CUDA and CPU results differ by 0.000000
$ ./batched-saxpb --nvals=10240 --batchsize=5120 --nblocks=20
Y = a*X + b, problemsize = 10240
CPU time = 0.066 millisec.
GPU time = 0.133 millisec (done with 2 batches of 5120).
CUDA and CPU results differ by 0.000000
$ ./batched-saxpb --nvals=10240 --batchsize=2560 --nblocks=20
Y = a*X + b, problemsize = 10240
CPU time = 0.067 millisec.
GPU time = 0.167 millisec (done with 4 batches of 2560).
CUDA and CPU results differ by 0.000000
The GPU time goes up in this case (we're doing more memory copies) but the answers stay the same.
Edited: The original version of this code had an option for running multiple iterations of the kernel for timing purposes, but that's unnecessarily confusing in this context so it's removed.
I constructed my own little OpenCL example using different sources on the net. The actual kernel works and I get the output I want, but the cleanup functions, which I found in one of the examples, cause segfaults. What did I do wrong?
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <CL/cl.h> //opencl
#define CL_CHECK(_expr) \
do { \
cl_int _err = _expr; \
if (_err == CL_SUCCESS) \
break; \
fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
abort(); \
} while (0)
#define CL_CHECK_ERR(_expr) \
({ \
cl_int _err = CL_INVALID_VALUE; \
typeof(_expr) _ret = _expr; \
if (_err != CL_SUCCESS) { \
fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
abort(); \
} \
_ret; \
})
const char* OpenCLSource[] = {
"__kernel void VectorAdd(__global int* c, __global int* a,__global int* b)",
"{",
" // Index of the elements to add \n",
" unsigned int n = get_global_id(0);",
" // Sum the n’th element of vectors a and b and store in c \n",
" c[n] = a[n] + b[n];",
"}"
};
cl_device_id* init_opencl(cl_context *GPUContext,cl_command_queue *GPUCommandQueue, cl_kernel* cl_forward1,cl_program* OpenCLProgram){
// Create a context to run OpenCL on our CUDA-enabled NVIDIA GPU
cl_int _err;
*GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &_err) ;
printf("\n1-%i\n",_err);
// Get the list of GPU devices associated with this context
size_t ParmDataBytes;
CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes));
cl_device_id* GPUDevices;
GPUDevices = (cl_device_id*)malloc(ParmDataBytes);
CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL));
// Create a command-queue on the first GPU device
*GPUCommandQueue = clCreateCommandQueue(*GPUContext, GPUDevices[0], 0, &_err);
printf("\n2-%i\n",_err);
// Create OpenCL program with source code
*OpenCLProgram = clCreateProgramWithSource(*GPUContext, 7, OpenCLSource, NULL, &_err);
printf("\n3-%i\n",_err);
CL_CHECK(clBuildProgram(*OpenCLProgram, 0,
NULL, NULL, NULL, NULL));
cl_int errcode;
*cl_forward1 = clCreateKernel(*OpenCLProgram,
"VectorAdd", &errcode);
printf("\n7-%i\n",errcode);
return GPUDevices;
}
int main(int argc, char** argv)
{
cl_context GPUContext;
cl_command_queue GPUCommandQueue;
cl_program OpenCLProgram;
cl_kernel OpenCLVectorAdd;
cl_device_id* GPUDevices;
GPUDevices=init_opencl(&GPUContext,&GPUCommandQueue,&OpenCLVectorAdd,&OpenCLProgram);
// Two integer source vectors in Host memory
int n=5 ;
int x[5]={1,2,4,6,8};
int y[5]={1,2,4,6,8};
int output[n];
int size_x = n*sizeof(x);
int size_y = n*sizeof(y);
int size_output = n*sizeof(output); // this changes for the second forward1
cl_int _err;
// Allocate GPU memory for source vectors AND initialize from CPU memory
cl_mem x_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, size_x, x, &_err);
printf("\n4-%i\n",_err);
cl_mem y_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, size_y, y, &_err);
printf("\n5-%i\n",_err);
// Allocate output memory on GPU
cl_mem total_cl = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY,
size_output, NULL, &_err);
printf("\n6-%i\n",_err);
// In the next step we associate the GPU memory with the Kernel arguments
clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem),(void*)&total_cl);
clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&x_cl);
clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&y_cl);
// 7. Launch OpenCL kernel
size_t localWorkSize[1], globalWorkSize[1];
//localWorkSize = ;
globalWorkSize[0] = n;
// Launch the Kernel on the GPU
CL_CHECK(clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLVectorAdd, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL));
// Copy the output in GPU memory back to CPU memory
//float* h_C = (float*) malloc(size_output);
CL_CHECK(clEnqueueReadBuffer(GPUCommandQueue,
total_cl, CL_TRUE, 0, size_output,
output, 0, NULL, NULL));
for (int i=0; i<n;i++){
printf("\n%i",output[i]);
}
// Cleanup (each of the following lines causes a seg fault
// ******************************
CL_CHECK(free(GPUDevices));
CL_CHECK(clReleaseKernel(OpenCLVectorAdd));
CL_CHECK(clReleaseProgram(OpenCLProgram));
CL_CHECK(clReleaseCommandQueue(GPUCommandQueue));
CL_CHECK(clReleaseContext(GPUContext));
CL_CHECK(clReleaseMemObject(total_cl));
CL_CHECK(clReleaseMemObject(x_cl));
CL_CHECK(clReleaseMemObject(y_cl));
/* ****************
return 0;
}
Thanks!
For people who arrive here in the future:
As Brafford suggested, this is resolved by adding clFinish(GPUCommandQueue) after both clEnqueueNDRangeKernel and clEnqueueReadBuffer.
Apparently, trying to clean up any object (e.g. releasing a queue) that still has work executing yields a segmentation fault.
I also corrected and changed several small things, so this code should work now.
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <CL/cl.h> //opencl
#define CL_CHECK(_expr) \
do { \
cl_int _err = _expr; \
if (_err == CL_SUCCESS) \
break; \
fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
abort(); \
} while (0)
#define CL_CHECK_ERR(_expr) \
({ \
cl_int _err = CL_INVALID_VALUE; \
typeof(_expr) _ret = _expr; \
if (_err != CL_SUCCESS) { \
fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
abort(); \
} \
_ret; \
})
const char* OpenCLSource[] = {
"__kernel void VectorAdd(__global int* c, __global int* a,__global int* b)",
"{",
" // Index of the elements to add \n",
" unsigned int n = get_global_id(0);",
" // Sum the n’th element of vectors a and b and store in c \n",
" c[n] = a[n] + b[n];",
"}"
};
cl_device_id* init_opencl(cl_context *GPUContext,cl_command_queue *GPUCommandQueue, cl_kernel* cl_forward1,cl_program* OpenCLProgram){
// Create a context to run OpenCL on our CUDA-enabled NVIDIA GPU
cl_int _err;
*GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &_err) ;
printf("\nclCreateContextFromType:%i\n",_err);
// Get the list of GPU devices associated with this context
size_t ParmDataBytes;
CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes));
cl_device_id* GPUDevices;
GPUDevices = (cl_device_id*)malloc(ParmDataBytes);
CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL));
// Create a command-queue on the first GPU device
*GPUCommandQueue = clCreateCommandQueue(*GPUContext, GPUDevices[0], 0, &_err);
printf("\nclCreateCommandQueue:%i\n",_err);
// Create OpenCL program with source code
*OpenCLProgram = clCreateProgramWithSource(*GPUContext, 7, OpenCLSource, NULL, &_err);
printf("\nclCreateProgramWithSource:%i\n",_err);
CL_CHECK(clBuildProgram(*OpenCLProgram, 0,
NULL, NULL, NULL, NULL));
cl_int errcode;
*cl_forward1 = clCreateKernel(*OpenCLProgram,
"VectorAdd", &errcode);
printf("\nclCreateKernel:%i\n",errcode);
return GPUDevices;
}
int main(int argc, char** argv)
{
cl_context GPUContext;
cl_command_queue GPUCommandQueue;
cl_program OpenCLProgram;
cl_kernel OpenCLVectorAdd;
cl_device_id* GPUDevices;
GPUDevices=init_opencl(&GPUContext,&GPUCommandQueue,&OpenCLVectorAdd,&OpenCLProgram);
int n=5 ;
int x[5]={1,2,4,6,8};
int y[5]={1,2,4,6,8};
int output[n];
int size_x = n*sizeof(x);
int size_y = n*sizeof(y);
int size_output = n*sizeof(output);
cl_int _err;
// Allocate GPU memory for source vectors AND initialize from CPU memory
cl_mem x_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, size_x, x, &_err);
printf("\nclCreateBuffer:%i\n",_err);
cl_mem y_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, size_y, y, &_err);
printf("\nclCreateBuffer:%i\n",_err);
// Allocate output memory on GPU
cl_mem total_cl = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY,
size_output, NULL, &_err);
printf("\nclCreateBuffer:%i\n",_err);
// In the next step we associate the GPU memory with the Kernel arguments
clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem),(void*)&total_cl);
clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&x_cl);
clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&y_cl);
size_t globalWorkSize[1];
globalWorkSize[0] = n;
// Launch the Kernel on the GPU
CL_CHECK(clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLVectorAdd, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL));
clFinish(GPUCommandQueue);
// Copy the output in GPU memory back to CPU memory
int* h_c = (int*) malloc(size_output);
CL_CHECK(clEnqueueReadBuffer(GPUCommandQueue,
total_cl, CL_TRUE, 0, size_output,
h_c, 0, NULL, NULL));
clFinish(GPUCommandQueue);
for (int i=0; i<n;i++){
printf("\noutput[%i]=%i",i,h_c[i]);
}
// Cleanup
free(GPUDevices);
CL_CHECK(clReleaseKernel(OpenCLVectorAdd));
CL_CHECK(clReleaseProgram(OpenCLProgram));
CL_CHECK(clReleaseCommandQueue(GPUCommandQueue));
CL_CHECK(clReleaseContext(GPUContext));
CL_CHECK(clReleaseMemObject(x_cl));
CL_CHECK(clReleaseMemObject(total_cl));
CL_CHECK(clReleaseMemObject(y_cl));
return 0;
}