OpenCl cleanup causes segfault

OpenCl cleanup causes segfault - memory-management

I constructed my own little OpenCL example using different sources on the net. The actual kernel works and I get the output I want, but the cleanup functions, which I found in one of the examples, cause segfaults. What did I do wrong?
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <CL/cl.h> //opencl
/*
 * CL_CHECK(expr): evaluate an OpenCL call that returns a cl_int status.
 * When the status is anything other than CL_SUCCESS, the failing expression
 * text and its numeric status are printed to stderr and the process aborts.
 */
#define CL_CHECK(_expr)                                                      \
    do {                                                                     \
        cl_int _err = _expr;                                                 \
        if (_err != CL_SUCCESS) {                                            \
            fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr,     \
                    (int)_err);                                              \
            abort();                                                         \
        }                                                                    \
    } while (0)
/*
 * CL_CHECK_ERR(expr): wrapper for OpenCL calls that return an object and
 * report their status through an out-parameter.  The macro declares a local
 * `_err` (pre-set to CL_INVALID_VALUE), so the wrapped expression has to store
 * its status there (e.g. by passing `&_err`); otherwise the check below always
 * fails.  Evaluates to the value of the expression.
 * Uses the GNU statement-expression and typeof extensions.
 */
#define CL_CHECK_ERR(_expr)                                                  \
    ({                                                                       \
        cl_int _err = CL_INVALID_VALUE;                                      \
        __typeof__(_expr) _ret = _expr;                                      \
        if (_err != CL_SUCCESS) {                                            \
            fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr,     \
                    (int)_err);                                              \
            abort();                                                         \
        }                                                                    \
        _ret;                                                                \
    })
/*
 * OpenCL C source of the VectorAdd kernel, split into string fragments.
 * init_opencl() hands this array to clCreateProgramWithSource with a
 * hard-coded fragment count of 7, which must match the number of entries
 * below.  The fragments that contain a `//` comment end in "\n" so the
 * comment stops before the next fragment.
 */
const char* OpenCLSource[] = {
"__kernel void VectorAdd(__global int* c, __global int* a,__global int* b)",
"{",
" // Index of the elements to add \n",
" unsigned int n = get_global_id(0);",
" // Sum the n’th element of vectors a and b and store in c \n",
" c[n] = a[n] + b[n];",
"}"
};
/*
 * Set up everything needed to run the VectorAdd kernel on a GPU:
 * a context, a command queue on the first device of that context, the
 * program built from OpenCLSource, and the kernel object.
 *
 * Each created object is stored through the matching out-parameter.
 * Returns the malloc'ed list of devices in the context; the caller frees it.
 * The status of every create call is printed with a numeric tag.
 */
cl_device_id* init_opencl(cl_context *GPUContext,cl_command_queue *GPUCommandQueue, cl_kernel* cl_forward1,cl_program* OpenCLProgram){
    cl_int status;
    size_t devices_bytes;
    cl_device_id *devices;

    /* Context over all GPU devices. */
    *GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &status);
    printf("\n1-%i\n", status);

    /* First query the size of the device list, then fetch the list itself. */
    CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &devices_bytes));
    devices = (cl_device_id*)malloc(devices_bytes);
    CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, devices_bytes, devices, NULL));

    /* Command queue on the first device. */
    *GPUCommandQueue = clCreateCommandQueue(*GPUContext, devices[0], 0, &status);
    printf("\n2-%i\n", status);

    /* Program from the 7 source fragments, built for all devices. */
    *OpenCLProgram = clCreateProgramWithSource(*GPUContext, 7, OpenCLSource, NULL, &status);
    printf("\n3-%i\n", status);
    CL_CHECK(clBuildProgram(*OpenCLProgram, 0, NULL, NULL, NULL, NULL));

    /* Kernel entry point. */
    *cl_forward1 = clCreateKernel(*OpenCLProgram, "VectorAdd", &status);
    printf("\n7-%i\n", status);

    return devices;
}
/*
 * Host program from the question: adds two 5-element int vectors on the GPU
 * with the VectorAdd kernel, prints the result and then releases the OpenCL
 * objects.  The code is kept exactly as posted; the NOTE(review) comments mark
 * the places that look wrong.
 */
int main(int argc, char** argv)
{
cl_context GPUContext;
cl_command_queue GPUCommandQueue;
cl_program OpenCLProgram;
cl_kernel OpenCLVectorAdd;
cl_device_id* GPUDevices;
GPUDevices=init_opencl(&GPUContext,&GPUCommandQueue,&OpenCLVectorAdd,&OpenCLProgram);
// Two integer source vectors in Host memory
int n=5 ;
int x[5]={1,2,4,6,8};
int y[5]={1,2,4,6,8};
int output[n];
// NOTE(review): x, y and output are arrays, so sizeof(x) is already the size
// of the whole array (n ints).  Multiplying by n again makes each of the
// three sizes n times larger than the host array it describes.
int size_x = n*sizeof(x);
int size_y = n*sizeof(y);
int size_output = n*sizeof(output); // this changes for the second forward1
cl_int _err;
// Allocate GPU memory for source vectors AND initialize from CPU memory
// NOTE(review): CL_MEM_COPY_HOST_PTR copies size_x / size_y bytes from x / y,
// which are smaller than that (see the note on the sizes above).
cl_mem x_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, size_x, x, &_err);
printf("\n4-%i\n",_err);
cl_mem y_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, size_y, y, &_err);
printf("\n5-%i\n",_err);
// Allocate output memory on GPU
cl_mem total_cl = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY,
size_output, NULL, &_err);
printf("\n6-%i\n",_err);
// In the next step we associate the GPU memory with the Kernel arguments
clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem),(void*)&total_cl);
clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&x_cl);
clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&y_cl);
// 7. Launch OpenCL kernel
size_t localWorkSize[1], globalWorkSize[1];
//localWorkSize = ;
globalWorkSize[0] = n;
// Launch the Kernel on the GPU
CL_CHECK(clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLVectorAdd, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL));
// Copy the output in GPU memory back to CPU memory
//float* h_C = (float*) malloc(size_output);
// NOTE(review): this blocking read writes size_output bytes into output,
// which only holds n ints.  Overrunning that stack array looks like the
// likeliest source of the crashes reported below -- TODO confirm.
CL_CHECK(clEnqueueReadBuffer(GPUCommandQueue,
total_cl, CL_TRUE, 0, size_output,
output, 0, NULL, NULL));
for (int i=0; i<n;i++){
printf("\n%i",output[i]);
}
// Cleanup (each of the following lines causes a seg fault
// ******************************
// NOTE(review): free() returns void, so it cannot be the operand of CL_CHECK,
// which assigns its argument to a cl_int.
CL_CHECK(free(GPUDevices));
CL_CHECK(clReleaseKernel(OpenCLVectorAdd));
CL_CHECK(clReleaseProgram(OpenCLProgram));
CL_CHECK(clReleaseCommandQueue(GPUCommandQueue));
CL_CHECK(clReleaseContext(GPUContext));
CL_CHECK(clReleaseMemObject(total_cl));
CL_CHECK(clReleaseMemObject(x_cl));
CL_CHECK(clReleaseMemObject(y_cl));
// NOTE(review): the comment opened on the next line is never closed in this
// listing, so it swallows the return statement and the closing brace.
/* ****************
return 0;
}
Merci!

For people who arrive here in the future:
As Brafford suggested, this is resolved by adding clFinish(GPUCommandQueue) after clEnqueueNDRangeKernel as well as after clEnqueueReadBuffer.
Apparently, trying to clean up any object (e.g. releasing a queue) that is still in use by a running command yields a segmentation fault.

I corrected and changed several small things. So this code should work now.
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <CL/cl.h> //opencl
/*
 * CL_CHECK(expr): run an OpenCL call whose return value is a cl_int status.
 * Any status other than CL_SUCCESS is reported on stderr together with the
 * text of the failing expression, and the process is aborted.
 */
#define CL_CHECK(_expr)                                                      \
    do {                                                                     \
        cl_int _err = _expr;                                                 \
        if (_err != CL_SUCCESS) {                                            \
            fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr,     \
                    (int)_err);                                              \
            abort();                                                         \
        }                                                                    \
    } while (0)
/*
 * CL_CHECK_ERR(expr): for OpenCL calls that return an object and deliver the
 * status via an out-parameter.  A macro-local `_err` starts as
 * CL_INVALID_VALUE, so the wrapped expression must write its status into it
 * (e.g. pass `&_err`) or the check always aborts.  The macro evaluates to the
 * expression's value.  Needs the GNU statement-expression and typeof
 * extensions.
 */
#define CL_CHECK_ERR(_expr)                                                  \
    ({                                                                       \
        cl_int _err = CL_INVALID_VALUE;                                      \
        __typeof__(_expr) _ret = _expr;                                      \
        if (_err != CL_SUCCESS) {                                            \
            fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr,     \
                    (int)_err);                                              \
            abort();                                                         \
        }                                                                    \
        _ret;                                                                \
    })
/*
 * Source text of the VectorAdd kernel as a list of string fragments.
 * init_opencl() passes the array to clCreateProgramWithSource together with
 * a literal fragment count of 7; keep that count in step with this list.
 * Fragments carrying a `//` comment are terminated with "\n" so the comment
 * ends before the following fragment.
 */
const char* OpenCLSource[] = {
"__kernel void VectorAdd(__global int* c, __global int* a,__global int* b)",
"{",
" // Index of the elements to add \n",
" unsigned int n = get_global_id(0);",
" // Sum the n’th element of vectors a and b and store in c \n",
" c[n] = a[n] + b[n];",
"}"
};
/*
 * Create the OpenCL objects needed to run the VectorAdd kernel on a GPU.
 *
 * Out-parameters:
 *   GPUContext       context spanning the GPU devices
 *   GPUCommandQueue  command queue on the first device of the context
 *   cl_forward1      the "VectorAdd" kernel
 *   OpenCLProgram    program built from OpenCLSource
 *
 * Returns the malloc'ed array of devices belonging to the context; the
 * caller owns it and must free() it.
 *
 * Every step is checked: the status of each create call is printed (as
 * before) and any failure aborts via CL_CHECK instead of continuing with an
 * invalid handle.
 */
cl_device_id* init_opencl(cl_context *GPUContext,cl_command_queue *GPUCommandQueue, cl_kernel* cl_forward1,cl_program* OpenCLProgram){
    cl_int status = CL_SUCCESS;

    /* Create a context over the GPU devices of the default platform. */
    *GPUContext = clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU, NULL, NULL, &status);
    printf("\nclCreateContextFromType:%i\n", status);
    CL_CHECK(status);

    /* Two-step query: size of the device list first, then the list. */
    size_t ParmDataBytes = 0;
    CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes));
    if (ParmDataBytes < sizeof(cl_device_id)) {
        fprintf(stderr, "OpenCL Error: context contains no devices\n");
        abort();
    }
    cl_device_id *GPUDevices = malloc(ParmDataBytes);
    if (GPUDevices == NULL) {
        fprintf(stderr, "Out of memory allocating %zu bytes for the device list\n",
                ParmDataBytes);
        abort();
    }
    CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL));

    /* Command queue on the first GPU device. */
    *GPUCommandQueue = clCreateCommandQueue(*GPUContext, GPUDevices[0], 0, &status);
    printf("\nclCreateCommandQueue:%i\n", status);
    CL_CHECK(status);

    /* Derive the fragment count from the array so it cannot go stale. */
    const cl_uint num_fragments =
        (cl_uint)(sizeof(OpenCLSource) / sizeof(OpenCLSource[0]));
    *OpenCLProgram = clCreateProgramWithSource(*GPUContext, num_fragments, OpenCLSource, NULL, &status);
    printf("\nclCreateProgramWithSource:%i\n", status);
    CL_CHECK(status);
    CL_CHECK(clBuildProgram(*OpenCLProgram, 0, NULL, NULL, NULL, NULL));

    *cl_forward1 = clCreateKernel(*OpenCLProgram, "VectorAdd", &status);
    printf("\nclCreateKernel:%i\n", status);
    CL_CHECK(status);

    return GPUDevices;
}
int main(int argc, char** argv)
{
cl_context GPUContext;
cl_command_queue GPUCommandQueue;
cl_program OpenCLProgram;
cl_kernel OpenCLVectorAdd;
cl_device_id* GPUDevices;
GPUDevices=init_opencl(&GPUContext,&GPUCommandQueue,&OpenCLVectorAdd,&OpenCLProgram);
int n=5 ;
int x[5]={1,2,4,6,8};
int y[5]={1,2,4,6,8};
int output[n];
int size_x = n*sizeof(x);
int size_y = n*sizeof(y);
int size_output = n*sizeof(output);
cl_int _err;
// Allocate GPU memory for source vectors AND initialize from CPU memory
cl_mem x_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, size_x, x, &_err);
printf("\nclCreateBuffer:%i\n",_err);
cl_mem y_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, size_y, y, &_err);
printf("\nclCreateBuffer:%i\n",_err);
// Allocate output memory on GPU
cl_mem total_cl = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY,
size_output, NULL, &_err);
printf("\nclCreateBuffer:%i\n",_err);
// In the next step we associate the GPU memory with the Kernel arguments
clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem),(void*)&total_cl);
clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&x_cl);
clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&y_cl);
size_t globalWorkSize[1];
globalWorkSize[0] = n;
// Launch the Kernel on the GPU
CL_CHECK(clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLVectorAdd, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL));
clFinish(GPUCommandQueue);
// Copy the output in GPU memory back to CPU memory
int* h_c = (int*) malloc(size_output);
CL_CHECK(clEnqueueReadBuffer(GPUCommandQueue,
total_cl, CL_TRUE, 0, size_output,
h_c, 0, NULL, NULL));
clFinish(GPUCommandQueue);
for (int i=0; i<n;i++){
printf("\noutput[%i]=%i",i,h_c[i]);
}
// Cleanup
free(GPUDevices);
CL_CHECK(clReleaseKernel(OpenCLVectorAdd));
CL_CHECK(clReleaseProgram(OpenCLProgram));
CL_CHECK(clReleaseCommandQueue(GPUCommandQueue));
CL_CHECK(clReleaseContext(GPUContext));
CL_CHECK(clReleaseMemObject(x_cl));
CL_CHECK(clReleaseMemObject(total_cl));
CL_CHECK(clReleaseMemObject(y_cl));
return 0;
}

Related

Why does L2 hardware prefetcher perform worse with only 1 KiB or 2 KiB access size?

I have a simple multi-threaded program in which each thread performs random reads on a given file (in memory) that is divided evenly among the threads. Each thread reads from the file into a buffer and sets a value. This is really a program designed to test memory bandwidth. The program is as follows:
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <getopt.h>
#include <errno.h>
#include <stdbool.h>
#include <ctype.h>
#include <inttypes.h>
#include <pthread.h>
#include <assert.h>
#include <time.h>
#define NS_IN_SECOND 1000000000
/*
 * Current CLOCK_REALTIME time in nanoseconds.
 *
 * Returns 0 if clock_gettime() fails; previously that path fell off the end
 * of the function without returning a value.  The seconds are widened to
 * uint64_t before the multiply so the product cannot overflow a narrower
 * time_t.
 */
uint64_t nano_time(void) {
    struct timespec ts;
    if (clock_gettime(CLOCK_REALTIME, &ts) != 0)
        return 0;
    return (uint64_t)ts.tv_sec * NS_IN_SECOND + (uint64_t)ts.tv_nsec;
}
// avx512 test
#include <stdint.h>
void *__memmove_chk_avx512_no_vzeroupper(void *dest, void *src, size_t s);
/**
* To create 4 GB file: This will allocate space on disk
* $ dd < /dev/zero bs=1048576 count=4096 > testfile
*
* 100 GiB
* dd if=/dev/zero of=bigmmaptest bs=1M count=102400
* To clear cache:
* $ sync; echo 1 > /proc/sys/vm/drop_caches
*/
//#define SAMPLE_LATENCY 1
#define BYTES_IN_GB (1024*1024*1024)
// Block sized will be used for read and the same will be used for striding
// when iterating over a file in mmap.
#define DEFAULT_BLOCK_SIZE 4096 //8192
#define NANOSECONDS_IN_SECOND 1000000000
const char DEFAULT_NAME[] = "/mnt/tmp/mmaptest";
/*
 * EXIT_MSG(fmt, ...): print a printf-style message to stdout and terminate
 * the process immediately with status -1.
 * _exit() does not flush stdio buffers, so stdout is flushed explicitly;
 * otherwise the message is lost whenever stdout is not line-buffered
 * (for example when the output is piped or redirected).
 */
#define EXIT_MSG(...) \
    do { \
        printf(__VA_ARGS__); \
        fflush(stdout); \
        _exit(-1); \
    } while (0)
uint64_t read_mmap_test(int fd, int tid, size_t block_size, size_t filesize, char* buf,
off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t write_mmap_test(int fd, int tid, size_t block_size, size_t filesize, char* buf,
off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t mmap_test(int fd, int tid, size_t block_size, size_t filesize, char *buf,
char optype, off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t read_syscall_test(int fd, int tid, size_t block_size, size_t filesize,
off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t write_syscall_test(int fd, int tid, size_t block_size, size_t filesize,
off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t syscall_test(int fd, int tid, size_t block_size, size_t filesize,
char optype, off_t *offsets, uint64_t *begin, uint64_t *end);
size_t get_filesize(const char* filename);
void print_help_message(const char *progname);
char* map_buffer(int fd, size_t size);
void *run_tests(void *);
static int silent = 0;
/*
 * Per-thread argument/result record handed to run_tests().
 * main() fills in the configuration fields before pthread_create(); the
 * selected test writes start_time and end_time through pointers into this
 * record.
 */
typedef struct {
int tid;               /* thread index assigned by main() */
int fd;                /* descriptor of the test file, shared by all threads */
char *mapped_buffer;   /* mapping of the file, or NULL when no mmap test was requested */
int read_mmap;         /* test selection flags; run_tests() runs the first */
int read_syscall;      /* one that is set, checked in the order read_mmap, */
int write_mmap;        /* read_syscall, write_mmap, write_syscall */
int write_syscall;
off_t *offsets;        /* start of this thread's slice of the offsets array */
size_t block_size;     /* bytes per read/write/copy */
size_t chunk_size;     /* bytes this thread covers (filesize / numthreads) */
int retval;            /* not written by any code visible in this file */
uint64_t start_time;   /* nano_time() taken just before the test loop */
uint64_t end_time;     /* nano_time() taken just after the test loop */
} threadargs_t;
size_t filesize;
int main(int argc, char **argv) {
char *fname = (char*) DEFAULT_NAME;
char *mapped_buffer = NULL;
int c, fd, i, flags = O_RDWR, numthreads = 1, ret, option_index;
static int randomaccess = 0,
read_mmap = 0, read_syscall = 0,
write_mmap = 0, write_syscall = 0,
mixed_mmap = 0, write_tr = 0;
off_t *offsets = 0;
size_t block_size = DEFAULT_BLOCK_SIZE, numblocks,
new_file_size = 0;
uint64_t min_start_time, max_end_time = 0, retval;
// permissions
uint64_t mode = S_IRWXU | S_IRWXG;
pthread_t *threads;
threadargs_t *threadargs;
static struct option long_options[] =
{
// Options set a flag
{"randomaccess", no_argument, &randomaccess, 1},
{"readmmap", no_argument, &read_mmap, 1},
{"readsyscall", no_argument, &read_syscall, 1},
{"silent", no_argument, &silent, 1},
{"writemmap", no_argument, &write_mmap, 1},
{"writesyscall", no_argument, &write_syscall, 1},
{"mixedmmap", no_argument, &mixed_mmap, 1},
// Options take an argument
{"block", required_argument, 0, 'b'},
{"file", required_argument, 0, 'f'},
{"help", no_argument, 0, 'h'},
{"size", no_argument, 0, 's'},
{"threads", required_argument, 0, 't'},
{"writethreads", no_argument, 0, 'w'},
{0, 0, 0, 0}
};
//read operations
while(1) {
c = getopt_long(argc, argv, "b:f:h:s:t:w:",
long_options, &option_index);
// is end of the option
if (c == -1)
break;
switch(c)
{
case 0:
break;
case 'b':
block_size = atoi(optarg);
break;
case 'f':
fname = optarg;
break;
case 'h':
print_help_message(argv[0]);
_exit(0);
case 's':
new_file_size = (size_t)(atoi(optarg)) * BYTES_IN_GB;
break;
case 't':
numthreads = (int) (atoi(optarg));
break;
case 'w':
write_tr = atoi(optarg);
break;
default:
break;
}
}
if(!silent){
printf("PID: %d\n", getpid());
printf("Using file %s \n", fname);
}
if ((filesize = get_filesize(fname)) == -1) {
if (read_mmap || read_syscall) {
printf("Cannot obtain file size for %s: %s"
"File must exist prior to running read tests.\n",
fname, strerror(errno));
_exit(-1);
}
else
filesize = new_file_size;
}
fd = open((const char*)fname, flags, mode);
if(fd <0) {
printf("Clould not open/create file %s: %s\n",
fname, strerror(errno));
_exit(-1);
}
if(block_size < 0 || block_size > filesize){
printf("Invalid block size: %zu for file of size "
"%zu. Block size must be greater than 0 and no"
"greater than the file size.\n",
block_size, filesize);
_exit(-1);
}
/*
* Generate random block number for random file access.
* Sequential for sequential access
*/
numblocks = filesize/block_size;
if(filesize % block_size > 0)
numblocks++;
offsets = (off_t *) malloc(numblocks * sizeof(off_t));
if(offsets == 0){
printf("Failed to allocate memory: %s\n", strerror(errno));
_exit(-1);
}
for (uint64_t i = 0; i < numblocks; i++)
if(randomaccess)
offsets[i] = ((int)random() % numblocks) * block_size;
else
offsets[i] = i*block_size;
if (numblocks % numthreads != 0)
EXIT_MSG("We have %" PRIu64 " blocks and %d threads. "
"Threads must evenly divide blocks. "
"Please fix the args.\n",
(uint_least64_t)numblocks, numthreads);
if( read_mmap || write_mmap || mixed_mmap)
assert((mapped_buffer = map_buffer(fd, filesize)) != NULL);
threads = (pthread_t*)malloc(numthreads * sizeof(pthread_t));
threadargs =
(threadargs_t*)malloc(numthreads * sizeof(threadargs_t));
if (threads == NULL || threadargs == NULL)
EXIT_MSG("Could not allocate thread array for %d threads.\n", numthreads);
for (i = 0; i < numthreads; i++) {
if(mixed_mmap){
if (i < write_tr) {
write_mmap = 1;
} else {
read_mmap = 1;
}
}
threadargs[i].fd = fd;
threadargs[i].tid = i;
threadargs[i].block_size = block_size;
threadargs[i].chunk_size = filesize/numthreads;
threadargs[i].mapped_buffer = mapped_buffer;
threadargs[i].offsets = &offsets[numblocks/numthreads * i];
threadargs[i].read_mmap = read_mmap;
threadargs[i].read_syscall = read_syscall;
threadargs[i].write_mmap = write_mmap;
threadargs[i].write_syscall = write_syscall;
int ret = pthread_create(&threads[i], NULL, run_tests, &threadargs[i]);
if (ret!=0)
EXIT_MSG("pthread_create for %dth thread failed: %s\n",
i, strerror(errno));
}
for (i = 0; i< numthreads; i++){
ret = pthread_join(threads[i], NULL);
if (ret !=0)
EXIT_MSG("Thread %d failed in join: %s\n",
i, strerror(errno));
}
// for mixed mode determine read and write aggregate b/w.
if(mixed_mmap) {
// Write b/w
min_start_time = threadargs[0].start_time;
max_end_time = 0;
// Since tid 0 to write_tr-1 did writes, find it's min and max.
for(i=0; i < write_tr; i++){
min_start_time = (threadargs[i].start_time < min_start_time)?
threadargs[i].start_time:min_start_time;
max_end_time = (threadargs[i].end_time > max_end_time)?
threadargs[i].end_time:max_end_time;
}
printf("Write: %.2f\n",
(double)write_tr*(filesize/numthreads)/(double)(max_end_time-min_start_time)
* NANOSECONDS_IN_SECOND / BYTES_IN_GB);
// Read b/w
min_start_time = threadargs[write_tr].start_time;
max_end_time = 0;
for(i=write_tr; i < numthreads; i++){
min_start_time = (threadargs[i].start_time < min_start_time)?
threadargs[i].start_time:min_start_time;
max_end_time = (threadargs[i].end_time > max_end_time)?
threadargs[i].end_time:max_end_time;
}
printf("Read: %.2f\n",
(double)(numthreads-write_tr)*(filesize/numthreads)/(double)(max_end_time-min_start_time)
* NANOSECONDS_IN_SECOND / BYTES_IN_GB);
}
/**
* For total run time. Find the smallest start time
* and largest end time across all threads.
*/
min_start_time = threadargs[0].start_time;
max_end_time = 0;
for (i=0; i< numthreads; i++){
min_start_time = (threadargs[i].start_time < min_start_time)?
threadargs[i].start_time:min_start_time;
max_end_time = (threadargs[i].end_time > max_end_time)?
threadargs[i].end_time:max_end_time;
}
printf("%.2f\n",
(double)filesize/(double)(max_end_time-min_start_time)
* NANOSECONDS_IN_SECOND / BYTES_IN_GB);
munmap(mapped_buffer, filesize);
close(fd);
}
void * run_tests(void *args) {
uint64_t retval;
threadargs_t t = *(threadargs_t*)args;
if(t.read_mmap) {
if(!silent)
printf("Running read mmap test:\n");
retval = read_mmap_test(t.fd, t.tid, t.block_size, t.chunk_size,
t.mapped_buffer, t.offsets,
&((threadargs_t*)args)->start_time,
&((threadargs_t*)args)->end_time);
}
else if(t.read_syscall) {
if(!silent)
printf("Running read syscall test:\n");
retval = read_syscall_test(t.fd, t.tid, t.block_size, t.chunk_size,
t.offsets,
&((threadargs_t*)args)->start_time,
&((threadargs_t*)args)->end_time);
}
else if(t.write_mmap) {
if(!silent)
printf("Running write mmap test:\n");
retval = write_mmap_test(t.fd, t.tid, t.block_size, t.chunk_size,
t.mapped_buffer, t.offsets,
&((threadargs_t*)args)->start_time,
&((threadargs_t*)args)->end_time);
}
else if(t.write_syscall) {
if(!silent)
printf("Running write syscall test:\n");
retval = write_syscall_test(t.fd, t.tid, t.block_size, t.chunk_size,
t.offsets,
&((threadargs_t*)args)->start_time,
&((threadargs_t*)args)->end_time);
}
return (void*) 0;
}
#define READ 1
#define WRITE 2
/**
********* SYSCALL section
*/
/* Read variant of syscall_test(): same arguments, operation fixed to READ. */
uint64_t read_syscall_test(int fd, int tid, size_t block_size, size_t filesize,
        off_t *offsets, uint64_t *begin, uint64_t *end) {
    const char op = READ;
    uint64_t token = syscall_test(fd, tid, block_size, filesize, op, offsets,
            begin, end);
    return token;
}
/* Write variant of syscall_test(): same arguments, operation fixed to WRITE. */
uint64_t write_syscall_test(int fd, int tid, size_t block_size, size_t filesize,
        off_t *offsets, uint64_t *begin, uint64_t *end) {
    const char op = WRITE;
    uint64_t token = syscall_test(fd, tid, block_size, filesize, op, offsets,
            begin, end);
    return token;
}
/*
 * Timed pread()/pwrite() loop.
 *
 * Issues one block_size transfer per entry of `offsets` until `filesize`
 * bytes' worth of blocks have been issued, a transfer returns 0, or (for
 * writes) exactly `filesize` bytes have been transferred.
 *
 * fd          file to operate on
 * tid         thread id, used only in the report
 * block_size  bytes per transfer
 * filesize    bytes this call is responsible for
 * optype      READ or WRITE
 * offsets     file offset of each block, at least ceil(filesize/block_size)
 *             entries
 * begin, end  receive the nano_time() stamps around the loop
 *
 * Returns a token derived from the buffer contents, or (uint64_t)-1 on
 * allocation or I/O failure (in which case *begin and *end are not written).
 *
 * Fixes: the transfer result is kept in a ssize_t so the error value is
 * compared as a signed number, the block index is a size_t, and the buffer
 * is released on every exit path.
 */
uint64_t syscall_test(int fd, int tid, size_t block_size, size_t filesize,
        char optype, off_t *offsets, uint64_t *begin, uint64_t *end) {
    size_t blocks_issued = 0;
    size_t total_bytes_transferred = 0;
    uint64_t begin_time, end_time, ret_token = 0;

    char *buffer = malloc(block_size);
    if (buffer == NULL) {
        printf("Failed to allocate memory: %s\n", strerror(errno));
        return (uint64_t)-1;
    }
    memset(buffer, 0, block_size);

    begin_time = nano_time();
    while (blocks_issued * block_size < filesize) {
        ssize_t bytes_transferred;

        if (optype == READ)
            bytes_transferred = pread(fd, buffer, block_size, offsets[blocks_issued++]);
        else if (optype == WRITE)
            bytes_transferred = pwrite(fd, buffer, block_size, offsets[blocks_issued++]);
        else
            break;  /* unknown operation: nothing to do */

        if (bytes_transferred < 0) {
            printf("Failed to IO: %s\n", strerror(errno));
            free(buffer);
            return (uint64_t)-1;
        }
        if (bytes_transferred == 0)
            break;

        total_bytes_transferred += (size_t)bytes_transferred;
        // Do random operation
        ret_token += buffer[0];
        if (optype == WRITE && total_bytes_transferred == filesize)
            break;
    }
    end_time = nano_time();

    if (!silent) {
        printf("%s: %" PRIu64 " bytes transferred in %" PRIu64 ""
                " ns.\n", (optype == READ)?"read-syscall":"write-syscall",
                (uint64_t)total_bytes_transferred, (end_time-begin_time));
        // Throughput in GB/s
        printf("(tid %d) %.2f\n", tid,
                (double)filesize/(double)(end_time-begin_time)
                * NANOSECONDS_IN_SECOND / BYTES_IN_GB);
    }
    *begin = begin_time;
    *end = end_time;
    free(buffer);
    return ret_token;
}
/**
* MMAP tests
*/
/* Read variant of mmap_test(): same arguments, operation fixed to READ. */
uint64_t read_mmap_test(int fd, int tid, size_t block_size, size_t filesize,
        char *buf, off_t *offsets, uint64_t *begin, uint64_t *end) {
    const char op = READ;
    uint64_t token = mmap_test(fd, tid, block_size, filesize, buf, op,
            offsets, begin, end);
    return token;
}
/* Write variant of mmap_test(): same arguments, operation fixed to WRITE. */
uint64_t write_mmap_test(int fd, int tid, size_t block_size, size_t filesize,
        char *buf, off_t *offsets, uint64_t *begin, uint64_t *end){
    const char op = WRITE;
    uint64_t token = mmap_test(fd, tid, block_size, filesize, buf, op,
            offsets, begin, end);
    return token;
}
// Add memory addr
/*
 * Optional per-block latency sampling for mmap_test().
 * When SAMPLE_LATENCY is non-zero, BEGIN_LAT_SAMPLE / END_LAT_SAMPLE bracket
 * one copy and store its duration in latency_samples[].  The macros expand in
 * the body of mmap_test() and use its locals by name: i, block_size,
 * num_samples, lat_begin_time, lat_end_time and latency_samples.
 * When SAMPLE_LATENCY is zero or undefined both macros expand to (almost)
 * nothing, so the timed loop carries no sampling overhead.
 */
#if SAMPLE_LATENCY
/* Start a sample if fewer than MAX_LAT_SAMPLES were taken and i is a
 * multiple of the sampling interval. */
#define BEGIN_LAT_SAMPLE \
if (num_samples < MAX_LAT_SAMPLES && i%LAT_SAMPL_INTERVAL == 0) \
lat_begin_time = nano_time();
/* Finish the sample under the same condition and record its duration. */
#define END_LAT_SAMPLE \
if (num_samples < MAX_LAT_SAMPLES && i%LAT_SAMPL_INTERVAL == 0) { \
lat_end_time = nano_time(); \
latency_samples[i/LAT_SAMPL_INTERVAL % MAX_LAT_SAMPLES] = \
lat_end_time - lat_begin_time; \
num_samples++; \
}
/* Capacity of latency_samples[]. */
#define MAX_LAT_SAMPLES 50
//#define LAT_SAMPL_INTERVAL (1000*1048576)
/* Sampling interval in bytes of loop progress; i advances by block_size. */
#define LAT_SAMPL_INTERVAL block_size
#else
/* Sampling disabled: the call sites reduce to empty statements. */
#define BEGIN_LAT_SAMPLE ;
#define END_LAT_SAMPLE
#endif
/*
 * Timed memcpy loop over a memory-mapped file.
 *
 * For every block of this thread's share, copies block_size bytes between a
 * private scratch buffer and mapped_buffer at the offset taken from
 * `offsets` (mapping -> buffer for READ, buffer -> mapping for WRITE).
 *
 * fd             unused here; kept for signature symmetry with syscall_test
 * tid            thread id, used only in the report
 * block_size     bytes per copy; must be non-zero
 * filesize       bytes this call is responsible for
 * mapped_buffer  base of the mapping
 * optype         READ or WRITE
 * offsets        offset of each block, one entry per block_size step
 * begin, end     receive the nano_time() stamps around the loop
 *
 * Returns a token accumulated from the copied data (which keeps the copies
 * observable), or (uint64_t)-1 if the scratch buffer cannot be allocated.
 *
 * Fixes: the scratch buffer is freed; the write path now samples the block
 * it just wrote (mapped_buffer[offset]) rather than mapped_buffer[i], which
 * touched an unrelated location inside the timed loop; the latency report
 * uses format specifiers that match uint64_t/size_t; unused locals removed.
 */
uint64_t mmap_test(int fd, int tid, size_t block_size, size_t filesize, char *mapped_buffer,
        char optype, off_t *offsets, uint64_t *begin, uint64_t *end) {
    char *buffer = NULL;
    uint64_t i;  /* name is referenced by the latency-sampling macros */
    uint64_t begin_time, end_time, ret_token = 0;
#if SAMPLE_LATENCY
    uint64_t lat_begin_time, lat_end_time;
    size_t latency_samples[MAX_LAT_SAMPLES];
    int num_samples = 0;
    memset((void*)latency_samples, 0, sizeof(latency_samples));
#endif
    (void)fd;

    buffer = malloc(block_size);
    if (buffer == NULL) {
        printf("Failed to allocate memory: %s\n", strerror(errno));
        return (uint64_t)-1;
    }
    memset(buffer, 1, block_size);

    begin_time = nano_time();
    for (i = 0; i < filesize; i += block_size) {
        off_t offset = offsets[i/block_size];
        BEGIN_LAT_SAMPLE;
        if (optype == READ) {
            //__memmove_chk_avx512_no_vzeroupper(buffer, &mapped_buffer[offset], block_size);
            memcpy(buffer, &mapped_buffer[offset], block_size);
            ret_token += buffer[0];
        }
        else if (optype == WRITE) {
            //__memmove_chk_avx512_no_vzeroupper(&mapped_buffer[offset], buffer, block_size);
            memcpy(&mapped_buffer[offset], buffer, block_size);
            ret_token += mapped_buffer[offset];
        }
        END_LAT_SAMPLE;
    }
    end_time = nano_time();

    if (!silent) {
        printf("%s: %" PRIu64 " bytes read in %" PRIu64 " ns.\n",
                (optype==READ)?"readmap":"writemap",
                (uint64_t)filesize, (end_time-begin_time));
        // print GB/s
        printf("(tid %d) %.2f\n", tid,
                (double)filesize/(double)(end_time-begin_time)
                * NANOSECONDS_IN_SECOND / BYTES_IN_GB);
    }
    *begin = begin_time;
    *end = end_time;
#if SAMPLE_LATENCY
    printf("\nSample latency for %zu byte block:\n", block_size);
    for (i = 0; i < MAX_LAT_SAMPLES; i++)
        printf("\t%" PRIu64 ": %zu\n", i, latency_samples[i]);
#endif
    free(buffer);
    return ret_token;
}
/*
 * Map the first `size` bytes of `fd` read/write as a private, pre-populated
 * mapping and return its base address.  Terminates the process through
 * EXIT_MSG if mmap() fails, so the function does not return on error.
 *
 * Variants the author experimented with (not active): a MAP_SHARED mapping
 * of the file, a MAP_PRIVATE | MAP_ANONYMOUS mapping without a file, and
 * madvise(..., MADV_HUGEPAGE) on the result (which might additionally need a
 * page-aligned region).
 */
char* map_buffer(int fd, size_t size) {
    void *region = mmap(NULL, size, PROT_READ | PROT_WRITE,
            MAP_PRIVATE | MAP_POPULATE, fd, 0);

    if (region == MAP_FAILED) {
        EXIT_MSG("Failed to mmap file of size %zu: %s\n",
                size, strerror(errno));
    }
    return (char*)region;
}
/*
 * Size of `filename` in bytes as reported by stat().
 * Returns (size_t)-1 when stat() fails (errno is left as stat() set it).
 */
size_t get_filesize(const char* filename){
    struct stat info;

    if (stat(filename, &info) != 0)
        return (size_t)-1;
    return info.st_size;
}
/*
 * Print the usage text to stdout.
 * `progname` is normally argv[0]; only the part after the last '/' is shown.
 *
 * Fixes: the write-thread option was printed as "-writethreads" (single
 * dash), "Perfom" was misspelled, and the -s/--size and -t short forms that
 * the option parser accepts were not listed.
 */
void print_help_message(const char *progname) {
    /* take only the last portion of the path */
    const char *slash = strrchr(progname, '/');
    const char *name = slash ? slash + 1 : progname;

    printf("usage: %s [OPTION]\n", name);
    printf(" -h, --help\n"
            " Print this help and exit.\n");
    printf(" -b, --block[=BLOCKSIZE]\n"
            " Block size used for read system calls.\n"
            " For mmap tests, the size of the stride when iterating\n"
            " over the file.\n"
            " Defaults to %d.\n", DEFAULT_BLOCK_SIZE);
    printf(" -f, --file[=FILENAME]\n"
            " Perform all tests on this file (defaults to %s).\n",
            DEFAULT_NAME);
    printf(" -s, --size[=GB]\n"
            " File size in GB to assume when the file does not exist\n"
            " yet (write tests only).\n");
    printf(" --readsyscall\n"
            " Perform a read test using system calls.\n");
    printf(" --readmmap\n"
            " Perform a read test using mmap.\n");
    printf(" --writesyscall\n"
            " Perform a write test using system calls.\n");
    printf(" --writemmap\n"
            " Perform a write test using mmap.\n");
    printf(" --randomaccess\n"
            " Perform random access.\n");
    printf(" -t, --threads\n"
            " Number of threads to use. Defaults to one.\n");
    printf(" --mixedmmap\n"
            " Perform read and write concurrently at different offsets\n");
    printf(" -w, --writethreads[=0]\n"
            " Number of threads that should perform write\n");
}
To compile:
$ gcc testm.c -o testm -lpthread -static -O2 -fno-builtin-memcpy
Commands to run the program:
$ dd if=/dev/zero of=bigmmaptest bs=1M count=25600 # 25 GiB file
$ ./testm -b 1024 -f bigmmaptest --threads 16 --randomaccess --readmmap
I am on a 32 core Xeon 5218 2nd Gen. L1d KiB /L2 MiB /L3 MiB -- 512 / 16 / 22
When the memcpy size is 1 KiB I get 21.7 GB/s but when the size is 256B I get 26.68 GB/s and 34.8 GB/s when the size is 4 KiB. Why is there a drop in the middle?
I observe that 2 KiB also performs poorly when compared to 256B and 4 KiB.
What's more interesting is, when I disable the L2 hardware prefetcher and without any other changes my bandwidth automatically increases for 1 KiB and 2 KiB. Without prefetch 2 KiB memcpy gives 34.8 GB/s. All of these are aggregate bandwidth.
With perf, I did measure L2 load-store misses but they turned out to not change drastically. This effect is also not seen for 8 threads and below.
I am on linux 5.0.4. I am using the glibC memcpy (gcc 7.5.0) and even with -O2 I observe the above quirk. Where 1 KiB access size gives 18.76 GiB/s with L2 prefetch and without I get 30.32 GiB/s. For comparison, 256 B access size provides 24.7 GiB/s with prefetch and 24.8 GiB/s without. Clearly, the drop in performance is because of the L2 cache pollution caused by the prefetcher, as this is not observed with smaller thread counts. I was considering if SMT could be the reason for increased pollution but I observe the effect distinctly at 16 threads on 16 physical cores.
Skimming through glibc memcpy code, I can see that any access below the size of 4 KiB uses AVX 256 instructions, so there is nothing changing there.

The smaller 256B size not seeing a drop from the L2 streamer might be due to the sequence of cache misses being too short to activate the streamer and waste bandwidth (and slots in the LFBs and L2 <-> L3 superqueue) on requests that won't be useful.
For aligned 4k, there are no bytes within the same page that you're not fetching, so the L2 prefetcher is positively useful, or at least not harmful. (Demand loads come in pretty quickly for later lines when running memcpy so I'm guessing speeds were about the same with/without HW prefetch enabled, unless HW prefetch helps getting started on a new 4k chunk while still waiting for the end of the previous.)
The L2 only sees physical addresses, and AFAIK it doesn't try to prefetch across a 4k boundary. (Even if its within the same 2M hugepage, because it doesn't know that either.) The "next-page prefetcher" Intel mentions being new in Ivy Bridge is AFAIK just a TLB prefetch, not data.
So with aligned 4k memcpy, HW prefetch stops automatically at the end of the data you're actually going to read, not wasting any bandwidth. Since mmap gives you page-aligned memory, these 4k memcopies are from a single source page.
(The destination is irrelevant as it probably stays hot in L1d cache, with maybe an occasional eviction to L2, and the reload from it after memcpy can come from store-forwarding, not even having to wait for memcpy's store to commit to L1d.)
Prediction: If your smaller memcpy source starts part way into a 4k page, but still ends at the end of a 4k page, you'd probably see similar behaviour to prefetch disabled. e.g. generate a random page number, and start at 3072 bytes into it, doing a 1 KiB copy. So all your 1 KiB copies come from the ends of pages, never middles.
(You'd still have more dTLB misses per byte memcpyed, because each TLB entry is only covering 1 K of the data you ever actually read. You did use MAP_POPULATE, so you shouldn't be seeing page faults in the timed region, assuming you have enough RAM.)
L1d KiB /L2 MiB /L3 MiB -- 512 / 16 / 22
Those are aggregate totals, but L1d and L2 are private per-core! You have 32kiB L1d and 1MiB L2 per core, because this is Cascade Lake, same layout as Skylake-X.
And BTW, I'd consider using a fast PRNG like xorshift+ or xorshift* inside the timing loop; that's easily random enough to defeat prefetching; even a simple LFSR or even LCG with a power-of-2 modulo would do that (and be very cheap, just an imul and add). It avoids having to read offsets from another array, if you really want to isolate just the memcpy memory accesses. Probably doesn't make a difference though. One advantage of a very simple PRNG with a period equal to the space you're trying to cover (like an LCG) is that you won't generate the same address twice, giving you a random permutation of the blocks. But with a big enough block of memory, random cache hits even from L3 are unlikely even without that hard-to-achieve property.
Your current array of offsets is fine. (I didn't look at the code super closely, so I'm just assuming there aren't bugs.)

OpenCL Callback hangs / freezes (deadlock, pthread_cond_wait)

I created a basic snippet:
Kernel:
/* Element-wise sum: c[i] = a[i] + b[i] for every work-item index i in
 * [0, size).  Work-items outside that range do nothing. */
__kernel void
kernel1(__global int* a, __global int* b, __global int* c, int size)
{
  const int gid = get_global_id(0);
  if (gid < 0 || gid >= size) {
    return;
  }
  c[gid] = a[gid] + b[gid];
}
Code:
#include <CL/cl.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAX_FILE_SIZE 1024000
#include <sys/stat.h>
#include <sys/types.h>
typedef enum ocl_type_e_t {
OCL_TYPE_NULL = 0,
OCL_TYPE_CPU = 1,
OCL_TYPE_GPU = 2,
OCL_TYPE_IGPU = 3,
OCL_TYPE_ACC = 4
} ocl_type_e_t;
/*
 * Name of an OpenCL device type constant.
 * Returns the identifier of the matching CL_DEVICE_TYPE_* value as a static
 * string, or "(invalid)" for any other value (including combinations of
 * several type bits other than CL_DEVICE_TYPE_ALL).
 */
const char*
cl_device_type_to_str(cl_device_type type)
{
  switch (type) {
    case CL_DEVICE_TYPE_CPU:
      return "CL_DEVICE_TYPE_CPU";
    case CL_DEVICE_TYPE_GPU:
      return "CL_DEVICE_TYPE_GPU";
    case CL_DEVICE_TYPE_ACCELERATOR:
      return "CL_DEVICE_TYPE_ACCELERATOR";
    case CL_DEVICE_TYPE_CUSTOM:
      return "CL_DEVICE_TYPE_CUSTOM";
    case CL_DEVICE_TYPE_DEFAULT:
      return "CL_DEVICE_TYPE_DEFAULT";
    case CL_DEVICE_TYPE_ALL:
      return "CL_DEVICE_TYPE_ALL";
    default:
      return "(invalid)";
  }
}
/*
 * Read a whole file into a newly allocated, NUL-terminated buffer.
 *
 * path  file to read; must not be larger than MAX_FILE_SIZE bytes.
 *
 * Returns the buffer; the caller owns it and releases it with free().
 * Any failure (stat, size limit, open, allocation, read error) prints a
 * message and terminates the process with EXIT_FAILURE.
 *
 * Fixes:
 *  - every fread() call used to write at the start of the buffer, so a short
 *    read followed by another read overwrote data already read; reads are
 *    now appended;
 *  - the buffer had no terminator although it is returned as a plain
 *    `const char*` with no length; one extra byte is allocated and set to
 *    '\0';
 *  - the size check compares st_size directly instead of a value truncated
 *    to int, so very large files cannot slip past the limit.
 */
const char*
file_read(char* const path)
{
  struct stat st;
  if (stat(path, &st) != 0) {
    printf("Invalid file %s\n", path);
    exit(EXIT_FAILURE);
  }

  if (st.st_size < 0 || st.st_size > MAX_FILE_SIZE) {
    printf("File %s is bigger than the max allowed size (%lld > %d bytes)\n",
           path, (long long)st.st_size, MAX_FILE_SIZE);
    exit(EXIT_FAILURE);
  }
  const size_t size_file = (size_t)st.st_size;

  FILE* fp = fopen(path, "r");
  if (fp == NULL) {
    printf("Error opening the file %s\n", path);
    exit(EXIT_FAILURE);
  }

  char* const buf = malloc(size_file + 1);
  if (buf == NULL) {
    printf("Error allocating %zu bytes for the contents of the file %s\n",
           size_file, path);
    fclose(fp);
    exit(EXIT_FAILURE);
  }

  /* fread may return fewer bytes than requested; keep appending until the
   * expected size is reached or the stream ends. */
  size_t total = 0;
  while (total < size_file) {
    size_t got = fread(buf + total, sizeof(char), size_file - total, fp);
    if (got == 0) {
      break;
    }
    total += got;
  }
  if (ferror(fp)) {
    printf("Error reading the file %s\n", path);
    fclose(fp);
    free(buf);
    exit(EXIT_FAILURE);
  }
  buf[total] = '\0';

  fclose(fp);
  return buf;
}
/* Events produced by the read-buffer commands that callback_kernel_fn
 * enqueues; clbWaitEvents inspects and waits on them. */
cl_event clb_events_waiting[100];
/* Per-event device tag. clbWaitEvents prints 0 as "CPU", 1 as "GPU" and any
 * other value as "ACC". */
int clb_events_waiting_device[100];
/* Per-event start of the read. clbWaitEvents divides it by sizeof(int) to
 * obtain an index into the result array, so it is presumably a byte offset. */
int clb_events_init_read[100];
/* Number of entries currently in use in the three arrays above. */
int clb_num_events_waiting = 0;
/* Human-readable label for a device tag stored in clb_events_waiting_device:
 * 0 is "CPU", 1 is "GPU", everything else is "ACC". */
static const char*
clb_device_label(int dev)
{
  if (dev == 0) {
    return "CPU";
  }
  if (dev == 1) {
    return "GPU";
  }
  return "ACC";
}

/*
 * Reports the state of every recorded read event and blocks until all of
 * them have completed.
 *
 * c: host result array; for each recorded event the three values starting
 *    at that event's first index are printed for inspection.
 *
 * Does nothing when no events have been recorded.
 */
void
clbWaitEvents(int * c)
{
  if (clb_num_events_waiting <= 0) {
    return;
  }

  printf("About to wait events: %d\n", clb_num_events_waiting);

  cl_event pending[100];
  int num_pending = 0;

  printf("%d = CL_QUEUED, %d = CL_COMPLETE, %d = CL_SUBMITTED, %d = CL_RUNNING\n", CL_QUEUED, CL_COMPLETE, CL_SUBMITTED, CL_RUNNING);

  /* First pass: print each event's status and collect the unfinished ones. */
  for (int slot = 0; slot < clb_num_events_waiting; ++slot) {
    cl_int status;
    clGetEventInfo(clb_events_waiting[slot], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &status, NULL);

    const char* label = clb_device_label(clb_events_waiting_device[slot]);
    int first = clb_events_init_read[slot] / sizeof(int);
    printf("cl_event %s init %6d [%d] = status %d (ref %p)\n", label, first, slot, status, (void*)clb_events_waiting[slot]);

    if (status != CL_COMPLETE) {
      pending[num_pending] = clb_events_waiting[slot];
      ++num_pending;
    }
  }

  /* Second pass: show what the host array currently holds for each read. */
  for (int slot = 0; slot < clb_num_events_waiting; ++slot) {
    const char* label = clb_device_label(clb_events_waiting_device[slot]);
    int first = clb_events_init_read[slot] / sizeof(int);
    printf("%s [%d] = %d, [%d] = %d, [%d] = %d\n", label, first, c[first], first + 1, c[first + 1], first + 2, c[first + 2]);
  }

  if (num_pending > 0) {
    printf("about to wait %d events\n", num_pending);
    clWaitForEvents(num_pending, pending);
    printf("wait events finished\n");
  }
}
/* State handed to the event callbacks through their user_data pointer.
 * The pointer members refer to objects owned by main(). */
typedef struct callback_data
{
/* Command queue on which callback_kernel_fn enqueues the read. */
cl_command_queue* queue;
/* Device buffer that is read back. */
cl_mem* buf_c;
/* Host array receiving the contents of buf_c. */
int* c_v;
/* Number of int elements to read. */
uint size;
/* User event the callbacks set to CL_COMPLETE to wake up main(). */
cl_event* end;
/* true: callback_kernel_fn chains callback_read_fn on the read event;
 * false: callback_kernel_fn completes the user event itself. */
bool nested_callbacks;
/* Blocking flag forwarded to clEnqueueReadBuffer by callback_kernel_fn. */
bool blocking;
} callback_data;
/* Event callbacks registered with clSetEventCallback; both receive a
 * callback_data pointer as user_data. Defined after main(). */
void CL_CALLBACK callback_read_fn(cl_event event, cl_int ev_status,
void* user_data);
void CL_CALLBACK callback_kernel_fn(cl_event event, cl_int ev_status,
void* user_data);
int
main(int argc, char* argv[])
{
bool use_callbacks = true;
bool use_nested_callbacks = true;
bool use_blocking = false;
int numSelPlatform = 0;
int numSelDevice = 0;
int doUseCallbacks = 0;
int doUseNestedCallbacks = 0;
int doUseBlocking = 0;
int use_type = 0;
if (argc != 7) {
printf("./%s (platform) (device) (type cpu 0|gpu 1|igpu 2|acc 3) (use "
"callbacks) (use nested callbacks) (use blocking)\n",
argv[0]);
exit(EXIT_FAILURE);
} else {
numSelPlatform = atoi(argv[1]);
numSelDevice = atoi(argv[2]);
use_type = atoi(argv[3]);
doUseCallbacks = atoi(argv[4]);
doUseNestedCallbacks = atoi(argv[5]);
doUseBlocking = atoi(argv[6]);
}
cl_event end;
uint size = 1024;
int* a_v = (int*)malloc(size * sizeof(int));
int* b_v = (int*)malloc(size * sizeof(int));
int* c_v = (int*)malloc(size * sizeof(int));
for (size_t i = 0; i < size; i++) {
a_v[i] = i;
b_v[i] = i + 1;
c_v[i] = 0;
}
const char* kernel_str = file_read("src/kernel.cl");
use_callbacks = doUseCallbacks;
use_nested_callbacks = doUseNestedCallbacks;
use_blocking = doUseBlocking ? CL_TRUE : CL_FALSE;
cl_int st;
cl_int err;
int len = 256;
char buflog[len];
cl_uint numPlatforms = 0;
st = clGetPlatformIDs(0, NULL, &numPlatforms);
cl_platform_id* platforms = NULL;
platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));
st = clGetPlatformIDs(numPlatforms, platforms, NULL);
printf("platforms: %d (%d)\n", numPlatforms, st);
cl_uint selPlatform = numSelPlatform; // 1;
numPlatforms = 1;
cl_platform_id platform = platforms[selPlatform];
clGetPlatformInfo(platform, CL_PLATFORM_NAME, len, &buflog, NULL);
if (buflog != NULL) {
printf("platform name: %s\n", buflog);
}
cl_uint numDevices = 0;
st = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
printf("num devices: %d (%d)\n", numDevices, st);
if (st != CL_SUCCESS) {
/* printf("explain error: %s\n", clErrorString(st)); */
printf("error: %d\n", st);
}
cl_device_id* devices = NULL;
devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
st = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
printf("devices: %d (%d)\n", numDevices, st);
// Context
cl_context context;
context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &err);
printf("context (%d)\n", err);
// Select device
cl_uint selDevice = numSelDevice; // 0;
numDevices = 1; // clBuildProgram
cl_device_id device = devices[selDevice];
// Device Info
clGetDeviceInfo(device, CL_DEVICE_NAME, len, &buflog, NULL);
if (buflog != NULL) {
printf("device name: %s\n", buflog);
}
cl_device_type type;
clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL);
printf("device type: %s\n", cl_device_type_to_str(type));
// events
cl_event ev_kernel;
// CommandQueue
/* cl_command_queue_properties props; */
cl_command_queue queue;
queue = clCreateCommandQueue(context, device, 0, &err);
printf("command queue (%d)\n", err);
// CreateBuffer
cl_mem buf_a;
cl_mem buf_b;
cl_mem buf_c;
ocl_type_e_t ocl_type;
if (use_type == 0) {
ocl_type = OCL_TYPE_CPU;
printf("mode CPU\n");
} else if (use_type == 1) {
ocl_type = OCL_TYPE_GPU;
printf("mode GPU\n");
} else if (use_type == 2) {
ocl_type = OCL_TYPE_IGPU;
printf("mode IGPU\n");
} else if (use_type == 3) {
ocl_type = OCL_TYPE_ACC;
printf("mode ACC\n");
}
/* cl_mem buf_x; */
switch (ocl_type) {
case OCL_TYPE_IGPU:
buf_a = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, size * sizeof(int),
a_v, &err);
/* buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE |
* CL_MEM_COPY_HOST_PTR, n * n * sizeof(int), */
/* Acpy, &err); */
break;
case OCL_TYPE_GPU:
buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(int),
a_v, &err);
break;
case OCL_TYPE_ACC:
buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
size * sizeof(int), a_v, &err);
break;
case OCL_TYPE_CPU:
buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
size * sizeof(int), a_v, &err);
break;
default:
printf("no ocl_type defined\n");
exit(EXIT_FAILURE);
break;
}
printf("create buffer a (%d)\n", err);
if (err != CL_SUCCESS) {
/* printf("create buffer error: %s\n", clErrorString(err)); */
printf("create buffer error: %d\n", err);
}
switch (ocl_type) {
case OCL_TYPE_IGPU:
buf_b = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, size * sizeof(int),
b_v, &err);
break;
case OCL_TYPE_GPU:
buf_b = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(int),
b_v, &err);
break;
case OCL_TYPE_ACC:
buf_b = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
size * sizeof(int), b_v, &err);
break;
case OCL_TYPE_CPU:
buf_b = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
size * sizeof(int), b_v, &err);
break;
default:
printf("no ocl_type defined\n");
exit(EXIT_FAILURE);
break;
}
printf("create buffer b (%d)\n", err);
if (err != CL_SUCCESS) {
printf("create buffer error: %d\n", err);
/* printf("create buffer error: %s\n", clErrorString(err)); */
}
switch (ocl_type) {
case OCL_TYPE_IGPU:
buf_c = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, size * sizeof(int),
c_v, &err);
/* buf_c = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, c_rows * c_cols *
* sizeof(int), */
/* c_v, &err); */
/* buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE |
* CL_MEM_COPY_HOST_PTR, n * n * sizeof(int), */
/* Acpy, &err); */
break;
case OCL_TYPE_GPU:
buf_c = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(int),
c_v, &err);
break;
case OCL_TYPE_ACC:
buf_c = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
size * sizeof(int), c_v, &err);
break;
case OCL_TYPE_CPU:
buf_c = clCreateBuffer(context, CL_MEM_READ_WRITE |
CL_MEM_USE_HOST_PTR,
/* buf_c = */
/* clCreateBuffer(context, CL_MEM_USE_HOST_PTR, */
/* buf_c = clCreateBuffer(context, CL_MEM_READ_WRITE, */
size * sizeof(int), c_v, &err);
break;
default:
printf("no ocl_type defined\n");
exit(EXIT_FAILURE);
break;
}
printf("create buffer c (%d)\n", err);
if (err != CL_SUCCESS) {
/* printf("create buffer error: %s\n", clErrorString(err)); */
printf("create buffer error: %d\n", err);
}
/* b_x = clCreateBuffer(context, CL_MEM_WRITE_ONLY, n * sizeof(float), x,
* &err); */
/* printf("create buffer x (%d)\n", err); */
// WriteBuffer
/* st = clEnqueueWriteBuffer(queue, b_a, CL_FALSE, 0, n * n * sizeof(float),
*/
/* Acpy, 0, NULL, NULL); */
/* printf("write buffer Acpy - b_a (%d)\n", st); */
/* st = clEnqueueWriteBuffer(queue, b_b, CL_FALSE, 0, n * sizeof(float), bcpy,
* 0, */
/* NULL, NULL); */
/* printf("write buffer bcpy - b_b (%d)\n", st); */
// Create Program
cl_program program;
program = clCreateProgramWithSource(context, 1, (const char**)&kernel_str,
NULL, &err);
printf("create program (%d)\n", err);
// Build Program
/* st = clBuildProgram(program, numDevices, (cl_device_id*)&device, NULL,
* NULL, */
/* NULL); */
char* opts = "-Werror";
st = clBuildProgram(program, numDevices, (cl_device_id*)&device, opts, NULL,
NULL);
printf("build program (%d)\n", st);
if (st != CL_SUCCESS) {
/* printf("build status: %s\n", clErrorString(st)); */
printf("build status: %d\n", st);
char log[512];
st = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 512, &log,
NULL);
printf("build info (%d)\n", st);
if (st == CL_SUCCESS) {
printf("%s\n", log);
}
}
// Create Kernel
cl_kernel kernel1;
kernel1 = clCreateKernel(program, "kernel1", &st);
printf("create kernel1 (%d)\n", st);
/* cl_kernel kernel2; */
/* kernel2 = clCreateKernel(program, "ocl1_2", &st); */
/* printf("create kernel2 (%d)\n", st); */
// workgroup size
size_t dims = 1;
size_t gws[] = { 1, 1, 1 };
/* size_t gws[dims]; */
gws[0] = size; // a_rows;
/* gws[0] = 32; */
/* size_t* lws = NULL; */
/* size_t lws[dims]; */
/* size_t lws[dims]; */
/* size_t lws[dims] = NULL; */
/* size_t lws[] = {0, 0, 0}; */
size_t lws[] = { 128, 1, 1 };
printf("gws {%lu, %lu, %lu}\n", gws[0], gws[1], gws[2]);
if (lws != NULL) {
printf("lws {%lu, %lu, %lu}\n", lws[0], lws[1], lws[2]);
} else {
printf("lws unspecified\n");
}
// Set Kernel Args
st = clSetKernelArg(kernel1, 0, sizeof(cl_mem), &buf_a);
printf("set arg %d (%d)\n", 0, st);
st = clSetKernelArg(kernel1, 1, sizeof(cl_mem), &buf_b);
printf("set arg %d (%d)\n", 1, st);
/* printf("set kernel1 arg: %d (%d)\n", 0, st); */
st = clSetKernelArg(kernel1, 2, sizeof(cl_mem), &buf_c);
printf("set arg %d (%d)\n", 2, st);
st = clSetKernelArg(kernel1, 3, sizeof(int), (int*)&size);
printf("set arg %d (%d)\n", 3, st);
// Execute kernel
st = clEnqueueNDRangeKernel(queue, kernel1, dims, NULL, (const size_t*)gws,
(const size_t*)lws, 0, NULL, &ev_kernel);
/* (const size_t*)lws, 0, NULL, NULL); */
/* printf("nd range kernel1 (%d %s)\n", st, clErrorString(st)); */
printf("nd range kernel1 (%d)\n", st);
end = clCreateUserEvent(context, &st);
printf("create user event (%d)\n", st);
callback_data* user_data = (callback_data*)malloc(sizeof(callback_data));
printf("c_v %p\n", (void*)c_v);
user_data->queue = &queue;
user_data->buf_c = &buf_c;
user_data->c_v = c_v;
user_data->size = size;
user_data->end = &end;
user_data->nested_callbacks = use_nested_callbacks;
user_data->blocking = use_blocking;
if (use_callbacks) {
st =
clSetEventCallback(ev_kernel, CL_COMPLETE, callback_kernel_fn, user_data);
printf("set event callback (%d)\n", st);
}
/* printf("first: %2.5f\n", c_v[0]); */
/* print_matrix_float_s_t("c", c); */
// ReadBuffer
/* float* ptr = (float*)clEnqueueMapBuffer(queue, buf_c, CL_TRUE, CL_MAP_READ,
* 0, c_rows * c_cols * sizeof(float), 0, NULL, NULL, &st); */
/* printf("read buffer c_v - buf_c (%d)\n", st); */
/* printf("finish queue\n"); */
/* clFinish(queue); */
/* printf("finished queue\n"); */
if (use_callbacks) {
/* clWaitForCompletion(context); */
printf("waiting for events\n");
/* /\* cl_event events[] = {ev_kernel}; *\/ */
cl_event events[] = { end };
clWaitForEvents(1, events); // ev_kernel);
printf("waited for events\n");
clbWaitEvents(c_v);
} else {
printf("about to read the c buffer\n");
st = clEnqueueReadBuffer(queue, buf_c, use_blocking, 0, size * sizeof(int),
c_v, 0, NULL, NULL);
printf("read buffer c_v - buf_c (%d)\n", st);
}
/* print_matrix("c_v", c_v, c_rows, c_cols); */
/* printf("first: %2.5f\n", c_v[0]); */
/* print_matrix_float_s_t("c", c); */
free(user_data);
clReleaseKernel(kernel1);
/* clReleaseKernel(kernel2); */
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseMemObject(buf_a);
clReleaseMemObject(buf_b);
clReleaseMemObject(buf_c);
/* clReleaseMemObject(b_x); */
clReleaseContext(context);
free(devices);
free(platforms);
#define THRESHOLD 0
// check
printf("about to check (first: %d)\n", c_v[0]);
for (size_t i = 0; i < size; i++) {
if (abs(c_v[i] - (a_v[i] + b_v[i])) > THRESHOLD) {
printf("Wrong checking: a_v[%ld] = %d, b_v[%ld] = %d, c_v[%ld] = %d\n", i,
a_v[i], i, b_v[i], i, c_v[i]);
exit(EXIT_FAILURE);
}
}
return EXIT_SUCCESS;
}
/*
 * Completion callback for the read-buffer command.
 *
 * Prints the first element of the host result array and then marks the
 * user event stored in the callback data as CL_COMPLETE, which releases
 * the host thread waiting on it.
 *
 * event:     event that triggered the callback (unused).
 * ev_status: execution status reported for that event.
 * user_data: pointer to the callback_data set up by main().
 */
void CL_CALLBACK
callback_read_fn(cl_event event, cl_int ev_status, void* user_data)
{
  callback_data* data = (callback_data*)user_data;

  printf("-- BEGIN callback read executed (%d)\n", ev_status);
  printf("c_v %p\n", (void*)data->c_v);
  printf("c_v[0] = %d\n", data->c_v[0]);

  /* Completing the user event makes the host's wait return. */
  cl_int status = clSetUserEventStatus(*(data->end), CL_COMPLETE);
  printf("set user event status (%d)\n", status);

  printf("-- END\n");
}
/* Event of the most recent read enqueued by callback_kernel_fn. */
cl_event ev_read;

/*
 * Completion callback for the kernel: enqueues the read of the result
 * buffer into the host array.
 *
 * With nested callbacks enabled, callback_read_fn is chained onto the read
 * event and completes the user event later; otherwise the user event is
 * completed here and the host is expected to wait on the recorded read
 * event (see clbWaitEvents).
 *
 * Two rules for code running inside an OpenCL callback are enforced:
 *  - blocking enqueue calls are undefined behaviour in a callback, so the
 *    read is always issued non-blocking, whatever the caller requested;
 *  - commands enqueued from a callback need not start until the queue is
 *    flushed, so clFlush is called before returning. Without it the host
 *    can wait forever on an event whose command was never submitted.
 *
 * On any failure the user event is set to a negative status so that the
 * waiting host thread is released instead of deadlocking.
 *
 * event:     event that triggered the callback (unused).
 * ev_status: execution status reported for that event.
 * user_data: pointer to the callback_data set up by main().
 */
void CL_CALLBACK
callback_kernel_fn(cl_event event, cl_int ev_status, void* user_data)
{
  (void)event;
  printf("-- BEGIN callback kernel executed (%d)\n", ev_status);
  callback_data* cb_data = (callback_data*)user_data;
  cl_command_queue queue = *(cb_data->queue);
  cl_mem buf_c = *(cb_data->buf_c);
  int* c_v = cb_data->c_v;
  const size_t bytes = (size_t)cb_data->size * sizeof(int);
  const bool nested_callbacks = cb_data->nested_callbacks;
  const bool blocking = cb_data->blocking;
  cl_event end = *(cb_data->end);
  cl_int st;

  printf("c_v %p\n", (void*)c_v);
  printf("c_v[0] = %d\n", c_v[0]);
  printf("about to read the c buffer\n");
  printf("blocking %d\n", blocking);
  if (blocking) {
    printf("blocking reads are not allowed inside a callback; "
           "using a non-blocking read\n");
  }

  /* Reserve a slot in the bookkeeping arrays, refusing to overflow them. */
  const int max_slots =
    (int)(sizeof clb_events_waiting / sizeof clb_events_waiting[0]);
  const int slot = clb_num_events_waiting;
  if (slot >= max_slots) {
    printf("too many pending read events (%d)\n", slot);
    clSetUserEventStatus(end, CL_OUT_OF_RESOURCES);
    printf("-- END\n");
    return;
  }
  clb_events_waiting_device[slot] = 0;
  clb_events_init_read[slot] = 0;

  const size_t offset = 0;
  st = clEnqueueReadBuffer(queue, buf_c, CL_FALSE, offset, bytes, c_v, 0, NULL,
                           &clb_events_waiting[slot]);
  printf("enqueue read buffer (%d)\n", st);
  if (st != CL_SUCCESS) {
    /* No event was produced; wake the host with the error instead. */
    clSetUserEventStatus(end, st);
    printf("-- END\n");
    return;
  }
  clb_num_events_waiting = slot + 1;
  ev_read = clb_events_waiting[slot];

  if (nested_callbacks) {
    st = clSetEventCallback(ev_read, CL_COMPLETE, callback_read_fn, user_data);
    printf("set event callback (%d)\n", st);
    if (st != CL_SUCCESS) {
      /* Nobody would complete the user event otherwise. */
      clSetUserEventStatus(end, st);
    }
  }

  /* Submit the read; the host is blocked on a user event and will not
   * flush this queue on our behalf. */
  st = clFlush(queue);
  printf("flush queue (%d)\n", st);

  if (!nested_callbacks) {
    st = clSetUserEventStatus(end, CL_COMPLETE);
    printf("set user event status (%d)\n", st);
  }
  printf("-- END\n");
}
And now, if I select the Intel CPU as device:
./callback 0 1 0 1 1 0
It works:
platforms: 1 (0)
platform name: Intel(R) OpenCL
num devices: 2 (0)
devices: 2 (0)
context (0)
device name: Intel(R) Core(TM) i5-6200U CPU # 2.30GHz
device type: CL_DEVICE_TYPE_CPU
command queue (0)
mode CPU
create buffer a (0)
create buffer b (0)
create buffer c (0)
create program (0)
build program (0)
create kernel1 (0)
gws {1024, 1, 1}
lws {128, 1, 1}
set arg 0 (0)
set arg 1 (0)
set arg 2 (0)
set arg 3 (0)
nd range kernel1 (0)
create user event (0)
c_v 0x1420030
set event callback (0)
waiting for events
-- BEGIN callback kernel executed (0)
c_v 0x1420030
c_v[0] = 0
about to read the c buffer
blocking 0
enqueue read buffer (0)
set event callback (0)
-- END
-- BEGIN callback read executed (0)
c_v 0x1420030
c_v[0] = 1
set user event status (0)
-- END
waited for events
About to wait events: 1
3 = CL_QUEUED, 0 = CL_COMPLETE, 2 = CL_SUBMITTED, 1 = CL_RUNNING
cl_event CPU init 0 [0] = status 0 (ref 0x7f7568000a90)
CPU [0] = 1, [1] = 3, [2] = 5
about to check (first: 1)
Now, if I select the Intel IGPU (Intel Integrated GPU):
./callback 0 0 2 1 1 0
It freezes / hangs:
platforms: 1 (0)
platform name: Intel(R) OpenCL
num devices: 2 (0)
devices: 2 (0)
context (0)
device name: Intel(R) HD Graphics
device type: CL_DEVICE_TYPE_GPU
command queue (0)
mode IGPU
create buffer a (0)
create buffer b (0)
create buffer c (0)
create program (0)
build program (0)
create kernel1 (0)
gws {1024, 1, 1}
lws {128, 1, 1}
set arg 0 (0)
set arg 1 (0)
set arg 2 (0)
set arg 3 (0)
nd range kernel1 (0)
create user event (0)
c_v 0x18b7030
set event callback (0)
waiting for events
-- BEGIN callback kernel executed (0)
c_v 0x18b7030
c_v[0] = 0
about to read the c buffer
blocking 0
enqueue read buffer (0)
set event callback (0)
-- END
If I use gdb and run the same test, and do C-c, I can see:
(gdb) r 0 0 2 1 1 0
Starting program: /callbacks/build/callback 0 0 2 1 1 0
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/usr/lib/libthread_db.so.1".
[New Thread 0x7ffff4cd9700 (LWP 21291)]
platforms: 1 (0)
platform name: Intel(R) OpenCL
num devices: 2 (0)
devices: 2 (0)
[New Thread 0x7fffeede2700 (LWP 21292)]
[New Thread 0x7fffee5e0700 (LWP 21293)]
[New Thread 0x7fffee9e1700 (LWP 21294)]
context (0)
device name: Intel(R) HD Graphics
device type: CL_DEVICE_TYPE_GPU
command queue (0)
mode IGPU
create buffer a (0)
create buffer b (0)
create buffer c (0)
create program (0)
build program (0)
create kernel1 (0)
gws {1024, 1, 1}
lws {128, 1, 1}
set arg 0 (0)
set arg 1 (0)
set arg 2 (0)
set arg 3 (0)
nd range kernel1 (0)
create user event (0)
c_v 0x607030
[New Thread 0x7fffec827700 (LWP 21295)]
set event callback (0)
waiting for events
-- BEGIN callback kernel executed (0)
c_v 0x607030
c_v[0] = 0
about to read the c buffer
blocking 0
enqueue read buffer (0)
set event callback (0)
-- END
^C
Thread 1 "callback" received signal SIGINT, Interrupt.
0x00007ffff730a756 in pthread_cond_wait##GLIBC_2.3.2 () from /usr/lib/libpthread.so.0
(gdb) bt
#0 0x00007ffff730a756 in pthread_cond_wait##GLIBC_2.3.2 () from /usr/lib/libpthread.so.0
#1 0x00007ffff64c635b in ?? () from /opt/intel/opencl/libintelocl.so
#2 0x00007ffff648c63a in ?? () from /opt/intel/opencl/libintelocl.so
#3 0x00007ffff647b5d1 in ?? () from /opt/intel/opencl/libintelocl.so
#4 0x00007ffff63f3e75 in clWaitForEvents () from /opt/intel/opencl/libintelocl.so
#5 0x00007ffff6edca43 in ?? () from /opt/intel/opencl/libIntelOpenCL.so
#6 0x000000000040237e in main (argc=7, argv=0x7fffffffdc58) at ./src/callback.c:532
As the first run (CPU) shows, both callbacks should execute (two BEGIN/END pairs). With the HD Graphics GPU, the program hangs after the first callback (only one BEGIN/END pair).
Why?
(gdb shows that the main thread is stuck in pthread_cond_wait inside the Intel OpenCL driver.)
Can anyone explain how callbacks and events really interact with the host thread? (Best practices, how to avoid deadlocks.)
I need fine-grained control and the best possible performance, and callbacks look like the way to get it, but they behave strangely...
Expected behavior (only occurs in the CPU, not in the IGPU):
1. The host creates a user event. Then the host calls clEnqueueNDRangeKernel (vector addition) and waits for the user event (clWaitForEvents). When the kernel finishes, it triggers the callback "callback_kernel".
2. "callback_kernel" issues a non-blocking clEnqueueReadBuffer; when the read finishes, it triggers the callback "callback_read".
3. "callback_read" sets the user event to CL_COMPLETE.
4. The host continues after clWaitForEvents with the result array filled (buffer read).

Your problem is the following line:
/* why it does not work? (blocking CL_TRUE) */
st = clEnqueueReadBuffer(queue, buf_c, blocking, offset, size * sizeof(int),c_v, 0, NULL, &clb_events_waiting[clb_num_events_waiting++]);
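Inside the callback function, you are issuing a call to clEnqueueReadBuffer that may be blocking, which is not allowed in OpenCL callbacks. The notes in the specification linked below list the functions that must not be called from a callback. The same notes also explain that commands enqueued from a callback are not required to start executing until the queue is flushed, so even for a non-blocking read the callback should call clFlush on the queue before returning; otherwise the host can wait forever.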
https://www.khronos.org/registry/OpenCL/sdk/1.1/docs/man/xhtml/clSetEventCallback.html
I also recommend you to read the whole callback section from the specification your driver supports, I am adding the corresponding section of the latest OpenCL spec 2.2 here.
https://www.khronos.org/registry/OpenCL/specs/opencl-2.2.pdf#page=197

OPENCL API's take almost same time irrespective of sample size

I've been trying to profile an OpenCL host code for FIR filtering on MAC, Ubuntu and other platforms. My Host code and kernel are as below.
The issue is that, irrespective of the number of samples I provide to the FIR filter, clEnqueueNDRangeKernel ends up taking the same amount of time. I have also profiled clEnqueueReadBuffer and clEnqueueWriteBuffer, and they too end up taking the same amount of time. On the Mac I am profiling with mach as well as with OpenCL events; on Ubuntu I am profiling with PAPI. I am unable to understand why this is happening: ideally, as the number of samples increases, clEnqueueReadBuffer and clEnqueueWriteBuffer should take more time, and so should the kernel execution.
Kernel:-
/*
 * 4-tap FIR filter with coefficients {5, 7, 5, 7}.
 *
 * Work-item i computes
 *     output[i] = sum over j in [0, 4) of coeff[j] * input[i + 3 - j].
 *
 * The sum is built in a private variable and stored once. Accumulating with
 * "+=" directly into output would read the output buffer, which is undefined
 * for a buffer created with CL_MEM_WRITE_ONLY and yields garbage whenever
 * the buffer was not zeroed first.
 *
 * input must hold at least (global size + 3) elements: the last work-items
 * read up to three elements past index global size - 1.
 */
__kernel void fir4(
    __global float* input,
    __global float* output)
{
    const int i = get_global_id(0);
    const int coeff[4] = {5, 7, 5, 7};
    float acc = 0.0f;
    for (int j = 0; j < 4; j++)
    {
        acc += coeff[j] * (input[i + 4 - j - 1]);
    }
    output[i] = acc;
}
/*
 * 8-tap FIR filter with alternating coefficients {5, 7, ...}.
 *
 * Work-item i computes
 *     output[i] = sum over j in [0, 8) of coeff[j] * input[i + 7 - j].
 *
 * The sum is built in a private variable and stored once. Accumulating with
 * "+=" directly into output would read the output buffer, which is undefined
 * for a buffer created with CL_MEM_WRITE_ONLY and yields garbage whenever
 * the buffer was not zeroed first.
 *
 * input must hold at least (global size + 7) elements.
 */
__kernel void fir8(
    __global float* input,
    __global float* output)
{
    const int i = get_global_id(0);
    const int coeff[8] = {5, 7, 5, 7, 5, 7, 5, 7};
    float acc = 0.0f;
    for (int j = 0; j < 8; j++)
    {
        acc += coeff[j] * (input[i + 8 - j - 1]);
    }
    output[i] = acc;
}
/*
 * 12-tap FIR filter with alternating coefficients {5, 7, ...}.
 *
 * Work-item i computes
 *     output[i] = sum over j in [0, 12) of coeff[j] * input[i + 11 - j].
 *
 * The sum is built in a private variable and stored once. Accumulating with
 * "+=" directly into output would read the output buffer, which is undefined
 * for a buffer created with CL_MEM_WRITE_ONLY and yields garbage whenever
 * the buffer was not zeroed first.
 *
 * input must hold at least (global size + 11) elements.
 */
__kernel void fir12(
    __global float* input,
    __global float* output)
{
    const int i = get_global_id(0);
    const int coeff[12] = {5, 7, 5, 7, 5, 7, 5, 7, 5, 7, 5, 7};
    float acc = 0.0f;
    for (int j = 0; j < 12; j++)
    {
        acc += coeff[j] * (input[i + 12 - j - 1]);
    }
    output[i] = acc;
}
Host Code:-
// Use a static data size for simplicity
//
#define DATA_SIZE (48000)
#define NUM_COEFF (4)
int main(int argc, char** argv)
{
uint64_t start;
uint64_t end;
uint64_t elapsed;
double elapsedmilli;
int err; // error code returned from api calls
float data[DATA_SIZE]; // original data set given to device
float coeff[NUM_COEFF];
float results_host[DATA_SIZE] = {};
float results[DATA_SIZE]; // results returned from device
unsigned int correct; // number of correct results returned
size_t global; // global domain size for our calculation
size_t local; // local domain size for our calculation
cl_event event; //Linking event to kernel for profiling
cl_platform_id platform_id = NULL; // compute device platform id
cl_device_id device_id; // compute device id
cl_context context; // compute context
cl_command_queue commands; // compute command queue
cl_program program; // compute program
cl_kernel kernel; // compute kernel
cl_mem input; // device memory used for the input array
cl_mem output; // device memory used for the output array
// Fill our data set with random float values
//
int i,j = 0;
unsigned int count = DATA_SIZE;
unsigned int taps = NUM_COEFF;
for(i = 0; i < count; i++)
data[i] = rand() / (float)RAND_MAX;
for(i=0; i < taps; i++)
{
if(!(i%2))
coeff[i] = 5;
else
coeff[i] = 7;
}
//Connect to a platform on device
err = clGetPlatformIDs(1, &platform_id, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to locate opencl platform!\n");
return EXIT_FAILURE;
}
// Connect to a compute device
//
int gpu = 0;
err = clGetDeviceIDs(platform_id, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to create a device group!\n");
return EXIT_FAILURE;
}
// Create a compute context
//
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if (!context)
{
printf("Error: Failed to create a compute context!\n");
return EXIT_FAILURE;
}
// Create a command commands
//
commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
if (!commands)
{
printf("Error: Failed to create a command commands!\n");
return EXIT_FAILURE;
}
//Use function and load the kernel source from .cl files in the same folder
//
char *KernelSource = load_program_source("fir.cl");
// Create the compute program from the source buffer
//
program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
if (!program)
{
printf("Error: Failed to create compute program!\n");
return EXIT_FAILURE;
}
// Build the program executable
//
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
size_t len;
char buffer[2048];
printf("Error: Failed to build program executable!\n");
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
exit(1);
}
// Create the compute kernel in the program we wish to run
//
switch(taps)
{
case(4):
{
kernel = clCreateKernel(program, "fir4", &err);
break;
}
case(8):
{
kernel = clCreateKernel(program, "fir8", &err);
break;
}
case(12):
{
kernel = clCreateKernel(program, "fir12", &err);
break;
}
default:
{
kernel = clCreateKernel(program, "fir4", &err);
break;
}
}
if (!kernel || err != CL_SUCCESS)
{
printf("Error: Failed to create compute kernel! - %d\n",err);
exit(1);
}
// Create the input and output arrays in device memory for our calculation
//
input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
if (!input || !output)
{
printf("Error: Failed to allocate device memory!\n");
exit(1);
}
// Write our data set into the input array in device memory
//
err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to write to source array!\n");
exit(1);
}
// Set the arguments to our compute kernel
//
err = 0;
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
if (err != CL_SUCCESS)
{
printf("Error: Failed to set kernel arguments! %d\n", err);
exit(1);
}
// Get the maximum work group size for executing the kernel on the device
//
err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to retrieve kernel work group info! %d\n", err);
exit(1);
}
// Execute the kernel over the entire range of our 1d input data set
// using the maximum number of work group items for this device
//
global = count;
local = 48;
start = mach_absolute_time();
err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, &event);
if (err)
{
printf("Error: Failed to execute kernel!-%d\n",err);
return EXIT_FAILURE;
}
// Wait for the command commands to get serviced before reading back results
//
clWaitForEvents(1, &event);
clFinish(commands);
end = mach_absolute_time();
cl_ulong time_start, time_end;
double total_time;
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
total_time = time_end - time_start;
printf("cl:main timing:opencl clEnqueueNDRangeKernel %0.3f us\n", total_time / 1000.0);
elapsed = end - start;
struct mach_timebase_info info;
mach_timebase_info(&info);
double t = 1e-9 * (elapsed) * info.numer / info.denom;
elapsedmilli = 1e-6 * (elapsed) * info.numer / info.denom;
printf("cl:main timing:MACH clEnqueueNDRangeKernel %f ms, %d elapsed\n",elapsedmilli,elapsed);
// Read back the results from the device to verify the output
//
err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );
if (err != CL_SUCCESS)
{
printf("Error: Failed to read output array! %d\n", err);
exit(1);
}
// Validate our results
//
correct = 0;
for(i=0; i<DATA_SIZE; i++)
{
for(j=0;j<NUM_COEFF;j++)
{
results_host[i]+=coeff[j]*(data[i+NUM_COEFF-j-1]);
}
//printf("Host Output[%d]-%f\n",i,results_host[i]);
}
for(i = 0; i < count; i++)
{
if(results[i] == results_host[i])
correct++;
//printf("CL Output[%d]-%f\n",i,results[i]);
}
// Print a brief summary detailing the results
//
printf("Computed '%d/%d' correct values! Samples-%d,Taps-%d\n", correct, count, DATA_SIZE, NUM_COEFF);
// Shutdown and cleanup
//
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(commands);
clReleaseContext(context);
return 0;
}

Adding just 10-20 multiplications and additions per work-item is negligible compared with the kernel launch overhead. Try a coefficient array that is 100 or 1000 elements wide.
Using more input elements per work-item in this way mostly increases the number (and ratio) of cache hits, because more threads read from the same locations.
If DATA_SIZE were several million, the data would no longer fit in cache and the run time would grow linearly with the length. 48000 samples are less than 200 kB. For example, an HD 5850 has 512 kB of L2 cache (about 3x the memory bandwidth) and 8 kB of even faster L1 cache per compute unit.

clBuildProgram failed with error: Failed to build program executable

I'm a beginner at OpenCL. I was trying to build a simple app that just adds two vectors to get the result. This is my host code:
#define USE_PLATFORM 0
#define USE_DEVICE 2
#define DATA_SIZE 1024
#define USE_KERNEL_PATH "/Users/huangxin/Documents/August13Programming/FirstEGOpenCL/FirstEGOpenCL/kernel.cl"
using namespace std;
int main(int argc, const char * argv[]) {
int err;
cl_uint numPlatforms;
cl_uint numDevices;
cl_command_queue command;
size_t global;
//Query the number of platforms supported.
err = clGetPlatformIDs(0, NULL, &numPlatforms);
if (err != CL_SUCCESS || USE_PLATFORM >= numPlatforms)
{
printf("Error at: clGetPlatformIDs(querying platforms count failed):\n");
exit(-1);
}
//Get all platforms.
vector<cl_platform_id> platforms(numPlatforms);
err = clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms);
if (err != CL_SUCCESS)
{
printf("Error at: clGetPlatformIDs(getting all platforms failed):\n");
exit(-1);
}
//Query the number of devices supported by the platform spicified.
err = clGetDeviceIDs(platforms[USE_PLATFORM], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
if (err != CL_SUCCESS || USE_PLATFORM >= numDevices)
{
printf("Error at: clGetDeviceIDs(querying devices count failed):\n");
exit(-1);
}
//Get all devices.
vector<cl_device_id> devices(numDevices);
err=clGetDeviceIDs(platforms[USE_PLATFORM], CL_DEVICE_TYPE_ALL, numDevices, &devices[0], &numDevices);
if (err != CL_SUCCESS)
{
printf("Error at: clGetDeviceIDs(getting all devices failed):\n");
exit(-1);
}
//Get device infomation.
char deviceInfo[1024];
//get device max work item dimensions.
size_t maxItemSize[3];
clGetDeviceInfo(devices[USE_DEVICE], CL_DEVICE_NAME, sizeof(deviceInfo)*1024, deviceInfo, NULL);
clGetDeviceInfo(devices[USE_DEVICE], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, maxItemSize, NULL);
cout << "Device selected: " << deviceInfo << endl;
cout << "Max item size: " << maxItemSize[0] << "," << maxItemSize[1] << ","<< maxItemSize[2] << endl;
//Set property with certain platform
cl_context_properties prop[] = {CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platforms[USE_PLATFORM]), 0};
//create context with certain property.
cl_context context = clCreateContextFromType(prop, CL_DEVICE_TYPE_ALL, NULL, NULL, &err);
if (err != CL_SUCCESS)
{
printf("Error at: clCreateContextFromType(get context failed):\n");
exit(-1);
}
//create command queue using selected device and context.
command = clCreateCommandQueue(context, devices[USE_DEVICE], 0, NULL);
//create program with specified kernel source.
const char *kernelSource = getKernelSource(USE_KERNEL_PATH);
cl_program program = clCreateProgramWithSource(context, 1, &kernelSource, 0, &err);
if (err != CL_SUCCESS)
{
printf("Error at: clCreateProgramWithSource(get program failed):\n");
exit(-1);
}
//since OpenCL uses a runtime-compilation architecture, we need to build the program.
err = clBuildProgram(program, 0, 0, 0, 0, 0);
if (err != CL_SUCCESS)
{
cout << err << endl;
size_t len;
char buffer[2048];
printf("Error: Failed to build program executable!\n");
clGetProgramBuildInfo(program, devices[USE_DEVICE], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
exit(1);
}
//A kernel is OpenCL's abstraction of the code and arguments that execute on a single, finest-grained compute item.
//create the kernel function using the built program.
cl_kernel adder = clCreateKernel(program, "adder", &err);
if (err != CL_SUCCESS)
{
printf("Error at: clCreateKernel(get kernel function failed):\n");
exit(-1);
}
//create the vector of input random data.
vector<float> inA(DATA_SIZE), inB(DATA_SIZE);
for(int i = 0; i < DATA_SIZE; i++) {
inA[i] = (float)(random() % DATA_SIZE) / 1000;
inB[i] = (float)(random() % DATA_SIZE) / 1000;
}
//create the read-only device mem using specified context, that is to copy the host mem to the device mem.
cl_mem cl_a = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * DATA_SIZE, &inA[0], NULL);
cl_mem cl_b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * DATA_SIZE, &inB[0], NULL);
//create the result mem.
cl_mem cl_res = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * DATA_SIZE, NULL, NULL);
//setting up the arguement of kernel memory
clSetKernelArg(adder, 0, sizeof(cl_mem), &cl_a);
clSetKernelArg(adder, 1, sizeof(cl_mem), &cl_b);
clSetKernelArg(adder, 2, sizeof(cl_mem), &cl_res);
START_CHECK_RUNNING_TIME
//enqueue the kernel into the specified command queue (TODO: come back later to check the remaining arguments).
global = DATA_SIZE;
err = clEnqueueNDRangeKernel(command, adder, 1, 0, &global, 0, 0, 0, 0);
if (err != CL_SUCCESS)
{
printf("Error at: clEnqueueNDRangeKernel(enqueue kernel failed):\n");
exit(-1);
}
printf("*****************FLAG***************");
//copy the results from the kernel into the host(CPU).
vector<float> res(DATA_SIZE);
err = clEnqueueReadBuffer(command, cl_res, CL_TRUE, 0, sizeof(float) * DATA_SIZE, &res[0], 0, 0, 0);
END_CHECK_RUNNING_TIME
//check the number of right compute.
int cnt = 0;
for (int i = 0; i < res.size(); i++) {
cnt += (res[i] == inA[i] + inB[i] ? 1 : 0);
}
cout << "Computed " << res.size() << " values\n";
cout << "Correct values:(" << cnt << "/" << res.size() << "),correct rate:" << (float)cnt / res.size() * 100 << "%" << endl;
gettimeofday(&sTime, NULL);
for (int i = 0; i < res.size(); i++) {
for (int j = 0; j < 10000; j++)
res[i] = inA[i] + inB[i];
}
gettimeofday(&eTime, NULL);timeuse = 1000000 * ( eTime.tv_sec - sTime.tv_sec ) + eTime.tv_usec -sTime.tv_usec; printf("Running time: %fs\n", (double)timeuse/(1000000));
//cleaning up the variables.
clReleaseKernel(adder);
clReleaseProgram(program);
clReleaseMemObject(cl_a);
clReleaseMemObject(cl_b);
clReleaseMemObject(cl_res);
clReleaseCommandQueue(command);
clReleaseContext(context);
return 0;
}
The code is a bit long, but it is really doing simple stuff. This is my kernel code:
// Element-wise vector addition: each work-item handles the element selected by
// its global id in dimension 0 and stores a[id] + b[id] into result[id].
kernel void adder(global const float* a, global const float* b, global float* result)
{
    const size_t gid = get_global_id(0);
    // The identical sum is stored 10000 times; this mirrors the 10000-iteration
    // inner loop of the host-side CPU timing code (presumably to make the two
    // timings comparable).
    int pass = 0;
    while (pass < 10000) {
        result[gid] = a[gid] + b[gid];
        ++pass;
    }
}
And I got the following result:
Device selected: GeForce GT 650M
-11
Error: Failed to build program executable!
No kernels or only kernel prototypes found.
I don't quite understand what "No kernels or only kernel prototypes found." means, and it's really strange that if I use the first device (CPU) or my second device (HD Graphics 4000), the same code runs perfectly.
I want to know what is wrong and why it happens.
I was running this code in Xcode on Mac OS X 10.10.

As the comments say, it is good practice to use:
__kernel void adder(__global const float* a, __global const float* b, __global float* result)
Because that way you clearly mark those as special CL qualifiers. Typically all CL kernels follow that rule, even though the spec allows both spellings.
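But your problem is probably due to running clBuildProgram() without any device in the device list, so nothing may be getting compiled for the device you actually use. (Strictly, the specification says a NULL device list builds the program for all devices associated with it, but passing the target device explicitly avoids relying on how a particular driver handles that case.)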
In CL every device has a specific compiler (the CPUs don't have the same compiler as the GPUs, and sometimes not even the same instruction set). So you should give the API the list of devices for which the kernels have to be compiled.
The proper way would be this:
err = clBuildProgram(program, 1, &devices[USE_DEVICE], "", 0, 0);
Note: I added "", because probably in the future you will want to add some build parameters, better to have it ready :)

opencl program build failed

The following is the code I wrote for adding two 2D arrays. It compiles, but when I try to run it, it shows: error: failed to build program. runtime0.0000
Why is it that the program isn't built?
And also, why is it that the build log that I have queried isn't getting displayed?
Since I am just initialising the arrays, I store them directly as 1D arrays; the conversion from 2D to 1D is not shown.
code:
# include <stdio.h>
#include <stdlib.h>
#ifdef APPLE
#include<OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define order 1000
#define MAX_SOURCE_SIZE (0x100000)
int main(int argc, char **argv)
{
float *A;
float *B;
float *C;
int n,m,p;
int err;
int szA, szB,szC;
cl_device_id device_id;
cl_context context;
cl_command_queue commands;
cl_program program;
cl_kernel kernel;
cl_uint nd;
cl_mem a_in;
cl_mem b_in;
cl_mem c_out;
int i,j;
n=order;
m=order;
p=order;
size_t global[2];
nd=1;
cl_uint numPlatforms;
cl_platform_id firstPlatformId;
szA=n*p;
szB=p*m;
szC=n*m;
A=(float *)malloc(sizeof(float)*szA);
B=(float *)malloc(sizeof(float)*szB);
C=(float *)malloc(sizeof(float)*szC);
for(i=0; i<order; i++)
for(j=0; j<order; j++)
A[i*m+j]=i;
B[i*m+j]=i;
FILE *fp;
char fileName[] = "./array_add_kernel.cl";
char *source_str;
size_t source_size;
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
err=clGetPlatformIDs(1, &firstPlatformId, &numPlatforms);
err=clGetDeviceIDs(firstPlatformId, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
cl_context_properties conpro[]={ CL_CONTEXT_PLATFORM,(cl_context_properties) firstPlatformId, 0};
context=clCreateContext(conpro, 1, &device_id, NULL, NULL, &err);
commands=clCreateCommandQueue(context, device_id,CL_QUEUE_PROFILING_ENABLE, &err);
a_in= clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*szA, NULL, NULL);
b_in= clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*szB, NULL, NULL);
c_out= clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float)*szC, NULL, NULL);
program= clCreateProgramWithSource(context, 1, (const char**)&source_str,(const size_t *)&source_size, &err);
err= clBuildProgram(program,0, NULL, NULL, NULL, NULL );
if(err!= CL_SUCCESS)
{
size_t len;
char buffer[2048];
printf("Error:Failed to build program executable!");
clGetProgramBuildInfo(program,device_id,CL_PROGRAM_BUILD_LOG,sizeof(buffer),buffer,&len);
printf("%s \n",buffer);
}
kernel= clCreateKernel(program, "array_add_kernel", &err);
err= 0;
err= clSetKernelArg(kernel, 0, sizeof(int), &n);
err|= clSetKernelArg(kernel, 1, sizeof(int), &p);
err|= clSetKernelArg(kernel, 2, sizeof(int), &m);
err|= clSetKernelArg(kernel, 3, sizeof(cl_mem), &a_in);
err|= clSetKernelArg(kernel, 4, sizeof(cl_mem), &b_in);
err|= clSetKernelArg(kernel, 5, sizeof(cl_mem), &c_out);
err=clEnqueueWriteBuffer(commands, a_in, CL_TRUE, 0, sizeof(float)*szA, A, 0, NULL, NULL);
err= clEnqueueWriteBuffer(commands, a_in, CL_TRUE, 0, sizeof(float)*szB, B, 0, NULL, NULL);
cl_event prof_event;
global[0]= (size_t)n;
global[1]=(size_t)m;
err=clEnqueueNDRangeKernel(commands, kernel, nd, NULL, global, NULL, 0, NULL, &prof_event);
clFinish(commands);
cl_ulong ev_start_time=(cl_ulong)0;
cl_ulong ev_end_time=(cl_ulong)0;
size_t ret_size;
err= clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ev_start_time, NULL);
err= clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ev_end_time, NULL);
err=clEnqueueReadBuffer(commands,c_out,CL_TRUE,0,sizeof(float)*szC,C,0,NULL,NULL);
cl_float runtime=(ev_end_time-ev_start_time)*1.0e-9;
printf("Runtime:%f ",runtime);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseMemObject(a_in);
clReleaseMemObject(b_in);
clReleaseMemObject(c_out);
clReleaseCommandQueue(commands);
clReleaseContext(context);
}
kernel:
// Element-wise addition kernel exactly as posted in the question: each work-item
// takes indices (i, j) from dimensions 0 and 1 and stores A + B into C at i*m + j.
// NOTE: this version does not compile as written -- `const p` has no type,
// `_global` is not an OpenCL C address-space qualifier (`global` / `__global` is),
// and the parameter list contains a stray ", ,".
kernel void array_add_kernel(
const int n, const int m, const p, _global const float * A, _global const float * B, , _global float * C )
{
int i= get_global_id(0); // work-item index in dimension 0
int j= get_global_id(1); // work-item index in dimension 1
C[i*m + j] = A[i*m + j] + B[i*m + j]; // flat offset i*m + j into all three buffers
}

Fix your kernel. It's filled with errors.
// Annotated copy of the kernel from the question; every marked line is a compile error.
kernel void array_add_kernel(
const int n,
const int m,
const p, // No type specifier: should be `const int p`
_global const float * A, // Should be global (or __global), not _global
_global const float * B, , // Same _global typo, plus a double comma
_global float * C ) // Same _global typo
{
int i= get_global_id(0);
int j= get_global_id(1);
C[i*m + j] = A[i*m + j] + B[i*m + j];
}
This is the working kernel.
// Element-wise addition of two matrices stored as flat arrays:
//   C[i*m + j] = A[i*m + j] + B[i*m + j]
// Launch with a 2D NDRange; work-item (i, j) handles one element.
//
// The parameter list keeps all six arguments of the original kernel (only the
// syntax errors are fixed), so the host's six positional clSetKernelArg calls
// (indices 0..5) still line up. p is not used by the computation.
kernel void array_add_kernel(const int n, const int m, const int p, global const float * A, global const float * B, global float * C )
{
    const int i = get_global_id(0);
    const int j = get_global_id(1);

    // Guard against work-items outside the n x m range (e.g. when the host
    // rounds the global size up to a multiple of the work-group size).
    if (i < n && j < m) {
        const int idx = i * m + j;
        C[idx] = A[idx] + B[idx];
    }
}

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio

OpenCl cleanup causes segfault - memory-management

For people who arrive here in the future: as Brafford suggested, this is resolved by adding clFinish(GPUCommandQueue) after clEnqueueNDRangeKernel as well as after clEnqueueReadBuffer. Apparently, trying to clean up any object (e.g. releasing a queue) that is still in use by executing commands yields a segmentation fault.

Related

Why does L2 hardware prefetcher perform worse with only 1 KiB or 2 KiB access size?

OpenCL Callback hangs / freezes (deadlock, pthread_cond_wait)

OPENCL API's take almost same time irrespective of sample size

clBuildProgram failed with error: Failed to build program executable

opencl program build failed

Categories

Resources