OpenCL Matrix Multiplication Enqueue/Buffer Reading - parallel-processing

I'm attempting a basic matrix multiplication program in OpenCL. I believe my issues are in my enqueue and/or buffer reading, as I am getting completely incorrect output for the result matrix, as well as incorrect first rows for matrices A and B. I'm new to OpenCL and I've been banging my head against this for quite a while now, maybe someone here can give me a hint as to where I'm going wrong?
Host Code:
#define __NO_STD_VECTOR // Uses cl::vector instead of standard version
#include <CL/cl.hpp>
#include <stdlib.h>
#include <stdio.h>
#include <fstream>
#include <iostream>
#include <math.h>
#include <string>
/* Defined matrix width/height constants */
#define numRowsA 3
#define numColsA 3
#define numRowsB 3
#define numColsB 3
#define numRowsC numRowsA
#define numColsC numColsB
using namespace std;
/* Function declarations */
inline void checkErr(cl_int err, string name);
void initMatrix (float* matrix, int numIndices);
void printMatrix (string displayName, float* matrix, int numIndices,
int rowSize);
//*************
// Main Program
//*************
int main(int argc, char* argv[]) {
/* Check for valid matrix sizes */
if (numColsA != numRowsB) {
cout << "ERROR: Invalid matrix dimensions." << endl;
} else {
srand(2013); // Set random seed
/* Allocate memory for matrices A, B, and C */
unsigned int sizeA = numRowsA * numColsA;
unsigned int sizeB = numRowsB * numColsB;
unsigned int sizeC = numRowsC * numColsC;
unsigned int memoryA = sizeof(float) * sizeA;
unsigned int memoryB = sizeof(float) * sizeB;
unsigned int memoryC = sizeof(float) * sizeC;
/*
Allocate memoryA/memoryB/memoryC size blocks of bytes
(cast from void*)
*/
float* blockA = (float*) malloc(memoryA);
float* blockB = (float*) malloc(memoryB);
float* blockC = (float*) malloc(memoryC);
/* Initialize matrices A and B */
initMatrix(blockA, sizeA);
initMatrix(blockB, sizeB);
/* Display matrices A and B */
printMatrix("Matrix A", blockA, sizeA, numColsA);
printMatrix("Matrix B", blockB, sizeB, numColsB);
cl_int err; // Error code
string platformVendor; // Platform vendor
/* Create list of platforms */
cl::vector < cl::Platform > platformList;
cl::Platform::get(&platformList);
/*
Display potential Platform list generation error. If the
platform list size does not equal 0, CL_SUCCESS (0) is
sent to the function. If the platform list size does
equal 0, -1 is sent to the function.
*/
checkErr(platformList.size()!=0 ? CL_SUCCESS : -1,
"Platform");
/*
Replace empty value of platformVendor with device vendor
name
*/
platformList[0].getInfo((cl_platform_info) CL_PLATFORM_VENDOR,
&platformVendor);
/* Properties for Context constructor (Use unknown) */
cl_context_properties cprops[3] =
{
CL_CONTEXT_PLATFORM,
(cl_context_properties) (platformList[0]) (),
0
};
/* Create context */
cl::Context context(CL_DEVICE_TYPE_GPU, cprops, NULL, NULL,
&err);
/* Display potential Context constructor error */
checkErr(err, "Context");
/* Create buffer for matrix A */
cl::Buffer deviceMemA(context,
CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeA, blockA, &err);
/* Create buffer for matrix B */
cl::Buffer deviceMemB(context,
CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeB, blockB, &err);
/* Create buffer for matrix C */
cl::Buffer deviceMemC(context,
CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeC, blockC, &err);
/* Create buffer for row (A) and col (C) */
cl::Buffer rowA(context, CL_MEM_READ_ONLY, sizeof(int),
(void *) numRowsA, &err);
cl::Buffer colC(context, CL_MEM_READ_ONLY, sizeof(int),
(void *) numColsC, &err);
/* Display potential Buffer constructor error */
checkErr(err, "Buffers");
/* Get list of devices */
cl::vector<cl::Device> devices =
context.getInfo<CL_CONTEXT_DEVICES>();
/* Check for at least one device, if not throw error */
checkErr(devices.size() > 0 ? CL_SUCCESS : -1, "No Devices");
/* Read input from .cl file */
ifstream file("matrixMult1_kernels.cl");
/* Check for potential problem opening .cl input file */
checkErr(file.is_open() ? CL_SUCCESS:-1, "File Not Open");
/* Store file contents in a string */
string prog(istreambuf_iterator<char>(file),
(istreambuf_iterator<char>()));
/* Create source object */
cl::Program::Sources source(1, make_pair(prog.c_str(),
prog.length()+1));
/* Create program for given context and source */
cl::Program program(context, source);
err = program.build(devices, ""); // Check for build error
/* Display potential program build error */
checkErr(err, "Program Build");
/* Create kernel */
cl::Kernel kernel(program, "matrixMul", &err);
/* Display potential Kernel constructor error */
checkErr(err, "Kernel");
/*
Set matrixMul arguments, error checking after each
argument
*/
err = kernel.setArg(0, deviceMemA);
checkErr(err, "Arg0");
err = kernel.setArg(1, deviceMemB);
checkErr(err, "Arg1");
err = kernel.setArg(2, deviceMemC);
checkErr(err, "Arg2");
err = kernel.setArg(3, rowA);
checkErr(err, "Arg3");
err = kernel.setArg(4, colC);
checkErr(err, "Arg4");
/* Create command queue */
cl::CommandQueue queue(context, devices[0], 0, &err);
/* Display potential CommandQueue constructor error */
checkErr(err, "Command Queue");
/* Create event object */
cl::Event event;
cl::NDRange global(3, 3);
cl::NDRange local(1, 1);
/* Enqueue the kernel */
err = queue.enqueueNDRangeKernel(kernel, 2, global, local,
NULL, &event);
/* Display potential enqueueing error */
checkErr(err, "Enqueue");
/* Wait until kernel has completed execution before continuing */
event.wait();
/* Read kernel result back into host memory */
err = queue.enqueueReadBuffer(deviceMemC, CL_TRUE, 0, memoryC,
blockC, NULL, &event);
checkErr(err, "C");
err = queue.enqueueReadBuffer(deviceMemA, CL_TRUE, 0, sizeA,
blockA, NULL, &event);
err = queue.enqueueReadBuffer(deviceMemB, CL_TRUE, 0, sizeB,
blockB, NULL, &event);
/* Display potential kernel read error */
checkErr(err, "Read Buffer");
/* Display matrices */
cout << endl;
cout << "After:" << endl;
printMatrix("Matrix A", blockA, sizeA, numColsA);
printMatrix("Matrix B", blockB, sizeB, numColsB);
printMatrix("Matrix C", blockC, sizeC, numColsC);
/* Free up memory */
free(blockA);
free(blockB);
free(blockC);
}
}
//--------------------------------------------------------------------
// checkErr - Inline error checking function for OpenCL portion of
// host program.
//
// PRE: err is of type int in OpenCL; name is a string.
// POST: The program is terminated after display an error message
// indicating the location of the error and the error code.
//--------------------------------------------------------------------
inline void checkErr(cl_int err, string name) {
/* Check error code against OpenCL success constant */
if (err != CL_SUCCESS) {
/*
Display an error message stating the error origin and
error number.
*/
std::cerr << "ERROR: " << name << " (" << err << ")"
<< std::endl;
exit(EXIT_FAILURE); // Terminates process with status code 0
}
}
//--------------------------------------------------------------------
// initMatrix - Assigns a random float value to each indice of the
// matrix.
//
// PRE: matrix is a pointer to a block of bytes in memory; numIndices
// is the number of indicies in the matrix being instantiated.
// POST: Each index of the matrix has been instantiated with a random
// float value.
//--------------------------------------------------------------------
void initMatrix (float* matrix, int numIndices) {
/*
Loop through the block of bytes, assigning a random float
for each index of the matrix
*/
for (int i = 0; i < numIndices; i++) {
/* Assign a random float between 0 and 1 at this byte */
matrix[i] = rand() / (float) RAND_MAX;
}
}
//--------------------------------------------------------------------
// printMatrix - Outputs a readable version of the matrix.
//
// PRE: displayName is a string; matrix is a pointer to a block of
// bytes in memory; numIndices an integer indicating the number
// of indices in the matrix being displayed (read left-to-right,
// top-to-bottom); rowSize is an integer indicating the number
// of elements in one row of the matrix.
// POST: A readable version of the matrix is displayed.
//--------------------------------------------------------------------
void printMatrix (string displayName, float* matrix, int numIndices,
int rowSize) {
/* Output display name of matrix */
cout << "\n" << displayName << ":" << endl;
/* Loop through each indice of the matrix */
for (int i = 0; i < numIndices; i++) {
cout << matrix[i]; // Display value at this indice
/* Check for next row of the matrix */
if (((i + 1) % rowSize) == 0) {
cout << endl; // Line break
} else {
cout << " | "; // Indice separator
}
}
}
Kernel:
// matrixMult1_kernels.cl
// Multiply two matrices A * B = C
// Device code.
// OpenCL Kernel
__kernel void
matrixMul(__global float* A,
__global float* B,
__global float* C,
int wA, int wB) {
// 2D Thread ID
int tx = get_local_id(0);
int ty = get_local_id(1);
// value stores the element
// that is computed by the thread
float value = 0;
for (int k = 0; k < wA; ++k)
{
float elementA = A[ty * wA + k];
float elementB = B[k * wB + tx];
value += elementA * elementB;
}
// Write the matrix to device memory each
// thread writes one element
C[ty * wA + tx] = value;
}
Sample Output:
Matrix A:
0.398748 | 0.999793 | 0.206833
0.354238 | 0.674347 | 0.492022
0.707017 | 0.353635 | 0.430668
Matrix B:
0.91598 | 0.0260167 | 0.881732
0.810974 | 0.193091 | 0.589857
0.229151 | 0.0657822 | 0.965835
ERROR: C (-30)
I'm working with an NVIDIA GeForce 9800 GT, which only supports OpenCL 1.1. Any help here would be much appreciated.
Thanks,
Joe

The data for input matrices A and B is not passed to the device. When you create the buffers:
cl::Buffer deviceMemA(context, CL_MEM_READ_WRITE, memoryA,blockA, &err)
the blockA argument is ignored, because the flags do not specify how to use it. You need to add at least CL_MEM_COPY_HOST_PTR to initialize the buffer with the contents of blockA.
Alternatively, you can call clEnqueueWriteBuffer to send the data after the buffers are created.

Related

Accessing dynamically allocated arrays on device (without passing them as kernel arguments)

How can an array of structs that has been dynamically allocated on the host be used by a kernel, without passing the array of structs as a kernel argument? This seems like a common procedure with a good amount of documentation online, yet it doesn't work on the following program.
Note: Please note that the following questions have been studied before posting this question:
1) copying host memory to cuda __device__ variable 2) Global variable in CUDA 3) Is there any way to dynamically allocate constant memory? CUDA
So far, unsuccessful attempts have been made to:
Dynamically allocate array of structs with cudaMalloc(), then
Use cudaMemcpyToSymbol() with the pointer returned from cudaMalloc() to copy to a __device__ variable which can be used by the kernel.
Code attempt:
NBody.cu (error checking using cudaStatus has mostly been omitted for better readability, and function to read data from file into dynamic array removed):
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define BLOCK 256
struct nbody {
float x, y, vx, vy, m;
};
typedef struct nbody nbody;
// Global declarations
nbody* particle;
// Device variables
__device__ unsigned int d_N; // Kernel can successfully access this
__device__ nbody d_particle; // Update: part of problem was here with (*)
// Aim of kernel: to print contents of array of structs without using kernel argument
__global__ void step_cuda_v1() {
int i = threadIdx.x + blockDim.x * blockIdx.x;
if (i < d_N) {
printf("%.f\n", d_particle.x);
}
}
int main() {
unsigned int N = 10;
unsigned int I = 1;
cudaMallocHost((void**)&particle, N * sizeof(nbody)); // Host allocation
cudaError_t cudaStatus;
for (int i = 0; i < N; i++) particle[i].x = i;
nbody* particle_buf; // device buffer
cudaSetDevice(0);
cudaMalloc((void**)&particle_buf, N * sizeof(nbody)); // Allocate device mem
cudaMemcpy(particle_buf, particle, N * sizeof(nbody), cudaMemcpyHostToDevice); // Copy data into device mem
cudaMemcpyToSymbol(d_particle, &particle_buf, sizeof(nbody*)); // Copy pointer to data into __device__ var
cudaMemcpyToSymbol(d_N, &N, sizeof(unsigned int)); // This works fine
int NThreadBlock = (N + BLOCK - 1) / BLOCK;
for (int iteration = 0; iteration <= I; iteration++) {
step_cuda_v1 << <NThreadBlock, BLOCK >> > ();
//step_cuda_v1 << <1, 5 >> > (particle_buf);
cudaDeviceSynchronize();
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "ERROR: %s\n", cudaGetErrorString(cudaStatus));
exit(-1);
}
}
return 0;
}
OUTPUT:
"ERROR: kernel launch failed."
Summary:
How can I print the contents of the array of structs from the kernel, without passing it as a kernel argument?
Coding in C using VS2019 with CUDA 10.2
With the help of #Robert Crovella and #talonmies, here is the solution that outputs a sequence that cycles from 0 to 9 repeatedly.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define BLOCK 256
//#include "Nbody.h"
struct nbody {
float x, y, vx, vy, m;
};
typedef struct nbody nbody;
// Global declarations
nbody* particle;
// Device variables
__device__ unsigned int d_N; // Kernel can successfully access this
__device__ nbody* d_particle;
//__device__ nbody d_particle; // Update: part of problem was here with (*)
// Aim of kernel: to print contents of array of structs without using kernel argument
__global__ void step_cuda_v1() {
int i = threadIdx.x + blockDim.x * blockIdx.x;
if (i < d_N) {
printf("%.f\n", d_particle[i].x);
}
}
int main() {
unsigned int N = 10;
unsigned int I = 1;
cudaMallocHost((void**)&particle, N * sizeof(nbody)); // Host allocation
cudaError_t cudaStatus;
for (int i = 0; i < N; i++) particle[i].x = i;
nbody* particle_buf; // device buffer
cudaSetDevice(0);
cudaMalloc((void**)&particle_buf, N * sizeof(nbody)); // Allocate device mem
cudaMemcpy(particle_buf, particle, N * sizeof(nbody), cudaMemcpyHostToDevice); // Copy data into device mem
cudaMemcpyToSymbol(d_particle, &particle_buf, sizeof(nbody*)); // Copy pointer to data into __device__ var
cudaMemcpyToSymbol(d_N, &N, sizeof(unsigned int)); // This works fine
int NThreadBlock = (N + BLOCK - 1) / BLOCK;
for (int iteration = 0; iteration <= I; iteration++) {
step_cuda_v1 << <NThreadBlock, BLOCK >> > ();
//step_cuda_v1 << <1, 5 >> > (particle_buf);
cudaDeviceSynchronize();
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "ERROR: %s\n", cudaGetErrorString(cudaStatus));
exit(-1);
}
}
return 0;
}

OpenCL Callback hangs / freezes (deadlock, pthread_cond_wait)

I created a basic snippet:
Kernel:
__kernel void
kernel1(__global int* a, __global int* b, __global int* c, int size)
{
int idx = get_global_id(0);
if (idx >= 0 && idx < size){
c[idx] = a[idx] + b[idx];
}
}
Code:
#include <CL/cl.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAX_FILE_SIZE 1024000
#include <sys/stat.h>
#include <sys/types.h>
typedef enum ocl_type_e_t {
OCL_TYPE_NULL = 0,
OCL_TYPE_CPU = 1,
OCL_TYPE_GPU = 2,
OCL_TYPE_IGPU = 3,
OCL_TYPE_ACC = 4
} ocl_type_e_t;
const char*
cl_device_type_to_str(cl_device_type type)
{
static char* strings[] = {
"(invalid)", // invalid
"CL_DEVICE_TYPE_CPU",
"CL_DEVICE_TYPE_GPU",
"CL_DEVICE_TYPE_ACCELERATOR",
"CL_DEVICE_TYPE_CUSTOM",
"CL_DEVICE_TYPE_DEFAULT",
"CL_DEVICE_TYPE_ALL",
};
char* ret;
switch (type) {
case CL_DEVICE_TYPE_CPU:
ret = strings[1];
break;
case CL_DEVICE_TYPE_GPU:
ret = strings[2];
break;
case CL_DEVICE_TYPE_ACCELERATOR:
ret = strings[3];
break;
case CL_DEVICE_TYPE_CUSTOM:
ret = strings[4];
break;
case CL_DEVICE_TYPE_DEFAULT:
ret = strings[5];
break;
case CL_DEVICE_TYPE_ALL:
ret = strings[6];
break;
default:
ret = strings[0];
break;
}
return ret;
}
const char*
file_read(char* const path)
{
struct stat st;
/* st = (struct stat*)malloc(sizeof(stat)); */
int error = stat(path, &st);
if (error != 0) {
printf("Invalid file %s\n", path);
exit(EXIT_FAILURE);
}
int size_file = st.st_size;
if (size_file > MAX_FILE_SIZE) {
printf("File %s is bigger than the max allowed size (%d > %d bytes)\n",
path, size_file, MAX_FILE_SIZE);
exit(EXIT_FAILURE);
}
FILE* fp = fopen(path, "r");
if (fp == NULL) {
printf("Error opening the file %s\n", path);
exit(EXIT_FAILURE);
}
char* const buf = (char* const)malloc(size_file);
if (buf == NULL) {
printf("Error allocating %d bytes for the contents of the file %s\n",
size_file, path);
exit(EXIT_FAILURE);
}
int size_read;
while ((size_read = fread(buf, sizeof(char), size_file, fp)) > 0) {
;
}
fclose(fp);
return buf;
}
cl_event clb_events_waiting[100];
int clb_events_waiting_device[100];
int clb_events_init_read[100];
int clb_num_events_waiting = 0;
void
clbWaitEvents(int * c)
{
if (clb_num_events_waiting > 0){
printf("About to wait events: %d\n", clb_num_events_waiting);
int i;
int waiting = 0;
cl_event ev_waiting[100];
printf("%d = CL_QUEUED, %d = CL_COMPLETE, %d = CL_SUBMITTED, %d = CL_RUNNING\n", CL_QUEUED, CL_COMPLETE, CL_SUBMITTED, CL_RUNNING);
for (i=0; i<clb_num_events_waiting; i++){
cl_int ret;
clGetEventInfo(clb_events_waiting[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &ret, NULL);
int dev = clb_events_waiting_device[i];
int init = clb_events_init_read[i] / sizeof(int);
printf("cl_event %s init %6d [%d] = status %d (ref %p)\n", dev == 0 ? "CPU" : (dev == 1 ? "GPU" : "ACC"), init, i, ret, (void*)clb_events_waiting[i]);
if (ret != CL_COMPLETE){
ev_waiting[waiting] = clb_events_waiting[i];
waiting++;
}
}
for (i=0; i<clb_num_events_waiting; i++){
int dev = clb_events_waiting_device[i];
int init = clb_events_init_read[i] / sizeof(int);
printf("%s [%d] = %d, [%d] = %d, [%d] = %d\n", dev == 0 ? "CPU" : (dev == 1 ? "GPU" : "ACC"), init, c[init], init + 1, c[init + 1], init + 2, c[init + 2]);
}
if (waiting > 0){
printf("about to wait %d events\n", waiting);
clWaitForEvents(waiting, ev_waiting);
printf("wait events finished\n");
}
/* clWaitForEvents(clb_num_events_waiting, clb_events_waiting); */
}
}
typedef struct callback_data
{
cl_command_queue* queue;
cl_mem* buf_c;
int* c_v;
uint size;
cl_event* end;
bool nested_callbacks;
bool blocking;
} callback_data;
void CL_CALLBACK callback_read_fn(cl_event event, cl_int ev_status,
void* user_data);
void CL_CALLBACK callback_kernel_fn(cl_event event, cl_int ev_status,
void* user_data);
int
main(int argc, char* argv[])
{
bool use_callbacks = true;
bool use_nested_callbacks = true;
bool use_blocking = false;
int numSelPlatform = 0;
int numSelDevice = 0;
int doUseCallbacks = 0;
int doUseNestedCallbacks = 0;
int doUseBlocking = 0;
int use_type = 0;
if (argc != 7) {
printf("./%s (platform) (device) (type cpu 0|gpu 1|igpu 2|acc 3) (use "
"callbacks) (use nested callbacks) (use blocking)\n",
argv[0]);
exit(EXIT_FAILURE);
} else {
numSelPlatform = atoi(argv[1]);
numSelDevice = atoi(argv[2]);
use_type = atoi(argv[3]);
doUseCallbacks = atoi(argv[4]);
doUseNestedCallbacks = atoi(argv[5]);
doUseBlocking = atoi(argv[6]);
}
cl_event end;
uint size = 1024;
int* a_v = (int*)malloc(size * sizeof(int));
int* b_v = (int*)malloc(size * sizeof(int));
int* c_v = (int*)malloc(size * sizeof(int));
for (size_t i = 0; i < size; i++) {
a_v[i] = i;
b_v[i] = i + 1;
c_v[i] = 0;
}
const char* kernel_str = file_read("src/kernel.cl");
use_callbacks = doUseCallbacks;
use_nested_callbacks = doUseNestedCallbacks;
use_blocking = doUseBlocking ? CL_TRUE : CL_FALSE;
cl_int st;
cl_int err;
int len = 256;
char buflog[len];
cl_uint numPlatforms = 0;
st = clGetPlatformIDs(0, NULL, &numPlatforms);
cl_platform_id* platforms = NULL;
platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));
st = clGetPlatformIDs(numPlatforms, platforms, NULL);
printf("platforms: %d (%d)\n", numPlatforms, st);
cl_uint selPlatform = numSelPlatform; // 1;
numPlatforms = 1;
cl_platform_id platform = platforms[selPlatform];
clGetPlatformInfo(platform, CL_PLATFORM_NAME, len, &buflog, NULL);
if (buflog != NULL) {
printf("platform name: %s\n", buflog);
}
cl_uint numDevices = 0;
st = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
printf("num devices: %d (%d)\n", numDevices, st);
if (st != CL_SUCCESS) {
/* printf("explain error: %s\n", clErrorString(st)); */
printf("error: %d\n", st);
}
cl_device_id* devices = NULL;
devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
st = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
printf("devices: %d (%d)\n", numDevices, st);
// Context
cl_context context;
context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &err);
printf("context (%d)\n", err);
// Select device
cl_uint selDevice = numSelDevice; // 0;
numDevices = 1; // clBuildProgram
cl_device_id device = devices[selDevice];
// Device Info
clGetDeviceInfo(device, CL_DEVICE_NAME, len, &buflog, NULL);
if (buflog != NULL) {
printf("device name: %s\n", buflog);
}
cl_device_type type;
clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL);
printf("device type: %s\n", cl_device_type_to_str(type));
// events
cl_event ev_kernel;
// CommandQueue
/* cl_command_queue_properties props; */
cl_command_queue queue;
queue = clCreateCommandQueue(context, device, 0, &err);
printf("command queue (%d)\n", err);
// CreateBuffer
cl_mem buf_a;
cl_mem buf_b;
cl_mem buf_c;
ocl_type_e_t ocl_type;
if (use_type == 0) {
ocl_type = OCL_TYPE_CPU;
printf("mode CPU\n");
} else if (use_type == 1) {
ocl_type = OCL_TYPE_GPU;
printf("mode GPU\n");
} else if (use_type == 2) {
ocl_type = OCL_TYPE_IGPU;
printf("mode IGPU\n");
} else if (use_type == 3) {
ocl_type = OCL_TYPE_ACC;
printf("mode ACC\n");
}
/* cl_mem buf_x; */
switch (ocl_type) {
case OCL_TYPE_IGPU:
buf_a = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, size * sizeof(int),
a_v, &err);
/* buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE |
* CL_MEM_COPY_HOST_PTR, n * n * sizeof(int), */
/* Acpy, &err); */
break;
case OCL_TYPE_GPU:
buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(int),
a_v, &err);
break;
case OCL_TYPE_ACC:
buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
size * sizeof(int), a_v, &err);
break;
case OCL_TYPE_CPU:
buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
size * sizeof(int), a_v, &err);
break;
default:
printf("no ocl_type defined\n");
exit(EXIT_FAILURE);
break;
}
printf("create buffer a (%d)\n", err);
if (err != CL_SUCCESS) {
/* printf("create buffer error: %s\n", clErrorString(err)); */
printf("create buffer error: %d\n", err);
}
switch (ocl_type) {
case OCL_TYPE_IGPU:
buf_b = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, size * sizeof(int),
b_v, &err);
break;
case OCL_TYPE_GPU:
buf_b = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(int),
b_v, &err);
break;
case OCL_TYPE_ACC:
buf_b = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
size * sizeof(int), b_v, &err);
break;
case OCL_TYPE_CPU:
buf_b = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
size * sizeof(int), b_v, &err);
break;
default:
printf("no ocl_type defined\n");
exit(EXIT_FAILURE);
break;
}
printf("create buffer b (%d)\n", err);
if (err != CL_SUCCESS) {
printf("create buffer error: %d\n", err);
/* printf("create buffer error: %s\n", clErrorString(err)); */
}
switch (ocl_type) {
case OCL_TYPE_IGPU:
buf_c = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, size * sizeof(int),
c_v, &err);
/* buf_c = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, c_rows * c_cols *
* sizeof(int), */
/* c_v, &err); */
/* buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE |
* CL_MEM_COPY_HOST_PTR, n * n * sizeof(int), */
/* Acpy, &err); */
break;
case OCL_TYPE_GPU:
buf_c = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(int),
c_v, &err);
break;
case OCL_TYPE_ACC:
buf_c = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
size * sizeof(int), c_v, &err);
break;
case OCL_TYPE_CPU:
buf_c = clCreateBuffer(context, CL_MEM_READ_WRITE |
CL_MEM_USE_HOST_PTR,
/* buf_c = */
/* clCreateBuffer(context, CL_MEM_USE_HOST_PTR, */
/* buf_c = clCreateBuffer(context, CL_MEM_READ_WRITE, */
size * sizeof(int), c_v, &err);
break;
default:
printf("no ocl_type defined\n");
exit(EXIT_FAILURE);
break;
}
printf("create buffer c (%d)\n", err);
if (err != CL_SUCCESS) {
/* printf("create buffer error: %s\n", clErrorString(err)); */
printf("create buffer error: %d\n", err);
}
/* b_x = clCreateBuffer(context, CL_MEM_WRITE_ONLY, n * sizeof(float), x,
* &err); */
/* printf("create buffer x (%d)\n", err); */
// WriteBuffer
/* st = clEnqueueWriteBuffer(queue, b_a, CL_FALSE, 0, n * n * sizeof(float),
*/
/* Acpy, 0, NULL, NULL); */
/* printf("write buffer Acpy - b_a (%d)\n", st); */
/* st = clEnqueueWriteBuffer(queue, b_b, CL_FALSE, 0, n * sizeof(float), bcpy,
* 0, */
/* NULL, NULL); */
/* printf("write buffer bcpy - b_b (%d)\n", st); */
// Create Program
cl_program program;
program = clCreateProgramWithSource(context, 1, (const char**)&kernel_str,
NULL, &err);
printf("create program (%d)\n", err);
// Build Program
/* st = clBuildProgram(program, numDevices, (cl_device_id*)&device, NULL,
* NULL, */
/* NULL); */
char* opts = "-Werror";
st = clBuildProgram(program, numDevices, (cl_device_id*)&device, opts, NULL,
NULL);
printf("build program (%d)\n", st);
if (st != CL_SUCCESS) {
/* printf("build status: %s\n", clErrorString(st)); */
printf("build status: %d\n", st);
char log[512];
st = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 512, &log,
NULL);
printf("build info (%d)\n", st);
if (st == CL_SUCCESS) {
printf("%s\n", log);
}
}
// Create Kernel
cl_kernel kernel1;
kernel1 = clCreateKernel(program, "kernel1", &st);
printf("create kernel1 (%d)\n", st);
/* cl_kernel kernel2; */
/* kernel2 = clCreateKernel(program, "ocl1_2", &st); */
/* printf("create kernel2 (%d)\n", st); */
// workgroup size
size_t dims = 1;
size_t gws[] = { 1, 1, 1 };
/* size_t gws[dims]; */
gws[0] = size; // a_rows;
/* gws[0] = 32; */
/* size_t* lws = NULL; */
/* size_t lws[dims]; */
/* size_t lws[dims]; */
/* size_t lws[dims] = NULL; */
/* size_t lws[] = {0, 0, 0}; */
size_t lws[] = { 128, 1, 1 };
printf("gws {%lu, %lu, %lu}\n", gws[0], gws[1], gws[2]);
if (lws != NULL) {
printf("lws {%lu, %lu, %lu}\n", lws[0], lws[1], lws[2]);
} else {
printf("lws unspecified\n");
}
// Set Kernel Args
st = clSetKernelArg(kernel1, 0, sizeof(cl_mem), &buf_a);
printf("set arg %d (%d)\n", 0, st);
st = clSetKernelArg(kernel1, 1, sizeof(cl_mem), &buf_b);
printf("set arg %d (%d)\n", 1, st);
/* printf("set kernel1 arg: %d (%d)\n", 0, st); */
st = clSetKernelArg(kernel1, 2, sizeof(cl_mem), &buf_c);
printf("set arg %d (%d)\n", 2, st);
st = clSetKernelArg(kernel1, 3, sizeof(int), (int*)&size);
printf("set arg %d (%d)\n", 3, st);
// Execute kernel
st = clEnqueueNDRangeKernel(queue, kernel1, dims, NULL, (const size_t*)gws,
(const size_t*)lws, 0, NULL, &ev_kernel);
/* (const size_t*)lws, 0, NULL, NULL); */
/* printf("nd range kernel1 (%d %s)\n", st, clErrorString(st)); */
printf("nd range kernel1 (%d)\n", st);
end = clCreateUserEvent(context, &st);
printf("create user event (%d)\n", st);
callback_data* user_data = (callback_data*)malloc(sizeof(callback_data));
printf("c_v %p\n", (void*)c_v);
user_data->queue = &queue;
user_data->buf_c = &buf_c;
user_data->c_v = c_v;
user_data->size = size;
user_data->end = &end;
user_data->nested_callbacks = use_nested_callbacks;
user_data->blocking = use_blocking;
if (use_callbacks) {
st =
clSetEventCallback(ev_kernel, CL_COMPLETE, callback_kernel_fn, user_data);
printf("set event callback (%d)\n", st);
}
/* printf("first: %2.5f\n", c_v[0]); */
/* print_matrix_float_s_t("c", c); */
// ReadBuffer
/* float* ptr = (float*)clEnqueueMapBuffer(queue, buf_c, CL_TRUE, CL_MAP_READ,
* 0, c_rows * c_cols * sizeof(float), 0, NULL, NULL, &st); */
/* printf("read buffer c_v - buf_c (%d)\n", st); */
/* printf("finish queue\n"); */
/* clFinish(queue); */
/* printf("finished queue\n"); */
if (use_callbacks) {
/* clWaitForCompletion(context); */
printf("waiting for events\n");
/* /\* cl_event events[] = {ev_kernel}; *\/ */
cl_event events[] = { end };
clWaitForEvents(1, events); // ev_kernel);
printf("waited for events\n");
clbWaitEvents(c_v);
} else {
printf("about to read the c buffer\n");
st = clEnqueueReadBuffer(queue, buf_c, use_blocking, 0, size * sizeof(int),
c_v, 0, NULL, NULL);
printf("read buffer c_v - buf_c (%d)\n", st);
}
/* print_matrix("c_v", c_v, c_rows, c_cols); */
/* printf("first: %2.5f\n", c_v[0]); */
/* print_matrix_float_s_t("c", c); */
free(user_data);
clReleaseKernel(kernel1);
/* clReleaseKernel(kernel2); */
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseMemObject(buf_a);
clReleaseMemObject(buf_b);
clReleaseMemObject(buf_c);
/* clReleaseMemObject(b_x); */
clReleaseContext(context);
free(devices);
free(platforms);
#define THRESHOLD 0
// check
printf("about to check (first: %d)\n", c_v[0]);
for (size_t i = 0; i < size; i++) {
if (abs(c_v[i] - (a_v[i] + b_v[i])) > THRESHOLD) {
printf("Wrong checking: a_v[%ld] = %d, b_v[%ld] = %d, c_v[%ld] = %d\n", i,
a_v[i], i, b_v[i], i, c_v[i]);
exit(EXIT_FAILURE);
}
}
return EXIT_SUCCESS;
}
void CL_CALLBACK
callback_read_fn(cl_event event, cl_int ev_status, void* user_data)
{
printf("-- BEGIN callback read executed (%d)\n", ev_status);
callback_data* cb_data = (callback_data*)user_data;
/* cl_command_queue queue = *(cb_data->queue); */
/* cl_mem buf_c = *(cb_data->buf_c); */
int* c_v = cb_data->c_v;
cl_event end = *(cb_data->end);
/* int size = cb_data->size; */
cl_int st;
printf("c_v %p\n", (void*)c_v);
printf("c_v[0] = %d\n", c_v[0]);
/* c_v[1] = 1; */
st = clSetUserEventStatus(end, CL_COMPLETE);
printf("set user event status (%d)\n", st);
// haz que salga el finish
printf("-- END\n");
}
cl_event ev_read;
void CL_CALLBACK
callback_kernel_fn(cl_event event, cl_int ev_status, void* user_data)
{
printf("-- BEGIN callback kernel executed (%d)\n", ev_status);
callback_data* cb_data = (callback_data*)user_data;
cl_command_queue queue = *(cb_data->queue);
cl_mem buf_c = *(cb_data->buf_c);
int* c_v = cb_data->c_v;
int size = cb_data->size;
bool nested_callbacks = cb_data->nested_callbacks;
bool blocking = cb_data->blocking;
cl_event end = *(cb_data->end);
printf("c_v %p\n", (void*)c_v);
printf("c_v[0] = %d\n", c_v[0]);
cl_int st;
/* printf("about to flush\n"); */
/* clFlush(queue); */
/* printf("flushed\n"); */
size_t offset = 0;
/* size = size + 4; */
printf("about to read the c buffer\n");
printf("blocking %d\n", blocking);
clb_events_waiting_device[clb_num_events_waiting] = 0;
clb_events_init_read[clb_num_events_waiting] = 0;
/* why it does not work? (blocking CL_TRUE) */
st = clEnqueueReadBuffer(queue, buf_c, blocking, offset, size * sizeof(int),
c_v, 0, NULL, &clb_events_waiting[clb_num_events_waiting++]);
ev_read = clb_events_waiting[clb_num_events_waiting - 1];
printf("enqueue read buffer (%d)\n", st);
/* size * sizeof(int), c_v, 0, NULL, NULL); */
if (nested_callbacks) {
st = clSetEventCallback(ev_read, CL_COMPLETE, callback_read_fn, user_data);
printf("set event callback (%d)\n", st);
/* st = clSetUserEventStatus(end, CL_COMPLETE); */
/* printf("set user event status (%d)\n", st); */
}
/* c_v[1] = 1; */
/* st = clGetEventInfo(ev_read, CL_EVENT_COMMAND_TYPE, ); */
/* printf("event info (%d)\n", st); */
/* int len = 512; */
/* char buflog[len]; */
/* cl_command_type; */
/* clGetEventInfo(ev_read, CL_EVENT_COMMAND_TYPE, len, &buflog, NULL); */
/* if (buflog != NULL) { */
/* printf("- event: %s\n", buflog); */
/* } */
if (!nested_callbacks) {
st = clSetUserEventStatus(end, CL_COMPLETE);
printf("set user event status (%d)\n", st);
/* printf("read buffer c_v - buf_c (%d)\n", st); */
}
printf("-- END\n");
}
And now, if I select the Intel CPU as device:
./callback 0 1 0 1 1 0
It works:
platforms: 1 (0)
platform name: Intel(R) OpenCL
num devices: 2 (0)
devices: 2 (0)
context (0)
device name: Intel(R) Core(TM) i5-6200U CPU # 2.30GHz
device type: CL_DEVICE_TYPE_CPU
command queue (0)
mode CPU
create buffer a (0)
create buffer b (0)
create buffer c (0)
create program (0)
build program (0)
create kernel1 (0)
gws {1024, 1, 1}
lws {128, 1, 1}
set arg 0 (0)
set arg 1 (0)
set arg 2 (0)
set arg 3 (0)
nd range kernel1 (0)
create user event (0)
c_v 0x1420030
set event callback (0)
waiting for events
-- BEGIN callback kernel executed (0)
c_v 0x1420030
c_v[0] = 0
about to read the c buffer
blocking 0
enqueue read buffer (0)
set event callback (0)
-- END
-- BEGIN callback read executed (0)
c_v 0x1420030
c_v[0] = 1
set user event status (0)
-- END
waited for events
About to wait events: 1
3 = CL_QUEUED, 0 = CL_COMPLETE, 2 = CL_SUBMITTED, 1 = CL_RUNNING
cl_event CPU init 0 [0] = status 0 (ref 0x7f7568000a90)
CPU [0] = 1, [1] = 3, [2] = 5
about to check (first: 1)
Now, if I select the Intel IGPU (Intel Integrated GPU):
./callback 0 0 2 1 1 0
It is freezes / hangs:
platforms: 1 (0)
platform name: Intel(R) OpenCL
num devices: 2 (0)
devices: 2 (0)
context (0)
device name: Intel(R) HD Graphics
device type: CL_DEVICE_TYPE_GPU
command queue (0)
mode IGPU
create buffer a (0)
create buffer b (0)
create buffer c (0)
create program (0)
build program (0)
create kernel1 (0)
gws {1024, 1, 1}
lws {128, 1, 1}
set arg 0 (0)
set arg 1 (0)
set arg 2 (0)
set arg 3 (0)
nd range kernel1 (0)
create user event (0)
c_v 0x18b7030
set event callback (0)
waiting for events
-- BEGIN callback kernel executed (0)
c_v 0x18b7030
c_v[0] = 0
about to read the c buffer
blocking 0
enqueue read buffer (0)
set event callback (0)
-- END
If I use gdb and run the same test, and do C-c, I can see:
(gdb) r 0 0 2 1 1 0
Starting program: /callbacks/build/callback 0 0 2 1 1 0
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/usr/lib/libthread_db.so.1".
[New Thread 0x7ffff4cd9700 (LWP 21291)]
platforms: 1 (0)
platform name: Intel(R) OpenCL
num devices: 2 (0)
devices: 2 (0)
[New Thread 0x7fffeede2700 (LWP 21292)]
[New Thread 0x7fffee5e0700 (LWP 21293)]
[New Thread 0x7fffee9e1700 (LWP 21294)]
context (0)
device name: Intel(R) HD Graphics
device type: CL_DEVICE_TYPE_GPU
command queue (0)
mode IGPU
create buffer a (0)
create buffer b (0)
create buffer c (0)
create program (0)
build program (0)
create kernel1 (0)
gws {1024, 1, 1}
lws {128, 1, 1}
set arg 0 (0)
set arg 1 (0)
set arg 2 (0)
set arg 3 (0)
nd range kernel1 (0)
create user event (0)
c_v 0x607030
[New Thread 0x7fffec827700 (LWP 21295)]
set event callback (0)
waiting for events
-- BEGIN callback kernel executed (0)
c_v 0x607030
c_v[0] = 0
about to read the c buffer
blocking 0
enqueue read buffer (0)
set event callback (0)
-- END
^C
Thread 1 "callback" received signal SIGINT, Interrupt.
0x00007ffff730a756 in pthread_cond_wait##GLIBC_2.3.2 () from /usr/lib/libpthread.so.0
(gdb) bt
#0 0x00007ffff730a756 in pthread_cond_wait##GLIBC_2.3.2 () from /usr/lib/libpthread.so.0
#1 0x00007ffff64c635b in ?? () from /opt/intel/opencl/libintelocl.so
#2 0x00007ffff648c63a in ?? () from /opt/intel/opencl/libintelocl.so
#3 0x00007ffff647b5d1 in ?? () from /opt/intel/opencl/libintelocl.so
#4 0x00007ffff63f3e75 in clWaitForEvents () from /opt/intel/opencl/libintelocl.so
#5 0x00007ffff6edca43 in ?? () from /opt/intel/opencl/libIntelOpenCL.so
#6 0x000000000040237e in main (argc=7, argv=0x7fffffffdc58) at ./src/callback.c:532
As you can see in the first example of execution (CPU) it should appear the two callbacks (two BEGIN/END pairs). In the case of HD Graphics GPU it hangs after the first callback (only one BEGIN/END pair).
Why?
(gdb shows that is freezed in the pthread_cond_wait of the intel opencl driver).
Can anyone explain really what is the behavior with the callbacks/events and the host thread? (best practices, how to avoid deadlocks)
I need fine grained control and the fastest performance, and it looks like is callbacks, but they have weird behaviors...
Expected behavior (only occurs in the CPU, not in the IGPU):
1. The host creates an user event. Then, the host calls a EnqueueKernelNDRange (vector addition) and waits for the user event (WaitForEvents). When the kernel finishes it triggers the callback "callback_kernel".
2. This "callback_kernel" calls a EnqueueReadBuffer non-blocking, and when it finishes triggers the callback "callback_read".
3. The "callback_read" sets CL_COMPLETE the user event.
4. The host continues after the WaitForEvents with the content filled (buffer read).
Your problem is the following line:
/* why it does not work? (blocking CL_TRUE) */
st = clEnqueueReadBuffer(queue, buf_c, blocking, offset, size * sizeof(int),c_v, 0, NULL, &clb_events_waiting[clb_num_events_waiting++]);
Inside the callback function, you are trying to issue a blocking call to clEnqueueReadBuffer, which is not allowed in OpenCL. You should check the specification notes which functions are not allowed from the following link.
https://www.khronos.org/registry/OpenCL/sdk/1.1/docs/man/xhtml/clSetEventCallback.html
I also recommend you to read the whole callback section from the specification your driver supports, I am adding the corresponding section of the latest OpenCL spec 2.2 here.
https://www.khronos.org/registry/OpenCL/specs/opencl-2.2.pdf#page=197

OPENCL API's take almost same time irrespective of sample size

I've been trying to profile an OpenCL host code for FIR filtering on MAC, Ubuntu and other platforms. My Host code and kernel are as below.
The issue is that irrespective of the number of samples that I provide for the FIR filter, the clenquendrangelernel ends up taking the same amount of time. Also I've profiled the clEnqueueReadBuffer and clEnqueueWriteBuffer as well and somehow they also end up taking the same amount of time. In mac I'm profiling with mach as well as using OpenCL events, in ubuntu, I'm profiling with PAPI. Im unable to understand why this is happening, ideally with increase in the number of samples, the clEnqueueReadBuffer and clEnqueueWriteBuffer should take more time and so should kernel execution.
Kernel:-
__kernel void fir4(
__global float* input,
__global float* output)
{
int i = get_global_id(0);
int j = 0;
int coeff[4] = {5,7,5,7};
/*for(j=0;j<4;j++)
{
output[i] += coeff[j]*(input[i+4-j-1]);
}*/
//unrolled
output[i] += coeff[0]*(input[i+4-0-1]);
output[i] += coeff[1]*(input[i+4-1-1]);
output[i] += coeff[2]*(input[i+4-2-1]);
output[i] += coeff[3]*(input[i+4-3-1]);
}
__kernel void fir8(
__global float* input,
__global float* output)
{
int i = get_global_id(0);
int j = 0;
int coeff[8] = {5,7,5,7,5,7,5,7};
for(j=0;j<8;j++)
{
output[i] += coeff[j]*(input[i+8-j-1]);
}
}
__kernel void fir12(
__global float* input,
__global float* output)
{
int i = get_global_id(0);
int j = 0;
int coeff[12] = {5,7,5,7,5,7,5,7,5,7,5,7};
for(j=0;j<12;j++)
{
output[i] += coeff[j]*(input[i+12-j-1]);
}
}
Host Code:-
// Use a static data size for simplicity
//
#define DATA_SIZE (48000)
#define NUM_COEFF (4)
int main(int argc, char** argv)
{
uint64_t start;
uint64_t end;
uint64_t elapsed;
double elapsedmilli;
int err; // error code returned from api calls
float data[DATA_SIZE]; // original data set given to device
float coeff[NUM_COEFF];
float results_host[DATA_SIZE] = {};
float results[DATA_SIZE]; // results returned from device
unsigned int correct; // number of correct results returned
size_t global; // global domain size for our calculation
size_t local; // local domain size for our calculation
cl_event event; //Linking event to kernel for profiling
cl_platform_id platform_id = NULL; // compute device platform id
cl_device_id device_id; // compute device id
cl_context context; // compute context
cl_command_queue commands; // compute command queue
cl_program program; // compute program
cl_kernel kernel; // compute kernel
cl_mem input; // device memory used for the input array
cl_mem output; // device memory used for the output array
// Fill our data set with random float values
//
int i,j = 0;
unsigned int count = DATA_SIZE;
unsigned int taps = NUM_COEFF;
for(i = 0; i < count; i++)
data[i] = rand() / (float)RAND_MAX;
for(i=0; i < taps; i++)
{
if(!(i%2))
coeff[i] = 5;
else
coeff[i] = 7;
}
//Connect to a platform on device
err = clGetPlatformIDs(1, &platform_id, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to locate opencl platform!\n");
return EXIT_FAILURE;
}
// Connect to a compute device
//
int gpu = 0;
err = clGetDeviceIDs(platform_id, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to create a device group!\n");
return EXIT_FAILURE;
}
// Create a compute context
//
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if (!context)
{
printf("Error: Failed to create a compute context!\n");
return EXIT_FAILURE;
}
// Create a command commands
//
commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
if (!commands)
{
printf("Error: Failed to create a command commands!\n");
return EXIT_FAILURE;
}
//Use function and load the kernel source from .cl files in the same folder
//
char *KernelSource = load_program_source("fir.cl");
// Create the compute program from the source buffer
//
program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
if (!program)
{
printf("Error: Failed to create compute program!\n");
return EXIT_FAILURE;
}
// Build the program executable
//
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
size_t len;
char buffer[2048];
printf("Error: Failed to build program executable!\n");
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
exit(1);
}
// Create the compute kernel in the program we wish to run
//
switch(taps)
{
case(4):
{
kernel = clCreateKernel(program, "fir4", &err);
break;
}
case(8):
{
kernel = clCreateKernel(program, "fir8", &err);
break;
}
case(12):
{
kernel = clCreateKernel(program, "fir12", &err);
break;
}
default:
{
kernel = clCreateKernel(program, "fir4", &err);
break;
}
}
if (!kernel || err != CL_SUCCESS)
{
printf("Error: Failed to create compute kernel! - %d\n",err);
exit(1);
}
// Create the input and output arrays in device memory for our calculation
//
input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
if (!input || !output)
{
printf("Error: Failed to allocate device memory!\n");
exit(1);
}
// Write our data set into the input array in device memory
//
err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to write to source array!\n");
exit(1);
}
// Set the arguments to our compute kernel
//
err = 0;
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
if (err != CL_SUCCESS)
{
printf("Error: Failed to set kernel arguments! %d\n", err);
exit(1);
}
// Get the maximum work group size for executing the kernel on the device
//
err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to retrieve kernel work group info! %d\n", err);
exit(1);
}
// Execute the kernel over the entire range of our 1d input data set
// using the maximum number of work group items for this device
//
global = count;
local = 48;
start = mach_absolute_time();
err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, &event);
if (err)
{
printf("Error: Failed to execute kernel!-%d\n",err);
return EXIT_FAILURE;
}
// Wait for the command commands to get serviced before reading back results
//
clWaitForEvents(1, &event);
clFinish(commands);
end = mach_absolute_time();
cl_ulong time_start, time_end;
double total_time;
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
total_time = time_end - time_start;
printf("cl:main timing:opencl clEnqueueNDRangeKernel %0.3f us\n", total_time / 1000.0);
elapsed = end - start;
struct mach_timebase_info info;
mach_timebase_info(&info);
double t = 1e-9 * (elapsed) * info.numer / info.denom;
elapsedmilli = 1e-6 * (elapsed) * info.numer / info.denom;
printf("cl:main timing:MACH clEnqueueNDRangeKernel %f ms, %d elapsed\n",elapsedmilli,elapsed);
// Read back the results from the device to verify the output
//
err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );
if (err != CL_SUCCESS)
{
printf("Error: Failed to read output array! %d\n", err);
exit(1);
}
// Validate our results
//
correct = 0;
for(i=0; i<DATA_SIZE; i++)
{
for(j=0;j<NUM_COEFF;j++)
{
results_host[i]+=coeff[j]*(data[i+NUM_COEFF-j-1]);
}
//printf("Host Output[%d]-%f\n",i,results_host[i]);
}
for(i = 0; i < count; i++)
{
if(results[i] == results_host[i])
correct++;
//printf("CL Output[%d]-%f\n",i,results[i]);
}
// Print a brief summary detailing the results
//
printf("Computed '%d/%d' correct values! Samples-%d,Taps-%d\n", correct, count, DATA_SIZE, NUM_COEFF);
// Shutdown and cleanup
//
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(commands);
clReleaseContext(context);
return 0;
}
Adding just 10-20 multiplications and additions per item is not comparable to kernel overhead time. Try with 100 or 1000-wide coefficients array.
Using more input elements per item with that way, just increases cache hit numbers(also ratio) because more threads read from same locations.
If DATA_SIZE is several millions, then all data could not fit in cache and become slower linearly with its length. 48000 means less than 200kB. A HD5850 has 512 k L2 cache(3x bandwidth of memory) and 8kB L1 per compute unit(too fast) for example.

Simple OpenCL program not working

This program is a simple parallel program which adds the elements of 2 vectors.
The program was error free and it was compiled successfully but the results are not right
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <iomanip>
#include <array>
#include <fstream>
#include <sstream>
#include <string>
#include <algorithm>
#include <iterator>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#include <time.h>
#endif
#define MAX_SOURCE_SIZE (0x100000)
// number of points in Both A and B files (number of rows)
const int number_of_points = 11;
// number of points axis in Both A and B files (number of Columns)
const int number_of_axis = 3;
using namespace std;
int main(int argc, char *argv[]) {
clock_t tStart = clock();
// Create the two input vectors
// working variables
int i;
ifstream input_fileA, input_fileB; // input files
string line; // transfer row from file to array
float x; // transfer word from file to array
int row = 0; // number of rows of file A,B (= array)
int col = 0; // number of rows of file A,B (= array)
// working arrays
// array contains file A data
float arrayA[number_of_points][number_of_axis]={{0}};
// array contains file B data
float arrayB[number_of_points][number_of_axis]={{0}};
// float X1[number_of_points]; // X values of file A points
float Y1[number_of_points]; // Y values of file A points
// float X2[number_of_points]; // X values of file B points
float Y2[number_of_points]; // Y values of file B points
float *X1 = (float*)malloc(sizeof(float)*number_of_points);
float *X2 = (float*)malloc(sizeof(float)*number_of_points);
// import input files
input_fileA.open(argv[1]);
input_fileB.open(argv[2]);
// transfer input files data to array
// input file A to arrayA
row = 0;
while (getline(input_fileA, line))
{
istringstream streamA(line);
col = 0;
while(streamA >> x){
arrayA[row][col] = x;
col++;
}
row++;
}
// input file B to arrayB
row = 0;
while (getline(input_fileB, line))
{
istringstream streamB(line);
col = 0;
while(streamB >> x){
arrayB[row][col] = x;
col++;
}
row++;
}
// put Xs of points in X vectors and Ys of points in Y vectors
// input file A
for (int i = 0; i<number_of_points; i++){
X1[i] = arrayA[i][1];
Y1[i] = arrayA[i][2];
}
// input file B
for (int i = 0; i<number_of_points; i++){
X2[i] = arrayB[i][1];
Y2[i] = arrayB[i][2];
}
// int i;
// const int LIST_SIZE = 50;
// int *A = (int*)malloc(sizeof(int)*number_of_points);
// int *B = (int*)malloc(sizeof(int)*number_of_points);
// for(i = 0; i < number_of_points; i++) {
// A[i] = X1[i];
// B[i] = X2[i];
// }
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("vector_add_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_ALL, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context =
clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue =
clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem x1_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
number_of_points * sizeof(float), NULL, &ret);
cl_mem x2_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
number_of_points * sizeof(float), NULL, &ret);
cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
number_of_points * sizeof(float), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, x1_mem_obj, CL_TRUE, 0,
number_of_points * sizeof(float), X1, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, x2_mem_obj, CL_TRUE, 0,
number_of_points * sizeof(float), X2, 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&x1_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&x2_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
// Execute the OpenCL kernel on the list
size_t global_item_size = number_of_points; // Process the entire lists
size_t local_item_size = 64; // Process in groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer C on the device to the local variable C
// int *C = (int*)malloc(sizeof(int)*number_of_points);
float *C = (float*)malloc(sizeof(float)*number_of_points);
ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
number_of_points * sizeof(float), C, 0, NULL, NULL);
// Display the result to the screen
for(i = 0; i < number_of_points; i++)
printf("%f + %f = %f\n", X1[i], X2[i], C[i]);
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(x1_mem_obj);
ret = clReleaseMemObject(x2_mem_obj);
ret = clReleaseMemObject(c_mem_obj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(X1);
free(X2);
free(C);
printf("ALL Time taken: %.2fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);
return 0;
}
and the kernel file
__kernel void vector_add(__global float *X1,
__global float *X2,
__global float *C) {
// Get the index of the current element
int i = get_global_id(0);
// Do the operation
C[i] = X1[i] + X2[i];
}
The result was
0.000000 + 0.000000 = 0.000000
1.000000 + 1.000000 = 0.000000
2.000000 + 2.000000 = 0.000000
3.000000 + 3.000000 = 0.000000
4.000000 + 4.000000 = 0.000000
5.000000 + 5.000000 = 0.000000
6.000000 + 6.000000 = 0.000000
7.000000 + 7.000000 = 0.000000
8.000000 + 8.000000 = 0.000000
9.000000 + 9.000000 = 0.000000
1.000000 + 1.000000 = 0.000000
ALL Time taken: 0.07s
You've committed one of the cardinal sins of OpenCL programming, in that you are not checking the error codes from any of your OpenCL API calls! You should always check the return code from every single OpenCL API call. If you did this, it would point you towards the problem very quickly.
The problem is in your kernel enqueue call. If you check the error code, you'll see that you are getting -54 back, which corresponds to CL_INVALID_WORK_GROUP_SIZE. Specifically, kernel invocations have the requirement that the work-group size (local size) exactly divides the global size. You are asking for a work-group size of 64 and a global size of 11, which does not fulfil this requirement.
You can also pass NULL as the work-group size parameter, and the OpenCL implementation will pick a work-group size that will definitely work on your behalf.

OpenCL HelloWorld OSX Linker Error

I tried to build the HelloWorld OpenCL example from the OpenCL Mac Programming Guide:
http://developer.apple.com/library/mac/documentation/Performance/Conceptual/OpenCL_MacProgGuide/OpenCL_MacProgGuide.pdf
through Xcode 4.6 on an iMac (Late 2012, OSX 10.8.3) and on my MacBook (Early 2008, OSX 10.7.5) and I'm getting what seems to be linking errors:
https://www.dropbox.com/s/iih87g0495qn4c6/Screen%20Shot%202013-06-04%20at%203.35.55%20PM.png (I tried copy-pasting but the formatting looks horrendous.)
I followed all the instructions in OpenCL Mac Programming Guide, dated 2012-07-23, but no cigar.
// mykernel.cl
kernel void square(global float* input, global float* output) {
size_t i = get_global_id(0);
output[i] = input[i] * input[i];
}
// main.c
// HelloWorld
#include <stdio.h>
#include <OpenCL/OpenCL.h>
#include "mykernel.cl.h"
static void print_device_info(cl_device_id device) {
char name[128];
char vendor[128];
clGetDeviceInfo(device, CL_DEVICE_NAME, 128, name, NULL);
clGetDeviceInfo(device, CL_DEVICE_VENDOR, 128, vendor, NULL);
fprintf(stdout, "%s : %s\n", vendor, name);
}
However, the older, longer version of the OpenCL HelloWorld from Apple, works correctly:
//
// File: hello.c
//
// Abstract: A simple "Hello World" compute example showing basic usage of OpenCL which
// calculates the mathematical square (X[i] = pow(X[i],2)) for a buffer of
// floating point values.
//
//
// Version: <1.0>
//
// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple")
// in consideration of your agreement to the following terms, and your use,
// installation, modification or redistribution of this Apple software
// constitutes acceptance of these terms. If you do not agree with these
// terms, please do not use, install, modify or redistribute this Apple
// software.
//
// In consideration of your agreement to abide by the following terms, and
// subject to these terms, Apple grants you a personal, non - exclusive
// license, under Apple's copyrights in this original Apple software ( the
// "Apple Software" ), to use, reproduce, modify and redistribute the Apple
// Software, with or without modifications, in source and / or binary forms;
// provided that if you redistribute the Apple Software in its entirety and
// without modifications, you must retain this notice and the following text
// and disclaimers in all such redistributions of the Apple Software. Neither
// the name, trademarks, service marks or logos of Apple Inc. may be used to
// endorse or promote products derived from the Apple Software without specific
// prior written permission from Apple. Except as expressly stated in this
// notice, no other rights or licenses, express or implied, are granted by
// Apple herein, including but not limited to any patent rights that may be
// infringed by your derivative works or by other works in which the Apple
// Software may be incorporated.
//
// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO
// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
// ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
//
// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
//
////////////////////////////////////////////////////////////////////////////////
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <OpenCL/opencl.h>
////////////////////////////////////////////////////////////////////////////////
// Use a static data size for simplicity
//
#define DATA_SIZE (1024)
////////////////////////////////////////////////////////////////////////////////
// Simple compute kernel which computes the square of an input array
//
const char *KernelSource = "\n" \
"__kernel void square( \n" \
" __global float* input, \n" \
" __global float* output, \n" \
" const unsigned int count) \n" \
"{ \n" \
" int i = get_global_id(0); \n" \
" if(i < count) \n" \
" output[i] = input[i] * input[i]; \n" \
"} \n" \
"\n";
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
int err; // error code returned from api calls
float data[DATA_SIZE]; // original data set given to device
float results[DATA_SIZE]; // results returned from device
unsigned int correct; // number of correct results returned
size_t global; // global domain size for our calculation
size_t local; // local domain size for our calculation
cl_device_id device_id; // compute device id
cl_context context; // compute context
cl_command_queue commands; // compute command queue
cl_program program; // compute program
cl_kernel kernel; // compute kernel
cl_mem input; // device memory used for the input array
cl_mem output; // device memory used for the output array
// Fill our data set with random float values
//
int i = 0;
unsigned int count = DATA_SIZE;
for(i = 0; i < count; i++)
data[i] = rand() / (float)RAND_MAX;
// Connect to a compute device
//
int gpu = 1;
err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to create a device group!\n");
return EXIT_FAILURE;
}
// Create a compute context
//
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if (!context)
{
printf("Error: Failed to create a compute context!\n");
return EXIT_FAILURE;
}
// Create a command commands
//
commands = clCreateCommandQueue(context, device_id, 0, &err);
if (!commands)
{
printf("Error: Failed to create a command commands!\n");
return EXIT_FAILURE;
}
// Create the compute program from the source buffer
//
program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
if (!program)
{
printf("Error: Failed to create compute program!\n");
return EXIT_FAILURE;
}
// Build the program executable
//
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
size_t len;
char buffer[2048];
printf("Error: Failed to build program executable!\n");
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
exit(1);
}
// Create the compute kernel in the program we wish to run
//
kernel = clCreateKernel(program, "square", &err);
if (!kernel || err != CL_SUCCESS)
{
printf("Error: Failed to create compute kernel!\n");
exit(1);
}
// Create the input and output arrays in device memory for our calculation
//
input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
if (!input || !output)
{
printf("Error: Failed to allocate device memory!\n");
exit(1);
}
// Write our data set into the input array in device memory
//
err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to write to source array!\n");
exit(1);
}
// Set the arguments to our compute kernel
//
err = 0;
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
if (err != CL_SUCCESS)
{
printf("Error: Failed to set kernel arguments! %d\n", err);
exit(1);
}
// Get the maximum work group size for executing the kernel on the device
//
err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to retrieve kernel work group info! %d\n", err);
exit(1);
}
// Execute the kernel over the entire range of our 1d input data set
// using the maximum number of work group items for this device
//
global = count;
err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
if (err)
{
printf("Error: Failed to execute kernel!\n");
return EXIT_FAILURE;
}
// Wait for the command commands to get serviced before reading back results
//
clFinish(commands);
// Read back the results from the device to verify the output
//
err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );
if (err != CL_SUCCESS)
{
printf("Error: Failed to read output array! %d\n", err);
exit(1);
}
// Validate our results
//
correct = 0;
for(i = 0; i < count; i++)
{
if(results[i] == data[i] * data[i])
correct++;
}
// Print a brief summary detailing the results
//
printf("Computed '%d/%d' correct values!\n", correct, count);
// Shutdown and cleanup
//
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(commands);
clReleaseContext(context);
return 0;
}
I have not used OpenCL, but the linker error is saying it can't find a main() function in your program. Is your first code listing the complete program? If so, you need to add a main() function to the main.c file that calls your print_device_info() function.

Resources