Issue with clEnqueueReadBuffer Function in OpenCL - in an Array Sum Sample

Hi everyone,
I'm a beginner with OpenCL and I wrote some simple code in C which sums two arrays. Here is part of the code:
// Create Kernel.
cl_kernel kernelSum = clCreateKernel( myProgram, "sum", &error );
// Set Input Array.
size_t arraySize = 1000;
char* a = ( char* ) malloc( sizeof( char ) * arraySize );
char* b = ( char* ) malloc( sizeof( char ) * arraySize );
char* c = ( char* ) malloc( sizeof( char ) * arraySize );
for (int i = 0; i < arraySize; i += 1)
{
    a[ i ] = 1;
    b[ i ] = 2;
    c[ i ] = -1;
}
// Set Buffers.
cl_mem a_buffer = clCreateBuffer(
    myContext,
    CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
    arraySize * sizeof( char ), a,
    &error );
cl_mem b_buffer = clCreateBuffer(
    myContext,
    CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
    arraySize * sizeof( char ), b,
    &error );
cl_mem c_buffer = clCreateBuffer(
    myContext,
    CL_MEM_WRITE_ONLY,
    arraySize * sizeof( char ), NULL,
    &error );
printf( "Buffers created.\n" );
// Setting Kernel Arguments.
error = clSetKernelArg( kernelSum, 0, sizeof( cl_mem ), &a_buffer );
error |= clSetKernelArg( kernelSum, 1, sizeof( cl_mem ), &b_buffer );
error |= clSetKernelArg( kernelSum, 2, sizeof( cl_mem ), &c_buffer );
printf( "Arguments Set.\n" );
// Enqueue kernels to execute.
cl_event event;
size_t globalWorkOffset = 0;
size_t globalWorkSize[ 1 ] = { arraySize };
size_t localWorkSize[ 1 ] = { 1 };
clEnqueueNDRangeKernel(
    myCommandQueue,
    kernelSum,
    1, // work_dim
    0, // global work offset
    globalWorkSize,
    localWorkSize, // local work size
    0, NULL,
    &event
);
printf( "Kernel Enqueued.\n" );
error = clEnqueueReadBuffer(
    myCommandQueue,
    c_buffer,
    CL_TRUE, // blocking option
    ( size_t ) 0, arraySize * sizeof( char ), // offset, data_size
    c, // host_ptr
    0, NULL,
    &event );
if ( error != CL_SUCCESS )
{
    printf( "Buffer Reading Back Failed.\n" );
    exit( 1 );
}
However, I got an incorrect result: all the numbers in the "c" array are zeros. I think it may have something to do with clEnqueueReadBuffer, or perhaps not. Any ideas about this issue? Looking forward to your suggestions! :-)

Your call to clEnqueueReadBuffer will not wait for the kernel to finish. It will most likely execute simultaneously with the kernel. Change the call to:
error = clEnqueueReadBuffer(
    myCommandQueue,
    c_buffer,
    CL_TRUE, // blocking option
    ( size_t ) 0, arraySize * sizeof( char ), // offset, data_size
    c, // host_ptr
    1, &event,
    NULL );
This will cause clEnqueueReadBuffer to wait for the kernel event to finish before starting to read the buffer.
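Alternatively (just a sketch, not from the original code), you can block on the whole queue with clFinish before reading; the event wait list above is more precise, but this also guarantees the kernel has completed:
// Wait for everything previously enqueued (including the kernel) to finish.
clFinish( myCommandQueue );
error = clEnqueueReadBuffer(
    myCommandQueue,
    c_buffer,
    CL_TRUE, // blocking read
    0, arraySize * sizeof( char ),
    c,
    0, NULL,
    NULL );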

Everyone, all your suggestions are really helpful. It's with your help that I finally found what I'd done wrong:
clBuildProgram( myProgram, 1, &device, NULL, NULL, &error );
This is my original code.
The official definition of this function is:
cl_int clBuildProgram(cl_program program,
                      cl_uint num_devices,
                      const cl_device_id *device_list,
                      const char *options,
                      void (CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
                      void *user_data)
I found I had set the last argument to "&error"; I had mixed this function up with some others whose argument lists usually end with an error code. Here, I should have called it like this:
clBuildProgram( myProgram, 1, &device, NULL, NULL, NULL );
With the last argument set to NULL, the program then ran correctly.
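For reference, here is a minimal sketch (reusing my myProgram and device variables) of how the return value of clBuildProgram can be checked and the build log printed on failure:
cl_int buildError = clBuildProgram( myProgram, 1, &device, NULL, NULL, NULL );
if ( buildError != CL_SUCCESS )
{
    size_t logSize = 0;
    clGetProgramBuildInfo( myProgram, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize );
    char* log = ( char* ) malloc( logSize );
    clGetProgramBuildInfo( myProgram, device, CL_PROGRAM_BUILD_LOG, logSize, log, NULL );
    printf( "Build failed:\n%s\n", log );
    free( log );
}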
Thank you all very much! :-)

Related

KNEM cookie and declared region

First question: I wasn't able to find PROT_WRITE and PROT_READ anywhere, and that's giving me a hard time compiling. I replaced them with 0 and 1, but it doesn't seem to work.
Second, I get "rejected (unexisting region cookie)":
int rank;
MPI_Init( &argc, &argv );
MPI_Comm_rank( MPI_COMM_WORLD, &rank );
MPI_Win win;
int knem_fd = open("/dev/knem", O_RDWR);
int err;
uint64_t size = 64;
if( rank == 0 ){
char *inbuf = malloc(size);
for( int i = 0; i < size; i++ )
inbuf[i] = rand() % 26 + 97;
print_array( inbuf, size, '0' );
struct knem_cmd_create_region create;
struct knem_cmd_param_iovec knem_iov[1];
knem_iov[0].base = (uint64_t)&inbuf;
knem_iov[0].len = size;
create.iovec_array = (uintptr_t) &knem_iov[0];
create.iovec_nr = 1;
create.flags = KNEM_FLAG_SINGLEUSE;
//create.protection = 1;
err = ioctl( knem_fd, KNEM_CMD_CREATE_REGION, &create );
MPI_Send( &(create.cookie), 1, MPI_UINT64_T, 1, 0, MPI_COMM_WORLD );
MPI_Barrier( MPI_COMM_WORLD );
} else if( rank == 1 ){
char *obuf = malloc(size);
int err;
struct knem_cmd_copy copy;
struct knem_cmd_create_region create;
struct knem_cmd_param_iovec knem_iov[1];
knem_iov[0].base = (uint64_t)&obuf;
knem_iov[0].len = size;
create.iovec_array = (uintptr_t) &knem_iov[0];
create.iovec_nr = 1;
//create.protection = 0;
create.flags = KNEM_FLAG_SINGLEUSE;
err = ioctl( knem_fd, KNEM_CMD_CREATE_REGION, &create );
MPI_Recv( &(copy.src_cookie), 1, MPI_UINT64_T, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE );
copy.src_offset = 0;
copy.dst_cookie = create.cookie;
copy.dst_offset = 0;
copy.flags = 0;
err = ioctl(knem_fd, KNEM_CMD_COPY, &copy);
print_array( obuf, size, '1' );
MPI_Barrier( MPI_COMM_WORLD );
}
Ranks 0 and 1 both create a region, 0 sends its cookie to 1, and 1 goes in and grabs the data from 0. I checked that the received cookie is the same as the sent cookie, but it still fails to find the declared region.
PROT_READ and PROT_WRITE are mmap flags; you need to include sys/mman.h to get them. In the second part of the code, you need to set copy.src_cookie to create.cookie (or just use an inline copy to avoid creating that region at all, since it'll be destroyed immediately because of the SINGLEUSE flag). Also, make sure you check the return values of all ioctls before continuing. Copy cannot work if create.cookie wasn't initialized because the create ioctl failed.
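For example, a minimal sketch of that kind of ioctl checking, using the variables from the question (the abort-on-failure style is just illustrative):
err = ioctl( knem_fd, KNEM_CMD_CREATE_REGION, &create );
if ( err < 0 ) {
    /* create.cookie is not valid here, so don't send or use it */
    perror( "KNEM_CMD_CREATE_REGION" );
    MPI_Abort( MPI_COMM_WORLD, 1 );
}
...
err = ioctl( knem_fd, KNEM_CMD_COPY, &copy );
if ( err < 0 ) {
    perror( "KNEM_CMD_COPY" );
    MPI_Abort( MPI_COMM_WORLD, 1 );
}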

Simple OpenCL program not working

This is a simple parallel program which adds the elements of two vectors.
The program compiled successfully without errors, but the results are not right:
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <iomanip>
#include <array>
#include <fstream>
#include <sstream>
#include <string>
#include <algorithm>
#include <iterator>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#include <time.h>
#endif
#define MAX_SOURCE_SIZE (0x100000)
// number of points in Both A and B files (number of rows)
const int number_of_points = 11;
// number of points axis in Both A and B files (number of Columns)
const int number_of_axis = 3;
using namespace std;
int main(int argc, char *argv[]) {
clock_t tStart = clock();
// Create the two input vectors
// working variables
int i;
ifstream input_fileA, input_fileB; // input files
string line; // transfer row from file to array
float x; // transfer word from file to array
int row = 0; // number of rows of file A,B (= array)
int col = 0; // number of columns of file A,B (= array)
// working arrays
// array contains file A data
float arrayA[number_of_points][number_of_axis]={{0}};
// array contains file B data
float arrayB[number_of_points][number_of_axis]={{0}};
// float X1[number_of_points]; // X values of file A points
float Y1[number_of_points]; // Y values of file A points
// float X2[number_of_points]; // X values of file B points
float Y2[number_of_points]; // Y values of file B points
float *X1 = (float*)malloc(sizeof(float)*number_of_points);
float *X2 = (float*)malloc(sizeof(float)*number_of_points);
// import input files
input_fileA.open(argv[1]);
input_fileB.open(argv[2]);
// transfer input files data to array
// input file A to arrayA
row = 0;
while (getline(input_fileA, line))
{
istringstream streamA(line);
col = 0;
while(streamA >> x){
arrayA[row][col] = x;
col++;
}
row++;
}
// input file B to arrayB
row = 0;
while (getline(input_fileB, line))
{
istringstream streamB(line);
col = 0;
while(streamB >> x){
arrayB[row][col] = x;
col++;
}
row++;
}
// put Xs of points in X vectors and Ys of points in Y vectors
// input file A
for (int i = 0; i<number_of_points; i++){
X1[i] = arrayA[i][1];
Y1[i] = arrayA[i][2];
}
// input file B
for (int i = 0; i<number_of_points; i++){
X2[i] = arrayB[i][1];
Y2[i] = arrayB[i][2];
}
// int i;
// const int LIST_SIZE = 50;
// int *A = (int*)malloc(sizeof(int)*number_of_points);
// int *B = (int*)malloc(sizeof(int)*number_of_points);
// for(i = 0; i < number_of_points; i++) {
// A[i] = X1[i];
// B[i] = X2[i];
// }
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("vector_add_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_ALL, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context =
clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue =
clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem x1_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
number_of_points * sizeof(float), NULL, &ret);
cl_mem x2_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
number_of_points * sizeof(float), NULL, &ret);
cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
number_of_points * sizeof(float), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, x1_mem_obj, CL_TRUE, 0,
number_of_points * sizeof(float), X1, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, x2_mem_obj, CL_TRUE, 0,
number_of_points * sizeof(float), X2, 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&x1_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&x2_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
// Execute the OpenCL kernel on the list
size_t global_item_size = number_of_points; // Process the entire lists
size_t local_item_size = 64; // Process in groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer C on the device to the local variable C
// int *C = (int*)malloc(sizeof(int)*number_of_points);
float *C = (float*)malloc(sizeof(float)*number_of_points);
ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
number_of_points * sizeof(float), C, 0, NULL, NULL);
// Display the result to the screen
for(i = 0; i < number_of_points; i++)
printf("%f + %f = %f\n", X1[i], X2[i], C[i]);
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(x1_mem_obj);
ret = clReleaseMemObject(x2_mem_obj);
ret = clReleaseMemObject(c_mem_obj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(X1);
free(X2);
free(C);
printf("ALL Time taken: %.2fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);
return 0;
}
and the kernel file
__kernel void vector_add(__global float *X1,
__global float *X2,
__global float *C) {
// Get the index of the current element
int i = get_global_id(0);
// Do the operation
C[i] = X1[i] + X2[i];
}
The result was:
0.000000 + 0.000000 = 0.000000
1.000000 + 1.000000 = 0.000000
2.000000 + 2.000000 = 0.000000
3.000000 + 3.000000 = 0.000000
4.000000 + 4.000000 = 0.000000
5.000000 + 5.000000 = 0.000000
6.000000 + 6.000000 = 0.000000
7.000000 + 7.000000 = 0.000000
8.000000 + 8.000000 = 0.000000
9.000000 + 9.000000 = 0.000000
1.000000 + 1.000000 = 0.000000
ALL Time taken: 0.07s
You've committed one of the cardinal sins of OpenCL programming, in that you are not checking the error codes from any of your OpenCL API calls! You should always check the return code from every single OpenCL API call. If you did this, it would point you towards the problem very quickly.
The problem is in your kernel enqueue call. If you check the error code, you'll see that you are getting -54 back, which corresponds to CL_INVALID_WORK_GROUP_SIZE. Specifically, kernel invocations have the requirement that the work-group size (local size) exactly divides the global size. You are asking for a work-group size of 64 and a global size of 11, which does not fulfil this requirement.
You can also pass NULL as the work-group size parameter, and the OpenCL implementation will pick a work-group size that will definitely work on your behalf.
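For illustration, a small check helper (the checkError name is made up for this sketch, not part of the posted program) wrapped around the enqueue call would have surfaced the error immediately:
void checkError( cl_int err, const char *what )
{
    if ( err != CL_SUCCESS ) {
        fprintf( stderr, "%s failed with error %d\n", what, err );
        exit( 1 );
    }
}
// ... in main, after the existing enqueue call:
ret = clEnqueueNDRangeKernel( command_queue, kernel, 1, NULL,
                              &global_item_size, &local_item_size, 0, NULL, NULL );
checkError( ret, "clEnqueueNDRangeKernel" ); // would report -54 (CL_INVALID_WORK_GROUP_SIZE) here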

The ".text" section accessed in memory and loaded from disc differs for GCC compiled application

The content of the '.text' section is accessed using code like this:
1) For the application which is loaded into memory (i.e. executing):
//accessing code in memory
PIMAGE_DOS_HEADER pDOSHeader = NULL;
pDOSHeader = static_cast<PIMAGE_DOS_HEADER>( (void*)hModule);
...
PIMAGE_NT_HEADERS pNTHeader = reinterpret_cast<PIMAGE_NT_HEADERS>((byte*)hModule + pDOSHeader->e_lfanew );
...
PIMAGE_FILE_HEADER pFileHeader = reinterpret_cast<PIMAGE_FILE_HEADER>((byte*)&pNTHeader->FileHeader );
...
PIMAGE_OPTIONAL_HEADER pOptionalHeader =
reinterpret_cast<PIMAGE_OPTIONAL_HEADER>((byte*)&pNTHeader->OptionalHeader );
...
PIMAGE_SECTION_HEADER pSectionHeader = reinterpret_cast<PIMAGE_SECTION_HEADER>(
(byte*)&pNTHeader->OptionalHeader +
pNTHeader->FileHeader.SizeOfOptionalHeader );
//so iterate headers and select one with right name
const char TEXT[] = ".text";
const char BSSTEXT[] = ".textbss";
unsigned int nSectionCount = pNTHeader->FileHeader.NumberOfSections;
char szSectionName[ IMAGE_SIZEOF_SHORT_NAME + 1 ];
szSectionName[ IMAGE_SIZEOF_SHORT_NAME ] = '\0';
for( unsigned int i = 0; i < nSectionCount; i++ )
{
memcpy( szSectionName, pSectionHeader->Name,
IMAGE_SIZEOF_SHORT_NAME );
if( 0 == strncmp( TEXT, szSectionName,
IMAGE_SIZEOF_SHORT_NAME ) )
{
break;
}
pSectionHeader++;
}
pVirtualAddress = (void*)(pSectionHeader->VirtualAddress);
dwCodeSize = pSectionHeader->Misc.VirtualSize;
//seems reasonable: To calculate the real starting address of a given section in memory,
//add the base address of the image to the section's VirtualAddress stored in this field.
pCodeStart = (void*)(((byte*)hModule) +(size_t)((byte*)pVirtualAddress) );
pCodeEnd = (void*)((byte*)pCodeStart + dwCodeSize);
2) For the application file read from hdd and mapped into memory:
//loading code from file and mapping
hFile = CreateFile( filename, FILE_READ_DATA, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0);
...
hFileMapping = CreateFileMapping( hFile, NULL, PAGE_READONLY, 0, 0, NULL );
...
pBaseAddress = MapViewOfFile( hFileMapping, FILE_MAP_READ, 0, 0, 0 );
...
PIMAGE_DOS_HEADER pDOSHeader = static_cast<PIMAGE_DOS_HEADER>( pBaseAddress);
...
PIMAGE_NT_HEADERS pNTHeader = reinterpret_cast<PIMAGE_NT_HEADERS>(
(PBYTE)_pBaseAddress() + pDOSHeader->e_lfanew );
...
PIMAGE_FILE_HEADER pFileHeader = reinterpret_cast<PIMAGE_FILE_HEADER>(
(PBYTE)&pNTHeader->FileHeader );
...
PIMAGE_OPTIONAL_HEADER pOptionalHeader =
reinterpret_cast<PIMAGE_OPTIONAL_HEADER>(
(PBYTE)&pNTHeader->OptionalHeader );
PIMAGE_SECTION_HEADER pSectionHeader =
reinterpret_cast<PIMAGE_SECTION_HEADER>(
(PBYTE)&pNTHeader->OptionalHeader +
pNTHeader->FileHeader.SizeOfOptionalHeader );
DWORD dwEntryPoint = pNTHeader->OptionalHeader.AddressOfEntryPoint;
UINT nSectionCount = pNTHeader->FileHeader.NumberOfSections;
const char TEXT[] = ".text";
const char BSSTEXT[] = ".textbss";
char szSectionName[ IMAGE_SIZEOF_SHORT_NAME + 1 ];
szSectionName[ IMAGE_SIZEOF_SHORT_NAME ] = '\0';
for( unsigned int i = 0; i < nSectionCount; i++ )
{
memcpy( szSectionName, pSectionHeader->Name,
IMAGE_SIZEOF_SHORT_NAME );
if( 0 == strncmp( TEXT, szSectionName,
IMAGE_SIZEOF_SHORT_NAME ) )
{
break;
}
pSectionHeader++;
}
// Use this when probing On Disk. It is where things
// are on disk - not where they will be in memory
dwRawData = pSectionHeader->PointerToRawData;
// Use this when probing On Disk. It is where things
// are on disk - not where they will be in memory
pCodeStart = (void*)((byte*)pBaseAddress +
pSectionHeader->PointerToRawData );
pEntryPoint = (void*)(((byte*)pBaseAddress) + dwEntryPoint);
dwCodeSize = pSectionHeader->Misc.VirtualSize;
pCodeEnd = (void*)((byte*)pCodeStart + pSectionHeader->Misc.VirtualSize );
If the application is built with Visual Studio, all the bytes between pCodeStart and pCodeEnd match in both cases.
But if the application is built with GCC (MinGW), the bytes immediately following pCodeStart and those just before pCodeEnd are the same, yet somewhere in the middle some differing bytes appear.
Why does this happen?

CUDA more than max threads without errors?

The original problem was launching more threads than is possible, like this:
someKernel<<<1 , 1025>>> ( ... );
and not detecting the error, as I did not know how to detect kernel call errors. This is explained well in talonmies' answer to this question:
What is the canonical way to check for errors using the CUDA runtime API?
Instead of modifying the code I originally presented, I wrote my own for conciseness:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t cudaError, char *file, int line, bool abort=true)
{
if (cudaError != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(cudaError), file, line);
}
}
__global__ void addKernel(const int *dev_a, const int *dev_b, int *dev_c)
{
int i = threadIdx.x;
if ( i < 5 )
dev_c[i] = dev_a[i] + dev_b[i];
}
int main()
{
const int arraySize = 5;
const int a[arraySize] = { 1, 2, 3, 4, 5 };
const int b[arraySize] = { 10, 20, 30, 40, 50 };
int c[arraySize] = { 0 };
int *dev_a(nullptr), *dev_b(nullptr), *dev_c(nullptr);
gpuErrchk( cudaMalloc((void**)&dev_a, arraySize * sizeof(int)) );
gpuErrchk( cudaMalloc((void**)&dev_b, arraySize * sizeof(int)) );
gpuErrchk( cudaMalloc((void**)&dev_c, arraySize * sizeof(int)) );
gpuErrchk( cudaMemcpy(dev_a, a, arraySize * sizeof(int), cudaMemcpyHostToDevice) );
gpuErrchk( cudaMemcpy(dev_b, b, arraySize * sizeof(int), cudaMemcpyHostToDevice) );
const int testMax1D = 1025;
dim3 testMax2D ( 32, 33 );
addKernel<<<1, testMax2D>>> ( dev_a , dev_b, dev_c );
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
gpuErrchk( cudaMemcpy( c, dev_c, arraySize * sizeof(int), cudaMemcpyDeviceToHost) );
printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
c[0], c[1], c[2], c[3], c[4]);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return 0;
}
I now get correct error reports. Thank you for your patience.
I don't understand this call in the gpuAssert function, so I omitted it:
if (abort) exit(code);
Is exit a custom written function or something I missed?
There are two classes of errors that can occur with kernel launches and they need to be checked for in separate steps, following a particular order.
The first class of errors is reported synchronously when a kernel call is made and prior to the kernel actually being launched on the device, i.e. these are "pre-launch" errors. These errors typically involve requesting more of a particular resource than is available (e.g. too much shared memory, too many threads). Check for these by calling cudaGetLastError() immediately after a kernel call.
The second class of errors are those that occur at some point in time after the kernel was launched on the device (e.g. memory access violation, timeout of watchdog timer). These are "post-launch" errors. The reason they are reported some time after a kernel call is a natural consequence of kernel launches occurring asynchronously. They are reported at the next opportunity, which is usually the next synchronous API call. Check for these by calling cudaDeviceSynchronize() and examining its status return.
The posted code is missing a check for errors of the first class.
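For illustration, the two checks in that order, reusing the question's gpuErrchk macro and kernel, would look like this:
addKernel<<<1, testMax2D>>>( dev_a, dev_b, dev_c );
// First class: pre-launch errors, reported synchronously (e.g. invalid launch configuration).
gpuErrchk( cudaGetLastError() );
// Second class: post-launch errors, reported once the kernel has actually executed.
gpuErrchk( cudaDeviceSynchronize() );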

opencl program build failed

The following is the code I wrote for adding two 2D arrays. It compiles, but when I try to run it, it shows "error: failed to build program" and "runtime 0.0000".
Why is it that the program isn't built?
And also, why is the build log that I have queried not getting displayed?
Since I am just initialising the arrays, I have stored directly into a 1D array and not shown the conversion from 2D to 1D.
code:
# include <stdio.h>
#include <stdlib.h>
#ifdef APPLE
#include<OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define order 1000
#define MAX_SOURCE_SIZE (0x100000)
int main(int argc, char **argv)
{
float *A;
float *B;
float *C;
int n,m,p;
int err;
int szA, szB,szC;
cl_device_id device_id;
cl_context context;
cl_command_queue commands;
cl_program program;
cl_kernel kernel;
cl_uint nd;
cl_mem a_in;
cl_mem b_in;
cl_mem c_out;
int i,j;
n=order;
m=order;
p=order;
size_t global[2];
nd=1;
cl_uint numPlatforms;
cl_platform_id firstPlatformId;
szA=n*p;
szB=p*m;
szC=n*m;
A=(float *)malloc(sizeof(float)*szA);
B=(float *)malloc(sizeof(float)*szB);
C=(float *)malloc(sizeof(float)*szC);
for(i=0; i<order; i++)
for(j=0; j<order; j++)
A[i*m+j]=i;
B[i*m+j]=i;
FILE *fp;
char fileName[] = "./array_add_kernel.cl";
char *source_str;
size_t source_size;
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
err=clGetPlatformIDs(1, &firstPlatformId, &numPlatforms);
err=clGetDeviceIDs(firstPlatformId, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
cl_context_properties conpro[]={ CL_CONTEXT_PLATFORM,(cl_context_properties) firstPlatformId, 0};
context=clCreateContext(conpro, 1, &device_id, NULL, NULL, &err);
commands=clCreateCommandQueue(context, device_id,CL_QUEUE_PROFILING_ENABLE, &err);
a_in= clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*szA, NULL, NULL);
b_in= clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*szB, NULL, NULL);
c_out= clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float)*szC, NULL, NULL);
program= clCreateProgramWithSource(context, 1, (const char**)&source_str,(const size_t *)&source_size, &err);
err= clBuildProgram(program,0, NULL, NULL, NULL, NULL );
if(err!= CL_SUCCESS)
{
size_t len;
char buffer[2048];
printf("Error:Failed to build program executable!");
clGetProgramBuildInfo(program,device_id,CL_PROGRAM_BUILD_LOG,sizeof(buffer),buffer,&len);
printf("%s \n",buffer);
}
kernel= clCreateKernel(program, "array_add_kernel", &err);
err= 0;
err= clSetKernelArg(kernel, 0, sizeof(int), &n);
err|= clSetKernelArg(kernel, 1, sizeof(int), &p);
err|= clSetKernelArg(kernel, 2, sizeof(int), &m);
err|= clSetKernelArg(kernel, 3, sizeof(cl_mem), &a_in);
err|= clSetKernelArg(kernel, 4, sizeof(cl_mem), &b_in);
err|= clSetKernelArg(kernel, 5, sizeof(cl_mem), &c_out);
err=clEnqueueWriteBuffer(commands, a_in, CL_TRUE, 0, sizeof(float)*szA, A, 0, NULL, NULL);
err= clEnqueueWriteBuffer(commands, a_in, CL_TRUE, 0, sizeof(float)*szB, B, 0, NULL, NULL);
cl_event prof_event;
global[0]= (size_t)n;
global[1]=(size_t)m;
err=clEnqueueNDRangeKernel(commands, kernel, nd, NULL, global, NULL, 0, NULL, &prof_event);
clFinish(commands);
cl_ulong ev_start_time=(cl_ulong)0;
cl_ulong ev_end_time=(cl_ulong)0;
size_t ret_size;
err= clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ev_start_time, NULL);
err= clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ev_end_time, NULL);
err=clEnqueueReadBuffer(commands,c_out,CL_TRUE,0,sizeof(float)*szC,C,0,NULL,NULL);
cl_float runtime=(ev_end_time-ev_start_time)*1.0e-9;
printf("Runtime:%f ",runtime);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseMemObject(a_in);
clReleaseMemObject(b_in);
clReleaseMemObject(c_out);
clReleaseCommandQueue(commands);
clReleaseContext(context);
}
kernel:
kernel void array_add_kernel(
const int n, const int m, const p, _global const float * A, _global const float * B, , _global float * C )
{
int i= get_global_id(0);
int j= get_global_id(1);
C[i*m + j] = A[i*m + j] + B[i*m + j];
}
Fix your kernel. It's filled with errors.
kernel void array_add_kernel(
const int n,
const int m,
const p, // No type specifier
_global const float * A, // Should be global, not _global
_global const float * B, , // Double comma
_global float * C )
{
int i= get_global_id(0);
int j= get_global_id(1);
C[i*m + j] = A[i*m + j] + B[i*m + j];
}
This is the working kernel.
kernel void array_add_kernel(const int n, const int m, global const float * A, global const float * B, global float * C )
{
int i= get_global_id(0);
int j= get_global_id(1);
C[i*m + j] = A[i*m + j] + B[i*m + j];
}
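Note that with the p argument removed, the host-side clSetKernelArg indices have to shift as well; roughly (a sketch based on the question's own variables):
err  = clSetKernelArg( kernel, 0, sizeof(int), &n );
err |= clSetKernelArg( kernel, 1, sizeof(int), &m );
err |= clSetKernelArg( kernel, 2, sizeof(cl_mem), &a_in );
err |= clSetKernelArg( kernel, 3, sizeof(cl_mem), &b_in );
err |= clSetKernelArg( kernel, 4, sizeof(cl_mem), &c_out );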
