I tried to build the HelloWorld OpenCL example from the OpenCL Mac Programming Guide:
http://developer.apple.com/library/mac/documentation/Performance/Conceptual/OpenCL_MacProgGuide/OpenCL_MacProgGuide.pdf
using Xcode 4.6 on an iMac (Late 2012, OS X 10.8.3) and on my MacBook (Early 2008, OS X 10.7.5), and I'm getting what seem to be linker errors:
https://www.dropbox.com/s/iih87g0495qn4c6/Screen%20Shot%202013-06-04%20at%203.35.55%20PM.png (I tried copy-pasting but the formatting looks horrendous.)
I followed all the instructions in the OpenCL Mac Programming Guide, dated 2012-07-23, but no cigar.
// mykernel.cl
kernel void square(global float* input, global float* output) {
    size_t i = get_global_id(0);
    output[i] = input[i] * input[i];
}
// main.c
// HelloWorld
#include <stdio.h>
#include <OpenCL/OpenCL.h>
#include "mykernel.cl.h"
static void print_device_info(cl_device_id device) {
    char name[128];
    char vendor[128];
    clGetDeviceInfo(device, CL_DEVICE_NAME, 128, name, NULL);
    clGetDeviceInfo(device, CL_DEVICE_VENDOR, 128, vendor, NULL);
    fprintf(stdout, "%s : %s\n", vendor, name);
}
However, the older, longer version of the OpenCL HelloWorld from Apple works correctly:
//
// File: hello.c
//
// Abstract: A simple "Hello World" compute example showing basic usage of OpenCL which
// calculates the mathematical square (X[i] = pow(X[i],2)) for a buffer of
// floating point values.
//
//
// Version: <1.0>
//
// Disclaimer: Apple's standard sample-code license and warranty disclaimer text omitted.
//
// Copyright (C) 2008 Apple Inc. All Rights Reserved.
//
////////////////////////////////////////////////////////////////////////////////
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <OpenCL/opencl.h>
////////////////////////////////////////////////////////////////////////////////
// Use a static data size for simplicity
//
#define DATA_SIZE (1024)
////////////////////////////////////////////////////////////////////////////////
// Simple compute kernel which computes the square of an input array
//
const char *KernelSource = "\n" \
"__kernel void square( \n" \
" __global float* input, \n" \
" __global float* output, \n" \
" const unsigned int count) \n" \
"{ \n" \
" int i = get_global_id(0); \n" \
" if(i < count) \n" \
" output[i] = input[i] * input[i]; \n" \
"} \n" \
"\n";
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
int err; // error code returned from api calls
float data[DATA_SIZE]; // original data set given to device
float results[DATA_SIZE]; // results returned from device
unsigned int correct; // number of correct results returned
size_t global; // global domain size for our calculation
size_t local; // local domain size for our calculation
cl_device_id device_id; // compute device id
cl_context context; // compute context
cl_command_queue commands; // compute command queue
cl_program program; // compute program
cl_kernel kernel; // compute kernel
cl_mem input; // device memory used for the input array
cl_mem output; // device memory used for the output array
// Fill our data set with random float values
//
int i = 0;
unsigned int count = DATA_SIZE;
for(i = 0; i < count; i++)
data[i] = rand() / (float)RAND_MAX;
// Connect to a compute device
//
int gpu = 1;
err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to create a device group!\n");
return EXIT_FAILURE;
}
// Create a compute context
//
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if (!context)
{
printf("Error: Failed to create a compute context!\n");
return EXIT_FAILURE;
}
// Create a command queue
//
commands = clCreateCommandQueue(context, device_id, 0, &err);
if (!commands)
{
printf("Error: Failed to create a command commands!\n");
return EXIT_FAILURE;
}
// Create the compute program from the source buffer
//
program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
if (!program)
{
printf("Error: Failed to create compute program!\n");
return EXIT_FAILURE;
}
// Build the program executable
//
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
size_t len;
char buffer[2048];
printf("Error: Failed to build program executable!\n");
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
exit(1);
}
// Create the compute kernel in the program we wish to run
//
kernel = clCreateKernel(program, "square", &err);
if (!kernel || err != CL_SUCCESS)
{
printf("Error: Failed to create compute kernel!\n");
exit(1);
}
// Create the input and output arrays in device memory for our calculation
//
input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
if (!input || !output)
{
printf("Error: Failed to allocate device memory!\n");
exit(1);
}
// Write our data set into the input array in device memory
//
err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to write to source array!\n");
exit(1);
}
// Set the arguments to our compute kernel
//
err = 0;
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
if (err != CL_SUCCESS)
{
printf("Error: Failed to set kernel arguments! %d\n", err);
exit(1);
}
// Get the maximum work group size for executing the kernel on the device
//
err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to retrieve kernel work group info! %d\n", err);
exit(1);
}
// Execute the kernel over the entire range of our 1d input data set
// using the maximum number of work group items for this device
//
global = count;
err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
if (err)
{
printf("Error: Failed to execute kernel!\n");
return EXIT_FAILURE;
}
// Wait for the command queue to be serviced before reading back results
//
clFinish(commands);
// Read back the results from the device to verify the output
//
err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );
if (err != CL_SUCCESS)
{
printf("Error: Failed to read output array! %d\n", err);
exit(1);
}
// Validate our results
//
correct = 0;
for(i = 0; i < count; i++)
{
if(results[i] == data[i] * data[i])
correct++;
}
// Print a brief summary detailing the results
//
printf("Computed '%d/%d' correct values!\n", correct, count);
// Shutdown and cleanup
//
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(commands);
clReleaseContext(context);
return 0;
}
I have not used OpenCL, but the linker error says it can't find a main() function in your program. Is your first code listing the complete program? If so, you need to add a main() function to the main.c file that calls your print_device_info() function.
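For what it's worth, a minimal sketch of such a main() (my assumption: you only want to exercise print_device_info() for now; on Apple's implementation clGetDeviceIDs accepts a NULL platform):
int main(int argc, const char *argv[]) {
    cl_device_id device;
    // Grab the default compute device and print its vendor/name.
    cl_int err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_DEFAULT, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        fprintf(stderr, "clGetDeviceIDs failed: %d\n", err);
        return 1;
    }
    print_device_info(device);
    return 0;
}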
I've been trying to profile OpenCL host code for FIR filtering on Mac, Ubuntu, and other platforms. My host code and kernel are below.
The issue is that irrespective of the number of samples I provide for the FIR filter, clEnqueueNDRangeKernel ends up taking the same amount of time. I've also profiled clEnqueueReadBuffer and clEnqueueWriteBuffer, and somehow they also end up taking the same amount of time. On Mac I'm profiling with mach as well as with OpenCL events; on Ubuntu, I'm profiling with PAPI. I'm unable to understand why this is happening; ideally, with an increase in the number of samples, clEnqueueReadBuffer and clEnqueueWriteBuffer should take more time, and so should kernel execution.
Kernel:
__kernel void fir4(
    __global float* input,
    __global float* output)
{
    int i = get_global_id(0);
    int j = 0;
    int coeff[4] = {5,7,5,7};
    /*for(j=0;j<4;j++)
    {
        output[i] += coeff[j]*(input[i+4-j-1]);
    }*/
    //unrolled
    output[i] += coeff[0]*(input[i+4-0-1]);
    output[i] += coeff[1]*(input[i+4-1-1]);
    output[i] += coeff[2]*(input[i+4-2-1]);
    output[i] += coeff[3]*(input[i+4-3-1]);
}
__kernel void fir8(
    __global float* input,
    __global float* output)
{
    int i = get_global_id(0);
    int j = 0;
    int coeff[8] = {5,7,5,7,5,7,5,7};
    for(j=0;j<8;j++)
    {
        output[i] += coeff[j]*(input[i+8-j-1]);
    }
}
__kernel void fir12(
    __global float* input,
    __global float* output)
{
    int i = get_global_id(0);
    int j = 0;
    int coeff[12] = {5,7,5,7,5,7,5,7,5,7,5,7};
    for(j=0;j<12;j++)
    {
        output[i] += coeff[j]*(input[i+12-j-1]);
    }
}
Host Code:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <mach/mach_time.h>
#include <OpenCL/opencl.h>
// Use a static data size for simplicity
//
#define DATA_SIZE (48000)
#define NUM_COEFF (4)
int main(int argc, char** argv)
{
uint64_t start;
uint64_t end;
uint64_t elapsed;
double elapsedmilli;
int err; // error code returned from api calls
float data[DATA_SIZE]; // original data set given to device
float coeff[NUM_COEFF];
float results_host[DATA_SIZE] = {};
float results[DATA_SIZE]; // results returned from device
unsigned int correct; // number of correct results returned
size_t global; // global domain size for our calculation
size_t local; // local domain size for our calculation
cl_event event; //Linking event to kernel for profiling
cl_platform_id platform_id = NULL; // compute device platform id
cl_device_id device_id; // compute device id
cl_context context; // compute context
cl_command_queue commands; // compute command queue
cl_program program; // compute program
cl_kernel kernel; // compute kernel
cl_mem input; // device memory used for the input array
cl_mem output; // device memory used for the output array
// Fill our data set with random float values
//
int i,j = 0;
unsigned int count = DATA_SIZE;
unsigned int taps = NUM_COEFF;
for(i = 0; i < count; i++)
data[i] = rand() / (float)RAND_MAX;
for(i=0; i < taps; i++)
{
if(!(i%2))
coeff[i] = 5;
else
coeff[i] = 7;
}
//Connect to a platform on device
err = clGetPlatformIDs(1, &platform_id, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to locate opencl platform!\n");
return EXIT_FAILURE;
}
// Connect to a compute device
//
int gpu = 0;
err = clGetDeviceIDs(platform_id, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to create a device group!\n");
return EXIT_FAILURE;
}
// Create a compute context
//
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if (!context)
{
printf("Error: Failed to create a compute context!\n");
return EXIT_FAILURE;
}
// Create a command queue
//
commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
if (!commands)
{
printf("Error: Failed to create a command commands!\n");
return EXIT_FAILURE;
}
//Use a function to load the kernel source from the .cl file in the same folder
//
char *KernelSource = load_program_source("fir.cl");
// Create the compute program from the source buffer
//
program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
if (!program)
{
printf("Error: Failed to create compute program!\n");
return EXIT_FAILURE;
}
// Build the program executable
//
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
size_t len;
char buffer[2048];
printf("Error: Failed to build program executable!\n");
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
exit(1);
}
// Create the compute kernel in the program we wish to run
//
switch(taps)
{
case(4):
{
kernel = clCreateKernel(program, "fir4", &err);
break;
}
case(8):
{
kernel = clCreateKernel(program, "fir8", &err);
break;
}
case(12):
{
kernel = clCreateKernel(program, "fir12", &err);
break;
}
default:
{
kernel = clCreateKernel(program, "fir4", &err);
break;
}
}
if (!kernel || err != CL_SUCCESS)
{
printf("Error: Failed to create compute kernel! - %d\n",err);
exit(1);
}
// Create the input and output arrays in device memory for our calculation
//
input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
if (!input || !output)
{
printf("Error: Failed to allocate device memory!\n");
exit(1);
}
// Write our data set into the input array in device memory
//
err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to write to source array!\n");
exit(1);
}
// Set the arguments to our compute kernel
//
err = 0;
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
if (err != CL_SUCCESS)
{
printf("Error: Failed to set kernel arguments! %d\n", err);
exit(1);
}
// Get the maximum work group size for executing the kernel on the device
//
err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to retrieve kernel work group info! %d\n", err);
exit(1);
}
// Execute the kernel over the entire range of our 1d input data set
// using the maximum number of work group items for this device
//
global = count;
local = 48;
start = mach_absolute_time();
err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, &event);
if (err)
{
printf("Error: Failed to execute kernel!-%d\n",err);
return EXIT_FAILURE;
}
// Wait for the command queue to be serviced before reading back results
//
clWaitForEvents(1, &event);
clFinish(commands);
end = mach_absolute_time();
cl_ulong time_start, time_end;
double total_time;
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
total_time = time_end - time_start;
printf("cl:main timing:opencl clEnqueueNDRangeKernel %0.3f us\n", total_time / 1000.0);
elapsed = end - start;
struct mach_timebase_info info;
mach_timebase_info(&info);
double t = 1e-9 * (elapsed) * info.numer / info.denom;
elapsedmilli = 1e-6 * (elapsed) * info.numer / info.denom;
printf("cl:main timing:MACH clEnqueueNDRangeKernel %f ms, %d elapsed\n",elapsedmilli,elapsed);
// Read back the results from the device to verify the output
//
err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );
if (err != CL_SUCCESS)
{
printf("Error: Failed to read output array! %d\n", err);
exit(1);
}
// Validate our results
//
correct = 0;
for(i=0; i<DATA_SIZE; i++)
{
for(j=0;j<NUM_COEFF;j++)
{
results_host[i]+=coeff[j]*(data[i+NUM_COEFF-j-1]);
}
//printf("Host Output[%d]-%f\n",i,results_host[i]);
}
for(i = 0; i < count; i++)
{
if(results[i] == results_host[i])
correct++;
//printf("CL Output[%d]-%f\n",i,results[i]);
}
// Print a brief summary detailing the results
//
printf("Computed '%d/%d' correct values! Samples-%d,Taps-%d\n", correct, count, DATA_SIZE, NUM_COEFF);
// Shutdown and cleanup
//
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(commands);
clReleaseContext(context);
return 0;
}
Adding just 10-20 multiplications and additions per work-item is not comparable to the kernel launch overhead. Try a 100- or 1000-wide coefficient array.
Using more input elements per work-item that way just increases the cache hit count (and ratio), because more work-items read from the same locations.
If DATA_SIZE were several million, the data could not all fit in cache, and execution would slow down linearly with its length. 48000 samples is less than 200 kB. An HD 5850, for example, has 512 kB of L2 cache (3x the bandwidth of memory) and 8 kB of L1 per compute unit (faster still).
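For example, a hypothetical fir100 variant (a sketch, not from the question; it keeps the alternating 5/7 coefficient pattern and, like the originals, elides bounds handling):
__kernel void fir100(
    __global float* input,
    __global float* output)
{
    int i = get_global_id(0);
    float acc = 0.0f;
    // 100 taps: enough arithmetic per work-item to dominate launch overhead
    for (int j = 0; j < 100; j++)
        acc += ((j % 2) ? 7.0f : 5.0f) * input[i + 100 - j - 1];
    output[i] = acc;
}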
I'm a beginner at OpenCL. I was trying to build a simple app which just adds 2 vectors to get the results. This is my host code:
#include <OpenCL/opencl.h>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>
#include <sys/time.h>
#define USE_PLATFORM 0
#define USE_DEVICE 2
#define DATA_SIZE 1024
#define USE_KERNEL_PATH "/Users/huangxin/Documents/August13Programming/FirstEGOpenCL/FirstEGOpenCL/kernel.cl"
using namespace std;
int main(int argc, const char * argv[]) {
int err;
cl_uint numPlatforms;
cl_uint numDevices;
cl_command_queue command;
size_t global;
//Query the number of platforms supported.
err = clGetPlatformIDs(0, NULL, &numPlatforms);
if (err != CL_SUCCESS || USE_PLATFORM >= numPlatforms)
{
printf("Error at: clGetPlatformIDs(querying platforms count failed):\n");
exit(-1);
}
//Get all platforms.
vector<cl_platform_id> platforms(numPlatforms);
err = clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms);
if (err != CL_SUCCESS)
{
printf("Error at: clGetPlatformIDs(getting all platforms failed):\n");
exit(-1);
}
//Query the number of devices supported by the specified platform.
err = clGetDeviceIDs(platforms[USE_PLATFORM], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
if (err != CL_SUCCESS || USE_DEVICE >= numDevices)
{
printf("Error at: clGetDeviceIDs(querying devices count failed):\n");
exit(-1);
}
//Get all devices.
vector<cl_device_id> devices(numDevices);
err=clGetDeviceIDs(platforms[USE_PLATFORM], CL_DEVICE_TYPE_ALL, numDevices, &devices[0], &numDevices);
if (err != CL_SUCCESS)
{
printf("Error at: clGetDeviceIDs(getting all devices failed):\n");
exit(-1);
}
//Get device information.
char deviceInfo[1024];
//get device max work item dimensions.
size_t maxItemSize[3];
clGetDeviceInfo(devices[USE_DEVICE], CL_DEVICE_NAME, sizeof(deviceInfo), deviceInfo, NULL);
clGetDeviceInfo(devices[USE_DEVICE], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, maxItemSize, NULL);
cout << "Device selected: " << deviceInfo << endl;
cout << "Max item size: " << maxItemSize[0] << "," << maxItemSize[1] << ","<< maxItemSize[2] << endl;
//Set property with certain platform
cl_context_properties prop[] = {CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platforms[USE_PLATFORM]), 0};
//create context with certain property.
cl_context context = clCreateContextFromType(prop, CL_DEVICE_TYPE_ALL, NULL, NULL, &err);
if (err != CL_SUCCESS)
{
printf("Error at: clCreateContextFromType(get context failed):\n");
exit(-1);
}
//create command queue using selected device and context.
command = clCreateCommandQueue(context, devices[USE_DEVICE], 0, NULL);
//create program with specified kernel source.
const char *kernelSource = getKernelSource(USE_KERNEL_PATH);
cl_program program = clCreateProgramWithSource(context, 1, &kernelSource, 0, &err);
if (err != CL_SUCCESS)
{
printf("Error at: clCreateProgramWithSource(get program failed):\n");
exit(-1);
}
//since OpenCL is a dynamic-compile architecture, we need to build the program.
err = clBuildProgram(program, 0, 0, 0, 0, 0);
if (err != CL_SUCCESS)
{
cout << err << endl;
size_t len;
char buffer[2048];
printf("Error: Failed to build program executable!\n");
clGetProgramBuildInfo(program, devices[USE_DEVICE], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
exit(1);
}
//A kernel is OpenCL's abstraction of the code and its arguments that execute on one smallest-granularity compute item.
//create the kernel function using the built program.
cl_kernel adder = clCreateKernel(program, "adder", &err);
if (err != CL_SUCCESS)
{
printf("Error at: clCreateKernel(get kernel function failed):\n");
exit(-1);
}
//create the vector of input random data.
vector<float> inA(DATA_SIZE), inB(DATA_SIZE);
for(int i = 0; i < DATA_SIZE; i++) {
inA[i] = (float)(random() % DATA_SIZE) / 1000;
inB[i] = (float)(random() % DATA_SIZE) / 1000;
}
//create the read-only device mem using specified context, that is to copy the host mem to the device mem.
cl_mem cl_a = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * DATA_SIZE, &inA[0], NULL);
cl_mem cl_b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * DATA_SIZE, &inB[0], NULL);
//create the result mem.
cl_mem cl_res = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * DATA_SIZE, NULL, NULL);
//setting up the kernel memory arguments
clSetKernelArg(adder, 0, sizeof(cl_mem), &cl_a);
clSetKernelArg(adder, 1, sizeof(cl_mem), &cl_b);
clSetKernelArg(adder, 2, sizeof(cl_mem), &cl_res);
START_CHECK_RUNNING_TIME
//enqueue the kernel into the specified command queue (#TODO: come back later to check the remaining arguments)
global = DATA_SIZE;
err = clEnqueueNDRangeKernel(command, adder, 1, 0, &global, 0, 0, 0, 0);
if (err != CL_SUCCESS)
{
printf("Error at: clEnqueueNDRangeKernel(enqueue kernel failed):\n");
exit(-1);
}
printf("*****************FLAG***************");
//copy the results from the kernel into the host(CPU).
vector<float> res(DATA_SIZE);
err = clEnqueueReadBuffer(command, cl_res, CL_TRUE, 0, sizeof(float) * DATA_SIZE, &res[0], 0, 0, 0);
END_CHECK_RUNNING_TIME
//check the number of correct results.
int cnt = 0;
for (int i = 0; i < res.size(); i++) {
cnt += (res[i] == inA[i] + inB[i] ? 1 : 0);
}
cout << "Computed " << res.size() << " values\n";
cout << "Correct values:(" << cnt << "/" << res.size() << "),correct rate:" << (float)cnt / res.size() * 100 << "%" << endl;
gettimeofday(&sTime, NULL);
for (int i = 0; i < res.size(); i++) {
for (int j = 0; j < 10000; j++)
res[i] = inA[i] + inB[i];
}
gettimeofday(&eTime, NULL);timeuse = 1000000 * ( eTime.tv_sec - sTime.tv_sec ) + eTime.tv_usec -sTime.tv_usec; printf("Running time: %fs\n", (double)timeuse/(1000000));
//cleaning up the variables.
clReleaseKernel(adder);
clReleaseProgram(program);
clReleaseMemObject(cl_a);
clReleaseMemObject(cl_b);
clReleaseMemObject(cl_res);
clReleaseCommandQueue(command);
clReleaseContext(context);
return 0;
}
The code is a bit long, but it's really doing simple stuff. This is my kernel code:
kernel void adder(global const float* a, global const float* b, global float* result)
{
    size_t idx = get_global_id(0);
    for (int i = 0; i < 10000; i++)
        result[idx] = a[idx] + b[idx];
}
And I got the following result:
Device selected: GeForce GT 650M
-11
Error: Failed to build program executable!
No kernels or only kernel prototypes found.
I don't quite understand what "No kernels or only kernel prototypes found." means, and it's really strange that if I use the first device (CPU) or my second device (HD Graphics 4000), the same code runs perfectly.
I want to know what is wrong and why it happens.
I was running this code in Xcode on Mac OS X 10.10.
As the comments say, it is good practice to use:
__kernel void adder(__global const float* a, __global const float* b, __global float* result)
That way you clearly mark them as special CL qualifiers. Typically all CL kernels follow that rule, even if the spec allows both forms.
But your problem is probably due to running clBuildProgram() without any device in the device list, and therefore not compiling anything at all!
In CL every device has a specific compiler (CPUs don't have the same compiler as GPUs, sometimes not even the same instruction set), so you should give the API the list of devices for which the kernels have to be compiled.
The proper way would be this:
err = clBuildProgram(program, 1, &devices[USE_DEVICE], "", 0, 0);
Note: I added "", because you will probably want to add some build parameters in the future; better to have it ready :)
The original problem was launching more threads than is possible, like this:
someKernel<<<1, 1025>>>( ... );
and not detecting the error, as I did not know how to detect kernel call errors. This is explained well in talonmies' answer to this question:
What is the canonical way to check for errors using the CUDA runtime API?
Instead of modifying the code I presented, I wrote my own for conciseness:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t cudaError, const char *file, int line, bool abort=true)
{
    if (cudaError != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(cudaError), file, line);
    }
}
__global__ void addKernel(const int *dev_a, const int *dev_b, int *dev_c)
{
    int i = threadIdx.x;
    if ( i < 5 )
        dev_c[i] = dev_a[i] + dev_b[i];
}
int main()
{
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };
    int *dev_a(nullptr), *dev_b(nullptr), *dev_c(nullptr);
    gpuErrchk( cudaMalloc((void**)&dev_a, arraySize * sizeof(int)) );
    gpuErrchk( cudaMalloc((void**)&dev_b, arraySize * sizeof(int)) );
    gpuErrchk( cudaMalloc((void**)&dev_c, arraySize * sizeof(int)) );
    gpuErrchk( cudaMemcpy(dev_a, a, arraySize * sizeof(int), cudaMemcpyHostToDevice) );
    gpuErrchk( cudaMemcpy(dev_b, b, arraySize * sizeof(int), cudaMemcpyHostToDevice) );
    const int testMax1D = 1025;
    dim3 testMax2D ( 32, 33 );
    addKernel<<<1, testMax2D>>> ( dev_a , dev_b, dev_c );
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
    gpuErrchk( cudaMemcpy( c, dev_c, arraySize * sizeof(int), cudaMemcpyDeviceToHost) );
    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
        c[0], c[1], c[2], c[3], c[4]);
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return 0;
}
I now get correct error reports. Thank you for your patience.
I don't understand this call in the gpuAssert function, so I omitted it:
if (abort) exit(code);
Is exit a custom-written function, or something I missed?
There are two classes of errors that can occur with kernel launches and they need to be checked for in separate steps, following a particular order.
The first class of errors is reported synchronously when a kernel call is made and prior to the kernel actually being launched on the device, i.e. these are "pre-launch" errors. These errors typically involve requesting more of a particular resource than is available (e.g. too much shared memory, too many threads). Check for these by calling cudaGetLastError() immediately after a kernel call.
The second class of errors are those that occur at some point in time after the kernel was launched on the device (e.g. memory access violation, timeout of the watchdog timer). These are "post-launch" errors. The reason they are reported some time after a kernel call is a natural consequence of kernel launches occurring asynchronously. They are reported at the next opportunity, which is usually the next synchronous API call. Check for these by calling cudaDeviceSynchronize() and examining its status return.
The posted code is missing a check for errors of the first class.
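A minimal sketch of the two-step check (someKernel and its launch configuration stand in for your own):
someKernel<<<grid, block>>>(/* args */);
cudaError_t launchErr = cudaGetLastError();    // first class: pre-launch errors
if (launchErr != cudaSuccess)
    fprintf(stderr, "launch failed: %s\n", cudaGetErrorString(launchErr));
cudaError_t syncErr = cudaDeviceSynchronize(); // second class: post-launch errors
if (syncErr != cudaSuccess)
    fprintf(stderr, "execution failed: %s\n", cudaGetErrorString(syncErr));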
I'm attempting a basic matrix multiplication program in OpenCL. I believe my issues are in my enqueue and/or buffer reading, as I am getting completely incorrect output for the result matrix, as well as incorrect first rows for matrices A and B. I'm new to OpenCL and I've been banging my head against this for quite a while now, maybe someone here can give me a hint as to where I'm going wrong?
Host Code:
#define __NO_STD_VECTOR // Uses cl::vector instead of standard version
#include <CL/cl.hpp>
#include <stdlib.h>
#include <stdio.h>
#include <fstream>
#include <iostream>
#include <math.h>
#include <string>
/* Defined matrix width/height constants */
#define numRowsA 3
#define numColsA 3
#define numRowsB 3
#define numColsB 3
#define numRowsC numRowsA
#define numColsC numColsB
using namespace std;
/* Function declarations */
inline void checkErr(cl_int err, string name);
void initMatrix (float* matrix, int numIndices);
void printMatrix (string displayName, float* matrix, int numIndices,
int rowSize);
//*************
// Main Program
//*************
int main(int argc, char* argv[]) {
/* Check for valid matrix sizes */
if (numColsA != numRowsB) {
cout << "ERROR: Invalid matrix dimensions." << endl;
} else {
srand(2013); // Set random seed
/* Allocate memory for matrices A, B, and C */
unsigned int sizeA = numRowsA * numColsA;
unsigned int sizeB = numRowsB * numColsB;
unsigned int sizeC = numRowsC * numColsC;
unsigned int memoryA = sizeof(float) * sizeA;
unsigned int memoryB = sizeof(float) * sizeB;
unsigned int memoryC = sizeof(float) * sizeC;
/*
Allocate memoryA/memoryB/memoryC size blocks of bytes
(cast from void*)
*/
float* blockA = (float*) malloc(memoryA);
float* blockB = (float*) malloc(memoryB);
float* blockC = (float*) malloc(memoryC);
/* Initialize matrices A and B */
initMatrix(blockA, sizeA);
initMatrix(blockB, sizeB);
/* Display matrices A and B */
printMatrix("Matrix A", blockA, sizeA, numColsA);
printMatrix("Matrix B", blockB, sizeB, numColsB);
cl_int err; // Error code
string platformVendor; // Platform vendor
/* Create list of platforms */
cl::vector < cl::Platform > platformList;
cl::Platform::get(&platformList);
/*
Display potential Platform list generation error. If the
platform list size does not equal 0, CL_SUCCESS (0) is
sent to the function. If the platform list size does
equal 0, -1 is sent to the function.
*/
checkErr(platformList.size()!=0 ? CL_SUCCESS : -1,
"Platform");
/*
Replace empty value of platformVendor with device vendor
name
*/
platformList[0].getInfo((cl_platform_info) CL_PLATFORM_VENDOR,
&platformVendor);
/* Properties for Context constructor (Use unknown) */
cl_context_properties cprops[3] =
{
CL_CONTEXT_PLATFORM,
(cl_context_properties) (platformList[0]) (),
0
};
/* Create context */
cl::Context context(CL_DEVICE_TYPE_GPU, cprops, NULL, NULL,
&err);
/* Display potential Context constructor error */
checkErr(err, "Context");
/* Create buffer for matrix A */
cl::Buffer deviceMemA(context,
CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeA, blockA, &err);
/* Create buffer for matrix B */
cl::Buffer deviceMemB(context,
CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeB, blockB, &err);
/* Create buffer for matrix C */
cl::Buffer deviceMemC(context,
CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeC, blockC, &err);
/* Create buffer for row (A) and col (C) */
cl::Buffer rowA(context, CL_MEM_READ_ONLY, sizeof(int),
(void *) numRowsA, &err);
cl::Buffer colC(context, CL_MEM_READ_ONLY, sizeof(int),
(void *) numColsC, &err);
/* Display potential Buffer constructor error */
checkErr(err, "Buffers");
/* Get list of devices */
cl::vector<cl::Device> devices =
context.getInfo<CL_CONTEXT_DEVICES>();
/* Check for at least one device, if not throw error */
checkErr(devices.size() > 0 ? CL_SUCCESS : -1, "No Devices");
/* Read input from .cl file */
ifstream file("matrixMult1_kernels.cl");
/* Check for potential problem opening .cl input file */
checkErr(file.is_open() ? CL_SUCCESS:-1, "File Not Open");
/* Store file contents in a string */
string prog(istreambuf_iterator<char>(file),
(istreambuf_iterator<char>()));
/* Create source object */
cl::Program::Sources source(1, make_pair(prog.c_str(),
prog.length()+1));
/* Create program for given context and source */
cl::Program program(context, source);
err = program.build(devices, ""); // Check for build error
/* Display potential program build error */
checkErr(err, "Program Build");
/* Create kernel */
cl::Kernel kernel(program, "matrixMul", &err);
/* Display potential Kernel constructor error */
checkErr(err, "Kernel");
/*
Set matrixMul arguments, error checking after each
argument
*/
err = kernel.setArg(0, deviceMemA);
checkErr(err, "Arg0");
err = kernel.setArg(1, deviceMemB);
checkErr(err, "Arg1");
err = kernel.setArg(2, deviceMemC);
checkErr(err, "Arg2");
err = kernel.setArg(3, rowA);
checkErr(err, "Arg3");
err = kernel.setArg(4, colC);
checkErr(err, "Arg4");
/* Create command queue */
cl::CommandQueue queue(context, devices[0], 0, &err);
/* Display potential CommandQueue constructor error */
checkErr(err, "Command Queue");
/* Create event object */
cl::Event event;
cl::NDRange global(3, 3);
cl::NDRange local(1, 1);
/* Enqueue the kernel */
err = queue.enqueueNDRangeKernel(kernel, 2, global, local,
NULL, &event);
/* Display potential enqueueing error */
checkErr(err, "Enqueue");
/* Wait until kernel has completed execution before continuing */
event.wait();
/* Read kernel result back into host memory */
err = queue.enqueueReadBuffer(deviceMemC, CL_TRUE, 0, memoryC,
blockC, NULL, &event);
checkErr(err, "C");
err = queue.enqueueReadBuffer(deviceMemA, CL_TRUE, 0, sizeA,
blockA, NULL, &event);
err = queue.enqueueReadBuffer(deviceMemB, CL_TRUE, 0, sizeB,
blockB, NULL, &event);
/* Display potential kernel read error */
checkErr(err, "Read Buffer");
/* Display matrices */
cout << endl;
cout << "After:" << endl;
printMatrix("Matrix A", blockA, sizeA, numColsA);
printMatrix("Matrix B", blockB, sizeB, numColsB);
printMatrix("Matrix C", blockC, sizeC, numColsC);
/* Free up memory */
free(blockA);
free(blockB);
free(blockC);
}
}
//--------------------------------------------------------------------
// checkErr - Inline error checking function for OpenCL portion of
// host program.
//
// PRE: err is of type int in OpenCL; name is a string.
// POST: The program is terminated after displaying an error message
// indicating the location of the error and the error code.
//--------------------------------------------------------------------
inline void checkErr(cl_int err, string name) {
/* Check error code against OpenCL success constant */
if (err != CL_SUCCESS) {
/*
Display an error message stating the error origin and
error number.
*/
std::cerr << "ERROR: " << name << " (" << err << ")"
<< std::endl;
exit(EXIT_FAILURE); // Terminates process with a failure status code
}
}
//--------------------------------------------------------------------
// initMatrix - Assigns a random float value to each index of the
// matrix.
//
// PRE: matrix is a pointer to a block of bytes in memory; numIndices
// is the number of indices in the matrix being instantiated.
// POST: Each index of the matrix has been instantiated with a random
// float value.
//--------------------------------------------------------------------
void initMatrix (float* matrix, int numIndices) {
/*
Loop through the block of bytes, assigning a random float
for each index of the matrix
*/
for (int i = 0; i < numIndices; i++) {
/* Assign a random float between 0 and 1 at this index */
matrix[i] = rand() / (float) RAND_MAX;
}
}
//--------------------------------------------------------------------
// printMatrix - Outputs a readable version of the matrix.
//
// PRE: displayName is a string; matrix is a pointer to a block of
// bytes in memory; numIndices is an integer indicating the number
// of indices in the matrix being displayed (read left-to-right,
// top-to-bottom); rowSize is an integer indicating the number
// of elements in one row of the matrix.
// POST: A readable version of the matrix is displayed.
//--------------------------------------------------------------------
void printMatrix (string displayName, float* matrix, int numIndices,
int rowSize) {
/* Output display name of matrix */
cout << "\n" << displayName << ":" << endl;
/* Loop through each index of the matrix */
for (int i = 0; i < numIndices; i++) {
cout << matrix[i]; // Display value at this index
/* Check for next row of the matrix */
if (((i + 1) % rowSize) == 0) {
cout << endl; // Line break
} else {
cout << " | "; // Indice separator
}
}
}
Kernel:
// matrixMult1_kernels.cl
// Multiply two matrices A * B = C
// Device code.
// OpenCL Kernel
__kernel void
matrixMul(__global float* A,
          __global float* B,
          __global float* C,
          int wA, int wB) {
    // 2D Thread ID
    int tx = get_local_id(0);
    int ty = get_local_id(1);
    // value stores the element
    // that is computed by the thread
    float value = 0;
    for (int k = 0; k < wA; ++k)
    {
        float elementA = A[ty * wA + k];
        float elementB = B[k * wB + tx];
        value += elementA * elementB;
    }
    // Write the matrix to device memory each
    // thread writes one element
    C[ty * wA + tx] = value;
}
Sample Output:
Matrix A:
0.398748 | 0.999793 | 0.206833
0.354238 | 0.674347 | 0.492022
0.707017 | 0.353635 | 0.430668
Matrix B:
0.91598 | 0.0260167 | 0.881732
0.810974 | 0.193091 | 0.589857
0.229151 | 0.0657822 | 0.965835
ERROR: C (-30)
I'm working with an NVIDIA GeForce 9800 GT, which only supports OpenCL 1.1. Any help here would be much appreciated.
Thanks,
Joe
The data for input matrices A and B is not passed to the device. When you create the buffers:
cl::Buffer deviceMemA(context, CL_MEM_READ_WRITE, memoryA,blockA, &err)
the blockA argument is ignored, because the flags do not specify how to use it. You need to add at least CL_MEM_COPY_HOST_PTR to initialize the buffer with the contents of blockA.
Alternatively, you can call clEnqueueWriteBuffer to send the data after the buffers are created.
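For instance, both fixes sketched with the C++ wrapper the question already uses (memoryA, blockA, and queue as in the listing; error checks elided). Note the size argument is a byte count, so memoryA rather than sizeA:
/* (a) Initialize the buffer at creation time: */
cl::Buffer deviceMemA(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
    memoryA, blockA, &err);
/* (b) Or create it uninitialized, then write explicitly: */
cl::Buffer deviceMemA2(context, CL_MEM_READ_ONLY, memoryA, NULL, &err);
queue.enqueueWriteBuffer(deviceMemA2, CL_TRUE, 0, memoryA, blockA);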
Suppose I have this class:
class Particle
{
    double *_w;
};
And I want to send nParticles objects of Particle to my kernel. Allocating space for these objects is easy:
Particle *dev_p;
cudaStatus = cudaMalloc((void**)&dev_p, nParticles * sizeof(Particle));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}
Also suppose that nParticles is 100. Now I need to allocate 300 doubles for each _w in a Particle object. How can I do this? I tried this code:
for( int i = 0; i < nParticles; i++){
    cudaStatus = cudaMalloc((void**)&(dev_p[i]._w), 300 * sizeof(double));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
}
But debugging with Nsight stops when I access dev_p[i]._w[j].
Perhaps you should include a complete simple example. (If I compile your code above and run it by itself, on Linux, I get a seg fault at the second cudaMalloc operation.) One wrinkle I see is that since you have in the first step allocated the particle objects in device memory, when you go to allocate the _w pointers, you are passing a pointer to cudaMalloc that is already in device memory. You're supposed to pass a host-based pointer to cudaMalloc, which it will then assign to the allocated area in device (global) memory.
One possible solution that I think conforms to what I see in your example is like this:
#include <stdio.h>
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
class Particle
{
public:
    double *_w;
};
__global__ void test(Particle *p){
    int idx = threadIdx.x + blockDim.x*blockIdx.x;
    if (idx == 2){
        printf("dev_p[2]._w[2] = %f\n", p[idx]._w[2]);
    }
}
int main() {
    int nParticles = 100;
    Particle *dev_p;
    double *w[nParticles];
    cudaMalloc((void**)&dev_p, nParticles * sizeof(Particle));
    cudaCheckErrors("cudaMalloc1 fail");
    for( int i = 0; i < nParticles; i++){
        cudaMalloc((void**)&(w[i]), 300 * sizeof(double));
        cudaCheckErrors("cudaMalloc2 fail");
        cudaMemcpy(&(dev_p[i]._w), &(w[i]), sizeof(double *), cudaMemcpyHostToDevice);
        cudaCheckErrors("cudaMemcpy1 fail");
    }
    double testval = 32.7;
    cudaMemcpy(w[2]+2, &testval, sizeof(double), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy2 fail");
    test<<<1, 32>>>(dev_p);
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel fail");
    printf("Done!\n");
}
Here we are creating a separate set of pointers on the host to use for cudaMalloc purposes, then copying those allocated pointers down to the device for use as device pointers (this is legal with UVA).
Another approach would be to allocate the _w pointers on the device side. This may serve your purposes as well.
In all of the above I am assuming cc 2.0 or greater.
Using a methodology similar to what is described here, it may be possible to collapse the device-side allocations done in a loop down to a single allocation:
cudaMalloc(&(w[0]), nParticles*300*sizeof(double));
cudaCheckErrors("cudaMalloc2 fail");
cudaMemcpy(&(dev_p[0]._w), &(w[0]), sizeof(double *), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy1 fail");
for( int i = 1; i < nParticles; i++){
    w[i] = w[i-1] + 300;
    cudaMemcpy(&(dev_p[i]._w), &(w[i]), sizeof(double *), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy1 fail");
}
The cudaMemcpy operations still have to be done individually.
There are two ways of doing it. First: you allocate the memory on the host, filling up a host array of Particle objects; once complete, you copy the host array to the device through cudaMemcpy, as sketched below.
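A sketch of the first way (assuming _w is public, as in the answer above, and that dev_p was already allocated with cudaMalloc as in the question; each _w must still hold a device pointer):
Particle host_p[100];                                        // host-side staging array
for (int i = 0; i < 100; i++)
    cudaMalloc((void**)&host_p[i]._w, 300 * sizeof(double)); // device pointers stored in host structs
cudaMemcpy(dev_p, host_p, 100 * sizeof(Particle), cudaMemcpyHostToDevice); // one copy for all structs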
Second: on Fermi and higher you can call malloc in the kernel, filling the dev_p array from the kernel.
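And a hedged sketch of the second way (requires compute capability 2.0+; allocW is a hypothetical kernel, and the device heap may need enlarging before launch):
__global__ void allocW(Particle *p, int n) {
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i < n)
        p[i]._w = (double *)malloc(300 * sizeof(double)); // in-kernel allocation from the device heap
}
// Host side, before the launch:
// cudaDeviceSetLimit(cudaLimitMallocHeapSize, 100 * 300 * sizeof(double) * 2);
// allocW<<<1, 128>>>(dev_p, 100);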