I'm trying to implement a simple OpenCL program to compute a histogram.
Below is what I currently have:
#include <CL/cl.h>
#include <iostream>
#include <vector>
#define STB_IMAGE_IMPLEMENTATION
#include <stb_image.h>
#include <algorithm>
// Looks up an OpenCL platform and device by name, then creates a context and
// a command queue on that device.
//
// platformId / deviceId are assigned only when a name matches exactly; when
// nothing matches they keep whatever the caller passed in. Each OpenCL status
// code is stored in a local variable and not inspected.
void setup(
    cl_platform_id &platformId, cl_device_id &deviceId, cl_context& context, cl_command_queue& commandQueue,
    std::string platformName = "NVIDIA CUDA", cl_device_type deviceType = CL_DEVICE_TYPE_GPU,
    std::string deviceName = "GeForce GTX 1070")
{
    cl_uint platformCount, deviceCount;
    cl_int status;

    // Platforms: the first call reports how many exist, the second fills the list.
    status = clGetPlatformIDs(0, nullptr, &platformCount);
    std::vector<cl_platform_id> platforms(platformCount);
    status = clGetPlatformIDs(platformCount, platforms.data(), nullptr);

    for (std::vector<cl_platform_id>::const_iterator it = platforms.begin(); it != platforms.end(); ++it)
    {
        // Same two-step pattern for the platform name.
        size_t nameBytes;
        status = clGetPlatformInfo(*it, CL_PLATFORM_NAME, 0, nullptr, &nameBytes);
        std::vector<char> nameBuffer(nameBytes);
        status = clGetPlatformInfo(*it, CL_PLATFORM_NAME, nameBytes, nameBuffer.data(), nullptr);

        if (!(platformName == nameBuffer.data()))
        {
            continue;
        }
        std::cout << "Platform " << platformName << " found!" << std::endl;
        platformId = *it;
        break;
    }

    // Devices of the requested type on the chosen platform.
    status = clGetDeviceIDs(platformId, deviceType, 0, nullptr, &deviceCount);
    std::vector<cl_device_id> devices(deviceCount);
    status = clGetDeviceIDs(platformId, deviceType, deviceCount, devices.data(), nullptr);

    for (std::vector<cl_device_id>::const_iterator it = devices.begin(); it != devices.end(); ++it)
    {
        size_t nameBytes;
        status = clGetDeviceInfo(*it, CL_DEVICE_NAME, 0, nullptr, &nameBytes);
        std::vector<char> nameBuffer(nameBytes);
        status = clGetDeviceInfo(*it, CL_DEVICE_NAME, nameBytes, nameBuffer.data(), nullptr);

        if (!(deviceName == nameBuffer.data()))
        {
            continue;
        }
        std::cout << "Device " << deviceName << " found!" << std::endl;
        deviceId = *it;
        break;
    }

    // One context holding the single device, and a queue with default properties.
    context = clCreateContext(nullptr, 1, &deviceId, nullptr, nullptr, &status);
    commandQueue = clCreateCommandQueue(context, deviceId, 0, &status);
}
// Loads the image at imagePath as a single 8-bit grey channel, builds
// programSource for deviceId, launches the "computeHistogram" kernel with one
// work-item per pixel, and finally releases commandQueue and context.
// NOTE(review): every OpenCL status is stored in `error` but never inspected.
void run(const std::string & imagePath, const std::string& programSource, const cl_device_id deviceId,
const cl_context& context, const cl_command_queue& commandQueue, int histogram[256])
{
cl_int error;
int width, height, channels;
stbi_set_flip_vertically_on_load(true);
// NOTE(review): the pointer returned by stbi_load is used without a null check.
unsigned char *image = stbi_load(imagePath.c_str(),
&width,
&height,
&channels,
STBI_grey);
// NOTE(review): the running minimum is seeded with 0 and the maximum with 255
// (255 does not fit in `char` where char is signed), and both are later
// streamed as characters rather than as numbers.
char min = 0;
char max = 255;
for(int i = 0; i < width*height; ++i)
{
min = (image[i] < min) ? image[i]:min;
max = (image[i] > max) ? image[i]:max;
}
std::cout << "(min, max) := (" << min << ", " << max << ")" << std::endl;
//create buffers
// NOTE(review): both calls pass a host pointer, but the flags contain neither
// CL_MEM_USE_HOST_PTR nor CL_MEM_COPY_HOST_PTR. `&histogram` is the address of
// the pointer parameter, not of the caller's array.
cl_mem memImage = clCreateBuffer(context,CL_MEM_READ_ONLY,width*height*sizeof(char),image,&error);
cl_mem memHistogram = clCreateBuffer(context,CL_MEM_READ_WRITE,256*sizeof(int),&histogram,&error);
//Create program, kernel and setting kernel args
size_t programSize = programSource.length();
const char * source = programSource.c_str();
cl_program program = clCreateProgramWithSource(context,1,&source,&programSize,&error);
error = clBuildProgram(program,1,&deviceId,nullptr,nullptr,nullptr);
cl_kernel kernel = clCreateKernel(program,"computeHistogram",&error);
// NOTE(review): the byte count here is sizeof(cl_mem) and the source is
// `&image` (the address of the local pointer), not the pixel data itself.
error = clEnqueueWriteBuffer(commandQueue,memImage,CL_TRUE,0,sizeof(cl_mem),&image,0,nullptr,nullptr);
error = clSetKernelArg(kernel,0,sizeof(cl_mem),&memImage);
error = clSetKernelArg(kernel,1,sizeof(cl_mem),&memHistogram);
clFinish(commandQueue);
// One work-item per pixel; the work-group size is left to the implementation.
size_t globalWorkSize = width*height;
error = clEnqueueNDRangeKernel(commandQueue,kernel,1,nullptr,&globalWorkSize,nullptr,0,nullptr,nullptr);
// NOTE(review): this is a second write to memHistogram; nothing in this
// function reads the device buffer back into `histogram`.
error = clEnqueueWriteBuffer(commandQueue,memHistogram,CL_TRUE,0,256*sizeof(int),&histogram,0,nullptr,nullptr);
clFinish(commandQueue);
// The caller's queue and context are released here; image, the two buffers,
// the program and the kernel are not released in this function.
clReleaseCommandQueue(commandQueue);
clReleaseContext(context);
}
// Entry point: obtains the OpenCL objects from setup(), hands "gray.jpeg" and
// the kernel source to run(), then prints all 256 histogram bins.
int main(int argc, char** argv)
{
    cl_platform_id platformId;
    cl_device_id deviceId;
    cl_context context;
    cl_command_queue commandQueue;
    setup(platformId, deviceId, context, commandQueue);

    // OpenCL C source for the kernel that run() looks up by name.
    const std::string programSource =
        "__kernel void computeHistogram(\n"
        " __global char * image, __global int * histogram)\n"
        "{\n"
        " size_t idx = get_global_id(0);\n"
        " char pixelValue = image[idx];\n"
        " atomic_inc(&histogram[pixelValue]);\n"
        "}\n";
    const std::string filename = "gray.jpeg";

    int histogram[256] = {0};
    run(filename, programSource, deviceId, context, commandQueue, histogram);

    // Every line is prefixed with the literal text "i : ", followed by the bin count.
    for (const int binCount : histogram)
    {
        std::cout << "i : " << binCount << std::endl;
    }
    return 0;
}
However, I get 0 in every bin. I think the logic I'm trying to apply is correct, but I cannot figure out what the error is.
There are several problems. To name a few:
clCreateBuffer returns error -38 (CL_INVALID_MEM_OBJECT) because host_ptr is being passed and this is not being reflected in the flags parameter. CL_MEM_USE_HOST_PTR can be used in addition to CL_MEM_READ_ONLY and CL_MEM_READ_WRITE respectively.
clEnqueueWriteBuffer is passed the size of a cl_mem handle (sizeof(cl_mem)) instead of the size of the image buffer.
After clEnqueueNDRangeKernel, clEnqueueWriteBuffer is used a second time. I suspect the intention here was to read the data back, and for that clEnqueueReadBuffer needs to be used.
There may be more problems. These are just the major ones and it's hard to imagine that you checked cl functions return codes and all of them returned CL_SUCCESS...
The actual program that works is the following:
#include <CL/cl.h>
#include <algorithm>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>
#define STB_IMAGE_IMPLEMENTATION
#include <stb_image.h>
// Selects an OpenCL platform and device by exact name and creates a context
// plus a command queue for that device.
//
// Outputs (assigned only when every step succeeded):
//   platformId   - platform whose CL_PLATFORM_NAME equals platformName
//   deviceId     - device of type deviceType whose CL_DEVICE_NAME equals deviceName
//   context      - context containing only deviceId
//   commandQueue - queue on deviceId created with default (0) properties
//
// Throws std::runtime_error when an OpenCL call reports an error or when no
// platform / device with the requested name exists. Previously those cases
// went unnoticed and uninitialised ids were handed to the next OpenCL call.
void setup(
    cl_platform_id &platformId, cl_device_id &deviceId, cl_context& context, cl_command_queue& commandQueue,
    std::string platformName = "NVIDIA CUDA", cl_device_type deviceType = CL_DEVICE_TYPE_GPU,
    std::string deviceName = "GeForce GTX 1070")
{
    // Turns a non-success status into an exception naming the failed call.
    const auto check = [](cl_int status, const char* call) {
        if (status != CL_SUCCESS)
        {
            throw std::runtime_error(std::string(call) + " failed with OpenCL error " + std::to_string(status));
        }
    };

    // ---- platform ---------------------------------------------------------
    cl_uint numberOfPlatforms = 0;
    check(clGetPlatformIDs(0, nullptr, &numberOfPlatforms), "clGetPlatformIDs");
    std::vector<cl_platform_id> platforms(numberOfPlatforms);
    if (!platforms.empty())
    {
        check(clGetPlatformIDs(numberOfPlatforms, platforms.data(), nullptr), "clGetPlatformIDs");
    }

    cl_platform_id selectedPlatform = nullptr;
    for (const cl_platform_id candidate : platforms)
    {
        size_t nameSize = 0;
        check(clGetPlatformInfo(candidate, CL_PLATFORM_NAME, 0, nullptr, &nameSize), "clGetPlatformInfo");
        // One spare byte keeps the buffer terminated whatever the driver writes.
        std::vector<char> name(nameSize + 1, '\0');
        check(clGetPlatformInfo(candidate, CL_PLATFORM_NAME, nameSize, name.data(), nullptr), "clGetPlatformInfo");
        if (platformName == name.data())
        {
            selectedPlatform = candidate;
            break;
        }
    }
    if (selectedPlatform == nullptr)
    {
        throw std::runtime_error("OpenCL platform \"" + platformName + "\" not found");
    }
    std::cout << "Platform " << platformName << " found!" << std::endl;

    // ---- device -----------------------------------------------------------
    cl_uint numberOfDevices = 0;
    check(clGetDeviceIDs(selectedPlatform, deviceType, 0, nullptr, &numberOfDevices), "clGetDeviceIDs");
    std::vector<cl_device_id> devices(numberOfDevices);
    if (!devices.empty())
    {
        check(clGetDeviceIDs(selectedPlatform, deviceType, numberOfDevices, devices.data(), nullptr), "clGetDeviceIDs");
    }

    cl_device_id selectedDevice = nullptr;
    for (const cl_device_id candidate : devices)
    {
        size_t nameSize = 0;
        check(clGetDeviceInfo(candidate, CL_DEVICE_NAME, 0, nullptr, &nameSize), "clGetDeviceInfo");
        std::vector<char> name(nameSize + 1, '\0');
        check(clGetDeviceInfo(candidate, CL_DEVICE_NAME, nameSize, name.data(), nullptr), "clGetDeviceInfo");
        if (deviceName == name.data())
        {
            selectedDevice = candidate;
            break;
        }
    }
    if (selectedDevice == nullptr)
    {
        throw std::runtime_error("OpenCL device \"" + deviceName + "\" not found");
    }
    std::cout << "Device " << deviceName << " found!" << std::endl;

    // ---- context and queue ------------------------------------------------
    cl_int status = CL_SUCCESS;
    const cl_context newContext = clCreateContext(nullptr, 1, &selectedDevice, nullptr, nullptr, &status);
    check(status, "clCreateContext");
    const cl_command_queue newQueue = clCreateCommandQueue(newContext, selectedDevice, 0, &status);
    if (status != CL_SUCCESS)
    {
        // Do not leak the context when the queue cannot be created.
        clReleaseContext(newContext);
        check(status, "clCreateCommandQueue");
    }

    // Publish the results only now, so a failure leaves the outputs untouched.
    platformId = selectedPlatform;
    deviceId = selectedDevice;
    context = newContext;
    commandQueue = newQueue;
}
void run(const std::string & imagePath, const std::string& programSource, const cl_device_id deviceId,
const cl_context& context, const cl_command_queue& commandQueue, int histogram[256])
{
cl_int error;
int width, height, channels;
stbi_set_flip_vertically_on_load(true);
unsigned char *image = stbi_load(imagePath.c_str(),
&width,
&height,
&channels,
STBI_grey);
unsigned char min = 255;
unsigned char max = 0;
for(int i = 0; i < width*height; ++i)
{
min = (image[i] < min) ? image[i]:min;
max = (image[i] > max) ? image[i]:max;
}
std::cout << "(min, max) := (" << static_cast<int>(min) << ", " << static_cast<int>(max) << ")" << std::endl;
//create buffers
cl_mem memImage = clCreateBuffer(context,CL_MEM_READ_ONLY,width*height*sizeof(unsigned char),image,&error);
cl_mem memHistogram = clCreateBuffer(context,CL_MEM_READ_WRITE,256*sizeof(int),&histogram,&error);
//Create program, kernel and setting kernel args
size_t programSize = programSource.length();
const char * source = programSource.c_str();
cl_program program = clCreateProgramWithSource(context,1,&source,&programSize,&error);
error = clBuildProgram(program,1,&deviceId,nullptr,nullptr,nullptr);
cl_kernel kernel = clCreateKernel(program,"computeHistogram",&error);
error = clEnqueueWriteBuffer(commandQueue,memImage,CL_TRUE,0,width*height*sizeof(unsigned char),image,0,nullptr,nullptr);
error = clSetKernelArg(kernel,0,sizeof(cl_mem),&memImage);
error = clSetKernelArg(kernel,1,sizeof(cl_mem),&memHistogram);
clFinish(commandQueue);
const size_t globalWorkSize = width*height;
error = clEnqueueNDRangeKernel(commandQueue,kernel,1,nullptr,&globalWorkSize,nullptr,0,nullptr,nullptr);
error = clEnqueueReadBuffer(commandQueue,memHistogram,CL_TRUE,0,256*sizeof(int),histogram,0,nullptr,nullptr);
clFinish(commandQueue);
clReleaseCommandQueue(commandQueue);
clReleaseContext(context);
}
// Entry point: obtains the OpenCL objects from setup(), hands "gray.jpeg" and
// the kernel source to run(), then prints each bin as "<index> : <count>".
int main(int argc, char** argv)
{
    cl_platform_id platformId;
    cl_device_id deviceId;
    cl_context context;
    cl_command_queue commandQueue;
    setup(platformId, deviceId, context, commandQueue);

    // OpenCL C source for the kernel that run() looks up by name.
    const std::string programSource =
        "__kernel void computeHistogram(\n"
        " __global unsigned char * image, __global int * histogram)\n"
        "{\n"
        " size_t idx = get_global_id(0);\n"
        " unsigned char pixelValue = image[idx];\n"
        " atomic_inc(&histogram[pixelValue]);\n"
        " barrier(CLK_GLOBAL_MEM_FENCE);"
        "}\n";
    const std::string filename = "gray.jpeg";

    int histogram[256] = {0};
    run(filename, programSource, deviceId, context, commandQueue, histogram);

    int binIndex = 0;
    for (const int binCount : histogram)
    {
        std::cout << binIndex << " : " << binCount << std::endl;
        ++binIndex;
    }
    return 0;
}
The main issue was the line
error = clEnqueueReadBuffer(commandQueue,memHistogram,CL_TRUE,0,256*sizeof(int),histogram,0,nullptr,nullptr);
In the original post this was a clEnqueueWriteBuffer and the size was wrong. I was also using char instead of unsigned char and finally the kernel is different.
This is my code:
// Opens (or creates) the 64 KiB shared-memory segment "ContainerSharedMemory",
// finds or constructs a single int -> string hash map inside it, inserts the
// keys 0..4 that are not present yet, and prints every entry.
// Running with the single argument "clear" removes the segment instead.
int main (int argc, char *argv[])
{
typedef int KeyType;
// NOTE(review): `string`, `cout` and `endl` are unqualified here, presumably
// std:: names brought in by a using-directive outside this excerpt. The map's
// nodes use ShmAlloc, but the mapped string type is not given that allocator.
typedef string MappedType;
typedef std::pair<KeyType, MappedType> ValueType;
typedef boost::interprocess::allocator<ValueType, boost::interprocess::managed_shared_memory::segment_manager> ShmAlloc;
typedef boost::unordered_map<KeyType, MappedType, boost::hash<KeyType>, std::equal_to<KeyType>, ShmAlloc> ShmHashMap;
// The segment is opened/created before the "clear" argument is examined.
boost::interprocess::managed_shared_memory segment(boost::interprocess::open_or_create, "ContainerSharedMemory", 65536);
if(argc == 2 && string(argv[1]) == "clear")
{
boost::interprocess::shared_memory_object::remove("ContainerSharedMemory");
return 0;
}
// unique_instance: one unnamed ShmHashMap per segment, constructed with the
// segment manager as its allocator argument on first use.
ShmHashMap *hash_map = segment.find_or_construct<ShmHashMap>(boost::interprocess::unique_instance)(segment.get_segment_manager());
if(hash_map == NULL)
{
cout << "find_or_construct error" << endl;
return 0;
}
// Insert only the keys that a previous run has not stored already.
for(int i = 0; i < 5; ++i) {
ShmHashMap::iterator iter = hash_map->find(i);
if (iter == hash_map->end()) {
hash_map->insert(ValueType(i, "test"));
}
}
cout << "all..." << endl;
for(ShmHashMap::iterator iter = hash_map->begin(); iter != hash_map->end(); ++iter)
{
cout << iter->first << "|" << iter->second << endl;
}
cout << "end..." << endl;
return 0;
}
Everything is OK when MappedType is int, but with the code above (MappedType is string) I get a segmentation fault:
rerunning this program, so that it accesses the hash map that is already in shared memory, dumps core.
----------------------------edit again----------------------------------
The problem with string has been solved by sehe, thank you.
If I want to design a template class that hides that detail, how could I do it? Is there a clean way?
/// Record stored per key: the caller's payload plus two 32-bit bookkeeping
/// fields (presumably last-access and expiry timestamps; confirm units with
/// the code that fills them).
template<typename MappedType>
struct ComplexMappedType
{
    /// Value-initialises the payload and zeroes both bookkeeping fields.
    /// `value()` matters when MappedType is a scalar such as int: without it
    /// the payload would be left uninitialised.
    ComplexMappedType(): value(), t_access(0), t_expire(0) {}

    /// Stores a copy of `v` together with the given bookkeeping values.
    ComplexMappedType(const MappedType& v, uint32_t a, uint32_t e): value(v), t_access(a), t_expire(e) {}

    MappedType value;    ///< caller's payload
    uint32_t t_access;   ///< set from constructor argument `a`, 0 by default
    uint32_t t_expire;   ///< set from constructor argument `e`, 0 by default
};
/// Key/value store declared on top of a boost::unordered_map whose allocator
/// draws from a boost::interprocess::managed_shared_memory segment. Each stored
/// value is wrapped in ComplexMappedType. Only the declaration is visible here;
/// the constructor, get/set/del and doCapacityElimination are defined elsewhere.
///
/// NOTE(review): DataType embeds MappedType by value, so a MappedType that
/// allocates outside the segment (e.g. std::string) would presumably not be
/// usable from another process. Confirm against the out-of-line definitions.
template <typename KeyType, typename MappedType>
class MMSHashMap
{
private:
    using DataType = ComplexMappedType<MappedType>;
    using ValueType = std::pair<KeyType, DataType>;
    using ShmAlloc = boost::interprocess::allocator<
        ValueType,
        boost::interprocess::managed_shared_memory::segment_manager>;
    using ShmHashMap = boost::unordered_map<
        KeyType,
        DataType,
        boost::hash<KeyType>,
        std::equal_to<KeyType>,
        ShmAlloc>;

public:
    MMSHashMap(const std::string& name, size_t size, float e_thr, float e_scale);

    /// Deletes the segment object held in pMemorySegment.
    ~MMSHashMap()
    {
        delete pMemorySegment;
    }

    /// Size reported by the segment's get_size().
    size_t getMEMSize()
    {
        return pMemorySegment->get_size();
    }

    /// Free memory reported by the segment's get_free_memory().
    size_t getMEMFreeSize()
    {
        return pMemorySegment->get_free_memory();
    }

    bool get(const KeyType& key, MappedType& value, uint32_t& expire);
    bool set(const KeyType& key, const MappedType& value, uint32_t expire);
    bool del(const KeyType& key);

private:
    void doCapacityElimination();

    std::string _name;
    boost::interprocess::managed_shared_memory* pMemorySegment;  // deleted in the destructor
    boost::shared_mutex mutex;
    boost::shared_mutex mutex_eliminate;
    float fEliminateThreshold;
    float fEliminateScale;
};
Of course. std::string allocates from the heap.
The heap is in your process address space, so any other process reading the same shared memory is going to get a wrong raw pointer there and invoke UB.
You need to use a shared-memory allocator with the strings too.
Live On Coliru (using mapped file for Coliru)
With shared memory:
#include <boost/interprocess/managed_shared_memory.hpp>
#include <boost/interprocess/allocators/allocator.hpp>
#include <boost/container/scoped_allocator.hpp>
#include <boost/container/string.hpp>
#include <boost/unordered_map.hpp>
#include <iostream>
namespace bip = boost::interprocess;
int main (int argc, char *argv[])
{
typedef int KeyType;
typedef boost::container::basic_string<char, std::char_traits<char>, bip::allocator<char, bip::managed_shared_memory::segment_manager> > MappedType;
typedef std::pair<KeyType, MappedType> ValueType;
typedef boost::interprocess::allocator<ValueType, boost::interprocess::managed_shared_memory::segment_manager> ShmAlloc;
typedef boost::unordered_map<KeyType, MappedType, boost::hash<KeyType>, std::equal_to<KeyType>, boost::container::scoped_allocator_adaptor<ShmAlloc> > ShmHashMap;
boost::interprocess::managed_shared_memory segment(boost::interprocess::open_or_create, "ContainerSharedMemory", 65536);
if(argc == 2 && std::string(argv[1]) == "clear")
{
boost::interprocess::shared_memory_object::remove("ContainerSharedMemory");
return 0;
}
ShmHashMap *hash_map = segment.find_or_construct<ShmHashMap>(boost::interprocess::unique_instance)(segment.get_segment_manager());
if(hash_map == NULL)
{
std::cout << "find_or_construct error" << std::endl;
return 0;
}
for(int i = 0; i < 5; ++i) {
ShmHashMap::iterator iter = hash_map->find(i);
if (iter == hash_map->end()) {
hash_map->insert(ValueType(i, MappedType { "hello", segment.get_segment_manager() }));
}
}
std::cout << "all..." << std::endl;
for(ShmHashMap::iterator iter = hash_map->begin(); iter != hash_map->end(); ++iter)
{
std::cout << iter->first << "|" << iter->second << std::endl;
}
std::cout << "end..." << std::endl;
}
Prints
all...
4|hello
3|hello
2|hello
1|hello
0|hello
end...