Protobuf ParseFromZeroCopyStream incurs high memory usage with repeated field - c++11

I have encountered high memory usage when using ParseFromZeroCopyStream to load a file into which a large buffer was written. The code snippet below uses over 60 GB of RAM and ultimately fails, as the system freezes after reaching its RAM limit.
FYI, I am using protobuf as a DLL.
scene.proto
syntax = "proto3";

package Recipe;

option cc_enable_arenas = true;

message Scene
{
    repeated int32 image_data = 1 [packed=true];
}
source.cpp
#include <iostream>
#include <fstream>
#include <ostream>
#include <istream>
#include <string>
#include <cstdint>
#include "Scene.pb.h"
#include <google/protobuf/io/zero_copy_stream_impl.h>
#include <google/protobuf/io/gzip_stream.h>
#include <google/protobuf/arena.h>

int const _MIN = 0;
int const _MAX = 255;
unsigned int const _SIZE = 1280000000;
//unsigned int const _SIZE = 2000;
unsigned int const _COMPRESSION_LEVEL = 6;

// Fills the buffer with the index values, truncated to the unsigned char range.
void randWithinUnsignedCharSize(uint8_t * buffer, unsigned int size)
{
    for (size_t i = 0; i < size; ++i)
    {
        buffer[i] = i;
    }
}

using namespace google::protobuf::io;

int main()
{
    GOOGLE_PROTOBUF_VERIFY_VERSION;
    {
        google::protobuf::Arena arena;
        Recipe::Scene * scene = google::protobuf::Arena::CreateMessage<Recipe::Scene>(&arena);

        uint8_t * imageData = new uint8_t[_SIZE];
        randWithinUnsignedCharSize(imageData, _SIZE);

        scene->mutable_image_data()->Resize(_SIZE, 0);
        for (size_t i = 0; i < _SIZE; i++)
        {
            scene->set_image_data(i, imageData[i]);
        }
        std::cout << "done saving data to repeated field.\n";

        {
            std::fstream output("data.txt", std::ios::out | std::ios::trunc | std::ios::binary);
            OstreamOutputStream outputFileStream(&output);
            GzipOutputStream::Options options;
            options.format = GzipOutputStream::GZIP;
            options.compression_level = _COMPRESSION_LEVEL;
            GzipOutputStream gzipOutputStream(&outputFileStream, options);
            if (!scene->SerializeToZeroCopyStream(&gzipOutputStream)) {
                std::cerr << "Failed to write scene." << std::endl;
                return -1;
            }
        }
        delete[] imageData;
    }
    std::cout << "Finish serializing into data.txt\n";

    {
        google::protobuf::Arena arena1;
        Recipe::Scene * scene1 = google::protobuf::Arena::CreateMessage<Recipe::Scene>(&arena1);
        {
            std::fstream input("data.txt", std::ios::in | std::ios::binary);
            IstreamInputStream inputFileStream(&input);
            GzipInputStream gzipInputStream(&inputFileStream);
            if (!scene1->ParseFromZeroCopyStream(&gzipInputStream)) {
                std::cerr << "Failed to parse scene." << std::endl;
                return -1;
            }
        }
        std::cout << "scene1->image_data_size() " << scene1->image_data_size() << std::endl;
    }

    google::protobuf::ShutdownProtobufLibrary();
    return 0;
}
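One note on the population loop above: it keeps two full copies of the data alive at once (imageData plus the repeated field) and goes through a setter call per element. A sketch of a bulk alternative, assuming protobuf's RepeatedField::mutable_data() accessor; this trims the writer's peak memory, but does not by itself explain the parse-side usage:

scene->mutable_image_data()->Resize(_SIZE, 0);
google::protobuf::int32 * raw = scene->mutable_image_data()->mutable_data();
for (size_t i = 0; i < _SIZE; ++i)
{
    // Same values randWithinUnsignedCharSize would produce, without the intermediate buffer.
    raw[i] = static_cast<google::protobuf::int32>(static_cast<uint8_t>(i));
}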

Related

no data while cpu profiling - visual studio

I tried to profile the performance of my code. I took the code from a Microsoft docs topic about profiling:
#include <iostream>
#include <limits>
#include <mutex>
#include <random>
#include <functional>
#include <thread>   // needed for std::thread (missing in the original snippet)
#include <vector>   // needed for std::vector (missing in the original snippet)

//.cpp file code:
static constexpr int MIN_ITERATIONS = std::numeric_limits<int>::max() / 1000;
static constexpr int MAX_ITERATIONS = MIN_ITERATIONS + 10000;

long long m_totalIterations = 0;
std::mutex m_totalItersLock;

int getNumber()
{
    std::uniform_int_distribution<int> num_distribution(MIN_ITERATIONS, MAX_ITERATIONS);
    std::mt19937 random_number_engine; // pseudorandom number generator
    auto get_num = std::bind(num_distribution, random_number_engine);
    int random_num = get_num();
    auto result = 0;
    {
        std::lock_guard<std::mutex> lock(m_totalItersLock);
        m_totalIterations += random_num;
    }
    // we're just spinning here
    // to increase CPU usage
    for (int i = 0; i < random_num; i++)
    {
        result = get_num();
    }
    return result;
}

void doWork()
{
    std::wcout << L"The doWork function is running on another thread." << std::endl;
    auto x = getNumber();
}

int main()
{
    std::vector<std::thread> threads;
    for (int i = 0; i < 10; ++i) {
        threads.push_back(std::thread(doWork));
        std::cout << "The Main() thread calls this after starting the new thread" << std::endl;
    }
    for (auto& thread : threads) {
        thread.join();
    }
    return 0;
}
Still, I'm getting different output (or actually no output at all). Can someone help me, please? I'm trying to do this on Visual Studio Community 2019.

OpenCL compute histogram program returns 0 in every bin

I'm trying to implement a simple OpenCL program to compute a histogram.
Below is what I currently have:
#include <CL/cl.h>
#include <iostream>
#include <vector>
#define STB_IMAGE_IMPLEMENTATION
#include <stb_image.h>
#include <algorithm>

//Getting platform, device, context and command queue
void setup(
    cl_platform_id &platformId, cl_device_id &deviceId, cl_context& context, cl_command_queue& commandQueue,
    std::string platformName = "NVIDIA CUDA", cl_device_type deviceType = CL_DEVICE_TYPE_GPU,
    std::string deviceName = "GeForce GTX 1070")
{
    using std::vector;
    using std::string;
    using std::cout;
    using std::endl;

    cl_uint numberOfPlatforms, numberOfDevices;
    cl_int error;

    //Finding platform id
    error = clGetPlatformIDs(0,nullptr,&numberOfPlatforms);
    vector<cl_platform_id> platform(numberOfPlatforms);
    error = clGetPlatformIDs(numberOfPlatforms,platform.data(),nullptr);
    for(const auto & currentPlatform : platform)
    {
        size_t stringSize;
        error = clGetPlatformInfo(currentPlatform,CL_PLATFORM_NAME,0,nullptr,&stringSize);
        char * currentPlatformName = new char[stringSize];
        error = clGetPlatformInfo(currentPlatform,CL_PLATFORM_NAME,stringSize,currentPlatformName,nullptr);
        if(string(currentPlatformName).compare(platformName) == 0)
        {
            cout << "Platform " << platformName << " found!" << endl;
            delete [] currentPlatformName;
            platformId = currentPlatform;
            break;
        }
        delete [] currentPlatformName;
    }
    error = clGetDeviceIDs(platformId,deviceType,0,nullptr,&numberOfDevices);
    vector<cl_device_id> device(numberOfDevices);
    error = clGetDeviceIDs(platformId,deviceType,numberOfDevices,device.data(),nullptr);
    for(const auto & currentDevice : device)
    {
        size_t stringSize;
        error = clGetDeviceInfo(currentDevice,CL_DEVICE_NAME,0,nullptr,&stringSize);
        char * currentDeviceName = new char[stringSize];
        error = clGetDeviceInfo(currentDevice,CL_DEVICE_NAME,stringSize,currentDeviceName,nullptr);
        if(string(currentDeviceName).compare(deviceName) == 0)
        {
            cout << "Device " << deviceName << " found!" << endl;
            delete [] currentDeviceName;
            deviceId = currentDevice;
            break;
        }
        delete [] currentDeviceName;
    }
    context = clCreateContext(nullptr,1,&deviceId,nullptr,nullptr,&error);
    commandQueue = clCreateCommandQueue(context,deviceId,0,&error);
}

void run(const std::string & imagePath, const std::string& programSource, const cl_device_id deviceId,
         const cl_context& context, const cl_command_queue& commandQueue, int histogram[256])
{
    cl_int error;
    int width, height, channels;
    stbi_set_flip_vertically_on_load(true);
    unsigned char *image = stbi_load(imagePath.c_str(),
                                     &width,
                                     &height,
                                     &channels,
                                     STBI_grey);
    char min = 0;
    char max = 255;
    for(int i = 0; i < width*height; ++i)
    {
        min = (image[i] < min) ? image[i]:min;
        max = (image[i] > max) ? image[i]:max;
    }
    std::cout << "(min, max) := (" << min << ", " << max << ")" << std::endl;

    //create buffers
    cl_mem memImage = clCreateBuffer(context,CL_MEM_READ_ONLY,width*height*sizeof(char),image,&error);
    cl_mem memHistogram = clCreateBuffer(context,CL_MEM_READ_WRITE,256*sizeof(int),&histogram,&error);

    //Create program, kernel and setting kernel args
    size_t programSize = programSource.length();
    const char * source = programSource.c_str();
    cl_program program = clCreateProgramWithSource(context,1,&source,&programSize,&error);
    error = clBuildProgram(program,1,&deviceId,nullptr,nullptr,nullptr);
    cl_kernel kernel = clCreateKernel(program,"computeHistogram",&error);
    error = clEnqueueWriteBuffer(commandQueue,memImage,CL_TRUE,0,sizeof(cl_mem),&image,0,nullptr,nullptr);
    error = clSetKernelArg(kernel,0,sizeof(cl_mem),&memImage);
    error = clSetKernelArg(kernel,1,sizeof(cl_mem),&memHistogram);
    clFinish(commandQueue);

    size_t globalWorkSize = width*height;
    error = clEnqueueNDRangeKernel(commandQueue,kernel,1,nullptr,&globalWorkSize,nullptr,0,nullptr,nullptr);
    error = clEnqueueWriteBuffer(commandQueue,memHistogram,CL_TRUE,0,256*sizeof(int),&histogram,0,nullptr,nullptr);
    clFinish(commandQueue);
    clReleaseCommandQueue(commandQueue);
    clReleaseContext(context);
}

int main(int argc, char** argv)
{
    cl_platform_id platformId;
    cl_device_id deviceId;
    cl_context context;
    cl_command_queue commandQueue;
    setup(platformId,deviceId,context,commandQueue);

    std::string filename = "gray.jpeg";
    std::string programSource =
        "__kernel void computeHistogram(\n"
        "    __global char * image, __global int * histogram)\n"
        "{\n"
        "    size_t idx = get_global_id(0);\n"
        "    char pixelValue = image[idx];\n"
        "    atomic_inc(&histogram[pixelValue]);\n"
        "}\n";

    int histogram[256] = {0};
    run(filename,programSource, deviceId, context, commandQueue,histogram);
    for(int i = 0; i < 256; ++i)
    {
        std::cout << "i : " << histogram[i] << std::endl;
    }
    return 0;
}
However, I get 0 in every bin. I think the logic I'm trying to apply is correct, but I cannot figure out what the error is.
There are several problems. To name a few:

1. clCreateBuffer returns error -38 (CL_INVALID_MEM_OBJECT) because a host_ptr is being passed without this being reflected in the flags parameter. CL_MEM_USE_HOST_PTR can be added to CL_MEM_READ_ONLY and CL_MEM_READ_WRITE respectively (see the sketch after this list).
2. clEnqueueWriteBuffer is passed the size of the cl_mem object instead of the size of the image buffer.
3. After clEnqueueNDRangeKernel, clEnqueueWriteBuffer is used again. I suspect the intention here was to read the data back, and for that clEnqueueReadBuffer needs to be used.

There may be more problems; these are just the major ones, and it's hard to imagine that you checked the cl functions' return codes and all of them returned CL_SUCCESS...
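For the first point, the buffer creation could look like this (a sketch; alternatively, pass nullptr as host_ptr and rely on an explicit clEnqueueWriteBuffer, as the corrected program below does for the image):

//Flags now match the non-null host pointers being passed
cl_mem memImage = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                                 width*height*sizeof(unsigned char), image, &error);
cl_mem memHistogram = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                                     256*sizeof(int), histogram, &error);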
The actual program that works is the following:
#include <CL/cl.h>
#include <iostream>
#include <vector>
#define STB_IMAGE_IMPLEMENTATION
#include <stb_image.h>
#include <algorithm>

//Getting platform, device, context and command queue
void setup(
    cl_platform_id &platformId, cl_device_id &deviceId, cl_context& context, cl_command_queue& commandQueue,
    std::string platformName = "NVIDIA CUDA", cl_device_type deviceType = CL_DEVICE_TYPE_GPU,
    std::string deviceName = "GeForce GTX 1070")
{
    using std::vector;
    using std::string;
    using std::cout;
    using std::endl;

    cl_uint numberOfPlatforms, numberOfDevices;
    cl_int error;

    //Finding platform id
    error = clGetPlatformIDs(0,nullptr,&numberOfPlatforms);
    vector<cl_platform_id> platform(numberOfPlatforms);
    error = clGetPlatformIDs(numberOfPlatforms,platform.data(),nullptr);
    for(const auto & currentPlatform : platform)
    {
        size_t stringSize;
        error = clGetPlatformInfo(currentPlatform,CL_PLATFORM_NAME,0,nullptr,&stringSize);
        char * currentPlatformName = new char[stringSize];
        error = clGetPlatformInfo(currentPlatform,CL_PLATFORM_NAME,stringSize,currentPlatformName,nullptr);
        if(string(currentPlatformName).compare(platformName) == 0)
        {
            cout << "Platform " << platformName << " found!" << endl;
            delete [] currentPlatformName;
            platformId = currentPlatform;
            break;
        }
        delete [] currentPlatformName;
    }
    error = clGetDeviceIDs(platformId,deviceType,0,nullptr,&numberOfDevices);
    vector<cl_device_id> device(numberOfDevices);
    error = clGetDeviceIDs(platformId,deviceType,numberOfDevices,device.data(),nullptr);
    for(const auto & currentDevice : device)
    {
        size_t stringSize;
        error = clGetDeviceInfo(currentDevice,CL_DEVICE_NAME,0,nullptr,&stringSize);
        char * currentDeviceName = new char[stringSize];
        error = clGetDeviceInfo(currentDevice,CL_DEVICE_NAME,stringSize,currentDeviceName,nullptr);
        if(string(currentDeviceName).compare(deviceName) == 0)
        {
            cout << "Device " << deviceName << " found!" << endl;
            delete [] currentDeviceName;
            deviceId = currentDevice;
            break;
        }
        delete [] currentDeviceName;
    }
    context = clCreateContext(nullptr,1,&deviceId,nullptr,nullptr,&error);
    commandQueue = clCreateCommandQueue(context,deviceId,0,&error);
}

void run(const std::string & imagePath, const std::string& programSource, const cl_device_id deviceId,
         const cl_context& context, const cl_command_queue& commandQueue, int histogram[256])
{
    cl_int error;
    int width, height, channels;
    stbi_set_flip_vertically_on_load(true);
    unsigned char *image = stbi_load(imagePath.c_str(),
                                     &width,
                                     &height,
                                     &channels,
                                     STBI_grey);
    unsigned char min = 255;
    unsigned char max = 0;
    for(int i = 0; i < width*height; ++i)
    {
        min = (image[i] < min) ? image[i]:min;
        max = (image[i] > max) ? image[i]:max;
    }
    std::cout << "(min, max) := (" << static_cast<int>(min) << ", " << static_cast<int>(max) << ")" << std::endl;

    //create buffers
    cl_mem memImage = clCreateBuffer(context,CL_MEM_READ_ONLY,width*height*sizeof(unsigned char),image,&error);
    cl_mem memHistogram = clCreateBuffer(context,CL_MEM_READ_WRITE,256*sizeof(int),&histogram,&error);

    //Create program, kernel and setting kernel args
    size_t programSize = programSource.length();
    const char * source = programSource.c_str();
    cl_program program = clCreateProgramWithSource(context,1,&source,&programSize,&error);
    error = clBuildProgram(program,1,&deviceId,nullptr,nullptr,nullptr);
    cl_kernel kernel = clCreateKernel(program,"computeHistogram",&error);
    error = clEnqueueWriteBuffer(commandQueue,memImage,CL_TRUE,0,width*height*sizeof(unsigned char),image,0,nullptr,nullptr);
    error = clSetKernelArg(kernel,0,sizeof(cl_mem),&memImage);
    error = clSetKernelArg(kernel,1,sizeof(cl_mem),&memHistogram);
    clFinish(commandQueue);

    const size_t globalWorkSize = width*height;
    error = clEnqueueNDRangeKernel(commandQueue,kernel,1,nullptr,&globalWorkSize,nullptr,0,nullptr,nullptr);
    error = clEnqueueReadBuffer(commandQueue,memHistogram,CL_TRUE,0,256*sizeof(int),histogram,0,nullptr,nullptr);
    clFinish(commandQueue);
    clReleaseCommandQueue(commandQueue);
    clReleaseContext(context);
}

int main(int argc, char** argv)
{
    cl_platform_id platformId;
    cl_device_id deviceId;
    cl_context context;
    cl_command_queue commandQueue;
    setup(platformId,deviceId,context,commandQueue);

    std::string filename = "gray.jpeg";
    std::string programSource =
        "__kernel void computeHistogram(\n"
        "    __global unsigned char * image, __global int * histogram)\n"
        "{\n"
        "    size_t idx = get_global_id(0);\n"
        "    unsigned char pixelValue = image[idx];\n"
        "    atomic_inc(&histogram[pixelValue]);\n"
        "    barrier(CLK_GLOBAL_MEM_FENCE);"
        "}\n";

    int histogram[256] = {0};
    run(filename,programSource, deviceId, context, commandQueue,histogram);
    for(int i = 0; i < 256; ++i)
    {
        std::cout << i << " : " << histogram[i] << std::endl;
    }
    return 0;
}
The main issue is this line:

error = clEnqueueReadBuffer(commandQueue,memHistogram,CL_TRUE,0,256*sizeof(int),histogram,0,nullptr,nullptr);

In the original post this was a clEnqueueWriteBuffer and the size was wrong. I was also using char instead of unsigned char, and finally the kernel is different.

Why can't I receive UDP packets more than once with Boost ASIO?

HINT: This works if I instantiate the io_context inside the for loop.
I know this code looks a little goofy, but it's a simplified version of a bigger codebase that has this structure. Why can't I receive a second packet with the code below? It works fine with bool synch = true;. Here's the output I get:
iteration 0
receive udp
posted receive
got a packet
iteration 1
receive udp
posted receive
I have to hit Ctrl-c to quit. I expect to see "got a packet" a second time.
The receiver:
#include <array>
#include <chrono>   // needed for std::chrono::milliseconds (missing in the original snippet)
#include <iostream>
#include <functional>
#include <thread>
#include <boost/asio.hpp>

namespace asio = boost::asio;
namespace ip = asio::ip;
using ip::udp;
using std::cout;
using std::endl;
using boost_ec = boost::system::error_code;

int main() {
    asio::io_context ioContext;
    std::array<char, 65500> buffer;
    auto asioBuffer = asio::buffer(buffer);
    bool synch = false;
    udp::endpoint remoteEndpoint;
    for (unsigned int i = 0; i < 2; ++i) {
        cout << "iteration " << i << endl;
        auto recvSocket = udp::socket(ioContext,
                                      udp::endpoint(udp::v4(), 9090));
        if (synch) {
            recvSocket.receive_from(asioBuffer, remoteEndpoint);
            cout << "received a packet" << endl;
        } else {
            std::function<void(const boost_ec&, size_t)> impl =
                [&](const boost_ec &, size_t packetSize) {
                    if (packetSize > 0) {
                        cout << "got a packet" << endl;
                        return;
                    }
                    cout << "receive udp" << endl;
                    recvSocket.async_receive_from(asioBuffer,
                                                  remoteEndpoint,
                                                  impl);
                    cout << "posted receive" << endl;
                };
            impl(boost_ec(), 0);
            while (ioContext.poll() == 0) {
                std::this_thread::sleep_for(std::chrono::milliseconds(200));
            }
        }
    }
}
The sender:
#include <array>
#include <chrono>   // needed for std::chrono (missing in the original snippet)
#include <iostream>
#include <thread>   // needed for std::this_thread::sleep_for (missing in the original snippet)
#include <boost/asio.hpp>

namespace asio = boost::asio;
namespace ip = asio::ip;
namespace chrono = std::chrono;
using ip::udp;
using std::cout;
using std::endl;

int main() {
    std::array<char, 65500> buffer;
    asio::io_context ioContext;
    auto socket = udp::socket(ioContext);
    socket.open(udp::v4());
    auto endpoint = udp::endpoint(udp::v4(), 9090);
    size_t packetsSent = 0;
    size_t bytesSent = 0;
    const double APPROX_BYTES_PER_SEC = 1e6;
    const auto CHECK_INTERVAL = chrono::microseconds(100);
    auto beforeStart = chrono::steady_clock::now();
    auto start = beforeStart;
    size_t bytesSentSinceStart = 0;
    while (true) {
        auto now = chrono::steady_clock::now();
        auto timePassed = now - start;
        if (timePassed > CHECK_INTERVAL) {
            auto expectedTime = chrono::duration<double>(bytesSentSinceStart /
                                                         APPROX_BYTES_PER_SEC);
            if (expectedTime > timePassed) {
                std::this_thread::sleep_for(expectedTime - timePassed);
            }
            start = chrono::steady_clock::now();
            bytesSentSinceStart = 0;
        }
        bytesSent += socket.send_to(asio::buffer(buffer), endpoint);
        bytesSentSinceStart += buffer.size();
        ++packetsSent;
    }
    return 0;
}
I think this is the key:
void restart();
This function must be called prior to any second or later set of invocations of the run(), run_one(), poll() or poll_one() functions when a previous invocation of these functions returned due to the io_context being stopped or running out of work.
So the above code needs to call restart() after every poll() that leaves ioContext.stopped() true, which happens when the ioContext no longer has any pending work attached to it. A minimal sketch follows.
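Here is that fix applied to the polling loop above (a sketch, assuming Boost.Asio's io_context::restart(), the post-rename equivalent of io_service::reset()):

while (ioContext.poll() == 0) {
    std::this_thread::sleep_for(std::chrono::milliseconds(200));
}
// The handler ran and the context ran out of work, so it is now stopped;
// without restart(), poll() in the next loop iteration does nothing.
if (ioContext.stopped()) {
    ioContext.restart();
}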

unordered_map with string in managed_shared_memory fails

This is my code:
// Includes and using-directive inferred from the unqualified string/cout below;
// the original snippet omitted them.
#include <boost/interprocess/managed_shared_memory.hpp>
#include <boost/interprocess/allocators/allocator.hpp>
#include <boost/unordered_map.hpp>
#include <iostream>
#include <string>
using namespace std;

int main (int argc, char *argv[])
{
    typedef int KeyType;
    typedef string MappedType;
    typedef std::pair<KeyType, MappedType> ValueType;
    typedef boost::interprocess::allocator<ValueType, boost::interprocess::managed_shared_memory::segment_manager> ShmAlloc;
    typedef boost::unordered_map<KeyType, MappedType, boost::hash<KeyType>, std::equal_to<KeyType>, ShmAlloc> ShmHashMap;

    boost::interprocess::managed_shared_memory segment(boost::interprocess::open_or_create, "ContainerSharedMemory", 65536);
    if(argc == 2 && string(argv[1]) == "clear")
    {
        boost::interprocess::shared_memory_object::remove("ContainerSharedMemory");
        return 0;
    }
    ShmHashMap *hash_map = segment.find_or_construct<ShmHashMap>(boost::interprocess::unique_instance)(segment.get_segment_manager());
    if(hash_map == NULL)
    {
        cout << "find_or_construct error" << endl;
        return 0;
    }
    for(int i = 0; i < 5; ++i) {
        ShmHashMap::iterator iter = hash_map->find(i);
        if (iter == hash_map->end()) {
            hash_map->insert(ValueType(i, "test"));
        }
    }
    cout << "all..." << endl;
    for(ShmHashMap::iterator iter = hash_map->begin(); iter != hash_map->end(); ++iter)
    {
        cout << iter->first << "|" << iter->second << endl;
    }
    cout << "end..." << endl;
    return 0;
}
Everything is OK when MappedType is int, but with the code as written I get a segmentation fault: rerunning the program to access the hash map in shared memory will coredump.
----------------------------edit again----------------------------------
The problem with string was solved by sehe, thank you.
And if I design a template class to hide that detail, how could I do it? Is there a clean way? (A sketch addressing this follows the answer below.)
template<typename MappedType>
struct ComplexMappedType
{
    ComplexMappedType(): t_access(0), t_expire(0) {}
    ComplexMappedType(const MappedType& v, uint32_t a, uint32_t e): value(v), t_access(a), t_expire(e) {}

    MappedType value;
    uint32_t t_access;
    uint32_t t_expire;
};

template <typename KeyType, typename MappedType>
class MMSHashMap
{
private:
    typedef ComplexMappedType<MappedType> DataType;
    typedef std::pair<KeyType, DataType> ValueType;
    typedef boost::interprocess::allocator<ValueType, boost::interprocess::managed_shared_memory::segment_manager> ShmAlloc;
    typedef boost::unordered_map<KeyType, DataType, boost::hash<KeyType>, std::equal_to<KeyType>, ShmAlloc> ShmHashMap;

public:
    MMSHashMap(const std::string& name, size_t size, float e_thr, float e_scale);
    ~MMSHashMap() { delete pMemorySegment; }

    size_t getMEMSize() { return pMemorySegment->get_size(); }
    size_t getMEMFreeSize() { return pMemorySegment->get_free_memory(); }

    bool get(const KeyType& key, MappedType& value, uint32_t& expire);
    bool set(const KeyType& key, const MappedType& value, uint32_t expire);
    bool del(const KeyType& key);

private:
    void doCapacityElimination();

    std::string _name;
    boost::interprocess::managed_shared_memory* pMemorySegment;
    boost::shared_mutex mutex, mutex_eliminate;
    float fEliminateThreshold, fEliminateScale;
};
Of course. std::string allocates from the heap.
The heap is in your process address space, so any other process reading the same shared memory is going to get a wrong raw pointer there and invoke UB.
You need to use a shared-memory allocator with the strings too.
Live On Coliru (using mapped file for Coliru)
With shared memory:
#include <boost/interprocess/managed_shared_memory.hpp>
#include <boost/interprocess/allocators/allocator.hpp>
#include <boost/container/scoped_allocator.hpp>
#include <boost/container/string.hpp>
#include <boost/unordered_map.hpp>
#include <iostream>

namespace bip = boost::interprocess;

int main (int argc, char *argv[])
{
    typedef int KeyType;
    typedef boost::container::basic_string<char, std::char_traits<char>, bip::allocator<char, bip::managed_shared_memory::segment_manager> > MappedType;
    typedef std::pair<KeyType, MappedType> ValueType;
    typedef boost::interprocess::allocator<ValueType, boost::interprocess::managed_shared_memory::segment_manager> ShmAlloc;
    typedef boost::unordered_map<KeyType, MappedType, boost::hash<KeyType>, std::equal_to<KeyType>, boost::container::scoped_allocator_adaptor<ShmAlloc> > ShmHashMap;

    boost::interprocess::managed_shared_memory segment(boost::interprocess::open_or_create, "ContainerSharedMemory", 65536);
    if(argc == 2 && std::string(argv[1]) == "clear")
    {
        boost::interprocess::shared_memory_object::remove("ContainerSharedMemory");
        return 0;
    }
    ShmHashMap *hash_map = segment.find_or_construct<ShmHashMap>(boost::interprocess::unique_instance)(segment.get_segment_manager());
    if(hash_map == NULL)
    {
        std::cout << "find_or_construct error" << std::endl;
        return 0;
    }
    for(int i = 0; i < 5; ++i) {
        ShmHashMap::iterator iter = hash_map->find(i);
        if (iter == hash_map->end()) {
            hash_map->insert(ValueType(i, MappedType { "hello", segment.get_segment_manager() }));
        }
    }
    std::cout << "all..." << std::endl;
    for(ShmHashMap::iterator iter = hash_map->begin(); iter != hash_map->end(); ++iter)
    {
        std::cout << iter->first << "|" << iter->second << std::endl;
    }
    std::cout << "end..." << std::endl;
}
Prints
all...
4|hello
3|hello
2|hello
1|hello
0|hello
end...
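Regarding the follow-up about hiding the allocator detail inside MMSHashMap: one possible direction (a hypothetical sketch, not from the original answer) is to map the public MappedType to a shared-memory-safe storage type inside the template, converting at the get/set boundary:

namespace bip = boost::interprocess;

// Hypothetical trait mapping the user-facing type to a shm-safe storage type.
typedef bip::allocator<char, bip::managed_shared_memory::segment_manager> ShmCharAlloc;
typedef boost::container::basic_string<char, std::char_traits<char>, ShmCharAlloc> ShmString;

template <typename T> struct ShmStored { typedef T type; };                // ints etc. are fine as-is
template <> struct ShmStored<std::string> { typedef ShmString type; };     // strings need the shm allocator

// Inside MMSHashMap, the stored value type would then be
//     ComplexMappedType<typename ShmStored<MappedType>::type>,
// constructed with segment.get_segment_manager() in set() and converted
// back to MappedType (e.g. via .c_str()) in get().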

boost::variant vs. polymorphism, very different performance results with clang and gcc

I'm trying to figure out how much the execution time of boost::variant differs from a polymorphism approach. In my first test I got very different results on gcc 4.9.1 and clang+llvm 3.5.
You can find the code below. Here are my results:

clang+llvm:
polymorphism: 2.16401
boost::variant: 3.83487

gcc:
polymorphism: 2.46161
boost::variant: 1.33326

I compiled both with -O3.
Is someone able to explain that?
code
#include <iostream>
#include <vector>
#include <algorithm>
#include <memory>   // needed for std::unique_ptr (missing in the original snippet)
#include <boost/variant.hpp>
#include <boost/variant/apply_visitor.hpp>
#include <ctime>

struct value_type {
    value_type() {}
    virtual ~value_type() {}
    virtual void inc() = 0;
};

struct int_type : value_type {
    int_type() : value_type() {}
    virtual ~int_type() {}
    void inc() { value += 1; }
private:
    int value = 0;
};

struct float_type : value_type {
    float_type() : value_type() {}
    virtual ~float_type() {}
    void inc() { value += 1; }
private:
    float value = 0;
};

void dyn_test() {
    std::vector<std::unique_ptr<value_type>> v;
    for (int i = 0; i < 1024; i++) {
        if (i % 2 == 0)
            v.emplace_back(new int_type());
        else
            v.emplace_back(new float_type());
    }
    for (int i = 0; i < 900000; i++) {
        std::for_each(v.begin(), v.end(), [](auto &item) { item->inc(); });
    }
}

struct visitor : boost::static_visitor<> {
    template <typename T> void operator()(T &item) { item += 1; }
};

using mytype = boost::variant<int, float>;

void static_test() {
    std::vector<mytype> v;
    for (int i = 0; i < 1024; i++) {
        if (i % 2 == 0)
            v.emplace_back(0);
        else
            v.emplace_back(0.f);
    }
    visitor vi;
    for (int i = 0; i < 900000; i++) {
        std::for_each(v.begin(), v.end(), boost::apply_visitor(vi));
    }
}

template <typename F> double measure(F f) {
    clock_t start = clock();
    f();
    clock_t end = clock();
    float seconds = (float)(end - start) / CLOCKS_PER_SEC;
    return seconds;
}

int main() {
    std::cout << "polymorphism: " << measure([] { dyn_test(); }) << std::endl;
    std::cout << "boost::variant: " << measure([] { static_test(); }) << std::endl;
    return 0;
}
Assembler output: gcc / clang+llvm
Clang is known to miscompile some std::vector functions from various Standard libraries, due to some edge cases in its inliner. I don't know if those have been fixed by now, but quite possibly not. Since unique_ptr is smaller and simpler than boost::variant, it's more likely that it does not trigger these edge cases.
The code you post is practically "Why boost::variant is great". A dynamic allocation and a random pointer chase on top of the regular indirections that both perform? That's a heavy hit, relatively speaking (the sketch below illustrates the layout difference).
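To make that layout point concrete, a small illustration (a sketch added here, not from the original answer; exact sizes are implementation-dependent):

#include <boost/variant.hpp>
#include <iostream>
#include <memory>

int main() {
    using mytype = boost::variant<int, float>;
    // A vector<mytype> stores each value inline next to its discriminator, in one
    // contiguous allocation. A vector<std::unique_ptr<value_type>> instead pays one
    // heap allocation per element, plus a pointer chase and a vtable lookup on every inc().
    std::cout << "sizeof(boost::variant<int, float>): " << sizeof(mytype) << '\n';
    std::cout << "sizeof(std::unique_ptr<int>):       " << sizeof(std::unique_ptr<int>) << '\n';
}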
