Visual C++ and concurrent threads: why does this happen? - windows

I have to write a program that processes OpenCV Mat images with concurrent threads. Each thread picks an image from a queue, processes it, and puts the result onto another queue. I use a thread-safe queue template (shown in the code below) holding Mat images.
The strange behaviour of the threads is this: if I launch the program multiple times, I get a different number of elaborations per thread each time (the counter "add" that I inserted monitors the number of images processed by each thread).
The first thread (zero) always completes all of its elaborations (in this example, 10), but the rest of the threads don't. Sometimes every thread does 10 elaborations, sometimes 3, sometimes 5 or 2; with the latest changes (CONDITION VARIABLES and CRITICAL SECTIONs), the threads do only 1 operation each.
I don't know where the problem is or why this happens.
I paste my code here and ask you to check it and tell me what, in your opinion, the problem is. I'm desperate.
This is the code:
#include <opencv\cv.h>
#include <opencv\highgui.h>
#include <opencv2\highgui\highgui.hpp>
#include <stdio.h>
#include <stdlib.h>
#include <windows.h>
#include <process.h>
#include <queue>
using namespace std;
using namespace cv;
/* thread-safe queue */
template<typename T>
class coda_concorr
{
private:
    std::queue<T> la_coda;
    HANDLE mutex;
public:
    bool elemento;
    coda_concorr()
    {
        mutex = CreateMutex(NULL, FALSE, NULL);
    }
    ~coda_concorr()
    {}
    void push(T& data)
    {
        WaitForSingleObject(mutex, INFINITE);
        la_coda.push(data);
        ReleaseMutex(mutex);
    }
    bool vuota() const
    {
        WaitForSingleObject(mutex, INFINITE);
        bool RetCode = la_coda.empty();
        ReleaseMutex(mutex);
        return RetCode;
    }
    bool try_pop(T& popped)
    {
        WaitForSingleObject(mutex, INFINITE);
        if (la_coda.empty()){
            ReleaseMutex(mutex);
            return false;
        }
        popped = la_coda.front();
        la_coda.pop();
        ReleaseMutex(mutex);
        return true;
    }
};
struct Args
{
    coda_concorr<cv::Mat> in;
    coda_concorr<cv::Mat> *out; // pointer to the next queue
};
CONDITION_VARIABLE NonVuoto1;
CONDITION_VARIABLE NonVuoto2;
CONDITION_VARIABLE NonVuoto3;
CONDITION_VARIABLE NonVuoto4;
CRITICAL_SECTION Lock1;
CRITICAL_SECTION Lock2;
CRITICAL_SECTION Lock3;
CRITICAL_SECTION Lock4;
bool stop;
// initial queue population: moves images to the first processing stage
void puts (void* param){
    Args* arg = (Args*)param;
    int i = 0;
    Mat image;
    while(!arg->in.vuota()){
        arg->in.try_pop(image);
        arg->out->push(image);
        i++;
        WakeConditionVariable(&NonVuoto1);
    }
    // done
    cout<<endl<<"Thread (PUSH) finished with "<<i<<" elaborations."<<endl;
    WakeConditionVariable(&NonVuoto1);
    _endthread();
}
//grey funct
void grey (void *param){
    Mat temp1, temp2;
    int add = 0;
    Args* arg = (Args*)param;
    while(true){
        EnterCriticalSection(&Lock1);
        // if the queue is empty, wait
        while(arg->in.vuota() && !stop){
            SleepConditionVariableCS(&NonVuoto1, &Lock1, INFINITE);
        }
        if(stop == true){
            LeaveCriticalSection(&Lock1);
            break;
        }
        arg->in.try_pop(temp1);
        cvtColor(temp1, temp2, CV_BGR2GRAY);
        arg->out->push(temp2);
        add++;
        cout<<endl<<"grey did: "<<add<<endl;
        LeaveCriticalSection(&Lock1);
        WakeConditionVariable(&NonVuoto2);
    }
    // done
    cout<<endl<<"Thread (GREY) finished with "<<add<<" elaborations."<<endl;
    _endthread();
}
//threshold funct
void soglia(void *param){
    Mat temp1a, temp2a;
    int add = 0;
    Args* arg = (Args*)param;
    while(true){
        EnterCriticalSection(&Lock2);
        while(arg->in.vuota() && stop == false){
            SleepConditionVariableCS(&NonVuoto2, &Lock2, INFINITE);
        }
        if(stop == true){
            LeaveCriticalSection(&Lock2);
            break;
        }
        arg->in.try_pop(temp1a);
        threshold(temp1a, temp2a, 128, 255, THRESH_BINARY);
        arg->out->push(temp2a);
        add++;
        LeaveCriticalSection(&Lock2);
        WakeConditionVariable(&NonVuoto3);
        cout<<endl<<"soglia did: "<<add<<endl;
    }
    // done
    cout<<endl<<"Thread (SOGLIA) finished with "<<add<<" elaborations."<<endl;
    _endthread();
}
//erode/dilate funct
void finitura(void *param){
    Mat temp1b, temp2b, temp2c;
    int add = 0;
    Args* arg = (Args*)param;
    // as consumer
    while(true){
        EnterCriticalSection(&Lock3);
        while(arg->in.vuota() && stop == false){
            SleepConditionVariableCS(&NonVuoto3, &Lock3, INFINITE);
        }
        if(stop == true){
            LeaveCriticalSection(&Lock3);
            break;
        }
        arg->in.try_pop(temp1b);
        erode(temp1b, temp2b, cv::Mat());
        dilate(temp2b, temp2c, Mat());
        arg->out->push(temp2c);
        add++;
        LeaveCriticalSection(&Lock3);
        WakeConditionVariable(&NonVuoto4);
        cout<<endl<<"erode did: "<<add<<endl;
    }
    // done
    cout<<endl<<"Thread (ERODE) finished with "<<add<<" elaborations."<<endl;
    _endthread();
}
//contour funct
void contorno (void *param){
    Mat temp;
    int add = 0;
    Args* arg = (Args*)param;
    // as consumer
    while(true){
        EnterCriticalSection(&Lock4);
        while(arg->in.vuota() && stop == false){
            SleepConditionVariableCS(&NonVuoto4, &Lock4, INFINITE);
        }
        if(stop == true){
            LeaveCriticalSection(&Lock4);
            break;
        }
        // pop an image
        arg->in.try_pop(temp);
        // find the contours
        vector<vector<Point>> contorni;
        findContours(temp, contorni, CV_RETR_LIST, CV_CHAIN_APPROX_SIMPLE);
        // draw the contours into an image
        Mat dst(temp.size(), CV_8UC3, Scalar(0,0,0));
        Scalar colors[3];
        colors[0] = Scalar(255,0,0);
        colors[1] = Scalar(0,255,0);
        colors[2] = Scalar(0,0,255);
        for (size_t idx = 0; idx < contorni.size(); idx++){
            drawContours(dst, contorni, idx, colors[idx % 3]);
        }
        // as producer
        arg->out->push(dst);
        add++;
        cout<<endl<<"cont did: "<<add<<endl;
        LeaveCriticalSection(&Lock4);
    }
    cout<<endl<<"Thread (CONTOUR) finished with "<<add<<" elaborations."<<endl;
    _endthread();
}
//main
int main()
{
    coda_concorr<cv::Mat> ingresso;
    coda_concorr<cv::Mat> uscita;
    InitializeConditionVariable(&NonVuoto1);
    InitializeConditionVariable(&NonVuoto2);
    InitializeConditionVariable(&NonVuoto3);
    InitializeConditionVariable(&NonVuoto4);
    InitializeCriticalSection(&Lock1);
    InitializeCriticalSection(&Lock2);
    InitializeCriticalSection(&Lock3);
    InitializeCriticalSection(&Lock4);
    LARGE_INTEGER count1, count2, freq;
    double elapsed;
    Mat temp[10];
    Mat out;
    // queue declarations
    Args dati0, dati1, dati2, dati3, dati4;
    // start the performance counters
    QueryPerformanceFrequency(&freq);
    QueryPerformanceCounter(&count1);
    for(int i = 0; i < 10; i++){
        temp[i] = imread("C:/OPENCV/Test/imgtest/bird1.jpg", 1);
        ingresso.push(temp[i]);
    }
    // wire each stage's output to the next stage's input queue
    dati0.in = ingresso;
    dati0.out = &dati1.in;
    dati1.out = &dati2.in;
    dati2.out = &dati3.in;
    dati3.out = &dati4.in;
    dati4.out = &uscita;
    // thread handles
    HANDLE handle0, handle1, handle2, handle3, handle4;
    // start threads
    handle0 = (HANDLE) _beginthread(puts, 0, &dati0);
    handle1 = (HANDLE) _beginthread(grey, 0, &dati1);
    handle2 = (HANDLE) _beginthread(soglia, 0, &dati2);
    handle3 = (HANDLE) _beginthread(finitura, 0, &dati3);
    handle4 = (HANDLE) _beginthread(contorno, 0, &dati4);
    cout<<endl<<"..Joining the threads..."<<endl;
    // join
    WaitForSingleObject(handle0, INFINITE);
    WaitForSingleObject(handle1, INFINITE);
    WaitForSingleObject(handle2, INFINITE);
    WaitForSingleObject(handle3, INFINITE);
    WaitForSingleObject(handle4, INFINITE);
    // stop the performance counters
    QueryPerformanceCounter(&count2);
    CloseHandle(handle0);
    CloseHandle(handle1);
    CloseHandle(handle2);
    CloseHandle(handle3);
    CloseHandle(handle4);
    elapsed = (count2.QuadPart - count1.QuadPart) * 1000.0 / freq.QuadPart;
    cout<<endl<<"Approximate execution time: "<<elapsed<<" ms."<<endl;
    system("PAUSE");
    return 0;
}
If the previous thread puts all of its images on the queue, why doesn't the next thread do the same?
I'm on Windows 7 64-bit with Visual C++ 2010 and OpenCV 2.4.4.
Please help me find where the problem is.

It seems what you want to implement is similar to assembly-line work in a factory, where each worker does his job and tosses it to the next worker until all the work is done. Please correct me if I am wrong. The idiom in each of your thread functions is
void dowork(){
    while(noinput()){
        sleep(0.01);
    }
    while(getSomeInput()){
        processInput();
        queueResult();
    }
    displayAmountOfWorkDone();
}
You are successful in providing mutual exclusion with your mutex. The issue with your design is that once a thread has observed its input queue to be non-empty, it consumes all the available work and then exits. Due to thread scheduling and to the processing time of processInput(), a worker can consume its inputs at a higher rate than the worker before it can produce them. For instance, taking the queue between the puts thread and the grey thread, this is possible:
grey: sees 0, sleeps
puts: sees 10 - adds 1
grey: sees 1, processes it
grey: sees 0, exits and reports 1
puts: adds 2
puts: adds 3
puts: adds 4
....
puts: adds 10
You need to change the design: there should be a different mechanism to signal that the work is over. Right now you are using the amount of inputs that a thread can observe. A quick and dirty fix would be to give each thread the amount of inputs it is expected to process, and rewrite the algorithm as
void dowork(){
    while(workIsnotDone()){ // local test, no lock
        if(getSomeInput()){
            processInput();
            queueResult();
            updateWorkProgress(); // local operation
        }
        else{
            sleep(0.01);
        }
    }
    displayAmountOfWorkDone();
}
A better alternative would be to set up the class coda_concorr as a producer-consumer mechanism. For that you can add a condition variable. Each thread is a consumer on one queue and a producer on another. You can also add a field to explicitly signal something like "no more inputs". Take a look at this other question on SO.
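For instance (a sketch of my own, not tested against your full pipeline), coda_concorr could be reworked along these lines with the same Win32 primitives you already use; the names chiusa, chiudi and wait_pop are assumptions made for the example:
#include <windows.h>
#include <queue>
// Sketch only: a producer-consumer queue with a condition variable
// and an explicit "no more inputs" flag.
template<typename T>
class coda_concorr
{
private:
    std::queue<T> la_coda;
    CRITICAL_SECTION cs;
    CONDITION_VARIABLE non_vuota;
    bool chiusa; // "no more inputs"
public:
    coda_concorr() : chiusa(false)
    {
        InitializeCriticalSection(&cs);
        InitializeConditionVariable(&non_vuota);
    }
    ~coda_concorr() { DeleteCriticalSection(&cs); }
    void push(T& data)
    {
        EnterCriticalSection(&cs);
        la_coda.push(data);
        LeaveCriticalSection(&cs);
        WakeConditionVariable(&non_vuota);
    }
    void chiudi() // producer is done: wake everyone so they can exit
    {
        EnterCriticalSection(&cs);
        chiusa = true;
        LeaveCriticalSection(&cs);
        WakeAllConditionVariable(&non_vuota);
    }
    // Blocks until an element is available; returns false only when
    // the queue has been closed AND fully drained.
    bool wait_pop(T& popped)
    {
        EnterCriticalSection(&cs);
        while (la_coda.empty() && !chiusa) {
            SleepConditionVariableCS(&non_vuota, &cs, INFINITE);
        }
        if (la_coda.empty()) { // closed and drained
            LeaveCriticalSection(&cs);
            return false;
        }
        popped = la_coda.front();
        la_coda.pop();
        LeaveCriticalSection(&cs);
        return true;
    }
};
Each worker then reduces to while (arg->in.wait_pop(img)) { /* process */ arg->out->push(result); } followed by arg->out->chiudi(), and the global stop flag and the per-stage critical sections are no longer needed.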

Related

C++11 std::threads not exiting

Could you please check the following code, which does not exit even after the condition becomes false?
I'm trying to print the numbers 1 to 10 with the first thread, 11 to 20 with the second thread, and so on, with 10 threads in total. Whenever count reaches 100, the program should terminate safely by terminating all threads. But that is not happening: after printing, it gets stuck, and I don't understand why.
Is there any data race? Please guide.
#include<iostream>
#include<vector>
#include<thread>
#include<mutex>
#include<condition_variable>
std::mutex mu;
int count = 1;
bool isDone = true;
std::condition_variable cv;
void Print10(int tid)
{
    std::unique_lock<std::mutex> lock(mu);
    while(isDone){
        cv.wait(lock, [tid](){ return ((count/10) == tid); });
        for(int i = 0; i < 10; i++)
            std::cout << "tid=" << tid << " count=" << count++ << "\n";
        isDone = count < 100; //!(count == (((tid+1)*10)+1));
        std::cout << "tid=" << tid << " isDone=" << isDone << "\n";
        cv.notify_all();
    }
}
int main()
{
    std::vector<std::thread> vec;
    for(int i = 0; i < 10; i++)
    {
        vec.push_back(std::thread(Print10, i));
    }
    for(auto &th : vec)
    {
        if(th.joinable())
            th.join();
    }
}
The problem is that after a thread finishes its batch it loops back into cv.wait, but its predicate count/10 == tid can never become true again, so it blocks forever even after isDone becomes false. I believe the following code, in which each thread runs its batch exactly once and then exits, should work for you:
#include<iostream>
#include<vector>
#include<thread>
#include<mutex>
#include<condition_variable>
using namespace std;
mutex mu;
int count = 1;
bool isDone = true;
condition_variable cv;
void Print10(int tid)
{
    unique_lock<std::mutex> lock(mu);
    // Wait until it is this thread's turn: count/10 == tid
    while(count/10 != tid)
        cv.wait(lock);
    // Core logic
    for(int i = 0; i < 10; i++)
        cout << "tid=" << tid << " count=" << count++ << "\n";
    // Wake all waiting threads; the one whose turn it is proceeds,
    // the others re-check their condition and wait again. notify_one
    // could wake the wrong thread and swallow the notification.
    cv.notify_all();
}
int main()
{
    std::vector<std::thread> vec;
    for(int i = 0; i < 10; i++)
    {
        vec.push_back(std::thread(Print10, i));
    }
    for(auto &th : vec)
    {
        if(th.joinable())
            th.join();
    }
    return 0;
}
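As a side note (my own addition, equivalent to the above): the explicit wait loop can also be written with the predicate overload of condition_variable::wait, which folds the loop in and is harder to get wrong:
void Print10(int tid)
{
    unique_lock<std::mutex> lock(mu);
    // the predicate overload re-checks the condition on every wakeup
    cv.wait(lock, [tid]{ return count/10 == tid; });
    for(int i = 0; i < 10; i++)
        cout << "tid=" << tid << " count=" << count++ << "\n";
    cv.notify_all();
}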

What is the most efficient ZeroMQ polling on a single PUB/SUB socket?

The ZeroMQ documentation mentions zmq_poll as a method for multiplexing multiple sockets on a single thread. Is there any benefit to polling in a thread that simply consumes data from one socket? Or should I just use zmq_recv?
For example:
/* POLLING A SINGLE SOCKET */
while (true) {
    zmq::poll(&items[0], 1, -1);
    if (items[0].revents & ZMQ_POLLIN) {
        int size = zmq_recv(receiver, msg, 255, 0);
        if (size != -1) {
            // do something with msg
        }
    }
}
vs.
/* NO POLLING AND BLOCKING RECV */
while (true) {
    int size = zmq_recv(receiver, msg, 255, 0);
    if (size != -1) {
        // do something with msg
    }
}
Is there ever a situation where the polling version is preferable, or should I only use it for multiplexing? Does polling result in more efficient CPU usage? Does the answer depend on the rate of messages being received?
*** Editing this post to include a toy example ***
The reason for asking this question is that I have observed that I can achieve much higher throughput on my subscriber (by more than an order of magnitude) if I do not poll:
#include <thread>
#include <zmq.hpp>
#include <iostream>
#include <unistd.h>
#include <chrono>
#include <cstring>
#include <cstdint>
using msg_t = char[88];
using timepoint_t = std::chrono::steady_clock::time_point;
using milliseconds = std::chrono::milliseconds;
using microseconds = std::chrono::microseconds;
/* Log stats about how many packets were sent/received */
class SocketStats {
public:
    SocketStats(const std::string& name) : m_socketName(name), m_timePrev(now()) {}
    void update() {
        m_numPackets++;
        timepoint_t timeNow = now();
        if (duration(timeNow, m_timePrev) > m_logIntervalMs) {
            uint64_t packetsPerSec = m_numPackets - m_numPacketsPrev;
            std::cout << m_socketName << " : " << "processed " << packetsPerSec << " packets" << std::endl;
            m_numPacketsPrev = m_numPackets;
            m_timePrev = timeNow;
        }
    }
private:
    timepoint_t now() { return std::chrono::steady_clock::now(); }
    static milliseconds duration(timepoint_t timeNow, timepoint_t timePrev) {
        return std::chrono::duration_cast<milliseconds>(timeNow - timePrev);
    }
    timepoint_t m_timePrev;
    uint64_t m_numPackets = 0;
    uint64_t m_numPacketsPrev = 0;
    milliseconds m_logIntervalMs = milliseconds{1000};
    const std::string m_socketName;
};
/* non-polling subscriber uses blocking receive and no poll */
void startNonPollingSubscriber(){
    SocketStats subStats("NonPollingSubscriber");
    zmq::context_t ctx(1);
    zmq::socket_t sub(ctx, ZMQ_SUB);
    sub.connect("tcp://127.0.0.1:5602");
    sub.setsockopt(ZMQ_SUBSCRIBE, "", 0);
    while (true) {
        zmq::message_t msg;
        bool success = sub.recv(&msg);
        if (success) { subStats.update(); }
    }
}
/* polling subscriber receives messages when available */
void startPollingSubscriber(){
    SocketStats subStats("PollingSubscriber");
    zmq::context_t ctx(1);
    zmq::socket_t sub(ctx, ZMQ_SUB);
    sub.connect("tcp://127.0.0.1:5602");
    sub.setsockopt(ZMQ_SUBSCRIBE, "", 0);
    zmq::pollitem_t items [] = {{static_cast<void*>(sub), 0, ZMQ_POLLIN, 0 }};
    while (true) {
        zmq::message_t msg;
        int rc = zmq::poll(&items[0], 1, -1);
        if (rc < 1) { continue; }
        if (items[0].revents & ZMQ_POLLIN) {
            bool success = sub.recv(&msg, ZMQ_DONTWAIT);
            if (success) { subStats.update(); }
        }
    }
}
void startFastPublisher() {
    SocketStats pubStats("FastPublisher");
    zmq::context_t ctx(1);
    zmq::socket_t pub(ctx, ZMQ_PUB);
    pub.bind("tcp://127.0.0.1:5602");
    while (true) {
        msg_t mymessage;
        zmq::message_t msg(sizeof(msg_t));
        memcpy((char *)msg.data(), (void*)(&mymessage), sizeof(msg_t));
        bool success = pub.send(msg, ZMQ_DONTWAIT);
        if (success) { pubStats.update(); }
    }
}
int main() {
    std::thread t_sub1(startPollingSubscriber);
    sleep(1);
    std::thread t_sub2(startNonPollingSubscriber);
    sleep(1);
    std::thread t_pub(startFastPublisher);
    while(true) {
        sleep(10);
    }
}
Q : "Is there any benefit to polling in a thread that simply consumes data from one socket?"
Oh sure there is.
As a principal promoter of non-blocking designs, I always advocate designing zero-waiting .poll()-s before deciding on .recv()-calls.
Q : "Does polling result in more efficient CPU usage?"
A harder one, yet I love it:
This question is decidable in two distinct manners:
a) read the source code of both the .poll()-method and the .recv()-method, as ported onto your target platform, and guesstimate the costs of calling each versus the other
b) test both use-cases inside your run-time ecosystem and have the hard facts micro-benchmarked in vivo.
Either way, you will see the difference.
What you cannot see at the moment are the add-on costs and other impacts that appear once you try (or are forced) to extend the use-case so as to accommodate other properties not included in either the former or the latter.
Here, my principal preference for using .poll() before deciding further enables priority-based re-ordering of the actual .recv()-calls and other, higher-level decisions, which neither the source code nor the tests could ever decide for you.
Do not hesitate to test first, and if the tests seem inconclusive (on your scale of { low | ultra-low }-latency sensitivity), dig deeper into the source code to see why.
Typically you will get the best performance by draining the socket before doing another poll. In other words, once poll returns, you read until read returns "no more data", and only then call poll again.
For an example, see https://github.com/nyfix/OZ/blob/4627b0364be80de4451bf1a80a26c00d0ba9310f/src/transport.c#L524
There are trade-offs with this approach (mentioned in the code comments), but if you're only reading from a single socket you can ignore them.
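A rough sketch of that drain-then-poll pattern, reusing receiver, msg, and items from the question's first snippet (their setup is assumed):
/* DRAIN THE SOCKET AFTER EACH POLL (sketch) */
while (true) {
    zmq::poll(&items[0], 1, -1);            // block until something arrives
    if (items[0].revents & ZMQ_POLLIN) {
        while (true) {
            /* non-blocking read: keep reading until "no more data" */
            int size = zmq_recv(receiver, msg, 255, ZMQ_DONTWAIT);
            if (size == -1) break;          // EAGAIN: queue drained, poll again
            // do something with msg
        }
    }
}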

std::atomic on struct bit-fields

I'm modifying some existing open-source library and there is a struct (say, named Node) containing bit-fields, e.g.
struct Node {
    std::atomic<uint32_t> size : 30;
    std::atomic<uint32_t> isnull : 1;
};
To fit my needs these fields need to be atomic, so I was expecting to use std::atomic for this, and got a compile-time error:
bit-field 'size' has non-integral type 'std::atomic<uint32_t>'
According to the documentation, there is a restricted set of types which can be used for std::atomic.
Can anyone advise or share an idea on how to get the functionality of atomic fields with minimum impact on the existing source code?
Thanks in advance!
I used an unsigned short in the example below.
This is less than ideal, but you could sacrifice 8 bits and overlay a std::atomic_flag on the bit-field with a union. Note that std::atomic_flag is not a std::atomic<bool>: it only supports test_and_set and clear, so it serves as a lock rather than as a directly readable flag.
This structure can then be spin-locked manually every time you access it. The code should still have minimal performance degradation (unlike creating, locking, unlocking, and destroying a std::mutex and std::unique_lock).
This code may waste about 10-30 clock cycles to enable low-cost multi-threading.
PS. Make sure the reserved 8 bits below are not messed up by the endianness of the processor. You may have to place them at the other end for big-endian processors. I only tested this code on an Intel CPU (always little-endian).
#include <iostream>
#include <atomic>
#include <thread>
union Data
{
    std::atomic_flag access = ATOMIC_FLAG_INIT; // one byte
    struct
    {
        typedef unsigned short ushort;
        ushort reserved : 8; // overlaps the atomic_flag byte
        ushort count : 4;
        ushort ready : 1;
        ushort unused : 3;
    } bits;
};
class SpinLock
{
public:
    inline SpinLock(std::atomic_flag &access, bool locked=true)
        : mAccess(access)
    {
        if(locked) lock();
    }
    inline ~SpinLock()
    {
        unlock();
    }
    inline void lock()
    {
        while (mAccess.test_and_set(std::memory_order_acquire))
        {
        }
    }
    // each attempt will take about 10-30 clock cycles
    inline bool try_lock(unsigned int attempts=0)
    {
        while(mAccess.test_and_set(std::memory_order_acquire))
        {
            if (!attempts) return false;
            --attempts;
        }
        return true;
    }
    inline void unlock()
    {
        mAccess.clear(std::memory_order_release);
    }
private:
    std::atomic_flag &mAccess;
};
void aFn(int &i, Data &d)
{
    SpinLock lock(d.access, false);
    // manually locking/unlocking can be tighter
    lock.lock();
    if (d.bits.ready)
    {
        ++d.bits.count;
    }
    d.bits.ready ^= true; // alternate each time
    lock.unlock();
}
int main(void)
{
    Data f;
    f.bits.count = 0;
    f.bits.ready = true;
    std::thread *p[8];
    for (int i = 0; i < 8; ++i)
    {
        p[i] = new std::thread([&f] (int i) { aFn(i, f); }, i);
    }
    for (int i = 0; i < 8; ++i)
    {
        p[i]->join();
        delete p[i];
    }
    std::cout << "size: " << sizeof(f) << std::endl;
    std::cout << "count: " << f.bits.count << std::endl;
}
The result is as expected...
size: 2
count: 4
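An alternative worth considering (a sketch of my own, not from the answer above): keep a single std::atomic<uint32_t> and pack the fields manually with masks and a compare-exchange loop. Reads stay lock-free, at the cost of touching every place that accesses the fields. The layout mirrors the original Node struct (size:30, isnull:1):
#include <atomic>
#include <cstdint>
struct Node {
    std::atomic<uint32_t> bits{0};
    static constexpr uint32_t SIZE_MASK  = (1u << 30) - 1;
    static constexpr uint32_t ISNULL_BIT = 1u << 30;
    uint32_t size() const { return bits.load(std::memory_order_acquire) & SIZE_MASK; }
    bool isnull() const   { return (bits.load(std::memory_order_acquire) & ISNULL_BIT) != 0; }
    void set_size(uint32_t s) {
        uint32_t old = bits.load(std::memory_order_relaxed);
        // CAS loop: replace the size bits, preserve the isnull bit
        while (!bits.compare_exchange_weak(
                   old, (old & ~SIZE_MASK) | (s & SIZE_MASK),
                   std::memory_order_release, std::memory_order_relaxed)) {}
    }
    void set_isnull(bool n) {
        if (n) bits.fetch_or(ISNULL_BIT, std::memory_order_release);
        else   bits.fetch_and(~ISNULL_BIT, std::memory_order_release);
    }
};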

libtorrent - storage_interface readv explanation

I have implemented a custom storage interface in libtorrent as described in the help section here.
The storage_interface is working fine, although I can't figure out why readv is only called seemingly at random while downloading a torrent. From my understanding, the overridden virtual function readv should get called each time I call handle->read_piece in piece_finished_alert, in order to read the piece for the read_piece_alert.
Instead, the buffer is provided in read_piece_alert without readv being called.
So the question is: why is readv called only sporadically, and why is it not called on a read_piece() call? Is my storage_interface implementation perhaps wrong?
The code looks like this:
struct temp_storage : storage_interface
{
    virtual int readv(file::iovec_t const* bufs, int num_bufs
        , int piece, int offset, int flags, storage_error& ec)
    {
        // Only called on random pieces while downloading a larger torrent
        std::map<int, std::vector<char> >::const_iterator i = m_file_data.find(piece);
        if (i == m_file_data.end()) return 0;
        int available = int(i->second.size()) - offset;
        if (available <= 0) return 0;
        // copy into the first buffer only (simplified; a complete
        // implementation would iterate over all num_bufs iovecs)
        if (available > int(bufs[0].iov_len)) available = int(bufs[0].iov_len);
        memcpy(bufs[0].iov_base, &i->second[offset], available);
        return available;
    }
    virtual int writev(file::iovec_t const* bufs, int num_bufs
        , int piece, int offset, int flags, storage_error& ec)
    {
        // simplified as above: only the first iovec is handled
        std::vector<char>& data = m_file_data[piece];
        int len = int(bufs[0].iov_len);
        if (int(data.size()) < offset + len) data.resize(offset + len);
        std::memcpy(&data[offset], bufs[0].iov_base, len);
        return len;
    }
    virtual bool has_any_file(storage_error& ec) { return false; }
    virtual ...
    virtual ...
}
Initialized with:
storage_interface* temp_storage_constructor(storage_params const& params)
{
    printf("NEW INTERFACE\n");
    return new temp_storage(*params.files);
}
p.storage = &temp_storage_constructor;
The function below sets up alerts and invokes read_piece on each completed piece:
while(true) {
    std::vector<alert*> alerts;
    s.pop_alerts(&alerts);
    for (alert* i : alerts)
    {
        switch (i->type()) {
            case read_piece_alert::alert_type:
            {
                read_piece_alert* p = (read_piece_alert*)i;
                if (p->ec) {
                    // read_piece failed
                    break;
                }
                // the piece buffer and size arrive here without readv
                // having been called after the read_piece issued in
                // piece_finished_alert
                break;
            }
            case piece_finished_alert::alert_type: {
                piece_finished_alert* p = (piece_finished_alert*)i;
                // once the piece is finished, read it back to obtain
                // the buffer in read_piece_alert
                p->handle.read_piece(p->piece_index);
                break;
            }
            default:
                break;
        }
    }
    Sleep(100);
}
I will answer my own question. As Arvid said in the comments, readv was not invoked because of caching: setting settings_pack::use_read_cache to false makes libtorrent invoke readv every time.
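For reference, a minimal sketch of how that setting can be applied (assuming the libtorrent session s from the loop above):
// Disable the read cache so read_piece() always reaches the custom
// storage's readv() instead of being served from libtorrent's cache.
settings_pack sp;
sp.set_bool(settings_pack::use_read_cache, false);
s.apply_settings(sp);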

C++11: How to implement fast, lightweight, and fair synchronized resource access

Question
What can I do to get a locking mechanism that provides minimal and stable latency while guaranteeing that a thread cannot reacquire a resource before another thread has acquired and released it?
The desirability of answers to this question is ranked as follows:
Some combination of built-in C++11 features that work in MinGW on Windows 7 (note that the <thread> and <mutex> libraries do not work with MinGW on a Windows platform)
Some combination of Windows API features
A modification to the FairLock listed below, my own attempt at implementing such a mechanism
Some features provided by a free, open-source library that does not require a .configure/make/make install process (getting that to work in MSYS is more of an adventure than I care for)
Background
I am writing an application which is effectively a multi-stage producer/consumer. One thread generates input consumed by another thread, which produces output consumed by yet another thread. The application uses pairs of buffers so that, after an initial delay, all threads can work nearly simultaneously.
Since I am writing a Windows 7 application, I had been using CriticalSections to guard the buffers. The problem with using CriticalSections (or, as far as I can tell, any other Windows or built-in C++11 synchronization object) is that they make no provision that a thread that has just released a lock cannot reacquire it until another thread has done so first. Because of this, many of my test drivers for the middle thread (the Encoder) never gave the Encoder a chance to acquire the test input buffers, and completed without having tested them. The end result was a ridiculous process of trying to determine an artificial wait time that stochastically worked on my machine.
Since the structure of my application requires that each stage wait for the other stage to have acquired, finished using, and released the necessary buffers before it can use those buffers again, I need, for lack of a better term, a fair locking mechanism. I took a crack at writing one (the source code is provided below). In testing, this FairLock allows my test driver to run my Encoder at the same speed I was able to achieve using the CriticalSection in maybe 60% of the runs. The other 40% of the runs take anywhere between 10 and 100 ms longer, which is not acceptable for my application.
FairLock
// FairLock.hpp
#ifndef FAIRLOCK_HPP
#define FAIRLOCK_HPP
#include <atomic>
using namespace std;
class FairLock {
private:
    atomic_bool owned {false};
    atomic<DWORD> lastOwner {0};
public:
    FairLock(bool owned);
    bool inline hasLock() const;
    bool tryLock();
    void seizeLock();
    void tryRelease();
    void waitForLock();
};
#endif
// FairLock.cpp
#include <windows.h>
#include "FairLock.hpp"
#define ID GetCurrentThreadId()
FairLock::FairLock(bool owned) {
    if (owned) {
        this->owned = true;
        this->lastOwner = ID;
    } else {
        this->owned = false;
        this->lastOwner = 0;
    }
}
bool inline FairLock::hasLock() const {
    return owned && lastOwner == ID;
}
bool FairLock::tryLock() {
    bool success = false;
    DWORD id = ID;
    if (owned) {
        success = lastOwner == id;
    } else if (
        lastOwner != id &&
        owned.compare_exchange_strong(success, true)
    ) {
        lastOwner = id;
        success = true;
    } else {
        success = false;
    }
    return success;
}
void FairLock::seizeLock() {
    bool success = false;
    DWORD id = ID;
    if (!(owned && lastOwner == id)) {
        while (!owned.compare_exchange_strong(success, true)) {
            success = false;
        }
        lastOwner = id;
    }
}
void FairLock::tryRelease() {
    if (hasLock()) {
        owned = false;
    }
}
void FairLock::waitForLock() {
    bool success = false;
    DWORD id = ID;
    if (!(owned && lastOwner == id)) {
        while (lastOwner == id); // spin until another thread takes the lock
        while (!owned.compare_exchange_strong(success, true)) {
            success = false;
        }
        lastOwner = id;
    }
}
EDIT
DO NOT USE THIS FairLock CLASS; IT DOES NOT GUARANTEE MUTUAL EXCLUSION!
I reviewed the above code, comparing it against the text of The C++ Programming Language: 4th Edition (which I had not read carefully) and CouchDeveloper's recommended Synchronous Queue. I realized that there are several sequences in which the thread that just released the FairLock can be tricked into thinking it still owns it. All it takes is interleaving the instructions as follows:
New owner: set owned to true
Old owner: is owned true? yes
Old owner: am I the last owner? yes
New owner: set me as the last owner
At this point, the old and new owners both enter their critical sections.
I am considering whether this problem has a solution and whether it is worth attempting to solve this at all. In the meantime, don't use this unless you see a fix.
I would implement this in C++11 using a condition_variable-per-thread setup so that I could choose exactly which thread to wake up when (Live demo at Coliru):
class FairMutex {
private:
    class waitnode {
        std::condition_variable cv_;
        waitnode* next_ = nullptr;
        FairMutex& fmtx_;
    public:
        waitnode(FairMutex& fmtx) : fmtx_(fmtx) {
            *fmtx.tail_ = this;
            fmtx.tail_ = &next_;
        }
        ~waitnode() {
            for (waitnode** p = &fmtx_.waiters_; *p; p = &(*p)->next_) {
                if (*p == this) {
                    *p = next_;
                    if (!next_) {
                        fmtx_.tail_ = &fmtx_.waiters_;
                    }
                    break;
                }
            }
        }
        void wait(std::unique_lock<std::mutex>& lk) {
            while (fmtx_.held_ || fmtx_.waiters_ != this) {
                cv_.wait(lk);
            }
        }
        void notify() {
            cv_.notify_one();
        }
    };
    waitnode* waiters_ = nullptr;
    waitnode** tail_ = &waiters_;
    std::mutex mtx_;
    bool held_ = false;
public:
    void lock() {
        auto lk = std::unique_lock<std::mutex>{mtx_};
        if (held_ || waiters_) {
            waitnode{*this}.wait(lk);
        }
        held_ = true;
    }
    bool try_lock() {
        if (mtx_.try_lock()) {
            std::lock_guard<std::mutex> lk(mtx_, std::adopt_lock);
            if (!held_ && !waiters_) {
                held_ = true;
                return true;
            }
        }
        return false;
    }
    void unlock() {
        std::lock_guard<std::mutex> lk(mtx_);
        held_ = false;
        if (waiters_ != nullptr) {
            waiters_->notify();
        }
    }
};
FairMutex models the Lockable concept so it can be used like any other standard library mutex type. Put simply, it achieves fairness by inserting waiters into a list in arrival order, and passing the mutex to the first waiter in the list when unlocking.
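For instance (a usage sketch of my own, assuming the FairMutex definition above is in scope):
#include <iostream>
#include <mutex>
#include <thread>
FairMutex fmtx;
void worker(int id) {
    for (int i = 0; i < 3; ++i) {
        std::lock_guard<FairMutex> lk(fmtx); // works because FairMutex is Lockable
        // While one thread holds the lock, any other arrival queues up,
        // so no thread can immediately reacquire past a waiter.
        std::cout << "thread " << id << " in critical section\n";
    }
}
int main() {
    std::thread a(worker, 1), b(worker, 2);
    a.join();
    b.join();
}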
If it's useful:
This demonstrates*) an implementation of a "synchronous queue" using semaphores as synchronization primitives.
Note: the actual implementation uses semaphores implemented with GCD (Grand Central Dispatch):
using gcd::mutex;
using gcd::semaphore;
// A blocking queue in which each put must wait for a get, and vice
// versa. A synchronous queue does not have any internal capacity,
// not even a capacity of one.
template <typename T>
class simple_synchronous_queue {
public:
    typedef T value_type;
    enum result_type {
        OK = 0,
        TIMEOUT_NOT_DELIVERED = -1,
        TIMEOUT_NOT_PICKED = -2,
        TIMEOUT_NOTHING_OFFERED = -3
    };
    simple_synchronous_queue()
        : sync_(0), send_(1), recv_(0)
    {
    }
    void put(const T& v) {
        send_.wait();
        new (address()) T(v);
        recv_.signal();
        sync_.wait();
    }
    result_type put(const T& v, double timeout) {
        if (send_.wait(timeout)) {
            new (address()) T(v);
            recv_.signal();
            if (sync_.wait(timeout)) {
                return OK;
            }
            else {
                return TIMEOUT_NOT_PICKED;
            }
        }
        else {
            return TIMEOUT_NOT_DELIVERED;
        }
    }
    T get() {
        recv_.wait();
        T result = *address();
        address()->~T();
        sync_.signal();
        send_.signal();
        return result;
    }
    std::pair<result_type, T> get(double timeout) {
        if (recv_.wait(timeout)) {
            std::pair<result_type, T> result =
                std::pair<result_type, T>(OK, *address());
            address()->~T();
            sync_.signal();
            send_.signal();
            return result;
        }
        else {
            return std::pair<result_type, T>(TIMEOUT_NOTHING_OFFERED, T());
        }
    }
private:
    using storage_t = typename std::aligned_storage<sizeof(T), std::alignment_of<T>::value>::type;
    T* address() {
        return static_cast<T*>(static_cast<void*>(&storage_));
    }
    storage_t storage_;
    semaphore sync_;
    semaphore send_;
    semaphore recv_;
};
*) "demonstrates": be careful, there are potential issues, it could be improved, etc. ... ;)
I accepted CouchDeveloper's answer since it pointed me down the right path. I wrote a Windows-specific C++11 implementation of a synchronous queue, and I am adding this answer so that others can consider/use it if they so choose.
// SynchronousQueue.hpp
#ifndef SYNCHRONOUSQUEUE_HPP
#define SYNCHRONOUSQUEUE_HPP
#include <atomic>
#include <exception>
#include <windows.h>
using namespace std;
class CouldNotEnterException: public exception {};
class NoPairedCallException: public exception {};
template <typename T>
class SynchronousQueue {
private:
    atomic_bool valueReady {false};
    CRITICAL_SECTION getCriticalSection;
    CRITICAL_SECTION putCriticalSection;
    DWORD wait {0};
    HANDLE getSemaphore;
    HANDLE putSemaphore;
    const T* address {nullptr};
public:
    SynchronousQueue(DWORD waitMS): wait {waitMS}, address {nullptr} {
        InitializeCriticalSection(&getCriticalSection);
        InitializeCriticalSection(&putCriticalSection);
        getSemaphore = CreateSemaphore(nullptr, 0, 1, nullptr);
        putSemaphore = CreateSemaphore(nullptr, 0, 1, nullptr);
    }
    ~SynchronousQueue() {
        EnterCriticalSection(&getCriticalSection);
        EnterCriticalSection(&putCriticalSection);
        CloseHandle(getSemaphore);
        CloseHandle(putSemaphore);
        DeleteCriticalSection(&putCriticalSection);
        DeleteCriticalSection(&getCriticalSection);
    }
    void put(const T& value) {
        if (!TryEnterCriticalSection(&putCriticalSection)) {
            throw CouldNotEnterException();
        }
        ReleaseSemaphore(putSemaphore, (LONG) 1, nullptr);
        if (WaitForSingleObject(getSemaphore, wait) != WAIT_OBJECT_0) {
            if (WaitForSingleObject(putSemaphore, 0) == WAIT_OBJECT_0) {
                LeaveCriticalSection(&putCriticalSection);
                throw NoPairedCallException();
            } else {
                WaitForSingleObject(getSemaphore, 0);
            }
        }
        address = &value;
        valueReady = true;
        while (valueReady);
        LeaveCriticalSection(&putCriticalSection);
    }
    T get() {
        if (!TryEnterCriticalSection(&getCriticalSection)) {
            throw CouldNotEnterException();
        }
        ReleaseSemaphore(getSemaphore, (LONG) 1, nullptr);
        if (WaitForSingleObject(putSemaphore, wait) != WAIT_OBJECT_0) {
            if (WaitForSingleObject(getSemaphore, 0) == WAIT_OBJECT_0) {
                LeaveCriticalSection(&getCriticalSection);
                throw NoPairedCallException();
            } else {
                WaitForSingleObject(putSemaphore, 0);
            }
        }
        while (!valueReady);
        T toReturn = *address;
        valueReady = false;
        LeaveCriticalSection(&getCriticalSection);
        return toReturn;
    }
};
#endif
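A quick usage sketch (my own, hypothetical, not part of the original answer): one producer and one consumer rendezvous on the queue, and every put() must be met by a matching get() within the timeout or NoPairedCallException is thrown.
#include <iostream>
#include <thread>
SynchronousQueue<int> q(5000); // 5000 ms rendezvous timeout
void producer() {
    for (int i = 0; i < 10; ++i) {
        try { q.put(i); }
        catch (const NoPairedCallException&) { /* consumer missed the rendezvous */ }
    }
}
void consumer() {
    for (int i = 0; i < 10; ++i) {
        try { std::cout << q.get() << std::endl; }
        catch (const NoPairedCallException&) { /* producer missed the rendezvous */ }
    }
}
int main() {
    std::thread p(producer), c(consumer);
    p.join();
    c.join();
}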
