Resolution of WaitForSingleObject with timeSetEvent & SetWaitableTimer - winapi

I am using a Win32 multimedia timer to put a delay between the dispatch of large numbers of UDP packets, but I am finding that the resulting delay is substantially longer than it should be. Delays of ~40ms sometimes end up nearer 1000ms, even when using Windows multimedia timers and raising the timer resolution. Below is a simplified version of the code I used:
if( timeGetDevCaps(&tc, sizeof(TIMECAPS)) == TIMERR_NOERROR )
{
    /* Request the finest timer resolution the hardware supports */
    timeRes = min( max(tc.wPeriodMin, 1), tc.wPeriodMax );
    timeBeginPeriod( timeRes );
    printf("Timer Res: %u\n", timeRes);
}
/* ... */
while( ptrHead )
{
    NALU_t *ptrLink = ptrHead;
    unsigned long tsNALU = ptrLink->timestamp - tsFirst;
    printf("Timestamp: %umsec\n", ptrLink->timestamp / 90);
    int idxPort;
    for( idxPort = 0; idxPort < 12; idxPort++ )
    {
        ip4Addr.sin_port = htons( 60000 + idxPort );
        struct sockaddr *saAddr = (struct sockaddr*)&ip4Addr;
        sendto(fdSocket, (char*)ptrLink->ptrData, ptrLink->lenData,
               0, saAddr, lenAddr);
    }
    if( 1 )
    {
        /* timestamps are in 90kHz units -> milliseconds -> 100ns units (negative = relative) */
        unsigned long millis = (tsNALU - tsPrev) / 90;
        valTime.QuadPart = 10000;
        valTime.QuadPart *= millis;
        valTime.QuadPart *= -1;
        if( SetWaitableTimer(hdlTimer, &valTime, 0, NULL, NULL, FALSE) ) /* fResume = FALSE */
            WaitForSingleObject(hdlTimer, INFINITE);
    }
    tsPrev = tsNALU;
    ptrHead = ptrLink->next;
    free( ptrLink );
}
I suspect the problem is that Windows 7 no longer guarantees the resolution of timers when signalled by events as opposed to callbacks, but I am loath to use the latter. Does anyone know why even supposedly high-resolution timers are so wildly inaccurate, even in single-threaded test cases?

If timing is critical, it's best to run in a busy loop (you can give up a timeslice every iteration using Sleep(0) if you want), using the QueryPerformanceCounter() API to measure elapsed time.
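A minimal sketch of that approach, reusing the per-packet millisecond delay computed above (the helper name is illustrative):

#include <windows.h>

/* Busy-wait for the requested number of milliseconds using the
   high-resolution performance counter. */
void spinWaitMillis(unsigned long millis)
{
    LARGE_INTEGER freq, start, now;
    QueryPerformanceFrequency(&freq);              /* counts per second */
    QueryPerformanceCounter(&start);
    LONGLONG target = (freq.QuadPart * millis) / 1000;
    do
    {
        Sleep(0);                                  /* optionally give up the timeslice */
        QueryPerformanceCounter(&now);
    } while ((now.QuadPart - start.QuadPart) < target);
}

Calling spinWaitMillis(millis) in place of the SetWaitableTimer/WaitForSingleObject pair trades CPU usage for much tighter timing.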

From subsequent experiments, my best guess is that Windows moving threads between CPU cores (possibly for load-balancing reasons - this is on a Quad-core i7) is disruptive to timing functions. I used SetThreadAffinityMask() to lock my timing-critical thread to one CPU (and my non-timing threads to all other cores), and that has sorted out the problems.
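For reference, the pinning looks roughly like this (the core indices are illustrative, and hWorkerThread stands in for whatever non-timing thread handle you have):

#include <windows.h>

void pinThreads(HANDLE hWorkerThread)   /* hWorkerThread: any non-timing thread */
{
    /* Keep the timing-critical (current) thread on core 0... */
    SetThreadAffinityMask(GetCurrentThread(), 0x1);

    /* ...and restrict the non-timing worker to cores 1-3. */
    SetThreadAffinityMask(hWorkerThread, 0xE);
}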

Related

Why does the ZeroMQ ROUTER-DEALER pattern have high latency?

Using libzmq 4.2.5 on CentOS 7. I am getting very high latency when messages are sent from DEALER to ROUTER, and even from ROUTER to DEALER. So I wrote a simple client-server program using plain TCP and sent messages between them, just for comparison. TCP appears to be fast.
Sending a single byte from DEALER to ROUTER, zmq takes 900 microseconds.
Sending a single byte from client to server, plain TCP takes 150 microseconds.
What am I doing wrong? I thought zmq would be at least as fast as TCP. Is there any tuning I can do to make zmq faster?
Update
router.cpp
#include <zmq.hpp>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <cerrno>

struct data
{
    char one[21];
    unsigned long two;
};

data *pdata;
std::size_t counter = 0;

int main()
{
    zmq::context_t context(1);
    zmq::socket_t Device(context, ZMQ_ROUTER);
    int iHighWaterMark = 0;
    Device.setsockopt(ZMQ_SNDHWM, &iHighWaterMark, sizeof(int));
    Device.setsockopt(ZMQ_RCVHWM, &iHighWaterMark, sizeof(int));
    Device.bind("tcp://0.0.0.0:5555");
    pdata = new data[10000];
    struct timespec ts_dtime;
    zmq::message_t message;
    zmq::pollitem_t arrPollItems[] = {{Device, 0, ZMQ_POLLIN, 0},
                                      {NULL, 0, ZMQ_POLLIN, 0}};
    while (counter < 10000)
    {
        try
        {
            int iAssert = zmq::poll(arrPollItems, 1, -1);
            if (iAssert <= 0)
            {
                if (-1 == iAssert)
                {
                    printf("zmq_poll failed errno: %d error:%s", errno,
                           zmq_strerror(errno));
                }
                continue;
            }
            if (arrPollItems[0].revents == ZMQ_POLLIN)
            {
                while (true)
                {
                    // First frame: DEALER identity; break when nothing is pending.
                    if (!Device.recv(&message, ZMQ_DONTWAIT))
                        break;
                    // Second frame: payload (the sender's timestamp string).
                    Device.recv(&message);
                    strncpy(pdata[counter].one,
                            (char *)message.data(), message.size());
                    clock_gettime(CLOCK_REALTIME, &ts_dtime);
                    pdata[counter].two = (ts_dtime.tv_sec * 1e9) +
                                         ts_dtime.tv_nsec;
                    ++counter;
                }
            }
        }
        catch (...)
        {
        }
    }
    for (std::size_t i = 0; i < counter; ++i)
        printf("%zu %s %lu\n", i + 1, pdata[i].one, pdata[i].two);
    return 0;
}
dealer.cpp
#include <zmq.hpp>
#include <cstdio>
#include <ctime>
#include <unistd.h>

int main()
{
    zmq::context_t context(1);
    zmq::socket_t Device(context, ZMQ_DEALER);
    int iHighWaterMark = 0;
    Device.setsockopt(ZMQ_SNDHWM, &iHighWaterMark, sizeof(int));
    Device.setsockopt(ZMQ_RCVHWM, &iHighWaterMark, sizeof(int));
    Device.setsockopt(ZMQ_IDENTITY, "TEST", 4);
    Device.connect("tcp://0.0.0.0:5555");
    usleep(100000);                       // give the connection time to establish
    struct timespec ts_dtime;
    unsigned long sec;
    for (std::size_t i = 0; i < 10000; ++i)
    {
        clock_gettime(CLOCK_REALTIME, &ts_dtime);
        sec = (ts_dtime.tv_sec * 1e9) + ts_dtime.tv_nsec;
        zmq::message_t message(21);
        sprintf((char *)message.data(), "%lu", sec);
        Device.send(message);
        usleep(500);                      // ~2000 messages per second
    }
    return 0;
}
update 2:
router.cpp
#include <zmq.hpp>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
int main (int argc, char *argv[])
{
const char *bind_to;
int roundtrip_count;
size_t message_size;
int rc;
int i;
if (argc != 4) {
printf ("usage: local_lat <bind-to> <message-size> "
"<roundtrip-count>\n");
return 1;
}
bind_to = argv[1];
message_size = atoi (argv[2]);
roundtrip_count = atoi (argv[3]);
zmq::context_t ctx(1);
zmq::socket_t s(ctx,ZMQ_ROUTER);
zmq::message_t msg,id;
int iHighWaterMark=0;
s.setsockopt(ZMQ_SNDHWM , &iHighWaterMark,
sizeof (int));
s.setsockopt(ZMQ_RCVHWM , &iHighWaterMark,
sizeof (int));
s.bind( bind_to);
struct timespec ts_dtime;
unsigned long sec;
for (i = 0; i != roundtrip_count; i++) {
rc =s.recv(&id);
if (rc < 0) {
printf ("error in zmq_recvmsg: %s\n", zmq_strerror (errno));
return -1;
}
rc = s.recv(&msg, 0);
if (rc < 0) {
printf ("error in zmq_recvmsg: %s\n", zmq_strerror (errno));
return -1;
}
clock_gettime(CLOCK_REALTIME, &ts_dtime);
sec=((ts_dtime.tv_sec*1e9)+ ts_dtime.tv_nsec);
printf("%.*s %lu\n",20,(char *)msg.data(),sec);
}
s.close();
return 0;
}
dealer.cpp
#include <zmq.hpp>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <time.h>
int main (int argc, char *argv[])
{
const char *connect_to;
int roundtrip_count;
size_t message_size;
int rc;
int i;
void *watch;
unsigned long elapsed;
double latency;
if (argc != 4) {
printf ("usage: remote_lat <connect-to> <message-size> "
"<roundtrip-count>\n");
return 1;
}
connect_to = argv[1];
message_size = atoi (argv[2]);
roundtrip_count = atoi (argv[3]);
zmq::context_t ctx(1);
zmq::socket_t s(ctx,ZMQ_DEALER);
struct timespec ts_dtime;
unsigned long sec;
int iHighWaterMark=0;
s.setsockopt(ZMQ_SNDHWM , &iHighWaterMark,
sizeof (int));
s.setsockopt(ZMQ_RCVHWM , &iHighWaterMark,
sizeof (int));
s.connect(connect_to);
for (i = 0; i != roundtrip_count; i++) {
zmq::message_t msg(message_size+20);
clock_gettime(CLOCK_REALTIME, &ts_dtime);
sec=(ts_dtime.tv_sec*1e9)+ ts_dtime.tv_nsec;
sprintf((char *)msg.data(),"%lu",sec);
rc = s.send(msg);
if (rc < 0) {
printf ("error in zmq_sendmsg: %s\n", zmq_strerror (errno));
return -1;
}
sleep(1);
}
s.close();
return 0;
}
Output:
1562125527489432576 1562125527489773568
1562125528489582848 1562125528489961472
1562125529489740032 1562125529490124032
1562125530489944832 1562125530490288896
1562125531490101760 1562125531490439424
1562125532490261248 1562125532490631680
1562125533490422272 1562125533490798080
1562125534490555648 1562125534490980096
1562125535490745856 1562125535491161856
1562125536490894848 1562125536491245824
1562125537491039232 1562125537491416320
1562125538491229184 1562125538491601152
1562125539491375872 1562125539491764736
1562125540491517184 1562125540491908352
1562125541491657984 1562125541492027392
1562125542491816704 1562125542492193536
1562125543491963136 1562125543492338944
1562125544492103680 1562125544492564992
1562125545492248832 1562125545492675328
1562125546492397312 1562125546492783616
1562125547492543744 1562125547492926720
1562125564495211008 1562125564495629824
1562125565495372032 1562125565495783168
1562125566495515904 1562125566495924224
1562125567495660800 1562125567496006144
1562125568495806464 1562125568496160000
1562125569495896064 1562125569496235520
1562125570496080128 1562125570496547584
1562125571496235008 1562125571496666624
1562125572496391424 1562125572496803584
1562125573496532224 1562125573496935680
1562125574496652800 1562125574497053952
1562125575496843776 1562125575497277184
1562125576496997120 1562125576497417216
1562125577497182208 1562125577497726976
1562125578497336832 1562125578497726464
1562125579497549312 1562125579497928704
1562125580497696512 1562125580498115328
1562125581497847808 1562125581498198528
1562125582497998336 1562125582498340096
1562125583498140160 1562125583498622464
1562125584498295296 1562125584498680832
1562125585498445312 1562125585498842624
1562125586498627328 1562125586499025920
All are in the range of 350-450 µs.
Q1: What am I doing wrong? I thought zmq would be at least as fast as tcp.
Code-wise, nothing.
Performance-wise, ZeroMQ is fantastic, plus it has so many features that raw tcp does not and will not provide right out of the box.
The "sending a single byte..." test setup steps right onto the left edge of high-performance / low-latency messaging.
Let's first understand latency and where it comes from:
The observed latency figures are the product of the overall sum of resource usage (resource allocations + resource-pool management operations + data manipulations), processing effort (everything we try to do with the data, here including the time our task has to spend in a waiting queue while the system scheduler runs multi-tasking work units that are not part of our testing workload but that the operating system has to schedule and execute according to the fair-scheduling policy and the actual process-priority settings), and the communication channel's transport delay (end-to-end transport latency).
Let's next understand what we are trying to compare against:
The difference between the raw Transmission Control Protocol (tcp) and the ZeroMQ framework of smart Scalable Formal Communication Archetypes, with its rich set of high-level, distributed behaviours, is a few galaxies big.
ZeroMQ was designed as a Signalling and Messaging infrastructure, using feature-rich sets of behaviours that match together - often depicted by some human-like behaviour archetype:
One PUSH-es, any number of joined counterparties PULL.
One REQ-ests, someone from a group on the other end of the phone REP-lies.
One, potentially one from some larger group of agents, PUB-lishes; any number of already-subscribed subscribers receive such a SUB-scribed message.
For details, one may kindly read a brief overview of the main conceptual differences in the [ ZeroMQ hierarchy in less than five seconds ] section.
This is nothing a TCP protocol will ever provide on its own.
This is a comfort one likes to pay for with some negligible amount of latency. Negligible? Yes, negligible once compared to the many man*years of ultimate software craftsmanship anyone would have to spend on designing another, at least similarly smart, messaging framework to compete with ZeroMQ.
Q2: Is there any tuning I can do to make zmq faster?
Maybe yes, maybe not.
Update:
- try avoiding Identity management (tcp has no such thing either, so the measured RTTs are the less comparable or meaningful)
- try avoiding the blocking manner of the HWM configurations (tcp has no such thing either)
- you may try to measure the same over a non-tcp protocol (a PAIR/PAIR Formal Scalable Communication Archetype, best over the least complex protocol data-pumps such as inproc://, or ipc:// in case your sandbox test bed still needs to keep distributed non-local copies), so as to see the ZeroMQ context-instance's internal overheads spent on the .send() resp. .receive() methods themselves
- you may try to allow for a slight increase in performance by giving the Context instance more I/O threads to work with (see the sketch after this list)
- other performance-demasking tricks depend on the nature of real-world usage - such as robustness against dropped messages, the feasibility of a conflated mode of operation, better buffer alignment with the O/S, zero-copy tricks - all of some interest here, yet they have to keep the smart ZeroMQ infrastructure of distributed behaviours operational, which is a far more complex task than a trivial serial sequence of otherwise blind and isolated tcp-socket byte-level operations; so comparing times is possible, but comparing an individual dragster-class vehicle with a globally operating infrastructure of distributed behaviour (like Taxify or Uber, named here just to make use of a trivialised (dis-)similarity of roughly the same order of magnitude) yields numbers about phenomena that do not provide similar comfort, scalability of use cases, almost-linear performance scaling or real-world robustness
- you may add more scheduling determinism by hard-wiring the Context instance's respective I/O threads onto respective CPU core(s), so that the overall I/O performance never gets evicted from the CPU schedule and remains deterministically mapped / pre-locked onto even exclusively, administratively dedicated CPU core(s) - whether to go for this ultimate performance hack depends on the level of need and on administrative policies
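A minimal sketch of those last two points, assuming the same cppzmq API used in the question (the thread count and the affinity bitmask are illustrative, not tuned values; ZMQ_AFFINITY maps a socket onto specific I/O threads inside the Context - pinning those threads to physical CPU cores would additionally require OS-level affinity calls):

#include <zmq.hpp>
#include <cstdint>

int main()
{
    // Give the Context instance two I/O threads instead of the default single one.
    zmq::context_t context(2);

    zmq::socket_t Device(context, ZMQ_DEALER);

    // Route this socket's traffic through I/O thread 0 only (bit 0 of the mask).
    uint64_t affinity = 1;
    Device.setsockopt(ZMQ_AFFINITY, &affinity, sizeof(affinity));

    Device.connect("tcp://0.0.0.0:5555");
    return 0;
}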
For any performance related tweaking, one will need to post an MCVE + a fully described benchmark test suite. The ZeroMQ Latency test results report shows:
Conclusion
In a controlled environment RDTSC instruction can be used to measure
time rapidly. This allows us to measure latency/density for individual
messages instead of computing averages for the whole test.
We've used this approach to get performance figures of ØMQ lightweight
messaging kernel (version 0.1) and we've got following results:
-- In low-volume case the latency is almost the same as the latency of the underlying transport (TCP): 50 microseconds.
-- The average jitter of latency is minimal: 0.225 microsecond.
-- The throughput on sender side is 4.8 million messages a second.
-- The density on sender side is mostly about 0.140 microsecond, however, with occasional peaks the mean density is 0.208 microsecond.
-- The throughput on receiver side is 2.7 million messages a second.
-- The density on receiver side is mostly about 0.3 microsecond. Approximately every 100 messages a new batch is received, causing density to grow to 3-6 microseconds. The mean density is 0.367 microsecond.
If in ultimate need of latency shaving, one may try nanomsg, ZeroMQ's younger sister, originated by Martin SUSTRIK, the co-father of ZeroMQ (now maintained, AFAIK, by someone else).

Trigger Countdown with 433 MHz transmission

I would like to use an Arduino to read 433 MHz transmissions from multiple soil-moisture sensors. Since I can never be sure all transmissions reach the receiver, I'd like to start a countdown from the moment the first transmission is received. If another transmission is received, the countdown starts again.
After a defined amount of time (e.g. 10 minutes) without any more signals, or once all signals have been received (e.g. 4 sensors), the receiving unit should stop and come to a decision based on the data it has up to that point.
For transmitting and receiving I am using the <RCSwitch.h> library.
The loop of the receiving unit for one sensor looks like this:
#include <RCSwitch.h>

RCSwitch mySwitch = RCSwitch();

void setup() {
  Serial.begin(9600);
  mySwitch.enableReceive(4);
}

void loop() {
  if (mySwitch.available()) {
    int value = mySwitch.getReceivedValue();
    if (value == 0) {
      lcd.clear();                         // lcd is set up elsewhere in the full sketch
      Serial.print("Unknown encoding");
    }
    else {
      Serial.print(mySwitch.getReceivedValue());
      Serial.print("%");
    }
    mySwitch.resetAvailable();             // ready for the next transmission
  }
}
The full code includes some differentiation mechanism for all the sensors, but I figured that might not be relevant to my question.
Question:
What's the best way to do this without a real-time clock module? As far as I know I can't wait by using delay(...), since then I won't receive any data while the processor is waiting.
You can use millis() as a clock. It returns the number of milliseconds since the arduino started.
#define MINUTES(x) ((x) * 60000UL)

unsigned long countStart = 0;

void loop()
{
  if (/*read from module ok*/)
  {
    countStart = millis();
    // sanity check, since millis() eventually rolls over
    if (countStart == 0)
      countStart = 1;
  }
  if (countStart && ((millis() - countStart) > MINUTES(10)))
  {
    countStart = 0;
    // trigger event
  }
}
Arduino's internal hardware timers can also be used in this situation. For a long time period it's best to use the 16-bit counter (usually Timer1) with the 1024 prescaler (the largest available). If the required time is longer than the timer's maximum interval, a software counter has to be added on top to keep track of, say, a 1-minute interval.
For example, for a 1-second compare-match interrupt that you then count up to one minute, initialize the registers as:
TCCR1A = 0;              // initially set every register to 0
TCCR1B = 0;
TCNT1  = 0;
OCR1A  = 15624;          // compare match register: 16MHz / 1024 prescaler / 1Hz - 1
TCCR1B |= (1 << WGM12);              // CTC (timer compare) mode
TCCR1B |= (1 << CS12) | (1 << CS10); // 1024 prescaler
TIMSK1 |= (1 << OCIE1A);             // enable timer compare interrupt
This configuration gives a compare-match interrupt every second; the ISR TIMER1_COMPA_vect runs on each match, and counting 60 of them gives the 1-minute interval. You can play around with the value of OCR1A for different interrupt periods.
The main advantage of using interrupts is that they don't block any task and are executed essentially instantaneously (as long as interrupts have not been disabled explicitly).
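A minimal sketch of the counting ISR to pair with the register setup above (the 60-tick threshold and the flag name are illustrative):

volatile uint8_t secondsElapsed = 0;
volatile bool minutePassed = false;    // polled from loop()

ISR(TIMER1_COMPA_vect)
{
  if (++secondsElapsed >= 60) {        // sixty 1-second compare matches = 1 minute
    secondsElapsed = 0;
    minutePassed = true;               // signal the main loop; keep the ISR short
  }
}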

How to make a fast context switch from one process to another?

I need to run unsafe native code in a sandbox process, and I need to reduce the bottleneck of the process switch. Both processes (controller and sandbox) share two auto-reset events and a coherent view of a mapped file (shared memory) that is used for communication.
To keep this post small, I removed the initializations from the sample code, but the events are created by the controller, duplicated using DuplicateHandle, and then sent to the sandbox process prior to the work.
Controller source:
void inSandbox(HANDLE hNewRequest, HANDLE hAnswer, volatile int *shared) {
int before = *shared;
for (int i = 0; i < 100000; ++i) {
// Notify sandbox of a new request and wait for answer.
SignalObjectAndWait(hNewRequest, hAnswer, INFINITE, FALSE);
}
assert(*shared == before + 100000);
}
void inProcess(volatile int *shared) {
int before = *shared;
for (int i = 0; i < 100000; ++i) {
newRequest(shared);
}
assert(*shared == before + 100000);
}
void newRequest(volatile int *shared) {
// In this test, the request only increments an int.
(*shared)++;
}
Sandbox source:
void sandboxLoop(HANDLE hNewRequest, HANDLE hAnswer, volatile int *shared) {
// Wait for the first request from controller.
assert(WaitForSingleObject(hNewRequest, INFINITE) == WAIT_OBJECT_0);
for(;;) {
// Perform request.
newRequest(shared);
// Notify controller and wait for next request.
SignalObjectAndWait(hAnswer, hNewRequest, INFINITE, FALSE);
}
}
void newRequest(volatile int *shared) {
// In this test, the request only increments an int.
(*shared)++;
}
Measurements:
inSandbox() - 550ms, ~350k context switches, 42% CPU (25% kernel, 17% user).
inProcess() - 20ms, ~2k context switches, 55% CPU (2% kernel, 53% user).
The machine is Windows 7 Pro, Core 2 Duo P9700 with 8 GB of memory.
An interesting fact is that the sandbox solution uses 42% of the CPU vs. 55% for the in-process solution. Another noteworthy fact is that the sandbox solution performs 350k context switches, which is much more than the 200k context switches we can infer from the source code.
I need to know if there's a way to reduce the overhead of transferring control to another process. I have already tried using pipes instead of events, and it was much worse. I also tried using no event at all, by making the sandbox call SuspendThread(GetCurrentThread()) and making the controller call ResumeThread(hSandboxThread) on every request, but the performance was similar to using events.
If you have a solution that uses assembly (like performing a manual context switch) or Windows Driver Kit, please let me know as well. I don't mind having to install a driver to make this faster.
I heard that Google Native Client does something similar, but I only found this documentation. If you have more information, please let me know.
The first thing to try is raising the priority of the waiting thread. This should reduce the number of extraneous context switches.
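A minimal sketch of that first suggestion (which thread to raise and which priority level to use is workload-dependent; hSandboxThread is the handle already mentioned in the question):

#include <windows.h>

/* Raise the waiting side so it is scheduled as soon as it is signalled;
   call this once after creating/duplicating the thread handle. */
void boostWaiterPriority(HANDLE hSandboxThread)
{
    SetThreadPriority(hSandboxThread, THREAD_PRIORITY_HIGHEST);
}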
Alternatively, since you're on a 2-core system, using spinlocks instead of events will make your code much faster, at the cost of overall system performance and power consumption:
void inSandbox(volatile int *lock, volatile int *shared)
{
int i, before = *shared;
for (i = 0; i < 100000; ++i) {
*lock = 1;
while (*lock != 0) { }
}
assert(*shared == before + 100000);
}
void newRequest(volatile int *shared) {
// In this test, the request only increments an int.
(*shared)++;
}
void sandboxLoop(volatile int *lock, volatile int * shared)
{
for(;;) {
while (*lock != 1) { }
newRequest(shared);
*lock = 0;
}
}
In this scenario, you should probably set thread affinity masks and/or lower the priority of the spinning thread so that it doesn't compete with the busy thread for CPU time.
Ideally, you'd use a hybrid approach. When one side is going to be busy for a while, let the other side wait on an event so that other processes can get some CPU time. You could trigger the event a little ahead of time (using the spinlock to retain synchronization) so that the other thread will be ready when you are.
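A rough sketch of the hybrid idea on the waiting side, assuming the same hNewRequest event and shared lock word as above (the spin budget is arbitrary, and the signalling side would have to both set the lock word and signal the event):

#include <windows.h>

/* Spin briefly in the hope the other side responds immediately;
   fall back to a kernel wait so other processes can get CPU time. */
void waitForRequest(HANDLE hNewRequest, volatile int *lock)
{
    for (int spins = 0; spins < 10000; ++spins) {
        if (*lock == 1)
            return;                    /* request arrived while spinning */
        YieldProcessor();              /* pause hint, eases bus contention */
    }
    WaitForSingleObject(hNewRequest, INFINITE);   /* give up the CPU */
}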

How to put my structure variable into CPU caches to eliminate main memory page access time?

It's clear that there is no explicit way, or certain system calls, that help programmers put a variable into the CPU cache.
But I think that a certain programming style, or a well-designed algorithm, can increase the likelihood that the variable gets cached.
Here is my example:
I want to append an 8-byte structure at the end of an array consisting of the same type of structures, declared in the global main-memory region.
This process is repeated 4 million times. It takes 6 seconds, 1.5 µs per operation. I think this result tells me that the two memory areas have not been cached.
I got some clues from cache-oblivious algorithms, so I tried several ways to improve this. So far, no improvement.
I think some clever code could reduce the elapsed time by a factor of 10 to 100. Please show me the way.
-------------------------------------------------------------------------
Appended (2011-04-01)
Damon, thank you for your comment!
After reading your comment, I analyzed my code again and found several things I had missed. The code attached below is an abbreviated version of my original code.
To accurately measure each operation's execution time (in the original code there are several different types of operations), I inserted time-measuring code using the clock_gettime() function. I thought that if I measured each operation's execution time and accumulated them, the additional cost of the main loop could be avoided.
In the original code, the time-measuring calls were hidden by a macro function, so I completely forgot about them.
The running time of this code is almost 6 seconds, but if I remove the time-measuring calls from the main loop, it becomes 0.1 seconds.
Since the clock_gettime() function supports very high precision (up to 1 nanosecond), is executed on the basis of an independent thread, and also requires a very big structure, I think the function caused the eviction of the main-memory area where the consecutive insertions are performed.
Thank you again for your comment. For further enhancement, any suggestion to optimize my code would be very helpful.
I think the hierarchically defined structure variables might cause unnecessary time cost, but first I want to know how much it would be before I change to more C-style code.
#include <stdio.h>
#include <time.h>

/* uint32, uint, NIL and OP_INS come from the original code base;
   minimal stand-ins so this excerpt compiles. */
typedef unsigned int uint32;
typedef unsigned int uint;
#define NIL    0
#define OP_INS 0

typedef struct t_ptr {
    uint32 isleaf :1, isNextLeaf :1, ptr :30;
    t_ptr(void) {
        isleaf = false;
        isNextLeaf = false;
        ptr = NIL;
    }
} PTR;

typedef struct t_key {
    uint32 op :1, key :31;
    t_key(void) {
        op = OP_INS;
        key = 0;
    }
} KEY;

typedef struct t_key_pair {
    KEY key;
    PTR ptr;
    t_key_pair() {
    }
    t_key_pair(KEY k, PTR p) {
        key = k;
        ptr = p;
    }
} KeyPair;

typedef struct t_op {
    KeyPair keyPair;
    uint seq;
    t_op() {
        seq = 0;
    }
} OP;

#define MAX_OP_LEN 4000000

typedef struct t_opq {
    OP ops[MAX_OP_LEN];
    int freeOffset;
    int globalSeq;
    bool queueOp(register KeyPair keyPair);
} OpQueue;

bool OpQueue::queueOp(register KeyPair keyPair) {
    bool isFull = false;
    if (freeOffset == (int) (MAX_OP_LEN - 1)) {
        isFull = true;
    }
    ops[freeOffset].keyPair = keyPair;
    ops[freeOffset].seq = globalSeq++;
    freeOffset++;
    return isFull;
}

OpQueue opQueue;

int main() {
    struct timespec startTime, endTime, totalTime = { 0, 0 };
    for (int i = 0; i < 4000000; i++) {
        clock_gettime(CLOCK_REALTIME, &startTime);
        opQueue.queueOp(KeyPair());
        clock_gettime(CLOCK_REALTIME, &endTime);
        totalTime.tv_sec  += (endTime.tv_sec - startTime.tv_sec);
        totalTime.tv_nsec += (endTime.tv_nsec - startTime.tv_nsec);
    }
    printf("\n elapsed time: %lld", totalTime.tv_sec * 1000000LL + totalTime.tv_nsec / 1000L);
}
YOU don't put the structure into any cache. The CPU does that automatically for you. The CPU is even cleverer than that: if you access memory sequentially, it will start putting things from memory into the cache before you read them.
And really, it should be common sense that for a simple bit of code like this, the time you spend on measuring is ten times more than the time to perform the code (apparently 60 times in your case).
Since you put so much confidence in clock_gettime(): I suggest you call it five times in a row and store the results, then print the differences. There's resolution, there's precision, and there's how long it takes to return the current time, which is pretty damned long.
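A minimal sketch of that measurement (plain C, using CLOCK_REALTIME as in the code above):

#include <stdio.h>
#include <time.h>

int main(void)
{
    struct timespec t[5];
    for (int i = 0; i < 5; i++)
        clock_gettime(CLOCK_REALTIME, &t[i]);    /* five back-to-back calls */

    for (int i = 1; i < 5; i++)                  /* print the gaps between them */
        printf("delta %d: %ld ns\n", i,
               (t[i].tv_sec - t[i - 1].tv_sec) * 1000000000L +
               (t[i].tv_nsec - t[i - 1].tv_nsec));
    return 0;
}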
I have been unable to force caching, but you can force memory to be uncacheable. If you have other large data structures, you might exclude them so that they will not pollute your caches. This can be done by specifying PAGE_NOCACHE for the Windows VirtualAllocXxx functions.
http://msdn.microsoft.com/en-us/library/windows/desktop/aa366786(v=vs.85).aspx
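A minimal sketch of that idea (the size is illustrative; PAGE_NOCACHE makes accesses to the region bypass the CPU caches, so reserve it for the data you want kept out of them):

#include <windows.h>

int main(void)
{
    /* Commit 16 MB of non-cacheable read/write memory for the large,
       rarely reused structures, so they do not pollute the caches. */
    void *uncached = VirtualAlloc(NULL, 16 * 1024 * 1024,
                                  MEM_RESERVE | MEM_COMMIT,
                                  PAGE_READWRITE | PAGE_NOCACHE);
    if (uncached != NULL) {
        /* ... use the region ... */
        VirtualFree(uncached, 0, MEM_RELEASE);
    }
    return 0;
}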

How can I get the Windows system time with millisecond resolution?

How can I get the Windows system time with millisecond resolution?
If the above is not possible, then how can I get the operating system start time? I would like to use this value together with timeGetTime() in order to compute a system time with millisecond resolution.
Try this article from MSDN Magazine. It's actually quite complicated.
Implement a Continuously Updating, High-Resolution Time Provider for Windows
(archive link)
This is an elaboration of the above comments to explain some of the whys.
First, the GetSystemTime* calls are the only Win32 APIs providing the system's time. This time has a fairly coarse granularity, as most applications do not need the overhead required to maintain a higher resolution. Time is (likely) stored internally as a 64-bit count of milliseconds. Calling timeGetTime gets the low order 32 bits. Calling GetSystemTime, etc requests Windows to return this millisecond time, after converting into days, etc and including the system start time.
There are two time sources in a machine: the CPU's clock and an on-board clock (e.g., real-time clock (RTC), Programmable Interval Timers (PIT), and High Precision Event Timer (HPET)). The first has a resolution of around ~0.5ns (2GHz) and the second is generally programmable down to a period of 1ms (though newer chips (HPET) have higher resolution). Windows uses these periodic ticks to perform certain operations, including updating the system time.
Applications can change this period via timeBeginPeriod; however, this affects the entire system. The OS will check / update regular events at the requested frequency. Under low CPU loads / frequencies, there are idle periods for power savings. At high frequencies, there isn't time to put the processor into low power states. See Timer Resolution for further details. Finally, each tick has some overhead, and increasing the frequency consumes more CPU cycles.
As for higher-resolution time: the system time is not maintained to that accuracy, any more than Big Ben has a second hand. Using QueryPerformanceCounter (QPC) or the CPU's ticks (rdtsc) can provide the resolution between the system-time ticks. Such an approach was used in the MSDN Magazine article Kevin cited. Though these approaches may drift (e.g., due to frequency scaling) and therefore need to be synced to the system time.
In Windows, the base of all time is a function called GetSystemTimeAsFileTime.
It returns a FILETIME structure that is capable of holding a time with 100ns resolution.
It is kept in UTC.
The FILETIME structure records the number of 100ns intervals since January 1, 1601; meaning its resolution is limited to 100ns.
This forms our first function:
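A minimal sketch of such a function - just a thin wrapper over the API, with an illustrative name:

FILETIME GetCurrentTimeAsFileTime()
{
    FILETIME ftNow;
    GetSystemTimeAsFileTime(&ftNow);   // 100ns ticks since January 1, 1601, in UTC
    return ftNow;
}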
A 64-bit number of 100ns ticks since January 1, 1601 is somewhat unwieldy. Windows provides a handy helper function, FileTimeToSystemTime, that can decode this 64-bit integer into useful parts:
typedef struct _SYSTEMTIME {
    WORD wYear;
    WORD wMonth;
    WORD wDayOfWeek;
    WORD wDay;
    WORD wHour;
    WORD wMinute;
    WORD wSecond;
    WORD wMilliseconds;
} SYSTEMTIME;
Notice that SYSTEMTIME has a built-in resolution limitation of 1ms
Now we have a way to go from FILETIME to SYSTEMTIME:
We could write a function to get the current system time as a SYSTEMTIME structure:
SYSTEMTIME GetSystemTimeNow()
{
    // Get the current system time (UTC) in its native 100ns FILETIME format
    FILETIME ftNow;
    GetSystemTimeAsFileTime(&ftNow);

    // Decode the 100ns intervals into a 1ms-resolution SYSTEMTIME
    SYSTEMTIME stNow;
    FileTimeToSystemTime(&ftNow, &stNow);

    return stNow;
}
Except Windows already wrote such a function for you: GetSystemTime
Local, rather than UTC
Now, what if you don't want the current time in UTC, but in your local time? Windows provides a function to convert a FILETIME that is in UTC into your local time: FileTimeToLocalFileTime.
You could write a function that returns you a FILETIME in local time already:
FILETIME GetLocalTimeAsFileTime()
{
    FILETIME ftNow;
    GetSystemTimeAsFileTime(&ftNow);

    // Convert the UTC time to local time
    FILETIME ftNowLocal;
    FileTimeToLocalFileTime(&ftNow, &ftNowLocal);

    return ftNowLocal;
}
And let's say you want to decode the local FILETIME into a SYSTEMTIME. That's no problem; you can use FileTimeToSystemTime again.
Fortunately, Windows already provides a function that returns this value for you directly: GetLocalTime.
Precise
There is another consideration. Before Windows 8, the clock had a resolution of around 15ms. In Windows 8 they improved the clock to 100ns (matching the resolution of FILETIME).
GetSystemTimeAsFileTime (legacy, 15ms resolution)
GetSystemTimePreciseAsFileTime (Windows 8 and later, 100ns resolution)
This means we should always prefer the precise variant when it is available.
You asked for the time
You asked for the time; but you have some choices.
The timezone:
- UTC (system native)
- Local timezone
The format:
- FILETIME (system native, 100ns resolution)
- SYSTEMTIME (decoded, 1ms resolution)
Summary
- 100ns resolution: FILETIME
  - UTC: GetSystemTimePreciseAsFileTime (or GetSystemTimeAsFileTime)
  - Local: (roll your own)
- 1ms resolution: SYSTEMTIME
  - UTC: GetSystemTime
  - Local: GetLocalTime
GetTickCount will not get it done for you.
Look into QueryPerformanceFrequency / QueryPerformanceCounter. The only gotcha here is CPU scaling though, so do your research.
Starting with Windows 8, Microsoft introduced the new API call GetSystemTimePreciseAsFileTime.
Unfortunately you can't use that if you create software which must also run on older operating systems.
My current solution is as follows, but be aware: the determined time is not exact, it is only close to the real time. The result should always be less than or equal to the real time, but with a fixed error (unless the computer went to standby). The result has millisecond resolution. For my purpose it is accurate enough.
void GetHighResolutionSystemTime(SYSTEMTIME* pst)
{
static LARGE_INTEGER uFrequency = { 0 };
static LARGE_INTEGER uInitialCount;
static LARGE_INTEGER uInitialTime;
static bool bNoHighResolution = false;
if(!bNoHighResolution && uFrequency.QuadPart == 0)
{
// Initialize performance counter to system time mapping
bNoHighResolution = !QueryPerformanceFrequency(&uFrequency);
if(!bNoHighResolution)
{
FILETIME ftOld, ftInitial;
GetSystemTimeAsFileTime(&ftOld);
do
{
GetSystemTimeAsFileTime(&ftInitial);
QueryPerformanceCounter(&uInitialCount);
} while(ftOld.dwHighDateTime == ftInitial.dwHighDateTime && ftOld.dwLowDateTime == ftInitial.dwLowDateTime);
uInitialTime.LowPart = ftInitial.dwLowDateTime;
uInitialTime.HighPart = ftInitial.dwHighDateTime;
}
}
if(bNoHighResolution)
{
GetSystemTime(pst);
}
else
{
LARGE_INTEGER uNow, uSystemTime;
{
FILETIME ftTemp;
GetSystemTimeAsFileTime(&ftTemp);
uSystemTime.LowPart = ftTemp.dwLowDateTime;
uSystemTime.HighPart = ftTemp.dwHighDateTime;
}
QueryPerformanceCounter(&uNow);
LARGE_INTEGER uCurrentTime;
uCurrentTime.QuadPart = uInitialTime.QuadPart + (uNow.QuadPart - uInitialCount.QuadPart) * 10000000 / uFrequency.QuadPart;
if(uCurrentTime.QuadPart < uSystemTime.QuadPart || (uCurrentTime.QuadPart - uSystemTime.QuadPart) > 1000000)
{
// The performance counter has been frozen (e. g. after standby on laptops)
// -> Use current system time and determine the high performance time the next time we need it
uFrequency.QuadPart = 0;
uCurrentTime = uSystemTime;
}
FILETIME ftCurrent;
ftCurrent.dwLowDateTime = uCurrentTime.LowPart;
ftCurrent.dwHighDateTime = uCurrentTime.HighPart;
FileTimeToSystemTime(&ftCurrent, pst);
}
}
GetSystemTimeAsFileTime gives the best precision of any Win32 function for absolute time. QPF/QPC as Joel Clark suggested will give better relative time.
Since we all come here for quick snippets instead of boring explanations, I'll write one:
FILETIME t;
GetSystemTimeAsFileTime(&t); // unusable as is
ULARGE_INTEGER i;
i.LowPart = t.dwLowDateTime;
i.HighPart = t.dwHighDateTime;
int64_t ticks_since_1601 = i.QuadPart; // now usable
int64_t us_since_1601 = (i.QuadPart / 10);
int64_t ms_since_1601 = (i.QuadPart / 10000);
int64_t sec_since_1601 = (i.QuadPart / 10000000);
// unix epoch
int64_t unix_us = (i.QuadPart / 10) - 11644473600LL * 1000000;
int64_t unix_ms = (i.QuadPart / 10000) - 11644473600LL * 1000;
double unix_sec = (i.QuadPart * 1e-7) - 11644473600LL;
// i.QuadPart is # of 100ns ticks since 1601-01-01T00:00:00Z
// difference to Unix Epoch is 11644473600 seconds (attention to units!)
No idea why the drifting performance-counter-based answers were upvoted; don't write slippage bugs, guys.
QueryPerformanceCounter() is built for fine-grained timer resolution.
It is the highest resolution timer that the system has to offer that you can use in your application code to identify performance bottlenecks
Here is a simple implementation for C# devs:
[DllImport("kernel32.dll")]
extern static short QueryPerformanceCounter(ref long x);
[DllImport("kernel32.dll")]
extern static short QueryPerformanceFrequency(ref long x);
private long m_endTime;
private long m_startTime;
private long m_frequency;
public Form1()
{
InitializeComponent();
}
public void Begin()
{
QueryPerformanceCounter(ref m_startTime);
}
public void End()
{
QueryPerformanceCounter(ref m_endTime);
}
private void button1_Click(object sender, EventArgs e)
{
QueryPerformanceFrequency(ref m_frequency);
Begin();
for (long i = 0; i < 1000; i++) ;
End();
MessageBox.Show((m_endTime - m_startTime).ToString());
}
If you are a C/C++ dev, then take a look here: How to use the QueryPerformanceCounter function to time code in Visual C++
Well, this one is very old, yet there is another useful function in the Windows C runtime, _ftime, which returns a structure with the local time as time_t, milliseconds, timezone, and daylight saving time flag.
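A small sketch of its use (the secure variant _ftime_s is shown; the struct _timeb fields come from <sys/timeb.h>):

#include <stdio.h>
#include <sys/timeb.h>

int main(void)
{
    struct _timeb tb;
    _ftime_s(&tb);                     /* or _ftime(&tb) on older CRTs */
    printf("seconds since epoch: %lld, milliseconds: %hu\n",
           (long long)tb.time, tb.millitm);
    return 0;
}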
In C11 and above (or C++17 and above) you can use timespec_get() to get time with higher precision portably
#include <stdio.h>
#include <time.h>
int main(void)
{
struct timespec ts;
timespec_get(&ts, TIME_UTC);
char buff[100];
strftime(buff, sizeof buff, "%D %T", gmtime(&ts.tv_sec));
printf("Current time: %s.%09ld UTC\n", buff, ts.tv_nsec);
}
If you're using C++ then since C++11 you can use std::chrono::high_resolution_clock, std::chrono::system_clock (wall clock), or std::chrono::steady_clock (monotonic clock) in the new <chrono> header. No need to use Windows-specific APIs anymore
auto start1 = std::chrono::high_resolution_clock::now();
auto start2 = std::chrono::system_clock::now();
auto start3 = std::chrono::steady_clock::now();
// do some work
auto end1 = std::chrono::high_resolution_clock::now();
auto end2 = std::chrono::system_clock::now();
auto end3 = std::chrono::steady_clock::now();
std::chrono::duration<double, std::milli> diff1 = end1 - start1;
std::chrono::duration<double, std::milli> diff2 = end2 - start2;
auto diff3 = std::chrono::duration_cast<std::chrono::milliseconds>(end3 - start3);
std::cout << diff1.count() << ' ' << diff2.count() << ' ' << diff3.count() << '\n';
