Why does the ZeroMQ ROUTER-DEALER pattern have high latency? - c++11

Using libzmq 4.2.5 on CentOS 7. I am getting very high latency when messages are sent from DEALER to ROUTER, and even from ROUTER to DEALER. So, for comparison, I wrote a simple client-server program using raw tcp and sent messages between them. Tcp turns out to be fast.
Sending a single byte from DEALER to ROUTER, zmq takes 900 microseconds.
Sending a single byte from client to server, tcp takes 150 microseconds.
What am I doing wrong? I thought zmq would be at least as fast as tcp. Is there any tuning I can do to make zmq faster?
Update
router.cpp
#include <zmq.hpp>
#include <cstdio>
#include <cstring>
#include <ctime>

struct data
{
    char          one[21];
    unsigned long two;
};

data *pdata;
std::size_t counter = 0;

int main()
{
    zmq::context_t context( 1 );
    zmq::socket_t  Device( context, ZMQ_ROUTER );
    int iHighWaterMark = 0;                          // 0 == no HWM limit
    Device.setsockopt( ZMQ_SNDHWM, &iHighWaterMark, sizeof( int ) );
    Device.setsockopt( ZMQ_RCVHWM, &iHighWaterMark, sizeof( int ) );
    Device.bind( "tcp://0.0.0.0:5555" );

    pdata = new data[10000];
    struct timespec ts_dtime;
    zmq::message_t  message;
    zmq::pollitem_t arrPollItems[] = { { Device, 0, ZMQ_POLLIN, 0 },
                                       { NULL,   0, ZMQ_POLLIN, 0 } };
    while ( counter < 10000 )
    {
        try
        {
            int iAssert = zmq::poll( arrPollItems, 1, -1 );
            if ( iAssert <= 0 )
            {
                if ( -1 == iAssert )
                    printf( "zmq_poll failed errno: %d error:%s", errno,
                            zmq_strerror( errno ) );
                continue;
            }
            if ( arrPollItems[0].revents == ZMQ_POLLIN )
            {
                while ( true )
                {
                    // 1st frame: the DEALER's identity, prepended by ROUTER
                    if ( !Device.recv( &message, ZMQ_DONTWAIT ) )
                        break;
                    // 2nd frame: the payload
                    Device.recv( &message );
                    strncpy( pdata[counter].one,
                             (char *)message.data(), message.size() );
                    clock_gettime( CLOCK_REALTIME, &ts_dtime );
                    // integer arithmetic, to avoid the precision loss of
                    // tv_sec * 1e9 going through a double
                    pdata[counter].two = ts_dtime.tv_sec * 1000000000UL
                                       + ts_dtime.tv_nsec;
                    ++counter;
                }
            }
        }
        catch ( ... )
        {
        }
    }
    for ( std::size_t i = 0; i < counter; ++i )
        printf( "%zu %s %lu\n", i + 1, pdata[i].one, pdata[i].two );
    return 0;
}
dealer.cpp
#include <zmq.hpp>
#include <unistd.h>
#include <cstdio>
#include <ctime>

int main()
{
    zmq::context_t context( 1 );
    zmq::socket_t  Device( context, ZMQ_DEALER );
    int iHighWaterMark = 0;
    Device.setsockopt( ZMQ_SNDHWM, &iHighWaterMark, sizeof( int ) );
    Device.setsockopt( ZMQ_RCVHWM, &iHighWaterMark, sizeof( int ) );
    Device.setsockopt( ZMQ_IDENTITY, "TEST", 4 );
    Device.connect( "tcp://0.0.0.0:5555" );
    usleep( 100000 );                        // let the connection establish

    struct timespec ts_dtime;
    unsigned long   sec;
    for ( std::size_t i = 0; i < 10000; ++i )
    {
        clock_gettime( CLOCK_REALTIME, &ts_dtime );
        sec = ts_dtime.tv_sec * 1000000000UL + ts_dtime.tv_nsec;
        zmq::message_t message( 21 );
        sprintf( (char *)message.data(), "%lu", sec );
        Device.send( message );
        usleep( 500 );
    }
    return 0;
}
Update 2:
router.cpp
#include <zmq.hpp>
#include <stdio.h>
#include <stdlib.h>

int main (int argc, char *argv[])
{
    const char *bind_to;
    int    roundtrip_count;
    size_t message_size;
    int    i;

    if (argc != 4) {
        printf ("usage: local_lat <bind-to> <message-size> "
                "<roundtrip-count>\n");
        return 1;
    }
    bind_to         = argv[1];
    message_size    = atoi (argv[2]);
    roundtrip_count = atoi (argv[3]);

    zmq::context_t ctx (1);
    zmq::socket_t  s (ctx, ZMQ_ROUTER);
    zmq::message_t msg, id;
    int iHighWaterMark = 0;
    s.setsockopt (ZMQ_SNDHWM, &iHighWaterMark, sizeof (int));
    s.setsockopt (ZMQ_RCVHWM, &iHighWaterMark, sizeof (int));
    s.bind (bind_to);

    struct timespec ts_dtime;
    unsigned long   sec;
    for (i = 0; i != roundtrip_count; i++) {
        // note: the cppzmq recv() wrapper returns bool, not an rc < 0
        if (!s.recv (&id)) {                /* identity frame */
            printf ("error in zmq_recvmsg: %s\n", zmq_strerror (errno));
            return -1;
        }
        if (!s.recv (&msg, 0)) {            /* payload frame */
            printf ("error in zmq_recvmsg: %s\n", zmq_strerror (errno));
            return -1;
        }
        clock_gettime (CLOCK_REALTIME, &ts_dtime);
        sec = ts_dtime.tv_sec * 1000000000UL + ts_dtime.tv_nsec;
        printf ("%.*s %lu\n", 20, (char *) msg.data (), sec);
    }
    s.close ();
    return 0;
}
dealer.cpp
#include <zmq.hpp>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main (int argc, char *argv[])
{
    const char *connect_to;
    int    roundtrip_count;
    size_t message_size;
    int    i;

    if (argc != 4) {
        printf ("usage: remote_lat <connect-to> <message-size> "
                "<roundtrip-count>\n");
        return 1;
    }
    connect_to      = argv[1];
    message_size    = atoi (argv[2]);
    roundtrip_count = atoi (argv[3]);

    zmq::context_t ctx (1);
    zmq::socket_t  s (ctx, ZMQ_DEALER);
    struct timespec ts_dtime;
    unsigned long   sec;
    int iHighWaterMark = 0;
    s.setsockopt (ZMQ_SNDHWM, &iHighWaterMark, sizeof (int));
    s.setsockopt (ZMQ_RCVHWM, &iHighWaterMark, sizeof (int));
    s.connect (connect_to);

    for (i = 0; i != roundtrip_count; i++) {
        zmq::message_t msg (message_size + 20);
        clock_gettime (CLOCK_REALTIME, &ts_dtime);
        sec = ts_dtime.tv_sec * 1000000000UL + ts_dtime.tv_nsec;
        sprintf ((char *) msg.data (), "%lu", sec);
        if (!s.send (msg)) {                /* cppzmq send() returns bool */
            printf ("error in zmq_sendmsg: %s\n", zmq_strerror (errno));
            return -1;
        }
        sleep (1);                          /* one message per second */
    }
    s.close ();
    return 0;
}
Output:
1562125527489432576 1562125527489773568
1562125528489582848 1562125528489961472
1562125529489740032 1562125529490124032
1562125530489944832 1562125530490288896
1562125531490101760 1562125531490439424
1562125532490261248 1562125532490631680
1562125533490422272 1562125533490798080
1562125534490555648 1562125534490980096
1562125535490745856 1562125535491161856
1562125536490894848 1562125536491245824
1562125537491039232 1562125537491416320
1562125538491229184 1562125538491601152
1562125539491375872 1562125539491764736
1562125540491517184 1562125540491908352
1562125541491657984 1562125541492027392
1562125542491816704 1562125542492193536
1562125543491963136 1562125543492338944
1562125544492103680 1562125544492564992
1562125545492248832 1562125545492675328
1562125546492397312 1562125546492783616
1562125547492543744 1562125547492926720
1562125564495211008 1562125564495629824
1562125565495372032 1562125565495783168
1562125566495515904 1562125566495924224
1562125567495660800 1562125567496006144
1562125568495806464 1562125568496160000
1562125569495896064 1562125569496235520
1562125570496080128 1562125570496547584
1562125571496235008 1562125571496666624
1562125572496391424 1562125572496803584
1562125573496532224 1562125573496935680
1562125574496652800 1562125574497053952
1562125575496843776 1562125575497277184
1562125576496997120 1562125576497417216
1562125577497182208 1562125577497726976
1562125578497336832 1562125578497726464
1562125579497549312 1562125579497928704
1562125580497696512 1562125580498115328
1562125581497847808 1562125581498198528
1562125582497998336 1562125582498340096
1562125583498140160 1562125583498622464
1562125584498295296 1562125584498680832
1562125585498445312 1562125585498842624
1562125586498627328 1562125586499025920
All differences are in the range of 350-450 us.

Q1: What am I doing wrong? I thought zmq would be at least as fast as tcp.
Code-wise, nothing.
Performance-wise, ZeroMQ is fantastic, plus it has so many features that tcp does not and will not provide right out of the box:
The test setup of "Sending a single byte..." steps right onto the left edge of high-performance / low-latency messaging.
Let's first understand the latency, and where it comes from:
The observed latency figures are the product of the overall sum of resource usage ( resource allocations + resource-pool management operations + data manipulations ), processing efforts ( everything we try to do with the data, here including the time our task had to spend in a waiting queue, because the system scheduler plans multi-tasking workunits that are not from our testing workload but that the operating system has to schedule and execute, according to the fair-scheduling policy and the actual process-priority settings ), and communication-channel transport delays ( the comms E2E transport latency ).
Let's next understand what we are trying to compare it with:
The difference between the raw Transmission Control Protocol ( tcp ) and the ZeroMQ ( zmq ) framework of smart Scalable Formal Communication Archetypes, with its rich set of high-level, distributed behaviours, is about a few galaxies big.
ZeroMQ was designed rather as a Signalling and Messaging infrastructure, using feature-rich sets of behaviours that match together, often depicted by some human-like behaviour archetype:
One PUSH-es, any number of joined counterparties PULL.
One REQ-ests, someone from a group on the other end of the phone REP-lies.
One, potentially even one from some larger group of agents, PUB-lishes; any number of already subscribed subscribers receive such a SUB-scribed message.
For details, one may kindly read a brief overview of the main conceptual differences in the [ ZeroMQ hierarchy in less than five seconds ] section.
This is nothing the TCP protocol will ever provide on its own.
This is a comfort one likes to pay for with some negligible amount of latency. Negligible? Yes, negligible once compared to the many man*years of ultimate software craftsmanship anyone would have to spend on designing another, at least similarly smart, messaging framework to compete with ZeroMQ.
Q2: Is there any tuning I can do to make zmq faster?
Maybe yes, maybe not.
Update:
- Try avoiding the Identity management ( tcp has no such thing either, so the measured RTTs are the less comparable or meaningful with it on ).
- Try avoiding the blocking manner of the HWM configurations ( tcp has no such thing either ).
- You may try to measure the same over a non-tcp protocol ( a PAIR/PAIR Formal Scalable Communication Archetype, best over the least complex protocol data-pumps such as inproc://, or ipc:// in case your SandBox test bed still needs to keep distributed, non-local copies etc. ), so as to isolate the ZeroMQ context instance's internal overheads spent on the .send() resp. .receive() methods.
- You may try to allow for a slight increase in performance by using more threads available for the Context instance ( see the sketch right after this list ).
( Other performance-demasking tricks depend on the nature of the real-world usage: robustness to dropped messages, the feasibility of a conflated mode of operations, better buffer alignment with the O/S, zero-copy tricks. All of these are of some interest here, yet they have to keep the smart ZeroMQ infrastructure of distributed behaviours operational, which is a by-far more complex task to execute than a trivial serial sequence of otherwise blind and isolated tcp-socket byte-level operations. So comparing times is possible, but comparing an individual draconic dragster-class car ( well, better a vehicle, not even a car ) with something like a globally operating infrastructure of distributed behaviour ( like Taxify or Uber, named here just to make use of a trivialised (dis-)similarity of approximately the same scale of magnitude ) leaves the numbers reporting phenomena that do not provide the same comfort, scalability of use-cases, almost-linear performance scaling and robustness of real-world use. )
- You may add more scheduling determinism by hard-wiring the Context instance's respective IO-threads onto respective CPU core(s), so that the overall I/O performance never gets evicted from the CPU schedule and remains deterministically mapped / pre-locked onto even exclusively, administratively dedicated CPU core(s). Whether to try this ultimate performance hack depends on the level of need and on administrative policies.
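As a minimal sketch of the last two items ( assuming the same cppzmq binding as used above; ZMQ_IO_THREADS sizes the context's I/O-thread pool and ZMQ_AFFINITY is a documented per-socket bitmap in libzmq 4.2.x, whereas the actual pinning of those threads onto CPU cores would additionally need O/S-level tools, e.g. taskset or pthread_setaffinity_np ):
#include <zmq.hpp>
#include <cstdint>

int main()
{
    zmq::context_t context( 2 );               // 2 I/O threads instead of the default 1

    zmq::socket_t  Device( context, ZMQ_DEALER );

    uint64_t affinity = 2;                     // bitmap: bit 1 set -> route this socket's
    Device.setsockopt( ZMQ_AFFINITY,           //         traffic via I/O thread #1 only
                       &affinity, sizeof( affinity ) );

    Device.connect( "tcp://127.0.0.1:5555" );
    // ... the rest of the latency test as above ...
    return 0;
}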
For any performance related tweaking, one will need to post an MCVE + a fully described benchmark test suite. The ZeroMQ Latency test results report shows:
Conclusion
In a controlled environment RDTSC instruction can be used to measure
time rapidly. This allows us to measure latency/density for individual
messages instead of computing averages for the whole test.
We've used this approach to get performance figures of ØMQ lightweight
messaging kernel (version 0.1) and we've got following results:
-- In low-volume case the latency is almost the same as the latency of the underlying transport (TCP): 50 microseconds.
-- The average jitter of latency is minimal: 0.225 microsecond.
-- The throughput on sender side is 4.8 million messages a second.
-- The density on sender side is mostly about 0.140 microsecond, however, with occasional peaks the mean density is 0.208 microsecond.
-- The throughput on receiver side is 2.7 million messages a second.
-- The density on receiver side is mostly about 0.3 microsecond. Approximately each 100 messages new batch is received causing density
to grow to 3-6 microseconds. The mean density is 0.367 microsecond.
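For completeness, a minimal sketch of such RDTSC-based timing ( an assumption-laden example: an x86 CPU with an invariant TSC and a GCC/Clang toolchain; the serialising fences a rigorous benchmark needs are omitted for brevity, and clock_gettime( CLOCK_MONOTONIC, ... ) remains the portable alternative ):
#include <x86intrin.h>   // __rdtsc()
#include <cstdint>
#include <cstdio>

int main()
{
    uint64_t t0 = __rdtsc();
    // ... the code section under measurement ...
    uint64_t t1 = __rdtsc();
    printf( "elapsed: %llu TSC ticks\n",
            (unsigned long long)( t1 - t0 ) );
    return 0;
}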
If in ultimate need of latency shaving, one may try nanomsg, ZeroMQ's younger sister, originated by Martin SUSTRIK, the co-father of ZeroMQ ( now maintained, afaik, by someone else ).

Related

An OpenCL code in MQL5 does not get distributed jobs to each GPU core

I have created a GPU-based indicator for the MetaTrader Terminal platform, using OpenCL and MQL5.
I have tried hard to get my [ MetaTrader Terminal: Strategy Tester ] optimization job transferred onto the GPU as much as possible. Most of the calculations are done by the indicator, hence I made changes in the indicator and transferred it completely onto the GPU.
But the real issue arises when I go for the optimization process in the Strategy Tester section.
The process I see uses both my GPU and CPU, but there is no effect on the complete process.
I suspect that the process is not getting distributed to each GPU core for processing; instead, all the GPU cores are working on the same process or function.
Kindly let me know what I need to do to get a single GPU core working on a single function execution, to give faster output.
Here is my code link attached: Complete code with Expert
The kernel of my code is:
__kernel void calSMA( int limit, int rates_total,
                      __global double *price, __global double *ExtLineBuffer,
                      int InpMAPeriod )
{
    int len = get_global_id( 2 );
    for ( int i = limit; i < rates_total; i++ )
        ExtLineBuffer[len + i] = ExtLineBuffer[len + i - 1]
                               + ( price[len + i] - price[len + i - InpMAPeriod] )
                               / InpMAPeriod;
}

__kernel void calcSMALoop( int begin, int limit,
                           __global double *price, __global double *firstValue,
                           int InpMAPeriod )
{
    int i, len = get_global_id( 2 );
    for ( i = begin; i < limit; i++ )
        firstValue[len] += price[i];
    firstValue[len] /= InpMAPeriod;
}

__kernel void calcEMA( int begin, int limit,
                       __global double *price, __global double *ExtLineBuffer,
                       double SmoothFactor )
{
    int len = get_global_id( 2 );
    for ( int i = begin; i < limit; i++ )
        ExtLineBuffer[len + i] = price[len + i] * SmoothFactor
                               + ExtLineBuffer[len + i - 1] * ( 1.0 - SmoothFactor );
}

__kernel void calcSSMA( int limit, int rates_total,
                        __global double *price, __global double *ExtLineBuffer,
                        int InpMAPeriod )
{
    int len = get_global_id( 2 );
    for ( int i = limit; i < rates_total; i++ )
        ExtLineBuffer[len + i] = ( ExtLineBuffer[len + i - 1] * ( InpMAPeriod - 1 )
                                 + price[len + i] ) / InpMAPeriod;
}

__kernel void calcLWMALoop( int begin, int limit,
                            __global double *price, __global double *firstValue,
                            int weightsum, __global int *weightreturn )
{
    weightsum = 0;
    int len = get_global_id( 2 );
    for ( int i = begin; i < limit; i++ )
    {
        weightsum       += ( i - begin + 1 );
        firstValue[len] += ( i - begin + 1 ) * price[i];
    }
    firstValue[len] /= (double)weightsum;
    weightreturn[0]  = weightsum;
}

//__global int counter = 0;
double returnCalculation( int InpMAPeriod, double price, int j )
{
    return ( ( InpMAPeriod - j ) * price );
}

__kernel void calcLWMA( int limit, int rates_total,
                        __global double *price, __global double *ExtLineBuffer,
                        int InpMAPeriod, int weightsum )
{
    int len = get_global_id( 2 );
    for ( int i = limit; i < rates_total; i++ )
    {
        double sum = 0;
        for ( int j = 0; j < InpMAPeriod; j++ )
            sum += returnCalculation( InpMAPeriod, price[len + i - j], j );
        ExtLineBuffer[len + i] = sum / weightsum;
    }
}
Please suggest a way to distribute the function with different values or frames in MQL5, using the GPU on OpenCL.
EDITED
It's a great challenge for the challenge seekers... Even I am eager to know whether anything can be done with OpenCL and MQL5 for the optimization task. I hope I will get answers for what I am seeking.
EDITED AGAIN: the MAGPU.mqh file
#include "CHECKMA.mq5"
#define CUDA_CORE 2
int Execute_SMA(
const double &price[],
int rates_total,
int limit
)
{
int cl_mem = CLBufferCreate(cl_ctx,ArraySize(price)*sizeof(double),CL_MEM_READ_WRITE),
cl_price = CLBufferCreate(cl_ctx,ArraySize(price)*sizeof(double),CL_MEM_READ_WRITE);
Check_Memory_Initialization(cl_mem, cl_price, cl_CommonKernel1, "Execute_SMA function error");
if(!CLSetKernelArgMem(cl_CommonKernel1,2,cl_price))
Print("Input Bufer Not Set");
//else Print("Input Buffer Set");
if(!CLSetKernelArgMem(cl_CommonKernel1,3,cl_mem))
Print("Output Bufer Not Set");
//else Print("Output Buffer Set");
if(!CLBufferWrite(cl_price, price))
Print("Could not copy Input buffer");
//else Print("Copied: ",cl_price);
if(!CLBufferWrite(cl_mem, ExtLineBuffer))
Print("Could not copy Input buffer");
//else Print("Copied: ",cl_mem);
//else Print("Input Buffer Copied");
if(!CLSetKernelArg(cl_CommonKernel1,0,limit))
Print("Could Not Set Arg 0");
//else Print("Set Arg 0");
if(!CLSetKernelArg(cl_CommonKernel1,1,rates_total))
Print("Could Not Set Arg 1");
//else Print("Set Arg 1");
//if(!CLSetKernelArg(cl_CommonKernel1,4,previous_value))
//Print("Could Not Set Arg2");
//else Print("Set Arg 2");
if(!CLSetKernelArg(cl_CommonKernel1,4,InpMAPeriod))
Print("Could Not Set Arg3: ",GetLastError());
//Print(CLGetInfoInteger(cl_ctx,CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS));
if(!CLExecute(cl_CommonKernel1,CUDA_CORE,offset,work))
Print("Kernel not executed",GetLastError());
//else Print("Executing Now!");
//if(CLExecutionStatus(cl_krn) == 0) Print("Completed");
//if(CLExecutionStatus(cl_krn) == 1) Print("CL_RUNNING");
//if(CLExecutionStatus(cl_krn) == 2) Print("CL_SUBMITTED");
//if(CLExecutionStatus(cl_krn) == 3) Print("CL_QUEUED");
//if(CLExecutionStatus(cl_krn) == -1)Print("Error Occurred:", GetLastError());
//if(!CLExecutionStatus(cl_krn))
//Print(CLExecutionStatus(cl_krn));
if(!CLBufferRead(cl_mem,ExtLineBuffer))
Print("Buffer Copy Nothing: ", GetLastError());
CLBufferFree(cl_price);
CLBufferFree(cl_mem);
return(1);
}
double ExecuteLoop(
int begin,
int limit,
const double &price[]
)
{
int cl_mem = CLBufferCreate(cl_ctx,sizeof(double),CL_MEM_READ_WRITE),
cl_price = CLBufferCreate(cl_ctx,ArraySize(price)*sizeof(double),CL_MEM_READ_WRITE);
double temp[];
ArrayResize(temp,1);
temp[0] = 0;
Check_Memory_Initialization(cl_mem, cl_price, cl_CommonKernel2, "ExecuteLoop function error");
if(!CLSetKernelArgMem(cl_CommonKernel2,2,cl_price))
Print("Input Bufer Not Set 2");
if(!CLSetKernelArgMem(cl_CommonKernel2,3,cl_mem))
Print("Output Bufer Not Set 2");
if(!CLBufferWrite(cl_price, price))
Print("Could not copy Input buffer 2");
if(!CLSetKernelArg(cl_CommonKernel2,0,begin))
Print("Could Not Set Arg 0");
if(!CLSetKernelArg(cl_CommonKernel2,1,limit))
Print("Could Not Set Arg 1");
if(!CLSetKernelArg(cl_CommonKernel2,4,InpMAPeriod))
Print("Could Not Set Arg3: ",GetLastError());
if(!CLExecute(cl_CommonKernel2,CUDA_CORE,offset,work))
Print("Kernel not executed",GetLastError());
if(!CLBufferRead(cl_mem,temp))
Print("Buffer Copy Nothing: ", GetLastError());
CLBufferFree(cl_price);
CLBufferFree(cl_mem);
return(temp[0]);
}
int ExecuteEMA(int begin, int limit, const double &price[], double SmoothFactor)
{
int cl_mem = CLBufferCreate(cl_ctx,ArraySize(price)*sizeof(double),CL_MEM_READ_WRITE),
cl_price = CLBufferCreate(cl_ctx,ArraySize(price)*sizeof(double),CL_MEM_READ_WRITE);
Check_Memory_Initialization(cl_mem, cl_price, cl_CommonKernel1, "ExecuteEMA function error");
if(!CLSetKernelArgMem(cl_CommonKernel1,2,cl_price))
Print("Input Bufer Not Set");
if(!CLSetKernelArgMem(cl_CommonKernel1,3,cl_mem))
Print("Output Bufer Not Set");
if(!CLBufferWrite(cl_price, price))
Print("Could not copy Input buffer");
if(!CLBufferWrite(cl_mem, ExtLineBuffer))
Print("Could not copy Input buffer");
if(!CLSetKernelArg(cl_CommonKernel1,0,begin))
Print("Could Not Set Arg 0");
if(!CLSetKernelArg(cl_CommonKernel1,1,limit))
Print("Could Not Set Arg 1");
if(!CLSetKernelArg(cl_CommonKernel1,4,SmoothFactor))
Print("Could Not Set Arg3: ",GetLastError());
if(!CLExecute(cl_CommonKernel1,CUDA_CORE,offset,work))
Print("Kernel not executed",GetLastError());
if(!CLBufferRead(cl_mem,ExtLineBuffer))
Print("Buffer Copy Nothing: ", GetLastError());
CLBufferFree(cl_price);
CLBufferFree(cl_mem);
return(1);
}
int Execute_SSMA(
const double &price[],
int rates_total,
int limit
)
{
int cl_mem = CLBufferCreate(cl_ctx,ArraySize(price)*sizeof(double),CL_MEM_READ_WRITE),
cl_price = CLBufferCreate(cl_ctx,ArraySize(price)*sizeof(double),CL_MEM_READ_WRITE);
Check_Memory_Initialization(cl_mem, cl_price, cl_CommonKernel1, "Execute_SSMA function error");
if(!CLSetKernelArgMem(cl_CommonKernel1,2,cl_price))
Print("Input Bufer Not Set");
if(!CLSetKernelArgMem(cl_CommonKernel1,3,cl_mem))
Print("Output Bufer Not Set");
if(!CLBufferWrite(cl_price, price))
Print("Could not copy Input buffer");
if(!CLBufferWrite(cl_mem, ExtLineBuffer))
Print("Could not copy Input buffer");
//
//else Print("Input Buffer Copied");
if(!CLSetKernelArg(cl_CommonKernel1,0,limit))
Print("Could Not Set Arg 0");
if(!CLSetKernelArg(cl_CommonKernel1,1,rates_total))
Print("Could Not Set Arg 1");
if(!CLSetKernelArg(cl_CommonKernel1,4,InpMAPeriod))
Print("Could Not Set Arg3: ",GetLastError());
if(!CLExecute(cl_CommonKernel1,CUDA_CORE,offset,work))
Print("Kernel not executed",GetLastError());
if(!CLBufferRead(cl_mem,ExtLineBuffer))
Print("Buffer Copy Nothing: ", GetLastError());
CLBufferFree(cl_price);
CLBufferFree(cl_mem);
return(1);
}
double ExecuteLWMALoop(
int begin,
int limit,
const double &price[],
int weightsumlocal
)
{
int cl_mem = CLBufferCreate(cl_ctx,sizeof(double),CL_MEM_READ_WRITE),
cl_price = CLBufferCreate(cl_ctx,ArraySize(price)*sizeof(double),CL_MEM_READ_WRITE),
cl_weightsumlocal = CLBufferCreate(cl_ctx,sizeof(int),CL_MEM_READ_WRITE);
double temp[];
int weight[];
ArrayResize(temp,1);
ArrayResize(weight,1);
weight[0] = 0;
temp[0] = 0;
Check_Memory_Initialization(cl_mem, cl_price, cl_CommonKernel2, "ExecuteLWMALoop function error");
if(!CLSetKernelArgMem(cl_CommonKernel2,2,cl_price))
Print("Input Bufer Not Set 2");
if(!CLSetKernelArgMem(cl_CommonKernel2,3,cl_mem))
Print("Output Bufer Not Set 2");
if(!CLSetKernelArgMem(cl_CommonKernel2,5,cl_weightsumlocal))
Print("Output Bufer Not Set 2");
if(!CLBufferWrite(cl_price, price))
Print("Could not copy Input buffer 2");
if(!CLSetKernelArg(cl_CommonKernel2,0,begin))
Print("Could Not Set Arg 0");
if(!CLSetKernelArg(cl_CommonKernel2,1,limit))
Print("Could Not Set Arg 1");
if(!CLSetKernelArg(cl_CommonKernel2,4,weightsumlocal))
Print("Could Not Set Arg3: ",GetLastError());
if(!CLExecute(cl_CommonKernel2,CUDA_CORE,offset,work))
Print("Kernel not executed",GetLastError());
if(!CLBufferRead(cl_mem,temp))
Print("Buffer Copy Nothing: ", GetLastError());
if(!CLBufferRead(cl_weightsumlocal,weight))
Print("Buffer Copy Nothing: ", GetLastError());
weightsum = weight[0];
CLBufferFree(cl_weightsumlocal);
CLBufferFree(cl_price);
CLBufferFree(cl_mem);
return(temp[0]);
}
int Execute_LWMA(const double &price[], int rates_total, int limit, int weightsum1)
{
int cl_mem = CLBufferCreate(cl_ctx,ArraySize(price)*sizeof(double),CL_MEM_READ_WRITE),
cl_price = CLBufferCreate(cl_ctx,ArraySize(price)*sizeof(double),CL_MEM_READ_WRITE);
Check_Memory_Initialization(cl_mem, cl_price, cl_CommonKernel1, "Execute_SSMA function error");
if(!CLSetKernelArgMem(cl_CommonKernel1,2,cl_price))
Print("Input Bufer Not Set");
if(!CLSetKernelArgMem(cl_CommonKernel1,3,cl_mem))
Print("Output Bufer Not Set");
if(!CLBufferWrite(cl_price, price))
Print("Could not copy Input buffer");
if(!CLBufferWrite(cl_mem, ExtLineBuffer))
Print("Could not copy Input buffer");
//else Print("Input Buffer Copied");
if(!CLSetKernelArg(cl_CommonKernel1,0,limit))
Print("Could Not Set Arg 0");
if(!CLSetKernelArg(cl_CommonKernel1,1,rates_total))
Print("Could Not Set Arg 1");
if(!CLSetKernelArg(cl_CommonKernel1,4,InpMAPeriod))
Print("Could Not Set Arg4: ",GetLastError());
if(!CLSetKernelArg(cl_CommonKernel1,5,weightsum1))
Print("Could Not Set Arg5: ",GetLastError());
if(!CLExecute(cl_CommonKernel1,CUDA_CORE,offset,work))
Print("Kernel not executed",GetLastError());
if(!CLBufferRead(cl_mem,ExtLineBuffer))
Print("Buffer Copy Nothing: ", GetLastError());
CLBufferFree(cl_price);
CLBufferFree(cl_mem);
return(1);
}
void checkKernel(int cl_kernel, string var_name)
{
if(cl_kernel==INVALID_HANDLE )
{
CLProgramFree(cl_prg);
CLContextFree(cl_ctx);
Print("OpenCL kernel create failed: ERR_OPENCL_INVALID_HANDLE ", var_name);
return;
}
if(cl_kernel==ERR_INVALID_PARAMETER )
{
CLProgramFree(cl_prg);
CLContextFree(cl_ctx);
Print("OpenCL kernel create failed: ERR_INVALID_PARAMETER ", var_name);
return;
}
if(cl_kernel==ERR_OPENCL_TOO_LONG_KERNEL_NAME )
{
CLProgramFree(cl_prg);
CLContextFree(cl_ctx);
Print("OpenCL kernel create failed: ERR_OPENCL_TOO_LONG_KERNEL_NAME ", var_name);
return;
}
if(cl_kernel==ERR_OPENCL_KERNEL_CREATE )
{
CLProgramFree(cl_prg);
CLContextFree(cl_ctx);
Print("OpenCL kernel create failed 1: ERR_OPENCL_KERNEL_CREATE ", var_name);
return;
}
}
int Check_Memory_Initialization(int cl_mem, int cl_price, int cl_ker, string name_process_call)
{
if(cl_mem==INVALID_HANDLE)
{
CLKernelFree(cl_ker);
CLProgramFree(cl_prg);
CLContextFree(cl_ctx);
Print("OpenCL buffer create failed: cl_mem INVALID_HANDLE: ", name_process_call);
return(0);
}
if(cl_mem==ERR_NOT_ENOUGH_MEMORY )
{
CLKernelFree(cl_ker);
CLProgramFree(cl_prg);
CLContextFree(cl_ctx);
Print("OpenCL buffer create failed: cl_mem ERR_NOT_ENOUGH_MEMORY: ", name_process_call);
return(0);
}
if(cl_mem==ERR_OPENCL_BUFFER_CREATE )
{
CLKernelFree(cl_ker);
CLProgramFree(cl_prg);
CLContextFree(cl_ctx);
Print("OpenCL buffer create failed: cl_mem ERR_OPENCL_BUFFER_CREATE: ", name_process_call);
return(0);
}
if(cl_price==INVALID_HANDLE)
{
CLKernelFree(cl_ker);
CLProgramFree(cl_prg);
CLContextFree(cl_ctx);
CLBufferFree(cl_mem);
Print("OpenCL buffer create failed: cl_price: ", name_process_call);
return(0);
}
if(cl_price==ERR_NOT_ENOUGH_MEMORY)
{
CLKernelFree(cl_ker);
CLProgramFree(cl_prg);
CLContextFree(cl_ctx);
CLBufferFree(cl_mem);
Print("OpenCL buffer create failed: cl_price ERR_NOT_ENOUGH_MEMORY: ", name_process_call);
return(0);
}
if(cl_price==ERR_OPENCL_BUFFER_CREATE)
{
CLKernelFree(cl_ker);
CLProgramFree(cl_prg);
CLContextFree(cl_ctx);
CLBufferFree(cl_mem);
Print("OpenCL buffer create failed: cl_price ERR_OPENCL_BUFFER_CREATE: ", name_process_call);
return(0);
}
return(1);
}
MAIN INDICATOR FILE: CHECKMA.mq5
#resource "program_MA_GPU.cl" as string cl_program
#include "MAGPU.mqh"
#property indicator_chart_window
#property indicator_buffers 1
#property indicator_plots   1
#property indicator_type1   DRAW_LINE
#property indicator_color1  Red

input int            InpMAPeriod=13;        // Period
input int            InpMAShift =0;         // Shift
input ENUM_MA_METHOD InpMAMethod=MODE_SMA;  // Method
//--- indicator buffers
double ExtLineBuffer[];
int    offset[CUDA_CORE],work[CUDA_CORE];   //={0,19,38,57,76,95,114,123};
string str;
int    cl_ctx,cl_prg,cl_CommonKernel1,cl_CommonKernel2;
static int weightsum;

void CalculateSimpleMA(int rates_total,int prev_calculated,int begin,const double &price[])
  {
   int limit;
   if(prev_calculated==0)
     {
      limit=InpMAPeriod+begin;
      ArrayFill(ExtLineBuffer,0,limit-1,0.0);
      ExtLineBuffer[limit-1]=ExecuteLoop(begin,limit,price);
     }
   else limit=prev_calculated-ArraySize(price)+InpMAPeriod+17;
   Execute_SMA(price,rates_total,limit);
  }

void CalculateEMA(int rates_total,int prev_calculated,int begin,const double &price[])
  {
   int limit;
   double SmoothFactor=2.0/(1.0+InpMAPeriod);
   if(prev_calculated==0)
     {
      limit=InpMAPeriod+begin;
      ExtLineBuffer[begin]=price[begin];
      ExecuteEMA(begin+1,limit,price,SmoothFactor);
     }
   else limit=prev_calculated;
   ExecuteEMA(begin+99900,limit,price,SmoothFactor);
  }

void CalculateLWMA(int rates_total,int prev_calculated,int begin,const double &price[])
  {
   int limit;
   if(prev_calculated==0)
     {
      weightsum=0;
      limit=InpMAPeriod+begin;
      //--- set empty value for first limit bars
      ArrayFill(ExtLineBuffer,0,limit,0.0);
      //--- calculate first visible value
      ExtLineBuffer[limit-1]=ExecuteLWMALoop(begin,limit,price,weightsum);
     }
   else limit=prev_calculated-ArraySize(price)+InpMAPeriod+17;
   //--- main loop
   Execute_LWMA(price,rates_total,limit,weightsum);
  }

void CalculateSmoothedMA(int rates_total,int prev_calculated,int begin,const double &price[])
  {
   int limit;
   //--- first calculation or number of bars was changed
   if(prev_calculated==0)
     {
      limit=InpMAPeriod+begin;
      //--- set empty value for first limit bars
      ArrayFill(ExtLineBuffer,0,limit-1,0.0);
      ExtLineBuffer[limit-1]=ExecuteLoop(begin,limit,price);
     }
   else limit=prev_calculated-ArraySize(price)+InpMAPeriod+17;
   Execute_SSMA(price,rates_total,limit);
   //---
  }

void OnInit()
  {
   //--- indicator buffers mapping
   SetIndexBuffer(0,ExtLineBuffer,INDICATOR_DATA);
   //--- set accuracy
   IndicatorSetInteger(INDICATOR_DIGITS,_Digits+1);
   //--- sets first bar from what index will be drawn
   PlotIndexSetInteger(0,PLOT_DRAW_BEGIN,InpMAPeriod);
   //---- line shifts when drawing
   PlotIndexSetInteger(0,PLOT_SHIFT,InpMAShift);
   //--- name for DataWindow
   //---- sets drawing line empty value--
   PlotIndexSetDouble(0,PLOT_EMPTY_VALUE,0.0);
   //---- initialization done
   cl_ctx=CLContextCreate(CL_USE_GPU_ONLY);
   // note: cl_ctx is validated only below, after CLProgramCreate() has already used it
   cl_prg=CLProgramCreate(cl_ctx,cl_program,str);
   if(cl_ctx==INVALID_HANDLE)
     {
      Print("OpenCL not found: ",GetLastError());
      return;
     }
   if(cl_prg==INVALID_HANDLE)
     {
      CLContextFree(cl_ctx);
      Print("OpenCL program create failed: ",str);
      return;
     }
   if(cl_prg==ERR_INVALID_PARAMETER)
     {
      CLContextFree(cl_ctx);
      Print("OpenCL program create failed: ",str);
      return;
     }
   if(cl_prg==ERR_NOT_ENOUGH_MEMORY)
     {
      CLContextFree(cl_ctx);
      Print("OpenCL program create failed: ",str);
      return;
     }
   if(cl_prg==ERR_OPENCL_PROGRAM_CREATE)
     {
      CLContextFree(cl_ctx);
      Print("OpenCL program create failed: ",str);
      return;
     }
   int c=1;
   ArrayFill(work,0,CUDA_CORE,c);
   //ArrayInitialize(offset,0);
   int enter=-c;
   for(int i=0;i<CUDA_CORE;i++)
     {
      offset[i]=enter+c;
      enter=offset[i];
     }
   switch(InpMAMethod)
     {
      case MODE_SMA:  cl_CommonKernel1=CLKernelCreate(cl_prg,"calSMA");
                      checkKernel(cl_CommonKernel1,"cl_CommonKernel1 SMA");
                      cl_CommonKernel2=CLKernelCreate(cl_prg,"calcSMALoop");
                      checkKernel(cl_CommonKernel2,"cl_CommonKernel2 SMA");
                      break;
      case MODE_EMA:  cl_CommonKernel1=CLKernelCreate(cl_prg,"calcEMA");
                      checkKernel(cl_CommonKernel1,"cl_CommonKernel1 EMA");
                      break;
      case MODE_LWMA: cl_CommonKernel1=CLKernelCreate(cl_prg,"calcLWMA");
                      checkKernel(cl_CommonKernel1,"cl_CommonKernel1 LWMA");
                      cl_CommonKernel2=CLKernelCreate(cl_prg,"calcLWMALoop");
                      checkKernel(cl_CommonKernel2,"cl_CommonKernel2 LWMA");
                      break;
      case MODE_SMMA: cl_CommonKernel1=CLKernelCreate(cl_prg,"calcSSMA");
                      checkKernel(cl_CommonKernel1,"cl_CommonKernel1 SSMA");
                      cl_CommonKernel2=CLKernelCreate(cl_prg,"calcSMALoop");
                      checkKernel(cl_CommonKernel2,"cl_CommonKernel2 SSMA");
                      break;
     }
  }

int OnCalculate(const int rates_total,
                const int prev_calculated,
                const int begin,
                const double &price[])
  {
   if(rates_total<InpMAPeriod-1+begin)
      return(0);
   if(prev_calculated==0)
      ArrayInitialize(ExtLineBuffer,0);
   PlotIndexSetInteger(0,PLOT_DRAW_BEGIN,InpMAPeriod-1+begin);
   switch(InpMAMethod)
     {
      case MODE_EMA:  CalculateEMA(rates_total,prev_calculated,begin,price);        break;
      case MODE_LWMA: CalculateLWMA(rates_total,prev_calculated,begin,price);       break;
      case MODE_SMMA: CalculateSmoothedMA(rates_total,prev_calculated,begin,price); break;
      case MODE_SMA:  CalculateSimpleMA(rates_total,prev_calculated,begin,price);   break;
     }
   //--- return value of prev_calculated for next call
   return(rates_total);
  }

void OnDeinit(const int reason)
  {
   CLKernelFree(cl_CommonKernel1);
   CLKernelFree(cl_CommonKernel2);
   CLProgramFree(cl_prg);
   CLContextFree(cl_ctx);
  }
Help me write this code in a proper way, so that my process becomes faster and gives proper results with my GPU.
Facts first: since April 2nd you already know that an MQL5 CustomIndicator will not work this way ...
Facts matter: given that the MQL4/5 code-execution architecture has explicitly documented that there is no place for any extended-latency / async / blocking operation(s) to ever be put inside any MQL4/5 CustomIndicator code-execution unit, there might be a reasonable time to stop all these S/O Community members' attacks and to start simply respecting the documented & published fact.
The MQL5 documentation is quite explicit in warning about the adverse effects of its single-shared-thread architecture on performance, and/or about the risk of a complete, inadvertent deadlocking of the whole system:
All indicators calculated on one symbol, even if they are attached to different charts, work in the same thread. Thus, all indicators on one symbol share the resources of one thread.
An infinite loop ( block / increased latency / unexpected delay ) in one indicator will stop all other indicators on this symbol.
Sure, one may ask the many kind OpenCL professionals present in the S/O Community to help, express extreme interest in getting their sponsored knowledge, et cetera, et cetera, should they decide to further spend their valuable time to sponsor and extend such efforts.
All this still has to meet reality and best respect the known facts, before any reasonable effort ( under whatever forces expressed ) may at least start to work in the proper direction.
How does the CPU : GPU topology work internally?
It uses an async, Out-of-Order operated, Queue-stored sequence of zero-to-many GPU-device-targeted computing job(s), all having a start-to-finish RTT-duration that is principally in-deterministic.
The CPU-side end of a GPU-device access-Queue can be instructed to send a job to the GPU ( a task ~ a program to execute ):
The Finite State Automaton of the GPU-task-management workflow has the following topology-graph:
< START____________> s = GetMicrosecondCount();
( MQL5 RQSTs )
|
|
|
T0:+---+--> CL_QUEUED : 3 == "queued", i.e. waiting for its turn ( submission )
? | | |
? +----+ |
? v
T0+? +--> CL_SUBMITTED : 2 == "submitted" for an OpenCL-device execution
? | | |
? +----+ |
? v
T0+? +--> CL_RUNNING : 1 == "running" the kernel-code on a mapped OpenCL resource pool
? | | |
? +----+ |
? v
T0+? CL_COMPLETE : 0 == "program complete", processing has finished its remote outputs
? |
T0+?-----------------+
|
( MQL5 FREEs )
< END_____________> e = GetMicrosecondCount();
PrintFormat( "RTT-COST WAS ~ %9d [us] ( CLES==0 ? %d )",
( e - s ),
CLExecutionStatus( _gpuKernelHANDLE )
);
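Outside of MQL5, the very same state transitions can be timestamped directly via the plain OpenCL host API. A hedged C++ sketch ( it assumes a cl_command_queue created with CL_QUEUE_PROFILING_ENABLE and an already built kernel; all error checking is omitted for brevity ):
#include <CL/cl.h>
#include <cstdio>

// Print how long a kernel spent in each FSM state, in nanoseconds.
void printKernelTimeline( cl_command_queue queue, cl_kernel kernel,
                          size_t globalSize )
{
    cl_event ev;
    clEnqueueNDRangeKernel( queue, kernel, 1, nullptr,
                            &globalSize, nullptr, 0, nullptr, &ev );
    clWaitForEvents( 1, &ev );                    // block until CL_COMPLETE

    cl_ulong queued, submit, start, end;
    clGetEventProfilingInfo( ev, CL_PROFILING_COMMAND_QUEUED,
                             sizeof queued, &queued, nullptr );
    clGetEventProfilingInfo( ev, CL_PROFILING_COMMAND_SUBMIT,
                             sizeof submit, &submit, nullptr );
    clGetEventProfilingInfo( ev, CL_PROFILING_COMMAND_START,
                             sizeof start,  &start,  nullptr );
    clGetEventProfilingInfo( ev, CL_PROFILING_COMMAND_END,
                             sizeof end,    &end,    nullptr );

    printf( "QUEUED->SUBMITTED %llu ns, SUBMITTED->RUNNING %llu ns, "
            "RUNNING->COMPLETE %llu ns\n",
            (unsigned long long)( submit - queued ),
            (unsigned long long)( start  - submit ),
            (unsigned long long)( end    - start  ) );
    clReleaseEvent( ev );
}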
Next, let's respect the realm of GPU architectures:
GPU computing devices are different: they have other, silicon-hardwired architectures than any universal CPU CISC/RISC computing device.
The reason WHY is very important here.
GPU devices use Streaming Multiprocessor eXecution units ( SMX units ), which are referred to in some hardware-inspection tools.
While the letter M in the SMX abbreviation emphasises that there are multiple executions loadable onto the SMX unit, all such cases actually execute ( sure, only if instructed in such a manner, which goes outside of the scope of this topic, to cover / span all of the SMX-present SM-cores ) the very same computing instructions. This is the only way they can operate. It is called a SIMT/SIMD-type of limited-scope parallelism, achievable ( co-locally ) on the perimeter of the SMX only, where single-instruction-multiple-{ threads | data } can become executed within the present SIMT/SIMD-( WARP-wide | half-WARP-wide | WARP-ignoring-GreedyMode )-scheduler capabilities. It is important to note that the narrower the width of the scheduled SIMT/SIMD-execution gets, the fewer SMX/SM-cores actually do any useful part of the global job execution, and the more wasted time devastates the battle on performance, due to the falling number of N-(CPUs) in effect, as discussed below.
Having listed those 384 cores, posted above, means a hardware limit beyond which this co-locally orchestrated SIMT/SIMD-type of limited-scope parallelism cannot grow, and all attempts in this direction will lead to a pure-[SERIAL] internal scheduling of GPU-jobs ( yes, i.e. one-after-another ).
Understanding these basics is cardinal, as without these architecture features, one may expect a behaviour that is actually principally impossible to get orchestrated in whatever kind of GPGPU system having the formal shape of a [ 1-CPU-host : N-GPU-device(s) ] composition of autonomous, asynchronous star-of-nodes distributed systems.
Any GPU-kernel loaded from a CPU-host onto the GPU will get mapped onto a non-empty set of SMX-unit(s), where a specified number of cores ( another, finer-grain geometry-of-computing resources applies, again going way beyond the scope of this post ) gets loaded with a stream of SIMT/SIMD-instructions, not violating the GPU-device limits:
...
+----------------------------------------------------------------------------------------
Max work items dimensions: 3 // 3D-geometry grids possible
Max work items[0]: 1024 // 1st dimension max.
Max work items[1]: 1024
Max work items[2]: 64 // theoretical max. 1024 x 1024 x 64 BUT...
+----------------------------------------------------------------------------------------
Max work group size: 1024 // actual max. "geometry"-size
+----------------------------------------------------------------------------------------
...
So,
if 1-SM-core was internally instructed to execute some GPU-task unit ( a GPU-job ), just this one SM-core will fetch one GPU-RISC-instruction after another ( ignoring any possible ILP for the simplicity here ) and execute it one at a time, stepping through the stream of SIMD-instructions of the said GPU-job. All the rest of the SM-cores present on the same SMX-unit typically do nothing during that time, until this GPU-job gets finished and the internal GPU-process management system decides about mapping some other work for this SMX.
if 2-SM-cores were instructed to execute some GPU-job, just this pair of SM-cores will fetch one ( and the very same ) GPU-RISC-instruction after another ( ignoring any possible ILP for the simplicity here ) and both execute it one at a time, stepping through the stream of SIMT/SIMD-instructions of the said GPU-job. In this case, if one SM-core gets into a condition where an if-ed, or similarly branched, flow of execution makes one SM-core go into another code-execution-flow path than the other, the SIMT/SIMD-parallelism gets into a divergent scenario, where one SM-core gets a next SIMT/SIMD-instruction belonging to its code-execution path, whereas the other one does nothing ( gets a GPU_NOP(s) ), until the first one finished the whole job ( or was enforced to stop at some synchronisation barrier, or fell into an unmaskable latency wait-state when waiting for a piece of data to get fetched from a "far" ( slow ) non-local memory location; again, details go way beyond the scope of this post ). Only after any one of these happens can the divergent-path, so-far-just-GPU_NOP-ed SM-core receive any next SIMT/SIMD-instruction belonging to its ( divergent ) code-execution path, to move any further forward. All the rest of the SM-cores present on the same SMX-unit typically do nothing during that time, until this GPU-job gets finished and the internal GPU-process management system decides about mapping some other work for this SMX.
if 16-SM-cores were instructed to execute some GPU-job by the task-specific "geometry", just this "herd" of SM-cores will fetch one ( and the very same ) GPU-RISC SIMT/SIMD-instruction after another ( ignoring any possible ILP for the simplicity here ) and all execute it one at a time, stepping through the stream of SIMT/SIMD-instructions of the said GPU-job. Any divergence inside the "herd" reduces the SIMT/SIMD-effect, and the GPU_NOP-blocked cores remain waiting for the main part of the "herd" to finish the job ( same as was sketched right above this point ).
if more-SIMT/SIMD-threads-than-SM-cores-available were instructed to execute some GPU-job by the task-specific "geometry", the GPU-device silicon will operate this to flow as a [SERIAL]-sequence of as many { WARP-wide | half-WARP-wide }-SIMT/SIMD-thread packs, until such a sequence finishes all the instructed number of SIMT/SIMD-threads mapped onto the SMX. Time-coherence of such packs' uniform finalisation is therefore principally impossible, as they arrive at their respective ends in a WARP-scheduler-specific fashion, but never synchronously ( yes, your CPU-side code here will have to wait till the very last code-execution flow ( the laziest one, due to whatever reason, be it a capacity-constrained scheduling reason, a code-divergence scheduling reason or a bad mutual (re-)synchronisation reason ) will eventually, at some unknown time in the future, finish the __kernel-code processing, and the OpenCL-operated device will allow for "remote"-detection of the CL_COMPLETE state, before being able to fetch any meaningful results ( as you ask, in a surprise, in one of your other questions ) ).
anyways, all the other SM-cores, not mapped by the task-specific "geometry" onto the respective GPU-device's SMX-unit, will typically remain doing nothing useful at all. So knowing the hardware details for the proper task-specific "geometry" is indeed important, and profiling may help to identify the peak performance for any such GPU-task constellation ( differences may range several orders of magnitude, from best to common to worst, among all possible task-specific "geometry" setups ).
Secondly, when I have many cores, how is OpenCL distributing the task? Is it the same process with the same data on each core, or a different core with different data?
As explained in brief above, the SIMT/SIMD-type device silicon-architecture does not permit any of the SMX SM-cores to execute anything other than the very same SIMT/SIMD-instruction across the whole "herd"-of-SM-cores that was mapped by a task-"geometry" onto the SMX-unit ( not counting the GPU_NOP(s) as doing "something else", as that is just wasting CPU:GPU-system time ).
So, yes, ".. on each core same process .." ( best if never divergent in its internal code-execution paths after an if or while or any other kind of code-execution-path branching ). So if an algorithm, based on data-driven values, results in a different internal state, each core may have a different thread-local state, based on which the processing may differ ( as exemplified with the if-driven divergent code-execution paths above ). More details on SM-local registers, SM-local caching, restricted shared-memory usage ( and its latency costs ), and GPU-device global-memory usage ( and its latency costs, cache-line lengths and associativity for best-coalescing access-patterns for latency-masking options ) are hardware-related + programming-eco-system details that go into small thousands of pages of hardware + software specific documentation, well beyond the scope of this simplified-for-clarity post.
same data or is it different core with different data ?
This is the last, but not least, dilemma: any well-parametrised GPU-kernel activation may also pass some amount of external-world data down to the GPU-kernel, which may make the SMX thread-local data differ from SM-core to SM-core. The mapping practices and the best performance for doing this are principally device-specific, as the { SMX | SM-registers | GPU_GDDR gloMEM : shaMEM : constMEM | GPU SMX-local cache-hierarchy } details and capacities
...
+---------------------------------------------------------
... 901 MHz
Cache type: Read/Write
Cache line size: 128
Cache size: 32768
Global memory size: 4294967296
Constant buffer size: 65536
Max number of constant args: 9
Local memory size: 49152
+---------------------------------------------------------
... 4000 MHz
Cache type: Read/Write
Cache line size: 64
Cache size: 262144
Global memory size: 536838144
Constant buffer size: 131072
Max number of constant args: 480
Local memory size: 32768
+---------------------------------------------------------
... 1300 MHz
Cache type: Read/Write
Cache line size: 64
Cache size: 262144
Global memory size: 1561123226
Constant buffer size: 65536
Max number of constant args: 8
Local memory size: 65536
+---------------------------------------------------------
... 4000 MHz
Cache type: Read/Write
Cache line size: 64
Cache size: 262144
Global memory size: 2147352576
Constant buffer size: 131072
Max number of constant args: 480
Local memory size: 32768
are principally so different from device to device that each high-performance code project can do nothing but profile its respective GPU-device task-"geometry" and resources-usage-map composition on the actual deployment device. What may work faster on one GPU-device / GPU-driver stack need not work as smart on another one ( or after a GPU-driver + exo-programming eco-system update / upgrade ); simply, only the real-life benchmark will tell ( as theory can be easily printed, but hardly as easily executed, as many device-specific and workload-injected limitations will apply in real-life deployment ).
suggest a way to distribute the function with different values or frames in MQL5, using the GPU on OpenCL.
The honest and best suggestion is the very same as was presented to you already on April 2nd.
Do not attempt to block / delay the flow of execution of any MQL5 CustomIndicator-type code-execution-unit with any extensive-latency / async / blocking code. Never. Not until the MetaTrader Terminal platform documentation explicitly removes such warnings ( still present there in 2018/Q2 ) and explicitly advises on techniques using latency-avoiding, non-blocking, distributed-agent communication tools for a coordinated (almost)-synchronous exchange of processing data/results between the MQL5-side and the GPU-device-side ( which will not be available any time soon, due to the SIMT/SIMD nature of the Out-of-Order scheduling of GPU-jobs in the contemporary classes of GPU-devices available ).
This was documented for the natural flow of time, strobed by the flow of external FX-Market ( Broker-broadcast propagated ) Events, having an Event-to-Event cadence of about a few hundreds of [us].
If going into the synthetic flow of time, as orchestrated in the Terminal's [ Strategy Tester ] simulator eco-system, the problem documented above gets many orders of magnitude worse, as the simulator actually accelerates the flow of time / the cadence, and anything not capable of keeping pace will ( again ) block any speedup ( which was already bad at the natural pace of the flow of time above ). So, no, this is a very bad direction to invest a single next bit of effort into ( again, at least until both platforms have changed their architectural limits ).
... so that my process becomes faster ...
This part of the problem definition was decided already ~ 60 years back, by Dr. Gene AMDAHL.
His ( then simplified ) Law of Diminishing Returns explains WHY there is a principal ceiling on any process speedup, linked to the still-[SERIAL] part, once a distinction between the pure-[SERIAL] part and the potentially N-(CPU)-times true-[PARALLEL] parts has been clearly identified.
This helps pre-estimate a cost / benefit effect of process re-engineering.
So, here, your GPU-kernel-code is the sort of (almost)-[PARALLEL] processing part. All the rest is still a pure-[SERIAL] processing part.
This suffices to guess the limits of the effect of trying to go into the OpenCL-wrapped process re-design.
But, the Devil is hidden in detail ...
The real costs are way higher.
The [SERIAL]-part will never get faster per se.
The [SERIAL]-part will actually get "slower" and "extended", as there will be many more steps to execute before the first SIMT/SIMD-instruction of the payload(s) will even start to get executed: being "remotely" delivered onto the OpenCL-Queue + the OpenCL-Data-Transfer(s) + the OpenCL-Queue task-management waiting + the OpenCL-Queue task-management submission onto the device, plus the task itself == the intended OpenCL-Device WARP-scheduled SIMT/SIMD-execution, plus all the way back from the remote circus: the OpenCL-Device task-completion overheads + the MQL5-side async completion-detection add-on latencies + the OpenCL-Data-Transfer(s).
The [PARALLEL]-part will get executed only "after", or "at", all the add-on costs were accrued ( not depicted in the Figure above, due to a need to avoid making it too complex and harder to comprehend the limit of the theoretical, overhead-ignoring speedup (not-)scaling ). Yet it is even worse: it gets executed at only about a ~ 4x lower GPU_CLOCK-rate ( not mentioning the ~ 10x ~ 1000x slower access-latency to memory and cache ), and as the algorithm teleported there still remains a [SERIAL]-only, linearly-convoluted TimeSeries data-processing, it cannot have anything but an adverse net-effect of a << 1.0 improvement factor on the theoretical processing speedup ( the achieved resulting performance gets worse than without such an attempt to "improve" it ).
For a full reference on these net effects, kindly read the section on Criticism, where both the Overhead-strict re-formulation of Amdahl's Law speedup and the Overhead-strict, resources-aware re-formulation are detailed:
S = 1 / ( s + pSO + ( 1 - s ) / N + pTO )

S = 1 / ( s + pSO + max( ( 1 - s ) / N, atomicP ) + pTO )

where:  s         := the still-[SERIAL] fraction of the process
        ( 1 - s ) := the potentially [PARALLEL] fraction
        N         := the number of processors
        pSO       := the [PAR]-Setup-Overhead add-on
        pTO       := the [PAR]-Terminate-Overhead add-on
        atomicP   := the further indivisible duration of the atomic-process-block
The header graph, cited at the top of this post, provides a link to a live GUI with interactive inputs and animated outputs, where one may test the impact of values of p == ( 1 - s ) going anywhere under 1.00 ( which would be a just-theoretical, absolutely 100% [PARALLEL] schedule, technically impossible in any real-world scenario ) and also tweak the impact of all add-on overheads in o ( expressed as just a scalar fraction, for simplicity reasons ) over an editable range of ~ < 0.0 ~ 0.0001 > values, so as to better sense the principal limits of the real-world behaviour of many-core devices and to become able to make better engineering decisions before even thinking about any coding steps.
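A quick way to sanity-check such numbers is to evaluate the overhead-strict formula directly ( a sketch only; the parameter names follow the definitions above, and the values in main() are purely illustrative ):
#include <algorithm>
#include <cstdio>

// Overhead-strict, atomicity-aware re-formulation of Amdahl's Law, as above:
//   S = 1 / ( s + pSO + max( ( 1 - s ) / N, atomicP ) + pTO )
double speedup( double s, double N, double pSO, double pTO, double atomicP )
{
    return 1.0 / ( s + pSO + std::max( ( 1.0 - s ) / N, atomicP ) + pTO );
}

int main()
{
    // Illustrative values only: a 2% serial fraction plus 1% setup and 1%
    // terminate overheads cap the speedup near 25x, even for N -> 1e6.
    printf( "S = %.2f\n", speedup( 0.02, 1e6, 0.01, 0.01, 0.0 ) );
    return 0;
}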
And given the known values for the add-on overheads and the atomicity of processing -- pSO, pTO, atomicP -- ( all easily measurable down to a single [us] resolution on the MQL5-side of the code execution, using a call to GetMicrosecondCount() ), consider the net effect of trying to continue towards an OpenCL-wrapped Simple Moving Average, as sketched in the GPU-kernel-code:
kernel void SMA_executeSMA( float ExtLineBufferi_1,
float price1,
float price2,
int InpMAPeriod,
__global float *output
)
{ // 1: .STO 0x0001, REG
int len = get_global_id( 1 ); // 2: .JMP intrinsic_OpenCL_fun(), ... may get masked by reading a hardwired-const-ID#
// 3: .GET len, REG
output[len] = // 4: .STO MEM[*],
ExtLineBufferi_1 // 5: .ADD const,
+ ( price1 - price2 ) // ( .SUB const, const
/ InpMAPeriod; // .FDIV REG, const )
} // 6: .RET
which has nothing but a few 900 MHz-clocked instructions, i.e. the p = ( 1 - s )-factor in the animated graph-visualisation will go somewhere close to the p == 0 end, making the game ultimately dominated by the pure-[SERIAL] part of the CPU:GPU-composition of the distributed-computing system ( ~ a few, at most small tens of [ns], plus the naked ( non-maskable, as there is zero re-use here ) on-GPU-device memory access-latency of ~ 350 - 700+ [ns] ).
Having such a low p is a performance-tweaking bad sign ( if not an ANTI-PATTERN ) for any attempt at doing this.
Because even going into N-(CPUs) ~ +INF will still never deliver the wished-for speedup ( ref.: one may modify such factors in the interactive graph offered above and visually see the effect, i.e. how low the numbers there will get ), while the same could have been computed in almost less than ~ 0.5 [ns] by further still vectorise-able CPU instructions, here also having zero add-on costs at all ( see the sketch below ).
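For contrast, a sketch of the same linearly-convoluted SMA recurrence kept on the CPU ( a hypothetical helper, mirroring the calSMA kernel above ): one add, one subtract and one divide per bar, with zero transfer or scheduling overheads:
// Incremental simple moving average: O(1) work per bar, all on the CPU.
void sma_incremental( const double *price, double *out,
                      int rates_total, int period )
{
    double sum = 0.0;
    for ( int i = 0; i < period; ++i )           // seed the first window
        sum += price[i];
    out[period - 1] = sum / period;
    for ( int i = period; i < rates_total; ++i )
    {
        sum += price[i] - price[i - period];     // slide the window
        out[i] = sum / period;
    }
}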
These are the "economy-of-costs" reasons ( besides the principal MQL5 one ) WHY it is better not to do this:
it will never pay back the sum of all the [SERIAL] add-on costs introduced during the whole OpenCL-re-wrap-there-send-there-compute-and-detect-and-send-back circus on the CPU-code / MQL5-side ( all in the name of making not more than just these indeed very few GPU_INSTRs happen ), costs that were just briefly mentioned above, even if an infinite number of GPU-cores were used.
You simply still try to pay way more than one will ever receive back.

do_gettimeofday() in Beaglebone giving wrong time

I am trying to measure the time period of a square wave on a Beaglebone running the Angstrom OS. I have written a kernel driver to register an ISR, in which I'm timing the pulses. Everything is working fine, but the time interval being measured is completely wrong. I'm using the do_gettimeofday() function to measure the time. When I do the same in a userspace program using the poll() function, I get correct values ( it shows around 1007 us for a 1000 us wave ), but when I use the driver to measure the pulse, I get the interval as 1923 us. I have no idea why the time interval in the kernel is higher than that in user space. I have attached my code below.
I would be grateful if someone could find the mistake in my program.
kernel ISR:
static irqreturn_t ISR( int irq, void *dev_id )
{
    prev = c;
    do_gettimeofday( &c );
    /* note: tv_sec is ignored here, so the tv_usec difference wraps
     * whenever two samples straddle a full-second boundary */
    printk( KERN_ALERT "%ld", ( c.tv_usec - prev.tv_usec ) );
    return IRQ_HANDLED;
}
userspace prog:
while ( 1 ) {
    prev = start;
    gettimeofday( &start, NULL );
    rc = poll( &fdset, 1, 20000 );
    if ( rc < 0 ) {
        printf( "Error in rc\n" );
        return -1;
    }
    if ( rc == 0 ) {
        printf( "Timed out\n" );
        return -1;
    }
    if ( fdset.revents & POLLPRI ) {
        len = read( fdset.fd, buf, 2 );
        printf( "%ld\n", ( start.tv_usec - prev.tv_usec ) );
    }
}
For profiling interrupt latency, I find it quite useful to be lazy: set a GPIO pin in the ISR and measure the time with an oscilloscope. Probably not the answer you want, but it might help you over a hurdle quickly.

Resolution of WaitForSingleObject with timeSetEvent & SetWaitableTimer

I am using a Win32 multimedia timer to put a delay between the dispatch of large numbers of UDP packets, but I am finding that the resulting delay is substantially longer than it should be. Delays of ~40 ms are sometimes nearer 1000 ms, even when using Windows multimedia timers and upping the timer resolution. Below is a simplified version of the code I used:
if ( timeGetDevCaps( &tc, sizeof( TIMECAPS ) ) == TIMERR_NOERROR )
{
    timeRes = min( max( tc.wPeriodMin, 1 ), tc.wPeriodMax );
    timeBeginPeriod( timeRes );
    printf( "Timer Res: %u\n", timeRes );
}
/* ... */
while ( ptrHead )
{
    NALU_t *ptrLink = ptrHead;
    unsigned long tsNALU = ptrLink->timestamp - tsFirst;
    printf( "Timestamp: %umsec\n", ptrLink->timestamp / 90 );
    int idxPort;
    for ( idxPort = 0; idxPort < 12; idxPort++ )
    {
        ip4Addr.sin_port = htons( 60000 + idxPort );
        struct sockaddr *saAddr = (struct sockaddr *)&ip4Addr;
        sendto( fdSocket, (char *)ptrLink->ptrData, ptrLink->lenData,
                0, saAddr, lenAddr );
    }
    if ( 1 )
    {
        unsigned long millis = ( tsNALU - tsPrev ) / 90;
        valTime.QuadPart  = 10000;          /* 100 ns units per 1 ms ... */
        valTime.QuadPart *= millis;
        valTime.QuadPart *= -1;             /* ... negative == relative time */
        if ( SetWaitableTimer( hdlTimer, &valTime, 0, NULL, NULL, TIME_ONESHOT ) )
            WaitForSingleObject( hdlTimer, INFINITE );
    }
    tsPrev  = tsNALU;
    ptrHead = ptrLink->next;
    free( ptrLink );
}
I suspect the problem is that Windows 7 no longer guarantees the resolution of timers when signalled by events, as opposed to callbacks, but I am loath to use the latter. Does anyone know why even supposedly high-resolution timers in single-threaded test cases are so wildly inaccurate?
If timing is critical, it's best to run in a busy loop ( you can give up a timeslice every iteration using Sleep(0) if you want ), using the QueryPerformanceCounter() API to measure elapsed time.
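For example ( a sketch; error handling omitted ):
#include <windows.h>

// Busy-wait for the requested number of milliseconds on the
// high-resolution performance counter; Sleep(0) yields the rest of
// the timeslice each iteration so other threads can still run.
void SpinWaitMillis( unsigned long millis )
{
    LARGE_INTEGER freq, start, now;
    QueryPerformanceFrequency( &freq );
    QueryPerformanceCounter( &start );
    const LONGLONG target = start.QuadPart
                          + ( freq.QuadPart * millis ) / 1000;
    do {
        Sleep( 0 );                      // give up the rest of this timeslice
        QueryPerformanceCounter( &now );
    } while ( now.QuadPart < target );
}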
From subsequent experiments, my best guess is that Windows moving threads between CPU cores ( possibly for load-balancing reasons; this is on a quad-core i7 ) is disruptive to timing functions. I used SetThreadAffinityMask() to lock my timing-critical thread to one CPU ( and my non-timing threads to all the other cores ), and that has sorted out the problems.

How to make a fast context switch from one process to another?

I need to run unsafe native code in a sandbox process, and I need to reduce the bottleneck of the process switch. Both processes ( controller and sandbox ) share two auto-reset events and a coherent view of a mapped file ( shared memory ) that is used for communication.
To make this post smaller, I removed the initializations from the sample code, but the events are created by the controller, duplicated using DuplicateHandle, and then sent to the sandbox process prior to work.
Controller source:
void inSandbox(HANDLE hNewRequest, HANDLE hAnswer, volatile int *shared) {
    int before = *shared;
    for (int i = 0; i < 100000; ++i) {
        // Notify sandbox of a new request and wait for answer.
        SignalObjectAndWait(hNewRequest, hAnswer, INFINITE, FALSE);
    }
    assert(*shared == before + 100000);
}

void inProcess(volatile int *shared) {
    int before = *shared;
    for (int i = 0; i < 100000; ++i) {
        newRequest(shared);
    }
    assert(*shared == before + 100000);
}

void newRequest(volatile int *shared) {
    // In this test, the request only increments an int.
    (*shared)++;
}
Sandbox source:
void sandboxLoop(HANDLE hNewRequest, HANDLE hAnswer, volatile int *shared) {
    // Wait for the first request from controller.
    assert(WaitForSingleObject(hNewRequest, INFINITE) == WAIT_OBJECT_0);
    for (;;) {
        // Perform request.
        newRequest(shared);
        // Notify controller and wait for next request.
        SignalObjectAndWait(hAnswer, hNewRequest, INFINITE, FALSE);
    }
}

void newRequest(volatile int *shared) {
    // In this test, the request only increments an int.
    (*shared)++;
}
Measurements:
inSandbox() - 550ms, ~350k context switches, 42% CPU (25% kernel, 17% user).
inProcess() - 20ms, ~2k context switches, 55% CPU (2% kernel, 53% user).
The machine is Windows 7 Pro on a Core 2 Duo P9700 with 8 GB of memory.
An interesting fact is that the sandbox solution uses 42% CPU versus 55% for the in-process solution. Another noteworthy fact is that the sandbox run shows ~350k context switches, far more than the ~200k we would expect from the source code (100k round trips, two switches each).
I need to know if there's a way to reduce the overhead of transferring control to another process. I have already tried using pipes instead of events, and it was much worse. I also tried dispensing with events entirely by having the sandbox call SuspendThread(GetCurrentThread()) and the controller call ResumeThread(hSandboxThread) on every request, but the performance was similar to using events.
If you have a solution that uses assembly (like performing a manual context switch) or Windows Driver Kit, please let me know as well. I don't mind having to install a driver to make this faster.
I heard that Google Native Client does something similar, but I only found this documentation. If you have more information, please let me know.
The first thing to try is raising the priority of the waiting thread. This should reduce the number of extraneous context switches.
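For example (a sketch; THREAD_PRIORITY_HIGHEST is a gentler first step than TIME_CRITICAL):
#include <windows.h>

// Call at the top of the waiting thread so it is scheduled promptly
// when its event is signalled.
void raiseWaiterPriority()
{
    SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_HIGHEST);
}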
Alternatively, since you're on a 2-core system, using spinlocks instead of events would make your code much much faster, at the cost of system performance and power consumption:
void inSandbox(volatile int *lock, volatile int *shared)
{
    int i, before = *shared;
    for (i = 0; i < 100000; ++i) {
        *lock = 1;                 // hand the request to the sandbox
        while (*lock != 0) { }     // spin until the sandbox is done
    }
    assert(*shared == before + 100000);
}
void newRequest(volatile int *shared) {
    // In this test, the request only increments an int.
    (*shared)++;
}
void sandboxLoop(volatile int *lock, volatile int *shared)
{
    for(;;) {
        while (*lock != 1) { }     // spin until a request arrives
        newRequest(shared);
        *lock = 0;                 // hand the result back
    }
}
In this scenario, you should probably set thread affinity masks and/or lower the priority of the spinning thread so that it doesn't compete with the busy thread for CPU time.
Ideally, you'd use a hybrid approach. When one side is going to be busy for a while, let the other side wait on an event so that other processes can get some CPU time. You could trigger the event a little ahead of time (using the spinlock to retain synchronization) so that the other thread will be ready when you are.
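A rough sketch of such a hybrid, reusing the lock/event pair from the snippets above; the SPIN_BUDGET of 4000 iterations is an arbitrary placeholder you would have to tune, and the signalling side must both set the flag and call SetEvent():
#include <windows.h>

const int SPIN_BUDGET = 4000;   // arbitrary; tune for your workload

// Spin briefly on the shared flag, then fall back to the kernel event
// so the core is released on long waits.
void hybridWait(volatile int *lock, HANDLE hRequest)
{
    for (int spins = 0; spins < SPIN_BUDGET; ++spins) {
        if (*lock == 1)
            return;            // caught the request while spinning: no syscall
        YieldProcessor();      // pause hint; be polite to the other core
    }
    WaitForSingleObject(hRequest, INFINITE);
}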

How to put my structure variable into CPU caches to eliminate main memory page access time?

It's clear that there is no explicit way, and no particular system call, that lets a programmer put a variable into the CPU cache. But I think a certain programming style, or a well designed algorithm, can increase the chances that the variable gets cached.
Here is my example: I want to append an 8-byte structure to the end of an array of structures of the same type, declared in the global main memory region. This is repeated for 4 million operations and takes 6 seconds, i.e. 1.5 µs per operation. I think this result shows that the two memory areas involved are not being cached.
I got some clues from cache-oblivious algorithms, so I tried several ways to improve this. So far, no improvement. I think some clever code could reduce the elapsed time by a factor of 10 to 100. Please show me the way.
-------------------------------------------------------------------------
Appended (2011-04-01)
Damon, thank you for your comment!
After reading it, I analyzed my code again and found several things I had missed. The code attached below is an abbreviated version of my original code.
To measure each operation's execution time accurately (the original code contains several different types of operations), I inserted timing code using the clock_gettime() function. I thought that if I measured each operation's time and accumulated the results, the extra cost of the main loop could be excluded. In the original code the timing calls were hidden behind a macro, so I had completely forgotten about them.
The running time of this code is almost 6 seconds, but if I remove the timing calls from the main loop it drops to 0.1 seconds. Since the clock_gettime() function offers very high precision (down to 1 nanosecond), and since (as far as I can tell) each call does a fair amount of work on sizeable structures, I think the calls themselves evicted the main memory area where the consecutive insertions are performed from the cache.
Thank you again for your comment. For further enhancement, any suggestion would be very helpful for optimizing my code. I also suspect that the hierarchically defined structure types may cost time unnecessarily, but I'd first like to know how much before changing them to plainer C-style code.
#include <stdio.h>
#include <stdint.h>
#include <time.h>   // clock_gettime(); link with -lrt on older glibc

typedef uint32_t uint32;   // the post's uint32/uint types were not defined
typedef unsigned int uint;

#define NIL    0           // placeholder: NIL was not defined in the post
#define OP_INS 0           // placeholder: OP_INS was not defined either

typedef struct t_ptr {
    uint32 isleaf :1, isNextLeaf :1, ptr :30;
    t_ptr(void) {
        isleaf = false;
        isNextLeaf = false;
        ptr = NIL;
    }
} PTR;
typedef struct t_key {
    uint32 op :1, key :31;
    t_key(void) {
        op = OP_INS;
        key = 0;
    }
} KEY;
typedef struct t_key_pair {
    KEY key;
    PTR ptr;
    t_key_pair() {
    }
    t_key_pair(KEY k, PTR p) {
        key = k;
        ptr = p;
    }
} KeyPair;
typedef struct t_op {
    KeyPair keyPair;
    uint seq;
    t_op() {
        seq = 0;
    }
} OP;
#define MAX_OP_LEN 4000000
typedef struct t_opq {
    OP ops[MAX_OP_LEN];
    int freeOffset;
    int globalSeq;
    bool queueOp(KeyPair keyPair);
} OpQueue;
bool OpQueue::queueOp(KeyPair keyPair) {
    bool isFull = false;
    if (freeOffset == (int) (MAX_OP_LEN - 1)) {
        isFull = true;
    }
    ops[freeOffset].keyPair = keyPair;
    ops[freeOffset].seq = globalSeq++;
    freeOffset++;
    return isFull;   // the original was missing this return statement
}
OpQueue opQueue;
int main() {
    struct timespec startTime, endTime, totalTime = { 0, 0 };
    for (int i = 0; i < 4000000; i++) {
        clock_gettime(CLOCK_REALTIME, &startTime);
        opQueue.queueOp(KeyPair());
        clock_gettime(CLOCK_REALTIME, &endTime);
        totalTime.tv_sec  += (endTime.tv_sec  - startTime.tv_sec);
        totalTime.tv_nsec += (endTime.tv_nsec - startTime.tv_nsec);
    }
    printf("\n elapsed time: %lld us\n",
           totalTime.tv_sec * 1000000LL + totalTime.tv_nsec / 1000L);
}
YOU don't put the structure into any cache. The CPU does that automatically for you. The CPU is even more clever than that: if you access memory sequentially, it will start prefetching data from memory into the cache before you read it.
And really, it should be common sense that for a simple bit of code like this, the time you spend measuring is ten times more than the time to perform the code (apparently 60 times in your case).
Since you put so much confidence in clock_gettime(): I suggest you call it five times in a row and store the results, then print the differences. There's resolution, there's precision, and there's how long it takes to return the current time, which is pretty damned long.
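A minimal sketch of that experiment, matching the question's environment:
#include <stdio.h>
#include <time.h>

int main(void)
{
    struct timespec t[5];
    // Five back-to-back calls: the deltas expose the cost of one call.
    for (int i = 0; i < 5; ++i)
        clock_gettime(CLOCK_REALTIME, &t[i]);
    for (int i = 1; i < 5; ++i)
        printf("delta %d: %ld ns\n", i,
               (t[i].tv_sec - t[i-1].tv_sec) * 1000000000L +
               (t[i].tv_nsec - t[i-1].tv_nsec));
    return 0;
}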
I have been unable to force caching, but you can force memory to be uncacheable. If you have other large data structures, you might exclude them so that they will not pollute your caches. This can be done by specifying PAGE_NOCACHE for the Windows VirtualAllocXXX functions:
http://msdn.microsoft.com/en-us/library/windows/desktop/aa366786(v=vs.85).aspx
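A minimal sketch of that (note that PAGE_NOCACHE makes every access to the region much slower, so it only pays off for data you deliberately want kept out of the cache):
#include <windows.h>

int main()
{
    // Commit 1 MB that the CPU will not cache; accesses go straight to
    // memory, so this region cannot evict your hot data.
    void *buf = VirtualAlloc(NULL, 1 << 20,
                             MEM_RESERVE | MEM_COMMIT,
                             PAGE_READWRITE | PAGE_NOCACHE);
    if (buf) {
        /* ... use the buffer ... */
        VirtualFree(buf, 0, MEM_RELEASE);
    }
    return 0;
}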

Resources