I am using the Transport Client to index data. Here's a small test program:
@Test
public void LatencyTest() throws Exception {
// warmup
for (int i = 0; i < 10; i++) {
client.prepareGet("ts_head", "doc", "latency_test_id").get();
}
List<Long> times = Lists.newArrayList();
for (int i = 0; i < 100; i++) {
final long t1 = System.nanoTime();
final IndexResponse response = client.prepareIndex("ts_head", "doc", "latency_test_id").setSource("{}", XContentType.JSON).get();
final long t2 = System.nanoTime();
times.add(t2 - t1);
}
final double average = times.stream().mapToLong(v -> v).average().getAsDouble();
System.out.println("Average time: " + (average / 1_000_000f) + "ms");
}
the output is: Average time: 79.3905588ms, which seems awfully slow. It accesses a cluster consisting of 2 nodes (both master) over a local network with a gigabit connection. The network itself doesn't seem to have high latency:
$ ping -U 172.***
PING 172.*** (172.***) 56(84) bytes of data.
64 bytes from 172.***: icmp_seq=1 ttl=128 time=0.324 ms
64 bytes from 172.***: icmp_seq=2 ttl=128 time=0.280 ms
64 bytes from 172.***: icmp_seq=3 ttl=128 time=0.348 ms
64 bytes from 172.***: icmp_seq=4 ttl=128 time=0.306 ms
Indexing via HTTP is pretty slow as well:
$ curl -w "#curl-format.txt" -X POST -s "http://172.***:9200/ts_head/doc/latency_test_id" --data '{}'
{"_index":"ts_head","_type":"doc","_id":"latency_test_id","_version":211,"result":"updated","_shards":{"total":2,"successful":2,"failed":0},"created":false} time_namelookup: 0,001381
time_connect: 0,135607
time_appconnect: 0,000000
time_pretransfer: 0,136056
time_redirect: 0,000000
time_starttransfer: 0,283840
----------
time_total: 0,284044
Now my question is: What am I doing wrong? Elasticsearch must surely be capable of indexing faster, right?
For comparison, I also tested reads: Elasticsearch itself reported "took": 1, yet the curl request still consistently took over 100 ms to complete.
I have a Frankenstein ESP32 setup using my dead Wemos LOLIN32 board and an external Ai-Thinker ESP32 chip.
It had been working all right until recently, when it started failing to connect to any WiFi while dumping garbage data on the serial port for anything beyond the WiFi.begin() call. It occasionally connects, in roughly 1 out of 15 reboots. It also sometimes seems to stall the UART before triggering a reboot, after which it works.
I have been using it with a GC9A01 1.28-inch TFT display. I may have damaged it while making the connections, but I can't be sure. If I don't use WiFi, most other functionality seems to work fine.
This is a sample of the many sketches I have tried.
#include <Arduino.h>
#include <WiFi.h>
#include "time.h"
const char *ssid = "VUMA FIBER ";
const char *password = "mysecurepassword";
const char *ntpServer = "pool.ntp.org";
const long gmtOffset_sec = 3 * 3600;
const int daylightOffset_sec = 0;
hw_timer_t *My_timer = NULL;
class Timehandler
{
private:
hw_timer_t *My_timer = NULL;
uint8_t counter = 0;
public:
uint8_t hour;
uint8_t minutes;
uint8_t seconds;
unsigned int year;
uint8_t date;
uint8_t day;
struct tm timeinfo;
bool fetchtime(uint8_t gmtOffset = 0, uint8_t daylightOffset_sec = 0, const char *ntpServer = "pool.ntp.org")
{
// configTime(gmtOffset * 3600, daylightOffset_sec, ntpServer);
// struct tm timeinfo;
// if (!getLocalTime(&timeinfo)) {
// return false;
// } else {
Serial.println(&timeinfo, "%A, %B %d %Y %H:%M:%S");
return true;
// }
}
void maintainTime()
{
seconds++;
if (seconds >= 60)
{
minutes++;
seconds = 0;
}
if (minutes >= 60)
{
hour++;
minutes = 0;
}
if (hour >= 24)
{
date++;
hour = 0;
}
Serial.printf("%02d", hour);
Serial.print(":");
Serial.printf("%02d", minutes);
Serial.print(":");
Serial.printf("%02d", seconds);
Serial.println("");
}
void getTime()
{
struct tm timeinfo;
if (!getLocalTime(&timeinfo))
{
Serial.println("Failed to obtain time");
return;
}
hour = timeinfo.tm_hour;
minutes = timeinfo.tm_min;
seconds = timeinfo.tm_sec;
year = timeinfo.tm_year;
date = timeinfo.tm_mday;
day = timeinfo.tm_wday;
}
};
bool connected = false;
Timehandler t;
void setup()
{
Serial.begin(115200);
// connect to WiFi
Serial.print("Connecting to ");
Serial.println(ssid);
WiFi.begin(ssid, password);
t.fetchtime();
t.getTime();
}
void loop()
{
if (!connected)
{
Serial.println("failed...retrying");
if (WiFi.status() == WL_CONNECTED)
{
Serial.println(" CONNECTED");
while (!t.fetchtime(3))
{
Serial.println("failed...retrying");
delay(500);
t.fetchtime(3);
WiFi.disconnect(true);
WiFi.mode(WIFI_OFF);
}
t.getTime();
connected = true;
}
}else{
t.maintainTime();
}
t.maintainTime();
delay(1000);
}
Here is a sample of the serial output.
CURRENT: upload_protocol = esptool
Looking for upload port...
Auto-detected: COM5
Uploading .pio\build\lolin32\firmware.bin
esptool.py v3.1
Serial port COM5
Connecting....
Chip is ESP32-D0WD (revision 1)
Features: WiFi, BT, Dual Core, 240MHz, VRef calibration in efuse, Coding Scheme None
Crystal is 40MHz
MAC: 94:3c:c6:10:4b:30
Uploading stub...
Running stub...
Stub running...
Changing baud rate to 460800
Changed.
Configuring flash size...
Auto-detected Flash size: 4MB
Flash will be erased from 0x00001000 to 0x00005fff...
Flash will be erased from 0x00008000 to 0x00008fff...
Flash will be erased from 0x0000e000 to 0x0000ffff...
Flash will be erased from 0x00010000 to 0x000abfff...
Compressed 17120 bytes to 11164...
Writing at 0x00001000... (100 %)
Wrote 17120 bytes (11164 compressed) at 0x00001000 in 0.6 seconds (effective 213.4 kbit/s)...
Hash of data verified.
Compressed 3072 bytes to 128...
Writing at 0x00008000... (100 %)
Wrote 3072 bytes (128 compressed) at 0x00008000 in 0.1 seconds (effective 247.8 kbit/s)...
Hash of data verified.
Compressed 8192 bytes to 47...
Writing at 0x0000e000... (100 %)
Wrote 8192 bytes (47 compressed) at 0x0000e000 in 0.2 seconds (effective 397.7 kbit/s)...
Hash of data verified.
Compressed 638784 bytes to 391514...
Writing at 0x00010000... (4 %)
Writing at 0x0001bce6... (8 %)
Writing at 0x00029453... (12 %)
Writing at 0x00031fdd... (16 %)
Writing at 0x00037279... (20 %)
Writing at 0x0003c526... (25 %)
Writing at 0x000419cd... (29 %)
Writing at 0x00046f5c... (33 %)
Writing at 0x0004c242... (37 %)
Writing at 0x000533e8... (41 %)
Writing at 0x0005ba7c... (45 %)
Writing at 0x00061195... (50 %)
Writing at 0x00066a45... (54 %)
Writing at 0x0006bdb0... (58 %)
Writing at 0x000715e8... (62 %)
Writing at 0x00077550... (66 %)
Writing at 0x0007cdc5... (70 %)
Writing at 0x000830c2... (75 %)
Writing at 0x00088d20... (79 %)
Writing at 0x0008ebd9... (83 %)
Writing at 0x00094b83... (87 %)
Writing at 0x0009ab4a... (91 %)
Writing at 0x000a0842... (95 %)
Writing at 0x000a6686... (100 %)
Wrote 638784 bytes (391514 compressed) at 0x00010000 in 10.5 seconds (effective 486.4 kbit/s)...
Hash of data verified.
Leaving...
clk_drv:0x00,q_drv:0x00,d_drv:0x00,cs0_drv:0x00,hd_drv:0x00,wp_drv:0x00
mode:DIO, clock div:1
load:0x3fff0018,len:4
load:0x3fff001c,len:1044load:0x40078000,len:10124
load:0x40080400,len:5856
entry 0x400806a8
Connecting to VUMA FIBER
���n���b�l�|�␒b␒␂␌␌␌ll␌␌␌␌␌�␌␌�␌␌l`␃␜␒␒nn�␐␂␌�n�np�␒␒nn␌��r��`␃␜␒␒no�␐�bbb|␒�b␒␒on�␎l�␌�␌␌�␌ll`␃␜␒␒oo�␐�bbc|␒�b␓␒nn�␎l�␌�␌␌�␌�l`␃␜␒␒no�␐�bbb|␒�b␒␒nn�␎l�␌�␌␌�␌�l`␃␒␒nn�␐�ccc|␒�b␒␒nn�␎l�␌�␌␌�␌␌�␎l␜␒␒on�␐�cbc|␒�b␒␒on�␎l�␌�␌␌�␌l�␎l␜␒␒oo�␐�ccc|␒�b␓␒oo�␎l�␌�␌␌�␌��␏l␜␒␒oo�␐�ccc|␒�b␓␒nn�␎l�␌�␌␌�␌�␎l␜␒␒nn�␐�ccc|␒�b␒␒on�␎l�␌�␌␌�␌␌l`␃␜␒␒nn�␐�bbb|␒�b␒␒nn�␎l�␌�␌␌�␌ll`␂␜␒␒nn�␐�bbb|␒�b␓␒on�␎l�␌�␌␌�l␌l`␂␜␒␒on�␐�cbb|␒�b␓␒oo�␎l�␌�␌␌�lll`␂␜␒␒nn�␐�bcc|␒�b␓␒on�␎l�␌�␌␌�l�l`␂␜␓␒oo�␐�ccc|␒�b␛␒og�␎l�␌�␌␌�l�d`␃␜␒␒on�␐�ccc|␒�b␓␒oo�␎l�␌�␌␌�l␌�␏l␜␒␒nn�␐�bbb|␒�b␒␒nn�␎l�␌�␌␌�ll�␎l␜␒␒oo�␐�bbb|␒�b␒␒nn�␎l�␌�␌␌�l��␎l␜␒␒on�␐�bbb|␒�b␒␒on�␎l�␌�␌␌�l�␏l␜␒␒nn�␐�bbb|␒�b␒␒nn�␎l�␌�␌␌�l␌l`␂␜␒␒oo�␐�ccc|␒�b␓␒no�␎l�␌�␌␌�lll`␂␜␒␒nn�␐�ccb|␒�b␒␒oo�␎l�␌�␌␌��␌l`␃␜␒␒oo�␐�cbb|␒�b␒␒oo�␎l�␌�␌␌��ll`␂␜␒␒on�␐�bbb|␒�b␒␒no�␎l�␌�␌␌���l`␂␜␒␒oo�␐�bcc|␒�b␒␒on�␎l�␌�␌␌���l`␂␜␒␒on�␐�bbc|␒�b␒␒no�␎l�␌�␌␌��␌�␏l␜␒␒nn�␐�ccc|␒�b␒␒nn�␎l�␌�␌␌��l�␏l␜␒␒on�␐�cbb|␒�b␓␒on�␎l�␌�␌␌����␎l␜␒␒nn�␐�bbb|␒�b␒␒nn�␎l�␌�␌␌���␎l␜␒␒on�␐�ccc|␒�b␓␒no�␎l�␌�␌␌��␌l`␃␜␒␒nn�␐�bcb|␒�b␒␒nn�␎l�␌�␌␌��ll`␂␜␒␒no�␐�bcb|␒�b␒␒nn�␎l�␌�␌␌��␌l`␃␜␒␒nn�␐�bbb|␒�b␓␒nn�␎l�␌�␌␌��ll`␂␜␒␒nn�␐�bbb|␒�b␓␒no�␎l�␌�␌␌��l`␃␜␒␒no�␐�bbb|␒�b␒␒no�␎l�␌�␌␌���l`␂␜␒␒nn�␐�bcb|␒�b␒␒nn�␎l�␌�␌␌��␌�␏l␜␒␒oo�␐�ccc|␒�b␓␒go�␎l�␄�␄␌��l�␏l␜␒␒oo�␐�ccc|␒�b␓␒oo�␎l�␌�␌␌�쌎␎l␜␒␒nn�␐�bbb|␒�b␒␒nn�␎l�␌�␌␌���␎l␜␒␒nn�␐�bbb|␒�b␒␒nn�␎l�␌�␌␌��␌l`␂␜␒␒oo�␐�bbb|␒�b␒␒oo�␎l�␌�␌␌��ll`␃␜␒␒no�␐�bcb|␒�b␓␒no�␎l�␌�␌␌�␌�l`␂␜␒␒nn�␐�bbb|␒�b␒␒nn�␎l�␌�␌␌�␌�prl␜␒␒on�␐�ccc|␒�b␓␒no�␎l�␌�␌␌�␌�␜rl␜␒␒no�␐�cbb|␒�b␓␒on�␎l�␌�␌␌�␌�␜rl␜␒␒on�␐�ccb|␒�b␒␒no�␎l�␌�␌␌�␌��␎l␜␒␒no�␐�bcb|␒�b␒␒on�␎l�␌�␌␌�␌�rrl␜␒␒no�␐�ccc|␒�b␒␒no�␎l�␌�␌␌�␌��␎l␜␒␒nn�␐�bbb|␒�b␒␒oo�␎l�␌�␌␌�␌��␎l␜␒␒oo�␐�ccc|␒�b␒␒oo�␎l�␌�␌␌�␌�l`␃␜␒␒on�␐�bcb|␒�b␒␒on�␎l�␌�␌␌�␌�|rl␜␒␒oo�␐�ccc|␒�b␓␒oo�␎l�␌�␌␌�l�l`␃␜␒␒no�␐�bcc|␒�b␓␒oo�␎l�␌�␌␌�l�prl␜␒␒nn�␐�bbb|␒�b␒␒nn�␎l�␌�␌␌�l�␜rl␜␒␒no�␐�bbb|␒�b␓␒on�␎l�␌�␌␌�l�␜rl␜␒␒on�␐�ccc|␒�b␓␒oo�␎l�␌�␌␌�l��␎l␜␒␒nn�␐�ccb|␒�b␒␒nn�␎l�␌�␌␌�l�rrl␜␒␒nn�␐�cbc|␒�b␓␒no�␎l�␌�␌␌�l��␎l␜␒␒no�␐�ccc|␒�b␓␒no�␎l�␌�␌␌�l��␏l␜␒␒on�␐�ccb|␒�b␒␒oo�␎l�␌�␌␌�l�l`␂␜␒␒oo�␐�ccc|␒�b␓␒no�␎l�␌�␌␄�d�|rl␜␒␒no�␐�ccb|␒�b␓␒nn�␎l�␌�␌l�␌␌l`␂␜␒␒on�␐�bcb|␒�b␒␒nn�␎l�␌�␌l�␌ll`␂␜␒␒nn�␐�bbb|␒�b␒␒no�␎l�␌�␌l�␌�l`␂␜␒␒nn�␐�bbb|␒�b␒␒nn�␎l�␌�␌l�␌�l`␃␜␒␒nn�␐�bbb|␒�b␒␒nn�␎l�␌�␌l�␌␌�␎l␜␒␒nn�␐�bbc|␒�b␒␒nn�␎l�␌�␌l�␌l�␎l␜␒␒no�␐�bbb|␒�b␒␒nn�␎l�␌�␌l�␌��␏l␜␒␒nn�␐�bbb|␒�b␒␒nn�␎l�␌�␌l�␌�␎l␜␒␒no�␐�ccc|␒�b␓␒on�␎l�␌�␌l�␌␌l`␃␜␒␒on�␐�bcb|␒�b␒␒on�␎l�␌�␌l�␌ll`␃␜␒␒on�␐�bcb|␒�b␓␒on�␎l�␌�␌l�l␌l`␃␜␒␒oo�␐�cbb|␒�b␓␒oo�␎l�␌�␌l�lll`␂␜␒␒no�␐�bcb|␒�b␓␒on�␎l�␌�␌l�l�l`␂␜␒␒on�␐�bcb|␒�b␒␒no�␎l�␌�␌l�l�l`␂␜␒␒on�␐�bcc|␒�b␓␒oo�␎l�␌�␌l�l␌�␎l␜␒␒oo�␐�bcc|␒�b␓␒nn�␎l�␌�␌l�ll�␏l␜␒␒oo�␐�bcc|␒�b␒␒oo�␎l�␌�␌l�l��␎l␜␒␒no�␐�bbc|␒�b␒␒nn�␎l�␌�␌l�l�␎l␜␒␒oo�␐�bbb|␒�b␓␒oo�␎l�␌�␌l�l␌l`␃␜␒␒no�␐�cbc|␒�b␓␒no�␎l�␌�␌l�lll`␃␜␒␒no�␐�bcb|␒�b␓␒nn�␎l�␌�␌l��␌l`␂␜␒␒on�␐�bcb|␒�b␒␒nn�␎l�␌�␌l��ll`␃␜␒␒on�␐�ccc|␒�b␓␒oo�␎l�␌�␌l���l`␂␜␒␒nn�␐�bbb|␒�b␒␒nn�␎l�␌�␌l���l`␂␜␒␒on�␐�bbb|␒�b␓␒oo�␎l�␌�␌l��␌�␎l␜␒␒on�␐�bcb|␒�b␒␒oo�␎l�␌�␌l��l�␏l␜␒␒on�␐�bcc|␒�b␒␒oo�␎l�␌�␌l����␎l␜␒␒oo�␐�ccb|␒�b␓␒oo�␎l�␌�␌l���␏l␜␒␒'g�␐�#c#<␒�b␛␒'o�␏l�␄�␌d��␌l`␃␜␒␒no�␐�ccb|␒�b␓␒oo�␎l�␌�␌l��ll`␃␜␒␒oo�␐�bbc|␒�b␒␒on�␎l�␌�␌l��␌l`␂␜␒␒nn�␐�cbc|␒�b␒␒nn�␎l�␌�␌l��ll`␃␜␒␒oo�␐�ccc|␒�b␓␒oo�␎l�␌�␌l��l`␃␜␒␒nn�␐�cbb|␒�b␒␒nn�␎l�␌�␌l���l`␂␜␒␒on�␐�cbc|␒�b␓␒oo�␎l�␌�␌l��␌�␏l␜␒␒nn�␐�bbc|␒�b␓␒no�␎l�␌�␌l��l�␎l␜␒␒nn�␐�bbb|␒�b␒␒nn�␎l�␌�␌l�쌎␏l␜␓␒''�␐�###<␒�#␛␒''�␇$�␄�␄$���␇l␜␒␒no�␐�bcb|␒�b␒␒oo�␎l�␌�␌l��␌l`␃␜␒␒no�␐�ccb|␒�b␓␒no�␎l�␌�␌l��ll`␃␜␒␒oo�␐�ccc|␒�b␓␒oo�␎l�␌�␌l�␌�l`␃␜␒␒nn�␐�bbb|␒�b␒␒on�␎l�␌�␌l�␌�prl␜␒␒no�␐�bbb|␒�b␒␒no�␎l�␌�␌l�␌�␜rl␜␒␒oo�␐�bcb|␒�b␒␒no�␎l�␌�␌l�␌�␜rl␜␒␒nn�␐�bbc|␒�b␒␒on�␎l�␌�␌l�␌��␎l␜␒␒on�␐�bbb|␒�b␒␒nn�␎l�␌�␌l�␌�rrl␜␒␒on�␐�cbb|␒�b␒␒nn�␎l�␌�␌l�␌��␎l␜␒␒oo�␐�c
bc|␒�b␒␒no�␎l�␌�␌l�␌��␏l␜␒␛''�␐�###<␓�b␛␛''�␎$�␄�␄$�␄�$ ␃␜␒␒oo�␐�cbc|␒�b␓␒on�␎l�␌�␌l�␌�|rl␜␒␒nn�␐�cbc|␒�b␒␒on�␎l�␌�␌l�l�l`␂␜␒␒on�␐�ccb|␒�b␒␒no�␎l�␌�␌l�l�prlets Jun 8 2016 00:22:57
rst:0x1 (POWERON_RESET),boot:0x13 (SPI_FAST_FLASH_BOOT)
configsip: 0, SPIWP:0xee
clk_drv:0x00,q_drv:0x00,d_drv:0x00,cs0_drv:0x00,hd_drv:0x00,wp_drv:0x00
mode:DIO, clock div:1
load:0x3fff0018,len:4
load:0x3fff001c,len:1044
load:0x40078000,len:10124
load:0x40080400,len:5856
entry 0x400806a8
ets Jun 8 2016 00:22:57
rst:0x10 (RTCWDT_RTC_RESET),boot:0x13 (SPI_FAST_FLASH_BOOT)
configsip: 0, SPIWP:0xee
clk_drv:0x00,q_drv:0x00,d_drv:0x00,cs0_drv:0x00,hd_drv:0x00,wp_drv:0x00
mode:DIO, clock div:1
load:0x3fff0018,len:4
load:0x3fff001c,len:1044
load:0x40078000,len:10124
load:0x40080400,len:5856
entry 0x400806a8
Connecting to VUMA FIBER
Sunday, January 00 1900 00:00:00
Failed to obtain time
failed...retrying
00:00:01
failed...retrying
00:00:02
failed...retrying
00:00:03
failed...retrying
CONNECTED
Sunday, January 00 1900 00:00:00
Failed to obtain time
00:00:04
00:00:05
00:00:06
00:00:07
00:00:08
00:00:09
00:00:10
00:00:11
00:00:12
00:00:13
00:00:14
00:00:15
00:00:16
00:00:17
00:00:18
00:00:19
00:00:20
00:00:21
00:00:22
00:00:23
00:00:24
00:00:25
00:00:26
00:00:27
00:00:28
00:00:29
00:00:30
00:00:31
00:00:32
00:00:33
00:00:34
00:00:35
00:00:36
00:00:37
00:00:38
00:00:39
00:00:40
00:00:41
00:00:42
Any help would be appreciated.
I am trying to parallelize a Monte Carlo simulation using OpenCL, with MWC64X as the uniform random number generator. The code runs well on different Intel CPUs: the output of the parallel computation is very close to the sequential one.
Using OpenCL device: Intel(R) Xeon(R) CPU E5-2630L v3 @ 1.80GHz
Literal influence running time: 0.029048 seconds r1 seqInfl= 0.4771
Literal influence running time: 0.029762 seconds r2 seqInfl= 0.4771
Literal influence running time: 0.029742 seconds r3 seqInfl= 0.4771
Literal influence running time: 0.02971 seconds ra seqInfl= 0.4771
Literal influence running time: 0.029225 seconds trust1-57 seqInfl= 0.6001
Literal influence running time: 0.04992 seconds trust110-1 seqInfl= 0
Literal influence running time: 0.034636 seconds trust4-57 seqInfl= 0
Literal influence running time: 0.049079 seconds trust57-110 seqInfl= 0
Literal influence running time: 0.024442 seconds trust57-4 seqInfl= 0.8026
Literal influence running time: 0.04946 seconds trust33-1 seqInfl= 0
Literal influence running time: 0.049071 seconds trust57-33 seqInfl= 0
Literal influence running time: 0.053117 seconds trust4-1 seqInfl= 0.1208
Literal influence running time: 0.051642 seconds trust57-1 seqInfl= 0
Literal influence running time: 0.052052 seconds trust57-64 seqInfl= 0
Literal influence running time: 0.052118 seconds trust64-1 seqInfl= 0
Literal influence running time: 0.051998 seconds trust57-7 seqInfl= 0
Literal influence running time: 0.052069 seconds trust7-1 seqInfl= 0
Total number of literals: 17
Sequential influence running time: 0.71728 seconds
Sequential maxInfluence Literal: trust57-4 0.8026
index1= 17 size= 51 dim1_size= 6
sum0:4781 influence0:0.478100 sum2:4781 influence2:0.478100 sum6:0 influence6:0.000000 sum10:0 sum12:0 influence12:0.000000 sum7:0 influence7:0.000000 influence10:0.000000 sum4:5962 influence4:0.596200 sum8:7971 influence8:0.797100 sum1:4781 influence1:0.478100 sum3:4781 influence3:0.478100 sum13:0 influence13:0.000000 sum11:1261 influence11:0.126100 sum9:0 influence9:0.000000 sum14:0 influence14:0.000000 sum5:0 influence5:0.000000 sum15:0 influence15:0.000000 sum16:0 influence16:0.000000
Parallel influence running time: 0.054391 seconds
Parallel maxInfluence Literal: trust57-4 Infl=0.7971
However, when I run the code on a GeForce GTX 1080 Ti, with NVIDIA driver 430.40, CUDA 10.1 and OpenCL 1.2 (CUDA) installed, the output is as below:
Using OpenCL device: GeForce GTX 1080 Ti
Influence:
Literal influence running time: 0.011119 seconds r1 seqInfl= 0.4771
Literal influence running time: 0.011238 seconds r2 seqInfl= 0.4771
Literal influence running time: 0.011408 seconds r3 seqInfl= 0.4771
Literal influence running time: 0.01109 seconds ra seqInfl= 0.4771
Literal influence running time: 0.011132 seconds trust1-57 seqInfl= 0.6001
Literal influence running time: 0.018978 seconds trust110-1 seqInfl= 0
Literal influence running time: 0.013093 seconds trust4-57 seqInfl= 0
Literal influence running time: 0.018968 seconds trust57-110 seqInfl= 0
Literal influence running time: 0.009105 seconds trust57-4 seqInfl= 0.8026
Literal influence running time: 0.018753 seconds trust33-1 seqInfl= 0
Literal influence running time: 0.018583 seconds trust57-33 seqInfl= 0
Literal influence running time: 0.02005 seconds trust4-1 seqInfl= 0.1208
Literal influence running time: 0.01957 seconds trust57-1 seqInfl= 0
Literal influence running time: 0.019686 seconds trust57-64 seqInfl= 0
Literal influence running time: 0.019632 seconds trust64-1 seqInfl= 0
Literal influence running time: 0.019687 seconds trust57-7 seqInfl= 0
Literal influence running time: 0.019859 seconds trust7-1 seqInfl= 0
Total number of literals: 17
Sequential influence running time: 0.272032 seconds
Sequential maxInfluence Literal: trust57-4 0.8026
index1= 17 size= 51 dim1_size= 6
sum0:10000 sum1:10000 sum2:10000 sum3:10000 sum4:10000 sum5:0 sum6:0 sum7:0 sum8:10000 sum9:0 sum10:0 sum11:0 sum12:0 sum13:0 sum14:0 sum15:0 sum16:0
Parallel influence running time: 0.193581 seconds
The "Influence" value equals sum*1.0/10000, thus the parallel influence only composes of 1 and 0, which is incorrect (in GPU runs) and doesn't happen when parallelizing on a Intel CPU.
When I check the output of the random number generator if(flag==0) printf("randint=%u",randint);, it seems the outputs are all zero on GPU. Below is the clinfo and the .cl code:
Device Name GeForce GTX 1080 Ti
Device Vendor NVIDIA Corporation
Device Vendor ID 0x10de
Device Version OpenCL 1.2 CUDA
Driver Version 430.40
Device OpenCL C Version OpenCL C 1.2
Device Type GPU
Device Topology (NV) PCI-E, 68:00.0
Device Profile FULL_PROFILE
Device Available Yes
Compiler Available Yes
Linker Available Yes
Max compute units 28
Max clock frequency 1721MHz
Compute Capability (NV) 6.1
Device Partition (core)
Max number of sub-devices 1
Supported partition types None
Max work item dimensions 3
Max work item sizes 1024x1024x64
Max work group size 1024
Preferred work group size multiple 32
Warp size (NV) 32
Preferred / native vector sizes
char 1 / 1
short 1 / 1
int 1 / 1
long 1 / 1
half 0 / 0 (n/a)
float 1 / 1
double 1 / 1 (cl_khr_fp64)
Half-precision Floating-point support (n/a)
Single-precision Floating-point support (core)
Denormals Yes
Infinity and NANs Yes
Round to nearest Yes
Round to zero Yes
Round to infinity Yes
IEEE754-2008 fused multiply-add Yes
Support is emulated in software No
Correctly-rounded divide and sqrt operations Yes
Double-precision Floating-point support (cl_khr_fp64)
Denormals Yes
Infinity and NANs Yes
Round to nearest Yes
Round to zero Yes
Round to infinity Yes
IEEE754-2008 fused multiply-add Yes
Support is emulated in software No
Address bits 64, Little-Endian
Global memory size 11720130560 (10.92GiB)
Error Correction support No
Max memory allocation 2930032640 (2.729GiB)
Unified memory for Host and Device No
Integrated memory (NV) No
Minimum alignment for any data type 128 bytes
Alignment of base address 4096 bits (512 bytes)
Global Memory cache type Read/Write
Global Memory cache size 458752 (448KiB)
Global Memory cache line size 128 bytes
Image support Yes
Max number of samplers per kernel 32
Max size for 1D images from buffer 134217728 pixels
Max 1D or 2D image array size 2048 images
Max 2D image size 16384x32768 pixels
Max 3D image size 16384x16384x16384 pixels
Max number of read image args 256
Max number of write image args 16
Local memory type Local
Local memory size 49152 (48KiB)
Registers per block (NV) 65536
Max number of constant args 9
Max constant buffer size 65536 (64KiB)
Max size of kernel argument 4352 (4.25KiB)
Queue properties
Out-of-order execution Yes
Profiling Yes
Prefer user sync for interop No
Profiling timer resolution 1000ns
Execution capabilities
Run OpenCL kernels Yes
Run native kernels No
Kernel execution timeout (NV) Yes
Concurrent copy and kernel execution (NV) Yes
Number of async copy engines 2
printf() buffer size 1048576 (1024KiB)
#define N 70 // N > index, which is the total number of literals
#define BASE 4294967296UL
//! Represents the state of a particular generator
typedef struct{ uint x; uint c; } mwc64x_state_t;
enum{ MWC64X_A = 4294883355U };
enum{ MWC64X_M = 18446383549859758079UL };
void MWC64X_Step(mwc64x_state_t *s)
{
uint X=s->x, C=s->c;
uint Xn=MWC64X_A*X+C;
uint carry=(uint)(Xn<C); // The (Xn<C) will be zero or one for scalar
uint Cn=mad_hi(MWC64X_A,X,carry);
s->x=Xn;
s->c=Cn;
}
//! Return a 32-bit integer in the range [0..2^32)
uint MWC64X_NextUint(mwc64x_state_t *s)
{
uint res=s->x ^ s->c;
MWC64X_Step(s);
return res;
}
__kernel void setInfluence(const int literals, const int size, const int dim1_size, __global int* lambdas, __global float* lambdap, __global int* dim2_size, __global float* influence){
int flag=get_global_id(0);
int sum=0;
int count=10000;
int assignment[N];
//or try to get newlambda like original version does
if(flag < literals){
mwc64x_state_t rng;
for(int i=0; i<count; i++){
for(int j=0; j<size; j++){
uint randint=MWC64X_NextUint(&rng);
float rand=randint*1.0/BASE;
//if(flag==0)
// printf("randint=%u",randint);
if(lambdap[j]<rand)
assignment[lambdas[j]]=0;
else
assignment[lambdas[j]]=1;
}
//the true case
assignment[flag]=1;
int valuet=0;
int index=0;
for(int m=0; m<dim1_size; m++){
int valueMono=1;
for(int n=0; n<dim2_size[m]; n++){
if(assignment[lambdas[index+n]]==0){
valueMono=0;
index+=dim2_size[m];
break;
}
}
if(valueMono==1){
valuet=1;
break;
}
}
//the false case
assignment[flag]=0;
int valuef=0;
index=0;
for(int m=0; m<dim1_size; m++){
int valueMono=1;
for(int n=0; n<dim2_size[m]; n++){
if(assignment[lambdas[index+n]]==0){
valueMono=0;
index+=dim2_size[m];
break;
}
}
if(valueMono==1){
valuef=1;
break;
}
}
sum += valuet-valuef;
}
influence[flag] = 1.0*sum/count;
printf("sum%d:%d\t", flag, sum);
}
}
What might be the problem when running the code on the GPU? Is it MWC64X? According to its author, it should perform well on NVIDIA GPUs. If so, how can I fix it; if not, what else might be the problem?
(This started out as a comment; it turned out to be the source of the problem, so I'm turning it into an answer.)
You're not initialising your mwc64x_state_t rng; variable before reading from it, so any results will be undefined:
mwc64x_state_t rng;
for(int i=0; i<count; i++){
for(int j=0; j<size; j++){
uint randint=MWC64X_NextUint(&rng);
Where MWC64X_NextUint() immediately reads from the rng state before updating it:
uint MWC64X_NextUint(mwc64x_state_t *s)
{
uint res=s->x ^ s->c;
Note that you will probably want to seed your RNG differently for each work-item, otherwise you will get nasty correlation artifacts in your results.
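A minimal per-work-item seeding sketch in OpenCL C, purely for illustration: here baseSeed is a hypothetical extra kernel argument supplied by the host, the x/c fields are those of the mwc64x_state_t struct shown in the question, and the 64-bit mixing constants are the usual splitmix64 ones. (If you include the full mwc64x_rng.cl from the library, it also ships a MWC64X_SeedStreams() helper that seeds each work-item by skipping ahead in the stream, which is the more rigorous route.)
// Illustrative seeding only -- not part of the original kernel.
mwc64x_state_t rng;
ulong gid  = get_global_id(0);
ulong seed = (ulong)baseSeed + gid * 0x9E3779B97F4A7C15UL;   // baseSeed: assumed new kernel argument
// splitmix64-style mixing so that neighbouring work-items do not correlate
seed ^= seed >> 30; seed *= 0xBF58476D1CE4E5B9UL;
seed ^= seed >> 27; seed *= 0x94D049BB133111EBUL;
seed ^= seed >> 31;
rng.x = (uint)seed;                                          // any 32-bit value is acceptable for x
rng.c = (uint)(seed >> 32) % (MWC64X_A - 1) + 1;             // keep c in (0, MWC64X_A) to avoid degenerate states
The important point is that every work-item starts from a distinct, well-mixed (x, c) pair, and that the same baseSeed reproduces the same streams across runs.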
Any use of pseudo-random numbers is a next-level challenge on truly [PARALLEL] computing platforms (platforms, not languages).
Either there is some hardware source of randomness, which gets us into trouble once massively parallel requests have to be served fairly in a truly [PARALLEL] fashion (hardware resources may help here, yet at the cost of not being able to reproduce the same behaviour outside of that very platform and moment in time, unless the source is software-operated with some seed-injection feature that sets up the "just"-pseudo-random algorithm producing a purely [SERIAL] sequence of "just"-pseudo-random numbers).
Or there is some "shared" generator of pseudo-random numbers, which enjoys a higher system-wide level of entropy (good for the resulting "quality" of pseudo-randomness) but at the cost of a purely serial dependence (no parallel execution is possible; the sequence is served one request after another) and close to zero chance of repeatable runs producing the same sequences, which is a must for reproducible science and is needed for testing and method validation.
SUMMARY:
The code may employ work-item-"private" pseudo-random generator functions (privacy is a must, both for parallel code execution and for the mutual independence, i.e. non-interference, of the generated pseudo-random numbers), yet each instance must be a) independently initialised, so as to provide the level of randomness expected from parallelised code runs, and b) initialised in a repeatably reproducible manner, so that the tests can be re-run at different times, often on different OpenCL target computing platforms.
For __kernel-s that do not rely on hardware-specific sources of randomness, meeting conditions a && b is enough to obtain repeatably reproducible (identical) results for testing "in vitro", while still providing a reasonably random method of generating results during generic production-level code runs "in vivo".
The comparison of net run-times (benchmarked above) seems to show that Amdahl's-law add-on overhead costs, plus a tail-end effect of the atomicity of work, meant the net run-time was ~3.6x faster on the XEON than on the GPU:
index1 = 17
size = 51
dim1_size = 6
sum0: 4781 influence0: 0.478100
sum2: 4781 influence2: 0.478100
sum6: 0 influence6: 0.000000
sum10: 0 influence10: 0.000000
sum12: 0 influence12: 0.000000
sum7: 0 influence7: 0.000000
sum4: 5962 influence4: 0.596200
sum8: 7971 influence8: 0.797100
sum1: 4781 influence1: 0.478100
sum3: 4781 influence3: 0.478100
sum13: 0 influence13: 0.000000
sum11: 1261 influence11: 0.126100
sum9: 0 influence9: 0.000000
sum14: 0 influence14: 0.000000
sum5: 0 influence5: 0.000000
sum15: 0 influence15: 0.000000
sum16: 0 influence16: 0.000000
Parallel influence running time: 0.054391 seconds on XEON E5-2630L v3 @ 1.80GHz using OpenCL
|....
index1 = 17 |....
size = 51 |....
dim1_size = 6 |....
sum0: 10000 |....
sum1: 10000 |....
sum2: 10000 |....
sum3: 10000 |....
sum4: 10000 |....
sum5: 0 |....
sum6: 0 |....
sum7: 0 |....
sum8: 10000 |....
sum9: 0 |....
sum10: 0 |....
sum11: 0 |....
sum12: 0 |....
sum13: 0 |....
sum14: 0 |....
sum15: 0 |....
sum16: 0 |....
Parallel influence running time: 0.193581 seconds on GeForce GTX 1080 Ti using OpenCL
[EDIT: added MCVE in the text, clarifications]
I have the following program that sets RLIMIT_CPU to 2 seconds using setrlimit() and catches the signal. RLIMIT_CPU limits CPU time. «When the process reaches the soft limit, it is sent a SIGXCPU signal. The default action for this signal is to terminate the process. However, the signal can be caught, and the handler can return control to the main program.» (man)
The following program sets RLIMIT_CPU and a signal handler for SIGXCPU, then generates random numbers until SIGXCPU is raised; the signal handler simply exits the program.
test_signal.cpp
/*
* Test program for signal handling on CMS.
*
* Compile with:
* /usr/bin/g++ [-DDEBUG] -Wall -std=c++11 -O2 -pipe -static -s \
* -o test_signal test_signal.cpp
*
* The option -DDEBUG activates some debug logging in the helpers library.
*/
#include <iostream>
#include <fstream>
#include <random>
#include <chrono>
#include <iostream>
#include <unistd.h>
#include <csignal>
#include <sys/time.h>
#include <sys/resource.h>
using namespace std;
namespace helpers {
long long start_time = -1;
volatile sig_atomic_t timeout_flag = false;
unsigned const timelimit = 2; // soft limit on CPU time (in seconds)
void setup_signal(void);
void setup_time_limit(void);
static void signal_handler(int signum);
long long get_elapsed_time(void);
bool has_reached_timeout(void);
void setup(void);
}
namespace {
unsigned const minrand = 5;
unsigned const maxrand = 20;
int const numcycles = 5000000;
};
/*
* Very simple debugger, enabled at compile time with -DDEBUG.
* If enabled, it prints on stderr, otherwise it does nothing (it does not
* even evaluate the expression on its right-hand side).
*
* Main ideas taken from:
* - C++ enable/disable debug messages of std::couts on the fly
* (https://stackoverflow.com/q/3371540/2377454)
* - Standard no-op output stream
* (https://stackoverflow.com/a/11826787/2377454)
*/
#ifdef DEBUG
#define debug true
#else
#define debug false
#endif
#define debug_logger if (!debug) \
{} \
else \
cerr << "[DEBUG] helpers::"
// conversion factor betwen seconds and nanoseconds
#define NANOS 1000000000
// signal to handle
#define SIGNAL SIGXCPU
#define TIMELIMIT RLIMIT_CPU
/*
* This could be a function factory wrapping a closure of the signal-handling
* function, so that we could explicitly pass the output ofstream and close it.
* Alas, C++ supports closures only for lambdas, and at the moment we also need
* the signal-handling function to be a pointer to a function; lambdas are
* a different kind of object and cannot be converted. See:
* - Passing lambda as function pointer
* (https://stackoverflow.com/a/28746827/2377454)
*/
void helpers::signal_handler(int signum) {
helpers::timeout_flag = true;
debug_logger << "signal_handler:\t" << "signal " << signum \
<< " received" << endl;
debug_logger << "signal_handler:\t" << "exiting after " \
<< helpers::get_elapsed_time() << " microseconds" << endl;
exit(0);
}
/*
* Set function signal_handler() as handler for SIGXCPU using sigaction. See
* - https://stackoverflow.com/q/4863420/2377454
* - https://stackoverflow.com/a/17572787/2377454
*/
void helpers::setup_signal() {
debug_logger << "set_signal:\t" << "set_signal() called" << endl;
struct sigaction new_action;
//Set the handler in the new_action struct
new_action.sa_handler = signal_handler;
// Set sa_mask to empty. It means that no signal is blocked
// while the handler runs.
sigemptyset(&new_action.sa_mask);
// Block the SIGXCPU signal: while the handler runs, further SIGXCPU signals are held.
sigaddset(&new_action.sa_mask, SIGNAL);
// Remove any flag from sa_flag
new_action.sa_flags = 0;
// Set new action
sigaction(SIGNAL,&new_action,NULL);
if(debug) {
struct sigaction tmp;
// read the old signal associated to SIGXCPU
sigaction(SIGNAL, NULL, &tmp);
debug_logger << "set_signal:\t" << "action.sa_handler: " \
<< tmp.sa_handler << endl;
}
return;
}
/*
* Set soft CPU time limit.
* RLIMIT_CPU sets the CPU time limit in seconds.
* See:
* - https://www.go4expert.com/articles/
* getrlimit-setrlimit-control-resources-t27477/
* - https://gist.github.com/Leporacanthicus/11086960
*/
void helpers::setup_time_limit(void) {
debug_logger << "set_limit:\t\t" << "set_limit() called" << endl;
struct rlimit limit;
if(getrlimit(TIMELIMIT, &limit) != 0) {
perror("error calling getrlimit()");
exit(EXIT_FAILURE);
}
limit.rlim_cur = helpers::timelimit;
if(setrlimit(TIMELIMIT, &limit) != 0) {
perror("error calling setrlimit()");
exit(EXIT_FAILURE);
}
if (debug) {
struct rlimit tmp;
getrlimit(TIMELIMIT, &tmp);
debug_logger << "set_limit:\t\t" << "current limit: " << tmp.rlim_cur \
<< " seconds" << endl;
}
return;
}
void helpers::setup(void) {
struct timespec start;
if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start)) {
exit(EXIT_FAILURE);
}
start_time = start.tv_sec*NANOS + start.tv_nsec;
setup_signal();
setup_time_limit();
return;
}
long long helpers::get_elapsed_time(void) {
struct timespec current;
if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &current)) {
exit(EXIT_FAILURE);
}
long long current_time = current.tv_sec*NANOS + current.tv_nsec;
long long elapsed_micro = (current_time - start_time)/1000 + \
((current_time - start_time) % 1000 >= 500);
return elapsed_micro;
}
bool helpers::has_reached_timeout(void) {
return helpers::timeout_flag;
}
int main() {
helpers::setup();
ifstream in("input.txt");
in.close();
ofstream out("output.txt");
random_device rd;
mt19937 eng(rd());
uniform_int_distribution<> distr(minrand, maxrand);
int i = 0;
while(!helpers::has_reached_timeout()) {
int nmsec;
for(int n=0; n<numcycles; n++) {
nmsec = distr(eng);
}
cout << "i: " << i << "\t- nmsec: " << nmsec << "\t- ";
out << "i: " << i << "\t- nmsec: " << nmsec << "\t- ";
cout << "program has been running for " << \
helpers::get_elapsed_time() << " microseconds" << endl;
out << "program has been running for " << \
helpers::get_elapsed_time() << " microseconds" << endl;
i++;
}
return 0;
}
I compile it as follows:
/usr/bin/g++ -DDEBUG -Wall -std=c++11 -O2 -pipe -static -s -o test_signal test_signal.cpp
On my laptop it correctly gets a SIGXCPU after 2 seconds, see the output:
$ /usr/bin/time -v ./test_signal
[DEBUG] helpers::set_signal: set_signal() called
[DEBUG] helpers::set_signal: action.sa_handler: 1
[DEBUG] helpers::set_limit: set_limit() called
[DEBUG] helpers::set_limit: current limit: 2 seconds
i: 0 - nmsec: 11 - program has been running for 150184 microseconds
i: 1 - nmsec: 18 - program has been running for 294497 microseconds
i: 2 - nmsec: 9 - program has been running for 422220 microseconds
i: 3 - nmsec: 5 - program has been running for 551882 microseconds
i: 4 - nmsec: 20 - program has been running for 685373 microseconds
i: 5 - nmsec: 16 - program has been running for 816642 microseconds
i: 6 - nmsec: 9 - program has been running for 951208 microseconds
i: 7 - nmsec: 20 - program has been running for 1085614 microseconds
i: 8 - nmsec: 20 - program has been running for 1217199 microseconds
i: 9 - nmsec: 12 - program has been running for 1350183 microseconds
i: 10 - nmsec: 17 - program has been running for 1486431 microseconds
i: 11 - nmsec: 13 - program has been running for 1619845 microseconds
i: 12 - nmsec: 20 - program has been running for 1758074 microseconds
i: 13 - nmsec: 11 - program has been running for 1895408 microseconds
[DEBUG] helpers::signal_handler: signal 24 received
[DEBUG] helpers::signal_handler: exiting after 2003326 microseconds
Command being timed: "./test_signal"
User time (seconds): 1.99
System time (seconds): 0.00
Percent of CPU this job got: 99%
Elapsed (wall clock) time (h:mm:ss or m:ss): 0:02.01
Average shared text size (kbytes): 0
Average unshared data size (kbytes): 0
Average stack size (kbytes): 0
Average total size (kbytes): 0
Maximum resident set size (kbytes): 1644
Average resident set size (kbytes): 0
Major (requiring I/O) page faults: 0
Minor (reclaiming a frame) page faults: 59
Voluntary context switches: 1
Involuntary context switches: 109
Swaps: 0
File system inputs: 0
File system outputs: 16
Socket messages sent: 0
Socket messages received: 0
Signals delivered: 0
Page size (bytes): 4096
Exit status: 0
If I compile and run in a virtual machine (VirtualBox, running Ubuntu), I get this:
$ /usr/bin/time -v ./test_signal
[DEBUG] helpers::set_signal: set_signal() called
[DEBUG] helpers::set_signal: action.sa_handler: 1
[DEBUG] helpers::set_limit: set_limit() called
[DEBUG] helpers::set_limit: current limit: 2 seconds
i: 0 - nmsec: 12 - program has been running for 148651 microseconds
i: 1 - nmsec: 13 - program has been running for 280494 microseconds
i: 2 - nmsec: 7 - program has been running for 428390 microseconds
i: 3 - nmsec: 5 - program has been running for 580805 microseconds
i: 4 - nmsec: 10 - program has been running for 714362 microseconds
i: 5 - nmsec: 19 - program has been running for 846853 microseconds
i: 6 - nmsec: 20 - program has been running for 981253 microseconds
i: 7 - nmsec: 7 - program has been running for 1114686 microseconds
i: 8 - nmsec: 7 - program has been running for 1249530 microseconds
i: 9 - nmsec: 12 - program has been running for 1392096 microseconds
i: 10 - nmsec: 20 - program has been running for 1531859 microseconds
i: 11 - nmsec: 19 - program has been running for 1667021 microseconds
i: 12 - nmsec: 13 - program has been running for 1818431 microseconds
i: 13 - nmsec: 17 - program has been running for 1973182 microseconds
i: 14 - nmsec: 7 - program has been running for 2115423 microseconds
i: 15 - nmsec: 20 - program has been running for 2255140 microseconds
i: 16 - nmsec: 13 - program has been running for 2394162 microseconds
i: 17 - nmsec: 10 - program has been running for 2528274 microseconds
i: 18 - nmsec: 15 - program has been running for 2667978 microseconds
i: 19 - nmsec: 8 - program has been running for 2803725 microseconds
i: 20 - nmsec: 9 - program has been running for 2940610 microseconds
i: 21 - nmsec: 19 - program has been running for 3075349 microseconds
i: 22 - nmsec: 14 - program has been running for 3215255 microseconds
i: 23 - nmsec: 5 - program has been running for 3356515 microseconds
i: 24 - nmsec: 5 - program has been running for 3497369 microseconds
[DEBUG] helpers::signal_handler: signal 24 received
[DEBUG] helpers::signal_handler: exiting after 3503271 microseconds
Command being timed: "./test_signal"
User time (seconds): 3.50
System time (seconds): 0.00
Percent of CPU this job got: 99%
Elapsed (wall clock) time (h:mm:ss or m:ss): 0:03.52
Average shared text size (kbytes): 0
Average unshared data size (kbytes): 0
Average stack size (kbytes): 0
Average total size (kbytes): 0
Maximum resident set size (kbytes): 1636
Average resident set size (kbytes): 0
Major (requiring I/O) page faults: 0
Minor (reclaiming a frame) page faults: 59
Voluntary context switches: 0
Involuntary context switches: 106
Swaps: 0
File system inputs: 0
File system outputs: 16
Socket messages sent: 0
Socket messages received: 0
Signals delivered: 0
Page size (bytes): 4096
Exit status: 0
Even when running the binary compiled on my laptop, the process gets killed only after around 3.5 seconds of elapsed user time.
Any idea what could be causing this? For broader context, see this thread: https://github.com/cms-dev/cms/issues/851
I need to read data from a serial port device (which sends data every second) on Windows in REALTIME (<= 5 ms). But the time taken by ReadFile is unpredictable, which is driving me crazy. Some pieces of the code can be found at:
https://gist.github.com/morris-stock/62b1674b4cda0e9df84d4738e54773f8
the delay is dumped at https://gist.github.com/morris-stock/62b1674b4cda0e9df84d4738e54773f8#file-serialport_win-cc-L283
Poco::Timestamp now;
if (!ReadFile(_handle, buffer, size, &bytesRead, NULL))
throw Poco::IOException("failed to read from serial port");
Poco::Timestamp::TimeDiff elapsed = now.elapsed();
std::cout << Poco::DateTimeFormatter::format(now, "%Y-%m-%d %H:%M:%S.%i")
<< ", elapsed: " << elapsed << ", data len: " << bytesRead << std::endl << std::flush;
Sometimes ReadFile takes about 3000 us to return (which is OK, and affected by COMMTIMEOUTS), but sometimes it takes 15600 us (NOT affected by COMMTIMEOUTS).
Please let me know if there is anything more I can provide to make the problem clearer.
P.S.
COMMTIMEOUTS:
COMMTIMEOUTS cto;
cto.ReadIntervalTimeout = 1;
cto.ReadTotalTimeoutConstant = 1;
cto.ReadTotalTimeoutMultiplier = 0;
cto.WriteTotalTimeoutConstant = MAXDWORD;
cto.WriteTotalTimeoutMultiplier = 0;
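For completeness, these values are applied to the open port handle with SetCommTimeouts; a minimal sketch, assuming _handle is the HANDLE returned by CreateFile for the COM port and cto is the struct filled in above:
// Sketch only: push the COMMTIMEOUTS shown above onto the port handle.
if (!SetCommTimeouts(_handle, &cto)) {
    // On failure, GetLastError() explains why the timeouts were not applied.
}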
the main reading thread part:
https://gist.github.com/morris-stock/62b1674b4cda0e9df84d4738e54773f8#file-serialdevice-cc-L31
Device data:
baud rate: 9600; it sends about 400 bytes per second (in one burst, then no data for the rest of the second).
Console output:
wPacketLength: 64
wPacketVersion: 2
dwServiceMask: 1
dwReserved1: 0
dwMaxTxQueue: 0
dwMaxRxQueue: 0
dwMaxBaud: 268435456
dwProvSubType: 1
dwProvCapabilities: 255
dwSettableParams: 127
dwSettableBaud: 268959743
wSettableData: 15
wSettableStopParity: 7943
dwCurrentTxQueue: 0
dwCurrentRxQueue: 68824
dwProvSpec1: 0
dwProvSpec2: 1128813859
wcProvChar: 0039F16C
2018-01-22 03:35:52.658, elapsed: 15600, data len: 0
2018-01-22 03:35:52.673, elapsed: 15600, data len: 0
2018-01-22 03:35:52.689, elapsed: 15600, data len: 0
2018-01-22 03:35:52.704, elapsed: 15600, data len: 0
2018-01-22 03:35:52.720, elapsed: 15600, data len: 0
2018-01-22 03:35:52.736, elapsed: 15600, data len: 0
2018-01-22 03:35:52.751, elapsed: 15600, data len: 0
In my case, it's the Windows system clock resolution that matters.
ClockRes gives me:
C:\work\utils\ClockRes>Clockres.exe
Clockres v2.1 - Clock resolution display utility
Copyright (C) 2016 Mark Russinovich
Sysinternals
Maximum timer interval: 15.600 ms
Minimum timer interval: 0.500 ms
Current timer interval: 1.000 ms
and
C:\work\utils\ClockRes>Clockres.exe
Clockres v2.1 - Clock resolution display utility
Copyright (C) 2016 Mark Russinovich
Sysinternals
Maximum timer interval: 15.600 ms
Minimum timer interval: 0.500 ms
Current timer interval: 15.600 ms
By calling timeBeginPeriod(1) when my app starts, I get a more consistent result.
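For reference, a minimal sketch of that call (timeBeginPeriod lives in winmm, so link against winmm.lib, and every timeBeginPeriod should be paired with a matching timeEndPeriod on exit):
#include <windows.h>   // timeBeginPeriod/timeEndPeriod come in via mmsystem.h

int main(void)
{
    // Raise the system timer resolution from the default 15.6 ms to 1 ms.
    if (timeBeginPeriod(1) != TIMERR_NOERROR) {
        // The request was rejected; timings stay at the coarse resolution.
    }

    /* ... open the serial port and run the ReadFile() loop here ... */

    timeEndPeriod(1);  // restore the previous timer resolution
    return 0;
}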
Thanks everyone for your kind help.
I created a basic TCP server that reads incoming binary data in protocol buffer format and writes a binary message as the response. I would like to benchmark the roundtrip time.
I tried iperf, but could not make it send the same input file multiple times. Is there another benchmark tool that can send a binary input file repeatedly?
If you have access to a Linux or Unix machine [1], you should use tcptrace. All you need to do is loop through your binary traffic test while capturing with wireshark or tcpdump.
After you have that .pcap file [2], analyze it with tcptrace -xtraffic <pcap_filename> [3]. This will generate two text files, and the average RTT stats for all connections in that pcap are shown at the bottom of the one called traffic_stats.dat.
[mpenning@Bucksnort tcpperf]$ tcptrace -xtraffic willers.pcap
mod_traffic: characterizing traffic
1 arg remaining, starting with 'willers.pcap'
Ostermann's tcptrace -- version 6.6.1 -- Wed Nov 19, 2003
16522 packets seen, 16522 TCP packets traced
elapsed wallclock time: 0:00:00.200709, 82318 pkts/sec analyzed
trace file elapsed time: 0:03:21.754962
Dumping port statistics into file traffic_byport.dat
Dumping overall statistics into file traffic_stats.dat
Plotting performed at 15.000 second intervals
[mpenning@Bucksnort tcpperf]$
[mpenning@Bucksnort tcpperf]$ cat traffic_stats.dat
Overall Statistics over 201 seconds (0:03:21.754962):
4135308 ttl bytes sent, 20573.672 bytes/second
4135308 ttl non-rexmit bytes sent, 20573.672 bytes/second
0 ttl rexmit bytes sent, 0.000 bytes/second
16522 packets sent, 82.199 packets/second
200 connections opened, 0.995 conns/second
11 dupacks sent, 0.055 dupacks/second
0 rexmits sent, 0.000 rexmits/second
average RTT: 67.511 msecs <------------------
[mpenning@Bucksnort tcpperf]$
The .pcap file used in this example was a capture I generated when I looped through an expect script that pulled data from one of my servers. This was how I generated the loop...
#!/usr/bin/python
from subprocess import Popen, PIPE
import time
for ii in xrange(0,200):
# willers.exp is an expect script
Popen(['./willers.exp'], stdin=PIPE, stdout=PIPE, stderr=PIPE)
time.sleep(1)
You can adjust the sleep time between loops based on your server's accept() performance and the duration of your tests.
END NOTES:
[1] A Knoppix Live-CD will do
[2] Filtered to only capture test traffic
[3] tcptrace is capable of very detailed per-socket stats if you use other options...
================================
[mpenning@Bucksnort tcpperf]$ tcptrace -lr willers.pcap
1 arg remaining, starting with 'willers.pcap'
Ostermann's tcptrace -- version 6.6.1 -- Wed Nov 19, 2003
16522 packets seen, 16522 TCP packets traced
elapsed wallclock time: 0:00:00.080496, 205252 pkts/sec analyzed
trace file elapsed time: 0:03:21.754962
TCP connection info:
200 TCP connections traced:
TCP connection 1:
host c: myhost.local:44781
host d: willers.local:22
complete conn: RESET (SYNs: 2) (FINs: 1)
first packet: Tue May 31 22:52:24.154801 2011
last packet: Tue May 31 22:52:25.668430 2011
elapsed time: 0:00:01.513628
total packets: 73
filename: willers.pcap
c->d: d->c:
total packets: 34 total packets: 39
resets sent: 4 resets sent: 0
ack pkts sent: 29 ack pkts sent: 39
pure acks sent: 11 pure acks sent: 2
sack pkts sent: 0 sack pkts sent: 0
dsack pkts sent: 0 dsack pkts sent: 0
max sack blks/ack: 0 max sack blks/ack: 0
unique bytes sent: 2512 unique bytes sent: 14336
actual data pkts: 17 actual data pkts: 36
actual data bytes: 2512 actual data bytes: 14336
rexmt data pkts: 0 rexmt data pkts: 0
rexmt data bytes: 0 rexmt data bytes: 0
zwnd probe pkts: 0 zwnd probe pkts: 0
zwnd probe bytes: 0 zwnd probe bytes: 0
outoforder pkts: 0 outoforder pkts: 0
pushed data pkts: 17 pushed data pkts: 33
SYN/FIN pkts sent: 1/1 SYN/FIN pkts sent: 1/0
req 1323 ws/ts: Y/Y req 1323 ws/ts: Y/Y
adv wind scale: 6 adv wind scale: 1
req sack: Y req sack: Y
sacks sent: 0 sacks sent: 0
urgent data pkts: 0 pkts urgent data pkts: 0 pkts
urgent data bytes: 0 bytes urgent data bytes: 0 bytes
mss requested: 1460 bytes mss requested: 1460 bytes
max segm size: 792 bytes max segm size: 1448 bytes
min segm size: 16 bytes min segm size: 32 bytes
avg segm size: 147 bytes avg segm size: 398 bytes
max win adv: 40832 bytes max win adv: 66608 bytes
min win adv: 5888 bytes min win adv: 66608 bytes
zero win adv: 0 times zero win adv: 0 times
avg win adv: 14035 bytes avg win adv: 66608 bytes
initial window: 32 bytes initial window: 40 bytes
initial window: 1 pkts initial window: 1 pkts
ttl stream length: 2512 bytes ttl stream length: NA
missed data: 0 bytes missed data: NA
truncated data: 0 bytes truncated data: 0 bytes
truncated packets: 0 pkts truncated packets: 0 pkts
data xmit time: 1.181 secs data xmit time: 1.236 secs
idletime max: 196.9 ms idletime max: 196.9 ms
throughput: 1660 Bps throughput: 9471 Bps
RTT samples: 18 RTT samples: 24
RTT min: 43.8 ms RTT min: 0.0 ms
RTT max: 142.5 ms RTT max: 7.2 ms
RTT avg: 68.5 ms RTT avg: 0.7 ms
RTT stdev: 35.8 ms RTT stdev: 1.6 ms
RTT from 3WHS: 80.8 ms RTT from 3WHS: 0.0 ms
RTT full_sz smpls: 1 RTT full_sz smpls: 3
RTT full_sz min: 142.5 ms RTT full_sz min: 0.0 ms
RTT full_sz max: 142.5 ms RTT full_sz max: 0.0 ms
RTT full_sz avg: 142.5 ms RTT full_sz avg: 0.0 ms
RTT full_sz stdev: 0.0 ms RTT full_sz stdev: 0.0 ms
post-loss acks: 0 post-loss acks: 0
segs cum acked: 0 segs cum acked: 9
duplicate acks: 0 duplicate acks: 1
triple dupacks: 0 triple dupacks: 0
max # retrans: 0 max # retrans: 0
min retr time: 0.0 ms min retr time: 0.0 ms
max retr time: 0.0 ms max retr time: 0.0 ms
avg retr time: 0.0 ms avg retr time: 0.0 ms
sdv retr time: 0.0 ms sdv retr time: 0.0 ms
================================
You can always stick a shell loop around a program like iperf. Also, assuming iperf can read from a file (and thus from stdin), or using programs like ttcp, a shell loop could cat a file N times into iperf/ttcp.
If you want a program which sends a file, waits for your binary response, and then sends another copy of the file, you will probably need to code that yourself.
You will need to measure the time in the client application for a roundtrip time, or monitor the network traffic going from, and coming to, the client to get the complete time interval. Measuring the time at the server will exclude any kernel level delays in the server and all the network transmission times.
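For example, one bare-bones way to time a single request/response round trip in the client, as a sketch only: it assumes an already-connected TCP socket fd, a POSIX system with clock_gettime, and that the expected reply length resp_len is known up front (none of these names come from the original post).
#include <sys/types.h>
#include <sys/socket.h>
#include <time.h>

/* Sketch: send one serialized request and time how long the complete reply takes. */
static double roundtrip_ms(int fd, const void *msg, size_t msg_len, size_t resp_len)
{
    char buf[4096];
    struct timespec t0, t1;

    clock_gettime(CLOCK_MONOTONIC, &t0);
    if (send(fd, msg, msg_len, 0) != (ssize_t)msg_len)
        return -1.0;                                /* short write or error */

    size_t got = 0;
    while (got < resp_len) {                        /* keep reading until the whole reply is in */
        ssize_t n = recv(fd, buf, sizeof(buf), 0);
        if (n <= 0)
            return -1.0;                            /* connection closed or error */
        got += (size_t)n;
    }
    clock_gettime(CLOCK_MONOTONIC, &t1);

    return (t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) / 1e6;
}
Calling this in a loop over many iterations and averaging (or looking at percentiles) gives the client-side round-trip figure described above.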
Note that TCP performance will go down as the load goes up. If you're going to test under heavy load, you need professional tools that can scale to thousands (or even millions in some cases) of new connection/second or concurrent established TCP connections.
I wrote an article about this on my blog (feel free to remove if this is considered advertisement, but I think it's relevant to this thread): http://synsynack.wordpress.com/2012/04/09/realistic-latency-measurement-in-the-application-layers
As a very simple high-level tool, netcat comes to mind... so something like time (nc hostname 1234 < input.binary | head -c 100), assuming the response is 100 bytes long.