Hello I am developing an embedded HID device which gives me an error when connected:
This device cannot start. (Code 10)
Extra end collection found or end collection not found.
This seems to be a problem with my Report Map, but according to HID tool it is fine:
/*
 * HID report descriptor for a 3-button mouse with relative 8-bit X/Y axes,
 * sent under report ID 1.
 *
 * NOTE(review): the item list opens a single COLLECTION (Application) but is
 * terminated by two END_COLLECTION items, and it provides 50 initializers for
 * the 52 declared bytes, so the final two bytes are implicitly zero.
 */
char ReportDescriptor[52] = {
0x05, 0x01, // USAGE_PAGE (Generic Desktop)
0x09, 0x02, // USAGE (Mouse)
0xa1, 0x01, // COLLECTION (Application)
0x09, 0x01, // USAGE (Pointer)
0x85, 0x01, // REPORT_ID (1)
0x05, 0x09, // USAGE_PAGE (Button)
0x19, 0x01, // USAGE_MINIMUM (Button 1)
0x29, 0x03, // USAGE_MAXIMUM (Button 3)
0x15, 0x00, // LOGICAL_MINIMUM (0)
0x25, 0x01, // LOGICAL_MAXIMUM (1)
0x95, 0x03, // REPORT_COUNT (3)
0x75, 0x01, // REPORT_SIZE (1)
0x81, 0x02, // INPUT (Data,Var,Abs)
0x95, 0x01, // REPORT_COUNT (1)
0x75, 0x05, // REPORT_SIZE (5)
0x81, 0x03, // INPUT (Cnst,Var,Abs)
0x05, 0x01, // USAGE_PAGE (Generic Desktop)
0x09, 0x30, // USAGE (X)
0x09, 0x31, // USAGE (Y)
0x15, 0x81, // LOGICAL_MINIMUM (-127)
0x25, 0x7f, // LOGICAL_MAXIMUM (127)
0x75, 0x08, // REPORT_SIZE (8)
0x95, 0x02, // REPORT_COUNT (2)
0x81, 0x06, // INPUT (Data,Var,Rel)
0xc0, // END_COLLECTION (closes the Application collection)
0xc0 // END_COLLECTION (no matching COLLECTION item above)
};
Any ideas?
You appear to have omitted a PHYSICAL COLLECTION item (which accounts for the extra END COLLECTION item at the end I guess). Did you mean:
/*
 * HID report descriptor for a 3-button mouse with relative 8-bit X/Y axes,
 * sent under report ID 1.
 *
 * Input report layout after the report ID byte: 3 button bits, 5 constant
 * padding bits, then signed 8-bit relative X and Y.
 * The Pointer usage opens a Physical collection nested inside the Mouse
 * Application collection, so each of the two END_COLLECTION items has a
 * matching COLLECTION, and the 52 initializers fill the 52 declared bytes.
 */
char ReportDescriptor[52] = {
0x05, 0x01, // USAGE_PAGE (Generic Desktop)
0x09, 0x02, // USAGE (Mouse)
0xa1, 0x01, // COLLECTION (Application)
0x09, 0x01, // USAGE (Pointer)
0xA1, 0x00, // COLLECTION (Physical) <-- inserted
0x85, 0x01, // REPORT_ID (1)
0x05, 0x09, // USAGE_PAGE (Button)
0x19, 0x01, // USAGE_MINIMUM (Button 1)
0x29, 0x03, // USAGE_MAXIMUM (Button 3)
0x15, 0x00, // LOGICAL_MINIMUM (0)
0x25, 0x01, // LOGICAL_MAXIMUM (1)
0x95, 0x03, // REPORT_COUNT (3)
0x75, 0x01, // REPORT_SIZE (1)
0x81, 0x02, // INPUT (Data,Var,Abs)
0x95, 0x01, // REPORT_COUNT (1)
0x75, 0x05, // REPORT_SIZE (5)
0x81, 0x03, // INPUT (Cnst,Var,Abs)
0x05, 0x01, // USAGE_PAGE (Generic Desktop)
0x09, 0x30, // USAGE (X)
0x09, 0x31, // USAGE (Y)
0x15, 0x81, // LOGICAL_MINIMUM (-127)
0x25, 0x7f, // LOGICAL_MAXIMUM (127)
0x75, 0x08, // REPORT_SIZE (8)
0x95, 0x02, // REPORT_COUNT (2)
0x81, 0x06, // INPUT (Data,Var,Rel)
0xc0, // END_COLLECTION (Physical)
0xc0 // END_COLLECTION (Application)
};
The free HID Report Descriptor Decoder tool when invoked with:
rexx rd.rex -d samples/win10.rd
decodes this as:
//--------------------------------------------------------------------------------
// Decoded Application Collection
//--------------------------------------------------------------------------------
/*
05 01 (GLOBAL) USAGE_PAGE 0x0001 Generic Desktop Page
09 02 (LOCAL) USAGE 0x00010002 Mouse (CA=Application Collection)
A1 01 (MAIN) COLLECTION 0x01 Application (Usage=0x00010002: Page=Generic Desktop Page, Usage=Mouse, Type=CA)
09 01 (LOCAL) USAGE 0x00010001 Pointer (CP=Physical Collection)
A1 00 (MAIN) COLLECTION 0x00 Physical (Usage=0x00010001: Page=Generic Desktop Page, Usage=Pointer, Type=CP)
85 01 (GLOBAL) REPORT_ID 0x01 (1)
05 09 (GLOBAL) USAGE_PAGE 0x0009 Button Page
19 01 (LOCAL) USAGE_MINIMUM 0x00090001 Button 1 Primary/trigger (MULTI=Selector, On/Off, Momentary, or One Shot)
29 03 (LOCAL) USAGE_MAXIMUM 0x00090003 Button 3 Tertiary (MULTI=Selector, On/Off, Momentary, or One Shot)
15 00 (GLOBAL) LOGICAL_MINIMUM 0x00 (0) <-- Redundant: LOGICAL_MINIMUM is already 0 <-- Info: Consider replacing 15 00 with 14
25 01 (GLOBAL) LOGICAL_MAXIMUM 0x01 (1)
95 03 (GLOBAL) REPORT_COUNT 0x03 (3) Number of fields
75 01 (GLOBAL) REPORT_SIZE 0x01 (1) Number of bits per field
81 02 (MAIN) INPUT 0x00000002 (3 fields x 1 bit) 0=Data 1=Variable 0=Absolute 0=NoWrap 0=Linear 0=PrefState 0=NoNull 0=NonVolatile 0=Bitmap
95 01 (GLOBAL) REPORT_COUNT 0x01 (1) Number of fields
75 05 (GLOBAL) REPORT_SIZE 0x05 (5) Number of bits per field
81 03 (MAIN) INPUT 0x00000003 (1 field x 5 bits) 1=Constant 1=Variable 0=Absolute 0=NoWrap 0=Linear 0=PrefState 0=NoNull 0=NonVolatile 0=Bitmap
05 01 (GLOBAL) USAGE_PAGE 0x0001 Generic Desktop Page
09 30 (LOCAL) USAGE 0x00010030 X (DV=Dynamic Value)
09 31 (LOCAL) USAGE 0x00010031 Y (DV=Dynamic Value)
15 81 (GLOBAL) LOGICAL_MINIMUM 0x81 (-127)
25 7F (GLOBAL) LOGICAL_MAXIMUM 0x7F (127)
75 08 (GLOBAL) REPORT_SIZE 0x08 (8) Number of bits per field
95 02 (GLOBAL) REPORT_COUNT 0x02 (2) Number of fields
81 06 (MAIN) INPUT 0x00000006 (2 fields x 8 bits) 0=Data 1=Variable 1=Relative 0=NoWrap 0=Linear 0=PrefState 0=NoNull 0=NonVolatile 0=Bitmap
C0 (MAIN) END_COLLECTION Physical
C0 (MAIN) END_COLLECTION Application
*/
And:
rexx rd.rex -s samples/win10.rd
...generates a C typedef for it as follows:
//--------------------------------------------------------------------------------
// Button Page inputReport 01 (Device --> Host)
//--------------------------------------------------------------------------------
typedef struct
{
uint8_t reportId; // Report ID = 0x01 (1)
// Collection: Mouse Pointer
uint8_t BTN_MousePointerButton1 : 1; // Usage 0x00090001: Button 1 Primary/trigger, Value = 0 to 1
uint8_t BTN_MousePointerButton2 : 1; // Usage 0x00090002: Button 2 Secondary, Value = 0 to 1
uint8_t BTN_MousePointerButton3 : 1; // Usage 0x00090003: Button 3 Tertiary, Value = 0 to 1
uint8_t : 5; // Pad
int8_t GD_MousePointerX; // Usage 0x00010030: X, Value = -127 to 127
int8_t GD_MousePointerY; // Usage 0x00010031: Y, Value = -127 to 127
} inputReport01_t;
Related
I am trying to access eSign application of the smart card. If I understood correctly for that I first need to authenticate as Signature Terminal (ST) during PACE. (Because currently if I try to select eSign application I get file not found.)
This is the APDU I am sending during MSESetAT to achieve that:
0x00 - instruction class
0x22 - instruction code
0xC1 - p1
0xA4 - p2
0x20 - length
0x80 - oid tag
0x0A - oid length
0x04, 0x00, 0x7F, 0x00, 0x07, 0x02, 0x02, 0x04, 0x02, 0x04 - PACE oid
0x83, 0x01, 0x02 - CAN password id
0x7F, 0x4C - Certificate Holder Authorization Template
0x0E - length
0x06 - oid tag
0x09 - oid length
0x04, 0x00, 0x7F, 0x00, 0x07, 0x03, 0x01, 0x02, 0x03 - id-ST oid (0.4.0.127.0.7.3.1.2.3)
0x53 - tag for data
0x01 - length of data
0x03 - 2 lowest bits set for generating qualified electronic signature, and electronic signature
0x00 - expected response length
The response I get is sw1 - 0x6A, sw2 - 0x80 which corresponds to: Incorrect parameters in the data field.
However, if I swap the id-ST oid with id-AT oid: 0x04, 0x00, 0x7F, 0x00, 0x07, 0x03, 0x01, 0x02, 0x02, it succeeds and I get sw1 - 0x90, sw2 - 0x00
Am I forgetting something or is it maybe not even possible to achieve over NFC?
The smart card I am trying with is a national ID card of Croatia, and I am following TR-03110 specifications.
ESP-IDF provides the hid_device example, which acts as a consumer controller (volume up/down, play, ...). I tried to modify the report descriptor so that the device appears as a mouse or a joystick. Unfortunately, the ESP32 connects to the PC over Bluetooth, but it does not function as a mouse or joystick. Here is the link to my source code:
https://drive.google.com/file/d/1SJmk0Ul37iJ1fC5FgiZ1vXz8VgC_hEoJ/view?usp=sharing
Is something wrong with my report descriptor?
/*
 * Vendor-defined (usage page 0xFF00) report map using report ID 2.
 * It declares three 8-byte reports of unsigned bytes (logical range 0..255):
 * an input report (usage 0x01), a feature report (usage 0x02) and an output
 * report (usage 0x03). The input and feature main items use the two-byte data
 * form, which the existing comments decode as "Buffered Bytes".
 */
const unsigned char hidapiReportMap[] = { // 8 bytes input, 8 bytes feature, 8 bytes output
0x06, 0x00, 0xFF, // Usage Page (Vendor Defined 0xFF00)
0x0A, 0x00, 0x01, // Usage (0x0100)
0xA1, 0x01, // Collection (Application)
0x85, 0x02, // Report ID (2)
0x15, 0x00, // Logical Minimum (0)
0x26, 0xFF, 0x00, // Logical Maximum (255)
0x75, 0x08, // Report Size (8)
0x95, 0x08, // Report Count (8)
0x09, 0x01, // Usage (0x01)
0x82, 0x02, 0x01, // Input (Data,Var,Abs,No Wrap,Linear,Preferred State,No Null Position,Buffered Bytes)
0x95, 0x08, // Report Count (8)
0x09, 0x02, // Usage (0x02)
0xB2, 0x02, 0x01, // Feature (Data,Var,Abs,No Wrap,Linear,Preferred State,No Null Position,Non-volatile,Buffered Bytes)
0x95, 0x08, // Report Count (8)
0x09, 0x03, // Usage (0x03)
0x91, 0x02, // Output (Data,Var,Abs,No Wrap,Linear,Preferred State,No Null Position,Non-volatile)
0xC0, // End Collection
// 38 bytes
};
/*
 * Report map describing a 3-button mouse with a wheel, sent under report ID 1.
 *
 * Input report payload (4 bytes, matching what esp_hidd_send_axis() sends):
 *   byte 0: bits 0..2 = buttons 1..3, bits 3..7 = constant padding
 *   byte 1: relative X (-127..127)
 *   byte 2: relative Y (-127..127)
 *   byte 3: relative wheel (-127..127)
 *
 * Fixes relative to the previous table:
 *   - the button byte now declares its usages (Button 1..3) and logical range;
 *   - the usage page is switched back to Generic Desktop before X/Y/Wheel,
 *     which were previously declared while the Button page was still active;
 *   - the Pointer usage opens a Physical collection inside the Application
 *     collection, following the mouse example in the HID specification.
 */
const unsigned char mediaReportMap[] = {
    0x05, 0x01, // Usage Page (Generic Desktop)
    0x09, 0x02, // Usage (Mouse)
    0xA1, 0x01, // Collection (Application)
    0x85, 0x01, //   Report ID (1)
    0x09, 0x01, //   Usage (Pointer)
    0xA1, 0x00, //   Collection (Physical)
    0x05, 0x09, //     Usage Page (Button)
    0x19, 0x01, //     Usage Minimum (Button 1)
    0x29, 0x03, //     Usage Maximum (Button 3)
    0x15, 0x00, //     Logical Minimum (0)
    0x25, 0x01, //     Logical Maximum (1)
    0x75, 0x01, //     Report Size (1)
    0x95, 0x03, //     Report Count (3)
    0x81, 0x02, //     Input (Data, Variable, Absolute) - button bits
    0x75, 0x05, //     Report Size (5)
    0x95, 0x01, //     Report Count (1)
    0x81, 0x03, //     Input (Constant, Variable, Absolute) - padding to a full byte
    0x05, 0x01, //     Usage Page (Generic Desktop)
    0x09, 0x30, //     Usage (X)
    0x09, 0x31, //     Usage (Y)
    0x09, 0x38, //     Usage (Wheel)
    0x15, 0x81, //     Logical Minimum (-127)
    0x25, 0x7F, //     Logical Maximum (127)
    0x75, 0x08, //     Report Size (8)
    0x95, 0x03, //     Report Count (3)
    0x81, 0x06, //     Input (Data, Variable, Relative)
    0xC0,       //   End Collection (Physical)
    0xC0        // End Collection (Application)
/*
Inactive joystick variant, kept unchanged for reference:
0x05, 0x01, // Usage Page(Generic Desktop)
0x09, 0x04, // Usage(Joystick) <---------------------------
0xA1, 0x01, // Collection(Application)
0x05, 0x01, // Usage Page(Generic Desktop)
// 8
0x85, 0x01, // Report ID
0x05, 0x01, // Usage Page(Generic Desktop)
0x09, 0x30, // Usage Page(X)
0x75, 0x10, // Report Size(16)
0x95, 0x01, // Report Count(1)
0x15, 0x00, // Logical Minimum(0)
0x26, 0xFF, // Logical Maximum(4095)
0x0F,
0x46, 0xFF, // Physical Maximum(4095)
0x0F,
0x81, 0x02, // Input(Variable)
// 28
0x05, 0x01, // Usage Page(Generic Desktop)
0x09, 0x31, // Usage Page(Y)
0x75, 0x10, // Report Size(16)
0x95, 0x01, // Report Count(1)
0x15, 0x00, // Logical Minimum(0)
0x26, 0xFF, // Logical Maximum(4095)
0x0F,
0x46, 0xFF, // Physical Maximum(4095)
0x0F,
0x81, 0x02, // Input(Variable)
0xC0 // End Collection*/
};
Here is the function that sends the input report:
/*
 * Sends a 4-byte input report carrying the low 8 bits of xVal and yVal.
 *
 * Report bytes: [0] = 0, [1] = xVal & 0xFF, [2] = yVal & 0xFF, [3] = 0.
 * The scratch buffer is 8 bytes long but only the first 4 are passed on.
 * NOTE(review): the two literal 1 arguments are presumably the report-map
 * index and the report ID expected by esp_hidd_dev_input_set() - confirm
 * against the ESP-IDF API.
 */
void esp_hidd_send_axis(int xVal, int yVal)
{
    /* Unnamed elements are zero-initialised, so bytes 0 and 3 stay 0. */
    uint8_t report[8] = {
        [1] = xVal & 0xFF,
        [2] = yVal & 0xFF,
    };

    esp_hidd_dev_input_set(hid_dev, 1, 1, report, 4);
}
Thanks for help.
ESP-IDF provides the sample code \esp-idf\examples\bluetooth\esp_hid_device, which for some reason does not support a BLE mouse.
Using \esp\esp-idf\examples\bluetooth\bluedroid\ble\ble_hid_device_demo, it supports BLE keypad, BLE mouse, and BLE consumer device. Share my reference code:
enter link description here
I'm trying to write a KMDF driver to simulate keystrokes.
When the driver receives IOCTL_HID_READ_REPORT it redirects the request to a queue:
switch (IoControlCode)
{
case IOCTL_HID_GET_DEVICE_DESCRIPTOR:
KdPrint(("GET DEVICE DESCRIPTOR\n"));
_Analysis_assume_(deviceContext->HidDescriptor.bLength != 0);
status = RequestCopyFromBuffer(Request, &deviceContext->HidDescriptor, deviceContext->HidDescriptor.bLength);
break;
case IOCTL_HID_GET_DEVICE_ATTRIBUTES:
KdPrint(("GET DEVICE ATTRIBUTES\n"));
status = RequestCopyFromBuffer(Request, &queueContext->DeviceContext->HidDeviceAttributes, sizeof(HID_DEVICE_ATTRIBUTES));
break;
case IOCTL_HID_GET_REPORT_DESCRIPTOR:
KdPrint(("GET REPORT DESCRIPTOR\n"));
status = RequestCopyFromBuffer(Request, deviceContext->ReportDescriptor, deviceContext->HidDescriptor.DescriptorList[0].wReportLength);
break;
case IOCTL_HID_READ_REPORT:
WdfRequestForwardToIoQueue(Request, QueueContext->DeviceContext->ManualQueue); // <= HERE
break;}
With a timer, items are regularly dequeued and the keyboard input report is copied in the request
/*
 * Timer callback that answers one pending read-report request, if any.
 *
 * The timer's parent object is treated as the manual queue. When a request
 * can be retrieved from that queue, a keyboard input report holding the single
 * key code 0x04 (no modifiers) is copied into the request and the request is
 * completed.
 *
 * The request is completed with the status returned by RequestCopyFromBuffer,
 * so a failed copy is reported to the requester instead of being completed
 * as a success with no valid data.
 *
 * Timer - the WDF timer whose parent is the manual queue.
 */
void EvtTimerFunc(_In_ WDFTIMER Timer)
{
    NTSTATUS status;
    WDFQUEUE queue;
    PMANUAL_QUEUE_CONTEXT queueContext;
    WDFREQUEST request;

    KdPrint(("EvtTimerFunc\n"));
    queue = (WDFQUEUE)WdfTimerGetParentObject(Timer);
    queueContext = GetManualQueueContext(queue);

    //
    // see if we have a request in manual queue
    //
    status = WdfIoQueueRetrieveNextRequest(queueContext->Queue, &request);
    if (!NT_SUCCESS(status))
    {
        // Nothing pending (or the queue could not be read): try again on the next tick.
        return;
    }

    KdPrint(("Handling"));

    BYTE keycodes[6] = {0};
    keycodes[0] = 0x04;

    HID_KEYBOARD_INPUT_REPORT report;
    report.ReportId = REPORT_ID_KEYBOARD_INPUT;
    report.Modifiers = 0;
    report._reserved = 0;
    memcpy(&report.KeyCodes, &keycodes, 6);

    // Propagate the copy result so the request is not completed as a success
    // when the report could not be written into its output buffer.
    status = RequestCopyFromBuffer(request, (UCHAR*)&report, sizeof(HID_KEYBOARD_INPUT_REPORT));
    WdfRequestComplete(request, status);
}
Although everything works without error, no keystrokes are emitted. What am I missing?
Descriptor:
/*
 * Report descriptor made of four top-level Application collections, each
 * tagged with its own report ID:
 *   - Mouse (REPORT_ID_MOUSE_INPUT): 3 button bits, 5 constant padding bits,
 *     then relative 8-bit X and Y.
 *   - Undefined usage (REPORT_ID_MOUSE_OUTPUT): 3-byte output report.
 *   - Keyboard (REPORT_ID_KEYBOARD_INPUT): 8 modifier bits (usages 224..231),
 *     one constant byte, a 6-byte key array (usages 0..101), plus 5 LED output
 *     bits padded with 3 constant bits.
 *   - Undefined usage (REPORT_ID_KEYBOARD_OUTPUT): 8-byte output report.
 */
HID_REPORT_DESCRIPTOR g_reportDescriptor[] = {
0x05, 0x01, // USAGE_PAGE (Generic Desktop)
0x09, 0x02, // USAGE (Mouse)
0xA1, 0x01, // COLLECTION (Application)
0x85, REPORT_ID_MOUSE_INPUT, // REPORT_ID (mouse input)
0x09, 0x01, // USAGE (Pointer)
0xA1, 0x00, // COLLECTION (Physical)
0x05, 0x09, // USAGE_PAGE (Buttons)
0x19, 0x01, // USAGE_MINIMUM (1)
0x29, 0x03, // USAGE_MAXIMUM (3)
0x15, 0x00, // LOGICAL_MINIMUM (0)
0x25, 0x01, // LOGICAL_MAXIMUM (1)
0x95, 0x03, // REPORT_COUNT (3)
0x75, 0x01, // REPORT_SIZE (1)
0x81, 0x02, // INPUT (Data, Variable, Absolute)
0x95, 0x01, // REPORT_COUNT (1)
0x75, 0x05, // REPORT_SIZE (5)
0x81, 0x01, // INPUT (Constant)
0x05, 0x01, // USAGE_PAGE (Generic Desktop)
0x09, 0x30, // USAGE (X)
0x09, 0x31, // USAGE (Y)
0x15, 0x81, // LOGICAL_MINIMUM (-127)
0x25, 0x7F, // LOGICAL_MAXIMUM (127)
0x75, 0x08, // REPORT_SIZE (8)
0x95, 0x02, // REPORT_COUNT (2)
0x81, 0x06, // Input (Data, Variable, Relative)
0xC0, // END_COLLECTION
0xC0, // END_COLLECTION
0x05, 0x01, // USAGE_PAGE (Generic Desktop)
0x09, 0x00, // USAGE (Undefined)
0xa1, 0x01, // COLLECTION (Application)
0x85, REPORT_ID_MOUSE_OUTPUT, // REPORT_ID (mouse output)
0x09, 0x00, // USAGE (Undefined)
0x15, 0x00, // LOGICAL_MINIMUM (0)
0x26, 0xff, 0x00, // LOGICAL_MAXIMUM (255)
0x95, 0x03, // REPORT_COUNT (3)
0x75, 0x08, // REPORT_SIZE (8)
0x91, 0x02, // OUTPUT (Data, Variable, Absolute)
0xc0, // END_COLLECTION
0x05, 0x01, // USAGE_PAGE (Generic Desktop)
0x09, 0x06, // USAGE (Keyboard)
0xA1, 0x01, // COLLECTION (Application)
0x85, REPORT_ID_KEYBOARD_INPUT, // REPORT_ID (keyboard input)
0x05, 0x07, // USAGE_PAGE (Keyboard Key Codes)
0x19, 0xE0, // USAGE_MINIMUM (224)
0x29, 0xE7, // USAGE_MAXIMUM (231)
0x15, 0x00, // LOGICAL_MINIMUM (0)
0x25, 0x01, // LOGICAL_MAXIMUM (1)
0x75, 0x01, // REPORT_SIZE (1)
0x95, 0x08, // REPORT_COUNT (8)
0x81, 0x02, // INPUT (Data, Variable, Absolute)
0x95, 0x01, // REPORT_COUNT (1)
0x75, 0x08, // REPORT_SIZE (8)
0x81, 0x01, // INPUT (Constant)
0x19, 0x00, // USAGE_MINIMUM (0)
0x29, 0x65, // USAGE_MAXIMUM (101)
0x15, 0x00, // LOGICAL_MINIMUM (0)
0x25, 0x65, // LOGICAL_MAXIMUM (101)
0x95, 0x06, // REPORT_COUNT (6)
0x75, 0x08, // REPORT_SIZE (8)
0x81, 0x00, // INPUT (Data, Array, Absolute)
0x05, 0x08, // USAGE_PAGE (LEDs)
0x19, 0x01, // USAGE_MINIMUM (Num Lock)
0x29, 0x05, // USAGE_MAXIMUM (Kana)
0x95, 0x05, // REPORT_COUNT (5)
0x75, 0x01, // REPORT_SIZE (1)
0x91, 0x02, // OUTPUT (Data, Variable, Absolute)
0x95, 0x01, // REPORT_COUNT (1)
0x75, 0x03, // REPORT_SIZE (3)
0x91, 0x01, // OUTPUT (Constant)
0xC0, // END_COLLECTION
0x05, 0x01, // USAGE_PAGE (Generic Desktop)
0x09, 0x00, // USAGE (Undefined)
0xa1, 0x01, // COLLECTION (Application)
0x85, REPORT_ID_KEYBOARD_OUTPUT, // REPORT_ID (keyboard output)
0x09, 0x00, // USAGE (Undefined)
0x15, 0x00, // LOGICAL_MINIMUM (0)
0x26, 0xff, 0x00, // LOGICAL_MAXIMUM (255)
0x95, 0x08, // REPORT_COUNT (8)
0x75, 0x08, // REPORT_SIZE (8)
0x91, 0x02, // OUTPUT (Data, Variable, Absolute)
0xc0 // END_COLLECTION
};
/*
 * HID class descriptor announcing exactly one class descriptor: the report
 * descriptor, whose length is taken from sizeof(g_reportDescriptor) so it
 * tracks any change to that table.
 */
HID_DESCRIPTOR g_hidDescriptor = {
0x09, // length of HID descriptor
0x21, // descriptor type == HID 0x21
0x0100, // hid spec release
0x00, // country code == Not Specified
0x01, // number of HID class descriptors
{ // DescriptorList[0]
0x22, // report descriptor type 0x22
sizeof(g_reportDescriptor) // total length of report descriptor
}
};
I was using a Hyper V virtual machine as a debug machine, and this is why it didn't work.
As soon as I used another computer, the keystrokes were sent.
If you're trying to emulate keystroke or mouse move, this is what I recommend:
Have a look at https://github.com/djpnewton/vmulti (integrate HidMapper to your driver)
Avoid using Hyper V for testing purposes
I'm trying to execute a rather trivial WebAssembly benchmark with Google's V8 engine (both in-browser using the current Version of Google Chrome (Version 83.0.4103.106, 64-bit) and via embedding V8 (Version 8.5.183) in a C++ program. All benchmarks are executed on macOS 10.14.6 with an Intel i7 8850H processor. No RAM swap has been used.
I am using the following C code as a benchmark. (Note that runtime is in the order of seconds on a current Intel Core i7)
/*
 * Fills the top-left n x n corner of `path` with the benchmark's start values.
 *
 * A cell whose index sum (row + col) is divisible by 13, 7 or 11 is set to
 * 999; every other cell is set to (row * col) % 7 + 1. Cells outside the
 * n x n corner are left untouched.
 */
static void init(int n, int path[1000][1000]) {
    for (int row = 0; row < n; ++row) {
        for (int col = 0; col < n; ++col) {
            const int index_sum = row + col;
            const int is_marked = index_sum % 13 == 0
                               || index_sum % 7 == 0
                               || index_sum % 11 == 0;

            path[row][col] = is_marked ? 999 : row * col % 7 + 1;
        }
    }
}
/*
 * Relaxes every pair (i, j) of the n x n corner of `path` through each
 * intermediate index k in turn: path[i][j] becomes the smaller of its current
 * value and path[i][k] + path[k][j]. Updates are done in place, with k as the
 * outermost loop.
 */
static void kernel(int n, int path[1000][1000]) {
    for (int mid = 0; mid < n; ++mid) {
        for (int src = 0; src < n; ++src) {
            for (int dst = 0; dst < n; ++dst) {
                /* Re-read both legs on every iteration: the in-place update
                   may have just changed path[src][mid]. */
                const int detour = path[src][mid] + path[mid][dst];

                if (!(path[src][dst] < detour)) {
                    path[src][dst] = detour;
                }
            }
        }
    }
}
/* File-scope 1000 x 1000 matrix that main() hands to init() and kernel(). */
int path[1000][1000];
/*
 * Benchmark entry point: fills the full 1000 x 1000 matrix with init() and
 * then runs kernel() over it once. Always returns 0.
 */
int main(void) {
int n = 1000;
init(n, path);
kernel(n, path);
return 0;
}
This can be easily executed via https://wasdk.github.io/WasmFiddle/. The corresponding JS code measuring time in the most basic way is the following:
var wasmModule = new WebAssembly.Module(wasmCode);
var wasmInstance = new WebAssembly.Instance(wasmModule, wasmImports);
var a = new Date();
wasmInstance.exports.main();
var b = new Date();
log(b-a);
The result I'm getting in browser (e.g. in WasmFiddle or on a custom website) in Google Chrome is the following (for multiple consecutive executions) in milliseconds:
3687
1757
1837
1753
1726
1731
1774
1741
1771
1727
3549
1742
1731
1847
1734
1745
3515
1731
1772
Note the outliers performing at half the speed of the rest. How and why are there outliers with still such consistent performance? As much care as possible has been taken to ensure that no other processes are using up CPU time.
For the embedded version, the monolithic V8 library has been built from source using the following build config:
is_component_build = false
is_debug = false
target_cpu = "x64"
use_custom_libcxx = false
v8_monolithic = true
v8_use_external_startup_data = false
v8_enable_pointer_compression = false
The C++ code embedding the V8 library and executing the Wasm script (The Wasm code is the exact code produced by the WasmFiddle compiler):
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "include/libplatform/libplatform.h"
#include "include/v8.h"
int main(int argc, char* argv[]) {
// Initialize V8.
v8::V8::InitializeICUDefaultLocation(argv[0]);
v8::V8::InitializeExternalStartupData(argv[0]);
std::unique_ptr<v8::Platform> platform = v8::platform::NewDefaultPlatform();
v8::V8::InitializePlatform(platform.get());
v8::V8::Initialize();
// Create a new Isolate and make it the current one.
v8::Isolate::CreateParams create_params;
create_params.array_buffer_allocator = v8::ArrayBuffer::Allocator::NewDefaultAllocator();
v8::Isolate* isolate = v8::Isolate::New(create_params);
{
v8::Isolate::Scope isolate_scope(isolate);
// Create a stack-allocated handle scope.
v8::HandleScope handle_scope(isolate);
// Create a new context.
v8::Local<v8::Context> context = v8::Context::New(isolate);
v8::Context::Scope context_scope(context);
{
const char csource[] = R"(
let bytes = new Uint8Array([
0x0, 0x61, 0x73, 0x6D, 0x01, 0x00, 0x00, 0x00, 0x01, 0x85, 0x80, 0x80, 0x80, 0x00, 0x01, 0x60,
0x00, 0x01, 0x7F, 0x03, 0x82, 0x80, 0x80, 0x80, 0x00, 0x01, 0x00, 0x04, 0x84, 0x80, 0x80, 0x80,
0x00, 0x01, 0x70, 0x00, 0x00, 0x05, 0x83, 0x80, 0x80, 0x80, 0x00, 0x01, 0x00, 0x3E, 0x06, 0x81,
0x80, 0x80, 0x80, 0x00, 0x00, 0x07, 0x91, 0x80, 0x80, 0x80, 0x00, 0x02, 0x06, 0x6D, 0x65, 0x6D,
0x6F, 0x72, 0x79, 0x02, 0x00, 0x04, 0x6D, 0x61, 0x69, 0x6E, 0x00, 0x00, 0x0A, 0x8F, 0x82, 0x80,
0x80, 0x00, 0x01, 0x89, 0x82, 0x80, 0x80, 0x00, 0x01, 0x08, 0x7F, 0x41, 0x00, 0x21, 0x02, 0x41,
0x10, 0x21, 0x05, 0x03, 0x40, 0x20, 0x05, 0x21, 0x07, 0x41, 0x00, 0x21, 0x04, 0x41, 0x00, 0x21,
0x03, 0x03, 0x40, 0x20, 0x07, 0x20, 0x04, 0x41, 0x07, 0x6F, 0x41, 0x01, 0x6A, 0x41, 0xE7, 0x07,
0x20, 0x02, 0x20, 0x03, 0x6A, 0x22, 0x00, 0x41, 0x07, 0x6F, 0x1B, 0x41, 0xE7, 0x07, 0x20, 0x00,
0x41, 0x0D, 0x6F, 0x1B, 0x41, 0xE7, 0x07, 0x20, 0x00, 0x41, 0x0B, 0x6F, 0x1B, 0x36, 0x02, 0x00,
0x20, 0x07, 0x41, 0x04, 0x6A, 0x21, 0x07, 0x20, 0x04, 0x20, 0x02, 0x6A, 0x21, 0x04, 0x20, 0x03,
0x41, 0x01, 0x6A, 0x22, 0x03, 0x41, 0xE8, 0x07, 0x47, 0x0D, 0x00, 0x0B, 0x20, 0x05, 0x41, 0xA0,
0x1F, 0x6A, 0x21, 0x05, 0x20, 0x02, 0x41, 0x01, 0x6A, 0x22, 0x02, 0x41, 0xE8, 0x07, 0x47, 0x0D,
0x00, 0x0B, 0x41, 0x00, 0x21, 0x06, 0x41, 0x10, 0x21, 0x05, 0x03, 0x40, 0x41, 0x10, 0x21, 0x00,
0x41, 0x00, 0x21, 0x01, 0x03, 0x40, 0x20, 0x01, 0x41, 0xA0, 0x1F, 0x6C, 0x20, 0x06, 0x41, 0x02,
0x74, 0x6A, 0x41, 0x10, 0x6A, 0x21, 0x02, 0x41, 0x00, 0x21, 0x07, 0x03, 0x40, 0x20, 0x00, 0x20,
0x07, 0x6A, 0x22, 0x04, 0x20, 0x04, 0x28, 0x02, 0x00, 0x22, 0x04, 0x20, 0x05, 0x20, 0x07, 0x6A,
0x28, 0x02, 0x00, 0x20, 0x02, 0x28, 0x02, 0x00, 0x6A, 0x22, 0x03, 0x20, 0x04, 0x20, 0x03, 0x48,
0x1B, 0x36, 0x02, 0x00, 0x20, 0x07, 0x41, 0x04, 0x6A, 0x22, 0x07, 0x41, 0xA0, 0x1F, 0x47, 0x0D,
0x00, 0x0B, 0x20, 0x00, 0x41, 0xA0, 0x1F, 0x6A, 0x21, 0x00, 0x20, 0x01, 0x41, 0x01, 0x6A, 0x22,
0x01, 0x41, 0xE8, 0x07, 0x47, 0x0D, 0x00, 0x0B, 0x20, 0x05, 0x41, 0xA0, 0x1F, 0x6A, 0x21, 0x05,
0x20, 0x06, 0x41, 0x01, 0x6A, 0x22, 0x06, 0x41, 0xE8, 0x07, 0x47, 0x0D, 0x00, 0x0B, 0x41, 0x00,
0x0B
]);
let module = new WebAssembly.Module(bytes);
let instance = new WebAssembly.Instance(module);
instance.exports.main();
)";
// Create a string containing the JavaScript source code.
v8::Local<v8::String> source = v8::String::NewFromUtf8Literal(isolate, csource);
// Compile the source code.
v8::Local<v8::Script> script = v8::Script::Compile(context, source).ToLocalChecked();
// Run the script to get the result.
v8::Local<v8::Value> result = script->Run(context).ToLocalChecked();
}
}
// Dispose the isolate and tear down V8.
isolate->Dispose();
v8::V8::Dispose();
v8::V8::ShutdownPlatform();
delete create_params.array_buffer_allocator;
return 0;
}
I compile it as follows:
g++ -I. -O2 -Iinclude samples/wasm.cc -o wasm -lv8_monolith -Lout.gn/x64.release.sample/obj/ -pthread -std=c++17
On execution with time ./wasm, I get execution times between 4.9s and 5.1s - almost triple that of in-Chrome/WasmFiddle execution! Did I miss anything? Maybe some optimization switches? This result is perfectly reproducible and I have even tested various different versions of the V8 library - still the same result.
Ah, the joys of microbenchmarking :-)
V8 has two compilers for Wasm: a non-optimizing baseline compiler that produces code really fast, and an optimizing compiler that takes quite a bit longer to produce code, but that code is typically about twice as fast. When a module is loaded, current versions first compile all functions with the baseline compiler. Once that's done, execution can start, and optimized compilation jobs are scheduled to run in the background. When an optimized compilation job is complete, the respective function's code is swapped, and the next invocation of the function will use it. (The details here will very likely change in the future, but the general principle will remain.) That way, typical applications get both good startup latency, and good peak performance.
But, as with any heuristic or strategy, you can craft a case where it gets it wrong...
In your benchmark, each function is called only once. In the fast cases, optimizing kernel finishes before init returns. In the slow cases, kernel is called before its optimized compilation job is done, so its baseline version runs. Apparently when embedding V8 directly, you reliably get the latter scenario, whereas when running via WasmFiddle in Chrome, you get the former most of the time, but not always.
I can't explain why your custom embedding runs are even slower than the slow case in Chrome; I'm not seeing that on my machine (OTOH, in Chrome, I'm seeing an even bigger delta: about 1100ms for a fast run and 4400ms for a slow run); however I used the d8 shell instead of compiling my own embedding. One thing that's different is that when measuring with time on the command line, you include process startup and initialization, which the Date.now() calls around main() don't include. But that should only account for 10-50 milliseconds or so, not for a 3.6s → 5.0s difference.
While this situation might look quite unfortunate for your microbenchmark, it is generally working as intended, i.e. not a bug, and hence unlikely to change on V8's side. There are several things you can do to make the benchmark more reflective of real-world behavior (assuming this one doesn't exactly represent some real application you have):
execute functions multiple times; you'll see that the first run will be slower (or, depending on function size and module size and number of available CPU cores and scheduling luck, the first few runs)
wait a bit before calling the hottest functions, e.g. by doing
var wasmModule = new WebAssembly.Module(wasmCode);
var wasmInstance = new WebAssembly.Instance(wasmModule, wasmImports);
window.setTimeout(() => {
var a = Date.now();
wasmInstance.exports.main();
var b = Date.now();
log(b-a);
}, 10);
In my tests with d8 I've found that even a silly busy-wait did the trick:
let wait = Date.now() + 10;
while (Date.now() < wait) {}
instance.exports.main();
generally make the benchmark bigger and more complex: have and execute more different functions, don't just spend 99% of the time in a single line.
(FWIW, the earliest V8 versions that supported WebAssembly had no tiering, only optimized compilation. So modules always had to wait for that to finish. It was not a good user experience; for large modules the wait time could be tens of seconds. Having a baseline compiler is quite clearly the better solution overall, even if it comes at the cost of not having maximum performance available immediately. Looking good on artificial one-liners is not what matters in practice; providing a good user experience for large real-world applications matters.)
So I'm trying to write some inline assembly that would load variables into SSE2 registers.
However the inline assembly isn't going that well.
I'm hitting a detour while compiling in GCC.
g++-4.8.2 -g -maes -msse4 aesni.c -o aesni
aesni.c:101:junk `ptr[%rax]' after expression
aesni.c:101:32-bit absolute addressing is not supported for x86-64
aesni.c:101:cannot do signed 4 byte relocation
aesni.c:102:junk `ptr[%rdx]' after expression
aesni.c:102:32-bit absolute addressing is not supported for x86-64
aesni.c:102:cannot do signed 4 byte relocation
Code:
#include <string.h> // string
using namespace std;
/*
 * Zeroes all XMM registers, selects an AES key by round count and loads its
 * first 16 bytes into xmm0 and the following key material into xmm1.
 *
 * Fixes relative to the previous version:
 *   - The load instructions used Intel-style "qword ptr [reg]" operands inside
 *     an AT&T-syntax asm string, which the assembler rejects; they now use the
 *     AT&T memory operand form "(reg)".
 *   - KeyRound1 was left uninitialized for the 128-bit key yet still passed to
 *     the asm statement; it now always points at a zero-filled 16-byte buffer.
 *   - Both asm statements declare the registers they overwrite, and the load
 *     declares a "memory" clobber because it reads the key buffers through
 *     pointers held in registers.
 *   - The switch has a default branch for unsupported round counts.
 *   - The alignment attribute was applied to the pointer variables (aligning
 *     the pointers themselves, not what they point to); it is kept only on the
 *     buffers that MOVDQA actually reads.
 *
 * Note: the compiler gives no guarantee that XMM register contents survive
 * between separate asm statements; this only demonstrates the load itself.
 *
 * Returns 0 on success, 1 if the round count is not 10, 12 or 14.
 */
int main (int argc, const char *argv[])
{
    (void)argc;
    (void)argv;

    // Nr = 10 (128bit), 12 (192bit), 14 (256bit)
    __attribute__((aligned (16))) unsigned char Key128bit[] = { 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c, 0x00 };
    __attribute__((aligned (16))) unsigned char Key192bit[] = { 0x8e, 0x73, 0xb0, 0xf7, 0xda, 0x0e, 0x64, 0x52, 0xc8, 0x10, 0xf3, 0x2b, 0x80, 0x90, 0x79, 0xe5,
                                                                0x62, 0xf8, 0xea, 0xd2, 0x52, 0x2c, 0x6b, 0x7b, 0x00 };
    __attribute__((aligned (16))) unsigned char Key256bit[] = { 0x60, 0x3d, 0xeb, 0x10, 0x15, 0xca, 0x71, 0xbe, 0x2b, 0x73, 0xae, 0xf0, 0x85, 0x7d, 0x77, 0x81,
                                                                0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7, 0x2d, 0x98, 0x10, 0xa3, 0x09, 0x14, 0xdf, 0xf4, 0x00 };
    char Nr = 10; // 128bit / 12 for 192bit / 14 for 256bit

    /* zero out XMM registers */
    __asm__ volatile ("pxor %%xmm0, %%xmm0\n\t"
                      "pxor %%xmm1, %%xmm1\n\t"
                      "pxor %%xmm2, %%xmm2\n\t"
                      "pxor %%xmm3, %%xmm3\n\t"
                      "pxor %%xmm4, %%xmm4\n\t"
                      "pxor %%xmm5, %%xmm5\n\t"
                      "pxor %%xmm6, %%xmm6\n\t"
                      "pxor %%xmm7, %%xmm7\n\t"
                      "pxor %%xmm8, %%xmm8\n\t"
                      "pxor %%xmm9, %%xmm9\n\t"
                      "pxor %%xmm10, %%xmm10\n\t"
                      "pxor %%xmm11, %%xmm11\n\t"
                      "pxor %%xmm12, %%xmm12\n\t"
                      "pxor %%xmm13, %%xmm13\n\t"
                      "pxor %%xmm14, %%xmm14\n\t"
                      "pxor %%xmm15, %%xmm15\n\t"
                      : /* output operands */
                      : /* input operands */
                      : /* list of clobbered registers */
                        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
                        "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15");

    /* Second 16-byte block; stays all-zero when the key has no second block. */
    __attribute__((aligned (16))) unsigned char KeyRound1Temp[16] = {0};
    unsigned char *KeyRound0 = Key128bit;
    unsigned char *KeyRound1 = KeyRound1Temp;

    switch (Nr + 1)
    {
    case 11:
        KeyRound0 = Key128bit;
        break;
    case 13:
        KeyRound0 = Key192bit;
        /* Only 8 key bytes remain; the upper 8 bytes stay zero. */
        memcpy(KeyRound1Temp, Key192bit + 16, 8);
        break;
    case 15:
        KeyRound0 = Key256bit;
        memcpy(KeyRound1Temp, Key256bit + 16, 16);
        break;
    default:
        return 1;
    }

    /*
     * AT&T syntax: source first, destination second; "(reg)" dereferences.
     * NOTE(review): the original text named xmm0 as the register in both
     * instructions; xmm1 is assumed for the second block - confirm.
     */
    __asm__ volatile ("movdqa (%0), %%xmm0\n\t"
                      "movdqa (%1), %%xmm1\n\t"
                      : /* output operands */
                      : /* input operands */ "r" (KeyRound0), "r" (KeyRound1)
                      : /* list of clobbered registers */ "xmm0", "xmm1", "memory");

    return 0;
}