Efficient sse shuffle mask generation for left-packing byte elements

Efficient sse shuffle mask generation for left-packing byte elements - performance

What would be an efficient way to optimize the following code with sse ?
uint16_t change1= ... ;
uint8_t* pSrc = ... ;
uint8_t* pDest = ... ;
if(change1 & 0x0001) *pDest++ = pSrc[0];
if(change1 & 0x0002) *pDest++ = pSrc[1];
if(change1 & 0x0004) *pDest++ = pSrc[2];
if(change1 & 0x0008) *pDest++ = pSrc[3];
if(change1 & 0x0010) *pDest++ = pSrc[4];
if(change1 & 0x0020) *pDest++ = pSrc[5];
if(change1 & 0x0040) *pDest++ = pSrc[6];
if(change1 & 0x0080) *pDest++ = pSrc[7];
if(change1 & 0x0100) *pDest++ = pSrc[8];
if(change1 & 0x0200) *pDest++ = pSrc[9];
if(change1 & 0x0400) *pDest++ = pSrc[10];
if(change1 & 0x0800) *pDest++ = pSrc[11];
if(change1 & 0x1000) *pDest++ = pSrc[12];
if(change1 & 0x2000) *pDest++ = pSrc[13];
if(change1 & 0x4000) *pDest++ = pSrc[14];
if(change1 & 0x8000) *pDest++ = pSrc[15];
So far I am using a quite big lookup table for it, but I really want to get rid of it:
SSE3Shuffle::Entry& e0 = SSE3Shuffle::g_Shuffle.m_Entries[change1];
_mm_storeu_si128((__m128i*)pDest, _mm_shuffle_epi8(*(__m128i*)pSrc, e0.mask));
pDest += e0.offset;

Assuming:
change1 = _mm_movemask_epi8(bytemask);
offset = popcnt(change1);
On large buffers, using two shuffles and a 1 KiB table is only ~10% slower than using 1 shuffle and a 1MiB table. My attempts at generating the shuffle mask via prefix sums and bit twiddling are about about half the speed of the table based methods
(solutions using pext/pdep were not explored).
Reducing table size: Use two lookups into a 2 KiB table instead of 1 lookup into a 1 MiB table. Always keep the top-most byte - if that byte is to be discarded then it doesn't matter what byte is at that position (down to 7-bit indices, or 1 KiB table). Further reduce possible combinations by manually packing the two bytes in each 16-bit lane (down to a 216 byte table).
The following example strips whitespace from text using SSE4.1. If only SSSE3 is available then blendv can be emulated. The 64-bit halves are re-combined by overlapping writes to memory, but they could be re-combined in the xmm register (as seen in the AVX2 example).
#include <stdint.h>
#include <smmintrin.h> // SSE4.1
size_t despacer (void* dst_void, void* src_void, size_t length)
{
uint8_t* src = (uint8_t*)src_void;
uint8_t* dst = (uint8_t*)dst_void;
if (length >= 16) {
// table of control characters (space, tab, newline, carriage return)
const __m128i lut_cntrl = _mm_setr_epi8(' ', 0, 0, 0, 0, 0, 0, 0, 0, '\t', '\n', 0, 0, '\r', 0, 0);
// bits[4:0] = index -> ((trit_d * 0) + (trit_c * 9) + (trit_b * 3) + (trit_a * 1))
// bits[15:7] = popcnt
const __m128i sadmask = _mm_set1_epi64x(0x8080898983838181);
// adding 8 to each shuffle index is cheaper than extracting the high qword
const __m128i offset = _mm_cvtsi64_si128(0x0808080808080808);
// shuffle control indices
static const uint64_t table[27] = {
0x0000000000000706, 0x0000000000070600, 0x0000000007060100, 0x0000000000070602,
0x0000000007060200, 0x0000000706020100, 0x0000000007060302, 0x0000000706030200,
0x0000070603020100, 0x0000000000070604, 0x0000000007060400, 0x0000000706040100,
0x0000000007060402, 0x0000000706040200, 0x0000070604020100, 0x0000000706040302,
0x0000070604030200, 0x0007060403020100, 0x0000000007060504, 0x0000000706050400,
0x0000070605040100, 0x0000000706050402, 0x0000070605040200, 0x0007060504020100,
0x0000070605040302, 0x0007060504030200, 0x0706050403020100
};
const uint8_t* end = &src[length & ~15];
do {
__m128i v = _mm_loadu_si128((__m128i*)src);
src += 16;
// detect spaces
__m128i mask = _mm_cmpeq_epi8(_mm_shuffle_epi8(lut_cntrl, v), v);
// shift w/blend: each word now only has 3 states instead of 4
// which reduces the possiblities per qword from 128 to 27
v = _mm_blendv_epi8(v, _mm_srli_epi16(v, 8), mask);
// extract bitfields describing each qword: index, popcnt
__m128i desc = _mm_sad_epu8(_mm_and_si128(mask, sadmask), sadmask);
size_t lo_desc = (size_t)_mm_cvtsi128_si32(desc);
size_t hi_desc = (size_t)_mm_extract_epi16(desc, 4);
// load shuffle control indices from pre-computed table
__m128i lo_shuf = _mm_loadl_epi64((__m128i*)&table[lo_desc & 0x1F]);
__m128i hi_shuf = _mm_or_si128(_mm_loadl_epi64((__m128i*)&table[hi_desc & 0x1F]), offset);
// store an entire qword then advance the pointer by how ever
// many of those bytes are actually wanted. Any trailing
// garbage will be overwritten by the next store.
// note: little endian byte memory order
_mm_storel_epi64((__m128i*)dst, _mm_shuffle_epi8(v, lo_shuf));
dst += (lo_desc >> 7);
_mm_storel_epi64((__m128i*)dst, _mm_shuffle_epi8(v, hi_shuf));
dst += (hi_desc >> 7);
} while (src != end);
}
// tail loop
length &= 15;
if (length != 0) {
const uint64_t bitmap = 0xFFFFFFFEFFFFC1FF;
do {
uint64_t c = *src++;
*dst = (uint8_t)c;
dst += ((bitmap >> c) & 1) | ((c + 0xC0) >> 8);
} while (--length);
}
// return pointer to the location after the last element in dst
return (size_t)(dst - ((uint8_t*)dst_void));
}
Whether the tail loop should be vectorized or use cmov is left as an exercise for the reader. Writing each byte unconditionally/branchlessly is fast when the input is unpredictable.
Using AVX2 to generate the shuffle control mask using an in-register table is only slightly slower than using large precomputed tables.
#include <stdint.h>
#include <immintrin.h>
// probably needs improvment...
size_t despace_avx2_vpermd(const char* src_void, char* dst_void, size_t length)
{
uint8_t* src = (uint8_t*)src_void;
uint8_t* dst = (uint8_t*)dst_void;
const __m256i lut_cntrl2 = _mm256_broadcastsi128_si256(_mm_setr_epi8(' ', 0, 0, 0, 0, 0, 0, 0, 0, '\t', '\n', 0, 0, '\r', 0, 0));
const __m256i permutation_mask = _mm256_set1_epi64x( 0x0020100884828180 );
const __m256i invert_mask = _mm256_set1_epi64x( 0x0020100880808080 );
const __m256i zero = _mm256_setzero_si256();
const __m256i fixup = _mm256_set_epi32(
0x08080808, 0x0F0F0F0F, 0x00000000, 0x07070707,
0x08080808, 0x0F0F0F0F, 0x00000000, 0x07070707
);
const __m256i lut = _mm256_set_epi32(
0x04050607, // 0x03020100', 0x000000'07
0x04050704, // 0x030200'00, 0x0000'0704
0x04060705, // 0x030100'00, 0x0000'0705
0x04070504, // 0x0300'0000, 0x00'070504
0x05060706, // 0x020100'00, 0x0000'0706
0x05070604, // 0x0200'0000, 0x00'070604
0x06070605, // 0x0100'0000, 0x00'070605
0x07060504 // 0x00'000000, 0x'07060504
);
// hi bits are ignored by pshufb, used to reject movement of low qword bytes
const __m256i shuffle_a = _mm256_set_epi8(
0x7F, 0x7E, 0x7D, 0x7C, 0x7B, 0x7A, 0x79, 0x78, 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70,
0x7F, 0x7E, 0x7D, 0x7C, 0x7B, 0x7A, 0x79, 0x78, 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70
);
// broadcast 0x08 then blendd...
const __m256i shuffle_b = _mm256_set_epi32(
0x08080808, 0x08080808, 0x00000000, 0x00000000,
0x08080808, 0x08080808, 0x00000000, 0x00000000
);
for( uint8_t* end = &src[(length & ~31)]; src != end; src += 32){
__m256i r0,r1,r2,r3,r4;
unsigned int s0,s1;
r0 = _mm256_loadu_si256((__m256i *)src); // asrc
// detect spaces
r1 = _mm256_cmpeq_epi8(_mm256_shuffle_epi8(lut_cntrl2, r0), r0);
r2 = _mm256_sad_epu8(zero, r1);
s0 = (unsigned)_mm256_movemask_epi8(r1);
r1 = _mm256_andnot_si256(r1, permutation_mask);
r1 = _mm256_sad_epu8(r1, invert_mask); // index_bitmap[0:5], low32_spaces_count[7:15]
r2 = _mm256_shuffle_epi8(r2, zero);
r2 = _mm256_sub_epi8(shuffle_a, r2); // add space cnt of low qword
s0 = ~s0;
r3 = _mm256_slli_epi64(r1, 29); // move top part of index_bitmap to high dword
r4 = _mm256_srli_epi64(r1, 7); // number of spaces in low dword
r4 = _mm256_shuffle_epi8(r4, shuffle_b);
r1 = _mm256_or_si256(r1, r3);
r1 = _mm256_permutevar8x32_epi32(lut, r1);
s1 = _mm_popcnt_u32(s0);
r4 = _mm256_add_epi8(r4, shuffle_a);
s0 = s0 & 0xFFFF; // isolate low oword
r2 = _mm256_shuffle_epi8(r4, r2);
s0 = _mm_popcnt_u32(s0);
r2 = _mm256_max_epu8(r2, r4); // pin low qword bytes
r1 = _mm256_xor_si256(r1, fixup);
r1 = _mm256_shuffle_epi8(r1, r2); // complete shuffle mask
r0 = _mm256_shuffle_epi8(r0, r1); // despace!
_mm_storeu_si128((__m128i*)dst, _mm256_castsi256_si128(r0));
_mm_storeu_si128((__m128i*)&dst[s0], _mm256_extracti128_si256(r0,1));
dst += s1;
}
// tail loop
length &= 31;
if (length != 0) {
const uint64_t bitmap = 0xFFFFFFFEFFFFC1FF;
do {
uint64_t c = *src++;
*dst = (uint8_t)c;
dst += ((bitmap >> c) & 1) | ((c + 0xC0) >> 8);
} while (--length);
}
return (size_t)(dst - ((uint8_t*)dst_void));
}
For posterity, the 1 KiB version (generating the table is left as an exercise for the reader).
static const uint64_t table[128] __attribute__((aligned(64))) = {
0x0706050403020100, 0x0007060504030201, ..., 0x0605040302010700, 0x0605040302010007
};
const __m128i mask_01 = _mm_set1_epi8( 0x01 );
__m128i vector0 = _mm_loadu_si128((__m128i*)src);
__m128i vector1 = _mm_shuffle_epi32( vector0, 0x0E );
__m128i bytemask0 = _mm_cmpeq_epi8( ???, vector0); // detect bytes to omit
uint32_t bitmask0 = _mm_movemask_epi8(bytemask0) & 0x7F7F;
__m128i hsum = _mm_sad_epu8(_mm_add_epi8(bytemask0, mask_01), _mm_setzero_si128());
vector0 = _mm_shuffle_epi8(vector0, _mm_loadl_epi64((__m128i*) &table[(uint8_t)bitmask0]));
_mm_storel_epi64((__m128i*)dst, vector0);
dst += (uint32_t)_mm_cvtsi128_si32(hsum);
vector1 = _mm_shuffle_epi8(vector1, _mm_loadl_epi64((__m128i*) &table[bitmask0 >> 8]));
_mm_storel_epi64((__m128i*)dst, vector1);
dst += (uint32_t)_mm_cvtsi128_si32(_mm_unpackhi_epi64(hsum, hsum));
https://github.com/InstLatx64/AVX512_VPCOMPRESSB_Emu has some benchmarks.

If one is willing to use BMI2 available on haswell and later, one can use pdep to first compress unwanted nibbles out from uint64_t, and then use pext to scatter the result to shuffle mask.
// Step 1 -- replicate mask to nibbles
uint64_t change4 = pdep(change1, 0x1111111111111111ULL) * 0x0F;
// Step 2 -- extract index from array of nibbles
uint64_t indices = pext(0xfedcba09876543210, change4);
// Step 3 -- interleave nibbles to octects
uint64_t high = pdep(indices >> 32ULL,0x0F0F0F0F0F0F0F0F);
uint64_t low = pdep(indices, 0x0F0F0F0F0F0F0F0FULL);
// Step 4 -- use these two masks to compress pSrc
__m128i compressed = _mm_shuffle_epi8(pSrc, _mm_set_epi64(high, low));
// Step 5 -- store 16 bytes unaligned
_mm_storeu_si128(pDst, compressed);
// Step 6 -- increment target pointer
pDst += __mm_popcnt(change1);
Also other variants (based on cumulative sum or sorting the 'X's (or zero bits) out from XX23456789abXXef will first require some technique to spread the bits from uint16_t evenly to __m128i (i.e. reverse of movemask_epi8).
The 64k entry LUT can however be split to top and bottom parts:
int c = change1 & 0xff;
int p = __popcount(c);
uint64_t a = LUT256[c]; // low part of index
uint64_t b = LUT256[change1 >> 8]; // top part of index
b += addlut9[p]; // 0x0101010101010101 * p
// Then must concatenate b|a at pth position of 'a'
if (p < 8)
{
a |= b << (8*(8-p));
b >>= 8*p;
}
__m128i d = _mm_shuffle_epi8(_mm_loadu_si128(pSrc),_mm_set1_epi64(b,a));
// and continue with steps 5 and 6 as before

Related

Analog measurement incorrect on Teensy 2.0++

I have a Joystick wired up to my Teensy 2.0++ and I want to read the analog values from it.
I took this implementation from PJRC:
static uint8_t aref = (1<<REFS0); // default to AREF = Vcc, this is a 5V Vcc Teensy
void analogReference(uint8_t mode)
{
aref = mode & 0xC0;
}
// Mux input
int16_t adc_read(uint8_t mux)
{
#if defined(__AVR_AT90USB162__)
return 0;
#else
uint8_t low;
ADCSRA = (1<<ADEN) | ADC_PRESCALER; // enable ADC
ADCSRB = (1<<ADHSM) | (mux & 0x20); // high speed mode
ADMUX = aref | (mux & 0x1F); // configure mux input
ADCSRA = (1<<ADEN) | ADC_PRESCALER | (1<<ADSC); // start the conversion
while (ADCSRA & (1<<ADSC)) ; // wait for result
low = ADCL; // must read LSB first
return (ADCH << 8) | low; // must read MSB only once!
#endif
}
// Arduino compatible pin input
int16_t analogRead(uint8_t pin)
{
#if defined(__AVR_ATmega32U4__)
static const uint8_t PROGMEM pin_to_mux[] = {
0x00, 0x01, 0x04, 0x05, 0x06, 0x07,
0x25, 0x24, 0x23, 0x22, 0x21, 0x20};
if (pin >= 12) return 0;
return adc_read(pgm_read_byte(pin_to_mux + pin));
#elif defined(__AVR_AT90USB646__) || defined(__AVR_AT90USB1286__)
if (pin >= 8) return 0;
return adc_read(pin);
#else
return 0;
#endif
}
I have my X and Y pins wired up to F1 and F0, and I want to retrieve values with the following code:
long map(long x, long in_min, long in_max, long out_min, long out_max) // map method shamelessy ripped from Arduino
{
return (x - in_min) * (out_max - out_min) / (in_max - in_min) + out_min;
}
joy_ly = map(analogRead(0), 0, 65535, 0, 255);
joy_lx = map(analogRead(1), 0, 65535, 0, 255);
I measured my Joystick with a multimeter and it works perfectly (around 2.43V on center, 0V on min, and 5V on max), but the center value always ends up being very close to zero.
Is there anything I'm doing wrong?
NOTE: This is an at90usb1286 chip.

The ADC max value is 1024, not 65535.

Simple OpenCL program not working

This program is a simple parallel program which adds the elements of 2 vectors.
The program was error free and it was compiled successfully but the results are not right
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <iomanip>
#include <array>
#include <fstream>
#include <sstream>
#include <string>
#include <algorithm>
#include <iterator>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#include <time.h>
#endif
#define MAX_SOURCE_SIZE (0x100000)
// number of points in Both A and B files (number of rows)
const int number_of_points = 11;
// number of points axis in Both A and B files (number of Columns)
const int number_of_axis = 3;
using namespace std;
int main(int argc, char *argv[]) {
clock_t tStart = clock();
// Create the two input vectors
// working variables
int i;
ifstream input_fileA, input_fileB; // input files
string line; // transfer row from file to array
float x; // transfer word from file to array
int row = 0; // number of rows of file A,B (= array)
int col = 0; // number of rows of file A,B (= array)
// working arrays
// array contains file A data
float arrayA[number_of_points][number_of_axis]={{0}};
// array contains file B data
float arrayB[number_of_points][number_of_axis]={{0}};
// float X1[number_of_points]; // X values of file A points
float Y1[number_of_points]; // Y values of file A points
// float X2[number_of_points]; // X values of file B points
float Y2[number_of_points]; // Y values of file B points
float *X1 = (float*)malloc(sizeof(float)*number_of_points);
float *X2 = (float*)malloc(sizeof(float)*number_of_points);
// import input files
input_fileA.open(argv[1]);
input_fileB.open(argv[2]);
// transfer input files data to array
// input file A to arrayA
row = 0;
while (getline(input_fileA, line))
{
istringstream streamA(line);
col = 0;
while(streamA >> x){
arrayA[row][col] = x;
col++;
}
row++;
}
// input file B to arrayB
row = 0;
while (getline(input_fileB, line))
{
istringstream streamB(line);
col = 0;
while(streamB >> x){
arrayB[row][col] = x;
col++;
}
row++;
}
// put Xs of points in X vectors and Ys of points in Y vectors
// input file A
for (int i = 0; i<number_of_points; i++){
X1[i] = arrayA[i][1];
Y1[i] = arrayA[i][2];
}
// input file B
for (int i = 0; i<number_of_points; i++){
X2[i] = arrayB[i][1];
Y2[i] = arrayB[i][2];
}
// int i;
// const int LIST_SIZE = 50;
// int *A = (int*)malloc(sizeof(int)*number_of_points);
// int *B = (int*)malloc(sizeof(int)*number_of_points);
// for(i = 0; i < number_of_points; i++) {
// A[i] = X1[i];
// B[i] = X2[i];
// }
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("vector_add_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_ALL, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context =
clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue =
clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem x1_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
number_of_points * sizeof(float), NULL, &ret);
cl_mem x2_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
number_of_points * sizeof(float), NULL, &ret);
cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
number_of_points * sizeof(float), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, x1_mem_obj, CL_TRUE, 0,
number_of_points * sizeof(float), X1, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, x2_mem_obj, CL_TRUE, 0,
number_of_points * sizeof(float), X2, 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&x1_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&x2_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
// Execute the OpenCL kernel on the list
size_t global_item_size = number_of_points; // Process the entire lists
size_t local_item_size = 64; // Process in groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer C on the device to the local variable C
// int *C = (int*)malloc(sizeof(int)*number_of_points);
float *C = (float*)malloc(sizeof(float)*number_of_points);
ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
number_of_points * sizeof(float), C, 0, NULL, NULL);
// Display the result to the screen
for(i = 0; i < number_of_points; i++)
printf("%f + %f = %f\n", X1[i], X2[i], C[i]);
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(x1_mem_obj);
ret = clReleaseMemObject(x2_mem_obj);
ret = clReleaseMemObject(c_mem_obj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(X1);
free(X2);
free(C);
printf("ALL Time taken: %.2fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);
return 0;
}
and the kernel file
__kernel void vector_add(__global float *X1,
__global float *X2,
__global float *C) {
// Get the index of the current element
int i = get_global_id(0);
// Do the operation
C[i] = X1[i] + X2[i];
}
The result was
0.000000 + 0.000000 = 0.000000
1.000000 + 1.000000 = 0.000000
2.000000 + 2.000000 = 0.000000
3.000000 + 3.000000 = 0.000000
4.000000 + 4.000000 = 0.000000
5.000000 + 5.000000 = 0.000000
6.000000 + 6.000000 = 0.000000
7.000000 + 7.000000 = 0.000000
8.000000 + 8.000000 = 0.000000
9.000000 + 9.000000 = 0.000000
1.000000 + 1.000000 = 0.000000
ALL Time taken: 0.07s

You've committed one of the cardinal sins of OpenCL programming, in that you are not checking the error codes from any of your OpenCL API calls! You should always check the return code from every single OpenCL API call. If you did this, it would point you towards the problem very quickly.
The problem is in your kernel enqueue call. If you check the error code, you'll see that you are getting -54 back, which corresponds to CL_INVALID_WORK_GROUP_SIZE. Specifically, kernel invocations have the requirement that the work-group size (local size) exactly divides the global size. You are asking for a work-group size of 64 and a global size of 11, which does not fulfil this requirement.
You can also pass NULL as the work-group size parameter, and the OpenCL implementation will pick a work-group size that will definitely work on your behalf.

Optimal uint8_t bitmap into a 8 x 32bit SIMD "bool" vector

As part of a compression algorithm, I am looking for the optimal way to achieve the following:
I have a simple bitmap in a uint8_t. For example 01010011
What I want is a __m256i of the form: (0, maxint, 0, maxint, 0, 0, maxint, maxint)
One way to achieve this is by shuffling a vector of 8 x maxint into a vector of zeros. But that first requires me to expand my uint8_t to the right shuffle bitmap.
I am wondering if there is a better way?

I think I'd probably go for the "brute force and ignorance" approach initially, maybe something like this:
uint8_t u = 0x53; // 01010011
const union {
uint32_t a[4];
__m128i v;
} kLUT[16] = { { { 0, 0, 0, 0 } },
{ { -1, 0, 0, 0 } },
{ { 0, -1, 0, 0 } },
{ { -1, -1, 0, 0 } },
{ { 0, 0, -1, 0 } },
{ { -1, 0, -1, 0 } },
{ { 0, -1, -1, 0 } },
{ { -1, -1, -1, 0 } },
{ { 0, 0, 0, -1 } },
{ { -1, 0, 0, -1 } },
{ { 0, -1, 0, -1 } },
{ { -1, -1, 0, -1 } },
{ { 0, 0, -1, -1 } },
{ { -1, 0, -1, -1 } },
{ { 0, -1, -1, -1 } },
{ { -1, -1, -1, -1 } } };
__m256i v = _mm256_set_m128i(kLUT[u >> 4].v, kLUT[u & 15].v);
Using clang -O3 this compiles to:
movl %ebx, %eax ;; eax = ebx = u
andl $15, %eax ;; get low offset = (u & 15) * 16
shlq $4, %rax
leaq _main.kLUT(%rip), %rcx ;; rcx = kLUT
vmovaps (%rax,%rcx), %xmm0 ;; load low half of ymm0 from kLUT
andl $240, %ebx ;; get high offset = (u >> 4) * 16
vinsertf128 $1, (%rbx,%rcx), %ymm0, %ymm0
;; load high half of ymm0 from kLUT
FWIW I threw together a simple test harness for three implementations: (i) a simple scalar code reference implementation, (ii) the above code, (iii) an implementation based on #Zboson's answer, (iv) a slightly improved version of (iii) and (v) a further improvement on (iv) using a suggestion from #MarcGlisse. I got the following results with a 2.6GHz Haswell CPU (compiled with clang -O3):
scalar code: 7.55336 ns / vector
Paul R: 1.36016 ns / vector
Z boson: 1.24863 ns / vector
Z boson (improved): 1.07590 ns / vector
Z boson (improved + #MarcGlisse suggestion): 1.08195 ns / vector
So #Zboson's solution(s) win, by around 10% - 20%, presumably because they need only 1 load, versus 2 for mine.
If we get any other implementations I'll add these to the test harness and update the results.
Slightly improved version of #Zboson's implementation:
__m256i v = _mm256_set1_epi8(u);
v = _mm256_and_si256(v, mask);
v = _mm256_xor_si256(v, mask);
return _mm256_cmpeq_epi32(v, _mm256_setzero_si256());
Further improved version of #Zboson's implementation incorporating suggestion from #MarcGlisse:
__m256i v = _mm256_set1_epi8(u);
v = _mm256_and_si256(v, mask);
return _mm256_cmpeq_epi32(v, mask);
(Note that mask needs to contain replicated 8 bit values in each 32 bit element, i.e. 0x01010101, 0x02020202, ..., 0x80808080)

Here is a solution (PaulR improved my solution, see the end of my answer or his answer) based on a variation of this question fastest-way-to-broadcast-32-bits-in-32-bytes.
__m256i t1 = _mm256_set1_epi8(x);
__m256i t2 = _mm256_and_si256(t1, mask);
__m256i t4 = _mm256_cmpeq_epi32(t2, _mm256_setzero_si256());
t4 = _mm256_xor_si256(t4, _mm256_set1_epi32(-1));
I don't have AVX2 hardware to test this on right now but here is a SSE2 version showing that it works which also shows how to define the mask.
#include <x86intrin.h>
#include <stdint.h>
#include <stdio.h>
int main(void) {
char mask[32] = {
0x01, 0x00, 0x00, 0x00,
0x02, 0x00, 0x00, 0x00,
0x04, 0x00, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00,
0x10, 0x00, 0x00, 0x00,
0x20, 0x00, 0x00, 0x00,
0x40, 0x00, 0x00, 0x00,
0x80, 0x00, 0x00, 0x00,
};
__m128i mask1 = _mm_loadu_si128((__m128i*)&mask[ 0]);
__m128i mask2 = _mm_loadu_si128((__m128i*)&mask[16]);
uint8_t x = 0x53; //0101 0011
__m128i t1 = _mm_set1_epi8(x);
__m128i t2 = _mm_and_si128(t1, mask1);
__m128i t3 = _mm_and_si128(t1, mask2);
__m128i t4 = _mm_cmpeq_epi32(t2,_mm_setzero_si128());
__m128i t5 = _mm_cmpeq_epi32(t3,_mm_setzero_si128());
t4 = _mm_xor_si128(t4, _mm_set1_epi32(-1));
t5 = _mm_xor_si128(t5, _mm_set1_epi32(-1));
int o1[4], o2[4];
_mm_store_si128((__m128i*)o1, t4);
_mm_store_si128((__m128i*)o2, t5);
for(int i=0; i<4; i++) printf("%d \n", o1[i]);
for(int i=0; i<4; i++) printf("%d \n", o2[i]);
}
Edit:
PaulR improved my solution
__m256i v = _mm256_set1_epi8(u);
v = _mm256_and_si256(v, mask);
v = _mm256_xor_si256(v, mask);
return _mm256_cmpeq_epi32(v, _mm256_setzero_si256());
with the mask defined as
int mask[8] = {
0x01010101, 0x02020202, 0x04040404, 0x08080808,
0x10101010, 0x20202020, 0x40404040, 0x80808080,
};
See his answer with performance testing for more details.

Based on all the answers, I hacked up a solution using Agner Fog's excellent library (which handles both AVX2, AVX and SSE solutions with a common abstraction). Figured I would share it as an alternative answer:
// Used to generate 32 bit vector bitmasks from 8 bit ints
static const Vec8ui VecBitMask8(
0x01010101
, 0x02020202
, 0x04040404
, 0x08080808
, 0x10101010
, 0x20202020
, 0x40404040
, 0x80808080);
// As above, but for 64 bit vectors and 4 bit ints
static const Vec4uq VecBitMask4(
0x0101010101010101
, 0x0202020202020202
, 0x0404040404040404
, 0x0808080808080808);
template <typename V>
inline static Vec32c getBitmapMask();
template <> inline Vec32c getBitmapMask<Vec8ui>() {return VecBitMask8;};
template <> inline Vec32c getBitmapMask<Vec8i>() {return VecBitMask8;};
template <> inline Vec32c getBitmapMask<Vec4uq>() {return VecBitMask4;};
template <> inline Vec32c getBitmapMask<Vec4q>() {return VecBitMask4;};
// Returns a bool vector representing the bitmask passed.
template <typename V>
static inline V getBitmap(const uint8_t bitMask) {
Vec32c mask = getBitmapMask<V>();
Vec32c v1(bitMask);
v1 = v1 & mask;
return ((V)v1 == (V)mask);
}

Fast Converting RGBA to ARGB

I am trying to convert a rgba buffer into argb, is there any way to improve the next algorithm, or any other faster way to perform such operation?
Taking into account that the alpha value is not important once in the argb buffer, and should always end up as 0xFF.
int y, x, pixel;
for (y = 0; y < height; y++)
{
for (x = 0; x < width; x++)
{
pixel = rgbaBuffer[y * width + x];
argbBuffer[(height - y - 1) * width + x] = (pixel & 0xff00ff00) | ((pixel << 16) & 0x00ff0000) | ((pixel >> 16) & 0xff);
}
}

I will focus only in the swap function:
typedef unsigned int Color32;
inline Color32 Color32Reverse(Color32 x)
{
return
// Source is in format: 0xAARRGGBB
((x & 0xFF000000) >> 24) | //______AA
((x & 0x00FF0000) >> 8) | //____RR__
((x & 0x0000FF00) << 8) | //__GG____
((x & 0x000000FF) << 24); //BB______
// Return value is in format: 0xBBGGRRAA
}

Assuming that the code is not buggy (just inefficient), I can guess that all you want to do is swap every second (even-numbered) byte (and of course invert the buffer), isn't it?
So you can achieve some optimizations by:
Avoiding the shift and masking operations
Optimizing the loop, eg economizing in the indices calculations
I would rewrite the code as follows:
int y, x;
for (y = 0; y < height; y++)
{
unsigned char *pRGBA= (unsigned char *)(rgbaBuffer+y*width);
unsigned char *pARGB= (unsigned char *)(argbBuffer+(height-y-1)*width);
for (x = 4*(width-1); x>=0; x-=4)
{
pARGB[x ] = pRGBA[x+2];
pARGB[x+1] = pRGBA[x+1];
pARGB[x+2] = pRGBA[x ];
pARGB[x+3] = 0xFF;
}
}
Please note that the more complex indices calculation is performed in the outer loop only. There are four acesses to both rgbaBuffer and argbBuffer for each pixel, but I think this is more than offset by avoiding the bitwise operations and the indixes calculations. An alternative would be (like in your code) fetch/store one pixel (int) at a time, and make the processing locally (this econimizes in memory accesses), but unless you have some efficient way to swap the two bytes and set the alpha locally (eg some inline assembly, so that you make sure that everything is performed at registers level), it won't really help.

Code you provided is very strange since it shuffles color components not rgba->argb, but rgba->rabg.
I've made a correct and optimized version of this routine.
int pixel;
int size = width * height;
for (unsigned int * rgba_ptr = rgbaBuffer, * argb_ptr = argbBuffer + size - 1; argb_ptr >= argbBuffer; rgba_ptr++, argb_ptr--)
{
// *argb_ptr = *rgba_ptr >> 8 | 0xff000000; // - this version doesn't change endianess
*argb_ptr = __builtin_bswap32(*rgba_ptr) >> 8 | 0xff000000; // This does
}
The first thing i've made is simplifying your shuffling expression. It is obvious that XRGB is just RGBA >> 8.
Also i've removed calculation of array index on each iteration and used pointers as loop variables.
This version is about 2 times faster than the original on my machine.
You can also use SSE for shuffling if this code is intended for x86 CPU.

I am very late to this one. But I had the exact same problem when generating video on the fly. By reusing the buffer, I could get away with only setting the R, G, B values for every frame and only setting the A once.
See below code:
byte[] _workingBuffer = null;
byte[] GetProcessedPixelData(SKBitmap bitmap)
{
ReadOnlySpan<byte> sourceSpan = bitmap.GetPixelSpan();
if (_workingBuffer == null || _workingBuffer.Length != bitmap.ByteCount)
{
// Alloc buffer
_workingBuffer = new byte[sourceSpan.Length];
// Set all the alpha
for (int i = 0; i < sourceSpan.Length; i += 4) _workingBuffer[i] = byte.MaxValue;
}
Stopwatch w = Stopwatch.StartNew();
for (int i = 0; i < sourceSpan.Length; i += 4)
{
// A
// Dont set alpha here. The alpha is already set in the buffer
//_workingBuffer[i] = byte.MaxValue;
//_workingBuffer[i] = sourceSpan[i + 3];
// R
_workingBuffer[i + 1] = sourceSpan[i];
// G
_workingBuffer[i + 2] = sourceSpan[i + 1];
// B
_workingBuffer[i + 3] = sourceSpan[i + 2];
}
Debug.Print("Copied " + sourceSpan.Length + " in " + w.Elapsed.TotalMilliseconds);
return _workingBuffer;
}
This got me to around 15 milliseconds on an iPhone for a (1920 * 1080 * 4) buffer which is ~8mb.
This was not nearly enough for me. My final solution was instead to do a offset memcopy (Buffer.BlockCopy in C#) since the alpha is not important.
byte[] _workingBuffer = null;
byte[] GetProcessedPixelData(SKBitmap bitmap)
{
ReadOnlySpan<byte> sourceSpan = bitmap.GetPixelSpan();
byte[] sourceArray = sourceSpan.ToArray();
if (_workingBuffer == null || _workingBuffer.Length != bitmap.ByteCount)
{
// Alloc buffer
_workingBuffer = new byte[sourceSpan.Length];
// Set first byte. This is the alpha component of the first pixel
_workingBuffer[0] = byte.MaxValue;
}
// Converts RGBA to ARGB in ~2 ms instead of ~15 ms
//
// Copies the whole buffer with a offset of 1
// R G B A R G B A R G B A
// Originally the source buffer has: R1, G1, B1, A1, R2, G2, B2, A2, R3, G3, B3, A3
// A R G B A R G B A R G B A
// After the copy it looks like: 0, R1, G1, B1, A1, R2, G2, B2, A2, R3, G3, B3, A3
// So essentially we get the wrong alpha for every pixel. But all alphas should be 255 anyways.
// The first byte is set in the alloc
Buffer.BlockCopy(sourceArray, 0, _workingBuffer, 1, sourceSpan.Length - 1);
// Below is an inefficient method of converting RGBA to ARGB. Takes ~15 ms on iPhone 12 Pro Max for a 8mb buffer (1920 * 1080 * 4 bytes)
/*
for (int i = 0; i < sourceSpan.Length; i += 4)
{
// A
// Dont set alpha here. The alpha is already set in the buffer
//_workingBuffer[i] = byte.MaxValue;
//_workingBuffer[i] = sourceSpan[i + 3];
byte sR = sourceSpan[i];
byte sG = sourceSpan[i + 1];
byte sB = sourceSpan[i + 2];
if (sR == 0 && sG == byte.MaxValue && sB == 0)
continue;
// R
_workingBuffer[i + 1] = sR;
// G
_workingBuffer[i + 2] = sG;
// B
_workingBuffer[i + 3] = sB;
}
*/
return _workingBuffer;
}
The code is commented on how this works. On my same iPhone it takes ~2 ms which is sufficient for my use case.

Use assembly, the following is for Intel.
This example swaps Red and Blue.
void* b = pixels;
UINT len = textureWidth*textureHeight;
__asm
{
mov ecx, len // Set loop counter to pixels memory block size
mov ebx, b // Set ebx to pixels pointer
label:
mov al,[ebx+0] // Load Red to al
mov ah,[ebx+2] // Load Blue to ah
mov [ebx+0],ah // Swap Red
mov [ebx+2],al // Swap Blue
add ebx,4 // Move by 4 bytes to next pixel
dec ecx // Decrease loop counter
jnz label // If not zero jump to label
}

(pixel << 24) | (pixel >> 8) rotates a 32-bit integer 8 bits to the right, which would convert a 32-bit RGBA value to ARGB. This works because:
pixel << 24 discards the RGB portion of RGBA off the left side, resulting in A000.
pixel >> 8 discards the A portion of RGBA off the right side, resulting in 0RGB.
A000 | 0RGB == ARGB.

efficiently find the first element matching a bit mask

I have a list of N 64-bit integers whose bits represent small sets. Each integer has at most k bits set to 1. Given a bit mask, I would like to find the first element in the list that matches the mask, i.e. element & mask == element.
Example:
If my list is:
index abcdef
0 001100
1 001010
2 001000
3 000100
4 000010
5 000001
6 010000
7 100000
8 000000
and my mask is 111000, the first element matching the mask is at index 2.
Method 1:
Linear search through the entire list. This takes O(N) time and O(1) space.
Method 2:
Precompute a tree of all possible masks, and at each node keep the answer for that mask. This takes O(1) time for the query, but takes O(2^64) space.
Question:
How can I find the first element matching the mask faster than O(N), while still using a reasonable amount of space? I can afford to spend polynomial time in precomputation, because there will be a lot of queries. The key is that k is small. In my application, k <= 5 and N is in the thousands. The mask has many 1s; you can assume that it is drawn uniformly from the space of 64-bit integers.
Update:
Here is an example data set and a simple benchmark program that runs on Linux: http://up.thirld.com/binmask.tar.gz. For large.in, N=3779 and k=3. The first line is N, followed by N unsigned 64-bit ints representing the elements. Compile with make. Run with ./benchmark.e >large.out to create the true output, which you can then diff against. (Masks are generated randomly, but the random seed is fixed.) Then replace the find_first() function with your implementation.
The simple linear search is much faster than I expected. This is because k is small, and so for a random mask, a match is found very quickly on average.

A suffix tree (on bits) will do the trick, with the original priority at the leaf nodes:
000000 -> 8
1 -> 5
10 -> 4
100 -> 3
1000 -> 2
10 -> 1
100 -> 0
10000 -> 6
100000 -> 7
where if the bit is set in the mask, you search both arms, and if not, you search only the 0 arm; your answer is the minimum number you encounter at a leaf node.
You can improve this (marginally) by traversing the bits not in order but by maximum discriminability; in your example, note that 3 elements have bit 2 set, so you would create
2:0 0:0 1:0 3:0 4:0 5:0 -> 8
5:1 -> 5
4:1 5:0 -> 4
3:1 4:0 5:0 -> 3
1:1 3:0 4:0 5:0 -> 6
0:1 1:0 3:0 4:0 5:0 -> 7
2:1 0:0 1:0 3:0 4:0 5:0 -> 2
4:1 5:0 -> 1
3:1 4:0 5:0 -> 0
In your example mask this doesn't help (since you have to traverse both the bit2==0 and bit2==1 sides since your mask is set in bit 2), but on average it will improve the results (but at a cost of setup and more complex data structure). If some bits are much more likely to be set than others, this could be a huge win. If they're pretty close to random within the element list, then this doesn't help at all.
If you're stuck with essentially random bits set, you should get about (1-5/64)^32 benefit from the suffix tree approach on average (13x speedup), which might be better than the difference in efficiency due to using more complex operations (but don't count on it--bit masks are fast). If you have a nonrandom distribution of bits in your list, then you could do almost arbitrarily well.

This is the bitwise Kd-tree. It typically needs less than 64 visits per lookup operation. Currently, the selection of the bit (dimension) to pivot on is random.
#include <limits.h>
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
typedef unsigned long long Thing;
typedef unsigned long Number;
unsigned thing_ffs(Thing mask);
Thing rand_mask(unsigned bitcnt);
#define WANT_RANDOM 31
#define WANT_BITS 3
#define BITSPERTHING (CHAR_BIT*sizeof(Thing))
#define NONUMBER ((Number)-1)
struct node {
Thing value;
Number num;
Number nul;
Number one;
char pivot;
} *nodes = NULL;
unsigned nodecount=0;
unsigned itercount=0;
struct node * nodes_read( unsigned *sizp, char *filename);
Number *find_ptr_to_insert(Number *ptr, Thing value, Thing mask);
unsigned grab_matches(Number *result, Number num, Thing mask);
void initialise_stuff(void);
int main (int argc, char **argv)
{
Thing mask;
Number num;
unsigned idx;
srand (time(NULL));
nodes = nodes_read( &nodecount, argv[1]);
fprintf( stdout, "Nodecount=%u\n", nodecount );
initialise_stuff();
#if WANT_RANDOM
mask = nodes[nodecount/2].value | nodes[nodecount/3].value ;
#else
mask = 0x38;
#endif
fprintf( stdout, "\n#### Search mask=%llx\n", (unsigned long long) mask );
itercount = 0;
num = NONUMBER;
idx = grab_matches(&num,0, mask);
fprintf( stdout, "Itercount=%u\n", itercount );
fprintf(stdout, "KdTree search %16llx\n", (unsigned long long) mask );
fprintf(stdout, "Count=%u Result:\n", idx);
idx = num;
if (idx >= nodecount) idx = nodecount-1;
fprintf( stdout, "num=%4u Value=%16llx\n"
,(unsigned) nodes[idx].num
,(unsigned long long) nodes[idx].value
);
fprintf( stdout, "\nLinear search %16llx\n", (unsigned long long) mask );
for (idx = 0; idx < nodecount; idx++) {
if ((nodes[idx].value & mask) == nodes[idx].value) break;
}
fprintf(stdout, "Cnt=%u\n", idx);
if (idx >= nodecount) idx = nodecount-1;
fprintf(stdout, "Num=%4u Value=%16llx\n"
, (unsigned) nodes[idx].num
, (unsigned long long) nodes[idx].value );
return 0;
}
void initialise_stuff(void)
{
unsigned num;
Number root, *ptr;
root = 0;
for (num=0; num < nodecount; num++) {
nodes[num].num = num;
nodes[num].one = NONUMBER;
nodes[num].nul = NONUMBER;
nodes[num].pivot = -1;
}
nodes[num-1].value = 0; /* last node is guaranteed to match anything */
root = 0;
for (num=1; num < nodecount; num++) {
ptr = find_ptr_to_insert (&root, nodes[num].value, 0ull );
if (*ptr == NONUMBER) *ptr = num;
else fprintf(stderr, "Found %u for %u\n"
, (unsigned)*ptr, (unsigned) num );
}
}
Thing rand_mask(unsigned bitcnt)
{struct node * nodes_read( unsigned *sizp, char *filename)
{
struct node *ptr;
unsigned size,used;
FILE *fp;
if (!filename) {
size = (WANT_RANDOM+0) ? WANT_RANDOM : 9;
ptr = malloc (size * sizeof *ptr);
#if (!WANT_RANDOM)
ptr[0].value = 0x0c;
ptr[1].value = 0x0a;
ptr[2].value = 0x08;
ptr[3].value = 0x04;
ptr[4].value = 0x02;
ptr[5].value = 0x01;
ptr[6].value = 0x10;
ptr[7].value = 0x20;
ptr[8].value = 0x00;
#else
for (used=0; used < size; used++) {
ptr[used].value = rand_mask(WANT_BITS);
}
#endif /* WANT_RANDOM */
*sizp = size;
return ptr;
}
fp = fopen( filename, "r" );
if (!fp) return NULL;
fscanf(fp,"%u\n", &size );
fprintf(stderr, "Size=%u\n", size);
ptr = malloc (size * sizeof *ptr);
for (used = 0; used < size; used++) {
fscanf(fp,"%llu\n", &ptr[used].value );
}
fclose( fp );
*sizp = used;
return ptr;
}
Thing value = 0;
unsigned bit, cnt;
for (cnt=0; cnt < bitcnt; cnt++) {
bit = 54321*rand();
bit %= BITSPERTHING;
value |= 1ull << bit;
}
return value;
}
Number *find_ptr_to_insert(Number *ptr, Thing value, Thing done)
{
Number num=NONUMBER;
while ( *ptr != NONUMBER) {
Thing wrong;
num = *ptr;
wrong = (nodes[num].value ^ value) & ~done;
if (nodes[num].pivot < 0) { /* This node is terminal */
/* choose one of the wrong bits for a pivot .
** For this bit (nodevalue==1 && searchmask==0 )
*/
if (!wrong) wrong = ~done ;
nodes[num].pivot = thing_ffs( wrong );
}
ptr = (wrong & 1ull << nodes[num].pivot) ? &nodes[num].nul : &nodes[num].one;
/* Once this bit has been tested, it can be masked off. */
done |= 1ull << nodes[num].pivot ;
}
return ptr;
}
unsigned grab_matches(Number *result, Number num, Thing mask)
{
Thing wrong;
unsigned count;
for (count=0; num < *result; ) {
itercount++;
wrong = nodes[num].value & ~mask;
if (!wrong) { /* we have a match */
if (num < *result) { *result = num; count++; }
/* This is cheap pruning: the break will omit both subtrees from the results.
** But because we already have a result, and the subtrees have higher numbers
** than our current num, we can ignore them. */
break;
}
if (nodes[num].pivot < 0) { /* This node is terminal */
break;
}
if (mask & 1ull << nodes[num].pivot) {
/* avoid recursion if there is only one non-empty subtree */
if (nodes[num].nul >= *result) { num = nodes[num].one; continue; }
if (nodes[num].one >= *result) { num = nodes[num].nul; continue; }
count += grab_matches(result, nodes[num].nul, mask);
count += grab_matches(result, nodes[num].one, mask);
break;
}
mask |= 1ull << nodes[num].pivot;
num = (wrong & 1ull << nodes[num].pivot) ? nodes[num].nul : nodes[num].one;
}
return count;
}
unsigned thing_ffs(Thing mask)
{
unsigned bit;
#if 1
if (!mask) return (unsigned)-1;
for ( bit=random() % BITSPERTHING; 1 ; bit += 5, bit %= BITSPERTHING) {
if (mask & 1ull << bit ) return bit;
}
#elif 0
for (bit =0; bit < BITSPERTHING; bit++ ) {
if (mask & 1ull <<bit) return bit;
}
#else
mask &= (mask-1); // Kernighan-trick
for (bit =0; bit < BITSPERTHING; bit++ ) {
mask >>=1;
if (!mask) return bit;
}
#endif
return 0xffffffff;
}
struct node * nodes_read( unsigned *sizp, char *filename)
{
struct node *ptr;
unsigned size,used;
FILE *fp;
if (!filename) {
size = (WANT_RANDOM+0) ? WANT_RANDOM : 9;
ptr = malloc (size * sizeof *ptr);
#if (!WANT_RANDOM)
ptr[0].value = 0x0c;
ptr[1].value = 0x0a;
ptr[2].value = 0x08;
ptr[3].value = 0x04;
ptr[4].value = 0x02;
ptr[5].value = 0x01;
ptr[6].value = 0x10;
ptr[7].value = 0x20;
ptr[8].value = 0x00;
#else
for (used=0; used < size; used++) {
ptr[used].value = rand_mask(WANT_BITS);
}
#endif /* WANT_RANDOM */
*sizp = size;
return ptr;
}
fp = fopen( filename, "r" );
if (!fp) return NULL;
fscanf(fp,"%u\n", &size );
fprintf(stderr, "Size=%u\n", size);
ptr = malloc (size * sizeof *ptr);
for (used = 0; used < size; used++) {
fscanf(fp,"%llu\n", &ptr[used].value );
}
fclose( fp );
*sizp = used;
return ptr;
}
UPDATE:
I experimented a bit with the pivot-selection, favouring bits with the highest discriminatory value ("information content"). This involves:
making a histogram of the usage of bits (can be done while initialising)
while building the tree: choosing the one with frequency closest to 1/2 in the remaining subtrees.
The result: the random pivot selection performed better.

Construct a a binary tree as follows:
Every level corresponds to a bit
It corresponding bit is on go right, otherwise left
This way insert every number in the database.
Now, for searching: if the corresponding bit in the mask is 1, traverse both children. If it is 0, traverse only the left node. Essentially keep traversing the tree until you hit the leaf node (BTW, 0 is a hit for every mask!).
This tree will have O(N) space requirements.
Eg of tree for 1 (001), 2(010) and 5 (101)
root
/ \
0 1
/ \ |
0 1 0
| | |
1 0 1
(1) (2) (5)

With precomputed bitmasks. Formally is is still O(N), since the and-mask operations are O(N). The final pass is also O(N), because it needs to find the lowest bit set, but that could be sped up, too.
#include <limits.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
/* For demonstration purposes.
** In reality, this should be an unsigned long long */
typedef unsigned char Thing;
#define BITSPERTHING (CHAR_BIT*sizeof (Thing))
#define COUNTOF(a) (sizeof a / sizeof a[0])
Thing data[] =
/****** index abcdef */
{ 0x0c /* 0 001100 */
, 0x0a /* 1 001010 */
, 0x08 /* 2 001000 */
, 0x04 /* 3 000100 */
, 0x02 /* 4 000010 */
, 0x01 /* 5 000001 */
, 0x10 /* 6 010000 */
, 0x20 /* 7 100000 */
, 0x00 /* 8 000000 */
};
/* Note: this is for demonstration purposes.
** Normally, one should choose a machine wide unsigned int
** for bitmask arrays.
*/
struct bitmap {
char data[ 1+COUNTOF (data)/ CHAR_BIT ];
} nulmaps [ BITSPERTHING ];
#define BITSET(a,i) (a)[(i) / CHAR_BIT ] |= (1u << ((i)%CHAR_BIT) )
#define BITTEST(a,i) ((a)[(i) / CHAR_BIT ] & (1u << ((i)%CHAR_BIT) ))
void init_tabs(void);
void map_empty(struct bitmap *dst);
void map_full(struct bitmap *dst);
void map_and2(struct bitmap *dst, struct bitmap *src);
int main (void)
{
Thing mask;
struct bitmap result;
unsigned ibit;
mask = 0x38;
init_tabs();
map_full(&result);
for (ibit = 0; ibit < BITSPERTHING; ibit++) {
/* bit in mask is 1, so bit at this position is in fact a don't care */
if (mask & (1u <<ibit)) continue;
/* bit in mask is 0, so we can only select items with a 0 at this bitpos */
map_and2(&result, &nulmaps[ibit] );
}
/* This is not the fastest way to find the lowest 1 bit */
for (ibit = 0; ibit < COUNTOF (data); ibit++) {
if (!BITTEST(result.data, ibit) ) continue;
fprintf(stdout, " %u", ibit);
}
fprintf( stdout, "\n" );
return 0;
}
void init_tabs(void)
{
unsigned ibit, ithing;
/* 1 bits in data that dont overlap with 1 bits in the searchmask are showstoppers.
** So, for each bitpos, we precompute a bitmask of all *entrynumbers* from data[], that contain 0 in bitpos.
*/
memset(nulmaps, 0 , sizeof nulmaps);
for (ithing=0; ithing < COUNTOF(data); ithing++) {
for (ibit=0; ibit < BITSPERTHING; ibit++) {
if ( data[ithing] & (1u << ibit) ) continue;
BITSET(nulmaps[ibit].data, ithing);
}
}
}
/* Logical And of two bitmask arrays; simular to dst &= src */
void map_and2(struct bitmap *dst, struct bitmap *src)
{
unsigned idx;
for (idx = 0; idx < COUNTOF(dst->data); idx++) {
dst->data[idx] &= src->data[idx] ;
}
}
void map_empty(struct bitmap *dst)
{
memset(dst->data, 0 , sizeof dst->data);
}
void map_full(struct bitmap *dst)
{
unsigned idx;
/* NOTE this loop sets too many bits to the left of COUNTOF(data) */
for (idx = 0; idx < COUNTOF(dst->data); idx++) {
dst->data[idx] = ~0;
}
}

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio