Finding the most frequently occurring element in an SSE register - algorithm

Does anyone have any thoughts on how to calculate the mode (statistic) of a vector of 8-bit integers in SSE4.x? To clarify, this would be 16x8-bit values in a 128-bit register.
I want the result as a vector mask which selects the mode-valued elements. i.e. the result of _mm_cmpeq_epi8(v, set1(mode(v))), as well as the scalar value.
Providing some additional context; while the above problem is an interesting one to solve in its own right, I have been through most algorithms I can think of with linear complexity. This class will wipe out any gains I can get from calculating this number.
I hope to engage you all in searching for some deep magic, here. It's possible that an approximation may be necessary to break this bound, such as "select a frequently occurring element" for example (N.B. difference against the most), which would be of merit. A probabilistic answer would be usable, too.
SSE and x86 have some very interesting semantics. It may be worth exploring a superoptimization pass.

Probably a relatively simple brute force SSEx approach is suitable here, see the code below.
The idea is to byte-rotate the input vector v by 1 to 15 positions and compare the rotated vector
with the original v for equality. To shorten the dependency chain and to increase the
instruction level parallelism, two counters are used to count (vertical sum) these equal elements:
sum1 and sum2, because there might be architectures that benefit from this.
Equal elements are counted as -1. Variable sum = sum1 + sum2 contains the total count with values
between -1 and -16. min_brc contains the horizontal minimum of sum broadcasted to all elements.
mask = _mm_cmpeq_epi8(sum,min_brc) is the mask for the mode-valued elements requested as an
intermediate result by the OP. In the next few lines of the code the actual mode is extracted.
This solution is certainly faster than a scalar solution.
Note that with AVX2 the upper 128-bit lanes can be used to speedup the computation further.
It takes 20 cycles (throughput) to compute only the a mask for the mode-valued elements. With the actual mode broadcasted
across the SSE register it takes about 21.4 cycles.
Note the behaviour in the next example:
[1, 1, 3, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] returns mask=[-1,-1,-1,-1,0,0,...,0]
and the mode value is 1, although 1 occurs as often as 3.
The code below is tested, but not thoroughly tested
#include <stdio.h>
#include <x86intrin.h>
/* gcc -O3 -Wall -m64 -march=nehalem mode_uint8.c */
int print_vec_char(__m128i x);
__m128i mode_statistic(__m128i v){
__m128i sum2 = _mm_set1_epi8(-1); /* Each integer occurs at least one time */
__m128i v_rot1 = _mm_alignr_epi8(v,v,1);
__m128i v_rot2 = _mm_alignr_epi8(v,v,2);
__m128i sum1 = _mm_cmpeq_epi8(v,v_rot1);
sum2 = _mm_add_epi8(sum2,_mm_cmpeq_epi8(v,v_rot2));
__m128i v_rot3 = _mm_alignr_epi8(v,v,3);
__m128i v_rot4 = _mm_alignr_epi8(v,v,4);
sum1 = _mm_add_epi8(sum1,_mm_cmpeq_epi8(v,v_rot3));
sum2 = _mm_add_epi8(sum2,_mm_cmpeq_epi8(v,v_rot4));
__m128i v_rot5 = _mm_alignr_epi8(v,v,5);
__m128i v_rot6 = _mm_alignr_epi8(v,v,6);
sum1 = _mm_add_epi8(sum1,_mm_cmpeq_epi8(v,v_rot5));
sum2 = _mm_add_epi8(sum2,_mm_cmpeq_epi8(v,v_rot6));
__m128i v_rot7 = _mm_alignr_epi8(v,v,7);
__m128i v_rot8 = _mm_alignr_epi8(v,v,8);
sum1 = _mm_add_epi8(sum1,_mm_cmpeq_epi8(v,v_rot7));
sum2 = _mm_add_epi8(sum2,_mm_cmpeq_epi8(v,v_rot8));
__m128i v_rot9 = _mm_alignr_epi8(v,v,9);
__m128i v_rot10 = _mm_alignr_epi8(v,v,10);
sum1 = _mm_add_epi8(sum1,_mm_cmpeq_epi8(v,v_rot9));
sum2 = _mm_add_epi8(sum2,_mm_cmpeq_epi8(v,v_rot10));
__m128i v_rot11 = _mm_alignr_epi8(v,v,11);
__m128i v_rot12 = _mm_alignr_epi8(v,v,12);
sum1 = _mm_add_epi8(sum1,_mm_cmpeq_epi8(v,v_rot11));
sum2 = _mm_add_epi8(sum2,_mm_cmpeq_epi8(v,v_rot12));
__m128i v_rot13 = _mm_alignr_epi8(v,v,13);
__m128i v_rot14 = _mm_alignr_epi8(v,v,14);
sum1 = _mm_add_epi8(sum1,_mm_cmpeq_epi8(v,v_rot13));
sum2 = _mm_add_epi8(sum2,_mm_cmpeq_epi8(v,v_rot14));
__m128i v_rot15 = _mm_alignr_epi8(v,v,15);
sum1 = _mm_add_epi8(sum1,_mm_cmpeq_epi8(v,v_rot15));
__m128i sum = _mm_add_epi8(sum1,sum2); /* Sum contains values such as -1, -2 ,...,-16 */
/* The next three instructions compute the horizontal minimum of sum */
__m128i sum_shft = _mm_srli_epi16(sum,8); /* Shift right 8 bits, while shifting in zeros */
__m128i min1 = _mm_min_epu8(sum,sum_shft); /* sum and sum_shuft are considered as unsigned integers. sum_shft is zero at the odd positions and so is min1 */
__m128i min2 = _mm_minpos_epu16(min1); /* Byte 0 within min2 contains the horizontal minimum of sum */
__m128i min_brc = _mm_shuffle_epi8(min2,_mm_setzero_si128()); /* Broadcast horizontal minimum */
__m128i mask = _mm_cmpeq_epi8(sum,min_brc); /* Mask = -1 at the byte positions where the value of v is equal to the mode of v */
/* comment next 4 lines out if there is no need to broadcast the mode value */
int bitmask = _mm_movemask_epi8(mask);
int indx = __builtin_ctz(bitmask); /* Index of mode */
__m128i v_indx = _mm_set1_epi8(indx); /* Broadcast indx */
__m128i answer = _mm_shuffle_epi8(v,v_indx); /* Broadcast mode to each element of answer */
/* Uncomment lines below to print intermediate results, to see how it works. */
// printf("sum = ");print_vec_char (sum );
// printf("sum_shft = ");print_vec_char (sum_shft );
// printf("min1 = ");print_vec_char (min1 );
// printf("min2 = ");print_vec_char (min2 );
// printf("min_brc = ");print_vec_char (min_brc );
// printf("mask = ");print_vec_char (mask );
// printf("v_indx = ");print_vec_char (v_indx );
// printf("answer = ");print_vec_char (answer );
return answer; /* or return mask, or return both .... :) */
}
int main() {
/* To test throughput set throughput_test to 1, otherwise 0 */
/* Use e.g. perf stat -d ./a.out to test throughput */
#define throughput_test 0
/* Different test vectors */
int i;
char x1[16] = {5, 2, 2, 7, 21, 4, 7, 7, 3, 9, 2, 5, 4, 3, 5, 5};
char x2[16] = {5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
char x3[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
char x4[16] = {1, 2, 3, 2, 1, 6, 7, 8, 2, 2, 2, 3, 3, 2, 15, 16};
char x5[16] = {1, 1, 3, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
printf("\n15...0 = 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0\n\n");
__m128i x_vec = _mm_loadu_si128((__m128i*)x1);
printf("x_vec = ");print_vec_char(x_vec );
__m128i y = mode_statistic (x_vec);
printf("answer = ");print_vec_char(y );
#if throughput_test == 1
__m128i x_vec1 = _mm_loadu_si128((__m128i*)x1);
__m128i x_vec2 = _mm_loadu_si128((__m128i*)x2);
__m128i x_vec3 = _mm_loadu_si128((__m128i*)x3);
__m128i x_vec4 = _mm_loadu_si128((__m128i*)x4);
__m128i x_vec5 = _mm_loadu_si128((__m128i*)x5);
__m128i y1, y2, y3, y4, y5;
__asm__ __volatile__ ( "vzeroupper" : : : ); /* Remove this line on non-AVX processors */
for (i=0;i<100000000;i++){
y1 = mode_statistic (x_vec1);
y2 = mode_statistic (x_vec2);
y3 = mode_statistic (x_vec3);
y4 = mode_statistic (x_vec4);
y5 = mode_statistic (x_vec5);
x_vec1 = mode_statistic (y1 );
x_vec2 = mode_statistic (y2 );
x_vec3 = mode_statistic (y3 );
x_vec4 = mode_statistic (y4 );
x_vec5 = mode_statistic (y5 );
}
printf("mask mode = ");print_vec_char(y1 );
printf("mask mode = ");print_vec_char(y2 );
printf("mask mode = ");print_vec_char(y3 );
printf("mask mode = ");print_vec_char(y4 );
printf("mask mode = ");print_vec_char(y5 );
#endif
return 0;
}
int print_vec_char(__m128i x){
char v[16];
_mm_storeu_si128((__m128i *)v,x);
printf("%3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi\n",
v[15],v[14],v[13],v[12],v[11],v[10],v[9],v[8],v[7],v[6],v[5],v[4],v[3],v[2],v[1],v[0]);
return 0;
}
Output:
15...0 = 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
x_vec = 5 5 3 4 | 5 2 9 3 | 7 7 4 21 | 7 2 2 5
sum = -4 -4 -2 -2 | -4 -3 -1 -2 | -3 -3 -2 -1 | -3 -3 -3 -4
min_brc = -4 -4 -4 -4 | -4 -4 -4 -4 | -4 -4 -4 -4 | -4 -4 -4 -4
mask = -1 -1 0 0 | -1 0 0 0 | 0 0 0 0 | 0 0 0 -1
answer = 5 5 5 5 | 5 5 5 5 | 5 5 5 5 | 5 5 5 5
The horizontal minimum is computed with Evgeny Kluev's method.

Sort the data in the register.
Insertion sort can be done in 16 (15) steps, by initializing the register to "Infinity", which tries to illustrate a monotonically decreasing array and inserting the new element in parallel to all possible places:
// e.g. FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF 78
__m128i sorted = _mm_or_si128(my_array, const_FFFFF00);
for (int i = 1; i < 16; ++i)
{
// Trying to insert e.g. A0, we must shift all the FF's to left
// e.g. FF FF FF FF FF FF FF FF FF FF FF FF FF FF 78 00
__m128i shifted = _mm_bslli_si128(sorted, 1);
// Taking the MAX of shifted and 'A0 on all places'
// e.g. FF FF FF FF FF FF FF FF FF FF FF FF FF FF A0 A0
shifted = _mm_max_epu8(shifted, _mm_set1_epi8(my_array[i]));
// and minimum of the shifted + original --
// e.g. FF FF FF FF FF FF FF FF FF FF FF FF FF FF A0 78
sorted = _mm_min_epu8(sorted, shifted);
}
Then calculate mask for vec[n+1] == vec[n], move mask to GPR and use that to index a 32768 entry LUT for best index location.
In real case one probably want's to sort more than just one vector; i.e. sort 16 16-entry vectors at once;
__m128i input[16]; // not 1, but 16 vectors
transpose16x16(input); // inplace vector transpose
sort(transpose); // 60-stage network exists for 16 inputs
// linear search -- result in 'mode'
__m128i mode = input[0];
__m128i previous = mode;
__m128i count = _mm_set_epi8(0);
__m128i max_count = _mm_setzero_si128(0);
for (int i = 1; i < 16; i++)
{
__m128i &current = input[i];
// histogram count is off by one
// if (current == previous) count++;
// else count = 0;
// if (count > max_count)
// mode = current, max_count = count
prev = _mm_cmpeq_epi8(prev, current);
count = _mm_and_si128(_mm_sub_epi8(count, prev), prev);
__m128i max_so_far = _mm_cmplt_epi8(max_count, count);
mode = _mm_blendv_epi8(mode, current, max_so_far);
max_count = _mm_max_epi8(max_count, count);
previous = current;
}
The inner loop totals amortized cost of 7-8 instructions per result;
Sorting has typically 2 instructions per stage -- i.e. 8 instructions per result, when 16 results need 60 stages or 120 instructions.
(This still leaves the transpose as an exercise -- but I think it should be vastly faster than sorting?)
So, this should be in the ball park of 24 instructions per 8-bit result.

For performance comparison with scalar code. Non-vectorized on main part but vectorized on table-clear and tmp initialization. (168 cycles per f() call for fx8150 (22M calls complete in 1.0002 seconds at 3.7 GHz))
#include <x86intrin.h>
unsigned char tmp[16]; // extracted values are here (single instruction, store_ps)
unsigned char table[256]; // counter table containing zeroes
char f(__m128i values)
{
_mm_store_si128((__m128i *)tmp,values);
int maxOccurence=0;
int currentValue=0;
for(int i=0;i<16;i++)
{
unsigned char ind=tmp[i];
unsigned char t=table[ind];
t++;
if(t>maxOccurence)
{
maxOccurence=t;
currentValue=ind;
}
table[ind]=t;
}
for(int i=0;i<256;i++)
table[i]=0;
return currentValue;
}
g++ 6.3 output:
f: # #f
movaps %xmm0, tmp(%rip)
movaps %xmm0, -24(%rsp)
xorl %r8d, %r8d
movq $-15, %rdx
movb -24(%rsp), %sil
xorl %eax, %eax
jmp .LBB0_1
.LBB0_2: # %._crit_edge
cmpl %r8d, %esi
cmovgel %esi, %r8d
movb tmp+16(%rdx), %sil
incq %rdx
.LBB0_1: # =>This Inner Loop Header: Depth=1
movzbl %sil, %edi
movb table(%rdi), %cl
incb %cl
movzbl %cl, %esi
cmpl %r8d, %esi
cmovgl %edi, %eax
movb %sil, table(%rdi)
testq %rdx, %rdx
jne .LBB0_2
xorps %xmm0, %xmm0
movaps %xmm0, table+240(%rip)
movaps %xmm0, table+224(%rip)
movaps %xmm0, table+208(%rip)
movaps %xmm0, table+192(%rip)
movaps %xmm0, table+176(%rip)
movaps %xmm0, table+160(%rip)
movaps %xmm0, table+144(%rip)
movaps %xmm0, table+128(%rip)
movaps %xmm0, table+112(%rip)
movaps %xmm0, table+96(%rip)
movaps %xmm0, table+80(%rip)
movaps %xmm0, table+64(%rip)
movaps %xmm0, table+48(%rip)
movaps %xmm0, table+32(%rip)
movaps %xmm0, table+16(%rip)
movaps %xmm0, table(%rip)
movsbl %al, %eax
ret

Related

Efficient code to load AVX vectors for 1D convolution kernel of length 8

An implementation of a 1D convolution operation will often need to load a vectors of data that sequentially step through a buffer of data offset by one element each iteration.
For example, consider a buffer of input data X[0], X[1], ..., X[n-1], where n is greater than twice the kernel length. If the convolution length is three, and we can fit eight elements in each vector, we might first want a vector with X[0], X[1], ..., X[7], then the next with X[1], X[2], ..., X[8] and the last with X[2], X[3], ..., X[9].
Consider the case where the kernel length as well as the vector length is 8. We must load eight vectors, that might look sequentially like this:
{ 0 1 2 3 4 5 6 7 }
{ 1 2 3 4 5 6 7 8 }
{ 2 3 4 5 6 7 8 9 }
{ 3 4 5 6 7 8 9 10 }
{ 4 5 6 7 8 9 10 11 }
{ 5 6 7 8 9 10 11 12 }
{ 6 7 8 9 10 11 12 13 }
{ 7 8 9 10 11 12 13 14 }
By reducing this sequence vertically, we could produce a running mean or sum. I.e., the sum of these vectors will have the sum of the first 8 elements in it's first position.
Consider that the order of the elements in the column does not matter. Any permutation of the elements in each column will still produce the same result. For a convolution, this permutation can be accounted for by altering the order of the constants used in the kernel.
Is there a faster way to load these vectors that takes advantage of this? Consider as a baseline the simple sequence of unaligned loads:
// Any sort of sliding window function, i.e. running mean, running max, convolution, etc.
void sliding_window(const float* input, unsigned length)
{
for (unsigned i = 0; i < length - 7; i += 8) {
for (unsigned j = 0; i < 8; j++) {
__m256 v = _mm256_loadu_ps(input[i + j]);
// reduction operation on v (e.g. max or fmadd) goes here
}
}
// handle tail here
}
First of all, you should note that if your convolution is separable, this is very often worth doing. Simple example:
res[i] = x[i]+x[i+1]+x[i+2]+x[i+3]+x[i+4]+x[i+5]+x[i+6]+x[i+7];
This can be done by convoluting with [1 1] * [1 0 1] * [1 0 0 0 1] in three steps, for example like so:
void sliding_window(float* output, const float* input, size_t length)
{
// Nomenclature
// aX input at i+X
// bX convolution with [1 1] starting at i+X
// cX convolution with [1 1] * [1 0 1] starting at i+X
// dX convolution with [1 1] * [1 0 1] * [1 0 0 0 1] starting at i+X
__m256 a0 = _mm256_load_ps(input), a8 = _mm256_load_ps(input + 8);
__m256 b0 = _mm256_add_ps(a0, _mm256_loadu_ps(input+1)), b8 = _mm256_add_ps(a8, _mm256_loadu_ps(input+9));
__m256 b4 = _mm256_permute2f128_ps(b0, b8, 1+16*2);
__m256 b2 = _mm256_shuffle_ps(b0, b4, 2+3*4+0*16+1*64);
__m256 c0 = _mm256_add_ps(b0, b2);
for (unsigned i = 0; i < length - 25; i += 8) {
// Convolute input with [1 1]
__m256 a16 = _mm256_load_ps( input + i + 16);
__m256 a17 = _mm256_loadu_ps(input + i + 17);
__m256 b16 = _mm256_add_ps(a16, a17);
// Convolute first convolution with [1 0 1]
__m256 b12 = _mm256_permute2f128_ps(b8, b16, 1+16*2);
__m256 b10 = _mm256_shuffle_ps(b8, b12, 2+3*4+0*16+1*64);
__m256 c8 = _mm256_add_ps(b8, b10);
// Convolute second convolution with [1 0 0 0 1]
__m256 c4 = _mm256_permute2f128_ps(c0, c8, 1+16*2);
__m256 d0 = _mm256_add_ps(c0, c4);
// Store result
_mm256_store_ps(output + i, d0);
// rename registers for next iteration:
b8 = b16;
c0 = c8;
}
// handle tail here ...
}
You can of course replace addps by maxps. Godbolt-Demo: https://godbolt.org/z/W9K9o943o
Overall, this takes 1 aligned + 1 unaligned load, 3 shuffles, 3 additions and 1 store for 8 elements (actually only using AVX1). On Intel CPUs with only 1 shuffle per cycle this may actually just be slightly faster than a naïve 8-load, 7-addition implementation (I did not benchmark this). On Zen3 I'm not sure about the actual cost of loading unaligned data.
If you have a non-trivial kernel it is probably hard to determine if it is separable, though.
The best I've been able to come up with is this sequence:
{ 0 1 2 3 4 5 6 7 }
{ 1 2 3 8 5 6 7 12 }
{ 2 3 8 9 6 7 12 13 }
{ 3 8 9 10 7 12 13 14 }
{ 4 5 6 7 8 9 10 11 }
{ 5 6 7 4 9 10 11 8 }
{ 6 7 4 5 10 11 8 9 }
{ 7 4 5 6 11 8 9 10 }
Each column contains the necessary elements. The first column contains 0 - 7, the next 1 - 8, then 2 - 9, etc.
This can be produced with the following sequence of operations:
void sliding_window(const float* input, unsigned length)
{
__m256 a = _mm256_load_ps(input);
for (unsigned i = 8; i < length - 7; i += 8) {
__m256 b = _mm256_load_ps(input + i);
__m256i ai = _mm256_castps_si256(a); // not part of sequence
__m256i bi = _mm256_castps_si256(b); // just for code reduction
// a is the first vector, these are remaining 7
__m256 j1 = _mm256_castsi256_ps(_mm256_alignr_epi8(bi, ai, 4));
// Reduction operation (add, fmadd, max, etc.) between a and j1 goes here
__m256 j2 = _mm256_castsi256_ps(_mm256_alignr_epi8(bi, ai, 8));
// Reduction with j2 goes here, and so on after each value
__m256 j3 = _mm256_castsi256_ps(_mm256_alignr_epi8(bi, ai, 12));
__m256 r0 = _mm256_permute2f128_ps(a, b, 0x21);
a = b; // Register with "b" isn't needed anymore
__m256 r1 = _mm256_permute_ps(r0, 0x39);
__m256 r2 = _mm256_permute_ps(r0, 0x4e);
__m256 r3 = _mm256_permute_ps(r0, 0x93);
// Final reduction with r3 to produce result
}
// handle tail here
}
On Zen3, I benchmark this as about 10% faster than the sequence of unaligned loads.

Understanding branch prediction efficiency

I tried to measure branch prediction cost, I created a little program.
It creates a little buffer on stack, fills with random 0/1. I can set the size of the buffer with N. The code repeatedly causes branches for the same 1<<N random numbers.
Now, I've expected, that if 1<<N is sufficiently large (like >100), then the branch predictor will not be effective (as it has to predict >100 random numbers). However, these are the results (on a 5820k machine), as N grows, the program becomes slower:
N time
=========
8 2.2
9 2.2
10 2.2
11 2.2
12 2.3
13 4.6
14 9.5
15 11.6
16 12.7
20 12.9
For reference, if buffer is initialized with zeros (use the commented init), time is more-or-less constant, it varies between 1.5-1.7 for N 8..16.
My question is: can branch predictor effective for predicting such a large amount of random numbers? If not, then what's going on here?
(Some more explanation: the code executes 2^32 branches, no matter of N. So I expected, that the code runs the same speed, no matter of N, because the branch cannot be predicted at all. But it seems that if buffer size is less than 4096 (N<=12), something makes the code fast. Can branch prediction be effective for 4096 random numbers?)
Here's the code:
#include <cstdint>
#include <iostream>
volatile uint64_t init[2] = { 314159165, 27182818 };
// volatile uint64_t init[2] = { 0, 0 };
volatile uint64_t one = 1;
uint64_t next(uint64_t s[2]) {
uint64_t s1 = s[0];
uint64_t s0 = s[1];
uint64_t result = s0 + s1;
s[0] = s0;
s1 ^= s1 << 23;
s[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5);
return result;
}
int main() {
uint64_t s[2];
s[0] = init[0];
s[1] = init[1];
uint64_t sum = 0;
#if 1
const int N = 16;
unsigned char buffer[1<<N];
for (int i=0; i<1<<N; i++) buffer[i] = next(s)&1;
for (uint64_t i=0; i<uint64_t(1)<<(32-N); i++) {
for (int j=0; j<1<<N; j++) {
if (buffer[j]) {
sum += one;
}
}
}
#else
for (uint64_t i=0; i<uint64_t(1)<<32; i++) {
if (next(s)&1) {
sum += one;
}
}
#endif
std::cout<<sum<<"\n";
}
(The code contains a non-buffered version as well, use #if 0. It runs around the same speed as the buffered version with N=16)
Here's the inner loop disassembly (compiled with clang. It generates the same code for all N between 8..16, only the loop count differs. Clang unrolled the loop twice):
401270: 80 3c 0c 00 cmp BYTE PTR [rsp+rcx*1],0x0
401274: 74 07 je 40127d <main+0xad>
401276: 48 03 35 e3 2d 00 00 add rsi,QWORD PTR [rip+0x2de3] # 404060 <one>
40127d: 80 7c 0c 01 00 cmp BYTE PTR [rsp+rcx*1+0x1],0x0
401282: 74 07 je 40128b <main+0xbb>
401284: 48 03 35 d5 2d 00 00 add rsi,QWORD PTR [rip+0x2dd5] # 404060 <one>
40128b: 48 83 c1 02 add rcx,0x2
40128f: 48 81 f9 00 00 01 00 cmp rcx,0x10000
401296: 75 d8 jne 401270 <main+0xa0>
Branch prediction can be such effective. As Peter Cordes suggests, I've checked branch-misses with perf stat. Here are the results:
N time cycles branch-misses (%) approx-time
===============================================================
8 2.2 9,084,889,375 34,806 ( 0.00) 2.2
9 2.2 9,212,112,830 39,725 ( 0.00) 2.2
10 2.2 9,264,903,090 2,394,253 ( 0.06) 2.2
11 2.2 9,415,103,000 8,102,360 ( 0.19) 2.2
12 2.3 9,876,827,586 27,169,271 ( 0.63) 2.3
13 4.6 19,572,398,825 486,814,972 (11.33) 4.6
14 9.5 39,813,380,461 1,473,662,853 (34.31) 9.5
15 11.6 49,079,798,916 1,915,930,302 (44.61) 11.7
16 12.7 53,216,900,532 2,113,177,105 (49.20) 12.7
20 12.9 54,317,444,104 2,149,928,923 (50.06) 12.9
Note: branch-misses (%) is calculated for 2^32 branches
As you can see, when N<=12, branch predictor can predict most of the branches (which is surprising: the branch predictor can memorize the outcome of 4096 consecutive random branches!). When N>12, branch-misses starts to grow. At N>=16, it can only predict ~50% correctly, which means it is as effective as random coin flips.
The time taken can be approximated by looking at the time and branch-misses (%) column: I've added the last column, approx-time. I've calculated it by this: 2.2+(12.9-2.2)*branch-misses %/100. As you can see, approx-time equals to time (not considering rounding error). So this effect can be explained perfectly by branch prediction.
The original intent was to calculate how many cycles a branch-miss costs (in this particular case - as for other cases this number can differ):
(54,317,444,104-9,084,889,375)/(2,149,928,923-34,806) = 21.039 = ~21 cycles.

subtracting two 8 bits integer bit by bit in assembly x86

so I'm trying to implement this algorithm to calculate the difference of two 8 bits integers
b = 0
difference = 0
for i = 0 to (n-1)
x = bit i of X
y = bit i of Y
bit i of difference = x xor y xor b
b = ((not x) and y) or ((not x) and b) or (y and b)
end for loop
this is what i did
calculation:
mov ebx, 0
mov diff, 0
mov ecx, 7
subtract:
mov al, X
and al, 1h ; find bit i of X
mov dl, Y
and dl, 1h ; find bit i of Y
mov ah, al
mov dh, al
xor al, dl
xor al, bl
mov diff, al ; find bit i of the difference
; calculate b value for the next interation
not ah
and ah, dl
not dh
and dh, dl
and dl, bl
or ah, dh
or ah, dl
mov bl, ah
; rotate X and Y to get ready for the next iteration
rol X, 1
rol Y, 1
loop subtract
the problem with this code is its only work on the first iteration of the loop
so for example if I enter first number to be 2 and the second number to be 1
the when i go through the loop,first iteration, the x value would be 0 and the y value would be 1, the i bit of the difference would be 1 and b value calculated would be 1
, but this only work for the first iteration, on the next iteration, I had x = 0, y = 0 and b = 1(from the last calculation), so I wanted my diff to be 1 and my b value for this iteration to be 1, instead I got 0 for both of them.
why doesn't the code work, as i was following the algorithm, and implement accordingly.
thank in advance
and
Try a higher level language first to understand the algorithm, then port that to asm.
#include <stdio.h>
//b = 0
//difference = 0
//for i = 0 to (n-1)
//
// x = bit i of X
// y = bit i of Y
// bit i of difference = x xor y xor b
// b = ((not x) and y) or ((not x) and b) or (y and b)
//
//end for loop
int main ( void )
{
unsigned char X,Y,Z;
unsigned char x,y,z,b,bnext;
unsigned char i;
X=0Xf5; Y=0Xf1;
b=0;
Z=0;
for (i=1;i;i<<=1)
{
x=0;
y=0;
if(i&X) x=1;
if(i&Y) y=1;
z=((x^y)^b)&1;
if(z) Z|=i;
bnext = ((~x)&y) | ((~x)&b) | (y&b);
b=bnext&1;
}
printf("0x%02X 0x%02X\n",Z,X-Y);
return(0);
}
you might even re-write it a few times to approach real instructions.
z=((x^y)^b)&1;
becomes
z = x;
z = z ^ y;
z = z ^ b;
z = z & 1;

Find nth SET bit in an int

Instead of just the lowest set bit, I want to find the position of the nth lowest set bit. (I'm NOT talking about value on the nth bit position)
For example, say I have:
0000 1101 1000 0100 1100 1000 1010 0000
And I want to find the 4th bit that is set. Then I want it to return:
0000 0000 0000 0000 0100 0000 0000 0000
If popcnt(v) < n, it would make sense if this function returned 0, but any behavior for this case is acceptable for me.
I'm looking for something faster than a loop if possible.
Nowadays this is very easy with PDEP from the BMI2 instruction set. Here is a 64-bit version with some examples:
#include <cassert>
#include <cstdint>
#include <x86intrin.h>
inline uint64_t nthset(uint64_t x, unsigned n) {
return _pdep_u64(1ULL << n, x);
}
int main() {
assert(nthset(0b0000'1101'1000'0100'1100'1000'1010'0000, 0) ==
0b0000'0000'0000'0000'0000'0000'0010'0000);
assert(nthset(0b0000'1101'1000'0100'1100'1000'1010'0000, 1) ==
0b0000'0000'0000'0000'0000'0000'1000'0000);
assert(nthset(0b0000'1101'1000'0100'1100'1000'1010'0000, 3) ==
0b0000'0000'0000'0000'0100'0000'0000'0000);
assert(nthset(0b0000'1101'1000'0100'1100'1000'1010'0000, 9) ==
0b0000'1000'0000'0000'0000'0000'0000'0000);
assert(nthset(0b0000'1101'1000'0100'1100'1000'1010'0000, 10) ==
0b0000'0000'0000'0000'0000'0000'0000'0000);
}
If you just want the (zero-based) index of the nth set bit, add a trailing zero count.
inline unsigned nthset(uint64_t x, unsigned n) {
return _tzcnt_u64(_pdep_u64(1ULL << n, x));
}
It turns out that it is indeed possible to do this with no loops. It is fastest to precompute the (at least) 8 bit version of this problem. Of course, these tables use up cache space, but there should still be a net speedup in virtually all modern pc scenarios. In this code, n=0 returns the least set bit, n=1 is second-to-least, etc.
Solution with __popcnt
There is a solution using the __popcnt intrinsic (you need __popcnt to be extremely fast or any perf gains over a simple loop solution will be moot. Fortunately most SSE4+ era processors support it).
// lookup table for sub-problem: 8-bit v
byte PRECOMP[256][8] = { .... } // PRECOMP[v][n] for v < 256 and n < 8
ulong nthSetBit(ulong v, ulong n) {
ulong p = __popcnt(v & 0xFFFF);
ulong shift = 0;
if (p <= n) {
v >>= 16;
shift += 16;
n -= p;
}
p = __popcnt(v & 0xFF);
if (p <= n) {
shift += 8;
v >>= 8;
n -= p;
}
if (n >= 8) return 0; // optional safety, in case n > # of set bits
return PRECOMP[v & 0xFF][n] << shift;
}
This illustrates how the divide and conquer approach works.
General Solution
There is also a solution for "general" architectures- without __popcnt. It can be done by processing in 8-bit chunks. You need one more lookup table that tells you the popcnt of a byte:
byte PRECOMP[256][8] = { .... } // PRECOMP[v][n] for v<256 and n < 8
byte POPCNT[256] = { ... } // POPCNT[v] is the number of set bits in v. (v < 256)
ulong nthSetBit(ulong v, ulong n) {
ulong p = POPCNT[v & 0xFF];
ulong shift = 0;
if (p <= n) {
n -= p;
v >>= 8;
shift += 8;
p = POPCNT[v & 0xFF];
if (p <= n) {
n -= p;
shift += 8;
v >>= 8;
p = POPCNT[v & 0xFF];
if (p <= n) {
n -= p;
shift += 8;
v >>= 8;
}
}
}
if (n >= 8) return 0; // optional safety, in case n > # of set bits
return PRECOMP[v & 0xFF][n] << shift;
}
This could, of course, be done with a loop, but the unrolled form is faster and the unusual form of the loop would make it unlikely that the compiler could automatically unroll it for you.
v-1 has a zero where v has its least significant "one" bit, while all more significant bits are the same. This leads to the following function:
int ffsn(unsigned int v, int n) {
for (int i=0; i<n-1; i++) {
v &= v-1; // remove the least significant bit
}
return v & ~(v-1); // extract the least significant bit
}
The version from bit-twiddling hacks adapted to this case is, for example,
unsigned int nth_bit_set(uint32_t value, unsigned int n)
{
const uint32_t pop2 = (value & 0x55555555u) + ((value >> 1) & 0x55555555u);
const uint32_t pop4 = (pop2 & 0x33333333u) + ((pop2 >> 2) & 0x33333333u);
const uint32_t pop8 = (pop4 & 0x0f0f0f0fu) + ((pop4 >> 4) & 0x0f0f0f0fu);
const uint32_t pop16 = (pop8 & 0x00ff00ffu) + ((pop8 >> 8) & 0x00ff00ffu);
const uint32_t pop32 = (pop16 & 0x000000ffu) + ((pop16 >>16) & 0x000000ffu);
unsigned int rank = 0;
unsigned int temp;
if (n++ >= pop32)
return 32;
temp = pop16 & 0xffu;
/* if (n > temp) { n -= temp; rank += 16; } */
rank += ((temp - n) & 256) >> 4;
n -= temp & ((temp - n) >> 8);
temp = (pop8 >> rank) & 0xffu;
/* if (n > temp) { n -= temp; rank += 8; } */
rank += ((temp - n) & 256) >> 5;
n -= temp & ((temp - n) >> 8);
temp = (pop4 >> rank) & 0x0fu;
/* if (n > temp) { n -= temp; rank += 4; } */
rank += ((temp - n) & 256) >> 6;
n -= temp & ((temp - n) >> 8);
temp = (pop2 >> rank) & 0x03u;
/* if (n > temp) { n -= temp; rank += 2; } */
rank += ((temp - n) & 256) >> 7;
n -= temp & ((temp - n) >> 8);
temp = (value >> rank) & 0x01u;
/* if (n > temp) rank += 1; */
rank += ((temp - n) & 256) >> 8;
return rank;
}
which, when compiled in a separate compilation unit, on gcc-5.4.0 using -Wall -O3 -march=native -mtune=native on Intel Core i5-4200u, yields
00400a40 <nth_bit_set>:
400a40: 89 f9 mov %edi,%ecx
400a42: 89 f8 mov %edi,%eax
400a44: 55 push %rbp
400a45: 40 0f b6 f6 movzbl %sil,%esi
400a49: d1 e9 shr %ecx
400a4b: 25 55 55 55 55 and $0x55555555,%eax
400a50: 53 push %rbx
400a51: 81 e1 55 55 55 55 and $0x55555555,%ecx
400a57: 01 c1 add %eax,%ecx
400a59: 41 89 c8 mov %ecx,%r8d
400a5c: 89 c8 mov %ecx,%eax
400a5e: 41 c1 e8 02 shr $0x2,%r8d
400a62: 25 33 33 33 33 and $0x33333333,%eax
400a67: 41 81 e0 33 33 33 33 and $0x33333333,%r8d
400a6e: 41 01 c0 add %eax,%r8d
400a71: 45 89 c1 mov %r8d,%r9d
400a74: 44 89 c0 mov %r8d,%eax
400a77: 41 c1 e9 04 shr $0x4,%r9d
400a7b: 25 0f 0f 0f 0f and $0xf0f0f0f,%eax
400a80: 41 81 e1 0f 0f 0f 0f and $0xf0f0f0f,%r9d
400a87: 41 01 c1 add %eax,%r9d
400a8a: 44 89 c8 mov %r9d,%eax
400a8d: 44 89 ca mov %r9d,%edx
400a90: c1 e8 08 shr $0x8,%eax
400a93: 81 e2 ff 00 ff 00 and $0xff00ff,%edx
400a99: 25 ff 00 ff 00 and $0xff00ff,%eax
400a9e: 01 d0 add %edx,%eax
400aa0: 0f b6 d8 movzbl %al,%ebx
400aa3: c1 e8 10 shr $0x10,%eax
400aa6: 0f b6 d0 movzbl %al,%edx
400aa9: b8 20 00 00 00 mov $0x20,%eax
400aae: 01 da add %ebx,%edx
400ab0: 39 f2 cmp %esi,%edx
400ab2: 77 0c ja 400ac0 <nth_bit_set+0x80>
400ab4: 5b pop %rbx
400ab5: 5d pop %rbp
400ab6: c3 retq
400ac0: 83 c6 01 add $0x1,%esi
400ac3: 89 dd mov %ebx,%ebp
400ac5: 29 f5 sub %esi,%ebp
400ac7: 41 89 ea mov %ebp,%r10d
400aca: c1 ed 08 shr $0x8,%ebp
400acd: 41 81 e2 00 01 00 00 and $0x100,%r10d
400ad4: 21 eb and %ebp,%ebx
400ad6: 41 c1 ea 04 shr $0x4,%r10d
400ada: 29 de sub %ebx,%esi
400adc: c4 42 2b f7 c9 shrx %r10d,%r9d,%r9d
400ae1: 41 0f b6 d9 movzbl %r9b,%ebx
400ae5: 89 dd mov %ebx,%ebp
400ae7: 29 f5 sub %esi,%ebp
400ae9: 41 89 e9 mov %ebp,%r9d
400aec: 41 81 e1 00 01 00 00 and $0x100,%r9d
400af3: 41 c1 e9 05 shr $0x5,%r9d
400af7: 47 8d 14 11 lea (%r9,%r10,1),%r10d
400afb: 41 89 e9 mov %ebp,%r9d
400afe: 41 c1 e9 08 shr $0x8,%r9d
400b02: c4 42 2b f7 c0 shrx %r10d,%r8d,%r8d
400b07: 41 83 e0 0f and $0xf,%r8d
400b0b: 44 21 cb and %r9d,%ebx
400b0e: 45 89 c3 mov %r8d,%r11d
400b11: 29 de sub %ebx,%esi
400b13: 5b pop %rbx
400b14: 41 29 f3 sub %esi,%r11d
400b17: 5d pop %rbp
400b18: 44 89 da mov %r11d,%edx
400b1b: 41 c1 eb 08 shr $0x8,%r11d
400b1f: 81 e2 00 01 00 00 and $0x100,%edx
400b25: 45 21 d8 and %r11d,%r8d
400b28: c1 ea 06 shr $0x6,%edx
400b2b: 44 29 c6 sub %r8d,%esi
400b2e: 46 8d 0c 12 lea (%rdx,%r10,1),%r9d
400b32: c4 e2 33 f7 c9 shrx %r9d,%ecx,%ecx
400b37: 83 e1 03 and $0x3,%ecx
400b3a: 41 89 c8 mov %ecx,%r8d
400b3d: 41 29 f0 sub %esi,%r8d
400b40: 44 89 c0 mov %r8d,%eax
400b43: 41 c1 e8 08 shr $0x8,%r8d
400b47: 25 00 01 00 00 and $0x100,%eax
400b4c: 44 21 c1 and %r8d,%ecx
400b4f: c1 e8 07 shr $0x7,%eax
400b52: 29 ce sub %ecx,%esi
400b54: 42 8d 14 08 lea (%rax,%r9,1),%edx
400b58: c4 e2 6b f7 c7 shrx %edx,%edi,%eax
400b5d: 83 e0 01 and $0x1,%eax
400b60: 29 f0 sub %esi,%eax
400b62: 25 00 01 00 00 and $0x100,%eax
400b67: c1 e8 08 shr $0x8,%eax
400b6a: 01 d0 add %edx,%eax
400b6c: c3 retq
When compiled as a separate compilation unit, timing on this machine is difficult, because the actual operation is as fast as calling a do-nothing function (also compiled in a separate compilation unit); essentially, the calculation is done during the latencies associated with the function call.
It seems to be slightly faster than my suggestion of a binary search,
unsigned int nth_bit_set(uint32_t value, unsigned int n)
{
uint32_t mask = 0x0000FFFFu;
unsigned int size = 16u;
unsigned int base = 0u;
if (n++ >= __builtin_popcount(value))
return 32;
while (size > 0) {
const unsigned int count = __builtin_popcount(value & mask);
if (n > count) {
base += size;
size >>= 1;
mask |= mask << size;
} else {
size >>= 1;
mask >>= size;
}
}
return base;
}
where the loop is executed exactly five times, compiling to
00400ba0 <nth_bit_set>:
400ba0: 83 c6 01 add $0x1,%esi
400ba3: 31 c0 xor %eax,%eax
400ba5: b9 10 00 00 00 mov $0x10,%ecx
400baa: ba ff ff 00 00 mov $0xffff,%edx
400baf: 45 31 db xor %r11d,%r11d
400bb2: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
400bb8: 41 89 c9 mov %ecx,%r9d
400bbb: 41 89 f8 mov %edi,%r8d
400bbe: 41 d0 e9 shr %r9b
400bc1: 41 21 d0 and %edx,%r8d
400bc4: c4 62 31 f7 d2 shlx %r9d,%edx,%r10d
400bc9: f3 45 0f b8 c0 popcnt %r8d,%r8d
400bce: 41 09 d2 or %edx,%r10d
400bd1: 44 38 c6 cmp %r8b,%sil
400bd4: 41 0f 46 cb cmovbe %r11d,%ecx
400bd8: c4 e2 33 f7 d2 shrx %r9d,%edx,%edx
400bdd: 41 0f 47 d2 cmova %r10d,%edx
400be1: 01 c8 add %ecx,%eax
400be3: 44 89 c9 mov %r9d,%ecx
400be6: 45 84 c9 test %r9b,%r9b
400be9: 75 cd jne 400bb8 <nth_bit_set+0x18>
400beb: c3 retq
as in, not more than 31 cycles in 95% of calls to the binary search version, compared to not more than 28 cycles in 95% of calls to the bit-hack version; both run within 28 cycles in 50% of the cases. (The loop version takes up to 56 cycles in 95% of calls, up to 37 cycles median.)
To determine which one is better in actual real-world code, one would have to do a proper benchmark within the real-world task; at least with current x86-64 architecture processors, the work done is easily hidden in latencies incurred elsewhere (like function calls).
My answer is mostly based on this implementation of a 64bit word select method (Hint: Look only at the MARISA_USE_POPCNT, MARISA_X64, MARISA_USE_SSE3 codepaths):
It works in two steps, first selecting the byte containing the n-th set bit and then using a lookup table inside the byte:
Extract the lower and higher nibbles for every byte (bitmasks 0xF, 0xF0, shift the higher nibbles down)
Replace the nibble values by their popcount (_mm_shuffle_epi8 with A000120)
Sum the popcounts of the lower and upper nibbles (Normal SSE addition) to get byte popcounts
Compute the prefix sum over all byte popcounts (multiplication with 0x01010101...)
Propagate the position n to all bytes (SSE broadcast or again multiplication with 0x01010101...)
Do a bytewise comparison (_mm_cmpgt_epi8 leaves 0xFF in every byte smaller than n)
Compute the byte offset by doing a popcount on the result
Now we know which byte contains the bit and a simple byte lookup table like in grek40's answer suffices to get the result.
Note however that I have not really benchmarked this result against other implementations, only that I have seen it to be quite efficient (and branchless)
I cant see a method without a loop, what springs to mind would be;
int set = 0;
int pos = 0;
while(set < n) {
if((bits & 0x01) == 1) set++;
bits = bits >> 1;
pos++;
}
after which, pos would hold the position of the nth lowest-value set bit.
The only other thing that I can think of would be a divide and conquer approach, which might yield O(log(n)) rather than O(n)...but probably not.
Edit: you said any behaviour, so non-termination is ok, right? :P
def bitN (l: Long, i: Int) : Long = {
def bitI (l: Long, i: Int) : Long =
if (i == 0) 1L else
2 * {
if (l % 2 == 0) bitI (l / 2, i) else bitI (l /2, i-1)
}
bitI (l, i) / 2
}
A recursive method (in scala). Decrement i, the position, if a modulo2 is 1. While returning, multiply by 2. Since the multiplication is invoced as last operation, it is not tail recursive, but since Longs are of known size in advance, the maximum stack is not too big.
scala> n.toBinaryString.replaceAll ("(.{8})", "$1 ")
res117: java.lang.String = 10110011 11101110 01011110 01111110 00111101 11100101 11101011 011000
scala> bitN (n, 40) .toBinaryString.replaceAll ("(.{8})", "$1 ")
res118: java.lang.String = 10000000 00000000 00000000 00000000 00000000 00000000 00000000 000000
Edit
After giving it some thought and using the __builtin_popcount function, I figured it might be better to decide on the relevant byte and then compute the whole result instead of incrementally adding/subtracting numbers. Here is an updated version:
int GetBitAtPosition(unsigned i, unsigned n)
{
unsigned bitCount;
bitCount = __builtin_popcount(i & 0x00ffffff);
if (bitCount <= n)
{
return (24 + LUT_BitPosition[i >> 24][n - bitCount]);
}
bitCount = __builtin_popcount(i & 0x0000ffff);
if (bitCount <= n)
{
return (16 + LUT_BitPosition[(i >> 16) & 0xff][n - bitCount]);
}
bitCount = __builtin_popcount(i & 0x000000ff);
if (bitCount <= n)
{
return (8 + LUT_BitPosition[(i >> 8) & 0xff][n - bitCount]);
}
return LUT_BitPosition[i & 0xff][n];
}
I felt like creating a LUT based solution where the number is inspected in byte-chunks, however, the LUT for the n-th bit position grew quite large (256*8) and the LUT-free version that was discussed in the comments might be better.
Generally the algorithm would look like this:
unsigned i = 0x000006B5;
unsigned n = 4;
unsigned result = 0;
unsigned bitCount;
while (i)
{
bitCount = LUT_BitCount[i & 0xff];
if (n < bitCount)
{
result += LUT_BitPosition[i & 0xff][n];
break; // found
}
else
{
n -= bitCount;
result += 8;
i >>= 8;
}
}
Might be worth to unroll the loop into its up to 4 iterations to get the best performance on 32 bit numbers.
The LUT for bitcount (could be replaced by __builtin_popcount):
unsigned LUT_BitCount[] = {
0, 1, 1, 2, 1, 2, 2, 3, // 0-7
1, 2, 2, 3, 2, 3, 3, 4, // 8-15
1, 2, 2, 3, 2, 3, 3, 4, // 16-23
2, 3, 3, 4, 3, 4, 4, 5, // 24-31
1, 2, 2, 3, 2, 3, 3, 4, // 32-39
2, 3, 3, 4, 3, 4, 4, 5, // 40-47
2, 3, 3, 4, 3, 4, 4, 5, // 48-55
3, 4, 4, 5, 4, 5, 5, 6, // 56-63
1, 2, 2, 3, 2, 3, 3, 4, // 64-71
2, 3, 3, 4, 3, 4, 4, 5, // 72-79
2, 3, 3, 4, 3, 4, 4, 5, // 80-87
3, 4, 4, 5, 4, 5, 5, 6, // 88-95
2, 3, 3, 4, 3, 4, 4, 5, // 96-103
3, 4, 4, 5, 4, 5, 5, 6, // 104-111
3, 4, 4, 5, 4, 5, 5, 6, // 112-119
4, 5, 5, 6, 5, 6, 6, 7, // 120-127
1, 2, 2, 3, 2, 3, 3, 4, // 128
2, 3, 3, 4, 3, 4, 4, 5, // 136
2, 3, 3, 4, 3, 4, 4, 5, // 144
3, 4, 4, 5, 4, 5, 5, 6, // 152
2, 3, 3, 4, 3, 4, 4, 5, // 160
3, 4, 4, 5, 4, 5, 5, 6, // 168
3, 4, 4, 5, 4, 5, 5, 6, // 176
4, 5, 5, 6, 5, 6, 6, 7, // 184
2, 3, 3, 4, 3, 4, 4, 5, // 192
3, 4, 4, 5, 4, 5, 5, 6, // 200
3, 4, 4, 5, 4, 5, 5, 6, // 208
4, 5, 5, 6, 5, 6, 6, 7, // 216
3, 4, 4, 5, 4, 5, 5, 6, // 224
4, 5, 5, 6, 5, 6, 6, 7, // 232
4, 5, 5, 6, 5, 6, 6, 7, // 240
5, 6, 6, 7, 6, 7, 7, 8, // 248-255
};
The LUT for bit position within a byte:
unsigned LUT_BitPosition[][8] = {
// 0-7
{UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
// 8-15
{3,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,3,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,3,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,3,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,3,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,3,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,3,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
// 16-31
{4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{3,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,3,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,3,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,3,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,3,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,3,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,3,4,UINT_MAX,UINT_MAX,UINT_MAX},
// 32-63
{5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{3,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,3,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,3,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,3,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,3,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,3,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,3,5,UINT_MAX,UINT_MAX,UINT_MAX},
{4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,4,5,UINT_MAX,UINT_MAX,UINT_MAX},
{3,4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,3,4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,3,4,5,UINT_MAX,UINT_MAX,UINT_MAX},
{2,3,4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,3,4,5,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,3,4,5,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,3,4,5,UINT_MAX,UINT_MAX},
// 64-127
{6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{3,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,3,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,3,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,3,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,3,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,3,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,3,6,UINT_MAX,UINT_MAX,UINT_MAX},
{4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,4,6,UINT_MAX,UINT_MAX,UINT_MAX},
{3,4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,3,4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,3,4,6,UINT_MAX,UINT_MAX,UINT_MAX},
{2,3,4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,3,4,6,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,3,4,6,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,3,4,6,UINT_MAX,UINT_MAX},
{5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,5,6,UINT_MAX,UINT_MAX,UINT_MAX},
{3,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,3,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,3,5,6,UINT_MAX,UINT_MAX,UINT_MAX},
{2,3,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,3,5,6,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,3,5,6,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,3,5,6,UINT_MAX,UINT_MAX},
{4,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,4,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,4,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,4,5,6,UINT_MAX,UINT_MAX,UINT_MAX},
{2,4,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,4,5,6,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,4,5,6,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,4,5,6,UINT_MAX,UINT_MAX},
{3,4,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,4,5,6,UINT_MAX,UINT_MAX,UINT_MAX},
{1,3,4,5,6,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,3,4,5,6,UINT_MAX,UINT_MAX},
{2,3,4,5,6,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,3,4,5,6,UINT_MAX,UINT_MAX},
{1,2,3,4,5,6,UINT_MAX,UINT_MAX},
{0,1,2,3,4,5,6,UINT_MAX},
// 128-255
{7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{3,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,3,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,3,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,3,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,3,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,3,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,3,7,UINT_MAX,UINT_MAX,UINT_MAX},
{4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,4,7,UINT_MAX,UINT_MAX,UINT_MAX},
{3,4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,3,4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,3,4,7,UINT_MAX,UINT_MAX,UINT_MAX},
{2,3,4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,3,4,7,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,3,4,7,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,3,4,7,UINT_MAX,UINT_MAX},
{5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,5,7,UINT_MAX,UINT_MAX,UINT_MAX},
{3,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,3,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,3,5,7,UINT_MAX,UINT_MAX,UINT_MAX},
{2,3,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,3,5,7,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,3,5,7,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,3,5,7,UINT_MAX,UINT_MAX},
{4,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,4,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,4,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,4,5,7,UINT_MAX,UINT_MAX,UINT_MAX},
{2,4,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,4,5,7,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,4,5,7,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,4,5,7,UINT_MAX,UINT_MAX},
{3,4,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,4,5,7,UINT_MAX,UINT_MAX,UINT_MAX},
{1,3,4,5,7,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,3,4,5,7,UINT_MAX,UINT_MAX},
{2,3,4,5,7,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,3,4,5,7,UINT_MAX,UINT_MAX},
{1,2,3,4,5,7,UINT_MAX,UINT_MAX},
{0,1,2,3,4,5,7,UINT_MAX},
{6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{3,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,3,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,3,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{2,3,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,3,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,3,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,3,6,7,UINT_MAX,UINT_MAX},
{4,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,4,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,4,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,4,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{2,4,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,4,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,4,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,4,6,7,UINT_MAX,UINT_MAX},
{3,4,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,4,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{1,3,4,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,3,4,6,7,UINT_MAX,UINT_MAX},
{2,3,4,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,3,4,6,7,UINT_MAX,UINT_MAX},
{1,2,3,4,6,7,UINT_MAX,UINT_MAX},
{0,1,2,3,4,6,7,UINT_MAX},
{5,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{2,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,5,6,7,UINT_MAX,UINT_MAX},
{3,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{1,3,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,3,5,6,7,UINT_MAX,UINT_MAX},
{2,3,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,3,5,6,7,UINT_MAX,UINT_MAX},
{1,2,3,5,6,7,UINT_MAX,UINT_MAX},
{0,1,2,3,5,6,7,UINT_MAX},
{4,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,4,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{1,4,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,4,5,6,7,UINT_MAX,UINT_MAX},
{2,4,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,4,5,6,7,UINT_MAX,UINT_MAX},
{1,2,4,5,6,7,UINT_MAX,UINT_MAX},
{0,1,2,4,5,6,7,UINT_MAX},
{3,4,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,4,5,6,7,UINT_MAX,UINT_MAX},
{1,3,4,5,6,7,UINT_MAX,UINT_MAX},
{0,1,3,4,5,6,7,UINT_MAX},
{2,3,4,5,6,7,UINT_MAX,UINT_MAX},
{0,2,3,4,5,6,7,UINT_MAX},
{1,2,3,4,5,6,7,UINT_MAX},
{0,1,2,3,4,5,6,7},
};
My approach is to calculate the population count for each 8-bit quarters of the 32-bit integer in parallel, then find which quarter contains the nth bit. The population count of quarters that are lower than the found one can be summarized as the initial value of later calculation.
After that count set bits one-by-one until the n is reached. Without branches and using an incomplete implementation of population count algorithm, my example is the following:
#include <stdio.h>
#include <stdint.h>
int main() {
uint32_t n = 10, test = 3124375902u; /* 10111010001110100011000101011110 */
uint32_t index, popcnt, quarter = 0, q_popcnt;
/* count set bits of each quarter of 32-bit integer in parallel */
q_popcnt = test - ((test >> 1) & 0x55555555);
q_popcnt = (q_popcnt & 0x33333333) + ((q_popcnt >> 2) & 0x33333333);
q_popcnt = (q_popcnt + (q_popcnt >> 4)) & 0x0F0F0F0F;
popcnt = q_popcnt;
/* find which quarters can be summarized and summarize them */
quarter += (n + 1 >= (q_popcnt & 0xff));
quarter += (n + 1 >= ((q_popcnt += q_popcnt >> 8) & 0xff));
quarter += (n + 1 >= ((q_popcnt += q_popcnt >> 16) & 0xff));
quarter += (n + 1 >= ((q_popcnt += q_popcnt >> 24) & 0xff));
popcnt &= (UINT32_MAX >> (8 * quarter));
popcnt = (popcnt * 0x01010101) >> 24;
/* find the index of nth bit in quarter where it should be */
index = 8 * quarter;
index += ((popcnt += (test >> index) & 1) <= n);
index += ((popcnt += (test >> index) & 1) <= n);
index += ((popcnt += (test >> index) & 1) <= n);
index += ((popcnt += (test >> index) & 1) <= n);
index += ((popcnt += (test >> index) & 1) <= n);
index += ((popcnt += (test >> index) & 1) <= n);
index += ((popcnt += (test >> index) & 1) <= n);
index += ((popcnt += (test >> index) & 1) <= n);
printf("index = %u\n", index);
return 0;
}
A simple approach which uses loops and conditionals can be the following as well:
#include <stdio.h>
#include <stdint.h>
int main() {
uint32_t n = 11, test = 3124375902u; /* 10111010001110100011000101011110 */
uint32_t popcnt = 0, index = 0;
while(popcnt += ((test >> index) & 1), popcnt <= n && ++index < 32);
printf("index = %u\n", index);
return 0;
}
I know the question asks for something faster than a loop, but a complicated loop-less answer is likely to take longer than a quick loop.
If the computer has 32 bit ints and v is a random value then it might have for example 16 ones and if we are looking for a random place among the 16 ones, we might typically be looking for the 8th one. 7 or 8 times round a loop with just a couple of statements isn't too bad.
int findNthBit(unsigned int n, int v)
{
int next;
if (n > __builtin_popcount(v)) return 0;
while (next = v&v-1, --n)
{
v = next;
}
return v ^ next;
}
The loop works by removing the lowest set bit (n-1) times.
The n'th one bit that would be removed is the one bit we were looking for.
If anybody wants to test this ....
#include "stdio.h"
#include "assert.h"
// function here
int main() {
assert(findNthBit(1, 0)==0);
assert(findNthBit(1, 0xf0f)==1<<0);
assert(findNthBit(2, 0xf0f)==1<<1);
assert(findNthBit(3, 0xf0f)==1<<2);
assert(findNthBit(4, 0xf0f)==1<<3);
assert(findNthBit(5, 0xf0f)==1<<8);
assert(findNthBit(6, 0xf0f)==1<<9);
assert(findNthBit(7, 0xf0f)==1<<10);
assert(findNthBit(8, 0xf0f)==1<<11);
assert(findNthBit(9, 0xf0f)==0);
printf("looks good\n");
}
If there are concerns about the number of times the loop is executed, for example if the function is regularly called with large values of n, its simple to add an extra line or two of the following form
if (n > 8) return findNthBit(n-__builtin_popcount(v&0xff), v>>8) << 8;
or
if (n > 12) return findNthBit(n - __builtin_popcount(v&0xfff), v>>12) << 12;
The idea here is that the n'th one will never be located in the bottom n-1 bits. A better version clears not only the bottom 8 or 12 bits, but all the bottom (n-1) bits when n is large-ish and we don't want to loop that many times.
if (n > 7) return findNthBit(n - __builtin_popcount(v & ((1<<(n-1))-1)), v>>(n-1)) << (n-1);
I tested this with findNthBit(20, 0xaf5faf5f) and after clearing out the bottom 19 bits because the answer wasn't to be found there, it looked for the 5th bit in the remaining bits by looping 4 times to remove 4 ones.
So an improved version is
int findNthBit(unsigned int n, int v)
{
int next;
if (n > __builtin_popcount(v)) return 0;
if (n > 7) return findNthBit(n - __builtin_popcount(v & ((1<<(n-1))-1)), v>>(n-1)) << (n-1);
while (next = v&v-1, --n)
{
v = next;
}
return v ^ next;
}
The value 7, limiting looping is chosen fairly arbitrarily as a compromise between limiting looping and limiting recursion. The function could be further improved by removing recursion and keeping track of a shift amount instead. I may try this if I get some peace from home schooling my daughter!
Here is a final version with the recursion removed by keeping track of the number of low order bits shifted out from the bottom of the bits being searched.
Final version
int findNthBit(unsigned int n, int v)
{
int shifted = 0; // running total
int nBits; // value for this iteration
// handle no solution
if (n > __builtin_popcount(v)) return 0;
while (n > 7)
{
// for large n shift out lower n-1 bits from v.
nBits = n-1;
n -= __builtin_popcount(v & ((1<<nBits)-1));
v >>= nBits;
shifted += nBits;
}
int next;
// n is now small, clear out n-1 bits and return the next bit
// v&(v-1): a well known software trick to remove the lowest set bit.
while (next = v&(v-1), --n)
{
v = next;
}
return (v ^ next) << shifted;
}
Building on the answer given by Jukka Suomela, which uses a machine-specific instruction that may not necessarily be available, it is also possible to write a function that does exactly the same thing as _pdep_u64 without any machine dependencies. It must loop over the set bits in one of the arguments, but can still be described as a constexpr function for C++11.
constexpr inline uint64_t deposit_bits(uint64_t x, uint64_t mask, uint64_t b, uint64_t res) {
return mask != 0 ? deposit_bits(x, mask & (mask - 1), b << 1, ((x & b) ? (res | (mask & (-mask))) : res)) : res;
}
constexpr inline uint64_t nthset(uint64_t x, unsigned n) {
return deposit_bits(1ULL << n, x, 1, 0);
}
Based on a method by Juha Järvi published in the famous Bit Twiddling Hacks, I tested this implementation where n and i are used as in the question:
a = i - (i >> 1 & 0x55555555);
b = (a & 0x33333333) + (a >> 2 & 0x33333333);
c = b + (b >> 4) & 0x0f0f0f0f;
r = n + 1;
s = 0;
t = c + (c >> 8) & 0xff;
if (r > t) {
s += 16;
r -= t;
}
t = c >> s & 0xf;
if (r > t) {
s += 8;
r -= t;
}
t = b >> s & 0x7;
if (r > t) {
s += 4;
r -= t;
}
t = a >> s & 0x3;
if (r > t) {
s += 2;
r -= t;
}
t = i >> s & 0x1;
if (r > t)
s++;
return (s);
Based on my own tests, this is about as fast as the loop on x86, whereas it is 20% faster on arm64 and probably a lot faster on arm due to the fast conditional instructions, but I can't test this right now.
PDEP solution is great, but some languages like Java do not contain this intrinsic yet, however, are efficient in the other low-level operations. So I came up with the following fall back for such cases: a branchless binary search.
// n must be using 0-based indexing.
// This method produces correct results only if n is smaller
// than the number of set bits.
public static int getNthSetBit(long mask64, int n) {
// Binary search without branching
int base = 0;
final int low32 = (int) mask64;
final int high32n = n - Integer.bitCount(low32);
final int inLow32 = high32n >>> 31;
final int inHigh32 = inLow32 ^ 1;
final int shift32 = inHigh32 << 5;
final int mask32 = (int) (mask64 >>> shift32);
n = ((-inLow32) & n) | ((-inHigh32) & high32n);
base += shift32;
final int low16 = mask32 & 0xffff;
final int high16n = n - Integer.bitCount(low16);
final int inLow16 = high16n >>> 31;
final int inHigh16 = inLow16 ^ 1;
final int shift16 = inHigh16 << 4;
final int mask16 = (mask32 >>> shift16) & 0xffff;
n = ((-inLow16) & n) | ((-inHigh16) & high16n);
base += shift16;
final int low8 = mask16 & 0xff;
final int high8n = n - Integer.bitCount(low8);
final int inLow8 = high8n >>> 31;
final int inHigh8 = inLow8 ^ 1;
final int shift8 = inHigh8 << 3;
final int mask8 = (mask16 >>> shift8) & 0xff;
n = ((-inLow8) & n) | ((-inHigh8) & high8n);
base += shift8;
final int low4 = mask8 & 0xf;
final int high4n = n - Integer.bitCount(low4);
final int inLow4 = high4n >>> 31;
final int inHigh4 = inLow4 ^ 1;
final int shift4 = inHigh4 << 2;
final int mask4 = (mask8 >>> shift4) & 0xf;
n = ((-inLow4) & n) | ((-inHigh4) & high4n);
base += shift4;
final int low2 = mask4 & 3;
final int high2n = n - (low2 >> 1) - (low2 & 1);
final int inLow2 = high2n >>> 31;
final int inHigh2 = inLow2 ^ 1;
final int shift2 = inHigh2 << 1;
final int mask2 = (mask4 >>> shift2) & 3;
n = ((-inLow2) & n) | ((-inHigh2) & high2n);
base += shift2;
// For the 2 bits remaining, we can take a shortcut
return base + (n | ((mask2 ^ 1) & 1));
}

Sort points in clockwise order?

Given an array of x,y points, how do I sort the points of this array in clockwise order (around their overall average center point)? My goal is to pass the points to a line-creation function to end up with something looking rather "solid", as convex as possible with no lines intersecting.
For what it's worth, I'm using Lua, but any pseudocode would be appreciated.
Update: For reference, this is the Lua code based on Ciamej's excellent answer (ignore my "app" prefix):
function appSortPointsClockwise(points)
local centerPoint = appGetCenterPointOfPoints(points)
app.pointsCenterPoint = centerPoint
table.sort(points, appGetIsLess)
return points
end
function appGetIsLess(a, b)
local center = app.pointsCenterPoint
if a.x >= 0 and b.x < 0 then return true
elseif a.x == 0 and b.x == 0 then return a.y > b.y
end
local det = (a.x - center.x) * (b.y - center.y) - (b.x - center.x) * (a.y - center.y)
if det < 0 then return true
elseif det > 0 then return false
end
local d1 = (a.x - center.x) * (a.x - center.x) + (a.y - center.y) * (a.y - center.y)
local d2 = (b.x - center.x) * (b.x - center.x) + (b.y - center.y) * (b.y - center.y)
return d1 > d2
end
function appGetCenterPointOfPoints(points)
local pointsSum = {x = 0, y = 0}
for i = 1, #points do pointsSum.x = pointsSum.x + points[i].x; pointsSum.y = pointsSum.y + points[i].y end
return {x = pointsSum.x / #points, y = pointsSum.y / #points}
end
First, compute the center point.
Then sort the points using whatever sorting algorithm you like, but use special comparison routine to determine whether one point is less than the other.
You can check whether one point (a) is to the left or to the right of the other (b) in relation to the center by this simple calculation:
det = (a.x - center.x) * (b.y - center.y) - (b.x - center.x) * (a.y - center.y)
if the result is zero, then they are on the same line from the center, if it's positive or negative, then it is on one side or the other, so one point will precede the other.
Using it you can construct a less-than relation to compare points and determine the order in which they should appear in the sorted array. But you have to define where is the beginning of that order, I mean what angle will be the starting one (e.g. the positive half of x-axis).
The code for the comparison function can look like this:
bool less(point a, point b)
{
if (a.x - center.x >= 0 && b.x - center.x < 0)
return true;
if (a.x - center.x < 0 && b.x - center.x >= 0)
return false;
if (a.x - center.x == 0 && b.x - center.x == 0) {
if (a.y - center.y >= 0 || b.y - center.y >= 0)
return a.y > b.y;
return b.y > a.y;
}
// compute the cross product of vectors (center -> a) x (center -> b)
int det = (a.x - center.x) * (b.y - center.y) - (b.x - center.x) * (a.y - center.y);
if (det < 0)
return true;
if (det > 0)
return false;
// points a and b are on the same line from the center
// check which point is closer to the center
int d1 = (a.x - center.x) * (a.x - center.x) + (a.y - center.y) * (a.y - center.y);
int d2 = (b.x - center.x) * (b.x - center.x) + (b.y - center.y) * (b.y - center.y);
return d1 > d2;
}
This will order the points clockwise starting from the 12 o'clock. Points on the same "hour" will be ordered starting from the ones that are further from the center.
If using integer types (which are not really present in Lua) you'd have to assure that det, d1 and d2 variables are of a type that will be able to hold the result of performed calculations.
If you want to achieve something looking solid, as convex as possible, then I guess you're looking for a Convex Hull. You can compute it using the Graham Scan.
In this algorithm, you also have to sort the points clockwise (or counter-clockwise) starting from a special pivot point. Then you repeat simple loop steps each time checking if you turn left or right adding new points to the convex hull, this check is based on a cross product just like in the above comparison function.
Edit:
Added one more if statement if (a.y - center.y >= 0 || b.y - center.y >=0) to make sure that points that have x=0 and negative y are sorted starting from the ones that are further from the center. If you don't care about the order of points on the same 'hour' you can omit this if statement and always return a.y > b.y.
Corrected the first if statements with adding -center.x and -center.y.
Added the second if statement (a.x - center.x < 0 && b.x - center.x >= 0). It was an obvious oversight that it was missing. The if statements could be reorganized now because some checks are redundant. For example, if the first condition in the first if statement is false, then the first condition of the second if must be true. I decided, however, to leave the code as it is for the sake of simplicity. It's quite possible that the compiler will optimize the code and produce the same result anyway.
What you're asking for is a system known as polar coordinates. Conversion from Cartesian to polar coordinates is easily done in any language. The formulas can be found in this section.
After converting to polar coordinates, just sort by the angle, theta.
An interesting alternative approach to your problem would be to find the approximate minimum to the Traveling Salesman Problem (TSP), ie. the shortest route linking all your points. If your points form a convex shape, it should be the right solution, otherwise, it should still look good (a "solid" shape can be defined as one that has a low perimeter/area ratio, which is what we are optimizing here).
You can use any implementation of an optimizer for the TSP, of which I am pretty sure you can find a ton in your language of choice.
Another version (return true if a comes before b in counterclockwise direction):
bool lessCcw(const Vector2D &center, const Vector2D &a, const Vector2D &b) const
{
// Computes the quadrant for a and b (0-3):
// ^
// 1 | 0
// ---+-->
// 2 | 3
const int dax = ((a.x() - center.x()) > 0) ? 1 : 0;
const int day = ((a.y() - center.y()) > 0) ? 1 : 0;
const int qa = (1 - dax) + (1 - day) + ((dax & (1 - day)) << 1);
/* The previous computes the following:
const int qa =
( (a.x() > center.x())
? ((a.y() > center.y())
? 0 : 3)
: ((a.y() > center.y())
? 1 : 2)); */
const int dbx = ((b.x() - center.x()) > 0) ? 1 : 0;
const int dby = ((b.y() - center.y()) > 0) ? 1 : 0;
const int qb = (1 - dbx) + (1 - dby) + ((dbx & (1 - dby)) << 1);
if (qa == qb) {
return (b.x() - center.x()) * (a.y() - center.y()) < (b.y() - center.y()) * (a.x() - center.x());
} else {
return qa < qb;
}
}
This is faster, because the compiler (tested on Visual C++ 2015) doesn't generate jump to compute dax, day, dbx, dby. Here the output assembly from the compiler:
; 28 : const int dax = ((a.x() - center.x()) > 0) ? 1 : 0;
vmovss xmm2, DWORD PTR [ecx]
vmovss xmm0, DWORD PTR [edx]
; 29 : const int day = ((a.y() - center.y()) > 0) ? 1 : 0;
vmovss xmm1, DWORD PTR [ecx+4]
vsubss xmm4, xmm0, xmm2
vmovss xmm0, DWORD PTR [edx+4]
push ebx
xor ebx, ebx
vxorps xmm3, xmm3, xmm3
vcomiss xmm4, xmm3
vsubss xmm5, xmm0, xmm1
seta bl
xor ecx, ecx
vcomiss xmm5, xmm3
push esi
seta cl
; 30 : const int qa = (1 - dax) + (1 - day) + ((dax & (1 - day)) << 1);
mov esi, 2
push edi
mov edi, esi
; 31 :
; 32 : /* The previous computes the following:
; 33 :
; 34 : const int qa =
; 35 : ( (a.x() > center.x())
; 36 : ? ((a.y() > center.y()) ? 0 : 3)
; 37 : : ((a.y() > center.y()) ? 1 : 2));
; 38 : */
; 39 :
; 40 : const int dbx = ((b.x() - center.x()) > 0) ? 1 : 0;
xor edx, edx
lea eax, DWORD PTR [ecx+ecx]
sub edi, eax
lea eax, DWORD PTR [ebx+ebx]
and edi, eax
mov eax, DWORD PTR _b$[esp+8]
sub edi, ecx
sub edi, ebx
add edi, esi
vmovss xmm0, DWORD PTR [eax]
vsubss xmm2, xmm0, xmm2
; 41 : const int dby = ((b.y() - center.y()) > 0) ? 1 : 0;
vmovss xmm0, DWORD PTR [eax+4]
vcomiss xmm2, xmm3
vsubss xmm0, xmm0, xmm1
seta dl
xor ecx, ecx
vcomiss xmm0, xmm3
seta cl
; 42 : const int qb = (1 - dbx) + (1 - dby) + ((dbx & (1 - dby)) << 1);
lea eax, DWORD PTR [ecx+ecx]
sub esi, eax
lea eax, DWORD PTR [edx+edx]
and esi, eax
sub esi, ecx
sub esi, edx
add esi, 2
; 43 :
; 44 : if (qa == qb) {
cmp edi, esi
jne SHORT $LN37#lessCcw
; 45 : return (b.x() - center.x()) * (a.y() - center.y()) < (b.y() - center.y()) * (a.x() - center.x());
vmulss xmm1, xmm2, xmm5
vmulss xmm0, xmm0, xmm4
xor eax, eax
pop edi
vcomiss xmm0, xmm1
pop esi
seta al
pop ebx
; 46 : } else {
; 47 : return qa < qb;
; 48 : }
; 49 : }
ret 0
$LN37#lessCcw:
pop edi
pop esi
setl al
pop ebx
ret 0
?lessCcw##YA_NABVVector2D##00#Z ENDP ; lessCcw
Enjoy.
vector3 a = new vector3(1 , 0 , 0)..............w.r.t X_axis
vector3 b = any_point - Center;
- y = |a * b| , x = a . b
- Atan2(y , x)...............................gives angle between -PI to + PI in radians
- (Input % 360 + 360) % 360................to convert it from 0 to 2PI in radians
- sort by adding_points to list_of_polygon_verts by angle we got 0 to 360
Finally you get Anticlockwize sorted verts
list.Reverse()..................Clockwise_order
I know this is somewhat of an old post with an excellent accepted answer, but I feel like I can still contribute something useful. All the answers so far essentially use a comparison function to compare two points and determine their order, but what if you want to use only one point at a time and a key function?
Not only is this possible, but the resulting code is also extremely compact. Here is the complete solution using Python's built-in sorted function:
# Create some random points
num = 7
points = np.random.random((num, 2))
# Compute their center
center = np.mean(points, axis=0)
# Make arctan2 function that returns a value from [0, 2 pi) instead of [-pi, pi)
arctan2 = lambda s, c: angle if (angle := np.arctan2(s, c)) >= 0 else 2 * np.pi + angle
# Define the key function
def clockwise_around_center(point):
diff = point - center
rcos = np.dot(diff, center)
rsin = np.cross(diff, center)
return arctan2(rsin, rcos)
# Sort our points using the key function
sorted_points = sorted(points, key=clockwise_around_center)
This answer would also work in 3D, if the points are on a 2D plane embedded in 3D. We would only have to modify the calculation of rsin by dotting it with the normal vector of the plane. E.g.
rsin = np.dot([0,0,1], np.cross(diff, center))
if that plane has e_z as its normal vector.
The advantage of this code is that it works on only one point at the time using a key function. The quantity rsin, if you work it out on a coefficient level, is exactly the same as what is called det in the accepter answer, except that I compute it between point - center and center, not between point1 - center and point2 - center. But the geometrical meaning of this quantity is the radius times the sin of the angle, hence I call this variable rsin. Similarly for the dot product, which is the radius times the cosine of the angle and hence called rcos.
One could argue that this solution uses arctan2, and is therefore less clean. However, I personally think that the clearity of using a key function outweighs the need for one call to a trig function. Note that I prefer to have arctan2 return a value from [0, 2 pi), because then we get the angle 0 when point happens to be identical to center, and thus it will be the first point in our sorted list. This is an optional choice.
In order to understand why this code works, the crucial insight is that all our points are defined as arrows with respect to the origin, including the center point itself. So if we calculate point - center, this is equivalent to placing the arrow from the tip of center to the tip of point, at the origin. Hence we can sort the arrow point - center with respect to the angle it makes with the arrow pointing to center.
Here's a way to sort the vertices of a rectangle in clock-wise order. I modified the original solution provided by pyimagesearch and got rid of the scipy dependency.
import numpy as np
def pointwise_distance(pts1, pts2):
"""Calculates the distance between pairs of points
Args:
pts1 (np.ndarray): array of form [[x1, y1], [x2, y2], ...]
pts2 (np.ndarray): array of form [[x1, y1], [x2, y2], ...]
Returns:
np.array: distances between corresponding points
"""
dist = np.sqrt(np.sum((pts1 - pts2)**2, axis=1))
return dist
def order_points(pts):
"""Orders points in form [top left, top right, bottom right, bottom left].
Source: https://www.pyimagesearch.com/2016/03/21/ordering-coordinates-clockwise-with-python-and-opencv/
Args:
pts (np.ndarray): list of points of form [[x1, y1], [x2, y2], [x3, y3], [x4, y4]]
Returns:
[type]: [description]
"""
# sort the points based on their x-coordinates
x_sorted = pts[np.argsort(pts[:, 0]), :]
# grab the left-most and right-most points from the sorted
# x-roodinate points
left_most = x_sorted[:2, :]
right_most = x_sorted[2:, :]
# now, sort the left-most coordinates according to their
# y-coordinates so we can grab the top-left and bottom-left
# points, respectively
left_most = left_most[np.argsort(left_most[:, 1]), :]
tl, bl = left_most
# now that we have the top-left coordinate, use it as an
# anchor to calculate the Euclidean distance between the
# top-left and right-most points; by the Pythagorean
# theorem, the point with the largest distance will be
# our bottom-right point. Note: this is a valid assumption because
# we are dealing with rectangles only.
# We need to use this instead of just using min/max to handle the case where
# there are points that have the same x or y value.
D = pointwise_distance(np.vstack([tl, tl]), right_most)
br, tr = right_most[np.argsort(D)[::-1], :]
# return the coordinates in top-left, top-right,
# bottom-right, and bottom-left order
return np.array([tl, tr, br, bl], dtype="float32")
With numpy:
import matplotlib.pyplot as plt
import numpy as np
# List of coords
coords = np.array([7,7, 5, 0, 0, 0, 5, 10, 10, 0, 0, 5, 10, 5, 0, 10, 10, 10]).reshape(-1, 2)
centroid = np.mean(coords, axis=0)
sorted_coords = coords[np.argsort(np.arctan2(coords[:, 1] - centroid[1], coords[:, 0] - centroid[0])), :]
plt.scatter(coords[:,0],coords[:,1])
plt.plot(coords[:,0],coords[:,1])
plt.plot(sorted_coords[:,0],sorted_coords[:,1])
plt.show()

Resources