Find nth SET bit in an int - algorithm

Instead of just the lowest set bit, I want to find the position of the nth lowest set bit. (I'm NOT talking about the value at the nth bit position.)
For example, say I have:
0000 1101 1000 0100 1100 1000 1010 0000
And I want to find the 4th bit that is set. Then I want it to return:
0000 0000 0000 0000 0100 0000 0000 0000
If popcnt(v) < n, it would make sense if this function returned 0, but any behavior for this case is acceptable for me.
I'm looking for something faster than a loop if possible.

Nowadays this is very easy with PDEP from the BMI2 instruction set. Here is a 64-bit version with some examples:
#include <cassert>
#include <cstdint>
#include <x86intrin.h>
inline uint64_t nthset(uint64_t x, unsigned n) {
return _pdep_u64(1ULL << n, x);
}
int main() {
assert(nthset(0b0000'1101'1000'0100'1100'1000'1010'0000, 0) ==
0b0000'0000'0000'0000'0000'0000'0010'0000);
assert(nthset(0b0000'1101'1000'0100'1100'1000'1010'0000, 1) ==
0b0000'0000'0000'0000'0000'0000'1000'0000);
assert(nthset(0b0000'1101'1000'0100'1100'1000'1010'0000, 3) ==
0b0000'0000'0000'0000'0100'0000'0000'0000);
assert(nthset(0b0000'1101'1000'0100'1100'1000'1010'0000, 9) ==
0b0000'1000'0000'0000'0000'0000'0000'0000);
assert(nthset(0b0000'1101'1000'0100'1100'1000'1010'0000, 10) ==
0b0000'0000'0000'0000'0000'0000'0000'0000);
}
If you just want the (zero-based) index of the nth set bit, add a trailing zero count.
inline unsigned nthset(uint64_t x, unsigned n) {
return _tzcnt_u64(_pdep_u64(1ULL << n, x));
}
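Note that _pdep_u64 requires BMI2 support at compile time (e.g. -mbmi2 or -march=haswell with GCC/Clang). A small usage sketch of the index version (my own addition): since _tzcnt_u64(0) is defined to return 64, the popcnt(x) < n case is flagged by a return value of 64.
#include <cassert>
#include <cstdint>
#include <x86intrin.h>
inline unsigned nthset(uint64_t x, unsigned n) {
    return _tzcnt_u64(_pdep_u64(1ULL << n, x));
}
int main() {
    // the 4th set bit (n = 3) of the question's example sits at index 14
    assert(nthset(0b0000'1101'1000'0100'1100'1000'1010'0000, 3) == 14);
    // fewer than n+1 set bits: _pdep_u64 yields 0 and _tzcnt_u64(0) == 64
    assert(nthset(0b0000'1101'1000'0100'1100'1000'1010'0000, 10) == 64);
}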

It turns out that it is indeed possible to do this with no loops. It is fastest to precompute the (at least) 8-bit version of this problem. Of course, these tables use up cache space, but there should still be a net speedup in virtually all modern PC scenarios. In this code, n=0 returns the least set bit, n=1 the second-to-least, etc.
Solution with __popcnt
There is a solution using the __popcnt intrinsic (you need __popcnt to be extremely fast, or any perf gains over a simple loop solution will be moot; fortunately, most SSE4-era and later processors support it).
// lookup table for sub-problem: 8-bit v
uint8_t PRECOMP[256][8] = { .... }; // PRECOMP[v][n] for v < 256 and n < 8

uint32_t nthSetBit(uint32_t v, uint32_t n) {
    uint32_t p = __popcnt(v & 0xFFFF);
    uint32_t shift = 0;
    if (p <= n) {
        v >>= 16;
        shift += 16;
        n -= p;
    }
    p = __popcnt(v & 0xFF);
    if (p <= n) {
        shift += 8;
        v >>= 8;
        n -= p;
    }
    if (n >= 8) return 0; // optional safety, in case n > # of set bits
    return (uint32_t)PRECOMP[v & 0xFF][n] << shift;
}
This illustrates how the divide and conquer approach works.
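As an aside, PRECOMP need not be typed in by hand; it can be filled at startup. A minimal sketch (my own illustration; each entry stores the n-th set bit of v as a one-hot byte, so that the << shift in the code above produces the isolated bit):
#include <stdint.h>
void init_precomp(void) {
    for (unsigned v = 0; v < 256; ++v) {
        unsigned n = 0;
        for (unsigned bit = 0; bit < 8; ++bit)
            if (v & (1u << bit))
                PRECOMP[v][n++] = (uint8_t)(1u << bit); /* n-th set bit, one-hot */
        for (; n < 8; ++n)
            PRECOMP[v][n] = 0; /* n exceeds popcount(v) */
    }
}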
General Solution
There is also a solution for "general" architectures, without __popcnt. It can be done by processing in 8-bit chunks. You need one more lookup table that tells you the popcount of a byte:
uint8_t PRECOMP[256][8] = { .... }; // PRECOMP[v][n] for v < 256 and n < 8
uint8_t POPCNT[256] = { ... };      // POPCNT[v] is the number of set bits in v (v < 256)

uint32_t nthSetBit(uint32_t v, uint32_t n) {
    uint32_t p = POPCNT[v & 0xFF];
    uint32_t shift = 0;
    if (p <= n) {
        n -= p;
        v >>= 8;
        shift += 8;
        p = POPCNT[v & 0xFF];
        if (p <= n) {
            n -= p;
            shift += 8;
            v >>= 8;
            p = POPCNT[v & 0xFF];
            if (p <= n) {
                n -= p;
                shift += 8;
                v >>= 8;
            }
        }
    }
    if (n >= 8) return 0; // optional safety, in case n > # of set bits
    return (uint32_t)PRECOMP[v & 0xFF][n] << shift;
}
This could, of course, be done with a loop, but the unrolled form is faster and the unusual form of the loop would make it unlikely that the compiler could automatically unroll it for you.

Subtracting 1 from v clears its least significant set bit and sets all the bits below it, while all more significant bits stay the same; hence v & (v-1) removes the lowest set bit. This leads to the following function:
int ffsn(unsigned int v, int n) {
for (int i=0; i<n-1; i++) {
v &= v-1; // remove the least significant bit
}
return v & ~(v-1); // extract the least significant bit
}
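For instance, with v = 0b101100 (a worked check of the two identities used above):
#include <assert.h>
int main(void) {
    unsigned v = 0x2C;              /* 0b101100 */
    assert((v - 1) == 0x2B);        /* 0b101011: lowest set bit cleared, bits below it set */
    assert((v & (v - 1)) == 0x28);  /* 0b101000: lowest set bit removed */
    assert((v & ~(v - 1)) == 0x04); /* 0b000100: lowest set bit extracted */
    return 0;
}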

The version from bit-twiddling hacks adapted to this case is, for example,
unsigned int nth_bit_set(uint32_t value, unsigned int n)
{
const uint32_t pop2 = (value & 0x55555555u) + ((value >> 1) & 0x55555555u);
const uint32_t pop4 = (pop2 & 0x33333333u) + ((pop2 >> 2) & 0x33333333u);
const uint32_t pop8 = (pop4 & 0x0f0f0f0fu) + ((pop4 >> 4) & 0x0f0f0f0fu);
const uint32_t pop16 = (pop8 & 0x00ff00ffu) + ((pop8 >> 8) & 0x00ff00ffu);
const uint32_t pop32 = (pop16 & 0x000000ffu) + ((pop16 >>16) & 0x000000ffu);
unsigned int rank = 0;
unsigned int temp;
if (n++ >= pop32)
return 32;
temp = pop16 & 0xffu;
/* if (n > temp) { n -= temp; rank += 16; } */
rank += ((temp - n) & 256) >> 4;
n -= temp & ((temp - n) >> 8);
temp = (pop8 >> rank) & 0xffu;
/* if (n > temp) { n -= temp; rank += 8; } */
rank += ((temp - n) & 256) >> 5;
n -= temp & ((temp - n) >> 8);
temp = (pop4 >> rank) & 0x0fu;
/* if (n > temp) { n -= temp; rank += 4; } */
rank += ((temp - n) & 256) >> 6;
n -= temp & ((temp - n) >> 8);
temp = (pop2 >> rank) & 0x03u;
/* if (n > temp) { n -= temp; rank += 2; } */
rank += ((temp - n) & 256) >> 7;
n -= temp & ((temp - n) >> 8);
temp = (value >> rank) & 0x01u;
/* if (n > temp) rank += 1; */
rank += ((temp - n) & 256) >> 8;
return rank;
}
which, when compiled in a separate compilation unit, on gcc-5.4.0 using -Wall -O3 -march=native -mtune=native on Intel Core i5-4200u, yields
00400a40 <nth_bit_set>:
400a40: 89 f9 mov %edi,%ecx
400a42: 89 f8 mov %edi,%eax
400a44: 55 push %rbp
400a45: 40 0f b6 f6 movzbl %sil,%esi
400a49: d1 e9 shr %ecx
400a4b: 25 55 55 55 55 and $0x55555555,%eax
400a50: 53 push %rbx
400a51: 81 e1 55 55 55 55 and $0x55555555,%ecx
400a57: 01 c1 add %eax,%ecx
400a59: 41 89 c8 mov %ecx,%r8d
400a5c: 89 c8 mov %ecx,%eax
400a5e: 41 c1 e8 02 shr $0x2,%r8d
400a62: 25 33 33 33 33 and $0x33333333,%eax
400a67: 41 81 e0 33 33 33 33 and $0x33333333,%r8d
400a6e: 41 01 c0 add %eax,%r8d
400a71: 45 89 c1 mov %r8d,%r9d
400a74: 44 89 c0 mov %r8d,%eax
400a77: 41 c1 e9 04 shr $0x4,%r9d
400a7b: 25 0f 0f 0f 0f and $0xf0f0f0f,%eax
400a80: 41 81 e1 0f 0f 0f 0f and $0xf0f0f0f,%r9d
400a87: 41 01 c1 add %eax,%r9d
400a8a: 44 89 c8 mov %r9d,%eax
400a8d: 44 89 ca mov %r9d,%edx
400a90: c1 e8 08 shr $0x8,%eax
400a93: 81 e2 ff 00 ff 00 and $0xff00ff,%edx
400a99: 25 ff 00 ff 00 and $0xff00ff,%eax
400a9e: 01 d0 add %edx,%eax
400aa0: 0f b6 d8 movzbl %al,%ebx
400aa3: c1 e8 10 shr $0x10,%eax
400aa6: 0f b6 d0 movzbl %al,%edx
400aa9: b8 20 00 00 00 mov $0x20,%eax
400aae: 01 da add %ebx,%edx
400ab0: 39 f2 cmp %esi,%edx
400ab2: 77 0c ja 400ac0 <nth_bit_set+0x80>
400ab4: 5b pop %rbx
400ab5: 5d pop %rbp
400ab6: c3 retq
400ac0: 83 c6 01 add $0x1,%esi
400ac3: 89 dd mov %ebx,%ebp
400ac5: 29 f5 sub %esi,%ebp
400ac7: 41 89 ea mov %ebp,%r10d
400aca: c1 ed 08 shr $0x8,%ebp
400acd: 41 81 e2 00 01 00 00 and $0x100,%r10d
400ad4: 21 eb and %ebp,%ebx
400ad6: 41 c1 ea 04 shr $0x4,%r10d
400ada: 29 de sub %ebx,%esi
400adc: c4 42 2b f7 c9 shrx %r10d,%r9d,%r9d
400ae1: 41 0f b6 d9 movzbl %r9b,%ebx
400ae5: 89 dd mov %ebx,%ebp
400ae7: 29 f5 sub %esi,%ebp
400ae9: 41 89 e9 mov %ebp,%r9d
400aec: 41 81 e1 00 01 00 00 and $0x100,%r9d
400af3: 41 c1 e9 05 shr $0x5,%r9d
400af7: 47 8d 14 11 lea (%r9,%r10,1),%r10d
400afb: 41 89 e9 mov %ebp,%r9d
400afe: 41 c1 e9 08 shr $0x8,%r9d
400b02: c4 42 2b f7 c0 shrx %r10d,%r8d,%r8d
400b07: 41 83 e0 0f and $0xf,%r8d
400b0b: 44 21 cb and %r9d,%ebx
400b0e: 45 89 c3 mov %r8d,%r11d
400b11: 29 de sub %ebx,%esi
400b13: 5b pop %rbx
400b14: 41 29 f3 sub %esi,%r11d
400b17: 5d pop %rbp
400b18: 44 89 da mov %r11d,%edx
400b1b: 41 c1 eb 08 shr $0x8,%r11d
400b1f: 81 e2 00 01 00 00 and $0x100,%edx
400b25: 45 21 d8 and %r11d,%r8d
400b28: c1 ea 06 shr $0x6,%edx
400b2b: 44 29 c6 sub %r8d,%esi
400b2e: 46 8d 0c 12 lea (%rdx,%r10,1),%r9d
400b32: c4 e2 33 f7 c9 shrx %r9d,%ecx,%ecx
400b37: 83 e1 03 and $0x3,%ecx
400b3a: 41 89 c8 mov %ecx,%r8d
400b3d: 41 29 f0 sub %esi,%r8d
400b40: 44 89 c0 mov %r8d,%eax
400b43: 41 c1 e8 08 shr $0x8,%r8d
400b47: 25 00 01 00 00 and $0x100,%eax
400b4c: 44 21 c1 and %r8d,%ecx
400b4f: c1 e8 07 shr $0x7,%eax
400b52: 29 ce sub %ecx,%esi
400b54: 42 8d 14 08 lea (%rax,%r9,1),%edx
400b58: c4 e2 6b f7 c7 shrx %edx,%edi,%eax
400b5d: 83 e0 01 and $0x1,%eax
400b60: 29 f0 sub %esi,%eax
400b62: 25 00 01 00 00 and $0x100,%eax
400b67: c1 e8 08 shr $0x8,%eax
400b6a: 01 d0 add %edx,%eax
400b6c: c3 retq
When compiled as a separate compilation unit, timing on this machine is difficult, because the actual operation is as fast as calling a do-nothing function (also compiled in a separate compilation unit); essentially, the calculation is done during the latencies associated with the function call.
It seems to be slightly faster than my suggestion of a binary search,
unsigned int nth_bit_set(uint32_t value, unsigned int n)
{
    uint32_t mask = 0x0000FFFFu;
    unsigned int size = 16u;
    unsigned int base = 0u;

    if (n++ >= __builtin_popcount(value))
        return 32;

    while (size > 0) {
        const unsigned int count = __builtin_popcount(value & mask);
        if (n > count) {
            base += size;
            size >>= 1;
            mask |= mask << size;
        } else {
            size >>= 1;
            mask >>= size;
        }
    }
    return base;
}
where the loop is executed exactly five times, compiling to
00400ba0 <nth_bit_set>:
400ba0: 83 c6 01 add $0x1,%esi
400ba3: 31 c0 xor %eax,%eax
400ba5: b9 10 00 00 00 mov $0x10,%ecx
400baa: ba ff ff 00 00 mov $0xffff,%edx
400baf: 45 31 db xor %r11d,%r11d
400bb2: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
400bb8: 41 89 c9 mov %ecx,%r9d
400bbb: 41 89 f8 mov %edi,%r8d
400bbe: 41 d0 e9 shr %r9b
400bc1: 41 21 d0 and %edx,%r8d
400bc4: c4 62 31 f7 d2 shlx %r9d,%edx,%r10d
400bc9: f3 45 0f b8 c0 popcnt %r8d,%r8d
400bce: 41 09 d2 or %edx,%r10d
400bd1: 44 38 c6 cmp %r8b,%sil
400bd4: 41 0f 46 cb cmovbe %r11d,%ecx
400bd8: c4 e2 33 f7 d2 shrx %r9d,%edx,%edx
400bdd: 41 0f 47 d2 cmova %r10d,%edx
400be1: 01 c8 add %ecx,%eax
400be3: 44 89 c9 mov %r9d,%ecx
400be6: 45 84 c9 test %r9b,%r9b
400be9: 75 cd jne 400bb8 <nth_bit_set+0x18>
400beb: c3 retq
as in, not more than 31 cycles in 95% of calls to the binary search version, compared to not more than 28 cycles in 95% of calls to the bit-hack version; both run within 28 cycles in 50% of the cases. (The loop version takes up to 56 cycles in 95% of calls, and up to 37 cycles in the median case.)
To determine which one is better in actual real-world code, one would have to do a proper benchmark within the real-world task; at least with current x86-64 architecture processors, the work done is easily hidden in latencies incurred elsewhere (like function calls).

My answer is mostly based on this implementation of a 64-bit word select method (hint: look only at the MARISA_USE_POPCNT, MARISA_X64, MARISA_USE_SSE3 code paths).
It works in two steps, first selecting the byte containing the n-th set bit and then using a lookup table inside the byte:
Extract the lower and higher nibbles for every byte (bitmasks 0xF, 0xF0, shift the higher nibbles down)
Replace the nibble values by their popcount (_mm_shuffle_epi8 with A000120)
Sum the popcounts of the lower and upper nibbles (Normal SSE addition) to get byte popcounts
Compute the prefix sum over all byte popcounts (multiplication with 0x01010101...)
Propagate the position n to all bytes (SSE broadcast or again multiplication with 0x01010101...)
Do a bytewise comparison (_mm_cmpgt_epi8 leaves 0xFF in every byte smaller than n)
Compute the byte offset by doing a popcount on the result
Now we know which byte contains the bit and a simple byte lookup table like in grek40's answer suffices to get the result.
Note, however, that I have not really benchmarked this against other implementations; I have only observed it to be quite efficient (and branchless).
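As a rough sketch, those steps might look like the following (my own reconstruction from the list above, not the MARISA code itself; LUT_BitPosition is assumed to be the per-byte table from grek40's answer further down, and n must be smaller than the popcount of x):
#include <stdint.h>
#include <x86intrin.h>

extern unsigned LUT_BitPosition[256][8]; /* per-byte table, see grek40's answer */

unsigned select64(uint64_t x, unsigned n) {
    const __m128i nibble_popcnt = _mm_setr_epi8(   /* popcounts of 0..15 (A000120) */
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
    const __m128i low_mask = _mm_set1_epi8(0x0f);
    __m128i v  = _mm_cvtsi64_si128((int64_t)x);                 /* bytes 0..7 hold x */
    __m128i lo = _mm_and_si128(v, low_mask);                    /* lower nibbles */
    __m128i hi = _mm_and_si128(_mm_srli_epi64(v, 4), low_mask); /* upper nibbles */
    __m128i cnt = _mm_add_epi8(_mm_shuffle_epi8(nibble_popcnt, lo),
                               _mm_shuffle_epi8(nibble_popcnt, hi)); /* byte popcounts */
    /* prefix sum over the byte popcounts via multiplication by 0x01010101... */
    uint64_t prefix = (uint64_t)_mm_cvtsi128_si64(cnt) * 0x0101010101010101ull;
    /* broadcast n+1 and compare: 0xFF in every byte whose prefix sum is <= n */
    __m128i gt = _mm_cmpgt_epi8(_mm_set1_epi8((char)(n + 1)),
                                _mm_cvtsi64_si128((int64_t)prefix));
    unsigned byte = (unsigned)__builtin_popcount(_mm_movemask_epi8(gt) & 0xff);
    unsigned skipped = byte ? (unsigned)((prefix >> (8 * (byte - 1))) & 0xff) : 0;
    return 8 * byte + LUT_BitPosition[(x >> (8 * byte)) & 0xff][n - skipped];
}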

I can't see a method without a loop; what springs to mind would be:
int set = 0;
int pos = 0;
while (set < n) {
    if ((bits & 0x01) == 1) set++;
    bits = bits >> 1;
    pos++;
}
after which pos - 1 holds the position of the nth lowest set bit (the loop increments pos once more after counting the nth set bit).
The only other thing that I can think of would be a divide and conquer approach, which might yield O(log(n)) rather than O(n)...but probably not.
Edit: you said any behaviour, so non-termination is ok, right? :P

def bitN(l: Long, i: Int): Long = {
  def bitI(l: Long, i: Int): Long =
    if (i == 0) 1L
    else 2 * {
      if (l % 2 == 0) bitI(l / 2, i) else bitI(l / 2, i - 1)
    }
  bitI(l, i) / 2
}
A recursive method (in Scala). Decrement i, the position, whenever the value modulo 2 is 1. While returning, multiply by 2. Since the multiplication is invoked as the last operation, it is not tail recursive, but since Longs have a known size in advance, the maximum stack depth is not too big.
scala> n.toBinaryString.replaceAll ("(.{8})", "$1 ")
res117: java.lang.String = 10110011 11101110 01011110 01111110 00111101 11100101 11101011 011000
scala> bitN (n, 40) .toBinaryString.replaceAll ("(.{8})", "$1 ")
res118: java.lang.String = 10000000 00000000 00000000 00000000 00000000 00000000 00000000 000000

Edit
After giving it some thought and using the __builtin_popcount function, I figured it might be better to decide on the relevant byte and then compute the whole result instead of incrementally adding/subtracting numbers. Here is an updated version:
int GetBitAtPosition(unsigned i, unsigned n)
{
    unsigned bitCount;

    bitCount = __builtin_popcount(i & 0x00ffffff);
    if (bitCount <= n)
    {
        return (24 + LUT_BitPosition[i >> 24][n - bitCount]);
    }
    bitCount = __builtin_popcount(i & 0x0000ffff);
    if (bitCount <= n)
    {
        return (16 + LUT_BitPosition[(i >> 16) & 0xff][n - bitCount]);
    }
    bitCount = __builtin_popcount(i & 0x000000ff);
    if (bitCount <= n)
    {
        return (8 + LUT_BitPosition[(i >> 8) & 0xff][n - bitCount]);
    }
    return LUT_BitPosition[i & 0xff][n];
}
I felt like creating a LUT-based solution where the number is inspected in byte chunks; however, the LUT for the n-th bit position grew quite large (256*8), and the LUT-free version that was discussed in the comments might be better.
Generally the algorithm would look like this:
unsigned i = 0x000006B5;
unsigned n = 4;
unsigned result = 0;
unsigned bitCount;

while (i)
{
    bitCount = LUT_BitCount[i & 0xff];
    if (n < bitCount)
    {
        result += LUT_BitPosition[i & 0xff][n];
        break; // found
    }
    else
    {
        n -= bitCount;
        result += 8;
        i >>= 8;
    }
}
It might be worth unrolling the loop into its (up to 4) iterations to get the best performance on 32-bit numbers.
The LUT for bitcount (could be replaced by __builtin_popcount):
unsigned LUT_BitCount[] = {
0, 1, 1, 2, 1, 2, 2, 3, // 0-7
1, 2, 2, 3, 2, 3, 3, 4, // 8-15
1, 2, 2, 3, 2, 3, 3, 4, // 16-23
2, 3, 3, 4, 3, 4, 4, 5, // 24-31
1, 2, 2, 3, 2, 3, 3, 4, // 32-39
2, 3, 3, 4, 3, 4, 4, 5, // 40-47
2, 3, 3, 4, 3, 4, 4, 5, // 48-55
3, 4, 4, 5, 4, 5, 5, 6, // 56-63
1, 2, 2, 3, 2, 3, 3, 4, // 64-71
2, 3, 3, 4, 3, 4, 4, 5, // 72-79
2, 3, 3, 4, 3, 4, 4, 5, // 80-87
3, 4, 4, 5, 4, 5, 5, 6, // 88-95
2, 3, 3, 4, 3, 4, 4, 5, // 96-103
3, 4, 4, 5, 4, 5, 5, 6, // 104-111
3, 4, 4, 5, 4, 5, 5, 6, // 112-119
4, 5, 5, 6, 5, 6, 6, 7, // 120-127
1, 2, 2, 3, 2, 3, 3, 4, // 128
2, 3, 3, 4, 3, 4, 4, 5, // 136
2, 3, 3, 4, 3, 4, 4, 5, // 144
3, 4, 4, 5, 4, 5, 5, 6, // 152
2, 3, 3, 4, 3, 4, 4, 5, // 160
3, 4, 4, 5, 4, 5, 5, 6, // 168
3, 4, 4, 5, 4, 5, 5, 6, // 176
4, 5, 5, 6, 5, 6, 6, 7, // 184
2, 3, 3, 4, 3, 4, 4, 5, // 192
3, 4, 4, 5, 4, 5, 5, 6, // 200
3, 4, 4, 5, 4, 5, 5, 6, // 208
4, 5, 5, 6, 5, 6, 6, 7, // 216
3, 4, 4, 5, 4, 5, 5, 6, // 224
4, 5, 5, 6, 5, 6, 6, 7, // 232
4, 5, 5, 6, 5, 6, 6, 7, // 240
5, 6, 6, 7, 6, 7, 7, 8, // 248-255
};
The LUT for bit position within a byte:
unsigned LUT_BitPosition[][8] = {
// 0-7
{UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
// 8-15
{3,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,3,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,3,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,3,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,3,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,3,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,3,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
// 16-31
{4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{3,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,3,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,3,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,3,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,3,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,3,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,3,4,UINT_MAX,UINT_MAX,UINT_MAX},
// 32-63
{5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{3,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,3,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,3,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,3,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,3,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,3,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,3,5,UINT_MAX,UINT_MAX,UINT_MAX},
{4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,4,5,UINT_MAX,UINT_MAX,UINT_MAX},
{3,4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,3,4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,3,4,5,UINT_MAX,UINT_MAX,UINT_MAX},
{2,3,4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,3,4,5,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,3,4,5,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,3,4,5,UINT_MAX,UINT_MAX},
// 64-127
{6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{3,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,3,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,3,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,3,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,3,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,3,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,3,6,UINT_MAX,UINT_MAX,UINT_MAX},
{4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,4,6,UINT_MAX,UINT_MAX,UINT_MAX},
{3,4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,3,4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,3,4,6,UINT_MAX,UINT_MAX,UINT_MAX},
{2,3,4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,3,4,6,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,3,4,6,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,3,4,6,UINT_MAX,UINT_MAX},
{5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,5,6,UINT_MAX,UINT_MAX,UINT_MAX},
{3,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,3,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,3,5,6,UINT_MAX,UINT_MAX,UINT_MAX},
{2,3,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,3,5,6,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,3,5,6,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,3,5,6,UINT_MAX,UINT_MAX},
{4,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,4,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,4,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,4,5,6,UINT_MAX,UINT_MAX,UINT_MAX},
{2,4,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,4,5,6,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,4,5,6,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,4,5,6,UINT_MAX,UINT_MAX},
{3,4,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,4,5,6,UINT_MAX,UINT_MAX,UINT_MAX},
{1,3,4,5,6,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,3,4,5,6,UINT_MAX,UINT_MAX},
{2,3,4,5,6,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,3,4,5,6,UINT_MAX,UINT_MAX},
{1,2,3,4,5,6,UINT_MAX,UINT_MAX},
{0,1,2,3,4,5,6,UINT_MAX},
// 128-255
{7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{3,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,3,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,3,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,3,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,3,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,3,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,3,7,UINT_MAX,UINT_MAX,UINT_MAX},
{4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,4,7,UINT_MAX,UINT_MAX,UINT_MAX},
{3,4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,3,4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,3,4,7,UINT_MAX,UINT_MAX,UINT_MAX},
{2,3,4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,3,4,7,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,3,4,7,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,3,4,7,UINT_MAX,UINT_MAX},
{5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,5,7,UINT_MAX,UINT_MAX,UINT_MAX},
{3,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,3,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,3,5,7,UINT_MAX,UINT_MAX,UINT_MAX},
{2,3,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,3,5,7,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,3,5,7,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,3,5,7,UINT_MAX,UINT_MAX},
{4,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,4,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,4,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,4,5,7,UINT_MAX,UINT_MAX,UINT_MAX},
{2,4,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,4,5,7,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,4,5,7,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,4,5,7,UINT_MAX,UINT_MAX},
{3,4,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,4,5,7,UINT_MAX,UINT_MAX,UINT_MAX},
{1,3,4,5,7,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,3,4,5,7,UINT_MAX,UINT_MAX},
{2,3,4,5,7,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,3,4,5,7,UINT_MAX,UINT_MAX},
{1,2,3,4,5,7,UINT_MAX,UINT_MAX},
{0,1,2,3,4,5,7,UINT_MAX},
{6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{2,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{3,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,3,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,3,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{2,3,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,3,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,3,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,3,6,7,UINT_MAX,UINT_MAX},
{4,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,4,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,4,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,4,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{2,4,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,4,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,4,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,4,6,7,UINT_MAX,UINT_MAX},
{3,4,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,4,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{1,3,4,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,3,4,6,7,UINT_MAX,UINT_MAX},
{2,3,4,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,3,4,6,7,UINT_MAX,UINT_MAX},
{1,2,3,4,6,7,UINT_MAX,UINT_MAX},
{0,1,2,3,4,6,7,UINT_MAX},
{5,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{1,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{2,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{1,2,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,2,5,6,7,UINT_MAX,UINT_MAX},
{3,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{1,3,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,3,5,6,7,UINT_MAX,UINT_MAX},
{2,3,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,3,5,6,7,UINT_MAX,UINT_MAX},
{1,2,3,5,6,7,UINT_MAX,UINT_MAX},
{0,1,2,3,5,6,7,UINT_MAX},
{4,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
{0,4,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{1,4,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{0,1,4,5,6,7,UINT_MAX,UINT_MAX},
{2,4,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{0,2,4,5,6,7,UINT_MAX,UINT_MAX},
{1,2,4,5,6,7,UINT_MAX,UINT_MAX},
{0,1,2,4,5,6,7,UINT_MAX},
{3,4,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
{0,3,4,5,6,7,UINT_MAX,UINT_MAX},
{1,3,4,5,6,7,UINT_MAX,UINT_MAX},
{0,1,3,4,5,6,7,UINT_MAX},
{2,3,4,5,6,7,UINT_MAX,UINT_MAX},
{0,2,3,4,5,6,7,UINT_MAX},
{1,2,3,4,5,6,7,UINT_MAX},
{0,1,2,3,4,5,6,7},
};
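(For completeness, both tables can also be generated at startup instead of being written out; a small initializer sketch, my own addition:)
#include <limits.h>
void init_luts(void) {
    for (unsigned v = 0; v < 256; ++v) {
        unsigned count = 0;
        for (unsigned bit = 0; bit < 8; ++bit)
            if (v & (1u << bit))
                LUT_BitPosition[v][count++] = bit;
        LUT_BitCount[v] = count;
        while (count < 8)
            LUT_BitPosition[v][count++] = UINT_MAX;
    }
}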

My approach is to calculate the population count of each 8-bit quarter of the 32-bit integer in parallel, then find which quarter contains the nth bit. The population counts of the quarters below the found one are summed and used as the initial value of the later calculation.
After that, set bits are counted one by one until n is reached. Without branches, and using an incomplete implementation of the population count algorithm, my example is the following:
#include <stdio.h>
#include <stdint.h>
int main() {
uint32_t n = 10, test = 3124375902u; /* 10111010001110100011000101011110 */
uint32_t index, popcnt, quarter = 0, q_popcnt;
/* count set bits of each quarter of 32-bit integer in parallel */
q_popcnt = test - ((test >> 1) & 0x55555555);
q_popcnt = (q_popcnt & 0x33333333) + ((q_popcnt >> 2) & 0x33333333);
q_popcnt = (q_popcnt + (q_popcnt >> 4)) & 0x0F0F0F0F;
popcnt = q_popcnt;
/* find which quarters can be summarized and summarize them */
quarter += (n + 1 >= (q_popcnt & 0xff));
quarter += (n + 1 >= ((q_popcnt += q_popcnt >> 8) & 0xff));
quarter += (n + 1 >= ((q_popcnt += q_popcnt >> 16) & 0xff));
quarter += (n + 1 >= ((q_popcnt += q_popcnt >> 24) & 0xff));
popcnt &= (UINT32_MAX >> (8 * quarter));
popcnt = (popcnt * 0x01010101) >> 24;
/* find the index of nth bit in quarter where it should be */
index = 8 * quarter;
index += ((popcnt += (test >> index) & 1) <= n);
index += ((popcnt += (test >> index) & 1) <= n);
index += ((popcnt += (test >> index) & 1) <= n);
index += ((popcnt += (test >> index) & 1) <= n);
index += ((popcnt += (test >> index) & 1) <= n);
index += ((popcnt += (test >> index) & 1) <= n);
index += ((popcnt += (test >> index) & 1) <= n);
index += ((popcnt += (test >> index) & 1) <= n);
printf("index = %u\n", index);
return 0;
}
A simple approach using a loop and conditionals could also be the following:
#include <stdio.h>
#include <stdint.h>
int main() {
uint32_t n = 11, test = 3124375902u; /* 10111010001110100011000101011110 */
uint32_t popcnt = 0, index = 0;
while(popcnt += ((test >> index) & 1), popcnt <= n && ++index < 32);
printf("index = %u\n", index);
return 0;
}

I know the question asks for something faster than a loop, but a complicated loop-less answer is likely to take longer than a quick loop.
If the computer has 32-bit ints and v is a random value, then it might have, for example, 16 ones; if we are looking for a random place among those 16 ones, we will typically be looking for the 8th one. Going round a loop with just a couple of statements 7 or 8 times isn't too bad.
int findNthBit(unsigned int n, int v)
{
    int next;
    if (n > __builtin_popcount(v)) return 0;
    while (next = v & (v - 1), --n)
    {
        v = next;
    }
    return v ^ next;
}
The loop works by removing the lowest set bit (n-1) times.
The n'th one bit that would be removed is the one bit we were looking for.
If anybody wants to test this ....
#include "stdio.h"
#include "assert.h"
// function here
int main() {
assert(findNthBit(1, 0)==0);
assert(findNthBit(1, 0xf0f)==1<<0);
assert(findNthBit(2, 0xf0f)==1<<1);
assert(findNthBit(3, 0xf0f)==1<<2);
assert(findNthBit(4, 0xf0f)==1<<3);
assert(findNthBit(5, 0xf0f)==1<<8);
assert(findNthBit(6, 0xf0f)==1<<9);
assert(findNthBit(7, 0xf0f)==1<<10);
assert(findNthBit(8, 0xf0f)==1<<11);
assert(findNthBit(9, 0xf0f)==0);
printf("looks good\n");
}
If there are concerns about the number of times the loop is executed, for example if the function is regularly called with large values of n, it's simple to add an extra line or two of the following form:
if (n > 8) return findNthBit(n-__builtin_popcount(v&0xff), v>>8) << 8;
or
if (n > 12) return findNthBit(n - __builtin_popcount(v&0xfff), v>>12) << 12;
The idea here is that the n'th one will never be located in the bottom n-1 bits. A better version clears not only the bottom 8 or 12 bits, but all the bottom (n-1) bits when n is large-ish and we don't want to loop that many times.
if (n > 7) return findNthBit(n - __builtin_popcount(v & ((1<<(n-1))-1)), v>>(n-1)) << (n-1);
I tested this with findNthBit(20, 0xaf5faf5f) and after clearing out the bottom 19 bits because the answer wasn't to be found there, it looked for the 5th bit in the remaining bits by looping 4 times to remove 4 ones.
So an improved version is
int findNthBit(unsigned int n, int v)
{
    int next;
    if (n > __builtin_popcount(v)) return 0;
    if (n > 7) return findNthBit(n - __builtin_popcount(v & ((1 << (n-1)) - 1)), v >> (n-1)) << (n-1);
    while (next = v & (v - 1), --n)
    {
        v = next;
    }
    return v ^ next;
}
The value 7, limiting looping is chosen fairly arbitrarily as a compromise between limiting looping and limiting recursion. The function could be further improved by removing recursion and keeping track of a shift amount instead. I may try this if I get some peace from home schooling my daughter!
Here is a final version with the recursion removed by keeping track of the number of low order bits shifted out from the bottom of the bits being searched.
Final version
int findNthBit(unsigned int n, int v)
{
    int shifted = 0; // running total
    int nBits;       // value for this iteration

    // handle no solution
    if (n > __builtin_popcount(v)) return 0;

    while (n > 7)
    {
        // for large n, shift out the lower n-1 bits from v.
        nBits = n - 1;
        n -= __builtin_popcount(v & ((1 << nBits) - 1));
        v >>= nBits;
        shifted += nBits;
    }

    int next;
    // n is now small, clear out n-1 bits and return the next bit
    // v&(v-1): a well known software trick to remove the lowest set bit.
    while (next = v & (v - 1), --n)
    {
        v = next;
    }
    return (v ^ next) << shifted;
}

Building on the answer given by Jukka Suomela, which uses a machine-specific instruction that may not necessarily be available, it is also possible to write a function that does exactly the same thing as _pdep_u64 without any machine dependencies. It must loop over the set bits in one of the arguments, but can still be described as a constexpr function for C++11.
constexpr inline uint64_t deposit_bits(uint64_t x, uint64_t mask, uint64_t b, uint64_t res) {
return mask != 0 ? deposit_bits(x, mask & (mask - 1), b << 1, ((x & b) ? (res | (mask & (-mask))) : res)) : res;
}
constexpr inline uint64_t nthset(uint64_t x, unsigned n) {
return deposit_bits(1ULL << n, x, 1, 0);
}
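Since both functions are constexpr, their agreement with the PDEP examples above can even be checked at compile time, e.g. (using the same C++14 binary literals as the examples above):
static_assert(nthset(0b0000'1101'1000'0100'1100'1000'1010'0000, 3) ==
              0b0000'0000'0000'0000'0100'0000'0000'0000,
              "matches the _pdep_u64 version");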

Based on a method by Juha Järvi published in the famous Bit Twiddling Hacks, I tested this implementation, where n and i are used as in the question:
unsigned int nth_bit_set(uint32_t i, unsigned int n)
{
    uint32_t a, b, c, r, s, t;

    a = i - (i >> 1 & 0x55555555);
    b = (a & 0x33333333) + (a >> 2 & 0x33333333);
    c = b + (b >> 4) & 0x0f0f0f0f;

    r = n + 1;
    s = 0;
    t = c + (c >> 8) & 0xff;
    if (r > t) {
        s += 16;
        r -= t;
    }
    t = c >> s & 0xf;
    if (r > t) {
        s += 8;
        r -= t;
    }
    t = b >> s & 0x7;
    if (r > t) {
        s += 4;
        r -= t;
    }
    t = a >> s & 0x3;
    if (r > t) {
        s += 2;
        r -= t;
    }
    t = i >> s & 0x1;
    if (r > t)
        s++;
    return s;
}
Based on my own tests, this is about as fast as the loop on x86, whereas it is 20% faster on arm64, and probably a lot faster on arm due to the fast conditional instructions, but I can't test that right now.

The PDEP solution is great, but some languages, like Java, do not contain this intrinsic yet, even though they are efficient at other low-level operations. So I came up with the following fallback for such cases: a branchless binary search.
// n must be using 0-based indexing.
// This method produces correct results only if n is smaller
// than the number of set bits.
public static int getNthSetBit(long mask64, int n) {
// Binary search without branching
int base = 0;
final int low32 = (int) mask64;
final int high32n = n - Integer.bitCount(low32);
final int inLow32 = high32n >>> 31;
final int inHigh32 = inLow32 ^ 1;
final int shift32 = inHigh32 << 5;
final int mask32 = (int) (mask64 >>> shift32);
n = ((-inLow32) & n) | ((-inHigh32) & high32n);
base += shift32;
final int low16 = mask32 & 0xffff;
final int high16n = n - Integer.bitCount(low16);
final int inLow16 = high16n >>> 31;
final int inHigh16 = inLow16 ^ 1;
final int shift16 = inHigh16 << 4;
final int mask16 = (mask32 >>> shift16) & 0xffff;
n = ((-inLow16) & n) | ((-inHigh16) & high16n);
base += shift16;
final int low8 = mask16 & 0xff;
final int high8n = n - Integer.bitCount(low8);
final int inLow8 = high8n >>> 31;
final int inHigh8 = inLow8 ^ 1;
final int shift8 = inHigh8 << 3;
final int mask8 = (mask16 >>> shift8) & 0xff;
n = ((-inLow8) & n) | ((-inHigh8) & high8n);
base += shift8;
final int low4 = mask8 & 0xf;
final int high4n = n - Integer.bitCount(low4);
final int inLow4 = high4n >>> 31;
final int inHigh4 = inLow4 ^ 1;
final int shift4 = inHigh4 << 2;
final int mask4 = (mask8 >>> shift4) & 0xf;
n = ((-inLow4) & n) | ((-inHigh4) & high4n);
base += shift4;
final int low2 = mask4 & 3;
final int high2n = n - (low2 >> 1) - (low2 & 1);
final int inLow2 = high2n >>> 31;
final int inHigh2 = inLow2 ^ 1;
final int shift2 = inHigh2 << 1;
final int mask2 = (mask4 >>> shift2) & 3;
n = ((-inLow2) & n) | ((-inHigh2) & high2n);
base += shift2;
// For the 2 bits remaining, we can take a shortcut
return base + (n | ((mask2 ^ 1) & 1));
}

Related

Understanding branch prediction efficiency

I tried to measure the cost of branch prediction, so I created a little program.
It creates a small buffer on the stack and fills it with random 0/1 values. I can set the size of the buffer with N. The code repeatedly causes branches for the same 1<<N random numbers.
Now, I expected that if 1<<N is sufficiently large (like >100), the branch predictor would not be effective (as it has to predict >100 random numbers). However, these are the results (on a 5820k machine); as N grows, the program becomes slower:
N time
=========
8 2.2
9 2.2
10 2.2
11 2.2
12 2.3
13 4.6
14 9.5
15 11.6
16 12.7
20 12.9
For reference, if the buffer is initialized with zeros (use the commented init), time is more or less constant, varying between 1.5 and 1.7 for N in 8..16.
My question is: can the branch predictor be effective at predicting such a large amount of random numbers? If not, then what's going on here?
(Some more explanation: the code executes 2^32 branches, regardless of N. So I expected the code to run at the same speed regardless of N, because the branches cannot be predicted at all. But it seems that if the buffer size is less than 4096 (N<=12), something makes the code fast. Can branch prediction be effective for 4096 random numbers?)
Here's the code:
#include <cstdint>
#include <iostream>

volatile uint64_t init[2] = { 314159165, 27182818 };
// volatile uint64_t init[2] = { 0, 0 };
volatile uint64_t one = 1;

uint64_t next(uint64_t s[2]) {
    uint64_t s1 = s[0];
    uint64_t s0 = s[1];
    uint64_t result = s0 + s1;
    s[0] = s0;
    s1 ^= s1 << 23;
    s[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5);
    return result;
}

int main() {
    uint64_t s[2];
    s[0] = init[0];
    s[1] = init[1];
    uint64_t sum = 0;
#if 1
    const int N = 16;
    unsigned char buffer[1 << N];
    for (int i = 0; i < 1 << N; i++) buffer[i] = next(s) & 1;
    for (uint64_t i = 0; i < uint64_t(1) << (32 - N); i++) {
        for (int j = 0; j < 1 << N; j++) {
            if (buffer[j]) {
                sum += one;
            }
        }
    }
#else
    for (uint64_t i = 0; i < uint64_t(1) << 32; i++) {
        if (next(s) & 1) {
            sum += one;
        }
    }
#endif
    std::cout << sum << "\n";
}
(The code contains a non-buffered version as well, use #if 0. It runs around the same speed as the buffered version with N=16)
Here's the inner loop disassembly (compiled with clang. It generates the same code for all N between 8..16, only the loop count differs. Clang unrolled the loop twice):
401270: 80 3c 0c 00 cmp BYTE PTR [rsp+rcx*1],0x0
401274: 74 07 je 40127d <main+0xad>
401276: 48 03 35 e3 2d 00 00 add rsi,QWORD PTR [rip+0x2de3] # 404060 <one>
40127d: 80 7c 0c 01 00 cmp BYTE PTR [rsp+rcx*1+0x1],0x0
401282: 74 07 je 40128b <main+0xbb>
401284: 48 03 35 d5 2d 00 00 add rsi,QWORD PTR [rip+0x2dd5] # 404060 <one>
40128b: 48 83 c1 02 add rcx,0x2
40128f: 48 81 f9 00 00 01 00 cmp rcx,0x10000
401296: 75 d8 jne 401270 <main+0xa0>
Branch prediction can be that effective. As Peter Cordes suggested, I've checked branch-misses with perf stat. Here are the results:
N time cycles branch-misses (%) approx-time
===============================================================
8 2.2 9,084,889,375 34,806 ( 0.00) 2.2
9 2.2 9,212,112,830 39,725 ( 0.00) 2.2
10 2.2 9,264,903,090 2,394,253 ( 0.06) 2.2
11 2.2 9,415,103,000 8,102,360 ( 0.19) 2.2
12 2.3 9,876,827,586 27,169,271 ( 0.63) 2.3
13 4.6 19,572,398,825 486,814,972 (11.33) 4.6
14 9.5 39,813,380,461 1,473,662,853 (34.31) 9.5
15 11.6 49,079,798,916 1,915,930,302 (44.61) 11.7
16 12.7 53,216,900,532 2,113,177,105 (49.20) 12.7
20 12.9 54,317,444,104 2,149,928,923 (50.06) 12.9
Note: branch-misses (%) is calculated for 2^32 branches
As you can see, when N<=12, the branch predictor can predict most of the branches (which is surprising: the branch predictor can memorize the outcome of 4096 consecutive random branches!). When N>12, branch-misses start to grow. At N>=16, it can only predict ~50% correctly, which means it is as effective as random coin flips.
The time taken can be approximated by looking at the time and branch-misses (%) columns: I've added the last column, approx-time, calculated as 2.2 + (12.9 - 2.2) * (branch-misses % / 100). As you can see, approx-time equals time (up to rounding error), so this effect can be explained perfectly by branch prediction.
The original intent was to calculate how many cycles a branch-miss costs (in this particular case - as for other cases this number can differ):
(54,317,444,104-9,084,889,375)/(2,149,928,923-34,806) = 21.039 = ~21 cycles.

Subset sum with maximum equal sums and without using all elements

You are given a set of integers, and your task is the following: split them into 2 subsets with equal sums in such a way that these sums are maximal. You are allowed not to use all the given integers, that's fine. If it's just impossible, report an error somehow.
My approach is rather straightforward: at each step, we pick a single item, mark it as visited, update the current sum, and pick another item recursively. Finally, we try skipping the current element.
It works on simpler test cases, but it fails one:
T = 1
N = 25
Elements: 5 27 24 12 12 2 15 25 32 21 37 29 20 9 24 35 26 8 31 5 25 21 28 3 5
One can run it as follows:
1 25 5 27 24 12 12 2 15 25 32 21 37 29 20 9 24 35 26 8 31 5 25 21 28 3 5
I expect the sum to equal 239, but the algorithm fails to find such a solution.
I've ended up with the following code:
#include <cstdint>
#include <iostream>
#include <unordered_set>

using namespace std;

unordered_set<uint64_t> visited;

const int max_N = 50;
int data[max_N];
int p1[max_N];
int p2[max_N];
int out1[max_N];
int out2[max_N];
int n1 = 0;
int n2 = 0;
int o1 = 0;
int o2 = 0;
int N = 0;

void max_sum(int16_t &sum_out, int16_t sum1 = 0, int16_t sum2 = 0, int idx = 0) {
    if (idx < 0 || idx > N) return;
    if (sum1 == sum2 && sum1 > sum_out) {
        sum_out = sum1;
        o1 = n1;
        o2 = n2;
        for (int i = 0; i < n1; ++i) {
            out1[i] = p1[i];
        }
        for (int i = 0; i < n2; ++i) {
            out2[i] = p2[i];
        }
    }
    if (idx == N) return;
    uint64_t key = (static_cast<uint64_t>(sum1) << 48) | (static_cast<uint64_t>(sum2) << 32) | idx;
    if (visited.find(key) != visited.end()) return;
    visited.insert(key);
    p1[n1] = data[idx];
    ++n1;
    max_sum(sum_out, sum1 + data[idx], sum2, idx + 1);
    --n1;
    p2[n2] = data[idx];
    ++n2;
    max_sum(sum_out, sum1, sum2 + data[idx], idx + 1);
    --n2;
    max_sum(sum_out, sum1, sum2, idx + 1);
}

int main() {
    int T = 0;
    cin >> T;
    for (int t = 1; t <= T; ++t) {
        int16_t sum_out;
        cin >> N;
        for (int i = 0; i < N; ++i) {
            cin >> data[i];
        }
        n1 = 0;
        n2 = 0;
        o1 = 0;
        o2 = 0;
        max_sum(sum_out);
        int res = 0;
        int res2 = 0;
        for (int i = 0; i < o1; ++i) res += out1[i];
        for (int i = 0; i < o2; ++i) res2 += out2[i];
        if (res != res2) cerr << "ERROR: " << "res1 = " << res << "; res2 = " << res2 << '\n';
        cout << "#" << t << " " << res << '\n';
        visited.clear();
    }
}
I have the following questions:
Could someone help me to troubleshoot the failing test? Are there any obvious problems?
How could I get rid of unordered_set for marking already visited sums? I prefer to use plain C.
Is there a better approach? Maybe using dynamic programming?
Another approach is to consider all the numbers in [1, (2^N - 2)], mapping the position of each bit to the position of each element.
Iterate over all numbers in [1, (2^N - 2)] and check each one: if a bit is set, count the corresponding element in set1; otherwise put it in set2. Then check whether the sums of both sets are equal.
Here you will get all possible sets; if you want just one, break once you find it.
1) Could someone help me to troubleshoot the failing test? Are there any obvious problems?
The only issue I could see is that you have not set sum_out to 0.
When I tried running the program it seemed to work correctly for your test case.
2) How could I get rid of unordered_set for marking already visited sums? I prefer to use plain C.
See the answer to question 3
3) Is there a better approach? Maybe using dynamic programming?
You are currently keeping track of whether you have seen each combination of (value of the first subset, value of the second subset, position in the array).
If instead you keep track of the difference between the values then the complexity significantly reduces.
In particular, you can use dynamic programming to store an array A[diff] that for each value of the difference either stores -1 (to indicate that the difference is not reachable), or the greatest value of subset1 when the difference between subset1 and subset2 is exactly equal to diff.
You can then iterate over the entries in the input and update the array based on either assigning each element to subset1/subset2/ or not at all. (Note you need to make a new copy of the array when computing this update.)
In this form there is no use of unordered_set because you can simply use a straight C array. There is also no difference between subset1 and subset2 so you can only keep positive differences.
Example Python Code
from collections import defaultdict

data = map(int, "5 27 24 12 12 2 15 25 32 21 37 29 20 9 24 35 26 8 31 5 25 21 28 3 5".split())

A = defaultdict(int)  # Map from difference to best value of subset sum 1
A[0] = 0              # We start with a difference of 0
for a in data:
    A2 = defaultdict(int)
    def add(s1, s2):
        if s1 > s2:
            s1, s2 = s2, s1
        d = s2 - s1
        if d in A2:
            A2[d] = max(A2[d], s1)
        else:
            A2[d] = s1
    for diff, sum1 in A.items():
        sum2 = sum1 + diff
        add(sum1, sum2)
        add(sum1 + a, sum2)
        add(sum1, sum2 + a)
    A = A2
print A[0]
This prints 239 as the answer.
For simplicity I haven't bothered with the optimization of using a linear array instead of the dictionary.
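Since the question asks for plain C, here is a sketch of the same dynamic program using flat arrays indexed by the difference (my own illustration, not tested beyond the question's data; best[d] holds the largest smaller-subset sum achievable at difference d, or -1 if that difference is unreachable):
#include <stdlib.h>
#include <string.h>

/* Returns the maximal equal sum; should yield 239 for the question's test
 * case, matching the Python version above. */
int max_equal_sum(const int *data, int n) {
    int total = 0;
    for (int i = 0; i < n; ++i) total += data[i];
    int *best = malloc((total + 1) * sizeof *best); /* current DP row */
    int *next = malloc((total + 1) * sizeof *next); /* row being built */
    for (int d = 0; d <= total; ++d) best[d] = -1;
    best[0] = 0;
    for (int i = 0; i < n; ++i) {
        memcpy(next, best, (total + 1) * sizeof *best); /* "skip element" case */
        for (int d = 0; d <= total; ++d) {
            if (best[d] < 0) continue;
            int s1 = best[d], s2 = s1 + d, a = data[i];
            /* put a in the larger subset: the difference grows */
            if (s1 > next[d + a]) next[d + a] = s1;
            /* put a in the smaller subset: the difference shrinks or flips */
            int nd = (a <= d) ? d - a : a - d;
            int ns = (a <= d) ? s1 + a : s2;
            if (ns > next[nd]) next[nd] = ns;
        }
        int *swap = best; best = next; next = swap;
    }
    int result = best[0];
    free(best);
    free(next);
    return result;
}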
A very different approach would be to use a constraint or mixed integer solver. Here is a possible formulation.
Let
x(i,g) = 1 if value v(i) belongs to group g
0 otherwise
The optimization model can look like:
max s
s = sum(i, x(i,g)*v(i)) for all g
sum(g, x(i,g)) <= 1 for all i
For two groups we get:
---- 31 VARIABLE s.L = 239.000
---- 31 VARIABLE x.L
g1 g2
i1 1
i2 1
i3 1
i4 1
i5 1
i6 1
i7 1
i8 1
i9 1
i10 1
i11 1
i12 1
i13 1
i14 1
i15 1
i16 1
i17 1
i18 1
i19 1
i20 1
i21 1
i22 1
i23 1
i25 1
We can easily do more groups. E.g. with 9 groups:
---- 31 VARIABLE s.L = 52.000
---- 31 VARIABLE x.L
g1 g2 g3 g4 g5 g6 g7 g8 g9
i2 1
i3 1
i4 1
i5 1
i6 1
i7 1
i8 1
i9 1
i10 1
i11 1
i12 1
i13 1
i14 1
i15 1
i16 1
i17 1
i19 1
i20 1
i21 1
i22 1
i23 1
i24 1
i25 1
If there is no solution, the solver will select zero elements in each group with a sum s=0.

Finding the most frequently occurring element in an SSE register

Does anyone have any thoughts on how to calculate the mode (statistic) of a vector of 8-bit integers in SSE4.x? To clarify, this would be 16x8-bit values in a 128-bit register.
I want the result as a vector mask which selects the mode-valued elements. i.e. the result of _mm_cmpeq_epi8(v, set1(mode(v))), as well as the scalar value.
Providing some additional context: while the above problem is an interesting one to solve in its own right, I have been through most algorithms I can think of with linear complexity, and this class will wipe out any gains I can get from calculating this number.
I hope to engage you all in searching for some deep magic here. It's possible that an approximation may be necessary to break this bound, such as "select a frequently occurring element" (N.B. as opposed to the most frequently occurring one), which would be of merit. A probabilistic answer would be usable, too.
SSE and x86 have some very interesting semantics. It may be worth exploring a superoptimization pass.
Probably a relatively simple brute-force SSEx approach is suitable here; see the code below.
The idea is to byte-rotate the input vector v by 1 to 15 positions and compare the rotated vector
with the original v for equality. To shorten the dependency chain and to increase the
instruction level parallelism, two counters are used to count (vertical sum) these equal elements:
sum1 and sum2, because there might be architectures that benefit from this.
Equal elements are counted as -1. Variable sum = sum1 + sum2 contains the total count with values
between -1 and -16. min_brc contains the horizontal minimum of sum broadcasted to all elements.
mask = _mm_cmpeq_epi8(sum,min_brc) is the mask for the mode-valued elements requested as an
intermediate result by the OP. In the next few lines of the code the actual mode is extracted.
This solution is certainly faster than a scalar solution.
Note that with AVX2 the upper 128-bit lanes can be used to speedup the computation further.
It takes 20 cycles (throughput) to compute only the mask for the mode-valued elements. With the actual mode broadcast
across the SSE register, it takes about 21.4 cycles.
Note the behaviour in the next example:
[1, 1, 3, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] returns mask=[-1,-1,-1,-1,0,0,...,0]
and the mode value is 1, although 1 occurs as often as 3.
The code below is tested, but not thoroughly.
#include <stdio.h>
#include <x86intrin.h>
/* gcc -O3 -Wall -m64 -march=nehalem mode_uint8.c */
int print_vec_char(__m128i x);
__m128i mode_statistic(__m128i v){
__m128i sum2 = _mm_set1_epi8(-1); /* Each integer occurs at least one time */
__m128i v_rot1 = _mm_alignr_epi8(v,v,1);
__m128i v_rot2 = _mm_alignr_epi8(v,v,2);
__m128i sum1 = _mm_cmpeq_epi8(v,v_rot1);
sum2 = _mm_add_epi8(sum2,_mm_cmpeq_epi8(v,v_rot2));
__m128i v_rot3 = _mm_alignr_epi8(v,v,3);
__m128i v_rot4 = _mm_alignr_epi8(v,v,4);
sum1 = _mm_add_epi8(sum1,_mm_cmpeq_epi8(v,v_rot3));
sum2 = _mm_add_epi8(sum2,_mm_cmpeq_epi8(v,v_rot4));
__m128i v_rot5 = _mm_alignr_epi8(v,v,5);
__m128i v_rot6 = _mm_alignr_epi8(v,v,6);
sum1 = _mm_add_epi8(sum1,_mm_cmpeq_epi8(v,v_rot5));
sum2 = _mm_add_epi8(sum2,_mm_cmpeq_epi8(v,v_rot6));
__m128i v_rot7 = _mm_alignr_epi8(v,v,7);
__m128i v_rot8 = _mm_alignr_epi8(v,v,8);
sum1 = _mm_add_epi8(sum1,_mm_cmpeq_epi8(v,v_rot7));
sum2 = _mm_add_epi8(sum2,_mm_cmpeq_epi8(v,v_rot8));
__m128i v_rot9 = _mm_alignr_epi8(v,v,9);
__m128i v_rot10 = _mm_alignr_epi8(v,v,10);
sum1 = _mm_add_epi8(sum1,_mm_cmpeq_epi8(v,v_rot9));
sum2 = _mm_add_epi8(sum2,_mm_cmpeq_epi8(v,v_rot10));
__m128i v_rot11 = _mm_alignr_epi8(v,v,11);
__m128i v_rot12 = _mm_alignr_epi8(v,v,12);
sum1 = _mm_add_epi8(sum1,_mm_cmpeq_epi8(v,v_rot11));
sum2 = _mm_add_epi8(sum2,_mm_cmpeq_epi8(v,v_rot12));
__m128i v_rot13 = _mm_alignr_epi8(v,v,13);
__m128i v_rot14 = _mm_alignr_epi8(v,v,14);
sum1 = _mm_add_epi8(sum1,_mm_cmpeq_epi8(v,v_rot13));
sum2 = _mm_add_epi8(sum2,_mm_cmpeq_epi8(v,v_rot14));
__m128i v_rot15 = _mm_alignr_epi8(v,v,15);
sum1 = _mm_add_epi8(sum1,_mm_cmpeq_epi8(v,v_rot15));
__m128i sum = _mm_add_epi8(sum1,sum2); /* Sum contains values such as -1, -2 ,...,-16 */
/* The next three instructions compute the horizontal minimum of sum */
__m128i sum_shft = _mm_srli_epi16(sum,8); /* Shift right 8 bits, while shifting in zeros */
__m128i min1 = _mm_min_epu8(sum,sum_shft); /* sum and sum_shuft are considered as unsigned integers. sum_shft is zero at the odd positions and so is min1 */
__m128i min2 = _mm_minpos_epu16(min1); /* Byte 0 within min2 contains the horizontal minimum of sum */
__m128i min_brc = _mm_shuffle_epi8(min2,_mm_setzero_si128()); /* Broadcast horizontal minimum */
__m128i mask = _mm_cmpeq_epi8(sum,min_brc); /* Mask = -1 at the byte positions where the value of v is equal to the mode of v */
/* comment next 4 lines out if there is no need to broadcast the mode value */
int bitmask = _mm_movemask_epi8(mask);
int indx = __builtin_ctz(bitmask); /* Index of mode */
__m128i v_indx = _mm_set1_epi8(indx); /* Broadcast indx */
__m128i answer = _mm_shuffle_epi8(v,v_indx); /* Broadcast mode to each element of answer */
/* Uncomment lines below to print intermediate results, to see how it works. */
// printf("sum = ");print_vec_char (sum );
// printf("sum_shft = ");print_vec_char (sum_shft );
// printf("min1 = ");print_vec_char (min1 );
// printf("min2 = ");print_vec_char (min2 );
// printf("min_brc = ");print_vec_char (min_brc );
// printf("mask = ");print_vec_char (mask );
// printf("v_indx = ");print_vec_char (v_indx );
// printf("answer = ");print_vec_char (answer );
return answer; /* or return mask, or return both .... :) */
}
int main() {
/* To test throughput set throughput_test to 1, otherwise 0 */
/* Use e.g. perf stat -d ./a.out to test throughput */
#define throughput_test 0
/* Different test vectors */
int i;
char x1[16] = {5, 2, 2, 7, 21, 4, 7, 7, 3, 9, 2, 5, 4, 3, 5, 5};
char x2[16] = {5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
char x3[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
char x4[16] = {1, 2, 3, 2, 1, 6, 7, 8, 2, 2, 2, 3, 3, 2, 15, 16};
char x5[16] = {1, 1, 3, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
printf("\n15...0 = 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0\n\n");
__m128i x_vec = _mm_loadu_si128((__m128i*)x1);
printf("x_vec = ");print_vec_char(x_vec );
__m128i y = mode_statistic (x_vec);
printf("answer = ");print_vec_char(y );
#if throughput_test == 1
__m128i x_vec1 = _mm_loadu_si128((__m128i*)x1);
__m128i x_vec2 = _mm_loadu_si128((__m128i*)x2);
__m128i x_vec3 = _mm_loadu_si128((__m128i*)x3);
__m128i x_vec4 = _mm_loadu_si128((__m128i*)x4);
__m128i x_vec5 = _mm_loadu_si128((__m128i*)x5);
__m128i y1, y2, y3, y4, y5;
__asm__ __volatile__ ( "vzeroupper" : : : ); /* Remove this line on non-AVX processors */
for (i=0;i<100000000;i++){
y1 = mode_statistic (x_vec1);
y2 = mode_statistic (x_vec2);
y3 = mode_statistic (x_vec3);
y4 = mode_statistic (x_vec4);
y5 = mode_statistic (x_vec5);
x_vec1 = mode_statistic (y1 );
x_vec2 = mode_statistic (y2 );
x_vec3 = mode_statistic (y3 );
x_vec4 = mode_statistic (y4 );
x_vec5 = mode_statistic (y5 );
}
printf("mask mode = ");print_vec_char(y1 );
printf("mask mode = ");print_vec_char(y2 );
printf("mask mode = ");print_vec_char(y3 );
printf("mask mode = ");print_vec_char(y4 );
printf("mask mode = ");print_vec_char(y5 );
#endif
return 0;
}
int print_vec_char(__m128i x){
char v[16];
_mm_storeu_si128((__m128i *)v,x);
printf("%3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi\n",
v[15],v[14],v[13],v[12],v[11],v[10],v[9],v[8],v[7],v[6],v[5],v[4],v[3],v[2],v[1],v[0]);
return 0;
}
Output:
15...0 = 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
x_vec = 5 5 3 4 | 5 2 9 3 | 7 7 4 21 | 7 2 2 5
sum = -4 -4 -2 -2 | -4 -3 -1 -2 | -3 -3 -2 -1 | -3 -3 -3 -4
min_brc = -4 -4 -4 -4 | -4 -4 -4 -4 | -4 -4 -4 -4 | -4 -4 -4 -4
mask = -1 -1 0 0 | -1 0 0 0 | 0 0 0 0 | 0 0 0 -1
answer = 5 5 5 5 | 5 5 5 5 | 5 5 5 5 | 5 5 5 5
The horizontal minimum is computed with Evgeny Kluev's method.
Sort the data in the register.
Insertion sort can be done in 16 (15) steps by initializing the register to "infinity" (all bytes 0xFF), which represents a monotonically decreasing array, and inserting each new element in parallel into all possible places:
// e.g. FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF 78
__m128i sorted = _mm_or_si128(my_array, const_FFFFF00);
for (int i = 1; i < 16; ++i)
{
    // Trying to insert e.g. A0, we must shift all the FF's to left
    // e.g. FF FF FF FF FF FF FF FF FF FF FF FF FF FF 78 00
    __m128i shifted = _mm_bslli_si128(sorted, 1);
    // Taking the MAX of shifted and 'A0 on all places'
    // e.g. FF FF FF FF FF FF FF FF FF FF FF FF FF FF A0 A0
    shifted = _mm_max_epu8(shifted, _mm_set1_epi8(my_array[i]));
    // and minimum of the shifted + original --
    // e.g. FF FF FF FF FF FF FF FF FF FF FF FF FF FF A0 78
    sorted = _mm_min_epu8(sorted, shifted);
}
Then calculate mask for vec[n+1] == vec[n], move mask to GPR and use that to index a 32768 entry LUT for best index location.
In a real case one probably wants to sort more than just one vector, i.e. to sort 16 16-entry vectors at once:
__m128i input[16];     // not 1, but 16 vectors
transpose16x16(input); // in-place vector transpose
sort(input);           // a 60-stage network exists for 16 inputs

// linear search -- result in 'mode'
__m128i mode = input[0];
__m128i previous = mode;
__m128i count = _mm_setzero_si128();
__m128i max_count = _mm_setzero_si128();
for (int i = 1; i < 16; i++)
{
    __m128i current = input[i];
    // histogram count is off by one
    //   if (current == previous) count++;
    //   else count = 0;
    //   if (count > max_count)
    //       mode = current, max_count = count;
    __m128i eq = _mm_cmpeq_epi8(previous, current);
    count = _mm_and_si128(_mm_sub_epi8(count, eq), eq);
    __m128i max_so_far = _mm_cmplt_epi8(max_count, count);
    mode = _mm_blendv_epi8(mode, current, max_so_far);
    max_count = _mm_max_epi8(max_count, count);
    previous = current;
}
The inner loop has an amortized cost of 7-8 instructions per result;
sorting typically takes 2 instructions per stage, i.e. about 8 instructions per result, since 16 results need 60 stages, or 120 instructions.
(This still leaves the transpose as an exercise -- but I think it should be vastly faster than sorting?)
So, this should be in the ballpark of 24 instructions per 8-bit result.
For performance comparison, here is scalar code: non-vectorized in the main part, but vectorized for the table clearing and tmp initialization (168 cycles per f() call on an FX-8150; 22M calls complete in 1.0002 seconds at 3.7 GHz).
#include <x86intrin.h>
alignas(16) unsigned char tmp[16]; // extracted values land here (a single aligned store)
unsigned char table[256]; // counter table, zero-initialized at start-up
char f(__m128i values)
{
_mm_store_si128((__m128i *)tmp,values);
int maxOccurence=0;
int currentValue=0;
for(int i=0;i<16;i++)
{
unsigned char ind=tmp[i];
unsigned char t=table[ind];
t++;
if(t>maxOccurence)
{
maxOccurence=t;
currentValue=ind;
}
table[ind]=t;
}
for(int i=0;i<256;i++) // clear the table for the next call (the compiler vectorizes this)
table[i]=0;
return currentValue;
}
g++ 6.3 output:
f: # #f
movaps %xmm0, tmp(%rip)
movaps %xmm0, -24(%rsp)
xorl %r8d, %r8d
movq $-15, %rdx
movb -24(%rsp), %sil
xorl %eax, %eax
jmp .LBB0_1
.LBB0_2: # %._crit_edge
cmpl %r8d, %esi
cmovgel %esi, %r8d
movb tmp+16(%rdx), %sil
incq %rdx
.LBB0_1: # =>This Inner Loop Header: Depth=1
movzbl %sil, %edi
movb table(%rdi), %cl
incb %cl
movzbl %cl, %esi
cmpl %r8d, %esi
cmovgl %edi, %eax
movb %sil, table(%rdi)
testq %rdx, %rdx
jne .LBB0_2
xorps %xmm0, %xmm0
movaps %xmm0, table+240(%rip)
movaps %xmm0, table+224(%rip)
movaps %xmm0, table+208(%rip)
movaps %xmm0, table+192(%rip)
movaps %xmm0, table+176(%rip)
movaps %xmm0, table+160(%rip)
movaps %xmm0, table+144(%rip)
movaps %xmm0, table+128(%rip)
movaps %xmm0, table+112(%rip)
movaps %xmm0, table+96(%rip)
movaps %xmm0, table+80(%rip)
movaps %xmm0, table+64(%rip)
movaps %xmm0, table+48(%rip)
movaps %xmm0, table+32(%rip)
movaps %xmm0, table+16(%rip)
movaps %xmm0, table(%rip)
movsbl %al, %eax
ret

Logic: Applying gravity to a vector

There is a method called gravity(Vector[] vector). The vector contains a sequence of numbers. The gravity function should return a new vector after applying gravity, which is explained below.
Assume 0's are air and 1's are bricks. When gravity is applied, the bricks should fall down to the lowest level.
Let vector = [3, 7, 8]
Converting this to binary we get:
0 0 1 1 for 3
0 1 1 1 for 7
1 0 0 0 for 8
Applying gravity:
0 0 0 0 which is 0
0 0 1 1 which is 3
1 1 1 1 which is 15
So the gravity function should return [0, 3, 15].
Hope you people understood the explanation. I tried a lot but I couldn't figure out the logic for this. One thing I observed was that the sum of the numbers in the vector before and after applying gravity remains the same.
That is,
3 + 7 + 8 = 18 = 0 + 3 + 15 for the above case.
I think it is as simple as counting the total number of '1' bits at each bit position...
Let N be the input vector size and b the longest binary length of the input elements.
Pre-compute the total number of '1' bits at each position, stored in count[] -- O(N*b)
Run the gravity function, i.e. regenerate the N numbers from count[] -- O(N*b)
Total run time is O(N*b)
Below is the sample code in C++
#include<bits/stdc++.h>
using namespace std;
int v[5] = {3,9,7,8,5};
int cnt[5] = {0};
vector<int> ans;
vector<int> gravity(){
vector<int> ret;
for(int i=0; i<5;i++){
int s = 0;
for(int j=0; j<5;j++)
if(cnt[j]){
s += (1<<j); cnt[j]--;
}
ret.push_back(s);
}
return ret;
}
int main(){
// precompute sum of 1 of each bit
for(int i=0; i<5; i++){
int tmp = v[i], j = 0; // read v[i] in the body; reading it in the loop increment would run off the end of v
while(tmp){
if(tmp&1) cnt[j]++;
tmp >>= 1; j++;
}
}
ans = gravity();
for(int i=ans.size()-1; i>=0; i--) printf("%d ", ans[i]);
return 0;
}
The output is as follows:
0 1 1 15 15
Start at the bottom. Any bricks in the row on top of that one will fall down except where there is already a brick on the bottom. So, the new bottom row is:
bottom_new = bottom_old OR top_old
The new top is:
top_new = bottom_old AND top_old
That is, there will be a brick in the new bottom row if there was a brick in either row, but there's only going to be a brick in the new top row if there was a brick in both rows.
Then you just work your way up the stack, with the new top row becoming the old bottom row for the next step.
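As a minimal C++ sketch of this idea (the wrapper is mine; note that one bottom-up sweep only moves a blocked brick down one row, so N-1 sweeps are needed in the worst case):
#include <cstdio>
#include <vector>

std::vector<int> gravity(std::vector<int> v) {
    // v[0] is the top row, v.back() is the bottom row
    for (size_t pass = 0; pass + 1 < v.size(); ++pass)  // N-1 sweeps settle everything
        for (size_t i = v.size() - 1; i > 0; --i) {     // sweep from the bottom up
            int bottom = v[i], top = v[i - 1];
            v[i]     = bottom | top;   // brick if either row had one
            v[i - 1] = bottom & top;   // brick only if both rows had one
        }
    return v;
}

int main() {
    for (int row : gravity({3, 7, 8})) printf("%d ", row); // prints: 0 3 15
    return 0;
}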
The only solution I can think of so far uses nested for loops:
v is the input vector of N integers
D is the number of digits (bits) in each integer
c keeps track of the bottom-most free row where a brick can fall
The algorithm checks whether bit i of a number n is set using (n & (1<<i)), which works in most C-like languages.
The algorithm in C:
for (int j=0; j<D; ++j) {
    int bit = 1<<j;
    int c = N-1;
    for (int i=N-1; i>=0; --i)
        if (v[i] & bit) { // if bit j of number v[i] is set...
            v[i] ^= bit;  // clear bit j in row i using XOR
            v[c] ^= bit;  // set bit j in the bottom-most free row using XOR
            c -= 1;       // the next free row is one higher up
        }
}
If N is small and known in advance, you could work out the truth tables for the values of each digit and get the correct result using only bitwise operations and no loops.
Solution:
So I found a solution which needs recursion, I guess, though I don't know the condition to stop the recursion.
The vector v = [3, 7, 8] is so simple that it's not possible to show why recursion is required, so I am considering a new vector v = [3, 9, 7, 8, 5]
In binary form :
0 0 1 1 - a4
1 0 0 1 - a3
0 1 1 1 - a2
1 0 0 0 - a1
0 1 0 1 - a0
Iteration 1 :
0 0 0 0 - b7 (b7 = a4 AND b5)
0 0 1 1 - b6 (b6 = a4 OR b5)
0 0 0 0 - b5 (b5 = a3 AND b3) ignore this
1 0 0 1 - b4 (b4 = a3 OR b3)
0 0 0 0 - b3 (b3 = a2 AND b1) ignore this
0 1 1 1 - b2 (b2 = a2 OR b1)
0 0 0 0 - b1 (b1 = a0 AND a1) ignore this
1 1 0 1 - b0 (b0 = a0 OR a1)
Intermediate vector = [b7, b6, b4, b2, b0] = [0, 3, 9, 7, 13]
Iteration 2 :
0 0 0 0 - c7 (c7 = b7 AND c5)
0 0 0 1 - c6 (c6 = b7 OR c5)
0 0 0 1 - c5 (c5 = b6 AND c3) ignore this
0 0 1 1 - c4 (c4 = b6 OR c3)
0 0 0 1 - c3 (c3 = b4 AND c1) ignore this
1 1 0 1 - c2 (c2 = b4 OR c1)
0 1 0 1 - c1 (c1 = b0 AND b2) ignore this
1 1 1 1 - c0 (c0 = b0 OR b2)
Intermediate vector = [c7, c6, c4, c2, c0] = [0, 1, 3, 13, 15]
Iteration 3 :
0 0 0 0 - d7 (d7 = c7 AND d5)
0 0 0 1 - d6 (d6 = c7 OR d5)
0 0 0 1 - d5 (d5 = c6 AND d3) ignore this
0 0 0 1 - d4 (d4 = c6 OR d3)
0 0 0 1 - d3 (d3 = c4 AND d1) ignore this
1 1 1 1 - d2 (d2 = c4 OR d1)
1 1 0 1 - d1 (d1 = c0 AND c2) ignore this
1 1 1 1 - d0 (d0 = c0 OR c2)
Resultant vector = [d7, d6, d4, d2, d0] = [0, 1, 1, 15, 15]
I got this solution by going backwards through the vector.
Another solution:
Construct a multidimensional array with all the bits of all the elements in the vector (i.e. if v = [3,7,8], construct a 3x4 array and store all the bits).
Count the number of 1's in each column and store the count.
Fill each column with count number of 1's starting from the bottom bit.
This approach is simple but requires construction of large matrices.
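A small C++ sketch of this matrix variant (sizes hard-coded for v = [3, 7, 8]; the row/column orientation is my assumption):
#include <cstdio>

int main() {
    const int N = 3, D = 4;
    int v[N] = {3, 7, 8}, bits[N][D];
    // build the bit matrix: row i holds the D bits of v[i]
    for (int i = 0; i < N; ++i)
        for (int j = 0; j < D; ++j)
            bits[i][j] = (v[i] >> j) & 1;
    // count the 1's in each column, then refill the column from the bottom up
    for (int j = 0; j < D; ++j) {
        int ones = 0;
        for (int i = 0; i < N; ++i) ones += bits[i][j];
        for (int i = N - 1; i >= 0; --i) bits[i][j] = (ones-- > 0);
    }
    // read the rows back as integers
    for (int i = 0; i < N; ++i) {
        int s = 0;
        for (int j = 0; j < D; ++j) s |= bits[i][j] << j;
        printf("%d ", s); // prints: 0 3 15
    }
    return 0;
}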

Most elegant way to expand card hand suits

I'm storing 4-card hands in a way to treat hands with different suits the same, e.g.:
9h 8h 7c 6c
is the same as
9d 8d 7h 6h
since you can replace one suit with another and have the same thing. It's easy to turn these into a unique representation using wildcards for suits. The previous would become:
9A 8A 7B 6B
My question is - what's the most elegant way to turn the latter back into a list of the former? For example, when the input is 9A 8A 7B 6B, the output should be:
9c 8c 7d 6d
9c 8c 7h 6h
9c 8c 7s 6s
9h 8h 7d 6d
9h 8h 7c 6c
9h 8h 7s 6s
9d 8d 7c 6c
9d 8d 7h 6h
9d 8d 7s 6s
9s 8s 7d 6d
9s 8s 7h 6h
9s 8s 7c 6c
I have some ugly code that does this on a case-by-case basis depending on how many unique suits there are. It won't scale to hands with more cards. Also in a situation like:
7A 7B 8A 8B
it will have duplicates, since in this case A=c and B=d is the same as A=d and B=c.
What's an elegant way to solve this problem efficiently? I'm coding in C, but I can convert higher-level code down to C.
There are only 4 suits so the space of possible substitutions is really small - 4! = 24 cases.
In this case, I don't think it is worth it, to try to come up with something especially clever.
Just parse the string like "7A 7B 8A 8B", count the number of different letters in it, and based on that number generate the substitutions from a precomputed set.
1 letter -> 4 possible substitutions c, d, h, or s
2 letters -> 12 substitutions, like in your example.
3 or 4 letters -> 24 substitutions.
Then sort the set of substitutions and remove duplicates. You have to sort the tokens in every string like "7c 8d 9d 9s" and then sort the array of strings to detect duplicates, but that shouldn't be a problem. It's good to have the patterns like "7A 7B 8A 8B" sorted too (tokens like "7A", "8B" in ascending order).
EDIT:
An alternative to sorting might be to detect identical sets of ranks associated with two or more suits and take that into account when generating substitutions, but I think it's more complicated. You would have to create a set of ranks for each letter appearing in the pattern string.
For example, for the string "7A 7B 8A 8B", the set {7, 8} is associated with the letter A, and the same set is associated with the letter B. Then you have to look for identical sets associated with different letters. In most cases those sets will have just one element, but they might have two as in the example above. Letters associated with the same set are interchangeable. You can have the following situations (a counting sketch follows the list):
1 letter no duplicates -> 4 possible substitutions c, d, h, or s
2 letters no duplicates -> 12 substitutions.
2 letters, 2 letters interchangeable (identical sets for both letters) -> 6 substitutions.
3 letters no duplicates -> 24 substitutions.
3 letters, 2 letters interchangeable -> 12 substitutions.
4 letters no duplicates -> 24 substitutions.
4 letters, 2 letters interchangeable -> 12 substitutions.
4 letters, 3 letters interchangeable -> 4 substitutions.
4 letters, 2 pairs of interchangeable letters -> 6 substitutions.
4 letters, 4 letters interchangeable -> 1 substitution.
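The counts above all follow one formula: take the 4!/(4-k)! ordered suit assignments for k letters and divide by the permutations within each interchangeable group. A small C sketch (the function and names are mine, for illustration):
#include <stdio.h>

int factorial(int n) { return n <= 1 ? 1 : n * factorial(n - 1); }

// k      = number of distinct letters in the pattern (1..4)
// groups = sizes of the groups of mutually interchangeable letters, e.g. {2,2}
int substitution_count(int k, const int groups[], int ngroups) {
    int count = factorial(4) / factorial(4 - k); // ordered choices of suits
    for (int i = 0; i < ngroups; ++i)
        count /= factorial(groups[i]);           // interchangeable letters collapse
    return count;
}

int main(void) {
    int pair[] = {2};
    printf("%d\n", substitution_count(2, pair, 1));      // 12/2 = 6
    int two_pairs[] = {2, 2};
    printf("%d\n", substitution_count(4, two_pairs, 2)); // 24/4 = 6
    return 0;
}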
I think a generic permutation function that takes an array arr and an integer n and returns all possible permutations of n elements from that array would be useful here.
Find how many unique suits exist in the hand. Then generate all possible permutations of that many elements from the actual suits [c, d, h, s]. Finally go through each permutation of suits, and assign each unknown letter [A, B, C, D] in the hand to the permuted values.
The following code in Ruby takes a given hand and generates all suit permutations. The heaviest work is done by the Array.permutation(n) method; porting this to C mostly means implementing an equivalent permutation generator.
# all 4 suits needed for generating permutations
suits = ["c", "d", "h", "s"]
# current hand
hand = "9A 8A 7B 6B"
# find number of unique suits in the hand. In this case it's 2 => [A, B]
unique_suits_in_hand = hand.scan(/.(.)\s?/).uniq.length
# generate all possible permutations of 2 suits, and for each permutation
# do letter assignments in the original hand
# tr is a translation function which maps corresponding letters in both strings.
# it doesn't matter which unknowns are used (A, B, C, D) since they
# will be replaced consistently.
# After suit assignments are done, we split the cards in hand, and sort them.
possible_hands = suits.permutation(unique_suits_in_hand).map do |perm|
hand.tr("ABCD", perm.join ).split(' ').sort
end
# Remove all duplicates
p possible_hands.uniq
The above code outputs
9c 8c 7d 6d
9c 8c 7h 6h
9c 8c 7s 6s
9d 8d 7c 6c
9d 8d 7h 6h
9d 8d 7s 6s
9h 8h 7c 6c
9h 8h 7d 6d
9h 8h 7s 6s
9s 8s 7c 6c
9s 8s 7d 6d
9s 8s 7h 6h
Represent suits as sparse arrays or lists, numbers as indexes, and hands as associative arrays.
In your example:
H [A[07080000] B[07080000] C[00000000] D[00000000]] (room for four cards)
To get the "real" hands, always apply all 24 permutations (fixed time), so you don't have to care how many suits your hand uses. For A,B,C,D -> c,d,h,s, use the following trick: always store in alphabetical order:
H1 [c[xxxxxx] d[xxxxxx] h[xxxxxx] s[xxxxxx]]
Since hands are associative arrays, duplicated permutations do not generate two different output hands.
#include <stdio.h>
#include <stdlib.h>
const int RANK = 0;
const int SUIT = 1;
const int NUM_SUITS = 4;
const char STANDARD_SUITS[] = "dchs";
int usedSuits[] = {0, 0, 0, 0};
const char MOCK_SUITS[] = "ABCD";
const char BAD_SUIT = '*';
char pullSuit (int i) {
if (usedSuits [i] > 0) {
return BAD_SUIT;
}
++usedSuits [i];
return STANDARD_SUITS [i];
}
void unpullSuit (int i) {
--usedSuits [i];
}
int indexOfSuit (char suit, const char suits[]) {
int i;
for (i = 0; i < NUM_SUITS; ++i) {
if (suit == suits [i]) {
return i;
}
}
return -1;
}
int legitimateSuits (const char suits[]) {
return indexOfSuit (BAD_SUIT, suits) == -1;
}
int distinctSuits (const char suits[]) {
int i, j;
for (i = 0; i < NUM_SUITS; ++i) {
for (j = 0; j < NUM_SUITS; ++j) {
if (i != j && suits [i] == suits [j]) {
return 0;
}
}
}
return 1;
}
void printCards (char* mockCards[], int numMockCards, const char realizedSuits[]) {
int i;
for (i = 0; i < numMockCards; ++i) {
char* mockCard = mockCards [i];
char rank = mockCard [RANK];
char mockSuit = mockCard [SUIT];
int idx = indexOfSuit (mockSuit, MOCK_SUITS);
char realizedSuit = realizedSuits [idx];
printf ("%c%c ", rank, realizedSuit);
}
printf ("\n");
}
/*
* Example usage:
* char* mockCards[] = {"9A", "8A", "7B", "6B"};
* expand (mockCards, 4);
* Note: all 24 full permutations are enumerated, so a hand that uses
* fewer than 4 letters is printed more than once (dedupe if needed).
*/
void expand (char* mockCards[], int numMockCards) {
int i, j, k, l;
for (i = 0; i < NUM_SUITS; ++i) {
char a = pullSuit (i);
for (j = 0; j < NUM_SUITS; ++j) {
char b = pullSuit (j);
for (k = 0; k < NUM_SUITS; ++k) {
char c = pullSuit (k);
for (l = 0; l < NUM_SUITS; ++l) {
char d = pullSuit (l);
char realizedSuits[] = {a, b, c, d};
int legitimate = legitimateSuits (realizedSuits);
if (legitimate) {
int distinct = distinctSuits (realizedSuits);
if (distinct) {
printCards (mockCards, numMockCards, realizedSuits);
}
}
unpullSuit (l);
}
unpullSuit (k);
}
unpullSuit (j);
}
unpullSuit (i);
}
}
int main () {
char* mockCards[] = {"9A", "8A", "7B", "6B"};
expand (mockCards, 4);
return 0;
}

Resources