why “mov” instruction cost so much time

why “mov” instruction cost so much time - performance

I profile my code by linux perf, but the results are not so easy to understand. Why does the mov cost so much time in the following code?
55.85 │ mov %rdx,-0x48(%rbp)
Could anyone help on that?
15.93 │ mov -0x48(%rbp),%rax
0.00 │ mov %rdx,%rcx
│ mov %rdx,-0x50(%rbp)
0.59 │ xor %edx,%edx
0.12 │ div %rsi
6.96 │ mov %rcx,%rax
0.00 │ not %ecx
│ shr $0x3,%rax
0.62 │ and $0x7,%ecx
0.17 │ movzbl 0x20(%r14,%rax,1),%eax
55.85│ mov %rdx,-0x48(%rbp) <= here
0.02 │ sar %cl,%eax
c code is here for reference:
struct resource {
uint64_t magic_num;
uint64_t m;
uint64_t k;
uint64_t count;
unsigned char c_vector[1];
};
#define get_array(v, n) ((v)[(n) >> 3] & (0x1 << (0x7 - ((n) & 0x7))))
int
compute_result(const struct resource *res, const void *key, size_t len)
{
uint32_t i;
uint64_t result[2];
for (i = 0; i < res->k; i++) {
get_x64_result(key, len, i, &result);
result[0] %= res->m;
result[1] %= res->m;
if (!get_array(res->c_vector, result[0])){
return 0;
}
if (!get_array(res->c_vector, result[1])){
return 0;
}
}
return 1;
}
maybe i can check if (result and res->m) < 0xffffffff then use 32bit div, but not sure could help.

Related

C getting a raw keypress with no stdlib

I am working an a very basic operating system for a learning experience, and I am trying to start with key presses. I am making a freestanding executable, so no standard library. How would I go about taking input from a keyboard? I have figured out how to print to the screen through video memory.
/*
* kernel.c
* */
void cls(int line) { // clear the screen
char *vidptr = (char*) 0xb8000;
/* 25 lines each of 80 columns; each element takes 2 bytes */
unsigned int x = 0;
while (x < 80 * 25 * 2) {
// blank character
vidptr[x] = ' ';
// attribute-byte - light grey on black screen
x += 1;
vidptr[x] = 0x07;
x += 1;
}
line = 0;
}
void printf(const char *str, int line, char attr) { // write a string to video memory
char *vidptr = (char*) 0xb8000;
unsigned int y =0, x = 0;
while (str[y] != '\0') {
// the character's ascii
vidptr[x] = str[y];
x++;
// attribute byte - give character black bg and light gray fg
vidptr[x+1] = attr;
x++;
y++;
}
}
void kmain(void) {
unsigned int line = 0;
cls(line);
printf("Testing the Kernel", line, 0x0a);
}
and my assembly:
;; entry point
bits 32 ; nasm directive - 32 bit
global entry
extern _kmain ; kmain is defined in the c file
section .text
entry: jmp start
;multiboot spec
align 4
dd 0x1BADB002 ; black magic
dd 0x00 ; flags
dd -(0x1BADB002 + 0x00) ; checksum. m+f+c should be zero
start:
cli ; block interrupts
mov esp, stack_space ; set stack pointer
call _kmain
hlt ; halt the CPU
section .bss
resb 8192 ; 8KB for stack
stack_space:

Interrupt STM8s issue with SDCC compiler

I want to use interrupts on the SMT8S003K3 (STM8SVL-DISCOVERY) and somehow it does not get recognized when I use the interrupt on a other file where my main() is located.
I made two tests:
- first I modified this code to Standard Peripheral Library:
uint8_t main(){
CLK->CKDIVR=0;
TIM1->PSCRH=0;
TIM1->PSCRL=0x80;
TIM1->CR1=1;
TIM1->IER=1;
GPIOD->DDR = 1;
GPIOD->CR1 = 1;
enableInterrupts();
while(1){
}
}
void TIM1_overflow_Handler() __interrupt(11)
{
TIM1->SR1 &= ~1; //reset interrupt
GPIOD->ODR ^= 1; //toggle LED
}
and it works...
Now to use UnitTests I putted this function into an other file: main_internal (just a simple example):
#ifdef STM8S003
#include "stm8s.h"
#endif
#include "main_internal.h"
void main_internal() {
CLK->CKDIVR=0;
TIM1->PSCRH=0;
TIM1->PSCRL=0x80;
TIM1->CR1=1;
TIM1->IER=1;
GPIOD->DDR = 1;
GPIOD->CR1 = 1;
enableInterrupts();
while(1){
}
}
void TIM1_overflow_Handler() __interrupt(11)
{
TIM1->SR1 &= ~1; //reset interrupt
GPIOD->ODR ^= 1; //toggle LED
}
with the main function:
#include "main_internal.h"
int main(){
main_internal();
return 0;
}
and it does nothing... well it flows around in the deep memory of the STM8S003 (as I have seen with the debugger). Is this a linking issue?
This is my Makefile:
ST_TARGET = main
INCLUDEDIR = Libraries/inc include
ST_LIBSRCDIR = Libraries/src
#ST_LIBSRC = $(ST_LIBSRCDIR)/stm8s_adc1.c $(ST_LIBSRCDIR)/stm8s_awu.c
#ST_LIBSRC += $(ST_LIBSRCDIR)/stm8s_beep.c $(ST_LIBSRCDIR)/stm8s_clk.c
#ST_LIBSRC += $(ST_LIBSRCDIR)/stm8s_exti.c $(ST_LIBSRCDIR)/stm8s_flash.c
#ST_LIBSRC += $(ST_LIBSRCDIR)/stm8s_gpio.c $(ST_LIBSRCDIR)/stm8s_i2c.c
#ST_LIBSRC += $(ST_LIBSRCDIR)/stm8s_itc.c $(ST_LIBSRCDIR)/stm8s_iwdg.c
#ST_LIBSRC += $(ST_LIBSRCDIR)/stm8s_rst.c $(ST_LIBSRCDIR)/stm8s_spi.c
#ST_LIBSRC += $(ST_LIBSRCDIR)/stm8s_tim1.c $(ST_LIBSRCDIR)/stm8s_wwdg.c
SRC_DIR = src
SRC_ALL = $(shell find -L $(SRC_DIR) -name '*.c')
SRCS = $(filter-out $(SRC_DIR)/$(ST_TARGET).c,$(SRC_ALL))
ST_OBJS = $(ST_LIBSRC:.c=.rel)
ST_LIB_FILES := $(addprefix $(ST_BUILD_DIR)/,$(notdir $(ST_OBJS)))
LIBD_FILES := $(addprefix $(ST_DEBUG_DIR)/,$(notdir $(ST_OBJS)))
PR_ST_OBJS = $(SRCS:.c=.rel)
ST_SRC_FILES := $(addprefix $(ST_BUILD_DIR)/,$(notdir $(PR_ST_OBJS)))
SRCD_FILES := $(addprefix $(ST_DEBUG_DIR)/,$(notdir $(PR_ST_OBJS)))
ST_OBJS += $(PR_ST_OBJS)
STD_OBJS= $(ST_OBJS)
MCU = STM8S003
ST_COMPILER = __SDCC__
DEFINES = -D$(ST_COMPILER) -D$(MCU) -DUSE_STDPERIPH_DRIVER
ST_CFLAGS = -mstm8 $(DEFINES)
ST_LDFLAGS = $(addprefix -I ,$(INCLUDEDIR))
ST_DEBUG_FLAGS = --out-fmt-elf --all-callee-saves --debug --stack-auto --fverbose-asm --float-reent --no-peep
IHX = $(ST_BUILD_DIR)/$(ST_TARGET).ihx
ELF = $(ST_DEBUG_DIR)/$(ST_TARGET).elf
$(info $$ srcfiles is [${ST_OBJS}])
$(info $$ srcfiles is [${LIBD_FILES}])
ifdef st-debug
ST_OUT_DIR=$(ST_DEBUG_DIR)
else
ST_OUT_DIR=$(ST_BUILD_DIR)
endif
################### BUILD PROCESS ###################
.PHONY: all st-build st-clean st-flash st-debug st-wipe
#all: clean st-build
st: st-clean st-build st-flash
st-debug: $(STD_OBJS) $(ELF)# ST_CFLAGS+=$(ST_DEBUG_FLAGS)
st-debug: ST_CFLAGS+=$(ST_DEBUG_FLAGS)
st-debug: ST_OUT_DIR=$(ST_DEBUG_DIR)
$(STD_OBJS):
$(ELF): $(SRC_DIR)/$(ST_TARGET).c
#echo $(ST_OUT_DIR)
#ST_CFLAGS += $(ST_DEBUG_FLAGS)
$(ST_CC) $(ST_CFLAGS) $(ST_LDFLAGS) -o $(ST_OUT_DIR)/ $< $(SRCD_FILES) $(LIBD_FILES)
$(SIZE) $#
$(SIZE) $# >> $(MEMORY_USAGE_DIR)/$(MEMORY_USAGE)
#echo Success!!!!\\n\\n
st-build: $(ST_OBJS) $(IHX)
$(ST_OBJS):
$(IHX): $(SRC_DIR)/$(ST_TARGET).c $(SRCS:.c)
$(ST_CC) $(ST_CFLAGS) $(ST_LDFLAGS) -o $(ST_OUT_DIR)/ $< $(ST_SRC_FILES) $(ST_LIB_FILES)
$(SIZE) $#
$(SIZE) $# >> $(MEMORY_USAGE_DIR)/$(MEMORY_USAGE)
#echo Success!!!!\\n\\n
This would be the code without Standard Peripheral library:
#include <stdint.h>
#define CLK_DIVR (*(volatile uint8_t *)0x50C6)
#define TIM1_CR1 (*(volatile uint8_t *)0x5250)
#define TIM1_IER (*(volatile uint8_t *)0x5254)
#define TIM1_SR1 (*(volatile uint8_t *)0x5255)
#define TIM1_CNTRH (*(volatile uint8_t *)0x525E)
#define TIM1_CNTRL (*(volatile uint8_t *)0x525F)
#define TIM1_PSCRH (*(volatile uint8_t *)0x5260)
#define TIM1_PSCRL (*(volatile uint8_t *)0x5261)
#define PD_ODR (*(volatile uint8_t *)0x500f)
#define PD_DDR (*(volatile uint8_t *)0x5011)
#define PD_CR1 (*(volatile uint8_t *)0x5012)
void main(void)
{
CLK_DIVR = 0x00; // Set the frequency to 16 MHz
TIM1_PSCRH = 0x00; // Configure timer
TIM1_PSCRL = 0x80;
TIM1_CR1 = 0x01; //Enable timer
TIM1_IER = 0x01; //Enable interrupt - update event
PD_DDR = 1;
PD_CR1 = 1;
__asm__ ("rim");
while(1){
}
}
void TIM1_overflow_Handler() __interrupt(11)
{
TIM1_SR1 &= ~1; //reset interrupt
PD_ODR ^= 1; //toggle LED
}
Here I have a diff from gdb of the two versions:
15c15
< 00008034: int 0x0080d7 ;0x80d7 <TIM1_overflow_Handler>
---
> 00008034: int 0x000000
44c44
< 00008098: ld A,(0x80fd,X) ;0x80fd <TIM1_overflow_Handler+38>
---
> 00008098: ld A,(0x8114,X) ;0x8114 <TIM1_overflow_Handler+38>
54c54
< 27 CLK_DIVR = 0x00; // Set the frequency to 16 MHz
---
> 8 main_internal();
56,163c56,136
< 000080b1: neg (0x50,SP) ;0x50
< 000080b3: ld A,0x3500 ;0x3500
< 29 TIM1_PSCRH = 0x00; // Configure timer
< 000080b4: mov 0x0052,#0x60 ;0x60
< 30 TIM1_PSCRL = 0x80;
< 000080b8: mov 0x8052,#0x61 ;0x61
< 32 TIM1_CR1 = 0x01; //Enable timer
< 000080bc: mov 0x0152,#0x50 ;0x50
< 33 TIM1_IER = 0x01; //Enable interrupt - update event
< 000080c0: mov 0x0152,#0x54 ;0x54
< 35 PD_DDR = 1;
< 000080c4: mov 0x0150,#0x11 ;0x11
< 36 PD_CR1 = 1;
< 000080c8: mov 0x0150,#0x12 ;0x12
< 37 __asm__ ("rim");
< 000080cc: rim
< 39 while(1){
< 000080cd: jp 0x80cd ;0x80cd <main+41>
< 41 }
< 000080d0: pop 0x0001 ;0x1
< 000080d3: pop 0x0002 ;0x2
< 000080d6: ret
< 43 void TIM1_overflow_Handler() __interrupt(11)
< TIM1_overflow_Handler:
< 000080d7: push 0x0002 ;0x2
< 000080da: push 0x0001 ;0x1
< 000080dd: ldw Y,SP
< 000080df: ldw 0x0001,Y ;0x1
< 45 TIM1_SR1 &= ~1; //reset interrupt
< 000080e3: ldw X,#0x5255 ;0x5255
< 000080e6: ld A,(X)
< 000080e7: and A,#0xfe ;0xfe
< 000080e9: ldw X,#0x5255 ;0x5255
< 000080ec: ld (X),A
< 46 PD_ODR ^= 1; //toggle LED
< 000080ed: ldw X,#0x500f ;0x500f
< 000080f0: ld A,(X)
< 000080f1: xor A,#0x01 ;0x1
< 000080f3: ldw X,#0x500f ;0x500f
< 000080f6: ld (X),A
< 47 }
< 000080f7: pop 0x0001 ;0x1
< 000080fa: pop 0x0002 ;0x2
< 000080fd: iret
< 000080fe: and A,#0xfe ;0xfe
< 00008100: ldw X,#0x5255 ;0x5255
< 00008103: ld (X),A
< 00008104: ldw X,#0x500f ;0x500f
< 00008107: ld A,(X)
< 00008108: xor A,#0x01 ;0x1
< 0000810a: ldw X,#0x500f ;0x500f
< 0000810d: ld (X),A
< 0000810e: pop 0x0001 ;0x1
< 00008111: pop 0x0002 ;0x2
< 00008114: iret
< 00008115: ld (X),A
< 00008116: ldw X,#0x5230 ;0x5230
< 00008119: ld A,(X)
< 0000811a: and A,#0x80 ;0x80
< 0000811c: cp A,#0x80 ;0x80
< 0000811e: jrne 0x8116 ;0x8116
< 00008120: ret
< 00008121: jra 0x8116 ;0x8116
< 00008123: ret
< 00008124: ldw X,#0x5235 ;0x5235
< 00008127: ld A,(X)
< 00008128: or A,#0x20 ;0x20
< 0000812a: ld (X),A
< 0000812b: ret
< 0000812c: ldw X,#0x5235 ;0x5235
< 0000812f: ld A,(X)
< 00008130: or A,#0x20 ;0x20
< 00008132: ld (X),A
< 00008133: ldw X,#0x5230 ;0x5230
< 00008136: ld A,(X)
< 00008137: and A,#0xd0 ;0xd0
< 00008139: ld (X),A
< 0000813a: ldw X,#0x5231 ;0x5231
< 0000813d: ld A,(X)
< 0000813e: ret
< 0000813f: sub SP,#0x08 ;0x8
< 00008141: mov 0x0050,#0xc6 ;0xc6
< 00008145: mov 0x0052,#0x60 ;0x60
< 00008149: mov 0x8052,#0x61 ;0x61
< 0000814d: mov 0x0152,#0x50 ;0x50
< 00008151: mov 0x0152,#0x54 ;0x54
< 00008155: mov 0x0150,#0x11 ;0x11
< 00008159: mov 0x0150,#0x12 ;0x12
< 0000815d: rim
< 0000815e: ldw X,#0x525e ;0x525e
< 00008161: ld A,(X)
< 00008162: ldw X,#0x5231 ;0x5231
< 00008165: ld (X),A
< 00008166: ldw X,#0x5230 ;0x5230
< 00008169: ld A,(X)
< 0000816a: and A,#0x80 ;0x80
< 0000816c: cp A,#0x80 ;0x80
< 0000816e: jrne 0x8166 ;0x8166
< 00008170: ldw X,#0x1388 ;0x1388
< 00008173: clr (0x02,SP) ;0x2
< 00008175: clr (0x01,SP) ;0x1
< 00008177: subw X,#0x0001 ;0x1
< 0000817a: ldw (0x07,SP),X ;0x7
< 0000817c: ld A,(0x02,SP) ;0x2
< 0000817e: sbc A,#0x00
< 00008180: ld (0x06,SP),A ;0x6
< 00008182: ld A,(0x01,SP) ;0x1
< 00008184: sbc A,#0x00
---
> 000080b1: iret
> ......... add A,0x5f ;0x5f
> 9 return 0;
> 000080b3: clrw X
> 10 }
> 000080b4: pop 0x0001 ;0x1
> 000080b7: pop 0x0002 ;0x2
> 000080ba: ret
> main_internal:
> 000080bb: push 0x0002 ;0x2
> 000080be: push 0x0001 ;0x1
> 000080c1: ldw Y,SP
> 000080c3: ldw 0x0001,Y ;0x1
> ......... ...
> 00ffffff: neg (0xa5,SP) ;0xa5
> 01000001: cpl (0xf9,SP) ;0xf9
> 01000003: neg (0xf7,SP) ;0xf7
> 01000005: xor A,0x6f ;0x6f
> 01000007: cpw X,#0xc9ac ;0xc9ac
> 0100000a: scf
> 0100000b: int 0x1ee2a7 ;0x1ee2a7
> 0100000f: sll 0x9b ;0x9b
> 01000011: clr (0x99,SP) ;0x99
> 01000013: jrne 0xffffef ;0xffffef
> 01000015:
> 01000016: addw SP,#0x11 ;0x11
> 01000018: or A,(0xaa,SP) ;0xaa
> 0100001a: sim
> 0100001b: mul X,A
> 0100001c: xor A,(0x3d,SP) ;0x3d
> 0100001e: sub A,0x63 ;0x63
> 01000020: adc A,0x53 ;0x53
> 01000022: ld (0xf8,SP),A ;0xf8
> 01000024: push 0xd95c ;0xd95c
> 01000027: pop A
> 01000028: sbc A,(0x53,SP) ;0x53
> 0100002a: cp A,(0x3e,SP) ;0x3e
> 0100002c: cpl 0x13 ;0x13
> 0100002e: sra (0xc0,X) ;0xc0
> 01000030: call (0xa2c9,X) ;0xa2c9
> 01000033: ld A,XH
> 01000034: tnzw X
> 01000035: subw X,#0x59cf ;0x59cf
> 01000038: ld (0x22,SP),A ;0x22
> 0100003a: decw X
> 0100003b: cpl (0xe9,X) ;0xe9
> 0100003d: and A,(0x7775,X) ;0x7775
> 01000040: sub A,(0x7e1c,X) ;0x7e1c
> 01000043: xor A,(X)
> 01000044: clrw X
> 01000045: swap 0xd5 ;0xd5
> 01000047: rlwa X,A
> 01000048: jreq 0x100006b ;0x100006b
> 0100004a: inc 0x22 ;0x22
> 0100004c:
> 0100004d: sra 0xdf ;0xdf
> 0100004f: cpw X,#0xb73f ;0xb73f
> 01000052: ld (0x0566,X),A ;0x566
> 01000055: sbc A,#0x4c ;0x4c
> 01000057: sll (X)
> 01000058: add A,0x40 ;0x40
> 0100005a: adc A,#0x05 ;0x5
> 0100005c: ld (0xc2,SP),A ;0xc2
> 0100005e: ld XL,A
> 0100005f: incw X
> 01000060: ldw (0xc8fe,X),Y ;0xc8fe
> 01000063: call 0x27e5 ;0x27e5
> 01000066: ldw X,(0xd0,X) ;0xd0
> 01000068: cpw X,0xc159 ;0xc159
> 0100006b: mov 0xc5,0xe8 ;0xe8
> 0100006e: cp A,(X)
> 0100006f: bcp A,(0xcb,SP) ;0xcb
> 01000071: cpw Y,(X)
> 01000072: xor A,(X)
> 01000073: pop CC
> 01000074: swap (0x1b,SP) ;0x1b
> 01000076: ldw X,#0x44cc ;0x44cc
> 01000079: div X,A
> 0100007a: rlc A
> 0100007b: sbc A,0xa8 ;0xa8
> 0100007d: inc (0x9b,X) ;0x9b
Surprisingly the line:
00008034: int 0x0080d7 ;0x80d7
is missing in the second version in the main_internal.
Any help would be appreciated

facepalm
I did not included the interrupt handler in the main_internal.h file:
void TIM1_overflow_Handler() __interrupt(11);
In the SDCC Compiler User Guide page 40 :
If you have multiple source files in your
project, interrupt service routines can be present in any of them, but
a prototype of the isr MUST be present or included in the file that
contains the function main.
So be sure you always make your isr visible to the main function.

Inline ASM: Use of MMX returns NaN seconds on timer

Problem
I am trying to find out whether mmx or xmm registers are faster for copying elements of an array to another array (I know about memcpy() but I need this function for a very specific purpose).
My souce code is below. The relevant function is copyarray(). I can use either mmx or xmm registers with movq or movsd respectively, and the result is correct. However, when I use mmx registers, any timer I use (either clock() or QueryPerformanceCounter) to time the operations returns NaN.
Compiled with: gcc -std=c99 -O2 -m32 -msse3 -mincoming-stack-boundary=2 -mfpmath=sse,387 -masm=intel copyasm.c -o copyasm.exe
This is a very strange bug and I cannot figure out why using mmx registers would cause a timer to return NaN seconds, while using xmm registers in exactly the same code returns a valid time value
EDIT
Results using xmm registers:
Elapsed time: 0.000000 seconds, Gigabytes copied per second: inf GB
Residual = 0.000000
0.937437 0.330424 0.883267 0.118717 0.962493 0.584826 0.344371 0.423719
0.937437 0.330424 0.883267 0.118717 0.962493 0.584826 0.344371 0.423719
Results using mmx register:
Elapsed time: nan seconds, Gigabytes copied per second: inf GB
Residual = 0.000000
0.000000 0.754173 0.615345 0.634724 0.611286 0.547655 0.729637 0.942381
0.935759 0.754173 0.615345 0.634724 0.611286 0.547655 0.729637 0.942381
Source Code
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <x86intrin.h>
#include <windows.h>
inline double
__attribute__ ((gnu_inline))
__attribute__ ((aligned(64))) copyarray(
double* restrict dst,
const double* restrict src,
const int n)
{
// int i = n;
// do {
// *dst++ = *src++;
// i--;
// } while(i);
__asm__ __volatile__
(
"mov ecx, %[n] \n\t"
"mov edi, %[dst] \n\t"
"mov esi, %[src] \n\t"
"xor eax, eax \n\t"
"sub ecx,1 \n\t"
"L%=: \n\t"
"movq mm0, QWORD PTR [esi+ecx*8] \n\t"
"movq QWORD PTR [edi+ecx*8], mm0 \n\t"
"sub ecx, 1 \n\t"
"jge L%= \n\t"
: // no outputs
: // inputs
[dst] "m" (dst),
[src] "m" (src),
[n] "g" (n)
: // register clobber
"eax","ecx","edi","esi",
"mm0"
);
}
void printarray(double* restrict a, int n)
{
for(int i = 0; i < n; ++i) {
printf(" %f ", *(a++));
}
printf("\n");
}
double residual(const double* restrict dst,
const double* restrict src,
const int n)
{
double residual = 0.0;
for(int i = 0; i < n; ++i)
residual += *(dst++) - *(src++);
return(residual);
}
int main()
{
double *A = NULL;
double *B = NULL;
int n = 8;
double memops;
double time3;
clock_t time1;
// LARGE_INTEGER frequency, time1, time2;
// QueryPerformanceFrequency(&frequency);
int trials = 1 << 0;
A = _mm_malloc(n*sizeof(*A), 64);
B = _mm_malloc(n*sizeof(*B), 64);
srand(time(NULL));
for(int i = 0; i < n; ++i)
*(A+i) = (double) rand()/RAND_MAX;
// QueryPerformanceCounter(&time1);
time1 = clock();
for(int i = 0; i < trials; ++i)
copyarray(B,A,n);
// QueryPerformanceCounter(&time2);
// time3 = (double)(time2.QuadPart - time1.QuadPart) / frequency.QuadPart;
time3 = (double) (clock() - time1)/CLOCKS_PER_SEC;
memops = (double) trials*n/time3*sizeof(*A)/1.0e9;
printf("Elapsed time: %f seconds, Gigabytes copied per second: %f GB\n",time3, memops);
printf("Residual = %f\n",residual(B,A,n));
printarray(A,n);
printarray(B,n);
_mm_free(A);
_mm_free(B);
}

You have to be careful when mixing MMX with floating point - use SSE instead if possible. If you must use MMX then read the section titled "MMX - State Management" on this page - note the requirement for the emms instruction after any MMX instructions before you next perform any floating point operations.

efficiently find the first element matching a bit mask

I have a list of N 64-bit integers whose bits represent small sets. Each integer has at most k bits set to 1. Given a bit mask, I would like to find the first element in the list that matches the mask, i.e. element & mask == element.
Example:
If my list is:
index abcdef
0 001100
1 001010
2 001000
3 000100
4 000010
5 000001
6 010000
7 100000
8 000000
and my mask is 111000, the first element matching the mask is at index 2.
Method 1:
Linear search through the entire list. This takes O(N) time and O(1) space.
Method 2:
Precompute a tree of all possible masks, and at each node keep the answer for that mask. This takes O(1) time for the query, but takes O(2^64) space.
Question:
How can I find the first element matching the mask faster than O(N), while still using a reasonable amount of space? I can afford to spend polynomial time in precomputation, because there will be a lot of queries. The key is that k is small. In my application, k <= 5 and N is in the thousands. The mask has many 1s; you can assume that it is drawn uniformly from the space of 64-bit integers.
Update:
Here is an example data set and a simple benchmark program that runs on Linux: http://up.thirld.com/binmask.tar.gz. For large.in, N=3779 and k=3. The first line is N, followed by N unsigned 64-bit ints representing the elements. Compile with make. Run with ./benchmark.e >large.out to create the true output, which you can then diff against. (Masks are generated randomly, but the random seed is fixed.) Then replace the find_first() function with your implementation.
The simple linear search is much faster than I expected. This is because k is small, and so for a random mask, a match is found very quickly on average.

A suffix tree (on bits) will do the trick, with the original priority at the leaf nodes:
000000 -> 8
1 -> 5
10 -> 4
100 -> 3
1000 -> 2
10 -> 1
100 -> 0
10000 -> 6
100000 -> 7
where if the bit is set in the mask, you search both arms, and if not, you search only the 0 arm; your answer is the minimum number you encounter at a leaf node.
You can improve this (marginally) by traversing the bits not in order but by maximum discriminability; in your example, note that 3 elements have bit 2 set, so you would create
2:0 0:0 1:0 3:0 4:0 5:0 -> 8
5:1 -> 5
4:1 5:0 -> 4
3:1 4:0 5:0 -> 3
1:1 3:0 4:0 5:0 -> 6
0:1 1:0 3:0 4:0 5:0 -> 7
2:1 0:0 1:0 3:0 4:0 5:0 -> 2
4:1 5:0 -> 1
3:1 4:0 5:0 -> 0
In your example mask this doesn't help (since you have to traverse both the bit2==0 and bit2==1 sides since your mask is set in bit 2), but on average it will improve the results (but at a cost of setup and more complex data structure). If some bits are much more likely to be set than others, this could be a huge win. If they're pretty close to random within the element list, then this doesn't help at all.
If you're stuck with essentially random bits set, you should get about (1-5/64)^32 benefit from the suffix tree approach on average (13x speedup), which might be better than the difference in efficiency due to using more complex operations (but don't count on it--bit masks are fast). If you have a nonrandom distribution of bits in your list, then you could do almost arbitrarily well.

This is the bitwise Kd-tree. It typically needs less than 64 visits per lookup operation. Currently, the selection of the bit (dimension) to pivot on is random.
#include <limits.h>
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
typedef unsigned long long Thing;
typedef unsigned long Number;
unsigned thing_ffs(Thing mask);
Thing rand_mask(unsigned bitcnt);
#define WANT_RANDOM 31
#define WANT_BITS 3
#define BITSPERTHING (CHAR_BIT*sizeof(Thing))
#define NONUMBER ((Number)-1)
struct node {
Thing value;
Number num;
Number nul;
Number one;
char pivot;
} *nodes = NULL;
unsigned nodecount=0;
unsigned itercount=0;
struct node * nodes_read( unsigned *sizp, char *filename);
Number *find_ptr_to_insert(Number *ptr, Thing value, Thing mask);
unsigned grab_matches(Number *result, Number num, Thing mask);
void initialise_stuff(void);
int main (int argc, char **argv)
{
Thing mask;
Number num;
unsigned idx;
srand (time(NULL));
nodes = nodes_read( &nodecount, argv[1]);
fprintf( stdout, "Nodecount=%u\n", nodecount );
initialise_stuff();
#if WANT_RANDOM
mask = nodes[nodecount/2].value | nodes[nodecount/3].value ;
#else
mask = 0x38;
#endif
fprintf( stdout, "\n#### Search mask=%llx\n", (unsigned long long) mask );
itercount = 0;
num = NONUMBER;
idx = grab_matches(&num,0, mask);
fprintf( stdout, "Itercount=%u\n", itercount );
fprintf(stdout, "KdTree search %16llx\n", (unsigned long long) mask );
fprintf(stdout, "Count=%u Result:\n", idx);
idx = num;
if (idx >= nodecount) idx = nodecount-1;
fprintf( stdout, "num=%4u Value=%16llx\n"
,(unsigned) nodes[idx].num
,(unsigned long long) nodes[idx].value
);
fprintf( stdout, "\nLinear search %16llx\n", (unsigned long long) mask );
for (idx = 0; idx < nodecount; idx++) {
if ((nodes[idx].value & mask) == nodes[idx].value) break;
}
fprintf(stdout, "Cnt=%u\n", idx);
if (idx >= nodecount) idx = nodecount-1;
fprintf(stdout, "Num=%4u Value=%16llx\n"
, (unsigned) nodes[idx].num
, (unsigned long long) nodes[idx].value );
return 0;
}
void initialise_stuff(void)
{
unsigned num;
Number root, *ptr;
root = 0;
for (num=0; num < nodecount; num++) {
nodes[num].num = num;
nodes[num].one = NONUMBER;
nodes[num].nul = NONUMBER;
nodes[num].pivot = -1;
}
nodes[num-1].value = 0; /* last node is guaranteed to match anything */
root = 0;
for (num=1; num < nodecount; num++) {
ptr = find_ptr_to_insert (&root, nodes[num].value, 0ull );
if (*ptr == NONUMBER) *ptr = num;
else fprintf(stderr, "Found %u for %u\n"
, (unsigned)*ptr, (unsigned) num );
}
}
Thing rand_mask(unsigned bitcnt)
{struct node * nodes_read( unsigned *sizp, char *filename)
{
struct node *ptr;
unsigned size,used;
FILE *fp;
if (!filename) {
size = (WANT_RANDOM+0) ? WANT_RANDOM : 9;
ptr = malloc (size * sizeof *ptr);
#if (!WANT_RANDOM)
ptr[0].value = 0x0c;
ptr[1].value = 0x0a;
ptr[2].value = 0x08;
ptr[3].value = 0x04;
ptr[4].value = 0x02;
ptr[5].value = 0x01;
ptr[6].value = 0x10;
ptr[7].value = 0x20;
ptr[8].value = 0x00;
#else
for (used=0; used < size; used++) {
ptr[used].value = rand_mask(WANT_BITS);
}
#endif /* WANT_RANDOM */
*sizp = size;
return ptr;
}
fp = fopen( filename, "r" );
if (!fp) return NULL;
fscanf(fp,"%u\n", &size );
fprintf(stderr, "Size=%u\n", size);
ptr = malloc (size * sizeof *ptr);
for (used = 0; used < size; used++) {
fscanf(fp,"%llu\n", &ptr[used].value );
}
fclose( fp );
*sizp = used;
return ptr;
}
Thing value = 0;
unsigned bit, cnt;
for (cnt=0; cnt < bitcnt; cnt++) {
bit = 54321*rand();
bit %= BITSPERTHING;
value |= 1ull << bit;
}
return value;
}
Number *find_ptr_to_insert(Number *ptr, Thing value, Thing done)
{
Number num=NONUMBER;
while ( *ptr != NONUMBER) {
Thing wrong;
num = *ptr;
wrong = (nodes[num].value ^ value) & ~done;
if (nodes[num].pivot < 0) { /* This node is terminal */
/* choose one of the wrong bits for a pivot .
** For this bit (nodevalue==1 && searchmask==0 )
*/
if (!wrong) wrong = ~done ;
nodes[num].pivot = thing_ffs( wrong );
}
ptr = (wrong & 1ull << nodes[num].pivot) ? &nodes[num].nul : &nodes[num].one;
/* Once this bit has been tested, it can be masked off. */
done |= 1ull << nodes[num].pivot ;
}
return ptr;
}
unsigned grab_matches(Number *result, Number num, Thing mask)
{
Thing wrong;
unsigned count;
for (count=0; num < *result; ) {
itercount++;
wrong = nodes[num].value & ~mask;
if (!wrong) { /* we have a match */
if (num < *result) { *result = num; count++; }
/* This is cheap pruning: the break will omit both subtrees from the results.
** But because we already have a result, and the subtrees have higher numbers
** than our current num, we can ignore them. */
break;
}
if (nodes[num].pivot < 0) { /* This node is terminal */
break;
}
if (mask & 1ull << nodes[num].pivot) {
/* avoid recursion if there is only one non-empty subtree */
if (nodes[num].nul >= *result) { num = nodes[num].one; continue; }
if (nodes[num].one >= *result) { num = nodes[num].nul; continue; }
count += grab_matches(result, nodes[num].nul, mask);
count += grab_matches(result, nodes[num].one, mask);
break;
}
mask |= 1ull << nodes[num].pivot;
num = (wrong & 1ull << nodes[num].pivot) ? nodes[num].nul : nodes[num].one;
}
return count;
}
unsigned thing_ffs(Thing mask)
{
unsigned bit;
#if 1
if (!mask) return (unsigned)-1;
for ( bit=random() % BITSPERTHING; 1 ; bit += 5, bit %= BITSPERTHING) {
if (mask & 1ull << bit ) return bit;
}
#elif 0
for (bit =0; bit < BITSPERTHING; bit++ ) {
if (mask & 1ull <<bit) return bit;
}
#else
mask &= (mask-1); // Kernighan-trick
for (bit =0; bit < BITSPERTHING; bit++ ) {
mask >>=1;
if (!mask) return bit;
}
#endif
return 0xffffffff;
}
struct node * nodes_read( unsigned *sizp, char *filename)
{
struct node *ptr;
unsigned size,used;
FILE *fp;
if (!filename) {
size = (WANT_RANDOM+0) ? WANT_RANDOM : 9;
ptr = malloc (size * sizeof *ptr);
#if (!WANT_RANDOM)
ptr[0].value = 0x0c;
ptr[1].value = 0x0a;
ptr[2].value = 0x08;
ptr[3].value = 0x04;
ptr[4].value = 0x02;
ptr[5].value = 0x01;
ptr[6].value = 0x10;
ptr[7].value = 0x20;
ptr[8].value = 0x00;
#else
for (used=0; used < size; used++) {
ptr[used].value = rand_mask(WANT_BITS);
}
#endif /* WANT_RANDOM */
*sizp = size;
return ptr;
}
fp = fopen( filename, "r" );
if (!fp) return NULL;
fscanf(fp,"%u\n", &size );
fprintf(stderr, "Size=%u\n", size);
ptr = malloc (size * sizeof *ptr);
for (used = 0; used < size; used++) {
fscanf(fp,"%llu\n", &ptr[used].value );
}
fclose( fp );
*sizp = used;
return ptr;
}
UPDATE:
I experimented a bit with the pivot-selection, favouring bits with the highest discriminatory value ("information content"). This involves:
making a histogram of the usage of bits (can be done while initialising)
while building the tree: choosing the one with frequency closest to 1/2 in the remaining subtrees.
The result: the random pivot selection performed better.

Construct a a binary tree as follows:
Every level corresponds to a bit
It corresponding bit is on go right, otherwise left
This way insert every number in the database.
Now, for searching: if the corresponding bit in the mask is 1, traverse both children. If it is 0, traverse only the left node. Essentially keep traversing the tree until you hit the leaf node (BTW, 0 is a hit for every mask!).
This tree will have O(N) space requirements.
Eg of tree for 1 (001), 2(010) and 5 (101)
root
/ \
0 1
/ \ |
0 1 0
| | |
1 0 1
(1) (2) (5)

With precomputed bitmasks. Formally is is still O(N), since the and-mask operations are O(N). The final pass is also O(N), because it needs to find the lowest bit set, but that could be sped up, too.
#include <limits.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
/* For demonstration purposes.
** In reality, this should be an unsigned long long */
typedef unsigned char Thing;
#define BITSPERTHING (CHAR_BIT*sizeof (Thing))
#define COUNTOF(a) (sizeof a / sizeof a[0])
Thing data[] =
/****** index abcdef */
{ 0x0c /* 0 001100 */
, 0x0a /* 1 001010 */
, 0x08 /* 2 001000 */
, 0x04 /* 3 000100 */
, 0x02 /* 4 000010 */
, 0x01 /* 5 000001 */
, 0x10 /* 6 010000 */
, 0x20 /* 7 100000 */
, 0x00 /* 8 000000 */
};
/* Note: this is for demonstration purposes.
** Normally, one should choose a machine wide unsigned int
** for bitmask arrays.
*/
struct bitmap {
char data[ 1+COUNTOF (data)/ CHAR_BIT ];
} nulmaps [ BITSPERTHING ];
#define BITSET(a,i) (a)[(i) / CHAR_BIT ] |= (1u << ((i)%CHAR_BIT) )
#define BITTEST(a,i) ((a)[(i) / CHAR_BIT ] & (1u << ((i)%CHAR_BIT) ))
void init_tabs(void);
void map_empty(struct bitmap *dst);
void map_full(struct bitmap *dst);
void map_and2(struct bitmap *dst, struct bitmap *src);
int main (void)
{
Thing mask;
struct bitmap result;
unsigned ibit;
mask = 0x38;
init_tabs();
map_full(&result);
for (ibit = 0; ibit < BITSPERTHING; ibit++) {
/* bit in mask is 1, so bit at this position is in fact a don't care */
if (mask & (1u <<ibit)) continue;
/* bit in mask is 0, so we can only select items with a 0 at this bitpos */
map_and2(&result, &nulmaps[ibit] );
}
/* This is not the fastest way to find the lowest 1 bit */
for (ibit = 0; ibit < COUNTOF (data); ibit++) {
if (!BITTEST(result.data, ibit) ) continue;
fprintf(stdout, " %u", ibit);
}
fprintf( stdout, "\n" );
return 0;
}
void init_tabs(void)
{
unsigned ibit, ithing;
/* 1 bits in data that dont overlap with 1 bits in the searchmask are showstoppers.
** So, for each bitpos, we precompute a bitmask of all *entrynumbers* from data[], that contain 0 in bitpos.
*/
memset(nulmaps, 0 , sizeof nulmaps);
for (ithing=0; ithing < COUNTOF(data); ithing++) {
for (ibit=0; ibit < BITSPERTHING; ibit++) {
if ( data[ithing] & (1u << ibit) ) continue;
BITSET(nulmaps[ibit].data, ithing);
}
}
}
/* Logical And of two bitmask arrays; simular to dst &= src */
void map_and2(struct bitmap *dst, struct bitmap *src)
{
unsigned idx;
for (idx = 0; idx < COUNTOF(dst->data); idx++) {
dst->data[idx] &= src->data[idx] ;
}
}
void map_empty(struct bitmap *dst)
{
memset(dst->data, 0 , sizeof dst->data);
}
void map_full(struct bitmap *dst)
{
unsigned idx;
/* NOTE this loop sets too many bits to the left of COUNTOF(data) */
for (idx = 0; idx < COUNTOF(dst->data); idx++) {
dst->data[idx] = ~0;
}
}

clear all but the two most significant set bits in a word

Given an 32 bit int which is known to have at least 2 bits set, is there a way to efficiently clear all except the 2 most significant set bits? i.e. I want to ensure the output has exactly 2 bits set.
What if the input is guaranteed to have only 2 or 3 bits set.?
Examples:
0x2040 -> 0x2040
0x0300 -> 0x0300
0x0109 -> 0x0108
0x5040 -> 0x5000
Benchmarking Results:
Code:
QueryPerformanceFrequency(&freq);
/***********/
value = (base =2)|1;
QueryPerformanceCounter(&start);
for (l=0;l<A_LOT; l++)
{
//!!value calculation goes here
junk+=value; //use result to prevent optimizer removing it.
//advance to the next 2|3 bit word
if (value&0x80000000)
{ if (base&0x80000000)
{ base=6;
}
base*=2;
value=base|1;
}
else
{ value<<=1;
}
}
QueryPerformanceCounter(&end);
time = (end.QuadPart - start.QuadPart);
time /= freq.QuadPart;
printf("--------- name\n");
printf("%ld loops took %f sec (%f additional)\n",A_LOT, time, time-baseline);
printf("words /sec = %f Million\n",A_LOT/(time-baseline)/1.0e6);
Results on using VS2005 default release settings on Core2Duo E7500#2.93 GHz:
--------- BASELINE
1000000 loops took 0.001630 sec
--------- sirgedas
1000000 loops took 0.002479 sec (0.000849 additional)
words /sec = 1178.074206 Million
--------- ashelly
1000000 loops took 0.004640 sec (0.003010 additional)
words /sec = 332.230369 Million
--------- mvds
1000000 loops took 0.005250 sec (0.003620 additional)
words /sec = 276.242030 Million
--------- spender
1000000 loops took 0.009594 sec (0.007964 additional)
words /sec = 125.566361 Million
--------- schnaader
1000000 loops took 0.025680 sec (0.024050 additional)
words /sec = 41.580158 Million

If the input is guaranteed to have exactly 2 or 3 bits then the answer can be computed very quickly. We exploit the fact that the expression x&(x-1) is equal to x with the LSB cleared. Applying that expression twice to the input will produce 0, if 2 or fewer bits are set. If exactly 2 bits are set, we return the original input. Otherwise, we return the original input with the LSB cleared.
Here is the code in C++:
// assumes a has exactly 2 or 3 bits set
int topTwoBitsOf( int a )
{
int b = a&(a-1); // b = a with LSB cleared
return b&(b-1) ? b : a; // check if clearing the LSB of b produces 0
}
This can be written as a confusing single expression, if you like:
int topTwoBitsOf( int a )
{
return a&(a-1)&((a&(a-1))-1) ? a&(a-1) : a;
}

I'd create a mask in a loop. At the beginning, the mask is 0. Then go from the MSB to the LSB and set each corresponding bit in the mask to 1 until you found 2 set bits. Finally AND the value with this mask.
#include <stdio.h>
#include <stdlib.h>
int clear_bits(int value) {
unsigned int mask = 0;
unsigned int act_bit = 0x80000000;
unsigned int bit_set_count = 0;
do {
if ((value & act_bit) == act_bit) bit_set_count++;
mask = mask | act_bit;
act_bit >>= 1;
} while ((act_bit != 0) && (bit_set_count < 2));
return (value & mask);
}
int main() {
printf("0x2040 => %X\n", clear_bits(0x2040));
printf("0x0300 => %X\n", clear_bits(0x0300));
printf("0x0109 => %X\n", clear_bits(0x0109));
printf("0x5040 => %X\n", clear_bits(0x5040));
return 0;
}
This is quite complicated, but should be more efficient as using a for loop over the 32 bits every time (and clear all bits except the 2 most significant set ones). Anyway, be sure to benchmark different ways before using one.
Of course, if memory is not a problem, use a lookup table approach like some recommended - this will be much faster.

how much memory is available at what latency? I would propose a lookup table ;-)
but seriously: if you would perform this on 100s of numbers, an 8 bit lookup table giving 2 msb and another 8 bit lookup table giving 1 msb may be all you need. Depending on the processor this might beat really counting bits.
For speed, I would create a lookup table mapping an input byte to
M(I)=0 if 1 or 0 bits set
M(I)=B' otherwise, where B' is the value of B with the 2 msb bits set.
Your 32 bit int are 4 input bytes I1 I2 I3 I4.
Lookup M(I1), if nonzero, you're done.
Compare M(I1)==0, if zero, repeat previous step for I2.
Else, lookup I2 in a second lookup table with 1 MSB bits, if nonzero, you're done.
Else, repeat previous step for I3.
etc etc. Don't actually loop anything over I1-4 but unroll it fully.
Summing up: 2 lookup tables with 256 entries, 247/256 of cases are resolved with one lookup, approx 8/256 with two lookups, etc.
edit: the tables, for clarity (input, bits table 2 MSB, bits table 1 MSB)
I table2 table1
0 00000000 00000000
1 00000000 00000001
2 00000000 00000010
3 00000011 00000010
4 00000000 00000100
5 00000101 00000100
6 00000110 00000100
7 00000110 00000100
8 00000000 00001000
9 00001001 00001000
10 00001010 00001000
11 00001010 00001000
12 00001100 00001000
13 00001100 00001000
14 00001100 00001000
15 00001100 00001000
16 00000000 00010000
17 00010001 00010000
18 00010010 00010000
19 00010010 00010000
20 00010100 00010000
..
250 11000000 10000000
251 11000000 10000000
252 11000000 10000000
253 11000000 10000000
254 11000000 10000000
255 11000000 10000000

Here's another attempt (no loops, no lookup, no conditionals). This time it works:
var orig=0x109;
var x=orig;
x |= (x >> 1);
x |= (x >> 2);
x |= (x >> 4);
x |= (x >> 8);
x |= (x >> 16);
x = orig & ~(x & ~(x >> 1));
x |= (x >> 1);
x |= (x >> 2);
x |= (x >> 4);
x |= (x >> 8);
x |= (x >> 16);
var solution=orig & ~(x >> 1);
Console.WriteLine(solution.ToString("X")); //0x108
Could probably be shortened by someone cleverer than me.

Following up on my previous answer, here's the complete implementation. I think it is as fast as it can get. (sorry for unrolling the whole thing ;-)
#include <stdio.h>
unsigned char bittable1[256];
unsigned char bittable2[256];
unsigned int lookup(unsigned int);
void gentable(void);
int main(int argc,char**argv)
{
unsigned int challenge = 0x42341223, result;
gentable();
if ( argc > 1 ) challenge = atoi(argv[1]);
result = lookup(challenge);
printf("%08x --> %08x\n",challenge,result);
}
unsigned int lookup(unsigned int i)
{
unsigned int ret;
ret = bittable2[i>>24]<<24; if ( ret ) return ret;
ret = bittable1[i>>24]<<24;
if ( !ret )
{
ret = bittable2[i>>16]<<16; if ( ret ) return ret;
ret = bittable1[i>>16]<<16;
if ( !ret )
{
ret = bittable2[i>>8]<<8; if ( ret ) return ret;
ret = bittable1[i>>8]<<8;
if ( !ret )
{
return bittable2[i] | bittable1[i];
} else {
return (ret | bittable1[i&0xff]);
}
} else {
if ( bittable1[(i>>8)&0xff] )
{
return (ret | (bittable1[(i>>8)&0xff]<<8));
} else {
return (ret | bittable1[i&0xff]);
}
}
} else {
if ( bittable1[(i>>16)&0xff] )
{
return (ret | (bittable1[(i>>16)&0xff]<<16));
} else if ( bittable1[(i>>8)&0xff] ) {
return (ret | (bittable1[(i>>8)&0xff]<<8));
} else {
return (ret | (bittable1[i&0xff]));
}
}
}
void gentable()
{
int i;
for ( i=0; i<256; i++ )
{
int bitset = 0;
int j;
for ( j=128; j; j>>=1 )
{
if ( i&j )
{
bitset++;
if ( bitset == 1 ) bittable1[i] = i&(~(j-1));
else if ( bitset == 2 ) bittable2[i] = i&(~(j-1));
}
}
//printf("%3d %02x %02x\n",i,bittable1[i],bittable2[i]);
}
}

Using a variation of this, I came up with the following:
var orig=56;
var x=orig;
x |= (x >> 1);
x |= (x >> 2);
x |= (x >> 4);
x |= (x >> 8);
x |= (x >> 16);
Console.WriteLine(orig&~(x>>2));
In c# but should translate easily.
EDIT
I'm not so sure I've answered your question. This takes the highest bit and preserves it and the bit next to it, eg. 101 => 100

Here's some python that should work:
def bit_play(num):
bits_set = 0
upper_mask = 0
bit_index = 31
while bit_index >= 0:
upper_mask |= (1 << bit_index)
if num & (1 << bit_index) != 0:
bits_set += 1
if bits_set == 2:
num &= upper_mask
break
bit_index -= 1
return num
It makes one pass over the number. It builds a mask of the bits that it crosses so it can mask off the bottom bits as soon as it hits the second-most significant one. As soon as it finds the second bit, it proceeds to clear the lower bits. You should be able to create a mask of the upper bits and &= it in instead of the second while loop. Maybe I'll hack that in and edit the post.

I'd also use a table based approach, but I believe one table alone should be sufficient. Take the 4 bit case as an example. If you're input is guaranteed to have 2 or 3 bits, then your output can only be one of 6 values
0011
0101
0110
1001
1010
1100
Put these possible values in an array sorted by size. Starting with the largest, find the first value which is equal to or less than your target value. This is your answer. For the 8 bit version you'll have more possible return values, but still easily less than the maximum possible permutations of 8*7.
public static final int [] MASKS = {
0x03, //0011
0x05, //0101
0x06, //0110
0x09, //1001
0x0A, //1010
0x0C, //1100
};
for (int i = 0; i < 16; ++i) {
if (countBits(i) < 2) {
continue;
}
for (int j = MASKS.length - 1; j >= 0; --j) {
if (MASKS[j] <= i) {
System.out.println(Integer.toBinaryString(i) + " " + Integer.toBinaryString(MASKS[j]));
break;
}
}
}

Here's my implementation in C#
uint OnlyMostSignificant(uint value, int count) {
uint newValue = 0;
int c = 0;
for(uint high = 0x80000000; high != 0 && c < count; high >>= 1) {
if ((value & high) != 0) {
newValue = newValue | high;
c++;
}
}
return newValue;
}
Using count, you could make it the most significant (count) bits.

My solution:
Use "The best method for counting bits in a 32-bit integer", then clear the lower bit if the answer is 3. Only works when input is limited to 2 or 3 bits set.
unsigned int c; // c is the total bits set in v
unsigned int v = value;
v = v - ((v >> 1) & 0x55555555);
v = (v & 0x33333333) + ((v >> 2) & 0x33333333); // temp
c = ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; // count
crc+=value&value-(c-2);

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio