Related
golang
input := []uint{1,2,3,4,5,6}
o := C.fixU32_encode((*C.uint)(unsafe.Pointer(&input[0])), C.size_t(len(input)))
return C.GoString(o)
c
char* fixU32_encode(unsigned int* ptr,size_t length);
rust
pub extern "C" fn fixU32_encode(ptr: *const u32, length: libc::size_t) -> *const libc::c_char {
assert!(!ptr.is_null());
let slice = unsafe {
std::slice::from_raw_parts(ptr, length as usize)
};
println!("{:?}", slice);// there will print [1,0,2,0,3,0]
println!("{:?}", length);
let mut arr = [0u32; 6];
for (&x, p) in slice.iter().zip(arr.iter_mut()) {
*p = x;
}
CString::new(hex::encode(arr.encode())).unwrap().into_raw()
}
This will be passed in, but the array received by rust is like this
[1,0,2,0,3,0]
In Go an uint is 64bit (see https://golangbyexample.com/go-size-range-int-uint/). As a result, you are storing 64bit integers in input.
The C Code and Rust code treats the input now was 32bit unsigned integers (in little endian format). So the first input of 0x1 in 64bit:
00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000001
becomes 0x1 and 0x0 respectively. Due to little endian the least significant bits are read first.
You want to be specific in Go to use 32bit using uint32 or ensure your C code matches the machine dependent integers types in Go.
I am trying to develop a kernel module that hooks system call. I'm testing on Raspberry Pi 3B, running raspbian buster Linux 4.19.97-v7+ armv7l.
So typically on x86 we can overwrite CR0 register but there is no similar register on ARM architecture. I tried to do it via set_memory_rw and then enabling it before exiting using set_memory_ro like one answer to a similar question at Cannot use set_memory_rw in Linux kernel on ARM64
but it doesn't work.
My code:
// SPDX-License-Identifier: GPL-3.0
#include <linux/init.h> // module_{init,exit}()
#include <linux/module.h> // THIS_MODULE, MODULE_VERSION, ...
#include <linux/kernel.h> // printk(), pr_*()
#include <linux/kallsyms.h> // kallsyms_lookup_name()
#include <asm/syscall.h> // syscall_fn_t, __NR_*
#include <asm/ptrace.h> // struct pt_regs
#include <asm/tlbflush.h> // flush_tlb_kernel_range()
#include <asm/pgtable.h> // {clear,set}_pte_bit(), set_pte()
#include <linux/vmalloc.h> // vm_unmap_aliases()
#include <linux/mm.h> // struct mm_struct, apply_to_page_range()
#include <linux/kconfig.h> // IS_ENABLED()
#ifdef pr_fmt
#undef pr_fmt
#endif
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
static struct mm_struct *init_mm_ptr;
static void* *syscall_table;
#define MAGIC "mamaliga"
#define SIZEOF_MAGIC 8
#define ROOTKIT_SYS_CALL_TABLE 0x801011c4
int pos;
// static void* original_read;
asmlinkage long (*original_read)(unsigned int fd, char __user *buf, size_t count);
/********** HELPERS **********/
// From arch/arm/mm/pageattr.c.
struct page_change_data {
pgprot_t set_mask;
pgprot_t clear_mask;
};
static int change_page_range(pte_t *ptep, pgtable_t token, unsigned long addr,
void *data)
{
struct page_change_data *cdata = data;
pte_t pte = *ptep;
pte = clear_pte_bit(pte, cdata->clear_mask);
pte = set_pte_bit(pte, cdata->set_mask);
set_pte_ext(ptep, pte, 0);
return 0;
}
void (*flush)(unsigned long start, unsigned long end);
// From arch/arm64/mm/pageattr.c.
static int __change_memory_common(unsigned long start, unsigned long size,
pgprot_t set_mask, pgprot_t clear_mask)
{
struct page_change_data data;
int ret;
data.set_mask = set_mask;
data.clear_mask = clear_mask;
ret = apply_to_page_range(init_mm_ptr, start, size, change_page_range, &data);
flush = (void*)kallsyms_lookup_name("flush_tlb_kernel_range");
flush(start, start + size);
return ret;
}
// Simplified set_memory_rw() from arch/arm/mm/pageattr.c.
static int set_page_rw(unsigned long addr)
{
vm_unmap_aliases();
return __change_memory_common(addr, PAGE_SIZE,
__pgprot(0),
__pgprot(L_PTE_RDONLY));
}
// Simplified set_memory_ro() from arch/arm/mm/pageattr.c.
static int set_page_ro(unsigned long addr)
{
vm_unmap_aliases();
return __change_memory_common(addr, PAGE_SIZE,
__pgprot(L_PTE_RDONLY),
__pgprot(0));
}
/********** ACTUAL MODULE **********/
asmlinkage long myread(unsigned int fd, char __user *buf, size_t count)
{
long ret;
/* Call original read_syscall */
ret = original_read(fd, buf, count);
pr_info("Hooked!\n");
return ret;
}
static int __init modinit(void)
{
int res;
pr_info("init\n");
// Shouldn't fail.
init_mm_ptr = (struct mm_struct *)kallsyms_lookup_name("init_mm");
syscall_table = (void* *)kallsyms_lookup_name("sys_call_table");
printk(KERN_INFO "syscall_table: 0xx%llx\n", syscall_table);
original_read = syscall_table[__NR_read];
res = set_page_rw(((unsigned long)syscall_table + __NR_read) & PAGE_MASK);
if (res != 0) {
pr_err("set_page_rw() failed: %d\n", res);
return res;
}
else {
pr_info("set_page_rw() OK");
}
syscall_table[__NR_read] = myread;
res = set_page_ro(((unsigned long)syscall_table + __NR_read) & PAGE_MASK);
if (res != 0) {
pr_err("set_page_ro() failed: %d\n", res);
return res;
}
else {
pr_info("set_page_ro() OK");
}
pr_info("init done\n");
return 0;
}
static void __exit modexit(void)
{
int res;
pr_info("exit\n");
res = set_page_rw(((unsigned long)syscall_table + __NR_read) & PAGE_MASK);
if (res != 0) {
pr_err("set_page_rw() failed: %d\n", res);
return;
}
syscall_table[__NR_read] = original_read;
res = set_page_ro(((unsigned long)syscall_table + __NR_read) & PAGE_MASK);
if (res != 0)
pr_err("set_page_ro() failed: %d\n", res);
pr_info("goodbye\n");
}
module_init(modinit);
module_exit(modexit);
MODULE_VERSION("0.1");
MODULE_DESCRIPTION("Syscall hijack on arm64.");
MODULE_AUTHOR("Marco Bonelli");
MODULE_LICENSE("GPL");
Dmesg trace errors as follow
[ 89.767769] interceptor: loading out-of-tree module taints kernel.
[ 89.771442] interceptor: init
[ 89.806435] syscall_table: 0xx75c5dda175c5dda1
[ 89.812886] interceptor: set_page_rw() OK
[ 89.812898] Unable to handle kernel paging request at virtual address 801011d0
[ 89.822616] pgd = c9de9ec3
[ 89.826529] [801011d0] *pgd=0001141e(bad)
[ 89.831760] Internal error: Oops: 80d [#1] SMP ARM
[ 89.837763] Modules linked in: interceptor(O+) cmac bnep hci_uart btbcm serdev bluetooth ecdh_generic binfmt_misc evdev brcmfmac brcmutil sha256_generic cfg80211 raspberrypi_hwmon rfkill hwmon bcm2835_codec(C) bcm2835_v4l2(C) snd_bcm2835(C) v4l2_mem2mem snd_pcm bcm2835_mmal_vchiq(C) v4l2_common videobuf2_dma_contig snd_timer videobuf2_vmalloc videobuf2_memops videobuf2_v4l2 videobuf2_common snd videodev media vc_sm_cma(C) fixed uio_pdrv_genirq uio ip_tables x_tables ipv6
[ 89.887532] CPU: 1 PID: 981 Comm: insmod Tainted: G C O 4.19.97-v7+ #1293
[ 89.898046] Hardware name: BCM2835
[ 89.902693] PC is at modinit+0xb0/0x1000 [interceptor]
[ 89.909084] LR is at (null)
[ 89.913214] pc : [<7f7430b0>] lr : [<00000000>] psr: 60000013
[ 89.920704] sp : b20e5d80 ip : 80d0517c fp : b20e5d9c
[ 89.927176] r10: b5ab3340 r9 : 00000002 r8 : b5ab3300
[ 89.933650] r7 : 00000000 r6 : fffff000 r5 : 00000000 r4 : 801011c4
[ 89.941469] r3 : 7f73e068 r2 : 75c5dda1 r1 : 00000000 r0 : 0000001d
[ 89.949256] Flags: nZCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment user
[ 89.957653] Control: 10c5383d Table: 320ec06a DAC: 00000055
[ 89.964666] Process insmod (pid: 981, stack limit = 0x322dc319)
[ 89.971873] Stack: (0xb20e5d80 to 0xb20e6000)
[ 89.977495] 5d80: 7f740000 7f743000 80d04d48 00000000 b20e5e14 b20e5da0 8010312c 7f74300c
[ 89.988192] 5da0: 802821d4 8085dffc 00000000 006000c0 b20e5dcc b20e5dc0 8085dffc 802ba274
[ 89.998889] 5dc0: b20e5e14 b20e5dd0 802ba274 802c7118 802bb6cc 802bab54 00000001 00003c76
[ 90.009578] 5de0: 00000000 a0000013 bccc8000 75c5dda1 7f740000 7f740000 7f740000 b7c04880
[ 90.020281] 5e00: 80d04d48 b5ab3300 b20e5e3c b20e5e18 801ba19c 801030e8 b20e5e3c b20e5e28
[ 90.031099] 5e20: 802a8250 b20e5f30 7f740000 00000002 b20e5f0c b20e5e40 801b9114 801ba134
[ 90.042029] 5e40: 7f74000c 00007fff 7f740000 801b6100 00000000 80ae4f00 7f7401fc 7f740114
[ 90.053063] 5e60: 7f740130 00000000 b5ab3308 7f740048 b20e5e94 80afd2d0 802d061c 802d0488
[ 90.064243] 5e80: b20e5ea0 b21029c0 00000000 00000000 00000000 00000000 00000000 00000000
[ 90.075471] 5ea0: 6e72656b 00006c65 00000000 00000000 00000000 00000000 00000000 00000000
[ 90.086778] 5ec0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 75c5dda1
[ 90.098119] 5ee0: 7fffffff 80d04d48 00000000 00000003 0002d064 7fffffff 00000000 0000017b
[ 90.109451] 5f00: b20e5fa4 b20e5f10 801b9974 801b7360 7fffffff 00000000 00000003 00000000
[ 90.120790] 5f20: 00000000 bccc8000 0000209c 00000000 bccc84e2 bccc89c0 bccc8000 0000209c
[ 90.132121] 5f40: bccc9a5c bccc98a8 bccc92a0 00003000 000032c0 00000000 00000000 00000000
[ 90.143457] 5f60: 000018d8 00000025 00000026 0000001d 0000001b 00000017 00000000 75c5dda1
[ 90.154784] 5f80: 010d6c00 7e9317d4 0003fce8 0000017b 801011c4 b20e4000 00000000 b20e5fa8
[ 90.166119] 5fa0: 80101000 801b98c4 010d6c00 7e9317d4 00000003 0002d064 00000000 00000004
[ 90.177457] 5fc0: 010d6c00 7e9317d4 0003fce8 0000017b 01355818 00000000 00000002 00000000
[ 90.188783] 5fe0: 7e931608 7e9315f8 00022cb8 76cadaf0 60000010 00000003 00000000 00000000
[ 90.200155] [<7f7430b0>] (modinit [interceptor]) from [<8010312c>] (do_one_initcall+0x50/0x218)
[ 90.212033] [<8010312c>] (do_one_initcall) from [<801ba19c>] (do_init_module+0x74/0x220)
[ 90.223280] [<801ba19c>] (do_init_module) from [<801b9114>] (load_module+0x1dc0/0x2404)
[ 90.234455] [<801b9114>] (load_module) from [<801b9974>] (sys_finit_module+0xbc/0xcc)
[ 90.245459] [<801b9974>] (sys_finit_module) from [<80101000>] (ret_fast_syscall+0x0/0x28)
[ 90.256813] Exception stack(0xb20e5fa8 to 0xb20e5ff0)
[ 90.263464] 5fa0: 010d6c00 7e9317d4 00000003 0002d064 00000000 00000004
[ 90.274748] 5fc0: 010d6c00 7e9317d4 0003fce8 0000017b 01355818 00000000 00000002 00000000
[ 90.286023] 5fe0: 7e931608 7e9315f8 00022cb8 76cadaf0
[ 90.292627] Code: eb28f040 e594400c e30e3068 e3473f73 (e584300c)
[ 90.300277] ---[ end trace 9daed852fe9a568f ]---
I also tried some other suggestions from ARM64 - Linux Memory Write protection won't disable which disable the Memory Write protection via the corresponding PTE to an virtual address by using the Linux Kernel Functions. it doesn't work either.
#include <linux/module.h> /* Needed by all modules */
#include <linux/unistd.h> /* Needed for __NR_read */
#include <linux/reboot.h> /* Needed for kernel_restart() */
#include <linux/slab.h> /* Needed for kmalloc() */
#include <linux/mm.h>
#include <asm/cacheflush.h> /* Needed for cache flush */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#define CR0_WRITE_PROTECT_MASK (1 << 16)
#define MAGIC "mamaliga"
#define SIZEOF_MAGIC 8
#define ROOTKIT_SYS_CALL_TABLE 0x801011c4
// ARM
#define HIJACK_SIZE 12
void **sys_call_table;
asmlinkage long (*read_syscall_ref)(unsigned int fd, char __user *buf, size_t count);
int pos; /* Size of MAGIC matched so far */
/* Function that replaces the original read_syscall.*/
asmlinkage long my_read_syscall_ref(unsigned int fd, char __user *buf, size_t count)
{
long ret;
int i;
/* Call original read_syscall */
ret = read_syscall_ref(fd, buf, count);
return ret;
}
void mem_text_write_kernel_word(unsigned long *addr, unsigned long word)
{
*addr = word;
flush_icache_range((unsigned long)addr,
((unsigned long)addr + sizeof(long)));
}
void cacheflush ( void *begin, unsigned long size )
{
flush_icache_range((unsigned long)begin, (unsigned long)begin + size);
}
static pgd_t *get_global_pgd (void)
{
pgd_t *pgd;
unsigned int ttb_reg;
asm volatile (
" mrc p15, 0, %0, c2, c0, 1"
: "=r" (ttb_reg));
printk(KERN_INFO "1st try: %08x", ttb_reg);
asm volatile (
"mrrc p15, 1, %Q0, %R0, c2"
: "=r"(ttb_reg));
printk(KERN_INFO "2nd try: %08x", ttb_reg);
if (PAGE_OFFSET == 0x80000000) ttb_reg -= (1 << 4); else if (PAGE_OFFSET == 0xc0000000) ttb_reg -= (16 << 10);
printk(KERN_INFO "3rd try: %08x", ttb_reg);
ttb_reg &= ~(PTRS_PER_PGD*sizeof(pgd_t)-1);
printk(KERN_INFO "4th try: %08x", ttb_reg);
pgd = __va(ttb_reg);
printk(KERN_INFO "Global virt pgd: %08x", pgd);
return pgd;
}
static pte_t *lookup_address (unsigned long addr, unsigned int *level)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
printk(KERN_INFO "lookup_address %08x", addr);
pgd = get_global_pgd() + pgd_index(addr);
printk(KERN_INFO "pgd 0x%0x= %p",pgd_val(*pgd), pgd);
pud = pud_offset (pgd, addr);
printk(KERN_INFO "pud 0x%0x= %p", pud_val(*pud), pud);
pmd = pmd_offset (pud, addr);
printk(KERN_INFO "pmd 0x%0x= %p", pmd_val(*pmd), pmd);
return pte_offset_kernel (pmd, addr);
}
int make_rw(unsigned long address)
{
unsigned int level;
pte_t *ptep, pte;
ptep = lookup_address(address, &level);
pte = *ptep;
printk(KERN_INFO "pte = %08x", pte);
printk(KERN_INFO "PTE before 0x%lx\n", pte);
*ptep = pte_mkwrite(*ptep);
*ptep = clear_pte_bit(*ptep, __pgprot((_AT(pteval_t, 1) << 7)));
__flush_tlb_all();
printk(KERN_INFO "PTE after 0x%lx\n", pte);
return 0;
}
static int __init interceptor_start(void)
{
unsigned long original_cr0;
/* Reading contents of control register cr0. The cr0 register has various
control flags that modify the basic operation of the processor. */
/* Disable `write-protect` mode. Do so by setting the WP (Write protect)
bit to 0. When set to 1, the CPU can't write to read-only pages */
// /* Store original read() syscall */
static void **sys_call_table;
void *swi_table_addr = (long *)0xffff0008; // Known address of Software Interrupt handler
unsigned long offset_from_swi_vector_adr = 0;
unsigned long *swi_vector_adr = 0;
offset_from_swi_vector_adr = ((*(long *)swi_table_addr) & 0xfff) + 8;
swi_vector_adr = *(unsigned long *)(swi_table_addr + offset_from_swi_vector_adr);
while (swi_vector_adr++)
{
if (((*(unsigned long *)swi_vector_adr) & 0xfffff000) == 0xe28f8000)
{ // Copy the entire sys_call_table from the offset_from_swi_vector_adr starting the hardware interrupt table
offset_from_swi_vector_adr = ((*(unsigned long *)swi_vector_adr) & 0xfff) + 8; // 0xe28f8000 is end of interrupt space. Hence we stop.
sys_call_table = (void *)swi_vector_adr + offset_from_swi_vector_adr;
break;
}
}
// sys_call_table = (void *) ROOTKIT_SYS_CALL_TABLE;
printk(KERN_INFO "ROOTKIT_SYS_CALL_TABLE: 0x%08x\n", sys_call_table);
printk(KERN_INFO "__NR_read: 0x%d\n", __NR_read);
read_syscall_ref = (void *)sys_call_table[__NR_read];
printk(KERN_INFO "read_syscall_ref: 0x%p\n", read_syscall_ref);
printk("func: %pF at address: %p\n", read_syscall_ref, read_syscall_ref);
void *func = &my_read_syscall_ref;
printk("Func: %pF at address: %p\n", func, func);
printk(KERN_INFO "my_read_syscall_ref: 0x%p\n", my_read_syscall_ref);
/* Replace in the system call table the original
read() syscall with our intercepting function */
make_rw(&read_syscall_ref);
// sys_call_table[__NR_read] = (unsigned long *) my_read_syscall_ref;
//hijack_start(read_syscall_ref, &my_read_syscall_ref);
printk(KERN_INFO "sys_call_table[__NR_read]: 0x%p\n", sys_call_table[__NR_read]);
printk(KERN_INFO "%s\n", "Hello");
/* A non 0 return value means init_module failed; module can't be loaded */
return 0;
}
/* Cleanup function which is called just before module
is rmmoded. It restores the original read() syscall. */
static void __exit interceptor_end(void)
{
/* Restore original read() syscall */
sys_call_table[__NR_read] = (unsigned long *) read_syscall_ref;
printk(KERN_INFO "%s\n", "Bye bye");
return;
}
module_init(interceptor_start);
module_exit(interceptor_end);
MODULE_LICENSE("GPL");
I'm really stuck at the moment and your suggestions would be very much appreciated!
you can use ftrace implement arm hook in kernel 4.19.
Ok first of all, I know that if i have a 8 bit computer it can only handle 8 bit numbers, not higher than 8, but I know that it is still possible to represent a 16-bit number or even a 32,64,128-bit number by allocating more memory in ram.
But for the sake of simplicity lets just use a 16 bit number as an example.
Let's say we have a 16 bit number in ram like this:
12 34 <-- That's Hexadecimal btw
Let's also write it in binary just in case yall prefer binary form:
00010010 00110100 <-- Binary
&
4660 in decimal
Now, we know that the computer cant understand this big number(4660) as one single number, because the computer can only understand 8-bit numbers which only goes up to 255. So the byte on the right would stay as it is:
00110100 <-- 52 in decimal
but the left byte:
00010010 <-- would be 18 if it was the byte on the right,
but since it is on the left, it means that its
4608
So my question is, how does the computer read the second byte as 4608 if it can only understand numbers that are lower than 255, and then after that how does it interprets those two bytes as a single number(4660)?
Thanks, if you are confused feel free to ask me down in the comments. I made it as clear as possible.
well this is more programing question then HW architecture as the CPU only does 8bit operations in your test case and has no knowledge about 16bit. Your example is: 16bit arithmetics on 8bit ALU and is usually done by splitting to High and Low half of number (and joining latter). That can be done in more ways for example here few (using C++):
transfer
const int _h=0; // MSB location
const int _l=1; // LSB location
BYTE h,l; // 8 bit halves
WORD hl; // 16 bit value
h=((BYTE*)(&hl))[_h];
l=((BYTE*)(&hl))[_l];
// here do your 8bit stuff on h,l
((BYTE*)(&hl))[_h]=h;
((BYTE*)(&hl))[_l]=l;
You need to copy from/to the 8bit/16bit "register" copies which is slow but sometimes can ease up things.
pointers
const int _h=0; // MSB location
const int _l=1; // LSB location
WORD hl; // 16 bit value
BYTE *h=((BYTE*)(&hl))+_h;
BYTE *l=((BYTE*)(&hl))+_l;
// here do your 8bit stuff on *h,*l or h[0],l[0]
you do not need to copy anything but use pointer access *h,*l instead h,l. The pointer initialization is done just once.
union
const int _h=0; // MSB location
const int _l=1; // LSB location
union reg16
{
WORD dw; // 16 bit value
BYTE db[2]; // 8 bit values
} a;
// here do your 8bit stuff on a.db[_h],a.db[_l]
This is the same as #2 but in more manageable form
CPU 8/16 bit registers
Even 8 bit CPU's have usually 16 bit registers accesible by its half or even full registers. For example on Z80 you got AF,BC,DE,HL,PC,SP most of which are directly accessible by its half registers too. So there are instructions working with hl and also instructions working with h,l separately.
On x86 it is the same for example:
mov AX,1234h
Is the same (apart of timing and possibly code length) as:
mov AH,12h
mov AL,34h
Well that is conversion between 8/16 bit in a nutshell but I assume you are asking more about how the operations are done. That is done with use of Carry flag (which is sadly missing from most of higher languages then assembler). For example 16 bit addition on 8 bit ALU (x86 architecture) is done like this:
// ax=ax+bx
add al,bl
adc ah,bh
So first you add lowest BYTE and then highest + Carry. For more info see:
Cant make value propagate through carry
For more info about how to implement other operations see any implementation on bignum arithmetics.
[Edit1]
Here small C++ example of how to print 16 bit number with only 8 bit arithmetics. You can use 8 bit ALU as a building block to make N*8 bit operations in the same way as I did the 16 bit operations ...
//---------------------------------------------------------------------------
// unsigned 8 bit ALU in C++
//---------------------------------------------------------------------------
BYTE cy; // carry flag cy = { 0,1 }
void inc(BYTE &a); // a++
void dec(BYTE &a); // a--
BYTE add(BYTE a,BYTE b); // = a+b
BYTE adc(BYTE a,BYTE b); // = a+b+cy
BYTE sub(BYTE a,BYTE b); // = a-b
BYTE sbc(BYTE a,BYTE b); // = a-b-cy
void mul(BYTE &h,BYTE &l,BYTE a,BYTE b); // (h,l) = a/b
void div(BYTE &h,BYTE &l,BYTE &r,BYTE ah,BYTE al,BYTE b); // (h,l) = (ah,al)/b ; r = (ah,al)%b
//---------------------------------------------------------------------------
void inc(BYTE &a) { if (a==0xFF) cy=1; else cy=0; a++; }
void dec(BYTE &a) { if (a==0x00) cy=1; else cy=0; a--; }
BYTE add(BYTE a,BYTE b)
{
BYTE c=a+b;
cy=DWORD(((a &1)+(b &1) )>>1);
cy=DWORD(((a>>1)+(b>>1)+cy)>>7);
return c;
}
BYTE adc(BYTE a,BYTE b)
{
BYTE c=a+b+cy;
cy=DWORD(((a &1)+(b &1)+cy)>>1);
cy=DWORD(((a>>1)+(b>>1)+cy)>>7);
return c;
}
BYTE sub(BYTE a,BYTE b)
{
BYTE c=a-b;
if (a<b) cy=1; else cy=0;
return c;
}
BYTE sbc(BYTE a,BYTE b)
{
BYTE c=a-b-cy;
if (cy) { if (a<=b) cy=1; else cy=0; }
else { if (a< b) cy=1; else cy=0; }
return c;
}
void mul(BYTE &h,BYTE &l,BYTE a,BYTE b)
{
BYTE ah,al;
h=0; l=0; ah=0; al=a;
if ((a==0)||(b==0)) return;
// long binary multiplication
for (;b;b>>=1)
{
if (BYTE(b&1))
{
l=add(l,al); // (h,l)+=(ah,al)
h=adc(h,ah);
}
al=add(al,al); // (ah,al)<<=1
ah=adc(ah,ah);
}
}
void div(BYTE &ch,BYTE &cl,BYTE &r,BYTE ah,BYTE al,BYTE b)
{
BYTE bh,bl,sh,dh,dl,h,l;
// init
bh=0; bl=b; sh=0; // (bh,bl) = b<<sh so it is >= (ah,al) without overflow
ch=0; cl=0; r=0; // results = 0
dh=0; dl=1; // (dh,dl) = 1<<sh
if (!b) return; // division by zero error
if ((!ah)&&(!al)) return; // division of zero
for (;bh<128;)
{
if (( ah)&&(bh>=ah)) break;
if ((!ah)&&(bl>=al)) break;
bl=add(bl,bl);
bh=adc(bh,bh);
dl=add(dl,dl);
dh=adc(dh,dh);
sh++;
}
// long binary division
for (;;)
{
l=sub(al,bl); // (h,l) = (ah,al)-(bh,bl)
h=sbc(ah,bh);
if (cy==0) // no overflow
{
al=l; ah=h;
cl=add(cl,dl); // increment result by (dh,dl)
ch=adc(ch,dh);
}
else{ // overflow -> shoft right
if (sh==0) break;
sh--;
bl>>=1; // (bh,bl) >>= 1
if (BYTE(bh&1)) bl|=128;
bh>>=1;
dl>>=1; // (dh,dl) >>= 1
if (BYTE(dh&1)) dl|=128;
dh>>=1;
}
}
r=al; // remainder (low 8bit)
}
//---------------------------------------------------------------------------
// print 16bit dec with 8bit arithmetics
//---------------------------------------------------------------------------
AnsiString prn16(BYTE h,BYTE l)
{
AnsiString s="";
BYTE r; int i,j; char c;
// divide by 10 and print the remainders
for (;;)
{
if ((!h)&&(!l)) break;
div(h,l,r,h,l,10); // (h,l)=(h,l)/10; r=(h,l)%10;
s+=char('0'+r); // add digit to text
}
if (s=="") s="0";
// reverse order
i=1; j=s.Length();
for (;i<j;i++,j--) { c=s[i]; s[i]=s[j]; s[j]=c; }
return s;
}
//---------------------------------------------------------------------------
I use VCL AnsiString for text storage you can change it to what ever string or even char[] instead.
You need to divide the whole number not just the BYTE's separately. See how the div function works. Here example of least significant digit of 264 print 264%10...
a = 264 = 00000001 00001000 bin
b = 10 = 00000000 00001010 bin
d = 1 = 00000000 00000001 bin
// apply shift sh so b>=a
a = 00000001 00001000 bin
b = 00000001 01000000 bin
d = 00000000 00100000 bin
sh = 5
// a-=b c+=d while a>=b
// a<b already so no change
a = 00000001 00001000 bin b = 00000001 01000000 bin c = 00000000 00000000 bin d = 00000000 00100000 bin
// shift right
b = 00000000 10100000 bin d = 00000000 00010000 bin sh = 4
// a-=b c+=d while a>=b
a = 00000000 01101000 bin c = 00000000 00010000 bin
// shift right
b = 00000000 01010000 bin d = 00000000 00001000 bin sh = 3
// a-=b c+=d while a>=b
a = 00000000 00011000 bin c = 00000000 00011000 bin
// shift right
b = 00000000 00101000 bin d = 00000000 00000100 bin sh = 2
b = 00000000 00010100 bin d = 00000000 00000010 bin sh = 1
// a-=b c+=d while a>=b
a = 00000000 00000100 bin c = 00000000 00011010 bin
// shift right
b = 00000000 00001010 bin d = 00000000 00000001 bin sh = 0
// a<b so stop a is remainder -> digit = 4
//now a=c and divide again from the start to get next digit ...
By interpreting them as base-256.
>>> 18*256 + 52
4660
A new satellite data processing center has just been completed and ready for the initial testing using live data being sent down from an orbiting satellite. As the very first messages are displayed on the screen and you notice many of the data values are wildly out of range.
For example, on the terminal screen is something defined as “delta time” and it seems to be out of the expected range [0.01 to 10,000.00 seconds], but the value displayed (as a double) is [-4.12318024e-028 seconds]. After further investigation into the raw byte-based data stream, you find the original data being sent down from the satellite for this double word as [0xC0 0x83 0xA1 0xCA 0x66 0x55 0x40 0xBA]. On one of the old terminals, this data is displayed correctly and is within the expected range.
a. [5] What caused this problem?
b. [5] If this is the real problem, what should the actual value be?
Ah, Failure Mode Analysis. Very important indeed!
Well, other terminal shows data correctly --> there is incompatibility between terminal and data.
Big Endian, little Endian perhaps? I am expecting the "old" terminal to be little Endian because it may have been coded in C. Now you can interpret the data.
Here is some code
#include <stdio.h>
union myW {
double x;
// Recieved as:[0xC0 0x83 0xA1 0xCA 0x66 0x55 0x40 0xBA]
unsigned char d[8] = {0x83, 0xC0,0xCA, 0xA1, 0x55, 0x66, 0xBA, 0x40};
};
union myBad {
double x;
// Recieved as:[0xC0 0x83 0xA1 0xCA 0x66 0x55 0x40 0xBA]
unsigned char d[8] = {0xC0, 0x83,0xA1, 0xCA, 0x66, 0x55, 0x40, 0xBA};
};
int main(void)
{
myW value;
value.x = 1.0; // check how reasonable number looks like
printf("Something reasonable: \n");
for(int i = 0; i < 8; i++)
{
printf("%u ", value.d[i]);
}
myW received;
printf("\nWhat shouldve been displayed:\n");
for(int i = 0; i < 8; i++)
{
printf("%u ", received.d[i]);
}
printf("\n%f\n", received.x);
myBad bad;
printf("\nBad output as:\n");
for(int i = 0; i < 8; i++)
{
printf("%u ", bad.d[i]);
}
printf("\n%0.30f\n", bad.x);
}
Output:
Something reasonable:
0 0 0 0 0 0 240 63
What shouldve been displayed::
131 192 202 161 85 102 186 64
6758.334500
Bad output as:
192 131 161 202 102 85 64 186
-0.000000000000000000000000000412
Compiled with g++
Given an 32 bit int which is known to have at least 2 bits set, is there a way to efficiently clear all except the 2 most significant set bits? i.e. I want to ensure the output has exactly 2 bits set.
What if the input is guaranteed to have only 2 or 3 bits set.?
Examples:
0x2040 -> 0x2040
0x0300 -> 0x0300
0x0109 -> 0x0108
0x5040 -> 0x5000
Benchmarking Results:
Code:
QueryPerformanceFrequency(&freq);
/***********/
value = (base =2)|1;
QueryPerformanceCounter(&start);
for (l=0;l<A_LOT; l++)
{
//!!value calculation goes here
junk+=value; //use result to prevent optimizer removing it.
//advance to the next 2|3 bit word
if (value&0x80000000)
{ if (base&0x80000000)
{ base=6;
}
base*=2;
value=base|1;
}
else
{ value<<=1;
}
}
QueryPerformanceCounter(&end);
time = (end.QuadPart - start.QuadPart);
time /= freq.QuadPart;
printf("--------- name\n");
printf("%ld loops took %f sec (%f additional)\n",A_LOT, time, time-baseline);
printf("words /sec = %f Million\n",A_LOT/(time-baseline)/1.0e6);
Results on using VS2005 default release settings on Core2Duo E7500#2.93 GHz:
--------- BASELINE
1000000 loops took 0.001630 sec
--------- sirgedas
1000000 loops took 0.002479 sec (0.000849 additional)
words /sec = 1178.074206 Million
--------- ashelly
1000000 loops took 0.004640 sec (0.003010 additional)
words /sec = 332.230369 Million
--------- mvds
1000000 loops took 0.005250 sec (0.003620 additional)
words /sec = 276.242030 Million
--------- spender
1000000 loops took 0.009594 sec (0.007964 additional)
words /sec = 125.566361 Million
--------- schnaader
1000000 loops took 0.025680 sec (0.024050 additional)
words /sec = 41.580158 Million
If the input is guaranteed to have exactly 2 or 3 bits then the answer can be computed very quickly. We exploit the fact that the expression x&(x-1) is equal to x with the LSB cleared. Applying that expression twice to the input will produce 0, if 2 or fewer bits are set. If exactly 2 bits are set, we return the original input. Otherwise, we return the original input with the LSB cleared.
Here is the code in C++:
// assumes a has exactly 2 or 3 bits set
int topTwoBitsOf( int a )
{
int b = a&(a-1); // b = a with LSB cleared
return b&(b-1) ? b : a; // check if clearing the LSB of b produces 0
}
This can be written as a confusing single expression, if you like:
int topTwoBitsOf( int a )
{
return a&(a-1)&((a&(a-1))-1) ? a&(a-1) : a;
}
I'd create a mask in a loop. At the beginning, the mask is 0. Then go from the MSB to the LSB and set each corresponding bit in the mask to 1 until you found 2 set bits. Finally AND the value with this mask.
#include <stdio.h>
#include <stdlib.h>
int clear_bits(int value) {
unsigned int mask = 0;
unsigned int act_bit = 0x80000000;
unsigned int bit_set_count = 0;
do {
if ((value & act_bit) == act_bit) bit_set_count++;
mask = mask | act_bit;
act_bit >>= 1;
} while ((act_bit != 0) && (bit_set_count < 2));
return (value & mask);
}
int main() {
printf("0x2040 => %X\n", clear_bits(0x2040));
printf("0x0300 => %X\n", clear_bits(0x0300));
printf("0x0109 => %X\n", clear_bits(0x0109));
printf("0x5040 => %X\n", clear_bits(0x5040));
return 0;
}
This is quite complicated, but should be more efficient as using a for loop over the 32 bits every time (and clear all bits except the 2 most significant set ones). Anyway, be sure to benchmark different ways before using one.
Of course, if memory is not a problem, use a lookup table approach like some recommended - this will be much faster.
how much memory is available at what latency? I would propose a lookup table ;-)
but seriously: if you would perform this on 100s of numbers, an 8 bit lookup table giving 2 msb and another 8 bit lookup table giving 1 msb may be all you need. Depending on the processor this might beat really counting bits.
For speed, I would create a lookup table mapping an input byte to
M(I)=0 if 1 or 0 bits set
M(I)=B' otherwise, where B' is the value of B with the 2 msb bits set.
Your 32 bit int are 4 input bytes I1 I2 I3 I4.
Lookup M(I1), if nonzero, you're done.
Compare M(I1)==0, if zero, repeat previous step for I2.
Else, lookup I2 in a second lookup table with 1 MSB bits, if nonzero, you're done.
Else, repeat previous step for I3.
etc etc. Don't actually loop anything over I1-4 but unroll it fully.
Summing up: 2 lookup tables with 256 entries, 247/256 of cases are resolved with one lookup, approx 8/256 with two lookups, etc.
edit: the tables, for clarity (input, bits table 2 MSB, bits table 1 MSB)
I table2 table1
0 00000000 00000000
1 00000000 00000001
2 00000000 00000010
3 00000011 00000010
4 00000000 00000100
5 00000101 00000100
6 00000110 00000100
7 00000110 00000100
8 00000000 00001000
9 00001001 00001000
10 00001010 00001000
11 00001010 00001000
12 00001100 00001000
13 00001100 00001000
14 00001100 00001000
15 00001100 00001000
16 00000000 00010000
17 00010001 00010000
18 00010010 00010000
19 00010010 00010000
20 00010100 00010000
..
250 11000000 10000000
251 11000000 10000000
252 11000000 10000000
253 11000000 10000000
254 11000000 10000000
255 11000000 10000000
Here's another attempt (no loops, no lookup, no conditionals). This time it works:
var orig=0x109;
var x=orig;
x |= (x >> 1);
x |= (x >> 2);
x |= (x >> 4);
x |= (x >> 8);
x |= (x >> 16);
x = orig & ~(x & ~(x >> 1));
x |= (x >> 1);
x |= (x >> 2);
x |= (x >> 4);
x |= (x >> 8);
x |= (x >> 16);
var solution=orig & ~(x >> 1);
Console.WriteLine(solution.ToString("X")); //0x108
Could probably be shortened by someone cleverer than me.
Following up on my previous answer, here's the complete implementation. I think it is as fast as it can get. (sorry for unrolling the whole thing ;-)
#include <stdio.h>
unsigned char bittable1[256];
unsigned char bittable2[256];
unsigned int lookup(unsigned int);
void gentable(void);
int main(int argc,char**argv)
{
unsigned int challenge = 0x42341223, result;
gentable();
if ( argc > 1 ) challenge = atoi(argv[1]);
result = lookup(challenge);
printf("%08x --> %08x\n",challenge,result);
}
unsigned int lookup(unsigned int i)
{
unsigned int ret;
ret = bittable2[i>>24]<<24; if ( ret ) return ret;
ret = bittable1[i>>24]<<24;
if ( !ret )
{
ret = bittable2[i>>16]<<16; if ( ret ) return ret;
ret = bittable1[i>>16]<<16;
if ( !ret )
{
ret = bittable2[i>>8]<<8; if ( ret ) return ret;
ret = bittable1[i>>8]<<8;
if ( !ret )
{
return bittable2[i] | bittable1[i];
} else {
return (ret | bittable1[i&0xff]);
}
} else {
if ( bittable1[(i>>8)&0xff] )
{
return (ret | (bittable1[(i>>8)&0xff]<<8));
} else {
return (ret | bittable1[i&0xff]);
}
}
} else {
if ( bittable1[(i>>16)&0xff] )
{
return (ret | (bittable1[(i>>16)&0xff]<<16));
} else if ( bittable1[(i>>8)&0xff] ) {
return (ret | (bittable1[(i>>8)&0xff]<<8));
} else {
return (ret | (bittable1[i&0xff]));
}
}
}
void gentable()
{
int i;
for ( i=0; i<256; i++ )
{
int bitset = 0;
int j;
for ( j=128; j; j>>=1 )
{
if ( i&j )
{
bitset++;
if ( bitset == 1 ) bittable1[i] = i&(~(j-1));
else if ( bitset == 2 ) bittable2[i] = i&(~(j-1));
}
}
//printf("%3d %02x %02x\n",i,bittable1[i],bittable2[i]);
}
}
Using a variation of this, I came up with the following:
var orig=56;
var x=orig;
x |= (x >> 1);
x |= (x >> 2);
x |= (x >> 4);
x |= (x >> 8);
x |= (x >> 16);
Console.WriteLine(orig&~(x>>2));
In c# but should translate easily.
EDIT
I'm not so sure I've answered your question. This takes the highest bit and preserves it and the bit next to it, eg. 101 => 100
Here's some python that should work:
def bit_play(num):
bits_set = 0
upper_mask = 0
bit_index = 31
while bit_index >= 0:
upper_mask |= (1 << bit_index)
if num & (1 << bit_index) != 0:
bits_set += 1
if bits_set == 2:
num &= upper_mask
break
bit_index -= 1
return num
It makes one pass over the number. It builds a mask of the bits that it crosses so it can mask off the bottom bits as soon as it hits the second-most significant one. As soon as it finds the second bit, it proceeds to clear the lower bits. You should be able to create a mask of the upper bits and &= it in instead of the second while loop. Maybe I'll hack that in and edit the post.
I'd also use a table based approach, but I believe one table alone should be sufficient. Take the 4 bit case as an example. If you're input is guaranteed to have 2 or 3 bits, then your output can only be one of 6 values
0011
0101
0110
1001
1010
1100
Put these possible values in an array sorted by size. Starting with the largest, find the first value which is equal to or less than your target value. This is your answer. For the 8 bit version you'll have more possible return values, but still easily less than the maximum possible permutations of 8*7.
public static final int [] MASKS = {
0x03, //0011
0x05, //0101
0x06, //0110
0x09, //1001
0x0A, //1010
0x0C, //1100
};
for (int i = 0; i < 16; ++i) {
if (countBits(i) < 2) {
continue;
}
for (int j = MASKS.length - 1; j >= 0; --j) {
if (MASKS[j] <= i) {
System.out.println(Integer.toBinaryString(i) + " " + Integer.toBinaryString(MASKS[j]));
break;
}
}
}
Here's my implementation in C#
uint OnlyMostSignificant(uint value, int count) {
uint newValue = 0;
int c = 0;
for(uint high = 0x80000000; high != 0 && c < count; high >>= 1) {
if ((value & high) != 0) {
newValue = newValue | high;
c++;
}
}
return newValue;
}
Using count, you could make it the most significant (count) bits.
My solution:
Use "The best method for counting bits in a 32-bit integer", then clear the lower bit if the answer is 3. Only works when input is limited to 2 or 3 bits set.
unsigned int c; // c is the total bits set in v
unsigned int v = value;
v = v - ((v >> 1) & 0x55555555);
v = (v & 0x33333333) + ((v >> 2) & 0x33333333); // temp
c = ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; // count
crc+=value&value-(c-2);