Go:
input := []uint{1, 2, 3, 4, 5, 6}
o := C.fixU32_encode((*C.uint)(unsafe.Pointer(&input[0])), C.size_t(len(input)))
return C.GoString(o)
C:
char* fixU32_encode(unsigned int* ptr, size_t length);
Rust:
use std::ffi::CString;
// assumes the hex crate and an Encode impl (e.g. parity-scale-codec) are in scope
#[no_mangle]
pub extern "C" fn fixU32_encode(ptr: *const u32, length: libc::size_t) -> *const libc::c_char {
    assert!(!ptr.is_null());
    let slice = unsafe { std::slice::from_raw_parts(ptr, length as usize) };
    println!("{:?}", slice); // this prints [1, 0, 2, 0, 3, 0]
    println!("{:?}", length);
    let mut arr = [0u32; 6];
    for (&x, p) in slice.iter().zip(arr.iter_mut()) {
        *p = x;
    }
    CString::new(hex::encode(arr.encode())).unwrap().into_raw()
}
This is what gets passed in, but the array received by Rust looks like this:
[1, 0, 2, 0, 3, 0]
In Go, uint is 64 bits wide on a 64-bit platform (see https://golangbyexample.com/go-size-range-int-uint/). As a result, you are storing 64-bit integers in input.
The C and Rust code then treat the input as 32-bit unsigned integers (in little-endian format). So the first input of 0x1 as a 64-bit value:
00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000001
becomes 0x1 and 0x0 respectively: because of little-endianness, the least significant bytes are read first.
You want to be explicit in Go and use 32-bit integers via uint32, or ensure your C code matches Go's machine-dependent integer types.
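A minimal sketch of the fix on the Go side, assuming unsigned int is 32 bits on the target (as it is on common platforms), so the slice element type matches the C signature exactly:
input := []uint32{1, 2, 3, 4, 5, 6}
// each C.uint (32 bits) now lines up with one uint32 element
o := C.fixU32_encode((*C.uint)(unsafe.Pointer(&input[0])), C.size_t(len(input)))
return C.GoString(o)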
I use the cilium/ebpf package to write an eBPF program that gets the goroutine ID, but it fails. My uprobe.c is below.
I think the key problem is that the translation of the Go struct g into goroutine.h is wrong. Can anyone help?
uprobe.c
SEC("uprobe/runtime.newproc1")
int uprobe_runtime_newproc1(struct pt_regs *ctx) {
    u32 key = 2;
    u64 initval = 1, *valp;
    valp = bpf_map_lookup_elem(&uprobe_map, &key);
    if (!valp) {
        bpf_map_update_elem(&uprobe_map, &key, &initval, BPF_ANY);
        return 0;
    }
    __sync_fetch_and_add(valp, 1);
    struct g *goroutine_struct = (void *)PT_REGS_PARM4(ctx);
    // retrieve output parameter
    s64 goid = 0;
    bpf_probe_read(&goid, sizeof(goid), &goroutine_struct->goid);
    bpf_printk("bpf_printk bpf_probe_read goroutine_struct->goid: %lld", goid);
    struct g gs;
    bpf_probe_read(&gs, sizeof(gs), (void *)PT_REGS_PARM4(ctx));
    bpf_printk("bpf_printk bpf_probe_read goroutine_struct.goid: %lld", gs.goid);
    // test
    void *ptr = (void *)PT_REGS_PARM4(ctx);
    s64 goid2 = 0;
    bpf_probe_read(&goid2, sizeof(goid2), (void *)(ptr + 152));
    bpf_printk("bpf_printk bpf_probe_read goid2: %lld", goid2);
    return 0;
}
goroutine.h
#include "common.h"
struct stack {
    u64 lo;
    u64 hi;
};
struct gobuf {
    u64 sp;
    u64 pc;
    u64 g;
    u64 ctxt;
    u64 ret;
    u64 lr;
    u64 bp;
};
/*
go version go1.17.2 linux/amd64
type stack struct {
    lo uintptr
    hi uintptr
}
type gobuf struct {
    sp   uintptr
    pc   uintptr
    g    uintptr
    ctxt uintptr
    ret  uintptr
    lr   uintptr
    bp   uintptr
}
type g struct {
    stack        stack   // offset known to runtime/cgo
    stackguard0  uintptr // offset known to liblink
    stackguard1  uintptr // offset known to liblink
    _panic       *_panic // innermost panic - offset known to liblink
    _defer       *_defer // innermost defer
    m            *m      // current m; offset known to arm liblink
    sched        gobuf
    syscallsp    uintptr // if status==Gsyscall, syscallsp = sched.sp to use during gc
    syscallpc    uintptr // if status==Gsyscall, syscallpc = sched.pc to use during gc
    stktopsp     uintptr // expected sp at top of stack, to check in traceback
    param        unsafe.Pointer
    atomicstatus uint32
    stackLock    uint32 // sigprof/scang lock; TODO: fold in to atomicstatus
    goid         int64
}
*/
struct g {
    struct stack stack;
    u64 stackguard0;
    u64 stackguard1;
    u64 _panic;
    u64 _defer;
    u64 m;
    struct gobuf sched;
    u64 syscallsp;
    u64 syscallpc;
    u64 stktopsp;
    u64 param;
    u32 atomicstatus;
    u32 stackLock;
    s64 goid; // Here it is!
};
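Since the test code above also reads the goid at ptr+152, you can add a compile-time sanity check to the C mirror to catch translation mistakes. A sketch, assuming the go1.17.2/amd64 layout quoted above:
#include <stddef.h>
// stack 16 + five 8-byte pointers 40 + gobuf 56 + three uintptr 24
// + param 8 + atomicstatus 4 + stackLock 4 = 152
_Static_assert(offsetof(struct g, goid) == 152, "struct g mirror out of sync");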
When I run my program, cat /sys/kernel/debug/tracing/trace_pipe outputs the following, with a wrong goroutine ID:
<...>-1336127 [000] d... 20113210.986990: bpf_trace_printk: bpf_printk bpf_probe_read goroutine_struct->goid: 4938558469562467144
<...>-1336127 [000] d... 20113210.986998: bpf_trace_printk: bpf_printk bpf_probe_read goroutine_struct.goid: 4938558469562467144
<...>-1336127 [000] d... 20113210.986998: bpf_trace_printk: bpf_printk bpf_probe_read goid2: 4938558469562467144
I found a solution:
My Go version is 1.17.2 on amd64, and on amd64 Go uses the following sequence of 9 registers for integer arguments and results:
RAX, RBX, RCX, RDI, RSI, R8, R9, R10, R11
runtime.newproc1 in Go 1.17.2 has 5 arguments, and callergp *g is the 4th. When I ran gdb on my userspace program, it used the rdi register to hold the pointer to callergp *g.
So PT_REGS_PARM1 is the right way, because #define PT_REGS_PARM1(x) ((x)->rdi): Go's 4th integer argument lands in the register the C calling convention uses for its 1st argument.
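To restate that mapping in macro form (these GO_PARM* helpers are hypothetical, not part of libbpf; they just encode Go's go1.17+ amd64 integer-argument order against the same pt_regs field names PT_REGS_PARM* uses):
// Go internal ABI (go1.17+, amd64) integer args: RAX, RBX, RCX, RDI, RSI, ...
#define GO_PARM1(x) ((x)->rax)
#define GO_PARM2(x) ((x)->rbx)
#define GO_PARM3(x) ((x)->rcx)
#define GO_PARM4(x) ((x)->rdi) // callergp *g: same register as PT_REGS_PARM1
#define GO_PARM5(x) ((x)->rsi)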
After all that, the code looks like this:
SEC("uprobe/runtime.newproc1")
int uprobe_runtime_newproc1(struct pt_regs *ctx) {
    u32 key = 2;
    u64 initval = 1, *valp;
    valp = bpf_map_lookup_elem(&uprobe_map, &key);
    if (!valp) {
        bpf_map_update_elem(&uprobe_map, &key, &initval, BPF_ANY);
        return 0;
    }
    __sync_fetch_and_add(valp, 1);
    // retrieve output parameter
    struct g gs;
    bpf_probe_read(&gs, sizeof(gs), (void *)PT_REGS_PARM1(ctx));
    bpf_printk("uprobe_runtime_newproc1 bpf_printk bpf_probe_read goroutine_struct.goid: %lld", gs.goid);
    return 0;
}
I need to swap alternate bytes in a buffer as quickly as possible in an embedded system using an ARM Cortex-M4 processor. I use gcc. The amount of data is variable, but the maximum is a little over 2K; it doesn't matter if a few extra bytes are converted, because I can use an over-sized buffer.
I know that the ARM has the REV16 instruction, which I can use to swap alternate bytes in a 32-bit word. What I don't know is:
1. Is there a way of getting at this instruction in gcc without resorting to assembler? The __builtin_bswap16 intrinsic appears to operate on 16-bit words only, and converting 4 bytes at a time will surely be faster than converting 2 bytes at a time.
2. Does the Cortex-M4 have a reorder buffer and/or do register renaming? If not, what do I need to do to minimise pipeline stalls when I convert the dwords of the buffer in a partially-unrolled loop?
For example, is this code efficient, where REV16 is appropriately defined to resolve (1):
uint32_t *buf = ... ;
size_t n = ... ; // (number of bytes to convert + 15)/16
for (size_t i = 0; i < n; ++i)
{
    uint32_t a = buf[0];
    uint32_t b = buf[1];
    uint32_t c = buf[2];
    uint32_t d = buf[3];
    REV16(a, a);
    REV16(b, b);
    REV16(c, c);
    REV16(d, d);
    buf[0] = a;
    buf[1] = b;
    buf[2] = c;
    buf[3] = d;
    buf += 4;
}
You can't use the __builtin_bswap16 function for the reason you stated: it works on 16-bit values, so it will zero the other halfword. I guess the reason for this is to keep the intrinsic working the same on processors which don't have an instruction behaving like ARM's REV16.
The function
uint32_t swap(uint32_t in)
{
    in = __builtin_bswap32(in);
    in = (in >> 16) | (in << 16);
    return in;
}
compiles to (ARM GCC 5.4.1 -O3 -std=c++11 -march=armv7-m -mtune=cortex-m4 -mthumb)
rev r0, r0
ror r0, r0, #16
bx lr
And you could probably ask the compiler to inline it, which would give you 2 instructions per 32-bit word. I can't think of a way to get GCC to generate REV16 with a 32-bit operand without declaring your own function with inline assembly.
EDIT
As a follow-up, and based on artless noise's comment about the non-portability of the __builtin_bswap functions, the compiler recognizes
uint32_t swap(uint32_t in)
{
    in = ((in & 0xff000000) >> 24) | ((in & 0x00FF0000) >> 8) | ((in & 0x0000FF00) << 8) | ((in & 0xFF) << 24);
    in = (in >> 16) | (in << 16);
    return in;
}
and creates the same 3-instruction function as above, so that is a more portable way to achieve it. Whether different compilers would produce the same output, though...
EDIT EDIT
If inline assembler is allowed, the following function
inline uint32_t Rev16(uint32_t a)
{
    asm ("rev16 %0, %1"
         : "=r" (a)
         : "r" (a));
    return a;
}
gets inlined, and acts as a single instruction as can be seen here.
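As a usage sketch under the question's own assumptions (word-aligned, over-sized buffer, so converting a few extra bytes is harmless; the swap_alternate_bytes name is just for illustration), the partially-unrolled loop then becomes:
// n = (number of bytes to convert + 15) / 16, as in the question
void swap_alternate_bytes(uint32_t *buf, size_t n)
{
    for (size_t i = 0; i < n; ++i)
    {
        buf[0] = Rev16(buf[0]);
        buf[1] = Rev16(buf[1]);
        buf[2] = Rev16(buf[2]);
        buf[3] = Rev16(buf[3]);
        buf += 4;
    }
}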
OK, first of all, I know that if I have an 8-bit computer it can only handle 8-bit numbers, nothing wider, but I know that it is still possible to represent a 16-bit number, or even a 32-, 64-, or 128-bit number, by allocating more memory in RAM.
But for the sake of simplicity, let's just use a 16-bit number as an example.
Let's say we have a 16-bit number in RAM like this:
12 34 <-- that's hexadecimal, btw
Let's also write it in binary, in case y'all prefer binary form:
00010010 00110100 <-- binary
&
4660 in decimal
Now, we know that the computer can't understand this big number (4660) as one single number, because the computer can only understand 8-bit numbers, which only go up to 255. So the byte on the right would stay as it is:
00110100 <-- 52 in decimal
but the left byte:
00010010 <-- would be 18 if it were the byte on the right,
             but since it is on the left, it means that it's
             4608
So my question is: how does the computer read the second byte as 4608 if it can only understand numbers lower than 255, and how does it then interpret those two bytes as a single number (4660)?
Thanks. If you are confused, feel free to ask me down in the comments. I made it as clear as possible.
Well, this is more a programming question than a HW-architecture one, as the CPU only does 8-bit operations in your test case and has no knowledge of 16-bit values. Your example is 16-bit arithmetic on an 8-bit ALU, and it is usually done by splitting numbers into high and low halves (and joining them later). That can be done in several ways; here are a few (using C++):
1. transfer
const int _h=0; // MSB location
const int _l=1; // LSB location
BYTE h,l; // 8 bit halves
WORD hl; // 16 bit value
h=((BYTE*)(&hl))[_h];
l=((BYTE*)(&hl))[_l];
// here do your 8bit stuff on h,l
((BYTE*)(&hl))[_h]=h;
((BYTE*)(&hl))[_l]=l;
You need to copy from/to the 8-bit/16-bit "register" copies, which is slow, but sometimes it can ease things up.
2. pointers
const int _h=0; // MSB location
const int _l=1; // LSB location
WORD hl; // 16 bit value
BYTE *h=((BYTE*)(&hl))+_h;
BYTE *l=((BYTE*)(&hl))+_l;
// here do your 8bit stuff on *h,*l or h[0],l[0]
You do not need to copy anything; just use pointer access *h,*l instead of h,l. The pointer initialization is done just once.
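For example, a 16-bit increment done on the 8-bit halves (a small sketch reusing the h,l pointers from above):
(*l)++;              // increment the low byte
if (*l==0) (*h)++;   // on wrap-around, carry into the high byte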
3. union
const int _h=0; // MSB location
const int _l=1; // LSB location
union reg16
{
    WORD dw;    // 16 bit value
    BYTE db[2]; // 8 bit values
} a;
// here do your 8bit stuff on a.db[_h],a.db[_l]
This is the same as #2, but in a more manageable form.
4. CPU 8/16 bit registers
Even 8-bit CPUs usually have 16-bit registers accessible by their halves or even as full registers. For example, on the Z80 you have AF, BC, DE, HL, PC, SP, most of which are directly accessible through their half-registers too. So there are instructions working with hl and also instructions working with h,l separately.
On x86 it is the same for example:
mov AX,1234h
is the same (apart from timing and possibly code length) as:
mov AH,12h
mov AL,34h
Well, that is conversion between 8/16 bit in a nutshell, but I assume you are asking more about how the operations are done. That is done with the use of the carry flag (which is sadly missing from most languages higher than assembler). For example, 16-bit addition on an 8-bit ALU (x86 architecture) is done like this:
// ax=ax+bx
add al,bl
adc ah,bh
So first you add the lowest BYTE, and then the highest + carry. For more info see:
Cant make value propagate through carry
For more info about how to implement other operations, see any implementation of bignum arithmetic.
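The same add/adc idea in plain C++, for reference (a minimal sketch; the comparison after the add recovers the carry without a flags register):
BYTE lo=al+bl;        // add low halves; may wrap around
BYTE carry=(lo<al);   // wrap-around means a carry was produced
BYTE hi=ah+bh+carry;  // propagate it into the high halves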
[Edit1]
Here is a small C++ example of how to print a 16-bit number with only 8-bit arithmetic. You can use an 8-bit ALU as a building block for N*8-bit operations, in the same way as I did the 16-bit operations ...
//---------------------------------------------------------------------------
// unsigned 8 bit ALU in C++
//---------------------------------------------------------------------------
BYTE cy; // carry flag cy = { 0,1 }
void inc(BYTE &a); // a++
void dec(BYTE &a); // a--
BYTE add(BYTE a,BYTE b); // = a+b
BYTE adc(BYTE a,BYTE b); // = a+b+cy
BYTE sub(BYTE a,BYTE b); // = a-b
BYTE sbc(BYTE a,BYTE b); // = a-b-cy
void mul(BYTE &h,BYTE &l,BYTE a,BYTE b); // (h,l) = a*b
void div(BYTE &h,BYTE &l,BYTE &r,BYTE ah,BYTE al,BYTE b); // (h,l) = (ah,al)/b ; r = (ah,al)%b
//---------------------------------------------------------------------------
void inc(BYTE &a) { if (a==0xFF) cy=1; else cy=0; a++; }
void dec(BYTE &a) { if (a==0x00) cy=1; else cy=0; a--; }
BYTE add(BYTE a,BYTE b)
{
    BYTE c=a+b;
    cy=DWORD(((a &1)+(b &1) )>>1);
    cy=DWORD(((a>>1)+(b>>1)+cy)>>7);
    return c;
}
BYTE adc(BYTE a,BYTE b)
{
    BYTE c=a+b+cy;
    cy=DWORD(((a &1)+(b &1)+cy)>>1);
    cy=DWORD(((a>>1)+(b>>1)+cy)>>7);
    return c;
}
BYTE sub(BYTE a,BYTE b)
{
    BYTE c=a-b;
    if (a<b) cy=1; else cy=0;
    return c;
}
BYTE sbc(BYTE a,BYTE b)
{
    BYTE c=a-b-cy;
    if (cy) { if (a<=b) cy=1; else cy=0; }
    else    { if (a< b) cy=1; else cy=0; }
    return c;
}
void mul(BYTE &h,BYTE &l,BYTE a,BYTE b)
{
    BYTE ah,al;
    h=0; l=0; ah=0; al=a;
    if ((a==0)||(b==0)) return;
    // long binary multiplication
    for (;b;b>>=1)
    {
        if (BYTE(b&1))
        {
            l=add(l,al); // (h,l)+=(ah,al)
            h=adc(h,ah);
        }
        al=add(al,al); // (ah,al)<<=1
        ah=adc(ah,ah);
    }
}
void div(BYTE &ch,BYTE &cl,BYTE &r,BYTE ah,BYTE al,BYTE b)
{
    BYTE bh,bl,sh,dh,dl,h,l;
    // init
    bh=0; bl=b; sh=0; // (bh,bl) = b<<sh so it is >= (ah,al) without overflow
    ch=0; cl=0; r=0;  // results = 0
    dh=0; dl=1;       // (dh,dl) = 1<<sh
    if (!b) return;           // division by zero error
    if ((!ah)&&(!al)) return; // division of zero
    for (;bh<128;)
    {
        if (( ah)&&(bh>=ah)) break;
        if ((!ah)&&(bl>=al)) break;
        bl=add(bl,bl);
        bh=adc(bh,bh);
        dl=add(dl,dl);
        dh=adc(dh,dh);
        sh++;
    }
    // long binary division
    for (;;)
    {
        l=sub(al,bl); // (h,l) = (ah,al)-(bh,bl)
        h=sbc(ah,bh);
        if (cy==0) // no overflow
        {
            al=l; ah=h;
            cl=add(cl,dl); // increment result by (dh,dl)
            ch=adc(ch,dh);
        }
        else { // overflow -> shift right
            if (sh==0) break;
            sh--;
            bl>>=1; // (bh,bl) >>= 1
            if (BYTE(bh&1)) bl|=128;
            bh>>=1;
            dl>>=1; // (dh,dl) >>= 1
            if (BYTE(dh&1)) dl|=128;
            dh>>=1;
        }
    }
    r=al; // remainder (low 8bit)
}
//---------------------------------------------------------------------------
// print 16bit dec with 8bit arithmetics
//---------------------------------------------------------------------------
AnsiString prn16(BYTE h,BYTE l)
{
    AnsiString s="";
    BYTE r; int i,j; char c;
    // divide by 10 and print the remainders
    for (;;)
    {
        if ((!h)&&(!l)) break;
        div(h,l,r,h,l,10); // (h,l)=(h,l)/10; r=(h,l)%10;
        s+=char('0'+r);    // add digit to text
    }
    if (s=="") s="0";
    // reverse order
    i=1; j=s.Length();
    for (;i<j;i++,j--) { c=s[i]; s[i]=s[j]; s[j]=c; }
    return s;
}
//---------------------------------------------------------------------------
I use the VCL AnsiString for text storage; you can change it to whatever string type, or even char[], instead.
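Usage sketch, printing 4660 = 0x1234 from its two bytes:
AnsiString s=prn16(0x12,0x34); // s == "4660"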
You need to divide the whole number, not just the BYTEs separately. See how the div function works. Here is an example of extracting the least significant digit of 264, i.e. printing 264%10 ...
a = 264 = 00000001 00001000 bin
b = 10 = 00000000 00001010 bin
d = 1 = 00000000 00000001 bin
// apply shift sh so b>=a
a = 00000001 00001000 bin
b = 00000001 01000000 bin
d = 00000000 00100000 bin
sh = 5
// a-=b c+=d while a>=b
// a<b already so no change
a = 00000001 00001000 bin b = 00000001 01000000 bin c = 00000000 00000000 bin d = 00000000 00100000 bin
// shift right
b = 00000000 10100000 bin d = 00000000 00010000 bin sh = 4
// a-=b c+=d while a>=b
a = 00000000 01101000 bin c = 00000000 00010000 bin
// shift right
b = 00000000 01010000 bin d = 00000000 00001000 bin sh = 3
// a-=b c+=d while a>=b
a = 00000000 00011000 bin c = 00000000 00011000 bin
// shift right
b = 00000000 00101000 bin d = 00000000 00000100 bin sh = 2
b = 00000000 00010100 bin d = 00000000 00000010 bin sh = 1
// a-=b c+=d while a>=b
a = 00000000 00000100 bin c = 00000000 00011010 bin
// shift right
b = 00000000 00001010 bin d = 00000000 00000001 bin sh = 0
// a<b so stop a is remainder -> digit = 4
//now a=c and divide again from the start to get next digit ...
By interpreting them as base-256.
>>> 18*256 + 52
4660
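The same combining step written out as a loop, treating the bytes as big-endian base-256 digits (a sketch in C; the combine helper name is illustrative):
#include <stdint.h>
uint16_t combine(const uint8_t bytes[2])
{
    uint16_t value = 0;              // big-endian: bytes[0] is the high byte
    for (int i = 0; i < 2; i++)
        value = value * 256 + bytes[i]; // {0x12, 0x34} -> 4660
    return value;
}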
I was wondering what the size of my enum type was, so I wrote the following code:
#include <stdio.h>
typedef enum port_config_e {
    CFG_MDIX,
    CFG_FRAMEMAX,
    CFG_OFF_ON_DC,
    CFG_AUTONEG_EN,
    CFG_DUPLEX_SET,
    CFG_LOOPBACK,
    CFG_INGRESS_RATE,
    CFG_EGRESS_RATE,
} port_config_t;
int main(void)
{
    printf("%zu\n", sizeof(port_config_t));
    return 0;
}
but I cannot understand the result. Why is the size of this enum type 4 bytes?
Each enumerator has an int value. The value is encoded in 4 bytes (32 bits) on machines where int is 32 bits.
0000 CFG_MDIX
0001 CFG_FRAMEMAX
0010 CFG_OFF_ON_DC
0011 CFG_AUTONEG_EN
[...]
And why is each enum an int? Because it's faster and simpler for the computer to compare ints than, for example, char* strings.
Thank you Jashaszun for the correction.
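A quick way to check the relationship on your own platform, e.g. inside the question's main (a sketch; the size of an enum type is implementation-defined, and gcc's -fshort-enums can shrink it):
// enumeration constants have type int; the enum type itself is an
// implementation-defined integer type that can hold all of them
printf("%zu %zu\n", sizeof(port_config_t), sizeof(int)); // typically: 4 4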
If TStruct is packed, then this code ends with Str.D == 0x00223344 (not 0x11223344). Why? (ARM GCC 4.7)
#include <string.h>
typedef struct {
    unsigned char B;
    unsigned int D;
} __attribute__((packed)) TStruct;
volatile TStruct Str;
int main(void) {
    memset((void *)&Str, 0, sizeof(Str));
    Str.D = 0x11223344;
    if (Str.D != 0x11223344) {
        return 1;
    }
    return 0;
}
I guess your problem has nothing to do with unaligned access, but with the structure definition. int is not necessarily 32 bits long: according to the C standard, int is at least 16 bits long, and char is at least 8 bits long.
My guess is your compiler lays out TStruct so it looks like this:
struct {
    unsigned char B : 8;
    unsigned int D : 24;
} ...;
When you are assigning 0x11223344 to Str.D, then according to the C standard, the compiler must only make sure that at least 16 bits (0x3344) are written to Str.D. You didn't specify that Str.D is 32 bits long, only that it is at least 16 bits long.
Your compiler may also arrange the struct like this:
struct {
    unsigned char B : 16;
    unsigned int D : 16;
} ...;
B is at least 8 bits long, and D is at least 16 bits long, all ok.
Probably, what you want to do is:
#include <stdint.h>
typedef struct {
    uint8_t B;
    uint32_t D;
} __attribute__((packed)) TStruct;
That way you can ensure the 32-bit value 0x11223344 is properly written to Str.D. It is a good idea to use size-constrained types for packed structs.
As for unaligned access of a member inside a struct, the compiler should take care of it: if the compiler knows the structure definition, then when you access Str.D it should take care of any unaligned access and the bit/byte operations required.
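A compile-time check of the fixed-width version (a sketch; with __attribute__((packed)) no padding is inserted, so the struct should be exactly 1 + 4 = 5 bytes):
#include <stdint.h>
typedef struct {
    uint8_t B;
    uint32_t D;
} __attribute__((packed)) TStruct;
_Static_assert(sizeof(TStruct) == 5, "packed TStruct should have no padding");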