What's the purpose of many int3's in a row in WinDbg disassembly of ntdll code? [duplicate] - windows

This question already has answers here:
Visual C++ appends 0xCC (int3) bytes at the end of functions
(3 answers)
Several int3 in a row
(1 answer)
Closed 1 year ago.
I'm learning assembly and after assembly of:
; Minimal 64-bit PE program (FASM syntax): executes one breakpoint instruction, then returns.
format PE64 NX GUI 6.0
entry start
section '.text' code readable executable
start:
; int3 is the one-byte (0xCC) breakpoint instruction
int3
ret
and running it in my debugger (at the end of the OS loader code), I see
...
00007fff`bc78070d 4889442428 mov qword ptr [rsp+28h], rax
00007fff`bc780712 488364242000 and qword ptr [rsp+20h], 0
00007fff`bc780718 e8cf90f9ff call ntdll!RtlStringCbPrintfExW (00007fff`bc7197ec)
00007fff`bc78071d 488b8c24e0010000 mov rcx, qword ptr [rsp+1E0h]
00007fff`bc780725 4833cc xor rcx, rsp
00007fff`bc780728 e813bbfbff call ntdll!_security_check_cookie (00007fff`bc73c240)
00007fff`bc78072d 4881c4f0010000 add rsp, 1F0h
00007fff`bc780734 5b pop rbx
00007fff`bc780735 c3 ret
00007fff`bc780736 cc int 3
00007fff`bc780737 cc int 3
00007fff`bc780738 cc int 3
00007fff`bc780739 cc int 3
00007fff`bc78073a cc int 3
00007fff`bc78073b cc int 3
00007fff`bc78073c cc int 3
00007fff`bc78073d cc int 3
00007fff`bc78073e cc int 3
00007fff`bc78073f cc int 3
ntdll!LdrpDoDebuggerBreak:
00007fff`bc780740 4883ec38 sub rsp, 38h
00007fff`bc780744 488364242000 and qword ptr [rsp+20h], 0
00007fff`bc78074a 41b901000000 mov r9d, 1
00007fff`bc780750 4c8d442440 lea r8, [rsp+40h]
00007fff`bc780755 418d5110 lea edx, [r9+10h]
00007fff`bc780759 48c7c1feffffff mov rcx, 0FFFFFFFFFFFFFFFEh
00007fff`bc780760 e84bcbfcff call ntdll!NtQueryInformationThread (00007fff`bc74d2b0)
00007fff`bc780765 85c0 test eax, eax
00007fff`bc780767 780a js ntdll!LdrpDoDebuggerBreak+0x33 (00007fff`bc780773)
00007fff`bc780769 807c244000 cmp byte ptr [rsp+40h], 0
00007fff`bc78076e 7503 jne ntdll!LdrpDoDebuggerBreak+0x33 (00007fff`bc780773)
00007fff`bc780770 cc int 3
...
Can someone explain what the purpose of multiple int3's in a row is? It reminds me of a nop slide, but I can't imagine why you'd need to do such a thing with a debug command. Or is this just bad disassembly?

Related

Are I/O statements in FreeBasic compiled as function calls?

Example:
' Reads an integer x from the console and prints the value of
' x^3 + 3*x^2 - 24*x + 30.
Dim x As Integer, y As Integer
' Prompt with "x=" and store the typed value in x
Input "x=", x
' Evaluate the polynomial; the result is stored in the Integer y
y = x ^ 3 + 3 * x ^ 2 - 24 * x + 30
Print y
End
When I used FreeBasic compiler to generate the assembly code of this source code, I found
.globl _main
_main:
and
call ___main
in assembly code. In addition, it looks like that the Input statement is compiled as
call _fb_ConsoleInput@12
and
call _fb_InputInt@4
The "^" operator is compiled as
call _pow
(I am not sure whether the math function library of FreeBasic is integrated or external)
and the Print statement is compiled as
call _fb_PrintInt@12
and the End statement is compiled as
call _fb_End@4
The question is: How is FreeBasic source code compiled? Why do _main and ___main appear in the assembly code? Are I/O statements compiled as function calls?
Reference: Assembly code generated by FreeBasic compiler
# 32-bit x86 assembly (GNU as, Intel syntax) emitted by the FreeBASIC compiler
# for the program above. Runtime-library calls use stdcall-decorated names of
# the form _name@N, where N is the size of the argument list in bytes.
# Locals: [ebp-4] = return value of main, [ebp-8] = x, [ebp-12] = y.
.intel_syntax noprefix
.section .text
.balign 16
.globl _main
_main:
push ebp
mov ebp, esp
# align the stack to 16 bytes and reserve space for the locals
and esp, 0xFFFFFFF0
sub esp, 20
mov dword ptr [ebp-4], 0
# C runtime start-up hook, called before anything else in main
call ___main
# fb_Init(argc, argv, 0): initialise the FreeBASIC runtime (3 * 4 = 12 bytes of arguments)
push 0
push dword ptr [ebp+12]
push dword ptr [ebp+8]
call _fb_Init@12
.L_0002:
# Dim x, y: both locals start at zero
mov dword ptr [ebp-8], 0
mov dword ptr [ebp-12], 0
# Input "x=", x: build a temporary string descriptor for the prompt,
# show it, then read an integer into x
push -1
push 0
push 2
push offset _Lt_0004
call _fb_StrAllocTempDescZEx@8
push eax
call _fb_ConsoleInput@12
lea eax, [ebp-8]
push eax
call _fb_InputInt@4
# x ^ 3: call pow(x, 3.0) with both arguments passed as doubles on the stack
push dword ptr [_Lt_0005+4]
push dword ptr [_Lt_0005]
fild dword ptr [ebp-8]
sub esp,8
fstp qword ptr [esp]
call _pow
# pow does not pop its arguments, so the caller removes the 16 bytes
add esp, 16
# 3 * x ^ 2 is computed inline as (x * x) * 3.0 and added to the pow result
fild dword ptr [ebp-8]
fild dword ptr [ebp-8]
fxch st(1)
fmulp
fmul qword ptr [_Lt_0005]
fxch st(1)
faddp
# 24 * x is computed with integer arithmetic, then loaded onto the FPU
# stack through a temporary stack slot and subtracted
mov eax, dword ptr [ebp-8]
imul eax, 24
push eax
fild dword ptr [esp]
add esp, 4
fxch st(1)
fsubrp
# + 30, then convert back to an integer and store in y
fadd qword ptr [_Lt_0006]
fistp dword ptr [ebp-12]
# Print y
push 1
push dword ptr [ebp-12]
push 0
call _fb_PrintInt@12
# End
push 0
call _fb_End@4
.L_0003:
# a second fb_End(0) call is emitted after the one for the End statement
push 0
call _fb_End@4
mov eax, dword ptr [ebp-4]
mov esp, ebp
pop ebp
ret
.section .data
.balign 4
# prompt string for the Input statement
_Lt_0004: .ascii "x=\0"
.balign 8
# IEEE-754 double 3.0 (used as the exponent and as the factor 3)
_Lt_0005: .quad 0x4008000000000000
.balign 8
# IEEE-754 double 30.0
_Lt_0006: .quad 0x403E000000000000
Yes, things like PRINT are implemented as function calls, though I am not sure why this matters to you unless you are currently learning assembly.
As for _main, that is the ASM name for the main() C function used as the main program.
On x86, it is common for global/exported function names in C to be preceded by _ in the ASM output.
___main is the ASM name for the __main() C function called by the MinGW C runtime library startup code before anything in _main is executed.
Again, you'll see the extra _ preceding the C function name.
After that is a call to fb_Init(argc, argv, FB_LANG_FB) to initialize the FreeBASIC runtime library with the default "fb" FreeBASIC dialect and argc elements in the argument vector argv.
The @12 means the argument list is 12 bytes long (e.g., 4+4+4=12 as with fb_Init here); see __stdcall | Microsoft Docs for more information on that.

Performance difference Rust and C++

I am currently learning Rust, and as a first exercise I wanted to implement a function that computes the nth fibonacci number:
/// Prints `fibonacci(i)` for every `i` in `0..48`, one `index: value` pair per line.
fn main() {
    (0..48).for_each(|i| println!("{}: {}", i, fibonacci(i)));
}
/// Returns the `n`-th Fibonacci number using the naive doubly-recursive
/// definition: `fibonacci(0) == 0`, `fibonacci(1) == 1`, and every later
/// value is the sum of the two preceding ones.
fn fibonacci(n: u32) -> u32 {
    // The two base cases both return their own argument.
    if n < 2 {
        return n;
    }
    fibonacci(n - 1) + fibonacci(n - 2)
}
I run it as:
$ time cargo run --release
real 0m15.380s
user 0m15.362s
sys 0m0.014s
As an exercise, I also implemented the same algorithm in C++. I was expecting a similar performance, but the C++ code runs in 80% of the time:
#include<iostream>
unsigned int fibonacci(unsigned int n);
// Prints fibonacci(i) for every i from 0 to 47, one "index: value" pair per line.
int main (int argc, char* argv[]) {
    unsigned int i = 0;
    while (i < 48) {
        std::cout << i << ": " << fibonacci(i) << '\n';
        ++i;
    }
    return 0;
}
// Returns the n-th Fibonacci number using the naive doubly-recursive
// definition: fibonacci(0) == 0, fibonacci(1) == 1, and every later value
// is the sum of the two preceding ones.
unsigned int fibonacci(unsigned int n) {
    // The two base cases both return their own argument.
    if (n < 2) {
        return n;
    }
    return fibonacci(n - 1) + fibonacci(n - 2);
}
Compiled as:
$ g++ test.cpp -o test.exe -O2
And running:
$ time ./test.exe
real 0m12.127s
user 0m12.124s
sys 0m0.000s
Why do I see such a difference in performance? I am not interested in calculating the fibonacci faster in Rust (with a different algorithm); I am only interested on where the difference comes from. This is just an exercise in my progress as I learn Rust.
TL;DR: It's not Rust vs C++, it's LLVM (Clang) vs GCC.
Different optimizers optimize the code differently, and in this case GCC produces larger but faster code.
This can be verified using godbolt.
Here is Rust, compiled with GCC (via rustgcc-master):
# GCC-generated x86-64 code for fibonacci (Intel syntax).
# The argument arrives in edi; the result is accumulated in ebp and returned in eax.
# The body is four nested loops (.L2, .L19, .L16, .L13). Each has its own
# counter (ebx, r12d, r14d, edx) that steps down by 2 and its own accumulator
# (ebp, r13d, r15d, ecx); only the innermost loop contains a call instruction.
example::fibonacci:
# save the registers used as loop counters and accumulators
push r15
push r14
push r13
push r12
push rbp
xor ebp, ebp
push rbx
mov ebx, edi
# 24 bytes of stack, used below to spill ecx and edx around the call
sub rsp, 24
.L2:
# outermost level: counter ebx, accumulator ebp
test ebx, ebx
je .L1
cmp ebx, 1
je .L4
lea r12d, -1[rbx]
xor r13d, r13d
.L19:
# second level: counter r12d, accumulator r13d
cmp r12d, 1
je .L6
lea r14d, -1[r12]
xor r15d, r15d
.L16:
# third level: counter r14d, accumulator r15d
cmp r14d, 1
je .L8
lea edx, -1[r14]
xor ecx, ecx
.L13:
# innermost level: counter edx, accumulator ecx
cmp edx, 1
je .L10
lea edi, -1[rdx]
# ecx and edx are spilled to the stack across the call and reloaded afterwards
mov DWORD PTR 12[rsp], ecx
mov DWORD PTR 8[rsp], edx
call example::fibonacci.localalias
mov ecx, DWORD PTR 12[rsp]
mov edx, DWORD PTR 8[rsp]
add ecx, eax
sub edx, 2
jne .L13
.L14:
# fold the innermost result into the third-level accumulator
add r15d, ecx
sub r14d, 2
je .L17
jmp .L16
.L4:
# outermost counter reached 1: contributes 1 to the result
add ebp, 1
.L1:
# epilogue: return the accumulated value and restore the saved registers
add rsp, 24
mov eax, ebp
pop rbx
pop rbp
pop r12
pop r13
pop r14
pop r15
ret
.L6:
# second-level counter reached 1: contributes 1
add r13d, 1
.L20:
# fold the second-level result into the outermost accumulator
sub ebx, 2
add ebp, r13d
jmp .L2
.L8:
# third-level counter reached 1: contributes 1
add r15d, 1
.L17:
# fold the third-level result into the second-level accumulator
add r13d, r15d
sub r12d, 2
je .L20
jmp .L19
.L10:
# innermost counter reached 1: contributes 1
add ecx, 1
jmp .L14
And with LLVM (via rustc):
# LLVM-generated x86-64 code for fibonacci (Intel syntax).
# The argument arrives in edi and is kept in ebx; ebp accumulates the results
# of the calls; the final value is returned in eax.
example::fibonacci:
push rbp
push r14
push rbx
mov ebx, edi
xor ebp, ebp
# load the function's own address through the GOT; the loop calls through r14
mov r14, qword ptr [rip + example::fibonacci@GOTPCREL]
# n < 2: skip the loop entirely
cmp ebx, 2
jb .LBB0_3
.LBB0_2:
# one call per iteration: add fibonacci(n - 1) to the accumulator,
# then continue with n - 2 in place of a second call
lea edi, [rbx - 1]
call r14
add ebp, eax
add ebx, -2
cmp ebx, 2
jae .LBB0_2
.LBB0_3:
# the remaining n (0 or 1) is its own Fibonacci value; add it to the accumulator
add ebx, ebp
mov eax, ebx
pop rbx
pop r14
pop rbp
ret
We can see that LLVM produces a naive version -- calling the function in each iteration of the loop -- while GCC partially unrolls the recursion by inlining some calls. This results in a smaller number of calls in the case of GCC, and at about 5ns of overhead per function call, it's significant enough.
We can do the same exercise with the C++ version using LLVM via Clang and GCC and note that the result is pretty much similar.
So, as announced, it's an LLVM vs GCC difference, not a language one.
Incidentally, the fact that optimizers may produce such widely different results is a reason why I am quite excited at the progress of the rustc_codegen_gcc initiative (dubbed rustgcc-master on godbolt), which aims at plugging a GCC backend into the rustc frontend: once complete, anyone will be able to switch to the better optimizer for their own workload.

Addition of Two Numbers In Assembly

So I have this code here where I'm trying to add two numbers, but I can't seem to get any output from it, even though I've been working on it for a while:
; Adds the digits 6 and 7 and attempts to print a message followed by the sum
; using int 0x80.
section .text
global _start ;must be declared for using gcc
_start: ;tell linker entry point
; convert the ASCII digits '6' and '7' to their numeric values and add them
mov edx, '6'
sub edx,'0'
mov ecx ,'7'
sub ecx,'0'
add edx,ecx
; stores the raw binary sum (not an ASCII digit string) as a 32-bit value;
; NOTE(review): math_sum reserves only 2 bytes (resb 2) - confirm the size
mov [math_sum], edx
; first int 0x80: eax = address of msg, ebx = len, ecx = 1, edx = 4
; NOTE(review): the Linux i386 write call is usually set up as eax = 4 (sys_write),
; ebx = 1 (stdout), ecx = buffer address, edx = length - confirm the register roles here
mov eax,msg
mov ebx, len
mov ecx,1
mov edx,4
int 0x80
; second int 0x80: eax = address of math_sum, ebx = 1, ecx = 1, edx = 4
; NOTE(review): same register-role question as above
mov eax,math_sum
mov ebx,1
mov ecx,1
mov edx,4
int 0x80
; third int 0x80: only edx is set before the interrupt
; NOTE(review): presumably meant as the exit call (eax = 1) - confirm
mov edx,1
int 0x80
section .data
msg db "Sum of 6 and 7 is:"
; len = number of bytes in msg
len equ $ - msg
segment .bss
math_sum resb 2
I'm getting Segmentation fault but I don't know how to fix it.

InitializeCriticalSection fails in NASM

UPDATE: based on comments below, I revised the code below to add a struc and a pointer (new or revised code has "THIS IS NEW" or "THIS IS UPDATED" beside the code). Now the program does not crash, so the pointer is initialized, but the program hangs at EnterCriticalSection. I suspect that in translating the sample MASM code below into NASM syntax, I did not declare the struc correctly. Any ideas? Thanks very much.
ORIGINAL QUESTION:
Below is a simple test program in 64-bit NASM, to test a critical section in
Windows. This is a dll and the entry point is Main_Entry_fn, which calls Init_Cores_fn, where we initialize four threads (cores) to call Test_fn.
I suspect that the problem is the pointer to the critical section. None of the online resources specifies what that pointer is. The doc "Using Critical Section Objects" at https://learn.microsoft.com/en-us/windows/desktop/sync/using-critical-section-objects shows a C++ example where the pointer appears to be relevant only to EnterCriticalSection and LeaveCriticalSection, but it's not a pointer to an independent object.
For those not familiar with NASM, the first parameter in a C++ signature goes into rcx and the second parameter goes into rdx, but otherwise it should function the same as in C or C++. It's the same thing as InitializeCriticalSectionAndSpinCount(&CriticalSection,0x00000400) in C++.
Here's the entire program:
; Header Section
[BITS 64]
[default rel]
extern malloc, calloc, realloc, free
global Main_Entry_fn
export Main_Entry_fn
extern CreateThread, CloseHandle, ExitThread
extern WaitForMultipleObjects, GetCurrentThreadId
extern InitializeCriticalSectionAndSpinCount, EnterCriticalSection
extern LeaveCriticalSection, DeleteCriticalSection, InitializeCriticalSection
struc CRITICAL_SECTION ; THIS IS NEW
.cs_quad: resq 5
endstruc
section .data align=16
const_1000000000: dq 1000000000
ThreadID: dq 0
TestInfo: times 20 dq 0
ThreadInfo: times 3 dq 0
ThreadInfo2: times 3 dq 0
ThreadInfo3: times 3 dq 0
ThreadInfo4: times 3 dq 0
ThreadHandles: times 4 dq 0
Division_Size: dq 0
Start_Byte: dq 0
End_Byte: dq 0
Return_Data_Array: times 4 dq 0
Core_Number: dq 0
const_inf: dq 0xFFFFFFFF
SpinCount: dq 0x00000400
CriticalSection: ; THIS IS NEW
istruc CRITICAL_SECTION
iend
section .text
; ______________________________________
; Init_Cores_fn: splits the range 0..1000000000 into four equal divisions,
; fills one ThreadInfo record per division, starts one thread per record
; running Test_fn, initialises the critical section, waits for all four
; threads, then jumps to label_900 for clean-up and return.
Init_Cores_fn:
; Calculate the data divisions
mov rax,[const_1000000000]
mov rbx,4 ;cores
xor rdx,rdx
div rbx
; the quotient is both the size of one division and the end of the first one
mov [End_Byte],rax
mov [Division_Size],rax
mov rax,0
mov [Start_Byte],rax
; Populate the ThreadInfo arrays to pass for each core
; ThreadInfo: (1) startbyte; (2) endbyte; (3) Core_Number
mov rdi,ThreadInfo
mov rax,[Start_Byte]
mov [rdi],rax
mov rax,[End_Byte]
mov [rdi+8],rax
mov rax,[Core_Number]
mov [rdi+16],rax
call DupThreadInfo ; Create ThreadInfo arrays for cores 2-4
mov rbp,rsp ; preserve caller's stack frame
; NOTE(review): the Windows x64 convention expects rsp to be 16-byte aligned
; at each call - confirm that subtracting 56 here achieves that on this path
sub rsp,56 ; Shadow space (was 32)
; _____
; Create four threads
label_0:
; Core_Number holds a byte offset (0, 8, 16, 24); it selects the ThreadInfo
; record for this iteration and later indexes the handle array
mov rax,[Core_Number]
cmp rax,0
jne sb2
mov rdi,ThreadInfo
jmp sb5
sb2:cmp rax,8
jne sb3
mov rdi,ThreadInfo2
jmp sb5
sb3:cmp rax,16
jne sb4
mov rdi,ThreadInfo3
jmp sb5
sb4:cmp rax,24
jne sb5
mov rdi,ThreadInfo4
sb5:
; _____
; Create Threads
mov rcx,0 ; lpThreadAttributes (Security Attributes)
mov rdx,0 ; dwStackSize
mov r8,Test_fn ; lpStartAddress (function pointer)
mov r9,rdi ; lpParameter (array of data passed to each core)
; the fifth and sixth arguments go on the stack above the 32-byte shadow space
mov rax,0
mov [rsp+32],rax ; use default creation flags
mov rdi,ThreadID
mov [rsp+40],rdi ; ThreadID
call CreateThread
; Move the handle into ThreadHandles array (returned in rax)
mov rdi,ThreadHandles
mov rcx,[Core_Number]
mov [rdi+rcx],rax
; the same handle is also stored at the same offset in TestInfo
mov rdi,TestInfo
mov [rdi+rcx],rax
; advance to the next 8-byte slot; the loop ends once the offset reaches 32
mov rax,[Core_Number]
add rax,8
mov [Core_Number],rax
mov rbx,32 ; Four cores
cmp rax,rbx
jl label_0
; NOTE(review): the threads created above run Test_fn, which enters
; CriticalSection, yet the section is only initialised here, after all four
; threads have been started - confirm this ordering is intended
mov rcx,CriticalSection ; THIS IS REVISED
mov rdx,[SpinCount]
call InitializeCriticalSectionAndSpinCount
; _____
; Wait
mov rcx,4 ;rax ; number of handles
mov rdx,ThreadHandles ; pointer to handles array
mov r8,1 ; wait for all threads to complete
mov r9,[const_inf] ;4294967295 ;0xFFFFFFFF
call WaitForMultipleObjects
; _____
; restore the stack pointer saved before the shadow space was reserved
mov rsp,rbp ; can we push rbp so we can use it internally?
jmp label_900
; ______________________________________
; Test_fn: thread start routine. rcx points to a three-quadword ThreadInfo
; record (start, end, core offset). Counts from start up to end, taking and
; releasing CriticalSection around every increment, then stores the final
; counter in Return_Data_Array at the core offset and calls ExitThread.
Test_fn:
mov rdi,rcx
mov r14,[rdi] ; Start_Byte
mov r15,[rdi+8] ; End_Byte
mov r13,[rdi+16] ; Core_Number
;______
; while(n < 1000000000)
label_401:
cmp r14,r15
jge label_899
; NOTE(review): no shadow space is reserved and rsp is not adjusted before
; the calls in this routine - confirm against the Windows x64 calling convention
mov rcx,CriticalSection
call EnterCriticalSection
; n += 1
add r14,1
mov rcx,CriticalSection
call LeaveCriticalSection
jmp label_401
;______
label_899:
; publish this thread's final counter at its own slot
mov rdi,Return_Data_Array
mov [rdi+r13],r14
; loads this thread's handle into rax
; NOTE(review): rcx is not set before ExitThread - confirm the intended exit code argument
mov rbp,ThreadHandles
mov rax,[rbp+r13]
call ExitThread
ret
; __________
; label_900: exit path of Init_Cores_fn. Deletes the critical section and
; returns the address of Return_Data_Array in rax.
label_900:
mov rcx,CriticalSection
call DeleteCriticalSection
mov rdi,Return_Data_Array
mov rax,rdi
ret
; __________
; Main Entry
; Main_Entry_fn: exported entry point. Saves rdi and rbp (both are overwritten
; inside Init_Cores_fn), runs Init_Cores_fn, restores them and returns with
; whatever Init_Cores_fn left in rax.
Main_Entry_fn:
push rdi
push rbp
call Init_Cores_fn
pop rbp
pop rdi
ret
; DupThreadInfo: fills ThreadInfo2, ThreadInfo3 and ThreadInfo4. Each record is
; (start, end, core offset); every start is the previous record's end and
; every end is one Division_Size further on. Start_Byte is used as the running
; position and is updated after each record. Clobbers rax and rdi.
DupThreadInfo:
; record 2: core offset 8, range shifted by one division from the first record
mov rdi,ThreadInfo2
mov rax,8
mov [rdi+16],rax ; Core Number
mov rax,[Start_Byte]
add rax,[Division_Size]
mov [rdi],rax
mov rax,[End_Byte]
add rax,[Division_Size]
mov [rdi+8],rax
mov [Start_Byte],rax
; record 3: core offset 16, starts where record 2 ended
mov rdi,ThreadInfo3
mov rax,16
mov [rdi+16],rax ; Core Number
mov rax,[Start_Byte]
mov [rdi],rax
add rax,[Division_Size]
mov [rdi+8],rax
mov [Start_Byte],rax
; record 4: core offset 24, starts where record 3 ended
mov rdi,ThreadInfo4
mov rax,24
mov [rdi+16],rax ; Core Number
mov rax,[Start_Byte]
mov [rdi],rax
add rax,[Division_Size]
mov [rdi+8],rax
mov [Start_Byte],rax
ret
The code above shows the functions in three separate places, but of course we test them one at a time (but they all fail).
To summarize, my question is why do InitializeCriticalSection and InitializeCriticalSectionAndSpinCount both fail in the code above? The inputs are dead simple, so I don't understand why it should not work.
InitializeCriticalSection takes a pointer to a critical section object:
The process is responsible for allocating the memory used by a
critical section object, which it can do by declaring a variable of
type CRITICAL_SECTION.
So the code can be something like this (I use MASM syntax):
; CRITICAL_SECTION is declared here as five uninitialised quadwords (40 bytes).
CRITICAL_SECTION STRUCT
DQ 5 DUP(?)
CRITICAL_SECTION ends
; the API functions are declared as QWORD-sized __imp_ symbols
extern __imp_InitializeCriticalSection:QWORD
extern __imp_InitializeCriticalSectionAndSpinCount:QWORD
.DATA?
; storage for the critical section object, placed in uninitialised data
CriticalSection CRITICAL_SECTION {}
.CODE
; first argument (rcx) = address of the critical section object
lea rcx,CriticalSection
; commented-out alternative: initialise with a spin count of 400h in edx
;mov edx,400h
;call __imp_InitializeCriticalSectionAndSpinCount
call __imp_InitializeCriticalSection
You also need to declare all imported functions as
extern __imp_funcname:QWORD
instead of
extern funcname

Assembly Programming using SASM on Windows, with an example using int 0x80 (Linux system calls)

I need help. I'm trying to run the program (NASM) below in SASM.
; Prompts for two digits, reads each from standard input, adds them and prints
; the single-character result, using Linux int 0x80 system calls throughout.
; system call numbers and file descriptors used below
SYS_EXIT equ 1
SYS_READ equ 3
SYS_WRITE equ 4
STDIN equ 0
STDOUT equ 1
segment .data
msg1 db "Enter a digit ", 0xA,0xD
len1 equ $- msg1
msg2 db "Please enter a second digit", 0xA,0xD
len2 equ $- msg2
msg3 db "The sum is: "
len3 equ $- msg3
segment .bss
; two bytes per input buffer, one byte for the result
num1 resb 2
num2 resb 2
res resb 1
section .text
global _start ;must be declared for using gcc
_start: ;tell linker entry point
; print the first prompt
mov eax, SYS_WRITE
mov ebx, STDOUT
mov ecx, msg1
mov edx, len1
int 0x80
; read up to 2 bytes into num1
mov eax, SYS_READ
mov ebx, STDIN
mov ecx, num1
mov edx, 2
int 0x80
; print the second prompt
mov eax, SYS_WRITE
mov ebx, STDOUT
mov ecx, msg2
mov edx, len2
int 0x80
; read up to 2 bytes into num2
mov eax, SYS_READ
mov ebx, STDIN
mov ecx, num2
mov edx, 2
int 0x80
; print the result label
mov eax, SYS_WRITE
mov ebx, STDOUT
mov ecx, msg3
mov edx, len3
int 0x80
; moving the first number to eax register and second number to ebx
; and subtracting ascii '0' to convert it into a decimal number
; NOTE(review): these are 32-bit loads from 2-byte buffers, so the upper bytes
; of eax/ebx come from whatever follows each buffer in memory - confirm
mov eax, [num1]
sub eax, '0'
mov ebx, [num2]
sub ebx, '0'
; add eax and ebx
add eax, ebx
; add '0' to to convert the sum from decimal to ASCII
add eax, '0'
; storing the sum in memory location res
; NOTE(review): this is a 32-bit store into a 1-byte reservation - confirm
mov [res], eax
; print the sum
mov eax, SYS_WRITE
mov ebx, STDOUT
mov ecx, res
mov edx, 1
int 0x80
exit:
; exit with status 0
mov eax, SYS_EXIT
xor ebx, ebx
int 0x80
I had this error:
[20:53:11] Warning! Errors have occurred in the build:
c:/program files (x86)/sasm/mingw/bin/../lib/gcc/mingw32/4.6.2/../../../libmingw32.a(main.o): In function 'main':
C:\MinGW\msys\1.0\src\mingwrt/../mingw/main.c:73: undefined reference to `WinMain@16'
Also, how do I limit users input up to 4 digits only?
global _start should be changed to global main, and the Linux system calls should be replaced by Windows API function calls, declared as external. Modern versions of Windows do not support invoking system calls directly, because of malware or badware risks, so there are no stable (permanent) system call numbers. Every modern version of Windows uses different system call numbers; although you can find them on the internet, you shouldn't rely on them unless you want to revise your assembly code for each version of Windows, which reduces portability and increases workload. There are also significant differences between Linux/Mac and Windows in how registers, the stack, and function names are handled.

Resources