Code Optimization Tips: - sorting

I am using the following ASM routine to bubble sort an array. I want to know of the inefficiencies of my code:
.386
.model flat, c
option casemap:none
.code
public sample
sample PROC
;[ebp+0Ch]Length
;[ebp+08h]Array
push ebp
mov ebp, esp
push ecx
push edx
push esi
push eax
mov ecx,[ebp+0Ch]
mov esi,[ebp+08h]
_bubbleSort:
push ecx
push esi
cmp ecx,1
je _exitLoop
sub ecx,01h
_miniLoop:
push ecx
mov edx,DWORD PTR [esi+4]
cmp DWORD PTR [esi],edx
ja _swap
jmp _continueLoop
_swap:
lodsd
mov DWORD PTR [esi-4],edx
xchg DWORD PTR [esi],eax
jmp _skipIncrementESI
_continueLoop:
add esi,4
_skipIncrementESI:
pop ecx
loop _miniLoop
_exitLoop:
pop esi
pop ecx
loop _bubbleSort
pop eax
pop esi
pop edx
pop ecx
pop ebp
ret
sample ENDP
END
Basically I have two loops, as usual for the bubble sort algorithm. The value of ecx for the outer loop is 10, and for the inner loop it is [ecx-1]. I have tried the routine and it compiles and runs successfully, but I am not sure if it is efficient.

There are several things you can do to speed up your assembly code:
don't do things like ja label_1 ; jmp label_2. Just do jbe label_2 instead.
loop is a very slow instruction. dec ebx; jnz loopstart is much faster
use all registers instead of repeatedly push/pop ecx and esi. Use ebx and edi too.
jmp-targets should be well aligned. Use align 4 before the two loop-starts and after the jbe
Get yourself a manual for your cpu from Intel (you can download it as pdf), it has the timings for the opcodes, maybe it has other hints too.

Several simple tips:
1) Try to minimize the number of conditional jumps, because they are very expensive. Unroll if possible.
2) Reorder instructions to minimize stalls because of data depencency:
cmp DWORD PTR [esi],edx ;// takes some time to compute,
mov edx,DWORD PTR [esi+4] ;
ja _swap ;// waits for results of cmp
3) Avoid old composite instructions (dec, jnz pair is faster than loop and is not bound to ecx register)
It would be quite difficult to write assembly code that is faster than the code generated by optimizing C compiler, because you should consider lots of factors: size of data and instruction caches, alignments, pipeline, instruction timings. You can find some good documentation about this here. I especially recommend the first book: Optimizing software in C++

Substitute for "add esi,4" if we do not need a flag for this instruction:
_continueLoop:
lea esi,[esi+4]

Related

How to get Windows system calls assembly statically?

I am trying to find a specific pattern in the Windows system calls, for research purposes.
So far i've been looking into the Windows dlls such as ntdll.dll, user32.dll, etc., but those seem to contain only wrapper codes for preparing to jump to the system call. For example:
mov eax, 101Eh
lea edx, [esp+arg_0]
mov ecx, 0
call large dword ptr fs:0C0h
retn 10h
I'm guessing the call large dword ptr fs:0C0h instruction is another gateway in the chain that finally leads to the actual assembly, but I was wondering if I can get to that assembly directly.
You're looking in the wrong dlls. The system calls are in ntoskrnl.exe.
If you look at NtOpenFile() in ntoskrnl.exe you'll see:
mov r11, rsp
sub rsp, 88h
mov eax, [rsp+88h+arg_28]
xor r10d, r10d
mov [r11-10h], r10
mov [rsp+88h+var_18], 20h ; int
mov [r11-20h], r10d
mov [r11-28h], r10
mov [r11-30h], r10d
mov [r11-38h], r10d
mov [r11-40h], r10
mov [rsp+88h+var_48], eax ; int
mov eax, [rsp+88h+arg_20]
mov [rsp+88h+var_50], 1 ; int
mov [rsp+88h+var_58], eax ; int
mov [r11-60h], r10d
mov [r11-68h], r10
call IopCreateFile
add rsp, 88h
retn
Which is the true body of the function. Most of the work is done in IopCreateFile(), but you can follow it statically and do whatever analysis you need.

Finding Smallest Number in List

My goal in this code is to find the smallest number in the list. I used bubble sort method in this case; unfortunately, the code is not giving me the smallest/minimum number. Please take a look, Thanks:
include irvine32.inc
.data
input byte 100 dup(0)
stringinput byte "Enter any string: ",0
totallength byte "The total length is: ",0
minimum byte "The minimum value is: ",0
.code
stringLength proc
push ebp
mov ebp, esp
push ebx
push ecx
mov eax, 0
mov ebx, [ebp+8]
L1:
mov ecx, [ebx] ;you can use ecx, cx, ch, cl
cmp ecx, 0 ;you can use ecx, cx, ch, cl
JE L2
add ebx, 1
add eax, 1
jmp L1
L2:
pop ecx
pop ebx
mov ebp, esp
pop ebp
ret 4
stringLength endp
BubbleSort PROC uses ECX
push edx
xor ecx,ecx
mov ecx, 50
OUTER_LOOP:
push ecx
xor ecx,ecx
mov ecx,14
mov esi, OFFSET input
COMPARE:
xor ebx,ebx
xor edx,edx
mov bl, byte ptr ds:[esi]
mov dl, byte ptr ds:[esi+1]
cmp bl,dl
jg SWAP1
CONTINUE:
add esi,2
loop COMPARE
mov esi, OFFSET input
pop ecx
loop OUTER_LOOP
jmp FINISHED
SWAP1:
xchg bl,dl
mov byte ptr ds:[esi+1],dl
mov byte ptr ds:[esi],bl
jmp CONTINUE
FINISHED:
pop edx
ret 4
BubbleSort ENDP
main proc
call clrscr
mov edx, offset stringinput
call writeString
mov edx, offset input
call writeString
call stringLength
mov edx, offset input
mov ecx, sizeof input
call readstring
call crlf
mov edx,offset totallength
call writestring
call writedec
call crlf
mov edx, offset minimum
call crlf
call writeString
push offset input
call BubbleSort
mov edx, offset input
call writeString
call crlf
exit
main endp
end main
I haven't looked over your code, because sorting is an over complicated method for what you want to do. Not only that, but most of us don't pay too much attention to uncommented code. Just takes to long to figure out what you're trying to do.
Simply iterate through the entire list and start with 255 (FFH) in AL let's say. Each time you come across a number that is smaller than the one in AL, then replace it with that value and then when loop is finished, AL will have the lowest value.
If you need to know where it is in the list, you could maybe use AH which would be the difference between start address and current address. Knowledge of the instruction set is essential as finding the length of the string can be simplified by;
mov di, input ; Point to beginning of buffer
mov cx, -1 ; for a maximum of 65535 characters
xor al, al ; Looking for NULL
rep scasb
neg cx
dec cx ; CX = length of string.
Remember, ES needs to point to #DATA

masm call procedure access violation

So I am working on an assignment in assembly to generate a fibonacci sequence. I've written the code successfully in the main procedure but when I try to wrap it in it's own procedure and call that procedure I run into an access violation error. Here's my code:
INCLUDE Irvine32.inc
.data
array DWORD 47 DUP(?)
.code
main proc
mov esi, OFFSET array
call generate_fibonacci
invoke ExitProcess,0
main endp
generate_fibonacci PROC
mov DWORD PTR [esi], 1h
add esi, 4
mov DWORD PTR [esi], 1h
push [esi]
push [esi - 4]
add esi, 4
mov ecx, 45
L1:
pop eax
pop ebx
add eax, ebx
mov DWORD PTR [esi], eax
add esi, 4
push [esi - 4]
push [esi - 8]
loop L1
ret
generate_fibonacci ENDP
end main
The error looks like this: "Exception thrown at some memory location in Project...: Access violation executing location same memory location.
I noticed that the memory location listed in the error message was being loaded onto the EIP register when the call generate_fibonacci instruction is executed. I'm not sure how to fix this.
The pushes and pops in your PROC are not balanced.
Before loop L1: you make 2 pushes. Within the loop L1: you make 2 pops and 2 pushes. When loop L1: ends, that leaves 2 items still on the stack when ret attempts to pull off the return address. So the code tries to resume execution somewhere that causes an access violation.
Please add two lines of code before the ret instruction to clean up the stack
pop eax
pop eax
ret
If the same code worked when it was in main, it worked because main does not end with ret.
EDIT. You could simplify it considerably by keeping the recent terms in registers. The last three terms will be in eax, ebx, edx.
generate_fibonacci PROC
mov eax, 1 ;init first two terms
mov DWORD PTR [esi], eax ;store first two terms
add esi, 4
mov DWORD PTR [esi], eax
add esi, 4
mov ebx, eax
mov ecx, 45 ;init loop count
L1:
mov edx, ebx ;move terms along
mov ebx, eax
add eax, edx ;sum last two terms
mov DWORD PTR [esi], eax
add esi, 4
loop L1
ret
generate_fibonacci ENDP

Euclidian GCD in Language Assembly. Code not working

I am writing this Euclidian GCD program in Language assembly and I think I know what is the problem but I don't know how to fix it. The thing is I am calling GCD recursively from within and every time I call GCD the ESP moves 4 bytes down because it has to store the return address on the stack with each call. Therefore, my EBP will point 4 bytes down from the previous call. Can someone help me fix this code?
;Kirtan Patel
;Create a Euclidian GCD Program
;10/30/2014
.586
.MODEL FLAT
.STACK 4096
.DATA
numberm DWORD 14
numbern DWORD 10
.CODE
main PROC
push numbern ;push 10 onto the stack
push numberm ;push 14 onto the stack
call gcd ; call gcd function
add esp, 8 ;pop off the parameters from the stack.
ret ;exit the program
main ENDP
gcd PROC
push ebp ;push ebp onto the stack to preserve previous contents of ebp
mov ebp, esp ;copy esp to ebp to access the parameters 10 and 14 later on
push edx ;save the registers
push ebx
push ecx
mov ecx, DWORD PTR[ebp+12] ;copy 10 to ecx
cmp ecx, 0 ;compare to see if the divisor is zero
jnz recur ;if it is not zero then recursively call gcd
mov eax, DWORD PTR[ebp+8] ; if it zero then copy 14 to eax and return
pop ecx ;restore the contents of registers before exiting the function
pop ebx
pop edx
pop ebp
ret
recur: mov eax, DWORD PTR[ebp+8] ;copy 14 to eax
cdq ; prepare the edx register for division to store the remainder
div ecx ;eax/ecx (14/10)
mov DWORD PTR[ebp+12], edx ;copy the remainder into numbern on the stack
mov DWORD PTR[ebp+8], ecx ;copy the new divisor into numberm on the stack
pop ecx ;restore registers
pop ebx
pop edx
pop ebp
call gcd ;recursively call gcd
gcd ENDP
END
You can pass parameters on the stack. Use this C program as a prototype for your recursive function, and use the techniques described here to pass your parameters on each recursive call.
int findgcd(int x,int y){
while(x!=y){
if(x>y)
return findgcd(x-y,y);
else
return findgcd(x,y-x);
}
return x;
}

Self modifying algorithms with Virtualprotect problems

I'm having problems with the Virtualprotect() api by windows.
I got an assignment from school, my teacher told us that in the past when memory was scarce and costly. Programmers had to create advanced algorithms that would modify itself on the fly to save memory. So there you have it, we must now write such an algorithm, it doesn't have to be effective but it must modify itself.
So I set out to do just that, and I think that I made it pretty far before asking for any help.
My program works like this:
I have a function and a loop with a built-in stack overflow. The stack gets overflown with the address of a memory location where code resides that is constructed during the loop. Control is passed to the code in memory. The code loads a dll and then exits, but before it exits it has to repair the loop. It is one of the conditions of our assignment, everything changed in the original loop must be restored.
The problem is that I don't have write access to the loop, only READ_EXECUTE, so to change my access I thought, I use virtualprotect. But that function returned an error:
ERROR_NOACCESS, the documentation on this error is very slim, windows only says: Invailid access to memory address. Which figures since I wanted to change the access in the first place. So what's wrong? Here's the code constructed in memory:
The names of all the data in my code is a little vague, so I provided a few comments
Size1:
TrapData proc
jmp pLocals
LocalDllName db 100 dup(?) ; name of the dll to be called ebx-82h
RestoreBuffer db 5 dup(?) ; previous bytes at the overflow location
LoadAddress dd 0h ; ebx - 19h ; address to kernel32.loadlibrary
RestoreAddress dd 0h ; ebx - 15h ; address to restore (with the restore buffer)
AddressToRestoreBuffer dd 0h ; ebx - 11h ; obsolete, I don't use this one
AddressToLea dd 0h ; ebx - 0Dh Changed, address to kernel32.virutalprotect
AddressToReturnTo dd 0h ; ebx - 9h address to return execution to(the same as RestoreAddress
pLocals:
call Refpnt
Refpnt: pop ebx ; get current address in ebx
push ebx
mov eax, ebx
sub ebx, 82h
push ebx ; dll name
sub eax, 19h ; load lib address
mov eax, [eax]
call eax
pop ebx ; Current address
push ebx
;BOOL WINAPI VirtualProtect(
; __in LPVOID lpAddress,
; __in SIZE_T dwSize,
; __in DWORD flNewProtect,
; __out PDWORD lpflOldProtect
;);
mov eax, ebx
mov esi, ebx
sub eax, 82h
push eax ; overwrite the buffer containing the dll name, we don't need it anymore
push PAGE_EXECUTE_READWRITE
push 5h
sub esi, 15h
mov esi, [esi]
push esi
sub ebx, 0Dh
mov ebx, [ebx]
call ebx ; Returns error 998 ERROR_NOACCESS (to what?)
pop ebx
push ebx
sub ebx, 1Eh
mov eax, ebx ; restore address buffer pointer
pop ebx
push ebx
sub ebx, 15h ; Restore Address
mov ebx, [ebx]
xor esi, esi ; counter to 0
#0:
push eax
mov al, byte ptr[eax+esi]
mov byte ptr[ebx+esi], al
pop eax
inc esi
cmp esi, 5
jne #0
pop ebx
sub ebx, 9h
mov ebx, [ebx]
push ebx ; address to return to
ret
Size2:
So what's wrong?
Can you guys help me?
EDIT, Working code:
Size1:
jmp pLocals
LocalDllName db 100 dup(?)
RestoreBuffer db 5 dup(?)
LoadAddress dd 0h ; ebx - 19h
RestoreAddress dd 0h ; ebx - 15h
AddressToRestoreBuffer dd 0h ; ebx - 11h
AddressToLea dd 0h ; ebx - 0Dh
AddressToReturnTo dd 0h ; ebx - 9h
pLocals:
call Refpnt
Refpnt: pop ebx ; get current address in ebx
push ebx
mov eax, ebx
sub ebx, 82h
push ebx ; dll name
sub eax, 19h ; load lib address
mov eax, [eax]
call eax
pop ebx ; Current address
push ebx
;BOOL WINAPI VirtualProtect(
; __in LPVOID lpAddress,
; __in SIZE_T dwSize,
; __in DWORD flNewProtect,
; __out PDWORD lpflOldProtect
;);
mov esi, ebx
push 0
push esp
push PAGE_EXECUTE_READWRITE
push 5h
sub esi, 15h
mov esi, [esi]
push esi
sub ebx, 0Dh
mov ebx, [ebx]
call ebx
pop ebx
pop ebx
push ebx
sub ebx, 1Eh
mov eax, ebx ; restore address buffer pointer
pop ebx
push ebx
sub ebx, 15h ; Restore Address
mov ebx, [ebx]
xor esi, esi ; counter to 0
#0:
push eax
mov al, byte ptr[eax+esi]
mov byte ptr[ebx+esi], al
pop eax
inc esi
cmp esi, 5
jne #0
pop ebx
sub ebx, 9h
mov ebx, [ebx]
push ebx ; address to return to
ret
Size2:
Maybe a little sloppy, but I that doesn't mater ;)
You are trying to make VirtualProtect write lpflOldProtect to a read-only memory location, i.e. your current code section which is what you're trying to unprotect in the first place! My guess is this is what gives you the ERROR_NO_ACCESS. Since you're using the stack anyway, have it write lpflOldProtect to a stack location.
This isn't nearly as easy as it was in the old days; read access used to imply execute access, and a lot of memory mappings were mapped writable.
These days, I'd be surprised if there are many (any?) memory mappings that are both writable and executable. (And modern CPUs with PAE support are sufficient for even 32-bit kernels to provide non-executable-yet-readable mappings.)
I'd say, first things first, find an older Windows system, Win2k or earlier, then start trying to tackle this problem. :)
EDIT: Oh! I thought loading the DLL failed. Good work. :)
What do you mean by 'restore the loop'? Since you smashed the stack to jump to your code, you didn't really destroy the loop's text segment, you only scribbled on the stack. You could insert another function before your loop, then return from your dll to the function that called your loop. (You 'returned' into your injected code from the loop, so you can't return into the loop without building a fake stack frame for it; returning to the previous function seems easier than building a fake stack frame.)

Resources