InitializeCriticalSection fails in NASM - windows

UPDATE: based on comments below, I revised the code below to add a struc and a pointer (new or revised code has "THIS IS NEW" or "THIS IS UPDATED" beside the code). Now the program does not crash, so the pointer is initialized, but the programs hangs at EnterCriticalSection. I suspect that in translating the sample MASM code below into NASM syntax, I did not declare the struc correctly. Any ideas? Thanks very much.
ORIGINAL QUESTION:
Below is a simple test program in 64-bit NASM, to test a critical section in
Windows. This is a dll and the entry point is Main_Entry_fn, which calls Init_Cores_fn, where we initialize four threads (cores) to call Test_fn.
I suspect that the problem is the pointer to the critical section. None of the online resources specifies what that pointer is. The doc "Using Critical Section Objects" at https://learn.microsoft.com/en-us/windows/desktop/sync/using-critical-section-objects shows a C++ example where the pointer appears to be relevant only to EnterCriticalSection and LeaveCriticalSection, but it's not a pointer to an independent object.
For those not familiar with NASM, the first parameter in a C++ signature goes into rcx and the second parameter goes into rds, but otherwise it should function the same as in C or C++. It's the same thing as InitializeCriticalSectionAndSpinCount(&CriticalSection,0x00000400) in C++.
Here's the entire program:
; Header Section
[BITS 64]
[default rel]
extern malloc, calloc, realloc, free
global Main_Entry_fn
export Main_Entry_fn
extern CreateThread, CloseHandle, ExitThread
extern WaitForMultipleObjects, GetCurrentThreadId
extern InitializeCriticalSectionAndSpinCount, EnterCriticalSection
extern LeaveCriticalSection, DeleteCriticalSection, InitializeCriticalSection
struc CRITICAL_SECTION ; THIS IS NEW
.cs_quad: resq 5
endstruc
section .data align=16
const_1000000000: dq 1000000000
ThreadID: dq 0
TestInfo: times 20 dq 0
ThreadInfo: times 3 dq 0
ThreadInfo2: times 3 dq 0
ThreadInfo3: times 3 dq 0
ThreadInfo4: times 3 dq 0
ThreadHandles: times 4 dq 0
Division_Size: dq 0
Start_Byte: dq 0
End_Byte: dq 0
Return_Data_Array: times 4 dq 0
Core_Number: dq 0
const_inf: dq 0xFFFFFFFF
SpinCount: dq 0x00000400
CriticalSection: ; THIS IS NEW
istruc CRITICAL_SECTION
iend
section .text
; ______________________________________
Init_Cores_fn:
; Calculate the data divisions
mov rax,[const_1000000000]
mov rbx,4 ;cores
xor rdx,rdx
div rbx
mov [End_Byte],rax
mov [Division_Size],rax
mov rax,0
mov [Start_Byte],rax
; Populate the ThreadInfo arrays to pass for each core
; ThreadInfo: (1) startbyte; (2) endbyte; (3) Core_Number
mov rdi,ThreadInfo
mov rax,[Start_Byte]
mov [rdi],rax
mov rax,[End_Byte]
mov [rdi+8],rax
mov rax,[Core_Number]
mov [rdi+16],rax
call DupThreadInfo ; Create ThreadInfo arrays for cores 2-4
mov rbp,rsp ; preserve caller's stack frame
sub rsp,56 ; Shadow space (was 32)
; _____
; Create four threads
label_0:
mov rax,[Core_Number]
cmp rax,0
jne sb2
mov rdi,ThreadInfo
jmp sb5
sb2:cmp rax,8
jne sb3
mov rdi,ThreadInfo2
jmp sb5
sb3:cmp rax,16
jne sb4
mov rdi,ThreadInfo3
jmp sb5
sb4:cmp rax,24
jne sb5
mov rdi,ThreadInfo4
sb5:
; _____
; Create Threads
mov rcx,0 ; lpThreadAttributes (Security Attributes)
mov rdx,0 ; dwStackSize
mov r8,Test_fn ; lpStartAddress (function pointer)
mov r9,rdi ; lpParameter (array of data passed to each core)
mov rax,0
mov [rsp+32],rax ; use default creation flags
mov rdi,ThreadID
mov [rsp+40],rdi ; ThreadID
call CreateThread
; Move the handle into ThreadHandles array (returned in rax)
mov rdi,ThreadHandles
mov rcx,[Core_Number]
mov [rdi+rcx],rax
mov rdi,TestInfo
mov [rdi+rcx],rax
mov rax,[Core_Number]
add rax,8
mov [Core_Number],rax
mov rbx,32 ; Four cores
cmp rax,rbx
jl label_0
mov rcx,CriticalSection ; THIS IS REVISED
mov rdx,[SpinCount]
call InitializeCriticalSectionAndSpinCount
; _____
; Wait
mov rcx,4 ;rax ; number of handles
mov rdx,ThreadHandles ; pointer to handles array
mov r8,1 ; wait for all threads to complete
mov r9,[const_inf] ;4294967295 ;0xFFFFFFFF
call WaitForMultipleObjects
; _____
mov rsp,rbp ; can we push rbp so we can use it internally?
jmp label_900
; ______________________________________
Test_fn:
mov rdi,rcx
mov r14,[rdi] ; Start_Byte
mov r15,[rdi+8] ; End_Byte
mov r13,[rdi+16] ; Core_Number
;______
; while(n < 1000000000)
label_401:
cmp r14,r15
jge label_899
mov rcx,CriticalSection
call EnterCriticalSection
; n += 1
add r14,1
mov rcx,CriticalSection
call LeaveCriticalSection
jmp label_401
;______
label_899:
mov rdi,Return_Data_Array
mov [rdi+r13],r14
mov rbp,ThreadHandles
mov rax,[rbp+r13]
call ExitThread
ret
; __________
label_900:
mov rcx,CriticalSection
call DeleteCriticalSection
mov rdi,Return_Data_Array
mov rax,rdi
ret
; __________
; Main Entry
Main_Entry_fn:
push rdi
push rbp
call Init_Cores_fn
pop rbp
pop rdi
ret
DupThreadInfo:
mov rdi,ThreadInfo2
mov rax,8
mov [rdi+16],rax ; Core Number
mov rax,[Start_Byte]
add rax,[Division_Size]
mov [rdi],rax
mov rax,[End_Byte]
add rax,[Division_Size]
mov [rdi+8],rax
mov [Start_Byte],rax
mov rdi,ThreadInfo3
mov rax,16
mov [rdi+16],rax ; Core Number
mov rax,[Start_Byte]
mov [rdi],rax
add rax,[Division_Size]
mov [rdi+8],rax
mov [Start_Byte],rax
mov rdi,ThreadInfo4
mov rax,24
mov [rdi+16],rax ; Core Number
mov rax,[Start_Byte]
mov [rdi],rax
add rax,[Division_Size]
mov [rdi+8],rax
mov [Start_Byte],rax
ret
The code above shows the functions in three separate places, but of course we test them one at a time (but they all fail).
To summarize, my question is why do InitializeCriticalSection and InitializeCriticalSectionAndSpinCount both fail in the code above? The inputs are dead simple, so I don't understand why it should not work.

InitializeCriticalSection take pointer to critical section object
The process is responsible for allocating the memory used by a
critical section object, which it can do by declaring a variable of
type CRITICAL_SECTION.
so code can be something like (i use masm syntax)
CRITICAL_SECTION STRUCT
DQ 5 DUP(?)
CRITICAL_SECTION ends
extern __imp_InitializeCriticalSection:QWORD
extern __imp_InitializeCriticalSectionAndSpinCount:QWORD
.DATA?
CriticalSection CRITICAL_SECTION {}
.CODE
lea rcx,CriticalSection
;mov edx,400h
;call __imp_InitializeCriticalSectionAndSpinCount
call __imp_InitializeCriticalSection
also you need declare all imported functions as
extern __imp_funcname:QWORD
instead
extern funcname

Related

What does DebugView's "Enable Verbose Kernel Output" do?

I've followed Microsoft's "Write a Hello World Windows Driver (KMDF)" and wasn't sure how to see the output on WinDbg. After trying to set the Debug Print Filter registry to 0xFFFFFFFF, rebooting and other rain dance solutions, the one thing that worked was enabling DebugView's "Enable Verbose Kernel Output" option. Now, WinDbg shows debug outputs. Its too verbose but at least it's there.
So what did DebugView modify for WinDbg to show more verbose debug output?
I'm running WinDbg attached to a VM from my Windows host with a bridged connection.
TL;DR: it calls a driver to repeatedly call NtSetDebugFilterState on all kernel components, so that they are all able to print something on the debug output.
Program
Let start with the program itself; there's only one occurrence of the sentence "Enable Verbose Kernel Output":
mov [rsp+78h+mi.wID], 9C7Ch
lea rax, aEnableVerboseK ; "Enable &Verbose Kernel Output"
sbb ecx, ecx
mov [rsp+78h+mi.dwTypeData], rax
and ecx, 8
mov [rsp+78h+mi.fState], ecx
mov rcx, cs:hMenu ; hMenu
call cs:GetSubMenu
mov rcx, rax ; hmenu
lea r9, [rsp+78h+mi] ; lpmi
lea edx, [rdi+3] ; item
lea r8d, [rdi+1] ; fByPosition
call cs:InsertMenuItemA
The above code insert the sub-menu into the main menu. What's important here is the the menu ID, namely 0x9C7C.
This menu ID is used only once more here:
movzx edx, al ; al can either be 0 or 1
xor edi, edi
mov qword ptr [rsp+830h+iNumButtons], rdi ; lpOverlapped
lea rax, [rsp+830h+BytesReturned]
mov [rsp+830h+lpButtons], rax ; lpBytesReturned
xor edx, 1
mov dword ptr [rsp+830h+wBMID], edi ; nOutBufferSize
xor r9d, r9d ; nInBufferSize
xor r8d, r8d ; lpInBuffer
mov [rsp+830h+dwInitParam], rdi ; lpOutBuffer
lea edx, ds:0FFFFFFFF8305003Ch[rdx*4] ; dwIoControlCode
call cs:DeviceIoControl
movzx eax, cs:byte_1400935A3
mov edx, 9C7Ch ; uIDCheckItem
mov rcx, cs:hMenu ; hMenu
mov cs:byte_1400A2776, al
neg al
sbb r8d, r8d
and r8d, 8 ; uCheck
call cs:CheckMenuItem
The above code calls DeviceIoControl and then checks the menu item. The former means the program is actually talking with a device driver.
If we remove a bit of code we can see which IOCTL can be sent to the driver:
movzx edx, al ; al can either be 0 or 1
; snip
xor edx, 1 ; invert AL
; snip
lea edx, ds:0FFFFFFFF8305003Ch[rdx*4] ; dwIoControlCode
call cs:DeviceIoControl
Since RDX can be either 0 or 1 we end up with (base 10):
[rdx*4-2096824260]
Thus:
4 - 2096824260 = -2096824256
0 - 2096824260 = -2096824260
Looking at the handles opened by dbgview64.exe we can see a \Device\dbgv is currently opened.
0: kd> !devobj \Device\dbgv
Device object (ffffd58a97007630) is for:
Dbgv \Driver\DBGV DriverObject ffffd58a8688aaa0
Current Irp 00000000 RefCount 0 Type 00008305 Flags 00000048
SecurityDescriptor ffffe58fb8bdeea0 DevExt 00000000 DevObjExt ffffd58a97007780
ExtensionFlags (0x00000800) DOE_DEFAULT_SD_PRESENT
Characteristics (0000000000)
Device queue is not busy.
0: kd> dt _driver_object ffffd58a8688aaa0
nt!_DRIVER_OBJECT
+0x000 Type : 0n4
+0x002 Size : 0n336
+0x008 DeviceObject : 0xffffd58a`97007630 _DEVICE_OBJECT
+0x010 Flags : 0x12
+0x018 DriverStart : 0xfffff800`dcf90000 Void
+0x020 DriverSize : 0x9000
+0x028 DriverSection : 0xffffd58a`a3ba9be0 Void
+0x030 DriverExtension : 0xffffd58a`8688abf0 _DRIVER_EXTENSION
+0x038 DriverName : _UNICODE_STRING "\Driver\DBGV"
+0x048 HardwareDatabase : 0xfffff800`8372e990 _UNICODE_STRING "\REGISTRY\MACHINE\HARDWARE\DESCRIPTION\SYSTEM"
+0x050 FastIoDispatch : (null)
+0x058 DriverInit : 0xfffff800`dcf97058 long +0
+0x060 DriverStartIo : (null)
+0x068 DriverUnload : (null)
+0x070 MajorFunction : [28] 0xfffff800`dcf91b80 long +0
0: kd> dt nt!_LDR_DATA_TABLE_ENTRY 0xffffd58a`a3ba9be0 Full*
+0x048 FullDllName : _UNICODE_STRING "\??\C:\WINDOWS\system32\Drivers\Dbgv.sys"
So the driver is currently loaded from C:\WINDOWS\system32\Drivers\Dbgv.sys (or you can extract it from the .rsrc section...).
Driver
Looking at the driver, in the driver entry we spot the function used for IRP_MJ_DEVICE_CONTROL:
lea rax, sub_180001B80
mov [rdi+0E0h], rax ; IRP_MJ_DEVICE_CONTROL
mov [rdi+80h], rax
mov [rdi+70h], rax
Inside that function we have the usual setup before calling the right IOCTL:
movzx eax, [rcx+_IO_STACK_LOCATION.MajorFunction]
mov r9d, [rcx+_IO_STACK_LOCATION.Parameters.DeviceIoControl.OutputBufferLength]
mov r10d, [rcx+_IO_STACK_LOCATION.Parameters.DeviceIoControl.IoControlCode]
test al, al ; IRP_MJ_CREATE
jz loc_180001C6C
cmp al, 2 ; IRP_MJ_CLOSE
jz short loc_180001C0C
cmp al, 0Eh ; IRP_MJ_DEVICE_CONTROL
jnz ##CompleteRequest
mov eax, r10d
and eax, 3
cmp al, METHOD_NEITHER
jnz short loc_180001BDF
mov rdx, [rdi+_IRP.UserBuffer]
loc_180001BDF:
mov [rsp+98h+do], r11 ; _DEVICE_OBJECT*
mov [rsp+98h+IoStatus], rbx ; IoStatus
mov [rsp+98h+ioctl], r10d ; IoCtl
mov [rsp+98h+OutputBufferLength], r9d ; OuputBufferLength
mov r9d, [rcx+_IO_STACK_LOCATION.Parameters.DeviceIoControl.InputBufferLength] ; int
mov rcx, [rcx+_IO_STACK_LOCATION.FileObject]
mov qword ptr [rsp+98h+Buffer], rdx ; Buffer
mov dl, 1 ; int
call sub_1800017E0
jmp ##CompleteRequest
Inside the call (sub_1800017E0) we have a big switch for the IOCTL, here's the case -2096824260 (case -2096824256 is slightly different):
loc_1800018B9:
call sub_180002470 ; jumptable 000000018000182F case -2096824260
jmp loc_180001AEB
This function is mostly comprised of two loops:
loc_1800024A0:
xor ebx, ebx
##LoopQuerySetDebugFilter:
mov edx, ebx
mov ecx, esi
call cs:qword_180005438 ; DbgQueryDebugFilterState
mov r8b, 1 ; State
mov edx, ebx ; Level (keeps incrementing up to 0x1E)
mov ecx, esi ; ComponentId (keeps incrementing up to 0x82)
mov [rdi], al ; save current state.
call cs:qword_180005440 ; DbgSetDebugFilterState
inc ebx
inc rdi
cmp ebx, 1Eh
jb short ##LoopQuerySetDebugFilter
inc esi
cmp esi, 82h ; '‚'
jb short loc_1800024A0
Both calls are on DbgQueryDebugFilterState and DbgSetDebugFilterState (reactos source)
which is just a minimal wrapper around NtSetDebugFilterState (reactos source).
As far as we can see the debug filter state is queried, saved, and then set for all kernel components (following is the component tables from the kernel, there are a lot of them):
.rdata:00000001400073E0 KdComponentTable dq offset Kd_SYSTEM_Mask
.rdata:00000001400073E0 ; DATA XREF: NtQueryDebugFilterState+36↓o
.rdata:00000001400073E0 ; NtSetDebugFilterState+43↓o ...
.rdata:00000001400073E8 dq offset Kd_SMSS_Mask
.rdata:00000001400073F0 dq offset Kd_SETUP_Mask
.rdata:00000001400073F8 dq offset Kd_NTFS_Mask
.rdata:0000000140007400 dq offset Kd_FSTUB_Mask
.rdata:0000000140007408 dq offset Kd_CRASHDUMP_Mask
.rdata:0000000140007410 dq offset Kd_CDAUDIO_Mask
.rdata:0000000140007418 dq offset Kd_CDROM_Mask
.rdata:0000000140007420 dq offset Kd_CLASSPNP_Mask
....
Which finally means that all kernel components are able to print something to the debug output.
Note that the other IOCTL just reset the components masks to what they were before checking the menu in the main program.

Reversing an array and printing it in x86-64

I am trying to print an array, reverse it, and then print it again. I manage to print it once. I can also make 2 consecutive calls to _printy and it works. But the code breaks with the _reverse function. It does not segfault, it exits with code 24 (I looked online but this seems to mean that the maximum number of file descriptors has been exceeded, and I cannot get what this means in this context). I stepped with a debugger and the loop logic seems to make sense.
I am not passing the array in RDI, because _printy restores the content of that register when it exits. I also tried to load it directly into RDI before calling _reverse but that does not solve the problem.
I cannot figure out what the problem is. Any idea?
BITS 64
DEFAULT REL
; -------------------------------------
; -------------------------------------
; PRINT LIST
; -------------------------------------
; -------------------------------------
%define SYS_WRITE 0x02000004
%define SYS_EXIT 0x02000001
%define SYS_OPEN 0x02000005
%define SYS_CLOSE 0x02000006
%define SYS_READ 0x02000003
%define EXIT_SUCCESS 0
%define STDOUT 1
%define LF 10
%define INT_OFFSET 48
section .text
extern _printf
extern _puts
extern _exit
global _main
_main:
push rbp
lea rdi, [rel array]
call _printy
call _reverse
call _printy
pop rbp
call _exit
_reverse:
push rbp
lea rsi, [rdi + 4 * (length - 1) ]
.LOOP2:
cmp rdi, rsi
jge .DONE2
mov r8, [rdi]
mov r9, [rsi]
mov [rdi], r9
mov [rsi], r8
add rdi,4
sub rsi,4
jmp .LOOP2
.DONE2:
xor rax, rax
lea rdi, [rel array]
pop rbp
ret
_printy:
push rbp
xor rcx, rcx
mov r8, rdi
.loop:
cmp rcx, length
jge .done
push rcx
push r8
lea rdi, [rel msg]
mov rsi, [r8 + rcx * 4]
xor rax, rax
call _printf
pop r8
pop rcx
add rcx, 1
jmp .loop
.done:
xor rax, rax
lea rdi, [rel array]
pop rbp
ret
section .data
array: dd 78, 2, 3, 4, 5, 6
length: equ ($ - array) / 4
msg: db "%d => ", 0
Edit with some info from the debugger
Stepping into the _printy function gives the following msg, once reaching the call to _printf.
* thread #1, queue = 'com.apple.main-thread', stop reason = step over failed (Could not create return address breakpoint.)
frame #0: 0x0000000100003f8e a.out`printf
a.out`printf:
-> 0x100003f8e <+0>: jmp qword ptr [rip + 0x4074] ; (void *)0x00007ff80258ef0b: printf
0x100003f94: lea r11, [rip + 0x4075] ; _dyld_private
0x100003f9b: push r11
0x100003f9d: jmp qword ptr [rip + 0x5d] ; (void *)0x00007ff843eeb520: dyld_stub_binder
I am not an expert, but a quick research online led to the following
During the 'thread step-out' command, check that the memory we are about to place a breakpoint in is executable. Previously, if the current function had a nonstandard stack layout/ABI, and had a valid data pointer in the location where the return address is usually located, data corruption would occur when the breakpoint was written. This could lead to an incorrectly reported crash or silent corruption of the program's state. Now, if the above check fails, the command safely aborts.
So after all this might not be a problem (I am also able to track the execution of the printf call). But this is really the only understandable piece of information I am able to extract from the debugger. Deep in some quite obscure (to me) function calls I reach this
* thread #1, queue = 'com.apple.main-thread', stop reason = instruction step into
frame #0: 0x00007ff80256db7f libsystem_c.dylib`flockfile + 10
libsystem_c.dylib`flockfile:
-> 0x7ff80256db7f <+10>: call 0x7ff8025dd480 ; symbol stub for: __error
0x7ff80256db84 <+15>: mov r14d, dword ptr [rax]
0x7ff80256db87 <+18>: mov rdi, qword ptr [rbx + 0x68]
0x7ff80256db8b <+22>: add rdi, 0x8
Target 0: (a.out) stopped.
(lldb)
Process 61913 stopped
* thread #1, queue = 'com.apple.main-thread', stop reason = instruction step into
frame #0: 0x00007ff8025dd480 libsystem_c.dylib`__error
This is one of the function calls happening in _printf.
Ask further questions if there is something more I can do.
Your array consists of int32 numbers aka dd in nasm terminology, but your swap operates on 64 bit numbers:
mov r8, [rdi]
mov r9, [rsi]
mov [rdi], r9
mov [rsi], r8
Assuming you were not after some crazy optimizations where you swap a pair of elements simultaneously you want this to remain in 32 bits:
mov r8d, [rdi]
mov r9d, [rsi]
mov [rdi], r9d
mov [rsi], r8d

WriteConsole returns false in MASM

I am trying to make an array, and access it's values and print them out
After calling the WriteConsole subroutine, it is returning false, however, all the values are supplied. Here we can see that - https://imgur.com/a/vUfwOo6
Eax register is 0 after calling WriteConsole. Here you can see the register values, that are being pushed to the stack. https://imgur.com/a/gv6s4uG
Considering, that WriteConsole is WINAPI subroutine, that means it's stdcall. So, I am passing values right to left.
lpReserved -> 0
lpNumberOfCharsWritten -> offset to 00403028 (CharsWritten variable)
nNumberOfCharsToWrite -> Just 2, because in array only ints are present of length 2
*lpBuffer -> ebx register, which contains array lvalue
hConsoleOutput -> Output from GetStdHandle (In this case -> edx register -> A0)
My MASM code:
.386
.model flat,stdcall
option casemap:none
include \masm32\include\windows.inc
include \masm32\include\kernel32.inc
includelib \masm32\lib\kernel32.lib
include \masm32\include\user32.inc
include C:\masm32\include\masm32.inc
includelib C:\masm32\lib\masm32.lib
includelib \masm32\lib\user32.lib
includelib \masm32\lib\msvcrt.lib ; Some default includes :P
.data
dArray dd 10 dup (?) ; Main array
CharsWritten dd ?
LoopCounter dd 0
StdHandle dd ?
.code
PrintArrayToScreen proc
mov eax, STD_OUTPUT_HANDLE
push eax
call GetStdHandle
mov StdHandle,eax
mov eax,[LoopCounter]
innerPrintLoop:
mov ecx,offset dArray
mov eax, [LoopCounter]
mov ebx,[ecx + eax * 4]
mov esi,offset CharsWritten
push 0
push esi
push 2
push ebx
mov edx,StdHandle
push edx
call WriteConsole
mov eax,[LoopCounter]
inc eax
mov LoopCounter,eax ; Storing the Loop Counter in the variable
cmp eax,11 ; +1 because of loop counter increment
jnz innerPrintLoop
ret
PrintArrayToScreen endp
arrayLoop proc ; Subroutine for the array filling
mov eax,offset dArray
mov bx,10
mov ecx,0
innerLoop:
mov [eax + ecx * 4],bx ; ecx * 4 => counter * 4 bytes
inc bx
add ecx,1
cmp ecx,10
jne innerLoop
mov eax,offset dArray
ret
arrayLoop endp
start:
call arrayLoop
call PrintArrayToScreen
mov eax,0
push eax
call ExitProcess
end start
From the documentation for WriteConsole:
lpBuffer [in]
A pointer to a buffer that contains characters to be written to the console screen buffer.
So you should be passing the address of the data to be written, but you're actually passing the data itself.
You could "fix" that by changing the line mov ebx,[ecx + eax * 4] to lea ebx,[ecx + eax * 4]. But note that WriteConsole doesn't do any integer-to-string conversion for you, so you still probably wouldn't get the result you expected. If you want that sort of functionality, use printf.

Multicore in Windows: where to go after all threads are finished?

I am writing code in NASM (64 bit) in Windows to run four simultaneous threads (each assigned to a separate core) on a four-core Windows x86-64 machine. Below is my thread instantiation code. Each thread calls the same program.
The code is shown below in the exact sequence it appears in the source file (without the .data section, and some code abbreviated). Each thread (core) executes Test_Function across a large array. The first core starts at data element zero, the second core starts at 1, the third core starts at 2, the fourth core starts at 3, and each core increments by four (e.g., 0, 4, 8, 12). When finished, Test_Function exits at label_899. They write their results to the corresponding locations in a shared array created with malloc.
My question is: where do I go after all threads are finished at label_899? Do the threads then return to the final line of ThreadStart (after jl label_0), where cleanup is performed? Test_Function exits with ret, so I assume that returns control to the thread instantiation section (ThreadStart).
Unfortunately, all of the information I have read on this focuses on the creation of a single thread. Some resources talk about thread synchronization while running, but I have found nothing directly relevant about what happens after all threads are finished.
Thanks for any help. I would post a complete example, but I don't know yet how to complete it, which is my question.
ThreadStart:
mov rbp,rsp ; preserve caller's stack frame
mov rdi,ThreadInfo ; two element array
sub rsp,32 ; Shadow space
label_0:
mov rax,StartByte
mov [rdi],rax ; 0, 8, 16, or 24
mov rax,[JumpCount]
mov [rdi+8],rax ; number of cores (4 in this example)
push 0 ; Creation flags, start immediately (may want to delay until all created)
push 0 ; ThreadID
mov rcx,0 ; lpThreadAttributes (Security Attributes)
mov rdx,0 ; dwStackSize
mov r8,Test_Function ; lpStartAddress (function pointer)
mov r9,rdi ; lpParameter
call CreateThread
mov rax,8
add [StartByte],rax
mov rax,[TreadCount]
add rax,1
mov [TreadCount],rax
cmp rax,4
jl label_0
[ Code to do cleanup if needed after the four threads finish ]
mov rsp,rbp
jmp final_out
;______
Test_Function:
xor rcx,rcx
mov [loop_counter_401],rcx
label_401:
lea rdi,[rel numbers_ptr]
mov rbp,qword [rdi] ; Pointer
mov rcx,[loop_counter_401]
mov rax,[numbers_length]
cmp rcx,rax
jge label_899
movsd xmm0,qword[rbp+rcx]
movsd [num_float],xmm0
add rcx,32 ;8 -- add 32 for four loops, not 8
mov [loop_counter_401],rcx
[ MORE CODE]
jmp label_401
label_899:
ret
;______
final_out:
mov rdi,Return_Pointer_Array
mov rax,r15
mov [rdi+0],rax
mov rax,r14
mov [rdi+8],rax
mov rax,rdi
ret
Update 030119
Following is the code that creates threads as it is now. Earlier I though it failed at CreateThread, but it does not. It fails now due to a problem with the code I call, and I am debugging it now. I will post back with my results. The failure may be in how I wrote the code for WaitForMultipleObjects.
Shown is the thread instantiation code, not the code called by the threads.
mov rdi,ThreadInfo
mov rax,StartByte
mov [rdi+8],rax ; 0, 8, 16, or 24
; _____
; Create Threads
mov rcx,0 ; lpThreadAttributes (Security Attributes)
mov rdx,0 ; dwStackSize
mov r8,Test_Function ; lpStartAddress (function pointer)
mov rax,rdi
mov r9,rax ; lpParameter (array of data passed to each core)
mov rax,0
mov [rsp+40],rax ; use default creation flags
mov rax,[ThreadCount]
mov [rsp+32],rax ; ThreadID
call CreateThread
; Move the handle into ThreadHandles array (returned in rax)
mov rdi,ThreadHandles
mov rcx,[StartByte]
mov [rdi+rcx],rax
mov rax,8
add [StartByte],rax
mov rax,[ThreadCount]
add rax,1
mov [ThreadCount],rax
cmp rax,4
jl label_0
; _____
; Wait
mov rcx,rax ; number of handles
mov rdx,ThreadHandles ; pointer to handles array
mov r8,1 ; wait for all threads to complete
mov r9,1000 ; milliseconds to wait
call WaitForMultipleObjects
; _____
;[ Code to do cleanup if needed after the four threads finish ]
mov rsp,rbp
jmp label_900

Reading from STDIN and printing to STDOUT with nasm assembly?

As the title suggest I seem to be having a hard time converting the below code to do the exact same thing, which is to read from stdin and stdout. My professor wants us to stop using int 80h and switch over to using gcc. I've had no problems with reading input with the below code however, switching over to gcc is where I start getting segmentation core dump errors.
section .bss
buf resb 1 ; 1000-byte buffer (in data section)
section .text
global _start
_start:
loop1: mov edx, 1 ; max length
mov ecx, buf ; buffer
mov ebx, 0 ; stdin
mov eax, 3 ; sys_read
int 80h
cmp eax, 0 ; end loop if read <= 0
jle lpend1
mov edx, eax ; length
mov ecx, buf ; buffer
mov ebx, 1 ; stdout
mov eax, 4 ; sys_write
int 80h
jmp loop1 ; go back for more
lpend1:
mov eax, 1
mov ebx, 0
int 80h
My attempt at converting the above to perform the same task
SECTION .data
format: db "%c",0
SECTION .bss
buff: resb 1
SECTION .text
extern printf
extern scanf
global main
main:
loop1:
push buff ;buff will hold the characters in the string/file
push format ;expect character for every buff
call scanf
add esp, 8 ;clear stack
cmp eax, 0 ;if eax is equal to 0 then EOF
je lpend1 ;jump to end main func
xor eax, eax ;clear eax
mov eax, buff ;mov buff to eax register
push eax ;push eax onto the stack
mov eax, format ;mov the string format to eax
push eax ;push onto the stack
call printf ;call printf, prints to screen
add esp, 8 ;clear the stack
jmp loop1 ;jump back to top and repeat
lpend1:
ret ;end of main

Resources