jmp using table performance (cycle (latency)) - performance

My question is about the performance (cycle (latency)) of the jmp in this source code:
...
jmp qword [8 * rax + .TABLE]
.... (about 10K instructions)
.TABLE:
dq .addr1
dq .addr2
dq .addr3
dq .addr4
...
dq .addr1024
here the jump table is far from jmp (about 10K instructions are between them (for some reason...))
and rax is the index and RAX is our index
so i want to know the type of this JUMP .... what is it ? is it a far jump ? which one of these jumps is my jump ?
JMP rel8
JMP rel16
JMP rel32
JMP r/m16
JMP r/m32
JMP r/m64
JMP ptr16:16
JMP ptr16:32
JMP m16:16
JMP m16:32
JMP m16:64
and what is the CYCLE (latency) for this jmp ?

Related

Why does a function double dereference arguments stored on stack and how is that possible? [duplicate]

This question already has answers here:
Basic use of immediates vs. square brackets in YASM/NASM x86 assembly
(4 answers)
x86 Nasm assembly - push'ing db vars on stack - how is the size known?
(2 answers)
Referencing the contents of a memory location. (x86 addressing modes)
(2 answers)
Why do you have to dereference the label of data to store something in there: Assembly 8086 FASM
(1 answer)
Closed 7 months ago.
I tried to understand "lfunction" stack arguments loading to "flist" in following assembly code I found on a book (The book doesn't explain it. Code compiles and run without errors giving intended output displaying "The string is: ABCDEFGHIJ".) but I can't grasp the legality or logic of the code. What I don't understand is listed below.
In lfunction:
Non-volatile (as per Microsoft x64 calling convention) register RBX is not backed up before 'XOR'ing. (But it is not what bugs me most.)
In portion ";arguments on stack"
mov rax, qword [rbp+8+8+32]
mov bl,[rax]
Here [rbp+8+8+32] dereferences corresponding address stored in stack so RAX should
be loaded with value represented by'fourth' which is char 'D'(0x44) as per my understanding (Why qword?). And if so, what dereferencing char 'D' in second line can possibly mean (There should be a memory address to dereference but 'D' is a char.)?
Original code is listed below:
%include "io64.inc"
; stack.asm
extern printf
section .data
first db "A"
second db "B"
third db "C"
fourth db "D"
fifth db "E"
sixth db "F"
seventh db "G"
eighth db "H"
ninth db "I"
tenth db "J"
fmt db "The string is: %s",10,0
section .bss
flist resb 14 ;length of string plus end 0
section .text
global main
main:
push rbp
mov rbp,rsp
sub rsp, 8
mov rcx, flist
mov rdx, first
mov r8, second
mov r9, third
push tenth ; now start pushing in
push ninth ; reverse order
push eighth
push seventh
push sixth
push fifth
push fourth
sub rsp,32 ; shadow
call lfunc
add rsp,32+8
; print the result
mov rcx, fmt
mov rdx, flist
sub rsp,32+8
call printf
add rsp,32+8
leave
ret
;––––––––––––––––––––––––-
lfunc:
push rbp
mov rbp,rsp
xor rax,rax ;clear rax (especially higher bits)
;arguments in registers
mov al,byte[rdx] ; move content argument to al
mov [rcx], al ; store al to memory(resrved at section .bss)
mov al, byte[r8]
mov [rcx+1], al
mov al, byte[r9]
mov [rcx+2], al
;arguments on stack
xor rbx,rbx
mov rax, qword [rbp+8+8+32] ; rsp + rbp + return address + shadow
mov bl,[rax]
mov [rcx+3], bl
mov rax, qword [rbp+48+8]
mov bl,[rax]
mov [rcx+4], bl
mov rax, qword [rbp+48+16]
mov bl,[rax]
mov [rcx+5], bl
mov rax, qword [rbp+48+24]
mov bl,[rax]
mov [rcx+6], bl
mov rax, qword [rbp+48+32]
mov bl,[rax]
mov [rcx+7], bl
mov rax, qword [rbp+48+40]
mov bl,[rax]
mov [rcx+8], bl
mov rax, qword [rbp+48+48]
mov bl,[rax]
mov [rcx+9], bl
mov bl,0 ; terminating zero
mov [rcx+10], bl
leave
ret
Additional info:
I cannot look at register values just after line 50 which
corresponds to "XOR RAX, RAX" in lfunc because debugger auto skips
single stepping to line 37 of main function which corresponds to
"add RSP, 32+8". Even If I marked breakpoints in between
aforementioned lines in lfunc code the debugger simply hangs so I
have to manually abort debugging.
In portion ";arguments on stack"
mov rax, qword [rbp+8+8+32]
mov bl,[rax]
I am mentioning this again to be more precise of what am asking because question was marked as duplicate and
provided links with answers that doesn't address my specific issue. At line
[rbp+8+8+32] == 0x44 because clearly, mov with square brackets dereferences reference address (which I assume 64bit width) rbp+3h. So, the size of 0x44 is byte. That is why ask "Why qword?" because it implies "lea [rbp+8+8+32]" which is a qword reference, not mov. So if [rbp+8+8+32] equals 0x44, then [rax] == [0x0000000000000044], which a garbage ( not relevant to our code here) address.

Reversing an array and printing it in x86-64

I am trying to print an array, reverse it, and then print it again. I manage to print it once. I can also make 2 consecutive calls to _printy and it works. But the code breaks with the _reverse function. It does not segfault, it exits with code 24 (I looked online but this seems to mean that the maximum number of file descriptors has been exceeded, and I cannot get what this means in this context). I stepped with a debugger and the loop logic seems to make sense.
I am not passing the array in RDI, because _printy restores the content of that register when it exits. I also tried to load it directly into RDI before calling _reverse but that does not solve the problem.
I cannot figure out what the problem is. Any idea?
BITS 64
DEFAULT REL
; -------------------------------------
; -------------------------------------
; PRINT LIST
; -------------------------------------
; -------------------------------------
%define SYS_WRITE 0x02000004
%define SYS_EXIT 0x02000001
%define SYS_OPEN 0x02000005
%define SYS_CLOSE 0x02000006
%define SYS_READ 0x02000003
%define EXIT_SUCCESS 0
%define STDOUT 1
%define LF 10
%define INT_OFFSET 48
section .text
extern _printf
extern _puts
extern _exit
global _main
_main:
push rbp
lea rdi, [rel array]
call _printy
call _reverse
call _printy
pop rbp
call _exit
_reverse:
push rbp
lea rsi, [rdi + 4 * (length - 1) ]
.LOOP2:
cmp rdi, rsi
jge .DONE2
mov r8, [rdi]
mov r9, [rsi]
mov [rdi], r9
mov [rsi], r8
add rdi,4
sub rsi,4
jmp .LOOP2
.DONE2:
xor rax, rax
lea rdi, [rel array]
pop rbp
ret
_printy:
push rbp
xor rcx, rcx
mov r8, rdi
.loop:
cmp rcx, length
jge .done
push rcx
push r8
lea rdi, [rel msg]
mov rsi, [r8 + rcx * 4]
xor rax, rax
call _printf
pop r8
pop rcx
add rcx, 1
jmp .loop
.done:
xor rax, rax
lea rdi, [rel array]
pop rbp
ret
section .data
array: dd 78, 2, 3, 4, 5, 6
length: equ ($ - array) / 4
msg: db "%d => ", 0
Edit with some info from the debugger
Stepping into the _printy function gives the following msg, once reaching the call to _printf.
* thread #1, queue = 'com.apple.main-thread', stop reason = step over failed (Could not create return address breakpoint.)
frame #0: 0x0000000100003f8e a.out`printf
a.out`printf:
-> 0x100003f8e <+0>: jmp qword ptr [rip + 0x4074] ; (void *)0x00007ff80258ef0b: printf
0x100003f94: lea r11, [rip + 0x4075] ; _dyld_private
0x100003f9b: push r11
0x100003f9d: jmp qword ptr [rip + 0x5d] ; (void *)0x00007ff843eeb520: dyld_stub_binder
I am not an expert, but a quick research online led to the following
During the 'thread step-out' command, check that the memory we are about to place a breakpoint in is executable. Previously, if the current function had a nonstandard stack layout/ABI, and had a valid data pointer in the location where the return address is usually located, data corruption would occur when the breakpoint was written. This could lead to an incorrectly reported crash or silent corruption of the program's state. Now, if the above check fails, the command safely aborts.
So after all this might not be a problem (I am also able to track the execution of the printf call). But this is really the only understandable piece of information I am able to extract from the debugger. Deep in some quite obscure (to me) function calls I reach this
* thread #1, queue = 'com.apple.main-thread', stop reason = instruction step into
frame #0: 0x00007ff80256db7f libsystem_c.dylib`flockfile + 10
libsystem_c.dylib`flockfile:
-> 0x7ff80256db7f <+10>: call 0x7ff8025dd480 ; symbol stub for: __error
0x7ff80256db84 <+15>: mov r14d, dword ptr [rax]
0x7ff80256db87 <+18>: mov rdi, qword ptr [rbx + 0x68]
0x7ff80256db8b <+22>: add rdi, 0x8
Target 0: (a.out) stopped.
(lldb)
Process 61913 stopped
* thread #1, queue = 'com.apple.main-thread', stop reason = instruction step into
frame #0: 0x00007ff8025dd480 libsystem_c.dylib`__error
This is one of the function calls happening in _printf.
Ask further questions if there is something more I can do.
Your array consists of int32 numbers aka dd in nasm terminology, but your swap operates on 64 bit numbers:
mov r8, [rdi]
mov r9, [rsi]
mov [rdi], r9
mov [rsi], r8
Assuming you were not after some crazy optimizations where you swap a pair of elements simultaneously you want this to remain in 32 bits:
mov r8d, [rdi]
mov r9d, [rsi]
mov [rdi], r9d
mov [rsi], r8d

is there a difference between al and dl? [duplicate]

This question already has answers here:
How do AX, AH, AL map onto EAX?
(6 answers)
Closed 2 years ago.
First, it is my first assembly. And I use it with "NASM" and "64bit ASM" and "Intel" code and Mac OS.
When I try to write strdup function, it didn't work with al.
In using my strdup function, the string was broken.
So, my code is here.
global _strdup
extern _malloc
section .text
_strdup:
xor rcx, rcx
jmp strlen
strlen:
cmp [rdi + rcx], byte 0
je malloc
inc rcx
jmp strlen
malloc:
inc rcx
push rdi
mov rdi, rcx
call _malloc
pop rdi
cmp rax, 0
je error
xor rcx, rcx
jmp copy
copy:
mov al, byte [rdi + rcx]
mov byte [rax + rcx], al
cmp al, byte 0
je return
jmp increment
increment:
inc rcx
jmp copy
error:
mov rax, 0
ret
return:
ret
But, when I write same code with dl not al, it works!
global _strdup
extern _malloc
section .text
_strdup:
xor rcx, rcx
jmp strlen
strlen:
cmp [rdi + rcx], byte 0
je malloc
inc rcx
jmp strlen
malloc:
inc rcx
push rdi
mov rdi, rcx
call _malloc
pop rdi
cmp rax, 0
je error
xor rcx, rcx
jmp copy
copy:
mov dl, byte [rdi + rcx]
mov byte [rax + rcx], dl
cmp dl, byte 0
je return
jmp increment
increment:
inc rcx
jmp copy
error:
mov rax, 0
ret
return:
ret
I don't know why dl works and the difference between al and dl.
Anyone know about it? Thanks.
al is the smallest part of rax while dl is the smallest part of rdx. Changing al will change the value of rax as well.
Why your code is not working with al?
copy:
mov al, byte [rdi + rcx] ; you move "byte [rdi + rcx]" in "al"
; thus changing the value of "rax" as well.
; the result of the following statement becomes unexpected
mov byte [rax + rcx], al
But when you do it with dl, rax is not affected and hence your program works as expected.
Try the following code and run it through debugger to see the effect of changing the value of al on `rax:
section .text
global main
main:
nop
mov rax, 1000
mov al, byte 10
nop
Result:
After mov rax, 1000, value of rax: 1000
00000000 00000000 00000000 00000000 00000000 00000000 00000011 11101000
After mov al, 10, value of rax: 778
00000000 00000000 00000000 00000000 00000000 00000000 00000011 00001010
^^^^^^^^
So, changing al will affect rax.

InitializeCriticalSection fails in NASM

UPDATE: based on comments below, I revised the code below to add a struc and a pointer (new or revised code has "THIS IS NEW" or "THIS IS UPDATED" beside the code). Now the program does not crash, so the pointer is initialized, but the programs hangs at EnterCriticalSection. I suspect that in translating the sample MASM code below into NASM syntax, I did not declare the struc correctly. Any ideas? Thanks very much.
ORIGINAL QUESTION:
Below is a simple test program in 64-bit NASM, to test a critical section in
Windows. This is a dll and the entry point is Main_Entry_fn, which calls Init_Cores_fn, where we initialize four threads (cores) to call Test_fn.
I suspect that the problem is the pointer to the critical section. None of the online resources specifies what that pointer is. The doc "Using Critical Section Objects" at https://learn.microsoft.com/en-us/windows/desktop/sync/using-critical-section-objects shows a C++ example where the pointer appears to be relevant only to EnterCriticalSection and LeaveCriticalSection, but it's not a pointer to an independent object.
For those not familiar with NASM, the first parameter in a C++ signature goes into rcx and the second parameter goes into rds, but otherwise it should function the same as in C or C++. It's the same thing as InitializeCriticalSectionAndSpinCount(&CriticalSection,0x00000400) in C++.
Here's the entire program:
; Header Section
[BITS 64]
[default rel]
extern malloc, calloc, realloc, free
global Main_Entry_fn
export Main_Entry_fn
extern CreateThread, CloseHandle, ExitThread
extern WaitForMultipleObjects, GetCurrentThreadId
extern InitializeCriticalSectionAndSpinCount, EnterCriticalSection
extern LeaveCriticalSection, DeleteCriticalSection, InitializeCriticalSection
struc CRITICAL_SECTION ; THIS IS NEW
.cs_quad: resq 5
endstruc
section .data align=16
const_1000000000: dq 1000000000
ThreadID: dq 0
TestInfo: times 20 dq 0
ThreadInfo: times 3 dq 0
ThreadInfo2: times 3 dq 0
ThreadInfo3: times 3 dq 0
ThreadInfo4: times 3 dq 0
ThreadHandles: times 4 dq 0
Division_Size: dq 0
Start_Byte: dq 0
End_Byte: dq 0
Return_Data_Array: times 4 dq 0
Core_Number: dq 0
const_inf: dq 0xFFFFFFFF
SpinCount: dq 0x00000400
CriticalSection: ; THIS IS NEW
istruc CRITICAL_SECTION
iend
section .text
; ______________________________________
Init_Cores_fn:
; Calculate the data divisions
mov rax,[const_1000000000]
mov rbx,4 ;cores
xor rdx,rdx
div rbx
mov [End_Byte],rax
mov [Division_Size],rax
mov rax,0
mov [Start_Byte],rax
; Populate the ThreadInfo arrays to pass for each core
; ThreadInfo: (1) startbyte; (2) endbyte; (3) Core_Number
mov rdi,ThreadInfo
mov rax,[Start_Byte]
mov [rdi],rax
mov rax,[End_Byte]
mov [rdi+8],rax
mov rax,[Core_Number]
mov [rdi+16],rax
call DupThreadInfo ; Create ThreadInfo arrays for cores 2-4
mov rbp,rsp ; preserve caller's stack frame
sub rsp,56 ; Shadow space (was 32)
; _____
; Create four threads
label_0:
mov rax,[Core_Number]
cmp rax,0
jne sb2
mov rdi,ThreadInfo
jmp sb5
sb2:cmp rax,8
jne sb3
mov rdi,ThreadInfo2
jmp sb5
sb3:cmp rax,16
jne sb4
mov rdi,ThreadInfo3
jmp sb5
sb4:cmp rax,24
jne sb5
mov rdi,ThreadInfo4
sb5:
; _____
; Create Threads
mov rcx,0 ; lpThreadAttributes (Security Attributes)
mov rdx,0 ; dwStackSize
mov r8,Test_fn ; lpStartAddress (function pointer)
mov r9,rdi ; lpParameter (array of data passed to each core)
mov rax,0
mov [rsp+32],rax ; use default creation flags
mov rdi,ThreadID
mov [rsp+40],rdi ; ThreadID
call CreateThread
; Move the handle into ThreadHandles array (returned in rax)
mov rdi,ThreadHandles
mov rcx,[Core_Number]
mov [rdi+rcx],rax
mov rdi,TestInfo
mov [rdi+rcx],rax
mov rax,[Core_Number]
add rax,8
mov [Core_Number],rax
mov rbx,32 ; Four cores
cmp rax,rbx
jl label_0
mov rcx,CriticalSection ; THIS IS REVISED
mov rdx,[SpinCount]
call InitializeCriticalSectionAndSpinCount
; _____
; Wait
mov rcx,4 ;rax ; number of handles
mov rdx,ThreadHandles ; pointer to handles array
mov r8,1 ; wait for all threads to complete
mov r9,[const_inf] ;4294967295 ;0xFFFFFFFF
call WaitForMultipleObjects
; _____
mov rsp,rbp ; can we push rbp so we can use it internally?
jmp label_900
; ______________________________________
Test_fn:
mov rdi,rcx
mov r14,[rdi] ; Start_Byte
mov r15,[rdi+8] ; End_Byte
mov r13,[rdi+16] ; Core_Number
;______
; while(n < 1000000000)
label_401:
cmp r14,r15
jge label_899
mov rcx,CriticalSection
call EnterCriticalSection
; n += 1
add r14,1
mov rcx,CriticalSection
call LeaveCriticalSection
jmp label_401
;______
label_899:
mov rdi,Return_Data_Array
mov [rdi+r13],r14
mov rbp,ThreadHandles
mov rax,[rbp+r13]
call ExitThread
ret
; __________
label_900:
mov rcx,CriticalSection
call DeleteCriticalSection
mov rdi,Return_Data_Array
mov rax,rdi
ret
; __________
; Main Entry
Main_Entry_fn:
push rdi
push rbp
call Init_Cores_fn
pop rbp
pop rdi
ret
DupThreadInfo:
mov rdi,ThreadInfo2
mov rax,8
mov [rdi+16],rax ; Core Number
mov rax,[Start_Byte]
add rax,[Division_Size]
mov [rdi],rax
mov rax,[End_Byte]
add rax,[Division_Size]
mov [rdi+8],rax
mov [Start_Byte],rax
mov rdi,ThreadInfo3
mov rax,16
mov [rdi+16],rax ; Core Number
mov rax,[Start_Byte]
mov [rdi],rax
add rax,[Division_Size]
mov [rdi+8],rax
mov [Start_Byte],rax
mov rdi,ThreadInfo4
mov rax,24
mov [rdi+16],rax ; Core Number
mov rax,[Start_Byte]
mov [rdi],rax
add rax,[Division_Size]
mov [rdi+8],rax
mov [Start_Byte],rax
ret
The code above shows the functions in three separate places, but of course we test them one at a time (but they all fail).
To summarize, my question is why do InitializeCriticalSection and InitializeCriticalSectionAndSpinCount both fail in the code above? The inputs are dead simple, so I don't understand why it should not work.
InitializeCriticalSection take pointer to critical section object
The process is responsible for allocating the memory used by a
critical section object, which it can do by declaring a variable of
type CRITICAL_SECTION.
so code can be something like (i use masm syntax)
CRITICAL_SECTION STRUCT
DQ 5 DUP(?)
CRITICAL_SECTION ends
extern __imp_InitializeCriticalSection:QWORD
extern __imp_InitializeCriticalSectionAndSpinCount:QWORD
.DATA?
CriticalSection CRITICAL_SECTION {}
.CODE
lea rcx,CriticalSection
;mov edx,400h
;call __imp_InitializeCriticalSectionAndSpinCount
call __imp_InitializeCriticalSection
also you need declare all imported functions as
extern __imp_funcname:QWORD
instead
extern funcname

assembly code to scroll the screen one line down clearing the first line in the screen and then scrolls one line up if a key is pressed

I'm trying to scroll 1 line down then up but
a) I don't know how to test this code
b) I'm not sure which interrupt to use for "when a key is pressed"
I'd be much grateful for your help
Here's my code :
Data_segment_name segment para
firstline db 160 dup(0)
Data_segment_name ends
Stack_segment_name segment para stack
Stack_segment_name ends
Code_segment_name segment
Main_prog proc far
assume SS:Stack_segment_name,CS:Code_segment_name,DS:Data_segment_name
mov AX,Data_segment_name ; load the starting address of the data
mov DS,AX ; segment into DS reg.
;code scroll down (clear first line) then scroll back up(restore cleared line)
mov es,ax ;save first line
lea di,firstline
mov ax,0b800h
mov ds,ax
mov ax,0
mov si,ax
cld
mov cx,80
rep movsw ;save ends
;now let's scroll down :)
mov ax,0b800h
mov es,ax
mov ax,0
mov di,ax
mov ax,160
mov si,ax
cld
mov cx,24*80
rep movsw
;now let's scroll up :)
int 21h ;check
mov ax,160*24
mov si,ax
mov ax,160*25
mov di,ax
std
mov cx,24*80
rep movsw
;restore first line
mov AX,Data_segment_name ; load the starting address of the data
mov DS,AX ; segment into DS reg.
lea si,firstline
mov ax,0
mov di,ax
cld
mov cx,80
rep movsw
mov ax,4c00h ; exit program
int 21h
Main_prog endp
Code_segment_name ends
end Main_prog
Ad a):
The test tool is called "debugger". I recommend Turbo Debugger (google for it).
Ad b):
Ralf Brown's interrupt list and TechHelp are good references. At a glance: Int 10h is for video, Int 16h is for keyboard, Int 21h is for MS-DOS.
You should switch to the simplified segment directives .CODE, .DATA, .STACK and to procedural programming PROC, ENDP. When your project grows, it will help to keep track of it.
Example:
.MODEL small
.STACK 1000h
.DATA
firstline db 160 dup(0)
.CODE
save_firstline PROC
push ds
mov ax, ds
mov es, ax
lea di, firstline
mov ax, 0b800h
mov ds, ax
mov ax, 0
mov si, ax
mov cx, 80
rep movsw
pop ds
ret
save_firstline ENDP
restore_firstline PROC
lea si, firstline
mov ax, 0b800h
mov es, ax
mov ax, 0
mov di, ax
mov cx, 80
rep movsw
ret
restore_firstline ENDP
scroll_up PROC
call save_firstline
mov ah, 6 ; http://www.ctyme.com/intr/rb-0096.htm
mov al, 1 ; number of lines to scroll
mov bh, 0 ; attribute
mov ch, 0 ; row top
mov cl, 0 ; col left
mov dh, 25 ; row bottom
mov dl, 80 ; col right
int 10h
ret
scroll_up ENDP
scroll_down PROC
mov ah, 7 ; http://www.ctyme.com/intr/rb-0097.htm
mov al, 1 ; number of lines to scroll
mov bh, 0 ; attribute
mov ch, 0 ; row top
mov cl, 0 ; col left
mov dh, 25 ; row bottom
mov dl, 80 ; col right
int 10h
call restore_firstline
ret
scroll_down ENDP
main PROC
mov ax, #data
mov ds, ax
waitForKey: ; http://webpages.charter.net/danrollins/techhelp/0229.HTM
mov ah, 1
int 16h
jnz gotKey ; jmp if key is ready
jmp waitForKey ; loop back and check for a key
gotKey:
mov ah, 0 ; key is ready, get it
int 16h ; now process the key
cmp ah, 48h ; <UP>
jne #F
call scroll_up
jmp waitforKey
##:
cmp ah, 50h ; <DOWN>
jne #F
call scroll_down
jmp waitForKey
##:
cmp al, 1Bh ; <ESC>
jne waitForKey
mov ax, 4C00h
int 21h
main ENDP
END main

Resources