Why does LLVM appear to ignore Rust's assume intrinsic?

LLVM appears to ignore core::intrinsics::assume(..) calls. They do end up in the LLVM bitcode, but they don't change the resulting machine code. For example, take the following (nonsensical) code:
pub fn one(xs: &mut Vec<i32>) {
    if let Some(x) = xs.pop() {
        xs.push(x);
    }
}
This compiles to a whole lot of assembly:
example::one:
push rbp
push r15
push r14
push r12
push rbx
mov rbx, qword ptr [rdi + 16]
test rbx, rbx
je .LBB0_9
mov r14, rdi
lea rsi, [rbx - 1]
mov qword ptr [rdi + 16], rsi
mov rdi, qword ptr [rdi]
mov ebp, dword ptr [rdi + 4*rbx - 4]
cmp rsi, qword ptr [r14 + 8]
jne .LBB0_8
lea rax, [rsi + rsi]
cmp rax, rbx
cmova rbx, rax
mov ecx, 4
xor r15d, r15d
mov rax, rbx
mul rcx
mov r12, rax
setno al
jo .LBB0_11
mov r15b, al
shl r15, 2
test rsi, rsi
je .LBB0_4
shl rsi, 2
mov edx, 4
mov rcx, r12
call qword ptr [rip + __rust_realloc@GOTPCREL]
mov rdi, rax
test rax, rax
je .LBB0_10
.LBB0_7:
mov qword ptr [r14], rdi
mov qword ptr [r14 + 8], rbx
mov rsi, qword ptr [r14 + 16]
.LBB0_8:
or ebp, 1
mov dword ptr [rdi + 4*rsi], ebp
add qword ptr [r14 + 16], 1
.LBB0_9:
pop rbx
pop r12
pop r14
pop r15
pop rbp
ret
.LBB0_4:
mov rdi, r12
mov rsi, r15
call qword ptr [rip + __rust_alloc@GOTPCREL]
mov rdi, rax
test rax, rax
jne .LBB0_7
.LBB0_10:
mov rdi, r12
mov rsi, r15
call qword ptr [rip + alloc::alloc::handle_alloc_error@GOTPCREL]
ud2
.LBB0_11:
call qword ptr [rip + alloc::raw_vec::capacity_overflow@GOTPCREL]
ud2
Now we could introduce the assumption that xs is not full (at capacity) after
the pop() (this is nightly only):
#![feature(core_intrinsics)]
pub fn one(xs: &mut Vec<i32>) {
    if let Some(x) = xs.pop() {
        unsafe {
            core::intrinsics::assume(xs.len() < xs.capacity());
        }
        xs.push(x);
    }
}
Yet despite the assume showing up in the LLVM bitcode, the assembly is unchanged. If, however, we use core::hint::unreachable_unchecked() to create a diverging path in the non-assumed case, such as:
pub fn one(xs: &mut Vec<i32>) {
    if let Some(x) = xs.pop() {
        if xs.len() >= xs.capacity() {
            unsafe { core::hint::unreachable_unchecked() }
        }
        xs.push(x);
    }
}
We get the following:
example::one:
mov rax, qword ptr [rdi + 16]
test rax, rax
je .LBB0_2
mov qword ptr [rdi + 16], rax
.LBB0_2:
ret
Which is essentially a no-op, but not too bad. Of course, we could have left the value in place by using:
pub fn one(xs: &mut Vec<i32>) {
    xs.last_mut().map(|_e| ());
}
Which compiles down to what we'd expect:
example::one:
ret
Why does LLVM appear to ignore the assume intrinsic?

On recent versions of rustc this now compiles to just a ret, thanks to improvements in both rustc and LLVM. LLVM ignored the intrinsic because it previously wasn't able to exploit that assumption; newer versions can, and they optimize this as expected.

Related

How to deal with "undefined label XXX" in Go assembly for libc functions like malloc?

I found a project, c2goasm, that can convert assembly produced by a C compiler into Go assembly, but I'm currently having some problems. Take "linkedlist.c", for example:
void ListNodeCreat(int val, struct ListNode* ret) {
    struct ListNode * node = (struct ListNode *)malloc(sizeof(struct ListNode));
    node->val = val;
    node->next = NULL;
    ret = node;
}
The generated assembly file "linkedlist.s" (GNU assembler, .intel_syntax noprefix) is as follows:
ListNodeCreat: # @ListNodeCreat
push rbp
mov rbp, rsp
and rsp, -16
sub rsp, 32
mov dword ptr [rsp + 28], edi
mov qword ptr [rsp + 16], rsi
mov edi, 16
call malloc
mov qword ptr [rsp + 8], rax
mov ecx, dword ptr [rsp + 28]
mov rax, qword ptr [rsp + 8]
mov dword ptr [rax], ecx
mov rax, qword ptr [rsp + 8]
mov qword ptr [rax + 8], 0
mov rax, qword ptr [rsp + 8]
mov qword ptr [rsp + 16], rax
mov rsp, rbp
pop rbp
ret
Note the "call malloc" in it; when using c2goasm to produce the Go assembly "linkedlist_amd64.s", the call is still there:
TEXT ·_ListNodeCreat(SB), $40-16
MOVQ val+0(FP), DI
MOVQ ret+8(FP), SI
ADDQ $8, SP
LONG $0x1c247c89 // mov dword [rsp + 28], edi
LONG $0x24748948; BYTE $0x10 // mov qword [rsp + 16], rsi
LONG $0x000010bf; BYTE $0x00 // mov edi, 16
CALL malloc
LONG $0x24448948; BYTE $0x08 // mov qword [rsp + 8], rax
LONG $0x1c244c8b // mov ecx, dword [rsp + 28]
LONG $0x24448b48; BYTE $0x08 // mov rax, qword [rsp + 8]
WORD $0x0889 // mov dword [rax], ecx
LONG $0x24448b48; BYTE $0x08 // mov rax, qword [rsp + 8]
QUAD $0x000000000840c748 // mov qword [rax + 8], 0
LONG $0x24448b48; BYTE $0x08 // mov rax, qword [rsp + 8]
LONG $0x24448948; BYTE $0x10 // mov qword [rsp + 16], rax
SUBQ $8, SP
RET
So when I run "go build" or "go tool asm linkedlist_amd64.s", I get:
linkedlist_amd64.s:28: undefined label malloc
asm: assembly of linkedlist_amd64.s failed
Does anyone know how to deal with it?

Replacing #pragma omp atomic with c++ atomics

I'm replacing some OpenMP code with standard C++11/C++14 atomics/thread support. Here is the OpenMP minimal code example:
#include <vector>
#include <cstdint>

void omp_atomic_add(std::vector<std::int64_t> const& rows,
                    std::vector<std::int64_t> const& cols,
                    std::vector<double>& values,
                    std::size_t const row,
                    std::size_t const col,
                    double const value)
{
    for (auto i = rows[row]; i < rows[row + 1]; ++i)
    {
        if (cols[i] == col)
        {
            #pragma omp atomic
            values[i] += value;
            return;
        }
    }
}
The code updates a CSR matrix format and occurs in a hot path for scientific computation. It would be technically possible to use a std::mutex, but the values vector can have millions of elements and is accessed far more often than that, so a std::mutex is too heavy.
Checking the assembly (https://godbolt.org/g/nPE9Dt), it seems to use CAS (with the disclaimer that my atomics and assembly knowledge is severely limited, so my comments are likely incorrect):
mov rax, qword ptr [rdi]
mov rdi, qword ptr [rax + 8*rcx]
mov rax, qword ptr [rax + 8*rcx + 8]
cmp rdi, rax
jge .LBB0_6
mov rcx, qword ptr [rsi]
.LBB0_2: # =>This Inner Loop Header: Depth=1
cmp qword ptr [rcx + 8*rdi], r8
je .LBB0_3
inc rdi
cmp rdi, rax
jl .LBB0_2
jmp .LBB0_6
#### Interesting stuff happens from here onwards
.LBB0_3:
mov rcx, qword ptr [rdx] # Load values pointer into register
mov rax, qword ptr [rcx + 8*rdi] # Offset to value[i]
.LBB0_4: # =>This Inner Loop Header: Depth=1
movq xmm1, rax # Move value into floating point register
addsd xmm1, xmm0 # Add function arg to the value from the vector<double>
movq rdx, xmm1 # Move result to register
lock # x86 lock
cmpxchg qword ptr [rcx + 8*rdi], rdx # Compare exchange on the value in the vector
jne .LBB0_4 # If failed, go back to the top and try again
.LBB0_6:
ret
Is it possible to do this using C++ atomics? The examples I've seen only use std::atomic<double> value{}, and nothing in the context of accessing a value through a pointer.
You can create a std::vector<std::atomic<double>> but you cannot change its size.
The first thing I'd do is get gsl::span or write my own variant. Then gsl::span<std::atomic<double>> is a better model for values than std::vector<std::atomic<double>>.
Once we have done that, simply remove the #pragma omp atomic and your code is atomic in C++20. In C++17 and before, you have to implement += manually:
double old = values[i];
while (!values[i].compare_exchange_weak(old, old + value))
{}
Live example.
Clang 5 generates:
omp_atomic_add(std::vector<long, std::allocator<long> > const&, std::vector<long, std::allocator<long> > const&, std::vector<std::atomic<double>, std::allocator<std::atomic<double> > >&, unsigned long, unsigned long, double): # @omp_atomic_add(std::vector<long, std::allocator<long> > const&, std::vector<long, std::allocator<long> > const&, std::vector<std::atomic<double>, std::allocator<std::atomic<double> > >&, unsigned long, unsigned long, double)
mov rax, qword ptr [rdi]
mov rdi, qword ptr [rax + 8*rcx]
mov rax, qword ptr [rax + 8*rcx + 8]
cmp rdi, rax
jge .LBB0_6
mov rcx, qword ptr [rsi]
.LBB0_2: # =>This Inner Loop Header: Depth=1
cmp qword ptr [rcx + 8*rdi], r8
je .LBB0_3
inc rdi
cmp rdi, rax
jl .LBB0_2
jmp .LBB0_6
.LBB0_3:
mov rax, qword ptr [rdx]
mov rax, qword ptr [rax + 8*rdi]
.LBB0_4: # =>This Inner Loop Header: Depth=1
mov rcx, qword ptr [rdx]
movq xmm1, rax
addsd xmm1, xmm0
movq rsi, xmm1
lock
cmpxchg qword ptr [rcx + 8*rdi], rsi
jne .LBB0_4
.LBB0_6:
ret
which, at a casual glance, looks identical.
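Putting the pieces together, a self-contained sketch of the pre-C++20 approach might look like the following; the raw std::atomic<double>* stands in for the span-style view, and the function name and memory orderings are illustrative choices, not fixed requirements:
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <vector>

// 'values' is assumed to point at storage that really is std::atomic<double>
// for its whole lifetime (e.g. a fixed-size buffer set up before the threads start).
void csr_atomic_add(std::vector<std::int64_t> const& rows,
                    std::vector<std::int64_t> const& cols,
                    std::atomic<double>* values,
                    std::size_t const row,
                    std::size_t const col,
                    double const value)
{
    for (auto i = rows[row]; i < rows[row + 1]; ++i)
    {
        if (cols[i] == static_cast<std::int64_t>(col))
        {
            // Pre-C++20: emulate += with a CAS loop. compare_exchange_weak
            // reloads 'old' with the current value whenever it fails, so we
            // simply retry until our addition lands.
            double old = values[i].load(std::memory_order_relaxed);
            while (!values[i].compare_exchange_weak(old, old + value))
            {
            }
            return;
        }
    }
}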
There is a proposal for atomic_view that lets you manipulate a non-atomic value through an atomic view. In general, C++ only lets you operate atomically on atomic data.
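That proposal eventually landed in C++20 as std::atomic_ref, which wraps an ordinary double in place and performs atomic operations on it, provided no non-atomic access races with it. A minimal sketch (the function name is an illustrative choice):
#include <atomic>
#include <cstddef>
#include <vector>

void atomic_add_in_place(std::vector<double>& values, std::size_t i, double value)
{
    // Construct an atomic view over the plain element just for this update.
    std::atomic_ref<double> ref(values[i]);
    ref.fetch_add(value, std::memory_order_relaxed); // floating-point fetch_add is C++20
}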

x64 Assembly Optimization

I am trying to optimize several assembly procedures for size; I am not concerned about speed.
The optimizations I am familiar with are along the lines of the following:
;the following two lines
mov rbp, rsp
add rbp, 50h
;can be changed to
lea rbp, [rsp+50h]
What other optimizations can I use to reduce the number of bytes in the following procedure?
I am not asking anyone to fully optimize this procedure, just point out where I can improve.
;get procedure address
asmGetProc proc
push rcx ;pointer to function name
push rdx ;DllBase address (IMAGE_DOS_HEADER pointer)
push r8 ;pointer to IMAGE_EXPORT_DIRECTORY
push r9 ;IMAGE_EXPORT_DIRECTORY->NumberOfNames
;IMAGE_EXPORT_DIRECTORY->AddressOfNameOrdinals[r9]
push rbx ;saved pointer to function name
push r10 ;pointer to IMAGE_EXPORT_DIRECTORY->AddressOfNames
;pointer to IMAGE_EXPORT_DIRECTORY->AddressOfNameOrdinals
;pointer to IMAGE_EXPORT_DIRECTORY->AddressOfFunctions
mov rbx, rcx ;save the function name pointer to rax
mov r8d, [rdx+3ch] ;IMAGE_DOS_HEADER->e_lfanew (DWORD) (Offset to IMAGE_NT_HEADERS64)
add r8, rdx ;add DllBase to the e_lfanew offset
add r8, 88h ;18h - IMAGE_NT_HEADERS64->OptionalHeader (IMAGE_OPTIONAL_HEADER64) 18h bytes
;70h - skip entire IMAGE_OPTIONAL_HEADER64 structure
;r8 points to the IMAGE_DATA_DIRECTORY structure
mov r8d, [r8] ;IMAGE_DATA_DIRECTORY->VirtualAddress (DWORD)
add r8, rdx ;add DllBase to VirtualAddress (IMAGE_EXPORT_DIRECTORY)
mov r9d, [r8+18h] ;IMAGE_EXPORT_DIRECTORY->NumberOfNames
mov r10d, [r8+20h] ;IMAGE_EXPORT_DIRECTORY->AddressOfNames (DWORD)
add r10, rdx ;add DllBase to AddressOfNames (DWORD)
for_each_function:
;decrement function name counter
dec r9
;load current index of AddressOfNames into r11
lea rcx, [r10 + 4 * r9] ;AddressOfNames[i] - function string RVA (relative virtual address)
mov ecx, [rcx] ;r11d is the AddressOfName[r9] RVA (DWORD)
add rcx, rdx ;add DllBase to string RVA DWORD
call asmHsh ;hash the function name
cmp rax, rbx ;compare the function name hash with the passed hash
jnz for_each_function ;jump to top of loop if not a match
;r8 - export directory
;r9 - function name counter
;r10 - AddressOfNameOrdinals / AddressOfFunctions array
;rax - final point to function
mov r10d, [r8+24h] ;IMAGE_EXPORT_DIRECTORY->AddressOfNameOrdinals (DWORD)
add r10, rdx ;add DllBase to AddressOfNameOrdinals DWORD
mov r9w, [r10+2*r9] ;AddressOfNameOrdinals[2*r9] - (2*r9 = 2 bytes * function name counter)
mov r10d, [r8+1ch] ;IMAGE_EXPORT_DIRECTORY->AddressOfFunctions (DWORD)
add r10, rdx ;add DllBase to AddressOfFunctions DWORD
mov eax, [r10+r9*4] ;AddressOfFunctions[4*r9] - (4*r9 = 4 bytes * function ordinal)
add rax, rdx ;add DllBase to function ordinal RVA DWORD
pop r10
pop rbx
pop r9
pop r8
pop rdx
pop rcx
ret ;return from procedure
asmGetProc endp
EDIT: Added asmHsh (my bad)
;hash function (djb2)
asmHsh proc
;rcx - null terminated function name
push rcx
push rdx
mov rax, 5381d
hl:
mov rdx, rax
shl rax, 5
add rax, rdx
xor al, [rcx]
inc rcx
;check for null termination
mov dl, [rcx]
cmp dl, 00h
jne short hl
pop rdx
pop rcx
ret
asmHsh endp
When optimizing assembly for space in 64-bit mode, one should: (1) use DWORD width when that suffices (fewer prefixes); (2) stick to the old x86 registers eax-edx / esi / edi / ebp (tighter encoding).
Hopefully what's done below illustrates the idea. ML64 assembled the original routines to 135 bytes and the modified version to 103 bytes.
Examples of changes: (1) used rbp / rsi / rdi instead of r8 / r9 / r10; (2) shrunk instruction sequences that could be accomplished via multi-component addressing modes; (3) used a DWORD dec where the data is known to be 32 bits; (4) used IMUL in place of shift/add.
";-" is placed in front of removed lines, and ";## delta" is appended to added lines, where delta is the byte difference the new code produced. No attempt was made to adjust the comments.
;hash function (djb2)
asmHsh proc
;rcx - null terminated function name
push rcx
;-push rdx ;## -1
mov rax, 5381d
hl:
;- mov rdx, rax
;- shl rax, 5
;- add rax, rdx
imul rax,rax,33 ;## -6
xor al, [rcx]
inc rcx
;check for null termination
;-mov dl, [rcx]
;-cmp dl, 00h
cmp byte ptr [rcx], 00h ;## -2
jne short hl
;-pop rdx ;## -1
pop rcx
ret
asmHsh endp
;get procedure address
asmGetProc proc
push rcx ;pointer to function name
push rdx ;DllBase address (IMAGE_DOS_HEADER pointer)
;-push r8 ;pointer to IMAGE_EXPORT_DIRECTORY
push rbp ;## -1
;-push r9 ;IMAGE_EXPORT_DIRECTORY->NumberOfNames
push rsi ;## -1
;IMAGE_EXPORT_DIRECTORY->AddressOfNameOrdinals[r9]
push rbx ;saved pointer to function name
;-push r10 ;pointer to IMAGE_EXPORT_DIRECTORY->AddressOfNames
push rdi ;## -1
;pointer to IMAGE_EXPORT_DIRECTORY->AddressOfNameOrdinals
;pointer to IMAGE_EXPORT_DIRECTORY->AddressOfFunctions
mov rbx, rcx ;save the function name pointer to rax
;-mov r8d, [rdx+3ch] ;IMAGE_DOS_HEADER->e_lfanew (DWORD) (Offset to IMAGE_NT_HEADERS64)
mov ebp, [rdx+3ch] ;## -1
;-add r8, rdx ;add DllBase to the e_lfanew offset
;-add r8, 88h ;18h - IMAGE_NT_HEADERS64->OptionalHeader (IMAGE_OPTIONAL_HEADER64) 18h bytes
;- ;70h - skip entire IMAGE_OPTIONAL_HEADER64 structure
;- ;r8 points to the IMAGE_DATA_DIRECTORY structure
;-mov r8d, [r8] ;IMAGE_DATA_DIRECTORY->VirtualAddress (DWORD)
mov ebp, [rbp+rdx+88h] ;## -5
;-add r8, rdx ;add DllBase to VirtualAddress (IMAGE_EXPORT_DIRECTORY)
add rbp, rdx ;## 0
;-mov r9d, [r8+18h] ;IMAGE_EXPORT_DIRECTORY->NumberOfNames
mov esi, [rbp+18h] ;## -1
;-mov r10d, [r8+20h] ;IMAGE_EXPORT_DIRECTORY->AddressOfNames (DWORD)
mov edi, [rbp+20h] ;## -1
;-add r10, rdx ;add DllBase to AddressOfNames (DWORD)
add rdi, rdx ;## 0
for_each_function:
;decrement function name counter
;- dec r9
dec esi ;## -1
;load current index of AddressOfNames into r11
;- lea rcx, [r10 + 4 * r9] ;AddressOfNames[i] - function string RVA (relative virtual address)
;- mov ecx, [rcx] ;r11d is the AddressOfName[r9] RVA (DWORD)
mov ecx, [rdi + 4 * rsi] ;## -3
add rcx, rdx ;add DllBase to string RVA DWORD
call asmHsh ;hash the function name
cmp rax, rbx ;compare the function name hash with the passed hash
jnz for_each_function ;jump to top of loop if not a match
;r8 - export directory
;r9 - function name counter
;r10 - AddressOfNameOrdinals / AddressOfFunctions array
;rax - final point to function
;-mov r10d, [r8+24h] ;IMAGE_EXPORT_DIRECTORY->AddressOfNameOrdinals (DWORD)
mov edi, [rbp+24h];## -1
;-add r10, rdx ;add DllBase to AddressOfNameOrdinals DWORD
add rdi, rdx; ## 0
;-mov r9w, [r10+2*r9] ;AddressOfNameOrdinals[2*r9] - (2*r9 = 2 bytes * function name counter)
mov si, [rdi+2*rsi] ;## -1
;-mov r10d, [r8+1ch] ;IMAGE_EXPORT_DIRECTORY->AddressOfFunctions (DWORD)
mov edi, [rbp+1ch] ;## -1
;-add r10, rdx ;add DllBase to AddressOfFunctions DWORD
add rdi, rdx ;## 0
;-mov eax, [r10+r9*4] ;AddressOfFunctions[4*r9] - (4*r9 = 4 bytes * function ordinal)
mov eax, [rdi+rsi*4] ; ## -1
add rax, rdx ;add DllBase to function ordinal RVA DWORD
;-pop r10
pop rdi ; ## -1
pop rbx
;-pop r9
pop rsi
;-pop r8
pop rbp ;## -1
pop rdx
pop rcx
ret ;return from procedure
asmGetProc endp

frame pointer register 'ebx' modified by inline assembly code

Unfortunately, I had to re-image my laptop to install Visual Studio 2012. My project builds, but with the above warning. Previously I had Visual Studio 2010 and never got this warning. The code is as follows:
__asm
{
//Initialize pointers on matrices
mov eax, dword ptr [this]
mov ebx, dword ptr [eax+UPkk]
mov dword ptr [UPkk_ptr],ebx
mov ebx, dword ptr [eax+UPk1k]
mov dword ptr [UPk1k_ptr],ebx
mov ebx, dword ptr [eax+DPk1k]
mov dword ptr [DPk1k_ptr],ebx
mov ebx, dword ptr [eax+DPkk]
mov dword ptr [DPkk_ptr],ebx
mov ebx, dword ptr [eax+mat_A]
mov dword ptr [mat_A_ptr],ebx
mov ebx, dword ptr [eax+vec_a]
mov dword ptr [vec_a_ptr],ebx
mov ebx, dword ptr [eax+vec_b]
mov dword ptr [vec_b_ptr],ebx
}
Do I need to change any settings in the project?
Best Regards
Chintan
Edit: In the above code, when I replace ebx with ecx, the warnings go away and the code works fine. However, there is another piece of code where I have used both ebx and ecx, and in that case my program crashes. Here is the code:
__asm
{
//Initialize UPk1k[idx_4] pointer
mov eax, dword ptr [UPk1k_ptr]
mov ebx, dword ptr [idx_4]
imul ebx,8
add eax,ebx
mov dword ptr [UPk1k_id4_ptr],eax
//Initialize UPkk[idx_4] pointer
mov eax, dword ptr [UPkk_ptr]
mov ebx, dword ptr [idx_4]
imul ebx,8
add eax,ebx
mov dword ptr [UPkk_id4_ptr],eax
//Initialize UPk1k[idx_4] pointer
mov eax, dword ptr [vec_b_ptr]
mov ebx, dword ptr [idx_1]
imul ebx,8
add eax,ebx
mov dword ptr [vec_b_id1_ptr],eax
mov edi, dword ptr [idx_1] //Load idx_1 in edi
mov esi, 0 //initialize loop counter
jmp start_proc11
start_for11:inc esi //idx_2++
start_proc11:cmp esi, edi //idx_2<idx_1 ?
jge end_for11 //If yes so end of the loop
mov eax, UPk1k_id4_ptr //load UPk1k[idx_4] adress
mov ebx, vec_b_ptr //load vec_b adress
mov ecx, esi
imul ecx,8
add eax, ecx //UPk1k[idx_4+idx_2] in eax
add ebx, ecx //vec_b[idx_2] in eax
fld qword ptr [eax]//push UPk1k[idx_4+idx_2]
fld qword ptr [ebx] //push vec_b[idx_2]
mov edx,dword ptr [Sd_ptr]
fmul qword ptr [edx] //vec_b[idx_2]*Sd
fadd //pop UPk1k[idx_4+idx_2]+vec_b[idx_2]*Sd
mov edx,dword ptr [UPkk_id4_ptr]
fstp qword ptr [edx+esi*8] //pop UPkk[idx_4+idx_2]=UPk1k[idx_4+idx_2]+vec_b[idx_2]*Sd
fld qword ptr [ebx] //push vec_b[idx_2]
mov edx,dword ptr [vec_b_id1_ptr]
fld qword ptr [edx] //push vec_b[idx_2]
fmul qword ptr [eax]
fadd
fstp qword ptr [ebx]
jmp start_for11 //end of the loop
end_for11:
}
Many Thanks
Best Regards
CS
See MSDN about registers and that warning. It explains why the warning is produced: using EBX forces the compiler to preserve its value, which may be counterproductive for performance, and performance is the usual reason for using inline asm in the first place. Relevant quote:
In addition, by using EBX, ESI or EDI in inline assembly code, you
force the compiler to save and restore those registers in the function
prologue and epilogue.
To disable the warning, I think the syntax is
#pragma warning( disable : 4731 )
However, I'd try to use some other register instead, because the warning is there for a good reason, really, like most warnings.
In fact, looking at your asm code, simply replacing ebx with ecx should solve the problem.
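If you do end up suppressing the warning instead, it is worth scoping the pragma so it does not leak into the rest of the translation unit. A minimal sketch of the placement (MSVC, x86 only; the function body is a placeholder, not the code from the question):
#pragma warning(push)
#pragma warning(disable : 4731) // frame pointer register modified by inline assembly code
void copy_with_ebx()
{
    int src = 1, dst = 0;
    __asm
    {
        mov ebx, dword ptr [src] // touching ebx is what provokes C4731 here
        mov dword ptr [dst], ebx
    }
}
#pragma warning(pop)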

What syntax in a disassembled .s file is not acceptable to NASM?

This code is a C program (bubble sort) disassembled into assembly. How can I make the following code run if I put it in a .asm file and assemble it with NASM? If you know what needs changing, please say what to change it to. For instance, I understand that NASM won't accept DWORD PTR, but I haven't found out what to use instead. Thanks
.file "sort.c" .intel_syntax noprefix .text .globl
sort .type sort, #function
sort: .LFB0:
.cfi_startproc
push rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
mov rbp, rsp
.cfi_def_cfa_register 6
mov QWORD PTR [rbp-24], rdi
mov DWORD PTR [rbp-28], esi
mov DWORD PTR [rbp-12], 0
jmp .L2
.L6:
mov DWORD PTR [rbp-8], 0
jmp .L3
.L5:
mov eax, DWORD PTR [rbp-8]
cdqe
sal rax, 2
add rax, QWORD PTR [rbp-24]
mov edx, DWORD PTR [rax]
mov eax, DWORD PTR [rbp-8]
cdqe
add rax, 1
sal rax, 2
add rax, QWORD PTR [rbp-24]
mov eax, DWORD PTR [rax]
cmp edx, eax
jle .L4
mov eax, DWORD PTR [rbp-8]
cdqe
sal rax, 2
add rax, QWORD PTR [rbp-24]
mov eax, DWORD PTR [rax]
mov DWORD PTR [rbp-4], eax
mov eax, DWORD PTR [rbp-8]
cdqe
sal rax, 2
add rax, QWORD PTR [rbp-24]
mov edx, DWORD PTR [rbp-8]
movsx rdx, edx
add rdx, 1
sal rdx, 2
add rdx, QWORD PTR [rbp-24]
mov edx, DWORD PTR [rdx]
mov DWORD PTR [rax], edx
mov eax, DWORD PTR [rbp-8]
cdqe
add rax, 1
sal rax, 2
add rax, QWORD PTR [rbp-24]
mov edx, DWORD PTR [rbp-4]
mov DWORD PTR [rax], edx
.L4:
add DWORD PTR [rbp-8], 1
.L3:
mov eax, DWORD PTR [rbp-28]
sub eax, 1
sub eax, DWORD PTR [rbp-12]
cmp eax, DWORD PTR [rbp-8]
jg .L5
add DWORD PTR [rbp-12], 1
.L2:
mov eax, DWORD PTR [rbp-28]
sub eax, 1
cmp eax, DWORD PTR [rbp-12]
jg .L6
pop rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size sort, .-sort
.ident "GCC: (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3"
.section .note.GNU-stack,"",@progbits
Just remove PTR and all the GAS-specific .something directives (.file, .cfi_*, .size, .ident, .section, and so on) that NASM doesn't understand.
This assembles just fine:
; file: gas-nasm-sort.asm
bits 64
sort:
push rbp
mov rbp, rsp
mov QWORD [rbp-24], rdi
mov DWORD [rbp-28], esi
mov DWORD [rbp-12], 0
jmp .L2
.L6:
mov DWORD [rbp-8], 0
jmp .L3
.L5:
mov eax, DWORD [rbp-8]
cdqe
sal rax, 2
add rax, QWORD [rbp-24]
mov edx, DWORD [rax]
mov eax, DWORD [rbp-8]
cdqe
add rax, 1
sal rax, 2
add rax, QWORD [rbp-24]
mov eax, DWORD [rax]
cmp edx, eax
jle .L4
mov eax, DWORD [rbp-8]
cdqe
sal rax, 2
add rax, QWORD [rbp-24]
mov eax, DWORD [rax]
mov DWORD [rbp-4], eax
mov eax, DWORD [rbp-8]
cdqe
sal rax, 2
add rax, QWORD [rbp-24]
mov edx, DWORD [rbp-8]
movsx rdx, edx
add rdx, 1
sal rdx, 2
add rdx, QWORD [rbp-24]
mov edx, DWORD [rdx]
mov DWORD [rax], edx
mov eax, DWORD [rbp-8]
cdqe
add rax, 1
sal rax, 2
add rax, QWORD [rbp-24]
mov edx, DWORD [rbp-4]
mov DWORD [rax], edx
.L4:
add DWORD [rbp-8], 1
.L3:
mov eax, DWORD [rbp-28]
sub eax, 1
sub eax, DWORD [rbp-12]
cmp eax, DWORD [rbp-8]
jg .L5
add DWORD [rbp-12], 1
.L2:
mov eax, DWORD [rbp-28]
sub eax, 1
cmp eax, DWORD [rbp-12]
jg .L6
pop rbp
ret
Command:
nasm gas-nasm-sort.asm -f bin -o gas-nasm-sort.bin
But again, there's NASM documentation. Read it. In particular these sections:
2.2.2 NASM Requires Square Brackets For Memory References
2.2.3 NASM Doesn't Store Variable Types
