shrl vs sarl .. x86 Assembly gnu - gcc

I'm compiling my code with gcc and looking at the assembly, what is this code exactly doing?
shrl $20, %edx
leal (%edx,%eax), %eax
sarl 1, %eax
Say that the variable X is at the edx register, and Y is at eax and both are (32-bit int). What is going on here??
I know 'shrl $20, %edx' is shifting %eax right 20 bits, so is same as: eax/(2^20)
and then sarl is the same so 'sarl 1, %eax' = eax/(2^1).
Is that right, and if so what does leal do?

Assuming that sarl 1, %eax is really supposed to be sarl $1, %eax, then the whole thing equates to:
x = ((unsigned int) x) >> 20;
y = (x + y) >> 1
The leal instruction means: eax = eax + edx. This link might be useful to you, as well as this one.

Related

Load floating-point number from pointer to float and push on stack

This is a homework task. I've got a C program that calls a function calc(int, float*, float*, float*, float*) implemented with NASM. I want to do floating-point division with the data passed from C, but first I wanted to check if I access the data correctly.
This is an excerpt from the C program:
printf("read.c: F data1[0]=%f\n", data1[0]);
printf("read.c: X data1[0]=%X\n", *(int*)(&data1[0]));
calc(nlines, data1, data2, result1, result2);
For testing, I wanted to print out exactly the same from the assembler code, but whatever I tried, it wouldn't give me the right results. To be precise, outputting the %X format gives the same result, but the %f format gives some incredibly huge number.
global calc
extern printf
; -----------------------------------------------------------------------
; extern void calc(int nlines, float* data1, float* data2,
; float* result1, float* result2)
; -----------------------------------------------------------------------
calc:
section .data
.strf db "calc.asm: F data1[0]=%f", 10, 0
.strx db "calc.asm: X data1[0]=%X", 10, 0
section .text
enter 0, 0
; Move the value of float* data1 into ecx.
mov ecx, [esp + 12]
; Move the contents of data1[0] into esi.
mov esi, [ecx]
push esi
push .strf
call printf
add esp, 8
push esi
push .strx
call printf
add esp, 8
leave
ret
Outputs
read.c: F data1[0]=20.961977
read.c: X data1[0]=41A7B221
calc.asm: F data1[0]=-8796958457989122902187458235483374032941932827208012972482327255932202912296419757153331437662235555722313731094096197990916443553479942683040096290755684437514827018615169352974748429901549205109479495668937369584705401541113350145698235773041651907978442730240007381959397006695721667307435228446926569472.000000
calc.asm: X data1[0]=41A7B221
I've also looked into fld, but I couldn't find out how I can push the loaded value on stack. This didnt work:
; Move float* data1 into ecx
mov ecx, [esp + 12]
; Load the floating point number into esi.
fld dword [ecx]
fst esi
How to do it right?
I've stripped down read.c to this code
#include <stdio.h>
#include <stdlib.h>
#define MAXLINES 1024
extern void calc(int, float*, float*, float*, float*);
int main(int argc, char** argv)
{
int nlines;
float* data1 = malloc(sizeof(float)*MAXLINES);
float*data2, *results1, *results2;
printf("read.c: F data1[0]=%f\n", data1[0]);
printf("read.c: X data1[0]=%X\n", *(int*)(&data1[0]));
calc(nlines, data1, data2, results1, results2);
return 0;
}
and this is the assembler output:
.file "test.c"
.section .rodata
.LC0:
.string "read.c: F data1[0]=%f\n"
.LC1:
.string "read.c: X data1[0]=%X\n"
.text
.globl main
.type main, #function
main:
.LFB2:
.cfi_startproc
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
movl %esp, %ebp
.cfi_def_cfa_register 5
andl $-16, %esp
subl $64, %esp
movl $4096, (%esp)
call malloc
movl %eax, 44(%esp)
movl 44(%esp), %eax
flds (%eax)
fstpl 4(%esp)
movl $.LC0, (%esp)
call printf
movl 44(%esp), %eax
movl (%eax), %eax
movl %eax, 4(%esp)
movl $.LC1, (%esp)
call printf
movl 60(%esp), %eax
movl %eax, 16(%esp)
movl 56(%esp), %eax
movl %eax, 12(%esp)
movl 52(%esp), %eax
movl %eax, 8(%esp)
movl 44(%esp), %eax
movl %eax, 4(%esp)
movl 48(%esp), %eax
movl %eax, (%esp)
call calc
movl $0, %eax
leave
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
.LFE2:
.size main, .-main
.ident "GCC: (Ubuntu 4.8.4-2ubuntu1~14.04) 4.8.4"
.section .note.GNU-stack,"",#progbits
.LC1:
.string "read.c: F data1[0]=%f\n"
.LC2:
.string "read.c: X data1[0]=%X\n"
.text
.globl main
.type main, #function
main:
.LFB4:
.cfi_startproc
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
movl %esp, %ebp
.cfi_def_cfa_register 5
andl $-16, %esp
subl $64, %esp
movl 44(%esp), %eax
flds (%eax)
fstpl 4(%esp)
movl $.LC1, (%esp)
call printf
movl 44(%esp), %eax
movl (%eax), %eax
movl %eax, 4(%esp)
movl $.LC2, (%esp)
call printf
movl 60(%esp), %eax
movl %eax, 16(%esp)
movl 56(%esp), %eax
movl %eax, 12(%esp)
movl 52(%esp), %eax
movl %eax, 8(%esp)
movl 44(%esp), %eax
movl %eax, 4(%esp)
movl 48(%esp), %eax
movl %eax, (%esp)
call calc
movl $0, %eax
leave
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
.LFE4:
.size main, .-main
.ident "GCC: (Ubuntu 4.8.4-2ubuntu1~14.04) 4.8.4"
.section .note.GNU-stack,"",#progbits
Ok, I've now had a chance to test this and verify that what I suggested in my comment works. Here's my modified version of the assembly code, with some comments to explain the things I've added/changed:
global _calc
extern _printf
; -----------------------------------------------------------------------
; extern void calc(int nlines, float* data1, float* data2,
; float* result1, float* result2)
; -----------------------------------------------------------------------
_calc:
section .data
.strf db "calc.asm: F data1[0]=%f", 10, 0
.strx db "calc.asm: X data1[0]=%X", 10, 0
section .text
enter 0, 0
; Move the value of float* data1 into ecx.
mov ecx, [esp + 12]
; Move the contents of data1[0] into esi.
mov esi, [ecx]
fld dword [ecx] ; Load a single-precision float onto the FP stack.
sub esp,8 ; Make room for a double on the stack.
fstp qword [esp] ; Store the top of the FP stack on the regular stack as
; a double, and pop it off the FP stack.
push .strf
call _printf
add esp, 12 ; 12 == sizeof(char*) + sizeof(double)
push esi
push .strx
call _printf
add esp, 8
leave
ret

mmap from x86_64 on OSX

I've tried this over and over and looked at disassembly of a small C version that works fine, but trying to allocate this small block of memory keeps returning '9' ? Can somebody see what I'm doing wrong, thanks.
movl $0x0, %edi
movl $0x4000, %esi ## imm = 0x4000
movl $0x3, %edx
movl $0x1002, %ecx ## imm = 0x1002
movq $-0x1, %r8
movl $0x0, %r9d
movl $0x20000c5, %eax ## imm = 0x20000C5
syscall
Regards
Chris
OK, I found the problem and it's that if using syscall you need to pass r10 rather than rcx ! From C the disassembly uses rcx because it doesn't call the syscall directly !
Hope this helps others.
Chris

Why does this function prologue use several instructions to calculate the esp reduction?

I have looked at a few dumps of assembler code and there is this section (found here and here) in the main function:
<main+0>: push %ebp
<main+1>: mov %esp, %ebp
<main+3>: sub $0x8, %esp
<main+6>: and $0xfffffff0, %esp
<main+9>: mov $0x0, %eax
<main+14>: add $0xf, %eax
<main+17>: add $0xf, %eax
<main+20>: shr $0x4, %eax
<main+23>: shl $0x4, %eax
<main+26>: sub %eax, %esp
Can you explain me what (main+9) to (main+26) is used for?
Why is this done so 'inefficient'?
So you want a full walk-through without doing any research yourself? Sounds legit.
main+9: mov $0x0, %eax
Loads the register eax with hex 0 (=dec 0).
main+14: add $0xf, %eax
Adds hex F (= dec 15) to the zero in eax.
main+17: add $0xf, %eax
Adds hex F (= dec 15) to eax again. These three instructions could have also been done by
movl $0x1e, %eax
but who's counting instructions... Anyway, at this point eax contains hex 1E which is dec 30.
main+20: shr $0x4, %eax
Shifts the contents of eax to the right by four bits.
main+23: shl $0x4, %eax
Shifts eax right back. Why? Because this clears the lowest four bits. Now eax contains hex 10 (= dec 16)
main+26: sub %eax, %esp
Substracts eax from esp (the stack pointer). Since
main+6: and $0xfffffff0, %esp
cleared the lower four bits in esp previously, the new esp will be sixteen byte aligned, as per ABI. Why not simply use esp after main+6? Because on x86, the stack grows downwards from the top of memory. Simply masking off the lower bits of esp risks clobbering local variables. Hence the subtraction to grow the stack down to the sixteen byte boundary.

xorl %eax, %eax in x86_64 assembly code produced by gcc

I'm a total noob at assembly, just poking around a bit to see what's going on. Anyway, I wrote a very simple function:
void multA(double *x,long size)
{
long i;
for(i=0; i<size; ++i){
x[i] = 2.4*x[i];
}
}
I compiled it with:
gcc -S -m64 -O2 fun.c
And I get this:
.file "fun.c"
.text
.p2align 4,,15
.globl multA
.type multA, #function
multA:
.LFB34:
.cfi_startproc
testq %rsi, %rsi
jle .L1
movsd .LC0(%rip), %xmm1
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L3:
movsd (%rdi,%rax,8), %xmm0
mulsd %xmm1, %xmm0
movsd %xmm0, (%rdi,%rax,8)
addq $1, %rax
cmpq %rsi, %rax
jne .L3
.L1:
rep
ret
.cfi_endproc
.LFE34:
.size multA, .-multA
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LC0:
.long 858993459
.long 1073951539
.ident "GCC: (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3"
.section .note.GNU-stack,"",#progbits
The assembly output makes sense to me (mostly) except for the line xorl %eax, %eax. From googling, I gather that the purpose of this is simply to set %eax to zero, which in this case corresponds to my iterator long i;.
However, unless I am mistaken, %eax is a 32-bit register. So it seems to me that this should actually be xorq %rax, %rax, particularly since this is holding a 64-bit long int. Moreover, further down in the code, it actually uses the 64-bit register %rax to do the iterating, which never gets initialized outside of xorl %eax %eax, which would seem to only zero out the lower 32 bits of the register.
Am I missing something?
Also, out of curiosity, why are there two .long constants there at the bottom? The first one, 858993459 is equal to the double floating-point representation of 2.4 but I can't figure out what the second number is or why it is there.
I gather that the purpose of this is simply to set %eax to zero
Yes.
which in this case corresponds to my iterator long i;.
No. Your i is uninitialized in the declaration. Strictly speaking, that operation corresponds to the i = 0 expression in the for loop.
However, unless I am mistaken, %eax is a 32-bit register. So it seems to me that this should actually be xorq %rax, %rax, particularly since this is holding a 64-bit long int.
But clearing the lower double word of the register clears the entire register. This is not intuitive, but it's implicit.
Just to answer the second part: .long means 32 bit, and the two integral constants side-by-side form the IEEE-754 representation of the double 2.4:
Dec: 1073951539 858993459
Hex: 0x40033333 0x33333333
400 3333333333333
S+E Mantissa
The exponent is offset by 1023, so the actual exponent is 0x400 − 1023 = 1. The leading "one" in the mantissa is implied, so it's 21 × 0b1.001100110011... (You recognize this periodic expansion as 3/15, i.e. 0.2. Sure enough, 2 × 1.2 = 2.4.)

Division and modulus using single divl instruction (i386, amd64)

I was trying to come up with inline assembly for gcc to get both division and modulus using single divl instruction. Unfortunately, I am not that good at assembly. Could someone please help me on this? Thank you.
You're looking for something like this:
__asm__("divl %2\n"
: "=d" (remainder), "=a" (quotient)
: "g" (modulus), "d" (high), "a" (low));
Although I agree with the other commenters that usually GCC will do this for you and you should avoid inline assembly when possible, sometimes you need this construct.
For instance, if the high word is less than the modulus, then it is safe to perform the division like this. However, GCC isn't smart enough to realize this, because in the general case dividing a 64 bit number by a 32 bit number can lead to overflow, and so it calls to a library routine to do extra work. (Replace with 128 bit/64 bit for 64 bit ISAs.)
You shouldn't try to optimize this yourself. GCC already does this.
volatile int some_a = 18, some_b = 7;
int main(int argc, char *argv[]) {
int a = some_a, b = some_b;
printf("%d %d\n", a / b, a % b);
return 0;
}
Running
gcc -S test.c -O
yields
main:
.LFB11:
.cfi_startproc
subq $8, %rsp
.cfi_def_cfa_offset 16
movl some_a(%rip), %esi
movl some_b(%rip), %ecx
movl %esi, %eax
movl %esi, %edx
sarl $31, %edx
idivl %ecx
movl %eax, %esi
movl $.LC0, %edi
movl $0, %eax
call printf
movl $0, %eax
addq $8, %rsp
.cfi_def_cfa_offset 8
ret
Notice that the remainder, %edx, is not moved because it is also the third argument passed to printf.
EDIT: The 32-bit version is less confusing. Passing -m32 yields
main:
pushl %ebp
movl %esp, %ebp
andl $-16, %esp
subl $16, %esp
movl some_a, %eax
movl some_b, %ecx
movl %eax, %edx
sarl $31, %edx
idivl %ecx
movl %edx, 8(%esp)
movl %eax, 4(%esp)
movl $.LC0, (%esp)
call printf
movl $0, %eax
leave
ret
Fortunately, you don't have to resort to inline assembly to achieve this. gcc will do this automatically when it can.
$ cat divmod.c
struct sdiv { unsigned long quot; unsigned long rem; };
struct sdiv divide( unsigned long num, unsigned long divisor )
{
struct sdiv x = { num / divisor, num % divisor };
return x;
}
$ gcc -O3 -std=c99 -Wall -Wextra -pedantic -S divmod.c -o -
.file "divmod.c"
.text
.p2align 4,,15
.globl divide
.type divide, #function
divide:
.LFB0:
.cfi_startproc
movq %rdi, %rax
xorl %edx, %edx
divq %rsi
ret
.cfi_endproc
.LFE0:
.size divide, .-divide
.ident "GCC: (GNU) 4.4.4 20100630 (Red Hat 4.4.4-10)"
.section .note.GNU-stack,"",#progbits
Yes -- a divl will produce the quotient in eax and the remainder in edx. Using Intel syntax, for example:
mov eax, 17
mov ebx, 3
xor edx, edx
div ebx
; eax = 5
; edx = 2
Here is an example in linux kernel code about divl
/*
* do_div() is NOT a C function. It wants to return
* two values (the quotient and the remainder), but
* since that doesn't work very well in C, what it
* does is:
*
* - modifies the 64-bit dividend _in_place_
* - returns the 32-bit remainder
*
* This ends up being the most efficient "calling
* convention" on x86.
*/
#define do_div(n, base) \
({ \
unsigned long __upper, __low, __high, __mod, __base; \
__base = (base); \
if (__builtin_constant_p(__base) && is_power_of_2(__base)) { \
__mod = n & (__base - 1); \
n >>= ilog2(__base); \
} else { \
asm("" : "=a" (__low), "=d" (__high) : "A" (n));\
__upper = __high; \
if (__high) { \
__upper = __high % (__base); \
__high = __high / (__base); \
} \
asm("divl %2" : "=a" (__low), "=d" (__mod) \
: "rm" (__base), "0" (__low), "1" (__upper)); \
asm("" : "=A" (n) : "a" (__low), "d" (__high)); \
} \
__mod; \
})

Resources