Why does GCC not follow the System V AMD64 ABI here? [duplicate] - gcc

This question already has answers here:
Closed 10 years ago.
Possible Duplicate:
x86 assembly registers — Why do they work the way they do?
I've compiled the following program:
#include <stdio.h>
int square(int x) {
return x * x;
}
int main() {
int y = square(9);
printf("%d\n", y);
return 0;
}
two times with different options on OSX with GCC 4.2.1:
gcc foo.c -o foo_32.s -S -fverbose-asm -m32 -O1
gcc foo.c -o foo_64.s -S -fverbose-asm -m64 -O1
The result for 32 bit:
_square: ## #square
## BB#0: ## %entry
pushl %ebp
movl %esp, %ebp
movl 8(%ebp), %eax
imull %eax, %eax
popl %ebp
ret
And for 64-bit:
_square: ## #square
Leh_func_begin1:
## BB#0: ## %entry
pushq %rbp
Ltmp0:
movq %rsp, %rbp
Ltmp1:
movl %edi, %eax
imull %eax, %eax
popq %rbp
ret
As is evident, the 32-bit version retrieves the parameter from the stack, which is what one would expect with cdecl. The 64-bit version however uses the EDI register to pass the parameter.
Doesn't this violate the System V AMD64 ABI, which specifies that the RDI, RSI, RDX, RCX, R8, R9, XMM0–7 registers should be used? Or is this only the case for true 64-bit values like long?

EDI is simply the lower half of RDI so the compiler is passing the argument in RDI, but the argument is only 32-bits long, so it only takes up half of the register.

Related

Can not compile Assembly (.s) code with Cygwin on Win

I have this ".s" file, written in AT&T assembly.
.globl interleave
interleave:
pushl %ebx
pushl %esi
pushl %edi
movl 16(%esp), %ebx #a
movl 20(%esp), %esi #b
movl 24(%esp), %edi #c
D: movb (%ebx), %cl
testb %cl, %cl
jz W
movb %cl, (%edi) #*c
incl %edi
incl %ebx
T: movb (%esi), %dl
testb %dl, %dl
jz W
movb %dl, (%edi) #*c
incl %edi
incl %esi
W: orb %cl,%dl
jz E
#movb $0, %al
jmp D
E: movb $0, (%edi)
popl %edi
popl %esi
popl %ebx
ret
I want to compile it on windows 10 with cygwin with the following main file, but it does not work.
void interleave(const char* a, const char* b, char* c) ;
int main(int argc, char const *argv[]){
const char* a = "car";
const char* b = "old";
char c[] = "";
interleave(a,b,c);
printf("%s (expected coalrd)\n", c);
return 0;}
With gcc i get es1B.s:3: Error: invalid instruction suffix for push
With gcc -m32 I get collect2: error: ld returned 1 exit status
I even tried to compile it in 32 bit with i686-w64-mingw32-gcc but I get undefined reference to interleave
I am able to compile it and run it on linux with gcc -m32 , but is there a way to make this work on windows?
Thanks
Solved by adding an underscore before the function name as suggested in the comments:
.globl _interleave
_interleave:
...
Compiling with i686-w64-mingw32-gcc now works.

Why GCC does not generates assembly code with 1 **imul** instruction for 1 multiplication in C?

I'm checking the assembly code of the following simple program on https://godbolt.org/ with parameters -Wall -m32 using gcc 6.3 and to my surprise the generated assembly code takes three multiplications to perform one multiplication in C as shown below.
#include<stdio.h>
#include<inttypes.h>
int main(void){
int32_t ax=-854763;
int32_t bx=586478;
int64_t cx= (int64_t)ax*bx;
printf("%lld\n", cx);
}
The generated assembly code for the statement int64_t cx= (int64_t)ax*bx; is given below. There are two imull and one mull instructions.
movl -28(%ebp), %eax
movl %eax, %ecx
movl %eax, %ebx
sarl $31, %ebx
movl -32(%ebp), %eax
cltd
movl %ebx, %edi
imull %eax, %edi
movl %edx, %esi
imull %ecx, %esi
addl %edi, %esi
mull %ecx
leal (%esi,%edx), %ecx
movl %ecx, %edx
movl %eax, -40(%ebp)
movl %edx, -36(%ebp)
movl %eax, -40(%ebp)
movl %edx, -36(%ebp)`
Is it possible to enforce the gcc compiler/assembler to do it with just one multiplication without using inline assembly? Because in the original code I have to perform many such multiplications within a statement and in different statements.
Why the compiler does not perform it with one imul instruction? I have also checked the different versions of the gcc on https://godbolt.org/ but the result is same three multiplication instruction.

Load floating-point number from pointer to float and push on stack

This is a homework task. I've got a C program that calls a function calc(int, float*, float*, float*, float*) implemented with NASM. I want to do floating-point division with the data passed from C, but first I wanted to check if I access the data correctly.
This is an excerpt from the C program:
printf("read.c: F data1[0]=%f\n", data1[0]);
printf("read.c: X data1[0]=%X\n", *(int*)(&data1[0]));
calc(nlines, data1, data2, result1, result2);
For testing, I wanted to print out exactly the same from the assembler code, but whatever I tried, it wouldn't give me the right results. To be precise, outputting the %X format gives the same result, but the %f format gives some incredibly huge number.
global calc
extern printf
; -----------------------------------------------------------------------
; extern void calc(int nlines, float* data1, float* data2,
; float* result1, float* result2)
; -----------------------------------------------------------------------
calc:
section .data
.strf db "calc.asm: F data1[0]=%f", 10, 0
.strx db "calc.asm: X data1[0]=%X", 10, 0
section .text
enter 0, 0
; Move the value of float* data1 into ecx.
mov ecx, [esp + 12]
; Move the contents of data1[0] into esi.
mov esi, [ecx]
push esi
push .strf
call printf
add esp, 8
push esi
push .strx
call printf
add esp, 8
leave
ret
Outputs
read.c: F data1[0]=20.961977
read.c: X data1[0]=41A7B221
calc.asm: F data1[0]=-8796958457989122902187458235483374032941932827208012972482327255932202912296419757153331437662235555722313731094096197990916443553479942683040096290755684437514827018615169352974748429901549205109479495668937369584705401541113350145698235773041651907978442730240007381959397006695721667307435228446926569472.000000
calc.asm: X data1[0]=41A7B221
I've also looked into fld, but I couldn't find out how I can push the loaded value on stack. This didnt work:
; Move float* data1 into ecx
mov ecx, [esp + 12]
; Load the floating point number into esi.
fld dword [ecx]
fst esi
How to do it right?
I've stripped down read.c to this code
#include <stdio.h>
#include <stdlib.h>
#define MAXLINES 1024
extern void calc(int, float*, float*, float*, float*);
int main(int argc, char** argv)
{
int nlines;
float* data1 = malloc(sizeof(float)*MAXLINES);
float*data2, *results1, *results2;
printf("read.c: F data1[0]=%f\n", data1[0]);
printf("read.c: X data1[0]=%X\n", *(int*)(&data1[0]));
calc(nlines, data1, data2, results1, results2);
return 0;
}
and this is the assembler output:
.file "test.c"
.section .rodata
.LC0:
.string "read.c: F data1[0]=%f\n"
.LC1:
.string "read.c: X data1[0]=%X\n"
.text
.globl main
.type main, #function
main:
.LFB2:
.cfi_startproc
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
movl %esp, %ebp
.cfi_def_cfa_register 5
andl $-16, %esp
subl $64, %esp
movl $4096, (%esp)
call malloc
movl %eax, 44(%esp)
movl 44(%esp), %eax
flds (%eax)
fstpl 4(%esp)
movl $.LC0, (%esp)
call printf
movl 44(%esp), %eax
movl (%eax), %eax
movl %eax, 4(%esp)
movl $.LC1, (%esp)
call printf
movl 60(%esp), %eax
movl %eax, 16(%esp)
movl 56(%esp), %eax
movl %eax, 12(%esp)
movl 52(%esp), %eax
movl %eax, 8(%esp)
movl 44(%esp), %eax
movl %eax, 4(%esp)
movl 48(%esp), %eax
movl %eax, (%esp)
call calc
movl $0, %eax
leave
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
.LFE2:
.size main, .-main
.ident "GCC: (Ubuntu 4.8.4-2ubuntu1~14.04) 4.8.4"
.section .note.GNU-stack,"",#progbits
.LC1:
.string "read.c: F data1[0]=%f\n"
.LC2:
.string "read.c: X data1[0]=%X\n"
.text
.globl main
.type main, #function
main:
.LFB4:
.cfi_startproc
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
movl %esp, %ebp
.cfi_def_cfa_register 5
andl $-16, %esp
subl $64, %esp
movl 44(%esp), %eax
flds (%eax)
fstpl 4(%esp)
movl $.LC1, (%esp)
call printf
movl 44(%esp), %eax
movl (%eax), %eax
movl %eax, 4(%esp)
movl $.LC2, (%esp)
call printf
movl 60(%esp), %eax
movl %eax, 16(%esp)
movl 56(%esp), %eax
movl %eax, 12(%esp)
movl 52(%esp), %eax
movl %eax, 8(%esp)
movl 44(%esp), %eax
movl %eax, 4(%esp)
movl 48(%esp), %eax
movl %eax, (%esp)
call calc
movl $0, %eax
leave
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
.LFE4:
.size main, .-main
.ident "GCC: (Ubuntu 4.8.4-2ubuntu1~14.04) 4.8.4"
.section .note.GNU-stack,"",#progbits
Ok, I've now had a chance to test this and verify that what I suggested in my comment works. Here's my modified version of the assembly code, with some comments to explain the things I've added/changed:
global _calc
extern _printf
; -----------------------------------------------------------------------
; extern void calc(int nlines, float* data1, float* data2,
; float* result1, float* result2)
; -----------------------------------------------------------------------
_calc:
section .data
.strf db "calc.asm: F data1[0]=%f", 10, 0
.strx db "calc.asm: X data1[0]=%X", 10, 0
section .text
enter 0, 0
; Move the value of float* data1 into ecx.
mov ecx, [esp + 12]
; Move the contents of data1[0] into esi.
mov esi, [ecx]
fld dword [ecx] ; Load a single-precision float onto the FP stack.
sub esp,8 ; Make room for a double on the stack.
fstp qword [esp] ; Store the top of the FP stack on the regular stack as
; a double, and pop it off the FP stack.
push .strf
call _printf
add esp, 12 ; 12 == sizeof(char*) + sizeof(double)
push esi
push .strx
call _printf
add esp, 8
leave
ret

Why does gcc push %rbx at the beginning of main?

The latest version of gcc is producing assembly that doesn't make sense to me. I compiled the code using no optimization; but, some parts of this code don't make sense, even with no optimization.
Here is the C source:
#include <stdio.h>
int main()
{
int a = 1324;
int b = 5657;
int difference = 9876;
int printf_answer = 2221;
difference = a - b;
printf_answer = printf("%d + %d = %d\n", a, b, difference);
return difference;
}
It produces this assembly:
.file "exampleIML-1b.c"
.section .rodata
.LC0:
.string "%d + %d = %d\n"
.text
.globl main
.type main, #function
main:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %rbx
subq $24, %rsp
movl $1324, -32(%rbp)
movl $5657, -28(%rbp)
movl $9876, -24(%rbp)
movl $2221, -20(%rbp)
movl -28(%rbp), %eax
movl -32(%rbp), %edx
movl %edx, %ecx
subl %eax, %ecx
movl %ecx, %eax
movl %eax, -24(%rbp)
movl $.LC0, %eax
movl -24(%rbp), %ecx
movl -28(%rbp), %edx
movl -32(%rbp), %ebx
.cfi_offset 3, -24
movl %ebx, %esi
movq %rax, %rdi
movl $0, %eax
call printf
movl %eax, -20(%rbp)
movl -24(%rbp), %eax
addq $24, %rsp
popq %rbx
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size main, .-main
.ident "GCC: (GNU) 4.4.6 20120305 (Red Hat 4.4.6-4)"
.section .note.GNU-stack,"",#progbits
Several things don't make sense:
(1) Why are we pushing %rbx? What is in %rbx that needs to be saved?
(2) Why are we moving %edx to %ecx before subtracting? What doesn't it just do sub %eax, %edx?
(3) Similarly, why the move from %ecx back to %eax before storing the value?
(4) The compiler is putting the variable a in memory location -32(%rbp). Unless I'm adding wrong, isn't -32(%rbp) equal to the stack pointer? Shouldn't all local variables be stored at values less than the current stack pointer?
I'm using this version of gcc:
[eos17:~/Courses/CS451/IntelMachineLanguage]$ gcc -v
Using built-in specs.
Target: x86_64-redhat-linux
Configured with: ../configure --prefix=/usr --mandir=/usr/share/man --infodir=/usr/share/info --with-bugurl=http://bugzilla.redhat.com/bugzilla --enable-bootstrap --enable-shared --enable-threads=posix --enable-checking=release --with-system-zlib --enable-__cxa_atexit --disable-libunwind-exceptions --enable-gnu-unique-object --enable-languages=c,c++,objc,obj-c++,java,fortran,ada --enable-java-awt=gtk --disable-dssi --with-java-home=/usr/lib/jvm/java-1.5.0-gcj-1.5.0.0/jre --enable-libgcj-multifile --enable-java-maintainer-mode --with-ecj-jar=/usr/share/java/eclipse-ecj.jar --disable-libjava-multilib --with-ppl --with-cloog --with-tune=generic --with-arch_32=i686 --build=x86_64-redhat-linux
Thread model: posix
gcc version 4.4.6 20120305 (Red Hat 4.4.6-4) (GCC)
GCC dictates how the stack is used. Contract between caller and callee on x86:
* after call instruction:
o %eip points at first instruction of function
o %esp+4 points at first argument
o %esp points at return address
* after ret instruction:
o %eip contains return address
o %esp points at arguments pushed by caller
o called function may have trashed arguments
o %eax contains return value (or trash if function is void)
o %ecx, %edx may be trashed
o %ebp, %ebx, %esi, %edi must contain contents from time of call
* Terminology:
o %eax, %ecx, %edx are "caller save" registers
o %ebp, %ebx, %esi, %edi are "callee save" registers
The main function is like any other function in this context. gcc decided to use ebx for intermediate calculations, so it preserves its value.
By default gcc compiles with optimization disabled, which is the case here, apparently.
You need to enable it with one of the optimization switches (e.g. -O2 or -O3).
Then you will not see redundant and seemingly meaningless things.
As for rbx, it has to be preserved because that's what the calling conventions require. Your function modifies it (movl -32(%rbp), %ebx), so it has to be saved and restored explicitly.

Division and modulus using single divl instruction (i386, amd64)

I was trying to come up with inline assembly for gcc to get both division and modulus using single divl instruction. Unfortunately, I am not that good at assembly. Could someone please help me on this? Thank you.
You're looking for something like this:
__asm__("divl %2\n"
: "=d" (remainder), "=a" (quotient)
: "g" (modulus), "d" (high), "a" (low));
Although I agree with the other commenters that usually GCC will do this for you and you should avoid inline assembly when possible, sometimes you need this construct.
For instance, if the high word is less than the modulus, then it is safe to perform the division like this. However, GCC isn't smart enough to realize this, because in the general case dividing a 64 bit number by a 32 bit number can lead to overflow, and so it calls to a library routine to do extra work. (Replace with 128 bit/64 bit for 64 bit ISAs.)
You shouldn't try to optimize this yourself. GCC already does this.
volatile int some_a = 18, some_b = 7;
int main(int argc, char *argv[]) {
int a = some_a, b = some_b;
printf("%d %d\n", a / b, a % b);
return 0;
}
Running
gcc -S test.c -O
yields
main:
.LFB11:
.cfi_startproc
subq $8, %rsp
.cfi_def_cfa_offset 16
movl some_a(%rip), %esi
movl some_b(%rip), %ecx
movl %esi, %eax
movl %esi, %edx
sarl $31, %edx
idivl %ecx
movl %eax, %esi
movl $.LC0, %edi
movl $0, %eax
call printf
movl $0, %eax
addq $8, %rsp
.cfi_def_cfa_offset 8
ret
Notice that the remainder, %edx, is not moved because it is also the third argument passed to printf.
EDIT: The 32-bit version is less confusing. Passing -m32 yields
main:
pushl %ebp
movl %esp, %ebp
andl $-16, %esp
subl $16, %esp
movl some_a, %eax
movl some_b, %ecx
movl %eax, %edx
sarl $31, %edx
idivl %ecx
movl %edx, 8(%esp)
movl %eax, 4(%esp)
movl $.LC0, (%esp)
call printf
movl $0, %eax
leave
ret
Fortunately, you don't have to resort to inline assembly to achieve this. gcc will do this automatically when it can.
$ cat divmod.c
struct sdiv { unsigned long quot; unsigned long rem; };
struct sdiv divide( unsigned long num, unsigned long divisor )
{
struct sdiv x = { num / divisor, num % divisor };
return x;
}
$ gcc -O3 -std=c99 -Wall -Wextra -pedantic -S divmod.c -o -
.file "divmod.c"
.text
.p2align 4,,15
.globl divide
.type divide, #function
divide:
.LFB0:
.cfi_startproc
movq %rdi, %rax
xorl %edx, %edx
divq %rsi
ret
.cfi_endproc
.LFE0:
.size divide, .-divide
.ident "GCC: (GNU) 4.4.4 20100630 (Red Hat 4.4.4-10)"
.section .note.GNU-stack,"",#progbits
Yes -- a divl will produce the quotient in eax and the remainder in edx. Using Intel syntax, for example:
mov eax, 17
mov ebx, 3
xor edx, edx
div ebx
; eax = 5
; edx = 2
Here is an example in linux kernel code about divl
/*
* do_div() is NOT a C function. It wants to return
* two values (the quotient and the remainder), but
* since that doesn't work very well in C, what it
* does is:
*
* - modifies the 64-bit dividend _in_place_
* - returns the 32-bit remainder
*
* This ends up being the most efficient "calling
* convention" on x86.
*/
#define do_div(n, base) \
({ \
unsigned long __upper, __low, __high, __mod, __base; \
__base = (base); \
if (__builtin_constant_p(__base) && is_power_of_2(__base)) { \
__mod = n & (__base - 1); \
n >>= ilog2(__base); \
} else { \
asm("" : "=a" (__low), "=d" (__high) : "A" (n));\
__upper = __high; \
if (__high) { \
__upper = __high % (__base); \
__high = __high / (__base); \
} \
asm("divl %2" : "=a" (__low), "=d" (__mod) \
: "rm" (__base), "0" (__low), "1" (__upper)); \
asm("" : "=A" (n) : "a" (__low), "d" (__high)); \
} \
__mod; \
})

Resources