gcc optimization of struct assignment

gcc optimization of struct assignment - gcc

Here is my example code:
/* Packed record whose fields can be written one at a time or through the
   overlaying scalars A and B. The listings below store to offsets 0-11. */
struct AAA {
/* Offsets 0-7: a and b, overlaid by A (a 64-bit long on this target, per the movq in the listings). */
union{
struct{
int a;
int b;
};
long A;
};
/* Offsets 8-11: c (offset 8), d (offset 10), e (offset 11), overlaid by B. */
union{
struct{
short c;
char d;
char e;
};
int B;
};
} __attribute__((packed));
/* Initialise every field of *aaa individually: a=1, b=2, c=3, d=4, e=5. */
void fun1(struct AAA *aaa){
aaa->a = 1;
aaa->b = 2;
aaa->c = 3;
aaa->d = 4;
aaa->e = 5;
}
/* Write the same field values as fun1, but through the overlaying scalars A and B. */
void fun2(struct AAA *aaa){
aaa->A = (2L<<32)+1; /* b=2 in the high 32 bits, a=1 in the low 32 bits (needs a 64-bit long) */
aaa->B = (5 << 24) + (4<<16) + 3; /* e=5 in bits 24-31, d=4 in bits 16-23, c=3 in bits 0-15 */
}
When I compile it to asm code using gcc 5.4.0, I got:
fun1:
.LFB0:
.cfi_startproc
movl $3, %eax
movl $1, (%rdi)
movl $2, 4(%rdi)
movw %ax, 8(%rdi)
movb $4, 10(%rdi)
movb $5, 11(%rdi)
ret
.cfi_endproc
.LFE0:
.size fun1, .-fun1
.section .text.unlikely
.LCOLDE0:
.text
.LHOTE0:
.section .text.unlikely
.LCOLDB1:
.text
.LHOTB1:
.p2align 4,,15
.globl fun2
.type fun2, #function
fun2:
.LFB1:
.cfi_startproc
movabsq $8589934593, %rax
movl $84148227, 8(%rdi)
movq %rax, (%rdi)
ret
.cfi_endproc
And when I compile it with gcc 7.3.0, I got
fun1:
.LFB0:
.cfi_startproc
movabsq $8589934593, %rax
movl $84148227, 8(%rdi)
movq %rax, (%rdi)
ret
.cfi_endproc
.LFE0:
.size fun1, .-fun1
.p2align 4,,15
.globl fun2
.type fun2, #function
fun2:
.LFB1:
.cfi_startproc
movabsq $8589934593, %rax
movl $84148227, 8(%rdi)
movq %rax, (%rdi)
ret
.cfi_endproc
Both were compiled with the -O3 option. The difference is obvious: newer versions of gcc optimize fun1 into the same code as fun2.
Is fun2 really faster than fun1 when both are compiled by gcc 5.4.0?
I have some old projects that are compiled with even older versions of gcc (4.x), and I found a lot of code similar to my example. If I want to optimize them, is it a good idea to rewrite code like fun1 in the style of fun2? I can't update gcc for now.

Assuming these programs run on modern CPU architectures, the difference would be on the order of nanoseconds.
Unless your code consists mostly of such assignments and you really need to squeeze out that last bit of performance, I would keep the fun1 form for better readability and maintainability.

Related

Why does gcc optimize out this comparison and how can I get it to get compiled in?

unsigned char c;
int n;
int main(){
if ((c && 0xc0) == 0xc0 ) {
n=0;
}
}
When testing on godbolt, only gcc does this.

It's actually very simple. Since you use && as an operator, gcc can deduce that the condition always yields false.
If you use the bitwise and operator (&), gcc adds code for the if:
#include <stdio.h>
unsigned char c = 0xff;
int n = 1;
int main(){
if ((c & 0xc0) == 0xc0 ) {
n=0;
}
printf("%d\n", n);
}
yields:
ronald#oncilla:~/tmp$ cc -S x.c && cat x.s
.file "x.c"
.text
.globl c
.data
.type c, #object
.size c, 1
c:
.byte -1
.globl n
.align 4
.type n, #object
.size n, 4
n:
.long 1
.section .rodata
.LC0:
.string "%d\n"
.text
.globl main
.type main, #function
main:
.LFB0:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movzbl c(%rip), %eax
movzbl %al, %eax
andl $192, %eax
cmpl $192, %eax
jne .L2
movl $0, n(%rip)
.L2:
movl n(%rip), %eax
movl %eax, %esi
leaq .LC0(%rip), %rdi
movl $0, %eax
call printf#PLT
movl $0, %eax
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size main, .-main
.ident "GCC: (Ubuntu 9.4.0-1ubuntu1~20.04) 9.4.0"
.section .note.GNU-stack,"",#progbits
.section .note.gnu.property,"a"
.align 8
.long 1f - 0f
.long 4f - 1f
.long 5
0:
.string "GNU"
1:
.align 8
.long 0xc0000002
.long 3f - 2f
2:
.long 0x3
3:
.align 8
4:

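In C, && is the logical AND: each operand is treated as a truth value (nonzero means true), and the result is an int that is either 0 or 1. So (c && 0xc0) can only be 0 or 1 and can never equal 0xc0, which is why gcc removes the comparison.
& is the bitwise AND, which keeps only the bits that are set in both operands, so (c & 0xc0) == 0xc0 is true exactly when both of the top two bits of c are set. If you combine & with &&, the && part is true if any bits are left after masking.
If you compile with -Wall, gcc warns about this kind of mistake at compile time (-Wbool-compare: comparison of a constant with a boolean expression is always false).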

storage register for scanf call in gas

I am trying to understand the scanf function and have 3 questions regarding it.
this is c file:
#include <stdio.h>
#include <stdlib.h>
int main(){
int x;
printf("Enter X:\n");
scanf("%i",&x);
printf("You entered %d...\n",x);
return 0;
}
and here is gas:
.text
.section .rodata
.LC0:
.string "Enter X:"
.LC1:
.string "%i"
.LC2:
.string "You entered %d...\n"
.text
.globl main
.type main, #function
main:
pushq %rbp #
movq %rsp, %rbp #,
subq $16, %rsp #,
# a.c:5: printf("Enter X:\n");
leaq .LC0(%rip), %rdi #,
call puts#PLT #
# a.c:6: scanf("%i",&x);
leaq -4(%rbp), %rax #, tmp90
movq %rax, %rsi # tmp90,
leaq .LC1(%rip), %rdi #,
movl $0, %eax #,
call __isoc99_scanf#PLT #
# a.c:7: printf("You entered %d...\n",x);
movl -4(%rbp), %eax # x, x.0_1
movl %eax, %esi # x.0_1,
leaq .LC2(%rip), %rdi #,
movl $0, %eax #,
call printf#PLT #
# a.c:8: return 0;
movl $0, %eax #, _6
# a.c:9: }
leave
ret
.size main, .-main
.ident "GCC: (Debian 8.3.0-6) 8.3.0"
.section .note.GNU-stack,"",#progbits
1)
The rsi should take address of x int, but it takes the address from -4(%rbp), where there is nothing, in time of execution. Because the initialization of x variable comes from the stdin as scanf waits for input to init the variable. But the what is in -4(%rbp) in the time of instruction leaq -4(%rbp), %rax? It looks like garbage, not address of x, which value should be initialized from stdin.
2) According to this ("Integer describing number of floating point arguments in xmm registers not passed to rax"), the movl $0, %eax sets %al to the number of floating-point arguments passed in xmm registers (zero here), and the same convention is used for printf. So my question is: to which functions from glibc or other libraries does this convention apply? (So do I have to zero %al for printf, scanf, ...?) I assume it applies to every function that takes a va_list or a variable number of arguments?
3) where in the gas source is stack canary in that should protect scanf buffer from overflow? according to this: https://reverseengineering.stackexchange.com/questions/10823/how-does-scanf-interact-with-my-code-in-assembly, this should set canary (in masm):
0x080484c5 <+6>: mov eax,gs:0x14
0x080484cb <+12>: mov DWORD PTR [ebp-0xc],eax
0x080484ce <+15>: xor eax,eax
But I see nothing similar to this in my gas source, which is also output from gcc, which should set it by itself (unless there is some checking in the the scanf function itself which is not visible in my source). So where is it?

Can not compile Assembly (.s) code with Cygwin on Win

I have this ".s" file, written in AT&T assembly.
# void interleave(const char *a, const char *b, char *c)
# Syntax: AT&T (GAS). ABI: 32-bit cdecl, arguments read from the stack.
# Copies bytes alternately from a and b into c until BOTH strings are
# exhausted, then NUL-terminates c. The caller must supply a buffer of at
# least strlen(a) + strlen(b) + 1 bytes.
# Registers: ebx = cursor in a, esi = cursor in b, edi = cursor in c
#            (all three are callee-saved and restored before returning).
.globl interleave
interleave:
        pushl   %ebx
        pushl   %esi
        pushl   %edi
        movl    16(%esp), %ebx          # a  (3 pushes + return address = 16)
        movl    20(%esp), %esi          # b
        movl    24(%esp), %edi          # c
D:
        movb    (%ebx), %cl             # cl = *a
        testb   %cl, %cl
        jz      T                       # a exhausted: skip its store, but still
                                        # load *b so the test at W sees fresh data
        movb    %cl, (%edi)             # *c++ = *a++
        incl    %edi
        incl    %ebx
T:
        movb    (%esi), %dl             # dl = *b
        testb   %dl, %dl
        jz      W
        movb    %dl, (%edi)             # *c++ = *b++
        incl    %edi
        incl    %esi
W:
        orb     %cl, %dl                # zero only when both strings have ended
        jnz     D
        movb    $0, (%edi)              # terminate the output string
        popl    %edi
        popl    %esi
        popl    %ebx
        ret
I want to compile it on windows 10 with cygwin with the following main file, but it does not work.
#include <stdio.h>

/* Implemented in assembly: interleaves a and b into c and NUL-terminates c. */
void interleave(const char* a, const char* b, char* c);

int main(int argc, char const *argv[]){
    const char* a = "car";
    const char* b = "old";
    /* The output needs strlen(a) + strlen(b) + 1 bytes; a buffer declared as
       `char c[] = ""` holds only one byte and would be overrun. */
    char c[3 + 3 + 1];
    interleave(a,b,c);
    printf("%s (expected coalrd)\n", c);
    return 0;
}
With gcc i get es1B.s:3: Error: invalid instruction suffix for push
With gcc -m32 I get collect2: error: ld returned 1 exit status
I even tried to compile it in 32 bit with i686-w64-mingw32-gcc but I get undefined reference to interleave
I am able to compile it and run it on linux with gcc -m32 , but is there a way to make this work on windows?
Thanks

Solved by adding an underscore before the function name as suggested in the comments:
.globl _interleave
_interleave:
...
Compiling with i686-w64-mingw32-gcc now works.

Load floating-point number from pointer to float and push on stack

This is a homework task. I've got a C program that calls a function calc(int, float*, float*, float*, float*) implemented with NASM. I want to do floating-point division with the data passed from C, but first I wanted to check if I access the data correctly.
This is an excerpt from the C program:
printf("read.c: F data1[0]=%f\n", data1[0]);
printf("read.c: X data1[0]=%X\n", *(int*)(&data1[0]));
calc(nlines, data1, data2, result1, result2);
For testing, I wanted to print out exactly the same from the assembler code, but whatever I tried, it wouldn't give me the right results. To be precise, outputting the %X format gives the same result, but the %f format gives some incredibly huge number.
global calc
extern printf
; -----------------------------------------------------------------------
; extern void calc(int nlines, float* data1, float* data2,
; float* result1, float* result2)
; -----------------------------------------------------------------------
calc:
section .data
.strf db "calc.asm: F data1[0]=%f", 10, 0
.strx db "calc.asm: X data1[0]=%X", 10, 0
section .text
enter 0, 0
; Move the value of float* data1 into ecx.
mov ecx, [esp + 12]
; Move the contents of data1[0] into esi.
mov esi, [ecx]
push esi
push .strf
call printf
add esp, 8
push esi
push .strx
call printf
add esp, 8
leave
ret
Outputs
read.c: F data1[0]=20.961977
read.c: X data1[0]=41A7B221
calc.asm: F data1[0]=-8796958457989122902187458235483374032941932827208012972482327255932202912296419757153331437662235555722313731094096197990916443553479942683040096290755684437514827018615169352974748429901549205109479495668937369584705401541113350145698235773041651907978442730240007381959397006695721667307435228446926569472.000000
calc.asm: X data1[0]=41A7B221
I've also looked into fld, but I couldn't find out how I can push the loaded value on stack. This didnt work:
; Move float* data1 into ecx
mov ecx, [esp + 12]
; Load the floating point number into esi.
fld dword [ecx]
fst esi
How to do it right?
I've stripped down read.c to this code
#include <stdio.h>
#include <stdlib.h>
#define MAXLINES 1024
extern void calc(int, float*, float*, float*, float*);
int main(int argc, char** argv)
{
int nlines;
float* data1 = malloc(sizeof(float)*MAXLINES);
float*data2, *results1, *results2;
printf("read.c: F data1[0]=%f\n", data1[0]);
printf("read.c: X data1[0]=%X\n", *(int*)(&data1[0]));
calc(nlines, data1, data2, results1, results2);
return 0;
}
and this is the assembler output:
.file "test.c"
.section .rodata
.LC0:
.string "read.c: F data1[0]=%f\n"
.LC1:
.string "read.c: X data1[0]=%X\n"
.text
.globl main
.type main, #function
main:
.LFB2:
.cfi_startproc
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
movl %esp, %ebp
.cfi_def_cfa_register 5
andl $-16, %esp
subl $64, %esp
movl $4096, (%esp)
call malloc
movl %eax, 44(%esp)
movl 44(%esp), %eax
flds (%eax)
fstpl 4(%esp)
movl $.LC0, (%esp)
call printf
movl 44(%esp), %eax
movl (%eax), %eax
movl %eax, 4(%esp)
movl $.LC1, (%esp)
call printf
movl 60(%esp), %eax
movl %eax, 16(%esp)
movl 56(%esp), %eax
movl %eax, 12(%esp)
movl 52(%esp), %eax
movl %eax, 8(%esp)
movl 44(%esp), %eax
movl %eax, 4(%esp)
movl 48(%esp), %eax
movl %eax, (%esp)
call calc
movl $0, %eax
leave
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
.LFE2:
.size main, .-main
.ident "GCC: (Ubuntu 4.8.4-2ubuntu1~14.04) 4.8.4"
.section .note.GNU-stack,"",#progbits
.LC1:
.string "read.c: F data1[0]=%f\n"
.LC2:
.string "read.c: X data1[0]=%X\n"
.text
.globl main
.type main, #function
main:
.LFB4:
.cfi_startproc
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
movl %esp, %ebp
.cfi_def_cfa_register 5
andl $-16, %esp
subl $64, %esp
movl 44(%esp), %eax
flds (%eax)
fstpl 4(%esp)
movl $.LC1, (%esp)
call printf
movl 44(%esp), %eax
movl (%eax), %eax
movl %eax, 4(%esp)
movl $.LC2, (%esp)
call printf
movl 60(%esp), %eax
movl %eax, 16(%esp)
movl 56(%esp), %eax
movl %eax, 12(%esp)
movl 52(%esp), %eax
movl %eax, 8(%esp)
movl 44(%esp), %eax
movl %eax, 4(%esp)
movl 48(%esp), %eax
movl %eax, (%esp)
call calc
movl $0, %eax
leave
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
.LFE4:
.size main, .-main
.ident "GCC: (Ubuntu 4.8.4-2ubuntu1~14.04) 4.8.4"
.section .note.GNU-stack,"",#progbits

Ok, I've now had a chance to test this and verify that what I suggested in my comment works. Here's my modified version of the assembly code, with some comments to explain the things I've added/changed:
global _calc
extern _printf
; -----------------------------------------------------------------------
; extern void calc(int nlines, float* data1, float* data2,
;                  float* result1, float* result2)
; Syntax: NASM (Intel). ABI: 32-bit cdecl, arguments on the stack.
; Prints data1[0] once as a floating-point value and once as raw hex bits.
; Clobbers: eax, ecx, edx, x87 stack top, flags. esi is saved and restored.
; -----------------------------------------------------------------------
_calc:
section .data
.strf db "calc.asm: F data1[0]=%f", 10, 0
.strx db "calc.asm: X data1[0]=%X", 10, 0
section .text
        push    ebp
        mov     ebp, esp
        push    esi                 ; esi is callee-saved; it is used below to
                                    ; keep the raw bits across the first printf
        mov     ecx, [ebp + 12]     ; ecx = data1 (second argument)
        mov     esi, [ecx]          ; esi = bit pattern of data1[0]
        fld     dword [ecx]         ; Load a single-precision float onto the FP stack.
        sub     esp, 8              ; Make room for a double on the stack.
        fstp    qword [esp]         ; Store it as a double (varargs promote float
                                    ; to double) and pop it off the FP stack.
        push    .strf
        call    _printf
        add     esp, 12             ; 12 == sizeof(char*) + sizeof(double)
        push    esi
        push    .strx
        call    _printf
        add     esp, 8
        pop     esi                 ; restore the caller's esi
        leave
        ret

Division and modulus using single divl instruction (i386, amd64)

I was trying to come up with inline assembly for gcc to get both division and modulus using single divl instruction. Unfortunately, I am not that good at assembly. Could someone please help me on this? Thank you.

You're looking for something like this:
/* divl divides the 64-bit value high:low (EDX:EAX) by its operand and leaves
   the quotient in EAX and the remainder in EDX. The divisor must be a
   register or memory operand ("rm"): divl has no immediate form, so the
   broader "g" constraint could yield un-assemblable code when modulus is a
   compile-time constant. The instruction also modifies the flags ("cc"). */
__asm__("divl %2\n"
: "=d" (remainder), "=a" (quotient)
: "rm" (modulus), "d" (high), "a" (low)
: "cc");
Although I agree with the other commenters that usually GCC will do this for you and you should avoid inline assembly when possible, sometimes you need this construct.
For instance, if the high word is less than the modulus, then it is safe to perform the division like this. However, GCC isn't smart enough to realize this, because in the general case dividing a 64 bit number by a 32 bit number can lead to overflow, and so it calls to a library routine to do extra work. (Replace with 128 bit/64 bit for 64 bit ISAs.)

You shouldn't try to optimize this yourself. GCC already does this.
volatile int some_a = 18, some_b = 7;
int main(int argc, char *argv[]) {
int a = some_a, b = some_b;
printf("%d %d\n", a / b, a % b);
return 0;
}
Running
gcc -S test.c -O
yields
main:
.LFB11:
.cfi_startproc
subq $8, %rsp
.cfi_def_cfa_offset 16
movl some_a(%rip), %esi
movl some_b(%rip), %ecx
movl %esi, %eax
movl %esi, %edx
sarl $31, %edx
idivl %ecx
movl %eax, %esi
movl $.LC0, %edi
movl $0, %eax
call printf
movl $0, %eax
addq $8, %rsp
.cfi_def_cfa_offset 8
ret
Notice that the remainder, %edx, is not moved because it is also the third argument passed to printf.
EDIT: The 32-bit version is less confusing. Passing -m32 yields
main:
pushl %ebp
movl %esp, %ebp
andl $-16, %esp
subl $16, %esp
movl some_a, %eax
movl some_b, %ecx
movl %eax, %edx
sarl $31, %edx
idivl %ecx
movl %edx, 8(%esp)
movl %eax, 4(%esp)
movl $.LC0, (%esp)
call printf
movl $0, %eax
leave
ret

Fortunately, you don't have to resort to inline assembly to achieve this. gcc will do this automatically when it can.
$ cat divmod.c
struct sdiv { unsigned long quot; unsigned long rem; };
struct sdiv divide( unsigned long num, unsigned long divisor )
{
struct sdiv x = { num / divisor, num % divisor };
return x;
}
$ gcc -O3 -std=c99 -Wall -Wextra -pedantic -S divmod.c -o -
.file "divmod.c"
.text
.p2align 4,,15
.globl divide
.type divide, #function
divide:
.LFB0:
.cfi_startproc
movq %rdi, %rax
xorl %edx, %edx
divq %rsi
ret
.cfi_endproc
.LFE0:
.size divide, .-divide
.ident "GCC: (GNU) 4.4.4 20100630 (Red Hat 4.4.4-10)"
.section .note.GNU-stack,"",#progbits

Yes -- a divl will produce the quotient in eax and the remainder in edx. Using Intel syntax, for example:
mov eax, 17
mov ebx, 3
xor edx, edx
div ebx
; eax = 5
; edx = 2

Here is an example in linux kernel code about divl
/*
* do_div() is NOT a C function. It wants to return
* two values (the quotient and the remainder), but
* since that doesn't work very well in C, what it
* does is:
*
* - modifies the 64-bit dividend _in_place_
* - returns the 32-bit remainder
*
* This ends up being the most efficient "calling
* convention" on x86.
*/
#define do_div(n, base) \
({ \
unsigned long __upper, __low, __high, __mod, __base; \
__base = (base); \
if (__builtin_constant_p(__base) && is_power_of_2(__base)) { \
__mod = n & (__base - 1); \
n >>= ilog2(__base); \
} else { \
asm("" : "=a" (__low), "=d" (__high) : "A" (n));\
__upper = __high; \
if (__high) { \
__upper = __high % (__base); \
__high = __high / (__base); \
} \
asm("divl %2" : "=a" (__low), "=d" (__mod) \
: "rm" (__base), "0" (__low), "1" (__upper)); \
asm("" : "=A" (n) : "a" (__low), "d" (__high)); \
} \
__mod; \
})

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio

gcc optimization of struct assignment - gcc

Related

Why does gcc optimize out this comparison and how can I get it to get compiled in?

storage register for scanf call in gas

Can not compile Assembly (.s) code with Cygwin on Win

Load floating-point number from pointer to float and push on stack

Division and modulus using single divl instruction (i386, amd64)

Categories

Resources