Accessing condition flags in IA32

Is there any way I could access the value of a condition flag, such as the zero flag, and move that value into a register? Something like this:
cmpl %eax,%edx
movl ZF, %eax

So after surfing around on the web, I found that it is possible to do something similar:
Let the condition flags (Zero Flag (ZF), Carry Flag (CF), Sign Flag (SF), Overflow Flag (OF)) be the flags resulting from the operation cmpl %eax,%edx (which computes %edx - %eax and throws away the result). Then, with D an 8-bit register or memory byte:
sete D //Sets D to ZF -> %eax == %edx
setne D //Sets D to ~ZF -> %eax != %edx
setg D //Sets D to ~(SF ^ OF) & ~ZF -> %edx > %eax
setge D //Sets D to ~(SF ^ OF) -> %edx >= %eax
setl D //Sets D to SF ^ OF -> %edx < %eax
setle D //Sets D to (SF ^ OF) | ZF -> %edx <= %eax
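For example, to get the zero flag of a comparison into a full 32-bit register, as the question asks (a minimal sketch):
cmpl %eax, %edx        # sets flags from %edx - %eax
sete %al               # %al = ZF, i.e. 1 if %eax == %edx, else 0
movzbl %al, %eax       # zero-extend the byte into all of %eax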

leal and indirect addressing

Why does replacing
movl $84, 3(%rsi)
with
movl $3, %ecx
leal (%esi, %ecx, 1), %r11d
movl $84, (%r11d)
result in Segmentation fault (core dumped), and how can I solve it?
(I will be using %ecx as a counter later on to loop through the array)
As I understand it, movl $84, 3(%rsi) moves the literal value 84 (ASCII 'T') into result[3] (which works and is correct, based on the printed output:    24 = 02aTa)
On the other hand,
movl $3, %ecx
leal (%esi, %ecx, 1), %r11d
movl $84, (%r11d)
line 1 moves the integer value 3 into %ecx
line 2 calculates %esi + 1 * %ecx = %esi + %ecx, which should be the same as 3(%esi) (incorrect, as it does not work)
line 3 moves the integer value 84 ('T') to the address computed above (which should be the same as movl $84, 3(%rsi))
Replacing %esi with %rsi and so on (including using leaq, movq) in the non-working part, I get
*** stack smashing detected ***: ./a.out terminated
Aborted (core dumped)
So my question is: How can I achieve the effect of movl $84, %ecx(%esi)?
Compile with gcc main.c helper.s
main.c
#include <stdio.h>

void helper(unsigned int value, char *result);

int main() {
    char result[6];
    helper(24, result);
    printf("%5u = %s\n", 24, result);
    return 0;
}
helper.s
.globl helper
helper:
    # 0
    movl $48, 0(%rsi)
    # 2
    movl $50, 1(%rsi)
    # a
    movl $97, 2(%rsi)
#ifdef DEBUG
    # T
    movl $84, 3(%rsi)
#ELSE
/*
    movl $3, %ecx
    leal (%esi, %ecx, 1), %r11d
    # T
    movl $84, (%r11d)
*/
#ENDIF
    # a
    movl $97, 4(%rsi)
    movl $0x0000000000000000, 5(%rsi)
    ret
Replacing all %rsi with %esi (and all 64-bit registers with 32-bit registers), I get Segmentation fault (core dumped). Why is this?
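For reference, here is a sketch of what I believe fixes both crashes: on x86-64 the pointer arrives in the full 64-bit %rsi, so truncating it to %esi (or computing the address into %r11d) throws away the upper half of the address, and a single character should be stored with movb, since movl writes 4 bytes:
movq $3, %rcx               # counter, using the full 64-bit register
leaq (%rsi,%rcx,1), %r11    # %r11 = %rsi + %rcx, a valid 64-bit address
movb $84, (%r11)            # store one byte ('T'), not four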

Does gcc (or any other compiler) change (n%2==1) for (n&1==1)?

To test if a number is odd or even, it is my understanding that a test using
(n%2==1)
is the same thing as
(n&1==1)
I assume the first test is faster (please correct me if I'm wrong), but does any compiler recognize this and "correct" it? Does this make any difference in performance?
void main()
{
    int n = 5;
    int i = n & 1;
}
call __main
movl $5, -4(%rbp)
movl -4(%rbp), %eax
andl $1, %eax
movl %eax, -8(%rbp)
addq $48, %rsp
popq %rbp
ret
void main()
{
    int n = 5;
    int i = n % 2;
}
call __main
movl $5, -4(%rbp)
movl -4(%rbp), %eax
cltd
shrl $31, %edx
addl %edx, %eax
andl $1, %eax
subl %edx, %eax
movl %eax, -8(%rbp)
addq $48, %rsp
popq %rbp
ret
Tried with gcc.exe (GCC) 4.9.2 using -S -O0
So it seems that & 1 to check parity is slightly better.
Actually
(n%2==1)
is not the same as
(n&1==1)
if the type of n is signed int: for a negative odd n, n % 2 evaluates to -1 (C99 division truncates toward zero), so the comparison with 1 fails even though the low bit is set. Hence the compiler code (gcc 5.1, -Ofast, 64-bit):
int f(int n)
{
return (n % 2) == 1;
0: 89 f8 mov %edi,%eax
2: c1 e8 1f shr $0x1f,%eax
5: 01 c7 add %eax,%edi
7: 83 e7 01 and $0x1,%edi
a: 29 c7 sub %eax,%edi
c: 31 c0 xor %eax,%eax
e: 83 ff 01 cmp $0x1,%edi
11: 0f 94 c0 sete %al
}
14: c3 retq
So the main part looks like (pseudo code):
eax = edi;   // copy n
eax >>= 31;  // eax = sign bit of n (0 or 1)
edi += eax;  // bias negative values up by 1
edi &= 1;
edi -= eax;  // undo the bias: edi = n % 2, which can be -1
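In C terms, that sequence computes the truncating remainder (a sketch; the function name is mine):
int mod2_signed(int n) {
    int sign = (int)((unsigned int)n >> 31); /* 1 if n is negative, else 0 */
    return ((n + sign) & 1) - sign;          /* -1, 0 or 1, matching n % 2 */
}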
But if the type of n is unsigned int, it all looks great (gcc 5.1, -Ofast):
0000000000000000 <f>:
unsigned char f(unsigned int n)
{
return (n % 2) == 1;
0: 83 e7 01 and $0x1,%edi
}
3: 89 f8 mov %edi,%eax
5: c3 retq
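A quick test makes the signed/unsigned difference visible (C99 division truncates toward zero, so a negative odd n gives n % 2 == -1):
#include <stdio.h>

int main(void) {
    int n = -3;                       /* negative odd value */
    printf("%d\n", (n % 2) == 1);     /* prints 0: n % 2 is -1 */
    printf("%d\n", (n & 1) == 1);     /* prints 1: the low bit is set */
    return 0;
}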

Extended asm - Register constraints behaving oddly?

Context
Linux 64bit. GCC 4.8.2.
Gas assembly. AT&T syntax.
I just read this answer.
The code:
int operand1, operand2, sum, accumulator;
operand1 = 10; operand2 = 15;
__asm__ volatile ("movl %1, %0\n\t"
"addl %2, %0"
: "=r" (sum) /* output operands */
: "r" (operand1), "r" (operand2) /* input operands */
: "0"); /* clobbered operands */
accumulator = sum;
__asm__ volatile ("addl %1, %0\n\t"
"addl %2, %0"
: "=r" (accumulator)
: "0" (accumulator), "r" (operand1), "r" (operand2)
: "0");
Compiled with no optimizations of course.
I made my experiments with valgrind --tool=cachegrind ./my_bin
Actually, if I replace
"0" (accumulator), "r" (operand1), "r" (operand2)
With
"0" (accumulator), "m" (operand1), "m" (operand2)
I get one less instruction == one CPU cycle saved, because there is no register manipulation
Now, replacing
"0" (accumulator), "r" (operand1), "r" (operand2)
With
"r" (accumulator), "r" (operand1), "r" (operand2)
I get 1 cpu cycle shaved as well.
So
"r" (accumulator), "m" (operand1), "m" (operand2)
Saves 2 cpu cycles.
Questions
1) Why should we use at least one register if they slow things down? Is there really a risk of overwrite or something?
2) Why the heck does "0" instead of "r" slow things down? It is not logical to me since we just reference the same value (which is accumulator). GCC should not output different code! "r" could imply choosing another register -> nonsense && slow.
Without getting into an asm tutorial, I thought it might be better to look at code generation with and without optimization. I'm using OSX, which is basically the same ABI as x86-64 Linux.
First: you're finding sum <- op1 + op2,
followed by: acc <- sum; acc <- acc + op1 + op2,
which we can just replace with: acc <- sum + op1 + op2; we don't need acc = sum;
(this was broken by the way - op1, op2 are %2, %3 respectively, and %1 'aliases' %0)
This still isn't a particularly efficient use of inline assembly, but just to fix things up a bit into something that can be examined:
int test_fn (void)
{
    int op1 = 10, op2 = 15, sum, acc;

    __asm__ ("movl %k1, %k0\n\taddl %k2, %k0"
             : "=&r" (sum) : "r" (op1), "r" (op2));

    __asm__ ("addl %k2, %k0\n\taddl %k3, %k0"
             : "=r" (acc) : "0" (sum), "r" (op1), "r" (op2));

    return acc;
}
Without optimization: gcc -Wall -c -S src.c (comments are mine)
pushq %rbp
movq %rsp, %rbp
movl $10, -4(%rbp) # store 10 -> mem (op1)
movl $15, -8(%rbp) # store 15 -> mem (op2)
# asm(1)
movl -4(%rbp), %edx # load op1 -> reg (%1)
movl -8(%rbp), %ecx # load op2 -> reg (%2)
movl %edx, %eax # mov %1 to %0
addl %ecx, %eax # add %2 to %0
movl %eax, -12(%rbp) # store %0 -> mem (sum)
# asm(2)
movl -12(%rbp), %eax # load sum -> reg (%1 = %0)
movl -4(%rbp), %edx # load op1 -> reg (%2)
movl -8(%rbp), %ecx # load op2 -> reg (%3)
addl %edx, %eax # add %2 to %0
addl %ecx, %eax # add %3 to %0
movl %eax, -16(%rbp) # store %0 -> mem (acc)
movl -16(%rbp), %eax # load acc -> return value.
popq %rbp
ret
The compiler has made no effort to keep intermediate results in registers. It simply saves them back to temporary memory on the stack, and loads again as needed. It's fairly easy to follow though.
Let's apply your change to asm(2) inputs: "0" (sum), "m" (op1), "m" (op2)
...
# asm(2)
movl -4(%rbp), %eax # load sum -> reg (%1 = %0)
addl -12(%rbp), %eax # add op1 (mem) to %0
addl -16(%rbp), %eax # add op2 (mem) to %0
movl %eax, -8(%rbp) # store %0 -> mem (acc)
...
The memory locations are a bit different, but that doesn't matter. The fact that there's a form of add with reg <- reg + mem means we don't need to load to a register first. So indeed it does save an instruction, but we're still reading from and writing to memory.
With optimization: gcc -Wall -O2 -c -S src.c
movl $10, %edx
movl $15, %ecx
# asm(1)
movl %edx, %eax
addl %ecx, %eax
# asm(2)
addl %edx, %eax
addl %ecx, %eax
ret
There's no memory access. Everything is done in registers. That's as fast as it gets. No cache access, no main memory, etc. If we apply the change to use "m" constraints as we did in the unoptimized case:
movl $10, -8(%rsp)
movl $15, %ecx
movl $10, %edx
movl $15, -4(%rsp)
# asm(1)
movl %edx, %eax
addl %ecx, %eax
# asm(2)
addl -8(%rsp), %eax
addl -4(%rsp), %eax
ret
We're back to forcing the use of memory. Needlessly storing and loading operands for asm(2). It's not that valgrind was wrong - just the inference that register use was responsible for slowing things down.

shrl vs sarl (x86 GNU assembly)

I'm compiling my code with gcc and looking at the assembly. What exactly is this code doing?
shrl $20, %edx
leal (%edx,%eax), %eax
sarl 1, %eax
Say that the variable x is in the edx register, y is in eax, and both are 32-bit ints. What is going on here?
I know 'shrl $20, %edx' shifts %edx right by 20 bits, so it is the same as: edx/(2^20),
and then sarl is similar, so 'sarl 1, %eax' = eax/(2^1).
Is that right, and if so, what does leal do?
Assuming that sarl 1, %eax is really supposed to be sarl $1, %eax, then the whole thing equates to:
x = ((unsigned int) x) >> 20;
y = (x + y) >> 1;
The leal instruction here means: eax = eax + edx.
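As a sketch, a C function along these lines could plausibly produce that instruction sequence (the name and context are mine, not from the original code):
int combine(int x, int y) {
    /* shrl is a logical shift, so x is treated as unsigned here */
    x = (int)((unsigned int)x >> 20);
    /* leal adds the two registers; sarl $1 then halves the sum */
    return (x + y) >> 1;
}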

Division and modulus using single divl instruction (i386, amd64)

I was trying to come up with inline assembly for gcc to get both division and modulus using a single divl instruction. Unfortunately, I am not that good at assembly. Could someone please help me with this? Thank you.
You're looking for something like this:
__asm__("divl %2\n"
: "=d" (remainder), "=a" (quotient)
: "g" (modulus), "d" (high), "a" (low));
Although I agree with the other commenters that usually GCC will do this for you and you should avoid inline assembly when possible, sometimes you need this construct.
For instance, if the high word is less than the modulus, then it is safe to perform the division like this. However, GCC isn't smart enough to realize this, because in the general case dividing a 64-bit number by a 32-bit number can lead to overflow, so it calls a library routine to do the extra work. (Replace with 128-bit/64-bit for 64-bit ISAs.)
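Wrapped into a complete helper, the construct might look like this (a sketch; the names are mine, and the caller must guarantee high < modulus, otherwise divl raises a divide error, #DE):
#include <stdint.h>

/* 64-bit / 32-bit division with one divl: returns the quotient,
 * stores the remainder through *remainder. */
static inline uint32_t divmod64_32(uint32_t high, uint32_t low,
                                   uint32_t modulus, uint32_t *remainder)
{
    uint32_t quotient;
    __asm__("divl %4"
            : "=a" (quotient), "=d" (*remainder)
            : "a" (low), "d" (high), "rm" (modulus));
    return quotient;
}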
You shouldn't try to optimize this yourself. GCC already does this.
#include <stdio.h>

volatile int some_a = 18, some_b = 7;

int main(int argc, char *argv[]) {
    int a = some_a, b = some_b;
    printf("%d %d\n", a / b, a % b);
    return 0;
}
Running
gcc -S test.c -O
yields
main:
.LFB11:
.cfi_startproc
subq $8, %rsp
.cfi_def_cfa_offset 16
movl some_a(%rip), %esi
movl some_b(%rip), %ecx
movl %esi, %eax
movl %esi, %edx
sarl $31, %edx
idivl %ecx
movl %eax, %esi
movl $.LC0, %edi
movl $0, %eax
call printf
movl $0, %eax
addq $8, %rsp
.cfi_def_cfa_offset 8
ret
Notice that the remainder, %edx, is not moved because it is also the third argument passed to printf.
EDIT: The 32-bit version is less confusing. Passing -m32 yields
main:
pushl %ebp
movl %esp, %ebp
andl $-16, %esp
subl $16, %esp
movl some_a, %eax
movl some_b, %ecx
movl %eax, %edx
sarl $31, %edx
idivl %ecx
movl %edx, 8(%esp)
movl %eax, 4(%esp)
movl $.LC0, (%esp)
call printf
movl $0, %eax
leave
ret
Fortunately, you don't have to resort to inline assembly to achieve this. gcc will do this automatically when it can.
$ cat divmod.c
struct sdiv { unsigned long quot; unsigned long rem; };

struct sdiv divide( unsigned long num, unsigned long divisor )
{
    struct sdiv x = { num / divisor, num % divisor };
    return x;
}
$ gcc -O3 -std=c99 -Wall -Wextra -pedantic -S divmod.c -o -
.file "divmod.c"
.text
.p2align 4,,15
.globl divide
.type divide, @function
divide:
.LFB0:
.cfi_startproc
movq %rdi, %rax
xorl %edx, %edx
divq %rsi
ret
.cfi_endproc
.LFE0:
.size divide, .-divide
.ident "GCC: (GNU) 4.4.4 20100630 (Red Hat 4.4.4-10)"
.section .note.GNU-stack,"",@progbits
Yes -- a divl will produce the quotient in eax and the remainder in edx. Using Intel syntax, for example:
mov eax, 17
mov ebx, 3
xor edx, edx
div ebx
; eax = 5
; edx = 2
Here is an example from the Linux kernel code using divl:
/*
* do_div() is NOT a C function. It wants to return
* two values (the quotient and the remainder), but
* since that doesn't work very well in C, what it
* does is:
*
* - modifies the 64-bit dividend _in_place_
* - returns the 32-bit remainder
*
* This ends up being the most efficient "calling
* convention" on x86.
*/
#define do_div(n, base) \
({ \
unsigned long __upper, __low, __high, __mod, __base; \
__base = (base); \
if (__builtin_constant_p(__base) && is_power_of_2(__base)) { \
__mod = n & (__base - 1); \
n >>= ilog2(__base); \
} else { \
asm("" : "=a" (__low), "=d" (__high) : "A" (n));\
__upper = __high; \
if (__high) { \
__upper = __high % (__base); \
__high = __high / (__base); \
} \
asm("divl %2" : "=a" (__low), "=d" (__mod) \
: "rm" (__base), "0" (__low), "1" (__upper)); \
asm("" : "=A" (n) : "a" (__low), "d" (__high)); \
} \
__mod; \
})
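Usage follows the comment above: the dividend is modified in place and the remainder is returned (a short sketch):
uint64_t n = 12345;
uint32_t rem = do_div(n, 10);  /* afterwards n == 1234 and rem == 5 */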
