leal and indirect addressing - gcc

Why is it that replacing
movl $84, 3(%rsi)
movl $3, %ecx
leal (%esi, %ecx, 1), %r11d
movl $84, (%r11d)
results in Segmentation fault (core dumped) and how can I solve it?
(I will be using %ecx as a counter later on to loop through the array)
As I understand it, movl $84, 3(%rsi) moves the literal value 84 (ASCII 'T') into result[3] (which works and is correct, based on the printed output:    24 = 02aTa)
On the other hand,
movl $3, %ecx
leal (%esi, %ecx, 1), %r11d
movl $84, (%r11d)
line 1 moves integer value 3 into %ecx
line 2 calculates %esi + 1 * %ecx = %esi + %ecx which is the same as 3(%esi) (incorrect, as it does not work)
line 3 moves integer value 84 ('T') into the address above (which should be the same as movl $84, 3(%rsi))
Replacing %esi with %rsi and so on (including using leaq, movq) in the non-working part, I get
*** stack smashing detected ***: ./a.out terminated
Aborted (core dumped)
So my question is: How can I achieve the effect of movl $84, %ecx(%esi)?
Compile with gcc main.c helper.s
#include <stdio.h>
void helper(unsigned int value, char * result);
int main() {
char result[6];
helper(24, result);
printf("%5u = %s\n", 24, result);
return 0;
.globl helper
# 0
movl $48, 0(%rsi)
# 2
movl $50, 1(%rsi)
# a
movl $97, 2(%rsi)
#ifdef DEBUG
# T
movl $84, 3(%rsi)
movl $3, %ecx
leal (%esi, %ecx, 1), %r11d
# T
movl $84, (%r11d)
# a
movl $97, 4(%rsi)
movl $0x0000000000000000, 5(%rsi)
Replacing all %rsi with %esi (and all 64-bit registers with 32-bit registers), I get Segmentation fault (core dumped). Why is this?


problem performing recursive call in quick sort assembly

I'm having a problem implementing quicksort in assembly: when I try to perform the second recursive call, qsort(arr,pi+1,high), it exits the recursion too early, and I don't know why.
I've tried using the stack and writing helper functions, but everything I've tried created new problems. When debugging, I see that I exit the function when %ebx = 4 and %ecx = 5 and the array is almost sorted: array = {10,30,40,50,70,90,80}. I only need to perform the last swap when %ebx = 5 and %ecx = 6, but it exits before that, and I don't understand why it won't go back to when %ecx was 6.
I thought perhaps the problem is in the comparisons between high and low, but if I change those, I have different problems (it exits the first recursive call too early). Does anyone have an idea what's wrong with my code?
here's my code:
array: .int 10, 80, 30, 90, 40, 50, 70
size: .int 7
.global main
movl $0, %ebx # low index
movl (size), %ecx # high index
dec %ecx # ecx = size -1
cmp %ecx, %ebx # high > low
jge end
movsx %ecx, %r13 # push high
movsx %ebx, %r12 # push low
call partition # rax now is the pivot
movq %r12, %rbx # pop low
movl %eax, %ecx # prepare new high = pi
subl $1, %ecx # high = pi-1
call qsort # qsort(arr,low,pi-1)
movl %eax,%ebx # push new low = pi+1
inc %ebx
call qsort # qsort(arr,pi+1,high)
movq %r12, %rax # get low
movq %r13, %rbx # get high
movl %eax, %ecx # j = low
subl $1, %eax # i = low-1
movl array(,%ebx,4), %edx # pivot = array[high]
cmp %ebx, %ecx # high > j
jge swap_greater
movl array(,%ecx,4), %esi # esi = array[j]
cmp %edx, %esi # pivot > array[j] ?
jge loops # jmp if array[j] > pivot
inc %eax # i++
# swap
movsx %esi, %rsi
pushq %rsi
movl array(,%eax,4), %edi # edi = array[i]
movl %edi, array(,%ecx,4) # array[j]=array[i]
pop %rsi
movl %esi, array(,%eax,4) # array[i]= previous array[j]
inc %ecx # j++
jmp swap_loop
inc %eax
movsx %edx, %rdx
pushq %rdx
movl array(,%eax,4), %edi # edi = array[i+1]
movl %edi, array(,%ebx,4) # array[high]=array[i+1]
pop %rdx
movl %edx, array(,%eax,4) # array[i+1]= previous array[high]
dec %eax
inc %eax
inc %ecx
movsx %ecx, %r13 # update high
Edit: as Peter Cordes has pointed out, I need to work with the stack in order to retrieve the parameters when I return from the recursive calls. After rewriting my code to use the stack, I have a new problem: I always pop the same values on the second recursive call. I don't understand why it doesn't do the "recursive folding" like in C. When I look at the stack there's something weird, since I'm not seeing the values I expect (I expect to see layers of rax-high-low-rax-high-low... etc.) but instead there are these numbers:
native process 42032 In: second_rec L?? PC: 0x4005ea
0x7fffffffdeb0: 0x0000000000000000 0x0000000000000001
0x7fffffffdec0: 0x0000000000000002 0x00000000004005ef
0x7fffffffded0: 0x0000000000000000 0x0000000000000002
0x7fffffffdee0: 0x0000000000000001 0x00000000004005df
0x7fffffffdef0: 0x0000000000000000 0x0000000000000001
0x7fffffffdf00: 0x0000000000000002 0x00000000004005df
0x7fffffffdf10: 0x0000000000000000 0x0000000000000002
0x7fffffffdf20: 0x0000000000000003 0x00000000004005df
0x7fffffffdf30: 0x0000000000000000 0x0000000000000003
0x7fffffffdf40: 0x0000000000000004 0x00007ffff7a05b97
0x7fffffffdf50: 0x0000000000000001 0x00007fffffffe028
0x7fffffffdf60: 0x0000000100008000 0x00000000004005a7
Perhaps I'm not pushing/popping at the right time, but from what I've understood, I'm pushing parameters before the call and popping them when I return from the call... Does anyone have an idea what I am doing wrong? The new code:
array: .int 10, 80, 30, 90, 40, 50, 70
size: .int 7
.global main
movl $0, %ebx # low index
movl (size), %ecx # high index
dec %ecx # ecx = size -1
cmp %ecx, %ebx # high > low
jge end
movsx %ecx, %rcx # prepare high
movsx %ebx, %rbx # prepare low
pushq %rcx # push high
pushq %rbx # push low
call partition # rax now is the pivot
popq %rbx # pop low
popq %rcx # pop high
movsx %eax, %rax
pushq %rax # push pivot
movl %eax, %ecx # prepare new high = pi
subl $1, %ecx # pi-=1
movsx %ecx, %rcx
pushq %rcx # push high
pushq %rbx # push low
call qsort # qsort(arr,low,pi-1)
popq %rbx # pop low
popq %rcx # pop high
popq %rax # pop pivot
movl %eax, %ecx # high = pi
addl $1, %ecx # high = pi+1
pushq %rax # push pivot
pushq %rcx # push high
pushq %rbx # push low
call qsort # qsort(arr,pi+1,high)
addq $24, %rsp
movq 8(%rsp), %rax # get low
movq 16(%rsp), %rbx # get high
movl %eax, %ecx # j = low
subl $1, %eax # i = low-1
movl array(,%ebx,4), %edx # pivot = array[high]
cmp %ebx, %ecx # high > j
jge swap_greater
movl array(,%ecx,4), %esi # esi = array[j]
cmp %edx, %esi # pivot > array[j] ?
jge loops # jmp if array[j] > pivot
inc %eax # i++
# swap
movsx %esi, %rsi
pushq %rsi
movl array(,%eax,4), %edi # edi = array[i]
movl %edi, array(,%ecx,4) # array[j]=array[i]
pop %rsi
movl %esi, array(,%eax,4) # array[i]= previous array[j]
inc %ecx # j++
jmp swap_loop
inc %eax
movsx %edx, %rdx
pushq %rdx
movl array(,%eax,4), %edi # edi = array[i+1]
movl %edi, array(,%ebx,4) # array[high]=array[i+1]
pop %rdx
movl %edx, array(,%eax,4) # array[i+1]= previous array[high]
dec %eax
inc %eax
Thank you very much everyone for your help!

Why does GCC not generate assembly code with 1 **imul** instruction for 1 multiplication in C?

I'm checking the assembly code of the following simple program on https://godbolt.org/ with parameters -Wall -m32 using gcc 6.3 and to my surprise the generated assembly code takes three multiplications to perform one multiplication in C as shown below.
int main(void){
int32_t ax=-854763;
int32_t bx=586478;
int64_t cx= (int64_t)ax*bx;
printf("%lld\n", cx);
The generated assembly code for the statement int64_t cx= (int64_t)ax*bx; is given below. There are two imull and one mull instructions.
movl -28(%ebp), %eax
movl %eax, %ecx
movl %eax, %ebx
sarl $31, %ebx
movl -32(%ebp), %eax
movl %ebx, %edi
imull %eax, %edi
movl %edx, %esi
imull %ecx, %esi
addl %edi, %esi
mull %ecx
leal (%esi,%edx), %ecx
movl %ecx, %edx
movl %eax, -40(%ebp)
movl %edx, -36(%ebp)
movl %eax, -40(%ebp)
movl %edx, -36(%ebp)
Is it possible to force the gcc compiler/assembler to do it with just one multiplication without using inline assembly? In the original code I have to perform many such multiplications, both within a single statement and across different statements.
Why does the compiler not perform it with one imul instruction? I have also checked different versions of gcc on https://godbolt.org/ but the result is the same: three multiplication instructions.

Load floating-point number from pointer to float and push on stack

This is a homework task. I've got a C program that calls a function calc(int, float*, float*, float*, float*) implemented with NASM. I want to do floating-point division with the data passed from C, but first I wanted to check if I access the data correctly.
This is an excerpt from the C program:
printf("read.c: F data1[0]=%f\n", data1[0]);
printf("read.c: X data1[0]=%X\n", *(int*)(&data1[0]));
calc(nlines, data1, data2, result1, result2);
For testing, I wanted to print out exactly the same from the assembler code, but whatever I tried, it wouldn't give me the right results. To be precise, outputting the %X format gives the same result, but the %f format gives some incredibly huge number.
global calc
extern printf
; -----------------------------------------------------------------------
; extern void calc(int nlines, float* data1, float* data2,
; float* result1, float* result2)
; -----------------------------------------------------------------------
section .data
.strf db "calc.asm: F data1[0]=%f", 10, 0
.strx db "calc.asm: X data1[0]=%X", 10, 0
section .text
enter 0, 0
; Move the value of float* data1 into ecx.
mov ecx, [esp + 12]
; Move the contents of data1[0] into esi.
mov esi, [ecx]
push esi
push .strf
call printf
add esp, 8
push esi
push .strx
call printf
add esp, 8
read.c: F data1[0]=20.961977
read.c: X data1[0]=41A7B221
calc.asm: F data1[0]=-8796958457989122902187458235483374032941932827208012972482327255932202912296419757153331437662235555722313731094096197990916443553479942683040096290755684437514827018615169352974748429901549205109479495668937369584705401541113350145698235773041651907978442730240007381959397006695721667307435228446926569472.000000
calc.asm: X data1[0]=41A7B221
I've also looked into fld, but I couldn't find out how I can push the loaded value on the stack. This didn't work:
; Move float* data1 into ecx
mov ecx, [esp + 12]
; Load the floating point number into esi.
fld dword [ecx]
fst esi
How to do it right?
I've stripped down read.c to this code
#include <stdio.h>
#include <stdlib.h>
#define MAXLINES 1024
extern void calc(int, float*, float*, float*, float*);
int main(int argc, char** argv)
int nlines;
float* data1 = malloc(sizeof(float)*MAXLINES);
float*data2, *results1, *results2;
printf("read.c: F data1[0]=%f\n", data1[0]);
printf("read.c: X data1[0]=%X\n", *(int*)(&data1[0]));
calc(nlines, data1, data2, results1, results2);
return 0;
and this is the assembler output:
.file "test.c"
.section .rodata
.string "read.c: F data1[0]=%f\n"
.string "read.c: X data1[0]=%X\n"
.globl main
.type main, #function
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
movl %esp, %ebp
.cfi_def_cfa_register 5
andl $-16, %esp
subl $64, %esp
movl $4096, (%esp)
call malloc
movl %eax, 44(%esp)
movl 44(%esp), %eax
flds (%eax)
fstpl 4(%esp)
movl $.LC0, (%esp)
call printf
movl 44(%esp), %eax
movl (%eax), %eax
movl %eax, 4(%esp)
movl $.LC1, (%esp)
call printf
movl 60(%esp), %eax
movl %eax, 16(%esp)
movl 56(%esp), %eax
movl %eax, 12(%esp)
movl 52(%esp), %eax
movl %eax, 8(%esp)
movl 44(%esp), %eax
movl %eax, 4(%esp)
movl 48(%esp), %eax
movl %eax, (%esp)
call calc
movl $0, %eax
.cfi_restore 5
.cfi_def_cfa 4, 4
.size main, .-main
.ident "GCC: (Ubuntu 4.8.4-2ubuntu1~14.04) 4.8.4"
.section .note.GNU-stack,"",#progbits
.string "read.c: F data1[0]=%f\n"
.string "read.c: X data1[0]=%X\n"
.globl main
.type main, #function
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
movl %esp, %ebp
.cfi_def_cfa_register 5
andl $-16, %esp
subl $64, %esp
movl 44(%esp), %eax
flds (%eax)
fstpl 4(%esp)
movl $.LC1, (%esp)
call printf
movl 44(%esp), %eax
movl (%eax), %eax
movl %eax, 4(%esp)
movl $.LC2, (%esp)
call printf
movl 60(%esp), %eax
movl %eax, 16(%esp)
movl 56(%esp), %eax
movl %eax, 12(%esp)
movl 52(%esp), %eax
movl %eax, 8(%esp)
movl 44(%esp), %eax
movl %eax, 4(%esp)
movl 48(%esp), %eax
movl %eax, (%esp)
call calc
movl $0, %eax
.cfi_restore 5
.cfi_def_cfa 4, 4
.size main, .-main
.ident "GCC: (Ubuntu 4.8.4-2ubuntu1~14.04) 4.8.4"
.section .note.GNU-stack,"",#progbits
Ok, I've now had a chance to test this and verify that what I suggested in my comment works. Here's my modified version of the assembly code, with some comments to explain the things I've added/changed:
global _calc
extern _printf
; -----------------------------------------------------------------------
; extern void calc(int nlines, float* data1, float* data2,
; float* result1, float* result2)
; -----------------------------------------------------------------------
section .data
.strf db "calc.asm: F data1[0]=%f", 10, 0
.strx db "calc.asm: X data1[0]=%X", 10, 0
section .text
enter 0, 0
; Move the value of float* data1 into ecx.
mov ecx, [esp + 12]
; Move the contents of data1[0] into esi.
mov esi, [ecx]
fld dword [ecx] ; Load a single-precision float onto the FP stack.
sub esp,8 ; Make room for a double on the stack.
fstp qword [esp] ; Store the top of the FP stack on the regular stack as
; a double, and pop it off the FP stack.
push .strf
call _printf
add esp, 12 ; 12 == sizeof(char*) + sizeof(double)
push esi
push .strx
call _printf
add esp, 8

Division and modulus using single divl instruction (i386, amd64)

I was trying to come up with inline assembly for gcc to get both division and modulus using a single divl instruction. Unfortunately, I am not that good at assembly. Could someone please help me with this? Thank you.
You're looking for something like this:
__asm__("divl %2\n"
: "=d" (remainder), "=a" (quotient)
: "g" (modulus), "d" (high), "a" (low));
Although I agree with the other commenters that usually GCC will do this for you and you should avoid inline assembly when possible, sometimes you need this construct.
For instance, if the high word is less than the modulus, then it is safe to perform the division like this. However, GCC isn't smart enough to realize this, because in the general case dividing a 64-bit number by a 32-bit number can lead to overflow, and so it calls a library routine to do extra work. (Replace with 128-bit/64-bit for 64-bit ISAs.)
You shouldn't try to optimize this yourself. GCC already does this.
volatile int some_a = 18, some_b = 7;
int main(int argc, char *argv[]) {
int a = some_a, b = some_b;
printf("%d %d\n", a / b, a % b);
return 0;
gcc -S test.c -O
subq $8, %rsp
.cfi_def_cfa_offset 16
movl some_a(%rip), %esi
movl some_b(%rip), %ecx
movl %esi, %eax
movl %esi, %edx
sarl $31, %edx
idivl %ecx
movl %eax, %esi
movl $.LC0, %edi
movl $0, %eax
call printf
movl $0, %eax
addq $8, %rsp
.cfi_def_cfa_offset 8
Notice that the remainder, %edx, is not moved because it is also the third argument passed to printf.
EDIT: The 32-bit version is less confusing. Passing -m32 yields
pushl %ebp
movl %esp, %ebp
andl $-16, %esp
subl $16, %esp
movl some_a, %eax
movl some_b, %ecx
movl %eax, %edx
sarl $31, %edx
idivl %ecx
movl %edx, 8(%esp)
movl %eax, 4(%esp)
movl $.LC0, (%esp)
call printf
movl $0, %eax
Fortunately, you don't have to resort to inline assembly to achieve this. gcc will do this automatically when it can.
$ cat divmod.c
struct sdiv { unsigned long quot; unsigned long rem; };
struct sdiv divide( unsigned long num, unsigned long divisor )
struct sdiv x = { num / divisor, num % divisor };
return x;
$ gcc -O3 -std=c99 -Wall -Wextra -pedantic -S divmod.c -o -
.file "divmod.c"
.p2align 4,,15
.globl divide
.type divide, #function
movq %rdi, %rax
xorl %edx, %edx
divq %rsi
.size divide, .-divide
.ident "GCC: (GNU) 4.4.4 20100630 (Red Hat 4.4.4-10)"
.section .note.GNU-stack,"",#progbits
Yes -- a divl will produce the quotient in eax and the remainder in edx. Using Intel syntax, for example:
mov eax, 17
mov ebx, 3
xor edx, edx
div ebx
; eax = 5
; edx = 2
Here is an example from the Linux kernel source that uses divl:
* do_div() is NOT a C function. It wants to return
* two values (the quotient and the remainder), but
* since that doesn't work very well in C, what it
* does is:
* - modifies the 64-bit dividend _in_place_
* - returns the 32-bit remainder
* This ends up being the most efficient "calling
* convention" on x86.
#define do_div(n, base) \
({ \
unsigned long __upper, __low, __high, __mod, __base; \
__base = (base); \
if (__builtin_constant_p(__base) && is_power_of_2(__base)) { \
__mod = n & (__base - 1); \
n >>= ilog2(__base); \
} else { \
asm("" : "=a" (__low), "=d" (__high) : "A" (n));\
__upper = __high; \
if (__high) { \
__upper = __high % (__base); \
__high = __high / (__base); \
} \
asm("divl %2" : "=a" (__low), "=d" (__mod) \
: "rm" (__base), "0" (__low), "1" (__upper)); \
asm("" : "=A" (n) : "a" (__low), "d" (__high)); \
} \
__mod; \

Can you convert this inline asm into non-inline one?

I came across this inline asm. I am not sure how it should look without this syntax... Could someone show it to me?
__asm__ volatile ("lock\n\tincl %0"
incl llvm_cbe_tmp__29
However, because the operand is specified abstractly, the compiler will generate the code needed to reference it, even if that means a load and store. As a result it is possible that more than two instructions or an addressing mode will be added.
Using gcc -S on this:
int main()
int *p;
asm volatile ("lock\n\tincl %0":"=m"(p):"m"(*(p)):"cc");
.type main, #function
leal 4(%esp), %ecx
andl $-16, %esp
pushl -4(%ecx)
pushl %ebp
movl %esp, %ebp
pushl %ecx
subl $20, %esp
movl -8(%ebp), %eax
# 4 "asm.c" 1
incl -8(%ebp)
# 0 "" 2
addl $20, %esp
popl %ecx
popl %ebp
leal -4(%ecx), %esp
