error: 'asm' operand has impossible constraints (A53) - gcc 9.3.1

This is code to write to a 64-bit address space. The issue showed up with gcc 9.3.1; earlier versions of gcc did not have it.
static inline void write_to_64bit_address(uint64_t address, uint32_t data)
{
    uint32_t address_upper = (uint32_t)(address >> 32);
    uint32_t address_lower = (uint32_t)(address);
    uint32_t smc_code = SMC_LONG_ADDRESS_WRITE_SINGLE;

    asm volatile("mov r0, %[smc_code]\n"
                 "mov r1, %[addr_upper]\n"
                 "mov r2, %[addr_lower]\n"
                 "mov r3, %[data_all]\n"
                 "mov r4, %[smc_zero]\n"
                 "mov r5, %[smc_zero]\n"
                 "mov r6, %[smc_zero]\n"
                 "smc #0\n"
                 :
                 : [smc_code] "r"(smc_code),
                   [addr_upper] "r"(address_upper),
                   [addr_lower] "r"(address_lower),
                   [data_all] "r"(data),
                   [smc_zero] "g"(SMC_ZERO)
                 : "r0", "r1", "r2", "r3", "r4", "r5", "r6");
}
I don't understand this assembly very well and am still learning. Can someone help?
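One possible workaround, sketched here untested and reusing the question's SMC_LONG_ADDRESS_WRITE_SINGLE and SMC_ZERO macros: bind each argument to its exact register with explicit register variables (the technique shown in the last answer below), so the template no longer needs the extra mov instructions or the r0-r6 clobber list that leaves the register allocator with too few free registers.

static inline void write_to_64bit_address(uint64_t address, uint32_t data)
{
    /* Sketch only: assumes the SMC call may modify r0-r3 and has no other
     * side effects beyond what is listed in the asm statement. */
    register uint32_t r0 asm("r0") = SMC_LONG_ADDRESS_WRITE_SINGLE;
    register uint32_t r1 asm("r1") = (uint32_t)(address >> 32);
    register uint32_t r2 asm("r2") = (uint32_t)address;
    register uint32_t r3 asm("r3") = data;
    register uint32_t r4 asm("r4") = SMC_ZERO;
    register uint32_t r5 asm("r5") = SMC_ZERO;
    register uint32_t r6 asm("r6") = SMC_ZERO;

    asm volatile("smc #0"
                 : "+r"(r0), "+r"(r1), "+r"(r2), "+r"(r3)  /* possibly clobbered */
                 : "r"(r4), "r"(r5), "r"(r6)
                 : "memory");
}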

Related

x86_64 Inline Assembly ; Copying 64-bit register directly to 64-bit memory location

I am running the code below and suffering from two problems:
1) The moment I change movl (used to copy values from registers) to movq, I get the gcc error: Error: operand size mismatch for movq. In plain assembly this seems to be possible by adding a qword prefix or the like, but that also fails to satisfy gcc.
uint64_t cpuid_0(uint64_t* _rax, uint64_t* _rbx, uint64_t* _rcx, uint64_t* _rdx){
    int a, b, c, d;
    *_rax = 0x0;
    __asm__ __volatile__
    (
        "movq $0, %%rax\n"
        "cpuid\n"
        "movl %%eax, %0\n"
        "movl %%ebx, %1\n"
        "movl %%ecx, %2\n"
        "movl %%edx, %3\n"
        : "=r" (a), "=r" (b), "=r" (c), "=r" (d)
        : "0" (a)
    );
    *_rax = a; *_rbx = b; *_rcx = c; *_rdx = d;
    return *_rax;
}
2) I want to eliminate the extra copy operations, so I modified the constraint specification in my code:
uint64_t cpuid_0(uint64_t* _rax, uint64_t* _rbx, uint64_t* _rcx, uint64_t* _rdx){
    int a, b, c, d;
    *_rax = 0x0;
    __asm__ __volatile__
    (
        "movq $0, %%rax\n"
        "cpuid\n"
        "movl %%eax, %0\n"
        "movl %%ebx, %1\n"
        "movl %%ecx, %2\n"
        "movl %%edx, %3\n"
        : "+m" (*_rax), "=m" (*_rbx), "=m" (*_rcx), "=m" (_rdx)
        : "0" (*_rax)
    );
    *_rax = a; *_rbx = b; *_rcx = c; *_rdx = d;
    return *_rax;
}
This gives me a host of errors like those below:
warning: matching constraint does not allow a register
error: inconsistent operand constraints in an ‘asm’
Also, I assume __volatile__ could be removed in this small code.
It's the input "0" (*_rax) which is foxing it... it seems that "0" does not work with a "=m" memory constraint, nor with "+m". (I do not know why.)
Changing your second function to compile and work:
uint32_t cpuid_0(uint32_t* _eax, uint32_t* _ebx, uint32_t* _ecx, uint32_t* _edx)
{
    __asm__
    (
        "mov $0, %%eax\n"
        "cpuid\n"
        "mov %%eax, %0\n"
        "mov %%ebx, %1\n"
        "mov %%ecx, %2\n"
        "mov %%edx, %3\n"
        : "=m" (*_eax), "=m" (*_ebx), "=m" (*_ecx), "=m" (*_edx)
        : //"0" (*_eax) -- not required and throws errors !!
        : "%rax", "%rbx", "%rcx", "%rdx"  // ESSENTIAL "clobbers"
    );
    return *_eax;
}
where that:
does everything as uint32_t, for consistency.
discards the redundant int a, b, c, d;
omits the "0" input, which in any case was not being used.
declares simple "=m" output for (*_eax)
"clobbers" all "%rax", "%rbx", "%rcx", "%rdx"
discards the redundant volatile.
The clobbers are essential, because without them the compiler has no idea that those registers are affected.
The above compiles to:
push %rbx # compiler (now) knows %rbx is "clobbered"
mov %rdx,%r8 # likewise %rdx
mov %rcx,%r9 # ditto %rcx
mov $0x0,%eax # the __asm__(....
cpuid
mov %eax,(%rdi)
mov %ebx,(%rsi)
mov %ecx,(%r8)
mov %edx,(%r9) # ....) ;
mov (%rdi),%eax
pop %rbx
retq
NB: without the "clobbers" compiles to:
mov $0x0,%eax
cpuid
mov %eax,(%rdi)
mov %ebx,(%rsi)
mov %ecx,(%rdx)
mov %edx,(%rcx)
mov (%rdi),%eax
retq
which is shorter, but sadly doesn't work !!
You could also (version 2):
struct cpuid
{
    uint32_t eax;
    uint32_t ebx;
    uint32_t ecx;
    uint32_t edx;
};

uint32_t cpuid_0(struct cpuid* cid)
{
    uint32_t eax;
    __asm__
    (
        "mov $0, %%eax\n"
        "cpuid\n"
        "mov %%ebx, %1\n"
        "mov %%ecx, %2\n"
        "mov %%edx, %3\n"
        : "=a" (eax), "=m" (cid->ebx), "=m" (cid->ecx), "=m" (cid->edx)
        :: "%ebx", "%ecx", "%edx"
    );
    return cid->eax = eax;
}
which compiles to something very slightly shorter:
push %rbx
mov $0x0,%eax
cpuid
mov %ebx,0x4(%rdi)
mov %ecx,0x8(%rdi)
mov %edx,0xc(%rdi)
pop %rbx
mov %eax,(%rdi)
retq
Or you could do something more like your first version (version 3):
uint32_t cpuid_0(struct cpuid* cid)
{
    uint32_t eax, ebx, ecx, edx;
    eax = 0;
    __asm__(" cpuid\n" : "+a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx));
    cid->edx = edx;
    cid->ecx = ecx;
    cid->ebx = ebx;
    return cid->eax = eax;
}
which compiles to:
push %rbx
xor %eax,%eax
cpuid
mov %ebx,0x4(%rdi)
mov %edx,0xc(%rdi)
pop %rbx
mov %ecx,0x8(%rdi)
mov %eax,(%rdi)
retq
This version uses the "+a", "=b" etc. magic to tell the compiler to allocate specific registers to the various variables. This reduces the amount of assembler to the bare minimum, which is generally a Good Thing. [Note that the compiler knows that xor %eax,%eax is better (and shorter) than mov $0,%eax and thinks there is some advantage to doing the pop %rbx earlier.]
Better yet -- following a comment by @Peter Cordes (version 4):
uint32_t cpuid_1(struct cpuid* cid)
{
    __asm__
    (
        "xor %%eax, %%eax\n"
        "cpuid\n"
        : "=a" (cid->eax), "=b" (cid->ebx), "=c" (cid->ecx), "=d" (cid->edx)
    );
    return cid->eax;
}
where the compiler figures out that cid->eax is already in %eax, and so compiles to:
push %rbx
xor %eax,%eax
cpuid
mov %ebx,0x4(%rdi)
mov %eax,(%rdi)
pop %rbx
mov %ecx,0x8(%rdi)
mov %edx,0xc(%rdi)
retq
which is the same as version 3, apart from a small difference in the order of the instructions.
FWIW: an __asm__() is defined to be:
asm asm-qualifiers (AssemblerTemplate : OutputOperands [ : InputOperands [ : Clobbers ] ] )
The key to inline assembler is to understand that the compiler:
has no idea what the AssemblerTemplate part means.
It does expand the %xx place holders, but understands nothing else.
does understand the OutputOperands, InputOperands (if any) and Clobbers (if any)...
...these tell the compiler what the assembler needs as parameters, and how to expand the various %xx.
...but these also tell the compiler what the AssemblerTemplate does, in terms that the compiler understands.
So, what the compiler understands is a sort of "data flow". It understands that the assembler takes a number of inputs, returns a number of outputs and (may) as a side effect "clobber" some registers and/or amounts of memory. Armed with this information, the compiler can integrate the "black box" assembler sequence with the code generated around it. Among other things the compiler will:
allocate registers for output and input operands
and arrange for the inputs to be in the required registers (as required).
NB: the compiler looks on the assembler as a single operation, where all inputs are consumed before any outputs are generated. If an input is not used after the __asm__(), the compiler can allocate a given register both as an input and as an output. Hence the need for the so-called "early clobber".
move the "black box" around wrt the surrounding code, maintaining the dependencies the assembler has on the sources of its inputs and the dependencies the code that follows has on the assembler's outputs.
discard the "black box" altogether if nothing seems to depend on its outputs !
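As a hypothetical illustration of that "single operation" model and why the early clobber exists (a sketch, not taken from the answer above):

static inline unsigned int add_one(unsigned int in)
{
    unsigned int out;
    /* The template writes %0 before it has finished reading %1.  Without the
     * '&', the compiler may pick the same register for both (it assumes all
     * inputs are consumed before any output is written), and the movl would
     * destroy 'in' before the addl reads it. */
    __asm__("movl $1, %0\n\t"
            "addl %1, %0"
            : "=&r" (out)   /* early clobber: force a register distinct from %1 */
            : "r" (in));
    return out;             /* in + 1 */
}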

GCC inline assembler "memory" clobber doesn't prevent the code from being rearranged on ARM

I read an article about the GCC inline assembler
(http://www.ethernut.de/en/documents/arm-inline-asm.html).
According to this article, a "memory" clobber forces the compiler to store all
cached values before executing the assembler instructions, to reload them
afterwards, and to retain the ordering.
This is the example:
The following code intends to multiply c with b, of which one or both
may be modified by an interrupt routine. Disabling interrupts before
accessing the variables and re-enabling them afterwards looks like a
good idea.
This may fail, because the optimizer may decide to do the
multiplication first and then execute both inline assembler
statements, or vice versa:
asm volatile("mrs r12, cpsr\n\t"
             "orr r12, r12, #0xC0\n\t"
             "msr cpsr_c, r12\n\t" ::: "r12", "cc");
c *= b; /* This may fail. */
asm volatile("mrs r12, cpsr\n"
             "bic r12, r12, #0xC0\n"
             "msr cpsr_c, r12" ::: "r12", "cc");
This is safe after adding the "memory" clobber:
asm volatile("mrs r12, cpsr\n\t"
             "orr r12, r12, #0xC0\n\t"
             "msr cpsr_c, r12\n\t" ::: "r12", "cc", "memory");
c *= b; /* This is safe. */
asm volatile("mrs r12, cpsr\n"
             "bic r12, r12, #0xC0\n"
             "msr cpsr_c, r12" ::: "r12", "cc", "memory");
But when I disassemble the code with objdump -d, the "memory" clobber doesn't work:
the generated code executes both inline assembler statements first and then
does the multiplication.
mrs ip, CPSR
orr ip, ip, #192 ; 0xc0
msr CPSR_c, ip
mrs ip, CPSR
bic ip, ip, #192 ; 0xc0
msr CPSR_c, ip
mul r0, r1, r0
mov pc, lr
Can anyone explain why the "memory" clobber doesn't work?
Note: this is the source code that may fail.
#include <stdio.h>

int main()
{
    int a = mul(20, 10);
    printf("%d a\n", a);
    return 0;
}

int mul(int b, int c)
{
    asm volatile("mrs r12, cpsr\n\t"
                 "orr r12, r12, #0xC0\n\t"
                 "msr cpsr_c, r12\n\t" ::: "r12", "cc");
    c *= b; /* This may fail. */
    asm volatile("mrs r12, cpsr\n"
                 "bic r12, r12, #0xC0\n"
                 "msr cpsr_c, r12" ::: "r12", "cc");
    return c;
}
This is the version that is safe.
#include <stdio.h>

int main()
{
    int a = mul(20, 10);
    printf("%d a\n", a);
    return 0;
}

int mul(int b, int c)
{
    asm volatile("mrs r12, cpsr\n\t"
                 "orr r12, r12, #0xC0\n\t"
                 "msr cpsr_c, r12\n\t" : "=X" (b) :: "r12", "cc");
    c *= b; /* This is safe. */
    asm volatile("mrs r12, cpsr\n"
                 "bic r12, r12, #0xC0\n"
                 "msr cpsr_c, r12" :: "X" (c) : "r12", "cc");
    return c;
}
Compile and disassemble commands:
lumotuwe#ubuntu:~/test_nfc$ arm-linux-gcc -O2 inline_asm.c
lumotuwe#ubuntu:~/test_nfc$ arm-linux-objdump -d a.out
Contrary to the question, the variables a, b and c cannot be modified by an interrupt, as they are local variables and there is no pointer to them.
If a pointer to the variables were stored in a global variable which an interrupt handler could use to access them, the "memory" clobber would ensure that accesses to the variables are not moved past the asm statements.
Either volatile or a "memory" clobber is required; there is no need for both.
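A minimal sketch of that point (hypothetical names; it assumes b and c really are reachable by an interrupt handler because they are globals): once the data is visible outside the function, the "memory" clobbers keep the multiplication between the two asm statements.

int b, c;   /* globals, so an interrupt handler can modify them */

int mul_protected(void)
{
    asm volatile("mrs r12, cpsr\n\t"
                 "orr r12, r12, #0xC0\n\t"
                 "msr cpsr_c, r12"
                 ::: "r12", "cc", "memory");   /* mask IRQ/FIQ */

    c *= b;   /* these loads and the store cannot be hoisted above or sunk below the asm */

    asm volatile("mrs r12, cpsr\n\t"
                 "bic r12, r12, #0xC0\n\t"
                 "msr cpsr_c, r12"
                 ::: "r12", "cc", "memory");   /* unmask IRQ/FIQ */

    return c;
}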

atomic64_read works in x86 but not for x64

The atomic64_read code below works in an x86 environment but fails for x64.
asm volatile(
    "mov %%ebx, %%eax\n"
    "mov %%ecx, %%edx\n"
    "lock cmpxchg8b %1\n"
    : "=&A" (ret)
    : "m" (v->counter64)
);
Interestingly, the lock operation refers to the register ecx in x86, but in x64 it refers to the rax register:
lock cmpxchg8b (%ecx) => x86
lock cmpxchg8b (%rax) => x64
I also tried to convert the above code as shown below, considering that rax and rcx are 64-bit registers. It correctly moves the value into the rax register but gives a segmentation fault at the lock instruction.
asm volatile(
    "mov %%rcx, %%rax\n"
    "lock cmpxchg8b %1\n"
    : "=&A" (ret)
    : "m" (v->counter64)
);
The original fails because the "A" constraint means rax/eax/ax/al and/or rdx/edx/dx/dl; on x64 only rdx is allocated for the result, so the mov instructions overwrite the address held in rax.
You can get the result in two halves:
uint32_t lo, hi;
asm volatile(
    "mov %%ebx, %%eax\n"
    "mov %%ecx, %%edx\n"
    "lock cmpxchg8b %2\n"
    : "=&a" (lo), "=&d" (hi)
    : "m" (v->counter64)
);
ret = lo | ((uint64_t)hi << 32);
However, would an ordinary read suffice?
ret = *(volatile uint64_t *)&v->counter64;
Or is the memory ordering insufficient?
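If a compiler-assisted load is acceptable, here is a sketch of that "ordinary read" idea using the GCC __atomic builtin (the type name atomic64_t is only illustrative; the question only shows the counter64 field). On x86-64 an aligned 64-bit load is a single instruction, so this typically compiles to a plain mov.

#include <stdint.h>

typedef struct { uint64_t counter64; } atomic64_t;   /* illustrative type */

static inline uint64_t atomic64_read(const atomic64_t *v)
{
    /* Aligned 64-bit loads are atomic on x86-64; the builtin also keeps the
     * compiler from tearing or reordering the access beyond the given ordering. */
    return __atomic_load_n(&v->counter64, __ATOMIC_SEQ_CST);
}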

Inline NOPs not optimized out in LLVM

I'm working through an example in this overview of compiling inline ARM assembly using GCC. Rather than GCC, I'm using llvm-gcc 4.2.1, and I'm compiling the following C code:
#include <stdio.h>

int main(void) {
    printf("Volatile NOP\n");
    asm volatile("mov r0, r0");
    printf("Non-volatile NOP\n");
    asm("mov r0, r0");
    return 0;
}
Using the following commands:
llvm-gcc -emit-llvm -c -o compiled.bc input.c
llc -O3 -march=arm -o output.s compiled.bc
My output.s ARM ASM file looks like this:
.syntax unified
.eabi_attribute 20, 1
.eabi_attribute 21, 1
.eabi_attribute 23, 3
.eabi_attribute 24, 1
.eabi_attribute 25, 1
.file "compiled.bc"
.text
.globl main
.align 2
.type main,%function
main: # #main
# BB#0: # %entry
str lr, [sp, #-4]!
sub sp, sp, #16
str r0, [sp, #12]
ldr r0, .LCPI0_0
str r1, [sp, #8]
bl puts
#APP
mov r0, r0
#NO_APP
ldr r0, .LCPI0_1
bl puts
#APP
mov r0, r0
#NO_APP
mov r0, #0
str r0, [sp, #4]
str r0, [sp]
ldr r0, [sp, #4]
add sp, sp, #16
ldr lr, [sp], #4
bx lr
# BB#1:
.align 2
.LCPI0_0:
.long .L.str
.align 2
.LCPI0_1:
.long .L.str1
.Ltmp0:
.size main, .Ltmp0-main
.type .L.str,%object # #.str
.section .rodata.str1.1,"aMS",%progbits,1
.L.str:
.asciz "Volatile NOP"
.size .L.str, 13
.type .L.str1,%object # #.str1
.section .rodata.str1.16,"aMS",%progbits,1
.align 4
.L.str1:
.asciz "Non-volatile NOP"
.size .L.str1, 17
The two NOPs are between their respective #APP/#NO_APP pairs. My expectation is that the asm() statement without the volatile keyword will be optimized out of existence due to the -O3 flag, but clearly both inline assembly statements survive.
Why does the asm("mov r0, r0") line not get recognized and removed as a NOP?
As Mystical and Mārtiņš Možeiko have described, the compiler does not optimize the code, i.e. change the instructions. What the compiler does optimize is when the instruction is scheduled. When you use volatile, the compiler will not re-schedule. In your example, re-scheduling would mean moving the statement before or after the printf.
The other optimization the compiler might make is to get C values into registers for you. Register allocation is very important to optimization. This doesn't optimize the assembler, but it allows the compiler to do sensible things with the other code within the function.
To see the effect of volatile, here is some sample code,
int example(int test, int add)
{
    int v1 = 5, v2 = 0;
    int i = 0;
    if (test) {
        asm volatile("add %0, %1, #7" : "=r" (v2) : "r" (v2));
        i += add * v1;
        i += v2;
    } else {
        asm ("add %0, %1, #7" : "=r" (v2) : "r" (v2));
        i += add * v1;
        i += v2;
    }
    return i;
}
The two branches have identical code except for the volatile. gcc 4.7.2 generates the following code for an ARM926,
example:
cmp r0, #0
bne 1f /* branch if test set? */
add r1, r1, r1, lsl #2
add r0, r0, #7 /* add seven delayed */
add r0, r0, r1
bx lr
1: mov r0, #0 /* test set */
add r0, r0, #7 /* add seven immediate */
add r1, r1, r1, lsl #2
add r0, r0, r1
bx lr
Note: The assembler branches are reversed relative to the 'C' code. The 2nd branch is slower on some processors due to pipelining. The compiler prefers that
add r1, r1, r1, lsl #2
add r0, r0, r1
do not execute sequentially.
The Ethernut ARM Tutorial is an excellent resource. However, optimize is a bit of an overloaded word. The compiler doesn't analyze the assembler, only the arguments and where the code will be emitted.
volatile is implied if the asm statement has no outputs declared.
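A hypothetical illustration of that rule (a sketch, not from the answers above): an asm statement whose only output is never used may be discarded by the optimizer, while one with no outputs is treated as volatile and is always emitted.

int demo(int x)
{
    int unused;
    /* Has an output nobody reads: the optimizer is allowed to delete it. */
    asm("add %0, %1, #1" : "=r" (unused) : "r" (x));
    /* No outputs: implicitly volatile, so it is always emitted. */
    asm("mov r0, r0");
    return x;
}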

How to specify an individual register as constraint in ARM GCC inline assembly?

In x86 inline assembly I can write something like this:
asm ("cpuid"
     : "=a" (_eax),
       "=b" (_ebx),
       "=c" (_ecx),
       "=d" (_edx)
     : "a" (op));
So in the matching constraints, instead of just writing "=r" and letting the compiler choose the register, I can say which particular register I want to use ("=a", for example, to use %eax).
How can I do this for ARM assembly?
The ARM GCC assembly cookbook http://www.ethernut.de/en/documents/arm-inline-asm.html states that I can, for example, use the constraints
"r" for one of the general purpose registers R0-R15
"w" for one of the VFP floating point registers S0-S31
But how can I constrain an operand, for example, exactly to s1? Or to a particular general purpose register?
I don't think gcc for ARM allows you to use constraints to specify exactly which register to use. However, you can use explicit register variables to specify a register to store a variable in:
register int my_variable asm("r0");
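For example (a minimal hypothetical sketch, not part of the original answer): combining such a variable with an ordinary "r" constraint pins the operand to the named register.

int add_one_in_r0(int op)
{
    register int in  asm("r0") = op;   /* input pinned to r0 */
    register int out asm("r1");        /* output pinned to r1 */
    asm("add r1, r0, #1" : "=r" (out) : "r" (in));
    return out;
}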
Explicit register variables minimal runnable example
Here is an ARMv8 Linux C freestanding hello world exemplifying https://stackoverflow.com/a/3936064/9160762 with some disassembly analysis:
main.c
#include <inttypes.h>

void _start(void) {
    uint64_t exit_status;
    /* write */
    {
        char msg[] = "hello syscall v8\n";
        uint64_t syscall_return;
        register uint64_t x0 __asm__ ("x0") = 1; /* stdout */
        register char *x1 __asm__ ("x1") = msg;
        register uint64_t x2 __asm__ ("x2") = sizeof(msg);
        register uint64_t x8 __asm__ ("x8") = 64; /* syscall number */
        __asm__ __volatile__ (
            "svc 0;"
            : "+r" (x0)
            : "r" (x1), "r" (x2), "r" (x8)
            : "memory"
        );
        syscall_return = x0;
        exit_status = (syscall_return != sizeof(msg));
    }
    /* exit */
    {
        register uint64_t x0 __asm__ ("x0") = exit_status;
        register uint64_t x8 __asm__ ("x8") = 93;
        __asm__ __volatile__ (
            "svc 0;"
            : "+r" (x0)
            : "r" (x8)
            :
        );
    }
}
GitHub upstream.
Compile and run:
sudo apt-get install qemu-user gcc-aarch64-linux-gnu
aarch64-linux-gnu-gcc -O3 -std=c99 -ggdb3 -march=armv8-a -pedantic -Wall -Wextra \
-ffreestanding -nostdlib -static -o main.out main.c
qemu-aarch64 main.out
Output:
hello syscall v8
Disassembly:
aarch64-linux-gnu-objdump -S main.out
Output:
main.out: file format elf64-littleaarch64
Disassembly of section .text:
0000000000400110 <_start>:
void _start(void) {
uint64_t exit_status;
/* write */
{
char msg[] = "hello syscall v8\n";
400110: 90000003 adrp x3, 400000 <_start-0x110>
400114: 91056063 add x3, x3, #0x158
void _start(void) {
400118: d10083ff sub sp, sp, #0x20
uint64_t syscall_return;
register uint64_t x0 __asm__ ("x0") = 1; /* stdout */
40011c: d2800020 mov x0, #0x1 // #1
register char *x1 __asm__ ("x1") = msg;
400120: 910023e1 add x1, sp, #0x8
register uint64_t x2 __asm__ ("x2") = sizeof(msg);
400124: d2800242 mov x2, #0x12 // #18
char msg[] = "hello syscall v8\n";
400128: a9401464 ldp x4, x5, [x3]
register uint64_t x8 __asm__ ("x8") = 64; /* syscall number */
40012c: d2800808 mov x8, #0x40 // #64
char msg[] = "hello syscall v8\n";
400130: 79402063 ldrh w3, [x3, #16]
400134: a90097e4 stp x4, x5, [sp, #8]
400138: 790033e3 strh w3, [sp, #24]
__asm__ __volatile__ (
40013c: d4000001 svc #0x0
: "+r" (x0)
: "r" (x1), "r" (x2), "r" (x8)
: "memory"
);
syscall_return = x0;
exit_status = (syscall_return != sizeof(msg));
400140: eb02001f cmp x0, x2
}
/* exit */
{
register uint64_t x0 __asm__ ("x0") = exit_status;
register uint64_t x8 __asm__ ("x8") = 93;
400144: d2800ba8 mov x8, #0x5d // #93
register uint64_t x0 __asm__ ("x0") = exit_status;
400148: 9a9f07e0 cset x0, ne // ne = any
__asm__ __volatile__ (
40014c: d4000001 svc #0x0
: "+r" (x0)
: "r" (x8)
:
);
}
}
400150: 910083ff add sp, sp, #0x20
400154: d65f03c0 ret
Attempt without explicit register variables
Mostly for fun, I tried to reach the same result without using register variables, but I was not able to do it.
In any case, the code would be more complicated, so you are better off just using register variables.
Here is my best attempt:
main.c
#include <inttypes.h>

void _start(void) {
    uint64_t exit_status;
    /* write */
    {
        char msg[] = "hello syscall v8\n";
        uint64_t syscall_return;
        __asm__ (
            "mov x0, 1;" /* stdout */
            "mov x1, %[msg];"
            "mov x2, %[len];"
            "mov x8, 64;" /* syscall number */
            "svc 0;"
            "mov %[syscall_return], x0;"
            : [syscall_return] "=r" (syscall_return)
            : [msg] "p" (msg),
              [len] "i" (sizeof(msg))
            : "x0", "x1", "x2", "x8", "memory"
        );
        exit_status = (syscall_return != sizeof(msg));
    }
    /* exit */
    __asm__ (
        "mov x0, %[exit_status];"
        "mov x8, 93;" /* syscall number */
        "svc 0;"
        :
        : [exit_status] "r" (exit_status)
        : "x0", "x8"
    );
}
GitHub upstream.
Disassembly:
main.out: file format elf64-littleaarch64
Disassembly of section .text:
0000000000400110 <_start>:
void _start(void) {
uint64_t exit_status;
/* write */
{
char msg[] = "hello syscall v8\n";
400110: 90000000 adrp x0, 400000 <_start-0x110>
400114: 9105a000 add x0, x0, #0x168
void _start(void) {
400118: d10083ff sub sp, sp, #0x20
char msg[] = "hello syscall v8\n";
40011c: a9400c02 ldp x2, x3, [x0]
400120: a9008fe2 stp x2, x3, [sp, #8]
400124: 79402000 ldrh w0, [x0, #16]
uint64_t syscall_return;
__asm__ (
400128: 910023e3 add x3, sp, #0x8
char msg[] = "hello syscall v8\n";
40012c: 790033e0 strh w0, [sp, #24]
__asm__ (
400130: d2800020 mov x0, #0x1 // #1
400134: aa0303e1 mov x1, x3
400138: d2800242 mov x2, #0x12 // #18
40013c: d2800808 mov x8, #0x40 // #64
400140: d4000001 svc #0x0
400144: aa0003e3 mov x3, x0
: [syscall_return] "=r" (syscall_return)
: [msg] "p" (msg),
[len] "i" (sizeof(msg))
: "x0", "x1", "x2", "x8", "memory"
);
exit_status = (syscall_return != sizeof(msg));
400148: f100487f cmp x3, #0x12
40014c: 9a9f07e1 cset x1, ne // ne = any
}
/* exit */
__asm__ (
400150: aa0103e0 mov x0, x1
400154: d2800ba8 mov x8, #0x5d // #93
400158: d4000001 svc #0x0
"svc 0;"
:
: [exit_status] "r" (exit_status)
: "x0", "x8"
);
}
40015c: 910083ff add sp, sp, #0x20
400160: d65f03c0 ret
This was less efficient for the following reasons:
the write's "p" constraint needs an intermediate register (x3) for the add to sp
I don't know how to get the syscall return status without an extra mov to an output register
the exit status gets moved one extra time through x1; with register variables it is calculated directly into x0.
Tested in Ubuntu 18.10, GCC 8.2.0, QEMU 2.12.
