GCC compilation with nostdlib flag for Aarch64 platform - gcc

I am trying to compile a binary with the -nostdlib flag on the Aarch64 platform.
I've dealt with this successfully on the x86-64 platform this way:
void _start() {
/* main body of program: call main(), etc */
/* exit system call */
asm("movl $1,%eax;"
"xorl %ebx,%ebx;"
"int $0x80"
);
}
Is there any analogue to do the same thing on the aarch64 platform (specifically, the exit system call)?

The example hereafter should work on an aarch64-linux-gnu system - it does work running under qemu-aarch64 3.0 on my x86_64 Linux system.
The most concise, loosely coupled source of information for learning purposes is, in my humble opinion, the musl-libc source code:
syscall_arch.h contains the __syscall functions to be used depending on the number of arguments required by a given syscall,
syscall.h.in contains the defines for all system calls.
We should then use:
static inline long __syscall1(long n, long a)
{
register long x8 __asm__("x8") = n;
register long x0 __asm__("x0") = a;
__asm_syscall("r"(x8), "0"(x0));
}
and __NR_exit:
#define __NR_exit 93
#define __NR_exit_group 94
A basic example in C would be exit-syscall.c:
#include "syscall_arch.h"
#include "syscall.h.in"
int main(void)
{
// exiting with return code 1.
__syscall1(__NR_exit, 1);
// we should have exited.
for (;;);
}
Compiling/executing/checking return code:
/opt/linaro/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/aarch64-linux-gnu-gcc -static -O0 -o exit-syscall exit-syscall.c
qemu-aarch64 exit-syscall
echo $?
1
A closer look at the generated code for main() and __syscall1(), using:
/opt/linaro/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/aarch64-linux-gnu-objdump -D exit-syscall > exit-syscall.lst
shows:
0000000000400554 <main>:
400554: a9bf7bfd stp x29, x30, [sp, #-16]!
400558: 910003fd mov x29, sp
40055c: d2800021 mov x1, #0x1 // #1
400560: d2800ba0 mov x0, #0x5d // #93
400564: 97fffff4 bl 400534 <__syscall1>
0000000000400534 <__syscall1>:
400534: d10043ff sub sp, sp, #0x10
400538: f90007e0 str x0, [sp, #8]
40053c: f90003e1 str x1, [sp]
400540: f94007e8 ldr x8, [sp, #8]
400544: f94003e0 ldr x0, [sp]
400548: d4000001 svc #0x0
40054c: 910043ff add sp, sp, #0x10
400550: d65f03c0 ret
See the document "Procedure Call Standard for the ARM 64-bit Architecture (AArch64)" for more information.
Therefore, an Aarch64 equivalent of your x86_64 code would be exit-asm.c:
void main(void) {
/* exit system call - calling NR_exit with 1 as the return code*/
asm("mov x0, #1;"
"mov x8, #93;"
"svc #0x0;"
);
for (;;);
}
/opt/linaro/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/aarch64-linux-gnu-gcc -static -o exit-asm exit-asm.c
qemu-aarch64 exit-asm
echo $?
1
Please note that the glibc implementation of exit() calls __NR_exit_group prior to calling __NR_exit.
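Since the original question builds with -nostdlib and provides its own _start, here is a minimal freestanding sketch that combines the pieces above. This is my own assembly of the ideas in this answer, not part of it: the syscall number 93 comes from the musl headers quoted above, and the explicit register variables and clobber list are simply a cautious way of writing the inline assembly.
/* exit-start.c - a sketch; build with something like:
   aarch64-linux-gnu-gcc -static -nostdlib -o exit-start exit-start.c */
void _start(void)
{
    /* main body of program: call main(), etc. */

    /* exit system call: __NR_exit (93) with status 1 */
    register long x8 __asm__("x8") = 93; /* syscall number */
    register long x0 __asm__("x0") = 1;  /* exit status */
    __asm__ __volatile__("svc #0"
                         : /* no outputs */
                         : "r"(x8), "r"(x0)
                         : "memory");
    for (;;);                            /* not reached */
}
Running it under qemu-aarch64 and checking echo $? should print 1, just like the examples above.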

Related

Capture input in assembly arm 64 bit mac os

Trying to capture two characters and a newline from user input.
The program prints the three hello-world strings to the screen, and then the user can type in some characters.
Everything seems to work, but it doesn't print the input.
I suspect it is due to the way I operate on the X1 register in the _read function, or to the way the buffer is allocated.
No errors are reported when running the code.
The code is compiled using the following command and should run on a Mac M1:
as HelloWorld.s -o HelloWorld.o && ld -macosx_version_min 12.0.0 -o HelloWorld HelloWorld.o -lSystem -syslibroot `xcrun -sdk macosx --show-sdk-path` -e _start -arch arm64 && ./HelloWorld
//HelloWorld.s
.equ SYS_WRITE, 4
.equ SYS_READ, 3
.equ NEWLN, 10
.global _start // Provide program starting address to linker
.align 2
// Setup the parameters to print hello world
// and then call Linux to do it.
_start:
adr X4, helloworld1
mov X1, X4
bl _sizeof
bl _print
adr X4, helloworld2
mov X1, X4
bl _sizeof
bl _print
adr X4, helloworld3
mov X1, X4
bl _sizeof
bl _print
bl _read
//mov X2, 4
// bl _sizeof
bl _print
_exit:
mov X0, X2 // Use 0 return code
mov X16, #1 // Service command code 1 terminates this program
svc 0 // Call MacOS to terminate the program
_sizeof: //X1 = address, X2 = out length, string must terminate with \n
str LR, [SP, #-16]! //Store registers
//str W0, [SP, #-16]!
mov X2, #0
__loop:
ldrb W0, [X1, X2] //load a byte into W0 (32 bit)
add X2, X2, #1 //Add 1 offset
cmp W0, NEWLN //Compare byte with \n return
bne __loop
//ldr W0, [SP], #16
ldr LR, [SP], #16 //Load registers
ret
_print: //X2 = length, X1 = address
str LR, [SP, #-16]! //Store registers
mov X0, #1 // 1 = StdOut
// mov X1, X1 // string to print
// mov X2, X2 // length of string
mov X16, SYS_WRITE // MacOS write system call
svc 0 // Call kernel to output the string
ldr LR, [SP], #16 //Load registers
ret
_read:
//3 AUE_NULL ALL { user_ssize_t read(int fd, user_addr_t cbuf, user_size_t nbyte); }
str LR, [SP, #-16]! //Store registers
adr X1, msg
mov X0, #0 // 0 = StdIn
ldr X1, [x1] // address to store string
mov X2, #4 // length
mov X16, SYS_READ // MacOS read system call
svc 0 // Call system
ldr LR, [SP], #16 //Load registers
ret
msg: .ds 4 //memory buffer for keyboard input
helloworld1: .ascii "Hello World\n"
helloworld2: .ascii "Happy new year for 2022\n"
helloworld3: .ascii "Welcome to AARCH64 assembly on Mac Silicon\n"
First you need to move msg to a writeable segment:
.data
msg: .ds 4 //memory buffer for keyboard input
.text // keep everything else in __TEXT
Then, because segments may be moved around arbitrarily at link-time, Apple's toolchain will no longer allow you to use adr to get the address of that buffer - you will have to use adrp and add:
adrp x1, msg#page
add x1, x1, msg#pageoff
If you want, you can tell the linker to please optimise this back to an adr if possible:
Lloh0:
adrp x1, msg#page
Lloh1:
add x1, x1, msg#pageoff
.loh AdrpAdd Lloh0, Lloh1
Then you need to remove this line:
ldr X1, [x1]
That would load the contents of the buffer, which would just be null bytes.
And finally, you should change the exit code passed in x0 to a constant:
mov x0, 0
The value in x2 will have been clobbered at this point, and you don't need it anyway.
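Putting those changes together, a sketch of the corrected _read (my own consolidation of the fixes above, keeping the msg#page / msg#pageoff syntax shown above and leaving the rest of the program untouched) would be:
_read:
    str  LR, [SP, #-16]!        // Store registers
    mov  X0, #0                 // 0 = StdIn
    adrp X1, msg#page           // page address of the buffer
    add  X1, X1, msg#pageoff    // X1 = &msg (no ldr: we want the address, not the contents)
    mov  X2, #4                 // length
    mov  X16, SYS_READ          // MacOS read system call
    svc  0                      // Call system
    ldr  LR, [SP], #16          // Load registers
    ret

.data
msg: .ds 4                      // writable buffer for keyboard input
.text                           // keep everything else in __TEXT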
As a reference for anyone in the future looking for an example of reading from standard input on Apple Silicon (M1), this code (based on the information above) works. It reads a string of up to 20 characters and prints it back out to standard output.
.global _start
.align 2
_start:
// READ IN FROM KEYBOARD
mov X16, 3 // Tell system we want to read from StdIn (#3)
mov X0, 0 // Focus on the keyboard (#0)
mov X2, 20 // Define length of string to read in
adrp x1, msg#page // Load the address of the message
add x1, x1, msg#pageoff // Store the address to x1
svc 0 // Call kernel to perform the action
_write:
mov X16, 4 // Tell system we want to write to StdOut (#4)
mov X0, 1 // Focus on the screen (#1)
adrp x1, msg#page // Load the address of the message
add x1, x1, msg#pageoff // Store the address to x1
svc 0 // Call kernel to perform the action
_end:
mov X0, 0 // Return 0 (get a run error without this)
mov X16, 1 // System call to terminate this program
svc 0 // Call kernel to perform the action
.data
msg:
.ds 20 // 20 bytes of memory for keyboard input
Your makefile should look like this:
temp: temp.o
ld -o temp temp.o -lSystem -syslibroot `xcrun -sdk macosx --show-sdk-path` -e _start -arch arm64
temp.o: temp.s
as -arch arm64 -o temp.o temp.s

ARM GCC hardfault when using -O2

When using the ARM GCC g++ compiler with optimization level -O2 (and up), this code:
void foo(void)
{
DBB("#0x%08X: 0x%08X", 1, *((uint32_t *)1));
DBB("#0x%08X: 0x%08X", 0, *((uint32_t *)0));
}
Compiles to:
0800abb0 <_Z3foov>:
800abb0: b508 push {r3, lr}
800abb2: 2301 movs r3, #1
800abb4: 4619 mov r1, r3
800abb6: 681a ldr r2, [r3, #0]
800abb8: 4802 ldr r0, [pc, #8] ; (800abc4 <_Z3foov+0x14>)
800abba: f007 fa83 bl 80120c4 <debug_print_blocking>
800abbe: 2300 movs r3, #0
800abc0: 681b ldr r3, [r3, #0]
800abc2: deff udf #255 ; 0xff
800abc4: 08022704 stmdaeq r2, {r2, r8, r9, sl, sp}
And this gives me a hardfault at the undefined instruction at 0x0800abc2.
Also, if there is more code after that, it is not compiled into the final binary.
The question is: why does the compiler generate it like that, why an undefined instruction?
By the way, it works fine for stuff like this:
...
uint32_t num = 2;
num -= 2;
DBB("#0x%08X: 0x%08X", 0, *((uint32_t *)num));
...
Compiler version:
arm-none-eabi-g++.exe (GNU Tools for ARM Embedded Processors 6-2017-q2-update) 6.3.1 20170620 (release) [ARM/embedded-6-branch revision 249437]
You can disable this behaviour (and verify this answer) by using -fno-delete-null-pointer-checks.
The pointer you are passing has a value which matches the null pointer, and the compiler can see that from static analysis, so it replaces the dereference with a trap (the undefined instruction), since dereferencing a null pointer is undefined behaviour.
In your second example, the static analysis doesn't identify a NULL.
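If you genuinely need to read address 0 (for instance the initial stack pointer slot of a Cortex-M vector table), a common workaround, sketched here on the assumption that DBB is a printf-style debug macro, is to launder the address through a volatile variable so that static analysis can no longer prove the dereference is a null-pointer access (formally it is still undefined behaviour, exactly like the num example in the question):
#include <stdint.h>

void read_address_zero(void)  /* hypothetical helper name */
{
    volatile uint32_t addr = 0;                  /* value is unknown to the optimizer */
    uint32_t value = *(volatile uint32_t *)addr; /* plain ldr, no udf emitted */
    DBB("#0x%08X: 0x%08X", 0, value);
}
Alternatively, just build with the -fno-delete-null-pointer-checks flag mentioned above.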

Inline NOPs not optimized out in LLVM

I'm working through an example in this overview of compiling inline ARM assembly using GCC. Rather than GCC, I'm using llvm-gcc 4.2.1, and I'm compiling the following C code:
#include <stdio.h>
int main(void) {
printf("Volatile NOP\n");
asm volatile("mov r0, r0");
printf("Non-volatile NOP\n");
asm("mov r0, r0");
return 0;
}
Using the following commands:
llvm-gcc -emit-llvm -c -o compiled.bc input.c
llc -O3 -march=arm -o output.s compiled.bc
My output.s ARM ASM file looks like this:
.syntax unified
.eabi_attribute 20, 1
.eabi_attribute 21, 1
.eabi_attribute 23, 3
.eabi_attribute 24, 1
.eabi_attribute 25, 1
.file "compiled.bc"
.text
.globl main
.align 2
.type main,%function
main: # #main
# BB#0: # %entry
str lr, [sp, #-4]!
sub sp, sp, #16
str r0, [sp, #12]
ldr r0, .LCPI0_0
str r1, [sp, #8]
bl puts
#APP
mov r0, r0
#NO_APP
ldr r0, .LCPI0_1
bl puts
#APP
mov r0, r0
#NO_APP
mov r0, #0
str r0, [sp, #4]
str r0, [sp]
ldr r0, [sp, #4]
add sp, sp, #16
ldr lr, [sp], #4
bx lr
# BB#1:
.align 2
.LCPI0_0:
.long .L.str
.align 2
.LCPI0_1:
.long .L.str1
.Ltmp0:
.size main, .Ltmp0-main
.type .L.str,%object # #.str
.section .rodata.str1.1,"aMS",%progbits,1
.L.str:
.asciz "Volatile NOP"
.size .L.str, 13
.type .L.str1,%object # #.str1
.section .rodata.str1.16,"aMS",%progbits,1
.align 4
.L.str1:
.asciz "Non-volatile NOP"
.size .L.str1, 17
The two NOPs are between their respective #APP/#NO_APP pairs. My expectation is that the asm() statement without the volatile keyword will be optimized out of existence due to the -O3 flag, but clearly both inline assembly statements survive.
Why does the asm("mov r0, r0") line not get recognized and removed as a NOP?
As Mystical and Mārtiņš Možeiko have described, the compiler does not optimize the code; i.e., it does not change the instructions. What the compiler does optimize is when the instruction is scheduled. When you use volatile, the compiler will not re-schedule. In your example, re-scheduling would mean moving the asm before or after the printf.
The other optimization the compiler might make is to keep C values in registers for you. Register allocation is very important to optimization. This doesn't optimize the assembler itself, but it allows the compiler to do sensible things with the other code within the function.
To see the effect of volatile, here is some sample code:
int example(int test, int add)
{
int v1=5, v2=0;
int i=0;
if(test) {
asm volatile("add %0, %1, #7" : "=r" (v2) : "r" (v2));
i+= add * v1;
i+= v2;
} else {
asm ("add %0, %1, #7" : "=r" (v2) : "r" (v2));
i+= add * v1;
i+= v2;
}
return i;
}
The two branches have identical code except for the volatile. gcc 4.7.2 generates the following code for an ARM926:
example:
cmp r0, #0
bne 1f /* branch if test set? */
add r1, r1, r1, lsl #2
add r0, r0, #7 /* add seven delayed */
add r0, r0, r1
bx lr
1: mov r0, #0 /* test set */
add r0, r0, #7 /* add seven immediate */
add r1, r1, r1, lsl #2
add r0, r0, r1
bx lr
Note: the assembler branches are reversed relative to the 'C' code. The second branch is slower on some processors due to pipelining. The compiler prefers that
add r1, r1, r1, lsl #2
add r0, r0, r1
do not execute sequentially.
The Ethernut ARM Tutorial is an excellent resource. However, optimize is a bit of an overloaded word. The compiler doesn't analyze the assembler, only the arguments and where the code will be emitted.
volatile is implied if the asm statement has no outputs declared.
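To illustrate that last point (my own sketch, not from the answer above): an asm statement that does declare an output, but whose result is never used, is exactly the case the optimizer is allowed to delete, whereas the output-less "mov r0, r0" statements in the question are implicitly volatile and therefore survive.
int nop_kept(int x)
{
    int y;
    /* output is used, so the asm statement is kept */
    asm("mov %0, %1" : "=r" (y) : "r" (x));
    return y;
}

int nop_removed(int x)
{
    int y;
    /* output y is dead: at -O2/-O3 the whole asm statement may be deleted */
    asm("mov %0, %1" : "=r" (y) : "r" (x));
    return x;
}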

Using GCC's builtin functions in arm

I'm working on a cortex-m3 board with a bare-metal toolchain without libc.
I implemented memcpy, which copies data byte by byte, but it's too slow. The GCC manual says it provides __builtin_memcpy, so I decided to use it. Here is the implementation using __builtin_memcpy.
#include <stddef.h>
void *memcpy(void *dest, const void *src, size_t n)
{
return __builtin_memcpy(dest,src,n);
}
When I compile this code, it becomes a recursive function which never ends.
$ arm-none-eabi-gcc -march=armv7-m -mcpu=cortex-m3 -mtune=cortex-m3 \
-O2 -ffreestanding -c memcpy.c -o memcpy.o
$ arm-none-eabi-objdump -d memcpy.o
memcpy.o: file format elf32-littlearm
Disassembly of section .text:
00000000 <memcpy>:
0: f7ff bffe b.w 0 <memcpy>
Am I doing something wrong? How can I use the compiler-generated memcpy version?
Builtin functions are not supposed to be used to implement themselves :)
Builtin functions are supposed to be used in application code - the compiler may then generate a special instruction sequence or a call to the underlying real function.
Compare:
int a [10], b [20];
void
foo ()
{
__builtin_memcpy (a, b, 10 * sizeof (int));
}
This results in:
foo:
stmfd sp!, {r4, r5}
ldr r4, .L2
ldr r5, .L2+4
ldmia r4!, {r0, r1, r2, r3}
mov ip, r5
stmia ip!, {r0, r1, r2, r3}
ldmia r4!, {r0, r1, r2, r3}
stmia ip!, {r0, r1, r2, r3}
ldmia r4, {r0, r1}
stmia ip, {r0, r1}
ldmfd sp!, {r4, r5}
bx lr
But:
void
bar (int n)
{
__builtin_memcpy (a, b, n * sizeof (int));
}
results in a call to the memcpy function:
bar:
mov r2, r0, asl #2
stmfd sp!, {r3, lr}
ldr r1, .L5
ldr r0, .L5+4
bl memcpy
ldmfd sp!, {r3, lr}
bx lr
Theoretically, the library is not part of the C compiler and not part of the toolchain.
Thus, if you write memcpy(&a,&b,sizeof(a)), the compiler MUST generate a subroutine call.
The idea of __builtin is to inform the compiler that the function is standard and can be optimized. Thus, if you write __builtin_memcpy(&a,&b,sizeof(a)), the compiler MAY generate a subroutine call, but in most cases it will not happen. For example, if the size is known to be 4 at compile time, only a single mov instruction is generated. (Another advantage: even in the case of a subroutine call, the compiler is informed that the library function has no unexpected side effects.)
So it's ALWAYS better to use __builtin_memcpy instead of memcpy. In modern libraries this is done by #define memcpy __builtin_memcpy right in string.h.
But you still need to implement memcpy somewhere; calls will be generated in unexpected places. For string functions on ARM, a 4-byte (word-at-a-time) implementation is strongly recommended.
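For reference, a word-at-a-time fallback along those lines might look like the sketch below (my own illustration; compile it, like the question's command, with -ffreestanding, and consider adding -fno-tree-loop-distribute-patterns so the optimizer cannot turn the copy loop back into a memcpy call):
#include <stddef.h>
#include <stdint.h>

void *memcpy(void *dest, const void *src, size_t n)
{
    uint8_t *d = dest;
    const uint8_t *s = src;

    /* copy a word (4 bytes) at a time while both pointers are aligned */
    if (((uintptr_t)d & 3u) == 0 && ((uintptr_t)s & 3u) == 0) {
        while (n >= 4) {
            *(uint32_t *)d = *(const uint32_t *)s;
            d += 4;
            s += 4;
            n -= 4;
        }
    }

    /* tail bytes, or the unaligned case */
    while (n--)
        *d++ = *s++;

    return dest;
}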

How to specify an individual register as constraint in ARM GCC inline assembly?

In x86 inline assembly I can write something like this:
asm ("cpuid"
: "=a" (_eax),
"=b" (_ebx),
"=c" (_ecx),
"=d" (_edx)
: "a" (op));
So in the matching constraints, instead of just writing "=r" and letting the compiler choose the register, I can say which particular register I want to use ("=a", for example, to use %eax).
How can I do this for ARM assembly?
The ARM GCC assembly cookbook http://www.ethernut.de/en/documents/arm-inline-asm.html states that I can, for example, use the constraints
"r" for one of the general purpose registers R0-R15
"w" for one of the VFP floating point registers S0-S31
But how can I constrain an operand to exactly s1, for example, or to a particular general-purpose register?
I don't think gcc for ARM allows you to use constraints to specify exactly which register to use. However, you can use explicit register variables to specify a register to store a variable in:
register int my_variable asm("r0");
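For example (a sketch of my own, with made-up function names), tying operands to r0/r4 or to the VFP register s1 looks like this; the "r"/"w" constraints then use exactly those registers:
int add_in_r0_r4(int x)
{
    register int r0 asm("r0") = x;   /* operand forced into r0 */
    register int r4 asm("r4") = 42;  /* operand forced into r4 */
    asm("add %0, %0, %1" : "+r" (r0) : "r" (r4));
    return r0;
}

float double_in_s1(float f)          /* needs a VFP-enabled -mfpu/-mfloat-abi */
{
    register float s1 asm("s1") = f; /* operand forced into s1 */
    asm("vadd.f32 %0, %0, %0" : "+w" (s1));
    return s1;
}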
Explicit register variables minimal runnable example
Here is an ARMv8 Linux C freestanding hello world exemplifying https://stackoverflow.com/a/3936064/9160762 with some disassembly analysis:
main.c
#include <inttypes.h>
void _start(void) {
uint64_t exit_status;
/* write */
{
char msg[] = "hello syscall v8\n";
uint64_t syscall_return;
register uint64_t x0 __asm__ ("x0") = 1; /* stdout */
register char *x1 __asm__ ("x1") = msg;
register uint64_t x2 __asm__ ("x2") = sizeof(msg);
register uint64_t x8 __asm__ ("x8") = 64; /* syscall number */
__asm__ __volatile__ (
"svc 0;"
: "+r" (x0)
: "r" (x1), "r" (x2), "r" (x8)
: "memory"
);
syscall_return = x0;
exit_status = (syscall_return != sizeof(msg));
}
/* exit */
{
register uint64_t x0 __asm__ ("x0") = exit_status;
register uint64_t x8 __asm__ ("x8") = 93;
__asm__ __volatile__ (
"svc 0;"
: "+r" (x0)
: "r" (x8)
:
);
}
}
GitHub upstream.
Compile and run:
sudo apt-get install qemu-user gcc-aarch64-linux-gnu
aarch64-linux-gnu-gcc -O3 -std=c99 -ggdb3 -march=armv8-a -pedantic -Wall -Wextra \
-ffreestanding -nostdlib -static -o main.out main.c
qemu-aarch64 main.out
Output:
hello syscall v8
Disassembly:
aarch64-linux-gnu-objdump -S main.out
Output:
main.out: file format elf64-littleaarch64
Disassembly of section .text:
0000000000400110 <_start>:
void _start(void) {
uint64_t exit_status;
/* write */
{
char msg[] = "hello syscall v8\n";
400110: 90000003 adrp x3, 400000 <_start-0x110>
400114: 91056063 add x3, x3, #0x158
void _start(void) {
400118: d10083ff sub sp, sp, #0x20
uint64_t syscall_return;
register uint64_t x0 __asm__ ("x0") = 1; /* stdout */
40011c: d2800020 mov x0, #0x1 // #1
register char *x1 __asm__ ("x1") = msg;
400120: 910023e1 add x1, sp, #0x8
register uint64_t x2 __asm__ ("x2") = sizeof(msg);
400124: d2800242 mov x2, #0x12 // #18
char msg[] = "hello syscall v8\n";
400128: a9401464 ldp x4, x5, [x3]
register uint64_t x8 __asm__ ("x8") = 64; /* syscall number */
40012c: d2800808 mov x8, #0x40 // #64
char msg[] = "hello syscall v8\n";
400130: 79402063 ldrh w3, [x3, #16]
400134: a90097e4 stp x4, x5, [sp, #8]
400138: 790033e3 strh w3, [sp, #24]
__asm__ __volatile__ (
40013c: d4000001 svc #0x0
: "+r" (x0)
: "r" (x1), "r" (x2), "r" (x8)
: "memory"
);
syscall_return = x0;
exit_status = (syscall_return != sizeof(msg));
400140: eb02001f cmp x0, x2
}
/* exit */
{
register uint64_t x0 __asm__ ("x0") = exit_status;
register uint64_t x8 __asm__ ("x8") = 93;
400144: d2800ba8 mov x8, #0x5d // #93
register uint64_t x0 __asm__ ("x0") = exit_status;
400148: 9a9f07e0 cset x0, ne // ne = any
__asm__ __volatile__ (
40014c: d4000001 svc #0x0
: "+r" (x0)
: "r" (x8)
:
);
}
}
400150: 910083ff add sp, sp, #0x20
400154: d65f03c0 ret
Attempt without explicit register variables
Mostly for fun, I tried to reach the same result without using register variables, but I was not able to do it.
In any case, the code would be more complicated, so you are better off just using register variables.
Here is my best attempt:
main.c
#include <inttypes.h>
void _start(void) {
uint64_t exit_status;
/* write */
{
char msg[] = "hello syscall v8\n";
uint64_t syscall_return;
__asm__ (
"mov x0, 1;" /* stdout */
"mov x1, %[msg];"
"mov x2, %[len];"
"mov x8, 64;" /* syscall number */
"svc 0;"
"mov %[syscall_return], x0;"
: [syscall_return] "=r" (syscall_return)
: [msg] "p" (msg),
[len] "i" (sizeof(msg))
: "x0", "x1", "x2", "x8", "memory"
);
exit_status = (syscall_return != sizeof(msg));
}
/* exit */
__asm__ (
"mov x0, %[exit_status];"
"mov x8, 93;" /* syscall number */
"svc 0;"
:
: [exit_status] "r" (exit_status)
: "x0", "x8"
);
}
GitHub upstream.
Disassembly:
main.out: file format elf64-littleaarch64
Disassembly of section .text:
0000000000400110 <_start>:
void _start(void) {
uint64_t exit_status;
/* write */
{
char msg[] = "hello syscall v8\n";
400110: 90000000 adrp x0, 400000 <_start-0x110>
400114: 9105a000 add x0, x0, #0x168
void _start(void) {
400118: d10083ff sub sp, sp, #0x20
char msg[] = "hello syscall v8\n";
40011c: a9400c02 ldp x2, x3, [x0]
400120: a9008fe2 stp x2, x3, [sp, #8]
400124: 79402000 ldrh w0, [x0, #16]
uint64_t syscall_return;
__asm__ (
400128: 910023e3 add x3, sp, #0x8
char msg[] = "hello syscall v8\n";
40012c: 790033e0 strh w0, [sp, #24]
__asm__ (
400130: d2800020 mov x0, #0x1 // #1
400134: aa0303e1 mov x1, x3
400138: d2800242 mov x2, #0x12 // #18
40013c: d2800808 mov x8, #0x40 // #64
400140: d4000001 svc #0x0
400144: aa0003e3 mov x3, x0
: [syscall_return] "=r" (syscall_return)
: [msg] "p" (msg),
[len] "i" (sizeof(msg))
: "x0", "x1", "x2", "x8", "memory"
);
exit_status = (syscall_return != sizeof(msg));
400148: f100487f cmp x3, #0x12
40014c: 9a9f07e1 cset x1, ne // ne = any
}
/* exit */
__asm__ (
400150: aa0103e0 mov x0, x1
400154: d2800ba8 mov x8, #0x5d // #93
400158: d4000001 svc #0x0
"svc 0;"
:
: [exit_status] "r" (exit_status)
: "x0", "x8"
);
}
40015c: 910083ff add sp, sp, #0x20
400160: d65f03c0 ret
This was less efficient for the following reasons:
the write's "p" constraint needs an intermediate register x3 for the add to sp
I don't know how to get the syscall return status without an extra mov to an output register
the exit status gets moved one extra time, through x1; with register variables it is calculated directly into x0.
Tested in Ubuntu 18.10, GCC 8.2.0, QEMU 2.12.

Resources