linked output sections overlap in address - gcc

I have this link script test.ld :
/* write for machine virt */
/* Linker script from the question: entry symbol _Reset, two memory regions,
   and .startup/.text/.data/.bss output sections followed by a 4kB stack. */
ENTRY(_Reset)
MEMORY
{
rm(rx) : ORIGIN = 0x00000000, LENGTH = 0x80000
ram (rwx) : ORIGIN = 0x40000000, LENGTH = 0x40000000
}
SECTIONS
{
. = 0x10000;
/* .startup is given an explicit address (the current location counter,
   0x10000); the objdump output shows _Reset at 0x10000. */
.startup . : { startup.o(.startup) }
/* NOTE(review): none of the sections below names a region with "> rm" or
   "> ram". The objdump output shows .text starting at 0x0 (the ORIGIN of rm)
   rather than right after .startup -- presumably the linker assigned it to a
   MEMORY region by attributes; confirm against the ld MEMORY documentation. */
.text : { *(.text) }
.data : { *(.data) }
.bss : { *(.bss COMMON) }
. = ALIGN(8);
. = . + 0x1000; /* 4kB of stack memory */
stack_top = .;
}
and startup code startup.s :
// Reset entry point (AArch64, GNU as): loads x5 via stack_top, writes it to
// SP_EL0, calls c_entry, then spins forever.
// NOTE(review): .startup is declared without section flags (e.g. "ax") --
// confirm the assembler/linker treat it as allocatable, executable code.
.section .startup
.global _Reset
_Reset:
// NOTE(review): this is the PC-relative literal form (objdump shows
// "ldr x5, 1080 <stack_top>"), so x5 receives the 8 bytes stored AT
// stack_top, not the address stack_top itself. If the address is wanted,
// "ldr x5, =stack_top" is the usual form -- confirm intent.
ldr x5, stack_top
// NOTE(review): this writes SP_EL0; whether that is the stack pointer
// c_entry will actually use depends on the current EL/SPSel -- confirm.
msr sp_el0, x5
bl c_entry
b . // branch-to-self: park here if c_entry returns
and test.c for print :
/* UART0 data register, accessed as memory-mapped I/O at 0x09000000.
   The cast keeps the volatile qualifier so that no unqualified pointer to
   the register ever exists. */
volatile unsigned int * const UART0DR = (volatile unsigned int *)0x09000000;

/*
 * Write the NUL-terminated string s to UART0DR, one character per store.
 * No status register is polled between stores.
 */
void print_uart0(const char *s) {
    while(*s != '\0') { /* Loop until end of string */
        *UART0DR = (unsigned int)(*s); /* Transmit char */
        s++; /* Next char */
    }
}

/*
 * C entry point, called from _Reset in startup.s.
 * Declared with (void) so this is a real prototype rather than an
 * old-style unprototyped declaration.
 */
void c_entry(void) {
    print_uart0("Hello world!\n");
}
This is how I compiled and linked the program.
aarch64-elf-as startup.s -o startup.o
aarch64-elf-gcc -c -g test.c -o test.o
aarch64-elf-ld -T test.ld test.o startup.o -o test.elf
aarch64-elf-objcopy -O binary test.elf test.bin
When I do objdump for the final result by aarch64-none-elf-objdump -D test.elf, I see this result:
test.elf: file format elf64-littleaarch64
Disassembly of section .startup:
0000000000010000 <_Reset>:
10000: 58f88405 ldr x5, 1080 <stack_top>
10004: d5184105 msr sp_el0, x5
10008: 97ffc00f bl 44 <c_entry>
1000c: 14000000 b 1000c <_Reset+0xc>
Disassembly of section .text:
0000000000000000 <print_uart0>:
0: d10043ff sub sp, sp, #0x10
4: f90007e0 str x0, [sp, #8]
8: 14000008 b 28 <print_uart0+0x28>
c: f94007e0 ldr x0, [sp, #8]
10: 39400001 ldrb w1, [x0]
14: d2a12000 mov x0, #0x9000000 // #150994944
18: b9000001 str w1, [x0]
1c: f94007e0 ldr x0, [sp, #8]
20: 91000400 add x0, x0, #0x1
24: f90007e0 str x0, [sp, #8]
28: f94007e0 ldr x0, [sp, #8]
2c: 39400000 ldrb w0, [x0]
30: 7100001f cmp w0, #0x0
34: 54fffec1 b.ne c <print_uart0+0xc> // b.any
38: d503201f nop
3c: 910043ff add sp, sp, #0x10
40: d65f03c0 ret
0000000000000044 <c_entry>:
44: a9bf7bfd stp x29, x30, [sp, #-16]!
48: 910003fd mov x29, sp
4c: 90000000 adrp x0, 0 <print_uart0>
50: 9101c000 add x0, x0, #0x70
54: 97ffffeb bl 0 <print_uart0>
58: d503201f nop
5c: a8c17bfd ldp x29, x30, [sp], #16
60: d65f03c0 ret
Disassembly of section .rodata:
0000000000000068 <UART0DR>:
68: 09000000 .inst 0x09000000 ; undefined
6c: 00000000 .inst 0x00000000 ; undefined
70: 6c6c6548 ldnp d8, d25, [x10, #-320]
74: 6f77206f umlal2 v15.4s, v3.8h, v7.h[3]
78: 21646c72 .inst 0x21646c72 ; undefined
7c: Address 0x000000000000007c is out of bounds.
I don't know why _Reset is correctly located at 0x10000, but the print_uart0 and c_entry functions are located at 0x00000000. According to the linker script, shouldn't the .text section be placed right after the .startup section?

_Reset is located at 0x10000 because your script explicitly requested it with the . = 0x10000; statement.
You should probably also tell the linker to place .startup together with, and at the beginning of, all the .text-related content:
/* write for machine virt */
/* Revised script: .startup input sections are merged into the .text output
   section, ahead of the .text input sections, so _Reset is the first code
   in .text. */
ENTRY(_Reset)
MEMORY
{
rm(rx) : ORIGIN = 0x00000000, LENGTH = 0x80000
ram (rwx) : ORIGIN = 0x40000000, LENGTH = 0x40000000
}
SECTIONS
{
.text : {
*(.startup)
*(.text)
}
.data : { *(.data) }
.bss : { *(.bss COMMON) }
. = ALIGN(8);
. = . + 0x1000; /* 4kB of stack memory */
/* NOTE(review): with this script the objdump output shows stack_top at
   0x1090, i.e. inside the rm (rx) region rather than ram -- confirm that
   region is writable on the target before using it as a stack. */
stack_top = .;
}
The result will be:
test.elf: file format elf64-littleaarch64
Disassembly of section .text:
0000000000000000 <_Reset>:
0: 58008485 ldr x5, 1090 <stack_top>
4: d5184105 msr sp_el0, x5
8: 94000014 bl 58 <c_entry>
c: 14000000 b c <_Reset+0xc>
0000000000000010 <print_uart0>:
10: d10043ff sub sp, sp, #0x10
14: f90007e0 str x0, [sp, #8]
18: 14000008 b 38 <print_uart0+0x28>
1c: f94007e0 ldr x0, [sp, #8]
20: 39400001 ldrb w1, [x0]
24: d2a12000 mov x0, #0x9000000 // #150994944
28: b9000001 str w1, [x0]
2c: f94007e0 ldr x0, [sp, #8]
30: 91000400 add x0, x0, #0x1
34: f90007e0 str x0, [sp, #8]
38: f94007e0 ldr x0, [sp, #8]
3c: 39400000 ldrb w0, [x0]
40: 7100001f cmp w0, #0x0
44: 54fffec1 b.ne 1c <print_uart0+0xc> // b.any
48: d503201f nop
4c: d503201f nop
50: 910043ff add sp, sp, #0x10
54: d65f03c0 ret
0000000000000058 <c_entry>:
58: a9bf7bfd stp x29, x30, [sp, #-16]!
5c: 910003fd mov x29, sp
60: 90000000 adrp x0, 0 <_Reset>
64: 91020000 add x0, x0, #0x80
68: 97ffffea bl 10 <print_uart0>
6c: d503201f nop
70: a8c17bfd ldp x29, x30, [sp], #16
74: d65f03c0 ret
Disassembly of section .rodata:
0000000000000078 <UART0DR>:
78: 09000000 .inst 0x09000000 ; undefined
7c: 00000000 udf #0
80: 6c6c6548 ldnp d8, d25, [x10, #-320]
84: 6f77206f umlal2 v15.4s, v3.8h, v7.h[3]
88: 21646c72 .inst 0x21646c72 ; undefined
8c: Address 0x000000000000008c is out of bounds.
Please note that if you want your code to start from 0x40000000, you have to specify it in the linker script by adding > ram to the definition of the .text section (same thing for the .data section):
/* write for machine virt */
/* Everything is placed in the ram region, so code starts at 0x40000000. */
ENTRY(_Reset)
MEMORY
{
    rm(rx) : ORIGIN = 0x00000000, LENGTH = 0x80000
    ram (rwx) : ORIGIN = 0x40000000, LENGTH = 0x40000000
}
SECTIONS
{
    /* Code: the reset stub first, then the remaining .text input sections. */
    .text : {
        *(.startup)
        *(.text)
    } > ram
    .data : {
        *(.data)
    } > ram
    /* Name the region explicitly here as well, so that .bss and the stack
       reserved after it do not depend on the linker's implicit region
       selection. */
    .bss : { *(.bss COMMON) } > ram
    . = ALIGN(8);
    . = . + 0x1000; /* 4kB of stack memory */
    stack_top = .;
}
The result will then be:
test.elf: file format elf64-littleaarch64
Disassembly of section .text:
0000000040000000 <_Reset>:
40000000: 58008485 ldr x5, 40001090 <stack_top>
40000004: d5184105 msr sp_el0, x5
40000008: 94000014 bl 40000058 <c_entry>
4000000c: 14000000 b 4000000c <_Reset+0xc>
0000000040000010 <print_uart0>:
40000010: d10043ff sub sp, sp, #0x10
40000014: f90007e0 str x0, [sp, #8]
40000018: 14000008 b 40000038 <print_uart0+0x28>
4000001c: f94007e0 ldr x0, [sp, #8]
40000020: 39400001 ldrb w1, [x0]
40000024: d2a12000 mov x0, #0x9000000 // #150994944
40000028: b9000001 str w1, [x0]
4000002c: f94007e0 ldr x0, [sp, #8]
40000030: 91000400 add x0, x0, #0x1
40000034: f90007e0 str x0, [sp, #8]
40000038: f94007e0 ldr x0, [sp, #8]
4000003c: 39400000 ldrb w0, [x0]
40000040: 7100001f cmp w0, #0x0
40000044: 54fffec1 b.ne 4000001c <print_uart0+0xc> // b.any
40000048: d503201f nop
4000004c: d503201f nop
40000050: 910043ff add sp, sp, #0x10
40000054: d65f03c0 ret
0000000040000058 <c_entry>:
40000058: a9bf7bfd stp x29, x30, [sp, #-16]!
4000005c: 910003fd mov x29, sp
40000060: 90000000 adrp x0, 40000000 <_Reset>
40000064: 91020000 add x0, x0, #0x80
40000068: 97ffffea bl 40000010 <print_uart0>
4000006c: d503201f nop
40000070: a8c17bfd ldp x29, x30, [sp], #16
40000074: d65f03c0 ret
Disassembly of section .rodata:
0000000040000078 <UART0DR>:
40000078: 09000000 .inst 0x09000000 ; undefined
4000007c: 00000000 udf #0
40000080: 6c6c6548 ldnp d8, d25, [x10, #-320]
40000084: 6f77206f umlal2 v15.4s, v3.8h, v7.h[3]
40000088: 21646c72 .inst 0x21646c72 ; undefined
4000008c: Address 0x000000004000008c is out of bounds.

Related

m1 mac, calling nanosleep from assembly

I would like to call something like nanosleep from assembly, using only SVC calls. But it is not obvious how to do it using only the limited information I have, this list of macos syscall call signatures:
https://opensource.apple.com/source/xnu/xnu-1504.3.12/bsd/kern/syscalls.master
I tried to figure out what C does when calling nanosleep and I was able to reduce it to this:
struct timespec { long tv_sec; long tv_nsec; };
/* The __asm("...") after the declarator is an assembler-name label, not
   inline assembly: it tells the compiler which symbol name to use for this
   function (the two adjacent string literals concatenate to "_nanosleep"). */
int nanosleep(const struct timespec *__rqtp, struct timespec *__rmtp)
__asm("_" "nanosleep" );
int main() {
/* request = { tv_sec = 3, tv_nsec = 1 }; remaining is left for nanosleep to fill */
struct timespec remaining, request = { 3, 1 };
int response = nanosleep(&request, &remaining); }
I am not sure what that __asm does, as it does not look like assembly. Anyway, I found an implementation of nanosleep in Apple's libc source code. It relies on a call to clock_get_time though, and that isn't defined in libc. I found a mention of clock_get_time in the XNU source code, but it is in a .defs file; I don't know what that is, and it doesn't seem to have an implementation.
In any case, is there some better documentation on the SVC calls, or some place I can find the assembly for the libc SVC implementation?
Any information or ideas are much appreciated.
First off, let's get the title of your question out of the way. The way you call nanosleep from assembly is like this:
// Build struct timespec { tv_sec = 3, tv_nsec = 0 } in a 16-byte stack slot
// and pass it to nanosleep(request, NULL).
sub sp, sp, 0x10 // reserve the timespec slot
mov x8, 3
stp x8, xzr, [sp] // 3 seconds, 0 nanoseconds
mov x0, sp // x0 = request
mov x1, xzr // x1 = remaining = NULL
bl _nanosleep
add sp, sp, 0x10 // release the timespec slot
You just use the libc implementation. For most purposes, you really shouldn't go to a deeper level, since a) on arm64 you're forced by the OS to link against libSystem (and thus libc) anyway, and b) because the Darwin kernel ABI(s) are not stable.
That said, let's look at how it works under the hood.
In the latest Libc source drop, we find that there are actually two implementations of nanosleep in gen/nanosleep.c: one for #if __DARWIN_UNIX03, which uses clock_get_time, and one for the other case, which uses mach_absolute_time and mach_wait_until. The one actually used in production is the former:
int nanosleep(const struct timespec *requested_time, struct timespec *remaining_time) {
kern_return_t kret;
int ret;
mach_timespec_t current;
mach_timespec_t completion;
if (__unix_conforming == 0)
__unix_conforming = 1;
#ifdef VARIANT_CANCELABLE
pthread_testcancel();
#endif /* VARIANT_CANCELABLE */
if ((requested_time == NULL) || (requested_time->tv_sec < 0) || (requested_time->tv_nsec >= NSEC_PER_SEC)) {
errno = EINVAL;
return -1;
}
if (remaining_time != NULL) {
/* once we add requested_time, this will be the completion time */
kret = clock_get_time(clock_port, &completion);
if (kret != KERN_SUCCESS) {
fprintf(stderr, "clock_get_time() failed: %s\n", mach_error_string(kret));
errno = EINVAL;
return -1;
}
}
ret = SEMWAIT_SIGNAL(clock_sem, MACH_PORT_NULL, 1, 1, (int64_t)requested_time->tv_sec, (int32_t)requested_time->tv_nsec);
if (ret < 0) {
if (errno == ETIMEDOUT) {
return 0;
} else if (errno == EINTR) {
if (remaining_time != NULL) {
ret = clock_get_time(clock_port, &current);
if (ret != KERN_SUCCESS) {
fprintf(stderr, "clock_get_time() failed: %s\n", mach_error_string(ret));
return -1;
}
/* This depends on the layout of a mach_timespec_t and timespec_t being equivalent */
ADD_MACH_TIMESPEC(&completion, requested_time);
/* We have to compare first, since mach_timespect_t contains unsigned integers */
if(CMP_MACH_TIMESPEC(&completion, &current) > 0) {
SUB_MACH_TIMESPEC(&completion, &current);
remaining_time->tv_sec = completion.tv_sec;
remaining_time->tv_nsec = completion.tv_nsec;
} else {
bzero(remaining_time, sizeof(*remaining_time));
}
}
} else {
errno = EINVAL;
}
}
return -1;
}
This ends up in /usr/lib/system/libsystem_c.dylib (re-exported by /usr/lib/libSystem.B.dylib). We can look at the assembly by either using dlopen/dlsym to dump the bytes, extracting the dylib from the dyld_shared_cache, or grabbing it from the ramdisk:
;-- _nanosleep:
0x0000e4d4 7f2303d5 pacibsp
0x0000e4d8 ff0301d1 sub sp, sp, 0x40
0x0000e4dc f44f02a9 stp x20, x19, [sp, 0x20]
0x0000e4e0 fd7b03a9 stp x29, x30, [sp, 0x30]
0x0000e4e4 fdc30091 add x29, sp, 0x30
0x0000e4e8 f30301aa mov x19, x1
0x0000e4ec f40300aa mov x20, x0
0x0000e4f0 ff7f01a9 stp xzr, xzr, [sp, 0x10]
0x0000e4f4 a80300d0 adrp x8, reloc.__unix_conforming
0x0000e4f8 080140f9 ldr x8, [x8]
0x0000e4fc 090140b9 ldr w9, [x8]
0x0000e500 69000035 cbnz w9, 0xe50c
0x0000e504 29008052 mov w9, 1
0x0000e508 090100b9 str w9, [x8]
0x0000e50c e5b00194 bl sym.imp.pthread_testcancel
0x0000e510 140300b4 cbz x20, 0xe570
0x0000e514 840240f9 ldr x4, [x20]
0x0000e518 c402f8b7 tbnz x4, 0x3f, 0xe570
0x0000e51c 850640f9 ldr x5, [x20, 8]
0x0000e520 08409952 mov w8, 0xca00
0x0000e524 4873a772 movk w8, 0x3b9a, lsl 16
0x0000e528 bf0008eb cmp x5, x8
0x0000e52c 22020054 b.hs 0xe570
0x0000e530 330300b4 cbz x19, 0xe594
0x0000e534 48040090 adrp x8, 0x96000
0x0000e538 08112c91 add x8, x8, 0xb04
0x0000e53c 000140b9 ldr w0, [x8]
0x0000e540 e1430091 add x1, sp, 0x10
0x0000e544 c7ae0194 bl sym.imp.clock_get_time
0x0000e548 40020034 cbz w0, 0xe590
0x0000e54c 080400d0 adrp x8, sym._gCRAnnotations
0x0000e550 08210f91 add x8, x8, 0x3c8
0x0000e554 130140f9 ldr x19, [x8]
0x0000e558 c2af0194 bl sym.imp.mach_error_string
0x0000e55c e00300f9 str x0, [sp]
0x0000e560 610300f0 adrp x1, 0x7d000
0x0000e564 21682b91 add x1, x1, 0xada
0x0000e568 e00313aa mov x0, x19
0x0000e56c 73100094 bl sym._fprintf
0x0000e570 84ad0194 bl sym.imp.__error
0x0000e574 c8028052 mov w8, 0x16
0x0000e578 080000b9 str w8, [x0]
0x0000e57c 00008012 mov w0, -1
0x0000e580 fd7b43a9 ldp x29, x30, [sp, 0x30]
0x0000e584 f44f42a9 ldp x20, x19, [sp, 0x20]
0x0000e588 ff030191 add sp, sp, 0x40
0x0000e58c ff0f5fd6 retab
0x0000e590 841640a9 ldp x4, x5, [x20]
0x0000e594 48040090 adrp x8, 0x96000
0x0000e598 08012c91 add x8, x8, 0xb00
0x0000e59c 000140b9 ldr w0, [x8]
0x0000e5a0 01008052 mov w1, 0
0x0000e5a4 22008052 mov w2, 1
0x0000e5a8 23008052 mov w3, 1
0x0000e5ac d1ad0194 bl sym.imp.__semwait_signal
0x0000e5b0 60feff36 tbz w0, 0x1f, 0xe57c
0x0000e5b4 73ad0194 bl sym.imp.__error
0x0000e5b8 080040b9 ldr w8, [x0]
0x0000e5bc 1ff10071 cmp w8, 0x3c
0x0000e5c0 61000054 b.ne 0xe5cc
0x0000e5c4 00008052 mov w0, 0
0x0000e5c8 eeffff17 b 0xe580
0x0000e5cc 6dad0194 bl sym.imp.__error
0x0000e5d0 080040b9 ldr w8, [x0]
0x0000e5d4 1f110071 cmp w8, 4
0x0000e5d8 c1fcff54 b.ne 0xe570
0x0000e5dc 13fdffb4 cbz x19, 0xe57c
0x0000e5e0 48040090 adrp x8, 0x96000
0x0000e5e4 08112c91 add x8, x8, 0xb04
0x0000e5e8 000140b9 ldr w0, [x8]
0x0000e5ec e1630091 add x1, sp, 0x18
0x0000e5f0 9cae0194 bl sym.imp.clock_get_time
0x0000e5f4 60010034 cbz w0, 0xe620
0x0000e5f8 080400d0 adrp x8, sym._gCRAnnotations
0x0000e5fc 08210f91 add x8, x8, 0x3c8
0x0000e600 130140f9 ldr x19, [x8]
0x0000e604 97af0194 bl sym.imp.mach_error_string
0x0000e608 e00300f9 str x0, [sp]
0x0000e60c 610300f0 adrp x1, 0x7d000
0x0000e610 21682b91 add x1, x1, 0xada
0x0000e614 e00313aa mov x0, x19
0x0000e618 48100094 bl sym._fprintf
0x0000e61c d8ffff17 b 0xe57c
0x0000e620 ea3f9952 mov w10, 0xc9ff
0x0000e624 4a73a772 movk w10, 0x3b9a, lsl 16
0x0000e628 880a40b9 ldr w8, [x20, 8]
0x0000e62c ec274229 ldp w12, w9, [sp, 0x10]
0x0000e630 0bc08652 mov w11, 0x3600
0x0000e634 ab8cb872 movk w11, 0xc465, lsl 16
0x0000e638 2801080b add w8, w9, w8
0x0000e63c 09010b0b add w9, w8, w11
0x0000e640 1f010a6b cmp w8, w10
0x0000e644 2bc1881a csel w11, w9, w8, gt
0x0000e648 89d58c1a cinc w9, w12, gt
0x0000e64c 8c0240b9 ldr w12, [x20]
0x0000e650 e81b40b9 ldr w8, [sp, 0x18]
0x0000e654 29010c0b add w9, w9, w12
0x0000e658 3f01086b cmp w9, w8
0x0000e65c 69000054 b.ls 0xe668
0x0000e660 ec1f40b9 ldr w12, [sp, 0x1c]
0x0000e664 05000014 b 0xe678
0x0000e668 c3010054 b.lo 0xe6a0
0x0000e66c ec1f40b9 ldr w12, [sp, 0x1c]
0x0000e670 7f010c6b cmp w11, w12
0x0000e674 6d010054 b.le 0xe6a0
0x0000e678 6b010c6b subs w11, w11, w12
0x0000e67c a5000054 b.pl 0xe690
0x0000e680 4a010b0b add w10, w10, w11
0x0000e684 4b050011 add w11, w10, 1
0x0000e688 29050051 sub w9, w9, 1
0x0000e68c e92f0229 stp w9, w11, [sp, 0x10]
0x0000e690 2801084b sub w8, w9, w8
0x0000e694 697d4093 sxtw x9, w11
0x0000e698 682600a9 stp x8, x9, [x19]
0x0000e69c b8ffff17 b 0xe57c
0x0000e6a0 7f7e00a9 stp xzr, xzr, [x19]
0x0000e6a4 b6ffff17 b 0xe57c
I don't think this is something you really want to implement yourself.
But either way, you noticed that it uses clock_get_time, which is not defined in that library. Indeed clock_get_time is in /usr/lib/system/libsystem_kernel.dylib:
;-- _clock_get_time:
0x00006440 7f2303d5 pacibsp
0x00006444 ff8301d1 sub sp, sp, 0x60
0x00006448 f44f04a9 stp x20, x19, [sp, 0x40]
0x0000644c fd7b05a9 stp x29, x30, [sp, 0x50]
0x00006450 fd430191 add x29, sp, 0x50
0x00006454 f30301aa mov x19, x1
0x00006458 f40300aa mov x20, x0
0x0000645c ff7f02a9 stp xzr, xzr, [sp, 0x20]
0x00006460 ff3b00b9 str wzr, [sp, 0x38]
0x00006464 ff1b00f9 str xzr, [sp, 0x30]
0x00006468 f5eaff97 bl sym._mig_get_reply_port
0x0000646c 480100f0 adrp x8, 0x31000
0x00006470 008547fd ldr d0, [x8, 0xf08]
0x00006474 e00700fd str d0, [sp, 8]
0x00006478 f4030229 stp w20, w0, [sp, 0x10]
0x0000647c 480100f0 adrp x8, 0x31000
0x00006480 008947fd ldr d0, [x8, 0xf10]
0x00006484 e00f00fd str d0, [sp, 0x18]
0x00006488 057c60d3 lsl x5, x0, 0x20
0x0000648c e303142a mov w3, w20
0x00006490 037c60b3 bfi x3, x0, 0x20, 0x20
0x00006494 e0230091 add x0, sp, 8
0x00006498 610080d2 mov x1, 3
0x0000649c 4100c0f2 movk x1, 2, lsl 32
0x000064a0 62a282d2 mov x2, 0x1513
0x000064a4 0203c0f2 movk x2, 0x18, lsl 32
0x000064a8 047dc0d2 mov x4, 0x3e800000000
0x000064ac 86068052 mov w6, 0x34
0x000064b0 070080d2 mov x7, 0
0x000064b4 e8300094 bl sym._mach_msg2_internal
0x000064b8 f40300aa mov x20, x0
0x000064bc c8ff9f52 mov w8, 0xfffe
0x000064c0 e8ffbd72 movk w8, 0xefff, lsl 16
0x000064c4 0800080b add w8, w0, w8
0x000064c8 1f390071 cmp w8, 0xe
0x000064cc 29008052 mov w9, 1
0x000064d0 2821c81a lsl w8, w9, w8
0x000064d4 69008852 mov w9, 0x4003
0x000064d8 0801090a and w8, w8, w9
0x000064dc 0499407a ccmp w8, 0, 4, ls
0x000064e0 21040054 b.ne 0x6564
0x000064e4 94020035 cbnz w20, 0x6534
0x000064e8 e81f40b9 ldr w8, [sp, 0x1c]
0x000064ec 1f1d0171 cmp w8, 0x47
0x000064f0 80020054 b.eq 0x6540
0x000064f4 1f311171 cmp w8, 0x44c
0x000064f8 81020054 b.ne 0x6548
0x000064fc e80b40b9 ldr w8, [sp, 8]
0x00006500 c802f837 tbnz w8, 0x1f, 0x6558
0x00006504 e80f40b9 ldr w8, [sp, 0xc]
0x00006508 1fb10071 cmp w8, 0x2c
0x0000650c 20020054 b.eq 0x6550
0x00006510 1f910071 cmp w8, 0x24
0x00006514 21020054 b.ne 0x6558
0x00006518 e82b40b9 ldr w8, [sp, 0x28]
0x0000651c e91340b9 ldr w9, [sp, 0x10]
0x00006520 3f010071 cmp w9, 0
0x00006524 0409407a ccmp w8, 0, 4, eq
0x00006528 69258012 mov w9, -0x12c
0x0000652c 1411891a csel w20, w8, w9, ne
0x00006530 0b000014 b 0x655c
0x00006534 e01740b9 ldr w0, [sp, 0x14]
0x00006538 9bf6ff97 bl sym._mig_dealloc_reply_port
0x0000653c 0a000014 b 0x6564
0x00006540 74268012 mov w20, -0x134
0x00006544 06000014 b 0x655c
0x00006548 94258012 mov w20, -0x12d
0x0000654c 04000014 b 0x655c
0x00006550 e81340b9 ldr w8, [sp, 0x10]
0x00006554 28010034 cbz w8, 0x6578
0x00006558 74258012 mov w20, -0x12c
0x0000655c e0230091 add x0, sp, 8
0x00006560 9dedff97 bl sym._mach_msg_destroy
0x00006564 e00314aa mov x0, x20
0x00006568 fd7b45a9 ldp x29, x30, [sp, 0x50]
0x0000656c f44f44a9 ldp x20, x19, [sp, 0x40]
0x00006570 ff830191 add sp, sp, 0x60
0x00006574 ff0f5fd6 retab
0x00006578 f42b40b9 ldr w20, [sp, 0x28]
0x0000657c 14ffff35 cbnz w20, 0x655c
0x00006580 e8c342f8 ldur x8, [sp, 0x2c]
0x00006584 680200f9 str x8, [x19]
0x00006588 f7ffff17 b 0x6564
Again, likely something you wouldn't want to implement by yourself in assembly.
But where does this implementation come from? You already discovered that XNU only contains a .defs file. That file is a MIG (Mach Interface Generator) definitions file, which can be used to generate both client- and server-side code. To do that, you use the mig utility shipped with Xcode (also open source). For the clocks file, the invocation would look something like this:
mig -novouchers -DLIBSYSCALL_INTERFACE=1 -DPRIVATE=1 -DKERNEL_SERVER=1 -arch arm64e xnu-8792.61.2/osfmk/mach/clock.defs
That will generate a clock.h, clockServer.c and clockUser.c. We only care about the last one, as it contains the userland code for clock_get_time:
/* Routine clock_get_time */
mig_external kern_return_t clock_get_time
(
clock_serv_t clock_serv,
mach_timespec_t *cur_time
)
{
#ifdef __MigPackStructs
#pragma pack(push, 4)
#endif
typedef struct {
mach_msg_header_t Head;
} Request __attribute__((unused));
#ifdef __MigPackStructs
#pragma pack(pop)
#endif
#ifdef __MigPackStructs
#pragma pack(push, 4)
#endif
typedef struct {
mach_msg_header_t Head;
NDR_record_t NDR;
kern_return_t RetCode;
mach_timespec_t cur_time;
mach_msg_trailer_t trailer;
} Reply __attribute__((unused));
#ifdef __MigPackStructs
#pragma pack(pop)
#endif
#ifdef __MigPackStructs
#pragma pack(push, 4)
#endif
typedef struct {
mach_msg_header_t Head;
NDR_record_t NDR;
kern_return_t RetCode;
mach_timespec_t cur_time;
} __Reply __attribute__((unused));
#ifdef __MigPackStructs
#pragma pack(pop)
#endif
/*
* typedef struct {
* mach_msg_header_t Head;
* NDR_record_t NDR;
* kern_return_t RetCode;
* } mig_reply_error_t;
*/
union {
Request In;
Reply Out;
} Mess;
Request *InP = &Mess.In;
Reply *Out0P = &Mess.Out;
mach_msg_return_t msg_result;
#ifdef __MIG_check__Reply__clock_get_time_t__defined
kern_return_t check_result;
#endif /* __MIG_check__Reply__clock_get_time_t__defined */
__DeclareSendRpc(1000, "clock_get_time")
InP->Head.msgh_reply_port = mig_get_reply_port();
InP->Head.msgh_bits =
MACH_MSGH_BITS(19, MACH_MSG_TYPE_MAKE_SEND_ONCE);
InP->Head.msgh_size = (mach_msg_size_t)sizeof(Request);
InP->Head.msgh_request_port = clock_serv;
InP->Head.msgh_id = 1000;
InP->Head.msgh_reserved = 0;
__BeforeSendRpc(1000, "clock_get_time")
msg_result = mach_msg(&InP->Head, MACH_SEND_MSG|MACH_RCV_MSG|MACH_MSG_OPTION_NONE, (mach_msg_size_t)sizeof(Request), (mach_msg_size_t)sizeof(Reply), InP->Head.msgh_reply_port, MACH_MSG_TIMEOUT_NONE, MACH_PORT_NULL);
__AfterSendRpc(1000, "clock_get_time")
if (msg_result != MACH_MSG_SUCCESS) {
__MachMsgErrorWithoutTimeout(msg_result);
}
if (msg_result != MACH_MSG_SUCCESS) {
{ return msg_result; }
}
#if defined(__MIG_check__Reply__clock_get_time_t__defined)
check_result = __MIG_check__Reply__clock_get_time_t((__Reply__clock_get_time_t *)Out0P);
if (check_result != MACH_MSG_SUCCESS) {
mach_msg_destroy(&Out0P->Head);
{ return check_result; }
}
#endif /* defined(__MIG_check__Reply__clock_get_time_t__defined) */
*cur_time = Out0P->cur_time;
return KERN_SUCCESS;
}
(I omitted __MIG_check__Reply__clock_get_time_t here, but it is generated in the same file, right above that function.)
One thing of note here is the call to mach_msg though. In the assembly, we can see mach_msg2_internal being invoked, but the generated code simply calls mach_msg. If you took that code as-is and tried to run it on macOS 13 or iOS 16, it would not work. The reasons for this tie deeply into the security internals of Darwin and into a mitigation shipped in the most recent major release of Apple OSes. If you're interested in the story behind that, Luca Todesco covered that in his Hexacon presentation not long ago.
But by passing -DKERNEL_SERVER=1 to mig, we actually made it generate a little stub for us at the beginning of the file:
#include <TargetConditionals.h>
#if defined(MACH_SEND_AUX_TOO_SMALL) && (defined(__arm64__) || defined(__LP64__))
#undef mach_msg
#define mach_msg mig_mach_msg
static inline mach_msg_return_t
mig_mach_msg(
mach_msg_header_t *msg,
mach_msg_option_t option,
mach_msg_size_t send_size,
mach_msg_size_t rcv_size,
mach_port_name_t rcv_name,
mach_msg_timeout_t timeout,
mach_port_name_t notify)
{
(void)notify;
return mach_msg2(msg, option | MACH64_SEND_KOBJECT_CALL,
*msg, send_size, rcv_size, rcv_name, timeout, 0);
}
#endif
mach_msg2 isn't defined in the public SDK either (and is not a symbol in any library), but we can piece it together from osfmk/mach/message.h in the XNU source:
typedef uint64_t mach_msg_option64_t;
#define MACH64_SEND_MSG MACH_SEND_MSG
#define MACH64_MSG_VECTOR 0x0000000100000000ull
#define MACH64_SEND_KOBJECT_CALL 0x0000000200000000ull
#if defined(__LP64__) || defined(__arm64__)
__API_AVAILABLE(macos(13.0), ios(16.0), tvos(16.0), watchos(9.0))
__IOS_PROHIBITED __WATCHOS_PROHIBITED __TVOS_PROHIBITED
extern mach_msg_return_t mach_msg2_internal(
void *data,
mach_msg_option64_t option64,
uint64_t msgh_bits_and_send_size,
uint64_t msgh_remote_and_local_port,
uint64_t msgh_voucher_and_id,
uint64_t desc_count_and_rcv_name,
uint64_t rcv_size_and_priority,
uint64_t timeout);
__API_AVAILABLE(macos(13.0), ios(16.0), tvos(16.0), watchos(9.0))
__IOS_PROHIBITED __WATCHOS_PROHIBITED __TVOS_PROHIBITED
static inline mach_msg_return_t
mach_msg2(
void *data,
mach_msg_option64_t option64,
mach_msg_header_t header,
mach_msg_size_t send_size,
mach_msg_size_t rcv_size,
mach_port_t rcv_name,
uint64_t timeout,
uint32_t priority)
{
mach_msg_base_t *base;
mach_msg_size_t descriptors;
if (option64 & MACH64_MSG_VECTOR) {
base = (mach_msg_base_t *)((mach_msg_vector_t *)data)->msgv_data;
} else {
base = (mach_msg_base_t *)data;
}
if ((option64 & MACH64_SEND_MSG) &&
(base->header.msgh_bits & MACH_MSGH_BITS_COMPLEX)) {
descriptors = base->body.msgh_descriptor_count;
} else {
descriptors = 0;
}
#define MACH_MSG2_SHIFT_ARGS(lo, hi) ((uint64_t)hi << 32 | (uint32_t)lo)
return mach_msg2_internal(data, option64,
MACH_MSG2_SHIFT_ARGS(header.msgh_bits, send_size),
MACH_MSG2_SHIFT_ARGS(header.msgh_remote_port, header.msgh_local_port),
MACH_MSG2_SHIFT_ARGS(header.msgh_voucher_port, header.msgh_id),
MACH_MSG2_SHIFT_ARGS(descriptors, rcv_name),
MACH_MSG2_SHIFT_ARGS(rcv_size, priority), timeout);
#undef MACH_MSG2_SHIFT_ARGS
}
#endif
This then goes through mach_msg2_internal, which is defined in libsyscall/mach/mach_msg.c and compiled into libsystem_kernel.dylib, and that calls down to mach_msg2_trap, which, at long long last, is where the svc happens:
;-- _mach_msg2_trap:
0x00000d68 d0058092 mov x16, -0x2f
0x00000d6c 011000d4 svc 0x80
0x00000d70 c0035fd6 ret
I guess with this you could build your own nanosleep on macOS 13 or iOS 16 now. It wouldn't work on versions before those though (you'd have to use plain mach_msg there), and an implementation that did work on those wouldn't work on macOS 13 and iOS 16, so you probably really shouldn't. Just call the libc implementation.

Calling printf from aarch64 asm code on Apple M1 / MacOS

It appears that the usual approach to calling printf from aarch64 asm code that works just fine on Linux does not work on MacOS running on the Apple M1.
Is there any documentation that explains what has changed?
I find that the parameters that I put in x0..x2 are getting garbled in the printf output.
The Darwin arm64 ABI passes all varargs arguments on the stack, each padded to the next multiple of 8 bytes. (Types that don't fit into 8 bytes have a pointer passed instead. Regular arguments that don't fit into x0-x7/q0-q7 come before varargs on the stack, naturally aligned.)
Here's a simple example:
// main for Darwin arm64: calls printf("test: %x\n", 66) with the variadic
// argument passed on the stack, then returns 0.
.globl _main
.p2align 2
_main:
sub sp, sp, 0x20 // 0x10 for the vararg slot + 0x10 for x29/x30
stp x29, x30, [sp, 0x10] // save frame pointer and link register
mov x8, 66
str x8, [sp] // variadic argument: one 8-byte stack slot
adr x0, Lstr // x0 = format string
bl _printf
mov w0, wzr // return value 0
ldp x29, x30, [sp, 0x10] // restore frame pointer and link register
add sp, sp, 0x20
ret
Lstr:
.asciz "test: %x\n"
Note that this is different from non-varargs arguments to unprototyped functions that are passed on the stack, which are only padded up to 4 bytes (sizeof(int)). The following code:
#include <stdio.h>
#include <stdint.h>
extern void func();
__asm__
(
"_func:\n"
" ret\n"
);
int main(void)
{
uint8_t a = 1,
b = 2,
c = 3;
printf("%hhx %hhx %hhx %hhx %hhx %hhx\n", a, b, c, a, b, c);
func(a, b, c, a, b, c, a, b, c, a, b, c);
return 0;
}
compiles down to this with -O2:
;-- _main:
0x100003ee8 ff0301d1 sub sp, sp, 0x40
0x100003eec fd7b03a9 stp x29, x30, [sp, 0x30]
0x100003ef0 fdc30091 add x29, sp, 0x30
0x100003ef4 68008052 mov w8, 3
0x100003ef8 49008052 mov w9, 2
0x100003efc e92302a9 stp x9, x8, [sp, 0x20]
0x100003f00 2a008052 mov w10, 1
0x100003f04 e82b01a9 stp x8, x10, [sp, 0x10]
0x100003f08 ea2700a9 stp x10, x9, [sp]
0x100003f0c 20040010 adr x0, str._hhx__hhx__hhx__hhx__hhx__hhx_n
0x100003f10 1f2003d5 nop
0x100003f14 13000094 bl sym.imp.printf
0x100003f18 480080d2 mov x8, 2
0x100003f1c 6800c0f2 movk x8, 3, lsl 32
0x100003f20 690080d2 mov x9, 3
0x100003f24 2900c0f2 movk x9, 1, lsl 32
0x100003f28 e92300a9 stp x9, x8, [sp]
0x100003f2c 20008052 mov w0, 1
0x100003f30 41008052 mov w1, 2
0x100003f34 62008052 mov w2, 3
0x100003f38 23008052 mov w3, 1
0x100003f3c 44008052 mov w4, 2
0x100003f40 65008052 mov w5, 3
0x100003f44 26008052 mov w6, 1
0x100003f48 47008052 mov w7, 2
0x100003f4c e6ffff97 bl sym._func
0x100003f50 00008052 mov w0, 0
0x100003f54 fd7b43a9 ldp x29, x30, [sp, 0x30]
0x100003f58 ff030191 add sp, sp, 0x40
0x100003f5c c0035fd6 ret
Giving the function an actual prototype allows the removal of any padding (except the one that serves alignment purposes), like so (note the last argument being 8 bytes):
extern void func(uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t,
uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint64_t);
The code then compiles down to:
;-- _main:
0x100003ee4 ff4301d1 sub sp, sp, 0x50
0x100003ee8 f44f03a9 stp x20, x19, [sp, 0x30]
0x100003eec fd7b04a9 stp x29, x30, [sp, 0x40]
0x100003ef0 fd030191 add x29, sp, 0x40
0x100003ef4 73008052 mov w19, 3
0x100003ef8 54008052 mov w20, 2
0x100003efc f44f02a9 stp x20, x19, [sp, 0x20]
0x100003f00 28008052 mov w8, 1
0x100003f04 f32301a9 stp x19, x8, [sp, 0x10]
0x100003f08 e85300a9 stp x8, x20, [sp]
0x100003f0c 20040010 adr x0, str._hhx__hhx__hhx__hhx__hhx__hhx_n
0x100003f10 1f2003d5 nop
0x100003f14 13000094 bl sym.imp.printf
0x100003f18 68208052 mov w8, 0x103
0x100003f1c f30700f9 str x19, [sp, 8]
0x100003f20 f40b0039 strb w20, [sp, 2]
0x100003f24 e8030079 strh w8, [sp]
0x100003f28 20008052 mov w0, 1
0x100003f2c 41008052 mov w1, 2
0x100003f30 62008052 mov w2, 3
0x100003f34 23008052 mov w3, 1
0x100003f38 44008052 mov w4, 2
0x100003f3c 65008052 mov w5, 3
0x100003f40 26008052 mov w6, 1
0x100003f44 47008052 mov w7, 2
0x100003f48 e6ffff97 bl sym._func
0x100003f4c 00008052 mov w0, 0
0x100003f50 fd7b44a9 ldp x29, x30, [sp, 0x40]
0x100003f54 f44f43a9 ldp x20, x19, [sp, 0x30]
0x100003f58 ff430191 add sp, sp, 0x50
0x100003f5c c0035fd6 ret

No FPU support with gcc for ARM Cortex M?

I have the following function from a well known benchmark that I am compiling with gcc-arm-none-eabi-10-2020-q4-major:
#include <unistd.h>
double b[1000], c[1000];
/* Scale kernel: writes b[j] = scalar * c[j] for each of the 1000 elements
   of the file-scope double arrays b and c. */
void tuned_STREAM_Scale(double scalar)
{
ssize_t j;
for (j = 0; j < 1000; j++)
b[j] = scalar* c[j]; /* double * double multiply */
}
I am using the following compiler options:
arm-none-eabi-gcc -O3 -mcpu=cortex-m7 -mthumb -mfloat-abi=hard -mfpu=fpv5-sp-d16 -c test.c
However, if I check the compiled code, the compiler seems unable to use a basic FPU multiply instruction, and just uses the __aeabi_dmul function from libgcc (we can however see that a FPU vmov is used):
00000000 <tuned_STREAM_Scale>:
0: e92d 41f0 stmdb sp!, {r4, r5, r6, r7, r8, lr}
4: 4c08 ldr r4, [pc, #32] ; (28 <tuned_STREAM_Scale+0x28>)
6: 4d09 ldr r5, [pc, #36] ; (2c <tuned_STREAM_Scale+0x2c>)
8: f504 58fa add.w r8, r4, #8000 ; 0x1f40
c: ec57 6b10 vmov r6, r7, d0
10: e8f4 0102 ldrd r0, r1, [r4], #8
14: 4632 mov r2, r6
16: 463b mov r3, r7
18: f7ff fffe bl 0 <__aeabi_dmul>
1c: 4544 cmp r4, r8
1e: e8e5 0102 strd r0, r1, [r5], #8
22: d1f5 bne.n 10 <tuned_STREAM_Scale+0x10>
24: e8bd 81f0 ldmia.w sp!, {r4, r5, r6, r7, r8, pc}
If I compare with another compiler, the code is incomparably more efficient:
00000000 <tuned_STREAM_Scale>:
0: 4808 ldr r0, [pc, #32] ; (24 <tuned_STREAM_Scale+0x24>)
2: b580 push {r7, lr}
4: 4b06 ldr r3, [pc, #24] ; (20 <tuned_STREAM_Scale+0x20>)
6: 27c8 movs r7, #200 ; 0xc8
8: c806 ldmia r0!, {r1, r2}
a: ec42 1b11 vmov d1, r1, r2
e: ee20 1b01 vmul.f64 d1, d0, d1
12: 1e7f subs r7, r7, #1
14: ec52 1b11 vmov r1, r2, d1
18: c306 stmia r3!, {r1, r2}
1a: d1f5 bne.n 8 <tuned_STREAM_Scale+0x8>
1c: bd80 pop {r7, pc}
If I check inside gcc package the various libgcc object files depending on CPU or FPU options, I cannot find any FPU instructions in __aeabi_dmul or any other function.
I find it very strange that gcc is not able to use a basic FPU multiplication, and I could not find this limitation in any documentation or README, so I am wondering whether I am doing something wrong. I have checked older gcc versions and I still have this problem. Is it due to gcc or to the compiled binaries from ARM?
The clue is in the compiler options you already posted:
-mfpu=fpv5-sp-d16 "sp" means single precision.
You told it not to generate hardware double-precision instructions, which is correct for most Cortex-M7 processors because they can't execute them. If you have an M7 that can, then you need to set the correct -mfpu argument (for a double-precision FPv5 unit that is -mfpu=fpv5-d16).

Does arm-none-eabi-ld rewrite the bl instruction?

I'm trying to understand why some Cortex-M0 code behaves differently when it is linked versus unlinked. In both cases it is loaded to 0x20000000. It looks like despite my best efforts to generate position independent code by passing -fPIC to the compiler, the bl instruction appears to differ after the code has passed through the linker. Am I reading this correctly, is that just a part of the linker's job in ARM Thumb, and is there a better way to generate a position independent function call?
Linked:
20000000:
20000000: 0003 movs r3, r0
20000002: 4852 ldr r0, [pc, #328]
20000004: 4685 mov sp, r0
20000006: 0018 movs r0, r3
20000008: f000 f802 bl 20000010
2000000c: 46c0 nop ; (mov r8, r8)
2000000e: 46c0 nop ; (mov r8, r8)
Unlinked:
00000000:
0: 0003 movs r3, r0
2: 4852 ldr r0, [pc, #328]
4: 4685 mov sp, r0
6: 0018 movs r0, r3
8: f7ff fffe bl 10
c: 46c0 nop ; (mov r8, r8)
e: 46c0 nop ; (mov r8, r8)
start.s
@ Four-word table at _start followed by Thumb code. The linked listing shows
@ the table at 0x20000000, with the reset/hang entries having bit 0 set
@ (0x20000011, 0x20000015).
.globl _start
_start:
.word 0x20001000 @ presumably the initial stack pointer (end of the 0x1000-byte ram region) -- confirm
.word reset
.word hang
.word hang
.thumb
.thumb_func
reset:
bl notmain @ if notmain returns, execution falls through into hang
.thumb_func
hang:
b . @ branch-to-self: spin forever
notmain.c
/* Uninitialized global; it is placed in .bss in the linked listing. */
unsigned int x;
/* Defined in fun.c. */
unsigned int fun ( unsigned int );
/* Called from reset in start.s: replaces x with fun(x+5). */
void notmain ( void )
{
x=fun(x+5);
}
fun.c
/* Uninitialized global; it is placed in .bss in the linked listing. */
unsigned int y;
/* Returns y + z + 1; reads the global y but does not modify it. */
unsigned int fun ( unsigned int z )
{
return(y+z+1);
}
memmap
/* One memory region: 0x1000 bytes of ram starting at 0x20000000. */
MEMORY
{
ram : ORIGIN = 0x20000000, LENGTH = 0x1000
}
/* Code and zero-initialized data both go into ram, .text first. */
SECTIONS
{
.text : { *(.text*) } > ram
.bss : { *(.bss*) } > ram
}
build
# Assemble start.s, compile both C files as Thumb with -fPIC at -O2,
# then link the three objects (start.o first) using the memmap script.
arm-none-eabi-as start.s -o start.o
arm-none-eabi-gcc -fPIC -O2 -c -mthumb fun.c -o fun.o
arm-none-eabi-gcc -fPIC -O2 -c -mthumb notmain.c -o notmain.o
arm-none-eabi-ld -T memmap start.o notmain.o fun.o -o so.elf
produces
20000000 <_start>:
20000000: 20001000 andcs r1, r0, r0
20000004: 20000011 andcs r0, r0, r1, lsl r0
20000008: 20000015 andcs r0, r0, r5, lsl r0
2000000c: 20000015 andcs r0, r0, r5, lsl r0
20000010 <reset>:
20000010: f000 f802 bl 20000018 <notmain>
20000014 <hang>:
20000014: e7fe b.n 20000014 <hang>
...
20000018 <notmain>:
20000018: b510 push {r4, lr}
2000001a: 4b06 ldr r3, [pc, #24] ; (20000034 <notmain+0x1c>)
2000001c: 4a06 ldr r2, [pc, #24] ; (20000038 <notmain+0x20>)
2000001e: 447b add r3, pc
20000020: 589c ldr r4, [r3, r2]
20000022: 6823 ldr r3, [r4, #0]
20000024: 1d58 adds r0, r3, #5
20000026: f000 f809 bl 2000003c <fun>
2000002a: 6020 str r0, [r4, #0]
2000002c: bc10 pop {r4}
2000002e: bc01 pop {r0}
20000030: 4700 bx r0
20000032: 46c0 nop ; (mov r8, r8)
20000034: 00000032 andeq r0, r0, r2, lsr r0
20000038: 00000000 andeq r0, r0, r0
2000003c <fun>:
2000003c: 4b03 ldr r3, [pc, #12] ; (2000004c <fun+0x10>)
2000003e: 4a04 ldr r2, [pc, #16] ; (20000050 <fun+0x14>)
20000040: 447b add r3, pc
20000042: 589b ldr r3, [r3, r2]
20000044: 681b ldr r3, [r3, #0]
20000046: 3301 adds r3, #1
20000048: 1818 adds r0, r3, r0
2000004a: 4770 bx lr
2000004c: 00000010 andeq r0, r0, r0, lsl r0
20000050: 00000004 andeq r0, r0, r4
Disassembly of section .got:
20000054 <.got>:
20000054: 20000068 andcs r0, r0, r8, rrx
20000058: 2000006c andcs r0, r0, ip, rrx
Disassembly of section .got.plt:
2000005c <_GLOBAL_OFFSET_TABLE_>:
...
Disassembly of section .bss:
20000068 <x>:
20000068: 00000000 andeq r0, r0, r0
2000006c <y>:
2000006c: 00000000 andeq r0, r0, r0
When it wants to find the global variable x, what it appears to have done is take the program counter plus a linker-supplied/modified offset (0x32) and use that to find the global offset table, then use an offset into that table to find x. The same goes for y. So it appears that when you relocate the code you will need to modify the global offset table at run time or load time, depending.
If I get rid of those global variables then, other than the vector table (which is hard-coded and not PIC, and wasn't compiled anyway), this is all position independent.
20000000 <_start>:
20000000: 20001000 andcs r1, r0, r0
20000004: 20000011 andcs r0, r0, r1, lsl r0
20000008: 20000015 andcs r0, r0, r5, lsl r0
2000000c: 20000015 andcs r0, r0, r5, lsl r0
20000010 <reset>:
20000010: f000 f802 bl 20000018 <notmain>
20000014 <hang>:
20000014: e7fe b.n 20000014 <hang>
...
20000018 <notmain>:
20000018: b508 push {r3, lr}
2000001a: 2005 movs r0, #5
2000001c: f000 f804 bl 20000028 <fun>
20000020: 3006 adds r0, #6
20000022: bc08 pop {r3}
20000024: bc02 pop {r1}
20000026: 4708 bx r1
20000028 <fun>:
20000028: 3001 adds r0, #1
2000002a: 4770 bx lr
back to this version
/* Global read by fun(); accessing it is what requires a relocation in the object file. */
unsigned int y;
/* Returns y + z + 1. */
unsigned int fun ( unsigned int z )
{
return(y+z+1);
}
position independent
00000000 <fun>:
0: 4b03 ldr r3, [pc, #12] ; (10 <fun+0x10>)
2: 4a04 ldr r2, [pc, #16] ; (14 <fun+0x14>)
4: 447b add r3, pc
6: 589b ldr r3, [r3, r2]
8: 681b ldr r3, [r3, #0]
a: 3301 adds r3, #1
c: 1818 adds r0, r3, r0
e: 4770 bx lr
10: 00000008 andeq r0, r0, r8
14: 00000000 andeq r0, r0, r0
not position independent
00000000 <fun>:
0: 4b02 ldr r3, [pc, #8] ; (c <fun+0xc>)
2: 681b ldr r3, [r3, #0]
4: 3301 adds r3, #1
6: 1818 adds r0, r3, r0
8: 4770 bx lr
a: 46c0 nop ; (mov r8, r8)
c: 00000000 andeq r0, r0, r0
The position-independent code has to do a bit more work to access the external variable. The position-dependent code still does some work because the variable is external, but not as much. In both cases the linker fills in the required items to make it work, that is, to link it.
The ELF object file contains the information the linker needs to do this.
Relocation section '.rel.text' at offset 0x1a4 contains 2 entries:
Offset Info Type Sym.Value Sym. Name
00000010 00000a19 R_ARM_BASE_PREL 00000000 _GLOBAL_OFFSET_TABLE_
00000014 00000b1a R_ARM_GOT_BREL 00000004 y
or
Relocation section '.rel.text' at offset 0x174 contains 1 entries:
Offset Info Type Sym.Value Sym. Name
0000000c 00000a02 R_ARM_ABS32 00000004 y
notmain had these PIC
Relocation section '.rel.text' at offset 0x1cc contains 3 entries:
Offset Info Type Sym.Value Sym. Name
0000000e 00000a0a R_ARM_THM_CALL 00000000 fun
0000001c 00000b19 R_ARM_BASE_PREL 00000000 _GLOBAL_OFFSET_TABLE_
00000020 00000c1a R_ARM_GOT_BREL 00000004 x
and without.
Relocation section '.rel.text' at offset 0x198 contains 2 entries:
Offset Info Type Sym.Value Sym. Name
00000008 00000a0a R_ARM_THM_CALL 00000000 fun
00000014 00000b02 R_ARM_ABS32 00000004 x
So in short, the toolchain is doing its job; you don't need to redo its job. And note that this has nothing to do with ARM or Thumb. Any time you use the object-and-linker model and allow an object to reference external items, the linker has to patch things up to glue the code together. That's just how it works.

gcc for ARM - move code and stack

I am working on a project for an ARM Cortex-M3 (SiLabs) SOC. I need to move the interrupt vector [edit] and code away from the bottom of flash to make room for a "boot loader". The boot loader starts at address 0 to come up when the core comes out of reset. Its function is to validate the main image, loaded at a higher address and possibly replace that main image with a new one.
Therefore, the boot loader would have its vector table at 0, followed by its code. At a higher, fixed address, say 8KB, would be the main image, starting with its vector table.
I have found this page which describes the Vector Table Offset Register which the boot loader can use (with interrupts masked, obviously) to point the hardware to the new vector table.
My question is how to get the "main" image linked so that it will work when written to flash, starting not at zero. I'm not familiar with ARM assembly but I assume the code is not position independent.
I'm using SiLabs's Precision32 IDE which uses gcc for the toolchain. I've found how to add linker flags. My question is what gcc flag(s) will provide the change to the base of the vector table and code.
Thank you.
vectors.s
/* vectors.s: vector table, a hang loop, and the _start entry that calls notmain. */
.cpu cortex-m3
.thumb
.word 0x20008000 /* stack top address */
.word _start /* Reset */
.word hang
.word hang
/* ... */
.thumb_func /* marks the next label (hang) as a Thumb function */
hang: b . /* branch to self: spin forever */
.thumb_func /* marks the next label (_start) as a Thumb function */
.globl _start
_start:
bl notmain /* notmain is defined in notmain.c; the linker resolves the branch */
b hang /* if notmain returns, spin in hang */
notmain.c
/* Defined in fun.c. */
extern void fun ( unsigned int );
/* Called from _start in vectors.s: calls fun with the constant 7. */
void notmain ( void )
{
fun(7);
}
fun.c
/* Empty function: takes x and does nothing; it exists only so notmain has an external call to link. */
void fun ( unsigned int x )
{
}
Makefile
# Build hello.elf: assemble vectors.s, compile the two C files for Cortex-M3 Thumb,
# link with the memmap script (vectors.o listed first), then write a disassembly
# listing (hello.list) and a raw binary image (hello.bin).
hello.elf : vectors.s fun.c notmain.c memmap
arm-none-eabi-as vectors.s -o vectors.o
arm-none-eabi-gcc -Wall -O2 -nostdlib -nostartfiles -ffreestanding -mthumb -mcpu=cortex-m3 -march=armv7-m -c notmain.c -o notmain.o
arm-none-eabi-gcc -Wall -O2 -nostdlib -nostartfiles -ffreestanding -mthumb -mcpu=cortex-m3 -march=armv7-m -c fun.c -o fun.o
arm-none-eabi-ld -o hello.elf -T memmap vectors.o notmain.o fun.o
arm-none-eabi-objdump -D hello.elf > hello.list
arm-none-eabi-objcopy hello.elf -O binary hello.bin
so if the linker script (memmap is the name I used) looks like this
/* Two regions: rom at address 0 (0x40000 bytes) and ram at 0x20000000 (0x8000 bytes). */
MEMORY
{
rom : ORIGIN = 0x00000000, LENGTH = 0x40000
ram : ORIGIN = 0x20000000, LENGTH = 0x8000
}
/* All .text* input sections go to rom, all .bss* input sections go to ram. */
SECTIONS
{
.text : { *(.text*) } > rom
.bss : { *(.bss*) } > ram
}
Since all of the above is .text (no .bss or .data), the linker takes the objects in the order listed on the ld command line and places them starting at that address:
Disassembly of section .text:
00000000 <hang-0x10>:
0: 20008000 andcs r8, r0, r0
4: 00000013 andeq r0, r0, r3, lsl r0
8: 00000011 andeq r0, r0, r1, lsl r0
c: 00000011 andeq r0, r0, r1, lsl r0
00000010 <hang>:
10: e7fe b.n 10 <hang>
00000012 <_start>:
12: f000 f801 bl 18 <notmain>
16: e7fb b.n 10 <hang>
00000018 <notmain>:
18: 2007 movs r0, #7
1a: f000 b801 b.w 20 <fun>
1e: bf00 nop
00000020 <fun>:
20: 4770 bx lr
22: bf00 nop
So for this to work you have to be careful to put your bootstrap code first on the command line. But you can also do things like this with a linker script.
The order appears to matter: list the specific object files first, then the generic .text later.
/* Five 0x1000-byte regions at distinct addresses; bob is declared but not used below. */
MEMORY
{
romx : ORIGIN = 0x00000000, LENGTH = 0x1000
romy : ORIGIN = 0x00010000, LENGTH = 0x1000
ram : ORIGIN = 0x00030000, LENGTH = 0x1000
bob : ORIGIN = 0x00040000, LENGTH = 0x1000
ted : ORIGIN = 0x00050000, LENGTH = 0x1000
}
/* Object-specific output sections come first, then the catch-all .text. */
SECTIONS
{
abc : { vectors.o } > romx /* everything from vectors.o at address 0 */
def : { fun.o } > ted /* everything from fun.o at 0x50000 */
.text : { *(.text*) } > romy /* remaining .text* input sections at 0x10000 */
.bss : { *(.bss*) } > ram
}
and we get this
00000000 <hang-0x10>:
0: 20008000 andcs r8, r0, r0
4: 00000013 andeq r0, r0, r3, lsl r0
8: 00000011 andeq r0, r0, r1, lsl r0
c: 00000011 andeq r0, r0, r1, lsl r0
00000010 <hang>:
10: e7fe b.n 10 <hang>
00000012 <_start>:
12: f00f fff5 bl 10000 <notmain>
16: e7fb b.n 10 <hang>
Disassembly of section def:
00050000 <fun>:
50000: 4770 bx lr
50002: bf00 nop
Disassembly of section .text:
00010000 <notmain>:
10000: 2007 movs r0, #7
10002: f03f bffd b.w 50000 <fun>
10006: bf00 nop
The short answer is that with GNU tools you use a linker script to control where things end up; I assume you want these functions to be in the ROM at some specified location. I don't quite understand exactly what you are doing. But if, for example, you are trying to put something simple like the branch to main() at the start of the flash initially, with main() being deeper in the flash, and then later (either through code that lives deeper in the flash or through some other method) you erase and reprogram only the stuff near zero, you will still need a simple branch to main() the first time. You can force what I am calling vectors.o to be at address zero, and then .text can be deeper in the flash, putting basically all of the rest of the code there; then leave that in flash and replace just the stuff at zero.
like this
/* Same five regions as before; only romx, romy and ram are used by this script. */
MEMORY
{
romx : ORIGIN = 0x00000000, LENGTH = 0x1000
romy : ORIGIN = 0x00010000, LENGTH = 0x1000
ram : ORIGIN = 0x00030000, LENGTH = 0x1000
bob : ORIGIN = 0x00040000, LENGTH = 0x1000
ted : ORIGIN = 0x00050000, LENGTH = 0x1000
}
/* vectors.o is pinned at address 0; all other code lands in romy at 0x10000. */
SECTIONS
{
abc : { vectors.o } > romx
.text : { *(.text*) } > romy
.bss : { *(.bss*) } > ram
}
giving
00000000 <hang-0x10>:
0: 20008000 andcs r8, r0, r0
4: 00000013 andeq r0, r0, r3, lsl r0
8: 00000011 andeq r0, r0, r1, lsl r0
c: 00000011 andeq r0, r0, r1, lsl r0
00000010 <hang>:
10: e7fe b.n 10 <hang>
00000012 <_start>:
12: f00f fff7 bl 10004 <notmain>
16: e7fb b.n 10 <hang>
Disassembly of section .text:
00010000 <fun>:
10000: 4770 bx lr
10002: bf00 nop
00010004 <notmain>:
10004: 2007 movs r0, #7
10006: f7ff bffb b.w 10000 <fun>
1000a: bf00 nop
Then leave the 0x10000 stuff in place and replace the 0x00000 stuff later.
Anyway, the short answer is that you need to craft a linker script to put things where you want them. GNU linker scripts can get extremely complicated; I lean toward the simple.
If you want to place everything at some other address, including your vector table, then perhaps something like this:
hop.s
/* hop.s: a second vector table, linked at address 0, that hands control to the
   _start defined in another object (vectors.o in the accompanying link). */
.cpu cortex-m3
.thumb
.word 0x20008000 /* stack top address */
.word _start /* Reset */ /* _start is not defined in this file; the listing shows it resolved to 0x00010013 */
.word hang
.word hang
/* ... */
.thumb_func /* marks the next label (hang) as a Thumb function */
hang: b . /* branch to self: spin forever */
.thumb_func /* NOTE(review): no label follows this directive in this file; confirm it is intended */
ldr r0,=_start /* load the address of _start from a literal pool */
bx r0 /* branch to it */
/* NOTE(review): these two instructions have no label and the reset vector above already
   points at _start, so nothing visible here branches to them; confirm whether a label
   (and a reset vector pointing at it) was intended. */
and this
/* Same five regions as before; only romx, romy and ram are used by this script. */
MEMORY
{
romx : ORIGIN = 0x00000000, LENGTH = 0x1000
romy : ORIGIN = 0x00010000, LENGTH = 0x1000
ram : ORIGIN = 0x00030000, LENGTH = 0x1000
bob : ORIGIN = 0x00040000, LENGTH = 0x1000
ted : ORIGIN = 0x00050000, LENGTH = 0x1000
}
/* hop.o is pinned at address 0; all other .text* input goes to romy at 0x10000. */
SECTIONS
{
abc : { hop.o } > romx
.text : { *(.text*) } > romy
.bss : { *(.bss*) } > ram
}
gives this
Disassembly of section abc:
00000000 <hang-0x10>:
0: 20008000 andcs r8, r0, r0
4: 00010013 andeq r0, r1, r3, lsl r0
8: 00000011 andeq r0, r0, r1, lsl r0
c: 00000011 andeq r0, r0, r1, lsl r0
00000010 <hang>:
10: e7fe b.n 10 <hang>
12: 4801 ldr r0, [pc, #4] ; (18 <hang+0x8>)
14: 4700 bx r0
16: 00130000
1a: 20410001
Disassembly of section .text:
00010000 <hang-0x10>:
10000: 20008000 andcs r8, r0, r0
10004: 00010013 andeq r0, r1, r3, lsl r0
10008: 00010011 andeq r0, r1, r1, lsl r0
1000c: 00010011 andeq r0, r1, r1, lsl r0
00010010 <hang>:
10010: e7fe b.n 10010 <hang>
00010012 <_start>:
10012: f000 f803 bl 1001c <notmain>
10016: e7fb b.n 10010 <hang>
00010018 <fun>:
10018: 4770 bx lr
1001a: bf00 nop
0001001c <notmain>:
1001c: 2007 movs r0, #7
1001e: f7ff bffb b.w 10018 <fun>
10022: bf00 nop
Then you can point the vector table at 0x10000, for example.
If you are asking a different question, such as having the bootloader at 0x00000, having it modify the flash to add an application at, say, 0x20000, and then wanting to run that application, there are simpler solutions that don't necessarily require you to change the location of the vector table.

Resources