Really Minimal STM32 Application: linker failure - gcc

I'm building a tiny microcontroller with only the bare essentials for self-educational purposes. This way, I can refresh my knowledge about topics like the linkerscript, the startup code, ...
EDIT:
I got quite a lot of comments pointing out that the "absolute minimal STM32-application" shown below is no good. You are absolutely right when noticing that the vector table is not complete, the .bss-section is not taken care of, the peripheral addresses are not complete, ... Please allow me to explain why.
It has never been the purpose of the author to write a complete and useful application in this particular chapter. His purpose was to explain step-by-step how a linkerscript works, how startup code works, what the boot procedure of an STM32 looks like, ... purely for educational purposes. I can appreciate this approach, and learned a lot.
The example I have put below is taken from the middle of the chapter in question. The chapter keeps adding more parts to the linkerscript and startup code (for example initialization of .bss-section) as it goes forward.
The reason I put files here from the middle of his chapter, is because I got stuck at a particular error message. I want to get that fixed before continuing.
The chapter in question is somewhere at the end of his book. It is intended for the more experienced or curious reader who wants to gain deeper knowledge about topics most people don't even consider (most people use the standard linkerscript and startup code given by the manufacturer without ever reading it).
Keeping this in mind, please let us focus on the technical issue at hand (as described below in the error messages). Please also accept my sincere apologies that I didn't clarify the intentions of the writer earlier. But I've done it now, so we can move on ;-)
1. Absolute minimal STM32-application
The tutorial I'm following is chapter 20 from this book: "Mastering STM32" (https://leanpub.com/mastering-stm32). The book explains how to make a tiny microcontroller application with two files: main.c and linkerscript.ld. As I'm not using an IDE (like Eclipse), I also added build.bat and clean.bat to generate the compilation commands. So my project folder looks like this:
Before I continue, I should perhaps give some more details about my system:
OS: Windows 10, 64-bit
Microcontroller: NUCLEO-F401RE board with STM32F401RE microcontroller.
Compiler: arm-none-eabi-gcc version 6.3.1 20170620 (release) [ARM/embedded-6-branch revision 249437].
The main file looks like this:
/* ------------------------------------------------------------ */
/* Minimal application */
/* for NUCLEO-F401RE */
/* ------------------------------------------------------------ */
typedef unsigned long uint32_t;
/* Memory and peripheral start addresses (common to all STM32 MCUs) */
#define FLASH_BASE 0x08000000
#define SRAM_BASE 0x20000000
#define PERIPH_BASE 0x40000000
/* Work out end of RAM address as initial stack pointer
* (specific of a given STM32 MCU) */
#define SRAM_SIZE 96*1024 //STM32F401RE has 96 KB of RAM
#define SRAM_END (SRAM_BASE + SRAM_SIZE)
/* RCC peripheral addresses applicable to GPIOA
* (specific of a given STM32 MCU) */
#define RCC_BASE (PERIPH_BASE + 0x23800)
#define RCC_APB1ENR ((uint32_t*)(RCC_BASE + 0x30))
/* GPIOA peripheral addresses
* (specific of a given STM32 MCU) */
#define GPIOA_BASE (PERIPH_BASE + 0x20000)
#define GPIOA_MODER ((uint32_t*)(GPIOA_BASE + 0x00))
#define GPIOA_ODR ((uint32_t*)(GPIOA_BASE + 0x14))
/* Function headers */
int main(void);
void delay(uint32_t count);
/* Minimal vector table */
uint32_t *vector_table[] __attribute__((section(".isr_vector"))) = {
(uint32_t*)SRAM_END, // initial stack pointer (MSP)
(uint32_t*)main // main as Reset_Handler
};
/* Main function */
int main() {
/* Enable clock on GPIOA peripheral */
*RCC_APB1ENR = 0x1;
/* Configure the PA5 as output pull-up */
*GPIOA_MODER |= 0x400; // Sets MODER[11:10] = 0x1
while(1) { // Always true
*GPIOA_ODR = 0x20;
delay(200000);
*GPIOA_ODR = 0x0;
delay(200000);
}
}
void delay(uint32_t count) {
while(count--);
}
The linkerscript looks like this:
/* ------------------------------------------------------------ */
/* Linkerscript */
/* for NUCLEO-F401RE */
/* ------------------------------------------------------------ */
/* Memory layout for STM32F401RE */
MEMORY
{
FLASH (rx) : ORIGIN = 0x08000000, LENGTH = 512K
SRAM (xrw) : ORIGIN = 0x20000000, LENGTH = 96K
}
/* The ENTRY(..) directive overrides the default entry point symbol _start.
* Here we define the main-routine as the entry point.
* In fact, the ENTRY(..) directive is meaningless for embedded chips,
* but it is informative for debuggers. */
ENTRY(main)
SECTIONS
{
/* Program code into FLASH */
.text : ALIGN(4)
{
*(.isr_vector) /* Vector table */
*(.text) /* Program code */
*(.text*) /* Merge all .text.* sections inside the .text section */
KEEP(*(.isr_vector)) /* Don't allow other tools to strip this off */
} >FLASH
_sidata = LOADADDR(.data); /* Used by startup code to initialize data */
.data : ALIGN(4)
{
. = ALIGN(4);
_sdata = .; /* Create a global symbol at data start */
*(.data)
*(.data*)
. = ALIGN(4);
_edata = .; /* Define a global symbol at data end */
} >SRAM AT >FLASH
}
The build.bat file calls the compiler on main.c, and next the linker:
#echo off
setlocal EnableDelayedExpansion
echo.
echo ----------------------------------------------------------------
echo. )\ ***************************
echo. ( =_=_=_=^< ^| * build NUCLEO-F401RE *
echo. )( ***************************
echo. ""
echo.
echo.
echo. Call the compiler on main.c
echo.
#arm-none-eabi-gcc main.c -o main.o -c -MMD -mcpu=cortex-m4 -mthumb -mfloat-abi=hard -mfpu=fpv4-sp-d16 -O0 -g3 -Wall -fmessage-length=0 -Werror-implicit-function-declaration -Wno-comment -Wno-unused-function -ffunction-sections -fdata-sections
echo.
echo. Call the linker
echo.
#arm-none-eabi-gcc main.o -o myApp.elf -mcpu=cortex-m4 -mthumb -mfloat-abi=hard -mfpu=fpv4-sp-d16 -specs=nosys.specs -specs=nano.specs -T linkerscript.ld -Wl,-Map=output.map -Wl,--gc-sections
echo.
echo. Post build
echo.
#arm-none-eabi-objcopy -O binary myApp.elf myApp.bin
arm-none-eabi-size myApp.elf
echo.
echo ----------------------------------------------------------------
The clean.bat file removes all the compiler output:
#echo off
setlocal EnableDelayedExpansion
echo ----------------------------------------------------------------
echo. __ **************
echo. __\ \___ * clean *
echo. \ _ _ _ \ **************
echo. \_`_`_`_\
echo.
del /f /q main.o
del /f /q main.d
del /f /q myApp.bin
del /f /q myApp.elf
del /f /q output.map
echo ----------------------------------------------------------------
Building this works. I get the following output:
C:\Users\Kristof\myProject>build
----------------------------------------------------------------
)\ ***************************
( =_=_=_=< | * build NUCLEO-F401RE *
)( ***************************
""
Call the compiler on main.c
Call the linker
Post build
text data bss dec hex filename
112 0 0 112 70 myApp.elf
----------------------------------------------------------------
2. Proper startup code
Maybe you have noticed that the minimal application didn't have proper startup code to initialize the global variables in the .data-section. Chapter 20.2.2 .data and .bss Sections initialization from the "Mastering STM32" book explains how to do this.
As I follow along, my main.c file now looks like this:
/* ------------------------------------------------------------ */
/* Minimal application */
/* for NUCLEO-F401RE */
/* ------------------------------------------------------------ */
typedef unsigned long uint32_t;
/* Memory and peripheral start addresses (common to all STM32 MCUs) */
#define FLASH_BASE 0x08000000
#define SRAM_BASE 0x20000000
#define PERIPH_BASE 0x40000000
/* Work out end of RAM address as initial stack pointer
* (specific of a given STM32 MCU) */
#define SRAM_SIZE 96*1024 //STM32F401RE has 96 KB of RAM
#define SRAM_END (SRAM_BASE + SRAM_SIZE)
/* RCC peripheral addresses applicable to GPIOA
* (specific of a given STM32 MCU) */
#define RCC_BASE (PERIPH_BASE + 0x23800)
#define RCC_APB1ENR ((uint32_t*)(RCC_BASE + 0x30))
/* GPIOA peripheral addresses
* (specific of a given STM32 MCU) */
#define GPIOA_BASE (PERIPH_BASE + 0x20000)
#define GPIOA_MODER ((uint32_t*)(GPIOA_BASE + 0x00))
#define GPIOA_ODR ((uint32_t*)(GPIOA_BASE + 0x14))
/* Function headers */
void __initialize_data(uint32_t*, uint32_t*, uint32_t*);
void _start (void);
int main(void);
void delay(uint32_t count);
/* Minimal vector table */
uint32_t *vector_table[] __attribute__((section(".isr_vector"))) = {
(uint32_t*)SRAM_END, // initial stack pointer (MSP)
(uint32_t*)_start // _start as Reset_Handler
};
/* Variables defined in linkerscript */
extern uint32_t _sidata;
extern uint32_t _sdata;
extern uint32_t _edata;
volatile uint32_t dataVar = 0x3f;
/* Data initialization */
inline void __initialize_data(uint32_t* flash_begin, uint32_t* data_begin, uint32_t* data_end) {
uint32_t *p = data_begin;
while(p < data_end)
*p++ = *flash_begin++;
}
/* Entry point */
void __attribute__((noreturn,weak)) _start (void) {
__initialize_data(&_sidata, &_sdata, &_edata);
main();
for(;;);
}
/* Main function */
int main() {
/* Enable clock on GPIOA peripheral */
*RCC_APB1ENR = 0x1;
/* Configure the PA5 as output pull-up */
*GPIOA_MODER |= 0x400; // Sets MODER[11:10] = 0x1
while(dataVar == 0x3f) { // Always true
*GPIOA_ODR = 0x20;
delay(200000);
*GPIOA_ODR = 0x0;
delay(200000);
}
}
void delay(uint32_t count) {
while(count--);
}
I've added the initialization code just above the main(..) function. The linkerscript has also some modification:
/* ------------------------------------------------------------ */
/* Linkerscript */
/* for NUCLEO-F401RE */
/* ------------------------------------------------------------ */
/* Memory layout for STM32F401RE */
MEMORY
{
FLASH (rx) : ORIGIN = 0x08000000, LENGTH = 512K
SRAM (xrw) : ORIGIN = 0x20000000, LENGTH = 96K
}
/* The ENTRY(..) directive overrides the default entry point symbol _start.
* In fact, the ENTRY(..) directive is meaningless for embedded chips,
* but it is informative for debuggers. */
ENTRY(_start)
SECTIONS
{
/* Program code into FLASH */
.text : ALIGN(4)
{
*(.isr_vector) /* Vector table */
*(.text) /* Program code */
*(.text*) /* Merge all .text.* sections inside the .text section */
KEEP(*(.isr_vector)) /* Don't allow other tools to strip this off */
} >FLASH
_sidata = LOADADDR(.data); /* Used by startup code to initialize data */
.data : ALIGN(4)
{
. = ALIGN(4);
_sdata = .; /* Create a global symbol at data start */
*(.data)
*(.data*)
. = ALIGN(4);
_edata = .; /* Define a global symbol at data end */
} >SRAM AT >FLASH
}
The little application doesn't compile anymore. Actually, the compilation from main.c to main.o is still okay. But the linking process gets stuck:
C:\Users\Kristof\myProject>build
----------------------------------------------------------------
)\ ***************************
( =_=_=_=< | * build NUCLEO-F401RE *
)( ***************************
""
Call the compiler on main.c
Call the linker
c:/gnu_arm_embedded_toolchain/bin/../lib/gcc/arm-none-eabi/6.3.1/../../../../arm-none-eabi/lib/thumb/v7e-m/fpv4-sp/hard/crt0.o: In function `_start':
(.text+0x64): undefined reference to `__bss_start__'
c:/gnu_arm_embedded_toolchain/bin/../lib/gcc/arm-none-eabi/6.3.1/../../../../arm-none-eabi/lib/thumb/v7e-m/fpv4-sp/hard/crt0.o: In function `_start':
(.text+0x68): undefined reference to `__bss_end__'
collect2.exe: error: ld returned 1 exit status
Post build
arm-none-eabi-objcopy: 'myApp.elf': No such file
arm-none-eabi-size: 'myApp.elf': No such file
----------------------------------------------------------------
3. What I've tried
I've omitted this part, otherwise this question gets too long ;-)
4. Solution
#berendi provided the solution. Thank you #berendi! Apparently I need to add the flags -nostdlib and -ffreestanding to gcc and the linker. The build.bat file now looks like this:
#echo off
setlocal EnableDelayedExpansion
echo.
echo ----------------------------------------------------------------
echo. )\ ***************************
echo. ( =_=_=_=^< ^| * build NUCLEO-F401RE *
echo. )( ***************************
echo. ""
echo.
echo.
echo. Call the compiler on main.c
echo.
#arm-none-eabi-gcc main.c -o main.o -c -MMD -mcpu=cortex-m4 -mthumb -mfloat-abi=hard -mfpu=fpv4-sp-d16 -O0 -g3 -Wall -fmessage-length=0 -Werror-implicit-function-declaration -Wno-comment -Wno-unused-function -ffunction-sections -fdata-sections -ffreestanding -nostdlib
echo.
echo. Call the linker
echo.
#arm-none-eabi-gcc main.o -o myApp.elf -mcpu=cortex-m4 -mthumb -mfloat-abi=hard -mfpu=fpv4-sp-d16 -specs=nosys.specs -specs=nano.specs -T linkerscript.ld -Wl,-Map=output.map -Wl,--gc-sections -ffreestanding -nostdlib
echo.
echo. Post build
echo.
#arm-none-eabi-objcopy -O binary myApp.elf myApp.bin
arm-none-eabi-size myApp.elf
echo.
echo ----------------------------------------------------------------
Now it works!
In his answer, #berendi also gives a few interesting remarks about the main.c file. I've applied most of them:
Missing volatile keyword
Empty loop
Missing Memory Barrier (did I put the memory barrier in the correct place?)
Missing delay after RCC enable
Misleading symbolic name (apparently it should be RCC_AHB1ENR instead of RCC_APB1ENR).
The vector table: this part I've skipped. Right now I don't really need a HardFault_Handler, MemManage_Handler, ... as this is just a tiny test for educational purposes.
Nevertheless, I did notice that #berendi put a few interesting modifications in the way he declares the vector table. But I'm not entirely grasping what he's doing exactly.
The main.c file now looks like this:
/* ------------------------------------------------------------ */
/* Minimal application */
/* for NUCLEO-F401RE */
/* ------------------------------------------------------------ */
typedef unsigned long uint32_t;
/**
\brief Data Synchronization Barrier
\details Acts as a special kind of Data Memory Barrier.
It completes when all explicit memory accesses before this instruction complete.
*/
__attribute__((always_inline)) static inline void __DSB(void)
{
__asm volatile ("dsb 0xF":::"memory");
}
/* Memory and peripheral start addresses (common to all STM32 MCUs) */
#define FLASH_BASE 0x08000000
#define SRAM_BASE 0x20000000
#define PERIPH_BASE 0x40000000
/* Work out end of RAM address as initial stack pointer
* (specific of a given STM32 MCU) */
#define SRAM_SIZE 96*1024 //STM32F401RE has 96 KB of RAM
#define SRAM_END (SRAM_BASE + SRAM_SIZE)
/* RCC peripheral addresses applicable to GPIOA
* (specific of a given STM32 MCU) */
#define RCC_BASE (PERIPH_BASE + 0x23800)
#define RCC_AHB1ENR ((volatile uint32_t*)(RCC_BASE + 0x30))
/* GPIOA peripheral addresses
* (specific of a given STM32 MCU) */
#define GPIOA_BASE (PERIPH_BASE + 0x20000)
#define GPIOA_MODER ((volatile uint32_t*)(GPIOA_BASE + 0x00))
#define GPIOA_ODR ((volatile uint32_t*)(GPIOA_BASE + 0x14))
/* Function headers */
void __initialize_data(uint32_t*, uint32_t*, uint32_t*);
void _start (void);
int main(void);
void delay(uint32_t count);
/* Minimal vector table */
uint32_t *vector_table[] __attribute__((section(".isr_vector"))) = {
(uint32_t*)SRAM_END, // initial stack pointer (MSP)
(uint32_t*)_start // _start as Reset_Handler
};
/* Variables defined in linkerscript */
extern uint32_t _sidata;
extern uint32_t _sdata;
extern uint32_t _edata;
volatile uint32_t dataVar = 0x3f;
/* Data initialization */
inline void __initialize_data(uint32_t* flash_begin, uint32_t* data_begin, uint32_t* data_end) {
uint32_t *p = data_begin;
while(p < data_end)
*p++ = *flash_begin++;
}
/* Entry point */
void __attribute__((noreturn,weak)) _start (void) {
__initialize_data(&_sidata, &_sdata, &_edata);
asm volatile("":::"memory"); // <- Did I put this instruction at the right spot?
main();
for(;;);
}
/* Main function */
int main() {
/* Enable clock on GPIOA peripheral */
*RCC_AHB1ENR = 0x1;
__DSB();
/* Configure the PA5 as output pull-up */
*GPIOA_MODER |= 0x400; // Sets MODER[11:10] = 0x1
while(dataVar == 0x3f) { // Always true
*GPIOA_ODR = 0x20;
delay(200000);
*GPIOA_ODR = 0x0;
delay(200000);
}
}
void delay(uint32_t count) {
while(count--){
asm volatile("");
}
}
PS: The book "Mastering STM32" from Carmine Noviello is an absolute masterpiece. You should read it! => https://leanpub.com/mastering-stm32

You can tell gcc not to use the library.
The Compiler
By default, gcc assumes that you are using a standard C library, and can emit code that calls some functions. For example, when optimizations are enabled, it detects loops that copy a piece of memory, and may substitute them with a call to memcpy(). Disable it with -ffreestanding.
The Linker
The linker assumes as well that you want to link your program with the C library and startup code. The library startup code is responsible for initializing the library and the program execution environment. It has a function named _start() which has to be called after reset. One of its functions is to fill the .bss segment (see below) with zero. If the symbols that delimit .bss are not defined, then _startup() cannot be linked. Had you named your startup function anything else but _startup(), then the library startup would have been siletly dropped by the linker as an unused function, and the code could have been linked.
You can tell the linker not to link any standard library or startup code with -nostdlib, then the library supplied startup function name would not conflict with yours, and you would get a linker error every time you accidentally invoked a library function.
Missing volatile
Your register definitions are missing the volatile qualifier. Without it, subsequent writes to *GPIOA_ODR will be optimized out. The compiler will move this "invariant code" out of the loop. Changing the type in the register definitions to (volatile uint32_t*) would fix that.
Empty loop
The optimizer can recognize that the delay loop does nothing, and eliminate it completely to speed up execution. Add an empty but non-removable asm volatile(""); instruction to the delay loop.
Missing Memory Barrier
You are initializing the .data section that holds dataVar in a C function. The *p in __initialize_data() is effectively an alias for dataVar, and the compiler has no way to know it. The optimizer could theoretically rearrange the test of dataVar before __initialize_data(). Even if dataVar is volatile, *p is not, therefore ordering is not guaranteed.
After the data initialization loop, you should tell the compiler that program variables are changed by a mechanism unknown to the compiler:
asm volatile("":::"memory");
It's an old-fashioned gcc extension, the latest C standards might have defined a portable way to do this (which is not recognized by older gcc versions).
Missing delay after RCC enable
The Errata is saying,
A delay between an RCC peripheral clock enable and the effective peripheral enabling should be taken into account in order to manage the peripheral read/write to registers.
This delay depends on the peripheral mapping:
• If the peripheral is mapped on AHB: the delay should be equal to 2 AHB cycles.
• If the peripheral is mapped on APB: the delay should be equal to 1 + (AHB/APB prescaler) cycles.
Workarounds
Use the DSB instruction to stall the Cortex®-M4 CPU pipeline until the instruction is completed.
Therefore, insert a
__DSB();
after *RCC_APB1ENR = 0x1; (which should be called something else)
Misleading symbolic name
Although the address for enabling GPIOA in RCC seems to be correct, the register is called RCC_AHB1ENR in the documentation. It will confuse people trying to understand your code.
The Vector Table
Although technically you can get away with having only a stack pinter and a reset handler in it, I'd too recommend having a few more entries, at least the fault handlers for simple troubleshooting.
__attribute__ ((section(".isr_vector"),used))
void (* const _vectors[]) (void) = {
(void (*const)(void))(&__stack),
Reset_Handler,
NMI_Handler,
HardFault_Handler,
MemManage_Handler,
BusFault_Handler,
UsageFault_Handler
}
The Linker Script
At the bare minimum, it must define a section for your vector table, and the code. A program must have a start address and some code, static data is optional. The rest depends on what kind of data your program is using. You could technically omit them from the linker script if there are no data of a particular type.
.rodata: read-only data, const arrays and structs go here. They remain in flash. (simple const variables are usually put in the code)
.data: initialized variables, everything you declare with an = sign, and without const.
.bss: variables that should be zero-initialized in C, i.e. global and static ones.
As you don't need .rodata or .bss now, it's fine.

Linker scripts in general are an artform, they are their own programming language and gnu's are certainly a bit of a nightmare. Divide the task into figuring out the linker script from making a working binary, once you can see the linker script is doing what you want then make the bootstrap code to use it. Take advantage of the toolchain.
The example the author used was derived from code written specifically to be used as baremetal examples that maximize success. Avoided common language and toolchain issues, yet be portable across many versions of the toolchain and to be easily ported to other toolchains (minimal reliance on the toolchain, in particular the linker script which leads to the bootstrap). The author of the book used that code but added risk to it to not be as reliable of an example.
Avoiding .data specifically and not relying on .bss to be zeroed when you write baremetal code goes a very long way toward long term success.
It was also modified such that optimization would prevent that code from working (well blinking at a rate you can see).
An example somewhat minimal linker script for binutils that you can modify to work toward .data and .bss initialization looks generically like this
test.ld
MEMORY
{
bob : ORIGIN = 0x8000, LENGTH = 0x1000
ted : ORIGIN = 0xA000, LENGTH = 0x1000
}
SECTIONS
{
.text : { *(.text*) } > bob
__data_rom_start__ = .;
.data : {
__data_start__ = .;
*(.data*)
} > ted AT > bob
__data_end__ = .;
__data_size__ = __data_end__ - __data_start__;
.bss : {
__bss_start__ = .;
*(.bss*)
} > ted
__bss_end__ = .;
__bss_size__ = __bss_end__ - __bss_start__;
}
(note memory names dont have to be rom or ram or flash or data or whatever bob is program space and ted is memory btw, change the addresses as desired)
How you see what is going on is you can link with a simple example or with your code, you need some .data and some .bss (and some .text).
vectors.s
.thumb
.globl _start
_start:
.word 0x20001000
.word reset
.thumb_func
reset:
bl notmain
b .
.globl bss_start
bss_start: .word __bss_start__
.globl bss_end
bss_end: .word __bss_end__
.word __bss_size__
.globl data_rom_start
data_rom_start:
.word __data_rom_start__
.globl data_start
data_start:
.word __data_start__
.globl data_end
data_end:
.word __data_end__
.word __data_size__
so.c
unsigned int a=1;
unsigned int b=2;
unsigned int c;
unsigned int d;
unsigned int e;
unsigned int notmain ( void )
{
return(a+b+c+d+e);
}
build
arm-none-eabi-as vectors.s -o vectors.o
arm-none-eabi-gcc -O2 -c -mthumb so.c -o so.o
arm-none-eabi-ld -T test.ld vectors.o so.o -o vectors.elf
arm-none-eabi-objdump -D vectors.elf
The code so far is not specific to arm-none-whatever or arm-linux-whatever versions of the toolchain. If/when you need gcclib items you can use gcc instead of ld but you have to be careful when doing that...or provide the path to libgcc and use ld.
What we get from this code is linker script debugging on the cheap:
Disassembly of section .text:
00008000 <_start>:
8000: 20001000 andcs r1, r0, r0
8004: 00008009 andeq r8, r0, r9
00008008 <reset>:
8008: f000 f810 bl 802c <notmain>
800c: e7fe b.n 800c <reset+0x4>
0000800e <bss_start>:
800e: 0000a008 andeq sl, r0, r8
00008012 <bss_end>:
8012: 0000a014 andeq sl, r0, r4, lsl r0
8016: 0000000c andeq r0, r0, ip
0000801a <data_rom_start>:
801a: 00008058 andeq r8, r0, r8, asr r0
0000801e <data_start>:
801e: 0000a000 andeq sl, r0, r0
00008022 <data_end>:
8022: 0000a008 andeq sl, r0, r8
8026: 00000008 andeq r0, r0, r8
...
We care about the 32 bit values being created the andeq disassembly is because the disassembler is trying to disassemble those values as instructions which they are not. The reset instructions are real the rest is 32 bit values we are generating. might be able to use readelf, but getting used to disassembling, insuring the vector table is correct as step one, which is easy to see in the disassembly. Using the disassembler as a habit can then lead to using it as above to show you what the linker generated.
If you dont get the linker script variables right you wont be able to write a successful bootstrap, if you dont have a good way to see what the linker is producing you will fail on a regular basis.
Yes, you could have exposed them in C and not assembly, the toolchain would still help you there.
You can work toward this now that you can see what the linker is doing:
.thumb
.globl _start
_start:
.word 0x20001000
.word reset
.thumb_func
reset:
ldr r0,=__bss_start__
ldr r1,=__bss_size__
# zero this
ldr r0,=__data_rom_start__
ldr r1,=__data_start__
ldr r2,=__data_size__
# copy this
bl notmain
b .
giving something like this
00008000 <_start>:
8000: 20001000 andcs r1, r0, r0
8004: 00008009 andeq r8, r0, r9
00008008 <reset>:
8008: 4803 ldr r0, [pc, #12] ; (8018 <reset+0x10>)
800a: 4904 ldr r1, [pc, #16] ; (801c <reset+0x14>)
800c: 4804 ldr r0, [pc, #16] ; (8020 <reset+0x18>)
800e: 4905 ldr r1, [pc, #20] ; (8024 <reset+0x1c>)
8010: 4a05 ldr r2, [pc, #20] ; (8028 <reset+0x20>)
8012: f000 f80b bl 802c <notmain>
8016: e7fe b.n 8016 <reset+0xe>
8018: 0000a008 andeq sl, r0, r8
801c: 0000000c andeq r0, r0, ip
8020: 00008058 andeq r8, r0, r8, asr r0
8024: 0000a000 andeq sl, r0, r0
8028: 00000008 andeq r0, r0, r8
0000802c <notmain>:
802c: 4b06 ldr r3, [pc, #24] ; (8048 <notmain+0x1c>)
802e: 6818 ldr r0, [r3, #0]
8030: 685b ldr r3, [r3, #4]
8032: 18c0 adds r0, r0, r3
If you then align the items in the linker script the copy/zero code gets even simpler you can stick to 1 to some number N whole registers rather than dealing with bytes or halfwords, can use ldr/str, ldrd/strd (if available) or ldm/stm (and not need ldrb/strb nor ldrh/strh), tight simple few line loops to complete the job.
I highly recommend you do not use C for your bootstrap.
Note that the ld linker script variables are very sensitive to position (inside or outside curly braces)
The above linker script is somewhat typical of what you will find in stock linker scripts a defined start and end, sometimes the size is computed in the linker script sometimes the bootstrap code computes the size or the bootstrap code can just loop until the address equals the end value, depends on the overall system design between the two.
Your specific issue BTW is you linked in two bootstraps, at the time I wrote this I dont see your command line(s) in the question so that would tell us more. That is why you are seeing the bss_start, etc, things that you didnt put in your linker script but are often found in stock ones that come with a pre-built toolchain (similar to the above but more complicated)
It could be by using gcc instead of ld and without the the various -nostartfiles options (that it pulled in crt0.o), just try ld instead of gcc and see what changes. You would have failed with the original example had it been something like this though so I dont think that is the issue here. If you used the same command lines the failure should have been on both examples not just the latter.

The book you're reading has led you astray. Discard it and start learning from another source.
I see at least four major problems with what it has told you to do:
The linker script and _start function you included is missing a number of important sections, and will either malfunction or fail to link many executables. Most notably, it lacks any handling for BSS (zero-filled) sections.
The vector table in main.c is beyond "minimal"; it lacks the required definitions for even the standard ARM interrupt vectors. Without these, debugging hardfaults will become very difficult, as the microcontroller will treat random code following the vector table as an interrupt vector when a fault occurs, which will probably lead to a secondary fault as it fails to load code from that "address".
The startup functions given by your book bypass the libc startup functions. This will cause some portions of the standard C library, as well as any C++ code, to fail to work correctly.
You are defining peripheral addresses yourself in main.c. These addresses are all defined in standard ST header files (e.g. <stm32f4xx.h>), so there is no need to define them yourself.
As a starter, I would recommend that you refer to the startup code provided by ST in any of their examples. These will all include a complete linker script and startup code.

As old_timer hinted in the comments, using gcc to link is a problem.
If you change the linker call in your batch file to use ld, it links without error. Try the following:
echo.
echo. Call the linker
echo.
#arm-none-eabi-ld main.o -o myApp.elf -T linkerscript.ld

Related

gcc with intel x86-32 bit assembly : accessing C function arguments

I am doing an operating system implementation work.
Here's the code first :
//generate software interrupt
void generate_interrupt(int n) {
asm("mov al, byte ptr [n]");
asm("mov byte ptr [genint+1], al");
asm("jmp genint");
asm("genint:");
asm("int 0");
}
I am compiling above code with -masm=intel option in gcc. Also,
this is not complete code to generate software interrupt.
My problem is I am getting error as n undefined, how do I resolve it, please help?
Also it promts error at link time not at compile time, below is an image
When you are using GCC, you must use GCC-style extended asm to access variables declared in C, even if you are using Intel assembly syntax. The ability to write C variable names directly into an assembly insert is a feature of MSVC, which GCC does not copy.
For constructs like this, it is also important to use a single assembly insert, not several in a row; GCC can and will rearrange assembly inserts relative to the surrounding code, including relative to other assembly inserts, unless you take specific steps to prevent it.
This particular construct should be written
void generate_interrupt(unsigned char n)
{
asm ("mov byte ptr [1f+1], %0\n\t"
"jmp 1f\n"
"1:\n\t"
"int 0"
: /* no outputs */ : "r" (n));
}
Note that I have removed the initial mov and any insistence on involving the A register, instead telling GCC to load n into any convenient register for me with the "r" input constraint. It is best to do as little as possible in an assembly insert, and to leave the choice of registers to the compiler as much as possible.
I have also changed the type of n to unsigned char to match the actual requirements of the INT instruction, and I am using the 1f local label syntax so that this works correctly if generate_interrupt is made an inline function.
Having said all that, I implore you to find an implementation strategy for your operating system that does not involve self-modifying code. Well, unless you plan to get a whole lot more use out of the self-modifications, anyway.
This isn't an answer to your specific question about passing parameters into inline assembly (see #zwol's answer). This addresses using self modifying code unnecessarily for this particular task.
Macro Method if Interrupt Numbers are Known at Compile-time
An alternative to using self modifying code is to create a C macro that generates the specific interrupt you want. One trick is you need to a macro that converts a number to a string. Stringize macros are quite common and documented in the GCC documentation.
You could create a macro GENERATE_INTERRUPT that looks like this:
#define STRINGIZE_INTERNAL(s) #s
#define STRINGIZE(s) STRINGIZE_INTERNAL(s)
#define GENERATE_INTERRUPT(n) asm ("int " STRINGIZE(n));
STRINGIZE will take a numeric value and convert it into a string. GENERATE_INTERRUPT simply takes the number, converts it to a string and appends it to the end of the of the INT instruction.
You use it like this:
GENERATE_INTERRUPT(0);
GENERATE_INTERRUPT(3);
GENERATE_INTERRUPT(255);
The generated instructions should look like:
int 0x0
int3
int 0xff
Jump Table Method if Interrupt Numbers are Known Only at Run-time
If you need to call interrupts only known at run-time then one can create a table of interrupt calls (using int instruction) followed by a ret. generate_interrupt would then simply retrieve the interrupt number off the stack, compute the position in the table where the specific int can be found and jmp to it.
In the following code I get GNU assembler to generate the table of 256 interrupt call each followed by a ret using the .rept directive. Each code fragment fits in 4 bytes. The result code generation and the generate_interrupt function could look like:
/* We use GNU assembly to create a table of interrupt calls followed by a ret
* using the .rept directive. 256 entries (0 to 255) are generated.
* generate_interrupt is a simple function that takes the interrupt number
* as a parameter, computes the offset in the interrupt table and jumps to it.
* The specific interrupted needed will be called followed by a RET to return
* back from the function */
extern void generate_interrupt(unsigned char int_no);
asm (".pushsection .text\n\t"
/* Generate the table of interrupt calls */
".align 4\n"
"int_jmp_table:\n\t"
"intno=0\n\t"
".rept 256\n\t"
"\tint intno\n\t"
"\tret\n\t"
"\t.align 4\n\t"
"\tintno=intno+1\n\t"
".endr\n\t"
/* generate_interrupt function */
".global generate_interrupt\n" /* Give this function global visibility */
"generate_interrupt:\n\t"
#ifdef __x86_64__
"movzx edi, dil\n\t" /* Zero extend int_no (in DIL) across RDI */
"lea rax, int_jmp_table[rip]\n\t" /* Get base of interrupt jmp table */
"lea rax, [rax+rdi*4]\n\t" /* Add table base to offset = jmp address */
"jmp rax\n\t" /* Do sepcified interrupt */
#else
"movzx eax, byte ptr 4[esp]\n\t" /* Get Zero extend int_no (arg1 on stack) */
"lea eax, int_jmp_table[eax*4]\n\t" /* Compute jump address */
"jmp eax\n\t" /* Do specified interrupt */
#endif
".popsection");
int main()
{
generate_interrupt (0);
generate_interrupt (3);
generate_interrupt (255);
}
If you were to look at the generated code in the object file you'd find the interrupt call table (int_jmp_table) looks similar to this:
00000000 <int_jmp_table>:
0: cd 00 int 0x0
2: c3 ret
3: 90 nop
4: cd 01 int 0x1
6: c3 ret
7: 90 nop
8: cd 02 int 0x2
a: c3 ret
b: 90 nop
c: cc int3
d: c3 ret
e: 66 90 xchg ax,ax
10: cd 04 int 0x4
12: c3 ret
13: 90 nop
...
[snip]
Because I used .align 4 each entry is padded out to 4 bytes. This makes the address calculation for the jmp easier.

Compile an asm bootloader with external c code

I write a boot loader in asm and want to add some compiled C code in my project.
I created a test function here:
test.c
__asm__(".code16\n");
void print_str() {
__asm__ __volatile__("mov $'A' , %al\n");
__asm__ __volatile__("mov $0x0e, %ah\n");
__asm__ __volatile__("int $0x10\n");
}
And here is the asm code (the boot loader):
hw.asm
[org 0x7C00]
[BITS 16]
[extern print_str] ;nasm tip
start:
mov ax, 0
mov ds, ax
mov es, ax
mov ss, ax
mov sp, 0x7C00
mov si, name
call print_string
mov al, ' '
int 10h
mov si, version
call print_string
mov si, line_return
call print_string
call print_str ;call function
mov si, welcome
call print_string
jmp mainloop
mainloop:
mov si, prompt
call print_string
mov di, buffer
call get_str
mov si, buffer
cmp byte [si], 0
je mainloop
mov si, buffer
;call print_string
mov di, cmd_version
call strcmp
jc .version
jmp mainloop
.version:
mov si, name
call print_string
mov al, ' '
int 10h
mov si, version
call print_string
mov si, line_return
call print_string
jmp mainloop
name db 'MOS', 0
version db 'v0.1', 0
welcome db 'Developped by Marius Van Nieuwenhuyse', 0x0D, 0x0A, 0
prompt db '>', 0
line_return db 0x0D, 0x0A, 0
buffer times 64 db 0
cmd_version db 'version', 0
%include "functions/print.asm"
%include "functions/getstr.asm"
%include "functions/strcmp.asm"
times 510 - ($-$$) db 0
dw 0xaa55
I need to call the c function like a simple asm function
Without the extern and the call print_str, the asm script boot in VMWare.
I tried to compile with:
nasm -f elf32
But i can't call org 0x7C00
Compiling & Linking NASM and GCC Code
This question has a more complex answer than one might believe, although it is possible. Can the first stage of a bootloader (the original 512 bytes that get loaded at physical address 0x07c00) make a call into a C function? Yes, but it requires rethinking how you build your project.
For this to work you can no longer us -f bin with NASM. This also means you can't use the org 0x7c00 to tell the assembler what address the code expects to start from. You'll need to do this through a linker (either us LD directly or GCC for linking). Since the linker will lay things out in memory we can't rely on placing the boot sector signature 0xaa55 in our output file. We can get the linker to do that for us.
The first problem you will discover is that the default linker scripts used internally by GCC don't lay things out the way we want. We'll need to create our own. Such a linker script will have to set the origin point (Virtual Memory Address aka VMA) to 0x7c00, place the code from your assembly file before the data and place the boot signature at offset 510 in the file. I'm not going to write a tutorial on Linker scripts. The Binutils Documentation contains almost everything you need to know about linker scripts.
OUTPUT_FORMAT("elf32-i386");
/* We define an entry point to keep the linker quiet. This entry point
* has no meaning with a bootloader in the binary image we will eventually
* generate. Bootloader will start executing at whatever is at 0x07c00 */
ENTRY(start);
SECTIONS
{
. = 0x7C00;
.text : {
/* Place the code in hw.o before all other code */
hw.o(.text);
*(.text);
}
/* Place the data after the code */
.data : SUBALIGN(2) {
*(.data);
*(.rodata*);
}
/* Place the boot signature at LMA/VMA 0x7DFE */
.sig 0x7DFE : {
SHORT(0xaa55);
}
/* Place the uninitialised data in the area after our bootloader
* The BIOS only reads the 512 bytes before this into memory */
.bss : SUBALIGN(4) {
__bss_start = .;
*(COMMON);
*(.bss)
. = ALIGN(4);
__bss_end = .;
}
__bss_sizeb = SIZEOF(.bss);
/* Remove sections that won't be relevant to us */
/DISCARD/ : {
*(.eh_frame);
*(.comment);
}
}
This script should create an ELF executable that can be converted to a flat binary file with OBJCOPY. We could have output as a binary file directly but I separate the two processes out in the event I want to include debug information in the ELF version for debug purposes.
Now that we have a linker script we must remove the ORG 0x7c00 and the boot signature. For simplicity sake we'll try to get the following code (hw.asm) to work:
extern print_str
global start
bits 16
section .text
start:
xor ax, ax ; AX = 0
mov ds, ax
mov es, ax
mov ss, ax
mov sp, 0x7C00
call print_str ; call function
/* Halt the processor so we don't keep executing code beyond this point */
cli
hlt
You can include all your other code, but this sample will still demonstrate the basics of calling into a C function.
Assume the code above you can now generate the ELF object from hw.asm producing hw.o using this command:
nasm -f elf32 hw.asm -o hw.o
You compile each C file with something like:
gcc -ffreestanding -c kmain.c -o kmain.o
I placed the C code you had into a file called kmain.c . The command above will generate kmain.o. I noticed you aren't using a cross compiler so you'll want to use -fno-PIE to ensure we don't generate relocatable code. -ffreestanding tells GCC the C standard library may not exist, and main may not be the program entry point. You'd compile each C file in the same way.
To link this code to a final executable and then produce a flat binary file that can be booted we do this:
ld -melf_i386 --build-id=none -T link.ld kmain.o hw.o -o kernel.elf
objcopy -O binary kernel.elf kernel.bin
You specify all the object files to link with the LD command. The LD command above will produce a 32-bit ELF executable called kernel.elf. This file can be useful in the future for debugging purposes. Here we use OBJCOPY to convert kernel.elf to a binary file called kernel.bin. kernel.bin can be used as a bootloader image.
You should be able to run it with QEMU using this command:
qemu-system-i386 -fda kernel.bin
When run it may look like:
You'll notice the letter A appears on the last line. This is what we'd expect from the print_str code.
GCC Inline Assembly is Hard to Get Right
If we take your example code in the question:
__asm__ __volatile__("mov $'A' , %al\n");
__asm__ __volatile__("mov $0x0e, %ah\n");
__asm__ __volatile__("int $0x10\n");
The compiler is free to reorder these __asm__ statements if it wanted to. The int $0x10 could appear before the MOV instructions. If you want these 3 lines to be output in this exact order you can combine them into one like this:
__asm__ __volatile__("mov $'A' , %al\n\t"
"mov $0x0e, %ah\n\t"
"int $0x10");
These are basic assembly statements. It's not required to specify __volatile__on them as they are already implicitly volatile, so it has no effect. From the original poster's answer it is clear they want to eventually use variables in __asm__ blocks. This is doable with extended inline assembly (the instruction string is followed by a colon : followed by constraints.):
With extended asm you can read and write C variables from assembler and perform jumps from assembler code to C labels. Extended asm syntax uses colons (‘:’) to delimit the operand parameters after the assembler template:
asm [volatile] ( AssemblerTemplate
: OutputOperands
[ : InputOperands
[ : Clobbers ] ])
This answer isn't a tutorial on inline assembly. The general rule of thumb is that one should not use inline assembly unless you have to. Inline assembly done wrong can create hard to track bugs or have unusual side effects. Unfortunately doing 16-bit interrupts in C pretty much requires it, or you write the entire function in assembly (ie: NASM).
This is an example of a print_chr function that take a nul terminated string and prints each character out one by one using Int 10h/ah=0ah:
#include <stdint.h>
__asm__(".code16gcc\n");
void print_str(char *str) {
while (*str) {
/* AH=0x0e, AL=char to print, BH=page, BL=fg color */
__asm__ __volatile__ ("int $0x10"
:
: "a" ((0x0e<<8) | *str++),
"b" (0x0000));
}
}
hw.asm would be modified to look like this:
push welcome
call print_str ;call function
The idea when this is assembled/compiled (using the commands in the first section of this answer) and run is that it print out the welcome message. Unfortunately it will almost never work, and may even crash some emulators like QEMU.
code16 is Almost Useless and Should Not be Used
In the last section we learn that a simple function that takes a parameter ends up not working and may even crash an emulator like QEMU. The main problem is that the __asm__(".code16\n"); statement really doesn't work well with the code generated by GCC. The Binutils AS documentation says:
‘.code16gcc’ provides experimental support for generating 16-bit code from gcc, and differs from ‘.code16’ in that ‘call’, ‘ret’, ‘enter’, ‘leave’, ‘push’, ‘pop’, ‘pusha’, ‘popa’, ‘pushf’, and ‘popf’ instructions default to 32-bit size. This is so that the stack pointer is manipulated in the same way over function calls, allowing access to function parameters at the same stack offsets as in 32-bit mode. ‘.code16gcc’ also automatically adds address size prefixes where necessary to use the 32-bit addressing modes that gcc generates.
.code16gcc is what you really need to be using, not .code16. This force GNU assembler on the back end to emit address and operand prefixes on certain instructions so that the addresses and operands are treated as 4 bytes wide, and not 2 bytes.
The hand written code in NASM doesn't know it will be calling C instructions, nor does NASM have a directive like .code16gcc. You'll need to modify the assembly code to push 32-bit values on to the stack in real mode. You will also need to override the call instruction so that the return address needs to be treated as a 32-bit value, not 16-bit. This code:
push welcome
call print_str ;call function
Should be:
jmp 0x0000:setcs
setcs:
cld
push dword welcome
call dword print_str ;call function
GCC has a requirement that the direction flag be cleared before calling any C function. I added the CLD instruction to the top of the assembly code to make sure this is the case. GCC code also needs to have CS to 0x0000 to work properly. The FAR JMP does just that.
You can also drop the __asm__(".code16gcc\n"); on modern GCC that supports the -m16 option. -m16 automatically places a .code16gcc into the file that is being compiled.
Since GCC also uses the full 32-bit stack pointer it is a good idea to initialize ESP with 0x7c00, not just SP. Change mov sp, 0x7C00 to mov esp, 0x7C00. This ensures the full 32-bit stack pointer is 0x7c00.
The modified kmain.c code should now look like:
#include <stdint.h>
void print_str(char *str) {
while (*str) {
/* AH=0x0e, AL=char to print, BH=page, BL=fg color */
__asm__ __volatile__ ("int $0x10"
:
: "a" ((0x0e<<8) | *str++),
"b" (0x0000));
}
}
and hw.asm:
extern print_str
global start
bits 16
section .text
start:
xor ax, ax ; AX = 0
mov ds, ax
mov es, ax
mov ss, ax
mov esp, 0x7C00
jmp 0x0000:setcs ; Set CS to 0
setcs:
cld ; GCC code requires direction flag to be cleared
push dword welcome
call dword print_str ; call function
cli
hlt
section .data
welcome db 'Developped by Marius Van Nieuwenhuyse', 0x0D, 0x0A, 0
These commands can be build the bootloader with:
gcc -fno-PIC -ffreestanding -m16 -c kmain.c -o kmain.o
ld -melf_i386 --build-id=none -T link.ld kmain.o hw.o -o kernel.elf
objcopy -O binary kernel.elf kernel.bin
When run with qemu-system-i386 -fda kernel.bin it should look simialr to:
In Most Cases GCC Produces Code that Requires 80386+
There are number of disadvantages to GCC generated code using .code16gcc:
ES=DS=CS=SS must be 0
Code must fit in the first 64kb
GCC code has no understanding of 20-bit segment:offset addressing.
For anything but the most trivial C code, GCC doesn't generate code that can run on a 286/186/8086. It runs in real mode but it uses 32-bit operands and addressing not available on processors earlier than 80386.
If you want to access memory locations above the first 64kb then you need to be in Unreal Mode(big) before calling into C code.
If you want to produce real 16-bit code from a more modern C compiler I recommend OpenWatcom C
The inline assembly is not as powerful as GCC
The inline assembly syntax is different but it is easier to use and less error prone than GCC's inline assembly.
Can generate code that will run on antiquated 8086/8088 processors.
Understands 20-bit segment:offset real mode addressing and supports the concept of far and huge pointers.
wlink the Watcom linker can produce basic flat binary files usable as a bootloader.
Zero Fill the BSS Section
The BIOS boot sequence doesn't guarantee that memory is actually zero. This causes a potential problem for the zero initialized region BSS. Before calling into C code for the first time the region should be zero filled by our assembly code. The linker script I originally wrote defines a symbol __bss_start that is the offset of the BSS memory and __bss_sizeb is the size in bytes. Using this info you can use the STOSB instruction to easily zero fill it. At the top of hw.asm you can add:
extern __bss_sizeb
extern __bss_start
And after the CLD instruction and before calling any C code you can do the zero fill this way:
; Zero fill the BSS section
mov cx, __bss_sizeb ; Size of BSS computed in linker script
mov di, __bss_start ; Start of BSS defined in linker script
rep stosb ; AL still zero, Fill memory with zero
Other Suggestions
To reduce the bloat of the code generated by the compiler it can be useful to use -fomit-frame-pointer. Compiling with -Os can optimize for space (rather than speed). We have limited space (512 bytes) for the initial code loaded by the BIOS so these optimizations can be beneficial. The command line for compiling could appear as:
gcc -fno-PIC -fomit-frame-pointer -ffreestanding -m16 -Os -c kmain.c -o kmain.o
I write a boot loader in asm and want to add some compiled C code in my project.
Then you need to use a 16-bit x86 compiler, such as OpenWatcom.
GCC cannot safely build real-mode code, as it is unaware of some important features of the platform, including memory segmentation. Inserting the .code16 directive will make the compiler generate incorrect output. Despite appearing in many tutorials, this piece of advice is simply incorrect, and should not be used.
First i want to express how to link C compiled code with assembled file.
I put together some Q/A in SO and reach to this.
C code:
func.c
//__asm__(".code16gcc\n");when we use eax, 32 bit reg we cant use this as truncate
//problem
#include <stdio.h>
int x = 0;
int madd(int a, int b)
{
return a + b;
}
void mexit(){
__asm__ __volatile__("mov $0, %ebx\n");
__asm__ __volatile__("mov $1, %eax \n");
__asm__ __volatile__("int $0x80\n");
}
char* tmp;
///how to direct use of arguments in asm command
void print_str(int a, char* s){
x = a;
__asm__("mov x, %edx\n");// ;third argument: message length
tmp = s;
__asm__("mov tmp, %ecx\n");// ;second argument: pointer to message to write
__asm__("mov $1, %ebx\n");//first argument: file handle (stdout)
__asm__("mov $4, %eax\n");//system call number (sys_write)
__asm__ __volatile__("int $0x80\n");//call kernel
}
void mtest(){
printf("%s\n", "Hi");
//putchar('a');//why not work
}
///gcc -c func.c -o func
Assembly code:
hello.asm
extern mtest
extern printf
extern putchar
extern print_str
extern mexit
extern madd
section .text ;section declaration
;we must export the entry point to the ELF linker or
global _start ;loader. They conventionally recognize _start as their
;entry point. Use ld -e foo to override the default.
_start:
;write our string to stdout
push msg
push len
call print_str;
call mtest ;print "Hi"; call printf inside a void function
; use add inside func.c
push 5
push 10
call madd;
;direct call of <stdio.h> printf()
push eax
push format
call printf; ;printf(format, eax)
call mexit; ;exit to OS
section .data ;section declaration
format db "%d", 10, 0
msg db "Hello, world!",0xa ;our dear string
len equ $ - msg ;length of our dear string
; nasm -f elf32 hello.asm -o hello
;Link two files
;ld hello func -o hl -lc -I /lib/ld-linux.so.2
; ./hl run code
;chain to assemble, compile, Run
;; gcc -c func.c -o func && nasm -f elf32 hello.asm -o hello && ld hello func -o hl -lc -I /lib/ld-linux.so.2 && echo &&./hl
Chain commands for assemble, compile and Run
gcc -c func.c -o func && nasm -f elf32 hello.asm -o hello && ld hello func -o hl -lc -I /lib/ld-linux.so.2 && echo && ./hl
Edit[toDO]
Write boot loader code instead of this version
Some explanation on how ld, gcc, nasm works.

static C variable not getting initialized

I have one file-level static C variable that isn't getting initialized.
const size_t VGA_WIDTH = 80;
const size_t VGA_HEIGHT = 25;
static uint16_t* vgat_buffer = (uint16_t*)0x62414756; // VGAb
static char vgat_initialized= '\0';
In particular, vgat_initialized isn't always 0 the first time it is accessed. (Of course, the problem only appears on certain machines.)
I'm playing around with writing my own OS, so I'm pretty sure this is a problem with my linker script; but, I'm not clear how exactly the variables are supposed to be organized in the image produced by the linker (i.e., I'm not sure if this variable is supposed to go in .data, .bss, some other section, etc.)
VGA_WIDTH and VGA_HEIGHT get placed in the .rodata section as expected.
vgat_buffer is placed in the .data section, as expected (By initializing this variable to 0x62417656, I can clearly see where the linker places it in the resulting image file.)
I can't figure out where vgat_initialized is supposed to go. I've included the relevant parts of the assembly file below. From what I understand, the .comm directive is supposed to allocate space for the variable in the data section; but, I can't tell where. Looking in the linker's map file didn't provide any clues either.
Interestingly enough, if I change the initialization to
static char vgat_initialized= 'x';
everything works as expected: I can clearly see where the variable is placed in the resulting image file (i.e., I can see the x in the hexdump of the image file).
Assembly code generated from the C file:
.text
.LHOTE15:
.local buffer.1138
.comm buffer.1138,100,64
.local buffer.1125
.comm buffer.1125,100,64
.local vgat_initialized
.comm vgat_initialized,1,1
.data
.align 4
.type vgat_buffer, #object
.size vgat_buffer, 4
vgat_buffer:
.long 1648445270
.globl VGA_HEIGHT
.section .rodata
.align 4
.type VGA_HEIGHT, #object
.size VGA_HEIGHT, 4
VGA_HEIGHT:
.long 25
.globl VGA_WIDTH
.align 4
.type VGA_WIDTH, #object
.size VGA_WIDTH, 4
VGA_WIDTH:
.long 80
.ident "GCC: (GNU) 4.9.2"
compilers can conform to their own names for sections certainly but using the common .data, .text, .rodata, .bss that we know from specific compilers, this should land in .bss.
But that doesnt in any way automatically zero it out. There needs to be a mechanism, sometimes depending on your toolchain the toolchain takes care of it and creates a binary that in addition to .data, .rodata (and naturally .text) being filled in will fill in .bss in the binary. But depends on a few things, primarily is this a simple ram only image, is everything living under one memory space definition in the linker script.
you could for example put .data after .bss in the linker script and depending the binary format you use and/or tools that convert that you could end up with zeroed memory in the binary without any other work.
Normally though you should expect to using toolchain specific (linker scripts are linker specific not to be assumed to be universal to all tools) mechanism for defining where .bss is from your perspective, then some form of communication from the linker as to where it starts and what size, that information is used by the bootstrap whose job it is to zero it in that case, and one can assume it is always the bootstrap's job to zero .bss with naturally some exceptions. Likewise if the binary is meant to be on a read only media (rom, flash, etc) but .data, and .bss are read/write you need to have .data in its entirety on this media then someone has to copy it to its runtime position in ram, and .bss is either part of that depending on the toolchain and how you used it or the start address and size are on the read only media and someone has to zero that space at some point pre-main(). Here again this is the job of the bootstrap. Set the stack pointer, move .data if needed, zero .bss are the typical minimal jobs of the bootstrap, you can shortcut them in special cases or avoid using .data or .bss.
Since it is the linkers job to take all the little .data and .bss (and other) definitions from the objects being linked and combine them per the directions from the user (linker script, command line, whatever that tool uses), the linker ultimately knows.
In the case of gcc you use what I would call variables that are defined in the linker script, the linker script can fill in these values with matching variable/label names for the assembler such that a generic bootstrap can be used and you dont have to do any more work than that.
Like this but possibly more complicated
MEMORY
{
bob : ORIGIN = 0x8000, LENGTH = 0x1000
ted : ORIGIN = 0xA000, LENGTH = 0x1000
}
SECTIONS
{
.text : { *(.text*) } > bob
__data_rom_start__ = .;
.data : {
__data_start__ = .;
*(.data*)
} > ted AT > bob
__data_end__ = .;
__data_size__ = __data_end__ - __data_start__;
.bss : {
__bss_start__ = .;
*(.bss*)
} > bob
__bss_end__ = .;
__bss_size__ = __bss_end__ - __bss_start__;
}
then you can pull these into the assembly language bootstrap
.globl bss_start
bss_start: .word __bss_start__
.globl bss_end
bss_end: .word __bss_end__
.word __bss_size__
.globl data_rom_start
data_rom_start:
.word __data_rom_start__
.globl data_start
data_start:
.word __data_start__
.globl data_end
data_end:
.word __data_end__
.word __data_size__
and then write some code to operate on those as needed for your design.
you can simply put things like that in a linked in assembly language file without other code using them and assemble, compile other code and link and then the disassembly or other tools you prefer will show you what the linker generated, tweak that until you are satisfied then you can write or borrow or steal bootstrap code to use them.
for bare metal I prefer to not completely conform to the standard with my code, dont have any .data and dont expect .bss to be zero, so my bootstrap sets the stack pointer and calls main, done. For an operating system, you should conform. the toolchains already have this solved for the native platform, but if you are taking over that with your own linker script and boostrap then you need to deal with it, if you want to use an existing toolchains solution for an existing operating system then...done...just do that.
This answer is simply an extension of the others. As has been mentioned C standard has rules about initialization:
10) If an object that has automatic storage duration is not initialized explicitly, its value is indeterminate. If an object that has static storage duration is not initialized explicitly, then:
if it has pointer type, it is initialized to a null pointer;
if it has arithmetic type, it is initialized to (positive or unsigned) zero;
if it is an aggregate, every member is initialized (recursively) according to these rules;
if it is a union, the first named member is initialized (recursively) according to these rules.
The problem in your code is that a computers memory may not always be initialized to zero. It is up to you to make sure the BSS section is initialized to zero in a free standing environment (like your OS and bootloader).
The BSS sections usually don't (by default) take up space in a binary file and usually occupy memory in the area beyond the limits of the code and data that appears in the binary. This is done to reduce the size of the binary that has to be read into memory.
I know you are writing an OS for x86 booting with legacy BIOS. I know that you are using GCC from your other recent questions. I know you are using GNU assembler for part of your bootloader. I know that you have a linker script, but I don't know what it looks like. The usual mechanism to do this is via a linker script that places the BSS data at the end, and creates start and end symbols to define the address extents of the section. Once these symbols are defined by the linker they can be used by C code (or assembly code) to loop through the region and set it to zero.
I present a reasonably simple MCVE that does this. The code reads an extra sector with the kernel with Int 13h/AH=2h; enables the A20 line (using fast A20 method); loads a GDT with 32-bit descriptors; enables protected mode; completes the transition into 32-bit protected mode; and then calls a kernel entry point in C called kmain. kmain calls a C function called zero_bss that initializes the BSS section based on the starting and ending symbols (__bss_start and __bss_end) generated by a custom linker script.
boot.S:
.extern kmain
.globl mbrentry
.code16
.section .text
mbrentry:
# If trying to create USB media, a BPB here may be needed
# At entry DL contains boot drive number
# Segment registers to zero
xor %ax, %ax
mov %ax, %ds
mov %ax, %es
# Set stack to grow down from area under the place the bootloader was loaded
mov %ax, %ss
mov $0x7c00, %sp
cld # Ensure forward direction of MOVS/SCAS/LODS instructions
# which is required by generated C code
# Load kernel into memory
mov $0x02, %ah # Disk read
mov $1, %al # Read 1 sector
xor %ch, %ch # Cylinder 0
xor %dh, %dh # Head 0
mov $2, %cl # Start reading from second sector
mov $0x7e00, %bx # Load kernel at 0x7e00
int $0x13
# Quick and dirty A20 enabling. May not work on all hardware
a20fast:
in $0x92, %al
or $2, %al
out %al, $0x92
loadgdt:
cli # Turn off interrupts until a Interrupt Vector
# Table (IVT) is set
lgdt (gdtr)
mov %cr0, %eax
or $1, %al
mov %eax, %cr0 # Enable protected mode
jmp $0x08,$init_pm # FAR JMP to next instruction to set
# CS selector with a 32-bit code descriptor and to
# flush the instruction prefetch queue
.code32
init_pm:
# Set remaining 32-bit selectors
mov $DATA_SEG, %ax
mov %ax, %ds
mov %ax, %es
mov %ax, %fs
mov %ax, %gs
mov %ax, %ss
# Start executing kernel
call kmain
cli
loopend: # Infinite loop when finished
hlt
jmp loopend
.align 8
gdt_start:
.long 0 # null descriptor
.long 0
gdt_code:
.word 0xFFFF # limit low
.word 0 # base low
.byte 0 # base middle
.byte 0b10011010 # access
.byte 0b11001111 # granularity/limit high
.byte 0 # base high
gdt_data:
.word 0xFFFF # limit low (Same as code)
.word 0 # base low
.byte 0 # base middle
.byte 0b10010010 # access
.byte 0b11001111 # granularity/limit high
.byte 0 # base high
end_of_gdt:
gdtr:
.word end_of_gdt - gdt_start - 1
# limit (Size of GDT)
.long gdt_start # base of GDT
CODE_SEG = gdt_code - gdt_start
DATA_SEG = gdt_data - gdt_start
kernel.c:
#include <stdint.h>
extern uintptr_t __bss_start[];
extern uintptr_t __bss_end[];
/* Zero the BSS section 4-bytes at a time */
static void zero_bss(void)
{
uint32_t *memloc = __bss_start;
while (memloc < __bss_end)
*memloc++ = 0;
}
int kmain(){
zero_bss();
return 0;
}
link.ld
ENTRY(mbrentry)
SECTIONS
{
. = 0x7C00;
.mbr : {
boot.o(.text);
boot.o(.*);
}
. = 0x7dfe;
.bootsig : {
SHORT(0xaa55);
}
. = 0x7e00;
.kernel : {
*(.text*);
*(.data*);
*(.rodata*);
}
.bss : SUBALIGN(4) {
__bss_start = .;
*(COMMON);
*(.bss*);
}
. = ALIGN(4);
__bss_end = .;
/DISCARD/ : {
*(.eh_frame);
*(.comment);
}
}
To compile, link and generate a binary file that can be used in a disk image from this code, you could use commands like:
as --32 boot.S -o boot.o
gcc -c -m32 -ffreestanding -O3 kernel.c
gcc -ffreestanding -nostdlib -Wl,--build-id=none -m32 -Tlink.ld \
-o boot.elf -lgcc boot.o kernel.o
objcopy -O binary boot.elf boot.bin
The C standard says that static variables must be zero-initialized, even in absence of explicit initializer, so static char vgat_initialized= '\0'; is equivalent to static char vgat_initialized;.
In ELF and other similar formats, the zero-initialized data, such as this vgat_initialized goes to the .bss section. If you load such an executable yourself into memory, you need to explicitly zero the .bss part of the data segment.
The other answers are very complete and very helpful. In turns out that, in my specific case, I just needed to know that static variables initialized to 0 were put in .bss and not .data. Adding a .bss section to the linker script placed a zeroed-out section of memory in the image which solved the problem.

Link object files from header files in real mode when using GCC -m16 option?

I would like to implement header files in my c-code which consists partly of GCC inline assembly code for 16 bit real mode but i seem to have linking problems. This is what my header file console.h looks like:
#ifndef CONSOLE_H
#define CONSOLE_H
extern void kprintf(char*);
#endif
and this is console.c:
#include "console.h"
void kprintf(char *string)
{
for(int i=0;string[i]!='\0';i++)
{
asm("mov $0x0e,%%ah;"
"mov $0x00,%%bh;"
"mov %0,%%al;"
"int $0x10"::"g"(string[i]):"eax", "ebx");
}
}
the last one hellworld.c:
asm("jmp main");
#include "console.h"
void main()
{
asm("mov $0x1000,%ax;"
"mov %ax,%es;"
"mov %ax,%ds");
char string[]="hello world";
kprintf(string);
asm(".rept 512;"
"hlt;"
".endr");
}
My bootloader is in bootloader.asm:
org 0x7c00
bits 16
section .text
mov ax,0x1000
mov ss,ax
mov sp,0x000
mov esp,0xfffe
xor ax,ax
mov es,ax
mov ds,ax
mov [bootdrive],dl
mov bh,0
mov bp,zeichen
mov ah,13h
mov bl,06h
mov al,1
mov cx,6
mov dh,010h
mov dl,01h
int 10h
load:
mov dl,[bootdrive]
xor ah,ah
int 13h
jc load
load2:
mov ax,0x1000
mov es,ax
xor bx,bx
mov ah,2
mov al,1
mov cx,2
xor dh,dh
mov dl,[bootdrive]
int 13h
jc load2
mov ax,0
mov es,ax
mov bh,0
mov bp,zeichen3
mov ah,13h
mov bl,06h
mov al,1
mov cx,13
mov dh,010h
mov dl,01h
int 10h
mov ax,0x1000
mov es,ax
mov ds,ax
jmp 0x1000:0x000
zeichen db 'hello2'
zeichen3 db 'soweit so gut'
bootdrive db 0
times 510 - ($-$$) hlt
dw 0xaa55
Now I use the following buildscript build.sh:
#!bin/sh
nasm -f bin bootloader.asm -o bootloader.bin
gcc hellworld.c -m16 -c -o hellworld.o -nostdlib -ffreestanding
gcc console.c -m16 -c -o console.o -nostdlib link.ld -ffreestanding
ld -melf_i386 -Ttext=0x0000 console.o hellworld.o -o hellworld.elf
objcopy -O binary hellworld.elf hellworld.bin
cat bootloader.bin hellworld.bin >disk.img
qemu-system-i386 disk.img
and the linkscript link.ld:
/*
* link.ld
*/
OUTPUT_FORMAT(elf32-i386)
SECTIONS
{
. = 0x0000;
.text : { *(.startup); *(.text) }
.data : { *(.data) }
.bss : { *(.bss) }
}
Unfortunately it isn't working because it doesn't print the expected hello world. I think there must be something wrong with the linking command:
ld -melf_i386 -Ttext=0x0000 console.o hellword.o link.ld -o hellworld.elf`
How do I link header-files in 16-bit mode correctly?
When I write the kprintf function directly in the hellworld.c it is working correctly. I am using Linux Mint Cinnamon Version 18 64 bit for development.
The header files are not really the issue at all. When you restructured the code and split it into multiple objects it has identified issues with how you build and how jmp main is placed into the final kernel file.
I have created a set of files that make all the adjustments discussed below if you wish to test the complete set of changes to see if they rectify your problems.
Although you show the linker script, you aren't actually using it. In your build file you have:
ld -melf_i386 -Ttext=0x0000 console.o hellworld.o -o hellworld.elf
It should be:
ld -melf_i386 -Tlink.ld console.o hellworld.o -o hellworld.elf
When using -c (compiles but doesn't link) with GCC don't specify link.ld as a linker script. The linker script can be specified at link time when you invoke LD. This line:
gcc console.c -m16 -c -o console.o -nostdlib link.ld -ffreestanding
Should be:
gcc console.c -m16 -c -o console.o -nostdlib -ffreestanding
In order for this linker script to locate the jmp main in a place that is first in the output kernel file you need to change:
asm("jmp main");
To:
asm(".pushsection .startup\r\n"
"jmp main\r\n"
".popsection\r\n");
The .pushsection temporarily changes the section to .startup, outputs the instruction jmp main and then restores the section with .popsection to whatever it was before. The linker script deliberately places anything in the .startup section before anything else. This ensures the jmp main (or any other instructions you place there) appear as the very first instructions of the output kernel file. The \r\n can be replaced by ; (semicolon). \r\n makes for prettier output if you ever have GCC generate an assembly file.
As mentioned in the comments of a now deleted question your kernel file exceeds the size of a single sector. When you don't have a linker script, the default one will place the data section after the code. Your code has repeated the hlt instruction so that your kernel is greater than 1 sector (512 bytes) and your bootloader only reads a single sector with Int 13h/AH=2h .
To rectify this remove:
asm(".rept 512;"
"hlt;"
".endr");
And replace it with:
asm("cli;"
"hlt;");
You should be mindful that as your kernel grows you'll need to adjust the number of sectors read in bootloader.asm to ensure all of the kernel is loaded into memory.
I also suggest that to keep QEMU, and other virtual machines happy that you simply generate a well known disk image size and place the bootloader and kernel inside it. Rather than:
cat bootloader.bin hellworld.bin >disk.img
Use this:
dd if=/dev/zero of=disk.img bs=1024 count=1440
dd if=bootloader.bin of=disk.img seek=0 conv=notrunc
dd if=hellworld.bin of=disk.img seek=1 conv=notrunc
The first command makes a zero filled file of 1440kb. This is the exact size of a 1.44MB floppy. The second command inserts bootloader.bin in the first sector without truncating the disk file. The third command places the kernel file into the disk images starting at the second sector on the disk without truncating the disk image.
I had made available a slightly improved linker script. It was amended to remove some of the potential cruft that the linker may insert into the kernel that won't be of much use and specifically identifies some of the sections like .rodata (read only data) etc.
/*
* link.ld
*/
OUTPUT_FORMAT(elf32-i386)
SECTIONS
{
. = 0x0000;
.text : { *(.startup); *(.text) }
.data : { *(.data); *(.rodata) }
.bss : { *(COMMON); *(.bss) }
/DISCARD/ : {
*(.eh_frame);
*(.comment);
*(.note.gnu.build-id);
}
}
Other Comments
Not related to your question but this code can be removed:
asm("mov $0x1000,%ax;"
"mov %ax,%es;"
"mov %ax,%ds");
You do this in bootloader.asm, so setting these segment registers again with the same value won't do anything useful.
You can improve the extended assembly template by using input constraints to pass the values you need via register EAX(AX) and EBX(BX) rather than coding the moves inside the template. Your code could have looked like:
void kprintf(const char *string)
{
while (*string)
{
asm("int $0x10"
:
:"a"((0x0e<<8) | *string++), /* AH = 0x0e, AL = char to print */
"b"(0)); /* BH = 0x00 page #
BL = 0x00 unused in text mode */
}
}
<< is the C bit shift left operator. 0x0e<<8 would shift 0x0e left 8 bits which would be 0x0e00. | is bitwise OR which effectively places the character to print in the lower 8 bits. That value is then passed into the EAX register by the assembly template via input constraint "a".
It is hard to say without knowing what your bootloader.asm does, but:
The link order must be wrong;
ld -melf_i386 -Ttext=0x0000 console.o hellworld.o -o hellworld.elf
should be:
ld -melf_i386 -Ttext=0x0000 hellworld.o console.o -o hellworld.elf
(Edit: I see that you have a linker script which would remove the need for this re-arrangement, but you're not using it for the link).
I suspect that your bootloader loads a single sector, and your padding:
asm(".rept 512;"
"hlt;"
".endr");
... prevents the code from the other object file from ever being loaded, since it pads hellword.o to (more than) the size of a sector.
The problem is nothing to do with the use of header files, it is because you have two compilation units which become separate objects, and the combined size of both when linked is larger than a sector (512 bytes).

Can Libffi be built for Cortex-M3?

I'm trying to build the foreign function interface library for a Cortex-M3 processor using GCC. According to http://gcc.gnu.org/onlinedocs/gcc/ARM-Options.html:
-mthumb
Generate code for the Thumb instruction set. The default is to use the 32-bit ARM instruction set. This option automatically enables either 16-bit Thumb-1 or mixed 16/32-bit Thumb-2 instructions based on the -mcpu=name and -march=name options. This option is not passed to the assembler. If you want to force assembler files to be interpreted as Thumb code, either add a `.thumb' directive to the source or pass the -mthumb option directly to the assembler by prefixing it with -Wa.
I've tried passing various various arguments to the assembler and can't seem to figure it out. Typical output as follows:
Building file: ../source/ffi/sysv.S
Invoking: GCC Assembler
arm-bare_newlib_cortex_m3_nommu-eabi-gcc -Wa,-mthumb-interwork -I"/home/neil/m3projects/robovero/firmware/include" -o"source/ffi/sysv.o" "../source/ffi/sysv.S"
../source/ffi/sysv.S: Assembler messages:
../source/ffi/sysv.S:145: Error: selected processor does not support ARM opcodes
../source/ffi/sysv.S:147: Error: attempt to use an ARM instruction on a Thumb-only processor -- `stmfd sp!,{r0-r3,fp,lr}'
...
Can I use libffi on Cortex-M3 without becoming an assembly expert?
It might be worth noting that when I invoke arm-bare_newlib_cortex_m3_nommu-eabi-as directly I get different errors.
I modify the sysV.S as follolwing, the error is caused by the ".arm" directive, when using cortex-m3, it should be comment out.
#ifdef __ARM_ARCH_7M__ /* cortex-m3 */
#undef __THUMB_INTERWORK__
#endif
#if __ARM_ARCH__ >= 5
# define call_reg(x) blx x
#elif defined (__ARM_ARCH_4T__)
# define call_reg(x) mov lr, pc ; bx x
# if defined(__thumb__) || defined(__THUMB_INTERWORK__)
# define __INTERWORKING__
# endif
#else
# define call_reg(x) mov lr, pc ; mov pc, x
#endif
/* Conditionally compile unwinder directives. */
#ifdef __ARM_EABI__
#define UNWIND
#else
#define UNWIND #
#endif
#if defined(__thumb__) && !defined(__THUMB_INTERWORK__)
.macro ARM_FUNC_START name
.text
.align 0
.thumb
.thumb_func
#ifdef __APPLE__
ENTRY($0)
#else
ENTRY(\name)
#endif
#ifndef __ARM_ARCH_7M__ /* not cortex-m3 */
bx pc
nop
.arm
#endif
UNWIND .fnstart
/* A hook to tell gdb that we've switched to ARM mode. Also used to call
directly from other local arm routines. */
#ifdef __APPLE__
_L__$0:
#else
_L__\name:
#endif
.endm
I hate to say it but it is a porting effort. Doable, not necessarily having to be an assembler expert, but will need to learn some. Going from thumb to arm is easy, thumb2, I would have to look that up, much of thumb2 is just thumb instructions. and thumb has a one to one mapping to arm instructions, but not the other way around. Thumb mostly limits you to the lower 8 registers on all the workhorse instructions, with special versions or special instructions to use the upper registers. So many of your arm instructions are going to turn into more than one thumb instruction.
Initially see if there is a build option to build this package without using assembler or go into that directory and see if there is something you can do in the makefile to use a C program instead of assembler. I assume there is a serious performance issue to using C which is why there is assembler to start with. Thumb2 in theory is more efficient than arm but that does not necessarily mean a direct port from arm to thumb2. So with some experience you may be able to hand port to thumb2 and keep some performance.
EDIT:
Downloaded the file in question. The define stuff up front implies that it is aware of both thumb and armv7m. is that how you are getting to where you were changing stm to push?
The assembler is telling you the truth - ARM assembly code can't be assembled to work successfully on a Thumb-2-only processor like the M3. There are no way for the assembler to map the ARM instruction mnemonics into opcodes that will make sense to a Cortex-M3. You'll need to port the assembly files to Thumb-2 assembly code to get things working. Depending on what the original assembly code does, you might get lucky and be able to port to C instead, but that may cost you a major performance hit.
Add "-Wa,-mimplicit-it=thumb" to the gcc CFLAGS to avoid "thumb conditional instruction should be in IT block" error
--- libffi.orig/src/arm/sysv.S
+++ libffi/src/arm/sysv.S
## -91,6 +91,10 ##
# define __ARM_ARCH__ 7
#endif
+#ifdef __ARM_ARCH_7M__ /* cortex-m3 */
+#undef __THUMB_INTERWORK__
+#endif
+
#if __ARM_ARCH__ >= 5
# define call_reg(x) blx x
#elif defined (__ARM_ARCH_4T__)
## -121,9 +125,11 ##
#else
ENTRY(\name)
#endif
+#ifndef __ARM_ARCH_7M__ /* not cortex-m3 */
bx pc
nop
.arm
+#endif
UNWIND .fnstart
/* A hook to tell gdb that we've switched to ARM mode. Also used to call
directly from other local arm routines. */
## -164,6 +170,10 ## _L__\name:
#endif
.endm
+#ifdef __ARM_ARCH_7M__ /* cortex-m3 */
+ .syntax unified
+#endif
+
# r0: ffi_prep_args
# r1: &ecif
# r2: cif->bytes
## -180,7 +190,11 ## ARM_FUNC_START ffi_call_SYSV
UNWIND .setfp fp, sp
# Make room for all of the new args.
+#ifdef __ARM_ARCH_7M__ /* cortex-m3 */
+ sub sp, sp, r2
+#else
sub sp, fp, r2
+#endif
# Place all of the ffi_prep_args in position
mov r0, sp
## -193,7 +207,12 ## ARM_FUNC_START ffi_call_SYSV
ldmia sp, {r0-r3}
# and adjust stack
+#ifdef __ARM_ARCH_7M__ /* cortex-m3 */
+ mov lr, sp
+ sub lr, fp, lr # cif->bytes == fp - sp
+#else
sub lr, fp, sp # cif->bytes == fp - sp
+#endif
ldr ip, [fp] # load fn() in advance
cmp lr, #16
movhs lr, #16
## -305,7 +324,13 ## ARM_FUNC_START ffi_closure_SYSV
beq .Lretlonglong
.Lclosure_epilogue:
add sp, sp, #16
+#ifdef __ARM_ARCH_7M__ /* cortex-m3 */
+ ldr ip, [sp, #4]
+ ldr sp, [sp]
+ mov pc, ip
+#else
ldmfd sp, {sp, pc}
+#endif
.Lretint:
ldr r0, [sp]
b .Lclosure_epilogue
## -381,7 +406,12 ## LSYM(Lbase_args):
ldmia sp, {r0-r3}
# and adjust stack
+#ifdef __ARM_ARCH_7M__ /* cortex-m3 */
+ mov lr, sp
+ sub lr, ip, lr # cif->bytes == (fp - 64) - sp
+#else
sub lr, ip, sp # cif->bytes == (fp - 64) - sp
+#endif
ldr ip, [fp] # load fn() in advance
cmp lr, #16
movhs lr, #16
## -469,7 +499,13 ## ARM_FUNC_START ffi_closure_VFP
.Lclosure_epilogue_vfp:
add sp, sp, #72
+#ifdef __ARM_ARCH_7M__ /* cortex-m3 */
+ ldr ip, [sp, #4]
+ ldr sp, [sp]
+ mov pc, ip
+#else
ldmfd sp, {sp, pc}
+#endif
.Lretfloat_vfp:
flds s0, [sp]

Resources