arm32 gcc: how to ensure 8-byte stack alignment for 'unsigned long long' args? - gcc

Using toolchain:
"gcc-arm-none-eabi-9-2020-q2-update"
Build command:
"arm-none-eabi-gcc -MMD -g -Wno-discarded-qualifiers -O0 -mcpu=cortex-r52 -c -DGCC -mthumb -mfloat-abi=hard -mfpu=fp-armv8 -nostartfiles -ffreestanding -falign-functions=16 -falign-jumps=8 -falign-loops=8 -fomit-frame-pointer -funroll-loops printf.c -o printf.o"
C code:
int _printf(const char *fmt, ...) {
    va_list ap;
    va_start(ap, fmt);
    my_vprintf(fmt, ap);
    va_end(ap);
    return 0;
}

int printf_test(void) {
    _printf("test hex long number = %d %d 0x%lx\n", 123, 321, 0x123456789abcdef0ull);
    return 0;
}
static void out_num(long long n, int base, char lead, int maxwidth) {
    unsigned long long m = 0;
    char buf[MAX_NUMBER_BYTES], *s = buf + sizeof(buf);
    int count = 0, i = 0;

    *--s = '\0';
    if (n < 0 && base == 10) {
        m = -n;
    } else {
        m = n;
    }
    do {
        *--s = hex_tab[m % base];
        count++;
    } while ((m /= base) != 0);
    if (maxwidth && count < maxwidth) {
        for (i = maxwidth - count; i; i--) {
            *--s = lead;
        }
    }
    if (n < 0 && base == 10) {
        *--s = '-';
    }
    outs(s);
}
asm:
<printf_test>:
b500 push {lr}
b083 sub sp, #12
a30a add r3, pc, #40 ; (adr r3, 4012a0 <printf_test+0x30>)
e9d3 2300 ldrd r2, r3, [r3]
e9cd 2300 strd r2, r3, [sp]      // save the long long arg onto the stack
f240 1241 movw r2, #321 ; 0x141
217b movs r1, #123 ; 0x7b
f245 0020 movw r0, #20512 ; 0x5020
f2c0 0040 movt r0, #64 ; 0x40
f7ff ffe0 bl 401250 <_printf>
2300 movs r3, #0
4618 mov r0, r3
b003 add sp, #12
f85d fb04 ldr.w pc, [sp], #4
bf00 nop
f3af 8000 nop.w
9abcdef0 bls ff338e68 <__bss_end+0xfe3f4ba8>
12345678 eorsne r5, r4, #120, 12 ; 0x7800000
f3af 8000 nop.w
f3af 8000 nop.w
<_printf>:
b40f push {r0, r1, r2, r3}
b500 push {lr}
b083 sub sp, #12
ab05 add r3, sp, #20             // va_list ap points at the saved r1; is the long long arg stored at [sp] by the caller being missed?
9301 str r3, [sp, #4]
9901 ldr r1, [sp, #4]
9804 ldr r0, [sp, #16]
f7ff fe9f bl 400fa0 <my_vprintf>
2300 movs r3, #0
4618 mov r0, r3
b003 add sp, #12
f85d eb04 ldr.w lr, [sp], #4
b004 add sp, #16
4770 bx lr
Inside my_vprintf, the memory at the address held in va_list ap is:
0x0000000000F439B8 0000007B 00000141 12345678 9ABCDEF0 {...A...xV4.ðÞ¼.
0x0000000000F439C8 12345678 0000000A 004044D1 00000000 xV4.....ÑD#.....
The printed result is:
test hex long number = 123 321 0x9abcdef012345678
The expected output is:
test hex long number = 123 321 0x123456789abcdef0
my_vprintf code:
static int my_vprintf(const char *fmt, va_list ap) {
    char lead = ' ';
    int alignleft = 0;
    int maxwidth = 0;

    for (; *fmt != '\0'; fmt++) {
        if (*fmt != '%') {
            outc(*fmt);
            continue;
        }
        lead = ' ';
        maxwidth = 0;
        // format: %08d, %8d, %d, %u, %x, %f, %c, %s
        fmt++;
        if (*fmt == '-') {
            fmt++;
            alignleft = 1;
        }
        if (*fmt == '0') {
            lead = '0';
            fmt++;
        }
        while (*fmt >= '0' && *fmt <= '9') {
            maxwidth *= 10;
            maxwidth += (*fmt - '0');
            fmt++;
        }
        switch (*fmt) {
        case 'd':
            out_num(va_arg(ap, int), 10, lead, maxwidth);
            break;
        case 'o':
            out_num(va_arg(ap, unsigned int), 8, lead, maxwidth);
            break;
        case 'u':
            out_num(va_arg(ap, unsigned int), 10, lead, maxwidth);
            break;
        case 'l':
            fmt++;
            if (*fmt == 'd') {
                out_num(va_arg(ap, long long), 10, lead, maxwidth);
            } else if (*fmt == 'u') {
                out_num(va_arg(ap, unsigned long long), 10, lead, maxwidth);
            } else if (*fmt == 'x' || *fmt == 'X') {
                out_num(va_arg(ap, unsigned long long), 16, lead, maxwidth);
            }
            break;
        case 'f':
            out_float(va_arg(ap, double), maxwidth);
            break;
        case 'X':
        case 'x':
            out_num(va_arg(ap, unsigned int), 16, lead, maxwidth);
            break;
        case 'c':
            outc(va_arg(ap, int));
            break;
        case 's':
            out_str(va_arg(ap, char *), lead, alignleft, maxwidth);
            break;
        default:
            outc(*fmt);
            break;
        }
    }
    return 0;
}
It seems the long long argument passed on the stack is not being retrieved as expected by the va_arg handling. Why does this happen?
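For reference, the AAPCS requires 64-bit variadic arguments to sit at 8-byte-aligned positions, so a conforming va_arg is expected to round the argument pointer up to an 8-byte boundary before reading an unsigned long long. Below is a minimal sketch of that rounding step, assuming a plain char*-style argument pointer; fetch_ull is a hypothetical helper for illustration, not the toolchain's actual va_arg implementation. Note also that on arm32 long is 32 bits, so %lx normally denotes a 32-bit argument, while the code above treats %l... as 64-bit.

/* Hedged sketch: fetch a 64-bit variadic value the way an AAPCS-conforming
 * va_arg would, by first aligning the argument pointer to 8 bytes. */
#include <stdint.h>
#include <string.h>

static unsigned long long fetch_ull(char **ap)
{
    uintptr_t p = (uintptr_t)*ap;
    p = (p + 7u) & ~(uintptr_t)7u;          /* round up to an 8-byte boundary */
    unsigned long long v;
    memcpy(&v, (const void *)p, sizeof v);  /* read the two 32-bit halves */
    *ap = (char *)(p + sizeof v);           /* advance past the value */
    return v;
}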

Related

Is there a way to adjust the text size (trying to change text from one language to another inside an exe PE file) using hex editors?

What I'm trying to do is translate an application's interface language from Korean to English.
I've added a new section using CFF Explorer because the application doesn't have room to extend the translated text.
Then I used Ghidra to change the address of the text to the new location.
The problem I've faced is that the new text is cropped at a specific character.
e.g.
The original text is USB를 읽고 있습니다. and the new text is Reading The USB.
The problem is that the new text appears as Reading The US, without the last character, and the same thing happens in other places where I've tried to change the text with this approach. I know this problem is related to the text size, but I can't find where I can change the text size.
Here is a sample of code taken from IDA:
.text:00049784 ; =============== S U B R O U T I N E =======================================
.text:00049784
.text:00049784
.text:00049784 sub_49784 ; CODE XREF: sub_491B0+50↑p
.text:00049784 ; DATA XREF: .pdata:00616468↓o
.text:00049784
.text:00049784 var_3C = -0x3C
.text:00049784 var_38 = -0x38
.text:00049784 var_34 = -0x34
.text:00049784 var_30 = -0x30
.text:00049784 var_2C = -0x2C
.text:00049784 var_28 = -0x28
.text:00049784 var_24 = -0x24
.text:00049784 var_20 = -0x20
.text:00049784 var_1C = -0x1C
.text:00049784 var_18 = -0x18
.text:00049784 var_14 = -0x14
.text:00049784 var_10 = -0x10
.text:00049784
.text:00049784 PUSH {R4,R5,LR}
.text:00049788 SUB SP, SP, #0x30
.text:0004978C MOV R4, R0
.text:00049790 LDR R2, [R4,#8] ; xLeft
.text:00049794 MOV R0, #0x41 ; 'A'
.text:00049798 STR R0, [SP,#0x3C+var_10] ; int
.text:0004979C MOV R1, #0xFFFFFF
.text:000497A0 LDR R0, =dword_146998
.text:000497A4 MOV R5, #1
.text:000497A8 STR R1, [SP,#0x3C+var_14] ; int
.text:000497AC MOV R3, #0 ; yTop
.text:000497B0 LDR R1, [R0]
.text:000497B4 LDR R0, =off_11D9E0 ; "Reading the USB"
.text:000497B8 STR R1, [SP,#0x3C+var_18] ; int
.text:000497BC MOV R1, #0x32 ; '2'
.text:000497C0 STR R0, [SP,#0x3C+var_1C] ; int
.text:000497C4 MOV R0, #0x3C ; '<'
.text:000497C8 STR R1, [SP,#0x3C+var_20] ; int
.text:000497CC MOV R1, #0x190
.text:000497D0 STR R0, [SP,#0x3C+var_28] ; int
.text:000497D4 MOV R0, #0xDA ; 'Ú'
.text:000497D8 STR R1, [SP,#0x3C+var_2C] ; wchar_t *
.text:000497DC MOV R1, #0x168
.text:000497E0 STR R0, [SP,#0x3C+var_30] ; int
.text:000497E4 LDR R0, =dword_146CA4
.text:000497E8 STR R1, [SP,#0x3C+var_34] ; int
.text:000497EC MOV R1, #0
.text:000497F0 STR R0, [SP,#0x3C+var_38] ; int
.text:000497F4 STR R1, [SP,#0x3C+var_3C] ; int
.text:000497F8 MOV R1, #0 ; int
.text:000497FC LDR R0, [R4,#0x30] ; int
.text:00049800 STR R5, [SP,#0x3C+var_24] ; int
.text:00049804 BL sub_55320
.text:00049808 LDR R0, [R4,#0x30]
.text:0004980C MOV R3, #1 ; int
.text:00049810 STR R5, [SP,#0x3C+var_3C] ; int
.text:00049814 LDR R0, [R0,#0x70] ; int
.text:00049818 MOV R2, #0 ; int
.text:0004981C MOV R1, #0 ; int
.text:00049820 BL sub_57990
.text:00049824 ADD SP, SP, #0x30 ; '0'
.text:00049828 POP {R4,R5,LR}
.text:0004982C BX LR
.text:0004982C ; ---------------------------------------------------------------------------
.text:00049830 ; const int off_49830
.text:00049830 off_49830 DCD dword_146CA4 ; DATA XREF: sub_49784+60↑r
.text:00049834 ; const int off_49834
.text:00049834 off_49834 DCD off_11D9E0 ; DATA XREF: sub_49784+30↑r
.text:00049834 ; "Reading the USB"
.text:00049838 off_49838 DCD dword_146998 ; DATA XREF: sub_49784+1C↑r
.text:00049838 ; End of function sub_49784
.text:00049838
.text:0004983C
.text:0004983C ; =============== S U B R O U T I N E =======================================
Any suggestions, please?

Why are non-consecutive loads faster, even when the cache miss penalty is guaranteed to be zero?

Background:
I wrote a function in C and compiled it with arm-none-eabi-gcc (7-2018-q2-update).
The generated assembly code for the loop body looks like it should take 20 cycles per iteration,
including 2 wait states for load operations accessing constant data from non-volatile program memory.
However, the NVM controller cache for my MCU says that the cache miss penalty is guaranteed to be zero,
so I'm not sure why it can't prefetch the data for the two NVM load operations.
Therefore, I think the loop should take 18 cycles per iteration.
Unfortunately, the measured performance is quite different from the expected performance.
If I change int8_t increment and int16_t patch_data_i so that both are int32_t,
then GCC generates effectively the same instructions in a slightly different order.
Let's call this version (b).
The interesting thing is that version (a) takes 21 cycles per iteration and version (b) takes 20 cycles per iteration!
This performance difference is highly repeatable.
I have measured it very precisely, by varying the number of iterations between (5, 6, 7, 8) for version (a) and version (b).
Timing measurements from a Tektronix 465 oscilloscope at a fixed B sweep setting:
T(a)[min, max, avg] = (20.0, 21.0, 20.3) c # 48 MHz.
T(b)[min, max, avg] = (21.0, 22.0, 21.3) c # 48 MHz.
(Performance of this loop body is crucial, since it executes 8 iterations,
and this function is called once every 2000 clock cycles.
For my application, even this single-cycle difference amounts to roughly 0.5 percent of total CPU time.)
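As a cross-check on the oscilloscope-based timing, cycles can also be counted on the target itself. A minimal sketch using the Cortex-M0+ SysTick down-counter (assuming the CMSIS device header; measure_cycles is an illustrative name, and the small overhead of the two SysTick reads is not subtracted):

#include <stdint.h>
#include "samd21.h"                 /* assumed CMSIS device header for the SAMD21 */

/* Count core-clock cycles spent in fn() using the 24-bit SysTick down-counter. */
static uint32_t measure_cycles(void (*fn)(void))
{
    SysTick->LOAD = 0x00FFFFFFu;                /* maximum 24-bit reload value   */
    SysTick->VAL  = 0u;                         /* any write clears the counter  */
    SysTick->CTRL = SysTick_CTRL_CLKSOURCE_Msk  /* clock SysTick from the CPU    */
                  | SysTick_CTRL_ENABLE_Msk;
    uint32_t start = SysTick->VAL;
    fn();                                       /* e.g. a wrapper that calls
                                                   enc_calc_transition() N times */
    uint32_t end = SysTick->VAL;
    SysTick->CTRL = 0u;                         /* stop the counter              */
    return start - end;                         /* down-counter, so start > end  */
}

Varying N and diffing the returned counts isolates the per-iteration cost, much like varying the iteration count in the scope measurements above.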
My question has 4 parts:
What is going on here?
Why is it that version (a) takes 21 cycles and version (b) takes 20 cycles?
Why don't both versions take 18 cycles?
Is there any possible way to accurately predict the access latency for RAM and NVM on an Atmel SAMD21 microcontroller,
other than trying random permutations of assembly operations and measuring everything on an oscilloscope?
(Answers to any 1 of these 4 parts would be extremely appreciated.)
Source code (version a)
__attribute__((used))
void enc_calc_transition(struct enc *enc, uint16_t old_state, uint16_t
new_state)
{
uint32_t transitions = enc_interleave_states(old_state, new_state);
size_t j = 0;
for (size_t i = 0; i < 8; i++, j += 4) {
const size_t transition = (transitions >> j) & 0xf;
const int8_t increment = enc_increment_for_transition[transition];
int16_t patch_data_i = enc->patch_data[i];
patch_data_i += increment;
size_t patch_data_i_plus_one = patch_data_i + 1;
patch_data_i = enc_constrain_8x258[patch_data_i_plus_one];
enc->patch_data[i] = patch_data_i;
}
}
Source code (version b)
__attribute__((used))
void enc_calc_transition(struct enc *enc, uint16_t old_state, uint16_t
new_state)
{
uint32_t transitions = enc_interleave_states(old_state, new_state);
size_t j = 0;
for (size_t i = 0; i < 8; i++, j += 4) {
const size_t transition = (transitions >> j) & 0xf;
const int32_t increment = enc_increment_for_transition[transition];
int32_t patch_data_i = enc->patch_data[i];
patch_data_i += increment;
size_t patch_data_i_plus_one = patch_data_i + 1;
patch_data_i = enc_constrain_8x258[patch_data_i_plus_one];
enc->patch_data[i] = patch_data_i;
}
}
Generated assembly (version a)
cyc addr code instr fields
x 894e: 2200 movs r2, #0
x 8950: 250f movs r5, #15
x 8952: 4f09 ldr r7, [pc, #36] ; (8978)
x 8954: 4e09 ldr r6, [pc, #36] ; (897c)
x 8956: 4460 add r0, ip
1 8958: 000b movs r3, r1
1 895a: 40d3 lsrs r3, r2
1 895c: 402b ands r3, r5
2 895e: 7804 ldrb r4, [r0, #0]
2 8960: 56f3 ldrsb r3, [r6, r3]
1 8962: 3204 adds r2, #4
1 8964: 191b adds r3, r3, r4
1 8966: 18fb adds r3, r7, r3
2 8968: 785b ldrb r3, [r3, #1]
2 896a: 7003 strb r3, [r0, #0]
1 896c: 3001 adds r0, #1
1 896e: 2a20 cmp r2, #32
2 8970: d1f2 bne.n 8958 <enc_calc_transition+0x38>
18
x 8972: bdf0 pop {r4, r5, r6, r7, pc}
x 8974: 000090a8 ; <enc_expand_16x256>
x 8978: 00008fa4 ; <enc_constrain_8x258>
x 897c: 00008f94 ; <enc_increment_for_transition> [signed, 8x16]
instruction cycles:
movs lsrs ands ldrb ldrsb adds adds adds ldrb strb adds cmp bne
= 1 + 1 + 1 + 2 + 2 + 1 + 1 + 1 + 2 + 2 + 1 + 1 + 2
= 18
Generated assembly (version b)
cyc addr code instr fields
x 894e: 2200 movs r2, #0
x 8950: 250f movs r5, #15
x 8952: 4f09 ldr r7, [pc, #36] ; (8978)
x 8954: 4e09 ldr r6, [pc, #36] ; (897c)
x 8956: 4460 add r0, ip
1 8958: 0021 movs r1, r4
1 895a: 40d1 lsrs r1, r2
2 895c: 7803 ldrb r3, [r0, #0]
1 895e: 4029 ands r1, r5
2 8960: 5671 ldrsb r1, [r6, r1]
1 8962: 18fb adds r3, r7, r3
1 8964: 185b adds r3, r3, r1
2 8966: 785b ldrb r3, [r3, #1]
1 8968: 3204 adds r2, #4
2 896a: 7003 strb r3, [r0, #0]
1 896c: 3001 adds r0, #1
1 896e: 2a20 cmp r2, #32
2 8970: d1f2 bne.n 8958
18
x 8972: bdf0 pop {r4, r5, r6, r7, pc}
x 8974: 000090a8 ; <enc_expand_16x256>
x 8978: 00008fa4 ; <enc_constrain_8x258>
x 897c: 00008f94 ; <enc_increment_for_transition> [signed, 8x16]
instruction cycles:
movs lsrs ldrb ands ldrsb adds adds ldrb adds strb adds cmp bne
= 1 + 1 + 2 + 1 + 2 + 1 + 1 + 2 + 1 + 2 + 1 + 1 + 2
= 18
My interpretation of the generated assembly (version a)
I have written out my "interpretation" of the generated assembly for each case.
This section might be unnecessary, but I thought I might as well include it since it helped me understand the differences between (a) and (b).
As above, the portions before and after the loop are identical.
The only significant difference I can see is that the two versions execute the same instructions in a slightly different order.
In particular, version (b) (which takes 20 cycles per iteration),
has zero instances of consecutive load/store operations,
zero instances of consecutive load/load operations,
and zero instances of consecutive store/store operations.
(The documented number of wait states for each load operation is commented in brackets: 1 wait state would be indicated by // ^ ldrb [1].)
r2 size_t j = 0;
r5 uint32_t mask_0xf = 0xf;
r7 uint8_t *constrain = &enc_constrain_8x258[0]; // 0x8fa4
r6 uint8_t *increment_for_transition =
&enc_increment_for_transition[0]; // 0x8f94
r0 uint8_t *patch_data = &enc->patch_data[0]
do {
r3 uint32_t _transitions = transitions;
r3 uint32_t transitions_shifted = _transitions >> j;
r3 size_t transition = transitions_shifted & mask_0xf;
r4 int16_t patch_data_i = *(patch_data + 0); //
// ^ ldrb [0]
r3 int8_t _increment = *(increment_for_transition + transition);
// ^ ldrsb [1]
j += 4;
r3 int16_t inc_plus_pdata = _increment + patch_data_i;
r3 uint8_t *constrain_plus_inc_plus_pdata =
constrain + inc_plus_pdata;
r3 uint8_t constrained_pdata = *(constrain_plus_inc_plus_pdata + 1);
// ^ ldr [1]
*(patch_data + 0) = constrained_pdata;
// ^ strb [0]
patch_data++;
} while (j < 32);
My interpretation of the generated assembly (version b)
r2 size_t j = 0;
r5 uint32_t mask_0xf = 0xf;
r7 uint8_t *constrain = &enc_constrain_8x258[0]; // 0x8fa4
r6 uint8_t *increment_for_transition =
&enc_increment_for_transition[0]; // 0x8f94
r0 uint8_t *patch_data = &enc->patch_data[0]
do {
r1 uint32_t _transitions = transitions;
r1 uint32_t transitions_shifted = _transitions >> j;
r3 int32_t patch_data_i = *(patch_data + 0);
// ^ ldrb [0]
r1 size_t transition = transitions_shifted & mask_0xf;
r1 int32_t _increment = *(increment_for_transition + transition);
// ^ ldrsb [1]
r3 uint8_t *constrain_plus_pdata = constrain + patch_data_i;
r3 uint8_t *constrain_plus_pdata_plus_inc =
constrain_plus_pdata + _increment;
r3 uint8_t constrained_pdata = *(constrain_plus_pdata_plus_inc + 1);
// ^ ldr [1]
j += 4;
*(patch_data + 0) = constrained_pdata;
// ^ strb [0]
patch_data++;
} while (j < 32);
Platform information
The microcontroller is the Atmel/Microchip AT91SAMD21G18A.
The architecture is ARMv6-M.
The microarchitecture is ARM Cortex-M0+.
The master clock frequency of my MCU core is 48 MHz.
At 48 MHz, the SAMD21 [non-volatile] program memory requires 1 wait state if the cache is disabled.
At 48 MHz, the SAMD21 SRAM requires zero wait states.
However, I don't see any reason why it would be faster to execute the code from RAM.
I believe the NVM data path is separate from the RAM data path,
so instruction fetches from NVM should never contend with data fetches from RAM
(I'm not 100% sure about this fact, but I think it's true.).
Therefore, if the NVM controller cache is working as documented,
it seems that running this loop from NVM should almost certainly be faster than running this loop from RAM.
The SAMD21 has a 64-byte cache for accesses to non-volatile memory.
The NVM controller cache "is a direct-mapped cache that implements 8 lines of 64 bits (i.e., 64 bytes)."
The NVM controller cache is enabled, in NO_MISS_PENALTY mode.
This is the datasheet description of NO_MISS_PENALTY mode:
"The NVM controller (cache system) does not insert wait states on a cache miss.
Gives the best system performance."
The datasheet does not provide any more information about NO_MISS_PENALTY mode.
Cortex-M0+ uses a von Neumann architecture: instruction fetches always contend with data accesses, whether the data is in zero-wait-state SRAM or in flash.

Is this a GCC bug or am I doing something wrong?

I am trying to get the final accumulate in the code below to use the ARM M7 SMLAL 32*32->64-bit multiply-accumulate instruction. If I include the T3 = T3 + 1 then it does use it, but if I comment it out it does a full 64*64-bit multiply and accumulate using 3 multiply and 2 add instructions. I don't actually want to add 1 to T3, so that line needs to go.
I've broken the code down so that I could analyse it in more detail, and it definitely seems that the cast of T3 to int32_t, throwing away the bottom 32 bits from the multiply, isn't being picked up by the compiler: it still thinks T3 is 64 bits wide. But when I add the simple increment of T3 it gets it right. I tried adding zero instead, but then it goes back to the full 64*64-bit multiply.
I'm using -O2 optimisation on ST's STM32CubeIDE, which uses a version of GCC. Other optimisation levels never use SMLAL, or they unroll everything.
int64_t T4 = 0;
osc = key * NumHarmonics;
harmonic = 0;
do
{
    if (OscLevel[osc] > 1)
    {
        OscPhase[osc] = OscPhase[osc] + (uint32_t)(T2);
        int32_t T5 = Sine[(OscPhase[osc] >> 16) & 0x0000FFFF];
        int64_t T6 = (int64_t)T1 * Tremelo[harmonic];
        int32_t T3 = (int32_t)(T6 >> 32);      // grab the most significant register
        // T3 = T3 + 1;                        // needs the +1 to force use of SMLAL in the next statement! (+0 doesn't help)
        T4 = T4 + (int64_t)T3 * (int64_t)T5;   // should be SMLAL but does a full 64*64 mult if no +1 above
    }
    osc++;
    harmonic++;
}
while (harmonic < NumHarmonics);
OscTotal = T4;
Without the addition:
800054e: 4b13 ldr r3, [pc, #76] ; (800059c <main+0xd8>)
8000550: f853 1024 ldr.w r1, [r3, r4, lsl #2]
8000554: ea4f 79e1 mov.w r9, r1, asr #31
8000558: fba7 4501 umull r4, r5, r7, r1
800055c: fb07 f309 mul.w r3, r7, r9
8000560: fb01 3202 mla r2, r1, r2, r3
8000564: 4415 add r5, r2
8000566: e9dd 2300 ldrd r2, r3, [sp]
800056a: 1912 adds r2, r2, r4
800056c: 416b adcs r3, r5
800056e: e9cd 2300 strd r2, r3, [sp]
}
osc++;
8000572: 3001 adds r0, #1
harmonic++;
With the addition:
8000542: 4b0b ldr r3, [pc, #44] ; (8000570 <main+0xac>)
8000544: f853 3020 ldr.w r3, [r3, r0, lsl #2]
8000548: fbc3 6701 smlal r6, r7, r3, r1
}
osc++;
800054c: 3201 adds r2, #1
harmonic++;
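To isolate the code-generation question, a stripped-down sketch of the same pattern may help (the function name is illustrative and not from the original code). With both multiplicands visibly 32-bit, GCC for the Cortex-M7 at -O2 would normally be expected to emit a single SMLAL for the accumulate:

#include <stdint.h>

/* 32x32->64 signed multiply-accumulate: the pattern that typically maps to SMLAL. */
int64_t mac32x32_64(int64_t acc, int32_t a, int32_t b)
{
    return acc + (int64_t)a * b;
}

Comparing the disassembly of this function against the loop above may show whether it is the apparent width of T3, rather than the accumulate itself, that blocks SMLAL.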

gcc ARM produces incorrect code - how to correct

gcc ARM for STM32F407 micro
The following function is used as a sanity check in FreeRtosTCP
UBaseType_t bIsValidNetworkDescriptor( const NetworkBufferDescriptor_t * pxDesc )
{
    uint32_t offset = ( uint32_t ) ( ((const char *)pxDesc) - ((const char *)xNetworkBuffers) );
    if( ( offset >= (uint32_t)(sizeof( xNetworkBuffers )) ) || ( ( offset % sizeof( xNetworkBuffers[0] ) ) != 0 ) )
        return pdFALSE;
    return (UBaseType_t) (pxDesc - xNetworkBuffers) + 1;
}
The line in question is ---> offset >= (uint32_t)(sizeof( xNetworkBuffers ))
gcc produces a bhi instruction after the cmp instead of a bhs.
I've tried casting both as shown in the code above, but nothing seems to get the bhs instruction to be used.
Any help appreciated.
Thanks.
Joe
Well, knowing the exact size of the xNetworkBuffers array, the compiler can simply optimize the comparison. Being curious, I gave it a try. The following is the code with small modifications, the asm output, and the explanation:
#include <stdint.h>

typedef struct abc {
    char data[10];
} NetworkBufferDescriptor_t;

NetworkBufferDescriptor_t xNetworkBuffers[5];

int bIsValidNetworkDescriptor( const NetworkBufferDescriptor_t * pxDesc )
{
    uint32_t offset = ( uint32_t ) ( ((const char *)pxDesc) - ((const char *)xNetworkBuffers) );
    if( ( offset >= (uint32_t)(sizeof( xNetworkBuffers )) ) || ( ( offset % sizeof( xNetworkBuffers[0] ) ) != 0 ) )
        return 0;
    return (int) (pxDesc - xNetworkBuffers) + 1;
}
and the asm output is:
bIsValidNetworkDescriptor:
# Function supports interworking.
# args = 0, pretend = 0, frame = 16
# frame_needed = 1, uses_anonymous_args = 0
# link register save eliminated.
str fp, [sp, #-4]!
add fp, sp, #0
sub sp, sp, #20
str r0, [fp, #-16]
ldr r3, [fp, #-16]
ldr r2, .L5
sub r3, r3, r2
str r3, [fp, #-8]
ldr r3, [fp, #-8]
cmp r3, #49
bhi .L2
ldr r1, [fp, #-8]
ldr r3, .L5+4
umull r2, r3, r1, r3
lsr r2, r3, #3
mov r3, r2
lsl r3, r3, #2
add r3, r3, r2
lsl r3, r3, #1
sub r2, r1, r3
cmp r2, #0
beq .L3
.L2:
mov r3, #0
b .L4
.L3:
ldr r3, [fp, #-16]
ldr r2, .L5
sub r3, r3, r2
asr r2, r3, #1
mov r3, r2
lsl r3, r3, #1
add r3, r3, r2
lsl r1, r3, #4
add r3, r3, r1
lsl r1, r3, #8
add r3, r3, r1
lsl r1, r3, #16
add r3, r3, r1
lsl r3, r3, #2
add r3, r3, r2
add r3, r3, #1
.L4:
mov r0, r3
add sp, fp, #0
# sp needed
ldr fp, [sp], #4
bx lr
.L6:
.align 2
.L5:
In the quoted asm code you can see that it is comparing with 49, not 50 (50 being the actual size of xNetworkBuffers). So the conclusion I reached is that
offset >= (uint32_t)(sizeof( xNetworkBuffers ))
is equivalent to
offset > (uint32_t)(sizeof( xNetworkBuffers ) - 1)
and in that case the compiler can use BHI and produce the same result.
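A tiny illustration of that equivalence (hypothetical helper names): for unsigned operands and a nonzero constant N, x >= N and x > N - 1 are the same test, so the compiler is free to encode either one as cmp #N-1 followed by bhi.

#include <stdint.h>

/* sizeof(xNetworkBuffers) is 50 here (5 elements of 10 bytes each). */
int ge_50(uint32_t x) { return x >= 50u; }  /* may be emitted as cmp #49; bhi  */
int gt_49(uint32_t x) { return x >  49u; }  /* identical machine code expected */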
I think the code generated by GCC is correct, technically speaking. offset cannot be larger than INT_MAX, because this is the maximum value representable in ptrdiff_t on this architecture.
You can compute the difference like this:
uintptr_t offset = (uintptr_t)pxDesc - (uintptr_t)xNetworkBuffers;
This is still implementation-defined, but it will avoid the overflow problem.
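Applied to the original function, a hedged sketch of that suggestion might look like this (it assumes the same FreeRTOS types and xNetworkBuffers array as above, plus <stdint.h> for uintptr_t; behaviour for a pointer that does not actually point into xNetworkBuffers remains implementation-defined, but the subtraction no longer goes through ptrdiff_t):

UBaseType_t bIsValidNetworkDescriptor( const NetworkBufferDescriptor_t * pxDesc )
{
    uintptr_t offset = (uintptr_t)pxDesc - (uintptr_t)xNetworkBuffers;
    if( ( offset >= sizeof( xNetworkBuffers ) ) ||
        ( ( offset % sizeof( xNetworkBuffers[0] ) ) != 0 ) )
        return pdFALSE;
    /* offset is a multiple of the element size here, so this equals
       (pxDesc - xNetworkBuffers) + 1 from the original code. */
    return (UBaseType_t)( offset / sizeof( xNetworkBuffers[0] ) ) + 1;
}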

Draw a line with raspberry pi 3

I am trying to draw a line using this example I found online, but when I run it, it just prints the resolution of the screen I am using. I have already connected different monitors, but it still does the same thing. Can anyone help with what I might be doing wrong?
This is draw.s
#---------------------------------
# For screen size
# 1824x984
#---------------------------------
.text
.align 2
.global main
main:
#-------------------------
#get screen address
#-------------------------
bl getScreenAddr
ldr r1,=pixelAddr
str r0,[r1]
#-------------------------
#draw line using pixels
#-------------------------
mov r5,#0 #x count
loop:
cmp r5, #1824
bgt end
ldr r0,=pixelAddr
ldr r0,[r0]
ldr r1,=x
ldr r1,[r1]
ldr r2,=y
ldr r2,[r2]
ldr r3,=green
ldr r3,[r3]
bl pixel
ldr r1,=x
ldr r7,[r1]
add r7,#1
str r7,[r1]
add r5,#1
b loop
end:
#-------------------------
#draw a single pixel
#-------------------------
ldr r0,=pixelAddr
ldr r0,[r0]
mov r1,#900
mov r2,#100
mov r3,#24
bl pixel
#exit syscall
mov r7,#1
swi 0
.data
.balign 4
.global pixelAddr
pixelAddr: .word 0
x: .word 0 #value in x
y: .word 900 #value in y
red: .word 192
blue: .word 24
green: .word 7
.end
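For reference, this is roughly what the assembly above intends, expressed in C against the getScreenAddr()/pixel() helpers from pixel.c below (an illustrative sketch only, not part of the original program):

extern int  getScreenAddr(void);
extern void pixel(int memPtr, int x, int y, int c);

int main(void) {
    int addr = getScreenAddr();
    int x = 0, y = 900, green = 7;

    /* cmp r5, #1824 / bgt: the loop body runs for r5 = 0..1824 inclusive. */
    for (int i = 0; i <= 1824; i++) {
        pixel(addr, x, y, green);
        x++;
    }
    pixel(addr, 900, 100, 24);   /* the single pixel drawn after the loop */
    return 0;
}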
And this is the pixel.c library
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <sys/ioctl.h>   /* for ioctl() */
#include <linux/fb.h>
#include <sys/mman.h>

int fbfd = 0;
struct fb_var_screeninfo vinfo;
struct fb_fix_screeninfo finfo;
long int screensize = 0;
int xRes = 0;
int yRes = 0;
char *fbp = 0;

//-----------------------------------------------
// Get screen buffer memory address
//-----------------------------------------------
int getScreenAddr(){
    int addr;
    // Open the file for reading and writing
    fbfd = open("/dev/fb0", O_RDWR);
    if (fbfd == -1) {
        printf("Error: cannot open framebuffer device.\n");
        return(1);
    }
    // Get fixed screen information
    if (ioctl(fbfd, FBIOGET_FSCREENINFO, &finfo)) {
        printf("Error reading fixed information.\n");
    }
    // Get variable screen information
    if (ioctl(fbfd, FBIOGET_VSCREENINFO, &vinfo)) {
        printf("Error reading variable information.\n");
    }
    printf("%dx%d,%d bits per pixel\n", vinfo.xres, vinfo.yres, vinfo.bits_per_pixel );
    xRes = vinfo.xres;
    yRes = vinfo.yres;
    // map framebuffer to user memory
    screensize = finfo.smem_len;
    fbp = (char*)mmap(0, screensize, PROT_READ | PROT_WRITE, MAP_SHARED, fbfd, 0);
    addr = (int)fbp;
    close(fbfd);
    return(addr);
}

//-----------------------------------------------
// Draw pixel in x,y coordinates with color c
// (assumes 16 bits per pixel: 2 bytes per pixel)
//-----------------------------------------------
void pixel(int memPtr, int x, int y, int c){
    memset((char*)memPtr + 2*(xRes*y + x), c, 2);
}
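The pixel() helper hard-codes 2 bytes per pixel and assumes a row stride of exactly xRes*2 bytes. Below is a hedged sketch of an offset calculation driven by the values the ioctls above already return (illustrative only; it reuses the finfo/vinfo globals and the same headers as pixel.c):

// Variant of pixel() that derives the byte offset from the framebuffer info
// instead of assuming 16 bits per pixel.
void pixel_generic(char *fb, int x, int y, int c){
    long bytes_per_pixel = vinfo.bits_per_pixel / 8;
    long location = (long)y * finfo.line_length + (long)x * bytes_per_pixel;
    memset(fb + location, c, bytes_per_pixel);
}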
