ARM assembly quicksort and recursion - gcc

I'm trying to translate C quicksort code into ARM assembly.
Apologies in advance for the long question; I've included the whole code just in case, but the only part causing problems is the recursive call.
I've almost finished this work except for the function recursion part.
I've also tried converting the C code to ARM assembly with a gcc cross compiler to see how the compiler does it, but there are still a few things I can't understand well.
This is the C code:
void quicksort_c(int array[8], int first, int last)
{
    int i, j, pivot, temp;
    if (first < last)
    {
        pivot = first;
        i = first;
        j = last;
        while (i < j)
        {
            while (array[i] <= array[pivot] && i < last)
            {
                i++;
            }
            while (array[j] > array[pivot])
            {
                j--;
            }
            if (i < j)
            {
                temp = array[i];
                array[i] = array[j];
                array[j] = temp;
            }
        }
        temp = array[pivot];
        array[pivot] = array[j];
        array[j] = temp;
        quicksort_c(array, first, j - 1);
        quicksort_c(array, j + 1, last);
    }
}
And this is the ARM assembly code that I wrote from scratch:
quicksort
        STMFD   sp!, {r0-r8,lr}
quicksort_recursion
        CMP     r1, r2               ;// if(first < last)
        BGE     out_of_if
outer_if
        MOV     r5, r1               ;// pivot = first
        MOV     r3, r1               ;// i = first;
        MOV     r4, r2               ;// j = last;
outer_while_loop
        CMP     r3, r4               ;// while(i < j)
        BGE     end_of_outer_while
outer_while
inner_while_1_loop
        ;// while(array[i] <= array[pivot] && i < last)
        LDR     r7, [r0, r3, lsl #2] ;// r7 = array[i]
        LDR     r8, [r0, r5, lsl #2] ;// r8 = array[pivot]
        CMP     r7, r8               ;// array[i] <= array[pivot]
        BGT     inner_while_2_loop
        CMP     r3, r2               ;// i < last
        BGE     inner_while_2_loop
inner_while_1
        ADD     r3, #1               ;// i++
        B       inner_while_1_loop
inner_while_2_loop
        ;// while(array[j] > array[pivot])
        LDR     r7, [r0, r4, lsl #2] ;// r7 = array[j]
        LDR     r8, [r0, r5, lsl #2] ;// r8 = array[pivot]
        CMP     r7, r8               ;// (array[j] > array[pivot])
        BLE     inner_if_cmp
inner_while_2
        SUB     r4, #1
        B       inner_while_2_loop
inner_if_cmp
        CMP     r3, r4               ;// if(i < j)
        BGE     outer_while_loop
inner_if
        LDR     r7, [r0, r3, lsl #2] ;// r7 = array[i]
        MOV     r6, r7               ;// temp = array[i]
        LDR     r8, [r0, r4, lsl #2] ;// r8 = array[j]
        STR     r8, [r0, r3, lsl #2] ;// array[i] = array[j]
        STR     r6, [r0, r4, lsl #2] ;// array[j] = temp;
        B       outer_while_loop
end_of_outer_while
        LDR     r7, [r0, r5, lsl #2] ;// r7 = array[pivot]
        MOV     r6, r7               ;// temp = array[pivot]
        LDR     r8, [r0, r4, lsl #2] ;// r8 = array[j]
        STR     r8, [r0, r5, lsl #2] ;// array[pivot] = array[j]
        STR     r6, [r0, r4, lsl #2] ;// array[j] = temp
        ;// quicksort(array, first, j-1)
        STMFD   sp!, {r0-r8,lr}
        SUBS    r2, r4, #1
        BL      quicksort_recursion
        ;//LDMFD sp!, {r0-r8,pc}
        ;// quicksort(array, j+1, last)
        STMFD   sp!, {r0-r8,lr}
        ADDS    r1, r4, #1
        BL      quicksort_recursion
        ;//LDMFD sp!, {r0-r8,pc}
out_of_if
end_function
        LDMFD   sp!, {r0-r8,pc}
        END
And this is how I use the registers:
r0: array pointer
r1: first index
r2: last index
r3: i
r4: j
r5: pivot
r6: temp
r7: array[?] value
r8: array[?] value
I've confirmed that every part of my code except the recursion works well.
At first, I did the recursion this way:
        ;// quicksort(array, first, j-1)
        SUBS    r2, r4, #1
        BL      quicksort
        ;// quicksort(array, j+1, last)
        ADDS    r1, r4, #1
        BL      quicksort
But this code only performs the first recursive call.
For example:
Array before quicksorting : 5 1 4 7 8 3 6
Array after quicksorting : 1 2 4 3 5 8 7 6
I think it's because after the function call it doesn't go back to where it branched from, so I added a push and a pop before and after the function call, like this:
        ;// quicksort(array, first, j-1)
        STMFD   sp!, {r0-r8,lr}
        SUBS    r2, r4, #1
        BL      quicksort_recursion
        ;// quicksort(array, j+1, last)
        STMFD   sp!, {r0-r8,lr}
        ADDS    r1, r4, #1
        BL      quicksort_recursion
But this code falls into an infinite loop.
According to the gcc-compiled code:
quicksort_linux
cmp r1, r2
blt L16
bx lr
L16
push {r4, r5, r6, r7, r8, r9, r10, lr}
mov r10, r1
add ip, r0, r1, lsl #2
mov r5, r2
mov r4, r1
L3
lsls r3, r4, #2
add lr, r0, r3
ldr r7, [r0, r4, lsl #2]
ldr r6, [ip]
cmp r2, r4
ite le
movle r8, #0
movgt r8, #1
cmp r7, r6
it gt
movgt r8, #0
adds r3, r3, #4
add r3, r3, r0
cmp r8, #0
beq L9
L4
adds r4, r4, #1
mov lr, r3
ldr r7, [r3], #4
cmp r2, r4
ite le
movle r1, #0
movgt r1, #1
cmp r7, r6
it gt
movgt r1, #0
cmp r1, #0
bne L4
L9
lsls r3, r5, #2
add r9, r0, r3
ldr r1, [r0, r5, lsl #2]
cmp r1, r6
ble L5
subs r3, r3, #4
add r3, r3, r0
L6
subs r5, r5, #1
mov r9, r3
ldr r1, [r3], #-4
cmp r1, r6
bgt L6
L5
cmp r4, r5
bge L7
str r1, [lr]
str r7, [r9]
b L3
L7
mov r7, r2
mov r1, r10
mov r4, r0
ldr r3, [r0, r5, lsl #2]
str r3, [r0, r10, lsl #2]
str r6, [r0, r5, lsl #2]
subs r2, r5, #1
bl quicksort_linux
mov r2, r7
adds r1, r5, #1
mov r0, r4
bl quicksort_linux
pop {r4, r5, r6, r7, r8, r9, r10, pc}
END
It seems like the compiler doesn't push or pop anything before or after the branch, yet it works.
So, my questions are:
1. What is the problem with my function recursion code?
2. How can I fix it?
3. How can the gcc-compiled code work without any push and pop around the recursive calls?
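For comparison, here is a minimal sketch (illustrative register allocation, not taken from either listing above) of the usual AAPCS pattern: anything that must survive a call lives in a callee-saved register (r4-r11), which every conforming function, including the recursive call itself, is required to preserve, so the only stack traffic is one push of those registers plus lr at entry and one matching pop at exit.

quicksort_sketch
        CMP     r1, r2
        BXGE    lr                       ;// if (first >= last) there is nothing to do
        STMFD   sp!, {r4-r7, lr}         ;// single push of callee-saved registers + lr
        MOV     r4, r0                   ;// r4 = array  (callee-saved, survives any BL)
        MOV     r5, r1                   ;// r5 = first
        MOV     r6, r2                   ;// r6 = last
        ;// ... partition array[first..last] here, leaving the pivot's final index j in r7 ...
        MOV     r0, r4                   ;// quicksort(array, first, j-1)
        MOV     r1, r5
        SUB     r2, r7, #1
        BL      quicksort_sketch         ;// callee preserves r4-r7, so nothing is pushed here
        MOV     r0, r4                   ;// quicksort(array, j+1, last)
        ADD     r1, r7, #1
        MOV     r2, r6
        BL      quicksort_sketch
        LDMFD   sp!, {r4-r7, pc}         ;// single matching pop restores the registers and returns

This is the pattern visible in the gcc listing above: across its two bl quicksort_linux calls it keeps the array pointer in r4, first in r10, last in r7 and j in r5, all callee-saved registers, so the one push {r4, ..., r10, lr} at the top and the pop {..., pc} at the bottom are the only stack operations it needs.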

Related

gcc ARM produces incorrect code - how to correct

gcc ARM for STM32F407 micro
The following function is used as a sanity check in FreeRtosTCP:
UBaseType_t bIsValidNetworkDescriptor( const NetworkBufferDescriptor_t * pxDesc )
{
    uint32_t offset = ( uint32_t ) ( ((const char *)pxDesc) - ((const char *)xNetworkBuffers) );
    if( ( offset >= (uint32_t)(sizeof( xNetworkBuffers )) ) || ( ( offset % sizeof( xNetworkBuffers[0] ) ) != 0 ) )
        return pdFALSE;
    return (UBaseType_t) (pxDesc - xNetworkBuffers) + 1;
}
The line in question is ---> offset >= (uint32_t)(sizeof( xNetworkBuffers ))
gcc produces a bhi instruction after the cmp instead of a bhs.
I've tried casting both operands as shown in the code above, but nothing seems to get the bhs instruction to be used.
Any help appreciated.
Thanks.
Joe
Well, knowing the exact size of the xNetworkBuffers array, the compiler can simply optimize the comparison. Being curious, I gave it a try. Following is the code with small modifications, the asm output, and the explanation:
#include <stdint.h>

typedef struct abc {
    char data[10];
} NetworkBufferDescriptor_t;

NetworkBufferDescriptor_t xNetworkBuffers[5];

int bIsValidNetworkDescriptor( const NetworkBufferDescriptor_t * pxDesc )
{
    uint32_t offset = ( uint32_t ) ( ((const char *)pxDesc) - ((const char *)xNetworkBuffers) );
    if( ( offset >= (uint32_t)(sizeof( xNetworkBuffers )) ) || ( ( offset % sizeof( xNetworkBuffers[0] ) ) != 0 ) )
        return 0;
    return (int) (pxDesc - xNetworkBuffers) + 1;
}
and the asm output is:
bIsValidNetworkDescriptor:
# Function supports interworking.
# args = 0, pretend = 0, frame = 16
# frame_needed = 1, uses_anonymous_args = 0
# link register save eliminated.
str fp, [sp, #-4]!
add fp, sp, #0
sub sp, sp, #20
str r0, [fp, #-16]
ldr r3, [fp, #-16]
ldr r2, .L5
sub r3, r3, r2
str r3, [fp, #-8]
ldr r3, [fp, #-8]
cmp r3, #49
bhi .L2
ldr r1, [fp, #-8]
ldr r3, .L5+4
umull r2, r3, r1, r3
lsr r2, r3, #3
mov r3, r2
lsl r3, r3, #2
add r3, r3, r2
lsl r3, r3, #1
sub r2, r1, r3
cmp r2, #0
beq .L3
.L2:
mov r3, #0
b .L4
.L3:
ldr r3, [fp, #-16]
ldr r2, .L5
sub r3, r3, r2
asr r2, r3, #1
mov r3, r2
lsl r3, r3, #1
add r3, r3, r2
lsl r1, r3, #4
add r3, r3, r1
lsl r1, r3, #8
add r3, r3, r1
lsl r1, r3, #16
add r3, r3, r1
lsl r3, r3, #2
add r3, r3, r2
add r3, r3, #1
.L4:
mov r0, r3
add sp, fp, #0
# sp needed
ldr fp, [sp], #4
bx lr
.L6:
.align 2
.L5:
In the quoted asm code you can see that it compares against 49, not 50 (which is the actual size of xNetworkBuffers), so the conclusion I reached is that
offset >= (uint32_t)(sizeof( xNetworkBuffers ))
is equivalent to
offset > (uint32_t)(sizeof( xNetworkBuffers ) - 1)
and in that case the compiler can use BHI and produce the same result.
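To spell that out (a small illustration added here, not part of the original answer): offset is unsigned, and there is no unsigned value strictly between 49 and 50, so an "offset >= 50" test and an "offset > 49" test accept exactly the same offsets; cmp r3, #49 followed by bhi therefore implements the original condition correctly.

#include <stdint.h>
#include <assert.h>

/* Illustration only: the two predicates agree for every possible uint32_t. */
static int ge_50(uint32_t offset) { return offset >= 50u; }  /* offset >= sizeof(xNetworkBuffers)     */
static int gt_49(uint32_t offset) { return offset >  49u; }  /* offset >  sizeof(xNetworkBuffers) - 1 */

int main(void)
{
    uint32_t samples[] = { 0u, 48u, 49u, 50u, 51u, UINT32_MAX };
    for (unsigned i = 0; i < sizeof samples / sizeof samples[0]; i++)
        assert(ge_50(samples[i]) == gt_49(samples[i]));  /* never fires */
    return 0;
}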
I think the code generated by GCC is correct, technically speaking. offset cannot be larger than INT_MAX, because this is the maximum value representable in ptrdiff_t on this architecture.
You can compute the difference like this:
uintptr_t offset = (uintptr_t)pxDesc - (uintptr_t)xNetworkBuffers;
This is still implementation-defined, but it will avoid the overflow problem.

Why does _exit() jump to _etext?

I am running a project using the ARM Embedded Toolchain on an STM32 microcontroller, which uses newlib.
I called assert(false) to test the assert output and ended up in a Hard Fault exception. I debugged into the assembly of assert(...) and found out that a subsequent call to _exit(1) jumps to an address called _etext. Taking a look at the manpage of _etext shows that _etext is the address of the end of the .text section.
I am really confused. Normally I would have assumed that _exit() calls __exit() (which is declared as a global symbol by newlib), which I implemented in a file named syscalls.c.
Why does _exit() jump to _etext?
Here are some code snippets for better understanding:
The subsequent call to _exit() (via abort()), taken from newlib 2.5:
_VOID
_DEFUN_VOID (abort)
{
#ifdef ABORT_MESSAGE
  write (2, "Abort called\n", sizeof ("Abort called\n")-1);
#endif
  while (1)
    {
      raise (SIGABRT);
      _exit (1);
    }
}
The disassembly of abort and assert. Take a special look at address 0808a10a, where the jump to 80a5198 (_etext) is performed:
abort:
0808a100: push {r3, lr}
0808a102: movs r0, #6
0808a104: bl 0x808bfdc <raise>
0808a108: movs r0, #1
0808a10a: bl 0x80a51d8
0808a10e: nop
__assert_func:
0808a110: push {lr}
0808a112: ldr r4, [pc, #40] ; (0x808a13c <__assert_func+44>)
0808a114: ldr r6, [r4, #0]
0808a116: mov r5, r0
0808a118: sub sp, #20
0808a11a: mov r4, r3
0808a11c: ldr r0, [r6, #12]
0808a11e: cbz r2, 0x808a136 <__assert_func+38>
0808a120: ldr r3, [pc, #28] ; (0x808a140 <__assert_func+48>)
0808a122: str r2, [sp, #8]
0808a124: stmia.w sp, {r1, r3}
0808a128: mov r2, r4
0808a12a: mov r3, r5
0808a12c: ldr r1, [pc, #20] ; (0x808a144 <__assert_func+52>)
0808a12e: bl 0x808a5f4 <fiprintf>
0808a132: bl 0x808a100 <abort>
0808a136: ldr r3, [pc, #16] ; (0x808a148 <__assert_func+56>)
0808a138: mov r2, r3
0808a13a: b.n 0x808a122 <__assert_func+18>
0808a13c: str r0, [r3, #120] ; 0x78
0808a13e: movs r0, #0
0808a140: add r12, r11
0808a142: lsrs r2, r1, #32
0808a144: add r12, sp
0808a146: lsrs r2, r1, #32
0808a148: add r8, sp
0808a14a: lsrs r2, r1, #32
The lss-file which shows that 80a5198 is the address of _etext:
0808a0c0 <abort>:
808a0c0: b508 push {r3, lr}
808a0c2: 2006 movs r0, #6
808a0c4: f001 ff6a bl 808bf9c <raise>
808a0c8: 2001 movs r0, #1
808a0ca: f01b f865 bl 80a5198 <_etext>
808a0ce: bf00 nop
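Not an explanation of the Hard Fault itself, but for reference: as the abort() source above shows, newlib ends up calling _exit with a single leading underscore, and on a bare-metal target that symbol is normally supplied by the application's syscalls.c (or by a stub library such as libnosys). A minimal stub, sketched here only to show the expected name and signature, looks like this:

/* Sketch of a typical bare-metal _exit stub for newlib.
   Note the single leading underscore: this is the symbol the abort() code
   above branches to. */
void _exit(int status)
{
    (void)status;   /* there is no operating system to return the status to */
    for (;;)
    {
        /* spin forever; a breakpoint or reset could be triggered here instead */
    }
}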

Double division in sqrtf? [UPDATE]

I'm using floating point operations (software implementation) on an STM32F0 and found something weird in the listing. As soon as I use sqrtf, the linker adds __aeabi_ddiv, which is ~1.6 kB of memory.
This code for example links to ddiv:
float value = 42.0f;
float root = sqrtf(value);
Removing sqrtf also removes ddiv. So my questions are:
Is this intended behavior?
If not, how can I fix it?
Is it possible to do sqrt without double?
Compiler: arm-atollic-eabi-gcc
Listing of sqrtf (ddiv at 0x800543e):
080053bc <sqrtf>:
80053bc: b5f0 push {r4, r5, r6, r7, lr}
80053be: 2500 movs r5, #0
80053c0: b08d sub sp, #52 ; 0x34
80053c2: 1c04 adds r4, r0, #0
80053c4: f000 f84a bl 800545c <__ieee754_sqrtf>
80053c8: 4b22 ldr r3, [pc, #136] ; (8005454 <sqrtf+0x98>)
80053ca: 1c06 adds r6, r0, #0
80053cc: 575d ldrsb r5, [r3, r5]
80053ce: 1c6b adds r3, r5, #1
80053d0: d030 beq.n 8005434 <sqrtf+0x78>
80053d2: 1c21 adds r1, r4, #0
80053d4: 1c20 adds r0, r4, #0
80053d6: f7fb febb bl 8001150 <__aeabi_fcmpun>
80053da: 1e07 subs r7, r0, #0
80053dc: d12a bne.n 8005434 <sqrtf+0x78>
80053de: 2100 movs r1, #0
80053e0: 1c20 adds r0, r4, #0
80053e2: f7fb f837 bl 8000454 <__aeabi_fcmplt>
80053e6: 2800 cmp r0, #0
80053e8: d024 beq.n 8005434 <sqrtf+0x78>
80053ea: 2301 movs r3, #1
80053ec: 9302 str r3, [sp, #8]
80053ee: 4b1a ldr r3, [pc, #104] ; (8005458 <sqrtf+0x9c>)
80053f0: 1c20 adds r0, r4, #0
80053f2: 9303 str r3, [sp, #12]
80053f4: 970a str r7, [sp, #40] ; 0x28
80053f6: f7fc faad bl 8001954 <__aeabi_f2d>
80053fa: 2200 movs r2, #0
80053fc: 9006 str r0, [sp, #24]
80053fe: 9107 str r1, [sp, #28]
8005400: 9004 str r0, [sp, #16]
8005402: 9105 str r1, [sp, #20]
8005404: 2300 movs r3, #0
8005406: 2d00 cmp r5, #0
8005408: d117 bne.n 800543a <sqrtf+0x7e>
800540a: 9208 str r2, [sp, #32]
800540c: 9309 str r3, [sp, #36] ; 0x24
800540e: a802 add r0, sp, #8
8005410: f000 f87a bl 8005508 <matherr>
8005414: 2800 cmp r0, #0
8005416: d018 beq.n 800544a <sqrtf+0x8e>
8005418: 9b0a ldr r3, [sp, #40] ; 0x28
800541a: 9301 str r3, [sp, #4]
800541c: 2b00 cmp r3, #0
800541e: d004 beq.n 800542a <sqrtf+0x6e>
8005420: f000 f874 bl 800550c <__errno>
8005424: 9b0a ldr r3, [sp, #40] ; 0x28
8005426: 9301 str r3, [sp, #4]
8005428: 6003 str r3, [r0, #0]
800542a: 9808 ldr r0, [sp, #32]
800542c: 9909 ldr r1, [sp, #36] ; 0x24
800542e: f7fc fae3 bl 80019f8 <__aeabi_d2f>
8005432: 1c06 adds r6, r0, #0
8005434: 1c30 adds r0, r6, #0
8005436: b00d add sp, #52 ; 0x34
8005438: bdf0 pop {r4, r5, r6, r7, pc}
800543a: 0010 movs r0, r2
800543c: 0019 movs r1, r3
800543e: f7fb ff55 bl 80012ec <__aeabi_ddiv>
8005442: 9008 str r0, [sp, #32]
8005444: 9109 str r1, [sp, #36] ; 0x24
8005446: 2d02 cmp r5, #2
8005448: d1e1 bne.n 800540e <sqrtf+0x52>
800544a: f000 f85f bl 800550c <__errno>
800544e: 2321 movs r3, #33 ; 0x21
8005450: 6003 str r3, [r0, #0]
8005452: e7e1 b.n 8005418 <sqrtf+0x5c>
8005454: 2000000c .word 0x2000000c
8005458: 08006096 .word 0x08006096
UPDATE: I think I found the reason but still don't quite understand it.
Source of sqrtf
The double division is part of the exception handling, although 0.0/0.0 should be evaluated at compile time, right? If I call __ieee754_sqrtf directly, ddiv is not linked. This solves my problem, but I would like to know how to achieve this while still using sqrtf.
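For completeness, this is a sketch of what calling __ieee754_sqrtf directly looks like; the prototype is written out by hand because __ieee754_sqrtf is an internal newlib symbol rather than part of math.h, so this relies on an implementation detail and skips the errno/matherr handling that the sqrtf wrapper provides:

/* Sketch: bypass the sqrtf error-handling wrapper by calling newlib's
   internal kernel routine directly (implementation detail, hand-declared). */
extern float __ieee754_sqrtf(float x);

float root_of(float value)
{
    return __ieee754_sqrtf(value);   /* no double-precision helpers pulled in */
}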

ARM Headers to Get Proper Call Stacks

I am currently carrying out optimizations on Linux-based software running on an ARM processor. Those optimizations are mostly in the form of ARM and ARM NEON functions.
In order to profile the software I use perf record and flame graphs; however, once I introduce the assembler functions, they do not stack on top of the functions that call them, but rather appear in seemingly random places.
My question therefore is: what should I include in my functions for them to appear properly in the call stacks?
There was a slightly related topic, but no good answer was given: How to get call graph profiling working with gcc compiled code and ARM Cortex A8 target?. I use the same flags plus -mapcs-frame.
Below, I give an example of a C function translated to ARM by GCC. This ARM function seems to produce decent stacks, but I would like to understand why.
int half(int in);
int sum(int in1, int in2);
int mean(int in1, int in2);

int half(int i)
{
    return i / 2;
}

int sum(int i, int j)
{
    return i + j;
}

int mean(int i, int j)
{
    int s = sum(i, j);
    int m = half(s);
    return m;
}

int main()
{
    int a = 1;
    int b = 5;
    int i;
    int result;
    for (i = 0; i < 10000000; i++) {
        result = mean(a, b);
    }
    return 0;
}
.cpu cortex-a9
.eabi_attribute 27, 3
.eabi_attribute 28, 1
.fpu neon
.eabi_attribute 20, 1
.eabi_attribute 21, 1
.eabi_attribute 23, 3
.eabi_attribute 24, 1
.eabi_attribute 25, 1
.eabi_attribute 26, 2
.eabi_attribute 30, 6
.eabi_attribute 34, 1
.eabi_attribute 18, 4
.file "a.c"
.text
.align 2
.global half
.type half, %function
half:
# args = 0, pretend = 0, frame = 8
# frame_needed = 1, uses_anonymous_args = 0
mov ip, sp
stmfd sp!, {fp, ip, lr, pc}
sub fp, ip, #4
sub sp, sp, #8
str r0, [fp, #-16]
ldr r3, [fp, #-16]
mov r2, r3, lsr #31
add r3, r2, r3
mov r3, r3, asr #1
mov r0, r3
sub sp, fp, #12
ldmfd sp, {fp, sp, pc}
.size half, .-half
.align 2
.global sum
.type sum, %function
sum:
# args = 0, pretend = 0, frame = 8
# frame_needed = 1, uses_anonymous_args = 0
mov ip, sp
stmfd sp!, {fp, ip, lr, pc}
sub fp, ip, #4
sub sp, sp, #8
str r0, [fp, #-16]
str r1, [fp, #-20]
ldr r2, [fp, #-16]
ldr r3, [fp, #-20]
add r3, r2, r3
mov r0, r3
sub sp, fp, #12
ldmfd sp, {fp, sp, pc}
.size sum, .-sum
.align 2
.global mean
.type mean, %function
mean:
# args = 0, pretend = 0, frame = 16
# frame_needed = 1, uses_anonymous_args = 0
mov ip, sp
stmfd sp!, {fp, ip, lr, pc}
sub fp, ip, #4
sub sp, sp, #16
str r0, [fp, #-24]
str r1, [fp, #-28]
ldr r1, [fp, #-28]
ldr r0, [fp, #-24]
bl sum
str r0, [fp, #-16]
ldr r0, [fp, #-16]
bl half
str r0, [fp, #-20]
ldr r3, [fp, #-20]
mov r0, r3
sub sp, fp, #12
ldmfd sp, {fp, sp, pc}
.size mean, .-mean
.align 2
.global main
.type main, %function
main:
# args = 0, pretend = 0, frame = 16
# frame_needed = 1, uses_anonymous_args = 0
mov ip, sp
stmfd sp!, {fp, ip, lr, pc}
sub fp, ip, #4
sub sp, sp, #16
mov r3, #1
str r3, [fp, #-20]
mov r3, #5
str r3, [fp, #-24]
mov r3, #0
str r3, [fp, #-16]
b .L8
.L9:
ldr r1, [fp, #-24]
ldr r0, [fp, #-20]
bl mean
str r0, [fp, #-28]
ldr r3, [fp, #-16]
add r3, r3, #1
str r3, [fp, #-16]
.L8:
ldr r2, [fp, #-16]
movw r3, #38527
movt r3, 152
cmp r2, r3
ble .L9
mov r3, #0
mov r0, r3
sub sp, fp, #12
ldmfd sp, {fp, sp, pc}
.size main, .-main
.ident "GCC: (crosstool-NG linaro-1.13.1-4.9-2014.09 - Linaro GCC 4.9-2014.09) 4.9.2 20140904 (prerelease)"
.section .note.GNU-stack,"",%progbits
-------------------EDIT-------------------
Here is an example of the kind of function I am trying to integrate. In terms of linkage, all it does is save the used registers and the link register on the stack at the beginning and restore them at the end. What should I add to it?
.section .text
.global ARM_smoothing
ARM_smoothing:
STMFD sp!, {r4-r12,lr} //move used registers on stack (avoid segmentation fault)
MOV r5, r0
ADD r0, r0, r2
ADD r0, r0, r2
MOV r8, r0
ADD r8, r8, r2
ADD r8, r8, r2 //the 6 instructions create 3 pointers to the row above and below as well as the current one
ADD r1, r1, r2
ADD r1, r1, r2
ADD r1, r1, #2 //move destination pointer to first element (1 row down, 1 element left)
SUB r2, r2, #2
SUB r3, r3, #2 //counters decremented because smoothing function works with a margin of 1 on every side
LDR r9, =0x1C71C71D //(1/9)*2^32 to perform the division by 9
LDR r10, =0x2
LDR r11, =0xC //shifts for pointers to data
VLDR.U64 d20, =0x1C71C71D //(1/9)*2^32 to perform the division by 9
VLDR.U64 d22, =0x0 //initialization of zeros to be used (not necessarily needed)
VLDR.U64 d23, =0x0
VDUP.32 d20, d20[0] //initialize vector for multiplication
height_loop:
MOV r4, r2 //reset width counter
CMP r4, #8
BLGE width_loop_eight_smoothing //use neon while more than 8 elements in row need smoothing
CMP r4, #1
BLGE width_loop_rest //use normal ARM for remaining elements, can't do in NEON because of margin
ADD r0, r0, #4 //skip margin
ADD r1, r1, #4
ADD r5, r5, #4
ADD r8, r8, #4
SUBS r3, r3, #1 //decrement row counter
BNE height_loop //loop while there still are rows
LDMFD sp!, {r4-r12,pc} //restore stack and return to calling function
width_loop_eight_smoothing:
SUB r4, r4, #8 //decrement width counter
VLD1.16 {d0, d1}, [r5], r10 //load upper left elements
VLD1.16 {d2, d3}, [r5], r10 //load upper middle elements
VADDL.S16 q2, d0, d2 //long addition of elements to be sure to not lose any data
VADDL.S16 q3, d1, d3
VLD1.16 {d0, d1}, [r5], r11 //load upper right elements
VLD1.16 {d2, d3}, [r0], r10 //load middle left elements
VADDL.S16 q4, d0, d2
VADDL.S16 q5, d1, d3
VADD.S32 q2, q4 //add to grand total
VADD.S32 q3, q5
VLD1.16 {d0, d1}, [r0], r10 //load current elements
VLD1.16 {d2, d3}, [r0], r11 //load middle right elements
VADDL.S16 q4, d0, d2
VADDL.S16 q5, d1, d3
VADD.S32 q2, q4
VADD.S32 q3, q5
VLD1.16 {d0, d1}, [r8], r10 //load lower left elements
VLD1.16 {d2, d3}, [r8], r10 //load lower middle elements
VADDL.S16 q4, d0, d2
VADDL.S16 q5, d1, d3
VADD.S32 q2, q4
VADD.S32 q3, q5
VLD1.16 {d0, d1}, [r8], r11 //load lower right elements
VADDL.S16 q4, d0, d22
VADDL.S16 q5, d1, d23
VADD.S32 q2, q4
VADD.S32 q3, q5
VMULL.S32 q6, d4, d20 //divide by 9 (upper element is total divided by 9)
VMULL.S32 q7, d5, d20
VMULL.S32 q8, d6, d20
VMULL.S32 q9, d7, d20
VUZP.32 q6, q7 //pack results into less registers and smaller elements
VUZP.32 q8, q9
VUZP.16 q7, q9
VSHR.U16 q8, q7, #15 //when multiplied element is negative, result is always one under
VADD.S16 q7, q8 //rectifying by adding sign bit to total
VST1.16 {d14, d15}, [r1]! //store results
CMP r4, #8 //check if there are enough elements to do 8 more in NEON
BCS width_loop_eight_smoothing //if yes, loop neon code
MOV PC, LR //return to ARM_smoothing if not
width_loop_rest: //works similarly to NEON but one element at a time
LDRSH r6, [r0], #2 //converts loaded half words to signed full words
LDRSH r7, [r0] //main difference is with the way increments are done since there is an overlap
ADD r6, r7, r6
LDRSH r7, [r0, #2]
ADD r6, r7, r6
LDRSH r7, [r5], #2
ADD r6, r7, r6
LDRSH r7, [r5]
ADD r6, r7, r6
LDRSH r7, [r5, #2]
ADD r6, r7, r6
LDRSH r7, [r8], #2
ADD r6, r7, r6
LDRSH r7, [r8]
ADD r6, r7, r6
LDRSH r7, [r8, #2]
ADD r6, r7, r6
SMULLS r6, r7, r6, r9
ADDMI r7, #1
STRH r7, [r1], #2
SUBS r4, #1 //decrement width counter and check if there's any left
BNE width_loop_rest
MOV PC, LR
You can clearly see how the compiler is annotating the assembler with some pseudo-ops...
.global mean
.type mean, %function
...
.size mean, .-mean
These annotations end up in the object file's symbol table and need to make it into the build so that the call-graph tools can know what PC range belongs to your assembler function.
.global ARM_smoothing
+ .type ARM_smoothing, %function
...
+ .size ARM_smoothing, .-ARM_smoothing
Other pseudo-ops depend on the debug information needed.
.func
.endfunc
.size
ARM CFI question
.cantunwind
Others are .fnend, .fnstart, .movsp, .save, .setfp, etc.
It depends on the debug/object format expected by the tool. There are also two types of data:
code extent information
stack and frame use
Both are typically needed for unwinding (or a stack back trace), but a sampling performance tool might get away with only the first. Exception-handling code that does object clean-up requires the most information.
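As an illustration, here is a rough sketch (adapting the ARM_smoothing skeleton from the question, not output required by any particular tool) of a hand-written function carrying both the symbol-type annotations and the ARM EHABI unwind directives:

        .section .text
        .align  2
        .global ARM_smoothing
        .type   ARM_smoothing, %function          @ mark the symbol as a function
ARM_smoothing:
        .fnstart                                  @ open the unwind table entry
        STMFD   sp!, {r4-r12, lr}
        .save   {r4-r12, lr}                      @ describe what the prologue pushed
        @ ... body of the routine ...
        LDMFD   sp!, {r4-r12, pc}
        .fnend                                    @ close the unwind table entry
        .size   ARM_smoothing, .-ARM_smoothing    @ record the function's extent

.type and .size give the function a proper extent in the symbol table, which is what lets a sampling profiler attribute PC values to the right function; .fnstart, .save and .fnend additionally describe the frame for unwind-based back traces.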
Related: ARM Link and frame register

AFNetworking issues on iOS 8

When I try to call a Java/SOAP web service from iOS (Objective-C) using AFNetworking, after a number of calls I get this error: "EXC_BAD_ACCESS"
on 0x71d678: ldr r0, [r4, #0x68].
But I noticed that if I disable Wi-Fi on the iPhone (particularly an iPhone 6, or generally on iOS 8), the error disappears.
Here is the assembly code:
libdispatch.dylib`_dispatch_timers_run:
0x71d5f0: push {r4, r5, r6, r7, lr}
0x71d5f2: add r7, sp, #0xc
0x71d5f4: push.w {r8, r10, r11}
0x71d5f8: sub sp, #0x18
0x71d5fa: movs r6, #0x0
0x71d5fc: str r0, [sp]
0x71d5fe: movw r0, #0xa83e
0x71d602: movt r0, #0x1
0x71d606: add r0, pc
0x71d608: add.w r1, r0, r6, lsl #6
0x71d60c: ldr r0, [r1, #8]!
0x71d610: str r1, [sp, #0x14]
0x71d612: cmp r0, #0x0
0x71d614: beq.w 0x71d776 ; _dispatch_timers_run + 390
0x71d618: ubfx r4, r6, #0x2, #0x1
0x71d61c: ldr r5, [sp]
0x71d61e: cbz r5, 0x71d632 ; _dispatch_timers_run + 66
0x71d620: ldr.w r0, [r5, r4, lsl #3]
0x71d624: str r0, [sp, #0x10]
0x71d626: cbz r0, 0x71d632 ; _dispatch_timers_run + 66
0x71d628: add.w r0, r5, r4, lsl #3
0x71d62c: ldr r0, [r0, #0x4]
0x71d62e: str r0, [sp, #0xc]
0x71d630: b 0x71d664 ; _dispatch_timers_run + 116
0x71d632: cmp r4, #0x0
0x71d634: bne 0x71d63c ; _dispatch_timers_run + 76
0x71d636: bl 0x70df20 ; _dispatch_get_nanoseconds
0x71d63a: b 0x71d648 ; _dispatch_timers_run + 88
0x71d63c: cmp r4, #0x1
0x71d63e: str r0, [sp, #0x10]
0x71d640: str r0, [sp, #0xc]
0x71d642: bne 0x71d64c ; _dispatch_timers_run + 92
0x71d644: blx 0x72b380 ; symbol stub for: mach_absolute_time
0x71d648: str r0, [sp, #0x10]
0x71d64a: str r1, [sp, #0xc]
0x71d64c: cbz r5, 0x71d664 ; _dispatch_timers_run + 116
0x71d64e: ldr r0, [sp, #0x10]
0x71d650: ldr r1, [sp, #0xc]
0x71d652: str.w r0, [r5, r4, lsl #3]
0x71d656: add.w r0, r5, r4, lsl #3
0x71d65a: str r1, [r0, #0x4]
0x71d65c: b 0x71d664 ; _dispatch_timers_run + 116
0x71d65e: mov r0, r4
0x71d660: bl 0x71c5bc ; _dispatch_timers_update
0x71d664: ldr r0, [sp, #0x14]
0x71d666: ldr.w r10, [r0]
0x71d66a: cmp.w r10, #0x0
0x71d66e: beq.w 0x71d776 ; _dispatch_timers_run + 390
0x71d672: ldr.w r0, [r10, #8]
0x71d676: mvns r4, r0
0x71d678: ldr r0, [r4, #0x68]
0x71d67a: cmp r0, r6
0x71d67c: bne 0x71d65e ; _dispatch_timers_run + 110
0x71d67e: mov r8, r10
0x71d680: ldr r5, [r8, #24]!
0x71d684: ldr r2, [sp, #0x10]
0x71d686: ldr.w r11, [r8, #4]
0x71d68a: subs r0, r5, #0x1
0x71d68c: sbc r1, r11, #0x0
0x71d690: cmp r0, r2
0x71d692: mov.w r0, #0x0
0x71d696: it hs
0x71d698: movhs r0, #0x1
0x71d69a: ldr r2, [sp, #0xc]
0x71d69c: cmp r1, r2
0x71d69e: mov.w r1, #0x0
0x71d6a2: it hs
0x71d6a4: movhs r1, #0x1
0x71d6a6: it eq
0x71d6a8: moveq r1, r0
0x71d6aa: cmp r1, #0x0
0x71d6ac: bne 0x71d776 ; _dispatch_timers_run + 390
0x71d6ae: ldr r0, [r4, #0x1c]
0x71d6b0: cmp r0, #0x1
0x71d6b2: bhi 0x71d65e ; _dispatch_timers_run + 110
0x71d6b4: ldr r0, [r4, #0x70]
0x71d6b6: cmp r0, #0x0
0x71d6b8: bne 0x71d65e ; _dispatch_timers_run + 110
0x71d6ba: str r6, [sp, #0x4]
0x71d6bc: ldr.w r6, [r10, #48]
0x71d6c0: ldr.w r3, [r10, #52]
0x71d6c4: ldr r0, [sp, #0x10]
0x71d6c6: ldr r1, [sp, #0xc]
0x71d6c8: mov r2, r6
0x71d6ca: str r3, [sp, #0x8]
0x71d6cc: subs r0, r0, r5
0x71d6ce: sbc.w r1, r1, r11
0x71d6d2: blx 0x72b240 ; symbol stub for: __udivdi3
0x71d6d6: mov r9, r6
0x71d6d8: adds r6, r0, #0x1
0x71d6da: adc r12, r1, #0x0
0x71d6de: lsrs r1, r6, #0x1f
0x71d6e0: orr.w r1, r1, r12, lsl #1
0x71d6e4: movs r2, #0x0
0x71d6e6: mvn r3, #0x80000000
0x71d6ea: orr.w r1, r1, r12, lsr #31
0x71d6ee: cmp r1, #0x0
0x71d6f0: mov.w r1, #0x0
0x71d6f4: itt ne
0x71d6f6: movne.w r12, #0x0
0x71d6fa: mvnne r6, #0x80000000
0x71d6fe: cmp.w r9, #0xffffffff
0x71d702: it eq
0x71d704: moveq r1, #0x1
0x71d706: ldr r0, [sp, #0x8]
0x71d708: cmp r0, #0x0
0x71d70a: it lt
0x71d70c: movlt r2, #0x1
0x71d70e: cmp r0, r3
0x71d710: it eq
0x71d712: moveq r2, r1
0x71d714: mov r3, r0
0x71d716: cbnz r2, 0x71d73c ; _dispatch_timers_run + 332
0x71d718: umull r1, r2, r6, r9
0x71d71c: mla r2, r6, r3, r2
0x71d720: mla r0, r12, r9, r2
0x71d724: adds r2, r1, r5
0x71d726: adc.w r3, r0, r11
0x71d72a: strd r2, r3, [r8]
0x71d72e: ldrd r0, r1, [r10, #56]
0x71d732: adds r0, r0, r2
0x71d734: adcs r1, r3
0x71d736: strd r0, r1, [r10, #32]
0x71d73a: b 0x71d750 ; _dispatch_timers_run + 352
0x71d73c: mov.w r0, #0xffffffff
0x71d740: str.w r0, [r8]
0x71d744: str.w r0, [r8, #4]
0x71d748: str.w r0, [r8, #8]
0x71d74c: str.w r0, [r8, #12]
0x71d750: mov r0, r4
0x71d752: bl 0x71c5bc ; _dispatch_timers_update
0x71d756: ldr r0, [sp, #0x10]
0x71d758: ldr r1, [sp, #0xc]
0x71d75a: strd r0, r1, [r10, #40]
0x71d75e: ldrex r0, [r4, #0x70]
0x71d762: add r0, r6
0x71d764: strex r1, r0, [r4, #0x70]
0x71d768: cmp r1, #0x0
0x71d76a: bne 0x71d75e ; _dispatch_timers_run + 366
0x71d76c: mov r0, r4
0x71d76e: bl 0x715934 ; _dispatch_wakeup
0x71d772: ldr r6, [sp, #0x4]
0x71d774: b 0x71d664 ; _dispatch_timers_run + 116
0x71d776: adds r6, #0x1
0x71d778: cmp r6, #0x7
0x71d77a: bne.w 0x71d5fe ; _dispatch_timers_run + 14
0x71d77e: add sp, #0x18
0x71d780: pop.w {r8, r10, r11}
0x71d784: pop {r4, r5, r6, r7, pc}
0x71d786: nop
Can someone help me, please?
On an iPhone 4S it's OK (iOS 7).
Thank you in advance
Claudio
