The Go code is here:
package main
func add(a, b int) int { // add returns the sum of a and b.
sum := 0 // sum starts at zero ...
sum = a + b // ... and is then overwritten with a + b.
return sum
}
func main() { // main prints the result of add(1, 2) with the built-in println.
println(add(1, 2))
}
The Go version is
$ go version
go version go1.19.1 darwin/amd64
I use the following command to get assembly:
$ go tool compile -N -l -S main.go
main.add STEXT nosplit size=70 args=0x10 locals=0x18 funcid=0x0 align=0x0
0x0000 00000 (main.go:3) TEXT main.add(SB), NOSPLIT|ABIInternal, $24-16
0x0000 00000 (main.go:3) SUBQ $24, SP
0x0004 00004 (main.go:3) MOVQ BP, 16(SP)
0x0009 00009 (main.go:3) LEAQ 16(SP), BP
0x000e 00014 (main.go:3) FUNCDATA $0, gclocals·g2BeySu+wFnoycgXfElmcg==(SB)
0x000e 00014 (main.go:3) FUNCDATA $1, gclocals·g2BeySu+wFnoycgXfElmcg==(SB)
0x000e 00014 (main.go:3) FUNCDATA $5, main.add.arginfo1(SB)
0x000e 00014 (main.go:3) MOVQ AX, main.a+32(SP)
0x0013 00019 (main.go:3) MOVQ BX, main.b+40(SP)
0x0018 00024 (main.go:3) MOVQ $0, main.~r0(SP)
0x0020 00032 (main.go:4) MOVQ $0, main.sum+8(SP)
0x0029 00041 (main.go:5) MOVQ main.a+32(SP), AX
0x002e 00046 (main.go:5) ADDQ main.b+40(SP), AX
0x0033 00051 (main.go:5) MOVQ AX, main.sum+8(SP)
0x0038 00056 (main.go:6) MOVQ AX, main.~r0(SP)
0x003c 00060 (main.go:6) MOVQ 16(SP), BP
0x0041 00065 (main.go:6) ADDQ $24, SP
0x0045 00069 (main.go:6) RET
0x0000 48 83 ec 18 48 89 6c 24 10 48 8d 6c 24 10 48 89 H...H.l$.H.l$.H.
0x0010 44 24 20 48 89 5c 24 28 48 c7 04 24 00 00 00 00 D$ H.\$(H..$....
0x0020 48 c7 44 24 08 00 00 00 00 48 8b 44 24 20 48 03 H.D$.....H.D$ H.
0x0030 44 24 28 48 89 44 24 08 48 89 04 24 48 8b 6c 24 D$(H.D$.H..$H.l$
0x0040 10 48 83 c4 18 c3 .H....
main.main STEXT size=86 args=0x0 locals=0x20 funcid=0x0 align=0x0
0x0000 00000 (main.go:8) TEXT main.main(SB), ABIInternal, $32-0
0x0000 00000 (main.go:8) CMPQ SP, 16(R14)
0x0004 00004 (main.go:8) PCDATA $0, $-2
0x0004 00004 (main.go:8) JLS 79
0x0006 00006 (main.go:8) PCDATA $0, $-1
0x0006 00006 (main.go:8) SUBQ $32, SP
0x000a 00010 (main.go:8) MOVQ BP, 24(SP)
0x000f 00015 (main.go:8) LEAQ 24(SP), BP
0x0014 00020 (main.go:8) FUNCDATA $0, gclocals·g2BeySu+wFnoycgXfElmcg==(SB)
0x0014 00020 (main.go:8) FUNCDATA $1, gclocals·g2BeySu+wFnoycgXfElmcg==(SB)
0x0014 00020 (main.go:9) MOVL $1, AX
0x0019 00025 (main.go:9) MOVL $2, BX
0x001e 00030 (main.go:9) PCDATA $1, $0
0x001e 00030 (main.go:9) NOP
0x0020 00032 (main.go:9) CALL main.add(SB)
0x0025 00037 (main.go:9) MOVQ AX, main..autotmp_0+16(SP)
0x002a 00042 (main.go:9) CALL runtime.printlock(SB)
0x002f 00047 (main.go:9) MOVQ main..autotmp_0+16(SP), AX
0x0034 00052 (main.go:9) CALL runtime.printint(SB)
0x0039 00057 (main.go:9) CALL runtime.printnl(SB)
0x003e 00062 (main.go:9) NOP
0x0040 00064 (main.go:9) CALL runtime.printunlock(SB)
0x0045 00069 (main.go:10) MOVQ 24(SP), BP
0x004a 00074 (main.go:10) ADDQ $32, SP
0x004e 00078 (main.go:10) RET
0x004f 00079 (main.go:10) NOP
0x004f 00079 (main.go:8) PCDATA $1, $-1
0x004f 00079 (main.go:8) PCDATA $0, $-2
0x004f 00079 (main.go:8) CALL runtime.morestack_noctxt(SB)
0x0054 00084 (main.go:8) PCDATA $0, $-1
0x0054 00084 (main.go:8) JMP 0
0x0000 49 3b 66 10 76 49 48 83 ec 20 48 89 6c 24 18 48 I;f.vIH.. H.l$.H
0x0010 8d 6c 24 18 b8 01 00 00 00 bb 02 00 00 00 66 90 .l$...........f.
0x0020 e8 00 00 00 00 48 89 44 24 10 e8 00 00 00 00 48 .....H.D$......H
0x0030 8b 44 24 10 e8 00 00 00 00 e8 00 00 00 00 66 90 .D$...........f.
0x0040 e8 00 00 00 00 48 8b 6c 24 18 48 83 c4 20 c3 e8 .....H.l$.H.. ..
0x0050 00 00 00 00 eb aa ......
rel 33+4 t=7 main.add+0
rel 43+4 t=7 runtime.printlock+0
rel 53+4 t=7 runtime.printint+0
rel 58+4 t=7 runtime.printnl+0
rel 65+4 t=7 runtime.printunlock+0
rel 80+4 t=7 runtime.morestack_noctxt+0
go.cuinfo.producer.<unlinkable> SDWARFCUINFO dupok size=0
0x0000 2d 4e 20 2d 6c 20 72 65 67 61 62 69 -N -l regabi
go.cuinfo.packagename.main SDWARFCUINFO dupok size=0
0x0000 6d 61 69 6e main
main..inittask SNOPTRDATA size=24
0x0000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
0x0010 00 00 00 00 00 00 00 00 ........
gclocals·g2BeySu+wFnoycgXfElmcg== SRODATA dupok size=8
0x0000 01 00 00 00 00 00 00 00 ........
main.add.arginfo1 SRODATA static dupok size=5
0x0000 00 08 08 08 ff .....
In my understanding, while the function main is calling the function add, the stack looks like this (offsets are relative to SP, the top of add's frame):
0~8: ~r0, the return value of add
8~16: sum, the local variable sum
16~24: the saved BP, restored when returning to the caller main
32~40: a, the parameter a of add
40~48: b, the parameter b of add
But what is stored at 24~32? I cannot work it out from reading the assembly.
This is a rather complex problem, and I'm unsure of its cause.
So bear with me, because my written English is rusty and my technical terminology is not so good either.
Dev: Atmega1284P
Compiler: AVR-GCC 11.1.0 from Zak's Electronics Blog
All the libraries that I'm using are written by me. They all work as expected, and that includes my UART library and its "sister", the UART debugger.
Here's a part of it to better understand the other code segments:
// Transmit a zero-terminated string over USART0.
// Every byte is fetched through pgm_read_byte(), so `text` has to be an
// address that pgm_read_byte() can read; transmission stops at the first
// zero byte, which is not sent.
void USART0_TX_Flash_String(const unsigned char* text)
{
    for (;;)
    {
        unsigned char ch = pgm_read_byte(text);
        if (ch == 0)
        {
            break;
        }
        USART0_TXD(ch);
        text++;
    }
}
// Debugger alias: sends a string through USART0_TX_Flash_String().
#define DEBUGGER_UART_TXFS(string) USART0_TX_Flash_String(string)
// Print "<string><tab><value as text><line break>" on the debugger UART.
//   string - label that is sent first
//   value  - number to be converted and printed
//   mode   - handed unchanged to Convert_Value() (Print_Value_DEC passes 10)
void Print_Value_General(unsigned char* string, unsigned int value, unsigned char mode)
{
    // Zero-filled scratch buffer that receives the converted text.
    unsigned char converted[8] = {0};

    // Label and separator.
    DEBUGGER_UART_TX_String(string);
    DEBUGGER_UART_TXD(tab_txt);

    // Number -> text (basically itoa()), then send it followed by a line break.
    Convert_Value(value, mode, converted);
    DEBUGGER_UART_TX_String(converted);
    DEBUGGER_UART_TX_String(enter_txt);
}
// Convenience wrapper: calls Print_Value_General() with mode fixed to 10.
#define Print_Value_DEC(string, value) Print_Value_General(string, value, 10)
This next segment is just a fraction of the ESP8266 library, namely the variables:
// ESP8266 Messages
const unsigned char flash_str_msg_report_wait_for_data[] PROGMEM = ">";
// Basic AT Commands
const unsigned char flash_str_at[] PROGMEM = "AT";
// Software Access Point Settings
// TCP/IP AT Commands
const unsigned char flash_str_cipclose[] PROGMEM = "CIPCLOSE";
const unsigned char flash_str_cifsr[] PROGMEM = "CIFSR";
const unsigned char flash_str_cipdns[] PROGMEM = "CIPDNS";
// One entry of an ESP8266 command table: a command string paired with the
// function registered for it.
typedef struct td_esp8266_cmd_ptr
{
// Pointer to the command text (the tables in this file store the addresses
// of the flash_str_* PROGMEM strings here).
unsigned char* cmd_ptr;
// Function associated with the command; it receives an unsigned char array.
void(*fp)(unsigned char array[]);
}td_esp8266_cmd_ptr;
// Placeholder handler used by the command tables: it ignores its argument
// and only reports on the debugger UART that it has been called.
void UnDefinedEmptyFoo(unsigned char string[])
{
    (void)string; // argument is not used by this placeholder
    DEBUGGER_UART_TX_String("void UnDefinedEmptyFoo(void) is Callled\r\n");
}
// Command table without PROGMEM (per the text below, this copy lives in RAM).
// Each entry pairs the address of a flash string with its handler.
td_esp8266_cmd_ptr esp8266_cmd_ptr[6] = {
    { flash_str_msg_report_wait_for_data, UnDefinedEmptyFoo }, // ">"
    { flash_str_at,                       UnDefinedEmptyFoo }, // "AT"
    { flash_str_msg_report_busy,          UnDefinedEmptyFoo }, // "busy p..."
    { flash_str_cifsr,                    UnDefinedEmptyFoo }, // "CIFSR"
    { flash_str_cipclose,                 UnDefinedEmptyFoo }, // "CIPCLOSE"
    { flash_str_cipdns,                   UnDefinedEmptyFoo }  // "CIPDNS"
};
const td_esp8266_cmd_ptr esp8266_cmd_flash_ptr[6] PROGMEM = {
{
flash_str_msg_report_wait_for_data, // [] PROGMEM = ">";
UnDefinedEmptyFoo,
},
{
flash_str_at, // [] PROGMEM = "AT";
UnDefinedEmptyFoo,
},
{
flash_str_msg_report_busy, // [] PROGMEM = "busy p...";
UnDefinedEmptyFoo,
},
{
flash_str_cifsr, // [] PROGMEM = "CIFSR";
UnDefinedEmptyFoo,
},
{
flash_str_cipclose, // [] PROGMEM = "CIPCLOSE";
UnDefinedEmptyFoo,
},
{
flash_str_cipdns, // [] PROGMEM = "CIPDNS";
UnDefinedEmptyFoo,
}
};
Here you can see two almost identical arrays of structures (or is it structures of arrays?).
To be clear, I'm referring to:
td_esp8266_cmd_ptr esp8266_cmd_ptr[6] = {};
const td_esp8266_cmd_ptr esp8266_cmd_flash_ptr[6] PROGMEM = {};
The first one is stored in RAM, and the second one in Flash.
The problem is with the second one.
This is the example code, to see where the problem starts:
// Test routine: prints the command strings of both tables over the debugger
// UART, reaching esp8266_cmd_flash_ptr (the PROGMEM table) in several ways:
// with constant indices, with a manually incremented index, and from
// for/while loops.
void MXA_Test(void)
{
DEBUGGER_UART_TX_String("void MXA_Test(void) Start");
DEBUGGER_UART_TX_String(enter_txt);
DEBUGGER_UART_TX_String("For Loop Start for(u8b i = 0; i < 6; i++)");
DEBUGGER_UART_TX_String(enter_txt);
// Reference case: the table without PROGMEM. The cmd_ptr member is read
// normally and only the string it points to is fetched with pgm_read_byte()
// inside DEBUGGER_UART_TXFS.
for(u8b i = 0; i < 6; i++)
{
Print_Value_DEC("esp8266_cmd_ptr[i].cmd_ptr: ", i);
DEBUGGER_UART_TXFS(esp8266_cmd_ptr[i].cmd_ptr);
DEBUGGER_UART_TX_String(enter_txt);
}
DEBUGGER_UART_TX_String("For Loop End");
DEBUGGER_UART_TX_String(enter_txt);
// Label fragments printed before and after each index below.
u8b var_name_str_start[] = {"\r\nesp8266_cmd_flash_ptr["};
u8b var_name_str_end[] = {"]: "};
DEBUGGER_UART_TX_String("Usig esp8266_cmd_flash_ptr (stored in FLASH):");
DEBUGGER_UART_TX_String(enter_txt);
DEBUGGER_UART_TX_String("Geting Var directly: esp8266_cmd_flash_ptr[0].cmd_ptr");
DEBUGGER_UART_TX_String(enter_txt);
// Constant indices 0..5 (0x30..0x35 are the ASCII digits '0'..'5').
// NOTE(review): in the accompanying .lss listing the pointer argument of
// these calls is loaded as an immediate (ldi), i.e. the element was resolved
// at compile time and no run-time read of the table takes place.
DEBUGGER_UART_TX_String(var_name_str_start);
DEBUGGER_UART_TXD(0x30);
DEBUGGER_UART_TX_String(var_name_str_end);
DEBUGGER_UART_TXFS(esp8266_cmd_flash_ptr[0].cmd_ptr);
DEBUGGER_UART_TX_String(var_name_str_start);
DEBUGGER_UART_TXD(0x31);
DEBUGGER_UART_TX_String(var_name_str_end);
DEBUGGER_UART_TXFS(esp8266_cmd_flash_ptr[1].cmd_ptr);
DEBUGGER_UART_TX_String(var_name_str_start);
DEBUGGER_UART_TXD(0x32);
DEBUGGER_UART_TX_String(var_name_str_end);
DEBUGGER_UART_TXFS(esp8266_cmd_flash_ptr[2].cmd_ptr);
DEBUGGER_UART_TX_String(var_name_str_start);
DEBUGGER_UART_TXD(0x33);
DEBUGGER_UART_TX_String(var_name_str_end);
DEBUGGER_UART_TXFS(esp8266_cmd_flash_ptr[3].cmd_ptr);
DEBUGGER_UART_TX_String(var_name_str_start);
DEBUGGER_UART_TXD(0x34);
DEBUGGER_UART_TX_String(var_name_str_end);
DEBUGGER_UART_TXFS(esp8266_cmd_flash_ptr[4].cmd_ptr);
DEBUGGER_UART_TX_String(var_name_str_start);
DEBUGGER_UART_TXD(0x35);
DEBUGGER_UART_TX_String(var_name_str_end);
DEBUGGER_UART_TXFS(esp8266_cmd_flash_ptr[5].cmd_ptr);
DEBUGGER_UART_TX_String(enter_txt);
DEBUGGER_UART_TX_String("Geting Var INdirectly: esp8266_cmd_flash_ptr[i].cmd_ptr");
DEBUGGER_UART_TX_String(enter_txt);
// Same accesses through a variable index that is incremented by hand.
// NOTE(review): the value of i is known at every step, and the listing again
// shows an immediate pointer (ldi) for the call that is visible there.
u8b i = 0;
DEBUGGER_UART_TX_String(var_name_str_start);
DEBUGGER_UART_TXD(i+0x30);
DEBUGGER_UART_TX_String(var_name_str_end);
DEBUGGER_UART_TXFS(esp8266_cmd_flash_ptr[i].cmd_ptr);
i++;
DEBUGGER_UART_TX_String(var_name_str_start);
DEBUGGER_UART_TXD(i+0x30);
DEBUGGER_UART_TX_String(var_name_str_end);
DEBUGGER_UART_TXFS(esp8266_cmd_flash_ptr[i].cmd_ptr);
i++;
DEBUGGER_UART_TX_String(var_name_str_start);
DEBUGGER_UART_TXD(i+0x30);
DEBUGGER_UART_TX_String(var_name_str_end);
DEBUGGER_UART_TXFS(esp8266_cmd_flash_ptr[i].cmd_ptr);
i++;
DEBUGGER_UART_TX_String(var_name_str_start);
DEBUGGER_UART_TXD(i+0x30);
DEBUGGER_UART_TX_String(var_name_str_end);
DEBUGGER_UART_TXFS(esp8266_cmd_flash_ptr[i].cmd_ptr);
i++;
DEBUGGER_UART_TX_String(var_name_str_start);
DEBUGGER_UART_TXD(i+0x30);
DEBUGGER_UART_TX_String(var_name_str_end);
DEBUGGER_UART_TXFS(esp8266_cmd_flash_ptr[i].cmd_ptr);
i++;
DEBUGGER_UART_TX_String(var_name_str_start);
DEBUGGER_UART_TXD(i+0x30);
DEBUGGER_UART_TX_String(var_name_str_end);
DEBUGGER_UART_TXFS(esp8266_cmd_flash_ptr[i].cmd_ptr);
i++;
DEBUGGER_UART_TX_String(enter_txt);
DEBUGGER_UART_TX_String("For Loop Start for(u8b i = 0; i < 3; i++)");
DEBUGGER_UART_TX_String(enter_txt);
// Loop access to the PROGMEM table. This is the failing case: cmd_ptr is
// read with a plain member access, and the listing shows ordinary data-space
// loads (ld/ldd) for it, although the table itself is stored in flash. The
// working variant shown later in the post reads the pointer with
// pgm_read_word(&(esp8266_cmd_flash_ptr[i].cmd_ptr)) first.
for(i = 0; i < 3; i++)
{
Print_Value_DEC("esp8266_cmd_flash_ptr[i].cmd_ptr: ", i);
DEBUGGER_UART_TXFS(esp8266_cmd_flash_ptr[i].cmd_ptr);
DEBUGGER_UART_TX_String(enter_txt);
}
DEBUGGER_UART_TX_String("For Loop End");
DEBUGGER_UART_TX_String(enter_txt);
DEBUGGER_UART_TX_String("For Loop Start for(u8b i = 0; i < 6; i++)");
DEBUGGER_UART_TX_String(enter_txt);
// Same failing access pattern over all six entries.
for(i = 0; i < 6; i++)
{
Print_Value_DEC("esp8266_cmd_flash_ptr[i].cmd_ptr: ", i);
DEBUGGER_UART_TXFS(esp8266_cmd_flash_ptr[i].cmd_ptr);
DEBUGGER_UART_TX_String(enter_txt);
}
DEBUGGER_UART_TX_String("For Loop End");
DEBUGGER_UART_TX_String(enter_txt);
// NOTE(review): no do-while loop is present in this function, yet the next
// message announces the end of one.
DEBUGGER_UART_TX_String("do while Loop End");
DEBUGGER_UART_TX_String(enter_txt);
i = 0;
DEBUGGER_UART_TX_String("Loop Start while(i < 3)");
DEBUGGER_UART_TX_String(enter_txt);
// Same failing access pattern, written as a while loop.
while(i < 3)
{
Print_Value_DEC("esp8266_cmd_flash_ptr[i].cmd_ptr: ", i);
DEBUGGER_UART_TXFS(esp8266_cmd_flash_ptr[i].cmd_ptr);
DEBUGGER_UART_TX_String(enter_txt);
i++;
}
DEBUGGER_UART_TX_String("while Loop End");
DEBUGGER_UART_TX_String(enter_txt);
DEBUGGER_UART_TX_String("void MXA_Test(void) End");
DEBUGGER_UART_TX_String(enter_txt);
}
This is its output via UART:
void MXA_Test(void) Start
For Loop Start for(u8b i = 0; i < 6; i++)
esp8266_cmd_ptr[i].cmd_ptr: 00
>
esp8266_cmd_ptr[i].cmd_ptr: 01
AT
esp8266_cmd_ptr[i].cmd_ptr: 02
busy p...
esp8266_cmd_ptr[i].cmd_ptr: 03
CIFSR
esp8266_cmd_ptr[i].cmd_ptr: 04
CIPCLOSE
esp8266_cmd_ptr[i].cmd_ptr: 05
CIPDNS
For Loop End
Usig esp8266_cmd_flash_ptr (stored in FLASH):
Geting Var directly: esp8266_cmd_flash_ptr[0].cmd_ptr
esp8266_cmd_flash_ptr[0]: >
esp8266_cmd_flash_ptr[1]: AT
esp8266_cmd_flash_ptr[2]: busy p...
esp8266_cmd_flash_ptr[3]: CIFSR
esp8266_cmd_flash_ptr[4]: CIPCLOSE
esp8266_cmd_flash_ptr[5]: CIPDNS
Geting Var INdirectly: esp8266_cmd_flash_ptr[i].cmd_ptr
esp8266_cmd_flash_ptr[0]: >
esp8266_cmd_flash_ptr[1]: AT
esp8266_cmd_flash_ptr[2]: busy p...
esp8266_cmd_flash_ptr[3]: CIFSR
esp8266_cmd_flash_ptr[4]: CIPCLOSE
esp8266_cmd_flash_ptr[5]: CIPDNS
For Loop Start for(u8b i = 0; i < 3; i++)
esp8266_cmd_flash_ptr[i].cmd_ptr: 00
K?\?m?~???????&? ?1?B?S?d?u??????^???
???
...
esp8266_cmd_flash_ptr[i].cmd_ptr: 01
??
...
esp8266_cmd_flash_ptr[i].cmd_ptr: 02
??
...
For Loop End
For Loop Start for(u8b i = 0; i < 6; i++)
esp8266_cmd_flash_ptr[i].cmd_ptr: 00
K?\?m?~???????&? ?1?B?S?d?u??????^???
???
...
esp8266_cmd_flash_ptr[i].cmd_ptr: 01
??
...
esp8266_cmd_flash_ptr[i].cmd_ptr: 02
??
...
esp8266_cmd_flash_ptr[i].cmd_ptr: 03
??
...
esp8266_cmd_flash_ptr[i].cmd_ptr: 04
K?\?m?~???????&? ?1?B?S?d?u??????^???
???
...
esp8266_cmd_flash_ptr[i].cmd_ptr: 05
K?\?m?~???????&? ?1?B?S?d?u??????^???
...
For Loop End
Loop Start while(i < 3)
esp8266_cmd_flash_ptr[i].cmd_ptr: 00
K?\?m?~???????&? ?1?B?S?d?u??????^???
???
...
esp8266_cmd_flash_ptr[i].cmd_ptr: 01
??
...
\
esp8266_cmd_flash_ptr[i].cmd_ptr: 02
??
...
\
while Loop End
void MXA_Test(void) End
I've made it much shorter, so wherever you see ... it stands for a ton of junk data.
So the problem only occurs when I'm trying to get a string through a pointer that is stored in flash, like so:
for(i = 0; i < 6; i++)
{
Print_Value_DEC("esp8266_cmd_flash_ptr[i].cmd_ptr: ", i);
DEBUGGER_UART_TXFS(esp8266_cmd_flash_ptr[i].cmd_ptr);
DEBUGGER_UART_TX_String(enter_txt);
}
But all other approaches work fine.
Here's my last and biggest code dump of the day, the .lss file:
0000a752 <MXA_Test>:
void MXA_Test(void)
{
a752: cf 92 push r12
a754: df 92 push r13
a756: ef 92 push r14
a758: ff 92 push r15
a75a: 0f 93 push r16
a75c: 1f 93 push r17
a75e: cf 93 push r28
a760: df 93 push r29
a762: cd b7 in r28, 0x3d ; 61
a764: de b7 in r29, 0x3e ; 62
a766: 6d 97 sbiw r28, 0x1d ; 29
a768: 0f b6 in r0, 0x3f ; 63
a76a: f8 94 cli
a76c: de bf out 0x3e, r29 ; 62
a76e: 0f be out 0x3f, r0 ; 63
a770: cd bf out 0x3d, r28 ; 61
DEBUGGER_UART_TX_String("void MXA_Test(void) Start");
a772: 81 e6 ldi r24, 0x61 ; 97
a774: 95 e0 ldi r25, 0x05 ; 5
a776: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
DEBUGGER_UART_TX_String(enter_txt);
a77a: 8f e9 ldi r24, 0x9F ; 159
a77c: 94 e1 ldi r25, 0x14 ; 20
a77e: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
DEBUGGER_UART_TX_String("For Loop Start for(u8b i = 0; i < 6; i++)");
a782: 8b e7 ldi r24, 0x7B ; 123
a784: 95 e0 ldi r25, 0x05 ; 5
a786: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
DEBUGGER_UART_TX_String(enter_txt);
a78a: 8f e9 ldi r24, 0x9F ; 159
a78c: 94 e1 ldi r25, 0x14 ; 20
a78e: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
for(u8b i = 0; i < 6; i++)
a792: 30 e0 ldi r19, 0x00 ; 0
a794: e3 2e mov r14, r19
a796: 31 e0 ldi r19, 0x01 ; 1
a798: f3 2e mov r15, r19
DEBUGGER_UART_TX_String(enter_txt);
a79a: 10 e0 ldi r17, 0x00 ; 0
a79c: 00 e0 ldi r16, 0x00 ; 0
{
Print_Value_DEC("esp8266_cmd_ptr[i].cmd_ptr: ", i);
a79e: 4a e0 ldi r20, 0x0A ; 10
a7a0: b8 01 movw r22, r16
a7a2: 86 eb ldi r24, 0xB6 ; 182
a7a4: 95 e0 ldi r25, 0x05 ; 5
a7a6: 0e 94 64 14 call 0x28c8 ; 0x28c8 <Print_Value_General>
DEBUGGER_UART_TXFS(esp8266_cmd_ptr[i].cmd_ptr);
a7aa: f7 01 movw r30, r14
a7ac: 80 81 ld r24, Z
a7ae: 91 81 ldd r25, Z+1 ; 0x01
a7b0: 0e 94 78 10 call 0x20f0 ; 0x20f0 <USART0_TX_Flash_String>
DEBUGGER_UART_TX_String(enter_txt);
a7b4: 8f e9 ldi r24, 0x9F ; 159
a7b6: 94 e1 ldi r25, 0x14 ; 20
a7b8: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
for(u8b i = 0; i < 6; i++)
a7bc: 0f 5f subi r16, 0xFF ; 255
a7be: 1f 4f sbci r17, 0xFF ; 255
a7c0: f4 e0 ldi r31, 0x04 ; 4
a7c2: ef 0e add r14, r31
a7c4: f1 1c adc r15, r1
a7c6: 0a 32 cpi r16, 0x06 ; 6
a7c8: 11 05 cpc r17, r1
a7ca: 49 f7 brne .-46 ; 0xa79e <MXA_Test+0x4c>
}
DEBUGGER_UART_TX_String("For Loop End");
a7cc: 83 ed ldi r24, 0xD3 ; 211
a7ce: 95 e0 ldi r25, 0x05 ; 5
a7d0: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
DEBUGGER_UART_TX_String(enter_txt);
a7d4: 8f e9 ldi r24, 0x9F ; 159
a7d6: 94 e1 ldi r25, 0x14 ; 20
a7d8: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
u8b var_name_str_start[] = {"\r\nesp8266_cmd_flash_ptr["};
a7dc: 89 e1 ldi r24, 0x19 ; 25
a7de: e5 e5 ldi r30, 0x55 ; 85
a7e0: f7 e0 ldi r31, 0x07 ; 7
a7e2: de 01 movw r26, r28
a7e4: 11 96 adiw r26, 0x01 ; 1
a7e6: 01 90 ld r0, Z+
a7e8: 0d 92 st X+, r0
a7ea: 8a 95 dec r24
a7ec: e1 f7 brne .-8 ; 0xa7e6 <MXA_Test+0x94>
u8b var_name_str_end[] = {"]: "};
a7ee: 80 91 6e 07 lds r24, 0x076E ; 0x80076e <esp8266_cmd_ptr+0x66e>
a7f2: 90 91 6f 07 lds r25, 0x076F ; 0x80076f <esp8266_cmd_ptr+0x66f>
a7f6: a0 91 70 07 lds r26, 0x0770 ; 0x800770 <esp8266_cmd_ptr+0x670>
a7fa: b0 91 71 07 lds r27, 0x0771 ; 0x800771 <esp8266_cmd_ptr+0x671>
a7fe: 8a 8f std Y+26, r24 ; 0x1a
a800: 9b 8f std Y+27, r25 ; 0x1b
a802: ac 8f std Y+28, r26 ; 0x1c
a804: bd 8f std Y+29, r27 ; 0x1d
DEBUGGER_UART_TX_String("Usig esp8266_cmd_flash_ptr (stored in FLASH):");
a806: 80 ee ldi r24, 0xE0 ; 224
a808: 95 e0 ldi r25, 0x05 ; 5
a80a: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
DEBUGGER_UART_TX_String(enter_txt);
a80e: 8f e9 ldi r24, 0x9F ; 159
a810: 94 e1 ldi r25, 0x14 ; 20
a812: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
DEBUGGER_UART_TX_String("Geting Var directly: esp8266_cmd_flash_ptr[0].cmd_ptr");
a816: 8e e0 ldi r24, 0x0E ; 14
a818: 96 e0 ldi r25, 0x06 ; 6
a81a: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
DEBUGGER_UART_TX_String(enter_txt);
a81e: 8f e9 ldi r24, 0x9F ; 159
a820: 94 e1 ldi r25, 0x14 ; 20
a822: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
DEBUGGER_UART_TX_String(var_name_str_start);
a880: ce 01 movw r24, r28
a882: 01 96 adiw r24, 0x01 ; 1
a884: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
DEBUGGER_UART_TXD(0x33);
a888: 83 e3 ldi r24, 0x33 ; 51
a88a: 0e 94 2a 0f call 0x1e54 ; 0x1e54 <USART0_TXD>
DEBUGGER_UART_TX_String(var_name_str_end);
a88e: ce 01 movw r24, r28
a890: 4a 96 adiw r24, 0x1a ; 26
a892: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
DEBUGGER_UART_TXFS(esp8266_cmd_flash_ptr[3].cmd_ptr);
a896: 81 e6 ldi r24, 0x61 ; 97
a898: 91 e0 ldi r25, 0x01 ; 1
a89a: 0e 94 78 10 call 0x20f0 ; 0x20f0 <USART0_TX_Flash_String>
DEBUGGER_UART_TX_String(enter_txt);
a8da: 8f e9 ldi r24, 0x9F ; 159
a8dc: 94 e1 ldi r25, 0x14 ; 20
a8de: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
DEBUGGER_UART_TX_String("Geting Var INdirectly: esp8266_cmd_flash_ptr[i].cmd_ptr");
a8e2: 84 e4 ldi r24, 0x44 ; 68
a8e4: 96 e0 ldi r25, 0x06 ; 6
a8e6: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
DEBUGGER_UART_TX_String(enter_txt);
a8ea: 8f e9 ldi r24, 0x9F ; 159
a8ec: 94 e1 ldi r25, 0x14 ; 20
a8ee: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
u8b i = 0;
DEBUGGER_UART_TX_String(var_name_str_start);
a8f2: ce 01 movw r24, r28
a8f4: 01 96 adiw r24, 0x01 ; 1
a8f6: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
DEBUGGER_UART_TXD(i+0x30);
a8fa: 80 e3 ldi r24, 0x30 ; 48
a8fc: 0e 94 2a 0f call 0x1e54 ; 0x1e54 <USART0_TXD>
DEBUGGER_UART_TX_String(var_name_str_end);
a900: ce 01 movw r24, r28
a902: 4a 96 adiw r24, 0x1a ; 26
a904: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
DEBUGGER_UART_TXFS(esp8266_cmd_flash_ptr[i].cmd_ptr);
a908: 8f e1 ldi r24, 0x1F ; 31
a90a: 92 e0 ldi r25, 0x02 ; 2
a90c: 0e 94 78 10 call 0x20f0 ; 0x20f0 <USART0_TX_Flash_String>
i++;
DEBUGGER_UART_TX_String(enter_txt);
a9a6: 8f e9 ldi r24, 0x9F ; 159
a9a8: 94 e1 ldi r25, 0x14 ; 20
a9aa: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
DEBUGGER_UART_TX_String("For Loop Start for(u8b i = 0; i < 3; i++)");
a9ae: 8c e7 ldi r24, 0x7C ; 124
a9b0: 96 e0 ldi r25, 0x06 ; 6
a9b2: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
DEBUGGER_UART_TX_String(enter_txt);
a9b6: 8f e9 ldi r24, 0x9F ; 159
a9b8: 94 e1 ldi r25, 0x14 ; 20
a9ba: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
for(i = 0; i < 3; i++)
a9be: 2c e8 ldi r18, 0x8C ; 140
a9c0: e2 2e mov r14, r18
a9c2: 20 e0 ldi r18, 0x00 ; 0
a9c4: f2 2e mov r15, r18
DEBUGGER_UART_TX_String(enter_txt);
a9c6: 67 01 movw r12, r14
a9c8: 10 e0 ldi r17, 0x00 ; 0
a9ca: 00 e0 ldi r16, 0x00 ; 0
{
Print_Value_DEC("esp8266_cmd_flash_ptr[i].cmd_ptr: ", i);
a9cc: 4a e0 ldi r20, 0x0A ; 10
a9ce: b8 01 movw r22, r16
a9d0: 86 ea ldi r24, 0xA6 ; 166
a9d2: 96 e0 ldi r25, 0x06 ; 6
a9d4: 0e 94 64 14 call 0x28c8 ; 0x28c8 <Print_Value_General>
DEBUGGER_UART_TXFS(esp8266_cmd_flash_ptr[i].cmd_ptr);
a9d8: f6 01 movw r30, r12
a9da: 80 81 ld r24, Z
a9dc: 91 81 ldd r25, Z+1 ; 0x01
a9de: 0e 94 78 10 call 0x20f0 ; 0x20f0 <USART0_TX_Flash_String>
DEBUGGER_UART_TX_String(enter_txt);
a9e2: 8f e9 ldi r24, 0x9F ; 159
a9e4: 94 e1 ldi r25, 0x14 ; 20
a9e6: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
for(i = 0; i < 3; i++)
a9ea: 0f 5f subi r16, 0xFF ; 255
a9ec: 1f 4f sbci r17, 0xFF ; 255
a9ee: f4 e0 ldi r31, 0x04 ; 4
a9f0: cf 0e add r12, r31
a9f2: d1 1c adc r13, r1
a9f4: 03 30 cpi r16, 0x03 ; 3
a9f6: 11 05 cpc r17, r1
a9f8: 49 f7 brne .-46 ; 0xa9cc <MXA_Test+0x27a>
}
DEBUGGER_UART_TX_String("For Loop End");
a9fa: 83 ed ldi r24, 0xD3 ; 211
a9fc: 95 e0 ldi r25, 0x05 ; 5
a9fe: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
DEBUGGER_UART_TX_String(enter_txt);
aa02: 8f e9 ldi r24, 0x9F ; 159
aa04: 94 e1 ldi r25, 0x14 ; 20
aa06: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
DEBUGGER_UART_TX_String("For Loop Start for(u8b i = 0; i < 6; i++)");
aa0a: 89 ec ldi r24, 0xC9 ; 201
aa0c: 96 e0 ldi r25, 0x06 ; 6
aa0e: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
DEBUGGER_UART_TX_String(enter_txt);
aa12: 8f e9 ldi r24, 0x9F ; 159
aa14: 94 e1 ldi r25, 0x14 ; 20
aa16: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
aa1a: 9c e8 ldi r25, 0x8C ; 140
aa1c: c9 2e mov r12, r25
aa1e: 90 e0 ldi r25, 0x00 ; 0
aa20: d9 2e mov r13, r25
aa22: 10 e0 ldi r17, 0x00 ; 0
aa24: 00 e0 ldi r16, 0x00 ; 0
for(i = 0; i < 6; i++)
{
Print_Value_DEC("esp8266_cmd_flash_ptr[i].cmd_ptr: ", i);
aa26: 4a e0 ldi r20, 0x0A ; 10
aa28: b8 01 movw r22, r16
aa2a: 86 ea ldi r24, 0xA6 ; 166
aa2c: 96 e0 ldi r25, 0x06 ; 6
aa2e: 0e 94 64 14 call 0x28c8 ; 0x28c8 <Print_Value_General>
DEBUGGER_UART_TXFS(esp8266_cmd_flash_ptr[i].cmd_ptr);
aa32: f6 01 movw r30, r12
aa34: 80 81 ld r24, Z
aa36: 91 81 ldd r25, Z+1 ; 0x01
aa38: 0e 94 78 10 call 0x20f0 ; 0x20f0 <USART0_TX_Flash_String>
DEBUGGER_UART_TX_String(enter_txt);
aa3c: 8f e9 ldi r24, 0x9F ; 159
aa3e: 94 e1 ldi r25, 0x14 ; 20
aa40: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
for(i = 0; i < 6; i++)
aa44: 0f 5f subi r16, 0xFF ; 255
aa46: 1f 4f sbci r17, 0xFF ; 255
aa48: f4 e0 ldi r31, 0x04 ; 4
aa4a: cf 0e add r12, r31
aa4c: d1 1c adc r13, r1
aa4e: 06 30 cpi r16, 0x06 ; 6
aa50: 11 05 cpc r17, r1
aa52: 49 f7 brne .-46 ; 0xaa26 <MXA_Test+0x2d4>
}
DEBUGGER_UART_TX_String("For Loop End");
aa54: 83 ed ldi r24, 0xD3 ; 211
aa56: 95 e0 ldi r25, 0x05 ; 5
aa58: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
DEBUGGER_UART_TX_String(enter_txt);
aa5c: 8f e9 ldi r24, 0x9F ; 159
aa5e: 94 e1 ldi r25, 0x14 ; 20
aa60: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
i = 0;
DEBUGGER_UART_TX_String("Loop Start while(i < 3)");
aabe: 85 e2 ldi r24, 0x25 ; 37
aac0: 97 e0 ldi r25, 0x07 ; 7
aac2: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
DEBUGGER_UART_TX_String(enter_txt);
aac6: 8f e9 ldi r24, 0x9F ; 159
aac8: 94 e1 ldi r25, 0x14 ; 20
aaca: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
aace: 10 e0 ldi r17, 0x00 ; 0
aad0: 00 e0 ldi r16, 0x00 ; 0
while(i < 3)
{
Print_Value_DEC("esp8266_cmd_flash_ptr[i].cmd_ptr: ", i);
aad2: 4a e0 ldi r20, 0x0A ; 10
aad4: b8 01 movw r22, r16
aad6: 86 ea ldi r24, 0xA6 ; 166
aad8: 96 e0 ldi r25, 0x06 ; 6
aada: 0e 94 64 14 call 0x28c8 ; 0x28c8 <Print_Value_General>
DEBUGGER_UART_TXFS(esp8266_cmd_flash_ptr[i].cmd_ptr);
aade: f7 01 movw r30, r14
aae0: 80 81 ld r24, Z
aae2: 91 81 ldd r25, Z+1 ; 0x01
aae4: 0e 94 78 10 call 0x20f0 ; 0x20f0 <USART0_TX_Flash_String>
DEBUGGER_UART_TX_String(enter_txt);
aae8: 8f e9 ldi r24, 0x9F ; 159
aaea: 94 e1 ldi r25, 0x14 ; 20
aaec: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
while(i < 3)
aaf0: 0f 5f subi r16, 0xFF ; 255
aaf2: 1f 4f sbci r17, 0xFF ; 255
aaf4: f4 e0 ldi r31, 0x04 ; 4
aaf6: ef 0e add r14, r31
aaf8: f1 1c adc r15, r1
aafa: 03 30 cpi r16, 0x03 ; 3
aafc: 11 05 cpc r17, r1
aafe: 49 f7 brne .-46 ; 0xaad2 <MXA_Test+0x380>
i++;
}
DEBUGGER_UART_TX_String("while Loop End");
ab00: 86 e1 ldi r24, 0x16 ; 22
ab02: 97 e0 ldi r25, 0x07 ; 7
ab04: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
DEBUGGER_UART_TX_String(enter_txt);
ab08: 8f e9 ldi r24, 0x9F ; 159
ab0a: 94 e1 ldi r25, 0x14 ; 20
ab0c: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
DEBUGGER_UART_TX_String("void MXA_Test(void) End");
ab10: 8d e3 ldi r24, 0x3D ; 61
ab12: 97 e0 ldi r25, 0x07 ; 7
ab14: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
DEBUGGER_UART_TX_String(enter_txt);
ab18: 8f e9 ldi r24, 0x9F ; 159
ab1a: 94 e1 ldi r25, 0x14 ; 20
ab1c: 0e 94 50 10 call 0x20a0 ; 0x20a0 <USART0_TX_String>
}
ab20: 6d 96 adiw r28, 0x1d ; 29
ab22: 0f b6 in r0, 0x3f ; 63
ab24: f8 94 cli
ab26: de bf out 0x3e, r29 ; 62
ab28: 0f be out 0x3f, r0 ; 63
ab2a: cd bf out 0x3d, r28 ; 61
ab2c: df 91 pop r29
ab2e: cf 91 pop r28
ab30: 1f 91 pop r17
ab32: 0f 91 pop r16
ab34: ff 90 pop r15
ab36: ef 90 pop r14
ab38: df 90 pop r13
ab3a: cf 90 pop r12
ab3c: 08 95 ret
So can anyone make heads or tails out of it?
The Solution to the problem lies in the fact, that I was using pointers... (Me Schoolbus - Pointer.... U know).
for(i = 0; i < 6; i++)
{
Print_Value_DEC("esp8266_cmd_flash_ptr[i].cmd_ptr: ", i);
//DEBUGGER_UART_TXFS(esp8266_cmd_flash_ptr[i].cmd_ptr);
DEBUGGER_UART_TXFS(pgm_read_word(&(esp8266_cmd_flash_ptr[i].cmd_ptr))); // works
DEBUGGER_UART_TX_String(enter_txt);
}
Because
const td_esp8266_cmd_ptr esp8266_cmd_flash_ptr[6] PROGMEM = {};
contains pointers to the strings, and those pointers are themselves stored in flash; that is why an extra pgm_read_word has to be used first to fetch the pointer.
But then again, if this is the code that works in all cases (inside and outside of loops), why does the "bad" code work outside the loops?
This question already has an answer here:
Assembly do we need the endings? [duplicate]
(1 answer)
Closed 1 year ago.
Recently, I read some books about computer science. I wrote some C code and disassembled it, using gcc and objdump.
The following C code:
#include <stdio.h>
#include <stdbool.h>
/* Sums each row of a fixed 2x4 table of shorts and prints one line per row.
 * Always returns 0.
 * NOTE: pn[] and s[] are static and are modified by the loop (the row
 * pointers are advanced, the sums accumulate), so the function is only
 * meaningful on its first call; a second call would continue reading past
 * the end of each row. */
int dojob()
{
static short num[ ][4] = { {2, 9, -1, 5}, {3, 8, 2, -6}};
/* pn[i] starts at the first element of row i. */
static short *pn[ ] = {num[0], num[1]};
/* s[i] accumulates the sum of row i. */
static short s[2] = {0, 0};
int i, j;
for (i=0; i<2; i++) {
for (j=0; j<4; j++){
/* Parsed as *(pn[i]++): add the element pn[i] currently points to,
 * then advance pn[i] to the next short. */
s[i] += *pn[i]++;
}
printf ("sum of line %d: %d\n", i+1, s[i]);
}
return 0;
}
/* Program entry point: runs dojob() once and ignores its return value. */
int main ( )
{
dojob();
}
I got the following assembly code (AT&T syntax; only the assembly of the function dojob and some of the data are listed):
00401350 <_dojob>:
401350: 55 push %ebp
401351: 89 e5 mov %esp,%ebp
401353: 83 ec 28 sub $0x28,%esp
401356: c7 45 f4 00 00 00 00 movl $0x0,-0xc(%ebp)
40135d: eb 75 jmp 4013d4 <_dojob+0x84>
40135f: c7 45 f0 00 00 00 00 movl $0x0,-0x10(%ebp)
401366: eb 3c jmp 4013a4 <_dojob+0x54>
401368: 8b 45 f4 mov -0xc(%ebp),%eax
40136b: 8b 04 85 00 20 40 00 mov 0x402000(,%eax,4),%eax
401372: 8d 48 02 lea 0x2(%eax),%ecx
401375: 8b 55 f4 mov -0xc(%ebp),%edx
401378: 89 0c 95 00 20 40 00 mov %ecx,0x402000(,%edx,4)
40137f: 0f b7 10 movzwl (%eax),%edx
401382: 8b 45 f4 mov -0xc(%ebp),%eax
401385: 0f b7 84 00 08 50 40 movzwl 0x405008(%eax,%eax,1),%eax
40138c: 00
40138d: 89 c1 mov %eax,%ecx
40138f: 89 d0 mov %edx,%eax
401391: 01 c8 add %ecx,%eax
401393: 89 c2 mov %eax,%edx
401395: 8b 45 f4 mov -0xc(%ebp),%eax
401398: 66 89 94 00 08 50 40 mov %dx,0x405008(%eax,%eax,1)
40139f: 00
4013a0: 83 45 f0 01 addl $0x1,-0x10(%ebp)
4013a4: 83 7d f0 03 cmpl $0x3,-0x10(%ebp)
4013a8: 7e be jle 401368 <_dojob+0x18>
4013aa: 8b 45 f4 mov -0xc(%ebp),%eax
4013ad: 0f b7 84 00 08 50 40 movzwl 0x405008(%eax,%eax,1),%eax
4013b4: 00
4013b5: 98 cwtl
4013b6: 8b 55 f4 mov -0xc(%ebp),%edx
4013b9: 83 c2 01 add $0x1,%edx
4013bc: 89 44 24 08 mov %eax,0x8(%esp)
4013c0: 89 54 24 04 mov %edx,0x4(%esp)
4013c4: c7 04 24 24 30 40 00 movl $0x403024,(%esp)
4013cb: e8 50 08 00 00 call 401c20 <_printf>
4013d0: 83 45 f4 01 addl $0x1,-0xc(%ebp)
4013d4: 83 7d f4 01 cmpl $0x1,-0xc(%ebp)
4013d8: 7e 85 jle 40135f <_dojob+0xf>
4013da: b8 00 00 00 00 mov $0x0,%eax
4013df: c9 leave
4013e0: c3 ret
Disassembly of section .data:
00402000 <__data_start__>:
402000: 08 20 or %ah,(%eax)
402002: 40 inc %eax
402003: 00 10 add %dl,(%eax)
402005: 20 40 00 and %al,0x0(%eax)
Disassembly of section .bss:
...
00405008 <_s.1927>:
405008: 00 00 add %al,(%eax)
...
I have two questions:
I don't understand the difference between the mov and movl instructions. Why does the compiler generate mov for some code and movl for other code?
I completely understand the meaning of the C code, but not the assembly that the compiler generated. Could someone annotate it so that I can understand it? I would be very grateful.
The MOVL instruction was generated because you declared two int variables (i and j); MOVL performs a 32-bit MOV, and the size of an int is 32 bits.
There is a non-exhaustive list of MOV* variants (such as MOVD for a doubleword or MOVQ for a quadword) that allow you to optimize your code by using the most suitable form and save as much time as possible.
PS: objdump's -M intel argument may help you understand the disassembly better; a lot of documentation on the Intel syntax can be found easily.
I am currently writing various implementations of a color to black/white image converter. I would like to do a :
Simple C++ implementation
Self made ASM implementation
Self made ASM implementation with AVX vector instructions.
The goal is to benchmark each one of these and analyse the performance improvement I get.
The following snippet of code is the C++ implementation. It only treats a single portion of the image, because I also want to do multithreaded computing.
// Converts `size` consecutive samples of `src` to a single grey value each
// and writes them to `dst`.
//   src   - source image; channels 0, 1 and 2 are read as R, G and B
//   dst   - destination image; channel 0 is written
//   pixel - passed as the second coordinate to data() on both images to find
//           the starting position (presumably the row index — confirm
//           against CImg's data(x,y,z,c))
//   size  - number of samples to convert from that position
void CBwConverter::run(const CImg<uint8_t> &src, CImg<uint8_t> &dst, uint32_t pixel, size_t size) const {
// Start addresses of the three input channels and of the output channel.
const uint8_t *rC = src.data(0,pixel,0,0);
const uint8_t *gC = src.data(0,pixel,0,1);
const uint8_t *bC = src.data(0,pixel,0,2);
uint8_t *mC = dst.data(0,pixel,0,0);
// All four pointers advance in lock-step, one sample per iteration.
for(size_t c = 0; c < size; c++, rC++, gC++, bC++, mC++) {
// Weighted sum of R, G and B computed in float, then converted to uint8_t
// (the fractional part is discarded by the cast).
*mC = (uint8_t)(0.299f*(*rC) + 0.587f*(*gC) + 0.114f*(*bC));
}
}
Now, before starting the ASM version, I had my C++ code compiled and disassembled just to see what it looks like. After compiling with gcc -std=c++11 -g -O2 -c CBwConverter.cc, I obtained the following output with objdump -d CBwConverter.o:
0000000000000000 <_ZNK12CBwConverter3runERKN12cimg_library4CImgIhEERS2_jm>:
0: 53 push %rbx
1: 8b 3e mov (%rsi),%edi
3: 89 c8 mov %ecx,%eax
5: 44 8b 56 04 mov 0x4(%rsi),%r10d
9: 44 8b 5e 08 mov 0x8(%rsi),%r11d
d: 89 c9 mov %ecx,%ecx
f: 48 8b 5e 18 mov 0x18(%rsi),%rbx
13: 0f af c7 imul %edi,%eax
16: 4c 0f af d7 imul %rdi,%r10
1a: 4b 8d 34 1b lea (%r11,%r11,1),%rsi
1e: 4c 8d 0c 03 lea (%rbx,%rax,1),%r9
22: 4c 89 d7 mov %r10,%rdi
25: 49 0f af fb imul %r11,%rdi
29: 4c 0f af d6 imul %rsi,%r10
2d: 48 01 c7 add %rax,%rdi
30: 4c 01 d0 add %r10,%rax
33: 48 01 df add %rbx,%rdi
36: 48 8d 34 03 lea (%rbx,%rax,1),%rsi
3a: 8b 02 mov (%rdx),%eax
3c: 48 0f af c8 imul %rax,%rcx
40: 48 03 4a 18 add 0x18(%rdx),%rcx
44: 4d 85 c0 test %r8,%r8
47: 74 6b je b4 <_ZNK12CBwConverter3runERKN12cimg_library4CImgIhEERS2_jm+0xb4>
49: 31 d2 xor %edx,%edx
4b: f3 0f 10 25 00 00 00 movss 0x0(%rip),%xmm4 # 53 <_ZNK12CBwConverter3runERKN12cimg_library4CImgIhEERS2_jm+0x53>
52: 00
53: f3 0f 10 1d 00 00 00 movss 0x0(%rip),%xmm3 # 5b <_ZNK12CBwConverter3runERKN12cimg_library4CImgIhEERS2_jm+0x5b>
5a: 00
5b: f3 0f 10 15 00 00 00 movss 0x0(%rip),%xmm2 # 63 <_ZNK12CBwConverter3runERKN12cimg_library4CImgIhEERS2_jm+0x63>
62: 00
63: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
68: 41 0f b6 04 11 movzbl (%r9,%rdx,1),%eax
6d: 66 0f ef c0 pxor %xmm0,%xmm0
71: f3 0f 2a c0 cvtsi2ss %eax,%xmm0
75: 0f b6 04 17 movzbl (%rdi,%rdx,1),%eax
79: 0f 28 c8 movaps %xmm0,%xmm1
7c: 66 0f ef c0 pxor %xmm0,%xmm0
80: f3 0f 59 cc mulss %xmm4,%xmm1
84: f3 0f 2a c0 cvtsi2ss %eax,%xmm0
88: 0f b6 04 16 movzbl (%rsi,%rdx,1),%eax
8c: f3 0f 59 c3 mulss %xmm3,%xmm0
90: f3 0f 58 c1 addss %xmm1,%xmm0
94: 66 0f ef c9 pxor %xmm1,%xmm1
98: f3 0f 2a c8 cvtsi2ss %eax,%xmm1
9c: f3 0f 59 ca mulss %xmm2,%xmm1
a0: f3 0f 58 c1 addss %xmm1,%xmm0
a4: f3 0f 2c c0 cvttss2si %xmm0,%eax
a8: 88 04 11 mov %al,(%rcx,%rdx,1)
ab: 48 83 c2 01 add $0x1,%rdx
af: 49 39 d0 cmp %rdx,%r8
b2: 75 b4 jne 68 <_ZNK12CBwConverter3runERKN12cimg_library4CImgIhEERS2_jm+0x68>
b4: 5b pop %rbx
b5: c3 retq
I can already tell that the for-loop starts at 68 and ends at b2.
Something bothers me in the disassembled program. Why does the compiler decide to set registers %xmm0 and %xmm1 to 0, typically at 6d with the instruction pxor? These registers are overwritten right afterwards by the instruction cvtsi2ss, which loads an integer, converts it to a single-precision number and finally stores it into them. Why set them to 0 when they are overwritten right afterwards? If the compiler does it, am I supposed to do the same when writing my own asm version?