golang doing unexpected heap memory allocation - go

While benchmarking, I noticed a surprising heap memory allocation. After reducing the repro, I ended up with the following:
// --- Repro file ---
func memAllocRepro(values []int) *[]int {
for {
break
}
return &values
}
// --- Benchmark file ---
func BenchmarkMemAlloc(b *testing.B) {
values := []int{1, 2, 3, 4}
for i := 0; i < b.N; i++ {
memAllocRepro(values)
}
}
And here is the benchmark output:
BenchmarkMemAlloc-4 50000000 40.2 ns/op 32 B/op 1 allocs/op
PASS
ok memalloc_debugging 2.113s
Success: Benchmarks passed.
Now the funny this is, if I remove the for loop, or if I return the slice directly instead of a slice pointer, there are no more heap alloc:
// --- Repro file ---
func noAlloc1(values []int) *[]int {
return &values // No alloc!
}
func noAlloc2(values []int) []int {
for {
break
}
return values // No alloc!
}
// --- Benchmark file ---
func BenchmarkNoAlloc(b *testing.B) {
values := []int{1, 2, 3, 4}
for i := 0; i < b.N; i++ {
noAlloc1(values)
noAlloc2(values)
}
Benchmark result:
BenchmarkNoAlloc-4 300000000 4.20 ns/op 0 B/op 0 allocs/op
PASS
ok memalloc_debugging 1.756s
Success: Benchmarks passed.
I found that very confusing and confirmed with Delve that the disassembly does has an allocation at the start of the memAllocRepro function:
(dlv) disassemble
TEXT main.memAllocRepro(SB) memalloc_debugging/main.go
main.go:10 0x44ce10 65488b0c2528000000 mov rcx, qword ptr gs:[0x28]
main.go:10 0x44ce19 488b8900000000 mov rcx, qword ptr [rcx]
main.go:10 0x44ce20 483b6110 cmp rsp, qword ptr [rcx+0x10]
main.go:10 0x44ce24 7662 jbe 0x44ce88
main.go:10 0x44ce26 4883ec18 sub rsp, 0x18
main.go:10 0x44ce2a 48896c2410 mov qword ptr [rsp+0x10], rbp
main.go:10 0x44ce2f 488d6c2410 lea rbp, ptr [rsp+0x10]
main.go:10 0x44ce34 488d0525880000 lea rax, ptr [rip+0x8825]
main.go:10 0x44ce3b 48890424 mov qword ptr [rsp], rax
=> main.go:10 0x44ce3f* e8bcebfbff call 0x40ba00 runtime.newobject
I must say though, once I hit that point, I couldn't easily dig further. I'm pretty sure it would be possible to know at least which type is allocated by looking at the structure pointed to by the RAX register, but I wasn't very successful doing so. It's been a long time since I've read disassembly like this.
(dlv) regs
Rip = 0x000000000044ce3f
Rsp = 0x000000c042039f30
Rax = 0x0000000000455660
(...)
All that being said, I have 2 questions:
* Anyone can tell why is there a heap allocation there and if it's "expected"?
* How could I have gone further in my debugging session? Dumping memory to hex has a different address layout and go tool objdump will output disassembly, which mangle the content at the address location
Full function dump with go tool objdump:
TEXT main.memAllocRepro(SB) memalloc_debugging/main.go
main.go:10 0x44ce10 65488b0c2528000000 MOVQ GS:0x28, CX
main.go:10 0x44ce19 488b8900000000 MOVQ 0(CX), CX
main.go:10 0x44ce20 483b6110 CMPQ 0x10(CX), SP
main.go:10 0x44ce24 7662 JBE 0x44ce88
main.go:10 0x44ce26 4883ec18 SUBQ $0x18, SP
main.go:10 0x44ce2a 48896c2410 MOVQ BP, 0x10(SP)
main.go:10 0x44ce2f 488d6c2410 LEAQ 0x10(SP), BP
main.go:10 0x44ce34 488d0525880000 LEAQ runtime.types+34656(SB), AX
main.go:10 0x44ce3b 48890424 MOVQ AX, 0(SP)
main.go:10 0x44ce3f e8bcebfbff CALL runtime.newobject(SB)
main.go:10 0x44ce44 488b7c2408 MOVQ 0x8(SP), DI
main.go:10 0x44ce49 488b442428 MOVQ 0x28(SP), AX
main.go:10 0x44ce4e 48894708 MOVQ AX, 0x8(DI)
main.go:10 0x44ce52 488b442430 MOVQ 0x30(SP), AX
main.go:10 0x44ce57 48894710 MOVQ AX, 0x10(DI)
main.go:10 0x44ce5b 8b052ff60600 MOVL runtime.writeBarrier(SB), AX
main.go:10 0x44ce61 85c0 TESTL AX, AX
main.go:10 0x44ce63 7517 JNE 0x44ce7c
main.go:10 0x44ce65 488b442420 MOVQ 0x20(SP), AX
main.go:10 0x44ce6a 488907 MOVQ AX, 0(DI)
main.go:16 0x44ce6d 48897c2438 MOVQ DI, 0x38(SP)
main.go:16 0x44ce72 488b6c2410 MOVQ 0x10(SP), BP
main.go:16 0x44ce77 4883c418 ADDQ $0x18, SP
main.go:16 0x44ce7b c3 RET
main.go:16 0x44ce7c 488b442420 MOVQ 0x20(SP), AX
main.go:10 0x44ce81 e86aaaffff CALL runtime.gcWriteBarrier(SB)
main.go:10 0x44ce86 ebe5 JMP 0x44ce6d
main.go:10 0x44ce88 e85385ffff CALL runtime.morestack_noctxt(SB)
main.go:10 0x44ce8d eb81 JMP main.memAllocRepro(SB)
:-1 0x44ce8f cc INT $0x3
Disassemble of the memory pointed to by the RAX register:
(dlv) disassemble -a 0x0000000000455660 0x0000000000455860
.:0 0x455660 1800 sbb byte ptr [rax], al
.:0 0x455662 0000 add byte ptr [rax], al
.:0 0x455664 0000 add byte ptr [rax], al
.:0 0x455666 0000 add byte ptr [rax], al
.:0 0x455668 0800 or byte ptr [rax], al
.:0 0x45566a 0000 add byte ptr [rax], al
.:0 0x45566c 0000 add byte ptr [rax], al
.:0 0x45566e 0000 add byte ptr [rax], al
.:0 0x455670 8e66f9 mov fs, word ptr [rsi-0x7]
.:0 0x455673 1b02 sbb eax, dword ptr [rdx]
.:0 0x455675 0808 or byte ptr [rax], cl
.:0 0x455677 17 ?
.:0 0x455678 60 ?
.:0 0x455679 0d4a000000 or eax, 0x4a
.:0 0x45567e 0000 add byte ptr [rax], al
.:0 0x455680 c01f47 rcr byte ptr [rdi], 0x47
.:0 0x455683 0000 add byte ptr [rax], al
.:0 0x455685 0000 add byte ptr [rax], al
.:0 0x455687 0000 add byte ptr [rax], al
.:0 0x455689 0c00 or al, 0x0
.:0 0x45568b 004062 add byte ptr [rax+0x62], al
.:0 0x45568e 0000 add byte ptr [rax], al
.:0 0x455690 c0684500 shr byte ptr [rax+0x45], 0x0

Escape analysis determines whether any references to a value escape the function in which the value is declared.
In Go, arguments are passed by value, typically on the stack; the stack is reclaimed at the end of the function. However, returning the reference &values from the memAllocRepro function gives the values parameter declared in memAllocRepro a lifetime beyond the end of the function. The values variable is moved to the heap.
memAllocRepro: &values: Alloc
./escape.go:3:6: cannot inline memAllocRepro: unhandled op FOR
./escape.go:7:9: &values escapes to heap
./escape.go:7:9: from ~r1 (return) at ./escape.go:7:2
./escape.go:3:37: moved to heap: values
The noAlloc1 function is inlined in the main function. The values argument, if necessary, is declared in and does not escape from the main function.
noAlloc1: &values: No Alloc
./escape.go:10:6: can inline noAlloc1 as: func([]int)*[]int{return &values}
./escape.go:23:10: inlining call to noAlloc1 func([]int)*[]int{return &values}
The noAlloc2 function values argument is returned as values. values is returned on the stack. There is no reference to values in the noAlloc2 function and so no escape.
noAlloc2: values: No Alloc
package main
func memAllocRepro(values []int) *[]int {
for {
break
}
return &values
}
func noAlloc1(values []int) *[]int {
return &values
}
func noAlloc2(values []int) []int {
for {
break
}
return values
}
func main() {
memAllocRepro(nil)
noAlloc1(nil)
noAlloc2(nil)
}
Output:
$ go build -a -gcflags='-m -m' escape.go
# command-line-arguments
./escape.go:3:6: cannot inline memAllocRepro: unhandled op FOR
./escape.go:10:6: can inline noAlloc1 as: func([]int) *[]int { return &values }
./escape.go:14:6: cannot inline noAlloc2: unhandled op FOR
./escape.go:21:6: cannot inline main: non-leaf function
./escape.go:23:10: inlining call to noAlloc1 func([]int) *[]int { return &values }
./escape.go:7:9: &values escapes to heap
./escape.go:7:9: from ~r1 (return) at ./escape.go:7:2
./escape.go:3:37: moved to heap: values
./escape.go:11:9: &values escapes to heap
./escape.go:11:9: from ~r1 (return) at ./escape.go:11:2
./escape.go:10:32: moved to heap: values
./escape.go:14:31: leaking param: values to result ~r1 level=0
./escape.go:14:31: from ~r1 (return) at ./escape.go:18:2
./escape.go:23:10: main &values does not escape
$

Related

When I use global variables it crashes my kernel

I have had this happen before and worked around it for a while but now it slowly becomes more and more unavoidable, because now I need them.
For some weird reason, my kernel crashes when I try to use a global variable in my code.
This works:
int global;
void kmain()
{
//do some stuff...
}
This does not work:
int global;
void kmain()
{
global = 1;
//do some stuff...
}
I have no idea why this is happening.
As some additional resources here is my linker script:
OUTPUT_FORMAT(binary)
phys = 0x0500;
SECTIONS
{
.text phys : AT(phys) {
code = .;
*(.text)
*(.rodata)
. = ALIGN(4096);
}
.data : AT(phys + (data - code))
{
data = .;
*(.data)
. = ALIGN(4096);
}
.bss : AT(phys + (bss - code)) {
bss = .;
*(.bss)
. = ALIGN(4096);
}
end = .;
/DISCARD/
: {
*(.comment)
*(.eh_frame)
*(.note.gnu.build-id)
}
}
and my makefile:
bin/UmbrellaOS.img: bin/boot.bin bin/kernel.bin bin/zeros.bin
cat $^ > $#
bin/kernel.bin: tmp/kernel_entry.o tmp/kernel.o
x86_64-elf-ld -o $# -T link.ld $^
tmp/kernel.o: src/kernel/main.c
x86_64-elf-gcc -ffreestanding -m64 -g -c $^ -o $#
Edit:
To be more specific I use QEMU to test my OS upon starting QEMU it instantly closes. It should also be noted that if I try something like this:
int global;
void kmain()
{
return;
global = 0;
}
it works for some reason.
I can see a green L printed to the screen which is the last thing my bootloader does before passing control to the kernel after long mode has been entered.
btw here is my bootloader:
[bits 16]
[org 0x7C00]
KERNEL_LOC equ 0x0500
_start:
mov [_BootDisk], dl
xor ax, ax
mov ds, ax
mov es, ax
mov ss, ax
mov bp, 0x7BFF
mov sp, bp
push 0x7E00 ; buffer
push 1 ; sectors to read
push 2 ; sector num
call DiskRead
jc .error
push ebx
pushfd
pop eax
mov ebx, eax
xor eax, 0x200000
push eax
popfd
pushfd
pop eax
cmp eax, ebx
jnz .supported
push _CpuErrorString
call Print
jmp .error
.supported:
mov eax, 0x80000000
cpuid
cmp eax, 0x80000001
jb .no64
mov eax, 0x80000001
cpuid
test edx, 1 << 29
jnz .is64
.no64:
push _64ErrorString
call Print
jmp .error
.is64:
push 0x8000
call MapMem
push KERNEL_LOC ; buffer
push 8 ; sectors to read
push 3 ; sector num
call DiskRead
jc .error
cli
lgdt [GDT_descriptor]
mov eax, cr0
or eax, 1
mov cr0, eax
jmp CODE_SEG:protected_mode
.error:
jmp $
Print:
push bp
mov bp, sp
mov bx, [bp+4]
mov ah, 0x0E
.loop:
mov al, [bx]
cmp al, 0
je .end
int 0x10
inc bx
jmp .loop
.end:
mov sp, bp
pop bp
ret 2
DiskRead:
push bp
mov bp, sp
mov ah, 0x02
mov al, [bp+6]
mov ch, 0
mov cl, [bp+4]
mov dh, 0
mov dl, [_BootDisk]
mov bx, [bp+8]
int 0x13
cmp al, [bp+6]
je .end
jnc .end
push _DiskErrorString
call Print
.end:
mov sp, bp
pop bp
ret 6
MapMem:
push bp
mov bp, sp
mov si, [bp+4]
mov di, [bp+4]
add di, 4
xor ebx, ebx
mov edx, 0x0534D4150
mov eax, 0xE820
mov [di+20], dword 1
mov ecx, 24
int 0x15
jc .failed
mov edx, 0x0534D4150
cmp eax, edx
jne .failed
test ebx, ebx
je .failed
.loop:
mov eax, 0xE820
mov [di+20], dword 1
mov ecx, 24
int 0x15
jc .finish
mov edx, 0x0534D4150
.jmpin:
jcxz .skip
cmp cl, 20
jbe .notext
test byte [di+20], 1
je .skip
.notext:
mov ecx, [di+8]
or ecx, [di+12]
jz .skip
inc dword [si]
add di, 24
.skip:
test ebx, ebx
jne .loop
.finish:
clc
jmp .end
.failed:
push _MemErrorString
call Print
stc
jmp .end
.end:
mov sp, bp
pop bp
ret 2
_BootDisk: db 0
_DiskErrorString: db "Disk read error!", 13, 10, 0
_MemErrorString: db "Memory mapping failed!", 13, 10, 0
_CpuErrorString: db "CPUID not supported!", 13, 10, 0
_64ErrorString: db "x64 bits not supported!", 13, 10, 0
CODE_SEG equ GDT_code - GDT_start
DATA_SEG equ GDT_data - GDT_start
GDT_start:
GDT_null:
dd 0x0
dd 0x0
GDT_code:
dw 0xffff
dw 0x0
db 0x0
db 0b10011010
db 0b11001111
db 0x0
GDT_data:
dw 0xffff
dw 0x0
db 0x0
db 0b10010010
db 0b11001111
db 0x0
GDT_end:
GDT_descriptor:
dw GDT_end - GDT_start - 1
dd GDT_start
times 510-($-$$) db 0
dw 0xAA55
[bits 32]
protected_mode:
mov ax, DATA_SEG
mov ds, ax
mov ss, ax
mov es, ax
mov fs, ax
mov gs, ax
mov ebp, 0x90000
mov esp, ebp
call Clear
mov ebx, VGA_MEM
mov byte [ebx], 'P'
inc ebx
mov byte [ebx], 14
mov eax, cr0
and eax, ~(1 << 31)
mov cr0, eax
mov edi, 0x1000
mov cr3, edi
xor eax, eax
mov ecx, 4096
rep stosd
mov edi, cr3
mov dword [edi], 0x2003
add edi, 0x1000
mov dword [edi], 0x3003
add edi, 0x1000
mov dword [edi], 0x4003
add edi, 0x1000
mov ebx, 0x00000003
mov ecx, 512
.set_entry:
mov dword [edi], ebx
add ebx, 0x1000
add edi, 8
loop .set_entry
mov eax, cr4
or eax, 1 << 5
mov cr4, eax
mov ecx, 0xC0000080
rdmsr
or eax, 1 << 8
wrmsr
mov eax, cr0
or eax, 1 << 31
mov cr0, eax
lgdt [GDT.Pointer]
jmp GDT.Code:long_mode
jmp $
Clear:
push ebp
mov ebp, esp
mov ecx, VGA_SIZE
mov eax, VGA_MEM
.loop:
mov byte [eax], 0
inc eax
loop .loop
mov esp, ebp
pop ebp
ret
PRESENT equ 1 << 7
NOT_SYS equ 1 << 4
EXEC equ 1 << 3
RW equ 1 << 1
ACCESSED equ 1 << 0
GRAN_4K equ 1 << 7
SZ_32 equ 1 << 6
LONG_MODE equ 1 << 5
GDT:
.Null: equ $ - GDT
dq 0
.Code: equ $ - GDT
dd 0xFFFF
db 0
db PRESENT | NOT_SYS | EXEC | RW
db GRAN_4K | LONG_MODE | 0xF
db 0
.Data: equ $ - GDT
dd 0xFFFF
db 0
db PRESENT | NOT_SYS | RW
db GRAN_4K | SZ_32 | 0xF
db 0
.TSS: equ $ - GDT
dd 0x00000068
dd 0x00CF8900
.Pointer:
dw $ - GDT - 1
dq GDT
[bits 64]
long_mode:
cli
mov ax, GDT.Data
mov ds, ax
mov es, ax
mov fs, ax
mov gs, ax
mov ss, ax
mov rbp, 0x0007FFFF
mov rsp, rbp
mov rbx, VGA_MEM
mov byte [rbx], 'L'
inc rbx
mov byte [rbx], 2
jmp KERNEL_LOC
VGA_MEM equ 0xB8000
VGA_WIDTH equ 80
VGA_HEIGHT equ 25
VGA_STRIDE equ 2
VGA_SIZE equ VGA_WIDTH * VGA_STRIDE * VGA_HEIGHT
VGA_LENGTH equ VGA_WIDTH * VGA_HEIGHT
times 1024-($-$$) db 0
And for anyone wanting to see the big picture here's the Github repository I made.
The problem was that I simply forgot that I put my page table structures at 0x1000 and accidentally overrode them when loading my kernel at 0x0500.
I ended up leaving the structures at 0x1000 and moved my kernel to 0x5000.
This was a rather simple problem but I would recommend that you still take a look at the comments because there's still a lot of useful information and things to consider.
First: ALWAYS Initialize variables especially global ones.
Second: The problem is surely from your bootloader, can you edit the post to show us how you load your kernel?
Try objdump to see if the variable is declared, use -monitor stdio with QEMU and check the value of CR2 Register, it may be a page fault due to the second problem.
Here is a solution to check if the variable really has a valid pointer:
You can remove these edits after everything is ok.
instead of :
jmp KERNEL_LOC
do:
call KERNEL_LOC ; RAX Has the pointer of the global variable
jmp $
In kmain just type:
return &global
Then run it on QEMU and type in the console info registers, RAX should contain the pointer of the variable named global.

is there a difference between al and dl? [duplicate]

This question already has answers here:
How do AX, AH, AL map onto EAX?
(6 answers)
Closed 2 years ago.
First, it is my first assembly. And I use it with "NASM" and "64bit ASM" and "Intel" code and Mac OS.
When I try to write strdup function, it didn't work with al.
In using my strdup function, the string was broken.
So, my code is here.
global _strdup
extern _malloc
section .text
_strdup:
xor rcx, rcx
jmp strlen
strlen:
cmp [rdi + rcx], byte 0
je malloc
inc rcx
jmp strlen
malloc:
inc rcx
push rdi
mov rdi, rcx
call _malloc
pop rdi
cmp rax, 0
je error
xor rcx, rcx
jmp copy
copy:
mov al, byte [rdi + rcx]
mov byte [rax + rcx], al
cmp al, byte 0
je return
jmp increment
increment:
inc rcx
jmp copy
error:
mov rax, 0
ret
return:
ret
But, when I write same code with dl not al, it works!
global _strdup
extern _malloc
section .text
_strdup:
xor rcx, rcx
jmp strlen
strlen:
cmp [rdi + rcx], byte 0
je malloc
inc rcx
jmp strlen
malloc:
inc rcx
push rdi
mov rdi, rcx
call _malloc
pop rdi
cmp rax, 0
je error
xor rcx, rcx
jmp copy
copy:
mov dl, byte [rdi + rcx]
mov byte [rax + rcx], dl
cmp dl, byte 0
je return
jmp increment
increment:
inc rcx
jmp copy
error:
mov rax, 0
ret
return:
ret
I don't know why dl works and the difference between al and dl.
Anyone know about it? Thanks.
al is the smallest part of rax while dl is the smallest part of rdx. Changing al will change the value of rax as well.
Why your code is not working with al?
copy:
mov al, byte [rdi + rcx] ; you move "byte [rdi + rcx]" in "al"
; thus changing the value of "rax" as well.
; the result of the following statement becomes unexpected
mov byte [rax + rcx], al
But when you do it with dl, rax is not affected and hence your program works as expected.
Try the following code and run it through debugger to see the effect of changing the value of al on `rax:
section .text
global main
main:
nop
mov rax, 1000
mov al, byte 10
nop
Result:
After mov rax, 1000, value of rax: 1000
00000000 00000000 00000000 00000000 00000000 00000000 00000011 11101000
After mov al, 10, value of rax: 778
00000000 00000000 00000000 00000000 00000000 00000000 00000011 00001010
^^^^^^^^
So, changing al will affect rax.

printf gets stuck in an infinite loop with AL = 10 on x86-64 Linux with older gcc

Very simple assembly introduction code.
Seems to compile ok through gcc -o prog1 prog1.s, then ./prog1 just skips a line and shows nothing, like waiting an input the code doesn't ask. What's wrong?
Using gcc (Debian 4.7.2-5) 4.7.2 in 64-bit gNewSense running on VMware.
Code:
/*
int nums[] = {10, -21, -30, 45};
int main() {
int i, *p;
for (i = 0, p = nums; i != 4; i++, p++)
printf("%d\n", *p);
return 0;
}
*/
.data
nums: .int 10, -21, -30, 45
Sf: .string "%d\n" # string de formato para printf
.text
.globl main
main:
/********************************************************/
/* mantenha este trecho aqui e nao mexa - prologo !!! */
pushq %rbp
movq %rsp, %rbp
subq $16, %rsp
movq %rbx, -8(%rbp)
movq %r12, -16(%rbp)
/********************************************************/
movl $0, %ebx /* ebx = 0; */
movq $nums, %r12 /* r12 = &nums */
L1:
cmpl $4, %ebx /* if (ebx == 4) ? */
je L2 /* goto L2 */
movl (%r12), %eax /* eax = *r12 */
/*************************************************************/
/* este trecho imprime o valor de %eax (estraga %eax) */
movq $Sf, %rdi /* primeiro parametro (ponteiro)*/
movl %eax, %esi /* segundo parametro (inteiro) */
call printf /* chama a funcao da biblioteca */
/*************************************************************/
addl $1, %ebx /* ebx += 1; */
addq $4, %r12 /* r12 += 4; */
jmp L1 /* goto L1; */
L2:
/***************************************************************/
/* mantenha este trecho aqui e nao mexa - finalizacao!!!! */
movq $0, %rax /* rax = 0 (valor de retorno) */
movq -8(%rbp), %rbx
movq -16(%rbp), %r12
leave
ret
/***************************************************************/
tl;dr: do xorl %eax, %eax before call printf.
printf is a varargs function. Here's what the System V AMD64 ABI has to say about varargs functions:
For calls that may call functions that use varargs or stdargs (prototype-less
calls or calls to functions containing ellipsis (. . . ) in the declaration) %al18 is used
as hidden argument to specify the number of vector registers used. The contents
of %al do not need to match exactly the number of registers, but must be an upper
bound on the number of vector registers used and is in the range 0–8 inclusive.
You broke that rule. You'll see that the first time your code calls printf, %al is 10, which is more than the upper bound of 8. On your gNewSense system, here's a disassembly of the beginning of printf:
printf:
sub $0xd8,%rsp
movzbl %al,%eax # rax = al;
mov %rdx,0x30(%rsp)
lea 0x0(,%rax,4),%rdx # rdx = rax * 4;
lea after_movaps(%rip),%rax # rax = &&after_movaps;
mov %rsi,0x28(%rsp)
mov %rcx,0x38(%rsp)
mov %rdi,%rsi
sub %rdx,%rax # rax -= rdx;
lea 0xcf(%rsp),%rdx
mov %r8,0x40(%rsp)
mov %r9,0x48(%rsp)
jmpq *%rax # goto *rax;
movaps %xmm7,-0xf(%rdx)
movaps %xmm6,-0x1f(%rdx)
movaps %xmm5,-0x2f(%rdx)
movaps %xmm4,-0x3f(%rdx)
movaps %xmm3,-0x4f(%rdx)
movaps %xmm2,-0x5f(%rdx)
movaps %xmm1,-0x6f(%rdx)
movaps %xmm0,-0x7f(%rdx)
after_movaps:
# nothing past here is relevant for your problem
A quasi-C translation of the important bits is goto *(&&after_movaps - al * 4); (see Labels as Values). For efficiency, gcc and/or glibc didn't want to save more vector registers than you used, and it also doesn't want to do a bunch of conditional branches. Each instruction to save a vector register is 4 bytes, so it takes the end of the vector register saving instructions, subtracts al * 4 bytes, and jumps there. This results in just enough of the instructions executing. Since you had more than 8, it ended up jumping too far back, and landing before the jump instruction it just took, thus creating an infinite loop.
As for why it's not reproducible on modern systems, here's a disassembly of the beginning of their printf:
printf:
sub $0xd8,%rsp
mov %rdi,%r10
mov %rsi,0x28(%rsp)
mov %rdx,0x30(%rsp)
mov %rcx,0x38(%rsp)
mov %r8,0x40(%rsp)
mov %r9,0x48(%rsp)
test %al,%al # if(!al)
je after_movaps # goto after_movaps;
movaps %xmm0,0x50(%rsp)
movaps %xmm1,0x60(%rsp)
movaps %xmm2,0x70(%rsp)
movaps %xmm3,0x80(%rsp)
movaps %xmm4,0x90(%rsp)
movaps %xmm5,0xa0(%rsp)
movaps %xmm6,0xb0(%rsp)
movaps %xmm7,0xc0(%rsp)
after_movaps:
# nothing past here is relevant for your problem
A quasi-C translation of the important bits is if(!al) goto after_movaps;. Why did this change? My guess is Spectre. The mitigations for Spectre make indirect jumps really slow, so it's no longer worth doing that trick. Or not; see comments. Instead, they do a much simpler check: if there's any vector registers, then save them all. With this code, your bad value of al isn't a disaster, since it just means the vector registers will be unnecessarily copied.

Base64 Assembler Fill Array Error "Operands different sizes" Visual Studio

Im trying to make a Base64Encode in inline assembler in Visual Studio.
I got this func
char* Base64Encode(char* data, int len)
{
// Tabelle mit den Zeichen für die Codierung
const char* encodeTable = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
//
char* result;
if (len > 0) // sonst gibt es nichts zu tun ...
{
int encodedLength = ((len + 2) / 3) * 4; // effektiv die Ganzzahlfassung von ceil(length/3)*4
result = new char[encodedLength+1]; // +1 wg. Null-Terminierung
_asm
{
mov esi,data
mov edi,encodeTable
xor eax, eax
// get 3 bytes
mov ah, byte ptr[esi]
mov al, byte ptr[esi+1]
shl eax,16
mov ah, byte ptr[esi+2]
//
mov edx,eax
shl eax,6
shr edx, 26
mov bl, byte ptr[edi + edx]
mov [result],bl
//
mov edx, eax
shl eax, 6
shr edx, 26
mov bl, byte ptr[edi + edx]
mov[result+1], bl
//manipulate in edx bitset3
mov edx, eax
shl eax, 6
shr edx, 26
mov bl, byte ptr[edi + edx]
mov[result+2], bl
//manipulate in edx bitset4
mov edx, eax
shl eax, 6
shr edx, 26
mov bl, byte ptr[edi + edx]
mov[result+3], bl
}
}
else
{
result = "";
}
return result;
}
The encoding is working proper, I have in bl always the right letter, but the output is not working ( result array doesn't fill with the letters, im getting the error that the operands have different sizes, I am only allowed to make changes in the __asm function ) .
Could somebody help me how to fill the result-array with the letters I get in bl? Debugging always shows me the right letters in the bl ' s if i comment out all the result lines.
EDIT:
I get nothing in the result array when i use the byte ptr.
Any ideas?
EDIT2:
The issue in your code is a matter of indirection. You define and initialize a variable result like this in the C++ code:
char* result;
result = new char[encodedLength+1];
result is a memory location that holds a pointer to an array of characters returned by new. result is not the memory location where data will be stored, but contains a pointer to that data area. You then access it in the ASM block like this:
mov [result],bl
The compiler/assembler(MASM) warned that there was an operand mismatch when it said Operands different sizes. It knew that result was the location of a 32-bit pointer (not single characters). Since result is the address containing a pointer the code above would have moved the contents of bl to the memory location result. This had the effect of changing the pointer (returned by new) not what result was pointing at.
You need to deal with indirection here. You want to get the address that is stored in result and use that as a base for memory addressing. You can choose an available register like ECX and MOV the contents of result into it. You could do that with something like this at the top of your ASM block:
mov ecx, dword ptr [result]
This takes the 32-bit(dword) value at memory location result and stores that in ECX. Now that we have the memory location to the beginning of the character buffer we can now modify all references of result in the ASM block and change it to ECX. Examples:
mov [result],bl
would become:
mov byte ptr [ecx],bl
and
mov[result+1], bl
would become:
mov byte ptr [ecx+1], bl
The second example is called base plus displacement (or offset) addressing . That link also describes all the addressing modes on x86. If you were using 16-bit code (which you aren't) there are some extra restrictions in the register choices that can be use for base and indexing.
As well user3144770 also pointed out that you didn't null terminate your string (you only allocated space for it), so at the bottom of your ASM block you should have probably used something like:
mov byte ptr[ecx+4], 0
With the changes above your code could look something like:
char* Base64Encode(char* data, int len)
{
// Tabelle mit den Zeichen für die Codierung
const char* encodeTable = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
//
char* result;
if (len > 0) // sonst gibt es nichts zu tun ...
{
int encodedLength = ((len + 2) / 3) * 4; // effektiv die Ganzzahlfassung von ceil(length/3)*4
result = new char[encodedLength + 1]; // +1 wg. Null-Terminierung
_asm
{
mov esi, data
mov edi, encodeTable
mov ecx, dword ptr [result]
xor eax, eax
// get 3 bytes
mov ah, byte ptr[esi]
mov al, byte ptr[esi + 1]
shl eax, 16
mov ah, byte ptr[esi + 2]
//
mov edx, eax
shl eax, 6
shr edx, 26
mov bl, byte ptr[edi + edx]
mov byte ptr [ecx], bl
//
mov edx, eax
shl eax, 6
shr edx, 26
mov bl, byte ptr[edi + edx]
mov byte ptr [ecx + edx + 1], bl
//manipulate in edx bitset3
mov edx, eax
shl eax, 6
shr edx, 26
mov bl, byte ptr[edi + edx]
mov byte ptr [ecx + 2], bl
//manipulate in edx bitset4
mov edx, eax
shl eax, 6
shr edx, 26
mov bl, byte ptr[edi + edx]
mov byte ptr [ecx + 3], bl
mov byte ptr[ecx + 4], 0
}
}
else
{
result = "";
}
return result;
}
Perhaps it's enough to write byte ptr
mov bl, byte ptr[edi + edx]
mov byte ptr[result], bl
Also you don't actually do the null-termination. ( +1 wg. Null-Terminierung)
mov byte ptr[result+4], 0

sorting 32 bit numbers in 8086

i wrote this code and think for it about 5 hours;
However i stuck in sorting part:
its begin to loop and never stop !
where did i wrong ???
IT SEEMS that the array didnt get the numbers correctly.
can any one help please ?
page 110,100
Title 'decimal sorting'
Data_here segment
A DD Dup 20(?)
msg1 DB 'Enter a maximum 6 digits number : ',0DH,0AH,'$'
msg2 DB 0DH,0AH,'The sorted results are : ',0DH,0AH,'$'
Data_here Ends
Stack_here segment
DW 10 DUP(?)
Stack_here Ends
Code_here segment
Assume CS:code_here , DS:Data_here ,SS:Stack_here
Main Proc near
mov AX,Data_here
mov DS,AX
mov AX,Stack_here
mov SS,AX
Call decimal_input
call sorting
call decimal_output
mov AH,4CH
int 21H
main Endp
decimal_input proc near
PUSH A
mov AH,09H
Lea DX,msg1
int 21H
mov CH,20
next_number:
mov BH,6
mov DX,0
mov si,4
next_digit:
mov AH,07H
int 21H
CMP AL,0DH
JNE check_digit
CMP BH,6
JE next_digit
DEC CH
JZ end
mov DX,0DH
mov AH,02H
int 21H
mov DX,0AH
Int 21H
jmp next_number
check_digit:
cmp AL,30H
JB next_digit
cmp AL,39H
JA next_digit
mov AH,02H
mov DL,AL
int 21H
SUB AL,30H
SHL DX,4
ADD DL,AL
DEC SI
JZ save
DEC BH
JNZ next_digit
jmp remain
save:
mov A[DI],DX
SHL A[DI],8
add si,4
DEC BH
jmp next_digit
remain:
ADD byte ptr A[DI],DL
mov DX,0DH
mov AH,02H
int 21H
mov DX,0AH
INT 21H
INC DI
DEC CH
JNZ next_number
end:
pop A
RET
decimal_input ENDp
sorting proc near
Push A
mov SI,DI
check:
mov AX,word ptr A[SI]
mov BX,word ptr A[DI+2]
CMP AX,BX
JA change
JE extra_check
add SI,2
continue:
add DI,2
CMP DI,38
JA finish
JB check
extra_check:
mov CX,word ptr A[DI+1]
mov DX,word ptr A[DI+3]
cmp CX,DX
JNA continue
mov word ptr A[DI+1],DX
mov word ptr A[DI+3],CX
jmp continue
change:
xchg AX,BX
mov CX,word ptr A[DI+1]
mov DX,word ptr A[DI+3]
xchg CX,DX
mov word ptr A[DI],AX
mov word ptr A[DI+1],CX
mov word ptr A[DI+2],BX
mov word ptr A[DI+3],DX
jmp continue
finish:
Add sp,2
cmp Sp,40
JE ending
mov DI,SP
jmp check
ending:
POP A
RET
sorting ENDP
decimal_output proc near
PUSH A
mov AH,09H
lea DX,msg2
INT 21H
next_no:
mov DL,0
mov AL,0
mov AH,0
next:
CMP AL,0
JE next1
mov DL,byte ptr A[DI]
ADD DL,30H
mov AL,1
int 21H
next1:
inc DI
inc AH
cmp AH,8
JB next
CMP DI,80
JA THE_END
mov DX,0DH
mov AH,02H
int 21H
mov DX,0AH
int 21H
jmp next_no
THE_END:
pop A
RET
decimal_output ENDp
code_here Ends
END MAIN
I think your problem might be here
Add sp,2
cmp Sp,40
You are using sp as a loop counter, but you never initialize it. Additionaly I also don't understand why you are using sp in the first place. Of course it is theoretically possible, but then you would have to restore it before you return. When your proc returns it will screw up, because the stack pointer is corrupted. So you shouild use some other register or a memory variable (bp is not used in your code).
And I would strongly recommend to reformat your code, to make it more readable (when I looked at it I formatted it like this which makes it better readable):
sorting proc near
Push A
mov SI,DI
check:
mov AX,word ptr A[SI]
mov BX,word ptr A[DI+2]
CMP AX,BX
JA change
JE extra_check
add SI,2
continue:
add DI,2
CMP DI,38
JA finish
JB check
...

Resources