What is the most efficient way to do 128 bit shift on a modern Intel CPU (core i7, sandy bridge).
A similar code is in my most inner loop:
u128 a[N];
void xor() {
for (int i = 0; i < N; ++i) {
a[i] = a[i] ^ (a[i] >> 1) ^ (a[i] >> 2);
}
}
The data in a[N] is almost random.
Using instruction Shift Double.
So SHLD or SHRD instruction, because SSE isn't intended for this purpose.
There is a clasic method, here are you have test cases for 128 bit left shift by 16 bits under 32 and 64 bit CPU mode.
On this way you can perform unlimited size shift for up to 32/64 bits. Yoo can shift for immediate number of bits or for number in cl register. First instruction operant can also address variable in memory.
128 bit left shift by 16 bits under 32 bit x86 CPU mode:
mov eax, $04030201;
mov ebx, $08070605;
mov ecx, $0C0B0A09;
mov edx, $100F0E0D;
shld edx, ecx, 16
shld ecx, ebx, 16
shld ebx, eax, 16
shl eax, 16
And 128 bit left shift by 16 bits under 64 bit x86 CPU mode:
mov rax, $0807060504030201;
mov rdx, $100F0D0E0B0C0A09;
shld rdx, rax, 16
shl rax, 16
In this particular case you could use a combination of x86 SHR and RCR instructions:
; a0 - bits 0-31 of a[i]
; a1 - bits 32-63 of a[i]
; a2 - bits 64-95 of a[i]
; a3 - bits 96-127 of a[i]
mov eax, a0
mov ebx, a1
mov ecx, a2
mov ecx, a3
shr eax, 1
rcr ebx, 1
rcr ecx, 1
rcr edx, 1
; b0 - bits 0-31 of b[i] := a[i] >> 1
; b1 - bits 32-63 of b[i] := a[i] >> 1
; b2 - bits 64-95 of b[i] := a[i] >> 1
; b3 - bits 96-127 of b[i] := a[i] >> 1
mov b0, eax
mov b1, ebx
mov b2, ecx
mov b3, edx
shr eax, 1
rcr ebx, 1
rcr ecx, 1
rcr edx, 1
; c0 - bits 0-31 of c[i] := a[i] >> 2 = b[i] >> 1
; c1 - bits 32-63 of c[i] := a[i] >> 2 = b[i] >> 1
; c2 - bits 64-95 of c[i] := a[i] >> 2 = b[i] >> 1
; c3 - bits 96-127 of c[i] := a[i] >> 2 = b[i] >> 1
mov c0, eax
mov c1, ebx
mov c2, ecx
mov c3, edx
If your target is x86-64 this simplifies to:
; a0 - bits 0-63 of a[i]
; a1 - bits 64-127 of a[i]
mov rax, a0
mov rbx, a1
shr rax, 1
rcr rbx, 1
; b0 - bits 0-63 of b[i] := a[i] >> 1
; b1 - bits 64-127 of b[i] := a[i] >> 1
mov b0, rax
mov b1, rbx
shr rax, 1
rcr rbx, 1
; c0 - bits 0-63 of c[i] := a[i] >> 2 = b[i] >> 1
; c1 - bits 64-127 of c[i] := a[i] >> 2 = b[i] >> 1
mov c0, rax
mov c1, rbx
Update: corrected typos in 64-bit version
Related
Following is the code I wrote to find LCM of two numbers in EMU8086. When I ran it, I am getting value 0 in the Ans variable.
.MODEL SMALL
.DATA
Num1 DW 250
Num2 DW 100
Ans DW ?
.CODE
MOV AX,#DATA
MOV DS, AX
MOV AX, Num1
MOV BX, Num2
MOV DX, 0000h
NEXT: PUSH AX
PUSH DX
DIV BX
CMP DX, 0000h
JZ LAST
POP DX
POP AX
ADD AX, Num1
JNC NEXT
INC DX
JMP NEXT
LAST: POP Ans+2
POP Ans
MOV AH, 4Ch
INT 21h
END
LCM(a, b) = a * b / GCD(a, b)
Due to this equation, you can find GCD using Euclid's algorithm and then calculate LCM. Assuming numbers a and b are in al and dl, this code calculate LCM.
; Save multiplication value
MOV AL, DL ; This 2 lines is AL * DH
MUL DH
PUSH AX ; Store result in stack
FINDBCD: ; General idea is : LCM(a, b) = a*b/BCD(a,b)
; We calculate BCD with euclidean algorithm
CMP DH, DL
JNS CALCULATE ; Jump if DL < DH else swap them
MOV CL, DL ; This 3 line swap DL and DH
MOV DL, DH
MOV DH, CL
CALCULATE:
MOV AL, DH ; Move greater number in AL
XOR AH, AH ; Zero AX 8 second bits
DIV DL ; This is AL/DL
CMP AH, 0 ; If remainder is zero so we found BCD
JE AFTERFINDBCD
SUB DH, DL ; Else substract smaller number from greater number
JMP FINDBCD ; Do this until find BCD
AFTERFINDBCD:
CMP DH, DL
JNS FINDLCM ; Jump if DL < DH
MOV CL, DL ; This 3 line swap DL and DH
MOV DL, DH
MOV DH, CL
FINDLCM:
POP AX ; Retreive multiplication result
DIV DL ; This is AX/DL
AND AX, 0000000011111111B ; Ignore remainder
The aim of the program is to take in 3 arguments and an additional argument from a user and find the sum of it.
I have it working so that it finds the sum of the 3 arguments and prints out the sum.
How do I print a string with no arguments and how do I take input from the user and print that input?
includelib legacy_stdio_definitions.lib
extrn printf:near, scanf:near
.data ; Data section
istr db 'Please enter an integer: '
stri byte '%lld', 0AH, 00
ostr byte 'The sum of proc. and user inputs (%lld, %lld, %lld, %lld): %lld', 0AH, 00
.code ; Code section
public use_scanf ; int use_scanf(long long a, long long b, long long c)
use_scanf: ; {
mov rax, 0 ; sum = 0;
add rax, rcx ; sum += a;
add rax, rdx ; sum += b;
add rax, r8 ; sum += c;
; printf('Please enter an integer');
; scanf();
; printf('%lld', &inp_int);
mov r9, 3 ; temp inp_int
add rax, r9 ; sum += inp_int;
push rbx ; save reg rbx to make space
mov rbx, rax ; save sum in rbx
push rax ; sum.push
push rax ; sum.push
push r9 ; usrinput.push
sub rsp, 32 ; allocate shadow space
mov r9, r8 ; arg3 = c;
mov r8, rdx ; arg2 = b;
mov rdx, rcx ; arg1 = a
lea rcx, ostr ; arg0 = ostr;
call printf ; printf(ostr);
add rsp, 56 ; clear shadow space and vars from stack
mov rax, rbx ; retVal = sum
pop rbx ; return rbx to previous value
ret ; return retVal}
edit: output from console
Please enter an integer: 2
use_scanf_spill4 equ 12 * 8 ; The slots above return address that are
The sum of proc. and user inputs (1, 2, 3, 1002523588520): 1002523588526
use_scanf(1,2,3) = 73 ERROR: should be 7018134685179969542
Please enter an integer: 2
The sum of proc. and user inputs (-3, 2, -2, 1002523588520): 1002523588517
use_scanf(-3,2,-2) = 75 ERROR: should be 7018134685179969533
Please enter an integer: 3
The sum of proc. and user inputs (4, 3, -4, 1002523588520): 1002523588523
use_scanf(4,3,-4) = 74 ERROR: should be 7018134685179969539
Please enter an integer: -3
The sum of proc. and user inputs (-3, -3, -4, 1002523588520): 1002523588510
use_scanf(-3,-3,-4) = 76 ERROR: should be 7018134685179969526
The following is the cpp file to test it.
void check(const char *s, _int64 v, _int64 expected) {
std::cout << s << " = " << v;
if (v == expected) {
std::cout << " OK";
}
else {
std::cout << " ERROR: should be " << expected;
}
std::cout << "\n";
}
_int64 sum_scanf;
_int64 sum_check;
sum_scanf = use_scanf(1, 2, 3);
sum_check = 1 + 2 + 3 + inp_int;
check("use_scanf(1,2,3)", sum_scanf, sum_check);
std::cout << "\n";
sum_scanf = use_scanf(-3, 2, -2);
sum_check = -3 + 2 - 2 + inp_int;
check("use_scanf(-3,2,-2)", sum_scanf, sum_check);
std::cout << "\n";
sum_scanf = use_scanf(4, 3, -4);
sum_check = 4 + 3 - 4 + inp_int;
check("use_scanf(4,3,-4)", sum_scanf, sum_check);
std::cout << "\n";
sum_scanf = use_scanf(-3, -3, -4);
sum_check = -3 - 3 - 4 + inp_int;
check("use_scanf(-3,-3,-4)", sum_scanf, sum_check);
You have calling convention problems. First off, the stack depth divided by 8 must be even. Second off, you do not use push and pop except in the prologue and epilogue.
I'm guessing you started with a basic C implementation, disassembled that, and started editing. The compiler is pulling some advanced tricks here that you're better off avoiding at this early stage. So off to rewriting we go.
I didn't try to optimize anything.
;Take your globals from above -- you will need
use_scanf_spill4 equ 12 * 8 ; The slots above return address that are
use_scanf_spill3 equ 11 * 8 ; for spilling arguments. In theory you can
use_scanf_spill2 equ 10 * 8 ; use them for something else but I didn't.
use_scanf_spill1 equ 9 * 8
use_scanf_space equ 7 * 8 ; it holds return addess -- don't clobber
use_scanf_sum equ 6 * 8
use_scanf_input equ 5 * 8
use_scanf_arg6 equ 5 * 8 ; same as use_scanf_input, but only 1 is used at a time
use_scanf_arg5 equ1 4 * 8
; 0, 1, 2, 3 are argument zone
use_scanf:
; Allocate stack space
; this function doesn't use any non-call-clobbered registers
sub rsp, use_scanf_space
; spill input since we need it later
mov [rsp + use_scanf_spill1], rcx
mov [rsp + use_scanf_spill2], rdx
mov [rsp + use_scanf_spill3], r8
mov [rsp + use_scanf_spill4], r9
; Add arguments together and stash
add rcx, rdx
add rcx, r8
mov [rsp + use_scanf_sum], rcx
; print the promot
lea rcx, istr
call printf
; call scanf
lea rcx, [stri]
; Can't take the address of a register so we give it a memory slot
lea rdx, [rsp + use_scanf_input]
call scanf
; sum the values
mov rdx, [rsp + use_scanf_sum]
mov rax, [rsp + use_scanf_input]
add rax, rdx
; print out the arguments and total
; using rcx as scratch -- will clobber later
mov rdx, [rsp + use_scanf_spill1]
mov r8, [rsp + use_scanf_spill2]
mov r9, [rsp + use_scanf_spill3]
mov rcx, [rsp + use_scanf_input]
mov [rsp + use_scanf_arg5], rcx
mov [rsp + use_scanf_arg6], rax
lea rcx, [ostr]
call printf
; now leave
add rsp, use_scanf_space
ret
So we have like this safes challenge in assembly, you need to create safes and keys that will break them and end the infinite loop.
Here's an example for a safe:
loopy:
mov ax, [1900]
cmp ax,1234
jne loopy
and a key:
loopy2:
mov ax, 1234
mov [1900],ax
jmp loopy2
So I have a safe and a key, and I don't understand why it doesn't work:
here's my safe:
org 100h
mySafe:
mov dx,5
mov ax, [5768h]
mov bx,7
mov word [180h],2
mul word [180h]
mov [180h],bx
push ax
dec bx
mov cx,dx
mov ax,dx
loopy1:
add bx,ax
loop loopy1
dec bx
pop ax
add ax,bx
mul word [180h]
cmp ax,350
jne mySafe
And here's my key:
org 100h
loopy:
mov word [5768h],10
jmp loopy
ret
The right answer to break the loop should be 10 and it works when I put in on the safe, somehow with the key it doesn't work and I can't figure out why..
(the "word" is needed for nasm)
The value in dx used as the counter for the loop instruction comes from the first mul instruction.
This multiplication is just doubling the key, so dx is either 0 or 1 (an easy way to see this is to think of the multiplication as a left shift by one or by remembering that the sum of two n-bit numbers has at most n+1 bits)
If dx is zero, the whole loopy1 block does nothing (as dx also sets ax) and the value in ax at the end of the safe is 7*(5 +2k) where k is the key (see the commented code below).
It is then easy to see that 350 = 7*(5+2k) => 2k = 45 has no solution. Therefore no key for which dx is zero can unlock the safe.
A key has dx 0 iif its value is less than 32768 (again, this is easy to see when thinking of the multiplication as a left shift by one).
Corollary: 10 cannot be a solution.
safe:
mov dx,5
mov ax, [k] ;ax = k (key)
mov bx,7
mov word [aux],2
mul word [aux] ;dx = 0 ax = 2k
mov [aux],bx ;aux = 7
push ax ;ax = 2k
dec bx ;bx = 6
dec bx ;bx = 5
pop ax ;ax = 2k
add ax,bx ;ax = 5 + 2k
mul word [aux] ;ax = 7*(5 +2k)
cmp ax,350
ret
If there is a key that unlocks the safe then it must be greater or equal to 32768 so that dx is 1 after the first multiplication.
With this condition, the value in ax at the end of the safe can be written as 7*(6 + (2k & 0xffff)) => k & 0x7fff = 22.
Adding the condition stated at the very beginning of this section, the final value for k is 32768 + 22 = 32790 or 0x8016 in hex.
I've leaped quite a few logical steps in manipulating the equation and forming the result but, again, thinking of 2k as a shift may help visualize them.
Corollary: Due to the algebraic structure involved, this is the only solution.
safe:
mov dx,5
mov ax, [k] ;ax = k
mov bx,7
mov word [aux],2
mul word [aux] ;dx:ax = 2k
mov [aux],bx ;[aux] = 7
push ax ;dx = 1 ax = 2k & 0xffff
dec bx ;bx = 6
mov cx,dx ;cx = 1
mov ax,dx ;ax = 1
loopy1:
add bx,ax ;bx = 6 + 1
dec cx
jnz loopy1
dec bx ;bx = 6
pop ax ;ax = 2k & 0xffff
add ax,bx ;ax = 6 + (2k & 0xffff)
mul word [aux] ;ax = 7*(6 + (2k & 0xffff))
cmp ax,350
ret
Considering that you have a mov dx, 5 before the first multiplication, did you (or the author of the safe) forget that mul affects dx?
If you wrap the first mul in push dx / pop dx (or just move mov dx, 5 after it), you would get, at the end of the safe, a value in ax equals to 7*(30 +2k) which implies k = 10 indeed.
I'm writing an assembly project to perform Tiny Encryption algorithm. I followed the C-Code on Wikipedia and transformed it into x86 assembly.
However when I run the algorithm the inputs v0 and v1, they get transformed into gibberish as they should, however, when I try to decrypt the ciphertext, I don't get the original back.
Here is the encryption procedure
encrypt PROC
xor eax, eax
xor ebx, ebx
xor ecx, ecx
xor edx, edx
xor esi, esi
xor edi, edi
mov ecx, 32
LOOP1:
;sum+= delta
mov eax, sum
ADD eax, delta
mov sum, eax
; (v1 << 4) + k0
mov eax, v1
SAL eax, 4
ADD eax, key0
mov v1l4, eax
; (v1 + sum)
mov eax, v1
ADD eax, sum
mov v1ps, eax
; (v1 >> 5) + k1
mov eax, v1
SHR eax, 5
add eax, key1
mov v1r5, eax
;((v1 << 4) + k0) ^ (v1 + sum) ^ ((v1 >> 5) + k1)
mov eax, v1l4
xor eax, v1ps
xor eax, v1r5
; v0 += ((v1 << 4) + k0) ^ (v1 + sum) ^ ((v1 >> 5) + k1)
ADD v0, eax
; (v0 << 4) + k2
mov eax, v0
SAL eax, 4
ADD eax, key2
mov v0l4, eax
; (v0 + sum)
mov eax, v0
ADD eax, sum
mov v0ps, eax
; (v0 >> 5) + k3
mov eax, v0
SHR eax, 5
add eax, key3
mov v0r5, eax
; ((v0 << 4) + k2) ^ (v0 + sum) ^ ((v0 >> 5) + k3)
mov eax, v0l4
xor eax, v0ps
xor eax, v1r5
; v1 += ((v0 << 4) + k2) ^ (v0 + sum) ^ ((v0 >> 5) + k3)
ADD v1, eax
dec ecx
cmp ecx, 0
JG LOOP1
ret
encrypt ENDP
Here is the decryption procedure
decrypt PROC
xor eax, eax
xor ebx, ebx
xor ecx, ecx
xor edx, edx
xor esi, esi
xor edi, edi
mov ecx, 32
LOOP2:
; (v0 << 4) + k2
mov eax, v0
SAL eax, 4
ADD eax, key2
mov v0l4, eax
; (v0 + sum)
mov eax, v0
ADD eax, sum
mov v0ps, eax
; (v0 >> 5) + k3
mov eax, v0
SHR eax, 5
ADD eax, key3
mov v0r5, eax
; ((v0 << 4) + k2) ^ (v0 + sum) ^ ((v0 >> 5) + k3)
mov eax, v0l4
xor eax, v0ps
xor eax, v1r5
; v1 -= ((v0 << 4) + k2) ^ (v0 + sum) ^ ((v0 >> 5) + k3)
SUB v1, eax
; (v1 << 4) + k0
mov eax, v1
SAL eax, 4
ADD eax, key0
mov v1l4, eax
; (v1 + sum)
mov eax, v1
ADD eax, sum
mov v1ps, eax
; (v1 >> 5) + k1
mov eax, v1
SHR eax, 5
ADD eax, key1
mov v1r5, eax
; ((v1 << 4) + k0) ^ (v1 + sum) ^ ((v1 >> 5) + k1)
mov eax, v1l4
xor eax, v1ps
xor eax, v1r5
; v0 -= ((v1 << 4) + k0) ^ (v1 + sum) ^ ((v1 >> 5) + k1)
SUB v0, eax
; sum -= delta
mov eax, sum
SUB eax, delta
mov sum, eax
dec ecx
cmp ecx, 0
JG LOOP2
ret
decrypt ENDP
Many thanks!
I was asked to create a bubble sort program in NASM Ubuntu. Here's the code:
section .data
i db 0 ; Value to be incremented
question db 'Enter a number: ' ; Prompt
questionLen equ $-question
newLine db 10, 10, 0 ; New blank line
newLineLen equ $-newLine
section .bss
num resb 5 ; Array of size 5
counter resb 1 ; Value to be incremented
counter2 resb 1 ; Value to be incremented
temp resb 1
temp2 resb 1
section .text
global _start
_start:
mov esi, 0
getInput:
mov eax, 4
mov ebx, 1
mov ecx, question ; Prints the question
mov edx, questionLen
int 80h
add byte[i], 30h ; I'll retain this expression, since the program experienced an error
; when this expression is deleted
sub byte[i], 30h ; Converts the increment value to integer
mov eax, 3
mov ebx, 0
lea ecx, [num + esi] ; Element of the array
mov edx, 2
int 80h
inc esi
inc byte[i]
cmp byte[i], 5 ; As long as the array hasn't reached the size of 5,
jl getInput ; the program continues to ask input from the user
mov esi, 0
mov byte[i], 0
mov edi, 0 ; Index of the array
bubble_sort:
mov byte[counter], 0
mov byte[counter2], 0
begin_for_1:
mov al, 0
mov al, [counter] ; Acts as the outer for loop
cmp al, 5
jg printArray ; Prints the sorted list when the array size has reached 5
begin_for_2:
mov edi, [counter2] ; Acts as the inner for loop
cmp edi, 4
jg end_for_2
mov bl, 0 ; Acts as the if statement
mov cl, 0
mov bl, [num + edi]
mov cl, [num + edi + 1]
mov byte[temp], cl ; This is the same as if(a[j] > a[j + 1]){...}
cmp bl, [temp]
jg bubbleSortSwap
return:
inc edi ; Same as j++
jmp begin_for_2 ; Goes out of the inner for loop
end_for_2:
inc byte[counter] ; Same as i++
jmp begin_for_1 ; Goes out of the outer for loop
bubbleSortSwap:
mov [num + edi + 1], bl
mov [num + edi], cl ; The set of statements is the same as swap(&a[j], &a[j + 1]);
jmp return
printArray:
mov eax, 4
mov ebx, 1
mov ecx, [num + esi] ; Prints one element at a time
mov edx, 1
int 80h
inc esi
inc byte[i]
cmp byte[i], 5
jl printArray ; As long as the array size hasn't reached 5, printing continues
mov eax, 4
mov ebx, 1
mov ecx, newLine ; Displays a new blank line after the array
mov edx, newLineLen
int 80h
mov eax, 1 ; Exits the program
mov ebx, 0
int 80h
But the only problem is, it cannot print the rest of the iterations, because it only prints the 1st iteration like this:
Enter a number: 7
Enter a number: 1
Enter a number: 4
Enter a number: 3
Enter a number: 5
17435
What I want to output is the array input and the final output, from the 1st iteration up to the last.
Naw... he just needs some stuff sorted! :)
Doesn't print any output at all for me, as posted. Problem is you're putting "[contents]" in ecx - you want address - you do it right in the input routine.
You can get by with fewer variables - use esi and/or edi as both the "count" and the "index". If you use variables, make sure the size of the variable matches the size of the register you're moving it in/out of! ("mov edi, [counter2]" isn't doing what you want) Courage! If it wuz easy, everybody'd be doing it.
Best,
Frank