Most performant way to subtract one array from another - performance

I have the following code, which is the bottleneck in one part of my application. All I do is subtract one array from another. Both of these arrays have around 100000 elements. I'm trying to find a way to make this more performant.
var
Array1, Array2 : array of integer;
.....
// Code that fills the arrays
.....
for ix := 0 to Length(Array1) - 1 do
begin
Array1[ix] := Array1[ix] - Array2[ix];
end;
Does anybody have a suggestion?

Running this on multiple threads, with an array that big, will net a near-linear speed-up. It's embarrassingly parallel, as they say.
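As a rough illustration (not the asker's code), a minimal sketch of splitting the work across two threads, assuming a newer Delphi version that has TThread.CreateAnonymousThread (Classes unit) and the question's global Array1/Array2:

procedure SubSlice(Lo, Hi: Integer);
var
  ix: Integer;
begin
  for ix := Lo to Hi do
    Array1[ix] := Array1[ix] - Array2[ix];
end;

procedure SubArraysInTwoThreads;
var
  Mid: Integer;
  Worker: TThread;
begin
  Mid := Length(Array1) div 2;
  Worker := TThread.CreateAnonymousThread(
    procedure
    begin
      SubSlice(0, Mid - 1);            // first half on the worker thread
    end);
  Worker.FreeOnTerminate := False;     // we want to WaitFor and Free it ourselves
  Worker.Start;
  SubSlice(Mid, Length(Array1) - 1);   // second half on the calling thread
  Worker.WaitFor;
  Worker.Free;
end;

For only 100K integers the thread start-up cost can easily eat the gain, which is what the next comment is getting at.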

Running the subtraction on more threads sounds good, but 100K integer subtractions don't take a lot of CPU time, so maybe use a thread pool... However, setting up threads also has a lot of overhead, so short arrays will be processed more slowly by parallel threads than by a single (main) thread!
Did you switch off overflow and range checking in the compiler settings?
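For instance, the checks can be disabled locally with compiler directives around just this routine (a sketch; SubArraysUnchecked is only an illustrative name):

{$R-} // range checking off
{$Q-} // overflow checking off
procedure SubArraysUnchecked(var A, B: array of Integer);
var
  ix: Integer;
begin
  for ix := 0 to High(A) do
    A[ix] := A[ix] - B[ix];
end;
{$R+} // re-enable range checking
{$Q+} // re-enable overflow checking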
You can also try an asm routine; it is very simple...
Something like:
procedure SubArray(var ar1, ar2; length: integer);
asm
//length must be > 0!
push ebx
lea ar1, ar1 - 4
lea ar2, ar2 - 4
@Loop:
mov ebx, [ar2 + length * 4]
sub [ar1 + length * 4], ebx
dec length
//Here you can add more copies of the loop body to unroll it further and speed it up.
jz @exit
mov ebx, [ar2 + length * 4]
sub [ar1 + length * 4], ebx
dec length
//
jnz @Loop
@exit:
pop ebx
end;
begin
SubArray(Array1[0], Array2[0], length(Array1));
It can be much faster...
EDIT: Added procedure with SIMD instructions.
This procedure requires SSE2 CPU support. It loads 4 integers into an XMM register and subtracts them at once. It is also possible to use movdqa instead of movdqu, which is faster, but you must first ensure 16-byte alignment (a small check is sketched after the code below). You can also unroll the XMM part as in my first asm example. (I'm curious about the speed measurements. :) )
var
array1, array2: array of integer;
procedure SubQIntArray(var ar1, ar2; length: integer);
asm
//prepare length if not rounded to 4
push ecx
shr length, 2
jz @LengthTooSmall
@Loop:
movdqu xmm1, [ar1] //or movdqa, but ensure 16-byte alignment first
movdqu xmm2, [ar2] //or movdqa, but ensure 16-byte alignment first
psubd xmm1, xmm2
movdqu [ar1], xmm1 //or movdqa, but ensure 16-byte alignment first
add ar1, 16
add ar2, 16
dec length
jnz @Loop
@LengthTooSmall:
pop ecx
push ebx
and ecx, 3
jz @Exit
mov ebx, [ar2]
sub [ar1], ebx
dec ecx
jz @Exit
mov ebx, [ar2 + 4]
sub [ar1 + 4], ebx
dec ecx
jz @Exit
mov ebx, [ar2 + 8]
sub [ar1 + 8], ebx
@Exit:
pop ebx
end;
begin
//Fill arrays first!
SubQIntArray(Array1[0], Array2[0], length(Array1));
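To decide between a movdqa and a movdqu variant at run time, here is a minimal sketch of an alignment check (assuming a Delphi version that defines NativeUInt, and a hypothetical SubQIntArrayAligned as the movdqa variant):

function Is16ByteAligned(P: Pointer): Boolean;
begin
  // true when the address is a multiple of 16
  Result := (NativeUInt(P) and $F) = 0;
end;

// pick the routine accordingly
if Is16ByteAligned(@Array1[0]) and Is16ByteAligned(@Array2[0]) then
  SubQIntArrayAligned(Array1[0], Array2[0], Length(Array1))
else
  SubQIntArray(Array1[0], Array2[0], Length(Array1));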

I was very curious about speed optimisation in this simple case.
So I made 6 simple procedures and measured CPU ticks and time at an array size of 100000:
Pascal procedure with compiler option Range and Overflow Checking On
Pascal procedure with compiler option Range and Overflow Checking off
Classic x86 assembler procedure.
Assembler procedure with SSE instructions and unaligned 16 byte move.
Assembler procedure with SSE instructions and aligned 16 byte move.
Assembler 8 times unrolled loop with SSE instructions and aligned 16 byte move.
Check the results and the code below for more information.
To get 16-byte memory alignment, first delete the dot in the directive {$.define Align16Bytes} in the file 'FastMM4Options.inc'.
program SubTest;
{$APPTYPE CONSOLE}
uses
//In the file 'FastMM4Options.inc' delete the dot in the directive {$.define Align16Bytes}
//to get 16-byte memory alignment!
FastMM4,
windows,
SysUtils;
var
Ar1 :array of integer;
Ar2 :array of integer;
ArLength :integer;
StartTicks :int64;
EndTicks :int64;
TicksPerMicroSecond :int64;
function GetCpuTicks: int64;
asm
rdtsc
end;
{$R+}
{$Q+}
procedure SubArPasRangeOvfChkOn(length: integer);
var
n: integer;
begin
for n := 0 to length -1 do
Ar1[n] := Ar1[n] - Ar2[n];
end;
{$R-}
{$Q-}
procedure SubArPas(length: integer);
var
n: integer;
begin
for n := 0 to length -1 do
Ar1[n] := Ar1[n] - Ar2[n];
end;
procedure SubArAsm(var ar1, ar2; length: integer);
asm
//Length must be > 0!
push ebx
lea ar1, ar1 - 4
lea ar2, ar2 - 4
@Loop:
mov ebx, [ar2 + length * 4]
sub [ar1 + length * 4], ebx
dec length
jnz @Loop
@exit:
pop ebx
end;
procedure SubArAsmSimdU(var ar1, ar2; length: integer);
asm
//Prepare length
push length
shr length, 2
jz @Finish
@Loop:
movdqu xmm1, [ar1]
movdqu xmm2, [ar2]
psubd xmm1, xmm2
movdqu [ar1], xmm1
add ar1, 16
add ar2, 16
dec length
jnz @Loop
@Finish:
pop length
push ebx
and length, 3
jz @Exit
//Do rest, up to 3 subtractions...
mov ebx, [ar2]
sub [ar1], ebx
dec length
jz @Exit
mov ebx, [ar2 + 4]
sub [ar1 + 4], ebx
dec length
jz @Exit
mov ebx, [ar2 + 8]
sub [ar1 + 8], ebx
@Exit:
pop ebx
end;
procedure SubArAsmSimdA(var ar1, ar2; length: integer);
asm
push ebx
//Unfortunately Delphi uses the first 8 bytes for the dynamic array length and reference
//count; for that reason the dynamic array data address starts at $xxxxxxx8
//instead of $xxxxxxx0. So we must first align the ar1, ar2 pointers!
mov ebx, [ar2]
sub [ar1], ebx
dec length
jz @exit
mov ebx, [ar2 + 4]
sub [ar1 + 4], ebx
dec length
jz @exit
add ar1, 8
add ar2, 8
//Prepare length for 16 byte data transfer
push length
shr length, 2
jz @Finish
@Loop:
movdqa xmm1, [ar1]
movdqa xmm2, [ar2]
psubd xmm1, xmm2
movdqa [ar1], xmm1
add ar1, 16
add ar2, 16
dec length
jnz @Loop
@Finish:
pop length
and length, 3
jz @Exit
//Do rest, up to 3 subtractions...
mov ebx, [ar2]
sub [ar1], ebx
dec length
jz @Exit
mov ebx, [ar2 + 4]
sub [ar1 + 4], ebx
dec length
jz @Exit
mov ebx, [ar2 + 8]
sub [ar1 + 8], ebx
@Exit:
pop ebx
end;
procedure SubArAsmSimdAUnrolled8(var ar1, ar2; length: integer);
asm
push ebx
//Unfortunately Delphi uses the first 8 bytes for the dynamic array length and reference
//count; for that reason the dynamic array data address starts at $xxxxxxx8
//instead of $xxxxxxx0. So we must first align the ar1, ar2 pointers!
mov ebx, [ar2]
sub [ar1], ebx
dec length
jz @exit
mov ebx, [ar2 + 4]
sub [ar1 + 4], ebx
dec length
jz @exit
add ar1, 8 //Align pointer to 16 bytes
add ar2, 8 //Align pointer to 16 bytes
//Prepare length for 16 byte data transfer
push length
shr length, 5 //8 * 4 = 32 subtractions per loop
jz @FinishUnrolled //Too small for the unrolled loop
@LoopUnrolled:
//Unroll 1, 2, 3, 4
movdqa xmm4, [ar2]
movdqa xmm5, [16 + ar2]
movdqa xmm6, [32 + ar2]
movdqa xmm7, [48 + ar2]
//
movdqa xmm0, [ar1]
movdqa xmm1, [16 + ar1]
movdqa xmm2, [32 + ar1]
movdqa xmm3, [48 + ar1]
//
psubd xmm0, xmm4
psubd xmm1, xmm5
psubd xmm2, xmm6
psubd xmm3, xmm7
//
movdqa [48 + ar1], xmm3
movdqa [32 + ar1], xmm2
movdqa [16 + ar1], xmm1
movdqa [ar1], xmm0
//Unroll 5, 6, 7, 8
movdqa xmm4, [64 + ar2]
movdqa xmm5, [80 + ar2]
movdqa xmm6, [96 + ar2]
movdqa xmm7, [112 + ar2]
//
movdqa xmm0, [64 + ar1]
movdqa xmm1, [80 + ar1]
movdqa xmm2, [96 + ar1]
movdqa xmm3, [112 + ar1]
//
psubd xmm0, xmm4
psubd xmm1, xmm5
psubd xmm2, xmm6
psubd xmm3, xmm7
//
movdqa [112 + ar1], xmm3
movdqa [96 + ar1], xmm2
movdqa [80 + ar1], xmm1
movdqa [64 + ar1], xmm0
//
add ar1, 128
add ar2, 128
dec length
jnz @LoopUnrolled
@FinishUnrolled:
pop length
and length, $1F
jz @Exit //Nothing left over
//Do rest, up to 31 subtractions...
@Finish:
mov ebx, [ar2]
sub [ar1], ebx
add ar1, 4
add ar2, 4
dec length
jnz @Finish
@Exit:
pop ebx
end;
procedure WriteOut(EndTicks: Int64; Str: string);
begin
WriteLn(Str + IntToStr(EndTicks - StartTicks)
+ ' Time: ' + IntToStr((EndTicks - StartTicks) div TicksPerMicroSecond) + 'us');
Sleep(5);
SwitchToThread;
StartTicks := GetCpuTicks;
end;
begin
ArLength := 100000;
//Set TicksPerMicroSecond
QueryPerformanceFrequency(TicksPerMicroSecond);
TicksPerMicroSecond := TicksPerMicroSecond div 1000000;
//
SetLength(Ar1, ArLength);
SetLength(Ar2, ArLength);
//Fill arrays
//...
//Tick time info
WriteLn('CPU ticks per microsecond: ' + IntToStr(TicksPerMicroSecond));
Sleep(5);
SwitchToThread;
StartTicks := GetCpuTicks;
//Test 1
SubArPasRangeOvfChkOn(ArLength);
WriteOut(GetCpuTicks, 'SubAr Pas Range and Overflow Checking On, Ticks: ');
//Test 2
SubArPas(ArLength);
WriteOut(GetCpuTicks, 'SubAr Pas, Ticks: ');
//Test 3
SubArAsm(Ar1[0], Ar2[0], ArLength);
WriteOut(GetCpuTicks, 'SubAr Asm, Ticks: ');
//Test 4
SubArAsmSimdU(Ar1[0], Ar2[0], ArLength);
WriteOut(GetCpuTicks, 'SubAr Asm SIMD mem unaligned, Ticks: ');
//Test 5
SubArAsmSimdA(Ar1[0], Ar2[0], ArLength);
WriteOut(GetCpuTicks, 'SubAr Asm with SIMD mem aligned, Ticks: ');
//Test 6
SubArAsmSimdAUnrolled8(Ar1[0], Ar2[0], ArLength);
WriteOut(GetCpuTicks, 'SubAr Asm with SIMD mem aligned 8*unrolled, Ticks: ');
//
ReadLn;
Ar1 := nil;
Ar2 := nil;
end.
...
The fastest asm procedure, the one with the 8-times-unrolled SIMD loop, takes only 68us and is about 4 times faster than the Pascal procedure.
As we can see, the Pascal loop procedure probably isn't critical: it takes only about 277us (overflow and range checking off) on a 2.4GHz CPU for 100000 subtractions.
So can this code really be the bottleneck?

I'm not an assembly expert, but I think the following are near optimal if you don't take SIMD instructions or parallel processing into account; the latter can easily be accomplished by passing portions of the array to the function, like:
Thread1: SubArray(ar1[0], ar2[0], 50);
Thread2: SubArray(ar1[50], ar2[50], 50);
procedure SubArray(var Array1, Array2; const Length: Integer);
var
ap1, ap2 : PInteger;
i : Integer;
begin
ap1 := @Array1;
ap2 := @Array2;
i := Length;
while i > 0 do
begin
ap1^ := ap1^ - ap2^;
Inc(ap1);
Inc(ap2);
Dec(i);
end;
end;
// similar assembly version
procedure SubArrayEx(var Array1, Array2; const Length: Integer);
asm
// eax = @Array1
// edx = @Array2
// ecx = Length
// esi = temp register for array2^
push esi
cmp ecx, 0
jle @Exit
@Loop:
mov esi, [edx]
sub [eax], esi
add eax, 4
add edx, 4
dec ecx
jnz @Loop
@Exit:
pop esi
end;
procedure Test();
var
a1, a2 : array of Integer;
i : Integer;
begin
SetLength(a1, 3);
a1[0] := 3;
a1[1] := 1;
a1[2] := 2;
SetLength(a2, 3);
a2[0] := 4;
a2[1] := 21;
a2[2] := 2;
SubArray(a1[0], a2[0], Length(a1));
for i := 0 to Length(a1) - 1 do
Writeln(a1[i]);
Readln;
end;

It's not a real answer to your question, but I would investigate whether I could do the subtraction while filling the arrays with values in the first place. I would even consider a third array in memory to store the result of the subtraction. In modern computing, the 'cost' of memory is considerably lower than the 'cost' of the time it takes to perform an extra operation on that memory.
In theory you'll gain at least a little performance when the subtraction can be done while the values are still in registers or processor cache, but in practice you just might stumble upon a few tricks that could enhance performance of the entire algorithm.
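For illustration, a minimal sketch of that idea. ReadValueA and ReadValueB are hypothetical stand-ins for whatever currently produces the values, and Diff is the optional third array:

procedure FillAndSubtract(var Array1, Array2, Diff: array of Integer);
var
  ix, A, B: Integer;
begin
  for ix := 0 to High(Array1) do
  begin
    A := ReadValueA(ix);  // hypothetical producer of Array1's values
    B := ReadValueB(ix);  // hypothetical producer of Array2's values
    Array1[ix] := A;
    Array2[ix] := B;
    Diff[ix] := A - B;    // subtract while A and B are still in registers
  end;
end;

Whether this helps depends on both values really being available in the same pass; if the arrays are filled at different times, a separate subtraction loop is hard to avoid.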

Related

Vmovntpd instruction on Intel Xeon Platinum 8168 CPU

I have a simple vector-vector addition algorithm implemented in assembly. It uses AVX to read 4 doubles from the A vector and 4 doubles from the B vector. The algorithm adds these numbers and writes the result back to the C vector. If I use vmovntpd to write back the result, the performance becomes extremely random. I made this test on an Azure server with an Intel Xeon Platinum 8168 CPU. If I run this test on my laptop (Intel Core i7-2640M CPU), this random effect disappears. What is the problem on the server? One more piece of information: the server has 44 CPUs.
[Edit]
Here is my code:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Dense to dense
;; Without cache (for storing the result)
;; AVX-512
;; Without tolerances
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
global _denseToDenseAddAVX512_nocache_64_linux
_denseToDenseAddAVX512_nocache_64_linux:
push rbp
mov rbp, rsp
; c = a + lambda * b
; rdi: address1
; rsi: address2
; rdx: address3
; rcx: count
; xmm0: lambda
mov rax, rcx
shr rcx, 4
and rax, 0x0F
vzeroupper
vmovupd zmm5, [abs_mask]
sub rsp, 8
movlpd [rbp - 8], xmm0
vbroadcastsd zmm7, [rbp - 8]
vmovapd zmm6, zmm7
cmp rcx, 0
je after_loop_denseToDenseAddAVX512_nocache_64_linux
start_denseToDenseAddAVX512_nocache_64_linux:
vmovapd zmm0, [rdi] ; a
vmovapd zmm1, zmm7
vmulpd zmm1, zmm1, [rsi] ; b
vaddpd zmm0, zmm0, zmm1 ; zmm0 = c = a + b
vmovntpd [rdx], zmm0
vmovapd zmm2, [rdi + 64] ; a
vmovapd zmm3, zmm6
vmulpd zmm3, zmm3, [rsi + 64] ; b
vaddpd zmm2, zmm2, zmm3 ; zmm2 = c = a + b
vmovntpd [rdx + 64], zmm2
add rdi, 128
add rsi, 128
add rdx, 128
loop start_denseToDenseAddAVX512_nocache_64_linux
after_loop_denseToDenseAddAVX512_nocache_64_linux:
cmp rax, 0
je end_denseToDenseAddAVX512_nocache_64_linux
mov rcx, rax
last_loop_denseToDenseAddAVX512_nocache_64_linux:
movlpd xmm0, [rdi] ; a
movapd xmm1, xmm7
mulsd xmm1, [rsi] ; b
addsd xmm0, xmm1 ; xmm0 = c = a + b
movlpd [rdx], xmm0
add rdi, 8
add rsi, 8
add rdx, 8
loop last_loop_denseToDenseAddAVX512_nocache_64_linux
end_denseToDenseAddAVX512_nocache_64_linux:
mov rsp, rbp
pop rbp
ret
Okay, I've found the solution! This is a NUMA architecture with 44 CPUs, so I disabled NUMA and limited the number of online CPUs to 1 with the following kernel parameters: numa=off maxcpus=1 nr_cpus=1.

What exactly is GCC's auto-vectorized SSE2 implementation of sum += 1..n doing?

When GCC 8.3 for x86-64 with -O3 option is fed this small C function
int sum(int n) {
int sum = 0;
for (int i = 1; i <= n; i++) {
sum += i;
}
return sum;
}
it produces the following assembly (courtesy of godbolt):
sum:
test edi, edi
jle .L8
lea eax, [rdi-1]
cmp eax, 17
jbe .L9
mov edx, edi
movdqa xmm1, XMMWORD PTR .LC0[rip]
xor eax, eax
pxor xmm0, xmm0
movdqa xmm2, XMMWORD PTR .LC1[rip]
shr edx, 2
.L4:
add eax, 1
paddd xmm0, xmm1
paddd xmm1, xmm2
cmp eax, edx
jne .L4
movdqa xmm1, xmm0
mov ecx, edi
psrldq xmm1, 8
and ecx, -4
paddd xmm0, xmm1
lea edx, [rcx+1]
movdqa xmm1, xmm0
psrldq xmm1, 4
paddd xmm0, xmm1
movd eax, xmm0
cmp edi, ecx
je .L13
.L7:
add eax, edx
add edx, 1
cmp edi, edx
jge .L7
ret
.L13:
ret
.L8:
xor eax, eax
ret
.L9:
mov edx, 1
xor eax, eax
jmp .L7
.LC0:
.long 1
.long 2
.long 3
.long 4
.LC1:
.long 4
.long 4
.long 4
.long 4
I understand that for values of n less than 19, a completely unoptimized loop (the code at .L9 and .L7) is used, but I can't make heads or tails of what is happening for larger values of n. Could someone explain it?
Clang, on the other hand, simply calculates (n-1)*(n-2)/2 + 2*n - 1, which is a slightly more roundabout way of calculating n*(n+1)/2 (perhaps to prevent some problems with signed overflow), and which seems to be a much more effective way to optimize this loop.
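For what it's worth, the two forms do agree algebraically: (n-1)*(n-2)/2 + 2*n - 1 = (n^2 - 3n + 2)/2 + (4n - 2)/2 = (n^2 + n)/2 = n*(n+1)/2.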

Why is gcc -O3 auto-vectorizing factorial? That many extra instructions looks worse

Here's a very simple factorial function.
int factorial(int num) {
if (num == 0)
return 1;
return num*factorial(num-1);
}
GCC's assembly for this function on -O2 is reasonable.
factorial(int):
mov eax, 1
test edi, edi
je .L1
.L2:
imul eax, edi
sub edi, 1
jne .L2
.L1:
ret
However, on -O3 or -Ofast, it decides to make things way more complicated (almost 100 lines!):
factorial(int):
test edi, edi
je .L28
lea edx, [rdi-1]
mov ecx, edi
cmp edx, 6
jbe .L8
mov DWORD PTR [rsp-12], edi
movd xmm5, DWORD PTR [rsp-12]
mov edx, edi
xor eax, eax
movdqa xmm0, XMMWORD PTR .LC0[rip]
movdqa xmm4, XMMWORD PTR .LC2[rip]
shr edx, 2
pshufd xmm2, xmm5, 0
paddd xmm2, XMMWORD PTR .LC1[rip]
.L5:
movdqa xmm3, xmm2
movdqa xmm1, xmm2
paddd xmm2, xmm4
add eax, 1
pmuludq xmm3, xmm0
psrlq xmm1, 32
psrlq xmm0, 32
pmuludq xmm1, xmm0
pshufd xmm0, xmm3, 8
pshufd xmm1, xmm1, 8
punpckldq xmm0, xmm1
cmp eax, edx
jne .L5
movdqa xmm2, xmm0
movdqa xmm1, xmm0
mov edx, edi
psrldq xmm2, 8
psrlq xmm0, 32
and edx, -4
pmuludq xmm1, xmm2
psrlq xmm2, 32
sub edi, edx
pmuludq xmm0, xmm2
pshufd xmm1, xmm1, 8
pshufd xmm0, xmm0, 8
punpckldq xmm1, xmm0
movdqa xmm0, xmm1
psrldq xmm1, 4
pmuludq xmm0, xmm1
movd eax, xmm0
cmp ecx, edx
je .L1
lea edx, [rdi-1]
.L3:
imul eax, edi
test edx, edx
je .L1
imul eax, edx
mov edx, edi
sub edx, 2
je .L1
imul eax, edx
mov edx, edi
sub edx, 3
je .L1
imul eax, edx
mov edx, edi
sub edx, 4
je .L1
imul eax, edx
mov edx, edi
sub edx, 5
je .L1
imul eax, edx
sub edi, 6
je .L1
imul eax, edi
.L1:
ret
.L28:
mov eax, 1
ret
.L8:
mov eax, 1
jmp .L3
.LC0:
.long 1
.long 1
.long 1
.long 1
.LC1:
.long 0
.long -1
.long -2
.long -3
.LC2:
.long -4
.long -4
.long -4
.long -4
I got these results using Compiler Explorer, so it should be the same in a real-world use case.
What's up with that? Are there any cases where this would be faster? Clang seems to do something like this too, but on -O2.
imul r32,r32 has 3 cycle latency on typical modern x86 CPUs (http://agner.org/optimize/). So the scalar implementation can do one multiply per 3 clock cycles, because they're dependent. It's fully pipelined, though, so your scalar loop leaves 2/3rds of the potential throughput unused.
In 3 cycles, the pipeline in Core2 or later can feed 12 uops into the out-of-order part of the core. For small inputs, it might be best to keep the code small and let out-of-order execution overlap the dependency chain with later code, especially if that later code doesn't all depend on the factorial result. But compilers aren't good at knowing when to optimize for latency vs. throughput, and without profile-guided optimization they have no data on how large n usually is.
I suspect that gcc's auto-vectorizer isn't looking at how quickly this will overflow for large n.
A useful scalar optimization would have been unrolling with multiple accumulators, e.g. take advantage of the fact that multiplication is associative and do these in parallel in the loop: prod(n*3/4 .. n) * prod(n/2 .. n*3/4) * prod(n/4 .. n/2) * prod(1..n/4) (with non-overlapping ranges, of course). Multiplication is associative even when it wraps; the product bits only depend on bits at that position and lower, not on (discarded) high bits.
Or more simply, do f0 *= i; f1 *= i+1; f2 *= i+2; f3 *= i+3; i+=4;. And then outside the loop, return (f0*f1) * (f2*f3);. This would be a win in scalar code, too. Of course you also have to account for n % 4 != 0 when unrolling.
What gcc has chosen to do is basically the latter, using pmuludq to do 2 packed multiplies with one instruction (5c latency / 1c or 0.5c throughput on Intel CPUs). It's similar on AMD CPUs; see Agner Fog's instruction tables. Each vector loop iteration does 4 iterations of the factorial loop in your C source, and there's significant instruction-level parallelism within one iteration.
The inner loop is only 12 uops long (cmp/jcc macro-fuses into 1), so it can issue at 1 iteration per 3 cycles, same throughput as the latency bottleneck in your scalar version, but doing 4x as much work per iteration.
.L5:
movdqa xmm3, xmm2 ; copy the old i vector
movdqa xmm1, xmm2
paddd xmm2, xmm4 ; [ i0, i1 | i2, i3 ] += 4
add eax, 1
pmuludq xmm3, xmm0 ; [ f0 | f2 ] *= [ i0 | i2 ]
psrlq xmm1, 32 ; bring odd 32 bit elements down to even: [ i1 | i3 ]
psrlq xmm0, 32
pmuludq xmm1, xmm0 ; [ f1 | f3 ] *= [ i1 | i3 ]
pshufd xmm0, xmm3, 8
pshufd xmm1, xmm1, 8
punpckldq xmm0, xmm1 ; merge back into [ f0 f1 f2 f3 ]
cmp eax, edx
jne .L5
So gcc wastes a whole lot of effort emulating a packed 32-bit multiply instead of just leaving the two vector accumulators separate when using pmuludq. I also looked at clang6.0. I think it's falling into the same trap. (Source+asm on the Godbolt compiler explorer)
You didn't use -march=native or anything, so only SSE2 (baseline for x86-64) is available, so only widening 32x32 => 64 bit SIMD multiplies like pmuludq are available for 32-bit input elements. SSE4.1 pmulld is 2 uops on Haswell and later (single-uop on Sandybridge), but would avoid all of gcc's stupid shuffling.
Of course there's a latency bottleneck here, too, especially because of gcc's missed optimizations increasing the length of the loop-carried dep chain involving the accumulators.
Unrolling with more vector accumulators could hide a lot of the pmuludq latency.
With good vectorization, the SIMD integer multipliers can manage 2x or 4x the throughput of the scalar integer multiply unit. (Or, with AVX2, 8x the throughput using vectors of 8x 32-bit integers.)
But the wider the vectors and the more unrolling, the more cleanup code you need.
gcc -march=haswell
We get an inner loop like this:
.L5:
inc eax
vpmulld ymm1, ymm1, ymm0
vpaddd ymm0, ymm0, ymm2
cmp eax, edx
jne .L5
Super simple, but a 10c latency loop-carried dependency chain :/ (pmulld is 2 dependent uops on Haswell and later). Unrolling with multiple accumulators can give up to a 10x throughput boost for large inputs, for 5c latency / 0.5c throughput for SIMD integer multiply uops on Skylake.
But 4 multiplies per 5 cycles is still much better than 1 per 3 for scalar.
Clang unrolls with multiple accumulators by default, so it should be good. But it's a lot of code, so I didn't analyze it by hand. Plug it into IACA or benchmark it for large inputs. (What is IACA and how do I use it?)
Efficient strategies for handling unroll epilogue:
A lookup table for factorial [0..7] is probably the best bet. Arrange things so your vector / unrolled loop does n%8 .. n, instead of 1 .. n/8*8, so the left-over part is always the same for every n.
After a horizontal vector product, do one more scalar multiply with the table lookup result. A SIMD loop already needs some vector constants so you'll probably touch memory anyway, and the table lookup can happen in parallel with the main loop.
8! is 40320, which fits in 16 bits, so a 1..8 lookup table only needs 8 * 2 bytes of storage. Or use 32-bit entries so you can use a memory source operand for imul instead of a separate movzx.
It doesn't make it worse. It runs faster for large numbers. Here are the results for factorial(1000000000):
-O2: 0.78 sec
-O3: 0.5 sec
Of course, using that large number is undefined behavior (because of overflow with signed arithmetic). But the timing is the same with unsigned numbers, for which it is not undefined behavior.
Note, this usage of factorial is usually pointless, as it doesn't calculate num!, but num! & UINT_MAX. But the compiler doesn't know about this.
Maybe with PGO, the compiler won't vectorize this code, if it is always called with small numbers.
If you don't like this behavior, but you want to use -O3, turn off autovectorization with -fno-tree-loop-vectorize.

Base64 Assembler Fill Array Error "Operands different sizes" Visual Studio

I'm trying to make a Base64Encode in inline assembler in Visual Studio.
I have this function:
char* Base64Encode(char* data, int len)
{
// table with the characters used for the encoding
const char* encodeTable = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
//
char* result;
if (len > 0) // otherwise there is nothing to do ...
{
int encodedLength = ((len + 2) / 3) * 4; // effectively the integer version of ceil(length/3)*4
result = new char[encodedLength+1]; // +1 for null termination
_asm
{
mov esi,data
mov edi,encodeTable
xor eax, eax
// get 3 bytes
mov ah, byte ptr[esi]
mov al, byte ptr[esi+1]
shl eax,16
mov ah, byte ptr[esi+2]
//
mov edx,eax
shl eax,6
shr edx, 26
mov bl, byte ptr[edi + edx]
mov [result],bl
//
mov edx, eax
shl eax, 6
shr edx, 26
mov bl, byte ptr[edi + edx]
mov[result+1], bl
//manipulate in edx bitset3
mov edx, eax
shl eax, 6
shr edx, 26
mov bl, byte ptr[edi + edx]
mov[result+2], bl
//manipulate in edx bitset4
mov edx, eax
shl eax, 6
shr edx, 26
mov bl, byte ptr[edi + edx]
mov[result+3], bl
}
}
else
{
result = "";
}
return result;
}
The encoding is working properly; I always have the right letter in bl, but the output is not working (the result array doesn't get filled with the letters, I'm getting the error that the operands have different sizes, and I am only allowed to make changes inside the __asm block).
Could somebody help me fill the result array with the letters I get in bl? Debugging always shows me the right letters in bl if I comment out all the result lines.
EDIT:
I get nothing in the result array when I use byte ptr.
Any ideas?
EDIT2:
The issue in your code is a matter of indirection. You define and initialize a variable result like this in the C++ code:
char* result;
result = new char[encodedLength+1];
result is a memory location that holds a pointer to an array of characters returned by new. result is not the memory location where data will be stored, but contains a pointer to that data area. You then access it in the ASM block like this:
mov [result],bl
The compiler/assembler(MASM) warned that there was an operand mismatch when it said Operands different sizes. It knew that result was the location of a 32-bit pointer (not single characters). Since result is the address containing a pointer the code above would have moved the contents of bl to the memory location result. This had the effect of changing the pointer (returned by new) not what result was pointing at.
You need to deal with indirection here. You want to get the address that is stored in result and use that as a base for memory addressing. You can choose an available register like ECX and MOV the contents of result into it. You could do that with something like this at the top of your ASM block:
mov ecx, dword ptr [result]
This takes the 32-bit(dword) value at memory location result and stores that in ECX. Now that we have the memory location to the beginning of the character buffer we can now modify all references of result in the ASM block and change it to ECX. Examples:
mov [result],bl
would become:
mov byte ptr [ecx],bl
and
mov[result+1], bl
would become:
mov byte ptr [ecx+1], bl
The second example is called base plus displacement (or offset) addressing. That link also describes all the addressing modes on x86. If you were using 16-bit code (which you aren't) there are some extra restrictions on the register choices that can be used for base and indexing.
As well user3144770 also pointed out that you didn't null terminate your string (you only allocated space for it), so at the bottom of your ASM block you should have probably used something like:
mov byte ptr[ecx+4], 0
With the changes above your code could look something like:
char* Base64Encode(char* data, int len)
{
// table with the characters used for the encoding
const char* encodeTable = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
//
char* result;
if (len > 0) // otherwise there is nothing to do ...
{
int encodedLength = ((len + 2) / 3) * 4; // effectively the integer version of ceil(length/3)*4
result = new char[encodedLength + 1]; // +1 for null termination
_asm
{
mov esi, data
mov edi, encodeTable
mov ecx, dword ptr [result]
xor eax, eax
// get 3 bytes
mov ah, byte ptr[esi]
mov al, byte ptr[esi + 1]
shl eax, 16
mov ah, byte ptr[esi + 2]
//
mov edx, eax
shl eax, 6
shr edx, 26
mov bl, byte ptr[edi + edx]
mov byte ptr [ecx], bl
//
mov edx, eax
shl eax, 6
shr edx, 26
mov bl, byte ptr[edi + edx]
mov byte ptr [ecx + 1], bl
//manipulate in edx bitset3
mov edx, eax
shl eax, 6
shr edx, 26
mov bl, byte ptr[edi + edx]
mov byte ptr [ecx + 2], bl
//manipulate in edx bitset4
mov edx, eax
shl eax, 6
shr edx, 26
mov bl, byte ptr[edi + edx]
mov byte ptr [ecx + 3], bl
mov byte ptr[ecx + 4], 0
}
}
else
{
result = "";
}
return result;
}
Perhaps it's enough to write byte ptr
mov bl, byte ptr[edi + edx]
mov byte ptr[result], bl
Also, you don't actually do the null termination. (+1 for null termination)
mov byte ptr[result+4], 0

Find most significant DWORD in an DWORD array

I want to find the most significant DWORD which isn't equal to 0 in a DWORD array. The algorithm should be optimized for data sizes up to 128 bytes.
I've made three different functions, which all return the index of that DWORD.
unsigned long msb_msvc(long* dw, std::intptr_t n)
{
while( --n )
{
if( dw[n] )
break;
}
return n;
}
static inline unsigned long msb_386(long* dw, std::intptr_t n)
{
__asm
{
mov ecx, [dw]
mov eax, [n]
__loop: sub eax, 1
jz SHORT __exit
cmp DWORD PTR [ecx + eax * 4], 0
jz SHORT __loop
__exit:
}
}
static inline unsigned long msb_sse2(long* dw, std::intptr_t n)
{
__asm
{
mov ecx, [dw]
mov eax, [n]
test ecx, 0x0f
jnz SHORT __128_unaligned
__128_aligned:
cmp eax, 4
jb SHORT __64
sub eax, 4
movdqa xmm0, XMMWORD PTR [ecx + eax * 4]
pxor xmm1, xmm1
pcmpeqd xmm0, xmm1
pmovmskb edx, xmm0
not edx
and edx, 0xffff
jz SHORT __128_aligned
jmp SHORT __exit
__128_unaligned:
cmp eax, 4
jb SHORT __64
sub eax, 4
movdqu xmm0, XMMWORD PTR [ecx + eax * 4]
pxor xmm1, xmm1
pcmpeqd xmm0, xmm1
pmovmskb edx, xmm0
not edx
and edx, 0xffff
jz SHORT __128_unaligned
jmp SHORT __exit
__64:
cmp eax, 2
jb __32
sub eax, 2
movq mm0, MMWORD PTR [ecx + eax * 4]
pxor mm1, mm1
pcmpeqd mm0, mm1
pmovmskb edx, mm0
not edx
and edx, 0xff
emms
jz SHORT __64
jmp SHORT __exit
__32:
test eax, eax
jz SHORT __exit
xor eax, eax
jmp __leave ; retn
__exit:
bsr edx, edx
shr edx, 2
add eax, edx
__leave:
}
}
These functions will be used to preselect data that will then be compared against each other, so they need to be performant.
Does anybody know a better algorithm?
I think you are just looking for the first non-zero word in a given array. I would definitely go with a simple loop written in C. If there's some reason why this is super performance critical, I would recommend you look at the larger context of your program and ask, for example, why you need to find the non-zero element in the array at all and why you can't already know its location.
