I'm benchmarking some SSE code (multiplying 4 floats by 4 floats) against traditional C code doing the same thing. I think my benchmark code must be incorrect in some way because it seems to say that the non-SSE code is faster than the SSE by a factor of 2-3.
Can someone tell me what is wrong with the benchmarking code below? And perhaps suggest another approach that accurately shows the speeds for both the SSE and non-SSE code.
#include <time.h>
#include <string.h>
#include <stdio.h>
#define ITERATIONS 100000
#define MULT_FLOAT4(X, Y) ({ \
asm volatile ( \
"movaps (%0), %%xmm0\n\t" \
"mulps (%1), %%xmm0\n\t" \
"movaps %%xmm0, (%1)" \
:: "r" (X), "r" (Y)); })
int main(void)
{
    int i, j;
    float a[4] __attribute__((aligned(16))) = { 10, 20, 30, 40 };
    time_t timer, sse_time, std_time;

    timer = time(NULL);
    for(j = 0; j < 5000; ++j)
        for(i = 0; i < ITERATIONS; ++i) {
            float b[4] __attribute__((aligned(16))) = { 0.1, 0.1, 0.1, 0.1 };
            MULT_FLOAT4(a, b);
        }
    sse_time = time(NULL) - timer;

    timer = time(NULL);
    for(j = 0; j < 5000; ++j)
        for(i = 0; i < ITERATIONS; ++i) {
            float b[4] __attribute__((aligned(16))) = { 0.1, 0.1, 0.1, 0.1 };
            b[0] *= a[0];
            b[1] *= a[1];
            b[2] *= a[2];
            b[3] *= a[3];
        }
    std_time = time(NULL) - timer;

    printf("sse_time %ld\nstd_time %ld\n", (long)sse_time, (long)std_time);
    return 0;
}
When you enable optimizations, the non-SSE code is eliminated completely, whereas the SSE code (being volatile asm) stays, so that case is trivial. The more interesting part is when optimizations are turned off: the SSE code is still slower, even though the loop overhead is identical in both versions.
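As an aside, not part of the original answer: if you do want to time the scalar version with optimizations on, you have to stop the compiler from deleting it. One common way, sketched below with made-up names (a_src, sink, scalar_loop), is to read the inputs through volatile and write the result to a volatile sink:

#define ITERATIONS 100000

volatile float a_src[4] = { 10, 20, 30, 40 };
volatile float sink;

void scalar_loop(void)
{
    for (int i = 0; i < ITERATIONS; ++i) {
        float b[4] = { 0.1f, 0.1f, 0.1f, 0.1f };
        for (int k = 0; k < 4; ++k)
            b[k] *= a_src[k];              /* volatile reads defeat constant folding */
        sink = b[0] + b[1] + b[2] + b[3];  /* volatile write defeats dead-code elimination */
    }
}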
Non-SSE code of the innermost loop's body:
movl $0x3dcccccd, %eax
movl %eax, -80(%rbp)
movl $0x3dcccccd, %eax
movl %eax, -76(%rbp)
movl $0x3dcccccd, %eax
movl %eax, -72(%rbp)
movl $0x3dcccccd, %eax
movl %eax, -68(%rbp)
movss -80(%rbp), %xmm1
movss -48(%rbp), %xmm0
mulss %xmm1, %xmm0
movss %xmm0, -80(%rbp)
movss -76(%rbp), %xmm1
movss -44(%rbp), %xmm0
mulss %xmm1, %xmm0
movss %xmm0, -76(%rbp)
movss -72(%rbp), %xmm1
movss -40(%rbp), %xmm0
mulss %xmm1, %xmm0
movss %xmm0, -72(%rbp)
movss -68(%rbp), %xmm1
movss -36(%rbp), %xmm0
mulss %xmm1, %xmm0
movss %xmm0, -68(%rbp)
SSE code of the innermost loop's body:
movl $0x3dcccccd, %eax
movl %eax, -64(%rbp)
movl $0x3dcccccd, %eax
movl %eax, -60(%rbp)
movl $0x3dcccccd, %eax
movl %eax, -56(%rbp)
movl $0x3dcccccd, %eax
movl %eax, -52(%rbp)
leaq -48(%rbp), %rax
leaq -64(%rbp), %rdx
movaps (%rax), %xmm0
mulps (%rdx), %xmm0
movaps %xmm0, (%rdx)
I'm not sure about this, but here's my guess:
As you can see, the compiler stores the 4 float values with four 32-bit stores, and they are then read back by a single 16-byte load. This causes a store-forwarding stall, which is costly when it happens; you can look this up in the Intel optimization manuals. It doesn't occur in the scalar version, and that accounts for the performance difference.
To make it faster you need to make sure this stall doesn't occur. If you are using a constant array of 4 floats, make it const and store the results in another aligned array; that way the compiler hopefully won't emit those unnecessary 4-byte movs before the load. Or, if you need to fill the result array, do it with a single 16-byte store. If you can't avoid the 4-byte movs, you need to do something else between the store and the load (calculating something else, for example).
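A minimal sketch of the 16-byte-store idea using SSE intrinsics (my illustration, not the original poster's code; the function name mult_float4_once is made up):

#include <xmmintrin.h>

float a[4] __attribute__((aligned(16))) = { 10, 20, 30, 40 };
float b[4] __attribute__((aligned(16)));

void mult_float4_once(void)
{
    _mm_store_ps(b, _mm_set1_ps(0.1f));   /* one aligned 16-byte store instead of four 4-byte ones */
    __m128 vb = _mm_load_ps(b);           /* 16-byte load, no store-forwarding stall */
    __m128 va = _mm_load_ps(a);
    _mm_store_ps(b, _mm_mul_ps(va, vb));  /* b[i] = a[i] * 0.1f */
}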
Very simple assembly introduction code.
It seems to compile fine with gcc -o prog1 prog1.s, but then ./prog1 just skips a line and shows nothing, as if it were waiting for input the code never asks for. What's wrong?
Using gcc (Debian 4.7.2-5) 4.7.2 in 64-bit gNewSense running on VMware.
Code:
/*
int nums[] = {10, -21, -30, 45};
int main() {
int i, *p;
for (i = 0, p = nums; i != 4; i++, p++)
printf("%d\n", *p);
return 0;
}
*/
.data
nums: .int 10, -21, -30, 45
Sf: .string "%d\n" # format string for printf
.text
.globl main
main:
/********************************************************/
/* keep this part here and don't touch it - prologue !!! */
pushq %rbp
movq %rsp, %rbp
subq $16, %rsp
movq %rbx, -8(%rbp)
movq %r12, -16(%rbp)
/********************************************************/
movl $0, %ebx /* ebx = 0; */
movq $nums, %r12 /* r12 = &nums */
L1:
cmpl $4, %ebx /* if (ebx == 4) ? */
je L2 /* goto L2 */
movl (%r12), %eax /* eax = *r12 */
/*************************************************************/
/* this part prints the value of %eax (clobbers %eax) */
movq $Sf, %rdi /* first parameter (format string pointer) */
movl %eax, %esi /* second parameter (integer) */
call printf /* call the library function */
/*************************************************************/
addl $1, %ebx /* ebx += 1; */
addq $4, %r12 /* r12 += 4; */
jmp L1 /* goto L1; */
L2:
/***************************************************************/
/* keep this part here and don't touch it - epilogue !!! */
movq $0, %rax /* rax = 0 (return value) */
movq -8(%rbp), %rbx
movq -16(%rbp), %r12
leave
ret
/***************************************************************/
tl;dr: do xorl %eax, %eax before call printf.
printf is a varargs function. Here's what the System V AMD64 ABI has to say about varargs functions:
For calls that may call functions that use varargs or stdargs (prototype-less calls or calls to functions containing ellipsis (...) in the declaration) %al is used as hidden argument to specify the number of vector registers used. The contents of %al do not need to match exactly the number of registers, but must be an upper bound on the number of vector registers used and is in the range 0–8 inclusive.
You broke that rule. You'll see that the first time your code calls printf, %al is 10, which is more than the upper bound of 8. On your gNewSense system, here's a disassembly of the beginning of printf:
printf:
sub $0xd8,%rsp
movzbl %al,%eax # rax = al;
mov %rdx,0x30(%rsp)
lea 0x0(,%rax,4),%rdx # rdx = rax * 4;
lea after_movaps(%rip),%rax # rax = &&after_movaps;
mov %rsi,0x28(%rsp)
mov %rcx,0x38(%rsp)
mov %rdi,%rsi
sub %rdx,%rax # rax -= rdx;
lea 0xcf(%rsp),%rdx
mov %r8,0x40(%rsp)
mov %r9,0x48(%rsp)
jmpq *%rax # goto *rax;
movaps %xmm7,-0xf(%rdx)
movaps %xmm6,-0x1f(%rdx)
movaps %xmm5,-0x2f(%rdx)
movaps %xmm4,-0x3f(%rdx)
movaps %xmm3,-0x4f(%rdx)
movaps %xmm2,-0x5f(%rdx)
movaps %xmm1,-0x6f(%rdx)
movaps %xmm0,-0x7f(%rdx)
after_movaps:
# nothing past here is relevant for your problem
A quasi-C translation of the important bits is goto *(&&after_movaps - al * 4); (see Labels as Values). For efficiency, gcc and/or glibc didn't want to save more vector registers than you used, and also didn't want a bunch of conditional branches. Each instruction that saves a vector register is 4 bytes, so the code takes the address of the end of the register-saving instructions, subtracts al * 4 bytes, and jumps there; that executes exactly as many of the saves as are needed. Since you had more than 8 in %al, it ended up jumping too far back, landing before the jump instruction it had just taken, thus creating an infinite loop.
As for why it's not reproducible on modern systems, here's a disassembly of the beginning of their printf:
printf:
sub $0xd8,%rsp
mov %rdi,%r10
mov %rsi,0x28(%rsp)
mov %rdx,0x30(%rsp)
mov %rcx,0x38(%rsp)
mov %r8,0x40(%rsp)
mov %r9,0x48(%rsp)
test %al,%al # if(!al)
je after_movaps # goto after_movaps;
movaps %xmm0,0x50(%rsp)
movaps %xmm1,0x60(%rsp)
movaps %xmm2,0x70(%rsp)
movaps %xmm3,0x80(%rsp)
movaps %xmm4,0x90(%rsp)
movaps %xmm5,0xa0(%rsp)
movaps %xmm6,0xb0(%rsp)
movaps %xmm7,0xc0(%rsp)
after_movaps:
# nothing past here is relevant for your problem
A quasi-C translation of the important bits is if(!al) goto after_movaps;. Why did this change? My guess is Spectre. The mitigations for Spectre make indirect jumps really slow, so it's no longer worth doing that trick. Or not; see comments. Instead, they do a much simpler check: if there's any vector registers, then save them all. With this code, your bad value of al isn't a disaster, since it just means the vector registers will be unnecessarily copied.
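For comparison, a tiny C sketch (mine, not from the answer) showing the compiler doing this bookkeeping itself. gcc typically emits movl $1, %eax before the first call (one vector register is used, %xmm0 carrying the double) and xorl %eax, %eax before the second, which is exactly the fix suggested above:

#include <stdio.h>

int main(void)
{
    printf("%d %f\n", 42, 3.14);   /* compiler typically sets %eax to 1 before the call */
    printf("%d\n", 42);            /* no FP args: compiler emits xorl %eax, %eax        */
    return 0;
}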
I am passing an array of doubles into a function. How do I move each element into the appropriate SSE register? How do I move a specific value into an XMM register? I am using AT&T syntax and the GNU assembler.
_variance:
push %rbp
movq %rsp, %rbp
xorq %r8, %r8
movq %rdi, %r9
movsd $0, %xmm1
movsd $0, %xmm2
cvtsi2sd %r9, %xmm3
decq %r9
jmp 2f
1: movd (%rsi, %r8, 8), %xmm0
addsd %xmm0, %xmm2
mulsd %xmm0, %xmm0
addsd %xmm0, %xmm1
incq %r8
2: cmpq %r8, %r9
jl 1b
divsd %xmm3, %xmm1
divsd %xmm3, %xmm2
mulsd %xmm2, %xmm2
subsd %xmm1, %xmm2
movsd %xmm2, %xmm0
pop %rbp
ret
.size _variance,.-variance
Here is the C program:
#include <stdio.h>

double variance(int n, double x[]);

int main(void) {
    double array[10] = {5.5, 3.4, 7.7, 3.2, 9.1, 113.5, .125, 33.2, 93.2, .00001};
    int number = 10;
    double answer = variance(number, array);
    printf("Variance is: %f\n", answer);
    return 0;
}
I am calculating the variance of a range of variables. Here is the function:
double sum1 = 0, sum2 = 0;
int i = 0;
while(i < n-1) {
    sum1 = sum1 + (x[i] * x[i]);
    sum2 = sum2 + x[i];
    i++;
}
sum1 = sum1 / n;
sum2 = sum2 / n;
sum2 = sum2 * sum2;
return sum1 - sum2;
I have std::vector<double> X, Y, both of size N (with N % 16 == 0), and I want to calculate sum(X[i]*Y[i]). That's a classic use case for fused multiply-add (FMA), which should be fast on AVX-capable processors. I know all my target CPUs are Intel, Haswell or newer.
How do I get GCC to emit that AVX code? -mfma is part of the solution, but do I need other switches?
And is std::vector<double>::operator[] hindering this? I know I can transform
size_t N = X.size();
double sum = 0.0;
for (size_t i = 0; i != N; ++i) sum += X[i] * Y[i];
to
size_t N = X.size();
double sum = 0.0;
double const* Xp = &X[0];
double const* Yp = &Y[0];
for (size_t i = 0; i != N; ++i) sum += Xp[i] * Yp[i];
so the compiler can spot that &X[0] doesn't change in the loop. But is this sufficient or even necessary?
Current compiler is GCC 4.9.2, Debian 8, but could upgrade to GCC 5 if necessary.
Did you look at the assembly? I put
double foo(std::vector<double> &X, std::vector<double> &Y) {
    size_t N = X.size();
    double sum = 0.0;
    for (size_t i = 0; i < N; ++i) sum += X[i] * Y[i];
    return sum;
}
into http://gcc.godbolt.org/ and looked at the assembly in GCC 4.9.2 with -O3 -mfma and I see
.L3:
vmovsd (%rcx,%rax,8), %xmm1
vfmadd231sd (%rsi,%rax,8), %xmm1, %xmm0
addq $1, %rax
cmpq %rdx, %rax
jne .L3
So it uses FMA. However, it does not vectorize the loop (the s in sd means single, i.e. not packed, and the d means double-precision floating point).
To vectorize the loop you need to enable associative math e.g. with -Ofast. Using -Ofast -mavx2 -mfma gives
.L8:
vmovupd (%rax,%rsi), %xmm2
addq $1, %r10
vinsertf128 $0x1, 16(%rax,%rsi), %ymm2, %ymm2
vfmadd231pd (%r12,%rsi), %ymm2, %ymm1
addq $32, %rsi
cmpq %r10, %rdi
ja .L8
So now it's vectorized (pd means packed doubles). However, it's not unrolled; this is currently a limitation of GCC. You need to unroll several times to hide the latency of the dependency chain. If you want the compiler to do this for you, consider using Clang, which unrolls four times; otherwise unroll by hand with intrinsics.
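Here is a rough sketch (mine, not from the answer) of unrolling by hand with intrinsics, using four accumulators to hide the FMA latency. It assumes 32-byte-aligned data and N % 16 == 0, as the question guarantees, and would be compiled with something like -O2 -mavx2 -mfma:

#include <stddef.h>
#include <immintrin.h>

double dot_unrolled(const double *X, const double *Y, size_t N)
{
    __m256d acc0 = _mm256_setzero_pd(), acc1 = _mm256_setzero_pd();
    __m256d acc2 = _mm256_setzero_pd(), acc3 = _mm256_setzero_pd();
    for (size_t i = 0; i < N; i += 16) {  /* four independent accumulators */
        acc0 = _mm256_fmadd_pd(_mm256_load_pd(X + i),      _mm256_load_pd(Y + i),      acc0);
        acc1 = _mm256_fmadd_pd(_mm256_load_pd(X + i + 4),  _mm256_load_pd(Y + i + 4),  acc1);
        acc2 = _mm256_fmadd_pd(_mm256_load_pd(X + i + 8),  _mm256_load_pd(Y + i + 8),  acc2);
        acc3 = _mm256_fmadd_pd(_mm256_load_pd(X + i + 12), _mm256_load_pd(Y + i + 12), acc3);
    }
    __m256d s = _mm256_add_pd(_mm256_add_pd(acc0, acc1), _mm256_add_pd(acc2, acc3));
    double t[4] __attribute__((aligned(32)));
    _mm256_store_pd(t, s);                /* horizontal sum of the final vector */
    return t[0] + t[1] + t[2] + t[3];
}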
Note that unlike GCC, Clang does not use FMA by default with -mfma. To use FMA with Clang, use -ffp-contract=fast (e.g. -O3 -mfma -ffp-contract=fast), or #pragma STDC FP_CONTRACT ON, or enable associative math with e.g. -Ofast. You're going to want to enable associative math anyway if you want to vectorize the loop with Clang.
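A minimal sketch of the pragma route, just to show where the pragma goes (compiled for FMA hardware, e.g. with -mfma):

#pragma STDC FP_CONTRACT ON

/* With contraction enabled and FMA hardware targeted, the compiler is
   allowed to fuse a*b + c into a single fma instruction. */
double fused(double a, double b, double c)
{
    return a * b + c;
}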
See Fused multiply add and default rounding modes and https://stackoverflow.com/a/34461738/2542702 for more info about enabling fma with different compilers.
GCC creates a lot of extra code to handle misalignment and the case where N is not a multiple of 8. You can tell the compiler to assume the arrays are aligned using __builtin_assume_aligned, and that N is a multiple of 8 by writing the loop bound as N & -8.
The following code with -Ofast -mavx2 -mfma
double foo2(double * __restrict X, double * __restrict Y, int N) {
    X = (double*)__builtin_assume_aligned(X, 32);
    Y = (double*)__builtin_assume_aligned(Y, 32);
    double sum = 0.0;
    for (int i = 0; i < (N & -8); ++i) sum += X[i] * Y[i];
    return sum;
}
produces the following simple assembly
andl $-8, %edx
jle .L4
subl $4, %edx
vxorpd %xmm0, %xmm0, %xmm0
shrl $2, %edx
xorl %ecx, %ecx
leal 1(%rdx), %eax
xorl %edx, %edx
.L3:
vmovapd (%rsi,%rdx), %ymm2
addl $1, %ecx
vfmadd231pd (%rdi,%rdx), %ymm2, %ymm0
addq $32, %rdx
cmpl %eax, %ecx
jb .L3
vhaddpd %ymm0, %ymm0, %ymm0
vperm2f128 $1, %ymm0, %ymm0, %ymm1
vaddpd %ymm1, %ymm0, %ymm0
vzeroupper
ret
.L4:
vxorpd %xmm0, %xmm0, %xmm0
ret
I'm not sure this will get you all the way there, but I'm almost sure it's a big part of the solution.
You have to break the loop into two: an outer loop from 0 to N with step M > 1 (I'd try M of 16, 8, 4 and look at the asm), and an inner loop from 0 to M. Don't worry about the iterator math; GCC is smart enough with it.
GCC should unroll the inner loop, and then it can SIMD it and maybe use FMA.
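To make that concrete, a sketch of the split-loop idea (my illustration, with M = 8 picked arbitrarily; the question guarantees N % 16 == 0, so there is no remainder loop to worry about):

#include <stddef.h>

double dot_split(const double *X, const double *Y, size_t N)
{
    double sum = 0.0;
    for (size_t i = 0; i < N; i += 8)      /* outer loop, step M = 8          */
        for (size_t j = 0; j < 8; ++j)     /* fixed-trip inner loop that the  */
            sum += X[i + j] * Y[i + j];    /* compiler can unroll / vectorize */
    return sum;
}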
I'm a total noob at assembly code and at reading it as well. I have some simple C code:
void saxpy()
{
    for(int i = 0; i < ARRAY_SIZE; i++) {
        float product = a*x[i];
        z[i] = product + y[i];
    }
}
and the equivalent assembly code when compiled with
gcc -std=c99 -O3 -fno-tree-vectorize -S code.c -o code-O3.s
gives me the following assembly code:
saxpy:
.LFB0:
.cfi_startproc
movss a(%rip), %xmm1
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L3:
movss x(%rax), %xmm0
addq $4, %rax
mulss %xmm1, %xmm0
addss y-4(%rax), %xmm0
movss %xmm0, z-4(%rax)
cmpq $262144, %rax
jne .L3
rep ret
.cfi_endproc
I do understand that loop unrolling has taken place, but I'm not able to understand the intention and idea behind:
addq $4, %rax
mulss %xmm1, %xmm0
addss y-4(%rax), %xmm0
movss %xmm0, z-4(%rax)
Can someone explain the usage of 4, and what this statement means:
y-4(%rax)
x, y, and z are global arrays. You left out the end of the listing where the symbols are declared.
I put your code on godbolt for you, with the necessary globals defined (and fixed the indenting). Look at the bottom.
BTW, there's no unrolling going on here. There's one each scalar single-precision mul and add in the loop. Try with -funroll-loops to see it unroll.
With -march=haswell, gcc will use an FMA instruction. If you un-cripple the compiler by leaving out -fno-tree-vectorize, and #define ARRAY_SIZE is small, like 100, it fully unrolls the loop with mostly 32byte FMA ymm instructions, ending with some 16byte FMA xmm.
Also, what is the need to add an immediate value of 4 to the rax register, as done by the statement "addq $4, %rax"?
The loop increments a pointer by 4 bytes, instead of using a scaled-index addressing mode.
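Roughly what the generated loop does, expressed back in C (my sketch, not compiler output; ARRAY_SIZE is assumed to be 65536 so that ARRAY_SIZE * sizeof(float) matches the 262144 in the listing):

#include <stddef.h>

#define ARRAY_SIZE 65536
extern float a, x[ARRAY_SIZE], y[ARRAY_SIZE], z[ARRAY_SIZE];

void saxpy_like(void)
{
    /* A single byte offset (what %rax holds) indexes x, y and z; it is advanced
       by sizeof(float) = 4 each iteration, hence "addq $4, %rax". Because the
       increment happens before the add/store, the current element is at
       offset - 4, hence "y-4(%rax)" and "z-4(%rax)". */
    for (size_t off = 0; off != ARRAY_SIZE * sizeof(float); off += 4) {
        float product = a * *(const float *)((const char *)x + off);
        *(float *)((char *)z + off) = product + *(const float *)((const char *)y + off);
    }
}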
Look at the links on https://stackoverflow.com/questions/tagged/x86. Also, single-stepping through code with a debugger is often a good way to make sure you understand what it's doing.
I was trying to come up with inline assembly for gcc to get both the division and the modulus using a single divl instruction. Unfortunately, I am not that good at assembly. Could someone please help me with this? Thank you.
You're looking for something like this:
__asm__("divl %2\n"
: "=d" (remainder), "=a" (quotient)
: "g" (modulus), "d" (high), "a" (low));
Although I agree with the other commenters that usually GCC will do this for you and you should avoid inline assembly when possible, sometimes you need this construct.
For instance, if the high word is less than the modulus, then it is safe to perform the division like this. However, GCC isn't smart enough to realize this, because in the general case dividing a 64-bit number by a 32-bit number can lead to overflow, and so it calls a library routine to do extra work. (Replace with 128-bit/64-bit for 64-bit ISAs.)
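For example, the constraints above might be wrapped up like this (a sketch; the function name and types are mine, and "rm" is used for the divisor because divl needs a register or memory operand rather than an immediate). As discussed, it is only safe when high < divisor:

#include <stdint.h>

static inline void divmod_64_32(uint32_t high, uint32_t low, uint32_t divisor,
                                uint32_t *quotient, uint32_t *remainder)
{
    uint32_t q, r;
    /* divl divides the 64-bit value in edx:eax by the operand;
       the quotient lands in eax and the remainder in edx. */
    __asm__("divl %2"
            : "=a" (q), "=d" (r)
            : "rm" (divisor), "a" (low), "d" (high));
    *quotient = q;
    *remainder = r;
}

Calling divmod_64_32(0, 17, 3, &q, &r) gives q == 5 and r == 2, matching the Intel-syntax example further down.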
You shouldn't try to optimize this yourself. GCC already does this.
#include <stdio.h>

volatile int some_a = 18, some_b = 7;

int main(int argc, char *argv[]) {
    int a = some_a, b = some_b;
    printf("%d %d\n", a / b, a % b);
    return 0;
}
Running
gcc -S test.c -O
yields
main:
.LFB11:
.cfi_startproc
subq $8, %rsp
.cfi_def_cfa_offset 16
movl some_a(%rip), %esi
movl some_b(%rip), %ecx
movl %esi, %eax
movl %esi, %edx
sarl $31, %edx
idivl %ecx
movl %eax, %esi
movl $.LC0, %edi
movl $0, %eax
call printf
movl $0, %eax
addq $8, %rsp
.cfi_def_cfa_offset 8
ret
Notice that the remainder, %edx, is not moved because it is also the third argument passed to printf.
EDIT: The 32-bit version is less confusing. Passing -m32 yields
main:
pushl %ebp
movl %esp, %ebp
andl $-16, %esp
subl $16, %esp
movl some_a, %eax
movl some_b, %ecx
movl %eax, %edx
sarl $31, %edx
idivl %ecx
movl %edx, 8(%esp)
movl %eax, 4(%esp)
movl $.LC0, (%esp)
call printf
movl $0, %eax
leave
ret
Fortunately, you don't have to resort to inline assembly to achieve this. gcc will do this automatically when it can.
$ cat divmod.c
struct sdiv { unsigned long quot; unsigned long rem; };
struct sdiv divide( unsigned long num, unsigned long divisor )
{
struct sdiv x = { num / divisor, num % divisor };
return x;
}
$ gcc -O3 -std=c99 -Wall -Wextra -pedantic -S divmod.c -o -
.file "divmod.c"
.text
.p2align 4,,15
.globl divide
.type divide, #function
divide:
.LFB0:
.cfi_startproc
movq %rdi, %rax
xorl %edx, %edx
divq %rsi
ret
.cfi_endproc
.LFE0:
.size divide, .-divide
.ident "GCC: (GNU) 4.4.4 20100630 (Red Hat 4.4.4-10)"
.section .note.GNU-stack,"",#progbits
Yes -- a divl will produce the quotient in eax and the remainder in edx. Using Intel syntax, for example:
mov eax, 17
mov ebx, 3
xor edx, edx
div ebx
; eax = 5
; edx = 2
Here is an example of divl in Linux kernel code:
/*
* do_div() is NOT a C function. It wants to return
* two values (the quotient and the remainder), but
* since that doesn't work very well in C, what it
* does is:
*
* - modifies the 64-bit dividend _in_place_
* - returns the 32-bit remainder
*
* This ends up being the most efficient "calling
* convention" on x86.
*/
#define do_div(n, base) \
({ \
unsigned long __upper, __low, __high, __mod, __base; \
__base = (base); \
if (__builtin_constant_p(__base) && is_power_of_2(__base)) { \
__mod = n & (__base - 1); \
n >>= ilog2(__base); \
} else { \
asm("" : "=a" (__low), "=d" (__high) : "A" (n));\
__upper = __high; \
if (__high) { \
__upper = __high % (__base); \
__high = __high / (__base); \
} \
asm("divl %2" : "=a" (__low), "=d" (__mod) \
: "rm" (__base), "0" (__low), "1" (__upper)); \
asm("" : "=A" (n) : "a" (__low), "d" (__high)); \
} \
__mod; \
})
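For context, a sketch of how do_div() is typically used in kernel code (the function name split_ns and the constant are mine, purely illustrative); remember that do_div() modifies its first argument in place and returns the 32-bit remainder:

#include <asm/div64.h>   /* kernel header that provides do_div() */

static unsigned int split_ns(unsigned long long *val)
{
    unsigned long long n = *val;                /* nanoseconds in                     */
    unsigned int rem = do_div(n, 1000000000u);  /* n becomes the quotient (seconds),  */
    *val = n;                                   /* rem is the leftover nanoseconds    */
    return rem;
}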