I'd like to better understand why two very similar pieces of code seem to perform dramatically different on my computer.
These tests are on a Ryzen processor, with gcc-trunk and Julia 0.7-alpha (LLVM 6.0).
gcc-8 appears similar, while Julia 0.6.3 (LLVM 3.9) is slightly slower than v0.7.
I wrote generated functions (think C++ templates) that produce unrolled code for matrix operations, as well as a simple transpiler that can translate uncomplicated code to Fortran.
For 8x8 matrix multiplication, here is what the Fortran code looks like:
module mul8mod
implicit none
contains
! Fully unrolled 8x8 double-precision matrix multiply: C = A * B.
! A, B and C are 8x8 matrices stored column-major as flat length-64
! arrays, so element (i,j) lives at index i + 8*(j-1).
! Each C(i,j) = sum over k of A(i,k)*B(k,j); the sum is split across two
! statements (k = 1..4, then k = 5..8) to keep each expression short.
subroutine mul8x8(A, B, C)
real(8), dimension(64), intent(in) :: A, B
real(8), dimension(64), intent(out) :: C
! Column 1 of C (C(1:8)), using B(1:8)
C(1) = A(1) * B(1) + A(9) * B(2) + A(17) * B(3) + A(25) * B(4)
C(1) = C(1) + A(33) * B(5) + A(41) * B(6) + A(49) * B(7) + A(57) * B(8)
C(2) = A(2) * B(1) + A(10) * B(2) + A(18) * B(3) + A(26) * B(4)
C(2) = C(2) + A(34) * B(5) + A(42) * B(6) + A(50) * B(7) + A(58) * B(8)
C(3) = A(3) * B(1) + A(11) * B(2) + A(19) * B(3) + A(27) * B(4)
C(3) = C(3) + A(35) * B(5) + A(43) * B(6) + A(51) * B(7) + A(59) * B(8)
C(4) = A(4) * B(1) + A(12) * B(2) + A(20) * B(3) + A(28) * B(4)
C(4) = C(4) + A(36) * B(5) + A(44) * B(6) + A(52) * B(7) + A(60) * B(8)
C(5) = A(5) * B(1) + A(13) * B(2) + A(21) * B(3) + A(29) * B(4)
C(5) = C(5) + A(37) * B(5) + A(45) * B(6) + A(53) * B(7) + A(61) * B(8)
C(6) = A(6) * B(1) + A(14) * B(2) + A(22) * B(3) + A(30) * B(4)
C(6) = C(6) + A(38) * B(5) + A(46) * B(6) + A(54) * B(7) + A(62) * B(8)
C(7) = A(7) * B(1) + A(15) * B(2) + A(23) * B(3) + A(31) * B(4)
C(7) = C(7) + A(39) * B(5) + A(47) * B(6) + A(55) * B(7) + A(63) * B(8)
C(8) = A(8) * B(1) + A(16) * B(2) + A(24) * B(3) + A(32) * B(4)
C(8) = C(8) + A(40) * B(5) + A(48) * B(6) + A(56) * B(7) + A(64) * B(8)
! Column 2 of C (C(9:16)), using B(9:16)
C(9) = A(1) * B(9) + A(9) * B(10) + A(17) * B(11) + A(25) * B(12)
C(9) = C(9) + A(33) * B(13) + A(41) * B(14) + A(49) * B(15) + A(57) * B(16)
C(10) = A(2) * B(9) + A(10) * B(10) + A(18) * B(11) + A(26) * B(12)
C(10) = C(10) + A(34) * B(13) + A(42) * B(14) + A(50) * B(15) + A(58) * B(16)
C(11) = A(3) * B(9) + A(11) * B(10) + A(19) * B(11) + A(27) * B(12)
C(11) = C(11) + A(35) * B(13) + A(43) * B(14) + A(51) * B(15) + A(59) * B(16)
C(12) = A(4) * B(9) + A(12) * B(10) + A(20) * B(11) + A(28) * B(12)
C(12) = C(12) + A(36) * B(13) + A(44) * B(14) + A(52) * B(15) + A(60) * B(16)
C(13) = A(5) * B(9) + A(13) * B(10) + A(21) * B(11) + A(29) * B(12)
C(13) = C(13) + A(37) * B(13) + A(45) * B(14) + A(53) * B(15) + A(61) * B(16)
C(14) = A(6) * B(9) + A(14) * B(10) + A(22) * B(11) + A(30) * B(12)
C(14) = C(14) + A(38) * B(13) + A(46) * B(14) + A(54) * B(15) + A(62) * B(16)
C(15) = A(7) * B(9) + A(15) * B(10) + A(23) * B(11) + A(31) * B(12)
C(15) = C(15) + A(39) * B(13) + A(47) * B(14) + A(55) * B(15) + A(63) * B(16)
C(16) = A(8) * B(9) + A(16) * B(10) + A(24) * B(11) + A(32) * B(12)
C(16) = C(16) + A(40) * B(13) + A(48) * B(14) + A(56) * B(15) + A(64) * B(16)
! Column 3 of C (C(17:24)), using B(17:24)
C(17) = A(1) * B(17) + A(9) * B(18) + A(17) * B(19) + A(25) * B(20)
C(17) = C(17) + A(33) * B(21) + A(41) * B(22) + A(49) * B(23) + A(57) * B(24)
C(18) = A(2) * B(17) + A(10) * B(18) + A(18) * B(19) + A(26) * B(20)
C(18) = C(18) + A(34) * B(21) + A(42) * B(22) + A(50) * B(23) + A(58) * B(24)
C(19) = A(3) * B(17) + A(11) * B(18) + A(19) * B(19) + A(27) * B(20)
C(19) = C(19) + A(35) * B(21) + A(43) * B(22) + A(51) * B(23) + A(59) * B(24)
C(20) = A(4) * B(17) + A(12) * B(18) + A(20) * B(19) + A(28) * B(20)
C(20) = C(20) + A(36) * B(21) + A(44) * B(22) + A(52) * B(23) + A(60) * B(24)
C(21) = A(5) * B(17) + A(13) * B(18) + A(21) * B(19) + A(29) * B(20)
C(21) = C(21) + A(37) * B(21) + A(45) * B(22) + A(53) * B(23) + A(61) * B(24)
C(22) = A(6) * B(17) + A(14) * B(18) + A(22) * B(19) + A(30) * B(20)
C(22) = C(22) + A(38) * B(21) + A(46) * B(22) + A(54) * B(23) + A(62) * B(24)
C(23) = A(7) * B(17) + A(15) * B(18) + A(23) * B(19) + A(31) * B(20)
C(23) = C(23) + A(39) * B(21) + A(47) * B(22) + A(55) * B(23) + A(63) * B(24)
C(24) = A(8) * B(17) + A(16) * B(18) + A(24) * B(19) + A(32) * B(20)
C(24) = C(24) + A(40) * B(21) + A(48) * B(22) + A(56) * B(23) + A(64) * B(24)
! Column 4 of C (C(25:32)), using B(25:32)
C(25) = A(1) * B(25) + A(9) * B(26) + A(17) * B(27) + A(25) * B(28)
C(25) = C(25) + A(33) * B(29) + A(41) * B(30) + A(49) * B(31) + A(57) * B(32)
C(26) = A(2) * B(25) + A(10) * B(26) + A(18) * B(27) + A(26) * B(28)
C(26) = C(26) + A(34) * B(29) + A(42) * B(30) + A(50) * B(31) + A(58) * B(32)
C(27) = A(3) * B(25) + A(11) * B(26) + A(19) * B(27) + A(27) * B(28)
C(27) = C(27) + A(35) * B(29) + A(43) * B(30) + A(51) * B(31) + A(59) * B(32)
C(28) = A(4) * B(25) + A(12) * B(26) + A(20) * B(27) + A(28) * B(28)
C(28) = C(28) + A(36) * B(29) + A(44) * B(30) + A(52) * B(31) + A(60) * B(32)
C(29) = A(5) * B(25) + A(13) * B(26) + A(21) * B(27) + A(29) * B(28)
C(29) = C(29) + A(37) * B(29) + A(45) * B(30) + A(53) * B(31) + A(61) * B(32)
C(30) = A(6) * B(25) + A(14) * B(26) + A(22) * B(27) + A(30) * B(28)
C(30) = C(30) + A(38) * B(29) + A(46) * B(30) + A(54) * B(31) + A(62) * B(32)
C(31) = A(7) * B(25) + A(15) * B(26) + A(23) * B(27) + A(31) * B(28)
C(31) = C(31) + A(39) * B(29) + A(47) * B(30) + A(55) * B(31) + A(63) * B(32)
C(32) = A(8) * B(25) + A(16) * B(26) + A(24) * B(27) + A(32) * B(28)
C(32) = C(32) + A(40) * B(29) + A(48) * B(30) + A(56) * B(31) + A(64) * B(32)
! Column 5 of C (C(33:40)), using B(33:40)
C(33) = A(1) * B(33) + A(9) * B(34) + A(17) * B(35) + A(25) * B(36)
C(33) = C(33) + A(33) * B(37) + A(41) * B(38) + A(49) * B(39) + A(57) * B(40)
C(34) = A(2) * B(33) + A(10) * B(34) + A(18) * B(35) + A(26) * B(36)
C(34) = C(34) + A(34) * B(37) + A(42) * B(38) + A(50) * B(39) + A(58) * B(40)
C(35) = A(3) * B(33) + A(11) * B(34) + A(19) * B(35) + A(27) * B(36)
C(35) = C(35) + A(35) * B(37) + A(43) * B(38) + A(51) * B(39) + A(59) * B(40)
C(36) = A(4) * B(33) + A(12) * B(34) + A(20) * B(35) + A(28) * B(36)
C(36) = C(36) + A(36) * B(37) + A(44) * B(38) + A(52) * B(39) + A(60) * B(40)
C(37) = A(5) * B(33) + A(13) * B(34) + A(21) * B(35) + A(29) * B(36)
C(37) = C(37) + A(37) * B(37) + A(45) * B(38) + A(53) * B(39) + A(61) * B(40)
C(38) = A(6) * B(33) + A(14) * B(34) + A(22) * B(35) + A(30) * B(36)
C(38) = C(38) + A(38) * B(37) + A(46) * B(38) + A(54) * B(39) + A(62) * B(40)
C(39) = A(7) * B(33) + A(15) * B(34) + A(23) * B(35) + A(31) * B(36)
C(39) = C(39) + A(39) * B(37) + A(47) * B(38) + A(55) * B(39) + A(63) * B(40)
C(40) = A(8) * B(33) + A(16) * B(34) + A(24) * B(35) + A(32) * B(36)
C(40) = C(40) + A(40) * B(37) + A(48) * B(38) + A(56) * B(39) + A(64) * B(40)
! Column 6 of C (C(41:48)), using B(41:48)
C(41) = A(1) * B(41) + A(9) * B(42) + A(17) * B(43) + A(25) * B(44)
C(41) = C(41) + A(33) * B(45) + A(41) * B(46) + A(49) * B(47) + A(57) * B(48)
C(42) = A(2) * B(41) + A(10) * B(42) + A(18) * B(43) + A(26) * B(44)
C(42) = C(42) + A(34) * B(45) + A(42) * B(46) + A(50) * B(47) + A(58) * B(48)
C(43) = A(3) * B(41) + A(11) * B(42) + A(19) * B(43) + A(27) * B(44)
C(43) = C(43) + A(35) * B(45) + A(43) * B(46) + A(51) * B(47) + A(59) * B(48)
C(44) = A(4) * B(41) + A(12) * B(42) + A(20) * B(43) + A(28) * B(44)
C(44) = C(44) + A(36) * B(45) + A(44) * B(46) + A(52) * B(47) + A(60) * B(48)
C(45) = A(5) * B(41) + A(13) * B(42) + A(21) * B(43) + A(29) * B(44)
C(45) = C(45) + A(37) * B(45) + A(45) * B(46) + A(53) * B(47) + A(61) * B(48)
C(46) = A(6) * B(41) + A(14) * B(42) + A(22) * B(43) + A(30) * B(44)
C(46) = C(46) + A(38) * B(45) + A(46) * B(46) + A(54) * B(47) + A(62) * B(48)
C(47) = A(7) * B(41) + A(15) * B(42) + A(23) * B(43) + A(31) * B(44)
C(47) = C(47) + A(39) * B(45) + A(47) * B(46) + A(55) * B(47) + A(63) * B(48)
C(48) = A(8) * B(41) + A(16) * B(42) + A(24) * B(43) + A(32) * B(44)
C(48) = C(48) + A(40) * B(45) + A(48) * B(46) + A(56) * B(47) + A(64) * B(48)
! Column 7 of C (C(49:56)), using B(49:56)
C(49) = A(1) * B(49) + A(9) * B(50) + A(17) * B(51) + A(25) * B(52)
C(49) = C(49) + A(33) * B(53) + A(41) * B(54) + A(49) * B(55) + A(57) * B(56)
C(50) = A(2) * B(49) + A(10) * B(50) + A(18) * B(51) + A(26) * B(52)
C(50) = C(50) + A(34) * B(53) + A(42) * B(54) + A(50) * B(55) + A(58) * B(56)
C(51) = A(3) * B(49) + A(11) * B(50) + A(19) * B(51) + A(27) * B(52)
C(51) = C(51) + A(35) * B(53) + A(43) * B(54) + A(51) * B(55) + A(59) * B(56)
C(52) = A(4) * B(49) + A(12) * B(50) + A(20) * B(51) + A(28) * B(52)
C(52) = C(52) + A(36) * B(53) + A(44) * B(54) + A(52) * B(55) + A(60) * B(56)
C(53) = A(5) * B(49) + A(13) * B(50) + A(21) * B(51) + A(29) * B(52)
C(53) = C(53) + A(37) * B(53) + A(45) * B(54) + A(53) * B(55) + A(61) * B(56)
C(54) = A(6) * B(49) + A(14) * B(50) + A(22) * B(51) + A(30) * B(52)
C(54) = C(54) + A(38) * B(53) + A(46) * B(54) + A(54) * B(55) + A(62) * B(56)
C(55) = A(7) * B(49) + A(15) * B(50) + A(23) * B(51) + A(31) * B(52)
C(55) = C(55) + A(39) * B(53) + A(47) * B(54) + A(55) * B(55) + A(63) * B(56)
C(56) = A(8) * B(49) + A(16) * B(50) + A(24) * B(51) + A(32) * B(52)
C(56) = C(56) + A(40) * B(53) + A(48) * B(54) + A(56) * B(55) + A(64) * B(56)
! Column 8 of C (C(57:64)), using B(57:64)
C(57) = A(1) * B(57) + A(9) * B(58) + A(17) * B(59) + A(25) * B(60)
C(57) = C(57) + A(33) * B(61) + A(41) * B(62) + A(49) * B(63) + A(57) * B(64)
C(58) = A(2) * B(57) + A(10) * B(58) + A(18) * B(59) + A(26) * B(60)
C(58) = C(58) + A(34) * B(61) + A(42) * B(62) + A(50) * B(63) + A(58) * B(64)
C(59) = A(3) * B(57) + A(11) * B(58) + A(19) * B(59) + A(27) * B(60)
C(59) = C(59) + A(35) * B(61) + A(43) * B(62) + A(51) * B(63) + A(59) * B(64)
C(60) = A(4) * B(57) + A(12) * B(58) + A(20) * B(59) + A(28) * B(60)
C(60) = C(60) + A(36) * B(61) + A(44) * B(62) + A(52) * B(63) + A(60) * B(64)
C(61) = A(5) * B(57) + A(13) * B(58) + A(21) * B(59) + A(29) * B(60)
C(61) = C(61) + A(37) * B(61) + A(45) * B(62) + A(53) * B(63) + A(61) * B(64)
C(62) = A(6) * B(57) + A(14) * B(58) + A(22) * B(59) + A(30) * B(60)
C(62) = C(62) + A(38) * B(61) + A(46) * B(62) + A(54) * B(63) + A(62) * B(64)
C(63) = A(7) * B(57) + A(15) * B(58) + A(23) * B(59) + A(31) * B(60)
C(63) = C(63) + A(39) * B(61) + A(47) * B(62) + A(55) * B(63) + A(63) * B(64)
C(64) = A(8) * B(57) + A(16) * B(58) + A(24) * B(59) + A(32) * B(60)
C(64) = C(64) + A(40) * B(61) + A(48) * B(62) + A(56) * B(63) + A(64) * B(64)
end subroutine mul8x8
end module mul8mod
The Julia code looks similar, but I first extract all the elements of the inputs, work on the scalars, and then insert them. I found that that works better in Julia, but worse in Fortran.
The expression looks super simple, like there should be no issue vectorizing it. Julia does so beautifully. Updating an 8x8 matrix in place:
# Julia benchmark; using YMM vectors
@benchmark mul!($c8, $a8, $b8)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 57.059 ns (0.00% GC)
median time: 58.901 ns (0.00% GC)
mean time: 59.522 ns (0.00% GC)
maximum time: 83.196 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 984
This works well.
Compiling the Fortran code with:
gfortran-trunk -march=native -Ofast -mprefer-vector-width=256 -shared -fPIC mul8module1.F08 -o libmul8mod1v15.so
Benchmark results:
# gfortran, using XMM vectors; code was unrolled 8x8 matrix multiplication
@benchmark mul8v15!($c8, $a8, $b8)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 122.175 ns (0.00% GC)
median time: 128.373 ns (0.00% GC)
mean time: 128.643 ns (0.00% GC)
maximum time: 194.090 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 905
Takes about twice as long. Looking at the assembly with -S reveals it ignored my -mprefer-vector-width=256, and used xmm registers instead.
This is also more or less what I get in Julia when I use pointers instead of arrays or mutable structs (when given pointers Julia assumes aliasing and compiles a slower version).
I tried a variety of variations on generating Fortran code (eg, sum(va * vb) statements, where va and vb are 4-length vectors), but the simplest was just calling the intrinsic function matmul.
Compiling matmul (for known 8x8 size) without -mprefer-vector-width=256,
# gfortran using XMM vectors generated from intrinsic matmul function
@benchmark mul8v2v2!($c8, $a8, $b8)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 92.983 ns (0.00% GC)
median time: 96.366 ns (0.00% GC)
mean time: 97.651 ns (0.00% GC)
maximum time: 166.845 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 954
and compiling WITH it:
# gfortran using YMM vectors with intrinsic matmul
@benchmark mul8v2v1!($c8, $a8, $b8)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 163.667 ns (0.00% GC)
median time: 166.544 ns (0.00% GC)
mean time: 168.320 ns (0.00% GC)
maximum time: 277.291 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 780
The avx-free matmul looks really fast for only using xmm registers, but when coerced into ymm -- dreadful.
Any idea what's going on? I want to understand why when instructed to do the same thing, and generating extremely similar assembly, one is so dramatically faster than the other.
FWIW, the input data is 8 byte aligned. I tried 16 byte aligned inputs, and it didn't seem to make a real difference.
I took a look at the assembly produced by gfortran with (note, this is just the intrinsic matmul function):
gfortran-trunk -march=native -Ofast -mprefer-vector-width=256 -shared -fPIC -S mul8module2.F08 -o mul8mod2v1.s
and that from Julia/LLVM, gotten via @code_native mul!(c8, a8, b8) (the unrolled matrix multiplication).
I would be more than happy to share all the assembly or anything else if someone is willing to take a look, but I'd hit the character limit on this post if I included it here.
Both correctly used ymm registers, and lots of vfmadd__pd instructions, also with lots of vmovupd, vmulpd, and vmovapd.
The biggest difference I noticed is that while LLVM used lots of vbroadcastsd, gcc instead has piles of vunpcklpd and vpermpd instructions.
A brief sample; gcc:
vpermpd $216, %ymm7, %ymm7
vpermpd $216, %ymm2, %ymm2
vpermpd $216, %ymm3, %ymm3
vpermpd $216, %ymm5, %ymm5
vunpckhpd %ymm6, %ymm4, %ymm4
vunpcklpd %ymm7, %ymm2, %ymm6
vunpckhpd %ymm7, %ymm2, %ymm2
vunpcklpd %ymm5, %ymm3, %ymm7
vpermpd $216, %ymm15, %ymm15
vpermpd $216, %ymm4, %ymm4
vpermpd $216, %ymm0, %ymm0
vpermpd $216, %ymm1, %ymm1
vpermpd $216, %ymm6, %ymm6
vpermpd $216, %ymm7, %ymm7
vunpckhpd %ymm5, %ymm3, %ymm3
vunpcklpd %ymm15, %ymm0, %ymm5
vunpckhpd %ymm15, %ymm0, %ymm0
vunpcklpd %ymm4, %ymm1, %ymm15
vunpckhpd %ymm4, %ymm1, %ymm1
vunpcklpd %ymm7, %ymm6, %ymm4
vunpckhpd %ymm7, %ymm6, %ymm6
Julia/LLVM:
vbroadcastsd 8(%rax), %ymm3
vbroadcastsd 72(%rax), %ymm2
vbroadcastsd 136(%rax), %ymm12
vbroadcastsd 200(%rax), %ymm8
vbroadcastsd 264(%rax), %ymm10
vbroadcastsd 328(%rax), %ymm15
vbroadcastsd 392(%rax), %ymm14
vmulpd %ymm7, %ymm0, %ymm1
vmulpd %ymm11, %ymm0, %ymm0
vmovapd %ymm8, %ymm4
Could this explain the difference?
Why would gcc be so poorly optimized here?
Is there any way I can help it, so that it could generate code more comparable to LLVM?
Overall, gcc tends to outperform Clang in benchmarks (eg, on Phoronix)... Maybe I could try Flang (LLVM backend to Fortran), as well as Eigen (with g++ and clang++).
To reproduce, the matmul intrinsic function:
module mul8mod
implicit none
contains
! Computes C = A * B for fixed-size 8x8 real(8) matrices via the
! compiler-intrinsic matmul, for comparison against the hand-unrolled
! version above.
subroutine intrinsic_mul8x8(A, B, C)
real(8), dimension(8,8), intent(in) :: A, B
real(8), dimension(8,8), intent(out) :: C
C = matmul(A, B)
end subroutine
end module mul8mod
Compiled as above, and Julia code to reproduce benchmarks:
#Pkg.clone("https://github.com/chriselrod/TriangularMatrices.jl")
using TriangularMatrices, BenchmarkTools, Compat
a8 = randmat(8); b8 = randmat(8); c8 = randmat(8);
import TriangularMatrices: mul!
@benchmark mul!($c8, $a8, $b8)
@code_native mul!(c8, a8, b8)
# after compiling into the shared library libmul8mod2v1.so
# If compiled outside the working directory, replace pwd() accordingly
const libmul8path2v1 = joinpath(pwd(), "libmul8mod2v1.so")
# Multiply 8x8 matrices by calling the gfortran-compiled intrinsic-matmul
# kernel; computes C = A*B in place and returns C.
# The symbol follows gfortran's name mangling: __<module>_MOD_<subroutine>.
# NOTE(review): pointer_from_objref hands the Fortran routine raw pointers,
# so A, B and C must be mutable, heap-allocated objects that stay rooted
# for the duration of the call.
function mul8v2v1!(C, A, B)
ccall((:__mul8mod_MOD_intrinsic_mul8x8, libmul8path2v1),
Cvoid,(Ptr{Cvoid},Ptr{Cvoid},Ptr{Cvoid}),
pointer_from_objref(A),
pointer_from_objref(B),
pointer_from_objref(C))
C
end
@benchmark mul8v2v1!($c8, $a8, $b8)
EDIT:
Thanks for the responses everyone!
Because I noticed that the code with the broadcasts is dramatically faster, I decided to rewrite my code-generator to encourage broadcasting.
Generated code now looks more like this:
C[1] = B[1] * A[1]
C[2] = B[1] * A[2]
C[3] = B[1] * A[3]
C[4] = B[1] * A[4]
C[5] = B[1] * A[5]
C[6] = B[1] * A[6]
C[7] = B[1] * A[7]
C[8] = B[1] * A[8]
C[1] += B[2] * A[9]
C[2] += B[2] * A[10]
C[3] += B[2] * A[11]
C[4] += B[2] * A[12]
C[5] += B[2] * A[13]
C[6] += B[2] * A[14]
C[7] += B[2] * A[15]
C[8] += B[2] * A[16]
C[1] += B[3] * A[17]
...
I am intending for the compiler to broadcast B, and then use repeated vectorized fma instructions. Julia really liked this rewrite:
# Julia benchmark; using YMM vectors
@benchmark mul2!($c, $a, $b)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 45.156 ns (0.00% GC)
median time: 47.058 ns (0.00% GC)
mean time: 47.390 ns (0.00% GC)
maximum time: 62.066 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 990
Figuring it was llvm being smart, I also built Flang (Fortran frontend to llvm):
# compiled with
# flang -march=native -Ofast -mprefer-vector-width=256 -shared -fPIC mul8module6.f95 -o libmul8mod6v2.so
@benchmark mul8v6v2!($c, $a, $b)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 51.322 ns (0.00% GC)
median time: 52.791 ns (0.00% GC)
mean time: 52.944 ns (0.00% GC)
maximum time: 83.376 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 988
This is also really good.
gfortran still refused to use broadcasts, and was still slow.
I've still have questions on how best to generate code. Encouraging broadcasts is obviously the way to go. Right now, I'm basically doing matrix * vector multiplication, and then repeating it for every column of B. So my written code loops over A once per column of B.
I do not know if that is what the compiler is actually doing, or if some other pattern would lead to faster code.
The point of optimizing multiplication of tiny matrices is as a kernel for a recursive algorithm for multiplying larger matrices.
So I also need to figure out the best way to handle different sizes.
This algorithm is far better for 8x8 than it is other sizes.
For nrow(A) % 4 (ie, if A has 10 rows, 10 % 4 = 2) I used the old approach for the remainder, after the broadcastable block.
But for 10x10 matrices, it takes 151 ns.
12 is perfectly divisible by 4, but it takes 226.
If this approach scaled with O(n^3), the times should be 91 ns and 158 ns respectively. I am falling well short.
I think I need to block down to a very small size, and try and get as many 8x8 as possible.
It may be the case that 8x8 ought to be the maximum size.
This would be a good case for profiling and performance analysis using a low-level tool that can expose microarchitectural bottlenecks. While I have not used AMD μProf, my experience with Intel's equivalents like XTU suggest that you'll get the best possible results when using a tool written by someone working for the same company and maybe even sitting near the people responsible for the hardware implementation of Ryzen's AVX instructions.
Start with an event-based profile of your application when running through a large number of iterations. General areas you could look for are things like whether one or the other style of generated assembly makes better use of execution ports or related backend CPU resources, or whether they behave differently with respect to cache and memory accesses. None of that would answer your conceptual question of why gcc has chosen to generate assembly in one style and LLVM in another, but it might tell you more at a hardware level about why the LLVM-generated assembly runs faster.
I'm doing a program in VB6 that simulates the process of multiplying a 6x6 to 6x6 matrix. I just did the code only for the first row and second row. To be frankly honest I'm a beginner in programming so I'm still figuring things out.
Can someone show how to simplify this code using do until loop so that I can also apply it for the remaining 3rd, 4th, 5th and 6th row?
Re:
I also added the code for rows 3rd, 4th, 5th and 6th. Pls do help me.
an(0).Text = Val(a(0).Text) * Val(b(0).Text) + Val(a(1).Text) * Val(b(6).Text) + Val(a(2).Text) * Val(b(12).Text) + Val(a(3).Text) * Val(b(18).Text) + Val(a(4).Text) * Val(b(24).Text) + Val(a(5).Text) * Val(b(30).Text)
an(1).Text = Val(a(0).Text) * Val(b(1).Text) + Val(a(1).Text) * Val(b(7).Text) + Val(a(2).Text) * Val(b(13).Text) + Val(a(3).Text) * Val(b(19).Text) + Val(a(4).Text) * Val(b(25).Text) + Val(a(5).Text) * Val(b(31).Text)
an(2).Text = Val(a(0).Text) * Val(b(2).Text) + Val(a(1).Text) * Val(b(8).Text) + Val(a(2).Text) * Val(b(14).Text) + Val(a(3).Text) * Val(b(20).Text) + Val(a(4).Text) * Val(b(26).Text) + Val(a(5).Text) * Val(b(32).Text)
an(3).Text = Val(a(0).Text) * Val(b(3).Text) + Val(a(1).Text) * Val(b(9).Text) + Val(a(2).Text) * Val(b(15).Text) + Val(a(3).Text) * Val(b(21).Text) + Val(a(4).Text) * Val(b(27).Text) + Val(a(5).Text) * Val(b(33).Text)
an(4).Text = Val(a(0).Text) * Val(b(4).Text) + Val(a(1).Text) * Val(b(10).Text) + Val(a(2).Text) * Val(b(16).Text) + Val(a(3).Text) * Val(b(22).Text) + Val(a(4).Text) * Val(b(28).Text) + Val(a(5).Text) * Val(b(34).Text)
an(5).Text = Val(a(0).Text) * Val(b(5).Text) + Val(a(1).Text) * Val(b(11).Text) + Val(a(2).Text) * Val(b(17).Text) + Val(a(3).Text) * Val(b(23).Text) + Val(a(4).Text) * Val(b(29).Text) + Val(a(5).Text) * Val(b(35).Text)
an(6).Text = Val(a(6).Text) * Val(b(0).Text) + Val(a(7).Text) * Val(b(6).Text) + Val(a(8).Text) * Val(b(12).Text) + Val(a(9).Text) * Val(b(18).Text) + Val(a(10).Text) * Val(b(24).Text) + Val(a(11).Text) * Val(b(30).Text)
an(7).Text = Val(a(6).Text) * Val(b(1).Text) + Val(a(7).Text) * Val(b(7).Text) + Val(a(8).Text) * Val(b(13).Text) + Val(a(9).Text) * Val(b(19).Text) + Val(a(10).Text) * Val(b(25).Text) + Val(a(11).Text) * Val(b(31).Text)
an(8).Text = Val(a(6).Text) * Val(b(2).Text) + Val(a(7).Text) * Val(b(8).Text) + Val(a(8).Text) * Val(b(14).Text) + Val(a(9).Text) * Val(b(20).Text) + Val(a(10).Text) * Val(b(26).Text) + Val(a(11).Text) * Val(b(32).Text)
an(9).Text = Val(a(6).Text) * Val(b(3).Text) + Val(a(7).Text) * Val(b(9).Text) + Val(a(8).Text) * Val(b(15).Text) + Val(a(9).Text) * Val(b(21).Text) + Val(a(10).Text) * Val(b(27).Text) + Val(a(11).Text) * Val(b(33).Text)
an(10).Text = Val(a(6).Text) * Val(b(4).Text) + Val(a(7).Text) * Val(b(10).Text) + Val(a(8).Text) * Val(b(16).Text) + Val(a(9).Text) * Val(b(22).Text) + Val(a(10).Text) * Val(b(28).Text) + Val(a(11).Text) * Val(b(34).Text)
an(11).Text = Val(a(6).Text) * Val(b(5).Text) + Val(a(7).Text) * Val(b(11).Text) + Val(a(8).Text) * Val(b(17).Text) + Val(a(9).Text) * Val(b(23).Text) + Val(a(10).Text) * Val(b(29).Text) + Val(a(11).Text) * Val(b(35).Text)
an(12).Text = Val(A(12).Text) * Val(B(0).Text) + Val(A(13).Text) * Val(B(6).Text) + Val(A(14).Text) * Val(B(12).Text) + Val(A(15).Text) * Val(B(18).Text) + Val(A(16).Text) * Val(B(24).Text) + Val(A(17).Text) * Val(B(30).Text)
an(13).Text = Val(A(12).Text) * Val(B(1).Text) + Val(A(13).Text) * Val(B(7).Text) + Val(A(14).Text) * Val(B(13).Text) + Val(A(15).Text) * Val(B(19).Text) + Val(A(16).Text) * Val(B(25).Text) + Val(A(17).Text) * Val(B(31).Text)
an(14).Text = Val(A(12).Text) * Val(B(2).Text) + Val(A(13).Text) * Val(B(8).Text) + Val(A(14).Text) * Val(B(14).Text) + Val(A(15).Text) * Val(B(20).Text) + Val(A(16).Text) * Val(B(26).Text) + Val(A(17).Text) * Val(B(32).Text)
an(15).Text = Val(A(12).Text) * Val(B(3).Text) + Val(A(13).Text) * Val(B(9).Text) + Val(A(14).Text) * Val(B(15).Text) + Val(A(15).Text) * Val(B(21).Text) + Val(A(16).Text) * Val(B(27).Text) + Val(A(17).Text) * Val(B(33).Text)
an(16).Text = Val(A(12).Text) * Val(B(4).Text) + Val(A(13).Text) * Val(B(10).Text) + Val(A(14).Text) * Val(B(16).Text) + Val(A(15).Text) * Val(B(22).Text) + Val(A(16).Text) * Val(B(28).Text) + Val(A(17).Text) * Val(B(34).Text)
an(17).Text = Val(A(12).Text) * Val(B(5).Text) + Val(A(13).Text) * Val(B(11).Text) + Val(A(14).Text) * Val(B(17).Text) + Val(A(15).Text) * Val(B(23).Text) + Val(A(16).Text) * Val(B(29).Text) + Val(A(17).Text) * Val(B(35).Text)
an(18).Text = Val(A(18).Text) * Val(B(0).Text) + Val(A(19).Text) * Val(B(6).Text) + Val(A(20).Text) * Val(B(12).Text) + Val(A(21).Text) * Val(B(18).Text) + Val(A(22).Text) * Val(B(24).Text) + Val(A(23).Text) * Val(B(30).Text)
an(19).Text = Val(A(18).Text) * Val(B(1).Text) + Val(A(19).Text) * Val(B(7).Text) + Val(A(20).Text) * Val(B(13).Text) + Val(A(21).Text) * Val(B(19).Text) + Val(A(22).Text) * Val(B(25).Text) + Val(A(23).Text) * Val(B(31).Text)
an(20).Text = Val(A(18).Text) * Val(B(2).Text) + Val(A(19).Text) * Val(B(8).Text) + Val(A(20).Text) * Val(B(14).Text) + Val(A(21).Text) * Val(B(20).Text) + Val(A(22).Text) * Val(B(26).Text) + Val(A(23).Text) * Val(B(32).Text)
an(21).Text = Val(A(18).Text) * Val(B(3).Text) + Val(A(19).Text) * Val(B(9).Text) + Val(A(20).Text) * Val(B(15).Text) + Val(A(21).Text) * Val(B(21).Text) + Val(A(22).Text) * Val(B(27).Text) + Val(A(23).Text) * Val(B(33).Text)
an(22).Text = Val(A(18).Text) * Val(B(4).Text) + Val(A(19).Text) * Val(B(10).Text) + Val(A(20).Text) * Val(B(16).Text) + Val(A(21).Text) * Val(B(22).Text) + Val(A(22).Text) * Val(B(28).Text) + Val(A(23).Text) * Val(B(34).Text)
an(23).Text = Val(A(18).Text) * Val(B(5).Text) + Val(A(19).Text) * Val(B(11).Text) + Val(A(20).Text) * Val(B(17).Text) + Val(A(21).Text) * Val(B(23).Text) + Val(A(22).Text) * Val(B(29).Text) + Val(A(23).Text) * Val(B(35).Text)
an(24).Text = Val(A(24).Text) * Val(B(0).Text) + Val(A(25).Text) * Val(B(6).Text) + Val(A(26).Text) * Val(B(12).Text) + Val(A(27).Text) * Val(B(18).Text) + Val(A(28).Text) * Val(B(24).Text) + Val(A(29).Text) * Val(B(30).Text)
an(25).Text = Val(A(24).Text) * Val(B(1).Text) + Val(A(25).Text) * Val(B(7).Text) + Val(A(26).Text) * Val(B(13).Text) + Val(A(27).Text) * Val(B(19).Text) + Val(A(28).Text) * Val(B(25).Text) + Val(A(29).Text) * Val(B(31).Text)
an(26).Text = Val(A(24).Text) * Val(B(2).Text) + Val(A(25).Text) * Val(B(8).Text) + Val(A(26).Text) * Val(B(14).Text) + Val(A(27).Text) * Val(B(20).Text) + Val(A(28).Text) * Val(B(26).Text) + Val(A(29).Text) * Val(B(32).Text)
an(27).Text = Val(A(24).Text) * Val(B(3).Text) + Val(A(25).Text) * Val(B(9).Text) + Val(A(26).Text) * Val(B(15).Text) + Val(A(27).Text) * Val(B(21).Text) + Val(A(28).Text) * Val(B(27).Text) + Val(A(29).Text) * Val(B(33).Text)
an(28).Text = Val(A(24).Text) * Val(B(4).Text) + Val(A(25).Text) * Val(B(10).Text) + Val(A(26).Text) * Val(B(16).Text) + Val(A(27).Text) * Val(B(22).Text) + Val(A(28).Text) * Val(B(28).Text) + Val(A(29).Text) * Val(B(34).Text)
an(29).Text = Val(A(24).Text) * Val(B(5).Text) + Val(A(25).Text) * Val(B(11).Text) + Val(A(26).Text) * Val(B(17).Text) + Val(A(27).Text) * Val(B(23).Text) + Val(A(28).Text) * Val(B(29).Text) + Val(A(29).Text) * Val(B(35).Text)
an(30).Text = Val(A(30).Text) * Val(B(0).Text) + Val(A(31).Text) * Val(B(6).Text) + Val(A(32).Text) * Val(B(12).Text) + Val(A(33).Text) * Val(B(18).Text) + Val(A(34).Text) * Val(B(24).Text) + Val(A(35).Text) * Val(B(30).Text)
an(31).Text = Val(A(30).Text) * Val(B(1).Text) + Val(A(31).Text) * Val(B(7).Text) + Val(A(32).Text) * Val(B(13).Text) + Val(A(33).Text) * Val(B(19).Text) + Val(A(34).Text) * Val(B(25).Text) + Val(A(35).Text) * Val(B(31).Text)
an(32).Text = Val(A(30).Text) * Val(B(2).Text) + Val(A(31).Text) * Val(B(8).Text) + Val(A(32).Text) * Val(B(14).Text) + Val(A(33).Text) * Val(B(20).Text) + Val(A(34).Text) * Val(B(26).Text) + Val(A(35).Text) * Val(B(32).Text)
an(33).Text = Val(A(30).Text) * Val(B(3).Text) + Val(A(31).Text) * Val(B(9).Text) + Val(A(32).Text) * Val(B(15).Text) + Val(A(33).Text) * Val(B(21).Text) + Val(A(34).Text) * Val(B(27).Text) + Val(A(35).Text) * Val(B(33).Text)
an(34).Text = Val(A(30).Text) * Val(B(4).Text) + Val(A(31).Text) * Val(B(10).Text) + Val(A(32).Text) * Val(B(16).Text) + Val(A(33).Text) * Val(B(22).Text) + Val(A(34).Text) * Val(B(28).Text) + Val(A(35).Text) * Val(B(34).Text)
an(35).Text = Val(A(30).Text) * Val(B(5).Text) + Val(A(31).Text) * Val(B(11).Text) + Val(A(32).Text) * Val(B(17).Text) + Val(A(33).Text) * Val(B(23).Text) + Val(A(34).Text) * Val(B(29).Text) + Val(A(35).Text) * Val(B(35).Text)
A Quick sample below, you need to tweak it much more until you end up with 3 loops inside each other.
To create the textboxes first -
Option Explicit
Private Sub Command1_Click()
    ' Create the runtime members of the textbox control arrays a, b and an.
    ' Index 0 of each array exists at design time, so load indices 1..35 to
    ' give each array exactly the 36 cells (0..35) that the 6x6 matrix
    ' multiplication uses. (The original looped 1 To 36, which created an
    ' unused 37th control in each array.)
    Dim xText As Integer
    For xText = 1 To 35
        Load a(xText)
        a(xText).Move a(0).Left, a(xText - 1).Top + 360, a(xText - 1).Width, 285
        a(xText).Visible = True
        ' Pre-fill each new cell with the previous cell's value plus one.
        a(xText) = a(xText - 1) + 1
        Load b(xText)
        b(xText).Move b(0).Left, b(xText - 1).Top + 360, b(xText - 1).Width, 285
        b(xText).Visible = True
        b(xText) = b(xText - 1) + 1
        Load an(xText)
        an(xText).Move an(0).Left, an(xText - 1).Top + 360, an(xText - 1).Width, 285
        an(xText).Visible = True
    Next xText
End Sub
To get the values calculated - ONLY A SAMPLE...
Private Sub Command2_Click()
    ' Fill the answer matrix: an = a * b for the 6x6 matrices held in the
    ' textbox control arrays. Cells are stored row-major: index = row*6+col,
    ' with rows and columns 0..5, matching the fully written-out version
    ' (an(0) = a(0)*b(0) + a(1)*b(6) + a(2)*b(12) + ...).
    ' The original sample repeated the "+ 6" offset for every B term
    ' (instead of stepping +12, +18, +24, +30) and its index bookkeeping
    ' overflowed; the three nested loops below compute all 36 cells.
    Dim xRow As Integer, xCol As Integer, xK As Integer
    Dim xSum As Double
    For xRow = 0 To 5
        For xCol = 0 To 5
            xSum = 0
            ' an(r,c) = sum over k of a(r,k) * b(k,c)
            For xK = 0 To 5
                xSum = xSum + Val(a(xRow * 6 + xK).Text) * Val(b(xK * 6 + xCol).Text)
            Next xK
            an(xRow * 6 + xCol).Text = xSum
        Next xCol
    Next xRow
End Sub
Hope this helps...
This isn't strictly speaking a coding question as I'm in charge of a spreadsheet rather than code, but the same principles apply.
I'm trying to create a piece of my spreadsheet that is an "average predictor". As an example: say a batsman has an average of 24 from 40 innings (in other words, has scored 960 runs). If he consistently performs at an average of 40 runs per innings from here on in, how many innings will it take for him to raise his career average to 30?
It's pretty easy to work this example out by hand, and just as easy to solve the general problem with a while loop. However, as mentioned, I can't use loops. Any ideas or suggestions?
You don't need a loop for this purpose. You can solve it by using the following formula (moving average):
(current_avg * current_innings + avg * x)/(current_innings + x) = goal_avg
You have to solve the equation for x.
Your example calculated on Wolfram Alpha:
Input: (24 * 40 + 40 * x)/(40 + x) = 30 solve x
Result: x=24
Link
As recommended by #StriplingWarrior in a comment on the question, write out the general equation, solve it algebraically, and use the resulting formula in your spreadsheet. The raw equation is given in the prior answer by #trylimits. I'm using slightly different identifiers to bring out the symmetry of the problem:
(old_avg * old_innings + new_avg * new_innings)/(old_innings + new_innings)
= goal_avg
old_avg * old_innings + new_avg * new_innings
= goal_avg * (old_innings + new_innings)
old_avg * old_innings + new_avg * new_innings
= goal_avg * old_innings + goal_avg * new_innings
new_avg * new_innings - goal_avg * new_innings
= goal_avg * old_innings - old_avg * old_innings
new_innings * (new_avg - goal_avg) = old_innings * (goal_avg - old_avg)
If new_avg - goal_avg is zero there is no solution unless goal_avg - old_avg is also zero, in which case no change is required and new_innings can be zero. If new_avg - goal_avg is non-zero:
new_innings = old_innings * (goal_avg - old_avg) / (new_avg - goal_avg)
The right hand side of this equation is a formula you could put in a spreadsheet.
The values from the example are:
old_avg = 24
old_innings = 40
new_avg = 40
goal_avg = 30
new_innings = 40 * (30 - 24) / (40 - 30)
= 240 / 10
= 24
I have a simple function of time. If I take the first derivative and set it equal to 0.0 I get the extrema of that function. I'm not interested in a numerical result, I want an expresion involving time and the rest of the parameters in my equations.
Solve[] gives Solve::nsmet: This system cannot be solved with the methods available to Solve.
How do I get an expression by setting the first derivative equal to 0.0? My code is below. (I've solved this on paper. I want Mathematica to give some plots and typeset the equations.)
Related question: The step that computes the derivative gives a result that includes this expression: Sqr'[-p1x + p2x - t v1x + t v2x]. What is Sqr', the Sqr function followed by an apostrophe?
Thanks.
(* Positions at time t of two points moving with constant velocity. *)
p1tx = p1x + v1x*t
p1ty = p1y + v1y*t
(* NOTE(review): p1xt/p1yt are transposed names (p1tx/p1ty were defined
   above), so p1t sums two undefined symbols; p1t is never used below. *)
p1t = p1xt + p1yt
p2tx = p2x + v2x*t
p2ty = p2y + v2y*t
(* NOTE(review): same transposition here; p2t is also unused. *)
p2t = p2xt + p2yt
(* Distance between the points. NOTE(review): Sqr is not a Mathematica
   built-in; because it is undefined, D[] leaves symbolic Sqr' terms and
   Solve reports nsmet. Squaring is written x^2 (Sqrt is the square root),
   which answers the "what is Sqr'" question in the surrounding text. *)
dt = Sqrt[Sqr[p2tx - p1tx] + Sqr[p2ty - p1ty]]
firstDeriv = D[dt,t]
Solve[firstDeriv==0,t]
What is Sqr', the Sqr function followed by an apostrophe
I think it is just a typo dt = Sqrt[Sqrt[p2tx - p1tx] + Sqrt[p2ty - p1ty]]
{{t -> (-p1y v1x^2 + p2y v1x^2 + p1x v1y^2 - p2x v1y^2 +
2 p1y v1x v2x - 2 p2y v1x v2x - p1y v2x^2 + p2y v2x^2 -
2 p1x v1y v2y + 2 p2x v1y v2y + p1x v2y^2 -
p2x v2y^2)/((v1y - v2y) (v1x^2 - v1x v1y - 2 v1x v2x + v1y v2x +
v2x^2 + v1x v2y - v2x v2y))}}
ps: I've solved this on paper So, how does your equation for derivative look like?
I'm having a hard time figuring out why my feed forward artificial neural networks almost always fail to learn my simple "OR" perceptron:
0 or 0 : 0
1 or 0 : 1
0 or 1 : 1
1 or 1 : 1
I tried various network configurations : 1 or 2 hidden layers, 2 to 8 neurons on hidden layers, a bias (value = 1, connected to each hidden neuron and output neuron)
For this example, the network looks like:
[1] [3]
[2] [4] [6]
[5]
input layer : [1] and [2]
hidden layer : [3] and [4] and [5]
output layer : [6]
Notations:
Neuron i : [i]
Weight between [i] and [j] : w(i,j)
[i] value : v(i)
[i] error: e(i)
sig(t) : 1/(1 + e^-t)
Weights initialization
Each weight is set to a random value between -0.5 and 0.5
Input
v(1) and v(2) are set with random values : 0 or 1
Propagate value (from left to right) :
v(3) = sig(v(1) * w(1,3) + v(2) * w(2,3))
v(4) = sig(v(1) * w(1,4) + v(2) * w(2,4))
v(5) = sig(v(1) * w(1,5) + v(2) * w(2,5))
v(6) = sig(v(3) * w(3,6) + v(4) * w(4,6) + v(5) * w(5,6))
expected output : 1 if (v(1) or v(2)) is true, 0 otherwise
e(6) = (expected - v(6)) * v(6) * (1 - v(6))
Propagate error (from right to left)
e(3) = e(6) * w(3,6) * v(3) * (1 - v(3))
e(4) = e(6) * w(4,6) * v(4) * (1 - v(4))
e(5) = e(6) * w(5,6) * v(5) * (1 - v(5))
Update weights (learning rate = 1)
w(1,3) = w(1,3) + v(1) * e(3)
w(1,4) = w(1,4) + v(1) * e(4)
w(1,5) = w(1,5) + v(1) * e(5)
w(2,3) = w(2,3) + v(2) * e(3)
w(2,4) = w(2,4) + v(2) * e(4)
w(2,5) = w(2,5) + v(2) * e(5)
w(3,6) = w(3,6) + v(3) * e(6)
w(4,6) = w(4,6) + v(4) * e(6)
w(5,6) = w(5,6) + v(5) * e(6)
Do it 300 times and print each result:
if v(6) > 0.5, it’s a True
if v(6) < 0.5, it’s a False
compare with expected output
After a few epochs, the network almost always returns v(6) > 0.9, even for v(1) = v(2) = 0
Sometimes (roughly 1 out of 20 times), it works, the network learned correctly
What am I doing wrong ?
EDIT:
I found it !
The culprits was :
e(3) = e(6) * w(3,6) * v(3) * (1 - v(3))
e(4) = e(6) * w(4,6) * v(4) * (1 - v(4))
e(5) = e(6) * w(5,6) * v(5) * (1 - v(5))
Which should have been :
e(3) = e(6) * w(3,6)
e(4) = e(6) * w(4,6)
e(5) = e(6) * w(5,6)