I'd like to better understand why two very similar pieces of code seem to perform dramatically differently on my computer.
These tests are on a Ryzen processor, with gcc-trunk and Julia 0.7-alpha (LLVM 6.0).
gcc-8 appears similar, while Julia 0.6.3 (LLVM 3.9) is slightly slower than v0.7.
I wrote generated functions (think C++ templates) that produce unrolled code for matrix operations, as well as a simple transpiler that can translate uncomplicated code to Fortran.
For 8x8 matrix multiplication, here is what the Fortran code looks like:
!> Fully unrolled 8x8 matrix product on flat 64-element arrays.
!! The index pattern below treats each array as an 8x8 matrix stored
!! column by column: element (i, j) lives at flat index i + 8*(j - 1).
module mul8mod
implicit none
contains
!> Computes C = A * B, writing all 64 elements of C.
!! A, B : inputs, 64 real(8) values each (read only).
!! C    : output, 64 real(8) values (every element is assigned).
!! Each C element is an 8-term sum split over two statements: the first
!! statement adds terms k = 1..4, the second accumulates terms k = 5..8.
subroutine mul8x8(A, B, C)
real(8), dimension(64), intent(in) :: A, B
real(8), dimension(64), intent(out) :: C
! Column 1 of C: uses B(1:8).
C(1) = A(1) * B(1) + A(9) * B(2) + A(17) * B(3) + A(25) * B(4)
C(1) = C(1) + A(33) * B(5) + A(41) * B(6) + A(49) * B(7) + A(57) * B(8)
C(2) = A(2) * B(1) + A(10) * B(2) + A(18) * B(3) + A(26) * B(4)
C(2) = C(2) + A(34) * B(5) + A(42) * B(6) + A(50) * B(7) + A(58) * B(8)
C(3) = A(3) * B(1) + A(11) * B(2) + A(19) * B(3) + A(27) * B(4)
C(3) = C(3) + A(35) * B(5) + A(43) * B(6) + A(51) * B(7) + A(59) * B(8)
C(4) = A(4) * B(1) + A(12) * B(2) + A(20) * B(3) + A(28) * B(4)
C(4) = C(4) + A(36) * B(5) + A(44) * B(6) + A(52) * B(7) + A(60) * B(8)
C(5) = A(5) * B(1) + A(13) * B(2) + A(21) * B(3) + A(29) * B(4)
C(5) = C(5) + A(37) * B(5) + A(45) * B(6) + A(53) * B(7) + A(61) * B(8)
C(6) = A(6) * B(1) + A(14) * B(2) + A(22) * B(3) + A(30) * B(4)
C(6) = C(6) + A(38) * B(5) + A(46) * B(6) + A(54) * B(7) + A(62) * B(8)
C(7) = A(7) * B(1) + A(15) * B(2) + A(23) * B(3) + A(31) * B(4)
C(7) = C(7) + A(39) * B(5) + A(47) * B(6) + A(55) * B(7) + A(63) * B(8)
C(8) = A(8) * B(1) + A(16) * B(2) + A(24) * B(3) + A(32) * B(4)
C(8) = C(8) + A(40) * B(5) + A(48) * B(6) + A(56) * B(7) + A(64) * B(8)
! Column 2 of C: uses B(9:16).
C(9) = A(1) * B(9) + A(9) * B(10) + A(17) * B(11) + A(25) * B(12)
C(9) = C(9) + A(33) * B(13) + A(41) * B(14) + A(49) * B(15) + A(57) * B(16)
C(10) = A(2) * B(9) + A(10) * B(10) + A(18) * B(11) + A(26) * B(12)
C(10) = C(10) + A(34) * B(13) + A(42) * B(14) + A(50) * B(15) + A(58) * B(16)
C(11) = A(3) * B(9) + A(11) * B(10) + A(19) * B(11) + A(27) * B(12)
C(11) = C(11) + A(35) * B(13) + A(43) * B(14) + A(51) * B(15) + A(59) * B(16)
C(12) = A(4) * B(9) + A(12) * B(10) + A(20) * B(11) + A(28) * B(12)
C(12) = C(12) + A(36) * B(13) + A(44) * B(14) + A(52) * B(15) + A(60) * B(16)
C(13) = A(5) * B(9) + A(13) * B(10) + A(21) * B(11) + A(29) * B(12)
C(13) = C(13) + A(37) * B(13) + A(45) * B(14) + A(53) * B(15) + A(61) * B(16)
C(14) = A(6) * B(9) + A(14) * B(10) + A(22) * B(11) + A(30) * B(12)
C(14) = C(14) + A(38) * B(13) + A(46) * B(14) + A(54) * B(15) + A(62) * B(16)
C(15) = A(7) * B(9) + A(15) * B(10) + A(23) * B(11) + A(31) * B(12)
C(15) = C(15) + A(39) * B(13) + A(47) * B(14) + A(55) * B(15) + A(63) * B(16)
C(16) = A(8) * B(9) + A(16) * B(10) + A(24) * B(11) + A(32) * B(12)
C(16) = C(16) + A(40) * B(13) + A(48) * B(14) + A(56) * B(15) + A(64) * B(16)
! Column 3 of C: uses B(17:24).
C(17) = A(1) * B(17) + A(9) * B(18) + A(17) * B(19) + A(25) * B(20)
C(17) = C(17) + A(33) * B(21) + A(41) * B(22) + A(49) * B(23) + A(57) * B(24)
C(18) = A(2) * B(17) + A(10) * B(18) + A(18) * B(19) + A(26) * B(20)
C(18) = C(18) + A(34) * B(21) + A(42) * B(22) + A(50) * B(23) + A(58) * B(24)
C(19) = A(3) * B(17) + A(11) * B(18) + A(19) * B(19) + A(27) * B(20)
C(19) = C(19) + A(35) * B(21) + A(43) * B(22) + A(51) * B(23) + A(59) * B(24)
C(20) = A(4) * B(17) + A(12) * B(18) + A(20) * B(19) + A(28) * B(20)
C(20) = C(20) + A(36) * B(21) + A(44) * B(22) + A(52) * B(23) + A(60) * B(24)
C(21) = A(5) * B(17) + A(13) * B(18) + A(21) * B(19) + A(29) * B(20)
C(21) = C(21) + A(37) * B(21) + A(45) * B(22) + A(53) * B(23) + A(61) * B(24)
C(22) = A(6) * B(17) + A(14) * B(18) + A(22) * B(19) + A(30) * B(20)
C(22) = C(22) + A(38) * B(21) + A(46) * B(22) + A(54) * B(23) + A(62) * B(24)
C(23) = A(7) * B(17) + A(15) * B(18) + A(23) * B(19) + A(31) * B(20)
C(23) = C(23) + A(39) * B(21) + A(47) * B(22) + A(55) * B(23) + A(63) * B(24)
C(24) = A(8) * B(17) + A(16) * B(18) + A(24) * B(19) + A(32) * B(20)
C(24) = C(24) + A(40) * B(21) + A(48) * B(22) + A(56) * B(23) + A(64) * B(24)
! Column 4 of C: uses B(25:32).
C(25) = A(1) * B(25) + A(9) * B(26) + A(17) * B(27) + A(25) * B(28)
C(25) = C(25) + A(33) * B(29) + A(41) * B(30) + A(49) * B(31) + A(57) * B(32)
C(26) = A(2) * B(25) + A(10) * B(26) + A(18) * B(27) + A(26) * B(28)
C(26) = C(26) + A(34) * B(29) + A(42) * B(30) + A(50) * B(31) + A(58) * B(32)
C(27) = A(3) * B(25) + A(11) * B(26) + A(19) * B(27) + A(27) * B(28)
C(27) = C(27) + A(35) * B(29) + A(43) * B(30) + A(51) * B(31) + A(59) * B(32)
C(28) = A(4) * B(25) + A(12) * B(26) + A(20) * B(27) + A(28) * B(28)
C(28) = C(28) + A(36) * B(29) + A(44) * B(30) + A(52) * B(31) + A(60) * B(32)
C(29) = A(5) * B(25) + A(13) * B(26) + A(21) * B(27) + A(29) * B(28)
C(29) = C(29) + A(37) * B(29) + A(45) * B(30) + A(53) * B(31) + A(61) * B(32)
C(30) = A(6) * B(25) + A(14) * B(26) + A(22) * B(27) + A(30) * B(28)
C(30) = C(30) + A(38) * B(29) + A(46) * B(30) + A(54) * B(31) + A(62) * B(32)
C(31) = A(7) * B(25) + A(15) * B(26) + A(23) * B(27) + A(31) * B(28)
C(31) = C(31) + A(39) * B(29) + A(47) * B(30) + A(55) * B(31) + A(63) * B(32)
C(32) = A(8) * B(25) + A(16) * B(26) + A(24) * B(27) + A(32) * B(28)
C(32) = C(32) + A(40) * B(29) + A(48) * B(30) + A(56) * B(31) + A(64) * B(32)
! Column 5 of C: uses B(33:40).
C(33) = A(1) * B(33) + A(9) * B(34) + A(17) * B(35) + A(25) * B(36)
C(33) = C(33) + A(33) * B(37) + A(41) * B(38) + A(49) * B(39) + A(57) * B(40)
C(34) = A(2) * B(33) + A(10) * B(34) + A(18) * B(35) + A(26) * B(36)
C(34) = C(34) + A(34) * B(37) + A(42) * B(38) + A(50) * B(39) + A(58) * B(40)
C(35) = A(3) * B(33) + A(11) * B(34) + A(19) * B(35) + A(27) * B(36)
C(35) = C(35) + A(35) * B(37) + A(43) * B(38) + A(51) * B(39) + A(59) * B(40)
C(36) = A(4) * B(33) + A(12) * B(34) + A(20) * B(35) + A(28) * B(36)
C(36) = C(36) + A(36) * B(37) + A(44) * B(38) + A(52) * B(39) + A(60) * B(40)
C(37) = A(5) * B(33) + A(13) * B(34) + A(21) * B(35) + A(29) * B(36)
C(37) = C(37) + A(37) * B(37) + A(45) * B(38) + A(53) * B(39) + A(61) * B(40)
C(38) = A(6) * B(33) + A(14) * B(34) + A(22) * B(35) + A(30) * B(36)
C(38) = C(38) + A(38) * B(37) + A(46) * B(38) + A(54) * B(39) + A(62) * B(40)
C(39) = A(7) * B(33) + A(15) * B(34) + A(23) * B(35) + A(31) * B(36)
C(39) = C(39) + A(39) * B(37) + A(47) * B(38) + A(55) * B(39) + A(63) * B(40)
C(40) = A(8) * B(33) + A(16) * B(34) + A(24) * B(35) + A(32) * B(36)
C(40) = C(40) + A(40) * B(37) + A(48) * B(38) + A(56) * B(39) + A(64) * B(40)
! Column 6 of C: uses B(41:48).
C(41) = A(1) * B(41) + A(9) * B(42) + A(17) * B(43) + A(25) * B(44)
C(41) = C(41) + A(33) * B(45) + A(41) * B(46) + A(49) * B(47) + A(57) * B(48)
C(42) = A(2) * B(41) + A(10) * B(42) + A(18) * B(43) + A(26) * B(44)
C(42) = C(42) + A(34) * B(45) + A(42) * B(46) + A(50) * B(47) + A(58) * B(48)
C(43) = A(3) * B(41) + A(11) * B(42) + A(19) * B(43) + A(27) * B(44)
C(43) = C(43) + A(35) * B(45) + A(43) * B(46) + A(51) * B(47) + A(59) * B(48)
C(44) = A(4) * B(41) + A(12) * B(42) + A(20) * B(43) + A(28) * B(44)
C(44) = C(44) + A(36) * B(45) + A(44) * B(46) + A(52) * B(47) + A(60) * B(48)
C(45) = A(5) * B(41) + A(13) * B(42) + A(21) * B(43) + A(29) * B(44)
C(45) = C(45) + A(37) * B(45) + A(45) * B(46) + A(53) * B(47) + A(61) * B(48)
C(46) = A(6) * B(41) + A(14) * B(42) + A(22) * B(43) + A(30) * B(44)
C(46) = C(46) + A(38) * B(45) + A(46) * B(46) + A(54) * B(47) + A(62) * B(48)
C(47) = A(7) * B(41) + A(15) * B(42) + A(23) * B(43) + A(31) * B(44)
C(47) = C(47) + A(39) * B(45) + A(47) * B(46) + A(55) * B(47) + A(63) * B(48)
C(48) = A(8) * B(41) + A(16) * B(42) + A(24) * B(43) + A(32) * B(44)
C(48) = C(48) + A(40) * B(45) + A(48) * B(46) + A(56) * B(47) + A(64) * B(48)
! Column 7 of C: uses B(49:56).
C(49) = A(1) * B(49) + A(9) * B(50) + A(17) * B(51) + A(25) * B(52)
C(49) = C(49) + A(33) * B(53) + A(41) * B(54) + A(49) * B(55) + A(57) * B(56)
C(50) = A(2) * B(49) + A(10) * B(50) + A(18) * B(51) + A(26) * B(52)
C(50) = C(50) + A(34) * B(53) + A(42) * B(54) + A(50) * B(55) + A(58) * B(56)
C(51) = A(3) * B(49) + A(11) * B(50) + A(19) * B(51) + A(27) * B(52)
C(51) = C(51) + A(35) * B(53) + A(43) * B(54) + A(51) * B(55) + A(59) * B(56)
C(52) = A(4) * B(49) + A(12) * B(50) + A(20) * B(51) + A(28) * B(52)
C(52) = C(52) + A(36) * B(53) + A(44) * B(54) + A(52) * B(55) + A(60) * B(56)
C(53) = A(5) * B(49) + A(13) * B(50) + A(21) * B(51) + A(29) * B(52)
C(53) = C(53) + A(37) * B(53) + A(45) * B(54) + A(53) * B(55) + A(61) * B(56)
C(54) = A(6) * B(49) + A(14) * B(50) + A(22) * B(51) + A(30) * B(52)
C(54) = C(54) + A(38) * B(53) + A(46) * B(54) + A(54) * B(55) + A(62) * B(56)
C(55) = A(7) * B(49) + A(15) * B(50) + A(23) * B(51) + A(31) * B(52)
C(55) = C(55) + A(39) * B(53) + A(47) * B(54) + A(55) * B(55) + A(63) * B(56)
C(56) = A(8) * B(49) + A(16) * B(50) + A(24) * B(51) + A(32) * B(52)
C(56) = C(56) + A(40) * B(53) + A(48) * B(54) + A(56) * B(55) + A(64) * B(56)
! Column 8 of C: uses B(57:64).
C(57) = A(1) * B(57) + A(9) * B(58) + A(17) * B(59) + A(25) * B(60)
C(57) = C(57) + A(33) * B(61) + A(41) * B(62) + A(49) * B(63) + A(57) * B(64)
C(58) = A(2) * B(57) + A(10) * B(58) + A(18) * B(59) + A(26) * B(60)
C(58) = C(58) + A(34) * B(61) + A(42) * B(62) + A(50) * B(63) + A(58) * B(64)
C(59) = A(3) * B(57) + A(11) * B(58) + A(19) * B(59) + A(27) * B(60)
C(59) = C(59) + A(35) * B(61) + A(43) * B(62) + A(51) * B(63) + A(59) * B(64)
C(60) = A(4) * B(57) + A(12) * B(58) + A(20) * B(59) + A(28) * B(60)
C(60) = C(60) + A(36) * B(61) + A(44) * B(62) + A(52) * B(63) + A(60) * B(64)
C(61) = A(5) * B(57) + A(13) * B(58) + A(21) * B(59) + A(29) * B(60)
C(61) = C(61) + A(37) * B(61) + A(45) * B(62) + A(53) * B(63) + A(61) * B(64)
C(62) = A(6) * B(57) + A(14) * B(58) + A(22) * B(59) + A(30) * B(60)
C(62) = C(62) + A(38) * B(61) + A(46) * B(62) + A(54) * B(63) + A(62) * B(64)
C(63) = A(7) * B(57) + A(15) * B(58) + A(23) * B(59) + A(31) * B(60)
C(63) = C(63) + A(39) * B(61) + A(47) * B(62) + A(55) * B(63) + A(63) * B(64)
C(64) = A(8) * B(57) + A(16) * B(58) + A(24) * B(59) + A(32) * B(60)
C(64) = C(64) + A(40) * B(61) + A(48) * B(62) + A(56) * B(63) + A(64) * B(64)
end subroutine mul8x8
end module mul8mod
The Julia code looks similar, but I first extract all the elements of the inputs, work on the scalars, and then insert them. I found that that works better in Julia, but worse in Fortran.
The expression looks super simple, like there should be no issue vectorizing it. Julia does so beautifully. Updating an 8x8 matrix in place:
# Julia benchmark; using YMM vectors
#benchmark mul!($c8, $a8, $b8)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 57.059 ns (0.00% GC)
median time: 58.901 ns (0.00% GC)
mean time: 59.522 ns (0.00% GC)
maximum time: 83.196 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 984
This works well.
Compiling the Fortran code with:
gfortran-trunk -march=native -Ofast -mprefer-vector-width=256 -shared -fPIC mul8module1.F08 -o libmul8mod1v15.so
Benchmark results:
# gfortran, using XMM vectors; code was unrolled 8x8 matrix multiplication
#benchmark mul8v15!($c8, $a8, $b8)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 122.175 ns (0.00% GC)
median time: 128.373 ns (0.00% GC)
mean time: 128.643 ns (0.00% GC)
maximum time: 194.090 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 905
Takes about twice as long. Looking at the assembly with -S reveals it ignored my -mprefer-vector-width=256, and used xmm registers instead.
This is also more or less what I get in Julia when I use pointers instead of arrays or mutable structs (when given pointers Julia assumes aliasing and compiles a slower version).
I tried a variety of variations on generating Fortran code (e.g., sum(va * vb) statements, where va and vb are 4-length vectors), but the simplest was just calling the intrinsic function matmul.
Compiling matmul (for known 8x8 size) without -mprefer-vector-width=256,
# gfortran using XMM vectors generated from intrinsic matmul function
#benchmark mul8v2v2!($c8, $a8, $b8)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 92.983 ns (0.00% GC)
median time: 96.366 ns (0.00% GC)
mean time: 97.651 ns (0.00% GC)
maximum time: 166.845 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 954
and compiling WITH it:
# gfortran using YMM vectors with intrinsic matmul
#benchmark mul8v2v1!($c8, $a8, $b8)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 163.667 ns (0.00% GC)
median time: 166.544 ns (0.00% GC)
mean time: 168.320 ns (0.00% GC)
maximum time: 277.291 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 780
The avx-free matmul looks really fast for only using xmm registers, but when coerced into ymm -- dreadful.
Any idea what's going on? I want to understand why when instructed to do the same thing, and generating extremely similar assembly, one is so dramatically faster than the other.
FWIW, the input data is 8 byte aligned. I tried 16 byte aligned inputs, and it didn't seem to make a real difference.
I took a look at the assembly produced by gfortran with (note, this is just the intrinsic matmul function):
gfortran-trunk -march=native -Ofast -mprefer-vector-width=256 -shared -fPIC -S mul8module2.F08 -o mul8mod2v1.s
and that from Julia/LLVM, gotten via @code_native mul!(c8, a8, b8) (the unrolled matrix multiplication).
I would be more than happy to share all the assembly or anything else if someone is willing to take a look, but I'd hit the character limit on this post if I included it here.
Both correctly used ymm registers, and lots of vfmadd__pd instructions, also with lots of vmovupd, vmulpd, and vmovapd.
The biggest difference I noticed is that while LLVM used lots of vbroadcastsd, gcc instead has piles of vunpcklpd and vpermpd instructions.
A brief sample; gcc:
vpermpd $216, %ymm7, %ymm7
vpermpd $216, %ymm2, %ymm2
vpermpd $216, %ymm3, %ymm3
vpermpd $216, %ymm5, %ymm5
vunpckhpd %ymm6, %ymm4, %ymm4
vunpcklpd %ymm7, %ymm2, %ymm6
vunpckhpd %ymm7, %ymm2, %ymm2
vunpcklpd %ymm5, %ymm3, %ymm7
vpermpd $216, %ymm15, %ymm15
vpermpd $216, %ymm4, %ymm4
vpermpd $216, %ymm0, %ymm0
vpermpd $216, %ymm1, %ymm1
vpermpd $216, %ymm6, %ymm6
vpermpd $216, %ymm7, %ymm7
vunpckhpd %ymm5, %ymm3, %ymm3
vunpcklpd %ymm15, %ymm0, %ymm5
vunpckhpd %ymm15, %ymm0, %ymm0
vunpcklpd %ymm4, %ymm1, %ymm15
vunpckhpd %ymm4, %ymm1, %ymm1
vunpcklpd %ymm7, %ymm6, %ymm4
vunpckhpd %ymm7, %ymm6, %ymm6
Julia/LLVM:
vbroadcastsd 8(%rax), %ymm3
vbroadcastsd 72(%rax), %ymm2
vbroadcastsd 136(%rax), %ymm12
vbroadcastsd 200(%rax), %ymm8
vbroadcastsd 264(%rax), %ymm10
vbroadcastsd 328(%rax), %ymm15
vbroadcastsd 392(%rax), %ymm14
vmulpd %ymm7, %ymm0, %ymm1
vmulpd %ymm11, %ymm0, %ymm0
vmovapd %ymm8, %ymm4
Could this explain the difference?
Why would gcc be so poorly optimized here?
Is there any way I can help it, so that it could generate code more comparable to LLVM?
Overall, gcc tends to outperform Clang in benchmarks (eg, on Phoronix)... Maybe I could try Flang (LLVM backend to Fortran), as well as Eigen (with g++ and clang++).
To reproduce, the matmul intrinsic function:
!> Reference 8x8 product that delegates to the intrinsic matmul.
module mul8mod
   implicit none
contains
   !> Stores the matrix product of A and B in C.
   !! A, B : 8x8 real(8) inputs (read only).
   !! C    : 8x8 real(8) output, fully overwritten.
   subroutine intrinsic_mul8x8(A, B, C)
      real(8), intent(in)  :: A(8, 8), B(8, 8)
      real(8), intent(out) :: C(8, 8)

      C = matmul(A, B)
   end subroutine intrinsic_mul8x8
end module mul8mod
Compiled as above, and Julia code to reproduce benchmarks:
#Pkg.clone("https://github.com/chriselrod/TriangularMatrices.jl")
using TriangularMatrices, BenchmarkTools, Compat
a8 = randmat(8); b8 = randmat(8); c8 = randmat(8);
import TriangularMatrices: mul!
@benchmark mul!($c8, $a8, $b8)
@code_native mul!(c8, a8, b8)
# after compiling into the shared library in libmul8mod2v1.so
# If compiled outside the working directory, replace pwd() accordingly
const libmul8path2v1 = joinpath(pwd(), "libmul8mod2v1.so")
# Calls the gfortran symbol `__mul8mod_MOD_intrinsic_mul8x8` from the shared
# library at `libmul8path2v1`, handing it the raw object pointers of A, B and C
# (in that order), and returns C.
function mul8v2v1!(C, A, B)
    ccall(
        (:__mul8mod_MOD_intrinsic_mul8x8, libmul8path2v1),
        Cvoid,
        (Ptr{Cvoid}, Ptr{Cvoid}, Ptr{Cvoid}),
        pointer_from_objref(A), pointer_from_objref(B), pointer_from_objref(C)
    )
    return C
end
@benchmark mul8v2v1!($c8, $a8, $b8)
EDIT:
Thanks for the responses everyone!
Because I noticed that the code with the broadcasts is dramatically faster, I decided to rewrite my code-generator to encourage broadcasting.
Generated code now looks more like this:
C[1] = B[1] * A[1]
C[2] = B[1] * A[2]
C[3] = B[1] * A[3]
C[4] = B[1] * A[4]
C[5] = B[1] * A[5]
C[6] = B[1] * A[6]
C[7] = B[1] * A[7]
C[8] = B[1] * A[8]
C[1] += B[2] * A[9]
C[2] += B[2] * A[10]
C[3] += B[2] * A[11]
C[4] += B[2] * A[12]
C[5] += B[2] * A[13]
C[6] += B[2] * A[14]
C[7] += B[2] * A[15]
C[8] += B[2] * A[16]
C[1] += B[3] * A[17]
...
I am intending for the compiler to broadcast B, and then use repeated vectorized fma instructions. Julia really liked this rewrite:
# Julia benchmark; using YMM vectors
#benchmark mul2!($c, $a, $b)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 45.156 ns (0.00% GC)
median time: 47.058 ns (0.00% GC)
mean time: 47.390 ns (0.00% GC)
maximum time: 62.066 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 990
Figuring it was llvm being smart, I also built Flang (Fortran frontend to llvm):
# compiled with
# flang -march=native -Ofast -mprefer-vector-width=256 -shared -fPIC mul8module6.f95 -o libmul8mod6v2.so
#benchmark mul8v6v2!($c, $a, $b)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 51.322 ns (0.00% GC)
median time: 52.791 ns (0.00% GC)
mean time: 52.944 ns (0.00% GC)
maximum time: 83.376 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 988
This is also really good.
gfortran still refused to use broadcasts, and was still slow.
I still have questions on how best to generate code. Encouraging broadcasts is obviously the way to go. Right now, I'm basically doing matrix * vector multiplication, and then repeating it for every column of B. So my written code loops over A once per column of B.
I do not know if that is what the compiler is actually doing, or if some other pattern would lead to faster code.
The point of optimizing multiplication of tiny matrices is as a kernel for a recursive algorithm for multiplying larger matrices.
So I also need to figure out the best way to handle different sizes.
This algorithm is far better for 8x8 than it is other sizes.
For the nrow(A) % 4 leftover rows (e.g., if A has 10 rows, 10 % 4 = 2), I used the old approach for the remainder, after the broadcastable block.
But for 10x10 matrices, it takes 151 ns.
12 is perfectly divisible by 4, but it takes 226.
If this approach scaled with O(n^3), the times should be 91 ns and 158 ns respectively. I am falling well short.
I think I need to block down to a very small size, and try and get as many 8x8 as possible.
It may be the case that 8x8 ought to be the maximum size.
This would be a good case for profiling and performance analysis using a low-level tool that can expose microarchitectural bottlenecks. While I have not used AMD μProf, my experience with Intel's equivalents like XTU suggests that you'll get the best possible results when using a tool written by someone working for the same company and maybe even sitting near the people responsible for the hardware implementation of Ryzen's AVX instructions.
Start with an event-based profile of your application when running through a large number of iterations. General areas you could look for are things like whether one or the other style of generated assembly makes better use of execution ports or related backend CPU resources, or whether they behave differently with respect to cache and memory accesses. None of that would answer your conceptual question of why gcc has chosen to generate assembly in one style and LLVM in another, but it might tell you more at a hardware level about why the LLVM-generated assembly runs faster.
I'm writing a program in VB6 that simulates the process of multiplying a 6x6 matrix by a 6x6 matrix. So far I have only written the code for the first and second rows. To be honest, I'm a beginner in programming, so I'm still figuring things out.
Can someone show how to simplify this code using do until loop so that I can also apply it for the remaining 3rd, 4th, 5th and 6th row?
Re:
I also added the code for rows 3rd, 4th, 5th and 6th. Pls do help me.
an(0).Text = Val(a(0).Text) * Val(b(0).Text) + Val(a(1).Text) * Val(b(6).Text) + Val(a(2).Text) * Val(b(12).Text) + Val(a(3).Text) * Val(b(18).Text) + Val(a(4).Text) * Val(b(24).Text) + Val(a(5).Text) * Val(b(30).Text)
an(1).Text = Val(a(0).Text) * Val(b(1).Text) + Val(a(1).Text) * Val(b(7).Text) + Val(a(2).Text) * Val(b(13).Text) + Val(a(3).Text) * Val(b(19).Text) + Val(a(4).Text) * Val(b(25).Text) + Val(a(5).Text) * Val(b(31).Text)
an(2).Text = Val(a(0).Text) * Val(b(2).Text) + Val(a(1).Text) * Val(b(8).Text) + Val(a(2).Text) * Val(b(14).Text) + Val(a(3).Text) * Val(b(20).Text) + Val(a(4).Text) * Val(b(26).Text) + Val(a(5).Text) * Val(b(32).Text)
an(3).Text = Val(a(0).Text) * Val(b(3).Text) + Val(a(1).Text) * Val(b(9).Text) + Val(a(2).Text) * Val(b(15).Text) + Val(a(3).Text) * Val(b(21).Text) + Val(a(4).Text) * Val(b(27).Text) + Val(a(5).Text) * Val(b(33).Text)
an(4).Text = Val(a(0).Text) * Val(b(4).Text) + Val(a(1).Text) * Val(b(10).Text) + Val(a(2).Text) * Val(b(16).Text) + Val(a(3).Text) * Val(b(22).Text) + Val(a(4).Text) * Val(b(28).Text) + Val(a(5).Text) * Val(b(34).Text)
an(5).Text = Val(a(0).Text) * Val(b(5).Text) + Val(a(1).Text) * Val(b(11).Text) + Val(a(2).Text) * Val(b(17).Text) + Val(a(3).Text) * Val(b(23).Text) + Val(a(4).Text) * Val(b(29).Text) + Val(a(5).Text) * Val(b(35).Text)
an(6).Text = Val(a(6).Text) * Val(b(0).Text) + Val(a(7).Text) * Val(b(6).Text) + Val(a(8).Text) * Val(b(12).Text) + Val(a(9).Text) * Val(b(18).Text) + Val(a(10).Text) * Val(b(24).Text) + Val(a(11).Text) * Val(b(30).Text)
an(7).Text = Val(a(6).Text) * Val(b(1).Text) + Val(a(7).Text) * Val(b(7).Text) + Val(a(8).Text) * Val(b(13).Text) + Val(a(9).Text) * Val(b(19).Text) + Val(a(10).Text) * Val(b(25).Text) + Val(a(11).Text) * Val(b(31).Text)
an(8).Text = Val(a(6).Text) * Val(b(2).Text) + Val(a(7).Text) * Val(b(8).Text) + Val(a(8).Text) * Val(b(14).Text) + Val(a(9).Text) * Val(b(20).Text) + Val(a(10).Text) * Val(b(26).Text) + Val(a(11).Text) * Val(b(32).Text)
an(9).Text = Val(a(6).Text) * Val(b(3).Text) + Val(a(7).Text) * Val(b(9).Text) + Val(a(8).Text) * Val(b(15).Text) + Val(a(9).Text) * Val(b(21).Text) + Val(a(10).Text) * Val(b(27).Text) + Val(a(11).Text) * Val(b(33).Text)
an(10).Text = Val(a(6).Text) * Val(b(4).Text) + Val(a(7).Text) * Val(b(10).Text) + Val(a(8).Text) * Val(b(16).Text) + Val(a(9).Text) * Val(b(22).Text) + Val(a(10).Text) * Val(b(28).Text) + Val(a(11).Text) * Val(b(34).Text)
an(11).Text = Val(a(6).Text) * Val(b(5).Text) + Val(a(7).Text) * Val(b(11).Text) + Val(a(8).Text) * Val(b(17).Text) + Val(a(9).Text) * Val(b(23).Text) + Val(a(10).Text) * Val(b(29).Text) + Val(a(11).Text) * Val(b(35).Text)
an(12).Text = Val(A(12).Text) * Val(B(0).Text) + Val(A(13).Text) * Val(B(6).Text) + Val(A(14).Text) * Val(B(12).Text) + Val(A(15).Text) * Val(B(18).Text) + Val(A(16).Text) * Val(B(24).Text) + Val(A(17).Text) * Val(B(30).Text)
an(13).Text = Val(A(12).Text) * Val(B(1).Text) + Val(A(13).Text) * Val(B(7).Text) + Val(A(14).Text) * Val(B(13).Text) + Val(A(15).Text) * Val(B(19).Text) + Val(A(16).Text) * Val(B(25).Text) + Val(A(17).Text) * Val(B(31).Text)
an(14).Text = Val(A(12).Text) * Val(B(2).Text) + Val(A(13).Text) * Val(B(8).Text) + Val(A(14).Text) * Val(B(14).Text) + Val(A(15).Text) * Val(B(20).Text) + Val(A(16).Text) * Val(B(26).Text) + Val(A(17).Text) * Val(B(32).Text)
an(15).Text = Val(A(12).Text) * Val(B(3).Text) + Val(A(13).Text) * Val(B(9).Text) + Val(A(14).Text) * Val(B(15).Text) + Val(A(15).Text) * Val(B(21).Text) + Val(A(16).Text) * Val(B(27).Text) + Val(A(17).Text) * Val(B(33).Text)
an(16).Text = Val(A(12).Text) * Val(B(4).Text) + Val(A(13).Text) * Val(B(10).Text) + Val(A(14).Text) * Val(B(16).Text) + Val(A(15).Text) * Val(B(22).Text) + Val(A(16).Text) * Val(B(28).Text) + Val(A(17).Text) * Val(B(34).Text)
an(17).Text = Val(A(12).Text) * Val(B(5).Text) + Val(A(13).Text) * Val(B(11).Text) + Val(A(14).Text) * Val(B(17).Text) + Val(A(15).Text) * Val(B(23).Text) + Val(A(16).Text) * Val(B(29).Text) + Val(A(17).Text) * Val(B(35).Text)
an(18).Text = Val(A(18).Text) * Val(B(0).Text) + Val(A(19).Text) * Val(B(6).Text) + Val(A(20).Text) * Val(B(12).Text) + Val(A(21).Text) * Val(B(18).Text) + Val(A(22).Text) * Val(B(24).Text) + Val(A(23).Text) * Val(B(30).Text)
an(19).Text = Val(A(18).Text) * Val(B(1).Text) + Val(A(19).Text) * Val(B(7).Text) + Val(A(20).Text) * Val(B(13).Text) + Val(A(21).Text) * Val(B(19).Text) + Val(A(22).Text) * Val(B(25).Text) + Val(A(23).Text) * Val(B(31).Text)
an(20).Text = Val(A(18).Text) * Val(B(2).Text) + Val(A(19).Text) * Val(B(8).Text) + Val(A(20).Text) * Val(B(14).Text) + Val(A(21).Text) * Val(B(20).Text) + Val(A(22).Text) * Val(B(26).Text) + Val(A(23).Text) * Val(B(32).Text)
an(21).Text = Val(A(18).Text) * Val(B(3).Text) + Val(A(19).Text) * Val(B(9).Text) + Val(A(20).Text) * Val(B(15).Text) + Val(A(21).Text) * Val(B(21).Text) + Val(A(22).Text) * Val(B(27).Text) + Val(A(23).Text) * Val(B(33).Text)
an(22).Text = Val(A(18).Text) * Val(B(4).Text) + Val(A(19).Text) * Val(B(10).Text) + Val(A(20).Text) * Val(B(16).Text) + Val(A(21).Text) * Val(B(22).Text) + Val(A(22).Text) * Val(B(28).Text) + Val(A(23).Text) * Val(B(34).Text)
an(23).Text = Val(A(18).Text) * Val(B(5).Text) + Val(A(19).Text) * Val(B(11).Text) + Val(A(20).Text) * Val(B(17).Text) + Val(A(21).Text) * Val(B(23).Text) + Val(A(22).Text) * Val(B(29).Text) + Val(A(23).Text) * Val(B(35).Text)
an(24).Text = Val(A(24).Text) * Val(B(0).Text) + Val(A(25).Text) * Val(B(6).Text) + Val(A(26).Text) * Val(B(12).Text) + Val(A(27).Text) * Val(B(18).Text) + Val(A(28).Text) * Val(B(24).Text) + Val(A(29).Text) * Val(B(30).Text)
an(25).Text = Val(A(24).Text) * Val(B(1).Text) + Val(A(25).Text) * Val(B(7).Text) + Val(A(26).Text) * Val(B(13).Text) + Val(A(27).Text) * Val(B(19).Text) + Val(A(28).Text) * Val(B(25).Text) + Val(A(29).Text) * Val(B(31).Text)
an(26).Text = Val(A(24).Text) * Val(B(2).Text) + Val(A(25).Text) * Val(B(8).Text) + Val(A(26).Text) * Val(B(14).Text) + Val(A(27).Text) * Val(B(20).Text) + Val(A(28).Text) * Val(B(26).Text) + Val(A(29).Text) * Val(B(32).Text)
an(27).Text = Val(A(24).Text) * Val(B(3).Text) + Val(A(25).Text) * Val(B(9).Text) + Val(A(26).Text) * Val(B(15).Text) + Val(A(27).Text) * Val(B(21).Text) + Val(A(28).Text) * Val(B(27).Text) + Val(A(29).Text) * Val(B(33).Text)
an(28).Text = Val(A(24).Text) * Val(B(4).Text) + Val(A(25).Text) * Val(B(10).Text) + Val(A(26).Text) * Val(B(16).Text) + Val(A(27).Text) * Val(B(22).Text) + Val(A(28).Text) * Val(B(28).Text) + Val(A(29).Text) * Val(B(34).Text)
an(29).Text = Val(A(24).Text) * Val(B(5).Text) + Val(A(25).Text) * Val(B(11).Text) + Val(A(26).Text) * Val(B(17).Text) + Val(A(27).Text) * Val(B(23).Text) + Val(A(28).Text) * Val(B(29).Text) + Val(A(29).Text) * Val(B(35).Text)
an(30).Text = Val(A(30).Text) * Val(B(0).Text) + Val(A(31).Text) * Val(B(6).Text) + Val(A(32).Text) * Val(B(12).Text) + Val(A(33).Text) * Val(B(18).Text) + Val(A(34).Text) * Val(B(24).Text) + Val(A(35).Text) * Val(B(30).Text)
an(31).Text = Val(A(30).Text) * Val(B(1).Text) + Val(A(31).Text) * Val(B(7).Text) + Val(A(32).Text) * Val(B(13).Text) + Val(A(33).Text) * Val(B(19).Text) + Val(A(34).Text) * Val(B(25).Text) + Val(A(35).Text) * Val(B(31).Text)
an(32).Text = Val(A(30).Text) * Val(B(2).Text) + Val(A(31).Text) * Val(B(8).Text) + Val(A(32).Text) * Val(B(14).Text) + Val(A(33).Text) * Val(B(20).Text) + Val(A(34).Text) * Val(B(26).Text) + Val(A(35).Text) * Val(B(32).Text)
an(33).Text = Val(A(30).Text) * Val(B(3).Text) + Val(A(31).Text) * Val(B(9).Text) + Val(A(32).Text) * Val(B(15).Text) + Val(A(33).Text) * Val(B(21).Text) + Val(A(34).Text) * Val(B(27).Text) + Val(A(35).Text) * Val(B(33).Text)
an(34).Text = Val(A(30).Text) * Val(B(4).Text) + Val(A(31).Text) * Val(B(10).Text) + Val(A(32).Text) * Val(B(16).Text) + Val(A(33).Text) * Val(B(22).Text) + Val(A(34).Text) * Val(B(28).Text) + Val(A(35).Text) * Val(B(34).Text)
an(35).Text = Val(A(30).Text) * Val(B(5).Text) + Val(A(31).Text) * Val(B(11).Text) + Val(A(32).Text) * Val(B(17).Text) + Val(A(33).Text) * Val(B(23).Text) + Val(A(34).Text) * Val(B(29).Text) + Val(A(35).Text) * Val(B(35).Text)
A Quick sample below, you need to tweak it much more until you end up with 3 loops inside each other.
To create the textboxes first -
Option Explicit
Private Sub Command1_Click()
    ' Creates the remaining text boxes of the a, b and an control arrays at
    ' run time. Index 0 of each array is the design-time template, so a 6x6
    ' matrix (36 cells, indices 0..35) only needs indices 1..35 loaded.
    ' Each new box is placed 360 twips below the previous one; boxes in a and
    ' b are pre-filled with the previous box's value plus one.
    Const LAST_INDEX As Integer = 35
    Dim xText As Integer

    For xText = 1 To LAST_INDEX
        Load a(xText)
        Load b(xText)
        Load an(xText)

        a(xText).Move a(0).Left, a(xText - 1).Top + 360, a(xText - 1).Width, 285
        a(xText).Visible = True
        ' Val() makes an empty template box count as 0 instead of raising a
        ' type-mismatch error on the addition.
        a(xText).Text = Val(a(xText - 1).Text) + 1

        b(xText).Move b(0).Left, b(xText - 1).Top + 360, b(xText - 1).Width, 285
        b(xText).Visible = True
        b(xText).Text = Val(b(xText - 1).Text) + 1

        an(xText).Move an(0).Left, an(xText - 1).Top + 360, an(xText - 1).Width, 285
        an(xText).Visible = True
    Next xText
End Sub
To get the values calculated - ONLY A SAMPLE...
Private Sub Command2_Click()
    ' Multiplies the two 6x6 matrices held in the text-box arrays a and b and
    ' writes the product into an. All three arrays are stored row by row, so
    ' cell (row, col) is at index row * 6 + col, and
    '   an(row * 6 + col) = sum over k of a(row * 6 + k) * b(k * 6 + col)
    ' which reproduces the hand-written formulas for an(0)..an(35).
    Const MATRIX_SIZE As Integer = 6
    Dim xRow As Integer, xCol As Integer, xTerm As Integer
    Dim xSumtotal As Double

    For xRow = 0 To MATRIX_SIZE - 1
        For xCol = 0 To MATRIX_SIZE - 1
            ' Dot product of row xRow of a with column xCol of b.
            xSumtotal = 0
            For xTerm = 0 To MATRIX_SIZE - 1
                xSumtotal = xSumtotal + _
                    Val(a(xRow * MATRIX_SIZE + xTerm).Text) * _
                    Val(b(xTerm * MATRIX_SIZE + xCol).Text)
            Next xTerm
            an(xRow * MATRIX_SIZE + xCol).Text = xSumtotal
        Next xCol
    Next xRow
End Sub
Hope this helps...
In cocos2d the gradient is implemented using a quad, and the color is interpolated by OpenGL, but there is an extra parameter which controls the gradient's direction, so how does the algorithm work?
//_alongVector is the gradient's direction
float h = _alongVector.getLength();
if (h == 0)
return;
// why a sqrt(2.0) ???
float c = sqrtf(2.0f);
Vec2 u(_alongVector.x / h, _alongVector.y / h);
// and what does this mean
if (_compressedInterpolation)
{
float h2 = 1 / ( fabsf(u.x) + fabsf(u.y) );
u = u * (h2 * (float)c);
}
float opacityf = (float)_displayedOpacity / 255.0f;
Color4F S(
_displayedColor.r / 255.0f,
_displayedColor.g / 255.0f,
_displayedColor.b / 255.0f,
_startOpacity * opacityf / 255.0f
);
Color4F E(
_endColor.r / 255.0f,
_endColor.g / 255.0f,
_endColor.b / 255.0f,
_endOpacity * opacityf / 255.0f
);
// what are these magic ??????
// (-1, -1)
_squareColors[0].r = E.r + (S.r - E.r) * ((c + u.x + u.y) / (2.0f * c));
_squareColors[0].g = E.g + (S.g - E.g) * ((c + u.x + u.y) / (2.0f * c));
_squareColors[0].b = E.b + (S.b - E.b) * ((c + u.x + u.y) / (2.0f * c));
_squareColors[0].a = E.a + (S.a - E.a) * ((c + u.x + u.y) / (2.0f * c));
// (1, -1)
_squareColors[1].r = E.r + (S.r - E.r) * ((c - u.x + u.y) / (2.0f * c));
_squareColors[1].g = E.g + (S.g - E.g) * ((c - u.x + u.y) / (2.0f * c));
_squareColors[1].b = E.b + (S.b - E.b) * ((c - u.x + u.y) / (2.0f * c));
_squareColors[1].a = E.a + (S.a - E.a) * ((c - u.x + u.y) / (2.0f * c));
// (-1, 1)
_squareColors[2].r = E.r + (S.r - E.r) * ((c + u.x - u.y) / (2.0f * c));
_squareColors[2].g = E.g + (S.g - E.g) * ((c + u.x - u.y) / (2.0f * c));
_squareColors[2].b = E.b + (S.b - E.b) * ((c + u.x - u.y) / (2.0f * c));
_squareColors[2].a = E.a + (S.a - E.a) * ((c + u.x - u.y) / (2.0f * c));
// (1, 1)
_squareColors[3].r = E.r + (S.r - E.r) * ((c - u.x - u.y) / (2.0f * c));
_squareColors[3].g = E.g + (S.g - E.g) * ((c - u.x - u.y) / (2.0f * c));
_squareColors[3].b = E.b + (S.b - E.b) * ((c - u.x - u.y) / (2.0f * c));
_squareColors[3].a = E.a + (S.a - E.a) * ((c - u.x - u.y) / (2.0f * c));
and here is another implementation in cocos2d-objc
// Computes the four vertex colors _colors[0..3] by blending a and b according to
// a per-vertex projection value g0..g3 derived from _vector.
// _vector apparently points towards the first color.
// NOTE(review): g1 is -_vector.x although its comment names (0, 1), and g3 is
// -_vector.y although its comment names (1, 0); the expressions are also the
// negated dot products. Confirm the vertex labels against the quad's vertex order.
float g0 = 0.0f; // (0, 0) dot _vector
float g1 = -_vector.x; // (0, 1) dot _vector
float g2 = -_vector.x - _vector.y; // (1, 1) dot _vector
float g3 = -_vector.y; // (1, 0) dot _vector
// Smallest and largest projection over the four vertices; used below to map
// each g into the range 0..1.
float gmin = MIN(MIN(g0, g1), MIN(g2, g3));
float gmax = MAX(MAX(g0, g1), MAX(g2, g3));
// a: r, g, b of _color each multiplied by _color.a and _displayColor.a; alpha is _color.a * _displayColor.a.
GLKVector4 a = GLKVector4Make(_color.r*_color.a*_displayColor.a, _color.g*_color.a*_displayColor.a, _color.b*_color.a*_displayColor.a, _color.a*_displayColor.a);
// b: the same construction for _endColor.
GLKVector4 b = GLKVector4Make(_endColor.r*_endColor.a*_displayColor.a, _endColor.g*_endColor.a*_displayColor.a, _endColor.b*_endColor.a*_displayColor.a, _endColor.a*_displayColor.a);
// The interpolation parameter is (g - gmin) / (gmax - gmin): the vertex with the
// smallest g gets a (parameter 0), the one with the largest g gets b (parameter 1).
// NOTE(review): when _vector is (0, 0) all g values are 0 and the quotient is 0/0.
_colors[0] = GLKVector4Lerp(a, b, (g0 - gmin)/(gmax - gmin));
_colors[1] = GLKVector4Lerp(a, b, (g1 - gmin)/(gmax - gmin));
_colors[2] = GLKVector4Lerp(a, b, (g2 - gmin)/(gmax - gmin));
_colors[3] = GLKVector4Lerp(a, b, (g3 - gmin)/(gmax - gmin));
are these algorithms the same?
I really want to know the math or algorithm behind those magic operations
I'm having a hard time figuring out why my feed forward artificial neural networks almost always fail to learn my simple "OR" perceptron:
0 or 0 : 0
1 or 0 : 1
0 or 1 : 1
1 or 1 : 1
I tried various network configurations : 1 or 2 hidden layers, 2 to 8 neurons on hidden layers, a bias (value = 1, connected to each hidden neuron and output neuron)
For this example, the network looks like:
[1] [3]
[2] [4] [6]
[5]
input layer : [1] and [2]
hidden layer : [3] and [4] and [5]
output layer : [6]
Notations:
Neuron i : [i]
Weight between [i] and [j] : w(i,j)
[i] value : v(i)
[i] error: e(i)
sig(t) : 1/(1 + e^-t)
Weights initialization
Each weight is set to a random value between -0.5 and 0.5
Input
v(1) and v(2) are set with random values : 0 or 1
Propagate value (from left to right) :
v(3) = sig(v(1) * w(1,3) + v(2) * w(2,3))
v(4) = sig(v(1) * w(1,4) + v(2) * w(2,4))
v(5) = sig(v(1) * w(1,5) + v(2) * w(2,5))
v(6) = sig(v(3) * w(3,6) + v(4) * w(4,6) + v(5) * w(5,6))
expected output : 1 if (v(1) or v(2)) is true, 0 otherwise
e(6) = (expected - v(6)) * v(6) * (1 - v(6))
Propagate error (from right to left)
e(3) = e(6) * w(3,6) * v(3) * (1 - v(3))
e(4) = e(6) * w(4,6) * v(4) * (1 - v(4))
e(5) = e(6) * w(5,6) * v(5) * (1 - v(5))
Update weights (learning rate = 1)
w(1,3) = w(1,3) + v(1) * e(3)
w(1,4) = w(1,4) + v(1) * e(4)
w(1,5) = w(1,5) + v(1) * e(5)
w(2,3) = w(2,3) + v(2) * e(3)
w(2,4) = w(2,4) + v(2) * e(4)
w(2,5) = w(2,5) + v(2) * e(5)
w(3,6) = w(3,6) + v(3) * e(6)
w(4,6) = w(4,6) + v(4) * e(6)
w(5,6) = w(5,6) + v(5) * e(6)
Do it 300 times and print each result:
if v(6) > 0.5, it’s a True
if v(6) < 0.5, it’s a False
compare with expected output
After a few epochs, the network almost always returns v(6) > 0.9, even for v(1) = v(2) = 0
Sometimes (roughly 1 out of 20 times) it works and the network learns correctly
What am I doing wrong ?
EDIT:
I found it !
The culprit was:
e(3) = e(6) * w(3,6) * v(3) * (1 - v(3))
e(4) = e(6) * w(4,6) * v(4) * (1 - v(4))
e(5) = e(6) * w(5,6) * v(5) * (1 - v(5))
Which should have been :
e(3) = e(6) * w(3,6)
e(4) = e(6) * w(4,6)
e(5) = e(6) * w(5,6)
I am using OpenMP to speed up the flux calculation in my program. I basically want OpenMP to carry out the left and right flux calculations in parallel. On the contrary, however, the following code takes even more time with the #pragma directives than without them. What do I need to modify to get it right?
// Computes the two split fluxes inside a two-thread parallel region:
// FluxP from the left-cell state (variables suffixed l) and FluxM from the
// right-cell state (variables suffixed r).
// NOTE(review): the formulas resemble Van Leer flux-vector splitting - confirm.
#pragma omp parallel num_threads(2)
{
// single: the block that follows is executed by only one thread of the team;
// no nowait clause is given, so the other threads wait at the implicit barrier
// at the end of the construct.
#pragma omp single
{//first condition
//cerr<<"Executed thread 0"<<endl;
// lcellMach >= 1 (or within EPSILON of 1): FluxP is assembled directly from the left state.
if ( (fabs(lcellMach-1.0)<EPSILON) || ( (lcellMach-1.0) > 0.0 ) ){//purpose of Epsilon!!!!
FluxP[0] = rhol * vnl;
FluxP[1] = rhol * ul * vnl + Pl*nx;
FluxP[2] = rhol * vl * vnl + Pl*ny;
FluxP[3] = rhol * wl * vnl + Pl*nz;
FluxP[4] = rhol * ((GAMMA * Pl / (rhol * (GAMMA-1.0))) + ((ul*ul + vl*vl + wl*wl)/2.0)) * vnl;
// lcellMach <= -1 (or within EPSILON of -1): every component of FluxP is zero.
}else if ( (fabs(lcellMach+1.0)<EPSILON) || ( (lcellMach+1.0) < 0.0 ) ){
FluxP[0] = FluxP[1] = FluxP[2] = FluxP[3] = FluxP[4] = 0.0;// If flow direction is opposite the Flux + is zero
// Remaining case, lcellMach strictly between -1 and 1: FluxP[0] is built from
// (lcellMach+1)^2 / 4 and then scales the other four components.
}else {
// ql is the squared magnitude of the left velocity (ul, vl, wl).
double ql = (ul*ul + vl*vl + wl*wl);// how did this come
FluxP[0] = rhol * lcell_a * (lcellMach+1.0)*(lcellMach+1.0) / 4.0;
FluxP[1] = FluxP[0] * ( ul + (nx*(0.0-vnl + 2.0*lcell_a)/GAMMA) );
FluxP[2] = FluxP[0] * ( vl + (ny*(0.0-vnl + 2.0*lcell_a)/GAMMA) );
FluxP[3] = FluxP[0] * ( wl + (nz*(0.0-vnl + 2.0*lcell_a)/GAMMA) );
FluxP[4] = FluxP[0] * ( ((ql - vnl*vnl)/2.0) + (((GAMMA-1.0)*vnl + 2.0*lcell_a)*((GAMMA-1.0)*vnl + 2.0*lcell_a) / (2.0*(GAMMA*GAMMA-1.0))) );
}
}//end of 1st
// Second single construct: it is only reached after the barrier that ends the
// first one, and again only one thread executes its body.
#pragma omp single
{//second condition
//cerr<<"Executed thread 1"<<endl;
// rcellMach <= -1 (or within EPSILON of -1): FluxM is assembled directly from the right state.
if ((fabs(rcellMach+1.0)<EPSILON) || ((rcellMach+1.0) < 0.0)) {
FluxM[0] = rhor * vnr;
FluxM[1] = rhor * ur * vnr + Pr*nx;
FluxM[2] = rhor * vr * vnr + Pr*ny;
FluxM[3] = rhor * wr * vnr + Pr*nz;
FluxM[4] = rhor * ((GAMMA * Pr / (rhor * (GAMMA-1.0))) + ((ur*ur + vr*vr + wr*wr)/2.0)) * vnr;
// rcellMach >= 1 (or within EPSILON of 1): every component of FluxM is zero.
}else if ((fabs(rcellMach-1.0)<EPSILON) || ((rcellMach-1.0) > 0.0)) {
FluxM[0] = FluxM[1] = FluxM[2] = FluxM[3] = FluxM[4] = 0.0;
// Remaining case: FluxM is obtained as (flux of the right state) minus
// (plus-part of the right state), component by component.
}else {
// tempFlux: same expressions as the first branch above, stored in a scratch array
// (tempFlux and tempFluxP are declared outside this fragment).
tempFlux[0] = rhor * vnr;
tempFlux[1] = rhor * ur * vnr + Pr*nx;
tempFlux[2] = rhor * vr * vnr + Pr*ny;
tempFlux[3] = rhor * wr * vnr + Pr*nz;
tempFlux[4] = rhor * ((GAMMA * Pr / (rhor * (GAMMA-1.0))) + ((ur*ur + vr*vr + wr*wr)/2.0)) * vnr;
// qr is the squared magnitude of the right velocity (ur, vr, wr).
double qr = (ur*ur + vr*vr + wr*wr);
// tempFluxP: the same (Mach+1)^2 / 4 formulas used for FluxP, evaluated with the right state.
tempFluxP[0] = rhor * rcell_a * (rcellMach+1.0)*(rcellMach+1.0) / 4.0;
tempFluxP[1] = tempFluxP[0] * ( ur + (nx*(0.0-vnr + 2.0*rcell_a)/GAMMA) );
tempFluxP[2] = tempFluxP[0] * ( vr + (ny*(0.0-vnr + 2.0*rcell_a)/GAMMA) );
tempFluxP[3] = tempFluxP[0] * ( wr + (nz*(0.0-vnr + 2.0*rcell_a)/GAMMA) );
tempFluxP[4] = tempFluxP[0] * ( ((qr - vnr*vnr)/2.0) + (((GAMMA-1.0)*vnr + 2.0*rcell_a)*((GAMMA-1.0)*vnr + 2.0*rcell_a) / (2.0*(GAMMA*GAMMA-1.0))) );
// O is defined outside this fragment; presumably the number of flux components (5) - confirm.
for (int j=0; j<O; j++) FluxM[j] = tempFlux[j] - tempFluxP[j];
}
}
}//pragma
Urgent help required. Thanks.
What you need is the sections construct:
#pragma omp parallel sections num_threads(2)
{
#pragma omp section
{
... code that updates FluxP ...
}
#pragma omp section
{
... code that updates FluxM ...
}
}
But your code doesn't look like it would take much time to do the calculations (there are no big for loops inside, for example), so the overhead that OpenMP adds will most likely cost more time than is saved in computation; hence the parallel version will most likely execute slower than the serial one.