Solve symmetric linear equation ax=b by using Gauss elimination - parallel-processing

I'm new to parallel programming, and I'm currently working on optimizing a code that works with electromagnetic calculations.
Analyzing how the program works, I realized that 85% of the time spent on execution is about solving a linear equation.
I studied a little bit of openmp but I have no idea how to parallelize a nested loop like this.
Any idea?
Thank you in advance. Follow the code below
Subroutine GaussEqSolver_Sym(n,ma,a,b,ep,kwji)
!------------------------------------------------------------------
! Solve sysmmetric linear equation ax=b by using Gauss elimination.
! If kwji=1, no solution;if kwji=0,has solution
! Input--n,ma,a(ma,n),b(n),ep,
! Output--b,kwji
!------------------------------------------------------------------
implicit real*8 (a-h,o-z)
dimension a(ma,n),b(n),m(n+1)
do 10 i=1,n
10 m(i)=i
do 120 k=1,n
p=0.0
do 20 i=k,n
do 20 j=k,n
if(dabs(a(i,j)).gt.dabs(p)) then
p=a(i,j)
io=i
jo=j
endif
20 continue
if(dabs(p)-ep) 30,30,35
30 kwji=1
return
35 continue
if(jo.eq.k) go to 45
do 40 i=1,n
t=a(i,jo)
a(i,jo)=a(i,k)
a(i,k)=t
40 continue
j=m(k)
m(k)=m(jo)
m(jo)=j
45 if(io.eq.k) go to 55
do 50 j=k,n
t=a(io,j)
a(io,j)=a(k,j)
a(k,j)=t
50 continue
t=b(io)
b(io)=b(k)
b(k)=t
55 p=1./p
in=n-1
if(k.eq.n) go to 65
do 60 j=k,in
60 a(k,j+1)=a(k,j+1)*p
65 b(k)=b(k)*p
if(k.eq.n) go to 120
do 80 i=k,in
do 70 j=k,in
70 a(i+1,j+1)=a(i+1,j+1)-a(i+1,k)*a(k,j+1)
80 b(i+1)=b(i+1)-a(i+1,k)*b(k)
120 continue
do 130 i1=2,n
i=n+1-i1
do 130 j=i,in
130 b(i)=b(i)-a(i,j+1)*b(j+1)
do 140 k=1,n
i=m(k)
140 a(1,i)=b(k)
do 150 k=1,n
150 b(k)=a(1,k)
kwji=0
return
end

If you are interested in performance you should be using LAPACK. To illustrate this I have written a simple driver program that compares the speed of the code you provided with calling DSYSV, the LAPACK routine that solves a set of linear equations for a symmetric, "double precision" matrix. The code and results are below, but in summary LAPACK varies from being 3.3 times faster than the Fortran, to 725 times faster. Note this is probably not an optimised LAPACK library, it is whatever comes with the Mint Linux I have installed on my laptop. Anyway, details below
ian#eris:~/work/stack$ cat solve.f90
Subroutine GaussEqSolver_Sym(n,ma,a,b,ep,kwji)
!------------------------------------------------------------------
! Solve sysmmetric linear equation ax=b by using Gauss elimination.
! If kwji=1, no solution;if kwji=0,has solution
! Input--n,ma,a(ma,n),b(n),ep,
! Output--b,kwji
!------------------------------------------------------------------
implicit real*8 (a-h,o-z)
dimension a(ma,n),b(n),m(n+1)
do 10 i=1,n
10 m(i)=i
do 120 k=1,n
p=0.0
do 20 i=k,n
do 20 j=k,n
if(dabs(a(i,j)).gt.dabs(p)) then
p=a(i,j)
io=i
jo=j
endif
20 continue
if(dabs(p)-ep) 30,30,35
30 kwji=1
return
35 continue
if(jo.eq.k) go to 45
do 40 i=1,n
t=a(i,jo)
a(i,jo)=a(i,k)
a(i,k)=t
40 continue
j=m(k)
m(k)=m(jo)
m(jo)=j
45 if(io.eq.k) go to 55
do 50 j=k,n
t=a(io,j)
a(io,j)=a(k,j)
a(k,j)=t
50 continue
t=b(io)
b(io)=b(k)
b(k)=t
55 p=1./p
in=n-1
if(k.eq.n) go to 65
do 60 j=k,in
60 a(k,j+1)=a(k,j+1)*p
65 b(k)=b(k)*p
if(k.eq.n) go to 120
do 80 i=k,in
do 70 j=k,in
70 a(i+1,j+1)=a(i+1,j+1)-a(i+1,k)*a(k,j+1)
80 b(i+1)=b(i+1)-a(i+1,k)*b(k)
120 continue
do 130 i1=2,n
i=n+1-i1
do 130 j=i,in
130 b(i)=b(i)-a(i,j+1)*b(j+1)
do 140 k=1,n
i=m(k)
140 a(1,i)=b(k)
do 150 k=1,n
150 b(k)=a(1,k)
kwji=0
return
end
Program solve_eqns
Use, Intrinsic :: iso_fortran_env, Only : wp => real64, li => int64
Implicit None
Real( wp ), Dimension( :, : ), Allocatable :: a, a_copy
Real( wp ), Dimension( : ), Allocatable :: b
Real( wp ), Dimension( : ), Allocatable :: x_lap, x_for
Real( wp ), Dimension( : ), Allocatable :: work
Real( wp ) :: time_lap, time_for
Integer, Dimension( : ), Allocatable :: pivots
Integer :: i
Integer :: n, nb = 64 ! hack value for nb - should use ilaenv
Integer :: error
Integer( li ) :: start, finish, rate
Write( *, * ) 'n ?'
Read ( *, * ) n
Allocate( a( 1:n, 1:n ) )
Allocate( b( 1:n ) )
Allocate( pivots( 1:n ) )
! Set up matrix
Call Random_number( a )
a = a - 0.5_wp
! Make A symmetric
a = 0.5_wp * ( a + Transpose( a ) )
! Add n to diagonal of A to avoid any nasty condition numbers
Do i = 1, n
a( i, i ) = a( i, i ) + n
End Do
! And keep a back up of A
a_copy = a
! RHS
Call Random_number( b )
! Solve with LAPACK
x_lap = b
Allocate( work( 1:n * nb ) )
Call system_clock( start, rate )
Call dsysv( 'U', n, 1, a, Size( a, dim = 1 ), pivots, &
x_lap, Size( x_lap, Dim = 1 ), work, Size( work ), error )
Call system_clock( finish, rate )
time_lap = Real( finish - start, Kind( time_lap ) ) / rate
! Restore A
a = a_copy
Write( *, * ) 'Errors for LAPACK', error, Maxval( Abs( Matmul( a, x_lap ) - b ) )
Write( *, * ) 'Time for LAPACK', time_lap
! Solve with Fortran
x_for = b
Call system_clock( start, rate )
Call GaussEqSolver_Sym( Size( a, Dim = 2 ), Size( a, Dim = 1 ), a, x_for, Epsilon( a ), error )
Call system_clock( finish, rate )
time_for = Real( finish - start, Kind( time_for ) ) / rate
! Restore A
a = a_copy
Write( *, * ) 'Errors for Fortran', error, Maxval( Abs( Matmul( a, x_for ) - b ) )
Write( *, * ) 'Time_For for Fortran', time_for
Write( *, * ) 'Max difference in solutions', Maxval( Abs( x_lap - x_for ) )
Write( *, * ) 'LAPACK is ', time_for / time_lap, ' times quicker than the Fortran'
End Program solve_eqns
ian#eris:~/work/stack$ gfortran --version
GNU Fortran (Ubuntu 7.4.0-1ubuntu1~18.04.1) 7.4.0
Copyright (C) 2017 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
ian#eris:~/work/stack$ gfortran -O3 solve.f90 -llapack
ian#eris:~/work/stack$ ./a.out
n ?
100
Errors for LAPACK 0 4.4408920985006262E-016
Time for LAPACK 1.5952670000000000E-003
Errors for Fortran 0 9.9920072216264089E-016
Time_For for Fortran 5.3095140000000004E-003
Max difference in solutions 8.6736173798840355E-018
LAPACK is 3.3282917530419676 times quicker than the Fortran
ian#eris:~/work/stack$ ./a.out
n ?
1000
Errors for LAPACK 0 1.3322676295501878E-015
Time for LAPACK 3.9014976000000000E-002
Errors for Fortran 0 4.9960036108132044E-015
Time_For for Fortran 1.9314730620000000
Max difference in solutions 4.7704895589362195E-018
LAPACK is 49.505940026722044 times quicker than the Fortran
ian#eris:~/work/stack$ ./a.out
n ?
5000
Errors for LAPACK 0 4.3298697960381105E-015
Time for LAPACK 1.2611959250000000
Errors for Fortran 0 1.3322676295501878E-014
Time_For for Fortran 913.76959534100001
Max difference in solutions 2.7647155398380363E-018
LAPACK is 724.52628273517462 times quicker than the Fortran

Related

Unique random numbers series in fortran

I have this problem in Fortran where I need to generate a sequence of random numbers (precisely, 729 random numbers between 1 and 1000) and to do I use the intrinsic function random_number().
The problem, of course, is that it returns pseudorandom numbers and it happens to often that I get same values repeated. I'm not an expert in programming with Fortran, but how can I get 729 unique random numbers without repetitions? Can someone help me?
Here you go
ijb#LAPTOP-GUG8KQ9I:~/work/stack$ cat the_chosen.f90
Program possibly_homework
Use, Intrinsic :: iso_fortran_env, Only : wp => real64
Implicit None( Type, External )
Integer, Parameter :: from = 1000
Integer, Parameter :: to_choose = 729
Real( wp ) :: rand
Integer, Dimension( : ), Allocatable :: the_many
Integer, Dimension( : ), Allocatable :: the_chosen
Integer :: the_chosen_one
Integer :: unit
Integer :: i
Allocate( the_many( 1:from ) )
the_many = [ ( i, i = 1, from ) ]
Allocate( the_chosen( 1:0 ) )
Do i = 1, to_choose
Call Random_number( rand )
the_chosen_one = 1 + Int( Size( the_many ) * rand )
the_chosen = [ the_chosen, the_many( the_chosen_one ) ]
the_many = [ the_many( :the_chosen_one - 1 ), the_many( the_chosen_one + 1: ) ]
End Do
Open( newunit = unit, file = 'the_chosen.dat' )
Do i = 1, Size( the_chosen )
Write( unit, '( i0, t20, i0 )' ) i, the_chosen( i )
End Do
Close( unit )
End Program possibly_homework
ijb#LAPTOP-GUG8KQ9I:~/work/stack$ gfortran --version
GNU Fortran (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
ijb#LAPTOP-GUG8KQ9I:~/work/stack$ gfortran -std=f2018 -Wall -Wextra -fcheck=all -O -g the_chosen.f90 -o the_chosen
ijb#LAPTOP-GUG8KQ9I:~/work/stack$ ./the_chosen
ijb#LAPTOP-GUG8KQ9I:~/work/stack$ head the_chosen.dat
1 303
2 916
3 414
4 452
5 116
6 962
7 392
8 675
9 153
10 458
ijb#LAPTOP-GUG8KQ9I:~/work/stack$ sort +1 -n the_chosen.dat | awk '{ print $2 }' | uniq -D
ijb#LAPTOP-GUG8KQ9I:~/work/stack$
This shows how you can in an inelegant way solve your problem.
program p
implicit none
integer, parameter :: n = 10
logical :: a(n) = .false.
integer b(n), cnt
real x
cnt = 1
do
call random_number(x)
b(cnt) = n * x + 1
if (.not. a(b(cnt))) then
if (b(cnt) > n) cycle
a(b(cnt)) = .true.
cnt = cnt + 1
if (cnt > n) exit
end if
end do
print '(*(1X,I0))', b
end program p
Order not important (as per Marta's question at How to remove random rows from a matrix in Fortran90 ). No shuffle required.
program test
implicit none
integer Nmax, Nsample, Nchoose
logical start
logical, allocatable :: mask(:)
integer cnt, i
real r
Nmax = 10 ! or 1000
Nsample = 7 ! or 729
start = .false.
Nchoose = Nsample
if ( Nsample > Nmax / 2 ) then
start = .true. ! go for the SMALLER likelihood of already taken
Nchoose = Nmax - Nsample
end if
allocate ( mask(Nmax) )
mask = start
cnt = 0
call random_seed()
do while ( cnt < Nchoose )
call random_number( r )
i = 1 + r * Nmax
if ( mask(i) .eqv. start ) then
mask(i) = .not. mask(i)
cnt = cnt + 1
end if
end do
do i = 1, Nmax
if ( mask(i) ) write( *, * ) i
end do
end program test

Transposition of a matrix by multithread in Fortran

I am calculating transposition of a very large dimension matrix using Fortran, P=TRANSPOSE(PP). I see that the built-in function TRANSPOSE in Fortran is very slow. I want to speed up by parallelizing the code as follows:
subroutine TP(nstate,P,PP)
integer i,j,nstate
double precision P(nstate,nstate),PP(nstate,nstate)
!$omp parallel shared ( P, PP,nstate) private (i, j)
!$omp do
do i=1,nstate
do j=1,nstate
P(j,i) = PP(i,j)
end do
end do
!$omp end do
!$omp end parallel do
end subroutine TP(nstate,P,PP)
Unfortunately, my code uses just only 1 thread and there isn't any improvement.
To answer your question about threads the most likely reason is that you will have to set up the compilation process to use OpenMP. All compilers I know do not use OpenMP by default, you have to explicitly turn it on.
However your real question is about speeding up a matrix transpose, and if I were doing this multithreading is about the third thing I would look at. The first would be, as noted in the comments, can I rewrite my algorithm without explicitly forming the transpose? In my experience you rarely have to explicitly form the transpose. For instance many BLAS and LAPACK routines allow you to specify that you want to use the transposed form of the matrix, so you never actually have to take the transpose. Of course whether you can or not depends upon the exact details of what you are doing, but not doing it is almost certainly the fastest way!
I would next look at the single threaded performance. A matrix transpose is one of the worst things you can do with a modern memory subsystem, so throwing more cores at it is not likely to improve things much as the limiting factor is the speed of the memory, not how fast you can do computations. Now for small matrices, smaller than the size of the cache, it's probably not an issue - the cache will save you. But for large matrices, bigger than the cache, you will run into problems. The way to address this is to break the operation into smaller blocks and transpose each block in turn - and choose the block size so it is smaller than the cache. Below is a code which does this in two slightly different ways, and some single threaded results on my laptop
! transpose.f90
Program tran
Use, Intrinsic :: iso_fortran_env, Only : wp => real64, li => int64
Use omp_lib, Only : omp_get_max_threads
Implicit None ( type, external )
Real( wp ), Dimension( :, : ), Allocatable :: a, aT
Real( wp ) :: t, t_in, t_in1, t_in2
Real( wp ) :: error
Integer :: start, finish, rate
Integer :: bfac
Integer :: n
Write( *, * ) 'n ?'
Read ( *, * ) n
Allocate( a ( 1:n, 1:n ) )
Allocate( aT( 1:n, 1:n ) )
Call Random_number( a )
Write( *, * ) 'Size of matrix ', n
Write( *, * ) 'Using ', omp_get_max_threads(), ' threads'
Call System_clock( start, rate )
aT = Transpose( a )
Call System_clock( finish, rate )
error = Maxval( Abs( Transpose( a ) - aT ) )
t_in1 = Real( finish - start, wp ) / rate
Write( *, * ) 'Intrinsic 1: ', t_in1, ' Error: ', error
Call System_clock( start, rate )
aT = Transpose( a )
Call System_clock( finish, rate )
error = Maxval( Abs( Transpose( a ) - aT ) )
t_in2 = Real( finish - start, wp ) / rate
Write( *, * ) 'Intrinsic 2: ', t_in2, ' Error: ', error
t_in = 0.5_wp * ( t_in1 + t_in2 )
Write( *, * ) 'Read bound'
bfac = 10
Do While( bfac <= n )
aT = -2.0_wp
Call System_clock( start, rate )
Call blocked_transpose_rb( a, bfac, aT )
Call System_clock( finish, rate )
error = Maxval( Abs( Transpose( a ) - aT ) )
t = Real( finish - start, wp ) / rate
Write( *, * ) bfac, ' Intrinsic: ', t, ' Error: ', error, ' Speed up ', t_in / t
bfac = Nint( bfac * 1.5_wp )
End Do
Write( *, * ) 'Write bound'
bfac = 10
Do While( bfac <= n )
aT = -2.0_wp
Call System_clock( start, rate )
Call blocked_transpose_wb( a, bfac, aT )
Call System_clock( finish, rate )
error = Maxval( Abs( Transpose( a ) - aT ) )
t = Real( finish - start, wp ) / rate
Write( *, * ) bfac, ' Intrinsic: ', t, ' Error: ', error, ' Speed up: ', t_in / t
bfac = Nint( bfac * 1.5_wp )
End Do
Contains
Subroutine blocked_transpose_rb( a, bfac, aT )
Use, Intrinsic :: iso_fortran_env, Only : wp => real64, li => int64
Real( wp ), Dimension( :, : ), Intent( In ) :: a
Integer , Intent( In ) :: bfac
Real( wp ), Dimension( :, : ), Intent( Out ) :: aT
Integer :: n
Integer :: ib, jb
Integer :: i , j
n = Ubound( a, Dim = 1 )
!$omp parallel default( none ) shared( n, bfac, a, AT ), private( ib, jb, i, j )
!$omp do collapse( 2 )
Do ib = 1, n, bfac
Do jb = 1, n, bfac
Do i = ib, Min( n, ib + bfac - 1 )
Do j = jb, Min( n, jb + bfac - 1 )
aT( j, i ) = a( i, j )
End Do
End Do
End Do
End Do
!$omp end do
!$omp end parallel
End Subroutine blocked_transpose_rb
Subroutine blocked_transpose_wb( a, bfac, aT )
Use, Intrinsic :: iso_fortran_env, Only : wp => real64, li => int64
Real( wp ), Dimension( :, : ), Intent( In ) :: a
Integer , Intent( In ) :: bfac
Real( wp ), Dimension( :, : ), Intent( Out ) :: aT
Integer :: n
Integer :: ib, jb
Integer :: i , j
n = Ubound( a, Dim = 1 )
!$omp parallel default( none ) shared( n, bfac, a, AT ), private( ib, jb, i, j )
!$omp do collapse( 2 )
Do ib = 1, n, bfac
Do jb = 1, n, bfac
Do i = ib, Min( n, ib + bfac - 1 )
Do j = jb, Min( n, jb + bfac - 1 )
aT( i, j ) = a( j, i )
End Do
End Do
End Do
End Do
!$omp end do
!$omp end parallel
End Subroutine blocked_transpose_wb
End Program tran
Compilation
ijb#ijb-Latitude-5410:~/work/stack$ gfortran --version
GNU Fortran (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
ijb#ijb-Latitude-5410:~/work/stack$ gfortran -O3 -fopenmp -Wall -Wextra -g transpose.f90 -o transpose
Code execution
ijb#ijb-Latitude-5410:~/work/stack$ export OMP_NUM_THREADS=1; ./transpose < in
n ?
Size of matrix 20000
Using 1 threads
Intrinsic 1: 9.4619999999999997 Error: 0.0000000000000000
Intrinsic 2: 8.6950000000000003 Error: 0.0000000000000000
Read bound
10 Intrinsic: 2.2450000000000001 Error: 0.0000000000000000 Speed up 4.0438752783964365
15 Intrinsic: 2.0019999999999998 Error: 0.0000000000000000 Speed up 4.5347152847152854
23 Intrinsic: 1.8210000000000000 Error: 0.0000000000000000 Speed up 4.9854475562877543
35 Intrinsic: 1.4319999999999999 Error: 0.0000000000000000 Speed up 6.3397346368715084
53 Intrinsic: 1.2629999999999999 Error: 0.0000000000000000 Speed up 7.1880443388756934
80 Intrinsic: 1.2470000000000001 Error: 0.0000000000000000 Speed up 7.2802726543704885
120 Intrinsic: 1.1870000000000001 Error: 0.0000000000000000 Speed up 7.6482729570345409
180 Intrinsic: 1.1990000000000001 Error: 0.0000000000000000 Speed up 7.5717264386989154
270 Intrinsic: 1.1750000000000000 Error: 0.0000000000000000 Speed up 7.7263829787234037
405 Intrinsic: 1.1510000000000000 Error: 0.0000000000000000 Speed up 7.8874891398783662
608 Intrinsic: 1.1319999999999999 Error: 0.0000000000000000 Speed up 8.0198763250883403
912 Intrinsic: 1.2200000000000000 Error: 0.0000000000000000 Speed up 7.4413934426229513
1368 Intrinsic: 1.4050000000000000 Error: 0.0000000000000000 Speed up 6.4615658362989326
2052 Intrinsic: 3.2240000000000002 Error: 0.0000000000000000 Speed up 2.8159119106699748
3078 Intrinsic: 3.8510000000000000 Error: 0.0000000000000000 Speed up 2.3574396260711503
4617 Intrinsic: 4.0990000000000002 Error: 0.0000000000000000 Speed up 2.2148084898755793
6926 Intrinsic: 4.8529999999999998 Error: 0.0000000000000000 Speed up 1.8706985369874305
10389 Intrinsic: 5.5330000000000004 Error: 0.0000000000000000 Speed up 1.6407916139526477
15584 Intrinsic: 5.9450000000000003 Error: 0.0000000000000000 Speed up 1.5270815811606391
Write bound
10 Intrinsic: 1.5669999999999999 Error: 0.0000000000000000 Speed up: 5.7935545628589660
15 Intrinsic: 1.5389999999999999 Error: 0.0000000000000000 Speed up: 5.8989603638726447
23 Intrinsic: 1.4190000000000000 Error: 0.0000000000000000 Speed up: 6.3978153629316417
35 Intrinsic: 1.2110000000000001 Error: 0.0000000000000000 Speed up: 7.4966969446738227
53 Intrinsic: 1.3109999999999999 Error: 0.0000000000000000 Speed up: 6.9248665141113657
80 Intrinsic: 1.0700000000000001 Error: 0.0000000000000000 Speed up: 8.4845794392523359
120 Intrinsic: 1.0320000000000000 Error: 0.0000000000000000 Speed up: 8.7969961240310077
180 Intrinsic: 1.1960000000000000 Error: 0.0000000000000000 Speed up: 7.5907190635451505
270 Intrinsic: 1.2350000000000001 Error: 0.0000000000000000 Speed up: 7.3510121457489870
405 Intrinsic: 1.2480000000000000 Error: 0.0000000000000000 Speed up: 7.2744391025641022
608 Intrinsic: 1.2849999999999999 Error: 0.0000000000000000 Speed up: 7.0649805447470824
912 Intrinsic: 1.4750000000000001 Error: 0.0000000000000000 Speed up: 6.1549152542372880
1368 Intrinsic: 1.6970000000000001 Error: 0.0000000000000000 Speed up: 5.3497348261638180
2052 Intrinsic: 2.5249999999999999 Error: 0.0000000000000000 Speed up: 3.5954455445544555
3078 Intrinsic: 2.9569999999999999 Error: 0.0000000000000000 Speed up: 3.0701724721001016
4617 Intrinsic: 3.1490000000000000 Error: 0.0000000000000000 Speed up: 2.8829787234042552
6926 Intrinsic: 3.6120000000000001 Error: 0.0000000000000000 Speed up: 2.5134274640088594
10389 Intrinsic: 3.7269999999999999 Error: 0.0000000000000000 Speed up: 2.4358733565870674
15584 Intrinsic: 4.4550000000000001 Error: 0.0000000000000000 Speed up: 2.0378226711560044
The speed up quoted is how much faster the code ran for a giving blocking factor compared to the Transpose(), so a speed up of 2 means my code ran twice as fast as the intrinsic, and you can see that just by restructuring the loops you can get an improvement of 8+ [yes, I know I'm being a little unfair to the intrinsic here, but the point remains]; that's equivalent to an awful lot of threads! Further if you look at the read bound version, which should reflect cache use best, you can see the maximum speed is at a blocking factor of 608. Given each transpose requires two blocks of size (blocking factor)*(blocking factor) this means for bfac=608 it needs (608*608*8*2)/(1024*1024)=5.64 Mbytes, a little under the L3 cache size of my machine (8Mbytes). Thus in this case it is more important to consider how you use your memory than how you use your cores.
Of course you can then multithread. Below are the results on my laptop, again in terms of speed up relative to the intrinsic. Moving to 2 threads gets you a little more, but not as much as the improvement above.
You can speed up the algorithm by swapping the upper triangular elements with the lower triangular elements.
subroutine TP(nstate,P,PP)
implicit none
integer, intent(in) :: nstate
double precision, intent(in) :: PP(nstate,nstate)
double precision :: P(nstate,nstate)
integer :: i,j
!$omp parallel shared ( P, PP,nstate) private (i, j)
!$omp do
do i=1,nstate
do j=i,nstate ! go from 'i' to 'n'
P(i,j) = PP(j,i)
P(j,i) = PP(i,j)
end do
end do
!$omp end do
!$omp end parallel do
end subroutine

How to convert this code to scan random data instead of binned data?

I am new to fortran and I am trying to write code using random data instead of binned data in x, y, z as shown in my sample code.
implicit real*8(a-h,o-z)
dimension rm(4),rp1(4),rip1(4),rp2(4),rip2(4),rp3(4),rip3(4),
d rn(4),u1(4),u2(4),u3(4)
do ix= 1000,25000,1000
x = ix/1000000.
do iy= 1000,25000,1000
y = iy/100000.
do iz= 1,1000,25
z = iz/10000.
a=(x**2+y**2)/z
b=x*y*z
c=x*y**2+y*z**2+z*x**2
fr=(a*b)/c
if(fr.ge.0.05.and.fr.le.23)then
write(40,*)x,y,x,fr
else
endif
end do
end do
end do
stop
How to convert such code having binned data to a code using random draws.
As an example binned data here means possible fixed values of x are {1000/1000000.,2000/1000000., .....,25000/1000000.} i.e. 25 possible values in range {.001, .025} but they are not random values
In case of random values 25 points will be drawn from the range {.001, .025} randomly.
This my assumption about doing the analysis with random draws(previously I was not familiar with this ).
Something like
ian#eris:~/work/stack$ cat data.f90
Program random_data
Use, Intrinsic :: iso_fortran_env, Only : wp => real64
Implicit None
Real( wp ), Parameter :: min_rand = 0.001_wp
Real( wp ), Parameter :: max_rand = 0.025_wp
Integer, Parameter :: n_samples = 25
Real( wp ) :: x, y, z
Real( wp ) :: a, b, c
Real( wp ) :: fr
Integer :: i_sample
Do i_sample = 1, n_samples
Call Random_number( x )
Call Random_number( y )
Call Random_number( z )
x = x * ( max_rand - min_rand ) + min_rand
y = y * ( max_rand - min_rand ) + min_rand
z = z * ( max_rand - min_rand ) + min_rand
a=(x**2+y**2)/z
b=x*y*z
c=x*y**2+y*z**2+z*x**2
fr=(a*b)/c
If( fr >= 0.05_wp .And. fr <= 23.0_wp )Then
Write( 40, * ) x, y, x, fr
Endif
End Do
End Program random_data
ian#eris:~/work/stack$ gfortran-10 -Wall -Wextra -fcheck=all -std=f2008 -g -finit-real=snan data.f90
ian#eris:~/work/stack$ ./a.out;more fort.40
more: stat of fort.40 failed: No such file or directory
Unfortunately none of the random numbers in this run produced an output that lay in the desired range - however I did test it with 2500 samples and then a couple did.

Fortran OMP parallel do loop scales differently for gfortran and Intel

I abstracted some code from a much larger coding project I'm working on. The code has an OMP parallel do loop which scales well with processor number when compiled with gfortran but badly when compiled with Intel. With gfortran, the code takes 18 seconds to run with 1 processor and 5 seconds to run with 4 processors. With Intel it takes 7 seconds to run with 1 processor and 14 seconds to run with 4 processors. I don't understand what's going on here. The code is below.
MODULE test
TYPE walker
DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: R
END TYPE walker
TYPE walkerlist
INTEGER :: nwlkr
TYPE(walker), DIMENSION(:), ALLOCATABLE :: W
END TYPE walkerlist
CONTAINS
FUNCTION step( dTau, nelec, ndim ) RESULT ( dR )
DOUBLE PRECISION, INTENT(IN) :: dTau
INTEGER, INTENT(IN) :: nelec, ndim
DOUBLE PRECISION :: dR(ndim,nelec), rand1, rand2, N2DTau
INTEGER :: d, k
DOUBLE PRECISION, PARAMETER :: twopi = 8.d0 * atan(1.d0)
N2DTau = -2 * dTau
DO k = 1, nelec
DO d = 1, ndim
CALL RANDOM_NUMBER(rand1)
CALL RANDOM_NUMBER(rand2)
dR(d,k) = SQRT( N2DTau * LOG( rand1 ) ) * COS( twopi * rand2 )
END DO
END DO
END FUNCTION step
END MODULE test
PROGRAM walk
USE test
TYPE(walkerlist), TARGET :: Wl
DOUBLE PRECISION :: dTau
INTEGER :: istp, i, t1, t2, clock_rate, clock_max
Wl % nwlkr = 10000
ALLOCATE( Wl % W ( Wl % nwlkr ) )
DO i = 1, Wl % nwlkr
ALLOCATE( Wl % W(i) % R(3,2) )
END DO
dTau = 0.001
CALL SYSTEM_CLOCK ( t1, clock_rate, clock_max )
!$OMP PARALLEL DO SHARED( W ) DEFAULT( FIRSTPRIVATE )
DO i = 1, Wl % nwlkr
DO istp = 1, 4000
Wl % W(i) % R = Wl % W(i) % R + step( dTau, 2, 3 )
END DO
END DO
!$OMP END PARALLEL DO
CALL SYSTEM_CLOCK ( t2, clock_rate, clock_max )
Print*, "time:", REAL ( t2 - t1 ) / REAL ( clock_rate )
END PROGRAM walk
The issue was the random_number calls, where I'm guessing the threads were sharing seeds. I solved it by instead using the random number generating function ran.
rand1 = ran(s)
rand2 = ran(s)
Ran lets you input the seed s, which I made thread_private and of the save type. Ran changes the seed only for ifort and not gfortran, so I can't use it for the latter. Ran also sometimes outputs 0, which I personally need to always check for and discard. I also need to Ensure all threads start with a different seed.

Random number generator in PGI Fortran not so random

The following code just generates a simple triple of random numbers:
program testrand
integer, parameter :: nz = 160, nf = 160, nlt = 90
real :: tmpidx(3)
integer :: idxarr(3), idx1, idx2, idx3, seed_size, ticks
integer, allocatable :: seed(:)
call random_seed(size=seed_size)
allocate(seed(seed_size))
call system_clock(count=ticks)
seed = ticks+37*(/(i-1, i=1,seed_size)/)
call random_seed(put=seed)
deallocate(seed)
call random_number(tmpidx)
idxarr = tmpidx * (/nz, nf, nlt/)
idx1 = max(1,idxarr(1))
idx2 = max(1,idxarr(2))
idx3 = max(1,idxarr(3))
print *,idx1, idx2, idx3
end program
I compile this with gfortran and run a few times and I get:
> gfortran testrand.f90
> ./a.out
74 98 86
> ./a.out
113 3 10
> ./a.out
44 104 27
Looks pretty random. Now I compile with PGI Fortran and run a few times:
> pgf90 testrand.f90
> ./a.out
1 1 1
> ./a.out
1 1 1
> ./a.out
1 1 1
Of course, there's no way to be completely sure, but I suspect this is not random. :) Anyone know what is going on here? Anyone know the right way to get random numbers with PGI Fortran?
Somehow, PGI does not implement system_clock as in GNU compilers. I do not know why, I found it recently by doing similar stuff like you.
To see what I am talking about, just print ticks after calling system_clock. Chances are that you get 0 all the time with PGI and varying numbers with GNU compilers. To solve your problem, you can adapt the code bellow. It is a slightly modified version of a code that you can get at GNU fortran web site
program testrand
use iso_fortran_env, only: int64
integer, parameter :: nz = 160, nf = 160, nlt = 90
real :: tmpidx(3)
integer :: idxarr(3), idx1, idx2, idx3, seed_size, ticks
integer, allocatable :: seed(:)
call random_seed(size=seed_size)
allocate(seed(seed_size))
! call system_clock(count=ticks)
! seed = ticks+37*(/(i-1, i=1,seed_size)/)
! call random_seed(put=seed)
!
! deallocate(seed)
call init_random_seed()
call random_number(tmpidx)
idxarr = tmpidx * (/nz, nf, nlt/)
idx1 = max(1,idxarr(1))
idx2 = max(1,idxarr(2))
idx3 = max(1,idxarr(3))
print *,idx1, idx2, idx3
contains
!
subroutine init_random_seed()
implicit none
integer, allocatable :: seed(:)
integer :: i, n, istat, dt(8), pid
integer(int64) :: t
integer, parameter :: un=703
call random_seed(size = n)
allocate(seed(n))
! First try if the OS provides a random number generator
open(unit=un, file="/dev/urandom", access="stream", &
form="unformatted", action="read", status="old", iostat=istat)
if (istat == 0) then
read(un) seed
close(un)
else
! The PID is
! useful in case one launches multiple instances of the same
! program in parallel.
call system_clock(t)
if (t == 0) then
call date_and_time(values=dt)
t = (dt(1) - 1970) * 365_int64 * 24 * 60 * 60 * 1000 &
+ dt(2) * 31_int64 * 24 * 60 * 60 * 1000 &
+ dt(3) * 24_int64 * 60 * 60 * 1000 &
+ dt(5) * 60 * 60 * 1000 &
+ dt(6) * 60 * 1000 + dt(7) * 1000 &
+ dt(8)
end if
pid = getpid()
t = ieor( t, int(pid, kind(t)) )
do i = 1, n
seed(i) = lcg(t)
end do
end if
call random_seed(put=seed)
!print*, "optimal seed = ", seed
end subroutine init_random_seed
!
function lcg(s)
integer :: lcg
integer(int64), intent(in out) :: s
if (s == 0) then
s = 104729
else
s = mod(s, 4294967296_int64)
end if
s = mod(s * 279470273_int64, 4294967291_int64)
lcg = int(mod(s, int(huge(0), 8)), kind(0))
end function lcg
!
!this option is especially used for pgf90 to provide a getpid() function
!> #brief Returns the process ID of the current process
!! #todo write the actual code, for now returns a fixed value
!<
function getpid()result(pid)
integer pid
pid = 53 !just a prime number, no special meaning
end function getpid
end program

Resources