Long gfortran compiling time for large implied do loops - compilation

Compiling the following fortran90 program takes minutes
program implloop
implicit none
integer, parameter :: dp = selected_real_kind(15)
integer(dp) :: i
real(dp), allocatable :: array(:)
real(dp) :: t1 , t2
call cpu_time(t1)
allocate( array(10**8) )
array = [( sqrt(real(i,dp)) , i = 1 , 10**8 )]
call cpu_time(t2)
write(*,*) t2-t1
end program
It is compiled with GNU Fortran (Debian 6.3.0-18+deb9u1) 6.3.0 20170516.
On the other hand, if compiled using a do-loop,
program implloop
implicit none
integer, parameter :: dp = selected_real_kind(15)
integer(dp) :: i
real(dp), allocatable :: array(:)
real(dp) :: t1 , t2
call cpu_time(t1)
allocate( array(10**8) )
do i = 1 , 10**8
array(i) = sqrt(real(i,dp))
end do
call cpu_time(t2)
write(*,*) t2-t1
end program
it is ready in a second. What is the reason behind this compile-time difference?

Related

OpenMP application on fortran find best thread

For example, I have kmeans code
program read_from_file
use functions_module
!$ use omp_lib
character(len=2) :: c2
integer :: i, j, k, l, c, d
real, dimension(:,:), allocatable :: r, centroid, new_centro, converge
real, dimension(:), allocatable :: cost
integer,dimension(:),allocatable :: indices,distancereg,cluster
integer :: ios_read = 0
integer :: n = 0
integer :: omega, tid, n_threads
real, dimension(:,:), allocatable :: comparison_value
print *, 'which data index?'
read*, idx
write(c2, '(i2.2)') idx
open(unit=99, file='datatest1.dat', iostat=ios_read)
if (ios_read /= 0) then
print *, "kmeans_data_distrib_"//c2//"_small.dat could not be opened"
! print
end if
!find the maximum lines
do
read(99, *, iostat=ios_read) i, x, y
if (ios_read > 0) then
print *, "something is wrong"
stop
else if (ios_read < 0) then
print *, "end of file reached"
exit
else
n = n+1
end if
end do
rewind(99)
!do i=1,n
open(unit=98, file='rawdata.dat')
allocate(r(2, n))
do i = 1,n
read(99, *, iostat=ios_read) j, x, y
r(1, j) = x
r(2, j) = y
write(98, *) x, y
end do
close(99) ! close kmeans
close(98) ! close rawdatai
print*, 'put k'
read*, k
allocate (comparison_value(2,k))
comparison_value = 0.02
** do l=1,10
call centroid_inits(r, n, k, centroid)
call min_distance(r, n, k, centroid, distance,indices,distancereg)
call new_centroid(r,n,k,centroid,indices,new_centro,omega)
call costfunction(r,n,k,distancereg,indices,new_centro,cluster,cost)
end do
open(unit=99,file="kmeans3_test.dat")
do i = 1, n
write(99,"(2es14.5,i4)") r(:,i),indices(i)
enddo
close(99)
Contains
subroutine centroid_inits(r,n,k,centroid)
real,dimension (:,:),intent(in),allocatable :: r
real,dimension (:,:),intent(out),allocatable:: centroid
real,dimension(k),allocatable::xc(:) ,yc(:)
integer,intent(in) :: n,k
integer :: i
real :: maks_x,maks_y,min_x,min_y
allocate(centroid(2, k))
allocate(xc(k))
allocate(yc(k))
maks_x = maxval(r(1,:))
maks_y = maxval(r(2,:))
min_x = minval(r(1,:))
min_y = minval(r(2,:))
! print *, min_x, maks_x, min_y, maks_y
do i = 1,k
xc (i) = min_x + (maks_x - min_x) * fib_rnd()
yc (i) = min_y + (maks_y - min_y) * fib_rnd()
centroid (1,i) = xc(i)
centroid (2,i) = yc(i)
end do
do i = 1,k
print *, centroid(:,i)
end do
end subroutine centroid_inits
subroutine min_distance(r,n,k,centroid,distance,indices,distancereg)
integer, intent(out):: n,k
real,dimension(:,:),intent(in),allocatable::centroid
real,dimension(:,:),intent(in),allocatable::r
integer,dimension(:),intent(out),allocatable::indices,distancereg
real ::d_min
integer::y,i_min,j,i
integer,parameter :: data_dim=2
allocate (indices(n))
allocate (distancereg(k))
!cost=0.d0
do j=1,n
i_min = -1
d_min=1.d6
! !$ OMP DO
do i=1,k
distance=0.d0
distancereg(i)=0.d0
do y=1,data_dim
distance = distance+abs(r(y,j)-centroid(y,i))
distancereg(i)=distancereg(i)+abs(r(y,j)-centroid(y,i))
end do
if (distance<d_min) then
d_min=distance
i_min=i
end if
end do
!!$OMP END DO
if( i_min < 0 ) print*," found error by assigning k-index to particle ",j
indices(j)=i_min
end do
end subroutine
subroutine new_centroid(r,n,k,centroid,indices,new_centro,omega)
integer, intent(in):: n
real,dimension(:,:),intent(inout),allocatable ::centroid
real,dimension(:,:),intent(in),allocatable ::r
integer,dimension(:),intent(in),allocatable::indices
real,dimension(:,:),intent(out),allocatable:: new_centro
integer,intent(inout)::k
integer :: t,y,j,k_ind
integer,intent(out) :: omega
real,dimension(:),allocatable :: summ
allocate(summ(2))
allocate (new_centro(2,k))
t=2
do k_ind=1,k
omega = count(indices==k_ind)
summ(1)=0
summ(2)=0
do j=1,n
if (indices(j)==k_ind) then
summ(1) =+ r(1,j)
summ(2) =+ r(2,j)
end if
end do
new_centro(1,k_ind) = summ(1)/omega
new_centro(2,k_ind) = summ(2)/omega
end do
centroid = new_centro
!do k_ind=1,k
!print*, 'new centro',new_centro(:,k_ind)
!end do
end subroutine
subroutine costfunction(r,n,k,distancereg,indices,new_centro,cluster,cost)
integer, dimension (:), allocatable, intent(out) :: distancereg, indices
integer, dimension (:), allocatable, intent(out) :: cluster
real, dimension (:,:), allocatable, intent(in) :: r
real, dimension (:,:), intent(in), allocatable :: new_centro
real, dimension(:), intent(out), allocatable :: cost
integer :: i,k
allocate(cluster(k))
allocate(cost(k))
allocate(distancereg(k))
call min_distance(r,n,k,centroid,distance,indices,distancereg)
cluster = 0
do i=1,k
cost(i)=0
cluster(i)=count(indices==i)
cost(i)=(1.0/cluster(i))*distancereg(i)
! print*,cost(i)
end do
print*," total sum of cluster members ",sum(cluster)," vs N ",n
end subroutine
subroutine convergence_value(converge, centroid, new_centro, cost, cluster)
real, dimension (:,:), intent(inout), allocatable :: new_centro
real, dimension (:,:), intent(inout), allocatable :: centroid
real, dimension(:), allocatable, intent(out):: cost
integer, dimension (:), allocatable, intent(out) :: cluster
real, dimension(:,:), intent (inout), allocatable:: converge
allocate(converge(2,k))
call centroid_inits(r, n, k, centroid)
call min_distance(r, n, k, centroid, distance,indices,distancereg)
call new_centroid(r,n,k,centroid,indices,new_centro,omega)
converge = (abs(centroid-new_centro))
print*, 'this is c',converge
end subroutine
end program read_from_file
It runs okay with serial. But I want to apply openmp. I want to each thread doing the same calculation and find which thread have better cost function and time. (all thread doing the clusterization). My attemp and idea is to paralellized the code before encounter the subroutine, that two asterisk. But I do not know if its enough (though I tried it and showing error), and how do make display report of each thread ?
*You might notice from the code that I am a beginner

Fortran OMP parallel do loop scales differently for gfortran and Intel

I abstracted some code from a much larger coding project I'm working on. The code has an OMP parallel do loop which scales well with processor number when compiled with gfortran but badly when compiled with Intel. With gfortran, the code takes 18 seconds to run with 1 processor and 5 seconds to run with 4 processors. With Intel it takes 7 seconds to run with 1 processor and 14 seconds to run with 4 processors. I don't understand what's going on here. The code is below.
MODULE test
TYPE walker
DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: R
END TYPE walker
TYPE walkerlist
INTEGER :: nwlkr
TYPE(walker), DIMENSION(:), ALLOCATABLE :: W
END TYPE walkerlist
CONTAINS
FUNCTION step( dTau, nelec, ndim ) RESULT ( dR )
DOUBLE PRECISION, INTENT(IN) :: dTau
INTEGER, INTENT(IN) :: nelec, ndim
DOUBLE PRECISION :: dR(ndim,nelec), rand1, rand2, N2DTau
INTEGER :: d, k
DOUBLE PRECISION, PARAMETER :: twopi = 8.d0 * atan(1.d0)
N2DTau = -2 * dTau
DO k = 1, nelec
DO d = 1, ndim
CALL RANDOM_NUMBER(rand1)
CALL RANDOM_NUMBER(rand2)
dR(d,k) = SQRT( N2DTau * LOG( rand1 ) ) * COS( twopi * rand2 )
END DO
END DO
END FUNCTION step
END MODULE test
PROGRAM walk
USE test
TYPE(walkerlist), TARGET :: Wl
DOUBLE PRECISION :: dTau
INTEGER :: istp, i, t1, t2, clock_rate, clock_max
Wl % nwlkr = 10000
ALLOCATE( Wl % W ( Wl % nwlkr ) )
DO i = 1, Wl % nwlkr
ALLOCATE( Wl % W(i) % R(3,2) )
END DO
dTau = 0.001
CALL SYSTEM_CLOCK ( t1, clock_rate, clock_max )
!$OMP PARALLEL DO SHARED( W ) DEFAULT( FIRSTPRIVATE )
DO i = 1, Wl % nwlkr
DO istp = 1, 4000
Wl % W(i) % R = Wl % W(i) % R + step( dTau, 2, 3 )
END DO
END DO
!$OMP END PARALLEL DO
CALL SYSTEM_CLOCK ( t2, clock_rate, clock_max )
Print*, "time:", REAL ( t2 - t1 ) / REAL ( clock_rate )
END PROGRAM walk
The issue was the random_number calls, where I'm guessing the threads were sharing seeds. I solved it by instead using the random number generating function ran.
rand1 = ran(s)
rand2 = ran(s)
Ran lets you input the seed s, which I made thread_private and of the save type. Ran changes the seed only for ifort and not gfortran, so I can't use it for the latter. Ran also sometimes outputs 0, which I personally need to always check for and discard. I also need to Ensure all threads start with a different seed.

OpenMP FFTW with Fortran not thread safe

I am trying to use the FFTW with openMP and Fortran, but I get wrong results when executing in parallel, which also change their values every execution step, displaying typical behaviour when parallelisation goes wrong.
I am aiming for a simple 3d real-to-complex transformation. Following the FFTW tutorial, I took all but the call to dfftw_execute_dft_r2c() out of the parallel region, but it doesn't seem to work.
I use FFTW 3.3.8, configured with ./configure --enable-threads --enable-openmp --enable-mpi and compile my code with gfortran program.f03 -o program.o -I/usr/include -fopenmp -lfftw3_omp -lfftw3 -g -Wall.
This is how my program looks like:
program use_fftw
use,intrinsic :: iso_c_binding
use omp_lib
implicit none
include 'fftw3.f03'
integer, parameter :: dp=kind(1.0d0)
integer, parameter :: Nx = 10
integer, parameter :: Ny = 5
integer, parameter :: Nz = 5
real(dp), parameter :: pi = 3.1415926d0
real(dp), parameter :: physical_length_x = 20.d0
real(dp), parameter :: physical_length_y = 10.d0
real(dp), parameter :: physical_length_z = 10.d0
real(dp), parameter :: lambda1 = 0.5d0
real(dp), parameter :: lambda2 = 0.7d0
real(dp), parameter :: lambda3 = 0.9d0
real(dp), parameter :: dx = physical_length_x/real(Nx,dp)
real(dp), parameter :: dy = physical_length_y/real(Ny,dp)
real(dp), parameter :: dz = physical_length_z/real(Nz,dp)
integer :: void, nthreads
integer :: i, j, k
real(dp):: d
complex(dp), allocatable, dimension(:,:,:) :: arr_out
real(dp), allocatable, dimension(:,:,:) :: arr_in
integer*8 :: plan_forward
allocate(arr_in( 1:Nx, 1:Ny, 1:Nz)); arr_in = 0
allocate(arr_out(1:Nx/2+1, 1:Ny, 1:Nz)); arr_out = 0
!------------------------------
! Initialize fftw stuff
!------------------------------
! Call before any FFTW routine is called outside of parallel region
void = fftw_init_threads()
if (void==0) then
write(*,*) "Error in fftw_init_threads, quitting"
stop
endif
nthreads = omp_get_num_threads()
call fftw_plan_with_nthreads(nthreads)
! plan execution is thread-safe, but plan creation and destruction are not:
! you should create/destroy plans only from a single thread
call dfftw_plan_dft_r2c_3d(plan_forward, Nx, Ny, Nz, arr_in, arr_out, FFTW_ESTIMATE)
!--------------------------------
! Start parallel region
!--------------------------------
!$OMP PARALLEL PRIVATE( j, k, d)
! Fill array with wave
! NOTE: wave only depends on x so you can plot it later.
!$OMP DO
do i = 1, Nx
d = 2.0*pi*i*dx
do j = 1, Ny
do k = 1, Nz
arr_in(i,j,k) = cos(d/lambda1)+sin(d/lambda2)+cos(d/lambda3)
enddo
enddo
enddo
!$OMP END DO
call dfftw_execute_dft_r2c(plan_forward, arr_in, arr_out)
!$OMP END PARALLEL
!-----------------
! print results
!-----------------
do i=1, Nx/2+1
do j=1, Ny
do k=1, Nz
write(*,'(F12.6,A3,F12.6,A3)',advance='no') real(arr_out(i,j,k)), " , ", aimag(arr_out(i,j,k)), " ||"
enddo
write(*,*)
enddo
write(*,*)
enddo
deallocate(arr_in, arr_out)
! destroy plans is not thread-safe; do only with single
call dfftw_destroy_plan(plan_forward)
end program use_fftw
I also tried moving the initialisation part of FFTW (void = fftw_init_threads(); call fftw_plan_with_nthreads(nthreads); call dfftw_plan_dft_r2c_3d(...) into the parallel region, using a !$OMP SINGLE block and synchronising with a barrier afterwards, but the situation didn't improve.
Can anyone help me?
EDIT: I was able to test my program on another system, the problem remains. So the issue apparently isn't in my implementation of openmp or FFTW, but somewhere in the program itself.
You should normally call fftw execute routines outside of the parallel region. They have their own parallel regions inside them and they will take care of running the transform in parallel with that many threads as you requested during planning. They will re-use your existing OpenMP threads.
You can also call them inside a parallel region, but on different arrays, not on the same arrays! And then your plan should be planned to use 1 thread. Each thread would then preform a 2D transform of a slice of the array, for example.
The thread-safety means you can call the routines concurrently, but each for different data.

Why does a large matrix pass through several subroutine tasks as fast as a smaller matrix

What Exactly is happening to my matrix? how is Fortran handling it?
What's attached is a snippet of code inspired from a larger
project that simulates light transport in eye tissue. It
passes some large matrices through subroutines and then
randomly puts values in them.
My Goal: To see how passing such a large matrix through
several subroutines would have an impact on
performance.
My Reference: is the exact same code except the dimension of the matrix of interest is now [5,5] ( it was previously [250,200] )
My Question: Why is there no significant difference in results?
MY RESULTS
MATRIX A_rz dimension [250,200]
real 0m6.661s
user 0m6.638s
sys 0m0.012s
MATRIX A_rz dimension [5,5]
real 0m6.508s
user 0m6.489s
sys 0m0.011s
**bMatMOD.f90
module bMatMOD
implicit none
type :: INPUT
integer :: nLayers = 1
integer :: nPhotons = 50000000
real, dimension (2) :: dZR = (/0.0004, 0.001/)
integer, dimension(3) :: nZRA = (/250,200,30/)
real, dimension (1) :: d = (/0.03/)
end type INPUT
type :: OUTPUT
real, allocatable :: Rd_ra(:,:)
real, allocatable :: A_rz(:,:)
real, allocatable :: Tt_ra(:,:)
end type OUTPUT
contains
subroutine initOUTPUTS (in_INPUT,out_OUTPUT)
type (INPUT), intent (in) :: in_INPUT
type (OUTPUT),intent (out) :: out_OUTPUT
allocate (out_OUTPUT%A_rz(in_INPUT%nZRA(2),in_INPUT%nZRA(1)))
allocate (out_OUTPUT%Rd_ra(in_INPUT%nZRA(2),in_INPUT%nZRA(3)))
allocate (out_OUTPUT%Tt_ra(in_INPUT%nZRA(2),in_INPUT%nZRA(3)))
out_OUTPUT%A_rz = 0.0
out_OUTPUT%Rd_ra = 0.0
out_OUTPUT%Tt_ra = 0.0
return
end subroutine initOUTPUTS
end module bMatMOD
**bMatRoutines.f90
subroutine A (o)
use bMatMOD
type (OUTPUT) :: o
real :: rnd1, rnd2
rnd1 = rand()
rnd2 = rand()
call B(o,rnd1,rnd2)
return
end subroutine A
subroutine B (o,x,y)
use bMatMOD
type (OUTPUT) :: o
real, intent (in) :: x
real, intent (in) :: y
integer, dimension(2) :: temp
integer :: i, j
temp = SHAPE(o%A_rz)
i = INT(temp(1)*y)
j = INT(temp(2)*x)
if ( i .eq. 0) then
i = 1
endif
if (i .eq. temp(1)) then
i = i - 1
endif
if (j .eq. 0) then
j = 1
endif
if (j .eq. temp(2)) then
j = j - 1
endif
o%A_rz(i,j) = o%A_rz(i,j) + x + y
return
end subroutine B
**bMatmcml.f90
program bMatmcml
use bMatMOD
implicit none
type (INPUT) :: u
type (OUTPUT) :: o
integer :: i
call initOUTPUTS(u,o)
call srand(0)
do i = 1,u%nPhotons,1
call A(o)
enddo
end program bMatmcml
**bMat.sh
rm -f *.o *~ *.exe
echo "MATRIX A_rz dimension [250,200]"
gfortran bMatMOD.f90 bMatRoutines.f90 bMatmcml.f90 -g -Wall -Werror -O3 -ffast-math -o bMat.exe
time ./bMat.exe
echo "MATRIX A_rz dimension [5,5]"
gfortran bMatMOD-v1.f90 bMatRoutines.f90 bMatmcml.f90 -g -Wall -Werror -O3 -ffast-math -o bMat-v1.exe
time ./bMat.exe

Fortran: Call subroutine that returns unknow size vectors and array

I have a problem calling a subfunction that returns vectors and arrays with unknown size. I dont know how to declear the variables that I call the function with. The program does compile, but crash with error message:
Minnesegmentsfeil (core dumped)
Which is something like Memory segmentation fault in english.
Here is my main program.
program transfer
! use konstanter
implicit none
! real :: f3eksitasjon
! real :: amp, w, F3a, eta3a
real, allocatable, dimension(:) :: Aw, Vp, Vtot, Ym, Zm, Zt, Ds
real, allocatable, dimension(:,:) :: Spos
integer, allocatable, dimension(:) :: Ns, Dir
integer :: i, N
call geometry(N, Aw, Vp, Vtot, Ym, Zm, Zt, Ds, Ns, Spos, DIR)
print *, N, Aw(1), Vp(1), Vtot(1), Ym(1), Zm(1), Ds(1), Ns(1), Spos(1,1), Spos(1,2), Dir(1)
end program transfer
The subroutine geometry reads from a file. The length of the vectors and size of array is dependent on the file it reads.
subroutine geometry(N, Aw, Vp, Vtot, Ym, Zm, Zt, Ds, Ns, Spos, DIR)
implicit none
integer, intent(out) :: N
real, dimension(:), allocatable, intent(out):: Aw, Vp, Vtot, Ym, Zm, Zt, Ds
real, dimension(:,:), allocatable, intent(out) :: Spos
integer, dimension(:),allocatable, intent(out) :: Ns, DIR
real, dimension(:), allocatable :: Bp, Hp, Lp, Vs
real, parameter :: pi = 4.0*atan(1.0)
integer :: i
open(unit = 11, file = "semisub.dat")
read(11,*) N
allocate(Aw(N))
allocate(Vp(N))
allocate(Vtot(N))
allocate(Ym(N))
allocate(Zm(N))
allocate(Zt(N))
allocate(Ds(N))
allocate(Spos(N,4))
allocate(Ns(N))
allocate(DIR(N))
allocate(Hp(N))
do i = 1,N
read(11,*) Bp(i), Hp(i), Lp(i), Ym(i), Zm(i), DIR(i)
read(11,*) Ns(i), Ds(i)
If (Ns(i) > 8) STOP "Feil i semisub.dat. For mange sOyler"
read(11,*) Spos(i,1:Ns(i))
end do
Zt(:) = Zm(:) + Hp(:)/2
Aw(:) = 2 * Ds(:) * Ns(:)
Vp(:) = 2 * Lp(:) * Bp(:) * Hp(:)
Vs(:) = 2 * Ns(:) * pi * (Ds/2)**2 * (-Zt(:))
Vtot(:) = Vp(:) + Vs(:)
end subroutine geometry
The code, as shown, lacks an explicit interface for the subroutine in the scope of the main program. An explicit interface is required for that subroutine because some of the arguments are allocatable.
(If an explicit interface is available it means that the compiler explicitly knows what the interface of the subroutine looks like - the compiler needs this information because it typically has to do additional work to pass an allocatable object to an allocatable dummy argument.)
Putting the subroutine in a module and then USEing that module in the specification part of the main program is one way of giving the subroutine an explicit interface. Another way is to make the subroutine an internal procedure (put it after a contains statement in the main program). A third way is to put an interface block for the subroutine in the specification part of the main program.

Resources