Efficient (Fast) Binary Tree in Fortran - performance

I am using the procedure in the following code (that I took from here) to a program that I am trying to make run as fast as possible. The procedure is, however, very slow since it is probably optimized for pedagogical purposes not speed.
program tree_sort
! Sorts a file of integers by building a
! tree, sorted in infix order.
! This sort has expected behavior n log n,
! but worst case (input is sorted) n ** 2.
implicit none
type node
integer :: value
type (node), pointer :: left, right
end type node
type (node), pointer :: t ! A tree
integer :: number, ios
nullify (t) ! Start with empty tree
do
read (*, *, iostat = ios) number
if (ios < 0) exit
call insert (t, number) ! Put next number in tree
end do
! Print nodes of tree in infix order
call print_tree (t)
contains
recursive subroutine insert (t, number)
type (node), pointer :: t ! A tree
integer, intent (in) :: number
! If (sub)tree is empty, put number at root
if (.not. associated (t)) then
allocate (t)
t % value = number
nullify (t % left)
nullify (t % right)
! Otherwise, insert into correct subtree
else if (number < t % value) then
call insert (t % left, number)
else
call insert (t % right, number)
end if
end subroutine insert
recursive subroutine print_tree (t)
! Print tree in infix order
type (node), pointer :: t ! A tree
if (associated (t)) then
call print_tree (t % left)
print *, t % value
call print_tree (t % right)
end if
end subroutine print_tree
end program tree_sort
Is there any way to speed it up? I am using the procedure to sequentially add elements to a vector without adding repeated ones (so I changed the else in the insert subroutine to else if (number > t % value) then. Other than that, instead of printing I store the values in a global variable.
Edit:
Here is the actual code:
MODULE MOD_PARAMETERS
USE, INTRINSIC :: ISO_FORTRAN_ENV
IMPLICIT NONE
SAVE
INTEGER(INT32), PARAMETER :: d = 10 ! number of dimensions
INTEGER(INT32), PARAMETER :: L_0 = 5 ! after this adaptive grid kicks in, for L <= L_0 usual sparse grid
INTEGER(INT32), PARAMETER :: L_max = 5 ! maximum level
INTEGER(INT32), PARAMETER :: bound = 1 ! 0 -> for f = 0 at boundary
! 1 -> adding grid points at boundary
! 2 -> extrapolating close to boundary
INTEGER(INT32), PARAMETER :: testing_sample = 10**4
INTEGER(INT32), PARAMETER :: error_sample = 10**2
REAL(REAL64), PARAMETER :: eps = 0.001D0 ! epsilon for adaptive grid
TYPE NODE
INTEGER :: value
TYPE (NODE), POINTER :: left, right
END TYPE NODE
INTEGER(INT32), DIMENSION(:), ALLOCATABLE :: tree_vector
INTEGER(INT32) :: iii
END MODULE MOD_PARAMETERS
SUBROUTINE FF(x,output)
USE MOD_PARAMETERS
IMPLICIT NONE
REAL(REAL64), DIMENSION(d), INTENT(IN) :: x
REAL(REAL64) , INTENT(OUT) :: output
output = 1.0D0/(ABS(0.5D0-SUM(x(:)**4.0D0))+0.1D0)
END SUBROUTINE
SUBROUTINE XX(n,L,i,output)
USE MOD_PARAMETERS
IMPLICIT NONE
INTEGER(INT32) , INTENT(IN) :: n
INTEGER(INT32), DIMENSION(n), INTENT(IN) :: L, i
REAL(REAL64), DIMENSION(n), INTENT(OUT) :: output
INTEGER(INT32) :: j
DO j = 1,n
IF ((bound .EQ. 0) .OR. (bound .EQ. 2)) THEN
output(j) = REAL(i(j),REAL64)/REAL(2**L(j),REAL64)
ELSEIF (bound .EQ. 1) THEN
output(j) = REAL(i(j),REAL64)/REAL(2**MAX(L(j)-1,1),REAL64)
ENDIF
ENDDO
END SUBROUTINE
SUBROUTINE XX_INV(L,x,output)
USE MOD_PARAMETERS
IMPLICIT NONE
INTEGER(INT32), DIMENSION(d), INTENT(IN) :: L
REAL(REAL64), DIMENSION(d), INTENT(IN) :: x
INTEGER(INT32), DIMENSION(d), INTENT(OUT) :: output
INTEGER(INT32) :: j
DO j = 1,d
IF ((bound .EQ. 0) .OR. (bound .EQ. 2)) THEN
output(j) = 2*FLOOR(x(j)*REAL(2**(L(j)-1),REAL64))+1
ELSEIF (bound .EQ. 1) THEN
IF (L(j) .EQ. 2) THEN
IF (x(j) .LT. 0.5D0) THEN
output(j) = 0
ELSE
output(j) = 2
ENDIF
ELSE
output(j) = 2*FLOOR(x(j)*(REAL(2**MAX(L(j)-2,0),REAL64)))+1
ENDIF
ENDIF
ENDDO
END SUBROUTINE
SUBROUTINE BASE(x,L,i,output)
USE MOD_PARAMETERS
IMPLICIT NONE
REAL(REAL64), INTENT(IN) :: x
INTEGER(INT32), INTENT(IN) :: L,i
REAL(REAL64), INTENT(OUT) :: output
IF (bound .EQ. 0) THEN
output = MAX((1.0D0-ABS(x*REAL(2**L,REAL64)-REAL(i,REAL64))),0.0D0)
ELSEIF (bound .EQ. 1) THEN
IF ((L .EQ. 1) .AND. (i .EQ. 1)) THEN
output = 1.0D0
ELSEIF ((L .EQ. 2) .AND. (i .EQ. 0)) THEN
output = MAX(1.0D0-2.0D0*x,0.0D0)
ELSEIF ((L .EQ. 2) .AND. (i .EQ. 2)) THEN
output = MAX(2.0D0*x-1.0D0,0.0D0)
ELSE
output = MAX((1.0D0-ABS(x*REAL(2**(L-1),REAL64)-REAL(i,REAL64))),0.0D0)
ENDIF
ELSEIF (bound .EQ. 2) THEN
IF ((L .EQ. 1) .AND. (i .EQ. 1)) THEN
output = 1.0D0
ELSEIF ((L .GT. 1) .AND. (i .EQ. 1)) THEN
output = MAX(2.0D0-REAL(2**L,REAL64)*x,0.0D0)
ELSEIF ((L .GT. 1) .AND. (i .EQ. (2**L)-1)) THEN
output = MAX(REAL(2**L,REAL64)*x+REAL(1-i,REAL64),0.0D0)
ELSE
output = MAX((1.0D0-ABS(x*REAL(2**L,REAL64)-REAL(i,REAL64))),0.0D0)
ENDIF
ENDIF
END SUBROUTINE
PROGRAM MAIN
USE MOD_PARAMETERS
IMPLICIT NONE
INTEGER(INT32), DIMENSION(d,d) :: ident
REAL(REAL64), DIMENSION(1) :: x1
REAL(REAL64), DIMENSION(d) :: xd
INTEGER(INT32), DIMENSION(2*d) :: temp
INTEGER(INT32), DIMENSION(:,:), ALLOCATABLE :: grid_index, temp_grid_index, grid_index_new, J_index, &
adj_list, temp_adj_list
INTEGER(INT32), DIMENSION(:), ALLOCATABLE :: to_do, to_do_new, to_add_ind
REAL(REAL64), DIMENSION(:), ALLOCATABLE :: coeff, temp_coeff, J_coeff
REAL(REAL64) :: temp_min, temp_max, V, T, B, F
INTEGER(INT32) :: i, k, k1, k2, h, j, L, n, dd, dsize, count, count1, count2, count3, flag, &
first, repeated, add, ind, adj_list_ind
INTEGER(INT32) :: time1, time2, time3, time4, clock_rate, clock_max
INTEGER(INT32), DIMENSION(d) :: LL, ii
REAL(REAL64), DIMENSION(error_sample,d) :: sample_x
REAL(REAL64), DIMENSION(error_sample) :: sample_e, interp1
REAL(REAL64) :: max_error, L2_error
REAL(REAL64), DIMENSION(testing_sample,d) :: x_rand
REAL(REAL64), DIMENSION(testing_sample) :: interp2
TYPE(NODE), POINTER :: tree
! ============================================================================
! EXECUTABLE
! ============================================================================
ident = 0
DO i = 1,d
ident(i,i) = 1
ENDDO
! Initial grid point
dsize = 1
ALLOCATE(grid_index(dsize,2*d),grid_index_new(dsize,2*d),adj_list(dsize,2*d))
grid_index(1,:) = 1
grid_index_new = grid_index
adj_list = 0
ALLOCATE(coeff(0:dsize))
coeff(0) = 0.0D0
xd = 0.5D0
CALL FF(xd,coeff(1))
L = 1
n = SIZE(grid_index_new,1)
ALLOCATE(J_index(n*2*d,2*d))
ALLOCATE(J_coeff(n*2*d))
ALLOCATE(to_add_ind(1))
to_add_ind = 1
CALL RANDOM_NUMBER(sample_x)
sample_e = 0.0D0
CALL SYSTEM_CLOCK (time1,clock_rate,clock_max)
DO WHILE (L .LT. L_max)
CALL SYSTEM_CLOCK (time3,clock_rate,clock_max)
L = L+1
n = SIZE(grid_index_new,1)
count = 0
first = 1
DEALLOCATE(J_index,J_coeff)
ALLOCATE(J_index(n*2*d,2*d))
ALLOCATE(J_coeff(n*2*d))
J_index = 0
J_coeff = 0.0D0
DO k = 1,n
adj_list_ind = 0
DO i = 1,d
DO j = 1,2
IF ((bound .EQ. 0) .OR. (bound .EQ. 2)) THEN
temp = grid_index_new(k,:)+(/ident(i,:),ident(i,:)*(grid_index_new(k,d+i)-(-1)**j)/)
ELSEIF (bound .EQ. 1) THEN
IF (grid_index_new(k,i) .EQ. 1) THEN
temp = grid_index_new(k,:)+(/ident(i,:),ident(i,:)*(-(-1)**j)/)
ELSE
temp = grid_index_new(k,:)+(/ident(i,:),ident(i,:)*(grid_index_new(k,d+i)-(-1)**j)/)
ENDIF
ENDIF
CALL XX(d,temp(1:d),temp(d+1:2*d),xd)
temp_min = MINVAL(xd)
temp_max = MAXVAL(xd)
IF ((temp_min .GE. 0.0D0) .AND. (temp_max .LE. 1.0D0)) THEN
IF (first .EQ. 1) THEN
first = 0
count = count+1
J_index(count,:) = temp
V = 0.0D0
DO k1 = 1,SIZE(grid_index,1)
T = 1.0D0
DO k2 = 1,d
CALL XX(1,temp(k2),temp(d+k2),x1)
CALL BASE(x1(1),grid_index(k1,k2),grid_index(k1,k2+d),B)
T = T*B
ENDDO
V = V+coeff(k1)*T
ENDDO
CALL FF(xd,F)
J_coeff(count) = F-V
adj_list(to_add_ind(k),adj_list_ind+1) = dsize+count
adj_list_ind = adj_list_ind+1
ELSE
repeated = 0
DO h = 1,count
IF (SUM(ABS(J_index(h,:)-temp)) .EQ. 0) THEN
repeated = 1
adj_list(to_add_ind(k),adj_list_ind+1) = dsize+h
adj_list_ind = adj_list_ind+1
ENDIF
ENDDO
IF (repeated .EQ. 0) THEN
count = count+1
J_index(count,:) = temp
V = 0.0D0
DO k1 = 1,SIZE(grid_index,1)
T = 1.0D0
DO k2 = 1,d
CALL XX(1,temp(k2),temp(d+k2),x1)
CALL BASE(x1(1),grid_index(k1,k2),grid_index(k1,k2+d),B)
T = T*B
ENDDO
V = V+coeff(k1)*T
ENDDO
CALL FF(xd,F)
J_coeff(count) = F-V
adj_list(to_add_ind(k),adj_list_ind+1) = dsize+count
adj_list_ind = adj_list_ind+1
ENDIF
ENDIF
ENDIF
ENDDO
ENDDO
ENDDO
ALLOCATE(temp_grid_index(dsize,2*d))
ALLOCATE(temp_coeff(dsize))
temp_grid_index = grid_index
temp_coeff = coeff
DEALLOCATE(grid_index,coeff)
ALLOCATE(grid_index(dsize+count,2*d))
ALLOCATE(coeff(0:dsize+count))
grid_index(1:dsize,:) = temp_grid_index
coeff(0:dsize) = temp_coeff
DEALLOCATE(temp_grid_index,temp_coeff)
grid_index(dsize+1:dsize+count,:) = J_index(1:count,:)
coeff(dsize+1:dsize+count) = J_coeff(1:count)
IF (L .LT. L_max) THEN ! put this after error threshhold when implemented
ALLOCATE(temp_adj_list(dsize,2*d))
temp_adj_list = adj_list
DEALLOCATE(adj_list)
ALLOCATE(adj_list(dsize+count,2*d))
adj_list = 0
adj_list(1:dsize,:) = temp_adj_list
DEALLOCATE(temp_adj_list)
ENDIF
dsize = dsize + count
IF (L .LE. L_0) THEN
DEALLOCATE(grid_index_new)
ALLOCATE(grid_index_new(count,2*d))
grid_index_new = J_index(1:count,:)
DEALLOCATE(to_add_ind)
ALLOCATE(to_add_ind(count))
to_add_ind = dsize-count + (/ (h,h=1,count) /)
ELSE
DEALLOCATE(to_add_ind)
ALLOCATE(to_add_ind(count))
add = 0
to_add_ind = 0
DO h = 1,count
IF (ABS(J_coeff(h)) .GT. eps) THEN
add = add + 1
J_index(add,:) = J_index(h,:)
to_add_ind(add) = dsize-count+h
ENDIF
ENDDO
DEALLOCATE(grid_index_new)
ALLOCATE(grid_index_new(add,2*d))
grid_index_new = J_index(1:add,:)
ENDIF
DO i = 1,error_sample
V = 0.0D0
DO k1 = 1,SIZE(grid_index,1)
T = 1.0D0
DO k2 = 1,d
CALL BASE(sample_x(i,k2),grid_index(k1,k2),grid_index(k1,k2+d),B)
T = T*B
ENDDO
V = V+coeff(k1)*T
ENDDO
CALL FF(sample_x(i,:),F)
sample_e(i) = F-V
interp1(i) = V
ENDDO
max_error = MAXVAL(ABS(sample_e))
L2_error = (SUM(sample_e**2.0D0)/REAL(error_sample,REAL64))**0.5D0
CALL SYSTEM_CLOCK (time4,clock_rate,clock_max)
WRITE(*,'(A,I5,A,F10.5,A,I8,A,F15.10,A,F15.10)') ' level = ', L,&
' time = ',REAL(time4-time3,REAL64)/REAL(clock_rate,REAL64),&
' grid points = ',SIZE(grid_index,1),&
' max error = ',max_error,&
' L2 error = ',L2_error
ENDDO
!PRINT *, ' '
!PRINT *, ' '
!PRINT *, ' '
!DO i = 1,SIZE(adj_list,1)
! PRINT *, i, adj_list(i,:)
!ENDDO
!PRINT *, ' '
!PRINT *, ' '
!PRINT *, ' '
!DO i = 1,dsize
! PRINT *, i, grid_index(i,:), coeff(i)
!ENDDO
!PRINT *, ' '
!PRINT *, ' '
!PRINT *, ' '
ALLOCATE (to_do(dsize),to_do_new(dsize),tree_vector(dsize))
CALL SYSTEM_CLOCK (time2,clock_rate,clock_max)
PRINT *, ' '
WRITE(*,'(A,F10.5)') ' total time for setup = ', REAL(time2-time1,REAL64)/REAL(clock_rate,REAL64)
! ============================================================================
! Compute interpolated values:
! ============================================================================
IF (testing_sample .EQ. error_sample) THEN
! x_rand = sample_x
ELSE
CALL RANDOM_NUMBER(x_rand)
ENDIF
count1 = 0
count2 = 0
count3 = 0
CALL SYSTEM_CLOCK (time1,clock_rate,clock_max)
DO i = 1,testing_sample
V = 0.0D0
to_do = 0
to_do(1) = 1
to_do_new = 0
k = 1
DO L = 1,L_max
NULLIFY (tree)
tree_vector = 0
CALL SYSTEM_CLOCK (time3,clock_rate,clock_max)
DO j = 1,k
ind = to_do(j)
T = 1.0D0
DO dd = 1,d
CALL BASE(x_rand(i,dd),grid_index(ind,dd),grid_index(ind,d+dd),B)
T = T*B
ENDDO
V = V + coeff(ind)*T
ENDDO
CALL SYSTEM_CLOCK (time4,clock_rate,clock_max)
count1 = count1 + time4-time3
IF (L .LT. L_max) THEN
n = k
k = 0
DO j = 1,n
IF (adj_list(to_do(j),1) .GT. 0) THEN
DO h = 1,2*d
CALL SYSTEM_CLOCK (time3,clock_rate,clock_max)
LL = grid_index(adj_list(to_do(j),h),1:d)
ii = grid_index(adj_list(to_do(j),h),d+1:2*d)
flag = 0
k1 = 1
DO WHILE ((flag .EQ. 0) .AND. (k1 .LE. d))
IF ((bound .EQ. 0) .OR. (bound .EQ. 2)) THEN
k2 = 2*FLOOR(x_rand(i,k1)*REAL(2**(LL(k1)-1),REAL64))+1
ELSEIF (bound .EQ. 1) THEN
IF (LL(k1) .EQ. 2) THEN
IF (x_rand(i,k1) .LT. 0.5D0) THEN
k2 = 0
ELSE
k2 = 2
ENDIF
ELSE
k2 = 2*FLOOR(x_rand(i,k1)*(REAL(2**MAX(LL(k1)-2,0),REAL64)))+1
ENDIF
ENDIF
IF (k2 .NE. ii(k1)) THEN
flag = 1
ENDIF
k1 = k1 +1
ENDDO
CALL SYSTEM_CLOCK (time4,clock_rate,clock_max)
count2 = count2 + time4-time3
! CALL SYSTEM_CLOCK (time3,clock_rate,clock_max)
IF (flag .EQ. 0) THEN
!IF (MINVAL(ABS(to_do_new(1:MAX(k,1))-adj_list(to_do(j),h))) .GT. 0) THEN
to_do_new(k+1) = adj_list(to_do(j),h)
k = k+1
CALL SYSTEM_CLOCK (time3,clock_rate,clock_max)
CALL INSERT(tree,to_do_new(k))
CALL SYSTEM_CLOCK (time4,clock_rate,clock_max)
count3 = count3 + time4-time3
!ENDIF
ENDIF
! CALL SYSTEM_CLOCK (time4,clock_rate,clock_max)
! count3 = count3 + time4-time3
ENDDO
ENDIF
ENDDO
CALL SYSTEM_CLOCK (time3,clock_rate,clock_max)
iii = 0
CALL PRINT_TREE(tree)
to_do = tree_vector
CALL SYSTEM_CLOCK (time4,clock_rate,clock_max)
count3 = count3 + time4-time3
!to_do = to_do_new
to_do_new = 0
ENDIF
ENDDO
interp2(i) = V
ENDDO
CALL SYSTEM_CLOCK (time2,clock_rate,clock_max)
PRINT *, ' '
WRITE(*,'(A,F10.5,A,I10)') ' time for interpolation = ', REAL(time2-time1,REAL64)/REAL(clock_rate,REAL64),&
' points = ', testing_sample
PRINT *, ' '
WRITE(*,'(A,F10.5)') ' time for base = ', REAL(count1,REAL64)/REAL(clock_rate,REAL64)
PRINT *, ' '
WRITE(*,'(A,F10.5)') ' time for x_inv = ', REAL(count2,REAL64)/REAL(clock_rate,REAL64)
PRINT *, ' '
WRITE(*,'(A,F10.5)') ' time for repeated = ', REAL(count3,REAL64)/REAL(clock_rate,REAL64)
!PRINT *, ' '
!WRITE(*,'(A,F20.15)') ' check = ', MAXVAL(ABS(interp2-interp1))
DEALLOCATE(grid_index,grid_index_new,J_index,coeff,J_coeff,adj_list,to_do,to_do_new,to_add_ind,tree_vector)
CONTAINS
RECURSIVE SUBROUTINE INSERT(tree,number)
TYPE(NODE), POINTER :: tree
INTEGER(INT32), INTENT(IN) :: number
IF (.NOT. ASSOCIATED(tree)) THEN
ALLOCATE(tree)
tree%value = number
NULLIFY(tree%left)
NULLIFY(tree%right)
ELSEIF (number .LT. tree%value) THEN
CALL INSERT (tree%left,number)
ELSEIF (number .GT. tree%value) THEN
CALL INSERT(tree%right,number)
ENDIF
END SUBROUTINE INSERT
RECURSIVE SUBROUTINE PRINT_TREE(tree)
TYPE (NODE), POINTER :: tree
IF (ASSOCIATED(tree)) THEN
CALL PRINT_TREE(tree%left)
iii = iii+1
tree_vector(iii) = tree%value
CALL PRINT_TREE (tree%right)
END IF
END SUBROUTINE PRINT_TREE
END PROGRAM
I am using optimization O3 but otherwise no flags. In my computer the time for repeated (which is where I am using the binary tree) is 18.3 seconds, whereas if I use an alternative method that is commented in the version (with MINVAL) it only takes 3.6 seconds.

Related

How to improve performance of the do loop by using openMP?

As shown below, this code snippet aims to compute two arrays, i.e. data_real and data_imag. Their shapes are both 1024*10000. I want to speed up the computation of DO loops by using OpenMP. But I'm an absolute beginner of openMP. I am not very clear how to parallelizing a loop with dependent iterations, such as the statements temp2 = dx * temp1(2*i), temp3 = dx * temp1(2*i+1) of the snippet below. I mean, if there are race conditions in this code snippet.
Is there a way to speed up the do loops as shown below?
Note: Four1 is the subroutine used to perform FFT, sinc2 is the square of the sinc function.
!Declare variables
Real, Allocatable, Dimension(:,:) :: data_complex, data_real, data_imag
Real, Dimension(0:2*1024-1) :: temp1
Real :: temp2, temp3
!Allocate
Allocate( data_real(0:1024-1,0:10000-1), &
data_imag(0:1024-1,0:10000-1), &
data_complex(0:2*1024-1,0:10000-1), STAT=istat1 )
!Initialise
data_real = 0.0
data_imag = 0.0
data_complex = 0.0
dk = 2*3.14159 / 75.0
!$OMP Parallel num_threads(24) private(i,j,k,temp1,temp2,temp3) shared( dk)
!$OMP Do schedule(dynamic,1)
Do j = 0, 10000-1
temp1(:) = data_complex(:,j)
Call Four1(temp1, 1024, 1) ! Calling the subroutine 'Four1' to
! perform Fast Fourier Transform
Do i = 0, 1023
k = dk * Real(i)
temp2 = dx * temp1(2*i)
temp3 = dx * temp1(2*i+1)
data_real(i,j) = temp2 / sinc2( dx * k / 2 ) ! sinc2(x) = sin(x) / x
data_imag(i,j) = temp3 / sinc2( dx * k / 2 )
End Do
End Do
!$OMP End Do nowait
!$OMP End Parallel
! --------------------------------------------------------------- !
! ----------------------------------------------------------------!
Subroutine Four1(data_complex, nn, isign)
Integer, Intent(in) :: nn
Integer, Intent(in) :: isign
Real, Intent(inout), Dimension(2*nn) :: data_complex
Integer :: i, istep, j, m, mmax, n
Real :: tempi, tempr
Real(8) :: theta, wi, wpi, wpr, wr, wtemp
! ---------------------------------------------------------
n=2*nn
j=1
Do i=1,n,2
If(j>i) then
tempr=data_complex(j)
tempi=data_complex(j+1)
data_complex(j)=data_complex(i)
data_complex(j+1)=data_complex(i+1)
data_complex(i)=tempr
data_complex(i+1)=tempi
endif
m=n/2
Do while ( (m>=2).and.(j>m) )
j=j-m
m=m/2
End do
j=j+m
EndDo
mmax=2
Do while ( n > mmax )
istep=2*mmax
theta=(2*pi)/(isign*mmax)
wpr=-2.0d0*sin(0.5d0*theta)**2
wpi=sin(theta)
wr=1.0d0
wi=0.0d0
Do m=1,mmax,2
Do i=m,n,istep
j=i+mmax
tempr=Real(wr)*data_complex(j)-Real(wi)*data_complex(j+1)
tempi=Real(wr)*data_complex(j+1)+Real(wi)*data_complex(j)
data_complex(j)=data_complex(i)-tempr
data_complex(j+1)=data_complex(i+1)-tempi
data_complex(i)=data_complex(i)+tempr
data_complex(i+1)=data_complex(i+1)+tempi
End Do
wtemp=wr
wr=wr*wpr-wi*wpi+wr
wi=wi*wpr+wtemp*wpi+wi
End Do
mmax=istep
End Do
End Subroutine Four1
! ------------------------------------------------------------ !
! ------------------------------------------------------------ !
Real Function sinc2 ( x )
!
! Define the square of sinc function
!
Real, Intent(in) :: x
If ( abs(x) < 1.e-16 ) then
! be careful with comparison to real numbers because of rounding errors
! better: if (abs(x).lt.1.e-16) thensinc=1.
sinc2 = 1.0
Else
sinc2 = ( sin(x)/x )**2
End If
End Function sinc2

MPI program for the Poisson equation stuck after one iteration

I am trying to solve the Poisson Equation in a Square domain [(0,1)--(0,1)] using MPI and overlapping domains. Currently, my code takes an input of the number of domain divisions on X and Y directions, the length of the overlap between two domains as a function of domain length and the number of elemntal divisions in the overlap.
Input file looks like
2,2
10,10
10,10
program main
!implicit none
include 'mpif.h'
integer cols, divfx, divfy, iter
integer xdiv, ydiv, info, max_iter, x_shift, y_shift
integer, allocatable:: ipiv(:)
double precision, allocatable :: A(:,:), Ainv(:,:)
real, allocatable:: edge(:,:)
double precision, allocatable :: u(:,:), f(:,:)
double precision, allocatable :: u_exact(:,:)
allocatable :: Left(:,:), Right(:,:)
allocatable :: Top(:,:), Bottom(:,:)
allocatable :: TempLeft(:,:), TempRight(:,:)
allocatable :: TempTop(:,:), TempBottom(:,:)
integer myid, master, numprocs, ierr, status(MPI_STATUS_SIZE)
integer i, j, numsent, sender, L, T, R, B
integer anstype, row, dovfx, dovfy, domx, domy, idx
real dom1,dom2,buff
double precision mesh(2), buffer(4), divx, divy, dx, dy, xd, yd
double precision error, derror, error_norm
character(len=100) :: domaindata
call MPI_INIT(ierr)
call MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr)
call MPI_COMM_SIZE(MPI_COMM_WORLD, numprocs, ierr)
master = 0
divx=0.d0
divy=0.d0
! Input the number of divisions for domain decomposition and calculate sub-domain dimensions.
open(1, file='Inputdata.dat', status='old')
! read(1,*) domx,domy
! read(1,*) dovfx,dovfy
! read(1,*) divfx,divfy
write(*,*)'Starting the Program'
write(*,*) "Enter the number of domain divisions in x-direction &
and y-direction ( Enter 4 if you want three sub-domains)"
read(1,*) domx,domy
write(*,*) domx,domy
write(*,*) "Total number of sub-domains for the problem"
write(*,*) domx*domy
write(*,*) "Enter the sub-domain overlap in x & y -direction as &
a fraction of sub-domain length (multiple of 10)"
read(1,*) dovfx,dovfy
write(*,*) dovfx,dovfy
write(*,*) "Enter the number of divisions in the overlap in &
x & yas a fraction of sub-domain(multiple of 5)"
read(1,*) divfx,divfy
write(*,*) divfx,divfy
divx=1.d0/(((1.d0/domx)/dovfx)/divfx)
divy=1.d0/(((1.d0/domy)/dovfy)/divfy)
write(*,*)"Total number of elemental divisions for the &
problem domain (0,1) in both dimensions"
write(*,*) divx, divy, divx*divy
write(*,*)"Total number of nodal divisions for the problem domain"
write(*,*) (divx+1)*(divy+1)
! time
! **************************
tic = MPI_Wtime();
! Maximum number of iterations.
max_iter=100
! Mesh Size
mesh(1)=1/divx
mesh(2)=1/divy
write(*,*) 'Element Size'
write(*,*) mesh(1), mesh(2)
if ( myid .eq. master ) then
! Send iteration number to subdomain and receive the error from each to
! calculate total error.
write(*,*) 'still1'
do 10 iter = 1,max_iter
do 20 i = 1,domx*domy
call MPI_SEND(iter, 1, MPI_INTEGER, i, i, MPI_COMM_WORLD, ierr)
20 continue
! Receive results obtained from sub-processor/sub-domain
!
error = 0.d0
do 30 i = 1,domx*domy
call MPI_RECV(d_error , 1, MPI_DOUBLE_PRECISION, i, iter, &
MPI_COMM_WORLD, status, ierr)
error = error + d_error
30 continue
write(*,*) 'In iteration ', iter, 'cumulative error is', error*1.d0/domx/domy
10 continue
! time:
! *************
toc = MPI_Wtime();
! Write results to output
! **************************
write(*,*)
write(*,*) 'Time taken for parallel computation is: ',(toc-tic)*1000, 'miliseconds'
else
!************************************ Slaves receive mesh size for discretization ******************************************************
write(*,*) 'iter', iter
write(*,*) 'myid', myid
! Slaves receive corners, then creates a Cartesian grid for finite
! difference until done message received, for one iteration.
! This is done for the first iteration
! Get Domain ID :
if (myid.gt.(domx*domy)) goto 200
write(*,*) 'still31'
1000 call MPI_RECV(iter, 1, MPI_INTEGER, master, MPI_ANY_TAG, MPI_COMM_WORLD, status, ierr)
if (status(MPI_TAG) .eq. 0) goto 200
write(*,*) 'still4'
if (iter.eq.1) then
write(*,*) 'still5'
dom1=domx
dom2=domy
allocate (edge(domx*domy,4))
! Determining the edge matrices for each subdomain - the bounding box
do j =1,domx
do k=1,domy
idx=(j-1)*(domx-1)+k+(j-1)
buff=REAL((mod(idx-1,domx)))/domx
IF (buff-((1.d0/domx)/dovfx) .gt. 0) THEN
buff=buff-((1.d0/domx)/dovfx)
ENDIF
edge(idx,1) = buff
IF ((mod(idx ,domx)) .eq. 0) THEN
buff=1
ELSE
buff=REAL(mod(idx ,domx))/domx
ENDIF
!write(*,*) buff
IF (buff + ((1.d0/domx)/dovfx) .lt. 1) THEN
buff=buff+((1.d0/domx)/dovfx)
ENDIF
edge(idx,2) = buff
!
buff=REAL(floor((idx-1)/dom1))/dom1
IF (buff -((1.d0/domy)/dovfy) .gt. 0) THEN
buff=buff-((1.d0/domy)/dovfy)
ENDIF
edge(idx,3) = buff
buff=REAL(ceiling(idx/dom1))/dom1
IF (buff+((1.d0/domy)/dovfy) .lt. 1) THEN
buff= buff+((1.d0/domy)/dovfy)
ENDIF
edge(idx,4) = buff
end do
end do
write(*,*) myid, edge(myid,:)
write(*,*) 'iter', iter
call Surround_dom(myid,domx,domy,LeftC, RightC, BottomC, TopC)
! Calculate data for the matrices: Divisions in each subdomain. :
xdiv=(edge(myid,2)-edge(myid,1))/mesh(1)
ydiv=(edge(myid,4)-edge(myid,3))/mesh(2)
dx=mesh(1)
dy=mesh(2)
allocate (A((xdiv-1)*(ydiv-1),(xdiv-1)*(ydiv-1)))
allocate (Ainv((xdiv-1)*(ydiv-1),(xdiv-1)*(ydiv-1)))
allocate (u((xdiv-1)*(ydiv-1),1),f((xdiv-1)*(ydiv-1),1))
allocate (u_exact((xdiv-1)*(ydiv-1),1))
allocate (ipiv((xdiv-1)*(ydiv-1)))
allocate (Left((ydiv-1),1),Right((ydiv-1),1))
allocate (Top((xdiv-1),1), Bottom(((xdiv-1)),1))
allocate (TempLeft((ydiv-1),1),TempRight((ydiv-1),1))
allocate (TempTop((xdiv-1),1), TempBottom(((xdiv-1)),1))
Left = 0.d0; Right = 0.d0; Bottom = 0.d0; Top = 0.d0;
TempLeft = 0.d0; TempRight = 0.d0; TempBottom = 0.d0; TempTop = 0.d0;
A=0;
endif
write(*,*) 'still6'
! ******************************************************************
! SendReceive data based on location
! ******************************************************************
if (LeftC.ne.0) then
call MPI_SENDRECV(Left, ydiv - 1, MPI_DOUBLE_PRECISION, LeftC, iter, &
TempLeft, ydiv - 1, MPI_DOUBLE_PRECISION, LeftC, iter, MPI_COMM_WORLD, status, ierr)
end if
if (RightC.ne.0) then
call MPI_SENDRECV(Right, ydiv - 1, MPI_DOUBLE_PRECISION, RightC, iter, &
TempRight, ydiv - 1, MPI_DOUBLE_PRECISION, RightC, iter, MPI_COMM_WORLD, status, ierr)
end if
if (BottomC.ne.0) then
call MPI_SENDRECV(Bottom, xdiv - 1, MPI_DOUBLE_PRECISION, BottomC, iter, &
TempBottom, xdiv - 1, MPI_DOUBLE_PRECISION, BottomC, iter, MPI_COMM_WORLD, status, ierr)
end if
if (TopC.ne.0) then
call MPI_SENDRECV(Top, xdiv - 1, MPI_DOUBLE_PRECISION, TopC, iter, &
TempTop, xdiv - 1, MPI_DOUBLE_PRECISION, TopC, iter, MPI_COMM_WORLD, status, ierr)
end if
Left = TempLeft ;
Right = TempRight;
Top = TempTop ;
Bottom = TempBottom;
write(*,*) 'still7'
! Form the coefficient matrices
do i =1,(xdiv-1)*(ydiv-1)
A(i,i)=-2.d0*(1.d0/(dx**2)+1.d0/(dy**2))
enddo
do i=1,(xdiv-2)
do j=1,(ydiv-1)
A(i+(j-1)*(xdiv-1),i+(j-1)*(xdiv-1)+1)=1.d0/(dx**2)
A(i+(j-1)*(xdiv-1)+1,i+(j-1)*(xdiv-1))=1.d0/(dx**2)
enddo
enddo
do i=1,(xdiv-1)
do j=1,(ydiv-2)
A(i+(j-1)*(xdiv-1),i+(j)*(xdiv-1))=1.d0/(dy**2)
A(i+(j)*(xdiv-1),i+(j-1)*(xdiv-1))=1.d0/(dy**2)
enddo
enddo
write(*,*) 'still9'
L=1
T=1
R=1
B=1
write(*,*) 'still10'
! Impose Boundary Conditions in F matrix
do i=1,(xdiv-1)*(ydiv-1)
xd = edge(myid,1) + (dx)*mod(i,(xdiv-1))
if (mod(i,xdiv-1).eq.0) xd = edge(myid,1) + (dx)*(xdiv-1)
yd = edge(myid,3) + (dy)*ceiling(i*1.d0/(xdiv-1))
!if (iter.eq.1 .and. myid.eq.2) write(*,*) xd,yd
u_exact(i,1) = sin(2.d0*3.1415*xd)*sin(2.d0*3.1415*yd)
f(i,1) = 8.d0*3.1415*3.1415*u_exact(i,1)
IF (mod(i,(xdiv-1)) .eq. 1) THEN
f(i,1)= f(i,1)+Left(L,1)/dx/dx
L=L+1
ENDIF
IF (mod(i,(xdiv-1)) .eq. 0) THEN
f(i,1)=f(i,1)+Right(R,1)/dx/dx
R=R+1
ENDIF
IF (i .le. (xdiv-1)) THEN
f(i,1)=f(i,1)+Bottom(B,1)/dy/dy
B=B+1
ENDIF
IF (i .gt. (xdiv-1)*(ydiv-2)) THEN
f(i,1)=f(i,1)+Top(T,1)/dy/dy
T=T+1
END IF
! enddo
enddo
!Solve AU=F by LU factorization!
write(*,*) 'still11'
do i=1,(xdiv-1)*(ydiv-1)
do j=1,(xdiv-1)*(ydiv-1)
Ainv(i,j)=A(i,j)
end do
end do
! do i=1,(xdiv-1)*(ydiv-1)
! write(*,*) myid,Ainv(i,i)
!end do
call DGESV((xdiv-1)*(ydiv-1), 1, A, &
(xdiv-1)*(ydiv-1), ipiv, f, (xdiv-1)*(ydiv-1), info)
write(*,*) 'still12'
call ErrorNorm(f,u_exact,(xdiv-1)*(ydiv-1),error_norm)
write(*,*) 'still13'
! ****************************************************
! Update boundary conditions based on new solution:
! ****************************************************
x_shift = divfx-1 ;
y_shift = divfy-1 ;
! write(*,*) 'LeftC', myid,LeftC,RightC,TopC,BottomC
if (LeftC.ne.0) then
do 50 i = 1,ydiv - 1
Left(i,1) = f((xdiv - 1)*(i - 1) + 1 + x_shift,1)
!if ((myid.eq.2).and.(iter.eq.1)) write(*,*) 'for left',i, &
!(xdiv - 1)*(i - 1) + 1 + x_shift
50 continue
end if
if (RightC.ne.0) then
do 60 i = 1,ydiv - 1
Right(i,1) = f((xdiv - 1)*i - x_shift,1)
!if ((myid.eq.1).and.(iter.eq.1)) write(*,*) 'for right',i, &
!(xdiv - 1)*i - x_shift
60 continue
end if
if (TopC.ne.0) then
do 70 i = 1,xdiv - 1
Top(i,1) = f((xdiv - 1)*((ydiv - 2) - y_shift) + i,1)
!if ((myid.eq.1).and.(iter.eq.1)) write(*,*) 'for top',i, &
!((xdiv - 1)*((ydiv - 2) - y_shift) + i)
70 continue
end if
if ( BottomC.ne.0) then
do 80 i = 1,xdiv - 1
Bottom(i,1) = f((xdiv - 1)*y_shift + i,1)
!if ((myid.eq.3).and.(iter.eq.1)) write(*,*) 'for bottom',i, &
!((xdiv - 1)*y_shift + i)
80 continue
end if
write(*,*) 'still14'
TempLeft =Left;
TempRight = Right;
TempTop = Top;
TempBottom = Bottom;
call MPI_SEND(error_norm, 1, MPI_DOUBLE_PRECISION, master, iter, &
MPI_COMM_WORLD, ierr)
write(*,*) 'still15'
if (iter.lt.iter_max) go to 1000
! *********************************************************************************
! Write solution to data file to view the results.
! *********************************************************************************
write (domaindata, "(A7,I2,A4)") "domain_",myid,".dat"
open (unit=myid*10, file = domaindata)
write (myid*10,*) ' VARIABLE= "X","Y","U_EXACT","U_CALC" '
do i=1,(xdiv-1)*(ydiv-1)
xd = edge(myid,1) + (dx)*mod(i,(xdiv-1))
if (mod(i,xdiv-1).eq.0) xd = edge(myid,1) + (dx)*(xdiv-1)
yd = edge(myid,3) + (dy)*ceiling(i*1.d0/(xdiv-1))
write (myid*10,*) xd, yd, u_exact(i,1), f(i,1)
enddo
write(*,*) 'still16'
if (iter.eq.max_iter) go to 200
200 continue
write(*,*) 'still45'
endif
call MPI_FINALIZE(ierr)
stop
end program main
subroutine Surround_dom(myid,domx,domy,LeftID, RightID, BottomID, TopID)
implicit none
integer myid, j, k, domy, domx, BottomID, TopID, LeftID, RightID
j = ceiling(1.d0*myid/domx)
k = mod(myid,domx)
if (k.eq.0) k = domx
! Domain on the left
if(k.eq.1) then
LeftID = 0
else
LeftID = ((j-1)*domx + k) - 1
end if
! Domain on the Right
if(k.eq.domx) then
RightID = 0
else
RightID = ((j-1)*domx + k) + 1
end if
! Domain on the Bottom
if(j.eq.1) then
BottomID = 0
else
BottomID = ((j-1)*domx + k) - domx
end if
! Domain on the Top
if(j.eq.domy) then
TopID = 0
else
TopID = ((j-1)*domx + k) + domx
end if
return
end
subroutine ErrorNorm(u,u_exact,N,error_norm)
implicit none
double precision u(N), u_exact(N), err, error_norm
integer i, N
error_norm = 0.d0
do 10 i = 1,N
err = (u(i) - u_exact(i))
error_norm = error_norm + err*err
10 continue
error_norm = sqrt(error_norm)/(N*1.d0)
return
end
I expect the code to run through all the iterations giving me an respectable error about multiples of 1e-3/1e-4.?
Currently, no error shows up, the code successfully runs for 1 iteration and then doesn't produce any output at all, even after days. It would be really helpful if I could get some guidance. I am sorry since the structure of my code is a awkward, I am just a beginner. It won't run if number of domains is odd or if the number of domains is not equal to number of processors. Any suggestions on how to remove these limitations is also welcome.

Issues with setting random seed [duplicate]

This question already has an answer here:
Random numbers keep coming out the same, despite random seed being used
(1 answer)
Closed last year.
I am attempting to write a Montecarlo algorithm to simulate interaction between agents in a population. This algorithm needs to call two random numbers at each iteration (say, 10^9 iterations).
My issue here is that everytime I change the seed (to obtain different realizations), the RNG is giving me the same output (same Montecarlo events). I have tried different ways of implementing it with to no avail.
I am completely new to Fortran and copying this code from MATLAB. Am I doing something wrong in the way I'm implementing this code?
Below is what I tried:
program Gillespie
implicit none
integer*8, parameter :: n_max = 10.0**8 ! max. number of iterations
integer*8 :: t_ext, I_init, S_init, jump, S_now, I_now, i, u
real*8 :: t, N, a0, tau, st, r1, r2
real, parameter :: beta = 1000
real, parameter :: gammma = 99.98
real, parameter :: mu = 0.02
real, parameter :: R0 = beta/(gammma+mu)
integer :: seed = 11
real, dimension(n_max) :: S_new ! susceptible pop. array
real, dimension(n_max) :: I_new ! infected pop. array
real, dimension(n_max) :: t_new ! time array
real, dimension(5) :: events ! events array
open(unit=3, file='SIS_output.dat')
t = 0 ! initial time
N = 40 ! initial population size
jump = 1 ! time increment (save in uniform increments)
u = 2
t_ext = 0 ! extiction time
I_init = 2 ! initial infected pop.
S_init = N-I_init ! initial susceptible pop.
S_now = S_init
I_now = I_init
S_new(1) = S_init ! initialize susceptibles array
I_new(1) = I_init ! initialize infected array
t_new(1) = t ! initialize time array
write(3,*) t_new(1), S_new(1), I_new(1) ! write i.c. to array
call random_seed(seed)
do i=2, n_max
call random_number(r1)
call random_number(r2)
events(1) = mu*N ! Birth(S)
events(2) = mu*S_now ! Death(S)
events(3) = mu*I_now ! Death(I)
events(4) = (beta*S_now*I_now)/N ! Infection
events(5) = gammma*I_now ! Recovery
a0 = events(1)+events(2)+events(3)+events(4)+events(5)
tau = LOG(1/r1)*(1/a0) ! time increment
t = t + tau ! update time
st = r2*a0 ! stochastic time???
! update the populations
if (st .le. events(1)) then
S_now = S_now + 1
else if (st .gt. events(1) .AND. st .le.
#(events(1) + events(2))) then
S_now = S_now - 1
else if (st .gt. (events(1) + events(2)) .AND. st .le.
#(events(1) + events(2) + events(3))) then
I_now = I_now - 1
else if (st .gt. (events(1) + events(2) + events(3)) .AND.
#st .le. (events(1) + events(2) + events(3) + events(4))) then
S_now = S_now - 1
I_now = I_now + 1
else
S_now = S_now + 1
I_now = I_now - 1
end if
! save time in uniform increments
if(t .ge. jump) then
t_new(u) = floor(t)
S_new(u) = S_now
I_new(u) = I_now
write(3,*) t_new(u), S_new(u), I_new(u)
jump = jump+1
u = u+1
end if
! write(3,*) t_new(i), S_new(i), I_new(i)
!N = S_now + I_now ! update population post event
if(I_now .le. 0) then ! if extinct, terminate
print *, "extinct"
goto 2
end if
end do
2 end program Gillespie
I appreciate all input. Thanks.
Your call
call random_seed(seed)
is strange. I thought it should not be allowed without a keyword argument, but it actually is inquiring for the size of the random seed array.
For a proper way of initializing seed see the example in
https://gcc.gnu.org/onlinedocs/gfortran/RANDOM_005fSEED.html

Speed of dereferencing class properties in fortran

I expect answer like don't worry, compiler will take care of that but I wan't to be sure.
When I make some method in some custom type/class in fortran, is there any performance hit due to referencing/dereferencing fields of the object like this%a(i) = this%b(i) + this%c(i) in comparison to just working with arrays like a(i) = b(i) + c(i)
more complex example:
for example I have this function which should interpolate a value on 3D grid which is really performance critical (it would be called inside a triple loop over an other 3D array). So I'm thinking if it is better ( for performance) to make is using a method of the class, or rather make a normal subroutine which takes the array as an argument.
type grid3D ! 3D grid maps of observables
real, dimension (3) :: Rmin, Rmax, Rspan, step ! grid size and spacing (x,y,z)
integer, dimension (3) :: N ! dimension in x,y,z
real, dimension (3,:, :, :), allocatable :: f ! array storing values of othe observable
contains
procedure :: interpolate => grid3D_interpolate
end type grid3D
function grid3D_interpolate(this, R ) result(ff)
implicit none
! variables
class (grid3D) :: this
real, dimension (3), intent (in) :: R
real :: ff
integer ix0,iy0,iz0
integer ix1,iy1,iz1
real dx,dy,dz
real mx,my,mz
! function body
ix0 = int( (R(1)/this%step(1)) + fastFloorOffset ) - fastFloorOffset
iy0 = int( (R(2)/this%step(2)) + fastFloorOffset ) - fastFloorOffset
iz0 = int( (R(3)/this%step(3)) + fastFloorOffset ) - fastFloorOffset
dx = R(1) - x0*this%step(1)
dy = R(2) - y0*this%step(2)
dz = R(3) - z0*this%step(3)
ix0 = modulo( x0 , this%N(1) )+1
iy0 = modulo( y0 , this%N(2) )+1
iz0 = modulo( z0 , this%N(3) )+1
ix1 = modulo( x0+1 , this%N(1) )+1
iy1 = modulo( y0+1 , this%N(2) )+1
iz1 = modulo( z0+1 , this%N(3) )+1
mx=1.0-dx
my=1.0-dy
mz=1.0-dz
ff = mz*(my*(mx*this%f(ix0,iy0,iz0) &
+dx*this%f(ix1,iy0,iz0)) &
+dy*(mx*this%f(ix0,iy1,iz0) &
+dx*this%f(ix1,iy1,iz0))) &
+dz*(my*(mx*this%f(ix0,iy0,iz1) &
+dx*this%f(ix1,iy0,iz1)) &
+dy*(mx*this%f(ix0,iy1,iz1) &
+dx*this%f(ix1,iy1,iz1)))
end if
end function grid3D_interpolate
end module T_grid3Dvec
Not really.
As long as your code structure is quite clear (to the compiler), it can optimize that away quite easily.
Once your OOP structures get too complicated, or the level of dereferencing gets too large, you might get some improvement out of a manual dereferencing scheme. (I use that quite a lot, although usually to keep my code human-readable. But I had a little improvement here once, but with a code using >5 levels of dereferencing. )
Here is some example:
module vec_mod
implicit none
type t_vector
real :: x = 0.
real :: y = 0.
real :: z = 0.
end type
type t_group
type(t_vector),allocatable :: vecs(:)
end type
contains
subroutine sum_vec( vecs, res )
implicit none
type(t_vector),intent(in) :: vecs(:)
type(t_vector),intent(out) :: res
integer :: i
res%x = 0. ; res%y = 0. ; res%z = 0.
do i=1,size(vecs)
res%x = res%x + vecs(i)%x
res%y = res%y + vecs(i)%y
res%z = res%z + vecs(i)%z
enddo
end subroutine
subroutine sum_vec_ptr( vecs, res )
implicit none
type(t_vector),intent(in),target :: vecs(:)
type(t_vector),intent(out) :: res
integer :: i
type(t_vector),pointer :: curVec
res%x = 0. ; res%y = 0. ; res%z = 0.
do i=1,size(vecs)
curVec => vecs(i)
res%x = res%x + curVec%x
res%y = res%y + curVec%y
res%z = res%z + curVec%z
enddo
end subroutine
subroutine sum_vecGrp( vecGrp, res )
implicit none
type(t_group),intent(in) :: vecGrp
type(t_vector),intent(out) :: res
integer :: i
res%x = 0. ; res%y = 0. ; res%z = 0.
do i=1,size(vecGrp%vecs)
res%x = res%x + vecGrp%vecs(i)%x
res%y = res%y + vecGrp%vecs(i)%y
res%z = res%z + vecGrp%vecs(i)%z
enddo
end subroutine
subroutine sum_vecGrp_ptr( vecGrp, res )
implicit none
type(t_group),intent(in),target :: vecGrp
type(t_vector),intent(out) :: res
integer :: i
type(t_vector),pointer :: curVec, vecs(:)
res%x = 0. ; res%y = 0. ; res%z = 0.
vecs => vecGrp%vecs
do i=1,size(vecs)
curVec => vecs(i)
res%x = res%x + curVec%x
res%y = res%y + curVec%y
res%z = res%z + curVec%z
enddo
end subroutine
end module
program test
use omp_lib
use vec_mod
use,intrinsic :: ISO_Fortran_env
implicit none
type(t_vector),allocatable :: vecs(:)
type(t_vector) :: res
type(t_group) :: vecGrp
integer,parameter :: N=100000000
integer :: i, stat
real(REAL64) :: t1, t2
allocate( vecs(N), vecGrp%vecs(N), stat=stat )
if (stat /= 0) stop 'Cannot allocate memory'
do i=1,N
call random_number(vecs(i)%x)
call random_number(vecs(i)%y)
call random_number(vecs(i)%z)
enddo
print *,''
print *,'1 Level'
t1 = omp_get_wtime()
call sum_vec( vecs, res )
print *,res
t2 = omp_get_wtime()
print *,'Normal [s]:', t2-t1
t1 = omp_get_wtime()
call sum_vec_ptr( vecs, res )
print *,res
t2 = omp_get_wtime()
print *,'Pointer [s]:', t2-t1
print *,''
print *,'2 Levels'
vecGrp%vecs = vecs
t1 = omp_get_wtime()
call sum_vecGrp( vecGrp, res )
print *,res
t2 = omp_get_wtime()
print *,'Normal [s]:', t2-t1
t1 = omp_get_wtime()
call sum_vecGrp_ptr( vecGrp, res )
print *,res
t2 = omp_get_wtime()
print *,'Pointer [s]:', t2-t1
end program
Compiled with default options (gfortran test.F90 -fopenmp), three is a slight benefit from manually dereferencing, especially for two levels of dereferencing:
OMP_NUM_THREADS=1 ./a.out
1 Level
16777216.0 16777216.0 16777216.0
Normal [s]: 0.69216769299237058
16777216.0 16777216.0 16777216.0
Pointer [s]: 0.67321390099823475
2 Levels
16777216.0 16777216.0 16777216.0
Normal [s]: 0.84902219301147852
16777216.0 16777216.0 16777216.0
Pointer [s]: 0.71247501399193425
Once you turn on optimization (gfortran test.F90 -fopenmp -O3), you can see that the compiler actually does a better job automatically:
OMP_NUM_THREADS=1 ./a.out
1 Level
16777216.0 16777216.0 16777216.0
Normal [s]: 0.13888958499592263
16777216.0 16777216.0 16777216.0
Pointer [s]: 0.19099253200693056
2 Levels
16777216.0 16777216.0 16777216.0
Normal [s]: 0.13436777899914887
16777216.0 16777216.0 16777216.0
Pointer [s]: 0.21104205500159878

searching a prime number

I hope I am not duplication any question but the suggested topic did not provide with any similar problem. I have a function that check if a number is a prime one. Now this is the slowest possible way to search for a prime.
subroutine is_prime_slow(num, stat)
implicit none
logical :: stat
integer :: num
integer :: i
if ((num .le. 3) .and. (num .gt. 1)) then
stat = .true.
return
end if
! write(*,*) 'limit = ',limit
do i = 2,num - 1
! write(*,*) 'mod(',limit,i,') = ',mod(limit,i)
if (mod(num,i) == 0) then
stat = .false.
return
end if
end do
stat = .true.
return
end
Now let's say that I do some improvement to it.
subroutine is_prime_slow(num, stat)
implicit none
logical :: stat
integer :: num
integer :: i
if ((num .le. 3) .and. (num .gt. 1)) then
stat = .true.
return
end if
! IMPROVEMENT
if ((mod(num,2) == 0) .or. (mod(num,3) == 0) .or. (mod(num,5) == 0) .or. (mod(num,7) == 0)) then
stat = .false.
return
end if
! write(*,*) 'limit = ',limit
do i = 2,num - 1
! write(*,*) 'mod(',limit,i,') = ',mod(limit,i)
if (mod(num,i) == 0) then
stat = .false.
return
end if
end do
stat = .true.
return
end
Now the second version excludes more than half of numbers e.g. all that are divisible by 2,3,5,7. How is it possible that when I time the execution with the linux 'time' program, the 'improved' version performs just as slowly? Am I missing some obvious trick?
Searching the first 900000 numbers:
1st: 4m28sec
2nd 4m26sec
The multiples of 2, 3, 5, and 7 are quickly rejected by the original algorithm anyway, so jumping over them does not improve the performance at all. Where the algorithm spends most of its time is in proving that numbers with large prime factors are composite. To radically improve the performance you should use a better primality test, such as Miller-Rabin.
A simpler improvement is testing factors only up to sqrt(num), not num-1. If that doesn't make immediate sense, think about how big the smallest prime factor of a composite number can be. Also, if you are looking for primes from 1 to N, it may be more efficient to use a prime number sieve, or testing divisibility only by primes you have already found.
I just recently coded something similar ;-)
! Algorithm taken from https://en.wikipedia.org/wiki/Sieve_of_Eratosthenes
subroutine eratosthenes_sieve(n, primes)
implicit none
integer,intent(in) :: n
integer,allocatable,intent(out) :: primes(:)
integer :: i, j, maxPrime, stat
logical :: A(n)
maxPrime = floor(sqrt(real(n)))
A = .true.
do i=2,maxPrime
j = i*i
do
A(j) = .false.
j = j + i ; if ( j .gt. n ) exit
enddo
enddo !i
allocate( primes( count(A)-1 ), stat=stat )
if ( stat /= 0 ) stop 'Cannot allocate memory!'
j = 1
do i=2,n ! Skip 1
if ( .not. A(i) ) cycle
primes( j ) = i
j = j + 1 ; if ( j > size(primes) ) exit
enddo
end subroutine
This algorithm gives you all prime numbers up to a certain number, so you can easily check whether your prime is included or not:
if ( any(number == prime) ) write(*,*) 'Prime found:',number

Resources