Efficient (Fast) Binary Tree in Fortran - performance
I am using the procedure in the following code (that I took from here) to a program that I am trying to make run as fast as possible. The procedure is, however, very slow since it is probably optimized for pedagogical purposes not speed.
program tree_sort
! Sorts a file of integers by building a
! tree, sorted in infix order.
! This sort has expected behavior n log n,
! but worst case (input is sorted) n ** 2.
implicit none
type node
integer :: value
type (node), pointer :: left, right
end type node
type (node), pointer :: t ! A tree
integer :: number, ios
nullify (t) ! Start with empty tree
do
read (*, *, iostat = ios) number
if (ios < 0) exit
call insert (t, number) ! Put next number in tree
end do
! Print nodes of tree in infix order
call print_tree (t)
contains
recursive subroutine insert (t, number)
type (node), pointer :: t ! A tree
integer, intent (in) :: number
! If (sub)tree is empty, put number at root
if (.not. associated (t)) then
allocate (t)
t % value = number
nullify (t % left)
nullify (t % right)
! Otherwise, insert into correct subtree
else if (number < t % value) then
call insert (t % left, number)
else
call insert (t % right, number)
end if
end subroutine insert
recursive subroutine print_tree (t)
! Print tree in infix order
type (node), pointer :: t ! A tree
if (associated (t)) then
call print_tree (t % left)
print *, t % value
call print_tree (t % right)
end if
end subroutine print_tree
end program tree_sort
Is there any way to speed it up? I am using the procedure to sequentially add elements to a vector without adding repeated ones (so I changed the else in the insert subroutine to else if (number > t % value) then. Other than that, instead of printing I store the values in a global variable.
Edit:
Here is the actual code:
MODULE MOD_PARAMETERS
USE, INTRINSIC :: ISO_FORTRAN_ENV
IMPLICIT NONE
SAVE
INTEGER(INT32), PARAMETER :: d = 10 ! number of dimensions
INTEGER(INT32), PARAMETER :: L_0 = 5 ! after this adaptive grid kicks in, for L <= L_0 usual sparse grid
INTEGER(INT32), PARAMETER :: L_max = 5 ! maximum level
INTEGER(INT32), PARAMETER :: bound = 1 ! 0 -> for f = 0 at boundary
! 1 -> adding grid points at boundary
! 2 -> extrapolating close to boundary
INTEGER(INT32), PARAMETER :: testing_sample = 10**4
INTEGER(INT32), PARAMETER :: error_sample = 10**2
REAL(REAL64), PARAMETER :: eps = 0.001D0 ! epsilon for adaptive grid
TYPE NODE
INTEGER :: value
TYPE (NODE), POINTER :: left, right
END TYPE NODE
INTEGER(INT32), DIMENSION(:), ALLOCATABLE :: tree_vector
INTEGER(INT32) :: iii
END MODULE MOD_PARAMETERS
SUBROUTINE FF(x,output)
USE MOD_PARAMETERS
IMPLICIT NONE
REAL(REAL64), DIMENSION(d), INTENT(IN) :: x
REAL(REAL64) , INTENT(OUT) :: output
output = 1.0D0/(ABS(0.5D0-SUM(x(:)**4.0D0))+0.1D0)
END SUBROUTINE
SUBROUTINE XX(n,L,i,output)
USE MOD_PARAMETERS
IMPLICIT NONE
INTEGER(INT32) , INTENT(IN) :: n
INTEGER(INT32), DIMENSION(n), INTENT(IN) :: L, i
REAL(REAL64), DIMENSION(n), INTENT(OUT) :: output
INTEGER(INT32) :: j
DO j = 1,n
IF ((bound .EQ. 0) .OR. (bound .EQ. 2)) THEN
output(j) = REAL(i(j),REAL64)/REAL(2**L(j),REAL64)
ELSEIF (bound .EQ. 1) THEN
output(j) = REAL(i(j),REAL64)/REAL(2**MAX(L(j)-1,1),REAL64)
ENDIF
ENDDO
END SUBROUTINE
SUBROUTINE XX_INV(L,x,output)
USE MOD_PARAMETERS
IMPLICIT NONE
INTEGER(INT32), DIMENSION(d), INTENT(IN) :: L
REAL(REAL64), DIMENSION(d), INTENT(IN) :: x
INTEGER(INT32), DIMENSION(d), INTENT(OUT) :: output
INTEGER(INT32) :: j
DO j = 1,d
IF ((bound .EQ. 0) .OR. (bound .EQ. 2)) THEN
output(j) = 2*FLOOR(x(j)*REAL(2**(L(j)-1),REAL64))+1
ELSEIF (bound .EQ. 1) THEN
IF (L(j) .EQ. 2) THEN
IF (x(j) .LT. 0.5D0) THEN
output(j) = 0
ELSE
output(j) = 2
ENDIF
ELSE
output(j) = 2*FLOOR(x(j)*(REAL(2**MAX(L(j)-2,0),REAL64)))+1
ENDIF
ENDIF
ENDDO
END SUBROUTINE
SUBROUTINE BASE(x,L,i,output)
USE MOD_PARAMETERS
IMPLICIT NONE
REAL(REAL64), INTENT(IN) :: x
INTEGER(INT32), INTENT(IN) :: L,i
REAL(REAL64), INTENT(OUT) :: output
IF (bound .EQ. 0) THEN
output = MAX((1.0D0-ABS(x*REAL(2**L,REAL64)-REAL(i,REAL64))),0.0D0)
ELSEIF (bound .EQ. 1) THEN
IF ((L .EQ. 1) .AND. (i .EQ. 1)) THEN
output = 1.0D0
ELSEIF ((L .EQ. 2) .AND. (i .EQ. 0)) THEN
output = MAX(1.0D0-2.0D0*x,0.0D0)
ELSEIF ((L .EQ. 2) .AND. (i .EQ. 2)) THEN
output = MAX(2.0D0*x-1.0D0,0.0D0)
ELSE
output = MAX((1.0D0-ABS(x*REAL(2**(L-1),REAL64)-REAL(i,REAL64))),0.0D0)
ENDIF
ELSEIF (bound .EQ. 2) THEN
IF ((L .EQ. 1) .AND. (i .EQ. 1)) THEN
output = 1.0D0
ELSEIF ((L .GT. 1) .AND. (i .EQ. 1)) THEN
output = MAX(2.0D0-REAL(2**L,REAL64)*x,0.0D0)
ELSEIF ((L .GT. 1) .AND. (i .EQ. (2**L)-1)) THEN
output = MAX(REAL(2**L,REAL64)*x+REAL(1-i,REAL64),0.0D0)
ELSE
output = MAX((1.0D0-ABS(x*REAL(2**L,REAL64)-REAL(i,REAL64))),0.0D0)
ENDIF
ENDIF
END SUBROUTINE
PROGRAM MAIN
USE MOD_PARAMETERS
IMPLICIT NONE
INTEGER(INT32), DIMENSION(d,d) :: ident
REAL(REAL64), DIMENSION(1) :: x1
REAL(REAL64), DIMENSION(d) :: xd
INTEGER(INT32), DIMENSION(2*d) :: temp
INTEGER(INT32), DIMENSION(:,:), ALLOCATABLE :: grid_index, temp_grid_index, grid_index_new, J_index, &
adj_list, temp_adj_list
INTEGER(INT32), DIMENSION(:), ALLOCATABLE :: to_do, to_do_new, to_add_ind
REAL(REAL64), DIMENSION(:), ALLOCATABLE :: coeff, temp_coeff, J_coeff
REAL(REAL64) :: temp_min, temp_max, V, T, B, F
INTEGER(INT32) :: i, k, k1, k2, h, j, L, n, dd, dsize, count, count1, count2, count3, flag, &
first, repeated, add, ind, adj_list_ind
INTEGER(INT32) :: time1, time2, time3, time4, clock_rate, clock_max
INTEGER(INT32), DIMENSION(d) :: LL, ii
REAL(REAL64), DIMENSION(error_sample,d) :: sample_x
REAL(REAL64), DIMENSION(error_sample) :: sample_e, interp1
REAL(REAL64) :: max_error, L2_error
REAL(REAL64), DIMENSION(testing_sample,d) :: x_rand
REAL(REAL64), DIMENSION(testing_sample) :: interp2
TYPE(NODE), POINTER :: tree
! ============================================================================
! EXECUTABLE
! ============================================================================
ident = 0
DO i = 1,d
ident(i,i) = 1
ENDDO
! Initial grid point
dsize = 1
ALLOCATE(grid_index(dsize,2*d),grid_index_new(dsize,2*d),adj_list(dsize,2*d))
grid_index(1,:) = 1
grid_index_new = grid_index
adj_list = 0
ALLOCATE(coeff(0:dsize))
coeff(0) = 0.0D0
xd = 0.5D0
CALL FF(xd,coeff(1))
L = 1
n = SIZE(grid_index_new,1)
ALLOCATE(J_index(n*2*d,2*d))
ALLOCATE(J_coeff(n*2*d))
ALLOCATE(to_add_ind(1))
to_add_ind = 1
CALL RANDOM_NUMBER(sample_x)
sample_e = 0.0D0
CALL SYSTEM_CLOCK (time1,clock_rate,clock_max)
DO WHILE (L .LT. L_max)
CALL SYSTEM_CLOCK (time3,clock_rate,clock_max)
L = L+1
n = SIZE(grid_index_new,1)
count = 0
first = 1
DEALLOCATE(J_index,J_coeff)
ALLOCATE(J_index(n*2*d,2*d))
ALLOCATE(J_coeff(n*2*d))
J_index = 0
J_coeff = 0.0D0
DO k = 1,n
adj_list_ind = 0
DO i = 1,d
DO j = 1,2
IF ((bound .EQ. 0) .OR. (bound .EQ. 2)) THEN
temp = grid_index_new(k,:)+(/ident(i,:),ident(i,:)*(grid_index_new(k,d+i)-(-1)**j)/)
ELSEIF (bound .EQ. 1) THEN
IF (grid_index_new(k,i) .EQ. 1) THEN
temp = grid_index_new(k,:)+(/ident(i,:),ident(i,:)*(-(-1)**j)/)
ELSE
temp = grid_index_new(k,:)+(/ident(i,:),ident(i,:)*(grid_index_new(k,d+i)-(-1)**j)/)
ENDIF
ENDIF
CALL XX(d,temp(1:d),temp(d+1:2*d),xd)
temp_min = MINVAL(xd)
temp_max = MAXVAL(xd)
IF ((temp_min .GE. 0.0D0) .AND. (temp_max .LE. 1.0D0)) THEN
IF (first .EQ. 1) THEN
first = 0
count = count+1
J_index(count,:) = temp
V = 0.0D0
DO k1 = 1,SIZE(grid_index,1)
T = 1.0D0
DO k2 = 1,d
CALL XX(1,temp(k2),temp(d+k2),x1)
CALL BASE(x1(1),grid_index(k1,k2),grid_index(k1,k2+d),B)
T = T*B
ENDDO
V = V+coeff(k1)*T
ENDDO
CALL FF(xd,F)
J_coeff(count) = F-V
adj_list(to_add_ind(k),adj_list_ind+1) = dsize+count
adj_list_ind = adj_list_ind+1
ELSE
repeated = 0
DO h = 1,count
IF (SUM(ABS(J_index(h,:)-temp)) .EQ. 0) THEN
repeated = 1
adj_list(to_add_ind(k),adj_list_ind+1) = dsize+h
adj_list_ind = adj_list_ind+1
ENDIF
ENDDO
IF (repeated .EQ. 0) THEN
count = count+1
J_index(count,:) = temp
V = 0.0D0
DO k1 = 1,SIZE(grid_index,1)
T = 1.0D0
DO k2 = 1,d
CALL XX(1,temp(k2),temp(d+k2),x1)
CALL BASE(x1(1),grid_index(k1,k2),grid_index(k1,k2+d),B)
T = T*B
ENDDO
V = V+coeff(k1)*T
ENDDO
CALL FF(xd,F)
J_coeff(count) = F-V
adj_list(to_add_ind(k),adj_list_ind+1) = dsize+count
adj_list_ind = adj_list_ind+1
ENDIF
ENDIF
ENDIF
ENDDO
ENDDO
ENDDO
ALLOCATE(temp_grid_index(dsize,2*d))
ALLOCATE(temp_coeff(dsize))
temp_grid_index = grid_index
temp_coeff = coeff
DEALLOCATE(grid_index,coeff)
ALLOCATE(grid_index(dsize+count,2*d))
ALLOCATE(coeff(0:dsize+count))
grid_index(1:dsize,:) = temp_grid_index
coeff(0:dsize) = temp_coeff
DEALLOCATE(temp_grid_index,temp_coeff)
grid_index(dsize+1:dsize+count,:) = J_index(1:count,:)
coeff(dsize+1:dsize+count) = J_coeff(1:count)
IF (L .LT. L_max) THEN ! put this after error threshhold when implemented
ALLOCATE(temp_adj_list(dsize,2*d))
temp_adj_list = adj_list
DEALLOCATE(adj_list)
ALLOCATE(adj_list(dsize+count,2*d))
adj_list = 0
adj_list(1:dsize,:) = temp_adj_list
DEALLOCATE(temp_adj_list)
ENDIF
dsize = dsize + count
IF (L .LE. L_0) THEN
DEALLOCATE(grid_index_new)
ALLOCATE(grid_index_new(count,2*d))
grid_index_new = J_index(1:count,:)
DEALLOCATE(to_add_ind)
ALLOCATE(to_add_ind(count))
to_add_ind = dsize-count + (/ (h,h=1,count) /)
ELSE
DEALLOCATE(to_add_ind)
ALLOCATE(to_add_ind(count))
add = 0
to_add_ind = 0
DO h = 1,count
IF (ABS(J_coeff(h)) .GT. eps) THEN
add = add + 1
J_index(add,:) = J_index(h,:)
to_add_ind(add) = dsize-count+h
ENDIF
ENDDO
DEALLOCATE(grid_index_new)
ALLOCATE(grid_index_new(add,2*d))
grid_index_new = J_index(1:add,:)
ENDIF
DO i = 1,error_sample
V = 0.0D0
DO k1 = 1,SIZE(grid_index,1)
T = 1.0D0
DO k2 = 1,d
CALL BASE(sample_x(i,k2),grid_index(k1,k2),grid_index(k1,k2+d),B)
T = T*B
ENDDO
V = V+coeff(k1)*T
ENDDO
CALL FF(sample_x(i,:),F)
sample_e(i) = F-V
interp1(i) = V
ENDDO
max_error = MAXVAL(ABS(sample_e))
L2_error = (SUM(sample_e**2.0D0)/REAL(error_sample,REAL64))**0.5D0
CALL SYSTEM_CLOCK (time4,clock_rate,clock_max)
WRITE(*,'(A,I5,A,F10.5,A,I8,A,F15.10,A,F15.10)') ' level = ', L,&
' time = ',REAL(time4-time3,REAL64)/REAL(clock_rate,REAL64),&
' grid points = ',SIZE(grid_index,1),&
' max error = ',max_error,&
' L2 error = ',L2_error
ENDDO
!PRINT *, ' '
!PRINT *, ' '
!PRINT *, ' '
!DO i = 1,SIZE(adj_list,1)
! PRINT *, i, adj_list(i,:)
!ENDDO
!PRINT *, ' '
!PRINT *, ' '
!PRINT *, ' '
!DO i = 1,dsize
! PRINT *, i, grid_index(i,:), coeff(i)
!ENDDO
!PRINT *, ' '
!PRINT *, ' '
!PRINT *, ' '
ALLOCATE (to_do(dsize),to_do_new(dsize),tree_vector(dsize))
CALL SYSTEM_CLOCK (time2,clock_rate,clock_max)
PRINT *, ' '
WRITE(*,'(A,F10.5)') ' total time for setup = ', REAL(time2-time1,REAL64)/REAL(clock_rate,REAL64)
! ============================================================================
! Compute interpolated values:
! ============================================================================
IF (testing_sample .EQ. error_sample) THEN
! x_rand = sample_x
ELSE
CALL RANDOM_NUMBER(x_rand)
ENDIF
count1 = 0
count2 = 0
count3 = 0
CALL SYSTEM_CLOCK (time1,clock_rate,clock_max)
DO i = 1,testing_sample
V = 0.0D0
to_do = 0
to_do(1) = 1
to_do_new = 0
k = 1
DO L = 1,L_max
NULLIFY (tree)
tree_vector = 0
CALL SYSTEM_CLOCK (time3,clock_rate,clock_max)
DO j = 1,k
ind = to_do(j)
T = 1.0D0
DO dd = 1,d
CALL BASE(x_rand(i,dd),grid_index(ind,dd),grid_index(ind,d+dd),B)
T = T*B
ENDDO
V = V + coeff(ind)*T
ENDDO
CALL SYSTEM_CLOCK (time4,clock_rate,clock_max)
count1 = count1 + time4-time3
IF (L .LT. L_max) THEN
n = k
k = 0
DO j = 1,n
IF (adj_list(to_do(j),1) .GT. 0) THEN
DO h = 1,2*d
CALL SYSTEM_CLOCK (time3,clock_rate,clock_max)
LL = grid_index(adj_list(to_do(j),h),1:d)
ii = grid_index(adj_list(to_do(j),h),d+1:2*d)
flag = 0
k1 = 1
DO WHILE ((flag .EQ. 0) .AND. (k1 .LE. d))
IF ((bound .EQ. 0) .OR. (bound .EQ. 2)) THEN
k2 = 2*FLOOR(x_rand(i,k1)*REAL(2**(LL(k1)-1),REAL64))+1
ELSEIF (bound .EQ. 1) THEN
IF (LL(k1) .EQ. 2) THEN
IF (x_rand(i,k1) .LT. 0.5D0) THEN
k2 = 0
ELSE
k2 = 2
ENDIF
ELSE
k2 = 2*FLOOR(x_rand(i,k1)*(REAL(2**MAX(LL(k1)-2,0),REAL64)))+1
ENDIF
ENDIF
IF (k2 .NE. ii(k1)) THEN
flag = 1
ENDIF
k1 = k1 +1
ENDDO
CALL SYSTEM_CLOCK (time4,clock_rate,clock_max)
count2 = count2 + time4-time3
! CALL SYSTEM_CLOCK (time3,clock_rate,clock_max)
IF (flag .EQ. 0) THEN
!IF (MINVAL(ABS(to_do_new(1:MAX(k,1))-adj_list(to_do(j),h))) .GT. 0) THEN
to_do_new(k+1) = adj_list(to_do(j),h)
k = k+1
CALL SYSTEM_CLOCK (time3,clock_rate,clock_max)
CALL INSERT(tree,to_do_new(k))
CALL SYSTEM_CLOCK (time4,clock_rate,clock_max)
count3 = count3 + time4-time3
!ENDIF
ENDIF
! CALL SYSTEM_CLOCK (time4,clock_rate,clock_max)
! count3 = count3 + time4-time3
ENDDO
ENDIF
ENDDO
CALL SYSTEM_CLOCK (time3,clock_rate,clock_max)
iii = 0
CALL PRINT_TREE(tree)
to_do = tree_vector
CALL SYSTEM_CLOCK (time4,clock_rate,clock_max)
count3 = count3 + time4-time3
!to_do = to_do_new
to_do_new = 0
ENDIF
ENDDO
interp2(i) = V
ENDDO
CALL SYSTEM_CLOCK (time2,clock_rate,clock_max)
PRINT *, ' '
WRITE(*,'(A,F10.5,A,I10)') ' time for interpolation = ', REAL(time2-time1,REAL64)/REAL(clock_rate,REAL64),&
' points = ', testing_sample
PRINT *, ' '
WRITE(*,'(A,F10.5)') ' time for base = ', REAL(count1,REAL64)/REAL(clock_rate,REAL64)
PRINT *, ' '
WRITE(*,'(A,F10.5)') ' time for x_inv = ', REAL(count2,REAL64)/REAL(clock_rate,REAL64)
PRINT *, ' '
WRITE(*,'(A,F10.5)') ' time for repeated = ', REAL(count3,REAL64)/REAL(clock_rate,REAL64)
!PRINT *, ' '
!WRITE(*,'(A,F20.15)') ' check = ', MAXVAL(ABS(interp2-interp1))
DEALLOCATE(grid_index,grid_index_new,J_index,coeff,J_coeff,adj_list,to_do,to_do_new,to_add_ind,tree_vector)
CONTAINS
RECURSIVE SUBROUTINE INSERT(tree,number)
TYPE(NODE), POINTER :: tree
INTEGER(INT32), INTENT(IN) :: number
IF (.NOT. ASSOCIATED(tree)) THEN
ALLOCATE(tree)
tree%value = number
NULLIFY(tree%left)
NULLIFY(tree%right)
ELSEIF (number .LT. tree%value) THEN
CALL INSERT (tree%left,number)
ELSEIF (number .GT. tree%value) THEN
CALL INSERT(tree%right,number)
ENDIF
END SUBROUTINE INSERT
RECURSIVE SUBROUTINE PRINT_TREE(tree)
TYPE (NODE), POINTER :: tree
IF (ASSOCIATED(tree)) THEN
CALL PRINT_TREE(tree%left)
iii = iii+1
tree_vector(iii) = tree%value
CALL PRINT_TREE (tree%right)
END IF
END SUBROUTINE PRINT_TREE
END PROGRAM
I am using optimization O3 but otherwise no flags. In my computer the time for repeated (which is where I am using the binary tree) is 18.3 seconds, whereas if I use an alternative method that is commented in the version (with MINVAL) it only takes 3.6 seconds.
Related
How to improve performance of the do loop by using openMP?
As shown below, this code snippet aims to compute two arrays, i.e. data_real and data_imag. Their shapes are both 1024*10000. I want to speed up the computation of DO loops by using OpenMP. But I'm an absolute beginner of openMP. I am not very clear how to parallelizing a loop with dependent iterations, such as the statements temp2 = dx * temp1(2*i), temp3 = dx * temp1(2*i+1) of the snippet below. I mean, if there are race conditions in this code snippet. Is there a way to speed up the do loops as shown below? Note: Four1 is the subroutine used to perform FFT, sinc2 is the square of the sinc function. !Declare variables Real, Allocatable, Dimension(:,:) :: data_complex, data_real, data_imag Real, Dimension(0:2*1024-1) :: temp1 Real :: temp2, temp3 !Allocate Allocate( data_real(0:1024-1,0:10000-1), & data_imag(0:1024-1,0:10000-1), & data_complex(0:2*1024-1,0:10000-1), STAT=istat1 ) !Initialise data_real = 0.0 data_imag = 0.0 data_complex = 0.0 dk = 2*3.14159 / 75.0 !$OMP Parallel num_threads(24) private(i,j,k,temp1,temp2,temp3) shared( dk) !$OMP Do schedule(dynamic,1) Do j = 0, 10000-1 temp1(:) = data_complex(:,j) Call Four1(temp1, 1024, 1) ! Calling the subroutine 'Four1' to ! perform Fast Fourier Transform Do i = 0, 1023 k = dk * Real(i) temp2 = dx * temp1(2*i) temp3 = dx * temp1(2*i+1) data_real(i,j) = temp2 / sinc2( dx * k / 2 ) ! sinc2(x) = sin(x) / x data_imag(i,j) = temp3 / sinc2( dx * k / 2 ) End Do End Do !$OMP End Do nowait !$OMP End Parallel ! --------------------------------------------------------------- ! ! ----------------------------------------------------------------! Subroutine Four1(data_complex, nn, isign) Integer, Intent(in) :: nn Integer, Intent(in) :: isign Real, Intent(inout), Dimension(2*nn) :: data_complex Integer :: i, istep, j, m, mmax, n Real :: tempi, tempr Real(8) :: theta, wi, wpi, wpr, wr, wtemp ! --------------------------------------------------------- n=2*nn j=1 Do i=1,n,2 If(j>i) then tempr=data_complex(j) tempi=data_complex(j+1) data_complex(j)=data_complex(i) data_complex(j+1)=data_complex(i+1) data_complex(i)=tempr data_complex(i+1)=tempi endif m=n/2 Do while ( (m>=2).and.(j>m) ) j=j-m m=m/2 End do j=j+m EndDo mmax=2 Do while ( n > mmax ) istep=2*mmax theta=(2*pi)/(isign*mmax) wpr=-2.0d0*sin(0.5d0*theta)**2 wpi=sin(theta) wr=1.0d0 wi=0.0d0 Do m=1,mmax,2 Do i=m,n,istep j=i+mmax tempr=Real(wr)*data_complex(j)-Real(wi)*data_complex(j+1) tempi=Real(wr)*data_complex(j+1)+Real(wi)*data_complex(j) data_complex(j)=data_complex(i)-tempr data_complex(j+1)=data_complex(i+1)-tempi data_complex(i)=data_complex(i)+tempr data_complex(i+1)=data_complex(i+1)+tempi End Do wtemp=wr wr=wr*wpr-wi*wpi+wr wi=wi*wpr+wtemp*wpi+wi End Do mmax=istep End Do End Subroutine Four1 ! ------------------------------------------------------------ ! ! ------------------------------------------------------------ ! Real Function sinc2 ( x ) ! ! Define the square of sinc function ! Real, Intent(in) :: x If ( abs(x) < 1.e-16 ) then ! be careful with comparison to real numbers because of rounding errors ! better: if (abs(x).lt.1.e-16) thensinc=1. sinc2 = 1.0 Else sinc2 = ( sin(x)/x )**2 End If End Function sinc2
MPI program for the Poisson equation stuck after one iteration
I am trying to solve the Poisson Equation in a Square domain [(0,1)--(0,1)] using MPI and overlapping domains. Currently, my code takes an input of the number of domain divisions on X and Y directions, the length of the overlap between two domains as a function of domain length and the number of elemntal divisions in the overlap. Input file looks like 2,2 10,10 10,10 program main !implicit none include 'mpif.h' integer cols, divfx, divfy, iter integer xdiv, ydiv, info, max_iter, x_shift, y_shift integer, allocatable:: ipiv(:) double precision, allocatable :: A(:,:), Ainv(:,:) real, allocatable:: edge(:,:) double precision, allocatable :: u(:,:), f(:,:) double precision, allocatable :: u_exact(:,:) allocatable :: Left(:,:), Right(:,:) allocatable :: Top(:,:), Bottom(:,:) allocatable :: TempLeft(:,:), TempRight(:,:) allocatable :: TempTop(:,:), TempBottom(:,:) integer myid, master, numprocs, ierr, status(MPI_STATUS_SIZE) integer i, j, numsent, sender, L, T, R, B integer anstype, row, dovfx, dovfy, domx, domy, idx real dom1,dom2,buff double precision mesh(2), buffer(4), divx, divy, dx, dy, xd, yd double precision error, derror, error_norm character(len=100) :: domaindata call MPI_INIT(ierr) call MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr) call MPI_COMM_SIZE(MPI_COMM_WORLD, numprocs, ierr) master = 0 divx=0.d0 divy=0.d0 ! Input the number of divisions for domain decomposition and calculate sub-domain dimensions. open(1, file='Inputdata.dat', status='old') ! read(1,*) domx,domy ! read(1,*) dovfx,dovfy ! read(1,*) divfx,divfy write(*,*)'Starting the Program' write(*,*) "Enter the number of domain divisions in x-direction & and y-direction ( Enter 4 if you want three sub-domains)" read(1,*) domx,domy write(*,*) domx,domy write(*,*) "Total number of sub-domains for the problem" write(*,*) domx*domy write(*,*) "Enter the sub-domain overlap in x & y -direction as & a fraction of sub-domain length (multiple of 10)" read(1,*) dovfx,dovfy write(*,*) dovfx,dovfy write(*,*) "Enter the number of divisions in the overlap in & x & yas a fraction of sub-domain(multiple of 5)" read(1,*) divfx,divfy write(*,*) divfx,divfy divx=1.d0/(((1.d0/domx)/dovfx)/divfx) divy=1.d0/(((1.d0/domy)/dovfy)/divfy) write(*,*)"Total number of elemental divisions for the & problem domain (0,1) in both dimensions" write(*,*) divx, divy, divx*divy write(*,*)"Total number of nodal divisions for the problem domain" write(*,*) (divx+1)*(divy+1) ! time ! ************************** tic = MPI_Wtime(); ! Maximum number of iterations. max_iter=100 ! Mesh Size mesh(1)=1/divx mesh(2)=1/divy write(*,*) 'Element Size' write(*,*) mesh(1), mesh(2) if ( myid .eq. master ) then ! Send iteration number to subdomain and receive the error from each to ! calculate total error. write(*,*) 'still1' do 10 iter = 1,max_iter do 20 i = 1,domx*domy call MPI_SEND(iter, 1, MPI_INTEGER, i, i, MPI_COMM_WORLD, ierr) 20 continue ! Receive results obtained from sub-processor/sub-domain ! error = 0.d0 do 30 i = 1,domx*domy call MPI_RECV(d_error , 1, MPI_DOUBLE_PRECISION, i, iter, & MPI_COMM_WORLD, status, ierr) error = error + d_error 30 continue write(*,*) 'In iteration ', iter, 'cumulative error is', error*1.d0/domx/domy 10 continue ! time: ! ************* toc = MPI_Wtime(); ! Write results to output ! ************************** write(*,*) write(*,*) 'Time taken for parallel computation is: ',(toc-tic)*1000, 'miliseconds' else !************************************ Slaves receive mesh size for discretization ****************************************************** write(*,*) 'iter', iter write(*,*) 'myid', myid ! Slaves receive corners, then creates a Cartesian grid for finite ! difference until done message received, for one iteration. ! This is done for the first iteration ! Get Domain ID : if (myid.gt.(domx*domy)) goto 200 write(*,*) 'still31' 1000 call MPI_RECV(iter, 1, MPI_INTEGER, master, MPI_ANY_TAG, MPI_COMM_WORLD, status, ierr) if (status(MPI_TAG) .eq. 0) goto 200 write(*,*) 'still4' if (iter.eq.1) then write(*,*) 'still5' dom1=domx dom2=domy allocate (edge(domx*domy,4)) ! Determining the edge matrices for each subdomain - the bounding box do j =1,domx do k=1,domy idx=(j-1)*(domx-1)+k+(j-1) buff=REAL((mod(idx-1,domx)))/domx IF (buff-((1.d0/domx)/dovfx) .gt. 0) THEN buff=buff-((1.d0/domx)/dovfx) ENDIF edge(idx,1) = buff IF ((mod(idx ,domx)) .eq. 0) THEN buff=1 ELSE buff=REAL(mod(idx ,domx))/domx ENDIF !write(*,*) buff IF (buff + ((1.d0/domx)/dovfx) .lt. 1) THEN buff=buff+((1.d0/domx)/dovfx) ENDIF edge(idx,2) = buff ! buff=REAL(floor((idx-1)/dom1))/dom1 IF (buff -((1.d0/domy)/dovfy) .gt. 0) THEN buff=buff-((1.d0/domy)/dovfy) ENDIF edge(idx,3) = buff buff=REAL(ceiling(idx/dom1))/dom1 IF (buff+((1.d0/domy)/dovfy) .lt. 1) THEN buff= buff+((1.d0/domy)/dovfy) ENDIF edge(idx,4) = buff end do end do write(*,*) myid, edge(myid,:) write(*,*) 'iter', iter call Surround_dom(myid,domx,domy,LeftC, RightC, BottomC, TopC) ! Calculate data for the matrices: Divisions in each subdomain. : xdiv=(edge(myid,2)-edge(myid,1))/mesh(1) ydiv=(edge(myid,4)-edge(myid,3))/mesh(2) dx=mesh(1) dy=mesh(2) allocate (A((xdiv-1)*(ydiv-1),(xdiv-1)*(ydiv-1))) allocate (Ainv((xdiv-1)*(ydiv-1),(xdiv-1)*(ydiv-1))) allocate (u((xdiv-1)*(ydiv-1),1),f((xdiv-1)*(ydiv-1),1)) allocate (u_exact((xdiv-1)*(ydiv-1),1)) allocate (ipiv((xdiv-1)*(ydiv-1))) allocate (Left((ydiv-1),1),Right((ydiv-1),1)) allocate (Top((xdiv-1),1), Bottom(((xdiv-1)),1)) allocate (TempLeft((ydiv-1),1),TempRight((ydiv-1),1)) allocate (TempTop((xdiv-1),1), TempBottom(((xdiv-1)),1)) Left = 0.d0; Right = 0.d0; Bottom = 0.d0; Top = 0.d0; TempLeft = 0.d0; TempRight = 0.d0; TempBottom = 0.d0; TempTop = 0.d0; A=0; endif write(*,*) 'still6' ! ****************************************************************** ! SendReceive data based on location ! ****************************************************************** if (LeftC.ne.0) then call MPI_SENDRECV(Left, ydiv - 1, MPI_DOUBLE_PRECISION, LeftC, iter, & TempLeft, ydiv - 1, MPI_DOUBLE_PRECISION, LeftC, iter, MPI_COMM_WORLD, status, ierr) end if if (RightC.ne.0) then call MPI_SENDRECV(Right, ydiv - 1, MPI_DOUBLE_PRECISION, RightC, iter, & TempRight, ydiv - 1, MPI_DOUBLE_PRECISION, RightC, iter, MPI_COMM_WORLD, status, ierr) end if if (BottomC.ne.0) then call MPI_SENDRECV(Bottom, xdiv - 1, MPI_DOUBLE_PRECISION, BottomC, iter, & TempBottom, xdiv - 1, MPI_DOUBLE_PRECISION, BottomC, iter, MPI_COMM_WORLD, status, ierr) end if if (TopC.ne.0) then call MPI_SENDRECV(Top, xdiv - 1, MPI_DOUBLE_PRECISION, TopC, iter, & TempTop, xdiv - 1, MPI_DOUBLE_PRECISION, TopC, iter, MPI_COMM_WORLD, status, ierr) end if Left = TempLeft ; Right = TempRight; Top = TempTop ; Bottom = TempBottom; write(*,*) 'still7' ! Form the coefficient matrices do i =1,(xdiv-1)*(ydiv-1) A(i,i)=-2.d0*(1.d0/(dx**2)+1.d0/(dy**2)) enddo do i=1,(xdiv-2) do j=1,(ydiv-1) A(i+(j-1)*(xdiv-1),i+(j-1)*(xdiv-1)+1)=1.d0/(dx**2) A(i+(j-1)*(xdiv-1)+1,i+(j-1)*(xdiv-1))=1.d0/(dx**2) enddo enddo do i=1,(xdiv-1) do j=1,(ydiv-2) A(i+(j-1)*(xdiv-1),i+(j)*(xdiv-1))=1.d0/(dy**2) A(i+(j)*(xdiv-1),i+(j-1)*(xdiv-1))=1.d0/(dy**2) enddo enddo write(*,*) 'still9' L=1 T=1 R=1 B=1 write(*,*) 'still10' ! Impose Boundary Conditions in F matrix do i=1,(xdiv-1)*(ydiv-1) xd = edge(myid,1) + (dx)*mod(i,(xdiv-1)) if (mod(i,xdiv-1).eq.0) xd = edge(myid,1) + (dx)*(xdiv-1) yd = edge(myid,3) + (dy)*ceiling(i*1.d0/(xdiv-1)) !if (iter.eq.1 .and. myid.eq.2) write(*,*) xd,yd u_exact(i,1) = sin(2.d0*3.1415*xd)*sin(2.d0*3.1415*yd) f(i,1) = 8.d0*3.1415*3.1415*u_exact(i,1) IF (mod(i,(xdiv-1)) .eq. 1) THEN f(i,1)= f(i,1)+Left(L,1)/dx/dx L=L+1 ENDIF IF (mod(i,(xdiv-1)) .eq. 0) THEN f(i,1)=f(i,1)+Right(R,1)/dx/dx R=R+1 ENDIF IF (i .le. (xdiv-1)) THEN f(i,1)=f(i,1)+Bottom(B,1)/dy/dy B=B+1 ENDIF IF (i .gt. (xdiv-1)*(ydiv-2)) THEN f(i,1)=f(i,1)+Top(T,1)/dy/dy T=T+1 END IF ! enddo enddo !Solve AU=F by LU factorization! write(*,*) 'still11' do i=1,(xdiv-1)*(ydiv-1) do j=1,(xdiv-1)*(ydiv-1) Ainv(i,j)=A(i,j) end do end do ! do i=1,(xdiv-1)*(ydiv-1) ! write(*,*) myid,Ainv(i,i) !end do call DGESV((xdiv-1)*(ydiv-1), 1, A, & (xdiv-1)*(ydiv-1), ipiv, f, (xdiv-1)*(ydiv-1), info) write(*,*) 'still12' call ErrorNorm(f,u_exact,(xdiv-1)*(ydiv-1),error_norm) write(*,*) 'still13' ! **************************************************** ! Update boundary conditions based on new solution: ! **************************************************** x_shift = divfx-1 ; y_shift = divfy-1 ; ! write(*,*) 'LeftC', myid,LeftC,RightC,TopC,BottomC if (LeftC.ne.0) then do 50 i = 1,ydiv - 1 Left(i,1) = f((xdiv - 1)*(i - 1) + 1 + x_shift,1) !if ((myid.eq.2).and.(iter.eq.1)) write(*,*) 'for left',i, & !(xdiv - 1)*(i - 1) + 1 + x_shift 50 continue end if if (RightC.ne.0) then do 60 i = 1,ydiv - 1 Right(i,1) = f((xdiv - 1)*i - x_shift,1) !if ((myid.eq.1).and.(iter.eq.1)) write(*,*) 'for right',i, & !(xdiv - 1)*i - x_shift 60 continue end if if (TopC.ne.0) then do 70 i = 1,xdiv - 1 Top(i,1) = f((xdiv - 1)*((ydiv - 2) - y_shift) + i,1) !if ((myid.eq.1).and.(iter.eq.1)) write(*,*) 'for top',i, & !((xdiv - 1)*((ydiv - 2) - y_shift) + i) 70 continue end if if ( BottomC.ne.0) then do 80 i = 1,xdiv - 1 Bottom(i,1) = f((xdiv - 1)*y_shift + i,1) !if ((myid.eq.3).and.(iter.eq.1)) write(*,*) 'for bottom',i, & !((xdiv - 1)*y_shift + i) 80 continue end if write(*,*) 'still14' TempLeft =Left; TempRight = Right; TempTop = Top; TempBottom = Bottom; call MPI_SEND(error_norm, 1, MPI_DOUBLE_PRECISION, master, iter, & MPI_COMM_WORLD, ierr) write(*,*) 'still15' if (iter.lt.iter_max) go to 1000 ! ********************************************************************************* ! Write solution to data file to view the results. ! ********************************************************************************* write (domaindata, "(A7,I2,A4)") "domain_",myid,".dat" open (unit=myid*10, file = domaindata) write (myid*10,*) ' VARIABLE= "X","Y","U_EXACT","U_CALC" ' do i=1,(xdiv-1)*(ydiv-1) xd = edge(myid,1) + (dx)*mod(i,(xdiv-1)) if (mod(i,xdiv-1).eq.0) xd = edge(myid,1) + (dx)*(xdiv-1) yd = edge(myid,3) + (dy)*ceiling(i*1.d0/(xdiv-1)) write (myid*10,*) xd, yd, u_exact(i,1), f(i,1) enddo write(*,*) 'still16' if (iter.eq.max_iter) go to 200 200 continue write(*,*) 'still45' endif call MPI_FINALIZE(ierr) stop end program main subroutine Surround_dom(myid,domx,domy,LeftID, RightID, BottomID, TopID) implicit none integer myid, j, k, domy, domx, BottomID, TopID, LeftID, RightID j = ceiling(1.d0*myid/domx) k = mod(myid,domx) if (k.eq.0) k = domx ! Domain on the left if(k.eq.1) then LeftID = 0 else LeftID = ((j-1)*domx + k) - 1 end if ! Domain on the Right if(k.eq.domx) then RightID = 0 else RightID = ((j-1)*domx + k) + 1 end if ! Domain on the Bottom if(j.eq.1) then BottomID = 0 else BottomID = ((j-1)*domx + k) - domx end if ! Domain on the Top if(j.eq.domy) then TopID = 0 else TopID = ((j-1)*domx + k) + domx end if return end subroutine ErrorNorm(u,u_exact,N,error_norm) implicit none double precision u(N), u_exact(N), err, error_norm integer i, N error_norm = 0.d0 do 10 i = 1,N err = (u(i) - u_exact(i)) error_norm = error_norm + err*err 10 continue error_norm = sqrt(error_norm)/(N*1.d0) return end I expect the code to run through all the iterations giving me an respectable error about multiples of 1e-3/1e-4.? Currently, no error shows up, the code successfully runs for 1 iteration and then doesn't produce any output at all, even after days. It would be really helpful if I could get some guidance. I am sorry since the structure of my code is a awkward, I am just a beginner. It won't run if number of domains is odd or if the number of domains is not equal to number of processors. Any suggestions on how to remove these limitations is also welcome.
Issues with setting random seed [duplicate]
This question already has an answer here: Random numbers keep coming out the same, despite random seed being used (1 answer) Closed last year. I am attempting to write a Montecarlo algorithm to simulate interaction between agents in a population. This algorithm needs to call two random numbers at each iteration (say, 10^9 iterations). My issue here is that everytime I change the seed (to obtain different realizations), the RNG is giving me the same output (same Montecarlo events). I have tried different ways of implementing it with to no avail. I am completely new to Fortran and copying this code from MATLAB. Am I doing something wrong in the way I'm implementing this code? Below is what I tried: program Gillespie implicit none integer*8, parameter :: n_max = 10.0**8 ! max. number of iterations integer*8 :: t_ext, I_init, S_init, jump, S_now, I_now, i, u real*8 :: t, N, a0, tau, st, r1, r2 real, parameter :: beta = 1000 real, parameter :: gammma = 99.98 real, parameter :: mu = 0.02 real, parameter :: R0 = beta/(gammma+mu) integer :: seed = 11 real, dimension(n_max) :: S_new ! susceptible pop. array real, dimension(n_max) :: I_new ! infected pop. array real, dimension(n_max) :: t_new ! time array real, dimension(5) :: events ! events array open(unit=3, file='SIS_output.dat') t = 0 ! initial time N = 40 ! initial population size jump = 1 ! time increment (save in uniform increments) u = 2 t_ext = 0 ! extiction time I_init = 2 ! initial infected pop. S_init = N-I_init ! initial susceptible pop. S_now = S_init I_now = I_init S_new(1) = S_init ! initialize susceptibles array I_new(1) = I_init ! initialize infected array t_new(1) = t ! initialize time array write(3,*) t_new(1), S_new(1), I_new(1) ! write i.c. to array call random_seed(seed) do i=2, n_max call random_number(r1) call random_number(r2) events(1) = mu*N ! Birth(S) events(2) = mu*S_now ! Death(S) events(3) = mu*I_now ! Death(I) events(4) = (beta*S_now*I_now)/N ! Infection events(5) = gammma*I_now ! Recovery a0 = events(1)+events(2)+events(3)+events(4)+events(5) tau = LOG(1/r1)*(1/a0) ! time increment t = t + tau ! update time st = r2*a0 ! stochastic time??? ! update the populations if (st .le. events(1)) then S_now = S_now + 1 else if (st .gt. events(1) .AND. st .le. #(events(1) + events(2))) then S_now = S_now - 1 else if (st .gt. (events(1) + events(2)) .AND. st .le. #(events(1) + events(2) + events(3))) then I_now = I_now - 1 else if (st .gt. (events(1) + events(2) + events(3)) .AND. #st .le. (events(1) + events(2) + events(3) + events(4))) then S_now = S_now - 1 I_now = I_now + 1 else S_now = S_now + 1 I_now = I_now - 1 end if ! save time in uniform increments if(t .ge. jump) then t_new(u) = floor(t) S_new(u) = S_now I_new(u) = I_now write(3,*) t_new(u), S_new(u), I_new(u) jump = jump+1 u = u+1 end if ! write(3,*) t_new(i), S_new(i), I_new(i) !N = S_now + I_now ! update population post event if(I_now .le. 0) then ! if extinct, terminate print *, "extinct" goto 2 end if end do 2 end program Gillespie I appreciate all input. Thanks.
Your call call random_seed(seed) is strange. I thought it should not be allowed without a keyword argument, but it actually is inquiring for the size of the random seed array. For a proper way of initializing seed see the example in https://gcc.gnu.org/onlinedocs/gfortran/RANDOM_005fSEED.html
Speed of dereferencing class properties in fortran
I expect answer like don't worry, compiler will take care of that but I wan't to be sure. When I make some method in some custom type/class in fortran, is there any performance hit due to referencing/dereferencing fields of the object like this%a(i) = this%b(i) + this%c(i) in comparison to just working with arrays like a(i) = b(i) + c(i) more complex example: for example I have this function which should interpolate a value on 3D grid which is really performance critical (it would be called inside a triple loop over an other 3D array). So I'm thinking if it is better ( for performance) to make is using a method of the class, or rather make a normal subroutine which takes the array as an argument. type grid3D ! 3D grid maps of observables real, dimension (3) :: Rmin, Rmax, Rspan, step ! grid size and spacing (x,y,z) integer, dimension (3) :: N ! dimension in x,y,z real, dimension (3,:, :, :), allocatable :: f ! array storing values of othe observable contains procedure :: interpolate => grid3D_interpolate end type grid3D function grid3D_interpolate(this, R ) result(ff) implicit none ! variables class (grid3D) :: this real, dimension (3), intent (in) :: R real :: ff integer ix0,iy0,iz0 integer ix1,iy1,iz1 real dx,dy,dz real mx,my,mz ! function body ix0 = int( (R(1)/this%step(1)) + fastFloorOffset ) - fastFloorOffset iy0 = int( (R(2)/this%step(2)) + fastFloorOffset ) - fastFloorOffset iz0 = int( (R(3)/this%step(3)) + fastFloorOffset ) - fastFloorOffset dx = R(1) - x0*this%step(1) dy = R(2) - y0*this%step(2) dz = R(3) - z0*this%step(3) ix0 = modulo( x0 , this%N(1) )+1 iy0 = modulo( y0 , this%N(2) )+1 iz0 = modulo( z0 , this%N(3) )+1 ix1 = modulo( x0+1 , this%N(1) )+1 iy1 = modulo( y0+1 , this%N(2) )+1 iz1 = modulo( z0+1 , this%N(3) )+1 mx=1.0-dx my=1.0-dy mz=1.0-dz ff = mz*(my*(mx*this%f(ix0,iy0,iz0) & +dx*this%f(ix1,iy0,iz0)) & +dy*(mx*this%f(ix0,iy1,iz0) & +dx*this%f(ix1,iy1,iz0))) & +dz*(my*(mx*this%f(ix0,iy0,iz1) & +dx*this%f(ix1,iy0,iz1)) & +dy*(mx*this%f(ix0,iy1,iz1) & +dx*this%f(ix1,iy1,iz1))) end if end function grid3D_interpolate end module T_grid3Dvec
Not really. As long as your code structure is quite clear (to the compiler), it can optimize that away quite easily. Once your OOP structures get too complicated, or the level of dereferencing gets too large, you might get some improvement out of a manual dereferencing scheme. (I use that quite a lot, although usually to keep my code human-readable. But I had a little improvement here once, but with a code using >5 levels of dereferencing. ) Here is some example: module vec_mod implicit none type t_vector real :: x = 0. real :: y = 0. real :: z = 0. end type type t_group type(t_vector),allocatable :: vecs(:) end type contains subroutine sum_vec( vecs, res ) implicit none type(t_vector),intent(in) :: vecs(:) type(t_vector),intent(out) :: res integer :: i res%x = 0. ; res%y = 0. ; res%z = 0. do i=1,size(vecs) res%x = res%x + vecs(i)%x res%y = res%y + vecs(i)%y res%z = res%z + vecs(i)%z enddo end subroutine subroutine sum_vec_ptr( vecs, res ) implicit none type(t_vector),intent(in),target :: vecs(:) type(t_vector),intent(out) :: res integer :: i type(t_vector),pointer :: curVec res%x = 0. ; res%y = 0. ; res%z = 0. do i=1,size(vecs) curVec => vecs(i) res%x = res%x + curVec%x res%y = res%y + curVec%y res%z = res%z + curVec%z enddo end subroutine subroutine sum_vecGrp( vecGrp, res ) implicit none type(t_group),intent(in) :: vecGrp type(t_vector),intent(out) :: res integer :: i res%x = 0. ; res%y = 0. ; res%z = 0. do i=1,size(vecGrp%vecs) res%x = res%x + vecGrp%vecs(i)%x res%y = res%y + vecGrp%vecs(i)%y res%z = res%z + vecGrp%vecs(i)%z enddo end subroutine subroutine sum_vecGrp_ptr( vecGrp, res ) implicit none type(t_group),intent(in),target :: vecGrp type(t_vector),intent(out) :: res integer :: i type(t_vector),pointer :: curVec, vecs(:) res%x = 0. ; res%y = 0. ; res%z = 0. vecs => vecGrp%vecs do i=1,size(vecs) curVec => vecs(i) res%x = res%x + curVec%x res%y = res%y + curVec%y res%z = res%z + curVec%z enddo end subroutine end module program test use omp_lib use vec_mod use,intrinsic :: ISO_Fortran_env implicit none type(t_vector),allocatable :: vecs(:) type(t_vector) :: res type(t_group) :: vecGrp integer,parameter :: N=100000000 integer :: i, stat real(REAL64) :: t1, t2 allocate( vecs(N), vecGrp%vecs(N), stat=stat ) if (stat /= 0) stop 'Cannot allocate memory' do i=1,N call random_number(vecs(i)%x) call random_number(vecs(i)%y) call random_number(vecs(i)%z) enddo print *,'' print *,'1 Level' t1 = omp_get_wtime() call sum_vec( vecs, res ) print *,res t2 = omp_get_wtime() print *,'Normal [s]:', t2-t1 t1 = omp_get_wtime() call sum_vec_ptr( vecs, res ) print *,res t2 = omp_get_wtime() print *,'Pointer [s]:', t2-t1 print *,'' print *,'2 Levels' vecGrp%vecs = vecs t1 = omp_get_wtime() call sum_vecGrp( vecGrp, res ) print *,res t2 = omp_get_wtime() print *,'Normal [s]:', t2-t1 t1 = omp_get_wtime() call sum_vecGrp_ptr( vecGrp, res ) print *,res t2 = omp_get_wtime() print *,'Pointer [s]:', t2-t1 end program Compiled with default options (gfortran test.F90 -fopenmp), three is a slight benefit from manually dereferencing, especially for two levels of dereferencing: OMP_NUM_THREADS=1 ./a.out 1 Level 16777216.0 16777216.0 16777216.0 Normal [s]: 0.69216769299237058 16777216.0 16777216.0 16777216.0 Pointer [s]: 0.67321390099823475 2 Levels 16777216.0 16777216.0 16777216.0 Normal [s]: 0.84902219301147852 16777216.0 16777216.0 16777216.0 Pointer [s]: 0.71247501399193425 Once you turn on optimization (gfortran test.F90 -fopenmp -O3), you can see that the compiler actually does a better job automatically: OMP_NUM_THREADS=1 ./a.out 1 Level 16777216.0 16777216.0 16777216.0 Normal [s]: 0.13888958499592263 16777216.0 16777216.0 16777216.0 Pointer [s]: 0.19099253200693056 2 Levels 16777216.0 16777216.0 16777216.0 Normal [s]: 0.13436777899914887 16777216.0 16777216.0 16777216.0 Pointer [s]: 0.21104205500159878
searching a prime number
I hope I am not duplication any question but the suggested topic did not provide with any similar problem. I have a function that check if a number is a prime one. Now this is the slowest possible way to search for a prime. subroutine is_prime_slow(num, stat) implicit none logical :: stat integer :: num integer :: i if ((num .le. 3) .and. (num .gt. 1)) then stat = .true. return end if ! write(*,*) 'limit = ',limit do i = 2,num - 1 ! write(*,*) 'mod(',limit,i,') = ',mod(limit,i) if (mod(num,i) == 0) then stat = .false. return end if end do stat = .true. return end Now let's say that I do some improvement to it. subroutine is_prime_slow(num, stat) implicit none logical :: stat integer :: num integer :: i if ((num .le. 3) .and. (num .gt. 1)) then stat = .true. return end if ! IMPROVEMENT if ((mod(num,2) == 0) .or. (mod(num,3) == 0) .or. (mod(num,5) == 0) .or. (mod(num,7) == 0)) then stat = .false. return end if ! write(*,*) 'limit = ',limit do i = 2,num - 1 ! write(*,*) 'mod(',limit,i,') = ',mod(limit,i) if (mod(num,i) == 0) then stat = .false. return end if end do stat = .true. return end Now the second version excludes more than half of numbers e.g. all that are divisible by 2,3,5,7. How is it possible that when I time the execution with the linux 'time' program, the 'improved' version performs just as slowly? Am I missing some obvious trick? Searching the first 900000 numbers: 1st: 4m28sec 2nd 4m26sec
The multiples of 2, 3, 5, and 7 are quickly rejected by the original algorithm anyway, so jumping over them does not improve the performance at all. Where the algorithm spends most of its time is in proving that numbers with large prime factors are composite. To radically improve the performance you should use a better primality test, such as Miller-Rabin. A simpler improvement is testing factors only up to sqrt(num), not num-1. If that doesn't make immediate sense, think about how big the smallest prime factor of a composite number can be. Also, if you are looking for primes from 1 to N, it may be more efficient to use a prime number sieve, or testing divisibility only by primes you have already found.
I just recently coded something similar ;-) ! Algorithm taken from https://en.wikipedia.org/wiki/Sieve_of_Eratosthenes subroutine eratosthenes_sieve(n, primes) implicit none integer,intent(in) :: n integer,allocatable,intent(out) :: primes(:) integer :: i, j, maxPrime, stat logical :: A(n) maxPrime = floor(sqrt(real(n))) A = .true. do i=2,maxPrime j = i*i do A(j) = .false. j = j + i ; if ( j .gt. n ) exit enddo enddo !i allocate( primes( count(A)-1 ), stat=stat ) if ( stat /= 0 ) stop 'Cannot allocate memory!' j = 1 do i=2,n ! Skip 1 if ( .not. A(i) ) cycle primes( j ) = i j = j + 1 ; if ( j > size(primes) ) exit enddo end subroutine This algorithm gives you all prime numbers up to a certain number, so you can easily check whether your prime is included or not: if ( any(number == prime) ) write(*,*) 'Prime found:',number