MPI program for the Poisson equation stuck after one iteration

I am trying to solve the Poisson equation on the unit square domain (0,1) x (0,1) using MPI and overlapping sub-domains. Currently, my code takes as input the number of domain divisions in the X and Y directions, the overlap between two sub-domains as a fraction of the sub-domain length, and the number of elemental divisions in the overlap.
The input file looks like this:
2,2
10,10
10,10
program main
!implicit none
include 'mpif.h'
integer cols, divfx, divfy, iter
integer xdiv, ydiv, info, max_iter, x_shift, y_shift
integer, allocatable:: ipiv(:)
double precision, allocatable :: A(:,:), Ainv(:,:)
real, allocatable:: edge(:,:)
double precision, allocatable :: u(:,:), f(:,:)
double precision, allocatable :: u_exact(:,:)
allocatable :: Left(:,:), Right(:,:)
allocatable :: Top(:,:), Bottom(:,:)
allocatable :: TempLeft(:,:), TempRight(:,:)
allocatable :: TempTop(:,:), TempBottom(:,:)
integer myid, master, numprocs, ierr, status(MPI_STATUS_SIZE)
integer i, j, numsent, sender, L, T, R, B
integer anstype, row, dovfx, dovfy, domx, domy, idx
real dom1,dom2,buff
double precision mesh(2), buffer(4), divx, divy, dx, dy, xd, yd
double precision error, derror, error_norm
character(len=100) :: domaindata
call MPI_INIT(ierr)
call MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr)
call MPI_COMM_SIZE(MPI_COMM_WORLD, numprocs, ierr)
master = 0
divx=0.d0
divy=0.d0
! Input the number of divisions for domain decomposition and calculate sub-domain dimensions.
open(1, file='Inputdata.dat', status='old')
! read(1,*) domx,domy
! read(1,*) dovfx,dovfy
! read(1,*) divfx,divfy
write(*,*)'Starting the Program'
write(*,*) "Enter the number of domain divisions in x-direction &
and y-direction ( Enter 4 if you want three sub-domains)"
read(1,*) domx,domy
write(*,*) domx,domy
write(*,*) "Total number of sub-domains for the problem"
write(*,*) domx*domy
write(*,*) "Enter the sub-domain overlap in x & y -direction as &
a fraction of sub-domain length (multiple of 10)"
read(1,*) dovfx,dovfy
write(*,*) dovfx,dovfy
write(*,*) "Enter the number of divisions in the overlap in &
x & yas a fraction of sub-domain(multiple of 5)"
read(1,*) divfx,divfy
write(*,*) divfx,divfy
divx=1.d0/(((1.d0/domx)/dovfx)/divfx)
divy=1.d0/(((1.d0/domy)/dovfy)/divfy)
write(*,*)"Total number of elemental divisions for the &
problem domain (0,1) in both dimensions"
write(*,*) divx, divy, divx*divy
write(*,*)"Total number of nodal divisions for the problem domain"
write(*,*) (divx+1)*(divy+1)
! time
! **************************
tic = MPI_Wtime();
! Maximum number of iterations.
max_iter=100
! Mesh Size
mesh(1)=1/divx
mesh(2)=1/divy
write(*,*) 'Element Size'
write(*,*) mesh(1), mesh(2)
if ( myid .eq. master ) then
! Send iteration number to subdomain and receive the error from each to
! calculate total error.
write(*,*) 'still1'
do 10 iter = 1,max_iter
do 20 i = 1,domx*domy
call MPI_SEND(iter, 1, MPI_INTEGER, i, i, MPI_COMM_WORLD, ierr)
20 continue
! Receive results obtained from sub-processor/sub-domain
!
error = 0.d0
do 30 i = 1,domx*domy
call MPI_RECV(d_error , 1, MPI_DOUBLE_PRECISION, i, iter, &
MPI_COMM_WORLD, status, ierr)
error = error + d_error
30 continue
write(*,*) 'In iteration ', iter, 'cumulative error is', error*1.d0/domx/domy
10 continue
! time:
! *************
toc = MPI_Wtime();
! Write results to output
! **************************
write(*,*)
write(*,*) 'Time taken for parallel computation is: ',(toc-tic)*1000, 'milliseconds'
else
!************************************ Slaves receive mesh size for discretization ******************************************************
write(*,*) 'iter', iter
write(*,*) 'myid', myid
! Slaves receive corners, then creates a Cartesian grid for finite
! difference until done message received, for one iteration.
! This is done for the first iteration
! Get Domain ID :
if (myid.gt.(domx*domy)) goto 200
write(*,*) 'still31'
1000 call MPI_RECV(iter, 1, MPI_INTEGER, master, MPI_ANY_TAG, MPI_COMM_WORLD, status, ierr)
if (status(MPI_TAG) .eq. 0) goto 200
write(*,*) 'still4'
if (iter.eq.1) then
write(*,*) 'still5'
dom1=domx
dom2=domy
allocate (edge(domx*domy,4))
! Determining the edge matrices for each subdomain - the bounding box
do j =1,domx
do k=1,domy
idx=(j-1)*(domx-1)+k+(j-1)
buff=REAL((mod(idx-1,domx)))/domx
IF (buff-((1.d0/domx)/dovfx) .gt. 0) THEN
buff=buff-((1.d0/domx)/dovfx)
ENDIF
edge(idx,1) = buff
IF ((mod(idx ,domx)) .eq. 0) THEN
buff=1
ELSE
buff=REAL(mod(idx ,domx))/domx
ENDIF
!write(*,*) buff
IF (buff + ((1.d0/domx)/dovfx) .lt. 1) THEN
buff=buff+((1.d0/domx)/dovfx)
ENDIF
edge(idx,2) = buff
!
buff=REAL(floor((idx-1)/dom1))/dom1
IF (buff -((1.d0/domy)/dovfy) .gt. 0) THEN
buff=buff-((1.d0/domy)/dovfy)
ENDIF
edge(idx,3) = buff
buff=REAL(ceiling(idx/dom1))/dom1
IF (buff+((1.d0/domy)/dovfy) .lt. 1) THEN
buff= buff+((1.d0/domy)/dovfy)
ENDIF
edge(idx,4) = buff
end do
end do
write(*,*) myid, edge(myid,:)
write(*,*) 'iter', iter
call Surround_dom(myid,domx,domy,LeftC, RightC, BottomC, TopC)
! Calculate data for the matrices: Divisions in each subdomain. :
xdiv=(edge(myid,2)-edge(myid,1))/mesh(1)
ydiv=(edge(myid,4)-edge(myid,3))/mesh(2)
dx=mesh(1)
dy=mesh(2)
allocate (A((xdiv-1)*(ydiv-1),(xdiv-1)*(ydiv-1)))
allocate (Ainv((xdiv-1)*(ydiv-1),(xdiv-1)*(ydiv-1)))
allocate (u((xdiv-1)*(ydiv-1),1),f((xdiv-1)*(ydiv-1),1))
allocate (u_exact((xdiv-1)*(ydiv-1),1))
allocate (ipiv((xdiv-1)*(ydiv-1)))
allocate (Left((ydiv-1),1),Right((ydiv-1),1))
allocate (Top((xdiv-1),1), Bottom(((xdiv-1)),1))
allocate (TempLeft((ydiv-1),1),TempRight((ydiv-1),1))
allocate (TempTop((xdiv-1),1), TempBottom(((xdiv-1)),1))
Left = 0.d0; Right = 0.d0; Bottom = 0.d0; Top = 0.d0;
TempLeft = 0.d0; TempRight = 0.d0; TempBottom = 0.d0; TempTop = 0.d0;
A=0;
endif
write(*,*) 'still6'
! ******************************************************************
! SendReceive data based on location
! ******************************************************************
if (LeftC.ne.0) then
call MPI_SENDRECV(Left, ydiv - 1, MPI_DOUBLE_PRECISION, LeftC, iter, &
TempLeft, ydiv - 1, MPI_DOUBLE_PRECISION, LeftC, iter, MPI_COMM_WORLD, status, ierr)
end if
if (RightC.ne.0) then
call MPI_SENDRECV(Right, ydiv - 1, MPI_DOUBLE_PRECISION, RightC, iter, &
TempRight, ydiv - 1, MPI_DOUBLE_PRECISION, RightC, iter, MPI_COMM_WORLD, status, ierr)
end if
if (BottomC.ne.0) then
call MPI_SENDRECV(Bottom, xdiv - 1, MPI_DOUBLE_PRECISION, BottomC, iter, &
TempBottom, xdiv - 1, MPI_DOUBLE_PRECISION, BottomC, iter, MPI_COMM_WORLD, status, ierr)
end if
if (TopC.ne.0) then
call MPI_SENDRECV(Top, xdiv - 1, MPI_DOUBLE_PRECISION, TopC, iter, &
TempTop, xdiv - 1, MPI_DOUBLE_PRECISION, TopC, iter, MPI_COMM_WORLD, status, ierr)
end if
Left = TempLeft ;
Right = TempRight;
Top = TempTop ;
Bottom = TempBottom;
write(*,*) 'still7'
! Form the coefficient matrices
do i =1,(xdiv-1)*(ydiv-1)
A(i,i)=-2.d0*(1.d0/(dx**2)+1.d0/(dy**2))
enddo
do i=1,(xdiv-2)
do j=1,(ydiv-1)
A(i+(j-1)*(xdiv-1),i+(j-1)*(xdiv-1)+1)=1.d0/(dx**2)
A(i+(j-1)*(xdiv-1)+1,i+(j-1)*(xdiv-1))=1.d0/(dx**2)
enddo
enddo
do i=1,(xdiv-1)
do j=1,(ydiv-2)
A(i+(j-1)*(xdiv-1),i+(j)*(xdiv-1))=1.d0/(dy**2)
A(i+(j)*(xdiv-1),i+(j-1)*(xdiv-1))=1.d0/(dy**2)
enddo
enddo
write(*,*) 'still9'
L=1
T=1
R=1
B=1
write(*,*) 'still10'
! Impose Boundary Conditions in F matrix
do i=1,(xdiv-1)*(ydiv-1)
xd = edge(myid,1) + (dx)*mod(i,(xdiv-1))
if (mod(i,xdiv-1).eq.0) xd = edge(myid,1) + (dx)*(xdiv-1)
yd = edge(myid,3) + (dy)*ceiling(i*1.d0/(xdiv-1))
!if (iter.eq.1 .and. myid.eq.2) write(*,*) xd,yd
u_exact(i,1) = sin(2.d0*3.1415*xd)*sin(2.d0*3.1415*yd)
f(i,1) = 8.d0*3.1415*3.1415*u_exact(i,1)
IF (mod(i,(xdiv-1)) .eq. 1) THEN
f(i,1)= f(i,1)+Left(L,1)/dx/dx
L=L+1
ENDIF
IF (mod(i,(xdiv-1)) .eq. 0) THEN
f(i,1)=f(i,1)+Right(R,1)/dx/dx
R=R+1
ENDIF
IF (i .le. (xdiv-1)) THEN
f(i,1)=f(i,1)+Bottom(B,1)/dy/dy
B=B+1
ENDIF
IF (i .gt. (xdiv-1)*(ydiv-2)) THEN
f(i,1)=f(i,1)+Top(T,1)/dy/dy
T=T+1
END IF
! enddo
enddo
!Solve AU=F by LU factorization!
write(*,*) 'still11'
do i=1,(xdiv-1)*(ydiv-1)
do j=1,(xdiv-1)*(ydiv-1)
Ainv(i,j)=A(i,j)
end do
end do
! do i=1,(xdiv-1)*(ydiv-1)
! write(*,*) myid,Ainv(i,i)
!end do
call DGESV((xdiv-1)*(ydiv-1), 1, A, &
(xdiv-1)*(ydiv-1), ipiv, f, (xdiv-1)*(ydiv-1), info)
write(*,*) 'still12'
call ErrorNorm(f,u_exact,(xdiv-1)*(ydiv-1),error_norm)
write(*,*) 'still13'
! ****************************************************
! Update boundary conditions based on new solution:
! ****************************************************
x_shift = divfx-1 ;
y_shift = divfy-1 ;
! write(*,*) 'LeftC', myid,LeftC,RightC,TopC,BottomC
if (LeftC.ne.0) then
do 50 i = 1,ydiv - 1
Left(i,1) = f((xdiv - 1)*(i - 1) + 1 + x_shift,1)
!if ((myid.eq.2).and.(iter.eq.1)) write(*,*) 'for left',i, &
!(xdiv - 1)*(i - 1) + 1 + x_shift
50 continue
end if
if (RightC.ne.0) then
do 60 i = 1,ydiv - 1
Right(i,1) = f((xdiv - 1)*i - x_shift,1)
!if ((myid.eq.1).and.(iter.eq.1)) write(*,*) 'for right',i, &
!(xdiv - 1)*i - x_shift
60 continue
end if
if (TopC.ne.0) then
do 70 i = 1,xdiv - 1
Top(i,1) = f((xdiv - 1)*((ydiv - 2) - y_shift) + i,1)
!if ((myid.eq.1).and.(iter.eq.1)) write(*,*) 'for top',i, &
!((xdiv - 1)*((ydiv - 2) - y_shift) + i)
70 continue
end if
if ( BottomC.ne.0) then
do 80 i = 1,xdiv - 1
Bottom(i,1) = f((xdiv - 1)*y_shift + i,1)
!if ((myid.eq.3).and.(iter.eq.1)) write(*,*) 'for bottom',i, &
!((xdiv - 1)*y_shift + i)
80 continue
end if
write(*,*) 'still14'
TempLeft =Left;
TempRight = Right;
TempTop = Top;
TempBottom = Bottom;
call MPI_SEND(error_norm, 1, MPI_DOUBLE_PRECISION, master, iter, &
MPI_COMM_WORLD, ierr)
write(*,*) 'still15'
if (iter.lt.iter_max) go to 1000
! *********************************************************************************
! Write solution to data file to view the results.
! *********************************************************************************
write (domaindata, "(A7,I2,A4)") "domain_",myid,".dat"
open (unit=myid*10, file = domaindata)
write (myid*10,*) ' VARIABLE= "X","Y","U_EXACT","U_CALC" '
do i=1,(xdiv-1)*(ydiv-1)
xd = edge(myid,1) + (dx)*mod(i,(xdiv-1))
if (mod(i,xdiv-1).eq.0) xd = edge(myid,1) + (dx)*(xdiv-1)
yd = edge(myid,3) + (dy)*ceiling(i*1.d0/(xdiv-1))
write (myid*10,*) xd, yd, u_exact(i,1), f(i,1)
enddo
write(*,*) 'still16'
if (iter.eq.max_iter) go to 200
200 continue
write(*,*) 'still45'
endif
call MPI_FINALIZE(ierr)
stop
end program main
subroutine Surround_dom(myid,domx,domy,LeftID, RightID, BottomID, TopID)
implicit none
integer myid, j, k, domy, domx, BottomID, TopID, LeftID, RightID
j = ceiling(1.d0*myid/domx)
k = mod(myid,domx)
if (k.eq.0) k = domx
! Domain on the left
if(k.eq.1) then
LeftID = 0
else
LeftID = ((j-1)*domx + k) - 1
end if
! Domain on the Right
if(k.eq.domx) then
RightID = 0
else
RightID = ((j-1)*domx + k) + 1
end if
! Domain on the Bottom
if(j.eq.1) then
BottomID = 0
else
BottomID = ((j-1)*domx + k) - domx
end if
! Domain on the Top
if(j.eq.domy) then
TopID = 0
else
TopID = ((j-1)*domx + k) + domx
end if
return
end
subroutine ErrorNorm(u,u_exact,N,error_norm)
implicit none
double precision u(N), u_exact(N), err, error_norm
integer i, N
error_norm = 0.d0
do 10 i = 1,N
err = (u(i) - u_exact(i))
error_norm = error_norm + err*err
10 continue
error_norm = sqrt(error_norm)/(N*1.d0)
return
end
I expect the code to run through all the iterations and give a respectable error, on the order of 1e-3 to 1e-4.
Currently, no error message shows up: the code runs successfully for one iteration and then doesn't produce any output at all, even after days. It would be really helpful if I could get some guidance. I am sorry the structure of my code is awkward, I am just a beginner. It also won't run if the number of domains is odd, or if the number of domains is not equal to the number of processors. Any suggestions on how to remove these limitations are also welcome.
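For reference, here is a minimal, self-contained sketch (not the poster's code) of the master/worker handshake the program above appears to be aiming for, assuming the intent is that a message with tag 0 means "stop", which is what the worker's check of status(MPI_TAG) suggests. The placeholder derror stands in for the local sub-domain solve.
program handshake_sketch
   implicit none
   include 'mpif.h'
   integer myid, numprocs, ierr, status(MPI_STATUS_SIZE)
   integer iter, max_iter, i, done
   double precision derror, error
   call MPI_INIT(ierr)
   call MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr)
   call MPI_COMM_SIZE(MPI_COMM_WORLD, numprocs, ierr)
   max_iter = 3
   done = 0
   if (myid .eq. 0) then
      do iter = 1, max_iter
         ! one message per worker, tagged with the iteration number (never 0)
         do i = 1, numprocs-1
            call MPI_SEND(iter, 1, MPI_INTEGER, i, iter, MPI_COMM_WORLD, ierr)
         end do
         error = 0.d0
         do i = 1, numprocs-1
            call MPI_RECV(derror, 1, MPI_DOUBLE_PRECISION, i, iter, &
                          MPI_COMM_WORLD, status, ierr)
            error = error + derror
         end do
         write(*,*) 'iteration', iter, 'summed error', error
      end do
      ! explicit shutdown: a tag-0 message tells every worker to stop
      do i = 1, numprocs-1
         call MPI_SEND(done, 1, MPI_INTEGER, i, 0, MPI_COMM_WORLD, ierr)
      end do
   else
      do
         call MPI_RECV(iter, 1, MPI_INTEGER, 0, MPI_ANY_TAG, &
                       MPI_COMM_WORLD, status, ierr)
         if (status(MPI_TAG) .eq. 0) exit
         derror = 1.d0/iter              ! placeholder for the local solve
         call MPI_SEND(derror, 1, MPI_DOUBLE_PRECISION, 0, iter, &
                       MPI_COMM_WORLD, ierr)
      end do
   end if
   call MPI_FINALIZE(ierr)
end program handshake_sketch
In this pattern the workers only leave their receive loop when they actually get a tag-0 message, so the master has to send one after the last iteration.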

Related

OpenMP on Fortran 90 takes almost the same time (if not more) as the non-parallelized program

I'm trying to parallelize a simulation of a 2D Ising model to get some expectation values as a function of the temperature of the system. For L=48, the single-threaded version takes about 240 seconds to run 20 temperatures with 1 seed each, but the parallelized version takes about 268 seconds, which is similar.
Per temperature and per seed that is 12 seconds for the single-threaded version and 13.4 seconds for the parallelized one. I'm looking for help with my code because I don't understand these timings: I thought the parallelized version would split the temperatures among the threads and should therefore take about 30 seconds to complete.
I need to run the simulation for 50 temperatures and 200 seeds each, for 5 values of L. It would help to reduce the compute time, because otherwise it could take 20 hours for L=48 and several days for L=72.
I'm using an i7-10700KF (8 cores, 16 logical threads).
program Ising
use omp_lib
implicit none
integer L, seed, i, j, seed0, nseed,k
parameter (L=48)
integer s(1:L, 1:L)
integer*4 pbc(0:L+1), mctot, N, mcd, mcini, difE
real*8 genrand_real2, magne, energ, energia, temp, temp1, DE
real*8 mag, w(-8:8)
real*8 start, finish
real*8 sum, sume, sume2, summ, summ2, sumam, vare, varm, maxcv, maxx
real*8 cv, x, Tmaxcv, Tmaxx
integer irand, jrand
11 format(10(f20.6))
! Initialize variables
mctot = 80000
mcd = 20
mcini = 8000
N = L*L
seed0 = 20347880
nseed = 20
maxcv=0.d0
maxx=0.d0
! Initialize vector pbc
pbc(0) = L
pbc(L+1) = 1
do i = 1, L
pbc(i) = i
end do
! Initialize matrix s with random values
do i = 1, L
do j = 1, L
if (genrand_real2() < 0.5) then
s(i,j) = 1
else
s(i,j) = -1
endif
end do
end do
! Metropolis algorithm
open(1, file='Expectation values.dat')
start = omp_get_wtime()
write(1,*) '#Temp, ','E, ','E2, ','M, ','M2, ','|M|, ','VarE, ','VarM, ',&
'Cv, ','X, '
!Start loop to calculate for different temperatures
!$OMP PARALLEL PRIVATE(s,seed,w,energia,difE,irand,jrand,temp,mag,sum,sume,sume2,summ,summ2,sumam,vare,varm,cv,x)
temp1 = 1.59d0
!$OMP DO ordered schedule(dynamic)
do k = 1, 10
temp = temp1 + (0.01d0*k)
!Define the matrix w, which contains the values of the Boltzmann function for each temperature, so as not to have to calculate them each iteration
do i = -8, 8
w(i) = dexp(-i/temp)
end do
write(*,*) "Temperature: ", temp, "Thread", omp_get_thread_num()
sum = 0.d0
sume = 0.d0
sume2 = 0.d0
summ = 0.d0
summ2 = 0.d0
sumam = 0.d0
do seed = seed0, seed0 + nseed-1, 1
call init_genrand(seed)
call reinicia(s,l)
energia = energ(s,l,pbc)
do i = 1, mctot
do j = 1, N
irand = int(genrand_real2()*L) + 1
jrand = int(genrand_real2()*L) + 1
difE = int(DE(s,l,irand,jrand,pbc))
if (difE < 0) then
s(irand,jrand) = -s(irand,jrand)
energia = energia + difE
else if (genrand_real2() < w(int(difE))) then
s(irand,jrand) = -s(irand,jrand)
energia = energia + difE
endif
end do
if ((i > mcini).and.(mcd*(i/mcd)==i)) then
mag= magne(s,l)
sum = sum + 1.d0
sume = sume + energia
sume2 = sume2 + energia**2
summ = summ + mag
summ2 = summ2 + mag**2
sumam = sumam + abs(mag)
endif
end do
end do
!Energy
sume=sume/(sum*N)
sume2=sume2/(sum*N*N)
!Magnetitzation
summ = summ/(sum*N)
sumam=sumam/(sum*N)
summ2=summ2/(sum*N*N)
!Variances
vare = dsqrt(sume2-sume*sume)/dsqrt(sum)
varm = dsqrt(summ2-summ*summ)/dsqrt(sum)
!Cv
cv = (N*(sume2-sume*sume))/temp**2
if (cv.gt.maxcv) then
maxcv=cv
Tmaxcv=temp
endif
!X
x = (N*(summ2-summ*summ))/temp
if (x.gt.maxx) then
maxx=x
Tmaxx=temp
endif
write(1,11) temp,sume,sume2,summ,summ2,sumam,vare,varm,cv,x
end do
!$OMP END DO
!$OMP END PARALLEL
finish = omp_get_wtime()
close(1)
print*, "Time: ",(finish-start),"Seconds"
end program Ising
! Functions
!Function that calculates the energy of the matrix s
real*8 function energ(S,L, pbc)
implicit none
integer s(1:L, 1:L), i, j, L
integer*4 pbc(0:L+1)
real*8 ene
ene = 0.0d0
do i = 1, L
do j = 1, L
ene = ene - s(i,j) * s(pbc(i+1),j) - s(i,j) * s(i,pbc(j+1))
end do
end do
energ = ene
return
end function energ
!Function that calculates the difference in energy that occurs when the spin of position (i, j) is changed
real*8 function DE(S,L,i,j,pbc)
implicit none
integer s(1:L, 1:L), i, j, L, difE
integer*4 pbc(0:L+1)
real*8 suma
difE = 0
suma = 0.0d0
suma = suma + s(pbc(i-1),j) + s(pbc(i+1),j) + s(i,pbc(j-1)) + s(i,pbc(j+1))
difE = difE + int(2 * s(i,j) * suma)
DE = difE
return
end function DE
!Function that calculates the magnetization of the matrix s
real*8 function magne(S,L)
implicit none
integer s(1:L, 1:L),L
magne = sum(s)
return
end function magne
! SUBRUTINES
!Subroutine that resets the matrix s with random values
subroutine reinicia(S,L)
implicit none
integer s(1:L, 1:L), i,j,L
real*8 genrand_real2
do i = 1, L
do j = 1, L
if (genrand_real2() < 0.5) then
s(i,j) = 1
else
s(i,j) = -1
endif
end do
end do
return
end subroutine
I have tried parallelizing the seeds loop instead of the temperatures, but it takes almost the same time, so I think I'm not parallelizing it correctly, because it looks like a nice code to parallelize.
The other option I thought of is to parallelize the simulation manually: compile 16 programs, each of which handles a different range of temperatures, and then run all of them concurrently, so each program gets its own thread. However, this approach would require a lot of extra RAM.
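For what it's worth, here is a minimal OpenMP sketch (independent of the Ising code) of the pattern described above: the temperature loop is parallelized, each iteration writes into its own slot of a shared results array, and all file I/O happens after the parallel region. The intrinsic random_number is used here only as a stand-in for genrand_real2, whose thread-safety would have to be checked separately.
program temp_loop_sketch
  use omp_lib
  implicit none
  integer, parameter :: nt = 10
  integer :: k, i
  real*8 :: temp, acc, r
  real*8 :: results(nt)
  !$OMP PARALLEL DO PRIVATE(temp, acc, r, i) SCHEDULE(dynamic)
  do k = 1, nt
     temp = 1.59d0 + 0.01d0*k
     acc = 0.d0
     do i = 1, 100000                  ! stand-in for the Metropolis sweeps
        call random_number(r)
        acc = acc + exp(-r/temp)
     end do
     results(k) = acc                  ! each k owns its own slot: no race
  end do
  !$OMP END PARALLEL DO
  open(1, file='results_sketch.dat')
  do k = 1, nt
     write(1,*) 1.59d0 + 0.01d0*k, results(k)
  end do
  close(1)
end program temp_loop_sketch
If a standalone loop of this shape scales on your machine but the full code does not, shared state inside genrand_real2 and the write(1,11) inside the parallel loop would be worth checking first.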

Problem splitting an array with MPI_ISEND in Fortran

I wrote the following lines of code with the aim of dividing the 1D array Jp (which lives on the master process only) among the different processes (including the master one).
Each process has to receive a block of non-contiguous, modified data (the values are changed in the inner loop), and I created a new type newF with the MPI_TYPE_INDEXED function to select the correct portion of data to send.
I used both MPI_RECV and MPI_IRECV to receive the data.
The problem is that this part of the code works fine, with any number of tasks (from 1 to 8), as long as the number of elements of Jp is small; when I increase that number (e.g. n = 5000), not all the processes receive the data and the split array JpS still shows the values I used to initialize it (i.e. -10000).
The commented lines show all the changes I tried in order to solve this problem. Does anyone have an idea?
program test_send
use mpi
implicit none
integer :: rank, nproc, mpi_stat
integer :: n, m, k, io, i, j
integer, allocatable :: Jp(:), JpS(:), JpAux(:)
integer :: count, n_distro,newF
integer, allocatable :: sendcounts(:), displ(:), &
blocklens(:), blockdisp(:), &
request(:)
integer :: ARRAY_OF_STATUS(MPI_STATUS_SIZE), error
data count /3/
call mpi_init(mpi_stat)
call mpi_comm_size(mpi_comm_world, nproc, mpi_stat)
call mpi_comm_rank(mpi_comm_world, rank, mpi_stat)
n = 400*count
allocate(sendcounts(nproc), displ(nproc), &
blocklens(count), blockdisp(count), request(nproc))
if (rank.eq.0) then
allocate(Jp(n+1),JpAux(n+1))
Jp = 0
do i = 1,n+1
Jp(i) = i
enddo
endif
call mpi_barrier(mpi_comm_world, mpi_stat)
m = n/count
n_distro = (m+1)/nproc
k = 0
do i = 1,nproc
if (i<nproc) then
sendcounts(i) = n_distro
else
sendcounts(i) = m - (nproc-1)*n_distro
endif
displ(i) = k
k = k + sendcounts(i)
enddo
call mpi_barrier(mpi_comm_world, mpi_stat)
allocate(JpS(count*sendcounts(rank+1)+1))
call mpi_barrier(mpi_comm_world, mpi_stat)
! call mpi_irecv(JpS, (sendcounts(rank+1))*count+1,mpi_int,0,0,mpi_comm_world, request(rank+1), mpi_stat)
! call mpi_recv(JpS, (sendcounts(rank+1))*count+1,mpi_int,0,0,mpi_comm_world, MPI_STATUS_IGNORE,mpi_stat)
!call mpi_waitall(1,request,ARRAY_OF_STATUS,error)
! call mpi_barrier(mpi_comm_world, mpi_stat)
if (rank.eq.0) then
do i = 0,nproc-1
JpAux = -100000
blocklens = spread(sendcounts(i+1),1,count)
blockdisp = spread(displ(i+1),1,count) + (/ (k*m, k=0,count-1) /)
blocklens(count) = blocklens(count)+1
do j = 1,count
if (j.eq.1) then
JpAux(blockdisp(j)+1:blockdisp(j)+blocklens(j)) = Jp(blockdisp(j)+1:blockdisp(j)+blocklens(j))&
-Jp(blockdisp(j)+1)
else
JpAux(blockdisp(j)+1:blockdisp(j)+blocklens(j)) = Jp( blockdisp(j) + 1 : blockdisp(j) + blocklens(j) )&
-Jp( blockdisp(j)+1 ) + JpAux( blockdisp(j-1) + blocklens(j-1))&
+(Jp( blockdisp(j-1)+blocklens(j-1)+1 )-Jp( blockdisp(j-1)+blocklens(j-1)))
endif
enddo
call mpi_type_indexed(count, blocklens, blockdisp, mpi_int, newF, mpi_stat)
call mpi_type_commit(newF, mpi_stat)
call mpi_isend(JpAux, 1, newF, i, i, mpi_comm_world, request(i+1), mpi_stat)
call mpi_type_free(newF, mpi_stat)
enddo
endif
! call mpi_wait(request(rank+1), ARRAY_OF_STATUS, mpi_stat)
call mpi_barrier(mpi_comm_world, mpi_stat)
!call mpi_waitall(1,request,ARRAY_OF_STATUS,error)
call mpi_recv(JpS, (sendcounts(rank+1))*count+1,mpi_int,0,MPI_ANY_TAG,mpi_comm_world, MPI_STATUS_IGNORE,mpi_stat)
! print*, request
print*, 'rank: ', rank, ', size: ', size(JpS), ', Jp: ', JpS
call mpi_barrier(mpi_comm_world, mpi_stat)
call mpi_finalize(mpi_stat)
end program test_send
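For reference, here is a minimal sketch of the indexed-type pattern described above (it is not a fix of the program, and the block lengths and displacements are made up for illustration): the root builds an MPI_TYPE_INDEXED type, posts non-blocking sends, every rank posts a matching receive of plain integers, and the root completes the sends with MPI_WAITALL before freeing the type and reusing the buffer.
program indexed_sketch
  use mpi
  implicit none
  integer :: rank, nproc, ierr, newtype, i
  integer :: blocklens(2), displs(2)
  integer :: src(12), dst(6)
  integer, allocatable :: req(:)
  call mpi_init(ierr)
  call mpi_comm_rank(mpi_comm_world, rank, ierr)
  call mpi_comm_size(mpi_comm_world, nproc, ierr)
  if (rank == 0) then
     src = [(i, i = 1, 12)]
     blocklens = [3, 3]                ! two blocks of three elements
     displs    = [0, 6]                ! taken at offsets 0 and 6 of src
     call mpi_type_indexed(2, blocklens, displs, mpi_integer, newtype, ierr)
     call mpi_type_commit(newtype, ierr)
     allocate(req(nproc))
     do i = 0, nproc - 1
        call mpi_isend(src, 1, newtype, i, 0, mpi_comm_world, req(i+1), ierr)
     end do
  end if
  ! every rank, the root included, receives six plain integers
  call mpi_recv(dst, 6, mpi_integer, 0, 0, mpi_comm_world, &
                mpi_status_ignore, ierr)
  print *, 'rank', rank, 'got', dst
  if (rank == 0) then
     ! complete the sends before the type is freed and src is reused
     call mpi_waitall(nproc, req, mpi_statuses_ignore, ierr)
     call mpi_type_free(newtype, ierr)
  end if
  call mpi_finalize(ierr)
end program indexed_sketch
The point of the wait is that mpi_isend only starts the transfer; the commented-out mpi_wait/mpi_waitall calls in the code above are the part that guarantees the send buffer may be reused.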

Efficient (Fast) Binary Tree in Fortran

I am using the procedure in the following code (which I took from here) in a program that I am trying to make run as fast as possible. The procedure is, however, very slow, probably because it is written for pedagogical clarity rather than for speed.
program tree_sort
! Sorts a file of integers by building a
! tree, sorted in infix order.
! This sort has expected behavior n log n,
! but worst case (input is sorted) n ** 2.
implicit none
type node
integer :: value
type (node), pointer :: left, right
end type node
type (node), pointer :: t ! A tree
integer :: number, ios
nullify (t) ! Start with empty tree
do
read (*, *, iostat = ios) number
if (ios < 0) exit
call insert (t, number) ! Put next number in tree
end do
! Print nodes of tree in infix order
call print_tree (t)
contains
recursive subroutine insert (t, number)
type (node), pointer :: t ! A tree
integer, intent (in) :: number
! If (sub)tree is empty, put number at root
if (.not. associated (t)) then
allocate (t)
t % value = number
nullify (t % left)
nullify (t % right)
! Otherwise, insert into correct subtree
else if (number < t % value) then
call insert (t % left, number)
else
call insert (t % right, number)
end if
end subroutine insert
recursive subroutine print_tree (t)
! Print tree in infix order
type (node), pointer :: t ! A tree
if (associated (t)) then
call print_tree (t % left)
print *, t % value
call print_tree (t % right)
end if
end subroutine print_tree
end program tree_sort
Is there any way to speed it up? I am using the procedure to sequentially add elements to a vector without adding repeated ones (so I changed the else in the insert subroutine to else if (number > t % value) then). Other than that, instead of printing, I store the values in a global variable.
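For clarity, here is a small self-contained version of the modified insert described above (values equal to an existing node match neither branch, so duplicates are simply dropped); the program and variable names are illustrative only:
program unique_tree_demo
  implicit none
  type node
     integer :: value
     type(node), pointer :: left, right
  end type node
  type(node), pointer :: t
  integer :: i
  integer :: sample(7) = [5, 3, 8, 3, 5, 1, 8]
  nullify(t)
  do i = 1, size(sample)
     call insert(t, sample(i))
  end do
  call print_tree(t)                  ! prints 1, 3, 5, 8: duplicates dropped
contains
  recursive subroutine insert(t, number)
    type(node), pointer :: t
    integer, intent(in) :: number
    if (.not. associated(t)) then
       allocate(t)
       t%value = number
       nullify(t%left, t%right)
    else if (number < t%value) then
       call insert(t%left, number)
    else if (number > t%value) then   ! equal values fall through: no insert
       call insert(t%right, number)
    end if
  end subroutine insert
  recursive subroutine print_tree(t)
    type(node), pointer :: t
    if (associated(t)) then
       call print_tree(t%left)
       print *, t%value
       call print_tree(t%right)
    end if
  end subroutine print_tree
end program unique_tree_demo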
Edit:
Here is the actual code:
MODULE MOD_PARAMETERS
USE, INTRINSIC :: ISO_FORTRAN_ENV
IMPLICIT NONE
SAVE
INTEGER(INT32), PARAMETER :: d = 10 ! number of dimensions
INTEGER(INT32), PARAMETER :: L_0 = 5 ! after this adaptive grid kicks in, for L <= L_0 usual sparse grid
INTEGER(INT32), PARAMETER :: L_max = 5 ! maximum level
INTEGER(INT32), PARAMETER :: bound = 1 ! 0 -> for f = 0 at boundary
! 1 -> adding grid points at boundary
! 2 -> extrapolating close to boundary
INTEGER(INT32), PARAMETER :: testing_sample = 10**4
INTEGER(INT32), PARAMETER :: error_sample = 10**2
REAL(REAL64), PARAMETER :: eps = 0.001D0 ! epsilon for adaptive grid
TYPE NODE
INTEGER :: value
TYPE (NODE), POINTER :: left, right
END TYPE NODE
INTEGER(INT32), DIMENSION(:), ALLOCATABLE :: tree_vector
INTEGER(INT32) :: iii
END MODULE MOD_PARAMETERS
SUBROUTINE FF(x,output)
USE MOD_PARAMETERS
IMPLICIT NONE
REAL(REAL64), DIMENSION(d), INTENT(IN) :: x
REAL(REAL64) , INTENT(OUT) :: output
output = 1.0D0/(ABS(0.5D0-SUM(x(:)**4.0D0))+0.1D0)
END SUBROUTINE
SUBROUTINE XX(n,L,i,output)
USE MOD_PARAMETERS
IMPLICIT NONE
INTEGER(INT32) , INTENT(IN) :: n
INTEGER(INT32), DIMENSION(n), INTENT(IN) :: L, i
REAL(REAL64), DIMENSION(n), INTENT(OUT) :: output
INTEGER(INT32) :: j
DO j = 1,n
IF ((bound .EQ. 0) .OR. (bound .EQ. 2)) THEN
output(j) = REAL(i(j),REAL64)/REAL(2**L(j),REAL64)
ELSEIF (bound .EQ. 1) THEN
output(j) = REAL(i(j),REAL64)/REAL(2**MAX(L(j)-1,1),REAL64)
ENDIF
ENDDO
END SUBROUTINE
SUBROUTINE XX_INV(L,x,output)
USE MOD_PARAMETERS
IMPLICIT NONE
INTEGER(INT32), DIMENSION(d), INTENT(IN) :: L
REAL(REAL64), DIMENSION(d), INTENT(IN) :: x
INTEGER(INT32), DIMENSION(d), INTENT(OUT) :: output
INTEGER(INT32) :: j
DO j = 1,d
IF ((bound .EQ. 0) .OR. (bound .EQ. 2)) THEN
output(j) = 2*FLOOR(x(j)*REAL(2**(L(j)-1),REAL64))+1
ELSEIF (bound .EQ. 1) THEN
IF (L(j) .EQ. 2) THEN
IF (x(j) .LT. 0.5D0) THEN
output(j) = 0
ELSE
output(j) = 2
ENDIF
ELSE
output(j) = 2*FLOOR(x(j)*(REAL(2**MAX(L(j)-2,0),REAL64)))+1
ENDIF
ENDIF
ENDDO
END SUBROUTINE
SUBROUTINE BASE(x,L,i,output)
USE MOD_PARAMETERS
IMPLICIT NONE
REAL(REAL64), INTENT(IN) :: x
INTEGER(INT32), INTENT(IN) :: L,i
REAL(REAL64), INTENT(OUT) :: output
IF (bound .EQ. 0) THEN
output = MAX((1.0D0-ABS(x*REAL(2**L,REAL64)-REAL(i,REAL64))),0.0D0)
ELSEIF (bound .EQ. 1) THEN
IF ((L .EQ. 1) .AND. (i .EQ. 1)) THEN
output = 1.0D0
ELSEIF ((L .EQ. 2) .AND. (i .EQ. 0)) THEN
output = MAX(1.0D0-2.0D0*x,0.0D0)
ELSEIF ((L .EQ. 2) .AND. (i .EQ. 2)) THEN
output = MAX(2.0D0*x-1.0D0,0.0D0)
ELSE
output = MAX((1.0D0-ABS(x*REAL(2**(L-1),REAL64)-REAL(i,REAL64))),0.0D0)
ENDIF
ELSEIF (bound .EQ. 2) THEN
IF ((L .EQ. 1) .AND. (i .EQ. 1)) THEN
output = 1.0D0
ELSEIF ((L .GT. 1) .AND. (i .EQ. 1)) THEN
output = MAX(2.0D0-REAL(2**L,REAL64)*x,0.0D0)
ELSEIF ((L .GT. 1) .AND. (i .EQ. (2**L)-1)) THEN
output = MAX(REAL(2**L,REAL64)*x+REAL(1-i,REAL64),0.0D0)
ELSE
output = MAX((1.0D0-ABS(x*REAL(2**L,REAL64)-REAL(i,REAL64))),0.0D0)
ENDIF
ENDIF
END SUBROUTINE
PROGRAM MAIN
USE MOD_PARAMETERS
IMPLICIT NONE
INTEGER(INT32), DIMENSION(d,d) :: ident
REAL(REAL64), DIMENSION(1) :: x1
REAL(REAL64), DIMENSION(d) :: xd
INTEGER(INT32), DIMENSION(2*d) :: temp
INTEGER(INT32), DIMENSION(:,:), ALLOCATABLE :: grid_index, temp_grid_index, grid_index_new, J_index, &
adj_list, temp_adj_list
INTEGER(INT32), DIMENSION(:), ALLOCATABLE :: to_do, to_do_new, to_add_ind
REAL(REAL64), DIMENSION(:), ALLOCATABLE :: coeff, temp_coeff, J_coeff
REAL(REAL64) :: temp_min, temp_max, V, T, B, F
INTEGER(INT32) :: i, k, k1, k2, h, j, L, n, dd, dsize, count, count1, count2, count3, flag, &
first, repeated, add, ind, adj_list_ind
INTEGER(INT32) :: time1, time2, time3, time4, clock_rate, clock_max
INTEGER(INT32), DIMENSION(d) :: LL, ii
REAL(REAL64), DIMENSION(error_sample,d) :: sample_x
REAL(REAL64), DIMENSION(error_sample) :: sample_e, interp1
REAL(REAL64) :: max_error, L2_error
REAL(REAL64), DIMENSION(testing_sample,d) :: x_rand
REAL(REAL64), DIMENSION(testing_sample) :: interp2
TYPE(NODE), POINTER :: tree
! ============================================================================
! EXECUTABLE
! ============================================================================
ident = 0
DO i = 1,d
ident(i,i) = 1
ENDDO
! Initial grid point
dsize = 1
ALLOCATE(grid_index(dsize,2*d),grid_index_new(dsize,2*d),adj_list(dsize,2*d))
grid_index(1,:) = 1
grid_index_new = grid_index
adj_list = 0
ALLOCATE(coeff(0:dsize))
coeff(0) = 0.0D0
xd = 0.5D0
CALL FF(xd,coeff(1))
L = 1
n = SIZE(grid_index_new,1)
ALLOCATE(J_index(n*2*d,2*d))
ALLOCATE(J_coeff(n*2*d))
ALLOCATE(to_add_ind(1))
to_add_ind = 1
CALL RANDOM_NUMBER(sample_x)
sample_e = 0.0D0
CALL SYSTEM_CLOCK (time1,clock_rate,clock_max)
DO WHILE (L .LT. L_max)
CALL SYSTEM_CLOCK (time3,clock_rate,clock_max)
L = L+1
n = SIZE(grid_index_new,1)
count = 0
first = 1
DEALLOCATE(J_index,J_coeff)
ALLOCATE(J_index(n*2*d,2*d))
ALLOCATE(J_coeff(n*2*d))
J_index = 0
J_coeff = 0.0D0
DO k = 1,n
adj_list_ind = 0
DO i = 1,d
DO j = 1,2
IF ((bound .EQ. 0) .OR. (bound .EQ. 2)) THEN
temp = grid_index_new(k,:)+(/ident(i,:),ident(i,:)*(grid_index_new(k,d+i)-(-1)**j)/)
ELSEIF (bound .EQ. 1) THEN
IF (grid_index_new(k,i) .EQ. 1) THEN
temp = grid_index_new(k,:)+(/ident(i,:),ident(i,:)*(-(-1)**j)/)
ELSE
temp = grid_index_new(k,:)+(/ident(i,:),ident(i,:)*(grid_index_new(k,d+i)-(-1)**j)/)
ENDIF
ENDIF
CALL XX(d,temp(1:d),temp(d+1:2*d),xd)
temp_min = MINVAL(xd)
temp_max = MAXVAL(xd)
IF ((temp_min .GE. 0.0D0) .AND. (temp_max .LE. 1.0D0)) THEN
IF (first .EQ. 1) THEN
first = 0
count = count+1
J_index(count,:) = temp
V = 0.0D0
DO k1 = 1,SIZE(grid_index,1)
T = 1.0D0
DO k2 = 1,d
CALL XX(1,temp(k2),temp(d+k2),x1)
CALL BASE(x1(1),grid_index(k1,k2),grid_index(k1,k2+d),B)
T = T*B
ENDDO
V = V+coeff(k1)*T
ENDDO
CALL FF(xd,F)
J_coeff(count) = F-V
adj_list(to_add_ind(k),adj_list_ind+1) = dsize+count
adj_list_ind = adj_list_ind+1
ELSE
repeated = 0
DO h = 1,count
IF (SUM(ABS(J_index(h,:)-temp)) .EQ. 0) THEN
repeated = 1
adj_list(to_add_ind(k),adj_list_ind+1) = dsize+h
adj_list_ind = adj_list_ind+1
ENDIF
ENDDO
IF (repeated .EQ. 0) THEN
count = count+1
J_index(count,:) = temp
V = 0.0D0
DO k1 = 1,SIZE(grid_index,1)
T = 1.0D0
DO k2 = 1,d
CALL XX(1,temp(k2),temp(d+k2),x1)
CALL BASE(x1(1),grid_index(k1,k2),grid_index(k1,k2+d),B)
T = T*B
ENDDO
V = V+coeff(k1)*T
ENDDO
CALL FF(xd,F)
J_coeff(count) = F-V
adj_list(to_add_ind(k),adj_list_ind+1) = dsize+count
adj_list_ind = adj_list_ind+1
ENDIF
ENDIF
ENDIF
ENDDO
ENDDO
ENDDO
ALLOCATE(temp_grid_index(dsize,2*d))
ALLOCATE(temp_coeff(dsize))
temp_grid_index = grid_index
temp_coeff = coeff
DEALLOCATE(grid_index,coeff)
ALLOCATE(grid_index(dsize+count,2*d))
ALLOCATE(coeff(0:dsize+count))
grid_index(1:dsize,:) = temp_grid_index
coeff(0:dsize) = temp_coeff
DEALLOCATE(temp_grid_index,temp_coeff)
grid_index(dsize+1:dsize+count,:) = J_index(1:count,:)
coeff(dsize+1:dsize+count) = J_coeff(1:count)
IF (L .LT. L_max) THEN ! put this after error threshhold when implemented
ALLOCATE(temp_adj_list(dsize,2*d))
temp_adj_list = adj_list
DEALLOCATE(adj_list)
ALLOCATE(adj_list(dsize+count,2*d))
adj_list = 0
adj_list(1:dsize,:) = temp_adj_list
DEALLOCATE(temp_adj_list)
ENDIF
dsize = dsize + count
IF (L .LE. L_0) THEN
DEALLOCATE(grid_index_new)
ALLOCATE(grid_index_new(count,2*d))
grid_index_new = J_index(1:count,:)
DEALLOCATE(to_add_ind)
ALLOCATE(to_add_ind(count))
to_add_ind = dsize-count + (/ (h,h=1,count) /)
ELSE
DEALLOCATE(to_add_ind)
ALLOCATE(to_add_ind(count))
add = 0
to_add_ind = 0
DO h = 1,count
IF (ABS(J_coeff(h)) .GT. eps) THEN
add = add + 1
J_index(add,:) = J_index(h,:)
to_add_ind(add) = dsize-count+h
ENDIF
ENDDO
DEALLOCATE(grid_index_new)
ALLOCATE(grid_index_new(add,2*d))
grid_index_new = J_index(1:add,:)
ENDIF
DO i = 1,error_sample
V = 0.0D0
DO k1 = 1,SIZE(grid_index,1)
T = 1.0D0
DO k2 = 1,d
CALL BASE(sample_x(i,k2),grid_index(k1,k2),grid_index(k1,k2+d),B)
T = T*B
ENDDO
V = V+coeff(k1)*T
ENDDO
CALL FF(sample_x(i,:),F)
sample_e(i) = F-V
interp1(i) = V
ENDDO
max_error = MAXVAL(ABS(sample_e))
L2_error = (SUM(sample_e**2.0D0)/REAL(error_sample,REAL64))**0.5D0
CALL SYSTEM_CLOCK (time4,clock_rate,clock_max)
WRITE(*,'(A,I5,A,F10.5,A,I8,A,F15.10,A,F15.10)') ' level = ', L,&
' time = ',REAL(time4-time3,REAL64)/REAL(clock_rate,REAL64),&
' grid points = ',SIZE(grid_index,1),&
' max error = ',max_error,&
' L2 error = ',L2_error
ENDDO
!PRINT *, ' '
!PRINT *, ' '
!PRINT *, ' '
!DO i = 1,SIZE(adj_list,1)
! PRINT *, i, adj_list(i,:)
!ENDDO
!PRINT *, ' '
!PRINT *, ' '
!PRINT *, ' '
!DO i = 1,dsize
! PRINT *, i, grid_index(i,:), coeff(i)
!ENDDO
!PRINT *, ' '
!PRINT *, ' '
!PRINT *, ' '
ALLOCATE (to_do(dsize),to_do_new(dsize),tree_vector(dsize))
CALL SYSTEM_CLOCK (time2,clock_rate,clock_max)
PRINT *, ' '
WRITE(*,'(A,F10.5)') ' total time for setup = ', REAL(time2-time1,REAL64)/REAL(clock_rate,REAL64)
! ============================================================================
! Compute interpolated values:
! ============================================================================
IF (testing_sample .EQ. error_sample) THEN
! x_rand = sample_x
ELSE
CALL RANDOM_NUMBER(x_rand)
ENDIF
count1 = 0
count2 = 0
count3 = 0
CALL SYSTEM_CLOCK (time1,clock_rate,clock_max)
DO i = 1,testing_sample
V = 0.0D0
to_do = 0
to_do(1) = 1
to_do_new = 0
k = 1
DO L = 1,L_max
NULLIFY (tree)
tree_vector = 0
CALL SYSTEM_CLOCK (time3,clock_rate,clock_max)
DO j = 1,k
ind = to_do(j)
T = 1.0D0
DO dd = 1,d
CALL BASE(x_rand(i,dd),grid_index(ind,dd),grid_index(ind,d+dd),B)
T = T*B
ENDDO
V = V + coeff(ind)*T
ENDDO
CALL SYSTEM_CLOCK (time4,clock_rate,clock_max)
count1 = count1 + time4-time3
IF (L .LT. L_max) THEN
n = k
k = 0
DO j = 1,n
IF (adj_list(to_do(j),1) .GT. 0) THEN
DO h = 1,2*d
CALL SYSTEM_CLOCK (time3,clock_rate,clock_max)
LL = grid_index(adj_list(to_do(j),h),1:d)
ii = grid_index(adj_list(to_do(j),h),d+1:2*d)
flag = 0
k1 = 1
DO WHILE ((flag .EQ. 0) .AND. (k1 .LE. d))
IF ((bound .EQ. 0) .OR. (bound .EQ. 2)) THEN
k2 = 2*FLOOR(x_rand(i,k1)*REAL(2**(LL(k1)-1),REAL64))+1
ELSEIF (bound .EQ. 1) THEN
IF (LL(k1) .EQ. 2) THEN
IF (x_rand(i,k1) .LT. 0.5D0) THEN
k2 = 0
ELSE
k2 = 2
ENDIF
ELSE
k2 = 2*FLOOR(x_rand(i,k1)*(REAL(2**MAX(LL(k1)-2,0),REAL64)))+1
ENDIF
ENDIF
IF (k2 .NE. ii(k1)) THEN
flag = 1
ENDIF
k1 = k1 +1
ENDDO
CALL SYSTEM_CLOCK (time4,clock_rate,clock_max)
count2 = count2 + time4-time3
! CALL SYSTEM_CLOCK (time3,clock_rate,clock_max)
IF (flag .EQ. 0) THEN
!IF (MINVAL(ABS(to_do_new(1:MAX(k,1))-adj_list(to_do(j),h))) .GT. 0) THEN
to_do_new(k+1) = adj_list(to_do(j),h)
k = k+1
CALL SYSTEM_CLOCK (time3,clock_rate,clock_max)
CALL INSERT(tree,to_do_new(k))
CALL SYSTEM_CLOCK (time4,clock_rate,clock_max)
count3 = count3 + time4-time3
!ENDIF
ENDIF
! CALL SYSTEM_CLOCK (time4,clock_rate,clock_max)
! count3 = count3 + time4-time3
ENDDO
ENDIF
ENDDO
CALL SYSTEM_CLOCK (time3,clock_rate,clock_max)
iii = 0
CALL PRINT_TREE(tree)
to_do = tree_vector
CALL SYSTEM_CLOCK (time4,clock_rate,clock_max)
count3 = count3 + time4-time3
!to_do = to_do_new
to_do_new = 0
ENDIF
ENDDO
interp2(i) = V
ENDDO
CALL SYSTEM_CLOCK (time2,clock_rate,clock_max)
PRINT *, ' '
WRITE(*,'(A,F10.5,A,I10)') ' time for interpolation = ', REAL(time2-time1,REAL64)/REAL(clock_rate,REAL64),&
' points = ', testing_sample
PRINT *, ' '
WRITE(*,'(A,F10.5)') ' time for base = ', REAL(count1,REAL64)/REAL(clock_rate,REAL64)
PRINT *, ' '
WRITE(*,'(A,F10.5)') ' time for x_inv = ', REAL(count2,REAL64)/REAL(clock_rate,REAL64)
PRINT *, ' '
WRITE(*,'(A,F10.5)') ' time for repeated = ', REAL(count3,REAL64)/REAL(clock_rate,REAL64)
!PRINT *, ' '
!WRITE(*,'(A,F20.15)') ' check = ', MAXVAL(ABS(interp2-interp1))
DEALLOCATE(grid_index,grid_index_new,J_index,coeff,J_coeff,adj_list,to_do,to_do_new,to_add_ind,tree_vector)
CONTAINS
RECURSIVE SUBROUTINE INSERT(tree,number)
TYPE(NODE), POINTER :: tree
INTEGER(INT32), INTENT(IN) :: number
IF (.NOT. ASSOCIATED(tree)) THEN
ALLOCATE(tree)
tree%value = number
NULLIFY(tree%left)
NULLIFY(tree%right)
ELSEIF (number .LT. tree%value) THEN
CALL INSERT (tree%left,number)
ELSEIF (number .GT. tree%value) THEN
CALL INSERT(tree%right,number)
ENDIF
END SUBROUTINE INSERT
RECURSIVE SUBROUTINE PRINT_TREE(tree)
TYPE (NODE), POINTER :: tree
IF (ASSOCIATED(tree)) THEN
CALL PRINT_TREE(tree%left)
iii = iii+1
tree_vector(iii) = tree%value
CALL PRINT_TREE (tree%right)
END IF
END SUBROUTINE PRINT_TREE
END PROGRAM
I am compiling with -O3 but otherwise no flags. On my computer the time for "repeated" (which is where I am using the binary tree) is 18.3 seconds, whereas if I use the alternative method that is commented out in this version (with MINVAL) it only takes 3.6 seconds.
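One direction sometimes worth trying (shown only as a sketch, not as a measured fix) is to replace the recursive descent with an iterative one, which removes the call overhead while keeping the duplicate-skipping behaviour. The sketch assumes the NODE type and INT32 kind from MOD_PARAMETERS above and would sit in the CONTAINS section of the main program; whether it actually closes the gap to the MINVAL scan would have to be timed.
SUBROUTINE INSERT_ITER(tree,number)
   ! Iterative descent instead of recursive calls; duplicates are still skipped.
   TYPE(NODE), POINTER :: tree
   TYPE(NODE), POINTER :: cur
   INTEGER(INT32), INTENT(IN) :: number
   IF (.NOT. ASSOCIATED(tree)) THEN
      ALLOCATE(tree)
      tree%value = number
      NULLIFY(tree%left, tree%right)
      RETURN
   ENDIF
   cur => tree
   DO
      IF (number .LT. cur%value) THEN
         IF (.NOT. ASSOCIATED(cur%left)) THEN
            ALLOCATE(cur%left)
            cur%left%value = number
            NULLIFY(cur%left%left, cur%left%right)
            RETURN
         ENDIF
         cur => cur%left
      ELSEIF (number .GT. cur%value) THEN
         IF (.NOT. ASSOCIATED(cur%right)) THEN
            ALLOCATE(cur%right)
            cur%right%value = number
            NULLIFY(cur%right%left, cur%right%right)
            RETURN
         ENDIF
         cur => cur%right
      ELSE
         RETURN                        ! value already in the tree
      ENDIF
   ENDDO
END SUBROUTINE INSERT_ITER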

Issues with setting random seed [duplicate]

This question already has an answer here: Random numbers keep coming out the same, despite random seed being used (1 answer). Closed last year.
I am attempting to write a Monte Carlo algorithm to simulate interaction between agents in a population. The algorithm needs to draw two random numbers at each iteration (say, 10^9 iterations).
My issue is that every time I change the seed (to obtain different realizations), the RNG gives me the same output (the same Monte Carlo events). I have tried different ways of implementing it, to no avail.
I am completely new to Fortran and am porting this code from MATLAB. Am I doing something wrong in the way I'm implementing this code?
Below is what I tried:
program Gillespie
implicit none
integer*8, parameter :: n_max = 10.0**8 ! max. number of iterations
integer*8 :: t_ext, I_init, S_init, jump, S_now, I_now, i, u
real*8 :: t, N, a0, tau, st, r1, r2
real, parameter :: beta = 1000
real, parameter :: gammma = 99.98
real, parameter :: mu = 0.02
real, parameter :: R0 = beta/(gammma+mu)
integer :: seed = 11
real, dimension(n_max) :: S_new ! susceptible pop. array
real, dimension(n_max) :: I_new ! infected pop. array
real, dimension(n_max) :: t_new ! time array
real, dimension(5) :: events ! events array
open(unit=3, file='SIS_output.dat')
t = 0 ! initial time
N = 40 ! initial population size
jump = 1 ! time increment (save in uniform increments)
u = 2
t_ext = 0 ! extiction time
I_init = 2 ! initial infected pop.
S_init = N-I_init ! initial susceptible pop.
S_now = S_init
I_now = I_init
S_new(1) = S_init ! initialize susceptibles array
I_new(1) = I_init ! initialize infected array
t_new(1) = t ! initialize time array
write(3,*) t_new(1), S_new(1), I_new(1) ! write i.c. to array
call random_seed(seed)
do i=2, n_max
call random_number(r1)
call random_number(r2)
events(1) = mu*N ! Birth(S)
events(2) = mu*S_now ! Death(S)
events(3) = mu*I_now ! Death(I)
events(4) = (beta*S_now*I_now)/N ! Infection
events(5) = gammma*I_now ! Recovery
a0 = events(1)+events(2)+events(3)+events(4)+events(5)
tau = LOG(1/r1)*(1/a0) ! time increment
t = t + tau ! update time
st = r2*a0 ! stochastic time???
! update the populations
if (st .le. events(1)) then
S_now = S_now + 1
else if (st .gt. events(1) .AND. st .le.
#(events(1) + events(2))) then
S_now = S_now - 1
else if (st .gt. (events(1) + events(2)) .AND. st .le.
#(events(1) + events(2) + events(3))) then
I_now = I_now - 1
else if (st .gt. (events(1) + events(2) + events(3)) .AND.
#st .le. (events(1) + events(2) + events(3) + events(4))) then
S_now = S_now - 1
I_now = I_now + 1
else
S_now = S_now + 1
I_now = I_now - 1
end if
! save time in uniform increments
if(t .ge. jump) then
t_new(u) = floor(t)
S_new(u) = S_now
I_new(u) = I_now
write(3,*) t_new(u), S_new(u), I_new(u)
jump = jump+1
u = u+1
end if
! write(3,*) t_new(i), S_new(i), I_new(i)
!N = S_now + I_now ! update population post event
if(I_now .le. 0) then ! if extinct, terminate
print *, "extinct"
goto 2
end if
end do
2 end program Gillespie
I appreciate all input. Thanks.
Your call
call random_seed(seed)
is strange. I thought it should not be allowed without a keyword argument, but with a single positional argument it matches the SIZE dummy, so it merely inquires for the size of the random seed array: it writes that size into seed rather than setting the seed.
For a proper way of initializing the seed, see the example in
https://gcc.gnu.org/onlinedocs/gfortran/RANDOM_005fSEED.html
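A minimal sketch of seeding with an explicit put= array (the linked gfortran page shows a more careful version that mixes in the clock or /dev/urandom); changing the base value changes the stream:
program seed_demo
  implicit none
  integer :: n, i, base
  integer, allocatable :: seed_array(:)
  real :: r
  base = 11
  call random_seed(size=n)                  ! how many integers the generator needs
  allocate(seed_array(n))
  seed_array = base + 37*[(i, i = 1, n)]    ! fill deterministically from the base seed
  call random_seed(put=seed_array)
  call random_number(r)
  print *, r
end program seed_demo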

MPI_Gather 2-d arrays from all processors into bigger 2-d array on root with FORTRAN

Here's what I am trying to do with my code.
I have to calculate Fw and Fi on a 2-d grid of size nx by nz. I split the k loop between the processors so that each processor calculates an nx by (nz/p) chunk, where p is the number of processors being used. After each processor is done I want to gather all the chunks, that is, each nx by nz/p piece of Fw and Fi, into the full Fw and Fi on the root. Eventually I want to use allgather, i.e. gather all of the calculated Fw and Fi onto all processors.
I have attached the code below.
I am not sure that I am specifying the sendcount and recvcount correctly, or why my code is deadlocking. Any help is appreciated. Thanks!
PROGRAM gridtestpar
IMPLICIT NONE
INTEGER :: nx, nz, i, k, t
INTEGER :: order, mx, mz
INTEGER :: count
INTEGER :: ierror, comm, p, rank, npr, s, f, np(2)
REAL(KIND = 8) :: dx, dz, startx, startz, finishx, finishz
REAL(KIND = 8) :: dt
REAL(KIND = 8) :: cx, cz
REAL(KIND = 8) :: cbx, cbz
REAL(KIND = 8), ALLOCATABLE ::X(:), Z(:), Fw(:,:), Fi(:,:)
REAL(KIND = 8), ALLOCATABLE :: Fn(:,:), Fnp1(:,:)
include 'mpif.h'
!----------------------------------------------------------
!Parameters that can be changed
!---------------------------------------------------------
!Time step
dt = 0.000000001d0
!Number of points in x and z direction(i.e. streamwise and
!spanwise) directions respectively
nx = (400*5)
nz = (400*5)
!First and last grid point locations in x and z directions
startx = 0.d0
finishx = 60.d0*5.d0
startz = 0.d0
finishz = 60.d0*5.d0
!Distance between grid points
dx = (finishx-startx)/REAL(nx-1)
dz = (finishz-startz)/REAL(nz-1)
!Allocate
ALLOCATE(X(nx), Z(nz))
ALLOCATE(Fw(nx,nz), Fi(nx,nz))
ALLOCATE(Fn(nx,nz), Fnp1(nx,nz))
! Make Grid
!--------------------------------------------------------------
DO i = 1, nx
X(i) = (i-1)*dx
END DO
DO k = 1, nz
Z(k) = (k-1)*dx
END DO
CALL MPI_INIT(ierror)
comm = MPI_COMM_WORLD
!Get rank
CALL MPI_COMM_RANK(comm, rank, ierror)
!Get number of processors
CALL MPI_COMM_SIZE(comm, p, ierror)
!split job between all processors
npr = INT((nz-1)/p)
DO k = rank*npr+1, (rank+1)*npr
DO i = 1, nx
cx = 50.d0
Fi(i,k) = 0.d0
DO mx = 1,30
cz = 0.d0;
DO mz = 1,13*5
Fi(i,k) = Fi(i,k) + EXP(-0.9d0*((X(i)-cx)**2+(Z(k)-cz)**2))
cz = cz + 5.d0
END DO
cx = cx + 5.d0
END DO
cbz = 0.d0
cbx = 30.d0
DO mx = 1,4*5
Fw(i,k) = Fw(i,k) + 0.05d0 + 7.d0*EXP(-0.1*((X(i)-cbx)**2 &
+ (Z(k)-cbz)**2)) + 0.1d0*Fi(i,k)
cbz = cbz + 20.d0
END DO
END DO
END DO
s = rank*npr+1
f = (rank+1)*npr
np(1) = nx
np(2) = npr
CALL MPI_GATHER(Fw(:,s:f), np , MPI_DOUBLE_PRECISION, &
Fw,np , MPI_DOUBLE_PRECISION, 0, comm, ierror)
CALL MPI_GATHER(Fi(:,s:f), np , MPI_DOUBLE_PRECISION, &
Fi,np , MPI_DOUBLE_PRECISION, 0, comm, ierror)
Fn(:,:) = Fw(:,:) - Fi(:,:)
Fnp1 = Fn
WRITE(*,*) "I'm here"
IF(rank == 0) THEN
!Output initial condition
!----------------------------------------------------------------
OPEN(unit = 11, file = "Fiinitial.dat")
WRITE(11,*) 'Variables = "X", "Z", "Fi"'
WRITE(11,*) 'Zone I = ', nx, 'J = ', nz, 'F = POINT'
DO k = 1, nz
DO i = 1, nx
WRITE(11,*) X(i), Z(k), Fi(i,k)
END DO
END DO
WRITE(11,*) 'Zone I = ', nx, 'J = ', nz, 'F = POINT'
DO k = 1, nz
DO i = 1, nx
WRITE(11,*) X(i), Z(k), Fw(i,k)
END DO
END DO
CLOSE(11)
END IF
CALL MPI_FINALIZE(ierror)
END PROGRAM gridtestpar
You call the mpi_gather() subroutine incorrectly. You have to pass it the total number of elements to be communicated as one integer for the send buffer and as another integer for the receive buffer. Instead, you passed for each of them an array of two integers containing the number of elements along each dimension. Just multiply the numbers in your array and pass the result as a single integer:
program gridtestpar
use mpi
implicit none
integer, parameter :: dp = kind(1.0d0)
integer :: nx, nz
integer :: ierror, comm, p, rank, npr, s, f, np(2)
real(dp), allocatable :: Fw(:,:), Fi(:,:)
nx = (400*5)
nz = (400*5)
allocate(Fw(nx,nz))
allocate(Fi(nx,nz))
Fw(:,:) = 0.0_dp
Fi(:,:) = 0.0_dp
call mpi_init(ierror)
comm = MPI_COMM_WORLD
call mpi_comm_rank(comm, rank, ierror)
call mpi_comm_size(comm, p, ierror)
npr = nz / p   ! z-planes per process (this assignment was missing in the snippet; assumes p divides nz)
s = rank * npr + 1
f = (rank + 1) * npr
call mpi_gather(Fw(:,s:f), nx * (f - s + 1), MPI_DOUBLE_PRECISION, &
Fw, nx * npr, MPI_DOUBLE_PRECISION, 0, comm, ierror)
call mpi_gather(Fi(:,s:f), nx * (f - s + 1), MPI_DOUBLE_PRECISION, &
Fi, nx * npr, MPI_DOUBLE_PRECISION, 0, comm, ierror)
write(*,*) "I'm here"
call mpi_finalize(ierror)
end program gridtestpar
Maybe a few additional comments:
Please always post the shortest possible self-contained code that demonstrates the issue. Nobody likes to spend time reading and trying to understand irrelevant code snippets. Leave out everything that is not essential for reproducing your problem; maybe this way you will even find the solution yourself.
Do not use kind = 8 when specifying precision. See the last part of this answer and some of the comments to it for alternatives (a short sketch follows these notes).
You should use the mpi module instead of the include file.
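As a sketch of the precision point (any of these kinds is double precision on common compilers):
program kinds_demo
  use, intrinsic :: iso_fortran_env, only : real64
  implicit none
  integer, parameter :: dp  = kind(1.0d0)                 ! as used in the answer above
  integer, parameter :: dp2 = selected_real_kind(15, 307) ! at least 15 significant digits
  real(dp)     :: a
  real(dp2)    :: b
  real(real64) :: c
  a = 1.0_dp; b = 1.0_dp2; c = 1.0_real64
  print *, a, b, c
end program kinds_demo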
