blob: 0d787f90237c57d57ffabdb7c9056f4ec11c0eb7 [file] [log] [blame]
c---------------------------------------------------------------------
c---------------------------------------------------------------------
subroutine z_solve
c---------------------------------------------------------------------
c---------------------------------------------------------------------
c---------------------------------------------------------------------
c this function performs the solution of the approximate factorization
c step in the z-direction for all five matrix components
c simultaneously. The Thomas algorithm is employed to solve the
c systems for the z-lines. Boundary conditions are non-periodic
c---------------------------------------------------------------------
include 'header.h'
include 'mpinpb.h'
integer i, j, k, stage, ip, jp, n, isize, jsize, kend, k1, k2,
> buffer_size, c, m, p, kstart, error,
> requests(2), statuses(MPI_STATUS_SIZE, 2)
double precision r1, r2, d, e, s(5), sm1, sm2,
> fac1, fac2
c---------------------------------------------------------------------
c now do a sweep on a layer-by-layer basis, i.e. sweeping through cells
c on this node in the direction of increasing i for the forward sweep,
c and after that reversing the direction for the backsubstitution
c---------------------------------------------------------------------
if (timeron) call timer_start(t_zsolve)
c---------------------------------------------------------------------
c FORWARD ELIMINATION
c---------------------------------------------------------------------
do stage = 1, ncells
c = slice(3,stage)
kstart = 0
kend = cell_size(3,c)-1
isize = cell_size(1,c)
jsize = cell_size(2,c)
ip = cell_coord(1,c)-1
jp = cell_coord(2,c)-1
buffer_size = (isize-start(1,c)-end(1,c)) *
> (jsize-start(2,c)-end(2,c))
if (stage .ne. 1) then
c---------------------------------------------------------------------
c if this is not the first processor in this row of cells,
c receive data from predecessor containing the right hand
c sides and the upper diagonal elements of the previous two rows
c---------------------------------------------------------------------
if (timeron) call timer_start(t_zcomm)
call mpi_irecv(in_buffer, 22*buffer_size,
> dp_type, predecessor(3),
> DEFAULT_TAG, comm_solve,
> requests(1), error)
if (timeron) call timer_stop(t_zcomm)
c---------------------------------------------------------------------
c communication has already been started.
c compute the left hand side while waiting for the msg
c---------------------------------------------------------------------
call lhsz(c)
c---------------------------------------------------------------------
c wait for pending communication to complete
c---------------------------------------------------------------------
if (timeron) call timer_start(t_zcomm)
call mpi_waitall(2, requests, statuses, error)
if (timeron) call timer_stop(t_zcomm)
c---------------------------------------------------------------------
c unpack the buffer
c---------------------------------------------------------------------
k = kstart
k1 = kstart + 1
n = 0
c---------------------------------------------------------------------
c create a running pointer
c---------------------------------------------------------------------
p = 0
do j = start(2,c), jsize-end(2,c)-1
do i = start(1,c), isize-end(1,c)-1
lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
> in_buffer(p+1) * lhs(i,j,k,n+1,c)
lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
> in_buffer(p+2) * lhs(i,j,k,n+1,c)
do m = 1, 3
rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
> in_buffer(p+2+m) * lhs(i,j,k,n+1,c)
end do
d = in_buffer(p+6)
e = in_buffer(p+7)
do m = 1, 3
s(m) = in_buffer(p+7+m)
end do
r1 = lhs(i,j,k,n+2,c)
lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
do m = 1, 3
rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1
end do
r2 = lhs(i,j,k1,n+1,c)
lhs(i,j,k1,n+2,c) = lhs(i,j,k1,n+2,c) - d * r2
lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - e * r2
do m = 1, 3
rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - s(m) * r2
end do
p = p + 10
end do
end do
do m = 4, 5
n = (m-3)*5
do j = start(2,c), jsize-end(2,c)-1
do i = start(1,c), isize-end(1,c)-1
lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
> in_buffer(p+1) * lhs(i,j,k,n+1,c)
lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
> in_buffer(p+2) * lhs(i,j,k,n+1,c)
rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
> in_buffer(p+3) * lhs(i,j,k,n+1,c)
d = in_buffer(p+4)
e = in_buffer(p+5)
s(m) = in_buffer(p+6)
r1 = lhs(i,j,k,n+2,c)
lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1
r2 = lhs(i,j,k1,n+1,c)
lhs(i,j,k1,n+2,c) = lhs(i,j,k1,n+2,c) - d * r2
lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - e * r2
rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - s(m) * r2
p = p + 6
end do
end do
end do
else
c---------------------------------------------------------------------
c if this IS the first cell, we still compute the lhs
c---------------------------------------------------------------------
call lhsz(c)
endif
c---------------------------------------------------------------------
c perform the Thomas algorithm; first, FORWARD ELIMINATION
c---------------------------------------------------------------------
n = 0
do k = kstart, kend-2
do j = start(2,c), jsize-end(2,c)-1
do i = start(1,c), isize-end(1,c)-1
k1 = k + 1
k2 = k + 2
fac1 = 1.d0/lhs(i,j,k,n+3,c)
lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
do m = 1, 3
rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
end do
lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) -
> lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c)
lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) -
> lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c)
do m = 1, 3
rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
> lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c)
end do
lhs(i,j,k2,n+2,c) = lhs(i,j,k2,n+2,c) -
> lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+4,c)
lhs(i,j,k2,n+3,c) = lhs(i,j,k2,n+3,c) -
> lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+5,c)
do m = 1, 3
rhs(i,j,k2,m,c) = rhs(i,j,k2,m,c) -
> lhs(i,j,k2,n+1,c)*rhs(i,j,k,m,c)
end do
end do
end do
end do
c---------------------------------------------------------------------
c The last two rows in this grid block are a bit different,
c since they do not have two more rows available for the
c elimination of off-diagonal entries
c---------------------------------------------------------------------
k = kend - 1
k1 = kend
do j = start(2,c), jsize-end(2,c)-1
do i = start(1,c), isize-end(1,c)-1
fac1 = 1.d0/lhs(i,j,k,n+3,c)
lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
do m = 1, 3
rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
end do
lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) -
> lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c)
lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) -
> lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c)
do m = 1, 3
rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
> lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c)
end do
c---------------------------------------------------------------------
c scale the last row immediately (some of this is
c overkill in case this is the last cell)
c---------------------------------------------------------------------
fac2 = 1.d0/lhs(i,j,k1,n+3,c)
lhs(i,j,k1,n+4,c) = fac2*lhs(i,j,k1,n+4,c)
lhs(i,j,k1,n+5,c) = fac2*lhs(i,j,k1,n+5,c)
do m = 1, 3
rhs(i,j,k1,m,c) = fac2*rhs(i,j,k1,m,c)
end do
end do
end do
c---------------------------------------------------------------------
c do the u+c and the u-c factors
c---------------------------------------------------------------------
do m = 4, 5
n = (m-3)*5
do k = kstart, kend-2
do j = start(2,c), jsize-end(2,c)-1
do i = start(1,c), isize-end(1,c)-1
k1 = k + 1
k2 = k + 2
fac1 = 1.d0/lhs(i,j,k,n+3,c)
lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) -
> lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c)
lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) -
> lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c)
rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
> lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c)
lhs(i,j,k2,n+2,c) = lhs(i,j,k2,n+2,c) -
> lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+4,c)
lhs(i,j,k2,n+3,c) = lhs(i,j,k2,n+3,c) -
> lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+5,c)
rhs(i,j,k2,m,c) = rhs(i,j,k2,m,c) -
> lhs(i,j,k2,n+1,c)*rhs(i,j,k,m,c)
end do
end do
end do
c---------------------------------------------------------------------
c And again the last two rows separately
c---------------------------------------------------------------------
k = kend - 1
k1 = kend
do j = start(2,c), jsize-end(2,c)-1
do i = start(1,c), isize-end(1,c)-1
fac1 = 1.d0/lhs(i,j,k,n+3,c)
lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) -
> lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c)
lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) -
> lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c)
rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
> lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c)
c---------------------------------------------------------------------
c Scale the last row immediately (some of this is overkill
c if this is the last cell)
c---------------------------------------------------------------------
fac2 = 1.d0/lhs(i,j,k1,n+3,c)
lhs(i,j,k1,n+4,c) = fac2*lhs(i,j,k1,n+4,c)
lhs(i,j,k1,n+5,c) = fac2*lhs(i,j,k1,n+5,c)
rhs(i,j,k1,m,c) = fac2*rhs(i,j,k1,m,c)
end do
end do
end do
c---------------------------------------------------------------------
c send information to the next processor, except when this
c is the last grid block,
c---------------------------------------------------------------------
if (stage .ne. ncells) then
c---------------------------------------------------------------------
c create a running pointer for the send buffer
c---------------------------------------------------------------------
p = 0
n = 0
do j = start(2,c), jsize-end(2,c)-1
do i = start(1,c), isize-end(1,c)-1
do k = kend-1, kend
out_buffer(p+1) = lhs(i,j,k,n+4,c)
out_buffer(p+2) = lhs(i,j,k,n+5,c)
do m = 1, 3
out_buffer(p+2+m) = rhs(i,j,k,m,c)
end do
p = p+5
end do
end do
end do
do m = 4, 5
n = (m-3)*5
do j = start(2,c), jsize-end(2,c)-1
do i = start(1,c), isize-end(1,c)-1
do k = kend-1, kend
out_buffer(p+1) = lhs(i,j,k,n+4,c)
out_buffer(p+2) = lhs(i,j,k,n+5,c)
out_buffer(p+3) = rhs(i,j,k,m,c)
p = p + 3
end do
end do
end do
end do
if (timeron) call timer_start(t_zcomm)
call mpi_isend(out_buffer, 22*buffer_size,
> dp_type, successor(3),
> DEFAULT_TAG, comm_solve,
> requests(2), error)
if (timeron) call timer_stop(t_zcomm)
endif
end do
c---------------------------------------------------------------------
c now go in the reverse direction
c---------------------------------------------------------------------
c---------------------------------------------------------------------
c BACKSUBSTITUTION
c---------------------------------------------------------------------
do stage = ncells, 1, -1
c = slice(3,stage)
kstart = 0
kend = cell_size(3,c)-1
isize = cell_size(1,c)
jsize = cell_size(2,c)
ip = cell_coord(1,c)-1
jp = cell_coord(2,c)-1
buffer_size = (isize-start(1,c)-end(1,c)) *
> (jsize-start(2,c)-end(2,c))
if (stage .ne. ncells) then
c---------------------------------------------------------------------
c if this is not the starting cell in this row of cells,
c wait for a message to be received, containing the
c solution of the previous two stations
c---------------------------------------------------------------------
if (timeron) call timer_start(t_zcomm)
call mpi_irecv(in_buffer, 10*buffer_size,
> dp_type, successor(3),
> DEFAULT_TAG, comm_solve,
> requests(1), error)
if (timeron) call timer_stop(t_zcomm)
c---------------------------------------------------------------------
c communication has already been started
c while waiting, do the block-diagonal inversion for the
c cell that was just finished
c---------------------------------------------------------------------
call tzetar(slice(3,stage+1))
c---------------------------------------------------------------------
c wait for pending communication to complete
c---------------------------------------------------------------------
if (timeron) call timer_start(t_zcomm)
call mpi_waitall(2, requests, statuses, error)
if (timeron) call timer_stop(t_zcomm)
c---------------------------------------------------------------------
c unpack the buffer for the first three factors
c---------------------------------------------------------------------
n = 0
p = 0
k = kend
k1 = k - 1
do m = 1, 3
do j = start(2,c), jsize-end(2,c)-1
do i = start(1,c), isize-end(1,c)-1
sm1 = in_buffer(p+1)
sm2 = in_buffer(p+2)
rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
> lhs(i,j,k,n+4,c)*sm1 -
> lhs(i,j,k,n+5,c)*sm2
rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
> lhs(i,j,k1,n+4,c) * rhs(i,j,k,m,c) -
> lhs(i,j,k1,n+5,c) * sm1
p = p + 2
end do
end do
end do
c---------------------------------------------------------------------
c now unpack the buffer for the remaining two factors
c---------------------------------------------------------------------
do m = 4, 5
n = (m-3)*5
do j = start(2,c), jsize-end(2,c)-1
do i = start(1,c), isize-end(1,c)-1
sm1 = in_buffer(p+1)
sm2 = in_buffer(p+2)
rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
> lhs(i,j,k,n+4,c)*sm1 -
> lhs(i,j,k,n+5,c)*sm2
rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
> lhs(i,j,k1,n+4,c) * rhs(i,j,k,m,c) -
> lhs(i,j,k1,n+5,c) * sm1
p = p + 2
end do
end do
end do
else
c---------------------------------------------------------------------
c now we know this is the first grid block on the back sweep,
c so we don't need a message to start the substitution.
c---------------------------------------------------------------------
k = kend - 1
k1 = kend
n = 0
do m = 1, 3
do j = start(2,c), jsize-end(2,c)-1
do i = start(1,c), isize-end(1,c)-1
rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
> lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c)
end do
end do
end do
do m = 4, 5
n = (m-3)*5
do j = start(2,c), jsize-end(2,c)-1
do i = start(1,c), isize-end(1,c)-1
rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
> lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c)
end do
end do
end do
endif
c---------------------------------------------------------------------
c Whether or not this is the last processor, we always have
c to complete the back-substitution
c---------------------------------------------------------------------
c---------------------------------------------------------------------
c The first three factors
c---------------------------------------------------------------------
n = 0
do m = 1, 3
do k = kend-2, kstart, -1
do j = start(2,c), jsize-end(2,c)-1
do i = start(1,c), isize-end(1,c)-1
k1 = k + 1
k2 = k + 2
rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
> lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c) -
> lhs(i,j,k,n+5,c)*rhs(i,j,k2,m,c)
end do
end do
end do
end do
c---------------------------------------------------------------------
c And the remaining two
c---------------------------------------------------------------------
do m = 4, 5
n = (m-3)*5
do k = kend-2, kstart, -1
do j = start(2,c), jsize-end(2,c)-1
do i = start(1,c), isize-end(1,c)-1
k1 = k + 1
k2 = k + 2
rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
> lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c) -
> lhs(i,j,k,n+5,c)*rhs(i,j,k2,m,c)
end do
end do
end do
end do
c---------------------------------------------------------------------
c send on information to the previous processor, if needed
c---------------------------------------------------------------------
if (stage .ne. 1) then
k = kstart
k1 = kstart + 1
p = 0
do m = 1, 5
do j = start(2,c), jsize-end(2,c)-1
do i = start(1,c), isize-end(1,c)-1
out_buffer(p+1) = rhs(i,j,k,m,c)
out_buffer(p+2) = rhs(i,j,k1,m,c)
p = p + 2
end do
end do
end do
if (timeron) call timer_start(t_zcomm)
call mpi_isend(out_buffer, 10*buffer_size,
> dp_type, predecessor(3),
> DEFAULT_TAG, comm_solve,
> requests(2), error)
if (timeron) call timer_stop(t_zcomm)
endif
c---------------------------------------------------------------------
c If this was the last stage, do the block-diagonal inversion
c---------------------------------------------------------------------
if (stage .eq. 1) call tzetar(c)
end do
if (timeron) call timer_stop(t_zsolve)
return
end