src/npb/disk-image/npb/npb-hooks/NPB3.3.1/NPB3.3-MPI/FT/ft.f - public/gem5-resources - Git at Google

 !-------------------------------------------------------------------------!
 !                                                                         !
 !        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3         !
 !                                                                         !
 !                                   F T                                   !
 !                                                                         !
 !-------------------------------------------------------------------------!
 !                                                                         !
 !    This benchmark is part of the NAS Parallel Benchmark 3.3 suite.      !
 !    It is described in NAS Technical Reports 95-020 and 02-007           !
 !                                                                         !
 !    Permission to use, copy, distribute and modify this software         !
 !    for any purpose with or without fee is hereby granted.  We           !
 !    request, however, that all derived work reference the NAS            !
 !    Parallel Benchmarks 3.3. This software is provided "as is"           !
 !    without express or implied warranty.                                 !
 !                                                                         !
 !    Information on NPB 3.3, including the technical report, the          !
 !    original specifications, source code, results and information        !
 !    on how to submit new results, is available at:                       !
 !                                                                         !
 !           http://www.nas.nasa.gov/Software/NPB/                         !
 !                                                                         !
 !    Send comments or suggestions to  npb@nas.nasa.gov                    !
 !                                                                         !
 !          NAS Parallel Benchmarks Group                                  !
 !          NASA Ames Research Center                                      !
 !          Mail Stop: T27A-1                                              !
 !          Moffett Field, CA   94035-1000                                 !
 !                                                                         !
 !          E-mail:  npb@nas.nasa.gov                                      !
 !          Fax:     (650) 604-3957                                        !
 !                                                                         !
 !-------------------------------------------------------------------------!

 !TO REDUCE THE AMOUNT OF MEMORY REQUIRED BY THE BENCHMARK WE NO LONGER
 !STORE THE ENTIRE TIME EVOLUTION ARRAY "EX" FOR ALL TIME STEPS, BUT
 !JUST FOR THE FIRST. ALSO, IT IS STORED ONLY FOR THE PART OF THE GRID
 !FOR WHICH THE CALLING PROCESSOR IS RESPONSIBLE, SO THAT THE MEMORY
 !USAGE BECOMES SCALABLE. THIS NEW ARRAY IS CALLED "TWIDDLE" (SEE
 !NPB3.0-SER)

 !TO AVOID PROBLEMS WITH VERY LARGE ARRAY SIZES THAT ARE COMPUTED BY
 !MULTIPLYING GRID DIMENSIONS (CAUSING INTEGER OVERFLOW IN THE VARIABLE
 !NTOTAL) AND SUBSEQUENTLY DIVIDING BY THE NUMBER OF PROCESSORS, WE
 !COMPUTE THE SIZE OF ARRAY PARTITIONS MORE CONSERVATIVELY AS
 !((NX*NY)/NP)*NZ, WHERE NX, NY, AND NZ ARE GRID DIMENSIONS AND NP IS
 !THE NUMBER OF PROCESSORS, THE RESULT IS STORED IN "NTDIVNP". FOR THE
 !PERFORMANCE CALCULATION WE STORE THE TOTAL NUMBER OF GRID POINTS IN A
 !FLOATING POINT NUMBER "NTOTAL_F" INSTEAD OF AN INTEGER.
 !THIS FIX WILL FAIL IF THE NUMBER OF PROCESSORS IS SMALL.

 !UGLY HACK OF SUBROUTINE IPOW46: FOR VERY LARGE GRIDS THE SINGLE EXPONENT
 !FROM NPB2.3 MAY NOT FIT IN A 32-BIT INTEGER. HOWEVER, WE KNOW THAT THE
 !"EXPONENT" ARGUMENT OF THIS ROUTINE CAN ALWAYS BE FACTORED INTO A TERM
 !DIVISIBLE BY NX (EXP_1) AND ANOTHER TERM (EXP_2). NX IS USUALLY A POWER
 !OF TWO, SO WE CAN KEEP HALVING IT UNTIL THE PRODUCT OF EXP_1
 !AND EXP_2 IS SMALL ENOUGH (NAMELY EXP_2 ITSELF). THIS UPDATED VERSION
 !OF IPWO46, WHICH NOW TAKES THE TWO FACTORS OF "EXPONENT" AS SEPARATE
 !ARGUMENTS, MAY BREAK DOWN IF EXP_1 DOES NOT CONTAIN A LARGE POWER OF TWO.

 c---------------------------------------------------------------------
 c
 c Authors: D. Bailey
 c          W. Saphir
 c          R. F. Van der Wijngaart
 c
 c---------------------------------------------------------------------

 c---------------------------------------------------------------------

 c---------------------------------------------------------------------
 c FT benchmark
 c---------------------------------------------------------------------

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       program ft

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none

       include 'mpif.h'
       include 'global.h'
       integer i, ierr

 c---------------------------------------------------------------------
 c u0, u1, u2 are the main arrays in the problem.
 c Depending on the decomposition, these arrays will have different
 c dimensions. To accomodate all possibilities, we allocate them as
 c one-dimensional arrays and pass them to subroutines for different
 c views
 c  - u0 contains the initial (transformed) initial condition
 c  - u1 and u2 are working arrays
 c---------------------------------------------------------------------

       double complex   u0(ntdivnp),
      >                 u1(ntdivnp),
      >                 u2(ntdivnp)
       double precision twiddle(ntdivnp)
 c---------------------------------------------------------------------
 c Large arrays are in common so that they are allocated on the
 c heap rather than the stack. This common block is not
 c referenced directly anywhere else. Padding is to avoid accidental
 c cache problems, since all array sizes are powers of two.
 c---------------------------------------------------------------------

       double complex pad1(3), pad2(3), pad3(3)
       common /bigarrays/ u0, pad1, u1, pad2, u2, pad3, twiddle

       integer iter
       double precision total_time, mflops
       logical verified
       character class

       call MPI_Init(ierr)

 c---------------------------------------------------------------------
 c Run the entire problem once to make sure all data is touched.
 c This reduces variable startup costs, which is important for such a
 c short benchmark. The other NPB 2 implementations are similar.
 c---------------------------------------------------------------------
       do i = 1, t_max
          call timer_clear(i)
       end do

       call timer_start(T_init)
       call setup()
       call compute_indexmap(twiddle, dims(1,3), dims(2,3), dims(3,3))
       call compute_initial_conditions(u1, dims(1,1), dims(2,1),
      >                                dims(3,1))
       call fft_init (dims(1,1))
       call fft(1, u1, u0)
       call timer_stop(T_init)
       if (me .eq. 0) then
          print *,'Initialization time =',timer_read(T_init)
       endif

 c---------------------------------------------------------------------
 c Start over from the beginning. Note that all operations must
 c be timed, in contrast to other benchmarks.
 c---------------------------------------------------------------------
       do i = 1, t_max
          call timer_clear(i)
       end do
       call MPI_Barrier(MPI_COMM_WORLD, ierr)

       call timer_start(T_total)
       if (timers_enabled) call timer_start(T_setup)

       call compute_indexmap(twiddle, dims(1,3), dims(2,3), dims(3,3))
       call compute_initial_conditions(u1, dims(1,1), dims(2,1),
      >                                dims(3,1))
       call fft_init (dims(1,1))

 !      if (timers_enabled) call synchup()
       if (timers_enabled) call timer_stop(T_setup)

       if (timers_enabled) call timer_start(T_fft)
       call fft(1, u1, u0)
       if (timers_enabled) call timer_stop(T_fft)

       do iter = 1, niter
          if (timers_enabled) call timer_start(T_evolve)
          call evolve(u0, u1, twiddle, dims(1,1), dims(2,1), dims(3,1))
          if (timers_enabled) call timer_stop(T_evolve)
          if (timers_enabled) call timer_start(T_fft)
          call fft(-1, u1, u2)
          if (timers_enabled) call timer_stop(T_fft)
 !         if (timers_enabled) call synchup()
          if (timers_enabled) call timer_start(T_checksum)
          call checksum(iter, u2, dims(1,1), dims(2,1), dims(3,1))
          if (timers_enabled) call timer_stop(T_checksum)
       end do

       call verify(nx, ny, nz, niter, verified, class)
       call timer_stop(t_total)
       if (np .ne. np_min) verified = .false.
       total_time = timer_read(t_total)

       if( total_time .ne. 0. ) then
          mflops = 1.0d-6*ntotal_f *
      >             (14.8157+7.19641*log(ntotal_f)
      >          +  (5.23518+7.21113*log(ntotal_f))*niter)
      >                 /total_time
       else
          mflops = 0.0
       endif
       if (me .eq. 0) then
          call print_results('FT', class, nx, ny, nz, niter, np_min, np,
      >     total_time, mflops, '          floating point', verified,
      >     npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7)
       endif
       if (timers_enabled) call print_timers()
       call MPI_Finalize(ierr)
       end

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine evolve(u0, u1, twiddle, d1, d2, d3)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

 c---------------------------------------------------------------------
 c evolve u0 -> u1 (t time steps) in fourier space
 c---------------------------------------------------------------------

       implicit none
       include 'global.h'
       integer d1, d2, d3
       double precision exi
       double complex u0(d1,d2,d3)
       double complex u1(d1,d2,d3)
       double precision twiddle(d1,d2,d3)
       integer i, j, k

       do k = 1, d3
          do j = 1, d2
             do i = 1, d1
                u0(i,j,k) = u0(i,j,k)*(twiddle(i,j,k))
                u1(i,j,k) = u0(i,j,k)
             end do
          end do
       end do

       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine compute_initial_conditions(u0, d1, d2, d3)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

 c---------------------------------------------------------------------
 c Fill in array u0 with initial conditions from
 c random number generator
 c---------------------------------------------------------------------
       implicit none
       include 'global.h'
       integer d1, d2, d3
       double complex u0(d1, d2, d3)
       integer k
       double precision x0, start, an, dummy

 c---------------------------------------------------------------------
 c 0-D and 1-D layouts are easy because each processor gets a contiguous
 c chunk of the array, in the Fortran ordering sense.
 c For a 2-D layout, it's a bit more complicated. We always
 c have entire x-lines (contiguous) in processor.
 c We can do ny/np1 of them at a time since we have
 c ny/np1 contiguous in y-direction. But then we jump
 c by z-planes (nz/np2 of them, total).
 c For the 0-D and 1-D layouts we could do larger chunks, but
 c this turns out to have no measurable impact on performance.
 c---------------------------------------------------------------------


       start = seed
 c---------------------------------------------------------------------
 c Jump to the starting element for our first plane.
 c---------------------------------------------------------------------
       call ipow46(a, 2*nx, (zstart(1)-1)*ny + (ystart(1)-1), an)
       dummy = randlc(start, an)
       call ipow46(a, 2*nx, ny, an)

 c---------------------------------------------------------------------
 c Go through by z planes filling in one square at a time.
 c---------------------------------------------------------------------
       do k = 1, dims(3, 1) ! nz/np2
          x0 = start
          call vranlc(2*nx*dims(2, 1), x0, a, u0(1, 1, k))
          if (k .ne. dims(3, 1)) dummy = randlc(start, an)
       end do

       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine ipow46(a, exp_1, exp_2, result)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

 c---------------------------------------------------------------------
 c compute a^exponent mod 2^46
 c---------------------------------------------------------------------

       implicit none
       double precision a, result, dummy, q, r
       integer exp_1, exp_2, n, n2, ierr
       external randlc
       double precision randlc
       logical  two_pow
 c---------------------------------------------------------------------
 c Use
 c   a^n = a^(n/2)*a^(n/2) if n even else
 c   a^n = a*a^(n-1)       if n odd
 c---------------------------------------------------------------------
       result = 1
       if (exp_2 .eq. 0 .or. exp_1 .eq. 0) return
       q = a
       r = 1
       n = exp_1
       two_pow = .true.

       do while (two_pow)
          n2 = n/2
          if (n2 * 2 .eq. n) then
             dummy = randlc(q, q)
             n = n2
          else
             n = n * exp_2
             two_pow = .false.
          endif
       end do

       do while (n .gt. 1)
          n2 = n/2
          if (n2 * 2 .eq. n) then
             dummy = randlc(q, q)
             n = n2
          else
             dummy = randlc(r, q)
             n = n-1
          endif
       end do
       dummy = randlc(r, q)
       result = r
       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine setup

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none
       include 'mpinpb.h'
       include 'global.h'

       integer ierr, i, j, fstatus
       debug = .FALSE.

       call MPI_Comm_size(MPI_COMM_WORLD, np, ierr)
       call MPI_Comm_rank(MPI_COMM_WORLD, me, ierr)

       if (.not. convertdouble) then
          dc_type = MPI_DOUBLE_COMPLEX
       else
          dc_type = MPI_COMPLEX
       endif


       if (me .eq. 0) then
          write(*, 1000)

          open (unit=2,file='timer.flag',status='old',iostat=fstatus)
          timers_enabled = .false.
          if (fstatus .eq. 0) then
             timers_enabled = .true.
             close(2)
          endif

          open (unit=2,file='inputft.data',status='old', iostat=fstatus)

          if (fstatus .eq. 0) then
             write(*,233)
  233        format(' Reading from input file inputft.data')
             read (2,*) niter
             read (2,*) layout_type
             read (2,*) np1, np2
             close(2)

 c---------------------------------------------------------------------
 c check to make sure input data is consistent
 c---------------------------------------------------------------------


 c---------------------------------------------------------------------
 c 1. product of processor grid dims must equal number of processors
 c---------------------------------------------------------------------

             if (np1 * np2 .ne. np) then
                write(*, 238)
  238           format(' np1 and np2 given in input file are not valid.')
                write(*, 239) np1*np2, np
  239           format(' Product is ', i5, ' and should be ', i5)
                call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
             endif

 c---------------------------------------------------------------------
 c 2. layout type must be valid
 c---------------------------------------------------------------------

             if (layout_type .ne. layout_0D .and.
      >          layout_type .ne. layout_1D .and.
      >          layout_type .ne. layout_2D) then
                write(*, 240)
  240           format(' Layout type specified in inputft.data is
      >                  invalid ')
                call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
             endif

 c---------------------------------------------------------------------
 c 3. 0D layout must be 1x1 grid
 c---------------------------------------------------------------------

             if (layout_type .eq. layout_0D .and.
      >            (np1 .ne.1 .or. np2 .ne. 1)) then
                write(*, 241)
  241           format(' For 0D layout, both np1 and np2 must be 1 ')
                call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
             endif
 c---------------------------------------------------------------------
 c 4. 1D layout must be 1xN grid
 c---------------------------------------------------------------------

             if (layout_type .eq. layout_1D .and. np1 .ne. 1) then
                write(*, 242)
  242           format(' For 1D layout, np1 must be 1 ')
                call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
             endif

          else
             write(*,234)
             niter = niter_default
             if (np .eq. 1) then
                np1 = 1
                np2 = 1
                layout_type = layout_0D
             else if (np .le. nz) then
                np1 = 1
                np2 = np
                layout_type = layout_1D
             else
                np1 = nz
                np2 = np/nz
                layout_type = layout_2D
             endif
          endif

          if (np .lt. np_min) then
             write(*, 10) np_min
  10         format(' Error: Compiled for ', I5, ' processors. ')
             write(*, 11) np
  11         format(' Only ',  i5, ' processors found ')
             call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
          endif

  234     format(' No input file inputft.data. Using compiled defaults')
          write(*, 1001) nx, ny, nz
          write(*, 1002) niter
          write(*, 1004) np
          write(*, 1005) np1, np2
          if (np .ne. np_min) write(*, 1006) np_min

          if (layout_type .eq. layout_0D) then
             write(*, 1010) '0D'
          else if (layout_type .eq. layout_1D) then
             write(*, 1010) '1D'
          else
             write(*, 1010) '2D'
          endif

  1000 format(//,' NAS Parallel Benchmarks 3.3 -- FT Benchmark',/)
  1001    format(' Size                : ', i4, 'x', i4, 'x', i4)
  1002    format(' Iterations          : ', 7x, i7)
  1004    format(' Number of processes : ', 7x, i7)
  1005    format(' Processor array     : ', 5x, i4, 'x', i4)
  1006    format(' WARNING: compiled for ', i5, ' processes. ',
      >          ' Will not verify. ')
  1010    format(' Layout type         : ', 9x, A5)
       endif


 c---------------------------------------------------------------------
 c Broadcast parameters
 c---------------------------------------------------------------------
       call MPI_BCAST(np1, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
       call MPI_BCAST(np2, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
       call MPI_BCAST(layout_type, 1, MPI_INTEGER, 0, MPI_COMM_WORLD,
      &               ierr)
       call MPI_BCAST(niter, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
       call MPI_BCAST(timers_enabled, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD,
      &               ierr)

       if (np1 .eq. 1 .and. np2 .eq. 1) then
         layout_type = layout_0D
       else if (np1 .eq. 1) then
          layout_type = layout_1D
       else
          layout_type = layout_2D
       endif

       if (layout_type .eq. layout_0D) then
          do i = 1, 3
             dims(1, i) = nx
             dims(2, i) = ny
             dims(3, i) = nz
          end do
       else if (layout_type .eq. layout_1D) then
          dims(1, 1) = nx
          dims(2, 1) = ny
          dims(3, 1) = nz

          dims(1, 2) = nx
          dims(2, 2) = ny
          dims(3, 2) = nz

          dims(1, 3) = nz
          dims(2, 3) = nx
          dims(3, 3) = ny
       else if (layout_type .eq. layout_2D) then
          dims(1, 1) = nx
          dims(2, 1) = ny
          dims(3, 1) = nz

          dims(1, 2) = ny
          dims(2, 2) = nx
          dims(3, 2) = nz

          dims(1, 3) = nz
          dims(2, 3) = nx
          dims(3, 3) = ny

       endif
       do i = 1, 3
          dims(2, i) = dims(2, i) / np1
          dims(3, i) = dims(3, i) / np2
       end do


 c---------------------------------------------------------------------
 c Determine processor coordinates of this processor
 c Processor grid is np1xnp2.
 c Arrays are always (n1, n2/np1, n3/np2)
 c Processor coords are zero-based.
 c---------------------------------------------------------------------
       me2 = mod(me, np2)  ! goes from 0...np2-1
       me1 = me/np2        ! goes from 0...np1-1
 c---------------------------------------------------------------------
 c Communicators for rows/columns of processor grid.
 c commslice1 is communicator of all procs with same me1, ranked as me2
 c commslice2 is communicator of all procs with same me2, ranked as me1
 c mpi_comm_split(comm, color, key, ...)
 c---------------------------------------------------------------------
       call MPI_Comm_split(MPI_COMM_WORLD, me1, me2, commslice1, ierr)
       call MPI_Comm_split(MPI_COMM_WORLD, me2, me1, commslice2, ierr)
 !      if (timers_enabled) call synchup()

       if (debug) print *, 'proc coords: ', me, me1, me2

 c---------------------------------------------------------------------
 c Determine which section of the grid is owned by this
 c processor.
 c---------------------------------------------------------------------
       if (layout_type .eq. layout_0d) then

          do i = 1, 3
             xstart(i) = 1
             xend(i)   = nx
             ystart(i) = 1
             yend(i)   = ny
             zstart(i) = 1
             zend(i)   = nz
          end do

       else if (layout_type .eq. layout_1d) then

          xstart(1) = 1
          xend(1)   = nx
          ystart(1) = 1
          yend(1)   = ny
          zstart(1) = 1 + me2 * nz/np2
          zend(1)   = (me2+1) * nz/np2

          xstart(2) = 1
          xend(2)   = nx
          ystart(2) = 1
          yend(2)   = ny
          zstart(2) = 1 + me2 * nz/np2
          zend(2)   = (me2+1) * nz/np2

          xstart(3) = 1
          xend(3)   = nx
          ystart(3) = 1 + me2 * ny/np2
          yend(3)   = (me2+1) * ny/np2
          zstart(3) = 1
          zend(3)   = nz

       else if (layout_type .eq. layout_2d) then

          xstart(1) = 1
          xend(1)   = nx
          ystart(1) = 1 + me1 * ny/np1
          yend(1)   = (me1+1) * ny/np1
          zstart(1) = 1 + me2 * nz/np2
          zend(1)   = (me2+1) * nz/np2

          xstart(2) = 1 + me1 * nx/np1
          xend(2)   = (me1+1)*nx/np1
          ystart(2) = 1
          yend(2)   = ny
          zstart(2) = zstart(1)
          zend(2)   = zend(1)

          xstart(3) = xstart(2)
          xend(3)   = xend(2)
          ystart(3) = 1 + me2 *ny/np2
          yend(3)   = (me2+1)*ny/np2
          zstart(3) = 1
          zend(3)   = nz
       endif

 c---------------------------------------------------------------------
 c Set up info for blocking of ffts and transposes.  This improves
 c performance on cache-based systems. Blocking involves
 c working on a chunk of the problem at a time, taking chunks
 c along the first, second, or third dimension.
 c
 c - In cffts1 blocking is on 2nd dimension (with fft on 1st dim)
 c - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims)

 c Since 1st dim is always in processor, we'll assume it's long enough
 c (default blocking factor is 16 so min size for 1st dim is 16)
 c The only case we have to worry about is cffts1 in a 2d decomposition.
 c so the blocking factor should not be larger than the 2nd dimension.
 c---------------------------------------------------------------------

       fftblock = fftblock_default
       fftblockpad = fftblockpad_default

       if (layout_type .eq. layout_2d) then
          if (dims(2, 1) .lt. fftblock) fftblock = dims(2, 1)
          if (dims(2, 2) .lt. fftblock) fftblock = dims(2, 2)
          if (dims(2, 3) .lt. fftblock) fftblock = dims(2, 3)
       endif

       if (fftblock .ne. fftblock_default) fftblockpad = fftblock+3

       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine compute_indexmap(twiddle, d1, d2, d3)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

 c---------------------------------------------------------------------
 c compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2
 c for time evolution exponent.
 c---------------------------------------------------------------------

       implicit none
       include 'mpinpb.h'
       include 'global.h'
       integer d1, d2, d3
       integer i, j, k, ii, ii2, jj, ij2, kk
       double precision ap, twiddle(d1, d2, d3)

 c---------------------------------------------------------------------
 c this function is very different depending on whether
 c we are in the 0d, 1d or 2d layout. Compute separately.
 c basically we want to convert the fortran indices
 c   1 2 3 4 5 6 7 8
 c to
 c   0 1 2 3 -4 -3 -2 -1
 c The following magic formula does the trick:
 c mod(i-1+n/2, n) - n/2
 c---------------------------------------------------------------------

       ap = - 4.d0 * alpha * pi *pi

       if (layout_type .eq. layout_0d) then ! xyz layout
          do i = 1, dims(1,3)
             ii =  mod(i+xstart(3)-2+nx/2, nx) - nx/2
             ii2 = ii*ii
             do j = 1, dims(2,3)
                jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2
                ij2 = jj*jj+ii2
                do k = 1, dims(3,3)
                   kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2
                   twiddle(i,j,k) = dexp(ap*dfloat(kk*kk+ij2))
                end do
             end do
          end do
       else if (layout_type .eq. layout_1d) then ! zxy layout
          do i = 1,dims(2,3)
             ii =  mod(i+xstart(3)-2+nx/2, nx) - nx/2
             ii2 = ii*ii
             do j = 1,dims(3,3)
                jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2
                ij2 = jj*jj+ii2
                do k = 1,dims(1,3)
                   kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2
                   twiddle(k,i,j) = dexp(ap*dfloat(kk*kk+ij2))
                end do
             end do
          end do
       else if (layout_type .eq. layout_2d) then ! zxy layout
          do i = 1,dims(2,3)
             ii =  mod(i+xstart(3)-2+nx/2, nx) - nx/2
             ii2 = ii*ii
             do j = 1, dims(3,3)
                jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2
                ij2 = jj*jj+ii2
                do k =1,dims(1,3)
                   kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2
                   twiddle(k,i,j) = dexp(ap*dfloat(kk*kk+ij2))
                end do
             end do
          end do
       else
          print *, ' Unknown layout type ', layout_type
          stop
       endif

       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine print_timers()

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none
       integer i, ierr
       include 'global.h'
       include 'mpinpb.h'
       character*25 tstrings(T_max+2)
       double precision t1(T_max+2), tsum(T_max+2),
      >                 tming(T_max+2), tmaxg(T_max+2)
       data tstrings / '          total ',
      >                '          setup ',
      >                '            fft ',
      >                '         evolve ',
      >                '       checksum ',
      >                '         fftlow ',
      >                '        fftcopy ',
      >                '      transpose ',
      >                ' transpose1_loc ',
      >                ' transpose1_glo ',
      >                ' transpose1_fin ',
      >                ' transpose2_loc ',
      >                ' transpose2_glo ',
      >                ' transpose2_fin ',
      >                '           sync ',
      >                '           init ',
      >                '        totcomp ',
      >                '        totcomm ' /

       do i = 1, t_max
          t1(i) = timer_read(i)
       end do
       t1(t_max+2) = t1(t_transxzglo) + t1(t_transxyglo) + t1(t_synch)
       t1(t_max+1) = t1(t_total) - t1(t_max+2)

       call MPI_Reduce(t1, tsum,  t_max+2, MPI_DOUBLE_PRECISION,
      >                MPI_SUM, 0, MPI_COMM_WORLD, ierr)
       call MPI_Reduce(t1, tming, t_max+2, MPI_DOUBLE_PRECISION,
      >                MPI_MIN, 0, MPI_COMM_WORLD, ierr)
       call MPI_Reduce(t1, tmaxg, t_max+2, MPI_DOUBLE_PRECISION,
      >                MPI_MAX, 0, MPI_COMM_WORLD, ierr)

       if (me .ne. 0) return
       write(*, 800) np
       do i = 1, t_max+2
          if (tsum(i) .ne. 0.0d0) then
             write(*, 810) i, tstrings(i), tming(i), tmaxg(i), tsum(i)/np
          endif
       end do
  800  format(' nprocs =', i6, 19x, 'minimum', 5x, 'maximum',
      >       5x, 'average')
  810  format(' timer ', i2, '(', A16, ') :', 3(2X,F10.4))
       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine fft(dir, x1, x2)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none
       include 'global.h'
       integer dir
       double complex x1(ntdivnp), x2(ntdivnp)

       double complex scratch(fftblockpad_default*maxdim*2)

 c---------------------------------------------------------------------
 c note: args x1, x2 must be different arrays
 c note: args for cfftsx are (direction, layout, xin, xout, scratch)
 c       xin/xout may be the same and it can be somewhat faster
 c       if they are
 c note: args for transpose are (layout1, layout2, xin, xout)
 c       xin/xout must be different
 c---------------------------------------------------------------------

       if (dir .eq. 1) then
          if (layout_type .eq. layout_0d) then
             call cffts1(1, dims(1,1), dims(2,1), dims(3,1),
      >                  x1, x1, scratch)
             call cffts2(1, dims(1,2), dims(2,2), dims(3,2),
      >                  x1, x1, scratch)
             call cffts3(1, dims(1,3), dims(2,3), dims(3,3),
      >                  x1, x2, scratch)
          else if (layout_type .eq. layout_1d) then
             call cffts1(1, dims(1,1), dims(2,1), dims(3,1),
      >                  x1, x1, scratch)
             call cffts2(1, dims(1,2), dims(2,2), dims(3,2),
      >                  x1, x1, scratch)
             if (timers_enabled) call timer_start(T_transpose)
             call transpose_xy_z(2, 3, x1, x2)
             if (timers_enabled) call timer_stop(T_transpose)
             call cffts1(1, dims(1,3), dims(2,3), dims(3,3),
      >                  x2, x2, scratch)
          else if (layout_type .eq. layout_2d) then
             call cffts1(1, dims(1,1), dims(2,1), dims(3,1),
      >                  x1, x1, scratch)
             if (timers_enabled) call timer_start(T_transpose)
             call transpose_x_y(1, 2, x1, x2)
             if (timers_enabled) call timer_stop(T_transpose)
             call cffts1(1, dims(1,2), dims(2,2), dims(3,2),
      >                  x2, x2, scratch)
             if (timers_enabled) call timer_start(T_transpose)
             call transpose_x_z(2, 3, x2, x1)
             if (timers_enabled) call timer_stop(T_transpose)
             call cffts1(1, dims(1,3), dims(2,3), dims(3,3),
      >                  x1, x2, scratch)
          endif
       else
          if (layout_type .eq. layout_0d) then
             call cffts3(-1, dims(1,3), dims(2,3), dims(3,3),
      >                  x1, x1, scratch)
             call cffts2(-1, dims(1,2), dims(2,2), dims(3,2),
      >                  x1, x1, scratch)
             call cffts1(-1, dims(1,1), dims(2,1), dims(3,1),
      >                  x1, x2, scratch)
          else if (layout_type .eq. layout_1d) then
             call cffts1(-1, dims(1,3), dims(2,3), dims(3,3),
      >                  x1, x1, scratch)
             if (timers_enabled) call timer_start(T_transpose)
             call transpose_x_yz(3, 2, x1, x2)
             if (timers_enabled) call timer_stop(T_transpose)
             call cffts2(-1, dims(1,2), dims(2,2), dims(3,2),
      >                  x2, x2, scratch)
             call cffts1(-1, dims(1,1), dims(2,1), dims(3,1),
      >                  x2, x2, scratch)
          else if (layout_type .eq. layout_2d) then
             call cffts1(-1, dims(1,3), dims(2,3), dims(3,3),
      >                  x1, x1, scratch)
             if (timers_enabled) call timer_start(T_transpose)
             call transpose_x_z(3, 2, x1, x2)
             if (timers_enabled) call timer_stop(T_transpose)
             call cffts1(-1, dims(1,2), dims(2,2), dims(3,2),
      >                  x2, x2, scratch)
             if (timers_enabled) call timer_start(T_transpose)
             call transpose_x_y(2, 1, x2, x1)
             if (timers_enabled) call timer_stop(T_transpose)
             call cffts1(-1, dims(1,1), dims(2,1), dims(3,1),
      >                  x1, x2, scratch)
          endif
       endif
       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine cffts1(is, d1, d2, d3, x, xout, y)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none

       include 'global.h'
       integer is, d1, d2, d3, logd1
       double complex x(d1,d2,d3)
       double complex xout(d1,d2,d3)
       double complex y(fftblockpad, d1, 2)
       integer i, j, k, jj

       logd1 = ilog2(d1)

       do k = 1, d3
          do jj = 0, d2 - fftblock, fftblock
             if (timers_enabled) call timer_start(T_fftcopy)
             do j = 1, fftblock
                do i = 1, d1
                   y(j,i,1) = x(i,j+jj,k)
                enddo
             enddo
             if (timers_enabled) call timer_stop(T_fftcopy)

             if (timers_enabled) call timer_start(T_fftlow)
             call cfftz (is, logd1, d1, y, y(1,1,2))
             if (timers_enabled) call timer_stop(T_fftlow)

             if (timers_enabled) call timer_start(T_fftcopy)
             do j = 1, fftblock
                do i = 1, d1
                   xout(i,j+jj,k) = y(j,i,1)
                enddo
             enddo
             if (timers_enabled) call timer_stop(T_fftcopy)
          enddo
       enddo

       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine cffts2(is, d1, d2, d3, x, xout, y)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none

       include 'global.h'
       integer is, d1, d2, d3, logd2
       double complex x(d1,d2,d3)
       double complex xout(d1,d2,d3)
       double complex y(fftblockpad, d2, 2)
       integer i, j, k, ii

       logd2 = ilog2(d2)

       do k = 1, d3
         do ii = 0, d1 - fftblock, fftblock
            if (timers_enabled) call timer_start(T_fftcopy)
            do j = 1, d2
               do i = 1, fftblock
                  y(i,j,1) = x(i+ii,j,k)
               enddo
            enddo
            if (timers_enabled) call timer_stop(T_fftcopy)

            if (timers_enabled) call timer_start(T_fftlow)
            call cfftz (is, logd2, d2, y, y(1, 1, 2))
            if (timers_enabled) call timer_stop(T_fftlow)

            if (timers_enabled) call timer_start(T_fftcopy)
            do j = 1, d2
               do i = 1, fftblock
                  xout(i+ii,j,k) = y(i,j,1)
               enddo
            enddo
            if (timers_enabled) call timer_stop(T_fftcopy)
         enddo
       enddo

       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine cffts3(is, d1, d2, d3, x, xout, y)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none

       include 'global.h'
       integer is, d1, d2, d3, logd3
       double complex x(d1,d2,d3)
       double complex xout(d1,d2,d3)
       double complex y(fftblockpad, d3, 2)
       integer i, j, k, ii

       logd3 = ilog2(d3)

       do j = 1, d2
         do ii = 0, d1 - fftblock, fftblock
            if (timers_enabled) call timer_start(T_fftcopy)
            do k = 1, d3
               do i = 1, fftblock
                  y(i,k,1) = x(i+ii,j,k)
               enddo
            enddo
            if (timers_enabled) call timer_stop(T_fftcopy)

            if (timers_enabled) call timer_start(T_fftlow)
            call cfftz (is, logd3, d3, y, y(1, 1, 2))
            if (timers_enabled) call timer_stop(T_fftlow)

            if (timers_enabled) call timer_start(T_fftcopy)
            do k = 1, d3
               do i = 1, fftblock
                  xout(i+ii,j,k) = y(i,k,1)
               enddo
            enddo
            if (timers_enabled) call timer_stop(T_fftcopy)
         enddo
       enddo

       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine fft_init (n)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

 c---------------------------------------------------------------------
 c compute the roots-of-unity array that will be used for subsequent FFTs.
 c---------------------------------------------------------------------

       implicit none
       include 'global.h'

       integer m,n,nu,ku,i,j,ln
       double precision t, ti


 c---------------------------------------------------------------------
 c   Initialize the U array with sines and cosines in a manner that permits
 c   stride one access at each FFT iteration.
 c---------------------------------------------------------------------
       nu = n
       m = ilog2(n)
       u(1) = m
       ku = 2
       ln = 1

       do j = 1, m
          t = pi / ln

          do i = 0, ln - 1
             ti = i * t
             u(i+ku) = dcmplx (cos (ti), sin(ti))
          enddo

          ku = ku + ln
          ln = 2 * ln
       enddo

       return
       end

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine cfftz (is, m, n, x, y)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

 c---------------------------------------------------------------------
 c   Computes NY N-point complex-to-complex FFTs of X using an algorithm due
 c   to Swarztrauber.  X is both the input and the output array, while Y is a
 c   scratch array.  It is assumed that N = 2^M.  Before calling CFFTZ to
 c   perform FFTs, the array U must be initialized by calling CFFTZ with IS
 c   set to 0 and M set to MX, where MX is the maximum value of M for any
 c   subsequent call.
 c---------------------------------------------------------------------

       implicit none
       include 'global.h'

       integer is,m,n,i,j,l,mx
       double complex x, y

       dimension x(fftblockpad,n), y(fftblockpad,n)

 c---------------------------------------------------------------------
 c   Check if input parameters are invalid.
 c---------------------------------------------------------------------
       mx = u(1)
       if ((is .ne. 1 .and. is .ne. -1) .or. m .lt. 1 .or. m .gt. mx)
      >  then
         write (*, 1)  is, m, mx
  1      format ('CFFTZ: Either U has not been initialized, or else'/
      >    'one of the input parameters is invalid', 3I5)
         stop
       endif

 c---------------------------------------------------------------------
 c   Perform one variant of the Stockham FFT.
 c---------------------------------------------------------------------
       do l = 1, m, 2
         call fftz2 (is, l, m, n, fftblock, fftblockpad, u, x, y)
         if (l .eq. m) goto 160
         call fftz2 (is, l + 1, m, n, fftblock, fftblockpad, u, y, x)
       enddo

       goto 180

 c---------------------------------------------------------------------
 c   Copy Y to X.
 c---------------------------------------------------------------------
  160  do j = 1, n
         do i = 1, fftblock
           x(i,j) = y(i,j)
         enddo
       enddo

  180  continue

       return
       end

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine fftz2 (is, l, m, n, ny, ny1, u, x, y)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

 c---------------------------------------------------------------------
 c   Performs the L-th iteration of the second variant of the Stockham FFT.
 c---------------------------------------------------------------------

       implicit none

       integer is,k,l,m,n,ny,ny1,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22
       double complex u,x,y,u1,x11,x21
       dimension u(n), x(ny1,n), y(ny1,n)


 c---------------------------------------------------------------------
 c   Set initial parameters.
 c---------------------------------------------------------------------

       n1 = n / 2
       lk = 2 ** (l - 1)
       li = 2 ** (m - l)
       lj = 2 * lk
       ku = li + 1

       do i = 0, li - 1
         i11 = i * lk + 1
         i12 = i11 + n1
         i21 = i * lj + 1
         i22 = i21 + lk
         if (is .ge. 1) then
           u1 = u(ku+i)
         else
           u1 = dconjg (u(ku+i))
         endif

 c---------------------------------------------------------------------
 c   This loop is vectorizable.
 c---------------------------------------------------------------------
         do k = 0, lk - 1
           do j = 1, ny
             x11 = x(j,i11+k)
             x21 = x(j,i12+k)
             y(j,i21+k) = x11 + x21
             y(j,i22+k) = u1 * (x11 - x21)
           enddo
         enddo
       enddo

       return
       end

 c---------------------------------------------------------------------


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       integer function ilog2(n)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none
       integer n, nn, lg
       if (n .eq. 1) then
          ilog2=0
          return
       endif
       lg = 1
       nn = 2
       do while (nn .lt. n)
          nn = nn*2
          lg = lg+1
       end do
       ilog2 = lg
       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine transpose_x_yz(l1, l2, xin, xout)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none
       include 'global.h'
       integer l1, l2
       double complex xin(ntdivnp), xout(ntdivnp)

       call transpose2_local(dims(1,l1),dims(2, l1)*dims(3, l1),
      >                          xin, xout)

       call transpose2_global(xout, xin)

       call transpose2_finish(dims(1,l1),dims(2, l1)*dims(3, l1),
      >                          xin, xout)

       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine transpose_xy_z(l1, l2, xin, xout)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none
       include 'global.h'
       integer l1, l2
       double complex xin(ntdivnp), xout(ntdivnp)

       call transpose2_local(dims(1,l1)*dims(2, l1),dims(3, l1),
      >                          xin, xout)
       call transpose2_global(xout, xin)
       call transpose2_finish(dims(1,l1)*dims(2, l1),dims(3, l1),
      >                          xin, xout)

       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine transpose2_local(n1, n2, xin, xout)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none
       include 'mpinpb.h'
       include 'global.h'
       integer n1, n2
       double complex xin(n1, n2), xout(n2, n1)

       double complex z(transblockpad, transblock)

       integer i, j, ii, jj

       if (timers_enabled) call timer_start(T_transxzloc)

 c---------------------------------------------------------------------
 c If possible, block the transpose for cache memory systems.
 c How much does this help? Example: R8000 Power Challenge (90 MHz)
 c Blocked version decreases time spend in this routine
 c from 14 seconds to 5.2 seconds on 8 nodes class A.
 c---------------------------------------------------------------------

       if (n1 .lt. transblock .or. n2 .lt. transblock) then
          if (n1 .ge. n2) then
             do j = 1, n2
                do i = 1, n1
                   xout(j, i) = xin(i, j)
                end do
             end do
          else
             do i = 1, n1
                do j = 1, n2
                   xout(j, i) = xin(i, j)
                end do
             end do
          endif
       else
          do j = 0, n2-1, transblock
             do i = 0, n1-1, transblock

 c---------------------------------------------------------------------
 c Note: compiler should be able to take j+jj out of inner loop
 c---------------------------------------------------------------------
                do jj = 1, transblock
                   do ii = 1, transblock
                      z(jj,ii) = xin(i+ii, j+jj)
                   end do
                end do

                do ii = 1, transblock
                   do jj = 1, transblock
                      xout(j+jj, i+ii) = z(jj,ii)
                   end do
                end do

             end do
          end do
       endif
       if (timers_enabled) call timer_stop(T_transxzloc)

       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine transpose2_global(xin, xout)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none
       include 'global.h'
       include 'mpinpb.h'
       double complex xin(ntdivnp)
       double complex xout(ntdivnp)
       integer ierr

 !      if (timers_enabled) call synchup()

       if (timers_enabled) call timer_start(T_transxzglo)
       call mpi_alltoall(xin, ntdivnp/np, dc_type,
      >                  xout, ntdivnp/np, dc_type,
      >                  commslice1, ierr)
       if (timers_enabled) call timer_stop(T_transxzglo)

       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine transpose2_finish(n1, n2, xin, xout)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none
       include 'global.h'
       integer n1, n2, ioff
       double complex xin(n2, n1/np2, 0:np2-1), xout(n2*np2, n1/np2)

       integer i, j, p

       if (timers_enabled) call timer_start(T_transxzfin)
       do p = 0, np2-1
          ioff = p*n2
          do j = 1, n1/np2
             do i = 1, n2
                xout(i+ioff, j) = xin(i, j, p)
             end do
          end do
       end do
       if (timers_enabled) call timer_stop(T_transxzfin)

       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine transpose_x_z(l1, l2, xin, xout)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none
       include 'global.h'
       integer l1, l2
       double complex xin(ntdivnp), xout(ntdivnp)

       call transpose_x_z_local(dims(1,l1),dims(2,l1),dims(3,l1),
      >                         xin, xout)
       call transpose_x_z_global(dims(1,l1),dims(2,l1),dims(3,l1),
      >                          xout, xin)
       call transpose_x_z_finish(dims(1,l2),dims(2,l2),dims(3,l2),
      >                          xin, xout)
       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine transpose_x_z_local(d1, d2, d3, xin, xout)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none
       include 'global.h'
       integer d1, d2, d3
       double complex xin(d1,d2,d3)
       double complex xout(d3,d2,d1)
       integer block1, block3
       integer i, j, k, kk, ii, i1, k1

       double complex buf(transblockpad, maxdim)
       if (timers_enabled) call timer_start(T_transxzloc)
       if (d1 .lt. 32) goto 100
       block3 = d3
       if (block3 .eq. 1)  goto 100
       if (block3 .gt. transblock) block3 = transblock
       block1 = d1
       if (block1*block3 .gt. transblock*transblock)
      >          block1 = transblock*transblock/block3
 c---------------------------------------------------------------------
 c blocked transpose
 c---------------------------------------------------------------------
       do j = 1, d2
          do kk = 0, d3-block3, block3
             do ii = 0, d1-block1, block1

                do k = 1, block3
                   k1 = k + kk
                   do i = 1, block1
                      buf(k, i) = xin(i+ii, j, k1)
                   end do
                end do

                do i = 1, block1
                   i1 = i + ii
                   do k = 1, block3
                      xout(k+kk, j, i1) = buf(k, i)
                   end do
                end do

             end do
          end do
       end do
       goto 200


 c---------------------------------------------------------------------
 c basic transpose
 c---------------------------------------------------------------------
  100  continue

       do j = 1, d2
          do k = 1, d3
             do i = 1, d1
                xout(k, j, i) = xin(i, j, k)
             end do
          end do
       end do

 c---------------------------------------------------------------------
 c all done
 c---------------------------------------------------------------------
  200  continue

       if (timers_enabled) call timer_stop(T_transxzloc)
       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine transpose_x_z_global(d1, d2, d3, xin, xout)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none
       include 'global.h'
       include 'mpinpb.h'
       integer d1, d2, d3
       double complex xin(d3,d2,d1)
       double complex xout(d3,d2,d1) ! not real layout, but right size
       integer ierr

 !      if (timers_enabled) call synchup()

 c---------------------------------------------------------------------
 c do transpose among all  processes with same 1-coord (me1)
 c---------------------------------------------------------------------
       if (timers_enabled)call timer_start(T_transxzglo)
       call mpi_alltoall(xin, d1*d2*d3/np2, dc_type,
      >                  xout, d1*d2*d3/np2, dc_type,
      >                  commslice1, ierr)
       if (timers_enabled) call timer_stop(T_transxzglo)
       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine transpose_x_z_finish(d1, d2, d3, xin, xout)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none
       include 'global.h'
       integer d1, d2, d3
       double complex xin(d1/np2, d2, d3, 0:np2-1)
       double complex xout(d1,d2,d3)
       integer i, j, k, p, ioff
       if (timers_enabled) call timer_start(T_transxzfin)
 c---------------------------------------------------------------------
 c this is the most straightforward way of doing it. the
 c calculation in the inner loop doesn't help.
 c      do i = 1, d1/np2
 c         do j = 1, d2
 c            do k = 1, d3
 c               do p = 0, np2-1
 c                  ii = i + p*d1/np2
 c                  xout(ii, j, k) = xin(i, j, k, p)
 c               end do
 c            end do
 c         end do
 c      end do
 c---------------------------------------------------------------------

       do p = 0, np2-1
          ioff = p*d1/np2
          do k = 1, d3
             do j = 1, d2
                do i = 1, d1/np2
                   xout(i+ioff, j, k) = xin(i, j, k, p)
                end do
             end do
          end do
       end do
       if (timers_enabled) call timer_stop(T_transxzfin)
       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine transpose_x_y(l1, l2, xin, xout)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none
       include 'global.h'
       integer l1, l2
       double complex xin(ntdivnp), xout(ntdivnp)

 c---------------------------------------------------------------------
 c xy transpose is a little tricky, since we don't want
 c to touch 3rd axis. But alltoall must involve 3rd axis (most
 c slowly varying) to be efficient. So we do
 c (nx, ny/np1, nz/np2) -> (ny/np1, nz/np2, nx) (local)
 c (ny/np1, nz/np2, nx) -> ((ny/np1*nz/np2)*np1, nx/np1) (global)
 c then local finish.
 c---------------------------------------------------------------------


       call transpose_x_y_local(dims(1,l1),dims(2,l1),dims(3,l1),
      >                         xin, xout)
       call transpose_x_y_global(dims(1,l1),dims(2,l1),dims(3,l1),
      >                          xout, xin)
       call transpose_x_y_finish(dims(1,l2),dims(2,l2),dims(3,l2),
      >                          xin, xout)

       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine transpose_x_y_local(d1, d2, d3, xin, xout)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none
       include 'global.h'
       integer d1, d2, d3
       double complex xin(d1, d2, d3)
       double complex xout(d2, d3, d1)
       integer i, j, k
       if (timers_enabled) call timer_start(T_transxyloc)

       do k = 1, d3
          do i = 1, d1
             do j = 1, d2
                xout(j,k,i)=xin(i,j,k)
             end do
          end do
       end do
       if (timers_enabled) call timer_stop(T_transxyloc)
       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine transpose_x_y_global(d1, d2, d3, xin, xout)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none
       include 'global.h'
       include 'mpinpb.h'
       integer d1, d2, d3
 c---------------------------------------------------------------------
 c array is in form (ny/np1, nz/np2, nx)
 c---------------------------------------------------------------------
       double complex xin(d2,d3,d1)
       double complex xout(d2,d3,d1) ! not real layout but right size
       integer ierr

 !      if (timers_enabled) call synchup()

 c---------------------------------------------------------------------
 c do transpose among all processes with same 1-coord (me1)
 c---------------------------------------------------------------------
       if (timers_enabled) call timer_start(T_transxyglo)
       call mpi_alltoall(xin, d1*d2*d3/np1, dc_type,
      >                  xout, d1*d2*d3/np1, dc_type,
      >                  commslice2, ierr)
       if (timers_enabled) call timer_stop(T_transxyglo)

       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine transpose_x_y_finish(d1, d2, d3, xin, xout)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none
       include 'global.h'
       integer d1, d2, d3
       double complex xin(d1/np1, d3, d2, 0:np1-1)
       double complex xout(d1,d2,d3)
       integer i, j, k, p, ioff
       if (timers_enabled) call timer_start(T_transxyfin)
 c---------------------------------------------------------------------
 c this is the most straightforward way of doing it. the
 c calculation in the inner loop doesn't help.
 c      do i = 1, d1/np1
 c         do j = 1, d2
 c            do k = 1, d3
 c               do p = 0, np1-1
 c                  ii = i + p*d1/np1
 c note order is screwy bcz we have (ny/np1, nz/np2, nx) -> (ny, nx/np1, nz/np2)
 c                  xout(ii, j, k) = xin(i, k, j, p)
 c               end do
 c            end do
 c         end do
 c      end do
 c---------------------------------------------------------------------

       do p = 0, np1-1
          ioff = p*d1/np1
          do k = 1, d3
             do j = 1, d2
                do i = 1, d1/np1
                   xout(i+ioff, j, k) = xin(i, k, j, p)
                end do
             end do
          end do
       end do
       if (timers_enabled) call timer_stop(T_transxyfin)
       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine checksum(i, u1, d1, d2, d3)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none
       include 'global.h'
       include 'mpinpb.h'
       integer i, d1, d2, d3
       double complex u1(d1, d2, d3)
       integer j, q,r,s, ierr
       double complex chk,allchk
       chk = (0.0,0.0)

       do j=1,1024
          q = mod(j, nx)+1
          if (q .ge. xstart(1) .and. q .le. xend(1)) then
             r = mod(3*j,ny)+1
             if (r .ge. ystart(1) .and. r .le. yend(1)) then
                s = mod(5*j,nz)+1
                if (s .ge. zstart(1) .and. s .le. zend(1)) then
                   chk=chk+u1(q-xstart(1)+1,r-ystart(1)+1,s-zstart(1)+1)
                end if
             end if
          end if
       end do
       chk = chk/ntotal_f

       if (timers_enabled) call timer_start(T_synch)
       call MPI_Reduce(chk, allchk, 1, dc_type, MPI_SUM,
      >                0, MPI_COMM_WORLD, ierr)
       if (timers_enabled) call timer_stop(T_synch)
       if (me .eq. 0) then
             write (*, 30) i, allchk
  30         format (' T =',I5,5X,'Checksum =',1P2D22.12)
       endif

 c      sums(i) = allchk
 c     If we compute the checksum for diagnostic purposes, we let i be
 c     negative, so the result will not be stored in an array
       if (i .gt. 0) sums(i) = allchk

       return
       end

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine synchup

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none
       include 'global.h'
       include 'mpinpb.h'
       integer ierr
       call timer_start(T_synch)
       call mpi_barrier(MPI_COMM_WORLD, ierr)
       call timer_stop(T_synch)
       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine verify (d1, d2, d3, nt, verified, class)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none
       include 'global.h'
       include 'mpinpb.h'
       integer d1, d2, d3, nt
       character class
       logical verified
       integer ierr, size, i
       double precision err, epsilon

 c---------------------------------------------------------------------
 c   Reference checksums
 c---------------------------------------------------------------------
       double complex csum_ref(25)


       class = 'U'

       if (me .ne. 0) return

       epsilon = 1.0d-12
       verified = .FALSE.

       if (d1 .eq. 64 .and.
      >    d2 .eq. 64 .and.
      >    d3 .eq. 64 .and.
      >    nt .eq. 6) then
 c---------------------------------------------------------------------
 c   Sample size reference checksums
 c---------------------------------------------------------------------
          class = 'S'
          csum_ref(1) = dcmplx(5.546087004964D+02, 4.845363331978D+02)
          csum_ref(2) = dcmplx(5.546385409189D+02, 4.865304269511D+02)
          csum_ref(3) = dcmplx(5.546148406171D+02, 4.883910722336D+02)
          csum_ref(4) = dcmplx(5.545423607415D+02, 4.901273169046D+02)
          csum_ref(5) = dcmplx(5.544255039624D+02, 4.917475857993D+02)
          csum_ref(6) = dcmplx(5.542683411902D+02, 4.932597244941D+02)

       else if (d1 .eq. 128 .and.
      >    d2 .eq. 128 .and.
      >    d3 .eq. 32 .and.
      >    nt .eq. 6) then
 c---------------------------------------------------------------------
 c   Class W size reference checksums
 c---------------------------------------------------------------------
          class = 'W'
          csum_ref(1) = dcmplx(5.673612178944D+02, 5.293246849175D+02)
          csum_ref(2) = dcmplx(5.631436885271D+02, 5.282149986629D+02)
          csum_ref(3) = dcmplx(5.594024089970D+02, 5.270996558037D+02)
          csum_ref(4) = dcmplx(5.560698047020D+02, 5.260027904925D+02)
          csum_ref(5) = dcmplx(5.530898991250D+02, 5.249400845633D+02)
          csum_ref(6) = dcmplx(5.504159734538D+02, 5.239212247086D+02)

       else if (d1 .eq. 256 .and.
      >    d2 .eq. 256 .and.
      >    d3 .eq. 128 .and.
      >    nt .eq. 6) then
 c---------------------------------------------------------------------
 c   Class A size reference checksums
 c---------------------------------------------------------------------
          class = 'A'
          csum_ref(1) = dcmplx(5.046735008193D+02, 5.114047905510D+02)
          csum_ref(2) = dcmplx(5.059412319734D+02, 5.098809666433D+02)
          csum_ref(3) = dcmplx(5.069376896287D+02, 5.098144042213D+02)
          csum_ref(4) = dcmplx(5.077892868474D+02, 5.101336130759D+02)
          csum_ref(5) = dcmplx(5.085233095391D+02, 5.104914655194D+02)
          csum_ref(6) = dcmplx(5.091487099959D+02, 5.107917842803D+02)

       else if (d1 .eq. 512 .and.
      >    d2 .eq. 256 .and.
      >    d3 .eq. 256 .and.
      >    nt .eq. 20) then
 c---------------------------------------------------------------------
 c   Class B size reference checksums
 c---------------------------------------------------------------------
          class = 'B'
          csum_ref(1)  = dcmplx(5.177643571579D+02, 5.077803458597D+02)
          csum_ref(2)  = dcmplx(5.154521291263D+02, 5.088249431599D+02)
          csum_ref(3)  = dcmplx(5.146409228649D+02, 5.096208912659D+02)
          csum_ref(4)  = dcmplx(5.142378756213D+02, 5.101023387619D+02)
          csum_ref(5)  = dcmplx(5.139626667737D+02, 5.103976610617D+02)
          csum_ref(6)  = dcmplx(5.137423460082D+02, 5.105948019802D+02)
          csum_ref(7)  = dcmplx(5.135547056878D+02, 5.107404165783D+02)
          csum_ref(8)  = dcmplx(5.133910925466D+02, 5.108576573661D+02)
          csum_ref(9)  = dcmplx(5.132470705390D+02, 5.109577278523D+02)
          csum_ref(10) = dcmplx(5.131197729984D+02, 5.110460304483D+02)
          csum_ref(11) = dcmplx(5.130070319283D+02, 5.111252433800D+02)
          csum_ref(12) = dcmplx(5.129070537032D+02, 5.111968077718D+02)
          csum_ref(13) = dcmplx(5.128182883502D+02, 5.112616233064D+02)
          csum_ref(14) = dcmplx(5.127393733383D+02, 5.113203605551D+02)
          csum_ref(15) = dcmplx(5.126691062020D+02, 5.113735928093D+02)
          csum_ref(16) = dcmplx(5.126064276004D+02, 5.114218460548D+02)
          csum_ref(17) = dcmplx(5.125504076570D+02, 5.114656139760D+02)
          csum_ref(18) = dcmplx(5.125002331720D+02, 5.115053595966D+02)
          csum_ref(19) = dcmplx(5.124551951846D+02, 5.115415130407D+02)
          csum_ref(20) = dcmplx(5.124146770029D+02, 5.115744692211D+02)

       else if (d1 .eq. 512 .and.
      >    d2 .eq. 512 .and.
      >    d3 .eq. 512 .and.
      >    nt .eq. 20) then
 c---------------------------------------------------------------------
 c   Class C size reference checksums
 c---------------------------------------------------------------------
          class = 'C'
          csum_ref(1)  = dcmplx(5.195078707457D+02, 5.149019699238D+02)
          csum_ref(2)  = dcmplx(5.155422171134D+02, 5.127578201997D+02)
          csum_ref(3)  = dcmplx(5.144678022222D+02, 5.122251847514D+02)
          csum_ref(4)  = dcmplx(5.140150594328D+02, 5.121090289018D+02)
          csum_ref(5)  = dcmplx(5.137550426810D+02, 5.121143685824D+02)
          csum_ref(6)  = dcmplx(5.135811056728D+02, 5.121496764568D+02)
          csum_ref(7)  = dcmplx(5.134569343165D+02, 5.121870921893D+02)
          csum_ref(8)  = dcmplx(5.133651975661D+02, 5.122193250322D+02)
          csum_ref(9)  = dcmplx(5.132955192805D+02, 5.122454735794D+02)
          csum_ref(10) = dcmplx(5.132410471738D+02, 5.122663649603D+02)
          csum_ref(11) = dcmplx(5.131971141679D+02, 5.122830879827D+02)
          csum_ref(12) = dcmplx(5.131605205716D+02, 5.122965869718D+02)
          csum_ref(13) = dcmplx(5.131290734194D+02, 5.123075927445D+02)
          csum_ref(14) = dcmplx(5.131012720314D+02, 5.123166486553D+02)
          csum_ref(15) = dcmplx(5.130760908195D+02, 5.123241541685D+02)
          csum_ref(16) = dcmplx(5.130528295923D+02, 5.123304037599D+02)
          csum_ref(17) = dcmplx(5.130310107773D+02, 5.123356167976D+02)
          csum_ref(18) = dcmplx(5.130103090133D+02, 5.123399592211D+02)
          csum_ref(19) = dcmplx(5.129905029333D+02, 5.123435588985D+02)
          csum_ref(20) = dcmplx(5.129714421109D+02, 5.123465164008D+02)

       else if (d1 .eq. 2048 .and.
      >    d2 .eq. 1024 .and.
      >    d3 .eq. 1024 .and.
      >    nt .eq. 25) then
 c---------------------------------------------------------------------
 c   Class D size reference checksums
 c---------------------------------------------------------------------
          class = 'D'
          csum_ref(1)  = dcmplx(5.122230065252D+02, 5.118534037109D+02)
          csum_ref(2)  = dcmplx(5.120463975765D+02, 5.117061181082D+02)
          csum_ref(3)  = dcmplx(5.119865766760D+02, 5.117096364601D+02)
          csum_ref(4)  = dcmplx(5.119518799488D+02, 5.117373863950D+02)
          csum_ref(5)  = dcmplx(5.119269088223D+02, 5.117680347632D+02)
          csum_ref(6)  = dcmplx(5.119082416858D+02, 5.117967875532D+02)
          csum_ref(7)  = dcmplx(5.118943814638D+02, 5.118225281841D+02)
          csum_ref(8)  = dcmplx(5.118842385057D+02, 5.118451629348D+02)
          csum_ref(9)  = dcmplx(5.118769435632D+02, 5.118649119387D+02)
          csum_ref(10) = dcmplx(5.118718203448D+02, 5.118820803844D+02)
          csum_ref(11) = dcmplx(5.118683569061D+02, 5.118969781011D+02)
          csum_ref(12) = dcmplx(5.118661708593D+02, 5.119098918835D+02)
          csum_ref(13) = dcmplx(5.118649768950D+02, 5.119210777066D+02)
          csum_ref(14) = dcmplx(5.118645605626D+02, 5.119307604484D+02)
          csum_ref(15) = dcmplx(5.118647586618D+02, 5.119391362671D+02)
          csum_ref(16) = dcmplx(5.118654451572D+02, 5.119463757241D+02)
          csum_ref(17) = dcmplx(5.118665212451D+02, 5.119526269238D+02)
          csum_ref(18) = dcmplx(5.118679083821D+02, 5.119580184108D+02)
          csum_ref(19) = dcmplx(5.118695433664D+02, 5.119626617538D+02)
          csum_ref(20) = dcmplx(5.118713748264D+02, 5.119666538138D+02)
          csum_ref(21) = dcmplx(5.118733606701D+02, 5.119700787219D+02)
          csum_ref(22) = dcmplx(5.118754661974D+02, 5.119730095953D+02)
          csum_ref(23) = dcmplx(5.118776626738D+02, 5.119755100241D+02)
          csum_ref(24) = dcmplx(5.118799262314D+02, 5.119776353561D+02)
          csum_ref(25) = dcmplx(5.118822370068D+02, 5.119794338060D+02)

       else if (d1 .eq. 4096 .and.
      >    d2 .eq. 2048 .and.
      >    d3 .eq. 2048 .and.
      >    nt .eq. 25) then
 c---------------------------------------------------------------------
 c   Class E size reference checksums
 c---------------------------------------------------------------------
          class = 'E'
          csum_ref(1)  = dcmplx(5.121601045346D+02, 5.117395998266D+02)
          csum_ref(2)  = dcmplx(5.120905403678D+02, 5.118614716182D+02)
          csum_ref(3)  = dcmplx(5.120623229306D+02, 5.119074203747D+02)
          csum_ref(4)  = dcmplx(5.120438418997D+02, 5.119345900733D+02)
          csum_ref(5)  = dcmplx(5.120311521872D+02, 5.119551325550D+02)
          csum_ref(6)  = dcmplx(5.120226088809D+02, 5.119720179919D+02)
          csum_ref(7)  = dcmplx(5.120169296534D+02, 5.119861371665D+02)
          csum_ref(8)  = dcmplx(5.120131225172D+02, 5.119979364402D+02)
          csum_ref(9)  = dcmplx(5.120104767108D+02, 5.120077674092D+02)
          csum_ref(10) = dcmplx(5.120085127969D+02, 5.120159443121D+02)
          csum_ref(11) = dcmplx(5.120069224127D+02, 5.120227453670D+02)
          csum_ref(12) = dcmplx(5.120055158164D+02, 5.120284096041D+02)
          csum_ref(13) = dcmplx(5.120041820159D+02, 5.120331373793D+02)
          csum_ref(14) = dcmplx(5.120028605402D+02, 5.120370938679D+02)
          csum_ref(15) = dcmplx(5.120015223011D+02, 5.120404138831D+02)
          csum_ref(16) = dcmplx(5.120001570022D+02, 5.120432068837D+02)
          csum_ref(17) = dcmplx(5.119987650555D+02, 5.120455615860D+02)
          csum_ref(18) = dcmplx(5.119973525091D+02, 5.120475499442D+02)
          csum_ref(19) = dcmplx(5.119959279472D+02, 5.120492304629D+02)
          csum_ref(20) = dcmplx(5.119945006558D+02, 5.120506508902D+02)
          csum_ref(21) = dcmplx(5.119930795911D+02, 5.120518503782D+02)
          csum_ref(22) = dcmplx(5.119916728462D+02, 5.120528612016D+02)
          csum_ref(23) = dcmplx(5.119902874185D+02, 5.120537101195D+02)
          csum_ref(24) = dcmplx(5.119889291565D+02, 5.120544194514D+02)
          csum_ref(25) = dcmplx(5.119876028049D+02, 5.120550079284D+02)

       endif


       if (class .ne. 'U') then

          do i = 1, nt
             err = abs( (sums(i) - csum_ref(i)) / csum_ref(i) )
             if (.not.(err .le. epsilon)) goto 100
          end do
          verified = .TRUE.
  100     continue

       endif

       call MPI_COMM_SIZE(MPI_COMM_WORLD, size, ierr)
       if (size .ne. np) then
          write(*, 4010) np
          write(*, 4011)
          write(*, 4012)
 c---------------------------------------------------------------------
 c multiple statements because some Fortran compilers have
 c problems with long strings.
 c---------------------------------------------------------------------
  4010    format( ' Warning: benchmark was compiled for ', i5,
      >           'processors')
  4011    format( ' Must be run on this many processors for official',
      >           ' verification')
  4012    format( ' so memory access is repeatable')
          verified = .false.
       endif

       if (class .ne. 'U') then
          if (verified) then
             write(*,2000)
  2000       format(' Result verification successful')
          else
             write(*,2001)
  2001       format(' Result verification failed')
          endif
       endif
       print *, 'class = ', class

       return
       end