src/npb/disk-image/npb/npb-hooks/NPB3.3.1/NPB3.3-OMP/FT/ft.f - public/gem5-resources - Git at Google

 !-------------------------------------------------------------------------!
 !                                                                         !
 !        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3         !
 !                                                                         !
 !                       O p e n M P     V E R S I O N                     !
 !                                                                         !
 !                                   F T                                   !
 !                                                                         !
 !-------------------------------------------------------------------------!
 !                                                                         !
 !    This benchmark is an OpenMP version of the NPB FT code.              !
 !    It is described in NAS Technical Report 99-011.                      !
 !                                                                         !
 !    Permission to use, copy, distribute and modify this software         !
 !    for any purpose with or without fee is hereby granted.  We           !
 !    request, however, that all derived work reference the NAS            !
 !    Parallel Benchmarks 3.3. This software is provided "as is"           !
 !    without express or implied warranty.                                 !
 !                                                                         !
 !    Information on NPB 3.3, including the technical report, the          !
 !    original specifications, source code, results and information        !
 !    on how to submit new results, is available at:                       !
 !                                                                         !
 !           http://www.nas.nasa.gov/Software/NPB/                         !
 !                                                                         !
 !    Send comments or suggestions to  npb@nas.nasa.gov                    !
 !                                                                         !
 !          NAS Parallel Benchmarks Group                                  !
 !          NASA Ames Research Center                                      !
 !          Mail Stop: T27A-1                                              !
 !          Moffett Field, CA   94035-1000                                 !
 !                                                                         !
 !          E-mail:  npb@nas.nasa.gov                                      !
 !          Fax:     (650) 604-3957                                        !
 !                                                                         !
 !-------------------------------------------------------------------------!

 c---------------------------------------------------------------------
 c
 c Authors: D. Bailey
 c          W. Saphir
 c          H. Jin
 c
 c---------------------------------------------------------------------

 c---------------------------------------------------------------------

 c---------------------------------------------------------------------
 c FT benchmark
 c---------------------------------------------------------------------

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       program ft

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------


       implicit none

       include 'global.h'
       integer i

 c---------------------------------------------------------------------
 c u0, u1, u2 are the main arrays in the problem.
 c Depending on the decomposition, these arrays will have different
 c dimensions. To accomodate all possibilities, we allocate them as
 c one-dimensional arrays and pass them to subroutines for different
 c views
 c  - u0 contains the initial (transformed) initial condition
 c  - u1 and u2 are working arrays
 c  - twiddle contains exponents for the time evolution operator.
 c---------------------------------------------------------------------

       double complex   u0(ntotalp),
      >                 u1(ntotalp)
 c     >                 u2(ntotalp)
       double precision twiddle(ntotalp)
 c---------------------------------------------------------------------
 c Large arrays are in common so that they are allocated on the
 c heap rather than the stack. This common block is not
 c referenced directly anywhere else. Padding is to avoid accidental
 c cache problems, since all array sizes are powers of two.
 c---------------------------------------------------------------------

 c      double complex pad1(3), pad2(3), pad3(3)
 c      common /bigarrays/ u0, pad1, u1, pad2, u2, pad3, twiddle
       double complex pad1(3), pad2(3)
       common /bigarrays/ u0, pad1, u1, pad2, twiddle

       integer iter
       double precision total_time, mflops
       logical verified
       character class

 c---------------------------------------------------------------------
 c Run the entire problem once to make sure all data is touched.
 c This reduces variable startup costs, which is important for such a
 c short benchmark. The other NPB 2 implementations are similar.
 c---------------------------------------------------------------------
       do i = 1, t_max
          call timer_clear(i)
       end do
       call setup()
       call init_ui(u0, u1, twiddle, dims(1), dims(2), dims(3))
       call compute_indexmap(twiddle, dims(1), dims(2), dims(3))
       call compute_initial_conditions(u1, dims(1), dims(2), dims(3))
       call fft_init (dims(1))
       call fft(1, u1, u0)

 c---------------------------------------------------------------------
 c Start over from the beginning. Note that all operations must
 c be timed, in contrast to other benchmarks.
 c---------------------------------------------------------------------
       do i = 1, t_max
          call timer_clear(i)
       end do
 #ifdef HOOKS
       call roi_begin
 #endif

       call timer_start(T_total)
       if (timers_enabled) call timer_start(T_setup)

       call compute_indexmap(twiddle, dims(1), dims(2), dims(3))

       call compute_initial_conditions(u1, dims(1), dims(2), dims(3))

       call fft_init (dims(1))

       if (timers_enabled) call timer_stop(T_setup)
       if (timers_enabled) call timer_start(T_fft)
       call fft(1, u1, u0)
       if (timers_enabled) call timer_stop(T_fft)

       do iter = 1, niter
          if (timers_enabled) call timer_start(T_evolve)
          call evolve(u0, u1, twiddle, dims(1), dims(2), dims(3))
          if (timers_enabled) call timer_stop(T_evolve)
          if (timers_enabled) call timer_start(T_fft)
 c         call fft(-1, u1, u2)
          call fft(-1, u1, u1)
          if (timers_enabled) call timer_stop(T_fft)
          if (timers_enabled) call timer_start(T_checksum)
 c         call checksum(iter, u2, dims(1), dims(2), dims(3))
          call checksum(iter, u1, dims(1), dims(2), dims(3))
          if (timers_enabled) call timer_stop(T_checksum)
       end do

       call verify(nx, ny, nz, niter, verified, class)

       call timer_stop(t_total)
       total_time = timer_read(t_total)

 #ifdef HOOKS
       call roi_end
 #endif

       if( total_time .ne. 0. ) then
          mflops = 1.0d-6*float(ntotal) *
      >             (14.8157+7.19641*log(float(ntotal))
      >          +  (5.23518+7.21113*log(float(ntotal)))*niter)
      >                 /total_time
       else
          mflops = 0.0
       endif
       call print_results('FT', class, nx, ny, nz, niter,
      >  total_time, mflops, '          floating point', verified,
      >  npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7)
       if (timers_enabled) call print_timers()

       end

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine init_ui(u0, u1, twiddle, d1, d2, d3)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

 c---------------------------------------------------------------------
 c touch all the big data
 c---------------------------------------------------------------------

       implicit none
       integer d1, d2, d3
       double complex   u0(d1+1,d2,d3)
       double complex   u1(d1+1,d2,d3)
       double precision twiddle(d1+1,d2,d3)
       integer i, j, k

 !$omp parallel do default(shared) private(i,j,k)
       do k = 1, d3
          do j = 1, d2
             do i = 1, d1
                u0(i,j,k) = 0.d0
                u1(i,j,k) = 0.d0
                twiddle(i,j,k) = 0.d0
             end do
          end do
       end do

       return
       end

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine evolve(u0, u1, twiddle, d1, d2, d3)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

 c---------------------------------------------------------------------
 c evolve u0 -> u1 (t time steps) in fourier space
 c---------------------------------------------------------------------

       implicit none
       include 'global.h'
       integer d1, d2, d3
       double complex   u0(d1+1,d2,d3)
       double complex   u1(d1+1,d2,d3)
       double precision twiddle(d1+1,d2,d3)
       integer i, j, k

 !$omp parallel do default(shared) private(i,j,k)
       do k = 1, d3
          do j = 1, d2
             do i = 1, d1
                u0(i,j,k) = u0(i,j,k) * twiddle(i,j,k)
                u1(i,j,k) = u0(i,j,k)
             end do
          end do
       end do

       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine compute_initial_conditions(u0, d1, d2, d3)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

 c---------------------------------------------------------------------
 c Fill in array u0 with initial conditions from
 c random number generator
 c---------------------------------------------------------------------
       implicit none
       include 'global.h'
       integer d1, d2, d3
       double complex u0(d1+1, d2, d3)
       integer k, j
       double precision x0, start, an, dummy, starts(nz)


       start = seed
 c---------------------------------------------------------------------
 c Jump to the starting element for our first plane.
 c---------------------------------------------------------------------
       call ipow46(a, 0, an)
       dummy = randlc(start, an)
       call ipow46(a, 2*nx*ny, an)

       starts(1) = start
       do k = 2, dims(3)
          dummy = randlc(start, an)
          starts(k) = start
       end do

 c---------------------------------------------------------------------
 c Go through by z planes filling in one square at a time.
 c---------------------------------------------------------------------
 !$omp parallel do default(shared) private(k,j,x0)
       do k = 1, dims(3)
          x0 = starts(k)
          do j = 1, dims(2)
             call vranlc(2*nx, x0, a, u0(1, j, k))
          end do
       end do

       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine ipow46(a, exponent, result)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

 c---------------------------------------------------------------------
 c compute a^exponent mod 2^46
 c---------------------------------------------------------------------

       implicit none
       double precision a, result, dummy, q, r
       integer exponent, n, n2
       external randlc
       double precision randlc
 c---------------------------------------------------------------------
 c Use
 c   a^n = a^(n/2)*a^(n/2) if n even else
 c   a^n = a*a^(n-1)       if n odd
 c---------------------------------------------------------------------
       result = 1
       if (exponent .eq. 0) return
       q = a
       r = 1
       n = exponent


       do while (n .gt. 1)
          n2 = n/2
          if (n2 * 2 .eq. n) then
             dummy = randlc(q, q)
             n = n2
          else
             dummy = randlc(r, q)
             n = n-1
          endif
       end do
       dummy = randlc(r, q)
       result = r
       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine setup

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none
       include 'global.h'

       integer fstatus
 !$    integer  omp_get_max_threads
 !$    external omp_get_max_threads
       debug = .FALSE.

       open (unit=2,file='timer.flag',status='old',iostat=fstatus)
       if (fstatus .eq. 0) then
          timers_enabled = .true.
          close(2)
       else
          timers_enabled = .false.
       endif

       write(*, 1000)

       niter = niter_default

       write(*, 1001) nx, ny, nz
       write(*, 1002) niter
 !$    write(*, 1003) omp_get_max_threads()
       write(*, *)


  1000 format(//,' NAS Parallel Benchmarks (NPB3.3-OMP)',
      >          ' - FT Benchmark', /)
  1001 format(' Size                : ', i4, 'x', i4, 'x', i4)
  1002 format(' Iterations                  :', i7)
  1003 format(' Number of available threads :', i7)

       dims(1) = nx
       dims(2) = ny
       dims(3) = nz


 c---------------------------------------------------------------------
 c Set up info for blocking of ffts and transposes.  This improves
 c performance on cache-based systems. Blocking involves
 c working on a chunk of the problem at a time, taking chunks
 c along the first, second, or third dimension.
 c
 c - In cffts1 blocking is on 2nd dimension (with fft on 1st dim)
 c - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims)

 c Since 1st dim is always in processor, we'll assume it's long enough
 c (default blocking factor is 16 so min size for 1st dim is 16)
 c The only case we have to worry about is cffts1 in a 2d decomposition.
 c so the blocking factor should not be larger than the 2nd dimension.
 c---------------------------------------------------------------------

       fftblock = fftblock_default
       fftblockpad = fftblockpad_default

       if (fftblock .ne. fftblock_default) fftblockpad = fftblock+3

       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine compute_indexmap(twiddle, d1, d2, d3)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

 c---------------------------------------------------------------------
 c compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2
 c for time evolution exponent.
 c---------------------------------------------------------------------

       implicit none
       include 'global.h'
       integer d1, d2, d3
       double precision twiddle(d1+1, d2, d3)
       integer i, j, k, kk, kk2, jj, kj2, ii
       double precision ap

 c---------------------------------------------------------------------
 c basically we want to convert the fortran indices
 c   1 2 3 4 5 6 7 8
 c to
 c   0 1 2 3 -4 -3 -2 -1
 c The following magic formula does the trick:
 c mod(i-1+n/2, n) - n/2
 c---------------------------------------------------------------------

       ap = - 4.d0 * alpha * pi *pi

 !$omp parallel do default(shared) private(i,j,k,kk,kk2,jj,kj2,ii)
       do k = 1, dims(3)
          kk =  mod(k-1+nz/2, nz) - nz/2
          kk2 = kk*kk
          do j = 1, dims(2)
             jj = mod(j-1+ny/2, ny) - ny/2
             kj2 = jj*jj+kk2
             do i = 1, dims(1)
                ii = mod(i-1+nx/2, nx) - nx/2
                twiddle(i,j,k) = dexp(ap*dble(ii*ii+kj2))
             end do
          end do
       end do

       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine print_timers()

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none
       integer i
       include 'global.h'
       double precision t, t_m
       character*25 tstrings(T_max)
       data tstrings / '          total ',
      >                '          setup ',
      >                '            fft ',
      >                '         evolve ',
      >                '       checksum ',
      >                '           fftx ',
      >                '           ffty ',
      >                '           fftz ' /

       t_m = timer_read(T_total)
       if (t_m .le. 0.0d0) t_m = 1.0d0
       do i = 1, t_max
          t = timer_read(i)
          write(*, 100) i, tstrings(i), t, t*100.0/t_m
       end do
  100  format(' timer ', i2, '(', A16,  ') :', F9.4, ' (',F6.2,'%)')
       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine fft(dir, x1, x2)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none
       include 'global.h'
       integer dir
       double complex x1(ntotalp), x2(ntotalp)

       double complex y1(fftblockpad_default*maxdim),
      >               y2(fftblockpad_default*maxdim)

 c---------------------------------------------------------------------
 c note: args x1, x2 must be different arrays
 c note: args for cfftsx are (direction, layout, xin, xout, scratch)
 c       xin/xout may be the same and it can be somewhat faster
 c       if they are
 c---------------------------------------------------------------------

       if (dir .eq. 1) then
          call cffts1(1, dims(1), dims(2), dims(3), x1, x1, y1, y2)
          call cffts2(1, dims(1), dims(2), dims(3), x1, x1, y1, y2)
          call cffts3(1, dims(1), dims(2), dims(3), x1, x2, y1, y2)
       else
          call cffts3(-1, dims(1), dims(2), dims(3), x1, x1, y1, y2)
          call cffts2(-1, dims(1), dims(2), dims(3), x1, x1, y1, y2)
          call cffts1(-1, dims(1), dims(2), dims(3), x1, x2, y1, y2)
       endif
       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine cffts1(is, d1, d2, d3, x, xout, y1, y2)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none

       include 'global.h'
       integer is, d1, d2, d3, logd1
       double complex x(d1+1,d2,d3)
       double complex xout(d1+1,d2,d3)
       double complex y1(fftblockpad, d1), y2(fftblockpad, d1)
       integer i, j, k, jj

       logd1 = ilog2(d1)

       if (timers_enabled) call timer_start(T_fftx)
 !$omp parallel do default(shared) private(i,j,k,jj,y1,y2)
 !$omp&  shared(is,logd1,d1)
       do k = 1, d3
          do jj = 0, d2 - fftblock, fftblock
             do j = 1, fftblock
                do i = 1, d1
                   y1(j,i) = x(i,j+jj,k)
                enddo
             enddo

             call cfftz (is, logd1, d1, y1, y2)


             do j = 1, fftblock
                do i = 1, d1
                   xout(i,j+jj,k) = y1(j,i)
                enddo
             enddo
          enddo
       enddo
       if (timers_enabled) call timer_stop(T_fftx)

       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine cffts2(is, d1, d2, d3, x, xout, y1, y2)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none

       include 'global.h'
       integer is, d1, d2, d3, logd2
       double complex x(d1+1,d2,d3)
       double complex xout(d1+1,d2,d3)
       double complex y1(fftblockpad, d2), y2(fftblockpad, d2)
       integer i, j, k, ii

       logd2 = ilog2(d2)

       if (timers_enabled) call timer_start(T_ffty)
 !$omp parallel do default(shared) private(i,j,k,ii,y1,y2)
 !$omp&  shared(is,logd2,d2)
       do k = 1, d3
         do ii = 0, d1 - fftblock, fftblock
            do j = 1, d2
               do i = 1, fftblock
                  y1(i,j) = x(i+ii,j,k)
               enddo
            enddo

            call cfftz (is, logd2, d2, y1, y2)

            do j = 1, d2
               do i = 1, fftblock
                  xout(i+ii,j,k) = y1(i,j)
               enddo
            enddo
         enddo
       enddo
       if (timers_enabled) call timer_stop(T_ffty)

       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine cffts3(is, d1, d2, d3, x, xout, y1, y2)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none

       include 'global.h'
       integer is, d1, d2, d3, logd3
       double complex x(d1+1,d2,d3)
       double complex xout(d1+1,d2,d3)
       double complex y1(fftblockpad, d3), y2(fftblockpad, d3)
       integer i, j, k, ii

       logd3 = ilog2(d3)

       if (timers_enabled) call timer_start(T_fftz)
 !$omp parallel do default(shared) private(i,j,k,ii,y1,y2)
 !$omp&  shared(is)
       do j = 1, d2
         do ii = 0, d1 - fftblock, fftblock
            do k = 1, d3
               do i = 1, fftblock
                  y1(i,k) = x(i+ii,j,k)
               enddo
            enddo

            call cfftz (is, logd3, d3, y1, y2)

            do k = 1, d3
               do i = 1, fftblock
                  xout(i+ii,j,k) = y1(i,k)
               enddo
            enddo
         enddo
       enddo
       if (timers_enabled) call timer_stop(T_fftz)

       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine fft_init (n)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

 c---------------------------------------------------------------------
 c compute the roots-of-unity array that will be used for subsequent FFTs.
 c---------------------------------------------------------------------

       implicit none
       include 'global.h'

       integer m,n,nu,ku,i,j,ln
       double precision t, ti


 c---------------------------------------------------------------------
 c   Initialize the U array with sines and cosines in a manner that permits
 c   stride one access at each FFT iteration.
 c---------------------------------------------------------------------
       nu = n
       m = ilog2(n)
       u(1) = m
       ku = 2
       ln = 1

       do j = 1, m
          t = pi / ln

          do i = 0, ln - 1
             ti = i * t
             u(i+ku) = dcmplx (cos (ti), sin(ti))
          enddo

          ku = ku + ln
          ln = 2 * ln
       enddo

       return
       end

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine cfftz (is, m, n, x, y)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

 c---------------------------------------------------------------------
 c   Computes NY N-point complex-to-complex FFTs of X using an algorithm due
 c   to Swarztrauber.  X is both the input and the output array, while Y is a
 c   scratch array.  It is assumed that N = 2^M.  Before calling CFFTZ to
 c   perform FFTs, the array U must be initialized by calling CFFTZ with IS
 c   set to 0 and M set to MX, where MX is the maximum value of M for any
 c   subsequent call.
 c---------------------------------------------------------------------

       implicit none
       include 'global.h'

       integer is,m,n,i,j,l,mx
       double complex x, y

       dimension x(fftblockpad,n), y(fftblockpad,n)

 c---------------------------------------------------------------------
 c   Check if input parameters are invalid.
 c---------------------------------------------------------------------
       mx = u(1)
       if ((is .ne. 1 .and. is .ne. -1) .or. m .lt. 1 .or. m .gt. mx)
      >  then
         write (*, 1)  is, m, mx
  1      format ('CFFTZ: Either U has not been initialized, or else'/
      >    'one of the input parameters is invalid', 3I5)
         stop
       endif

 c---------------------------------------------------------------------
 c   Perform one variant of the Stockham FFT.
 c---------------------------------------------------------------------
       do l = 1, m, 2
         call fftz2 (is, l, m, n, fftblock, fftblockpad, u, x, y)
         if (l .eq. m) goto 160
         call fftz2 (is, l + 1, m, n, fftblock, fftblockpad, u, y, x)
       enddo

       goto 180

 c---------------------------------------------------------------------
 c   Copy Y to X.
 c---------------------------------------------------------------------
  160  do j = 1, n
         do i = 1, fftblock
           x(i,j) = y(i,j)
         enddo
       enddo

  180  continue

       return
       end

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine fftz2 (is, l, m, n, ny, ny1, u, x, y)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

 c---------------------------------------------------------------------
 c   Performs the L-th iteration of the second variant of the Stockham FFT.
 c---------------------------------------------------------------------

       implicit none

       integer is,k,l,m,n,ny,ny1,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22
       double complex u,x,y,u1,x11,x21
       dimension u(n), x(ny1,n), y(ny1,n)


 c---------------------------------------------------------------------
 c   Set initial parameters.
 c---------------------------------------------------------------------

       n1 = n / 2
       lk = 2 ** (l - 1)
       li = 2 ** (m - l)
       lj = 2 * lk
       ku = li + 1

       do i = 0, li - 1
         i11 = i * lk + 1
         i12 = i11 + n1
         i21 = i * lj + 1
         i22 = i21 + lk
         if (is .ge. 1) then
           u1 = u(ku+i)
         else
           u1 = dconjg (u(ku+i))
         endif

 c---------------------------------------------------------------------
 c   This loop is vectorizable.
 c---------------------------------------------------------------------
         do k = 0, lk - 1
           do j = 1, ny
             x11 = x(j,i11+k)
             x21 = x(j,i12+k)
             y(j,i21+k) = x11 + x21
             y(j,i22+k) = u1 * (x11 - x21)
           enddo
         enddo
       enddo

       return
       end

 c---------------------------------------------------------------------


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       integer function ilog2(n)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none
       integer n, nn, lg
       if (n .eq. 1) then
          ilog2=0
          return
       endif
       lg = 1
       nn = 2
       do while (nn .lt. n)
          nn = nn*2
          lg = lg+1
       end do
       ilog2 = lg
       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine checksum(i, u1, d1, d2, d3)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none
       include 'global.h'
       integer i, d1, d2, d3
       double complex u1(d1+1,d2,d3)
       integer j, q,r,s
       double complex chk
       chk = (0.0,0.0)

 !$omp parallel do default(shared) private(i,q,r,s) reduction(+:chk)
       do j=1,1024
          q = mod(j, nx)+1
          r = mod(3*j,ny)+1
          s = mod(5*j,nz)+1
          chk=chk+u1(q,r,s)
       end do

       chk = chk/dble(ntotal)

       write (*, 30) i, chk
  30   format (' T =',I5,5X,'Checksum =',1P2D22.12)
       sums(i) = chk
       return
       end


 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       subroutine verify (d1, d2, d3, nt, verified, class)

 c---------------------------------------------------------------------
 c---------------------------------------------------------------------

       implicit none
       include 'global.h'
       integer d1, d2, d3, nt
       character class
       logical verified
       integer i
       double precision err, epsilon

 c---------------------------------------------------------------------
 c   Reference checksums
 c---------------------------------------------------------------------
       double complex csum_ref(25)


       class = 'U'

       epsilon = 1.0d-12
       verified = .FALSE.

       if (d1 .eq. 64 .and.
      >    d2 .eq. 64 .and.
      >    d3 .eq. 64 .and.
      >    nt .eq. 6) then
 c---------------------------------------------------------------------
 c   Sample size reference checksums
 c---------------------------------------------------------------------
          class = 'S'
          csum_ref(1) = dcmplx(5.546087004964D+02, 4.845363331978D+02)
          csum_ref(2) = dcmplx(5.546385409189D+02, 4.865304269511D+02)
          csum_ref(3) = dcmplx(5.546148406171D+02, 4.883910722336D+02)
          csum_ref(4) = dcmplx(5.545423607415D+02, 4.901273169046D+02)
          csum_ref(5) = dcmplx(5.544255039624D+02, 4.917475857993D+02)
          csum_ref(6) = dcmplx(5.542683411902D+02, 4.932597244941D+02)

       else if (d1 .eq. 128 .and.
      >    d2 .eq. 128 .and.
      >    d3 .eq. 32 .and.
      >    nt .eq. 6) then
 c---------------------------------------------------------------------
 c   Class W size reference checksums
 c---------------------------------------------------------------------
          class = 'W'
          csum_ref(1) = dcmplx(5.673612178944D+02, 5.293246849175D+02)
          csum_ref(2) = dcmplx(5.631436885271D+02, 5.282149986629D+02)
          csum_ref(3) = dcmplx(5.594024089970D+02, 5.270996558037D+02)
          csum_ref(4) = dcmplx(5.560698047020D+02, 5.260027904925D+02)
          csum_ref(5) = dcmplx(5.530898991250D+02, 5.249400845633D+02)
          csum_ref(6) = dcmplx(5.504159734538D+02, 5.239212247086D+02)

       else if (d1 .eq. 256 .and.
      >    d2 .eq. 256 .and.
      >    d3 .eq. 128 .and.
      >    nt .eq. 6) then
 c---------------------------------------------------------------------
 c   Class A size reference checksums
 c---------------------------------------------------------------------
          class = 'A'
          csum_ref(1) = dcmplx(5.046735008193D+02, 5.114047905510D+02)
          csum_ref(2) = dcmplx(5.059412319734D+02, 5.098809666433D+02)
          csum_ref(3) = dcmplx(5.069376896287D+02, 5.098144042213D+02)
          csum_ref(4) = dcmplx(5.077892868474D+02, 5.101336130759D+02)
          csum_ref(5) = dcmplx(5.085233095391D+02, 5.104914655194D+02)
          csum_ref(6) = dcmplx(5.091487099959D+02, 5.107917842803D+02)

       else if (d1 .eq. 512 .and.
      >    d2 .eq. 256 .and.
      >    d3 .eq. 256 .and.
      >    nt .eq. 20) then
 c---------------------------------------------------------------------
 c   Class B size reference checksums
 c---------------------------------------------------------------------
          class = 'B'
          csum_ref(1)  = dcmplx(5.177643571579D+02, 5.077803458597D+02)
          csum_ref(2)  = dcmplx(5.154521291263D+02, 5.088249431599D+02)
          csum_ref(3)  = dcmplx(5.146409228649D+02, 5.096208912659D+02)
          csum_ref(4)  = dcmplx(5.142378756213D+02, 5.101023387619D+02)
          csum_ref(5)  = dcmplx(5.139626667737D+02, 5.103976610617D+02)
          csum_ref(6)  = dcmplx(5.137423460082D+02, 5.105948019802D+02)
          csum_ref(7)  = dcmplx(5.135547056878D+02, 5.107404165783D+02)
          csum_ref(8)  = dcmplx(5.133910925466D+02, 5.108576573661D+02)
          csum_ref(9)  = dcmplx(5.132470705390D+02, 5.109577278523D+02)
          csum_ref(10) = dcmplx(5.131197729984D+02, 5.110460304483D+02)
          csum_ref(11) = dcmplx(5.130070319283D+02, 5.111252433800D+02)
          csum_ref(12) = dcmplx(5.129070537032D+02, 5.111968077718D+02)
          csum_ref(13) = dcmplx(5.128182883502D+02, 5.112616233064D+02)
          csum_ref(14) = dcmplx(5.127393733383D+02, 5.113203605551D+02)
          csum_ref(15) = dcmplx(5.126691062020D+02, 5.113735928093D+02)
          csum_ref(16) = dcmplx(5.126064276004D+02, 5.114218460548D+02)
          csum_ref(17) = dcmplx(5.125504076570D+02, 5.114656139760D+02)
          csum_ref(18) = dcmplx(5.125002331720D+02, 5.115053595966D+02)
          csum_ref(19) = dcmplx(5.124551951846D+02, 5.115415130407D+02)
          csum_ref(20) = dcmplx(5.124146770029D+02, 5.115744692211D+02)

       else if (d1 .eq. 512 .and.
      >    d2 .eq. 512 .and.
      >    d3 .eq. 512 .and.
      >    nt .eq. 20) then
 c---------------------------------------------------------------------
 c   Class C size reference checksums
 c---------------------------------------------------------------------
          class = 'C'
          csum_ref(1)  = dcmplx(5.195078707457D+02, 5.149019699238D+02)
          csum_ref(2)  = dcmplx(5.155422171134D+02, 5.127578201997D+02)
          csum_ref(3)  = dcmplx(5.144678022222D+02, 5.122251847514D+02)
          csum_ref(4)  = dcmplx(5.140150594328D+02, 5.121090289018D+02)
          csum_ref(5)  = dcmplx(5.137550426810D+02, 5.121143685824D+02)
          csum_ref(6)  = dcmplx(5.135811056728D+02, 5.121496764568D+02)
          csum_ref(7)  = dcmplx(5.134569343165D+02, 5.121870921893D+02)
          csum_ref(8)  = dcmplx(5.133651975661D+02, 5.122193250322D+02)
          csum_ref(9)  = dcmplx(5.132955192805D+02, 5.122454735794D+02)
          csum_ref(10) = dcmplx(5.132410471738D+02, 5.122663649603D+02)
          csum_ref(11) = dcmplx(5.131971141679D+02, 5.122830879827D+02)
          csum_ref(12) = dcmplx(5.131605205716D+02, 5.122965869718D+02)
          csum_ref(13) = dcmplx(5.131290734194D+02, 5.123075927445D+02)
          csum_ref(14) = dcmplx(5.131012720314D+02, 5.123166486553D+02)
          csum_ref(15) = dcmplx(5.130760908195D+02, 5.123241541685D+02)
          csum_ref(16) = dcmplx(5.130528295923D+02, 5.123304037599D+02)
          csum_ref(17) = dcmplx(5.130310107773D+02, 5.123356167976D+02)
          csum_ref(18) = dcmplx(5.130103090133D+02, 5.123399592211D+02)
          csum_ref(19) = dcmplx(5.129905029333D+02, 5.123435588985D+02)
          csum_ref(20) = dcmplx(5.129714421109D+02, 5.123465164008D+02)

       else if (d1 .eq. 2048 .and.
      >    d2 .eq. 1024 .and.
      >    d3 .eq. 1024 .and.
      >    nt .eq. 25) then
 c---------------------------------------------------------------------
 c   Class D size reference checksums
 c---------------------------------------------------------------------
          class = 'D'
          csum_ref(1)  = dcmplx(5.122230065252D+02, 5.118534037109D+02)
          csum_ref(2)  = dcmplx(5.120463975765D+02, 5.117061181082D+02)
          csum_ref(3)  = dcmplx(5.119865766760D+02, 5.117096364601D+02)
          csum_ref(4)  = dcmplx(5.119518799488D+02, 5.117373863950D+02)
          csum_ref(5)  = dcmplx(5.119269088223D+02, 5.117680347632D+02)
          csum_ref(6)  = dcmplx(5.119082416858D+02, 5.117967875532D+02)
          csum_ref(7)  = dcmplx(5.118943814638D+02, 5.118225281841D+02)
          csum_ref(8)  = dcmplx(5.118842385057D+02, 5.118451629348D+02)
          csum_ref(9)  = dcmplx(5.118769435632D+02, 5.118649119387D+02)
          csum_ref(10) = dcmplx(5.118718203448D+02, 5.118820803844D+02)
          csum_ref(11) = dcmplx(5.118683569061D+02, 5.118969781011D+02)
          csum_ref(12) = dcmplx(5.118661708593D+02, 5.119098918835D+02)
          csum_ref(13) = dcmplx(5.118649768950D+02, 5.119210777066D+02)
          csum_ref(14) = dcmplx(5.118645605626D+02, 5.119307604484D+02)
          csum_ref(15) = dcmplx(5.118647586618D+02, 5.119391362671D+02)
          csum_ref(16) = dcmplx(5.118654451572D+02, 5.119463757241D+02)
          csum_ref(17) = dcmplx(5.118665212451D+02, 5.119526269238D+02)
          csum_ref(18) = dcmplx(5.118679083821D+02, 5.119580184108D+02)
          csum_ref(19) = dcmplx(5.118695433664D+02, 5.119626617538D+02)
          csum_ref(20) = dcmplx(5.118713748264D+02, 5.119666538138D+02)
          csum_ref(21) = dcmplx(5.118733606701D+02, 5.119700787219D+02)
          csum_ref(22) = dcmplx(5.118754661974D+02, 5.119730095953D+02)
          csum_ref(23) = dcmplx(5.118776626738D+02, 5.119755100241D+02)
          csum_ref(24) = dcmplx(5.118799262314D+02, 5.119776353561D+02)
          csum_ref(25) = dcmplx(5.118822370068D+02, 5.119794338060D+02)

       else if (d1 .eq. 4096 .and.
      >    d2 .eq. 2048 .and.
      >    d3 .eq. 2048 .and.
      >    nt .eq. 25) then
 c---------------------------------------------------------------------
 c   Class E size reference checksums
 c---------------------------------------------------------------------
          class = 'E'
          csum_ref(1)  = dcmplx(5.121601045346D+02, 5.117395998266D+02)
          csum_ref(2)  = dcmplx(5.120905403678D+02, 5.118614716182D+02)
          csum_ref(3)  = dcmplx(5.120623229306D+02, 5.119074203747D+02)
          csum_ref(4)  = dcmplx(5.120438418997D+02, 5.119345900733D+02)
          csum_ref(5)  = dcmplx(5.120311521872D+02, 5.119551325550D+02)
          csum_ref(6)  = dcmplx(5.120226088809D+02, 5.119720179919D+02)
          csum_ref(7)  = dcmplx(5.120169296534D+02, 5.119861371665D+02)
          csum_ref(8)  = dcmplx(5.120131225172D+02, 5.119979364402D+02)
          csum_ref(9)  = dcmplx(5.120104767108D+02, 5.120077674092D+02)
          csum_ref(10) = dcmplx(5.120085127969D+02, 5.120159443121D+02)
          csum_ref(11) = dcmplx(5.120069224127D+02, 5.120227453670D+02)
          csum_ref(12) = dcmplx(5.120055158164D+02, 5.120284096041D+02)
          csum_ref(13) = dcmplx(5.120041820159D+02, 5.120331373793D+02)
          csum_ref(14) = dcmplx(5.120028605402D+02, 5.120370938679D+02)
          csum_ref(15) = dcmplx(5.120015223011D+02, 5.120404138831D+02)
          csum_ref(16) = dcmplx(5.120001570022D+02, 5.120432068837D+02)
          csum_ref(17) = dcmplx(5.119987650555D+02, 5.120455615860D+02)
          csum_ref(18) = dcmplx(5.119973525091D+02, 5.120475499442D+02)
          csum_ref(19) = dcmplx(5.119959279472D+02, 5.120492304629D+02)
          csum_ref(20) = dcmplx(5.119945006558D+02, 5.120506508902D+02)
          csum_ref(21) = dcmplx(5.119930795911D+02, 5.120518503782D+02)
          csum_ref(22) = dcmplx(5.119916728462D+02, 5.120528612016D+02)
          csum_ref(23) = dcmplx(5.119902874185D+02, 5.120537101195D+02)
          csum_ref(24) = dcmplx(5.119889291565D+02, 5.120544194514D+02)
          csum_ref(25) = dcmplx(5.119876028049D+02, 5.120550079284D+02)

       endif


       if (class .ne. 'U') then

          do i = 1, nt
             err = abs( (sums(i) - csum_ref(i)) / csum_ref(i) )
             if (.not.(err .le. epsilon)) goto 100
          end do
          verified = .TRUE.
  100     continue

       endif


       if (class .ne. 'U') then
          if (verified) then
             write(*,2000)
  2000       format(' Result verification successful')
          else
             write(*,2001)
  2001       format(' Result verification failed')
          endif
       endif
       print *, 'class = ', class

       return
       end