!-------------------------------------------------------------------------!
!                                                                         !
!        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3         !
!                                                                         !
!                                   E P                                   !
!                                                                         !
!-------------------------------------------------------------------------!
!                                                                         !
!    This benchmark is part of the NAS Parallel Benchmark 3.3 suite.      !
!    It is described in NAS Technical Reports 95-020 and 02-007           !
!                                                                         !
!    Permission to use, copy, distribute and modify this software         !
!    for any purpose with or without fee is hereby granted.  We           !
!    request, however, that all derived work reference the NAS            !
!    Parallel Benchmarks 3.3. This software is provided "as is"           !
!    without express or implied warranty.                                 !
!                                                                         !
!    Information on NPB 3.3, including the technical report, the          !
!    original specifications, source code, results and information        !
!    on how to submit new results, is available at:                       !
!                                                                         !
!           http://www.nas.nasa.gov/Software/NPB/                         !
!                                                                         !
!    Send comments or suggestions to  npb@nas.nasa.gov                    !
!                                                                         !
!          NAS Parallel Benchmarks Group                                  !
!          NASA Ames Research Center                                      !
!          Mail Stop: T27A-1                                              !
!          Moffett Field, CA   94035-1000                                 !
!                                                                         !
!          E-mail:  npb@nas.nasa.gov                                      !
!          Fax:     (650) 604-3957                                        !
!                                                                         !
!-------------------------------------------------------------------------!
c---------------------------------------------------------------------
c
c   Authors: P. O. Frederickson
c            D. H. Bailey
c            A. C. Woo
c            R. F. Van der Wijngaart
c---------------------------------------------------------------------
c---------------------------------------------------------------------
program EMBAR
c---------------------------------------------------------------------
C
c This is the MPI version of the APP Benchmark 1,
c the "embarassingly parallel" benchmark.
c
c
c M is the Log_2 of the number of complex pairs of uniform (0, 1) random
c numbers. MK is the Log_2 of the size of each batch of uniform random
c numbers. MK can be set for convenience on a given system, since it does
c not affect the results.
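c
c The derived parameters below follow from this: MM = M - MK, so that
c NN = 2^MM is the total number of batches and NK = 2^MK is the number
c of complex pairs (2*NK uniform numbers) in each batch.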
implicit none
include 'npbparams.h'
include 'mpinpb.h'
double precision Mops, epsilon, a, s, t1, t2, t3, t4, x, x1,
> x2, q, sx, sy, tm, an, tt, gc, dum(3),
> timer_read
double precision sx_verify_value, sy_verify_value, sx_err, sy_err
integer mk, mm, nn, nk, nq, np, ierr, node, no_nodes,
> i, ik, kk, l, k, nit, ierrcode, no_large_nodes,
> np_add, k_offset, j
logical verified, timers_enabled
external randlc, timer_read
double precision randlc, qq
character*15 size
integer fstatus
integer t_total, t_gpairs, t_randn, t_rcomm, t_last
parameter (t_total=1, t_gpairs=2, t_randn=3, t_rcomm=4, t_last=4)
double precision tsum(t_last+2), t1m(t_last+2),
> tming(t_last+2), tmaxg(t_last+2)
character t_recs(t_last+2)*8
parameter (mk = 16, mm = m - mk, nn = 2 ** mm,
> nk = 2 ** mk, nq = 10, epsilon=1.d-8,
> a = 1220703125.d0, s = 271828183.d0)
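c
c a = 5^13 is the multiplier and s the seed of the linear congruential
c generator x_{k+1} = a * x_k (mod 2^46) implemented by randlc/vranlc.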
common/storage/ x(2*nk), q(0:nq-1), qq(10000)
data dum /1.d0, 1.d0, 1.d0/
data t_recs/'total', 'gpairs', 'randn', 'rcomm',
> ' totcomp', ' totcomm'/
call mpi_init(ierr)
call mpi_comm_rank(MPI_COMM_WORLD,node,ierr)
call mpi_comm_size(MPI_COMM_WORLD,no_nodes,ierr)
root = 0
if (.not. convertdouble) then
dp_type = MPI_DOUBLE_PRECISION
else
dp_type = MPI_REAL
endif
if (node.eq.root) then
c Because the size of the problem is too large to store in a 32-bit
c integer for some classes, we put it into a string for printing.
c The floating-point edit descriptor inserts a decimal point, which
c has to be stripped off (internal file write).
write(*, 1000)
write(size, '(f15.0)' ) 2.d0**(m+1)
j = 15
if (size(j:j) .eq. '.') j = j - 1
write (*,1001) size(1:j)
write(*, 1003) no_nodes
1000 format(/,' NAS Parallel Benchmarks 3.3 -- EP Benchmark',/)
1001 format(' Number of random numbers generated: ', a15)
1003 format(' Number of active processes: ', 2x, i13, /)
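c The mere presence of a file named 'timer.flag' in the working
c directory enables the section timers; its contents are ignored.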
open (unit=2,file='timer.flag',status='old',iostat=fstatus)
timers_enabled = .false.
if (fstatus .eq. 0) then
timers_enabled = .true.
close(2)
endif
endif
call mpi_bcast(timers_enabled, 1, MPI_LOGICAL, root,
> MPI_COMM_WORLD, ierr)
verified = .false.
c Compute the number of "batches" of random number pairs generated
c per processor. Adjust if the number of processors does not evenly
c divide the total number of batches.
np = nn / no_nodes
no_large_nodes = mod(nn, no_nodes)
if (node .lt. no_large_nodes) then
np_add = 1
else
np_add = 0
endif
np = np + np_add
if (np .eq. 0) then
write (6, 1) no_nodes, nn
1 format ('Too many nodes:',2i6)
ierrcode = 1
call mpi_abort(MPI_COMM_WORLD,ierrcode,ierr)
stop
endif
c Call the random number generator functions and initialize
c the x-array to reduce the effects of paging on the timings.
c Also, call all mathematical functions that are used. Make
c sure these initializations cannot be eliminated as dead code.
call vranlc(0, dum(1), dum(2), dum(3))
dum(1) = randlc(dum(2), dum(3))
do 5 i = 1, 2*nk
x(i) = -1.d99
5 continue
Mops = log(sqrt(abs(max(1.d0,1.d0))))
c---------------------------------------------------------------------
c Synchronize before placing time stamp
c---------------------------------------------------------------------
do i = 1, t_last
call timer_clear(i)
end do
call mpi_barrier(MPI_COMM_WORLD, ierr)
call timer_start(1)
t1 = a
call vranlc(0, t1, a, x)
c Compute AN = A ^ (2 * NK) (mod 2^46).
t1 = a
do 100 i = 1, mk + 1
t2 = randlc(t1, t1)
100 continue
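c Each randlc(t1, t1) call squares t1 modulo 2^46, so after mk+1
c squarings t1 = a^(2^(mk+1)) = a^(2*nk) (mod 2^46).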
an = t1
tt = s
gc = 0.d0
sx = 0.d0
sy = 0.d0
do 110 i = 0, nq - 1
q(i) = 0.d0
110 continue
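c sx and sy accumulate the sums of the Gaussian deviates, q(0:nq-1)
c the annulus counts; gc will hold the global count of accepted pairs.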
c Each iteration of this loop may be performed independently. We
c compute the k offsets separately because some nodes have more
c batches to generate than others.
if (np_add .eq. 1) then
k_offset = node * np -1
else
k_offset = no_large_nodes*(np+1) + (node-no_large_nodes)*np -1
endif
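c With kk = k_offset + k below, batches are numbered 0 .. nn-1
c globally: the first no_large_nodes ranks hold one extra batch each,
c and the remaining ranks start after those larger blocks.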
do 150 k = 1, np
kk = k_offset + k
t1 = s
t2 = an
c Find starting seed t1 for this kk.
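c This is modular binary exponentiation: t2 holds successive squarings
c of an and is multiplied into t1 for each set bit of kk, leaving
c t1 = s * an^kk (mod 2^46), i.e. the seed advanced 2*nk*kk positions
c into the random sequence.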
do 120 i = 1, 100
ik = kk / 2
if (2 * ik .ne. kk) t3 = randlc(t1, t2)
if (ik .eq. 0) goto 130
t3 = randlc(t2, t2)
kk = ik
120 continue
c Compute uniform pseudorandom numbers.
130 continue
if (timers_enabled) call timer_start(t_randn)
call vranlc(2 * nk, t1, a, x)
if (timers_enabled) call timer_stop(t_randn)
c Compute Gaussian deviates by the acceptance-rejection method and
c tally counts in concentric square annuli. This loop is not
c vectorizable.
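c (This is the Marsaglia polar method: for (x1,x2) uniform on the
c square and inside the unit disk, t1 .le. 1, the transform yields two
c independent Gaussian deviates t3 and t4; annulus l counts pairs with
c l .le. max(|t3|,|t4|) .lt. l+1.)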
if (timers_enabled) call timer_start(t_gpairs)
do 140 i = 1, nk
x1 = 2.d0 * x(2*i-1) - 1.d0
x2 = 2.d0 * x(2*i) - 1.d0
t1 = x1 ** 2 + x2 ** 2
if (t1 .le. 1.d0) then
t2 = sqrt(-2.d0 * log(t1) / t1)
t3 = (x1 * t2)
t4 = (x2 * t2)
l = max(abs(t3), abs(t4))
q(l) = q(l) + 1.d0
sx = sx + t3
sy = sy + t4
endif
140 continue
if (timers_enabled) call timer_stop(t_gpairs)
150 continue
if (timers_enabled) call timer_start(t_rcomm)
call mpi_allreduce(sx, x, 1, dp_type,
> MPI_SUM, MPI_COMM_WORLD, ierr)
sx = x(1)
call mpi_allreduce(sy, x, 1, dp_type,
> MPI_SUM, MPI_COMM_WORLD, ierr)
sy = x(1)
call mpi_allreduce(q, x, nq, dp_type,
> MPI_SUM, MPI_COMM_WORLD, ierr)
if (timers_enabled) call timer_stop(t_rcomm)
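c x(1:nq) served as scratch for the reduction; copy the global
c annulus counts back into q.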
do i = 1, nq
q(i-1) = x(i)
enddo
do 160 i = 0, nq - 1
gc = gc + q(i)
160 continue
call timer_stop(1)
tm = timer_read(1)
call mpi_allreduce(tm, x, 1, dp_type,
> MPI_MAX, MPI_COMM_WORLD, ierr)
tm = x(1)
if (node.eq.root) then
nit=0
verified = .true.
if (m.eq.24) then
sx_verify_value = -3.247834652034740D+3
sy_verify_value = -6.958407078382297D+3
elseif (m.eq.25) then
sx_verify_value = -2.863319731645753D+3
sy_verify_value = -6.320053679109499D+3
elseif (m.eq.28) then
sx_verify_value = -4.295875165629892D+3
sy_verify_value = -1.580732573678431D+4
elseif (m.eq.30) then
sx_verify_value = 4.033815542441498D+4
sy_verify_value = -2.660669192809235D+4
elseif (m.eq.32) then
sx_verify_value = 4.764367927995374D+4
sy_verify_value = -8.084072988043731D+4
elseif (m.eq.36) then
sx_verify_value = 1.982481200946593D+5
sy_verify_value = -1.020596636361769D+5
elseif (m.eq.40) then
sx_verify_value = -5.319717441530D+05
sy_verify_value = -3.688834557731D+05
else
verified = .false.
endif
if (verified) then
sx_err = abs((sx - sx_verify_value)/sx_verify_value)
sy_err = abs((sy - sy_verify_value)/sy_verify_value)
verified = ((sx_err.le.epsilon) .and. (sy_err.le.epsilon))
endif
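c Mops reports millions of random numbers (2^(m+1) in total)
c generated per second.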
Mops = 2.d0**(m+1)/tm/1000000.d0
write (6,11) tm, m, gc, sx, sy, (i, q(i), i = 0, nq - 1)
11 format ('EP Benchmark Results:'//'CPU Time =',f10.4/'N = 2^',
> i5/'No. Gaussian Pairs =',f15.0/'Sums = ',1p,2d25.15/
> 'Counts:'/(i3,0p,f15.0))
call print_results('EP', class, m+1, 0, 0, nit, npm,
> no_nodes, tm, Mops,
> 'Random numbers generated',
> verified, npbversion, compiletime, cs1,
> cs2, cs3, cs4, cs5, cs6, cs7)
endif
if (.not.timers_enabled) goto 999
do i = 1, t_last
t1m(i) = timer_read(i)
end do
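c Slots t_last+1 and t_last+2 hold derived times: total computation
c (total minus communication) and total communication, matching the
c 'totcomp' and 'totcomm' labels in t_recs.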
t1m(t_last+2) = t1m(t_rcomm)
t1m(t_last+1) = t1m(t_total) - t1m(t_last+2)
call MPI_Reduce(t1m, tsum, t_last+2, dp_type, MPI_SUM,
> 0, MPI_COMM_WORLD, ierr)
call MPI_Reduce(t1m, tming, t_last+2, dp_type, MPI_MIN,
> 0, MPI_COMM_WORLD, ierr)
call MPI_Reduce(t1m, tmaxg, t_last+2, dp_type, MPI_MAX,
> 0, MPI_COMM_WORLD, ierr)
if (node .eq. 0) then
write(*, 800) no_nodes
do i = 1, t_last+2
tsum(i) = tsum(i) / no_nodes
write(*, 810) i, t_recs(i), tming(i), tmaxg(i), tsum(i)
end do
endif
800 format(' nprocs =', i6, 11x, 'minimum', 5x, 'maximum',
> 5x, 'average')
810 format(' timer ', i2, '(', A8, ') :', 3(2x,f10.4))
999 continue
call mpi_finalize(ierr)
end