| /*========================================================================= |
| |
| Copyright (c) 2007, Los Alamos National Security, LLC |
| |
| All rights reserved. |
| |
| Copyright 2007. Los Alamos National Security, LLC. |
| This software was produced under U.S. Government contract DE-AC52-06NA25396 |
| for Los Alamos National Laboratory (LANL), which is operated by |
| Los Alamos National Security, LLC for the U.S. Department of Energy. |
| The U.S. Government has rights to use, reproduce, and distribute this software. |
| NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, |
| EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. |
| If software is modified to produce derivative works, such modified software |
| should be clearly marked, so as not to confuse it with the version available |
| from LANL. |
| |
| Additionally, redistribution and use in source and binary forms, with or |
| without modification, are permitted provided that the following conditions |
| are met: |
| - Redistributions of source code must retain the above copyright notice, |
| this list of conditions and the following disclaimer. |
| - Redistributions in binary form must reproduce the above copyright notice, |
| this list of conditions and the following disclaimer in the documentation |
| and/or other materials provided with the distribution. |
| - Neither the name of Los Alamos National Security, LLC, Los Alamos National |
| Laboratory, LANL, the U.S. Government, nor the names of its contributors |
| may be used to endorse or promote products derived from this software |
| without specific prior written permission. |
| |
| THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS |
| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, |
| THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR |
| CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; |
| OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
| WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR |
| OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF |
| ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| =========================================================================*/ |
| |
| /*========================================================================= |
| |
| Copyright (c) 2011-2012 Argonne National Laboratory |
| All rights reserved. |
| |
| Redistribution and use in source and binary forms, with or without |
| modification, are permitted provided that the following conditions |
| are met: |
| 1. Redistributions of source code must retain the above copyright |
| notice, this list of conditions and the following disclaimer. |
| 2. Redistributions in binary form must reproduce the above copyright |
| notice, this list of conditions and the following disclaimer in the |
| documentation and/or other materials provided with the distribution. |
| |
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
| TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
| BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| POSSIBILITY OF SUCH DAMAGE. |
| |
| =========================================================================*/ |
| |
| |
| /* |
| BG/Q tuned version of HACC: 69.2% of peak performance on 96 racks of Sequoia |
| Argonne Leadership Computing Facility, Argonne, IL 60439 |
| Vitali Morozov (morozov@anl.gov) |
| Hal Finkel (hfinkel@anl.gov) |
| */ |
| |
| |
| #include "hip/hip_runtime.h" |
| #include "Timings.h" |
| #include "RCBForceTree.h" |
| #include "Partition.h" |
| |
| #include <cstring> |
| #include <cstdio> |
| #include <ctime> |
| #include <stdexcept> |
| #include <assert.h> |
| using namespace std; |
| |
| #ifdef __HIPCC__ |
| #include <cudaUtil.h> |
| |
| #define TILEX 4 //Unroll factor in the x dimension, best if 2 or 4, could also add 8 but that is too many registers |
| #define TILEY 4 //Unroll factor in the y dimension, best if 2 or 4, could add 8 but that is too many registers |
| #define BLOCKX 32 //Block size in the x dimension (should be 32) |
| #define BLOCKY 4 //Block size in the y dimension |
| #define MAXX 32 //Maximum blocks in the X dimension, smaller=more reuse but less parallelism |
| #define MAXY 256 //Maximum blocks in the Y dimension, there isn't much reason to make this smaller |
| |
| #define ALIGNX(n) ((n+TILEX-1)/TILEX*TILEX) //Rounds an integer to align with TILEX |
| #define ALIGNY(n) ((n+TILEY-1)/TILEY*TILEY) //Rounds an integer to align with TILEY |
| |
| cudaDeviceSelector __selector__; |
| #endif |
| |
| #ifdef __HIPCC__ |
| #define __HOST__ __host__ |
| #define __DEVICE__ __device__ |
| #else |
| #define __HOST__ |
| #define __DEVICE__ |
| #endif |
| |
| |
| // References: |
| // Emanuel Gafton and Stephan Rosswog. A fast recursive coordinate bisection tree for |
| // neighbour search and gravity. Mon. Not. R. Astron. Soc. to appear, 2011. |
| // http://arxiv.org/abs/1108.0028v1 |
| // |
| // Atsushi Kawai, Junichiro Makino and Toshikazu Ebisuzaki. |
| // Performance Analysis of High-Accuracy Tree Code Based on the Pseudoparticle |
| // Multipole Method. The Astrophysical Journal Supplement Series, 151:13-33, 2004. |
| // Related: http://arxiv.org/abs/astro-ph/0012041v1 |
| // |
| // R. H. Hardin and N. J. Sloane |
| // New Spherical 4-Designs. Discrete Math, 106/107 255-264, 1992. |
| // |
| // The library of spherical designs: |
| // http://www2.research.att.com/~njas/sphdesigns/ |
| namespace { |
| template <int TDPTS> |
| struct sphdesign {}; |
| |
| #define DECLARE_SPHDESIGN(TDPTS) \ |
| template <> \ |
| struct sphdesign<TDPTS> \ |
| { \ |
| static const POSVEL_T x[TDPTS]; \ |
| static const POSVEL_T y[TDPTS]; \ |
| static const POSVEL_T z[TDPTS]; \ |
| }; \ |
| /**/ |
| |
| DECLARE_SPHDESIGN(1) |
| DECLARE_SPHDESIGN(2) |
| DECLARE_SPHDESIGN(3) |
| DECLARE_SPHDESIGN(4) |
| DECLARE_SPHDESIGN(6) |
| DECLARE_SPHDESIGN(12) |
| DECLARE_SPHDESIGN(14) |
| |
| #undef DECLARE_SPHDESIGN |
| |
| /* this is not a t-design, but puts the monopole moment |
| at the center of mass. */ |
| const POSVEL_T sphdesign<1>::x[] = { |
| 0 |
| }; |
| |
| const POSVEL_T sphdesign<1>::y[] = { |
| 0 |
| }; |
| |
| const POSVEL_T sphdesign<1>::z[] = { |
| 0 |
| }; |
| |
| const POSVEL_T sphdesign<2>::x[] = { |
| 1.0, |
| -1.0 |
| }; |
| |
| const POSVEL_T sphdesign<2>::y[] = { |
| 0, |
| 0 |
| }; |
| |
| const POSVEL_T sphdesign<2>::z[] = { |
| 0, |
| 0 |
| }; |
| |
| const POSVEL_T sphdesign<3>::x[] = { |
| 1.0, |
| -.5, |
| -.5 |
| }; |
| |
| const POSVEL_T sphdesign<3>::y[] = { |
| 0, |
| .86602540378443864675, |
| -.86602540378443864675 |
| }; |
| |
| const POSVEL_T sphdesign<3>::z[] = { |
| 0, |
| 0, |
| 0 |
| }; |
| |
| const POSVEL_T sphdesign<4>::x[] = { |
| .577350269189625763, |
| .577350269189625763, |
| -.577350269189625763, |
| -.577350269189625763 |
| }; |
| |
| const POSVEL_T sphdesign<4>::y[] = { |
| .577350269189625763, |
| -.577350269189625763, |
| .577350269189625763, |
| -.577350269189625763 |
| }; |
| |
| const POSVEL_T sphdesign<4>::z[] = { |
| .577350269189625763, |
| -.577350269189625763, |
| -.577350269189625763, |
| .577350269189625763 |
| }; |
| |
| const POSVEL_T sphdesign<6>::x[] = { |
| 1.0, |
| -1.0, |
| 0, |
| 0, |
| 0, |
| 0 |
| }; |
| |
| const POSVEL_T sphdesign<6>::y[] = { |
| 0, |
| 0, |
| 1.0, |
| -1.0, |
| 0, |
| 0 |
| }; |
| |
| const POSVEL_T sphdesign<6>::z[] = { |
| 0, |
| 0, |
| 0, |
| 0, |
| 1.0, |
| -1.0 |
| }; |
| |
| // This is a 3-D 12-point spherical 4-design |
| // (the verticies of a icosahedron) from Hardin and Sloane. |
| const POSVEL_T sphdesign<12>::x[] = { |
| 0, |
| 0, |
| 0.525731112119134, |
| -0.525731112119134, |
| 0.85065080835204, |
| -0.85065080835204, |
| 0, |
| 0, |
| -0.525731112119134, |
| 0.525731112119134, |
| -0.85065080835204, |
| 0.85065080835204 |
| }; |
| |
| const POSVEL_T sphdesign<12>::y[] = { |
| 0.85065080835204, |
| 0.85065080835204, |
| 0, |
| 0, |
| 0.525731112119134, |
| 0.525731112119134, |
| -0.85065080835204, |
| -0.85065080835204, |
| 0, |
| 0, |
| -0.525731112119134, |
| -0.525731112119134 |
| }; |
| |
| const POSVEL_T sphdesign<12>::z[] = { |
| 0.525731112119134, |
| -0.525731112119134, |
| 0.85065080835204, |
| 0.85065080835204, |
| 0, |
| 0, |
| -0.525731112119134, |
| 0.525731112119134, |
| -0.85065080835204, |
| -0.85065080835204, |
| 0, |
| 0 |
| }; |
| |
| // This is a 3-D 14-point spherical 4-design by |
| // R. H. Hardin and N. J. A. Sloane. |
| const POSVEL_T sphdesign<14>::x[] = { |
| 1.0e0, |
| 5.947189772040725e-1, |
| 5.947189772040725e-1, |
| 5.947189772040725e-1, |
| -5.947189772040725e-1, |
| -5.947189772040725e-1, |
| -5.947189772040725e-1, |
| 3.012536847870683e-1, |
| 3.012536847870683e-1, |
| 3.012536847870683e-1, |
| -3.012536847870683e-1, |
| -3.012536847870683e-1, |
| -3.012536847870683e-1, |
| -1.0e0 |
| }; |
| |
| const POSVEL_T sphdesign<14>::y[] = { |
| 0.0e0, |
| 1.776539926025823e-1, |
| -7.678419429698292e-1, |
| 5.90187950367247e-1, |
| 1.776539926025823e-1, |
| 5.90187950367247e-1, |
| -7.678419429698292e-1, |
| 8.79474443923065e-1, |
| -7.588425179318781e-1, |
| -1.206319259911869e-1, |
| 8.79474443923065e-1, |
| -1.206319259911869e-1, |
| -7.588425179318781e-1, |
| 0.0e0 |
| }; |
| |
| const POSVEL_T sphdesign<14>::z[] = { |
| 0.0e0, |
| 7.840589244857197e-1, |
| -2.381765915652909e-1, |
| -5.458823329204288e-1, |
| -7.840589244857197e-1, |
| 5.458823329204288e-1, |
| 2.381765915652909e-1, |
| 3.684710570566285e-1, |
| 5.774116818882528e-1, |
| -9.458827389448813e-1, |
| -3.684710570566285e-1, |
| 9.458827389448813e-1, |
| -5.774116818882528e-1, |
| 0.0e0 |
| }; |
| } // anonymous namespace |
| |
| // Note: In Gafton and Rosswog the far-field force contribution is calculated |
| // per-cell (at the center of mass), and then a Taylor expansion about the center |
| // of mass is used to calculate the force on the individual particles. For this to |
| // work, the functional form of the force must be known (because the Jacobian |
| // and Hessian are required). Here, however, the functional form is not known, |
| // and so the pseudo-particle method of Makino is used instead. |
| |
| template <int TDPTS> |
| RCBForceTree<TDPTS>::RCBForceTree( |
| POSVEL_T* minLoc, |
| POSVEL_T* maxLoc, |
| POSVEL_T* minForceLoc, |
| POSVEL_T* maxForceLoc, |
| ID_T count, |
| POSVEL_T* xLoc, |
| POSVEL_T* yLoc, |
| POSVEL_T* zLoc, |
| POSVEL_T* xVel, |
| POSVEL_T* yVel, |
| POSVEL_T* zVel, |
| POSVEL_T* ms, |
| POSVEL_T* phiLoc, |
| ID_T *idLoc, |
| MASK_T *maskLoc, |
| POSVEL_T avgMass, |
| POSVEL_T fsm, |
| POSVEL_T r, |
| POSVEL_T oa, |
| ID_T nd, |
| ID_T ds, |
| ID_T tmin, |
| ForceLaw *fl, |
| float fcoeff, |
| POSVEL_T ppc) |
| { |
| #ifdef __HIPCC__ |
| //nvtxRangeId_t r0; |
| //r0=nvtxRangeStartA("RCBForceTree()"); |
| #endif |
| // Extract the contiguous data block from a vector pointer |
| particleCount = count; |
| |
| xx = xLoc; |
| yy = yLoc; |
| zz = zLoc; |
| vx = xVel; |
| vy = yVel; |
| vz = zVel; |
| mass = ms; |
| |
| numThreads=1; |
| |
| // static size for the interaction list |
| #define VMAX ALIGNY(16384) |
| nx_v=(POSVEL_T*)malloc(VMAX*sizeof(POSVEL_T)*numThreads); |
| ny_v=(POSVEL_T*)malloc(VMAX*sizeof(POSVEL_T)*numThreads); |
| nz_v=(POSVEL_T*)malloc(VMAX*sizeof(POSVEL_T)*numThreads); |
| nm_v=(POSVEL_T*)malloc(VMAX*sizeof(POSVEL_T)*numThreads); |
| |
| #ifdef __HIPCC__ |
| hipHostRegister(nx_v,VMAX*sizeof(POSVEL_T)*numThreads,0); |
| hipHostRegister(ny_v,VMAX*sizeof(POSVEL_T)*numThreads,0); |
| hipHostRegister(nz_v,VMAX*sizeof(POSVEL_T)*numThreads,0); |
| hipHostRegister(nm_v,VMAX*sizeof(POSVEL_T)*numThreads,0); |
| hipHostRegister(xx,count*sizeof(POSVEL_T),0); |
| hipHostRegister(yy,count*sizeof(POSVEL_T),0); |
| hipHostRegister(zz,count*sizeof(POSVEL_T),0); |
| hipHostRegister(mass,count*sizeof(POSVEL_T),0); |
| hipHostRegister(vx,count*sizeof(POSVEL_T),0); |
| hipHostRegister(vy,count*sizeof(POSVEL_T),0); |
| hipHostRegister(vz,count*sizeof(POSVEL_T),0); |
| |
| int size=ALIGNY(nd); |
| hipMalloc(&d_xx,size*sizeof(POSVEL_T)*numThreads); |
| hipMalloc(&d_yy,size*sizeof(POSVEL_T)*numThreads); |
| hipMalloc(&d_zz,size*sizeof(POSVEL_T)*numThreads); |
| hipMalloc(&d_vx,size*sizeof(POSVEL_T)*numThreads); |
| hipMalloc(&d_vy,size*sizeof(POSVEL_T)*numThreads); |
| hipMalloc(&d_vz,size*sizeof(POSVEL_T)*numThreads); |
| hipMalloc(&d_mass,size*sizeof(POSVEL_T)*numThreads); |
| |
| hipMalloc(&d_nx_v,VMAX*sizeof(POSVEL_T)*numThreads); |
| hipMalloc(&d_ny_v,VMAX*sizeof(POSVEL_T)*numThreads); |
| hipMalloc(&d_nz_v,VMAX*sizeof(POSVEL_T)*numThreads); |
| hipMalloc(&d_nm_v,VMAX*sizeof(POSVEL_T)*numThreads); |
| cudaCheckError(); |
| |
| |
| event_v=(hipEvent_t*)malloc(sizeof(hipEvent_t)*numThreads); |
| stream_v=(hipStream_t*)malloc(sizeof(hipStream_t)*numThreads); |
| for(int i=0;i<numThreads;i++) { |
| hipEventCreate(&event_v[i]); |
| hipStreamCreate(&stream_v[i]); |
| } |
| cudaCheckError(); |
| #endif |
| |
| phi = phiLoc; |
| id = idLoc; |
| mask = maskLoc; |
| |
| particleMass = avgMass; |
| fsrrmax = fsm; |
| rsm = r; |
| sinOpeningAngle = sinf(oa); |
| tanOpeningAngle = tanf(oa); |
| nDirect = nd; |
| depthSafety = ds; |
| taskPartMin = tmin; |
| ppContract = ppc; |
| |
| // Find the grid size of this chaining mesh |
| for (int dim = 0; dim < DIMENSION; dim++) { |
| minRange[dim] = minLoc[dim]; |
| maxRange[dim] = maxLoc[dim]; |
| minForceRange[dim] = minForceLoc[dim]; |
| maxForceRange[dim] = maxForceLoc[dim]; |
| } |
| |
| if (fl) { |
| m_own_fl = false; |
| m_fl = fl; |
| m_fcoeff = fcoeff; |
| } else { |
| //maybe change this to Newton's law or something |
| m_own_fl = true; |
| m_fl = new ForceLawNewton(); |
| m_fcoeff = 1.0; |
| } |
| |
| // Because the tree may be built in parallel, and no efficient way of locking |
| // the tree seems to be available in OpenMP (no reader/writer locks, etc.), |
| // we just estimate the number of tree nodes that will be needed. Hopefully, |
| // this will be an over estimate. If we need more than this, then tree nodes |
| // that really should be subdivided will not be. |
| // |
| // If the tree were perfectly balanced, then it would have a depth of |
| // log_2(particleCount/nDirect). The tree needs to have (2^depth)+1 entries. |
| // To that, a safety factor is added to the depth. |
| ID_T nds = (((ID_T)(particleCount/(POSVEL_T)nDirect)) << depthSafety) + 1; |
| tree.reserve(nds); |
| |
| int nthreads = 1; |
| |
| timespec b_start, b_end; |
| clock_gettime(CLOCK_THREAD_CPUTIME_ID, &b_start); |
| // Create the recursive RCB tree from the particle locations |
| #ifdef __HIPCC__ |
| //nvtxRangeId_t r1; |
| //r1=nvtxRangeStartA("createRCBForceTree"); |
| #endif |
| createRCBForceTree(); |
| |
| #ifdef __HIPCC__ |
| //nvtxRangeEnd(r1); |
| #endif |
| |
| clock_gettime(CLOCK_THREAD_CPUTIME_ID, &b_end); |
| double b_time = (b_end.tv_sec - b_start.tv_sec); |
| b_time += 1e-9*(b_end.tv_nsec - b_start.tv_nsec); |
| |
| printStats(b_time); |
| |
| // Interaction lists. |
| inx.resize(nthreads); |
| iny.resize(nthreads); |
| inz.resize(nthreads); |
| inm.resize(nthreads); |
| iq.resize(nthreads); |
| |
| #ifdef __HIPCC__ |
| //r1=nvtxRangeStartA("calcInternodeForces"); |
| #endif |
| calcInternodeForces(); |
| #ifdef __HIPCC__ |
| //nvtxRangeEnd(r1); |
| #endif |
| |
| #ifdef __HIPCC__ |
| //nvtxRangeEnd(r0); |
| #endif |
| } |
| |
| template <int TDPTS> |
| RCBForceTree<TDPTS>::~RCBForceTree() |
| { |
| #ifdef __HIPCC__ |
| //nvtxRangeId_t r0; |
| //r0=nvtxRangeStartA("~RCBForceTree"); |
| #endif |
| if (m_own_fl) { |
| delete m_fl; |
| } |
| #ifdef __HIPCC__ |
| hipHostUnregister(xx); |
| hipHostUnregister(yy); |
| hipHostUnregister(zz); |
| hipHostUnregister(mass); |
| hipHostUnregister(vx); |
| hipHostUnregister(vy); |
| hipHostUnregister(vz); |
| hipHostUnregister(nx_v); |
| hipHostUnregister(ny_v); |
| hipHostUnregister(nz_v); |
| hipHostUnregister(nm_v); |
| |
| hipFree(d_xx); |
| hipFree(d_yy); |
| hipFree(d_zz); |
| hipFree(d_vx); |
| hipFree(d_vy); |
| hipFree(d_vz); |
| hipFree(d_mass); |
| hipFree(d_nx_v); |
| hipFree(d_ny_v); |
| hipFree(d_nz_v); |
| hipFree(d_nm_v); |
| cudaCheckError(); |
| |
| for(int i=0;i<numThreads;i++) { |
| hipEventDestroy(event_v[i]); |
| hipStreamDestroy(stream_v[i]); |
| } |
| cudaCheckError(); |
| |
| free(event_v); |
| free(stream_v); |
| |
| #endif |
| free(nx_v); |
| free(ny_v); |
| free(nz_v); |
| free(nm_v); |
| #ifdef __HIPCC__ |
| //nvtxRangeEnd(r0); |
| #endif |
| } |
| |
| template <int TDPTS> |
| void RCBForceTree<TDPTS>::printStats(double buildTime) |
| { |
| size_t zeroLeafNodes = 0; |
| size_t nonzeroLeafNodes = 0; |
| size_t maxPPN = 0; |
| size_t leafParts = 0; |
| |
| for (ID_T tl = 1; tl < (ID_T) tree.size(); ++tl) { |
| if (tree[tl].cl == 0 && tree[tl].cr == 0) { |
| if (tree[tl].count > 0) { |
| ++nonzeroLeafNodes; |
| |
| leafParts += tree[tl].count; |
| maxPPN = std::max((size_t) tree[tl].count, maxPPN); |
| } else { |
| ++zeroLeafNodes; |
| } |
| } |
| } |
| |
| double localParticleCount = particleCount; |
| double localTreeSize = tree.size(); |
| double localTreeCapacity = tree.capacity(); |
| double localLeaves = zeroLeafNodes+nonzeroLeafNodes; |
| double localEmptyLeaves = zeroLeafNodes; |
| double localMeanPPN = leafParts/((double) nonzeroLeafNodes); |
| unsigned long localMaxPPN = maxPPN; |
| double localBuildTime = buildTime; |
| |
| /* |
| double globalParticleCount; |
| double globalTreeSize; |
| double globalTreeCapacity; |
| double globalLeaves; |
| double globalEmptyLeaves; |
| double globalMeanPPN; |
| unsigned long globalMaxPPN; |
| double globalBuildTime; |
| |
| bool printHere = true; |
| */ |
| |
| if ( Partition::getMyProc() == 0 ) { |
| printf("\ttree post-build statistics (local for rank 0):\n"); |
| printf("\t\tparticles: %.2f\n", localParticleCount); |
| printf("\t\tnodes: %.2f (allocated: %.2f)\n", localTreeSize, localTreeCapacity); |
| printf("\t\tleaves: %.2f (empty: %.2f)\n", localLeaves, localEmptyLeaves); |
| printf("\t\tmean ppn: %.2f (max ppn: %lu)\n", localMeanPPN, localMaxPPN); |
| printf("\t\tbuild time: %g s\n", localBuildTime); |
| } |
| } |
| |
| |
| extern "C" void cm(ID_T count, const POSVEL_T* xx, const POSVEL_T* yy, |
| const POSVEL_T* zz, const POSVEL_T* mass, |
| POSVEL_T* xmin, POSVEL_T* xmax, POSVEL_T* xc); |
| |
| static inline POSVEL_T pptdr(const POSVEL_T* xmin, const POSVEL_T* xmax, const POSVEL_T* xc) |
| { |
| return std::min(xmax[0] - xc[0], std::min(xmax[1] - xc[1], std::min(xmax[2] - xc[2], std::min(xc[0] - xmin[0], |
| std::min(xc[1] - xmin[1], xc[2] - xmin[2]))))); |
| } |
| |
| template <int TDPTS> |
| static inline void pppts(POSVEL_T tdr, const POSVEL_T* xc, |
| POSVEL_T* ppx, POSVEL_T* ppy, POSVEL_T* ppz) |
| { |
| for (int i = 0; i < TDPTS; ++i) { |
| ppx[i] = tdr*sphdesign<TDPTS>::x[i] + xc[0]; |
| ppy[i] = tdr*sphdesign<TDPTS>::y[i] + xc[1]; |
| ppz[i] = tdr*sphdesign<TDPTS>::z[i] + xc[2]; |
| } |
| } |
| |
| template <int TDPTS> |
| static inline void pp(ID_T count, const POSVEL_T* xx, const POSVEL_T* yy, |
| const POSVEL_T* zz, const POSVEL_T* mass, const POSVEL_T* xc, |
| const POSVEL_T* ppx, const POSVEL_T* ppy, const POSVEL_T* ppz, |
| POSVEL_T* ppm, POSVEL_T tdr) |
| { |
| POSVEL_T K = TDPTS; |
| POSVEL_T odr0 = 1/K; |
| |
| for (int i = 0; i < count; ++i) { |
| POSVEL_T xi = xx[i] - xc[0]; |
| POSVEL_T yi = yy[i] - xc[1]; |
| POSVEL_T zi = zz[i] - xc[2]; |
| POSVEL_T ri = sqrtf(xi*xi + yi*yi + zi*zi); |
| |
| for (int j = 0; j < TDPTS; ++j) { |
| POSVEL_T xj = ppx[j] - xc[0]; |
| POSVEL_T yj = ppy[j] - xc[1]; |
| POSVEL_T zj = ppz[j] - xc[2]; |
| POSVEL_T rj2 = xj*xj + yj*yj + zj*zj; |
| |
| POSVEL_T odr1 = 0, odr2 = 0; |
| if (rj2 != 0) { |
| POSVEL_T rj = sqrtf(rj2); |
| POSVEL_T aij = (xi*xj + yi*yj + zi*zj)/(ri*rj); |
| |
| odr1 = (3/K)*(ri/tdr)*aij; |
| odr2 = (5/K)*(ri/tdr)*(ri/tdr)*0.5*(3*aij*aij - 1); |
| } |
| |
| ppm[j] += mass[i]*(odr0 + odr1 + odr2); |
| } |
| } |
| } |
| |
| #ifdef __HIPCC__ |
| |
| typedef long long int int64; |
| |
| template<typename T> |
| __device__ __forceinline__ |
| T load(T *t) |
| { |
| return __ldg(t); //texture load |
| } |
| |
| template<int TILE_SIZE, typename T>__device__ void loadT(T * out, const T * in); |
| |
| //generic version (inefficient) |
| template<int TILE_SIZE, typename T> |
| __device__ __forceinline__ |
| void loadT(T * out, const T * in) { |
| #pragma unroll |
| for(int i=0;i<TILE_SIZE;i++) { |
| out[i]=__ldg(in+i); |
| } |
| } |
| |
| //Vector loads |
| template<> |
| __device__ __forceinline__ |
| void loadT<2,float>(float * out, const float * in) { |
| *reinterpret_cast<float2*>(out)=load(reinterpret_cast<const float2*>(in)); |
| } |
| template<> |
| __device__ __forceinline__ |
| void loadT<4,float>(float * out, const float * in) { |
| *reinterpret_cast<float4*>(out)=load(reinterpret_cast<const float4*>(in)); |
| } |
| |
| //static __device__ __forceinline__ float __internal_fast_rsqrtf(float a) |
| //{ |
| // float r; |
| // asm ("rsqrt.approx.ftz.f32 %0,%1;" : "=f"(r) : "f"(a)); |
| // return r; |
| //} |
| |
| //computes the forces between tiles i and j, adds the change in force to xi,yi,zi |
| template<int TX, int TY> |
| __device__ __forceinline__ void computeForces(POSVEL_T xxi[], POSVEL_T yyi[], POSVEL_T zzi[], |
| POSVEL_T xxj[], POSVEL_T yyj[], POSVEL_T zzj[], POSVEL_T massj[], |
| POSVEL_T xi[], POSVEL_T yi[], POSVEL_T zi[], |
| POSVEL_T ma0, POSVEL_T ma1, POSVEL_T ma2, POSVEL_T ma3, POSVEL_T ma4, POSVEL_T ma5, |
| POSVEL_T mp_rsm2, POSVEL_T fsrrmax2) { |
| |
| #pragma unroll |
| for(int i=0;i<TY;i++) { |
| #pragma unroll |
| for(int j=0;j<TX;j++) { |
| POSVEL_T dxc = xxj[j] - xxi[i]; //1 FADD |
| POSVEL_T dyc = yyj[j] - yyi[i]; //1 FADD |
| POSVEL_T dzc = zzj[j] - zzi[i]; //1 FADD |
| |
| POSVEL_T r2 = dxc * dxc + dyc * dyc + dzc * dzc; //1 FMUL 2 FMA |
| POSVEL_T v=r2+mp_rsm2; //1 FADD |
| POSVEL_T v3=v*v*v; //2 FMUL |
| |
| POSVEL_T f = __frsqrt_rn(v3); //1 MUFU, |
| // MDS: Should ask someone why this line is dangling |
| // - ( ma0 + r2*(ma1 + r2*(ma2 + r2*(ma3 + r2*(ma4 + r2*ma5))))); //5 FMA, 1 FADD |
| //#define BUG |
| #ifndef BUG |
| f*=massj[j]*(r2<fsrrmax2 && r2>0.0f); //2 FMUL, 1 FSETP, 1 FCMP |
| #else |
| f*=massj[j]; //1 FMUL |
| f*=(r2<fsrrmax2 && r2>0.0f); //1 FMUL, 1 FSETP, 1 FCMP |
| #endif |
| |
| xi[i] = xi[i] + f * dxc; //1 FMA |
| yi[i] = yi[i] + f * dyc; //1 FMA |
| zi[i] = zi[i] + f * dzc; //1 FMA |
| } |
| } |
| } |
| |
| //loads a tile from memory. Use checkBounds and loadMass to disable bounds check or mass load at compile time |
| template <bool checkBounds, bool loadMass, int TILE_SIZE> |
| __device__ __forceinline__ |
| void loadTile(int i, int bounds, |
| const POSVEL_T* xx, const POSVEL_T* yy, const POSVEL_T* zz, const POSVEL_T* mass, |
| POSVEL_T xxi[], POSVEL_T yyi[], POSVEL_T zzi[], POSVEL_T massi[]) { |
| if(checkBounds) { |
| #pragma unroll |
| for(int64 u=0;u<TILE_SIZE;u++) { |
| int64 idx=TILE_SIZE*i+u; // 1 IMAD |
| |
| #if 1 |
| bool cond=idx<bounds; |
| xxi[u] = (cond) ? load(xx+idx) : 0.0f; // 1 ISETP, 1 LDG, 2 IMAD, 1 MOV |
| yyi[u] = (cond) ? load(yy+idx) : 0.0f; // 1 ISETP, 1 LDG, 2 IMAD, 1 MOV |
| zzi[u] = (cond) ? load(zz+idx) : 0.0f; // 1 ISETP, 1 LDG, 2 IMAD, 1 MOV |
| if(loadMass) massi[u] = (cond) ? load(mass+idx) : 0.0f; // 1 ISETP, 1 LDG, 2 IMAD, 1 MOV |
| #else |
| massi[u] = 0.0f; //1 MOV |
| if(idx<bounds) { //1 ISETP, 1 BRA |
| xxi[u] = load(xx+idx); //1 LDG, 2 IMAD |
| yyi[u] = load(yy+idx); //1 LDG, 2 IMAD |
| zzi[u] = load(zz+idx); //1 LDG, 2 IMAD |
| if(loadMass) massi[u] = load(mass+idx); //1 LDG, 2 IMAD |
| } |
| #endif |
| } |
| } else { |
| |
| int idx=TILE_SIZE*i; |
| loadT<TILE_SIZE>(xxi,xx+idx); //1 LDG, 2 IMAD |
| loadT<TILE_SIZE>(yyi,yy+idx); //1 LDG, 2 IMAD |
| loadT<TILE_SIZE>(zzi,zz+idx); //1 LDG, 2 IMAD |
| if(loadMass) loadT<TILE_SIZE>(massi,mass+idx); //1 LDG, 2 IMAD |
| } |
| } |
| |
| //applies the force in xi,yi,zi to update vx, vy, vz |
| //use checkBounds to disable bounds checking at compile time |
| template <bool checkBounds, int TILE_SIZE> |
| __device__ __forceinline__ |
| void applyForce(int i, int bounds,POSVEL_T fcoeff, |
| const POSVEL_T xi[], const POSVEL_T yi[], const POSVEL_T zi[], |
| POSVEL_T *vx, POSVEL_T *vy, POSVEL_T *vz) { |
| #pragma unroll |
| for(int u=0;u<TILE_SIZE;u++) { |
| int idx=TILE_SIZE*i+u; //1 IMAD |
| |
| if(!checkBounds || idx<bounds) |
| { //1 ISETP |
| atomicWarpReduceAndUpdate(vx+idx,fcoeff * xi[u]); //2 IMAD, 6 FADD |
| atomicWarpReduceAndUpdate(vy+idx,fcoeff * yi[u]); //2 IMAD, 6 FADD |
| atomicWarpReduceAndUpdate(vz+idx,fcoeff * zi[u]); //2 IMAD, 6 FADD |
| } |
| } |
| } |
| |
| //Tell the compiler how many blocks we expect to be active. |
| //This gives the compiler a better idea of how many registers to use. The second number is tunable. |
| __launch_bounds__(BLOCKX*BLOCKY,7) |
| __global__ |
| void Step10_cuda_kernel(int count, int count1, |
| const POSVEL_T* xx, const POSVEL_T* yy, |
| const POSVEL_T* zz, const POSVEL_T* mass, |
| const POSVEL_T* xx1, const POSVEL_T* yy1, |
| const POSVEL_T* zz1, const POSVEL_T* mass1, |
| POSVEL_T* vx, POSVEL_T* vy, |
| POSVEL_T* vz, POSVEL_T fsrrmax2, POSVEL_T mp_rsm2, POSVEL_T fcoeff) |
| { |
| const POSVEL_T ma0 = 0.269327, ma1 = -0.0750978, ma2 = 0.0114808, ma3 = -0.00109313, ma4 = 0.0000605491, ma5 = -0.00000147177; |
| |
| //Register arrays to hold tiles of data |
| POSVEL_T xxi[TILEY]; |
| POSVEL_T yyi[TILEY]; |
| POSVEL_T zzi[TILEY]; |
| POSVEL_T xxj[TILEX]; |
| POSVEL_T yyj[TILEX]; |
| POSVEL_T zzj[TILEX]; |
| POSVEL_T massj[TILEX]; |
| |
| //loop over interior region and calculate forces. |
| //for each tile i |
| for(int i=hipBlockIdx_y*hipBlockDim_y+hipThreadIdx_y;i<count/TILEY;i+=hipBlockDim_y*hipGridDim_y) //1 ISETP |
| { |
| POSVEL_T xi[TILEY]={0}; //TILEY MOV |
| POSVEL_T yi[TILEY]={0}; //TILEY MOV |
| POSVEL_T zi[TILEY]={0}; //TILEY MOV |
| |
| //load tile i,mass and bounds check are not needed |
| loadTile<false,false,TILEY>(i,count,xx,yy,zz,NULL,xxi,yyi,zzi,NULL); |
| |
| //for each tile j |
| for (int j=hipBlockIdx_x*hipBlockDim_x+hipThreadIdx_x;j<count1/TILEX;j+=hipBlockDim_x*hipGridDim_x) //1 ISETP |
| { |
| //load tile j, bounds check is not needed |
| loadTile<false,true,TILEX>(j,count1,xx1,yy1,zz1,mass1,xxj,yyj,zzj,massj); |
| |
| //compute forces between tile i and tile j |
| computeForces<TILEX,TILEY>(xxi,yyi,zzi,xxj,yyj,zzj,massj,xi,yi,zi,ma0,ma1,ma2,ma3,ma4,ma5,mp_rsm2,fsrrmax2); |
| } |
| |
| //process remaining elements at the end, use TILEX=1 |
| for (int j=count1/TILEX*TILEX+hipBlockIdx_x*hipBlockDim_x+hipThreadIdx_x;j<count1;j+=hipBlockDim_x*hipGridDim_x) //1 ISETP |
| { |
| //load tile j, bounds check is needed, mass is needed |
| loadTile<true,true,1>(j,count1,xx1,yy1,zz1,mass1,xxj,yyj,zzj,massj); |
| |
| //compute forces between tile i and tile j |
| computeForces<1,TILEY>(xxi,yyi,zzi,xxj,yyj,zzj,massj,xi,yi,zi,ma0,ma1,ma2,ma3,ma4,ma5,mp_rsm2,fsrrmax2); |
| } |
| |
| //apply the force we have calculated above, bounds check is not needed |
| applyForce<false,TILEY>(i,count,fcoeff,xi,yi,zi,vx,vy,vz); |
| } |
| |
| //At this point we have computed almost all interactions. |
| //However we still need to add contributions for particles at the end |
| |
| #if 1 |
| //process ramining elements in set TILEY=1 |
| //for each tile i |
| for(int i=count/TILEY*TILEY+hipBlockIdx_y*hipBlockDim_y+hipThreadIdx_y;i<count;i+=hipBlockDim_y*hipGridDim_y) //1 ISETP |
| { |
| POSVEL_T xi[1]={0}; //1 MOV |
| POSVEL_T yi[1]={0}; //1 MOV |
| POSVEL_T zi[1]={0}; //1 MOV |
| |
| //load xxi, yyi, zzi tiles, mass is not needed, bounds check is needed |
| loadTile<true,false,1>(i,count,xx,yy,zz,NULL,xxi,yyi,zzi,NULL); |
| |
| //for each tile j |
| for (int j=hipBlockIdx_x*hipBlockDim_x+hipThreadIdx_x;j<count1/TILEX;j+=hipBlockDim_x*hipGridDim_x) //1 ISETP |
| { |
| //load tile j, bounds check is not needed |
| loadTile<false,true,TILEX>(j,count1,xx1,yy1,zz1,mass1,xxj,yyj,zzj,massj); |
| |
| //compute forces between tile i and tile j |
| computeForces<TILEX,1>(xxi,yyi,zzi,xxj,yyj,zzj,massj,xi,yi,zi,ma0,ma1,ma2,ma3,ma4,ma5,mp_rsm2,fsrrmax2); |
| } |
| |
| //process remaining elements at the end, use TILEX=1 |
| for (int j=count1/TILEX*TILEX+hipBlockIdx_x*hipBlockDim_x+hipThreadIdx_x;j<count1;j+=hipBlockDim_x*hipGridDim_x) //1 ISETP |
| { |
| //load tile j, bounds check is needed, mass is needed |
| loadTile<true,true,1>(j,count1,xx1,yy1,zz1,mass1,xxj,yyj,zzj,massj); |
| |
| //compute forces between tile i and tile j |
| computeForces<1,1>(xxi,yyi,zzi,xxj,yyj,zzj,massj,xi,yi,zi,ma0,ma1,ma2,ma3,ma4,ma5,mp_rsm2,fsrrmax2); |
| } |
| |
| applyForce<true,1>(i,count,fcoeff,xi,yi,zi,vx,vy,vz); |
| } |
| #endif |
| |
| } |
| |
| |
| |
| #endif |
| |
| #ifdef __bgq__ |
| extern "C" Step16_int( int count1, float xxi, float yyi, float zzi, float fsrrmax2, float mp_rsm2, const float *xx1, const float *yy1, const float *zz1,const float *mass1, float *ax, float *ay, float *az ); |
| #endif |
| |
| static inline void nbody1(ID_T count, ID_T count1, const POSVEL_T* xx, const POSVEL_T* yy, |
| const POSVEL_T* zz, const POSVEL_T* mass, |
| const POSVEL_T* xx1, const POSVEL_T* yy1, |
| const POSVEL_T* zz1, const POSVEL_T* mass1, |
| POSVEL_T* vx, POSVEL_T* vy, POSVEL_T* vz, |
| ForceLaw *fl, float fcoeff, float fsrrmax, float rsm |
| #ifdef __HIPCC__ |
| , hipStream_t stream |
| #endif |
| ) |
| { |
| POSVEL_T fsrrmax2 = fsrrmax*fsrrmax; |
| POSVEL_T rsm2 = rsm*rsm; |
| |
| #ifdef __bgq__ |
| float ax = 0.0f, ay = 0.0f, az = 0.0f; |
| |
| for (int i = 0; i < count; ++i) |
| { |
| |
| Step16_int ( count1, xx[i],yy[i],zz[i], fsrrmax2,rsm2,xx1,yy1,zz1,mass1, &ax, &ay, &az ); |
| |
| vx[i] = vx[i] + ax * fcoeff; |
| vy[i] = vy[i] + ay * fcoeff; |
| vz[i] = vz[i] + az * fcoeff; |
| } |
| |
| #else |
| |
| #ifdef __HIPCC__ |
| //const int MAXX=64; |
| //const int MAXY=64; |
| |
| dim3 threads(BLOCKX,BLOCKY); |
| int blocksX=(count1+threads.x-1)/threads.x; |
| int blocksY=(count+threads.y-1)/threads.y; |
| dim3 blocks( min(blocksX,MAXX), min(blocksY,MAXY)); |
| |
| //call kernel |
| |
| cudaCheckError(); |
| //printf("count: %d, count1: %d\n", count,count1); |
| #if 0 |
| checkCudaPtr(xx,"xx"); |
| checkCudaPtr(yy,"yy"); |
| checkCudaPtr(zz,"zz"); |
| checkCudaPtr(mass,"mass"); |
| checkCudaPtr(xx1,"xx1"); |
| checkCudaPtr(yy1,"yy1"); |
| checkCudaPtr(zz1,"zz1"); |
| checkCudaPtr(mass1,"mass1"); |
| checkCudaPtr(vx,"vx"); |
| checkCudaPtr(vy,"vy"); |
| checkCudaPtr(vz,"vz"); |
| #endif |
| hipLaunchKernelGGL(Step10_cuda_kernel, dim3(blocks), dim3(threads), 0, stream, count,count1,xx,yy,zz,mass,xx1,yy1,zz1,mass1, vx, vy, vz, fsrrmax2, rsm2, fcoeff); |
| cudaCheckError(); |
| |
| //hipDeviceSynchronize(); |
| //exit(0); |
| #else |
| |
| for (int i = 0; i < count; ++i) |
| for (int j = 0; j < count1; ++j) { |
| POSVEL_T dx = xx1[j] - xx[i]; |
| POSVEL_T dy = yy1[j] - yy[i]; |
| POSVEL_T dz = zz1[j] - zz[i]; |
| POSVEL_T dist2 = dx*dx + dy*dy + dz*dz; |
| POSVEL_T f_over_r = mass[i]*mass1[j] * fl->f_over_r(dist2); |
| |
| POSVEL_T updateq = 1.0; |
| updateq *= (dist2 < fsrrmax2); |
| |
| vx[i] += updateq*fcoeff*f_over_r*dx; |
| vy[i] += updateq*fcoeff*f_over_r*dy; |
| vz[i] += updateq*fcoeff*f_over_r*dz; |
| } |
| #endif //end __HIPCC__ |
| |
| |
| #endif //end __bgq__ |
| } |
| |
| |
| static inline ID_T partition(ID_T n, |
| POSVEL_T* xx, POSVEL_T* yy, POSVEL_T* zz, |
| POSVEL_T* vx, POSVEL_T* vy, POSVEL_T* vz, |
| POSVEL_T* mass, POSVEL_T* phi, |
| ID_T* id, MASK_T* mask, POSVEL_T pv |
| ) |
| { |
| float t0, t1, t2, t3, t4, t5, t6, t7; |
| int32_t is, i, j; |
| long i0; |
| uint16_t i1; |
| |
| int idx[n]; |
| |
| is = 0; |
| for ( i = 0; i < n; i = i + 1 ) |
| { |
| if (xx[i] < pv) |
| { |
| idx[is] = i; |
| is = is + 1; |
| } |
| } |
| |
| #pragma unroll (4) |
| for ( j = 0; j < is; j++ ) |
| { |
| i = idx[j]; |
| |
| t6 = mass[i]; mass[i] = mass[j]; mass[j] = t6; |
| t7 = phi [i]; phi [i] = phi [j]; phi [j] = t7; |
| i1 = mask[i]; mask[i] = mask[j]; mask[j] = i1; |
| i0 = id [i]; id [i] = id [j]; id [j] = i0; |
| } |
| |
| #pragma unroll (4) |
| for ( j = 0; j < is; j++ ) |
| { |
| i = idx[j]; |
| |
| t0 = xx[i]; xx[i] = xx[j]; xx[j] = t0; |
| t1 = yy[i]; yy[i] = yy[j]; yy[j] = t1; |
| t2 = zz[i]; zz[i] = zz[j]; zz[j] = t2; |
| t3 = vx[i]; vx[i] = vx[j]; vx[j] = t3; |
| t4 = vy[i]; vy[i] = vy[j]; vy[j] = t4; |
| t5 = vz[i]; vz[i] = vz[j]; vz[j] = t5; |
| } |
| |
| return is; |
| } |
| |
| template <int TDPTS> |
| void RCBForceTree<TDPTS>::createRCBForceSubtree(int d, ID_T tl, ID_T tlcl, ID_T tlcr) |
| { |
| POSVEL_T *x1, *x2, *x3; |
| switch (d) { |
| case 0: |
| x1 = xx; |
| x2 = yy; |
| x3 = zz; |
| break; |
| case 1: |
| x1 = yy; |
| x2 = zz; |
| x3 = xx; |
| break; |
| /*case 2*/ default: |
| x1 = zz; |
| x2 = xx; |
| x3 = yy; |
| break; |
| } |
| |
| #ifdef __bgq__ |
| int tid = 0; |
| |
| #endif |
| const bool geoSplit = false; |
| POSVEL_T split = geoSplit ? (tree[tl].xmax[d]+tree[tl].xmin[d])/2 : tree[tl].xc[d]; |
| ID_T is = ::partition(tree[tl].count, x1 + tree[tl].offset, x2 + tree[tl].offset, x3 + tree[tl].offset, |
| vx + tree[tl].offset, vy + tree[tl].offset, vz + tree[tl].offset, |
| mass + tree[tl].offset, phi + tree[tl].offset, |
| id + tree[tl].offset, mask + tree[tl].offset, split |
| ); |
| |
| if (is == 0 || is == tree[tl].count) { |
| return; |
| } |
| |
| tree[tlcl].count = is; |
| tree[tlcr].count = tree[tl].count - tree[tlcl].count; |
| |
| if (tree[tlcl].count > 0) { |
| tree[tl].cl = tlcl; |
| tree[tlcl].offset = tree[tl].offset; |
| tree[tlcl].xmax[d] = split; |
| |
| createRCBForceTreeInParallel(tlcl); |
| } |
| |
| if (tree[tlcr].count > 0) { |
| tree[tl].cr = tlcr; |
| tree[tlcr].offset = tree[tl].offset + tree[tlcl].count; |
| tree[tlcr].xmin[d] = split; |
| |
| createRCBForceTreeInParallel(tlcr); |
| } |
| } |
| |
| // This is basically the algorithm from (Gafton and Rosswog, 2011). |
| template <int TDPTS> |
| void RCBForceTree<TDPTS>::createRCBForceTreeInParallel(ID_T tl) |
| { |
| ID_T cnt = tree[tl].count; |
| ID_T off = tree[tl].offset; |
| |
| // Compute the center-of-mass coordinates (and recompute the min/max) |
| ::cm(cnt, xx + off, yy + off, zz + off, mass + off, |
| tree[tl].xmin, tree[tl].xmax, tree[tl].xc); |
| |
| if (cnt <= nDirect) { |
| // The pseudoparticles |
| tree[tl].tdr = ppContract*::pptdr(tree[tl].xmin, tree[tl].xmax, tree[tl].xc); |
| memset(tree[tl].ppm, 0, sizeof(POSVEL_T)*TDPTS); |
| if (cnt > TDPTS) { // Otherwise, the pseudoparticles are never used |
| POSVEL_T ppx[TDPTS], ppy[TDPTS], ppz[TDPTS]; |
| pppts<TDPTS>(tree[tl].tdr, tree[tl].xc, ppx, ppy, ppz); |
| pp<TDPTS>(cnt, xx + off, yy + off, zz + off, mass + off, tree[tl].xc, |
| ppx, ppy, ppz, tree[tl].ppm, tree[tl].tdr); |
| } |
| |
| return; |
| } |
| |
| // Index of the right and left child levels |
| ID_T tlcl, tlcr; |
| { |
| tlcl = tree.size(); |
| tlcr = tlcl+1; |
| size_t newSize = tlcr+1; |
| tree.resize(newSize); |
| } |
| memset(&tree[tlcl], 0, sizeof(TreeNode)*2); |
| |
| // Both children have similar bounding boxes to the current node (the |
| // parent), so copy the bounding box here, and then overwrite the changed |
| // coordinate later. |
| for (int i = 0; i < DIMENSION; ++i) { |
| tree[tlcl].xmin[i] = tree[tl].xmin[i]; |
| tree[tlcr].xmin[i] = tree[tl].xmin[i]; |
| tree[tlcl].xmax[i] = tree[tl].xmax[i]; |
| tree[tlcr].xmax[i] = tree[tl].xmax[i]; |
| } |
| |
| // Split the longest edge at the center of mass. |
| POSVEL_T xlen[DIMENSION]; |
| for (int i = 0; i < DIMENSION; ++i) { |
| xlen[i] = tree[tl].xmax[i] - tree[tl].xmin[i]; |
| } |
| |
| int d; |
| if (xlen[0] > xlen[1] && xlen[0] > xlen[2]) { |
| d = 0; // Split in the x direction |
| } |
| else if (xlen[1] > xlen[2]) { |
| d = 1; // Split in the y direction |
| } |
| else { |
| d = 2; // Split in the z direction |
| } |
| |
| createRCBForceSubtree(d, tl, tlcl, tlcr); |
| |
| // Compute the pseudoparticles based on those of the children |
| POSVEL_T ppx[TDPTS], ppy[TDPTS], ppz[TDPTS]; |
| tree[tl].tdr = ppContract*::pptdr(tree[tl].xmin, tree[tl].xmax, tree[tl].xc); |
| pppts<TDPTS>(tree[tl].tdr, tree[tl].xc, ppx, ppy, ppz); |
| memset(tree[tl].ppm, 0, sizeof(POSVEL_T)*TDPTS); |
| |
| if (tree[tlcl].count > 0) { |
| if (tree[tlcl].count <= TDPTS) { |
| ID_T offc = tree[tlcl].offset; |
| pp<TDPTS>(tree[tlcl].count, xx + offc, yy + offc, zz + offc, mass + offc, |
| tree[tl].xc, ppx, ppy, ppz, tree[tl].ppm, tree[tl].tdr); |
| } else { |
| POSVEL_T ppxc[TDPTS], ppyc[TDPTS], ppzc[TDPTS]; |
| pppts<TDPTS>(tree[tlcl].tdr, tree[tlcl].xc, ppxc, ppyc, ppzc); |
| pp<TDPTS>(TDPTS, ppxc, ppyc, ppzc, tree[tlcl].ppm, tree[tl].xc, |
| ppx, ppy, ppz, tree[tl].ppm, tree[tl].tdr); |
| } |
| } |
| if (tree[tlcr].count > 0) { |
| if (tree[tlcr].count <= TDPTS) { |
| ID_T offc = tree[tlcr].offset; |
| pp<TDPTS>(tree[tlcr].count, xx + offc, yy + offc, zz + offc, mass + offc, |
| tree[tl].xc, ppx, ppy, ppz, tree[tl].ppm, tree[tl].tdr); |
| } else { |
| POSVEL_T ppxc[TDPTS], ppyc[TDPTS], ppzc[TDPTS]; |
| pppts<TDPTS>(tree[tlcr].tdr, tree[tlcr].xc, ppxc, ppyc, ppzc); |
| pp<TDPTS>(TDPTS, ppxc, ppyc, ppzc, tree[tlcr].ppm, tree[tl].xc, |
| ppx, ppy, ppz, tree[tl].ppm, tree[tl].tdr); |
| } |
| } |
| } |
| |
| template <int TDPTS> |
| void RCBForceTree<TDPTS>::createRCBForceTree() |
| { |
| // The top tree is the entire box |
| tree.resize(1); |
| memset(&tree[0], 0, sizeof(TreeNode)); |
| |
| tree[0].count = particleCount; |
| tree[0].offset = 0; |
| |
| for (int i = 0; i < DIMENSION; ++i) { |
| tree[0].xmin[i] = minRange[i]; |
| tree[0].xmax[i] = maxRange[i]; |
| } |
| |
| createRCBForceTreeInParallel(); |
| } |
| |
| |
| template <int TDPTS> |
| void RCBForceTree<TDPTS>::calcInternodeForce(ID_T tl, |
| const std::vector<ID_T> &parents) { |
| |
| POSVEL_T fsrrmax2 = fsrrmax*fsrrmax; |
| const TreeNode* tree_ = &tree[0]; |
| |
| int tid = 0; |
| |
| std::vector<ID_T> &q = iq[tid]; |
| q.clear(); |
| q.push_back(0); |
| |
| |
| POSVEL_T *nx=nx_v+tid*VMAX; |
| POSVEL_T *ny=ny_v+tid*VMAX; |
| POSVEL_T *nz=nz_v+tid*VMAX; |
| POSVEL_T *nm=nm_v+tid*VMAX; |
| |
| #ifdef __HIPCC__ |
| //Adjust pointers to this threads workspace |
| POSVEL_T *d_nx=d_nx_v+tid*VMAX; |
| POSVEL_T *d_ny=d_ny_v+tid*VMAX; |
| POSVEL_T *d_nz=d_nz_v+tid*VMAX; |
| POSVEL_T *d_nm=d_nm_v+tid*VMAX; |
| int size=ALIGNY(nDirect); |
| POSVEL_T *d_xxl=d_xx+tid*size; |
| POSVEL_T *d_yyl=d_yy+tid*size; |
| POSVEL_T *d_zzl=d_zz+tid*size; |
| POSVEL_T *d_massl=d_mass+tid*size; |
| POSVEL_T *d_vxl=d_vx+tid*size; |
| POSVEL_T *d_vyl=d_vy+tid*size; |
| POSVEL_T *d_vzl=d_vz+tid*size; |
| |
| hipEvent_t& event=event_v[tid]; |
| hipStream_t& stream=stream_v[tid]; |
| hipEventSynchronize(event); //wait for transfers from previous call to finish before overwriting nx,ny,nz,nm |
| cudaCheckError(); |
| #endif |
| |
| // The interaction list. |
| int SIZE = 0; // current size of these arrays |
| |
| while (!q.empty()) { |
| ID_T tln = q.back(); |
| q.pop_back(); |
| |
| // We should not interact with our own parents. |
| if (tln < tl) { |
| bool isParent = std::binary_search(parents.begin(), parents.end(), tln); |
| if (isParent) { |
| ID_T tlncr = tree_[tln].cr; |
| ID_T tlncl = tree_[tln].cl; |
| |
| if (tlncl != tl && tlncl > 0 && tree_[tlncl].count > 0) { |
| q.push_back(tlncl); |
| } |
| if (tlncr != tl && tlncr > 0 && tree_[tlncr].count > 0) { |
| q.push_back(tlncr); |
| } |
| |
| continue; |
| } |
| } |
| |
| // Is this node have a small enough opening angle to interact with? |
| POSVEL_T dx = tree_[tln].xc[0] - tree_[tl].xc[0]; |
| POSVEL_T dy = tree_[tln].xc[1] - tree_[tl].xc[1]; |
| POSVEL_T dz = tree_[tln].xc[2] - tree_[tl].xc[2]; |
| POSVEL_T dist2 = dx*dx + dy*dy + dz*dz; |
| |
| POSVEL_T sx = tree_[tln].xmax[0]-tree_[tln].xmin[0]; |
| POSVEL_T sy = tree_[tln].xmax[1]-tree_[tln].xmin[1]; |
| POSVEL_T sz = tree_[tln].xmax[2]-tree_[tln].xmin[2]; |
| POSVEL_T l2 = std::min(sx*sx, std::min(sy*sy, sz*sz)); // under-estimate |
| |
| POSVEL_T dtt2 = dist2*tanOpeningAngle*tanOpeningAngle; |
| bool looksBig; |
| // l2/dist2 is really tan^2 theta, for small theta, tan(theta) ~ theta |
| if (l2 > dtt2) { |
| // the under-estimate is too big, so this is definitely too big |
| looksBig = true; |
| } else { |
| // There are 8 corner points of the remote node, and the maximum angular |
| // size will be from one of those points to its opposite points. So there |
| // are 8 vector dot products to compute to determine the maximum angular |
| // size at any given reference point. (do we need to do this for each point |
| // in leaf node, or will the c.m. point be sufficient?). |
| looksBig = false; |
| for (int i = 0; i < 2; ++i) |
| for (int j = 0; j < 2; ++j) { |
| POSVEL_T x1 = (i == 0 ? tree_[tln].xmin : tree_[tln].xmax)[0] - tree_[tl].xc[0]; |
| POSVEL_T y1 = (j == 0 ? tree_[tln].xmin : tree_[tln].xmax)[1] - tree_[tl].xc[1]; |
| POSVEL_T z1 = tree_[tln].xmin[2] - tree_[tl].xc[2]; |
| |
| POSVEL_T x2 = (i == 0 ? tree_[tln].xmax : tree_[tln].xmin)[0] - tree_[tl].xc[0]; |
| POSVEL_T y2 = (j == 0 ? tree_[tln].xmax : tree_[tln].xmin)[1] - tree_[tl].xc[1]; |
| POSVEL_T z2 = tree_[tln].xmax[2] - tree_[tl].xc[2]; |
| |
| const bool useRealOA = false; |
| if (useRealOA) { |
| // |a x b| = a*b*sin(theta) |
| POSVEL_T cx = y1*z2 - z1*y2; |
| POSVEL_T cy = z1*x2 - x1*z2; |
| POSVEL_T cz = x1*y2 - y1*x2; |
| if ((cx*cx + cy*cy + cz*cz) > sinOpeningAngle*sinOpeningAngle* |
| (x1*x1 + y1*y1 + z1*z1)*(x2*x2 + y2*y2 + z2*z2) |
| ) { |
| looksBig = true; |
| break; |
| } |
| } else { |
| // Instead of using the real opening angle, use the tan approximation; this is |
| // better than the opening-angle b/c it incorporates depth information. |
| POSVEL_T ddx = x1 - x2, ddy = y1 - y2, ddz = z1 - z2; |
| POSVEL_T dh2 = ddx*ddx + ddy*ddy + ddz*ddz; |
| if (dh2 > dtt2) { |
| looksBig = true; |
| break; |
| } |
| } |
| } |
| } |
| |
| if (!looksBig) { |
| if (dist2 > fsrrmax2) { |
| // We could interact with this node, but it is too far away to make |
| // any difference, so it will be skipped, along with all of its |
| // children. |
| continue; |
| } |
| |
| // This node has fewer particles than pseudo particles, so just use the |
| // particles that are actually there. |
| if (tree_[tln].count <= TDPTS) { |
| ID_T offn = tree_[tln].offset; |
| ID_T cntn = tree_[tln].count; |
| |
| int start = SIZE; |
| SIZE = SIZE + cntn; |
| assert( SIZE < VMAX ); |
| |
| for ( int i = 0; i < cntn; ++i) { |
| nx[start + i] = xx[offn + i]; |
| ny[start + i] = yy[offn + i]; |
| nz[start + i] = zz[offn + i]; |
| nm[start + i] = mass[offn + i]; |
| } |
| |
| continue; |
| } |
| |
| // Interact the particles in this node with the pseudoparticles of the |
| // other node. |
| int start = SIZE; |
| SIZE = SIZE + TDPTS; |
| assert( SIZE < VMAX ); |
| |
| pppts<TDPTS>(tree_[tln].tdr, tree_[tln].xc, &nx[start], &ny[start], &nz[start]); |
| for ( int i = 0; i < TDPTS; ++i) { |
| nm[start + i] = tree_[tln].ppm[i]; |
| } |
| |
| continue; |
| } else if (tree_[tln].cr == 0 && tree_[tln].cl == 0) { |
| // This is a leaf node with which we must interact. |
| ID_T offn = tree_[tln].offset; |
| ID_T cntn = tree_[tln].count; |
| |
| int start = SIZE; |
| SIZE = SIZE + cntn; |
| assert( SIZE < VMAX ); |
| |
| for ( int i = 0; i < cntn; ++i) { |
| nx[start + i] = xx[offn + i]; |
| ny[start + i] = yy[offn + i]; |
| nz[start + i] = zz[offn + i]; |
| nm[start + i] = mass[offn + i]; |
| } |
| |
| continue; |
| } |
| |
| // This other node is not a leaf, but has too large an opening angle |
| // for an approx. interaction: queue its children. |
| |
| ID_T tlncr = tree_[tln].cr; |
| ID_T tlncl = tree_[tln].cl; |
| |
| if (tlncl > 0 && tree_[tlncl].count > 0) { |
| bool close = true; |
| for (int i = 0; i < DIMENSION; ++i) { |
| POSVEL_T dist = 0; |
| if (tree_[tl].xmax[i] < tree_[tlncl].xmin[i]) { |
| dist = tree_[tlncl].xmin[i] - tree_[tl].xmax[i]; |
| } else if (tree_[tl].xmin[i] > tree_[tlncl].xmax[i]) { |
| dist = tree_[tl].xmin[i] - tree_[tlncl].xmax[i]; |
| } |
| |
| if (dist > fsrrmax) { |
| close = false; |
| break; |
| } |
| } |
| |
| if (close) q.push_back(tlncl); |
| } |
| if (tlncr > 0 && tree_[tlncr].count > 0) { |
| bool close = true; |
| for (int i = 0; i < DIMENSION; ++i) { |
| POSVEL_T dist = 0; |
| if (tree_[tl].xmax[i] < tree_[tlncr].xmin[i]) { |
| dist = tree_[tlncr].xmin[i] - tree_[tl].xmax[i]; |
| } else if (tree_[tl].xmin[i] > tree_[tlncr].xmax[i]) { |
| dist = tree_[tl].xmin[i] - tree_[tlncr].xmax[i]; |
| } |
| |
| if (dist > fsrrmax) { |
| close = false; |
| break; |
| } |
| } |
| |
| if (close) q.push_back(tlncr); |
| } |
| } |
| |
| ID_T off = tree_[tl].offset; |
| ID_T cnt = tree_[tl].count; |
| |
| // Add self interactions... |
| int start = SIZE; |
| SIZE = SIZE + cnt; |
| assert( SIZE < VMAX ); |
| |
| for ( int i = 0; i < cnt; ++i) { |
| nx[start + i] = xx[off + i]; |
| ny[start + i] = yy[off + i]; |
| nz[start + i] = zz[off + i]; |
| nm[start + i] = mass[off + i]; |
| } |
| |
| #ifdef __HIPCC__ |
| hipMemcpyAsync(d_nx,nx,sizeof(POSVEL_T)*SIZE,hipMemcpyHostToDevice,stream); |
| hipMemcpyAsync(d_ny,ny,sizeof(POSVEL_T)*SIZE,hipMemcpyHostToDevice,stream); |
| hipMemcpyAsync(d_nz,nz,sizeof(POSVEL_T)*SIZE,hipMemcpyHostToDevice,stream); |
| hipMemcpyAsync(d_nm,nm,sizeof(POSVEL_T)*SIZE,hipMemcpyHostToDevice,stream); |
| hipEventRecord(event,stream); //mark when transfers have finished |
| cudaCheckError(); |
| hipDeviceSynchronize(); |
| |
| |
| hipMemcpyAsync(d_xxl,xx+off,sizeof(POSVEL_T)*cnt,hipMemcpyHostToDevice,stream); |
| hipMemcpyAsync(d_yyl,yy+off,sizeof(POSVEL_T)*cnt,hipMemcpyHostToDevice,stream); |
| hipMemcpyAsync(d_zzl,zz+off,sizeof(POSVEL_T)*cnt,hipMemcpyHostToDevice,stream); |
| hipMemcpyAsync(d_massl,mass+off,sizeof(POSVEL_T)*cnt,hipMemcpyHostToDevice,stream); |
| hipMemcpyAsync(d_vxl,vx+off,sizeof(POSVEL_T)*cnt,hipMemcpyHostToDevice,stream); |
| hipMemcpyAsync(d_vyl,vy+off,sizeof(POSVEL_T)*cnt,hipMemcpyHostToDevice,stream); |
| hipMemcpyAsync(d_vzl,vz+off,sizeof(POSVEL_T)*cnt,hipMemcpyHostToDevice,stream); |
| cudaCheckError(); |
| hipDeviceSynchronize(); |
| #endif |
| |
| // Process the interaction list... |
| #ifdef __HIPCC__ |
| ::nbody1(cnt, SIZE, d_xxl, d_yyl, d_zzl, d_massl, d_nx, d_ny, d_nz, d_nm, d_vxl, d_vyl, d_vzl, m_fl, m_fcoeff, fsrrmax, rsm, stream); |
| hipDeviceSynchronize(); |
| #else |
| ::nbody1(cnt, SIZE, xx + off, yy + off, zz + off, mass + off, nx, ny, nz, nm, vx + off, vy + off, vz + off, m_fl, m_fcoeff, fsrrmax, rsm); |
| #endif |
| |
| #ifdef __HIPCC__ |
| //transfer up vx vy vz |
| hipMemcpyAsync(vx+off,d_vxl,sizeof(POSVEL_T)*cnt,hipMemcpyDeviceToHost,stream); |
| hipMemcpyAsync(vy+off,d_vyl,sizeof(POSVEL_T)*cnt,hipMemcpyDeviceToHost,stream); |
| hipMemcpyAsync(vz+off,d_vzl,sizeof(POSVEL_T)*cnt,hipMemcpyDeviceToHost,stream); |
| cudaCheckError(); |
| hipDeviceSynchronize(); |
| #endif |
| |
| } |
| |
| // Iterate through the tree nodes, for each leaf node, start a task. |
| // That task iterates through the tree nodes, skipping any node (and all |
| // of its children) if all corners are too far away. Then it compares the |
| // opening angle. |
| template <int TDPTS> |
| void RCBForceTree<TDPTS>::calcInternodeForces() |
| { |
| |
| std::vector<ID_T> q(1, 0); |
| std::vector<ID_T> parents; |
| while (!q.empty()) { |
| ID_T tl = q.back(); |
| if (tree[tl].cr == 0 && tree[tl].cl == 0) { |
| // This is a leaf node. |
| q.pop_back(); |
| |
| bool inside = true; |
| for (int i = 0; i < DIMENSION; ++i) { |
| inside &= (tree[tl].xmax[i] < maxForceRange[i] && tree[tl].xmax[i] > minForceRange[i]) || |
| (tree[tl].xmin[i] < maxForceRange[i] && tree[tl].xmin[i] > minForceRange[i]); |
| } |
| |
| if (inside) { |
| calcInternodeForce(tl, parents); |
| } |
| } else if (parents.size() > 0 && parents.back() == tl) { |
| // This is second time here; we've done with all children. |
| parents.pop_back(); |
| q.pop_back(); |
| } else { |
| // This is the first time at this parent node, queue the children. |
| if (tree[tl].cl > 0) q.push_back(tree[tl].cl); |
| if (tree[tl].cr > 0) q.push_back(tree[tl].cr); |
| parents.push_back(tl); |
| } |
| } |
| } |
| |
| // Explicit template instantiation... |
| template class RCBForceTree<QUADRUPOLE_TDPTS>; |
| template class RCBForceTree<MONOPOLE_TDPTS>; |
| |