/*=========================================================================
Copyright (c) 2007, Los Alamos National Security, LLC
All rights reserved.
Copyright 2007. Los Alamos National Security, LLC.
This software was produced under U.S. Government contract DE-AC52-06NA25396
for Los Alamos National Laboratory (LANL), which is operated by
Los Alamos National Security, LLC for the U.S. Department of Energy.
The U.S. Government has rights to use, reproduce, and distribute this software.
NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,
EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.
If software is modified to produce derivative works, such modified software
should be clearly marked, so as not to confuse it with the version available
from LANL.
Additionally, redistribution and use in source and binary forms, with or
without modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
- Neither the name of Los Alamos National Security, LLC, Los Alamos National
Laboratory, LANL, the U.S. Government, nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
=========================================================================*/
/*=========================================================================
Copyright (c) 2011-2012 Argonne National Laboratory
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
=========================================================================*/
/*
BG/Q tuned version of HACC: 69.2% of peak performance on 96 racks of Sequoia
Argonne Leadership Computing Facility, Argonne, IL 60439
Vitali Morozov (morozov@anl.gov)
Hal Finkel (hfinkel@anl.gov)
*/
#include "hip/hip_runtime.h"
#include "Timings.h"
#include "RCBForceTree.h"
#include "Partition.h"
#include <cstring>
#include <cstdio>
#include <ctime>
#include <stdexcept>
#include <assert.h>
using namespace std;
#ifdef __HIPCC__
#include <cudaUtil.h>
#define TILEX 4 //Unroll factor in the x dimension, best if 2 or 4, could also add 8 but that is too many registers
#define TILEY 4 //Unroll factor in the y dimension, best if 2 or 4, could add 8 but that is too many registers
#define BLOCKX 32 //Block size in the x dimension (should be 32)
#define BLOCKY 4 //Block size in the y dimension
#define MAXX 32 //Maximum blocks in the X dimension, smaller=more reuse but less parallelism
#define MAXY 256 //Maximum blocks in the Y dimension, there isn't much reason to make this smaller
#define ALIGNX(n) ((n+TILEX-1)/TILEX*TILEX) //Rounds an integer up to a multiple of TILEX
#define ALIGNY(n) ((n+TILEY-1)/TILEY*TILEY) //Rounds an integer up to a multiple of TILEY
cudaDeviceSelector __selector__;
#endif
#ifdef __HIPCC__
#define __HOST__ __host__
#define __DEVICE__ __device__
#else
#define __HOST__
#define __DEVICE__
#endif
// References:
// Emanuel Gafton and Stephan Rosswog. A fast recursive coordinate bisection tree for
// neighbour search and gravity. Mon. Not. R. Astron. Soc. to appear, 2011.
// http://arxiv.org/abs/1108.0028v1
//
// Atsushi Kawai, Junichiro Makino and Toshikazu Ebisuzaki.
// Performance Analysis of High-Accuracy Tree Code Based on the Pseudoparticle
// Multipole Method. The Astrophysical Journal Supplement Series, 151:13-33, 2004.
// Related: http://arxiv.org/abs/astro-ph/0012041v1
//
// R. H. Hardin and N. J. Sloane
// New Spherical 4-Designs. Discrete Math, 106/107 255-264, 1992.
//
// The library of spherical designs:
// http://www2.research.att.com/~njas/sphdesigns/
namespace {
template <int TDPTS>
struct sphdesign {};
#define DECLARE_SPHDESIGN(TDPTS) \
template <> \
struct sphdesign<TDPTS> \
{ \
static const POSVEL_T x[TDPTS]; \
static const POSVEL_T y[TDPTS]; \
static const POSVEL_T z[TDPTS]; \
}; \
/**/
DECLARE_SPHDESIGN(1)
DECLARE_SPHDESIGN(2)
DECLARE_SPHDESIGN(3)
DECLARE_SPHDESIGN(4)
DECLARE_SPHDESIGN(6)
DECLARE_SPHDESIGN(12)
DECLARE_SPHDESIGN(14)
#undef DECLARE_SPHDESIGN
/* this is not a t-design, but puts the monopole moment
at the center of mass. */
const POSVEL_T sphdesign<1>::x[] = {
0
};
const POSVEL_T sphdesign<1>::y[] = {
0
};
const POSVEL_T sphdesign<1>::z[] = {
0
};
const POSVEL_T sphdesign<2>::x[] = {
1.0,
-1.0
};
const POSVEL_T sphdesign<2>::y[] = {
0,
0
};
const POSVEL_T sphdesign<2>::z[] = {
0,
0
};
const POSVEL_T sphdesign<3>::x[] = {
1.0,
-.5,
-.5
};
const POSVEL_T sphdesign<3>::y[] = {
0,
.86602540378443864675,
-.86602540378443864675
};
const POSVEL_T sphdesign<3>::z[] = {
0,
0,
0
};
const POSVEL_T sphdesign<4>::x[] = {
.577350269189625763,
.577350269189625763,
-.577350269189625763,
-.577350269189625763
};
const POSVEL_T sphdesign<4>::y[] = {
.577350269189625763,
-.577350269189625763,
.577350269189625763,
-.577350269189625763
};
const POSVEL_T sphdesign<4>::z[] = {
.577350269189625763,
-.577350269189625763,
-.577350269189625763,
.577350269189625763
};
const POSVEL_T sphdesign<6>::x[] = {
1.0,
-1.0,
0,
0,
0,
0
};
const POSVEL_T sphdesign<6>::y[] = {
0,
0,
1.0,
-1.0,
0,
0
};
const POSVEL_T sphdesign<6>::z[] = {
0,
0,
0,
0,
1.0,
-1.0
};
// This is a 3-D 12-point spherical 4-design
// (the vertices of an icosahedron) from Hardin and Sloane.
const POSVEL_T sphdesign<12>::x[] = {
0,
0,
0.525731112119134,
-0.525731112119134,
0.85065080835204,
-0.85065080835204,
0,
0,
-0.525731112119134,
0.525731112119134,
-0.85065080835204,
0.85065080835204
};
const POSVEL_T sphdesign<12>::y[] = {
0.85065080835204,
0.85065080835204,
0,
0,
0.525731112119134,
0.525731112119134,
-0.85065080835204,
-0.85065080835204,
0,
0,
-0.525731112119134,
-0.525731112119134
};
const POSVEL_T sphdesign<12>::z[] = {
0.525731112119134,
-0.525731112119134,
0.85065080835204,
0.85065080835204,
0,
0,
-0.525731112119134,
0.525731112119134,
-0.85065080835204,
-0.85065080835204,
0,
0
};
// This is a 3-D 14-point spherical 4-design by
// R. H. Hardin and N. J. A. Sloane.
const POSVEL_T sphdesign<14>::x[] = {
1.0e0,
5.947189772040725e-1,
5.947189772040725e-1,
5.947189772040725e-1,
-5.947189772040725e-1,
-5.947189772040725e-1,
-5.947189772040725e-1,
3.012536847870683e-1,
3.012536847870683e-1,
3.012536847870683e-1,
-3.012536847870683e-1,
-3.012536847870683e-1,
-3.012536847870683e-1,
-1.0e0
};
const POSVEL_T sphdesign<14>::y[] = {
0.0e0,
1.776539926025823e-1,
-7.678419429698292e-1,
5.90187950367247e-1,
1.776539926025823e-1,
5.90187950367247e-1,
-7.678419429698292e-1,
8.79474443923065e-1,
-7.588425179318781e-1,
-1.206319259911869e-1,
8.79474443923065e-1,
-1.206319259911869e-1,
-7.588425179318781e-1,
0.0e0
};
const POSVEL_T sphdesign<14>::z[] = {
0.0e0,
7.840589244857197e-1,
-2.381765915652909e-1,
-5.458823329204288e-1,
-7.840589244857197e-1,
5.458823329204288e-1,
2.381765915652909e-1,
3.684710570566285e-1,
5.774116818882528e-1,
-9.458827389448813e-1,
-3.684710570566285e-1,
9.458827389448813e-1,
-5.774116818882528e-1,
0.0e0
};
} // anonymous namespace
// Note: In Gafton and Rosswog the far-field force contribution is calculated
// per-cell (at the center of mass), and then a Taylor expansion about the center
// of mass is used to calculate the force on the individual particles. For this to
// work, the functional form of the force must be known (because the Jacobian
// and Hessian are required). Here, however, the functional form is not known,
// and so the pseudo-particle method of Makino is used instead.
template <int TDPTS>
RCBForceTree<TDPTS>::RCBForceTree(
POSVEL_T* minLoc,
POSVEL_T* maxLoc,
POSVEL_T* minForceLoc,
POSVEL_T* maxForceLoc,
ID_T count,
POSVEL_T* xLoc,
POSVEL_T* yLoc,
POSVEL_T* zLoc,
POSVEL_T* xVel,
POSVEL_T* yVel,
POSVEL_T* zVel,
POSVEL_T* ms,
POSVEL_T* phiLoc,
ID_T *idLoc,
MASK_T *maskLoc,
POSVEL_T avgMass,
POSVEL_T fsm,
POSVEL_T r,
POSVEL_T oa,
ID_T nd,
ID_T ds,
ID_T tmin,
ForceLaw *fl,
float fcoeff,
POSVEL_T ppc)
{
#ifdef __HIPCC__
//nvtxRangeId_t r0;
//r0=nvtxRangeStartA("RCBForceTree()");
#endif
// Extract the contiguous data block from a vector pointer
particleCount = count;
xx = xLoc;
yy = yLoc;
zz = zLoc;
vx = xVel;
vy = yVel;
vz = zVel;
mass = ms;
numThreads=1;
// static size for the interaction list
#define VMAX ALIGNY(16384)
nx_v=(POSVEL_T*)malloc(VMAX*sizeof(POSVEL_T)*numThreads);
ny_v=(POSVEL_T*)malloc(VMAX*sizeof(POSVEL_T)*numThreads);
nz_v=(POSVEL_T*)malloc(VMAX*sizeof(POSVEL_T)*numThreads);
nm_v=(POSVEL_T*)malloc(VMAX*sizeof(POSVEL_T)*numThreads);
#ifdef __HIPCC__
hipHostRegister(nx_v,VMAX*sizeof(POSVEL_T)*numThreads,0);
hipHostRegister(ny_v,VMAX*sizeof(POSVEL_T)*numThreads,0);
hipHostRegister(nz_v,VMAX*sizeof(POSVEL_T)*numThreads,0);
hipHostRegister(nm_v,VMAX*sizeof(POSVEL_T)*numThreads,0);
hipHostRegister(xx,count*sizeof(POSVEL_T),0);
hipHostRegister(yy,count*sizeof(POSVEL_T),0);
hipHostRegister(zz,count*sizeof(POSVEL_T),0);
hipHostRegister(mass,count*sizeof(POSVEL_T),0);
hipHostRegister(vx,count*sizeof(POSVEL_T),0);
hipHostRegister(vy,count*sizeof(POSVEL_T),0);
hipHostRegister(vz,count*sizeof(POSVEL_T),0);
int size=ALIGNY(nd);
hipMalloc(&d_xx,size*sizeof(POSVEL_T)*numThreads);
hipMalloc(&d_yy,size*sizeof(POSVEL_T)*numThreads);
hipMalloc(&d_zz,size*sizeof(POSVEL_T)*numThreads);
hipMalloc(&d_vx,size*sizeof(POSVEL_T)*numThreads);
hipMalloc(&d_vy,size*sizeof(POSVEL_T)*numThreads);
hipMalloc(&d_vz,size*sizeof(POSVEL_T)*numThreads);
hipMalloc(&d_mass,size*sizeof(POSVEL_T)*numThreads);
hipMalloc(&d_nx_v,VMAX*sizeof(POSVEL_T)*numThreads);
hipMalloc(&d_ny_v,VMAX*sizeof(POSVEL_T)*numThreads);
hipMalloc(&d_nz_v,VMAX*sizeof(POSVEL_T)*numThreads);
hipMalloc(&d_nm_v,VMAX*sizeof(POSVEL_T)*numThreads);
cudaCheckError();
event_v=(hipEvent_t*)malloc(sizeof(hipEvent_t)*numThreads);
stream_v=(hipStream_t*)malloc(sizeof(hipStream_t)*numThreads);
for(int i=0;i<numThreads;i++) {
hipEventCreate(&event_v[i]);
hipStreamCreate(&stream_v[i]);
}
cudaCheckError();
#endif
phi = phiLoc;
id = idLoc;
mask = maskLoc;
particleMass = avgMass;
fsrrmax = fsm;
rsm = r;
sinOpeningAngle = sinf(oa);
tanOpeningAngle = tanf(oa);
nDirect = nd;
depthSafety = ds;
taskPartMin = tmin;
ppContract = ppc;
// Find the grid size of this chaining mesh
for (int dim = 0; dim < DIMENSION; dim++) {
minRange[dim] = minLoc[dim];
maxRange[dim] = maxLoc[dim];
minForceRange[dim] = minForceLoc[dim];
maxForceRange[dim] = maxForceLoc[dim];
}
if (fl) {
m_own_fl = false;
m_fl = fl;
m_fcoeff = fcoeff;
} else {
//maybe change this to Newton's law or something
m_own_fl = true;
m_fl = new ForceLawNewton();
m_fcoeff = 1.0;
}
// Because the tree may be built in parallel, and no efficient way of locking
// the tree seems to be available in OpenMP (no reader/writer locks, etc.),
// we just estimate the number of tree nodes that will be needed. Hopefully,
// this will be an overestimate. If we need more than this, then tree nodes
// that really should be subdivided will not be.
//
// If the tree were perfectly balanced, then it would have a depth of
// log_2(particleCount/nDirect). The tree needs to have (2^depth)+1 entries.
// To that, a safety factor is added to the depth.
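// As a rough illustration (numbers chosen only for the example): with
// particleCount = 1,000,000, nDirect = 128 and depthSafety = 2, the line below
// reserves ((ID_T)(1000000/128.0) << 2) + 1 = (7812 << 2) + 1 = 31249 nodes.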
ID_T nds = (((ID_T)(particleCount/(POSVEL_T)nDirect)) << depthSafety) + 1;
tree.reserve(nds);
int nthreads = 1;
timespec b_start, b_end;
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &b_start);
// Create the recursive RCB tree from the particle locations
#ifdef __HIPCC__
//nvtxRangeId_t r1;
//r1=nvtxRangeStartA("createRCBForceTree");
#endif
createRCBForceTree();
#ifdef __HIPCC__
//nvtxRangeEnd(r1);
#endif
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &b_end);
double b_time = (b_end.tv_sec - b_start.tv_sec);
b_time += 1e-9*(b_end.tv_nsec - b_start.tv_nsec);
printStats(b_time);
// Interaction lists.
inx.resize(nthreads);
iny.resize(nthreads);
inz.resize(nthreads);
inm.resize(nthreads);
iq.resize(nthreads);
#ifdef __HIPCC__
//r1=nvtxRangeStartA("calcInternodeForces");
#endif
calcInternodeForces();
#ifdef __HIPCC__
//nvtxRangeEnd(r1);
#endif
#ifdef __HIPCC__
//nvtxRangeEnd(r0);
#endif
}
template <int TDPTS>
RCBForceTree<TDPTS>::~RCBForceTree()
{
#ifdef __HIPCC__
//nvtxRangeId_t r0;
//r0=nvtxRangeStartA("~RCBForceTree");
#endif
if (m_own_fl) {
delete m_fl;
}
#ifdef __HIPCC__
hipHostUnregister(xx);
hipHostUnregister(yy);
hipHostUnregister(zz);
hipHostUnregister(mass);
hipHostUnregister(vx);
hipHostUnregister(vy);
hipHostUnregister(vz);
hipHostUnregister(nx_v);
hipHostUnregister(ny_v);
hipHostUnregister(nz_v);
hipHostUnregister(nm_v);
hipFree(d_xx);
hipFree(d_yy);
hipFree(d_zz);
hipFree(d_vx);
hipFree(d_vy);
hipFree(d_vz);
hipFree(d_mass);
hipFree(d_nx_v);
hipFree(d_ny_v);
hipFree(d_nz_v);
hipFree(d_nm_v);
cudaCheckError();
for(int i=0;i<numThreads;i++) {
hipEventDestroy(event_v[i]);
hipStreamDestroy(stream_v[i]);
}
cudaCheckError();
free(event_v);
free(stream_v);
#endif
free(nx_v);
free(ny_v);
free(nz_v);
free(nm_v);
#ifdef __HIPCC__
//nvtxRangeEnd(r0);
#endif
}
template <int TDPTS>
void RCBForceTree<TDPTS>::printStats(double buildTime)
{
size_t zeroLeafNodes = 0;
size_t nonzeroLeafNodes = 0;
size_t maxPPN = 0;
size_t leafParts = 0;
for (ID_T tl = 1; tl < (ID_T) tree.size(); ++tl) {
if (tree[tl].cl == 0 && tree[tl].cr == 0) {
if (tree[tl].count > 0) {
++nonzeroLeafNodes;
leafParts += tree[tl].count;
maxPPN = std::max((size_t) tree[tl].count, maxPPN);
} else {
++zeroLeafNodes;
}
}
}
double localParticleCount = particleCount;
double localTreeSize = tree.size();
double localTreeCapacity = tree.capacity();
double localLeaves = zeroLeafNodes+nonzeroLeafNodes;
double localEmptyLeaves = zeroLeafNodes;
double localMeanPPN = leafParts/((double) nonzeroLeafNodes);
unsigned long localMaxPPN = maxPPN;
double localBuildTime = buildTime;
/*
double globalParticleCount;
double globalTreeSize;
double globalTreeCapacity;
double globalLeaves;
double globalEmptyLeaves;
double globalMeanPPN;
unsigned long globalMaxPPN;
double globalBuildTime;
bool printHere = true;
*/
if ( Partition::getMyProc() == 0 ) {
printf("\ttree post-build statistics (local for rank 0):\n");
printf("\t\tparticles: %.2f\n", localParticleCount);
printf("\t\tnodes: %.2f (allocated: %.2f)\n", localTreeSize, localTreeCapacity);
printf("\t\tleaves: %.2f (empty: %.2f)\n", localLeaves, localEmptyLeaves);
printf("\t\tmean ppn: %.2f (max ppn: %lu)\n", localMeanPPN, localMaxPPN);
printf("\t\tbuild time: %g s\n", localBuildTime);
}
}
extern "C" void cm(ID_T count, const POSVEL_T* xx, const POSVEL_T* yy,
const POSVEL_T* zz, const POSVEL_T* mass,
POSVEL_T* xmin, POSVEL_T* xmax, POSVEL_T* xc);
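// Smallest distance from the expansion center xc to any face of the bounding
// box [xmin, xmax]; callers scale this by ppContract to obtain the
// pseudoparticle sphere radius tdr.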
static inline POSVEL_T pptdr(const POSVEL_T* xmin, const POSVEL_T* xmax, const POSVEL_T* xc)
{
return std::min(xmax[0] - xc[0], std::min(xmax[1] - xc[1], std::min(xmax[2] - xc[2], std::min(xc[0] - xmin[0],
std::min(xc[1] - xmin[1], xc[2] - xmin[2])))));
}
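// Place the TDPTS points of the spherical t-design on a sphere of radius tdr
// centered at xc; the results are the pseudoparticle positions.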
template <int TDPTS>
static inline void pppts(POSVEL_T tdr, const POSVEL_T* xc,
POSVEL_T* ppx, POSVEL_T* ppy, POSVEL_T* ppz)
{
for (int i = 0; i < TDPTS; ++i) {
ppx[i] = tdr*sphdesign<TDPTS>::x[i] + xc[0];
ppy[i] = tdr*sphdesign<TDPTS>::y[i] + xc[1];
ppz[i] = tdr*sphdesign<TDPTS>::z[i] + xc[2];
}
}
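// Assign masses to the TDPTS pseudoparticles so that they approximate the
// low-order multipole moments (through quadrupole) of the count particles
// about xc, following the pseudoparticle multipole method of Kawai, Makino and
// Ebisuzaki referenced above. Each particle i contributes to design point j
//   m_i/K * (1 + 3*(r_i/tdr)*P1(cos g_ij) + 5*(r_i/tdr)^2*P2(cos g_ij)),
// where K = TDPTS, g_ij is the angle between particle i and design point j as
// seen from xc, and P1, P2 are Legendre polynomials.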
template <int TDPTS>
static inline void pp(ID_T count, const POSVEL_T* xx, const POSVEL_T* yy,
const POSVEL_T* zz, const POSVEL_T* mass, const POSVEL_T* xc,
const POSVEL_T* ppx, const POSVEL_T* ppy, const POSVEL_T* ppz,
POSVEL_T* ppm, POSVEL_T tdr)
{
POSVEL_T K = TDPTS;
POSVEL_T odr0 = 1/K;
for (int i = 0; i < count; ++i) {
POSVEL_T xi = xx[i] - xc[0];
POSVEL_T yi = yy[i] - xc[1];
POSVEL_T zi = zz[i] - xc[2];
POSVEL_T ri = sqrtf(xi*xi + yi*yi + zi*zi);
for (int j = 0; j < TDPTS; ++j) {
POSVEL_T xj = ppx[j] - xc[0];
POSVEL_T yj = ppy[j] - xc[1];
POSVEL_T zj = ppz[j] - xc[2];
POSVEL_T rj2 = xj*xj + yj*yj + zj*zj;
POSVEL_T odr1 = 0, odr2 = 0;
if (rj2 != 0) {
POSVEL_T rj = sqrtf(rj2);
POSVEL_T aij = (xi*xj + yi*yj + zi*zj)/(ri*rj);
odr1 = (3/K)*(ri/tdr)*aij;
odr2 = (5/K)*(ri/tdr)*(ri/tdr)*0.5*(3*aij*aij - 1);
}
ppm[j] += mass[i]*(odr0 + odr1 + odr2);
}
}
}
#ifdef __HIPCC__
typedef long long int int64;
template<typename T>
__device__ __forceinline__
T load(T *t)
{
return __ldg(t); //texture load
}
template<int TILE_SIZE, typename T>__device__ void loadT(T * out, const T * in);
//generic version (inefficient)
template<int TILE_SIZE, typename T>
__device__ __forceinline__
void loadT(T * out, const T * in) {
#pragma unroll
for(int i=0;i<TILE_SIZE;i++) {
out[i]=__ldg(in+i);
}
}
//Vector loads
template<>
__device__ __forceinline__
void loadT<2,float>(float * out, const float * in) {
*reinterpret_cast<float2*>(out)=load(reinterpret_cast<const float2*>(in));
}
template<>
__device__ __forceinline__
void loadT<4,float>(float * out, const float * in) {
*reinterpret_cast<float4*>(out)=load(reinterpret_cast<const float4*>(in));
}
//static __device__ __forceinline__ float __internal_fast_rsqrtf(float a)
//{
// float r;
// asm ("rsqrt.approx.ftz.f32 %0,%1;" : "=f"(r) : "f"(a));
// return r;
//}
//computes the forces between tiles i and j, adds the change in force to xi,yi,zi
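//The pairwise factor is f = m_j / (r^2 + rsm^2)^(3/2), i.e. a softened
//inverse-square law divided by r, and is applied only when 0 < r^2 < fsrrmax2;
//the polynomial correction using ma0..ma5 is currently commented out below.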
template<int TX, int TY>
__device__ __forceinline__ void computeForces(POSVEL_T xxi[], POSVEL_T yyi[], POSVEL_T zzi[],
POSVEL_T xxj[], POSVEL_T yyj[], POSVEL_T zzj[], POSVEL_T massj[],
POSVEL_T xi[], POSVEL_T yi[], POSVEL_T zi[],
POSVEL_T ma0, POSVEL_T ma1, POSVEL_T ma2, POSVEL_T ma3, POSVEL_T ma4, POSVEL_T ma5,
POSVEL_T mp_rsm2, POSVEL_T fsrrmax2) {
#pragma unroll
for(int i=0;i<TY;i++) {
#pragma unroll
for(int j=0;j<TX;j++) {
POSVEL_T dxc = xxj[j] - xxi[i]; //1 FADD
POSVEL_T dyc = yyj[j] - yyi[i]; //1 FADD
POSVEL_T dzc = zzj[j] - zzi[i]; //1 FADD
POSVEL_T r2 = dxc * dxc + dyc * dyc + dzc * dzc; //1 FMUL 2 FMA
POSVEL_T v=r2+mp_rsm2; //1 FADD
POSVEL_T v3=v*v*v; //2 FMUL
POSVEL_T f = __frsqrt_rn(v3); //1 MUFU,
// MDS: Should ask someone why this line is dangling
// - ( ma0 + r2*(ma1 + r2*(ma2 + r2*(ma3 + r2*(ma4 + r2*ma5))))); //5 FMA, 1 FADD
//#define BUG
#ifndef BUG
f*=massj[j]*(r2<fsrrmax2 && r2>0.0f); //2 FMUL, 1 FSETP, 1 FCMP
#else
f*=massj[j]; //1 FMUL
f*=(r2<fsrrmax2 && r2>0.0f); //1 FMUL, 1 FSETP, 1 FCMP
#endif
xi[i] = xi[i] + f * dxc; //1 FMA
yi[i] = yi[i] + f * dyc; //1 FMA
zi[i] = zi[i] + f * dzc; //1 FMA
}
}
}
//loads a tile from memory. Use checkBounds and loadMass to disable bounds check or mass load at compile time
template <bool checkBounds, bool loadMass, int TILE_SIZE>
__device__ __forceinline__
void loadTile(int i, int bounds,
const POSVEL_T* xx, const POSVEL_T* yy, const POSVEL_T* zz, const POSVEL_T* mass,
POSVEL_T xxi[], POSVEL_T yyi[], POSVEL_T zzi[], POSVEL_T massi[]) {
if(checkBounds) {
#pragma unroll
for(int64 u=0;u<TILE_SIZE;u++) {
int64 idx=TILE_SIZE*i+u; // 1 IMAD
#if 1
bool cond=idx<bounds;
xxi[u] = (cond) ? load(xx+idx) : 0.0f; // 1 ISETP, 1 LDG, 2 IMAD, 1 MOV
yyi[u] = (cond) ? load(yy+idx) : 0.0f; // 1 ISETP, 1 LDG, 2 IMAD, 1 MOV
zzi[u] = (cond) ? load(zz+idx) : 0.0f; // 1 ISETP, 1 LDG, 2 IMAD, 1 MOV
if(loadMass) massi[u] = (cond) ? load(mass+idx) : 0.0f; // 1 ISETP, 1 LDG, 2 IMAD, 1 MOV
#else
massi[u] = 0.0f; //1 MOV
if(idx<bounds) { //1 ISETP, 1 BRA
xxi[u] = load(xx+idx); //1 LDG, 2 IMAD
yyi[u] = load(yy+idx); //1 LDG, 2 IMAD
zzi[u] = load(zz+idx); //1 LDG, 2 IMAD
if(loadMass) massi[u] = load(mass+idx); //1 LDG, 2 IMAD
}
#endif
}
} else {
int idx=TILE_SIZE*i;
loadT<TILE_SIZE>(xxi,xx+idx); //1 LDG, 2 IMAD
loadT<TILE_SIZE>(yyi,yy+idx); //1 LDG, 2 IMAD
loadT<TILE_SIZE>(zzi,zz+idx); //1 LDG, 2 IMAD
if(loadMass) loadT<TILE_SIZE>(massi,mass+idx); //1 LDG, 2 IMAD
}
}
//applies the force in xi,yi,zi to update vx, vy, vz
//use checkBounds to disable bounds checking at compile time
template <bool checkBounds, int TILE_SIZE>
__device__ __forceinline__
void applyForce(int i, int bounds,POSVEL_T fcoeff,
const POSVEL_T xi[], const POSVEL_T yi[], const POSVEL_T zi[],
POSVEL_T *vx, POSVEL_T *vy, POSVEL_T *vz) {
#pragma unroll
for(int u=0;u<TILE_SIZE;u++) {
int idx=TILE_SIZE*i+u; //1 IMAD
if(!checkBounds || idx<bounds)
{ //1 ISETP
atomicWarpReduceAndUpdate(vx+idx,fcoeff * xi[u]); //2 IMAD, 6 FADD
atomicWarpReduceAndUpdate(vy+idx,fcoeff * yi[u]); //2 IMAD, 6 FADD
atomicWarpReduceAndUpdate(vz+idx,fcoeff * zi[u]); //2 IMAD, 6 FADD
}
}
}
//Tell the compiler how many blocks we expect to be active.
//This gives the compiler a better idea of how many registers to use. The second number is tunable.
__launch_bounds__(BLOCKX*BLOCKY,7)
__global__
void Step10_cuda_kernel(int count, int count1,
const POSVEL_T* xx, const POSVEL_T* yy,
const POSVEL_T* zz, const POSVEL_T* mass,
const POSVEL_T* xx1, const POSVEL_T* yy1,
const POSVEL_T* zz1, const POSVEL_T* mass1,
POSVEL_T* vx, POSVEL_T* vy,
POSVEL_T* vz, POSVEL_T fsrrmax2, POSVEL_T mp_rsm2, POSVEL_T fcoeff)
{
const POSVEL_T ma0 = 0.269327, ma1 = -0.0750978, ma2 = 0.0114808, ma3 = -0.00109313, ma4 = 0.0000605491, ma5 = -0.00000147177;
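//Coefficients for the polynomial correction in computeForces; note that the
//line there which would apply them is currently commented out (see the
//dangling-line note in computeForces).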
//Register arrays to hold tiles of data
POSVEL_T xxi[TILEY];
POSVEL_T yyi[TILEY];
POSVEL_T zzi[TILEY];
POSVEL_T xxj[TILEX];
POSVEL_T yyj[TILEX];
POSVEL_T zzj[TILEX];
POSVEL_T massj[TILEX];
//loop over interior region and calculate forces.
//for each tile i
for(int i=hipBlockIdx_y*hipBlockDim_y+hipThreadIdx_y;i<count/TILEY;i+=hipBlockDim_y*hipGridDim_y) //1 ISETP
{
POSVEL_T xi[TILEY]={0}; //TILEY MOV
POSVEL_T yi[TILEY]={0}; //TILEY MOV
POSVEL_T zi[TILEY]={0}; //TILEY MOV
//load tile i,mass and bounds check are not needed
loadTile<false,false,TILEY>(i,count,xx,yy,zz,NULL,xxi,yyi,zzi,NULL);
//for each tile j
for (int j=hipBlockIdx_x*hipBlockDim_x+hipThreadIdx_x;j<count1/TILEX;j+=hipBlockDim_x*hipGridDim_x) //1 ISETP
{
//load tile j, bounds check is not needed
loadTile<false,true,TILEX>(j,count1,xx1,yy1,zz1,mass1,xxj,yyj,zzj,massj);
//compute forces between tile i and tile j
computeForces<TILEX,TILEY>(xxi,yyi,zzi,xxj,yyj,zzj,massj,xi,yi,zi,ma0,ma1,ma2,ma3,ma4,ma5,mp_rsm2,fsrrmax2);
}
//process remaining elements at the end, use TILEX=1
for (int j=count1/TILEX*TILEX+hipBlockIdx_x*hipBlockDim_x+hipThreadIdx_x;j<count1;j+=hipBlockDim_x*hipGridDim_x) //1 ISETP
{
//load tile j, bounds check is needed, mass is needed
loadTile<true,true,1>(j,count1,xx1,yy1,zz1,mass1,xxj,yyj,zzj,massj);
//compute forces between tile i and tile j
computeForces<1,TILEY>(xxi,yyi,zzi,xxj,yyj,zzj,massj,xi,yi,zi,ma0,ma1,ma2,ma3,ma4,ma5,mp_rsm2,fsrrmax2);
}
//apply the force we have calculated above, bounds check is not needed
applyForce<false,TILEY>(i,count,fcoeff,xi,yi,zi,vx,vy,vz);
}
//At this point we have computed almost all interactions.
//However we still need to add contributions for particles at the end
#if 1
//process remaining elements at the end, use TILEY=1
//for each tile i
for(int i=count/TILEY*TILEY+hipBlockIdx_y*hipBlockDim_y+hipThreadIdx_y;i<count;i+=hipBlockDim_y*hipGridDim_y) //1 ISETP
{
POSVEL_T xi[1]={0}; //1 MOV
POSVEL_T yi[1]={0}; //1 MOV
POSVEL_T zi[1]={0}; //1 MOV
//load xxi, yyi, zzi tiles, mass is not needed, bounds check is needed
loadTile<true,false,1>(i,count,xx,yy,zz,NULL,xxi,yyi,zzi,NULL);
//for each tile j
for (int j=hipBlockIdx_x*hipBlockDim_x+hipThreadIdx_x;j<count1/TILEX;j+=hipBlockDim_x*hipGridDim_x) //1 ISETP
{
//load tile j, bounds check is not needed
loadTile<false,true,TILEX>(j,count1,xx1,yy1,zz1,mass1,xxj,yyj,zzj,massj);
//compute forces between tile i and tile j
computeForces<TILEX,1>(xxi,yyi,zzi,xxj,yyj,zzj,massj,xi,yi,zi,ma0,ma1,ma2,ma3,ma4,ma5,mp_rsm2,fsrrmax2);
}
//process remaining elements at the end, use TILEX=1
for (int j=count1/TILEX*TILEX+hipBlockIdx_x*hipBlockDim_x+hipThreadIdx_x;j<count1;j+=hipBlockDim_x*hipGridDim_x) //1 ISETP
{
//load tile j, bounds check is needed, mass is needed
loadTile<true,true,1>(j,count1,xx1,yy1,zz1,mass1,xxj,yyj,zzj,massj);
//compute forces between tile i and tile j
computeForces<1,1>(xxi,yyi,zzi,xxj,yyj,zzj,massj,xi,yi,zi,ma0,ma1,ma2,ma3,ma4,ma5,mp_rsm2,fsrrmax2);
}
applyForce<true,1>(i,count,fcoeff,xi,yi,zi,vx,vy,vz);
}
#endif
}
#endif
#ifdef __bgq__
extern "C" Step16_int( int count1, float xxi, float yyi, float zzi, float fsrrmax2, float mp_rsm2, const float *xx1, const float *yy1, const float *zz1,const float *mass1, float *ax, float *ay, float *az );
#endif
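// Direct particle-particle interactions: accumulate the short-range force on
// the count target particles (xx, yy, zz) from the count1 particles of the
// interaction list (xx1, yy1, zz1, mass1), updating the target velocities
// vx, vy, vz. The HIP build launches Step10_cuda_kernel, the BG/Q build calls
// the Step16_int kernel, and the fallback is a plain double loop using the
// supplied ForceLaw.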
static inline void nbody1(ID_T count, ID_T count1, const POSVEL_T* xx, const POSVEL_T* yy,
const POSVEL_T* zz, const POSVEL_T* mass,
const POSVEL_T* xx1, const POSVEL_T* yy1,
const POSVEL_T* zz1, const POSVEL_T* mass1,
POSVEL_T* vx, POSVEL_T* vy, POSVEL_T* vz,
ForceLaw *fl, float fcoeff, float fsrrmax, float rsm
#ifdef __HIPCC__
, hipStream_t stream
#endif
)
{
POSVEL_T fsrrmax2 = fsrrmax*fsrrmax;
POSVEL_T rsm2 = rsm*rsm;
#ifdef __bgq__
float ax = 0.0f, ay = 0.0f, az = 0.0f;
for (int i = 0; i < count; ++i)
{
Step16_int ( count1, xx[i],yy[i],zz[i], fsrrmax2,rsm2,xx1,yy1,zz1,mass1, &ax, &ay, &az );
vx[i] = vx[i] + ax * fcoeff;
vy[i] = vy[i] + ay * fcoeff;
vz[i] = vz[i] + az * fcoeff;
}
#else
#ifdef __HIPCC__
//const int MAXX=64;
//const int MAXY=64;
dim3 threads(BLOCKX,BLOCKY);
int blocksX=(count1+threads.x-1)/threads.x;
int blocksY=(count+threads.y-1)/threads.y;
dim3 blocks( min(blocksX,MAXX), min(blocksY,MAXY));
//call kernel
cudaCheckError();
//printf("count: %d, count1: %d\n", count,count1);
#if 0
checkCudaPtr(xx,"xx");
checkCudaPtr(yy,"yy");
checkCudaPtr(zz,"zz");
checkCudaPtr(mass,"mass");
checkCudaPtr(xx1,"xx1");
checkCudaPtr(yy1,"yy1");
checkCudaPtr(zz1,"zz1");
checkCudaPtr(mass1,"mass1");
checkCudaPtr(vx,"vx");
checkCudaPtr(vy,"vy");
checkCudaPtr(vz,"vz");
#endif
hipLaunchKernelGGL(Step10_cuda_kernel, dim3(blocks), dim3(threads), 0, stream, count,count1,xx,yy,zz,mass,xx1,yy1,zz1,mass1, vx, vy, vz, fsrrmax2, rsm2, fcoeff);
cudaCheckError();
//hipDeviceSynchronize();
//exit(0);
#else
for (int i = 0; i < count; ++i)
for (int j = 0; j < count1; ++j) {
POSVEL_T dx = xx1[j] - xx[i];
POSVEL_T dy = yy1[j] - yy[i];
POSVEL_T dz = zz1[j] - zz[i];
POSVEL_T dist2 = dx*dx + dy*dy + dz*dz;
POSVEL_T f_over_r = mass[i]*mass1[j] * fl->f_over_r(dist2);
POSVEL_T updateq = 1.0;
updateq *= (dist2 < fsrrmax2);
vx[i] += updateq*fcoeff*f_over_r*dx;
vy[i] += updateq*fcoeff*f_over_r*dy;
vz[i] += updateq*fcoeff*f_over_r*dz;
}
#endif //end __HIPCC__
#endif //end __bgq__
}
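// Partition the n particles in place: those with xx < pv are moved to the
// front (all per-particle arrays are permuted consistently) and their number
// is returned. Used to split a tree node's particles about its split plane.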
static inline ID_T partition(ID_T n,
POSVEL_T* xx, POSVEL_T* yy, POSVEL_T* zz,
POSVEL_T* vx, POSVEL_T* vy, POSVEL_T* vz,
POSVEL_T* mass, POSVEL_T* phi,
ID_T* id, MASK_T* mask, POSVEL_T pv
)
{
float t0, t1, t2, t3, t4, t5, t6, t7;
int32_t is, i, j;
long i0;
uint16_t i1;
int idx[n];
is = 0;
for ( i = 0; i < n; i = i + 1 )
{
if (xx[i] < pv)
{
idx[is] = i;
is = is + 1;
}
}
#pragma unroll (4)
for ( j = 0; j < is; j++ )
{
i = idx[j];
t6 = mass[i]; mass[i] = mass[j]; mass[j] = t6;
t7 = phi [i]; phi [i] = phi [j]; phi [j] = t7;
i1 = mask[i]; mask[i] = mask[j]; mask[j] = i1;
i0 = id [i]; id [i] = id [j]; id [j] = i0;
}
#pragma unroll (4)
for ( j = 0; j < is; j++ )
{
i = idx[j];
t0 = xx[i]; xx[i] = xx[j]; xx[j] = t0;
t1 = yy[i]; yy[i] = yy[j]; yy[j] = t1;
t2 = zz[i]; zz[i] = zz[j]; zz[j] = t2;
t3 = vx[i]; vx[i] = vx[j]; vx[j] = t3;
t4 = vy[i]; vy[i] = vy[j]; vy[j] = t4;
t5 = vz[i]; vz[i] = vz[j]; vz[j] = t5;
}
return is;
}
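// Split node tl along dimension d: partition its particles in place about the
// split coordinate (the center of mass in that dimension, unless geoSplit is
// enabled), assign the two halves to the children tlcl and tlcr, and recurse
// into each non-empty child.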
template <int TDPTS>
void RCBForceTree<TDPTS>::createRCBForceSubtree(int d, ID_T tl, ID_T tlcl, ID_T tlcr)
{
POSVEL_T *x1, *x2, *x3;
switch (d) {
case 0:
x1 = xx;
x2 = yy;
x3 = zz;
break;
case 1:
x1 = yy;
x2 = zz;
x3 = xx;
break;
/*case 2*/ default:
x1 = zz;
x2 = xx;
x3 = yy;
break;
}
#ifdef __bgq__
int tid = 0;
#endif
const bool geoSplit = false;
POSVEL_T split = geoSplit ? (tree[tl].xmax[d]+tree[tl].xmin[d])/2 : tree[tl].xc[d];
ID_T is = ::partition(tree[tl].count, x1 + tree[tl].offset, x2 + tree[tl].offset, x3 + tree[tl].offset,
vx + tree[tl].offset, vy + tree[tl].offset, vz + tree[tl].offset,
mass + tree[tl].offset, phi + tree[tl].offset,
id + tree[tl].offset, mask + tree[tl].offset, split
);
if (is == 0 || is == tree[tl].count) {
return;
}
tree[tlcl].count = is;
tree[tlcr].count = tree[tl].count - tree[tlcl].count;
if (tree[tlcl].count > 0) {
tree[tl].cl = tlcl;
tree[tlcl].offset = tree[tl].offset;
tree[tlcl].xmax[d] = split;
createRCBForceTreeInParallel(tlcl);
}
if (tree[tlcr].count > 0) {
tree[tl].cr = tlcr;
tree[tlcr].offset = tree[tl].offset + tree[tlcl].count;
tree[tlcr].xmin[d] = split;
createRCBForceTreeInParallel(tlcr);
}
}
// This is basically the algorithm from (Gafton and Rosswog, 2011).
template <int TDPTS>
void RCBForceTree<TDPTS>::createRCBForceTreeInParallel(ID_T tl)
{
ID_T cnt = tree[tl].count;
ID_T off = tree[tl].offset;
// Compute the center-of-mass coordinates (and recompute the min/max)
::cm(cnt, xx + off, yy + off, zz + off, mass + off,
tree[tl].xmin, tree[tl].xmax, tree[tl].xc);
if (cnt <= nDirect) {
// The pseudoparticles
tree[tl].tdr = ppContract*::pptdr(tree[tl].xmin, tree[tl].xmax, tree[tl].xc);
memset(tree[tl].ppm, 0, sizeof(POSVEL_T)*TDPTS);
if (cnt > TDPTS) { // Otherwise, the pseudoparticles are never used
POSVEL_T ppx[TDPTS], ppy[TDPTS], ppz[TDPTS];
pppts<TDPTS>(tree[tl].tdr, tree[tl].xc, ppx, ppy, ppz);
pp<TDPTS>(cnt, xx + off, yy + off, zz + off, mass + off, tree[tl].xc,
ppx, ppy, ppz, tree[tl].ppm, tree[tl].tdr);
}
return;
}
// Index of the right and left child levels
ID_T tlcl, tlcr;
{
tlcl = tree.size();
tlcr = tlcl+1;
size_t newSize = tlcr+1;
tree.resize(newSize);
}
memset(&tree[tlcl], 0, sizeof(TreeNode)*2);
// Both children have similar bounding boxes to the current node (the
// parent), so copy the bounding box here, and then overwrite the changed
// coordinate later.
for (int i = 0; i < DIMENSION; ++i) {
tree[tlcl].xmin[i] = tree[tl].xmin[i];
tree[tlcr].xmin[i] = tree[tl].xmin[i];
tree[tlcl].xmax[i] = tree[tl].xmax[i];
tree[tlcr].xmax[i] = tree[tl].xmax[i];
}
// Split the longest edge at the center of mass.
POSVEL_T xlen[DIMENSION];
for (int i = 0; i < DIMENSION; ++i) {
xlen[i] = tree[tl].xmax[i] - tree[tl].xmin[i];
}
int d;
if (xlen[0] > xlen[1] && xlen[0] > xlen[2]) {
d = 0; // Split in the x direction
}
else if (xlen[1] > xlen[2]) {
d = 1; // Split in the y direction
}
else {
d = 2; // Split in the z direction
}
createRCBForceSubtree(d, tl, tlcl, tlcr);
// Compute the pseudoparticles based on those of the children
POSVEL_T ppx[TDPTS], ppy[TDPTS], ppz[TDPTS];
tree[tl].tdr = ppContract*::pptdr(tree[tl].xmin, tree[tl].xmax, tree[tl].xc);
pppts<TDPTS>(tree[tl].tdr, tree[tl].xc, ppx, ppy, ppz);
memset(tree[tl].ppm, 0, sizeof(POSVEL_T)*TDPTS);
if (tree[tlcl].count > 0) {
if (tree[tlcl].count <= TDPTS) {
ID_T offc = tree[tlcl].offset;
pp<TDPTS>(tree[tlcl].count, xx + offc, yy + offc, zz + offc, mass + offc,
tree[tl].xc, ppx, ppy, ppz, tree[tl].ppm, tree[tl].tdr);
} else {
POSVEL_T ppxc[TDPTS], ppyc[TDPTS], ppzc[TDPTS];
pppts<TDPTS>(tree[tlcl].tdr, tree[tlcl].xc, ppxc, ppyc, ppzc);
pp<TDPTS>(TDPTS, ppxc, ppyc, ppzc, tree[tlcl].ppm, tree[tl].xc,
ppx, ppy, ppz, tree[tl].ppm, tree[tl].tdr);
}
}
if (tree[tlcr].count > 0) {
if (tree[tlcr].count <= TDPTS) {
ID_T offc = tree[tlcr].offset;
pp<TDPTS>(tree[tlcr].count, xx + offc, yy + offc, zz + offc, mass + offc,
tree[tl].xc, ppx, ppy, ppz, tree[tl].ppm, tree[tl].tdr);
} else {
POSVEL_T ppxc[TDPTS], ppyc[TDPTS], ppzc[TDPTS];
pppts<TDPTS>(tree[tlcr].tdr, tree[tlcr].xc, ppxc, ppyc, ppzc);
pp<TDPTS>(TDPTS, ppxc, ppyc, ppzc, tree[tlcr].ppm, tree[tl].xc,
ppx, ppy, ppz, tree[tl].ppm, tree[tl].tdr);
}
}
}
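// Build the tree: the root node covers the full bounding box and is then
// subdivided recursively.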
template <int TDPTS>
void RCBForceTree<TDPTS>::createRCBForceTree()
{
// The top of the tree is the entire box
tree.resize(1);
memset(&tree[0], 0, sizeof(TreeNode));
tree[0].count = particleCount;
tree[0].offset = 0;
for (int i = 0; i < DIMENSION; ++i) {
tree[0].xmin[i] = minRange[i];
tree[0].xmax[i] = maxRange[i];
}
createRCBForceTreeInParallel();
}
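// Compute the short-range forces on the particles of leaf node tl: walk the
// tree to build an interaction list of particles and pseudoparticles (subject
// to the opening-angle and fsrrmax criteria), then hand the list to nbody1
// (on the GPU when built with HIP).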
template <int TDPTS>
void RCBForceTree<TDPTS>::calcInternodeForce(ID_T tl,
const std::vector<ID_T> &parents) {
POSVEL_T fsrrmax2 = fsrrmax*fsrrmax;
const TreeNode* tree_ = &tree[0];
int tid = 0;
std::vector<ID_T> &q = iq[tid];
q.clear();
q.push_back(0);
POSVEL_T *nx=nx_v+tid*VMAX;
POSVEL_T *ny=ny_v+tid*VMAX;
POSVEL_T *nz=nz_v+tid*VMAX;
POSVEL_T *nm=nm_v+tid*VMAX;
#ifdef __HIPCC__
//Adjust pointers to this thread's workspace
POSVEL_T *d_nx=d_nx_v+tid*VMAX;
POSVEL_T *d_ny=d_ny_v+tid*VMAX;
POSVEL_T *d_nz=d_nz_v+tid*VMAX;
POSVEL_T *d_nm=d_nm_v+tid*VMAX;
int size=ALIGNY(nDirect);
POSVEL_T *d_xxl=d_xx+tid*size;
POSVEL_T *d_yyl=d_yy+tid*size;
POSVEL_T *d_zzl=d_zz+tid*size;
POSVEL_T *d_massl=d_mass+tid*size;
POSVEL_T *d_vxl=d_vx+tid*size;
POSVEL_T *d_vyl=d_vy+tid*size;
POSVEL_T *d_vzl=d_vz+tid*size;
hipEvent_t& event=event_v[tid];
hipStream_t& stream=stream_v[tid];
hipEventSynchronize(event); //wait for transfers from previous call to finish before overwriting nx,ny,nz,nm
cudaCheckError();
#endif
// The interaction list.
int SIZE = 0; // current size of these arrays
while (!q.empty()) {
ID_T tln = q.back();
q.pop_back();
// We should not interact with our own parents.
if (tln < tl) {
bool isParent = std::binary_search(parents.begin(), parents.end(), tln);
if (isParent) {
ID_T tlncr = tree_[tln].cr;
ID_T tlncl = tree_[tln].cl;
if (tlncl != tl && tlncl > 0 && tree_[tlncl].count > 0) {
q.push_back(tlncl);
}
if (tlncr != tl && tlncr > 0 && tree_[tlncr].count > 0) {
q.push_back(tlncr);
}
continue;
}
}
// Does this node have a small enough opening angle to interact with?
POSVEL_T dx = tree_[tln].xc[0] - tree_[tl].xc[0];
POSVEL_T dy = tree_[tln].xc[1] - tree_[tl].xc[1];
POSVEL_T dz = tree_[tln].xc[2] - tree_[tl].xc[2];
POSVEL_T dist2 = dx*dx + dy*dy + dz*dz;
POSVEL_T sx = tree_[tln].xmax[0]-tree_[tln].xmin[0];
POSVEL_T sy = tree_[tln].xmax[1]-tree_[tln].xmin[1];
POSVEL_T sz = tree_[tln].xmax[2]-tree_[tln].xmin[2];
POSVEL_T l2 = std::min(sx*sx, std::min(sy*sy, sz*sz)); // under-estimate
POSVEL_T dtt2 = dist2*tanOpeningAngle*tanOpeningAngle;
bool looksBig;
// l2/dist2 is really tan^2(theta); for small theta, tan(theta) ~ theta
if (l2 > dtt2) {
// the under-estimate is too big, so this is definitely too big
looksBig = true;
} else {
// There are 8 corner points of the remote node, and the maximum angular
// size will be from one of those points to its opposite points. So there
// are 8 vector dot products to compute to determine the maximum angular
// size at any given reference point. (do we need to do this for each point
// in the leaf node, or will the c.m. point be sufficient?).
looksBig = false;
for (int i = 0; i < 2; ++i)
for (int j = 0; j < 2; ++j) {
POSVEL_T x1 = (i == 0 ? tree_[tln].xmin : tree_[tln].xmax)[0] - tree_[tl].xc[0];
POSVEL_T y1 = (j == 0 ? tree_[tln].xmin : tree_[tln].xmax)[1] - tree_[tl].xc[1];
POSVEL_T z1 = tree_[tln].xmin[2] - tree_[tl].xc[2];
POSVEL_T x2 = (i == 0 ? tree_[tln].xmax : tree_[tln].xmin)[0] - tree_[tl].xc[0];
POSVEL_T y2 = (j == 0 ? tree_[tln].xmax : tree_[tln].xmin)[1] - tree_[tl].xc[1];
POSVEL_T z2 = tree_[tln].xmax[2] - tree_[tl].xc[2];
const bool useRealOA = false;
if (useRealOA) {
// |a x b| = a*b*sin(theta)
POSVEL_T cx = y1*z2 - z1*y2;
POSVEL_T cy = z1*x2 - x1*z2;
POSVEL_T cz = x1*y2 - y1*x2;
if ((cx*cx + cy*cy + cz*cz) > sinOpeningAngle*sinOpeningAngle*
(x1*x1 + y1*y1 + z1*z1)*(x2*x2 + y2*y2 + z2*z2)
) {
looksBig = true;
break;
}
} else {
// Instead of using the real opening angle, use the tan approximation; this is
// better than the opening-angle b/c it incorporates depth information.
POSVEL_T ddx = x1 - x2, ddy = y1 - y2, ddz = z1 - z2;
POSVEL_T dh2 = ddx*ddx + ddy*ddy + ddz*ddz;
if (dh2 > dtt2) {
looksBig = true;
break;
}
}
}
}
if (!looksBig) {
if (dist2 > fsrrmax2) {
// We could interact with this node, but it is too far away to make
// any difference, so it will be skipped, along with all of its
// children.
continue;
}
// This node has fewer particles than pseudo particles, so just use the
// particles that are actually there.
if (tree_[tln].count <= TDPTS) {
ID_T offn = tree_[tln].offset;
ID_T cntn = tree_[tln].count;
int start = SIZE;
SIZE = SIZE + cntn;
assert( SIZE < VMAX );
for ( int i = 0; i < cntn; ++i) {
nx[start + i] = xx[offn + i];
ny[start + i] = yy[offn + i];
nz[start + i] = zz[offn + i];
nm[start + i] = mass[offn + i];
}
continue;
}
// Interact the particles in this node with the pseudoparticles of the
// other node.
int start = SIZE;
SIZE = SIZE + TDPTS;
assert( SIZE < VMAX );
pppts<TDPTS>(tree_[tln].tdr, tree_[tln].xc, &nx[start], &ny[start], &nz[start]);
for ( int i = 0; i < TDPTS; ++i) {
nm[start + i] = tree_[tln].ppm[i];
}
continue;
} else if (tree_[tln].cr == 0 && tree_[tln].cl == 0) {
// This is a leaf node with which we must interact.
ID_T offn = tree_[tln].offset;
ID_T cntn = tree_[tln].count;
int start = SIZE;
SIZE = SIZE + cntn;
assert( SIZE < VMAX );
for ( int i = 0; i < cntn; ++i) {
nx[start + i] = xx[offn + i];
ny[start + i] = yy[offn + i];
nz[start + i] = zz[offn + i];
nm[start + i] = mass[offn + i];
}
continue;
}
// This other node is not a leaf, but has too large an opening angle
// for an approx. interaction: queue its children.
ID_T tlncr = tree_[tln].cr;
ID_T tlncl = tree_[tln].cl;
if (tlncl > 0 && tree_[tlncl].count > 0) {
bool close = true;
for (int i = 0; i < DIMENSION; ++i) {
POSVEL_T dist = 0;
if (tree_[tl].xmax[i] < tree_[tlncl].xmin[i]) {
dist = tree_[tlncl].xmin[i] - tree_[tl].xmax[i];
} else if (tree_[tl].xmin[i] > tree_[tlncl].xmax[i]) {
dist = tree_[tl].xmin[i] - tree_[tlncl].xmax[i];
}
if (dist > fsrrmax) {
close = false;
break;
}
}
if (close) q.push_back(tlncl);
}
if (tlncr > 0 && tree_[tlncr].count > 0) {
bool close = true;
for (int i = 0; i < DIMENSION; ++i) {
POSVEL_T dist = 0;
if (tree_[tl].xmax[i] < tree_[tlncr].xmin[i]) {
dist = tree_[tlncr].xmin[i] - tree_[tl].xmax[i];
} else if (tree_[tl].xmin[i] > tree_[tlncr].xmax[i]) {
dist = tree_[tl].xmin[i] - tree_[tlncr].xmax[i];
}
if (dist > fsrrmax) {
close = false;
break;
}
}
if (close) q.push_back(tlncr);
}
}
ID_T off = tree_[tl].offset;
ID_T cnt = tree_[tl].count;
// Add self interactions...
int start = SIZE;
SIZE = SIZE + cnt;
assert( SIZE < VMAX );
for ( int i = 0; i < cnt; ++i) {
nx[start + i] = xx[off + i];
ny[start + i] = yy[off + i];
nz[start + i] = zz[off + i];
nm[start + i] = mass[off + i];
}
#ifdef __HIPCC__
hipMemcpyAsync(d_nx,nx,sizeof(POSVEL_T)*SIZE,hipMemcpyHostToDevice,stream);
hipMemcpyAsync(d_ny,ny,sizeof(POSVEL_T)*SIZE,hipMemcpyHostToDevice,stream);
hipMemcpyAsync(d_nz,nz,sizeof(POSVEL_T)*SIZE,hipMemcpyHostToDevice,stream);
hipMemcpyAsync(d_nm,nm,sizeof(POSVEL_T)*SIZE,hipMemcpyHostToDevice,stream);
hipEventRecord(event,stream); //mark when transfers have finished
cudaCheckError();
hipDeviceSynchronize();
hipMemcpyAsync(d_xxl,xx+off,sizeof(POSVEL_T)*cnt,hipMemcpyHostToDevice,stream);
hipMemcpyAsync(d_yyl,yy+off,sizeof(POSVEL_T)*cnt,hipMemcpyHostToDevice,stream);
hipMemcpyAsync(d_zzl,zz+off,sizeof(POSVEL_T)*cnt,hipMemcpyHostToDevice,stream);
hipMemcpyAsync(d_massl,mass+off,sizeof(POSVEL_T)*cnt,hipMemcpyHostToDevice,stream);
hipMemcpyAsync(d_vxl,vx+off,sizeof(POSVEL_T)*cnt,hipMemcpyHostToDevice,stream);
hipMemcpyAsync(d_vyl,vy+off,sizeof(POSVEL_T)*cnt,hipMemcpyHostToDevice,stream);
hipMemcpyAsync(d_vzl,vz+off,sizeof(POSVEL_T)*cnt,hipMemcpyHostToDevice,stream);
cudaCheckError();
hipDeviceSynchronize();
#endif
// Process the interaction list...
#ifdef __HIPCC__
::nbody1(cnt, SIZE, d_xxl, d_yyl, d_zzl, d_massl, d_nx, d_ny, d_nz, d_nm, d_vxl, d_vyl, d_vzl, m_fl, m_fcoeff, fsrrmax, rsm, stream);
hipDeviceSynchronize();
#else
::nbody1(cnt, SIZE, xx + off, yy + off, zz + off, mass + off, nx, ny, nz, nm, vx + off, vy + off, vz + off, m_fl, m_fcoeff, fsrrmax, rsm);
#endif
#ifdef __HIPCC__
//transfer the updated vx, vy, vz back to the host
hipMemcpyAsync(vx+off,d_vxl,sizeof(POSVEL_T)*cnt,hipMemcpyDeviceToHost,stream);
hipMemcpyAsync(vy+off,d_vyl,sizeof(POSVEL_T)*cnt,hipMemcpyDeviceToHost,stream);
hipMemcpyAsync(vz+off,d_vzl,sizeof(POSVEL_T)*cnt,hipMemcpyDeviceToHost,stream);
cudaCheckError();
hipDeviceSynchronize();
#endif
}
// Iterate through the tree nodes; for each leaf node, start a task.
// That task iterates through the tree nodes, skipping any node (and all
// of its children) if all corners are too far away. Then it compares the
// opening angle.
template <int TDPTS>
void RCBForceTree<TDPTS>::calcInternodeForces()
{
std::vector<ID_T> q(1, 0);
std::vector<ID_T> parents;
while (!q.empty()) {
ID_T tl = q.back();
if (tree[tl].cr == 0 && tree[tl].cl == 0) {
// This is a leaf node.
q.pop_back();
bool inside = true;
for (int i = 0; i < DIMENSION; ++i) {
inside &= (tree[tl].xmax[i] < maxForceRange[i] && tree[tl].xmax[i] > minForceRange[i]) ||
(tree[tl].xmin[i] < maxForceRange[i] && tree[tl].xmin[i] > minForceRange[i]);
}
if (inside) {
calcInternodeForce(tl, parents);
}
} else if (parents.size() > 0 && parents.back() == tl) {
// This is the second time here; we're done with all the children.
parents.pop_back();
q.pop_back();
} else {
// This is the first time at this parent node, queue the children.
if (tree[tl].cl > 0) q.push_back(tree[tl].cl);
if (tree[tl].cr > 0) q.push_back(tree[tl].cr);
parents.push_back(tl);
}
}
}
// Explicit template instantiation...
template class RCBForceTree<QUADRUPOLE_TDPTS>;
template class RCBForceTree<MONOPOLE_TDPTS>;