resources: Add halo-finder code from HACC benchmark

halo-finder is a GPU-accelerated component of the HACC benchmark.
The GPU code can be benchmarked using ForceTreeTest

Change-Id: I584194ff43363d18c9237d72314f63e0bb0a6e72
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5-resources/+/41193
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Reviewed-by: Bobby R. Bruce <bbruce@ucdavis.edu>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Bobby R. Bruce <bbruce@ucdavis.edu>
Tested-by: Bobby R. Bruce <bbruce@ucdavis.edu>
diff --git a/README.md b/README.md
index 756e6ae..48434eb 100644
--- a/README.md
+++ b/README.md
@@ -524,6 +524,51 @@
 
 <http://dist.gem5.org/dist/develop/test-progs/lulesh/lulesh>
 
+# Resource: halo-finder (HACC)
+
+[HACC](https://asc.llnl.gov/coral-2-benchmarks) is a DoE application designed to simulate the
+evolution of the universe by simulating the formation of structure in collisionless fluids
+under the influence of gravity. The halo-finder code can be GPU accelerated by using
+the code in `RCBForceTree.cxx`.
+
+`src/halo-finder/src` contains the code required to build and run ForceTreeTest from `src/halo_finder` in the main HACC codebase.
+`src/halo-finder/src/dfft` contains the dfft code from `src/dfft` in the main HACC codebase.
+
+## Compilation and Running
+
+halo-finder requires certain libraries that aren't installed by default in the
+GCN3 docker container provided by gem5, and requires the environment to be configured
+properly in order to build. We provide a Dockerfile that installs those libraries and
+sets up the environment.
+
+In order to test the GPU code in halo-finder, we compile and run ForceTreeTest.
+
+To build the Docker image and the benchmark:
+```
+cd src/halo-finder
+docker build -t <image_name> .
+docker run --rm -v ${PWD}:${PWD} -w ${PWD}/src -u $UID:$GID <image_name> make hip/ForceTreeTest
+```
+
+The binary is built for gfx801 by default and is placed at `src/halo-finder/src/hip/ForceTreeTest`.
+
+ForceTreeTest is a GPU application, which requires that gem5 is built with the GCN3_X86 architecture.
+To build GCN3_X86:
+```
+# Working directory is your gem5 directory
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID <image_name> scons -sQ -j$(nproc) build/GCN3_X86/gem5.opt
+```
+
+To run ForceTreeTest:
+```
+# Assuming gem5 and gem5-resources are in the working directory
+docker run --rm -v $PWD:$PWD -w $PWD -u $UID:$GID <image_name> gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n3 --benchmark-root=gem5-resources/src/halo-finder/src/hip -cForceTreeTest --options="0.5 0.1 64 0.1 1 N 12 rcb"
+```
+
+## Pre-built binary
+
+<http://dist.gem5.org/dist/develop/test-progs/halo-finder/ForceTreeTest>
+
 # Resource: SPEC 2006
 
 The [Standard Performance Evaluation Corporation](
@@ -660,6 +705,8 @@
 'src/hip-samples/src'
 * **heterosync**: Consult `src/heterosync/LICENSE.txt`
 * **lulesh**: Consult the copyright notice in `src/lulesh/src/lulesh.hip.cc`
+* **halo-finder**: halo-finder is a subcomponent of HACC, which is licensed under
+a BSD license.
 * **spec 2006**: SPEC CPU 2006 requires purchase of benchmark suite from
 [SPEC](https://www.spec.org/cpu2006/) thus, it cannot be freely distributed.
 Consult individual copyright notices of source files in `src/spec-2006`.
diff --git a/src/halo-finder/.gitignore b/src/halo-finder/.gitignore
new file mode 100644
index 0000000..920c932
--- /dev/null
+++ b/src/halo-finder/.gitignore
@@ -0,0 +1 @@
+src/hip/
diff --git a/src/halo-finder/Dockerfile b/src/halo-finder/Dockerfile
new file mode 100644
index 0000000..f02483e
--- /dev/null
+++ b/src/halo-finder/Dockerfile
@@ -0,0 +1,35 @@
+FROM gcr.io/gem5-test/gcn-gpu
+RUN apt-get update && apt-get -y install libopenmpi-dev libomp-dev
+
+ENV HIPCC_BIN=/opt/rocm/bin
+ENV MPI_INCLUDE=/usr/lib/openmpi/include
+
+ENV OPT="-O3 -g -DRCB_UNTHREADED_BUILD -DUSE_SERIAL_COSMO"
+ENV OMP="-fopenmp"
+
+# Flags shared by every hcc/hipcc invocation configured below (C, C++ and the MPI variants).
+ENV HIPCC_FLAGS="-v -I${MPI_INCLUDE} -I/opt/rocm/hip/include -I/opt/rocm/hcc-1.0/include"
+
+ENV HACC_PLATFORM="hip"
+ENV HACC_OBJDIR="${HACC_PLATFORM}"
+
+ENV HACC_CFLAGS="$OPT $OMP $HIPCC_FLAGS"
+ENV HACC_CC="${HIPCC_BIN}/hcc -x c -Xclang -std=c99"
+
+ENV HACC_CXXFLAGS="$OPT $OMP $HIPCC_FLAGS"
+ENV HACC_CXX="${HIPCC_BIN}/hipcc -Xclang"
+
+ENV HACC_LDFLAGS="-lm -lrt"
+
+# USE_SERIAL_COSMO must be set to avoid building the code with MPI, which isn't
+# supported on the GPU model in gem5.
+ENV USE_SERIAL_COSMO="1"
+ENV HACC_NUM_CUDA_DEV="1"
+ENV HACC_MPI_CFLAGS="$OPT $OMP $HIPCC_FLAGS"
+ENV HACC_MPI_CC="${HIPCC_BIN}/hcc -x c -Xclang -std=c99 -Xclang -pthread"
+
+ENV HACC_MPI_CXXFLAGS="$OPT $OMP $HIPCC_FLAGS"
+ENV HACC_MPI_CXX="${HIPCC_BIN}/hipcc -Xclang -pthread"
+ENV HACC_MPI_LD="${HIPCC_BIN}/hipcc -Xclang -pthread"
+
+ENV HACC_MPI_LDFLAGS="-lm -lrt"
diff --git a/src/halo-finder/src/BGQCM.c b/src/halo-finder/src/BGQCM.c
new file mode 100644
index 0000000..7143ded
--- /dev/null
+++ b/src/halo-finder/src/BGQCM.c
@@ -0,0 +1,215 @@
+/*
+BG/Q tuned version of HACC: 69.2% of peak performance on 96 racks of Sequoia
+Argonne Leadership Computing Facility, Argonne, IL 60439
+Vitali Morozov (morozov@anl.gov)
+*/
+
+//#undef __bgq__
+
+#ifdef __bgq__
+
+//#include </soft/compilers/ibmcmp-feb2012/vacpp/bg/12.1/include/builtins.h>
+#include IBMCMP_BUILTINS
+
+int isAligned(void *in);
+
+void cm( int count, float *xx, float *yy, float *zz, float *mass, float *xmin, float *xmax, float *xc)
+{
+  // Compute the per-axis bounding box (xmin/xmax) and the mass-weighted centre (xc) of
+  // `count` particles; the bounds are set from the actual particle content.
+  // Element 0 seeds the bounds, so callers must pass count >= 1.
+  double w;
+  double xa, ya, za, ma;
+  int i, j, k;
+
+  float x1, x2, y1, y2, z1, z2;
+
+  vector4double xv, yv, zv, wv, dv0, dv1, dv2, dv3, dv4, dv5;
+  vector4double xi0, xi1, yi0, yi1, zi0, zi1;
+  vector4double xs, ys, zs, ms;
+
+  int ALXX;
+
+  // Only xx is probed; yy, zz and mass are loaded with the same byte offsets below.
+  // NOTE(review): presumably all four arrays share xx's alignment -- confirm with callers.
+  ALXX = isAligned( (void *)xx );
+
+  // i = number of leading elements left to the scalar prologue: 1, 2 or 3 when xx is only
+  // 4- or 8-byte aligned, chosen so that &xx[i] lands on a 16-byte boundary; 0 otherwise.
+  i = 0; j = 0; k = 0;
+
+  if ( ( ALXX == 4 ) && ( isAligned( (void *)&xx[1]) == 8  ) ) i = 3;
+  if ( ( ALXX == 4 ) && ( isAligned( (void *)&xx[1]) >= 16 ) ) i = 1;
+  if ( ( ALXX == 8 ) ) i = 2;
+
+  ma = 0.; xa = 0.; ya = 0.; za = 0.;
+
+  x1 = xx[0]; x2 = xx[0];
+  y1 = yy[0]; y2 = yy[0];
+  z1 = zz[0]; z2 = zz[0];
+  // Scalar prologue; the "k < count" bound stops short arrays from being read past their end.
+  for ( k = 0; k < i && k < count; k++ )
+  {
+    if ( x1 > xx[k] ) x1 = xx[k];
+    if ( x2 < xx[k] ) x2 = xx[k];
+    if ( y1 > yy[k] ) y1 = yy[k];
+    if ( y2 < yy[k] ) y2 = yy[k];
+    if ( z1 > zz[k] ) z1 = zz[k];
+    if ( z2 < zz[k] ) z2 = zz[k];
+
+    w = mass[k];
+    xa = xa + w * xx[k];
+    ya = ya + w * yy[k];
+    za = za + w * zz[k];
+    ma = ma + w;
+  }
+  // Copy the running bounds into every lane of the min/max vectors.
+  xi0 = vec_splats( (double)x1 );
+  xi1 = vec_splats( (double)x2 );
+  yi0 = vec_splats( (double)y1 );
+  yi1 = vec_splats( (double)y2 );
+  zi0 = vec_splats( (double)z1 );
+  zi1 = vec_splats( (double)z2 );
+
+  xs = vec_splats( 0. );
+  ys = vec_splats( 0. );
+  zs = vec_splats( 0. );
+  ms = vec_splats( 0. );
+  // Vector loop: four particles per iteration; j is the byte offset of element i (4 bytes per float).
+  for ( i = k, j = k * 4; i < count-3; i = i + 4, j = j + 16 )
+  {
+    xv = vec_lda( j, xx );
+    yv = vec_lda( j, yy );
+    zv = vec_lda( j, zz );
+    wv = vec_lda( j, mass );
+
+    dv0 = vec_cmpgt( xi0, xv );
+    dv1 = vec_cmplt( xi1, xv );
+    dv2 = vec_cmpgt( yi0, yv );
+    dv3 = vec_cmplt( yi1, yv );
+    dv4 = vec_cmpgt( zi0, zv );
+    dv5 = vec_cmplt( zi1, zv );
+
+    xi0 = vec_sel( xi0, xv, dv0 );
+    xi1 = vec_sel( xi1, xv, dv1 );
+    yi0 = vec_sel( yi0, yv, dv2 );
+    yi1 = vec_sel( yi1, yv, dv3 );
+    zi0 = vec_sel( zi0, zv, dv4 );
+    zi1 = vec_sel( zi1, zv, dv5 );
+
+    xs = vec_madd( wv, xv, xs );
+    ys = vec_madd( wv, yv, ys );
+    zs = vec_madd( wv, zv, zs );
+    ms = vec_add( ms, wv );
+  }
+  // Fold the four lanes of each vector back into the scalar bounds and sums.
+  if ( i > 0 )
+  {
+      if ( x1 > xi0[0] ) x1 = xi0[0];
+      if ( x1 > xi0[1] ) x1 = xi0[1];
+      if ( x1 > xi0[2] ) x1 = xi0[2];
+      if ( x1 > xi0[3] ) x1 = xi0[3];
+
+      if ( x2 < xi1[0] ) x2 = xi1[0];
+      if ( x2 < xi1[1] ) x2 = xi1[1];
+      if ( x2 < xi1[2] ) x2 = xi1[2];
+      if ( x2 < xi1[3] ) x2 = xi1[3];
+
+      if ( y1 > yi0[0] ) y1 = yi0[0];
+      if ( y1 > yi0[1] ) y1 = yi0[1];
+      if ( y1 > yi0[2] ) y1 = yi0[2];
+      if ( y1 > yi0[3] ) y1 = yi0[3];
+
+      if ( y2 < yi1[0] ) y2 = yi1[0];
+      if ( y2 < yi1[1] ) y2 = yi1[1];
+      if ( y2 < yi1[2] ) y2 = yi1[2];
+      if ( y2 < yi1[3] ) y2 = yi1[3];
+
+      if ( z1 > zi0[0] ) z1 = zi0[0];
+      if ( z1 > zi0[1] ) z1 = zi0[1];
+      if ( z1 > zi0[2] ) z1 = zi0[2];
+      if ( z1 > zi0[3] ) z1 = zi0[3];
+
+      if ( z2 < zi1[0] ) z2 = zi1[0];
+      if ( z2 < zi1[1] ) z2 = zi1[1];
+      if ( z2 < zi1[2] ) z2 = zi1[2];
+      if ( z2 < zi1[3] ) z2 = zi1[3];
+
+      xa = xa + ( xs[0] + xs[1] + xs[2] + xs[3] );
+      ya = ya + ( ys[0] + ys[1] + ys[2] + ys[3] );
+      za = za + ( zs[0] + zs[1] + zs[2] + zs[3] );
+      ma = ma + ( ms[0] + ms[1] + ms[2] + ms[3] );
+  }
+  // Scalar epilogue for the elements the vector loop did not cover.
+  for ( k = i; k < count; k++ )
+  {
+    if ( x1 > xx[k] ) x1 = xx[k];
+    if ( x2 < xx[k] ) x2 = xx[k];
+    if ( y1 > yy[k] ) y1 = yy[k];
+    if ( y2 < yy[k] ) y2 = yy[k];
+    if ( z1 > zz[k] ) z1 = zz[k];
+    if ( z2 < zz[k] ) z2 = zz[k];
+
+    w = mass[k];
+    xa = xa + w * xx[k];
+    ya = ya + w * yy[k];
+    za = za + w * zz[k];
+    ma = ma + w;
+  }
+
+  xmin[0] = x1; xmax[0] = x2;
+  xmin[1] = y1; xmax[1] = y2;
+  xmin[2] = z1; xmax[2] = z2;
+
+  xc[0] = (float) ( xa / ma);
+  xc[1] = (float) ( ya / ma);
+  xc[2] = (float) ( za / ma);
+
+  return;
+}
+
+#else
+
+#include <math.h>
+
+/*
+static inline void cm(ID_T count, const POSVEL_T* __restrict xx, const POSVEL_T* __restrict yy,
+                      const POSVEL_T* __restrict zz, const POSVEL_T* __restrict mass,
+                      POSVEL_T* __restrict xmin, POSVEL_T* __restrict xmax, POSVEL_T* __restrict xc) 
+*/
+
+void cm( int count, float *xx, float *yy, float *zz, float *mass, float *xmin, float *xmax, float *xc)
+{
+  // Portable (non-BG/Q) path. xmin/xmax arrive as a conservative bounding box and
+  // are tightened here to the extent of the particles themselves; xc receives the
+  // mass-weighted mean position. Nothing is written to xmin/xmax when count <= 0.
+  float *coord[3] = { xx, yy, zz };
+  double moment[3] = { 0, 0, 0 };
+  double total = 0;
+  int p, d;
+
+  for (p = 0; p < count; p++) {
+    // Bounds first: particle 0 seeds them, later particles widen them.
+    for (d = 0; d < 3; d++) {
+      const float v = coord[d][p];
+      if (p > 0) {
+        xmin[d] = fminf(xmin[d], v);
+        xmax[d] = fmaxf(xmax[d], v);
+      } else {
+        xmin[d] = xmax[d] = v;
+      }
+    }
+
+    // Then the weighted sums: float product, accumulated into a double.
+    const float w = mass[p];
+    for (d = 0; d < 3; d++)
+      moment[d] += w * coord[d][p];
+    total += w;
+  }
+
+  for (d = 0; d < 3; d++)
+    xc[d] = (float) (moment[d] / total);
+}
+
+#endif
+
diff --git a/src/halo-finder/src/BGQStep16.c b/src/halo-finder/src/BGQStep16.c
new file mode 100644
index 0000000..3a3dff8
--- /dev/null
+++ b/src/halo-finder/src/BGQStep16.c
@@ -0,0 +1,211 @@
+/*
+BG/Q tuned version of short force evaluation kernel: 81% of peak performance
+Argonne Leadership Computing Facility, Argonne, IL 60439
+Vitali Morozov (morozov@anl.gov)
+*/
+
+
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>
+//#include <spi/include/l1p/sprefetch.h>
+//#include </opt/ibmcmp/vacpp/bg/12.1/include/builtins.h>
+//#include </soft/compilers/ibmcmp-may2012/vacpp/bg/12.1/include/builtins.h>
+
+//#undef __bgq__
+
+#ifdef __bgq__
+
+#include IBMCMP_BUILTINS
+
+int isAligned( void *in );
+
+void Step16_int( int count1, float xxi, float yyi, float zzi, float fsrrmax2, float mp_rsm2, float *xx1, float *yy1, float *zz1, float *mass1, float *ax, float *ay, float *az )
+{
+    int i = 0, j, k;
+    const int offset = 32; 
+    // Sums m*f(r2)*(dx,dy,dz) over count1 particles relative to (xxi,yyi,zzi) and stores the result in *ax,*ay,*az.
+    vector4double b0, b1, b2, b3, b4, b5, b6;
+    vector4double c0, c1, c2, c3, c4, c5, c6;
+    vector4double a0 = (vector4double){ 0.5, 0.5, 0.5, 0.5 };
+    vector4double a1 = (vector4double){ (double)xxi, (double)xxi, (double)xxi, (double)xxi };
+    vector4double a2 = (vector4double){ (double)yyi, (double)yyi, (double)yyi, (double)yyi };
+    vector4double a3 = (vector4double){ (double)zzi, (double)zzi, (double)zzi, (double)zzi };
+    vector4double a4 = (vector4double){ (double)fsrrmax2, (double)fsrrmax2, (double)fsrrmax2, (double)fsrrmax2 };
+    vector4double a5 = (vector4double){ (double)mp_rsm2, (double)mp_rsm2, (double)mp_rsm2, (double)mp_rsm2 };
+    vector4double a6 = (vector4double){ 0., 0., 0., 0. };
+    vector4double a7 = (vector4double){ 0., 0., 0., 0. };
+    vector4double a8 = (vector4double){ 0., 0., 0., 0. };
+    vector4double a9 = (vector4double){ 0., 0., 0., 0. };
+    vector4double a10 = (vector4double){ 0.269327, 0.269327, 0.269327, 0.269327 };
+    vector4double a11 = (vector4double){ -0.0750978, -0.0750978, -0.0750978, -0.0750978 };
+    vector4double a12 = (vector4double){ 0.0114808, 0.0114808, 0.0114808, 0.0114808 };
+    vector4double a13 = (vector4double){ -0.00109313, -0.00109313, -0.00109313, -0.00109313 };
+    vector4double a14 = (vector4double){ 0.0000605491, 0.0000605491, 0.0000605491, 0.0000605491 };
+    vector4double a15 = (vector4double){ -0.00000147177, -0.00000147177, -0.00000147177, -0.00000147177 };
+    // a0 = 0.5, a1-a3 = target position, a4 = fsrrmax2, a5 = mp_rsm2, a6 = zero, a7-a9 = x/y/z sums, a10-a15 = polynomial coefficients.
+    /*
+    int32_t depth = 3;
+    L1P_SetStreamPolicy( L1P_stream_confirmed );
+    L1P_SetStreamDepth(depth);
+    */
+    // Vector loop: eight particles per iteration as two four-lane vectors (b*, c*); j is the byte offset of element i.
+    for ( i = 0, j = 0; i < count1-7; i = i + 8, j = j + 32 ) 
+    { 
+        // Cache-touch hints for the data `offset` elements ahead of the current position.
+        __dcbt( (void *)&xx1  [i+offset] );
+        __dcbt( (void *)&yy1  [i+offset] );
+        __dcbt( (void *)&zz1  [i+offset] );
+        __dcbt( (void *)&mass1[i+offset] );
+
+        // Load the x, y and z coordinates of the eight particles.
+        b0 = vec_ld( j   , xx1 );
+        c0 = vec_ld( j+16, xx1 );
+
+        b1 = vec_ld( j   , yy1 );
+        c1 = vec_ld( j+16, yy1 );
+
+        b2 = vec_ld( j   , zz1 );
+        c2 = vec_ld( j+16, zz1 );
+        // (b3,b4,b5) / (c3,c4,c5) = particle position - (xxi, yyi, zzi)
+        b3 = vec_sub( b0, a1 );
+        c3 = vec_sub( c0, a1 );
+
+        b4 = vec_sub( b1, a2 );
+        c4 = vec_sub( c1, a2 );
+
+        b5 = vec_sub( b2, a3 );
+        c5 = vec_sub( c2, a3 );
+        // r2 = dx*dx + dy*dy + dz*dz; a copy is kept in b6/c6 for later use
+        b0 = vec_madd( b3, b3, a6 );
+        c0 = vec_madd( c3, c3, a6 );
+
+        b0 = vec_madd( b4, b4, b0 );
+        c0 = vec_madd( c4, c4, c0 );
+
+        b6 = vec_madd( b5, b5, b0 );
+        c6 = vec_madd( c5, c5, c0 );
+
+        b0 = vec_madd( b5, b5, b0 );
+        c0 = vec_madd( c5, c5, c0 );
+        // s = r2 + mp_rsm2
+        b0 = vec_add( b0, a5 );
+        c0 = vec_add( c0, a5 );
+        // s^3, built as (s*s)*s
+        b1 = vec_madd( b0, b0, a6 );
+        c1 = vec_madd( c0, c0, a6 );
+
+        b0 = vec_madd( b1, b0, a6 );
+        c0 = vec_madd( c1, c0, a6 );
+        // e = estimate of 1/sqrt(s^3), i.e. s^-1.5 (the scalar tail below uses pow(..., -1.5))
+        b1 = vec_rsqrte( b0 );
+        c1 = vec_rsqrte( c0 );
+        // Refine the estimate: e * (1 + 0.5*(1 - s^3*e*e)) -- NOTE(review): assumes vec_nmsub(a,b,c) = c - a*b
+        b2 = vec_madd( b1, b1, a6 );
+        c2 = vec_madd( c1, c1, a6 );
+
+        b0 = vec_madd( b0, b2, a6 );
+        c0 = vec_madd( c0, c2, a6 );
+
+        b0 = vec_nmsub( a0, b0, a0 );
+        c0 = vec_nmsub( a0, c0, a0 );
+
+        b0 = vec_madd( b1, b0, b1 );
+        c0 = vec_madd( c1, c0, c1 );
+        // Polynomial in r2 by Horner's rule: a10 + r2*(a11 + r2*(a12 + r2*(a13 + r2*(a14 + r2*a15))))
+        b1 = vec_madd( b6, a15, a14 );
+        c1 = vec_madd( c6, a15, a14 );
+
+        b1 = vec_madd( b6, b1, a13 );
+        c1 = vec_madd( c6, c1, a13 );
+
+        b1 = vec_madd( b6, b1, a12 );
+        c1 = vec_madd( c6, c1, a12 );
+
+        b1 = vec_madd( b6, b1, a11 );
+        c1 = vec_madd( c6, c1, a11 );
+
+        b1 = vec_madd( b6, b1, a10 );
+        c1 = vec_madd( c6, c1, a10 );
+        // f = s^-1.5 - polynomial
+        b0 = vec_sub( b0, b1 );
+        c0 = vec_sub( c0, c1 );
+        // Load the eight masses
+        b1 = vec_ld( j   , mass1 );
+        c1 = vec_ld( j+16, mass1 );
+        // Zero the mass outside the cutoff; NOTE(review): vector form of the scalar tail's "r2 < fsrrmax2" test, assuming vec_sel picks its 2nd operand on non-negative lanes
+        b2 = vec_sub( b6, a4 );
+        c2 = vec_sub( c6, a4 );
+
+        b1 = vec_sel( b1, a6, b2 );
+        c1 = vec_sel( c1, a6, c2 );
+        // m * f
+        b1 = vec_madd( b1, b0, a6 );
+        c1 = vec_madd( c1, c0, a6 );
+        // Zero the term where -r2 is non-negative (r2 == 0); the scalar tail's "r2 > 0" test
+        b2 = vec_sub( a6, b6 );
+        c2 = vec_sub( a6, c6 );
+
+        b0 = vec_sel( b1, a6, b2 );
+        c0 = vec_sel( c1, a6, c2 );
+        // Accumulate f*dx, f*dy and f*dz into a7, a8 and a9
+        a7 = vec_madd( b0, b3, a7 );
+        a7 = vec_madd( c0, c3, a7 );
+
+        a8 = vec_madd( b0, b4, a8 );
+        a8 = vec_madd( c0, c4, a8 );
+
+        a9 = vec_madd( b0, b5, a9 );
+        a9 = vec_madd( c0, c5, a9 );
+    }
+    // Add up the four lanes of each accumulator
+    *ax = ( a7[0] + a7[1] + a7[2] + a7[3] );
+    *ay = ( a8[0] + a8[1] + a8[2] + a8[3] );
+    *az = ( a9[0] + a9[1] + a9[2] + a9[3] );
+
+    // Scalar tail: the same computation for the elements the vector loop did not reach
+    const float ma0 = 0.269327, ma1 = -0.0750978, ma2 = 0.0114808, ma3 = -0.00109313, ma4 = 0.0000605491, ma5 = -0.00000147177;
+    float dxc, dyc, dzc, m, r2, f;
+
+    for ( k = i; k < count1; k++ ) 
+    {
+        dxc = xx1[k] - xxi;
+        dyc = yy1[k] - yyi;
+        dzc = zz1[k] - zzi;
+
+        r2 = dxc * dxc + dyc * dyc + dzc * dzc;
+
+        m = ( r2 < fsrrmax2 ) ? mass1[k] : 0.0f;
+
+        f =  pow( r2 + mp_rsm2, -1.5 ) - ( ma0 + r2*(ma1 + r2*(ma2 + r2*(ma3 + r2*(ma4 + r2*ma5)))));
+
+        f = ( r2 > 0.0f ) ? m * f : 0.0f;
+
+        *ax = *ax + f * dxc;
+        *ay = *ay + f * dyc;
+        *az = *az + f * dzc;
+    }
+
+}
+
+int isAligned(void *in){
+  // Returns the largest of 32, 16, 8 or 4 that the address is a multiple of, or -1
+  // when it is not even 4-byte aligned. Only the low five address bits matter, so
+  // they are read through the unsigned size_t instead of casting the pointer to a
+  // signed int (which truncates 64-bit pointers and draws compiler diagnostics).
+  const size_t low_bits = (size_t)in & (size_t)31;
+
+  if((low_bits & 31) == 0)
+    return 32;
+  if((low_bits & 15) == 0)
+    return 16;
+  if((low_bits & 7) == 0)
+    return 8;
+  if((low_bits & 3) == 0)
+    return 4;
+
+  return -1;
+}
+
+#endif
+
diff --git a/src/halo-finder/src/BHForceTree.cxx b/src/halo-finder/src/BHForceTree.cxx
new file mode 100644
index 0000000..b8c9c99
--- /dev/null
+++ b/src/halo-finder/src/BHForceTree.cxx
@@ -0,0 +1,2400 @@
+/*=========================================================================
+                                                                                
+Copyright (c) 2007, Los Alamos National Security, LLC
+
+All rights reserved.
+
+Copyright 2007. Los Alamos National Security, LLC. 
+This software was produced under U.S. Government contract DE-AC52-06NA25396 
+for Los Alamos National Laboratory (LANL), which is operated by 
+Los Alamos National Security, LLC for the U.S. Department of Energy. 
+The U.S. Government has rights to use, reproduce, and distribute this software. 
+NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,
+EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  
+If software is modified to produce derivative works, such modified software 
+should be clearly marked, so as not to confuse it with the version available 
+from LANL.
+ 
+Additionally, redistribution and use in source and binary forms, with or 
+without modification, are permitted provided that the following conditions 
+are met:
+-   Redistributions of source code must retain the above copyright notice, 
+    this list of conditions and the following disclaimer. 
+-   Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution. 
+-   Neither the name of Los Alamos National Security, LLC, Los Alamos National
+    Laboratory, LANL, the U.S. Government, nor the names of its contributors
+    may be used to endorse or promote products derived from this software 
+    without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR 
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+                                                                                
+=========================================================================*/
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <iomanip>
+#include <set>
+#include <math.h>
+
+#include <time.h>
+
+#include "Timings.h"
+#include "BHForceTree.h"
+
+using namespace std;
+
+/////////////////////////////////////////////////////////////////////////
+//
+// FParticle contains information about particles
+//
+/////////////////////////////////////////////////////////////////////////
+
+FParticle::FParticle()
+{
+  // -1 in every link field marks the particle as not yet threaded into the tree
+  // (threadBHForceTree assigns the links later); the force starts at zero.
+  parent = nextNode = sibling = -1;
+  force = 0.0;
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// FNode is a region of physical space divided into octants
+//
+/////////////////////////////////////////////////////////////////////////
+
+FNode::FNode(POSVEL_T* minLoc, POSVEL_T* maxLoc)
+{
+  // Node spanning [minLoc, maxLoc] on every axis, with all child slots empty (-1).
+  for (int axis = 0; axis < DIMENSION; ++axis) {
+    geoSide[axis] = maxLoc[axis] - minLoc[axis];
+    geoCenter[axis] = minLoc[axis] + geoSide[axis] * 0.5;
+  }
+  for (int slot = NUM_CHILDREN - 1; slot >= 0; --slot)
+    u.child[slot] = -1;
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// FNode constructed from an octant of a parent node
+//
+/////////////////////////////////////////////////////////////////////////
+
+FNode::FNode(FNode* parent, int oindx)
+{
+  // Bit of oindx that selects the upper half of the parent along X, Y and Z.
+  // Vary Z fastest when making octtree children
+  // If this changes must also change getChildIndex()
+  static const int upperHalfBit[3] = { 4, 2, 1 };
+
+  // A child octant is half as wide as its parent on every axis
+  for (int axis = 0; axis < DIMENSION; ++axis)
+    geoSide[axis] = parent->geoSide[axis] * 0.5;
+
+  // Its centre sits half a child side above or below the parent's centre
+  for (int axis = 0; axis < 3; ++axis) {
+    if (oindx & upperHalfBit[axis])
+      geoCenter[axis] = parent->geoCenter[axis] + geoSide[axis] * 0.5;
+    else
+      geoCenter[axis] = parent->geoCenter[axis] - geoSide[axis] * 0.5;
+  }
+
+  // No particle or sub-node occupies any slot yet
+  int slot = 0;
+  while (slot < NUM_CHILDREN) {
+    u.child[slot] = -1;
+    ++slot;
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Barnes Hut Tree
+//
+/////////////////////////////////////////////////////////////////////////
+
+BHForceTree::BHForceTree(
+			 POSVEL_T* minLoc,
+			 POSVEL_T* maxLoc,
+			 ID_T count,
+			 POSVEL_T* xLoc,
+			 POSVEL_T* yLoc,
+			 POSVEL_T* zLoc,
+			 POSVEL_T* xVel,
+			 POSVEL_T* yVel,
+			 POSVEL_T* zVel,
+			 POSVEL_T* ms,
+			 POSVEL_T avgMass)
+{
+  // Particle data stays in the caller's arrays; only the pointers are stored
+  particleCount = count;
+  nodeOffset = particleCount;
+  particleMass = avgMass;
+  mass = ms;
+  xx = xLoc;
+  yy = yLoc;
+  zz = zLoc;
+  vx = xVel;
+  vy = yVel;
+  vz = zVel;
+
+  // Record the bounds of this chaining mesh; boxSize is its extent along X
+  for (int axis = DIMENSION - 1; axis >= 0; --axis) {
+    minRange[axis] = minLoc[axis];
+    maxRange[axis] = maxLoc[axis];
+  }
+  boxSize = maxRange[0] - minRange[0];
+
+  // No force law supplied: use a ForceLawNewton with a coefficient of 1.0
+  m_fcoeff = 1.0;
+  m_fl = new ForceLawNewton();
+
+  // Create the recursive BH tree from the particle locations
+  createBHForceTree();
+
+  // Thread the recursive tree turning it into an iterative tree,
+  // starting at the root node, which has neither sibling nor parent
+  ID_T noSibling = -1;
+  ID_T noParent = -1;
+  ID_T lastVisited = -1;
+  POSVEL_T farthest = 0.0;
+  ID_T root = particleCount;
+  threadBHForceTree(root, noSibling, noParent, &lastVisited, &farthest);
+}
+
+BHForceTree::BHForceTree(
+			 POSVEL_T* minLoc,
+			 POSVEL_T* maxLoc,
+			 ID_T count,
+			 POSVEL_T* xLoc,
+			 POSVEL_T* yLoc,
+			 POSVEL_T* zLoc,
+			 POSVEL_T* xVel,
+			 POSVEL_T* yVel,
+			 POSVEL_T* zVel,
+			 POSVEL_T* ms,
+			 POSVEL_T avgMass,
+			 ForceLaw *fl,
+			 float fcoeff)
+{
+  // Particle data stays in the caller's arrays; only the pointers are stored
+  particleCount = count;
+  nodeOffset = particleCount;
+  particleMass = avgMass;
+  mass = ms;
+  xx = xLoc;
+  yy = yLoc;
+  zz = zLoc;
+  vx = xVel;
+  vy = yVel;
+  vz = zVel;
+
+  // Record the bounds of this chaining mesh; boxSize is its extent along X
+  for (int axis = DIMENSION - 1; axis >= 0; --axis) {
+    minRange[axis] = minLoc[axis];
+    maxRange[axis] = maxLoc[axis];
+  }
+  boxSize = maxRange[0] - minRange[0];
+
+  // Force law and coefficient come from the caller; fl is stored as given, not copied
+  m_fl = fl;
+  m_fcoeff = fcoeff;
+
+  // Build the recursive BH tree, then thread it into an iterative one from the root
+  createBHForceTree();
+
+  ID_T noSibling = -1;
+  ID_T noParent = -1;
+  ID_T lastVisited = -1;
+  POSVEL_T farthest = 0.0;
+  ID_T root = particleCount;
+  threadBHForceTree(root, noSibling, noParent, &lastVisited, &farthest);
+}
+
+BHForceTree::~BHForceTree()
+{
+  /* empty -- NOTE(review): the ForceLawNewton that the constructor without a ForceLaw argument allocates with new is not deleted here; confirm it is released elsewhere */
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Build the recursive BH tree by inserting the particles one at a time,
+// splitting an occupied octant into a new FNode when a second particle lands in it
+//
+/////////////////////////////////////////////////////////////////////////
+
+void BHForceTree::createBHForceTree()
+{
+  // Build the BH octree in fNode, one particle insertion at a time; first create the FParticles
+  this->fParticle.resize(this->particleCount);
+
+  // Reserve a basic amount of space in the BH tree
+  this->fNode.reserve(this->particleCount/NUM_CHILDREN);
+
+  // Create the root node of the BH tree
+  FNode root(this->minRange, this->maxRange);
+  this->fNode.push_back(root);
+  ID_T nodeIndex = 0;
+  // nodeIndex is the index of the most recently appended FNode (0 = root)
+  // Iterate on all particles placing them in the BH tree
+  // Child slots in the tree contain the index of the FParticle or
+  // the index of the FNode offset by the number of particles
+  // This is so we can use an integer instead of pointers to refer to objects
+  //
+  for (ID_T pindx = 0; pindx < this->particleCount; pindx++) {
+
+    // Start at root of tree for insertion of a new particle
+    // pindx is index into the halo particles where location is stored
+    // tindx is index into the BH tree nodes
+    // oindx is index into the octant of the tree node
+    ID_T tindx = 0;
+    int oindx = getChildIndex(&this->fNode[tindx], pindx);
+
+    while (this->fNode[tindx].u.child[oindx] != -1) {
+      // Child slot in tree contains another FNode so go there. Node slots hold
+      // node index + particleCount and the root (index 0) is never a child, hence '>'
+      if (this->fNode[tindx].u.child[oindx] > this->particleCount) {
+        tindx = this->fNode[tindx].u.child[oindx] - this->particleCount;
+        oindx = getChildIndex(&this->fNode[tindx], pindx);
+      }
+
+      // Otherwise there is a particle in the slot and we make a new FNode
+      else {
+
+        // Get the particle index of particle already in the node
+        ID_T pindx2 = this->fNode[tindx].u.child[oindx];
+
+        // First, check to make sure that this particle is not at the exact
+        // same location as the particle that is already there. If it is, then
+        // we'll add its mass to the existing particle (this modifies the
+        // caller's mass array) and leave this one out of the tree.
+        if (this->xx[pindx2] == this->xx[pindx] &&
+            this->yy[pindx2] == this->yy[pindx] &&
+            this->zz[pindx2] == this->zz[pindx]) {
+          this->mass[pindx2] += this->mass[pindx];
+          goto next_particle;
+        }
+
+
+        // Grow the node vector by a fixed chunk when it is full
+        if (this->fNode.capacity() == this->fNode.size()) {
+          this->fNode.reserve(this->fNode.capacity() 
+            + this->particleCount/NUM_CHILDREN);
+        }
+
+        FNode node(&this->fNode[tindx], oindx);
+        this->fNode.push_back(node);
+        nodeIndex++;
+        ID_T tindx2 = nodeIndex;
+
+        // Place the particle that was sitting there already into the new node
+        int oindx2 = getChildIndex(&this->fNode[tindx2], pindx2);
+        this->fNode[tindx2].u.child[oindx2] = pindx2;
+
+        // Add the new FNode to the BHTree
+        this->fNode[tindx].u.child[oindx] = tindx2 + this->particleCount;
+
+        // Set to new node
+        tindx = tindx2;
+        oindx = getChildIndex(&this->fNode[tindx], pindx);
+      }
+    }
+    // Place the current particle in the BH tree
+    this->fNode[tindx].u.child[oindx] = pindx;
+next_particle:;
+  }
+  this->nodeCount = this->fNode.size();
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Update the FNode vector by walking using a depth first recursion
+// Set parent and sibling indices which can replace the child[8] already
+// there, and supply extra information about center of mass, total particle
+// mass and particle radius which is the distance from the center of mass
+// to the furthest particle.
+//
+/////////////////////////////////////////////////////////////////////////
+
+void BHForceTree::threadBHForceTree(
+			ID_T curIndx,	     // Current node/particle
+			ID_T sibling,        // Sibling of current
+			ID_T parent,         // Parent of current
+			ID_T* lastIndx,      // Last node/particle
+                        POSVEL_T* radius)    // Needed to pass up partRadius
+{
+  // Set the next index in the threading for node or particle
+  // Particles and nodes are threaded together so all are touched in iteration
+  if (*lastIndx >= 0) {
+    if (*lastIndx >= this->nodeOffset)
+      this->fNode[(*lastIndx - this->nodeOffset)].u.n.nextNode = curIndx;
+    else
+      this->fParticle[*lastIndx].nextNode = curIndx;
+  }
+  *lastIndx = curIndx;
+ 
+  // FParticle saves the parent and sibling FNode id
+  if (curIndx < this->nodeOffset) {
+    this->fParticle[curIndx].parent = parent;
+    this->fParticle[curIndx].sibling = sibling;
+
+  // FNode recurses on each of the children
+  } else {
+    ID_T child[NUM_CHILDREN];
+    FNode* curNode = &this->fNode[curIndx - this->nodeOffset];
+
+    // Store mass and center of mass for each child node or particle
+    POSVEL_T childMass[NUM_CHILDREN];
+    POSVEL_T childRadius[NUM_CHILDREN];
+    POSVEL_T childCenter[NUM_CHILDREN][DIMENSION];
+    for (int j = 0; j < NUM_CHILDREN; j++) {
+      child[j] = curNode->u.child[j];
+      childMass[j] = 0.0;
+      childRadius[j] = 0.0;
+    }
+
+    ////////////////////////////////////////////////////////////////////////
+    //
+    // Recurse on each of the children, recording information on the way up
+    //
+    for (int j = 0; j < NUM_CHILDREN; j++) {
+
+      // Skip any children which contain neither a particle or node
+      ID_T childIndx, childIndxNext, nextSibling;
+      if ((childIndx = child[j]) >= 0) {
+
+        // Check for a sibling on this level or set to the next level up
+        int jj;
+        for (jj = j + 1; jj < NUM_CHILDREN; jj++)
+          if ((childIndxNext = child[jj]) >= 0)
+            break;
+        if (jj < NUM_CHILDREN)
+          nextSibling = childIndxNext;
+        else
+          nextSibling = -1;
+
+        // Recursion to child
+        // Since value of partRadius set in child is not necessarily the
+        // distance between center of mass and futhest child return it
+        threadBHForceTree(childIndx, nextSibling, curIndx, lastIndx, radius);
+
+        // Child is a node or a particle
+        if (childIndx >= this->nodeOffset) {
+
+          // FNode, gather mass and center of mass of all contained particles
+          FNode* childNode = &this->fNode[childIndx - this->nodeOffset];
+          childMass[j] = childNode->u.n.partMass;
+          childRadius[j] = *radius;
+          for (int dim = 0; dim < DIMENSION; dim++)
+            childCenter[j][dim] = childNode->u.n.partCenter[dim];
+
+        } else {
+          // FParticle, set mass and center of mass using particle location
+          childMass[j] = this->particleMass;
+          childRadius[j] = 0.0;
+          childCenter[j][0] = this->xx[childIndx];
+          childCenter[j][1] = this->yy[childIndx];
+          childCenter[j][2] = this->zz[childIndx];
+        }
+      }
+    }
+
+    ////////////////////////////////////////////////////////////////////////
+    //
+    // Finished processing all children, collect information for this node
+    //
+    curNode->u.n.partMass = 0.0;
+    for (int dim = 0; dim < DIMENSION; dim++)
+      curNode->u.n.partCenter[dim] = 0.0;
+
+    // Collect total mass and total center of mass for all children
+    for (int j = 0; j < NUM_CHILDREN; j++) {
+      if (childMass[j] > 0) {
+        curNode->u.n.partMass += childMass[j];
+        for (int dim = 0; dim < DIMENSION; dim++)
+          curNode->u.n.partCenter[dim] += 
+                                     childCenter[j][dim] * childMass[j];
+      }
+    }
+
+    // Calculate center of mass for current node
+    if (curNode->u.n.partMass > 0.0) {
+      for (int dim = 0; dim < DIMENSION; dim++)
+        curNode->u.n.partCenter[dim] /= curNode->u.n.partMass;
+    } else {
+      for (int dim = 0; dim < DIMENSION; dim++)
+        curNode->u.n.partCenter[dim] = curNode->geoCenter[dim];
+    }
+
+    // First method for calculating particle radius
+    // Calculate the radius from node center of mass to furthest node corner
+    POSVEL_T partRadius1 = distanceToFarCorner(
+                                     curNode->u.n.partCenter[0],
+                                     curNode->u.n.partCenter[1],
+                                     curNode->u.n.partCenter[2],
+                                     curNode);
+
+    // Second method for calculating particle radius
+    // Calculate the radius from center of mass to furthest child
+    POSVEL_T partRadius2 = 0.0;
+    for (int j = 0; j < NUM_CHILDREN; j++) {
+      if (childMass[j] > 0.0) {
+
+        // Calculate the distance between this center of mass and that of child
+        POSVEL_T dist = distanceToCenterOfMass(childCenter[j][0],
+                                               childCenter[j][1],
+                                               childCenter[j][2],
+                                               curNode);
+        // Add in the particle radius of the child to get furthest point
+        dist += childRadius[j];
+        if (dist > partRadius2)
+          partRadius2 = dist;
+      }
+    }
+    // Used by parent of this node
+    *radius = partRadius2;
+
+    // Save the smaller of the two particle radii
+    curNode->u.n.partRadius = min(partRadius1, partRadius2);
+
+    // Set threaded structure for this node
+    curNode->u.n.sibling = sibling;
+    curNode->u.n.parent = parent;
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Particle i has location vector x_i (xx_i, yy_i, zz_i)
+// Particle i has velocity vector v_i (vx_i, vy_i, vz_i)
+//
+// Node j has center of particle mass c_j (cx_j, cy_j, cz_j)
+// Node j has total particle mass of M_j
+// Node j has bounding radius of R_j
+//
+// Distance between particle i and node j is
+//   d_ji = fabs(c_j - x_i)
+//
+// Rule for updating
+//   v_i(t_1) = v_i(t_0) + alpha * Sum over j of f_ji(d_ji, Mj)
+//          where f_ji is the short range force over finite range r_f
+//          where alpha is some coefficient
+//          where Sum over j nodes is determined by a tree walk
+//
+// An opening angle is defined as
+//    theta_ji = (2 R_j) / d_ji
+//
+// This angle determines whether a node should be opened to a higher resolution
+// or whether it can be used as is because it is small enough or far enough away
+// This is determined by comparing to a passed in theta_0
+//
+// Three actions can occur for a node encountered on the walk
+//
+//   1. Node is too far away to contribute to force
+//      if d_ji - R_j > r_f
+//      or distance of x_i to nearest corner of node > r_f
+//
+//   2. Node is close enough to contribute so check the opening angle
+//      if theta_ji > theta_0 follow nextNode to open this node to children
+//
+//   3. Node is close enough and theta_ji < theta_0
+//      calculate f_ji(d_ji, Mj) and update v_i
+//      follow the sibling link and not the nextNode link
+//
+// Force is calculated for each particle i by
+//   Starting at the root node and walking the entire tree collecting force
+//   Starting at the particle and walking up parents until a criteria is met
+//
+/////////////////////////////////////////////////////////////////////////
+
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Brute force short range calculation over every pair of particles
+// Each pair is visited once.  When the squared separation is nonzero and
+// below critRadius squared, mass / r^2 of the partner is subtracted from
+// the running total of both members of the pair.
+// The totals are written to fParticle[].force at the end.
+//
+/////////////////////////////////////////////////////////////////////////
+
+void BHForceTree::treeForceN2(
+                    POSVEL_T critRadius)  // Radius of furthest point
+{
+  const POSVEL_T cutoff2 = critRadius * critRadius;
+
+  // One zero-initialized accumulator slot per particle
+  vector<POTENTIAL_T> accum(this->particleCount);
+
+  // Upper triangular sweep so that each pair (a, b) is evaluated once
+  for (int a = 0; a < this->particleCount; a++) {
+    for (int b = a + 1; b < this->particleCount; b++) {
+
+      const POSVEL_T sepX = (POSVEL_T) fabs(this->xx[a] - this->xx[b]);
+      const POSVEL_T sepY = (POSVEL_T) fabs(this->yy[a] - this->yy[b]);
+      const POSVEL_T sepZ = (POSVEL_T) fabs(this->zz[a] - this->zz[b]);
+      const POSVEL_T dist2 = sepX * sepX + sepY * sepY + sepZ * sepZ;
+
+      // Coincident pairs and pairs outside the cutoff contribute nothing
+      if (dist2 == 0.0 || !(dist2 < cutoff2))
+        continue;
+
+      accum[a] -= (this->mass[b] / dist2);
+      accum[b] -= (this->mass[a] / dist2);
+    }
+  }
+
+  // Publish the accumulated value of every particle
+  for (int a = 0; a < this->particleCount; a++)
+    this->fParticle[a].force = accum[a];
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Short range gravity calculation for groups of particles
+// Entry point for the group walk.  The particle count limits are turned
+// into mass limits by multiplying with particleMass and the recursive
+// walk (walkTreeGroup) is started at the root of the tree.  The walk
+// decides per node whether it is treated as a group, handled particle by
+// particle, or descended into.
+//
+/////////////////////////////////////////////////////////////////////////
+
+void BHForceTree::treeForceGroup(
+                    POSVEL_T bhAngle,     // Open node to examine children
+                    POSVEL_T critRadius,  // Accept or ignore node not opened
+                    int minGroup,         // Minimum particles in a group
+                    int maxGroup)         // Maximum particles in a group
+{
+  // Group limits expressed as mass rather than as a particle count
+  const POSVEL_T heaviest = maxGroup * this->particleMass;
+  const POSVEL_T lightest = minGroup * this->particleMass;
+
+  // The id of the root node is the particle count
+  const ID_T top = this->particleCount;
+
+  walkTreeGroup(top, lightest, heaviest, bhAngle, critRadius);
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Walk the tree in search of nodes which are less than the maximum
+// number of particles to constitute a group.  All particles in the group
+// will be treated together with the n^2 force calculated between members
+// of the group and then having an interaction list applied.  The group
+// may consist of other nodes and particles and so the recursive descent
+// will be needed to find all particles in the group.
+//
+// Handling per object:
+//   particle            interaction lists built for it, force evaluated
+//   node within limits  mass below maxMass and partRadius below half of
+//                       critRadius:
+//                         mass below minMass  -> every contained particle
+//                                                uses the top down method
+//                         otherwise           -> treated as one group
+//   any other node      recursion into the node
+//
+// All work lists have automatic storage so they are released on every
+// path out of the function.
+//
+/////////////////////////////////////////////////////////////////////////
+
+void BHForceTree::walkTreeGroup(
+                    ID_T curId,           // Particle or node to process
+                    POSVEL_T minMass,     // Minimum mass for a group
+                    POSVEL_T maxMass,     // Maximum mass for a group
+                    POSVEL_T bhAngle,     // Open node to examine children
+                    POSVEL_T critRadius)  // Accept or ignore node not opened
+{
+  if (curId < this->nodeOffset) {
+    // Current object is a particle
+    vector<ID_T> partInteract;
+    vector<ID_T> nodeInteract;
+
+    createParticleInteractList(curId, bhAngle, critRadius,
+                               &partInteract, &nodeInteract);
+    this->fParticle[curId].force = forceCalculation(
+                                      curId, &partInteract, &nodeInteract);
+    return;
+  }
+
+  // Current object is a node, visit its children along the sibling chain
+  ID_T child = this->fNode[curId - this->nodeOffset].u.n.nextNode;
+  while (child != -1) {
+
+    if (child < this->nodeOffset) {
+      // Child is a particle
+      vector<ID_T> partInteract;
+      vector<ID_T> nodeInteract;
+
+      createParticleInteractList(child, bhAngle, critRadius,
+                                 &partInteract, &nodeInteract);
+      this->fParticle[child].force = forceCalculation(
+                                        child, &partInteract, &nodeInteract);
+      child = this->fParticle[child].sibling;
+      continue;
+    }
+
+    // Child is a node
+    FNode* childNode = &this->fNode[child - this->nodeOffset];
+    const bool withinLimits =
+        childNode->u.n.partMass < maxMass &&
+        childNode->u.n.partRadius < (critRadius * 0.5);
+
+    if (!withinLimits) {
+      // Too heavy or too wide for a group, look at its children instead
+      walkTreeGroup(child, minMass, maxMass, bhAngle, critRadius);
+    }
+    else if (childNode->u.n.partMass < minMass) {
+      // If the group is too small it can't function as a group
+      // so run the topdown method on those particles
+      vector<ID_T> particles;
+      collectParticles(child, &particles);
+      int count = particles.size();
+
+      for (int i = 0; i < count; i++)
+        treeForceGadgetTopDown(particles[i], bhAngle, critRadius);
+    }
+    else {
+      // Proper group, one interaction list serves all of its particles
+      vector<ID_T> partInteract;
+      vector<ID_T> nodeInteract;
+
+      createNodeInteractList(child, bhAngle, critRadius,
+                             &partInteract, &nodeInteract);
+      forceCalculationGroup(child, bhAngle, critRadius,
+                            &partInteract, &nodeInteract);
+    }
+    child = this->fNode[child - this->nodeOffset].u.n.sibling;
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Create the interaction list for the particle starting at root
+//
+// The threaded tree is followed from the root.  Particles closer than
+// critRadius are appended to partInteract.  A node is
+//   ignored   if (distance to its center of mass - partRadius) or the
+//             distance to its nearest point exceeds critRadius
+//   opened    if 2 * partRadius > distance * bhAngle (follow nextNode)
+//   accepted  otherwise (appended to nodeInteract)
+// After ignoring or accepting, the walk continues at the sibling, or at
+// the sibling of the closest ancestor below the root which has one.
+//
+/////////////////////////////////////////////////////////////////////////
+
+void BHForceTree::createParticleInteractList(
+                    ID_T p,               // Particle to calculate force on
+                    POSVEL_T bhAngle,     // Open node to examine children
+                    POSVEL_T critRadius , // Accept or ignore node not opened
+                    vector<ID_T>* partInteract,
+                    vector<ID_T>* nodeInteract)
+{
+  const POSVEL_T px = this->xx[p];
+  const POSVEL_T py = this->yy[p];
+  const POSVEL_T pz = this->zz[p];
+
+  // Root node id equals the particle count, the walk ends on a negative id
+  const ID_T root = this->particleCount;
+
+  for (ID_T cursor = root; cursor >= 0; ) {
+
+    if (cursor < this->nodeOffset) {
+      // Particle
+      const POSVEL_T ox = this->xx[cursor] - px;
+      const POSVEL_T oy = this->yy[cursor] - py;
+      const POSVEL_T oz = this->zz[cursor] - pz;
+      const POSVEL_T sep = sqrt(ox * ox + oy * oy + oz * oz);
+
+      if (sep < critRadius)
+        partInteract->push_back(cursor);
+
+      cursor = this->fParticle[cursor].nextNode;
+      continue;
+    }
+
+    // Node
+    FNode* here = &this->fNode[cursor - this->nodeOffset];
+    const POSVEL_T extent = here->u.n.partRadius;
+    const POSVEL_T nearDist = distanceToNearestPoint(px, py, pz, here);
+
+    const POSVEL_T ox = here->u.n.partCenter[0] - px;
+    const POSVEL_T oy = here->u.n.partCenter[1] - py;
+    const POSVEL_T oz = here->u.n.partCenter[2] - pz;
+    const POSVEL_T sep = sqrt(ox * ox + oy * oy + oz * oz);
+
+    // Too far away when either distance measure exceeds critical radius
+    const bool outOfRange =
+        (sep - extent) > critRadius || nearDist > critRadius;
+
+    if (!outOfRange) {
+      if (2*extent > (sep * bhAngle)) {
+        // Open node
+        cursor = here->u.n.nextNode;
+        continue;
+      }
+      // Accept
+      nodeInteract->push_back(cursor);
+    }
+
+    // Ignored or accepted, either way move on to the sibling of this node
+    cursor = here->u.n.sibling;
+
+    // If there is no sibling go up a level until we find a node
+    ID_T up = here->u.n.parent;
+    while (cursor == -1 && up != -1 && up != root) {
+      FNode* above = &this->fNode[up - this->nodeOffset];
+      cursor = above->u.n.sibling;
+      up = above->u.n.parent;
+    }
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Create the interaction list for the node starting at root
+// must test for acceptance based on a radius from center of mass to
+// furthest particle to make sure it is most inclusive.
+// Make sure my definition of partRadius does this
+//
+// All distances are measured from the center of mass of the given node.
+// The given node itself is stepped over when the walk reaches it, so
+// neither it nor anything reached only through it is listed.  Other
+// nodes are ignored, opened or accepted with the same tests used for a
+// single particle.
+//
+/////////////////////////////////////////////////////////////////////////
+
+void BHForceTree::createNodeInteractList(
+                    ID_T node,
+                    POSVEL_T bhAngle,    // Open node to examine children
+                    POSVEL_T critRadius, // Accept or ignore node not opened
+                    vector<ID_T>* partInteract,
+                    vector<ID_T>* nodeInteract)
+{
+  FNode* target = &this->fNode[node - this->nodeOffset];
+  const POSVEL_T cx = target->u.n.partCenter[0];
+  const POSVEL_T cy = target->u.n.partCenter[1];
+  const POSVEL_T cz = target->u.n.partCenter[2];
+
+  // Root node id equals the particle count, the walk ends on a negative id
+  const ID_T root = this->particleCount;
+
+  for (ID_T cursor = root; cursor >= 0; ) {
+
+    if (cursor < this->nodeOffset) {
+      // Particle
+      const POSVEL_T ox = this->xx[cursor] - cx;
+      const POSVEL_T oy = this->yy[cursor] - cy;
+      const POSVEL_T oz = this->zz[cursor] - cz;
+      const POSVEL_T sep = sqrt(ox * ox + oy * oy + oz * oz);
+
+      if (sep < critRadius)
+        partInteract->push_back(cursor);
+
+      cursor = this->fParticle[cursor].nextNode;
+      continue;
+    }
+
+    // Node
+    FNode* visited = &this->fNode[cursor - this->nodeOffset];
+
+    // The node the list is built for is stepped over without any test
+    if (visited != target) {
+      const POSVEL_T extent = visited->u.n.partRadius;
+      const POSVEL_T nearDist = distanceToNearestPoint(
+                                  cx, cy, cz, visited);
+
+      const POSVEL_T ox = visited->u.n.partCenter[0] - cx;
+      const POSVEL_T oy = visited->u.n.partCenter[1] - cy;
+      const POSVEL_T oz = visited->u.n.partCenter[2] - cz;
+      const POSVEL_T sep = sqrt(ox * ox + oy * oy + oz * oz);
+
+      const bool outOfRange =
+          (sep - extent) > critRadius || nearDist > critRadius;
+
+      if (!outOfRange) {
+        if (2*extent > (sep * bhAngle)) {
+          // Open node
+          cursor = visited->u.n.nextNode;
+          continue;
+        }
+        // Accept
+        nodeInteract->push_back(cursor);
+      }
+    }
+
+    // Skipped, ignored or accepted: move on to the sibling of this node
+    cursor = visited->u.n.sibling;
+
+    // If there is no sibling go up a level until we find a node
+    ID_T up = visited->u.n.parent;
+    while (cursor == -1 && up != -1 && up != root) {
+      FNode* above = &this->fNode[up - this->nodeOffset];
+      cursor = above->u.n.sibling;
+      up = above->u.n.parent;
+    }
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Force calculation for a group of particles
+// Force is calculated between every pair of particles in the group
+// Interaction lists are applied to every particle in the group
+// Tree walk will have to continue from this node to locate all the
+// particles which might be in subnodes of this node.
+//
+// The pair term uses mass / r^2 with no cutoff radius; the interaction
+// list term comes from forceCalculationParticle.  The sum of both is
+// stored in fParticle[].force.  bhAngle is not used by this function.
+//
+/////////////////////////////////////////////////////////////////////////
+
+void BHForceTree::forceCalculationGroup(
+                    ID_T node,
+                    POSVEL_T bhAngle,    // Open or accept
+                    POSVEL_T critRadius, // Accept or ignore node not opened
+                    vector<ID_T>* partInteract,
+                    vector<ID_T>* nodeInteract)
+{
+  // Collect all particles in the tree from this node downwards
+  vector<ID_T> members;
+  collectParticles(node, &members);
+  int count = members.size();
+
+  // One zero-initialized accumulator slot per member of the group
+  vector<POTENTIAL_T> accum(count);
+
+  // Upper triangular sweep so that each pair of members is evaluated once
+  for (int a = 0; a < count; a++) {
+    const ID_T aId = members[a];
+
+    for (int b = a + 1; b < count; b++) {
+      const ID_T bId = members[b];
+
+      const POSVEL_T sepX = (POSVEL_T) fabs(this->xx[aId] - this->xx[bId]);
+      const POSVEL_T sepY = (POSVEL_T) fabs(this->yy[aId] - this->yy[bId]);
+      const POSVEL_T sepZ = (POSVEL_T) fabs(this->zz[aId] - this->zz[bId]);
+      const POSVEL_T dist2 = sepX * sepX + sepY * sepY + sepZ * sepZ;
+
+      // Coincident particles contribute nothing
+      if (dist2 == 0.0)
+        continue;
+
+      accum[a] -= (this->mass[bId] / dist2);
+      accum[b] -= (this->mass[aId] / dist2);
+    }
+  }
+
+  // Process each particle against the interaction lists
+  // Node interact list was created using the node this particle is in
+  // so it may need to be adjusted first
+  for (int a = 0; a < count; a++) {
+    const ID_T aId = members[a];
+
+    POSVEL_T listTerm =
+      forceCalculationParticle(aId, critRadius,
+                               partInteract, nodeInteract);
+    accum[a] += listTerm;
+    this->fParticle[aId].force = accum[a];
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Short range force calculation
+// Potential is calculated and is used to determine the acceleration of
+// the particle.  Acceleration is applied to the current velocity to
+// produce the velocity at the next time step.
+//
+// For every contributing source (particle or accepted node)
+//   f_over_r = source mass * m_fl->f_over_r(r2)
+//   phi     -= f_over_r
+//   velocity of p0 += (dx, dy, dz) * f_over_r * m_fcoeff
+// A listed particle contributes when it is not p0 itself and is closer
+// than critRadius.  A listed node contributes unless it fails the same
+// distance tests used when ignoring a node during the tree walk.
+//
+// Returns phi.  vx, vy and vz of p0 are updated as a side effect.
+//
+/////////////////////////////////////////////////////////////////////////
+
+POSVEL_T BHForceTree::forceCalculationParticle(
+                    ID_T p0,                    // Target particle index
+                    POSVEL_T critRadius,
+                    vector<ID_T>* partInteract, // Particles acting on p
+                    vector<ID_T>* nodeInteract) // Nodes acting on p
+{
+  POSVEL_T phi = 0.0;
+
+  POSVEL_T pos0_x = this->xx[p0];
+  POSVEL_T pos0_y = this->yy[p0];
+  POSVEL_T pos0_z = this->zz[p0];
+
+  int numberOfNodes = (int) nodeInteract->size();
+  int numberOfParticles = (int) partInteract->size();
+
+  // Particles contributing to the force use location and mass of one particle
+  for (int p = 0; p < numberOfParticles; p++) {
+    ID_T particle = (*partInteract)[p];
+
+    // A particle exerts no force on itself
+    if (p0 == particle)
+      continue;
+
+    POSVEL_T dx = this->xx[particle] - pos0_x;
+    POSVEL_T dy = this->yy[particle] - pos0_y;
+    POSVEL_T dz = this->zz[particle] - pos0_z;
+
+    POSVEL_T r2 = dx * dx + dy * dy + dz * dz;
+    POSVEL_T r = sqrt(r2);
+
+    // The list was built from a group center, so test this particle again
+    if (r < critRadius) {
+      POSVEL_T f_over_r = this->mass[particle] * m_fl->f_over_r(r2);
+      phi -= f_over_r;
+
+      this->vx[p0] += dx * f_over_r * m_fcoeff;
+      this->vy[p0] += dy * f_over_r * m_fcoeff;
+      this->vz[p0] += dz * f_over_r * m_fcoeff;
+    }
+  }
+
+  // Nodes contributing to force use center of mass and total particle mass
+  for (int n = 0; n < numberOfNodes; n++) {
+    FNode* node = &this->fNode[(*nodeInteract)[n] - this->nodeOffset];
+    POSVEL_T partRadius = node->u.n.partRadius;
+    POSVEL_T distToNearPoint = distanceToNearestPoint(
+                                 pos0_x, pos0_y, pos0_z, node);
+
+    POSVEL_T dx = node->u.n.partCenter[0] - pos0_x;
+    POSVEL_T dy = node->u.n.partCenter[1] - pos0_y;
+    POSVEL_T dz = node->u.n.partCenter[2] - pos0_z;
+
+    POSVEL_T r2 = dx * dx + dy * dy + dz * dz;
+    POSVEL_T r = sqrt(r2);
+
+    // Node is too far away from this particular particle, ignore it
+    if ((r - partRadius) > critRadius || distToNearPoint > critRadius)
+      continue;
+
+    POSVEL_T f_over_r = node->u.n.partMass * m_fl->f_over_r(r2);
+    phi -= f_over_r;
+
+    this->vx[p0] += dx * f_over_r * m_fcoeff;
+    this->vy[p0] += dy * f_over_r * m_fcoeff;
+    this->vz[p0] += dz * f_over_r * m_fcoeff;
+  }
+  return phi;
+}
+
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Collect all particle ids from this node downwards in tree
+// Children are reached through nextNode of the node and then along the
+// sibling links.  Particle ids are appended to the vector in the order
+// visited, child nodes are handled by recursion.
+//
+/////////////////////////////////////////////////////////////////////////
+
+void BHForceTree::collectParticles(ID_T curId, vector<ID_T>* particles)
+{
+  FNode* start = &this->fNode[curId - this->nodeOffset];
+
+  // A sibling id of -1 marks the end of the chain of children
+  for (ID_T id = start->u.n.nextNode; id != -1; ) {
+
+    if (id >= this->nodeOffset) {
+      // Node, gather everything below it before moving on
+      collectParticles(id, particles);
+      id = this->fNode[id - this->nodeOffset].u.n.sibling;
+      continue;
+    }
+
+    // Particle
+    particles->push_back(id);
+    id = this->fParticle[id].sibling;
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Short range gravity calculation for a single particle
+// Starting with the root and following threads and siblings makes decisions
+// about which nodes are opened, accepted or ignored
+//
+// The walk itself is done by createParticleInteractList, which applies
+// exactly the open / accept / ignore rules this function needs.  The
+// resulting lists are passed to forceCalculation and the returned value
+// is stored in fParticle[p].force.
+//
+/////////////////////////////////////////////////////////////////////////
+
+void BHForceTree::treeForceGadgetTopDown(
+                    ID_T p,               // Particle to calculate force on
+                    POSVEL_T bhAngle,     // Open node to examine children
+                    POSVEL_T critRadius)  // Accept or ignore node not opened
+{
+  // Keep vectors for now but eventually force can be accumulated
+  // on all particles and on nodes that have been accepted
+  vector<ID_T> partInteract;
+  vector<ID_T> nodeInteract;
+
+  // Follow thread through tree from root choosing nodes and particles
+  // which will contribute to the force of the given particle
+  createParticleInteractList(p, bhAngle, critRadius,
+                             &partInteract, &nodeInteract);
+
+  // Force calculation for this particle
+  this->fParticle[p].force =
+    forceCalculation(p, &partInteract, &nodeInteract);
+}
+
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Short range gravity calculation for a single particle, flat list form
+// Same walk from the root as the top down method, but instead of ids the
+// position and mass of every contributor are recorded:
+//   particle closer than critRadius (and not p)  its location and mass
+//   accepted node                     its center of mass and partMass
+// The four lists are handed to forceCalculationFast and the returned
+// value is stored in fParticle[p].force.
+//
+/////////////////////////////////////////////////////////////////////////
+
+void BHForceTree::treeForceGadgetTopDownFast(
+                    ID_T p,               // Particle to calculate force on
+                    POSVEL_T bhAngle,     // Open node to examine children
+                    POSVEL_T critRadius)  // Accept or ignore node not opened
+{
+  // Entry i of each list describes the same contributor
+  vector<POSVEL_T> srcX;
+  vector<POSVEL_T> srcY;
+  vector<POSVEL_T> srcZ;
+  vector<POSVEL_T> srcMass;
+
+  const POSVEL_T px = this->xx[p];
+  const POSVEL_T py = this->yy[p];
+  const POSVEL_T pz = this->zz[p];
+
+  // Root node id equals the particle count, the walk ends on a negative id
+  const ID_T root = this->particleCount;
+
+  for (ID_T cursor = root; cursor >= 0; ) {
+
+    if (cursor < this->nodeOffset) {
+      // Particle
+      const POSVEL_T ox = this->xx[cursor] - px;
+      const POSVEL_T oy = this->yy[cursor] - py;
+      const POSVEL_T oz = this->zz[cursor] - pz;
+      const POSVEL_T sep = sqrt(ox * ox + oy * oy + oz * oz);
+
+      // Within range and not the target particle itself
+      if (sep < critRadius && p != cursor) {
+        srcX.push_back(this->xx[cursor]);
+        srcY.push_back(this->yy[cursor]);
+        srcZ.push_back(this->zz[cursor]);
+        srcMass.push_back(this->mass[cursor]);
+      }
+
+      cursor = this->fParticle[cursor].nextNode;
+      continue;
+    }
+
+    // Node
+    FNode* here = &this->fNode[cursor - this->nodeOffset];
+    const POSVEL_T extent = here->u.n.partRadius;
+    const POSVEL_T nearDist = distanceToNearestPoint(px, py, pz, here);
+
+    const POSVEL_T ox = here->u.n.partCenter[0] - px;
+    const POSVEL_T oy = here->u.n.partCenter[1] - py;
+    const POSVEL_T oz = here->u.n.partCenter[2] - pz;
+    const POSVEL_T sep = sqrt(ox * ox + oy * oy + oz * oz);
+
+    // Node is ignored if it is too far away from the particle
+    // Distance from particle to particle radius exceeds critical radius
+    // Distance from particle to nearest side of node exceeds critical radius
+    const bool outOfRange =
+        (sep - extent) > critRadius || nearDist > critRadius;
+
+    if (!outOfRange) {
+      if (2*extent > (sep * bhAngle)) {
+        // Open node, move on to first child
+        cursor = here->u.n.nextNode;
+        continue;
+      }
+      // Accept node, record its center of mass and total mass
+      srcX.push_back(here->u.n.partCenter[0]);
+      srcY.push_back(here->u.n.partCenter[1]);
+      srcZ.push_back(here->u.n.partCenter[2]);
+      srcMass.push_back(here->u.n.partMass);
+    }
+
+    // Ignored or accepted, either way move on to the sibling of this node
+    cursor = here->u.n.sibling;
+
+    // If there is no sibling go up a level until we find a node
+    ID_T up = here->u.n.parent;
+    while (cursor == -1 && up != -1 && up != root) {
+      FNode* above = &this->fNode[up - this->nodeOffset];
+      cursor = above->u.n.sibling;
+      up = above->u.n.parent;
+    }
+  }
+
+  // Force calculation for this particle
+  this->fParticle[p].force =
+    forceCalculationFast(p, &srcX, &srcY, &srcZ, &srcMass);
+}
+
+
+// Top-down tree walk for one particle that collects interaction data by
+// value (position and mass) into caller-supplied vectors, evaluates the
+// force with forceCalculationFast(), and reports the time of both phases.
+//
+// The four interaction vectors are appended to during the walk and cleared
+// before returning.  timeWalk and timeEval receive the elapsed clock() time
+// (divided by CLOCKS_PER_SEC) of the walk and of the force evaluation.
+void BHForceTree::treeForceGadgetTopDownFast2(
+                    ID_T p,               // Particle to calculate force on
+                    POSVEL_T bhAngle,     // Open node to examine children
+                    POSVEL_T critRadius,  // Accept or ignore node not opened
+                    vector<POSVEL_T>* xInteract,
+                    vector<POSVEL_T>* yInteract,
+                    vector<POSVEL_T>* zInteract,
+                    vector<POSVEL_T>* mInteract,
+                    double *timeWalk,
+                    double *timeEval)
+{
+  // Location of the target particle
+  POSVEL_T px = this->xx[p];
+  POSVEL_T py = this->yy[p];
+  POSVEL_T pz = this->zz[p];
+
+  // The walk starts at the root and follows the nextNode/sibling threading
+  // until no further item is available (negative id)
+  const ID_T root = this->particleCount;
+  ID_T cursor = root;
+
+  const clock_t walkBegin = clock();
+  while (cursor >= 0) {
+
+    if (cursor < this->nodeOffset) {
+      // Particle: interacts when inside the critical radius and not p itself
+      POSVEL_T ddx = this->xx[cursor] - px;
+      POSVEL_T ddy = this->yy[cursor] - py;
+      POSVEL_T ddz = this->zz[cursor] - pz;
+      POSVEL_T dist = sqrt(ddx * ddx + ddy * ddy + ddz * ddz);
+
+      if (dist < critRadius && p != cursor) {
+        xInteract->push_back(this->xx[cursor]);
+        yInteract->push_back(this->yy[cursor]);
+        zInteract->push_back(this->zz[cursor]);
+        mInteract->push_back(this->mass[cursor]);
+      }
+
+      cursor = this->fParticle[cursor].nextNode;
+      continue;
+    }
+
+    // Node: decide between ignore, open and accept
+    FNode* node = &this->fNode[cursor - this->nodeOffset];
+    POSVEL_T radius = node->u.n.partRadius;
+    POSVEL_T nearDist = distanceToNearestPoint(px, py, pz, node);
+
+    POSVEL_T ddx = node->u.n.partCenter[0] - px;
+    POSVEL_T ddy = node->u.n.partCenter[1] - py;
+    POSVEL_T ddz = node->u.n.partCenter[2] - pz;
+    POSVEL_T dist = sqrt(ddx * ddx + ddy * ddy + ddz * ddz);
+
+    // Ignored when the node's particle sphere or its nearest side lies
+    // beyond the critical radius
+    const bool ignored =
+      (dist - radius) > critRadius || nearDist > critRadius;
+
+    if (!ignored && 2*radius > (dist * bhAngle)) {
+      // Open node, move on to first child
+      cursor = node->u.n.nextNode;
+      continue;
+    }
+
+    if (!ignored) {
+      // Accept node, its center and mass stand in for its particles
+      xInteract->push_back(node->u.n.partCenter[0]);
+      yInteract->push_back(node->u.n.partCenter[1]);
+      zInteract->push_back(node->u.n.partCenter[2]);
+      mInteract->push_back(node->u.n.partMass);
+    }
+
+    // Ignored and accepted nodes both continue with the sibling; if there
+    // is none go up a level until an ancestor below the root has one
+    cursor = node->u.n.sibling;
+    ID_T up = node->u.n.parent;
+    while (cursor == -1 && up != -1 && up != root) {
+      const FNode& ancestor = this->fNode[up - this->nodeOffset];
+      cursor = ancestor.u.n.sibling;
+      up = ancestor.u.n.parent;
+    }
+  }
+  const clock_t walkEnd = clock();
+  *timeWalk = 1.0*(walkEnd-walkBegin)/CLOCKS_PER_SEC;
+
+  // Force calculation for this particle
+  const clock_t evalBegin = clock();
+  this->fParticle[p].force =
+    forceCalculationFast(p, xInteract, yInteract, zInteract, mInteract);
+  const clock_t evalEnd = clock();
+  *timeEval = 1.0*(evalEnd-evalBegin)/CLOCKS_PER_SEC;
+
+  // Leave the scratch vectors empty for the next call
+  xInteract->clear();
+  yInteract->clear();
+  zInteract->clear();
+  mInteract->clear();
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Short range gravity calculation for a single particle
+// Starting with the particle walk up the parents processing siblings
+// by testing particles and by opening, accepting or ignoring nodes.
+// Stop moving up parents when the nearest side is beyond critical radius.
+//
+/////////////////////////////////////////////////////////////////////////
+
+void BHForceTree::treeForceGadgetBottomUp(
+                    ID_T p,               // Particle to calculate force on
+                    POSVEL_T bhAngle,     // Open node to examine children
+                    POSVEL_T critRadius)  // Accept or ignore node not opened
+{
+  // Interaction lists use automatic storage so they are released on every
+  // exit path, including an exception thrown while they grow
+  vector<ID_T> partInteract;
+  vector<ID_T> nodeInteract;
+
+  // Location of particle
+  POSVEL_T dx, dy, dz, r, partRadius;
+  POSVEL_T pos_x = this->xx[p];
+  POSVEL_T pos_y = this->yy[p];
+  POSVEL_T pos_z = this->zz[p];
+
+  // curId is the item whose subtree has already been handled, parent is the
+  // node whose children are examined next
+  ID_T curId = p;
+  ID_T parent = this->fParticle[curId].parent;
+
+  while (parent != -1) {
+    ID_T child = this->fNode[parent - this->nodeOffset].u.n.nextNode;
+    while (child != -1) {
+      if (child != curId) {
+        if (child < this->nodeOffset) {
+          // Particle, used when within the critical radius
+          dx = this->xx[child] - pos_x;
+          dy = this->yy[child] - pos_y;
+          dz = this->zz[child] - pos_z;
+          r = sqrt(dx * dx + dy * dy + dz * dz);
+
+          if (r < critRadius) {
+            partInteract.push_back(child);
+          }
+          child = this->fParticle[child].sibling;
+        }
+        else {
+          // Node
+          FNode* childNode = &this->fNode[child - this->nodeOffset];
+          partRadius = childNode->u.n.partRadius;
+          POSVEL_T distToNearPoint =
+            distanceToNearestPoint(pos_x, pos_y, pos_z, childNode);
+
+          dx = childNode->u.n.partCenter[0] - pos_x;
+          dy = childNode->u.n.partCenter[1] - pos_y;
+          dz = childNode->u.n.partCenter[2] - pos_z;
+          r = sqrt(dx * dx + dy * dy + dz * dz);
+
+          // Check for ignore of node first
+          if ((r - partRadius) > critRadius || distToNearPoint > critRadius) {
+            // Ignore
+          }
+          else if (2*partRadius < (r * bhAngle)) {
+            // Accept
+            nodeInteract.push_back(child);
+          }
+          else {
+            // Open node
+            recurseOpenNode(childNode, pos_x, pos_y, pos_z,
+                            bhAngle, critRadius,
+                            &partInteract, &nodeInteract);
+          }
+          child = this->fNode[child - this->nodeOffset].u.n.sibling;
+        }
+      }
+      else {
+        // This child is the item we came up from, skip to its sibling
+        if (curId < this->nodeOffset)
+          child = this->fParticle[curId].sibling;
+        else
+          child = this->fNode[child - this->nodeOffset].u.n.sibling;
+      }
+    }
+    // Move up one level
+    curId = parent;
+    parent = this->fNode[parent - this->nodeOffset].u.n.parent;
+  }
+
+  // Force calculation for this particle
+  this->fParticle[p].force =
+    forceCalculation(p, &partInteract, &nodeInteract);
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Open this node recursively adding accepted nodes and particles
+// to the interact list
+//
+/////////////////////////////////////////////////////////////////////////
+
+void BHForceTree::recurseOpenNode(
+                    FNode* curNode,
+                    POSVEL_T pos_x,
+                    POSVEL_T pos_y,
+                    POSVEL_T pos_z,
+                    POSVEL_T bhAngle,     // Open node to examine children
+                    POSVEL_T critRadius,  // Accept or ignore node not opened
+                    vector<ID_T>* partInteract,
+                    vector<ID_T>* nodeInteract)
+{
+  // Visit every child of curNode through the sibling chain; each branch
+  // below advances kid itself
+  for (ID_T kid = curNode->u.n.nextNode; kid != -1; ) {
+
+    if (kid < this->nodeOffset) {
+      // Particle child, kept when it lies within the critical radius
+      POSVEL_T ddx = this->xx[kid] - pos_x;
+      POSVEL_T ddy = this->yy[kid] - pos_y;
+      POSVEL_T ddz = this->zz[kid] - pos_z;
+      POSVEL_T dist = sqrt(ddx * ddx + ddy * ddy + ddz * ddz);
+
+      if (dist < critRadius)
+        partInteract->push_back(kid);
+
+      kid = this->fParticle[kid].sibling;
+      continue;
+    }
+
+    // Node child
+    FNode* kidNode = &this->fNode[kid - this->nodeOffset];
+    POSVEL_T radius = kidNode->u.n.partRadius;
+    POSVEL_T nearDist =
+      distanceToNearestPoint(pos_x, pos_y, pos_z, kidNode);
+
+    POSVEL_T ddx = kidNode->u.n.partCenter[0] - pos_x;
+    POSVEL_T ddy = kidNode->u.n.partCenter[1] - pos_y;
+    POSVEL_T ddz = kidNode->u.n.partCenter[2] - pos_z;
+    POSVEL_T dist = sqrt(ddx * ddx + ddy * ddy + ddz * ddz);
+
+    // A node beyond the critical radius contributes nothing
+    const bool ignored =
+      (dist - radius) > critRadius || nearDist > critRadius;
+
+    if (!ignored) {
+      if (2*radius < (dist * bhAngle)) {
+        // Accept the node as a whole
+        nodeInteract->push_back(kid);
+      } else {
+        // Open the node and examine its children
+        recurseOpenNode(kidNode, pos_x, pos_y, pos_z,
+                        bhAngle, critRadius,
+                        partInteract, nodeInteract);
+      }
+    }
+    kid = this->fNode[kid - this->nodeOffset].u.n.sibling;
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Short range gravity calculation for every particle in the tree
+// Recurses through the tree saving previous work for reuse when popping
+// out of recursion.  Based on Barnes treecode.
+//
+/////////////////////////////////////////////////////////////////////////
+
+void BHForceTree::treeForceBarnesAdjust(
+                    POSVEL_T bhAngle,     // Open node to examine children
+                    POSVEL_T critRadius)  // Accept or ignore node not opened
+{
+  ID_T root = this->particleCount;
+
+  // Lists use automatic storage so they are released on every exit path,
+  // including an exception thrown somewhere inside the recursive walk
+  vector<ID_T> active;
+  vector<ID_T> partInteract;
+  vector<ID_T> nodeInteract;
+
+  // The walk starts with only the root on the active list
+  active.push_back(root);
+
+  // Walk uses opening angle, critical radius for open, accept and ignore
+  walkTreeBarnesAdjust(&active, &partInteract, &nodeInteract,
+                       root, bhAngle, critRadius);
+}
+
+///////////////////////////////////////////////////////////////////////////
+//
+// Walk the BH tree for the given particle or node (identifier curId)
+// Recursion starts with a new active list which will contain particles
+// and nodes which possibly will contribute to the force on a particle.
+// Particles on the active list will always be chosen for the interact list.
+// Nodes on the active list may be OPENED if they are close enough
+// or ACCEPTED and used in summary if they are within a critical radius
+// and IGNORED otherwise. Nodes that are opened 
+// have all their children (particles or nodes) added to the active list.
+//
+// After the children are added a new level of recursion starts by
+// calculating a new size for that level, starting a fresh active list
+// and building on the current interact lists.
+//
+// Recursion continues until the active list has been completely processed.
+// When a level of recursion is complete the active list is destroyed
+// and new items put on the interact lists are popped off.
+//
+// The advantage to this method is that items in the interaction list may
+// not need to be processed again when we are doing the low levels of the tree.
+//
+/////////////////////////////////////////////////////////////////////////
+
+void BHForceTree::walkTreeBarnesAdjust(
+                    vector<ID_T>* curActive,      // nodes to be acted on
+                    vector<ID_T>* partInteract,   // particles for force
+                    vector<ID_T>* nodeInteract,   // nodes for force
+                    ID_T curId,                   // current particle or node
+                    POSVEL_T bhAngle,             // open node
+                    POSVEL_T critRadius)          // accept or ignore node
+{
+  POSVEL_T dx, dy, dz, r, partRadius, distToNearPoint;
+
+  // Current active list
+  int begIndx = 0;
+  int endIndx = curActive->size();
+
+  // Active list built for the next recursion level; automatic storage so it
+  // is released on every exit path
+  vector<ID_T> newActive;
+
+  // Set the location for the particle or node for the walk
+  POSVEL_T pos_x, pos_y, pos_z;
+  if (curId < this->nodeOffset) {
+    pos_x = this->xx[curId];
+    pos_y = this->yy[curId];
+    pos_z = this->zz[curId];
+  } else {
+    FNode* curNode = &this->fNode[curId - this->nodeOffset];
+    pos_x = curNode->u.n.partCenter[0];
+    pos_y = curNode->u.n.partCenter[1];
+    pos_z = curNode->u.n.partCenter[2];
+  }
+
+  /////////////////////////////////////////////////////////////////////////
+  //
+  // Process the active list window adding children to the new active list
+  // Valid particles and accepted nodes are copied to the interact list
+  // pcount and ncount remember how many entries this level appended
+  //
+  int hasChildren = 0;
+  int pcount = 0;
+  int ncount = 0;
+  for (int indx = begIndx; indx < endIndx; indx++) {
+
+    // If the current active element is a cell it will be
+    // ACCEPTED and copied to the interact list
+    // OPENED and its children will be added to the new active list
+    // IGNORED because it is too far away
+    ID_T actId = (*curActive)[indx];
+    if (actId >= this->nodeOffset) {
+      // Set for any node on the active list, whatever is decided for it
+      hasChildren = 1;
+
+      FNode* actNode = &this->fNode[actId - this->nodeOffset];
+      partRadius = actNode->u.n.partRadius;
+      distToNearPoint = distanceToNearestPoint(pos_x, pos_y, pos_z, actNode);
+
+      dx = actNode->u.n.partCenter[0] - pos_x;
+      dy = actNode->u.n.partCenter[1] - pos_y;
+      dz = actNode->u.n.partCenter[2] - pos_z;
+      r = sqrt(dx * dx + dy * dy + dz * dz);
+
+      // Node is ignored if it is too far away from the particle
+      // Distance from particle to particle radius exceeds critical radius
+      // Distance from particle to nearest side of node exceeds critical radius
+      if ((r - partRadius) > critRadius || distToNearPoint > critRadius) {
+
+        // Ignore node, move on to sibling of this node
+      }
+      else {
+        if (2*partRadius > (r * bhAngle)) {
+          // Open node, move on to first child
+          ID_T child = actNode->u.n.nextNode;
+          while (child != -1) {
+            if (child >= this->nodeOffset) {
+
+              // Child is a node which is active and must be considered
+              newActive.push_back(child);
+              child = this->fNode[child - this->nodeOffset].u.n.sibling;
+            }
+            else {
+              // Child is a particle, add to interaction list
+              partInteract->push_back(child);
+              pcount++;
+              child = this->fParticle[child].sibling;
+            }
+          }
+        } else {
+          // Accept node, add to interact list, move on to sibling
+          nodeInteract->push_back(actId);
+          ncount++;
+        }
+      }
+    }
+  }
+
+  /////////////////////////////////////////////////////////////////////////
+  //
+  // At this point a new level of children may have been added to active list
+  // Process children by recursing with the new active list
+  //
+  if (hasChildren) {
+
+    // Current item on active list is a cell
+    if (curId >= this->nodeOffset) {
+
+      // Process each child
+      ID_T child = fNode[curId - this->nodeOffset].u.n.nextNode;
+      while (child != -1) {
+
+        // Child is a node
+        if (child >= this->nodeOffset) {
+          FNode* childNode = &this->fNode[child - this->nodeOffset];
+
+          // Recurse on walk tree to process child
+          walkTreeBarnesAdjust(&newActive, partInteract, nodeInteract,
+                               child, bhAngle, critRadius);
+          child = childNode->u.n.sibling;
+        }
+        // Child is a particle
+        else {
+           walkTreeBarnesAdjust(&newActive, partInteract, nodeInteract,
+                                child, bhAngle, critRadius);
+           child = this->fParticle[child].sibling;
+        }
+      }
+    }
+    // Current item on active list is a particle
+    else {
+      walkTreeBarnesAdjust(&newActive, partInteract, nodeInteract,
+                           curId, bhAngle, critRadius);
+    }
+  }
+
+  /////////////////////////////////////////////////////////////////////////
+  //
+  // If no nodes were on the active list we are done and can process
+  // the interact list for this particle p (which can't be a cell)
+  //
+  else if (curId >= this->nodeOffset) {
+    // Node ids start at nodeOffset, so the check has to include equality.
+    // The particle-only work below is skipped because it would index the
+    // particle arrays with a node id.
+    cout << "ERROR: POP OUT ON NODE " << curId << endl;
+  }
+  else {
+    // Since the interact lists might contain accepted nodes from upper levels
+    // which need to be opened for this particle, adjust the lists first
+    vector<ID_T> adjNodeInteract;
+    vector<ID_T> adjPartInteract;
+
+    static Timings::TimerRef adjtimer = Timings::getTimer("Barnes Adjustment");
+    Timings::startTimer(adjtimer);
+    adjustInteraction(curId,
+                      partInteract, nodeInteract,
+                      &adjPartInteract, &adjNodeInteract,
+                      bhAngle, critRadius);
+    Timings::stopTimer(adjtimer);
+
+    // Calculate force for the particle
+    this->fParticle[curId].force =
+      forceCalculation(curId, &adjPartInteract, &adjNodeInteract);
+  }
+
+  // Active list is new for every recursion level
+  // Interact lists are appended to at each recursion level
+  // So interact lists must be popped by the correct number for this recursion
+  for (int i = 0; i < pcount; i++)
+    partInteract->pop_back();
+  for (int i = 0; i < ncount; i++)
+    nodeInteract->pop_back();
+}
+
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Recursion enters with a guess for the interact lists which were set
+// on previous invocations of the method.  Check the node interact list,
+// which contains nodes that were accepted, to see if they should
+// actually be opened or ignored relative to this particle.  The results
+// go into separate adjusted lists; the incoming lists are not modified.
+//
+// Particles in the interact list could instead be grouped and accepted
+// through their parent node, but keeping them as individual particles gives
+// a better answer, not a worse one, so partInteract is copied unchanged.
+//
+/////////////////////////////////////////////////////////////////////////
+
+void BHForceTree::adjustInteraction(
+                    ID_T p0,
+                    vector<ID_T>* partInteract,
+                    vector<ID_T>* nodeInteract,
+                    vector<ID_T>* adjPartInteract,
+                    vector<ID_T>* adjNodeInteract,
+                    POSVEL_T bhAngle,
+                    POSVEL_T critRadius)
+{
+  // Location of the particle the lists are adjusted for
+  POSVEL_T pos_x = this->xx[p0];
+  POSVEL_T pos_y = this->yy[p0];
+  POSVEL_T pos_z = this->zz[p0];
+
+  // Every particle carries over as is; opening nodes can only add more
+  const int particleTotal = (int) partInteract->size();
+  for (int i = 0; i < particleTotal; i++)
+    adjPartInteract->push_back((*partInteract)[i]);
+
+  // Re-evaluate each previously accepted node against this particle
+  const int nodeTotal = (int) nodeInteract->size();
+  for (int i = 0; i < nodeTotal; i++) {
+    const ID_T nodeId = (*nodeInteract)[i];
+    FNode* node = &this->fNode[nodeId - this->nodeOffset];
+    POSVEL_T radius = node->u.n.partRadius;
+    POSVEL_T nearDist = distanceToNearestPoint(pos_x, pos_y, pos_z, node);
+
+    POSVEL_T ddx = node->u.n.partCenter[0] - pos_x;
+    POSVEL_T ddy = node->u.n.partCenter[1] - pos_y;
+    POSVEL_T ddz = node->u.n.partCenter[2] - pos_z;
+    POSVEL_T dist = sqrt(ddx * ddx + ddy * ddy + ddz * ddz);
+
+    // Too far away: the node's particle sphere or its nearest side lies
+    // beyond the critical radius, so the node is dropped
+    if ((dist - radius) > critRadius || nearDist > critRadius)
+      continue;
+
+    if (2*radius > (dist * bhAngle)) {
+      // Node must be opened and constituent parts examined
+      adjustNodeInteract(p0, node, adjPartInteract, adjNodeInteract,
+                         bhAngle, critRadius);
+    } else {
+      // Node stays accepted
+      adjNodeInteract->push_back(nodeId);
+    }
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Recursive part of interaction adjustment
+// Examine children of current node recursively for inclusion into interaction
+//
+/////////////////////////////////////////////////////////////////////////
+
+void BHForceTree::adjustNodeInteract(
+                    ID_T p0,
+                    FNode* curNode,
+                    vector<ID_T>* adjPartInteract,
+                    vector<ID_T>* adjNodeInteract,
+                    POSVEL_T bhAngle,
+                    POSVEL_T critRadius)
+{
+  // Location of the particle the lists are adjusted for
+  POSVEL_T pos_x = this->xx[p0];
+  POSVEL_T pos_y = this->yy[p0];
+  POSVEL_T pos_z = this->zz[p0];
+
+  // curNode is being opened: walk its children through the sibling chain,
+  // each branch below advances kid itself
+  for (ID_T kid = curNode->u.n.nextNode; kid != -1; ) {
+
+    if (kid < this->nodeOffset) {
+      // Particle child goes straight onto the adjusted particle list
+      // (no critical radius test is applied here)
+      adjPartInteract->push_back(kid);
+      kid = this->fParticle[kid].sibling;
+      continue;
+    }
+
+    // Node child: decide between ignore, open and accept
+    FNode* kidNode = &this->fNode[kid - this->nodeOffset];
+    POSVEL_T radius = kidNode->u.n.partRadius;
+    POSVEL_T nearDist = distanceToNearestPoint(pos_x, pos_y, pos_z, kidNode);
+
+    POSVEL_T ddx = kidNode->u.n.partCenter[0] - pos_x;
+    POSVEL_T ddy = kidNode->u.n.partCenter[1] - pos_y;
+    POSVEL_T ddz = kidNode->u.n.partCenter[2] - pos_z;
+    POSVEL_T dist = sqrt(ddx * ddx + ddy * ddy + ddz * ddz);
+
+    // Ignored when the node's particle sphere or its nearest side lies
+    // beyond the critical radius
+    const bool ignored =
+      (dist - radius) > critRadius || nearDist > critRadius;
+
+    if (!ignored) {
+      if (2*radius > (dist * bhAngle)) {
+        // Node must be opened and constituent parts examined
+        adjustNodeInteract(p0, kidNode, adjPartInteract, adjNodeInteract,
+                           bhAngle, critRadius);
+      } else {
+        // Accept node
+        adjNodeInteract->push_back(kid);
+      }
+    }
+    kid = this->fNode[kid - this->nodeOffset].u.n.sibling;
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Short range gravity calculation for every particle in the tree
+// Recurses through the tree saving previous work for reuse when popping
+// out of recursion.  Based on Barnes treecode with quick scan.
+//
+/////////////////////////////////////////////////////////////////////////
+
+void BHForceTree::treeForceBarnesQuick(
+                    POSVEL_T bhAngle,     // Open a node
+                    POSVEL_T critRadius)  // Accept or ignore node not opened
+{
+  ID_T root = this->particleCount;
+
+  // Lists use automatic storage so they are released on every exit path,
+  // including an exception thrown somewhere inside the recursive walk
+  vector<ID_T> active;
+  vector<ID_T> partInteract;
+  vector<ID_T> nodeInteract;
+
+  // The walk starts with only the root on the active list
+  active.push_back(root);
+
+  // Quick walk of the tree; unlike the adjust variant the interact lists
+  // are passed to the force calculation without a per-particle adjustment
+  walkTreeBarnesQuick(&active, &partInteract, &nodeInteract,
+                      root, bhAngle, critRadius);
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Walk the BH tree for the given particle or node (identifier curId)
+// Recursion starts with a new active list which will contain particles
+// and nodes which possibly will contribute to the force on a particle.
+// Particles on the active list will always be chosen for the interact list.
+// Nodes on the active list may be OPENED if they are close enough
+// or ACCEPTED and used in summary if they are not. Nodes that are opened 
+// have all their children (particles or nodes) added to the active list.
+//
+// After the children are added a new level of recursion starts by
+// calculating a new size for that level, starting a fresh active list
+// and building on the current interact lists.
+//
+// Recursion continues until the active list has been completely processed.
+// When a level of recursion is complete the active list is destroyed
+// and new items put on the interact lists are popped off.
+//
+// The advantage to this method is that items in the interaction list may
+// not need to be processed again when we are doing the low levels of the tree.
+//
+/////////////////////////////////////////////////////////////////////////
+
+void BHForceTree::walkTreeBarnesQuick(
+                    vector<ID_T>* curActive,      // nodes to be acted on
+                    vector<ID_T>* partInteract,   // particles for force
+                    vector<ID_T>* nodeInteract,   // nodes for force
+                    ID_T curId,                   // current particle or node
+                    POSVEL_T bhAngle,             // open node
+                    POSVEL_T critRadius)          // accept or ignore
+{
+  POSVEL_T dx, dy, dz, r, partRadius, distToNearPoint;
+
+  // Current active list
+  int begIndx = 0;
+  int endIndx = curActive->size();
+
+  // Active list built for the next recursion level; automatic storage so it
+  // is released on every exit path
+  vector<ID_T> newActive;
+
+  // Set the location for the particle or node for the walk
+  POSVEL_T pos_x, pos_y, pos_z;
+  if (curId < this->nodeOffset) {
+    pos_x = this->xx[curId];
+    pos_y = this->yy[curId];
+    pos_z = this->zz[curId];
+  } else {
+    FNode* curNode = &this->fNode[curId - this->nodeOffset];
+    pos_x = curNode->u.n.partCenter[0];
+    pos_y = curNode->u.n.partCenter[1];
+    pos_z = curNode->u.n.partCenter[2];
+  }
+
+  /////////////////////////////////////////////////////////////////////////
+  //
+  // Process the active list window adding children to the new active list
+  // Valid particles and accepted nodes are copied to the interact list
+  // pcount and ncount remember how many entries this level appended
+  //
+  int hasChildren = 0;
+  int pcount = 0;
+  int ncount = 0;
+  for (int indx = begIndx; indx < endIndx; indx++) {
+
+    // If the current active element is a cell it will be
+    // ACCEPTED and copied to the interact list
+    // OPENED and its children will be added to the new active list
+    // IGNORED because it is too far away
+    ID_T actId = (*curActive)[indx];
+    if (actId >= this->nodeOffset) {
+      // Set for any node on the active list, whatever is decided for it
+      hasChildren = 1;
+
+      FNode* actNode = &this->fNode[actId - this->nodeOffset];
+      partRadius = actNode->u.n.partRadius;
+      distToNearPoint = distanceToNearestPoint(pos_x, pos_y, pos_z, actNode);
+
+      dx = actNode->u.n.partCenter[0] - pos_x;
+      dy = actNode->u.n.partCenter[1] - pos_y;
+      dz = actNode->u.n.partCenter[2] - pos_z;
+      r = sqrt(dx * dx + dy * dy + dz * dz);
+
+      // Node is ignored if it is too far away from the particle
+      // Distance from particle to particle radius exceeds critical radius
+      // Distance from particle to nearest side of node exceeds critical radius
+      if ((r - partRadius) > critRadius || distToNearPoint > critRadius) {
+        // Ignore node, move on to sibling of this node
+      }
+      else {
+        if (2*partRadius > (r * bhAngle)) {
+          // Open node, move on to first child
+          ID_T child = actNode->u.n.nextNode;
+          while (child != -1) {
+            if (child >= this->nodeOffset) {
+
+              // Child is a node which is active and must be considered
+              newActive.push_back(child);
+              child = this->fNode[child - this->nodeOffset].u.n.sibling;
+            }
+            else {
+              // Child is a particle, add to interaction list
+              partInteract->push_back(child);
+              pcount++;
+              child = this->fParticle[child].sibling;
+            }
+          }
+        } else {
+          // Accept node, add to interact list, move on to sibling
+          nodeInteract->push_back(actId);
+          ncount++;
+        }
+      }
+    }
+  }
+
+  /////////////////////////////////////////////////////////////////////////
+  //
+  // At this point a new level of children may have been added to active list
+  // Process children by recursing with the new active list
+  //
+  if (hasChildren) {
+
+    // Current item on active list is a cell
+    if (curId >= this->nodeOffset) {
+
+      // Process each child
+      ID_T child = fNode[curId - this->nodeOffset].u.n.nextNode;
+      while (child != -1) {
+
+        // Child is a node
+        if (child >= this->nodeOffset) {
+          FNode* childNode = &this->fNode[child - this->nodeOffset];
+
+          // Recurse on walk tree to process child
+          walkTreeBarnesQuick(&newActive, partInteract, nodeInteract,
+                              child, bhAngle, critRadius);
+          child = childNode->u.n.sibling;
+        }
+        // Child is a particle
+        else {
+           walkTreeBarnesQuick(&newActive, partInteract, nodeInteract,
+                               child, bhAngle, critRadius);
+           child = this->fParticle[child].sibling;
+        }
+      }
+    }
+    // Current item on active list is a particle
+    else {
+      walkTreeBarnesQuick(&newActive, partInteract, nodeInteract,
+                          curId, bhAngle, critRadius);
+    }
+  }
+
+  /////////////////////////////////////////////////////////////////////////
+  //
+  // If no nodes were on the active list we are done and can process
+  // the interact list for this particle p (which can't be a cell)
+  //
+  else if (curId >= this->nodeOffset) {
+    // Node ids start at nodeOffset, so the check has to include equality.
+    // The force calculation is skipped because it would index the particle
+    // arrays with a node id.
+    cout << "ERROR: POP OUT ON NODE " << curId << endl;
+  }
+  else {
+    this->fParticle[curId].force =
+      forceCalculation(curId, partInteract, nodeInteract);
+  }
+
+  // Active list is new for every recursion level
+  // Interact lists are appended to at each recursion level
+  // So interact lists must be popped by the correct number for this recursion
+  for (int i = 0; i < pcount; i++)
+    partInteract->pop_back();
+  for (int i = 0; i < ncount; i++)
+    nodeInteract->pop_back();
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Short range force calculation
+// Potential is calculated and is used to determine the acceleration of
+// the particle.  Acceleration is applied to the current velocity to
+// produce the velocity at the next time step.
+//
+/////////////////////////////////////////////////////////////////////////
+
+POSVEL_T BHForceTree::forceCalculation(
+                    ID_T p0,                    // Target particle index
+                    vector<ID_T>* partInteract, // Particles acting on p
+                    vector<ID_T>* nodeInteract) // Nodes acting on p
+{
+  // Sum of -mass * f_over_r(r^2) over all contributors, returned to caller
+  POSVEL_T phi = 0.0;
+
+  POSVEL_T pos0_x = this->xx[p0];
+  POSVEL_T pos0_y = this->yy[p0];
+  POSVEL_T pos0_z = this->zz[p0];
+
+  int numberOfNodes = (int) nodeInteract->size();
+  int numberOfParticles = (int) partInteract->size();
+
+  // Particles contributing to the force use location and mass of one particle
+  for (int p = 0; p < numberOfParticles; p++) {
+    ID_T particle = (*partInteract)[p];
+
+    // The target particle never acts on itself
+    if (p0 != particle) {
+      POSVEL_T dx = this->xx[particle] - pos0_x;
+      POSVEL_T dy = this->yy[particle] - pos0_y;
+      POSVEL_T dz = this->zz[particle] - pos0_z;
+
+      POSVEL_T r2 = dx * dx + dy * dy + dz * dz;
+
+      POSVEL_T f_over_r = this->mass[particle] * m_fl->f_over_r(r2);
+      phi -= f_over_r;
+
+      // The contribution is applied directly to the target's velocity
+      this->vx[p0] += dx * f_over_r * m_fcoeff;
+      this->vy[p0] += dy * f_over_r * m_fcoeff;
+      this->vz[p0] += dz * f_over_r * m_fcoeff;
+    }
+  }
+
+  // Nodes contributing to force use center of mass and total particle mass
+  for (int n = 0; n < numberOfNodes; n++) {
+    FNode* node = &this->fNode[(*nodeInteract)[n] - this->nodeOffset];
+    POSVEL_T dx = node->u.n.partCenter[0] - pos0_x;
+    POSVEL_T dy = node->u.n.partCenter[1] - pos0_y;
+    POSVEL_T dz = node->u.n.partCenter[2] - pos0_z;
+
+    POSVEL_T r2 = dx * dx + dy * dy + dz * dz;
+
+    POSVEL_T f_over_r = node->u.n.partMass * m_fl->f_over_r(r2);
+    phi -= f_over_r;
+
+    this->vx[p0] += dx * f_over_r * m_fcoeff;
+    this->vy[p0] += dy * f_over_r * m_fcoeff;
+    this->vz[p0] += dz * f_over_r * m_fcoeff;
+  }
+  return phi;
+}
+
+POSVEL_T BHForceTree::forceCalculationFast(
+                    ID_T p0,                    // Target particle index
+                    vector<POSVEL_T>* xInteract,
+                    vector<POSVEL_T>* yInteract,
+                    vector<POSVEL_T>* zInteract,
+                    vector<POSVEL_T>* mInteract)
+{
+  // Applies to particle p0 the pull of every contributor described by the
+  // parallel arrays of x, y, z position and mass.  Only the velocity of p0 is
+  // changed; no potential is accumulated, so the return value is always 0.
+  const POSVEL_T targetX = this->xx[p0];
+  const POSVEL_T targetY = this->yy[p0];
+  const POSVEL_T targetZ = this->zz[p0];
+
+  // The length of xInteract determines how many contributors are read
+  const int count = (int) xInteract->size();
+  for (int i = 0; i < count; ++i) {
+    const POSVEL_T sepX = (*xInteract)[i] - targetX;
+    const POSVEL_T sepY = (*yInteract)[i] - targetY;
+    const POSVEL_T sepZ = (*zInteract)[i] - targetZ;
+    const POSVEL_T distSq = sepX * sepX + sepY * sepY + sepZ * sepZ;
+
+    const POSVEL_T strength = (*mInteract)[i] * m_fl->f_over_r(distSq);
+
+    this->vx[p0] += sepX * strength * m_fcoeff;
+    this->vy[p0] += sepY * strength * m_fcoeff;
+    this->vz[p0] += sepZ * strength * m_fcoeff;
+  }
+
+  // No potential is tracked by this variant
+  return (POSVEL_T) 0.0;
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Return the distance from location to the closest point on FNode
+//
+/////////////////////////////////////////////////////////////////////////
+
+POSVEL_T BHForceTree::distanceToNearestPoint(
+                              POSVEL_T pos_x,
+                              POSVEL_T pos_y,
+                              POSVEL_T pos_z,
+                              FNode* node)
+{
+  // Distance from the location (pos_x, pos_y, pos_z) to the closest point of
+  // the node's bounding box.
+  //
+  // On each axis the box spans geoCenter -/+ half of geoSide.  The per-axis
+  // gap is how far the location lies below the lower face or above the
+  // upper face, and zero when it falls between the two faces.
+  //
+  // Returns the Euclidean length of the three gaps (the square root is
+  // taken, so this is a distance and not a squared distance); the result
+  // is 0 for a location inside or on the box.
+  const POSVEL_T loc[DIMENSION] = { pos_x, pos_y, pos_z };
+  POSVEL_T gap[DIMENSION];
+
+  for (int axis = 0; axis < DIMENSION; axis++) {
+    // Lower and upper face of the bounding box on this axis
+    const POSVEL_T lower =
+      node->geoCenter[axis] - (node->geoSide[axis] * 0.5);
+    const POSVEL_T upper =
+      node->geoCenter[axis] + (node->geoSide[axis] * 0.5);
+
+    if (loc[axis] < lower)
+      gap[axis] = lower - loc[axis];
+    else if (loc[axis] > upper)
+      gap[axis] = loc[axis] - upper;
+    else
+      gap[axis] = 0.0;
+  }
+
+  // Combine the x, y and z gaps
+  const POSVEL_T nearest =
+    sqrt(gap[0] * gap[0] + gap[1] * gap[1] + gap[2] * gap[2]);
+  return nearest;
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Return the distance from location to the fNode center of mass
+//
+/////////////////////////////////////////////////////////////////////////
+
+POSVEL_T BHForceTree::distanceToCenterOfMass(
+                              POSVEL_T xLoc,
+                              POSVEL_T yLoc,
+                              POSVEL_T zLoc,
+                              FNode* node)
+{
+  // Euclidean distance from the location to the node's partCenter
+  const POSVEL_T offX = (POSVEL_T) fabs(xLoc - node->u.n.partCenter[0]);
+  const POSVEL_T offY = (POSVEL_T) fabs(yLoc - node->u.n.partCenter[1]);
+  const POSVEL_T offZ = (POSVEL_T) fabs(zLoc - node->u.n.partCenter[2]);
+  return sqrt((offX * offX) + (offY * offY) + (offZ * offZ));
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Return the distance from location to the fNode furthest corner
+//
+/////////////////////////////////////////////////////////////////////////
+
+POSVEL_T BHForceTree::distanceToFarCorner(
+                              POSVEL_T xLoc,
+                              POSVEL_T yLoc,
+                              POSVEL_T zLoc,
+                              FNode* node)
+{
+  // Visits the eight corners of the node's bounding box (geoCenter -/+ half
+  // of geoSide per axis) and returns the largest distance from the location
+  // to any of them.  Bits 0, 1 and 2 of the corner number choose the
+  // upper (+) or lower (-) face on x, y and z respectively.
+  POSVEL_T farthest = 0.0;
+  for (int c = 0; c < 8; c++) {
+    const int sx = (c & 1) ? 1 : -1;
+    const int sy = (c & 2) ? 1 : -1;
+    const int sz = (c & 4) ? 1 : -1;
+    const POSVEL_T cx = node->geoCenter[0] + (sx * (node->geoSide[0] * 0.5));
+    const POSVEL_T cy = node->geoCenter[1] + (sy * (node->geoSide[1] * 0.5));
+    const POSVEL_T cz = node->geoCenter[2] + (sz * (node->geoSide[2] * 0.5));
+
+    const POSVEL_T ox = (POSVEL_T) fabs(xLoc - cx);
+    const POSVEL_T oy = (POSVEL_T) fabs(yLoc - cy);
+    const POSVEL_T oz = (POSVEL_T) fabs(zLoc - cz);
+    const POSVEL_T reach = sqrt((ox * ox) + (oy * oy) + (oz * oz));
+    if (reach > farthest)
+      farthest = reach;
+  }
+  return farthest;
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Return the distance from location to the fNode nearest corner
+//
+/////////////////////////////////////////////////////////////////////////
+
+POSVEL_T BHForceTree::distanceToNearCorner(
+                              POSVEL_T xLoc,
+                              POSVEL_T yLoc,
+                              POSVEL_T zLoc,
+                              FNode* node)
+{
+  // Visits the eight corners of the node's bounding box (geoCenter -/+ half
+  // of geoSide per axis) and returns the smallest distance from the location
+  // to any of them, starting the search from MAX_FLOAT.  Bits 0, 1 and 2 of
+  // the corner number choose the upper (+) or lower (-) face on x, y and z.
+  POSVEL_T closest = MAX_FLOAT;
+  for (int c = 0; c < 8; c++) {
+    const int sx = (c & 1) ? 1 : -1;
+    const int sy = (c & 2) ? 1 : -1;
+    const int sz = (c & 4) ? 1 : -1;
+    const POSVEL_T cx = node->geoCenter[0] + (sx * (node->geoSide[0] * 0.5));
+    const POSVEL_T cy = node->geoCenter[1] + (sy * (node->geoSide[1] * 0.5));
+    const POSVEL_T cz = node->geoCenter[2] + (sz * (node->geoSide[2] * 0.5));
+
+    const POSVEL_T ox = (POSVEL_T) fabs(xLoc - cx);
+    const POSVEL_T oy = (POSVEL_T) fabs(yLoc - cy);
+    const POSVEL_T oz = (POSVEL_T) fabs(zLoc - cz);
+    const POSVEL_T reach = sqrt((ox * ox) + (oy * oy) + (oz * oz));
+    if (reach < closest)
+      closest = reach;
+  }
+  return closest;
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Print BH tree with indentations indicating levels
+// Since the tree has been threaded changing the recursive tree with children
+// into an iterative tree with next nodes and parents, walk the tree
+// iteratively keeping track of parents to indicate when levels change
+//
+/////////////////////////////////////////////////////////////////////////
+
+void BHForceTree::printBHForceTree()
+{
+  // Walks the threaded tree through the nextNode links, starting at the first
+  // FNode (index nodeOffset) and stopping at the -1 terminator.  Indices at
+  // or above nodeOffset are FNodes, smaller indices are FParticles.
+  //
+  // A stack of ancestor indices tracks the current level; the level is
+  // printed as a prefix and also used as the indentation width of a line.
+  vector<ID_T> ancestors;
+  ancestors.push_back(-1);
+  ID_T level = 0;
+
+  for (ID_T cur = this->nodeOffset; cur != -1; ) {
+    const bool isNode = (cur >= this->nodeOffset);
+
+    // Parent of the current entry
+    ID_T parent;
+    if (isNode)
+      parent = this->fNode[cur - this->nodeOffset].u.n.parent;
+    else
+      parent = this->fParticle[cur].parent;
+
+    // Unwind the ancestor stack until its top is that parent
+    for (; parent != ancestors[level]; level--)
+      ancestors.pop_back();
+
+    // Level prefix followed by level-wide padding
+    cout << level << ":" << setw(level) << " ";
+
+    // FParticle: print links and position, then follow its nextNode link
+    if (!isNode) {
+      const FParticle& fp = this->fParticle[cur];
+      cout << "P " << cur
+           << " sibling " << fp.sibling
+           << " next " << fp.nextNode
+           << " parent " << fp.parent
+           << " (" << xx[cur]
+           << " ," << yy[cur]
+           << " ," << zz[cur] << ")" << endl;
+
+      cur = fp.nextNode;
+      continue;
+    }
+
+    // FNode: print links, bounding box, center of mass, mass and radius
+    const FNode& fn = this->fNode[cur - this->nodeOffset];
+    cout << "N " << cur
+         << " sibling " << fn.u.n.sibling
+         << " next " << fn.u.n.nextNode
+         << " parent " << fn.u.n.parent;
+
+    // Bounding box extent on x, y and z
+    for (int axis = 0; axis < 3; axis++)
+      cout << " [" << (fn.geoCenter[axis] - fn.geoSide[axis] / 2.0)
+           << ":" << (fn.geoCenter[axis] + fn.geoSide[axis] / 2.0) << "] ";
+
+    cout << " (" << fn.u.n.partCenter[0]
+         << " ," << fn.u.n.partCenter[1]
+         << " ," << fn.u.n.partCenter[2]
+         << ") MASS " << fn.u.n.partMass
+         << " RADIUS " << fn.u.n.partRadius
+         << endl;
+
+    // An FNode becomes the parent of the entries that follow it
+    ancestors.push_back(cur);
+    level++;
+
+    // Follow the thread to the next particle or node
+    cur = fn.u.n.nextNode;
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Print the force values for comparison
+//
+/////////////////////////////////////////////////////////////////////////
+
+void BHForceTree::printForceValues()
+{
+  // Emit one line per particle with its index and stored force value
+  for (int idx = 0; idx < this->particleCount; ++idx)
+    cout << "Particle: " << setw(8) << idx
+         << " force " << this->fParticle[idx].force << endl;
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Get the index of the child which should contain this particle
+//
+/////////////////////////////////////////////////////////////////////////
+
+int BHForceTree::getChildIndex(FNode* node, ID_T pindx)
+{
+  // Octant number is a 3-bit code: x contributes 4, y 2 and z 1, each added
+  // when the particle is at or beyond the node's geometric center on that axis
+  const int xPart = (this->xx[pindx] >= node->geoCenter[0]) ? 4 : 0;
+  const int yPart = (this->yy[pindx] >= node->geoCenter[1]) ? 2 : 0;
+  const int zPart = (this->zz[pindx] >= node->geoCenter[2]) ? 1 : 0;
+  return xPart + yPart + zPart;
+}
diff --git a/src/halo-finder/src/BHForceTree.h b/src/halo-finder/src/BHForceTree.h
new file mode 100644
index 0000000..09d9fcc
--- /dev/null
+++ b/src/halo-finder/src/BHForceTree.h
@@ -0,0 +1,436 @@
+/*=========================================================================
+                                                                                
+Copyright (c) 2007, Los Alamos National Security, LLC
+
+All rights reserved.
+
+Copyright 2007. Los Alamos National Security, LLC. 
+This software was produced under U.S. Government contract DE-AC52-06NA25396 
+for Los Alamos National Laboratory (LANL), which is operated by 
+Los Alamos National Security, LLC for the U.S. Department of Energy. 
+The U.S. Government has rights to use, reproduce, and distribute this software. 
+NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,
+EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  
+If software is modified to produce derivative works, such modified software 
+should be clearly marked, so as not to confuse it with the version available 
+from LANL.
+ 
+Additionally, redistribution and use in source and binary forms, with or 
+without modification, are permitted provided that the following conditions 
+are met:
+-   Redistributions of source code must retain the above copyright notice, 
+    this list of conditions and the following disclaimer. 
+-   Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution. 
+-   Neither the name of Los Alamos National Security, LLC, Los Alamos National
+    Laboratory, LANL, the U.S. Government, nor the names of its contributors
+    may be used to endorse or promote products derived from this software 
+    without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR 
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+                                                                                
+=========================================================================*/
+
+// .NAME BHForceTree - Create a Barnes Hut tree from the given particles
+//
+// .SECTION Description
+// BHTree takes particle locations and distributes them recursively in
+// a Barnes Hut tree.  The tree is an octree, dividing on the physical
+// location such that one particle or one node appears within a child
+// so that it is essentially AMR for particles.
+//
+// After the tree is created it is walked using depth first recursion and
+// the nodes are threaded together so that the tree becomes iterative.
+// By stringing nodes together rather than maintaining indices into children
+// summary information for each node can replace the 8 integer slots that
+// were taken up by the children.  Now each node can maintain the mass
+// below, the length of the physical box it represents and the center of
+// mass of particles within the node.
+//
+// Each particle and each node maintains an index for the next node and
+// also the parent, so that it is possible to represent the recursive tree
+// by paying attention to parents.
+//
+
+#ifndef BHForceTree_h
+#define BHForceTree_h
+
+#include "BasicDefinition.h"
+#include "bigchunk.h"
+
+#include <string>
+#include <vector>
+#include <algorithm>
+
+#include "ForceLaw.h"
+
+using namespace std;
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Force Particles
+//
+/////////////////////////////////////////////////////////////////////////
+
+class FParticle {
+public:
+  FParticle();
+  // Threading links; following nextNode ends at -1 (see printBHForceTree)
+  ID_T      sibling;		// Next on same level, ending in -1
+  ID_T      nextNode;		// Next node in iteration, particle or node
+  ID_T      parent;		// Parent FNode
+  POSVEL_T  force;		// Value reported per particle by printForceValues
+};
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Force Nodes
+//
+// Barnes Hut octree structure for N-body is represented by vector
+// of FNodes which divide space into octants which are filled with one
+// particle or one branching node.  As the tree is built the child[8]
+// array is used.  Afterwards the tree is walked linking the nodes and
+// replacing the child structure with data about the tree.  When building
+// the tree child information is an integer which is the index of the
+// halo particle which was put into a vector of FParticle, or the index
+// of the FNode offset by the number of particles
+//
+/////////////////////////////////////////////////////////////////////////
+
+class FNode {
+public:
+  FNode(POSVEL_T* minLoc, POSVEL_T* maxLoc);	// presumably the root octant from a bounding box - confirm
+  FNode(FNode* parent, int child);		// presumably octant 'child' of parent - confirm
+
+  POSVEL_T geoSide[DIMENSION];		// Length of octant on each side
+  POSVEL_T geoCenter[DIMENSION];	// Physical center of octant
+  // child[] is used while building; n replaces it once the tree is threaded
+  union {
+    ID_T  child[NUM_CHILDREN];		// Index of particle or node
+    struct NodeInfo {
+      POSVEL_T partCenter[DIMENSION];	// Center of mass of particles in node
+      POSVEL_T partMass;		// Total mass of particles in node
+      POSVEL_T partRadius;		// NOTE(review): particle radius passed up while threading - confirm
+      ID_T     sibling;			// presumably next on same level, as in FParticle - confirm
+      ID_T     nextNode;		// Next particle or node in the threaded walk
+      ID_T     parent;			// Index of the parent node
+    } n;
+  } u;
+};
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Barnes Hut octree of FParticles and FNodes threaded
+//
+/////////////////////////////////////////////////////////////////////////
+
+class BHForceTree {
+public:
+  BHForceTree(
+	      POSVEL_T* minLoc,       // Bounding box of halo
+	      POSVEL_T* maxLoc,       // Bounding box of halo
+	      ID_T count,             // Number of particles in halo
+	      POSVEL_T* xLoc,         // Locations of every particle
+	      POSVEL_T* yLoc,
+	      POSVEL_T* zLoc,
+	      POSVEL_T* xVel,         // Velocities of every particle
+	      POSVEL_T* yVel,
+	      POSVEL_T* zVel,
+	      POSVEL_T* mass,	      // Mass of each particle
+	      POSVEL_T avgMass);      // Average mass for estimation
+
+  BHForceTree(
+	      POSVEL_T* minLoc,       // Bounding box of halo
+	      POSVEL_T* maxLoc,       // Bounding box of halo
+	      ID_T count,             // Number of particles in halo
+	      POSVEL_T* xLoc,         // Locations of every particle
+	      POSVEL_T* yLoc,
+	      POSVEL_T* zLoc,
+	      POSVEL_T* xVel,         // Velocities of every particle
+	      POSVEL_T* yVel,
+	      POSVEL_T* zVel,
+	      POSVEL_T* mass,	      // Mass of each particle
+	      POSVEL_T avgMass,       // Average mass for estimation
+	      ForceLaw *fl,           // Force law; presumably kept as m_fl - confirm
+	      float fcoeff);          // presumably kept as m_fcoeff - confirm
+
+  ~BHForceTree();
+
+  ////////////////////////////////////////////////////////////////////////////
+  //
+  // Create the BH tree recursively by placing particles in empty octants
+  //
+  void createBHForceTree();
+
+  ////////////////////////////////////////////////////////////////////////////
+  //
+  // Walk tree to thread so that it can be accessed iteratively or recursively
+  //
+  void threadBHForceTree(
+	ID_T curIndx,           // Current node/particle
+	ID_T sibling,           // Sibling of current
+	ID_T parent,            // Parent of current
+	ID_T* lastIndx,	        // Last node/particle
+	POSVEL_T* radius);      // Needed to pass up particle radius of child
+
+  POSVEL_T distanceToCenterOfMass(
+	POSVEL_T xLoc,		// Distance from point to node particle center
+	POSVEL_T yLoc,
+	POSVEL_T zLoc,
+	FNode* node);
+
+  POSVEL_T distanceToNearCorner(
+	POSVEL_T xLoc,		// Distance from point to node nearest corner
+	POSVEL_T yLoc,
+	POSVEL_T zLoc,
+	FNode* node);
+
+  POSVEL_T distanceToFarCorner(
+	POSVEL_T xLoc,		// Distance from point to node furthest corner
+	POSVEL_T yLoc,
+	POSVEL_T zLoc,
+	FNode* node);
+
+  POSVEL_T distanceToNearestPoint(
+	POSVEL_T xLoc,		// Distance (square root taken) from point to closest point of node box
+	POSVEL_T yLoc,
+	POSVEL_T zLoc,
+	FNode* node);
+
+  ////////////////////////////////////////////////////////////////////////////
+  //
+  // Calculate force using N^2 method
+  //
+  void treeForceN2(
+	POSVEL_T critRadius);       // Criteria for ignoring a node
+
+  ////////////////////////////////////////////////////////////////////////////
+  //
+  // Short range force calculation on all particles of the tree
+  // Recurse through levels saving information for reuse
+  // Based on Barnes treecode
+  //
+  void treeForceBarnesAdjust(
+	POSVEL_T openAngle,         // Criteria for opening a node
+	POSVEL_T critRadius);       // Criteria for ignoring a node
+
+  // Walk tree using opening angle and critical radius
+  void walkTreeBarnesAdjust(
+	vector<ID_T>* active,	    // List of nodes which must be acted on
+	vector<ID_T>* partInteract,  // Particles which act on object
+	vector<ID_T>* nodeInteract,  // Nodes which act on object
+	ID_T curId,                 // Id of current particle or node
+	POSVEL_T bhAngle,           // Opening angle squared
+	POSVEL_T critRadius);       // Critical radius squared
+
+  // Barnes tree walk will accept nodes that should be opened because of
+  // comparison between two nodes higher in the recursion.  Adjust this
+  // when calculating for a particular particle.
+  void adjustInteraction(
+	ID_T p0,
+	vector<ID_T>* partInteract,
+	vector<ID_T>* nodeInteract,
+	vector<ID_T>* adjPartInteract,
+	vector<ID_T>* adjNodeInteract,
+	POSVEL_T bhAngle,
+	POSVEL_T critRadius);
+
+  // Recursive part of interaction adjustment
+  void adjustNodeInteract(
+	ID_T p0,
+	FNode* curNode,
+	vector<ID_T>* adjPartInteract,
+	vector<ID_T>* adjNodeInteract,
+	POSVEL_T bhAngle,
+	POSVEL_T critRadius);
+
+  ////////////////////////////////////////////////////////////////////////////
+  //
+  // Short range force calculation on all particles of the tree
+  // Recurse through levels saving information for reuse
+  // Based on Barnes treecode with quick scan where nodes are accepted
+  // if they touch the target node.
+  //
+  void treeForceBarnesQuick(
+	POSVEL_T openAngle,         // Criteria for opening a node
+	POSVEL_T critRadius);       // Criteria for ignoring a node
+
+  // Walk tree opening only nodes that physically touch target node
+  void walkTreeBarnesQuick(
+	vector<ID_T>* active,	    // List of nodes which must be acted on
+	vector<ID_T>* partInteract,  // Particles which act on object
+	vector<ID_T>* nodeInteract,  // Nodes which act on object
+	ID_T curId,                 // Id of current particle or node
+	POSVEL_T bhAngle,           // Opening angle squared
+	POSVEL_T critRadius);       // Critical radius squared
+
+  ////////////////////////////////////////////////////////////////////////////
+  //
+  // Calculate force on individual particles using tree walks
+  // Short range force on one particle starting from root walking down
+  //
+  void treeForceGadgetTopDown(
+	ID_T p,                     // Index of particle for calculation
+	POSVEL_T openAngle,         // Criteria for opening a node
+	POSVEL_T critRadius);       // Criteria for ignoring a node
+
+  void treeForceGadgetTopDownFast(
+	ID_T p,                     // Index of particle for calculation
+	POSVEL_T openAngle,         // Criteria for opening a node
+	POSVEL_T critRadius);       // Criteria for ignoring a node
+
+  void treeForceGadgetTopDownFast2(
+	ID_T p,                     // Index of particle for calculation
+	POSVEL_T openAngle,         // Criteria for opening a node
+	POSVEL_T critRadius,        // Criteria for ignoring a node
+	vector<POSVEL_T>* xInteract,
+	vector<POSVEL_T>* yInteract,
+	vector<POSVEL_T>* zInteract,
+	vector<POSVEL_T>* mInteract,
+	double *timeWalk,
+	double *timeEval);
+
+  ////////////////////////////////////////////////////////////////////////////
+  //
+  // Calculate force on individual particles using tree walks
+  // Short range force on one particle starting with particle walking up
+  //
+  void treeForceGadgetBottomUp(
+	ID_T p,                     // Index of particle for calculation
+	POSVEL_T openAngle,         // Criteria for opening a node
+	POSVEL_T critRadius);       // Criteria for ignoring a node
+
+  void recurseOpenNode(
+	FNode* curNode,
+	POSVEL_T pos_x,
+	POSVEL_T pos_y,
+	POSVEL_T pos_z,
+	POSVEL_T bhAngle,           // Open node to examine children
+	POSVEL_T critRadius,        // Accept or ignore node not opened
+	vector<ID_T>* partInteract,
+	vector<ID_T>* nodeInteract);
+
+  ////////////////////////////////////////////////////////////////////////////
+  //
+  // Calculate force on groups of particles in a cell
+  //
+  void treeForceGroup(
+	POSVEL_T openAngle,         // Criteria for opening a node
+	POSVEL_T critRadius,        // Criteria for ignoring a node
+	int minGroup,               // Minimum particles in one group
+	int maxGroup);              // Maximum particles in one group
+
+  // Short range force on one particle starting with particle walking up
+  void walkTreeGroup(
+	ID_T curId,                 // Index of particle or node
+	POSVEL_T minMass,           // Group of particles more than this mass
+	POSVEL_T maxMass,           // Group of particles less than this mass
+	POSVEL_T openAngle,         // Criteria for opening a node
+	POSVEL_T critRadius);       // Criteria for ignoring a node
+
+  // Create the interaction list for a particle starting from root
+  void createParticleInteractList(
+	ID_T p,
+	POSVEL_T bhAngle,
+	POSVEL_T critRadius,
+	vector<ID_T>* partInteract,
+	vector<ID_T>* nodeInteract);
+
+  // Create the interaction list for a node starting from root
+  void createNodeInteractList(
+	ID_T node,
+	POSVEL_T bhAngle,
+	POSVEL_T critRadius,
+	vector<ID_T>* partInteract,
+	vector<ID_T>* nodeInteract);
+
+  // Force calculation for a group of particles
+  void forceCalculationGroup(
+	ID_T node,
+	POSVEL_T bhAngle,
+	POSVEL_T critRadius,
+	vector<ID_T>* partInteract,
+	vector<ID_T>* nodeInteract);
+
+  // Like forceCalculation but with extra exclusion test since particles
+  // are grouped and not all particles and nodes will apply to each
+  POSVEL_T forceCalculationParticle(
+	ID_T p0,                    // Index of target particle
+	POSVEL_T critRadius,
+	vector<ID_T>* partInteract,  // Particles which act on object
+	vector<ID_T>* nodeInteract); // Nodes which act on object
+
+  // Collect the particles within the group
+  void collectParticles(
+	ID_T curId,
+	vector<ID_T>* particles);
+
+  ////////////////////////////////////////////////////////////////////////////
+  // Force calculations
+  // Both add every contribution to the velocity of p0.  forceCalculation
+  // returns the accumulated phi; forceCalculationFast always returns 0.
+  POSVEL_T forceCalculation(
+	ID_T p0,                    // Index of target particle
+	vector<ID_T>* partInteract,  // Particles which act on object
+	vector<ID_T>* nodeInteract); // Nodes which act on object
+
+  POSVEL_T forceCalculationFast(
+	ID_T p0,                    // Index of target particle
+	vector<POSVEL_T>* xInteract, // X location of each contributor
+	vector<POSVEL_T>* yInteract, // Y location of each contributor
+	vector<POSVEL_T>* zInteract, // Z location of each contributor
+	vector<POSVEL_T>* mInteract); // Mass of each contributor
+
+  // Choose the octant of node for particle pindx: x adds 4, y adds 2, z adds 1
+  int getChildIndex(
+	FNode* node,
+	ID_T pindx);
+
+  // Print BH tree depth first
+  void printBHForceTree();
+
+  // Print force values
+  void printForceValues();
+	
+private:
+  int    myProc;                // My processor number
+  int    numProc;               // Total number of processors
+
+  POSVEL_T boxSize;             // Physical box size of the data set
+  POSVEL_T openingAngle;	// Criteria for opening node to lower level
+
+  ID_T   particleCount;         // Total particles
+  ID_T   nodeCount;             // Total nodes
+  ID_T   nodeOffset;		// Index of first node is after last particle
+  POSVEL_T particleMass;	// Average particle mass
+
+  POSVEL_T* xx;                 // X location for particles on this processor
+  POSVEL_T* yy;                 // Y location for particles on this processor
+  POSVEL_T* zz;                 // Z location for particles on this processor
+  POSVEL_T* vx;                 // X velocity for particles on this processor
+  POSVEL_T* vy;                 // Y velocity for particles on this processor
+  POSVEL_T* vz;                 // Z velocity for particles on this processor
+  POSVEL_T* mass;               // Mass for particles on this processor
+
+  POSVEL_T minRange[DIMENSION]; // Physical range of data
+  POSVEL_T maxRange[DIMENSION]; // Physical range of data
+
+  vector<FParticle, bigchunk_allocator<FParticle> >  fParticle;	// Leaf particles in tree
+  vector<FNode, bigchunk_allocator<FNode> >          fNode;	// Internal nodes of tree
+
+  ForceLaw *m_fl;               // Supplies f_over_r(r2) to the force calculations
+  float m_fcoeff;               // Factor applied to every velocity update
+};
+
+#endif
diff --git a/src/halo-finder/src/BasicDefinition.h b/src/halo-finder/src/BasicDefinition.h
new file mode 100644
index 0000000..8c9f1a2
--- /dev/null
+++ b/src/halo-finder/src/BasicDefinition.h
@@ -0,0 +1,307 @@
+/*=========================================================================
+                                                                                
+Copyright (c) 2007, Los Alamos National Security, LLC
+
+All rights reserved.
+
+Copyright 2007. Los Alamos National Security, LLC. 
+This software was produced under U.S. Government contract DE-AC52-06NA25396 
+for Los Alamos National Laboratory (LANL), which is operated by 
+Los Alamos National Security, LLC for the U.S. Department of Energy. 
+The U.S. Government has rights to use, reproduce, and distribute this software. 
+NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,
+EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  
+If software is modified to produce derivative works, such modified software 
+should be clearly marked, so as not to confuse it with the version available 
+from LANL.
+ 
+Additionally, redistribution and use in source and binary forms, with or 
+without modification, are permitted provided that the following conditions 
+are met:
+-   Redistributions of source code must retain the above copyright notice, 
+    this list of conditions and the following disclaimer. 
+-   Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution. 
+-   Neither the name of Los Alamos National Security, LLC, Los Alamos National
+    Laboratory, LANL, the U.S. Government, nor the names of its contributors
+    may be used to endorse or promote products derived from this software 
+    without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR 
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+                                                                                
+=========================================================================*/
+
+#ifndef BasicDefinition_h
+#define BasicDefinition_h
+
+#ifdef USE_VTK_COSMO
+#include "vtkType.h"
+#else
+#include <stdint.h>
+#endif
+
+///////////////////////////////////////////////////////////////////////////
+// Scalar type aliases; ID_64, POSVEL_64 and GRID_64 select the wider variants
+
+#ifdef USE_VTK_COSMO
+#ifdef ID_64
+   typedef      vtkTypeInt64 ID_T;           // Particle and halo ids
+#else
+   typedef      vtkTypeInt32 ID_T;           // Particle and halo ids
+#endif
+#else                                        // Not USE_VTK_COSMO: <stdint.h> types
+#ifdef ID_64
+   typedef      int64_t ID_T;           // Particle and halo ids
+#else
+   typedef      int32_t ID_T;           // Particle and halo ids
+#endif
+#endif
+
+#ifdef POSVEL_64
+   typedef      double  POSVEL_T;       // Position,velocity
+   typedef      double  POTENTIAL_T;    // Potential
+#else
+   typedef      float   POSVEL_T;       // Position,velocity
+   typedef      float   POTENTIAL_T;    // Potential
+#endif
+
+#ifdef GRID_64
+   typedef      double  GRID_T;         // Grid types
+#else
+   typedef      float   GRID_T;         // Grid types
+#endif
+
+#ifdef USE_VTK_COSMO
+typedef vtkTypeInt32    STATUS_T; // Dead (which neighbor) or alive particles
+typedef vtkTypeUInt16   MASK_T;   // Other particle information
+#else                             // Same widths as above, from <stdint.h>
+typedef int32_t         STATUS_T; // Dead (which neighbor) or alive particles
+typedef uint16_t        MASK_T;   // Other particle information
+#endif
+
+///////////////////////////////////////////////////////////////////////////
+
+const float MAX_FLOAT   = 1.0e15;   // Large positive bound (1.0e15, not FLT_MAX)
+const float MIN_FLOAT   = -1.0e15;  // Large negative bound (-1.0e15)
+
+const int   RECORD      = 0;    // Input data is by particle record
+const int   BLOCK       = 1;    // Input data is blocked by variable
+
+const int   DIMENSION   = 3;    // Number of spatial dimensions (x, y, z)
+const int   BUF_SZ      = 512;  // Character buffer size
+
+// Constants for Spherical Over Dense (SOD) calculation
+const double CHAIN_SIZE         = 2.0;           // Size for bucket mesh
+const double RHO_C              = 2.77536627e11; // Critical density
+                                                 // in (M_sun/h) / (Mpc/h)^3
+const double RHO_RATIO          = 200.0;         // density/critical density
+const double SOD_MASS           = 1.0e14;        // for initial SOD radius
+                                                 // in (M_sun/h)
+const double MIN_RADIUS_FACTOR  = 0.5;           // Factor of initial SOD radius
+const double MAX_RADIUS_FACTOR  = 2.0;           // Factor of initial SOD radius
+const int    MIN_SOD_SIZE       = 1000;          // Min FOF halo for SOD
+const float  MIN_SOD_MASS       = 5.0e12;        // Min FOF mass for SOD
+const int    NUM_SOD_BINS       = 20;            // Log bins for SOD halo
+
+// Constants for subhalo finding
+const int    NUM_CHILDREN	= 8;             // Barnes Hut octree
+const double GRAVITY_C          = 43.015e-10;
+                                // Gravitational constant for potential energy
+const double ALPHA_SUBHALO	= 1.0;
+                                // Controls cut/grow of subhalo
+                                // 1.0 / alphaFactor is the number of times
+                                // larger a candidate must be in order for the
+                                // smaller to be cut rather than allowed to grow
+                                // Set to 1.0 means always cut as in SUBFIND
+                                // Set to 0.2 main halo always wins cut/grow
+                                // Set to 0.01 small structures grow
+const double BETA_SUBHALO	= 0.0;
+                                // Controls the Poisson noise significance test
+                                // If average density of particles in a group
+                                // is greater than (1 + beta) * density of
+                                // saddlepoint particle, group is considered
+                                // significant and stands on its own
+                                // Set to 0.0 means always significant
+                                // Set to 0.25 helps small scale connectivity
+
+const int    NUM_SPH_DENSITY	= 64;
+                                // Number of neighbor particles used in
+                                // calculating SPH smoothing length and density
+const int    NUM_SUBHALO_NEIGHBOR = 20;
+                                // Number of closest neighbors looked at
+                                // in placing particle in a subgroup
+const int    MIN_SUBHALO_SIZE	= 20;
+                                // Smallest allowed subhalo
+const int    MIN_FOF_SUBHALO	= 2000;
+                                // Smallest FOF halo which will have
+                                // subhalo finding run on it
+
+// Constants for speeding up unbind calculation on very large subhalos
+const int    MAX_UNBIND_1	= 100;
+                                // When unbinding reaches less than this
+                                // number of particles in subgroup remove
+                                // only one particle before running unbind again
+const int    MAX_UNBIND_2	= 2000;
+                                // When unbinding reaches less than this
+                                // number of particles in subgroup remove
+                                // (1 / FACTOR_UNBIND_1) of the total positive
+                                // energy particles before running unbind again
+const int    MAX_UNBIND_3	= 40000;
+                                // When unbinding reaches less than this
+                                // number of particles in subgroup remove
+                                // (1 / FACTOR_UNBIND_2) of the total positive
+                                // energy particles before running unbind again
+                                // Also maximum subhalo candidate for unbinding
+                                // Used for development because that stage
+                                // takes so long and normally this is only
+                                // the main subhalo and all particles unbound
+                                // would go to fuzz
+const int    FACTOR_UNBIND_1	= 4;
+                                // Between MAX_UNBIND_1 and MAX_UNBIND_2
+                                // remove 25% of the positive total energy
+                                // particles
+const int    FACTOR_UNBIND_2	= 2;
+                                // Between MAX_UNBIND_2 and MAX_UNBIND_3
+                                // remove 50% of the positive total energy
+                                // particles
+const int    MAX_UNBIND_DELETE	= 20;
+                                // To speed up unbinding when large candidate
+                                // reaches this number of particles with
+                                // positive total energy just quit
+                                                 
+
+// Cosmology record data in .cosmo format
+const int   COSMO_FLOAT = 7;    // x,y,z location and velocity plus mass
+const int   COSMO_INT   = 1;    // Particle id
+const int   RECORD_SIZE = sizeof(POSVEL_T) * COSMO_FLOAT + 
+                          sizeof(ID_T) * COSMO_INT;  // Bytes in one particle record
+
+const bool  ENFORCE_MAX_READ = false; // Presumably toggles the MAX_READ cap -- confirm in the reader
+const int   MAX_READ    = 8000000;
+                                // Maximum number of particles to read at a time
+                                // Multiplied by COSMO_FLOAT floats
+                                // makes the largest MPI allowed buffer
+
+const float DEAD_FACTOR = 1.20f; // Number of dead allocated is 20% more than max
+
+const int   ALIVE       = -1;   // Particle belongs to this processor
+const int   MIXED       = ALIVE - 1;
+                                // For a trick to quickly know what
+                                // particles should be output
+
+const int   UNMARKED    = -1;   // Mixed halo needs MASTER to arbitrate
+const int   INVALID     = 0;    // Mixed halo is not recorded on processor
+const int   VALID       = 1;    // Mixed halo is recorded on processor
+
+const int   MASTER      = 0;    // Processor to do merge step
+
+const int   MERGE_COUNT = 20;   // Number of tags to merge on in mixed
+
+// Parameters for center finding
+const int   MBP_THRESHOLD = 5000; // Threshold between n^2 and AStar methods
+const int   MCP_THRESHOLD = 8000; // Threshold between n^2 and Chain methods
+const int   MCP_CHAIN_FACTOR = 5; // Subdivide bb for building chaining mesh
+
+//
+// Neighbors are enumerated so that particles can be attached to the correct
+// neighbor, but these pairs must be preserved for the ParticleExchange.
+// Every processor should be able to send and receive on every iteration of
+// the exchange, so if everyone sends RIGHT and receives LEFT it works
+//
+// Do not change this pairing order.
+// (Each consecutive pair of enumerators below names two opposite directions.)
+enum NEIGHBOR
+{
+  X0,                   // Left face
+  X1,                   // Right face
+
+  Y0,                   // Bottom face
+  Y1,                   // Top face
+
+  Z0,                   // Front face
+  Z1,                   // Back face
+
+  X0_Y0,                // Left   bottom edge
+  X1_Y1,                // Right  top    edge
+
+  X0_Y1,                // Left   top    edge
+  X1_Y0,                // Right  bottom edge
+
+  Y0_Z0,                // Bottom front  edge
+  Y1_Z1,                // Top    back   edge
+
+  Y0_Z1,                // Bottom back   edge
+  Y1_Z0,                // Top    front  edge
+
+  Z0_X0,                // Front  left   edge
+  Z1_X1,                // Back   right  edge
+
+  Z0_X1,                // Front  right  edge
+  Z1_X0,                // Back   left   edge
+
+  X0_Y0_Z0,             // Left  bottom front corner
+  X1_Y1_Z1,             // Right top    back  corner
+
+  X0_Y0_Z1,             // Left  bottom back  corner
+  X1_Y1_Z0,             // Right top    front corner
+
+  X0_Y1_Z0,             // Left  top    front corner
+  X1_Y0_Z1,             // Right bottom back  corner
+
+  X0_Y1_Z1,             // Left  top    back  corner
+  X1_Y0_Z0              // Right bottom front corner
+};
+
+const int NUM_OF_NEIGHBORS      = 26;   // 6 faces + 12 edges + 8 corners in NEIGHBOR
+
+// Header for Gadget input files
+const int GADGET_GAS            = 0;
+const int GADGET_HALO           = 1;
+const int GADGET_DISK           = 2;
+const int GADGET_BULGE          = 3;
+const int GADGET_STARS          = 4;
+const int GADGET_BOUND          = 5;
+const int NUM_GADGET_TYPES      = 6;    // Types of gadget particles
+
+const int GADGET_HEADER_SIZE    = 256;  // Size when the endian matches
+const int GADGET_HEADER_SIZE_SWP= 65536;// Size when the endian doesn't match (256 with its 4 bytes reversed)
+const int GADGET_FILL           = 60;   // Current fill to HEADER SIZE
+const int GADGET_SKIP           = 4;    // Bytes that indicate block size
+const int GADGET_2_SKIP         = 16;   // Extra bytes in gadget-2
+
+const int GADGET_1              = 1;    // Presumably the gadget-1 file format id -- confirm
+const int GADGET_2              = 2;    // Presumably the gadget-2 file format id -- confirm
+
+struct GadgetHeader {                    // Header record of a Gadget input file
+  int      npart[NUM_GADGET_TYPES];      // One entry per particle type (presumably counts in this file -- confirm)
+  double   mass[NUM_GADGET_TYPES];       // One entry per particle type (presumably mass of that type -- confirm)
+  double   time;
+  double   redshift;
+  int      flag_sfr;
+  int      flag_feedback;
+  int      npartTotal[NUM_GADGET_TYPES]; // One entry per particle type (presumably totals over all files -- confirm)
+  int      flag_cooling;
+  int      num_files;
+  double   BoxSize;
+  double   Omega0;
+  double   OmegaLambda;
+  double   HubbleParam;
+  int      flag_stellarage;
+  int      flag_metals;
+  int      HighWord[NUM_GADGET_TYPES];   // One entry per particle type (presumably upper bits of npartTotal -- confirm)
+  int      flag_entropy;
+  char     fill[GADGET_FILL];            // Padding: assuming 4-byte int, 8-byte double and no inserted padding,
+};                                       // 24+48+16+8+24+8+32+8+24+4+60 = 256 bytes = GADGET_HEADER_SIZE
+
+#endif
diff --git a/src/halo-finder/src/CMakeLists.txt b/src/halo-finder/src/CMakeLists.txt
new file mode 100644
index 0000000..4555ba3
--- /dev/null
+++ b/src/halo-finder/src/CMakeLists.txt
@@ -0,0 +1,66 @@
+ 
+project(Cosmo)
+
+#SET(Cosmo_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS})
+
+SET(Cosmo_LIBS)   # Cleared here; set to the MPI libraries below when VTK_USE_MPI is on
+IF(VTK_USE_MPI)
+  INCLUDE_DIRECTORIES("${MPI_INCLUDE_PATH}")
+  IF(MPI_EXTRA_LIBRARY)
+    SET(Cosmo_LIBS ${MPI_LIBRARY} ${MPI_EXTRA_LIBRARY})
+  ELSE(MPI_EXTRA_LIBRARY)
+    SET(Cosmo_LIBS ${MPI_LIBRARY})
+  ENDIF(MPI_EXTRA_LIBRARY)
+  ADD_DEFINITIONS(-DMPICH_IGNORE_CXX_SEEK -DUSE_VTK_COSMO)
+ELSE(VTK_USE_MPI)
+  ADD_DEFINITIONS(-DUSE_SERIAL_COSMO -DUSE_VTK_COSMO)   # Serial build: no MPI include path or libraries
+ENDIF(VTK_USE_MPI)
+# Needed for mpich 2 (presumably refers to -DMPICH_IGNORE_CXX_SEEK in the MPI branch above)
+
+CONFIGURE_FILE (${Cosmo_SOURCE_DIR}/CosmoDefinition.h.in
+                ${Cosmo_BINARY_DIR}/CosmoDefinition.h)   # Generated header lands in the build tree
+ 
+SET(Cosmo_SOURCES
+  Partition.cxx
+  ParticleDistribute.cxx
+  ParticleExchange.cxx
+  Message.cxx
+  CosmoHaloFinder.cxx
+  CosmoHaloFinderP.cxx
+  FOFHaloProperties.cxx
+  ChainingMesh.cxx
+  HaloCenterFinder.cxx
+  SODHalo.cxx
+)
+
+INCLUDE_DIRECTORIES (${Cosmo_SOURCE_DIR} ${Cosmo_BINARY_DIR})   # Build dir holds the generated CosmoDefinition.h
+
+VTK_ADD_LIBRARY(Cosmo ${Cosmo_SOURCES})   # Macro is not defined in this file (presumably supplied by an enclosing VTK build)
+TARGET_LINK_LIBRARIES(Cosmo vtksys vtkCommon ${Cosmo_LIBS})
+
+IF(NOT VTK_INSTALL_NO_LIBRARIES)   # Install the Cosmo library itself
+  INSTALL(TARGETS Cosmo
+    EXPORT ${VTK_INSTALL_EXPORT_NAME}
+    RUNTIME DESTINATION ${VTK_INSTALL_BIN_DIR_CM24} COMPONENT RuntimeLibraries
+    LIBRARY DESTINATION ${VTK_INSTALL_LIB_DIR_CM24} COMPONENT RuntimeLibraries
+    ARCHIVE DESTINATION ${VTK_INSTALL_LIB_DIR_CM24} COMPONENT Development)
+ENDIF(NOT VTK_INSTALL_NO_LIBRARIES)
+
+IF(NOT VTK_INSTALL_NO_DEVELOPMENT)   # Install the headers, including the generated one
+  INSTALL(FILES
+    ${Cosmo_SOURCE_DIR}/BasicDefinition.h
+    ${Cosmo_SOURCE_DIR}/ChainingMesh.h
+    ${Cosmo_SOURCE_DIR}/CosmoHalo.h
+    ${Cosmo_SOURCE_DIR}/CosmoHaloFinder.h
+    ${Cosmo_SOURCE_DIR}/CosmoHaloFinderP.h
+    ${Cosmo_SOURCE_DIR}/FOFHaloProperties.h
+    ${Cosmo_SOURCE_DIR}/HaloCenterFinder.h
+    ${Cosmo_SOURCE_DIR}/Message.h
+    ${Cosmo_SOURCE_DIR}/ParticleDistribute.h
+    ${Cosmo_SOURCE_DIR}/ParticleExchange.h
+    ${Cosmo_SOURCE_DIR}/Partition.h
+    ${Cosmo_SOURCE_DIR}/winDirent.h
+    ${Cosmo_BINARY_DIR}/CosmoDefinition.h
+    DESTINATION ${VTK_INSTALL_INCLUDE_DIR_CM24}/Cosmo
+    COMPONENT Development)
+ENDIF(NOT VTK_INSTALL_NO_DEVELOPMENT)
diff --git a/src/halo-finder/src/CosmoDefinition.h.in b/src/halo-finder/src/CosmoDefinition.h.in
new file mode 100644
index 0000000..0c457c1
--- /dev/null
+++ b/src/halo-finder/src/CosmoDefinition.h.in
@@ -0,0 +1,70 @@
+/*=========================================================================
+                                                                                
+Copyright (c) 2007, Los Alamos National Security, LLC
+
+All rights reserved.
+
+Copyright 2007. Los Alamos National Security, LLC. 
+This software was produced under U.S. Government contract DE-AC52-06NA25396 
+for Los Alamos National Laboratory (LANL), which is operated by 
+Los Alamos National Security, LLC for the U.S. Department of Energy. 
+The U.S. Government has rights to use, reproduce, and distribute this software. 
+NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,
+EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  
+If software is modified to produce derivative works, such modified software 
+should be clearly marked, so as not to confuse it with the version available 
+from LANL.
+ 
+Additionally, redistribution and use in source and binary forms, with or 
+without modification, are permitted provided that the following conditions 
+are met:
+-   Redistributions of source code must retain the above copyright notice, 
+    this list of conditions and the following disclaimer. 
+-   Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution. 
+-   Neither the name of Los Alamos National Security, LLC, Los Alamos National
+    Laboratory, LANL, the U.S. Government, nor the names of its contributors
+    may be used to endorse or promote products derived from this software 
+    without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR 
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+                                                                                
+=========================================================================*/
+
+#ifndef CosmoDefinition_h
+#define CosmoDefinition_h
+
+#include <vtksys/Configure.h>
+#include "vtkABI.h"
+
+#cmakedefine BUILD_SHARED_LIBS
+
+// COSMO_EXPORT: VTK_ABI_EXPORT or VTK_ABI_IMPORT for shared builds, empty otherwise
+#if defined(BUILD_SHARED_LIBS)
+ #if defined(Cosmo_EXPORTS)             // Defined: mark symbols as exported
+  #define COSMO_EXPORT VTK_ABI_EXPORT
+ #else                                  // Not defined: mark symbols as imported
+  #define COSMO_EXPORT VTK_ABI_IMPORT
+ #endif
+#else                                   // BUILD_SHARED_LIBS not defined: no decoration
+ #define COSMO_EXPORT
+#endif
+
+#include "BasicDefinition.h"
+
+#ifndef USE_SERIAL_COSMO
+#include "vtkMPI.h"
+#endif
+
+#endif
diff --git a/src/halo-finder/src/CosmoHalo.h b/src/halo-finder/src/CosmoHalo.h
new file mode 100644
index 0000000..beb59a1
--- /dev/null
+++ b/src/halo-finder/src/CosmoHalo.h
@@ -0,0 +1,157 @@
+/*=========================================================================
+                                                                                
+Copyright (c) 2007, Los Alamos National Security, LLC
+
+All rights reserved.
+
+Copyright 2007. Los Alamos National Security, LLC. 
+This software was produced under U.S. Government contract DE-AC52-06NA25396 
+for Los Alamos National Laboratory (LANL), which is operated by 
+Los Alamos National Security, LLC for the U.S. Department of Energy. 
+The U.S. Government has rights to use, reproduce, and distribute this software. 
+NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,
+EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  
+If software is modified to produce derivative works, such modified software 
+should be clearly marked, so as not to confuse it with the version available 
+from LANL.
+ 
+Additionally, redistribution and use in source and binary forms, with or 
+without modification, are permitted provided that the following conditions 
+are met:
+-   Redistributions of source code must retain the above copyright notice, 
+    this list of conditions and the following disclaimer. 
+-   Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution. 
+-   Neither the name of Los Alamos National Security, LLC, Los Alamos National
+    Laboratory, LANL, the U.S. Government, nor the names of its contributors
+    may be used to endorse or promote products derived from this software 
+    without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR 
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+                                                                                
+=========================================================================*/
+
+#ifndef CosmoHalo_h
+#define CosmoHalo_h
+
+#ifdef USE_VTK_COSMO
+#include "CosmoDefinition.h"
+#else
+#include "Definition.h"
+#endif
+
+#include <string>
+#include <vector>
+#include <set>
+#include <algorithm>
+
+using namespace std;
+
+////////////////////////////////////////////////////////////////////////////
+//
+// CosmoHalo functions as a container for mixed halos received from the serial
+// halo finder where the particle vector contains the index of the particle
+// on a particular processor and the tag of that particle for the problem.
+//
+// It also functions as a merge container for MASTER processor where it
+// contains the mixed halos crossing more than one boundary.
+//
+////////////////////////////////////////////////////////////////////////////
+
+#ifdef USE_VTK_COSMO
+class COSMO_EXPORT CosmoHalo {
+#else
+class CosmoHalo {
+#endif
+public:
+  // Create an empty VALID halo; rankID stays -1 (unassigned) until setRankID()
+  CosmoHalo(ID_T id, int alive, int dead)
+    : haloID(id), rankID(-1),
+      particles(new vector<ID_T>), tags(new vector<ID_T>),
+      neighbors(new set<int>), partners(new set<int>),
+      numberOfAlive(alive), numberOfDead(dead), valid(VALID) { }
+
+  // Deep copy.  The halo owns its four containers, so the implicit member-wise
+  // copy would leave two halos deleting the same containers.
+  CosmoHalo(const CosmoHalo& h)
+    : haloID(h.haloID), rankID(h.rankID),
+      particles(new vector<ID_T>(*h.particles)), tags(new vector<ID_T>(*h.tags)),
+      neighbors(new set<int>(*h.neighbors)), partners(new set<int>(*h.partners)),
+      numberOfAlive(h.numberOfAlive), numberOfDead(h.numberOfDead), valid(h.valid) { }
+
+  // Deep assignment; the containers always exist, so only contents are copied
+  CosmoHalo& operator=(const CosmoHalo& h) {
+    *this->particles = *h.particles;
+    *this->tags = *h.tags;
+    *this->neighbors = *h.neighbors;
+    *this->partners = *h.partners;
+    this->haloID = h.haloID;
+    this->rankID = h.rankID;
+    this->numberOfAlive = h.numberOfAlive;
+    this->numberOfDead = h.numberOfDead;
+    this->valid = h.valid;
+    return *this;
+  }
+
+  ~CosmoHalo() {
+    delete this->particles;
+    delete this->tags;
+    delete this->neighbors;
+    delete this->partners;
+  }
+
+  // Add a particle index for this halo on this processor
+  // Add to the neighbor zones to know how many processors share this halo
+  void addParticle(ID_T indx, ID_T tag, int neighbor) {
+    this->particles->push_back(indx);
+    this->tags->push_back(tag);
+    if (neighbor != ALIVE) this->neighbors->insert(neighbor);
+  }
+
+  // Add a mixed particle (only its tag is recorded)
+  void addParticle(ID_T tag)            { this->tags->push_back(tag); }
+  // Add a matching mixed halo index indicating same halo
+  void addPartner(int index)            { this->partners->insert(index); }
+  // Sort the members to help identify the same halo on multiple processors
+  void sortParticleTags()      { sort(this->tags->begin(), this->tags->end()); }
+
+  void         setAliveCount(int c)     { this->numberOfAlive = c; }
+  void         setDeadCount(int c)      { this->numberOfDead = c; }
+  void         setRankID(int rank)      { this->rankID = rank; }
+  void         setValid(int v)          { this->valid = v; }
+
+  ID_T         getHaloID() const        { return this->haloID; }
+  int          getRankID() const        { return this->rankID; }
+  int          getAliveCount() const    { return this->numberOfAlive; }
+  int          getDeadCount() const     { return this->numberOfDead; }
+  int          getValid() const         { return this->valid; }
+
+  vector<ID_T>* getParticles()          { return this->particles; }
+  vector<ID_T>* getTags()               { return this->tags; }
+  set<int>*    getNeighbors()           { return this->neighbors; }
+  set<int>*    getPartners()            { return this->partners; }
+
+private:
+  ID_T haloID;                  // Halo id is smallest particle index/tag
+  int rankID;                   // Processor which owns this halo (-1 until set)
+  vector<ID_T>* particles;      // Index of halo particle on this processor
+  vector<ID_T>* tags;           // Tag of halo particle
+  set<int>* neighbors;          // Zones with dead particles from this halo
+  set<int>* partners;           // Index of matching mixed halo
+  int numberOfAlive;            // Number of alive particles in halo
+  int numberOfDead;             // Number of dead particles in halo
+  int valid;                    // Mixed halo to be used or not
+};
+
+#endif
diff --git a/src/halo-finder/src/CosmoToGadget2.cxx b/src/halo-finder/src/CosmoToGadget2.cxx
new file mode 100644
index 0000000..f91d857
--- /dev/null
+++ b/src/halo-finder/src/CosmoToGadget2.cxx
@@ -0,0 +1,264 @@
+/*=========================================================================
+                                                                                
+Copyright (c) 2007, Los Alamos National Security, LLC
+
+All rights reserved.
+
+Copyright 2007. Los Alamos National Security, LLC. 
+This software was produced under U.S. Government contract DE-AC52-06NA25396 
+for Los Alamos National Laboratory (LANL), which is operated by 
+Los Alamos National Security, LLC for the U.S. Department of Energy. 
+The U.S. Government has rights to use, reproduce, and distribute this software. 
+NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,
+EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  
+If software is modified to produce derivative works, such modified software 
+should be clearly marked, so as not to confuse it with the version available 
+from LANL.
+ 
+Additionally, redistribution and use in source and binary forms, with or 
+without modification, are permitted provided that the following conditions 
+are met:
+-   Redistributions of source code must retain the above copyright notice, 
+    this list of conditions and the following disclaimer. 
+-   Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution. 
+-   Neither the name of Los Alamos National Security, LLC, Los Alamos National
+    Laboratory, LANL, the U.S. Government, nor the names of its contributors
+    may be used to endorse or promote products derived from this software 
+    without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR 
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+                                                                                
+=========================================================================*/
+
+#include <stdlib.h>
+
+#include <fstream>
+#include <iostream>
+#include "BasicDefinition.h"
+
+using namespace std;
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// First command line parameter is the Cosmo file name
+// Second command line parameter is the Gadget-2 file name
+//
+/////////////////////////////////////////////////////////////////////////////
+//
+// Gadget-2 format (BLOCK):
+//    SKIP_H 4 bytes (size of header)
+//    Header (256 bytes)
+//    SKIP_H 4 bytes (size of header)
+//
+//    SKIP_L 4 bytes (size of location block in bytes)
+//    Block of location data where each particle's (x,y,z) is stored together
+//    SKIP_L 4 bytes (size of location block in bytes)
+//
+//    SKIP_V 4 bytes (size of velocity block in bytes)
+//    Block of velocity data where each particle's (xv,yv,zv) is stored together
+//    SKIP_V 4 bytes (size of velocity block in bytes)
+//
+//    SKIP_T 4 bytes (size of tag block in bytes)
+//    Block of tag data
+//    SKIP_T 4 bytes (size of tag block in bytes)
+//
+//    Header file npart[6] array indicates the number of particles of each
+//    type stored in the file.  The types are:
+//
+//       0 Gas
+//       1 Halo
+//       2 Disk
+//       3 Bulge
+//       4 Stars
+//       5 Boundary
+//
+//    So npart[1] indicates the number of halo particles
+//
+/////////////////////////////////////////////////////////////////////////////
+//
+// Cosmo format (RECORD):
+//    X location
+//    X velocity
+//    Y location
+//    Y velocity
+//    Z location
+//    Z velocity
+//    Mass
+//    Tag
+//
+/////////////////////////////////////////////////////////////////////////////
+
+int main(int argc, char** argv)
+{
+  if (argc != 3) {
+    cout << "Usage: CosmoToGadget2 cosmo-file gadget-file" << endl;
+    exit(-1);
+  }
+  // argv[1] names the Cosmo record file to read, argv[2] the Gadget-2 file to write
+  string inFile = argv[1];
+  string outFile = argv[2];
+  int blockSize;                        // byte count written around each block
+
+  struct GadgetHeader {
+    int      npart[NUM_GADGET_TYPES];   // Number of particles
+    double   mass[NUM_GADGET_TYPES];    // Mass of particle
+    double   time;
+    double   redshift;
+    int      flag_sfr;
+    int      flag_feedback;
+    int      npartTotal[NUM_GADGET_TYPES];
+    int      flag_cooling;
+    int      num_files;
+    double   BoxSize;
+    double   Omega0;
+    double   OmegaLambda;
+    double   HubbleParam;
+    int      flag_stellarage;
+    int      flag_metals;
+    int      npartTotalHighWord[NUM_GADGET_TYPES];
+    int      flag_entropy_instead_u;
+    char     fill[60];                  // Fills to 256 bytes
+  } gadgetHeader = {};  // zero everything, fill[] included: it is written to disk
+
+  // Open input Cosmo file
+  ifstream *inStream = new ifstream(inFile.c_str(), ios::in | ios::binary);
+  if (inStream->fail()) {
+    cout << "File: " << inFile << " cannot be opened" << endl;
+    exit(-1);
+  }
+
+  // Determine the number of particles in cosmo file by checking file size
+  inStream->seekg(0L, ios::end);
+  int numberOfParticles = inStream->tellg() / RECORD_SIZE;
+  cout << "Number of particles: " << numberOfParticles << endl;
+  inStream->seekg(0L, ios::beg);
+
+  // Allocate the gadget2 blocks
+  POSVEL_T* location = new POSVEL_T[DIMENSION * numberOfParticles];
+  POSVEL_T* velocity = new POSVEL_T[DIMENSION * numberOfParticles];
+  ID_T* tag = new ID_T[numberOfParticles];
+
+#ifdef DEBUG
+  // Collect statistics
+  POSVEL_T minLoc[DIMENSION], maxLoc[DIMENSION];
+  POSVEL_T minVel[DIMENSION], maxVel[DIMENSION];
+  for (int dim = 0; dim < DIMENSION; dim++) {
+    minLoc[dim] = MAX_FLOAT;
+    maxLoc[dim] = MIN_FLOAT;
+    minVel[dim] = MAX_FLOAT;
+    maxVel[dim] = MIN_FLOAT;
+  }
+#endif
+
+  // Read each cosmo record and transfer data to gadget blocks
+  int indx = 0;
+  POSVEL_T mass = 0;    // holds the last record's mass; stays 0 for an empty file
+  for (int i = 0; i < numberOfParticles; i++) {
+    inStream->read(reinterpret_cast<char*>(&location[indx]), sizeof(POSVEL_T));
+    inStream->read(reinterpret_cast<char*>(&velocity[indx]), sizeof(POSVEL_T));
+    inStream->read(reinterpret_cast<char*>(&location[indx+1]), sizeof(POSVEL_T));
+    inStream->read(reinterpret_cast<char*>(&velocity[indx+1]), sizeof(POSVEL_T));
+    inStream->read(reinterpret_cast<char*>(&location[indx+2]), sizeof(POSVEL_T));
+    inStream->read(reinterpret_cast<char*>(&velocity[indx+2]), sizeof(POSVEL_T));
+    inStream->read(reinterpret_cast<char*>(&mass), sizeof(POSVEL_T));
+    inStream->read(reinterpret_cast<char*>(&tag[i]), sizeof(ID_T));
+
+#ifdef DEBUG
+    // Collect ranges on this file
+    for (int dim = 0; dim < DIMENSION; dim++) {
+      if (minLoc[dim] > location[indx+dim])
+        minLoc[dim] = location[indx+dim];
+      if (maxLoc[dim] < location[indx+dim])
+        maxLoc[dim] = location[indx+dim];
+      if (minVel[dim] > velocity[indx+dim])
+        minVel[dim] = velocity[indx+dim];
+      if (maxVel[dim] < velocity[indx+dim])
+        maxVel[dim] = velocity[indx+dim];
+    }
+#endif
+    indx += DIMENSION;
+  }
+
+  // Open the output Gadget-2 file
+  ofstream *outStream = new ofstream(outFile.c_str(), ios::out | ios::binary);
+  if (outStream->fail()) {
+    cout << "File: " << outFile << " cannot be opened" << endl;
+    exit(-1);
+  }
+
+  // Fill in the Gadget-2 header (explicit zeros kept to list the fields)
+  for (int i = 0; i < NUM_GADGET_TYPES; i++) {
+    gadgetHeader.npart[i] = 0;
+    gadgetHeader.mass[i] = 0.0;
+    gadgetHeader.npartTotal[i] = 0;
+    gadgetHeader.npartTotalHighWord[i] = 0;
+  }
+  gadgetHeader.time = 0.0;
+  gadgetHeader.redshift = 0.0;
+  gadgetHeader.flag_sfr = 0;
+  gadgetHeader.flag_feedback = 0;
+  gadgetHeader.flag_cooling = 0;
+  gadgetHeader.num_files = 0;
+  gadgetHeader.BoxSize = 0.0;
+  gadgetHeader.Omega0 = 0.0;
+  gadgetHeader.OmegaLambda = 0.0;
+  gadgetHeader.HubbleParam = 0.0;
+  gadgetHeader.flag_stellarage = 0;
+  gadgetHeader.flag_metals = 0;
+  gadgetHeader.flag_entropy_instead_u = 0;
+
+  gadgetHeader.npart[GADGET_HALO] = numberOfParticles;
+  gadgetHeader.mass[GADGET_HALO] = mass;
+
+  // Write the gadget header
+  blockSize = sizeof(GadgetHeader);
+  outStream->write(reinterpret_cast<char*>(&blockSize), GADGET_SKIP);
+  outStream->write(reinterpret_cast<char*>(&gadgetHeader), blockSize);
+  outStream->write(reinterpret_cast<char*>(&blockSize), GADGET_SKIP);
+
+  // Write location block
+  blockSize = DIMENSION * numberOfParticles * sizeof(POSVEL_T);
+  outStream->write(reinterpret_cast<char*>(&blockSize), GADGET_SKIP);
+  outStream->write(reinterpret_cast<char*>(location), blockSize);
+  outStream->write(reinterpret_cast<char*>(&blockSize), GADGET_SKIP);
+
+  // Write velocity block (same byte count as the location block)
+  outStream->write(reinterpret_cast<char*>(&blockSize), GADGET_SKIP);
+  outStream->write(reinterpret_cast<char*>(velocity), blockSize);
+  outStream->write(reinterpret_cast<char*>(&blockSize), GADGET_SKIP);
+
+  // Write tag block
+  blockSize = numberOfParticles * sizeof(ID_T);
+  outStream->write(reinterpret_cast<char*>(&blockSize), GADGET_SKIP);
+  outStream->write(reinterpret_cast<char*>(tag), blockSize);
+  outStream->write(reinterpret_cast<char*>(&blockSize), GADGET_SKIP);
+
+  outStream->close();
+
+#ifdef DEBUG
+  // Ranges of location and velocity in file
+  cout << endl;
+  cout << "Number of particles: " << numberOfParticles << endl;
+  cout << "Location: ";
+  for (int dim = 0; dim < DIMENSION; dim++)
+    cout << " [" << minLoc[dim] << ":" << maxLoc[dim] << "] ";
+  cout << endl;
+
+  cout << "Velocity: ";
+  for (int dim = 0; dim < DIMENSION; dim++)
+    cout << " [" << minVel[dim] << ":" << maxVel[dim] << "] ";
+  cout << endl << endl;
+#endif
+}
diff --git a/src/halo-finder/src/Definition.h b/src/halo-finder/src/Definition.h
new file mode 100644
index 0000000..9ffbeee
--- /dev/null
+++ b/src/halo-finder/src/Definition.h
@@ -0,0 +1,58 @@
+/*=========================================================================
+                                                                                
+Copyright (c) 2007, Los Alamos National Security, LLC
+
+All rights reserved.
+
+Copyright 2007. Los Alamos National Security, LLC. 
+This software was produced under U.S. Government contract DE-AC52-06NA25396 
+for Los Alamos National Laboratory (LANL), which is operated by 
+Los Alamos National Security, LLC for the U.S. Department of Energy. 
+The U.S. Government has rights to use, reproduce, and distribute this software. 
+NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,
+EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  
+If software is modified to produce derivative works, such modified software 
+should be clearly marked, so as not to confuse it with the version available 
+from LANL.
+ 
+Additionally, redistribution and use in source and binary forms, with or 
+without modification, are permitted provided that the following conditions 
+are met:
+-   Redistributions of source code must retain the above copyright notice, 
+    this list of conditions and the following disclaimer. 
+-   Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution. 
+-   Neither the name of Los Alamos National Security, LLC, Los Alamos National
+    Laboratory, LANL, the U.S. Government, nor the names of its contributors
+    may be used to endorse or promote products derived from this software 
+    without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR 
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+                                                                                
+=========================================================================*/
+
+#ifndef Definition_h
+#define Definition_h
+
+#ifndef USE_SERIAL_COSMO
+#ifdef USE_VTK_COSMO
+#include "vtkMPI.h"
+#else
+#include <rru_mpi.h>
+#endif
+#endif
+
+#include "BasicDefinition.h"
+
+#endif
diff --git a/src/halo-finder/src/ForceLaw.cxx b/src/halo-finder/src/ForceLaw.cxx
new file mode 100644
index 0000000..ee24a8d
--- /dev/null
+++ b/src/halo-finder/src/ForceLaw.cxx
@@ -0,0 +1,192 @@
+#include "ForceLaw.h"
+
+#define MEM_ALIGN 64
+
+//comment this out to leave out "inline" keywords
+#define INLINEQ 1
+
+#define POLY_ORDER 6
+
+/*
+int my_posix_memalign(void **memptr, size_t alignment, size_t size) {
+  int ret;
+
+  ret = posix_memalign(memptr, alignment, size);
+  assert(ret==0);
+
+  return ret;
+}
+*/
+
+//-----------------------------------------------------------------------------
+
+  FGrid::FGrid()   // loads the fixed constants that fgor() evaluates
+  : m_b(0.72),
+    m_c(0.01),
+    m_d(0.27),
+    m_e(0.0001),
+    m_f(360.0),
+    m_g(100.0),
+    m_h(0.67),
+    m_l(17.0),
+    m_rmax(3.116326355) {   // upper end of the radius range fgor() accepts
+}
+
+
+#ifdef INLINEQ
+inline
+#endif
+  float FGrid::fgor(float r) {   // fit value at radius r; 0 outside [0, m_rmax]
+  // Early exit: the old mask-multiply gave NaN (inf*0) for huge r and 1/0 at r == -1.
+  if (r < 0.0 || r > m_rmax) return 0.0;   // NaN fails both tests and propagates
+  float f0 = m_c + 2.0/3.0*m_b*m_b*m_b;    // value returned at r == 0
+  float r2 = r*r, r4 = r2*r2, r6 = r4*r2;
+  float coshbr = coshf(m_b*r);
+  float r3fgor = tanhf(m_b*r) - m_b*r/coshbr/coshbr
+    + m_c*r*r2*(1.0 + m_d*r2)*expf(-1.0*m_d*r2)
+    + m_e*r2*(m_f*r2 + m_g*r4 + m_l*r6)*expf(-1.0*m_h*r2);
+  float rdiv = r + 1.0*(r<=0.0);           // becomes 1 at r == 0 to avoid 0/0
+  float rdiv3 = rdiv*rdiv*rdiv;
+  return r3fgor/rdiv3 + (r<=0.0)*f0;
+}
+
+
+  void FGrid::fgor_r2_interp(int nInterp, float **r2, float **f) {
+  // Tabulates fgor on nInterp evenly spaced r^2 samples over [0, m_rmax^2].
+  // *r2 and *f receive malloc'd arrays that the caller releases with free().
+  assert(nInterp >= 2);   // one sample makes dr2 = x/0 and fills the table with NaN
+  *r2 = (float *)malloc(nInterp*sizeof(float));
+  *f = (float *)malloc(nInterp*sizeof(float));
+  assert(*r2 != NULL && *f != NULL);   // stop here instead of writing through NULL
+
+  double dr2 = (m_rmax*m_rmax)/(nInterp-1.0);
+  for(int i=0; i<nInterp; i++) {
+    (*r2)[i] = i*dr2;
+    (*f)[i] = fgor(sqrt(i*dr2));
+  }
+}
+
+//-----------------------------------------------------------------------------
+
+  FGridEvalFit::FGridEvalFit(FGrid *fg)
+    : m_fg(fg) {   // pointer is stored as-is; fg must outlive this evaluator
+}
+
+
+#ifdef INLINEQ
+inline
+#endif
+  float FGridEvalFit::eval(float r2) {
+  return (*m_fg).fgor(sqrt(r2));   // the fit takes r, this interface takes r^2
+}
+
+//-----------------------------------------------------------------------------
+
+
+
+// 0.263729 - 0.0686285 x + 0.00882248 x^2 - 0.000592487 x^3 + 0.0000164662 x^4
+
+// 0.269327 - 0.0750978 x + 0.0114808 x^2 - 0.00109313 x^3 + 0.0000605491 x^4 - 1.47177*10^-6 x^5
+
+// 0.271431 - 0.0783394 x + 0.0133122 x^2 - 0.00159485 x^3 + 0.000132336 x^4 - 6.63394*10^-6 x^5 + 1.47305*10^-7 x^6
+
+  FGridEvalPoly::FGridEvalPoly(FGrid *fg) {
+  // Selects the polynomial (in r^2) that eval() uses.  Slots above the chosen
+  // order are zeroed so eval() can always run the full 7-term Horner chain.
+  m_fg = fg;
+#if POLY_ORDER == 6
+  //6th order
+  m_a[0] =  0.271431;
+  m_a[1] = -0.0783394;
+  m_a[2] =  0.0133122;
+  m_a[3] = -0.00159485;
+  m_a[4] =  0.000132336;
+  m_a[5] = -0.00000663394;
+  m_a[6] =  0.000000147305;
+#elif POLY_ORDER == 5
+  //5th order
+  m_a[0] =  0.269327;
+  m_a[1] = -0.0750978;
+  m_a[2] =  0.0114808;
+  m_a[3] = -0.00109313;
+  m_a[4] =  0.0000605491;
+  m_a[5] = -0.00000147177;
+  m_a[6] =  0.0;
+#elif POLY_ORDER == 4
+  //4th order
+  // NOTE(review): the reference line above this constructor gives the x^4 term as 0.0000164662, not ...4622 - confirm
+  m_a[0] =  0.263729;
+  m_a[1] = -0.0686285;
+  m_a[2] =  0.00882248;
+  m_a[3] = -0.000592487;
+  m_a[4] =  0.0000164622;
+  m_a[5] = 0.0;
+  m_a[6] = 0.0;
+#else
+#error "POLY_ORDER must be 4, 5 or 6: any other value leaves m_a uninitialized"
+#endif
+
+  m_r2min = 0.0;
+  m_r2max = fg->rmax()*fg->rmax();
+}
+
+
+#ifdef INLINEQ
+inline
+#endif
+  float FGridEvalPoly::eval(float r2) {
+  // Horner form of the 7-coefficient polynomial; the comparisons scale it by 0 off range
+  const float poly = m_a[0] + r2*(m_a[1] + r2*(m_a[2] + r2*(m_a[3] + r2*(m_a[4] + r2*(m_a[5] + r2*m_a[6])))));
+  return poly*(r2 >= m_r2min)*(r2 <= m_r2max);
+}
+
+//-----------------------------------------------------------------------------
+
+  FGridEvalInterp::FGridEvalInterp(FGrid *fg, int nInterp)
+    : m_nInterp(nInterp) {
+  fg->fgor_r2_interp(nInterp, &m_r2, &m_f);   // allocates and fills both tables
+  m_r2min = m_r2[0];
+  m_r2max = m_r2[nInterp-1];
+  m_dr2 = (m_r2max - m_r2min)/(nInterp - 1.0);
+  m_oodr2 = 1.0/m_dr2;                        // reciprocal spacing, used by eval()
+}
+
+
+  FGridEvalInterp::~FGridEvalInterp() {
+  free(m_f);    // both tables were malloc'd by FGrid::fgor_r2_interp
+  free(m_r2);
+}
+
+
+#ifdef INLINEQ
+inline
+#endif
+  float FGridEvalInterp::eval(float r2) {
+  // Linear interpolation in the r^2 table; 0 outside the open range (r2min, r2max).
+  // Returning early keeps NaN and huge r2 away from the float->int conversion (UB).
+  if (!(r2 > m_r2min && r2 < m_r2max)) return 0.0;
+  int indx = int((r2 - m_r2min)*m_oodr2);
+  if (indx > m_nInterp-2) indx = m_nInterp-2;   // rounding near r2max must not push indx+1 past the table
+  return m_f[indx]+(r2-m_r2[indx])*m_oodr2*(m_f[indx+1]-m_f[indx]);
+}
+
+//-----------------------------------------------------------------------------
+
+  ForceLawSR::ForceLawSR(FGridEval *fgore, float rsm)
+    : m_rsm(rsm),
+      m_rsm2(rsm*rsm),
+      m_r2min(fgore->r2min()),
+      m_r2max(fgore->r2max()),
+      m_fgore(fgore) {   // stored as-is; fgore must outlive this object
+}
+
+
+#ifdef INLINEQ
+inline
+#endif
+  float ForceLawSR::f_over_r(float r2) {
+  // (r2 + rsm^2)^(-3/2) minus the evaluator's value at r2, then scaled by 0
+  // whenever r2 falls outside [m_r2min, m_r2max].
+  const float shortRange = powf(r2 + m_rsm2, -1.5) - m_fgore->eval(r2);
+  return shortRange*((r2>=m_r2min)*(r2<=m_r2max));
+}
diff --git a/src/halo-finder/src/ForceLaw.h b/src/halo-finder/src/ForceLaw.h
new file mode 100644
index 0000000..e093799
--- /dev/null
+++ b/src/halo-finder/src/ForceLaw.h
@@ -0,0 +1,134 @@
+#ifndef FORCELAW_H
+#define FORCELAW_H
+
+#include <stdlib.h>
+#include <math.h>
+#include <assert.h>
+
+using namespace std;
+
+// Grid-force fit: holds the constants that the constructor in ForceLaw.cxx sets.
+class FGrid {
+ public:
+  FGrid();
+  ~FGrid() {}
+
+  float fgor(float r);                                      // fit at radius r
+  void fgor_r2_interp(int nInterp, float **r2, float **f);  // tabulate in r^2
+  float rmax() { return m_rmax; }
+
+ protected:
+  float m_b, m_c, m_d, m_e, m_f, m_g, m_h, m_l, m_rmax;
+};
+
+
+
+// Abstract evaluator: a float-valued function plus the range it is valid on.
+class FGridEval {
+ public:
+  FGridEval() {}
+  virtual ~FGridEval() {}
+  virtual float eval(float) = 0;   // every definition in ForceLaw.cxx names it r2
+  virtual float r2min() = 0;
+  virtual float r2max() = 0;
+};
+
+
+
+// Evaluates the fit directly: eval(r2) forwards to FGrid::fgor(sqrt(r2)).
+class FGridEvalFit : public FGridEval {
+ public:
+  FGridEvalFit(FGrid *fg);
+  ~FGridEvalFit() {}
+  float eval(float);
+  float r2min() { return 0.0; }
+  float r2max() { return m_fg->rmax()*m_fg->rmax(); }
+
+ protected:
+  FGrid *m_fg;   // not deleted by this class
+};
+
+
+
+// Polynomial stand-in for the fit: eval(r2) is a polynomial in r^2 whose
+// coefficients m_a[0..6] are chosen by POLY_ORDER in ForceLaw.cxx.
+class FGridEvalPoly : public FGridEval {
+ public:
+  FGridEvalPoly(FGrid *fg);
+  ~FGridEvalPoly() {}
+  float eval(float);
+  float r2min() { return 0.0; }
+  float r2max() { return m_fg->rmax()*m_fg->rmax(); }
+
+ protected:
+  FGrid *m_fg;             // not deleted by this class
+  float m_r2min;           // range limits cached by the constructor for eval()
+  float m_r2max;
+  float m_a[7];            // m_a[k] multiplies (r^2)^k
+};
+
+
+
+// Table-driven evaluator: linear interpolation of the fit on an even r^2 grid.
+class FGridEvalInterp : public FGridEval {
+  public:
+    FGridEvalInterp(FGrid *fg, int nInterp);
+    ~FGridEvalInterp();                 // frees m_r2 and m_f
+    float eval(float);
+    float r2min(){return m_r2min;}
+    float r2max(){return m_r2max;}
+    int nInterp() {return m_nInterp;}
+    float* r2() {return m_r2;}          // tables remain owned by this object
+    float* f() {return m_f;}
+
+  protected:
+    float *m_r2;                        // r^2 sample points, m_nInterp entries
+    float *m_f;                         // tabulated value at each sample point
+    float m_r2min, m_r2max;             // first and last sample point
+    float m_dr2, m_oodr2;               // sample spacing and its reciprocal
+    int m_nInterp;
+  private:
+    FGridEvalInterp(const FGridEvalInterp&);             // not implemented: a copy
+    FGridEvalInterp& operator=(const FGridEvalInterp&);  // would free the tables twice
+};
+
+
+
+// Interface: f_over_r(r2) maps r2 to a float (force over r, going by the name).
+class ForceLaw {
+ public:
+  ForceLaw() {}
+  virtual ~ForceLaw() {}
+  virtual float f_over_r(float r2) = 0;
+};
+
+
+
+// Inverse-square law without softening: f_over_r(r2) = 1/(r2*sqrt(r2)).
+class ForceLawNewton : public ForceLaw {
+ public:
+  ~ForceLawNewton() {}
+  float f_over_r(float r2) {
+    return 1.0/r2/sqrt(r2);   // no guard here for r2 == 0
+  }
+};
+
+
+
+// Short-range law: (r2 + rsm^2)^(-3/2) minus the FGridEval value, scaled by 0
+// outside the evaluator's [r2min, r2max] window.
+class ForceLawSR : public ForceLaw {
+ public:
+  ForceLawSR(FGridEval *fgore, float rsm);
+  ~ForceLawSR() {}
+  float f_over_r(float r2);
+
+ protected:
+  float m_rsm;          // rsm as passed to the constructor
+  float m_rsm2;         // rsm squared
+  float m_r2min;        // window copied from fgore at construction
+  float m_r2max;
+  FGridEval *m_fgore;   // not deleted by this class
+};
+
+#endif
diff --git a/src/halo-finder/src/ForceTree.cxx b/src/halo-finder/src/ForceTree.cxx
new file mode 100644
index 0000000..1c57481
--- /dev/null
+++ b/src/halo-finder/src/ForceTree.cxx
@@ -0,0 +1,194 @@
+/*=========================================================================
+                                                                                
+Copyright (c) 2007, Los Alamos National Security, LLC
+
+All rights reserved.
+
+Copyright 2007. Los Alamos National Security, LLC. 
+This software was produced under U.S. Government contract DE-AC52-06NA25396 
+for Los Alamos National Laboratory (LANL), which is operated by 
+Los Alamos National Security, LLC for the U.S. Department of Energy. 
+The U.S. Government has rights to use, reproduce, and distribute this software. 
+NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,
+EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  
+If software is modified to produce derivative works, such modified software 
+should be clearly marked, so as not to confuse it with the version available 
+from LANL.
+ 
+Additionally, redistribution and use in source and binary forms, with or 
+without modification, are permitted provided that the following conditions 
+are met:
+-   Redistributions of source code must retain the above copyright notice, 
+    this list of conditions and the following disclaimer. 
+-   Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution. 
+-   Neither the name of Los Alamos National Security, LLC, Los Alamos National
+    Laboratory, LANL, the U.S. Government, nor the names of its contributors
+    may be used to endorse or promote products derived from this software 
+    without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR 
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+                                                                                
+=========================================================================*/
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <iomanip>
+#include <set>
+#include <math.h>
+
+#include "Partition.h"
+#include "ForceTree.h"
+
+#ifndef USE_VTK_COSMO
+#include "Timings.h"
+#endif
+
+using namespace std;
+
+/////////////////////////////////////////////////////////////////////////
+//
+// ForceTree calculates particle force using a BHTree
+//
+/////////////////////////////////////////////////////////////////////////
+
+ForceTree::ForceTree()
+  : bhTree(0)   // no tree exists until buildForceTree() is called
+{
+  // Record the process count and this process's rank from Partition.
+  // The particle pointers stay unset until setParticles() is called.
+  numProc = Partition::getNumProc();
+  myProc = Partition::getMyProc();
+}
+
+ForceTree::~ForceTree()
+{
+  delete bhTree;   // deleting a null pointer is a no-op, so no test is needed
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Set the parameters for algorithms
+//
+/////////////////////////////////////////////////////////////////////////
+
+void ForceTree::setParameters(
+                        POSVEL_T* minPos,
+                        POSVEL_T* maxPos,
+                        POSVEL_T openAngle,
+                        POSVEL_T critRadius,
+                        int minGroup,
+                        int maxGroup,
+                        POSVEL_T pmass)
+{
+  openingAngle = openAngle;
+  criticalRadius = critRadius;
+  minimumGroup = minGroup;
+  maximumGroup = maxGroup;
+  particleMass = pmass;
+  for (int d = 0; d < DIMENSION; ++d) {   // box corners are copied, not aliased
+    minLoc[d] = minPos[d];
+    maxLoc[d] = maxPos[d];
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Set the particle vectors that have already been read and which
+// contain only the alive particles for this processor
+//
+/////////////////////////////////////////////////////////////////////////
+
+void ForceTree::setParticles(
+                        vector<POSVEL_T>* xLoc,
+                        vector<POSVEL_T>* yLoc,
+                        vector<POSVEL_T>* zLoc,
+                        vector<POSVEL_T>* xVel,
+                        vector<POSVEL_T>* yVel,
+                        vector<POSVEL_T>* zVel,
+                        vector<POSVEL_T>* pmass,
+                        vector<POTENTIAL_T>* potential)
+{
+  // Keep only the address of each vector's first element; nothing is copied
+  particleCount = (long) xLoc->size();
+  xx   = &(*xLoc)[0];
+  yy   = &(*yLoc)[0];
+  zz   = &(*zLoc)[0];
+  vx   = &(*xVel)[0];
+  vy   = &(*yVel)[0];
+  vz   = &(*zVel)[0];
+  mass = &(*pmass)[0];
+  pot  = &(*potential)[0];
+}
+
+/////////////////////////////////////////////////////////////////////////
+// Build the Barnes Hut tree from the particles given to setParticles
+/////////////////////////////////////////////////////////////////////////
+
+void ForceTree::buildForceTree()
+{
+  delete this->bhTree;      // release the tree of an earlier call (no-op for 0)
+  this->bhTree = 0;         // stay deletable if the constructor below throws
+  this->bhTree = new BHForceTree(             // built from the halo particles
+                            minLoc, maxLoc, this->particleCount,
+                            this->xx, this->yy, this->zz,
+                            this->vx, this->vy, this->vz,
+                            this->mass, this->particleMass);
+}
+
+/////////////////////////////////////////////////////////////////////////
+// Force calculations: each forwards to the matching BHForceTree routine
+/////////////////////////////////////////////////////////////////////////
+
+void ForceTree::forceCalculationBarnesAdjust()
+{
+  // bhTree is dereferenced unchecked, so buildForceTree() has to come first
+  bhTree->treeForceBarnesAdjust(openingAngle, criticalRadius);
+}
+
+void ForceTree::forceCalculationBarnesQuick()
+{
+  // bhTree is dereferenced unchecked, so buildForceTree() has to come first
+  bhTree->treeForceBarnesQuick(openingAngle, criticalRadius);
+}
+
+void ForceTree::forceCalculationGadgetTopDown()
+{
+  // One call per particle index, in ascending order
+  for (int p = 0; p < particleCount; ++p) {
+    bhTree->treeForceGadgetTopDown(p, openingAngle, criticalRadius);
+  }
+}
+
+void ForceTree::forceCalculationGadgetBottomUp()
+{
+  // One call per particle index, in ascending order
+  for (int p = 0; p < particleCount; ++p) {
+    bhTree->treeForceGadgetBottomUp(p, openingAngle, criticalRadius);
+  }
+}
+
+void ForceTree::forceCalculationGroup()
+{
+  // Forwards the opening angle, critical radius and both group limits.
+  // bhTree is dereferenced unchecked, so buildForceTree() has to come first.
+  bhTree->treeForceGroup(openingAngle, criticalRadius,
+                         minimumGroup, maximumGroup);
+}
+
+void ForceTree::forceCalculationN2()
+{
+  bhTree->treeForceN2(criticalRadius);   // the opening angle is not passed here
+}
diff --git a/src/halo-finder/src/ForceTree.h b/src/halo-finder/src/ForceTree.h
new file mode 100644
index 0000000..402da35
--- /dev/null
+++ b/src/halo-finder/src/ForceTree.h
@@ -0,0 +1,135 @@
+/*=========================================================================
+                                                                                
+Copyright (c) 2007, Los Alamos National Security, LLC
+
+All rights reserved.
+
+Copyright 2007. Los Alamos National Security, LLC. 
+This software was produced under U.S. Government contract DE-AC52-06NA25396 
+for Los Alamos National Laboratory (LANL), which is operated by 
+Los Alamos National Security, LLC for the U.S. Department of Energy. 
+The U.S. Government has rights to use, reproduce, and distribute this software. 
+NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,
+EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  
+If software is modified to produce derivative works, such modified software 
+should be clearly marked, so as not to confuse it with the version available 
+from LANL.
+ 
+Additionally, redistribution and use in source and binary forms, with or 
+without modification, are permitted provided that the following conditions 
+are met:
+-   Redistributions of source code must retain the above copyright notice, 
+    this list of conditions and the following disclaimer. 
+-   Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution. 
+-   Neither the name of Los Alamos National Security, LLC, Los Alamos National
+    Laboratory, LANL, the U.S. Government, nor the names of its contributors
+    may be used to endorse or promote products derived from this software 
+    without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR 
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+                                                                                
+=========================================================================*/
+
+// .NAME ForceTree - calculate particle force in tree
+//
+
+#ifndef ForceTree_h
+#define ForceTree_h
+
+#include "Definition.h"
+#include "BHForceTree.h"
+#include <string>
+#include <vector>
+#include <set>
+#include <algorithm>
+
+using namespace std;
+
+/////////////////////////////////////////////////////////////////////////
+//
+// ForceTree takes the particle information, builds
+// a Barnes Hut octree from the particles, calculates force
+//
+/////////////////////////////////////////////////////////////////////////
+
+class ForceTree {
+public:
+  ForceTree();
+  ~ForceTree();
+
+  // Set parameters (the box corners are copied, the rest is stored by value)
+  void setParameters(
+        POSVEL_T* minLoc,
+        POSVEL_T* maxLoc,
+        POSVEL_T openAngle,
+        POSVEL_T critRadius,
+        int minGroup,
+        int maxGroup,
+        POSVEL_T pmass);
+
+  // Set alive particle vectors created elsewhere; only pointers to them are kept
+  void setParticles(
+        vector<POSVEL_T>* xx,
+        vector<POSVEL_T>* yy,
+        vector<POSVEL_T>* zz,
+        vector<POSVEL_T>* vx,
+        vector<POSVEL_T>* vy,
+        vector<POSVEL_T>* vz,
+        vector<POSVEL_T>* mass,
+        vector<POTENTIAL_T>* potential);
+
+  // Build the BH force tree structure (needed before any call below)
+  void buildForceTree();
+
+  // Calculate the particle force
+  void forceCalculationBarnesAdjust();
+  void forceCalculationBarnesQuick();
+  void forceCalculationGadgetTopDown();
+  void forceCalculationGadgetBottomUp();
+  void forceCalculationGroup();
+  void forceCalculationN2();
+
+  // Print values
+  void printForceValues()       { this->bhTree->printForceValues(); }
+
+private:
+  int    myProc;                // My processor number
+  int    numProc;               // Total number of processors
+  ID_T   particleCount;         // Total particles on this processor
+  POSVEL_T particleMass;        // Average particle mass
+  POSVEL_T openingAngle;        // Open node in BH tree to higher resolution
+  POSVEL_T criticalRadius;      // Ignore node in BH tree
+  int minimumGroup;             // Lower group limit passed to treeForceGroup
+  int maximumGroup;             // Upper group limit passed to treeForceGroup
+
+  POSVEL_T minLoc[DIMENSION];   // Lower bound on box
+  POSVEL_T maxLoc[DIMENSION];   // Upper bound on box
+
+  POSVEL_T* xx;                 // X location for particles on this processor
+  POSVEL_T* yy;                 // Y location for particles on this processor
+  POSVEL_T* zz;                 // Z location for particles on this processor
+  POSVEL_T* vx;                 // X velocity for particles on this processor
+  POSVEL_T* vy;                 // Y velocity for particles on this processor
+  POSVEL_T* vz;                 // Z velocity for particles on this processor
+  POSVEL_T* mass;               // Mass of particles on this processor
+  POSVEL_T* pot;                // Potential of particles on this processor
+  ID_T* tag;                    // Id tag for particles (never set in ForceTree.cxx)
+
+  BHForceTree* bhTree;          // Barnes Hut tree, deleted by ~ForceTree()
+  ForceTree(const ForceTree&);             // not implemented: a copy would
+  ForceTree& operator=(const ForceTree&);  // delete bhTree a second time
+};
+
+#endif
diff --git a/src/halo-finder/src/ForceTreeTest.cxx b/src/halo-finder/src/ForceTreeTest.cxx
new file mode 100644
index 0000000..176fb1f
--- /dev/null
+++ b/src/halo-finder/src/ForceTreeTest.cxx
@@ -0,0 +1,324 @@
+//Adrian's test 2
+//calculates force on particle due to sphere of particles
+//can use full newton or short range force
+//calculates direct particle-particle force for comparison
+//newton should match theory prediction
+//short range may not due to lack of gauss law
+
+#include <cassert>
+#include <cmath>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <sys/time.h>
+
+#include <cstdio>
+#include <stdlib.h>
+#include <string.h>
+
+#include <ForceLaw.h>
+#include <BHForceTree.h>
+#include <RCBForceTree.h>
+#include <Partition.h>
+
+#include <fenv.h>
+#if defined(__i386__) && defined(__SSE__)
+#include <xmmintrin.h>
+#endif
+
+#include <mpi.h>
+
+// Driver for the force-tree test described in the comment at the top of
+// this file.
+//
+// Command line:
+//   <L> <rSphere> <nSphere> <theta> <nTrials> <N|S> <seed>
+//       [bh|bhall|rcb|rcbm [rcbN [tMin [r]]]]
+//
+//   L        edge length of the cubic box particles are drawn from
+//   rSphere  radius of the sphere of source particles
+//   nSphere  number of particles placed in the sphere (must be >= 0)
+//   theta    opening angle handed to the tree
+//   nTrials  number of independent trials
+//   N|S      'N' selects ForceLawNewton, anything else ForceLawSR
+//   seed     seed for srand48()
+//   tree     optional; "rcb..." selects an RCB tree ("rcbm" the monopole
+//            variant, otherwise quadrupole), "bhall" evaluates the BH tree
+//            force for every particle, anything else evaluates it for the
+//            test particle only
+//   rcbN, tMin  forwarded unchanged to the RCB tree constructors
+//               (NOTE(review): their exact meaning is defined there)
+//   r        compare at a randomly chosen particle instead of the test
+//            particle
+//
+// One line is printed per trial:
+//   r, r^2*f0, r^2*f1, r^2 * r*f_over_r(r^2) * nSphere
+// where r is the distance between the test particle and the sphere center,
+// f0 is the magnitude of the velocity-array entry of the chosen particle
+// after the tree ran (presumably the tree accumulates its force result
+// there -- confirm against the tree classes), and f1 is the magnitude of
+// the direct particle-particle sum computed here.
+//
+// Returns 0 on success; exits with -1 on a usage error.
+int main(int argc, char *argv[])
+{
+  using namespace std;
+
+  // Enable floating-point exception traps, then switch the underflow and
+  // inexact ones back off.
+#if defined(FE_NOMASK_ENV) && !defined(__INTEL_COMPILER)
+  fesetenv(FE_NOMASK_ENV);
+  fedisableexcept(/* FE_OVERFLOW | */ FE_UNDERFLOW | FE_INEXACT);
+#elif defined(__i386__) && defined(__SSE__)
+  _MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~(_MM_MASK_OVERFLOW|_MM_MASK_INVALID|_MM_MASK_DIV_ZERO));
+#endif
+
+#ifndef USE_SERIAL_COSMO
+  MPI_Init(&argc, &argv);
+#endif
+  Partition::initialize();
+
+  if(argc < 8) {
+    fprintf(stderr,"USAGE: %s <L> <rSphere> <nSphere> <theta> <nTrials> <N|S> <seed> <bh|bhall|rcb|rcbm>\n",argv[0]);
+    // MPI is already initialized here, so shut it down before leaving
+#ifndef USE_SERIAL_COSMO
+    MPI_Finalize();
+#endif
+    exit(-1);
+  }
+
+  POSVEL_T L = atof(argv[1]);
+  POSVEL_T rSphere = atof(argv[2]);
+  int nSphere = atoi(argv[3]);
+  float m_openAngle = atof(argv[4]);
+  int trials = atoi(argv[5]);
+  char* forceType = argv[6];
+  long seed = atoi(argv[7]);
+
+  // Np = nSphere + 1 sizes every particle array below, so a negative
+  // count must be rejected before any allocation happens
+  if (nSphere < 0) {
+    fprintf(stderr, "%s: <nSphere> must not be negative (got %d)\n",
+            argv[0], nSphere);
+#ifndef USE_SERIAL_COSMO
+    MPI_Finalize();
+#endif
+    exit(-1);
+  }
+
+  // Optional tree selection and RCB tuning arguments
+  int rpart = 0;
+  int rcbN = 12;
+  int tMin = 128;
+  int useRCB = 0, bhAll = 0;
+  if (argc > 8 && strncmp("rcb", argv[8], 3) == 0) {
+    // argv[8][3] is readable: the first three characters matched "rcb",
+    // so index 3 is at worst the terminating NUL
+    useRCB = argv[8][3] == 'm' ? 2 : 1;
+    if (argc > 9) {
+      rcbN = atoi(argv[9]);
+      if (argc > 10) {
+        tMin = atoi(argv[10]);
+        if (argc > 11 && strcmp("r", argv[11]) == 0) {
+          rpart = 1;
+        }
+      }
+    }
+  } else if (argc > 8 && strcmp("bhall", argv[8]) == 0) {
+    bhAll = 1;
+  }
+
+  float m_rsm = 0.1;
+
+  // Particle 0 is the test particle, particles 1..nSphere form the sphere
+  int Np = nSphere+1;
+  POSVEL_T* m_xArr = new POSVEL_T[Np];
+  POSVEL_T* m_yArr = new POSVEL_T[Np];
+  POSVEL_T* m_zArr = new POSVEL_T[Np];
+  POSVEL_T* m_vxArr = new POSVEL_T[Np];
+  POSVEL_T* m_vyArr = new POSVEL_T[Np];
+  POSVEL_T* m_vzArr = new POSVEL_T[Np];
+  POSVEL_T* m_massArr = new POSVEL_T[Np];
+
+  FGrid *m_fg = new FGrid();
+  FGridEval *m_fgore = new FGridEvalFit(m_fg);
+
+  ForceLaw *m_fl;
+  if(forceType[0] == 'N')
+    m_fl = new ForceLawNewton();
+  else
+    m_fl = new ForceLawSR(m_fgore, m_rsm);
+
+  // Bounding box handed to the trees. It starts as [0, L]^3 and is only
+  // ever widened, so it keeps growing across trials.
+  float ngltree[DIMENSION];
+  ngltree[2] = ngltree[1] = ngltree[0] = L;
+  float zero[DIMENSION] = {0.0, 0.0, 0.0};
+
+  POSVEL_T c = 1.0;
+  float m_fsrrmax = m_fg->rmax();
+
+  srand48(seed);
+
+  POSVEL_T pos_p[DIMENSION], pos_s[DIMENSION], vel_p[DIMENSION];
+
+  for (int t = 0; t < trials; ++t) {
+
+    // Random test-particle position and sphere center inside the box
+    for(int i=0; i<DIMENSION; i++) {
+      pos_p[i] = L*drand48();
+      pos_s[i] = L*drand48();
+    }
+
+    float dx = pos_s[0]-pos_p[0];
+    float dy = pos_s[1]-pos_p[1];
+    float dz = pos_s[2]-pos_p[2];
+    float r2 = dx*dx + dy*dy + dz*dz;
+    float r = sqrt(r2);
+
+    for (int i = 0; i < 3; ++i) {
+      zero[i] = min(zero[i], pos_p[i]);
+      ngltree[i] = max(ngltree[i], pos_p[i]);
+    }
+
+    // Reset all particles: at the origin, at rest, unit mass
+    for (int i = 0; i < Np; ++i) {
+      m_xArr[i] = 0.0;
+      m_yArr[i] = 0.0;
+      m_zArr[i] = 0.0;
+      m_vxArr[i] = 0.0;
+      m_vyArr[i] = 0.0;
+      m_vzArr[i] = 0.0;
+      m_massArr[i] = 1.0;
+    }
+    m_xArr[0] = pos_p[0];
+    m_yArr[0] = pos_p[1];
+    m_zArr[0] = pos_p[2];
+
+    // Fill the sphere by rejection sampling: draw offsets from the
+    // enclosing cube and keep only those inside the sphere
+    int p2=1;
+    while(p2 < Np) {
+      float sdx = 2.0*rSphere*(drand48()-0.5);
+      float sdy = 2.0*rSphere*(drand48()-0.5);
+      float sdz = 2.0*rSphere*(drand48()-0.5);
+
+      if( (sdx*sdx + sdy*sdy + sdz*sdz) > rSphere*rSphere)
+        continue;
+
+      m_xArr[p2] = pos_s[0] + sdx;
+      m_yArr[p2] = pos_s[1] + sdy;
+      m_zArr[p2] = pos_s[2] + sdz;
+
+      float pt[] = { m_xArr[p2], m_yArr[p2], m_zArr[p2] };
+      for (int i = 0; i < 3; ++i) {
+        zero[i] = min(zero[i], pt[i]);
+        ngltree[i] = max(ngltree[i], pt[i]);
+      }
+
+      p2++;
+    }
+
+    //build tree
+    if (useRCB) {
+      POSVEL_T* m_phiArr = new POSVEL_T[Np];
+      memset(m_phiArr, 0, sizeof(POSVEL_T)*Np);
+
+      ID_T* m_idArr = new ID_T[Np];
+      memset(m_idArr, 0, sizeof(ID_T)*Np);
+
+      MASK_T* m_maskArr = new MASK_T[Np];
+      memset(m_maskArr, 0, sizeof(MASK_T)*Np);
+
+      // The RCB trees are only constructed and destroyed here; nothing
+      // else is called on them (presumably the constructor does the
+      // force evaluation -- confirm against RCBForceTree)
+      if (useRCB == 2) {
+        RCBMonopoleForceTree *sft = new RCBMonopoleForceTree(zero,
+                                          ngltree,
+                                          zero, ngltree,
+                                          Np,
+                                          m_xArr,
+                                          m_yArr,
+                                          m_zArr,
+                                          m_vxArr,
+                                          m_vyArr,
+                                          m_vzArr,
+                                          m_massArr,
+                                          m_phiArr,
+                                          m_idArr,
+                                          m_maskArr,
+                                          1.0,
+                                          m_fsrrmax,
+                                          m_rsm,
+                                          m_openAngle,
+                                          rcbN,
+                                          2,
+                                          tMin,
+                                          m_fl,
+                                          c);
+        delete sft;
+      } else {
+        RCBQuadrupoleForceTree *sft = new RCBQuadrupoleForceTree(zero,
+                                          ngltree,
+                                          zero, ngltree,
+                                          Np,
+                                          m_xArr,
+                                          m_yArr,
+                                          m_zArr,
+                                          m_vxArr,
+                                          m_vyArr,
+                                          m_vzArr,
+                                          m_massArr,
+                                          m_phiArr,
+                                          m_idArr,
+                                          m_maskArr,
+                                          1.0,
+                                          m_fsrrmax,
+                                          m_rsm,
+                                          m_openAngle,
+                                          rcbN,
+                                          2,
+                                          tMin,
+                                          m_fl,
+                                          c);
+        delete sft;
+      }
+
+      delete [] m_phiArr;
+      delete [] m_idArr;
+      delete [] m_maskArr;
+    } else {
+      BHForceTree *bhft = new BHForceTree(zero,
+                                          ngltree,
+                                          Np,
+                                          m_xArr,
+                                          m_yArr,
+                                          m_zArr,
+                                          m_vxArr,
+                                          m_vyArr,
+                                          m_vzArr,
+                                          m_massArr,
+                                          1.0,
+                                          m_fl,
+                                          c);
+
+      // Always evaluate the test particle; with "bhall" also every other
+      // particle
+      bhft->treeForceGadgetTopDown(0, m_openAngle, m_fsrrmax);
+      if (bhAll) {
+        for (int i = 1; i < Np; ++i) {
+          bhft->treeForceGadgetTopDown(i, m_openAngle, m_fsrrmax);
+        }
+      }
+
+      //bhft->printForceValues();
+      //bhft->printBHForceTree();
+
+      delete bhft;
+    }
+
+    // The tree may have reordered the particles, so locate the test
+    // particle again by its x coordinate. The scan is bounded by Np so a
+    // missing match cannot run off the end of the arrays.
+    ID_T pidx = 0;
+    if (!rpart) {
+      while (pidx < Np && m_xArr[pidx] != pos_p[0])
+        ++pidx;
+      if (pidx >= Np) {
+        fprintf(stderr,
+                "ForceTreeTest: test particle not found after tree build; skipping trial %d\n",
+                t);
+        continue;
+      }
+    } else {
+      // drand48() is in [0, 1), so the index stays below Np
+      pidx = (ID_T) (Np*drand48());
+    }
+    // printf("\ttest particle: (%f, %f, %f)\n", m_xArr[pidx], m_yArr[pidx], m_zArr[pidx]);
+
+    // Magnitude of the tree result stored in the velocity arrays
+    float f0 = sqrt(m_vxArr[pidx]*m_vxArr[pidx] +
+                    m_vyArr[pidx]*m_vyArr[pidx] +
+                    m_vzArr[pidx]*m_vzArr[pidx]);
+
+    // Direct particle-particle sum over all other particles for comparison
+    vel_p[0] = vel_p[1] = vel_p[2] = 0.0;
+    for(int i=0; i<Np; i++) {
+      if (i == pidx) continue;
+      float n2dx = m_xArr[i] - m_xArr[pidx];
+      float n2dy = m_yArr[i] - m_yArr[pidx];
+      float n2dz = m_zArr[i] - m_zArr[pidx];
+      float n2r2 = n2dx*n2dx + n2dy*n2dy + n2dz*n2dz;
+      float n2for = m_fl->f_over_r(n2r2);
+      vel_p[0] += n2dx*n2for;
+      vel_p[1] += n2dy*n2for;
+      vel_p[2] += n2dz*n2for;
+    }
+
+    float f1 = sqrt(vel_p[0]*vel_p[0] +
+                    vel_p[1]*vel_p[1] +
+                    vel_p[2]*vel_p[2]);
+
+    // Last column: force law evaluated at the center distance, scaled by
+    // the number of sphere particles
+    printf("%f\t%e\t%e\t%e\n",
+           r,
+           r2*f0,
+           r2*f1,
+           r2*r*m_fl->f_over_r(r2)*nSphere);
+  }
+
+  delete m_fl;
+  delete m_fgore;
+  delete m_fg;
+
+  delete [] m_xArr;
+  delete [] m_yArr;
+  delete [] m_zArr;
+  delete [] m_vxArr;
+  delete [] m_vyArr;
+  delete [] m_vzArr;
+  delete [] m_massArr;
+
+#ifndef USE_SERIAL_COSMO
+  MPI_Finalize();
+#endif
+
+  return 0;
+}
diff --git a/src/halo-finder/src/InitialExchange.cxx b/src/halo-finder/src/InitialExchange.cxx
new file mode 100644
index 0000000..873b588
--- /dev/null
+++ b/src/halo-finder/src/InitialExchange.cxx
@@ -0,0 +1,750 @@
+/*=========================================================================
+                                                                                
+Copyright (c) 2007, Los Alamos National Security, LLC
+
+All rights reserved.
+
+Copyright 2007. Los Alamos National Security, LLC. 
+This software was produced under U.S. Government contract DE-AC52-06NA25396 
+for Los Alamos National Laboratory (LANL), which is operated by 
+Los Alamos National Security, LLC for the U.S. Department of Energy. 
+The U.S. Government has rights to use, reproduce, and distribute this software. 
+NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,
+EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  
+If software is modified to produce derivative works, such modified software 
+should be clearly marked, so as not to confuse it with the version available 
+from LANL.
+ 
+Additionally, redistribution and use in source and binary forms, with or 
+without modification, are permitted provided that the following conditions 
+are met:
+-   Redistributions of source code must retain the above copyright notice, 
+    this list of conditions and the following disclaimer. 
+-   Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution. 
+-   Neither the name of Los Alamos National Security, LLC, Los Alamos National
+    Laboratory, LANL, the U.S. Government, nor the names of its contributors
+    may be used to endorse or promote products derived from this software 
+    without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR 
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+                                                                                
+=========================================================================*/
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <iomanip>
+
+#include <sys/types.h>
+#include <dirent.h>
+
+#include "Partition.h"
+#include "InitialExchange.h"
+
+using namespace std;
+
+/////////////////////////////////////////////////////////////////////////
+//
+// InitialExchange takes input from the particle initializer which originally
+// contained alive particles on this processor, but after the particles move
+// a little they might have moved to a neighbor processor.  Input is in the
+// form of arrays, but vectors are supplied which will be filled with the
+// alive particles for this processor.  As each particle is examined it is
+// placed immediately in the vector if it is alive on this processor and the
+// index is placed by neighbor if it is dead on this processor.  After
+// categorizing all particles, the dead are exchanged with neighbors where
+// they become alive.
+//
+// After this initial exchange, ParticleExchange is called to place all
+// dead particles (which will be the ones sent in this step being returned
+// plus the others which were originally placed correctly on the neighbor).
+//
+/////////////////////////////////////////////////////////////////////////
+
+// Construct the exchanger by caching this rank's place in the processor
+// decomposition, as reported by Partition. No alive particles are held yet.
+InitialExchange::InitialExchange()
+{
+  // Processor count and this processor's rank
+  numProc = Partition::getNumProc();
+  myProc = Partition::getMyProc();
+
+  // Processors per dimension, and where this one sits among them
+  Partition::getDecompSize(layoutSize);
+  Partition::getMyPosition(layoutPos);
+
+  // Neighbor processors, wraparound included
+  Partition::getNeighbors(neighbor);
+
+  numberOfAliveParticles = 0;
+}
+
+// Nothing to release: the destructor body is intentionally empty.
+InitialExchange::~InitialExchange() {}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Set parameters for particle distribution
+//
+/////////////////////////////////////////////////////////////////////////
+
+// Record the physical box size (rL) and the width of the dead-particle
+// region (deadSz). In DEBUG builds the MASTER rank echoes both values.
+void InitialExchange::setParameters(POSVEL_T rL, POSVEL_T deadSz)
+{
+  boxSize = rL;
+  deadSize = deadSz;
+
+#ifdef DEBUG
+  // Only the master rank reports
+  if (myProc != MASTER)
+    return;
+
+  cout << endl << "------------------------------------" << endl;
+  cout << "boxSize:  " << boxSize << endl;
+  cout << "deltaBox: " << deadSize << endl;
+#endif
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Initialize the regions outside the alive area for this processor which
+// contain the particles which must be sent away to neighbors
+//
+/////////////////////////////////////////////////////////////////////////
+
+// Compute this processor's alive and dead extents in every dimension from
+// the box size, the decomposition and the dead-region width, then derive
+// the per-neighbor exchange regions.
+void InitialExchange::initialize()
+{
+#ifdef DEBUG
+  if (myProc == MASTER) {
+    cout << "Decomposition: [" << layoutSize[0] << ":"
+         << layoutSize[1] << ":" << layoutSize[2] << "]" << endl;
+  }
+#endif
+
+  for (int d = 0; d < DIMENSION; ++d) {
+    // Width of one processor's slab along this dimension
+    const POSVEL_T step = boxSize / layoutSize[d];
+
+    // Alive region: the slab owned by this processor, clipped to the box
+    minAlive[d] = layoutPos[d] * step;
+    maxAlive[d] = minAlive[d] + step;
+    if (maxAlive[d] > boxSize) {
+      maxAlive[d] = boxSize;
+    }
+
+    // Dead region: the alive region padded by deadSize on both sides
+    minDead[d] = minAlive[d] - deadSize;
+    maxDead[d] = maxAlive[d] + deadSize;
+  }
+
+  // Ranges of dead particles for each neighbor direction
+  calculateExchangeRegions();
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Each of the 26 neighbors will be sent a rectangular region of particles
+// on this processor where they are dead.  The locations coming from the
+// initializer's arrays are in the range [0:rL] and so particles to the
+// left of the leftmost processor will have values at the far end of
+// the box.  When they are sent they won't need to be modified by offsets.
+//
+/////////////////////////////////////////////////////////////////////////
+
+// Fill minRange/maxRange with the box of particle positions destined for
+// each neighbor.
+//
+// Every neighbor starts with the full alive extent in all dimensions.
+// Then, per dimension, each neighbor lying on the low side gets the strip
+// [minDead, minAlive] and each neighbor on the high side gets
+// [maxAlive, maxDead]. When this processor sits on the matching outer face
+// of the decomposition, the strip is taken from the opposite end of the
+// box instead: [boxSize - deadSize, boxSize] for the low side and
+// [0, deadSize] for the high side.
+void InitialExchange::calculateExchangeRegions()
+{
+  // One face, four edges and four corners touch each side of a dimension
+  const int perSide = 9;
+
+  // Neighbors on the low side of this processor, indexed by dimension
+  static const int lowSide[3][9] = {
+    { X0, X0_Y0, X0_Y1, Z0_X0, Z1_X0,
+      X0_Y0_Z0, X0_Y0_Z1, X0_Y1_Z0, X0_Y1_Z1 },
+    { Y0, X0_Y0, X1_Y0, Y0_Z0, Y0_Z1,
+      X0_Y0_Z0, X0_Y0_Z1, X1_Y0_Z0, X1_Y0_Z1 },
+    { Z0, Y0_Z0, Y1_Z0, Z0_X0, Z0_X1,
+      X0_Y0_Z0, X0_Y1_Z0, X1_Y0_Z0, X1_Y1_Z0 }
+  };
+
+  // Neighbors on the high side of this processor, indexed by dimension
+  static const int highSide[3][9] = {
+    { X1, X1_Y0, X1_Y1, Z0_X1, Z1_X1,
+      X1_Y0_Z0, X1_Y0_Z1, X1_Y1_Z0, X1_Y1_Z1 },
+    { Y1, X0_Y1, X1_Y1, Y1_Z0, Y1_Z1,
+      X0_Y1_Z0, X0_Y1_Z1, X1_Y1_Z0, X1_Y1_Z1 },
+    { Z1, Y0_Z1, Y1_Z1, Z1_X0, Z1_X1,
+      X0_Y0_Z1, X0_Y1_Z1, X1_Y0_Z1, X1_Y1_Z1 }
+  };
+
+  // Default: every neighbor spans the whole alive region
+  for (int n = 0; n < NUM_OF_NEIGHBORS; n++) {
+    for (int d = 0; d < DIMENSION; d++) {
+      minRange[n][d] = minAlive[d];
+      maxRange[n][d] = maxAlive[d];
+    }
+  }
+
+  for (int d = 0; d < 3; d++) {
+    // Is this processor on an outer face of the decomposition?
+    const bool onLowFace = (layoutPos[d] == 0);
+    const bool onHighFace = (layoutPos[d] == (layoutSize[d] - 1));
+
+    for (int k = 0; k < perSide; k++) {
+      const int lo = lowSide[d][k];
+      if (onLowFace) {
+        // Wrapped: the strip is at the far end of the box
+        minRange[lo][d] = boxSize - deadSize;
+        maxRange[lo][d] = boxSize;
+      } else {
+        minRange[lo][d] = minDead[d];
+        maxRange[lo][d] = minAlive[d];
+      }
+
+      const int hi = highSide[d][k];
+      if (onHighFace) {
+        // Wrapped: the strip is at the near end of the box
+        minRange[hi][d] = 0;
+        maxRange[hi][d] = deadSize;
+      } else {
+        minRange[hi][d] = maxAlive[d];
+        maxRange[hi][d] = maxDead[d];
+      }
+    }
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Set the particle vectors that have already been read and which
+// contain only the alive particles for this processor
+//
+/////////////////////////////////////////////////////////////////////////
+
+// Remember the input particle arrays and their length. Only the pointers
+// are stored; nothing is copied here.
+void InitialExchange::setParticleArrays(
+                                        long count,
+                                        POSVEL_T* xLoc,
+                                        POSVEL_T* yLoc,
+                                        POSVEL_T* zLoc,
+                                        POSVEL_T* xVel,
+                                        POSVEL_T* yVel,
+                                        POSVEL_T* zVel,
+                                        POTENTIAL_T* potential,
+                                        ID_T* id,
+                                        MASK_T* maskData)
+{
+  particleCount = count;
+
+  // Locations
+  xxInit = xLoc;
+  yyInit = yLoc;
+  zzInit = zLoc;
+
+  // Velocities
+  vxInit = xVel;
+  vyInit = yVel;
+  vzInit = zVel;
+
+  // Remaining per-particle attributes
+  potInit = potential;
+  tagInit = id;
+  maskInit = maskData;
+}
+
+// Remember the caller-supplied output vectors. Only the pointers are
+// stored; the vectors themselves are not touched here.
+void InitialExchange::setParticleVectors(
+                                         vector<POSVEL_T>* xLoc,
+                                         vector<POSVEL_T>* yLoc,
+                                         vector<POSVEL_T>* zLoc,
+                                         vector<POSVEL_T>* xVel,
+                                         vector<POSVEL_T>* yVel,
+                                         vector<POSVEL_T>* zVel,
+                                         vector<POTENTIAL_T>* potential,
+                                         vector<ID_T>* id,
+                                         vector<MASK_T>* maskData,
+                                         vector<STATUS_T>* type)
+{
+  // Locations
+  xx = xLoc;
+  yy = yLoc;
+  zz = zLoc;
+
+  // Velocities
+  vx = xVel;
+  vy = yVel;
+  vz = zVel;
+
+  // Remaining per-particle attributes
+  pot = potential;
+  tag = id;
+  mask = maskData;
+  status = type;
+}
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Identify the border particles which will be alive on other processors
+// and send them and receive the particles which are dead on other processors
+// but alive on this processor.  Store all the newly acquired alive particles
+// in the empty supplied vectors for the rest of the simulation.
+//
+/////////////////////////////////////////////////////////////////////////////
+
+// Run the initial exchange: classify the input particles, trade the ones
+// that belong to neighbors, then report the global number of alive
+// particles on the MASTER rank.
+void InitialExchange::exchangeParticles()
+{
+  // Identify dead particles on this processor which must be sent
+  // because they are alive particles on neighbor processors
+  // x,y,z are still in physical units with wraparound included
+  identifyExchangeParticles();
+
+  // Exchange those particles with appropriate neighbors
+  // x,y,z are not in normalized units
+  exchangeNeighborParticles();
+
+  // Count the particles across processors. The total starts out as the
+  // local count so that serial builds, where the reduction below is
+  // compiled out, still report the real number instead of 0.
+  long totalAliveParticles = this->numberOfAliveParticles;
+#ifndef USE_SERIAL_COSMO
+  // Reduce from a local long so the send buffer matches MPI_LONG
+  long localAliveParticles = this->numberOfAliveParticles;
+  MPI_Allreduce((void*) &localAliveParticles,
+                (void*) &totalAliveParticles,
+                1, MPI_LONG, MPI_SUM, MPI_COMM_WORLD);
+#endif
+
+  //cout << "InitialExchange Particles Rank " << setw(3) << this->myProc
+  //     << " #alive = " << this->numberOfAliveParticles << endl;
+
+  if (this->myProc == MASTER) {
+    cout << "InitialExchange TotalAliveParticles "
+         << totalAliveParticles << endl;
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Iterate over all the particles on this processor and determine which are
+// dead and  must be sent away and add them to the vector for that neighbor
+//
+/////////////////////////////////////////////////////////////////////////////
+
+void InitialExchange::identifyExchangeParticles()
+{
+  for (int p = 0; p < this->particleCount; p++) {
+    const POSVEL_T x = this->xxInit[p];
+    const POSVEL_T y = this->yyInit[p];
+    const POSVEL_T z = this->zzInit[p];
+
+    // Inside the half-open alive box [minAlive, maxAlive) of this processor?
+    const bool aliveHere =
+        (x >= this->minAlive[0] && x < this->maxAlive[0]) &&
+        (y >= this->minAlive[1] && y < this->maxAlive[1]) &&
+        (z >= this->minAlive[2] && z < this->maxAlive[2]);
+
+    if (aliveHere) {
+      // Alive here: copy the particle straight into the output vectors
+      this->xx->push_back(x);
+      this->yy->push_back(y);
+      this->zz->push_back(z);
+      this->vx->push_back(this->vxInit[p]);
+      this->vy->push_back(this->vyInit[p]);
+      this->vz->push_back(this->vzInit[p]);
+      this->tag->push_back(this->tagInit[p]);
+      this->pot->push_back(this->potInit[p]);
+      this->mask->push_back(this->maskInit[p]);
+      this->numberOfAliveParticles++;
+      continue;
+    }
+
+    // Dead here: remember its index for every neighbor whose range holds it.
+    // The scan does not stop at the first match, so a particle lying in
+    // several ranges is queued for each of those neighbors.
+    bool claimed = false;
+    for (int n = 0; n < NUM_OF_NEIGHBORS; n++) {
+      const bool inRange =
+          (x >= minRange[n][0]) && (x < maxRange[n][0]) &&
+          (y >= minRange[n][1]) && (y < maxRange[n][1]) &&
+          (z >= minRange[n][2]) && (z < maxRange[n][2]);
+      if (inRange) {
+        this->neighborParticles[n].push_back(p);
+        claimed = true;
+      }
+    }
+
+    // Neither alive here nor wanted by any neighbor: report it
+    if (!claimed) {
+      cout << "Rank " << myProc << " Problem particle " << x
+           << " , " << y << " , " << z << endl;
+    }
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Exchange the appropriate particles with neighbors
+// Only the index of the particle to be exchanged is stored so fill out
+// the message with location, velocity, tag.  Status information doesn't
+// have to be sent because all particles will be alive.
+// Use the Cartesian communicator for neighbor exchange
+//
+/////////////////////////////////////////////////////////////////////////////
+
+void InitialExchange::exchangeNeighborParticles()
+{
+  // Largest number of particles this processor sends to any single neighbor
+  int myShareSize = 0;
+  for (int n = 0; n < NUM_OF_NEIGHBORS; n++)
+    if (myShareSize < (int) this->neighborParticles[n].size())
+      myShareSize = this->neighborParticles[n].size();
+
+  // Largest such count over all processors, used to size both buffers.
+  // It starts out as the local value so that it is defined in a serial
+  // build, where no reduction takes place.
+  int maxDeadSize = myShareSize;
+#ifndef USE_SERIAL_COSMO
+  MPI_Allreduce((void*) &myShareSize,
+                (void*) &maxDeadSize,
+                1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
+#endif
+
+  // Allocate messages to send and receive MPI buffers.  The per-particle
+  // size mirrors exactly what exchange() packs: six POSVEL_T values, one
+  // POTENTIAL_T, one ID_T and one MASK_T.
+  int bufferSize = (1 * sizeof(int)) +            // number of particles
+                   (maxDeadSize * 
+                     ((6 * sizeof(POSVEL_T)) +    // location, velocity
+                      (1 * sizeof(POTENTIAL_T)) + // potential
+                      (1 * sizeof(ID_T)) +        // id tag
+                      (1 * sizeof(MASK_T))));     // mask
+  Message* sendMessage = new Message(bufferSize);
+  Message* recvMessage = new Message(bufferSize);
+  
+  //debug statement added by Adrian to see how much buffer space we're using
+  if(this->myProc == MASTER) {
+    printf("PXCH buffer = 2*%d = %f MB\n",bufferSize,
+           2.0*bufferSize/1024.0/1024.0);
+  }
+  //MPI_Barrier(MPI_COMM_WORLD);
+
+  // Exchange with each neighbor, with everyone sending in one direction and
+  // receiving from the other.  Data corresponding to the particle index
+  // must be packed in the buffer.  When the data is received it is unpacked
+  // into the location, velocity, potential, tag and mask vectors and the
+  // particle is counted as alive on this processor
+
+  for (int n = 0; n < NUM_OF_NEIGHBORS; n=n+2) {
+    // Neighbor pairs in Definition.h must match so that every processor
+    // sends and every processor receives on each exchange
+    exchange(n, n+1, sendMessage, recvMessage);
+    exchange(n+1, n, sendMessage, recvMessage);
+  }
+
+  delete sendMessage;
+  delete recvMessage;
+}
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Pack particle data for the indicated neighbor into MPI message
+// Send that message and receive from opposite neighbor
+// Unpack the received particle data and append it to the particle vectors;
+// every received particle is counted as alive on this processor
+//
+/////////////////////////////////////////////////////////////////////////////
+
+void InitialExchange::exchange(
+                               int sendTo, 
+                               int recvFrom, 
+                               Message* sendMessage, 
+                               Message* recvMessage)
+{
+#ifndef USE_SERIAL_COSMO
+  // The same two messages serve every neighbor, so rewind both first
+  sendMessage->reset();
+  recvMessage->reset();
+
+  // Indices into the initializer arrays of the particles owed to sendTo
+  vector<ID_T>& outgoing = this->neighborParticles[sendTo];
+  int sendParticleCount = outgoing.size();
+
+  // Message layout: particle count, then one record per particle
+  sendMessage->putValue(&sendParticleCount);
+
+  for (int k = 0; k < sendParticleCount; k++) {
+    int src = outgoing[k];
+    sendMessage->putValue(&this->xxInit[src]);
+    sendMessage->putValue(&this->yyInit[src]);
+    sendMessage->putValue(&this->zzInit[src]);
+    sendMessage->putValue(&this->vxInit[src]);
+    sendMessage->putValue(&this->vyInit[src]);
+    sendMessage->putValue(&this->vzInit[src]);
+    sendMessage->putValue(&this->potInit[src]);
+    sendMessage->putValue(&this->tagInit[src]);
+    sendMessage->putValue(&this->maskInit[src]);
+    this->particleCount--;
+  }
+
+  // Send to one side, receive from the other, then synchronize
+  sendMessage->send(this->neighbor[sendTo]);
+  recvMessage->receive(this->neighbor[recvFrom]);
+  MPI_Barrier(Partition::getComm());
+
+  // Unpack in the same order the sender packed
+  int recvParticleCount;
+  recvMessage->getValue(&recvParticleCount);
+
+  for (int k = 0; k < recvParticleCount; k++) {
+    POSVEL_T posVel[6];         // x, y, z, vx, vy, vz
+    POTENTIAL_T potValue;
+    ID_T idValue;
+    MASK_T maskValue;
+
+    // Six consecutive POSVEL_T values, read with a single call
+    recvMessage->getValue(posVel, 6);
+    recvMessage->getValue(&potValue);
+    recvMessage->getValue(&idValue);
+    recvMessage->getValue(&maskValue);
+
+    this->xx->push_back(posVel[0]);
+    this->yy->push_back(posVel[1]);
+    this->zz->push_back(posVel[2]);
+    this->vx->push_back(posVel[3]);
+    this->vy->push_back(posVel[4]);
+    this->vz->push_back(posVel[5]);
+    this->pot->push_back(potValue);
+    this->tag->push_back(idValue);
+    this->mask->push_back(maskValue);
+
+    this->numberOfAliveParticles++;
+    this->particleCount++;
+  }
+#endif
+}
diff --git a/src/halo-finder/src/InitialExchange.h b/src/halo-finder/src/InitialExchange.h
new file mode 100644
index 0000000..ca1161c
--- /dev/null
+++ b/src/halo-finder/src/InitialExchange.h
@@ -0,0 +1,179 @@
+/*=========================================================================
+                                                                                
+Copyright (c) 2007, Los Alamos National Security, LLC
+
+All rights reserved.
+
+Copyright 2007. Los Alamos National Security, LLC. 
+This software was produced under U.S. Government contract DE-AC52-06NA25396 
+for Los Alamos National Laboratory (LANL), which is operated by 
+Los Alamos National Security, LLC for the U.S. Department of Energy. 
+The U.S. Government has rights to use, reproduce, and distribute this software. 
+NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,
+EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  
+If software is modified to produce derivative works, such modified software 
+should be clearly marked, so as not to confuse it with the version available 
+from LANL.
+ 
+Additionally, redistribution and use in source and binary forms, with or 
+without modification, are permitted provided that the following conditions 
+are met:
+-   Redistributions of source code must retain the above copyright notice, 
+    this list of conditions and the following disclaimer. 
+-   Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution. 
+-   Neither the name of Los Alamos National Security, LLC, Los Alamos National
+    Laboratory, LANL, the U.S. Government, nor the names of its contributors
+    may be used to endorse or promote products derived from this software 
+    without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR 
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+                                                                                
+=========================================================================*/
+
+// .NAME InitialExchange - Get pointer to alive plus dead particles for this
+//                         processor.  Store the alive in supplied vectors
+//                         and send the dead to the correct neighbor processor
+//                         where they become alive.
+//
+// .SECTION Description
+// InitialExchange takes input from the particle initializer which originally
+// contained alive particles on this processor, but after the particles move
+// a little they might have moved to a neighbor processor.  Input is in the
+// form of arrays, but vectors are supplied which will be filled with the
+// alive particles for this processor.  As each particle is examined it is
+// placed immediately in the vector if it is alive on this processor and the
+// index is placed by neighbor if it is dead on this processor.  After
+// categorizing all particles, the dead are exchanged with neighbors where
+// they become alive.
+//
+// After this initial exchange, ParticleExchange is called to place all
+// dead particles (which will be the ones sent in this step being returned
+// plus the others which were originally placed correctly on the neighbor).
+
+#ifndef InitialExchange_h
+#define InitialExchange_h
+
+#include "Definition.h"
+#include "Message.h"
+#include <string>
+#include <vector>
+
+// NOTE(review): this constant is not referenced in the part of the sources
+// visible alongside this header; confirm where it is used and what the
+// value 8.0 scales before changing it.
+#define INITIAL_EXCHANGE_FUDGE 8.0
+
+using namespace std;
+
+// Sorts the particles delivered by the initializer: those inside this
+// processor's alive region are copied into the supplied vectors, the others
+// are sent to the neighbor processor(s) whose range contains them.
+class InitialExchange {
+public:
+  InitialExchange();
+  ~InitialExchange();
+
+  // Set parameters for particle distribution
+  void setParameters(
+	POSVEL_T rL,		// Box size of the physical problem
+	POSVEL_T deadSize);	// Dead delta border for each processor
+
+  // Set neighbor processor numbers and calculate dead regions
+  void initialize();
+
+  // Calculate physical range of alive particles which must be shared
+  void calculateExchangeRegions();
+
+  // Set alive particle arrays from the initializer.  Only the pointers and
+  // the count are stored; the arrays are read during the exchange.
+  void setParticleArrays(
+	long count,
+	POSVEL_T* xx,
+	POSVEL_T* yy,
+	POSVEL_T* zz,
+	POSVEL_T* vx,
+	POSVEL_T* vy,
+	POSVEL_T* vz,
+	POTENTIAL_T* potential,
+	ID_T* tag,
+	MASK_T* mask);
+
+  // Set alive particle vectors which will be filled in.  Only the pointers
+  // are stored; particles are appended during the exchange.
+  void setParticleVectors(
+	vector<POSVEL_T>* xx,
+	vector<POSVEL_T>* yy,
+	vector<POSVEL_T>* zz,
+	vector<POSVEL_T>* vx,
+	vector<POSVEL_T>* vy,
+	vector<POSVEL_T>* vz,
+	vector<POTENTIAL_T>* potential,
+	vector<ID_T>* tag,
+	vector<MASK_T>* mask,
+	vector<STATUS_T>* status);
+
+  // Identify and exchange alive particles which must be shared with neighbors
+  // exchangeParticles() runs identifyExchangeParticles() followed by
+  // exchangeNeighborParticles(), which calls exchange() for each neighbor
+  void exchangeParticles();
+  void identifyExchangeParticles();
+  void exchangeNeighborParticles();
+  void exchange(
+	int sendTo,		// Neighbor to send particles to
+	int recvFrom,		// Neighbor to receive particles from
+	Message* sendMessage,
+	Message* recvMessage);
+
+  // Number of particles stored in the output vectors so far
+  long getNumberOfAliveParticles() const { return numberOfAliveParticles; }
+
+private:
+  int    myProc;		// My processor number
+  int    numProc;		// Total number of processors
+
+  int    layoutSize[DIMENSION];	// Decomposition of processors
+  int    layoutPos[DIMENSION];	// Position of this processor in decomposition
+
+  POSVEL_T boxSize;		// Physical box size (rL)
+  POSVEL_T deadSize;		// Border size for dead particles
+
+  long   numberOfAliveParticles;
+				// Incremented for every particle appended to
+				// the output vectors
+  long   particleCount;		// Number of particles received from arrays;
+				// decremented per particle sent and
+				// incremented per particle received
+
+  POSVEL_T minAlive[DIMENSION];	// Minimum alive particle not exchanged
+  POSVEL_T maxAlive[DIMENSION];	// Maximum alive particle not exchanged
+  POSVEL_T minDead[DIMENSION];	// Minimum particle sent
+  POSVEL_T maxDead[DIMENSION];	// Maximum particle sent
+
+  int    neighbor[NUM_OF_NEIGHBORS];		 // Neighbor processor indices
+  POSVEL_T minRange[NUM_OF_NEIGHBORS][DIMENSION]; // Range of dead particles
+  POSVEL_T maxRange[NUM_OF_NEIGHBORS][DIMENSION]; // Range of dead particles
+
+  vector<ID_T> neighborParticles[NUM_OF_NEIGHBORS];
+				// Indices into the initializer arrays of the
+				// particles sent to each neighbor as ALIVE
+				// (array positions, not particle tags)
+
+  POSVEL_T* xxInit;		// X location from initializer
+  POSVEL_T* yyInit;		// Y location from initializer
+  POSVEL_T* zzInit;		// Z location from initializer
+  POSVEL_T* vxInit;		// X velocity from initializer
+  POSVEL_T* vyInit;		// Y velocity from initializer
+  POSVEL_T* vzInit;		// Z velocity from initializer
+  POTENTIAL_T* potInit;		// Particle potential
+  ID_T* tagInit;		// Tag from initializer
+  MASK_T* maskInit;		// Particle information
+
+  vector<POSVEL_T>* xx;		// X location for particles on this processor
+  vector<POSVEL_T>* yy;		// Y location for particles on this processor
+  vector<POSVEL_T>* zz;		// Z location for particles on this processor
+  vector<POSVEL_T>* vx;		// X velocity for particles on this processor
+  vector<POSVEL_T>* vy;		// Y velocity for particles on this processor
+  vector<POSVEL_T>* vz;		// Z velocity for particles on this processor
+  vector<POTENTIAL_T>* pot;	// Particle potential
+  vector<ID_T>* tag;		// Id tag for particles on this processor
+  vector<MASK_T>* mask;		// Particle information
+
+  vector<STATUS_T>* status;	// Particle is ALIVE when it leaves
+				// NOTE(review): stored by setParticleVectors()
+				// but not written by the identify/exchange
+				// methods -- confirm who fills it
+};
+
+#endif
diff --git a/src/halo-finder/src/Makefile b/src/halo-finder/src/Makefile
new file mode 100644
index 0000000..ba33cc8
--- /dev/null
+++ b/src/halo-finder/src/Makefile
@@ -0,0 +1,129 @@
+# Directory that receives every object, archive and program built here.
+# HACC_OBJDIR is not set in this file; presumably it comes from the
+# environment or from include.mk -- TODO confirm.
+OBJDIR = ${HACC_OBJDIR}
+#SOBJDIR = ${HACC_OBJDIR}_serial
+
+
+# Executables built by the "progs" target
+PROGS    = \
+	$(OBJDIR)/ForceTreeTest
+
+
+# Static libraries built by the "all" and "libs" targets
+LIBS	= \
+	$(OBJDIR)/libpartition.a \
+	$(OBJDIR)/libparticle.a \
+	$(OBJDIR)/libBHForceTree.a \
+	$(OBJDIR)/libhalotime.a \
+	$(OBJDIR)/libbigchunk.a
+
+
+# Probably need to fix dependencies on some of the header files.
+# NOTE(review): no rule in this Makefile names $(HDR) as a prerequisite, so
+# editing one of these headers does not by itself trigger a rebuild (unless
+# an included .mk file uses the variable).
+HDR= \
+	RCBForceTree.h \
+	RCOForceTree.h \
+	BHForceTree.h \
+	CosmoHalo.h \
+	Definition.h \
+	ForceLaw.h \
+	ForceTree.h \
+	InitialExchange.h \
+	Message.h \
+	Partition.h \
+	ParticleDistribute.h \
+	ParticleExchange.h \
+	Timer.h \
+	Timings.h \
+	cudaUtil.h
+
+
+# Aggregate targets.  They name no real files, so they are declared phony:
+# a stray file called "all", "libs" or "progs" can then never make them
+# look up to date.
+.PHONY: all libs progs
+all: ${LIBS}
+libs: ${LIBS}
+progs: ${PROGS}
+
+# Settings pulled in from the included fragments.  The HACC_*, DFFT_* and
+# HF_* variables used below are not defined in this file; presumably these
+# fragments or the environment provide them -- TODO confirm.
+include include.mk
+#include ../include.mk
+include dfft/include.mk
+CXXFLAGS += ${DFFT_CXXFLAGS}
+
+# MPI compile and link flags
+CFLAGS += ${HACC_MPI_CFLAGS}
+CXXFLAGS += ${HACC_MPI_CXXFLAGS}
+LDFLAGS += ${HACC_MPI_LDFLAGS}
+
+#CXXFLAGS += ${MPI_COMPILE_FLAGS} 
+CXXFLAGS += ${HF_TYPE_FLAGS}
+CXXFLAGS += ${HF_WARNING}
+#CXXFLAGS += -g
+#CXXFLAGS += -DDEBUG
+# Headers are looked up in this directory ...
+CXXFLAGS += -I.
+#LDFLAGS  = ${MPI_LD_FLAGS} -lmpi_cxx
+
+# ... and in the dfft subdirectory, for both C and C++ sources
+CFLAGS += -Idfft
+CXXFLAGS += -Idfft
+
+
+# Create the output directory on demand
+$(OBJDIR):
+	mkdir -p $(OBJDIR)
+
+# Compile rules.  $(OBJDIR) is an order-only prerequisite (after the "|"):
+# it must exist before compiling, but its timestamp never forces a rebuild.
+$(OBJDIR)/%.o: %.c | $(OBJDIR)
+	${HACC_MPI_CC} ${CFLAGS} -c -o $@ $<
+
+$(OBJDIR)/%.o: %.cxx | $(OBJDIR)
+	${HACC_MPI_CXX} ${CXXFLAGS} -c -o $@ $<
+
+# NOTE(review): .cu sources are compiled with the C++ compiler but with
+# CFLAGS rather than CXXFLAGS -- confirm this is intended.
+$(OBJDIR)/%.o: %.cu | $(OBJDIR)
+	${HACC_MPI_CXX} ${CFLAGS} -c -o $@ $<
+
+
+#$(SOBJDIR):
+#	mkdir -p $(SOBJDIR)
+#
+#$(SOBJDIR)/%.o: %.c | $(SOBJDIR)
+#	${HACC_CC} -c -o $@ $<
+#
+#$(SOBJDIR)/%.o: %.cxx | $(SOBJDIR)
+#	${HACC_CXX} -c -o $@ $<
+
+
+#$(OBJDIR)/ForceLaw.o: ForceLaw.h
+
+
+# Local copy of dfft/dims.c, so that the generic %.c rule can compile it
+# into $(OBJDIR)/dims-local.o
+dims-local.c: dfft/dims.c
+	cp -f dfft/dims.c dims-local.c
+
+
+# Link the test program from its object, the five archives and dims-local.o.
+# NOTE(review): every prerequisite is an object or archive, so the
+# -DUSE_SERIAL_COSMO=1 on this link line does not reach any compilation;
+# the objects are built by the rules above without that macro.
+$(OBJDIR)/ForceTreeTest: $(OBJDIR)/ForceTreeTest.o $(OBJDIR)/libparticle.a $(OBJDIR)/libBHForceTree.a $(OBJDIR)/libpartition.a $(OBJDIR)/libhalotime.a $(OBJDIR)/libbigchunk.a $(OBJDIR)/dims-local.o
+	${HACC_MPI_CXX} -o $@ $^ ${CXXFLAGS} ${LDFLAGS} -DUSE_SERIAL_COSMO=1
+
+# Each library below lists its members with make's archive-member syntax
+# "lib.a(member.o)" and then refreshes the archive index with ranlib.
+
+# Processor partitioning
+$(OBJDIR)/libpartition.a: $(OBJDIR)/libpartition.a($(OBJDIR)/Partition.o)
+	ranlib $@
+
+
+# Particle distribution, exchange and message passing
+PARTICLE_SOURCES += ParticleDistribute.cxx
+PARTICLE_SOURCES += ParticleExchange.cxx
+PARTICLE_SOURCES += InitialExchange.cxx
+PARTICLE_SOURCES += Message.cxx
+PARTICLE_OBJLIST = $(PARTICLE_SOURCES:.cxx=.o)
+PARTICLE_OBJECTS := $(addprefix $(OBJDIR)/,$(PARTICLE_OBJLIST))
+$(OBJDIR)/libparticle.a: $(OBJDIR)/libparticle.a($(PARTICLE_OBJECTS))
+	ranlib $@
+
+
+# Timing utilities
+HT_SOURCES += Timings.cxx
+HT_SOURCES += Timer.cxx
+HT_OBJLIST = $(HT_SOURCES:.cxx=.o)
+HT_OBJECTS := $(addprefix $(OBJDIR)/,$(HT_OBJLIST))
+$(OBJDIR)/libhalotime.a: $(OBJDIR)/libhalotime.a($(HT_OBJECTS))
+	ranlib $@
+
+
+# Force trees: C++ sources plus two C sources
+FORCE_SOURCES += ForceLaw.cxx
+FORCE_SOURCES += BHForceTree.cxx
+FORCE_SOURCES += RCOForceTree.cxx
+FORCE_SOURCES += RCBForceTree.cxx
+FORCE_SOURCES_C += BGQStep16.c
+FORCE_SOURCES_C += BGQCM.c
+FORCE_OBJLIST = $(FORCE_SOURCES:.cxx=.o)
+FORCE_OBJLIST_C = $(FORCE_SOURCES_C:.c=.o)
+FORCE_OBJECTS := $(addprefix $(OBJDIR)/,$(FORCE_OBJLIST) $(FORCE_OBJLIST_C))
+# NOTE(review): FORCE_OBJECTS already contains the two C objects with the
+# $(OBJDIR)/ prefix; the extra $(FORCE_OBJLIST_C) below names them a second
+# time without the prefix, i.e. as objects in the source directory that the
+# $(OBJDIR)/%.o rules above do not cover.  Confirm the duplicate is wanted.
+$(OBJDIR)/libBHForceTree.a: $(OBJDIR)/libBHForceTree.a($(FORCE_OBJECTS) $(FORCE_OBJLIST_C))
+	ranlib $@
+
+# Built from bigchunk.o
+$(OBJDIR)/libbigchunk.a: $(OBJDIR)/libbigchunk.a($(OBJDIR)/bigchunk.o)
+	ranlib $@
diff --git a/src/halo-finder/src/Message.cxx b/src/halo-finder/src/Message.cxx
new file mode 100644
index 0000000..2239655
--- /dev/null
+++ b/src/halo-finder/src/Message.cxx
@@ -0,0 +1,242 @@
+/*=========================================================================
+                                                                                
+Copyright (c) 2007, Los Alamos National Security, LLC
+
+All rights reserved.
+
+Copyright 2007. Los Alamos National Security, LLC. 
+This software was produced under U.S. Government contract DE-AC52-06NA25396 
+for Los Alamos National Laboratory (LANL), which is operated by 
+Los Alamos National Security, LLC for the U.S. Department of Energy. 
+The U.S. Government has rights to use, reproduce, and distribute this software. 
+NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,
+EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  
+If software is modified to produce derivative works, such modified software 
+should be clearly marked, so as not to confuse it with the version available 
+from LANL.
+ 
+Additionally, redistribution and use in source and binary forms, with or 
+without modification, are permitted provided that the following conditions 
+are met:
+-   Redistributions of source code must retain the above copyright notice, 
+    this list of conditions and the following disclaimer. 
+-   Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution. 
+-   Neither the name of Los Alamos National Security, LLC, Los Alamos National
+    Laboratory, LANL, the U.S. Government, nor the names of its contributors
+    may be used to endorse or promote products derived from this software 
+    without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR 
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+                                                                                
+=========================================================================*/
+
+#ifdef USE_SERIAL_COSMO
+#include <string.h>
+#endif
+
+#include <iostream>
+
+using namespace std;
+
+#include "Message.h"
+#include "Partition.h"
+
+////////////////////////////////////////////////////////////////////////////
+//
+// Create a Message for sending or receiving from MPI
+//
+////////////////////////////////////////////////////////////////////////////
+
+// Allocates a buffer of `size` bytes and starts packing at offset zero.
+// Members are initialized in their declaration order.
+Message::Message(int size)
+  : buffer(new char[size]),
+    bufSize(size),
+    bufPos(0)
+{
+}
+
+// Copy `count` items of `size` bytes each from `data` into the buffer,
+// starting at byte offset `pos`.  The current pack position (bufPos) is not
+// touched, and nothing is checked against the buffer size.
+void Message::manualPackAtPosition(char* data, int pos, int count, size_t size)
+{
+  const char* src = data;
+  for (int item = 0; item < count; ++item) {
+    for (size_t byte = 0; byte < size; ++byte) {
+      this->buffer[pos++] = *src++;
+    }
+  }
+}
+
+// Append `count` items of `size` bytes each from `data` at the current pack
+// position, advancing bufPos by one per byte written.  Nothing is checked
+// against the buffer size.
+void Message::manualPack(char* data, int count, size_t size)
+{
+  const char* src = data;
+  for (int item = 0; item < count; ++item) {
+    for (size_t byte = 0; byte < size; ++byte) {
+      this->buffer[this->bufPos++] = *src++;
+    }
+  }
+}
+
+// Copy `count` items of `size` bytes each out of the buffer into `data`,
+// reading from the current position and advancing bufPos by one per byte.
+// Nothing is checked against the buffer size.
+void Message::manualUnpack(char* data, int count, size_t size)
+{
+  char* dst = data;
+  for (int item = 0; item < count; ++item) {
+    for (size_t byte = 0; byte < size; ++byte) {
+      *dst++ = this->buffer[this->bufPos++];
+    }
+  }
+}
+
+
+////////////////////////////////////////////////////////////////////////////
+//
+// Destructor for a message
+//
+////////////////////////////////////////////////////////////////////////////
+Message::~Message()
+{
+  delete [] this->buffer;
+}
+
+////////////////////////////////////////////////////////////////////////////
+//
+// Reset for another message of the same size
+//
+////////////////////////////////////////////////////////////////////////////
+// Rewind the pack/unpack position to the start of the buffer.  The buffer
+// contents and its size are left as they are.
+void Message::reset()
+{
+  bufPos = 0;
+}
+
+
+////////////////////////////////////////////////////////////////////////////
+//
+// Place an integer at a specific location in the buffer
+// Used to set a counter of particles in the first position when it is
+// only known after all the particles are packed
+//
+////////////////////////////////////////////////////////////////////////////
+void Message::putValueAtPosition(int* data, int pos, int count)
+{
+  // Raw byte view of the integers; bufPos is not advanced by this call
+  char* bytes = reinterpret_cast<char*>(data);
+  manualPackAtPosition(bytes, pos, count, sizeof(*data));
+}
+////////////////////////////////////////////////////////////////////////////
+//
+// Packing of the buffer
+//
+////////////////////////////////////////////////////////////////////////////
+// One overload per supported element type.  Each appends the raw bytes of
+// `count` elements at the current pack position.
+void Message::putValue(int* data, int count)
+{
+  manualPack(reinterpret_cast<char*>(data), count, sizeof(*data));
+}
+
+void Message::putValue(unsigned short* data, int count)
+{
+  manualPack(reinterpret_cast<char*>(data), count, sizeof(*data));
+}
+
+void Message::putValue(long int* data, int count)
+{
+  manualPack(reinterpret_cast<char*>(data), count, sizeof(*data));
+}
+
+void Message::putValue(long long* data, int count)
+{
+  manualPack(reinterpret_cast<char*>(data), count, sizeof(*data));
+}
+
+void Message::putValue(float* data, int count)
+{
+  manualPack(reinterpret_cast<char*>(data), count, sizeof(*data));
+}
+
+void Message::putValue(double* data, int count)
+{
+  manualPack(reinterpret_cast<char*>(data), count, sizeof(*data));
+}
+
+void Message::putValue(char* data, int count)
+{
+  manualPack(data, count, sizeof(*data));
+}
+
+////////////////////////////////////////////////////////////////////////////
+//
+// Unpacking of the buffer
+//
+////////////////////////////////////////////////////////////////////////////
+// One overload per supported element type.  Each copies the raw bytes of
+// `count` elements from the current unpack position into `data`.
+void Message::getValue(int* data, int count)
+{
+  manualUnpack(reinterpret_cast<char*>(data), count, sizeof(*data));
+}
+
+void Message::getValue(unsigned short* data, int count)
+{
+  manualUnpack(reinterpret_cast<char*>(data), count, sizeof(*data));
+}
+
+void Message::getValue(long int* data, int count)
+{
+  manualUnpack(reinterpret_cast<char*>(data), count, sizeof(*data));
+}
+
+void Message::getValue(long long* data, int count)
+{
+  manualUnpack(reinterpret_cast<char*>(data), count, sizeof(*data));
+}
+
+void Message::getValue(float* data, int count)
+{
+  manualUnpack(reinterpret_cast<char*>(data), count, sizeof(*data));
+}
+
+void Message::getValue(double* data, int count)
+{
+  manualUnpack(reinterpret_cast<char*>(data), count, sizeof(*data));
+}
+
+void Message::getValue(char* data, int count)
+{
+  manualUnpack(data, count, sizeof(*data));
+}
+
+////////////////////////////////////////////////////////////////////////////
+//
+// Nonblocking send
+//
+////////////////////////////////////////////////////////////////////////////
+void Message::send
+#ifdef USE_SERIAL_COSMO
+  (int , int )
+#else
+  (int mach, int tag)
+#endif
+{
+#ifdef USE_SERIAL_COSMO
+  // Serial stand-in for MPI: queue a private copy of the packed bytes on
+  // this Message.  The copy is allocated (zero-filled) at the full buffer
+  // size because receive() copies bufSize bytes out of it; a copy of only
+  // bufPos bytes would be read past its end whenever the buffer is not
+  // completely filled.
+  char* in = new char[this->bufSize]();
+  memcpy(in, this->buffer, this->bufPos);
+  q.push(in);
+#else
+  // Start a nonblocking send of the bytes packed so far
+  MPI_Request request;
+  MPI_Isend(this->buffer, this->bufPos, MPI_PACKED, 
+            mach, tag, Partition::getComm(), &request);
+  // The handle is never waited on, so hand it back to MPI right away; the
+  // send itself still runs to completion.
+  // NOTE(review): the buffer must not be repacked before the transfer has
+  // finished; confirm the caller's ordering guarantees that.
+  MPI_Request_free(&request);
+#endif
+}
+
+
+////////////////////////////////////////////////////////////////////////////
+//
+// Blocking receive
+//
+////////////////////////////////////////////////////////////////////////////
+// Fills the buffer with one incoming message.  The parameter list is chosen
+// by the preprocessor: the serial variant leaves both parameters unnamed
+// because it does not use them.
+void Message::receive
+#ifdef USE_SERIAL_COSMO
+(int, int)
+#else
+(int mach, int tag)
+#endif
+{
+#ifdef USE_SERIAL_COSMO
+  // Serial build: take the oldest block queued on this same Message object,
+  // copy it into the buffer and free it.
+  // NOTE(review): q.front() is used without checking q.empty(), and the
+  // copy reads bufSize bytes, so the queued block must be at least that
+  // long -- confirm both hold for every caller.
+  char* out = q.front(); q.pop();
+  memcpy(this->buffer, out, this->bufSize);
+  delete [] out;
+#else
+  // Blocking receive of up to bufSize bytes from rank `mach` with `tag`.
+  // The returned status is not inspected.
+  MPI_Status status;
+  MPI_Recv(this->buffer, this->bufSize, MPI_PACKED, mach, tag,
+           Partition::getComm(), &status);
+#endif
+}
diff --git a/src/halo-finder/src/Message.h b/src/halo-finder/src/Message.h
new file mode 100644
index 0000000..3e64292
--- /dev/null
+++ b/src/halo-finder/src/Message.h
@@ -0,0 +1,126 @@
+/*=========================================================================
+                                                                                
+Copyright (c) 2007, Los Alamos National Security, LLC
+
+All rights reserved.
+
+Copyright 2007. Los Alamos National Security, LLC. 
+This software was produced under U.S. Government contract DE-AC52-06NA25396 
+for Los Alamos National Laboratory (LANL), which is operated by 
+Los Alamos National Security, LLC for the U.S. Department of Energy. 
+The U.S. Government has rights to use, reproduce, and distribute this software. 
+NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,
+EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  
+If software is modified to produce derivative works, such modified software 
+should be clearly marked, so as not to confuse it with the version available 
+from LANL.
+ 
+Additionally, redistribution and use in source and binary forms, with or 
+without modification, are permitted provided that the following conditions 
+are met:
+-   Redistributions of source code must retain the above copyright notice, 
+    this list of conditions and the following disclaimer. 
+-   Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution. 
+-   Neither the name of Los Alamos National Security, LLC, Los Alamos National
+    Laboratory, LANL, the U.S. Government, nor the names of its contributors
+    may be used to endorse or promote products derived from this software 
+    without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR 
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+                                                                                
+=========================================================================*/
+
+// .NAME Message - create, send and receive MPI messages
+//
+// .SECTION Description
+// Message class packs and unpacks data into an MPI buffer
+
+#ifndef MESSAGE_H
+#define MESSAGE_H
+
+#ifdef USE_VTK_COSMO
+#include "CosmoDefinition.h"
+#include "vtkstd/queue"
+
+using namespace vtkstd;
+#else
+#include "Definition.h"
+#include <queue>
+
+using namespace std;
+#endif
+
+
+// Byte buffer with a running position, used to pack values for sending and
+// to unpack them after receiving.
+// NOTE(review): the class owns `buffer` through a raw pointer but declares
+// no copy constructor or assignment operator, so copying a Message would
+// free the same buffer twice -- confirm it is only passed by pointer.
+class Message {
+public:
+  // Allocate a buffer of `size` bytes
+  Message(int size = BUF_SZ);
+
+   ~Message();
+
+  // Put values into the MPI buffer
+  // putValueAtPosition writes at byte offset `pos` without moving the
+  // current position; putValue appends at the current position
+  void putValueAtPosition(int* data, int pos, int count = 1);
+  void putValue(int* data, int count = 1);
+  void putValue(unsigned short* data, int count = 1);
+  void putValue(long int* data, int count = 1);
+  void putValue(long long* data, int count = 1);
+  void putValue(float* data, int count = 1);
+  void putValue(double* data, int count = 1);
+  void putValue(char* data, int count = 1);
+
+  // Get values from the MPI buffer
+  // Each call reads `count` elements at the current position and advances it
+  void getValue(int* data, int count = 1);
+  void getValue(unsigned short* data, int count = 1);
+  void getValue(long int* data, int count = 1);
+  void getValue(long long* data, int count = 1);
+  void getValue(float* data, int count = 1);
+  void getValue(double* data, int count = 1);
+  void getValue(char* data, int count = 1);
+
+  // Current byte position in the buffer
+  int getBufPos() { return this->bufPos; }
+
+  // Byte-wise copy helpers behind putValue/getValue: `count` items of
+  // `size` bytes each; none of them checks against the buffer size
+  void manualPackAtPosition(char* data, int pos, int count, size_t size);
+  void manualPack(char* data, int count, size_t size);
+  void manualUnpack(char* data, int count, size_t size);
+
+  // Send nonblocking
+  void send(
+        int mach,                       // Where to send message
+        int tag = 0                     // Identifying tag
+  );
+
+  // Receive blocking
+  void receive(
+#ifdef USE_SERIAL_COSMO
+        int mach = 0,
+#else
+        int mach = MPI_ANY_SOURCE,      // From where to receive
+#endif
+        int tag = 0                     // Identifying tag
+  );
+
+#ifdef USE_SERIAL_COSMO // message queue hack for serial
+  // Heap copies queued by send() and consumed by receive() on this object
+  queue<char*> q;
+#endif
+
+  // Reset the buffer for another set of data
+  // (rewinds the position only)
+  void reset();
+
+private:
+  char* buffer;         // Buffer to pack
+  int   bufSize;        // Size of buffer
+  int   bufPos;         // Position in buffer
+};
+
+#endif
diff --git a/src/halo-finder/src/ParticleDistribute.cxx b/src/halo-finder/src/ParticleDistribute.cxx
new file mode 100644
index 0000000..88b7899
--- /dev/null
+++ b/src/halo-finder/src/ParticleDistribute.cxx
@@ -0,0 +1,2033 @@
+/*=========================================================================
+
+Copyright (c) 2007, Los Alamos National Security, LLC
+
+All rights reserved.
+
+Copyright 2007. Los Alamos National Security, LLC.
+This software was produced under U.S. Government contract DE-AC52-06NA25396
+for Los Alamos National Laboratory (LANL), which is operated by
+Los Alamos National Security, LLC for the U.S. Department of Energy.
+The U.S. Government has rights to use, reproduce, and distribute this software.
+NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,
+EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.
+If software is modified to produce derivative works, such modified software
+should be clearly marked, so as not to confuse it with the version available
+from LANL.
+
+Additionally, redistribution and use in source and binary forms, with or
+without modification, are permitted provided that the following conditions
+are met:
+-   Redistributions of source code must retain the above copyright notice,
+    this list of conditions and the following disclaimer.
+-   Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+-   Neither the name of Los Alamos National Security, LLC, Los Alamos National
+    Laboratory, LANL, the U.S. Government, nor the names of its contributors
+    may be used to endorse or promote products derived from this software
+    without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+=========================================================================*/
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <iomanip>
+
+#include <stddef.h>
+
+#include <sys/types.h>
+
+#ifdef _WIN32
+#include "winDirent.h"
+#else
+#include <dirent.h>
+#endif
+
+#include "Partition.h"
+#include "ParticleDistribute.h"
+
+#ifdef USE_VTK_COSMO
+#include "vtkStdString.h"
+#include "vtkSetGet.h"
+#endif
+
+#include <cstring>
+using namespace std;
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Particle data space is partitioned for the number of processors
+// which currently is a factor of two but is easily extended.  Particles
+// are read in from files where each processor reads one file into a buffer,
+// extracts the particles which really belong on the processor (ALIVE) and
+// those in a buffer region around the edge (DEAD).  The buffer is then
+// passed round robin to every other processor so that all particles are
+// examined by all processors.  All dead particles are tagged with the
+// neighbor zone (26 neighbors in 3D) so that later halos can be associated
+// with zones.
+//
+/////////////////////////////////////////////////////////////////////////
+
+ParticleDistribute::ParticleDistribute()
+{
+  // Defaults: nothing read yet, unit conversion factors are the identity
+  numberOfAliveParticles = 0;
+  massConvertFactor = 1.0;
+  distConvertFactor = 1.0;
+
+  // Cache the parallel layout from Partition: total ranks and our own rank
+  numProc = Partition::getNumProc();
+  myProc = Partition::getMyProc();
+
+  // Ranks per dimension, our coordinates in that decomposition, and the
+  // neighboring ranks (per the original comment these include wraparound)
+  Partition::getDecompSize(layoutSize);
+  Partition::getMyPosition(layoutPos);
+  Partition::getNeighbors(neighbor);
+
+  // NOTE(review): inputType is not assigned in this constructor
+}
+
+ParticleDistribute::~ParticleDistribute()
+{  // Empty body: no explicit cleanup is performed here
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Set parameters for particle distribution
+//
+/////////////////////////////////////////////////////////////////////////
+
+void ParticleDistribute::setParameters(
+                        const string& baseName,
+                        POSVEL_T rL,
+                        string dataType)
+{
+  // Record the input base file name and the physical box length
+  baseFile = baseName;
+  boxSize = rL;
+
+  // Select the reader from the format name:
+  //   "RECORD" - binary .cosmo, one particle with all information per record
+  //   "BLOCK"  - Gadget format: header, x,y,z locations for all particles,
+  //              then x,y,z velocities for all particles, then all tags
+  // Any other name leaves inputType as it was
+  if (dataType == "RECORD") {
+    inputType = RECORD;
+  } else if (dataType == "BLOCK") {
+    inputType = BLOCK;
+  }
+
+#ifndef USE_VTK_COSMO
+  if (myProc == MASTER) {
+    cout << endl << "------------------------------------" << endl
+         << "boxSize:  " << boxSize << endl;
+  }
+#endif
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Set parameters for particle unit conversion
+//
+/////////////////////////////////////////////////////////////////////////
+
+void ParticleDistribute::setConvertParameters(
+                        POSVEL_T massFactor,
+                        POSVEL_T distFactor)
+{
+  distConvertFactor = distFactor;   // factor stored for distance conversion
+  massConvertFactor = massFactor;   // factor stored for mass conversion
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Set box sizes for determining if a particle is in the alive or dead
+// region of this processor.  Data space is a DIMENSION torus.
+//
+/////////////////////////////////////////////////////////////////////////
+
+void ParticleDistribute::initialize()
+{
+#if !defined(USE_VTK_COSMO) && defined(DEBUG)
+  // Debug builds only: the master rank prints the processor layout
+  if (myProc == MASTER)
+    cout << "Decomposition: [" << layoutSize[0] << ":"
+         << layoutSize[1] << ":" << layoutSize[2] << "]" << endl;
+#endif
+
+  // In each dimension the box is cut into layoutSize[dim] equal steps and
+  // this processor takes the step selected by layoutPos[dim]; the upper
+  // bound is clamped so it never exceeds the box size
+  for (int dim = 0; dim < DIMENSION; dim++) {
+    const POSVEL_T step = boxSize / layoutSize[dim];
+
+    // Alive particles
+    minAlive[dim] = layoutPos[dim] * step;
+    maxAlive[dim] = minAlive[dim] + step;
+    if (boxSize < maxAlive[dim])
+      maxAlive[dim] = boxSize;
+  }
+}
+
+
+void ParticleDistribute::setParticles(vector<POSVEL_T>* xLoc,
+                                      vector<POSVEL_T>* yLoc,
+                                      vector<POSVEL_T>* zLoc,
+                                      vector<POSVEL_T>* xVel,
+                                      vector<POSVEL_T>* yVel,
+                                      vector<POSVEL_T>* zVel,
+                                      vector<POSVEL_T>* mass,
+                                      vector<ID_T>* id)
+{
+  // Only the pointers are stored; the vectors themselves are not copied
+  // Locations
+  xx = xLoc;  yy = yLoc;  zz = zLoc;
+  // Velocities
+  vx = xVel;  vy = yVel;  vz = zVel;
+  // Mass and identifier
+  ms = mass;
+  tag = id;
+}
+
+#ifndef USE_SERIAL_COSMO
+/////////////////////////////////////////////////////////////////////////
+//
+// Each processor reads 0 or more files, a buffer at a time, and shares
+// the particles using an all-to-all with every other processor
+// reserveQ: nonzero reserves space in the particle vectors before reading
+/////////////////////////////////////////////////////////////////////////
+
+void ParticleDistribute::readParticlesAllToAll(int reserveQ)
+{
+  // Find how many input files there are and deal them between the processors
+  // Calculates the max number of files per processor and max number of
+  // particles per file so that buffering can be done
+  // For round robin sharing determine where to send and receive buffers from
+  partitionInputFiles(true);
+
+  // Compute the total number of particles in the problem
+  // Compute the maximum number of particles in any one file to set buffer size
+  findFileParticleCount();
+
+  // If there is only one input file we don't have to do MPI messaging
+  // because each processor will read that same file and extract only
+  // the particles in range
+  if (this->numberOfFiles == 1) {
+    if (this->inputType == RECORD) {
+      readFromRecordFile();
+    } else {
+      readFromBlockFile();
+    }
+  } else {
+
+  // MPI buffer size might limit the number of particles read from a file
+  // and passed round robin
+  // Largest file will have a number of buffer chunks to send if it is too large
+  // Every processor must send that number of chunks even if its own file
+  // does not have that much information
+
+  if (ENFORCE_MAX_READ == true && this->maxParticles > MAX_READ) {
+    this->maxRead = MAX_READ;
+    this->maxReadsPerFile = (this->maxParticles / this->maxRead) + 1;
+  } else {
+    this->maxRead = this->maxParticles;
+    this->maxReadsPerFile = 1;
+  }
+
+  MPI_Datatype CosmoParticleType;
+
+  {
+    MPI_Datatype type[3] = { MPI_FLOAT, MPI_INT, MPI_UB };
+    int blocklen[3] = { COSMO_FLOAT, COSMO_INT, 1 };
+    MPI_Aint disp[3] = { offsetof(CosmoParticle, floatData),
+                         offsetof(CosmoParticle, intData),
+                         sizeof(CosmoParticle) };
+    MPI_Type_struct(3, blocklen, disp, type, &CosmoParticleType);
+    MPI_Type_commit(&CosmoParticleType);
+  }
+
+  int numProc = Partition::getNumProc();
+  std::vector<int> pCounts(numProc), pRecvCounts(numProc),
+                   pDisp(numProc),   pRecvDisp(numProc);
+  std::vector< std::vector<CosmoParticle> > pByProc(numProc);
+  std::vector<CosmoParticle> pBuffer, pRecvBuffer;
+
+  // Allocate space to hold buffer information for reading of files
+  // Mass is constant use that float to store the tag
+  // Number of particles is the first integer in the buffer
+
+  // Allocate space for the data read from the file
+  POSVEL_T *fBlock = 0;
+  POSVEL_T *lBlock = 0;
+  POSVEL_T *vBlock = 0;
+  ID_T* iBlock = 0;
+
+  // RECORD format reads one particle at a time
+  if (this->inputType == RECORD) {
+    fBlock = new POSVEL_T[COSMO_FLOAT];
+    iBlock = new ID_T[COSMO_INT];
+  }
+
+  // BLOCK format reads all particles at one time for triples
+  else if (this->inputType == BLOCK) {
+    lBlock = new POSVEL_T[this->maxRead * DIMENSION];
+    vBlock = new POSVEL_T[this->maxRead * DIMENSION];
+    iBlock = new ID_T[this->maxRead];
+  }
+
+  // Reserve particle storage to minimize reallocation
+  int reserveSize = (int) (this->maxFiles * this->maxParticles * DEAD_FACTOR);
+
+  // If multiple processors are reading the same file we can reduce size
+  reserveSize /= this->processorsPerFile;
+
+  if(reserveQ) {
+#ifndef USE_VTK_COSMO
+    cout << "readParticlesAllToAll reserving vectors" << endl;
+#endif
+    this->xx->reserve(reserveSize);
+    this->yy->reserve(reserveSize);
+    this->zz->reserve(reserveSize);
+    this->vx->reserve(reserveSize);
+    this->vy->reserve(reserveSize);
+    this->vz->reserve(reserveSize);
+    this->ms->reserve(reserveSize);
+    this->tag->reserve(reserveSize);
+  }
+
+  // Running total and index into particle data on this processor
+  this->particleCount = 0;
+
+  // Using the input files assigned to this processor, read the input
+  // and push all-to-all to every other processor
+  // this->maxFiles is the maximum number to read on any processor
+  // Some processors may have no files to read but must still participate
+  // in the round robin distribution
+
+  for (int file = 0; file < this->maxFiles; file++) {
+
+    // Open file to read the data if any for this processor
+    ifstream* inStream = 0;
+    int firstParticle = 0;
+    int numberOfParticles = 0;
+    int remainingParticles = 0;
+
+    if ((int)this->inFiles.size() > file) {
+      inStream = new ifstream(this->inFiles[file].c_str(), ios::in|ios::binary);
+
+#ifndef USE_VTK_COSMO
+      cout << "Rank " << this->myProc << " open file " << inFiles[file]
+           << " with " << this->fileParticles[file] << " particles" << endl;
+#endif
+
+      // Number of particles read at one time depends on MPI buffer size
+      numberOfParticles = this->fileParticles[file];
+      if (numberOfParticles > this->maxRead)
+        numberOfParticles = this->maxRead;
+
+      // If a file is too large to be passed as an MPI message divide it up
+      remainingParticles = this->fileParticles[file];
+
+    } else {
+#ifndef USE_VTK_COSMO
+      cout << "Rank " << this->myProc << " no file to open " << endl;
+#endif
+    }
+
+    for (int piece = 0; piece < this->maxReadsPerFile; piece++) {
+      // Reset the comm buffers
+      for (int i = 0; i < numProc; ++i) {
+        pByProc[i].clear();
+      }
+
+      pBuffer.clear();
+      pRecvBuffer.clear();
+
+      // Processor has a file to read and share via round robin with others
+      if (file < (int)this->inFiles.size()) {
+        if (this->inputType == RECORD) {
+          readFromRecordFile(inStream, firstParticle, numberOfParticles,
+                             fBlock, iBlock, pByProc);
+        } else {
+          readFromBlockFile(inStream, firstParticle, numberOfParticles,
+                           this->fileParticles[file],
+                           lBlock, vBlock, iBlock, pByProc);
+        }
+        firstParticle += numberOfParticles;
+        remainingParticles -= numberOfParticles;
+        if (remainingParticles <= 0)
+          numberOfParticles = 0;
+        else if (remainingParticles < numberOfParticles)
+          numberOfParticles = remainingParticles;
+      }
+
+      // Record all sizes into a single buffer and send this to all ranks
+      long totalToSend = 0;
+      for (int i = 0; i < numProc; ++i) {
+        pCounts[i] = (int) pByProc[i].size();
+        totalToSend += pCounts[i];
+      }
+
+      MPI_Alltoall(&pCounts[0], 1, MPI_INT,
+                   &pRecvCounts[0], 1, MPI_INT,
+                   Partition::getComm());
+
+      // pRecvCounts now holds the number of particles that this rank should
+      // get from every other rank
+      long totalToRecv = 0;
+      for (int i = 0; i < numProc; ++i) {
+        totalToRecv += pRecvCounts[i];
+      }
+
+      // Allocate and pack the buffer with all particles to send
+      pBuffer.reserve(totalToSend);
+      for (int i = 0; i < numProc; ++i)
+        pBuffer.insert(pBuffer.end(), pByProc[i].begin(), pByProc[i].end());
+
+      // Calculate displacements
+      pDisp[0] = pRecvDisp[0] = 0;
+      for (int i = 1; i < numProc; ++i) {
+        pDisp[i] = pDisp[i-1] + pCounts[i-1];
+        pRecvDisp[i] = pRecvDisp[i-1] + pRecvCounts[i-1];
+      }
+
+      // Send all particles to their new homes.  &v[0] on an empty vector is
+      // undefined, so ranks with nothing to send/receive pass a null pointer
+      pRecvBuffer.resize(totalToRecv);
+      CosmoParticle* sendPtr = pBuffer.empty() ? 0 : &pBuffer[0];
+      CosmoParticle* recvPtr = pRecvBuffer.empty() ? 0 : &pRecvBuffer[0];
+      MPI_Alltoallv(sendPtr, &pCounts[0], &pDisp[0], CosmoParticleType,
+                    recvPtr, &pRecvCounts[0], &pRecvDisp[0], CosmoParticleType,
+                    Partition::getComm());
+
+      // We now have all of our particles, put them in our local arrays
+      for (long i = 0; i < totalToRecv; ++i) {
+        POSVEL_T loc[DIMENSION], vel[DIMENSION];
+        ID_T id;
+        // NOTE(review): no mass is unpacked from the received particle, so
+        // 0 is stored rather than an indeterminate value - confirm intent
+        POSVEL_T mass = 0;
+        int j = 0;
+        for (int dim = 0; dim < DIMENSION; dim++)
+          loc[dim] = pRecvBuffer[i].floatData[j++];
+        for (int dim = 0; dim < DIMENSION; dim++)
+          vel[dim] = pRecvBuffer[i].floatData[j++];
+        id = pRecvBuffer[i].intData[0];
+
+        // Is the particle ALIVE on this processor
+        if ((loc[0] >= minAlive[0] && loc[0] < maxAlive[0]) &&
+            (loc[1] >= minAlive[1] && loc[1] < maxAlive[1]) &&
+            (loc[2] >= minAlive[2] && loc[2] < maxAlive[2])) {
+
+          this->xx->push_back(loc[0]);
+          this->yy->push_back(loc[1]);
+          this->zz->push_back(loc[2]);
+          this->vx->push_back(vel[0]);
+          this->vy->push_back(vel[1]);
+          this->vz->push_back(vel[2]);
+          this->ms->push_back(mass);
+          this->tag->push_back(id);
+
+          this->numberOfAliveParticles++;
+          this->particleCount++;
+        }
+      }
+    }
+
+    // Can delete the read buffers as soon as last file is read because
+    // information has been transferred into the particle vectors
+    if (file == (this->maxFiles - 1)) {
+      if (this->inputType == RECORD) {
+        delete [] fBlock;
+        delete [] iBlock;
+      } else if (this->inputType == BLOCK) {
+        delete [] lBlock;
+        delete [] vBlock;
+        delete [] iBlock;
+      }
+    }
+
+    // Destroying the stream closes the file; deleting a null stream is a no-op
+    delete inStream;
+  }
+
+  // Count the particles across processors
+  long totalAliveParticles = 0;
+  MPI_Allreduce((void*) &this->numberOfAliveParticles,
+                (void*) &totalAliveParticles,
+                1, MPI_LONG, MPI_SUM, Partition::getComm());
+
+#ifndef USE_VTK_COSMO
+#ifdef DEBUG
+  cout << "Rank " << setw(3) << this->myProc
+       << " #alive = " << this->numberOfAliveParticles << endl;
+#endif
+
+  if (this->myProc == MASTER) {
+    cout << "TotalAliveParticles " << totalAliveParticles << endl;
+  }
+#endif
+
+  MPI_Type_free(&CosmoParticleType);
+
+  }
+}
+#endif // USE_SERIAL_COSMO
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Each processor reads 0 or more files, a buffer at a time, and shares
+// the particles by passing the buffer round robin to every other processor
+// reserveQ: nonzero reserves space in the particle vectors before reading
+/////////////////////////////////////////////////////////////////////////
+
+void ParticleDistribute::readParticlesRoundRobin(int reserveQ)
+{
+  // Find how many input files there are and deal them between the processors
+  // Calculates the max number of files per processor and max number of
+  // particles per file so that buffering can be done
+  // For round robin sharing determine where to send and receive buffers from
+  partitionInputFiles();
+
+  // Compute the total number of particles in the problem
+  // Compute the maximum number of particles in any one file to set buffer size
+  findFileParticleCount();
+
+  // If there is only one input file we don't have to do MPI messaging
+  // because each processor will read that same file and extract only
+  // the particles in range
+  if (this->numberOfFiles == 1) {
+    if (this->inputType == RECORD) {
+      readFromRecordFile();
+    } else {
+      readFromBlockFile();
+    }
+  } else {
+
+  // MPI buffer size might limit the number of particles read from a file
+  // and passed round robin
+  // Largest file will have a number of buffer chunks to send if it is too large
+  // Every processor must send that number of chunks even if its own file
+  // does not have that much information
+
+  if (ENFORCE_MAX_READ == true && this->maxParticles > MAX_READ) {
+    this->maxRead = MAX_READ;
+    this->maxReadsPerFile = (this->maxParticles / this->maxRead) + 1;
+  } else {
+    this->maxRead = this->maxParticles;
+    this->maxReadsPerFile = 1;
+  }
+
+  // Allocate space to hold buffer information for reading of files
+  // Mass is constant use that float to store the tag
+  // Number of particles is the first integer in the buffer
+  int bufferSize = sizeof(int) + (this->maxRead * RECORD_SIZE);
+  Message* message1 = new Message(bufferSize);
+  Message* message2 = new Message(bufferSize);
+
+  // Allocate space for the data read from the file
+  POSVEL_T *fBlock = 0;
+  POSVEL_T *lBlock = 0;
+  POSVEL_T *vBlock = 0;
+  ID_T* iBlock = 0;
+
+  // RECORD format reads one particle at a time
+  if (this->inputType == RECORD) {
+    fBlock = new POSVEL_T[COSMO_FLOAT];
+    iBlock = new ID_T[COSMO_INT];
+  }
+
+  // BLOCK format reads all particles at one time for triples
+  else if (this->inputType == BLOCK) {
+    lBlock = new POSVEL_T[this->maxRead * DIMENSION];
+    vBlock = new POSVEL_T[this->maxRead * DIMENSION];
+    iBlock = new ID_T[this->maxRead];
+  }
+
+  // Reserve particle storage to minimize reallocation
+  int reserveSize = (int) (this->maxFiles * this->maxParticles * DEAD_FACTOR);
+
+  // If multiple processors are reading the same file we can reduce size
+  reserveSize /= this->processorsPerFile;
+
+  if(reserveQ) {
+#ifndef USE_VTK_COSMO
+    cout << "readParticlesRoundRobin reserving vectors" << endl;
+#endif
+    this->xx->reserve(reserveSize);
+    this->yy->reserve(reserveSize);
+    this->zz->reserve(reserveSize);
+    this->vx->reserve(reserveSize);
+    this->vy->reserve(reserveSize);
+    this->vz->reserve(reserveSize);
+    this->ms->reserve(reserveSize);
+    this->tag->reserve(reserveSize);
+  }
+
+  // Running total and index into particle data on this processor
+  this->particleCount = 0;
+
+  // Using the input files assigned to this processor, read the input
+  // and push round robin to every other processor
+  // this->maxFiles is the maximum number to read on any processor
+  // Some processors may have no files to read but must still participate
+  // in the round robin distribution
+
+  for (int file = 0; file < this->maxFiles; file++) {
+
+    // Open file to read the data if any for this processor
+    ifstream* inStream = 0;
+    int firstParticle = 0;
+    int numberOfParticles = 0;
+    int remainingParticles = 0;
+
+    if ((int)this->inFiles.size() > file) {
+      inStream = new ifstream(this->inFiles[file].c_str(), ios::in|ios::binary);
+
+#ifndef USE_VTK_COSMO
+      cout << "Rank " << this->myProc << " open file " << inFiles[file]
+           << " with " << this->fileParticles[file] << " particles" << endl;
+#endif
+
+      // Number of particles read at one time depends on MPI buffer size
+      numberOfParticles = this->fileParticles[file];
+      if (numberOfParticles > this->maxRead)
+        numberOfParticles = this->maxRead;
+
+      // If a file is too large to be passed as an MPI message divide it up
+      remainingParticles = this->fileParticles[file];
+
+    } else {
+#ifndef USE_VTK_COSMO
+      cout << "Rank " << this->myProc << " no file to open " << endl;
+#endif
+    }
+
+    for (int piece = 0; piece < this->maxReadsPerFile; piece++) {
+
+      // Reset each MPI message for each file read
+      message1->reset();
+      message2->reset();
+
+      // Processor has a file to read and share via round robin with others
+      if (file < (int)this->inFiles.size()) {
+        if (this->inputType == RECORD) {
+          readFromRecordFile(inStream, firstParticle, numberOfParticles,
+                             fBlock, iBlock, message1);
+        } else {
+          readFromBlockFile(inStream, firstParticle, numberOfParticles,
+                           this->fileParticles[file],
+                           lBlock, vBlock, iBlock, message1);
+        }
+        firstParticle += numberOfParticles;
+        remainingParticles -= numberOfParticles;
+        if (remainingParticles <= 0)
+          numberOfParticles = 0;
+        else if (remainingParticles < numberOfParticles)
+          numberOfParticles = remainingParticles;
+      }
+
+      // Processor does not have a file to open but must participate in the
+      // round robin with an empty buffer
+      else {
+        // Store number of particles used in first position
+        int zero = 0;
+        message1->putValue(&zero);
+      }
+
+      // Particles belonging to this processor are put in vectors
+      distributeParticles(message1, message2);
+    }
+
+    // Can delete the read buffers as soon as last file is read because
+    // information has been transferred into the double buffer1
+    if (file == (this->maxFiles - 1)) {
+      if (this->inputType == RECORD) {
+        delete [] fBlock;
+        delete [] iBlock;
+      } else if (this->inputType == BLOCK) {
+        delete [] lBlock;
+        delete [] vBlock;
+        delete [] iBlock;
+      }
+    }
+
+    // Destroying the stream closes the file; deleting a null stream is a no-op
+    delete inStream;
+  }
+
+  // After all particles have been distributed to vectors the double
+  // buffers can be deleted
+  delete message1;
+  delete message2;
+
+  // Count the particles across processors
+  long totalAliveParticles = 0;
+#ifdef USE_SERIAL_COSMO
+  totalAliveParticles = this->numberOfAliveParticles;
+#else
+  MPI_Allreduce((void*) &this->numberOfAliveParticles,
+                (void*) &totalAliveParticles,
+                1, MPI_LONG, MPI_SUM, Partition::getComm());
+#endif
+
+#ifndef USE_VTK_COSMO
+#ifdef DEBUG
+  cout << "Rank " << setw(3) << this->myProc
+       << " #alive = " << this->numberOfAliveParticles << endl;
+#endif
+
+  if (this->myProc == MASTER) {
+    cout << "TotalAliveParticles " << totalAliveParticles << endl;
+  }
+#endif
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Using the base name of the data, go to the subdirectory and determine
+// how many input files there are.  Parcel those files between all the
+// processors which will be responsible for actually reading 0 or more.
+// force1PPF: when true, always use the one-processor-per-file layout.
+/////////////////////////////////////////////////////////////////////////
+
+void ParticleDistribute::partitionInputFiles(bool force1PPF)
+{
+  // Find number of input files for this problem given the base input name
+  // Get the subdirectory name containing the input files
+  string::size_type dirPos = this->baseFile.rfind("/");
+  string subdirectory;
+  string baseName;
+
+  // If the directory is not given use the current directory
+  if (dirPos == string::npos) {
+    subdirectory = "./";
+    baseName = this->baseFile;
+  } else {
+    subdirectory = this->baseFile.substr(0, dirPos + 1);
+    baseName = this->baseFile.substr(dirPos + 1);
+  }
+
+  // strip everything back to the first non-number
+  // (an empty base name has no last character to inspect, so skip the scan)
+  string::size_type pos = baseName.size() - 1;
+  int numbersOK = baseName.empty() ? 0 : 1;
+  while(numbersOK)
+    {
+    if(baseName[pos] >= '0' && baseName[pos] <= '9')
+      {
+      if(pos > 0)
+        {
+        pos = pos - 1;
+        }
+      else
+        {
+        break;
+        }
+      }
+    else
+      {
+      numbersOK = 0;
+      }
+    }
+
+  // base name is everything up to the numbers
+  baseName = baseName.substr(0, pos + 1);
+
+  // Open the subdirectory and make a list of input files
+  DIR* directory = opendir(subdirectory.c_str());
+  struct dirent* directoryEntry;
+  vector<string> files;
+  // NOTE(review): files are dealt out in readdir() order, which is unspecified
+  if (directory != NULL) {
+  while ((directoryEntry = readdir(directory)))
+    {
+    // get the name
+    string fileName = directoryEntry->d_name;
+    pos = fileName.find(baseName.c_str());
+
+    // if it starts with the base name (an empty base name matches nothing)
+    if(pos == 0 && !baseName.empty())
+      {
+      // check to see if it is all numbers on the end
+      pos = baseName.size() + 1;
+      numbersOK = 1;
+
+      while(pos < fileName.size())
+        {
+        if(fileName[pos] < '0' || fileName[pos] > '9')
+          {
+          numbersOK = 0;
+          break;
+          }
+
+        pos = pos + 1;
+        }
+
+      if(numbersOK)
+        {
+        fileName = subdirectory + fileName;
+        files.push_back(fileName);
+        }
+      }
+    }
+
+  closedir(directory);
+  }
+
+  this->numberOfFiles = (int)files.size();
+
+  if (this->numberOfFiles == 0) {
+#ifdef USE_VTK_COSMO
+    // Format the rank as a number; "string += int" would append it as a char
+    ostringstream temp;
+    temp << "Processor " << this->myProc << " found no input files.\n";
+    vtkOutputWindowDisplayErrorText(temp.str().c_str());
+
+    return;
+#else
+    cout << "Rank " << this->myProc << " found no input files" << endl;
+    exit(1);
+#endif
+  }
+
+#ifndef USE_VTK_COSMO
+#ifdef DEBUG
+  if (this->myProc == MASTER) {
+    for (int i = 0; i < this->numberOfFiles; i++)
+      cout << "   File " << i << ": " << files[i] << endl;
+  }
+#endif
+#endif
+
+  // Divide the files between all the processors
+  // If there are 1 or more files per processor set the
+  // buffering up with a full round robin between all processors
+  if (this->numberOfFiles >= this->numProc || force1PPF) {
+
+    // Number of round robin sends to share all the files
+    this->processorsPerFile = 1;
+    this->numberOfFileSends = this->numProc - 1;
+    this->maxFileSends = this->numberOfFileSends;
+
+    // Which files does this processor read
+    for (int i = 0; i < this->numberOfFiles; i++)
+      if ((i % this->numProc) == this->myProc)
+        this->inFiles.push_back(files[i]);
+
+    // Where is the file sent, and where is it received
+    if (this->myProc == this->numProc - 1)
+      this->nextProc = 0;
+    else
+      this->nextProc = this->myProc + 1;
+    if (this->myProc == 0)
+      this->prevProc = this->numProc - 1;
+    else
+      this->prevProc = this->myProc - 1;
+  }
+
+  // If there are more processors than files, set up as many round robin loops
+  // as possible so that multiple processors read the same file. If the number
+  // of files does not divide evenly into the number of processors the last
+  // round robin loop will be bigger and some processors will contribute
+  // buffers of 0 size to send
+
+  else {
+
+    // Assign the round robin circle (last circle is bigger than others)
+    this->processorsPerFile = this->numProc / this->numberOfFiles;
+    int numberOfRoundRobinCircles = this->processorsPerFile;
+    int myCircle = this->myProc / this->numberOfFiles;
+    int extraProcessors = this->numProc -
+            (numberOfRoundRobinCircles * this->numberOfFiles);
+    if (myCircle == numberOfRoundRobinCircles)
+      myCircle--;
+
+    int firstInCircle = myCircle * this->numberOfFiles;
+    int lastInCircle = firstInCircle + this->numberOfFiles - 1;
+    if (myCircle == (numberOfRoundRobinCircles - 1))
+      lastInCircle += extraProcessors;
+
+    // How big is the round robin circle this processor is in
+    // What is the biggest round robin circle (needed because of MPI_Barrier)
+    this->numberOfFileSends = lastInCircle - firstInCircle;
+    this->maxFileSends = this->numberOfFiles + extraProcessors;
+
+    // Which file does this processor read
+    int index = this->myProc % this->numberOfFiles;
+    if (myCircle == (this->myProc / this->numberOfFiles))
+      this->inFiles.push_back(files[index]);
+
+    // Where is the file sent, and where is it received
+    if (this->myProc == lastInCircle)
+      this->nextProc = firstInCircle;
+    else
+      this->nextProc = this->myProc + 1;
+    if (this->myProc == firstInCircle)
+      this->prevProc = lastInCircle;
+    else
+      this->prevProc = this->myProc - 1;
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Open each input file belonging to this processor and find the number
+// of particles for setting buffer sizes.
+//
+// For every file in inFiles:
+//   RECORD input - the count is the file length divided by RECORD_SIZE
+//   BLOCK input  - readGadgetHeader() is called and the count is taken
+//                  from gadgetParticleCount
+// Each count is appended to fileParticles in file order.
+//
+// Results stored on the object:
+//   totalParticles - sum of the per-file counts over all processors
+//                    (shared files are contributed by one set of ranks)
+//   maxParticles   - largest count found in any single file
+//   maxFiles       - largest number of files held by any processor
+// (plain local values when USE_SERIAL_COSMO is defined, MPI_Allreduce
+// over Partition::getComm() otherwise)
+//
+// If a file cannot be opened the VTK build reports the error, zeroes
+// totalParticles and maxParticles and returns; other builds print a
+// message and exit.
+//
+/////////////////////////////////////////////////////////////////////////
+
+void ParticleDistribute::findFileParticleCount()
+{
+  // Local tallies for this rank: particles summed over its files, and the
+  // largest count found in any single file (used to size buffers)
+  int myFileCount = (int)this->inFiles.size();
+  long localTotal = 0;
+  long localMax = 0;
+
+  // Walk the files assigned to this rank
+  for (int f = 0; f < myFileCount; f++) {
+
+    // Open the file; the stream lives on the stack and closes on scope exit
+    ifstream inStream(this->inFiles[f].c_str(), ios::in);
+    if (inStream.fail()) {
+#ifdef USE_VTK_COSMO
+      vtkStdString message = "File ";
+      message += this->inFiles[f];
+      message += " cannot be opened.\n";
+      vtkOutputWindowDisplayErrorText(message.c_str());
+
+      this->totalParticles = 0;
+      this->maxParticles = 0;
+      return;
+#else
+      cout << "File: " << this->inFiles[f] << " cannot be opened" << endl;
+      exit (-1);
+#endif
+    }
+
+    // Determine this file's particle count according to the input layout
+    bool haveCount = false;
+    int fileCount = 0;
+    if (this->inputType == RECORD) {
+      // Fixed-size records: count = file length / RECORD_SIZE
+      inStream.seekg(0L, ios::end);
+      fileCount = inStream.tellg() / RECORD_SIZE;
+      haveCount = true;
+    }
+    else if (this->inputType == BLOCK) {
+      // The count is read back from gadgetParticleCount after the header
+      readGadgetHeader(&inStream);
+      fileCount = this->gadgetParticleCount;
+      haveCount = true;
+    }
+
+    // Any other input type contributes nothing for this file
+    if (haveCount) {
+      this->fileParticles.push_back(fileCount);
+      localTotal += fileCount;
+      if (fileCount > localMax)
+        localMax = fileCount;
+    }
+
+    inStream.close();
+  }
+
+  // If multiple processors read the same file, only ranks below
+  // numberOfFiles contribute so the reduce is done on just one set
+  if (this->processorsPerFile > 1 && this->myProc >= this->numberOfFiles) {
+    localTotal = 0;
+    localMax = 0;
+  }
+
+  // Publish the results: total particles, the largest per-file count
+  // (buffer sizing) and the largest per-rank file count (loop bound)
+#ifdef USE_SERIAL_COSMO
+  this->totalParticles = localTotal;
+  this->maxParticles = localMax;
+  this->maxFiles = myFileCount;
+#else
+  MPI_Allreduce((void*) &localTotal, (void*) &this->totalParticles,
+                1, MPI_LONG, MPI_SUM, Partition::getComm());
+  MPI_Allreduce((void*) &localMax, (void*) &this->maxParticles,
+                1, MPI_LONG, MPI_MAX, Partition::getComm());
+  MPI_Allreduce((void*) &myFileCount, (void*) &this->maxFiles,
+                1, MPI_INT, MPI_MAX, Partition::getComm());
+#endif
+
+#if !defined(USE_VTK_COSMO) && defined(DEBUG)
+  if (this->myProc == MASTER) {
+    cout << "Total particle count: " << this->totalParticles << endl;
+    cout << "Max particle count:   " << this->maxParticles << endl;
+  }
+#endif
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Each processor reads 0 or more files, a buffer at a time.
+// The particles are processed by seeing if they are in the subextent of
+// this processor and are tagged either ALIVE or if dead, by the index of
+// the neighbor zone which contains that particle.  That buffer is sent
+// round robin to (myProc + 1) % numProc where it is processed and sent on.
+// After each processor reads one buffer and sends and receives numProc - 1
+// times the next buffer from the file is read.  Must use a double buffering
+// scheme so that on each send/recv we switch buffers.
+//
+// Input files may be BLOCK or RECORD structured
+//
+/////////////////////////////////////////////////////////////////////////
+
+void ParticleDistribute::distributeParticles(
+                Message* message1,      // Send/receive buffers
+                Message* message2)      // Send/receive buffers
+{
+  // message1 arrives holding this processor's particles (or a zero count);
+  // message2 collects whatever this processor does not keep
+  Message* incoming = message1;
+  Message* outgoing = message2;
+
+  // First pass over the buffer that was filled from the file
+  collectLocalParticles(incoming, outgoing);
+
+  // Every rank iterates maxFileSends times so the barriers below line up,
+  // but only the first numberOfFileSends iterations move any data
+  int step = 0;
+  while (step < this->maxFileSends) {
+    const bool circleActive = (step < this->numberOfFileSends);
+
+    if (circleActive) {
+      // Pass unclaimed particles to nextProc, take a buffer from prevProc
+      outgoing->send(this->nextProc);
+      incoming->receive(this->prevProc);
+    }
+
+#ifndef USE_SERIAL_COSMO
+    MPI_Barrier(Partition::getComm());
+#endif
+
+    // Keep what belongs here and repack the rest for the next hop
+    if (circleActive)
+      collectLocalParticles(incoming, outgoing);
+
+#ifndef USE_SERIAL_COSMO
+    MPI_Barrier(Partition::getComm());
+#endif
+
+    step++;
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Input file is RECORD structured so read each particle record and populate
+// the double buffer in particle order for the rest of the processing
+//
+/////////////////////////////////////////////////////////////////////////////
+
+void ParticleDistribute::readFromRecordFile(
+                        ifstream* inStream,     // Stream to read from
+                        int firstParticle,      // First particle index
+                        int numberOfParticles,  // Number to read this time
+                        POSVEL_T* fBlock,       // Buffer for read in data
+                        ID_T* iBlock,           // Buffer for read in data
+                        Message* message)       // Reordered data
+{
+  // Store number of particles used in first position
+  message->putValue(&numberOfParticles);
+  if (numberOfParticles == 0)
+    return;
+
+  // Seek to the first wanted record; 64-bit offset so large files work
+  std::streamoff skip = std::streamoff(RECORD_SIZE) * firstParticle;
+  inStream->seekg(skip, ios::beg);
+
+  // Store each particle location, velocity, mass and tag in the buffer
+  for (int p = 0; p < numberOfParticles; p++) {
+
+    // Read the floating point part of the record (COSMO_FLOAT values)
+    inStream->read(reinterpret_cast<char*>(fBlock),
+                   COSMO_FLOAT * sizeof(POSVEL_T));
+
+    if (inStream->gcount() != COSMO_FLOAT * sizeof(POSVEL_T)) {
+#ifdef USE_VTK_COSMO
+      vtkOutputWindowDisplayErrorText("Premature end-of-file.\n");
+      message->putValueAtPosition(&p, 0);   // Only p particles were stored
+      return;
+#else
+      cout << "Premature end-of-file" << endl;
+      exit (-1);
+#endif
+    }
+
+    // Convert units if requested (locations at 0,2,4 and mass at 6)
+    fBlock[0] *= this->distConvertFactor;
+    fBlock[2] *= this->distConvertFactor;
+    fBlock[4] *= this->distConvertFactor;
+    fBlock[6] *= this->massConvertFactor;
+
+    inStream->read(reinterpret_cast<char*>(iBlock),
+                   COSMO_INT * sizeof(ID_T));
+
+    if (inStream->gcount() != COSMO_INT * sizeof(ID_T)) {
+#ifdef USE_VTK_COSMO
+      vtkOutputWindowDisplayErrorText("Premature end-of-file.\n");
+      message->putValueAtPosition(&p, 0);   // Only p particles were stored
+      return;
+#else
+      cout << "Premature end-of-file" << endl;
+      exit (-1);
+#endif
+    }
+
+    // A location at or beyond boxSize is wrapped around by one box length
+    for (int i = 0; i <= 4; i = i + 2) {
+      if (fBlock[i] >= this->boxSize) {
+#ifndef USE_VTK_COSMO
+#ifdef DEBUG
+        cout << "Location at " << i << " changed from " << fBlock[i] << endl;
+#endif
+#endif
+        fBlock[i] -= this->boxSize;
+      }
+    }
+
+    // Store location and velocity and mass in message buffer
+    // Reorder so that location vector is followed by velocity vector
+    message->putValue(&fBlock[0]);
+    message->putValue(&fBlock[2]);
+    message->putValue(&fBlock[4]);
+    message->putValue(&fBlock[1]);
+    message->putValue(&fBlock[3]);
+    message->putValue(&fBlock[5]);
+    message->putValue(&fBlock[6]);
+
+    // Store the integer tag
+    message->putValue(&iBlock[0]);
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Input file is BLOCK structured so read the header and each block of data.
+// Gadget format:
+//    SKIP_GADGET_2 has extra 16 bytes
+//    SKIP_H 4 bytes (size of header)
+//    Header (6 types of particles with counts and masses)
+//    SKIP_H 4 bytes (size of header)
+//
+//    SKIP_GADGET_2 has extra 16 bytes
+//    SKIP_L 4 bytes (size of location block in bytes)
+//    Block of location data where each particle's x,y,z is stored together
+//    SKIP_L 4 bytes (size of location block in bytes)
+//
+//    SKIP_GADGET_2 has extra 16 bytes
+//    SKIP_V 4 bytes (size of velocity block in bytes)
+//    Block of velocity data where each particle's xv,yv,zv is stored together
+//    SKIP_V 4 bytes (size of velocity block in bytes)
+//
+//    SKIP_GADGET_2 has extra 16 bytes
+//    SKIP_T 4 bytes (size of tag block in bytes)
+//    Block of tag data
+//    SKIP_T 4 bytes (size of tag block in bytes)
+//
+// Reorder the data after it is read into the same structure as the
+// RECORD data so that the rest of the code does not have to be changed
+//
+/////////////////////////////////////////////////////////////////////////////
+
+void ParticleDistribute::readFromBlockFile(
+                        ifstream* inStream,     // Stream to read from
+                        int firstParticle,      // First particle index
+                        int numberOfParticles,  // Number to read this time
+                        int totParticles,       // Total particles in file
+                        POSVEL_T* lBlock,       // Buffer for read of location
+                        POSVEL_T* vBlock,       // Buffer for read of velocity
+                        ID_T* iBlock,           // Buffer for read in data
+                        Message* message)       // Reordered data
+{
+  // Store number of particles used in first position
+  message->putValue(&numberOfParticles);
+  if (numberOfParticles == 0)
+    return;
+
+  // Absolute 64-bit byte offsets of the location, velocity and tag data
+  std::streamoff skipToLocation = 0;
+  if (this->gadgetFormat == GADGET_2)
+    skipToLocation += GADGET_2_SKIP;
+  skipToLocation += GADGET_SKIP;                // Size of header
+  skipToLocation += GADGET_HEADER_SIZE;         // Header
+  skipToLocation += GADGET_SKIP;                // Size of header
+  if (this->gadgetFormat == GADGET_2)
+    skipToLocation += GADGET_2_SKIP;
+  skipToLocation += GADGET_SKIP;                // Size of location block
+
+  std::streamoff skipToVelocity = skipToLocation;
+  skipToVelocity += std::streamoff(totParticles) * DIMENSION * sizeof(POSVEL_T);
+  skipToVelocity += GADGET_SKIP;                // Size of location block
+  if (this->gadgetFormat == GADGET_2)
+    skipToVelocity += GADGET_2_SKIP;            // Extra bytes before velocity
+  skipToVelocity += GADGET_SKIP;                // Size of velocity block
+
+  std::streamoff skipToTag = skipToVelocity;
+  skipToTag += std::streamoff(totParticles) * DIMENSION * sizeof(POSVEL_T);
+  skipToTag += GADGET_SKIP;                     // Size of velocity block
+  if (this->gadgetFormat == GADGET_2)
+    skipToTag += GADGET_2_SKIP;                 // Extra bytes before tags
+  skipToTag += GADGET_SKIP;                     // Size of tag block
+
+  // Seek to the first requested particle location and read triples
+  inStream->seekg(skipToLocation, ios::beg);
+  std::streamoff skip = std::streamoff(firstParticle) * DIMENSION * sizeof(POSVEL_T);
+  inStream->seekg(skip, ios::cur);
+
+  readData(this->gadgetSwap, (void*) lBlock, sizeof(POSVEL_T),
+                 DIMENSION * numberOfParticles, inStream);
+
+  // Convert units of distance
+  for (int i = 0; i < DIMENSION*numberOfParticles; i++)
+    lBlock[i] *= this->distConvertFactor;
+
+  // A coordinate at or beyond boxSize is wrapped around by one box length
+  for (int i = 0; i < DIMENSION*numberOfParticles; i++) {
+    if (lBlock[i] >= this->boxSize)
+      lBlock[i] -= this->boxSize;
+  }
+
+  // Seek to first requested particle velocity and read triples
+  inStream->seekg(skipToVelocity, ios::beg);
+  skip = std::streamoff(firstParticle) * DIMENSION * sizeof(POSVEL_T);
+  inStream->seekg(skip, ios::cur);
+
+  readData(this->gadgetSwap, (void*) vBlock, sizeof(POSVEL_T),
+                 DIMENSION * numberOfParticles, inStream);
+
+  // Seek to first requested particle tag and read
+  inStream->seekg(skipToTag, ios::beg);
+  skip = std::streamoff(firstParticle) * sizeof(ID_T);     // skip to tag
+  inStream->seekg(skip, ios::cur);
+
+  readData(this->gadgetSwap, (void*) iBlock, sizeof(ID_T),
+                 numberOfParticles, inStream);
+
+  // Store the locations in the message buffer in record order
+  // so that the same distribution method for RECORD will work
+  int particlesRemaining = numberOfParticles;
+  int curParticle = firstParticle;      // Index within the whole file
+  int type = 0;                         // Gadget particle type in use
+  int indx = 0;                         // Index into lBlock and vBlock
+  int tagindx = 0;                      // Index into iBlock
+
+  // When more than one gadget particle type is in the file we must
+  // know what mass to assign to the current piece read
+  while (particlesRemaining > 0) {
+
+    // Advance to the last type whose gadgetStart is <= curParticle
+    while (type < NUM_GADGET_TYPES && curParticle >= this->gadgetStart[type])
+      type++;
+    type--;
+
+    POSVEL_T particleMass =
+      (POSVEL_T) this->gadgetHeader.mass[type] * this->massConvertFactor;
+
+    // Place particles of this type and mass in the buffer
+    int numLeftInType = this->gadgetHeader.npart[type] -
+                        (curParticle - this->gadgetStart[type]);
+    int count = min(particlesRemaining, numLeftInType);
+
+    for (int p = 0; p < count; p++) {
+
+      // Locations
+      message->putValue(&lBlock[indx]);           // X location
+      message->putValue(&lBlock[indx+1]);         // Y location
+      message->putValue(&lBlock[indx+2]);         // Z location
+
+      // Velocities
+      message->putValue(&vBlock[indx]);           // X velocity
+      message->putValue(&vBlock[indx+1]);         // Y velocity
+      message->putValue(&vBlock[indx+2]);         // Z velocity
+
+      // Mass
+      message->putValue(&particleMass);
+
+      // Id tag
+      message->putValue(&iBlock[tagindx]);
+      indx += DIMENSION;
+      tagindx++;
+    }
+
+    // Do we have more particles of a different type
+    particlesRemaining -= count;
+    curParticle += count;
+  }
+}
+
+#ifndef USE_SERIAL_COSMO
+/////////////////////////////////////////////////////////////////////////////
+//
+// Input file is RECORD structured so read each particle record and append
+// it to the particle list (pByProc) of the rank to which it belongs
+//
+/////////////////////////////////////////////////////////////////////////////
+
+void ParticleDistribute::readFromRecordFile(
+                        ifstream* inStream,     // Stream to read from
+                        int firstParticle,      // First particle index
+                        int numberOfParticles,  // Number to read this time
+                        POSVEL_T* fBlock,       // Buffer for read in data
+                        ID_T* iBlock,           // Buffer for read in data
+                        std::vector< std::vector<CosmoParticle> > &pByProc)
+{
+  if (numberOfParticles == 0)
+    return;
+
+  // Seek to the first wanted record; 64-bit offset so large files work
+  std::streamoff skip = std::streamoff(RECORD_SIZE) * firstParticle;
+  inStream->seekg(skip, ios::beg);
+
+  // Per-dimension cell size used to map a location to grid coordinates
+  float sizeX = this->boxSize / this->layoutSize[0];
+  float sizeY = this->boxSize / this->layoutSize[1];
+  float sizeZ = this->boxSize / this->layoutSize[2];
+
+  // Read each record and file it under the rank that owns its location
+  for (int p = 0; p < numberOfParticles; p++) {
+
+    // Read the floating point part of the record (COSMO_FLOAT values)
+    inStream->read(reinterpret_cast<char*>(fBlock),
+                   COSMO_FLOAT * sizeof(POSVEL_T));
+
+    if (inStream->gcount() != COSMO_FLOAT * sizeof(POSVEL_T)) {
+#ifdef USE_VTK_COSMO
+      vtkOutputWindowDisplayErrorText("Premature end-of-file.\n");
+      return;
+#else
+      cout << "Premature end-of-file" << endl;
+      exit (-1);
+#endif
+    }
+
+    // Convert units if requested (locations at 0,2,4 and mass at 6)
+    fBlock[0] *= this->distConvertFactor;
+    fBlock[2] *= this->distConvertFactor;
+    fBlock[4] *= this->distConvertFactor;
+    fBlock[6] *= this->massConvertFactor;
+
+    inStream->read(reinterpret_cast<char*>(iBlock),
+                   COSMO_INT * sizeof(ID_T));
+
+    if (inStream->gcount() != COSMO_INT * sizeof(ID_T)) {
+#ifdef USE_VTK_COSMO
+      vtkOutputWindowDisplayErrorText("Premature end-of-file.\n");
+      return;
+#else
+      cout << "Premature end-of-file" << endl;
+      exit (-1);
+#endif
+    }
+
+    // A location at or beyond boxSize is wrapped around by one box length
+    for (int i = 0; i <= 4; i = i + 2) {
+      if (fBlock[i] >= this->boxSize) {
+#ifndef USE_VTK_COSMO
+#ifdef DEBUG
+        cout << "Location at " << i << " changed from " << fBlock[i] << endl;
+#endif
+#endif
+        fBlock[i] -= this->boxSize;
+      }
+    }
+
+    // Figure out to which rank this particle belongs
+    int coords[3] = { (int) (fBlock[0]/sizeX),
+                      (int) (fBlock[2]/sizeY),
+                      (int) (fBlock[4]/sizeZ)
+                    };
+    int home;
+    MPI_Cart_rank(Partition::getComm(), coords, &home);
+
+    // Store location and velocity and mass
+    // Reorder so that location vector is followed by velocity vector
+    CosmoParticle particle = { { fBlock[0],
+                                 fBlock[2],
+                                 fBlock[4],
+                                 fBlock[1],
+                                 fBlock[3],
+                                 fBlock[5],
+                                 fBlock[6]
+                               }, {
+                                 static_cast<int>(iBlock[0])
+                               }
+                             };
+    pByProc[home].push_back(particle);
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Input file is BLOCK structured so read the header and each block of data.
+// Gadget format:
+//    SKIP_GADGET_2 has extra 16 bytes
+//    SKIP_H 4 bytes (size of header)
+//    Header (6 types of particles with counts and masses)
+//    SKIP_H 4 bytes (size of header)
+//
+//    SKIP_GADGET_2 has extra 16 bytes
+//    SKIP_L 4 bytes (size of location block in bytes)
+//    Block of location data where each particle's x,y,z is stored together
+//    SKIP_L 4 bytes (size of location block in bytes)
+//
+//    SKIP_GADGET_2 has extra 16 bytes
+//    SKIP_V 4 bytes (size of velocity block in bytes)
+//    Block of velocity data where each particle's xv,yv,zv is stored together
+//    SKIP_V 4 bytes (size of velocity block in bytes)
+//
+//    SKIP_GADGET_2 has extra 16 bytes
+//    SKIP_T 4 bytes (size of tag block in bytes)
+//    Block of tag data
+//    SKIP_T 4 bytes (size of tag block in bytes)
+//
+// Reorder the data after it is read into the same structure as the
+// RECORD data so that the rest of the code does not have to be changed
+//
+/////////////////////////////////////////////////////////////////////////////
+
+void ParticleDistribute::readFromBlockFile(
+                        ifstream* inStream,     // Stream to read from
+                        int firstParticle,      // First particle index
+                        int numberOfParticles,  // Number to read this time
+                        int totParticles,       // Total particles in file
+                        POSVEL_T* lBlock,       // Buffer for read of location
+                        POSVEL_T* vBlock,       // Buffer for read of velocity
+                        ID_T* iBlock,           // Buffer for read in data
+                        std::vector< std::vector<CosmoParticle> > &pByProc)
+{
+  if (numberOfParticles == 0)
+    return;
+
+  // Absolute 64-bit byte offsets of the location, velocity and tag data
+  std::streamoff skipToLocation = 0;
+  if (this->gadgetFormat == GADGET_2)
+    skipToLocation += GADGET_2_SKIP;
+  skipToLocation += GADGET_SKIP;                // Size of header
+  skipToLocation += GADGET_HEADER_SIZE;         // Header
+  skipToLocation += GADGET_SKIP;                // Size of header
+  if (this->gadgetFormat == GADGET_2)
+    skipToLocation += GADGET_2_SKIP;
+  skipToLocation += GADGET_SKIP;                // Size of location block
+
+  std::streamoff skipToVelocity = skipToLocation;
+  skipToVelocity += std::streamoff(totParticles) * DIMENSION * sizeof(POSVEL_T);
+  skipToVelocity += GADGET_SKIP;                // Size of location block
+  if (this->gadgetFormat == GADGET_2)
+    skipToVelocity += GADGET_2_SKIP;            // Extra bytes before velocity
+  skipToVelocity += GADGET_SKIP;                // Size of velocity block
+
+  std::streamoff skipToTag = skipToVelocity;
+  skipToTag += std::streamoff(totParticles) * DIMENSION * sizeof(POSVEL_T);
+  skipToTag += GADGET_SKIP;                     // Size of velocity block
+  if (this->gadgetFormat == GADGET_2)
+    skipToTag += GADGET_2_SKIP;                 // Extra bytes before tags
+  skipToTag += GADGET_SKIP;                     // Size of tag block
+
+  // Seek to the first requested particle location and read triples
+  inStream->seekg(skipToLocation, ios::beg);
+  std::streamoff skip = std::streamoff(firstParticle) * DIMENSION * sizeof(POSVEL_T);
+  inStream->seekg(skip, ios::cur);
+
+  readData(this->gadgetSwap, (void*) lBlock, sizeof(POSVEL_T),
+                 DIMENSION * numberOfParticles, inStream);
+
+  // Convert units of distance
+  for (int i = 0; i < DIMENSION*numberOfParticles; i++)
+    lBlock[i] *= this->distConvertFactor;
+
+  // A coordinate at or beyond boxSize is wrapped around by one box length
+  for (int i = 0; i < DIMENSION*numberOfParticles; i++) {
+    if (lBlock[i] >= this->boxSize)
+      lBlock[i] -= this->boxSize;
+  }
+
+  // Seek to first requested particle velocity and read triples
+  inStream->seekg(skipToVelocity, ios::beg);
+  skip = std::streamoff(firstParticle) * DIMENSION * sizeof(POSVEL_T);
+  inStream->seekg(skip, ios::cur);
+
+  readData(this->gadgetSwap, (void*) vBlock, sizeof(POSVEL_T),
+                 DIMENSION * numberOfParticles, inStream);
+
+  // Seek to first requested particle tag and read
+  inStream->seekg(skipToTag, ios::beg);
+  skip = std::streamoff(firstParticle) * sizeof(ID_T);     // skip to tag
+  inStream->seekg(skip, ios::cur);
+
+  readData(this->gadgetSwap, (void*) iBlock, sizeof(ID_T),
+                 numberOfParticles, inStream);
+
+  // Walk the particles just read in file order, assigning the mass of
+  // their gadget type and filing each one in pByProc under its home rank
+  int particlesRemaining = numberOfParticles;
+  int curParticle = firstParticle;      // Index within the whole file
+  int type = 0;                         // Gadget particle type in use
+  int indx = 0;                         // Index into lBlock and vBlock
+  int tagindx = 0;                      // Index into iBlock
+
+  // When more than one gadget particle type is in the file we must
+  // know what mass to assign to the current piece read
+  while (particlesRemaining > 0) {
+
+    // Advance to the last type whose gadgetStart is <= curParticle
+    while (type < NUM_GADGET_TYPES && curParticle >= this->gadgetStart[type])
+      type++;
+    type--;
+
+    POSVEL_T particleMass =
+      (POSVEL_T) this->gadgetHeader.mass[type] * this->massConvertFactor;
+
+    // Number of particles of this type and mass in the current piece
+    int numLeftInType = this->gadgetHeader.npart[type] -
+                        (curParticle - this->gadgetStart[type]);
+    int count = min(particlesRemaining, numLeftInType);
+
+    float sizeX = this->boxSize / this->layoutSize[0];
+    float sizeY = this->boxSize / this->layoutSize[1];
+    float sizeZ = this->boxSize / this->layoutSize[2];
+
+    for (int p = 0; p < count; p++) {
+      // Figure out to which rank this particle belongs
+      int coords[3] = { (int) (lBlock[indx]/sizeX),
+                        (int) (lBlock[indx+1]/sizeY),
+                        (int) (lBlock[indx+2]/sizeZ)
+                      };
+      int home;
+      MPI_Cart_rank(Partition::getComm(), coords, &home);
+
+      // Store location and velocity and mass
+      // Reorder so that location vector is followed by velocity vector
+      CosmoParticle particle = { { lBlock[indx],   // X location
+                                   lBlock[indx+1], // Y location
+                                   lBlock[indx+2], // Z location
+                                   vBlock[indx],   // X velocity
+                                   vBlock[indx+1], // Y velocity
+                                   vBlock[indx+2], // Z velocity
+                                   particleMass
+                                 }, {
+                                   static_cast<int>(iBlock[tagindx])
+                                 }
+                               };
+      pByProc[home].push_back(particle);
+
+      indx += DIMENSION;
+      tagindx++;
+    }
+
+    // Do we have more particles of a different type
+    particlesRemaining -= count;
+    curParticle += count;
+  }
+}
+#endif // USE_SERIAL_COSMO
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Process the data buffer of particles to choose those which are ALIVE
+// or DEAD on this processor.  Do wraparound tests to populate as for a
+// 3D torus.  Dead particle status is the zone id of the neighbor processor
+// which contains it as an ALIVE particle.
+//
+/////////////////////////////////////////////////////////////////////////////
+
+void ParticleDistribute::collectLocalParticles(
+                Message* recvMessage,      // Read particles and extract
+                Message* sendMessage)      // Other particles copied here
+{
+  // Rewind both buffers: one is about to be read, the other refilled
+  recvMessage->reset();
+  sendMessage->reset();
+
+  // Leading word of each buffer is its particle count; the outgoing count
+  // is written as a placeholder now and patched once the total is known
+  int incomingCount;
+  recvMessage->getValue(&incomingCount);
+  int forwardedCount = 0;
+  sendMessage->putValue(&forwardedCount);
+
+  POSVEL_T pos[DIMENSION], velocity[DIMENSION], mass;
+  ID_T id;
+
+  for (int n = 0; n < incomingCount; n++) {
+    // Unpack one particle: location, velocity, mass, tag
+    for (int d = 0; d < DIMENSION; d++)
+      recvMessage->getValue(&pos[d]);
+    for (int d = 0; d < DIMENSION; d++)
+      recvMessage->getValue(&velocity[d]);
+    recvMessage->getValue(&mass);
+    recvMessage->getValue(&id);
+
+    // Inside the half-open box [minAlive, maxAlive) in every dimension?
+    bool insideX = (pos[0] >= minAlive[0]) && (pos[0] < maxAlive[0]);
+    bool insideY = (pos[1] >= minAlive[1]) && (pos[1] < maxAlive[1]);
+    bool insideZ = (pos[2] >= minAlive[2]) && (pos[2] < maxAlive[2]);
+
+    if (!(insideX && insideY && insideZ)) {
+      // Not ours: append it to the outgoing buffer for the next processor
+      forwardedCount++;
+      for (int d = 0; d < DIMENSION; d++)
+        sendMessage->putValue(&pos[d]);
+      for (int d = 0; d < DIMENSION; d++)
+        sendMessage->putValue(&velocity[d]);
+      sendMessage->putValue(&mass);
+      sendMessage->putValue(&id);
+      continue;
+    }
+
+    // Ours: store it as an ALIVE particle on this processor
+    this->xx->push_back(pos[0]);
+    this->yy->push_back(pos[1]);
+    this->zz->push_back(pos[2]);
+    this->vx->push_back(velocity[0]);
+    this->vy->push_back(velocity[1]);
+    this->vz->push_back(velocity[2]);
+    this->ms->push_back(mass);
+    this->tag->push_back(id);
+    this->numberOfAliveParticles++;
+    this->particleCount++;
+  }
+  // Overwrite the send buffer first word with the known number of particles
+  sendMessage->putValueAtPosition(&forwardedCount, 0);
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Each processor reads 1 file or gets a pointer to data eventually
+// As the particle is read it will be stored as an alive particle on this
+// processor and will be checked about neighbor ranges to see if it must
+// be exchanged
+//
+/////////////////////////////////////////////////////////////////////////
+
+void ParticleDistribute::readParticlesOneToOne(int reserveQ)
+{
+  // This rank's input file is baseFile with the rank number appended.
+  // Because MPI Cartesian topology is used the arrangement of files in
+  // physical space must follow the rule of last dimension varies fastest
+  ostringstream nameBuilder;
+  nameBuilder << this->baseFile << this->myProc;
+  this->inFiles.push_back(nameBuilder.str());
+
+  // Count the particles in that file; this sets totalParticles,
+  // maxParticles and fileParticles
+  findFileParticleCount();
+
+  // Optionally reserve particle storage up front to minimize reallocation;
+  // the size is the largest per-file count scaled by DEAD_FACTOR
+  if (reserveQ) {
+#ifndef USE_VTK_COSMO
+    cout << "readParticlesOneToOne reserving vectors" << endl;
+#endif
+    const int capacity = (int) (this->maxParticles * DEAD_FACTOR);
+    this->xx->reserve(capacity);
+    this->yy->reserve(capacity);
+    this->zz->reserve(capacity);
+    this->vx->reserve(capacity);
+    this->vy->reserve(capacity);
+    this->vz->reserve(capacity);
+    this->ms->reserve(capacity);
+    this->tag->reserve(capacity);
+  }
+
+  // Running total and index into particle data on this processor
+  this->particleCount = 0;
+
+  // Read the input file storing particles immediately; anything that is
+  // not RECORD is handled by the BLOCK reader
+  if (this->inputType != RECORD)
+    readFromBlockFile();
+  else
+    readFromRecordFile();
+}
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Input file is RECORD structured so read each particle record and populate
+// the vectors of particles marking all as ALIVE
+//
+/////////////////////////////////////////////////////////////////////////////
+
+void ParticleDistribute::readFromRecordFile()
+{
+  // One-to-one mode: the single file for this rank is inFiles[0] and its
+  // particle count was stored in fileParticles[0]
+  ifstream inStream(this->inFiles[0].c_str(), ios::in);
+  int numberOfParticles = this->fileParticles[0];
+
+#ifndef USE_VTK_COSMO
+  cout << "Rank " << this->myProc << " open file " << this->inFiles[0]
+       << " with " << numberOfParticles << " particles" << endl;
+#endif
+
+  // Byte sizes of the two parts of one record
+  const size_t floatBytes = COSMO_FLOAT * sizeof(POSVEL_T);
+  const size_t tagBytes = COSMO_INT * sizeof(ID_T);
+
+  // Scratch space for one record
+  POSVEL_T* fBlock = new POSVEL_T[COSMO_FLOAT];
+  ID_T* iBlock = new ID_T[COSMO_INT];
+
+  for (int rec = 0; rec < numberOfParticles; rec++) {
+
+    // Floating point part: x, vx, y, vy, z, vz, mass
+    inStream.read(reinterpret_cast<char*>(fBlock), floatBytes);
+    if (inStream.gcount() != floatBytes) {
+#ifdef USE_VTK_COSMO
+      vtkOutputWindowDisplayErrorText("Premature end-of-file.\n");
+      inStream.close();
+      delete [] fBlock;
+      delete [] iBlock;
+      return;
+#else
+      cout << "Premature end-of-file" << endl;
+      exit (-1);
+#endif
+    }
+
+    // Integer part: the particle tag
+    inStream.read(reinterpret_cast<char*>(iBlock), tagBytes);
+    if (inStream.gcount() != tagBytes) {
+#ifdef USE_VTK_COSMO
+      vtkOutputWindowDisplayErrorText("Premature end-of-file.\n");
+      inStream.close();
+      delete [] fBlock;
+      delete [] iBlock;
+      return;
+#else
+      cout << "Premature end-of-file" << endl;
+      exit (-1);
+#endif
+    }
+
+    // Convert units if requested
+    fBlock[0] *= this->distConvertFactor;
+    fBlock[2] *= this->distConvertFactor;
+    fBlock[4] *= this->distConvertFactor;
+    fBlock[6] *= this->massConvertFactor;
+
+    // Keep only particles inside [minAlive, maxAlive], bounds inclusive
+    const bool inX = (fBlock[0] >= minAlive[0]) && (fBlock[0] <= maxAlive[0]);
+    const bool inY = (fBlock[2] >= minAlive[1]) && (fBlock[2] <= maxAlive[1]);
+    const bool inZ = (fBlock[4] >= minAlive[2]) && (fBlock[4] <= maxAlive[2]);
+    if (!(inX && inY && inZ))
+      continue;
+
+    this->xx->push_back(fBlock[0]);
+    this->yy->push_back(fBlock[2]);
+    this->zz->push_back(fBlock[4]);
+    this->vx->push_back(fBlock[1]);
+    this->vy->push_back(fBlock[3]);
+    this->vz->push_back(fBlock[5]);
+    this->ms->push_back(fBlock[6]);
+    this->tag->push_back(iBlock[0]);
+    this->numberOfAliveParticles++;
+    this->particleCount++;
+  }
+
+  inStream.close();
+  delete [] fBlock;
+  delete [] iBlock;
+}
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Input file is BLOCK structured so read the header and each block of data.
+// Gadget format:
+//    SKIP_GADGET_2 has extra 16 bytes
+//    SKIP_H 4 bytes (size of header)
+//    Header (6 types of particles with counts and masses)
+//    SKIP_H 4 bytes (size of header)
+//
+//    SKIP_GADGET_2 has extra 16 bytes
+//    SKIP_L 4 bytes (size of location block in bytes)
+//    Block of location data where each particle's x,y,z is stored together
+//    SKIP_L 4 bytes (size of location block in bytes)
+//
+//    SKIP_GADGET_2 has extra 16 bytes
+//    SKIP_V 4 bytes (size of velocity block in bytes)
+//    Block of velocity data where each particle's xv,yv,zv is stored together
+//    SKIP_V 4 bytes (size of velocity block in bytes)
+//
+//    SKIP_GADGET_2 has extra 16 bytes
+//    SKIP_T 4 bytes (size of tag block in bytes)
+//    Block of tag data
+//    SKIP_T 4 bytes (size of tag block in bytes)
+//
+// Reorder the data after it is read into the same structure as the
+// RECORD data so that the rest of the code does not have to be changed
+//
+/////////////////////////////////////////////////////////////////////////////
+
+void ParticleDistribute::readFromBlockFile()
+{
+  // One-to-one read of the single BLOCK (Gadget) file named in index 0.
+  // NOTE(review): uses gadgetHeader/gadgetFormat/gadgetSwap but never sets
+  // them - presumably readGadgetHeader() has already run; confirm at caller.
+  ifstream inStream(this->inFiles[0].c_str(), ios::in);
+  const long numberOfParticles = this->fileParticles[0];  // stored as long
+
+#ifndef USE_VTK_COSMO
+  cout << "Rank " << this->myProc << " open file " << this->inFiles[0]
+       << " with " << numberOfParticles << " particles" << endl;
+#endif
+  if (numberOfParticles <= 0)   // Nothing to read; keeps &block[0] valid
+    return;
+
+  // Skip [Gadget-2 tag] size, header, size, [Gadget-2 tag] and block size
+  const int tagSkip = (this->gadgetFormat == GADGET_2) ? GADGET_2_SKIP : 0;
+  const int skipToLocation =
+    (2 * tagSkip) + (3 * GADGET_SKIP) + GADGET_HEADER_SIZE;
+
+  // Blocks to read into; vectors release themselves on every exit path
+  vector<POSVEL_T> lBlock(numberOfParticles * DIMENSION);
+  vector<POSVEL_T> vBlock(numberOfParticles * DIMENSION);
+  vector<ID_T> iBlock(numberOfParticles);
+
+  // Seek to particle locations and read triples
+  inStream.seekg(skipToLocation, ios::beg);
+  readData(this->gadgetSwap, (void*) &lBlock[0], sizeof(POSVEL_T),
+           DIMENSION * numberOfParticles, &inStream);
+
+  // Seek to particle velocities and read triples
+  inStream.seekg((2 * GADGET_SKIP), ios::cur);
+  readData(this->gadgetSwap, (void*) &vBlock[0], sizeof(POSVEL_T),
+           DIMENSION * numberOfParticles, &inStream);
+
+  // Seek to particle tags and read
+  // NOTE(review): these two relative seeks add no GADGET_2_SKIP for
+  // Gadget-2 input, unlike skipToLocation - confirm against the file layout
+  inStream.seekg((2 * GADGET_SKIP), ios::cur);
+  readData(this->gadgetSwap, (void*) &iBlock[0], sizeof(ID_T),
+           numberOfParticles, &inStream);
+
+  // readData() returns nothing, so test the stream: after a failed open,
+  // seek or short read the blocks must not be stored as particles
+  if (!inStream) {
+#ifdef USE_VTK_COSMO
+    vtkOutputWindowDisplayErrorText("Premature end-of-file.\n");
+    return;
+#else
+    cout << "Premature end-of-file" << endl;
+    exit (-1);
+#endif
+  }
+
+  // Convert locations
+  for (long p = 0; p < DIMENSION * numberOfParticles; p++)
+    lBlock[p] *= this->distConvertFactor;
+
+  // Store mass, locations, velocities and tags if in range.  The range test
+  // serves ONE_TO_ONE input and one file shared by many processors alike.
+  long indx = 0, tagindx = 0;
+  for (int type = 0; type < NUM_GADGET_TYPES; type++) {
+    const POSVEL_T particleMass =
+      (POSVEL_T) this->gadgetHeader.mass[type] * this->massConvertFactor;
+    // tagindx bound keeps header counts from running past the blocks
+    for (int p = 0; p < this->gadgetHeader.npart[type] &&
+                    tagindx < numberOfParticles; p++) {
+      if ((lBlock[indx] >= minAlive[0] && lBlock[indx] < maxAlive[0]) &&
+          (lBlock[indx+1] >= minAlive[1] && lBlock[indx+1] < maxAlive[1]) &&
+          (lBlock[indx+2] >= minAlive[2] && lBlock[indx+2] < maxAlive[2])) {
+        this->xx->push_back(lBlock[indx]);
+        this->yy->push_back(lBlock[indx+1]);
+        this->zz->push_back(lBlock[indx+2]);
+        this->vx->push_back(vBlock[indx]);
+        this->vy->push_back(vBlock[indx+1]);
+        this->vz->push_back(vBlock[indx+2]);
+        this->ms->push_back(particleMass);
+        this->tag->push_back(iBlock[tagindx]);
+        this->numberOfAliveParticles++;
+        this->particleCount++;
+      }
+      indx += DIMENSION;
+      tagindx++;
+    }
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Read the Gadget header from the stream
+// Gadget file may be Gadget-1 format with no block indicators or
+// Gadget-2 format with size of block 4 byte integers surrounding each block
+// Data may be big or little endian which we can tell by checking that
+// the header size is 256 in the first 4 bytes
+//
+/////////////////////////////////////////////////////////////////////////////
+
+void ParticleDistribute::readGadgetHeader(ifstream* gStr)
+{
+  // Reads the header block at the current position of gStr and from it sets
+  // gadgetFormat, gadgetSwap, gadgetHeader, gadgetStart[] and
+  // gadgetParticleCount.  readData() reports no errors, so a short or
+  // failed read is not detected here.
+  this->gadgetSwap = false;
+  this->gadgetFormat = 1;
+  int blockSize = 0, blockSize2 = 0;    // Defined values even if a read fails
+
+  // Set the gadget format type by reading the first 4 byte integer
+  // If it is not "256" or "65536" then gadget-2 format with 16 bytes in front
+  readData(this->gadgetSwap, (void*) &blockSize, GADGET_SKIP, 1, gStr);
+  if (blockSize != GADGET_HEADER_SIZE && blockSize != GADGET_HEADER_SIZE_SWP) {
+    this->gadgetFormat = GADGET_2;
+    // Consume the rest of the Gadget-2 tag; its text is not used
+    readString(gStr, GADGET_2_SKIP - GADGET_SKIP);
+    readData(this->gadgetSwap, (void*) &blockSize, GADGET_SKIP, 1, gStr);
+  }
+
+  // Set the swap type: a size marker that is not the native header size
+  // means every following item has to be byte swapped
+  if (blockSize != GADGET_HEADER_SIZE) {
+    this->gadgetSwap = true;
+    blockSize = GADGET_HEADER_SIZE;
+  }
+
+  // Read the Gadget header one field at a time, in file order, so that
+  // each field can be swapped with its own item size
+  struct GadgetHeader& hdr = this->gadgetHeader;
+  const bool swap = this->gadgetSwap;
+  readData(swap, &hdr.npart[0], sizeof(int), NUM_GADGET_TYPES, gStr);
+  readData(swap, &hdr.mass[0], sizeof(double), NUM_GADGET_TYPES, gStr);
+  readData(swap, &hdr.time, sizeof(double), 1, gStr);
+  readData(swap, &hdr.redshift, sizeof(double), 1, gStr);
+  readData(swap, &hdr.flag_sfr, sizeof(int), 1, gStr);
+  readData(swap, &hdr.flag_feedback, sizeof(int), 1, gStr);
+  readData(swap, &hdr.npartTotal[0], sizeof(int), NUM_GADGET_TYPES, gStr);
+  readData(swap, &hdr.flag_cooling, sizeof(int), 1, gStr);
+  readData(swap, &hdr.num_files, sizeof(int), 1, gStr);
+  readData(swap, &hdr.BoxSize, sizeof(double), 1, gStr);
+  readData(swap, &hdr.Omega0, sizeof(double), 1, gStr);
+  readData(swap, &hdr.OmegaLambda, sizeof(double), 1, gStr);
+  readData(swap, &hdr.HubbleParam, sizeof(double), 1, gStr);
+  readData(swap, &hdr.flag_stellarage, sizeof(int), 1, gStr);
+  readData(swap, &hdr.flag_metals, sizeof(int), 1, gStr);
+  readData(swap, &hdr.HighWord[0], sizeof(int), NUM_GADGET_TYPES, gStr);
+  readData(swap, &hdr.flag_entropy, sizeof(int), 1, gStr);
+
+  // readString() can return GADGET_FILL characters, so an unbounded strcpy
+  // could write its terminator past a fill field of that size; copy a
+  // bounded amount and terminate explicitly.
+  // NOTE(review): sizeof assumes fill is declared as a char array - confirm
+  string fill = readString(gStr, GADGET_FILL);
+  strncpy(hdr.fill, fill.c_str(), sizeof(hdr.fill) - 1);
+  hdr.fill[sizeof(hdr.fill) - 1] = '\0';
+
+  // Read the Gadget header size to verify block
+  readData(this->gadgetSwap, (void*) &blockSize2, GADGET_SKIP, 1, gStr);
+  if (blockSize != blockSize2) {
+#ifdef USE_VTK_COSMO
+    vtkOutputWindowDisplayErrorText("Mismatch of header size and header structure.\n");
+#else
+    cout << "Mismatch of header size and header structure" << endl;
+#endif
+  }
+
+  // Every type particle will have location, velocity and tag so sum up
+  // gadgetStart[i] records the running total before type i is added, and
+  // gadgetParticleCount ends as the sum of npart over all types
+  this->gadgetParticleCount = 0;
+  for (int i = 0; i < NUM_GADGET_TYPES; i++) {
+    this->gadgetStart[i] = this->gadgetParticleCount;
+    this->gadgetParticleCount += this->gadgetHeader.npart[i];
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Read in the requested number of characters and return the string they
+// form up to the first character that is not a legal value
+/////////////////////////////////////////////////////////////////////////
+
+string ParticleDistribute::readString(ifstream* inStr, int size)
+{
+   if (size <= 0)
+      return string();
+   // Zero filled, so a short read leaves terminators rather than garbage
+   vector<char> buffer(size + 1, '\0');
+   inStr->read(&buffer[0], size);
+
+   // Make sure string has legal values: alphanumeric first, printable after.
+   // <cctype> tests need an unsigned char value; a negative char is undefined
+   if (isalnum(static_cast<unsigned char>(buffer[0])) == 0)
+      buffer[0] = '\0';
+   for (int i = 1; i < size; i++)
+      if (isprint(static_cast<unsigned char>(buffer[i])) == 0)
+         buffer[i] = '\0';
+   return string(&buffer[0]);
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Fill data with dataCount items of dataSize bytes each read from the
+// stream, and reverse the bytes of every item when swap is set
+//
+/////////////////////////////////////////////////////////////////////////
+
+void ParticleDistribute::readData(
+        bool swap,
+        void* data,
+        unsigned long dataSize,
+        unsigned long dataCount,
+        ifstream* inStr)
+{
+   // A single read covers all items; the stream state is not examined here
+   inStr->read(reinterpret_cast<char*>(data), dataSize*dataCount);
+
+   // Without a swap request the bytes stay exactly as they were read
+   if (swap != true)
+      return;
+
+   // Visit the items in order, mirroring each one in place
+   char* itemStart = (char*) data;
+   for (unsigned long done = 0; done < dataCount; done++) {
+      // Exchange byte k from the front with byte k from the back
+      for (unsigned int k = 0; k < dataSize / 2; k++) {
+         const unsigned long mirror = dataSize - 1 - k;
+         const char held = itemStart[mirror];
+         itemStart[mirror] = itemStart[k];
+         itemStart[k] = held;
+      }
+      itemStart += dataSize;
+   }
+}
diff --git a/src/halo-finder/src/ParticleDistribute.h b/src/halo-finder/src/ParticleDistribute.h
new file mode 100644
index 0000000..4a73960
--- /dev/null
+++ b/src/halo-finder/src/ParticleDistribute.h
@@ -0,0 +1,268 @@
+/*=========================================================================
+
+Copyright (c) 2007, Los Alamos National Security, LLC
+
+All rights reserved.
+
+Copyright 2007. Los Alamos National Security, LLC.
+This software was produced under U.S. Government contract DE-AC52-06NA25396
+for Los Alamos National Laboratory (LANL), which is operated by
+Los Alamos National Security, LLC for the U.S. Department of Energy.
+The U.S. Government has rights to use, reproduce, and distribute this software.
+NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,
+EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.
+If software is modified to produce derivative works, such modified software
+should be clearly marked, so as not to confuse it with the version available
+from LANL.
+
+Additionally, redistribution and use in source and binary forms, with or
+without modification, are permitted provided that the following conditions
+are met:
+-   Redistributions of source code must retain the above copyright notice,
+    this list of conditions and the following disclaimer.
+-   Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+-   Neither the name of Los Alamos National Security, LLC, Los Alamos National
+    Laboratory, LANL, the U.S. Government, nor the names of its contributors
+    may be used to endorse or promote products derived from this software
+    without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+=========================================================================*/
+
+// .NAME ParticleDistribute - distribute particles to processors
+//
+// .SECTION Description
+// ParticleDistribute takes a series of data files containing RECORD style
+// .cosmo data or Gadget style BLOCK data
+// along with parameters defining the box size for the data and for
+// determining halos within the particle data.  It distributes the data
+// across processors including a healthy dead zone of particles belonging
+// to neighbor processors.  By definition all halos can be determined
+// completely for any processor because of this dead zone.  The serial
+// halo finder is called on each processor.
+//
+
+#ifndef ParticleDistribute_h
+#define ParticleDistribute_h
+
+#include "Message.h"
+
+#include <cstdlib>
+
+#ifdef USE_VTK_COSMO
+#include "CosmoDefinition.h"
+#include "vtkstd/string"
+#include "vtkstd/vector"
+
+using namespace vtkstd;
+#else
+#include "Definition.h"
+#include <string>
+#include <vector>
+
+using namespace std;
+#endif
+
+#ifdef USE_VTK_COSMO
+class COSMO_EXPORT ParticleDistribute {
+#else
+class ParticleDistribute {
+#endif
+public:
+  ParticleDistribute();
+  ~ParticleDistribute();
+
+  // Set parameters for particle distribution
+  void setParameters(
+        const string& inName,   // Base file name to read from
+        POSVEL_T rL,            // Box size of the physical problem
+        string dataType);       // BLOCK or RECORD structured input data
+
+  // Set parameters for unit conversion of mass and distance
+  void setConvertParameters(
+        POSVEL_T massConvertFactor,     // Multiply every mass by this
+        POSVEL_T distConvertFactor);    // Multiply every position by this
+
+  // Set neighbor processor numbers and calculate dead regions
+  void initialize();
+
+  // Read particle files per processor and share round robin with others
+  // extracting only the alive particles
+  void readParticlesRoundRobin(int reserveQ=0);
+  void partitionInputFiles(bool force1PPF = false);
+
+#ifndef USE_SERIAL_COSMO
+  struct CosmoParticle          // Element type of the pByProc vectors below
+  {
+    float floatData[COSMO_FLOAT];
+    int   intData[COSMO_INT];
+  };
+
+  // Read particle files per processor and share all-to-all with others
+  // extracting only the alive particles
+  void readParticlesAllToAll(int reserveQ = 0);
+#endif
+
+  // Read one particle file per processor with alive particles
+  // and correct topology
+  void readParticlesOneToOne(int reserveQ=0);
+
+  // Get particle counts for allocating buffers
+  void findFileParticleCount();
+
+  // Round robin version must buffer for MPI sends to other processors
+  void readFromRecordFile(
+        ifstream* inStream,     // Stream to read from
+        int firstParticle,      // First particle index to read in this chunk
+        int numberOfParticles,  // Number of particles to read in this chunk
+        POSVEL_T* fblock,       // Buffer for read in data
+        ID_T* iblock,           // Buffer for read in data
+        Message* message);      // Message buffer for distribution
+
+  void readFromBlockFile(
+        ifstream* inStream,     // Stream to read from
+        int firstParticle,      // First particle index to read in this chunk
+        int numberOfParticles,  // Number of particles to read in this chunk
+        int totParticles,       // Total particles (used to get offset)
+        POSVEL_T* lblock,       // Buffer for read in location data
+        POSVEL_T* vblock,       // Buffer for read in velocity data
+        ID_T* iblock,           // Buffer for read in data
+        Message* message);      // Message buffer for distribution
+
+#ifndef USE_SERIAL_COSMO
+  // All-to-all version must buffer for MPI sends to other processors
+  void readFromRecordFile(
+        ifstream* inStream,     // Stream to read from
+        int firstParticle,      // First particle index to read in this chunk
+        int numberOfParticles,  // Number of particles to read in this chunk
+        POSVEL_T* fblock,       // Buffer for read in data
+        ID_T* iblock,           // Buffer for read in data
+        std::vector< std::vector<CosmoParticle> > &pByProc);
+
+  void readFromBlockFile(
+        ifstream* inStream,     // Stream to read from
+        int firstParticle,      // First particle index to read in this chunk
+        int numberOfParticles,  // Number of particles to read in this chunk
+        int totParticles,       // Total particles (used to get offset)
+        POSVEL_T* lblock,       // Buffer for read in location data
+        POSVEL_T* vblock,       // Buffer for read in velocity data
+        ID_T* iblock,           // Buffer for read in data
+        std::vector< std::vector<CosmoParticle> > &pByProc);
+#endif // USE_SERIAL_COSMO
+
+  // One to one version of read is simpler with no MPI buffering
+  void readFromRecordFile();
+  void readFromBlockFile();
+
+  // Read the Gadget header, fixed length strings and byte swapped items
+  void readGadgetHeader(ifstream* str);
+  string readString(ifstream* str, int size);   // Read size characters
+  void readData(
+        bool swap,              // Reverse the bytes of each item after reading
+        void* data,             // Buffer that receives the items
+        unsigned long size,     // Size of one item in bytes
+        unsigned long count,    // Number of items to read
+        ifstream* inStr);       // Stream to read from
+
+  // Collect local alive particles from the input buffers
+  void distributeParticles(
+        Message* message1,      // Double buffering for reads
+        Message* message2);     // Double buffering for reads
+  void collectLocalParticles(
+        Message* message1,      // Read buffer to extract local particles
+        Message* message2);     // Remaining particles put here for next send
+
+  // Return data needed by other software
+  int     getParticleCount()    { return this->particleCount; }
+
+  void setParticles(vector<POSVEL_T>* xx,
+                    vector<POSVEL_T>* yy,
+                    vector<POSVEL_T>* zz,
+                    vector<POSVEL_T>* vx,
+                    vector<POSVEL_T>* vy,
+                    vector<POSVEL_T>* vz,
+                    vector<POSVEL_T>* mass,
+                    vector<ID_T>* tag);
+
+  vector<POSVEL_T>* getXLocation()      { return this->xx; }
+  vector<POSVEL_T>* getYLocation()      { return this->yy; }
+  vector<POSVEL_T>* getZLocation()      { return this->zz; }
+  vector<POSVEL_T>* getXVelocity()      { return this->vx; }
+  vector<POSVEL_T>* getYVelocity()      { return this->vy; }
+  vector<POSVEL_T>* getZVelocity()      { return this->vz; }
+  vector<POSVEL_T>* getMass()           { return this->ms; }
+  vector<ID_T>* getTag()                { return this->tag; }
+
+private:
+  int    myProc;                // My processor number
+  int    numProc;               // Total number of processors
+
+  string baseFile;              // Base name of input particle files
+  int    inputType;             // BLOCK or RECORD structure
+  int    maxFiles;              // Maximum number of files per processor
+  vector<string> inFiles;       // Files read by this processor
+  vector<long> fileParticles;   // Number of particles in files on processor
+
+  struct GadgetHeader gadgetHeader;// Gadget file header
+  int    gadgetFormat;          // GADGET-1 or GADGET-2
+  bool   gadgetSwap;            // Endian swap needed
+  long int gadgetParticleCount; // Total particles in the file
+  long int gadgetStart[NUM_GADGET_TYPES];
+				// Offset into all particles for that type
+
+  long   maxParticles;          // Largest number of particles in any file
+  long   maxRead;               // Largest number of particles read at one time
+  int    maxReadsPerFile;       // Max number of reads per file
+
+  long   totalParticles;        // Number of particles on all files
+  int    headerSize;            // For BLOCK files
+
+  int    nextProc;              // Where to send buffers to be shared
+  int    prevProc;              // Where to receive shared buffers from
+  int    numberOfFiles;         // Number of input files total
+  int    processorsPerFile;     // Multiple processors read same file
+  int    numberOfFileSends;     // Number of round robin sends to share buffers
+  int    maxFileSends;          // Max of round robin sends to share buffers
+
+  int    layoutSize[DIMENSION]; // Decomposition of processors
+  int    layoutPos[DIMENSION];  // Position of this processor in decomposition
+
+  long   np;                    // Number of particles in the problem
+  POSVEL_T boxSize;             // Physical box size (rL)
+  POSVEL_T massConvertFactor;   // Multiply every mass read by this
+  POSVEL_T distConvertFactor;   // Multiply every position read by this
+
+  long   numberOfAliveParticles;// Particles accepted by the alive range test
+
+  long   particleCount;         // Running index used to store data
+                                // Ends up as the number of alive plus dead
+
+  POSVEL_T minAlive[DIMENSION]; // Minimum alive particle location on processor
+  POSVEL_T maxAlive[DIMENSION]; // Maximum alive particle location on processor
+
+  int    neighbor[NUM_OF_NEIGHBORS];            // Neighbor processor ids
+
+  vector<POSVEL_T>* xx;         // X location for particles on this processor
+  vector<POSVEL_T>* yy;         // Y location for particles on this processor
+  vector<POSVEL_T>* zz;         // Z location for particles on this processor
+  vector<POSVEL_T>* vx;         // X velocity for particles on this processor
+  vector<POSVEL_T>* vy;         // Y velocity for particles on this processor
+  vector<POSVEL_T>* vz;         // Z velocity for particles on this processor
+  vector<POSVEL_T>* ms;         // Mass for particles on this processor
+  vector<ID_T>* tag;            // Id tag for particles on this processor
+};
+
+#endif
diff --git a/src/halo-finder/src/ParticleExchange.cxx b/src/halo-finder/src/ParticleExchange.cxx
new file mode 100644
index 0000000..630d191
--- /dev/null
+++ b/src/halo-finder/src/ParticleExchange.cxx
@@ -0,0 +1,762 @@
+/*=========================================================================
+                                                                                
+Copyright (c) 2007, Los Alamos National Security, LLC
+
+All rights reserved.
+
+Copyright 2007. Los Alamos National Security, LLC. 
+This software was produced under U.S. Government contract DE-AC52-06NA25396 
+for Los Alamos National Laboratory (LANL), which is operated by 
+Los Alamos National Security, LLC for the U.S. Department of Energy. 
+The U.S. Government has rights to use, reproduce, and distribute this software. 
+NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,
+EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  
+If software is modified to produce derivative works, such modified software 
+should be clearly marked, so as not to confuse it with the version available 
+from LANL.
+ 
+Additionally, redistribution and use in source and binary forms, with or 
+without modification, are permitted provided that the following conditions 
+are met:
+-   Redistributions of source code must retain the above copyright notice, 
+    this list of conditions and the following disclaimer. 
+-   Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution. 
+-   Neither the name of Los Alamos National Security, LLC, Los Alamos National
+    Laboratory, LANL, the U.S. Government, nor the names of its contributors
+    may be used to endorse or promote products derived from this software 
+    without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR 
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+                                                                                
+=========================================================================*/
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <iomanip>
+
+#include <sys/types.h>
+
+#include "Partition.h"
+#include "ParticleExchange.h"
+
+using namespace std;
+
+/////////////////////////////////////////////////////////////////////////
+//
+// ParticleExchange is initialized with physical size of particle space and
+// the margin of dead zone desired for each processor.  It is given the
+// physical x,y,z locations for particles on this processor and can get
+// the number of each neighbor processor.  Since the desired goal is to
+// populate every processor with the alive particles (which it enters this
+// class with) and dead particles belonging on the edges of all neighbors,
+// each processor categorizes its own particles and arranges to send them
+// to the appropriate neighbor, and to receive particles from each neighbor
+// which it adds to the location vectors.
+//
+/////////////////////////////////////////////////////////////////////////
+
+ParticleExchange::ParticleExchange()
+{
+  // Start both particle counters at zero
+  numberOfAliveParticles = 0;
+  numberOfDeadParticles = 0;
+
+  // Processor count and the rank of this processor
+  numProc = Partition::getNumProc();
+  myProc = Partition::getMyProc();
+
+  // Decomposition size per dimension, then this processor's position in it
+  Partition::getDecompSize(layoutSize);
+  Partition::getMyPosition(layoutPos);
+
+  // Neighbors of this processor including the wraparound
+  Partition::getNeighbors(neighbor);
+
+  // With layoutPos and layoutSize filled in, calculate the alterations
+  // needed for wraparound locations
+  calculateOffsetFactor();
+}
+
+ParticleExchange::~ParticleExchange()
+{ // Intentionally empty: the destructor performs no explicit cleanup
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Set parameters for particle distribution
+//
+/////////////////////////////////////////////////////////////////////////
+
+void ParticleExchange::setParameters(POSVEL_T rL, POSVEL_T deadSz)
+{
+  // Keep the physical box size and the space reserved for dead particles
+  boxSize = rL;
+  deadSize = deadSz;
+
+#ifndef USE_VTK_COSMO
+  // Only the master rank echoes the two values
+  if (myProc == MASTER)
+    cout << endl << "------------------------------------" << endl
+         << "boxSize:  " << boxSize << endl
+         << "deltaBox: " << deadSize << endl;
+#endif
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// ParticleExchange will start with only ALIVE particles and will determine
+// which of those particles must be sent to neighbors for the overloading
+// of DEAD particles.  As a particle is examined it may fall into several
+// sharing regions.  For instance a particle in a corner will be sent across
+// three faces, three edges and one corner.  As it is sent the x,y,z must
+// be altered in different ways.  Face overloading requires changing one
+// dimension's location, while corner overloading requires three changes.
+// And these changes are only needed for processors on an edge of the
+// decomposition where layoutPos = 0 or layoutPos = layoutSize - 1.
+//
+// This method calculates a simple matrix which can be applied at the
+// time that the exchange buffer is filled with locations.  The rule for
+// sending a location is location = location + (overLoadFactor * boxSize);
+//
+// The factors are
+//      0       location in that dimension is not changed
+//     +1       location in that dimension is incremented by box size
+//     -1       location in that dimension is decremented by box size
+//
+/////////////////////////////////////////////////////////////////////////
+
+void ParticleExchange::calculateOffsetFactor()
+{
+   // Nine neighbors (a face, four edges, four corners) lie across the low
+   // side of each dimension and nine across the high side
+   const int SIDE_COUNT = 9;
+   // Neighbors reached across the low side, where layoutPos is 0
+   static const int lowSide[3][SIDE_COUNT] = {
+      // X dimension
+      { X0,
+        X0_Y0,
+        X0_Y1,
+        Z0_X0,
+        Z1_X0,
+        X0_Y0_Z0,
+        X0_Y0_Z1,
+        X0_Y1_Z0,
+        X0_Y1_Z1 },
+      // Y dimension
+      { Y0,
+        X0_Y0,
+        X1_Y0,
+        Y0_Z0,
+        Y0_Z1,
+        X0_Y0_Z0,
+        X0_Y0_Z1,
+        X1_Y0_Z1,
+        X1_Y0_Z0 },
+      // Z dimension
+      { Z0,
+        Y0_Z0,
+        Y1_Z0,
+        Z0_X0,
+        Z0_X1,
+        X0_Y0_Z0,
+        X1_Y1_Z0,
+        X0_Y1_Z0,
+        X1_Y0_Z0 }
+   };
+
+   // Neighbors reached across the high side, where layoutPos is layoutSize - 1
+   static const int highSide[3][SIDE_COUNT] = {
+      // X dimension
+      { X1,
+        X1_Y1,
+        X1_Y0,
+        Z1_X1,
+        Z0_X1,
+        X1_Y1_Z1,
+        X1_Y1_Z0,
+        X1_Y0_Z1,
+        X1_Y0_Z0 },
+      // Y dimension
+      { Y1,
+        X1_Y1,
+        X0_Y1,
+        Y1_Z1,
+        Y1_Z0,
+        X1_Y1_Z1,
+        X1_Y1_Z0,
+        X0_Y1_Z0,
+        X0_Y1_Z1 },
+      // Z dimension
+      { Z1,
+        Y1_Z1,
+        Y0_Z1,
+        Z1_X1,
+        Z1_X0,
+        X1_Y1_Z1,
+        X0_Y0_Z1,
+        X1_Y0_Z1,
+        X0_Y1_Z1 }
+   };
+
+   // Default (all interior processors): a shared location is not changed
+   for (int nbr = 0; nbr < NUM_OF_NEIGHBORS; nbr++)
+      for (int axis = 0; axis < DIMENSION; axis++)
+         overLoadFactor[nbr][axis] = 0;
+
+   // A processor on the low side adds rL to what it wraps around (+1) and a
+   // processor on the high side subtracts rL (-1), in that dimension only
+   for (int axis = 0; axis < 3; axis++) {
+      const bool onLow = (layoutPos[axis] == 0);
+      const bool onHigh = (layoutPos[axis] == (layoutSize[axis] - 1));
+      for (int k = 0; k < SIDE_COUNT; k++) {
+         if (onLow)
+            overLoadFactor[lowSide[axis][k]][axis] = 1;
+         if (onHigh)
+            overLoadFactor[highSide[axis][k]][axis] = -1;
+      }
+   }
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// All particles on this processor initially are alive, but some of those
+// alive must be exchanged with neighbors.  Determine the physical range
+// on this processor where an ALIVE particle will never be exchanged and
+// the ranges for each neighbor's future DEAD particles.  Then when
+// reading each particle it can quickly be assigned.
+//
+/////////////////////////////////////////////////////////////////////////
+
+void ParticleExchange::initialize()
+{
+#if !defined(USE_VTK_COSMO) && defined(DEBUG)
+  // Only the master rank reports the processor decomposition
+  if (myProc == MASTER) {
+    cout << "Decomposition: [" << layoutSize[0] << ":"
+         << layoutSize[1] << ":" << layoutSize[2] << "]" << endl;
+  }
+#endif
+
+  // Work out, per dimension, the slab of the box held by this processor
+  for (int d = 0; d < DIMENSION; d++) {
+    // Width of one processor's slab along this dimension
+    const POSVEL_T step = boxSize / layoutSize[d];
+
+    // All particles in [lower, upper] are alive here; clamp to the box
+    const POSVEL_T lower = layoutPos[d] * step;
+    POSVEL_T upper = lower + step;
+    if (upper > boxSize)
+      upper = boxSize;
+    minShare[d] = lower;
+    maxShare[d] = upper;
+
+    // Inner region, deadSize away from each boundary, is never exchanged
+    minMine[d] = lower + deadSize;
+    maxMine[d] = upper - deadSize;
+  }
+  calculateExchangeRegions();
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Each of the 26 neighbors will be sent a rectangular region of my particles
+// Calculate the range in each dimension of the ghost area
+//
+/////////////////////////////////////////////////////////////////////////
+
+void ParticleExchange::calculateExchangeRegions()
+{
+  // Default each neighbor to the full local extent; narrowed per axis below
+  for (int i = 0; i < NUM_OF_NEIGHBORS; i++) {
+    for (int dim = 0; dim < DIMENSION; dim++) {
+      this->minRange[i][dim] = this->minShare[dim];
+      this->maxRange[i][dim] = this->maxShare[dim];
+    }
+  }
+
+  // Left face: low-X slab [minShare, minMine], full extent in Y and Z
+  this->minRange[X0][0] = this->minShare[0];
+  this->maxRange[X0][0] = this->minMine[0];
+
+  // Right face: high-X slab [maxMine, maxShare], full extent in Y and Z
+  this->minRange[X1][0] = this->maxMine[0];
+  this->maxRange[X1][0] = this->maxShare[0];
+
+  // Bottom face: low-Y slab [minShare, minMine], full extent in X and Z
+  this->minRange[Y0][1] = this->minShare[1];
+  this->maxRange[Y0][1] = this->minMine[1];
+
+  // Top face: high-Y slab [maxMine, maxShare], full extent in X and Z
+  this->minRange[Y1][1] = this->maxMine[1];
+  this->maxRange[Y1][1] = this->maxShare[1];
+
+  // Front face: low-Z slab [minShare, minMine], full extent in X and Y
+  this->minRange[Z0][2] = this->minShare[2];
+  this->maxRange[Z0][2] = this->minMine[2];
+
+  // Back face: high-Z slab [maxMine, maxShare], full extent in X and Y
+  this->minRange[Z1][2] = this->maxMine[2];
+  this->maxRange[Z1][2] = this->maxShare[2];
+
+  // Left bottom and top bars: low X with low Y, then low X with high Y
+  this->minRange[X0_Y0][0] = this->minShare[0];
+  this->maxRange[X0_Y0][0] = this->minMine[0];
+  this->minRange[X0_Y0][1] = this->minShare[1];
+  this->maxRange[X0_Y0][1] = this->minMine[1];
+
+  this->minRange[X0_Y1][0] = this->minShare[0];
+  this->maxRange[X0_Y1][0] = this->minMine[0];
+  this->minRange[X0_Y1][1] = this->maxMine[1];
+  this->maxRange[X0_Y1][1] = this->maxShare[1];
+
+  // Right bottom and top bars: high X with low Y, then high X with high Y
+  this->minRange[X1_Y0][0] = this->maxMine[0];
+  this->maxRange[X1_Y0][0] = this->maxShare[0];
+  this->minRange[X1_Y0][1] = this->minShare[1];
+  this->maxRange[X1_Y0][1] = this->minMine[1];
+
+  this->minRange[X1_Y1][0] = this->maxMine[0];
+  this->maxRange[X1_Y1][0] = this->maxShare[0];
+  this->minRange[X1_Y1][1] = this->maxMine[1];
+  this->maxRange[X1_Y1][1] = this->maxShare[1];
+
+  // Bottom front and back bars: low Y with low Z, then low Y with high Z
+  this->minRange[Y0_Z0][1] = this->minShare[1];
+  this->maxRange[Y0_Z0][1] = this->minMine[1];
+  this->minRange[Y0_Z0][2] = this->minShare[2];
+  this->maxRange[Y0_Z0][2] = this->minMine[2];
+
+  this->minRange[Y0_Z1][1] = this->minShare[1];
+  this->maxRange[Y0_Z1][1] = this->minMine[1];
+  this->minRange[Y0_Z1][2] = this->maxMine[2];
+  this->maxRange[Y0_Z1][2] = this->maxShare[2];
+
+  // Top front and back bars: high Y with low Z, then high Y with high Z
+  this->minRange[Y1_Z0][1] = this->maxMine[1];
+  this->maxRange[Y1_Z0][1] = this->maxShare[1];
+  this->minRange[Y1_Z0][2] = this->minShare[2];
+  this->maxRange[Y1_Z0][2] = this->minMine[2];
+
+  this->minRange[Y1_Z1][1] = this->maxMine[1];
+  this->maxRange[Y1_Z1][1] = this->maxShare[1];
+  this->minRange[Y1_Z1][2] = this->maxMine[2];
+  this->maxRange[Y1_Z1][2] = this->maxShare[2];
+
+  // Left front and back bars (vertical): low X with low Z, then high Z
+  this->minRange[Z0_X0][0] = this->minShare[0];
+  this->maxRange[Z0_X0][0] = this->minMine[0];
+  this->minRange[Z0_X0][2] = this->minShare[2];
+  this->maxRange[Z0_X0][2] = this->minMine[2];
+
+  this->minRange[Z1_X0][0] = this->minShare[0];
+  this->maxRange[Z1_X0][0] = this->minMine[0];
+  this->minRange[Z1_X0][2] = this->maxMine[2];
+  this->maxRange[Z1_X0][2] = this->maxShare[2];
+
+  // Right front and back bars (vertical): high X with low Z, then high Z
+  this->minRange[Z0_X1][0] = this->maxMine[0];
+  this->maxRange[Z0_X1][0] = this->maxShare[0];
+  this->minRange[Z0_X1][2] = this->minShare[2];
+  this->maxRange[Z0_X1][2] = this->minMine[2];
+
+  this->minRange[Z1_X1][0] = this->maxMine[0];
+  this->maxRange[Z1_X1][0] = this->maxShare[0];
+  this->minRange[Z1_X1][2] = this->maxMine[2];
+  this->maxRange[Z1_X1][2] = this->maxShare[2];
+
+  // Left bottom front corner (low X, low Y, low Z)
+  this->minRange[X0_Y0_Z0][0] = this->minShare[0];
+  this->maxRange[X0_Y0_Z0][0] = this->minMine[0];
+  this->minRange[X0_Y0_Z0][1] = this->minShare[1];
+  this->maxRange[X0_Y0_Z0][1] = this->minMine[1];
+  this->minRange[X0_Y0_Z0][2] = this->minShare[2];
+  this->maxRange[X0_Y0_Z0][2] = this->minMine[2];
+
+  // Left bottom back corner (low X, low Y, high Z)
+  this->minRange[X0_Y0_Z1][0] = this->minShare[0];
+  this->maxRange[X0_Y0_Z1][0] = this->minMine[0];
+  this->minRange[X0_Y0_Z1][1] = this->minShare[1];
+  this->maxRange[X0_Y0_Z1][1] = this->minMine[1];
+  this->minRange[X0_Y0_Z1][2] = this->maxMine[2];
+  this->maxRange[X0_Y0_Z1][2] = this->maxShare[2];
+
+  // Left top front corner (low X, high Y, low Z)
+  this->minRange[X0_Y1_Z0][0] = this->minShare[0];
+  this->maxRange[X0_Y1_Z0][0] = this->minMine[0];
+  this->minRange[X0_Y1_Z0][1] = this->maxMine[1];
+  this->maxRange[X0_Y1_Z0][1] = this->maxShare[1];
+  this->minRange[X0_Y1_Z0][2] = this->minShare[2];
+  this->maxRange[X0_Y1_Z0][2] = this->minMine[2];
+
+  // Left top back corner (low X, high Y, high Z)
+  this->minRange[X0_Y1_Z1][0] = this->minShare[0];
+  this->maxRange[X0_Y1_Z1][0] = this->minMine[0];
+  this->minRange[X0_Y1_Z1][1] = this->maxMine[1];
+  this->maxRange[X0_Y1_Z1][1] = this->maxShare[1];
+  this->minRange[X0_Y1_Z1][2] = this->maxMine[2];
+  this->maxRange[X0_Y1_Z1][2] = this->maxShare[2];
+
+  // Right bottom front corner (high X, low Y, low Z)
+  this->minRange[X1_Y0_Z0][0] = this->maxMine[0];
+  this->maxRange[X1_Y0_Z0][0] = this->maxShare[0];
+  this->minRange[X1_Y0_Z0][1] = this->minShare[1];
+  this->maxRange[X1_Y0_Z0][1] = this->minMine[1];
+  this->minRange[X1_Y0_Z0][2] = this->minShare[2];
+  this->maxRange[X1_Y0_Z0][2] = this->minMine[2];
+
+  // Right bottom back corner (high X, low Y, high Z)
+  this->minRange[X1_Y0_Z1][0] = this->maxMine[0];
+  this->maxRange[X1_Y0_Z1][0] = this->maxShare[0];
+  this->minRange[X1_Y0_Z1][1] = this->minShare[1];
+  this->maxRange[X1_Y0_Z1][1] = this->minMine[1];
+  this->minRange[X1_Y0_Z1][2] = this->maxMine[2];
+  this->maxRange[X1_Y0_Z1][2] = this->maxShare[2];
+
+  // Right top front corner (high X, high Y, low Z)
+  this->minRange[X1_Y1_Z0][0] = this->maxMine[0];
+  this->maxRange[X1_Y1_Z0][0] = this->maxShare[0];
+  this->minRange[X1_Y1_Z0][1] = this->maxMine[1];
+  this->maxRange[X1_Y1_Z0][1] = this->maxShare[1];
+  this->minRange[X1_Y1_Z0][2] = this->minShare[2];
+  this->maxRange[X1_Y1_Z0][2] = this->minMine[2];
+
+  // Right top back corner (high X, high Y, high Z)
+  this->minRange[X1_Y1_Z1][0] = this->maxMine[0];
+  this->maxRange[X1_Y1_Z1][0] = this->maxShare[0];
+  this->minRange[X1_Y1_Z1][1] = this->maxMine[1];
+  this->maxRange[X1_Y1_Z1][1] = this->maxShare[1];
+  this->minRange[X1_Y1_Z1][2] = this->maxMine[2];
+  this->maxRange[X1_Y1_Z1][2] = this->maxShare[2];
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Set the particle vectors that have already been read and which
+// contain only the alive particles for this processor
+//
+/////////////////////////////////////////////////////////////////////////
+
+void ParticleExchange::setParticles(
+                        vector<POSVEL_T>* xLoc,
+                        vector<POSVEL_T>* yLoc,
+                        vector<POSVEL_T>* zLoc,
+                        vector<POSVEL_T>* xVel,
+                        vector<POSVEL_T>* yVel,
+                        vector<POSVEL_T>* zVel,
+                        vector<POSVEL_T>* mass,
+                        vector<POTENTIAL_T>* potential,
+                        vector<ID_T>* id,
+                        vector<MASK_T>* maskData,
+                        vector<STATUS_T>* type)
+{
+  // Keep the supplied vector pointers; no particle data is copied here
+  xx = xLoc;
+  yy = yLoc;
+  zz = zLoc;
+  vx = xVel;
+  vy = yVel;
+  vz = zVel;
+  ms = mass;
+  pot = potential;
+  tag = id;
+  mask = maskData;
+  status = type;
+  numberOfAliveParticles = particleCount = static_cast<long>(xLoc->size());
+  type->clear();    // Status entries are appended later, one per particle
+}
+        
+/////////////////////////////////////////////////////////////////////////////
+//
+// Alive particles are contained on each processor.  Identify the border
+// particles which will be dead on other processors and exchange them
+//
+/////////////////////////////////////////////////////////////////////////////
+
+void ParticleExchange::exchangeParticles()
+{
+  // Two phases: first decide which alive particles each neighbor needs as
+  // dead particles, then swap them.  Afterwards report alive/dead totals.
+
+  // Sort the border particles into per-neighbor lists.  The comparison is
+  // done on physical coordinates because deadSize is given in physical units.
+  identifyExchangeParticles();
+
+  // Trade the listed particles with every neighbor; whatever arrives is
+  // appended to the particle vectors and counted as dead
+  exchangeNeighborParticles();
+
+  // Local alive and dead counts travel together so that one reduction
+  // yields both global totals
+  long counts[2];
+  counts[0] = numberOfAliveParticles;
+  counts[1] = numberOfDeadParticles;
+
+#ifndef USE_SERIAL_COSMO
+  // Sum both counts in place over the partition communicator; a serial
+  // build skips this because the local counts already are the totals
+  MPI_Allreduce(MPI_IN_PLACE, counts, 2, MPI_LONG, MPI_SUM,
+                Partition::getComm());
+#endif
+
+  // Totals across all processors
+  const long totalAliveParticles = counts[0];
+  const long totalDeadParticles = counts[1];
+
+#ifndef USE_VTK_COSMO
+#ifdef DEBUG
+  // Every rank prints its own alive/dead breakdown
+  cout << "Exchange Particles Rank " << setw(3) << myProc
+       << " #alive = " << numberOfAliveParticles
+       << " #dead = " << numberOfDeadParticles << endl;
+#endif
+
+  // The global summary is printed once, by the master rank
+  if (myProc == MASTER) {
+    cout << "TotalAliveParticles " << totalAliveParticles << endl
+         << "TotalDeadParticles  " << totalDeadParticles << endl
+         << endl;
+  }
+#endif
+}
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Iterate over all the alive particles on this processor and determine
+// which must be shared and add them to the vector for that neighbor
+//
+/////////////////////////////////////////////////////////////////////////////
+
+void ParticleExchange::identifyExchangeParticles()
+{
+  // Mark every current particle ALIVE and record, per neighbor, the indices
+  // of the particles that neighbor must receive as dead particles
+  for (long i = 0; i < this->particleCount; i++) {
+    this->status->push_back(ALIVE);
+
+    // Read the position once instead of re-indexing for every comparison
+    const POSVEL_T x = (*this->xx)[i];
+    const POSVEL_T y = (*this->yy)[i];
+    const POSVEL_T z = (*this->zz)[i];
+
+    // Strictly inside the unshared core: no neighbor needs this particle
+    if (x > this->minMine[0] && x < this->maxMine[0] &&
+        y > this->minMine[1] && y < this->maxMine[1] &&
+        z > this->minMine[2] && z < this->maxMine[2])
+      continue;
+
+    // Particle is alive here but some processors need it as dead.  Region
+    // bounds are inclusive and regions overlap, so a particle near an edge
+    // or corner is queued for several neighbors.
+    for (int n = 0; n < NUM_OF_NEIGHBORS; n++) {
+      if (x >= this->minRange[n][0] && x <= this->maxRange[n][0] &&
+          y >= this->minRange[n][1] && y <= this->maxRange[n][1] &&
+          z >= this->minRange[n][2] && z <= this->maxRange[n][2]) {
+        // Only the index is stored; exchange() packs the actual data
+        this->neighborParticles[n].push_back(i);
+      }
+    }
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Exchange the appropriate particles with neighbors
+// Only the index of the particle to be exchanged is stored so fill out
+// the message with location, velocity, tag.  Status information doesn't
+// have to be sent because when the message is received, the neighbor
+// containing the new dead particle will be known
+//
+// Use the Cartesian communicator for neighbor exchange
+//
+/////////////////////////////////////////////////////////////////////////////
+
+void ParticleExchange::exchangeNeighborParticles()
+{
+  // Largest number of particles this processor sends to any one neighbor
+  int myShareSize = 0;
+  for (int n = 0; n < NUM_OF_NEIGHBORS; n++)
+    if (myShareSize < (int)this->neighborParticles[n].size())
+      myShareSize = (int)this->neighborParticles[n].size();
+
+  // Every processor sizes its buffers for the global maximum so that a
+  // message arriving from any neighbor fits
+  int maxShareSize;
+#ifdef USE_SERIAL_COSMO
+  maxShareSize = myShareSize;
+#else
+  MPI_Allreduce((void*) &myShareSize,
+                (void*) &maxShareSize,
+                1, MPI_INT, MPI_MAX, Partition::getComm());
+#endif
+
+  // Allocate messages to send and receive MPI buffers
+  // Space for particle count + record(loc, vel, mass, tag) + potential + mask
+  // The potential is packed as a POTENTIAL_T in exchange(), so it must be
+  // sized as one here rather than as a POSVEL_T
+  int bufferSize = sizeof(int) +
+        (maxShareSize * 
+          (RECORD_SIZE + sizeof(POTENTIAL_T) + sizeof(MASK_T)));
+
+  Message* sendMessage = new Message(bufferSize);
+  Message* recvMessage = new Message(bufferSize);
+
+#ifndef USE_VTK_COSMO
+  // Report how much buffer space the exchange uses (send plus receive)
+  if(this->myProc == MASTER) {
+    printf("PXCH buffer = 2*%d = %f MB\n",bufferSize,
+           2.0*bufferSize/1024.0/1024.0);
+  }
+#endif
+
+  // Exchange with each neighbor, with everyone sending in one direction and
+  // receiving from the other.  Data corresponding to the particle index
+  // must be packed in the buffer.  When the data is received it is unpacked
+  // into the location, velocity and tag vectors and the status is set
+  // to the neighbor who sent it
+
+  for (int n = 0; n < NUM_OF_NEIGHBORS; n=n+2) {
+    // Neighbor pairs in Definition.h must match so that every processor
+    // sends and every processor receives on each exchange
+    exchange(n, n+1, sendMessage, recvMessage);
+    exchange(n+1, n, sendMessage, recvMessage);
+  }
+
+  delete sendMessage;
+  delete recvMessage;
+}
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Pack particle data for the indicated neighbor into MPI message
+// Send that message and receive from opposite neighbor
+// Unpack the received particle data and add to particle buffers with
+// an indication of dead and the neighbor on which particle is alive
+//
+/////////////////////////////////////////////////////////////////////////////
+
+void ParticleExchange::exchange(
+                        int sendTo,             // Neighbor to send to
+                        int recvFrom,           // Neighbor to receive from
+                        Message* sendMessage,   // Buffer packed and sent
+                        Message* recvMessage)   // Buffer received into
+{
+  POSVEL_T posValue;
+  POTENTIAL_T potValue;
+  ID_T idValue;
+  MASK_T maskValue;
+
+  // The same two message buffers are reused for every neighbor; reset both
+  sendMessage->reset();
+  recvMessage->reset();
+
+  // Number of particles to share with neighbor
+  int sendParticleCount = (int)this->neighborParticles[sendTo].size();
+
+  // Overload factor alters the x,y,z dimension for wraparound depending on
+  // the neighbor receiving the data and the position this processor
+  // has in the decomposition
+  POSVEL_T offset[DIMENSION];
+  for (int dim = 0; dim < DIMENSION; dim++)
+    offset[dim] = this->overLoadFactor[sendTo][dim] * this->boxSize;
+
+  // If this processor would be sending to itself skip the MPI and append
+  if (this->neighbor[sendTo] == this->myProc) {
+    for (int i = 0; i < sendParticleCount; i++) {
+      // Append a copy of the particle with its location shifted by offset
+      int deadIndex = this->neighborParticles[sendTo][i];
+      this->xx->push_back((*this->xx)[deadIndex] + offset[0]);
+      this->yy->push_back((*this->yy)[deadIndex] + offset[1]);
+      this->zz->push_back((*this->zz)[deadIndex] + offset[2]);
+      this->vx->push_back((*this->vx)[deadIndex]);
+      this->vy->push_back((*this->vy)[deadIndex]);
+      this->vz->push_back((*this->vz)[deadIndex]);
+      this->ms->push_back((*this->ms)[deadIndex]);
+      this->pot->push_back((*this->pot)[deadIndex]);
+      this->tag->push_back((*this->tag)[deadIndex]);
+      this->mask->push_back((*this->mask)[deadIndex]);
+      this->status->push_back(recvFrom);    // Status is the source neighbor
+
+      this->numberOfDeadParticles++;
+      this->particleCount++;
+    }
+    return;
+  }
+
+  // Pack the number of particles being sent
+  sendMessage->putValue(&sendParticleCount);
+
+  for (int i = 0; i < sendParticleCount; i++) {
+    int deadIndex = this->neighborParticles[sendTo][i];
+
+    // Locations are altered by wraparound if needed
+    posValue = (*this->xx)[deadIndex] + offset[0];
+    sendMessage->putValue(&posValue);
+    posValue = (*this->yy)[deadIndex] + offset[1];
+    sendMessage->putValue(&posValue);
+    posValue = (*this->zz)[deadIndex] + offset[2];
+    sendMessage->putValue(&posValue);
+
+    // Other values are sent unchanged: velocity, mass, potential, tag, mask
+    sendMessage->putValue(&(*this->vx)[deadIndex]);
+    sendMessage->putValue(&(*this->vy)[deadIndex]);
+    sendMessage->putValue(&(*this->vz)[deadIndex]);
+    sendMessage->putValue(&(*this->ms)[deadIndex]);
+    sendMessage->putValue(&(*this->pot)[deadIndex]);
+    sendMessage->putValue(&(*this->tag)[deadIndex]);
+    sendMessage->putValue(&(*this->mask)[deadIndex]);
+  }
+
+  // Send the message buffer
+  sendMessage->send(this->neighbor[sendTo]);
+
+  // Receive the buffer from neighbor on other side
+  recvMessage->receive(this->neighbor[recvFrom]);
+
+#ifndef USE_SERIAL_COSMO
+  MPI_Barrier(Partition::getComm());    // Synchronize the communicator
+#endif
+
+  // Process the received buffer; reads must mirror the pack order above
+  int recvParticleCount;
+  recvMessage->getValue(&recvParticleCount);
+
+  for (int i = 0; i < recvParticleCount; i++) {
+    recvMessage->getValue(&posValue);
+    this->xx->push_back(posValue);
+    recvMessage->getValue(&posValue);
+    this->yy->push_back(posValue);
+    recvMessage->getValue(&posValue);
+    this->zz->push_back(posValue);
+    recvMessage->getValue(&posValue);
+    this->vx->push_back(posValue);
+    recvMessage->getValue(&posValue);
+    this->vy->push_back(posValue);
+    recvMessage->getValue(&posValue);
+    this->vz->push_back(posValue);
+    recvMessage->getValue(&posValue);
+    this->ms->push_back(posValue);
+    recvMessage->getValue(&potValue);
+    this->pot->push_back(potValue);
+    recvMessage->getValue(&idValue);
+    this->tag->push_back(idValue);
+    recvMessage->getValue(&maskValue);
+    this->mask->push_back(maskValue);
+    this->status->push_back(recvFrom);      // Status is the source neighbor
+
+    this->numberOfDeadParticles++;
+    this->particleCount++;
+  }
+}
diff --git a/src/halo-finder/src/ParticleExchange.h b/src/halo-finder/src/ParticleExchange.h
new file mode 100644
index 0000000..c0e6c00
--- /dev/null
+++ b/src/halo-finder/src/ParticleExchange.h
@@ -0,0 +1,183 @@
+/*=========================================================================
+                                                                                
+Copyright (c) 2007, Los Alamos National Security, LLC
+
+All rights reserved.
+
+Copyright 2007. Los Alamos National Security, LLC. 
+This software was produced under U.S. Government contract DE-AC52-06NA25396 
+for Los Alamos National Laboratory (LANL), which is operated by 
+Los Alamos National Security, LLC for the U.S. Department of Energy. 
+The U.S. Government has rights to use, reproduce, and distribute this software. 
+NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,
+EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  
+If software is modified to produce derivative works, such modified software 
+should be clearly marked, so as not to confuse it with the version available 
+from LANL.
+ 
+Additionally, redistribution and use in source and binary forms, with or 
+without modification, are permitted provided that the following conditions 
+are met:
+-   Redistributions of source code must retain the above copyright notice, 
+    this list of conditions and the following disclaimer. 
+-   Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution. 
+-   Neither the name of Los Alamos National Security, LLC, Los Alamos National
+    Laboratory, LANL, the U.S. Government, nor the names of its contributors
+    may be used to endorse or promote products derived from this software 
+    without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR 
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+                                                                                
+=========================================================================*/
+
+// .NAME ParticleExchange - read or get pointer to alive particles on this
+//                          process and exchange dead particles with neighbors
+//
+// .SECTION Description
+// ParticleExchange is initialized with physical size of particle space and
+// the margin of dead zone desired for each processor.  It is given the
+// physical x,y,z locations for particles on this processor and can get
+// the number of each neighbor processor.  Since the desired goal is to
+// populate every processor with the alive particles (which it enters this
+// class with) and dead particles belonging on the edges of all neighbors,
+// each processor categorizes its own particles and arranges to send them
+// to the appropriate neighbor, and to receive particles from each neighbor
+// which it adds to the location vectors.
+//
+// Information exchanged are x,y,z locations and velocities, mass, potential,
+// mask and integer unique tag per particle.  On receipt the particle status is
+// filled in with the number of the neighbor that shared the particle.  This
+// is to make the halo finder faster because instead of listing a particle
+// as just alive or dead, we know where the dead particle is located.
+//
+
+#ifndef ParticleExchange_h
+#define ParticleExchange_h
+
+#include "Message.h"
+
+#ifdef USE_VTK_COSMO 
+#include "CosmoDefinition.h"
+#include "vtkstd/string"
+#include "vtkstd/vector"
+
+using namespace vtkstd;
+#else
+#include "Definition.h"
+#include <string>
+#include <vector>
+
+using namespace std;
+#endif 
+
+#ifdef USE_VTK_COSMO
+class COSMO_EXPORT ParticleExchange {
+#else
+class ParticleExchange {
+#endif
+public:
+  ParticleExchange();
+  ~ParticleExchange();
+
+  // Set parameters for particle distribution
+  void setParameters(
+        POSVEL_T rL,            // Box size of the physical problem
+        POSVEL_T deadSize);     // Dead delta border for each processor
+
+  // Calculate the factor to add to locations when doing wraparound shares
+  void calculateOffsetFactor();
+
+  // Calculate this processor's subextents and the neighbor dead regions
+  void initialize();
+
+  // Calculate physical range of alive particles which must be shared
+  void calculateExchangeRegions();
+
+  // Set alive particle vectors which were created elsewhere
+  void setParticles(
+        vector<POSVEL_T>* xx,
+        vector<POSVEL_T>* yy,
+        vector<POSVEL_T>* zz,
+        vector<POSVEL_T>* vx,
+        vector<POSVEL_T>* vy,
+        vector<POSVEL_T>* vz,
+        vector<POSVEL_T>* mass,
+        vector<POTENTIAL_T>* potential,
+        vector<ID_T>* tag,
+        vector<MASK_T>* mask,
+        vector<STATUS_T>* status);
+
+  // Identify and exchange alive particles which must be shared with neighbors
+  void exchangeParticles();
+  void identifyExchangeParticles();
+  void exchangeNeighborParticles();
+  void exchange(
+        int sendTo,             // Neighbor to send particles to
+        int recvFrom,           // Neighbor to receive particles from
+        Message* sendMessage,
+        Message* recvMessage);
+
+  // Return data needed by other software
+  int getParticleCount()                { return this->particleCount; }
+
+private:
+  int    myProc;                // My processor number
+  int    numProc;               // Total number of processors
+
+  long   totalParticles;        // Number of particles on all files
+  int    headerSize;            // For BLOCK files
+
+  int    layoutSize[DIMENSION]; // Decomposition of processors
+  int    layoutPos[DIMENSION];  // Position of this processor in decomposition
+
+  POSVEL_T boxSize;             // Physical box size (rL)
+  POSVEL_T deadSize;            // Border size for dead particles
+
+  long   numberOfAliveParticles;        // Alive particles on this processor
+  long   numberOfDeadParticles;         // Dead particles received so far
+  long   particleCount;         // Running index used to store data
+                                // Ends up as the number of alive plus dead
+
+  POSVEL_T minMine[DIMENSION];  // Minimum alive particle not exchanged
+  POSVEL_T maxMine[DIMENSION];  // Maximum alive particle not exchanged
+  POSVEL_T minShare[DIMENSION]; // Minimum alive particle shared
+  POSVEL_T maxShare[DIMENSION]; // Maximum alive particle shared
+
+  int      neighbor[NUM_OF_NEIGHBORS];            // Neighbor processor indices
+  POSVEL_T minRange[NUM_OF_NEIGHBORS][DIMENSION]; // Min of range sent as dead
+  POSVEL_T maxRange[NUM_OF_NEIGHBORS][DIMENSION]; // Max of range sent as dead
+
+  int    overLoadFactor[NUM_OF_NEIGHBORS][DIMENSION];
+                                // When sending location factor to multiply
+                                // boxSize by for wraparound alteration
+
+  vector<ID_T> neighborParticles[NUM_OF_NEIGHBORS];
+                                // Particle indices sent to each neighbor as DEAD
+
+  vector<POSVEL_T>* xx;         // X location for particles on this processor
+  vector<POSVEL_T>* yy;         // Y location for particles on this processor
+  vector<POSVEL_T>* zz;         // Z location for particles on this processor
+  vector<POSVEL_T>* vx;         // X velocity for particles on this processor
+  vector<POSVEL_T>* vy;         // Y velocity for particles on this processor
+  vector<POSVEL_T>* vz;         // Z velocity for particles on this processor
+  vector<POSVEL_T>* ms;         // Mass for particles on this processor
+  vector<ID_T>* tag;            // Id tag for particles on this processor
+  vector<STATUS_T>* status;     // Particle is ALIVE or labeled with neighbor
+                                // processor index where it is ALIVE
+  vector<POTENTIAL_T>* pot;     // Potential for particles on this processor
+  vector<MASK_T>* mask;         // Mask for particles on this processor
+};
+
+#endif
diff --git a/src/halo-finder/src/Partition.cxx b/src/halo-finder/src/Partition.cxx
new file mode 100644
index 0000000..579d2fb
--- /dev/null
+++ b/src/halo-finder/src/Partition.cxx
@@ -0,0 +1,284 @@
+/*=========================================================================
+                                                                                
+Copyright (c) 2007, Los Alamos National Security, LLC
+
+All rights reserved.
+
+Copyright 2007. Los Alamos National Security, LLC. 
+This software was produced under U.S. Government contract DE-AC52-06NA25396 
+for Los Alamos National Laboratory (LANL), which is operated by 
+Los Alamos National Security, LLC for the U.S. Department of Energy. 
+The U.S. Government has rights to use, reproduce, and distribute this software. 
+NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,
+EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  
+If software is modified to produce derivative works, such modified software 
+should be clearly marked, so as not to confuse it with the version available 
+from LANL.
+ 
+Additionally, redistribution and use in source and binary forms, with or 
+without modification, are permitted provided that the following conditions 
+are met:
+-   Redistributions of source code must retain the above copyright notice, 
+    this list of conditions and the following disclaimer. 
+-   Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution. 
+-   Neither the name of Los Alamos National Security, LLC, Los Alamos National
+    Laboratory, LANL, the U.S. Government, nor the names of its contributors
+    may be used to endorse or promote products derived from this software 
+    without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR 
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+                                                                                
+=========================================================================*/
+
+#include <iostream>
+
+#include "Partition.h"
+#include "dims.h"
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Static class to control MPI and the partitioning of processors in
+// a Cartesian grid across the problem space.
+//
+/////////////////////////////////////////////////////////////////////////
+
+// Storage for Partition's static state.  Everything here has static
+// storage duration, so members without an explicit initializer start
+// out zeroed.
+
+// Non-zero once initialize() has run to completion
+int Partition::initialized = 0;
+
+// Rank of this process and total process count
+int Partition::myProc = -1;
+int Partition::numProc = 0;
+
+// Shape of the processor grid, this rank's coordinates within it, and
+// the ranks of the surrounding neighbors
+int Partition::decompSize[DIMENSION];
+int Partition::myPosition[DIMENSION];
+int Partition::neighbor[NUM_OF_NEIGHBORS];
+
+#ifndef USE_SERIAL_COSMO
+// Cartesian communicator built by initialize() (MPI builds only)
+MPI_Comm Partition::cartComm;
+#endif
+
+// Nothing to construct: the class holds only static members.
+Partition::Partition() {}
+
+// Nothing to release: the class owns no per-instance resources.
+Partition::~Partition() {}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Initialize MPI, allocate the processors across a Cartesian grid of
+// DIMENSION size and record this processor's position, id and neighbor ids
+//
+/////////////////////////////////////////////////////////////////////////
+
+// Set up the static partition state on first use; later calls return
+// immediately because `initialized` is already set.
+//
+// MPI builds take the communicator to decompose and derive the processor
+// grid, a Cartesian communicator and this rank's coordinates from it.
+// Serial builds take no argument and describe a 1x1x1 grid with this
+// process as rank 0.  MPI_Init is not called here, except in
+// USE_VTK_COSMO builds when MPI_Initialized reports MPI is not yet up.
+void Partition::initialize
+#ifndef USE_SERIAL_COSMO
+                          (MPI_Comm comm)
+#else
+                          ()
+#endif
+{
+  if (initialized)
+    return;
+
+#ifndef USE_SERIAL_COSMO
+  fprintf(stdout, "Using MPI Initialize\n"); // TEMP: DEBUG ONLY
+
+#ifdef USE_VTK_COSMO
+  // this is for when it is compiled against MPI but single processor
+  // on ParaView (client only, it won't MPI_Init itself)
+  int mpiStarted;
+  MPI_Initialized(&mpiStarted);
+  if (!mpiStarted)
+  {
+    int noArgs = 0;
+    MPI_Init(&noArgs, 0);
+  }
+#endif
+
+  // Rank and size within the communicator we were handed
+  MPI_Comm_rank(comm, &myProc);
+  MPI_Comm_size(comm, &numProc);
+
+  // Start from all zeros before asking for the decomposition
+  for (int dim = 0; dim < DIMENSION; dim++)
+    decompSize[dim] = 0;
+
+  // Compute the number of processors in each dimension
+  MY_Dims_create_3D(numProc, DIMENSION, decompSize);
+
+  // Create the Cartesian communicator, periodic in every dimension and
+  // with rank reordering switched off
+  int periodic[] = {1, 1, 1};
+  int reorder = 0;
+  MPI_Cart_create(comm,
+                  DIMENSION, decompSize, periodic, reorder, &cartComm);
+
+  // Reset my rank if it changed
+  MPI_Comm_rank(cartComm, &myProc);
+
+  // Get this processor's position in the Cartesian topology
+  MPI_Cart_coords(cartComm, myProc, DIMENSION, myPosition);
+#else
+  fprintf(stdout, "Using serial initialize\n"); // ** TEMP: DEBUG ONLY
+
+  // Single process: rank 0 of 1, sitting at the origin of a 1x1x1 grid
+  myProc = 0;
+  numProc = 1;
+  for (int dim = 0; dim < DIMENSION; dim++)
+  {
+    decompSize[dim] = 1;
+    myPosition[dim] = 0;
+  }
+#endif
+
+  // Set all my neighbor processor ids for communication
+  setNeighbors();
+
+#ifndef USE_VTK_COSMO
+  if (myProc == 0)
+    cout << "Partition 3D: [" << decompSize[0] << ":"
+         << decompSize[1] << ":" << decompSize[2] << "]" << endl;
+#endif
+
+  initialized = 1;
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Return the decomposition size of this problem
+//
+/////////////////////////////////////////////////////////////////////////
+
+// Copy the per-dimension processor counts into size, which must have
+// room for DIMENSION ints.
+void Partition::getDecompSize(int size[])
+{
+  int dim = 0;
+  while (dim < DIMENSION) {
+    size[dim] = decompSize[dim];
+    ++dim;
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Return the position of this processor within the Cartesian grid
+//
+/////////////////////////////////////////////////////////////////////////
+
+// Copy this processor's grid coordinates into pos, which must have room
+// for DIMENSION ints.
+void Partition::getMyPosition(int pos[])
+{
+  const int* src = myPosition;
+  for (int* dst = pos; dst != pos + DIMENSION; ++dst, ++src)
+    *dst = *src;
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Return the ranks of the neighbors of this processor using the
+// description in Definition.h
+//
+/////////////////////////////////////////////////////////////////////////
+
+// Copy the neighbor rank table into neigh, which must have room for
+// NUM_OF_NEIGHBORS ints.
+void Partition::getNeighbors(int neigh[])
+{
+  for (int n = NUM_OF_NEIGHBORS; n-- > 0; )
+    neigh[n] = neighbor[n];
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Get the id of a particular processor given its position in the topology
+//
+/////////////////////////////////////////////////////////////////////////
+
+// Return the rank of the processor at grid position (xpos, ypos, zpos).
+//
+// Serial builds have a single process, so the answer is always 0 and the
+// arguments are ignored.  MPI builds look the position up in the
+// Cartesian communicator with MPI_Cart_rank.  setNeighbors() passes
+// coordinates one step outside the grid; this relies on MPI_Cart_rank
+// wrapping out-of-range coordinates in periodic dimensions, and
+// initialize() creates the communicator periodic in all of them.
+int Partition::getNeighbor
+#ifdef USE_SERIAL_COSMO
+  (int , int , int )
+#else
+  (int xpos, int ypos, int zpos)
+#endif
+{
+#ifdef USE_SERIAL_COSMO
+  return 0;
+#else
+  // Automatic storage (rather than a function-local static) keeps this
+  // function free of shared mutable state
+  int pos[DIMENSION] = {xpos, ypos, zpos};
+
+  int neighborProc;
+  MPI_Cart_rank(cartComm, pos, &neighborProc);
+  return neighborProc;
+#endif
+}
+
+/////////////////////////////////////////////////////////////////////////
+//    
+// Every processor will have 26 neighbors because the cosmology structure
+// is a 3D torus.  Each will have 6 face neighbors, 12 edge neighbors and
+// 8 corner neighbors.
+//
+/////////////////////////////////////////////////////////////////////////
+
+// Fill the neighbor table with the ranks of all 26 surrounding grid
+// positions.  The suffix 0 / 1 in each index name means one step down /
+// up in that dimension; the local names below follow the same scheme.
+void Partition::setNeighbors()
+{
+  // This processor's own coordinates ...
+  const int xc = myPosition[0];
+  const int yc = myPosition[1];
+  const int zc = myPosition[2];
+
+  // ... and the coordinates one step to either side in each dimension
+  const int x0 = xc - 1, x1 = xc + 1;
+  const int y0 = yc - 1, y1 = yc + 1;
+  const int z0 = zc - 1, z1 = zc + 1;
+
+  // 6 face neighbors: one coordinate differs
+  neighbor[X0] = getNeighbor(x0, yc, zc);
+  neighbor[X1] = getNeighbor(x1, yc, zc);
+  neighbor[Y0] = getNeighbor(xc, y0, zc);
+  neighbor[Y1] = getNeighbor(xc, y1, zc);
+  neighbor[Z0] = getNeighbor(xc, yc, z0);
+  neighbor[Z1] = getNeighbor(xc, yc, z1);
+
+  // 12 edge neighbors: two coordinates differ
+  neighbor[X0_Y0] = getNeighbor(x0, y0, zc);
+  neighbor[X0_Y1] = getNeighbor(x0, y1, zc);
+  neighbor[X1_Y0] = getNeighbor(x1, y0, zc);
+  neighbor[X1_Y1] = getNeighbor(x1, y1, zc);
+
+  neighbor[Y0_Z0] = getNeighbor(xc, y0, z0);
+  neighbor[Y0_Z1] = getNeighbor(xc, y0, z1);
+  neighbor[Y1_Z0] = getNeighbor(xc, y1, z0);
+  neighbor[Y1_Z1] = getNeighbor(xc, y1, z1);
+
+  neighbor[Z0_X0] = getNeighbor(x0, yc, z0);
+  neighbor[Z0_X1] = getNeighbor(x1, yc, z0);
+  neighbor[Z1_X0] = getNeighbor(x0, yc, z1);
+  neighbor[Z1_X1] = getNeighbor(x1, yc, z1);
+
+  // 8 corner neighbors: all three coordinates differ
+  neighbor[X0_Y0_Z0] = getNeighbor(x0, y0, z0);
+  neighbor[X1_Y0_Z0] = getNeighbor(x1, y0, z0);
+  neighbor[X0_Y1_Z0] = getNeighbor(x0, y1, z0);
+  neighbor[X1_Y1_Z0] = getNeighbor(x1, y1, z0);
+  neighbor[X0_Y0_Z1] = getNeighbor(x0, y0, z1);
+  neighbor[X1_Y0_Z1] = getNeighbor(x1, y0, z1);
+  neighbor[X0_Y1_Z1] = getNeighbor(x0, y1, z1);
+  neighbor[X1_Y1_Z1] = getNeighbor(x1, y1, z1);
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// Shut down MPI
+//
+/////////////////////////////////////////////////////////////////////////
+
+// Put the rank bookkeeping back to its "not set up" values.  MPI itself
+// is not shut down here (the MPI_Finalize call is disabled).
+// NOTE(review): `initialized` stays set, so a later initialize() call
+// returns without redoing the setup -- confirm that is intended.
+void Partition::finalize()
+{
+  myProc = -1;
+  numProc = 0;
+}
diff --git a/src/halo-finder/src/Partition.h b/src/halo-finder/src/Partition.h
new file mode 100644
index 0000000..2a593f1
--- /dev/null
+++ b/src/halo-finder/src/Partition.h
@@ -0,0 +1,122 @@
+/*=========================================================================
+                                                                                
+Copyright (c) 2007, Los Alamos National Security, LLC
+
+All rights reserved.
+
+Copyright 2007. Los Alamos National Security, LLC. 
+This software was produced under U.S. Government contract DE-AC52-06NA25396 
+for Los Alamos National Laboratory (LANL), which is operated by 
+Los Alamos National Security, LLC for the U.S. Department of Energy. 
+The U.S. Government has rights to use, reproduce, and distribute this software. 
+NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,
+EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  
+If software is modified to produce derivative works, such modified software 
+should be clearly marked, so as not to confuse it with the version available 
+from LANL.
+ 
+Additionally, redistribution and use in source and binary forms, with or 
+without modification, are permitted provided that the following conditions 
+are met:
+-   Redistributions of source code must retain the above copyright notice, 
+    this list of conditions and the following disclaimer. 
+-   Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution. 
+-   Neither the name of Los Alamos National Security, LLC, Los Alamos National
+    Laboratory, LANL, the U.S. Government, nor the names of its contributors
+    may be used to endorse or promote products derived from this software 
+    without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR 
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+                                                                                
+=========================================================================*/
+
+// .NAME Partition - Partition MPI processors into cartesian grid
+//
+// .SECTION Description
+// Partition allows MPI to divide the number of processors it is given and
+// to set the position of this processor within the Cartesian grid.  Using
+// that information with wraparound, all neighbors of a processor are
+// also computed.  This class is static and will be shared by all classes
+// within the infrastructure.
+
+#ifndef Partition_h
+#define Partition_h
+
+
+#ifdef USE_VTK_COSMO
+#include "CosmoDefinition.h"
+#include "vtkstd/string"
+#include "vtkstd/vector"
+
+using namespace vtkstd;
+#else
+#include "Definition.h"
+#include <string>
+#include <vector>
+#include <mpi.h>
+
+using namespace std;
+#endif
+
+// Processor-grid bookkeeping shared by the whole program.  Every data
+// member is static: initialize() fills the state in and the accessors
+// read it back.
+#ifdef USE_VTK_COSMO
+class COSMO_EXPORT Partition {
+#else
+class Partition {
+#endif
+public:
+  Partition();
+  ~Partition();
+
+  // Set up / reset the shared state.  MPI builds accept the communicator
+  // to decompose; serial builds take no argument.
+#ifdef USE_SERIAL_COSMO
+  static void initialize();
+#else
+  static void initialize(MPI_Comm comm = MPI_COMM_WORLD);
+#endif
+  static void finalize();
+
+  // Set the processor numbers of neighbors in all directions
+  static void setNeighbors();
+
+#ifndef USE_SERIAL_COSMO
+  // Cartesian communicator (MPI builds only)
+  static MPI_Comm getComm()       { return cartComm; }
+#endif
+
+  // Total number of processors and the rank of this one
+  static int  getNumProc()              { return numProc; }
+  static int  getMyProc()               { return myProc; }
+
+  // Copy-out accessors: size and pos receive DIMENSION ints, neigh
+  // receives NUM_OF_NEIGHBORS ints
+  static void getDecompSize(int size[]);
+  static void getMyPosition(int pos[]);
+  static void getNeighbors(int neigh[]);
+
+  // Rank of the processor at the given grid position
+  static int  getNeighbor(int xpos, int ypos, int zpos);
+
+private:
+  static int initialized;               // Non-zero after initialize()
+  static int numProc;                   // Total number of processors
+  static int myProc;                    // My processor number
+
+#ifndef USE_SERIAL_COSMO
+  static MPI_Comm cartComm;             // Cartesian communicator
+#endif
+
+  static int decompSize[DIMENSION];     // Number of processors in each dim
+  static int myPosition[DIMENSION];     // My index in cartesian communicator
+
+  static int neighbor[NUM_OF_NEIGHBORS];// Neighbor processor ids
+};
+
+#endif
diff --git a/src/halo-finder/src/RCBForceTree.cxx b/src/halo-finder/src/RCBForceTree.cxx
new file mode 100644
index 0000000..8774902
--- /dev/null
+++ b/src/halo-finder/src/RCBForceTree.cxx
@@ -0,0 +1,1577 @@
+/*=========================================================================
+
+Copyright (c) 2007, Los Alamos National Security, LLC
+
+All rights reserved.
+
+Copyright 2007. Los Alamos National Security, LLC.
+This software was produced under U.S. Government contract DE-AC52-06NA25396
+for Los Alamos National Laboratory (LANL), which is operated by
+Los Alamos National Security, LLC for the U.S. Department of Energy.
+The U.S. Government has rights to use, reproduce, and distribute this software.
+NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,
+EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.
+If software is modified to produce derivative works, such modified software
+should be clearly marked, so as not to confuse it with the version available
+from LANL.
+
+Additionally, redistribution and use in source and binary forms, with or
+without modification, are permitted provided that the following conditions
+are met:
+-   Redistributions of source code must retain the above copyright notice,
+    this list of conditions and the following disclaimer.
+-   Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+-   Neither the name of Los Alamos National Security, LLC, Los Alamos National
+    Laboratory, LANL, the U.S. Government, nor the names of its contributors
+    may be used to endorse or promote products derived from this software
+    without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+=========================================================================*/
+
+/*=========================================================================
+
+Copyright (c) 2011-2012 Argonne National Laboratory
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+=========================================================================*/
+
+
+/*
+BG/Q tuned version of HACC: 69.2% of peak performance on 96 racks of Sequoia
+Argonne Leadership Computing Facility, Argonne, IL 60439
+Vitali Morozov (morozov@anl.gov)
+Hal Finkel (hfinkel@anl.gov)
+*/
+
+
+#include "hip/hip_runtime.h"
+#include "Timings.h"
+#include "RCBForceTree.h"
+#include "Partition.h"
+
+#include <cstring>
+#include <cstdio>
+#include <ctime>
+#include <stdexcept>
+#include <assert.h>
+using namespace std;
+
+#ifdef __HIPCC__
+#include <cudaUtil.h>
+
+// Tile and block sizes for the GPU code
+#define TILEX 4                       //Unroll factor in the x dimension, best if 2 or 4, could also add 8 but that is too many registers
+#define TILEY 4                       //Unroll factor in the y dimension, best if 2 or 4, could add 8 but that is too many registers
+#define BLOCKX 32                     //Block size in the x dimension (should be 32)
+#define BLOCKY 4                      //Block size in the y dimension
+#define MAXX 32                       //Maximum blocks in the X dimension, smaller=more reuse but less parallelism
+#define MAXY 256                      //Maximum blocks in the Y dimension, there isn't much reason to make this smaller
+
+// Round a non-negative integer up to the next multiple of TILEX / TILEY.
+// The argument is parenthesized so that an expression argument such as
+// ALIGNX(x << 1) or ALIGNX(a ? b : c) expands as intended.
+#define ALIGNX(n) (((n)+TILEX-1)/TILEX*TILEX)
+#define ALIGNY(n) (((n)+TILEY-1)/TILEY*TILEY)
+
+cudaDeviceSelector __selector__;
+#endif
+
+// Host/device function qualifiers; both expand to nothing when the file
+// is not being compiled by hipcc.
+#ifndef __HIPCC__
+#define __HOST__
+#define __DEVICE__
+#else
+#define __HOST__ __host__
+#define __DEVICE__ __device__
+#endif
+
+
+// References:
+// Emanuel Gafton and Stephan Rosswog. A fast recursive coordinate bisection tree for
+// neighbour search and gravity. Mon. Not. R. Astron. Soc. to appear, 2011.
+// http://arxiv.org/abs/1108.0028v1
+//
+// Atsushi Kawai, Junichiro Makino and Toshikazu Ebisuzaki.
+// Performance Analysis of High-Accuracy Tree Code Based on the Pseudoparticle
+// Multipole Method. The Astrophysical Journal Supplement Series, 151:13-33, 2004.
+// Related: http://arxiv.org/abs/astro-ph/0012041v1
+//
+// R. H. Hardin and N. J. Sloane
+// New Spherical 4-Designs. Discrete Math, 106/107 255-264, 1992.
+//
+// The library of spherical designs:
+// http://www2.research.att.com/~njas/sphdesigns/
+namespace {
+// Coordinate tables for a TDPTS-point arrangement of pseudo-particles.
+// The primary template is empty; only the point counts specialized
+// below provide the x/y/z arrays.
+template <int TDPTS>
+struct sphdesign {};
+
+// Declares the specialization for one point count; the arrays themselves
+// are defined further down.
+#define SPHDESIGN_SPECIALIZE(NPTS) \
+template <> \
+struct sphdesign<NPTS> \
+{ \
+  static const POSVEL_T x[NPTS]; \
+  static const POSVEL_T y[NPTS]; \
+  static const POSVEL_T z[NPTS]; \
+};
+
+SPHDESIGN_SPECIALIZE(1)
+SPHDESIGN_SPECIALIZE(2)
+SPHDESIGN_SPECIALIZE(3)
+SPHDESIGN_SPECIALIZE(4)
+SPHDESIGN_SPECIALIZE(6)
+SPHDESIGN_SPECIALIZE(12)
+SPHDESIGN_SPECIALIZE(14)
+
+#undef SPHDESIGN_SPECIALIZE
+
+// 1 point: this is not a t-design, but puts the monopole moment
+// at the center of mass.
+const POSVEL_T sphdesign<1>::x[] = { 0 };
+const POSVEL_T sphdesign<1>::y[] = { 0 };
+const POSVEL_T sphdesign<1>::z[] = { 0 };
+
+// 2 points: opposite ends of the x axis.
+const POSVEL_T sphdesign<2>::x[] = { 1.0, -1.0 };
+const POSVEL_T sphdesign<2>::y[] = { 0, 0 };
+const POSVEL_T sphdesign<2>::z[] = { 0, 0 };
+
+// 3 points: an equilateral triangle in the z = 0 plane.
+const POSVEL_T sphdesign<3>::x[] = { 1.0, -.5, -.5 };
+const POSVEL_T sphdesign<3>::y[] = {
+  0, .86602540378443864675, -.86602540378443864675
+};
+const POSVEL_T sphdesign<3>::z[] = { 0, 0, 0 };
+
+// 4 points: the vertices of a regular tetrahedron.
+const POSVEL_T sphdesign<4>::x[] = {
+  .577350269189625763, .577350269189625763,
+  -.577350269189625763, -.577350269189625763
+};
+const POSVEL_T sphdesign<4>::y[] = {
+  .577350269189625763, -.577350269189625763,
+  .577350269189625763, -.577350269189625763
+};
+const POSVEL_T sphdesign<4>::z[] = {
+  .577350269189625763, -.577350269189625763,
+  -.577350269189625763, .577350269189625763
+};
+
+// 6 points: the positive and negative unit vectors along each axis.
+const POSVEL_T sphdesign<6>::x[] = { 1.0, -1.0, 0, 0, 0, 0 };
+const POSVEL_T sphdesign<6>::y[] = { 0, 0, 1.0, -1.0, 0, 0 };
+const POSVEL_T sphdesign<6>::z[] = { 0, 0, 0, 0, 1.0, -1.0 };
+
+// This is a 3-D 12-point spherical 4-design
+// (the vertices of an icosahedron) from Hardin and Sloane.
+const POSVEL_T sphdesign<12>::x[] = {
+  0, 0,
+  0.525731112119134, -0.525731112119134,
+  0.85065080835204, -0.85065080835204,
+  0, 0,
+  -0.525731112119134, 0.525731112119134,
+  -0.85065080835204, 0.85065080835204
+};
+const POSVEL_T sphdesign<12>::y[] = {
+  0.85065080835204, 0.85065080835204,
+  0, 0,
+  0.525731112119134, 0.525731112119134,
+  -0.85065080835204, -0.85065080835204,
+  0, 0,
+  -0.525731112119134, -0.525731112119134
+};
+const POSVEL_T sphdesign<12>::z[] = {
+  0.525731112119134, -0.525731112119134,
+  0.85065080835204, 0.85065080835204,
+  0, 0,
+  -0.525731112119134, 0.525731112119134,
+  -0.85065080835204, -0.85065080835204,
+  0, 0
+};
+
+// This is a 3-D 14-point spherical 4-design by
+// R. H. Hardin and N. J. A. Sloane.
+const POSVEL_T sphdesign<14>::x[] = {
+  1.0e0,
+  5.947189772040725e-1, 5.947189772040725e-1, 5.947189772040725e-1,
+  -5.947189772040725e-1, -5.947189772040725e-1, -5.947189772040725e-1,
+  3.012536847870683e-1, 3.012536847870683e-1, 3.012536847870683e-1,
+  -3.012536847870683e-1, -3.012536847870683e-1, -3.012536847870683e-1,
+  -1.0e0
+};
+const POSVEL_T sphdesign<14>::y[] = {
+  0.0e0,
+  1.776539926025823e-1, -7.678419429698292e-1, 5.90187950367247e-1,
+  1.776539926025823e-1, 5.90187950367247e-1, -7.678419429698292e-1,
+  8.79474443923065e-1, -7.588425179318781e-1, -1.206319259911869e-1,
+  8.79474443923065e-1, -1.206319259911869e-1, -7.588425179318781e-1,
+  0.0e0
+};
+const POSVEL_T sphdesign<14>::z[] = {
+  0.0e0,
+  7.840589244857197e-1, -2.381765915652909e-1, -5.458823329204288e-1,
+  -7.840589244857197e-1, 5.458823329204288e-1, 2.381765915652909e-1,
+  3.684710570566285e-1, 5.774116818882528e-1, -9.458827389448813e-1,
+  -3.684710570566285e-1, 9.458827389448813e-1, -5.774116818882528e-1,
+  0.0e0
+};
+} // anonymous namespace
+
+// Note: In Gafton and Rosswog the far-field force contribution is calculated
+// per-cell (at the center of mass), and then a Taylor expansion about the center
+// of mass is used to calculate the force on the individual particles. For this to
+// work, the functional form of the force must be known (because the Jacobian
+// and Hessian are required). Here, however, the functional form is not known,
+// and so the pseudo-particle method of Makino is used instead.
+
+// Construct the tree: record the caller's particle arrays and tuning
+// parameters, allocate the host (and, under hipcc, device) work buffers,
+// then run createRCBForceTree(), printStats() and calcInternodeForces().
+//
+//   minLoc, maxLoc            DIMENSION-element bounds copied to
+//                             minRange / maxRange
+//   minForceLoc, maxForceLoc  DIMENSION-element bounds copied to
+//                             minForceRange / maxForceRange
+//   count                     number of particles; also the element count
+//                             used when registering the particle arrays
+//   xLoc..zLoc, xVel..zVel, ms
+//                             caller-owned arrays; the pointers are
+//                             stored, not copied, and under hipcc are
+//                             registered with hipHostRegister
+//   phiLoc, idLoc, maskLoc    caller-owned arrays, pointers stored
+//   avgMass, fsm, r, ppc      stored as particleMass, fsrrmax, rsm and
+//                             ppContract
+//   oa                        opening angle; its sinf and tanf are stored
+//   nd                        stored as nDirect; ALIGNY(nd) is also the
+//                             element count of each device particle buffer
+//   ds                        stored as depthSafety (a shift applied to
+//                             the node-count estimate)
+//   tmin                      stored as taskPartMin
+//   fl, fcoeff                force law and coefficient to use; when fl is
+//                             null a ForceLawNewton is created and owned
+//                             by this object and the coefficient is 1.0
+//
+// Throws std::runtime_error when a host work buffer cannot be allocated.
+// NOTE(review): nd == 0 makes the node-count estimate divide by zero;
+// callers presumably always pass a positive value -- confirm.
+template <int TDPTS>
+RCBForceTree<TDPTS>::RCBForceTree(
+                         POSVEL_T* minLoc,
+                         POSVEL_T* maxLoc,
+                         POSVEL_T* minForceLoc,
+                         POSVEL_T* maxForceLoc,
+                         ID_T count,
+                         POSVEL_T* xLoc,
+                         POSVEL_T* yLoc,
+                         POSVEL_T* zLoc,
+                         POSVEL_T* xVel,
+                         POSVEL_T* yVel,
+                         POSVEL_T* zVel,
+                         POSVEL_T* ms,
+                         POSVEL_T* phiLoc,
+                         ID_T *idLoc,
+                         MASK_T *maskLoc,
+                         POSVEL_T avgMass,
+                         POSVEL_T fsm,
+                         POSVEL_T r,
+                         POSVEL_T oa,
+                         ID_T nd,
+                         ID_T ds,
+                         ID_T tmin,
+                         ForceLaw *fl,
+                         float fcoeff,
+                         POSVEL_T ppc)
+{
+  // Keep pointers to the caller's particle arrays (no copy is made)
+  particleCount = count;
+
+  xx = xLoc;
+  yy = yLoc;
+  zz = zLoc;
+  vx = xVel;
+  vy = yVel;
+  vz = zVel;
+  mass = ms;
+
+  numThreads=1;
+
+  // static size for the interaction list
+  #define VMAX ALIGNY(16384)
+  nx_v=(POSVEL_T*)malloc(VMAX*sizeof(POSVEL_T)*numThreads);
+  ny_v=(POSVEL_T*)malloc(VMAX*sizeof(POSVEL_T)*numThreads);
+  nz_v=(POSVEL_T*)malloc(VMAX*sizeof(POSVEL_T)*numThreads);
+  nm_v=(POSVEL_T*)malloc(VMAX*sizeof(POSVEL_T)*numThreads);
+  bool hostAllocOk = nx_v && ny_v && nz_v && nm_v;
+
+#ifdef __HIPCC__
+  // One event and one stream slot per thread
+  event_v=(hipEvent_t*)malloc(sizeof(hipEvent_t)*numThreads);
+  stream_v=(hipStream_t*)malloc(sizeof(hipStream_t)*numThreads);
+  hostAllocOk = hostAllocOk && event_v && stream_v;
+#endif
+
+  // Every host allocation is verified before any GPU resource is
+  // created, so a failure here has nothing else to undo
+  if (!hostAllocOk) {
+    free(nx_v);
+    free(ny_v);
+    free(nz_v);
+    free(nm_v);
+#ifdef __HIPCC__
+    free(event_v);
+    free(stream_v);
+#endif
+    throw std::runtime_error("RCBForceTree: host buffer allocation failed");
+  }
+
+#ifdef __HIPCC__
+  // Register the interaction lists and the caller's particle arrays
+  // with the HIP runtime
+  hipHostRegister(nx_v,VMAX*sizeof(POSVEL_T)*numThreads,0);
+  hipHostRegister(ny_v,VMAX*sizeof(POSVEL_T)*numThreads,0);
+  hipHostRegister(nz_v,VMAX*sizeof(POSVEL_T)*numThreads,0);
+  hipHostRegister(nm_v,VMAX*sizeof(POSVEL_T)*numThreads,0);
+  hipHostRegister(xx,count*sizeof(POSVEL_T),0);
+  hipHostRegister(yy,count*sizeof(POSVEL_T),0);
+  hipHostRegister(zz,count*sizeof(POSVEL_T),0);
+  hipHostRegister(mass,count*sizeof(POSVEL_T),0);
+  hipHostRegister(vx,count*sizeof(POSVEL_T),0);
+  hipHostRegister(vy,count*sizeof(POSVEL_T),0);
+  hipHostRegister(vz,count*sizeof(POSVEL_T),0);
+
+  // Device particle buffers hold ALIGNY(nd) elements each; device
+  // interaction lists mirror the host ones
+  int size=ALIGNY(nd);
+  hipMalloc(&d_xx,size*sizeof(POSVEL_T)*numThreads);
+  hipMalloc(&d_yy,size*sizeof(POSVEL_T)*numThreads);
+  hipMalloc(&d_zz,size*sizeof(POSVEL_T)*numThreads);
+  hipMalloc(&d_vx,size*sizeof(POSVEL_T)*numThreads);
+  hipMalloc(&d_vy,size*sizeof(POSVEL_T)*numThreads);
+  hipMalloc(&d_vz,size*sizeof(POSVEL_T)*numThreads);
+  hipMalloc(&d_mass,size*sizeof(POSVEL_T)*numThreads);
+
+  hipMalloc(&d_nx_v,VMAX*sizeof(POSVEL_T)*numThreads);
+  hipMalloc(&d_ny_v,VMAX*sizeof(POSVEL_T)*numThreads);
+  hipMalloc(&d_nz_v,VMAX*sizeof(POSVEL_T)*numThreads);
+  hipMalloc(&d_nm_v,VMAX*sizeof(POSVEL_T)*numThreads);
+  cudaCheckError();
+
+  for(int i=0;i<numThreads;i++) {
+    hipEventCreate(&event_v[i]);
+    hipStreamCreate(&stream_v[i]);
+  }
+  cudaCheckError();
+#endif
+
+  phi = phiLoc;
+  id = idLoc;
+  mask = maskLoc;
+
+  particleMass = avgMass;
+  fsrrmax = fsm;
+  rsm = r;
+  sinOpeningAngle = sinf(oa);
+  tanOpeningAngle = tanf(oa);
+  nDirect = nd;
+  depthSafety = ds;
+  taskPartMin = tmin;
+  ppContract = ppc;
+
+  // Find the grid size of this chaining mesh
+  for (int dim = 0; dim < DIMENSION; dim++) {
+    minRange[dim] = minLoc[dim];
+    maxRange[dim] = maxLoc[dim];
+    minForceRange[dim] = minForceLoc[dim];
+    maxForceRange[dim] = maxForceLoc[dim];
+  }
+
+  if (fl) {
+    // Caller-supplied force law: used but not owned
+    m_own_fl = false;
+    m_fl = fl;
+    m_fcoeff = fcoeff;
+  } else {
+    //maybe change this to Newton's law or something
+    m_own_fl = true;
+    m_fl = new ForceLawNewton();
+    m_fcoeff = 1.0;
+  }
+
+  // Because the tree may be built in parallel, and no efficient way of locking
+  // the tree seems to be available in OpenMP (no reader/writer locks, etc.),
+  // we just estimate the number of tree nodes that will be needed. Hopefully,
+  // this will be an over estimate. If we need more than this, then tree nodes
+  // that really should be subdivided will not be.
+  //
+  // If the tree were perfectly balanced, then it would have a depth of
+  // log_2(particleCount/nDirect). The tree needs to have (2^depth)+1 entries.
+  // To that, a safety factor is added to the depth.
+  ID_T nds = (((ID_T)(particleCount/(POSVEL_T)nDirect)) << depthSafety) + 1;
+  tree.reserve(nds);
+
+  int nthreads = 1;
+
+  // Time the build using this thread's CPU-time clock
+  timespec b_start, b_end;
+  clock_gettime(CLOCK_THREAD_CPUTIME_ID, &b_start);
+
+  // Create the recursive RCB tree from the particle locations
+  createRCBForceTree();
+
+  clock_gettime(CLOCK_THREAD_CPUTIME_ID, &b_end);
+  double b_time = (b_end.tv_sec - b_start.tv_sec);
+  b_time += 1e-9*(b_end.tv_nsec - b_start.tv_nsec);
+
+  printStats(b_time);
+
+  // Interaction lists.
+  inx.resize(nthreads);
+  iny.resize(nthreads);
+  inz.resize(nthreads);
+  inm.resize(nthreads);
+  iq.resize(nthreads);
+
+  calcInternodeForces();
+}
+
+// Release what the constructor acquired: the owned force law (if any),
+// the HIP host registrations, device buffers, events and streams, and
+// finally the host interaction-list buffers.  The caller's particle
+// arrays are unregistered but not freed.
+template <int TDPTS>
+RCBForceTree<TDPTS>::~RCBForceTree()
+{
+  if (m_own_fl) {
+    delete m_fl;
+  }
+
+#ifdef __HIPCC__
+  // Host arrays the constructor registered with the HIP runtime
+  void* registered[] = {
+    xx, yy, zz, mass, vx, vy, vz,
+    nx_v, ny_v, nz_v, nm_v
+  };
+  for (size_t i = 0; i < sizeof(registered)/sizeof(registered[0]); ++i) {
+    hipHostUnregister(registered[i]);
+  }
+
+  // Device-side buffers
+  void* deviceBufs[] = {
+    d_xx, d_yy, d_zz, d_vx, d_vy, d_vz, d_mass,
+    d_nx_v, d_ny_v, d_nz_v, d_nm_v
+  };
+  for (size_t i = 0; i < sizeof(deviceBufs)/sizeof(deviceBufs[0]); ++i) {
+    hipFree(deviceBufs[i]);
+  }
+  cudaCheckError();
+
+  // Per-thread events and streams, then the arrays that held them
+  for (int t = 0; t < numThreads; ++t) {
+    hipEventDestroy(event_v[t]);
+    hipStreamDestroy(stream_v[t]);
+  }
+  cudaCheckError();
+
+  free(event_v);
+  free(stream_v);
+#endif
+
+  // Host interaction-list buffers
+  free(nx_v);
+  free(ny_v);
+  free(nz_v);
+  free(nm_v);
+}
+
+// Prints post-build statistics for the local tree on rank 0: particle count,
+// node count and capacity, leaf counts, mean/max particles per non-empty leaf
+// and the build time.
+//
+// buildTime: tree build time in seconds, printed verbatim.
+template <int TDPTS>
+void RCBForceTree<TDPTS>::printStats(double buildTime)
+{
+  size_t zeroLeafNodes = 0;
+  size_t nonzeroLeafNodes = 0;
+  size_t maxPPN = 0;
+  size_t leafParts = 0;
+
+  // A node is a leaf when both child indices are 0. The scan starts at the
+  // root (index 0) so that a tree consisting of the root alone is reported
+  // as one leaf instead of none.
+  for (ID_T tl = 0; tl < (ID_T) tree.size(); ++tl) {
+    if (tree[tl].cl != 0 || tree[tl].cr != 0) {
+      continue; // interior node
+    }
+
+    if (tree[tl].count > 0) {
+      ++nonzeroLeafNodes;
+
+      leafParts += tree[tl].count;
+      maxPPN = std::max((size_t) tree[tl].count, maxPPN);
+    } else {
+      ++zeroLeafNodes;
+    }
+  }
+
+  double localParticleCount = particleCount;
+  double localTreeSize = tree.size();
+  double localTreeCapacity = tree.capacity();
+  double localLeaves = zeroLeafNodes+nonzeroLeafNodes;
+  double localEmptyLeaves = zeroLeafNodes;
+  // Avoid 0/0 (printed as NaN) when there is no non-empty leaf.
+  double localMeanPPN = (nonzeroLeafNodes > 0)
+                          ? leafParts/((double) nonzeroLeafNodes)
+                          : 0.0;
+  unsigned long localMaxPPN = maxPPN;
+  double localBuildTime = buildTime;
+
+  if ( Partition::getMyProc() == 0 ) {
+    printf("\ttree post-build statistics (local for rank 0):\n");
+    printf("\t\tparticles: %.2f\n", localParticleCount);
+    printf("\t\tnodes: %.2f (allocated:  %.2f)\n", localTreeSize, localTreeCapacity);
+    printf("\t\tleaves: %.2f (empty: %.2f)\n", localLeaves, localEmptyLeaves);
+    printf("\t\tmean ppn: %.2f (max ppn: %lu)\n", localMeanPPN, localMaxPPN);
+    printf("\t\tbuild time: %g s\n", localBuildTime);
+  }
+}
+
+
+// Declared here with C linkage; the definition is not in this part of the file.
+// The tree builder calls it on a node's `count` particles to obtain the
+// center-of-mass coordinates (xc) and to recompute the bounds (xmin, xmax).
+// xmin, xmax and xc are the outputs; the remaining arrays are read-only.
+extern "C" void cm(ID_T count, const POSVEL_T*  xx, const POSVEL_T*  yy,
+                      const POSVEL_T*  zz, const POSVEL_T*  mass,
+                      POSVEL_T*  xmin, POSVEL_T*  xmax, POSVEL_T*  xc);
+
+// Returns the smallest of the six per-axis distances from the point xc to the
+// faces of the axis-aligned box [xmin, xmax] (three components each).
+static inline POSVEL_T pptdr(const POSVEL_T*  xmin, const POSVEL_T*  xmax, const POSVEL_T*  xc)
+{
+  // Fold the candidates one at a time, starting from the lower-face z/y pair;
+  // std::min keeps its first argument when the two compare equal.
+  POSVEL_T nearest = std::min(xc[1] - xmin[1], xc[2] - xmin[2]);
+  nearest = std::min(xc[0] - xmin[0], nearest);
+  nearest = std::min(xmax[2] - xc[2], nearest);
+  nearest = std::min(xmax[1] - xc[1], nearest);
+  nearest = std::min(xmax[0] - xc[0], nearest);
+  return nearest;
+}
+
+// Fills ppx/ppy/ppz with the TDPTS pseudoparticle positions: the
+// sphdesign<TDPTS> points scaled by tdr and translated to the center xc.
+template <int TDPTS>
+static inline void pppts(POSVEL_T tdr, const POSVEL_T*  xc,
+                         POSVEL_T*  ppx, POSVEL_T*  ppy, POSVEL_T*  ppz)
+{
+  int k = 0;
+  while (k < TDPTS) {
+    ppx[k] = xc[0] + tdr*sphdesign<TDPTS>::x[k];
+    ppy[k] = xc[1] + tdr*sphdesign<TDPTS>::y[k];
+    ppz[k] = xc[2] + tdr*sphdesign<TDPTS>::z[k];
+    ++k;
+  }
+}
+
+// Distributes the mass of `count` particles onto TDPTS pseudoparticles.
+//
+// For particle i and pseudoparticle j the weight added to ppm[j] is
+// mass[i]*(odr0 + odr1 + odr2), where
+//   odr0 = 1/K                                   (K = TDPTS)
+//   odr1 = (3/K)*(ri/tdr)*aij
+//   odr2 = (5/K)*(ri/tdr)^2*0.5*(3*aij^2 - 1)
+// ri and rj are the distances of the particle and the pseudoparticle from xc
+// and aij is the cosine of the angle between the two offset vectors.
+//
+// count            number of particles in xx/yy/zz/mass
+// xx, yy, zz, mass particle positions and masses
+// xc               expansion center (three components)
+// ppx, ppy, ppz    pseudoparticle positions
+// ppm              pseudoparticle masses; accumulated into, not reset here
+// tdr              radius used to normalise ri
+template <int TDPTS>
+static inline void pp(ID_T count, const POSVEL_T*  xx, const POSVEL_T*  yy,
+                      const POSVEL_T*  zz, const POSVEL_T*  mass, const POSVEL_T*  xc,
+                      const POSVEL_T*  ppx, const POSVEL_T*  ppy, const POSVEL_T*  ppz,
+                      POSVEL_T*  ppm, POSVEL_T tdr)
+{
+  POSVEL_T K = TDPTS;
+  POSVEL_T odr0 = 1/K;
+
+  for (int i = 0; i < count; ++i) {
+    POSVEL_T xi = xx[i] - xc[0];
+    POSVEL_T yi = yy[i] - xc[1];
+    POSVEL_T zi = zz[i] - xc[2];
+    POSVEL_T ri = sqrtf(xi*xi + yi*yi + zi*zi);
+
+    for (int j = 0; j < TDPTS; ++j) {
+      POSVEL_T xj = ppx[j] - xc[0];
+      POSVEL_T yj = ppy[j] - xc[1];
+      POSVEL_T zj = ppz[j] - xc[2];
+      POSVEL_T rj2 = xj*xj + yj*yj + zj*zj;
+
+      POSVEL_T odr1 = 0, odr2 = 0;
+      // The angular terms need both offsets to be non-zero. With ri == 0
+      // (particle exactly at xc) the cosine would be 0/0 = NaN and would
+      // poison ppm[j]; both terms carry a factor of ri, so their correct
+      // value in that case is 0.
+      if (rj2 != 0 && ri != 0) {
+        POSVEL_T rj  = sqrtf(rj2);
+        POSVEL_T aij = (xi*xj + yi*yj + zi*zj)/(ri*rj);
+
+        odr1 = (3/K)*(ri/tdr)*aij;
+        odr2 = (5/K)*(ri/tdr)*(ri/tdr)*0.5*(3*aij*aij - 1);
+      }
+
+      ppm[j] += mass[i]*(odr0 + odr1 + odr2);
+    }
+  }
+}
+
+#ifdef __HIPCC__
+
+// Wide signed integer used for element indices in the bounds-checked tile loader.
+typedef long long int int64;
+
+// Device helper: returns the value at *t, read through the __ldg intrinsic.
+template<typename T>
+__device__ __forceinline__
+T load(T *t)
+{
+  return __ldg(t);  //texture load
+}
+
+// Forward declaration: copies TILE_SIZE consecutive elements from `in` to `out`.
+template<int TILE_SIZE, typename T>__device__ void loadT(T *  out, const T *  in);
+
+//generic version (inefficient)
+// Issues one __ldg per element; the loop carries an unroll pragma and its
+// trip count TILE_SIZE is a compile-time constant.
+template<int TILE_SIZE, typename T>
+__device__ __forceinline__
+void loadT(T *  out, const T *  in) {
+  #pragma unroll
+  for(int i=0;i<TILE_SIZE;i++) {
+    out[i]=__ldg(in+i);
+  }
+}
+
+//Vector loads
+// Two-float tile: a single float2 load and store instead of two scalar loads.
+// NOTE(review): the reinterpret_casts assume `in` and `out` are aligned for
+// float2 - confirm against the callers.
+template<>
+__device__ __forceinline__
+void loadT<2,float>(float *  out, const float *  in) {
+  *reinterpret_cast<float2*>(out)=load(reinterpret_cast<const float2*>(in));
+}
+// Four-float tile: a single float4 load and store instead of four scalar loads.
+// NOTE(review): the reinterpret_casts assume `in` and `out` are aligned for
+// float4 - confirm against the callers.
+template<>
+__device__ __forceinline__
+void loadT<4,float>(float *  out, const float *  in) {
+  *reinterpret_cast<float4*>(out)=load(reinterpret_cast<const float4*>(in));
+}
+
+//static __device__ __forceinline__ float __internal_fast_rsqrtf(float a)
+//{
+//  float r;
+//  asm ("rsqrt.approx.ftz.f32 %0,%1;" : "=f"(r) : "f"(a));
+//  return r;
+//}
+
+//computes the forces between tiles i and j, adds the change in force to xi,yi,zi
+//
+// TX / TY          number of elements in the j tile / i tile
+// xxi, yyi, zzi    positions of the TY target particles
+// xxj..zzj, massj  positions and masses of the TX source particles
+// xi, yi, zi       per-target accumulators (length TY), updated in place
+// ma0..ma5         polynomial coefficients; unused at present because the
+//                  polynomial term below is commented out
+// mp_rsm2          added to r^2 before the inverse square root
+// fsrrmax2         squared cutoff: pairs with r^2 >= fsrrmax2 contribute nothing
+//
+// For every (i,j) pair: f = massj[j] / (r^2 + mp_rsm2)^(3/2) when
+// 0 < r^2 < fsrrmax2, else 0; f times the separation vector is accumulated.
+template<int TX, int TY>
+__device__ __forceinline__ void computeForces(POSVEL_T xxi[], POSVEL_T yyi[], POSVEL_T zzi[],
+                                              POSVEL_T xxj[], POSVEL_T yyj[], POSVEL_T zzj[], POSVEL_T massj[],
+                                              POSVEL_T xi[], POSVEL_T yi[], POSVEL_T zi[],
+                                              POSVEL_T ma0, POSVEL_T ma1, POSVEL_T ma2, POSVEL_T ma3, POSVEL_T ma4, POSVEL_T ma5,
+                                              POSVEL_T mp_rsm2, POSVEL_T fsrrmax2) {
+
+  #pragma unroll
+  for(int i=0;i<TY;i++) {
+    #pragma unroll
+    for(int j=0;j<TX;j++) {
+      // Separation vector from target i to source j.
+      POSVEL_T dxc = xxj[j] - xxi[i];                                                                //1 FADD
+      POSVEL_T dyc = yyj[j] - yyi[i];                                                                //1 FADD
+      POSVEL_T dzc = zzj[j] - zzi[i];                                                                //1 FADD
+
+      POSVEL_T r2 = dxc * dxc + dyc * dyc + dzc * dzc;                                               //1 FMUL 2 FMA
+      POSVEL_T v=r2+mp_rsm2;                                                                         //1 FADD
+      POSVEL_T v3=v*v*v;                                                                             //2 FMUL
+
+      // f = 1/sqrt(v^3)
+      POSVEL_T f = __frsqrt_rn(v3);                                                        //1 MUFU,
+      // MDS: Should ask someone why this line is dangling
+      //       - ( ma0 + r2*(ma1 + r2*(ma2 + r2*(ma3 + r2*(ma4 + r2*ma5)))));                          //5 FMA, 1 FADD
+//#define BUG
+#ifndef BUG
+      // The boolean converts to 0 or 1, zeroing pairs outside the cutoff and
+      // the r2 == 0 case.
+      f*=massj[j]*(r2<fsrrmax2 && r2>0.0f);                                                          //2 FMUL, 1 FSETP, 1 FCMP
+#else
+      f*=massj[j];                                                                                   //1 FMUL
+      f*=(r2<fsrrmax2 && r2>0.0f);                                                                   //1 FMUL, 1 FSETP, 1 FCMP
+#endif
+
+      xi[i] = xi[i] + f * dxc;                                                                       //1 FMA
+      yi[i] = yi[i] + f * dyc;                                                                       //1 FMA
+      zi[i] = zi[i] + f * dzc;                                                                       //1 FMA
+    }
+  }
+}
+
+//loads a tile from memory.  Use checkBounds and loadMass to disable bounds check or mass load at compile time
+//
+// i        tile index; the tile covers elements [TILE_SIZE*i, TILE_SIZE*i + TILE_SIZE)
+// bounds   number of valid elements; only consulted when checkBounds is true
+// xx..mass source arrays
+// xxi..massi destination tile arrays of length TILE_SIZE
+//
+// With checkBounds, elements at or beyond `bounds` are filled with 0.0f
+// instead of being read. With loadMass == false neither `mass` nor `massi` is
+// touched in the active (#if 1) branch, which is why callers may pass NULL.
+template <bool checkBounds, bool loadMass, int TILE_SIZE>
+__device__ __forceinline__
+void loadTile(int i, int bounds,
+              const POSVEL_T*  xx, const POSVEL_T*  yy, const POSVEL_T*  zz, const POSVEL_T*  mass,
+              POSVEL_T xxi[], POSVEL_T yyi[], POSVEL_T zzi[], POSVEL_T massi[]) {
+  if(checkBounds) {
+  #pragma unroll
+  for(int64 u=0;u<TILE_SIZE;u++) {
+    int64 idx=TILE_SIZE*i+u;                                                                        // 1 IMAD
+
+#if 1
+      // Select-based variant: out-of-range elements become 0.0f.
+      bool cond=idx<bounds;
+      xxi[u] = (cond) ? load(xx+idx) : 0.0f;                                                     // 1 ISETP, 1 LDG, 2 IMAD, 1 MOV
+      yyi[u] = (cond) ? load(yy+idx) : 0.0f;                                                     // 1 ISETP, 1 LDG, 2 IMAD, 1 MOV
+      zzi[u] = (cond) ? load(zz+idx) : 0.0f;                                                     // 1 ISETP, 1 LDG, 2 IMAD, 1 MOV
+      if(loadMass) massi[u] = (cond) ? load(mass+idx) : 0.0f;                                    // 1 ISETP, 1 LDG, 2 IMAD, 1 MOV
+#else
+      // Disabled branch-based variant, kept for reference.
+      massi[u] = 0.0f;                                                                           //1 MOV
+      if(idx<bounds) {                                                                           //1 ISETP, 1 BRA
+        xxi[u] = load(xx+idx);                                                                   //1 LDG, 2 IMAD
+        yyi[u] = load(yy+idx);                                                                   //1 LDG, 2 IMAD
+        zzi[u] = load(zz+idx);                                                                   //1 LDG, 2 IMAD
+        if(loadMass) massi[u] = load(mass+idx);                                                  //1 LDG, 2 IMAD
+      }
+#endif
+    }
+  } else {
+
+    // Unchecked path: the whole tile is read through loadT.
+    int idx=TILE_SIZE*i;
+    loadT<TILE_SIZE>(xxi,xx+idx);                                                                //1 LDG, 2 IMAD
+    loadT<TILE_SIZE>(yyi,yy+idx);                                                                //1 LDG, 2 IMAD
+    loadT<TILE_SIZE>(zzi,zz+idx);                                                                //1 LDG, 2 IMAD
+    if(loadMass) loadT<TILE_SIZE>(massi,mass+idx);                                               //1 LDG, 2 IMAD
+  }
+}
+
+//applies the force in xi,yi,zi to update vx, vy, vz
+//use checkBounds to disable bounds checking at compile time
+//
+// i          tile index; element u of the tile maps to index TILE_SIZE*i+u
+// bounds     number of valid elements; only consulted when checkBounds is true
+// fcoeff     scale factor applied to each accumulated component
+// xi, yi, zi accumulated per-element values for this tile
+// vx, vy, vz destination arrays, updated through atomicWarpReduceAndUpdate
+//            (defined outside this block; presumably a warp-level reduction
+//            followed by an atomic add - confirm at its definition)
+template <bool checkBounds, int TILE_SIZE>
+__device__ __forceinline__
+void applyForce(int i, int bounds,POSVEL_T fcoeff,
+                const POSVEL_T xi[], const POSVEL_T yi[], const POSVEL_T zi[],
+                POSVEL_T *vx, POSVEL_T *vy, POSVEL_T *vz) {
+    #pragma unroll
+    for(int u=0;u<TILE_SIZE;u++) {
+      int idx=TILE_SIZE*i+u;                                                                         //1 IMAD
+
+      if(!checkBounds || idx<bounds)
+      {                                                                                           //1 ISETP
+        atomicWarpReduceAndUpdate(vx+idx,fcoeff * xi[u]);                                         //2 IMAD, 6 FADD
+        atomicWarpReduceAndUpdate(vy+idx,fcoeff * yi[u]);                                         //2 IMAD, 6 FADD
+        atomicWarpReduceAndUpdate(vz+idx,fcoeff * zi[u]);                                         //2 IMAD, 6 FADD
+      }
+    }
+}
+
+  //Tell the compiler how many blocks we expect to be active.
+  //This gives the compiler a better idea of how many registers to use.  The second number is tunable.
+//
+// Kernel: accumulates, for each of the `count` target particles (xx,yy,zz),
+// the contribution of all `count1` source particles (xx1,yy1,zz1,mass1) and
+// applies it, scaled by fcoeff, to vx,vy,vz.
+//
+// Targets are processed in tiles of TILEY along the y dimension of the launch
+// and sources in tiles of TILEX along the x dimension; leftovers that do not
+// fill a whole tile are handled by the size-1 tile loops. Each loop advances
+// by the total number of threads in its dimension, so any grid size covers
+// all elements.
+//
+// `mass` (target masses) is accepted but never read in this kernel.
+__launch_bounds__(BLOCKX*BLOCKY,7)
+__global__
+void Step10_cuda_kernel(int count, int count1,
+                        const POSVEL_T*  xx, const POSVEL_T*  yy,
+                        const POSVEL_T*  zz, const POSVEL_T*  mass,
+                        const POSVEL_T*  xx1, const POSVEL_T*  yy1,
+                        const POSVEL_T*  zz1, const POSVEL_T*  mass1,
+                        POSVEL_T*  vx, POSVEL_T*  vy,
+                        POSVEL_T*  vz, POSVEL_T fsrrmax2, POSVEL_T mp_rsm2, POSVEL_T fcoeff)
+{
+  // Polynomial coefficients forwarded to computeForces.
+  const POSVEL_T ma0 = 0.269327, ma1 = -0.0750978, ma2 = 0.0114808, ma3 = -0.00109313, ma4 = 0.0000605491, ma5 = -0.00000147177;
+
+  //Register arrays to hold tiles of data
+  POSVEL_T xxi[TILEY];
+  POSVEL_T yyi[TILEY];
+  POSVEL_T zzi[TILEY];
+  POSVEL_T xxj[TILEX];
+  POSVEL_T yyj[TILEX];
+  POSVEL_T zzj[TILEX];
+  POSVEL_T massj[TILEX];
+
+  //loop over interior region and calculate forces.
+  //for each tile i
+ for(int i=hipBlockIdx_y*hipBlockDim_y+hipThreadIdx_y;i<count/TILEY;i+=hipBlockDim_y*hipGridDim_y)                                //1 ISETP
+  {
+    // Per-thread partial sums for the TILEY targets of this tile.
+    POSVEL_T xi[TILEY]={0};                                                                                //TILEY MOV
+    POSVEL_T yi[TILEY]={0};                                                                                //TILEY MOV
+    POSVEL_T zi[TILEY]={0};                                                                                //TILEY MOV
+
+    //load tile i,mass and bounds check are not needed
+    loadTile<false,false,TILEY>(i,count,xx,yy,zz,NULL,xxi,yyi,zzi,NULL);
+
+    //for each tile j
+    for (int j=hipBlockIdx_x*hipBlockDim_x+hipThreadIdx_x;j<count1/TILEX;j+=hipBlockDim_x*hipGridDim_x)                                  //1 ISETP
+    {
+      //load tile j, bounds check is not needed
+      loadTile<false,true,TILEX>(j,count1,xx1,yy1,zz1,mass1,xxj,yyj,zzj,massj);
+
+      //compute forces between tile i and tile j
+      computeForces<TILEX,TILEY>(xxi,yyi,zzi,xxj,yyj,zzj,massj,xi,yi,zi,ma0,ma1,ma2,ma3,ma4,ma5,mp_rsm2,fsrrmax2);
+    }
+
+    //process remaining elements at the end, use TILEX=1
+    // Here j is an element index (tile size 1), starting after the last full tile.
+    for (int j=count1/TILEX*TILEX+hipBlockIdx_x*hipBlockDim_x+hipThreadIdx_x;j<count1;j+=hipBlockDim_x*hipGridDim_x)                                  //1 ISETP
+    {
+      //load tile j, bounds check is needed, mass is needed
+      loadTile<true,true,1>(j,count1,xx1,yy1,zz1,mass1,xxj,yyj,zzj,massj);
+
+      //compute forces between tile i and tile j
+      computeForces<1,TILEY>(xxi,yyi,zzi,xxj,yyj,zzj,massj,xi,yi,zi,ma0,ma1,ma2,ma3,ma4,ma5,mp_rsm2,fsrrmax2);
+    }
+
+    //apply the force we have calculated above, bounds check is not needed
+    applyForce<false,TILEY>(i,count,fcoeff,xi,yi,zi,vx,vy,vz);
+  }
+
+  //At this point we have computed almost all interactions.
+  //However we still need to add contributions for particles at the end
+
+#if 1
+  //process remaining elements in set TILEY=1
+  //for each tile i
+  // Here i is an element index (tile size 1), starting after the last full tile.
+  for(int i=count/TILEY*TILEY+hipBlockIdx_y*hipBlockDim_y+hipThreadIdx_y;i<count;i+=hipBlockDim_y*hipGridDim_y)                             //1 ISETP
+  {
+    POSVEL_T xi[1]={0};                                                                                //1 MOV
+    POSVEL_T yi[1]={0};                                                                                //1 MOV
+    POSVEL_T zi[1]={0};                                                                                //1 MOV
+
+    //load xxi, yyi, zzi tiles, mass is not needed, bounds check is needed
+    loadTile<true,false,1>(i,count,xx,yy,zz,NULL,xxi,yyi,zzi,NULL);
+
+    //for each tile j
+    for (int j=hipBlockIdx_x*hipBlockDim_x+hipThreadIdx_x;j<count1/TILEX;j+=hipBlockDim_x*hipGridDim_x)                                  //1 ISETP
+    {
+      //load tile j, bounds check is not needed
+      loadTile<false,true,TILEX>(j,count1,xx1,yy1,zz1,mass1,xxj,yyj,zzj,massj);
+
+      //compute forces between tile i and tile j
+      computeForces<TILEX,1>(xxi,yyi,zzi,xxj,yyj,zzj,massj,xi,yi,zi,ma0,ma1,ma2,ma3,ma4,ma5,mp_rsm2,fsrrmax2);
+    }
+
+    //process remaining elements at the end, use TILEX=1
+    for (int j=count1/TILEX*TILEX+hipBlockIdx_x*hipBlockDim_x+hipThreadIdx_x;j<count1;j+=hipBlockDim_x*hipGridDim_x)                                  //1 ISETP
+    {
+      //load tile j, bounds check is needed, mass is needed
+      loadTile<true,true,1>(j,count1,xx1,yy1,zz1,mass1,xxj,yyj,zzj,massj);
+
+      //compute forces between tile i and tile j
+      computeForces<1,1>(xxi,yyi,zzi,xxj,yyj,zzj,massj,xi,yi,zi,ma0,ma1,ma2,ma3,ma4,ma5,mp_rsm2,fsrrmax2);
+    }
+
+    // Bounds-checked apply for the single leftover element.
+    applyForce<true,1>(i,count,fcoeff,xi,yi,zi,vx,vy,vz);
+  }
+#endif
+
+}
+
+
+
+#endif
+
+#ifdef __bgq__
+// BG/Q-only routine defined outside this file; nbody1 calls it once per target
+// particle and reads the result through ax/ay/az.
+// The return type is spelled out because C++ has no implicit int; `int` is
+// what a compiler that accepted the untyped declaration would have assumed.
+// NOTE(review): confirm against the routine's actual definition.
+extern "C" int Step16_int( int count1, float xxi, float yyi, float zzi, float fsrrmax2, float mp_rsm2, const float *xx1, const float *yy1, const float *zz1,const  float *mass1, float *ax, float *ay, float *az );
+#endif
+
+// Direct particle-particle interaction between `count` target particles and
+// `count1` source particles; the result is accumulated into vx/vy/vz.
+//
+// count, count1        number of targets / sources
+// xx, yy, zz, mass     target positions and masses
+// xx1, yy1, zz1, mass1 source positions and masses
+// vx, vy, vz           target velocities, updated in place
+// fl                   force law; only used by the plain CPU path
+// fcoeff               scale factor applied to every update
+// fsrrmax              interaction cutoff radius (squared internally)
+// rsm                  squared internally and passed on as mp_rsm2
+// stream               (HIP builds only) stream the kernel is launched on
+//
+// Three build flavours: __bgq__ calls Step16_int per target, __HIPCC__
+// launches Step10_cuda_kernel asynchronously on `stream` with all array
+// pointers handed straight to the kernel, and the fallback runs a double loop
+// on the host.
+static inline void nbody1(ID_T count, ID_T count1, const POSVEL_T*  xx, const POSVEL_T*  yy,
+                         const POSVEL_T*  zz, const POSVEL_T*  mass,
+                         const POSVEL_T*  xx1, const POSVEL_T*  yy1,
+                         const POSVEL_T*  zz1, const POSVEL_T*  mass1,
+                         POSVEL_T*  vx, POSVEL_T*  vy, POSVEL_T*  vz,
+                         ForceLaw *fl, float fcoeff, float fsrrmax, float rsm
+#ifdef __HIPCC__
+                         , hipStream_t stream
+#endif
+                         )
+{
+  // Nothing to compute without targets or sources. The host loops already do
+  // nothing in that case, but the HIP path would otherwise request a grid
+  // with a zero dimension, which is an invalid launch configuration.
+  if (count <= 0 || count1 <= 0) {
+    return;
+  }
+
+  POSVEL_T fsrrmax2 = fsrrmax*fsrrmax;
+  POSVEL_T rsm2 = rsm*rsm;
+
+#ifdef __bgq__
+  float ax = 0.0f, ay = 0.0f, az = 0.0f;
+
+  for (int i = 0; i < count; ++i)
+  {
+
+    Step16_int ( count1, xx[i],yy[i],zz[i], fsrrmax2,rsm2,xx1,yy1,zz1,mass1, &ax, &ay, &az );
+
+    vx[i] = vx[i] + ax * fcoeff;
+    vy[i] = vy[i] + ay * fcoeff;
+    vz[i] = vz[i] + az * fcoeff;
+  }
+
+#else
+
+#ifdef __HIPCC__
+  // One thread per source (x) and per target (y), rounded up to whole
+  // blocks and capped at MAXX x MAXY blocks; the kernel loops over whatever
+  // the capped grid does not cover directly.
+  dim3 threads(BLOCKX,BLOCKY);
+  int blocksX=(count1+threads.x-1)/threads.x;
+  int blocksY=(count+threads.y-1)/threads.y;
+  dim3 blocks( min(blocksX,MAXX), min(blocksY,MAXY));
+
+  //call kernel
+
+  cudaCheckError();
+#if 0
+  checkCudaPtr(xx,"xx");
+  checkCudaPtr(yy,"yy");
+  checkCudaPtr(zz,"zz");
+  checkCudaPtr(mass,"mass");
+  checkCudaPtr(xx1,"xx1");
+  checkCudaPtr(yy1,"yy1");
+  checkCudaPtr(zz1,"zz1");
+  checkCudaPtr(mass1,"mass1");
+  checkCudaPtr(vx,"vx");
+  checkCudaPtr(vy,"vy");
+  checkCudaPtr(vz,"vz");
+#endif
+  hipLaunchKernelGGL(Step10_cuda_kernel, dim3(blocks), dim3(threads), 0, stream, count,count1,xx,yy,zz,mass,xx1,yy1,zz1,mass1, vx, vy, vz, fsrrmax2, rsm2, fcoeff);
+  cudaCheckError();
+
+#else
+
+  for (int i = 0; i < count; ++i)
+    for (int j = 0; j < count1; ++j) {
+      POSVEL_T dx = xx1[j] - xx[i];
+      POSVEL_T dy = yy1[j] - yy[i];
+      POSVEL_T dz = zz1[j] - zz[i];
+      POSVEL_T dist2 = dx*dx + dy*dy + dz*dz;
+      POSVEL_T f_over_r = mass[i]*mass1[j] * fl->f_over_r(dist2);
+
+      // 1 inside the cutoff, 0 outside.
+      POSVEL_T updateq = 1.0;
+      updateq *= (dist2 < fsrrmax2);
+
+      vx[i] += updateq*fcoeff*f_over_r*dx;
+      vy[i] += updateq*fcoeff*f_over_r*dy;
+      vz[i] += updateq*fcoeff*f_over_r*dz;
+    }
+#endif //end __HIPCC__
+
+
+#endif //end __bgq__
+}
+
+
+// Stable-prefix partition of n particles about the pivot pv.
+//
+// Every particle with xx[i] < pv is moved into the first `is` slots of all
+// ten parallel arrays (the same permutation is applied to each array); the
+// number of such particles, `is`, is returned. Only xx is compared, so the
+// caller chooses the split axis by which array it passes as xx.
+//
+// n                number of particles in every array
+// xx, yy, zz       positions (xx is the split axis)
+// vx, vy, vz       velocities
+// mass, phi        per-particle scalars
+// id, mask         per-particle identifiers and masks
+// pv               pivot value
+static inline ID_T partition(ID_T n,
+                             POSVEL_T*  xx, POSVEL_T*  yy, POSVEL_T*  zz,
+                             POSVEL_T*  vx, POSVEL_T*  vy, POSVEL_T*  vz,
+                             POSVEL_T*  mass, POSVEL_T*  phi,
+                             ID_T*  id, MASK_T*  mask, POSVEL_T pv
+                            )
+{
+  if (n <= 0) {
+    return 0;
+  }
+
+  // Indices of the particles below the pivot, in ascending order. Kept on the
+  // heap: n can be the full particle count, far too large for a stack array.
+  std::vector<int> idx(n);
+
+  int32_t is = 0;
+  for (int32_t i = 0; i < n; ++i)
+  {
+    if (xx[i] < pv)
+    {
+      idx[is] = i;
+      ++is;
+    }
+  }
+
+  // Because idx is ascending, idx[j] >= j, so slot idx[j] has not been
+  // touched by an earlier swap and slots [0, j) already hold selected
+  // particles. Swapping slot j with slot idx[j] therefore extends the prefix.
+  // std::swap uses each array's own element type, so nothing is narrowed.
+#pragma unroll (4)
+  for (int32_t j = 0; j < is; ++j)
+  {
+      const int32_t i = idx[j];
+
+      std::swap(mass[i], mass[j]);
+      std::swap(phi [i], phi [j]);
+      std::swap(mask[i], mask[j]);
+      std::swap(id  [i], id  [j]);
+  }
+
+#pragma unroll (4)
+  for (int32_t j = 0; j < is; ++j)
+  {
+      const int32_t i = idx[j];
+
+      std::swap(xx[i], xx[j]);
+      std::swap(yy[i], yy[j]);
+      std::swap(zz[i], zz[j]);
+      std::swap(vx[i], vx[j]);
+      std::swap(vy[i], vy[j]);
+      std::swap(vz[i], vz[j]);
+  }
+
+  return is;
+}
+
+// Splits node tl along axis d into the pre-allocated children tlcl (left) and
+// tlcr (right), then recurses into each non-empty child.
+//
+// d     split axis: 0 = x, 1 = y, anything else = z
+// tl    index of the node being split
+// tlcl  index reserved for the left child
+// tlcr  index reserved for the right child
+template <int TDPTS>
+void RCBForceTree<TDPTS>::createRCBForceSubtree(int d, ID_T tl, ID_T tlcl, ID_T tlcr)
+{
+  // Cycle the coordinate arrays so the split axis is handed to partition()
+  // first; it is the only one partition() compares against the pivot.
+  POSVEL_T *first, *second, *third;
+  if (d == 0) {
+    first = xx;
+    second = yy;
+    third = zz;
+  } else if (d == 1) {
+    first = yy;
+    second = zz;
+    third = xx;
+  } else {
+    first = zz;
+    second = xx;
+    third = yy;
+  }
+
+  // Pivot: the node's center of mass along d (the geometric midpoint is
+  // available but switched off).
+  const bool geoSplit = false;
+  POSVEL_T split;
+  if (geoSplit) {
+    split = (tree[tl].xmax[d]+tree[tl].xmin[d])/2;
+  } else {
+    split = tree[tl].xc[d];
+  }
+
+  const ID_T base = tree[tl].offset;
+  ID_T is = ::partition(tree[tl].count, first + base, second + base, third + base,
+                        vx + base, vy + base, vz + base,
+                        mass + base, phi + base,
+                        id + base, mask + base, split
+                       );
+
+  // Everything landed on one side: leave the node without children.
+  if (is == 0 || is == tree[tl].count) {
+    return;
+  }
+
+  tree[tlcl].count = is;
+  tree[tlcr].count = tree[tl].count - tree[tlcl].count;
+
+  // Left child: the leading `is` particles, bounded above by the pivot.
+  // Nodes are addressed by index throughout because the recursion may grow
+  // `tree`.
+  if (tree[tlcl].count > 0) {
+    tree[tl].cl = tlcl;
+    tree[tlcl].offset = tree[tl].offset;
+    tree[tlcl].xmax[d] = split;
+
+    createRCBForceTreeInParallel(tlcl);
+  }
+
+  // Right child: the remaining particles, bounded below by the pivot.
+  if (tree[tlcr].count > 0) {
+    tree[tl].cr = tlcr;
+    tree[tlcr].offset = tree[tl].offset + tree[tlcl].count;
+    tree[tlcr].xmin[d] = split;
+
+    createRCBForceTreeInParallel(tlcr);
+  }
+}
+
+// Recursive tree construction for node tl; basically the algorithm from
+// (Gafton and Rosswog, 2011).
+//
+// Nodes with at most nDirect particles become leaves. Larger nodes get two
+// child slots appended to `tree`, are split along their longest edge at the
+// center of mass, and finally have their pseudoparticle masses assembled
+// from the children.
+template <int TDPTS>
+void RCBForceTree<TDPTS>::createRCBForceTreeInParallel(ID_T tl)
+{
+  const ID_T cnt = tree[tl].count;
+  const ID_T off = tree[tl].offset;
+
+  // Center of mass of this node's particles; also recomputes the bounds.
+  ::cm(cnt, xx + off, yy + off, zz + off, mass + off,
+       tree[tl].xmin, tree[tl].xmax, tree[tl].xc);
+
+  if (cnt <= nDirect) {
+    // Leaf: set up the pseudoparticles directly from the particles.
+    tree[tl].tdr = ppContract*::pptdr(tree[tl].xmin, tree[tl].xmax, tree[tl].xc);
+    memset(tree[tl].ppm, 0, sizeof(POSVEL_T)*TDPTS);
+
+    // With TDPTS particles or fewer the pseudoparticles are never used.
+    if (cnt > TDPTS) {
+      POSVEL_T ppx[TDPTS], ppy[TDPTS], ppz[TDPTS];
+      pppts<TDPTS>(tree[tl].tdr, tree[tl].xc, ppx, ppy, ppz);
+      pp<TDPTS>(cnt, xx + off, yy + off, zz + off, mass + off, tree[tl].xc,
+                ppx, ppy, ppz, tree[tl].ppm, tree[tl].tdr);
+    }
+
+    return;
+  }
+
+  // Append two zeroed slots for the children (left first, right second).
+  const ID_T tlcl = tree.size();
+  const ID_T tlcr = tlcl+1;
+  {
+    size_t grown = tlcr+1;
+    tree.resize(grown);
+  }
+  memset(&tree[tlcl], 0, sizeof(TreeNode)*2);
+
+  // The children start from the parent's bounding box; the split coordinate
+  // is overwritten when the split is performed. The edge lengths of the
+  // parent box are collected in the same pass.
+  POSVEL_T xlen[DIMENSION];
+  for (int a = 0; a < DIMENSION; ++a) {
+    const POSVEL_T lo = tree[tl].xmin[a];
+    const POSVEL_T hi = tree[tl].xmax[a];
+
+    tree[tlcl].xmin[a] = lo;
+    tree[tlcr].xmin[a] = lo;
+    tree[tlcl].xmax[a] = hi;
+    tree[tlcr].xmax[a] = hi;
+
+    xlen[a] = hi - lo;
+  }
+
+  // Split the longest edge: x only if strictly longest, otherwise y if it is
+  // strictly longer than z, otherwise z.
+  int d = 2;
+  if (xlen[0] > xlen[1] && xlen[0] > xlen[2]) {
+    d = 0;
+  } else if (xlen[1] > xlen[2]) {
+    d = 1;
+  }
+
+  createRCBForceSubtree(d, tl, tlcl, tlcr);
+
+  // Assemble this node's pseudoparticles from the children.
+  memset(tree[tl].ppm, 0, sizeof(POSVEL_T)*TDPTS);
+  tree[tl].tdr = ppContract*::pptdr(tree[tl].xmin, tree[tl].xmax, tree[tl].xc);
+
+  POSVEL_T ppx[TDPTS], ppy[TDPTS], ppz[TDPTS];
+  pppts<TDPTS>(tree[tl].tdr, tree[tl].xc, ppx, ppy, ppz);
+
+  const ID_T children[2] = { tlcl, tlcr };
+  for (int k = 0; k < 2; ++k) {
+    const ID_T c = children[k];
+
+    if (tree[c].count > 0) {
+      if (tree[c].count <= TDPTS) {
+        // Few particles: feed the child's particles in directly.
+        ID_T offc = tree[c].offset;
+        pp<TDPTS>(tree[c].count, xx + offc, yy + offc, zz + offc, mass + offc,
+                  tree[tl].xc, ppx, ppy, ppz, tree[tl].ppm, tree[tl].tdr);
+      } else {
+        // Otherwise feed in the child's own pseudoparticles.
+        POSVEL_T ppxc[TDPTS], ppyc[TDPTS], ppzc[TDPTS];
+        pppts<TDPTS>(tree[c].tdr, tree[c].xc, ppxc, ppyc, ppzc);
+        pp<TDPTS>(TDPTS, ppxc, ppyc, ppzc, tree[c].ppm, tree[tl].xc,
+                  ppx, ppy, ppz, tree[tl].ppm, tree[tl].tdr);
+      }
+    }
+  }
+}
+
+// Builds the whole tree: resets `tree` to a single zeroed root node holding
+// all particles and bounded by [minRange, maxRange], then starts the
+// recursive construction with its default argument.
+template <int TDPTS>
+void RCBForceTree<TDPTS>::createRCBForceTree()
+{
+  tree.resize(1);
+
+  // The reference is only used before the recursion starts, i.e. before
+  // `tree` can grow.
+  TreeNode &root = tree[0];
+  memset(&root, 0, sizeof(TreeNode));
+
+  root.offset = 0;
+  root.count = particleCount;
+
+  for (int axis = 0; axis != DIMENSION; ++axis) {
+    root.xmax[axis] = maxRange[axis];
+    root.xmin[axis] = minRange[axis];
+  }
+
+  createRCBForceTreeInParallel();
+}
+
+
+template <int TDPTS>
+void RCBForceTree<TDPTS>::calcInternodeForce(ID_T tl,
+                                            const std::vector<ID_T> &parents) {
+
+  POSVEL_T fsrrmax2 = fsrrmax*fsrrmax;
+  const TreeNode* tree_ = &tree[0];
+
+  int tid = 0;
+
+  std::vector<ID_T> &q = iq[tid];
+  q.clear();
+  q.push_back(0);
+
+
+  POSVEL_T *nx=nx_v+tid*VMAX;
+  POSVEL_T *ny=ny_v+tid*VMAX;
+  POSVEL_T *nz=nz_v+tid*VMAX;
+  POSVEL_T *nm=nm_v+tid*VMAX;
+
+#ifdef __HIPCC__
+  //Adjust pointers to this threads workspace
+  POSVEL_T *d_nx=d_nx_v+tid*VMAX;
+  POSVEL_T *d_ny=d_ny_v+tid*VMAX;
+  POSVEL_T *d_nz=d_nz_v+tid*VMAX;
+  POSVEL_T *d_nm=d_nm_v+tid*VMAX;
+  int size=ALIGNY(nDirect);
+  POSVEL_T *d_xxl=d_xx+tid*size;
+  POSVEL_T *d_yyl=d_yy+tid*size;
+  POSVEL_T *d_zzl=d_zz+tid*size;
+  POSVEL_T *d_massl=d_mass+tid*size;
+  POSVEL_T *d_vxl=d_vx+tid*size;
+  POSVEL_T *d_vyl=d_vy+tid*size;
+  POSVEL_T *d_vzl=d_vz+tid*size;
+
+  hipEvent_t& event=event_v[tid];
+  hipStream_t& stream=stream_v[tid];
+  hipEventSynchronize(event);  //wait for transfers from previous call to finish before overwriting nx,ny,nz,nm
+  cudaCheckError();
+#endif
+
+  // The interaction list.
+  int SIZE = 0; // current size of these arrays
+
+  while (!q.empty()) {
+    ID_T tln = q.back();
+    q.pop_back();
+
+    // We should not interact with our own parents.
+    if (tln < tl) {
+      bool isParent = std::binary_search(parents.begin(), parents.end(), tln);
+      if (isParent) {
+        ID_T tlncr = tree_[tln].cr;
+        ID_T tlncl = tree_[tln].cl;
+
+        if (tlncl != tl && tlncl > 0 && tree_[tlncl].count > 0) {
+          q.push_back(tlncl);
+        }
+        if (tlncr != tl && tlncr > 0 && tree_[tlncr].count > 0) {
+          q.push_back(tlncr);
+        }
+
+        continue;
+      }
+    }
+
+    // Is this node have a small enough opening angle to interact with?
+    POSVEL_T dx = tree_[tln].xc[0] - tree_[tl].xc[0];
+    POSVEL_T dy = tree_[tln].xc[1] - tree_[tl].xc[1];
+    POSVEL_T dz = tree_[tln].xc[2] - tree_[tl].xc[2];
+    POSVEL_T dist2 = dx*dx + dy*dy + dz*dz;
+
+    POSVEL_T sx = tree_[tln].xmax[0]-tree_[tln].xmin[0];
+    POSVEL_T sy = tree_[tln].xmax[1]-tree_[tln].xmin[1];
+    POSVEL_T sz = tree_[tln].xmax[2]-tree_[tln].xmin[2];
+    POSVEL_T l2 = std::min(sx*sx, std::min(sy*sy, sz*sz)); // under-estimate
+
+    POSVEL_T dtt2 = dist2*tanOpeningAngle*tanOpeningAngle;
+    bool looksBig;
+    // l2/dist2 is really tan^2 theta, for small theta, tan(theta) ~ theta
+    if (l2 > dtt2) {
+      // the under-estimate is too big, so this is definitely too big
+      looksBig = true;
+    } else {
+      // There are 8 corner points of the remote node, and the maximum angular
+      // size will be from one of those points to its opposite points. So there
+      // are 8 vector dot products to compute to determine the maximum angular
+      // size at any given reference point. (do we need to do this for each point
+      // in leaf node, or will the c.m. point be sufficient?).
+      looksBig = false;
+      for (int i = 0; i < 2; ++i)
+      for (int j = 0; j < 2; ++j) {
+        POSVEL_T x1 = (i == 0 ? tree_[tln].xmin : tree_[tln].xmax)[0] - tree_[tl].xc[0];
+        POSVEL_T y1 = (j == 0 ? tree_[tln].xmin : tree_[tln].xmax)[1] - tree_[tl].xc[1];
+        POSVEL_T z1 = tree_[tln].xmin[2] - tree_[tl].xc[2];
+
+        POSVEL_T x2 = (i == 0 ? tree_[tln].xmax : tree_[tln].xmin)[0] - tree_[tl].xc[0];
+        POSVEL_T y2 = (j == 0 ? tree_[tln].xmax : tree_[tln].xmin)[1] - tree_[tl].xc[1];
+        POSVEL_T z2 = tree_[tln].xmax[2] - tree_[tl].xc[2];
+
+        const bool useRealOA = false;
+        if (useRealOA) {
+          // |a x b| = a*b*sin(theta)
+          POSVEL_T cx = y1*z2 - z1*y2;
+          POSVEL_T cy = z1*x2 - x1*z2;
+          POSVEL_T cz = x1*y2 - y1*x2;
+          if ((cx*cx + cy*cy + cz*cz) > sinOpeningAngle*sinOpeningAngle*
+                                          (x1*x1 + y1*y1 + z1*z1)*(x2*x2 + y2*y2 + z2*z2)
+             ) {
+            looksBig = true;
+            break;
+          }
+        } else {
+          // Instead of using the real opening angle, use the tan approximation; this is
+          // better than the opening-angle b/c it incorporates depth information.
+          POSVEL_T ddx = x1 - x2, ddy = y1 - y2, ddz = z1 - z2;
+          POSVEL_T dh2 = ddx*ddx + ddy*ddy + ddz*ddz;
+          if (dh2 > dtt2) {
+            looksBig = true;
+            break;
+          }
+        }
+      }
+    }
+
+    if (!looksBig) {
+      if (dist2 > fsrrmax2) {
+        // We could interact with this node, but it is too far away to make
+        // any difference, so it will be skipped, along with all of its
+        // children.
+        continue;
+      }
+
+      // This node has fewer particles than pseudo particles, so just use the
+      // particles that are actually there.
+      if (tree_[tln].count <= TDPTS) {
+        ID_T offn = tree_[tln].offset;
+        ID_T cntn = tree_[tln].count;
+
+        int start = SIZE;
+        SIZE = SIZE + cntn;
+        assert( SIZE < VMAX );
+
+        for ( int i = 0; i < cntn; ++i) {
+          nx[start + i] = xx[offn + i];
+          ny[start + i] = yy[offn + i];
+          nz[start + i] = zz[offn + i];
+          nm[start + i] = mass[offn + i];
+        }
+
+        continue;
+      }
+
+      // Interact the particles in this node with the pseudoparticles of the
+      // other node.
+      int start = SIZE;
+      SIZE = SIZE + TDPTS;
+      assert( SIZE < VMAX );
+
+      pppts<TDPTS>(tree_[tln].tdr, tree_[tln].xc, &nx[start], &ny[start], &nz[start]);
+      for ( int i = 0; i < TDPTS; ++i) {
+        nm[start + i] = tree_[tln].ppm[i];
+      }
+
+      continue;
+    } else if (tree_[tln].cr == 0 && tree_[tln].cl == 0) {
+      // This is a leaf node with which we must interact.
+      ID_T offn = tree_[tln].offset;
+      ID_T cntn = tree_[tln].count;
+
+      int start = SIZE;
+      SIZE = SIZE + cntn;
+      assert( SIZE < VMAX );
+
+      for ( int i = 0; i < cntn; ++i) {
+        nx[start + i] = xx[offn + i];
+        ny[start + i] = yy[offn + i];
+        nz[start + i] = zz[offn + i];
+        nm[start + i] = mass[offn + i];
+      }
+
+      continue;
+    }
+
+    // This other node is not a leaf, but has too large an opening angle
+    // for an approx. interaction: queue its children.
+
+    ID_T tlncr = tree_[tln].cr;
+    ID_T tlncl = tree_[tln].cl;
+
+    if (tlncl > 0 && tree_[tlncl].count > 0) {
+      bool close = true;
+      for (int i = 0; i < DIMENSION; ++i) {
+        POSVEL_T dist = 0;
+        if (tree_[tl].xmax[i] < tree_[tlncl].xmin[i]) {
+          dist = tree_[tlncl].xmin[i] - tree_[tl].xmax[i];
+        } else if (tree_[tl].xmin[i] > tree_[tlncl].xmax[i]) {
+          dist = tree_[tl].xmin[i] - tree_[tlncl].xmax[i];
+        }
+
+        if (dist > fsrrmax) {
+          close = false;
+          break;
+        }
+      }
+
+      if (close) q.push_back(tlncl);
+    }
+    if (tlncr > 0 && tree_[tlncr].count > 0) {
+      bool close = true;
+      for (int i = 0; i < DIMENSION; ++i) {
+        POSVEL_T dist = 0;
+        if (tree_[tl].xmax[i] < tree_[tlncr].xmin[i]) {
+          dist = tree_[tlncr].xmin[i] - tree_[tl].xmax[i];
+        } else if (tree_[tl].xmin[i] > tree_[tlncr].xmax[i]) {
+          dist = tree_[tl].xmin[i] - tree_[tlncr].xmax[i];
+        }
+
+        if (dist > fsrrmax) {
+          close = false;
+          break;
+        }
+      }
+
+      if (close) q.push_back(tlncr);
+    }
+  }
+
+  ID_T off = tree_[tl].offset;
+  ID_T cnt = tree_[tl].count;
+
+  // Add self interactions...
+  int start = SIZE;
+  SIZE = SIZE + cnt;
+  assert( SIZE < VMAX );
+
+  for ( int i = 0; i < cnt; ++i) {
+    nx[start + i] = xx[off + i];
+    ny[start + i] = yy[off + i];
+    nz[start + i] = zz[off + i];
+    nm[start + i] = mass[off + i];
+  }
+
+#ifdef __HIPCC__
+  hipMemcpyAsync(d_nx,nx,sizeof(POSVEL_T)*SIZE,hipMemcpyHostToDevice,stream);
+  hipMemcpyAsync(d_ny,ny,sizeof(POSVEL_T)*SIZE,hipMemcpyHostToDevice,stream);
+  hipMemcpyAsync(d_nz,nz,sizeof(POSVEL_T)*SIZE,hipMemcpyHostToDevice,stream);
+  hipMemcpyAsync(d_nm,nm,sizeof(POSVEL_T)*SIZE,hipMemcpyHostToDevice,stream);
+  hipEventRecord(event,stream);  //mark when transfers have finished
+  cudaCheckError();
+  hipDeviceSynchronize();
+
+
+  hipMemcpyAsync(d_xxl,xx+off,sizeof(POSVEL_T)*cnt,hipMemcpyHostToDevice,stream);
+  hipMemcpyAsync(d_yyl,yy+off,sizeof(POSVEL_T)*cnt,hipMemcpyHostToDevice,stream);
+  hipMemcpyAsync(d_zzl,zz+off,sizeof(POSVEL_T)*cnt,hipMemcpyHostToDevice,stream);
+  hipMemcpyAsync(d_massl,mass+off,sizeof(POSVEL_T)*cnt,hipMemcpyHostToDevice,stream);
+  hipMemcpyAsync(d_vxl,vx+off,sizeof(POSVEL_T)*cnt,hipMemcpyHostToDevice,stream);
+  hipMemcpyAsync(d_vyl,vy+off,sizeof(POSVEL_T)*cnt,hipMemcpyHostToDevice,stream);
+  hipMemcpyAsync(d_vzl,vz+off,sizeof(POSVEL_T)*cnt,hipMemcpyHostToDevice,stream);
+  cudaCheckError();
+  hipDeviceSynchronize();
+#endif
+
+  // Process the interaction list...
+#ifdef __HIPCC__
+  ::nbody1(cnt, SIZE, d_xxl, d_yyl, d_zzl, d_massl, d_nx, d_ny, d_nz, d_nm, d_vxl, d_vyl, d_vzl, m_fl, m_fcoeff, fsrrmax, rsm, stream);
+  hipDeviceSynchronize();
+#else
+  ::nbody1(cnt, SIZE, xx + off, yy + off, zz + off, mass + off, nx, ny, nz, nm, vx + off, vy + off, vz + off, m_fl, m_fcoeff, fsrrmax, rsm);
+#endif
+
+#ifdef __HIPCC__
+  //transfer up vx vy vz
+  hipMemcpyAsync(vx+off,d_vxl,sizeof(POSVEL_T)*cnt,hipMemcpyDeviceToHost,stream);
+  hipMemcpyAsync(vy+off,d_vyl,sizeof(POSVEL_T)*cnt,hipMemcpyDeviceToHost,stream);
+  hipMemcpyAsync(vz+off,d_vzl,sizeof(POSVEL_T)*cnt,hipMemcpyDeviceToHost,stream);
+  cudaCheckError();
+  hipDeviceSynchronize();
+#endif
+
+}
+
+// Depth-first walk over the tree using an explicit stack. Each leaf whose
+// extent reaches into the force-update range is handed to
+// calcInternodeForce() together with the stack of interior nodes that are
+// currently open above it.
+template <int TDPTS>
+void RCBForceTree<TDPTS>::calcInternodeForces()
+{
+  std::vector<ID_T> open;            // interior nodes entered but not yet finished
+  std::vector<ID_T> todo(1, 0);      // traversal stack, seeded with node 0
+
+  while (!todo.empty()) {
+    const ID_T node = todo.back();
+    const bool isLeaf = (tree[node].cr == 0 && tree[node].cl == 0);
+    if (!isLeaf && !open.empty() && open.back() == node) {
+      // Second visit: every child has been handled, so close this node.
+      open.pop_back();
+      todo.pop_back();
+    } else if (!isLeaf) {
+      // First visit: schedule the children (left pushed before right).
+      if (tree[node].cl > 0) todo.push_back(tree[node].cl);
+      if (tree[node].cr > 0) todo.push_back(tree[node].cr);
+      open.push_back(node);
+    } else {
+      todo.pop_back();
+
+      // In every dimension at least one face must lie strictly inside the range.
+      bool inside = true;
+      for (int d = 0; d < DIMENSION; ++d) {
+        const bool hiIn = tree[node].xmax[d] < maxForceRange[d] && tree[node].xmax[d] > minForceRange[d];
+        const bool loIn = tree[node].xmin[d] < maxForceRange[d] && tree[node].xmin[d] > minForceRange[d];
+        if (!(hiIn || loIn)) inside = false;
+      }
+
+      if (inside) calcInternodeForce(node, open);
+    }
+  }
+}
+
+// Explicit instantiation of the class for the two pseudo-particle counts defined in RCBForceTree.h.
+template class RCBForceTree<QUADRUPOLE_TDPTS>;
+template class RCBForceTree<MONOPOLE_TDPTS>;
+
diff --git a/src/halo-finder/src/RCBForceTree.h b/src/halo-finder/src/RCBForceTree.h
new file mode 100644
index 0000000..8de7773
--- /dev/null
+++ b/src/halo-finder/src/RCBForceTree.h
@@ -0,0 +1,226 @@
+/*=========================================================================
+                                                                                
+Copyright (c) 2007, Los Alamos National Security, LLC
+
+All rights reserved.
+
+Copyright 2007. Los Alamos National Security, LLC. 
+This software was produced under U.S. Government contract DE-AC52-06NA25396 
+for Los Alamos National Laboratory (LANL), which is operated by 
+Los Alamos National Security, LLC for the U.S. Department of Energy. 
+The U.S. Government has rights to use, reproduce, and distribute this software. 
+NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,
+EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  
+If software is modified to produce derivative works, such modified software 
+should be clearly marked, so as not to confuse it with the version available 
+from LANL.
+ 
+Additionally, redistribution and use in source and binary forms, with or 
+without modification, are permitted provided that the following conditions 
+are met:
+-   Redistributions of source code must retain the above copyright notice, 
+    this list of conditions and the following disclaimer. 
+-   Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution. 
+-   Neither the name of Los Alamos National Security, LLC, Los Alamos National
+    Laboratory, LANL, the U.S. Government, nor the names of its contributors
+    may be used to endorse or promote products derived from this software 
+    without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR 
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+                                                                                
+=========================================================================*/
+
+/*=========================================================================
+
+Copyright (c) 2011-2012 Argonne National Laboratory
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+=========================================================================*/
+
+#ifndef RCBForceTree_h
+#define RCBForceTree_h
+
+#include "BasicDefinition.h"
+#include "ForceLaw.h"
+#include "bigchunk.h"
+
+#include <string>
+#include <vector>
+#include <algorithm>
+#include "hip/hip_runtime.h"
+
+// The number of points used for the pseudo-particle t-design (the TDPTS argument of the typedefs below).
+#define QUADRUPOLE_TDPTS 12 // 14
+#define MONOPOLE_TDPTS   1
+
+template <int TDPTS>   // TDPTS: number of pseudo-particle masses kept per tree node (TreeNode::ppm)
+class RCBForceTree
+{
+public:
+  RCBForceTree(
+              POSVEL_T* minLoc,       // Bounding box of halo
+              POSVEL_T* maxLoc,       // Bounding box of halo
+              POSVEL_T* minForceLoc,  // Bounding box for force updates
+              POSVEL_T* maxForceLoc,  // Bounding box for force updates
+              ID_T count,             // Number of particles in halo
+              POSVEL_T* xLoc,         // Locations of every particle
+              POSVEL_T* yLoc,
+              POSVEL_T* zLoc,
+              POSVEL_T* xVel,         // Velocities of every particle
+              POSVEL_T* yVel,
+              POSVEL_T* zVel,
+              POSVEL_T* mass,         // Mass of each particle
+              POSVEL_T* phiLoc,
+              ID_T *idLoc,
+              MASK_T *maskLoc,
+              POSVEL_T avgMass,       // Average mass for estimation
+              POSVEL_T fsm,
+              POSVEL_T r,             // rsm
+              POSVEL_T oa,
+              ID_T nd = 1,            // The number of particles below which
+                                      // to do the direct N^2 calculation
+              ID_T ds = 1,            // The "safety" factor to add to the
+                                      // estimated maximum depth
+              ID_T tmin = 128,        // Min. number of particles to build
+                                      // using a new task
+              ForceLaw *fl = 0,
+              float fcoeff = 0.0,
+              POSVEL_T ppc = 0.9);
+
+  ~RCBForceTree();
+
+  void printStats(double buildTime);
+
+protected:
+  struct TreeNode
+  {
+    ID_T count;                       // The number of particles in this node
+    ID_T offset;                      // The offset into the particle arrays at
+                                      // which data for this tree node starts
+
+    ID_T cl, cr;                      // Left and right children
+
+    POSVEL_T ppm[TDPTS];              // The pseudo-particle masses
+    POSVEL_T tdr;                     // The radius of the t-design sphere on
+                                      // which the pseudo-particles sit
+
+    POSVEL_T xmin[DIMENSION],
+             xmax[DIMENSION],
+             xc[DIMENSION];           // The bounding box of this node and its
+                                      // center position.
+  };
+
+protected:
+  void createRCBForceSubtree(int d, ID_T tl, ID_T tlcl, ID_T tlcr);
+  void createRCBForceTreeInParallel(ID_T tl = 0);
+  void createRCBForceTree();
+
+  void calcInternodeForce(ID_T tl, const std::vector<ID_T> &parents);
+  void calcInternodeForces();
+
+protected:
+  ID_T   particleCount;         // Total particles
+
+  POSVEL_T fsrrmax, rsm;
+  POSVEL_T particleMass;        // Average particle mass
+  POSVEL_T sinOpeningAngle,     // Criteria for opening node to lower level
+           tanOpeningAngle;
+  POSVEL_T ppContract;          // The pseudoparticle contraction factor
+
+  POSVEL_T*  xx;      // X location for particles on this processor
+  POSVEL_T*  yy;      // Y location for particles on this processor
+  POSVEL_T*  zz;      // Z location for particles on this processor
+  POSVEL_T*  vx;      // X velocity for particles on this processor
+  POSVEL_T*  vy;      // Y velocity for particles on this processor
+  POSVEL_T*  vz;      // Z velocity for particles on this processor
+  POSVEL_T*  mass;    // Mass for particles on this processor
+  POSVEL_T*  nx_v;    // X interaction list for each thread
+  POSVEL_T*  ny_v;    // Y interaction list for each thread
+  POSVEL_T*  nz_v;    // Z interaction list for each thread
+  POSVEL_T*  nm_v;    // Mass interaction list for each thread
+
+#ifdef __HIPCC__  // These members change the class layout: every file including this header must agree on __HIPCC__.
+  POSVEL_T* d_xx;      // X location for particles on this processor
+  POSVEL_T* d_yy;      // Y location for particles on this processor
+  POSVEL_T* d_zz;      // Z location for particles on this processor
+  POSVEL_T* d_vx;      // X velocity for particles on this processor
+  POSVEL_T* d_vy;      // Y velocity for particles on this processor
+  POSVEL_T* d_vz;      // Z velocity for particles on this processor
+  POSVEL_T* d_mass;    // Mass for particles on this processor
+  POSVEL_T* d_nx_v;    // X interaction list for each thread
+  POSVEL_T* d_ny_v;    // Y interaction list for each thread
+  POSVEL_T* d_nz_v;    // Z interaction list for each thread
+  POSVEL_T* d_nm_v;    // Mass interaction list for each thread
+
+  hipEvent_t* event_v;   // event for synchronization for each thread
+  hipStream_t* stream_v; // stream for each thread
+#endif
+
+  POSVEL_T*  phi;
+  ID_T*      id;
+  MASK_T*    mask;
+
+  POSVEL_T minRange[DIMENSION]; // Physical range of data
+  POSVEL_T maxRange[DIMENSION]; // Physical range of data
+  POSVEL_T minForceRange[DIMENSION]; // Physical range of data for force updates
+  POSVEL_T maxForceRange[DIMENSION]; // Physical range of data for force updates
+
+  int numThreads;
+
+  ID_T nDirect;
+  ID_T depthSafety;
+  ID_T taskPartMin; // Min number of particles for which to launch a build task
+
+  std::vector<TreeNode, bigchunk_allocator<TreeNode> > tree; // Internal nodes of tree
+
+  bool m_own_fl;   // true when m_fl must be deleted by this object
+  ForceLaw *m_fl;
+  float m_fcoeff;
+
+  // Interaction lists (one per thread); std:: is spelled out so the header needs no using-directive
+  std::vector<std::vector<POSVEL_T> > inx, iny, inz, inm;
+  std::vector<std::vector<ID_T> > iq; // The interaction queue
+
+#ifdef __bgq__BROKEN
+  std::vector<std::vector<ID_T> > part_idx;
+#endif
+};
+
+typedef RCBForceTree<QUADRUPOLE_TDPTS> RCBQuadrupoleForceTree; // QUADRUPOLE_TDPTS pseudo-particles per node
+typedef RCBForceTree<MONOPOLE_TDPTS>   RCBMonopoleForceTree;   // MONOPOLE_TDPTS pseudo-particle per node
+
+#endif // RCBForceTree_h
+
diff --git a/src/halo-finder/src/RCOForceTree.cxx b/src/halo-finder/src/RCOForceTree.cxx
new file mode 100644
index 0000000..587cde9
--- /dev/null
+++ b/src/halo-finder/src/RCOForceTree.cxx
@@ -0,0 +1,1001 @@
+/*=========================================================================
+                                                                                
+Copyright (c) 2007, Los Alamos National Security, LLC
+
+All rights reserved.
+
+Copyright 2007. Los Alamos National Security, LLC. 
+This software was produced under U.S. Government contract DE-AC52-06NA25396 
+for Los Alamos National Laboratory (LANL), which is operated by 
+Los Alamos National Security, LLC for the U.S. Department of Energy. 
+The U.S. Government has rights to use, reproduce, and distribute this software. 
+NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,
+EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  
+If software is modified to produce derivative works, such modified software 
+should be clearly marked, so as not to confuse it with the version available 
+from LANL.
+ 
+Additionally, redistribution and use in source and binary forms, with or 
+without modification, are permitted provided that the following conditions 
+are met:
+-   Redistributions of source code must retain the above copyright notice, 
+    this list of conditions and the following disclaimer. 
+-   Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution. 
+-   Neither the name of Los Alamos National Security, LLC, Los Alamos National
+    Laboratory, LANL, the U.S. Government, nor the names of its contributors
+    may be used to endorse or promote products derived from this software 
+    without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR 
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+                                                                                
+=========================================================================*/
+
+/*=========================================================================
+
+Copyright (c) 2011-2012 Argonne National Laboratory
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+=========================================================================*/
+
+#include "Timings.h"
+#include "RCOForceTree.h"
+
+#include <cstring>
+#include <cstdio>
+using namespace std;
+
+// References:
+// Emanuel Gafton and Stephan Rosswog. A fast recursive coordinate bisection tree for
+// neighbour search and gravity. Mon. Not. R. Astron. Soc. to appear, 2011.
+// http://arxiv.org/abs/1108.0028v1
+//
+// Atsushi Kawai, Junichiro Makino and Toshikazu Ebisuzaki.
+// Performance Analysis of High-Accuracy Tree Code Based on the Pseudoparticle
+// Multipole Method. The Astrophysical Journal Supplement Series, 151:13-33, 2004.
+// Related: http://arxiv.org/abs/astro-ph/0012041v1
+//
+// R. H. Hardin and N. J. Sloane
+// New Spherical 4-Designs. Discrete Math, 106/107 255-264, 1992.
+//
+// The library of spherical designs:
+// http://www2.research.att.com/~njas/sphdesigns/
+namespace { // file-local coordinate tables; pppts() below scales them by a radius and adds the node center
+template <int TDPTS>
+struct sphdesign {}; // primary template is empty: only the point counts declared below are usable
+
+#define DECLARE_SPHDESIGN(TDPTS) \
+template <> \
+struct sphdesign<TDPTS> \
+{ \
+  static const POSVEL_T x[TDPTS]; \
+  static const POSVEL_T y[TDPTS]; \
+  static const POSVEL_T z[TDPTS]; \
+}; \
+/**/
+
+DECLARE_SPHDESIGN(1)
+DECLARE_SPHDESIGN(2)
+DECLARE_SPHDESIGN(3)
+DECLARE_SPHDESIGN(4)
+DECLARE_SPHDESIGN(6)
+DECLARE_SPHDESIGN(12)
+DECLARE_SPHDESIGN(14)
+
+#undef DECLARE_SPHDESIGN
+
+/* This is not a t-design: its single point has zero offset, which puts the
+   monopole moment at the center of mass. */
+const POSVEL_T sphdesign<1>::x[] = {
+  0
+};
+
+const POSVEL_T sphdesign<1>::y[] = {
+  0
+};
+
+const POSVEL_T sphdesign<1>::z[] = {
+  0
+};
+
+const POSVEL_T sphdesign<2>::x[] = { // two antipodal points on the x axis
+  1.0,
+  -1.0
+};
+
+const POSVEL_T sphdesign<2>::y[] = {
+  0,
+  0
+};
+
+const POSVEL_T sphdesign<2>::z[] = {
+  0,
+  0
+};
+
+const POSVEL_T sphdesign<3>::x[] = { // three points 120 degrees apart on the unit circle in the z = 0 plane
+  1.0,
+  -.5,
+  -.5
+};
+
+const POSVEL_T sphdesign<3>::y[] = {
+  0,
+  .86602540378443864675,
+  -.86602540378443864675
+};
+
+const POSVEL_T sphdesign<3>::z[] = {
+  0,
+  0,
+  0
+};
+
+const POSVEL_T sphdesign<4>::x[] = { // four points (+-1,+-1,+-1)/sqrt(3): the vertices of a regular tetrahedron
+  .577350269189625763,
+  .577350269189625763,
+  -.577350269189625763,
+  -.577350269189625763
+};
+
+const POSVEL_T sphdesign<4>::y[] = {
+  .577350269189625763,
+  -.577350269189625763,
+  .577350269189625763,
+  -.577350269189625763
+};
+
+const POSVEL_T sphdesign<4>::z[] = {
+  .577350269189625763,
+  -.577350269189625763,
+  -.577350269189625763,
+  .577350269189625763
+};
+
+const POSVEL_T sphdesign<6>::x[] = { // the six points at +-1 along each coordinate axis
+  1.0,
+  -1.0,
+  0,
+  0,
+  0,
+  0
+};
+
+const POSVEL_T sphdesign<6>::y[] = {
+  0,
+  0,
+  1.0,
+  -1.0,
+  0,
+  0
+};
+
+const POSVEL_T sphdesign<6>::z[] = {
+  0,
+  0,
+  0,
+  0,
+  1.0,
+  -1.0
+};
+
+// This is a 3-D 12-point spherical 4-design
+// (the vertices of an icosahedron) from Hardin and Sloane.
+const POSVEL_T sphdesign<12>::x[] = {
+  0,
+  0,
+  0.525731112119134,
+  -0.525731112119134,
+  0.85065080835204,
+  -0.85065080835204,
+  0,
+  0,
+  -0.525731112119134,
+  0.525731112119134,
+  -0.85065080835204,
+  0.85065080835204
+};
+
+const POSVEL_T sphdesign<12>::y[] = {
+  0.85065080835204,
+  0.85065080835204,
+  0,
+  0,
+  0.525731112119134,
+  0.525731112119134,
+  -0.85065080835204,
+  -0.85065080835204,
+  0,
+  0,
+  -0.525731112119134,
+  -0.525731112119134
+};
+
+const POSVEL_T sphdesign<12>::z[] = {
+  0.525731112119134,
+  -0.525731112119134,
+  0.85065080835204,
+  0.85065080835204,
+  0,
+  0,
+  -0.525731112119134,
+  0.525731112119134,
+  -0.85065080835204,
+  -0.85065080835204,
+  0,
+  0
+};
+
+// This is a 3-D 14-point spherical 4-design by
+// R. H. Hardin and N. J. A. Sloane.
+const POSVEL_T sphdesign<14>::x[] = {
+  1.0e0,
+  5.947189772040725e-1,
+  5.947189772040725e-1,
+  5.947189772040725e-1,
+  -5.947189772040725e-1,
+  -5.947189772040725e-1,
+  -5.947189772040725e-1,
+  3.012536847870683e-1,
+  3.012536847870683e-1,
+  3.012536847870683e-1,
+  -3.012536847870683e-1,
+  -3.012536847870683e-1,
+  -3.012536847870683e-1,
+  -1.0e0
+};
+
+const POSVEL_T sphdesign<14>::y[] = {
+  0.0e0,
+  1.776539926025823e-1,
+  -7.678419429698292e-1,
+  5.90187950367247e-1,
+  1.776539926025823e-1,
+  5.90187950367247e-1,
+  -7.678419429698292e-1,
+  8.79474443923065e-1,
+  -7.588425179318781e-1,
+  -1.206319259911869e-1,
+  8.79474443923065e-1,
+  -1.206319259911869e-1,
+  -7.588425179318781e-1,
+  0.0e0
+};
+
+const POSVEL_T sphdesign<14>::z[] = {
+  0.0e0,
+  7.840589244857197e-1,
+  -2.381765915652909e-1,
+  -5.458823329204288e-1,
+  -7.840589244857197e-1,
+  5.458823329204288e-1,
+  2.381765915652909e-1,
+  3.684710570566285e-1,
+  5.774116818882528e-1,
+  -9.458827389448813e-1,
+  -3.684710570566285e-1,
+  9.458827389448813e-1,
+  -5.774116818882528e-1,
+  0.0e0
+};
+} // anonymous namespace
+
+// Note: In Gafton and Rosswog the far-field force contribution is calculated
+// per-cell (at the center of mass), and then a Taylor expansion about the center
+// of mass is used to calculate the force on the individual particles. For this to
+// work, the functional form of the force must be known (because the Jacobian
+// and Hessian are required). Here, however, the functional form is not known,
+// and so the pseudo-particle method of Makino is used instead.
+
+template <int TDPTS>
+RCOForceTree<TDPTS>::RCOForceTree(
+                         POSVEL_T* minLoc,
+                         POSVEL_T* maxLoc,
+                         POSVEL_T* minForceLoc,
+                         POSVEL_T* maxForceLoc,
+                         ID_T count,
+                         POSVEL_T* xLoc,
+                         POSVEL_T* yLoc,
+                         POSVEL_T* zLoc,
+                         POSVEL_T* xVel,
+                         POSVEL_T* yVel,
+                         POSVEL_T* zVel,
+                         POSVEL_T* ms,
+                         POSVEL_T* phiLoc,
+                         ID_T *idLoc,
+                         MASK_T *maskLoc,
+                         POSVEL_T avgMass,
+                         POSVEL_T fsm,
+                         POSVEL_T oa,
+                         ID_T nd,
+                         ID_T ds,
+                         ForceLaw *fl,
+                         float fcoeff,
+                         POSVEL_T ppc)
+{
+  // The tree works directly on the caller's particle arrays: only the
+  // pointers and the particle count are stored, nothing is copied.
+  particleCount = count;
+  mask = maskLoc;
+  id = idLoc;
+  phi = phiLoc;
+  mass = ms;
+  vz = zVel;
+  vy = yVel;
+  vx = xVel;
+  zz = zLoc;
+  yy = yLoc;
+  xx = xLoc;
+
+  // Scalar parameters; the opening angle is kept as its sine and tangent.
+  ppContract = ppc;
+  depthSafety = ds;
+  nDirect = nd;
+  fsrrmax = fsm;
+  particleMass = avgMass;
+  tanOpeningAngle = tanf(oa);
+  sinOpeningAngle = sinf(oa);
+
+  // Copy the overall bounding box and the force-update bounding box.
+  for (int d = 0; d != DIMENSION; ++d) {
+    maxForceRange[d] = maxForceLoc[d];
+    minForceRange[d] = minForceLoc[d];
+    maxRange[d] = maxLoc[d];
+    minRange[d] = minLoc[d];
+  }
+
+  // Without a caller-supplied force law, fall back to a ForceLawNewton that
+  // this object owns (the destructor deletes it) and a coefficient of 1.
+  if (!fl) {
+    m_own_fl = true;
+    m_fl = new ForceLawNewton();
+    m_fcoeff = 1.0;
+  } else {
+    m_own_fl = false;
+    m_fl = fl;
+    m_fcoeff = fcoeff;
+  }
+
+  // Node storage is reserved once, up front, from an estimate: the particle
+  // count divided by nDirect, shifted left by depthSafety, plus one. The tree
+  // may be built in parallel and is not locked, so if the estimate turns out
+  // too small, nodes that really should be subdivided will not be. Ideally
+  // the estimate is an over-estimate.
+  const ID_T perDirect = (ID_T)(particleCount/(POSVEL_T)nDirect);
+  const ID_T nds = (perDirect << depthSafety) + 1;
+  tree.reserve(nds);
+
+  // Build the tree from the particle locations, then report on it.
+  createRCOForceTree();
+  printStats();
+
+  // One interaction list per coordinate/mass and one interaction queue per
+  // thread; a single thread is used here.
+  const int nthreads = 1;
+  iq.resize(nthreads);
+  inm.resize(nthreads);
+  inz.resize(nthreads);
+  iny.resize(nthreads);
+  inx.resize(nthreads);
+
+  // With everything in place, compute the forces.
+  calcInternodeForces();
+}
+
+template <int TDPTS>
+RCOForceTree<TDPTS>::~RCOForceTree()
+{
+  // The force law is released only when the constructor allocated it itself.
+  if (!m_own_fl) return;
+  delete m_fl;
+}
+
+// Prints post-build statistics to stdout: particle count, node count and
+// capacity, leaf counts, and the mean/max number of particles per populated leaf.
+template <int TDPTS>
+void RCOForceTree<TDPTS>::printStats()
+{
+  size_t zeroLeafNodes = 0, nonzeroLeafNodes = 0, maxPPN = 0, leafParts = 0;
+
+  // The scan starts at node 1; node 0 is not counted (presumably not a real leaf candidate - confirm).
+  for (ID_T tl = 1; tl < (ID_T) tree.size(); ++tl) {
+    if (!tree[tl].leaf()) continue;
+    if (!(tree[tl].count > 0)) { ++zeroLeafNodes; continue; }
+
+    ++nonzeroLeafNodes;
+    leafParts += tree[tl].count;
+    maxPPN = std::max((size_t) tree[tl].count, maxPPN);
+  }
+
+  // With no populated leaf there is nothing to average: report 0 instead of dividing by zero.
+  const double meanPPN = nonzeroLeafNodes ? leafParts/((double) nonzeroLeafNodes) : 0.0;
+
+  // %zu is the printf conversion for size_t; %lu is only right where size_t is unsigned long.
+  printf("\ttree post-build statistics:\n");
+  printf("\t\tparticles: %zu\n", (size_t) particleCount);
+  printf("\t\tnodes: %zu (allocated: %zu)\n", (size_t) tree.size(), (size_t) tree.capacity());
+  printf("\t\tleaves: %zu (empty: %zu)\n", zeroLeafNodes+nonzeroLeafNodes, zeroLeafNodes);
+  printf("\t\tmean ppn: %.2f (max ppn: %zu)\n", meanPPN, maxPPN);
+}
+
+static inline void cm(ID_T count, const POSVEL_T* __restrict xx, const POSVEL_T* __restrict yy,
+                      const POSVEL_T* __restrict zz, const POSVEL_T* __restrict mass,
+                      POSVEL_T* __restrict xmin, POSVEL_T* __restrict xmax, POSVEL_T* __restrict xc)
+{
+  // Replaces the incoming xmin/xmax (the whole, too conservative, box) by the
+  // tight bounding box of the count particles and stores their mass-weighted
+  // mean position in xc. With count == 0 the box is left untouched.
+  if (count > 0) {
+    xmin[0] = xmax[0] = xx[0];
+    xmin[1] = xmax[1] = yy[0];
+    xmin[2] = xmax[2] = zz[0];
+  }
+  for (int k = 1; k < count; ++k) {
+    xmin[0] = std::min(xmin[0], xx[k]);
+    xmax[0] = std::max(xmax[0], xx[k]);
+    xmin[1] = std::min(xmin[1], yy[k]);
+    xmax[1] = std::max(xmax[1], yy[k]);
+    xmin[2] = std::min(xmin[2], zz[k]);
+    xmax[2] = std::max(xmax[2], zz[k]);
+  }
+
+  // The weighted sums are accumulated in double, in particle order.
+  double sumX = 0, sumY = 0, sumZ = 0, sumM = 0;
+  for (int k = 0; k < count; ++k) {
+    const POSVEL_T w = mass[k];
+    sumX += w*xx[k];
+    sumY += w*yy[k];
+    sumZ += w*zz[k];
+    sumM += w;
+  }
+  xc[0] = (POSVEL_T) (sumX/sumM);
+  xc[1] = (POSVEL_T) (sumY/sumM);
+  xc[2] = (POSVEL_T) (sumZ/sumM);
+}
+
+static inline POSVEL_T pptdr(const POSVEL_T* __restrict xmin, const POSVEL_T* __restrict xmax, const POSVEL_T* __restrict xc)
+{
+  POSVEL_T low = std::min(xc[0] - xmin[0], std::min(xc[1] - xmin[1], xc[2] - xmin[2])); // nearest lower face
+  return std::min(xmax[0] - xc[0], std::min(xmax[1] - xc[1], std::min(xmax[2] - xc[2], low))); // nearest face overall
+}
+
+template <int TDPTS>
+static inline void pppts(POSVEL_T tdr, const POSVEL_T* __restrict xc,
+                         POSVEL_T* __restrict ppx, POSVEL_T* __restrict ppy, POSVEL_T* __restrict ppz)
+{
+  // Pseudo-particle positions: the TDPTS-point table scaled by tdr and shifted to xc.
+  typedef sphdesign<TDPTS> design;
+  for (int k = 0; k != TDPTS; ++k) ppx[k] = tdr*design::x[k] + xc[0];
+  for (int k = 0; k != TDPTS; ++k) ppy[k] = tdr*design::y[k] + xc[1];
+  for (int k = 0; k != TDPTS; ++k) ppz[k] = tdr*design::z[k] + xc[2];
+}
+
+template <int TDPTS>
+static inline void pp(ID_T count, const POSVEL_T* __restrict xx, const POSVEL_T* __restrict yy,
+                      const POSVEL_T* __restrict zz, const POSVEL_T* __restrict mass, const POSVEL_T* __restrict xc,
+                      const POSVEL_T* __restrict ppx, const POSVEL_T* __restrict ppy, const POSVEL_T* __restrict ppz,
+                      POSVEL_T* __restrict ppm, POSVEL_T tdr)
+{
+  // Adds to ppm[0..TDPTS) each particle's share of the pseudo-particle masses
+  // (terms of order 0, 1 and 2 in ri/tdr). ppm is accumulated into, not reset.
+  POSVEL_T K = TDPTS;
+  POSVEL_T odr0 = 1/K;
+  for (int i = 0; i < count; ++i) {
+    POSVEL_T xi = xx[i] - xc[0];
+    POSVEL_T yi = yy[i] - xc[1];
+    POSVEL_T zi = zz[i] - xc[2];
+    POSVEL_T ri = sqrtf(xi*xi + yi*yi + zi*zi);
+    for (int j = 0; j < TDPTS; ++j) {
+      POSVEL_T xj = ppx[j] - xc[0];
+      POSVEL_T yj = ppy[j] - xc[1];
+      POSVEL_T zj = ppz[j] - xc[2];
+      POSVEL_T rj2 = xj*xj + yj*yj + zj*zj;
+      // The angular terms use the cosine between the two offsets, which is 0/0
+      // when either has zero length. A particle exactly on xc (ri == 0) thus adds
+      // only its monopole share; without the ri test ppm[j] would become NaN.
+      POSVEL_T odr1 = 0, odr2 = 0;
+      if (rj2 != 0 && ri != 0) {
+        POSVEL_T rj  = sqrtf(rj2);
+        POSVEL_T aij = (xi*xj + yi*yj + zi*zj)/(ri*rj);
+        odr1 = (3/K)*(ri/tdr)*aij;
+        odr2 = (5/K)*(ri/tdr)*(ri/tdr)*0.5*(3*aij*aij - 1);
+      }
+      ppm[j] += mass[i]*(odr0 + odr1 + odr2);
+    }
+  }
+}
+
+static inline void nbody1(ID_T count, ID_T count1, const POSVEL_T* __restrict xx, const POSVEL_T* __restrict yy,
+                         const POSVEL_T* __restrict zz, const POSVEL_T* __restrict mass,
+                         const POSVEL_T* __restrict xx1, const POSVEL_T* __restrict yy1,
+                         const POSVEL_T* __restrict zz1, const POSVEL_T* __restrict mass1,
+                         POSVEL_T* __restrict vx, POSVEL_T* __restrict vy, POSVEL_T* __restrict vz,
+                         ForceLaw *fl, float fcoeff, float fsrrmax)
+{
+  // Direct sum: vx/vy/vz of every particle in the first set are incremented by
+  // every particle of the second set. Pairs not closer than fsrrmax get factor 0.
+  const POSVEL_T cutoff2 = fsrrmax*fsrrmax;
+  for (int a = 0; a < count; ++a) {
+    for (int b = 0; b < count1; ++b) {
+      const POSVEL_T dx = xx1[b] - xx[a];
+      const POSVEL_T dy = yy1[b] - yy[a];
+      const POSVEL_T dz = zz1[b] - zz[a];
+      const POSVEL_T r2 = dx*dx + dy*dy + dz*dz;
+      const POSVEL_T pairForce = mass[a]*mass1[b] * fl->f_over_r(r2);
+      const POSVEL_T inRange = 1.0*(r2 < cutoff2);
+      const POSVEL_T scale = inRange*fcoeff*pairForce;
+      vx[a] += scale*dx;
+      vy[a] += scale*dy;
+      vz[a] += scale*dz;
+    }
+  }
+}
+
+static inline void partition(ID_T n,
+                             POSVEL_T* __restrict xx, POSVEL_T* __restrict yy, POSVEL_T* __restrict zz,
+                             POSVEL_T* __restrict vx, POSVEL_T* __restrict vy, POSVEL_T* __restrict vz,
+                             POSVEL_T* __restrict mass, POSVEL_T* __restrict phi,
+                             ID_T* __restrict id, MASK_T* __restrict mask, const POSVEL_T* __restrict pv,
+                             ID_T* __restrict is)
+{
+  // Sorts the n particles in place into NUM_CHILDREN consecutive bins chosen by
+  // comparing each coordinate with pv. is[b] is one past the end of bin b for
+  // b < NUM_CHILDREN-1 (so is[b+1] >= is[b]); the last bin ends at the scan index.
+  for (int i = 0; i < NUM_CHILDREN-1; ++i) {
+    is[i] = 0;
+  }
+
+  for (ID_T i = 0; i < n; ++i) {
+    int p = 0;
+    if (xx[i] >= pv[0]) p += NUM_CHILDREN/2;
+    if (yy[i] >= pv[1]) p += NUM_CHILDREN/4;
+    if (zz[i] >= pv[2]) p += NUM_CHILDREN/8;
+    if (p >= NUM_CHILDREN-1) continue; // already sits at the end of the last bin
+
+    // Move element i to the end of bin p: swap it with the first element of the
+    // last bin, then hand it down bin by bin. The last bin's end is i itself; is[]
+    // has only NUM_CHILDREN-1 entries, so is[NUM_CHILDREN-1] must never be read.
+
+#define CHAIN_SWAP(var) \
+    std::swap(var[is[NUM_CHILDREN-2]], var[i]); \
+    for (int q = NUM_CHILDREN-2; q > p; --q) std::swap(var[is[q-1]], var[is[q]]); \
+/**/
+    CHAIN_SWAP(xx)
+    CHAIN_SWAP(yy)
+    CHAIN_SWAP(zz)
+    CHAIN_SWAP(vx)
+    CHAIN_SWAP(vy)
+    CHAIN_SWAP(vz)
+    CHAIN_SWAP(mass)
+    CHAIN_SWAP(phi)
+    CHAIN_SWAP(id)
+    CHAIN_SWAP(mask)
+#undef CHAIN_SWAP
+
+    do { ++is[p++]; } while (p < NUM_CHILDREN-1);
+  }
+}
+                             
+// Partition the particles of node tl about its split point and attach every
+// non-empty bin as a child (node indices taken from tlc), recursing into each.
+template <int TDPTS>
+void RCOForceTree<TDPTS>::createRCOForceSubtree(ID_T tl, const ID_T *__restrict tlc)
+{
+  ID_T is[NUM_CHILDREN-1];
+  POSVEL_T split[DIMENSION];
+  // Split at the node's xc; geoSplit would select the bounding-box midpoint instead.
+  const bool geoSplit = false;
+  for (int d = 0; d < DIMENSION; ++d) {
+    if (geoSplit) split[d] = (tree[tl].xmax[d]+tree[tl].xmin[d])/2;
+    else          split[d] = tree[tl].xc[d];
+  }
+
+  const ID_T base = tree[tl].offset;
+  const ID_T total = tree[tl].count;
+  ::partition(total, xx + base, yy + base, zz + base, vx + base, vy + base, vz + base,
+              mass + base, phi + base, id + base, mask + base, split, is);
+
+  // If every boundary is 0 or total, all particles share one bin: nothing to divide.
+  bool divided = false;
+  for (int k = 0; k < NUM_CHILDREN-1; ++k)
+    divided = divided || (is[k] != 0 && is[k] != total);
+  if (!divided) return;
+
+  for (int k = 0; k < NUM_CHILDREN; ++k) { // bin size = difference of its boundaries
+    const ID_T lo = (k == 0) ? 0 : is[k-1];
+    const ID_T hi = (k == NUM_CHILDREN-1) ? total : is[k];
+    tree[tlc[k]].count = hi - lo;
+  }
+
+  // Non-empty children are laid out back-to-back starting at the parent's offset.
+  ID_T next = base;
+  for (int k = 0; k < NUM_CHILDREN; ++k) {
+    const ID_T child = tlc[k];
+    if (tree[child].count <= 0) continue;
+    tree[tl].c[k] = child;
+    tree[child].offset = next;
+    // Note: the child's xmax,xmin are not set here b/c cm will compute them.
+    next += tree[child].count;
+    createRCOForceTreeInParallel(child);
+  }
+}
+
+// Recursively build the subtree rooted at node tl; basically the algorithm from
+// (Gafton and Rosswog, 2011). tree may grow here, so nodes are always re-indexed.
+template <int TDPTS>
+void RCOForceTree<TDPTS>::createRCOForceTreeInParallel(ID_T tl)
+{
+  const ID_T nPart = tree[tl].count;
+  const ID_T first = tree[tl].offset;
+  // Compute the center-of-mass coordinates (and recompute the min/max)
+  ::cm(nPart, xx + first, yy + first, zz + first, mass + first,
+       tree[tl].xmin, tree[tl].xmax, tree[tl].xc);
+
+  if (nPart <= nDirect) {
+    // Few enough particles: no children are created, only the pseudoparticles.
+    tree[tl].tdr = ppContract*::pptdr(tree[tl].xmin, tree[tl].xmax, tree[tl].xc);
+    memset(tree[tl].ppm, 0, sizeof(POSVEL_T)*TDPTS);
+    if (nPart > TDPTS) { // Otherwise, the pseudoparticles are never used
+      POSVEL_T lx[TDPTS], ly[TDPTS], lz[TDPTS];
+      pppts<TDPTS>(tree[tl].tdr, tree[tl].xc, lx, ly, lz);
+      pp<TDPTS>(nPart, xx + first, yy + first, zz + first, mass + first, tree[tl].xc,
+                lx, ly, lz, tree[tl].ppm, tree[tl].tdr);
+    }
+    return;
+  }
+
+  // Append NUM_CHILDREN zero-filled nodes to the tree; tlc[k] is the index of child k.
+  const ID_T firstChild = tree.size();
+  ID_T tlc[NUM_CHILDREN];
+  for (int k = 0; k < NUM_CHILDREN; ++k) {
+    tlc[k] = firstChild + k;
+  }
+  tree.resize(tlc[NUM_CHILDREN-1]+1);
+  memset(&tree[tlc[0]], 0, sizeof(TreeNode)*NUM_CHILDREN);
+
+  // Every child starts with a copy of the parent's bounding box; cm recomputes
+  // the min/max from the child's own particles when that child is built.
+  for (int k = 0; k < NUM_CHILDREN; ++k) {
+    for (int d = 0; d < DIMENSION; ++d) {
+      tree[tlc[k]].xmin[d] = tree[tl].xmin[d];
+      tree[tlc[k]].xmax[d] = tree[tl].xmax[d];
+    }
+  }
+
+  // Split all edges at the center of mass.
+  createRCOForceSubtree(tl, tlc);
+
+  // Compute the pseudoparticles based on those of the children
+  POSVEL_T px[TDPTS], py[TDPTS], pz[TDPTS];
+  tree[tl].tdr = ppContract*::pptdr(tree[tl].xmin, tree[tl].xmax, tree[tl].xc);
+  pppts<TDPTS>(tree[tl].tdr, tree[tl].xc, px, py, pz);
+  memset(tree[tl].ppm, 0, sizeof(POSVEL_T)*TDPTS);
+
+  for (int k = 0; k < NUM_CHILDREN; ++k) {
+    const ID_T child = tlc[k];
+    if (tree[child].count <= 0) continue;
+    if (tree[child].count > TDPTS) {
+      // Large child: feed its pseudoparticles in as if they were particles.
+      POSVEL_T cx[TDPTS], cy[TDPTS], cz[TDPTS];
+      pppts<TDPTS>(tree[child].tdr, tree[child].xc, cx, cy, cz);
+      pp<TDPTS>(TDPTS, cx, cy, cz, tree[child].ppm, tree[tl].xc,
+                px, py, pz, tree[tl].ppm, tree[tl].tdr);
+    } else {
+      // Small child: use the particles that are actually there.
+      const ID_T childOff = tree[child].offset;
+      pp<TDPTS>(tree[child].count, xx + childOff, yy + childOff, zz + childOff, mass + childOff,
+                tree[tl].xc, px, py, pz, tree[tl].ppm, tree[tl].tdr);
+    }
+  }
+}
+
+// Reset the tree to a single root node holding all particleCount particles and
+// spanning [minRange, maxRange], then build the hierarchy beneath it.
+template <int TDPTS>
+void RCOForceTree<TDPTS>::createRCOForceTree()
+{
+  tree.resize(1);
+  memset(&tree[0], 0, sizeof(TreeNode));
+
+  tree[0].offset = 0;
+  tree[0].count = particleCount;
+  for (int d = 0; d < DIMENSION; ++d) {
+    tree[0].xmax[d] = maxRange[d];
+    tree[0].xmin[d] = minRange[d];
+  }
+
+  createRCOForceTreeInParallel(); // default argument: start at node 0
+}
+
+// Build the interaction list for node tl by walking the tree from the root, append
+// tl's own particles to it, and apply the whole list to tl's particles with nbody1.
+// parents must be sorted ascending, because it is searched with std::binary_search.
+template <int TDPTS>
+void RCOForceTree<TDPTS>::calcInternodeForce(ID_T tl, const std::vector<ID_T> &parents) {
+  POSVEL_T fsrrmax2 = fsrrmax*fsrrmax;
+  int tid = 0; // index into the per-thread scratch vectors; always slot 0 here
+
+  std::vector<ID_T> &q = iq[tid];
+  q.clear();
+  q.push_back(0);
+
+  // The interaction list.
+  std::vector<POSVEL_T> &nx = inx[tid];
+  std::vector<POSVEL_T> &ny = iny[tid];
+  std::vector<POSVEL_T> &nz = inz[tid];
+  std::vector<POSVEL_T> &nm = inm[tid];
+  nx.clear();
+  ny.clear();
+  nz.clear();
+  nm.clear();
+  nx.reserve(4096);
+  ny.reserve(4096);
+  nz.reserve(4096);
+  nm.reserve(4096);
+
+  while (!q.empty()) {
+    ID_T tln = q.back();
+    q.pop_back();
+
+    // We should not interact with our own parents.
+    if (tln < tl) {
+      bool isParent = std::binary_search(parents.begin(), parents.end(), tln);
+      if (isParent) {
+        for (int i = 0; i < NUM_CHILDREN; ++i) {
+          ID_T tlnc = tree[tln].c[i];
+
+          if (tlnc != tl && tlnc > 0 && tree[tlnc].count > 0) {
+            q.push_back(tlnc);
+          }
+        }
+
+        continue;
+      }
+    }
+
+    // Does this node have a small enough opening angle to interact with?
+    POSVEL_T dx = tree[tln].xc[0] - tree[tl].xc[0];
+    POSVEL_T dy = tree[tln].xc[1] - tree[tl].xc[1];
+    POSVEL_T dz = tree[tln].xc[2] - tree[tl].xc[2];
+    POSVEL_T dist2 = dx*dx + dy*dy + dz*dz;
+
+    POSVEL_T sx = tree[tln].xmax[0]-tree[tln].xmin[0];
+    POSVEL_T sy = tree[tln].xmax[1]-tree[tln].xmin[1];
+    POSVEL_T sz = tree[tln].xmax[2]-tree[tln].xmin[2];
+    POSVEL_T l2 = std::min(sx*sx, std::min(sy*sy, sz*sz)); // under-estimate
+
+    POSVEL_T dtt2 = dist2*tanOpeningAngle*tanOpeningAngle;
+    bool looksBig;
+    // l2/dist2 is really tan^2 theta, for small theta, tan(theta) ~ theta
+    if (l2 > dtt2) {
+      // the under-estimate is too big, so this is definitely too big
+      looksBig = true;
+    } else {
+      // The remote node's box has 8 corners, i.e. 4 corner-to-opposite-corner
+      // diagonals, and the loops below test each diagonal once. Corners are taken
+      // relative to this node's c.m. only (do we need to do this for each point in
+      // the leaf node, or will the c.m. point be sufficient?).
+      looksBig = false;
+      for (int i = 0; i < 2 && !looksBig; ++i) // the breaks below only leave the j loop
+      for (int j = 0; j < 2; ++j) {
+        POSVEL_T x1 = (i == 0 ? tree[tln].xmin : tree[tln].xmax)[0] - tree[tl].xc[0];
+        POSVEL_T y1 = (j == 0 ? tree[tln].xmin : tree[tln].xmax)[1] - tree[tl].xc[1];
+        POSVEL_T z1 = tree[tln].xmin[2] - tree[tl].xc[2];
+
+        POSVEL_T x2 = (i == 0 ? tree[tln].xmax : tree[tln].xmin)[0] - tree[tl].xc[0];
+        POSVEL_T y2 = (j == 0 ? tree[tln].xmax : tree[tln].xmin)[1] - tree[tl].xc[1];
+        POSVEL_T z2 = tree[tln].xmax[2] - tree[tl].xc[2];
+
+        const bool useRealOA = false;
+        if (useRealOA) {
+          // |a x b| = a*b*sin(theta)
+          POSVEL_T cx = y1*z2 - z1*y2;
+          POSVEL_T cy = z1*x2 - x1*z2;
+          POSVEL_T cz = x1*y2 - y1*x2;
+          if ((cx*cx + cy*cy + cz*cz) > sinOpeningAngle*sinOpeningAngle*
+                                          (x1*x1 + y1*y1 + z1*z1)*(x2*x2 + y2*y2 + z2*z2)
+             ) {
+            looksBig = true;
+            break;
+          }
+        } else {
+          // Instead of using the real opening angle, use the tan approximation; this is
+          // better than the opening-angle b/c it incorporates depth information.
+          POSVEL_T ddx = x1 - x2, ddy = y1 - y2, ddz = z1 - z2;
+          POSVEL_T dh2 = ddx*ddx + ddy*ddy + ddz*ddz;
+          if (dh2 > dtt2) {
+            looksBig = true;
+            break;
+          }
+        }
+      }
+    }
+
+    if (!looksBig) {
+      if (dist2 > fsrrmax2) {
+        // We could interact with this node, but it is too far away to make
+        // any difference, so it will be skipped, along with all of its
+        // children.
+        continue;
+      }
+
+      // This node has fewer particles than pseudo particles, so just use the
+      // particles that are actually there.
+      if (tree[tln].count <= TDPTS) {
+        ID_T offn = tree[tln].offset;
+        ID_T cntn = tree[tln].count;
+        size_t start = nx.size();
+        nx.resize(nx.size() + cntn);
+        ny.resize(ny.size() + cntn);
+        nz.resize(nz.size() + cntn);
+        nm.resize(nm.size() + cntn);
+        for (size_t i = 0; i < (size_t) cntn; ++i) {
+          nx[start + i] = xx[offn + i];
+          ny[start + i] = yy[offn + i];
+          nz[start + i] = zz[offn + i];
+          nm[start + i] = mass[offn + i];
+        }
+
+        continue;
+      }
+
+      // Interact the particles in this node with the pseudoparticles of the
+      // other node.
+      size_t start = nx.size();
+      nx.resize(nx.size() + TDPTS);
+      ny.resize(ny.size() + TDPTS);
+      nz.resize(nz.size() + TDPTS);
+      nm.resize(nm.size() + TDPTS);
+      pppts<TDPTS>(tree[tln].tdr, tree[tln].xc, &nx[start], &ny[start], &nz[start]);
+      for (size_t i = 0; i < (size_t) TDPTS; ++i) {
+        nm[start + i] = tree[tln].ppm[i];
+      }
+
+      continue;
+    } else if (tree[tln].leaf()) {
+      // This is a leaf node with which we must interact.
+      ID_T offn = tree[tln].offset;
+      ID_T cntn = tree[tln].count;
+      size_t start = nx.size();
+      nx.resize(nx.size() + cntn);
+      ny.resize(ny.size() + cntn);
+      nz.resize(nz.size() + cntn);
+      nm.resize(nm.size() + cntn);
+      for (size_t i = 0; i < (size_t) cntn; ++i) {
+        nx[start + i] = xx[offn + i];
+        ny[start + i] = yy[offn + i];
+        nz[start + i] = zz[offn + i];
+        nm[start + i] = mass[offn + i];
+      }
+
+      continue;
+    }
+
+    // This other node is not a leaf, but has too large an opening angle
+    // for an approx. interaction: queue its children.
+
+    for (int j = 0; j < NUM_CHILDREN; ++j) {
+      ID_T tlnc = tree[tln].c[j];
+
+      if (tlnc > 0 && tree[tlnc].count > 0) {
+        bool close = true;
+        for (int i = 0; i < DIMENSION; ++i) {
+          POSVEL_T dist = 0;
+          if (tree[tl].xmax[i] < tree[tlnc].xmin[i]) {
+            dist = tree[tlnc].xmin[i] - tree[tl].xmax[i];
+          } else if (tree[tl].xmin[i] > tree[tlnc].xmax[i]) {
+            dist = tree[tl].xmin[i] - tree[tlnc].xmax[i];
+          }
+
+          if (dist > fsrrmax) {
+            close = false;
+            break;
+          }
+        }
+
+        if (close) q.push_back(tlnc);
+      }
+    }
+  }
+
+  ID_T off = tree[tl].offset;
+  ID_T cnt = tree[tl].count;
+  if (cnt == 0) return; // nothing to update; also keeps &nx[0] off an empty list
+  // Add self interactions...
+  size_t start = nx.size();
+  nx.resize(nx.size() + cnt);
+  ny.resize(ny.size() + cnt);
+  nz.resize(nz.size() + cnt);
+  nm.resize(nm.size() + cnt);
+  for (size_t i = 0; i < (size_t) cnt; ++i) {
+    nx[start + i] = xx[off + i];
+    ny[start + i] = yy[off + i];
+    nz[start + i] = zz[off + i];
+    nm[start + i] = mass[off + i];
+  }
+
+  // Process the interaction list...
+  ::nbody1(cnt, nx.size(),
+           xx + off, yy + off, zz + off, mass + off,
+           &nx[0], &ny[0], &nz[0], &nm[0],
+           vx + off, vy + off, vz + off, m_fl, m_fcoeff, fsrrmax);
+}
+
+// Depth-first walk over the tree with an explicit stack. Each leaf whose bounding
+// box has its min or its max coordinate strictly inside the force range in every
+// dimension is handed to calcInternodeForce, together with the list of non-leaf
+// nodes currently open above it so that it does not interact with its parents.
+template <int TDPTS>
+void RCOForceTree<TDPTS>::calcInternodeForces()
+{
+  std::vector<ID_T> ancestors;
+  std::vector<ID_T> todo(1, 0);
+  while (!todo.empty()) {
+    const ID_T node = todo.back();
+
+    if (!tree[node].leaf()) {
+      if (!ancestors.empty() && ancestors.back() == node) {
+        // Second visit: every child has been handled, so retire this node.
+        ancestors.pop_back();
+        todo.pop_back();
+      } else {
+        // First visit: leave the node on the stack and queue its children above it.
+        for (int k = 0; k < NUM_CHILDREN; ++k) {
+          if (tree[node].c[k] > 0) todo.push_back(tree[node].c[k]);
+        }
+        ancestors.push_back(node);
+      }
+      continue;
+    }
+
+    // This is a leaf node.
+    todo.pop_back();
+    bool inside = true;
+    for (int d = 0; d < DIMENSION; ++d) {
+      const bool maxIn = tree[node].xmax[d] < maxForceRange[d] && tree[node].xmax[d] > minForceRange[d];
+      const bool minIn = tree[node].xmin[d] < maxForceRange[d] && tree[node].xmin[d] > minForceRange[d];
+      inside = inside && (maxIn || minIn);
+    }
+
+    if (inside) {
+      calcInternodeForce(node, ancestors);
+    }
+  }
+}
+
+// Explicit template instantiation for the two pseudo-particle counts defined in RCOForceTree.h.
+template class RCOForceTree<QUADRUPOLE_TDPTS>;
+template class RCOForceTree<MONOPOLE_TDPTS>;
+
diff --git a/src/halo-finder/src/RCOForceTree.h b/src/halo-finder/src/RCOForceTree.h
new file mode 100644
index 0000000..62c3195
--- /dev/null
+++ b/src/halo-finder/src/RCOForceTree.h
@@ -0,0 +1,205 @@
+/*=========================================================================
+                                                                                
+Copyright (c) 2007, Los Alamos National Security, LLC
+
+All rights reserved.
+
+Copyright 2007. Los Alamos National Security, LLC. 
+This software was produced under U.S. Government contract DE-AC52-06NA25396 
+for Los Alamos National Laboratory (LANL), which is operated by 
+Los Alamos National Security, LLC for the U.S. Department of Energy. 
+The U.S. Government has rights to use, reproduce, and distribute this software. 
+NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,
+EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  
+If software is modified to produce derivative works, such modified software 
+should be clearly marked, so as not to confuse it with the version available 
+from LANL.
+ 
+Additionally, redistribution and use in source and binary forms, with or 
+without modification, are permitted provided that the following conditions 
+are met:
+-   Redistributions of source code must retain the above copyright notice, 
+    this list of conditions and the following disclaimer. 
+-   Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution. 
+-   Neither the name of Los Alamos National Security, LLC, Los Alamos National
+    Laboratory, LANL, the U.S. Government, nor the names of its contributors
+    may be used to endorse or promote products derived from this software 
+    without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR 
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+                                                                                
+=========================================================================*/
+
+/*=========================================================================
+
+Copyright (c) 2011-2012 Argonne National Laboratory
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+=========================================================================*/
+
+#ifndef RCOForceTree_h
+#define RCOForceTree_h
+
+#include "BasicDefinition.h"
+#include "ForceLaw.h"
+#include "bigchunk.h"
+
+#include <string>
+#include <vector>
+#include <algorithm>
+
+
+// The number of points used for the pseudo-particle t-design (the TDPTS template argument).
+#define QUADRUPOLE_TDPTS 12 // NOTE(review): 14 looks like an alternative value once tried -- confirm
+#define MONOPOLE_TDPTS   1
+
+// Force tree over a set of particles, templated on TDPTS, the number of
+// pseudo-particles stored per node (see QUADRUPOLE_TDPTS / MONOPOLE_TDPTS).
+template <int TDPTS>
+class RCOForceTree
+{
+public:
+  RCOForceTree(
+              POSVEL_T* minLoc,       // Bounding box of halo
+              POSVEL_T* maxLoc,       // Bounding box of halo
+              POSVEL_T* minForceLoc,  // Bounding box for force updates
+              POSVEL_T* maxForceLoc,  // Bounding box for force updates
+              ID_T count,             // Number of particles in halo
+              POSVEL_T* xLoc,         // Locations of every particle
+              POSVEL_T* yLoc,
+              POSVEL_T* zLoc,
+              POSVEL_T* xVel,         // Velocities of every particle
+              POSVEL_T* yVel,
+              POSVEL_T* zVel,
+              POSVEL_T* mass,         // Mass of each particle
+              POSVEL_T* phiLoc,
+              ID_T *idLoc,
+              MASK_T *maskLoc,
+              POSVEL_T avgMass,       // Average mass for estimation
+              POSVEL_T fsm,
+              POSVEL_T oa,
+              ID_T nd = 1,            // The number of particles below which
+                                      // to do the direct N^2 calculation
+              ID_T ds = 1,            // The "safety" factor to add to the
+                                      // estimated maximum depth
+              ForceLaw *fl = 0,
+              float fcoeff = 0.0,
+              POSVEL_T ppc = 0.9);
+
+  ~RCOForceTree();
+
+  void printStats();
+
+protected:
+  struct TreeNode
+  {
+    ID_T count;                       // The number of particles in this node
+    ID_T offset;                      // The offset into the particle arrays at
+                                      // which data for this tree node starts
+    ID_T c[NUM_CHILDREN];             // Child node indices into tree; 0 = no child
+
+    POSVEL_T ppm[TDPTS];              // The pseudo-particle masses
+    POSVEL_T tdr;                     // The radius of the t-design sphere on
+                                      // which the pseudo-particles sit
+
+    POSVEL_T xmin[DIMENSION],
+             xmax[DIMENSION],
+             xc[DIMENSION];           // The bounding box of this node and its
+                                      // center position.
+    // A node is a leaf when none of its child slots holds an index > 0.
+    bool leaf() const {
+      for (int i = 0; i < NUM_CHILDREN; ++i) {
+        if (c[i] > 0) {
+          return false;
+        }
+      }
+      return true;
+    }
+  };
+
+protected:
+  void createRCOForceSubtree(ID_T tl, const ID_T *__restrict tlc);
+  void createRCOForceTreeInParallel(ID_T tl = 0);
+  void createRCOForceTree();
+
+  void calcInternodeForce(ID_T tl, const std::vector<ID_T> &parents);
+  void calcInternodeForces();
+
+protected:
+  ID_T   particleCount;         // Total particles
+
+  POSVEL_T fsrrmax;
+  POSVEL_T particleMass;        // Average particle mass
+  POSVEL_T sinOpeningAngle,     // Criteria for opening node to lower level
+           tanOpeningAngle;
+  POSVEL_T ppContract;          // The pseudoparticle contraction factor
+
+  POSVEL_T* __restrict xx;      // X location for particles on this processor
+  POSVEL_T* __restrict yy;      // Y location for particles on this processor
+  POSVEL_T* __restrict zz;      // Z location for particles on this processor
+  POSVEL_T* __restrict vx;      // X velocity for particles on this processor
+  POSVEL_T* __restrict vy;      // Y velocity for particles on this processor
+  POSVEL_T* __restrict vz;      // Z velocity for particles on this processor
+  POSVEL_T* __restrict mass;    // Mass for particles on this processor
+
+  POSVEL_T* __restrict phi;
+  ID_T*     __restrict id;
+  MASK_T*   __restrict mask;
+
+  POSVEL_T minRange[DIMENSION]; // Physical range of data
+  POSVEL_T maxRange[DIMENSION]; // Physical range of data
+  POSVEL_T minForceRange[DIMENSION]; // Physical range of data for force updates
+  POSVEL_T maxForceRange[DIMENSION]; // Physical range of data for force updates
+
+  ID_T nDirect;
+  ID_T depthSafety;
+
+  // std:: is spelled out so this header does not rely on a using-directive elsewhere.
+  std::vector<TreeNode, bigchunk_allocator<TreeNode> > tree; // All tree nodes; index 0 is the root
+  bool m_own_fl;
+  ForceLaw *m_fl;
+  float m_fcoeff;
+
+  // Interaction lists (one per thread)
+  std::vector<std::vector<POSVEL_T> > inx, iny, inz, inm;
+  std::vector<std::vector<ID_T> > iq; // The interaction queue
+};
+
+typedef RCOForceTree<QUADRUPOLE_TDPTS> RCOQuadrupoleForceTree; // QUADRUPOLE_TDPTS pseudo-particle masses per node
+typedef RCOForceTree<MONOPOLE_TDPTS>   RCOMonopoleForceTree;   // MONOPOLE_TDPTS pseudo-particle mass per node
+
+#endif // RCOForceTree_h
+
diff --git a/src/halo-finder/src/README b/src/halo-finder/src/README
new file mode 100644
index 0000000..5c090eb
--- /dev/null
+++ b/src/halo-finder/src/README
@@ -0,0 +1,229 @@
+
+Parallel Halo Finder Operation
+------------------------------
+
+The test driver for the parallel halo finder is HaloTestP.cxx.  The basic
+operation of the parallel halo finder is to take input data of particle
+location, velocity, mass and tag identifier and distribute the particles on
+processors such that the particles are completely partitioned.  These are
+called the alive particles belonging to a processor.  Each processor will
+share information so that particles which are alive on a neighbor processor
+are added to a processor becoming dead particles there.  By definition the
+belt of dead particles will be all particles which can comprise a halo.
+
+Once each processor has both alive and dead particles, the serial halo finder
+is called and will return all halos found on that processor.  A halo will
+contain only alive particles, in which case it is owned by the processor,
+or only dead particles and it will be discarded because it will belong to 
+one of the neighbor processors.  A halo might contain a combination of 
+alive and dead particles and this mixed halo will be shared by one or 
+more neighbor processors.  
+
+The parallel halo finder must merge these mixed halos before giving the
+results.  When a particle is assigned to a processor it has a status which
+can be ALIVE, or DEAD.  The dead particles are actually given a status of
+0 through 26 to indicate which neighbor processor contains that particle
+as alive.  When the serial halo finder completes and halos are looked at,
+a mixed halo will also know what neighbors share it.  A simple rule using
+the planes of the neighbor allows most mixed halos to be categorized
+immediately.  If the neighbor sharing the halo is in the "upper" planes
+of the physical space (right, top and back planes of x,y,z) then this
+processor will keep the halo as its own.  Conversely if the neighbor sharing
+the halo is in the "lower" planes (left, bottom, front) then it will delete
+that halo, knowing that the neighbor will claim it.  Any other halos which
+cross planes or are contained in more than one other neighbor are gathered
+up and sent to the master processor who arbitrates which processor will
+claim the halo and which will give it up.
+
+
+Input Files
+-----------
+
+The halo finder takes two styles of input.  The first is the .cosmo format
+which is called RECORD format.  Each particle has seven 4 byte floats 
+(x location, x velocity, y location, y velocity, z location, z velocity, 
+mass (which is 1.0)) and one 4 byte integer which is the unique particle 
+identifier which starts with 1.
+
+The second format is the Gadget style which is called BLOCK.  In this file
+there is a header of 256 bytes which is described in Definition.h.
+It is followed by three blocks of data.  The first has 4 byte floats for
+each particle's x,y,z location.  Note that particle 0 is written but since
+particles start with identifier 1, these first three floats are ignored.
+Next is a block with x,y,z velocity and again the first three floats
+must be ignored.  Finally a block of 4 byte integer tag identifiers with
+the first tag ignored.
+
+
+Methods of Input
+----------------
+
+Data files can either contain only the alive particles for a processor or
+a mixture of particles such that each processor must read each file to
+obtain its alive particles.  In the first case the processor only has
+to read one file, and then can do an exchange of dead particles with 
+immediate neighbor processors which is done in rotation.  All processors
+send right face neighbor particles to the right, and receive from the left,
+and then the reverse is done.  With 26 of these rotations all neighbors will
+get dead particles and every processor has something to do on each step
+of the rotation so that the MPI Send/Receive works.
+
+In the second case every file has to be read, and then MPI messages
+containing the particles are created and passed round robin so that
+every processor has a chance to choose its own alive particles.  After
+this the same particle exchange is done to populate the dead particles.
+
+
+Output Files
+------------
+
+The .cosmo files are written again as output, but with the mass field
+replaced by the mass * the number of particles in the halo.  Only the
+first particle in a halo contains that accumulated mass.  Every other
+particle in the halo reports the tag identifier of that first particle
+so that you can tell what particles make up a halo.  If the mass is -1
+that means the particle is not in any halo.  An additional file per
+processor is written with the identifier and size of every halo on that
+processor and is used by the BinHalos.cxx program to create a histogram.
+
+
+C++ Classes
+-----------
+
+Five classes are available for use.
+
+"Partition" is a static class which does the MPI initialize and finalize and
+which creates the Cartesian topology of the problem and identifies this
+processor within the topology and all of its neighbors in the topology.
+
+"ParticleDistribute" does the reading of files for either ROUND_ROBIN or
+ONE_TO_ONE and in both cases the final result has only alive particles.
+
+The round robin read is complicated because of the MPI message passing.
+Since there can be fewer files than processors, if they can be divided 
+nicely as in 8 files across 32 processors, then four processors read the 
+8 files and distribute in four ROUND_ROBIN loops.  If there are fewer 
+processors than files, then many loops of read will happen although the 
+final loop may have some processors participating in the exchange which 
+did not initially read any data and will just be passing buffers of 0 particles.
+
+The one to one read assumes that files have only alive particles and have
+been laid out in the same topology that MPI Cartesian topology will use.
+This is a preamble to simply passing alive particles in memory.
+
+"ParticleExchange" does the exchange with nearest neighbors using either
+the vectors from ParticleDistribute or from some other initializer which
+contain only alive particles.  When it exits the vectors have both alive
+and dead particles in them.
+
+"CosmoHaloFinderP" takes the information from either ParticleDistribute or
+ParticleExchange, normalizes the locations, and calls the serial
+halo finder.  It collects the results, merges the mixed halos and writes
+halo output.
+
+"CosmoHaloFinder" is the serial halo finder and can operate on its own using
+HaloTest.cxx as a driver.
+
+
+Command Line Arguments
+----------------------
+
+An input file format was devised and is read by the class HaloFinderInput.
+Examples for BLOCK and COSMO formats are in sb256_gadget2.in and
+sb256_cosmo.in.
+
+################################################################################
+# Header version information
+################################################################################
+HALOFINDER_HEADER_VERSION 1.0.0
+
+################################################################################
+# Input base name ending in '.' if followed by processor id
+################################################################################
+INPUT_BASE_NAME /Users/pkf/Cosmo.files/gadget_sb256.cosmo.
+
+################################################################################
+# Input data style (RECORD = .cosmo)  (BLOCK = .gadget2)
+################################################################################
+INPUT_TYPE RECORD
+
+################################################################################
+# Particle distribution style
+#  ROUND_ROBIN indicates particles must be looked at by all processors
+#  ONE_TO_ONE  indicates that particles physically reside on matching processor
+################################################################################
+DISTRIBUTE_TYPE ROUND_ROBIN
+
+################################################################################
+# Output base name
+################################################################################
+OUTPUT_BASE_NAME sb256_c
+
+################################################################################
+# Box size (rL)
+################################################################################
+BOX_SIZE 64.0
+
+################################################################################
+# Overload zone size (dead zone)
+################################################################################
+OVERLOAD_SIZE 5.0
+
+################################################################################
+# Number of particles in all files (np^3)
+################################################################################
+NUMBER_OF_PARTICLES 256 
+
+################################################################################
+# Minimum distance between particles in a halo (bb)
+################################################################################
+MINIMUM_PARTICLE_DISTANCE 0.20
+
+################################################################################
+# Minimum number of particles in a halo (pmin)
+################################################################################
+MINIMUM_PARTICLES_PER_HALO 10
+
+################################################################################
+# Omega dm
+################################################################################
+OMEGADM 1.0
+
+################################################################################
+# Hubble constant
+################################################################################
+HUBBLE_CONSTANT 0.5
+
+################################################################################
+# Deut
+################################################################################
+DEUT 0.0
+
+################################################################################
+# Output all particle data with mass field replaced by halo tag
+################################################################################
+OUTPUT_PARTICLES 0
+
+################################################################################
+# Output the halo catalog of one entry per halo (.cosmo and ascii format)
+################################################################################
+OUTPUT_HALO_CATALOG 1
+
+################################################################################
+# Output FOF halo properties report (ascii)
+################################################################################
+OUTPUT_FOF_PROPERTIES 1
+
+
+Sample Command Lines
+--------------------
+
+mpirun -np 8 HaloFinder sb256_cosmo.in
+mpirun -np 8 HaloFinder sb256_gadget2.in
+
+
+Compile, Load and Run
+---------------------
+
+Modules to load must include a C++ compiler and MPI.
+% make HaloFinder will compile.
diff --git a/src/halo-finder/src/Timer.cxx b/src/halo-finder/src/Timer.cxx
new file mode 100644
index 0000000..09ae3e2
--- /dev/null
+++ b/src/halo-finder/src/Timer.cxx
@@ -0,0 +1,434 @@
+/*=========================================================================
+                                                                                
+Copyright (c) 2007, Los Alamos National Security, LLC
+
+All rights reserved.
+
+Copyright 2007. Los Alamos National Security, LLC. 
+This software was produced under U.S. Government contract DE-AC52-06NA25396 
+for Los Alamos National Laboratory (LANL), which is operated by 
+Los Alamos National Security, LLC for the U.S. Department of Energy. 
+The U.S. Government has rights to use, reproduce, and distribute this software. 
+NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,
+EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  
+If software is modified to produce derivative works, such modified software 
+should be clearly marked, so as not to confuse it with the version available 
+from LANL.
+ 
+Additionally, redistribution and use in source and binary forms, with or 
+without modification, are permitted provided that the following conditions 
+are met:
+-   Redistributions of source code must retain the above copyright notice, 
+    this list of conditions and the following disclaimer. 
+-   Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution. 
+-   Neither the name of Los Alamos National Security, LLC, Los Alamos National
+    Laboratory, LANL, the U.S. Government, nor the names of its contributors
+    may be used to endorse or promote products derived from this software 
+    without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR 
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+                                                                                
+=========================================================================*/
+
+// .NAME Timer - create timer for program execution
+//
+// .SECTION Description
+// The Timer class allows for easy timing of the program.  The timer
+// tracks real (clock) time elapsed, user time, and system time.
+
+#include "Timer.h"
+
+#define TIMEROFF	0
+#define TIMERON		1
+
+/************************************************************************/
+/*									*/
+/*			FUNCTION Timer					*/
+/*									*/
+/*  Constructor.  Sets the timer state to TIMEROFF and clears all       */
+/*  accumulated values.  Unless POOMA_TFLOP is defined, it also calls   */
+/*  sysconf to get the number of clock ticks per second for use with    */
+/*  times().  calibration is measured only on POOMA_T3E; everywhere     */
+/*  else it is set to 0.0.  Under __MWERKS__ the body is stubbed out.   */
+/*									*/
+/************************************************************************/
+
+Timer::Timer()
+{
+#ifdef __MWERKS__
+  // For now, stub out all Timer guts for MetroWerks
+#else
+
+#ifndef POOMA_TFLOP
+  cpu_speed = sysconf(_SC_CLK_TCK);
+#endif
+  timer_state = TIMEROFF;  // set before clear(), which inspects timer_state
+  clear();
+
+  // Calibration: difference between two back-to-back rtclock() reads
+#if defined(POOMA_T3E)
+  long start_time, end_time, total_time;
+  (void) rtclock();  // result discarded (presumably a warm-up read)
+  start_time = rtclock();
+  end_time = rtclock();
+  total_time = end_time - start_time;
+  calibration = tick_secs(total_time, cpu_speed);
+#else
+  // No other machines have calibration defined yet.
+  calibration = 0.0;
+#endif
+
+#endif // __MWERKS__
+}
+/*			END OF FUNCTION Timer				*/
+
+Timer::~Timer()
+{
+#ifdef __MWERKS__
+  // For now, stub out all Timer guts for MetroWerks
+#else
+  // The destructor's only action: flag a still-running timer as off.
+  // Check to see if the timer is on
+  if (timer_state == TIMERON)
+    {
+      //  Destroying a running Timer
+      // ERRORMSG(level2 << "TRIED TO DELETE A RUNNING TIMER!\n");
+      // ERRORMSG("STOPPING AND DELETING TIMER." << endl);
+      timer_state=TIMEROFF;
+    }
+
+#endif // __MWERKS__
+}
+
+/************************************************************************/
+/*									*/
+/*			FUNCTION clear					*/
+/*									*/
+/*  clear sets all of the accumulated times for this timer to 0.        */
+/*  It is intended to only be used on a stopped timer.  If it is used   */
+/*  on a running timer, the timer is stopped and all of its values are  */
+/*  cleared (the warning messages are currently commented out).         */
+/*									*/
+/************************************************************************/
+
+void Timer::clear()
+{
+#ifdef __MWERKS__
+  // For now, stub out all Timer guts for MetroWerks
+  return;
+#else
+  // Check to see if the timer is on
+  if (timer_state == TIMERON)
+    {
+      //  Clearing a running Timer
+      // ERRORMSG(level2 << "TRIED TO CLEAR A RUNNING TIMER!\n");
+      // ERRORMSG("SETTING ALL VALUES TO 0 AND STOPPING TIMER." << endl);
+      timer_state = TIMEROFF;
+    }
+
+  //  Set all of the accumulated values to 0
+#ifdef POOMA_TFLOP
+  current_clock = 0.0;
+#else
+  current_secs = 0;
+  current_usecs = 0;
+  current_user_time = 0;
+  current_system_time = 0;
+#endif // POOMA_TFLOP
+
+#endif // __MWERKS__
+}
+/*			END OF FUNCTION clear				*/
+
+/************************************************************************/
+/*									*/
+/*			FUNCTION start					*/
+/*									*/
+/*  start a Timer timing.  This will start adding time elapsed to       */
+/*  the current accumulated values of the timer.  Starting a timer      */
+/*  that is already running is a no-op (warning output is disabled).    */
+/*									*/
+/************************************************************************/
+
+void Timer::start()
+{
+#ifdef __MWERKS__
+  // For now, stub out all Timer guts for MetroWerks
+  return;
+#else
+  //  Check to see if the timer is already running
+  if (timer_state == TIMERON)
+    {
+      // ERRORMSG(level2 << "TRIED TO START A RUNNING TIMER!\n");
+      // ERRORMSG("CONTINUING UNCHANGED." << endl);
+      return;
+    }
+
+  //  Get the current time values from the system
+#if defined(POOMA_T3E)
+  last_secs = rtclock();
+  // Omit non-real times on T3E:
+  last_usecs = 0;
+  last_user_time = 0;
+  last_system_time = 0;
+#elif defined(POOMA_TFLOP)
+  last_clock = dclock();
+#else
+  gettimeofday(&tvbuf, &tzbuf);
+  times(&tmsbuf);
+  //  Set the starting values to the current time
+  last_secs = tvbuf.tv_sec;
+  last_usecs = tvbuf.tv_usec;
+  last_user_time = tmsbuf.tms_utime;
+  last_system_time = tmsbuf.tms_stime;
+#endif
+
+  //  Set the state of the Timer
+  timer_state = TIMERON;
+  return;
+#endif // __MWERKS__
+}
+/*			END OF FUNCTION start				*/
+
+/************************************************************************/
+/*									*/
+/*			FUNCTION stop					*/
+/*									*/
+/*  stop stops a Timer from accumulating time.  Stopping a Timer that   */
+/*  is already stopped is a no-op (warning output is disabled).         */
+/*									*/
+/************************************************************************/
+
+void Timer::stop()
+{
+#ifdef __MWERKS__
+  // For now, stub out all Timer guts for MetroWerks
+  return;
+#else
+  //  Check to see if the timer is already stopped
+  if (timer_state == TIMEROFF)
+    {
+      // ERRORMSG(level2 << "TRIED TO STOP A STOPPED TIMER!\n");
+      // ERRORMSG("CONTINUING UNCHANGED." << endl);
+      return;
+    }
+
+  //  Get the current time values from the system and accumulate
+#if defined(POOMA_T3E)
+  long end_time = rtclock();
+
+  current_secs +=  end_time - last_secs;
+  current_usecs += 0;
+  current_user_time += 0;
+  current_system_time += 0;
+#elif defined(POOMA_TFLOP)
+  double end_clock = dclock();
+  current_clock += end_clock - last_clock;
+#else
+  gettimeofday(&tvbuf, &tzbuf);
+  times(&tmsbuf);
+  //  The usec delta may be negative; clock_time() normalizes it later
+  current_secs += tvbuf.tv_sec - last_secs;
+  current_usecs += tvbuf.tv_usec - last_usecs;
+  current_user_time += tmsbuf.tms_utime - last_user_time;
+  current_system_time += tmsbuf.tms_stime - last_system_time;
+#endif
+
+  //  Set the state of the Timer
+  timer_state = TIMEROFF;
+  return;
+#endif // __MWERKS__
+}
+/*			END OF FUNCTION stop				*/
+
+/************************************************************************/
+/*									*/
+/*			FUNCTION clock_time				*/
+/*									*/
+/*	clock_time returns the current amount of real (clock) time	*/
+/*  accumulated by this timer.  If the timer is stopped, this is just	*/
+/*  the total accumulated time.  If the timer is running, this is the	*/
+/*  accumulated time plus the time since the timer was last started.	*/
+/*  The result is in seconds; under __MWERKS__ it is always 0.0.        */
+/************************************************************************/
+
+double Timer::clock_time()
+{
+#ifdef __MWERKS__
+  // For now, stub out all Timer guts for MetroWerks
+  return 0.0;
+#else
+
+#if !defined(POOMA_TFLOP)
+  long seconds;	    // seconds elapsed
+  
+#if !defined(POOMA_T3E)
+  long useconds;    // useconds (micro-seconds) elapsed
+#endif
+
+#endif
+
+  double ret_val;    // time elapsed
+
+  if (timer_state == TIMEROFF)
+    {
+      // Timer is currently off, so just return accumulated time
+#if !defined(POOMA_TFLOP)
+      seconds = current_secs;
+      
+#if !defined(POOMA_T3E)
+      useconds = current_usecs;
+#endif
+
+#else
+      ret_val = current_clock;
+#endif
+    }
+  else
+    {
+      // Timer is currently running, so add the elapsed
+      // time since the timer was last started to the
+      // accumulated time
+#if defined(POOMA_T3E)
+      long end_time = rtclock();
+      seconds = current_secs + end_time - last_secs;
+#elif defined(POOMA_TFLOP)
+      double end_clock = dclock();
+      ret_val = current_clock + end_clock - last_clock;
+#else
+      gettimeofday(&tvbuf, &tzbuf);
+
+      seconds = current_secs + tvbuf.tv_sec - last_secs;
+      useconds = current_usecs + tvbuf.tv_usec - last_usecs;
+#endif
+    }
+
+  //  Convert into floating point number of seconds
+#if defined(POOMA_T3E)
+  ret_val = tick_secs(seconds, cpu_speed);
+#elif defined(POOMA_TFLOP)
+  // no need to convert
+#else
+  //  Adjust for the fact that the useconds may be negative.
+  //  If they are, take away 1 second and add 1 million
+  //  microseconds until they are positive
+  while (useconds < 0)
+    {
+      useconds = useconds + 1000000;
+      seconds = seconds - 1;
+    }
+  //  Combine in double: 1000000 * seconds can overflow a 32-bit long
+  double total_usecs = (1000000.0 * (double) seconds) + (double) useconds;
+  ret_val = total_usecs / 1000000.0;
+#endif
+
+  return ret_val;
+  
+#endif // __MWERKS__
+}
+/*			END OF FUNCTION clock_time			*/
+
+/************************************************************************/
+/*									*/
+/*			FUNCTION user_time				*/
+/*									*/
+/*	user_time reports the current amount of user cpu time           */
+/*   accumulated by this Timer.  If the timer is currently off, 	*/
+/*   this is just the accumulated time.  If the Timer is running, this  */
+/*   is the accumulated time plus the time since the timer was last     */
+/*   started.  Returns -9999.0 on POOMA_T3E / POOMA_TFLOP builds, where */
+/*   it is not implemented, and 0.0 under __MWERKS__.                   */
+/************************************************************************/
+
+double Timer::user_time()
+{
+#ifdef __MWERKS__
+// For now, stub out all Timer guts for MetroWerks
+  return 0.0;
+#else
+  double ret_val;		//  Return value	
+
+#if ( defined(POOMA_T3E) || defined(POOMA_TFLOP) )
+  // Not defined yet on T3E or TFLOP.
+  // ERRORMSG("user_time() not defined." << endl);
+  ret_val = -9999.0;
+#else
+  if (timer_state == TIMEROFF)
+    {
+      //  Timer is off, just return accumulated time
+      ret_val = current_user_time;
+    }
+  else
+    {
+      //  Timer is on, add current running time to accumulated time
+      times(&tmsbuf);
+      ret_val = current_user_time + tmsbuf.tms_utime - last_user_time;
+    }
+
+  //  Convert from clock ticks to seconds using the
+  //  cpu_speed value obtained by the constructor
+  ret_val = ret_val / cpu_speed;
+#endif
+
+  return ret_val;
+#endif // __MWERKS__
+}
+/*			END OF FUNCTION user_time			*/
+
+/************************************************************************/
+/*									*/
+/*			FUNCTION system_time				*/
+/*									*/
+/*	system_time reports the current amount of system cpu time       */
+/*   accumulated by this Timer.  If the timer is currently off, 	*/
+/*   this is just the accumulated time.  If the Timer is running, this  */
+/*   is the accumulated time plus the time since the timer was last     */
+/*   started.  Returns -9999.0 on POOMA_T3E / POOMA_TFLOP builds, where */
+/*   it is not implemented, and 0.0 under __MWERKS__.                   */
+/************************************************************************/
+
+double Timer::system_time()
+{
+#ifdef __MWERKS__
+// For now, stub out all Timer guts for MetroWerks
+  return 0.0;
+#else
+  double ret_val;		//  Return value
+
+#if ( defined(POOMA_T3E) || defined(POOMA_TFLOP) )
+  // Not defined yet on T3E or TFLOP.
+  // ERRORMSG("system_time() not defined." << endl);
+  ret_val = -9999.0;
+#else
+  if (timer_state == TIMEROFF)
+    {
+      //  Timer is off, just return accumulated time
+      ret_val = current_system_time;
+    }
+  else
+    {
+      //  Timer is on, return accumulated plus current
+      times(&tmsbuf);
+      ret_val = current_system_time + tmsbuf.tms_stime - last_system_time;
+    }
+
+  //  Convert from clock ticks to seconds using the
+  //  cpu_speed value obtained by the constructor
+  ret_val = ret_val / cpu_speed;
+#endif
+
+  return ret_val;
+#endif // __MWERKS__
+}
diff --git a/src/halo-finder/src/Timer.h b/src/halo-finder/src/Timer.h
new file mode 100644
index 0000000..0099ae3
--- /dev/null
+++ b/src/halo-finder/src/Timer.h
@@ -0,0 +1,114 @@
+/*=========================================================================
+                                                                                
+Copyright (c) 2007, Los Alamos National Security, LLC
+
+All rights reserved.
+
+Copyright 2007. Los Alamos National Security, LLC. 
+This software was produced under U.S. Government contract DE-AC52-06NA25396 
+for Los Alamos National Laboratory (LANL), which is operated by 
+Los Alamos National Security, LLC for the U.S. Department of Energy. 
+The U.S. Government has rights to use, reproduce, and distribute this software. 
+NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,
+EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  
+If software is modified to produce derivative works, such modified software 
+should be clearly marked, so as not to confuse it with the version available 
+from LANL.
+ 
+Additionally, redistribution and use in source and binary forms, with or 
+without modification, are permitted provided that the following conditions 
+are met:
+-   Redistributions of source code must retain the above copyright notice, 
+    this list of conditions and the following disclaimer. 
+-   Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution. 
+-   Neither the name of Los Alamos National Security, LLC, Los Alamos National
+    Laboratory, LANL, the U.S. Government, nor the names of its contributors
+    may be used to endorse or promote products derived from this software 
+    without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR 
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+                                                                                
+=========================================================================*/
+
+// .NAME Timer - create timer for program execution
+//
+// .SECTION Description
+// The Timer class allows for easy timing of the program.  The timer
+// tracks real (clock) time elapsed, user time, and system time.
+
+#ifndef TIMER_H
+#define TIMER_H
+
+#ifdef __sgi
+// make sure this is defined for BSD time routines
+#define _BSD_TYPES
+// fix a glitch in ANSI compatibility with SGI headers
+#define _STAMP_T
+#endif
+
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/times.h>
+#include <sys/time.h>
+
+#ifdef __sgi
+// fix a glitch in ANSI compatibility with SGI headers
+#undef _STAMP_T
+#endif
+
+
+class Timer
+{
+public:
+  Timer();			// Constructor: timer starts off and cleared
+  ~Timer();                     // Destructor
+  void clear();			// Set all accumulated times to 0
+  void start();			// Start timer
+  void stop();			// Stop timer
+
+  double clock_time();		// Report clock time accumulated in seconds
+  double user_time();		// Report user time accumulated in seconds
+  double system_time();		// Report system time accumulated in seconds
+  double cpu_time()
+  {
+    // Report total cpu_time which is just user_time + system_time
+    return ( user_time() + system_time() );
+  }		
+
+  double calibration;		// Calibration time: time it takes to
+                                // get in and out of timer functions
+private:
+  short timer_state;		// State of timer, either on or off
+  long cpu_speed;		  // Clock ticks per second, for times() values
+
+  unsigned long last_secs;	  // Clock seconds value when the
+				  // timer was last started
+  long last_usecs;		  // Clock useconds value when the
+				  // timer was last started
+  unsigned long last_user_time;   // User time (ticks) at last start
+  unsigned long last_system_time; // System time (ticks) at last start
+  // NOTE(review): Timer.cxx's POOMA_TFLOP paths use last_clock/current_clock, not declared here
+  long current_secs;		// Current accumulated clock seconds
+  long current_usecs;		// Current accumulated clock useconds
+  long current_user_time;	// Current accumulated user time (ticks)
+  long current_system_time;	// Current accumulated system time (ticks)
+
+  struct tms tmsbuf;	        //  Values from call to times
+  struct timeval tvbuf;	        //  Values from call to gettimeofday
+  struct timezone tzbuf;        //  Timezone values from gettimeofday
+	  		        //  These values aren't used for anything
+};
+
+#endif
diff --git a/src/halo-finder/src/Timings.cxx b/src/halo-finder/src/Timings.cxx
new file mode 100644
index 0000000..69e9b85
--- /dev/null
+++ b/src/halo-finder/src/Timings.cxx
@@ -0,0 +1,212 @@
+/*=========================================================================
+                                                                                
+Copyright (c) 2007, Los Alamos National Security, LLC
+
+All rights reserved.
+
+Copyright 2007. Los Alamos National Security, LLC. 
+This software was produced under U.S. Government contract DE-AC52-06NA25396 
+for Los Alamos National Laboratory (LANL), which is operated by 
+Los Alamos National Security, LLC for the U.S. Department of Energy. 
+The U.S. Government has rights to use, reproduce, and distribute this software. 
+NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,
+EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  
+If software is modified to produce derivative works, such modified software 
+should be clearly marked, so as not to confuse it with the version available 
+from LANL.
+ 
+Additionally, redistribution and use in source and binary forms, with or 
+without modification, are permitted provided that the following conditions 
+are met:
+-   Redistributions of source code must retain the above copyright notice, 
+    this list of conditions and the following disclaimer. 
+-   Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution. 
+-   Neither the name of Los Alamos National Security, LLC, Los Alamos National
+    Laboratory, LANL, the U.S. Government, nor the names of its contributors
+    may be used to endorse or promote products derived from this software 
+    without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR 
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+                                                                                
+=========================================================================*/
+
+#include "Timings.h"
+#include <iostream>
+#include <rru_mpi.h>
+
+#include <cstring>
+using namespace std;
+
+// .NAME Timings - registry of named timers for program execution
+//
+// .SECTION Description
+// The Timings class keeps a static list of named timers, looked up by
+// name or index, and prints their wall and CPU times (reduced over MPI).
+
+// static data members of Timings class: timers by index, and by name
+Timings::TimerList_t Timings::TimerList;
+Timings::TimerMap_t  Timings::TimerMap;
+
+
+//////////////////////////////////////////////////////////////////////
+// default constructor: empty body, no per-instance setup is done here
+Timings::Timings() { }
+
+
+//////////////////////////////////////////////////////////////////////
+// destructor: empty body, the static timer containers are not touched
+Timings::~Timings() { }
+
+
+//////////////////////////////////////////////////////////////////////
+// look up the timer named nm, creating it on first use; returns its index
+Timings::TimerRef Timings::getTimer(const char *nm) {
+  const string key(nm);
+  TimerMap_t::iterator found = TimerMap.find(key);
+  // Known name: hand back the index recorded when it was first created.
+  if (found != TimerMap.end())
+    return found->second->indx;
+
+  // New name: the next free slot in TimerList becomes its index.
+  TimerInfo *created = new TimerInfo;
+  created->indx = TimerList.size();
+  created->name = key;
+  TimerMap.insert(TimerMap_t::value_type(key,created));
+  TimerList.push_back(my_auto_ptr<TimerInfo>(created));
+  return created->indx;
+}
+
+
+//////////////////////////////////////////////////////////////////////
+// start the timer with index t; an out-of-range index is ignored
+void Timings::startTimer(TimerRef t) {
+  const bool valid = (t >= 0) && (t < (int) TimerList.size());
+  if (valid)
+    TimerList[t]->start();
+}
+
+
+//////////////////////////////////////////////////////////////////////
+// stop the timer with index t and accumulate its values; bad index: no-op
+void Timings::stopTimer(TimerRef t) {
+  const int count = (int) TimerList.size();
+  if (t >= 0 && t < count)
+    TimerList[t]->stop();
+}
+
+
+//////////////////////////////////////////////////////////////////////
+// clear the timer with index t (turn it off, discard its time); bad index: no-op
+void Timings::clearTimer(TimerRef t) {
+  const int limit = (int) TimerList.size();
+  if (0 <= t && t < limit)
+    TimerList[t]->clear();
+}
+
+
+//////////////////////////////////////////////////////////////////////
+// print out the timing results; only rank 0 writes to cout
+void Timings::print() {
+  int i,j;
+  if (TimerList.size() < 1)
+    return;
+  // Defaults describe a single-process run; the MPI calls overwrite them.
+  int nodes = 1, rank = 0;
+#ifndef USE_SERIAL_COSMO
+  MPI_Comm_size(MPI_COMM_WORLD, &nodes);
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+#endif
+
+  // timer 0 is reported as one total: the maximum over all ranks
+  if (rank == 0) {
+    cout << "-----------------------------------------------------------------";
+    cout << endl;
+    cout << "     Timing results for " << nodes << " nodes:" << endl;
+    cout << "-----------------------------------------------------------------";
+    cout << endl;
+  }
+  for (i=0; i<1; ++i){
+    TimerInfo *tptr = TimerList[i].get();
+    double walltotal = tptr->wallTime, cputotal = tptr->cpuTime;
+    // local values above are what a serial build prints; MPI replaces them
+#ifndef USE_SERIAL_COSMO
+    MPI_Reduce(&tptr->wallTime, &walltotal, 1, MPI_DOUBLE, MPI_MAX, 0,
+        MPI_COMM_WORLD);
+    MPI_Reduce(&tptr->cpuTime, &cputotal, 1, MPI_DOUBLE, MPI_MAX, 0,
+        MPI_COMM_WORLD);
+#endif
+
+    if (rank == 0) {
+      cout << tptr->name.c_str() << " ";
+      for (j=strlen(tptr->name.c_str()); j < 20; ++j)
+        cout << ".";
+      cout << " Wall tot = ";
+      cout.width(10);
+      cout << walltotal << ", CPU tot = ";
+      cout.width(10);
+      cout << cputotal << endl << endl;
+    }
+  }
+  // every other timer is reported as max / avg / min over all ranks
+  for (i=1; i < (int) TimerList.size(); ++i) {
+    TimerInfo *tptr = TimerList[i].get();
+    double wallmax = tptr->wallTime, wallmin = wallmax, wallavg = wallmax;
+    double cpumax = tptr->cpuTime, cpumin = cpumax, cpuavg = cpumax;
+    // wallavg/cpuavg receive the sum over ranks; divided by nodes below
+#ifndef USE_SERIAL_COSMO
+    MPI_Reduce(&tptr->wallTime, &wallmax, 1, MPI_DOUBLE, MPI_MAX, 0,
+        MPI_COMM_WORLD);
+    MPI_Reduce(&tptr->cpuTime, &cpumax, 1, MPI_DOUBLE, MPI_MAX, 0,
+        MPI_COMM_WORLD);
+    MPI_Reduce(&tptr->wallTime, &wallmin, 1, MPI_DOUBLE, MPI_MIN, 0,
+        MPI_COMM_WORLD);
+    MPI_Reduce(&tptr->cpuTime, &cpumin, 1, MPI_DOUBLE, MPI_MIN, 0,
+        MPI_COMM_WORLD);
+    MPI_Reduce(&tptr->wallTime, &wallavg, 1, MPI_DOUBLE, MPI_SUM, 0,
+        MPI_COMM_WORLD);
+    MPI_Reduce(&tptr->cpuTime, &cpuavg, 1, MPI_DOUBLE, MPI_SUM, 0,
+        MPI_COMM_WORLD);
+#endif
+
+    if (rank == 0) {
+      cout << tptr->name.c_str() << " ";
+      for (j=strlen(tptr->name.c_str()); j < 20; ++j)
+        cout << ".";
+      cout << " Wall max = ";
+      cout.width(10);
+      cout << wallmax << ", CPU max = ";
+      cout.width(10);
+      cout << cpumax << endl;
+      for (j = 0; j < 21; ++j)
+        cout << " ";
+      cout << " Wall avg = ";
+      cout.width(10);
+      cout << wallavg / nodes << ", CPU avg = ";
+      cout.width(10);
+      cout << cpuavg / nodes << endl;
+      for (j = 0; j < 21; ++j)
+        cout << " ";
+      cout << " Wall min = ";
+      cout.width(10);
+      cout << wallmin << ", CPU min = ";
+      cout.width(10);
+      cout << cpumin << endl << endl;
+    }
+  }
+  if (rank == 0) {
+    cout << "-----------------------------------------------------------------";
+    cout << endl;
+  }
+}
diff --git a/src/halo-finder/src/Timings.h b/src/halo-finder/src/Timings.h
new file mode 100644
index 0000000..74661ea
--- /dev/null
+++ b/src/halo-finder/src/Timings.h
@@ -0,0 +1,240 @@
+/*=========================================================================
+                                                                                
+Copyright (c) 2007, Los Alamos National Security, LLC
+
+All rights reserved.
+
+Copyright 2007. Los Alamos National Security, LLC. 
+This software was produced under U.S. Government contract DE-AC52-06NA25396 
+for Los Alamos National Laboratory (LANL), which is operated by 
+Los Alamos National Security, LLC for the U.S. Department of Energy. 
+The U.S. Government has rights to use, reproduce, and distribute this software. 
+NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,
+EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  
+If software is modified to produce derivative works, such modified software 
+should be clearly marked, so as not to confuse it with the version available 
+from LANL.
+ 
+Additionally, redistribution and use in source and binary forms, with or 
+without modification, are permitted provided that the following conditions 
+are met:
+-   Redistributions of source code must retain the above copyright notice, 
+    this list of conditions and the following disclaimer. 
+-   Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution. 
+-   Neither the name of Los Alamos National Security, LLC, Los Alamos National
+    Laboratory, LANL, the U.S. Government, nor the names of its contributors
+    may be used to endorse or promote products derived from this software 
+    without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR 
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+                                                                                
+=========================================================================*/
+
+// .NAME Timer - create timer for program execution
+//
+// .SECTION Description
+// The Timer class allows for easy timing of the program.  The timer
+// tracks real (clock) time elapsed, user time, and system time.
+
+#ifndef TIMINGS_H
+#define TIMINGS_H
+
+/*************************************************************************
+ * Timings - a simple singleton class which lets the user create and
+ *   manage timers that can be printed out at the end of the program.
+ *
+ * General usage
+ *  1) create a timer:
+ *     Timings::TimerRef val = Timings::getTimer("timer name");
+ *  This will either create a new one, or return a ref to an existing one
+ *
+ *  2) start a timer:
+ *     Timings::startTimer(val);
+ *  This will start the referenced timer running.  If it is already running,
+ *  it will not change anything.
+ *
+ *  3) stop a timer:
+ *     Timings::stopTimer(val);
+ *  This will stop the timer, assuming it was running, and add in the
+ *  time to the accumulating time for that timer.
+ *
+ *  4) print out the results:
+ *     Timings::print();
+ *
+ *************************************************************************/
+
+#include "Timer.h"
+
+using namespace std;
+
+//////////////////////////////////////////////////////////////////////
+/*
+  A simple compliant implementation of auto_ptr.
+  This is from Greg Colvin's implementation posted to comp.std.c++.
+
+  Instead of using mutable this casts away const in release.
+
+  We have to do this because we can't build containers of these
+  things otherwise.
+  */
+//////////////////////////////////////////////////////////////////////
+
+// Owning pointer with auto_ptr-like semantics: "copying" one of these moves
+// ownership of the pointee to the destination and empties the source, even
+// though the source is taken by const reference (release() casts const away).
+template<class X>
+class my_auto_ptr
+{
+  X* px;  // the owned object, or 0 when this pointer is empty
+public:
+  my_auto_ptr() : px(0) {}
+  my_auto_ptr(X* p) : px(p) {}
+
+  // Copy construction steals the pointee from r, leaving r empty.
+  my_auto_ptr(const my_auto_ptr<X>& r) : px(r.release()) {}
+
+  // Assignment deletes the current pointee and then steals r's pointee.
+  // Self-assignment is a no-op.
+  my_auto_ptr& operator=(const my_auto_ptr<X>& r)
+  {
+    if (this == &r)
+      return *this;
+    delete px;
+    px = r.release();
+    return *this;
+  }
+
+  ~my_auto_ptr() { delete px; }
+
+  X* get()        const { return px; }
+  X* operator->() const { return px; }
+  X& operator*()  const { return *px; }
+
+  // Give up ownership: return the raw pointer and leave this object empty.
+  X* release() const
+  {
+    X* const held = px;
+    const_cast<my_auto_ptr<X>*>(this)->px = 0;
+    return held;
+  }
+};
+
+#include <string>
+using std::string;
+#include <vector>
+using std::vector;
+#include <map>
+using std::map;
+
+// a simple class used to store timer values
+class TimerInfo
+{
+public:
+  // typedef for reference to a timer
+  typedef int TimerRef;
+
+  // constructor
+  TimerInfo() : name(""), cpuTime(0.0), wallTime(0.0), indx(-1) {
+    clear();
+  }
+
+  // destructor
+  ~TimerInfo() { }
+
+  // timer operations
+  void start() {
+    if (!running) {
+      running = true;
+      t.stop();
+      t.clear();
+      t.start();
+    }
+  }
+
+  void stop() {
+    if (running) {
+      t.stop();
+      running = false;
+      cpuTime += t.cpu_time();
+      wallTime += t.clock_time();
+    }
+  }
+
+  void clear() {
+    t.stop();
+    t.clear();
+    running = false;
+  }
+
+  // the POOMA timer that this object manages
+  Timer t;
+
+  // the name of this timer
+  string name;
+
+  // the accumulated time
+  double cpuTime;
+  double wallTime;
+
+  // is the timer turned on right now?
+  bool running;
+
+  // an index value for this timer
+  TimerRef indx;
+};
+
+
+
+// Static registry of named timers.  Timers are created or looked up by name
+// with getTimer(), driven through the returned TimerRef handle, and reported
+// with print().
+class Timings
+{
+public:
+  // typedef for reference to a timer
+  typedef int TimerRef;
+
+  // a typedef for the timer information object
+  typedef TimerInfo TimerInfo_t;
+
+public:
+  // Default constructor
+  Timings();
+
+  // Destructor - clear out the existing timers
+  ~Timings();
+
+  //
+  // timer manipulation methods
+  //
+
+  // create a timer, or get one that already exists
+  static TimerRef getTimer(const char *);
+
+  // start a timer
+  static void startTimer(TimerRef);
+
+  // stop a timer, and accumulate its values
+  static void stopTimer(TimerRef);
+
+  // clear a timer, by turning it off and throwing away its time
+  static void clearTimer(TimerRef);
+
+  // Return the TimerInfo registered under the name nm, or a null pointer if
+  // no such timer exists.  Uses find() rather than operator[] so that asking
+  // about an unknown name does not insert a null entry into the map.
+  static TimerInfo_t *infoTimer(const char *nm) {
+    TimerMap_t::const_iterator loc = TimerMap.find(string(nm));
+    return (loc == TimerMap.end()) ? 0 : loc->second;
+  }
+
+  //
+  // I/O methods
+  //
+
+  // print the results to standard out
+  static void print();
+
+private:
+  // type of storage for list of TimerInfo
+  typedef vector<my_auto_ptr<TimerInfo> > TimerList_t;
+  typedef map<string, TimerInfo *> TimerMap_t;
+
+  // a list of timer info structs
+  static TimerList_t TimerList;
+
+  // a map of timers, keyed by string
+  static TimerMap_t TimerMap;
+};
+
+#endif
diff --git a/src/halo-finder/src/bigchunk.c b/src/halo-finder/src/bigchunk.c
new file mode 100644
index 0000000..fd886bd
--- /dev/null
+++ b/src/halo-finder/src/bigchunk.c
@@ -0,0 +1,133 @@
+/*
+ * Copyright (C) 2011 UChicago Argonne, LLC
+ * All Rights Reserved
+ *
+ * Permission to use, reproduce, prepare derivative works, and to redistribute
+ * to others this software, derivatives of this software, and future versions
+ * of this software as well as its documentation is hereby granted, provided
+ * that this notice is retained thereon and on all copies or modifications.
+ * This permission is perpetual, world-wide, and provided on a royalty-free
+ * basis. UChicago Argonne, LLC and all other contributors make no
+ * representations as to the suitability and operability of this software for
+ * any purpose. It is provided "as is" without express or implied warranty. 
+ *
+ * Portions of this software are copyright by UChicago Argonne, LLC. Argonne
+ * National Laboratory with facilities in the state of Illinois, is owned by
+ * The United States Government, and operated by UChicago Argonne, LLC under
+ * provision of a contract with the Department of Energy. 
+ *
+ * PORTIONS OF THIS SOFTWARE  WERE PREPARED AS AN ACCOUNT OF WORK SPONSORED BY
+ * AN AGENCY OF THE UNITED STATES GOVERNMENT. NEITHER THE UNITED STATES
+ * GOVERNMENT NOR ANY AGENCY THEREOF, NOR THE UNIVERSITY OF CHICAGO, NOR ANY OF
+ * THEIR EMPLOYEES OR OFFICERS, MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
+ * ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
+ * COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT, OR
+ * PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE PRIVATELY
+ * OWNED RIGHTS. REFERENCE HEREIN TO ANY SPECIFIC COMMERCIAL PRODUCT, PROCESS,
+ * OR SERVICE BY TRADE NAME, TRADEMARK, MANUFACTURER, OR OTHERWISE, DOES NOT
+ * NECESSARILY CONSTITUTE OR IMPLY ITS ENDORSEMENT, RECOMMENDATION, OR FAVORING
+ * BY THE UNITED STATES GOVERNMENT OR ANY AGENCY THEREOF. THE VIEW AND OPINIONS
+ * OF AUTHORS EXPRESSED HEREIN DO NOT NECESSARILY STATE OR REFLECT THOSE OF THE
+ * UNITED STATES GOVERNMENT OR ANY AGENCY THEREOF. 
+ *
+ * Author: Hal Finkel <hfinkel@anl.gov>
+ */
+
+#include "bigchunk.h"
+#include <stdio.h>
+
+/* Base address of the big chunk; null when no chunk is allocated. */
+static void *_bigchunk_ptr = (void *) 0;
+/* Offset (within the chunk) of the most recent chunk allocation;
+   (size_t) -1 means no allocation is currently recorded. */
+static size_t _bigchunk_last_alloc = (size_t) -1;
+/* Capacity of the big chunk in bytes. */
+static size_t _bigchunk_sz = 0;
+/* Number of bytes of the chunk handed out so far (offset of the next free byte). */
+static size_t _bigchunk_used = 0;
+/* Running sum of the (rounded) sizes of all successful allocations. */
+static size_t _bigchunk_total = 0;
+static const size_t min_alloc = 32; /* for alignment; must be 2^n */
+
+/*
+ * Allocate sz bytes, preferring the preallocated big chunk.
+ *
+ * The request is rounded up to a multiple of min_alloc (and to at least
+ * min_alloc).  It is then served, in order of preference, by:
+ *   1. carving it from the unused tail of the big chunk;
+ *   2. growing the big chunk with realloc(), if the chunk exists and
+ *      nothing is currently allocated from it;
+ *   3. plain malloc().
+ *
+ * Returns a null pointer if the rounded size would overflow size_t or if
+ * the fallback malloc() fails.  Release the result with bigchunk_free().
+ */
+void *bigchunk_malloc(size_t sz)
+{
+	if (sz < min_alloc) {
+		sz = min_alloc;
+	} else {
+		/* Rounding up must not wrap around: a wrapped size would hand
+		   back a block far smaller than the caller asked for. */
+		if (sz > (size_t) -1 - (min_alloc - 1))
+			return NULL;
+		sz = (sz + (min_alloc - 1)) & ~(min_alloc - 1);
+	}
+
+	if (_bigchunk_sz - _bigchunk_used >= sz) {
+		/* this fits in the big chunk */
+		void *r = (char *)_bigchunk_ptr + _bigchunk_used;
+		_bigchunk_last_alloc = _bigchunk_used;
+		_bigchunk_used += sz;
+		_bigchunk_total += sz;
+		return r;
+	} else if (_bigchunk_used == 0 && _bigchunk_sz > 0) {
+		/* this is larger than the big chunk, but nothing
+		   is currently using the big chunk, so just make
+		   the big chunk bigger.
+		*/
+
+		void *new_chunk = realloc(_bigchunk_ptr, sz);
+		if (new_chunk) {
+			_bigchunk_ptr = new_chunk;
+			_bigchunk_last_alloc = 0;
+			_bigchunk_sz = sz;
+			_bigchunk_used = sz;
+			_bigchunk_total += sz;
+			return _bigchunk_ptr;
+		}
+		/* realloc failed: the old chunk is still valid, fall through */
+	}
+
+	void *ptr = malloc(sz);
+	if (ptr) _bigchunk_total += sz;
+	return ptr;
+}
+
+/*
+ * Release memory obtained from bigchunk_malloc().
+ *
+ * Pointers outside the big chunk are passed to free().  A pointer inside
+ * the chunk is reclaimed only when it is the most recent chunk allocation
+ * (the chunk is rewound to where that allocation began); any other pointer
+ * inside the chunk is left alone.
+ */
+void bigchunk_free(void *ptr)
+{
+	char *chunk_begin = (char *)_bigchunk_ptr;
+	char *chunk_end = chunk_begin + _bigchunk_sz;
+
+	if ((char *)ptr < chunk_begin || (char *)ptr >= chunk_end) {
+		/* not ours: this came from the system allocator */
+		free(ptr);
+		return;
+	}
+
+	if (_bigchunk_last_alloc == (size_t) -1)
+		return;
+	if ((char *)ptr != chunk_begin + _bigchunk_last_alloc)
+		return;
+
+	/* this is the last allocation, so undo it */
+	_bigchunk_used = _bigchunk_last_alloc;
+	_bigchunk_last_alloc = (size_t) -1;
+}
+
+/*
+ * Mark the whole big chunk as unused and zero the allocation total.
+ * The chunk itself (pointer and size) is kept.
+ */
+void bigchunk_reset()
+{
+	_bigchunk_last_alloc = (size_t) -1;
+	_bigchunk_total = 0;
+	_bigchunk_used = 0;
+}
+
+/*
+ * Allocate the big chunk with a capacity of sz bytes.
+ *
+ * Any chunk left over from an earlier call is released first, so calling
+ * this twice does not leak.  If the allocation fails the allocator is left
+ * with no chunk (size 0), so that bigchunk_malloc() falls back to malloc()
+ * instead of carving from a null base pointer.
+ */
+void bigchunk_init(size_t sz)
+{
+	free(_bigchunk_ptr);
+
+	_bigchunk_ptr = malloc(sz);
+	_bigchunk_sz = _bigchunk_ptr ? sz : 0;
+	_bigchunk_used = 0;
+	_bigchunk_last_alloc = (size_t) -1;
+}
+
+/*
+ * Release the big chunk and return all bookkeeping to its initial state.
+ */
+void bigchunk_cleanup()
+{
+	free(_bigchunk_ptr);
+	_bigchunk_ptr = 0;
+
+	_bigchunk_last_alloc = (size_t) -1;
+	_bigchunk_total = 0;
+	_bigchunk_used = 0;
+	_bigchunk_sz = 0;
+}
+
+/* Report the current capacity of the big chunk in bytes. */
+size_t bigchunk_get_size()
+{
+	const size_t chunk_bytes = _bigchunk_sz;
+
+	return chunk_bytes;
+}
+
+
+/* Report the sum of all allocation sizes recorded since the last reset. */
+size_t bigchunk_get_total()
+{
+	const size_t total_bytes = _bigchunk_total;
+
+	return total_bytes;
+}
+
diff --git a/src/halo-finder/src/bigchunk.h b/src/halo-finder/src/bigchunk.h
new file mode 100644
index 0000000..e144f49
--- /dev/null
+++ b/src/halo-finder/src/bigchunk.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (C) 2011 UChicago Argonne, LLC
+ * All Rights Reserved
+ *
+ * Permission to use, reproduce, prepare derivative works, and to redistribute
+ * to others this software, derivatives of this software, and future versions
+ * of this software as well as its documentation is hereby granted, provided
+ * that this notice is retained thereon and on all copies or modifications.
+ * This permission is perpetual, world-wide, and provided on a royalty-free
+ * basis. UChicago Argonne, LLC and all other contributors make no
+ * representations as to the suitability and operability of this software for
+ * any purpose. It is provided "as is" without express or implied warranty. 
+ *
+ * Portions of this software are copyright by UChicago Argonne, LLC. Argonne
+ * National Laboratory with facilities in the state of Illinois, is owned by
+ * The United States Government, and operated by UChicago Argonne, LLC under
+ * provision of a contract with the Department of Energy. 
+ *
+ * PORTIONS OF THIS SOFTWARE  WERE PREPARED AS AN ACCOUNT OF WORK SPONSORED BY
+ * AN AGENCY OF THE UNITED STATES GOVERNMENT. NEITHER THE UNITED STATES
+ * GOVERNMENT NOR ANY AGENCY THEREOF, NOR THE UNIVERSITY OF CHICAGO, NOR ANY OF
+ * THEIR EMPLOYEES OR OFFICERS, MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
+ * ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
+ * COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT, OR
+ * PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE PRIVATELY
+ * OWNED RIGHTS. REFERENCE HEREIN TO ANY SPECIFIC COMMERCIAL PRODUCT, PROCESS,
+ * OR SERVICE BY TRADE NAME, TRADEMARK, MANUFACTURER, OR OTHERWISE, DOES NOT
+ * NECESSARILY CONSTITUTE OR IMPLY ITS ENDORSEMENT, RECOMMENDATION, OR FAVORING
+ * BY THE UNITED STATES GOVERNMENT OR ANY AGENCY THEREOF. THE VIEW AND OPINIONS
+ * OF AUTHORS EXPRESSED HEREIN DO NOT NECESSARILY STATE OR REFLECT THOSE OF THE
+ * UNITED STATES GOVERNMENT OR ANY AGENCY THEREOF. 
+ *
+ * Author: Hal Finkel <hfinkel@anl.gov>
+ */
+
+#ifndef BIGCHUNK_H
+#define BIGCHUNK_H
+
+#include <stddef.h>
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Allocates memory from the big chunk, falling back to the system's allocator.
+ */
+
+void *bigchunk_malloc(size_t sz);
+
+/*
+ * Frees memory. Memory that comes from the big chunk is only reclaimed when
+ * it was the most recent allocation from the chunk; otherwise this does nothing.
+ */
+
+void bigchunk_free(void *ptr);
+
+/*
+ * Resets the state of the big chunk (marks all memory in the chunk as free).
+ */
+
+void bigchunk_reset();
+
+/*
+ * Initialize the big chunk (to the specified size).
+ */
+
+void bigchunk_init(size_t sz);
+
+/*
+ * Free the big chunk (all memory within it should already be free).
+ */
+
+void bigchunk_cleanup();
+
+/*
+ * Get the size of the big chunk.
+ */
+
+size_t bigchunk_get_size();
+
+/*
+ * Get the total amount of memory allocated
+ * (freed memory is not subtracted, so this measures the total of all allocation
+ * requests in between calls to bigchunk_reset()).
+ */
+
+size_t bigchunk_get_total();
+
+#ifdef __cplusplus
+}
+
+/*
+ * Standard-library style allocator that obtains storage through
+ * bigchunk_malloc() and returns it through bigchunk_free().  The allocator
+ * is stateless: every instance uses the same process-wide big chunk.
+ *
+ * NOTE(review): allocate() returns a null pointer on failure instead of
+ * throwing std::bad_alloc as standard containers expect - confirm whether
+ * callers rely on this before changing it.
+ */
+template <typename T>
+class bigchunk_allocator
+{
+public:
+  typedef T value_type;
+  typedef T *pointer;
+  typedef T &reference;
+  typedef const T *const_pointer;
+  typedef const T &const_reference;
+  typedef size_t size_type;
+  typedef ptrdiff_t difference_type;
+
+  // Obtain the matching allocator type for a different element type.
+  template <typename U>
+  struct rebind {
+    typedef bigchunk_allocator<U> other;
+  };
+
+public:
+  bigchunk_allocator() throw() {}
+  bigchunk_allocator(const bigchunk_allocator&) throw() {}
+
+  template <typename U>
+  bigchunk_allocator(const bigchunk_allocator<U>&) throw() {}
+
+public:
+  ~bigchunk_allocator() throw () {}
+
+public:
+  pointer address(reference x) const { return &x; }
+  const_pointer address (const_reference x) const { return &x; }
+
+  // Largest element count whose byte size still fits in size_t.
+  size_type max_size() const throw() { return size_t(-1) / sizeof(T); }
+
+  // Copy-construct val into raw storage at p / run the destructor at p.
+  void construct(pointer p, const_reference val) { ::new ((void*)p) T(val); }
+  void destroy(pointer p) { ((T*)p)->~T(); }
+
+public:
+  // Reserve raw storage for n objects of type T.  Returns a null pointer if
+  // the request cannot be satisfied.
+  pointer allocate(size_type n,
+                   const void * /*hint*/ = 0)
+  {
+    // n*sizeof(T) must not wrap around, otherwise the block handed back
+    // would be smaller than the caller expects.
+    if (n > max_size())
+      return 0;
+    return (pointer) ::bigchunk_malloc(n*sizeof(T));
+  }
+
+  // Return storage obtained from allocate(); the element count is not needed.
+  void deallocate(pointer p, size_type /*n*/)
+  {
+    ::bigchunk_free((void *) p);
+  }
+};
+
+#endif // __cplusplus
+#endif // BIGCHUNK_H
+
diff --git a/src/halo-finder/src/cm_int.c b/src/halo-finder/src/cm_int.c
new file mode 100644
index 0000000..87fd174
--- /dev/null
+++ b/src/halo-finder/src/cm_int.c
@@ -0,0 +1,43 @@
+#define POSVEL_T float
+#define ID_T int
+
+/*
+ * Compute the bounding box and the mass-weighted centre of a particle set.
+ *
+ *   count        number of particles
+ *   xx, yy, zz   particle coordinates, count entries each
+ *   mass         particle masses, count entries
+ *   xmin, xmax   out: per-axis minimum / maximum coordinate (3 entries each)
+ *   xc           out: mass-weighted mean position (3 entries)
+ *
+ * The weighted sums are accumulated in double precision and converted to
+ * POSVEL_T at the end.  When count <= 0 there is nothing to measure and the
+ * outputs are left untouched.  If the masses sum to zero, xc is the result of
+ * a division by zero.
+ */
+void cm_int(ID_T count, const POSVEL_T* __restrict xx, const POSVEL_T* __restrict yy,
+                      const POSVEL_T* __restrict zz, const POSVEL_T* __restrict mass,
+                      POSVEL_T* __restrict xmin, POSVEL_T* __restrict xmax, POSVEL_T* __restrict xc)
+{
+  // xmin/xmax are currently set to the whole bounding box, but this is too conservative, so we'll
+  // set them based on the actual particle content.
+
+  // Without particles the arrays must not be read at index 0.
+  if (count <= 0)
+    return;
+
+  double x = 0, y = 0, z = 0, m = 0;
+
+  POSVEL_T w,x1,x2,y1,y2,z1,z2;
+
+  x1 = xx[0]; x2 = xx[0];
+  y1 = yy[0]; y2 = yy[0];
+  z1 = zz[0]; z2 = zz[0];
+
+  for (int i = 0; i < count; ++i)
+  {
+    if ( x1 > xx[i] ) x1 = xx[i]; /* x1 = min( xx[] ) */
+    if ( x2 < xx[i] ) x2 = xx[i]; /* x2 = max( xx[] ) */
+    if ( y1 > yy[i] ) y1 = yy[i]; /* y1 = min( yy[] ) */
+    if ( y2 < yy[i] ) y2 = yy[i]; /* y2 = max( yy[] ) */
+    if ( z1 > zz[i] ) z1 = zz[i]; /* z1 = min( zz[] ) */
+    if ( z2 < zz[i] ) z2 = zz[i]; /* z2 = max( zz[] ) */
+
+    w = mass[i];
+    x = x + w * xx[i];
+    y = y + w * yy[i];
+    z = z + w * zz[i];
+    m = m + w;
+  }
+
+  xc[0] = (POSVEL_T) (x/m);
+  xc[1] = (POSVEL_T) (y/m);
+  xc[2] = (POSVEL_T) (z/m);
+
+  xmin[0] = x1; xmax[0] = x2;
+  xmin[1] = y1; xmax[1] = y2;
+  xmin[2] = z1; xmax[2] = z2;
+}
+
diff --git a/src/halo-finder/src/cudaUtil.h b/src/halo-finder/src/cudaUtil.h
new file mode 100644
index 0000000..33722a4
--- /dev/null
+++ b/src/halo-finder/src/cudaUtil.h
@@ -0,0 +1,160 @@
+#include "hip/hip_runtime.h"
+#pragma once
+#include <stdio.h>
+
+#define cudaCheckError() {                                          \
+ hipError_t e=hipGetLastError();                                 \
+ if(e!=hipSuccess) {                                              \
+   printf("Cuda failure %s:%d: '%s'\n",__FILE__,__LINE__,hipGetErrorString(e));           \
+   exit(0); \
+ }                                                                 \
+}
+
+// Sum val across a group of hipWarpSize lanes using __shfl_down.
+// Each enabled step adds the value shuffled down from `offset` lanes away,
+// with offsets 16, 8, 4, 2, 1; steps whose offset is not below hipWarpSize
+// are skipped.  hipWarpSize is also passed as the width argument of
+// __shfl_down.
+// NOTE(review): the largest offset is 16, so a group wider than 32 lanes is
+// not fully reduced.  Presumably only the first lane of each group holds the
+// complete sum afterwards - confirm against the __shfl_down semantics.
+template <int hipWarpSize, typename T>
+__device__ __inline__ T warpReduceSum(T val) {
+  if(hipWarpSize>16) val+=__shfl_down(val,16,hipWarpSize);
+  if(hipWarpSize>8) val+=__shfl_down(val,8,hipWarpSize);
+  if(hipWarpSize>4) val+=__shfl_down(val,4,hipWarpSize);
+  if(hipWarpSize>2) val+=__shfl_down(val,2,hipWarpSize);
+  if(hipWarpSize>1) val+=__shfl_down(val,1,hipWarpSize);
+  return val;
+}
+// Maximum of val across a group of hipWarpSize lanes using __shfl_down.
+// Same offset sequence as warpReduceSum (16, 8, 4, 2, 1), combining with
+// max() instead of addition.
+// NOTE(review): presumably only the first lane of each group holds the full
+// maximum afterwards - confirm against the __shfl_down semantics.
+template <int hipWarpSize, typename T>
+__device__ __inline__ T warpReduceMax(T val) {
+  if(hipWarpSize>16) val=max(val,__shfl_down(val,16,hipWarpSize));
+  if(hipWarpSize>8)  val=max(val,__shfl_down(val,8,hipWarpSize));
+  if(hipWarpSize>4)  val=max(val,__shfl_down(val,4,hipWarpSize));
+  if(hipWarpSize>2)  val=max(val,__shfl_down(val,2,hipWarpSize));
+  if(hipWarpSize>1)  val=max(val,__shfl_down(val,1,hipWarpSize));
+  return val;
+}
+// Minimum of val across a group of hipWarpSize lanes using __shfl_down.
+// Same offset sequence as warpReduceSum (16, 8, 4, 2, 1), combining with
+// min() instead of addition.
+// NOTE(review): presumably only the first lane of each group holds the full
+// minimum afterwards - confirm against the __shfl_down semantics.
+template <int hipWarpSize, typename T>
+__device__ __inline__ T warpReduceMin(T val) {
+  if(hipWarpSize>16) val=min(val,__shfl_down(val,16,hipWarpSize));
+  if(hipWarpSize>8)  val=min(val,__shfl_down(val,8,hipWarpSize));
+  if(hipWarpSize>4)  val=min(val,__shfl_down(val,4,hipWarpSize));
+  if(hipWarpSize>2)  val=min(val,__shfl_down(val,2,hipWarpSize));
+  if(hipWarpSize>1)  val=min(val,__shfl_down(val,1,hipWarpSize));
+  return val;
+}
+
+// Sum val over the threads of a block (indexed by hipThreadIdx_x) in two
+// stages: a 32-lane reduction per group of 32 threads, then a second 32-lane
+// reduction over the per-group partial sums held in shared memory.
+// The shared buffer has 32 slots, so at most 32 groups (1024 threads) are
+// supported.
+// NOTE(review): presumably only thread 0 returns the complete block sum -
+// confirm against warpReduceSum / __shfl_down semantics.
+template <typename T>
+__device__ __inline__ T blockReduceSum(T val) {
+  // one partial sum per group of 32 threads
+  __shared__ volatile T smem[32];
+
+  // stage 1: reduce within each group; the group's first thread stores the result
+  val=warpReduceSum<32>(val);
+  if(hipThreadIdx_x%32==0) smem[hipThreadIdx_x/32]=val;
+  __syncthreads();
+  // stage 2: threads below hipBlockDim_x/32 pick up a partial sum, all others
+  // contribute 0; the first 32 threads then reduce those values
+  val=0;
+  if(hipThreadIdx_x<hipBlockDim_x/32) val=smem[hipThreadIdx_x];
+  if(hipThreadIdx_x<32) val=warpReduceSum<32>(val);
+  // keep smem intact until every thread has finished reading it
+  __syncthreads();
+  return val;
+}
+
+// Maximum of val over the threads of a block (indexed by hipThreadIdx_x) in
+// two stages: a 32-lane reduction per group of 32 threads, then a second
+// 32-lane reduction over the per-group partial maxima held in shared memory.
+// The shared buffer has 32 slots, so at most 32 groups (1024 threads) are
+// supported.
+// NOTE(review): presumably only thread 0 returns the complete block maximum -
+// confirm against warpReduceMax / __shfl_down semantics.
+template <typename T>
+__device__ __inline__ T blockReduceMax(T val) {
+  // one partial maximum per group of 32 threads
+  __shared__ volatile T smem[32];
+
+  // stage 1: reduce within each group; the group's first thread stores the result
+  val=warpReduceMax<32>(val);
+  if(hipThreadIdx_x%32==0) smem[hipThreadIdx_x/32]=val;
+  __syncthreads();
+
+  // stage 2: only the first 32 threads take part.  (The previous condition
+  // "hipThreadIdx_x<32==0" parsed as "(hipThreadIdx_x<32)==0" and therefore
+  // selected every thread except the first 32, indexing smem out of bounds.)
+  if(hipThreadIdx_x<32) {
+    // smem[0] is always written and is the neutral choice for threads that
+    // have no partial result of their own, matching the bound used by
+    // blockReduceSum.
+    T partial=smem[0];
+    if(hipThreadIdx_x<hipBlockDim_x/32) {
+      T mine=smem[hipThreadIdx_x];
+      partial=max(partial,mine);
+    }
+    val=warpReduceMax<32>(partial);
+  }
+  // keep smem intact until every thread has finished reading it
+  __syncthreads();
+  return val;
+}
+
+// Minimum of val over the threads of a block (indexed by hipThreadIdx_x) in
+// two stages: a 32-lane reduction per group of 32 threads, then a second
+// 32-lane reduction over the per-group partial minima held in shared memory.
+// The shared buffer has 32 slots, so at most 32 groups (1024 threads) are
+// supported.
+// NOTE(review): presumably only thread 0 returns the complete block minimum -
+// confirm against warpReduceMin / __shfl_down semantics.
+template <typename T>
+__device__ __inline__ T blockReduceMin(T val) {
+  // one partial minimum per group of 32 threads
+  __shared__ volatile T smem[32];
+
+  // stage 1: reduce within each group; the group's first thread stores the result
+  val=warpReduceMin<32>(val);
+  if(hipThreadIdx_x%32==0) smem[hipThreadIdx_x/32]=val;
+  __syncthreads();
+
+  // stage 2: only the first 32 threads take part.  (The previous condition
+  // "hipThreadIdx_x<32==0" parsed as "(hipThreadIdx_x<32)==0" and therefore
+  // selected every thread except the first 32, indexing smem out of bounds.)
+  if(hipThreadIdx_x<32) {
+    // smem[0] is always written and is the neutral choice for threads that
+    // have no partial result of their own, matching the bound used by
+    // blockReduceSum.
+    T partial=smem[0];
+    if(hipThreadIdx_x<hipBlockDim_x/32) {
+      T mine=smem[hipThreadIdx_x];
+      partial=min(partial,mine);
+    }
+    val=warpReduceMin<32>(partial);
+  }
+  // keep smem intact until every thread has finished reading it
+  __syncthreads();
+  return val;
+}
+
+// Sum val over a group of 32 threads with __shfl_down (offsets 16, 8, 4, 2,
+// 1) and have every thread whose hipThreadIdx_x is a multiple of 32 add the
+// result to *out with atomicAdd.
+// NOTE(review): relies on POSVEL_T being defined by the including file, and
+// on the default width of __shfl_down - confirm for 64-wide wavefronts.
+__device__  __forceinline__
+void atomicWarpReduceAndUpdate(POSVEL_T *out, POSVEL_T val) {
+  //perform shfl reduction
+  val+=__shfl_down(val, 16); 
+  val+=__shfl_down(val, 8); 
+  val+=__shfl_down(val, 4);
+  val+=__shfl_down(val, 2); 
+  val+=__shfl_down(val, 1);
+
+  // one thread per group of 32 publishes the group's sum
+  if(hipThreadIdx_x%32==0)
+    atomicAdd(out,val);  //atomics are unnecessary but they are faster than non-atomics due to a single bus transaction
+}
+
+// Selects the GPU for this process when an instance is constructed.
+// The device index is (local rank) modulo (number of devices), where
+//   - the local rank is read from the first of MV2_COMM_WORLD_LOCAL_RANK,
+//     OMPI_COMM_WORLD_LOCAL_RANK or SLURM_LOCALID that is set (default 0);
+//   - the number of devices is read from HACC_NUM_CUDA_DEV (default 1).
+class cudaDeviceSelector {
+  public:
+  cudaDeviceSelector() {
+    char* str;
+    int local_rank = 0;
+    int numDev=1;
+
+    //No MPI at this time so go by environment variables.
+    //This may need to be updated to match your MPI flavor
+    if((str = getenv("MV2_COMM_WORLD_LOCAL_RANK")) != NULL) {
+      local_rank = atoi(str);
+    }
+    else if((str = getenv("OMPI_COMM_WORLD_LOCAL_RANK")) != NULL) {
+      local_rank = atoi(str);
+    }
+    else if((str = getenv("SLURM_LOCALID")) != NULL) {
+      local_rank = atoi(str);
+    }
+
+    //get the number of devices to use
+    if((str = getenv("HACC_NUM_CUDA_DEV")) != NULL) {
+      numDev=atoi(str);
+    }
+    //a zero, negative or non-numeric setting would make the modulo below
+    //divide by zero (or yield a negative device), so fall back to one device
+    if(numDev<1) numDev=1;
+
+#if 0
+
+#if 0
+    //Use MPS,  need to figure out how to set numDev, perhaps an environment variable?
+    char var[100];
+    sprintf(var,"/tmp/nvidia-mps_%d",local_rank%numDev);
+    setenv("CUDA_MPS_PIPE_DIRECTORY",var,1);
+#endif
+#else 
+    int dev;
+    //set via local MPI rank 
+    dev = local_rank % numDev;
+ 
+    //we must set this for all threads
+    if(hipSetDevice(dev)!=hipSuccess) {
+      printf("Warning: unable to select device %d\n", dev);
+    }
+#endif
+  }
+};
+
+// Debugging aid: report when ptr does not belong to the currently selected
+// device.
+//   ptr   pointer to inspect
+//   name  label printed in the diagnostic
+// Prints a message prefixed with the MPI rank if the device recorded in the
+// pointer's attributes differs from the current device, or if either query
+// fails (in which case the comparison would read uninitialised data).
+inline void checkCudaPtr(const void* ptr, const char* name) {
+  int rank;
+
+  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
+
+  int dev;
+  if(hipGetDevice(&dev)!=hipSuccess) {
+    printf("%d: Error '%s', unable to query the current device\n", rank, name);
+    return;
+  }
+
+  hipPointerAttribute_t at;
+
+  if(hipPointerGetAttributes(&at,ptr)!=hipSuccess) {
+    printf("%d: Error '%s', unable to query pointer attributes\n", rank, name);
+    return;
+  }
+
+  if(dev!=at.device) {
+    printf("%d: Error '%s', dev: %d, at.device: %d\n", rank, name, dev, at.device);
+  }
+}
+
+  
diff --git a/src/halo-finder/src/dfft/Makefile b/src/halo-finder/src/dfft/Makefile
new file mode 100644
index 0000000..6d377d4
--- /dev/null
+++ b/src/halo-finder/src/dfft/Makefile
@@ -0,0 +1,137 @@
+# Build configuration for libdfft.a (distributed FFT support) and its tests.
+#
+# Variables read from the environment / invoking make:
+#   HACC_OBJDIR                     directory receiving objects and binaries
+#   HACC_MPI_CC, HACC_MPI_CXX       MPI compiler wrappers
+#   HACC_MPI_CFLAGS/CXXFLAGS/LDFLAGS
+#   FFTW_HOME, FFTW_MAJOR_VERSION   FFTW install prefix and major version (2 or 3)
+#   FFTW_WRAPPER, FFTW_THREADING    optional: "essl" wrapper / "omp" threading
+#   WARN                            optional warning flags
+SHELL := /bin/sh
+
+OBJDIR := ${HACC_OBJDIR}
+
+# Sources that make up libdfft.a
+SOURCES += distribution.c
+SOURCES += comm-schedule.c
+SOURCES += dims.c
+#SOURCES += subarray.c
+OBJLIST = $(SOURCES:.c=.o)
+OBJECTS := $(addprefix $(OBJDIR)/,$(OBJLIST))
+
+TARGETS += $(OBJDIR)/libdfft.a
+
+all: $(TARGETS)
+
+# Test programs built by "make tests"
+tests += $(OBJDIR)/test-comm-schedule
+tests += $(OBJDIR)/test-distribution
+tests += $(OBJDIR)/test-distribution-2-to-3
+tests += $(OBJDIR)/test-dfft
+tests += $(OBJDIR)/test-solver-class
+tests += $(OBJDIR)/test-solver-bench
+tests += $(OBJDIR)/test-qpm
+tests += $(OBJDIR)/test-mpi-init
+# Test programs that additionally require FFTW 3 ("make tests-fftw3")
+tests-fftw3 += $(OBJDIR)/test-distribution-fft
+tests-fftw3 += $(OBJDIR)/test-delta-function
+tests-fftw3 += $(OBJDIR)/test-timing
+tests-fftw3 += $(OBJDIR)/test-solver
+tests-fftw3 += $(OBJDIR)/test-zarija
+tests-fftw3 += $(OBJDIR)/test-fftw-guru
+
+fftw := $(FFTW_HOME)
+
+CFLAGS += ${HACC_MPI_CFLAGS}
+CXXFLAGS += ${HACC_MPI_CXXFLAGS}
+LDFLAGS += ${HACC_MPI_LDFLAGS}
+
+# Header search path.  In this tree the halo-finder headers (bigchunk.h, ...)
+# live one directory up; ../halo_finder is their location in the upstream
+# HACC layout and is kept for compatibility.
+CFLAGS += $(WARN)
+CFLAGS += -DUSE_SLAB_WORKAROUND=1
+CFLAGS += -I$(fftw)/include -I.. -I../halo_finder
+
+CXXFLAGS += $(WARN)
+CXXFLAGS += -I$(fftw)/include -I.. -I../halo_finder
+
+LDFLAGS += -L$(fftw)/lib
+
+ifeq "${FFTW_MAJOR_VERSION}" "2"
+  CFLAGS   += -DFFTW2=1
+  CXXFLAGS += -DFFTW2=1
+  LDLIBS   += -lfftw_mpi -lfftw
+endif
+
+# NOTE(review): as nested here, the regular FFTW3 libraries are only added to
+# LDLIBS when FFTW_WRAPPER is "essl"; with any other wrapper setting no FFTW3
+# library is linked.  This looks like a missing "else" after the essl block -
+# confirm against the upstream HACC Makefile before changing it.
+ifeq "${FFTW_MAJOR_VERSION}" "3"
+  CFLAGS += -DFFTW3=1
+  CXXFLAGS += -DFFTW3=1
+ifeq ("${FFTW_WRAPPER}", "essl")
+  CFLAGS += -DESSL_FFTW=1
+  CXXFLAGS += -DESSL_FFTW=1
+  LDLIBS += -lfftw3_esslbg -lesslsmpbg
+ifeq ("${FFTW_THREADING}", "omp")
+  CFLAGS += -DFFTW3_THREADS=1
+  CXXFLAGS += -DFFTW3_THREADS=1
+  LDLIBS += -lfftw3_mpi -lfftw3_omp -lfftw3
+else
+  LDLIBS += -lfftw3_mpi -lfftw3
+endif
+endif
+include pencil.mk
+  CFLAGS += ${DFFT_PEN_CFLAGS}
+  CXXFLAGS += ${DFFT_PEN_CXXFLAGS}
+endif
+
+# Targets that do not name files.
+.PHONY: all clean check check-fftw3 tags tests tests-fftw3
+
+#$(OBJECTS): | $(OBJDIR)
+
+# The object directory is an order-only prerequisite of every object.
+$(OBJDIR):
+	mkdir -p $(OBJDIR)
+
+$(OBJDIR)/%.o: %.c | $(OBJDIR)
+	${HACC_MPI_CC} ${CFLAGS} -c -o $@ $<
+
+$(OBJDIR)/%.o: %.cpp | $(OBJDIR)
+	${HACC_MPI_CXX} ${CXXFLAGS} -c -o $@ $<
+
+clean::
+	$(RM) $(TARGETS) $(tests) $(tests-fftw3) plot.in *.o *~ bigchunk-local.c
+	$(RM) -rf $(OBJDIR)
+
+# Run every test on 8 MPI ranks, then verify the qpm reference data.
+# $(MAKE) is used for the sub-makes so that flags such as -j and -n propagate.
+check: tests
+	for t in $(tests) ; do \
+	    echo '++++' $$t '++++' ; \
+	    mpirun -np 8 $$t ; \
+	    echo '----' $$t '----' ; \
+	done
+	$(MAKE) -C testdata/qpm check-data
+	$(MAKE) -C testdata/qpm clean
+
+check-fftw3: tests-fftw3
+
+tags:
+	find .. -type f | grep -E '(c|cc|cpp|cxx|h|hpp|hxx)$$' | xargs /usr/bin/etags
+
+tests: $(tests)
+
+ifeq "${FFTW_MAJOR_VERSION}" "3"
+tests-fftw3: $(tests-fftw3)
+else
+tests-fftw3:
+endif
+
+
+$(OBJDIR)/libdfft.a: $(OBJDIR)/libdfft.a($(OBJECTS))
+	ranlib $@
+
+# Generic link rule for C test programs.
+$(OBJDIR)/test-%: $(OBJDIR)/test-%.o $(OBJDIR)/bigchunk-local.o $(OBJDIR)/libdfft.a
+	${HACC_MPI_CC} $(CFLAGS) $(LDFLAGS) $^ -o $@ $(LDLIBS)
+
+# The tests link a private copy of the bigchunk allocator.  In this tree the
+# source sits one directory up; ../halo_finder is the upstream HACC location.
+BIGCHUNK_SRC := $(firstword $(wildcard ../bigchunk.c ../halo_finder/bigchunk.c))
+
+bigchunk-local.c: $(BIGCHUNK_SRC)
+	cp $(BIGCHUNK_SRC) bigchunk-local.c
+
+# C++ test programs: same inputs as the generic rule, linked with the C++ driver.
+$(OBJDIR)/test-solver-class.o: solver.hpp
+$(OBJDIR)/test-solver-class: $(OBJDIR)/test-solver-class.o $(OBJDIR)/bigchunk-local.o $(OBJDIR)/libdfft.a
+	${HACC_MPI_CXX} $(CXXFLAGS) $(LDFLAGS) $^ -o $@ $(LDLIBS)
+
+$(OBJDIR)/test-solver-bench.o: solver.hpp
+$(OBJDIR)/test-solver-bench: $(OBJDIR)/test-solver-bench.o $(OBJDIR)/bigchunk-local.o $(OBJDIR)/libdfft.a
+	${HACC_MPI_CXX} $(CXXFLAGS) $(LDFLAGS) $^ -o $@ $(LDLIBS)
+
+$(OBJDIR)/test-qpm.o: solver.hpp
+$(OBJDIR)/test-qpm: $(OBJDIR)/test-qpm.o $(OBJDIR)/bigchunk-local.o $(OBJDIR)/libdfft.a
+	${HACC_MPI_CXX} $(CXXFLAGS) $(LDFLAGS) $^ -o $@ $(LDLIBS)
+
+$(OBJDIR)/test-zarija.o: solver.hpp
+$(OBJDIR)/test-zarija: $(OBJDIR)/test-zarija.o $(OBJDIR)/bigchunk-local.o $(OBJDIR)/libdfft.a
+	${HACC_MPI_CXX} $(CXXFLAGS) $(LDFLAGS) $^ -o $@ $(LDLIBS)
+
+$(OBJDIR)/test-dfft.o: dfft.hpp distribution.hpp distribution.h
+$(OBJDIR)/test-dfft: $(OBJDIR)/test-dfft.o $(OBJDIR)/bigchunk-local.o $(OBJDIR)/libdfft.a
+	${HACC_MPI_CXX} $(CXXFLAGS) $(LDFLAGS) $^ -o $@ $(LDLIBS)
diff --git a/src/halo-finder/src/dfft/README b/src/halo-finder/src/dfft/README
new file mode 100644
index 0000000..d13d91e
--- /dev/null
+++ b/src/halo-finder/src/dfft/README
@@ -0,0 +1,41 @@
+dfft: Distributed FFT
+
+FFTW + 3-D to 1-D Data Redistribution
+
+This directory contains the implementation and tests for the FFTW MPI
+transform, combined with a 1-d <--> 3-d data redistribution so that it
+can be applied directly to problems requiring a 3-d data distribution
+
+Implementation:
+  distribution.c
+  distribution.h
+
+Tools:
+  fp.h
+    Floating point comparison functions.  
+
+  cycle.h
+    The cycle timer from FFTW, useful for comparative benchmarks.
+
+Tests:
+
+  test-distribution
+    This test initializes the field to the value of its global
+    coordinate for thorough checking of the redistribution
+    functions. Checks that the data is correct are performed before
+    and after each data redistribution.
+
+  test-distribution-fft
+    As above but with a forward backward fft.
+
+  test-delta-function
+    This test initializes the field to a delta function centered
+    and performs forward and backward transforms, checking the
+    data is correct before and after each data redistribution
+    or transform.
+
+  test-timing
+    This test times the combined redistribution and transform.
+
+  test-solver
+    A test stand-alone implementation of the Poisson solver.
diff --git a/src/halo-finder/src/dfft/active-schedule.c b/src/halo-finder/src/dfft/active-schedule.c
new file mode 100644
index 0000000..51151a5
--- /dev/null
+++ b/src/halo-finder/src/dfft/active-schedule.c
@@ -0,0 +1,199 @@
+#include <stdbool.h>
+
+#include "active-schedule.h"
+
+// Compile-time debug switch: when DEBUG is defined, active_schedule_start()
+// prints each schedule item to stderr before starting it.
+#ifdef DEBUG
+static bool debug = true;
+#else
+static bool debug = false;
+#endif
+
+///
+// Debug helper: print one schedule item to stderr.
+//
+// The line shows the caller's rank in MPI_COMM_WORLD, the item's address,
+// the communicator and datatype handles, the datatype's lower bound and
+// extent, the element count, and "self -> peer" for a send or
+// "self <- peer" for anything else, where self is the caller's rank in
+// the item's communicator.
+//
+// NOTE(review): the handles are printed with %p through a cast to void *,
+// which presumes the MPI implementation's handle types convert cleanly to
+// a pointer -- confirm for the MPI library in use.
+///
+static void print_schedule(active_schedule_t *schedule)
+{
+    int global_self;
+    int self;
+    MPI_Aint lb;
+    MPI_Aint extent;
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &global_self);
+    MPI_Comm_rank(schedule->comm, &self);
+    MPI_Type_get_extent(schedule->type, &lb, &extent);
+
+    // %p requires a void * argument, so the item pointer is cast as well.
+    fprintf(stderr,
+            "%d: schedule=%p, comm=%p, "
+            "type=%p(lb=%ld, extent=%ld), count=%d, %d %s %d\n",
+            global_self,
+            (void *) schedule,
+            (void *) schedule->comm,
+            (void *) schedule->type,
+            (long) lb,
+            (long) extent,
+            schedule->count,
+            self,
+            schedule->direction == ACTIVE_SCHEDULE_SEND ? " -> " : " <- ",
+            schedule->peer);
+}
+
+
+///
+// Create a new item and place it at the head of a schedule.
+//
+//   schedule       current head of the list (may be NULL)
+//   comm           communicator used for the transfer
+//   peer           rank of the peer within comm
+//   direction      ACTIVE_SCHEDULE_SEND or ACTIVE_SCHEDULE_RECV
+//   addr           base address of the buffer
+//   type           MPI datatype; a duplicate made with MPI_Type_dup is stored
+//   count          number of datatype elements
+//   pre_function   optional hook stored together with pre_data
+//   post_function  optional hook stored together with post_data
+//
+// Returns the new item, which is the new head of the list.  There is no
+// way to report failure to the caller, so an allocation failure aborts
+// the job instead of writing through a NULL pointer.
+///
+active_schedule_t *active_schedule_prepend(active_schedule_t *schedule,
+                                           MPI_Comm comm,
+                                           int peer,
+                                           int direction,
+                                           void *addr,
+                                           MPI_Datatype type,
+                                           int count,
+                                           active_function_pointer_t pre_function,
+                                           void *pre_data,
+                                           active_function_pointer_t post_function,
+                                           void *post_data)
+{
+    active_schedule_t *new_schedule;
+
+    new_schedule = (active_schedule_t *) malloc(sizeof *new_schedule);
+    if (!new_schedule) {
+        perror("out of memory");
+        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+        abort();  // only reached if MPI_Abort returns
+    }
+    new_schedule->next = schedule;
+    new_schedule->comm = comm;
+    new_schedule->peer = peer;
+    new_schedule->direction = direction;
+    new_schedule->addr = addr;
+    MPI_Type_dup(type, &new_schedule->type);
+    new_schedule->pre_function = pre_function;
+    new_schedule->pre_data = pre_data;
+    new_schedule->post_function = post_function;
+    new_schedule->post_data = post_data;
+    new_schedule->count = count;
+    new_schedule->req = MPI_REQUEST_NULL;
+    return new_schedule;
+}
+
+
+///
+// Create a new item and link it after the last item of a schedule.
+//
+//   schedule       head of the list (may be NULL for an empty schedule)
+//   comm           communicator used for the transfer
+//   peer           rank of the peer within comm
+//   direction      ACTIVE_SCHEDULE_SEND or ACTIVE_SCHEDULE_RECV
+//   addr           base address of the buffer
+//   type           MPI datatype; a duplicate made with MPI_Type_dup is stored
+//   count          number of datatype elements
+//   pre_function   optional hook stored together with pre_data
+//   post_function  optional hook stored together with post_data
+//
+// Returns the head of the list: the new item when schedule was NULL,
+// otherwise schedule itself.  There is no way to report failure to the
+// caller, so an allocation failure aborts the job instead of writing
+// through a NULL pointer.
+///
+active_schedule_t *active_schedule_append(active_schedule_t *schedule,
+                                          MPI_Comm comm,
+                                          int peer,
+                                          int direction,
+                                          void *addr,
+                                          MPI_Datatype type,
+                                          int count,
+                                          active_function_pointer_t pre_function,
+                                          void *pre_data,
+                                          active_function_pointer_t post_function,
+                                          void *post_data)
+{
+    active_schedule_t *new_schedule;
+
+    new_schedule = (active_schedule_t *) malloc(sizeof *new_schedule);
+    if (!new_schedule) {
+        perror("out of memory");
+        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+        abort();  // only reached if MPI_Abort returns
+    }
+    new_schedule->next = NULL;
+    new_schedule->comm = comm;
+    new_schedule->peer = peer;
+    new_schedule->direction = direction;
+    new_schedule->addr = addr;
+    MPI_Type_dup(type, &new_schedule->type);
+    new_schedule->pre_function = pre_function;
+    new_schedule->pre_data = pre_data;
+    new_schedule->post_function = post_function;
+    new_schedule->post_data = post_data;
+    new_schedule->count = count;
+    new_schedule->req = MPI_REQUEST_NULL;
+
+    if (schedule == NULL) {
+        schedule = new_schedule;
+    } else {
+        // walk to the current tail and hang the new item off it
+        active_schedule_t *p = schedule;
+        while (p->next) {
+            p = p->next;
+        }
+        p->next = new_schedule;
+    }
+
+    return schedule;
+}
+
+
+// Release every item of a schedule: free the duplicated MPI datatype
+// held by the item, then the item itself.
+void active_schedule_free(active_schedule_t *schedule)
+{
+    active_schedule_t *item = schedule;
+
+    while (item != NULL) {
+        // remember the successor before the item's storage goes away
+        active_schedule_t *following = item->next;
+
+        MPI_Type_free(&item->type);
+        free(item);
+        item = following;
+    }
+}
+
+
+// Start every item of a schedule in list order: optionally print it,
+// run its pre hook if one is set, then post the non-blocking send or
+// receive.  Completion is not awaited here.
+void active_schedule_start(active_schedule_t *schedule)
+{
+    active_schedule_t *item = schedule;
+
+    while (item != NULL) {
+        if (debug) {
+            print_schedule(item);
+        }
+
+        if (item->pre_function != NULL) {
+            item->pre_function(item->pre_data);
+        }
+
+        switch (item->direction) {
+        case ACTIVE_SCHEDULE_SEND:
+            MPI_Isend(item->addr, item->count, item->type, item->peer, 0,
+                      item->comm, &item->req);
+            break;
+        case ACTIVE_SCHEDULE_RECV:
+            MPI_Irecv(item->addr, item->count, item->type, item->peer, 0,
+                      item->comm, &item->req);
+            break;
+        default:
+            perror("unknown direction in communication schedule");
+            break;
+        }
+
+        item = item->next;
+    }
+}
+
+
+// Call MPI_Test once on the request of every item; the completion flag
+// is discarded.
+void active_schedule_progress(active_schedule_t *schedule)
+{
+    active_schedule_t *item = schedule;
+
+    while (item != NULL) {
+        int completed;
+
+        MPI_Test(&item->req, &completed, MPI_STATUS_IGNORE);
+        item = item->next;
+    }
+}
+
+
+// Wait for each item in list order; as soon as an item's request has
+// completed, run its post hook if one is set.
+void active_schedule_wait(active_schedule_t *schedule)
+{
+    active_schedule_t *item = schedule;
+
+    while (item != NULL) {
+        MPI_Wait(&item->req, MPI_STATUS_IGNORE);
+
+        if (item->post_function != NULL) {
+            item->post_function(item->post_data);
+        }
+
+        item = item->next;
+    }
+}
+
+
+///
+// Start and complete a schedule with a bounded look-ahead.
+//
+//   schedule  head of the list to execute (may be NULL)
+//   depth     number of items started ahead before the first wait, so at
+//             most depth + 1 transfers are outstanding at any time;
+//             0 starts and completes the items one by one.  Negative
+//             values are treated as 0.
+//
+// Each loop iteration starts the next unstarted item (running its pre
+// hook first) and, once the look-ahead has been used up, waits for the
+// oldest outstanding item and runs its post hook.
+///
+void active_schedule_execute(active_schedule_t *schedule, int depth)
+{
+
+    active_schedule_t *s = schedule;  // next item to start
+    active_schedule_t *t = schedule;  // next item to complete
+
+    // A negative depth would never reach zero below, so no item would
+    // ever be waited for and the loop would not terminate.
+    if (depth < 0) {
+        depth = 0;
+    }
+
+    while (t) {
+        if (s) {
+            if (s->pre_function) {
+                s->pre_function(s->pre_data);
+            }
+            if (s->direction == ACTIVE_SCHEDULE_SEND) {
+                MPI_Isend(s->addr, s->count, s->type, s->peer, 0, s->comm, &s->req);
+            } else if (s->direction == ACTIVE_SCHEDULE_RECV) {
+                MPI_Irecv(s->addr, s->count, s->type, s->peer, 0, s->comm, &s->req);
+            } else {
+                perror("unknown direction in communication schedule");
+            }
+            s = s->next;
+        }
+
+        if (depth == 0) {
+            MPI_Wait(&t->req, MPI_STATUS_IGNORE);
+            if (t->post_function) {
+                t->post_function(t->post_data);
+            }
+            t = t->next;
+        } else {
+            // still filling the pipeline: start another item before waiting
+            --depth;
+        }
+    }
+}
diff --git a/src/halo-finder/src/dfft/active-schedule.h b/src/halo-finder/src/dfft/active-schedule.h
new file mode 100644
index 0000000..5a17fd4
--- /dev/null
+++ b/src/halo-finder/src/dfft/active-schedule.h
@@ -0,0 +1,122 @@
+///
+// A communication schedule is a list of sends and recvs to execute.
+//
+// See test-comm-schedule.c for an example of use.
+///
+
+#ifndef ACTIVE_SCHEDULE_H
+#define ACTIVE_SCHEDULE_H
+
+#ifdef __cplusplus
+#define EXTERN_C_BEGIN extern "C" {
+#define EXTERN_C_END }
+#else
+#define EXTERN_C_BEGIN
+#define EXTERN_C_END
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+
+EXTERN_C_BEGIN
+
+///
+// Callback type for the pre and post hooks of a schedule item: a function
+// taking one opaque data pointer and returning nothing.
+///
+typedef void (*active_function_pointer_t)(void *);
+
+///
+// enumeration of directions (send or recv) in a communication schedule
+///
+enum { ACTIVE_SCHEDULE_SEND = 1, ACTIVE_SCHEDULE_RECV = 2 };
+
+///
+// descriptor for one item of a communication schedule (singly linked list)
+//   next           descriptor for the next item in the schedule
+//   addr           the base address for this item
+//   type           MPI datatype for this item
+//   comm           MPI communicator for this item
+//   req            MPI request handle of the send/recv for this item
+//   peer           the rank of my peer for this item
+//   direction      send or recv (ACTIVE_SCHEDULE_SEND / ACTIVE_SCHEDULE_RECV)
+//   count          number of MPI datatypes to send for this item
+//   pre_function   a function to call before the send/recv is started
+//   pre_data       pointer to data for pre_function
+//   post_function  a function to call after the send/recv completes
+//   post_data      pointer to data for post_function
+///
+typedef struct active_schedule_t active_schedule_t;
+struct active_schedule_t {
+    active_schedule_t *next;
+    void *addr;
+    MPI_Datatype type;
+    MPI_Comm comm;
+    MPI_Request req;
+    int peer;
+    int direction;
+    int count;
+    active_function_pointer_t pre_function;
+    void *pre_data;
+    active_function_pointer_t post_function;
+    void *post_data;
+};
+
+
+///
+// append a new item to a communication schedule
+//
+// The item is linked after the current last item.  Returns the head of
+// the list: the new item if schedule was NULL, otherwise schedule.
+// The datatype is duplicated; the duplicate is released by
+// active_schedule_free().
+///
+active_schedule_t *active_schedule_append(active_schedule_t *schedule,
+                                          MPI_Comm comm,
+                                          int peer,
+                                          int direction,
+                                          void *addr,
+                                          MPI_Datatype type,
+                                          int count,
+                                          active_function_pointer_t pre_function,
+                                          void *pre_data,
+                                          active_function_pointer_t post_function,
+                                          void *post_data);
+
+///
+// prepend a new item to a communication schedule
+//
+// Returns the new item, which becomes the head of the list.
+// The datatype is duplicated; the duplicate is released by
+// active_schedule_free().
+///
+active_schedule_t *active_schedule_prepend(active_schedule_t *schedule,
+                                           MPI_Comm comm,
+                                           int peer,
+                                           int direction,
+                                           void *addr,
+                                           MPI_Datatype type,
+                                           int count,
+                                           active_function_pointer_t pre_function,
+                                           void *pre_data,
+                                           active_function_pointer_t post_function,
+                                           void *post_data);
+
+///
+// start execution of a communication schedule
+//
+// For every item in list order: call its pre function (if set), then
+// post the non-blocking send or recv.
+///
+void active_schedule_start(active_schedule_t *schedule);
+
+///
+// progress an already started communication schedule
+//
+// Tests the request of every item once without blocking.
+///
+void active_schedule_progress(active_schedule_t *schedule);
+
+///
+// wait for completion of a communication schedule
+//
+// Waits for the items in list order and calls each item's post function
+// (if set) right after that item has completed.
+///
+void active_schedule_wait(active_schedule_t *schedule);
+
+///
+// execute a communication schedule
+//
+// Starts and completes the items with a look-ahead of depth: depth items
+// are started before the first wait, so up to depth + 1 transfers are
+// outstanding at a time.  Pre and post functions are called as in
+// active_schedule_start() and active_schedule_wait().
+///
+void active_schedule_execute(active_schedule_t *schedule, int depth);
+
+///
+// delete a communication schedule
+//
+// Frees the duplicated datatype and the storage of every item.
+///
+void active_schedule_free(active_schedule_t *schedule);
+
+EXTERN_C_END
+
+#endif
diff --git a/src/halo-finder/src/dfft/allocator.hpp b/src/halo-finder/src/dfft/allocator.hpp
new file mode 100644
index 0000000..33c3f72
--- /dev/null
+++ b/src/halo-finder/src/dfft/allocator.hpp
@@ -0,0 +1,98 @@
+#ifndef ALLOCATOR_HPP
+#define ALLOCATOR_HPP
+
+#ifdef FFTW3
+#ifdef ESSL_FFTW
+#include <fftw3_essl.h>
+#else
+#include <fftw3.h>
+#endif
+#else
+#include <fftw.h>
+#endif
+
+///
+// An allocator class based on fftw_malloc to get SIMD friendly
+// alignment.
+///
+template <class T> class fftw_allocator
+{
+public:
+    typedef T                 value_type;
+    typedef value_type*       pointer;
+    typedef const value_type* const_pointer;
+    typedef value_type&       reference;
+    typedef const value_type& const_reference;
+    typedef std::size_t       size_type;
+    typedef std::ptrdiff_t    difference_type;
+
+    // Allocator for another value type, as required by containers that
+    // allocate internal node or bookkeeping types.
+    template <class U>
+    struct rebind { typedef fftw_allocator<U> other; };
+
+    // The allocator carries no state, so construction and copying do nothing.
+    fftw_allocator() {}
+    fftw_allocator(const fftw_allocator&) {}
+    template <class U>
+    fftw_allocator(const fftw_allocator<U>&) {}
+    ~fftw_allocator() {}
+
+    // Address of an object; both overloads return a pointer to x.
+    pointer address(reference x) const { return &x; }
+    const_pointer address(const_reference x) const { return &x; }
+
+    // Allocate uninitialised storage for n objects with fftw_malloc.
+    // Throws std::bad_alloc if the byte count would overflow size_type
+    // or if fftw_malloc returns a null pointer.  The hint is ignored.
+    pointer allocate(size_type n, const_pointer = 0)
+    {
+        if (n > max_size()) {
+            throw std::bad_alloc();
+        }
+        void* p = fftw_malloc(n * sizeof(T));
+        if (!p) {
+            throw std::bad_alloc();
+        }
+        return static_cast<pointer>(p);
+    }
+
+    // Return storage obtained from allocate() with fftw_free.
+    void deallocate(pointer p, size_type)
+    {
+        fftw_free(p);
+    }
+
+    // Largest n for which n * sizeof(T) still fits in size_type.
+    size_type max_size() const
+    {
+        return static_cast<size_type>(-1) / sizeof(T);
+    }
+
+    // Copy-construct an object in already allocated storage.
+    void construct(pointer p, const value_type& x)
+    {
+        new(p) value_type(x);
+    }
+
+    // Run the destructor without releasing the storage.
+    void destroy(pointer p)
+    {
+        p->~value_type();
+    }
+
+private:
+    // Assignment is declared but not defined, i.e. disabled.
+    void operator=(const fftw_allocator&);
+};
+
+
+// Specialisation for void: only the pointer typedefs and rebind exist,
+// since void objects cannot be allocated or referenced.  The members
+// must be public to be usable through the allocator interface.
+template<> class fftw_allocator<void>
+{
+public:
+    typedef void        value_type;
+    typedef void*       pointer;
+    typedef const void* const_pointer;
+
+    template <class U>
+    struct rebind { typedef fftw_allocator<U> other; };
+};
+
+
+// All fftw_allocator instances are interchangeable: storage obtained from
+// one can be released through any other, whatever the value types.
+template <class T, class U>
+inline bool operator==(const fftw_allocator<T>&, const fftw_allocator<U>&)
+{
+    return true;
+}
+
+// Counterpart of operator==: fftw_allocator instances never compare
+// unequal, whatever their value types.
+template <class T, class U>
+inline bool operator!=(const fftw_allocator<T>&,
+                       const fftw_allocator<U>&)
+{
+    return false;
+}
+
+#endif
diff --git a/src/halo-finder/src/dfft/comm-schedule.c b/src/halo-finder/src/dfft/comm-schedule.c
new file mode 100644
index 0000000..ebd70b5
--- /dev/null
+++ b/src/halo-finder/src/dfft/comm-schedule.c
@@ -0,0 +1,162 @@
+#include <stdbool.h>
+
+#include "comm-schedule.h"
+
+// Compile-time debug switch: when DEBUG is defined, comm_schedule_start()
+// prints each schedule item to stderr before starting it.
+#ifdef DEBUG
+static bool debug = true;
+#else
+static bool debug = false;
+#endif
+
+
+///
+// Debug helper: print one schedule item to stderr.
+//
+// The line shows the caller's rank in MPI_COMM_WORLD, the item's address,
+// the communicator and datatype handles, the datatype's lower bound and
+// extent, the element count, and "self -> peer" for a send or
+// "self <- peer" for anything else, where self is the caller's rank in
+// the item's communicator.
+//
+// NOTE(review): the handles are printed with %p through a cast to void *,
+// which presumes the MPI implementation's handle types convert cleanly to
+// a pointer -- confirm for the MPI library in use.
+///
+static void print_schedule(comm_schedule_t *schedule)
+{
+    int global_self;
+    int self;
+    MPI_Aint lb;
+    MPI_Aint extent;
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &global_self);
+    MPI_Comm_rank(schedule->comm, &self);
+    MPI_Type_get_extent(schedule->type, &lb, &extent);
+
+    // %p requires a void * argument, so the item pointer is cast as well.
+    fprintf(stderr,
+            "%d: schedule=%p, comm=%p, "
+            "type=%p(lb=%ld, extent=%ld), count=%d, %d %s %d\n",
+            global_self,
+            (void *) schedule,
+            (void *) schedule->comm,
+            (void *) schedule->type,
+            (long) lb,
+            (long) extent,
+            schedule->count,
+            self,
+            schedule->direction == COMM_SCHEDULE_SEND ? " -> " : " <- ",
+            schedule->peer);
+}
+
+
+///
+// Create a new item and link it after the last item of a schedule.
+//
+//   schedule   head of the list (may be NULL for an empty schedule)
+//   comm       communicator used for the transfer
+//   peer       rank of the peer within comm
+//   direction  COMM_SCHEDULE_SEND or COMM_SCHEDULE_RECV
+//   addr       base address of the buffer
+//   type       MPI datatype; a duplicate made with MPI_Type_dup is stored
+//   count      number of datatype elements
+//
+// Returns the head of the list: the new item when schedule was NULL,
+// otherwise schedule itself.  There is no way to report failure to the
+// caller, so an allocation failure aborts the job instead of writing
+// through a NULL pointer.
+///
+comm_schedule_t *comm_schedule_append(comm_schedule_t *schedule,
+                                      MPI_Comm comm,
+                                      int peer,
+                                      int direction,
+                                      void *addr,
+                                      MPI_Datatype type,
+                                      int count)
+{
+    comm_schedule_t *new_schedule;
+
+    new_schedule = (comm_schedule_t *) malloc(sizeof *new_schedule);
+    if (!new_schedule) {
+        perror("out of memory");
+        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+        abort();  // only reached if MPI_Abort returns
+    }
+    new_schedule->next = NULL;
+    new_schedule->comm = comm;
+    new_schedule->peer = peer;
+    new_schedule->direction = direction;
+    new_schedule->addr = addr;
+    MPI_Type_dup(type, &new_schedule->type);
+    new_schedule->count = count;
+    new_schedule->req = MPI_REQUEST_NULL;
+
+    if (schedule == NULL) {
+        schedule = new_schedule;
+    } else {
+        // walk to the current tail and hang the new item off it
+        comm_schedule_t *p = schedule;
+        while (p->next) {
+            p = p->next;
+        }
+        p->next = new_schedule;
+    }
+
+    return schedule;
+}
+
+
+///
+// Create a new item and place it at the head of a schedule.
+//
+//   schedule   current head of the list (may be NULL)
+//   comm       communicator used for the transfer
+//   peer       rank of the peer within comm
+//   direction  COMM_SCHEDULE_SEND or COMM_SCHEDULE_RECV
+//   addr       base address of the buffer
+//   type       MPI datatype; a duplicate made with MPI_Type_dup is stored
+//   count      number of datatype elements
+//
+// Returns the new item, which is the new head of the list.  There is no
+// way to report failure to the caller, so an allocation failure aborts
+// the job instead of writing through a NULL pointer.
+///
+comm_schedule_t *comm_schedule_prepend(comm_schedule_t *schedule,
+                                       MPI_Comm comm,
+                                       int peer,
+                                       int direction,
+                                       void *addr,
+                                       MPI_Datatype type,
+                                       int count)
+{
+    comm_schedule_t *new_schedule;
+
+    new_schedule = (comm_schedule_t *) malloc(sizeof *new_schedule);
+    if (!new_schedule) {
+        perror("out of memory");
+        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+        abort();  // only reached if MPI_Abort returns
+    }
+    new_schedule->next = schedule;
+    new_schedule->comm = comm;
+    new_schedule->peer = peer;
+    new_schedule->direction = direction;
+    new_schedule->addr = addr;
+    MPI_Type_dup(type, &new_schedule->type);
+    new_schedule->count = count;
+    new_schedule->req = MPI_REQUEST_NULL;
+    return new_schedule;
+}
+
+
+// Release every item of a schedule: free the duplicated MPI datatype
+// held by the item, then the item itself.
+void comm_schedule_free(comm_schedule_t *schedule)
+{
+    comm_schedule_t *item = schedule;
+
+    while (item != NULL) {
+        // remember the successor before the item's storage goes away
+        comm_schedule_t *following = item->next;
+
+        MPI_Type_free(&item->type);
+        free(item);
+        item = following;
+    }
+}
+
+
+// Post the non-blocking send or receive of every item in list order,
+// printing each item first when debugging is enabled.  Completion is
+// not awaited here.
+void comm_schedule_start(comm_schedule_t *schedule)
+{
+    comm_schedule_t *item = schedule;
+
+    while (item != NULL) {
+        if (debug) {
+            print_schedule(item);
+        }
+
+        switch (item->direction) {
+        case COMM_SCHEDULE_SEND:
+            MPI_Isend(item->addr, item->count, item->type, item->peer, 0,
+                      item->comm, &item->req);
+            break;
+        case COMM_SCHEDULE_RECV:
+            MPI_Irecv(item->addr, item->count, item->type, item->peer, 0,
+                      item->comm, &item->req);
+            break;
+        default:
+            perror("unknown direction in communication schedule");
+            break;
+        }
+
+        item = item->next;
+    }
+}
+
+
+// Call MPI_Test once on the request of every item; the completion flag
+// is discarded.
+void comm_schedule_progress(comm_schedule_t *schedule)
+{
+    comm_schedule_t *item = schedule;
+
+    while (item != NULL) {
+        int completed;
+
+        MPI_Test(&item->req, &completed, MPI_STATUS_IGNORE);
+        item = item->next;
+    }
+}
+
+
+///
+// Wait until every item of a schedule has completed.
+//
+// The request handles are gathered into a temporary array sized to the
+// schedule, completed with a single MPI_Waitall, and then copied back so
+// the items hold the handles as updated by MPI rather than stale ones.
+// If the temporary array cannot be allocated the items are waited for
+// one at a time instead.  An empty schedule returns immediately.
+///
+void comm_schedule_wait(comm_schedule_t *schedule)
+{
+    MPI_Request *requests;
+    int count = 0;
+    int i;
+
+    for (comm_schedule_t *s = schedule; s; s = s->next) {
+        ++count;
+    }
+    if (count == 0) {
+        return;
+    }
+
+    requests = (MPI_Request *) malloc((size_t) count * sizeof *requests);
+    if (!requests) {
+        // no memory for the batch: complete the requests in place
+        for (comm_schedule_t *s = schedule; s; s = s->next) {
+            MPI_Wait(&s->req, MPI_STATUS_IGNORE);
+        }
+        return;
+    }
+
+    i = 0;
+    for (comm_schedule_t *s = schedule; s; s = s->next) {
+        requests[i++] = s->req;
+    }
+
+    MPI_Waitall(count, requests, MPI_STATUSES_IGNORE);
+
+    // hand the updated handles back to the schedule items
+    i = 0;
+    for (comm_schedule_t *s = schedule; s; s = s->next) {
+        s->req = requests[i++];
+    }
+
+    free(requests);
+}
+
+
+// Run a whole schedule synchronously: start every transfer, then block
+// until all of them have completed.
+void comm_schedule_execute(comm_schedule_t *schedule)
+{
+    comm_schedule_start(schedule);
+    comm_schedule_wait(schedule);
+}
diff --git a/src/halo-finder/src/dfft/comm-schedule.h b/src/halo-finder/src/dfft/comm-schedule.h
new file mode 100644
index 0000000..98fee0e
--- /dev/null
+++ b/src/halo-finder/src/dfft/comm-schedule.h
@@ -0,0 +1,96 @@
+///
+// A communication schedule is a list of sends and recvs to execute.
+//
+// See test-comm-schedule.c for an example of use.
+///
+
+#ifndef COMM_SCHEDULE_H
+#define COMM_SCHEDULE_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+
+// The system and MPI headers are included before the linkage block is
+// opened, so that whatever they declare for C++ translation units is not
+// placed inside extern "C".
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+///
+// enumeration of directions (send or recv) in a communication schedule
+///
+enum { COMM_SCHEDULE_SEND = 1, COMM_SCHEDULE_RECV = 2 };
+
+///
+// descriptor for one item of a communication schedule (singly linked list)
+//   next       descriptor for the next item in the schedule
+//   addr       the base address for this item
+//   type       MPI datatype for this item
+//   comm       MPI communicator for this item
+//   req        MPI request handle of the send/recv for this item
+//   peer       the rank of my peer for this item
+//   direction  send or recv (COMM_SCHEDULE_SEND / COMM_SCHEDULE_RECV)
+//   count      number of MPI datatypes to send for this item
+///
+typedef struct comm_schedule_t comm_schedule_t;
+struct comm_schedule_t {
+    comm_schedule_t *next;
+    void *addr;
+    MPI_Datatype type;
+    MPI_Comm comm;
+    MPI_Request req;
+    int peer;
+    int direction;
+    int count;
+};
+
+///
+// append a new item to a communication schedule
+//
+// The item is linked after the current last item.  Returns the head of
+// the list: the new item if schedule was NULL, otherwise schedule.
+// The datatype is duplicated; the duplicate is released by
+// comm_schedule_free().
+///
+comm_schedule_t *comm_schedule_append(comm_schedule_t *schedule,
+                                      MPI_Comm comm,
+                                      int peer,
+                                      int direction,
+                                      void *addr,
+                                      MPI_Datatype type,
+                                      int count);
+
+///
+// prepend a new item to a communication schedule
+//
+// Returns the new item, which becomes the head of the list.
+// The datatype is duplicated; the duplicate is released by
+// comm_schedule_free().
+///
+comm_schedule_t *comm_schedule_prepend(comm_schedule_t *schedule,
+                                       MPI_Comm comm,
+                                       int peer,
+                                       int direction,
+                                       void *addr,
+                                       MPI_Datatype type,
+                                       int count);
+
+///
+// start execution of a communication schedule
+//
+// Posts the non-blocking send or recv of every item in list order.
+///
+void comm_schedule_start(comm_schedule_t *schedule);
+
+///
+// progress an already started communication schedule
+//
+// Tests the request of every item once without blocking.
+///
+void comm_schedule_progress(comm_schedule_t *schedule);
+
+///
+// wait for completion of a communication schedule
+//
+// Blocks until the requests of all items have completed.
+///
+void comm_schedule_wait(comm_schedule_t *schedule);
+
+///
+// execute a communication schedule
+//
+// Equivalent to comm_schedule_start() followed by comm_schedule_wait().
+///
+void comm_schedule_execute(comm_schedule_t *schedule);
+
+///
+// delete a communication schedule
+//
+// Frees the duplicated datatype and the storage of every item.
+///
+void comm_schedule_free(comm_schedule_t *schedule);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/halo-finder/src/dfft/complex-type.h b/src/halo-finder/src/dfft/complex-type.h
new file mode 100644
index 0000000..aa5a752
--- /dev/null
+++ b/src/halo-finder/src/dfft/complex-type.h
@@ -0,0 +1,36 @@
+// Compatibility file for C99 and C++ complex.  This header
+// can be included by either C99 or ANSI C++ programs to
+// allow complex arithmetic to be written in a common subset.
+// Note that overloads for both the real and complex math
+// functions are available after this header has been
+// included.
+
+#ifndef COMPLEX_TYPE_H
+#define COMPLEX_TYPE_H
+
+#ifdef __cplusplus
+
+#include <cmath>
+#include <complex>
+
+// C++ branch: complex_t is std::complex<double>.
+typedef std::complex<double> complex_t;
+
+// Imaginary unit, spelled like the C99 macro of the same name.
+#define I complex_t(0.0, 1.0)
+
+#else
+
+#include <complex.h>
+#include <math.h>
+
+// C99 branch: complex_t is the built-in double complex type.
+typedef double complex complex_t;
+
+// Constructor-style spelling so complex_t(r, i) works in both languages.
+#define complex_t(r,i) ((double)(r) + ((double)(i)) * I)
+
+#define real(x) creal(x)
+#define imag(x) cimag(x)
+// cabs() rather than fabs(): converting a complex value to the double
+// parameter of fabs() would silently drop the imaginary part.  For a
+// real argument cabs() still yields its absolute value.
+#define abs(x) cabs(x)
+#define arg(x) carg(x)
+
+#endif  // #ifdef __cplusplus
+
+#endif  // #ifndef COMPLEX_TYPE_H
diff --git a/src/halo-finder/src/dfft/cross.hpp b/src/halo-finder/src/dfft/cross.hpp
new file mode 100644
index 0000000..2049e27
--- /dev/null
+++ b/src/halo-finder/src/dfft/cross.hpp
@@ -0,0 +1,280 @@
+#ifndef CROSS_HPP
+#define CROSS_HPP
+
+#include "complex-type.h"
+
+#ifdef FFTW3
+#include <fftw3-mpi.h>
+#else
+#include <fftw_mpi.h>
+#endif
+
+#include <algorithm>
+#include <vector>
+
+#include "allocator.hpp"
+#include "distribution.hpp"
+#include "solver.hpp"
+
+#include <string.h>
+
+// pgCC doesn't yet play well with C99 constructs, so...
+#ifdef __PGI__
+extern "C" long int lrint(double x);
+#endif
+
+#define FFTW_ADDR(X) reinterpret_cast<fftw_complex*>(&(X)[0])
+
+///
+// Distributed FFT driver that keeps the transforms of two input fields
+// so that they can be combined in k-space.
+//
+// Buffer contents after forward(rho1, rho2):
+//   m_buf3  transform of rho1
+//   m_buf2  transform of rho2
+//   m_buf1  scratch; input of the backward transform in backward_xi()
+///
+class CrossBase : public Distribution {
+
+public:
+
+  // methods
+
+
+
+  // Default construction creates no plans; initialize() must be called
+  // before any transform.  The plan handles start out null so that the
+  // destructor never touches an indeterminate handle.
+  CrossBase()
+    : m_plan_f(0), m_plan_b(0)
+  {
+  }
+
+
+
+  // Cubic grid with ng points per dimension.
+  CrossBase(MPI_Comm comm, int ng)
+    : Distribution(comm, ng), m_plan_f(0), m_plan_b(0)
+  {
+    std::vector<int> n;
+    n.assign(3, ng);
+    initialize(comm, n);
+  }
+
+
+
+  // Grid with n[0] x n[1] x n[2] points.
+  CrossBase(MPI_Comm comm, std::vector<int> const & n)
+    : Distribution(comm, n), m_plan_f(0), m_plan_b(0)
+  {
+    initialize(comm, n);
+  }
+
+
+
+  virtual ~CrossBase()
+  {
+    destroy_plans();
+  }
+
+
+
+  // solve interfaces
+
+
+
+  ///
+  // Forward-transform both fields.  Afterwards m_buf3 holds the transform
+  // of rho1 and m_buf2 the transform of rho2.
+  //
+  // FFTW2 note: fftwnd_mpi transforms its data argument in place and uses
+  // the second buffer only as workspace, so m_buf1 is passed as workspace;
+  // using m_buf3 there would overwrite the saved transform of rho1.
+  ///
+  void forward(complex_t const *rho1, complex_t const *rho2)
+  {
+
+    //forward rho1
+#ifdef FFTW3
+    distribution_3_to_1(rho1, &m_buf1[0], &m_d);            // rho1 --> buf1
+    fftw_execute(m_plan_f);                                 // buf1 --> buf2
+#else
+    distribution_3_to_1(rho1, &m_buf2[0], &m_d);            // rho1 --> buf2
+    fftwnd_mpi(m_plan_f, 1, FFTW_ADDR(m_buf2), FFTW_ADDR(m_buf1),
+               FFTW_NORMAL_ORDER);                          // buf2 in place, buf1 = work
+#endif
+
+    //copy transformed rho1 (in buf2) to a safe place (buf3)
+    memcpy( &m_buf3[0], &m_buf2[0], local_size()*sizeof(complex_t) );
+
+    //forward rho2
+#ifdef FFTW3
+    distribution_3_to_1(rho2, &m_buf1[0], &m_d);            // rho2 --> buf1
+    fftw_execute(m_plan_f);                                 // buf1 --> buf2
+#else
+    distribution_3_to_1(rho2, &m_buf2[0], &m_d);            // rho2 --> buf2
+    fftwnd_mpi(m_plan_f, 1, FFTW_ADDR(m_buf2), FFTW_ADDR(m_buf1),
+               FFTW_NORMAL_ORDER);                          // buf2 in place, buf1 = work
+#endif
+  }
+
+
+
+  ///
+  // Multiply the transform of rho2 by the complex conjugate of the
+  // transform of rho1, zero the (0,0,0) mode, transform backward and
+  // redistribute the result into phi.  m_buf3 is overwritten.
+  ///
+  void backward_xi(complex_t *phi)
+  {
+    //now, transformed rho1 in buf3, transformed rho2 in buf2
+    //need intermediate result in buf1
+    for(int i=0; i<local_size(); i++) {
+      m_buf1[i] = m_buf2[i] * conj(m_buf3[i]);
+    }
+
+    //it would be nice to set (0,0,0) mode to 0
+    int index = 0;
+    for (int local_k0 = 0; local_k0 < local_ng_1d(0); ++local_k0) {
+      int k0 = local_k0 + self_1d(0) * local_ng_1d(0);
+      for (int k1 = 0; k1 < local_ng_1d(1); ++k1) {
+        for (int k2 = 0; k2 < local_ng_1d(2); ++k2) {
+          if(k0 == 0 && k1==0 && k2==0) {
+            m_buf1[index] *= 0.0;
+          }
+          index++;
+        }
+        index += m_d.padding[2];
+      }
+      index += m_d.padding[1];
+    }
+
+#ifdef FFTW3
+    fftw_execute(m_plan_b);                                 // buf1 --> buf3
+    distribution_1_to_3(&m_buf3[0], phi, &m_d);             // buf3 --> phi
+#else
+    fftwnd_mpi(m_plan_b, 1, FFTW_ADDR(m_buf1), FFTW_ADDR(m_buf3),
+               FFTW_NORMAL_ORDER);                          // buf1 in place, buf3 = work
+    distribution_1_to_3(&m_buf1[0], phi, &m_d);             // buf1 --> phi
+#endif
+  }
+
+
+
+  // interfaces for std::vector
+
+
+
+  void forward(std::vector<complex_t> const & rho1,
+               std::vector<complex_t> const & rho2)
+  {
+    forward(&rho1[0], &rho2[0]);
+  }
+
+
+
+  void backward_xi(std::vector<complex_t> & phi)
+  {
+    backward_xi(&phi[0]);
+  }
+
+
+
+  // analysis interfaces
+
+  ///
+  // calculate the k-space cross power spectrum of the two transformed fields
+  //   P(modk) = Sum { Re(rho1(k) * conj(rho2(k))) : |k| = modk } / weight(modk)
+  // with k periodically extended, |k| rounded to the nearest integer, and
+  // weight(modk) = (number of contributing modes) * ng^3, or 1 where no
+  // mode contributes.  power is resized to the largest rounded |k| plus one
+  // and holds the result summed over all ranks.
+  ///
+  void power_spectrum(std::vector<double> & power)
+  {
+    std::vector<complex_t, fftw_allocator<complex_t> > const & rho1 = m_buf3;
+    std::vector<complex_t, fftw_allocator<complex_t> > const & rho2 = m_buf2;
+    std::vector<int> ksq;
+    std::vector<double> weight;
+    int ng = m_d.n[0];
+    double volume = 1.0 * ng * ng * ng;
+
+    // cache periodic ksq: indices above ng / 2 map to negative wavenumbers.
+    // Every entry is filled, including the last one when ng is odd.
+    ksq.resize(ng);
+    double ksq_max = 0;
+    for (int k = 0; k < ng; ++k) {
+      int kk = (k <= ng / 2) ? k : k - ng;
+      ksq[k] = kk * kk;
+      ksq_max = max(ksq_max, ksq[k]);
+    }
+    long modk_max = lrint(sqrt(3 * ksq_max)); // round to nearest integer
+
+    // calculate power spectrum
+    power.assign(modk_max + 1, 0.0);
+    weight.assign(modk_max + 1, 0.0);
+    int index = 0;
+    for (int local_k0 = 0; local_k0 < local_ng_1d(0); ++local_k0) {
+      int k0 = local_k0 + self_1d(0) * local_ng_1d(0);
+      double ksq0 = ksq[k0];
+      for (int k1 = 0; k1 < local_ng_1d(1); ++k1) {
+        double ksq1 = ksq[k1];
+        for (int k2 = 0; k2 < local_ng_1d(2); ++k2) {
+          double ksq2 = ksq[k2];
+          // round to nearest integer
+          long modk = lrint(sqrt(ksq0 + ksq1 + ksq2));
+          power[modk] += real(rho1[index] * conj(rho2[index]));
+          weight[modk] += volume;
+          index++;
+        }
+        index += m_d.padding[2];
+      }
+      index += m_d.padding[1];
+    }
+
+    // accumulate across processors
+    MPI_Allreduce(MPI_IN_PLACE, &power[0], static_cast<int>(power.size()),
+                  MPI_DOUBLE, MPI_SUM, cart_1d());
+    MPI_Allreduce(MPI_IN_PLACE, &weight[0], static_cast<int>(weight.size()),
+                  MPI_DOUBLE, MPI_SUM, cart_1d());
+
+    //make sure we don't divide by zero
+    for (std::vector<double>::size_type i = 0; i < weight.size(); ++i) {
+      weight[i] += 1.0 * (weight[i] < 1.0);
+    }
+
+    // scale power by weight
+    std::transform(power.begin(), power.end(), weight.begin(), power.begin(), std::divides<double>());
+  }
+
+
+
+  ///
+  // General initialization: set up the distribution, size the three work
+  // buffers and create the forward and backward plans.  Plans left over
+  // from an earlier call are destroyed first.
+  ///
+  void initialize(MPI_Comm comm, std::vector<int> n, bool transposed_order = false)
+  {
+    int flags_f;
+    int flags_b;
+
+    distribution_init(comm, &n[0], &n[0], &m_d, false);
+    distribution_assert_commensurate(&m_d);
+#ifdef FFTW3
+    fftw_mpi_init();
+#endif
+    // old plans refer to the old buffer storage; drop them before resizing
+    destroy_plans();
+    m_buf1.resize(local_size());
+    m_buf2.resize(local_size());
+    m_buf3.resize(local_size());
+
+    // create plan for forward and backward DFT's
+    flags_f = flags_b = FFTW_ESTIMATE;
+#ifdef FFTW3
+    if (transposed_order) {
+      flags_f |= FFTW_MPI_TRANSPOSED_OUT;
+      flags_b |= FFTW_MPI_TRANSPOSED_IN;
+    }
+    m_plan_f = fftw_mpi_plan_dft_3d(n[0], n[1], n[2],
+                                    FFTW_ADDR(m_buf1), FFTW_ADDR(m_buf2),
+                                    comm, FFTW_FORWARD, flags_f);
+    m_plan_b = fftw_mpi_plan_dft_3d(n[0], n[1], n[2],
+                                    FFTW_ADDR(m_buf1), FFTW_ADDR(m_buf3),
+                                    comm, FFTW_BACKWARD, flags_b);
+#else
+    m_plan_f = fftw3d_mpi_create_plan(comm, n[0], n[1], n[2], FFTW_FORWARD, flags_f);
+    m_plan_b = fftw3d_mpi_create_plan(comm, n[0], n[1], n[2], FFTW_BACKWARD, flags_b);
+#endif
+  }
+
+
+
+protected:
+    // Destroy whichever plans exist and reset the handles to null.
+    void destroy_plans()
+    {
+#ifdef FFTW3
+      if (m_plan_f) {
+        fftw_destroy_plan(m_plan_f);
+      }
+      if (m_plan_b) {
+        fftw_destroy_plan(m_plan_b);
+      }
+#else
+      if (m_plan_f) {
+        fftwnd_mpi_destroy_plan(m_plan_f);
+      }
+      if (m_plan_b) {
+        fftwnd_mpi_destroy_plan(m_plan_b);
+      }
+#endif
+      m_plan_f = 0;
+      m_plan_b = 0;
+    }
+
+    double max(double a, double b) { return a > b ? a : b; }
+    std::vector<complex_t, fftw_allocator<complex_t> > m_buf1;
+    std::vector<complex_t, fftw_allocator<complex_t> > m_buf2;
+    std::vector<complex_t, fftw_allocator<complex_t> > m_buf3;
+#ifdef FFTW3
+    fftw_plan m_plan_f;
+    fftw_plan m_plan_b;
+#else
+    fftwnd_mpi_plan m_plan_f;
+    fftwnd_mpi_plan m_plan_b;
+#endif
+};
+
+#endif
diff --git a/src/halo-finder/src/dfft/cycle.h b/src/halo-finder/src/dfft/cycle.h
new file mode 100644
index 0000000..2652a04
--- /dev/null
+++ b/src/halo-finder/src/dfft/cycle.h
@@ -0,0 +1,514 @@
+/*
+ * Copyright (c) 2003, 2007-8 Matteo Frigo
+ * Copyright (c) 2003, 2007-8 Massachusetts Institute of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+
+/* machine-dependent cycle counters code. Needs to be inlined. */
+
+/***************************************************************************/
+/* To use the cycle counters in your code, simply #include "cycle.h" (this
+   file), and then use the functions/macros:
+
+                 ticks getticks(void);
+
+   ticks is an opaque typedef defined below, representing the current time.
+   You extract the elapsed time between two calls to getticks() via:
+
+                 double elapsed(ticks t1, ticks t0);
+
+   which returns a double-precision variable in arbitrary units.  You
+   are not expected to convert this into human units like seconds; it
+   is intended only for *comparisons* of time intervals.
+
+   (In order to use some of the OS-dependent timer routines like
+   Solaris' gethrtime, you need to paste the autoconf snippet below
+   into your configure.ac file and #include "config.h" before cycle.h,
+   or define the relevant macros manually if you are not using autoconf.)
+*/
+
+/***************************************************************************/
+/* This file uses macros like HAVE_GETHRTIME that are assumed to be
+   defined according to whether the corresponding function/type/header
+   is available on your system.  The necessary macros are most
+   conveniently defined if you are using GNU autoconf, via the tests:
+   
+   dnl ---------------------------------------------------------------------
+
+   AC_C_INLINE
+   AC_HEADER_TIME
+   AC_CHECK_HEADERS([sys/time.h c_asm.h intrinsics.h mach/mach_time.h])
+
+   AC_CHECK_TYPE([hrtime_t],[AC_DEFINE(HAVE_HRTIME_T, 1, [Define to 1 if hrtime_t is defined in <sys/time.h>])],,[#if HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif])
+
+   AC_CHECK_FUNCS([gethrtime read_real_time time_base_to_time clock_gettime mach_absolute_time])
+
+   dnl Cray UNICOS _rtc() (real-time clock) intrinsic
+   AC_MSG_CHECKING([for _rtc intrinsic])
+   rtc_ok=yes
+   AC_TRY_LINK([#ifdef HAVE_INTRINSICS_H
+#include <intrinsics.h>
+#endif], [_rtc()], [AC_DEFINE(HAVE__RTC,1,[Define if you have the UNICOS _rtc() intrinsic.])], [rtc_ok=no])
+   AC_MSG_RESULT($rtc_ok)
+
+   dnl ---------------------------------------------------------------------
+*/
+
+/***************************************************************************/
+
+#if TIME_WITH_SYS_TIME
+# include <sys/time.h>
+# include <time.h>
+#else
+# if HAVE_SYS_TIME_H
+#  include <sys/time.h>
+# else
+#  include <time.h>
+# endif
+#endif
+
+/* Expands to the definition of elapsed(t1, t0) for tick types that can be
+   cast directly to double: the result is (double)t1 - (double)t0.  INL is
+   the inline keyword spelling to place in the definition. */
+#define INLINE_ELAPSED(INL) static INL double elapsed(ticks t1, ticks t0) \
+{									  \
+     return (double)t1 - (double)t0;					  \
+}
+
+/*----------------------------------------------------------------*/
+/* Solaris */
+/* Used when HAVE_GETHRTIME and HAVE_HRTIME_T are defined and no counter has
+   been selected yet. */
+#if defined(HAVE_GETHRTIME) && defined(HAVE_HRTIME_T) && !defined(HAVE_TICK_COUNTER)
+typedef hrtime_t ticks;
+
+/* getticks is a plain alias: getticks() calls gethrtime() directly. */
+#define getticks gethrtime
+
+INLINE_ELAPSED(inline)
+
+/* Marks that a counter has been selected; the following sections test it. */
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/* AIX v. 4+ routines to read the real-time clock or time-base register */
+#if defined(HAVE_READ_REAL_TIME) && defined(HAVE_TIME_BASE_TO_TIME) && !defined(HAVE_TICK_COUNTER)
+typedef timebasestruct_t ticks;
+
+/* Returns the value filled in by read_real_time(). */
+static __inline ticks getticks(void)
+{
+     ticks now;
+     read_real_time(&now, TIMEBASE_SZ);
+     return now;
+}
+
+/* Returns t1 - t0 in nanoseconds.  Both arguments are local copies and are
+   converted in place with time_base_to_time() before being subtracted. */
+static __inline double elapsed(ticks t1, ticks t0)
+{
+     double high_diff;
+     double low_diff;
+
+     time_base_to_time(&t1, TIMEBASE_SZ);
+     time_base_to_time(&t0, TIMEBASE_SZ);
+
+     high_diff = (double)t1.tb_high - (double)t0.tb_high;
+     low_diff = (double)t1.tb_low - (double)t0.tb_low;
+     return (high_diff * 1.0e9 + low_diff);
+}
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * PowerPC ``cycle'' counter using the time base register.
+ */
+#if ((((defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__))) || (defined(__MWERKS__) && defined(macintosh)))) || (defined(__IBM_GCC_ASM) && (defined(__powerpc__) || defined(__ppc__))))  && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long long ticks;
+
+/* Combines the words read by mftbu (upper 32 bits) and mftb (lower 32 bits)
+   into one 64-bit value. */
+static __inline__ ticks getticks(void)
+{
+     unsigned int tbl, tbu0, tbu1;
+
+     /* The upper word is read before and after the lower word; the reads
+        are repeated until both upper reads agree, so tbl is paired with a
+        consistent upper word. */
+     do {
+	  __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0));
+	  __asm__ __volatile__ ("mftb %0" : "=r"(tbl));
+	  __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1));
+     } while (tbu0 != tbu1);
+
+     return (((unsigned long long)tbu0) << 32) | tbl;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/* MacOS/Mach (Darwin) time-base register interface (unlike UpTime,
+   from Carbon, requires no additional libraries to be linked). */
+#if defined(HAVE_MACH_ABSOLUTE_TIME) && defined(HAVE_MACH_MACH_TIME_H) && !defined(HAVE_TICK_COUNTER)
+#include <mach/mach_time.h>
+typedef uint64_t ticks;
+/* getticks is a plain alias: getticks() calls mach_absolute_time(). */
+#define getticks mach_absolute_time
+INLINE_ELAPSED(__inline__)
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * Pentium cycle counter 
+ */
+#if (defined(__GNUC__) || defined(__ICC)) && defined(__i386__)  && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long long ticks;
+
+/* Executes rdtsc and returns the value delivered through the "=A" output
+   constraint. */
+static __inline__ ticks getticks(void)
+{
+     ticks ret;
+
+     __asm__ __volatile__("rdtsc": "=A" (ret));
+     /* no input, nothing else clobbered */
+     return ret;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#define TIME_MIN 5000.0   /* unreliable pentium IV cycle counter */
+#endif
+
+/* Visual C++ -- thanks to Morten Nissov for his help with this */
+#if _MSC_VER >= 1200 && _M_IX86 >= 500 && !defined(HAVE_TICK_COUNTER)
+#include <windows.h>
+typedef LARGE_INTEGER ticks;
+/* Emits the two opcode bytes 0fh 31h instead of using a mnemonic. */
+#define RDTSC __asm __emit 0fh __asm __emit 031h /* hack for VC++ 5.0 */
+
+/* Runs RDTSC and stores edx in HighPart and eax in LowPart of the result. */
+static __inline ticks getticks(void)
+{
+     ticks retval;
+
+     __asm {
+	  RDTSC
+	  mov retval.HighPart, edx
+	  mov retval.LowPart, eax
+     }
+     return retval;
+}
+
+/* ticks is a LARGE_INTEGER here, so the difference is taken on QuadPart
+   rather than through INLINE_ELAPSED. */
+static __inline double elapsed(ticks t1, ticks t0)
+{  
+     return (double)t1.QuadPart - (double)t0.QuadPart;
+}  
+
+#define HAVE_TICK_COUNTER
+#define TIME_MIN 5000.0   /* unreliable pentium IV cycle counter */
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * X86-64 cycle counter
+ */
+#if (defined(__GNUC__) || defined(__ICC) || defined(__SUNPRO_C)) && defined(__x86_64__)  && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long long ticks;
+
+/* Executes rdtsc and returns the 64-bit value formed from its two 32-bit
+   outputs: "=a" supplies the low half and "=d" the high half. */
+static __inline__ ticks getticks(void)
+{
+     unsigned a, d;
+
+     /* Use the __asm__/__volatile__ spellings, as the other GCC sections of
+        this file do: plain "asm" is not a keyword in strict ISO modes such
+        as -std=c11, so the previous "asm volatile" failed to compile there. */
+     __asm__ __volatile__("rdtsc" : "=a" (a), "=d" (d));
+     return ((ticks)a) | (((ticks)d) << 32);
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/* PGI compiler, courtesy Cristiano Calonaci, Andrea Tarsi, & Roberto Gori.
+   NOTE: this code will fail to link unless you use the -Masmkeyword compiler
+   option (grrr). */
+#if defined(__PGI) && defined(__x86_64__) && !defined(HAVE_TICK_COUNTER) 
+typedef unsigned long long ticks;
+/* NOTE(review): there is no return statement.  The asm sequence shifts rdx
+   left by 32 and ORs it into rax, so the function presumably relies on rax
+   being the return-value register -- confirm for the PGI compiler. */
+static ticks getticks(void)
+{
+    asm(" rdtsc; shl    $0x20,%rdx; mov    %eax,%eax; or     %rdx,%rax;    ");
+}
+INLINE_ELAPSED(__inline__)
+#define HAVE_TICK_COUNTER
+#endif
+
+/* Visual C++, courtesy of Dirk Michaelis */
+#if _MSC_VER >= 1400 && (defined(_M_AMD64) || defined(_M_X64)) && !defined(HAVE_TICK_COUNTER)
+
+#include <intrin.h>
+#pragma intrinsic(__rdtsc)
+typedef unsigned __int64 ticks;
+/* getticks is a plain alias: getticks() calls the __rdtsc intrinsic. */
+#define getticks __rdtsc
+INLINE_ELAPSED(__inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * IA64 cycle counter
+ */
+
+/* intel's icc/ecc compiler */
+#if (defined(__EDG_VERSION) || defined(__ECC)) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long ticks;
+#include <ia64intrin.h>
+
+/* Returns the register selected by _IA64_REG_AR_ITC, read through the
+   __getReg intrinsic. */
+static __inline__ ticks getticks(void)
+{
+     return __getReg(_IA64_REG_AR_ITC);
+}
+ 
+INLINE_ELAPSED(__inline__)
+ 
+#define HAVE_TICK_COUNTER
+#endif
+
+/* gcc */
+#if defined(__GNUC__) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long ticks;
+
+/* Returns the value of ar.itc, moved into a general register by inline
+   assembly. */
+static __inline__ ticks getticks(void)
+{
+     ticks ret;
+
+     __asm__ __volatile__ ("mov %0=ar.itc" : "=r"(ret));
+     return ret;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/* HP/UX IA64 compiler, courtesy Teresa L. Johnson: */
+#if defined(__hpux) && defined(__ia64) && !defined(HAVE_TICK_COUNTER)
+#include <machine/sys/inline.h>
+typedef unsigned long ticks;
+
+/* Returns the register selected by _AREG_ITC, read through the
+   _Asm_mov_from_ar intrinsic. */
+static inline ticks getticks(void)
+{
+     ticks ret;
+
+     ret = _Asm_mov_from_ar (_AREG_ITC);
+     return ret;
+}
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/* Microsoft Visual C++ */
+#if defined(_MSC_VER) && defined(_M_IA64) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned __int64 ticks;
+
+/* __getReg is declared by hand (with C linkage under C++) and then marked
+   as a compiler intrinsic. */
+#  ifdef __cplusplus
+extern "C"
+#  endif
+ticks __getReg(int whichReg);
+#pragma intrinsic(__getReg)
+
+/* Returns __getReg(3116).
+   NOTE(review): 3116 is presumably the register number of ar.itc (the icc
+   section reads _IA64_REG_AR_ITC through the same intrinsic) -- confirm. */
+static __inline ticks getticks(void)
+{
+     volatile ticks temp;
+     temp = __getReg(3116);
+     return temp;
+}
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * PA-RISC cycle counter 
+ */
+/* The two platform macros are parenthesized so that the HAVE_TICK_COUNTER
+   guard applies to both.  Without the parentheses && bound tighter than ||,
+   and a compiler defining __hppa__ entered this section even when a counter
+   had already been selected. */
+#if (defined(__hppa__) || defined(__hppa)) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long ticks;
+
+#  ifdef __GNUC__
+/* Returns control register 16, read with mfctl through GCC inline asm. */
+static __inline__ ticks getticks(void)
+{
+     ticks ret;
+
+     __asm__ __volatile__("mfctl 16, %0": "=r" (ret));
+     /* no input, nothing else clobbered */
+     return ret;
+}
+#  else
+#  include <machine/inline.h>
+/* Returns control register 16, read with the _MFCTL macro. */
+static inline ticks getticks(void)
+{
+     register ticks ret;
+     _MFCTL(16, ret);
+     return ret;
+}
+#  endif
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/* S390, courtesy of James Treacy */
+#if defined(__GNUC__) && defined(__s390__) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long long ticks;
+
+/* Returns the value that the stck instruction stores into the local
+   variable; the asm receives the variable's address as an input and
+   declares "memory" and "cc" as clobbered. */
+static __inline__ ticks getticks(void)
+{
+     ticks cycles;
+     __asm__("stck 0(%0)" : : "a" (&(cycles)) : "memory", "cc");
+     return cycles;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+/*----------------------------------------------------------------*/
+#if defined(__GNUC__) && defined(__alpha__) && !defined(HAVE_TICK_COUNTER)
+/*
+ * The 32-bit cycle counter on alpha overflows pretty quickly, 
+ * unfortunately.  A 1GHz machine overflows in 4 seconds.
+ */
+typedef unsigned int ticks;
+
+/* Reads the counter with rpcc and returns only its low 32 bits. */
+static __inline__ ticks getticks(void)
+{
+     unsigned long cc;
+     __asm__ __volatile__ ("rpcc %0" : "=r"(cc));
+     return (cc & 0xFFFFFFFF);
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+#if defined(__GNUC__) && defined(__sparc_v9__) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long ticks;
+
+/* Returns the %tick register, read with the rd instruction. */
+static __inline__ ticks getticks(void)
+{
+     ticks ret;
+     __asm__ __volatile__("rd %%tick, %0" : "=r" (ret));
+     return ret;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+#if (defined(__DECC) || defined(__DECCXX)) && defined(__alpha) && defined(HAVE_C_ASM_H) && !defined(HAVE_TICK_COUNTER)
+#  include <c_asm.h>
+typedef unsigned int ticks;
+
+/* Reads the counter with rpcc through asm() and returns only its low
+   32 bits. */
+static __inline ticks getticks(void)
+{
+     unsigned long cc;
+     cc = asm("rpcc %v0");
+     return (cc & 0xFFFFFFFF);
+}
+
+INLINE_ELAPSED(__inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+/*----------------------------------------------------------------*/
+/* SGI/Irix */
+#if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_SGI_CYCLE) && !defined(HAVE_TICK_COUNTER)
+typedef struct timespec ticks;
+
+/* Returns the timespec filled in by clock_gettime(CLOCK_SGI_CYCLE). */
+static inline ticks getticks(void)
+{
+     ticks now;
+     clock_gettime(CLOCK_SGI_CYCLE, &now);
+     return now;
+}
+
+/* Returns t1 - t0 in nanoseconds: the difference of the seconds fields
+   scaled by 1e9 plus the difference of the nanoseconds fields. */
+static inline double elapsed(ticks t1, ticks t0)
+{
+     const double sec_diff = (double)t1.tv_sec - (double)t0.tv_sec;
+     const double nsec_diff = (double)t1.tv_nsec - (double)t0.tv_nsec;
+
+     return sec_diff * 1.0E9 + nsec_diff;
+}
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/* Cray UNICOS _rtc() intrinsic function */
+#if defined(HAVE__RTC) && !defined(HAVE_TICK_COUNTER)
+#ifdef HAVE_INTRINSICS_H
+#  include <intrinsics.h>
+#endif
+
+typedef long long ticks;
+
+/* getticks is a plain alias: getticks() calls _rtc(). */
+#define getticks _rtc
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/* MIPS ZBus */
+#if HAVE_MIPS_ZBUS_TIMER
+#if defined(__mips__) && !defined(HAVE_TICK_COUNTER)
+#include <sys/mman.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+typedef uint64_t ticks;
+
+/*
+ * Returns the 64-bit value read from one page of /dev/mem mapped at offset
+ * 0x10030000.  The mapping is created on the first successful call and
+ * cached in a function-local static pointer.
+ *
+ * If /dev/mem cannot be opened or mapped, the error is reported with
+ * perror() and 0 is returned.  Nothing is cached in that case, so the next
+ * call tries again instead of dereferencing a failed mapping.
+ */
+static inline ticks getticks(void)
+{
+  static uint64_t* addr = 0;
+
+  if (addr == 0)
+  {
+    uint32_t rq_addr = 0x10030000;
+    void *mapping;
+    int fd;
+    int pgsize;
+
+    pgsize = getpagesize();
+    fd = open ("/dev/mem", O_RDONLY | O_SYNC, 0);
+    if (fd < 0) {
+      perror("open");
+      return 0;
+    }
+    mapping = mmap(0, pgsize, PROT_READ, MAP_SHARED, fd, rq_addr);
+    if (mapping == MAP_FAILED) {
+      /* report before close() so that errno still belongs to mmap */
+      perror("mmap");
+      close(fd);
+      return 0;
+    }
+    close(fd);
+    addr = (uint64_t *)mapping;
+  }
+
+  return *addr;
+}
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+#endif /* HAVE_MIPS_ZBUS_TIMER */
+
diff --git a/src/halo-finder/src/dfft/dfft.hpp b/src/halo-finder/src/dfft/dfft.hpp
new file mode 100644
index 0000000..1a4ab63
--- /dev/null
+++ b/src/halo-finder/src/dfft/dfft.hpp
@@ -0,0 +1,215 @@
+#ifndef DFFT_HPP
+#define DFFT_HPP
+
+///
+// Distributed FFT
+//
+// This is a high-level interface providing FFT's on 3-d data
+// distribution.  The same data distribution (partition) is assumed in
+// both x and k space and is determined by the underlying Distribution
+// class.
+///
+
+#include <vector>
+
+#include "complex-type.h"
+
+#ifdef FFTW3
+#include <fftw3-mpi.h>
+#else
+#include <fftw_mpi.h>
+#endif
+
+#include "allocator.hpp"
+#include "distribution.hpp"
+
+// Address of element 0 of X, reinterpreted as a pointer to fftw_complex.
+#define FFTW_ADDR(X) reinterpret_cast<fftw_complex *>(&(X)[0])
+
+// Distributed FFT on top of the Distribution base class.
+//
+// Owns two work buffers and one forward and one backward FFTW plan.  Input
+// is redistributed with distribution_3_to_1() before a transform and the
+// result is redistributed with distribution_1_to_3() afterwards.
+//
+// NOTE(review): the class is copyable, but a copy would share the plan
+// handles with the original and both destructors would destroy them.
+class Dfft : public Distribution {
+
+public:
+
+    // Creates an object without plans; initialize() must be called before
+    // any transform.
+    Dfft()
+        : Distribution(), m_plan_f(0), m_plan_b(0)
+        {
+        }
+
+    // Cubic grid of ng points per dimension.
+    Dfft(MPI_Comm comm, int ng, bool transposed_order = false)
+        : Distribution(comm, ng), m_plan_f(0), m_plan_b(0)
+        {
+            std::vector<int> n;
+            n.assign(3, ng);
+            initialize(comm, &n[0], transposed_order);
+        }
+
+    // Grid of n[0] x n[1] x n[2] points.
+    Dfft(MPI_Comm comm, std::vector<int> const & n, bool transposed_order = false)
+        : Distribution(comm, n), m_plan_f(0), m_plan_b(0)
+        {
+            initialize(comm, &n[0], transposed_order);
+        }
+
+    // Grid of n[0] x n[1] x n[2] points.
+    Dfft(MPI_Comm comm, int const n[], bool transposed_order = false)
+        : Distribution(comm, n), m_plan_f(0), m_plan_b(0)
+        {
+            initialize(comm, n, transposed_order);
+        }
+
+    // Destroys the plans that were created; safe on an object that was
+    // default-constructed and never initialized.
+    ~Dfft()
+        {
+            destroy_plans();
+        }
+
+    // Sets up the distribution, sizes the work buffers and creates the
+    // forward and backward plans for an n[0] x n[1] x n[2] grid.  Plans left
+    // over from an earlier call are destroyed first.
+    //   transposed_order - with FFTW3, requests transposed output for the
+    //                      forward plan and transposed input for the backward
+    //                      plan; not consulted in the non-FFTW3 branch
+    void initialize(MPI_Comm comm, int const n[], bool transposed_order)
+        {
+            int padding[3] = { 0, 0, 0 };
+            int flags_f;
+            int flags_b;
+
+            destroy_plans();
+
+#ifdef FFTW3
+            fftw_mpi_init();
+#endif
+            distribution_init(comm, n, padding, &m_d, false);
+            distribution_assert_commensurate(&m_d);
+
+            m_buf1.resize(local_size());
+            m_buf2.resize(local_size());
+
+            // create plan for forward and backward DFT's
+            flags_f = flags_b = FFTW_ESTIMATE;
+#ifdef FFTW3
+            if (transposed_order) {
+                flags_f |= FFTW_MPI_TRANSPOSED_OUT;
+                flags_b |= FFTW_MPI_TRANSPOSED_IN;
+            }
+            // both plans transform buf1 --> buf2
+            m_plan_f = fftw_mpi_plan_dft_3d(n[0], n[1], n[2], FFTW_ADDR(m_buf1), FFTW_ADDR(m_buf2),
+                                            comm, FFTW_FORWARD, flags_f);
+            m_plan_b = fftw_mpi_plan_dft_3d(n[0], n[1], n[2], FFTW_ADDR(m_buf1), FFTW_ADDR(m_buf2),
+                                            comm, FFTW_BACKWARD, flags_b);
+#else
+            m_plan_f = fftw3d_mpi_create_plan(comm, n[0], n[1], n[2], FFTW_FORWARD, flags_f);
+            m_plan_b = fftw3d_mpi_create_plan(comm, n[0], n[1], n[2], FFTW_BACKWARD, flags_b);
+#endif
+        }
+
+    ///
+    // Forward transform
+    ///
+
+    // Real input: each value is widened to a complex number first.
+    void forward(double const *in, complex_t *out)
+        {
+            complexify(in, &m_buf2[0], m_buf2.size());              // in   --> buf2
+            distribution_3_to_1(&m_buf2[0], &m_buf1[0], &m_d);      // buf2 --> buf1
+#ifdef FFTW3
+            fftw_execute(m_plan_f);                                 // buf1 --> buf2
+            distribution_1_to_3(&m_buf2[0], out, &m_d);             // buf2 --> out
+#else
+            fftwnd_mpi(m_plan_f, 1,
+                       (fftw_complex *) &m_buf1[0],
+                       (fftw_complex *) &m_buf2[0],
+                       FFTW_NORMAL_ORDER);                          // buf1 --> buf1
+            distribution_1_to_3(&m_buf1[0], out, &m_d);             // buf1 --> out
+#endif
+        }
+
+    // Complex input.
+    void forward(complex_t const *in, complex_t *out)
+        {
+            distribution_3_to_1(in, &m_buf1[0], &m_d);              // in --> buf1
+#ifdef FFTW3
+            fftw_execute(m_plan_f);                                 // buf1 --> buf2
+            distribution_1_to_3(&m_buf2[0], out, &m_d);             // buf2 --> out
+#else
+            fftwnd_mpi(m_plan_f, 1,
+                       (fftw_complex *) &m_buf1[0],
+                       (fftw_complex *) &m_buf2[0],
+                       FFTW_NORMAL_ORDER);                          // buf1 --> buf1
+            distribution_1_to_3(&m_buf1[0], out, &m_d);             // buf1 --> out
+#endif
+        }
+
+    // Vector overloads: forward to the pointer versions.  The vectors are
+    // not resized or checked here.
+    void forward(std::vector<double> const & in, std::vector<complex_t> & out)
+        {
+            forward(&in[0], &out[0]);
+        }
+
+    void forward(std::vector<complex_t> const & in, std::vector<complex_t> & out)
+        {
+            forward(&in[0], &out[0]);
+        }
+
+    ///
+    // Backward transform
+    //
+    // Both overloads run the backward plan m_plan_b.  They previously ran
+    // m_plan_f, which applied the forward transform a second time.
+    ///
+
+    // Real output: only the real part of each result is kept.
+    void backward(complex_t const *in, double *out)
+        {
+#ifdef FFTW3
+            distribution_3_to_1(in, &m_buf1[0], &m_d);              // in --> buf1
+            fftw_execute(m_plan_b);                                 // buf1 --> buf2
+#else
+            // The transform below works in place on buf2 with buf1 as
+            // scratch, so the input has to be placed in buf2.
+            distribution_3_to_1(in, &m_buf2[0], &m_d);              // in --> buf2
+            fftwnd_mpi(m_plan_b, 1,
+                       (fftw_complex *) &m_buf2[0],
+                       (fftw_complex *) &m_buf1[0],
+                       FFTW_NORMAL_ORDER);                          // buf2 --> buf2
+#endif
+            distribution_1_to_3(&m_buf2[0], &m_buf1[0], &m_d);      // buf2 --> buf1
+            decomplexify(&m_buf1[0], out, m_buf1.size());           // buf1 --> out
+        }
+
+    // Complex output.
+    void backward(complex_t const *in, complex_t *out)
+        {
+#ifdef FFTW3
+            distribution_3_to_1(in, &m_buf1[0], &m_d);              // in --> buf1
+            fftw_execute(m_plan_b);                                 // buf1 --> buf2
+#else
+            // The transform below works in place on buf2 with buf1 as
+            // scratch, so the input has to be placed in buf2.
+            distribution_3_to_1(in, &m_buf2[0], &m_d);              // in --> buf2
+            fftwnd_mpi(m_plan_b, 1,
+                       (fftw_complex *) &m_buf2[0],
+                       (fftw_complex *) &m_buf1[0],
+                       FFTW_NORMAL_ORDER);                          // buf2 --> buf2
+#endif
+            distribution_1_to_3(&m_buf2[0], out, &m_d);             // buf2 --> out
+        }
+
+    // Vector overloads: forward to the pointer versions.  The vectors are
+    // not resized or checked here.
+    void backward(std::vector<complex_t> const & in, std::vector<double> & out)
+        {
+            backward(&in[0], &out[0]);
+        }
+
+    void backward(std::vector<complex_t> const & in, std::vector<complex_t> & out)
+        {
+            backward(&in[0], &out[0]);
+        }
+
+private:
+
+    // Destroys whichever plans exist and resets both handles to null.
+    void destroy_plans()
+        {
+#ifdef FFTW3
+            if (m_plan_f) fftw_destroy_plan(m_plan_f);
+            if (m_plan_b) fftw_destroy_plan(m_plan_b);
+#else
+            if (m_plan_f) fftwnd_mpi_destroy_plan(m_plan_f);
+            if (m_plan_b) fftwnd_mpi_destroy_plan(m_plan_b);
+#endif
+            m_plan_f = 0;
+            m_plan_b = 0;
+        }
+
+    // z[i] = r[i] for i in [0, size)
+    void complexify(double const *r, complex_t *z, size_t size)
+        {
+            for (size_t i = 0; i < size; ++i) {
+                z[i] = r[i];
+            }
+        }
+
+    // r[i] = real part of z[i] for i in [0, size)
+    void decomplexify(complex_t const *z, double *r, size_t size)
+        {
+            for (size_t i = 0; i < size; ++i) {
+                r[i] = real(z[i]);
+            }
+        }
+
+    std::vector<complex_t, fftw_allocator<complex_t> > m_buf1;
+    std::vector<complex_t, fftw_allocator<complex_t> > m_buf2;
+    // Plan handles; null until initialize() has created them.
+#ifdef FFTW3
+    fftw_plan m_plan_f;
+    fftw_plan m_plan_b;
+#else
+    fftwnd_mpi_plan m_plan_f;
+    fftwnd_mpi_plan m_plan_b;
+#endif
+};
+
+#endif
diff --git a/src/halo-finder/src/dfft/dims.c b/src/halo-finder/src/dfft/dims.c
new file mode 100644
index 0000000..3ac1a95
--- /dev/null
+++ b/src/halo-finder/src/dfft/dims.c
@@ -0,0 +1,47 @@
+#include "dims.h"
+#include <assert.h>
+
+/* Number of dimensions the MY_Dims_* functions accept. */
+#define DIMENSION 3
+
+/* Set to 1 by MY_Dims_init_3D(); MY_Dims_create_3D() tests it. */
+static int dims_init=0;
+/* Decomposition stored by MY_Dims_init_3D() and copied out by
+   MY_Dims_create_3D(). */
+static int dims_dims[DIMENSION] = {0,0,0};
+
+/*
+ * Stores a caller-chosen 3-d process decomposition for later use by
+ * MY_Dims_create_3D().
+ *
+ *   nnodes - total number of processes; must equal dims[0]*dims[1]*dims[2]
+ *   ndim   - number of dimensions; must be DIMENSION
+ *   dims   - process count per dimension; copied into file-scope storage
+ *
+ * Returns 0.  Both preconditions are checked with assert().
+ */
+int MY_Dims_init_3D(int nnodes, int ndim, int *dims) {
+  int i, tmp_nnodes=1;
+
+  assert(ndim == DIMENSION);
+
+  for(i=0; i<ndim; i++) {
+    dims_dims[i] = dims[i];
+    tmp_nnodes *= dims[i];
+  }
+
+  /* Compare, not assign: the previous "tmp_nnodes = nnodes" only tested
+     whether nnodes was non-zero and never checked the product. */
+  assert(tmp_nnodes == nnodes);
+
+  dims_init = 1;
+
+  return 0;
+}
+
+/*
+ * Fills dims[0..2] with a 3-d process decomposition.
+ *
+ * If MY_Dims_init_3D() has stored a decomposition, it is copied into dims
+ * and 0 is returned.  Otherwise the result of MPI_Dims_create() is
+ * returned, or -1 when built with USE_SERIAL_COSMO.
+ *
+ * ndim must be DIMENSION and dims must be all zero on entry; both are
+ * checked with assert().
+ */
+int MY_Dims_create_3D(int nnodes, int ndim, int *dims) {
+  assert(ndim == DIMENSION);
+  assert(dims[0] == 0);
+  assert(dims[1] == 0);
+  assert(dims[2] == 0);
+
+  if(dims_init != 0) {
+    int k;
+
+    for(k=0; k<ndim; k++) {
+      dims[k] = dims_dims[k];
+    }
+    return 0;
+  }
+
+#ifndef USE_SERIAL_COSMO
+  return MPI_Dims_create(nnodes, ndim, dims);
+#else
+  return -1;
+#endif
+}
diff --git a/src/halo-finder/src/dfft/dims.h b/src/halo-finder/src/dfft/dims.h
new file mode 100644
index 0000000..b401d68
--- /dev/null
+++ b/src/halo-finder/src/dfft/dims.h
@@ -0,0 +1,17 @@
+#ifndef DIMS_H
+#define DIMS_H
+
+#include <mpi.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Stores dims[0..2] as the decomposition that later MY_Dims_create_3D()
+   calls hand out; ndim must be 3.  Returns 0. */
+int MY_Dims_init_3D(int nnodes, int ndim, int *dims);
+/* Fills dims[0..2] (which must be all zero on entry; ndim must be 3) with
+   the stored decomposition if one was set, otherwise with the result of
+   MPI_Dims_create().  Returns 0 when the stored decomposition is used. */
+int MY_Dims_create_3D(int nnodes, int ndim, int *dims);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/halo-finder/src/dfft/distribution.c b/src/halo-finder/src/dfft/distribution.c
new file mode 100644
index 0000000..02a132e
--- /dev/null
+++ b/src/halo-finder/src/dfft/distribution.c
@@ -0,0 +1,1934 @@
+#include <assert.h>
+#include <mpi.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+#include "comm-schedule.h"
+#include "complex-type.h"
+#include "distribution.h"
+#include "bigchunk.h"
+#include "dims.h"
+
+#ifndef USE_SLAB_WORKAROUND
+#define USE_SLAB_WORKAROUND 0
+#endif
+
+// Constants 0..3, one per redistribution direction named in the identifier.
+// NOTE(review): the code that consumes them is outside this excerpt;
+// presumably they select between the 1-d, 2-d and 3-d layouts -- confirm.
+enum {
+  REDISTRIBUTE_1_TO_3,
+  REDISTRIBUTE_3_TO_1,
+  REDISTRIBUTE_2_TO_3,
+  REDISTRIBUTE_3_TO_2
+};
+
+//#define DEBUG_CONDITION (self == 0 || self == 1)
+#define DEBUG_CONDITION false
+
+// Punctuation for item i of an n-item list: '.' for the last item
+// (i == n - 1), ',' for every other index.
+static inline char separator(int i, int n)
+{
+  const int last = n - 1;
+
+  if (i == last) {
+    return '.';
+  }
+  return ',';
+}
+
+
+//Go from rank of processor to its cartesian coords, and vice versa.
+//Assumes the ranks increment in the z dimension then y then x.
+// Converts a rank of the 3-d process grid into its coordinates.
+//   myrank - rank to convert
+//   coord  - receives the three coordinates (coord[0..2])
+//   d      - distribution; process_topology_3.nproc[1] and [2] are read
+void Coord_cube(int myrank, int coord[], distribution_t *d)
+{
+  const int ny = d->process_topology_3.nproc[1];
+  const int nz = d->process_topology_3.nproc[2];
+  const int plane = ny * nz;            // ranks sharing one coord[0]
+  const int in_plane = myrank % plane;  // position within that plane
+
+  coord[0] = myrank / plane;
+  coord[1] = in_plane / nz;
+  coord[2] = in_plane % nz;
+}
+
+// Converts coordinates of the 3-d process grid into a rank: coord[2] varies
+// fastest, then coord[1], then coord[0].
+//   myrank - receives the rank
+//   coord  - the three coordinates (coord[0..2])
+//   d      - distribution; process_topology_3.nproc[1] and [2] are read
+void Rank_cube(int * myrank, int coord[], distribution_t *d)
+{
+  const int ny = d->process_topology_3.nproc[1];
+  const int nz = d->process_topology_3.nproc[2];
+
+  *myrank = coord[2] + nz * (coord[1] + ny * coord[0]);
+}
+
+
+/*
+  The subsequent functions are used to look up the ranks of x, y, and z
+  pencils from their coordinates, and vice versa.
+  The ordering of the ranks is such that pencils will be going through cubes
+  with the same rank sequentially. (Since the cubes' ranks are already
+  determined and cannot be changed, these routines figure out which ranks
+  the pencils should be assigned so that there are no communication hangs.)
+*/
+// Converts an x-pencil rank into its coordinates in the x-pencil process
+// grid.
+//   myrank - rank to convert
+//   coord  - receives the coordinates; coord[0] is always 0
+//   d      - distribution; process_topology_2_x and process_topology_3
+//            are read
+void Coord_x_pencils(int myrank, int coord[], distribution_t *d)
+{
+  // asserts only one processor in x_direction
+  assert(d->process_topology_2_x.nproc[0] == 1);
+  // x-pencils have a single process along x
+  coord[0] = 0;
+
+  const int cube_ny = d->process_topology_3.nproc[1];
+  const int cube_nz = d->process_topology_3.nproc[2];
+  const int pens_per_col = d->process_topology_2_x.nproc[1] / cube_ny;
+  const int pens_per_row = d->process_topology_2_x.nproc[2] / cube_nz;
+  const int cube_count = cube_nz * cube_ny;
+
+  // Ranks advance through the cubes one slot at a time: after the last cube
+  // the next rank is the second slot of the first cube, and so on.  'pass'
+  // counts how many complete sweeps over the cubes precede myrank.
+  const int pass = myrank / cube_count;
+
+  // Lowest rank within the cube that myrank belongs to.
+  const int base_rank = myrank - pass * cube_count;
+
+  // y and z coordinates of base_rank, shifted by the slot reached after
+  // 'pass' sweeps.
+  coord[1] = (base_rank / cube_nz) * pens_per_col + pass % pens_per_col;
+  coord[2] = (base_rank % cube_nz) * pens_per_row + pass / pens_per_col;
+}
+
+// Converts x-pencil coordinates into the x-pencil rank.
+//   myrank - receives the rank
+//   coord  - pencil coordinates; coord[1] and coord[2] are read
+//   d      - distribution; process_topology_2_x and process_topology_3
+//            are read
+// The number of pencils per cube must be non-zero in both y and z; this is
+// reported on stderr and then checked with assert().
+void Rank_x_pencils(int * myrank, int coord[], distribution_t *d)
+{
+  int num_pen_in_cube_col=d->process_topology_2_x.nproc[1]/d->process_topology_3.nproc[1];
+  int num_pen_in_cube_row=d->process_topology_2_x.nproc[2]/d->process_topology_3.nproc[2];
+
+  // Print both operands of each quotient before the assert fires.  The old
+  // messages printed a bare 0 for the column count and an unrelated nproc
+  // value for the row count, with no newline.
+  if(num_pen_in_cube_col == 0 || num_pen_in_cube_row == 0)
+    fprintf(stderr,
+	    "Rank_x_pencils: pencils per cube: col = %d/%d, row = %d/%d\n",
+	    d->process_topology_2_x.nproc[1], d->process_topology_3.nproc[1],
+	    d->process_topology_2_x.nproc[2], d->process_topology_3.nproc[2]);
+  assert(num_pen_in_cube_col !=0 && num_pen_in_cube_row !=0);
+
+  int alpha = coord[1]%num_pen_in_cube_col;
+  int num_cubes = (d->process_topology_3.nproc[2]*d->process_topology_3.nproc[1]);
+  int beta = coord[2]%num_pen_in_cube_row;
+  *myrank = (alpha*num_cubes) 
+    + ((coord[1]/num_pen_in_cube_col)*d->process_topology_3.nproc[2]) 
+    + (beta*(num_cubes)*num_pen_in_cube_col) + coord[2]/num_pen_in_cube_row;
+  return;
+}
+
+// Converts a y-pencil rank into its coordinates in the y-pencil process
+// grid.
+//   myrank - rank to convert
+//   coord  - receives the coordinates; coord[1] is always 0
+//   d      - distribution; process_topology_2_y and process_topology_3
+//            are read
+void Coord_y_pencils(int myrank, int coord[], distribution_t *d)
+{
+  // asserts only one processor in y_direction
+  assert(d->process_topology_2_y.nproc[1] == 1);
+  // y-pencils have a single process along y
+  coord[1] = 0;
+
+  const int pencil_nz = d->process_topology_2_y.nproc[2];
+  const int cube_nz = d->process_topology_3.nproc[2];
+  const int pens_per_row = pencil_nz / cube_nz;
+  const int within_row = myrank % pencil_nz;
+
+  coord[0] = myrank / pencil_nz;
+  coord[2] = (within_row / cube_nz) + (within_row % cube_nz) * pens_per_row;
+}
+
+// Converts y-pencil coordinates into the y-pencil rank.
+//   myrank - receives the rank
+//   coord  - pencil coordinates; coord[0] and coord[2] are read
+//   d      - distribution; process_topology_2_y and process_topology_3
+//            are read
+// The number of pencils per cube must be non-zero in both x and z; this is
+// reported on stderr and then checked with assert().
+void Rank_y_pencils(int * myrank, int coord[], distribution_t *d)
+{
+  int num_pen_in_cube_col = d->process_topology_2_y.nproc[0]/d->process_topology_3.nproc[0];
+  int num_pen_in_cube_row = d->process_topology_2_y.nproc[2]/d->process_topology_3.nproc[2];
+
+  // Replaces the diagnostics that were commented out here: they printed
+  // quantities other than the two quotients being checked.  Both operands
+  // of each quotient are printed before the assert fires.
+  if(num_pen_in_cube_col == 0 || num_pen_in_cube_row == 0)
+    fprintf(stderr,
+	    "Rank_y_pencils: pencils per cube: col = %d/%d, row = %d/%d\n",
+	    d->process_topology_2_y.nproc[0], d->process_topology_3.nproc[0],
+	    d->process_topology_2_y.nproc[2], d->process_topology_3.nproc[2]);
+  assert(num_pen_in_cube_col !=0 && num_pen_in_cube_row !=0);
+
+  int beta = coord[2]%num_pen_in_cube_row;
+  *myrank = coord[0]*d->process_topology_2_y.nproc[2] 
+    + beta*d->process_topology_3.nproc[2] 
+    + coord[2]/num_pen_in_cube_row;
+  return;
+}
+
+// Converts a z-pencil rank into its coordinates in the z-pencil process
+// grid.
+//   myrank - rank to convert
+//   coord  - receives the coordinates; coord[2] is always 0
+//   d      - distribution; process_topology_2_z and process_topology_3
+//            are read
+void Coord_z_pencils(int myrank, int coord[], distribution_t *d)
+{
+  // asserts only one processor in z_direction
+  assert(d->process_topology_2_z.nproc[2] == 1);
+  // z-pencils have a single process along z
+  coord[2] = 0;
+
+  const int pencil_ny = d->process_topology_2_z.nproc[1];
+  const int pens_per_col = pencil_ny / d->process_topology_3.nproc[1];
+  const int pens_per_row = d->process_topology_2_z.nproc[0] / d->process_topology_3.nproc[0];
+  const int pens_per_cube = d->process_topology_3.nproc[2];
+  const int band = pencil_ny * pens_per_row;  // ranks per step of 'alpha'
+  const int alpha = myrank / band;
+
+  coord[0] = alpha * pens_per_row + (myrank % pens_per_cube) / pens_per_col;
+  coord[1] = ((myrank % band) / pens_per_cube) * pens_per_col + myrank % pens_per_col;
+}
+
+// Converts z-pencil coordinates into the z-pencil rank.
+//   myrank - receives the rank
+//   coord  - pencil coordinates; coord[0] and coord[1] are read
+//   d      - distribution; process_topology_2_z and process_topology_3
+//            are read
+// The number of pencils per cube must be non-zero in both x and y; this is
+// reported on stderr and then checked with assert().
+void Rank_z_pencils(int * myrank, int coord[], distribution_t *d)
+{
+  int num_pen_in_cube_col = d->process_topology_2_z.nproc[1]/d->process_topology_3.nproc[1];
+  int num_pen_in_cube_row = d->process_topology_2_z.nproc[0]/d->process_topology_3.nproc[0];
+  int num_pen_in_cube = d->process_topology_3.nproc[2];
+
+  // Print both operands of each quotient before the assert fires.  The old
+  // messages printed a bare 0 for the column count and an unrelated nproc
+  // value for the row count, with no newline.
+  if(num_pen_in_cube_col == 0 || num_pen_in_cube_row == 0)
+    fprintf(stderr,
+	    "Rank_z_pencils: pencils per cube: col = %d/%d, row = %d/%d\n",
+	    d->process_topology_2_z.nproc[1], d->process_topology_3.nproc[1],
+	    d->process_topology_2_z.nproc[0], d->process_topology_3.nproc[0]);
+  assert(num_pen_in_cube_col !=0 && num_pen_in_cube_row !=0);
+
+  int alpha = coord[1]%num_pen_in_cube_col;
+  int beta = coord[0]%num_pen_in_cube_row;
+  *myrank = alpha 
+    + ((coord[1]/num_pen_in_cube_col)*num_pen_in_cube) 
+    + (beta*num_pen_in_cube_col) 
+    + (coord[0]/num_pen_in_cube_row)*d->process_topology_2_z.nproc[1]*num_pen_in_cube_row;
+  return;
+}
+
+
+// Appends one "<axis> DIMS FAILS" record, holding the pencil and cube process
+// grids, to error.data. The stream is closed before returning so the record
+// is flushed even if the caller aborts right afterwards; nothing is written
+// when error.data cannot be opened.
+static void distribution_log_dims_failure(const char *axis,
+					  const int pencil_nproc[],
+					  const int cube_nproc[])
+{
+  FILE *outfile = fopen("error.data","a");
+  if(outfile == NULL)
+    return;
+  fprintf(outfile,"%s DIMS FAILS:(%d,%d,%d) (%d,%d,%d) \n", axis,
+	  pencil_nproc[0], pencil_nproc[1], pencil_nproc[2],
+	  cube_nproc[0], cube_nproc[1], cube_nproc[2]);
+  fclose(outfile);
+}
+
+// Create the 1-d (slab), 2-d (x-, y- and z-pencil) and 3-d (cube) cartesian
+// data distributions for the ranks of comm.
+//   comm     - MPI communicator whose ranks are laid out
+//   n        - grid points per dimension, copied to d->n
+//   n_padded - padded points per dimension; d->padding[i] = n_padded[i]-n[i]
+//   d        - distribution to fill: process grids, cartesian communicators,
+//              local sizes and the d2_chunk/d3_chunk buffers
+//   debug    - stored in d->debug; enables extra stderr output and checks
+void distribution_init(MPI_Comm comm, 
+		       const int n[], 
+		       const int n_padded[], 
+		       distribution_t *d, bool debug)
+{
+/* 
+   The 1d process grid comes from MPI_Dims_create; the 3d process grid comes 
+   from MY_Dims_create_3D (the MPI_Dims_create call for it is commented out, 
+   presumably because its result can vary between machines -- TODO confirm). 
+   MY_Dims_create_3D is defined outside this function. For the 2d distribution 
+   pencils, MPI_Dims_create is also used, but the code then checks if the 
+   pencils it outputs fit inside the 3d cuboids that were created. If not, it 
+   tries swapping the dimensions of the pencils, and if they still do not fit, 
+   it takes the 3d cube's dimensions of processors (np1,np2,np3) and (for 
+   Z-pencils for example) makes pencils of the form (np1*np3,np2,1), or 
+   (np1,np2*np3,1) which fit inside the cubes. However, even then, it is not 
+   ensured that this will work since for example, if np1*np3 is bigger than 
+   the number of points in one dimension (Ng) then there are not enough points 
+   for each processor to have at least one point in that dimension. So the code
+   checks this and asserts three variables check_x_dims, check_y_dims, and 
+   check_z_dims, which will assert if these kinds of errors happen (as well as 
+   checking errors coming from picking the total number of processors and Ng 
+   in a way where the cubes will not fit for any orientation (like 100 procs 
+   and Ng=101!)). Currently the fix to these errors is to pick better values 
+   for Ng and the total number of processors that work, however a dedicated 
+   prime factorization method could also make 
+   pencils that fit inside the proper distribution (and we would not need so 
+   many checks). In the meantime, to pick these "better" values for Ng, the 
+   user should pick values such that: Ng % np1, Ng % np2, and Ng % np3 all 
+   equal zero, and that np1*np2, np2*np3, and np3*np1 are all less than Ng 
+   (in other words, the cubes created fit inside the number of grid points, 
+   and the number of pencils created is not more than the number of points 
+   in a dimension (Ng)).
+*/
+  int nproc; // number of ranks in comm
+  int self;  // rank of this process in comm
+  int ndim = 3;
+  int period[3];
+
+  MPI_Comm_rank(comm, &self);
+  MPI_Comm_size(comm, &nproc);
+
+  if (!self) 
+    printf("Initializing redistribution using a %s layout on %d ranks.\n",
+#ifdef PENCIL
+"pencil"
+#else
+"slab"
+#endif
+	   ,nproc);
+
+  d->debug = debug;
+  for (int i = 0; i < 3; ++i) {
+    d->n[i] = n[i];
+    d->padding[i] = n_padded[i] - n[i];
+  }
+
+
+
+  // set up process grid with 1d decomposition (SLABs)
+  d->process_topology_1.nproc[0] = 0; // 0: left for MPI_Dims_create to choose
+  d->process_topology_1.nproc[1] = 1; // don't distribute outer dimensions
+  d->process_topology_1.nproc[2] = 1; // don't distribute outer dimensions
+  period[0] = period[1] = period[2] = 1; // periodic in all three dimensions
+  //process_topology_1.nproc is filled with number of processors in each dim
+  MPI_Dims_create(nproc, ndim, d->process_topology_1.nproc); 
+
+  if(self == 0) {
+    printf("distribution 1D: [%d:%d:%d]\n",
+	   d->process_topology_1.nproc[0],
+	   d->process_topology_1.nproc[1],
+	   d->process_topology_1.nproc[2]);
+    fflush(stdout);
+  }
+
+  if (d->debug && 0 == self) {
+    fprintf(stderr, "Process grids:\n");
+    fprintf(stderr, "  1d: ");
+    for (int i = 0; i < ndim; ++i) {
+      fprintf(stderr, "%d%c", 
+	      d->process_topology_1.nproc[i], 
+	      separator(i, ndim));
+    }
+    fprintf(stderr, "\n");
+  }
+  // creates the cartesian communicator for the 1d layout
+  MPI_Cart_create(comm, ndim, d->process_topology_1.nproc, period, 0, 
+		  &d->process_topology_1.cart);
+  // reads back nproc, period and this rank's coordinates (.self)
+  MPI_Cart_get(d->process_topology_1.cart, ndim, d->process_topology_1.nproc, 
+	       d->process_topology_1.period, d->process_topology_1.self);
+  //calculates the local dimensions (number of points in each dimension)
+  d->process_topology_1.n[0] = n[0] / d->process_topology_1.nproc[0];
+  d->process_topology_1.n[1] = n[1] / d->process_topology_1.nproc[1];
+  d->process_topology_1.n[2] = n[2] / d->process_topology_1.nproc[2];
+  
+
+
+  // set up process grid with 3d decomposition (CUBE)
+  d->process_topology_3.nproc[0] = 0;
+  d->process_topology_3.nproc[1] = 0;
+  d->process_topology_3.nproc[2] = 0;
+  period[0] = period[1] = period[2] = 1;
+  //MPI_Dims_create(nproc, ndim, d->process_topology_3.nproc);
+  MY_Dims_create_3D(nproc, ndim, d->process_topology_3.nproc);
+
+  if(self == 0) {
+    printf("distribution 3D: [%d:%d:%d]\n",
+	   d->process_topology_3.nproc[0],
+	   d->process_topology_3.nproc[1],
+	   d->process_topology_3.nproc[2]);
+    fflush(stdout);
+  }
+  
+  if (d->debug && 0 == self) {
+    fprintf(stderr, "  3d: ");
+    for (int i = 0; i < ndim; ++i) {
+      fprintf(stderr, "%d%c", 
+	      d->process_topology_3.nproc[i], 
+	      separator(i, ndim));
+    }
+    fprintf(stderr, "\n");
+  }
+
+  MPI_Cart_create(comm, ndim, d->process_topology_3.nproc, period, 0, 
+		  &d->process_topology_3.cart);
+  //finds cartesian coordinate of this current rank
+  Coord_cube(self,d->process_topology_3.self,d);
+
+  if(debug){
+/*
+  this debug statement checks whether the coordinates found by 
+  calculation match MPI's coord system (MPI might differ between machines 
+  so this is why the code calculates the coord system itself, however with 
+  debug on, can check if it matches MPI (even though it is not enforced to 
+  match it otherwise)); a mismatch aborts.
+*/
+    int prev_coord[3];
+    prev_coord[0]=d->process_topology_3.self[0];
+    prev_coord[1]=d->process_topology_3.self[1];
+    prev_coord[2]=d->process_topology_3.self[2];
+    MPI_Cart_get(d->process_topology_3.cart, ndim, 
+		 d->process_topology_3.nproc, 
+		 d->process_topology_3.period, 
+		 d->process_topology_3.self);
+    for(int i=0; i < 3; i++)
+      if(prev_coord[i] != d->process_topology_3.self[i])
+	abort();
+  }
+  assert(n[0]%d->process_topology_3.nproc[0] == 0);
+  assert(n[0]%d->process_topology_3.nproc[1] == 0); // NOTE(review): tests n[0], not n[1]/n[2];
+  assert(n[0]%d->process_topology_3.nproc[2] == 0); // presumably assumes n[0]==n[1]==n[2] - confirm
+  
+  //set local dimensions
+  d->process_topology_3.n[0] = n[0] / d->process_topology_3.nproc[0];
+  d->process_topology_3.n[1] = n[1] / d->process_topology_3.nproc[1];
+  d->process_topology_3.n[2] = n[2] / d->process_topology_3.nproc[2];
+
+
+
+  // set up process grid with 2d decomposition (z_PENCILs )
+  d->process_topology_2_z.nproc[0] = 0;
+  d->process_topology_2_z.nproc[1] = 0;
+  d->process_topology_2_z.nproc[2] = 1; // don't distribute outer dimension 
+  period[0] = period[1] = period[2] = 1;
+  MPI_Dims_create(nproc, ndim, d->process_topology_2_z.nproc);
+  d->process_topology_2_z.n[0] = n[0] / d->process_topology_2_z.nproc[0];
+  d->process_topology_2_z.n[1] = n[1] / d->process_topology_2_z.nproc[1];
+  d->process_topology_2_z.n[2] = n[2] / d->process_topology_2_z.nproc[2];
+  //variable used to ensure that pencils created fit inside the cuboids, 
+  //if not the code will assert out.
+  bool check_z_dims=false; 
+  if(d->process_topology_2_z.n[0] != 0 
+     && d->process_topology_2_z.n[1] != 0 
+     && d->process_topology_2_z.n[2] != 0)
+  {// protects from dividing by zero.
+    check_z_dims = ((d->process_topology_3.n[0]) % (d->process_topology_2_z.n[0]) == 0) 
+      && ((d->process_topology_3.n[1]) % (d->process_topology_2_z.n[1]) == 0) 
+      && (n[0] % (d->process_topology_2_z.nproc[0]) == 0) 
+      && (n[0] % (d->process_topology_2_z.nproc[1]) == 0);
+    
+    if(self==0 && debug && !check_z_dims)
+      fprintf(stderr,"Need to fix Z PENCILS z_procs(%d,%d,%d) 3d.ns(%d,%d,%d) 2d_z.ns(%d,%d,%d).... \n", 
+	      d->process_topology_2_z.nproc[0],
+	      d->process_topology_2_z.nproc[1],
+	      d->process_topology_2_z.nproc[2],
+	      d->process_topology_3.n[0],
+	      d->process_topology_3.n[1],
+	      d->process_topology_3.n[2],
+	      d->process_topology_2_z.n[0],
+	      d->process_topology_2_z.n[1],
+	      d->process_topology_2_z.n[2]);
+   
+    //try swapping pencil dimensions if the current pencil dimensions do not 
+    //fit inside the cubes.
+    if(!(check_z_dims) 
+       && ((d->process_topology_3.n[0]) % (d->process_topology_2_z.n[1]) == 0) 
+       && ((d->process_topology_3.n[1]) % (d->process_topology_2_z.n[0]) == 0))
+    {
+
+      if(self==0 && debug)
+	fprintf(stderr,"Swaping Z pencils in initialization  (%d,%d,%d).... \n", 
+		d->process_topology_2_z.nproc[0],
+		d->process_topology_2_z.nproc[1],
+		d->process_topology_2_z.nproc[2]);
+      int temp=d->process_topology_2_z.nproc[0];
+      d->process_topology_2_z.nproc[0] = d->process_topology_2_z.nproc[1];
+      d->process_topology_2_z.nproc[1] = temp;
+      // nproc[2] (the undistributed dimension) is left unchanged
+      
+      d->process_topology_2_z.n[0] = n[0] / d->process_topology_2_z.nproc[0];
+      d->process_topology_2_z.n[1] = n[1] / d->process_topology_2_z.nproc[1];
+      d->process_topology_2_z.n[2] = n[2] / d->process_topology_2_z.nproc[2];
+      check_z_dims = ((d->process_topology_3.n[0]) % (d->process_topology_2_z.n[0]) == 0) 
+	&& ((d->process_topology_3.n[1]) % (d->process_topology_2_z.n[1]) == 0)
+	&& (n[0] % (d->process_topology_2_z.nproc[0]) == 0) 
+	&& (n[0] % (d->process_topology_2_z.nproc[1]) == 0);
+    }
+  } else {
+    check_z_dims=false;
+  }
+  /*
+    if that did not work, make a pencil that does fit inside the 3d cuboids by 
+    taking the cuboids dimensions (np1,np2,np3) and making pencils 
+    (np1,np2*np3,1), or (np1*np3,np2,1) on the most evenly distributed 
+    dimensions
+  */
+  if(!check_z_dims){
+    if(self==0 && debug)
+      fprintf(stderr,"MAKING Z PENCILS FIT zprocs(%d,%d,%d) z.ns(%d,%d,%d).... \n", 
+	      d->process_topology_2_z.nproc[0],
+	      d->process_topology_2_z.nproc[1],
+	      d->process_topology_2_z.nproc[2],
+	      d->process_topology_2_z.n[0],
+	      d->process_topology_2_z.n[1],
+	      d->process_topology_2_z.n[2]);
+    
+    d->process_topology_2_z.nproc[2]=1;
+    if(d->process_topology_3.n[0]>d->process_topology_3.n[1])
+    {
+      d->process_topology_2_z.nproc[1]=d->process_topology_3.nproc[1]*d->process_topology_3.nproc[2];
+      d->process_topology_2_z.nproc[0]=d->process_topology_3.nproc[0];
+      if((n[0] % (d->process_topology_2_z.nproc[0]) != 0) 
+	 || (n[0] % (d->process_topology_2_z.nproc[1]) != 0))
+      {
+	d->process_topology_2_z.nproc[0]=d->process_topology_3.nproc[0]*d->process_topology_3.nproc[2];
+	d->process_topology_2_z.nproc[1]=d->process_topology_3.nproc[1];
+      }
+    } else {
+      d->process_topology_2_z.nproc[0]=d->process_topology_3.nproc[0]*d->process_topology_3.nproc[2];
+      d->process_topology_2_z.nproc[1]=d->process_topology_3.nproc[1];
+      if((n[0] % (d->process_topology_2_z.nproc[0]) != 0) 
+	 || (n[0] % (d->process_topology_2_z.nproc[1]) != 0))
+      {
+	d->process_topology_2_z.nproc[1]=d->process_topology_3.nproc[1]*d->process_topology_3.nproc[2];
+	d->process_topology_2_z.nproc[0]=d->process_topology_3.nproc[0];
+      }
+    }
+    d->process_topology_2_z.n[0] = n[0] / d->process_topology_2_z.nproc[0];
+    d->process_topology_2_z.n[1] = n[1] / d->process_topology_2_z.nproc[1];
+    d->process_topology_2_z.n[2] = n[2] / d->process_topology_2_z.nproc[2];
+    if(self==0 && debug)
+      fprintf(stderr,"MAKING Z PENCILS FIT AFTER zprocs(%d,%d,%d) z.ns(%d,%d,%d)...\n", 
+	      d->process_topology_2_z.nproc[0],
+	      d->process_topology_2_z.nproc[1],
+	      d->process_topology_2_z.nproc[2],
+	      d->process_topology_2_z.n[0],
+	      d->process_topology_2_z.n[1],
+	      d->process_topology_2_z.n[2]);
+    if(d->process_topology_2_z.n[0] != 0 
+       && d->process_topology_2_z.n[1] != 0 
+       && d->process_topology_2_z.n[2] != 0)
+    {// protects from dividing by zero.
+      check_z_dims=((d->process_topology_3.n[0]) % (d->process_topology_2_z.n[0]) == 0) 
+	&& ((d->process_topology_3.n[1]) % (d->process_topology_2_z.n[1]) == 0)
+	&& (n[0] % (d->process_topology_2_z.nproc[0]) == 0) 
+	&& (n[0] % (d->process_topology_2_z.nproc[1]) == 0);
+    } else {
+      check_z_dims=false;
+    }
+  }
+    
+  if (d->debug && 0 == self) {
+    fprintf(stderr, "  2d_z: ");
+    for (int i = 0; i < ndim; ++i) {
+      fprintf(stderr, "%d%c", 
+	      d->process_topology_2_z.nproc[i], 
+	      separator(i, ndim));
+    }
+    fprintf(stderr, "\n");
+  } 
+  if(!check_z_dims && debug && (self==0))
+    distribution_log_dims_failure("Z", d->process_topology_2_z.nproc,
+				  d->process_topology_3.nproc);
+  assert(check_z_dims);
+/*
+  if this happens, it is because the dimensions were chosen incorrectly. 
+  Either too many processors for the number of points in one dimension (could 
+  not do at least 1 point per processor), or the methods above could 
+  not make a distribution of pencils that fit in the cuboids, which would 
+  happen if the user gave numbers that would not work (we require that the 
+  number of processors in each dimension of the cuboid evenly divides the 
+  number of points in that dimension, otherwise, this error will happen).
+*/
+  MPI_Cart_create(comm, 
+		  ndim, 
+		  d->process_topology_2_z.nproc, 
+		  period, 
+		  0, 
+		  &d->process_topology_2_z.cart);
+  //find the cartesian coord of the current rank (for the z_pencil)
+  Coord_z_pencils(self,d->process_topology_2_z.self,d);
+
+  if(self == 0) {
+    printf("distribution 2z: [%d:%d:%d]\n",
+	   d->process_topology_2_z.nproc[0],
+	   d->process_topology_2_z.nproc[1],
+	   d->process_topology_2_z.nproc[2]);
+    fflush(stdout);
+  }
+
+
+
+  // set up process grid with 2d decomposition (x_PENCILs)
+  d->process_topology_2_x.nproc[0] = 1; // don't distribute outer dimension
+  d->process_topology_2_x.nproc[1] = 0;
+  d->process_topology_2_x.nproc[2] = 0;
+  period[0] = period[1] = period[2] = 1;
+  MPI_Dims_create(nproc, ndim, d->process_topology_2_x.nproc);
+  d->process_topology_2_x.n[0] = n[0] / d->process_topology_2_x.nproc[0];
+  d->process_topology_2_x.n[1] = n[1] / d->process_topology_2_x.nproc[1];
+  d->process_topology_2_x.n[2] = n[2] / d->process_topology_2_x.nproc[2];
+  //variable used to ensure that pencils created fit inside the cuboids, 
+  //if not the code will assert out.
+  bool check_x_dims = false;
+  if(d->process_topology_2_x.n[0] != 0 
+     && d->process_topology_2_x.n[1] != 0 
+     && d->process_topology_2_x.n[2] != 0)
+  {// protects from dividing by zero.
+    check_x_dims = ((d->process_topology_3.n[2]) % (d->process_topology_2_x.n[2]) == 0) 
+      && ((d->process_topology_3.n[1]) % (d->process_topology_2_x.n[1]) == 0) 
+      && (n[0] % (d->process_topology_2_x.nproc[2]) == 0) 
+      && (n[0] % (d->process_topology_2_x.nproc[0]) == 0); // NOTE(review): nproc[0] is 1 here, so always true; nproc[1] intended? - confirm
+    if(self==0 && debug && !check_x_dims)
+      fprintf(stderr,"Need to fix X PENCILS x_procs(%d,%d,%d) 3d.ns(%d,%d,%d) 2d_x.ns(%d,%d,%d)...\n", 
+	      d->process_topology_2_x.nproc[0],
+	      d->process_topology_2_x.nproc[1],
+	      d->process_topology_2_x.nproc[2],
+	      d->process_topology_3.n[0],
+	      d->process_topology_3.n[1],
+	      d->process_topology_3.n[2],
+	      d->process_topology_2_x.n[0],
+	      d->process_topology_2_x.n[1],
+	      d->process_topology_2_x.n[2]);
+
+    //try swapping pencil dimensions if current setup does not have pencils 
+    //that fit inside cubes.
+    if(!(check_x_dims) 
+       && ((d->process_topology_3.n[2]) % (d->process_topology_2_x.n[1]) == 0) 
+       && ((d->process_topology_3.n[1]) % (d->process_topology_2_x.n[2]) == 0))
+    {
+      if(self==0 && debug)
+	fprintf(stderr,"Swaping X pencils in initialization .... \n");
+      // nproc[0] (the undistributed dimension) is left unchanged
+      int temp = d->process_topology_2_x.nproc[1];
+      d->process_topology_2_x.nproc[1] = d->process_topology_2_x.nproc[2];
+      d->process_topology_2_x.nproc[2] = temp;
+   
+      d->process_topology_2_x.n[0] = n[0] / d->process_topology_2_x.nproc[0];
+      d->process_topology_2_x.n[1] = n[1] / d->process_topology_2_x.nproc[1];
+      d->process_topology_2_x.n[2] = n[2] / d->process_topology_2_x.nproc[2];
+      check_x_dims = ((d->process_topology_3.n[2]) % (d->process_topology_2_x.n[2]) == 0) 
+	&& ((d->process_topology_3.n[1]) % (d->process_topology_2_x.n[1]) == 0)
+	&& (n[0] % (d->process_topology_2_x.nproc[2]) == 0) 
+	&& (n[0] % (d->process_topology_2_x.nproc[0]) == 0);
+    } 
+  } else{
+    check_x_dims=false;
+  }
+  /*
+    if that did not work, make a pencil that does by taking the cuboid 
+    (np1,np2,np3) and making pencils of the form (1,np2*np1,np3) or 
+    (1,np2,np3*np1) depending on the most even distribution it can.
+  */
+  if(!check_x_dims){
+    if(self==0 && debug)
+      fprintf(stderr,"MAKING X PENCILS FIT xprocs(%d,%d,%d) x.ns(%d,%d,%d)...\n",
+	      d->process_topology_2_x.nproc[0],
+	      d->process_topology_2_x.nproc[1],
+	      d->process_topology_2_x.nproc[2],
+	      d->process_topology_2_x.n[0],
+	      d->process_topology_2_x.n[1],
+	      d->process_topology_2_x.n[2]);
+
+    d->process_topology_2_x.nproc[0] = 1;
+    if(d->process_topology_3.nproc[2] > d->process_topology_3.nproc[1])
+    {
+      d->process_topology_2_x.nproc[1] = d->process_topology_3.nproc[1]*d->process_topology_3.nproc[0];
+      d->process_topology_2_x.nproc[2] = d->process_topology_3.nproc[2];
+      if((n[0] % (d->process_topology_2_x.nproc[2]) != 0) 
+	 || (n[0] % (d->process_topology_2_x.nproc[0]) != 0))
+      {
+	d->process_topology_2_x.nproc[2]=d->process_topology_3.nproc[2]*d->process_topology_3.nproc[0];
+	d->process_topology_2_x.nproc[1]=d->process_topology_3.nproc[1];
+      }
+
+    } else {
+      d->process_topology_2_x.nproc[2] = d->process_topology_3.nproc[2]*d->process_topology_3.nproc[0];
+      d->process_topology_2_x.nproc[1] = d->process_topology_3.nproc[1];
+      if((n[0] % (d->process_topology_2_x.nproc[2]) != 0) 
+	 || (n[0] % (d->process_topology_2_x.nproc[0]) != 0))
+      {
+	d->process_topology_2_x.nproc[1]=d->process_topology_3.nproc[1]*d->process_topology_3.nproc[0];
+	d->process_topology_2_x.nproc[2]=d->process_topology_3.nproc[2];
+      }
+    }
+    d->process_topology_2_x.n[0] = n[0] / d->process_topology_2_x.nproc[0];
+    d->process_topology_2_x.n[1] = n[1] / d->process_topology_2_x.nproc[1];
+    d->process_topology_2_x.n[2] = n[2] / d->process_topology_2_x.nproc[2];
+    if(self==0 && debug)
+      fprintf(stderr,"MAKING X PENCILS FIT AFTER xprocs(%d,%d,%d) x.ns(%d,%d,%d)...\n",
+	      d->process_topology_2_x.nproc[0],
+	      d->process_topology_2_x.nproc[1],
+	      d->process_topology_2_x.nproc[2],
+	      d->process_topology_2_x.n[0],
+	      d->process_topology_2_x.n[1],
+	      d->process_topology_2_x.n[2]);
+    if(d->process_topology_2_x.n[0] != 0 
+       && d->process_topology_2_x.n[1] != 0 
+       && d->process_topology_2_x.n[2] != 0)
+    {// protects from dividing by zero.
+      check_x_dims = ((d->process_topology_3.n[2]) % (d->process_topology_2_x.n[2]) == 0) 
+	&& ((d->process_topology_3.n[1]) % (d->process_topology_2_x.n[1]) == 0)
+	&& (n[0] % (d->process_topology_2_x.nproc[2]) == 0) 
+	&& (n[0] % (d->process_topology_2_x.nproc[0]) == 0);
+    } else {
+      check_x_dims=false;
+    }  
+  }
+   
+  if (d->debug && 0 == self) {
+    fprintf(stderr, "  2d_x: ");
+    for (int i = 0; i < ndim; ++i) {
+      fprintf(stderr, "%d%c", 
+	      d->process_topology_2_x.nproc[i], 
+	      separator(i, ndim));
+    }
+    fprintf(stderr, "\n");
+  }
+  if(!check_x_dims && debug && (self==0))
+    distribution_log_dims_failure("X", d->process_topology_2_x.nproc,
+				  d->process_topology_3.nproc);
+  assert(check_x_dims);
+/*
+  if this happens, it is because the dimensions were chosen incorrectly. 
+  Either too many processors for the number of points in one dimension (could 
+  not do at least 1 point per processor), or the methods above could not make 
+  a distribution of pencils that fit in the cuboids, which would happen if the 
+  user gave numbers that would not work (we require that the number of 
+  processors in each dimension of the cuboid evenly divides the number of 
+  points in that dimension, otherwise, this error will happen).
+*/
+  MPI_Cart_create(comm, 
+		  ndim, 
+		  d->process_topology_2_x.nproc, 
+		  period, 
+		  0, 
+		  &d->process_topology_2_x.cart);
+  Coord_x_pencils(self, d->process_topology_2_x.self, d);
+
+  if(self == 0) {
+    printf("distribution 2x: [%d:%d:%d]\n",
+	   d->process_topology_2_x.nproc[0],
+	   d->process_topology_2_x.nproc[1],
+	   d->process_topology_2_x.nproc[2]);
+    fflush(stdout);
+  }
+  
+
+
+  // set up process grid with 2d decomposition (y_PENCILs)
+  d->process_topology_2_y.nproc[0] = 0;
+  d->process_topology_2_y.nproc[1] = 1; // don't distribute outer dimension
+  d->process_topology_2_y.nproc[2] = 0;
+  period[0] = period[1] = period[2] = 1;
+  MPI_Dims_create(nproc, ndim, d->process_topology_2_y.nproc);
+  d->process_topology_2_y.n[0] = n[0] / d->process_topology_2_y.nproc[0];
+  d->process_topology_2_y.n[1] = n[1] / d->process_topology_2_y.nproc[1];
+  d->process_topology_2_y.n[2] = n[2] / d->process_topology_2_y.nproc[2];
+  //variable used to ensure that pencils created fit inside the cuboids, 
+  //if not the code will assert out.
+  bool check_y_dims=false;
+  if(d->process_topology_2_y.n[0] != 0 
+     && d->process_topology_2_y.n[1] != 0 
+     && d->process_topology_2_y.n[2] != 0)
+  {// protects from dividing by zero.
+    check_y_dims = (((d->process_topology_3.n[2]) % (d->process_topology_2_y.n[2]) == 0) 
+		    && ((d->process_topology_3.n[0]) % (d->process_topology_2_y.n[0]) == 0) 
+		    && (n[0] % (d->process_topology_2_y.nproc[2]) == 0) 
+		    && (n[0] % (d->process_topology_2_y.nproc[0]) == 0));
+    if(self==0 && debug && !check_y_dims)
+      fprintf(stderr,"Need to fix Y PENCILS y_procs(%d,%d,%d) 3d.ns(%d,%d,%d) 2d_y.ns(%d,%d,%d)...\n",
+	      d->process_topology_2_y.nproc[0],
+	      d->process_topology_2_y.nproc[1],
+	      d->process_topology_2_y.nproc[2],
+	      d->process_topology_3.n[0],
+	      d->process_topology_3.n[1],
+	      d->process_topology_3.n[2],
+	      d->process_topology_2_y.n[0],
+	      d->process_topology_2_y.n[1],
+	      d->process_topology_2_y.n[2]);
+    //try swapping pencil dimensions if the current dimension of the pencils 
+    //does not fit inside the cubes.
+    if(!(check_y_dims) 
+       && ((d->process_topology_3.n[2]) % (d->process_topology_2_y.n[0]) == 0) 
+       && ((d->process_topology_3.n[0]) % (d->process_topology_2_y.n[2]) == 0))
+    {
+      if(self==0 && debug)
+	fprintf(stderr,"Swaping Y pencils in initialization .... \n");
+      
+      int temp = d->process_topology_2_y.nproc[0];
+      d->process_topology_2_y.nproc[0] = d->process_topology_2_y.nproc[2];
+      d->process_topology_2_y.nproc[2] = temp;
+      // nproc[1] (the undistributed dimension) is left unchanged
+      
+      d->process_topology_2_y.n[0] = n[0] / d->process_topology_2_y.nproc[0];
+      d->process_topology_2_y.n[1] = n[1] / d->process_topology_2_y.nproc[1];
+      d->process_topology_2_y.n[2] = n[2] / d->process_topology_2_y.nproc[2];
+      check_y_dims = (((d->process_topology_3.n[2]) % (d->process_topology_2_y.n[2]) == 0) 
+		      && ((d->process_topology_3.n[0]) % (d->process_topology_2_y.n[0]) == 0) 
+		      && (n[0] % (d->process_topology_2_y.nproc[2]) == 0) 
+		      && (n[0] % (d->process_topology_2_y.nproc[0]) == 0));
+    }
+  } else {
+    check_y_dims = false;
+  }
+/*
+  if that did not work, make a pencil that does by taking the cuboid 
+  (np1,np2,np3) and making pencils of the form (np1,1,np3*np2) or 
+  (np1*np2,1,np3) depending on the most even distribution it can.
+*/
+  if(!check_y_dims){
+    if(self==0 && debug)
+      fprintf(stderr,"MAKING Y PENCILS FIT yprocs(%d,%d,%d) y.ns(%d,%d,%d)...\n", 
+	      d->process_topology_2_y.nproc[0],
+	      d->process_topology_2_y.nproc[1],
+	      d->process_topology_2_y.nproc[2],
+	      d->process_topology_2_y.n[0],
+	      d->process_topology_2_y.n[1],
+	      d->process_topology_2_y.n[2]);
+    
+    d->process_topology_2_y.nproc[1]=1;
+    if(d->process_topology_3.nproc[2] > d->process_topology_3.nproc[0])
+    {
+      d->process_topology_2_y.nproc[0] = d->process_topology_3.nproc[0]*d->process_topology_3.nproc[1];
+      d->process_topology_2_y.nproc[2] = d->process_topology_3.nproc[2];
+      if((n[0] % (d->process_topology_2_y.nproc[2]) != 0) 
+	 || (n[0] % (d->process_topology_2_y.nproc[0]) != 0))
+      {
+	d->process_topology_2_y.nproc[2] = d->process_topology_3.nproc[2]*d->process_topology_3.nproc[1];
+	d->process_topology_2_y.nproc[0] = d->process_topology_3.nproc[0];
+      }
+    } else {
+      d->process_topology_2_y.nproc[2] = d->process_topology_3.nproc[2]*d->process_topology_3.nproc[1];
+      d->process_topology_2_y.nproc[0] = d->process_topology_3.nproc[0];
+      if((n[0] % (d->process_topology_2_y.nproc[2]) != 0) 
+	 || (n[0] % (d->process_topology_2_y.nproc[0]) != 0))
+      {
+	d->process_topology_2_y.nproc[0] = d->process_topology_3.nproc[0]*d->process_topology_3.nproc[1];
+	d->process_topology_2_y.nproc[2] = d->process_topology_3.nproc[2];
+      }
+    }
+    
+    d->process_topology_2_y.n[0] = n[0] / d->process_topology_2_y.nproc[0];
+    d->process_topology_2_y.n[1] = n[1] / d->process_topology_2_y.nproc[1];
+    d->process_topology_2_y.n[2] = n[2] / d->process_topology_2_y.nproc[2];
+    if(self==0 && debug)
+      fprintf(stderr,"MAKING Y PENCILS FIT AFTER yprocs(%d,%d,%d) y.ns(%d,%d,%d)...\n",
+	      d->process_topology_2_y.nproc[0],
+	      d->process_topology_2_y.nproc[1],
+	      d->process_topology_2_y.nproc[2],
+	      d->process_topology_2_y.n[0],
+	      d->process_topology_2_y.n[1],
+	      d->process_topology_2_y.n[2]);
+    if(d->process_topology_2_y.n[0] != 0 && d->process_topology_2_y.n[1] != 0 
+       && d->process_topology_2_y.n[2] != 0)
+    {// protects from dividing by zero.
+      check_y_dims = (((d->process_topology_3.n[2]) % (d->process_topology_2_y.n[2]) == 0) 
+		      && ((d->process_topology_3.n[0]) % (d->process_topology_2_y.n[0]) == 0) 
+		      && (n[0] % (d->process_topology_2_y.nproc[2]) == 0) 
+		      && (n[0] % (d->process_topology_2_y.nproc[0]) == 0));
+    } else {
+      check_y_dims=false;
+    }
+  }
+   
+  if (d->debug && 0 == self) {
+    fprintf(stderr, "  2d_y: ");
+    for (int i = 0; i < ndim; ++i) {
+      fprintf(stderr, "%d%c", 
+	      d->process_topology_2_y.nproc[i], 
+	      separator(i, ndim));
+    }
+    fprintf(stderr, "\n");
+  }
+  if(!check_y_dims && debug && (self==0))
+    distribution_log_dims_failure("Y", d->process_topology_2_y.nproc,
+				  d->process_topology_3.nproc);
+  assert(check_y_dims);
+/*
+  if this happens, it is because the dimensions were chosen incorrectly. 
+  Either too many processors for the number of points in one dimension (could 
+  not do at least 1 point per processor), or the methods above could 
+  not make a distribution of pencils that fit in the cuboids, which would 
+  happen if the user gave numbers that would not work (we require that the 
+  number of processors in each dimension of the cuboid evenly divides the 
+  number of points in that dimension, otherwise, this error will happen).
+*/
+  MPI_Cart_create(comm, 
+		  ndim, 
+		  d->process_topology_2_y.nproc, 
+		  period, 
+		  0, 
+		  &d->process_topology_2_y.cart);
+  //find the cartesian coord of the current rank (for the y_pencil)
+  Coord_y_pencils(self,d->process_topology_2_y.self,d);
+
+  if(self == 0) {
+    printf("distribution 2y: [%d:%d:%d]\n",
+	   d->process_topology_2_y.nproc[0],
+	   d->process_topology_2_y.nproc[1],
+	   d->process_topology_2_y.nproc[2]);
+    fflush(stdout);
+  }
+
+
+  
+  if (d->debug) {
+    int myrank_cube;
+    Rank_cube(&myrank_cube,d->process_topology_3.self,d);
+    int myrank_x;
+    Rank_x_pencils(&myrank_x,d->process_topology_2_x.self,d);
+    int myrank_y;
+    Rank_y_pencils(&myrank_y,d->process_topology_2_y.self,d);
+    int myrank_z;
+    Rank_z_pencils(&myrank_z,d->process_topology_2_z.self,d);
+    if(myrank_z != self 
+       || myrank_y != self 
+       || myrank_x != self 
+       || myrank_cube != self)
+      abort(); //means ranks were calculated wrong.
+    if (0 == self) {
+      fprintf(stderr, "Process map:\n");
+    }
+    for (int p = 0; p < nproc; ++p) {
+      MPI_Barrier(comm);
+      if (p == self) {
+	fprintf(stderr, "  %d: 1d = (%d, %d, %d), 2d_x = (%d, %d, %d) rank is= %d,2d_y = (%d, %d, %d) rank is= %d,2d_z = (%d, %d, %d) rank is= %d, 3d = (%d, %d, %d). rank is= %d\n",
+		self,
+		d->process_topology_1.self[0], 
+		d->process_topology_1.self[1], 
+		d->process_topology_1.self[2],
+		d->process_topology_2_x.self[0], 
+		d->process_topology_2_x.self[1], 
+		d->process_topology_2_x.self[2],
+		myrank_x,
+		d->process_topology_2_y.self[0], 
+		d->process_topology_2_y.self[1], 
+		d->process_topology_2_y.self[2],
+		myrank_y,
+		d->process_topology_2_z.self[0], 
+		d->process_topology_2_z.self[1], 
+		d->process_topology_2_z.self[2],
+		myrank_z,
+		d->process_topology_3.self[0], 
+		d->process_topology_3.self[1], 
+		d->process_topology_3.self[2],
+		myrank_cube);
+      }
+    }
+  }
+
+  //allocate size of buffers used to hold pencil chunks of data in the 
+  //distribution routines for 3d to 1d and vice versa.
+  int buff_z_chunk = d->process_topology_2_z.n[0]*d->process_topology_2_z.n[1]*d->process_topology_3.n[2];
+  int buff_y_chunk = d->process_topology_2_y.n[0]*d->process_topology_2_y.n[2]*d->process_topology_3.n[1];
+  int buff_x_chunk = d->process_topology_2_x.n[1]*d->process_topology_2_x.n[2]*d->process_topology_3.n[0];
+  int buff_size = 0;
+  if(buff_z_chunk > buff_y_chunk){
+    buff_size=buff_z_chunk;
+  } else {
+    buff_size=buff_y_chunk;
+  }
+  if(buff_x_chunk > buff_size)
+    buff_size = buff_x_chunk;
+  
+  d->d2_chunk=(complex_t *) bigchunk_malloc(sizeof(complex_t)*buff_size);
+  d->d3_chunk=(complex_t *) bigchunk_malloc(sizeof(complex_t)*buff_size);
+}
+
+
+
+// create 1-, 2- and 3-d cartesian data distributions with explicitly
+// provided dimension lists
+void distribution_init_explicit(MPI_Comm comm, 
+				const int n[], 
+				const int n_padded[],
+                                int nproc_1d[],
+                                int nproc_2d_x[],
+                                int nproc_2d_y[],
+                                int nproc_2d_z[],
+                                int nproc_3d[],
+                                distribution_t *d, 
+				bool debug)
+{
+  int nproc;
+  int self;
+  int ndim = 3;
+  int period[3];
+  
+  MPI_Comm_rank(comm, &self);
+  MPI_Comm_size(comm, &nproc);
+  
+  if (!self) printf("Initializing redistribution using a %s layout on %d ranks.\n",
+#ifdef PENCIL
+		    "pencil"
+#else
+		    "slab"
+#endif
+		    ,nproc);
+  
+  d->debug = debug;
+  for (int i = 0; i < 3; ++i) {
+    d->n[i] = n[i];
+    d->padding[i] = n_padded[i] - n[i];
+  }
+  
+  // check supplied dimension lists are valid
+  assert(nproc_1d[0] == nproc);
+  assert(nproc_1d[1] == 1);
+  assert(nproc_1d[2] == 1);
+  
+  assert(nproc_2d_x[1] * nproc_2d_x[2] == nproc);
+  assert(nproc_2d_x[0] == 1);
+  
+  assert(nproc_2d_y[0] * nproc_2d_y[2] == nproc);
+  assert(nproc_2d_y[1] == 1);
+  
+  assert(nproc_2d_z[0] * nproc_2d_z[1] == nproc);
+  assert(nproc_2d_z[2] == 1);
+  
+  assert(nproc_3d[0] * nproc_3d[1] * nproc_3d[2]== nproc);
+  
+  // set up process grid with 1d decomposition (SLABs)
+  period[0] = period[1] = period[2] = 1;
+  MPI_Cart_create(comm, ndim, nproc_1d, period, 0, &d->process_topology_1.cart);
+  MPI_Cart_get(d->process_topology_1.cart, ndim, d->process_topology_1.nproc, d->process_topology_1.period, d->process_topology_1.self);
+  d->process_topology_1.n[0] = n[0] / d->process_topology_1.nproc[0];
+  d->process_topology_1.n[1] = n[1] / d->process_topology_1.nproc[1];
+  d->process_topology_1.n[2] = n[2] / d->process_topology_1.nproc[2];
+  if (d->debug && 0 == self) {
+    fprintf(stderr, "Process grids:\n");
+    fprintf(stderr, "  1d: ");
+    for (int i = 0; i < ndim; ++i) {
+      fprintf(stderr, "%d%c", d->process_topology_1.nproc[i], separator(i, ndim));
+    }
+    fprintf(stderr, "\n");
+  }
+  
+  // set up process grid with 3d decomposition (CUBE)
+  period[0] = period[1] = period[2] = 1;
+  MPI_Cart_create(comm, ndim, nproc_3d, period, 0, &d->process_topology_3.cart);
+  Coord_cube(self,d->process_topology_3.self,d);
+  if (d->debug && 0 == self) {
+    fprintf(stderr, "  3d: ");
+    for (int i = 0; i < ndim; ++i) {
+      fprintf(stderr, "%d%c", d->process_topology_3.nproc[i], separator(i, ndim));
+    }
+    fprintf(stderr, "\n");
+  }
+  if(debug){
+    int prev_coord[3];
+    prev_coord[0]=d->process_topology_3.self[0];
+    prev_coord[1]=d->process_topology_3.self[1];
+    prev_coord[2]=d->process_topology_3.self[2];
+    MPI_Cart_get(d->process_topology_3.cart, ndim, d->process_topology_3.nproc, d->process_topology_3.period, d->process_topology_3.self);
+    for(int i=0; i < 3; i++){
+      if(prev_coord[i] != d->process_topology_3.self[i])abort();//Cube coordinates calculated wrong!
+    }
+  }
+  d->process_topology_3.n[0] = n[0] / d->process_topology_3.nproc[0];
+  d->process_topology_3.n[1] = n[1] / d->process_topology_3.nproc[1];
+  d->process_topology_3.n[2] = n[2] / d->process_topology_3.nproc[2];
+  
+  // set up process grid with 2d_x decomposition (X dim Pencils)
+  period[0] = period[1] = period[2] = 1;
+  MPI_Cart_create(comm, ndim, nproc_2d_x, period, 0, &d->process_topology_2_x.cart);
+  d->process_topology_2_x.nproc[0]=nproc_2d_x[0];
+  d->process_topology_2_x.nproc[1]=nproc_2d_x[1];
+  d->process_topology_2_x.nproc[2]=nproc_2d_x[2];
+  d->process_topology_2_x.n[0] = n[0] / d->process_topology_2_x.nproc[0];
+  d->process_topology_2_x.n[1] = n[1] / d->process_topology_2_x.nproc[1];
+  d->process_topology_2_x.n[2] = n[2] / d->process_topology_2_x.nproc[2];
+  
+  bool check_x_dims=((d->process_topology_3.n[2]) % (d->process_topology_2_x.n[2]) == 0) && ((d->process_topology_3.n[1]) % (d->process_topology_2_x.n[1]) == 0) && (n[0] % (d->process_topology_2_x.nproc[2]) == 0) && (n[0] % (d->process_topology_2_x.nproc[0]) == 0);
+  if(!check_x_dims && debug && (self==0)){
+    FILE * outfile;
+    outfile= fopen("error.data","a");
+    fprintf(outfile,"X DIMS FAILS:(%d,%d,%d) (%d,%d,%d) \n",d->process_topology_2_x.nproc[0],d->process_topology_2_x.nproc[1],d->process_topology_2_x.nproc[2], d->process_topology_3.nproc[0],d->process_topology_3.nproc[1],d->process_topology_3.nproc[2]);
+  }
+  assert(check_x_dims);//if this happends, it is because the dimensions were chosen incorrectly. Either to many processors for the number of points in one dimenison (could not do at least 1 point per processor), or the methods above could 
+  //not make a distribution of pencils that fit in the cubiods, which would happen if the user gave numbers that wouldent work (we require the number of processors in each dimension of the cuboid must be modulo the number of points 
+  //in that dimension, otherwise, this error will happen).
+  Coord_x_pencils(self,d->process_topology_2_x.self,d);
+  
+  if (d->debug && 0 == self) {
+    fprintf(stderr, "  2d_x: ");
+    for (int i = 0; i < ndim; ++i) {
+      fprintf(stderr, "%d%c", d->process_topology_2_x.nproc[i], separator(i, ndim));
+    }
+    fprintf(stderr, "\n");
+  }
+  
+  // set up process grid with 2d_y decomposition (Y dim Pencils)
+  period[0] = period[1] = period[2] = 1;
+  MPI_Cart_create(comm, ndim, nproc_2d_y, period, 0, &d->process_topology_2_y.cart);
+  d->process_topology_2_y.nproc[0]=nproc_2d_y[0];
+  d->process_topology_2_y.nproc[1]=nproc_2d_y[1];
+  d->process_topology_2_y.nproc[2]=nproc_2d_y[2];
+  d->process_topology_2_y.n[0] = n[0] / d->process_topology_2_y.nproc[0];
+  d->process_topology_2_y.n[1] = n[1] / d->process_topology_2_y.nproc[1];
+  d->process_topology_2_y.n[2] = n[2] / d->process_topology_2_y.nproc[2];
+  
+  
+  bool check_y_dims=(((d->process_topology_3.n[2]) % (d->process_topology_2_y.n[2]) == 0) && ((d->process_topology_3.n[0]) % (d->process_topology_2_y.n[0]) == 0) && (n[0] % (d->process_topology_2_y.nproc[2]) == 0) && (n[0] % (d->process_topology_2_y.nproc[0]) == 0));
+  if(!check_y_dims && debug && (self==0)){
+    FILE * outfile;
+    outfile= fopen("error.data","a");
+    fprintf(outfile,"Y DIMS FAILS:(%d,%d,%d) (%d,%d,%d) \n",d->process_topology_2_y.nproc[0],d->process_topology_2_y.nproc[1],d->process_topology_2_y.nproc[2], d->process_topology_3.nproc[0],d->process_topology_3.nproc[1],d->process_topology_3.nproc[2]);
+  }
+  assert(check_y_dims);//if this happends, it is because the dimensions were chosen incorrectly. Either to many processors for the number of points in one dimenison (could not do at least 1 point per processor), or the methods above could 
+  //not make a distribution of pencils that fit in the cubiods, which would happen if the user gave numbers that wouldent work (we require the number of processors in each dimension of the cuboid must be modulo the number of points 
+  //in that dimension, otherwise, this error will happen).
+  Coord_y_pencils(self,d->process_topology_2_y.self,d);
+  
+  if (d->debug && 0 == self) {
+    fprintf(stderr, "  2d_y: ");
+    for (int i = 0; i < ndim; ++i) {
+      fprintf(stderr, "%d%c", d->process_topology_2_y.nproc[i], separator(i, ndim));
+    }
+    fprintf(stderr, "\n");
+  }
+  
+  // set up process grid with 2d_z decomposition (Z dim pencils)
+  period[0] = period[1] = period[2] = 1;
+  MPI_Cart_create(comm, ndim, nproc_2d_z, period, 0, &d->process_topology_2_z.cart);
+  d->process_topology_2_z.nproc[0]=nproc_2d_z[0];
+  d->process_topology_2_z.nproc[1]=nproc_2d_z[1];
+  d->process_topology_2_z.nproc[2]=nproc_2d_z[2];
+  d->process_topology_2_z.n[0] = n[0] / d->process_topology_2_z.nproc[0];
+  d->process_topology_2_z.n[1] = n[1] / d->process_topology_2_z.nproc[1];
+  d->process_topology_2_z.n[2] = n[2] / d->process_topology_2_z.nproc[2];
+  
+  
+  bool check_z_dims=((d->process_topology_3.n[0]) % (d->process_topology_2_z.n[0]) == 0) && ((d->process_topology_3.n[1]) % (d->process_topology_2_z.n[1]) == 0) && (n[0] % (d->process_topology_2_z.nproc[0]) == 0) && (n[0] % (d->process_topology_2_z.nproc[1]) == 0);
+  if(!check_z_dims && debug && (self==0)){
+    FILE * outfile;
+    outfile= fopen("error.data","a");
+    fprintf(outfile,"Z DIMS FAILS:(%d,%d,%d) (%d,%d,%d) \n",d->process_topology_2_z.nproc[0],d->process_topology_2_z.nproc[1],d->process_topology_2_z.nproc[2], d->process_topology_3.nproc[0],d->process_topology_3.nproc[1],d->process_topology_3.nproc[2]);
+  }
+  assert(check_z_dims);//if this happends, it is because the dimensions were chosen incorrectly. Either to many processors for the number of points in one dimenison (could not do at least 1 point per processor), or the methods above could 
+  //not make a distribution of pencils that fit in the cubiods, which would happen if the user gave numbers that wouldent work (we require the number of processors in each dimension of the cuboid must be modulo the number of points 
+  //in that dimension, otherwise, this error will happen).
+  Coord_z_pencils(self,d->process_topology_2_z.self,d);
+  
+  if (d->debug && 0 == self) {
+    fprintf(stderr, "  2d_z: ");
+    for (int i = 0; i < ndim; ++i) {
+      fprintf(stderr, "%d%c", d->process_topology_2_z.nproc[i], separator(i, ndim));
+    }
+    fprintf(stderr, "\n");
+  }
+  //assert that all pencils fit in the cuboid.
+  
+  if (d->debug) {
+    int myrank_cube;
+    Rank_cube(&myrank_cube,d->process_topology_3.self,d);
+    int myrank_z;
+    Rank_z_pencils(&myrank_z,d->process_topology_2_z.self,d);
+    int myrank_y;
+    Rank_y_pencils(&myrank_y,d->process_topology_2_y.self,d);
+    int myrank_x;
+    Rank_x_pencils(&myrank_x,d->process_topology_2_x.self,d);
+    if(myrank_z != self || myrank_y != self || myrank_x != self || myrank_cube != self)abort(); //means ranks were calculated wrong.
+    if (0 == self) {
+      fprintf(stderr, "Process map:\n");
+    }
+    for (int p = 0; p < nproc; ++p) {
+      MPI_Barrier(comm);
+      if (p == self) {
+	fprintf(stderr,
+		"  %d: 1d = (%d, %d, %d), 2d_x = (%d, %d, %d) rank (%d), 2d_y = (%d, %d, %d) rank (%d), 2d_z = (%d, %d, %d) rank (%d), 3d = (%d, %d, %d) rank (%d).\n",
+		self,
+		d->process_topology_1.self[0], d->process_topology_1.self[1], d->process_topology_1.self[2],
+		d->process_topology_2_x.self[0], d->process_topology_2_x.self[1], d->process_topology_2_x.self[2],myrank_x,
+		d->process_topology_2_y.self[0], d->process_topology_2_y.self[1], d->process_topology_2_y.self[2],myrank_y,
+		d->process_topology_2_z.self[0], d->process_topology_2_z.self[1], d->process_topology_2_z.self[2],myrank_z,
+		d->process_topology_3.self[0], d->process_topology_3.self[1], d->process_topology_3.self[2],myrank_cube);
+      }
+    }
+  }
+}
+
+
+
+
+///
+// Tear down a distribution descriptor: free the five Cartesian
+// communicators stored in it (1d, 2d_x, 2d_y, 2d_z, 3d, in that order)
+// and then hand the d2_chunk and d3_chunk buffers to bigchunk_free.
+//   d    distribution descriptor to clean up
+///
+void distribution_fini(distribution_t *d)
+{
+  // table of communicator handles, listed in the order they are released
+  MPI_Comm *topology_comms[] = {
+    &d->process_topology_1.cart,
+    &d->process_topology_2_x.cart,
+    &d->process_topology_2_y.cart,
+    &d->process_topology_2_z.cart,
+    &d->process_topology_3.cart,
+  };
+  const int comm_count = (int) (sizeof topology_comms / sizeof topology_comms[0]);
+
+  for (int k = 0; k < comm_count; ++k) {
+    MPI_Comm_free(topology_comms[k]);
+  }
+
+  // staging buffers for the 2-d and 3-d chunks
+  bigchunk_free(d->d2_chunk);
+  bigchunk_free(d->d3_chunk);
+}
+
+
+///
+// Assert that each global grid dimension stored in the descriptor (d->n)
+// is an exact multiple of the process count along the same axis.
+// When PENCIL is defined the three pencil grids (2d_x, 2d_y, 2d_z) are
+// checked, otherwise the 1d grid is; the 3d grid is checked in both
+// configurations.
+//   d    distribution descriptor
+///
+void distribution_assert_commensurate(distribution_t *d)
+{
+  int i = 0;
+
+  // walk the three axes; the assert expressions are evaluated per axis
+  while (i < 3) {
+#ifdef PENCIL
+    assert(0 == (d->n[i] % d->process_topology_2_x.nproc[i]));
+    assert(0 == (d->n[i] % d->process_topology_2_y.nproc[i]));
+    assert(0 == (d->n[i] % d->process_topology_2_z.nproc[i]));
+#else
+    assert(0 == (d->n[i] % d->process_topology_1.nproc[i]));
+#endif
+    assert(0 == (d->n[i] % d->process_topology_3.nproc[i]));
+    ++i;
+  }
+}
+
+
+// prototypes for the file-local workers that are defined further down
+static void redistribute(const complex_t *a, complex_t *b, distribution_t *d, int direction);
+static void redistribute_2_and_3(const complex_t *a, complex_t *b, distribution_t *d, int direction, int z_dim);
+static void redistribute_slab(const complex_t *a, complex_t *b, distribution_t *d, int direction);
+
+
+///
+// Redistribute data from the 1-d to the 3-d distribution.
+//   a    input
+//   b    output
+//   d    distribution descriptor
+//
+// Dispatches to redistribute_slab when USE_SLAB_WORKAROUND is true and
+// to redistribute otherwise, passing REDISTRIBUTE_1_TO_3 in both cases.
+///
+void distribution_1_to_3(const complex_t *a,
+                         complex_t *b,
+                         distribution_t *d)
+{
+  if (USE_SLAB_WORKAROUND) {
+    // slice-by-slice variant
+    redistribute_slab(a, b, d, REDISTRIBUTE_1_TO_3);
+    return;
+  }
+
+  redistribute(a, b, d, REDISTRIBUTE_1_TO_3);
+}
+
+
+///
+// Redistribute data from the 3-d to the 1-d distribution.
+//   a    input
+//   b    output
+//   d    distribution descriptor
+//
+// Dispatches to redistribute_slab when USE_SLAB_WORKAROUND is true and
+// to redistribute otherwise, passing REDISTRIBUTE_3_TO_1 in both cases.
+///
+void distribution_3_to_1(const complex_t *a,
+                         complex_t *b,
+                         distribution_t *d)
+{
+  if (USE_SLAB_WORKAROUND) {
+    // slice-by-slice variant
+    redistribute_slab(a, b, d, REDISTRIBUTE_3_TO_1);
+    return;
+  }
+
+  redistribute(a, b, d, REDISTRIBUTE_3_TO_1);
+}
+
+
+///
+// Exchange data between the 1-d and the 3-d distribution.
+//   a          input
+//   b          output
+//   d          distribution descriptor
+//   direction  REDISTRIBUTE_1_TO_3 or REDISTRIBUTE_3_TO_1; any other
+//              value ends in abort()
+//
+// This actually does the work. The 3-d Cartesian communicator is reduced
+// to the sub-grid spanned by its last two dimensions. For every shift p
+// in that sub-grid one MPI_Sendrecv is issued: the 1-d side talks to rank
+// (self + p) and the 3-d side to rank (self - p), both taken modulo the
+// sub-grid size.
+///
+static void redistribute(const complex_t *a,
+                         complex_t *b,
+                         distribution_t *d,
+                         int direction)
+{
+  // keep dimensions 1 and 2 of the 3-d process grid and drop dimension 0:
+  // data is exchanged with processes in a 2-d slab of 3-d subdomains
+  int keep_dim[3] = { 0, 1, 1 };
+  MPI_Comm slab_comm;
+  int slab_rank;
+  int slab_size;
+
+  MPI_Cart_sub(d->process_topology_3.cart, keep_dim, &slab_comm);
+  MPI_Comm_rank(slab_comm, &slab_rank);
+  MPI_Comm_size(slab_comm, &slab_size);
+
+  for (int shift = 0; shift < slab_size; ++shift) {
+    const int d1_peer = (slab_rank + shift) % slab_size;
+    const int d3_peer = (slab_rank - shift + slab_size) % slab_size;
+    int peer_coord[2];
+    MPI_Datatype d1_type;
+    MPI_Datatype d3_type;
+
+    MPI_Cart_coords(slab_comm, d1_peer, 2, peer_coord);
+    if (0) {
+      // disabled debugging aid
+      int self;
+      MPI_Comm_rank(MPI_COMM_WORLD, &self);
+      fprintf(stderr, "%d: d1_peer, d1_coord, d3_peer = %d, (%d, %d), %d\n",
+              self, d1_peer, peer_coord[0], peer_coord[1], d3_peer);
+    }
+
+    // subarray of the local 1-d array: full extent along dimension 0 and
+    // one 3-d sized tile along dimensions 1 and 2, placed at the tile
+    // position given by the coordinates of d1_peer in the sub-grid
+    {
+      int sizes[3] = { d->process_topology_1.n[0],
+                       d->process_topology_1.n[1],
+                       d->process_topology_1.n[2] };
+      int subsizes[3] = { d->process_topology_1.n[0],
+                          d->process_topology_3.n[1],
+                          d->process_topology_3.n[2] };
+      int starts[3] = { 0,
+                        peer_coord[0] * d->process_topology_3.n[1],
+                        peer_coord[1] * d->process_topology_3.n[2] };
+      MPI_Type_create_subarray(3, sizes, subsizes, starts, MPI_ORDER_C, MPI_DOUBLE_COMPLEX, &d1_type);
+      MPI_Type_commit(&d1_type);
+    }
+
+    // subarray of the local 3-d array: same extents as above, located at
+    // offset d3_peer * (1-d extent) along dimension 0
+    {
+      int sizes[3] = { d->process_topology_3.n[0],
+                       d->process_topology_3.n[1],
+                       d->process_topology_3.n[2] };
+      int subsizes[3] = { d->process_topology_1.n[0],
+                          d->process_topology_3.n[1],
+                          d->process_topology_3.n[2] };
+      int starts[3] = { d3_peer * d->process_topology_1.n[0], 0, 0 };
+      MPI_Type_create_subarray(3, sizes, subsizes, starts, MPI_ORDER_C, MPI_DOUBLE_COMPLEX, &d3_type);
+      MPI_Type_commit(&d3_type);
+    }
+
+    // exchange data: when going 3-d -> 1-d the 3-d subarray is sent and
+    // the 1-d subarray received; the other direction swaps the roles
+    const int to_1d = (direction == REDISTRIBUTE_3_TO_1);
+    if (!to_1d && direction != REDISTRIBUTE_1_TO_3) {
+      abort();
+    }
+    MPI_Sendrecv((char *) a, 1, to_1d ? d3_type : d1_type, to_1d ? d3_peer : d1_peer, 0,
+                 (char *) b, 1, to_1d ? d1_type : d3_type, to_1d ? d1_peer : d3_peer, 0,
+                 slab_comm, MPI_STATUS_IGNORE);
+
+    // the datatypes are only valid for this shift
+    MPI_Type_free(&d1_type);
+    MPI_Type_free(&d3_type);
+  }
+
+  MPI_Comm_free(&slab_comm);
+}
+
+
+///
+// redistribute between 1- and 3-d distributions, one slice at a time.
+//   a          input
+//   b          output
+//   d          distribution descriptor
+//   direction  REDISTRIBUTE_1_TO_3 or REDISTRIBUTE_3_TO_1; any other
+//              value ends in abort()
+//
+// This actually does the work, using slabs of subarrays to work
+// around an issue in Open MPI with large non-contiguous datatypes:
+// every message covers a single slice along dimension 0, described by a
+// 2-d subarray on the 1-d side and a contiguous run on the 3-d side.
+//
+// Neither datatype depends on the slice index (only the byte offsets
+// do), so the contiguous 3-d type is built once and the 1-d subarray type
+// once per peer instead of once per slice.
+///
+static void redistribute_slab(const complex_t *a,
+                              complex_t *b,
+                              distribution_t *d,
+                              int direction)
+{
+  int remaining_dim[3];
+  MPI_Comm subgrid_cart;
+  int subgrid_self;
+  int subgrid_nproc;
+  MPI_Datatype d3_type;
+  // bytes in one dimension-0 slice of each distribution; the product is
+  // formed in ptrdiff_t so that n[1] * n[2] cannot overflow int first
+  ptrdiff_t d1_slice = (ptrdiff_t) sizeof(complex_t) * d->process_topology_1.n[1] * d->process_topology_1.n[2];
+  ptrdiff_t d3_slice = (ptrdiff_t) sizeof(complex_t) * d->process_topology_3.n[1] * d->process_topology_3.n[2];
+  
+  // an unknown direction is fatal; reject it before touching MPI
+  if (direction != REDISTRIBUTE_3_TO_1 && direction != REDISTRIBUTE_1_TO_3) {
+    abort();
+  }
+  
+  // exchange data with processes in a 2-d slab of 3-d subdomains
+  
+  remaining_dim[0] = 0;
+  remaining_dim[1] = 1;
+  remaining_dim[2] = 1;
+  MPI_Cart_sub(d->process_topology_3.cart, remaining_dim, &subgrid_cart);
+  MPI_Comm_rank(subgrid_cart, &subgrid_self);
+  MPI_Comm_size(subgrid_cart, &subgrid_nproc);
+  
+  // one slice of the 3-d distribution is a contiguous run of elements;
+  // the same type serves every peer and every slice
+  MPI_Type_contiguous(d->process_topology_3.n[1] * d->process_topology_3.n[2],
+                      MPI_DOUBLE_COMPLEX,
+                      &d3_type);
+  MPI_Type_commit(&d3_type);
+  
+  for (int p = 0; p < subgrid_nproc; ++p) {
+    int coord[2];
+    int sizes[2];
+    int subsizes[2];
+    int starts[2];
+    MPI_Datatype d1_type;
+    int d1_peer = (subgrid_self + p) % subgrid_nproc;
+    int d3_peer = (subgrid_self - p + subgrid_nproc) % subgrid_nproc;
+    
+    MPI_Cart_coords(subgrid_cart, d1_peer, 2, coord);
+    if (0) {
+      // disabled debugging aid
+      int self;
+      MPI_Comm_rank(MPI_COMM_WORLD, &self);
+      fprintf(stderr, "%d: d1_peer, d1_coord, d3_peer = %d, (%d, %d), %d\n",
+              self, d1_peer, coord[0], coord[1], d3_peer);
+    }
+    
+    // subarray datatype for one slice of the 1-d distribution: a 3-d sized
+    // tile at the position given by the coordinates of d1_peer; it depends
+    // on the peer only, so it is shared by all slices of this iteration
+    sizes[0] = d->process_topology_1.n[1];
+    sizes[1] = d->process_topology_1.n[2];
+    subsizes[0] = d->process_topology_3.n[1];
+    subsizes[1] = d->process_topology_3.n[2];
+    starts[0] = coord[0] * d->process_topology_3.n[1];
+    starts[1] = coord[1] * d->process_topology_3.n[2];
+    MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_DOUBLE_COMPLEX, &d1_type);
+    MPI_Type_commit(&d1_type);
+    
+    for (int slice = 0; slice < d->process_topology_1.n[0]; ++slice) {
+      // byte offsets of this slice in the 1-d and in the 3-d array
+      ptrdiff_t d1_offset = slice * d1_slice;
+      ptrdiff_t d3_offset = ((ptrdiff_t) slice + (ptrdiff_t) d3_peer * d->process_topology_1.n[0]) * d3_slice;
+      
+      // exchange data
+      
+      if (direction == REDISTRIBUTE_3_TO_1) {
+        MPI_Sendrecv((char *) a + d3_offset, 1, d3_type, d3_peer, 0,
+                     (char *) b + d1_offset, 1, d1_type, d1_peer, 0,
+                     subgrid_cart, MPI_STATUS_IGNORE);
+      } else {
+        // direction was validated above, so this is REDISTRIBUTE_1_TO_3
+        MPI_Sendrecv((char *) a + d1_offset, 1, d1_type, d1_peer, 0,
+                     (char *) b + d3_offset, 1, d3_type, d3_peer, 0,
+                     subgrid_cart, MPI_STATUS_IGNORE);
+      }
+    }
+    
+    // free the per-peer datatype
+    
+    MPI_Type_free(&d1_type);
+  }
+  
+  MPI_Type_free(&d3_type);
+  MPI_Comm_free(&subgrid_cart);
+}
+
+
+///
+// redistribute a 2-d to a 3-d data distribution
+//   a      input
+//   b      output
+//   d      distribution descriptor
+//   z_dim  selects the pencil orientation: 0 = x pencils, 1 = y pencils,
+//          2 = z pencils (redistribute_2_and_3 asserts it is one of these)
+///
+void distribution_2_to_3(const complex_t *a,
+                         complex_t *b,
+                         distribution_t *d, int z_dim)
+{
+  // thin wrapper: forwards to the shared worker with the 2-d -> 3-d direction
+  redistribute_2_and_3(a, b, d, REDISTRIBUTE_2_TO_3, z_dim);
+}
+
+
+///
+// redistribute a 3-d to a 2-d data distribution
+//   a      input
+//   b      output
+//   d      distribution descriptor
+//   z_dim  selects the pencil orientation: 0 = x pencils, 1 = y pencils,
+//          2 = z pencils (redistribute_2_and_3 asserts it is one of these)
+///
+void distribution_3_to_2(const complex_t *a,
+                         complex_t *b,
+                         distribution_t *d, int z_dim)
+{
+  // thin wrapper: forwards to the shared worker with the 3-d -> 2-d direction
+  redistribute_2_and_3(a, b, d, REDISTRIBUTE_3_TO_2, z_dim);
+}
+
+
+///
+// redistribute between 2- and 3-d distributions.
+//   a    input
+//   b    ouput
+//   d    distribution descriptor
+//   dir  direction of redistribution
+//
+// This actually does the work.
+///
+static void redistribute_2_and_3(const complex_t *a,
+                                 complex_t *b,
+                                 distribution_t *d,
+                                 int direction,
+				 int z_dim)
+{
+  int self = d->process_topology_1.self[0];
+  int npeers;
+  int me=0;//determines which processor to print
+  bool print_me=false; //prints info on proccessor whose rank = me.
+  bool print_mess=false;//prints communication sends and recieves without actually doing the comms(intended to debug comm hangs).
+  bool print_result=false /*true*/;//prints a line in a file called "passed.data" which happends if the code runs completly.
+  assert(z_dim==0||z_dim==1||z_dim==2);
+  int x_dim=0,y_dim=0;
+  //x_dim, y_dim and z_dim are the dimensions of the x,y,z axis of the pencil with respect to the original axis(where index 2 is into the grid, 1 is vertical translation and 0 is horizontal).
+  switch(z_dim){
+    case 0: x_dim=1; y_dim=2; 
+      if((self == me) && print_me)fprintf(stderr, "DOING X PENCILS!...\n"); break;
+    case 1: x_dim=2; y_dim=0;
+      if((self == me && print_me))fprintf(stderr, "DOING Y PENCILS!...\n"); break;
+    case 2: x_dim=0; y_dim=1;
+      if((self == me && print_me))fprintf(stderr, "DOING Z PENCILS!...\n"); break;
+    default: assert("incorrect inputed dimension");
+  }
+  
+  // assuming dimensions are all commensurate, then the number of
+  // peers to exchange with is the number of processes in the z_dimension
+  // direction in the 3d distribution
+  npeers = d->process_topology_3.nproc[z_dim]; //picked last direction (lets say into the grid)
+  
+  // book-keeping for the processor translation in the x-y plane
+  int p0 = 0;
+  int p1 = 0;
+  int p1max = 0;
+  
+  MPI_Request req1=MPI_REQUEST_NULL;
+  MPI_Request req2=MPI_REQUEST_NULL;
+  
+  int pencil_sizes[3];
+  int cube_sizes[3];
+  int subsizes[3];
+  
+  
+  cube_sizes[x_dim] = d->process_topology_3.n[x_dim];
+  cube_sizes[y_dim] = d->process_topology_3.n[y_dim]; 
+  cube_sizes[z_dim] = d->process_topology_3.n[z_dim];
+  
+  //set varibles used to calculate the subarrays of each pencil and cube.
+  switch(z_dim){
+    case 0: 
+      p1max = d->process_topology_2_x.nproc[x_dim] / d->process_topology_3.nproc[x_dim] - 1; 
+      //find out the size of the chunk you need to use (stored in subsizes), and set sizes to the local size of the pencil.
+      //The x and y dimensions of the subchunck will be the dimensions of the pencil (since the code asserts at the beginning that all pencils fit inside the 3d cuboid.)
+      //The z dimension will be the dimension of the cuboid, since this will always be <= to the z_dim of the pencil.
+      pencil_sizes[x_dim] = d->process_topology_2_x.n[x_dim];
+      pencil_sizes[y_dim] = d->process_topology_2_x.n[y_dim];  
+      pencil_sizes[z_dim] = d->process_topology_2_x.n[z_dim]; 
+      subsizes[x_dim] = d->process_topology_2_x.n[x_dim];
+      subsizes[y_dim] = d->process_topology_2_x.n[y_dim];   
+      break;
+    case 1: 
+      p1max = d->process_topology_2_y.nproc[x_dim] / d->process_topology_3.nproc[x_dim] - 1; 
+      pencil_sizes[x_dim] = d->process_topology_2_y.n[x_dim];
+      pencil_sizes[y_dim] = d->process_topology_2_y.n[y_dim];  
+      pencil_sizes[z_dim] = d->process_topology_2_y.n[z_dim]; 
+      subsizes[x_dim] = d->process_topology_2_y.n[x_dim];
+      subsizes[y_dim] = d->process_topology_2_y.n[y_dim];   
+      break;
+    case 2: 
+      p1max = d->process_topology_2_z.nproc[y_dim] / d->process_topology_3.nproc[y_dim] - 1; 
+      pencil_sizes[x_dim] = d->process_topology_2_z.n[x_dim];
+      pencil_sizes[y_dim] = d->process_topology_2_z.n[y_dim];  
+      pencil_sizes[z_dim] = d->process_topology_2_z.n[z_dim]; 
+      subsizes[x_dim] = d->process_topology_2_z.n[x_dim];
+      subsizes[y_dim] = d->process_topology_2_z.n[y_dim];   
+      break;
+  }
+  subsizes[z_dim] = d->process_topology_3.n[z_dim];
+  int chunk_size=subsizes[0]*subsizes[1]*subsizes[2];//size of data chunks that will be communicated between pencil and cube distributions.
+  
+  //set variables that will be used to find pencils chunks
+  int pencil_dims[3]={0,0,0};// size of entire pencil in its local coord system 
+  int local_sizes[3]={0,0,0}; //size of chunck in its local coord system.
+  if(z_dim==2){
+    local_sizes[0]=subsizes[0];
+    local_sizes[1]=subsizes[1];
+    local_sizes[2]=subsizes[2];
+    pencil_dims[0]=d->process_topology_2_z.n[0];//pencil dims in grid coord system (where index 2 is the z direction).
+    pencil_dims[1]=d->process_topology_2_z.n[1];
+    pencil_dims[2]=d->process_topology_2_z.n[2];
+  }
+  else if(z_dim==1){
+    
+    local_sizes[0]=subsizes[0];
+    local_sizes[1]=subsizes[2];
+    local_sizes[2]=subsizes[1];
+    pencil_dims[0]=d->process_topology_2_y.n[0];
+    pencil_dims[1]=d->process_topology_2_y.n[2];
+    pencil_dims[2]=d->process_topology_2_y.n[1];
+  }
+  else if(z_dim==0){
+    local_sizes[0]=subsizes[2];
+    local_sizes[1]=subsizes[1];
+    local_sizes[2]=subsizes[0];
+    pencil_dims[0]=d->process_topology_2_x.n[2];
+    pencil_dims[1]=d->process_topology_2_x.n[1];
+    pencil_dims[2]=d->process_topology_2_x.n[0];
+  }
+  
+  if((self == me) && print_me)fprintf(stderr, "%d, %d, %d, %d Dimensions!...\n", x_dim,y_dim,z_dim, p1max);
+  
+  // communicate with our peers
+  for (int p = 0; p < npeers; ++p) {
+    if((self == me) && print_me)fprintf(stderr, "%d, %d, %d Made it beg-for!...\n", self,p, npeers);
+    
+    int d2_coord[3];
+    int d2_peer;
+    int d2_peer_coord[3];
+    int d3_coord[3];
+    int d3_peer;
+    int d3_peer_coord[3];
+    int recv_peer;
+    int send_peer;
+    int d2_array_start[3];
+    int d3_array_start[3];
+    //turn the processor coordinate into one specified by the number of data points in each dimension.
+    for (int i = 0; i < 3; ++i) {
+      switch(z_dim){
+	case 0: d2_coord[i]  = d->process_topology_2_x.self[i] * d->process_topology_2_x.n[i]; break;
+	case 1: d2_coord[i]  = d->process_topology_2_y.self[i] * d->process_topology_2_y.n[i]; break;
+	case 2: d2_coord[i]  = d->process_topology_2_z.self[i] * d->process_topology_2_z.n[i]; break;
+      }
+    }
+    //over every iteration of the loop, transverse down the pencil (since it will be divided in chunks whose coordinates will only differ in the z_dimension.
+    d2_coord[z_dim] += p * d->process_topology_3.n[z_dim]; 
+    
+    
+    if((self == me) && print_me)fprintf(stderr, "%d, %d, %d Coord!...\n", d2_coord[0],d2_coord[1],d2_coord[2]);
+    
+    
+    //d2_array_start is the starting index of the chunk in the pencils local coordinates.
+    d2_array_start[0] = d2_coord[x_dim] % pencil_sizes[x_dim]; 
+    d2_array_start[1] = d2_coord[y_dim] % pencil_sizes[y_dim]; 
+    d2_array_start[2] = d2_coord[z_dim] % pencil_sizes[z_dim]; 
+    
+    if (DEBUG_CONDITION || ((self== me) && print_me)) {
+      fprintf(stderr,
+	      "%d: pencil_sizes=(%d,%d,%d), cube_sizes=(%d,%d,%d), subsizes=(%d,%d,%d),d2_coord=(%d,%d,%d), d2_array_start=(%d,%d,%d) \n",
+	      self,
+	      pencil_sizes[0], pencil_sizes[1], pencil_sizes[2],
+	      cube_sizes[0], cube_sizes[1], cube_sizes[2],
+	      subsizes[0], subsizes[1], subsizes[2],
+	      d2_coord[0], d2_coord[1], d2_coord[2],
+	      d2_array_start[0],d2_array_start[1],d2_array_start[2]);
+    }
+    
+    
+    //if making cuboids from pencils, right here we need to fill the d2_chunk array with the data that later needs to be sent to a cuboid.
+        //The array is a chunk of the pencil and is why we needed to calculate the starting index for the array in the local coordinates.
+    if(direction == REDISTRIBUTE_2_TO_3){	
+      int64_t ch_indx=0;
+      int dims_size=pencil_dims[0]*pencil_dims[1]*pencil_dims[2];
+      for(int i0=d2_array_start[0];i0<d2_array_start[0]+local_sizes[0];i0++){
+	for(int i1=d2_array_start[1];i1<d2_array_start[1]+local_sizes[1];i1++){
+	  for(int i2=d2_array_start[2];i2<d2_array_start[2]+local_sizes[2];i2++){
+	    int64_t local_indx=pencil_dims[2]*(pencil_dims[1]*i0+i1) + i2;
+	    assert(local_indx < dims_size);
+	    assert(ch_indx <chunk_size && ch_indx >= 0 && local_indx>=0 && local_indx < dims_size);
+	    d->d2_chunk[ch_indx]=a[local_indx];
+	    ch_indx++;
+	  }
+	}
+      }
+      
+      if((self == me) && print_me)fprintf(stderr, "%d, %d, %d, pencil_dims!...\n", pencil_dims[0],pencil_dims[1],pencil_dims[2]);
+    }
+    
+    // what peer in the 3d distribution owns this subarray? 
+    for (int i = 0; i < 3; ++i) {
+      d3_peer_coord[i] = d2_coord[i] / d->process_topology_3.n[i];
+    }
+    if((self == me) && print_me)fprintf(stderr, "%d, %d, %d Cube that hits pencil coord!...\n",d3_peer_coord[0],d3_peer_coord[1],d3_peer_coord[2]);
+    //find the rank of this peer.
+    switch(z_dim){
+      case 0: MPI_Cart_rank(d->process_topology_3.cart, d3_peer_coord, &d3_peer); break;
+      case 1: MPI_Cart_rank(d->process_topology_3.cart, d3_peer_coord, &d3_peer); break;
+      case 2: MPI_Cart_rank(d->process_topology_3.cart, d3_peer_coord, &d3_peer); break;
+    }
+    if((self == me) && print_me)fprintf(stderr, "%d, %d, Made it half way!...\n", self,p);
+    if((self == me) && print_me)fprintf(stderr, "%d, %d, PEER!...\n", self,d3_peer);
+    
+    //By here in the for loop, we have broken the pencil into a chunk and found which cuboid it resides; over every iteration, the for-loop will break up the pencil in the z_dimension.
+    //From here on we do the opposite. We divide the cuboid into chunks (that are the same size as the ones in the pencil), and determine which pencils own these chunks.
+    
+    
+    // what is the coordinate of my pth subarray in the 3d distribution?
+    for (int i = 0; i < 3; ++i) {
+      switch(z_dim){
+	case 0: d3_coord[i]  = d->process_topology_3.self[i] * d->process_topology_3.n[i]; break;
+	case 1: d3_coord[i]  = d->process_topology_3.self[i] * d->process_topology_3.n[i]; break;
+	case 2: d3_coord[i]  = d->process_topology_3.self[i] * d->process_topology_3.n[i]; break;
+      }
+    }
+    
+    //now unlike above, we dont need to iterate in the z_dim, because for each processor its subarrays inward dimension is already set by the cubes z_dim.
+    //Instead, each iteration of the for-loop will look at different subarrays whose locations in the cuboid differ by local x and y coords.
+    
+    switch(z_dim){
+      //p1 is a place holder for the first translation . The outside for-loop will increment the coord in that direction, say x_dim, 
+      //and keep doing so until all of the chunks in that dimension are calculated. Then it will increment p0 in the other dimension (in this example the y) 
+      //and repeat until all of the subchunks in the x and y dimensions are calculated.
+      //are found. 
+      //Note: p0 and p1 will increment different dimensions depending of whether it is using the x y or z pencils, this is because the set up of the coordinate system for each 
+      //pencil is different and to ensure that no communications hang up later, the directions coded below are unique for each type of pencil.
+      case 0:
+	d3_coord[y_dim] += p0 * d->process_topology_2_x.n[y_dim]; 
+	d3_coord[x_dim] += p1 * d->process_topology_2_x.n[x_dim]; 
+	break;
+      case 1:
+	d3_coord[y_dim] += p0 * d->process_topology_2_y.n[y_dim]; 
+	d3_coord[x_dim] += p1 * d->process_topology_2_y.n[x_dim]; 
+	break;
+      case 2:
+	d3_coord[x_dim] += p0 * d->process_topology_2_z.n[x_dim]; 
+	d3_coord[y_dim] += p1 * d->process_topology_2_z.n[y_dim]; 
+	break;
+    }
+    if (p1 == p1max) {
+      p0++;
+      p1 = 0;
+    } else {
+      p1++;
+    }
+    // create a dataype for my pth subrarray in the 3d distribution
+    
+    
+    //d3_array_start holds the starting index of the chunk in the cubes local coordinates(note the cubes local coord system is actually the same as the grids global coord system, by set up)
+    
+    d3_array_start[x_dim] = d3_coord[x_dim] % cube_sizes[x_dim]; 
+    d3_array_start[y_dim] = d3_coord[y_dim] % cube_sizes[y_dim]; 
+    d3_array_start[z_dim] = d3_coord[z_dim] % cube_sizes[z_dim]; 
+    
+    //make starting point so that it coincides with the starting point of the pencil from the pencils coordinate system. (for z_pencils nothing needs to be changed, since it already
+    //has the coordinate system of the grid, however, the x and y pencils have different starting points of the subchunk in their coord systems.)
+    if(z_dim==0 || z_dim ==1){
+      d3_array_start[2]=d3_array_start[2]+subsizes[2]-1;
+    }
+    if(print_me && (self==me))fprintf(stderr,"D3_array_start is (%d,%d,%d) and subsizes is (%d,%d,%d) \n",d3_array_start[0],d3_array_start[1],d3_array_start[2],subsizes[0],subsizes[1],subsizes[2]);
+    
+    
+    //If sending cube chunks to pencils, need to fill those chunks with data here. The chunks are filled in the order 
+    //such that when the pencil recieves the chunk, in its local array indexing, it assumes that the array is already 
+    //filled such that it is contiguous. Therefore, complicated for-loops below fill the array in the cubes local indexing to match what the pencil will
+    //expect. 
+    if(direction == REDISTRIBUTE_3_TO_2){
+      int64_t ch_indx=0;
+      int dims_size=cube_sizes[0]*cube_sizes[1]*cube_sizes[2];
+      if((self == me) && print_me)fprintf(stderr, "%d, %d, MAKE 3D Chunk...\n", self,d3_peer);
+      switch(z_dim){
+	case 0:
+	  for(int i2=d3_array_start[y_dim];i2>d3_array_start[y_dim]-subsizes[y_dim];i2--){//perhaps y_dim
+	    for(int i1=d3_array_start[x_dim];i1<d3_array_start[x_dim]+subsizes[x_dim];i1++){//perhaps x_dim
+	      for(int i0=d3_array_start[z_dim];i0<d3_array_start[z_dim]+subsizes[z_dim];i0++){//perhaps z_dim
+		int64_t local_indx=d->process_topology_3.n[2]*(d->process_topology_3.n[1]*i0+i1) + i2;
+		assert(local_indx < dims_size);
+		assert(ch_indx <chunk_size && ch_indx >= 0 && local_indx>=0 && local_indx < dims_size);
+		d->d3_chunk[ch_indx]=a[local_indx];
+		ch_indx++;
+	      }
+	    }
+	  }
+	  break;
+	case 1:
+	  for(int i0=d3_array_start[y_dim];i0<d3_array_start[y_dim]+subsizes[y_dim];i0++){
+	    for(int i2=d3_array_start[x_dim];i2>d3_array_start[x_dim]-subsizes[x_dim];i2--){
+	      for(int i1=d3_array_start[z_dim];i1<d3_array_start[z_dim]+subsizes[z_dim];i1++){
+		int64_t local_indx=d->process_topology_3.n[2]*(d->process_topology_3.n[1]*i0+i1) + i2;
+		assert(local_indx < dims_size);
+		assert(ch_indx <chunk_size && ch_indx >= 0 && local_indx>=0 && local_indx < dims_size);
+		d->d3_chunk[ch_indx]=a[local_indx];
+		ch_indx++;
+	      }
+	    }
+	  }
+	  
+	  break;
+	case 2:
+	  for(int i0=d3_array_start[x_dim];i0<d3_array_start[x_dim]+subsizes[x_dim];i0++){
+	    for(int i1=d3_array_start[y_dim];i1<d3_array_start[y_dim]+subsizes[y_dim];i1++){
+	      for(int i2=d3_array_start[z_dim];i2<d3_array_start[z_dim]+subsizes[z_dim];i2++){
+		int64_t local_indx=d->process_topology_3.n[2]*(d->process_topology_3.n[1]*i0+i1) + i2;
+		assert(local_indx < dims_size);
+		assert(ch_indx <chunk_size && ch_indx >= 0 && local_indx>=0 && local_indx < dims_size);
+		d->d3_chunk[ch_indx]=a[local_indx];
+		ch_indx++;
+	      }
+	    }
+	  }
+	  
+	  break;
+      }
+    }
+    
+    if (DEBUG_CONDITION || ((self == me) && print_me)) {
+      fprintf(stderr,
+	      "%d: pencil_sizes=(%d,%d,%d), cube_sizes=(%d,%d,%d), subsizes=(%d,%d,%d), d3_coord=(%d,%d,%d), d3_array_start=(%d,%d,%d) \n",
+	      self,
+	      pencil_sizes[0], pencil_sizes[1], pencil_sizes[2],
+	      cube_sizes[0], cube_sizes[1], cube_sizes[2],
+	      subsizes[0], subsizes[1], subsizes[2],
+	      d3_coord[0], d3_coord[1], d3_coord[2],
+	      d3_array_start[0],d3_array_start[1],d3_array_start[2]);
+    }
+    
+    // what peer in the 2d distribution owns this subarray?
+    for (int i = 0; i < 3; ++i) {
+      switch(z_dim){
+	case 0:
+	  d2_peer_coord[i] = d3_coord[i] / d->process_topology_2_x.n[i];
+	  break;
+	case 1:
+	  d2_peer_coord[i] = d3_coord[i] / d->process_topology_2_y.n[i];
+	  break;
+	case 2:
+	  d2_peer_coord[i] = d3_coord[i] / d->process_topology_2_z.n[i];
+	  break;
+      }
+    }
+    d2_peer_coord[z_dim] = 0;//since these are pencils, there is no two pencils in this direction.
+    if((self == me) && print_me)fprintf(stderr, "%d, %d, %d PENCIL that hits chunk!...\n",d2_peer_coord[0],d2_peer_coord[1],d2_peer_coord[2]);
+    switch(z_dim){
+      //find its rank
+      case 0:
+        Rank_x_pencils(&d2_peer,d2_peer_coord,d);
+	break;
+      case 1:
+        Rank_y_pencils(&d2_peer,d2_peer_coord,d);
+	break;
+      case 2:
+	Rank_z_pencils(&d2_peer,d2_peer_coord,d);
+	break;
+    }
+    if((self == me) && print_me)fprintf(stderr, "%d, %d, %d Made it before comm!...\n", self,p, npeers);
+    
+    // record the communication to be done in a schedule
+    if (direction == REDISTRIBUTE_3_TO_2) {
+      recv_peer = d3_peer;
+      send_peer = d2_peer;
+    } else if (direction == REDISTRIBUTE_2_TO_3) {
+      recv_peer = d2_peer;
+      send_peer = d3_peer;
+    } else {
+      abort();
+    }
+    //comunication of the chunks:
+    //if print_mess boolean is set to true, then the code runs without sending any messages, and is used to test which messages would be sent in the entire run.
+    //(designed to debug comm hangups, if they occur).
+    
+    if(direction == REDISTRIBUTE_3_TO_2){
+      
+      if((self == me) && print_mess)fprintf(stderr, " I am %d, making request to recieve from %d...\n", self,recv_peer);
+      if(!print_mess)MPI_Irecv((void *) d->d2_chunk, chunk_size, MPI_DOUBLE_COMPLEX, recv_peer, 0, d->process_topology_1.cart, &req1);
+      
+      if((self == me) && print_mess)fprintf(stderr, " I am %d, making request to send to %d...\n", self,send_peer);
+      if(!print_mess)MPI_Isend((void *) d->d3_chunk, chunk_size, MPI_DOUBLE_COMPLEX, send_peer, 0, d->process_topology_1.cart, &req2);
+      
+      if((self == me) && print_mess)fprintf(stderr, " I am %d, waiting to recieve from %d...\n", self,recv_peer);
+      //fprintf(stderr, " I am %d, waiting to recieve from %d...\n", self,recv_peer);
+      if(!print_mess)MPI_Wait(&req1,MPI_STATUS_IGNORE);
+      
+      //if((self == me || self == 1 || self == 2 || self == 3) && print_me)fprintf(stderr, " I am %d, waiting to send to %d...\n", self,send_peer);
+      //fprintf(stderr, " I am %d, waiting to send to %d...\n", self,send_peer);
+      if(self==me && print_mess)fprintf(stderr, " I am %d, waiting to send to %d...\n", self,send_peer);
+      if(!print_mess)MPI_Wait(&req2,MPI_STATUS_IGNORE);
+      
+      //fill the local array with the received chunk.
+      int64_t ch_indx=0;
+      int dims_size=pencil_dims[0]*pencil_dims[1]*pencil_dims[2];
+      if(self==me && print_me)fprintf(stderr,"REAL SUBSIZES (%d,%d,%d)\n",subsizes[x_dim],subsizes[y_dim],subsizes[z_dim]);
+      if(self==me && print_me)fprintf(stderr,"PENCIL DIMENSION VS. local sizes (%d,%d,%d) vs (%d,%d,%d)\n",pencil_dims[0],pencil_dims[1],pencil_dims[2],local_sizes[0],local_sizes[1],local_sizes[2]);
+      if(self==me && print_me)fprintf(stderr,"DIM_2_ARRAY_START (%d,%d,%d) \n",d2_array_start[0],d2_array_start[1],d2_array_start[2]);
+      for(int i0=d2_array_start[0];i0<d2_array_start[0]+local_sizes[0];i0++){
+	for(int i1=d2_array_start[1];i1<d2_array_start[1]+local_sizes[1];i1++){
+	  for(int i2=d2_array_start[2];i2<d2_array_start[2]+local_sizes[2];i2++){
+	    int64_t local_indx=pencil_dims[2]*(pencil_dims[1]*i0+i1) + i2;
+	    //if(self==me)fprintf(stderr,"local_indx = %d ",local_indx);
+	    //if(local_indx >= dims_size)fprintf(stderr,"WOW, in third for, dims is (%d), we are %d and my rank is %d",dims_size,local_indx,self);
+	    assert(local_indx < dims_size);
+	    assert(ch_indx <chunk_size && ch_indx >= 0 && local_indx>=0 && local_indx < dims_size);
+	    b[local_indx]=d->d2_chunk[ch_indx];
+	    //if((p==0 || p==1 || p==2 || p==3 || p==4 || p==5) && self==me)fprintf(stderr,"(%f,%f) ",real(d->d2_chunk[ch_indx]),imag(d->d2_chunk[ch_indx]));
+	    ch_indx++;
+	  }
+                        	}
+      }
+      //     if((p==0 ||p==1 || p==2 || p==3 || p==4 || p==5) && self==me)fprintf(stderr,"P is %d \n",p);
+      
+    } 
+    else if (direction == REDISTRIBUTE_2_TO_3) {
+      
+      if((self == me) && print_mess)fprintf(stderr, " I am %d, making request to recieve from %d...\n", self,recv_peer);
+      if(!print_mess)MPI_Irecv((void *) d->d3_chunk, chunk_size, MPI_DOUBLE_COMPLEX, recv_peer, 0, d->process_topology_1.cart, &req1);
+      
+      if((self == me) && print_mess)fprintf(stderr, " I am %d, making request to send to %d...\n", self,send_peer);
+      if(!print_mess)MPI_Isend((void *) d->d2_chunk, chunk_size, MPI_DOUBLE_COMPLEX, send_peer, 0, d->process_topology_1.cart, &req2);
+      
+      if((self == me) && print_mess)fprintf(stderr, " I am %d, waiting to recieve from %d...\n", self,recv_peer);
+      if(!print_mess)MPI_Wait(&req1,MPI_STATUS_IGNORE);
+      
+      if((self == me) && print_mess)fprintf(stderr, " I am %d, waiting to send to %d...\n", self,send_peer);
+      if(!print_mess)MPI_Wait(&req2,MPI_STATUS_IGNORE);
+      int64_t ch_indx=0;
+      int dims_size=(d->process_topology_3.n[2])*(d->process_topology_3.n[1])*(d->process_topology_3.n[0]);
+      if(z_dim==0){
+	//fill the local array with the received chunk.
+	
+	for(int i2=d3_array_start[y_dim];i2>d3_array_start[y_dim]-subsizes[y_dim];i2--){
+	  for(int i1=d3_array_start[x_dim];i1<d3_array_start[x_dim]+subsizes[x_dim];i1++){
+	    for(int i0=d3_array_start[z_dim];i0<d3_array_start[z_dim]+subsizes[z_dim];i0++){
+	      int64_t local_indx=d->process_topology_3.n[2]*(d->process_topology_3.n[1]*i0+i1) + i2;
+	      //if(local_indx >= dims_size)fprintf(stderr,"WOW, in fourth for, dims is (%d), we are %d and my rank is %d",dims_size,local_indx,self);
+	      assert(local_indx < dims_size);
+	      assert(ch_indx <chunk_size && ch_indx >= 0 && local_indx>=0 && local_indx < dims_size);
+	      b[local_indx]=d->d3_chunk[ch_indx];
+	      //                         if(p==3 && self==me)fprintf(stderr,"(%f,%f) ",real(d->d3_chunk[ch_indx]),imag(d->d3_chunk[ch_indx]));
+	      ch_indx++;
+	    }
+	  }
+	}
+      }
+      else if(z_dim==1){
+	for(int i0=d3_array_start[y_dim];i0<d3_array_start[y_dim]+subsizes[y_dim];i0++){
+	  for(int i2=d3_array_start[x_dim];i2>d3_array_start[x_dim]-subsizes[x_dim];i2--){
+	    for(int i1=d3_array_start[z_dim];i1<d3_array_start[z_dim]+subsizes[z_dim];i1++){
+	      int64_t local_indx=d->process_topology_3.n[2]*(d->process_topology_3.n[1]*i0+i1) + i2;
+	      //if(local_indx >= dims_size)fprintf(stderr,"WOW, in fourth for, dims is (%d), we are %d and my rank is %d",dims_size,local_indx,self);
+	      assert(local_indx < dims_size);
+	      assert(ch_indx <chunk_size && ch_indx >= 0 && local_indx>=0 && local_indx < dims_size);
+	      b[local_indx]=d->d3_chunk[ch_indx];
+	      //                             if(p==0 && self==me)fprintf(stderr,"(%f,%f) ",real(d->d3_chunk[ch_indx]),imag(d->d3_chunk[ch_indx]));
+	      ch_indx++;
+	    }
+	  }
+	}
+	
+      }
+      else if(z_dim==2){
+	for(int i0=d3_array_start[x_dim];i0<d3_array_start[x_dim]+subsizes[x_dim];i0++){
+	  for(int i1=d3_array_start[y_dim];i1<d3_array_start[y_dim]+subsizes[y_dim];i1++){
+	    for(int i2=d3_array_start[z_dim];i2<d3_array_start[z_dim]+subsizes[z_dim];i2++){
+	      int64_t local_indx=d->process_topology_3.n[2]*(d->process_topology_3.n[1]*i0+i1) + i2;
+	      assert(local_indx < dims_size);
+	      assert(ch_indx <chunk_size && ch_indx >= 0 && local_indx>=0 && local_indx < dims_size);
+	      b[local_indx]=d->d3_chunk[ch_indx];
+	      //                   if(p==1 && self==me)fprintf(stderr,"(%f,%f) ",real(d->d3_chunk[ch_indx]),imag(d->d3_chunk[ch_indx]));
+	      ch_indx++;
+	    }
+	  }
+	}
+	
+      }
+      else{
+	abort();
+      }
+    }
+    
+    if (DEBUG_CONDITION) {
+      fprintf(stderr,
+	      "%d: npeers,p,p0,p1,p1max=(%d,%d,%d,%d,%d), "
+	      "d3_coord=(%d,%d,%d), d2_peer_coord=(%d,%d,%d), "
+	      "d2_coord=(%d,%d,%d), d3_peer_coord=(%d,%d,%d), "
+	      "recv_peer=%d, send_peer=%d\n",
+	      self,
+	      npeers, p, p0, p1, p1max,
+	      d3_coord[0], d3_coord[1], d3_coord[2],
+	      d2_peer_coord[0], d2_peer_coord[1], d2_peer_coord[2],
+	      d2_coord[0], d2_coord[1], d2_coord[2],
+	      d3_peer_coord[0], d3_peer_coord[1], d3_peer_coord[2],
+	      recv_peer, send_peer);
+    }
+    
+    if((self == me) && print_me)fprintf(stderr, "%d, %d, %d Made it end-for!...\n", self,p, npeers);
+  }
+  
+  //if((self == me) && print_me)fprintf(outfile, "   Made it all the way! for z_dim =(%d) and num_proc = (%d)...\n", z_dim, d->process_topology_1.nproc[0]);
+  if((self == me) && print_result){
+    FILE * outfile;
+    outfile= fopen("passed.data","a");
+    if (outfile) fprintf(outfile, "   Made it all the way! for z_dim =(%d) and num_proc = (%d)...\n", z_dim, d->process_topology_1.nproc[0]);
+    if (outfile) fclose(outfile);
+  }
+//    fprintf(stderr, "%d, Made it all the way!...\n", self);
+}
diff --git a/src/halo-finder/src/dfft/distribution.h b/src/halo-finder/src/dfft/distribution.h
new file mode 100644
index 0000000..4e26edd
--- /dev/null
+++ b/src/halo-finder/src/dfft/distribution.h
@@ -0,0 +1,198 @@
+#ifndef DISTRIBUTION_H
+#define DISTRIBUTION_H
+
+#include "complex-type.h"
+#include <mpi.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+///
+// descriptor for a process grid (distribution_t keeps one per layout)
+//   cart     Cartesian MPI communicator
+//   nproc[]  number of processes along each dimension of the process grid
+//   period[] periods of process grid
+//   self[]   coordinate of this process in the process grid
+//   n[]      local grid dimensions (extents of the local array in this layout)
+///
+typedef struct {
+    MPI_Comm cart;
+    int nproc[3];
+    int period[3];
+    int self[3];
+    int n[3];
+} process_topology_t;
+
+
+///
+// descriptor for data distribution
+//   debug                        toggle debug output
+//   n[3]                         (global) grid dimensions
+//   padding[3]                   padding applied to (local) arrays
+//   process_topology_1           1-d process topology
+//   process_topology_2_{z,y,x}   2-d (pencil) process topologies, one per pencil axis
+//   process_topology_3           3-d process topology
+///
+typedef struct {
+    bool debug;   // NOTE(review): in C, bool needs <stdbool.h> unless complex-type.h already provides it
+    int n[3];
+    int padding[3];
+    process_topology_t process_topology_1;
+    process_topology_t process_topology_2_z;
+    process_topology_t process_topology_2_y;
+    process_topology_t process_topology_2_x;
+    process_topology_t process_topology_3;
+    complex_t *d2_chunk;   // chunk buffer on the 2-d (pencil) side of a 2-d <-> 3-d exchange
+    complex_t *d3_chunk;   // chunk buffer on the 3-d (cube) side of a 2-d <-> 3-d exchange
+} distribution_t;
+
+
+///
+// create 1-, 2- and 3-d cartesian data distributions
+//   comm         MPI Communicator
+//   n, n_padded  (global) grid dimensions and padded grid dimensions (3 element arrays)
+//   d            distribution descriptor
+//   debug        debugging output
+///
+void distribution_init(MPI_Comm comm, const int n[], const int n_padded[], distribution_t *d, bool debug);
+
+
+///
+// create 1-, 2- and 3-d cartesian data distributions with explicitly
+// provided dimension lists
+//   comm       MPI Communicator
+//   n          (global) grid dimensions (3 element array)
+//   n_padded   padded grid dimensions (3 element array)
+//   nproc_1d   1d process grid (3 element array: x, 1, 1)
+//   nproc_2d_{x,y,z}  2d process grids, one per pencil orientation (3 element arrays)
+//   nproc_3d   3d process grid (3 element array: x, y, z)
+//   d          distribution descriptor
+//   debug      debugging output
+///
+void distribution_init_explicit(MPI_Comm comm,
+                                const int n[],
+                                const int n_padded[],
+                                int nproc_1d[],
+                                int nproc_2d_x[],
+                                int nproc_2d_y[],
+                                int nproc_2d_z[],
+                                int nproc_3d[],
+                                distribution_t *d,
+                                bool debug);
+
+///
+// clean up the data distribution
+//   d    distribution descriptor (as set up by one of the init functions)
+///
+void distribution_fini(distribution_t *d);
+
+
+///
+// assert that the data and processor grids are commensurate
+//   d    distribution descriptor
+///
+void distribution_assert_commensurate(distribution_t *d);
+
+
+///
+// redistribute a 1-d to a 3-d data distribution
+//   a    input
+//   b    output
+//   d    distribution descriptor
+///
+void distribution_1_to_3(const complex_t *a,
+                         complex_t *b,
+                         distribution_t *d);
+
+///
+// redistribute a 3-d to a 1-d data distribution
+//   a    input
+//   b    output
+//   d    distribution descriptor
+///
+void distribution_3_to_1(const complex_t *a,
+                         complex_t *b,
+                         distribution_t *d);
+
+///
+// redistribute a 2-d (pencil) to a 3-d data distribution
+//   a, b   input and output arrays
+//   d      distribution descriptor
+//   dim_z  axis of the pencils -- presumably 0 = x, 1 = y, 2 = z (TODO confirm)
+///
+void distribution_2_to_3(const complex_t *a,
+                         complex_t *b,
+                         distribution_t *d, int dim_z);
+
+///
+// redistribute a 3-d to a 2-d (pencil) data distribution
+//   a, b   input and output arrays
+//   d      distribution descriptor
+//   dim_z  axis of the pencils -- presumably 0 = x, 1 = y, 2 = z (TODO confirm)
+///
+void distribution_3_to_2(const complex_t *a,
+                         complex_t *b,
+                         distribution_t *d, int dim_z);
+
+
+///
+// Accessors: nproc = process-grid extent, self = own coordinate, both along `direction`
+///
+static inline int distribution_get_nproc_1d(distribution_t *d, int direction)
+{
+    return d->process_topology_1.nproc[direction];
+}
+
+static inline int distribution_get_nproc_2d_x(distribution_t *d, int direction)
+{
+    return d->process_topology_2_x.nproc[direction];
+}
+static inline int distribution_get_nproc_2d_y(distribution_t *d, int direction)
+{
+    return d->process_topology_2_y.nproc[direction];
+}
+static inline int distribution_get_nproc_2d_z(distribution_t *d, int direction)
+{
+    return d->process_topology_2_z.nproc[direction];
+}
+
+static inline int distribution_get_nproc_3d(distribution_t *d, int direction)
+{
+    return d->process_topology_3.nproc[direction];
+}
+
+static inline int distribution_get_self_1d(distribution_t *d, int direction)
+{
+    return d->process_topology_1.self[direction];
+}
+
+static inline int distribution_get_self_2d_x(distribution_t *d, int direction)
+{
+    return d->process_topology_2_x.self[direction];
+}
+static inline int distribution_get_self_2d_y(distribution_t *d, int direction)
+{
+    return d->process_topology_2_y.self[direction];
+}
+static inline int distribution_get_self_2d_z(distribution_t *d, int direction)
+{
+    return d->process_topology_2_z.self[direction];
+}
+static inline int distribution_get_self_3d(distribution_t *d, int direction)
+{
+    return d->process_topology_3.self[direction];
+}
+
+void Coord_x_pencils(int myrank, int coord[], distribution_t *d);   // x-pencil layout: rank -> coord[] (presumably the inverse of Rank_x_pencils; confirm)
+void Rank_x_pencils(int * myrank, int coord[], distribution_t *d);  // x-pencil layout: coord[] -> rank, written to *myrank
+void Coord_y_pencils(int myrank, int coord[], distribution_t *d);   // y-pencil layout: rank -> coord[] (presumably the inverse of Rank_y_pencils; confirm)
+void Rank_y_pencils(int * myrank, int coord[], distribution_t *d);  // y-pencil layout: coord[] -> rank, written to *myrank
+void Coord_z_pencils(int myrank, int coord[], distribution_t *d);   // z-pencil layout: rank -> coord[] (presumably the inverse of Rank_z_pencils; confirm)
+void Rank_z_pencils(int * myrank, int coord[], distribution_t *d);  // z-pencil layout: coord[] -> rank, written to *myrank
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/halo-finder/src/dfft/distribution.hpp b/src/halo-finder/src/dfft/distribution.hpp
new file mode 100644
index 0000000..27b9adc
--- /dev/null
+++ b/src/halo-finder/src/dfft/distribution.hpp
@@ -0,0 +1,333 @@
+#ifndef DISTRIBUTION_HPP
+#define DISTRIBUTION_HPP
+
+#include <vector>
+
+///
+// Distribution / partition / decomposition of data
+//
+// A C++ wrapper around distribution.h
+///
+
+#include "complex-type.h"
+#include "distribution.h"
+
+class Distribution {   // owns one distribution_t (m_d) and forwards every call to the C API in distribution.h
+
+public:
+    Distribution()   // NOTE(review): leaves m_d and m_debug uninitialized, yet ~Distribution() still hands m_d to distribution_fini()
+        {
+        }
+
+    Distribution(MPI_Comm comm, int const n[], int n_padded[], bool debug = false)   // grid sizes n[] with separate padded sizes
+        : m_debug(debug)
+        {
+            initialize(comm, n, n_padded);
+        }
+
+    Distribution(MPI_Comm comm, int const n[], bool debug = false)   // no padding: n is passed as both n and n_padded
+        : m_debug(debug)
+        {
+            initialize(comm, n, n);
+        }
+
+    Distribution(MPI_Comm comm, std::vector<int> const & n, bool debug = false)   // reads the sizes through &n[0]; the vector length is not checked
+        : m_debug(debug)
+        {
+            initialize(comm, &n[0], &n[0]);
+        }
+
+    Distribution(MPI_Comm comm, int ng, bool debug = false)   // cubic grid: ng points in each of the three dimensions
+        : m_debug(debug)
+        {
+            int n[3] = { ng, ng, ng };
+            initialize(comm, n, n);
+        }
+
+    virtual ~Distribution()   // always calls distribution_fini() on m_d
+        {
+            distribution_fini(&m_d);
+        }
+
+    void initialize(MPI_Comm comm, int const n[], int const n_padded[])   // calls MPI_Init(0, 0) if MPI is not initialized yet, then distribution_init()
+        {
+            int flag;
+            MPI_Initialized(&flag);
+            if (flag == 0) {
+                MPI_Init(0, 0);
+            }
+            distribution_init(comm, n, n_padded, &m_d, m_debug);
+        }
+
+    void redistribute_1_to_3(const complex_t *a, complex_t *b)   // a: input, b: output (see distribution_1_to_3)
+        {
+            distribution_1_to_3(a, b, &m_d);
+        }
+
+    void redistribute_1_to_3(std::vector<complex_t> const & a,   // vector overloads pass &a[0] / &b[0]; b is not resized here
+                             std::vector<complex_t> & b)
+        {
+            distribution_1_to_3(&a[0], &b[0], &m_d);
+        }
+
+    void redistribute_3_to_1(const complex_t *a, complex_t *b)   // a: input, b: output (see distribution_3_to_1)
+        {
+            distribution_3_to_1(a, b, &m_d);
+        }
+
+    void redistribute_3_to_1(std::vector<complex_t> const & a,
+                             std::vector<complex_t> & b)
+        {
+            distribution_3_to_1(&a[0], &b[0], &m_d);
+        }
+
+    void redistribute_2_to_3(const complex_t *a, complex_t *b, int axis)   // axis is forwarded as the dim_z argument of distribution_2_to_3
+        {
+            distribution_2_to_3(a, b, &m_d, axis);
+        }
+
+    void redistribute_2_to_3(std::vector<complex_t> const & a,
+                             std::vector<complex_t> & b, int axis)
+        {
+            distribution_2_to_3(&a[0], &b[0], &m_d, axis);
+        }
+
+    void redistribute_3_to_2(const complex_t *a, complex_t *b, int axis)   // axis is forwarded as the dim_z argument of distribution_3_to_2
+        {
+            distribution_3_to_2(a, b, &m_d, axis);
+        }
+
+    void redistribute_3_to_2(std::vector<complex_t> const & a,
+                             std::vector<complex_t> & b, int axis)
+        {
+            distribution_3_to_2(&a[0], &b[0], &m_d, axis);
+        }
+
+    size_t local_size() const   // product over the three dimensions of n[i] / nproc_3d[i] (integer division)
+        {
+            size_t size = 1;
+            for (int i = 0; i < 3; ++i) {
+                size *= (m_d.n[i] / m_d.process_topology_3.nproc[i]);
+            }
+            return size;
+        }
+
+    size_t global_size() const   // product of the three global grid dimensions
+        {
+            size_t size = 1;
+            for (int i = 0; i < 3; ++i) {
+                size *= m_d.n[i];
+            }
+            return size;
+        }
+
+    int global_ng(int i) const   // global grid dimension i
+        {
+            return m_d.n[i];
+        }
+
+    int local_ng_1d(int i) const   // local grid dimension i in the 1-d layout; the 2d_x/2d_y/2d_z/3d variants below read their own topology
+        {
+            return m_d.process_topology_1.n[i];
+        }
+
+    int local_ng_2d_x(int i) const
+        {
+            return m_d.process_topology_2_x.n[i];
+        }
+
+    int local_ng_2d_y(int i) const
+        {
+            return m_d.process_topology_2_y.n[i];
+        }
+
+    int local_ng_2d_z(int i) const
+        {
+            return m_d.process_topology_2_z.n[i];
+        }
+
+    int local_ng_3d(int i) const
+        {
+            return m_d.process_topology_3.n[i];
+        }
+
+    int nproc() const   // nproc[0] of the 1-d topology
+        {
+            return m_d.process_topology_1.nproc[0];
+        }
+
+    int const (& nproc_1d() const)[3]   // reference to the whole 3-element nproc array (same pattern for the next four)
+        {
+            return m_d.process_topology_1.nproc;
+        }
+
+    int const (& nproc_2d_x() const)[3]
+        {
+            return m_d.process_topology_2_x.nproc;
+        }
+
+    int const (& nproc_2d_y() const)[3]
+        {
+            return m_d.process_topology_2_y.nproc;
+        }
+
+    int const (& nproc_2d_z() const)[3]
+        {
+            return m_d.process_topology_2_z.nproc;
+        }
+
+    int const (& nproc_3d() const)[3]
+        {
+            return m_d.process_topology_3.nproc;
+        }
+
+    int nproc_1d(int i) const   // single entry nproc[i] (same pattern for the next four)
+        {
+            return m_d.process_topology_1.nproc[i];
+        }
+
+    int nproc_2d_x(int i) const
+        {
+            return m_d.process_topology_2_x.nproc[i];
+        }
+
+    int nproc_2d_y(int i) const
+        {
+            return m_d.process_topology_2_y.nproc[i];
+        }
+
+    int nproc_2d_z(int i) const
+        {
+            return m_d.process_topology_2_z.nproc[i];
+        }
+
+    int nproc_3d(int i) const
+        {
+            return m_d.process_topology_3.nproc[i];
+        }
+
+    int self() const   // self[0] of the 1-d topology
+        {
+            return m_d.process_topology_1.self[0];
+        }
+
+    int const (& self_1d() const)[3]   // reference to the whole 3-element self (coordinate) array (same pattern for the next four)
+        {
+            return m_d.process_topology_1.self;
+        }
+
+    int const (& self_2d_x() const)[3]
+        {
+            return m_d.process_topology_2_x.self;
+        }
+
+    int const (& self_2d_y() const)[3]
+        {
+            return m_d.process_topology_2_y.self;
+        }
+
+    int const (& self_2d_z() const)[3]
+        {
+            return m_d.process_topology_2_z.self;
+        }
+
+    int const (& self_3d() const)[3]
+        {
+            return m_d.process_topology_3.self;
+        }
+
+    int self_1d(int i) const   // single entry self[i] (same pattern for the next four)
+        {
+            return m_d.process_topology_1.self[i];
+        }
+
+    int self_2d_x(int i) const
+        {
+            return m_d.process_topology_2_x.self[i];
+        }
+
+    int self_2d_y(int i) const
+        {
+            return m_d.process_topology_2_y.self[i];
+        }
+
+    int self_2d_z(int i) const
+        {
+            return m_d.process_topology_2_z.self[i];
+        }
+
+    int self_3d(int i) const
+        {
+            return m_d.process_topology_3.self[i];
+        }
+
+    MPI_Comm cart_1d() const   // Cartesian communicator stored in the matching topology (same pattern for the next four)
+        {
+            return m_d.process_topology_1.cart;
+        }
+
+    MPI_Comm cart_2d_x() const
+        {
+            return m_d.process_topology_2_x.cart;
+        }
+
+    MPI_Comm cart_2d_y() const
+        {
+            return m_d.process_topology_2_y.cart;
+        }
+
+    MPI_Comm cart_2d_z() const
+        {
+            return m_d.process_topology_2_z.cart;
+        }
+
+    MPI_Comm cart_3d() const
+        {
+            return m_d.process_topology_3.cart;
+        }
+
+    int rank_2d_x(int c[])   // rank that Rank_x_pencils() reports for coordinate c[] (same pattern for y and z)
+        {
+            int r;
+
+            Rank_x_pencils(&r, c, &m_d);
+            return r;
+        }
+
+    int rank_2d_y(int c[])
+        {
+            int r;
+
+            Rank_y_pencils(&r, c, &m_d);
+            return r;
+        }
+
+    int rank_2d_z(int c[])
+        {
+            int r;
+
+            Rank_z_pencils(&r, c, &m_d);
+            return r;
+        }
+
+     void coords_2d_x(int r, int c[])   // forwards to Coord_x_pencils(r, c, ...); presumably fills c[] for rank r (same pattern for y and z)
+        {
+            Coord_x_pencils(r, c, &m_d);
+        }
+
+     void coords_2d_y(int r, int c[])
+        {
+            Coord_y_pencils(r, c, &m_d);
+        }
+
+     void coords_2d_z(int r, int c[])
+        {
+            Coord_z_pencils(r, c, &m_d);
+        }
+
+protected:
+    distribution_t m_d;   // the wrapped C descriptor
+    bool m_debug;         // debug flag handed to distribution_init()
+};
+
+#endif
diff --git a/src/halo-finder/src/dfft/fp.h b/src/halo-finder/src/dfft/fp.h
new file mode 100644
index 0000000..10818ee
--- /dev/null
+++ b/src/halo-finder/src/dfft/fp.h
@@ -0,0 +1,94 @@
+#ifndef FP_H
+#define FP_H
+
+#include "complex-type.h"
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+static bool fp_enable_check = true;   // when true, fp_isclose() runs fp_check() on both operands; being static, each including file gets its own copy
+
+///
+// Print a warning to stderr when x is subnormal, infinite or NaN; silent otherwise
+///
+
+// pgCC doesn't yet play well with C99 constructs, so fp_check() is an empty stub there.
+#if defined(__cplusplus) && defined(__PGI_)   // NOTE(review): solver.hpp tests __PGI__ instead -- confirm which spelling the compiler defines
+static inline void fp_check(double x) { }
+#else
+static inline void fp_check(double x)
+{
+#if defined(__cplusplus)
+    int type = std::fpclassify(x);   // NOTE(review): needs <cmath>, which this header does not include directly
+#else
+    int type = fpclassify(x);        // NOTE(review): needs <math.h>, which this header does not include directly
+#endif
+
+    if (type == FP_SUBNORMAL) {
+        fprintf(stderr, "Warning: fpclassify: FP_SUBNORMAL\n");
+    } else if (type == FP_INFINITE) {
+        fprintf(stderr, "Warning: fpclassify: FP_INFINITE\n");
+    } else if (type == FP_NAN) {
+        fprintf(stderr, "Warning: fpclassify: FP_NAN\n");
+    }
+}
+#endif
+
+///
+// Number of representable doubles between a and b (0 when they are equal,
+// +0.0 and -0.0 included).  Assumes 64-bit IEEE 754 doubles and 2's
+// complement integers.  The bits are copied with memcpy() rather than read
+// through a cast pointer, and the distance is formed in unsigned arithmetic,
+// so neither the aliasing rules nor signed overflow limits are violated.
+// A separation of 1 is a relative difference of about 1 part in 2^53 ~ 10^-16.
+///
+static inline uint64_t fp_diff(double a, double b)
+{
+    int64_t ia, ib;
+    memcpy(&ia, &a, sizeof ia);   // bit copy instead of *(int64_t *) &a
+    memcpy(&ib, &b, sizeof ib);
+
+    // sign bit set: replace the sign-magnitude pattern by minus its magnitude
+    if (ia < 0) {
+        ia = -(ia & 0x7fffffffffffffffLL);
+    }
+    if (ib < 0) {
+        ib = -(ib & 0x7fffffffffffffffLL);
+    }
+    // subtract as unsigned: the distance may exceed INT64_MAX
+    return (ia > ib) ? (uint64_t) ia - (uint64_t) ib
+                     : (uint64_t) ib - (uint64_t) ia;
+}
+
+
+///
+// True when at most `tolerance` representable doubles separate a and b
+// (see fp_diff).  While fp_enable_check is set, both operands are first
+// passed to fp_check(), which warns about subnormal/infinite/NaN values.
+// A tolerance of 1 corresponds to a relative difference of 1 part
+// in 2^53 ~ 10^-16.
+///
+static inline bool fp_isclose(double a, double b, uint64_t tolerance)
+{
+    if (fp_enable_check) {
+        fp_check(a);
+        fp_check(b);
+    }
+    return fp_diff(a, b) <= tolerance;
+}
+
+
+///
+// Complex counterpart of fp_isclose(): true only when both the real and
+// the imaginary parts of a and b are within `tolerance` representable
+// doubles of each other (each part is compared with fp_isclose()).
+//
+// real()/imag() are not declared here; presumably complex-type.h supplies
+// them -- TODO confirm.
+///
+static inline bool fp_complex_isclose(complex_t a, complex_t b, uint64_t tolerance)
+{
+    return (fp_isclose(real(a), real(b), tolerance) &&
+            fp_isclose(imag(a), imag(b), tolerance));
+}
+
+#endif
diff --git a/src/halo-finder/src/dfft/include.mk b/src/halo-finder/src/dfft/include.mk
new file mode 100644
index 0000000..db8e81c
--- /dev/null
+++ b/src/halo-finder/src/dfft/include.mk
@@ -0,0 +1,10 @@
+DFFT := dfft
+DFFT_HEADERS := ${DFFT}/distribution.h
+DFFT_HEADERS += ${DFFT}/allocator.hpp
+DFFT_HEADERS += ${DFFT}/distribution.hpp
+DFFT_HEADERS += ${DFFT}/solver.hpp
+DFFT_CXXFLAGS := -I${DFFT}
+# Disabled variant also linked the library (-L${DFFT}/${HACC_OBJDIR} -ldfft); only the search path is set below.
+DFFT_LDFLAGS := -L${DFFT}/${HACC_OBJDIR}
+include ${DFFT}/pencil.mk
+DFFT_CXXFLAGS += ${DFFT_PEN_CXXFLAGS}
diff --git a/src/halo-finder/src/dfft/pencil.mk b/src/halo-finder/src/dfft/pencil.mk
new file mode 100644
index 0000000..e4a3a1a
--- /dev/null
+++ b/src/halo-finder/src/dfft/pencil.mk
@@ -0,0 +1,7 @@
+PENCIL = 1
+
+ifeq (${PENCIL},1)
+DFFT_PEN_CXXFLAGS += -DPENCIL=1
+DFFT_PEN_CFLAGS   += -DPENCIL=1
+endif
+
diff --git a/src/halo-finder/src/dfft/plot.sh b/src/halo-finder/src/dfft/plot.sh
new file mode 100644
index 0000000..85cb1ea
--- /dev/null
+++ b/src/halo-finder/src/dfft/plot.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env gnuplot
+
+set terminal x11 persist
+set xlabel 'x'
+set ylabel 'phi(x)'
+set title 'Solution of Laplace phi(x) = - delta(x)'
+
+plot 'plot.in' title '256^3 grid', - 1 / (4 * pi * x) title '-1 / (4 pi x)'   # data file plot.in against the curve -1/(4 pi x)
+
+exit   # the script stops here, so the png and eps exports below are not reached
+
+set terminal png
+set output 'plot.png'
+replot
+
+set terminal postscript eps
+set output 'plot.eps'
+replot
diff --git a/src/halo-finder/src/dfft/solver.hpp b/src/halo-finder/src/dfft/solver.hpp
new file mode 100644
index 0000000..8edd347
--- /dev/null
+++ b/src/halo-finder/src/dfft/solver.hpp
@@ -0,0 +1,924 @@
+#ifndef SOLVER_HPP
+#define SOLVER_HPP
+
+#include "complex-type.h"
+
+#if !defined(FFTW3) && defined(PENCIL)
+#error PENCIL FFT REQUIRES FFTW3
+#endif
+
+#if defined(FFTW3) && defined(PENCIL)
+#ifdef ESSL_FFTW
+#include <fftw3_essl.h>
+#else
+#include <fftw3.h>
+#endif
+#elif defined(FFTW3)
+#include <fftw3-mpi.h>
+#else
+#include <fftw_mpi.h>
+#endif
+
+#include <algorithm>
+#include <vector>
+#include <cassert>
+
+#include "allocator.hpp"
+#include "distribution.hpp"
+
+#include "bigchunk.h"
+
+#define CERRILLOS_SLAB_HACK 1
+
+const double pi = 3.14159265358979323846;
+
+// pgCC doesn't yet play well with C99 constructs, so...
+#ifdef __PGI__
+extern "C" long int lrint(double x);
+#endif
+
+#define FFTW_ADDR(X) reinterpret_cast<fftw_complex*>(&(X)[0])
+
+///
+// Abstract base class for Poisson solvers.
+//
+// Derived classes must provide their own implementation of
+// initialize_greens_function(), and, if necessary, override the
+// backward_solve() and backward_solve_gradient() methods.
+///
+class SolverBase : public Distribution {
+
+public:
+
+  // methods
+  
+  // Default construction creates no FFT plans.  The plan handles are
+  // nulled so that the destructor can tell "never planned" apart from a
+  // live plan instead of destroying an indeterminate handle.
+  SolverBase()
+  {
+#if defined(FFTW3) && defined(PENCIL)
+    m_plan_f_x = NULL;
+    m_plan_f_y = NULL;
+    m_plan_f_z = NULL;
+    m_plan_b_x = NULL;
+    m_plan_b_y = NULL;
+    m_plan_b_z = NULL;
+#else
+    m_plan_f = NULL;
+    m_plan_b = NULL;
+#endif
+    m_greens_functions_initialized = false;
+  }
+
+  // Build a solver for an ng-sized grid on comm and create its plans.
+  SolverBase(MPI_Comm comm, int ng)
+    : Distribution(comm, ng)
+  {
+    initialize(comm);
+  }
+
+  // Build a solver for the grid dimensions in n on comm and create its plans.
+  SolverBase(MPI_Comm comm, std::vector<int> const & n)
+    : Distribution(comm, n)
+  {
+    initialize(comm);
+  }
+
+  // Release whichever plans were actually created.  A handle is null when
+  // the object was default-constructed or when the planner returned null.
+  virtual ~SolverBase()
+  {
+#if defined(FFTW3) && defined(PENCIL)
+    if (m_plan_f_x) fftw_destroy_plan(m_plan_f_x);
+    if (m_plan_f_y) fftw_destroy_plan(m_plan_f_y);
+    if (m_plan_f_z) fftw_destroy_plan(m_plan_f_z);
+    if (m_plan_b_x) fftw_destroy_plan(m_plan_b_x);
+    if (m_plan_b_y) fftw_destroy_plan(m_plan_b_y);
+    if (m_plan_b_z) fftw_destroy_plan(m_plan_b_z);
+#elif defined(FFTW3)
+    if (m_plan_f) fftw_destroy_plan(m_plan_f);
+    if (m_plan_b) fftw_destroy_plan(m_plan_b);
+#else
+    if (m_plan_f) fftwnd_mpi_destroy_plan(m_plan_f);
+    if (m_plan_b) fftwnd_mpi_destroy_plan(m_plan_b);
+#endif
+  }
+  
+  // solve interfaces
+  
+  // Forward-transform rho.  In every build variant the k-space result is
+  // left in m_buf2, which is where backward_solve(),
+  // backward_solve_gradient() and power_spectrum() read it.
+  //   rho           density to transform (input)
+  // The pencil variant alternates redistribution calls with one 1-d FFT
+  // batch per axis; each fftw_execute() works on the buffers its plan was
+  // bound to in initialize(), as noted in the trailing comments.
+  void forward_solve(complex_t const *rho)
+  {
+#if defined(FFTW3) && defined(PENCIL)
+
+    distribution_3_to_2(rho, &m_buf1[0], &m_d, 0);            // rho  --> buf1
+    fftw_execute(m_plan_f_x);                                 // buf1 --> buf2
+    distribution_2_to_3(&m_buf2[0], &m_buf1[0], &m_d, 0);     // buf2 --> buf1
+    distribution_3_to_2(&m_buf1[0], &m_buf2[0], &m_d, 1);     // buf1 --> buf2
+    fftw_execute(m_plan_f_y);                                 // buf2 --> buf1
+    distribution_2_to_3(&m_buf1[0], &m_buf2[0], &m_d, 1);     // buf1 --> buf2
+    distribution_3_to_2(&m_buf2[0], &m_buf1[0], &m_d, 2);     // buf2 --> buf1
+    fftw_execute(m_plan_f_z);                                 // buf1 --> buf2
+
+#elif defined(FFTW3)
+    distribution_3_to_1(rho, &m_buf1[0], &m_d);             // rho  --> buf1
+    fftw_execute(m_plan_f);                                 // buf1 --> buf2
+#else
+    // FFTW2: m_buf2 is transformed in place, m_buf3 is passed as the work array
+    distribution_3_to_1(rho, &m_buf2[0], &m_d);             // rho  --> buf2
+    fftwnd_mpi(m_plan_f, 1,   
+	       FFTW_ADDR(m_buf2), 
+	       FFTW_ADDR(m_buf3),
+	       FFTW_NORMAL_ORDER);                          // buf2 -->buf2
+#endif
+  }
+  
+  // Apply the Green's function to the k-space data in m_buf2 (via
+  // kspace_solve()), inverse-transform the product and write it to phi.
+  //   phi           potential (output)
+  // Reads m_buf2, so it is meant to follow forward_solve(); m_buf2 itself
+  // is not written here.
+  void backward_solve(complex_t *phi)
+  {
+    kspace_solve(&m_buf2[0], &m_buf1[0]);                   // buf2 --> buf1
+#if defined(FFTW3) && defined(PENCIL)
+    fftw_execute(m_plan_b_z);                               // buf1 --> buf3
+    distribution_2_to_3(&m_buf3[0], &m_buf1[0], &m_d, 2);   // buf3 --> buf1
+    distribution_3_to_2(&m_buf1[0], &m_buf3[0], &m_d, 1);   // buf1 --> buf3
+    fftw_execute(m_plan_b_y);                               // buf3 --> buf1
+    distribution_2_to_3(&m_buf1[0], &m_buf3[0], &m_d, 1);   // buf1 --> buf3
+    distribution_3_to_2(&m_buf3[0], &m_buf1[0], &m_d, 0);   // buf3 --> buf1
+    fftw_execute(m_plan_b_x);                               // buf1 --> buf3
+    distribution_2_to_3(&m_buf3[0], phi, &m_d, 0);          // buf3 --> phi
+#elif defined(FFTW3)
+    fftw_execute(m_plan_b);                                 // buf1 --> buf3
+    distribution_1_to_3(&m_buf3[0], phi, &m_d);             // buf3 --> phi
+#else
+    // FFTW2: m_buf1 is transformed in place, m_buf3 is passed as the work array
+    fftwnd_mpi(m_plan_b, 1,
+	       (fftw_complex *) &m_buf1[0],
+	       (fftw_complex *) &m_buf3[0],
+	       FFTW_NORMAL_ORDER);                           // buf1 -->buf1
+    distribution_1_to_3(&m_buf1[0], phi, &m_d);              // buf1 --> phi
+#endif
+  }
+  
+  // Apply the derivative Green's function along axis to the k-space data
+  // in m_buf2 (via kspace_solve_gradient()), inverse-transform the product
+  // and write it to grad_phi.
+  //   axis          axis along which the gradient is taken
+  //   grad_phi      gradient of the potential (output)
+  // Same buffer sequence as backward_solve(); only the k-space step differs.
+  void backward_solve_gradient(int axis, complex_t *grad_phi)
+  {
+    kspace_solve_gradient(axis, &m_buf2[0], &m_buf1[0]);    // buf2 --> buf1
+#if defined(FFTW3) && defined(PENCIL)
+    fftw_execute(m_plan_b_z);                               // buf1 --> buf3
+    distribution_2_to_3(&m_buf3[0], &m_buf1[0], &m_d, 2);   // buf3 --> buf1
+    distribution_3_to_2(&m_buf1[0], &m_buf3[0], &m_d, 1);   // buf1 --> buf3
+    fftw_execute(m_plan_b_y);                               // buf3 --> buf1
+    distribution_2_to_3(&m_buf1[0], &m_buf3[0], &m_d, 1);   // buf1 --> buf3
+    distribution_3_to_2(&m_buf3[0], &m_buf1[0], &m_d, 0);   // buf3 --> buf1
+    fftw_execute(m_plan_b_x);                               // buf1 --> buf3
+    distribution_2_to_3(&m_buf3[0], grad_phi, &m_d, 0);     // buf3 --> grad_phi
+#elif defined(FFTW3)
+    fftw_execute(m_plan_b);                                 // buf1 --> buf3
+    distribution_1_to_3(&m_buf3[0], grad_phi, &m_d);        // buf3 --> grad_phi
+#else
+    // FFTW2: m_buf1 is transformed in place, m_buf3 is passed as the work array
+    fftwnd_mpi(m_plan_b, 1,
+	       (fftw_complex *) &m_buf1[0],
+	       (fftw_complex *) &m_buf3[0],
+	       FFTW_NORMAL_ORDER);                           // buf1 -> buf1
+    distribution_1_to_3(&m_buf1[0], grad_phi, &m_d);         // buf1 --> grad_phi
+#endif
+  }
+  
+  // Full solve: forward-transform rho, then write the potential to phi.
+  void solve(const complex_t *rho, complex_t *phi)
+  {
+    forward_solve(rho);
+    backward_solve(phi);
+  }
+  
+  // Full solve for one gradient component: forward-transform rho, then
+  // write the gradient of the potential along axis to phi.
+  void solve_gradient(int axis, const complex_t *rho, complex_t *phi)
+  {
+    forward_solve(rho);
+    backward_solve_gradient(axis, phi);
+  }
+  
+  // interfaces for std::vector
+  
+  // std::vector convenience overload; forwards the address of rho's first
+  // element, so rho must not be empty.
+  void forward_solve(std::vector<complex_t> const & rho)
+  {
+    forward_solve(&rho[0]);
+  }
+  
+  // std::vector convenience overload; phi is written through the address
+  // of its first element and is not resized here.
+  void backward_solve(std::vector<complex_t> & phi)
+  {
+    backward_solve(&phi[0]);
+  }
+  
+  // std::vector convenience overload; phi is written through the address
+  // of its first element and is not resized here.
+  void backward_solve_gradient(int axis, std::vector<complex_t> & phi)
+  {
+    backward_solve_gradient(axis, &phi[0]);
+  }
+  
+  // std::vector convenience overload of solve(); neither vector is resized.
+  void solve(std::vector<complex_t> const & rho, std::vector<complex_t> & phi)
+  {
+    solve(&rho[0], &phi[0]);
+  }
+  
+  // std::vector convenience overload of solve_gradient(); neither vector
+  // is resized.
+  void solve_gradient(int axis, std::vector<complex_t> const & rho, std::vector<complex_t> & phi)
+  {
+    solve_gradient(axis, &rho[0], &phi[0]);
+  }
+  
+  
+  // analysis interfaces
+  
+  ///
+  // calculate the k-space power spectrum
+  //   P(modk) = Sum { |rho(k)|^2 : |k| = modk, k <- [0, ng / 2)^3, periodically extended }
+  ///
+  void power_spectrum(std::vector<double> & power)
+  {
+    // Bins the k-space data currently held in m_buf2 by rounded |k|.
+    //   power         resized here; on return power[modk] is the sum of
+    //                 |rho(k)|^2 * cic(k0) * cic(k1) * cic(k2) over the
+    //                 modes whose rounded |k| is modk, divided by
+    //                 (number of such modes * ng^3)
+    // Only m_d.n[0] is consulted for the grid size, so a cubic grid is
+    // assumed -- TODO confirm against callers.
+
+    //intermediate in m_buf2 for both FFTW2 and FFTW3
+    std::vector<complex_t, bigchunk_allocator<complex_t> > const & rho = m_buf2;
+
+    int const ng = m_d.n[0];
+    int const half = ng / 2;
+    double const volume = 1.0 * ng * ng * ng;
+    double const tpi = 2.0*atan(1.0)*4.0; // 2 pi
+
+    // cache periodic ksq and the per-axis anti-CIC factor
+    m_pk_ksq.resize(ng);
+    m_pk_cic.resize(ng);
+    double ksq_max = 0;
+    for (int k = 0; k < half; ++k) {
+
+      m_pk_ksq[k] = k * k;
+      ksq_max = max(ksq_max, m_pk_ksq[k]);
+
+      m_pk_ksq[k + half] = (k - half) * (k - half);
+      ksq_max = max(ksq_max, m_pk_ksq[k + half]);
+
+      // sin(x)/x is 0/0 at k == 0; that entry is set to 1 after the loop,
+      // so do not evaluate the NaN in the first place.
+      if (k != 0) {
+        double const kk_pos = tpi*k/ng;
+        m_pk_cic[k] = pow(sin(0.5*kk_pos)/(0.5*kk_pos),-4.0);
+      }
+
+      double const kk_neg = tpi*(k-half)/ng;
+      m_pk_cic[k + half] = pow(sin(0.5*kk_neg)/(0.5*kk_neg),-4.0);
+    }
+    m_pk_cic[0] = 1.0;
+
+    long modk_max = lrint(sqrt(3 * ksq_max)); // round to nearest integer
+
+    // zeroed accumulators, one bin per integer |k|
+    power.assign(modk_max + 1, 0.0);
+    m_pk_weight.assign(modk_max + 1, 0.0);
+
+    /*
+!-----3-D anti-CIC filter for deconvolution
+
+      forall(ii=1:ng)kk(ii)=(ii-1)*tpi/(1.0*ng)
+
+      do ii=1,ng
+      if(ii.ge.ng/2+1)kk(ii)=(ii-ng-1)*tpi/(1.0*ng)
+      enddo
+
+      mult(1)=1.0
+      
+      forall(ii=2:ng)mult(ii)=
+      #      1.0/(sin(kk(ii)/2.0)/(kk(ii)/2.0))**2
+      
+      forall(ii=1:ng,jj=1:ng,mm=1:ng)erhotr(ii,jj,mm)=
+      #      mult(ii)*mult(jj)*mult(mm)*erhotr(ii,jj,mm)
+    */
+
+    int local_dim[3];
+    int self_coord[3];
+#if defined(FFTW3) && defined(PENCIL)
+    self_coord[0]=self_2d_z(0);
+    self_coord[1]=self_2d_z(1);
+    self_coord[2]=self_2d_z(2);
+    local_dim[0]=local_ng_2d_z(0);
+    local_dim[1]=local_ng_2d_z(1);
+    local_dim[2]=local_ng_2d_z(2);
+#else
+    self_coord[0]=self_1d(0);
+    self_coord[1]=self_1d(1);
+    self_coord[2]=self_1d(2);
+    local_dim[0]=local_ng_1d(0);
+    local_dim[1]=local_ng_1d(1);
+    local_dim[2]=local_ng_1d(2);
+#endif
+
+    // walk this rank's block; index also steps over the padding entries
+    int index = 0;
+    for (int local_k0 = 0; local_k0 < local_dim[0]; ++local_k0) {
+      int k0 = local_k0 + self_coord[0] * local_dim[0];
+      double ksq0 = m_pk_ksq[k0];
+
+      for (int local_k1 = 0; local_k1 < local_dim[1]; ++local_k1) {
+        int k1 = local_k1 + self_coord[1] * local_dim[1];
+        double ksq1 = m_pk_ksq[k1];
+
+        for (int local_k2 = 0; local_k2 < local_dim[2]; ++local_k2) {
+          int k2 = local_k2 + self_coord[2] * local_dim[2];
+          double ksq2 = m_pk_ksq[k2];
+          long modk = lrint(sqrt(ksq0 + ksq1 + ksq2)); //round to nearest int
+          power[modk] += std::real(rho[index] * conj(rho[index])) * m_pk_cic[k0] * m_pk_cic[k1] * m_pk_cic[k2];
+          m_pk_weight[modk] += volume;
+          index++;
+        }
+        index += m_d.padding[2];
+      }
+      index += m_d.padding[1];
+    }
+
+    // accumulate across processors
+    MPI_Allreduce(MPI_IN_PLACE, &power[0], static_cast<int>(power.size()),
+                  MPI_DOUBLE, MPI_SUM, cart_1d());
+    MPI_Allreduce(MPI_IN_PLACE, &m_pk_weight[0], static_cast<int>(m_pk_weight.size()),
+                  MPI_DOUBLE, MPI_SUM, cart_1d());
+
+    // scale power by weight; a bin nobody contributed to (weight < 1) gets
+    // weight 1 so the division is always defined
+    for (size_t i = 0; i < m_pk_weight.size(); ++i) {
+      if (m_pk_weight[i] < 1.0) {
+        m_pk_weight[i] += 1.0;
+      }
+      power[i] /= m_pk_weight[i];
+    }
+  }
+
+  ///
+  // General initialization
+  ///
+  void initialize(MPI_Comm comm, bool transposed_order = false)
+  {
+    // Sizes the three work buffers to local_size() and creates the FFT
+    // plans bound to them; also clears the "Green's function cached" flag.
+    //   comm              communicator handed to the MPI FFTW planners
+    //                     (not used by the pencil build)
+    //   transposed_order  FFTW3 slab build only: adds
+    //                     FFTW_MPI_TRANSPOSED_OUT / FFTW_MPI_TRANSPOSED_IN
+#if defined(FFTW3) && !defined(PENCIL)
+    fftw_mpi_init();
+#endif
+    m_greens_functions_initialized = false;
+    m_buf1.resize(local_size());
+    m_buf2.resize(local_size());
+    m_buf3.resize(local_size());
+
+#if defined(FFTW3) && defined(PENCIL)
+    (void) comm;
+    (void) transposed_order;
+
+    // number of 1-d transforms per batch = product of the two other extents
+    int const many_x = m_d.process_topology_2_x.n[1] * m_d.process_topology_2_x.n[2];
+    int const many_y = m_d.process_topology_2_y.n[0] * m_d.process_topology_2_y.n[2];
+    int const many_z = m_d.process_topology_2_z.n[1] * m_d.process_topology_2_z.n[0];
+
+    // The buffer pairs below must match the buf1/buf2/buf3 ping-pong in
+    // forward_solve() and backward_solve*().
+    m_plan_f_x = plan_pencil_axis(&(m_d.process_topology_2_x.n[0]), many_x,
+                                  FFTW_ADDR(m_buf1), FFTW_ADDR(m_buf2), FFTW_FORWARD);
+    m_plan_f_y = plan_pencil_axis(&(m_d.process_topology_2_y.n[1]), many_y,
+                                  FFTW_ADDR(m_buf2), FFTW_ADDR(m_buf1), FFTW_FORWARD);
+    m_plan_f_z = plan_pencil_axis(&(m_d.process_topology_2_z.n[2]), many_z,
+                                  FFTW_ADDR(m_buf1), FFTW_ADDR(m_buf2), FFTW_FORWARD);
+    m_plan_b_x = plan_pencil_axis(&(m_d.process_topology_2_x.n[0]), many_x,
+                                  FFTW_ADDR(m_buf1), FFTW_ADDR(m_buf3), FFTW_BACKWARD);
+    m_plan_b_y = plan_pencil_axis(&(m_d.process_topology_2_y.n[1]), many_y,
+                                  FFTW_ADDR(m_buf3), FFTW_ADDR(m_buf1), FFTW_BACKWARD);
+    m_plan_b_z = plan_pencil_axis(&(m_d.process_topology_2_z.n[2]), many_z,
+                                  FFTW_ADDR(m_buf1), FFTW_ADDR(m_buf3), FFTW_BACKWARD);
+#else
+    // create plan for forward and backward DFT's
+    int flags_f = FFTW_ESTIMATE;
+    int flags_b = FFTW_ESTIMATE;
+#if defined(FFTW3)
+    if (transposed_order) {
+      flags_f |= FFTW_MPI_TRANSPOSED_OUT;
+      flags_b |= FFTW_MPI_TRANSPOSED_IN;
+    }
+
+#if CERRILLOS_SLAB_HACK == 1
+    //attempt to make mpi fftw 3.3 work on more of cerrillos
+    //because of green's function application, 
+    //does not require changes to simulation code
+    flags_f |= FFTW_DESTROY_INPUT;
+    flags_b |= FFTW_DESTROY_INPUT;
+#endif
+
+    m_plan_f = fftw_mpi_plan_dft_3d(m_d.n[0], m_d.n[1], m_d.n[2],
+                                    FFTW_ADDR(m_buf1), FFTW_ADDR(m_buf2),
+                                    comm, FFTW_FORWARD, flags_f);
+    m_plan_b = fftw_mpi_plan_dft_3d(m_d.n[0], m_d.n[1], m_d.n[2],
+                                    FFTW_ADDR(m_buf1), FFTW_ADDR(m_buf3),
+                                    comm, FFTW_BACKWARD, flags_b);
+#else
+    m_plan_f = fftw3d_mpi_create_plan(comm, m_d.n[0], m_d.n[1], m_d.n[2], FFTW_FORWARD, flags_f);
+    m_plan_b = fftw3d_mpi_create_plan(comm, m_d.n[0], m_d.n[1], m_d.n[2], FFTW_BACKWARD, flags_b);
+#endif
+#endif
+
+  }
+
+#if defined(FFTW3) && defined(PENCIL)
+  ///
+  // Plan one batch of contiguous, unit-stride 1-d complex DFTs.
+  //   n             points at the transform length; consecutive
+  //                 transforms are *n elements apart in both buffers
+  //   howmany       number of transforms in the batch
+  //   in, out       buffers the plan is bound to
+  //   sign          FFTW_FORWARD or FFTW_BACKWARD
+  // The planner flags argument is the literal 0, exactly as in the
+  // hand-written calls this helper replaces.
+  ///
+  static fftw_plan plan_pencil_axis(const int *n, int howmany,
+                                    fftw_complex *in, fftw_complex *out,
+                                    int sign)
+  {
+    return fftw_plan_many_dft(1,            // rank
+                              n, howmany,
+                              in, NULL,     // inembed
+                              1, *n,        // istride, idist
+                              out, NULL,    // onembed
+                              1, *n,        // ostride, odist
+                              sign,
+                              0);           // unsigned flags
+  }
+#endif
+  
+  ///
+  // Solve for the potential by applying the Green's function to the
+  // density (in k-space)
+  //   rho           density (input)
+  //   phi           potential (output)
+  ///
+  void kspace_solve(const complex_t *rho, complex_t *phi)
+  {
+    // Element-wise phi = m_green * rho over this rank's local block.
+    // m_green is laid out with the same padded indexing as rho and phi,
+    // so a single running index addresses all three and no global
+    // wavenumber is needed.
+    initialize_greens_function();
+
+    int local_dim[3];
+#if defined(FFTW3) && defined(PENCIL)
+    local_dim[0]=local_ng_2d_z(0);
+    local_dim[1]=local_ng_2d_z(1);
+    local_dim[2]=local_ng_2d_z(2);
+#else
+    local_dim[0]=local_ng_1d(0);
+    local_dim[1]=local_ng_1d(1);
+    local_dim[2]=local_ng_1d(2);
+#endif
+
+    int index = 0;
+    for (int local_k0 = 0; local_k0 < local_dim[0]; ++local_k0) {
+      for (int local_k1 = 0; local_k1 < local_dim[1]; ++local_k1) {
+        for (int local_k2 = 0; local_k2 < local_dim[2]; ++local_k2) {
+          phi[index] = m_green[index] * rho[index];
+          index++;
+        }
+        index += m_d.padding[2]; // skip padding after each innermost row
+      }
+      index += m_d.padding[1];   // skip padding after each plane
+    }
+  }
+  
+  ///
+  // Solve for the gradient of the potential along the given axis by
+  // applying the derivative Green's function to the density (in
+  // k-space)
+  //   axis          the axis along which to take the gradient
+  //   rho           density (input)
+  //   grad_phi      the gradient of the potential (output)
+  ///
+  void kspace_solve_gradient(int axis, const complex_t *rho, complex_t *grad_phi)
+  {
+    // Element-wise grad_phi = I * (-m_gradient[k_axis]) * m_green * rho
+    // over this rank's local block, where k_axis is the global index of
+    // the element along the requested axis.
+
+    // axis selects an entry of the 3-element k[] below; any other value
+    // would read outside the array.
+    assert(axis >= 0 && axis < 3);
+
+    initialize_greens_function();
+
+    int local_dim[3];
+    int self_coord[3];
+#if defined(FFTW3) && defined(PENCIL)
+    self_coord[0]=self_2d_z(0);
+    self_coord[1]=self_2d_z(1);
+    self_coord[2]=self_2d_z(2);
+    local_dim[0]=local_ng_2d_z(0);
+    local_dim[1]=local_ng_2d_z(1);
+    local_dim[2]=local_ng_2d_z(2);
+#else
+    self_coord[0]=self_1d(0);
+    self_coord[1]=self_1d(1);
+    self_coord[2]=self_1d(2);
+    local_dim[0]=local_ng_1d(0);
+    local_dim[1]=local_ng_1d(1);
+    local_dim[2]=local_ng_1d(2);
+#endif
+
+    int k[3];
+    int index = 0;
+    for (int local_k0 = 0; local_k0 < local_dim[0]; ++local_k0) {
+      k[0] = local_k0 + self_coord[0] * local_dim[0];
+      for (int local_k1 = 0; local_k1 < local_dim[1]; ++local_k1) {
+        k[1] = local_k1 + self_coord[1] * local_dim[1];
+        for (int local_k2 = 0; local_k2 < local_dim[2]; ++local_k2) {
+          k[2] = local_k2 + self_coord[2] * local_dim[2];
+          grad_phi[index] = I * (- m_gradient[k[axis]]) * m_green[index] * rho[index];
+          index++;
+        }
+        index += m_d.padding[2]; // skip padding after each innermost row
+      }
+      index += m_d.padding[1];   // skip padding after each plane
+    }
+  }
+  
+  ///
+  // Allocate and pre-calculate isotropic Green's function and
+  // gradient operator.
+  ///
+  virtual void initialize_greens_function() = 0;
+  
+protected:
+  // Larger of a and b; returns b whenever the comparison a > b is false.
+  double max(double a, double b) { return a > b ? a : b; }
+  std::vector<double> m_green; // green's function
+  std::vector<double> m_gradient; //imaginary part of the gradient in grid units
+  std::vector<double> m_pk_cic;    // per-axis anti-CIC factors cached by power_spectrum()
+  std::vector<double> m_pk_weight; // per-bin weights accumulated by power_spectrum()
+  std::vector<int>    m_pk_ksq;    // squared periodic wavenumbers cached by power_spectrum()
+  // Work buffers, each resized to local_size() in initialize().
+  // forward_solve() leaves its k-space result in m_buf2.
+  std::vector<complex_t, bigchunk_allocator<complex_t> > m_buf1;
+  std::vector<complex_t, bigchunk_allocator<complex_t> > m_buf2;
+  std::vector<complex_t, bigchunk_allocator<complex_t> > m_buf3;
+  // FFT plans created in initialize() and destroyed in the destructor:
+  // one forward/backward plan per axis in the pencil build, otherwise a
+  // single forward/backward pair.
+#if defined(FFTW3) && defined(PENCIL)
+  fftw_plan m_plan_f_x;
+  fftw_plan m_plan_f_y;
+  fftw_plan m_plan_f_z;
+  fftw_plan m_plan_b_x;
+  fftw_plan m_plan_b_y;
+  fftw_plan m_plan_b_z;
+#elif defined(FFTW3)
+  fftw_plan m_plan_f;
+  fftw_plan m_plan_b;
+#else
+  fftwnd_mpi_plan m_plan_f;
+  fftwnd_mpi_plan m_plan_b;
+#endif
+  // true once a derived initialize_greens_function() has filled
+  // m_green / m_gradient; reset to false by initialize()
+  bool m_greens_functions_initialized;
+};
+};
+
+
+///
+//  Poisson solver using a 2nd-order discrete Green's function, and a
+//  2nd-order derivative.
+//
+//  G(k) = 1 / (2 * (Sum_i cos(2 pi k_i / n) - 3))
+//  (D_i f)(k) =  ( - i * sin(2 pi k / n) / (2 pi / n) )* f(k)
+///
+class SolverDiscrete : public SolverBase {
+
+public:
+
+  // Solver for an ng-sized grid on comm.
+  SolverDiscrete(MPI_Comm comm, int ng)
+    : SolverBase(comm, ng)
+  {
+  }
+
+  // Solver for the grid dimensions in n on comm.
+  SolverDiscrete(MPI_Comm comm, std::vector<int> n)
+    : SolverBase(comm, n)
+  {
+  }
+
+  ///
+  // Fill m_green with the 2nd-order discrete Green's function
+  //   0.5 / (global_size() * (Sum_i cos(2 pi k_i / n) - 3))
+  // over this rank's local block, and m_gradient with sin(2 pi k / n).
+  // Runs once; later calls return immediately.
+  ///
+  void initialize_greens_function()
+  {
+    if (m_greens_functions_initialized) {
+      return;
+    }
+    m_greens_functions_initialized = true;
+
+    int const ng = m_d.n[0];
+    m_green.resize(local_size());
+    m_gradient.resize(ng);
+    std::vector<double> cos_table;
+    cos_table.resize(ng);
+
+    // one cosine and one sine per grid frequency
+    double const dk = 2.0 * pi / (double) ng;
+    for (int i = 0; i < ng; ++i) {
+      double const angle = i * dk;
+      cos_table[i] = cos(angle);
+      m_gradient[i] = sin(angle); // imaginary part of gradient
+    }
+
+    // extent of the local block and global index of its first element
+    int extent[3];
+    int origin[3];
+#if defined(FFTW3) && defined(PENCIL)
+    for (int d = 0; d < 3; ++d) origin[d] = self_2d_z(d);
+    for (int d = 0; d < 3; ++d) extent[d] = local_ng_2d_z(d);
+#else
+    for (int d = 0; d < 3; ++d) origin[d] = self_1d(d);
+    for (int d = 0; d < 3; ++d) extent[d] = local_ng_1d(d);
+#endif
+    for (int d = 0; d < 3; ++d) origin[d] *= extent[d];
+
+    double const scale = 0.5 / double(global_size());
+    int slot = 0;
+    for (int i0 = 0; i0 < extent[0]; ++i0) {
+      int const g0 = origin[0] + i0;
+      for (int i1 = 0; i1 < extent[1]; ++i1) {
+        int const g1 = origin[1] + i1;
+        for (int i2 = 0; i2 < extent[2]; ++i2) {
+          int const g2 = origin[2] + i2;
+          m_green[slot] = scale / (cos_table[g0] + cos_table[g1] + cos_table[g2] - 3.0);
+          ++slot;
+        }
+        slot += m_d.padding[2];
+      }
+      slot += m_d.padding[1];
+    }
+
+    // handle the pole: the denominator vanishes where all three cosines are 1
+    if (self() == 0) {
+      m_green[0] = 0.0;
+    }
+  }
+};
+
+
+///
+//  Poisson solver using a 6th-order discrete Green's function with a
+//  Gaussian noise-quieting filter function, and a 4th-order
+//  derivative.
+//
+//    G(k) = W(k) * 45/128 / Sum_i [ cos (2 pi k / n)
+//                                 - 5/64 cos(4 pi k / n)
+//                                 + 1/1024 cos(8 pi k_i / n)
+//                                 - 945/1024 ]
+//  or
+//    G(k) = W(k) / Sum_i ( a0 + a1 cos (2 pi k / n)
+//                             + a2 cos (4 pi k / n)
+//                             + a3 cos (8 pi k / n) )
+//  where
+//     a0 = -21/8
+//     a1 =  128/45
+//     a2 = -2/9
+//     a3 =  1/360
+//  satisfying
+//     a0 + a1 + a2 + a3 = 0
+//     (a1 + 4 a2 + 16 a3) = 2
+//  and
+//    W(k) = exp( - k^2 sigma^2 / 4) [ sin(k/2) / (k/2)]^n_s
+//    sigma = 0.8
+//    n_s = 3
+//
+//  The gradient operator (imaginary part, grid units) is
+//    (gradient f)(k) = [ b1 sin(2 pi k / n) + b2 sin(4 pi k / n) ] f(k)
+//  where
+//     b1 = 4/3
+//     b2 = -1/6
+//  satisfying
+//     b1 + 2 b2 = 1
+//     b1 + 8 b2 = 0
+///
+class SolverQuiet : public SolverBase {
+
+public:
+
+  // Solver for an ng-sized grid on comm.
+  SolverQuiet(MPI_Comm comm, int ng)
+    : SolverBase(comm, ng)
+  {
+  }
+
+  // Solver for the grid dimensions in n on comm.
+  SolverQuiet(MPI_Comm comm, std::vector<int> n)
+    : SolverBase(comm, n)
+  {
+  }
+
+  ///
+  // Allocate and pre-calculate the filtered Green's function (1D or 2D
+  // distribution) and the gradient factors:
+  //   m_green    = W(k0) W(k1) W(k2)
+  //                / (global_size() * Sum_i (a0 + a1 cos(k_i)
+  //                                             + a2 cos(2 k_i)
+  //                                             + a3 cos(4 k_i)))
+  //   m_gradient = b1 sin(k) + b2 sin(2 k)
+  // with k in radians per grid cell.  Runs once; later calls return
+  // immediately.
+  ///
+  void initialize_greens_function()
+  {
+    double kstep;
+    int index;
+    int k[3];
+    std::vector<double> c1;
+    std::vector<double> c2;
+    std::vector<double> c3;
+    std::vector<double> filter;
+    std::vector<double> kperiodic;
+    double const a0 = - 21.0 / 8.0;
+    double const a1 =   128.0 / 45.0;
+    double const a2 = - 2.0 / 9.0;
+    double const a3 =   1.0 / 360.0;
+    double const b1 =   4.0 / 3.0;
+    double const b2 = - 1.0 / 6.0;
+    double const sigma = 0.8;
+    double const ns = 3.0;
+    int ng = m_d.n[0];
+
+    if (m_greens_functions_initialized) {
+      return;
+    }
+
+    // check Taylor series coefficent conditions
+    assert(fabs(a0 + a1 + a2 + a3) < 1.0e-12);
+    assert(fabs(a1 + 4 * a2 + 16 * a3 - 2.0) < 1.0e-12);
+    assert(fabs(b1 + 2 * b2 - 1.0) < 1.0e-12);
+    assert(fabs(b1 + 8 * b2) < 1.0e-12);
+
+    m_greens_functions_initialized = true;
+
+    m_green.resize(local_size());
+    m_gradient.resize(ng);
+    c1.resize(ng);
+    c2.resize(ng);
+    c3.resize(ng);
+    kperiodic.resize(ng);
+    filter.resize(ng);
+
+    // cache k array with the correct periodicity
+    // cache trigonometric factors
+    kstep = 2.0 * pi / static_cast<double>(ng);
+    for (int kk = 0; kk < ng; ++kk) {
+      c1[kk] = cos(kk * kstep);
+      c2[kk] = cos(2 * kk * kstep);
+      c3[kk] = cos(4 * kk * kstep);
+      if (kk < ng / 2) {
+        kperiodic[kk] = kk * kstep;
+        kperiodic[kk + ng / 2] = (kk - ng / 2) * kstep;
+      }
+    }
+
+    // cache Green's function filter, and 4th order k-space gradient operator
+    // The k = 0 entries are set by hand.  The filter's sin(x)/x factor is
+    // 0/0 there and takes its limit, 1.  The gradient operator
+    // b1 sin(k) + b2 sin(2 k) is exactly 0 at k = 0, matching the other
+    // solvers: a mode that is constant along an axis has no gradient
+    // along it.
+    filter[0] = 1.0;
+    m_gradient[0] = 0.0;
+    for (int kk = 1; kk < ng; ++kk) {
+      filter[kk] = exp(- 0.25 * sigma * sigma * kperiodic[kk] * kperiodic[kk])
+        * pow(sin(0.5 * kperiodic[kk]) / (0.5 * kperiodic[kk]), ns);
+      m_gradient[kk] = (b1 * sin(kperiodic[kk]) + b2 * sin(2 * kperiodic[kk]));
+    }
+
+    // cache isotropic Green's function (1D or 2D distribution)
+    int local_dim[3];
+    int self_coord[3];
+#if defined(FFTW3) && defined(PENCIL)
+    self_coord[0]=self_2d_z(0);
+    self_coord[1]=self_2d_z(1);
+    self_coord[2]=self_2d_z(2);
+    local_dim[0]=local_ng_2d_z(0);
+    local_dim[1]=local_ng_2d_z(1);
+    local_dim[2]=local_ng_2d_z(2);
+#else
+    self_coord[0]=self_1d(0);
+    self_coord[1]=self_1d(1);
+    self_coord[2]=self_1d(2);
+    local_dim[0]=local_ng_1d(0);
+    local_dim[1]=local_ng_1d(1);
+    local_dim[2]=local_ng_1d(2);
+#endif
+    index = 0;
+    double coeff = 1.0 / double(global_size());
+    for (int local_k0 = 0; local_k0 < local_dim[0]; ++local_k0) {
+      k[0] = local_k0 + self_coord[0] * local_dim[0];
+      double d0 = a0 + a1 * c1[k[0]] + a2 * c2[k[0]] + a3 * c3[k[0]];
+      for (int local_k1 = 0; local_k1 < local_dim[1]; ++local_k1) {
+        k[1] = local_k1 + self_coord[1] * local_dim[1];
+        double d1 = a0 + a1 * c1[k[1]] + a2 * c2[k[1]] + a3 * c3[k[1]];
+        for (int local_k2 = 0; local_k2 < local_dim[2]; ++local_k2) {
+          k[2] = local_k2 + self_coord[2] * local_dim[2];
+          double filt = coeff * filter[k[0]] * filter[k[1]] * filter[k[2]];
+          double d2 = a0 + a1 * c1[k[2]] + a2 * c2[k[2]] + a3 * c3[k[2]];
+          m_green[index] = filt / (d0 + d1 + d2);
+          index++;
+        }
+        index += m_d.padding[2];
+      }
+      index += m_d.padding[1];
+    }
+
+    // handle the pole: d0 + d1 + d2 vanishes at k = 0 (a0+a1+a2+a3 = 0)
+    if (self() == 0) {
+      m_green[0] = 0.0;
+    }
+  }
+};
+
+
+///
+//  Poisson solver class using continuum Green's function:
+//    - 1 / k^2
+///
+class SolverContinuum : public SolverBase {
+
+public:
+
+  // Pure virtual on purpose: keeps this class abstract, so it cannot be
+  // instantiated directly.
+  virtual void this_class_is_not_yet_tested_and_should_not_be_used() = 0;
+
+  // Solver for an ng-sized grid on comm.
+  SolverContinuum(MPI_Comm comm, int ng)
+    : SolverBase(comm, ng)
+  {
+  }
+
+  // Solver for the grid dimensions in n on comm.
+  SolverContinuum(MPI_Comm comm, std::vector<int> n)
+    : SolverBase(comm, n)
+  {
+  }
+
+  ///
+  // Fill m_gradient with the periodic wavenumbers and m_green with
+  //   -1 / (global_size() * (k0^2 + k1^2 + k2^2))
+  // over this rank's local block.  Runs once; later calls return
+  // immediately.
+  ///
+  void initialize_greens_function()
+  {
+    if (m_greens_functions_initialized) {
+      return;
+    }
+    m_greens_functions_initialized = true;
+
+    int const ng = m_d.n[0];
+    int const half = ng / 2;
+    m_green.resize(local_size());
+    m_gradient.resize(ng);
+
+    // cache imaginary part of gradient, imposing symmetries by hand:
+    // the upper half of the table holds the negative frequencies
+    double const dk = 2.0 * pi / (double) ng;
+    for (int i = 0; i < half; ++i) {
+      m_gradient[i] = i * dk;
+      m_gradient[i + half] = (i - half) * dk;
+    }
+
+    // extent of the local block and global index of its first element
+    int extent[3];
+    int origin[3];
+#if defined(FFTW3) && defined(PENCIL)
+    for (int d = 0; d < 3; ++d) origin[d] = self_2d_z(d);
+    for (int d = 0; d < 3; ++d) extent[d] = local_ng_2d_z(d);
+#else
+    for (int d = 0; d < 3; ++d) origin[d] = self_1d(d);
+    for (int d = 0; d < 3; ++d) extent[d] = local_ng_1d(d);
+#endif
+    for (int d = 0; d < 3; ++d) origin[d] *= extent[d];
+
+    double const scale = -1.0 / double(global_size());
+    int slot = 0;
+    for (int i0 = 0; i0 < extent[0]; ++i0) {
+      double const g0 = m_gradient[origin[0] + i0];
+      double const sq0 = g0 * g0;
+      for (int i1 = 0; i1 < extent[1]; ++i1) {
+        double const g1 = m_gradient[origin[1] + i1];
+        double const sq1 = g1 * g1;
+        for (int i2 = 0; i2 < extent[2]; ++i2) {
+          double const g2 = m_gradient[origin[2] + i2];
+          double const sq2 = g2 * g2;
+          m_green[slot] = scale / (sq0 + sq1 + sq2);
+          ++slot;
+        }
+        slot += m_d.padding[2];
+      }
+      slot += m_d.padding[1];
+    }
+
+    // handle the pole: k^2 is zero for the k = 0 mode
+    if (self() == 0) {
+      m_green[0] = 0.0;
+    }
+  }
+};
+
+#endif
diff --git a/src/halo-finder/src/dfft/subarray.c b/src/halo-finder/src/dfft/subarray.c
new file mode 100644
index 0000000..ecc97af
--- /dev/null
+++ b/src/halo-finder/src/dfft/subarray.c
@@ -0,0 +1,58 @@
+// This is an implementation of MPI subarrays in terms of MPI struct types,
+// used to work around a potential bug in Open MPI's implementation.
+
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/*
+ * Build a subarray datatype out of nested MPI struct types.
+ *
+ * A duplicate of oldtype is wrapped once per dimension, from the last to the
+ * first: MPI_LB at 0, array_of_subsizes[i] copies of the inner type starting
+ * at extent * array_of_starts[i], and MPI_UB at extent * array_of_sizes[i].
+ * The order argument is never consulted. The result is committed into
+ * *newtype. Returns 0, or MPI_ERR_NO_MEM when the scratch array cannot be
+ * allocated. Return codes of the MPI calls made here are not checked.
+ */
+int MPI_Type_create_subarray(int ndims,
+                             int array_of_sizes[],
+                             int array_of_subsizes[],
+                             int array_of_starts[],
+                             int order,
+                             MPI_Datatype oldtype,
+                             MPI_Datatype *newtype)
+{
+    MPI_Datatype *t;
+    MPI_Aint lb;
+    MPI_Aint extent;
+
+    /* t[ndims] is the duplicated base type; t[i] is the type for dimension i */
+    t = malloc((ndims + 1) * sizeof *t);
+    if (!t) {
+        perror("out of memory");
+        return MPI_ERR_NO_MEM;  /* do not fall through and write via NULL */
+    }
+    MPI_Type_get_extent(oldtype, &lb, &extent);
+    MPI_Type_dup(oldtype, &t[ndims]);
+    for (int i = ndims - 1; i >= 0; --i) {
+        int blocklength[3] = { 1, array_of_subsizes[i], 1 };
+        MPI_Aint displacement[3] = { 0,
+                                     extent * array_of_starts[i],
+                                     extent * array_of_sizes[i] };
+        MPI_Datatype type[3] = { MPI_LB, t[i + 1], MPI_UB };
+
+        MPI_Type_create_struct(3, blocklength, displacement, type, &t[i]);
+        /* one element of the next outer dimension spans this whole dimension */
+        extent *= array_of_sizes[i];
+    }
+    MPI_Type_dup(t[0], newtype);
+    MPI_Type_commit(newtype);
+    /* release every intermediate handle, including the duplicated base type */
+    for (int i = 0; i < (ndims + 1); ++i) {
+        MPI_Type_free(&t[i]);
+    }
+    free(t);
+
+    return 0;
+}
diff --git a/src/halo-finder/src/dims-local.c b/src/halo-finder/src/dims-local.c
new file mode 100644
index 0000000..3ac1a95
--- /dev/null
+++ b/src/halo-finder/src/dims-local.c
@@ -0,0 +1,47 @@
+#include "dims.h"
+#include <assert.h>
+
+#define DIMENSION 3
+
+static int dims_init=0;  /* becomes 1 after MY_Dims_init_3D has stored a grid */
+static int dims_dims[DIMENSION] = {0,0,0};  /* grid stored by MY_Dims_init_3D; read by MY_Dims_create_3D */
+
+/* Record a caller-chosen 3D decomposition for later MY_Dims_create_3D calls.
+ * dims must hold ndim (== DIMENSION) entries whose product equals nnodes;
+ * both conditions are enforced with assert. Always returns 0. */
+int MY_Dims_init_3D(int nnodes, int ndim, int *dims) {
+  int i, tmp_nnodes=1;
+
+  assert(ndim == DIMENSION);
+
+  for(i=0; i<ndim; i++) {
+    dims_dims[i] = dims[i];
+    tmp_nnodes *= dims[i];
+  }
+  /* compare, not assign: the product of dims must match the node count */
+  assert(tmp_nnodes == nnodes);
+  dims_init = 1;
+  return 0;
+}
+
+/* Fill dims with a 3D process grid for nnodes ranks; dims[0..2] must arrive as 0.
+ * Copies the grid stored by MY_Dims_init_3D (returning 0) if there is one; otherwise
+ * returns MPI_Dims_create's result, or -1 (dims untouched) under USE_SERIAL_COSMO. */
+int MY_Dims_create_3D(int nnodes, int ndim, int *dims) {
+  int axis;
+
+  assert(ndim == DIMENSION);
+  assert(dims[0] == 0);
+  assert(dims[1] == 0);
+  assert(dims[2] == 0);
+  if(dims_init != 0) {
+    for(axis=0; axis<ndim; axis++)
+      dims[axis] = dims_dims[axis];
+    return 0;
+  }
+#ifndef USE_SERIAL_COSMO
+  return MPI_Dims_create(nnodes, ndim, dims);
+#else
+  return -1;
+#endif
+}
diff --git a/src/halo-finder/src/include.mk b/src/halo-finder/src/include.mk
new file mode 100644
index 0000000..eabfa72
--- /dev/null
+++ b/src/halo-finder/src/include.mk
@@ -0,0 +1,16 @@
+HF_HOME := ../halo_finder
+#HF_TYPE_FLAGS := -DID_32 -DPOSVEL_32 -DGRID_32
+#HF_TYPE_FLAGS := -DID_64 -DPOSVEL_64 -DGRID_64
+HF_TYPE_FLAGS := -DID_64 -DPOSVEL_32 -DGRID_32 -DLONG_INTEGER
+HF_HEADERS := ${HF_HOME}/Definition.h
+HF_HEADERS += ${HF_HOME}/Partition.h
+HF_HEADERS += ${HF_HOME}/ParticleExchange.h
+HF_HEADERS += ${HF_HOME}/InitialExchange.h
+HF_HEADERS += ${HF_HOME}/GridExchange.h
+HF_HEADERS += ${HF_HOME}/ParticleDistribute.h
+HF_HEADERS += ${HF_HOME}/CosmoHaloFinderP.h
+HF_HEADERS += ${HF_HOME}/FOFHaloProperties.h
+#HF_WARNING := -Wmissing-noreturn -Wunused -Wsign-compare -Wshadow -Wformat
+HF_CFLAGS := -I${HF_HOME} ${HF_TYPE_FLAGS} ${HF_WARNING}
+HF_CXXFLAGS := -I${HF_HOME} ${HF_TYPE_FLAGS} ${HF_WARNING}
+HF_LDFLAGS := -L${HF_HOME}/${HACC_OBJDIR}
diff --git a/src/halo-finder/src/log.txt b/src/halo-finder/src/log.txt
new file mode 100644
index 0000000..7af0f11
--- /dev/null
+++ b/src/halo-finder/src/log.txt
@@ -0,0 +1,1864 @@
+make: Warning: Archive 'hip/libparticle.a' seems to have been created in deterministic mode. 'hip/ParticleDistribute.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+/opt/rocm/bin/hipcc -Xclang -pthread -I../dfft -DPENCIL=1 -O3 -g -DRCB_UNTHREADED_BUILD -DUSE_SERIAL_COSMO -fopenmp -v -I/usr/lib/openmpi/include -I/opt/rocm/hip/include -I/opt/rocm/hcc-1.0/include -DID_64 -DPOSVEL_32 -DGRID_32 -DLONG_INTEGER  -I. -I../dfft -c -o hip/ParticleDistribute.o ParticleDistribute.cxx
+HCC clang version 6.0.0  (based on HCC 1.0.17412-f590a25-821e6d8-64e7fc7 )
+Target: x86_64-unknown-linux-gnu
+Thread model: posix
+InstalledDir: /opt/rocm/hcc/bin
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6.0.0
+Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Candidate multilib: .;@m64
+Candidate multilib: 32;@m32
+Candidate multilib: x32;@mx32
+Selected multilib: .;@m64
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_HC__=1 -D__HCC_HC__=1 -D__KALMAR_CPU__=1 -D__HCC_CPU__=1 -triple x86_64-unknown-linux-gnu -S -disable-free -disable-llvm-verifier -main-file-name ParticleDistribute.cxx -mrelocation-model static -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debug-info-kind=limited -dwarf-version=4 -debugger-tuning=gdb -momit-leaf-frame-pointer -v -coverage-notes-file /hacc/src/halo_finder/hip/ParticleDistribute.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -D __HIPCC__ -I /opt/rocm/hcc/include -I /opt/rocm/hip/include/hip/hcc_detail/cuda -I /opt/rocm/hsa/include -I /opt/rocm/profiler/CXLActivityLogger/include -I /opt/rocm/hip/include -D HIP_VERSION_MAJOR=1 -D HIP_VERSION_MINOR=3 -D HIP_VERSION_PATCH=17385 -D __HIP_ARCH_GFX801__=1 -I ../dfft -D PENCIL=1 -D RCB_UNTHREADED_BUILD -D USE_SERIAL_COSMO -I /usr/lib/openmpi/include -I /opt/rocm/hip/include -I /opt/rocm/hcc-1.0/include -D ID_64 -D POSVEL_32 -D GRID_32 -D LONG_INTEGER -I . -I ../dfft -I/usr/include -I/usr/include -I. 
-internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward -internal-isystem /usr/local/include -internal-isystem /opt/rocm/hcc-1.0/lib/clang/6.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-deprecated-register -std=c++amp -fdeprecated-macro -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fcxx-exceptions -fexceptions -fdiagnostics-show-option -vectorize-loops -vectorize-slp -pthread -fhsa-ext -o /tmp/ParticleDistribute-c9a3f8.s -x hc-host ParticleDistribute.cxx -emit-llvm-bc
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ignoring nonexistent directory "/include"
+ignoring duplicate directory "/opt/rocm/hip/include"
+ignoring duplicate directory "/opt/rocm/hcc/include"
+ignoring duplicate directory "../dfft"
+ignoring duplicate directory "/usr/include"
+ignoring duplicate directory "."
+ignoring duplicate directory "/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0"
+ignoring duplicate directory "/usr/include"
+  as it is a non-system directory that duplicates a system directory
+#include "..." search starts here:
+#include <...> search starts here:
+ /opt/rocm/hcc/include
+ /opt/rocm/hip/include/hip/hcc_detail/cuda
+ /opt/rocm/hsa/include
+ /opt/rocm/profiler/CXLActivityLogger/include
+ /opt/rocm/hip/include
+ ../dfft
+ /usr/lib/openmpi/include
+ .
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward
+ /usr/local/include
+ /opt/rocm/hcc-1.0/lib/clang/6.0.0/include
+ /usr/include/x86_64-linux-gnu
+ /usr/include
+End of search list.
+ "/opt/rocm/hcc/bin/hc-host-assemble" /tmp/ParticleDistribute-c9a3f8.s hip/ParticleDistribute.o -D__HIPCC__ -Wno-deprecated-register -DHIP_VERSION_MAJOR=1 -DHIP_VERSION_MINOR=3 -DHIP_VERSION_PATCH=17385 -D__HIP_ARCH_GFX801__=1 -DPENCIL=1 -O3 -g -DRCB_UNTHREADED_BUILD -DUSE_SERIAL_COSMO -fopenmp -v -DID_64 -DPOSVEL_32 -DGRID_32 -DLONG_INTEGER
+HCC clang version 6.0.0  (based on HCC 1.0.17412-f590a25-821e6d8-64e7fc7 )
+Target: x86_64-unknown-linux-gnu
+Thread model: posix
+InstalledDir: /opt/rocm/hcc/bin
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6.0.0
+Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Candidate multilib: .;@m64
+Candidate multilib: 32;@m32
+Candidate multilib: x32;@mx32
+Selected multilib: .;@m64
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_CPU__=1 -D__HCC_CPU__=1 -triple x86_64-unknown-linux-gnu -emit-obj -disable-free -disable-llvm-verifier -discard-value-names -main-file-name ParticleDistribute-c9a3f8.s.bc -mrelocation-model static -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debug-info-kind=limited -dwarf-version=4 -debugger-tuning=gdb -momit-leaf-frame-pointer -v -coverage-notes-file /hacc/src/halo_finder/hip/ParticleDistribute.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -O3 -Wno-deprecated-register -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -o hip/ParticleDistribute.o -x ir /tmp/ParticleDistribute-c9a3f8.s.bc
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_HC__=1 -D__HCC_HC__=1 -D__GPU__=1 -D__KALMAR_ACCELERATOR__=1 -D__HCC_ACCELERATOR__=1 -famp-is-device -fno-builtin -fno-common -O2 -triple amdgcn--amdhsa-hcc -aux-triple x86_64-unknown-linux-gnu -S -disable-free -disable-llvm-verifier -main-file-name ParticleDistribute.cxx -mrelocation-model static -mthread-model posix -mdisable-fp-elim -fmath-errno -no-integrated-as -mconstructor-aliases -v -coverage-notes-file /hacc/src/halo_finder/hip/ParticleDistribute.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -D __HIPCC__ -I /opt/rocm/hcc/include -I /opt/rocm/hip/include/hip/hcc_detail/cuda -I /opt/rocm/hsa/include -I /opt/rocm/profiler/CXLActivityLogger/include -I /opt/rocm/hip/include -D HIP_VERSION_MAJOR=1 -D HIP_VERSION_MINOR=3 -D HIP_VERSION_PATCH=17385 -D __HIP_ARCH_GFX801__=1 -I ../dfft -D PENCIL=1 -D RCB_UNTHREADED_BUILD -D USE_SERIAL_COSMO -I /usr/lib/openmpi/include -I /opt/rocm/hip/include -I /opt/rocm/hcc-1.0/include -D ID_64 -D POSVEL_32 -D GRID_32 -D LONG_INTEGER -I . -I ../dfft -I/usr/include -I/usr/include -I. 
-internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/amdgcn--amdhsa-hcc/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward -internal-isystem /usr/local/include -internal-isystem /opt/rocm/hcc-1.0/lib/clang/6.0.0/include -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-deprecated-register -std=c++amp -fdeprecated-macro -fno-dwarf-directory-asm -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fcxx-exceptions -fexceptions -fdiagnostics-show-option -vectorize-slp -pthread -fhsa-ext -o /tmp/ParticleDistribute-4a379d.s -x hc-kernel ParticleDistribute.cxx -emit-llvm-bc
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/amdgcn--amdhsa-hcc/c++/5.4.0"
+ignoring nonexistent directory "/include"
+ignoring duplicate directory "/opt/rocm/hip/include"
+ignoring duplicate directory "/opt/rocm/hcc/include"
+ignoring duplicate directory "../dfft"
+ignoring duplicate directory "/usr/include"
+ignoring duplicate directory "."
+ignoring duplicate directory "/usr/include"
+  as it is a non-system directory that duplicates a system directory
+ignoring duplicate directory "/usr/local/include"
+ignoring duplicate directory "/opt/rocm/hcc-1.0/lib/clang/6.0.0/include"
+ignoring duplicate directory "/usr/include"
+#include "..." search starts here:
+#include <...> search starts here:
+ /opt/rocm/hcc/include
+ /opt/rocm/hip/include/hip/hcc_detail/cuda
+ /opt/rocm/hsa/include
+ /opt/rocm/profiler/CXLActivityLogger/include
+ /opt/rocm/hip/include
+ ../dfft
+ /usr/lib/openmpi/include
+ .
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward
+ /usr/local/include
+ /opt/rocm/hcc-1.0/lib/clang/6.0.0/include
+ /usr/include
+End of search list.
+ "/opt/rocm/hcc/bin/hc-kernel-assemble" /tmp/ParticleDistribute-4a379d.s hip/ParticleDistribute.o
+ar rv hip/libparticle.a hip/ParticleDistribute.o
+r - hip/ParticleDistribute.o
+make: Warning: Archive 'hip/libparticle.a' seems to have been created in deterministic mode. 'hip/ParticleDistribute.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+make: Warning: Archive 'hip/libparticle.a' seems to have been created in deterministic mode. 'hip/ParticleExchange.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+/opt/rocm/bin/hipcc -Xclang -pthread -I../dfft -DPENCIL=1 -O3 -g -DRCB_UNTHREADED_BUILD -DUSE_SERIAL_COSMO -fopenmp -v -I/usr/lib/openmpi/include -I/opt/rocm/hip/include -I/opt/rocm/hcc-1.0/include -DID_64 -DPOSVEL_32 -DGRID_32 -DLONG_INTEGER  -I. -I../dfft -c -o hip/ParticleExchange.o ParticleExchange.cxx
+HCC clang version 6.0.0  (based on HCC 1.0.17412-f590a25-821e6d8-64e7fc7 )
+Target: x86_64-unknown-linux-gnu
+Thread model: posix
+InstalledDir: /opt/rocm/hcc/bin
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6.0.0
+Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Candidate multilib: .;@m64
+Candidate multilib: 32;@m32
+Candidate multilib: x32;@mx32
+Selected multilib: .;@m64
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_HC__=1 -D__HCC_HC__=1 -D__KALMAR_CPU__=1 -D__HCC_CPU__=1 -triple x86_64-unknown-linux-gnu -S -disable-free -disable-llvm-verifier -main-file-name ParticleExchange.cxx -mrelocation-model static -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debug-info-kind=limited -dwarf-version=4 -debugger-tuning=gdb -momit-leaf-frame-pointer -v -coverage-notes-file /hacc/src/halo_finder/hip/ParticleExchange.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -D __HIPCC__ -I /opt/rocm/hcc/include -I /opt/rocm/hip/include/hip/hcc_detail/cuda -I /opt/rocm/hsa/include -I /opt/rocm/profiler/CXLActivityLogger/include -I /opt/rocm/hip/include -D HIP_VERSION_MAJOR=1 -D HIP_VERSION_MINOR=3 -D HIP_VERSION_PATCH=17385 -D __HIP_ARCH_GFX801__=1 -I ../dfft -D PENCIL=1 -D RCB_UNTHREADED_BUILD -D USE_SERIAL_COSMO -I /usr/lib/openmpi/include -I /opt/rocm/hip/include -I /opt/rocm/hcc-1.0/include -D ID_64 -D POSVEL_32 -D GRID_32 -D LONG_INTEGER -I . -I ../dfft -I/usr/include -I/usr/include -I. 
-internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward -internal-isystem /usr/local/include -internal-isystem /opt/rocm/hcc-1.0/lib/clang/6.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-deprecated-register -std=c++amp -fdeprecated-macro -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fcxx-exceptions -fexceptions -fdiagnostics-show-option -vectorize-loops -vectorize-slp -pthread -fhsa-ext -o /tmp/ParticleExchange-be7c03.s -x hc-host ParticleExchange.cxx -emit-llvm-bc
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ignoring nonexistent directory "/include"
+ignoring duplicate directory "/opt/rocm/hip/include"
+ignoring duplicate directory "/opt/rocm/hcc/include"
+ignoring duplicate directory "../dfft"
+ignoring duplicate directory "/usr/include"
+ignoring duplicate directory "."
+ignoring duplicate directory "/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0"
+ignoring duplicate directory "/usr/include"
+  as it is a non-system directory that duplicates a system directory
+#include "..." search starts here:
+#include <...> search starts here:
+ /opt/rocm/hcc/include
+ /opt/rocm/hip/include/hip/hcc_detail/cuda
+ /opt/rocm/hsa/include
+ /opt/rocm/profiler/CXLActivityLogger/include
+ /opt/rocm/hip/include
+ ../dfft
+ /usr/lib/openmpi/include
+ .
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward
+ /usr/local/include
+ /opt/rocm/hcc-1.0/lib/clang/6.0.0/include
+ /usr/include/x86_64-linux-gnu
+ /usr/include
+End of search list.
+ "/opt/rocm/hcc/bin/hc-host-assemble" /tmp/ParticleExchange-be7c03.s hip/ParticleExchange.o -D__HIPCC__ -Wno-deprecated-register -DHIP_VERSION_MAJOR=1 -DHIP_VERSION_MINOR=3 -DHIP_VERSION_PATCH=17385 -D__HIP_ARCH_GFX801__=1 -DPENCIL=1 -O3 -g -DRCB_UNTHREADED_BUILD -DUSE_SERIAL_COSMO -fopenmp -v -DID_64 -DPOSVEL_32 -DGRID_32 -DLONG_INTEGER
+HCC clang version 6.0.0  (based on HCC 1.0.17412-f590a25-821e6d8-64e7fc7 )
+Target: x86_64-unknown-linux-gnu
+Thread model: posix
+InstalledDir: /opt/rocm/hcc/bin
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6.0.0
+Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Candidate multilib: .;@m64
+Candidate multilib: 32;@m32
+Candidate multilib: x32;@mx32
+Selected multilib: .;@m64
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_CPU__=1 -D__HCC_CPU__=1 -triple x86_64-unknown-linux-gnu -emit-obj -disable-free -disable-llvm-verifier -discard-value-names -main-file-name ParticleExchange-be7c03.s.bc -mrelocation-model static -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debug-info-kind=limited -dwarf-version=4 -debugger-tuning=gdb -momit-leaf-frame-pointer -v -coverage-notes-file /hacc/src/halo_finder/hip/ParticleExchange.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -O3 -Wno-deprecated-register -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -o hip/ParticleExchange.o -x ir /tmp/ParticleExchange-be7c03.s.bc
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_HC__=1 -D__HCC_HC__=1 -D__GPU__=1 -D__KALMAR_ACCELERATOR__=1 -D__HCC_ACCELERATOR__=1 -famp-is-device -fno-builtin -fno-common -O2 -triple amdgcn--amdhsa-hcc -aux-triple x86_64-unknown-linux-gnu -S -disable-free -disable-llvm-verifier -main-file-name ParticleExchange.cxx -mrelocation-model static -mthread-model posix -mdisable-fp-elim -fmath-errno -no-integrated-as -mconstructor-aliases -v -coverage-notes-file /hacc/src/halo_finder/hip/ParticleExchange.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -D __HIPCC__ -I /opt/rocm/hcc/include -I /opt/rocm/hip/include/hip/hcc_detail/cuda -I /opt/rocm/hsa/include -I /opt/rocm/profiler/CXLActivityLogger/include -I /opt/rocm/hip/include -D HIP_VERSION_MAJOR=1 -D HIP_VERSION_MINOR=3 -D HIP_VERSION_PATCH=17385 -D __HIP_ARCH_GFX801__=1 -I ../dfft -D PENCIL=1 -D RCB_UNTHREADED_BUILD -D USE_SERIAL_COSMO -I /usr/lib/openmpi/include -I /opt/rocm/hip/include -I /opt/rocm/hcc-1.0/include -D ID_64 -D POSVEL_32 -D GRID_32 -D LONG_INTEGER -I . -I ../dfft -I/usr/include -I/usr/include -I. 
-internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/amdgcn--amdhsa-hcc/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward -internal-isystem /usr/local/include -internal-isystem /opt/rocm/hcc-1.0/lib/clang/6.0.0/include -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-deprecated-register -std=c++amp -fdeprecated-macro -fno-dwarf-directory-asm -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fcxx-exceptions -fexceptions -fdiagnostics-show-option -vectorize-slp -pthread -fhsa-ext -o /tmp/ParticleExchange-67ceac.s -x hc-kernel ParticleExchange.cxx -emit-llvm-bc
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/amdgcn--amdhsa-hcc/c++/5.4.0"
+ignoring nonexistent directory "/include"
+ignoring duplicate directory "/opt/rocm/hip/include"
+ignoring duplicate directory "/opt/rocm/hcc/include"
+ignoring duplicate directory "../dfft"
+ignoring duplicate directory "/usr/include"
+ignoring duplicate directory "."
+ignoring duplicate directory "/usr/include"
+  as it is a non-system directory that duplicates a system directory
+ignoring duplicate directory "/usr/local/include"
+ignoring duplicate directory "/opt/rocm/hcc-1.0/lib/clang/6.0.0/include"
+ignoring duplicate directory "/usr/include"
+#include "..." search starts here:
+#include <...> search starts here:
+ /opt/rocm/hcc/include
+ /opt/rocm/hip/include/hip/hcc_detail/cuda
+ /opt/rocm/hsa/include
+ /opt/rocm/profiler/CXLActivityLogger/include
+ /opt/rocm/hip/include
+ ../dfft
+ /usr/lib/openmpi/include
+ .
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward
+ /usr/local/include
+ /opt/rocm/hcc-1.0/lib/clang/6.0.0/include
+ /usr/include
+End of search list.
+ "/opt/rocm/hcc/bin/hc-kernel-assemble" /tmp/ParticleExchange-67ceac.s hip/ParticleExchange.o
+ar rv hip/libparticle.a hip/ParticleExchange.o
+r - hip/ParticleExchange.o
+make: Warning: Archive 'hip/libparticle.a' seems to have been created in deterministic mode. 'hip/ParticleExchange.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+make: Warning: Archive 'hip/libparticle.a' seems to have been created in deterministic mode. 'hip/InitialExchange.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+/opt/rocm/bin/hipcc -Xclang -pthread -I../dfft -DPENCIL=1 -O3 -g -DRCB_UNTHREADED_BUILD -DUSE_SERIAL_COSMO -fopenmp -v -I/usr/lib/openmpi/include -I/opt/rocm/hip/include -I/opt/rocm/hcc-1.0/include -DID_64 -DPOSVEL_32 -DGRID_32 -DLONG_INTEGER  -I. -I../dfft -c -o hip/InitialExchange.o InitialExchange.cxx
+HCC clang version 6.0.0  (based on HCC 1.0.17412-f590a25-821e6d8-64e7fc7 )
+Target: x86_64-unknown-linux-gnu
+Thread model: posix
+InstalledDir: /opt/rocm/hcc/bin
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6.0.0
+Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Candidate multilib: .;@m64
+Candidate multilib: 32;@m32
+Candidate multilib: x32;@mx32
+Selected multilib: .;@m64
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_HC__=1 -D__HCC_HC__=1 -D__KALMAR_CPU__=1 -D__HCC_CPU__=1 -triple x86_64-unknown-linux-gnu -S -disable-free -disable-llvm-verifier -main-file-name InitialExchange.cxx -mrelocation-model static -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debug-info-kind=limited -dwarf-version=4 -debugger-tuning=gdb -momit-leaf-frame-pointer -v -coverage-notes-file /hacc/src/halo_finder/hip/InitialExchange.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -D __HIPCC__ -I /opt/rocm/hcc/include -I /opt/rocm/hip/include/hip/hcc_detail/cuda -I /opt/rocm/hsa/include -I /opt/rocm/profiler/CXLActivityLogger/include -I /opt/rocm/hip/include -D HIP_VERSION_MAJOR=1 -D HIP_VERSION_MINOR=3 -D HIP_VERSION_PATCH=17385 -D __HIP_ARCH_GFX801__=1 -I ../dfft -D PENCIL=1 -D RCB_UNTHREADED_BUILD -D USE_SERIAL_COSMO -I /usr/lib/openmpi/include -I /opt/rocm/hip/include -I /opt/rocm/hcc-1.0/include -D ID_64 -D POSVEL_32 -D GRID_32 -D LONG_INTEGER -I . -I ../dfft -I/usr/include -I/usr/include -I. 
-internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward -internal-isystem /usr/local/include -internal-isystem /opt/rocm/hcc-1.0/lib/clang/6.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-deprecated-register -std=c++amp -fdeprecated-macro -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fcxx-exceptions -fexceptions -fdiagnostics-show-option -vectorize-loops -vectorize-slp -pthread -fhsa-ext -o /tmp/InitialExchange-0c446c.s -x hc-host InitialExchange.cxx -emit-llvm-bc
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ignoring nonexistent directory "/include"
+ignoring duplicate directory "/opt/rocm/hip/include"
+ignoring duplicate directory "/opt/rocm/hcc/include"
+ignoring duplicate directory "../dfft"
+ignoring duplicate directory "/usr/include"
+ignoring duplicate directory "."
+ignoring duplicate directory "/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0"
+ignoring duplicate directory "/usr/include"
+  as it is a non-system directory that duplicates a system directory
+#include "..." search starts here:
+#include <...> search starts here:
+ /opt/rocm/hcc/include
+ /opt/rocm/hip/include/hip/hcc_detail/cuda
+ /opt/rocm/hsa/include
+ /opt/rocm/profiler/CXLActivityLogger/include
+ /opt/rocm/hip/include
+ ../dfft
+ /usr/lib/openmpi/include
+ .
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward
+ /usr/local/include
+ /opt/rocm/hcc-1.0/lib/clang/6.0.0/include
+ /usr/include/x86_64-linux-gnu
+ /usr/include
+End of search list.
+ "/opt/rocm/hcc/bin/hc-host-assemble" /tmp/InitialExchange-0c446c.s hip/InitialExchange.o -D__HIPCC__ -Wno-deprecated-register -DHIP_VERSION_MAJOR=1 -DHIP_VERSION_MINOR=3 -DHIP_VERSION_PATCH=17385 -D__HIP_ARCH_GFX801__=1 -DPENCIL=1 -O3 -g -DRCB_UNTHREADED_BUILD -DUSE_SERIAL_COSMO -fopenmp -v -DID_64 -DPOSVEL_32 -DGRID_32 -DLONG_INTEGER
+HCC clang version 6.0.0  (based on HCC 1.0.17412-f590a25-821e6d8-64e7fc7 )
+Target: x86_64-unknown-linux-gnu
+Thread model: posix
+InstalledDir: /opt/rocm/hcc/bin
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6.0.0
+Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Candidate multilib: .;@m64
+Candidate multilib: 32;@m32
+Candidate multilib: x32;@mx32
+Selected multilib: .;@m64
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_CPU__=1 -D__HCC_CPU__=1 -triple x86_64-unknown-linux-gnu -emit-obj -disable-free -disable-llvm-verifier -discard-value-names -main-file-name InitialExchange-0c446c.s.bc -mrelocation-model static -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debug-info-kind=limited -dwarf-version=4 -debugger-tuning=gdb -momit-leaf-frame-pointer -v -coverage-notes-file /hacc/src/halo_finder/hip/InitialExchange.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -O3 -Wno-deprecated-register -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -o hip/InitialExchange.o -x ir /tmp/InitialExchange-0c446c.s.bc
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_HC__=1 -D__HCC_HC__=1 -D__GPU__=1 -D__KALMAR_ACCELERATOR__=1 -D__HCC_ACCELERATOR__=1 -famp-is-device -fno-builtin -fno-common -O2 -triple amdgcn--amdhsa-hcc -aux-triple x86_64-unknown-linux-gnu -S -disable-free -disable-llvm-verifier -main-file-name InitialExchange.cxx -mrelocation-model static -mthread-model posix -mdisable-fp-elim -fmath-errno -no-integrated-as -mconstructor-aliases -v -coverage-notes-file /hacc/src/halo_finder/hip/InitialExchange.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -D __HIPCC__ -I /opt/rocm/hcc/include -I /opt/rocm/hip/include/hip/hcc_detail/cuda -I /opt/rocm/hsa/include -I /opt/rocm/profiler/CXLActivityLogger/include -I /opt/rocm/hip/include -D HIP_VERSION_MAJOR=1 -D HIP_VERSION_MINOR=3 -D HIP_VERSION_PATCH=17385 -D __HIP_ARCH_GFX801__=1 -I ../dfft -D PENCIL=1 -D RCB_UNTHREADED_BUILD -D USE_SERIAL_COSMO -I /usr/lib/openmpi/include -I /opt/rocm/hip/include -I /opt/rocm/hcc-1.0/include -D ID_64 -D POSVEL_32 -D GRID_32 -D LONG_INTEGER -I . -I ../dfft -I/usr/include -I/usr/include -I. 
-internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/amdgcn--amdhsa-hcc/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward -internal-isystem /usr/local/include -internal-isystem /opt/rocm/hcc-1.0/lib/clang/6.0.0/include -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-deprecated-register -std=c++amp -fdeprecated-macro -fno-dwarf-directory-asm -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fcxx-exceptions -fexceptions -fdiagnostics-show-option -vectorize-slp -pthread -fhsa-ext -o /tmp/InitialExchange-5c0a35.s -x hc-kernel InitialExchange.cxx -emit-llvm-bc
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/amdgcn--amdhsa-hcc/c++/5.4.0"
+ignoring nonexistent directory "/include"
+ignoring duplicate directory "/opt/rocm/hip/include"
+ignoring duplicate directory "/opt/rocm/hcc/include"
+ignoring duplicate directory "../dfft"
+ignoring duplicate directory "/usr/include"
+ignoring duplicate directory "."
+ignoring duplicate directory "/usr/include"
+  as it is a non-system directory that duplicates a system directory
+ignoring duplicate directory "/usr/local/include"
+ignoring duplicate directory "/opt/rocm/hcc-1.0/lib/clang/6.0.0/include"
+ignoring duplicate directory "/usr/include"
+#include "..." search starts here:
+#include <...> search starts here:
+ /opt/rocm/hcc/include
+ /opt/rocm/hip/include/hip/hcc_detail/cuda
+ /opt/rocm/hsa/include
+ /opt/rocm/profiler/CXLActivityLogger/include
+ /opt/rocm/hip/include
+ ../dfft
+ /usr/lib/openmpi/include
+ .
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward
+ /usr/local/include
+ /opt/rocm/hcc-1.0/lib/clang/6.0.0/include
+ /usr/include
+End of search list.
+ "/opt/rocm/hcc/bin/hc-kernel-assemble" /tmp/InitialExchange-5c0a35.s hip/InitialExchange.o
+ar rv hip/libparticle.a hip/InitialExchange.o
+r - hip/InitialExchange.o
+make: Warning: Archive 'hip/libparticle.a' seems to have been created in deterministic mode. 'hip/InitialExchange.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+make: Warning: Archive 'hip/libparticle.a' seems to have been created in deterministic mode. 'hip/Message.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+/opt/rocm/bin/hipcc -Xclang -pthread -I../dfft -DPENCIL=1 -O3 -g -DRCB_UNTHREADED_BUILD -DUSE_SERIAL_COSMO -fopenmp -v -I/usr/lib/openmpi/include -I/opt/rocm/hip/include -I/opt/rocm/hcc-1.0/include -DID_64 -DPOSVEL_32 -DGRID_32 -DLONG_INTEGER  -I. -I../dfft -c -o hip/Message.o Message.cxx
+HCC clang version 6.0.0  (based on HCC 1.0.17412-f590a25-821e6d8-64e7fc7 )
+Target: x86_64-unknown-linux-gnu
+Thread model: posix
+InstalledDir: /opt/rocm/hcc/bin
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6.0.0
+Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Candidate multilib: .;@m64
+Candidate multilib: 32;@m32
+Candidate multilib: x32;@mx32
+Selected multilib: .;@m64
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_HC__=1 -D__HCC_HC__=1 -D__KALMAR_CPU__=1 -D__HCC_CPU__=1 -triple x86_64-unknown-linux-gnu -S -disable-free -disable-llvm-verifier -main-file-name Message.cxx -mrelocation-model static -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debug-info-kind=limited -dwarf-version=4 -debugger-tuning=gdb -momit-leaf-frame-pointer -v -coverage-notes-file /hacc/src/halo_finder/hip/Message.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -D __HIPCC__ -I /opt/rocm/hcc/include -I /opt/rocm/hip/include/hip/hcc_detail/cuda -I /opt/rocm/hsa/include -I /opt/rocm/profiler/CXLActivityLogger/include -I /opt/rocm/hip/include -D HIP_VERSION_MAJOR=1 -D HIP_VERSION_MINOR=3 -D HIP_VERSION_PATCH=17385 -D __HIP_ARCH_GFX801__=1 -I ../dfft -D PENCIL=1 -D RCB_UNTHREADED_BUILD -D USE_SERIAL_COSMO -I /usr/lib/openmpi/include -I /opt/rocm/hip/include -I /opt/rocm/hcc-1.0/include -D ID_64 -D POSVEL_32 -D GRID_32 -D LONG_INTEGER -I . -I ../dfft -I/usr/include -I/usr/include -I. 
-internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward -internal-isystem /usr/local/include -internal-isystem /opt/rocm/hcc-1.0/lib/clang/6.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-deprecated-register -std=c++amp -fdeprecated-macro -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fcxx-exceptions -fexceptions -fdiagnostics-show-option -vectorize-loops -vectorize-slp -pthread -fhsa-ext -o /tmp/Message-f5e802.s -x hc-host Message.cxx -emit-llvm-bc
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ignoring nonexistent directory "/include"
+ignoring duplicate directory "/opt/rocm/hip/include"
+ignoring duplicate directory "/opt/rocm/hcc/include"
+ignoring duplicate directory "../dfft"
+ignoring duplicate directory "/usr/include"
+ignoring duplicate directory "."
+ignoring duplicate directory "/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0"
+ignoring duplicate directory "/usr/include"
+  as it is a non-system directory that duplicates a system directory
+#include "..." search starts here:
+#include <...> search starts here:
+ /opt/rocm/hcc/include
+ /opt/rocm/hip/include/hip/hcc_detail/cuda
+ /opt/rocm/hsa/include
+ /opt/rocm/profiler/CXLActivityLogger/include
+ /opt/rocm/hip/include
+ ../dfft
+ /usr/lib/openmpi/include
+ .
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward
+ /usr/local/include
+ /opt/rocm/hcc-1.0/lib/clang/6.0.0/include
+ /usr/include/x86_64-linux-gnu
+ /usr/include
+End of search list.
+ "/opt/rocm/hcc/bin/hc-host-assemble" /tmp/Message-f5e802.s hip/Message.o -D__HIPCC__ -Wno-deprecated-register -DHIP_VERSION_MAJOR=1 -DHIP_VERSION_MINOR=3 -DHIP_VERSION_PATCH=17385 -D__HIP_ARCH_GFX801__=1 -DPENCIL=1 -O3 -g -DRCB_UNTHREADED_BUILD -DUSE_SERIAL_COSMO -fopenmp -v -DID_64 -DPOSVEL_32 -DGRID_32 -DLONG_INTEGER
+HCC clang version 6.0.0  (based on HCC 1.0.17412-f590a25-821e6d8-64e7fc7 )
+Target: x86_64-unknown-linux-gnu
+Thread model: posix
+InstalledDir: /opt/rocm/hcc/bin
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6.0.0
+Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Candidate multilib: .;@m64
+Candidate multilib: 32;@m32
+Candidate multilib: x32;@mx32
+Selected multilib: .;@m64
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_CPU__=1 -D__HCC_CPU__=1 -triple x86_64-unknown-linux-gnu -emit-obj -disable-free -disable-llvm-verifier -discard-value-names -main-file-name Message-f5e802.s.bc -mrelocation-model static -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debug-info-kind=limited -dwarf-version=4 -debugger-tuning=gdb -momit-leaf-frame-pointer -v -coverage-notes-file /hacc/src/halo_finder/hip/Message.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -O3 -Wno-deprecated-register -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -o hip/Message.o -x ir /tmp/Message-f5e802.s.bc
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_HC__=1 -D__HCC_HC__=1 -D__GPU__=1 -D__KALMAR_ACCELERATOR__=1 -D__HCC_ACCELERATOR__=1 -famp-is-device -fno-builtin -fno-common -O2 -triple amdgcn--amdhsa-hcc -aux-triple x86_64-unknown-linux-gnu -S -disable-free -disable-llvm-verifier -main-file-name Message.cxx -mrelocation-model static -mthread-model posix -mdisable-fp-elim -fmath-errno -no-integrated-as -mconstructor-aliases -v -coverage-notes-file /hacc/src/halo_finder/hip/Message.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -D __HIPCC__ -I /opt/rocm/hcc/include -I /opt/rocm/hip/include/hip/hcc_detail/cuda -I /opt/rocm/hsa/include -I /opt/rocm/profiler/CXLActivityLogger/include -I /opt/rocm/hip/include -D HIP_VERSION_MAJOR=1 -D HIP_VERSION_MINOR=3 -D HIP_VERSION_PATCH=17385 -D __HIP_ARCH_GFX801__=1 -I ../dfft -D PENCIL=1 -D RCB_UNTHREADED_BUILD -D USE_SERIAL_COSMO -I /usr/lib/openmpi/include -I /opt/rocm/hip/include -I /opt/rocm/hcc-1.0/include -D ID_64 -D POSVEL_32 -D GRID_32 -D LONG_INTEGER -I . -I ../dfft -I/usr/include -I/usr/include -I. -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/amdgcn--amdhsa-hcc/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward -internal-isystem /usr/local/include -internal-isystem /opt/rocm/hcc-1.0/lib/clang/6.0.0/include -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-deprecated-register -std=c++amp -fdeprecated-macro -fno-dwarf-directory-asm -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fcxx-exceptions -fexceptions -fdiagnostics-show-option -vectorize-slp -pthread -fhsa-ext -o /tmp/Message-7a1aa3.s -x hc-kernel Message.cxx -emit-llvm-bc
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/amdgcn--amdhsa-hcc/c++/5.4.0"
+ignoring nonexistent directory "/include"
+ignoring duplicate directory "/opt/rocm/hip/include"
+ignoring duplicate directory "/opt/rocm/hcc/include"
+ignoring duplicate directory "../dfft"
+ignoring duplicate directory "/usr/include"
+ignoring duplicate directory "."
+ignoring duplicate directory "/usr/include"
+  as it is a non-system directory that duplicates a system directory
+ignoring duplicate directory "/usr/local/include"
+ignoring duplicate directory "/opt/rocm/hcc-1.0/lib/clang/6.0.0/include"
+ignoring duplicate directory "/usr/include"
+#include "..." search starts here:
+#include <...> search starts here:
+ /opt/rocm/hcc/include
+ /opt/rocm/hip/include/hip/hcc_detail/cuda
+ /opt/rocm/hsa/include
+ /opt/rocm/profiler/CXLActivityLogger/include
+ /opt/rocm/hip/include
+ ../dfft
+ /usr/lib/openmpi/include
+ .
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward
+ /usr/local/include
+ /opt/rocm/hcc-1.0/lib/clang/6.0.0/include
+ /usr/include
+End of search list.
+ "/opt/rocm/hcc/bin/hc-kernel-assemble" /tmp/Message-7a1aa3.s hip/Message.o
+ar rv hip/libparticle.a hip/Message.o
+r - hip/Message.o
+make: Warning: Archive 'hip/libparticle.a' seems to have been created in deterministic mode. 'hip/Message.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+make: Warning: Archive 'hip/libBHForceTree.a' seems to have been created in deterministic mode. 'hip/ForceLaw.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+ar rv hip/libBHForceTree.a hip/ForceLaw.o
+r - hip/ForceLaw.o
+make: Warning: Archive 'hip/libBHForceTree.a' seems to have been created in deterministic mode. 'hip/ForceLaw.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+make: Warning: Archive 'hip/libBHForceTree.a' seems to have been created in deterministic mode. 'hip/BHForceTree.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+/opt/rocm/bin/hipcc -Xclang -pthread -I../dfft -DPENCIL=1 -O3 -g -DRCB_UNTHREADED_BUILD -DUSE_SERIAL_COSMO -fopenmp -v -I/usr/lib/openmpi/include -I/opt/rocm/hip/include -I/opt/rocm/hcc-1.0/include -DID_64 -DPOSVEL_32 -DGRID_32 -DLONG_INTEGER  -I. -I../dfft -c -o hip/BHForceTree.o BHForceTree.cxx
+HCC clang version 6.0.0  (based on HCC 1.0.17412-f590a25-821e6d8-64e7fc7 )
+Target: x86_64-unknown-linux-gnu
+Thread model: posix
+InstalledDir: /opt/rocm/hcc/bin
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6.0.0
+Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Candidate multilib: .;@m64
+Candidate multilib: 32;@m32
+Candidate multilib: x32;@mx32
+Selected multilib: .;@m64
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_HC__=1 -D__HCC_HC__=1 -D__KALMAR_CPU__=1 -D__HCC_CPU__=1 -triple x86_64-unknown-linux-gnu -S -disable-free -disable-llvm-verifier -main-file-name BHForceTree.cxx -mrelocation-model static -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debug-info-kind=limited -dwarf-version=4 -debugger-tuning=gdb -momit-leaf-frame-pointer -v -coverage-notes-file /hacc/src/halo_finder/hip/BHForceTree.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -D __HIPCC__ -I /opt/rocm/hcc/include -I /opt/rocm/hip/include/hip/hcc_detail/cuda -I /opt/rocm/hsa/include -I /opt/rocm/profiler/CXLActivityLogger/include -I /opt/rocm/hip/include -D HIP_VERSION_MAJOR=1 -D HIP_VERSION_MINOR=3 -D HIP_VERSION_PATCH=17385 -D __HIP_ARCH_GFX801__=1 -I ../dfft -D PENCIL=1 -D RCB_UNTHREADED_BUILD -D USE_SERIAL_COSMO -I /usr/lib/openmpi/include -I /opt/rocm/hip/include -I /opt/rocm/hcc-1.0/include -D ID_64 -D POSVEL_32 -D GRID_32 -D LONG_INTEGER -I . -I ../dfft -I/usr/include -I/usr/include -I. 
-internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward -internal-isystem /usr/local/include -internal-isystem /opt/rocm/hcc-1.0/lib/clang/6.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-deprecated-register -std=c++amp -fdeprecated-macro -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fcxx-exceptions -fexceptions -fdiagnostics-show-option -vectorize-loops -vectorize-slp -pthread -fhsa-ext -o /tmp/BHForceTree-de0bcd.s -x hc-host BHForceTree.cxx -emit-llvm-bc
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ignoring nonexistent directory "/include"
+ignoring duplicate directory "/opt/rocm/hip/include"
+ignoring duplicate directory "/opt/rocm/hcc/include"
+ignoring duplicate directory "../dfft"
+ignoring duplicate directory "/usr/include"
+ignoring duplicate directory "."
+ignoring duplicate directory "/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0"
+ignoring duplicate directory "/usr/include"
+  as it is a non-system directory that duplicates a system directory
+#include "..." search starts here:
+#include <...> search starts here:
+ /opt/rocm/hcc/include
+ /opt/rocm/hip/include/hip/hcc_detail/cuda
+ /opt/rocm/hsa/include
+ /opt/rocm/profiler/CXLActivityLogger/include
+ /opt/rocm/hip/include
+ ../dfft
+ /usr/lib/openmpi/include
+ .
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward
+ /usr/local/include
+ /opt/rocm/hcc-1.0/lib/clang/6.0.0/include
+ /usr/include/x86_64-linux-gnu
+ /usr/include
+End of search list.
+ "/opt/rocm/hcc/bin/hc-host-assemble" /tmp/BHForceTree-de0bcd.s hip/BHForceTree.o -D__HIPCC__ -Wno-deprecated-register -DHIP_VERSION_MAJOR=1 -DHIP_VERSION_MINOR=3 -DHIP_VERSION_PATCH=17385 -D__HIP_ARCH_GFX801__=1 -DPENCIL=1 -O3 -g -DRCB_UNTHREADED_BUILD -DUSE_SERIAL_COSMO -fopenmp -v -DID_64 -DPOSVEL_32 -DGRID_32 -DLONG_INTEGER
+HCC clang version 6.0.0  (based on HCC 1.0.17412-f590a25-821e6d8-64e7fc7 )
+Target: x86_64-unknown-linux-gnu
+Thread model: posix
+InstalledDir: /opt/rocm/hcc/bin
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6.0.0
+Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Candidate multilib: .;@m64
+Candidate multilib: 32;@m32
+Candidate multilib: x32;@mx32
+Selected multilib: .;@m64
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_CPU__=1 -D__HCC_CPU__=1 -triple x86_64-unknown-linux-gnu -emit-obj -disable-free -disable-llvm-verifier -discard-value-names -main-file-name BHForceTree-de0bcd.s.bc -mrelocation-model static -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debug-info-kind=limited -dwarf-version=4 -debugger-tuning=gdb -momit-leaf-frame-pointer -v -coverage-notes-file /hacc/src/halo_finder/hip/BHForceTree.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -O3 -Wno-deprecated-register -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -o hip/BHForceTree.o -x ir /tmp/BHForceTree-de0bcd.s.bc
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_HC__=1 -D__HCC_HC__=1 -D__GPU__=1 -D__KALMAR_ACCELERATOR__=1 -D__HCC_ACCELERATOR__=1 -famp-is-device -fno-builtin -fno-common -O2 -triple amdgcn--amdhsa-hcc -aux-triple x86_64-unknown-linux-gnu -S -disable-free -disable-llvm-verifier -main-file-name BHForceTree.cxx -mrelocation-model static -mthread-model posix -mdisable-fp-elim -fmath-errno -no-integrated-as -mconstructor-aliases -v -coverage-notes-file /hacc/src/halo_finder/hip/BHForceTree.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -D __HIPCC__ -I /opt/rocm/hcc/include -I /opt/rocm/hip/include/hip/hcc_detail/cuda -I /opt/rocm/hsa/include -I /opt/rocm/profiler/CXLActivityLogger/include -I /opt/rocm/hip/include -D HIP_VERSION_MAJOR=1 -D HIP_VERSION_MINOR=3 -D HIP_VERSION_PATCH=17385 -D __HIP_ARCH_GFX801__=1 -I ../dfft -D PENCIL=1 -D RCB_UNTHREADED_BUILD -D USE_SERIAL_COSMO -I /usr/lib/openmpi/include -I /opt/rocm/hip/include -I /opt/rocm/hcc-1.0/include -D ID_64 -D POSVEL_32 -D GRID_32 -D LONG_INTEGER -I . -I ../dfft -I/usr/include -I/usr/include -I. -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/amdgcn--amdhsa-hcc/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward -internal-isystem /usr/local/include -internal-isystem /opt/rocm/hcc-1.0/lib/clang/6.0.0/include -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-deprecated-register -std=c++amp -fdeprecated-macro -fno-dwarf-directory-asm -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fcxx-exceptions -fexceptions -fdiagnostics-show-option -vectorize-slp -pthread -fhsa-ext -o /tmp/BHForceTree-df8a38.s -x hc-kernel BHForceTree.cxx -emit-llvm-bc
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/amdgcn--amdhsa-hcc/c++/5.4.0"
+ignoring nonexistent directory "/include"
+ignoring duplicate directory "/opt/rocm/hip/include"
+ignoring duplicate directory "/opt/rocm/hcc/include"
+ignoring duplicate directory "../dfft"
+ignoring duplicate directory "/usr/include"
+ignoring duplicate directory "."
+ignoring duplicate directory "/usr/include"
+  as it is a non-system directory that duplicates a system directory
+ignoring duplicate directory "/usr/local/include"
+ignoring duplicate directory "/opt/rocm/hcc-1.0/lib/clang/6.0.0/include"
+ignoring duplicate directory "/usr/include"
+#include "..." search starts here:
+#include <...> search starts here:
+ /opt/rocm/hcc/include
+ /opt/rocm/hip/include/hip/hcc_detail/cuda
+ /opt/rocm/hsa/include
+ /opt/rocm/profiler/CXLActivityLogger/include
+ /opt/rocm/hip/include
+ ../dfft
+ /usr/lib/openmpi/include
+ .
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward
+ /usr/local/include
+ /opt/rocm/hcc-1.0/lib/clang/6.0.0/include
+ /usr/include
+End of search list.
+ "/opt/rocm/hcc/bin/hc-kernel-assemble" /tmp/BHForceTree-df8a38.s hip/BHForceTree.o
+ar rv hip/libBHForceTree.a hip/BHForceTree.o
+r - hip/BHForceTree.o
+make: Warning: Archive 'hip/libBHForceTree.a' seems to have been created in deterministic mode. 'hip/BHForceTree.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+make: Warning: Archive 'hip/libBHForceTree.a' seems to have been created in deterministic mode. 'hip/RCOForceTree.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+/opt/rocm/bin/hipcc -Xclang -pthread -I../dfft -DPENCIL=1 -O3 -g -DRCB_UNTHREADED_BUILD -DUSE_SERIAL_COSMO -fopenmp -v -I/usr/lib/openmpi/include -I/opt/rocm/hip/include -I/opt/rocm/hcc-1.0/include -DID_64 -DPOSVEL_32 -DGRID_32 -DLONG_INTEGER  -I. -I../dfft -c -o hip/RCOForceTree.o RCOForceTree.cxx
+HCC clang version 6.0.0  (based on HCC 1.0.17412-f590a25-821e6d8-64e7fc7 )
+Target: x86_64-unknown-linux-gnu
+Thread model: posix
+InstalledDir: /opt/rocm/hcc/bin
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6.0.0
+Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Candidate multilib: .;@m64
+Candidate multilib: 32;@m32
+Candidate multilib: x32;@mx32
+Selected multilib: .;@m64
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_HC__=1 -D__HCC_HC__=1 -D__KALMAR_CPU__=1 -D__HCC_CPU__=1 -triple x86_64-unknown-linux-gnu -S -disable-free -disable-llvm-verifier -main-file-name RCOForceTree.cxx -mrelocation-model static -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debug-info-kind=limited -dwarf-version=4 -debugger-tuning=gdb -momit-leaf-frame-pointer -v -coverage-notes-file /hacc/src/halo_finder/hip/RCOForceTree.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -D __HIPCC__ -I /opt/rocm/hcc/include -I /opt/rocm/hip/include/hip/hcc_detail/cuda -I /opt/rocm/hsa/include -I /opt/rocm/profiler/CXLActivityLogger/include -I /opt/rocm/hip/include -D HIP_VERSION_MAJOR=1 -D HIP_VERSION_MINOR=3 -D HIP_VERSION_PATCH=17385 -D __HIP_ARCH_GFX801__=1 -I ../dfft -D PENCIL=1 -D RCB_UNTHREADED_BUILD -D USE_SERIAL_COSMO -I /usr/lib/openmpi/include -I /opt/rocm/hip/include -I /opt/rocm/hcc-1.0/include -D ID_64 -D POSVEL_32 -D GRID_32 -D LONG_INTEGER -I . -I ../dfft -I/usr/include -I/usr/include -I. 
-internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward -internal-isystem /usr/local/include -internal-isystem /opt/rocm/hcc-1.0/lib/clang/6.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-deprecated-register -std=c++amp -fdeprecated-macro -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fcxx-exceptions -fexceptions -fdiagnostics-show-option -vectorize-loops -vectorize-slp -pthread -fhsa-ext -o /tmp/RCOForceTree-133c98.s -x hc-host RCOForceTree.cxx -emit-llvm-bc
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ignoring nonexistent directory "/include"
+ignoring duplicate directory "/opt/rocm/hip/include"
+ignoring duplicate directory "/opt/rocm/hcc/include"
+ignoring duplicate directory "../dfft"
+ignoring duplicate directory "/usr/include"
+ignoring duplicate directory "."
+ignoring duplicate directory "/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0"
+ignoring duplicate directory "/usr/include"
+  as it is a non-system directory that duplicates a system directory
+#include "..." search starts here:
+#include <...> search starts here:
+ /opt/rocm/hcc/include
+ /opt/rocm/hip/include/hip/hcc_detail/cuda
+ /opt/rocm/hsa/include
+ /opt/rocm/profiler/CXLActivityLogger/include
+ /opt/rocm/hip/include
+ ../dfft
+ /usr/lib/openmpi/include
+ .
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward
+ /usr/local/include
+ /opt/rocm/hcc-1.0/lib/clang/6.0.0/include
+ /usr/include/x86_64-linux-gnu
+ /usr/include
+End of search list.
+In file included from RCOForceTree.cxx:73:
+./Timings.h:80:17: warning: using directive refers to implicitly-defined namespace 'std'
+using namespace std;
+                ^
+1 warning generated.
+ "/opt/rocm/hcc/bin/hc-host-assemble" /tmp/RCOForceTree-133c98.s hip/RCOForceTree.o -D__HIPCC__ -Wno-deprecated-register -DHIP_VERSION_MAJOR=1 -DHIP_VERSION_MINOR=3 -DHIP_VERSION_PATCH=17385 -D__HIP_ARCH_GFX801__=1 -DPENCIL=1 -O3 -g -DRCB_UNTHREADED_BUILD -DUSE_SERIAL_COSMO -fopenmp -v -DID_64 -DPOSVEL_32 -DGRID_32 -DLONG_INTEGER
+HCC clang version 6.0.0  (based on HCC 1.0.17412-f590a25-821e6d8-64e7fc7 )
+Target: x86_64-unknown-linux-gnu
+Thread model: posix
+InstalledDir: /opt/rocm/hcc/bin
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6.0.0
+Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Candidate multilib: .;@m64
+Candidate multilib: 32;@m32
+Candidate multilib: x32;@mx32
+Selected multilib: .;@m64
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_CPU__=1 -D__HCC_CPU__=1 -triple x86_64-unknown-linux-gnu -emit-obj -disable-free -disable-llvm-verifier -discard-value-names -main-file-name RCOForceTree-133c98.s.bc -mrelocation-model static -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debug-info-kind=limited -dwarf-version=4 -debugger-tuning=gdb -momit-leaf-frame-pointer -v -coverage-notes-file /hacc/src/halo_finder/hip/RCOForceTree.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -O3 -Wno-deprecated-register -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -o hip/RCOForceTree.o -x ir /tmp/RCOForceTree-133c98.s.bc
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_HC__=1 -D__HCC_HC__=1 -D__GPU__=1 -D__KALMAR_ACCELERATOR__=1 -D__HCC_ACCELERATOR__=1 -famp-is-device -fno-builtin -fno-common -O2 -triple amdgcn--amdhsa-hcc -aux-triple x86_64-unknown-linux-gnu -S -disable-free -disable-llvm-verifier -main-file-name RCOForceTree.cxx -mrelocation-model static -mthread-model posix -mdisable-fp-elim -fmath-errno -no-integrated-as -mconstructor-aliases -v -coverage-notes-file /hacc/src/halo_finder/hip/RCOForceTree.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -D __HIPCC__ -I /opt/rocm/hcc/include -I /opt/rocm/hip/include/hip/hcc_detail/cuda -I /opt/rocm/hsa/include -I /opt/rocm/profiler/CXLActivityLogger/include -I /opt/rocm/hip/include -D HIP_VERSION_MAJOR=1 -D HIP_VERSION_MINOR=3 -D HIP_VERSION_PATCH=17385 -D __HIP_ARCH_GFX801__=1 -I ../dfft -D PENCIL=1 -D RCB_UNTHREADED_BUILD -D USE_SERIAL_COSMO -I /usr/lib/openmpi/include -I /opt/rocm/hip/include -I /opt/rocm/hcc-1.0/include -D ID_64 -D POSVEL_32 -D GRID_32 -D LONG_INTEGER -I . -I ../dfft -I/usr/include -I/usr/include -I. -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/amdgcn--amdhsa-hcc/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward -internal-isystem /usr/local/include -internal-isystem /opt/rocm/hcc-1.0/lib/clang/6.0.0/include -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-deprecated-register -std=c++amp -fdeprecated-macro -fno-dwarf-directory-asm -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fcxx-exceptions -fexceptions -fdiagnostics-show-option -vectorize-slp -pthread -fhsa-ext -o /tmp/RCOForceTree-92bb17.s -x hc-kernel RCOForceTree.cxx -emit-llvm-bc
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/amdgcn--amdhsa-hcc/c++/5.4.0"
+ignoring nonexistent directory "/include"
+ignoring duplicate directory "/opt/rocm/hip/include"
+ignoring duplicate directory "/opt/rocm/hcc/include"
+ignoring duplicate directory "../dfft"
+ignoring duplicate directory "/usr/include"
+ignoring duplicate directory "."
+ignoring duplicate directory "/usr/include"
+  as it is a non-system directory that duplicates a system directory
+ignoring duplicate directory "/usr/local/include"
+ignoring duplicate directory "/opt/rocm/hcc-1.0/lib/clang/6.0.0/include"
+ignoring duplicate directory "/usr/include"
+#include "..." search starts here:
+#include <...> search starts here:
+ /opt/rocm/hcc/include
+ /opt/rocm/hip/include/hip/hcc_detail/cuda
+ /opt/rocm/hsa/include
+ /opt/rocm/profiler/CXLActivityLogger/include
+ /opt/rocm/hip/include
+ ../dfft
+ /usr/lib/openmpi/include
+ .
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward
+ /usr/local/include
+ /opt/rocm/hcc-1.0/lib/clang/6.0.0/include
+ /usr/include
+End of search list.
+In file included from RCOForceTree.cxx:73:
+./Timings.h:80:17: warning: using directive refers to implicitly-defined namespace 'std'
+using namespace std;
+                ^
+1 warning generated.
+ "/opt/rocm/hcc/bin/hc-kernel-assemble" /tmp/RCOForceTree-92bb17.s hip/RCOForceTree.o
+ar rv hip/libBHForceTree.a hip/RCOForceTree.o
+r - hip/RCOForceTree.o
+make: Warning: Archive 'hip/libBHForceTree.a' seems to have been created in deterministic mode. 'hip/RCOForceTree.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+make: Warning: Archive 'hip/libBHForceTree.a' seems to have been created in deterministic mode. 'hip/RCBForceTree.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+/opt/rocm/bin/hipcc -Xclang -pthread -I../dfft -DPENCIL=1 -O3 -g -DRCB_UNTHREADED_BUILD -DUSE_SERIAL_COSMO -fopenmp -v -I/usr/lib/openmpi/include -I/opt/rocm/hip/include -I/opt/rocm/hcc-1.0/include -DID_64 -DPOSVEL_32 -DGRID_32 -DLONG_INTEGER  -I. -I../dfft -c -o hip/RCBForceTree.o RCBForceTree.cxx
+HCC clang version 6.0.0  (based on HCC 1.0.17412-f590a25-821e6d8-64e7fc7 )
+Target: x86_64-unknown-linux-gnu
+Thread model: posix
+InstalledDir: /opt/rocm/hcc/bin
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6.0.0
+Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Candidate multilib: .;@m64
+Candidate multilib: 32;@m32
+Candidate multilib: x32;@mx32
+Selected multilib: .;@m64
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_HC__=1 -D__HCC_HC__=1 -D__KALMAR_CPU__=1 -D__HCC_CPU__=1 -triple x86_64-unknown-linux-gnu -S -disable-free -disable-llvm-verifier -main-file-name RCBForceTree.cxx -mrelocation-model static -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debug-info-kind=limited -dwarf-version=4 -debugger-tuning=gdb -momit-leaf-frame-pointer -v -coverage-notes-file /hacc/src/halo_finder/hip/RCBForceTree.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -D __HIPCC__ -I /opt/rocm/hcc/include -I /opt/rocm/hip/include/hip/hcc_detail/cuda -I /opt/rocm/hsa/include -I /opt/rocm/profiler/CXLActivityLogger/include -I /opt/rocm/hip/include -D HIP_VERSION_MAJOR=1 -D HIP_VERSION_MINOR=3 -D HIP_VERSION_PATCH=17385 -D __HIP_ARCH_GFX801__=1 -I ../dfft -D PENCIL=1 -D RCB_UNTHREADED_BUILD -D USE_SERIAL_COSMO -I /usr/lib/openmpi/include -I /opt/rocm/hip/include -I /opt/rocm/hcc-1.0/include -D ID_64 -D POSVEL_32 -D GRID_32 -D LONG_INTEGER -I . -I ../dfft -I/usr/include -I/usr/include -I. 
-internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward -internal-isystem /usr/local/include -internal-isystem /opt/rocm/hcc-1.0/lib/clang/6.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-deprecated-register -std=c++amp -fdeprecated-macro -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fcxx-exceptions -fexceptions -fdiagnostics-show-option -vectorize-loops -vectorize-slp -pthread -fhsa-ext -o /tmp/RCBForceTree-1b5997.s -x hc-host RCBForceTree.cxx -emit-llvm-bc
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ignoring nonexistent directory "/include"
+ignoring duplicate directory "/opt/rocm/hip/include"
+ignoring duplicate directory "/opt/rocm/hcc/include"
+ignoring duplicate directory "../dfft"
+ignoring duplicate directory "/usr/include"
+ignoring duplicate directory "."
+ignoring duplicate directory "/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0"
+ignoring duplicate directory "/usr/include"
+  as it is a non-system directory that duplicates a system directory
+#include "..." search starts here:
+#include <...> search starts here:
+ /opt/rocm/hcc/include
+ /opt/rocm/hip/include/hip/hcc_detail/cuda
+ /opt/rocm/hsa/include
+ /opt/rocm/profiler/CXLActivityLogger/include
+ /opt/rocm/hip/include
+ ../dfft
+ /usr/lib/openmpi/include
+ .
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward
+ /usr/local/include
+ /opt/rocm/hcc-1.0/lib/clang/6.0.0/include
+ /usr/include/x86_64-linux-gnu
+ /usr/include
+End of search list.
+ "/opt/rocm/hcc/bin/hc-host-assemble" /tmp/RCBForceTree-1b5997.s hip/RCBForceTree.o -D__HIPCC__ -Wno-deprecated-register -DHIP_VERSION_MAJOR=1 -DHIP_VERSION_MINOR=3 -DHIP_VERSION_PATCH=17385 -D__HIP_ARCH_GFX801__=1 -DPENCIL=1 -O3 -g -DRCB_UNTHREADED_BUILD -DUSE_SERIAL_COSMO -fopenmp -v -DID_64 -DPOSVEL_32 -DGRID_32 -DLONG_INTEGER
+HCC clang version 6.0.0  (based on HCC 1.0.17412-f590a25-821e6d8-64e7fc7 )
+Target: x86_64-unknown-linux-gnu
+Thread model: posix
+InstalledDir: /opt/rocm/hcc/bin
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6.0.0
+Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Candidate multilib: .;@m64
+Candidate multilib: 32;@m32
+Candidate multilib: x32;@mx32
+Selected multilib: .;@m64
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_CPU__=1 -D__HCC_CPU__=1 -triple x86_64-unknown-linux-gnu -emit-obj -disable-free -disable-llvm-verifier -discard-value-names -main-file-name RCBForceTree-1b5997.s.bc -mrelocation-model static -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debug-info-kind=limited -dwarf-version=4 -debugger-tuning=gdb -momit-leaf-frame-pointer -v -coverage-notes-file /hacc/src/halo_finder/hip/RCBForceTree.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -O3 -Wno-deprecated-register -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -o hip/RCBForceTree.o -x ir /tmp/RCBForceTree-1b5997.s.bc
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_HC__=1 -D__HCC_HC__=1 -D__GPU__=1 -D__KALMAR_ACCELERATOR__=1 -D__HCC_ACCELERATOR__=1 -famp-is-device -fno-builtin -fno-common -O2 -triple amdgcn--amdhsa-hcc -aux-triple x86_64-unknown-linux-gnu -S -disable-free -disable-llvm-verifier -main-file-name RCBForceTree.cxx -mrelocation-model static -mthread-model posix -mdisable-fp-elim -fmath-errno -no-integrated-as -mconstructor-aliases -v -coverage-notes-file /hacc/src/halo_finder/hip/RCBForceTree.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -D __HIPCC__ -I /opt/rocm/hcc/include -I /opt/rocm/hip/include/hip/hcc_detail/cuda -I /opt/rocm/hsa/include -I /opt/rocm/profiler/CXLActivityLogger/include -I /opt/rocm/hip/include -D HIP_VERSION_MAJOR=1 -D HIP_VERSION_MINOR=3 -D HIP_VERSION_PATCH=17385 -D __HIP_ARCH_GFX801__=1 -I ../dfft -D PENCIL=1 -D RCB_UNTHREADED_BUILD -D USE_SERIAL_COSMO -I /usr/lib/openmpi/include -I /opt/rocm/hip/include -I /opt/rocm/hcc-1.0/include -D ID_64 -D POSVEL_32 -D GRID_32 -D LONG_INTEGER -I . -I ../dfft -I/usr/include -I/usr/include -I. -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/amdgcn--amdhsa-hcc/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward -internal-isystem /usr/local/include -internal-isystem /opt/rocm/hcc-1.0/lib/clang/6.0.0/include -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-deprecated-register -std=c++amp -fdeprecated-macro -fno-dwarf-directory-asm -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fcxx-exceptions -fexceptions -fdiagnostics-show-option -vectorize-slp -pthread -fhsa-ext -o /tmp/RCBForceTree-35501f.s -x hc-kernel RCBForceTree.cxx -emit-llvm-bc
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/amdgcn--amdhsa-hcc/c++/5.4.0"
+ignoring nonexistent directory "/include"
+ignoring duplicate directory "/opt/rocm/hip/include"
+ignoring duplicate directory "/opt/rocm/hcc/include"
+ignoring duplicate directory "../dfft"
+ignoring duplicate directory "/usr/include"
+ignoring duplicate directory "."
+ignoring duplicate directory "/usr/include"
+  as it is a non-system directory that duplicates a system directory
+ignoring duplicate directory "/usr/local/include"
+ignoring duplicate directory "/opt/rocm/hcc-1.0/lib/clang/6.0.0/include"
+ignoring duplicate directory "/usr/include"
+#include "..." search starts here:
+#include <...> search starts here:
+ /opt/rocm/hcc/include
+ /opt/rocm/hip/include/hip/hcc_detail/cuda
+ /opt/rocm/hsa/include
+ /opt/rocm/profiler/CXLActivityLogger/include
+ /opt/rocm/hip/include
+ ../dfft
+ /usr/lib/openmpi/include
+ .
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward
+ /usr/local/include
+ /opt/rocm/hcc-1.0/lib/clang/6.0.0/include
+ /usr/include
+End of search list.
+ "/opt/rocm/hcc/bin/hc-kernel-assemble" /tmp/RCBForceTree-35501f.s hip/RCBForceTree.o
+ar rv hip/libBHForceTree.a hip/RCBForceTree.o
+r - hip/RCBForceTree.o
+make: Warning: Archive 'hip/libBHForceTree.a' seems to have been created in deterministic mode. 'hip/RCBForceTree.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+make: Warning: Archive 'hip/libBHForceTree.a' seems to have been created in deterministic mode. 'hip/BGQStep16.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+/opt/rocm/bin/hcc -x c -Xclang -std=c99 -Xclang -pthread -O3 -g -DRCB_UNTHREADED_BUILD -DUSE_SERIAL_COSMO -fopenmp -v -I/usr/lib/openmpi/include -I/opt/rocm/hip/include -I/opt/rocm/hcc-1.0/include -I../dfft -c -o hip/BGQStep16.o BGQStep16.c
+HCC clang version 6.0.0  (based on HCC 1.0.17412-f590a25-821e6d8-64e7fc7 )
+Target: x86_64-unknown-linux-gnu
+Thread model: posix
+InstalledDir: /opt/rocm/bin
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6.0.0
+Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Candidate multilib: .;@m64
+Candidate multilib: 32;@m32
+Candidate multilib: x32;@mx32
+Selected multilib: .;@m64
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_CPU__=1 -D__HCC_CPU__=1 -triple x86_64-unknown-linux-gnu -emit-obj -disable-free -disable-llvm-verifier -discard-value-names -main-file-name BGQStep16.c -mrelocation-model static -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debug-info-kind=limited -dwarf-version=4 -debugger-tuning=gdb -momit-leaf-frame-pointer -v -coverage-notes-file /hacc/src/halo_finder/hip/BGQStep16.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -D RCB_UNTHREADED_BUILD -D USE_SERIAL_COSMO -I /usr/lib/openmpi/include -I /opt/rocm/hip/include -I /opt/rocm/hcc-1.0/include -I ../dfft -I/usr/include -I/usr/include -I. -internal-isystem /usr/local/include -internal-isystem /opt/rocm/hcc-1.0/lib/clang/6.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -std=c99 -pthread -o hip/BGQStep16.o -x c BGQStep16.c
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ignoring nonexistent directory "/include"
+ignoring duplicate directory "/usr/include"
+ignoring duplicate directory "/usr/include"
+  as it is a non-system directory that duplicates a system directory
+#include "..." search starts here:
+#include <...> search starts here:
+ /usr/lib/openmpi/include
+ /opt/rocm/hip/include
+ /opt/rocm/hcc-1.0/include
+ ../dfft
+ .
+ /usr/local/include
+ /opt/rocm/hcc-1.0/lib/clang/6.0.0/include
+ /usr/include/x86_64-linux-gnu
+ /usr/include
+End of search list.
+ar rv hip/libBHForceTree.a hip/BGQStep16.o
+r - hip/BGQStep16.o
+make: Warning: Archive 'hip/libBHForceTree.a' seems to have been created in deterministic mode. 'hip/BGQStep16.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+make: Warning: Archive 'hip/libBHForceTree.a' seems to have been created in deterministic mode. 'hip/BGQCM.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+/opt/rocm/bin/hcc -x c -Xclang -std=c99 -Xclang -pthread -O3 -g -DRCB_UNTHREADED_BUILD -DUSE_SERIAL_COSMO -fopenmp -v -I/usr/lib/openmpi/include -I/opt/rocm/hip/include -I/opt/rocm/hcc-1.0/include -I../dfft -c -o hip/BGQCM.o BGQCM.c
+HCC clang version 6.0.0  (based on HCC 1.0.17412-f590a25-821e6d8-64e7fc7 )
+Target: x86_64-unknown-linux-gnu
+Thread model: posix
+InstalledDir: /opt/rocm/bin
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6.0.0
+Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Candidate multilib: .;@m64
+Candidate multilib: 32;@m32
+Candidate multilib: x32;@mx32
+Selected multilib: .;@m64
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_CPU__=1 -D__HCC_CPU__=1 -triple x86_64-unknown-linux-gnu -emit-obj -disable-free -disable-llvm-verifier -discard-value-names -main-file-name BGQCM.c -mrelocation-model static -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debug-info-kind=limited -dwarf-version=4 -debugger-tuning=gdb -momit-leaf-frame-pointer -v -coverage-notes-file /hacc/src/halo_finder/hip/BGQCM.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -D RCB_UNTHREADED_BUILD -D USE_SERIAL_COSMO -I /usr/lib/openmpi/include -I /opt/rocm/hip/include -I /opt/rocm/hcc-1.0/include -I ../dfft -I/usr/include -I/usr/include -I. -internal-isystem /usr/local/include -internal-isystem /opt/rocm/hcc-1.0/lib/clang/6.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -std=c99 -pthread -o hip/BGQCM.o -x c BGQCM.c
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ignoring nonexistent directory "/include"
+ignoring duplicate directory "/usr/include"
+ignoring duplicate directory "/usr/include"
+  as it is a non-system directory that duplicates a system directory
+#include "..." search starts here:
+#include <...> search starts here:
+ /usr/lib/openmpi/include
+ /opt/rocm/hip/include
+ /opt/rocm/hcc-1.0/include
+ ../dfft
+ .
+ /usr/local/include
+ /opt/rocm/hcc-1.0/lib/clang/6.0.0/include
+ /usr/include/x86_64-linux-gnu
+ /usr/include
+End of search list.
+ar rv hip/libBHForceTree.a hip/BGQCM.o
+r - hip/BGQCM.o
+make: Warning: Archive 'hip/libBHForceTree.a' seems to have been created in deterministic mode. 'hip/BGQCM.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+make: Warning: Archive 'hip/libBHForceTree.a' seems to have been created in deterministic mode. 'BGQStep16.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+cc -O3 -g -DRCB_UNTHREADED_BUILD -DUSE_SERIAL_COSMO -fopenmp -v -I/usr/lib/openmpi/include -I/opt/rocm/hip/include -I/opt/rocm/hcc-1.0/include -I../dfft   -c -o BGQStep16.o BGQStep16.c
+Using built-in specs.
+COLLECT_GCC=cc
+Target: x86_64-linux-gnu
+Configured with: ../src/configure -v --with-pkgversion='Ubuntu 5.4.0-6ubuntu1~16.04.12' --with-bugurl=file:///usr/share/doc/gcc-5/README.Bugs --enable-languages=c,ada,c++,java,go,d,fortran,objc,obj-c++ --prefix=/usr --program-suffix=-5 --enable-shared --enable-linker-build-id --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-gnu-unique-object --disable-vtable-verify --enable-libmpx --enable-plugin --with-system-zlib --disable-browser-plugin --enable-java-awt=gtk --enable-gtk-cairo --with-java-home=/usr/lib/jvm/java-1.5.0-gcj-5-amd64/jre --enable-java-home --with-jvm-root-dir=/usr/lib/jvm/java-1.5.0-gcj-5-amd64 --with-jvm-jar-dir=/usr/lib/jvm-exports/java-1.5.0-gcj-5-amd64 --with-arch-directory=amd64 --with-ecj-jar=/usr/share/java/eclipse-ecj.jar --enable-objc-gc --enable-multiarch --disable-werror --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu
+Thread model: posix
+gcc version 5.4.0 20160609 (Ubuntu 5.4.0-6ubuntu1~16.04.12) 
+COLLECT_GCC_OPTIONS='-O3' '-g' '-D' 'RCB_UNTHREADED_BUILD' '-D' 'USE_SERIAL_COSMO' '-fopenmp' '-v' '-I' '/usr/lib/openmpi/include' '-I' '/opt/rocm/hip/include' '-I' '/opt/rocm/hcc-1.0/include' '-I' '../dfft' '-c' '-o' 'BGQStep16.o' '-mtune=generic' '-march=x86-64' '-pthread'
+ /usr/lib/gcc/x86_64-linux-gnu/5/cc1 -quiet -v -I /usr/lib/openmpi/include -I /opt/rocm/hip/include -I /opt/rocm/hcc-1.0/include -I ../dfft -imultiarch x86_64-linux-gnu -D_REENTRANT -D RCB_UNTHREADED_BUILD -D USE_SERIAL_COSMO BGQStep16.c -quiet -dumpbase BGQStep16.c -mtune=generic -march=x86-64 -auxbase-strip BGQStep16.o -g -O3 -version -fopenmp -fstack-protector-strong -Wformat -Wformat-security -o /tmp/cc31MOZG.s
+GNU C11 (Ubuntu 5.4.0-6ubuntu1~16.04.12) version 5.4.0 20160609 (x86_64-linux-gnu)
+	compiled by GNU C version 5.4.0 20160609, GMP version 6.1.0, MPFR version 3.1.4, MPC version 1.0.3
+GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
+ignoring nonexistent directory "/usr/local/include/x86_64-linux-gnu"
+ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/5/../../../../x86_64-linux-gnu/include"
+ignoring duplicate directory "/usr/include"
+  as it is a non-system directory that duplicates a system directory
+ignoring duplicate directory "/usr/include"
+  as it is a non-system directory that duplicates a system directory
+#include "..." search starts here:
+#include <...> search starts here:
+ /usr/lib/openmpi/include
+ /opt/rocm/hip/include
+ /opt/rocm/hcc-1.0/include
+ ../dfft
+ .
+ /usr/lib/gcc/x86_64-linux-gnu/5/include
+ /usr/local/include
+ /usr/lib/gcc/x86_64-linux-gnu/5/include-fixed
+ /usr/include/x86_64-linux-gnu
+ /usr/include
+End of search list.
+GNU C11 (Ubuntu 5.4.0-6ubuntu1~16.04.12) version 5.4.0 20160609 (x86_64-linux-gnu)
+	compiled by GNU C version 5.4.0 20160609, GMP version 6.1.0, MPFR version 3.1.4, MPC version 1.0.3
+GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
+Compiler executable checksum: 8087146d2ee737d238113fb57fabb1f2
+COLLECT_GCC_OPTIONS='-O3' '-g' '-D' 'RCB_UNTHREADED_BUILD' '-D' 'USE_SERIAL_COSMO' '-fopenmp' '-v' '-I' '/usr/lib/openmpi/include' '-I' '/opt/rocm/hip/include' '-I' '/opt/rocm/hcc-1.0/include' '-I' '../dfft' '-c' '-o' 'BGQStep16.o' '-mtune=generic' '-march=x86-64' '-pthread'
+ as -v -I /usr/lib/openmpi/include -I /opt/rocm/hip/include -I /opt/rocm/hcc-1.0/include -I ../dfft --64 -o BGQStep16.o /tmp/cc31MOZG.s
+GNU assembler version 2.26.1 (x86_64-linux-gnu) using BFD version (GNU Binutils for Ubuntu) 2.26.1
+COMPILER_PATH=/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/
+LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/5/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/5/../../../:/lib/:/usr/lib/
+COLLECT_GCC_OPTIONS='-O3' '-g' '-D' 'RCB_UNTHREADED_BUILD' '-D' 'USE_SERIAL_COSMO' '-fopenmp' '-v' '-I' '/usr/lib/openmpi/include' '-I' '/opt/rocm/hip/include' '-I' '/opt/rocm/hcc-1.0/include' '-I' '../dfft' '-c' '-o' 'BGQStep16.o' '-mtune=generic' '-march=x86-64' '-pthread'
+ar rv hip/libBHForceTree.a BGQStep16.o
+r - BGQStep16.o
+make: Warning: Archive 'hip/libBHForceTree.a' seems to have been created in deterministic mode. 'BGQStep16.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+make: Warning: Archive 'hip/libBHForceTree.a' seems to have been created in deterministic mode. 'BGQCM.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+cc -O3 -g -DRCB_UNTHREADED_BUILD -DUSE_SERIAL_COSMO -fopenmp -v -I/usr/lib/openmpi/include -I/opt/rocm/hip/include -I/opt/rocm/hcc-1.0/include -I../dfft   -c -o BGQCM.o BGQCM.c
+Using built-in specs.
+COLLECT_GCC=cc
+Target: x86_64-linux-gnu
+Configured with: ../src/configure -v --with-pkgversion='Ubuntu 5.4.0-6ubuntu1~16.04.12' --with-bugurl=file:///usr/share/doc/gcc-5/README.Bugs --enable-languages=c,ada,c++,java,go,d,fortran,objc,obj-c++ --prefix=/usr --program-suffix=-5 --enable-shared --enable-linker-build-id --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-gnu-unique-object --disable-vtable-verify --enable-libmpx --enable-plugin --with-system-zlib --disable-browser-plugin --enable-java-awt=gtk --enable-gtk-cairo --with-java-home=/usr/lib/jvm/java-1.5.0-gcj-5-amd64/jre --enable-java-home --with-jvm-root-dir=/usr/lib/jvm/java-1.5.0-gcj-5-amd64 --with-jvm-jar-dir=/usr/lib/jvm-exports/java-1.5.0-gcj-5-amd64 --with-arch-directory=amd64 --with-ecj-jar=/usr/share/java/eclipse-ecj.jar --enable-objc-gc --enable-multiarch --disable-werror --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu
+Thread model: posix
+gcc version 5.4.0 20160609 (Ubuntu 5.4.0-6ubuntu1~16.04.12) 
+COLLECT_GCC_OPTIONS='-O3' '-g' '-D' 'RCB_UNTHREADED_BUILD' '-D' 'USE_SERIAL_COSMO' '-fopenmp' '-v' '-I' '/usr/lib/openmpi/include' '-I' '/opt/rocm/hip/include' '-I' '/opt/rocm/hcc-1.0/include' '-I' '../dfft' '-c' '-o' 'BGQCM.o' '-mtune=generic' '-march=x86-64' '-pthread'
+ /usr/lib/gcc/x86_64-linux-gnu/5/cc1 -quiet -v -I /usr/lib/openmpi/include -I /opt/rocm/hip/include -I /opt/rocm/hcc-1.0/include -I ../dfft -imultiarch x86_64-linux-gnu -D_REENTRANT -D RCB_UNTHREADED_BUILD -D USE_SERIAL_COSMO BGQCM.c -quiet -dumpbase BGQCM.c -mtune=generic -march=x86-64 -auxbase-strip BGQCM.o -g -O3 -version -fopenmp -fstack-protector-strong -Wformat -Wformat-security -o /tmp/ccHqo2vJ.s
+GNU C11 (Ubuntu 5.4.0-6ubuntu1~16.04.12) version 5.4.0 20160609 (x86_64-linux-gnu)
+	compiled by GNU C version 5.4.0 20160609, GMP version 6.1.0, MPFR version 3.1.4, MPC version 1.0.3
+GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
+ignoring nonexistent directory "/usr/local/include/x86_64-linux-gnu"
+ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/5/../../../../x86_64-linux-gnu/include"
+ignoring duplicate directory "/usr/include"
+  as it is a non-system directory that duplicates a system directory
+ignoring duplicate directory "/usr/include"
+  as it is a non-system directory that duplicates a system directory
+#include "..." search starts here:
+#include <...> search starts here:
+ /usr/lib/openmpi/include
+ /opt/rocm/hip/include
+ /opt/rocm/hcc-1.0/include
+ ../dfft
+ .
+ /usr/lib/gcc/x86_64-linux-gnu/5/include
+ /usr/local/include
+ /usr/lib/gcc/x86_64-linux-gnu/5/include-fixed
+ /usr/include/x86_64-linux-gnu
+ /usr/include
+End of search list.
+GNU C11 (Ubuntu 5.4.0-6ubuntu1~16.04.12) version 5.4.0 20160609 (x86_64-linux-gnu)
+	compiled by GNU C version 5.4.0 20160609, GMP version 6.1.0, MPFR version 3.1.4, MPC version 1.0.3
+GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
+Compiler executable checksum: 8087146d2ee737d238113fb57fabb1f2
+COLLECT_GCC_OPTIONS='-O3' '-g' '-D' 'RCB_UNTHREADED_BUILD' '-D' 'USE_SERIAL_COSMO' '-fopenmp' '-v' '-I' '/usr/lib/openmpi/include' '-I' '/opt/rocm/hip/include' '-I' '/opt/rocm/hcc-1.0/include' '-I' '../dfft' '-c' '-o' 'BGQCM.o' '-mtune=generic' '-march=x86-64' '-pthread'
+ as -v -I /usr/lib/openmpi/include -I /opt/rocm/hip/include -I /opt/rocm/hcc-1.0/include -I ../dfft --64 -o BGQCM.o /tmp/ccHqo2vJ.s
+GNU assembler version 2.26.1 (x86_64-linux-gnu) using BFD version (GNU Binutils for Ubuntu) 2.26.1
+COMPILER_PATH=/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/
+LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/5/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/5/../../../:/lib/:/usr/lib/
+COLLECT_GCC_OPTIONS='-O3' '-g' '-D' 'RCB_UNTHREADED_BUILD' '-D' 'USE_SERIAL_COSMO' '-fopenmp' '-v' '-I' '/usr/lib/openmpi/include' '-I' '/opt/rocm/hip/include' '-I' '/opt/rocm/hcc-1.0/include' '-I' '../dfft' '-c' '-o' 'BGQCM.o' '-mtune=generic' '-march=x86-64' '-pthread'
+ar rv hip/libBHForceTree.a BGQCM.o
+r - BGQCM.o
+make: Warning: Archive 'hip/libBHForceTree.a' seems to have been created in deterministic mode. 'BGQCM.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+make: Warning: Archive 'hip/libpartition.a' seems to have been created in deterministic mode. 'hip/Partition.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+/opt/rocm/bin/hipcc -Xclang -pthread -I../dfft -DPENCIL=1 -O3 -g -DRCB_UNTHREADED_BUILD -DUSE_SERIAL_COSMO -fopenmp -v -I/usr/lib/openmpi/include -I/opt/rocm/hip/include -I/opt/rocm/hcc-1.0/include -DID_64 -DPOSVEL_32 -DGRID_32 -DLONG_INTEGER  -I. -I../dfft -c -o hip/Partition.o Partition.cxx
+HCC clang version 6.0.0  (based on HCC 1.0.17412-f590a25-821e6d8-64e7fc7 )
+Target: x86_64-unknown-linux-gnu
+Thread model: posix
+InstalledDir: /opt/rocm/hcc/bin
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6.0.0
+Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Candidate multilib: .;@m64
+Candidate multilib: 32;@m32
+Candidate multilib: x32;@mx32
+Selected multilib: .;@m64
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_HC__=1 -D__HCC_HC__=1 -D__KALMAR_CPU__=1 -D__HCC_CPU__=1 -triple x86_64-unknown-linux-gnu -S -disable-free -disable-llvm-verifier -main-file-name Partition.cxx -mrelocation-model static -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debug-info-kind=limited -dwarf-version=4 -debugger-tuning=gdb -momit-leaf-frame-pointer -v -coverage-notes-file /hacc/src/halo_finder/hip/Partition.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -D __HIPCC__ -I /opt/rocm/hcc/include -I /opt/rocm/hip/include/hip/hcc_detail/cuda -I /opt/rocm/hsa/include -I /opt/rocm/profiler/CXLActivityLogger/include -I /opt/rocm/hip/include -D HIP_VERSION_MAJOR=1 -D HIP_VERSION_MINOR=3 -D HIP_VERSION_PATCH=17385 -D __HIP_ARCH_GFX801__=1 -I ../dfft -D PENCIL=1 -D RCB_UNTHREADED_BUILD -D USE_SERIAL_COSMO -I /usr/lib/openmpi/include -I /opt/rocm/hip/include -I /opt/rocm/hcc-1.0/include -D ID_64 -D POSVEL_32 -D GRID_32 -D LONG_INTEGER -I . -I ../dfft -I/usr/include -I/usr/include -I. 
-internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward -internal-isystem /usr/local/include -internal-isystem /opt/rocm/hcc-1.0/lib/clang/6.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-deprecated-register -std=c++amp -fdeprecated-macro -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fcxx-exceptions -fexceptions -fdiagnostics-show-option -vectorize-loops -vectorize-slp -pthread -fhsa-ext -o /tmp/Partition-3e6791.s -x hc-host Partition.cxx -emit-llvm-bc
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ignoring nonexistent directory "/include"
+ignoring duplicate directory "/opt/rocm/hip/include"
+ignoring duplicate directory "/opt/rocm/hcc/include"
+ignoring duplicate directory "../dfft"
+ignoring duplicate directory "/usr/include"
+ignoring duplicate directory "."
+ignoring duplicate directory "/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0"
+ignoring duplicate directory "/usr/include"
+  as it is a non-system directory that duplicates a system directory
+#include "..." search starts here:
+#include <...> search starts here:
+ /opt/rocm/hcc/include
+ /opt/rocm/hip/include/hip/hcc_detail/cuda
+ /opt/rocm/hsa/include
+ /opt/rocm/profiler/CXLActivityLogger/include
+ /opt/rocm/hip/include
+ ../dfft
+ /usr/lib/openmpi/include
+ .
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward
+ /usr/local/include
+ /opt/rocm/hcc-1.0/lib/clang/6.0.0/include
+ /usr/include/x86_64-linux-gnu
+ /usr/include
+End of search list.
+ "/opt/rocm/hcc/bin/hc-host-assemble" /tmp/Partition-3e6791.s hip/Partition.o -D__HIPCC__ -Wno-deprecated-register -DHIP_VERSION_MAJOR=1 -DHIP_VERSION_MINOR=3 -DHIP_VERSION_PATCH=17385 -D__HIP_ARCH_GFX801__=1 -DPENCIL=1 -O3 -g -DRCB_UNTHREADED_BUILD -DUSE_SERIAL_COSMO -fopenmp -v -DID_64 -DPOSVEL_32 -DGRID_32 -DLONG_INTEGER
+HCC clang version 6.0.0  (based on HCC 1.0.17412-f590a25-821e6d8-64e7fc7 )
+Target: x86_64-unknown-linux-gnu
+Thread model: posix
+InstalledDir: /opt/rocm/hcc/bin
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6.0.0
+Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Candidate multilib: .;@m64
+Candidate multilib: 32;@m32
+Candidate multilib: x32;@mx32
+Selected multilib: .;@m64
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_CPU__=1 -D__HCC_CPU__=1 -triple x86_64-unknown-linux-gnu -emit-obj -disable-free -disable-llvm-verifier -discard-value-names -main-file-name Partition-3e6791.s.bc -mrelocation-model static -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debug-info-kind=limited -dwarf-version=4 -debugger-tuning=gdb -momit-leaf-frame-pointer -v -coverage-notes-file /hacc/src/halo_finder/hip/Partition.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -O3 -Wno-deprecated-register -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -o hip/Partition.o -x ir /tmp/Partition-3e6791.s.bc
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_HC__=1 -D__HCC_HC__=1 -D__GPU__=1 -D__KALMAR_ACCELERATOR__=1 -D__HCC_ACCELERATOR__=1 -famp-is-device -fno-builtin -fno-common -O2 -triple amdgcn--amdhsa-hcc -aux-triple x86_64-unknown-linux-gnu -S -disable-free -disable-llvm-verifier -main-file-name Partition.cxx -mrelocation-model static -mthread-model posix -mdisable-fp-elim -fmath-errno -no-integrated-as -mconstructor-aliases -v -coverage-notes-file /hacc/src/halo_finder/hip/Partition.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -D __HIPCC__ -I /opt/rocm/hcc/include -I /opt/rocm/hip/include/hip/hcc_detail/cuda -I /opt/rocm/hsa/include -I /opt/rocm/profiler/CXLActivityLogger/include -I /opt/rocm/hip/include -D HIP_VERSION_MAJOR=1 -D HIP_VERSION_MINOR=3 -D HIP_VERSION_PATCH=17385 -D __HIP_ARCH_GFX801__=1 -I ../dfft -D PENCIL=1 -D RCB_UNTHREADED_BUILD -D USE_SERIAL_COSMO -I /usr/lib/openmpi/include -I /opt/rocm/hip/include -I /opt/rocm/hcc-1.0/include -D ID_64 -D POSVEL_32 -D GRID_32 -D LONG_INTEGER -I . -I ../dfft -I/usr/include -I/usr/include -I. -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/amdgcn--amdhsa-hcc/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward -internal-isystem /usr/local/include -internal-isystem /opt/rocm/hcc-1.0/lib/clang/6.0.0/include -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-deprecated-register -std=c++amp -fdeprecated-macro -fno-dwarf-directory-asm -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fcxx-exceptions -fexceptions -fdiagnostics-show-option -vectorize-slp -pthread -fhsa-ext -o /tmp/Partition-7a6952.s -x hc-kernel Partition.cxx -emit-llvm-bc
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/amdgcn--amdhsa-hcc/c++/5.4.0"
+ignoring nonexistent directory "/include"
+ignoring duplicate directory "/opt/rocm/hip/include"
+ignoring duplicate directory "/opt/rocm/hcc/include"
+ignoring duplicate directory "../dfft"
+ignoring duplicate directory "/usr/include"
+ignoring duplicate directory "."
+ignoring duplicate directory "/usr/include"
+  as it is a non-system directory that duplicates a system directory
+ignoring duplicate directory "/usr/local/include"
+ignoring duplicate directory "/opt/rocm/hcc-1.0/lib/clang/6.0.0/include"
+ignoring duplicate directory "/usr/include"
+#include "..." search starts here:
+#include <...> search starts here:
+ /opt/rocm/hcc/include
+ /opt/rocm/hip/include/hip/hcc_detail/cuda
+ /opt/rocm/hsa/include
+ /opt/rocm/profiler/CXLActivityLogger/include
+ /opt/rocm/hip/include
+ ../dfft
+ /usr/lib/openmpi/include
+ .
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward
+ /usr/local/include
+ /opt/rocm/hcc-1.0/lib/clang/6.0.0/include
+ /usr/include
+End of search list.
+ "/opt/rocm/hcc/bin/hc-kernel-assemble" /tmp/Partition-7a6952.s hip/Partition.o
+ar rv hip/libpartition.a hip/Partition.o
+r - hip/Partition.o
+make: Warning: Archive 'hip/libpartition.a' seems to have been created in deterministic mode. 'hip/Partition.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+make: Warning: Archive 'hip/libhalotime.a' seems to have been created in deterministic mode. 'hip/Timings.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+/opt/rocm/bin/hipcc -Xclang -pthread -I../dfft -DPENCIL=1 -O3 -g -DRCB_UNTHREADED_BUILD -DUSE_SERIAL_COSMO -fopenmp -v -I/usr/lib/openmpi/include -I/opt/rocm/hip/include -I/opt/rocm/hcc-1.0/include -DID_64 -DPOSVEL_32 -DGRID_32 -DLONG_INTEGER  -I. -I../dfft -c -o hip/Timings.o Timings.cxx
+HCC clang version 6.0.0  (based on HCC 1.0.17412-f590a25-821e6d8-64e7fc7 )
+Target: x86_64-unknown-linux-gnu
+Thread model: posix
+InstalledDir: /opt/rocm/hcc/bin
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6.0.0
+Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Candidate multilib: .;@m64
+Candidate multilib: 32;@m32
+Candidate multilib: x32;@mx32
+Selected multilib: .;@m64
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_HC__=1 -D__HCC_HC__=1 -D__KALMAR_CPU__=1 -D__HCC_CPU__=1 -triple x86_64-unknown-linux-gnu -S -disable-free -disable-llvm-verifier -main-file-name Timings.cxx -mrelocation-model static -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debug-info-kind=limited -dwarf-version=4 -debugger-tuning=gdb -momit-leaf-frame-pointer -v -coverage-notes-file /hacc/src/halo_finder/hip/Timings.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -D __HIPCC__ -I /opt/rocm/hcc/include -I /opt/rocm/hip/include/hip/hcc_detail/cuda -I /opt/rocm/hsa/include -I /opt/rocm/profiler/CXLActivityLogger/include -I /opt/rocm/hip/include -D HIP_VERSION_MAJOR=1 -D HIP_VERSION_MINOR=3 -D HIP_VERSION_PATCH=17385 -D __HIP_ARCH_GFX801__=1 -I ../dfft -D PENCIL=1 -D RCB_UNTHREADED_BUILD -D USE_SERIAL_COSMO -I /usr/lib/openmpi/include -I /opt/rocm/hip/include -I /opt/rocm/hcc-1.0/include -D ID_64 -D POSVEL_32 -D GRID_32 -D LONG_INTEGER -I . -I ../dfft -I/usr/include -I/usr/include -I. 
-internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward -internal-isystem /usr/local/include -internal-isystem /opt/rocm/hcc-1.0/lib/clang/6.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-deprecated-register -std=c++amp -fdeprecated-macro -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fcxx-exceptions -fexceptions -fdiagnostics-show-option -vectorize-loops -vectorize-slp -pthread -fhsa-ext -o /tmp/Timings-1d08ee.s -x hc-host Timings.cxx -emit-llvm-bc
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ignoring nonexistent directory "/include"
+ignoring duplicate directory "/opt/rocm/hip/include"
+ignoring duplicate directory "/opt/rocm/hcc/include"
+ignoring duplicate directory "../dfft"
+ignoring duplicate directory "/usr/include"
+ignoring duplicate directory "."
+ignoring duplicate directory "/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0"
+ignoring duplicate directory "/usr/include"
+  as it is a non-system directory that duplicates a system directory
+#include "..." search starts here:
+#include <...> search starts here:
+ /opt/rocm/hcc/include
+ /opt/rocm/hip/include/hip/hcc_detail/cuda
+ /opt/rocm/hsa/include
+ /opt/rocm/profiler/CXLActivityLogger/include
+ /opt/rocm/hip/include
+ ../dfft
+ /usr/lib/openmpi/include
+ .
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward
+ /usr/local/include
+ /opt/rocm/hcc-1.0/lib/clang/6.0.0/include
+ /usr/include/x86_64-linux-gnu
+ /usr/include
+End of search list.
+In file included from Timings.cxx:45:
+./Timings.h:80:17: warning: using directive refers to implicitly-defined namespace 'std'
+using namespace std;
+                ^
+1 warning generated.
+ "/opt/rocm/hcc/bin/hc-host-assemble" /tmp/Timings-1d08ee.s hip/Timings.o -D__HIPCC__ -Wno-deprecated-register -DHIP_VERSION_MAJOR=1 -DHIP_VERSION_MINOR=3 -DHIP_VERSION_PATCH=17385 -D__HIP_ARCH_GFX801__=1 -DPENCIL=1 -O3 -g -DRCB_UNTHREADED_BUILD -DUSE_SERIAL_COSMO -fopenmp -v -DID_64 -DPOSVEL_32 -DGRID_32 -DLONG_INTEGER
+HCC clang version 6.0.0  (based on HCC 1.0.17412-f590a25-821e6d8-64e7fc7 )
+Target: x86_64-unknown-linux-gnu
+Thread model: posix
+InstalledDir: /opt/rocm/hcc/bin
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6.0.0
+Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Candidate multilib: .;@m64
+Candidate multilib: 32;@m32
+Candidate multilib: x32;@mx32
+Selected multilib: .;@m64
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_CPU__=1 -D__HCC_CPU__=1 -triple x86_64-unknown-linux-gnu -emit-obj -disable-free -disable-llvm-verifier -discard-value-names -main-file-name Timings-1d08ee.s.bc -mrelocation-model static -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debug-info-kind=limited -dwarf-version=4 -debugger-tuning=gdb -momit-leaf-frame-pointer -v -coverage-notes-file /hacc/src/halo_finder/hip/Timings.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -O3 -Wno-deprecated-register -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -o hip/Timings.o -x ir /tmp/Timings-1d08ee.s.bc
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_HC__=1 -D__HCC_HC__=1 -D__GPU__=1 -D__KALMAR_ACCELERATOR__=1 -D__HCC_ACCELERATOR__=1 -famp-is-device -fno-builtin -fno-common -O2 -triple amdgcn--amdhsa-hcc -aux-triple x86_64-unknown-linux-gnu -S -disable-free -disable-llvm-verifier -main-file-name Timings.cxx -mrelocation-model static -mthread-model posix -mdisable-fp-elim -fmath-errno -no-integrated-as -mconstructor-aliases -v -coverage-notes-file /hacc/src/halo_finder/hip/Timings.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -D __HIPCC__ -I /opt/rocm/hcc/include -I /opt/rocm/hip/include/hip/hcc_detail/cuda -I /opt/rocm/hsa/include -I /opt/rocm/profiler/CXLActivityLogger/include -I /opt/rocm/hip/include -D HIP_VERSION_MAJOR=1 -D HIP_VERSION_MINOR=3 -D HIP_VERSION_PATCH=17385 -D __HIP_ARCH_GFX801__=1 -I ../dfft -D PENCIL=1 -D RCB_UNTHREADED_BUILD -D USE_SERIAL_COSMO -I /usr/lib/openmpi/include -I /opt/rocm/hip/include -I /opt/rocm/hcc-1.0/include -D ID_64 -D POSVEL_32 -D GRID_32 -D LONG_INTEGER -I . -I ../dfft -I/usr/include -I/usr/include -I. -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/amdgcn--amdhsa-hcc/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward -internal-isystem /usr/local/include -internal-isystem /opt/rocm/hcc-1.0/lib/clang/6.0.0/include -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-deprecated-register -std=c++amp -fdeprecated-macro -fno-dwarf-directory-asm -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fcxx-exceptions -fexceptions -fdiagnostics-show-option -vectorize-slp -pthread -fhsa-ext -o /tmp/Timings-6caa88.s -x hc-kernel Timings.cxx -emit-llvm-bc
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/amdgcn--amdhsa-hcc/c++/5.4.0"
+ignoring nonexistent directory "/include"
+ignoring duplicate directory "/opt/rocm/hip/include"
+ignoring duplicate directory "/opt/rocm/hcc/include"
+ignoring duplicate directory "../dfft"
+ignoring duplicate directory "/usr/include"
+ignoring duplicate directory "."
+ignoring duplicate directory "/usr/include"
+  as it is a non-system directory that duplicates a system directory
+ignoring duplicate directory "/usr/local/include"
+ignoring duplicate directory "/opt/rocm/hcc-1.0/lib/clang/6.0.0/include"
+ignoring duplicate directory "/usr/include"
+#include "..." search starts here:
+#include <...> search starts here:
+ /opt/rocm/hcc/include
+ /opt/rocm/hip/include/hip/hcc_detail/cuda
+ /opt/rocm/hsa/include
+ /opt/rocm/profiler/CXLActivityLogger/include
+ /opt/rocm/hip/include
+ ../dfft
+ /usr/lib/openmpi/include
+ .
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward
+ /usr/local/include
+ /opt/rocm/hcc-1.0/lib/clang/6.0.0/include
+ /usr/include
+End of search list.
+In file included from Timings.cxx:45:
+./Timings.h:80:17: warning: using directive refers to implicitly-defined namespace 'std'
+using namespace std;
+                ^
+1 warning generated.
+ "/opt/rocm/hcc/bin/hc-kernel-assemble" /tmp/Timings-6caa88.s hip/Timings.o
+ar rv hip/libhalotime.a hip/Timings.o
+r - hip/Timings.o
+make: Warning: Archive 'hip/libhalotime.a' seems to have been created in deterministic mode. 'hip/Timings.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+make: Warning: Archive 'hip/libhalotime.a' seems to have been created in deterministic mode. 'hip/Timer.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+/opt/rocm/bin/hipcc -Xclang -pthread -I../dfft -DPENCIL=1 -O3 -g -DRCB_UNTHREADED_BUILD -DUSE_SERIAL_COSMO -fopenmp -v -I/usr/lib/openmpi/include -I/opt/rocm/hip/include -I/opt/rocm/hcc-1.0/include -DID_64 -DPOSVEL_32 -DGRID_32 -DLONG_INTEGER  -I. -I../dfft -c -o hip/Timer.o Timer.cxx
+HCC clang version 6.0.0  (based on HCC 1.0.17412-f590a25-821e6d8-64e7fc7 )
+Target: x86_64-unknown-linux-gnu
+Thread model: posix
+InstalledDir: /opt/rocm/hcc/bin
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6.0.0
+Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Candidate multilib: .;@m64
+Candidate multilib: 32;@m32
+Candidate multilib: x32;@mx32
+Selected multilib: .;@m64
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_HC__=1 -D__HCC_HC__=1 -D__KALMAR_CPU__=1 -D__HCC_CPU__=1 -triple x86_64-unknown-linux-gnu -S -disable-free -disable-llvm-verifier -main-file-name Timer.cxx -mrelocation-model static -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debug-info-kind=limited -dwarf-version=4 -debugger-tuning=gdb -momit-leaf-frame-pointer -v -coverage-notes-file /hacc/src/halo_finder/hip/Timer.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -D __HIPCC__ -I /opt/rocm/hcc/include -I /opt/rocm/hip/include/hip/hcc_detail/cuda -I /opt/rocm/hsa/include -I /opt/rocm/profiler/CXLActivityLogger/include -I /opt/rocm/hip/include -D HIP_VERSION_MAJOR=1 -D HIP_VERSION_MINOR=3 -D HIP_VERSION_PATCH=17385 -D __HIP_ARCH_GFX801__=1 -I ../dfft -D PENCIL=1 -D RCB_UNTHREADED_BUILD -D USE_SERIAL_COSMO -I /usr/lib/openmpi/include -I /opt/rocm/hip/include -I /opt/rocm/hcc-1.0/include -D ID_64 -D POSVEL_32 -D GRID_32 -D LONG_INTEGER -I . -I ../dfft -I/usr/include -I/usr/include -I. 
-internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward -internal-isystem /usr/local/include -internal-isystem /opt/rocm/hcc-1.0/lib/clang/6.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-deprecated-register -std=c++amp -fdeprecated-macro -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fcxx-exceptions -fexceptions -fdiagnostics-show-option -vectorize-loops -vectorize-slp -pthread -fhsa-ext -o /tmp/Timer-0b4bd3.s -x hc-host Timer.cxx -emit-llvm-bc
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ignoring nonexistent directory "/include"
+ignoring duplicate directory "/opt/rocm/hip/include"
+ignoring duplicate directory "/opt/rocm/hcc/include"
+ignoring duplicate directory "../dfft"
+ignoring duplicate directory "/usr/include"
+ignoring duplicate directory "."
+ignoring duplicate directory "/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0"
+ignoring duplicate directory "/usr/include"
+  as it is a non-system directory that duplicates a system directory
+#include "..." search starts here:
+#include <...> search starts here:
+ /opt/rocm/hcc/include
+ /opt/rocm/hip/include/hip/hcc_detail/cuda
+ /opt/rocm/hsa/include
+ /opt/rocm/profiler/CXLActivityLogger/include
+ /opt/rocm/hip/include
+ ../dfft
+ /usr/lib/openmpi/include
+ .
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward
+ /usr/local/include
+ /opt/rocm/hcc-1.0/lib/clang/6.0.0/include
+ /usr/include/x86_64-linux-gnu
+ /usr/include
+End of search list.
+ "/opt/rocm/hcc/bin/hc-host-assemble" /tmp/Timer-0b4bd3.s hip/Timer.o -D__HIPCC__ -Wno-deprecated-register -DHIP_VERSION_MAJOR=1 -DHIP_VERSION_MINOR=3 -DHIP_VERSION_PATCH=17385 -D__HIP_ARCH_GFX801__=1 -DPENCIL=1 -O3 -g -DRCB_UNTHREADED_BUILD -DUSE_SERIAL_COSMO -fopenmp -v -DID_64 -DPOSVEL_32 -DGRID_32 -DLONG_INTEGER
+HCC clang version 6.0.0  (based on HCC 1.0.17412-f590a25-821e6d8-64e7fc7 )
+Target: x86_64-unknown-linux-gnu
+Thread model: posix
+InstalledDir: /opt/rocm/hcc/bin
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6.0.0
+Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Candidate multilib: .;@m64
+Candidate multilib: 32;@m32
+Candidate multilib: x32;@mx32
+Selected multilib: .;@m64
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_CPU__=1 -D__HCC_CPU__=1 -triple x86_64-unknown-linux-gnu -emit-obj -disable-free -disable-llvm-verifier -discard-value-names -main-file-name Timer-0b4bd3.s.bc -mrelocation-model static -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debug-info-kind=limited -dwarf-version=4 -debugger-tuning=gdb -momit-leaf-frame-pointer -v -coverage-notes-file /hacc/src/halo_finder/hip/Timer.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -O3 -Wno-deprecated-register -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -o hip/Timer.o -x ir /tmp/Timer-0b4bd3.s.bc
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_HC__=1 -D__HCC_HC__=1 -D__GPU__=1 -D__KALMAR_ACCELERATOR__=1 -D__HCC_ACCELERATOR__=1 -famp-is-device -fno-builtin -fno-common -O2 -triple amdgcn--amdhsa-hcc -aux-triple x86_64-unknown-linux-gnu -S -disable-free -disable-llvm-verifier -main-file-name Timer.cxx -mrelocation-model static -mthread-model posix -mdisable-fp-elim -fmath-errno -no-integrated-as -mconstructor-aliases -v -coverage-notes-file /hacc/src/halo_finder/hip/Timer.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -D __HIPCC__ -I /opt/rocm/hcc/include -I /opt/rocm/hip/include/hip/hcc_detail/cuda -I /opt/rocm/hsa/include -I /opt/rocm/profiler/CXLActivityLogger/include -I /opt/rocm/hip/include -D HIP_VERSION_MAJOR=1 -D HIP_VERSION_MINOR=3 -D HIP_VERSION_PATCH=17385 -D __HIP_ARCH_GFX801__=1 -I ../dfft -D PENCIL=1 -D RCB_UNTHREADED_BUILD -D USE_SERIAL_COSMO -I /usr/lib/openmpi/include -I /opt/rocm/hip/include -I /opt/rocm/hcc-1.0/include -D ID_64 -D POSVEL_32 -D GRID_32 -D LONG_INTEGER -I . -I ../dfft -I/usr/include -I/usr/include -I. -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/amdgcn--amdhsa-hcc/c++/5.4.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward -internal-isystem /usr/local/include -internal-isystem /opt/rocm/hcc-1.0/lib/clang/6.0.0/include -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-deprecated-register -std=c++amp -fdeprecated-macro -fno-dwarf-directory-asm -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fcxx-exceptions -fexceptions -fdiagnostics-show-option -vectorize-slp -pthread -fhsa-ext -o /tmp/Timer-04a004.s -x hc-kernel Timer.cxx -emit-llvm-bc
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/amdgcn--amdhsa-hcc/c++/5.4.0"
+ignoring nonexistent directory "/include"
+ignoring duplicate directory "/opt/rocm/hip/include"
+ignoring duplicate directory "/opt/rocm/hcc/include"
+ignoring duplicate directory "../dfft"
+ignoring duplicate directory "/usr/include"
+ignoring duplicate directory "."
+ignoring duplicate directory "/usr/include"
+  as it is a non-system directory that duplicates a system directory
+ignoring duplicate directory "/usr/local/include"
+ignoring duplicate directory "/opt/rocm/hcc-1.0/lib/clang/6.0.0/include"
+ignoring duplicate directory "/usr/include"
+#include "..." search starts here:
+#include <...> search starts here:
+ /opt/rocm/hcc/include
+ /opt/rocm/hip/include/hip/hcc_detail/cuda
+ /opt/rocm/hsa/include
+ /opt/rocm/profiler/CXLActivityLogger/include
+ /opt/rocm/hip/include
+ ../dfft
+ /usr/lib/openmpi/include
+ .
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/x86_64-linux-gnu/c++/5.4.0
+ /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/backward
+ /usr/local/include
+ /opt/rocm/hcc-1.0/lib/clang/6.0.0/include
+ /usr/include
+End of search list.
+ "/opt/rocm/hcc/bin/hc-kernel-assemble" /tmp/Timer-04a004.s hip/Timer.o
+ar rv hip/libhalotime.a hip/Timer.o
+r - hip/Timer.o
+make: Warning: Archive 'hip/libhalotime.a' seems to have been created in deterministic mode. 'hip/Timer.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+make: Warning: Archive 'hip/libbigchunk.a' seems to have been created in deterministic mode. 'hip/bigchunk.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+/opt/rocm/bin/hcc -x c -Xclang -std=c99 -Xclang -pthread -O3 -g -DRCB_UNTHREADED_BUILD -DUSE_SERIAL_COSMO -fopenmp -v -I/usr/lib/openmpi/include -I/opt/rocm/hip/include -I/opt/rocm/hcc-1.0/include -I../dfft -c -o hip/bigchunk.o bigchunk.c
+HCC clang version 6.0.0  (based on HCC 1.0.17412-f590a25-821e6d8-64e7fc7 )
+Target: x86_64-unknown-linux-gnu
+Thread model: posix
+InstalledDir: /opt/rocm/bin
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6.0.0
+Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Candidate multilib: .;@m64
+Candidate multilib: 32;@m32
+Candidate multilib: x32;@mx32
+Selected multilib: .;@m64
+ "/opt/rocm/hcc-1.0/bin/clang-6.0" -cc1 -D__KALMAR_CPU__=1 -D__HCC_CPU__=1 -triple x86_64-unknown-linux-gnu -emit-obj -disable-free -disable-llvm-verifier -discard-value-names -main-file-name bigchunk.c -mrelocation-model static -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debug-info-kind=limited -dwarf-version=4 -debugger-tuning=gdb -momit-leaf-frame-pointer -v -coverage-notes-file /hacc/src/halo_finder/hip/bigchunk.gcno -resource-dir /opt/rocm/hcc-1.0/lib/clang/6.0.0 -D RCB_UNTHREADED_BUILD -D USE_SERIAL_COSMO -I /usr/lib/openmpi/include -I /opt/rocm/hip/include -I /opt/rocm/hcc-1.0/include -I ../dfft -I/usr/include -I/usr/include -I. -internal-isystem /usr/local/include -internal-isystem /opt/rocm/hcc-1.0/lib/clang/6.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -fdebug-compilation-dir /hacc/src/halo_finder -ferror-limit 19 -fmessage-length 0 -fopenmp -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -std=c99 -pthread -o hip/bigchunk.o -x c bigchunk.c
+clang -cc1 version 6.0.0 based upon HCC 1.0.17412-f590a25-821e6d8-64e7fc7  default target x86_64-unknown-linux-gnu
+ignoring nonexistent directory "/include"
+ignoring duplicate directory "/usr/include"
+ignoring duplicate directory "/usr/include"
+  as it is a non-system directory that duplicates a system directory
+#include "..." search starts here:
+#include <...> search starts here:
+ /usr/lib/openmpi/include
+ /opt/rocm/hip/include
+ /opt/rocm/hcc-1.0/include
+ ../dfft
+ .
+ /usr/local/include
+ /opt/rocm/hcc-1.0/lib/clang/6.0.0/include
+ /usr/include/x86_64-linux-gnu
+ /usr/include
+End of search list.
+bigchunk.c:86:33: warning: comparison of distinct pointer types ('void *' and 'char *') [-Wcompare-distinct-pointer-types]
+        if (ptr < _bigchunk_ptr || ptr >= (char *)_bigchunk_ptr + _bigchunk_sz) {
+                                   ~~~ ^  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+1 warning generated.
+ar rv hip/libbigchunk.a hip/bigchunk.o
+r - hip/bigchunk.o
+make: Warning: Archive 'hip/libbigchunk.a' seems to have been created in deterministic mode. 'hip/bigchunk.o' will always be updated. Please consider passing the U flag to ar to avoid the problem.
+/opt/rocm/bin/hipcc -Xclang -pthread -o hip/ForceTreeTest hip/ForceTreeTest.o hip/libparticle.a hip/libBHForceTree.a hip/libpartition.a hip/libhalotime.a hip/libbigchunk.a hip/dims-local.o -I../dfft -DPENCIL=1 -O3 -g -DRCB_UNTHREADED_BUILD -DUSE_SERIAL_COSMO -fopenmp -v -I/usr/lib/openmpi/include -I/opt/rocm/hip/include -I/opt/rocm/hcc-1.0/include -DID_64 -DPOSVEL_32 -DGRID_32 -DLONG_INTEGER  -I. -I../dfft -lm -lrt -L/usr/lib/openmpi/lib -lmpi -lmpi_cxx  -DUSE_SERIAL_COSMO=1
+HCC clang version 6.0.0  (based on HCC 1.0.17412-f590a25-821e6d8-64e7fc7 )
+Target: x86_64-unknown-linux-gnu
+Thread model: posix
+InstalledDir: /opt/rocm/hcc/bin
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6.0.0
+Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/5.4.0
+Candidate multilib: .;@m64
+Candidate multilib: 32;@m32
+Candidate multilib: x32;@mx32
+Selected multilib: .;@m64
+ "/opt/rocm/hcc/bin/clamp-link" --verbose --amdgpu-target=gfx801 -z relro --hash-style=gnu --eh-frame-hdr -m elf_x86_64 -dynamic-linker /lib64/ld-linux-x86-64.so.2 -o hip/ForceTreeTest /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/crt1.o /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/5.4.0/crtbegin.o -L/opt/rocm/hcc/lib -L/opt/rocm/hsa/lib -L/opt/rocm/lib -L/opt/rocm/profiler/CXLActivityLogger/bin/x86_64 -L/usr/lib/openmpi/lib -L/usr/lib/gcc/x86_64-linux-gnu/5.4.0 -L/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu -L/lib/x86_64-linux-gnu -L/lib/../lib64 -L/usr/lib/x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../.. -L/opt/rocm/hcc-1.0/bin/../lib -L/lib -L/usr/lib --rpath=/opt/rocm/hip/lib /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a --rpath=/opt/rocm/hcc/lib -ldl -lm -lpthread -lunwind --whole-archive -lmcwamp --no-whole-archive -lsupc++ -lhsa-runtime64 -lhc_am -lhsakmt -lCXLActivityLogger --rpath=/opt/rocm/profiler/CXLActivityLogger/bin/x86_64 -lm hip/ForceTreeTest.o hip/libparticle.a hip/libBHForceTree.a hip/libpartition.a hip/libhalotime.a hip/libbigchunk.a hip/dims-local.o -lm -lrt -lmpi -lmpi_cxx -lstdc++ -lm -lomp -lgcc_s -lgcc -lpthread -lc -lgcc_s -lgcc /usr/lib/gcc/x86_64-linux-gnu/5.4.0/crtend.o /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/crtn.o -lclang_rt.builtins-x86_64
+AMDGPU target array: gfx801
+
+new kernel args: /tmp/tmp.qW4sNUeRZG/device_functions.cpp.kernel.bc /tmp/tmp.qW4sNUeRZG/device_util.cpp.kernel.bc /tmp/tmp.qW4sNUeRZG/hip_fp16.cpp.kernel.bc /tmp/tmp.qW4sNUeRZG/hip_ldg.cpp.kernel.bc /tmp/tmp.qW4sNUeRZG/math_functions.cpp.kernel.bc /tmp/tmp.qW4sNUeRZG/mcwamp.cpp.kernel.bc /tmp/tmp.qW4sNUeRZG/ForceTreeTest.kernel.bc /tmp/tmp.qW4sNUeRZG/InitialExchange.kernel.bc /tmp/tmp.qW4sNUeRZG/Message.kernel.bc /tmp/tmp.qW4sNUeRZG/ParticleDistribute.kernel.bc /tmp/tmp.qW4sNUeRZG/ParticleExchange.kernel.bc /tmp/tmp.qW4sNUeRZG/BHForceTree.kernel.bc /tmp/tmp.qW4sNUeRZG/ForceLaw.kernel.bc /tmp/tmp.qW4sNUeRZG/RCBForceTree.kernel.bc /tmp/tmp.qW4sNUeRZG/RCOForceTree.kernel.bc /tmp/tmp.qW4sNUeRZG/Partition.kernel.bc /tmp/tmp.qW4sNUeRZG/Timer.kernel.bc /tmp/tmp.qW4sNUeRZG/Timings.kernel.bc
+
+new host args: /tmp/tmp.qW4sNUeRZG/device_functions.cpp.host.o /tmp/tmp.qW4sNUeRZG/device_util.cpp.host.o /tmp/tmp.qW4sNUeRZG/hip_fp16.cpp.host.o /tmp/tmp.qW4sNUeRZG/hip_ldg.cpp.host.o /tmp/tmp.qW4sNUeRZG/math_functions.cpp.host.o /tmp/tmp.qW4sNUeRZG/mcwamp.cpp.host.o /tmp/tmp.qW4sNUeRZG/ForceTreeTest.host.o /tmp/tmp.qW4sNUeRZG/InitialExchange.host.o /tmp/tmp.qW4sNUeRZG/Message.host.o /tmp/tmp.qW4sNUeRZG/ParticleDistribute.host.o /tmp/tmp.qW4sNUeRZG/ParticleExchange.host.o /tmp/tmp.qW4sNUeRZG/BHForceTree.host.o /tmp/tmp.qW4sNUeRZG/ForceLaw.host.o /tmp/tmp.qW4sNUeRZG/RCBForceTree.host.o /tmp/tmp.qW4sNUeRZG/RCOForceTree.host.o /tmp/tmp.qW4sNUeRZG/Partition.host.o /tmp/tmp.qW4sNUeRZG/Timer.host.o /tmp/tmp.qW4sNUeRZG/Timings.host.o
+
+new other args: --verbose -z relro --hash-style=gnu --eh-frame-hdr -m elf_x86_64 -dynamic-linker /lib64/ld-linux-x86-64.so.2 -o hip/ForceTreeTest /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/crt1.o /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/5.4.0/crtbegin.o -L/opt/rocm/hcc/lib -L/opt/rocm/hsa/lib -L/opt/rocm/lib -L/opt/rocm/profiler/CXLActivityLogger/bin/x86_64 -L/usr/lib/openmpi/lib -L/usr/lib/gcc/x86_64-linux-gnu/5.4.0 -L/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu -L/lib/x86_64-linux-gnu -L/lib/../lib64 -L/usr/lib/x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../.. -L/opt/rocm/hcc-1.0/bin/../lib -L/lib -L/usr/lib --rpath=/opt/rocm/hip/lib /opt/rocm/hip/lib/libhip_hcc.so --rpath=/opt/rocm/hcc/lib -ldl -lm -lpthread -lunwind --whole-archive --no-whole-archive -lsupc++ -lhsa-runtime64 -lhc_am -lhsakmt -lCXLActivityLogger --rpath=/opt/rocm/profiler/CXLActivityLogger/bin/x86_64 -lm /tmp/tmp.qW4sNUeRZG/libBHForceTree.a/BGQCM.o /tmp/tmp.qW4sNUeRZG/libBHForceTree.a/BGQStep16.o hip/libbigchunk.a hip/dims-local.o -lm -lrt -lmpi -lmpi_cxx -lstdc++ -lm -lomp -lgcc_s -lgcc -lpthread -lc -lgcc_s -lgcc /usr/lib/gcc/x86_64-linux-gnu/5.4.0/crtend.o /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/crtn.o -lclang_rt.builtins-x86_64
+
+Generating AMD GCN kernel
+GNU ld (GNU Binutils for Ubuntu) 2.26.1
+  Supported emulations:
+   elf_x86_64
+   elf32_x86_64
+   elf_i386
+   elf_iamcu
+   i386linux
+   elf_l1om
+   elf_k1om
+   i386pep
+   i386pe
+using internal linker script:
+==================================================
+/* Script for -z combreloc: combine and sort reloc sections */
+/* Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copying and distribution of this script, with or without modification,
+   are permitted in any medium without royalty provided the copyright
+   notice and this notice are preserved.  */
+OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64",
+	      "elf64-x86-64")
+OUTPUT_ARCH(i386:x86-64)
+ENTRY(_start)
+SEARCH_DIR("=/usr/local/lib/x86_64-linux-gnu"); SEARCH_DIR("=/lib/x86_64-linux-gnu"); SEARCH_DIR("=/usr/lib/x86_64-linux-gnu"); SEARCH_DIR("=/usr/local/lib64"); SEARCH_DIR("=/lib64"); SEARCH_DIR("=/usr/lib64"); SEARCH_DIR("=/usr/local/lib"); SEARCH_DIR("=/lib"); SEARCH_DIR("=/usr/lib"); SEARCH_DIR("=/usr/x86_64-linux-gnu/lib64"); SEARCH_DIR("=/usr/x86_64-linux-gnu/lib");
+SECTIONS
+{
+  /* Read-only sections, merged into text segment: */
+  PROVIDE (__executable_start = SEGMENT_START("text-segment", 0x400000)); . = SEGMENT_START("text-segment", 0x400000) + SIZEOF_HEADERS;
+  .interp         : { *(.interp) }
+  .note.gnu.build-id : { *(.note.gnu.build-id) }
+  .hash           : { *(.hash) }
+  .gnu.hash       : { *(.gnu.hash) }
+  .dynsym         : { *(.dynsym) }
+  .dynstr         : { *(.dynstr) }
+  .gnu.version    : { *(.gnu.version) }
+  .gnu.version_d  : { *(.gnu.version_d) }
+  .gnu.version_r  : { *(.gnu.version_r) }
+  .rela.dyn       :
+    {
+      *(.rela.init)
+      *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*)
+      *(.rela.fini)
+      *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*)
+      *(.rela.data .rela.data.* .rela.gnu.linkonce.d.*)
+      *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*)
+      *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*)
+      *(.rela.ctors)
+      *(.rela.dtors)
+      *(.rela.got)
+      *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*)
+      *(.rela.ldata .rela.ldata.* .rela.gnu.linkonce.l.*)
+      *(.rela.lbss .rela.lbss.* .rela.gnu.linkonce.lb.*)
+      *(.rela.lrodata .rela.lrodata.* .rela.gnu.linkonce.lr.*)
+      *(.rela.ifunc)
+    }
+  .rela.plt       :
+    {
+      *(.rela.plt)
+      PROVIDE_HIDDEN (__rela_iplt_start = .);
+      *(.rela.iplt)
+      PROVIDE_HIDDEN (__rela_iplt_end = .);
+    }
+  .init           :
+  {
+    KEEP (*(SORT_NONE(.init)))
+  }
+  .plt            : { *(.plt) *(.iplt) }
+.plt.got        : { *(.plt.got) }
+.plt.bnd        : { *(.plt.bnd) }
+  .text           :
+  {
+    *(.text.unlikely .text.*_unlikely .text.unlikely.*)
+    *(.text.exit .text.exit.*)
+    *(.text.startup .text.startup.*)
+    *(.text.hot .text.hot.*)
+    *(.text .stub .text.* .gnu.linkonce.t.*)
+    /* .gnu.warning sections are handled specially by elf32.em.  */
+    *(.gnu.warning)
+  }
+  .fini           :
+  {
+    KEEP (*(SORT_NONE(.fini)))
+  }
+  PROVIDE (__etext = .);
+  PROVIDE (_etext = .);
+  PROVIDE (etext = .);
+  .rodata         : { *(.rodata .rodata.* .gnu.linkonce.r.*) }
+  .rodata1        : { *(.rodata1) }
+  .eh_frame_hdr : { *(.eh_frame_hdr) *(.eh_frame_entry .eh_frame_entry.*) }
+  .eh_frame       : ONLY_IF_RO { KEEP (*(.eh_frame)) *(.eh_frame.*) }
+  .gcc_except_table   : ONLY_IF_RO { *(.gcc_except_table
+  .gcc_except_table.*) }
+  .gnu_extab   : ONLY_IF_RO { *(.gnu_extab*) }
+  /* These sections are generated by the Sun/Oracle C++ compiler.  */
+  .exception_ranges   : ONLY_IF_RO { *(.exception_ranges
+  .exception_ranges*) }
+  /* Adjust the address for the data segment.  We want to adjust up to
+     the same address within the page on the next page up.  */
+  . = DATA_SEGMENT_ALIGN (CONSTANT (MAXPAGESIZE), CONSTANT (COMMONPAGESIZE));
+  /* Exception handling  */
+  .eh_frame       : ONLY_IF_RW { KEEP (*(.eh_frame)) *(.eh_frame.*) }
+  .gnu_extab      : ONLY_IF_RW { *(.gnu_extab) }
+  .gcc_except_table   : ONLY_IF_RW { *(.gcc_except_table .gcc_except_table.*) }
+  .exception_ranges   : ONLY_IF_RW { *(.exception_ranges .exception_ranges*) }
+  /* Thread Local Storage sections  */
+  .tdata	  : { *(.tdata .tdata.* .gnu.linkonce.td.*) }
+  .tbss		  : { *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon) }
+  .preinit_array     :
+  {
+    PROVIDE_HIDDEN (__preinit_array_start = .);
+    KEEP (*(.preinit_array))
+    PROVIDE_HIDDEN (__preinit_array_end = .);
+  }
+  .init_array     :
+  {
+    PROVIDE_HIDDEN (__init_array_start = .);
+    KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*)))
+    KEEP (*(.init_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .ctors))
+    PROVIDE_HIDDEN (__init_array_end = .);
+  }
+  .fini_array     :
+  {
+    PROVIDE_HIDDEN (__fini_array_start = .);
+    KEEP (*(SORT_BY_INIT_PRIORITY(.fini_array.*) SORT_BY_INIT_PRIORITY(.dtors.*)))
+    KEEP (*(.fini_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .dtors))
+    PROVIDE_HIDDEN (__fini_array_end = .);
+  }
+  .ctors          :
+  {
+    /* gcc uses crtbegin.o to find the start of
+       the constructors, so we make sure it is
+       first.  Because this is a wildcard, it
+       doesn't matter if the user does not
+       actually link against crtbegin.o; the
+       linker won't look for a file to match a
+       wildcard.  The wildcard also means that it
+       doesn't matter which directory crtbegin.o
+       is in.  */
+    KEEP (*crtbegin.o(.ctors))
+    KEEP (*crtbegin?.o(.ctors))
+    /* We don't want to include the .ctor section from
+       the crtend.o file until after the sorted ctors.
+       The .ctor section from the crtend file contains the
+       end of ctors marker and it must be last */
+    KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .ctors))
+    KEEP (*(SORT(.ctors.*)))
+    KEEP (*(.ctors))
+  }
+  .dtors          :
+  {
+    KEEP (*crtbegin.o(.dtors))
+    KEEP (*crtbegin?.o(.dtors))
+    KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .dtors))
+    KEEP (*(SORT(.dtors.*)))
+    KEEP (*(.dtors))
+  }
+  .jcr            : { KEEP (*(.jcr)) }
+  .data.rel.ro : { *(.data.rel.ro.local* .gnu.linkonce.d.rel.ro.local.*) *(.data.rel.ro .data.rel.ro.* .gnu.linkonce.d.rel.ro.*) }
+  .dynamic        : { *(.dynamic) }
+  .got            : { *(.got) *(.igot) }
+  . = DATA_SEGMENT_RELRO_END (SIZEOF (.got.plt) >= 24 ? 24 : 0, .);
+  .got.plt        : { *(.got.plt)  *(.igot.plt) }
+  .data           :
+  {
+    *(.data .data.* .gnu.linkonce.d.*)
+    SORT(CONSTRUCTORS)
+  }
+  .data1          : { *(.data1) }
+  _edata = .; PROVIDE (edata = .);
+  . = .;
+  __bss_start = .;
+  .bss            :
+  {
+   *(.dynbss)
+   *(.bss .bss.* .gnu.linkonce.b.*)
+   *(COMMON)
+   /* Align here to ensure that the .bss section occupies space up to
+      _end.  Align after .bss to ensure correct alignment even if the
+      .bss section disappears because there are no input sections.
+      FIXME: Why do we need it? When there is no .bss section, we don't
+      pad the .data section.  */
+   . = ALIGN(. != 0 ? 64 / 8 : 1);
+  }
+  .lbss   :
+  {
+    *(.dynlbss)
+    *(.lbss .lbss.* .gnu.linkonce.lb.*)
+    *(LARGE_COMMON)
+  }
+  . = ALIGN(64 / 8);
+  . = SEGMENT_START("ldata-segment", .);
+  .lrodata   ALIGN(CONSTANT (MAXPAGESIZE)) + (. & (CONSTANT (MAXPAGESIZE) - 1)) :
+  {
+    *(.lrodata .lrodata.* .gnu.linkonce.lr.*)
+  }
+  .ldata   ALIGN(CONSTANT (MAXPAGESIZE)) + (. & (CONSTANT (MAXPAGESIZE) - 1)) :
+  {
+    *(.ldata .ldata.* .gnu.linkonce.l.*)
+    . = ALIGN(. != 0 ? 64 / 8 : 1);
+  }
+  . = ALIGN(64 / 8);
+  _end = .; PROVIDE (end = .);
+  . = DATA_SEGMENT_END (.);
+  /* Stabs debugging sections.  */
+  .stab          0 : { *(.stab) }
+  .stabstr       0 : { *(.stabstr) }
+  .stab.excl     0 : { *(.stab.excl) }
+  .stab.exclstr  0 : { *(.stab.exclstr) }
+  .stab.index    0 : { *(.stab.index) }
+  .stab.indexstr 0 : { *(.stab.indexstr) }
+  .comment       0 : { *(.comment) }
+  /* DWARF debug sections.
+     Symbols in the DWARF debugging sections are relative to the beginning
+     of the section so we begin them at 0.  */
+  /* DWARF 1 */
+  .debug          0 : { *(.debug) }
+  .line           0 : { *(.line) }
+  /* GNU DWARF 1 extensions */
+  .debug_srcinfo  0 : { *(.debug_srcinfo) }
+  .debug_sfnames  0 : { *(.debug_sfnames) }
+  /* DWARF 1.1 and DWARF 2 */
+  .debug_aranges  0 : { *(.debug_aranges) }
+  .debug_pubnames 0 : { *(.debug_pubnames) }
+  /* DWARF 2 */
+  .debug_info     0 : { *(.debug_info .gnu.linkonce.wi.*) }
+  .debug_abbrev   0 : { *(.debug_abbrev) }
+  .debug_line     0 : { *(.debug_line .debug_line.* .debug_line_end ) }
+  .debug_frame    0 : { *(.debug_frame) }
+  .debug_str      0 : { *(.debug_str) }
+  .debug_loc      0 : { *(.debug_loc) }
+  .debug_macinfo  0 : { *(.debug_macinfo) }
+  /* SGI/MIPS DWARF 2 extensions */
+  .debug_weaknames 0 : { *(.debug_weaknames) }
+  .debug_funcnames 0 : { *(.debug_funcnames) }
+  .debug_typenames 0 : { *(.debug_typenames) }
+  .debug_varnames  0 : { *(.debug_varnames) }
+  /* DWARF 3 */
+  .debug_pubtypes 0 : { *(.debug_pubtypes) }
+  .debug_ranges   0 : { *(.debug_ranges) }
+  /* DWARF Extension.  */
+  .debug_macro    0 : { *(.debug_macro) }
+  .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) }
+  /DISCARD/ : { *(.note.GNU-stack) *(.gnu_debuglink) *(.gnu.lto_*) }
+}
+
+
+==================================================
+attempt to open /tmp/tmp.qW4sNUeRZG/kernel_hsa.o succeeded
+/tmp/tmp.qW4sNUeRZG/kernel_hsa.o
+attempt to open /tmp/tmp.qW4sNUeRZG/device_functions.cpp.host.o succeeded
+/tmp/tmp.qW4sNUeRZG/device_functions.cpp.host.o
+attempt to open /tmp/tmp.qW4sNUeRZG/device_util.cpp.host.o succeeded
+/tmp/tmp.qW4sNUeRZG/device_util.cpp.host.o
+attempt to open /tmp/tmp.qW4sNUeRZG/hip_fp16.cpp.host.o succeeded
+/tmp/tmp.qW4sNUeRZG/hip_fp16.cpp.host.o
+attempt to open /tmp/tmp.qW4sNUeRZG/hip_ldg.cpp.host.o succeeded
+/tmp/tmp.qW4sNUeRZG/hip_ldg.cpp.host.o
+attempt to open /tmp/tmp.qW4sNUeRZG/math_functions.cpp.host.o succeeded
+/tmp/tmp.qW4sNUeRZG/math_functions.cpp.host.o
+attempt to open /tmp/tmp.qW4sNUeRZG/mcwamp.cpp.host.o succeeded
+/tmp/tmp.qW4sNUeRZG/mcwamp.cpp.host.o
+attempt to open /tmp/tmp.qW4sNUeRZG/ForceTreeTest.host.o succeeded
+/tmp/tmp.qW4sNUeRZG/ForceTreeTest.host.o
+attempt to open /tmp/tmp.qW4sNUeRZG/InitialExchange.host.o succeeded
+/tmp/tmp.qW4sNUeRZG/InitialExchange.host.o
+attempt to open /tmp/tmp.qW4sNUeRZG/Message.host.o succeeded
+/tmp/tmp.qW4sNUeRZG/Message.host.o
+attempt to open /tmp/tmp.qW4sNUeRZG/ParticleDistribute.host.o succeeded
+/tmp/tmp.qW4sNUeRZG/ParticleDistribute.host.o
+attempt to open /tmp/tmp.qW4sNUeRZG/ParticleExchange.host.o succeeded
+/tmp/tmp.qW4sNUeRZG/ParticleExchange.host.o
+attempt to open /tmp/tmp.qW4sNUeRZG/BHForceTree.host.o succeeded
+/tmp/tmp.qW4sNUeRZG/BHForceTree.host.o
+attempt to open /tmp/tmp.qW4sNUeRZG/ForceLaw.host.o succeeded
+/tmp/tmp.qW4sNUeRZG/ForceLaw.host.o
+attempt to open /tmp/tmp.qW4sNUeRZG/RCBForceTree.host.o succeeded
+/tmp/tmp.qW4sNUeRZG/RCBForceTree.host.o
+attempt to open /tmp/tmp.qW4sNUeRZG/RCOForceTree.host.o succeeded
+/tmp/tmp.qW4sNUeRZG/RCOForceTree.host.o
+attempt to open /tmp/tmp.qW4sNUeRZG/Partition.host.o succeeded
+/tmp/tmp.qW4sNUeRZG/Partition.host.o
+attempt to open /tmp/tmp.qW4sNUeRZG/Timer.host.o succeeded
+/tmp/tmp.qW4sNUeRZG/Timer.host.o
+attempt to open /tmp/tmp.qW4sNUeRZG/Timings.host.o succeeded
+/tmp/tmp.qW4sNUeRZG/Timings.host.o
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/crt1.o succeeded
+/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/crt1.o
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/crti.o succeeded
+/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/crti.o
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/crtbegin.o succeeded
+/usr/lib/gcc/x86_64-linux-gnu/5.4.0/crtbegin.o
+attempt to open /opt/rocm/hip/lib/libhip_hcc.so succeeded
+/opt/rocm/hip/lib/libhip_hcc.so
+attempt to open /opt/rocm/hcc/lib/libdl.so failed
+attempt to open /opt/rocm/hcc/lib/libdl.a failed
+attempt to open /opt/rocm/hsa/lib/libdl.so failed
+attempt to open /opt/rocm/hsa/lib/libdl.a failed
+attempt to open /opt/rocm/lib/libdl.so failed
+attempt to open /opt/rocm/lib/libdl.a failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libdl.so failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libdl.a failed
+attempt to open /usr/lib/openmpi/lib/libdl.so failed
+attempt to open /usr/lib/openmpi/lib/libdl.a failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/libdl.so failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/libdl.a failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/libdl.so succeeded
+-ldl (/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/libdl.so)
+attempt to open /opt/rocm/hcc/lib/libm.so failed
+attempt to open /opt/rocm/hcc/lib/libm.a failed
+attempt to open /opt/rocm/hsa/lib/libm.so failed
+attempt to open /opt/rocm/hsa/lib/libm.a failed
+attempt to open /opt/rocm/lib/libm.so failed
+attempt to open /opt/rocm/lib/libm.a failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libm.so failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libm.a failed
+attempt to open /usr/lib/openmpi/lib/libm.so failed
+attempt to open /usr/lib/openmpi/lib/libm.a failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/libm.so failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/libm.a failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/libm.so succeeded
+opened script file /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/libm.so
+opened script file /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/libm.so
+attempt to open /lib/x86_64-linux-gnu/libm.so.6 succeeded
+/lib/x86_64-linux-gnu/libm.so.6
+attempt to open /usr/lib/x86_64-linux-gnu/libmvec_nonshared.a succeeded
+attempt to open /lib/x86_64-linux-gnu/libmvec.so.1 succeeded
+/lib/x86_64-linux-gnu/libmvec.so.1
+/lib/x86_64-linux-gnu/libmvec.so.1
+attempt to open /opt/rocm/hcc/lib/libpthread.so failed
+attempt to open /opt/rocm/hcc/lib/libpthread.a failed
+attempt to open /opt/rocm/hsa/lib/libpthread.so failed
+attempt to open /opt/rocm/hsa/lib/libpthread.a failed
+attempt to open /opt/rocm/lib/libpthread.so failed
+attempt to open /opt/rocm/lib/libpthread.a failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libpthread.so failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libpthread.a failed
+attempt to open /usr/lib/openmpi/lib/libpthread.so failed
+attempt to open /usr/lib/openmpi/lib/libpthread.a failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/libpthread.so failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/libpthread.a failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/libpthread.so succeeded
+opened script file /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/libpthread.so
+opened script file /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/libpthread.so
+attempt to open /lib/x86_64-linux-gnu/libpthread.so.0 succeeded
+/lib/x86_64-linux-gnu/libpthread.so.0
+attempt to open /usr/lib/x86_64-linux-gnu/libpthread_nonshared.a succeeded
+attempt to open /opt/rocm/hcc/lib/libunwind.so failed
+attempt to open /opt/rocm/hcc/lib/libunwind.a failed
+attempt to open /opt/rocm/hsa/lib/libunwind.so failed
+attempt to open /opt/rocm/hsa/lib/libunwind.a failed
+attempt to open /opt/rocm/lib/libunwind.so failed
+attempt to open /opt/rocm/lib/libunwind.a failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libunwind.so failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libunwind.a failed
+attempt to open /usr/lib/openmpi/lib/libunwind.so failed
+attempt to open /usr/lib/openmpi/lib/libunwind.a failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/libunwind.so failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/libunwind.a failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/libunwind.so succeeded
+-lunwind (/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/libunwind.so)
+attempt to open /opt/rocm/hcc/lib/libsupc++.so failed
+attempt to open /opt/rocm/hcc/lib/libsupc++.a failed
+attempt to open /opt/rocm/hsa/lib/libsupc++.so failed
+attempt to open /opt/rocm/hsa/lib/libsupc++.a failed
+attempt to open /opt/rocm/lib/libsupc++.so failed
+attempt to open /opt/rocm/lib/libsupc++.a failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libsupc++.so failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libsupc++.a failed
+attempt to open /usr/lib/openmpi/lib/libsupc++.so failed
+attempt to open /usr/lib/openmpi/lib/libsupc++.a failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/libsupc++.so failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/libsupc++.a succeeded
+(/usr/lib/gcc/x86_64-linux-gnu/5.4.0/libsupc++.a)class_type_info.o
+(/usr/lib/gcc/x86_64-linux-gnu/5.4.0/libsupc++.a)del_op.o
+(/usr/lib/gcc/x86_64-linux-gnu/5.4.0/libsupc++.a)del_opv.o
+(/usr/lib/gcc/x86_64-linux-gnu/5.4.0/libsupc++.a)eh_alloc.o
+(/usr/lib/gcc/x86_64-linux-gnu/5.4.0/libsupc++.a)eh_catch.o
+(/usr/lib/gcc/x86_64-linux-gnu/5.4.0/libsupc++.a)eh_exception.o
+(/usr/lib/gcc/x86_64-linux-gnu/5.4.0/libsupc++.a)eh_globals.o
+(/usr/lib/gcc/x86_64-linux-gnu/5.4.0/libsupc++.a)eh_personality.o
+(/usr/lib/gcc/x86_64-linux-gnu/5.4.0/libsupc++.a)eh_terminate.o
+(/usr/lib/gcc/x86_64-linux-gnu/5.4.0/libsupc++.a)eh_throw.o
+(/usr/lib/gcc/x86_64-linux-gnu/5.4.0/libsupc++.a)eh_unex_handler.o
+(/usr/lib/gcc/x86_64-linux-gnu/5.4.0/libsupc++.a)guard.o
+(/usr/lib/gcc/x86_64-linux-gnu/5.4.0/libsupc++.a)guard_error.o
+(/usr/lib/gcc/x86_64-linux-gnu/5.4.0/libsupc++.a)new_op.o
+(/usr/lib/gcc/x86_64-linux-gnu/5.4.0/libsupc++.a)new_opv.o
+(/usr/lib/gcc/x86_64-linux-gnu/5.4.0/libsupc++.a)pure.o
+(/usr/lib/gcc/x86_64-linux-gnu/5.4.0/libsupc++.a)si_class_type_info.o
+(/usr/lib/gcc/x86_64-linux-gnu/5.4.0/libsupc++.a)tinfo.o
+(/usr/lib/gcc/x86_64-linux-gnu/5.4.0/libsupc++.a)cp-demangle.o
+(/usr/lib/gcc/x86_64-linux-gnu/5.4.0/libsupc++.a)bad_alloc.o
+(/usr/lib/gcc/x86_64-linux-gnu/5.4.0/libsupc++.a)eh_call.o
+(/usr/lib/gcc/x86_64-linux-gnu/5.4.0/libsupc++.a)eh_term_handler.o
+(/usr/lib/gcc/x86_64-linux-gnu/5.4.0/libsupc++.a)new_handler.o
+(/usr/lib/gcc/x86_64-linux-gnu/5.4.0/libsupc++.a)vterminate.o
+(/usr/lib/gcc/x86_64-linux-gnu/5.4.0/libsupc++.a)eh_type.o
+attempt to open /opt/rocm/hcc/lib/libhsa-runtime64.so failed
+attempt to open /opt/rocm/hcc/lib/libhsa-runtime64.a failed
+attempt to open /opt/rocm/hsa/lib/libhsa-runtime64.so failed
+attempt to open /opt/rocm/hsa/lib/libhsa-runtime64.a failed
+attempt to open /opt/rocm/lib/libhsa-runtime64.so succeeded
+-lhsa-runtime64 (/opt/rocm/lib/libhsa-runtime64.so)
+attempt to open /opt/rocm/hcc/lib/libhc_am.so succeeded
+-lhc_am (/opt/rocm/hcc/lib/libhc_am.so)
+attempt to open /opt/rocm/hcc/lib/libhsakmt.so failed
+attempt to open /opt/rocm/hcc/lib/libhsakmt.a failed
+attempt to open /opt/rocm/hsa/lib/libhsakmt.so failed
+attempt to open /opt/rocm/hsa/lib/libhsakmt.a failed
+attempt to open /opt/rocm/lib/libhsakmt.so succeeded
+-lhsakmt (/opt/rocm/lib/libhsakmt.so)
+attempt to open /opt/rocm/hcc/lib/libCXLActivityLogger.so failed
+attempt to open /opt/rocm/hcc/lib/libCXLActivityLogger.a failed
+attempt to open /opt/rocm/hsa/lib/libCXLActivityLogger.so failed
+attempt to open /opt/rocm/hsa/lib/libCXLActivityLogger.a failed
+attempt to open /opt/rocm/lib/libCXLActivityLogger.so succeeded
+-lCXLActivityLogger (/opt/rocm/lib/libCXLActivityLogger.so)
+attempt to open /opt/rocm/hcc/lib/libm.so failed
+attempt to open /opt/rocm/hcc/lib/libm.a failed
+attempt to open /opt/rocm/hsa/lib/libm.so failed
+attempt to open /opt/rocm/hsa/lib/libm.a failed
+attempt to open /opt/rocm/lib/libm.so failed
+attempt to open /opt/rocm/lib/libm.a failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libm.so failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libm.a failed
+attempt to open /usr/lib/openmpi/lib/libm.so failed
+attempt to open /usr/lib/openmpi/lib/libm.a failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/libm.so failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/libm.a failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/libm.so succeeded
+opened script file /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/libm.so
+opened script file /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/libm.so
+attempt to open /lib/x86_64-linux-gnu/libm.so.6 succeeded
+/lib/x86_64-linux-gnu/libm.so.6
+attempt to open /usr/lib/x86_64-linux-gnu/libmvec_nonshared.a succeeded
+attempt to open /lib/x86_64-linux-gnu/libmvec.so.1 succeeded
+/lib/x86_64-linux-gnu/libmvec.so.1
+attempt to open /tmp/tmp.qW4sNUeRZG/libBHForceTree.a/BGQCM.o succeeded
+/tmp/tmp.qW4sNUeRZG/libBHForceTree.a/BGQCM.o
+attempt to open /tmp/tmp.qW4sNUeRZG/libBHForceTree.a/BGQStep16.o succeeded
+/tmp/tmp.qW4sNUeRZG/libBHForceTree.a/BGQStep16.o
+attempt to open hip/libbigchunk.a succeeded
+(hip/libbigchunk.a)bigchunk.o
+attempt to open hip/dims-local.o succeeded
+hip/dims-local.o
+attempt to open /opt/rocm/hcc/lib/libm.so failed
+attempt to open /opt/rocm/hcc/lib/libm.a failed
+attempt to open /opt/rocm/hsa/lib/libm.so failed
+attempt to open /opt/rocm/hsa/lib/libm.a failed
+attempt to open /opt/rocm/lib/libm.so failed
+attempt to open /opt/rocm/lib/libm.a failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libm.so failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libm.a failed
+attempt to open /usr/lib/openmpi/lib/libm.so failed
+attempt to open /usr/lib/openmpi/lib/libm.a failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/libm.so failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/libm.a failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/libm.so succeeded
+opened script file /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/libm.so
+opened script file /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/libm.so
+attempt to open /lib/x86_64-linux-gnu/libm.so.6 succeeded
+/lib/x86_64-linux-gnu/libm.so.6
+attempt to open /usr/lib/x86_64-linux-gnu/libmvec_nonshared.a succeeded
+attempt to open /lib/x86_64-linux-gnu/libmvec.so.1 succeeded
+/lib/x86_64-linux-gnu/libmvec.so.1
+attempt to open /opt/rocm/hcc/lib/librt.so failed
+attempt to open /opt/rocm/hcc/lib/librt.a failed
+attempt to open /opt/rocm/hsa/lib/librt.so failed
+attempt to open /opt/rocm/hsa/lib/librt.a failed
+attempt to open /opt/rocm/lib/librt.so failed
+attempt to open /opt/rocm/lib/librt.a failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/librt.so failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/librt.a failed
+attempt to open /usr/lib/openmpi/lib/librt.so failed
+attempt to open /usr/lib/openmpi/lib/librt.a failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/librt.so failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/librt.a failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/librt.so succeeded
+-lrt (/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/librt.so)
+attempt to open /opt/rocm/hcc/lib/libmpi.so failed
+attempt to open /opt/rocm/hcc/lib/libmpi.a failed
+attempt to open /opt/rocm/hsa/lib/libmpi.so failed
+attempt to open /opt/rocm/hsa/lib/libmpi.a failed
+attempt to open /opt/rocm/lib/libmpi.so failed
+attempt to open /opt/rocm/lib/libmpi.a failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libmpi.so failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libmpi.a failed
+attempt to open /usr/lib/openmpi/lib/libmpi.so succeeded
+-lmpi (/usr/lib/openmpi/lib/libmpi.so)
+attempt to open /opt/rocm/hcc/lib/libmpi_cxx.so failed
+attempt to open /opt/rocm/hcc/lib/libmpi_cxx.a failed
+attempt to open /opt/rocm/hsa/lib/libmpi_cxx.so failed
+attempt to open /opt/rocm/hsa/lib/libmpi_cxx.a failed
+attempt to open /opt/rocm/lib/libmpi_cxx.so failed
+attempt to open /opt/rocm/lib/libmpi_cxx.a failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libmpi_cxx.so failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libmpi_cxx.a failed
+attempt to open /usr/lib/openmpi/lib/libmpi_cxx.so succeeded
+-lmpi_cxx (/usr/lib/openmpi/lib/libmpi_cxx.so)
+attempt to open /opt/rocm/hcc/lib/libstdc++.so failed
+attempt to open /opt/rocm/hcc/lib/libstdc++.a failed
+attempt to open /opt/rocm/hsa/lib/libstdc++.so failed
+attempt to open /opt/rocm/hsa/lib/libstdc++.a failed
+attempt to open /opt/rocm/lib/libstdc++.so failed
+attempt to open /opt/rocm/lib/libstdc++.a failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libstdc++.so failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libstdc++.a failed
+attempt to open /usr/lib/openmpi/lib/libstdc++.so failed
+attempt to open /usr/lib/openmpi/lib/libstdc++.a failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/libstdc++.so succeeded
+-lstdc++ (/usr/lib/gcc/x86_64-linux-gnu/5.4.0/libstdc++.so)
+attempt to open /opt/rocm/hcc/lib/libm.so failed
+attempt to open /opt/rocm/hcc/lib/libm.a failed
+attempt to open /opt/rocm/hsa/lib/libm.so failed
+attempt to open /opt/rocm/hsa/lib/libm.a failed
+attempt to open /opt/rocm/lib/libm.so failed
+attempt to open /opt/rocm/lib/libm.a failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libm.so failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libm.a failed
+attempt to open /usr/lib/openmpi/lib/libm.so failed
+attempt to open /usr/lib/openmpi/lib/libm.a failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/libm.so failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/libm.a failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/libm.so succeeded
+opened script file /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/libm.so
+opened script file /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/libm.so
+attempt to open /lib/x86_64-linux-gnu/libm.so.6 succeeded
+/lib/x86_64-linux-gnu/libm.so.6
+attempt to open /usr/lib/x86_64-linux-gnu/libmvec_nonshared.a succeeded
+attempt to open /lib/x86_64-linux-gnu/libmvec.so.1 succeeded
+/lib/x86_64-linux-gnu/libmvec.so.1
+attempt to open /opt/rocm/hcc/lib/libomp.so failed
+attempt to open /opt/rocm/hcc/lib/libomp.a failed
+attempt to open /opt/rocm/hsa/lib/libomp.so failed
+attempt to open /opt/rocm/hsa/lib/libomp.a failed
+attempt to open /opt/rocm/lib/libomp.so failed
+attempt to open /opt/rocm/lib/libomp.a failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libomp.so failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libomp.a failed
+attempt to open /usr/lib/openmpi/lib/libomp.so failed
+attempt to open /usr/lib/openmpi/lib/libomp.a failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/libomp.so failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/libomp.a failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/libomp.so succeeded
+-lomp (/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/libomp.so)
+attempt to open /opt/rocm/hcc/lib/libgcc_s.so failed
+attempt to open /opt/rocm/hcc/lib/libgcc_s.a failed
+attempt to open /opt/rocm/hsa/lib/libgcc_s.so failed
+attempt to open /opt/rocm/hsa/lib/libgcc_s.a failed
+attempt to open /opt/rocm/lib/libgcc_s.so failed
+attempt to open /opt/rocm/lib/libgcc_s.a failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libgcc_s.so failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libgcc_s.a failed
+attempt to open /usr/lib/openmpi/lib/libgcc_s.so failed
+attempt to open /usr/lib/openmpi/lib/libgcc_s.a failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/libgcc_s.so succeeded
+-lgcc_s (/usr/lib/gcc/x86_64-linux-gnu/5.4.0/libgcc_s.so)
+attempt to open /opt/rocm/hcc/lib/libgcc.so failed
+attempt to open /opt/rocm/hcc/lib/libgcc.a failed
+attempt to open /opt/rocm/hsa/lib/libgcc.so failed
+attempt to open /opt/rocm/hsa/lib/libgcc.a failed
+attempt to open /opt/rocm/lib/libgcc.so failed
+attempt to open /opt/rocm/lib/libgcc.a failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libgcc.so failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libgcc.a failed
+attempt to open /usr/lib/openmpi/lib/libgcc.so failed
+attempt to open /usr/lib/openmpi/lib/libgcc.a failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/libgcc.so failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/libgcc.a succeeded
+attempt to open /opt/rocm/hcc/lib/libpthread.so failed
+attempt to open /opt/rocm/hcc/lib/libpthread.a failed
+attempt to open /opt/rocm/hsa/lib/libpthread.so failed
+attempt to open /opt/rocm/hsa/lib/libpthread.a failed
+attempt to open /opt/rocm/lib/libpthread.so failed
+attempt to open /opt/rocm/lib/libpthread.a failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libpthread.so failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libpthread.a failed
+attempt to open /usr/lib/openmpi/lib/libpthread.so failed
+attempt to open /usr/lib/openmpi/lib/libpthread.a failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/libpthread.so failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/libpthread.a failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/libpthread.so succeeded
+opened script file /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/libpthread.so
+opened script file /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/libpthread.so
+attempt to open /lib/x86_64-linux-gnu/libpthread.so.0 succeeded
+/lib/x86_64-linux-gnu/libpthread.so.0
+attempt to open /usr/lib/x86_64-linux-gnu/libpthread_nonshared.a succeeded
+attempt to open /opt/rocm/hcc/lib/libc.so failed
+attempt to open /opt/rocm/hcc/lib/libc.a failed
+attempt to open /opt/rocm/hsa/lib/libc.so failed
+attempt to open /opt/rocm/hsa/lib/libc.a failed
+attempt to open /opt/rocm/lib/libc.so failed
+attempt to open /opt/rocm/lib/libc.a failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libc.so failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libc.a failed
+attempt to open /usr/lib/openmpi/lib/libc.so failed
+attempt to open /usr/lib/openmpi/lib/libc.a failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/libc.so failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/libc.a failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/libc.so succeeded
+opened script file /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/libc.so
+opened script file /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/libc.so
+attempt to open /lib/x86_64-linux-gnu/libc.so.6 succeeded
+/lib/x86_64-linux-gnu/libc.so.6
+attempt to open /usr/lib/x86_64-linux-gnu/libc_nonshared.a succeeded
+(/usr/lib/x86_64-linux-gnu/libc_nonshared.a)elf-init.oS
+attempt to open /lib/x86_64-linux-gnu/ld-linux-x86-64.so.2 succeeded
+/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2
+attempt to open /opt/rocm/hcc/lib/libgcc_s.so failed
+attempt to open /opt/rocm/hcc/lib/libgcc_s.a failed
+attempt to open /opt/rocm/hsa/lib/libgcc_s.so failed
+attempt to open /opt/rocm/hsa/lib/libgcc_s.a failed
+attempt to open /opt/rocm/lib/libgcc_s.so failed
+attempt to open /opt/rocm/lib/libgcc_s.a failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libgcc_s.so failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libgcc_s.a failed
+attempt to open /usr/lib/openmpi/lib/libgcc_s.so failed
+attempt to open /usr/lib/openmpi/lib/libgcc_s.a failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/libgcc_s.so succeeded
+-lgcc_s (/usr/lib/gcc/x86_64-linux-gnu/5.4.0/libgcc_s.so)
+attempt to open /opt/rocm/hcc/lib/libgcc.so failed
+attempt to open /opt/rocm/hcc/lib/libgcc.a failed
+attempt to open /opt/rocm/hsa/lib/libgcc.so failed
+attempt to open /opt/rocm/hsa/lib/libgcc.a failed
+attempt to open /opt/rocm/lib/libgcc.so failed
+attempt to open /opt/rocm/lib/libgcc.a failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libgcc.so failed
+attempt to open /opt/rocm/profiler/CXLActivityLogger/bin/x86_64/libgcc.a failed
+attempt to open /usr/lib/openmpi/lib/libgcc.so failed
+attempt to open /usr/lib/openmpi/lib/libgcc.a failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/libgcc.so failed
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/libgcc.a succeeded
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/crtend.o succeeded
+/usr/lib/gcc/x86_64-linux-gnu/5.4.0/crtend.o
+attempt to open /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/crtn.o succeeded
+/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/crtn.o
+attempt to open /opt/rocm/hcc/lib/libclang_rt.builtins-x86_64.so failed
+attempt to open /opt/rocm/hcc/lib/libclang_rt.builtins-x86_64.a succeeded
+liblzma.so.5 needed by /usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../x86_64-linux-gnu/libunwind.so
+found liblzma.so.5 at //lib/x86_64-linux-gnu/liblzma.so.5
+libelf.so.1 needed by /opt/rocm/lib/libhsa-runtime64.so
+found libelf.so.1 at /usr/lib/x86_64-linux-gnu/libelf.so.1
+libpci.so.3 needed by /opt/rocm/lib/libhsakmt.so
+found libpci.so.3 at //lib/x86_64-linux-gnu/libpci.so.3
+libibverbs.so.1 needed by /usr/lib/openmpi/lib/libmpi.so
+found libibverbs.so.1 at //usr/lib/libibverbs.so.1
+libopen-rte.so.12 needed by /usr/lib/openmpi/lib/libmpi.so
+found libopen-rte.so.12 at //usr/lib/libopen-rte.so.12
+libopen-pal.so.13 needed by /usr/lib/openmpi/lib/libmpi.so
+found libopen-pal.so.13 at //usr/lib/libopen-pal.so.13
+libz.so.1 needed by /usr/lib/x86_64-linux-gnu/libelf.so.1
+found libz.so.1 at //lib/x86_64-linux-gnu/libz.so.1
+libresolv.so.2 needed by //lib/x86_64-linux-gnu/libpci.so.3
+found libresolv.so.2 at //lib/x86_64-linux-gnu/libresolv.so.2
+libudev.so.1 needed by //lib/x86_64-linux-gnu/libpci.so.3
+found libudev.so.1 at //lib/x86_64-linux-gnu/libudev.so.1
+libhwloc.so.5 needed by //usr/lib/libopen-rte.so.12
+found libhwloc.so.5 at /usr/lib/x86_64-linux-gnu/libhwloc.so.5
+libutil.so.1 needed by //usr/lib/libopen-pal.so.13
+found libutil.so.1 at //lib/x86_64-linux-gnu/libutil.so.1
+libnuma.so.1 needed by /usr/lib/x86_64-linux-gnu/libhwloc.so.5
+found libnuma.so.1 at /usr/lib/x86_64-linux-gnu/libnuma.so.1
+libltdl.so.7 needed by /usr/lib/x86_64-linux-gnu/libhwloc.so.5
+found libltdl.so.7 at /usr/lib/x86_64-linux-gnu/libltdl.so.7
+rm hip/Timings.o hip/RCOForceTree.o hip/ParticleExchange.o hip/InitialExchange.o hip/Partition.o hip/bigchunk.o hip/ParticleDistribute.o hip/BGQCM.o BGQStep16.o hip/BHForceTree.o hip/RCBForceTree.o hip/Message.o hip/BGQStep16.o hip/Timer.o BGQCM.o
diff --git a/src/halo-finder/src/rru_mpi.h b/src/halo-finder/src/rru_mpi.h
new file mode 100644
index 0000000..9ab0e13
--- /dev/null
+++ b/src/halo-finder/src/rru_mpi.h
@@ -0,0 +1,16 @@
+#ifndef RRU_MPI_H
+#define RRU_MPI_H
+
+//#ifndef USE_VTK_COSMO
+// Needed for some versions of MPI which define these
+//#undef SEEK_SET
+//#undef SEEK_CUR
+//#undef SEEK_END
+//#endif
+
+#ifndef MPICH_IGNORE_CXX_SEEK
+#define MPICH_IGNORE_CXX_SEEK
+#endif
+#include <mpi.h>
+
+#endif
diff --git a/src/halo-finder/src/winDirent.h b/src/halo-finder/src/winDirent.h
new file mode 100644
index 0000000..c36bc42
--- /dev/null
+++ b/src/halo-finder/src/winDirent.h
@@ -0,0 +1,232 @@
+/*****************************************************************************
+ * dirent.h - dirent API for Microsoft Visual Studio
+ *
+ * Copyright (C) 2006 Toni Ronkko
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * ``Software''), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL TONI RONKKO BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Dec 15, 2009, John Cunningham
+ * Added rewinddir member function
+ *
+ * Jan 18, 2008, Toni Ronkko
+ * Using FindFirstFileA and WIN32_FIND_DATAA to avoid converting string
+ * between multi-byte and unicode representations.  This makes the
+ * code simpler and also allows the code to be compiled under MingW.  Thanks
+ * to Azriel Fasten for the suggestion.
+ *
+ * Mar 4, 2007, Toni Ronkko
+ * Bug fix: due to the strncpy_s() function this file only compiled in
+ * Visual Studio 2005.  Using the new string functions only when the
+ * compiler version allows.
+ *
+ * Nov  2, 2006, Toni Ronkko
+ * Major update: removed support for Watcom C, MS-DOS and Turbo C to
+ * simplify the file, updated the code to compile cleanly on Visual
+ * Studio 2005 with both unicode and multi-byte character strings,
+ * removed rewinddir() as it had a bug.
+ *
+ * Aug 20, 2006, Toni Ronkko
+ * Removed all remarks about MSVC 1.0, which is antiquated now.  Simplified
+ * comments by removing SGML tags.
+ *
+ * May 14 2002, Toni Ronkko
+ * Embedded the function definitions directly to the header so that no
+ * source modules need to be included in the Visual Studio project.  Removed
+ * all the dependencies to other projects so that this very header can be
+ * used independently.
+ *
+ * May 28 1998, Toni Ronkko
+ * First version.
+ *****************************************************************************/
+#ifndef __WINDOWS_DIRENT_H
+#define __WINDOWS_DIRENT_H
+
+#include <windows.h>
+#include <string.h>
+#include <assert.h>
+
+typedef struct dirent          /* one directory entry, as returned by readdir() */
+{
+   char d_name[MAX_PATH + 1]; /* NUL-terminated entry name, copied from data.cFileName */
+   WIN32_FIND_DATAA data;     /* raw Win32 find record filled by FindFirstFileA/FindNextFileA */
+}  dirent;
+
+
+typedef struct DIR             /* directory stream state used by opendir/readdir/closedir */
+{
+   dirent current;            /* entry most recently read; readdir() returns its address */
+   int    cached;             /* non-zero: current.data holds an entry not yet returned */
+   HANDLE search_handle;      /* Win32 search handle; INVALID_HANDLE_VALUE when closed */
+   char   patt[MAX_PATH + 3]; /* search pattern: dir name plus room for "\\*" and NUL */
+} DIR;
+
+
+/* Forward declarations; every function is static and defined in this header */
+static DIR *opendir (const char *dirname);
+static struct dirent *readdir (DIR *dirp);
+static int closedir (DIR *dirp);
+
+#ifndef USE_VTK_COSMO /* rewinddir() is compiled only without USE_VTK_COSMO */
+static void rewinddir(DIR* dirp);
+#endif
+
+/* Bounded copy: strncpy_s with _TRUNCATE on Visual Studio 2005 and newer */
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+# define STRNCPY(dest,src,size) strncpy_s((dest),(size),(src),_TRUNCATE)
+#else /* strncpy() adds no NUL when src fills the buffer; callers add it */
+# define STRNCPY(dest,src,size) strncpy((dest),(src),(size))
+#endif
+
+
+/*****************************************************************************
+ * Open a directory stream for DIRNAME.  On success the returned DIR is
+ * heap-allocated and already holds the first entry found by FindFirstFileA();
+ * NULL is returned when the allocation or the initial search fails.
+ */
+static DIR *opendir(const char *dirname)
+{
+   DIR *dir;
+   size_t len;
+
+   assert (dirname != NULL);
+   assert (strlen (dirname) < MAX_PATH);
+
+   /* allocate the stream object; nothing to clean up if this fails */
+   dir = (DIR*) malloc (sizeof (struct DIR));
+   if (dir == NULL) {
+      return NULL;
+   }
+
+   /* copy the directory name, cut off at MAX_PATH characters */
+   STRNCPY (dir->patt, dirname, sizeof(dir->patt));
+   dir->patt[MAX_PATH] = '\0';
+
+   /* add '\\' unless the name is empty or ends in '\\' or ':', then "*" */
+   len = strlen (dir->patt);
+   if (len > 0  &&  dir->patt[len - 1] != '\\'  &&  dir->patt[len - 1] != ':') {
+      dir->patt[len++] = '\\';
+   }
+   dir->patt[len++] = '*';
+   dir->patt[len] = '\0';
+
+   /* start the search; the first match lands in dir->current.data */
+   dir->search_handle = FindFirstFileA (dir->patt, &dir->current.data);
+   if (dir->search_handle == INVALID_HANDLE_VALUE) {
+      free (dir);
+      return NULL;
+   }
+
+   /* the first entry is already in memory, so readdir() must not advance */
+   dir->cached = 1;
+   return dir;
+}
+
+
+/*****************************************************************************
+ * Return the next entry of the directory stream DIRP, or NULL once the
+ * stream is exhausted, FindNextFileA() fails, or the stream has no valid
+ * search handle.  The name of the entry is available in the d_name field.
+ * Besides regular files and sub-directories, "." and ".." as well as
+ * volume labels, hidden files and system files may be returned.
+ */
+static struct dirent *readdir(DIR *dirp)
+{
+   struct dirent *entry;
+
+   assert (dirp != NULL);
+
+   /* no live search: the stream ended earlier or could not be (re)opened */
+   if (dirp->search_handle == INVALID_HANDLE_VALUE) {
+      return NULL;
+   }
+
+   if (dirp->cached == 0) {
+      /* nothing buffered, so fetch the next entry from the search */
+      if (FindNextFileA (dirp->search_handle, &dirp->current.data) == FALSE) {
+         /* end of directory or an error: shut the search down for good */
+         FindClose (dirp->search_handle);
+         dirp->search_handle = INVALID_HANDLE_VALUE;
+         return NULL;
+      }
+   } else {
+      /* hand out the entry buffered by opendir()/rewinddir() exactly once */
+      dirp->cached = 0;
+   }
+
+   /* publish the file name as a NUL-terminated multi-byte string */
+   entry = &dirp->current;
+   STRNCPY (entry->d_name, entry->data.cFileName, sizeof(entry->d_name));
+   entry->d_name[MAX_PATH] = '\0';
+
+   return entry;
+}
+
+/*****************************************************************************
+ * Close the stream DIRP returned by opendir() and free it; DIRP and every
+ * dirent obtained from it are invalid afterwards.  Always returns 0.
+ */
+static int closedir(DIR *dirp)
+{
+   HANDLE search;
+
+   assert (dirp != NULL);
+
+   /* detach the search handle, then close it unless it was closed already */
+   search = dirp->search_handle;
+   dirp->search_handle = INVALID_HANDLE_VALUE;
+   if (search != INVALID_HANDLE_VALUE) {
+      FindClose (search);
+   }
+   free (dirp);
+   return 0;
+}
+
+#ifndef USE_VTK_COSMO
+
+/*****************************************************************************
+ * Reset the directory stream DIRP to the beginning of the directory by
+ * closing the current search and starting a new one with the pattern saved
+ * by opendir(), so the stream reflects the directory's current contents.
+ * If the new search cannot be started, DIRP stays allocated with no search
+ * handle: readdir() then returns NULL and closedir() must still be called.
+ */
+static void rewinddir(DIR* dirp)
+{
+   assert (dirp != NULL);
+
+   /* release search handle */
+   if (dirp->search_handle != INVALID_HANDLE_VALUE) {
+      FindClose (dirp->search_handle);
+      dirp->search_handle = INVALID_HANDLE_VALUE;
+   }
+
+   /* open new search handle and retrieve first file */
+   dirp->search_handle = FindFirstFileA (dirp->patt, &dirp->current.data);
+
+   /* Do NOT free dirp on failure: this function returns void, so the caller
+    * cannot learn that the stream is gone and would go on to use and
+    * closedir() a dangling pointer.  An invalid handle is enough, because
+    * readdir() checks the handle before it looks at the cached flag. */
+   dirp->cached = (dirp->search_handle != INVALID_HANDLE_VALUE) ? 1 : 0;
+}
+
+#endif
+
+#endif /*__WINDOWS_DIRENT_H*/