resources: Add lulesh benchmark

This commit adds the lulesh hydrodynamics proxy app that has been
ported for HIP, allowing it to be ran on the gem5 GPU model

Change-Id: I0f1fb400b6645e934da7ccd8a53da7afa2b09c0b
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5-resources/+/40215
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Reviewed-by: Bobby R. Bruce <bbruce@ucdavis.edu>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Bobby R. Bruce <bbruce@ucdavis.edu>
Tested-by: Bobby R. Bruce <bbruce@ucdavis.edu>
diff --git a/README.md b/README.md
index 9cd8bbe..756e6ae 100644
--- a/README.md
+++ b/README.md
@@ -486,6 +486,44 @@
 
 <http://dist.gem5.org/dist/develop/test-progs/heterosync/gcn3/allSyncPrims-1kernel>
 
+# Resource: lulesh
+
+[lulesh](https://computing.llnl.gov/projects/co-design/lulesh) is a DOE proxy
+application that is used as an example of hydrodynamics modeling. The version
+provided is for use with the gpu-compute model of gem5.
+
+## Compilation and Running
+```
+cd src/lulesh
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make
+```
+
+By default, the makefile builds for gfx801, and is placed in the `src/lulesh/bin` folder.
+
+lulesh is a GPU application, which requires that gem5 is built with the GCN3_X86 architecture.
+To build GCN3_X86:
+
+```
+# Working directory is your gem5 directory
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu scons -sQ -j$(nproc) build/GCN3_X86/gem5.opt
+```
+
+The following command shows how to run lulesh
+
+Note: lulesh has two optional command-line arguments, to specify the stop time and number
+of iterations. To set the arguments, add `--options="<stop_time> <num_iters>`
+to the run command. The default arguments are equivalent to `--options="1.0e-2 10"`
+
+
+```
+# Assuming gem5 and gem5-resources are in your working directory
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n2 --mem-size=8GB --benchmark-root=gem5-resources/src/lulesh/bin -clulesh
+```
+
+## Pre-built binary
+
+<http://dist.gem5.org/dist/develop/test-progs/lulesh/lulesh>
+
 # Resource: SPEC 2006
 
 The [Standard Performance Evaluation Corporation](
@@ -621,6 +659,7 @@
 * **hip-samples**: Consult individual copyright notices of the source file in
 'src/hip-samples/src'
 * **heterosync**: Consult `src/heterosync/LICENSE.txt`
+* **lulesh**: Consult the copyright notice in `src/lulesh/src/lulesh.hip.cc`
 * **spec 2006**: SPEC CPU 2006 requires purchase of benchmark suite from
 [SPEC](https://www.spec.org/cpu2006/) thus, it cannot be freely distributed.
 Consult individual copyright notices of source files in `src/spec-2006`.
diff --git a/src/lulesh/.gitignore b/src/lulesh/.gitignore
new file mode 100644
index 0000000..e660fd9
--- /dev/null
+++ b/src/lulesh/.gitignore
@@ -0,0 +1 @@
+bin/
diff --git a/src/lulesh/Makefile b/src/lulesh/Makefile
new file mode 100755
index 0000000..64bdc4f
--- /dev/null
+++ b/src/lulesh/Makefile
@@ -0,0 +1,10 @@
+BIN_DIR?= ./bin
+
+all: $(BIN_DIR)
+	hipcc src/lulesh.hip.cc -o $(BIN_DIR)/lulesh --amdgpu-target=gfx801
+
+$(BIN_DIR):
+	mkdir -p $(BIN_DIR)
+
+clean:
+	rm -rf $(BIN_DIR)
diff --git a/src/lulesh/src/lulesh.hip.cc b/src/lulesh/src/lulesh.hip.cc
new file mode 100644
index 0000000..2026f6c
--- /dev/null
+++ b/src/lulesh/src/lulesh.hip.cc
@@ -0,0 +1,5668 @@
+/*
+
+                 Copyright (c) 2010.
+      Lawrence Livermore National Security, LLC.
+Produced at the Lawrence Livermore National Laboratory.
+                  LLNL-CODE-461231
+                All rights reserved.
+
+This file is part of LULESH, Version 1.0.
+Please also read this link -- http://www.opensource.org/licenses/index.php
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+* Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the disclaimer below.
+
+         * Redistributions in binary form must reproduce the above copyright
+     notice, this list of conditions and the disclaimer (as noted below)
+     in the documentation and/or other materials provided with the
+     distribution.
+
+         * Neither the name of the LLNS/LLNL nor the names of its contributors
+     may be used to endorse or promote products derived from this software
+     without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC,
+THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+Additional BSD Notice
+
+1. This notice is required to be provided under our contract with the U.S.
+   Department of Energy (DOE). This work was produced at Lawrence Livermore
+   National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE.
+
+2. Neither the United States Government nor Lawrence Livermore National
+   Security, LLC nor any of their employees, makes any warranty, express
+   or implied, or assumes any liability or responsibility for the accuracy,
+   completeness, or usefulness of any information, apparatus, product, or
+   process disclosed, or represents that its use would not infringe
+   privately-owned rights.
+
+3. Also, reference herein to any specific commercial products, process, or
+   services by trade name, trademark, manufacturer or otherwise does not
+   necessarily constitute or imply its endorsement, recommendation, or
+   favoring by the United States Government or Lawrence Livermore National
+   Security, LLC. The views and opinions of authors expressed herein do not
+   necessarily state or reflect those of the United States Government or
+   Lawrence Livermore National Security, LLC, and shall not be used for
+   advertising or product endorsement purposes.
+
+*/
+
+#include "hip/hip_runtime.h"
+#include <vector>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define LULESH_SHOW_PROGRESS 1
+
+enum { VolumeError = -1, QStopError = -2 } ;
+
+/****************************************************/
+/* Allow flexibility for arithmetic representations */
+/****************************************************/
+
+/* Could also support fixed point and interval arithmetic types */
+typedef float        real4 ;
+typedef double       real8 ;
+typedef long double  real10 ;  /* 10 bytes on x86 */
+
+typedef int    Index_t ; /* array subscript and loop index */
+typedef real8  Real_t ;  /* floating point representation */
+typedef int    Int_t ;   /* integer representation */
+
+__host__ __device__ inline real4  SQRT(real4  arg) { return sqrtf(arg) ; }
+__host__ __device__ inline real8  SQRT(real8  arg) { return sqrt(arg) ; }
+__host__            inline real10 SQRT(real10 arg) { return sqrtl(arg) ; }
+
+__host__ __device__ inline real4  CBRT(real4  arg) { return cbrtf(arg) ; }
+__host__ __device__ inline real8  CBRT(real8  arg) { return cbrt(arg) ; }
+__host__            inline real10 CBRT(real10 arg) { return cbrtl(arg) ; }
+
+__host__ __device__ inline real4  FABS(real4  arg) { return fabsf(arg) ; }
+__host__ __device__ inline real8  FABS(real8  arg) { return fabs(arg) ; }
+__host__            inline real10 FABS(real10 arg) { return fabsl(arg) ; }
+
+__host__ __device__ inline real4  FMAX(real4  arg1,real4  arg2) { return fmaxf(arg1,arg2) ; }
+__host__ __device__ inline real8  FMAX(real8  arg1,real8  arg2) { return fmax(arg1,arg2) ; }
+__host__            inline real10 FMAX(real10 arg1,real10 arg2) { return fmaxl(arg1,arg2) ; }
+
+#define HIP_SAFE_CALL( call) do {                                           \
+        hipError_t err = call;                                              \
+        if( hipSuccess != err) {                                            \
+            fprintf(stderr, "HIP error in file '%s' in line %i : %s.\n",    \
+                    __FILE__, __LINE__, hipGetErrorString( err) );          \
+            exit(EXIT_FAILURE);                                             \
+        } } while (0)
+
+#define HIP(call) HIP_SAFE_CALL(call)
+
+#ifdef HIP_SYNC_ALL
+#define HIP_DEBUGSYNC HIP(hipDeviceSynchronize())
+#else
+#define HIP_DEBUGSYNC
+#endif
+
+#define BLOCKSIZE 256
+
+/* Given a number of bytes, nbytes, and a byte alignment, align, (e.g., 2,
+ * 4, 8, or 16), return the smallest integer that is larger than nbytes and
+ * a multiple of align.
+ */
+#define PAD_DIV(nbytes, align)  (((nbytes) + (align) - 1) / (align))
+#define PAD(nbytes, align)  (PAD_DIV((nbytes),(align)) * (align))
+
+/* More general version of reduceInPlacePOT (this works for arbitrary
+ * numThreadsPerBlock <= 1024). Again, conditionals on
+ * numThreadsPerBlock are evaluated at compile time.
+ */
+template <class T, int numThreadsPerBlock>
+__device__ void
+reduceSum(T *sresult, const int threadID)
+{
+  /* If number of threads is not a power of two, first add the ones
+       after the last power of two into the beginning. At most one of
+       these conditionals will be true for a given NPOT block size. */
+  if (numThreadsPerBlock > 512 && numThreadsPerBlock <= 1024)
+  {
+          __syncthreads();
+          if (threadID < numThreadsPerBlock-512)
+                sresult[threadID] += sresult[threadID + 512];
+  }
+
+  if (numThreadsPerBlock > 256 && numThreadsPerBlock < 512)
+  {
+          __syncthreads();
+          if (threadID < numThreadsPerBlock-256)
+                sresult[threadID] += sresult[threadID + 256];
+  }
+
+  if (numThreadsPerBlock > 128 && numThreadsPerBlock < 256)
+  {
+          __syncthreads();
+          if (threadID < numThreadsPerBlock-128)
+                sresult[threadID] += sresult[threadID + 128];
+  }
+
+  if (numThreadsPerBlock > 64 && numThreadsPerBlock < 128)
+  {
+          __syncthreads();
+          if (threadID < numThreadsPerBlock-64)
+                sresult[threadID] += sresult[threadID + 64];
+  }
+
+  if (numThreadsPerBlock > 32 && numThreadsPerBlock < 64)
+  {
+          __syncthreads();
+          if (threadID < numThreadsPerBlock-32)
+                sresult[threadID] += sresult[threadID + 32];
+  }
+
+  if (numThreadsPerBlock > 16 && numThreadsPerBlock < 32)
+  {
+          __syncthreads();
+          if (threadID < numThreadsPerBlock-16)
+                sresult[threadID] += sresult[threadID + 16];
+  }
+
+  if (numThreadsPerBlock > 8 && numThreadsPerBlock < 16)
+  {
+          __syncthreads();
+          if (threadID < numThreadsPerBlock-8)
+                sresult[threadID] += sresult[threadID + 8];
+  }
+
+  if (numThreadsPerBlock > 4 && numThreadsPerBlock < 8)
+  {
+          __syncthreads();
+          if (threadID < numThreadsPerBlock-4)
+                sresult[threadID] += sresult[threadID + 4];
+  }
+
+  if (numThreadsPerBlock > 2 && numThreadsPerBlock < 4)
+  {
+          __syncthreads();
+          if (threadID < numThreadsPerBlock-2)
+                sresult[threadID] += sresult[threadID + 2];
+  }
+
+  if (numThreadsPerBlock >= 512) {
+        __syncthreads();
+        if (threadID < 256)
+          sresult[threadID] += sresult[threadID + 256];
+  }
+
+  if (numThreadsPerBlock >= 256) {
+        __syncthreads();
+        if (threadID < 128)
+          sresult[threadID] += sresult[threadID + 128];
+  }
+  if (numThreadsPerBlock >= 128) {
+        __syncthreads();
+        if (threadID < 64)
+          sresult[threadID] += sresult[threadID + 64];
+  }
+  __syncthreads();
+#ifdef _DEVICEEMU
+  if (numThreadsPerBlock >= 64) {
+        __syncthreads();
+        if (threadID < 32)
+          sresult[threadID] += sresult[threadID + 32];
+  }
+  if (numThreadsPerBlock >= 32) {
+        __syncthreads();
+        if (threadID < 16)
+          sresult[threadID] += sresult[threadID + 16];
+  }
+  if (numThreadsPerBlock >= 16) {
+        __syncthreads();
+        if (threadID < 8)
+          sresult[threadID] += sresult[threadID + 8];
+  }
+  if (numThreadsPerBlock >= 8) {
+        __syncthreads();
+        if (threadID < 4)
+          sresult[threadID] += sresult[threadID + 4];
+  }
+  if (numThreadsPerBlock >= 4) {
+        __syncthreads();
+        if (threadID < 2)
+          sresult[threadID] += sresult[threadID + 2];
+  }
+  if (numThreadsPerBlock >= 2) {
+        __syncthreads();
+        if (threadID < 1)
+          sresult[threadID] += sresult[threadID + 1];
+  }
+#else
+  if (threadID < 32) {
+        volatile T *vol = sresult;
+        if (numThreadsPerBlock >= 64) vol[threadID] += vol[threadID + 32];
+        if (numThreadsPerBlock >= 32) vol[threadID] += vol[threadID + 16];
+        if (numThreadsPerBlock >= 16) vol[threadID] += vol[threadID + 8];
+        if (numThreadsPerBlock >= 8) vol[threadID] += vol[threadID + 4];
+        if (numThreadsPerBlock >= 4) vol[threadID] += vol[threadID + 2];
+        if (numThreadsPerBlock >= 2) vol[threadID] += vol[threadID + 1];
+  }
+#endif
+  __syncthreads();
+}
+
+#define MINEQ(a,b) (a)=(((a)<(b))?(a):(b))
+
+  template <class T, int numThreadsPerBlock>
+  __device__ void
+  reduceMin(T *sresult, const int threadID)
+  {
+        /* If number of threads is not a power of two, first add the ones
+       after the last power of two into the beginning. At most one of
+       these conditionals will be true for a given NPOT block size. */
+        if (numThreadsPerBlock > 512 && numThreadsPerBlock <= 1024)
+    {
+                __syncthreads();
+                if (threadID < numThreadsPerBlock-512)
+                  MINEQ(sresult[threadID],sresult[threadID + 512]);
+        }
+
+        if (numThreadsPerBlock > 256 && numThreadsPerBlock < 512)
+        {
+                __syncthreads();
+                if (threadID < numThreadsPerBlock-256)
+                  MINEQ(sresult[threadID],sresult[threadID + 256]);
+        }
+
+        if (numThreadsPerBlock > 128 && numThreadsPerBlock < 256)
+        {
+                __syncthreads();
+                if (threadID < numThreadsPerBlock-128)
+                  MINEQ(sresult[threadID],sresult[threadID + 128]);
+        }
+
+        if (numThreadsPerBlock > 64 && numThreadsPerBlock < 128)
+        {
+                __syncthreads();
+                if (threadID < numThreadsPerBlock-64)
+                  MINEQ(sresult[threadID],sresult[threadID + 64]);
+        }
+
+        if (numThreadsPerBlock > 32 && numThreadsPerBlock < 64)
+        {
+                __syncthreads();
+                if (threadID < numThreadsPerBlock-32)
+                  MINEQ(sresult[threadID],sresult[threadID + 32]);
+        }
+
+        if (numThreadsPerBlock > 16 && numThreadsPerBlock < 32)
+        {
+                __syncthreads();
+                if (threadID < numThreadsPerBlock-16)
+                  MINEQ(sresult[threadID],sresult[threadID + 16]);
+        }
+
+        if (numThreadsPerBlock > 8 && numThreadsPerBlock < 16)
+        {
+                __syncthreads();
+                if (threadID < numThreadsPerBlock-8)
+                  MINEQ(sresult[threadID],sresult[threadID + 8]);
+        }
+
+        if (numThreadsPerBlock > 4 && numThreadsPerBlock < 8)
+        {
+                __syncthreads();
+                if (threadID < numThreadsPerBlock-4)
+                  MINEQ(sresult[threadID],sresult[threadID + 4]);
+        }
+
+        if (numThreadsPerBlock > 2 && numThreadsPerBlock < 4)
+        {
+                __syncthreads();
+                if (threadID < numThreadsPerBlock-2)
+                  MINEQ(sresult[threadID],sresult[threadID + 2]);
+        }
+
+        if (numThreadsPerBlock >= 512) {
+          __syncthreads();
+          if (threadID < 256)
+                MINEQ(sresult[threadID],sresult[threadID + 256]);
+        }
+
+        if (numThreadsPerBlock >= 256) {
+          __syncthreads();
+          if (threadID < 128)
+                MINEQ(sresult[threadID],sresult[threadID + 128]);
+        }
+        if (numThreadsPerBlock >= 128) {
+          __syncthreads();
+          if (threadID < 64)
+                MINEQ(sresult[threadID],sresult[threadID + 64]);
+        }
+        __syncthreads();
+        #ifdef _DEVICEEMU
+        if (numThreadsPerBlock >= 64) {
+          __syncthreads();
+          if (threadID < 32)
+                MINEQ(sresult[threadID],sresult[threadID + 32]);
+        }
+        if (numThreadsPerBlock >= 32) {
+          __syncthreads();
+          if (threadID < 16)
+                MINEQ(sresult[threadID],sresult[threadID + 16]);
+        }
+        if (numThreadsPerBlock >= 16) {
+          __syncthreads();
+          if (threadID < 8)
+                MINEQ(sresult[threadID],sresult[threadID + 8]);
+        }
+        if (numThreadsPerBlock >= 8) {
+          __syncthreads();
+          if (threadID < 4)
+                MINEQ(sresult[threadID],sresult[threadID + 4]);
+        }
+        if (numThreadsPerBlock >= 4) {
+          __syncthreads();
+          if (threadID < 2)
+                MINEQ(sresult[threadID],sresult[threadID + 2]);
+        }
+        if (numThreadsPerBlock >= 2) {
+          __syncthreads();
+          if (threadID < 1)
+                MINEQ(sresult[threadID],sresult[threadID + 1]);
+        }
+#else
+        if (threadID < 32) {
+          volatile T *vol = sresult;
+          if (numThreadsPerBlock >= 64) MINEQ(vol[threadID],vol[threadID + 32]);
+          if (numThreadsPerBlock >= 32) MINEQ(vol[threadID],vol[threadID + 16]);
+          if (numThreadsPerBlock >= 16) MINEQ(vol[threadID],vol[threadID + 8]);
+          if (numThreadsPerBlock >= 8)  MINEQ(vol[threadID],vol[threadID + 4]);
+          if (numThreadsPerBlock >= 4)  MINEQ(vol[threadID],vol[threadID + 2]);
+          if (numThreadsPerBlock >= 2)  MINEQ(vol[threadID],vol[threadID + 1]);
+        }
+#endif
+        __syncthreads();
+  }
+
+void hip_init()
+{
+  int deviceCount, dev;
+  hipDeviceProp_t hip_deviceProp;
+  char *s;
+
+  HIP( hipGetDeviceCount(&deviceCount) );
+  if (deviceCount == 0) {
+        fprintf(stderr, "hip_init(): no devices supporting HIP.\n");
+        exit(1);
+  }
+  s=getenv("HIP_DEVICE");
+  if (s != NULL) { dev=atoi(s); }
+  else dev=0;
+  if ((dev < 0) || (dev > deviceCount-1)) {
+        fprintf(stderr, "hip_init(): requested device (%d) out of range [%d,%d]\n",
+                        dev, 0, deviceCount-1);
+        exit(1);
+  }
+  HIP( hipGetDeviceProperties(&hip_deviceProp, dev) );
+  if (hip_deviceProp.major < 1) {
+        fprintf(stderr, "hip_init(): device %d does not support HIP.\n", dev);
+        exit(1);
+  }
+  fprintf(stderr, "setting HIP device %d\n",dev);
+  HIP( hipSetDevice(dev) );
+}
+
+/************************************************************/
+/* Allow for flexible data layout experiments by separating */
+/* array interface from underlying implementation.          */
+/************************************************************/
+
+struct Mesh {
+
+  /* This first implementation allows for runnable code */
+  /* and is not meant to be optimal. Final implementation */
+  /* should separate declaration and allocation phases */
+  /* so that allocation can be scheduled in a cache conscious */
+  /* manner. */
+
+  friend struct MeshGPU;
+
+public:
+
+  /**************/
+  /* Allocation */
+  /**************/
+
+  void AllocateNodalPersistent(size_t size)
+  {
+        m_x.resize(size) ;
+        m_y.resize(size) ;
+        m_z.resize(size) ;
+
+        m_xd.resize(size, Real_t(0.)) ;
+        m_yd.resize(size, Real_t(0.)) ;
+        m_zd.resize(size, Real_t(0.)) ;
+
+        m_xdd.resize(size, Real_t(0.)) ;
+        m_ydd.resize(size, Real_t(0.)) ;
+        m_zdd.resize(size, Real_t(0.)) ;
+
+        m_fx.resize(size) ;
+        m_fy.resize(size) ;
+        m_fz.resize(size) ;
+
+        m_nodalMass.resize(size, Real_t(0.)) ;
+  }
+
+  void AllocateElemPersistent(size_t size)
+  {
+        m_matElemlist.resize(size) ;
+        m_nodelist.resize(8*size) ;
+
+        m_lxim.resize(size) ;
+        m_lxip.resize(size) ;
+        m_letam.resize(size) ;
+        m_letap.resize(size) ;
+        m_lzetam.resize(size) ;
+        m_lzetap.resize(size) ;
+
+        m_elemBC.resize(size) ;
+
+        m_e.resize(size, Real_t(0.)) ;
+
+        m_p.resize(size, Real_t(0.)) ;
+        m_q.resize(size) ;
+        m_ql.resize(size) ;
+        m_qq.resize(size) ;
+
+        m_v.resize(size, 1.0) ;
+        m_volo.resize(size) ;
+        m_delv.resize(size) ;
+        m_vdov.resize(size) ;
+
+        m_arealg.resize(size) ;
+
+        m_ss.resize(size) ;
+
+        m_elemMass.resize(size) ;
+  }
+
+  /* Temporaries should not be initialized in bulk but */
+  /* this is a runnable placeholder for now */
+  void AllocateElemTemporary(size_t size)
+  {
+        m_dxx.resize(size) ;
+        m_dyy.resize(size) ;
+        m_dzz.resize(size) ;
+
+        m_delv_xi.resize(size) ;
+        m_delv_eta.resize(size) ;
+        m_delv_zeta.resize(size) ;
+
+        m_delx_xi.resize(size) ;
+        m_delx_eta.resize(size) ;
+        m_delx_zeta.resize(size) ;
+
+        m_vnew.resize(size) ;
+  }
+
+  void AllocateNodesets(size_t size)
+  {
+        m_symmX.resize(size) ;
+        m_symmY.resize(size) ;
+        m_symmZ.resize(size) ;
+  }
+
+  void AllocateNodeElemIndexes()
+  {
+        Index_t i,j,nidx;
+
+        /* set up node-centered indexing of elements */
+        m_nodeElemCount.resize(m_numNode);
+        for (i=0;i<m_numNode;i++) m_nodeElemCount[i]=0;
+        m_nodeElemCornerList.resize(m_numNode*8);
+        for (i=0;i<m_numElem;i++) {
+          for (j=0;j<8;j++) {
+                nidx=nodelist(i,j);
+                m_nodeElemCornerList[nidx+m_numNode*m_nodeElemCount[nidx]++] = i+m_numElem*j;
+                if (m_nodeElemCount[nidx]>8) {
+                  fprintf(stderr, "Node degree is higher than 8!\n");
+                  exit(1);
+                }
+          }
+        }
+  }
+
+  /**********/
+  /* Access */
+  /**********/
+
+  /* Node-centered */
+
+  Real_t& x(Index_t idx)    { return m_x[idx] ; }
+  Real_t& y(Index_t idx)    { return m_y[idx] ; }
+  Real_t& z(Index_t idx)    { return m_z[idx] ; }
+
+  Real_t& xd(Index_t idx)   { return m_xd[idx] ; }
+  Real_t& yd(Index_t idx)   { return m_yd[idx] ; }
+  Real_t& zd(Index_t idx)   { return m_zd[idx] ; }
+
+  Real_t& xdd(Index_t idx)  { return m_xdd[idx] ; }
+  Real_t& ydd(Index_t idx)  { return m_ydd[idx] ; }
+  Real_t& zdd(Index_t idx)  { return m_zdd[idx] ; }
+
+  Real_t& fx(Index_t idx)   { return m_fx[idx] ; }
+  Real_t& fy(Index_t idx)   { return m_fy[idx] ; }
+  Real_t& fz(Index_t idx)   { return m_fz[idx] ; }
+
+  Real_t& nodalMass(Index_t idx) { return m_nodalMass[idx] ; }
+
+  Index_t&  symmX(Index_t idx) { return m_symmX[idx] ; }
+  Index_t&  symmY(Index_t idx) { return m_symmY[idx] ; }
+  Index_t&  symmZ(Index_t idx) { return m_symmZ[idx] ; }
+
+  /* Element-centered */
+
+  Index_t&  matElemlist(Index_t idx) { return m_matElemlist[idx] ; }
+  Index_t&  nodelist(Index_t idx,Index_t nidx)    { return m_nodelist[idx+nidx*m_numElem] ; }
+
+  Index_t&  lxim(Index_t idx) { return m_lxim[idx] ; }
+  Index_t&  lxip(Index_t idx) { return m_lxip[idx] ; }
+  Index_t&  letam(Index_t idx) { return m_letam[idx] ; }
+  Index_t&  letap(Index_t idx) { return m_letap[idx] ; }
+  Index_t&  lzetam(Index_t idx) { return m_lzetam[idx] ; }
+  Index_t&  lzetap(Index_t idx) { return m_lzetap[idx] ; }
+
+  Int_t&  elemBC(Index_t idx) { return m_elemBC[idx] ; }
+
+  Real_t& dxx(Index_t idx)  { return m_dxx[idx] ; }
+  Real_t& dyy(Index_t idx)  { return m_dyy[idx] ; }
+  Real_t& dzz(Index_t idx)  { return m_dzz[idx] ; }
+
+  Real_t& delv_xi(Index_t idx)    { return m_delv_xi[idx] ; }
+  Real_t& delv_eta(Index_t idx)   { return m_delv_eta[idx] ; }
+  Real_t& delv_zeta(Index_t idx)  { return m_delv_zeta[idx] ; }
+
+  Real_t& delx_xi(Index_t idx)    { return m_delx_xi[idx] ; }
+  Real_t& delx_eta(Index_t idx)   { return m_delx_eta[idx] ; }
+  Real_t& delx_zeta(Index_t idx)  { return m_delx_zeta[idx] ; }
+
+  Real_t& e(Index_t idx)          { return m_e[idx] ; }
+
+  Real_t& p(Index_t idx)          { return m_p[idx] ; }
+  Real_t& q(Index_t idx)          { return m_q[idx] ; }
+  Real_t& ql(Index_t idx)         { return m_ql[idx] ; }
+  Real_t& qq(Index_t idx)         { return m_qq[idx] ; }
+
+  Real_t& v(Index_t idx)          { return m_v[idx] ; }
+  Real_t& volo(Index_t idx)       { return m_volo[idx] ; }
+  Real_t& vnew(Index_t idx)       { return m_vnew[idx] ; }
+  Real_t& delv(Index_t idx)       { return m_delv[idx] ; }
+  Real_t& vdov(Index_t idx)       { return m_vdov[idx] ; }
+
+  Real_t& arealg(Index_t idx)     { return m_arealg[idx] ; }
+
+  Real_t& ss(Index_t idx)         { return m_ss[idx] ; }
+
+  Real_t& elemMass(Index_t idx)  { return m_elemMass[idx] ; }
+
+  /* Params */
+
+  Real_t& dtfixed()              { return m_dtfixed ; }
+  Real_t& time()                 { return m_time ; }
+  Real_t& deltatime()            { return m_deltatime ; }
+  Real_t& deltatimemultlb()      { return m_deltatimemultlb ; }
+  Real_t& deltatimemultub()      { return m_deltatimemultub ; }
+  Real_t& stoptime()             { return m_stoptime ; }
+
+  Real_t& u_cut()                { return m_u_cut ; }
+  Real_t& hgcoef()               { return m_hgcoef ; }
+  Real_t& qstop()                { return m_qstop ; }
+  Real_t& monoq_max_slope()      { return m_monoq_max_slope ; }
+  Real_t& monoq_limiter_mult()   { return m_monoq_limiter_mult ; }
+  Real_t& e_cut()                { return m_e_cut ; }
+  Real_t& p_cut()                { return m_p_cut ; }
+  Real_t& ss4o3()                { return m_ss4o3 ; }
+  Real_t& q_cut()                { return m_q_cut ; }
+  Real_t& v_cut()                { return m_v_cut ; }
+  Real_t& qlc_monoq()            { return m_qlc_monoq ; }
+  Real_t& qqc_monoq()            { return m_qqc_monoq ; }
+  Real_t& qqc()                  { return m_qqc ; }
+  Real_t& eosvmax()              { return m_eosvmax ; }
+  Real_t& eosvmin()              { return m_eosvmin ; }
+  Real_t& pmin()                 { return m_pmin ; }
+  Real_t& emin()                 { return m_emin ; }
+  Real_t& dvovmax()              { return m_dvovmax ; }
+  Real_t& refdens()              { return m_refdens ; }
+
+  Real_t& dtcourant()            { return m_dtcourant ; }
+  Real_t& dthydro()              { return m_dthydro ; }
+  Real_t& dtmax()                { return m_dtmax ; }
+
+  Int_t&  cycle()                { return m_cycle ; }
+
+  Index_t&  sizeX()              { return m_sizeX ; }
+  Index_t&  sizeY()              { return m_sizeY ; }
+  Index_t&  sizeZ()              { return m_sizeZ ; }
+  Index_t&  numElem()            { return m_numElem ; }
+  Index_t&  numNode()            { return m_numNode ; }
+
+
+  //private:
+
+  /******************/
+  /* Implementation */
+  /******************/
+
+  /* Node-centered */
+
+  std::vector<Real_t> m_x ;  /* coordinates */
+  std::vector<Real_t> m_y ;
+  std::vector<Real_t> m_z ;
+
+  std::vector<Real_t> m_xd ; /* velocities */
+  std::vector<Real_t> m_yd ;
+  std::vector<Real_t> m_zd ;
+
+  std::vector<Real_t> m_xdd ; /* accelerations */
+  std::vector<Real_t> m_ydd ;
+  std::vector<Real_t> m_zdd ;
+
+  std::vector<Real_t> m_fx ;  /* forces */
+  std::vector<Real_t> m_fy ;
+  std::vector<Real_t> m_fz ;
+
+  std::vector<Real_t> m_nodalMass ;  /* mass */
+
+  std::vector<Index_t> m_symmX ;  /* symmetry plane nodesets */
+  std::vector<Index_t> m_symmY ;
+  std::vector<Index_t> m_symmZ ;
+
+  std::vector<Int_t> m_nodeElemCount ;
+  std::vector<Index_t> m_nodeElemCornerList ;
+
+  /* Element-centered */
+
+  std::vector<Index_t>  m_matElemlist ;  /* material indexset */
+  std::vector<Index_t>  m_nodelist ;     /* elemToNode connectivity */
+
+  std::vector<Index_t>  m_lxim ;  /* element connectivity across each face */
+  std::vector<Index_t>  m_lxip ;
+  std::vector<Index_t>  m_letam ;
+  std::vector<Index_t>  m_letap ;
+  std::vector<Index_t>  m_lzetam ;
+  std::vector<Index_t>  m_lzetap ;
+
+  std::vector<Int_t>    m_elemBC ;  /* symmetry/free-surface flags for each elem face */
+
+  std::vector<Real_t> m_dxx ;  /* principal strains -- temporary */
+  std::vector<Real_t> m_dyy ;
+  std::vector<Real_t> m_dzz ;
+
+  std::vector<Real_t> m_delv_xi ;    /* velocity gradient -- temporary */
+  std::vector<Real_t> m_delv_eta ;
+  std::vector<Real_t> m_delv_zeta ;
+
+  std::vector<Real_t> m_delx_xi ;    /* coordinate gradient -- temporary */
+  std::vector<Real_t> m_delx_eta ;
+  std::vector<Real_t> m_delx_zeta ;
+
+  std::vector<Real_t> m_e ;   /* energy */
+
+  std::vector<Real_t> m_p ;   /* pressure */
+  std::vector<Real_t> m_q ;   /* q */
+  std::vector<Real_t> m_ql ;  /* linear term for q */
+  std::vector<Real_t> m_qq ;  /* quadratic term for q */
+
+  std::vector<Real_t> m_v ;     /* relative volume */
+  std::vector<Real_t> m_volo ;  /* reference volume */
+  std::vector<Real_t> m_vnew ;  /* new relative volume -- temporary */
+  std::vector<Real_t> m_delv ;  /* m_vnew - m_v */
+  std::vector<Real_t> m_vdov ;  /* volume derivative over volume */
+
+  std::vector<Real_t> m_arealg ;  /* characteristic length of an element */
+
+  std::vector<Real_t> m_ss ;      /* "sound speed" */
+
+  std::vector<Real_t> m_elemMass ;  /* mass */
+
+  /* Parameters */
+
+  Real_t  m_dtfixed ;           /* fixed time increment */
+  Real_t  m_time ;              /* current time */
+  Real_t  m_deltatime ;         /* variable time increment */
+  Real_t  m_deltatimemultlb ;
+  Real_t  m_deltatimemultub ;
+  Real_t  m_stoptime ;          /* end time for simulation */
+
+  Real_t  m_u_cut ;             /* velocity tolerance */
+  Real_t  m_hgcoef ;            /* hourglass control */
+  Real_t  m_qstop ;             /* excessive q indicator */
+  Real_t  m_monoq_max_slope ;
+  Real_t  m_monoq_limiter_mult ;
+  Real_t  m_e_cut ;             /* energy tolerance */
+  Real_t  m_p_cut ;             /* pressure tolerance */
+  Real_t  m_ss4o3 ;
+  Real_t  m_q_cut ;             /* q tolerance */
+  Real_t  m_v_cut ;             /* relative volume tolerance */
+  Real_t  m_qlc_monoq ;         /* linear term coef for q */
+  Real_t  m_qqc_monoq ;         /* quadratic term coef for q */
+  Real_t  m_qqc ;
+  Real_t  m_eosvmax ;
+  Real_t  m_eosvmin ;
+  Real_t  m_pmin ;              /* pressure floor */
+  Real_t  m_emin ;              /* energy floor */
+  Real_t  m_dvovmax ;           /* maximum allowable volume change */
+  Real_t  m_refdens ;           /* reference density */
+
+  Real_t  m_dtcourant ;         /* courant constraint */
+  Real_t  m_dthydro ;           /* volume change constraint */
+  Real_t  m_dtmax ;             /* maximum allowable time increment */
+
+  Int_t   m_cycle ;             /* iteration count for simulation */
+
+  Index_t   m_sizeX ;           /* X,Y,Z extent of this block */
+  Index_t   m_sizeY ;
+  Index_t   m_sizeZ ;
+
+  Index_t   m_numElem ;         /* Elements/Nodes in this domain */
+  Index_t   m_numNode ;
+} mesh ;
+
+template <typename T>
+T *Allocate(size_t size)
+{
+  return static_cast<T *>(malloc(sizeof(T)*size)) ;
+}
+
+template <typename T>
+void Release(T **ptr)
+{
+  if (*ptr != NULL) {
+        free(*ptr) ;
+        *ptr = NULL ;
+  }
+}
+
+
+#define GPU_STALE 0
+#define CPU_STALE 1
+#define ALL_FRESH 2
+
+template<typename T>
+void freshenGPU(std::vector<T>&cpu,T **gpu,int& stale) {
+  if (stale!=GPU_STALE) return;
+  if (!(*gpu)) {HIP( hipMalloc(gpu,sizeof(T)*cpu.size()) );}
+  HIP( hipMemcpy(*gpu,&cpu[0],sizeof(T)*cpu.size(),hipMemcpyHostToDevice) );
+  stale=ALL_FRESH;
+}
+
+template<typename T>
+void freshenCPU(std::vector<T>&cpu,T *gpu,int& stale) {
+  if (stale!=CPU_STALE) return;
+  if (!gpu) {fprintf(stderr,"freshenCPU(): NULL GPU data!\n");exit(1);}
+  HIP( hipMemcpy(&cpu[0],gpu,sizeof(T)*cpu.size(),hipMemcpyDeviceToHost) );
+  stale=ALL_FRESH;
+}
+
+// freshen helpers
+#define FC(var) freshenCPU(mesh.m_ ## var , meshGPU.m_ ## var ,meshGPU.m_ ## var ## _stale ); // freshen CPU
+#define FG(var) freshenGPU(mesh.m_ ## var , &meshGPU.m_ ## var ,meshGPU.m_ ## var ## _stale ); // freshen GPU
+// stale helpers
+#define SC(var) meshGPU.m_ ## var ## _stale = CPU_STALE; // stale CPU
+#define SG(var) meshGPU.m_ ## var ## _stale = GPU_STALE; // stale GPU
+
+struct MeshGPU {
+  Mesh *m_mesh;
+
+  /******************/
+  /* Implementation */
+  /******************/
+
+  /* Node-centered */
+
+  Real_t *m_x ;  /* coordinates */
+  Real_t *m_y ;
+  Real_t *m_z ;
+
+  Real_t *m_xd ; /* velocities */
+  Real_t *m_yd ;
+  Real_t *m_zd ;
+
+  Real_t *m_xdd ; /* accelerations */
+  Real_t *m_ydd ;
+  Real_t *m_zdd ;
+
+  Real_t *m_fx ;  /* forces */
+  Real_t *m_fy ;
+  Real_t *m_fz ;
+
+  Real_t *m_nodalMass ;  /* mass */
+
+  Index_t *m_symmX ;  /* symmetry plane nodesets */
+  Index_t *m_symmY ;
+  Index_t *m_symmZ ;
+
+  Int_t   *m_nodeElemCount ;
+  Index_t *m_nodeElemCornerList ;
+
+  /* Element-centered */
+
+  Index_t * m_matElemlist ;  /* material indexset */
+  Index_t * m_nodelist ;     /* elemToNode connectivity */
+
+  Index_t * m_lxim ;  /* element connectivity across each face */
+  Index_t * m_lxip ;
+  Index_t * m_letam ;
+  Index_t * m_letap ;
+  Index_t * m_lzetam ;
+  Index_t * m_lzetap ;
+
+  Int_t *   m_elemBC ;  /* symmetry/free-surface flags for each elem face */
+
+  Real_t *m_dxx ;  /* principal strains -- temporary */
+  Real_t *m_dyy ;
+  Real_t *m_dzz ;
+
+  Real_t *m_delv_xi ;    /* velocity gradient -- temporary */
+  Real_t *m_delv_eta ;
+  Real_t *m_delv_zeta ;
+
+  Real_t *m_delx_xi ;    /* coordinate gradient -- temporary */
+  Real_t *m_delx_eta ;
+  Real_t *m_delx_zeta ;
+
+  Real_t *m_e ;   /* energy */
+
+  Real_t *m_p ;   /* pressure */
+  Real_t *m_q ;   /* q */
+  Real_t *m_ql ;  /* linear term for q */
+  Real_t *m_qq ;  /* quadratic term for q */
+
+  Real_t *m_v ;     /* relative volume */
+  Real_t *m_volo ;  /* reference volume */
+  Real_t *m_vnew ;  /* new relative volume -- temporary */
+  Real_t *m_delv ;  /* m_vnew - m_v */
+  Real_t *m_vdov ;  /* volume derivative over volume */
+
+  Real_t *m_arealg ;  /* characteristic length of an element */
+
+  Real_t *m_ss ;      /* "sound speed" */
+
+  Real_t *m_elemMass ;  /* mass */
+
+  /* Stale flags */
+  int m_x_stale,m_y_stale,m_z_stale;
+  int m_xd_stale,m_yd_stale,m_zd_stale;
+  int m_xdd_stale,m_ydd_stale,m_zdd_stale;
+  int m_fx_stale,m_fy_stale,m_fz_stale;
+  int m_nodalMass_stale;
+  int m_symmX_stale,m_symmY_stale,m_symmZ_stale;
+  int m_nodeElemCount_stale,m_nodeElemCornerList_stale;
+  int m_matElemlist_stale,m_nodelist_stale;
+  int m_lxim_stale,m_lxip_stale,m_letam_stale,m_letap_stale,m_lzetam_stale,m_lzetap_stale;
+  int m_elemBC_stale;
+  int m_dxx_stale,m_dyy_stale,m_dzz_stale;
+  int m_delv_xi_stale,m_delv_eta_stale,m_delv_zeta_stale;
+  int m_delx_xi_stale,m_delx_eta_stale,m_delx_zeta_stale;
+  int m_e_stale;
+  int m_p_stale,m_q_stale,m_ql_stale,m_qq_stale;
+  int m_v_stale,m_volo_stale,m_vnew_stale,m_delv_stale,m_vdov_stale;
+  int m_arealg_stale;
+  int m_ss_stale;
+  int m_elemMass_stale;
+
+  void init(Mesh *mesh) {
+        m_mesh=mesh;
+        m_x=m_y=m_z=NULL;
+        m_xd=m_yd=m_zd=NULL;
+        m_xdd=m_ydd=m_zdd=NULL;
+        m_fx=m_fy=m_fz=NULL;
+        m_nodalMass=NULL;
+        m_symmX=m_symmY=m_symmZ=NULL;
+        m_nodeElemCount=m_nodeElemCornerList=NULL;
+        m_matElemlist=m_nodelist=NULL;
+        m_lxim=m_lxip=m_letam=m_letap=m_lzetam=m_lzetap=NULL;
+        m_elemBC=NULL;
+        m_dxx=m_dyy=m_dzz=NULL;
+        m_delv_xi=m_delv_eta=m_delv_zeta=NULL;
+        m_delx_xi=m_delx_eta=m_delx_zeta=NULL;
+        m_e=NULL;
+        m_p=m_q=m_ql=m_qq=NULL;
+        m_v=m_volo=m_vnew=m_delv=m_vdov=NULL;
+        m_arealg=NULL;
+        m_ss=NULL;
+        m_elemMass=NULL;
+                m_x_stale=m_y_stale=m_z_stale=
+                                      m_xd_stale=m_yd_stale=m_zd_stale=
+                                      m_xdd_stale=m_ydd_stale=m_zdd_stale=
+                                      m_fx_stale=m_fy_stale=m_fz_stale=
+                                      m_nodalMass_stale=
+                                      m_symmX_stale=m_symmY_stale=m_symmZ_stale=
+                                      m_nodeElemCount_stale=m_nodeElemCornerList_stale=
+                                      m_matElemlist_stale=m_nodelist_stale=
+                                      m_lxim_stale=m_lxip_stale=m_letam_stale=m_letap_stale=m_lzetam_stale=m_lzetap_stale=
+                                      m_elemBC_stale=
+                                      m_dxx_stale=m_dyy_stale=m_dzz_stale=
+                                      m_delv_xi_stale=m_delv_eta_stale=m_delv_zeta_stale=
+                                      m_delx_xi_stale=m_delx_eta_stale=m_delx_zeta_stale=
+                                      m_e_stale=
+                                      m_p_stale=m_q_stale=m_ql_stale=m_qq_stale=
+                                      m_v_stale=m_volo_stale=m_vnew_stale=m_delv_stale=m_vdov_stale=
+                                      m_arealg_stale=
+                                      m_ss_stale=
+                                      m_elemMass_stale=
+                          GPU_STALE;
+  }
+  void freshenGPU() {
+#define F(var) ::freshenGPU(m_mesh->m_ ## var , &m_ ## var ,m_ ## var ## _stale);
+        F(x); F(y); F(z);
+        F(xd); F(yd); F(zd);
+        F(xdd); F(ydd); F(zdd);
+        F(fx); F(fy); F(fz);
+        F(nodalMass);
+        F(symmX); F(symmY); F(symmZ);
+        F(nodeElemCount); F(nodeElemCornerList);
+        F(matElemlist); F(nodelist);
+        F(lxim); F(lxip); F(letam); F(letap); F(lzetam); F(lzetap);
+        F(elemBC);
+        F(dxx); F(dyy); F(dzz);
+        F(delv_xi); F(delv_eta); F(delv_zeta);
+        F(delx_xi); F(delx_eta); F(delx_zeta);
+        F(e);
+        F(p); F(q); F(ql); F(qq);
+        F(v); F(volo); F(vnew); F(delv); F(vdov);
+        F(arealg);
+        F(ss);
+        F(elemMass);
+        #undef F
+  }
+  void freshenCPU() {
+#define F(var) ::freshenCPU(m_mesh->m_ ## var , m_ ## var ,m_ ## var ## _stale);
+        F(x); F(y); F(z);
+        F(xd); F(yd); F(zd);
+        F(xdd); F(ydd); F(zdd);
+        F(fx); F(fy); F(fz);
+        F(nodalMass);
+        F(symmX); F(symmY); F(symmZ);
+        F(nodeElemCount); F(nodeElemCornerList);
+        F(matElemlist); F(nodelist);
+        F(lxim); F(lxip); F(letam); F(letap); F(lzetam); F(lzetap);
+        F(elemBC);
+        F(dxx); F(dyy); F(dzz);
+        F(delv_xi); F(delv_eta); F(delv_zeta);
+        F(delx_xi); F(delx_eta); F(delx_zeta);
+        F(e);
+        F(p); F(q); F(ql); F(qq);
+        F(v); F(volo); F(vnew); F(delv); F(vdov);
+        F(arealg);
+        F(ss);
+        F(elemMass);
+        #undef F
+  }
+} meshGPU;
+
+
+
+/* Stuff needed for boundary conditions */
+/* 2 BCs on each of 6 hexahedral faces (12 bits) */
+#define XI_M        0x003
+#define XI_M_SYMM   0x001
+#define XI_M_FREE   0x002
+
+#define XI_P        0x00c
+#define XI_P_SYMM   0x004
+#define XI_P_FREE   0x008
+
+#define ETA_M       0x030
+#define ETA_M_SYMM  0x010
+#define ETA_M_FREE  0x020
+
+#define ETA_P       0x0c0
+#define ETA_P_SYMM  0x040
+#define ETA_P_FREE  0x080
+
+#define ZETA_M      0x300
+#define ZETA_M_SYMM 0x100
+#define ZETA_M_FREE 0x200
+
+#define ZETA_P      0xc00
+#define ZETA_P_SYMM 0x400
+#define ZETA_P_FREE 0x800
+
+
+static inline
+void TimeIncrement()
+{
+  Real_t targetdt = mesh.stoptime() - mesh.time() ;
+
+  if ((mesh.dtfixed() <= Real_t(0.0)) && (mesh.cycle() != Int_t(0))) {
+        Real_t ratio ;
+        Real_t olddt = mesh.deltatime() ;
+
+        /* This will require a reduction in parallel */
+        Real_t newdt = Real_t(1.0e+20) ;
+        if (mesh.dtcourant() < newdt) {
+          newdt = mesh.dtcourant() / Real_t(2.0) ;
+        }
+        if (mesh.dthydro() < newdt) {
+          newdt = mesh.dthydro() * Real_t(2.0) / Real_t(3.0) ;
+        }
+
+        ratio = newdt / olddt ;
+        if (ratio >= Real_t(1.0)) {
+          if (ratio < mesh.deltatimemultlb()) {
+                newdt = olddt ;
+          }
+          else if (ratio > mesh.deltatimemultub()) {
+                newdt = olddt*mesh.deltatimemultub() ;
+          }
+        }
+
+        if (newdt > mesh.dtmax()) {
+          newdt = mesh.dtmax() ;
+        }
+        mesh.deltatime() = newdt ;
+  }
+
+  /* TRY TO PREVENT VERY SMALL SCALING ON THE NEXT CYCLE */
+  if ((targetdt > mesh.deltatime()) &&
+          (targetdt < (Real_t(4.0) * mesh.deltatime() / Real_t(3.0))) ) {
+        targetdt = Real_t(2.0) * mesh.deltatime() / Real_t(3.0) ;
+  }
+
+  if (targetdt < mesh.deltatime()) {
+        mesh.deltatime() = targetdt ;
+  }
+
+  mesh.time() += mesh.deltatime() ;
+
+  ++mesh.cycle() ;
+}
+
+__global__
+void InitStressTermsForElems_kernel(
+                                   int numElem,Real_t *sigxx, Real_t *sigyy, Real_t *sigzz, Real_t *p, Real_t *q)
+{
+  int i = hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
+  if (i<numElem)
+        sigxx[i] = sigyy[i] = sigzz[i] =  - p[i] - q[i] ;
+}
+
+static inline
+void InitStressTermsForElems_gpu(Index_t numElem,
+                                 Real_t *sigxx, Real_t *sigyy, Real_t *sigzz)
+{
+  dim3 dimBlock(BLOCKSIZE,1,1);
+  dim3 dimGrid(PAD_DIV(numElem,dimBlock.x),1,1);
+  //hipFuncSetCacheConfig(InitStressTermsForElems_kernel,hipFuncCachePreferL1); // set as default for all kernels after this one
+  hipLaunchKernelGGL((InitStressTermsForElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, numElem,sigxx,sigyy,sigzz,meshGPU.m_p,meshGPU.m_q);
+  HIP_DEBUGSYNC;
+}
+
+static inline
+void InitStressTermsForElems_cpu(Index_t numElem,
+                                 Real_t *sigxx, Real_t *sigyy, Real_t *sigzz)
+{
+  //
+  // pull in the stresses appropriate to the hydro integration
+  //
+  for (Index_t i = 0 ; i < numElem ; ++i){
+        sigxx[i] =  sigyy[i] = sigzz[i] =  - mesh.p(i) - mesh.q(i) ;
+  }
+}
+
+static inline
+void InitStressTermsForElems(Index_t numElem,
+                             Real_t *sigxx, Real_t *sigyy, Real_t *sigzz,
+                             int useCPU)
+{
+  if (useCPU) {
+        FC(p); FC(q);
+        InitStressTermsForElems_cpu(numElem,sigxx,sigyy,sigzz);
+  }
+  else {
+        FG(p); FG(q);
+        InitStressTermsForElems_gpu(numElem,sigxx,sigyy,sigzz);
+  }
+}
+
+__host__ __device__
+static inline
+void CalcElemShapeFunctionDerivatives( const Real_t* const x,
+                                       const Real_t* const y,
+                                       const Real_t* const z,
+                                       Real_t b[][8],
+                                       Real_t* const volume )
+{
+  const Real_t x0 = x[0] ;   const Real_t x1 = x[1] ;
+  const Real_t x2 = x[2] ;   const Real_t x3 = x[3] ;
+  const Real_t x4 = x[4] ;   const Real_t x5 = x[5] ;
+  const Real_t x6 = x[6] ;   const Real_t x7 = x[7] ;
+
+  const Real_t y0 = y[0] ;   const Real_t y1 = y[1] ;
+  const Real_t y2 = y[2] ;   const Real_t y3 = y[3] ;
+  const Real_t y4 = y[4] ;   const Real_t y5 = y[5] ;
+  const Real_t y6 = y[6] ;   const Real_t y7 = y[7] ;
+
+  const Real_t z0 = z[0] ;   const Real_t z1 = z[1] ;
+  const Real_t z2 = z[2] ;   const Real_t z3 = z[3] ;
+  const Real_t z4 = z[4] ;   const Real_t z5 = z[5] ;
+  const Real_t z6 = z[6] ;   const Real_t z7 = z[7] ;
+
+  Real_t fjxxi, fjxet, fjxze;
+  Real_t fjyxi, fjyet, fjyze;
+  Real_t fjzxi, fjzet, fjzze;
+  Real_t cjxxi, cjxet, cjxze;
+  Real_t cjyxi, cjyet, cjyze;
+  Real_t cjzxi, cjzet, cjzze;
+
+  fjxxi = Real_t(.125) * ( (x6-x0) + (x5-x3) - (x7-x1) - (x4-x2) );
+  fjxet = Real_t(.125) * ( (x6-x0) - (x5-x3) + (x7-x1) - (x4-x2) );
+  fjxze = Real_t(.125) * ( (x6-x0) + (x5-x3) + (x7-x1) + (x4-x2) );
+
+  fjyxi = Real_t(.125) * ( (y6-y0) + (y5-y3) - (y7-y1) - (y4-y2) );
+  fjyet = Real_t(.125) * ( (y6-y0) - (y5-y3) + (y7-y1) - (y4-y2) );
+  fjyze = Real_t(.125) * ( (y6-y0) + (y5-y3) + (y7-y1) + (y4-y2) );
+
+  fjzxi = Real_t(.125) * ( (z6-z0) + (z5-z3) - (z7-z1) - (z4-z2) );
+  fjzet = Real_t(.125) * ( (z6-z0) - (z5-z3) + (z7-z1) - (z4-z2) );
+  fjzze = Real_t(.125) * ( (z6-z0) + (z5-z3) + (z7-z1) + (z4-z2) );
+
+  /* compute cofactors */
+  cjxxi =    (fjyet * fjzze) - (fjzet * fjyze);
+  cjxet =  - (fjyxi * fjzze) + (fjzxi * fjyze);
+  cjxze =    (fjyxi * fjzet) - (fjzxi * fjyet);
+
+  cjyxi =  - (fjxet * fjzze) + (fjzet * fjxze);
+  cjyet =    (fjxxi * fjzze) - (fjzxi * fjxze);
+  cjyze =  - (fjxxi * fjzet) + (fjzxi * fjxet);
+
+  cjzxi =    (fjxet * fjyze) - (fjyet * fjxze);
+  cjzet =  - (fjxxi * fjyze) + (fjyxi * fjxze);
+  cjzze =    (fjxxi * fjyet) - (fjyxi * fjxet);
+
+  /* calculate partials :
+     this need only be done for l = 0,1,2,3   since , by symmetry ,
+     (6,7,4,5) = - (0,1,2,3) .
+  */
+  b[0][0] =   -  cjxxi  -  cjxet  -  cjxze;
+  b[0][1] =      cjxxi  -  cjxet  -  cjxze;
+  b[0][2] =      cjxxi  +  cjxet  -  cjxze;
+  b[0][3] =   -  cjxxi  +  cjxet  -  cjxze;
+  b[0][4] = -b[0][2];
+  b[0][5] = -b[0][3];
+  b[0][6] = -b[0][0];
+  b[0][7] = -b[0][1];
+
+  b[1][0] =   -  cjyxi  -  cjyet  -  cjyze;
+  b[1][1] =      cjyxi  -  cjyet  -  cjyze;
+  b[1][2] =      cjyxi  +  cjyet  -  cjyze;
+  b[1][3] =   -  cjyxi  +  cjyet  -  cjyze;
+  b[1][4] = -b[1][2];
+  b[1][5] = -b[1][3];
+  b[1][6] = -b[1][0];
+  b[1][7] = -b[1][1];
+
+  b[2][0] =   -  cjzxi  -  cjzet  -  cjzze;
+  b[2][1] =      cjzxi  -  cjzet  -  cjzze;
+  b[2][2] =      cjzxi  +  cjzet  -  cjzze;
+  b[2][3] =   -  cjzxi  +  cjzet  -  cjzze;
+  b[2][4] = -b[2][2];
+  b[2][5] = -b[2][3];
+  b[2][6] = -b[2][0];
+  b[2][7] = -b[2][1];
+
+  /* calculate jacobian determinant (volume) */
+  *volume = Real_t(8.) * ( fjxet * cjxet + fjyet * cjyet + fjzet * cjzet);
+}
+
+__host__ __device__
+static inline
+void SumElemFaceNormal(Real_t *normalX0, Real_t *normalY0, Real_t *normalZ0,
+                       Real_t *normalX1, Real_t *normalY1, Real_t *normalZ1,
+                       Real_t *normalX2, Real_t *normalY2, Real_t *normalZ2,
+                       Real_t *normalX3, Real_t *normalY3, Real_t *normalZ3,
+                       const Real_t x0, const Real_t y0, const Real_t z0,
+                       const Real_t x1, const Real_t y1, const Real_t z1,
+                       const Real_t x2, const Real_t y2, const Real_t z2,
+                       const Real_t x3, const Real_t y3, const Real_t z3)
+{
+  Real_t bisectX0 = Real_t(0.5) * (x3 + x2 - x1 - x0);
+  Real_t bisectY0 = Real_t(0.5) * (y3 + y2 - y1 - y0);
+  Real_t bisectZ0 = Real_t(0.5) * (z3 + z2 - z1 - z0);
+  Real_t bisectX1 = Real_t(0.5) * (x2 + x1 - x3 - x0);
+  Real_t bisectY1 = Real_t(0.5) * (y2 + y1 - y3 - y0);
+  Real_t bisectZ1 = Real_t(0.5) * (z2 + z1 - z3 - z0);
+  Real_t areaX = Real_t(0.25) * (bisectY0 * bisectZ1 - bisectZ0 * bisectY1);
+  Real_t areaY = Real_t(0.25) * (bisectZ0 * bisectX1 - bisectX0 * bisectZ1);
+  Real_t areaZ = Real_t(0.25) * (bisectX0 * bisectY1 - bisectY0 * bisectX1);
+
+  *normalX0 += areaX;
+  *normalX1 += areaX;
+  *normalX2 += areaX;
+  *normalX3 += areaX;
+
+  *normalY0 += areaY;
+  *normalY1 += areaY;
+  *normalY2 += areaY;
+  *normalY3 += areaY;
+
+  *normalZ0 += areaZ;
+  *normalZ1 += areaZ;
+  *normalZ2 += areaZ;
+  *normalZ3 += areaZ;
+}
+
+__host__ __device__
+static inline
+void CalcElemNodeNormals(Real_t pfx[8],
+                         Real_t pfy[8],
+                         Real_t pfz[8],
+                         const Real_t x[8],
+                         const Real_t y[8],
+                         const Real_t z[8])
+{
+  for (Index_t i = 0 ; i < 8 ; ++i) {
+        pfx[i] = Real_t(0.0);
+        pfy[i] = Real_t(0.0);
+        pfz[i] = Real_t(0.0);
+  }
+  /* evaluate face one: nodes 0, 1, 2, 3 */
+  SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],
+                    &pfx[1], &pfy[1], &pfz[1],
+                    &pfx[2], &pfy[2], &pfz[2],
+                    &pfx[3], &pfy[3], &pfz[3],
+                    x[0], y[0], z[0], x[1], y[1], z[1],
+                    x[2], y[2], z[2], x[3], y[3], z[3]);
+  /* evaluate face two: nodes 0, 4, 5, 1 */
+  SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],
+                    &pfx[4], &pfy[4], &pfz[4],
+                    &pfx[5], &pfy[5], &pfz[5],
+                    &pfx[1], &pfy[1], &pfz[1],
+                    x[0], y[0], z[0], x[4], y[4], z[4],
+                    x[5], y[5], z[5], x[1], y[1], z[1]);
+  /* evaluate face three: nodes 1, 5, 6, 2 */
+  SumElemFaceNormal(&pfx[1], &pfy[1], &pfz[1],
+                    &pfx[5], &pfy[5], &pfz[5],
+                    &pfx[6], &pfy[6], &pfz[6],
+                    &pfx[2], &pfy[2], &pfz[2],
+                    x[1], y[1], z[1], x[5], y[5], z[5],
+                    x[6], y[6], z[6], x[2], y[2], z[2]);
+  /* evaluate face four: nodes 2, 6, 7, 3 */
+  SumElemFaceNormal(&pfx[2], &pfy[2], &pfz[2],
+                    &pfx[6], &pfy[6], &pfz[6],
+                    &pfx[7], &pfy[7], &pfz[7],
+                    &pfx[3], &pfy[3], &pfz[3],
+                    x[2], y[2], z[2], x[6], y[6], z[6],
+                    x[7], y[7], z[7], x[3], y[3], z[3]);
+  /* evaluate face five: nodes 3, 7, 4, 0 */
+  SumElemFaceNormal(&pfx[3], &pfy[3], &pfz[3],
+                    &pfx[7], &pfy[7], &pfz[7],
+                    &pfx[4], &pfy[4], &pfz[4],
+                    &pfx[0], &pfy[0], &pfz[0],
+                    x[3], y[3], z[3], x[7], y[7], z[7],
+                    x[4], y[4], z[4], x[0], y[0], z[0]);
+  /* evaluate face six: nodes 4, 7, 6, 5 */
+  SumElemFaceNormal(&pfx[4], &pfy[4], &pfz[4],
+                    &pfx[7], &pfy[7], &pfz[7],
+                    &pfx[6], &pfy[6], &pfz[6],
+                    &pfx[5], &pfy[5], &pfz[5],
+                    x[4], y[4], z[4], x[7], y[7], z[7],
+                    x[6], y[6], z[6], x[5], y[5], z[5]);
+}
+
+__host__ __device__
+static inline
+void SumElemStressesToNodeForces( const Real_t B[][8],
+                                  const Real_t stress_xx,
+                                  const Real_t stress_yy,
+                                  const Real_t stress_zz,
+                                  Real_t* const fx,
+                                  Real_t* const fy,
+                                  Real_t* const fz,
+                                  int stride)
+{
+  Real_t pfx0 = B[0][0] ;   Real_t pfx1 = B[0][1] ;
+  Real_t pfx2 = B[0][2] ;   Real_t pfx3 = B[0][3] ;
+  Real_t pfx4 = B[0][4] ;   Real_t pfx5 = B[0][5] ;
+  Real_t pfx6 = B[0][6] ;   Real_t pfx7 = B[0][7] ;
+
+  Real_t pfy0 = B[1][0] ;   Real_t pfy1 = B[1][1] ;
+  Real_t pfy2 = B[1][2] ;   Real_t pfy3 = B[1][3] ;
+  Real_t pfy4 = B[1][4] ;   Real_t pfy5 = B[1][5] ;
+  Real_t pfy6 = B[1][6] ;   Real_t pfy7 = B[1][7] ;
+
+  Real_t pfz0 = B[2][0] ;   Real_t pfz1 = B[2][1] ;
+  Real_t pfz2 = B[2][2] ;   Real_t pfz3 = B[2][3] ;
+  Real_t pfz4 = B[2][4] ;   Real_t pfz5 = B[2][5] ;
+  Real_t pfz6 = B[2][6] ;   Real_t pfz7 = B[2][7] ;
+
+  fx[0*stride] = -( stress_xx * pfx0 );
+  fx[1*stride] = -( stress_xx * pfx1 );
+  fx[2*stride] = -( stress_xx * pfx2 );
+  fx[3*stride] = -( stress_xx * pfx3 );
+  fx[4*stride] = -( stress_xx * pfx4 );
+  fx[5*stride] = -( stress_xx * pfx5 );
+  fx[6*stride] = -( stress_xx * pfx6 );
+  fx[7*stride] = -( stress_xx * pfx7 );
+
+  fy[0*stride] = -( stress_yy * pfy0  );
+  fy[1*stride] = -( stress_yy * pfy1  );
+  fy[2*stride] = -( stress_yy * pfy2  );
+  fy[3*stride] = -( stress_yy * pfy3  );
+  fy[4*stride] = -( stress_yy * pfy4  );
+  fy[5*stride] = -( stress_yy * pfy5  );
+  fy[6*stride] = -( stress_yy * pfy6  );
+  fy[7*stride] = -( stress_yy * pfy7  );
+
+  fz[0*stride] = -( stress_zz * pfz0 );
+  fz[1*stride] = -( stress_zz * pfz1 );
+  fz[2*stride] = -( stress_zz * pfz2 );
+  fz[3*stride] = -( stress_zz * pfz3 );
+  fz[4*stride] = -( stress_zz * pfz4 );
+  fz[5*stride] = -( stress_zz * pfz5 );
+  fz[6*stride] = -( stress_zz * pfz6 );
+  fz[7*stride] = -( stress_zz * pfz7 );
+}
+
+__global__
+void IntegrateStressForElems_kernel( Index_t numElem, Index_t *nodelist,
+                                     Real_t *x, Real_t *y, Real_t *z,
+                                     Real_t *fx_elem, Real_t *fy_elem, Real_t *fz_elem,
+                                     Real_t *sigxx, Real_t *sigyy, Real_t *sigzz,
+                                     Real_t *determ)
+{
+  Real_t B[3][8] ;// shape function derivatives
+  Real_t x_local[8] ;
+  Real_t y_local[8] ;
+  Real_t z_local[8] ;
+
+  int k=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
+  if (k<numElem) {
+        // get nodal coordinates from global arrays and copy into local arrays.
+    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
+    {
+            Index_t gnode = nodelist[k+lnode*numElem];
+            x_local[lnode] = x[gnode];
+            y_local[lnode] = y[gnode];
+            z_local[lnode] = z[gnode];
+    }
+
+    /* Volume calculation involves extra work for numerical consistency. */
+    CalcElemShapeFunctionDerivatives(x_local, y_local, z_local,
+                                     B, &determ[k]);
+
+    CalcElemNodeNormals( B[0] , B[1], B[2],
+                         x_local, y_local, z_local );
+
+    SumElemStressesToNodeForces( B, sigxx[k], sigyy[k], sigzz[k],
+                                 &fx_elem[k], &fy_elem[k], &fz_elem[k], numElem ) ;
+  }
+}
+
+__global__
+void AddNodeForcesFromElems_kernel( Index_t numNode,
+                                    Int_t *nodeElemCount, Index_t *nodeElemCornerList,
+                                    Real_t *fx_elem, Real_t *fy_elem, Real_t *fz_elem,
+                                    Real_t *fx_node, Real_t *fy_node, Real_t *fz_node)
+{
+  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
+  if (i<numNode) {
+        Int_t count=nodeElemCount[i];
+        Real_t fx,fy,fz;
+        fx=fy=fz=Real_t(0.0);
+        for (int j=0;j<count;j++) {
+          Index_t elem=nodeElemCornerList[i+numNode*j];
+          fx+=fx_elem[elem]; fy+=fy_elem[elem]; fz+=fz_elem[elem];
+        }
+        fx_node[i]=fx; fy_node[i]=fy; fz_node[i]=fz;
+  }
+}
+
+__global__
+void AddNodeForcesFromElems2_kernel( Index_t numNode,
+                                     Int_t *nodeElemCount, Index_t *nodeElemCornerList,
+                                     Real_t *fx_elem, Real_t *fy_elem, Real_t *fz_elem,
+                                     Real_t *fx_node, Real_t *fy_node, Real_t *fz_node)
+{
+  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
+  if (i<numNode) {
+        Int_t count=nodeElemCount[i];
+        Real_t fx,fy,fz;
+        fx=fy=fz=Real_t(0.0);
+        for (int j=0;j<count;j++) {
+          Index_t elem=nodeElemCornerList[i+numNode*j];
+          fx+=fx_elem[elem]; fy+=fy_elem[elem]; fz+=fz_elem[elem];
+        }
+        fx_node[i]+=fx; fy_node[i]+=fy; fz_node[i]+=fz;
+  }
+}
+
+static inline
+void IntegrateStressForElems_gpu( Index_t numElem,
+                                  Real_t *sigxx, Real_t *sigyy, Real_t *sigzz,
+                                  Real_t *determ, int& badvol)
+{
+  Real_t *fx_elem,*fy_elem,*fz_elem;
+
+  HIP( hipMalloc(&fx_elem,numElem*8*sizeof(Real_t)) );
+  HIP( hipMalloc(&fy_elem,numElem*8*sizeof(Real_t)) );
+  HIP( hipMalloc(&fz_elem,numElem*8*sizeof(Real_t)) );
+
+  dim3 dimBlock=dim3(BLOCKSIZE,1,1);
+  dim3 dimGrid=dim3(PAD_DIV(numElem,dimBlock.x),1,1);
+  hipLaunchKernelGGL((IntegrateStressForElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, numElem, meshGPU.m_nodelist, meshGPU.m_x, meshGPU.m_y, meshGPU.m_z,
+         fx_elem, fy_elem, fz_elem, sigxx, sigyy, sigzz, determ);
+  HIP_DEBUGSYNC;
+
+  dimGrid=dim3(PAD_DIV(mesh.numNode(),dimBlock.x),1,1);
+  hipLaunchKernelGGL((AddNodeForcesFromElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, mesh.numNode(),meshGPU.m_nodeElemCount,meshGPU.m_nodeElemCornerList,
+         fx_elem,fy_elem,fz_elem,meshGPU.m_fx,meshGPU.m_fy,meshGPU.m_fz);
+  HIP_DEBUGSYNC;
+
+  HIP( hipFree(fx_elem) );
+  HIP( hipFree(fy_elem) );
+  HIP( hipFree(fz_elem) );
+
+  // JDC -- need a reduction step to check for non-positive element volumes
+  badvol=0;
+}
+
+static inline
+void IntegrateStressForElems_cpu( Index_t numElem,
+                                  Real_t *sigxx, Real_t *sigyy, Real_t *sigzz,
+                                  Real_t *determ, int& badvol)
+{
+  Real_t B[3][8] ;// shape function derivatives
+  Real_t x_local[8] ;
+  Real_t y_local[8] ;
+  Real_t z_local[8] ;
+  Real_t fx_local[8] ;
+  Real_t fy_local[8] ;
+  Real_t fz_local[8] ;
+
+  // loop over all elements
+  for( Index_t k=0 ; k<numElem ; ++k )
+  {
+          // get nodal coordinates from global arrays and copy into local arrays.
+      for( Index_t lnode=0 ; lnode<8 ; ++lnode )
+      {
+          Index_t gnode = mesh.nodelist(k,lnode);
+          x_local[lnode] = mesh.x(gnode);
+          y_local[lnode] = mesh.y(gnode);
+          z_local[lnode] = mesh.z(gnode);
+      }
+
+      /* Volume calculation involves extra work for numerical consistency. */
+      CalcElemShapeFunctionDerivatives(x_local, y_local, z_local,
+                                       B, &determ[k]);
+
+      CalcElemNodeNormals( B[0] , B[1], B[2],
+                           x_local, y_local, z_local );
+
+      SumElemStressesToNodeForces( B, sigxx[k], sigyy[k], sigzz[k],
+                                   fx_local, fy_local, fz_local, 1 ) ;
+
+      // copy nodal force contributions to global force arrray.
+      for( Index_t lnode=0 ; lnode<8 ; ++lnode )
+      {
+              Index_t gnode = mesh.nodelist(k,lnode);
+              mesh.fx(gnode) += fx_local[lnode];
+              mesh.fy(gnode) += fy_local[lnode];
+              mesh.fz(gnode) += fz_local[lnode];
+      }
+  }
+
+  badvol=0;
+  for ( Index_t k=0 ; k<numElem ; ++k ) {
+        if (determ[k] <= Real_t(0.0)) {
+          badvol=1;
+        }
+  }
+}
+
+static inline
+void IntegrateStressForElems( Index_t numElem,
+                              Real_t *sigxx, Real_t *sigyy, Real_t *sigzz,
+                              Real_t *determ, int& badvol, int useCPU)
+{
+  if (useCPU) {
+        FC(nodelist); FC(x); FC(y); FC(z);
+        IntegrateStressForElems_cpu(numElem,sigxx,sigyy,sigzz,determ,badvol);
+        SG(fx); SG(fy); SG(fz);
+  }
+  else {
+        FG(nodelist); FG(nodeElemCount); FG(nodeElemCornerList);
+        FG(x); FG(y); FG(z);
+        IntegrateStressForElems_gpu(numElem,sigxx,sigyy,sigzz,determ,badvol);
+        SC(fx); SC(fy); SC(fz);
+  }
+
+}
+
+
+static inline
+void CollectDomainNodesToElemNodes(const Index_t elemNum,
+                                   Real_t elemX[8],
+                                   Real_t elemY[8],
+                                   Real_t elemZ[8])
+{
+  Index_t nd0i = mesh.nodelist(elemNum,0) ;
+  Index_t nd1i = mesh.nodelist(elemNum,1) ;
+  Index_t nd2i = mesh.nodelist(elemNum,2) ;
+  Index_t nd3i = mesh.nodelist(elemNum,3) ;
+  Index_t nd4i = mesh.nodelist(elemNum,4) ;
+  Index_t nd5i = mesh.nodelist(elemNum,5) ;
+  Index_t nd6i = mesh.nodelist(elemNum,6) ;
+  Index_t nd7i = mesh.nodelist(elemNum,7) ;
+
+  elemX[0] = mesh.x(nd0i);
+  elemX[1] = mesh.x(nd1i);
+  elemX[2] = mesh.x(nd2i);
+  elemX[3] = mesh.x(nd3i);
+  elemX[4] = mesh.x(nd4i);
+  elemX[5] = mesh.x(nd5i);
+  elemX[6] = mesh.x(nd6i);
+  elemX[7] = mesh.x(nd7i);
+
+  elemY[0] = mesh.y(nd0i);
+  elemY[1] = mesh.y(nd1i);
+  elemY[2] = mesh.y(nd2i);
+  elemY[3] = mesh.y(nd3i);
+  elemY[4] = mesh.y(nd4i);
+  elemY[5] = mesh.y(nd5i);
+  elemY[6] = mesh.y(nd6i);
+  elemY[7] = mesh.y(nd7i);
+
+  elemZ[0] = mesh.z(nd0i);
+  elemZ[1] = mesh.z(nd1i);
+  elemZ[2] = mesh.z(nd2i);
+  elemZ[3] = mesh.z(nd3i);
+  elemZ[4] = mesh.z(nd4i);
+  elemZ[5] = mesh.z(nd5i);
+  elemZ[6] = mesh.z(nd6i);
+  elemZ[7] = mesh.z(nd7i);
+
+}
+
+
+__host__
+static inline
+void VoluDer(const Real_t x0, const Real_t x1, const Real_t x2,
+             const Real_t x3, const Real_t x4, const Real_t x5,
+             const Real_t y0, const Real_t y1, const Real_t y2,
+             const Real_t y3, const Real_t y4, const Real_t y5,
+             const Real_t z0, const Real_t z1, const Real_t z2,
+             const Real_t z3, const Real_t z4, const Real_t z5,
+             Real_t* dvdx, Real_t* dvdy, Real_t* dvdz)
+{
+  const Real_t twelfth = Real_t(1.0) / Real_t(12.0) ;
+
+  *dvdx =
+        (y1 + y2) * (z0 + z1) - (y0 + y1) * (z1 + z2) +
+        (y0 + y4) * (z3 + z4) - (y3 + y4) * (z0 + z4) -
+        (y2 + y5) * (z3 + z5) + (y3 + y5) * (z2 + z5);
+  *dvdy =
+        - (x1 + x2) * (z0 + z1) + (x0 + x1) * (z1 + z2) -
+        (x0 + x4) * (z3 + z4) + (x3 + x4) * (z0 + z4) +
+        (x2 + x5) * (z3 + z5) - (x3 + x5) * (z2 + z5);
+
+  *dvdz =
+        - (y1 + y2) * (x0 + x1) + (y0 + y1) * (x1 + x2) -
+        (y0 + y4) * (x3 + x4) + (y3 + y4) * (x0 + x4) +
+        (y2 + y5) * (x3 + x5) - (y3 + y5) * (x2 + x5);
+
+  *dvdx *= twelfth;
+  *dvdy *= twelfth;
+  *dvdz *= twelfth;
+}
+
+#if 0
+__device__
+static inline
+void VOLUDER(const Real_t a0, const Real_t a1, const Real_t a2,
+                         const Real_t a3, const Real_t a4, const Real_t a5,
+                         const Real_t b0, const Real_t b1, const Real_t b2,
+                         const Real_t b3, const Real_t b4, const Real_t b5,
+                         Real_t& dvdc)
+{
+  const Real_t twelfth = Real_t(1.0) / Real_t(12.0) ;
+
+     dvdc=
+           (a1 + a2) * (b0 + b1) - (a0 + a1) * (b1 + b2) +
+           (a0 + a4) * (b3 + b4) - (a3 + a4) * (b0 + b4) -
+           (a2 + a5) * (b3 + b5) + (a3 + a5) * (b2 + b5);
+         dvdc *= twelfth;
+}
+#else
+// Even though the above version is inlined, it seems to prohibit some kind of compiler optimization.
+// This macro version uses many fewer registers and avoids spill-over into local memory.
+#define VOLUDER(a0,a1,a2,a3,a4,a5,b0,b1,b2,b3,b4,b5,dvdc)\
+  {\
+        const Real_t twelfth = Real_t(1.0) / Real_t(12.0) ;\
+        \
+        dvdc= \
+          ((a1) + (a2)) * ((b0) + (b1)) - ((a0) + (a1)) * ((b1) + (b2)) +\
+          ((a0) + (a4)) * ((b3) + (b4)) - ((a3) + (a4)) * ((b0) + (b4)) -\
+          ((a2) + (a5)) * ((b3) + (b5)) + ((a3) + (a5)) * ((b2) + (b5));\
+        dvdc *= twelfth;\
+  }
+#endif
+
+__host__
+static inline
+void CalcElemVolumeDerivative(Real_t dvdx[8],
+                              Real_t dvdy[8],
+                              Real_t dvdz[8],
+                              const Real_t x[8],
+                              const Real_t y[8],
+                              const Real_t z[8])
+{
+  VoluDer(x[1], x[2], x[3], x[4], x[5], x[7],
+                  y[1], y[2], y[3], y[4], y[5], y[7],
+                  z[1], z[2], z[3], z[4], z[5], z[7],
+                  &dvdx[0], &dvdy[0], &dvdz[0]);
+  VoluDer(x[0], x[1], x[2], x[7], x[4], x[6],
+                  y[0], y[1], y[2], y[7], y[4], y[6],
+                  z[0], z[1], z[2], z[7], z[4], z[6],
+                  &dvdx[3], &dvdy[3], &dvdz[3]);
+  VoluDer(x[3], x[0], x[1], x[6], x[7], x[5],
+                  y[3], y[0], y[1], y[6], y[7], y[5],
+                  z[3], z[0], z[1], z[6], z[7], z[5],
+                  &dvdx[2], &dvdy[2], &dvdz[2]);
+  VoluDer(x[2], x[3], x[0], x[5], x[6], x[4],
+                  y[2], y[3], y[0], y[5], y[6], y[4],
+                  z[2], z[3], z[0], z[5], z[6], z[4],
+                  &dvdx[1], &dvdy[1], &dvdz[1]);
+  VoluDer(x[7], x[6], x[5], x[0], x[3], x[1],
+                  y[7], y[6], y[5], y[0], y[3], y[1],
+                  z[7], z[6], z[5], z[0], z[3], z[1],
+                  &dvdx[4], &dvdy[4], &dvdz[4]);
+  VoluDer(x[4], x[7], x[6], x[1], x[0], x[2],
+                  y[4], y[7], y[6], y[1], y[0], y[2],
+                  z[4], z[7], z[6], z[1], z[0], z[2],
+                  &dvdx[5], &dvdy[5], &dvdz[5]);
+  VoluDer(x[5], x[4], x[7], x[2], x[1], x[3],
+                  y[5], y[4], y[7], y[2], y[1], y[3],
+                  z[5], z[4], z[7], z[2], z[1], z[3],
+                  &dvdx[6], &dvdy[6], &dvdz[6]);
+  VoluDer(x[6], x[5], x[4], x[3], x[2], x[0],
+                  y[6], y[5], y[4], y[3], y[2], y[0],
+                  z[6], z[5], z[4], z[3], z[2], z[0],
+                  &dvdx[7], &dvdy[7], &dvdz[7]);
+}
+
+__device__
+static inline
+void CalcElemVolumeDerivative(Real_t& dvdx,
+                              Real_t& dvdy,
+                              Real_t& dvdz,
+                              const Real_t x,
+                              const Real_t y,
+                              const Real_t z,
+                              unsigned int node)
+{
+  __shared__ Real_t array1[256],array2[256];
+  volatile Real_t *va1;
+  volatile Real_t *va2;
+
+  unsigned int idx,elem;
+  unsigned int ind0,ind1,ind2,ind3,ind4,ind5;
+
+  switch(node) {
+  case 0:
+        {ind0=1; ind1=2; ind2=3; ind3=4; ind4=5; ind5=7;
+          break;}
+  case 1:
+        {ind0=2; ind1=3; ind2=0; ind3=5; ind4=6; ind5=4;
+          break;}
+  case 2:
+        {ind0=3; ind1=0; ind2=1; ind3=6; ind4=7; ind5=5;
+          break;}
+  case 3:
+        {ind0=0; ind1=1; ind2=2; ind3=7; ind4=4; ind5=6;
+          break;}
+  case 4:
+        {ind0=7; ind1=6; ind2=5; ind3=0; ind4=3; ind5=1;
+          break;}
+  case 5:
+        {ind0=4; ind1=7; ind2=6; ind3=1; ind4=0; ind5=2;
+          break;}
+  case 6:
+        {ind0=5; ind1=4; ind2=7; ind3=2; ind4=1; ind5=3;
+          break;}
+  case 7:
+        {ind0=6; ind1=5; ind2=4; ind3=3; ind4=2; ind5=0;
+          break;}
+  default:
+        {ind0=ind1=ind2=ind3=ind4=ind5=0xFFFFFFFF;
+          break;}
+  }
+
+  idx=hipThreadIdx_x;
+  elem=idx /*& 0x1F*/ - node*32;
+
+  va1=&array1[0];
+  va2=&array2[0];
+
+  // load y and z
+  __syncthreads();
+  va1[idx]=y; va2[idx]=z;
+  __syncthreads();
+  VOLUDER(va1[ind0*32+elem],va1[ind1*32+elem],va1[ind2*32+elem],
+                  va1[ind3*32+elem],va1[ind4*32+elem],va1[ind5*32+elem],
+                  va2[ind0*32+elem],va2[ind1*32+elem],va2[ind2*32+elem],
+                  va2[ind3*32+elem],va2[ind4*32+elem],va2[ind5*32+elem],
+                  dvdx);
+
+  // load x
+  __syncthreads();
+  va1[idx]=x;
+  __syncthreads();
+  VOLUDER(va2[ind0*32+elem],va2[ind1*32+elem],va2[ind2*32+elem],
+                  va2[ind3*32+elem],va2[ind4*32+elem],va2[ind5*32+elem],
+                  va1[ind0*32+elem],va1[ind1*32+elem],va1[ind2*32+elem],
+                  va1[ind3*32+elem],va1[ind4*32+elem],va1[ind5*32+elem],
+                  dvdy);
+  __syncthreads();
+
+  // load y
+  __syncthreads();
+  va2[idx]=y;
+  __syncthreads();
+  VOLUDER(va1[ind0*32+elem],va1[ind1*32+elem],va1[ind2*32+elem],
+                  va1[ind3*32+elem],va1[ind4*32+elem],va1[ind5*32+elem],
+                  va2[ind0*32+elem],va2[ind1*32+elem],va2[ind2*32+elem],
+                  va2[ind3*32+elem],va2[ind4*32+elem],va2[ind5*32+elem],
+                  dvdz);
+  __syncthreads();
+}
+
+__host__
+static inline
+void CalcElemFBHourglassForce(Real_t *xd, Real_t *yd, Real_t *zd,  Real_t *hourgam0,
+                              Real_t *hourgam1, Real_t *hourgam2, Real_t *hourgam3,
+                              Real_t *hourgam4, Real_t *hourgam5, Real_t *hourgam6,
+                              Real_t *hourgam7, Real_t coefficient,
+                              Real_t *hgfx, Real_t *hgfy, Real_t *hgfz )
+{
+  Index_t i00=0;
+  Index_t i01=1;
+  Index_t i02=2;
+  Index_t i03=3;
+
+     Real_t h00 =
+                 hourgam0[i00] * xd[0] + hourgam1[i00] * xd[1] +
+                 hourgam2[i00] * xd[2] + hourgam3[i00] * xd[3] +
+                 hourgam4[i00] * xd[4] + hourgam5[i00] * xd[5] +
+           hourgam6[i00] * xd[6] + hourgam7[i00] * xd[7];
+
+            Real_t h01 =
+                        hourgam0[i01] * xd[0] + hourgam1[i01] * xd[1] +
+                        hourgam2[i01] * xd[2] + hourgam3[i01] * xd[3] +
+                        hourgam4[i01] * xd[4] + hourgam5[i01] * xd[5] +
+                  hourgam6[i01] * xd[6] + hourgam7[i01] * xd[7];
+
+                Real_t h02 =
+                  hourgam0[i02] * xd[0] + hourgam1[i02] * xd[1]+
+                  hourgam2[i02] * xd[2] + hourgam3[i02] * xd[3]+
+                  hourgam4[i02] * xd[4] + hourgam5[i02] * xd[5]+
+                  hourgam6[i02] * xd[6] + hourgam7[i02] * xd[7];
+
+                Real_t h03 =
+                  hourgam0[i03] * xd[0] + hourgam1[i03] * xd[1] +
+                  hourgam2[i03] * xd[2] + hourgam3[i03] * xd[3] +
+                  hourgam4[i03] * xd[4] + hourgam5[i03] * xd[5] +
+                  hourgam6[i03] * xd[6] + hourgam7[i03] * xd[7];
+
+                hgfx[0] = coefficient *
+                  (hourgam0[i00] * h00 + hourgam0[i01] * h01 +
+                   hourgam0[i02] * h02 + hourgam0[i03] * h03);
+
+                hgfx[1] = coefficient *
+                  (hourgam1[i00] * h00 + hourgam1[i01] * h01 +
+                   hourgam1[i02] * h02 + hourgam1[i03] * h03);
+
+                hgfx[2] = coefficient *
+                  (hourgam2[i00] * h00 + hourgam2[i01] * h01 +
+                   hourgam2[i02] * h02 + hourgam2[i03] * h03);
+
+                hgfx[3] = coefficient *
+                  (hourgam3[i00] * h00 + hourgam3[i01] * h01 +
+                   hourgam3[i02] * h02 + hourgam3[i03] * h03);
+
+                hgfx[4] = coefficient *
+                  (hourgam4[i00] * h00 + hourgam4[i01] * h01 +
+                   hourgam4[i02] * h02 + hourgam4[i03] * h03);
+
+                hgfx[5] = coefficient *
+                  (hourgam5[i00] * h00 + hourgam5[i01] * h01 +
+                   hourgam5[i02] * h02 + hourgam5[i03] * h03);
+
+                hgfx[6] = coefficient *
+                  (hourgam6[i00] * h00 + hourgam6[i01] * h01 +
+                   hourgam6[i02] * h02 + hourgam6[i03] * h03);
+
+                hgfx[7] = coefficient *
+                  (hourgam7[i00] * h00 + hourgam7[i01] * h01 +
+                   hourgam7[i02] * h02 + hourgam7[i03] * h03);
+
+                h00 =
+                  hourgam0[i00] * yd[0] + hourgam1[i00] * yd[1] +
+                  hourgam2[i00] * yd[2] + hourgam3[i00] * yd[3] +
+                  hourgam4[i00] * yd[4] + hourgam5[i00] * yd[5] +
+                  hourgam6[i00] * yd[6] + hourgam7[i00] * yd[7];
+
+                h01 =
+                  hourgam0[i01] * yd[0] + hourgam1[i01] * yd[1] +
+                  hourgam2[i01] * yd[2] + hourgam3[i01] * yd[3] +
+                  hourgam4[i01] * yd[4] + hourgam5[i01] * yd[5] +
+                  hourgam6[i01] * yd[6] + hourgam7[i01] * yd[7];
+
+                h02 =
+                  hourgam0[i02] * yd[0] + hourgam1[i02] * yd[1]+
+                  hourgam2[i02] * yd[2] + hourgam3[i02] * yd[3]+
+                  hourgam4[i02] * yd[4] + hourgam5[i02] * yd[5]+
+                  hourgam6[i02] * yd[6] + hourgam7[i02] * yd[7];
+
+                h03 =
+                  hourgam0[i03] * yd[0] + hourgam1[i03] * yd[1] +
+                  hourgam2[i03] * yd[2] + hourgam3[i03] * yd[3] +
+                  hourgam4[i03] * yd[4] + hourgam5[i03] * yd[5] +
+                  hourgam6[i03] * yd[6] + hourgam7[i03] * yd[7];
+
+
+                hgfy[0] = coefficient *
+                  (hourgam0[i00] * h00 + hourgam0[i01] * h01 +
+                   hourgam0[i02] * h02 + hourgam0[i03] * h03);
+
+                hgfy[1] = coefficient *
+                  (hourgam1[i00] * h00 + hourgam1[i01] * h01 +
+                   hourgam1[i02] * h02 + hourgam1[i03] * h03);
+
+                hgfy[2] = coefficient *
+                  (hourgam2[i00] * h00 + hourgam2[i01] * h01 +
+                   hourgam2[i02] * h02 + hourgam2[i03] * h03);
+
+                hgfy[3] = coefficient *
+                  (hourgam3[i00] * h00 + hourgam3[i01] * h01 +
+                   hourgam3[i02] * h02 + hourgam3[i03] * h03);
+
+                hgfy[4] = coefficient *
+                  (hourgam4[i00] * h00 + hourgam4[i01] * h01 +
+                   hourgam4[i02] * h02 + hourgam4[i03] * h03);
+
+                hgfy[5] = coefficient *
+                  (hourgam5[i00] * h00 + hourgam5[i01] * h01 +
+                   hourgam5[i02] * h02 + hourgam5[i03] * h03);
+
+                hgfy[6] = coefficient *
+                  (hourgam6[i00] * h00 + hourgam6[i01] * h01 +
+                   hourgam6[i02] * h02 + hourgam6[i03] * h03);
+
+                hgfy[7] = coefficient *
+                  (hourgam7[i00] * h00 + hourgam7[i01] * h01 +
+                   hourgam7[i02] * h02 + hourgam7[i03] * h03);
+
+                h00 =
+                  hourgam0[i00] * zd[0] + hourgam1[i00] * zd[1] +
+                  hourgam2[i00] * zd[2] + hourgam3[i00] * zd[3] +
+                  hourgam4[i00] * zd[4] + hourgam5[i00] * zd[5] +
+                  hourgam6[i00] * zd[6] + hourgam7[i00] * zd[7];
+
+                h01 =
+                  hourgam0[i01] * zd[0] + hourgam1[i01] * zd[1] +
+                  hourgam2[i01] * zd[2] + hourgam3[i01] * zd[3] +
+                  hourgam4[i01] * zd[4] + hourgam5[i01] * zd[5] +
+                  hourgam6[i01] * zd[6] + hourgam7[i01] * zd[7];
+
+                h02 =
+                  hourgam0[i02] * zd[0] + hourgam1[i02] * zd[1]+
+                  hourgam2[i02] * zd[2] + hourgam3[i02] * zd[3]+
+                  hourgam4[i02] * zd[4] + hourgam5[i02] * zd[5]+
+                  hourgam6[i02] * zd[6] + hourgam7[i02] * zd[7];
+
+                h03 =
+                  hourgam0[i03] * zd[0] + hourgam1[i03] * zd[1] +
+                  hourgam2[i03] * zd[2] + hourgam3[i03] * zd[3] +
+                  hourgam4[i03] * zd[4] + hourgam5[i03] * zd[5] +
+                  hourgam6[i03] * zd[6] + hourgam7[i03] * zd[7];
+
+
+                hgfz[0] = coefficient *
+                  (hourgam0[i00] * h00 + hourgam0[i01] * h01 +
+                   hourgam0[i02] * h02 + hourgam0[i03] * h03);
+
+                hgfz[1] = coefficient *
+                  (hourgam1[i00] * h00 + hourgam1[i01] * h01 +
+                   hourgam1[i02] * h02 + hourgam1[i03] * h03);
+
+                hgfz[2] = coefficient *
+                  (hourgam2[i00] * h00 + hourgam2[i01] * h01 +
+                   hourgam2[i02] * h02 + hourgam2[i03] * h03);
+
+                hgfz[3] = coefficient *
+                  (hourgam3[i00] * h00 + hourgam3[i01] * h01 +
+                   hourgam3[i02] * h02 + hourgam3[i03] * h03);
+
+                hgfz[4] = coefficient *
+                  (hourgam4[i00] * h00 + hourgam4[i01] * h01 +
+                   hourgam4[i02] * h02 + hourgam4[i03] * h03);
+
+                hgfz[5] = coefficient *
+                  (hourgam5[i00] * h00 + hourgam5[i01] * h01 +
+                   hourgam5[i02] * h02 + hourgam5[i03] * h03);
+
+                hgfz[6] = coefficient *
+                  (hourgam6[i00] * h00 + hourgam6[i01] * h01 +
+                   hourgam6[i02] * h02 + hourgam6[i03] * h03);
+
+                hgfz[7] = coefficient *
+                  (hourgam7[i00] * h00 + hourgam7[i01] * h01 +
+                   hourgam7[i02] * h02 + hourgam7[i03] * h03);
+}
+
+
+__device__
+static inline
+Real_t SumOverNodes(Real_t val) {
+  __shared__ Real_t shm_array[32*8];
+
+  // Sum up 8 node values for each element
+  // Assumes 256 threads: 32 elements, 8 nodes per element.
+  // NOTE: we could probably avoid some of the __syncthreads() if we map 8 nodes
+  //       of an element to the same warp.
+  unsigned int tid=hipThreadIdx_x;
+
+#if 1
+#if 0
+  unsigned int node=tid>>5;
+  unsigned int elem=tid-(node<<5);
+#elif 1
+  unsigned int node=tid/32;
+  unsigned int elem=tid-(node*32);
+#else
+  unsigned int elem=tid & 0x1F;
+#endif
+  __syncthreads();
+  shm_array[tid]=val;
+  __syncthreads();
+  if (tid<128) shm_array[tid]+=shm_array[tid+128];
+  __syncthreads();
+  if (tid<64)  shm_array[tid]+=shm_array[tid+64];
+  __syncthreads();
+  if (tid<32)  shm_array[tid]+=shm_array[tid+32];
+  __syncthreads();
+  Real_t ret=shm_array[elem];
+  __syncthreads();
+  return ret;
+#else
+#if 0
+  unsigned int node=tid>>5;
+  unsigned int elem=tid-(node<<5);
+#else
+  unsigned int node=tid/32;
+  unsigned int elem=tid-(node*32);
+#endif
+  unsigned int idx=elem*8+node;
+  __syncthreads();
+  shm_array[idx]=val;
+  __syncthreads();
+  if (node<4) shm_array[idx]+=shm_array[idx+4];
+  if (node<2) shm_array[idx]+=shm_array[idx+2];
+  if (node<1) shm_array[idx]+=shm_array[idx+1];
+  __syncthreads();
+  return shm_array[elem*8];
+#endif
+}
+
+__device__
+static inline
+void CalcElemFBHourglassForce(Real_t xd,Real_t yd,Real_t zd,
+                              Real_t *hourgam,Real_t coefficient,
+                              Real_t &hgfx, Real_t &hgfy, Real_t &hgfz)
+{
+  hgfx=0;
+  for (int i=0;i<4;i++) {
+        Real_t h;
+        h=hourgam[i]*xd;
+        h=SumOverNodes(h);
+        hgfx+=hourgam[i]*h;
+  }
+  hgfx *= coefficient;
+
+  hgfy=0;
+  for (int i=0;i<4;i++) {
+        Real_t h;
+        h=hourgam[i]*yd;
+        h=SumOverNodes(h);
+        hgfy+=hourgam[i]*h;
+  }
+  hgfy *= coefficient;
+
+  hgfz=0;
+  for (int i=0;i<4;i++) {
+        Real_t h;
+        h=hourgam[i]*zd;
+        h=SumOverNodes(h);
+        hgfz+=hourgam[i]*h;
+  }
+  hgfz *= coefficient;
+}
+
+__global__
+void CalcFBHourglassForceForElems_kernel(
+                                         Real_t *determ,
+                                         Real_t *x8n,      Real_t *y8n,      Real_t *z8n,
+                                         Real_t *dvdx,     Real_t *dvdy,     Real_t *dvdz,
+                                         Real_t hourg,
+                                         Index_t numElem, Index_t *nodelist,
+                                         Real_t *ss, Real_t *elemMass,
+                                         Real_t *xd, Real_t *yd, Real_t *zd,
+                                         Real_t *fx_elem, Real_t *fy_elem, Real_t *fz_elem)
+{
+  /*************************************************
+   *
+   *     FUNCTION: Calculates the Flanagan-Belytschko anti-hourglass
+   *               force.
+   *
+   *************************************************/
+
+  Real_t hgfx, hgfy, hgfz;
+
+  Real_t coefficient;
+
+  Real_t hourgam[4];
+  Real_t xd1, yd1, zd1;
+
+  /*************************************************/
+  /*    compute the hourglass modes */
+
+  const Real_t posf = Real_t( 1.);
+  const Real_t negf = Real_t(-1.);
+
+  // Assume we will launch 256 threads, which we map to 32 elements, each
+  // with 8 per-node threads. Organize so each warp of 32 consecutive
+  // threads operates on the same node of different elements.
+
+
+  // THESE ARE ALL GIVING ME DIFFERENT ANSWERS IN CUDA 4.0 !!?!!?!!
+  unsigned int tid=hipThreadIdx_x;
+  unsigned int bid=hipBlockIdx_x;
+#if 0
+  unsigned int node=tid>>5;
+  unsigned int elem=bid<<5 + (tid - (node<<5));
+#elif 1
+  unsigned int node=tid/32;
+  unsigned int elem=bid*32 + (tid-node*32);
+#elif 0
+  unsigned int node=tid/32;;
+  unsigned int elem=bid*32 + (tid & 0x1F);
+#elif 0
+  unsigned int node=tid/32;
+  unsigned int elem=bid<<5 + (tid & 0x1F);
+#elif 0
+  unsigned int node=tid>>5;
+  unsigned int elem=bid*32 + (tid & 0x1F);
+#else
+  unsigned int node=tid>>5;
+  unsigned int elem=bid<<5 + (tid & 0x1F);
+#endif
+
+  if (elem>=numElem) elem=numElem-1; // don't return -- need thread to participate in sync operations
+
+  //if (elem<0) elem=0; // debugging test
+
+  Real_t volinv=Real_t(1.0)/determ[elem];
+  Real_t ss1, mass1, volume13 ;
+
+  Real_t xn,yn,zn,dvdxn,dvdyn,dvdzn;
+  Real_t hourmodx, hourmody, hourmodz;
+
+
+#if 1
+  xn=x8n[elem+numElem*node]; yn=y8n[elem+numElem*node]; zn=z8n[elem+numElem*node];
+  dvdxn=dvdx[elem+numElem*node]; dvdyn=dvdy[elem+numElem*node]; dvdzn=dvdz[elem+numElem*node];
+#else
+  xn=yn=zn=posf; dvdxn=dvdyn=dvdzn=negf;
+#endif
+
+#if 1
+  hourmodx=xn; hourmody=yn; hourmodz=zn;
+  if (node==2 || node==3 || node==4 || node==5) {
+        hourmodx *= negf; hourmody *= negf; hourmodz *= negf;
+        hourgam[0] = negf;
+  }
+  else hourgam[0] = posf;
+  hourmodx = SumOverNodes(hourmodx);
+  hourmody = SumOverNodes(hourmody);
+  hourmodz = SumOverNodes(hourmodz);
+  hourgam[0] -= volinv*(dvdxn*hourmodx + dvdyn*hourmody + dvdzn*hourmodz);
+
+
+  hourmodx=xn; hourmody=yn; hourmodz=zn;
+  if (node==1 || node==2 || node==4 || node==7) {
+        hourmodx *= negf; hourmody *= negf; hourmodz *= negf;
+        hourgam[1] = negf;
+  }
+  else hourgam[1] = posf;
+  hourmodx = SumOverNodes(hourmodx);
+  hourmody = SumOverNodes(hourmody);
+  hourmodz = SumOverNodes(hourmodz);
+  hourgam[1] -= volinv*(dvdxn*hourmodx + dvdyn*hourmody + dvdzn*hourmodz);
+
+
+  hourmodx=xn; hourmody=yn; hourmodz=zn;
+  if (node==1 || node==3 || node==5 || node==7) {
+        hourmodx *= negf; hourmody *= negf; hourmodz *= negf;
+        hourgam[2] = negf;
+  }
+  else hourgam[2] = posf;
+  hourmodx = SumOverNodes(hourmodx);
+  hourmody = SumOverNodes(hourmody);
+  hourmodz = SumOverNodes(hourmodz);
+  hourgam[2] -= volinv*(dvdxn*hourmodx + dvdyn*hourmody + dvdzn*hourmodz);
+
+
+  hourmodx=xn; hourmody=yn; hourmodz=zn;
+  if (node==0 || node==2 || node==5 || node==7) {
+        hourmodx *= negf; hourmody *= negf; hourmodz *= negf;
+        hourgam[3] = negf;
+  }
+  else hourgam[3] = posf;
+  hourmodx = SumOverNodes(hourmodx);
+  hourmody = SumOverNodes(hourmody);
+  hourmodz = SumOverNodes(hourmodz);
+  hourgam[3] -= volinv*(dvdxn*hourmodx + dvdyn*hourmody + dvdzn*hourmodz);
+
+
+  /* compute forces */
+  /* store forces into h arrays (force arrays) */
+
+  ss1=ss[elem];
+  mass1=elemMass[elem];
+  volume13=CBRT(determ[elem]);
+
+  Index_t ni = nodelist[elem+numElem*node];
+  xd1=xd[ni]; yd1=yd[ni]; zd1=zd[ni];
+
+  coefficient = - hourg * Real_t(0.01) * ss1 * mass1 / volume13;
+
+  CalcElemFBHourglassForce(xd1,yd1,zd1,hourgam,coefficient,hgfx,hgfy,hgfz);
+#else
+  hgfx=xn+dvdxn; hgfy=yn+dvdyn; hgfz=zn+dvdzn;
+#endif
+#if 1
+  fx_elem[elem+numElem*node]=hgfx; fy_elem[elem+numElem*node]=hgfy; fz_elem[elem+numElem*node]=hgfz;
+#else
+  fx_elem[0]=hgfx; fy_elem[0]=hgfy; fz_elem[0]=hgfz;
+#endif
+}
+
+
+static inline
+void CalcFBHourglassForceForElems_cpu(Real_t *determ,
+                                      Real_t *x8n,      Real_t *y8n,      Real_t *z8n,
+                                      Real_t *dvdx,     Real_t *dvdy,     Real_t *dvdz,
+                                      Real_t hourg)
+{
+  /*************************************************
+   *
+   *     FUNCTION: Calculates the Flanagan-Belytschko anti-hourglass
+   *               force.
+   *
+   *************************************************/
+
+  Index_t numElem = mesh.numElem() ;
+
+  Real_t hgfx[8], hgfy[8], hgfz[8] ;
+
+  Real_t coefficient;
+
+  Real_t  gamma[4][8];
+  Real_t hourgam0[4], hourgam1[4], hourgam2[4], hourgam3[4] ;
+  Real_t hourgam4[4], hourgam5[4], hourgam6[4], hourgam7[4];
+  Real_t xd1[8], yd1[8], zd1[8] ;
+
+  gamma[0][0] = Real_t( 1.);
+  gamma[0][1] = Real_t( 1.);
+  gamma[0][2] = Real_t(-1.);
+  gamma[0][3] = Real_t(-1.);
+  gamma[0][4] = Real_t(-1.);
+  gamma[0][5] = Real_t(-1.);
+  gamma[0][6] = Real_t( 1.);
+  gamma[0][7] = Real_t( 1.);
+  gamma[1][0] = Real_t( 1.);
+  gamma[1][1] = Real_t(-1.);
+  gamma[1][2] = Real_t(-1.);
+  gamma[1][3] = Real_t( 1.);
+  gamma[1][4] = Real_t(-1.);
+  gamma[1][5] = Real_t( 1.);
+  gamma[1][6] = Real_t( 1.);
+  gamma[1][7] = Real_t(-1.);
+  gamma[2][0] = Real_t( 1.);
+  gamma[2][1] = Real_t(-1.);
+  gamma[2][2] = Real_t( 1.);
+  gamma[2][3] = Real_t(-1.);
+  gamma[2][4] = Real_t( 1.);
+  gamma[2][5] = Real_t(-1.);
+  gamma[2][6] = Real_t( 1.);
+  gamma[2][7] = Real_t(-1.);
+  gamma[3][0] = Real_t(-1.);
+  gamma[3][1] = Real_t( 1.);
+  gamma[3][2] = Real_t(-1.);
+  gamma[3][3] = Real_t( 1.);
+  gamma[3][4] = Real_t( 1.);
+  gamma[3][5] = Real_t(-1.);
+  gamma[3][6] = Real_t( 1.);
+  gamma[3][7] = Real_t(-1.);
+
+  /*************************************************/
+  /*    compute the hourglass modes */
+
+
+  for(Index_t i2=0;i2<numElem;++i2){
+        Index_t i3=8*i2;
+        Real_t volinv=Real_t(1.0)/determ[i2];
+        Real_t ss1, mass1, volume13 ;
+        for(Index_t i1=0;i1<4;++i1){
+
+          Real_t hourmodx =
+                x8n[i3] * gamma[i1][0] + x8n[i3+1] * gamma[i1][1] +
+                x8n[i3+2] * gamma[i1][2] + x8n[i3+3] * gamma[i1][3] +
+                x8n[i3+4] * gamma[i1][4] + x8n[i3+5] * gamma[i1][5] +
+                x8n[i3+6] * gamma[i1][6] + x8n[i3+7] * gamma[i1][7];
+
+          Real_t hourmody =
+                y8n[i3] * gamma[i1][0] + y8n[i3+1] * gamma[i1][1] +
+                y8n[i3+2] * gamma[i1][2] + y8n[i3+3] * gamma[i1][3] +
+                y8n[i3+4] * gamma[i1][4] + y8n[i3+5] * gamma[i1][5] +
+                y8n[i3+6] * gamma[i1][6] + y8n[i3+7] * gamma[i1][7];
+
+          Real_t hourmodz =
+                z8n[i3] * gamma[i1][0] + z8n[i3+1] * gamma[i1][1] +
+                z8n[i3+2] * gamma[i1][2] + z8n[i3+3] * gamma[i1][3] +
+                z8n[i3+4] * gamma[i1][4] + z8n[i3+5] * gamma[i1][5] +
+                z8n[i3+6] * gamma[i1][6] + z8n[i3+7] * gamma[i1][7];
+
+          hourgam0[i1] = gamma[i1][0] -  volinv*(dvdx[i3  ] * hourmodx +
+                                                 dvdy[i3  ] * hourmody +
+                                                 dvdz[i3  ] * hourmodz );
+
+          hourgam1[i1] = gamma[i1][1] -  volinv*(dvdx[i3+1] * hourmodx +
+                                                 dvdy[i3+1] * hourmody +
+                                                 dvdz[i3+1] * hourmodz );
+
+          hourgam2[i1] = gamma[i1][2] -  volinv*(dvdx[i3+2] * hourmodx +
+                                                 dvdy[i3+2] * hourmody +
+                                                 dvdz[i3+2] * hourmodz );
+
+          hourgam3[i1] = gamma[i1][3] -  volinv*(dvdx[i3+3] * hourmodx +
+                                                 dvdy[i3+3] * hourmody +
+                                                 dvdz[i3+3] * hourmodz );
+
+          hourgam4[i1] = gamma[i1][4] -  volinv*(dvdx[i3+4] * hourmodx +
+                                                 dvdy[i3+4] * hourmody +
+                                                 dvdz[i3+4] * hourmodz );
+
+          hourgam5[i1] = gamma[i1][5] -  volinv*(dvdx[i3+5] * hourmodx +
+                                                 dvdy[i3+5] * hourmody +
+                                                 dvdz[i3+5] * hourmodz );
+
+          hourgam6[i1] = gamma[i1][6] -  volinv*(dvdx[i3+6] * hourmodx +
+                                                 dvdy[i3+6] * hourmody +
+                                                 dvdz[i3+6] * hourmodz );
+
+          hourgam7[i1] = gamma[i1][7] -  volinv*(dvdx[i3+7] * hourmodx +
+                                                 dvdy[i3+7] * hourmody +
+                                                 dvdz[i3+7] * hourmodz );
+
+        }
+
+        /* compute forces */
+        /* store forces into h arrays (force arrays) */
+
+        ss1=mesh.ss(i2);
+        mass1=mesh.elemMass(i2);
+        volume13=CBRT(determ[i2]);
+
+        Index_t n0si2 = mesh.nodelist(i2,0);
+        Index_t n1si2 = mesh.nodelist(i2,1);
+        Index_t n2si2 = mesh.nodelist(i2,2);
+        Index_t n3si2 = mesh.nodelist(i2,3);
+        Index_t n4si2 = mesh.nodelist(i2,4);
+        Index_t n5si2 = mesh.nodelist(i2,5);
+        Index_t n6si2 = mesh.nodelist(i2,6);
+        Index_t n7si2 = mesh.nodelist(i2,7);
+
+        xd1[0] = mesh.xd(n0si2);
+        xd1[1] = mesh.xd(n1si2);
+        xd1[2] = mesh.xd(n2si2);
+        xd1[3] = mesh.xd(n3si2);
+        xd1[4] = mesh.xd(n4si2);
+        xd1[5] = mesh.xd(n5si2);
+        xd1[6] = mesh.xd(n6si2);
+        xd1[7] = mesh.xd(n7si2);
+
+        yd1[0] = mesh.yd(n0si2);
+        yd1[1] = mesh.yd(n1si2);
+        yd1[2] = mesh.yd(n2si2);
+        yd1[3] = mesh.yd(n3si2);
+        yd1[4] = mesh.yd(n4si2);
+        yd1[5] = mesh.yd(n5si2);
+        yd1[6] = mesh.yd(n6si2);
+        yd1[7] = mesh.yd(n7si2);
+
+        zd1[0] = mesh.zd(n0si2);
+        zd1[1] = mesh.zd(n1si2);
+        zd1[2] = mesh.zd(n2si2);
+        zd1[3] = mesh.zd(n3si2);
+        zd1[4] = mesh.zd(n4si2);
+        zd1[5] = mesh.zd(n5si2);
+        zd1[6] = mesh.zd(n6si2);
+        zd1[7] = mesh.zd(n7si2);
+
+        coefficient = - hourg * Real_t(0.01) * ss1 * mass1 / volume13;
+
+        CalcElemFBHourglassForce(xd1,yd1,zd1,
+                                 hourgam0,hourgam1,hourgam2,hourgam3,
+                                 hourgam4,hourgam5,hourgam6,hourgam7,
+                                 coefficient, hgfx, hgfy, hgfz);
+
+        mesh.fx(n0si2) += hgfx[0];
+        mesh.fy(n0si2) += hgfy[0];
+        mesh.fz(n0si2) += hgfz[0];
+
+        mesh.fx(n1si2) += hgfx[1];
+        mesh.fy(n1si2) += hgfy[1];
+        mesh.fz(n1si2) += hgfz[1];
+
+        mesh.fx(n2si2) += hgfx[2];
+        mesh.fy(n2si2) += hgfy[2];
+        mesh.fz(n2si2) += hgfz[2];
+
+        mesh.fx(n3si2) += hgfx[3];
+        mesh.fy(n3si2) += hgfy[3];
+        mesh.fz(n3si2) += hgfz[3];
+
+        mesh.fx(n4si2) += hgfx[4];
+        mesh.fy(n4si2) += hgfy[4];
+        mesh.fz(n4si2) += hgfz[4];
+
+        mesh.fx(n5si2) += hgfx[5];
+        mesh.fy(n5si2) += hgfy[5];
+        mesh.fz(n5si2) += hgfz[5];
+
+        mesh.fx(n6si2) += hgfx[6];
+        mesh.fy(n6si2) += hgfy[6];
+        mesh.fz(n6si2) += hgfz[6];
+
+        mesh.fx(n7si2) += hgfx[7];
+        mesh.fy(n7si2) += hgfy[7];
+        mesh.fz(n7si2) += hgfz[7];
+  }
+}
+
+static inline
+void CalcFBHourglassForceForElems_gpu(Real_t *determ,
+                                      Real_t *x8n,      Real_t *y8n,      Real_t *z8n,
+                                      Real_t *dvdx,     Real_t *dvdy,     Real_t *dvdz,
+                                      Real_t hourg)
+{
+  Index_t numElem = mesh.numElem();
+  Real_t *fx_elem,*fy_elem,*fz_elem;
+
+  HIP( hipMalloc(&fx_elem,numElem*8*sizeof(Real_t)) );
+  HIP( hipMalloc(&fy_elem,numElem*8*sizeof(Real_t)) );
+  HIP( hipMalloc(&fz_elem,numElem*8*sizeof(Real_t)) );
+
+  dim3 dimBlock=dim3(256,1,1);
+  dim3 dimGrid=dim3(PAD_DIV(numElem*8,dimBlock.x),1,1);
+  hipLaunchKernelGGL((CalcFBHourglassForceForElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0,
+                     determ,x8n,y8n,z8n,dvdx,dvdy,dvdz,hourg,
+                     numElem,meshGPU.m_nodelist,
+                     meshGPU.m_ss,meshGPU.m_elemMass,
+                     meshGPU.m_xd,meshGPU.m_yd,meshGPU.m_zd,
+                     fx_elem,fy_elem,fz_elem);
+  HIP_DEBUGSYNC;
+
+  dimGrid=dim3(PAD_DIV(mesh.numNode(),dimBlock.x),1,1);
+  hipLaunchKernelGGL((AddNodeForcesFromElems2_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, mesh.numNode(),meshGPU.m_nodeElemCount,meshGPU.m_nodeElemCornerList,
+         fx_elem,fy_elem,fz_elem,meshGPU.m_fx,meshGPU.m_fy,meshGPU.m_fz);
+  HIP_DEBUGSYNC;
+
+  HIP( hipFree(fx_elem) );
+  HIP( hipFree(fy_elem) );
+  HIP( hipFree(fz_elem) );
+}
+
+
+__global__
+void CalcHourglassControlForElems_kernel(Int_t numElem,Index_t *nodelist,
+                                         Real_t *x,Real_t *y,Real_t *z,
+                                         Real_t *determ,Real_t *volo,Real_t *v,
+                                         Real_t *dvdx,Real_t *dvdy,Real_t *dvdz,
+                                         Real_t *x8n,Real_t *y8n,Real_t *z8n)
+{
+  Real_t  x1,y1,z1;
+  Real_t pfx,pfy,pfz;
+
+  // THESE ARE ALL GIVING ME DIFFERENT ANSWERS IN CUDA 4.0 !!?!!?!!
+  unsigned int tid=hipThreadIdx_x;
+  unsigned int bid=hipBlockIdx_x;
+#if 0
+  unsigned int node=tid>>5;
+  unsigned int elem=bid<<5 + (tid - (node<<5));
+#elif 1
+  unsigned int node=tid/32;
+  unsigned int elem=bid*32 + (tid-node*32);
+#elif 0
+  unsigned int node=tid/32;;
+  unsigned int elem=bid*32 + (tid & 0x1F);
+#elif 0
+  unsigned int node=tid/32;
+  unsigned int elem=bid<<5 + (tid & 0x1F);
+#elif 0
+  unsigned int node=tid>>5;
+  unsigned int elem=bid*32 + (tid & 0x1F);
+#else
+  unsigned int node=tid>>5;
+  unsigned int elem=bid<<5 + (tid & 0x1F);
+#endif
+
+  if (elem>=numElem) elem=numElem-1; // don't return -- need thread to participate in sync operations
+
+  Index_t idx=elem+numElem*node;
+
+  Index_t ni = nodelist[idx];
+  x1=x[ni]; y1=y[ni]; z1=z[ni];
+
+  CalcElemVolumeDerivative(pfx, pfy, pfz, x1, y1, z1, node);
+
+  /* load into temporary storage for FB Hour Glass control */
+
+  dvdx[idx] = pfx;
+  dvdy[idx] = pfy;
+  dvdz[idx] = pfz;
+  x8n[idx]  = x1;
+  y8n[idx]  = y1;
+  z8n[idx]  = z1;
+
+  //if (node==0)
+  determ[elem] = volo[elem] * v[elem];
+
+#if 0 // JDC
+  /* Do a check for negative volumes */
+  if ( mesh.v(i) <= Real_t(0.0) ) {
+        exit(VolumeError) ;
+  }
+#endif
+}
+
+
+static inline
+void CalcHourglassControlForElems_gpu(Real_t determ[], Real_t hgcoef)
+{
+  Index_t numElem = mesh.numElem() ;
+  Index_t numElem8 = numElem * 8 ;
+  Real_t *dvdx,*dvdy,*dvdz;
+  Real_t *x8n,*y8n,*z8n;
+
+  HIP( hipMalloc(&dvdx,sizeof(Real_t)*numElem8) );
+  HIP( hipMalloc(&dvdy,sizeof(Real_t)*numElem8) );
+  HIP( hipMalloc(&dvdz,sizeof(Real_t)*numElem8) );
+  HIP( hipMalloc(&x8n,sizeof(Real_t)*numElem8) );
+  HIP( hipMalloc(&y8n,sizeof(Real_t)*numElem8) );
+  HIP( hipMalloc(&z8n,sizeof(Real_t)*numElem8) );
+
+  dim3 dimBlock=dim3(256,1,1);
+  dim3 dimGrid=dim3(PAD_DIV(numElem*8,dimBlock.x),1,1);
+  hipLaunchKernelGGL((CalcHourglassControlForElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, numElem, meshGPU.m_nodelist,
+         meshGPU.m_x,meshGPU.m_y,meshGPU.m_z,
+         determ,meshGPU.m_volo,meshGPU.m_v,
+         dvdx,dvdy,dvdz,x8n,y8n,z8n);
+  HIP_DEBUGSYNC;
+
+  // JDC -- need a reduction to check for negative volumes
+
+  if ( hgcoef > Real_t(0.) ) {
+        CalcFBHourglassForceForElems_gpu(determ,x8n,y8n,z8n,dvdx,dvdy,dvdz,hgcoef) ;
+  }
+
+  HIP( hipFree(dvdx) );
+  HIP( hipFree(dvdy) );
+  HIP( hipFree(dvdz) );
+  HIP( hipFree(x8n) );
+  HIP( hipFree(y8n) );
+  HIP( hipFree(z8n) );
+
+  return ;
+}
+
+
+static inline
+void CalcHourglassControlForElems_cpu(Real_t determ[], Real_t hgcoef)
+{
+  Index_t i, ii, jj ;
+  Real_t  x1[8],  y1[8],  z1[8] ;
+  Real_t pfx[8], pfy[8], pfz[8] ;
+  Index_t numElem = mesh.numElem() ;
+  Index_t numElem8 = numElem * 8 ;
+  Real_t *dvdx = Allocate<Real_t>(numElem8) ;
+  Real_t *dvdy = Allocate<Real_t>(numElem8) ;
+  Real_t *dvdz = Allocate<Real_t>(numElem8) ;
+  Real_t *x8n  = Allocate<Real_t>(numElem8) ;
+  Real_t *y8n  = Allocate<Real_t>(numElem8) ;
+  Real_t *z8n  = Allocate<Real_t>(numElem8) ;
+
+  /* start loop over elements */
+  for (i=0 ; i<numElem ; ++i){
+
+        CollectDomainNodesToElemNodes(i, x1, y1, z1);
+
+        CalcElemVolumeDerivative(pfx, pfy, pfz, x1, y1, z1);
+
+        /* load into temporary storage for FB Hour Glass control */
+        for(ii=0;ii<8;++ii){
+          jj=8*i+ii;
+
+          dvdx[jj] = pfx[ii];
+          dvdy[jj] = pfy[ii];
+          dvdz[jj] = pfz[ii];
+          x8n[jj]  = x1[ii];
+          y8n[jj]  = y1[ii];
+          z8n[jj]  = z1[ii];
+        }
+
+        determ[i] = mesh.volo(i) * mesh.v(i);
+
+        /* Do a check for negative volumes */
+        if ( mesh.v(i) <= Real_t(0.0) ) {
+          exit(VolumeError) ;
+        }
+  }
+
+  if ( hgcoef > Real_t(0.) ) {
+        CalcFBHourglassForceForElems_cpu(determ,x8n,y8n,z8n,dvdx,dvdy,dvdz,hgcoef) ;
+  }
+
+  Release(&z8n) ;
+  Release(&y8n) ;
+  Release(&x8n) ;
+  Release(&dvdz) ;
+  Release(&dvdy) ;
+  Release(&dvdx) ;
+
+  return ;
+}
+
+
+static inline
+void CalcHourglassControlForElems(Real_t determ[], Real_t hgcoef, int useCPU)
+{
+  if (useCPU) {
+        FC(x); FC(y); FC(z); FC(xd); FC(yd); FC(zd);
+        FC(nodelist); FC(ss); FC(elemMass);
+        FC(xd); FC(yd); FC(zd);
+        FC(fx); FC(fy); FC(fz);
+        CalcHourglassControlForElems_cpu(determ,hgcoef);
+        SG(fx); SG(fy); SG(fz);
+  }
+  else {
+        FG(x); FG(y); FG(z); FG(xd); FG(yd); FG(zd);
+        FG(nodelist); FG(ss); FG(elemMass);
+        FG(xd); FG(yd); FG(zd);
+        FG(fx); FG(fy); FG(fz);
+        CalcHourglassControlForElems_gpu(determ,hgcoef);
+        SC(fx); SC(fy); SC(fz);
+  }
+}
+
+
+static inline
+void CalcVolumeForceForElems_gpu()
+{
+  Index_t numElem = mesh.numElem() ;
+  if (numElem != 0) {
+        Real_t  hgcoef = mesh.hgcoef() ;
+        Real_t *sigxx, *sigyy, *sigzz, *determ;
+        int badvol;
+
+        HIP( hipMalloc(&sigxx,numElem*sizeof(Real_t)) );
+        HIP( hipMalloc(&sigyy,numElem*sizeof(Real_t)) );
+        HIP( hipMalloc(&sigzz,numElem*sizeof(Real_t)) );
+        HIP( hipMalloc(&determ,numElem*sizeof(Real_t)) );
+
+        /* Sum contributions to total stress tensor */
+        InitStressTermsForElems(numElem, sigxx, sigyy, sigzz, 0);
+
+        // call elemlib stress integration loop to produce nodal forces from
+        // material stresses.
+        IntegrateStressForElems( numElem, sigxx, sigyy, sigzz, determ, badvol, 0) ;
+
+        HIP( hipFree(sigxx) );
+        HIP( hipFree(sigyy) );
+        HIP( hipFree(sigzz) );
+
+        // check for negative element volume
+        if (badvol) exit(VolumeError) ;
+
+        CalcHourglassControlForElems(determ, hgcoef, 0) ;
+
+        HIP( hipFree(determ) );
+  }
+}
+
+
+static inline
+void CalcVolumeForceForElems_cpu()
+{
+  Index_t numElem = mesh.numElem() ;
+  if (numElem != 0) {
+        Real_t  hgcoef = mesh.hgcoef() ;
+        Real_t *sigxx  = Allocate<Real_t>(numElem) ;
+        Real_t *sigyy  = Allocate<Real_t>(numElem) ;
+        Real_t *sigzz  = Allocate<Real_t>(numElem) ;
+        Real_t *determ = Allocate<Real_t>(numElem) ;
+        int badvol;
+
+        /* Sum contributions to total stress tensor */
+        InitStressTermsForElems(numElem, sigxx, sigyy, sigzz, 1);
+
+        // call elemlib stress integration loop to produce nodal forces from
+        // material stresses.
+        IntegrateStressForElems( numElem, sigxx, sigyy, sigzz, determ, badvol, 1) ;
+
+        Release(&sigzz) ;
+        Release(&sigyy) ;
+        Release(&sigxx) ;
+
+        // check for negative element volume
+        if (badvol) exit(VolumeError);
+#if 0
+        for ( Index_t k=0 ; k<numElem ; ++k ) {
+          if (determ[k] <= Real_t(0.0)) {
+                exit(VolumeError) ;
+          }
+        }
+#endif
+
+        CalcHourglassControlForElems(determ, hgcoef, 1) ;
+
+        Release(&determ) ;
+  }
+}
+
+static inline void CalcForceForNodes_gpu()
+{
+  /* Calcforce calls partial, force, hourq */
+  CalcVolumeForceForElems_gpu() ;
+
+  /* Calculate Nodal Forces at domain boundaries */
+  /* problem->commSBN->Transfer(CommSBN::forces); */
+
+
+}
+
+static inline void CalcForceForNodes_cpu()
+{
+  Index_t numNode = mesh.numNode() ;
+  for (Index_t i=0; i<numNode; ++i) {
+        mesh.fx(i) = Real_t(0.0) ;
+        mesh.fy(i) = Real_t(0.0) ;
+        mesh.fz(i) = Real_t(0.0) ;
+  }
+
+  /* Calcforce calls partial, force, hourq */
+  CalcVolumeForceForElems_cpu() ;
+
+  /* Calculate Nodal Forces at domain boundaries */
+  /* problem->commSBN->Transfer(CommSBN::forces); */
+
+}
+
+static inline void CalcForceForNodes(int useCPU)
+{
+  if (useCPU) {
+        CalcForceForNodes_cpu();
+  }
+  else {
+        CalcForceForNodes_gpu();
+  }
+}
+
+__global__
+void CalcAccelerationForNodes_kernel(int numNode,
+                                     Real_t *xdd, Real_t *ydd, Real_t *zdd,
+                                     Real_t *fx, Real_t *fy, Real_t *fz,
+                                     Real_t *nodalMass)
+{
+  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
+  if (i<numNode) {
+        xdd[i]=fx[i]/nodalMass[i];
+        ydd[i]=fy[i]/nodalMass[i];
+        zdd[i]=fz[i]/nodalMass[i];
+  }
+}
+
+static inline
+void CalcAccelerationForNodes_gpu()
+{
+  dim3 dimBlock = dim3(BLOCKSIZE,1,1);
+  dim3 dimGrid = dim3(PAD_DIV(mesh.numNode(),dimBlock.x),1,1);
+  hipLaunchKernelGGL((CalcAccelerationForNodes_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, mesh.numNode(),
+         meshGPU.m_xdd,meshGPU.m_ydd,meshGPU.m_zdd,
+         meshGPU.m_fx,meshGPU.m_fy,meshGPU.m_fz,
+         meshGPU.m_nodalMass);
+  HIP_DEBUGSYNC;
+}
+
+
+static inline
+void CalcAccelerationForNodes_cpu()
+{
+  Index_t numNode = mesh.numNode() ;
+  for (Index_t i = 0; i < numNode; ++i) {
+        mesh.xdd(i) = mesh.fx(i) / mesh.nodalMass(i);
+        mesh.ydd(i) = mesh.fy(i) / mesh.nodalMass(i);
+        mesh.zdd(i) = mesh.fz(i) / mesh.nodalMass(i);
+  }
+}
+
+static inline
+void CalcAccelerationForNodes(int useCPU)
+{
+  if (useCPU) {
+        FC(fx); FC(fy); FC(fz); FC(nodalMass);
+        CalcAccelerationForNodes_cpu();
+        SG(xdd); SG(ydd); SG(zdd);
+  }
+  else {
+        FG(fx); FG(fy); FG(fz); FG(nodalMass);
+        CalcAccelerationForNodes_gpu();
+        SC(xdd); SC(ydd); SC(zdd);
+  }
+}
+
+__global__
+void ApplyAccelerationBoundaryConditionsForNodes_kernel(
+                                                        int numNodeBC, Real_t *xdd, Real_t *ydd, Real_t *zdd,
+                                                        Index_t *symmX, Index_t *symmY, Index_t *symmZ)
+{
+  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
+  if (i<numNodeBC) {
+        xdd[symmX[i]] = Real_t(0.0) ;
+        ydd[symmY[i]] = Real_t(0.0) ;
+        zdd[symmZ[i]] = Real_t(0.0) ;
+  }
+}
+
+static inline
+void ApplyAccelerationBoundaryConditionsForNodes_gpu()
+{
+  Index_t numNodeBC = (mesh.sizeX()+1)*(mesh.sizeX()+1) ;
+  dim3 dimBlock(BLOCKSIZE,1,1);
+  dim3 dimGrid(PAD_DIV(numNodeBC,dimBlock.x),1,1);
+  hipLaunchKernelGGL((ApplyAccelerationBoundaryConditionsForNodes_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, numNodeBC,
+         meshGPU.m_xdd,meshGPU.m_ydd,meshGPU.m_zdd,
+         meshGPU.m_symmX,meshGPU.m_symmY,meshGPU.m_symmZ);
+  HIP_DEBUGSYNC;
+}
+
+static inline
+void ApplyAccelerationBoundaryConditionsForNodes_cpu()
+{
+  Index_t numNodeBC = (mesh.sizeX()+1)*(mesh.sizeX()+1) ;
+  for(Index_t i=0 ; i<numNodeBC ; ++i)
+        mesh.xdd(mesh.symmX(i)) = Real_t(0.0) ;
+
+  for(Index_t i=0 ; i<numNodeBC ; ++i)
+        mesh.ydd(mesh.symmY(i)) = Real_t(0.0) ;
+
+  for(Index_t i=0 ; i<numNodeBC ; ++i)
+        mesh.zdd(mesh.symmZ(i)) = Real_t(0.0) ;
+}
+
+static inline
+void ApplyAccelerationBoundaryConditionsForNodes(int useCPU)
+{
+  if (useCPU) {
+        FC(xdd); FC(ydd); FC(zdd); FC(symmX); FC(symmY); FC(symmZ);
+        ApplyAccelerationBoundaryConditionsForNodes_cpu();
+        SG(xdd); SG(ydd); SG(zdd);
+  }
+  else {
+        FG(xdd); FG(ydd); FG(zdd); FG(symmX); FG(symmY); FG(symmZ);
+        ApplyAccelerationBoundaryConditionsForNodes_gpu();
+        SC(xdd); SC(ydd); SC(zdd);
+  }
+}
+
+
+__global__
+void CalcVelocityForNodes_kernel(int numNode, const Real_t dt, const Real_t u_cut,
+                                 Real_t *xd, Real_t *yd, Real_t *zd,
+                                 Real_t *xdd, Real_t *ydd, Real_t *zdd)
+{
+  int i = hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
+  if (i<numNode) {
+        Real_t xdtmp, ydtmp, zdtmp ;
+
+        xdtmp = xd[i] + xdd[i] * dt ;
+        if( FABS(xdtmp) < u_cut ) xdtmp = 0.0;//Real_t(0.0);
+        xd[i] = xdtmp ;
+
+        ydtmp = yd[i] + ydd[i] * dt ;
+        if( FABS(ydtmp) < u_cut ) ydtmp = Real_t(0.0);
+        yd[i] = ydtmp ;
+
+        zdtmp = zd[i] + zdd[i] * dt ;
+        if( FABS(zdtmp) < u_cut ) zdtmp = Real_t(0.0);
+        zd[i] = zdtmp ;
+  }
+}
+
+static inline
+void CalcVelocityForNodes_gpu(const Real_t dt, const Real_t u_cut)
+{
+  dim3 dimBlock(BLOCKSIZE,1,1);
+  dim3 dimGrid(PAD_DIV(mesh.numNode(),dimBlock.x),1,1);
+  hipLaunchKernelGGL((CalcVelocityForNodes_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, mesh.numNode(),dt,u_cut,
+         meshGPU.m_xd,meshGPU.m_yd,meshGPU.m_zd,
+         meshGPU.m_xdd,meshGPU.m_ydd,meshGPU.m_zdd);
+  HIP_DEBUGSYNC;
+}
+
+static inline
+void CalcVelocityForNodes_cpu(const Real_t dt, const Real_t u_cut)
+{
+  Index_t numNode = mesh.numNode() ;
+
+  for ( Index_t i = 0 ; i < numNode ; ++i )
+  {
+          Real_t xdtmp, ydtmp, zdtmp ;
+
+          xdtmp = mesh.xd(i) + mesh.xdd(i) * dt ;
+          if( FABS(xdtmp) < u_cut ) xdtmp = Real_t(0.0);
+          mesh.xd(i) = xdtmp ;
+
+          ydtmp = mesh.yd(i) + mesh.ydd(i) * dt ;
+          if( FABS(ydtmp) < u_cut ) ydtmp = Real_t(0.0);
+          mesh.yd(i) = ydtmp ;
+
+          zdtmp = mesh.zd(i) + mesh.zdd(i) * dt ;
+          if( FABS(zdtmp) < u_cut ) zdtmp = Real_t(0.0);
+          mesh.zd(i) = zdtmp ;
+  }
+}
+
+static inline
+void CalcVelocityForNodes(const Real_t dt, const Real_t u_cut, int useCPU)
+{
+  if (useCPU) {
+        FC(xd); FC(yd); FC(zd); FC(xdd); FC(ydd); FC(zdd);
+        CalcVelocityForNodes_cpu(dt,u_cut);
+        SG(xd); SG(yd); SG(zd);
+  }
+  else {
+        FG(xd); FG(yd); FG(zd); FG(xdd); FG(ydd); FG(zdd);
+        CalcVelocityForNodes_gpu(dt,u_cut);
+        SC(xd); SC(yd); SC(zd);
+  }
+}
+
+__global__
+void CalcPositionForNodes_kernel(int numNode, Real_t dt,
+                                 Real_t *x, Real_t *y, Real_t *z,
+                                 Real_t *xd, Real_t *yd, Real_t *zd)
+{
+  int i = hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
+  if (i<numNode) {
+        x[i] += xd[i] * dt;
+        y[i] += yd[i] * dt;
+        z[i] += zd[i] * dt;
+  }
+}
+
+static inline
+void CalcPositionForNodes_gpu(const Real_t dt)
+{
+  dim3 dimBlock(BLOCKSIZE,1,1);
+  dim3 dimGrid(PAD_DIV(mesh.numNode(),dimBlock.x),1,1);
+  hipLaunchKernelGGL((CalcPositionForNodes_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, mesh.numNode(),dt,meshGPU.m_x,meshGPU.m_y,meshGPU.m_z,meshGPU.m_xd,meshGPU.m_yd,meshGPU.m_zd);
+  HIP_DEBUGSYNC;
+}
+
+static inline
+void CalcPositionForNodes_cpu(const Real_t dt)
+{
+  Index_t numNode = mesh.numNode() ;
+
+  for ( Index_t i = 0 ; i < numNode ; ++i )
+  {
+          mesh.x(i) += mesh.xd(i) * dt ;
+          mesh.y(i) += mesh.yd(i) * dt ;
+          mesh.z(i) += mesh.zd(i) * dt ;
+  }
+}
+
+static inline
+void CalcPositionForNodes(const Real_t dt,int useCPU)
+{
+  if (useCPU) {
+        FC(x); FC(y); FC(z); FC(xd); FC(yd); FC(zd);
+        CalcPositionForNodes_cpu(dt);
+        SG(x); SG(y); SG(z);
+  }
+  else {
+        FG(x); FG(y); FG(z); FG(xd); FG(yd); FG(zd);
+        CalcPositionForNodes_gpu(dt);
+        SC(x); SC(y); SC(z);
+  }
+}
+
+static inline
+void LagrangeNodal(int useCPU)
+{
+  const Real_t delt = mesh.deltatime() ;
+  Real_t u_cut = mesh.u_cut() ;
+
+  /* time of boundary condition evaluation is beginning of step for force and
+   * acceleration boundary conditions. */
+  CalcForceForNodes(/*0*/useCPU);
+
+  CalcAccelerationForNodes(useCPU);
+
+  ApplyAccelerationBoundaryConditionsForNodes(useCPU);
+
+  CalcVelocityForNodes( delt, u_cut, useCPU ) ;
+
+  CalcPositionForNodes( delt, useCPU );
+
+  return;
+}
+
+__host__ __device__
+static inline
+Real_t CalcElemVolume( const Real_t x0, const Real_t x1,
+                       const Real_t x2, const Real_t x3,
+                       const Real_t x4, const Real_t x5,
+                       const Real_t x6, const Real_t x7,
+                       const Real_t y0, const Real_t y1,
+                       const Real_t y2, const Real_t y3,
+                       const Real_t y4, const Real_t y5,
+                       const Real_t y6, const Real_t y7,
+                       const Real_t z0, const Real_t z1,
+                       const Real_t z2, const Real_t z3,
+                       const Real_t z4, const Real_t z5,
+                       const Real_t z6, const Real_t z7 )
+{
+  Real_t twelveth = Real_t(1.0)/Real_t(12.0);
+
+  Real_t dx61 = x6 - x1;
+  Real_t dy61 = y6 - y1;
+  Real_t dz61 = z6 - z1;
+
+  Real_t dx70 = x7 - x0;
+  Real_t dy70 = y7 - y0;
+  Real_t dz70 = z7 - z0;
+
+  Real_t dx63 = x6 - x3;
+  Real_t dy63 = y6 - y3;
+  Real_t dz63 = z6 - z3;
+
+  Real_t dx20 = x2 - x0;
+  Real_t dy20 = y2 - y0;
+  Real_t dz20 = z2 - z0;
+
+  Real_t dx50 = x5 - x0;
+  Real_t dy50 = y5 - y0;
+  Real_t dz50 = z5 - z0;
+
+  Real_t dx64 = x6 - x4;
+  Real_t dy64 = y6 - y4;
+  Real_t dz64 = z6 - z4;
+
+  Real_t dx31 = x3 - x1;
+  Real_t dy31 = y3 - y1;
+  Real_t dz31 = z3 - z1;
+
+  Real_t dx72 = x7 - x2;
+  Real_t dy72 = y7 - y2;
+  Real_t dz72 = z7 - z2;
+
+  Real_t dx43 = x4 - x3;
+  Real_t dy43 = y4 - y3;
+  Real_t dz43 = z4 - z3;
+
+  Real_t dx57 = x5 - x7;
+  Real_t dy57 = y5 - y7;
+  Real_t dz57 = z5 - z7;
+
+  Real_t dx14 = x1 - x4;
+  Real_t dy14 = y1 - y4;
+  Real_t dz14 = z1 - z4;
+
+  Real_t dx25 = x2 - x5;
+  Real_t dy25 = y2 - y5;
+  Real_t dz25 = z2 - z5;
+
+#define TRIPLE_PRODUCT(x1, y1, z1, x2, y2, z2, x3, y3, z3) \
+  ((x1)*((y2)*(z3) - (z2)*(y3)) + (x2)*((z1)*(y3) - (y1)*(z3)) + (x3)*((y1)*(z2) - (z1)*(y2)))
+
+    Real_t volume =
+          TRIPLE_PRODUCT(dx31 + dx72, dx63, dx20,
+                         dy31 + dy72, dy63, dy20,
+                         dz31 + dz72, dz63, dz20) +
+          TRIPLE_PRODUCT(dx43 + dx57, dx64, dx70,
+                         dy43 + dy57, dy64, dy70,
+                         dz43 + dz57, dz64, dz70) +
+          TRIPLE_PRODUCT(dx14 + dx25, dx61, dx50,
+                         dy14 + dy25, dy61, dy50,
+                         dz14 + dz25, dz61, dz50);
+
+        #undef TRIPLE_PRODUCT
+
+        volume *= twelveth;
+
+        return volume ;
+}
+
+__host__ __device__
+static inline
+Real_t CalcElemVolume( const Real_t x[8], const Real_t y[8], const Real_t z[8] )
+{
+  return CalcElemVolume( x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
+                         y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7],
+                         z[0], z[1], z[2], z[3], z[4], z[5], z[6], z[7]);
+}
+
+__host__ __device__
+static inline
+Real_t AreaFace( const Real_t x0, const Real_t x1,
+                 const Real_t x2, const Real_t x3,
+                 const Real_t y0, const Real_t y1,
+                 const Real_t y2, const Real_t y3,
+                 const Real_t z0, const Real_t z1,
+                 const Real_t z2, const Real_t z3)
+{
+  Real_t fx = (x2 - x0) - (x3 - x1);
+  Real_t fy = (y2 - y0) - (y3 - y1);
+  Real_t fz = (z2 - z0) - (z3 - z1);
+  Real_t gx = (x2 - x0) + (x3 - x1);
+  Real_t gy = (y2 - y0) + (y3 - y1);
+  Real_t gz = (z2 - z0) + (z3 - z1);
+     Real_t area =
+           (fx * fx + fy * fy + fz * fz) *
+           (gx * gx + gy * gy + gz * gz) -
+           (fx * gx + fy * gy + fz * gz) *
+           (fx * gx + fy * gy + fz * gz);
+         return area ;
+}
+
+__host__ __device__
+static inline
+Real_t CalcElemCharacteristicLength( const Real_t x[8],
+                                     const Real_t y[8],
+                                     const Real_t z[8],
+                                     const Real_t volume)
+{
+  Real_t a, charLength = Real_t(0.0);
+
+  a = AreaFace(x[0],x[1],x[2],x[3],
+               y[0],y[1],y[2],y[3],
+               z[0],z[1],z[2],z[3]) ;
+  charLength = FMAX(a,charLength) ;
+
+  a = AreaFace(x[4],x[5],x[6],x[7],
+               y[4],y[5],y[6],y[7],
+               z[4],z[5],z[6],z[7]) ;
+  charLength = FMAX(a,charLength) ;
+
+  a = AreaFace(x[0],x[1],x[5],x[4],
+               y[0],y[1],y[5],y[4],
+               z[0],z[1],z[5],z[4]) ;
+  charLength = FMAX(a,charLength) ;
+
+  a = AreaFace(x[1],x[2],x[6],x[5],
+               y[1],y[2],y[6],y[5],
+               z[1],z[2],z[6],z[5]) ;
+  charLength = FMAX(a,charLength) ;
+
+  a = AreaFace(x[2],x[3],x[7],x[6],
+               y[2],y[3],y[7],y[6],
+               z[2],z[3],z[7],z[6]) ;
+  charLength = FMAX(a,charLength) ;
+
+  a = AreaFace(x[3],x[0],x[4],x[7],
+               y[3],y[0],y[4],y[7],
+               z[3],z[0],z[4],z[7]) ;
+  charLength = FMAX(a,charLength) ;
+
+  charLength = Real_t(4.0) * volume / SQRT(charLength);
+
+  return charLength;
+}
+
+__host__ __device__
+static inline
+void CalcElemVelocityGradient( const Real_t* const xvel,
+                               const Real_t* const yvel,
+                               const Real_t* const zvel,
+                               const Real_t b[][8],
+                               const Real_t detJ,
+                               Real_t* const d )
+{
+  const Real_t inv_detJ = Real_t(1.0) / detJ ;
+  Real_t dyddx, dxddy, dzddx, dxddz, dzddy, dyddz;
+  const Real_t* const pfx = b[0];
+  const Real_t* const pfy = b[1];
+  const Real_t* const pfz = b[2];
+
+  d[0] = inv_detJ * ( pfx[0] * (xvel[0]-xvel[6])
+                                          + pfx[1] * (xvel[1]-xvel[7])
+                                          + pfx[2] * (xvel[2]-xvel[4])
+                                          + pfx[3] * (xvel[3]-xvel[5]) );
+
+  d[1] = inv_detJ * ( pfy[0] * (yvel[0]-yvel[6])
+                                          + pfy[1] * (yvel[1]-yvel[7])
+                                          + pfy[2] * (yvel[2]-yvel[4])
+                                          + pfy[3] * (yvel[3]-yvel[5]) );
+
+  d[2] = inv_detJ * ( pfz[0] * (zvel[0]-zvel[6])
+                                          + pfz[1] * (zvel[1]-zvel[7])
+                                          + pfz[2] * (zvel[2]-zvel[4])
+                                          + pfz[3] * (zvel[3]-zvel[5]) );
+
+  dyddx  = inv_detJ * ( pfx[0] * (yvel[0]-yvel[6])
+                                                + pfx[1] * (yvel[1]-yvel[7])
+                                                + pfx[2] * (yvel[2]-yvel[4])
+                                                + pfx[3] * (yvel[3]-yvel[5]) );
+
+  dxddy  = inv_detJ * ( pfy[0] * (xvel[0]-xvel[6])
+                                                + pfy[1] * (xvel[1]-xvel[7])
+                                                + pfy[2] * (xvel[2]-xvel[4])
+                                                + pfy[3] * (xvel[3]-xvel[5]) );
+
+  dzddx  = inv_detJ * ( pfx[0] * (zvel[0]-zvel[6])
+                                                + pfx[1] * (zvel[1]-zvel[7])
+                                                + pfx[2] * (zvel[2]-zvel[4])
+                                                + pfx[3] * (zvel[3]-zvel[5]) );
+
+  dxddz  = inv_detJ * ( pfz[0] * (xvel[0]-xvel[6])
+                                                + pfz[1] * (xvel[1]-xvel[7])
+                                                + pfz[2] * (xvel[2]-xvel[4])
+                                                + pfz[3] * (xvel[3]-xvel[5]) );
+
+  dzddy  = inv_detJ * ( pfy[0] * (zvel[0]-zvel[6])
+                                                + pfy[1] * (zvel[1]-zvel[7])
+                                                + pfy[2] * (zvel[2]-zvel[4])
+                                                + pfy[3] * (zvel[3]-zvel[5]) );
+
+  dyddz  = inv_detJ * ( pfz[0] * (yvel[0]-yvel[6])
+                                                + pfz[1] * (yvel[1]-yvel[7])
+                                                + pfz[2] * (yvel[2]-yvel[4])
+                                                + pfz[3] * (yvel[3]-yvel[5]) );
+  d[5]  = Real_t( .5) * ( dxddy + dyddx );
+  d[4]  = Real_t( .5) * ( dxddz + dzddx );
+  d[3]  = Real_t( .5) * ( dzddy + dyddz );
+}
+
+__global__
+void CalcKinematicsForElems_kernel(
+                                   Index_t numElem, Real_t dt,
+                                   Index_t *nodelist,Real_t *volo,Real_t *v,
+                                   Real_t *x,Real_t *y,Real_t *z,Real_t *xd,Real_t *yd,Real_t *zd,
+                                   Real_t *vnew,Real_t *delv,Real_t *arealg,Real_t *dxx,Real_t *dyy,Real_t *dzz
+                                   )
+{
+  Real_t B[3][8] ; /** shape function derivatives */
+  Real_t D[6] ;
+  Real_t x_local[8] ;
+  Real_t y_local[8] ;
+  Real_t z_local[8] ;
+  Real_t xd_local[8] ;
+  Real_t yd_local[8] ;
+  Real_t zd_local[8] ;
+  Real_t detJ = Real_t(0.0) ;
+
+  int k=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
+  if (k<numElem) {
+
+        Real_t volume ;
+        Real_t relativeVolume ;
+
+        // get nodal coordinates from global arrays and copy into local arrays.
+        for( Index_t lnode=0 ; lnode<8 ; ++lnode )
+          {
+                Index_t gnode = nodelist[k+lnode*numElem];
+                x_local[lnode] = x[gnode];
+                y_local[lnode] = y[gnode];
+                z_local[lnode] = z[gnode];
+          }
+
+        // volume calculations
+        volume = CalcElemVolume(x_local, y_local, z_local );
+        relativeVolume = volume / volo[k] ;
+        vnew[k] = relativeVolume ;
+        delv[k] = relativeVolume - v[k] ;
+
+        // set characteristic length
+        arealg[k] = CalcElemCharacteristicLength(x_local,y_local,z_local,volume);
+
+        // get nodal velocities from global array and copy into local arrays.
+        for( Index_t lnode=0 ; lnode<8 ; ++lnode )
+          {
+                Index_t gnode = nodelist[k+lnode*numElem];
+                xd_local[lnode] = xd[gnode];
+                yd_local[lnode] = yd[gnode];
+                zd_local[lnode] = zd[gnode];
+          }
+
+        Real_t dt2 = Real_t(0.5) * dt;
+        for ( Index_t j=0 ; j<8 ; ++j )
+          {
+                x_local[j] -= dt2 * xd_local[j];
+                y_local[j] -= dt2 * yd_local[j];
+                z_local[j] -= dt2 * zd_local[j];
+          }
+
+        CalcElemShapeFunctionDerivatives(x_local,y_local,z_local,B,&detJ );
+
+        CalcElemVelocityGradient(xd_local,yd_local,zd_local,B,detJ,D);
+
+        // put velocity gradient quantities into their global arrays.
+        dxx[k] = D[0];
+        dyy[k] = D[1];
+        dzz[k] = D[2];
+  }
+}
+
+
+static inline
+void CalcKinematicsForElems_gpu( Index_t numElem, Real_t dt )
+{
+  dim3 dimBlock=dim3(BLOCKSIZE,1,1);
+  dim3 dimGrid=dim3(PAD_DIV(numElem,dimBlock.x),1,1);
+  hipLaunchKernelGGL((CalcKinematicsForElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, numElem,dt,meshGPU.m_nodelist,meshGPU.m_volo,meshGPU.m_v,
+         meshGPU.m_x,meshGPU.m_y,meshGPU.m_z,meshGPU.m_xd,meshGPU.m_yd,meshGPU.m_zd,
+         meshGPU.m_vnew,meshGPU.m_delv,meshGPU.m_arealg,meshGPU.m_dxx,meshGPU.m_dyy,meshGPU.m_dzz);
+  HIP_DEBUGSYNC;
+}
+
+
+static inline
+void CalcKinematicsForElems_cpu( Index_t numElem, Real_t dt )
+{
+  Real_t B[3][8] ; /** shape function derivatives */
+  Real_t D[6] ;
+  Real_t x_local[8] ;
+  Real_t y_local[8] ;
+  Real_t z_local[8] ;
+  Real_t xd_local[8] ;
+  Real_t yd_local[8] ;
+  Real_t zd_local[8] ;
+  Real_t detJ = Real_t(0.0) ;
+
+  // loop over all elements
+  for( Index_t k=0 ; k<numElem ; ++k )
+        {
+          Real_t volume ;
+          Real_t relativeVolume ;
+
+          // get nodal coordinates from global arrays and copy into local arrays.
+          for( Index_t lnode=0 ; lnode<8 ; ++lnode )
+                {
+                  Index_t gnode = mesh.nodelist(k,lnode);
+                  x_local[lnode] = mesh.x(gnode);
+                  y_local[lnode] = mesh.y(gnode);
+                  z_local[lnode] = mesh.z(gnode);
+                }
+
+          // volume calculations
+          volume = CalcElemVolume(x_local, y_local, z_local );
+          relativeVolume = volume / mesh.volo(k) ;
+          mesh.vnew(k) = relativeVolume ;
+          mesh.delv(k) = relativeVolume - mesh.v(k) ;
+
+          // set characteristic length
+          mesh.arealg(k) = CalcElemCharacteristicLength(x_local,
+                                                        y_local,
+                                                        z_local,
+                                                        volume);
+
+          // get nodal velocities from global array and copy into local arrays.
+          for( Index_t lnode=0 ; lnode<8 ; ++lnode )
+                {
+                  Index_t gnode = mesh.nodelist(k,lnode);
+                  xd_local[lnode] = mesh.xd(gnode);
+                  yd_local[lnode] = mesh.yd(gnode);
+                  zd_local[lnode] = mesh.zd(gnode);
+                }
+
+          Real_t dt2 = Real_t(0.5) * dt;
+          for ( Index_t j=0 ; j<8 ; ++j )
+                {
+                  x_local[j] -= dt2 * xd_local[j];
+                  y_local[j] -= dt2 * yd_local[j];
+                  z_local[j] -= dt2 * zd_local[j];
+                }
+
+          CalcElemShapeFunctionDerivatives( x_local,
+                                            y_local,
+                                            z_local,
+                                            B, &detJ );
+
+          CalcElemVelocityGradient( xd_local,
+                                    yd_local,
+                                    zd_local,
+                                    B, detJ, D );
+
+          // put velocity gradient quantities into their global arrays.
+          mesh.dxx(k) = D[0];
+          mesh.dyy(k) = D[1];
+          mesh.dzz(k) = D[2];
+        }
+}
+
+
+static inline
+void CalcKinematicsForElems( Index_t numElem, Real_t dt, int useCPU )
+{
+  if (useCPU) {
+        FC(nodelist); FC(volo); FC(v); FC(x); FC(y); FC(z); FC(xd); FC(yd); FC(zd);
+        CalcKinematicsForElems_cpu(numElem,dt);
+        SG(vnew); SG(delv); SG(arealg); SG(dxx); SG(dyy); SG(dzz);
+  }
+  else {
+        FG(nodelist); FG(volo); FG(v); FG(x); FG(y); FG(z); FG(xd); FG(yd); FG(zd);
+        CalcKinematicsForElems_gpu(numElem,dt);
+        SC(vnew); SC(delv); SC(arealg); SC(dxx); SC(dyy); SC(dzz);
+  }
+}
+
+
+__global__
+void CalcLagrangeElementsPart2_kernel(
+                                      Index_t numElem,
+                                      Real_t *dxx,Real_t *dyy, Real_t *dzz,
+                                          Real_t *vdov
+                                      )
+{
+  int k=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
+  if (k<numElem) {
+
+        // calc strain rate and apply as constraint (only done in FB element)
+        Real_t vdovNew = dxx[k] + dyy[k] + dzz[k] ;
+        Real_t vdovthird = vdovNew/Real_t(3.0) ;
+
+        // make the rate of deformation tensor deviatoric
+        vdov[k] = vdovNew ;
+        dxx[k] -= vdovthird ;
+        dyy[k] -= vdovthird ;
+        dzz[k] -= vdovthird ;
+
+        // See if any volumes are negative, and take appropriate action.
+        //if (mesh.vnew(k) <= Real_t(0.0))
+        //{
+        //    exit(VolumeError) ;
+        //}
+  }
+}
+
+
+static inline
+void CalcLagrangeElementsPart2_gpu()
+{
+  Index_t numElem = mesh.numElem();
+
+  dim3 dimBlock=dim3(BLOCKSIZE,1,1);
+  dim3 dimGrid=dim3(PAD_DIV(numElem,dimBlock.x),1,1);
+  hipLaunchKernelGGL((CalcLagrangeElementsPart2_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, numElem,
+         meshGPU.m_dxx,meshGPU.m_dyy,meshGPU.m_dzz,
+         meshGPU.m_vdov);
+  HIP_DEBUGSYNC;
+}
+
+
+static inline
+void CalcLagrangeElementsPart2_cpu()
+{
+  Index_t numElem = mesh.numElem() ;
+
+  // element loop to do some stuff not included in the elemlib function.
+  for ( Index_t k=0 ; k<numElem ; ++k )
+  {
+          // calc strain rate and apply as constraint (only done in FB element)
+          Real_t vdov = mesh.dxx(k) + mesh.dyy(k) + mesh.dzz(k) ;
+          Real_t vdovthird = vdov/Real_t(3.0) ;
+
+          // make the rate of deformation tensor deviatoric
+          mesh.vdov(k) = vdov ;
+          mesh.dxx(k) -= vdovthird ;
+          mesh.dyy(k) -= vdovthird ;
+          mesh.dzz(k) -= vdovthird ;
+
+          // See if any volumes are negative, and take appropriate action.
+          if (mesh.vnew(k) <= Real_t(0.0))
+          {
+                  exit(VolumeError) ;
+          }
+  }
+}
+
+
+static inline
+void CalcLagrangeElementsPart2(int useCPU)
+{
+  if (useCPU) {
+        FC(dxx); FC(dyy); FC(dzz);
+        CalcLagrangeElementsPart2_cpu();
+        SG(vdov); SG(dxx); SG(dyy); SG(dzz);
+  }
+  else {
+        FG(dxx); FG(dyy); FG(dzz);
+        CalcLagrangeElementsPart2_gpu();
+        SC(vdov); SC(dxx); SC(dyy); SC(dzz);
+  }
+}
+
+static inline
+void CalcLagrangeElements(Real_t deltatime, int useCPU)
+{
+  Index_t numElem = mesh.numElem() ;
+  if (numElem > 0) {
+        CalcKinematicsForElems(numElem, deltatime, useCPU);
+        CalcLagrangeElementsPart2(useCPU);
+  }
+}
+
+
+__global__
+void CalcMonotonicQGradientsForElems_kernel(
+                                            Index_t numElem,
+                                            Index_t *nodelist,
+                                            Real_t *x,Real_t *y,Real_t *z,Real_t *xd,Real_t *yd,Real_t *zd,
+                                            Real_t *volo,Real_t *vnew,
+                                            Real_t *delx_zeta,Real_t *delv_zeta,
+                                            Real_t *delx_xi,Real_t *delv_xi,
+                                            Real_t *delx_eta,Real_t *delv_eta
+                                            )
+{
+#define SUM4(a,b,c,d) (a + b + c + d)
+  const Real_t ptiny = Real_t(1.e-36) ;
+
+  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
+  if (i<numElem) {
+        Real_t ax,ay,az ;
+        Real_t dxv,dyv,dzv ;
+
+        Index_t n0 = nodelist[i+0*numElem] ;
+        Index_t n1 = nodelist[i+1*numElem] ;
+        Index_t n2 = nodelist[i+2*numElem] ;
+        Index_t n3 = nodelist[i+3*numElem] ;
+        Index_t n4 = nodelist[i+4*numElem] ;
+        Index_t n5 = nodelist[i+5*numElem] ;
+        Index_t n6 = nodelist[i+6*numElem] ;
+        Index_t n7 = nodelist[i+7*numElem] ;
+
+        Real_t x0 = x[n0] ;
+        Real_t x1 = x[n1] ;
+        Real_t x2 = x[n2] ;
+        Real_t x3 = x[n3] ;
+        Real_t x4 = x[n4] ;
+        Real_t x5 = x[n5] ;
+        Real_t x6 = x[n6] ;
+        Real_t x7 = x[n7] ;
+
+        Real_t y0 = y[n0] ;
+        Real_t y1 = y[n1] ;
+        Real_t y2 = y[n2] ;
+        Real_t y3 = y[n3] ;
+        Real_t y4 = y[n4] ;
+        Real_t y5 = y[n5] ;
+        Real_t y6 = y[n6] ;
+        Real_t y7 = y[n7] ;
+
+        Real_t z0 = z[n0] ;
+        Real_t z1 = z[n1] ;
+        Real_t z2 = z[n2] ;
+        Real_t z3 = z[n3] ;
+        Real_t z4 = z[n4] ;
+        Real_t z5 = z[n5] ;
+        Real_t z6 = z[n6] ;
+        Real_t z7 = z[n7] ;
+
+        Real_t xv0 = xd[n0] ;
+        Real_t xv1 = xd[n1] ;
+        Real_t xv2 = xd[n2] ;
+        Real_t xv3 = xd[n3] ;
+        Real_t xv4 = xd[n4] ;
+        Real_t xv5 = xd[n5] ;
+        Real_t xv6 = xd[n6] ;
+        Real_t xv7 = xd[n7] ;
+
+        Real_t yv0 = yd[n0] ;
+        Real_t yv1 = yd[n1] ;
+        Real_t yv2 = yd[n2] ;
+        Real_t yv3 = yd[n3] ;
+        Real_t yv4 = yd[n4] ;
+        Real_t yv5 = yd[n5] ;
+        Real_t yv6 = yd[n6] ;
+        Real_t yv7 = yd[n7] ;
+
+        Real_t zv0 = zd[n0] ;
+        Real_t zv1 = zd[n1] ;
+        Real_t zv2 = zd[n2] ;
+        Real_t zv3 = zd[n3] ;
+        Real_t zv4 = zd[n4] ;
+        Real_t zv5 = zd[n5] ;
+        Real_t zv6 = zd[n6] ;
+        Real_t zv7 = zd[n7] ;
+
+        Real_t vol = volo[i]*vnew[i] ;
+        Real_t norm = Real_t(1.0) / ( vol + ptiny ) ;
+
+        Real_t dxj = Real_t(-0.25)*(SUM4(x0,x1,x5,x4) - SUM4(x3,x2,x6,x7)) ;
+        Real_t dyj = Real_t(-0.25)*(SUM4(y0,y1,y5,y4) - SUM4(y3,y2,y6,y7)) ;
+        Real_t dzj = Real_t(-0.25)*(SUM4(z0,z1,z5,z4) - SUM4(z3,z2,z6,z7)) ;
+
+        Real_t dxi = Real_t( 0.25)*(SUM4(x1,x2,x6,x5) - SUM4(x0,x3,x7,x4)) ;
+        Real_t dyi = Real_t( 0.25)*(SUM4(y1,y2,y6,y5) - SUM4(y0,y3,y7,y4)) ;
+        Real_t dzi = Real_t( 0.25)*(SUM4(z1,z2,z6,z5) - SUM4(z0,z3,z7,z4)) ;
+
+        Real_t dxk = Real_t( 0.25)*(SUM4(x4,x5,x6,x7) - SUM4(x0,x1,x2,x3)) ;
+        Real_t dyk = Real_t( 0.25)*(SUM4(y4,y5,y6,y7) - SUM4(y0,y1,y2,y3)) ;
+        Real_t dzk = Real_t( 0.25)*(SUM4(z4,z5,z6,z7) - SUM4(z0,z1,z2,z3)) ;
+
+        /* find delvk and delxk ( i cross j ) */
+
+        ax = dyi*dzj - dzi*dyj ;
+        ay = dzi*dxj - dxi*dzj ;
+        az = dxi*dyj - dyi*dxj ;
+
+        delx_zeta[i] = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
+
+        ax *= norm ;
+        ay *= norm ;
+        az *= norm ;
+
+        dxv = Real_t(0.25)*(SUM4(xv4,xv5,xv6,xv7) - SUM4(xv0,xv1,xv2,xv3)) ;
+        dyv = Real_t(0.25)*(SUM4(yv4,yv5,yv6,yv7) - SUM4(yv0,yv1,yv2,yv3)) ;
+        dzv = Real_t(0.25)*(SUM4(zv4,zv5,zv6,zv7) - SUM4(zv0,zv1,zv2,zv3)) ;
+
+        delv_zeta[i] = ax*dxv + ay*dyv + az*dzv ;
+
+        /* find delxi and delvi ( j cross k ) */
+
+        ax = dyj*dzk - dzj*dyk ;
+        ay = dzj*dxk - dxj*dzk ;
+        az = dxj*dyk - dyj*dxk ;
+
+        delx_xi[i] = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
+
+        ax *= norm ;
+        ay *= norm ;
+        az *= norm ;
+
+        dxv = Real_t(0.25)*(SUM4(xv1,xv2,xv6,xv5) - SUM4(xv0,xv3,xv7,xv4)) ;
+        dyv = Real_t(0.25)*(SUM4(yv1,yv2,yv6,yv5) - SUM4(yv0,yv3,yv7,yv4)) ;
+        dzv = Real_t(0.25)*(SUM4(zv1,zv2,zv6,zv5) - SUM4(zv0,zv3,zv7,zv4)) ;
+
+        delv_xi[i] = ax*dxv + ay*dyv + az*dzv ;
+
+        /* find delxj and delvj ( k cross i ) */
+
+        ax = dyk*dzi - dzk*dyi ;
+        ay = dzk*dxi - dxk*dzi ;
+        az = dxk*dyi - dyk*dxi ;
+
+        delx_eta[i] = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
+
+        ax *= norm ;
+        ay *= norm ;
+        az *= norm ;
+
+        dxv = Real_t(-0.25)*(SUM4(xv0,xv1,xv5,xv4) - SUM4(xv3,xv2,xv6,xv7)) ;
+        dyv = Real_t(-0.25)*(SUM4(yv0,yv1,yv5,yv4) - SUM4(yv3,yv2,yv6,yv7)) ;
+        dzv = Real_t(-0.25)*(SUM4(zv0,zv1,zv5,zv4) - SUM4(zv3,zv2,zv6,zv7)) ;
+
+        delv_eta[i] = ax*dxv + ay*dyv + az*dzv ;
+  }
+  #undef SUM4
+}
+
+
+static inline
+void CalcMonotonicQGradientsForElems_gpu()
+{
+  Index_t numElem = mesh.numElem();
+
+  dim3 dimBlock=dim3(BLOCKSIZE,1,1);
+  dim3 dimGrid=dim3(PAD_DIV(numElem,dimBlock.x),1,1);
+  hipLaunchKernelGGL((CalcMonotonicQGradientsForElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, numElem,
+         meshGPU.m_nodelist,
+         meshGPU.m_x,meshGPU.m_y,meshGPU.m_z,meshGPU.m_xd,meshGPU.m_yd,meshGPU.m_zd,
+         meshGPU.m_volo,meshGPU.m_vnew,
+         meshGPU.m_delx_zeta,meshGPU.m_delv_zeta,
+         meshGPU.m_delx_xi,meshGPU.m_delv_xi,
+         meshGPU.m_delx_eta,meshGPU.m_delv_eta);
+  HIP_DEBUGSYNC;
+}
+
+
+static inline
+void CalcMonotonicQGradientsForElems_cpu()
+{
+#define SUM4(a,b,c,d) (a + b + c + d)
+  Index_t numElem = mesh.numElem() ;
+  const Real_t ptiny = Real_t(1.e-36) ;
+
+  for (Index_t i = 0 ; i < numElem ; ++i ) {
+        Real_t ax,ay,az ;
+        Real_t dxv,dyv,dzv ;
+
+        Index_t n0 = mesh.nodelist(i,0) ;
+        Index_t n1 = mesh.nodelist(i,1) ;
+        Index_t n2 = mesh.nodelist(i,2) ;
+        Index_t n3 = mesh.nodelist(i,3) ;
+        Index_t n4 = mesh.nodelist(i,4) ;
+        Index_t n5 = mesh.nodelist(i,5) ;
+        Index_t n6 = mesh.nodelist(i,6) ;
+        Index_t n7 = mesh.nodelist(i,7) ;
+
+        Real_t x0 = mesh.x(n0) ;
+        Real_t x1 = mesh.x(n1) ;
+        Real_t x2 = mesh.x(n2) ;
+        Real_t x3 = mesh.x(n3) ;
+        Real_t x4 = mesh.x(n4) ;
+        Real_t x5 = mesh.x(n5) ;
+        Real_t x6 = mesh.x(n6) ;
+        Real_t x7 = mesh.x(n7) ;
+
+        Real_t y0 = mesh.y(n0) ;
+        Real_t y1 = mesh.y(n1) ;
+        Real_t y2 = mesh.y(n2) ;
+        Real_t y3 = mesh.y(n3) ;
+        Real_t y4 = mesh.y(n4) ;
+        Real_t y5 = mesh.y(n5) ;
+        Real_t y6 = mesh.y(n6) ;
+        Real_t y7 = mesh.y(n7) ;
+
+        Real_t z0 = mesh.z(n0) ;
+        Real_t z1 = mesh.z(n1) ;
+        Real_t z2 = mesh.z(n2) ;
+        Real_t z3 = mesh.z(n3) ;
+        Real_t z4 = mesh.z(n4) ;
+        Real_t z5 = mesh.z(n5) ;
+        Real_t z6 = mesh.z(n6) ;
+        Real_t z7 = mesh.z(n7) ;
+
+        Real_t xv0 = mesh.xd(n0) ;
+        Real_t xv1 = mesh.xd(n1) ;
+        Real_t xv2 = mesh.xd(n2) ;
+        Real_t xv3 = mesh.xd(n3) ;
+        Real_t xv4 = mesh.xd(n4) ;
+        Real_t xv5 = mesh.xd(n5) ;
+        Real_t xv6 = mesh.xd(n6) ;
+        Real_t xv7 = mesh.xd(n7) ;
+
+        Real_t yv0 = mesh.yd(n0) ;
+        Real_t yv1 = mesh.yd(n1) ;
+        Real_t yv2 = mesh.yd(n2) ;
+        Real_t yv3 = mesh.yd(n3) ;
+        Real_t yv4 = mesh.yd(n4) ;
+        Real_t yv5 = mesh.yd(n5) ;
+        Real_t yv6 = mesh.yd(n6) ;
+        Real_t yv7 = mesh.yd(n7) ;
+
+        Real_t zv0 = mesh.zd(n0) ;
+        Real_t zv1 = mesh.zd(n1) ;
+        Real_t zv2 = mesh.zd(n2) ;
+        Real_t zv3 = mesh.zd(n3) ;
+        Real_t zv4 = mesh.zd(n4) ;
+        Real_t zv5 = mesh.zd(n5) ;
+        Real_t zv6 = mesh.zd(n6) ;
+        Real_t zv7 = mesh.zd(n7) ;
+
+        Real_t vol = mesh.volo(i)*mesh.vnew(i) ;
+        Real_t norm = Real_t(1.0) / ( vol + ptiny ) ;
+
+        Real_t dxj = Real_t(-0.25)*(SUM4(x0,x1,x5,x4) - SUM4(x3,x2,x6,x7)) ;
+        Real_t dyj = Real_t(-0.25)*(SUM4(y0,y1,y5,y4) - SUM4(y3,y2,y6,y7)) ;
+        Real_t dzj = Real_t(-0.25)*(SUM4(z0,z1,z5,z4) - SUM4(z3,z2,z6,z7)) ;
+
+        Real_t dxi = Real_t( 0.25)*(SUM4(x1,x2,x6,x5) - SUM4(x0,x3,x7,x4)) ;
+        Real_t dyi = Real_t( 0.25)*(SUM4(y1,y2,y6,y5) - SUM4(y0,y3,y7,y4)) ;
+        Real_t dzi = Real_t( 0.25)*(SUM4(z1,z2,z6,z5) - SUM4(z0,z3,z7,z4)) ;
+
+        Real_t dxk = Real_t( 0.25)*(SUM4(x4,x5,x6,x7) - SUM4(x0,x1,x2,x3)) ;
+        Real_t dyk = Real_t( 0.25)*(SUM4(y4,y5,y6,y7) - SUM4(y0,y1,y2,y3)) ;
+        Real_t dzk = Real_t( 0.25)*(SUM4(z4,z5,z6,z7) - SUM4(z0,z1,z2,z3)) ;
+
+        /* find delvk and delxk ( i cross j ) */
+
+        ax = dyi*dzj - dzi*dyj ;
+        ay = dzi*dxj - dxi*dzj ;
+        az = dxi*dyj - dyi*dxj ;
+
+        mesh.delx_zeta(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
+
+        ax *= norm ;
+        ay *= norm ;
+        az *= norm ;
+
+        dxv = Real_t(0.25)*(SUM4(xv4,xv5,xv6,xv7) - SUM4(xv0,xv1,xv2,xv3)) ;
+        dyv = Real_t(0.25)*(SUM4(yv4,yv5,yv6,yv7) - SUM4(yv0,yv1,yv2,yv3)) ;
+        dzv = Real_t(0.25)*(SUM4(zv4,zv5,zv6,zv7) - SUM4(zv0,zv1,zv2,zv3)) ;
+
+        mesh.delv_zeta(i) = ax*dxv + ay*dyv + az*dzv ;
+
+        /* find delxi and delvi ( j cross k ) */
+
+        ax = dyj*dzk - dzj*dyk ;
+        ay = dzj*dxk - dxj*dzk ;
+        az = dxj*dyk - dyj*dxk ;
+
+        mesh.delx_xi(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
+
+        ax *= norm ;
+        ay *= norm ;
+        az *= norm ;
+
+        dxv = Real_t(0.25)*(SUM4(xv1,xv2,xv6,xv5) - SUM4(xv0,xv3,xv7,xv4)) ;
+        dyv = Real_t(0.25)*(SUM4(yv1,yv2,yv6,yv5) - SUM4(yv0,yv3,yv7,yv4)) ;
+        dzv = Real_t(0.25)*(SUM4(zv1,zv2,zv6,zv5) - SUM4(zv0,zv3,zv7,zv4)) ;
+
+        mesh.delv_xi(i) = ax*dxv + ay*dyv + az*dzv ;
+
+        /* find delxj and delvj ( k cross i ) */
+
+        ax = dyk*dzi - dzk*dyi ;
+        ay = dzk*dxi - dxk*dzi ;
+        az = dxk*dyi - dyk*dxi ;
+
+        mesh.delx_eta(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
+
+        ax *= norm ;
+        ay *= norm ;
+        az *= norm ;
+
+        dxv = Real_t(-0.25)*(SUM4(xv0,xv1,xv5,xv4) - SUM4(xv3,xv2,xv6,xv7)) ;
+        dyv = Real_t(-0.25)*(SUM4(yv0,yv1,yv5,yv4) - SUM4(yv3,yv2,yv6,yv7)) ;
+        dzv = Real_t(-0.25)*(SUM4(zv0,zv1,zv5,zv4) - SUM4(zv3,zv2,zv6,zv7)) ;
+
+        mesh.delv_eta(i) = ax*dxv + ay*dyv + az*dzv ;
+  }
+  #undef SUM4
+}
+
+
+static inline
+void CalcMonotonicQGradientsForElems(int useCPU)
+{
+  if (useCPU) {
+        FC(nodelist); FC(x); FC(y); FC(z); FC(xd); FC(yd); FC(zd); FC(volo); FC(vnew);
+        CalcMonotonicQGradientsForElems_cpu();
+        SG(delx_zeta); SG(delv_zeta); SG(delx_xi); SG(delv_xi); SG(delx_eta); SG(delv_eta);
+  }
+  else {
+        FG(nodelist); FG(x); FG(y); FG(z); FG(xd); FG(yd); FG(zd); FG(volo); FG(vnew);
+        CalcMonotonicQGradientsForElems_gpu();
+        SC(delx_zeta); SC(delv_zeta); SC(delx_xi); SC(delv_xi); SC(delx_eta); SC(delv_eta);
+  }
+}
+
+
+__global__
+void CalcMonotonicQRegionForElems_kernel(
+                                         Real_t qlc_monoq,
+                                         Real_t qqc_monoq,
+                                         Real_t monoq_limiter_mult,
+                                         Real_t monoq_max_slope,
+                                         Real_t ptiny,
+
+                                         // the elementset length
+                                         Index_t elength,
+
+                                         Index_t *matElemlist,Index_t *elemBC,
+                                         Index_t *lxim,Index_t *lxip,
+                                         Index_t *letam,Index_t *letap,
+                                         Index_t *lzetam,Index_t *lzetap,
+                                         Real_t *delv_xi,Real_t *delv_eta,Real_t *delv_zeta,
+                                         Real_t *delx_xi,Real_t *delx_eta,Real_t *delx_zeta,
+                                         Real_t *vdov,Real_t *elemMass,Real_t *volo,Real_t *vnew,
+                                         Real_t *qq,Real_t *ql
+                                         )
+{
+  int ielem=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
+  if (ielem<elength) {
+        Real_t qlin, qquad ;
+        Real_t phixi, phieta, phizeta ;
+        Index_t i = matElemlist[ielem];
+        Int_t bcMask = elemBC[i] ;
+        Real_t delvm, delvp ;
+
+        /*  phixi     */
+        Real_t norm = Real_t(1.) / ( delv_xi[i] + ptiny ) ;
+
+        switch (bcMask & XI_M) {
+        case 0:         delvm = delv_xi[lxim[i]] ; break ;
+        case XI_M_SYMM: delvm = delv_xi[i] ;       break ;
+        case XI_M_FREE: delvm = Real_t(0.0) ;      break ;
+        default:        /* ERROR */ ;              break ;
+        }
+        switch (bcMask & XI_P) {
+        case 0:         delvp = delv_xi[lxip[i]] ; break ;
+        case XI_P_SYMM: delvp = delv_xi[i] ;       break ;
+        case XI_P_FREE: delvp = Real_t(0.0) ;      break ;
+        default:        /* ERROR */ ;              break ;
+        }
+
+        delvm = delvm * norm ;
+        delvp = delvp * norm ;
+
+        phixi = Real_t(.5) * ( delvm + delvp ) ;
+
+        delvm *= monoq_limiter_mult ;
+        delvp *= monoq_limiter_mult ;
+
+        if ( delvm < phixi ) phixi = delvm ;
+        if ( delvp < phixi ) phixi = delvp ;
+        if ( phixi < Real_t(0.)) phixi = Real_t(0.) ;
+        if ( phixi > monoq_max_slope) phixi = monoq_max_slope;
+
+
+        /*  phieta     */
+        norm = Real_t(1.) / ( delv_eta[i] + ptiny ) ;
+
+        switch (bcMask & ETA_M) {
+        case 0:          delvm = delv_eta[letam[i]] ; break ;
+        case ETA_M_SYMM: delvm = delv_eta[i] ;        break ;
+        case ETA_M_FREE: delvm = Real_t(0.0) ;        break ;
+        default:         /* ERROR */ ;                break ;
+        }
+        switch (bcMask & ETA_P) {
+        case 0:          delvp = delv_eta[letap[i]] ; break ;
+        case ETA_P_SYMM: delvp = delv_eta[i] ;        break ;
+        case ETA_P_FREE: delvp = Real_t(0.0) ;        break ;
+        default:         /* ERROR */ ;                break ;
+        }
+
+        delvm = delvm * norm ;
+        delvp = delvp * norm ;
+
+        phieta = Real_t(.5) * ( delvm + delvp ) ;
+
+        delvm *= monoq_limiter_mult ;
+        delvp *= monoq_limiter_mult ;
+
+        if ( delvm  < phieta ) phieta = delvm ;
+        if ( delvp  < phieta ) phieta = delvp ;
+        if ( phieta < Real_t(0.)) phieta = Real_t(0.) ;
+        if ( phieta > monoq_max_slope)  phieta = monoq_max_slope;
+
+        /*  phizeta     */
+        norm = Real_t(1.) / ( delv_zeta[i] + ptiny ) ;
+
+        switch (bcMask & ZETA_M) {
+        case 0:           delvm = delv_zeta[lzetam[i]] ; break ;
+        case ZETA_M_SYMM: delvm = delv_zeta[i] ;         break ;
+        case ZETA_M_FREE: delvm = Real_t(0.0) ;          break ;
+        default:          /* ERROR */ ;                  break ;
+        }
+        switch (bcMask & ZETA_P) {
+        case 0:           delvp = delv_zeta[lzetap[i]] ; break ;
+        case ZETA_P_SYMM: delvp = delv_zeta[i] ;         break ;
+        case ZETA_P_FREE: delvp = Real_t(0.0) ;          break ;
+        default:          /* ERROR */ ;                  break ;
+        }
+
+        delvm = delvm * norm ;
+        delvp = delvp * norm ;
+
+        phizeta = Real_t(.5) * ( delvm + delvp ) ;
+
+        delvm *= monoq_limiter_mult ;
+        delvp *= monoq_limiter_mult ;
+
+        if ( delvm   < phizeta ) phizeta = delvm ;
+        if ( delvp   < phizeta ) phizeta = delvp ;
+        if ( phizeta < Real_t(0.)) phizeta = Real_t(0.);
+        if ( phizeta > monoq_max_slope  ) phizeta = monoq_max_slope;
+
+        /* Remove length scale */
+
+        if ( vdov[i] > Real_t(0.) )  {
+          qlin  = Real_t(0.) ;
+          qquad = Real_t(0.) ;
+        }
+        else {
+          Real_t delvxxi   = delv_xi[i]   * delx_xi[i]   ;
+          Real_t delvxeta  = delv_eta[i]  * delx_eta[i]  ;
+          Real_t delvxzeta = delv_zeta[i] * delx_zeta[i] ;
+
+          if ( delvxxi   > Real_t(0.) ) delvxxi   = Real_t(0.) ;
+          if ( delvxeta  > Real_t(0.) ) delvxeta  = Real_t(0.) ;
+          if ( delvxzeta > Real_t(0.) ) delvxzeta = Real_t(0.) ;
+
+          Real_t rho = elemMass[i] / (volo[i] * vnew[i]) ;
+
+          qlin = -qlc_monoq * rho *
+                (  delvxxi   * (Real_t(1.) - phixi) +
+                   delvxeta  * (Real_t(1.) - phieta) +
+                   delvxzeta * (Real_t(1.) - phizeta)  ) ;
+
+          qquad = qqc_monoq * rho *
+                (  delvxxi*delvxxi     * (Real_t(1.) - phixi*phixi) +
+                   delvxeta*delvxeta   * (Real_t(1.) - phieta*phieta) +
+                   delvxzeta*delvxzeta * (Real_t(1.) - phizeta*phizeta)  ) ;
+        }
+
+        qq[i] = qquad ;
+        ql[i] = qlin  ;
+  }
+}
+
+
+static inline
+void CalcMonotonicQRegionForElems_gpu(// parameters
+                                      Real_t qlc_monoq,
+                                      Real_t qqc_monoq,
+                                      Real_t monoq_limiter_mult,
+                                      Real_t monoq_max_slope,
+                                      Real_t ptiny,
+
+                                      // the elementset length
+                                      Index_t elength )
+{
+  dim3 dimBlock=dim3(BLOCKSIZE,1,1);
+  dim3 dimGrid=dim3(PAD_DIV(elength,dimBlock.x),1,1);
+  hipLaunchKernelGGL((CalcMonotonicQRegionForElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, qlc_monoq,qqc_monoq,monoq_limiter_mult,monoq_max_slope,ptiny,elength,
+         meshGPU.m_matElemlist,meshGPU.m_elemBC,
+         meshGPU.m_lxim,meshGPU.m_lxip,
+         meshGPU.m_letam,meshGPU.m_letap,
+         meshGPU.m_lzetam,meshGPU.m_lzetap,
+         meshGPU.m_delv_xi,meshGPU.m_delv_eta,meshGPU.m_delv_zeta,
+         meshGPU.m_delx_xi,meshGPU.m_delx_eta,meshGPU.m_delx_zeta,
+         meshGPU.m_vdov,meshGPU.m_elemMass,meshGPU.m_volo,meshGPU.m_vnew,
+         meshGPU.m_qq,meshGPU.m_ql);
+  HIP_DEBUGSYNC;
+}
+
+
+static inline
+void CalcMonotonicQRegionForElems_cpu(// parameters
+                                      Real_t qlc_monoq,
+                                      Real_t qqc_monoq,
+                                      Real_t monoq_limiter_mult,
+                                      Real_t monoq_max_slope,
+                                      Real_t ptiny,
+
+                                      // the elementset length
+                                      Index_t elength )
+{
+  for ( Index_t ielem = 0 ; ielem < elength; ++ielem ) {
+        Real_t qlin, qquad ;
+        Real_t phixi, phieta, phizeta ;
+        Index_t i = mesh.matElemlist(ielem);
+        Int_t bcMask = mesh.elemBC(i) ;
+        Real_t delvm, delvp ;
+
+        /*  phixi     */
+        Real_t norm = Real_t(1.) / ( mesh.delv_xi(i) + ptiny ) ;
+
+        switch (bcMask & XI_M) {
+        case 0:         delvm = mesh.delv_xi(mesh.lxim(i)) ; break ;
+        case XI_M_SYMM: delvm = mesh.delv_xi(i) ;            break ;
+        case XI_M_FREE: delvm = Real_t(0.0) ;                break ;
+        default:        /* ERROR */ ;                        break ;
+        }
+        switch (bcMask & XI_P) {
+        case 0:         delvp = mesh.delv_xi(mesh.lxip(i)) ; break ;
+        case XI_P_SYMM: delvp = mesh.delv_xi(i) ;            break ;
+        case XI_P_FREE: delvp = Real_t(0.0) ;                break ;
+        default:        /* ERROR */ ;                        break ;
+        }
+
+        delvm = delvm * norm ;
+        delvp = delvp * norm ;
+
+        phixi = Real_t(.5) * ( delvm + delvp ) ;
+
+        delvm *= monoq_limiter_mult ;
+        delvp *= monoq_limiter_mult ;
+
+        if ( delvm < phixi ) phixi = delvm ;
+        if ( delvp < phixi ) phixi = delvp ;
+        if ( phixi < Real_t(0.)) phixi = Real_t(0.) ;
+        if ( phixi > monoq_max_slope) phixi = monoq_max_slope;
+
+
+        /*  phieta     */
+        norm = Real_t(1.) / ( mesh.delv_eta(i) + ptiny ) ;
+
+        switch (bcMask & ETA_M) {
+        case 0:          delvm = mesh.delv_eta(mesh.letam(i)) ; break ;
+        case ETA_M_SYMM: delvm = mesh.delv_eta(i) ;             break ;
+        case ETA_M_FREE: delvm = Real_t(0.0) ;                  break ;
+        default:         /* ERROR */ ;                          break ;
+        }
+        switch (bcMask & ETA_P) {
+        case 0:          delvp = mesh.delv_eta(mesh.letap(i)) ; break ;
+        case ETA_P_SYMM: delvp = mesh.delv_eta(i) ;             break ;
+        case ETA_P_FREE: delvp = Real_t(0.0) ;                  break ;
+        default:         /* ERROR */ ;                          break ;
+        }
+
+        delvm = delvm * norm ;
+        delvp = delvp * norm ;
+
+        phieta = Real_t(.5) * ( delvm + delvp ) ;
+
+        delvm *= monoq_limiter_mult ;
+        delvp *= monoq_limiter_mult ;
+
+        if ( delvm  < phieta ) phieta = delvm ;
+        if ( delvp  < phieta ) phieta = delvp ;
+        if ( phieta < Real_t(0.)) phieta = Real_t(0.) ;
+        if ( phieta > monoq_max_slope)  phieta = monoq_max_slope;
+
+        /*  phizeta     */
+        norm = Real_t(1.) / ( mesh.delv_zeta(i) + ptiny ) ;
+
+        switch (bcMask & ZETA_M) {
+        case 0:           delvm = mesh.delv_zeta(mesh.lzetam(i)) ; break ;
+        case ZETA_M_SYMM: delvm = mesh.delv_zeta(i) ;              break ;
+        case ZETA_M_FREE: delvm = Real_t(0.0) ;                    break ;
+        default:          /* ERROR */ ;                            break ;
+        }
+        switch (bcMask & ZETA_P) {
+        case 0:           delvp = mesh.delv_zeta(mesh.lzetap(i)) ; break ;
+        case ZETA_P_SYMM: delvp = mesh.delv_zeta(i) ;              break ;
+        case ZETA_P_FREE: delvp = Real_t(0.0) ;                    break ;
+        default:          /* ERROR */ ;                            break ;
+        }
+
+        delvm = delvm * norm ;
+        delvp = delvp * norm ;
+
+        phizeta = Real_t(.5) * ( delvm + delvp ) ;
+
+        delvm *= monoq_limiter_mult ;
+        delvp *= monoq_limiter_mult ;
+
+        if ( delvm   < phizeta ) phizeta = delvm ;
+        if ( delvp   < phizeta ) phizeta = delvp ;
+        if ( phizeta < Real_t(0.)) phizeta = Real_t(0.);
+        if ( phizeta > monoq_max_slope  ) phizeta = monoq_max_slope;
+
+        /* Remove length scale */
+
+        if ( mesh.vdov(i) > Real_t(0.) )  {
+          qlin  = Real_t(0.) ;
+          qquad = Real_t(0.) ;
+        }
+        else {
+          Real_t delvxxi   = mesh.delv_xi(i)   * mesh.delx_xi(i)   ;
+          Real_t delvxeta  = mesh.delv_eta(i)  * mesh.delx_eta(i)  ;
+          Real_t delvxzeta = mesh.delv_zeta(i) * mesh.delx_zeta(i) ;
+
+          if ( delvxxi   > Real_t(0.) ) delvxxi   = Real_t(0.) ;
+          if ( delvxeta  > Real_t(0.) ) delvxeta  = Real_t(0.) ;
+          if ( delvxzeta > Real_t(0.) ) delvxzeta = Real_t(0.) ;
+
+          Real_t rho = mesh.elemMass(i) / (mesh.volo(i) * mesh.vnew(i)) ;
+
+          qlin = -qlc_monoq * rho *
+                (  delvxxi   * (Real_t(1.) - phixi) +
+                   delvxeta  * (Real_t(1.) - phieta) +
+                   delvxzeta * (Real_t(1.) - phizeta)  ) ;
+
+          qquad = qqc_monoq * rho *
+                (  delvxxi*delvxxi     * (Real_t(1.) - phixi*phixi) +
+                   delvxeta*delvxeta   * (Real_t(1.) - phieta*phieta) +
+                   delvxzeta*delvxzeta * (Real_t(1.) - phizeta*phizeta)  ) ;
+        }
+
+        mesh.qq(i) = qquad ;
+        mesh.ql(i) = qlin  ;
+  }
+}
+
+static inline
+void CalcMonotonicQRegionForElems(// parameters
+                                  Real_t qlc_monoq,
+                                  Real_t qqc_monoq,
+                                  Real_t monoq_limiter_mult,
+                                  Real_t monoq_max_slope,
+                                  Real_t ptiny,
+
+                                  // the elementset length
+                                  Index_t elength,
+                                  int useCPU)
+{
+  if (useCPU) {
+        FC(matElemlist); FC(elemBC); FC(lxim); FC(lxip); FC(letam); FC(letap); FC(lzetam); FC(lzetap);
+        FC(delv_xi); FC(delv_eta); FC(delv_zeta); FC(delx_xi); FC(delx_eta); FC(delx_zeta);
+        FC(vdov); FC(elemMass); FC(volo); FC(vnew);
+        CalcMonotonicQRegionForElems_cpu(qlc_monoq,qqc_monoq,
+                                         monoq_limiter_mult,monoq_max_slope,ptiny,
+                                         elength);
+        SG(qq); SG(ql);
+  }
+  else {
+        FG(matElemlist); FG(elemBC); FG(lxim); FG(lxip); FG(letam); FG(letap); FG(lzetam); FG(lzetap);
+        FG(delv_xi); FG(delv_eta); FG(delv_zeta); FG(delx_xi); FG(delx_eta); FG(delx_zeta);
+        FG(vdov); FG(elemMass); FG(volo); FG(vnew);
+        CalcMonotonicQRegionForElems_gpu(qlc_monoq,qqc_monoq,
+                                         monoq_limiter_mult,monoq_max_slope,ptiny,
+                                         elength);
+        SC(qq); SC(ql);
+  }
+}
+
+static inline
+void CalcMonotonicQForElems(int useCPU)
+{
+  //
+  // initialize parameters
+  //
+  const Real_t ptiny        = Real_t(1.e-36) ;
+  Real_t monoq_max_slope    = mesh.monoq_max_slope() ;
+  Real_t monoq_limiter_mult = mesh.monoq_limiter_mult() ;
+
+  //
+  // calculate the monotonic q for pure regions
+  //
+  Index_t elength = mesh.numElem() ;
+  if (elength > 0) {
+        Real_t qlc_monoq = mesh.qlc_monoq();
+        Real_t qqc_monoq = mesh.qqc_monoq();
+        CalcMonotonicQRegionForElems(// parameters
+                                     qlc_monoq,
+                                     qqc_monoq,
+                                     monoq_limiter_mult,
+                                     monoq_max_slope,
+                                     ptiny,
+
+                                     // the elemset length
+                                     elength,
+                                     useCPU);
+  }
+}
+
+static inline
+void CalcQForElems(int useCPU)
+{
+  Real_t qstop = mesh.qstop() ;
+  Index_t numElem = mesh.numElem() ;
+
+  //
+  // MONOTONIC Q option
+  //
+
+  /* Calculate velocity gradients */
+  CalcMonotonicQGradientsForElems(useCPU) ;
+
+  /* Transfer veloctiy gradients in the first order elements */
+  /* problem->commElements->Transfer(CommElements::monoQ) ; */
+  CalcMonotonicQForElems(useCPU) ;
+
+  /* Don't allow excessive artificial viscosity */
+  /*
+   if (numElem != 0) {
+      Index_t idx = -1;
+      for (Index_t i=0; i<numElem; ++i) {
+         if ( mesh.q(i) > qstop ) {
+            idx = i ;
+            break ;
+                        }
+                        }
+
+                        if(idx >= 0) {
+                        exit(QStopError) ;
+                        }
+                        }
+   */
+}
+
+
+__global__
+void CalcPressureForElems_kernel(Real_t* p_new, Real_t* bvc,
+                                 Real_t* pbvc, Real_t* e_old,
+                                 Real_t* compression, Real_t *vnewc,
+                                 Real_t pmin,
+                                 Real_t p_cut, Real_t eosvmax,
+                                 Index_t length, Real_t c1s)
+{
+  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
+  if (i<length) {
+
+        bvc[i] = c1s * (compression[i] + Real_t(1.));
+        pbvc[i] = c1s;
+
+        p_new[i] = bvc[i] * e_old[i] ;
+
+        if    (FABS(p_new[i]) <  p_cut   )
+          p_new[i] = Real_t(0.0) ;
+
+        if    ( vnewc[i] >= eosvmax ) /* impossible condition here? */
+          p_new[i] = Real_t(0.0) ;
+
+        if    (p_new[i]       <  pmin)
+          p_new[i]   = pmin ;
+  }
+}
+
+
+static inline
+void CalcPressureForElems_gpu(Real_t* p_new, Real_t* bvc,
+                              Real_t* pbvc, Real_t* e_old,
+                              Real_t* compression, Real_t *vnewc,
+                              Real_t pmin,
+                              Real_t p_cut, Real_t eosvmax,
+                              Index_t length)
+{
+  Real_t c1s = Real_t(2.0)/Real_t(3.0) ;
+  dim3 dimBlock=dim3(BLOCKSIZE,1,1);
+  dim3 dimGrid=dim3(PAD_DIV(length,dimBlock.x),1,1);
+  hipLaunchKernelGGL((CalcPressureForElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, p_new,bvc,pbvc,e_old,compression,vnewc,pmin,p_cut,eosvmax,length,c1s);
+  HIP_DEBUGSYNC;
+}
+
+
+static inline
+void CalcPressureForElems_cpu(Real_t* p_new, Real_t* bvc,
+                              Real_t* pbvc, Real_t* e_old,
+                              Real_t* compression, Real_t *vnewc,
+                              Real_t pmin,
+                              Real_t p_cut, Real_t eosvmax,
+                              Index_t length)
+{
+  Real_t c1s = Real_t(2.0)/Real_t(3.0) ;
+  for (Index_t i = 0; i < length ; ++i) {
+        bvc[i] = c1s * (compression[i] + Real_t(1.));
+        pbvc[i] = c1s;
+  }
+
+  for (Index_t i = 0 ; i < length ; ++i){
+        p_new[i] = bvc[i] * e_old[i] ;
+
+        if    (FABS(p_new[i]) <  p_cut   )
+          p_new[i] = Real_t(0.0) ;
+
+        if    ( vnewc[i] >= eosvmax ) /* impossible condition here? */
+          p_new[i] = Real_t(0.0) ;
+
+        if    (p_new[i]       <  pmin)
+          p_new[i]   = pmin ;
+  }
+}
+
+__global__
+void CalcEnergyForElemsPart1_kernel(
+                                    Index_t length,Real_t emin,
+                                    Real_t *e_old,Real_t *delvc,Real_t *p_old,Real_t *q_old,Real_t *work,
+                                    Real_t *e_new)
+{
+  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
+  if (i<length) {
+        e_new[i] = e_old[i] - Real_t(0.5) * delvc[i] * (p_old[i] + q_old[i])
+          + Real_t(0.5) * work[i];
+
+        if (e_new[i]  < emin ) {
+          e_new[i] = emin ;
+        }
+  }
+}
+
+
+__global__
+void CalcEnergyForElemsPart2_kernel(
+                                    Index_t length,Real_t rho0,Real_t e_cut,Real_t emin,
+                                    Real_t *compHalfStep,Real_t *delvc,Real_t *pbvc,Real_t *bvc,
+                                    Real_t *pHalfStep,Real_t *ql,Real_t *qq,Real_t *p_old,Real_t *q_old,Real_t *work,
+                                    Real_t *e_new,
+                                    Real_t *q_new
+                                    )
+{
+  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
+  if (i<length) {
+
+        Real_t vhalf = Real_t(1.) / (Real_t(1.) + compHalfStep[i]) ;
+
+        if ( delvc[i] > Real_t(0.) ) {
+          q_new[i] /* = qq[i] = ql[i] */ = Real_t(0.) ;
+        }
+        else {
+          Real_t ssc = ( pbvc[i] * e_new[i]
+                                         + vhalf * vhalf * bvc[i] * pHalfStep[i] ) / rho0 ;
+
+          if ( ssc <= Real_t(0.) ) {
+                ssc =Real_t(.333333e-36) ;
+          } else {
+                ssc = SQRT(ssc) ;
+          }
+
+          q_new[i] = (ssc*ql[i] + qq[i]) ;
+        }
+
+        e_new[i] = e_new[i] + Real_t(0.5) * delvc[i]
+          * (  Real_t(3.0)*(p_old[i]     + q_old[i])
+                   - Real_t(4.0)*(pHalfStep[i] + q_new[i])) ;
+
+        e_new[i] += Real_t(0.5) * work[i];
+
+        if (FABS(e_new[i]) < e_cut) {
+          e_new[i] = Real_t(0.)  ;
+        }
+        if (     e_new[i]  < emin ) {
+          e_new[i] = emin ;
+        }
+  }
+}
+
+
+__global__
+void CalcEnergyForElemsPart3_kernel(
+                                    Index_t length,Real_t rho0,Real_t sixth,Real_t e_cut,Real_t emin,
+                                    Real_t *pbvc,Real_t *vnewc,Real_t *bvc,Real_t *p_new,Real_t *ql,Real_t *qq,
+                                    Real_t *p_old,Real_t *q_old,Real_t *pHalfStep,Real_t *q_new,Real_t *delvc,
+                                    Real_t *e_new)
+{
+  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
+  if (i<length) {
+        Real_t q_tilde ;
+
+        if (delvc[i] > Real_t(0.)) {
+          q_tilde = Real_t(0.) ;
+        }
+        else {
+          Real_t ssc = ( pbvc[i] * e_new[i]
+                                         + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ;
+
+          if ( ssc <= Real_t(0.) ) {
+                ssc = Real_t(.333333e-36) ;
+          } else {
+                ssc = SQRT(ssc) ;
+          }
+
+          q_tilde = (ssc*ql[i] + qq[i]) ;
+        }
+
+        e_new[i] = e_new[i] - (  Real_t(7.0)*(p_old[i]     + q_old[i])
+                                                         - Real_t(8.0)*(pHalfStep[i] + q_new[i])
+                                                         + (p_new[i] + q_tilde)) * delvc[i]*sixth ;
+
+        if (FABS(e_new[i]) < e_cut) {
+          e_new[i] = Real_t(0.)  ;
+        }
+        if (     e_new[i]  < emin ) {
+          e_new[i] = emin ;
+        }
+  }
+}
+
+
+__global__
+void CalcEnergyForElemsPart4_kernel(
+                                    Index_t length,Real_t rho0,Real_t q_cut,
+                                    Real_t *delvc,Real_t *pbvc,Real_t *e_new,Real_t *vnewc,Real_t *bvc,
+                                    Real_t *p_new,Real_t *ql,Real_t *qq,
+                                    Real_t *q_new)
+{
+  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
+  if (i<length) {
+
+        if ( delvc[i] <= Real_t(0.) ) {
+          Real_t ssc = ( pbvc[i] * e_new[i]
+                                         + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ;
+
+          if ( ssc <= Real_t(0.) ) {
+                ssc = Real_t(.333333e-36) ;
+          } else {
+                ssc = SQRT(ssc) ;
+          }
+
+          q_new[i] = (ssc*ql[i] + qq[i]) ;
+
+          if (FABS(q_new[i]) < q_cut) q_new[i] = Real_t(0.) ;
+        }
+  }
+}
+
+static inline
+void CalcEnergyForElems_gpu(Real_t* p_new, Real_t* e_new, Real_t* q_new,
+                            Real_t* bvc, Real_t* pbvc,
+                            Real_t* p_old, Real_t* e_old, Real_t* q_old,
+                            Real_t* compression, Real_t* compHalfStep,
+                            Real_t* vnewc, Real_t* work, Real_t* delvc, Real_t pmin,
+                            Real_t p_cut, Real_t  e_cut, Real_t q_cut, Real_t emin,
+                            Real_t* qq, Real_t* ql,
+                            Real_t rho0,
+                            Real_t eosvmax,
+                            Index_t length)
+{
+  const Real_t sixth = Real_t(1.0) / Real_t(6.0) ;
+  Real_t *pHalfStep;
+
+  dim3 dimBlock=dim3(BLOCKSIZE,1,1);
+  dim3 dimGrid=dim3(PAD_DIV(length,dimBlock.x),1,1);
+
+  HIP( hipMalloc(&pHalfStep,sizeof(Real_t)*length) );
+
+  hipLaunchKernelGGL((CalcEnergyForElemsPart1_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, length,emin,e_old,delvc,p_old,q_old,work,e_new);
+  HIP_DEBUGSYNC;
+
+  CalcPressureForElems_gpu(pHalfStep, bvc, pbvc, e_new, compHalfStep, vnewc,
+                           pmin, p_cut, eosvmax, length);
+
+  hipLaunchKernelGGL((CalcEnergyForElemsPart2_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, length,rho0,e_cut,emin,
+         compHalfStep,delvc,pbvc,bvc,pHalfStep,ql,qq,p_old,q_old,work,
+         e_new,
+         q_new);
+  HIP_DEBUGSYNC;
+
+  CalcPressureForElems_gpu(p_new, bvc, pbvc, e_new, compression, vnewc,
+                           pmin, p_cut, eosvmax, length);
+
+  hipLaunchKernelGGL((CalcEnergyForElemsPart3_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, length,rho0,sixth,e_cut,emin,
+         pbvc,vnewc,bvc,p_new,ql,qq,
+         p_old,q_old,pHalfStep,q_new,delvc,
+         e_new);
+  HIP_DEBUGSYNC;
+
+  CalcPressureForElems_gpu(p_new, bvc, pbvc, e_new, compression, vnewc,
+                           pmin, p_cut, eosvmax, length);
+
+  hipLaunchKernelGGL((CalcEnergyForElemsPart4_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, length,rho0,q_cut,
+         delvc,pbvc,e_new,vnewc,bvc,
+         p_new,ql,qq,
+         q_new);
+  HIP_DEBUGSYNC;
+
+  HIP( hipFree(pHalfStep) );
+
+  return ;
+}
+
+static inline
+void CalcEnergyForElems_cpu(Real_t* p_new, Real_t* e_new, Real_t* q_new,
+                            Real_t* bvc, Real_t* pbvc,
+                            Real_t* p_old, Real_t* e_old, Real_t* q_old,
+                            Real_t* compression, Real_t* compHalfStep,
+                            Real_t* vnewc, Real_t* work, Real_t* delvc, Real_t pmin,
+                            Real_t p_cut, Real_t  e_cut, Real_t q_cut, Real_t emin,
+                            Real_t* qq, Real_t* ql,
+                            Real_t rho0,
+                            Real_t eosvmax,
+                            Index_t length)
+{
+  const Real_t sixth = Real_t(1.0) / Real_t(6.0) ;
+  Real_t *pHalfStep = Allocate<Real_t>(length) ;
+
+  for (Index_t i = 0 ; i < length ; ++i) {
+        e_new[i] = e_old[i] - Real_t(0.5) * delvc[i] * (p_old[i] + q_old[i])
+          + Real_t(0.5) * work[i];
+
+        if (e_new[i]  < emin ) {
+          e_new[i] = emin ;
+        }
+  }
+
+  CalcPressureForElems_cpu(pHalfStep, bvc, pbvc, e_new, compHalfStep, vnewc,
+                           pmin, p_cut, eosvmax, length);
+
+  for (Index_t i = 0 ; i < length ; ++i) {
+        Real_t vhalf = Real_t(1.) / (Real_t(1.) + compHalfStep[i]) ;
+
+        if ( delvc[i] > Real_t(0.) ) {
+          q_new[i] /* = qq[i] = ql[i] */ = Real_t(0.) ;
+        }
+        else {
+          Real_t ssc = ( pbvc[i] * e_new[i]
+                                         + vhalf * vhalf * bvc[i] * pHalfStep[i] ) / rho0 ;
+
+          if ( ssc <= Real_t(0.) ) {
+                ssc =Real_t(.333333e-36) ;
+          } else {
+                ssc = SQRT(ssc) ;
+          }
+
+          q_new[i] = (ssc*ql[i] + qq[i]) ;
+        }
+
+        e_new[i] = e_new[i] + Real_t(0.5) * delvc[i]
+          * (  Real_t(3.0)*(p_old[i]     + q_old[i])
+                   - Real_t(4.0)*(pHalfStep[i] + q_new[i])) ;
+  }
+
+  for (Index_t i = 0 ; i < length ; ++i) {
+
+        e_new[i] += Real_t(0.5) * work[i];
+
+        if (FABS(e_new[i]) < e_cut) {
+          e_new[i] = Real_t(0.)  ;
+        }
+        if (     e_new[i]  < emin ) {
+          e_new[i] = emin ;
+        }
+  }
+
+  CalcPressureForElems_cpu(p_new, bvc, pbvc, e_new, compression, vnewc,
+                           pmin, p_cut, eosvmax, length);
+
+  for (Index_t i = 0 ; i < length ; ++i){
+        Real_t q_tilde ;
+
+        if (delvc[i] > Real_t(0.)) {
+          q_tilde = Real_t(0.) ;
+        }
+        else {
+          Real_t ssc = ( pbvc[i] * e_new[i]
+                                         + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ;
+
+          if ( ssc <= Real_t(0.) ) {
+                ssc = Real_t(.333333e-36) ;
+          } else {
+                ssc = SQRT(ssc) ;
+          }
+
+          q_tilde = (ssc*ql[i] + qq[i]) ;
+        }
+
+        e_new[i] = e_new[i] - (  Real_t(7.0)*(p_old[i]     + q_old[i])
+                                                         - Real_t(8.0)*(pHalfStep[i] + q_new[i])
+                                                         + (p_new[i] + q_tilde)) * delvc[i]*sixth ;
+
+        if (FABS(e_new[i]) < e_cut) {
+          e_new[i] = Real_t(0.)  ;
+        }
+        if (     e_new[i]  < emin ) {
+          e_new[i] = emin ;
+        }
+  }
+
+  CalcPressureForElems_cpu(p_new, bvc, pbvc, e_new, compression, vnewc,
+                                                   pmin, p_cut, eosvmax, length);
+
+  for (Index_t i = 0 ; i < length ; ++i){
+
+        if ( delvc[i] <= Real_t(0.) ) {
+          Real_t ssc = ( pbvc[i] * e_new[i]
+                                         + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ;
+
+          if ( ssc <= Real_t(0.) ) {
+                ssc = Real_t(.333333e-36) ;
+          } else {
+                ssc = SQRT(ssc) ;
+          }
+
+          q_new[i] = (ssc*ql[i] + qq[i]) ;
+
+          if (FABS(q_new[i]) < q_cut) q_new[i] = Real_t(0.) ;
+        }
+  }
+
+  Release(&pHalfStep) ;
+
+  return ;
+}
+
+
+__global__
+void CalcSoundSpeedForElems_kernel(Real_t *vnewc, Real_t rho0, Real_t *enewc,
+                                   Real_t *pnewc, Real_t *pbvc,
+                                   Real_t *bvc, Real_t ss4o3, Index_t nz,Index_t *matElemlist,
+                                   Real_t *ss)
+{
+  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
+  if (i<nz) {
+
+        Index_t iz = matElemlist[i];
+        Real_t ssTmp = (pbvc[i] * enewc[i] + vnewc[i] * vnewc[i] *
+                                        bvc[i] * pnewc[i]) / rho0;
+        if (ssTmp <= Real_t(1.111111e-36)) {
+          ssTmp = Real_t(1.111111e-36);
+        }
+        ss[iz] = SQRT(ssTmp);
+  }
+}
+
+
+static inline
+void CalcSoundSpeedForElems_gpu(Real_t *vnewc, Real_t rho0, Real_t *enewc,
+                                Real_t *pnewc, Real_t *pbvc,
+                                Real_t *bvc, Real_t ss4o3, Index_t nz)
+{
+  dim3 dimBlock=dim3(BLOCKSIZE,1,1);
+  dim3 dimGrid=dim3(PAD_DIV(nz,dimBlock.x),1,1);
+  hipLaunchKernelGGL((CalcSoundSpeedForElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, vnewc,rho0,enewc,pnewc,pbvc,bvc,ss4o3,nz,meshGPU.m_matElemlist,meshGPU.m_ss);
+  HIP_DEBUGSYNC;
+
+}
+
+static inline
+void CalcSoundSpeedForElems_cpu(Real_t *vnewc, Real_t rho0, Real_t *enewc,
+                                Real_t *pnewc, Real_t *pbvc,
+                                Real_t *bvc, Real_t ss4o3, Index_t nz)
+{
+  for (Index_t i = 0; i < nz ; ++i) {
+        Index_t iz = mesh.matElemlist(i);
+        Real_t ssTmp = (pbvc[i] * enewc[i] + vnewc[i] * vnewc[i] *
+                                        bvc[i] * pnewc[i]) / rho0;
+        if (ssTmp <= Real_t(1.111111e-36)) {
+          ssTmp = Real_t(1.111111e-36);
+        }
+        mesh.ss(iz) = SQRT(ssTmp);
+  }
+}
+
+
+__global__
+void EvalEOSForElemsPart1_kernel(
+                                 Index_t length,Real_t eosvmin,Real_t eosvmax,
+                                 Index_t *matElemlist,
+                                 Real_t *e,Real_t *delv,Real_t *p,Real_t *q,Real_t *qq,Real_t *ql,
+                                 Real_t *vnewc,
+                                 Real_t *e_old,Real_t *delvc,Real_t *p_old,Real_t *q_old,
+                                 Real_t *compression,Real_t *compHalfStep,
+                                 Real_t *qq_old,Real_t *ql_old,Real_t *work)
+{
+  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
+  if (i<length) {
+        Index_t zidx = matElemlist[i];
+        e_old[i] = e[zidx];
+        delvc[i] = delv[zidx];
+        p_old[i] = p[zidx];
+        q_old[i] = q[zidx];
+
+        Real_t vchalf ;
+        compression[i] = Real_t(1.) / vnewc[i] - Real_t(1.);
+        vchalf = vnewc[i] - delvc[i] * Real_t(.5);
+        compHalfStep[i] = Real_t(1.) / vchalf - Real_t(1.);
+
+        if ( eosvmin != Real_t(0.) ) {
+          if (vnewc[i] <= eosvmin) { /* impossible due to calling func? */
+                compHalfStep[i] = compression[i] ;
+          }
+        }
+        if ( eosvmax != Real_t(0.) ) {
+          if (vnewc[i] >= eosvmax) { /* impossible due to calling func? */
+                p_old[i]        = Real_t(0.) ;
+                compression[i]  = Real_t(0.) ;
+                compHalfStep[i] = Real_t(0.) ;
+          }
+        }
+
+        qq_old[i] = qq[zidx] ;
+        ql_old[i] = ql[zidx] ;
+        work[i] = Real_t(0.) ;
+  }
+}
+
+
+__global__
+void EvalEOSForElemsPart2_kernel(
+                                 Index_t length,
+                                 Index_t *matElemlist,Real_t *p_new,Real_t *e_new,Real_t *q_new,
+                                 Real_t *p,Real_t *e,Real_t *q)
+{
+  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
+  if (i<length) {
+        Index_t zidx = matElemlist[i] ;
+        p[zidx] = p_new[i];
+        e[zidx] = e_new[i];
+        q[zidx] = q_new[i];
+  }
+}
+
+
+static inline
+void EvalEOSForElems_gpu(Real_t *vnewc, Index_t length)
+{
+  Real_t  e_cut = mesh.e_cut();
+  Real_t  p_cut = mesh.p_cut();
+  Real_t  ss4o3 = mesh.ss4o3();
+  Real_t  q_cut = mesh.q_cut();
+
+  Real_t eosvmax = mesh.eosvmax() ;
+  Real_t eosvmin = mesh.eosvmin() ;
+  Real_t pmin    = mesh.pmin() ;
+  Real_t emin    = mesh.emin() ;
+  Real_t rho0    = mesh.refdens() ;
+
+  Real_t *e_old,*delvc,*p_old,*q_old;
+  Real_t *compression,*compHalfStep;
+  Real_t *qq,*ql,*work,*p_new,*e_new,*q_new,*bvc,*pbvc;
+
+  HIP( hipMalloc(&e_old,sizeof(Real_t)*length) );
+  HIP( hipMalloc(&delvc,sizeof(Real_t)*length) );
+  HIP( hipMalloc(&p_old,sizeof(Real_t)*length) );
+  HIP( hipMalloc(&q_old,sizeof(Real_t)*length) );
+  HIP( hipMalloc(&compression,sizeof(Real_t)*length) );
+  HIP( hipMalloc(&compHalfStep,sizeof(Real_t)*length) );
+  HIP( hipMalloc(&qq,sizeof(Real_t)*length) );
+  HIP( hipMalloc(&ql,sizeof(Real_t)*length) );
+  HIP( hipMalloc(&work,sizeof(Real_t)*length) );
+  HIP( hipMalloc(&p_new,sizeof(Real_t)*length) );
+  HIP( hipMalloc(&e_new,sizeof(Real_t)*length) );
+  HIP( hipMalloc(&q_new,sizeof(Real_t)*length) );
+  HIP( hipMalloc(&bvc,sizeof(Real_t)*length) );
+  HIP( hipMalloc(&pbvc,sizeof(Real_t)*length) );
+
+  dim3 dimBlock=dim3(BLOCKSIZE,1,1);
+  dim3 dimGrid=dim3(PAD_DIV(length,dimBlock.x),1,1);
+
+  hipLaunchKernelGGL((EvalEOSForElemsPart1_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, length,eosvmin,eosvmax,
+         meshGPU.m_matElemlist,
+         meshGPU.m_e,meshGPU.m_delv,meshGPU.m_p,meshGPU.m_q,meshGPU.m_qq,meshGPU.m_ql,
+         vnewc,
+         e_old,delvc,p_old,q_old,
+         compression,compHalfStep,qq,ql,work);
+  HIP_DEBUGSYNC;
+
+  CalcEnergyForElems_gpu(p_new, e_new, q_new, bvc, pbvc,
+                         p_old, e_old,  q_old, compression, compHalfStep,
+                         vnewc, work,  delvc, pmin,
+                         p_cut, e_cut, q_cut, emin,
+                         qq, ql, rho0, eosvmax, length);
+
+
+  hipLaunchKernelGGL((EvalEOSForElemsPart2_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, length,
+         meshGPU.m_matElemlist,p_new,e_new,q_new,
+         meshGPU.m_p,meshGPU.m_e,meshGPU.m_q);
+  HIP_DEBUGSYNC;
+
+  CalcSoundSpeedForElems_gpu(vnewc, rho0, e_new, p_new,
+                             pbvc, bvc, ss4o3, length) ;
+
+  HIP( hipFree(pbvc) );
+  HIP( hipFree(bvc) );
+  HIP( hipFree(q_new) );
+  HIP( hipFree(e_new) );
+  HIP( hipFree(p_new) );
+  HIP( hipFree(work) );
+  HIP( hipFree(ql) );
+  HIP( hipFree(qq) );
+  HIP( hipFree(compHalfStep) );
+  HIP( hipFree(compression) );
+  HIP( hipFree(q_old) );
+  HIP( hipFree(p_old) );
+  HIP( hipFree(delvc) );
+  HIP( hipFree(e_old) );
+}
+
+
+static inline
+void EvalEOSForElems_cpu(Real_t *vnewc, Index_t length)
+{
+  Real_t  e_cut = mesh.e_cut();
+  Real_t  p_cut = mesh.p_cut();
+  Real_t  ss4o3 = mesh.ss4o3();
+  Real_t  q_cut = mesh.q_cut();
+
+  Real_t eosvmax = mesh.eosvmax() ;
+  Real_t eosvmin = mesh.eosvmin() ;
+  Real_t pmin    = mesh.pmin() ;
+  Real_t emin    = mesh.emin() ;
+  Real_t rho0    = mesh.refdens() ;
+
+  Real_t *e_old = Allocate<Real_t>(length) ;
+  Real_t *delvc = Allocate<Real_t>(length) ;
+  Real_t *p_old = Allocate<Real_t>(length) ;
+  Real_t *q_old = Allocate<Real_t>(length) ;
+  Real_t *compression = Allocate<Real_t>(length) ;
+  Real_t *compHalfStep = Allocate<Real_t>(length) ;
+  Real_t *qq = Allocate<Real_t>(length) ;
+  Real_t *ql = Allocate<Real_t>(length) ;
+  Real_t *work = Allocate<Real_t>(length) ;
+  Real_t *p_new = Allocate<Real_t>(length) ;
+  Real_t *e_new = Allocate<Real_t>(length) ;
+  Real_t *q_new = Allocate<Real_t>(length) ;
+  Real_t *bvc = Allocate<Real_t>(length) ;
+  Real_t *pbvc = Allocate<Real_t>(length) ;
+
+  /* compress data, minimal set */
+  for (Index_t i=0; i<length; ++i) {
+        Index_t zidx = mesh.matElemlist(i) ;
+        e_old[i] = mesh.e(zidx) ;
+  }
+
+  for (Index_t i=0; i<length; ++i) {
+        Index_t zidx = mesh.matElemlist(i) ;
+        delvc[i] = mesh.delv(zidx) ;
+  }
+
+  for (Index_t i=0; i<length; ++i) {
+        Index_t zidx = mesh.matElemlist(i) ;
+        p_old[i] = mesh.p(zidx) ;
+  }
+
+  for (Index_t i=0; i<length; ++i) {
+        Index_t zidx = mesh.matElemlist(i) ;
+        q_old[i] = mesh.q(zidx) ;
+  }
+
+  for (Index_t i = 0; i < length ; ++i) {
+        Real_t vchalf ;
+        compression[i] = Real_t(1.) / vnewc[i] - Real_t(1.);
+        vchalf = vnewc[i] - delvc[i] * Real_t(.5);
+        compHalfStep[i] = Real_t(1.) / vchalf - Real_t(1.);
+  }
+
+  /* Check for v > eosvmax or v < eosvmin */
+  if ( eosvmin != Real_t(0.) ) {
+        for(Index_t i=0 ; i<length ; ++i) {
+          if (vnewc[i] <= eosvmin) { /* impossible due to calling func? */
+                compHalfStep[i] = compression[i] ;
+          }
+        }
+  }
+  if ( eosvmax != Real_t(0.) ) {
+        for(Index_t i=0 ; i<length ; ++i) {
+          if (vnewc[i] >= eosvmax) { /* impossible due to calling func? */
+                p_old[i]        = Real_t(0.) ;
+                compression[i]  = Real_t(0.) ;
+                compHalfStep[i] = Real_t(0.) ;
+          }
+        }
+  }
+
+  for (Index_t i = 0 ; i < length ; ++i) {
+        Index_t zidx = mesh.matElemlist(i) ;
+        qq[i] = mesh.qq(zidx) ;
+        ql[i] = mesh.ql(zidx) ;
+        work[i] = Real_t(0.) ;
+  }
+
+  CalcEnergyForElems_cpu(p_new, e_new, q_new, bvc, pbvc,
+                         p_old, e_old,  q_old, compression, compHalfStep,
+                         vnewc, work,  delvc, pmin,
+                         p_cut, e_cut, q_cut, emin,
+                         qq, ql, rho0, eosvmax, length);
+
+
+  for (Index_t i=0; i<length; ++i) {
+        Index_t zidx = mesh.matElemlist(i) ;
+        mesh.p(zidx) = p_new[i] ;
+  }
+
+  for (Index_t i=0; i<length; ++i) {
+        Index_t zidx = mesh.matElemlist(i) ;
+        mesh.e(zidx) = e_new[i] ;
+  }
+
+  for (Index_t i=0; i<length; ++i) {
+        Index_t zidx = mesh.matElemlist(i) ;
+        mesh.q(zidx) = q_new[i] ;
+  }
+
+  CalcSoundSpeedForElems_cpu(vnewc, rho0, e_new, p_new,
+                             pbvc, bvc, ss4o3, length) ;
+
+  Release(&pbvc) ;
+  Release(&bvc) ;
+  Release(&q_new) ;
+  Release(&e_new) ;
+  Release(&p_new) ;
+  Release(&work) ;
+  Release(&ql) ;
+  Release(&qq) ;
+  Release(&compHalfStep) ;
+  Release(&compression) ;
+  Release(&q_old) ;
+  Release(&p_old) ;
+  Release(&delvc) ;
+  Release(&e_old) ;
+}
+
+
+__global__
+void ApplyMaterialPropertiesForElemsPart1_kernel(
+                                                 Index_t length,Real_t eosvmin,Real_t eosvmax,
+                                                 Index_t *matElemlist,Real_t *vnew,
+                                                 Real_t *vnewc)
+{
+  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
+  if (i<length) {
+        Index_t zn = matElemlist[i] ;
+        vnewc[i] = vnew[zn] ;
+
+        if (eosvmin != Real_t(0.)) {
+          if (vnewc[i] < eosvmin)
+                vnewc[i] = eosvmin ;
+        }
+
+        if (eosvmax != Real_t(0.)) {
+          if (vnewc[i] > eosvmax)
+                vnewc[i] = eosvmax ;
+        }
+  }
+}
+
+
+static inline
+void ApplyMaterialPropertiesForElems_gpu()
+{
+  Index_t length = mesh.numElem() ;
+
+  if (length != 0) {
+        /* Expose all of the variables needed for material evaluation */
+        Real_t eosvmin = mesh.eosvmin() ;
+        Real_t eosvmax = mesh.eosvmax() ;
+        Real_t *vnewc;
+
+        HIP( hipMalloc(&vnewc,sizeof(Real_t)*length) );
+
+        dim3 dimBlock=dim3(BLOCKSIZE,1,1);
+        dim3 dimGrid=dim3(PAD_DIV(length,dimBlock.x),1,1);
+        hipLaunchKernelGGL((ApplyMaterialPropertiesForElemsPart1_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, length,eosvmin,eosvmax,
+           meshGPU.m_matElemlist,meshGPU.m_vnew,
+           vnewc);
+        HIP_DEBUGSYNC;
+
+        /*
+    for (Index_t i=0; i<length; ++i) {
+       Index_t zn = mesh.matElemlist(i) ;
+       Real_t vc = mesh.v(zn) ;
+       if (eosvmin != Real_t(0.)) {
+          if (vc < eosvmin)
+             vc = eosvmin ;
+       }
+       if (eosvmax != Real_t(0.)) {
+          if (vc > eosvmax)
+             vc = eosvmax ;
+       }
+       if (vc <= 0.) {
+          exit(VolumeError) ;
+       }
+    }
+    */
+
+        EvalEOSForElems_gpu(vnewc, length);
+
+        HIP( hipFree(vnewc) );
+  }
+}
+
+static inline
+void ApplyMaterialPropertiesForElems_cpu()
+{
+  Index_t length = mesh.numElem() ;
+
+  if (length != 0) {
+        /* Expose all of the variables needed for material evaluation */
+        Real_t eosvmin = mesh.eosvmin() ;
+        Real_t eosvmax = mesh.eosvmax() ;
+        Real_t *vnewc = Allocate<Real_t>(length) ;
+
+        for (Index_t i=0 ; i<length ; ++i) {
+          Index_t zn = mesh.matElemlist(i) ;
+          vnewc[i] = mesh.vnew(zn) ;
+        }
+
+        if (eosvmin != Real_t(0.)) {
+          for(Index_t i=0 ; i<length ; ++i) {
+                if (vnewc[i] < eosvmin)
+                  vnewc[i] = eosvmin ;
+          }
+        }
+
+        if (eosvmax != Real_t(0.)) {
+          for(Index_t i=0 ; i<length ; ++i) {
+                if (vnewc[i] > eosvmax)
+                  vnewc[i] = eosvmax ;
+          }
+        }
+
+        for (Index_t i=0; i<length; ++i) {
+          Index_t zn = mesh.matElemlist(i) ;
+          Real_t vc = mesh.v(zn) ;
+          if (eosvmin != Real_t(0.)) {
+                if (vc < eosvmin)
+                  vc = eosvmin ;
+          }
+          if (eosvmax != Real_t(0.)) {
+                if (vc > eosvmax)
+                  vc = eosvmax ;
+          }
+          if (vc <= 0.) {
+                exit(VolumeError) ;
+          }
+        }
+
+        EvalEOSForElems_cpu(vnewc, length);
+
+        Release(&vnewc) ;
+
+  }
+}
+
+static inline
+void ApplyMaterialPropertiesForElems(int useCPU)
+{
+  if (useCPU) {
+        FC(matElemlist); FC(vnew); FC(v); FC(e); FC(delv); FC(p); FC(q); FC(qq); FC(ql);
+        ApplyMaterialPropertiesForElems_cpu();
+        SG(p); SG(e); SG(q); SG(ss);
+  }
+  else {
+        FG(matElemlist); FG(vnew); FG(v); FG(e); FG(delv); FG(p); FG(q); FG(qq); FG(ql);
+        ApplyMaterialPropertiesForElems_gpu();
+        SC(p); SC(e); SC(q); SC(ss);
+  }
+}
+
+__global__
+void UpdateVolumesForElems_kernel(Index_t numElem,Real_t v_cut,
+                                  Real_t *vnew,
+                                  Real_t *v)
+{
+  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
+  if (i<numElem) {
+        Real_t tmpV ;
+        tmpV = vnew[i] ;
+
+        if ( FABS(tmpV - Real_t(1.0)) < v_cut )
+          tmpV = Real_t(1.0) ;
+        v[i] = tmpV ;
+  }
+}
+
+
+static inline
+void UpdateVolumesForElems_gpu()
+{
+  Index_t numElem = mesh.numElem();
+  if (numElem != 0) {
+        Real_t v_cut = mesh.v_cut();
+        dim3 dimBlock=dim3(BLOCKSIZE,1,1);
+        dim3 dimGrid=dim3(PAD_DIV(numElem,dimBlock.x),1,1);
+        hipLaunchKernelGGL((UpdateVolumesForElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, numElem,v_cut,meshGPU.m_vnew,meshGPU.m_v);
+  }
+}
+
+
+static inline
+void UpdateVolumesForElems_cpu()
+{
+  Index_t numElem = mesh.numElem();
+  if (numElem != 0) {
+        Real_t v_cut = mesh.v_cut();
+
+        for(Index_t i=0 ; i<numElem ; ++i) {
+          Real_t tmpV ;
+          tmpV = mesh.vnew(i) ;
+
+          if ( FABS(tmpV - Real_t(1.0)) < v_cut )
+                tmpV = Real_t(1.0) ;
+          mesh.v(i) = tmpV ;
+        }
+  }
+
+  return ;
+}
+
+static inline
+void UpdateVolumesForElems(int useCPU)
+{
+  if (useCPU) {
+        FC(vnew);
+        UpdateVolumesForElems_cpu();
+        SG(v);
+  }
+  else {
+        FG(vnew);
+        UpdateVolumesForElems_gpu();
+        SC(v);
+  }
+}
+
+
+static inline
+void LagrangeElements(int useCPU)
+{
+  const Real_t deltatime = mesh.deltatime() ;
+
+  CalcLagrangeElements(deltatime, useCPU) ;
+
+  /* Calculate Q.  (Monotonic q option requires communication) */
+  CalcQForElems(useCPU) ;
+
+  ApplyMaterialPropertiesForElems(useCPU) ;
+
+  UpdateVolumesForElems(useCPU) ;
+}
+
+
+__global__
+void CalcCourantConstraintForElems_kernel(
+                                          Index_t length,Real_t qqc2,
+                                          Index_t *matElemlist,Real_t *ss,Real_t *vdov,Real_t *arealg,
+                                          Real_t *mindtcourant)
+{
+  __shared__ Real_t minArray[BLOCKSIZE];
+
+  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
+
+  Real_t dtcourant = Real_t(1.0e+20) ;
+  if (i<length) {
+        Index_t indx = matElemlist[i] ;
+        Real_t dtf = ss[indx] * ss[indx] ;
+        if ( vdov[indx] < Real_t(0.) ) {
+                      dtf = dtf
+                                                        + qqc2 * arealg[indx] * arealg[indx]
+                                        * vdov[indx] * vdov[indx] ;
+        }
+        dtf = SQRT(dtf) ;
+        dtf = arealg[indx] / dtf ;
+
+        /* determine minimum timestep with its corresponding elem */
+        if (vdov[indx] != Real_t(0.)) {
+          if ( dtf < dtcourant ) {
+                dtcourant = dtf ;
+          }
+        }
+  }
+  minArray[hipThreadIdx_x]=dtcourant;
+  reduceMin<Real_t,BLOCKSIZE>(minArray,hipThreadIdx_x);
+  if (hipThreadIdx_x==0)
+        mindtcourant[hipBlockIdx_x]=minArray[0];
+}
+
+
+static inline
+void CalcCourantConstraintForElems_gpu()
+{
+  Real_t qqc = mesh.qqc();
+  Real_t qqc2 = Real_t(64.0) * qqc * qqc ;
+  Index_t length = mesh.numElem() ;
+
+  dim3 dimBlock=dim3(BLOCKSIZE,1,1);
+  dim3 dimGrid=dim3(PAD_DIV(length,dimBlock.x),1,1);
+
+  Real_t *dev_mindtcourant;
+  HIP( hipMalloc(&dev_mindtcourant,sizeof(Real_t)*dimGrid.x) );
+
+  hipLaunchKernelGGL((CalcCourantConstraintForElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, length,qqc2,
+         meshGPU.m_matElemlist,meshGPU.m_ss,meshGPU.m_vdov,meshGPU.m_arealg,
+         dev_mindtcourant);
+  HIP_DEBUGSYNC;
+
+  Real_t *mindtcourant = (Real_t*)malloc(sizeof(Real_t)*dimGrid.x);
+  HIP( hipMemcpy(mindtcourant,dev_mindtcourant,sizeof(Real_t)*dimGrid.x,hipMemcpyDeviceToHost) );
+  HIP( hipFree(dev_mindtcourant) );
+
+  // finish the MIN computation over the thread blocks
+  Real_t dtcourant;
+  dtcourant=mindtcourant[0];
+  for (int i=1; i<dimGrid.x; i++) {
+        MINEQ(dtcourant,mindtcourant[i]);
+  }
+  free(mindtcourant);
+
+  if (dtcourant < Real_t(1.0e+20))
+        mesh.dtcourant() = dtcourant ;
+}
+
+static inline
+void CalcCourantConstraintForElems_cpu()
+{
+  Real_t dtcourant = Real_t(1.0e+20) ;
+  Index_t   courant_elem = -1 ;
+  Real_t      qqc = mesh.qqc() ;
+  Index_t length = mesh.numElem() ;
+
+  Real_t  qqc2 = Real_t(64.0) * qqc * qqc ;
+
+  for (Index_t i = 0 ; i < length ; ++i) {
+        Index_t indx = mesh.matElemlist(i) ;
+
+        Real_t dtf = mesh.ss(indx) * mesh.ss(indx) ;
+
+        if ( mesh.vdov(indx) < Real_t(0.) ) {
+
+                   dtf = dtf
+                                 + qqc2 * mesh.arealg(indx) * mesh.arealg(indx)
+                                 * mesh.vdov(indx) * mesh.vdov(indx) ;
+        }
+
+        dtf = SQRT(dtf) ;
+
+        dtf = mesh.arealg(indx) / dtf ;
+
+        /* determine minimum timestep with its corresponding elem */
+        if (mesh.vdov(indx) != Real_t(0.)) {
+          if ( dtf < dtcourant ) {
+                dtcourant = dtf ;
+                courant_elem = indx ;
+          }
+        }
+  }
+
+  /* Don't try to register a time constraint if none of the elements
+   * were active */
+  if (courant_elem != -1) {
+        mesh.dtcourant() = dtcourant ;
+  }
+
+  return ;
+}
+
+
+static inline
+void CalcCourantConstraintForElems(int useCPU)
+{
+  if (useCPU) {
+        FC(matElemlist); FC(ss); FC(vdov); FC(arealg);
+        CalcCourantConstraintForElems_cpu();
+  }
+  else {
+        FG(matElemlist); FG(ss); FG(vdov); FG(arealg);
+        CalcCourantConstraintForElems_gpu();
+  }
+}
+
+
+__global__
+void CalcHydroConstraintForElems_kernel(
+                                        Index_t length,Real_t dvovmax,
+                                        Index_t *matElemlist,Real_t *vdov,
+                                        Real_t *mindthydro)
+{
+  __shared__ Real_t minArray[BLOCKSIZE];
+
+  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
+
+  Real_t dthydro = Real_t(1.0e+20) ;
+  if (i<length) {
+        Index_t indx = matElemlist[i] ;
+        if (vdov[indx] != Real_t(0.)) {
+          Real_t dtdvov = dvovmax / (FABS(vdov[indx])+Real_t(1.e-20)) ;
+          if ( dthydro > dtdvov ) {
+                dthydro = dtdvov ;
+          }
+        }
+  }
+  minArray[hipThreadIdx_x]=dthydro;
+  reduceMin<Real_t,BLOCKSIZE>(minArray,hipThreadIdx_x);
+  if (hipThreadIdx_x==0)
+        mindthydro[hipBlockIdx_x]=minArray[0];
+}
+
+
+static inline
+void CalcHydroConstraintForElems_gpu()
+{
+  Real_t dvovmax = mesh.dvovmax() ;
+  Index_t length = mesh.numElem() ;
+
+  dim3 dimBlock=dim3(BLOCKSIZE,1,1);
+  dim3 dimGrid=dim3(PAD_DIV(length,dimBlock.x),1,1);
+
+  Real_t *dev_mindthydro;
+  HIP( hipMalloc(&dev_mindthydro,sizeof(Real_t)*dimGrid.x) );
+
+  hipLaunchKernelGGL((CalcHydroConstraintForElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, length,dvovmax,
+         meshGPU.m_matElemlist,meshGPU.m_vdov,
+         dev_mindthydro);
+  HIP_DEBUGSYNC;
+
+  Real_t *mindthydro = (Real_t*)malloc(sizeof(Real_t)*dimGrid.x);
+  HIP( hipMemcpy(mindthydro,dev_mindthydro,sizeof(Real_t)*dimGrid.x,hipMemcpyDeviceToHost) );
+  HIP( hipFree(dev_mindthydro) );
+
+  // finish the MIN computation over the thread blocks
+  Real_t dthydro=mindthydro[0];
+  for (int i=1; i<dimGrid.x; i++) {
+        MINEQ(dthydro,mindthydro[i]);
+  }
+  free(mindthydro);
+
+  if (dthydro < Real_t(1.0e+20))
+        mesh.dthydro() = dthydro ;
+}
+
+static inline
+void CalcHydroConstraintForElems_cpu()
+{
+  Real_t dthydro = Real_t(1.0e+20) ;
+  Index_t hydro_elem = -1 ;
+  Real_t dvovmax = mesh.dvovmax() ;
+  Index_t length = mesh.numElem() ;
+
+  for (Index_t i = 0 ; i < length ; ++i) {
+        Index_t indx = mesh.matElemlist(i) ;
+
+        if (mesh.vdov(indx) != Real_t(0.)) {
+          Real_t dtdvov = dvovmax / (FABS(mesh.vdov(indx))+Real_t(1.e-20)) ;
+          if ( dthydro > dtdvov ) {
+                dthydro = dtdvov ;
+                hydro_elem = indx ;
+          }
+        }
+  }
+
+  if (hydro_elem != -1) {
+        mesh.dthydro() = dthydro ;
+  }
+
+  return ;
+}
+
+
+static inline
+void CalcHydroConstraintForElems(int useCPU)
+{
+  if (useCPU) {
+        FC(matElemlist); FC(vdov);
+        CalcHydroConstraintForElems_cpu();
+  }
+  else {
+        FG(matElemlist); FG(vdov);
+        CalcHydroConstraintForElems_gpu();
+  }
+}
+
+
+
+static inline
+void CalcTimeConstraintsForElems(int useCPU) {
+  /* evaluate time constraint */
+  CalcCourantConstraintForElems(useCPU) ;
+
+  /* check hydro constraint */
+  CalcHydroConstraintForElems(useCPU) ;
+}
+
+static inline
+void LagrangeLeapFrog(int useCPU)
+{
+  /* calculate nodal forces, accelerations, velocities, positions, with
+   * applied boundary conditions and slide surface considerations */
+
+  LagrangeNodal(useCPU);
+
+  /* calculate element quantities (i.e. velocity gradient & q), and update
+   * material states */
+  LagrangeElements(useCPU);
+
+  CalcTimeConstraintsForElems(useCPU);
+
+  // LagrangeRelease() ;  Creation/destruction of temps may be important to capture
+}
+
+int main(int argc, char *argv[])
+{
+  Index_t edgeElems = 45 ;
+  Index_t edgeNodes = edgeElems+1 ;
+  // Real_t ds = Real_t(1.125)/Real_t(edgeElems) ; /* may accumulate roundoff */
+  Real_t tx, ty, tz ;
+  Index_t nidx, zidx ;
+  Index_t meshElems ;
+
+  /* get run options to measure various metrics */
+
+  /* ... */
+
+  hip_init();
+
+  /****************************/
+  /*   Initialize Sedov Mesh  */
+  /****************************/
+
+  /* construct a uniform box for this processor */
+
+  mesh.sizeX()   = edgeElems ;
+  mesh.sizeY()   = edgeElems ;
+  mesh.sizeZ()   = edgeElems ;
+  mesh.numElem() = edgeElems*edgeElems*edgeElems ;
+  mesh.numNode() = edgeNodes*edgeNodes*edgeNodes ;
+
+  meshElems = mesh.numElem() ;
+
+
+  /* allocate field memory */
+
+  mesh.AllocateElemPersistent(mesh.numElem()) ;
+  mesh.AllocateElemTemporary (mesh.numElem()) ;
+
+  mesh.AllocateNodalPersistent(mesh.numNode()) ;
+  mesh.AllocateNodesets(edgeNodes*edgeNodes) ;
+
+
+  /* initialize nodal coordinates */
+
+  nidx = 0 ;
+  tz  = Real_t(0.) ;
+  for (Index_t plane=0; plane<edgeNodes; ++plane) {
+        ty = Real_t(0.) ;
+        for (Index_t row=0; row<edgeNodes; ++row) {
+          tx = Real_t(0.) ;
+          for (Index_t col=0; col<edgeNodes; ++col) {
+                mesh.x(nidx) = tx ;
+                mesh.y(nidx) = ty ;
+                mesh.z(nidx) = tz ;
+                ++nidx ;
+                // tx += ds ; /* may accumulate roundoff... */
+                tx = Real_t(1.125)*Real_t(col+1)/Real_t(edgeElems) ;
+          }
+          // ty += ds ;  /* may accumulate roundoff... */
+          ty = Real_t(1.125)*Real_t(row+1)/Real_t(edgeElems) ;
+        }
+        // tz += ds ;  /* may accumulate roundoff... */
+        tz = Real_t(1.125)*Real_t(plane+1)/Real_t(edgeElems) ;
+  }
+
+
+  /* embed hexehedral elements in nodal point lattice */
+
+  nidx = 0 ;
+  zidx = 0 ;
+  for (Index_t plane=0; plane<edgeElems; ++plane) {
+        for (Index_t row=0; row<edgeElems; ++row) {
+          for (Index_t col=0; col<edgeElems; ++col) {
+                mesh.nodelist(zidx,0) = nidx                                       ;
+                mesh.nodelist(zidx,1) = nidx                                   + 1 ;
+                mesh.nodelist(zidx,2) = nidx                       + edgeNodes + 1 ;
+                mesh.nodelist(zidx,3) = nidx                       + edgeNodes     ;
+                mesh.nodelist(zidx,4) = nidx + edgeNodes*edgeNodes                 ;
+                mesh.nodelist(zidx,5) = nidx + edgeNodes*edgeNodes             + 1 ;
+                mesh.nodelist(zidx,6) = nidx + edgeNodes*edgeNodes + edgeNodes + 1 ;
+                mesh.nodelist(zidx,7) = nidx + edgeNodes*edgeNodes + edgeNodes     ;
+                ++zidx ;
+                ++nidx ;
+          }
+          ++nidx ;
+        }
+        nidx += edgeNodes ;
+  }
+
+  /* Create a material IndexSet (entire mesh same material for now) */
+  for (Index_t i=0; i<meshElems; ++i) {
+        mesh.matElemlist(i) = i ;
+  }
+
+  /* initialize material parameters */
+  mesh.dtfixed() = Real_t(-1.0e-7) ;
+  mesh.deltatime() = Real_t(1.0e-7) ;
+  mesh.deltatimemultlb() = Real_t(1.1) ;
+  mesh.deltatimemultub() = Real_t(1.2) ;
+  int numIters = 0;
+  if (argc == 2) {
+        mesh.stoptime()  = Real_t(atof(argv[1])) ;
+        numIters = 10;
+  } else if (argc == 3) {
+        mesh.stoptime()  = Real_t(atof(argv[1])) ;
+        numIters = atoi(argv[2]);
+  } else {
+        mesh.stoptime()  = Real_t(1.0e-2) ;
+        numIters = 10;
+  }
+  mesh.dtcourant() = Real_t(1.0e+20) ;
+  mesh.dthydro()   = Real_t(1.0e+20) ;
+  mesh.dtmax()     = Real_t(1.0e-2) ;
+  mesh.time()    = Real_t(0.) ;
+  mesh.cycle()   = 0 ;
+
+  mesh.e_cut() = Real_t(1.0e-7) ;
+  mesh.p_cut() = Real_t(1.0e-7) ;
+  mesh.q_cut() = Real_t(1.0e-7) ;
+  mesh.u_cut() = Real_t(1.0e-7) ;
+  mesh.v_cut() = Real_t(1.0e-10) ;
+
+  mesh.hgcoef()      = Real_t(3.0) ;
+  mesh.ss4o3()       = Real_t(4.0)/Real_t(3.0) ;
+
+  mesh.qstop()              =  Real_t(1.0e+12) ;
+  mesh.monoq_max_slope()    =  Real_t(1.0) ;
+  mesh.monoq_limiter_mult() =  Real_t(2.0) ;
+  mesh.qlc_monoq()          = Real_t(0.5) ;
+  mesh.qqc_monoq()          = Real_t(2.0)/Real_t(3.0) ;
+  mesh.qqc()                = Real_t(2.0) ;
+
+  mesh.pmin() =  Real_t(0.) ;
+  mesh.emin() = Real_t(-1.0e+15) ;
+
+  mesh.dvovmax() =  Real_t(0.1) ;
+
+  mesh.eosvmax() =  Real_t(1.0e+9) ;
+  mesh.eosvmin() =  Real_t(1.0e-9) ;
+
+  mesh.refdens() =  Real_t(1.0) ;
+
+  /* initialize field data */
+  for (Index_t i=0; i<meshElems; ++i) {
+        Real_t x_local[8], y_local[8], z_local[8] ;
+        for( Index_t lnode=0 ; lnode<8 ; ++lnode )
+          {
+                Index_t gnode = mesh.nodelist(i,lnode);
+                x_local[lnode] = mesh.x(gnode);
+                y_local[lnode] = mesh.y(gnode);
+                z_local[lnode] = mesh.z(gnode);
+          }
+
+        // volume calculations
+        Real_t volume = CalcElemVolume(x_local, y_local, z_local );
+        mesh.volo(i) = volume ;
+        mesh.elemMass(i) = volume ;
+        for (Index_t j=0; j<8; ++j) {
+          Index_t idx = mesh.nodelist(i,j);
+          mesh.nodalMass(idx) += volume / Real_t(8.0) ;
+        }
+  }
+
+  /* deposit energy */
+  mesh.e(0) = Real_t(3.948746e+7) ;
+
+  /* set up symmetry nodesets */
+  nidx = 0 ;
+  for (Index_t i=0; i<edgeNodes; ++i) {
+        Index_t planeInc = i*edgeNodes*edgeNodes ;
+        Index_t rowInc   = i*edgeNodes ;
+        for (Index_t j=0; j<edgeNodes; ++j) {
+          mesh.symmX(nidx) = planeInc + j*edgeNodes ;
+          mesh.symmY(nidx) = planeInc + j ;
+          mesh.symmZ(nidx) = rowInc   + j ;
+          ++nidx ;
+        }
+  }
+
+  /* set up elemement connectivity information */
+  mesh.lxim(0) = 0 ;
+  for (Index_t i=1; i<meshElems; ++i) {
+        mesh.lxim(i)   = i-1 ;
+        mesh.lxip(i-1) = i ;
+  }
+  mesh.lxip(meshElems-1) = meshElems-1 ;
+
+  for (Index_t i=0; i<edgeElems; ++i) {
+        mesh.letam(i) = i ;
+        mesh.letap(meshElems-edgeElems+i) = meshElems-edgeElems+i ;
+  }
+  for (Index_t i=edgeElems; i<meshElems; ++i) {
+        mesh.letam(i) = i-edgeElems ;
+        mesh.letap(i-edgeElems) = i ;
+  }
+
+  for (Index_t i=0; i<edgeElems*edgeElems; ++i) {
+        mesh.lzetam(i) = i ;
+        mesh.lzetap(meshElems-edgeElems*edgeElems+i) = meshElems-edgeElems*edgeElems+i ;
+  }
+  for (Index_t i=edgeElems*edgeElems; i<meshElems; ++i) {
+        mesh.lzetam(i) = i - edgeElems*edgeElems ;
+        mesh.lzetap(i-edgeElems*edgeElems) = i ;
+  }
+
+  /* set up boundary condition information */
+  for (Index_t i=0; i<meshElems; ++i) {
+        mesh.elemBC(i) = 0 ;  /* clear BCs by default */
+  }
+
+  /* faces on "external" boundaries will be */
+  /* symmetry plane or free surface BCs */
+  for (Index_t i=0; i<edgeElems; ++i) {
+        Index_t planeInc = i*edgeElems*edgeElems ;
+        Index_t rowInc   = i*edgeElems ;
+        for (Index_t j=0; j<edgeElems; ++j) {
+          mesh.elemBC(planeInc+j*edgeElems) |= XI_M_SYMM ;
+          mesh.elemBC(planeInc+j*edgeElems+edgeElems-1) |= XI_P_FREE ;
+          mesh.elemBC(planeInc+j) |= ETA_M_SYMM ;
+          mesh.elemBC(planeInc+j+edgeElems*edgeElems-edgeElems) |= ETA_P_FREE ;
+          mesh.elemBC(rowInc+j) |= ZETA_M_SYMM ;
+          mesh.elemBC(rowInc+j+meshElems-edgeElems*edgeElems) |= ZETA_P_FREE ;
+        }
+  }
+
+  mesh.AllocateNodeElemIndexes();
+
+
+
+  /* initialize meshGPU */
+  meshGPU.init(&mesh);
+  meshGPU.freshenGPU();
+
+  /* timestep to solution */
+  int its=0;
+#if 0
+  while (its<50) {
+#else
+  //while(mesh.time() < mesh.stoptime() ) {
+  while(its < numIters) {
+#endif
+          TimeIncrement() ;
+          LagrangeLeapFrog(0) ;
+          its++;
+          /* problem->commNodes->Transfer(CommNodes::syncposvel) ; */
+#if LULESH_SHOW_PROGRESS
+          printf("time = %e, dt=%e\n",
+                 double(mesh.time()), double(mesh.deltatime()) ) ;
+#endif
+  }
+  printf("iterations: %d\n",its);
+
+#if LULESH_WRITE_OUTPUT
+     FC(x);
+     FILE *fp = fopen("x.asc","wb");
+     for (Index_t i=0; i<mesh.numElem(); i++)
+         fprintf(fp,"%.6f\n",mesh.x(i));
+     fclose(fp);
+#endif
+
+  return 0 ;
+}