/*

                 Copyright (c) 2010.
      Lawrence Livermore National Security, LLC.
Produced at the Lawrence Livermore National Laboratory.
                  LLNL-CODE-461231
                All rights reserved.

This file is part of LULESH, Version 1.0.
Please also read this link -- http://www.opensource.org/licenses/index.php

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

* Redistributions of source code must retain the above copyright
     notice, this list of conditions and the disclaimer below.

         * Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the disclaimer (as noted below)
     in the documentation and/or other materials provided with the
     distribution.

         * Neither the name of the LLNS/LLNL nor the names of its contributors
     may be used to endorse or promote products derived from this software
     without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC,
THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


Additional BSD Notice

1. This notice is required to be provided under our contract with the U.S.
   Department of Energy (DOE). This work was produced at Lawrence Livermore
   National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE.

2. Neither the United States Government nor Lawrence Livermore National
   Security, LLC nor any of their employees, makes any warranty, express
   or implied, or assumes any liability or responsibility for the accuracy,
   completeness, or usefulness of any information, apparatus, product, or
   process disclosed, or represents that its use would not infringe
   privately-owned rights.

3. Also, reference herein to any specific commercial products, process, or
   services by trade name, trademark, manufacturer or otherwise does not
   necessarily constitute or imply its endorsement, recommendation, or
   favoring by the United States Government or Lawrence Livermore National
   Security, LLC. The views and opinions of authors expressed herein do not
   necessarily state or reflect those of the United States Government or
   Lawrence Livermore National Security, LLC, and shall not be used for
   advertising or product endorsement purposes.

*/

#include "hip/hip_runtime.h"
#include <vector>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define LULESH_SHOW_PROGRESS 1

enum { VolumeError = -1, QStopError = -2 } ;

/****************************************************/
/* Allow flexibility for arithmetic representations */
/****************************************************/

/* Could also support fixed point and interval arithmetic types */
typedef float        real4 ;
typedef double       real8 ;
typedef long double  real10 ;  /* 10 bytes on x86 */

typedef int    Index_t ; /* array subscript and loop index */
typedef real8  Real_t ;  /* floating point representation */
typedef int    Int_t ;   /* integer representation */

__host__ __device__ inline real4  SQRT(real4  arg) { return sqrtf(arg) ; }
__host__ __device__ inline real8  SQRT(real8  arg) { return sqrt(arg) ; }
__host__            inline real10 SQRT(real10 arg) { return sqrtl(arg) ; }

__host__ __device__ inline real4  CBRT(real4  arg) { return cbrtf(arg) ; }
__host__ __device__ inline real8  CBRT(real8  arg) { return cbrt(arg) ; }
__host__            inline real10 CBRT(real10 arg) { return cbrtl(arg) ; }

__host__ __device__ inline real4  FABS(real4  arg) { return fabsf(arg) ; }
__host__ __device__ inline real8  FABS(real8  arg) { return fabs(arg) ; }
__host__            inline real10 FABS(real10 arg) { return fabsl(arg) ; }

__host__ __device__ inline real4  FMAX(real4  arg1,real4  arg2) { return fmaxf(arg1,arg2) ; }
__host__ __device__ inline real8  FMAX(real8  arg1,real8  arg2) { return fmax(arg1,arg2) ; }
__host__            inline real10 FMAX(real10 arg1,real10 arg2) { return fmaxl(arg1,arg2) ; }

#define HIP_SAFE_CALL( call) do {                                           \
        hipError_t err = call;                                              \
        if( hipSuccess != err) {                                            \
            fprintf(stderr, "HIP error in file '%s' in line %i : %s.\n",    \
                    __FILE__, __LINE__, hipGetErrorString( err) );          \
            exit(EXIT_FAILURE);                                             \
        } } while (0)

#define HIP(call) HIP_SAFE_CALL(call)

#ifdef HIP_SYNC_ALL
#define HIP_DEBUGSYNC HIP(hipDeviceSynchronize())
#else
#define HIP_DEBUGSYNC
#endif

#define BLOCKSIZE 256

/* Given a number of bytes, nbytes, and a byte alignment, align, (e.g., 2,
 * 4, 8, or 16), return the smallest integer that is larger than nbytes and
 * a multiple of align.
 */
#define PAD_DIV(nbytes, align)  (((nbytes) + (align) - 1) / (align))
#define PAD(nbytes, align)  (PAD_DIV((nbytes),(align)) * (align))

/* More general version of reduceInPlacePOT (this works for arbitrary
 * numThreadsPerBlock <= 1024). Again, conditionals on
 * numThreadsPerBlock are evaluated at compile time.
 */
template <class T, int numThreadsPerBlock>
__device__ void
reduceSum(T *sresult, const int threadID)
{
  /* If number of threads is not a power of two, first add the ones
       after the last power of two into the beginning. At most one of
       these conditionals will be true for a given NPOT block size. */
  if (numThreadsPerBlock > 512 && numThreadsPerBlock <= 1024)
  {
          __syncthreads();
          if (threadID < numThreadsPerBlock-512)
                sresult[threadID] += sresult[threadID + 512];
  }

  if (numThreadsPerBlock > 256 && numThreadsPerBlock < 512)
  {
          __syncthreads();
          if (threadID < numThreadsPerBlock-256)
                sresult[threadID] += sresult[threadID + 256];
  }

  if (numThreadsPerBlock > 128 && numThreadsPerBlock < 256)
  {
          __syncthreads();
          if (threadID < numThreadsPerBlock-128)
                sresult[threadID] += sresult[threadID + 128];
  }

  if (numThreadsPerBlock > 64 && numThreadsPerBlock < 128)
  {
          __syncthreads();
          if (threadID < numThreadsPerBlock-64)
                sresult[threadID] += sresult[threadID + 64];
  }

  if (numThreadsPerBlock > 32 && numThreadsPerBlock < 64)
  {
          __syncthreads();
          if (threadID < numThreadsPerBlock-32)
                sresult[threadID] += sresult[threadID + 32];
  }

  if (numThreadsPerBlock > 16 && numThreadsPerBlock < 32)
  {
          __syncthreads();
          if (threadID < numThreadsPerBlock-16)
                sresult[threadID] += sresult[threadID + 16];
  }

  if (numThreadsPerBlock > 8 && numThreadsPerBlock < 16)
  {
          __syncthreads();
          if (threadID < numThreadsPerBlock-8)
                sresult[threadID] += sresult[threadID + 8];
  }

  if (numThreadsPerBlock > 4 && numThreadsPerBlock < 8)
  {
          __syncthreads();
          if (threadID < numThreadsPerBlock-4)
                sresult[threadID] += sresult[threadID + 4];
  }

  if (numThreadsPerBlock > 2 && numThreadsPerBlock < 4)
  {
          __syncthreads();
          if (threadID < numThreadsPerBlock-2)
                sresult[threadID] += sresult[threadID + 2];
  }

  if (numThreadsPerBlock >= 512) {
        __syncthreads();
        if (threadID < 256)
          sresult[threadID] += sresult[threadID + 256];
  }

  if (numThreadsPerBlock >= 256) {
        __syncthreads();
        if (threadID < 128)
          sresult[threadID] += sresult[threadID + 128];
  }
  if (numThreadsPerBlock >= 128) {
        __syncthreads();
        if (threadID < 64)
          sresult[threadID] += sresult[threadID + 64];
  }
  __syncthreads();
#ifdef _DEVICEEMU
  if (numThreadsPerBlock >= 64) {
        __syncthreads();
        if (threadID < 32)
          sresult[threadID] += sresult[threadID + 32];
  }
  if (numThreadsPerBlock >= 32) {
        __syncthreads();
        if (threadID < 16)
          sresult[threadID] += sresult[threadID + 16];
  }
  if (numThreadsPerBlock >= 16) {
        __syncthreads();
        if (threadID < 8)
          sresult[threadID] += sresult[threadID + 8];
  }
  if (numThreadsPerBlock >= 8) {
        __syncthreads();
        if (threadID < 4)
          sresult[threadID] += sresult[threadID + 4];
  }
  if (numThreadsPerBlock >= 4) {
        __syncthreads();
        if (threadID < 2)
          sresult[threadID] += sresult[threadID + 2];
  }
  if (numThreadsPerBlock >= 2) {
        __syncthreads();
        if (threadID < 1)
          sresult[threadID] += sresult[threadID + 1];
  }
#else
  if (threadID < 32) {
        volatile T *vol = sresult;
        if (numThreadsPerBlock >= 64) vol[threadID] += vol[threadID + 32];
        if (numThreadsPerBlock >= 32) vol[threadID] += vol[threadID + 16];
        if (numThreadsPerBlock >= 16) vol[threadID] += vol[threadID + 8];
        if (numThreadsPerBlock >= 8) vol[threadID] += vol[threadID + 4];
        if (numThreadsPerBlock >= 4) vol[threadID] += vol[threadID + 2];
        if (numThreadsPerBlock >= 2) vol[threadID] += vol[threadID + 1];
  }
#endif
  __syncthreads();
}

#define MINEQ(a,b) (a)=(((a)<(b))?(a):(b))

  template <class T, int numThreadsPerBlock>
  __device__ void
  reduceMin(T *sresult, const int threadID)
  {
        /* If number of threads is not a power of two, first add the ones
       after the last power of two into the beginning. At most one of
       these conditionals will be true for a given NPOT block size. */
        if (numThreadsPerBlock > 512 && numThreadsPerBlock <= 1024)
    {
                __syncthreads();
                if (threadID < numThreadsPerBlock-512)
                  MINEQ(sresult[threadID],sresult[threadID + 512]);
        }

        if (numThreadsPerBlock > 256 && numThreadsPerBlock < 512)
        {
                __syncthreads();
                if (threadID < numThreadsPerBlock-256)
                  MINEQ(sresult[threadID],sresult[threadID + 256]);
        }

        if (numThreadsPerBlock > 128 && numThreadsPerBlock < 256)
        {
                __syncthreads();
                if (threadID < numThreadsPerBlock-128)
                  MINEQ(sresult[threadID],sresult[threadID + 128]);
        }

        if (numThreadsPerBlock > 64 && numThreadsPerBlock < 128)
        {
                __syncthreads();
                if (threadID < numThreadsPerBlock-64)
                  MINEQ(sresult[threadID],sresult[threadID + 64]);
        }

        if (numThreadsPerBlock > 32 && numThreadsPerBlock < 64)
        {
                __syncthreads();
                if (threadID < numThreadsPerBlock-32)
                  MINEQ(sresult[threadID],sresult[threadID + 32]);
        }

        if (numThreadsPerBlock > 16 && numThreadsPerBlock < 32)
        {
                __syncthreads();
                if (threadID < numThreadsPerBlock-16)
                  MINEQ(sresult[threadID],sresult[threadID + 16]);
        }

        if (numThreadsPerBlock > 8 && numThreadsPerBlock < 16)
        {
                __syncthreads();
                if (threadID < numThreadsPerBlock-8)
                  MINEQ(sresult[threadID],sresult[threadID + 8]);
        }

        if (numThreadsPerBlock > 4 && numThreadsPerBlock < 8)
        {
                __syncthreads();
                if (threadID < numThreadsPerBlock-4)
                  MINEQ(sresult[threadID],sresult[threadID + 4]);
        }

        if (numThreadsPerBlock > 2 && numThreadsPerBlock < 4)
        {
                __syncthreads();
                if (threadID < numThreadsPerBlock-2)
                  MINEQ(sresult[threadID],sresult[threadID + 2]);
        }

        if (numThreadsPerBlock >= 512) {
          __syncthreads();
          if (threadID < 256)
                MINEQ(sresult[threadID],sresult[threadID + 256]);
        }

        if (numThreadsPerBlock >= 256) {
          __syncthreads();
          if (threadID < 128)
                MINEQ(sresult[threadID],sresult[threadID + 128]);
        }
        if (numThreadsPerBlock >= 128) {
          __syncthreads();
          if (threadID < 64)
                MINEQ(sresult[threadID],sresult[threadID + 64]);
        }
        __syncthreads();
        #ifdef _DEVICEEMU
        if (numThreadsPerBlock >= 64) {
          __syncthreads();
          if (threadID < 32)
                MINEQ(sresult[threadID],sresult[threadID + 32]);
        }
        if (numThreadsPerBlock >= 32) {
          __syncthreads();
          if (threadID < 16)
                MINEQ(sresult[threadID],sresult[threadID + 16]);
        }
        if (numThreadsPerBlock >= 16) {
          __syncthreads();
          if (threadID < 8)
                MINEQ(sresult[threadID],sresult[threadID + 8]);
        }
        if (numThreadsPerBlock >= 8) {
          __syncthreads();
          if (threadID < 4)
                MINEQ(sresult[threadID],sresult[threadID + 4]);
        }
        if (numThreadsPerBlock >= 4) {
          __syncthreads();
          if (threadID < 2)
                MINEQ(sresult[threadID],sresult[threadID + 2]);
        }
        if (numThreadsPerBlock >= 2) {
          __syncthreads();
          if (threadID < 1)
                MINEQ(sresult[threadID],sresult[threadID + 1]);
        }
#else
        if (threadID < 32) {
          volatile T *vol = sresult;
          if (numThreadsPerBlock >= 64) MINEQ(vol[threadID],vol[threadID + 32]);
          if (numThreadsPerBlock >= 32) MINEQ(vol[threadID],vol[threadID + 16]);
          if (numThreadsPerBlock >= 16) MINEQ(vol[threadID],vol[threadID + 8]);
          if (numThreadsPerBlock >= 8)  MINEQ(vol[threadID],vol[threadID + 4]);
          if (numThreadsPerBlock >= 4)  MINEQ(vol[threadID],vol[threadID + 2]);
          if (numThreadsPerBlock >= 2)  MINEQ(vol[threadID],vol[threadID + 1]);
        }
#endif
        __syncthreads();
  }

void hip_init()
{
  int deviceCount, dev;
  hipDeviceProp_t hip_deviceProp;
  char *s;

  HIP( hipGetDeviceCount(&deviceCount) );
  if (deviceCount == 0) {
        fprintf(stderr, "hip_init(): no devices supporting HIP.\n");
        exit(1);
  }
  s=getenv("HIP_DEVICE");
  if (s != NULL) { dev=atoi(s); }
  else dev=0;
  if ((dev < 0) || (dev > deviceCount-1)) {
        fprintf(stderr, "hip_init(): requested device (%d) out of range [%d,%d]\n",
                        dev, 0, deviceCount-1);
        exit(1);
  }
  HIP( hipGetDeviceProperties(&hip_deviceProp, dev) );
  if (hip_deviceProp.major < 1) {
        fprintf(stderr, "hip_init(): device %d does not support HIP.\n", dev);
        exit(1);
  }
  fprintf(stderr, "setting HIP device %d\n",dev);
  HIP( hipSetDevice(dev) );
}

/************************************************************/
/* Allow for flexible data layout experiments by separating */
/* array interface from underlying implementation.          */
/************************************************************/

struct Mesh {

  /* This first implementation allows for runnable code */
  /* and is not meant to be optimal. Final implementation */
  /* should separate declaration and allocation phases */
  /* so that allocation can be scheduled in a cache conscious */
  /* manner. */

  friend struct MeshGPU;

public:

  /**************/
  /* Allocation */
  /**************/

  void AllocateNodalPersistent(size_t size)
  {
        m_x.resize(size) ;
        m_y.resize(size) ;
        m_z.resize(size) ;

        m_xd.resize(size, Real_t(0.)) ;
        m_yd.resize(size, Real_t(0.)) ;
        m_zd.resize(size, Real_t(0.)) ;

        m_xdd.resize(size, Real_t(0.)) ;
        m_ydd.resize(size, Real_t(0.)) ;
        m_zdd.resize(size, Real_t(0.)) ;

        m_fx.resize(size) ;
        m_fy.resize(size) ;
        m_fz.resize(size) ;

        m_nodalMass.resize(size, Real_t(0.)) ;
  }

  void AllocateElemPersistent(size_t size)
  {
        m_matElemlist.resize(size) ;
        m_nodelist.resize(8*size) ;

        m_lxim.resize(size) ;
        m_lxip.resize(size) ;
        m_letam.resize(size) ;
        m_letap.resize(size) ;
        m_lzetam.resize(size) ;
        m_lzetap.resize(size) ;

        m_elemBC.resize(size) ;

        m_e.resize(size, Real_t(0.)) ;

        m_p.resize(size, Real_t(0.)) ;
        m_q.resize(size) ;
        m_ql.resize(size) ;
        m_qq.resize(size) ;

        m_v.resize(size, 1.0) ;
        m_volo.resize(size) ;
        m_delv.resize(size) ;
        m_vdov.resize(size) ;

        m_arealg.resize(size) ;

        m_ss.resize(size) ;

        m_elemMass.resize(size) ;
  }

  /* Temporaries should not be initialized in bulk but */
  /* this is a runnable placeholder for now */
  void AllocateElemTemporary(size_t size)
  {
        m_dxx.resize(size) ;
        m_dyy.resize(size) ;
        m_dzz.resize(size) ;

        m_delv_xi.resize(size) ;
        m_delv_eta.resize(size) ;
        m_delv_zeta.resize(size) ;

        m_delx_xi.resize(size) ;
        m_delx_eta.resize(size) ;
        m_delx_zeta.resize(size) ;

        m_vnew.resize(size) ;
  }

  void AllocateNodesets(size_t size)
  {
        m_symmX.resize(size) ;
        m_symmY.resize(size) ;
        m_symmZ.resize(size) ;
  }

  void AllocateNodeElemIndexes()
  {
        Index_t i,j,nidx;

        /* set up node-centered indexing of elements */
        m_nodeElemCount.resize(m_numNode);
        for (i=0;i<m_numNode;i++) m_nodeElemCount[i]=0;
        m_nodeElemCornerList.resize(m_numNode*8);
        for (i=0;i<m_numElem;i++) {
          for (j=0;j<8;j++) {
                nidx=nodelist(i,j);
                m_nodeElemCornerList[nidx+m_numNode*m_nodeElemCount[nidx]++] = i+m_numElem*j;
                if (m_nodeElemCount[nidx]>8) {
                  fprintf(stderr, "Node degree is higher than 8!\n");
                  exit(1);
                }
          }
        }
  }

  /**********/
  /* Access */
  /**********/

  /* Node-centered */

  Real_t& x(Index_t idx)    { return m_x[idx] ; }
  Real_t& y(Index_t idx)    { return m_y[idx] ; }
  Real_t& z(Index_t idx)    { return m_z[idx] ; }

  Real_t& xd(Index_t idx)   { return m_xd[idx] ; }
  Real_t& yd(Index_t idx)   { return m_yd[idx] ; }
  Real_t& zd(Index_t idx)   { return m_zd[idx] ; }

  Real_t& xdd(Index_t idx)  { return m_xdd[idx] ; }
  Real_t& ydd(Index_t idx)  { return m_ydd[idx] ; }
  Real_t& zdd(Index_t idx)  { return m_zdd[idx] ; }

  Real_t& fx(Index_t idx)   { return m_fx[idx] ; }
  Real_t& fy(Index_t idx)   { return m_fy[idx] ; }
  Real_t& fz(Index_t idx)   { return m_fz[idx] ; }

  Real_t& nodalMass(Index_t idx) { return m_nodalMass[idx] ; }

  Index_t&  symmX(Index_t idx) { return m_symmX[idx] ; }
  Index_t&  symmY(Index_t idx) { return m_symmY[idx] ; }
  Index_t&  symmZ(Index_t idx) { return m_symmZ[idx] ; }

  /* Element-centered */

  Index_t&  matElemlist(Index_t idx) { return m_matElemlist[idx] ; }
  Index_t&  nodelist(Index_t idx,Index_t nidx)    { return m_nodelist[idx+nidx*m_numElem] ; }

  Index_t&  lxim(Index_t idx) { return m_lxim[idx] ; }
  Index_t&  lxip(Index_t idx) { return m_lxip[idx] ; }
  Index_t&  letam(Index_t idx) { return m_letam[idx] ; }
  Index_t&  letap(Index_t idx) { return m_letap[idx] ; }
  Index_t&  lzetam(Index_t idx) { return m_lzetam[idx] ; }
  Index_t&  lzetap(Index_t idx) { return m_lzetap[idx] ; }

  Int_t&  elemBC(Index_t idx) { return m_elemBC[idx] ; }

  Real_t& dxx(Index_t idx)  { return m_dxx[idx] ; }
  Real_t& dyy(Index_t idx)  { return m_dyy[idx] ; }
  Real_t& dzz(Index_t idx)  { return m_dzz[idx] ; }

  Real_t& delv_xi(Index_t idx)    { return m_delv_xi[idx] ; }
  Real_t& delv_eta(Index_t idx)   { return m_delv_eta[idx] ; }
  Real_t& delv_zeta(Index_t idx)  { return m_delv_zeta[idx] ; }

  Real_t& delx_xi(Index_t idx)    { return m_delx_xi[idx] ; }
  Real_t& delx_eta(Index_t idx)   { return m_delx_eta[idx] ; }
  Real_t& delx_zeta(Index_t idx)  { return m_delx_zeta[idx] ; }

  Real_t& e(Index_t idx)          { return m_e[idx] ; }

  Real_t& p(Index_t idx)          { return m_p[idx] ; }
  Real_t& q(Index_t idx)          { return m_q[idx] ; }
  Real_t& ql(Index_t idx)         { return m_ql[idx] ; }
  Real_t& qq(Index_t idx)         { return m_qq[idx] ; }

  Real_t& v(Index_t idx)          { return m_v[idx] ; }
  Real_t& volo(Index_t idx)       { return m_volo[idx] ; }
  Real_t& vnew(Index_t idx)       { return m_vnew[idx] ; }
  Real_t& delv(Index_t idx)       { return m_delv[idx] ; }
  Real_t& vdov(Index_t idx)       { return m_vdov[idx] ; }

  Real_t& arealg(Index_t idx)     { return m_arealg[idx] ; }

  Real_t& ss(Index_t idx)         { return m_ss[idx] ; }

  Real_t& elemMass(Index_t idx)  { return m_elemMass[idx] ; }

  /* Params */

  Real_t& dtfixed()              { return m_dtfixed ; }
  Real_t& time()                 { return m_time ; }
  Real_t& deltatime()            { return m_deltatime ; }
  Real_t& deltatimemultlb()      { return m_deltatimemultlb ; }
  Real_t& deltatimemultub()      { return m_deltatimemultub ; }
  Real_t& stoptime()             { return m_stoptime ; }

  Real_t& u_cut()                { return m_u_cut ; }
  Real_t& hgcoef()               { return m_hgcoef ; }
  Real_t& qstop()                { return m_qstop ; }
  Real_t& monoq_max_slope()      { return m_monoq_max_slope ; }
  Real_t& monoq_limiter_mult()   { return m_monoq_limiter_mult ; }
  Real_t& e_cut()                { return m_e_cut ; }
  Real_t& p_cut()                { return m_p_cut ; }
  Real_t& ss4o3()                { return m_ss4o3 ; }
  Real_t& q_cut()                { return m_q_cut ; }
  Real_t& v_cut()                { return m_v_cut ; }
  Real_t& qlc_monoq()            { return m_qlc_monoq ; }
  Real_t& qqc_monoq()            { return m_qqc_monoq ; }
  Real_t& qqc()                  { return m_qqc ; }
  Real_t& eosvmax()              { return m_eosvmax ; }
  Real_t& eosvmin()              { return m_eosvmin ; }
  Real_t& pmin()                 { return m_pmin ; }
  Real_t& emin()                 { return m_emin ; }
  Real_t& dvovmax()              { return m_dvovmax ; }
  Real_t& refdens()              { return m_refdens ; }

  Real_t& dtcourant()            { return m_dtcourant ; }
  Real_t& dthydro()              { return m_dthydro ; }
  Real_t& dtmax()                { return m_dtmax ; }

  Int_t&  cycle()                { return m_cycle ; }

  Index_t&  sizeX()              { return m_sizeX ; }
  Index_t&  sizeY()              { return m_sizeY ; }
  Index_t&  sizeZ()              { return m_sizeZ ; }
  Index_t&  numElem()            { return m_numElem ; }
  Index_t&  numNode()            { return m_numNode ; }


  //private:

  /******************/
  /* Implementation */
  /******************/

  /* Node-centered */

  std::vector<Real_t> m_x ;  /* coordinates */
  std::vector<Real_t> m_y ;
  std::vector<Real_t> m_z ;

  std::vector<Real_t> m_xd ; /* velocities */
  std::vector<Real_t> m_yd ;
  std::vector<Real_t> m_zd ;

  std::vector<Real_t> m_xdd ; /* accelerations */
  std::vector<Real_t> m_ydd ;
  std::vector<Real_t> m_zdd ;

  std::vector<Real_t> m_fx ;  /* forces */
  std::vector<Real_t> m_fy ;
  std::vector<Real_t> m_fz ;

  std::vector<Real_t> m_nodalMass ;  /* mass */

  std::vector<Index_t> m_symmX ;  /* symmetry plane nodesets */
  std::vector<Index_t> m_symmY ;
  std::vector<Index_t> m_symmZ ;

  std::vector<Int_t> m_nodeElemCount ;
  std::vector<Index_t> m_nodeElemCornerList ;

  /* Element-centered */

  std::vector<Index_t>  m_matElemlist ;  /* material indexset */
  std::vector<Index_t>  m_nodelist ;     /* elemToNode connectivity */

  std::vector<Index_t>  m_lxim ;  /* element connectivity across each face */
  std::vector<Index_t>  m_lxip ;
  std::vector<Index_t>  m_letam ;
  std::vector<Index_t>  m_letap ;
  std::vector<Index_t>  m_lzetam ;
  std::vector<Index_t>  m_lzetap ;

  std::vector<Int_t>    m_elemBC ;  /* symmetry/free-surface flags for each elem face */

  std::vector<Real_t> m_dxx ;  /* principal strains -- temporary */
  std::vector<Real_t> m_dyy ;
  std::vector<Real_t> m_dzz ;

  std::vector<Real_t> m_delv_xi ;    /* velocity gradient -- temporary */
  std::vector<Real_t> m_delv_eta ;
  std::vector<Real_t> m_delv_zeta ;

  std::vector<Real_t> m_delx_xi ;    /* coordinate gradient -- temporary */
  std::vector<Real_t> m_delx_eta ;
  std::vector<Real_t> m_delx_zeta ;

  std::vector<Real_t> m_e ;   /* energy */

  std::vector<Real_t> m_p ;   /* pressure */
  std::vector<Real_t> m_q ;   /* q */
  std::vector<Real_t> m_ql ;  /* linear term for q */
  std::vector<Real_t> m_qq ;  /* quadratic term for q */

  std::vector<Real_t> m_v ;     /* relative volume */
  std::vector<Real_t> m_volo ;  /* reference volume */
  std::vector<Real_t> m_vnew ;  /* new relative volume -- temporary */
  std::vector<Real_t> m_delv ;  /* m_vnew - m_v */
  std::vector<Real_t> m_vdov ;  /* volume derivative over volume */

  std::vector<Real_t> m_arealg ;  /* characteristic length of an element */

  std::vector<Real_t> m_ss ;      /* "sound speed" */

  std::vector<Real_t> m_elemMass ;  /* mass */

  /* Parameters */

  Real_t  m_dtfixed ;           /* fixed time increment */
  Real_t  m_time ;              /* current time */
  Real_t  m_deltatime ;         /* variable time increment */
  Real_t  m_deltatimemultlb ;
  Real_t  m_deltatimemultub ;
  Real_t  m_stoptime ;          /* end time for simulation */

  Real_t  m_u_cut ;             /* velocity tolerance */
  Real_t  m_hgcoef ;            /* hourglass control */
  Real_t  m_qstop ;             /* excessive q indicator */
  Real_t  m_monoq_max_slope ;
  Real_t  m_monoq_limiter_mult ;
  Real_t  m_e_cut ;             /* energy tolerance */
  Real_t  m_p_cut ;             /* pressure tolerance */
  Real_t  m_ss4o3 ;
  Real_t  m_q_cut ;             /* q tolerance */
  Real_t  m_v_cut ;             /* relative volume tolerance */
  Real_t  m_qlc_monoq ;         /* linear term coef for q */
  Real_t  m_qqc_monoq ;         /* quadratic term coef for q */
  Real_t  m_qqc ;
  Real_t  m_eosvmax ;
  Real_t  m_eosvmin ;
  Real_t  m_pmin ;              /* pressure floor */
  Real_t  m_emin ;              /* energy floor */
  Real_t  m_dvovmax ;           /* maximum allowable volume change */
  Real_t  m_refdens ;           /* reference density */

  Real_t  m_dtcourant ;         /* courant constraint */
  Real_t  m_dthydro ;           /* volume change constraint */
  Real_t  m_dtmax ;             /* maximum allowable time increment */

  Int_t   m_cycle ;             /* iteration count for simulation */

  Index_t   m_sizeX ;           /* X,Y,Z extent of this block */
  Index_t   m_sizeY ;
  Index_t   m_sizeZ ;

  Index_t   m_numElem ;         /* Elements/Nodes in this domain */
  Index_t   m_numNode ;
} mesh ;

template <typename T>
T *Allocate(size_t size)
{
  return static_cast<T *>(malloc(sizeof(T)*size)) ;
}

template <typename T>
void Release(T **ptr)
{
  if (*ptr != NULL) {
        free(*ptr) ;
        *ptr = NULL ;
  }
}


#define GPU_STALE 0
#define CPU_STALE 1
#define ALL_FRESH 2

template<typename T>
void freshenGPU(std::vector<T>&cpu,T **gpu,int& stale) {
  if (stale!=GPU_STALE) return;
  if (!(*gpu)) {HIP( hipMalloc(gpu,sizeof(T)*cpu.size()) );}
  HIP( hipMemcpy(*gpu,&cpu[0],sizeof(T)*cpu.size(),hipMemcpyHostToDevice) );
  stale=ALL_FRESH;
}

template<typename T>
void freshenCPU(std::vector<T>&cpu,T *gpu,int& stale) {
  if (stale!=CPU_STALE) return;
  if (!gpu) {fprintf(stderr,"freshenCPU(): NULL GPU data!\n");exit(1);}
  HIP( hipMemcpy(&cpu[0],gpu,sizeof(T)*cpu.size(),hipMemcpyDeviceToHost) );
  stale=ALL_FRESH;
}

// freshen helpers
#define FC(var) freshenCPU(mesh.m_ ## var , meshGPU.m_ ## var ,meshGPU.m_ ## var ## _stale ); // freshen CPU
#define FG(var) freshenGPU(mesh.m_ ## var , &meshGPU.m_ ## var ,meshGPU.m_ ## var ## _stale ); // freshen GPU
// stale helpers
#define SC(var) meshGPU.m_ ## var ## _stale = CPU_STALE; // stale CPU
#define SG(var) meshGPU.m_ ## var ## _stale = GPU_STALE; // stale GPU

struct MeshGPU {
  Mesh *m_mesh;

  /******************/
  /* Implementation */
  /******************/

  /* Node-centered */

  Real_t *m_x ;  /* coordinates */
  Real_t *m_y ;
  Real_t *m_z ;

  Real_t *m_xd ; /* velocities */
  Real_t *m_yd ;
  Real_t *m_zd ;

  Real_t *m_xdd ; /* accelerations */
  Real_t *m_ydd ;
  Real_t *m_zdd ;

  Real_t *m_fx ;  /* forces */
  Real_t *m_fy ;
  Real_t *m_fz ;

  Real_t *m_nodalMass ;  /* mass */

  Index_t *m_symmX ;  /* symmetry plane nodesets */
  Index_t *m_symmY ;
  Index_t *m_symmZ ;

  Int_t   *m_nodeElemCount ;
  Index_t *m_nodeElemCornerList ;

  /* Element-centered */

  Index_t * m_matElemlist ;  /* material indexset */
  Index_t * m_nodelist ;     /* elemToNode connectivity */

  Index_t * m_lxim ;  /* element connectivity across each face */
  Index_t * m_lxip ;
  Index_t * m_letam ;
  Index_t * m_letap ;
  Index_t * m_lzetam ;
  Index_t * m_lzetap ;

  Int_t *   m_elemBC ;  /* symmetry/free-surface flags for each elem face */

  Real_t *m_dxx ;  /* principal strains -- temporary */
  Real_t *m_dyy ;
  Real_t *m_dzz ;

  Real_t *m_delv_xi ;    /* velocity gradient -- temporary */
  Real_t *m_delv_eta ;
  Real_t *m_delv_zeta ;

  Real_t *m_delx_xi ;    /* coordinate gradient -- temporary */
  Real_t *m_delx_eta ;
  Real_t *m_delx_zeta ;

  Real_t *m_e ;   /* energy */

  Real_t *m_p ;   /* pressure */
  Real_t *m_q ;   /* q */
  Real_t *m_ql ;  /* linear term for q */
  Real_t *m_qq ;  /* quadratic term for q */

  Real_t *m_v ;     /* relative volume */
  Real_t *m_volo ;  /* reference volume */
  Real_t *m_vnew ;  /* new relative volume -- temporary */
  Real_t *m_delv ;  /* m_vnew - m_v */
  Real_t *m_vdov ;  /* volume derivative over volume */

  Real_t *m_arealg ;  /* characteristic length of an element */

  Real_t *m_ss ;      /* "sound speed" */

  Real_t *m_elemMass ;  /* mass */

  /* Stale flags */
  int m_x_stale,m_y_stale,m_z_stale;
  int m_xd_stale,m_yd_stale,m_zd_stale;
  int m_xdd_stale,m_ydd_stale,m_zdd_stale;
  int m_fx_stale,m_fy_stale,m_fz_stale;
  int m_nodalMass_stale;
  int m_symmX_stale,m_symmY_stale,m_symmZ_stale;
  int m_nodeElemCount_stale,m_nodeElemCornerList_stale;
  int m_matElemlist_stale,m_nodelist_stale;
  int m_lxim_stale,m_lxip_stale,m_letam_stale,m_letap_stale,m_lzetam_stale,m_lzetap_stale;
  int m_elemBC_stale;
  int m_dxx_stale,m_dyy_stale,m_dzz_stale;
  int m_delv_xi_stale,m_delv_eta_stale,m_delv_zeta_stale;
  int m_delx_xi_stale,m_delx_eta_stale,m_delx_zeta_stale;
  int m_e_stale;
  int m_p_stale,m_q_stale,m_ql_stale,m_qq_stale;
  int m_v_stale,m_volo_stale,m_vnew_stale,m_delv_stale,m_vdov_stale;
  int m_arealg_stale;
  int m_ss_stale;
  int m_elemMass_stale;

  void init(Mesh *mesh) {
        m_mesh=mesh;
        m_x=m_y=m_z=NULL;
        m_xd=m_yd=m_zd=NULL;
        m_xdd=m_ydd=m_zdd=NULL;
        m_fx=m_fy=m_fz=NULL;
        m_nodalMass=NULL;
        m_symmX=m_symmY=m_symmZ=NULL;
        m_nodeElemCount=m_nodeElemCornerList=NULL;
        m_matElemlist=m_nodelist=NULL;
        m_lxim=m_lxip=m_letam=m_letap=m_lzetam=m_lzetap=NULL;
        m_elemBC=NULL;
        m_dxx=m_dyy=m_dzz=NULL;
        m_delv_xi=m_delv_eta=m_delv_zeta=NULL;
        m_delx_xi=m_delx_eta=m_delx_zeta=NULL;
        m_e=NULL;
        m_p=m_q=m_ql=m_qq=NULL;
        m_v=m_volo=m_vnew=m_delv=m_vdov=NULL;
        m_arealg=NULL;
        m_ss=NULL;
        m_elemMass=NULL;
                m_x_stale=m_y_stale=m_z_stale=
                                      m_xd_stale=m_yd_stale=m_zd_stale=
                                      m_xdd_stale=m_ydd_stale=m_zdd_stale=
                                      m_fx_stale=m_fy_stale=m_fz_stale=
                                      m_nodalMass_stale=
                                      m_symmX_stale=m_symmY_stale=m_symmZ_stale=
                                      m_nodeElemCount_stale=m_nodeElemCornerList_stale=
                                      m_matElemlist_stale=m_nodelist_stale=
                                      m_lxim_stale=m_lxip_stale=m_letam_stale=m_letap_stale=m_lzetam_stale=m_lzetap_stale=
                                      m_elemBC_stale=
                                      m_dxx_stale=m_dyy_stale=m_dzz_stale=
                                      m_delv_xi_stale=m_delv_eta_stale=m_delv_zeta_stale=
                                      m_delx_xi_stale=m_delx_eta_stale=m_delx_zeta_stale=
                                      m_e_stale=
                                      m_p_stale=m_q_stale=m_ql_stale=m_qq_stale=
                                      m_v_stale=m_volo_stale=m_vnew_stale=m_delv_stale=m_vdov_stale=
                                      m_arealg_stale=
                                      m_ss_stale=
                                      m_elemMass_stale=
                          GPU_STALE;
  }
  void freshenGPU() {
#define F(var) ::freshenGPU(m_mesh->m_ ## var , &m_ ## var ,m_ ## var ## _stale);
        F(x); F(y); F(z);
        F(xd); F(yd); F(zd);
        F(xdd); F(ydd); F(zdd);
        F(fx); F(fy); F(fz);
        F(nodalMass);
        F(symmX); F(symmY); F(symmZ);
        F(nodeElemCount); F(nodeElemCornerList);
        F(matElemlist); F(nodelist);
        F(lxim); F(lxip); F(letam); F(letap); F(lzetam); F(lzetap);
        F(elemBC);
        F(dxx); F(dyy); F(dzz);
        F(delv_xi); F(delv_eta); F(delv_zeta);
        F(delx_xi); F(delx_eta); F(delx_zeta);
        F(e);
        F(p); F(q); F(ql); F(qq);
        F(v); F(volo); F(vnew); F(delv); F(vdov);
        F(arealg);
        F(ss);
        F(elemMass);
        #undef F
  }
  void freshenCPU() {
#define F(var) ::freshenCPU(m_mesh->m_ ## var , m_ ## var ,m_ ## var ## _stale);
        F(x); F(y); F(z);
        F(xd); F(yd); F(zd);
        F(xdd); F(ydd); F(zdd);
        F(fx); F(fy); F(fz);
        F(nodalMass);
        F(symmX); F(symmY); F(symmZ);
        F(nodeElemCount); F(nodeElemCornerList);
        F(matElemlist); F(nodelist);
        F(lxim); F(lxip); F(letam); F(letap); F(lzetam); F(lzetap);
        F(elemBC);
        F(dxx); F(dyy); F(dzz);
        F(delv_xi); F(delv_eta); F(delv_zeta);
        F(delx_xi); F(delx_eta); F(delx_zeta);
        F(e);
        F(p); F(q); F(ql); F(qq);
        F(v); F(volo); F(vnew); F(delv); F(vdov);
        F(arealg);
        F(ss);
        F(elemMass);
        #undef F
  }
} meshGPU;



/* Stuff needed for boundary conditions */
/* 2 BCs on each of 6 hexahedral faces (12 bits) */
#define XI_M        0x003
#define XI_M_SYMM   0x001
#define XI_M_FREE   0x002

#define XI_P        0x00c
#define XI_P_SYMM   0x004
#define XI_P_FREE   0x008

#define ETA_M       0x030
#define ETA_M_SYMM  0x010
#define ETA_M_FREE  0x020

#define ETA_P       0x0c0
#define ETA_P_SYMM  0x040
#define ETA_P_FREE  0x080

#define ZETA_M      0x300
#define ZETA_M_SYMM 0x100
#define ZETA_M_FREE 0x200

#define ZETA_P      0xc00
#define ZETA_P_SYMM 0x400
#define ZETA_P_FREE 0x800


static inline
void TimeIncrement()
{
  Real_t targetdt = mesh.stoptime() - mesh.time() ;

  if ((mesh.dtfixed() <= Real_t(0.0)) && (mesh.cycle() != Int_t(0))) {
        Real_t ratio ;
        Real_t olddt = mesh.deltatime() ;

        /* This will require a reduction in parallel */
        Real_t newdt = Real_t(1.0e+20) ;
        if (mesh.dtcourant() < newdt) {
          newdt = mesh.dtcourant() / Real_t(2.0) ;
        }
        if (mesh.dthydro() < newdt) {
          newdt = mesh.dthydro() * Real_t(2.0) / Real_t(3.0) ;
        }

        ratio = newdt / olddt ;
        if (ratio >= Real_t(1.0)) {
          if (ratio < mesh.deltatimemultlb()) {
                newdt = olddt ;
          }
          else if (ratio > mesh.deltatimemultub()) {
                newdt = olddt*mesh.deltatimemultub() ;
          }
        }

        if (newdt > mesh.dtmax()) {
          newdt = mesh.dtmax() ;
        }
        mesh.deltatime() = newdt ;
  }

  /* TRY TO PREVENT VERY SMALL SCALING ON THE NEXT CYCLE */
  if ((targetdt > mesh.deltatime()) &&
          (targetdt < (Real_t(4.0) * mesh.deltatime() / Real_t(3.0))) ) {
        targetdt = Real_t(2.0) * mesh.deltatime() / Real_t(3.0) ;
  }

  if (targetdt < mesh.deltatime()) {
        mesh.deltatime() = targetdt ;
  }

  mesh.time() += mesh.deltatime() ;

  ++mesh.cycle() ;
}

__global__
void InitStressTermsForElems_kernel(
                                   int numElem,Real_t *sigxx, Real_t *sigyy, Real_t *sigzz, Real_t *p, Real_t *q)
{
  int i = hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
  if (i<numElem)
        sigxx[i] = sigyy[i] = sigzz[i] =  - p[i] - q[i] ;
}

static inline
void InitStressTermsForElems_gpu(Index_t numElem,
                                 Real_t *sigxx, Real_t *sigyy, Real_t *sigzz)
{
  dim3 dimBlock(BLOCKSIZE,1,1);
  dim3 dimGrid(PAD_DIV(numElem,dimBlock.x),1,1);
  //hipFuncSetCacheConfig(InitStressTermsForElems_kernel,hipFuncCachePreferL1); // set as default for all kernels after this one
  hipLaunchKernelGGL((InitStressTermsForElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, numElem,sigxx,sigyy,sigzz,meshGPU.m_p,meshGPU.m_q);
  HIP_DEBUGSYNC;
}

static inline
void InitStressTermsForElems_cpu(Index_t numElem,
                                 Real_t *sigxx, Real_t *sigyy, Real_t *sigzz)
{
  //
  // pull in the stresses appropriate to the hydro integration
  //
  for (Index_t i = 0 ; i < numElem ; ++i){
        sigxx[i] =  sigyy[i] = sigzz[i] =  - mesh.p(i) - mesh.q(i) ;
  }
}

static inline
void InitStressTermsForElems(Index_t numElem,
                             Real_t *sigxx, Real_t *sigyy, Real_t *sigzz,
                             int useCPU)
{
  if (useCPU) {
        FC(p); FC(q);
        InitStressTermsForElems_cpu(numElem,sigxx,sigyy,sigzz);
  }
  else {
        FG(p); FG(q);
        InitStressTermsForElems_gpu(numElem,sigxx,sigyy,sigzz);
  }
}

__host__ __device__
static inline
void CalcElemShapeFunctionDerivatives( const Real_t* const x,
                                       const Real_t* const y,
                                       const Real_t* const z,
                                       Real_t b[][8],
                                       Real_t* const volume )
{
  const Real_t x0 = x[0] ;   const Real_t x1 = x[1] ;
  const Real_t x2 = x[2] ;   const Real_t x3 = x[3] ;
  const Real_t x4 = x[4] ;   const Real_t x5 = x[5] ;
  const Real_t x6 = x[6] ;   const Real_t x7 = x[7] ;

  const Real_t y0 = y[0] ;   const Real_t y1 = y[1] ;
  const Real_t y2 = y[2] ;   const Real_t y3 = y[3] ;
  const Real_t y4 = y[4] ;   const Real_t y5 = y[5] ;
  const Real_t y6 = y[6] ;   const Real_t y7 = y[7] ;

  const Real_t z0 = z[0] ;   const Real_t z1 = z[1] ;
  const Real_t z2 = z[2] ;   const Real_t z3 = z[3] ;
  const Real_t z4 = z[4] ;   const Real_t z5 = z[5] ;
  const Real_t z6 = z[6] ;   const Real_t z7 = z[7] ;

  Real_t fjxxi, fjxet, fjxze;
  Real_t fjyxi, fjyet, fjyze;
  Real_t fjzxi, fjzet, fjzze;
  Real_t cjxxi, cjxet, cjxze;
  Real_t cjyxi, cjyet, cjyze;
  Real_t cjzxi, cjzet, cjzze;

  fjxxi = Real_t(.125) * ( (x6-x0) + (x5-x3) - (x7-x1) - (x4-x2) );
  fjxet = Real_t(.125) * ( (x6-x0) - (x5-x3) + (x7-x1) - (x4-x2) );
  fjxze = Real_t(.125) * ( (x6-x0) + (x5-x3) + (x7-x1) + (x4-x2) );

  fjyxi = Real_t(.125) * ( (y6-y0) + (y5-y3) - (y7-y1) - (y4-y2) );
  fjyet = Real_t(.125) * ( (y6-y0) - (y5-y3) + (y7-y1) - (y4-y2) );
  fjyze = Real_t(.125) * ( (y6-y0) + (y5-y3) + (y7-y1) + (y4-y2) );

  fjzxi = Real_t(.125) * ( (z6-z0) + (z5-z3) - (z7-z1) - (z4-z2) );
  fjzet = Real_t(.125) * ( (z6-z0) - (z5-z3) + (z7-z1) - (z4-z2) );
  fjzze = Real_t(.125) * ( (z6-z0) + (z5-z3) + (z7-z1) + (z4-z2) );

  /* compute cofactors */
  cjxxi =    (fjyet * fjzze) - (fjzet * fjyze);
  cjxet =  - (fjyxi * fjzze) + (fjzxi * fjyze);
  cjxze =    (fjyxi * fjzet) - (fjzxi * fjyet);

  cjyxi =  - (fjxet * fjzze) + (fjzet * fjxze);
  cjyet =    (fjxxi * fjzze) - (fjzxi * fjxze);
  cjyze =  - (fjxxi * fjzet) + (fjzxi * fjxet);

  cjzxi =    (fjxet * fjyze) - (fjyet * fjxze);
  cjzet =  - (fjxxi * fjyze) + (fjyxi * fjxze);
  cjzze =    (fjxxi * fjyet) - (fjyxi * fjxet);

  /* calculate partials :
     this need only be done for l = 0,1,2,3   since , by symmetry ,
     (6,7,4,5) = - (0,1,2,3) .
  */
  b[0][0] =   -  cjxxi  -  cjxet  -  cjxze;
  b[0][1] =      cjxxi  -  cjxet  -  cjxze;
  b[0][2] =      cjxxi  +  cjxet  -  cjxze;
  b[0][3] =   -  cjxxi  +  cjxet  -  cjxze;
  b[0][4] = -b[0][2];
  b[0][5] = -b[0][3];
  b[0][6] = -b[0][0];
  b[0][7] = -b[0][1];

  b[1][0] =   -  cjyxi  -  cjyet  -  cjyze;
  b[1][1] =      cjyxi  -  cjyet  -  cjyze;
  b[1][2] =      cjyxi  +  cjyet  -  cjyze;
  b[1][3] =   -  cjyxi  +  cjyet  -  cjyze;
  b[1][4] = -b[1][2];
  b[1][5] = -b[1][3];
  b[1][6] = -b[1][0];
  b[1][7] = -b[1][1];

  b[2][0] =   -  cjzxi  -  cjzet  -  cjzze;
  b[2][1] =      cjzxi  -  cjzet  -  cjzze;
  b[2][2] =      cjzxi  +  cjzet  -  cjzze;
  b[2][3] =   -  cjzxi  +  cjzet  -  cjzze;
  b[2][4] = -b[2][2];
  b[2][5] = -b[2][3];
  b[2][6] = -b[2][0];
  b[2][7] = -b[2][1];

  /* calculate jacobian determinant (volume) */
  *volume = Real_t(8.) * ( fjxet * cjxet + fjyet * cjyet + fjzet * cjzet);
}

__host__ __device__
static inline
void SumElemFaceNormal(Real_t *normalX0, Real_t *normalY0, Real_t *normalZ0,
                       Real_t *normalX1, Real_t *normalY1, Real_t *normalZ1,
                       Real_t *normalX2, Real_t *normalY2, Real_t *normalZ2,
                       Real_t *normalX3, Real_t *normalY3, Real_t *normalZ3,
                       const Real_t x0, const Real_t y0, const Real_t z0,
                       const Real_t x1, const Real_t y1, const Real_t z1,
                       const Real_t x2, const Real_t y2, const Real_t z2,
                       const Real_t x3, const Real_t y3, const Real_t z3)
{
  Real_t bisectX0 = Real_t(0.5) * (x3 + x2 - x1 - x0);
  Real_t bisectY0 = Real_t(0.5) * (y3 + y2 - y1 - y0);
  Real_t bisectZ0 = Real_t(0.5) * (z3 + z2 - z1 - z0);
  Real_t bisectX1 = Real_t(0.5) * (x2 + x1 - x3 - x0);
  Real_t bisectY1 = Real_t(0.5) * (y2 + y1 - y3 - y0);
  Real_t bisectZ1 = Real_t(0.5) * (z2 + z1 - z3 - z0);
  Real_t areaX = Real_t(0.25) * (bisectY0 * bisectZ1 - bisectZ0 * bisectY1);
  Real_t areaY = Real_t(0.25) * (bisectZ0 * bisectX1 - bisectX0 * bisectZ1);
  Real_t areaZ = Real_t(0.25) * (bisectX0 * bisectY1 - bisectY0 * bisectX1);

  *normalX0 += areaX;
  *normalX1 += areaX;
  *normalX2 += areaX;
  *normalX3 += areaX;

  *normalY0 += areaY;
  *normalY1 += areaY;
  *normalY2 += areaY;
  *normalY3 += areaY;

  *normalZ0 += areaZ;
  *normalZ1 += areaZ;
  *normalZ2 += areaZ;
  *normalZ3 += areaZ;
}

__host__ __device__
static inline
void CalcElemNodeNormals(Real_t pfx[8],
                         Real_t pfy[8],
                         Real_t pfz[8],
                         const Real_t x[8],
                         const Real_t y[8],
                         const Real_t z[8])
{
  for (Index_t i = 0 ; i < 8 ; ++i) {
        pfx[i] = Real_t(0.0);
        pfy[i] = Real_t(0.0);
        pfz[i] = Real_t(0.0);
  }
  /* evaluate face one: nodes 0, 1, 2, 3 */
  SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],
                    &pfx[1], &pfy[1], &pfz[1],
                    &pfx[2], &pfy[2], &pfz[2],
                    &pfx[3], &pfy[3], &pfz[3],
                    x[0], y[0], z[0], x[1], y[1], z[1],
                    x[2], y[2], z[2], x[3], y[3], z[3]);
  /* evaluate face two: nodes 0, 4, 5, 1 */
  SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],
                    &pfx[4], &pfy[4], &pfz[4],
                    &pfx[5], &pfy[5], &pfz[5],
                    &pfx[1], &pfy[1], &pfz[1],
                    x[0], y[0], z[0], x[4], y[4], z[4],
                    x[5], y[5], z[5], x[1], y[1], z[1]);
  /* evaluate face three: nodes 1, 5, 6, 2 */
  SumElemFaceNormal(&pfx[1], &pfy[1], &pfz[1],
                    &pfx[5], &pfy[5], &pfz[5],
                    &pfx[6], &pfy[6], &pfz[6],
                    &pfx[2], &pfy[2], &pfz[2],
                    x[1], y[1], z[1], x[5], y[5], z[5],
                    x[6], y[6], z[6], x[2], y[2], z[2]);
  /* evaluate face four: nodes 2, 6, 7, 3 */
  SumElemFaceNormal(&pfx[2], &pfy[2], &pfz[2],
                    &pfx[6], &pfy[6], &pfz[6],
                    &pfx[7], &pfy[7], &pfz[7],
                    &pfx[3], &pfy[3], &pfz[3],
                    x[2], y[2], z[2], x[6], y[6], z[6],
                    x[7], y[7], z[7], x[3], y[3], z[3]);
  /* evaluate face five: nodes 3, 7, 4, 0 */
  SumElemFaceNormal(&pfx[3], &pfy[3], &pfz[3],
                    &pfx[7], &pfy[7], &pfz[7],
                    &pfx[4], &pfy[4], &pfz[4],
                    &pfx[0], &pfy[0], &pfz[0],
                    x[3], y[3], z[3], x[7], y[7], z[7],
                    x[4], y[4], z[4], x[0], y[0], z[0]);
  /* evaluate face six: nodes 4, 7, 6, 5 */
  SumElemFaceNormal(&pfx[4], &pfy[4], &pfz[4],
                    &pfx[7], &pfy[7], &pfz[7],
                    &pfx[6], &pfy[6], &pfz[6],
                    &pfx[5], &pfy[5], &pfz[5],
                    x[4], y[4], z[4], x[7], y[7], z[7],
                    x[6], y[6], z[6], x[5], y[5], z[5]);
}

__host__ __device__
static inline
void SumElemStressesToNodeForces( const Real_t B[][8],
                                  const Real_t stress_xx,
                                  const Real_t stress_yy,
                                  const Real_t stress_zz,
                                  Real_t* const fx,
                                  Real_t* const fy,
                                  Real_t* const fz,
                                  int stride)
{
  Real_t pfx0 = B[0][0] ;   Real_t pfx1 = B[0][1] ;
  Real_t pfx2 = B[0][2] ;   Real_t pfx3 = B[0][3] ;
  Real_t pfx4 = B[0][4] ;   Real_t pfx5 = B[0][5] ;
  Real_t pfx6 = B[0][6] ;   Real_t pfx7 = B[0][7] ;

  Real_t pfy0 = B[1][0] ;   Real_t pfy1 = B[1][1] ;
  Real_t pfy2 = B[1][2] ;   Real_t pfy3 = B[1][3] ;
  Real_t pfy4 = B[1][4] ;   Real_t pfy5 = B[1][5] ;
  Real_t pfy6 = B[1][6] ;   Real_t pfy7 = B[1][7] ;

  Real_t pfz0 = B[2][0] ;   Real_t pfz1 = B[2][1] ;
  Real_t pfz2 = B[2][2] ;   Real_t pfz3 = B[2][3] ;
  Real_t pfz4 = B[2][4] ;   Real_t pfz5 = B[2][5] ;
  Real_t pfz6 = B[2][6] ;   Real_t pfz7 = B[2][7] ;

  fx[0*stride] = -( stress_xx * pfx0 );
  fx[1*stride] = -( stress_xx * pfx1 );
  fx[2*stride] = -( stress_xx * pfx2 );
  fx[3*stride] = -( stress_xx * pfx3 );
  fx[4*stride] = -( stress_xx * pfx4 );
  fx[5*stride] = -( stress_xx * pfx5 );
  fx[6*stride] = -( stress_xx * pfx6 );
  fx[7*stride] = -( stress_xx * pfx7 );

  fy[0*stride] = -( stress_yy * pfy0  );
  fy[1*stride] = -( stress_yy * pfy1  );
  fy[2*stride] = -( stress_yy * pfy2  );
  fy[3*stride] = -( stress_yy * pfy3  );
  fy[4*stride] = -( stress_yy * pfy4  );
  fy[5*stride] = -( stress_yy * pfy5  );
  fy[6*stride] = -( stress_yy * pfy6  );
  fy[7*stride] = -( stress_yy * pfy7  );

  fz[0*stride] = -( stress_zz * pfz0 );
  fz[1*stride] = -( stress_zz * pfz1 );
  fz[2*stride] = -( stress_zz * pfz2 );
  fz[3*stride] = -( stress_zz * pfz3 );
  fz[4*stride] = -( stress_zz * pfz4 );
  fz[5*stride] = -( stress_zz * pfz5 );
  fz[6*stride] = -( stress_zz * pfz6 );
  fz[7*stride] = -( stress_zz * pfz7 );
}

__global__
void IntegrateStressForElems_kernel( Index_t numElem, Index_t *nodelist,
                                     Real_t *x, Real_t *y, Real_t *z,
                                     Real_t *fx_elem, Real_t *fy_elem, Real_t *fz_elem,
                                     Real_t *sigxx, Real_t *sigyy, Real_t *sigzz,
                                     Real_t *determ)
{
  Real_t B[3][8] ;// shape function derivatives
  Real_t x_local[8] ;
  Real_t y_local[8] ;
  Real_t z_local[8] ;

  int k=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
  if (k<numElem) {
        // get nodal coordinates from global arrays and copy into local arrays.
    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
    {
            Index_t gnode = nodelist[k+lnode*numElem];
            x_local[lnode] = x[gnode];
            y_local[lnode] = y[gnode];
            z_local[lnode] = z[gnode];
    }

    /* Volume calculation involves extra work for numerical consistency. */
    CalcElemShapeFunctionDerivatives(x_local, y_local, z_local,
                                     B, &determ[k]);

    CalcElemNodeNormals( B[0] , B[1], B[2],
                         x_local, y_local, z_local );

    SumElemStressesToNodeForces( B, sigxx[k], sigyy[k], sigzz[k],
                                 &fx_elem[k], &fy_elem[k], &fz_elem[k], numElem ) ;
  }
}

__global__
void AddNodeForcesFromElems_kernel( Index_t numNode,
                                    Int_t *nodeElemCount, Index_t *nodeElemCornerList,
                                    Real_t *fx_elem, Real_t *fy_elem, Real_t *fz_elem,
                                    Real_t *fx_node, Real_t *fy_node, Real_t *fz_node)
{
  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
  if (i<numNode) {
        Int_t count=nodeElemCount[i];
        Real_t fx,fy,fz;
        fx=fy=fz=Real_t(0.0);
        for (int j=0;j<count;j++) {
          Index_t elem=nodeElemCornerList[i+numNode*j];
          fx+=fx_elem[elem]; fy+=fy_elem[elem]; fz+=fz_elem[elem];
        }
        fx_node[i]=fx; fy_node[i]=fy; fz_node[i]=fz;
  }
}

__global__
void AddNodeForcesFromElems2_kernel( Index_t numNode,
                                     Int_t *nodeElemCount, Index_t *nodeElemCornerList,
                                     Real_t *fx_elem, Real_t *fy_elem, Real_t *fz_elem,
                                     Real_t *fx_node, Real_t *fy_node, Real_t *fz_node)
{
  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
  if (i<numNode) {
        Int_t count=nodeElemCount[i];
        Real_t fx,fy,fz;
        fx=fy=fz=Real_t(0.0);
        for (int j=0;j<count;j++) {
          Index_t elem=nodeElemCornerList[i+numNode*j];
          fx+=fx_elem[elem]; fy+=fy_elem[elem]; fz+=fz_elem[elem];
        }
        fx_node[i]+=fx; fy_node[i]+=fy; fz_node[i]+=fz;
  }
}

static inline
void IntegrateStressForElems_gpu( Index_t numElem,
                                  Real_t *sigxx, Real_t *sigyy, Real_t *sigzz,
                                  Real_t *determ, int& badvol)
{
  Real_t *fx_elem,*fy_elem,*fz_elem;

  HIP( hipMalloc(&fx_elem,numElem*8*sizeof(Real_t)) );
  HIP( hipMalloc(&fy_elem,numElem*8*sizeof(Real_t)) );
  HIP( hipMalloc(&fz_elem,numElem*8*sizeof(Real_t)) );

  dim3 dimBlock=dim3(BLOCKSIZE,1,1);
  dim3 dimGrid=dim3(PAD_DIV(numElem,dimBlock.x),1,1);
  hipLaunchKernelGGL((IntegrateStressForElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, numElem, meshGPU.m_nodelist, meshGPU.m_x, meshGPU.m_y, meshGPU.m_z,
         fx_elem, fy_elem, fz_elem, sigxx, sigyy, sigzz, determ);
  HIP_DEBUGSYNC;

  dimGrid=dim3(PAD_DIV(mesh.numNode(),dimBlock.x),1,1);
  hipLaunchKernelGGL((AddNodeForcesFromElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, mesh.numNode(),meshGPU.m_nodeElemCount,meshGPU.m_nodeElemCornerList,
         fx_elem,fy_elem,fz_elem,meshGPU.m_fx,meshGPU.m_fy,meshGPU.m_fz);
  HIP_DEBUGSYNC;

  HIP( hipFree(fx_elem) );
  HIP( hipFree(fy_elem) );
  HIP( hipFree(fz_elem) );

  // JDC -- need a reduction step to check for non-positive element volumes
  badvol=0;
}

static inline
void IntegrateStressForElems_cpu( Index_t numElem,
                                  Real_t *sigxx, Real_t *sigyy, Real_t *sigzz,
                                  Real_t *determ, int& badvol)
{
  Real_t B[3][8] ;// shape function derivatives
  Real_t x_local[8] ;
  Real_t y_local[8] ;
  Real_t z_local[8] ;
  Real_t fx_local[8] ;
  Real_t fy_local[8] ;
  Real_t fz_local[8] ;

  // loop over all elements
  for( Index_t k=0 ; k<numElem ; ++k )
  {
          // get nodal coordinates from global arrays and copy into local arrays.
      for( Index_t lnode=0 ; lnode<8 ; ++lnode )
      {
          Index_t gnode = mesh.nodelist(k,lnode);
          x_local[lnode] = mesh.x(gnode);
          y_local[lnode] = mesh.y(gnode);
          z_local[lnode] = mesh.z(gnode);
      }

      /* Volume calculation involves extra work for numerical consistency. */
      CalcElemShapeFunctionDerivatives(x_local, y_local, z_local,
                                       B, &determ[k]);

      CalcElemNodeNormals( B[0] , B[1], B[2],
                           x_local, y_local, z_local );

      SumElemStressesToNodeForces( B, sigxx[k], sigyy[k], sigzz[k],
                                   fx_local, fy_local, fz_local, 1 ) ;

      // copy nodal force contributions to global force arrray.
      for( Index_t lnode=0 ; lnode<8 ; ++lnode )
      {
              Index_t gnode = mesh.nodelist(k,lnode);
              mesh.fx(gnode) += fx_local[lnode];
              mesh.fy(gnode) += fy_local[lnode];
              mesh.fz(gnode) += fz_local[lnode];
      }
  }

  badvol=0;
  for ( Index_t k=0 ; k<numElem ; ++k ) {
        if (determ[k] <= Real_t(0.0)) {
          badvol=1;
        }
  }
}

static inline
void IntegrateStressForElems( Index_t numElem,
                              Real_t *sigxx, Real_t *sigyy, Real_t *sigzz,
                              Real_t *determ, int& badvol, int useCPU)
{
  if (useCPU) {
        FC(nodelist); FC(x); FC(y); FC(z);
        IntegrateStressForElems_cpu(numElem,sigxx,sigyy,sigzz,determ,badvol);
        SG(fx); SG(fy); SG(fz);
  }
  else {
        FG(nodelist); FG(nodeElemCount); FG(nodeElemCornerList);
        FG(x); FG(y); FG(z);
        IntegrateStressForElems_gpu(numElem,sigxx,sigyy,sigzz,determ,badvol);
        SC(fx); SC(fy); SC(fz);
  }

}


static inline
void CollectDomainNodesToElemNodes(const Index_t elemNum,
                                   Real_t elemX[8],
                                   Real_t elemY[8],
                                   Real_t elemZ[8])
{
  Index_t nd0i = mesh.nodelist(elemNum,0) ;
  Index_t nd1i = mesh.nodelist(elemNum,1) ;
  Index_t nd2i = mesh.nodelist(elemNum,2) ;
  Index_t nd3i = mesh.nodelist(elemNum,3) ;
  Index_t nd4i = mesh.nodelist(elemNum,4) ;
  Index_t nd5i = mesh.nodelist(elemNum,5) ;
  Index_t nd6i = mesh.nodelist(elemNum,6) ;
  Index_t nd7i = mesh.nodelist(elemNum,7) ;

  elemX[0] = mesh.x(nd0i);
  elemX[1] = mesh.x(nd1i);
  elemX[2] = mesh.x(nd2i);
  elemX[3] = mesh.x(nd3i);
  elemX[4] = mesh.x(nd4i);
  elemX[5] = mesh.x(nd5i);
  elemX[6] = mesh.x(nd6i);
  elemX[7] = mesh.x(nd7i);

  elemY[0] = mesh.y(nd0i);
  elemY[1] = mesh.y(nd1i);
  elemY[2] = mesh.y(nd2i);
  elemY[3] = mesh.y(nd3i);
  elemY[4] = mesh.y(nd4i);
  elemY[5] = mesh.y(nd5i);
  elemY[6] = mesh.y(nd6i);
  elemY[7] = mesh.y(nd7i);

  elemZ[0] = mesh.z(nd0i);
  elemZ[1] = mesh.z(nd1i);
  elemZ[2] = mesh.z(nd2i);
  elemZ[3] = mesh.z(nd3i);
  elemZ[4] = mesh.z(nd4i);
  elemZ[5] = mesh.z(nd5i);
  elemZ[6] = mesh.z(nd6i);
  elemZ[7] = mesh.z(nd7i);

}


__host__
static inline
void VoluDer(const Real_t x0, const Real_t x1, const Real_t x2,
             const Real_t x3, const Real_t x4, const Real_t x5,
             const Real_t y0, const Real_t y1, const Real_t y2,
             const Real_t y3, const Real_t y4, const Real_t y5,
             const Real_t z0, const Real_t z1, const Real_t z2,
             const Real_t z3, const Real_t z4, const Real_t z5,
             Real_t* dvdx, Real_t* dvdy, Real_t* dvdz)
{
  const Real_t twelfth = Real_t(1.0) / Real_t(12.0) ;

  *dvdx =
        (y1 + y2) * (z0 + z1) - (y0 + y1) * (z1 + z2) +
        (y0 + y4) * (z3 + z4) - (y3 + y4) * (z0 + z4) -
        (y2 + y5) * (z3 + z5) + (y3 + y5) * (z2 + z5);
  *dvdy =
        - (x1 + x2) * (z0 + z1) + (x0 + x1) * (z1 + z2) -
        (x0 + x4) * (z3 + z4) + (x3 + x4) * (z0 + z4) +
        (x2 + x5) * (z3 + z5) - (x3 + x5) * (z2 + z5);

  *dvdz =
        - (y1 + y2) * (x0 + x1) + (y0 + y1) * (x1 + x2) -
        (y0 + y4) * (x3 + x4) + (y3 + y4) * (x0 + x4) +
        (y2 + y5) * (x3 + x5) - (y3 + y5) * (x2 + x5);

  *dvdx *= twelfth;
  *dvdy *= twelfth;
  *dvdz *= twelfth;
}

#if 0
__device__
static inline
void VOLUDER(const Real_t a0, const Real_t a1, const Real_t a2,
                         const Real_t a3, const Real_t a4, const Real_t a5,
                         const Real_t b0, const Real_t b1, const Real_t b2,
                         const Real_t b3, const Real_t b4, const Real_t b5,
                         Real_t& dvdc)
{
  const Real_t twelfth = Real_t(1.0) / Real_t(12.0) ;

     dvdc=
           (a1 + a2) * (b0 + b1) - (a0 + a1) * (b1 + b2) +
           (a0 + a4) * (b3 + b4) - (a3 + a4) * (b0 + b4) -
           (a2 + a5) * (b3 + b5) + (a3 + a5) * (b2 + b5);
         dvdc *= twelfth;
}
#else
// Even though the above version is inlined, it seems to prohibit some kind of compiler optimization.
// This macro version uses many fewer registers and avoids spill-over into local memory.
#define VOLUDER(a0,a1,a2,a3,a4,a5,b0,b1,b2,b3,b4,b5,dvdc)\
  {\
        const Real_t twelfth = Real_t(1.0) / Real_t(12.0) ;\
        \
        dvdc= \
          ((a1) + (a2)) * ((b0) + (b1)) - ((a0) + (a1)) * ((b1) + (b2)) +\
          ((a0) + (a4)) * ((b3) + (b4)) - ((a3) + (a4)) * ((b0) + (b4)) -\
          ((a2) + (a5)) * ((b3) + (b5)) + ((a3) + (a5)) * ((b2) + (b5));\
        dvdc *= twelfth;\
  }
#endif

__host__
static inline
void CalcElemVolumeDerivative(Real_t dvdx[8],
                              Real_t dvdy[8],
                              Real_t dvdz[8],
                              const Real_t x[8],
                              const Real_t y[8],
                              const Real_t z[8])
{
  VoluDer(x[1], x[2], x[3], x[4], x[5], x[7],
                  y[1], y[2], y[3], y[4], y[5], y[7],
                  z[1], z[2], z[3], z[4], z[5], z[7],
                  &dvdx[0], &dvdy[0], &dvdz[0]);
  VoluDer(x[0], x[1], x[2], x[7], x[4], x[6],
                  y[0], y[1], y[2], y[7], y[4], y[6],
                  z[0], z[1], z[2], z[7], z[4], z[6],
                  &dvdx[3], &dvdy[3], &dvdz[3]);
  VoluDer(x[3], x[0], x[1], x[6], x[7], x[5],
                  y[3], y[0], y[1], y[6], y[7], y[5],
                  z[3], z[0], z[1], z[6], z[7], z[5],
                  &dvdx[2], &dvdy[2], &dvdz[2]);
  VoluDer(x[2], x[3], x[0], x[5], x[6], x[4],
                  y[2], y[3], y[0], y[5], y[6], y[4],
                  z[2], z[3], z[0], z[5], z[6], z[4],
                  &dvdx[1], &dvdy[1], &dvdz[1]);
  VoluDer(x[7], x[6], x[5], x[0], x[3], x[1],
                  y[7], y[6], y[5], y[0], y[3], y[1],
                  z[7], z[6], z[5], z[0], z[3], z[1],
                  &dvdx[4], &dvdy[4], &dvdz[4]);
  VoluDer(x[4], x[7], x[6], x[1], x[0], x[2],
                  y[4], y[7], y[6], y[1], y[0], y[2],
                  z[4], z[7], z[6], z[1], z[0], z[2],
                  &dvdx[5], &dvdy[5], &dvdz[5]);
  VoluDer(x[5], x[4], x[7], x[2], x[1], x[3],
                  y[5], y[4], y[7], y[2], y[1], y[3],
                  z[5], z[4], z[7], z[2], z[1], z[3],
                  &dvdx[6], &dvdy[6], &dvdz[6]);
  VoluDer(x[6], x[5], x[4], x[3], x[2], x[0],
                  y[6], y[5], y[4], y[3], y[2], y[0],
                  z[6], z[5], z[4], z[3], z[2], z[0],
                  &dvdx[7], &dvdy[7], &dvdz[7]);
}

__device__
static inline
void CalcElemVolumeDerivative(Real_t& dvdx,
                              Real_t& dvdy,
                              Real_t& dvdz,
                              const Real_t x,
                              const Real_t y,
                              const Real_t z,
                              unsigned int node)
{
  __shared__ Real_t array1[256],array2[256];
  volatile Real_t *va1;
  volatile Real_t *va2;

  unsigned int idx,elem;
  unsigned int ind0,ind1,ind2,ind3,ind4,ind5;

  switch(node) {
  case 0:
        {ind0=1; ind1=2; ind2=3; ind3=4; ind4=5; ind5=7;
          break;}
  case 1:
        {ind0=2; ind1=3; ind2=0; ind3=5; ind4=6; ind5=4;
          break;}
  case 2:
        {ind0=3; ind1=0; ind2=1; ind3=6; ind4=7; ind5=5;
          break;}
  case 3:
        {ind0=0; ind1=1; ind2=2; ind3=7; ind4=4; ind5=6;
          break;}
  case 4:
        {ind0=7; ind1=6; ind2=5; ind3=0; ind4=3; ind5=1;
          break;}
  case 5:
        {ind0=4; ind1=7; ind2=6; ind3=1; ind4=0; ind5=2;
          break;}
  case 6:
        {ind0=5; ind1=4; ind2=7; ind3=2; ind4=1; ind5=3;
          break;}
  case 7:
        {ind0=6; ind1=5; ind2=4; ind3=3; ind4=2; ind5=0;
          break;}
  default:
        {ind0=ind1=ind2=ind3=ind4=ind5=0xFFFFFFFF;
          break;}
  }

  idx=hipThreadIdx_x;
  elem=idx /*& 0x1F*/ - node*32;

  va1=&array1[0];
  va2=&array2[0];

  // load y and z
  __syncthreads();
  va1[idx]=y; va2[idx]=z;
  __syncthreads();
  VOLUDER(va1[ind0*32+elem],va1[ind1*32+elem],va1[ind2*32+elem],
                  va1[ind3*32+elem],va1[ind4*32+elem],va1[ind5*32+elem],
                  va2[ind0*32+elem],va2[ind1*32+elem],va2[ind2*32+elem],
                  va2[ind3*32+elem],va2[ind4*32+elem],va2[ind5*32+elem],
                  dvdx);

  // load x
  __syncthreads();
  va1[idx]=x;
  __syncthreads();
  VOLUDER(va2[ind0*32+elem],va2[ind1*32+elem],va2[ind2*32+elem],
                  va2[ind3*32+elem],va2[ind4*32+elem],va2[ind5*32+elem],
                  va1[ind0*32+elem],va1[ind1*32+elem],va1[ind2*32+elem],
                  va1[ind3*32+elem],va1[ind4*32+elem],va1[ind5*32+elem],
                  dvdy);
  __syncthreads();

  // load y
  __syncthreads();
  va2[idx]=y;
  __syncthreads();
  VOLUDER(va1[ind0*32+elem],va1[ind1*32+elem],va1[ind2*32+elem],
                  va1[ind3*32+elem],va1[ind4*32+elem],va1[ind5*32+elem],
                  va2[ind0*32+elem],va2[ind1*32+elem],va2[ind2*32+elem],
                  va2[ind3*32+elem],va2[ind4*32+elem],va2[ind5*32+elem],
                  dvdz);
  __syncthreads();
}

__host__
static inline
void CalcElemFBHourglassForce(Real_t *xd, Real_t *yd, Real_t *zd,  Real_t *hourgam0,
                              Real_t *hourgam1, Real_t *hourgam2, Real_t *hourgam3,
                              Real_t *hourgam4, Real_t *hourgam5, Real_t *hourgam6,
                              Real_t *hourgam7, Real_t coefficient,
                              Real_t *hgfx, Real_t *hgfy, Real_t *hgfz )
{
  Index_t i00=0;
  Index_t i01=1;
  Index_t i02=2;
  Index_t i03=3;

     Real_t h00 =
                 hourgam0[i00] * xd[0] + hourgam1[i00] * xd[1] +
                 hourgam2[i00] * xd[2] + hourgam3[i00] * xd[3] +
                 hourgam4[i00] * xd[4] + hourgam5[i00] * xd[5] +
           hourgam6[i00] * xd[6] + hourgam7[i00] * xd[7];

            Real_t h01 =
                        hourgam0[i01] * xd[0] + hourgam1[i01] * xd[1] +
                        hourgam2[i01] * xd[2] + hourgam3[i01] * xd[3] +
                        hourgam4[i01] * xd[4] + hourgam5[i01] * xd[5] +
                  hourgam6[i01] * xd[6] + hourgam7[i01] * xd[7];

                Real_t h02 =
                  hourgam0[i02] * xd[0] + hourgam1[i02] * xd[1]+
                  hourgam2[i02] * xd[2] + hourgam3[i02] * xd[3]+
                  hourgam4[i02] * xd[4] + hourgam5[i02] * xd[5]+
                  hourgam6[i02] * xd[6] + hourgam7[i02] * xd[7];

                Real_t h03 =
                  hourgam0[i03] * xd[0] + hourgam1[i03] * xd[1] +
                  hourgam2[i03] * xd[2] + hourgam3[i03] * xd[3] +
                  hourgam4[i03] * xd[4] + hourgam5[i03] * xd[5] +
                  hourgam6[i03] * xd[6] + hourgam7[i03] * xd[7];

                hgfx[0] = coefficient *
                  (hourgam0[i00] * h00 + hourgam0[i01] * h01 +
                   hourgam0[i02] * h02 + hourgam0[i03] * h03);

                hgfx[1] = coefficient *
                  (hourgam1[i00] * h00 + hourgam1[i01] * h01 +
                   hourgam1[i02] * h02 + hourgam1[i03] * h03);

                hgfx[2] = coefficient *
                  (hourgam2[i00] * h00 + hourgam2[i01] * h01 +
                   hourgam2[i02] * h02 + hourgam2[i03] * h03);

                hgfx[3] = coefficient *
                  (hourgam3[i00] * h00 + hourgam3[i01] * h01 +
                   hourgam3[i02] * h02 + hourgam3[i03] * h03);

                hgfx[4] = coefficient *
                  (hourgam4[i00] * h00 + hourgam4[i01] * h01 +
                   hourgam4[i02] * h02 + hourgam4[i03] * h03);

                hgfx[5] = coefficient *
                  (hourgam5[i00] * h00 + hourgam5[i01] * h01 +
                   hourgam5[i02] * h02 + hourgam5[i03] * h03);

                hgfx[6] = coefficient *
                  (hourgam6[i00] * h00 + hourgam6[i01] * h01 +
                   hourgam6[i02] * h02 + hourgam6[i03] * h03);

                hgfx[7] = coefficient *
                  (hourgam7[i00] * h00 + hourgam7[i01] * h01 +
                   hourgam7[i02] * h02 + hourgam7[i03] * h03);

                h00 =
                  hourgam0[i00] * yd[0] + hourgam1[i00] * yd[1] +
                  hourgam2[i00] * yd[2] + hourgam3[i00] * yd[3] +
                  hourgam4[i00] * yd[4] + hourgam5[i00] * yd[5] +
                  hourgam6[i00] * yd[6] + hourgam7[i00] * yd[7];

                h01 =
                  hourgam0[i01] * yd[0] + hourgam1[i01] * yd[1] +
                  hourgam2[i01] * yd[2] + hourgam3[i01] * yd[3] +
                  hourgam4[i01] * yd[4] + hourgam5[i01] * yd[5] +
                  hourgam6[i01] * yd[6] + hourgam7[i01] * yd[7];

                h02 =
                  hourgam0[i02] * yd[0] + hourgam1[i02] * yd[1]+
                  hourgam2[i02] * yd[2] + hourgam3[i02] * yd[3]+
                  hourgam4[i02] * yd[4] + hourgam5[i02] * yd[5]+
                  hourgam6[i02] * yd[6] + hourgam7[i02] * yd[7];

                h03 =
                  hourgam0[i03] * yd[0] + hourgam1[i03] * yd[1] +
                  hourgam2[i03] * yd[2] + hourgam3[i03] * yd[3] +
                  hourgam4[i03] * yd[4] + hourgam5[i03] * yd[5] +
                  hourgam6[i03] * yd[6] + hourgam7[i03] * yd[7];


                hgfy[0] = coefficient *
                  (hourgam0[i00] * h00 + hourgam0[i01] * h01 +
                   hourgam0[i02] * h02 + hourgam0[i03] * h03);

                hgfy[1] = coefficient *
                  (hourgam1[i00] * h00 + hourgam1[i01] * h01 +
                   hourgam1[i02] * h02 + hourgam1[i03] * h03);

                hgfy[2] = coefficient *
                  (hourgam2[i00] * h00 + hourgam2[i01] * h01 +
                   hourgam2[i02] * h02 + hourgam2[i03] * h03);

                hgfy[3] = coefficient *
                  (hourgam3[i00] * h00 + hourgam3[i01] * h01 +
                   hourgam3[i02] * h02 + hourgam3[i03] * h03);

                hgfy[4] = coefficient *
                  (hourgam4[i00] * h00 + hourgam4[i01] * h01 +
                   hourgam4[i02] * h02 + hourgam4[i03] * h03);

                hgfy[5] = coefficient *
                  (hourgam5[i00] * h00 + hourgam5[i01] * h01 +
                   hourgam5[i02] * h02 + hourgam5[i03] * h03);

                hgfy[6] = coefficient *
                  (hourgam6[i00] * h00 + hourgam6[i01] * h01 +
                   hourgam6[i02] * h02 + hourgam6[i03] * h03);

                hgfy[7] = coefficient *
                  (hourgam7[i00] * h00 + hourgam7[i01] * h01 +
                   hourgam7[i02] * h02 + hourgam7[i03] * h03);

                h00 =
                  hourgam0[i00] * zd[0] + hourgam1[i00] * zd[1] +
                  hourgam2[i00] * zd[2] + hourgam3[i00] * zd[3] +
                  hourgam4[i00] * zd[4] + hourgam5[i00] * zd[5] +
                  hourgam6[i00] * zd[6] + hourgam7[i00] * zd[7];

                h01 =
                  hourgam0[i01] * zd[0] + hourgam1[i01] * zd[1] +
                  hourgam2[i01] * zd[2] + hourgam3[i01] * zd[3] +
                  hourgam4[i01] * zd[4] + hourgam5[i01] * zd[5] +
                  hourgam6[i01] * zd[6] + hourgam7[i01] * zd[7];

                h02 =
                  hourgam0[i02] * zd[0] + hourgam1[i02] * zd[1]+
                  hourgam2[i02] * zd[2] + hourgam3[i02] * zd[3]+
                  hourgam4[i02] * zd[4] + hourgam5[i02] * zd[5]+
                  hourgam6[i02] * zd[6] + hourgam7[i02] * zd[7];

                h03 =
                  hourgam0[i03] * zd[0] + hourgam1[i03] * zd[1] +
                  hourgam2[i03] * zd[2] + hourgam3[i03] * zd[3] +
                  hourgam4[i03] * zd[4] + hourgam5[i03] * zd[5] +
                  hourgam6[i03] * zd[6] + hourgam7[i03] * zd[7];


                hgfz[0] = coefficient *
                  (hourgam0[i00] * h00 + hourgam0[i01] * h01 +
                   hourgam0[i02] * h02 + hourgam0[i03] * h03);

                hgfz[1] = coefficient *
                  (hourgam1[i00] * h00 + hourgam1[i01] * h01 +
                   hourgam1[i02] * h02 + hourgam1[i03] * h03);

                hgfz[2] = coefficient *
                  (hourgam2[i00] * h00 + hourgam2[i01] * h01 +
                   hourgam2[i02] * h02 + hourgam2[i03] * h03);

                hgfz[3] = coefficient *
                  (hourgam3[i00] * h00 + hourgam3[i01] * h01 +
                   hourgam3[i02] * h02 + hourgam3[i03] * h03);

                hgfz[4] = coefficient *
                  (hourgam4[i00] * h00 + hourgam4[i01] * h01 +
                   hourgam4[i02] * h02 + hourgam4[i03] * h03);

                hgfz[5] = coefficient *
                  (hourgam5[i00] * h00 + hourgam5[i01] * h01 +
                   hourgam5[i02] * h02 + hourgam5[i03] * h03);

                hgfz[6] = coefficient *
                  (hourgam6[i00] * h00 + hourgam6[i01] * h01 +
                   hourgam6[i02] * h02 + hourgam6[i03] * h03);

                hgfz[7] = coefficient *
                  (hourgam7[i00] * h00 + hourgam7[i01] * h01 +
                   hourgam7[i02] * h02 + hourgam7[i03] * h03);
}


__device__
static inline
Real_t SumOverNodes(Real_t val) {
  __shared__ Real_t shm_array[32*8];

  // Sum up 8 node values for each element
  // Assumes 256 threads: 32 elements, 8 nodes per element.
  // NOTE: we could probably avoid some of the __syncthreads() if we map 8 nodes
  //       of an element to the same warp.
  unsigned int tid=hipThreadIdx_x;

#if 1
#if 0
  unsigned int node=tid>>5;
  unsigned int elem=tid-(node<<5);
#elif 1
  unsigned int node=tid/32;
  unsigned int elem=tid-(node*32);
#else
  unsigned int elem=tid & 0x1F;
#endif
  __syncthreads();
  shm_array[tid]=val;
  __syncthreads();
  if (tid<128) shm_array[tid]+=shm_array[tid+128];
  __syncthreads();
  if (tid<64)  shm_array[tid]+=shm_array[tid+64];
  __syncthreads();
  if (tid<32)  shm_array[tid]+=shm_array[tid+32];
  __syncthreads();
  Real_t ret=shm_array[elem];
  __syncthreads();
  return ret;
#else
#if 0
  unsigned int node=tid>>5;
  unsigned int elem=tid-(node<<5);
#else
  unsigned int node=tid/32;
  unsigned int elem=tid-(node*32);
#endif
  unsigned int idx=elem*8+node;
  __syncthreads();
  shm_array[idx]=val;
  __syncthreads();
  if (node<4) shm_array[idx]+=shm_array[idx+4];
  if (node<2) shm_array[idx]+=shm_array[idx+2];
  if (node<1) shm_array[idx]+=shm_array[idx+1];
  __syncthreads();
  return shm_array[elem*8];
#endif
}

__device__
static inline
void CalcElemFBHourglassForce(Real_t xd,Real_t yd,Real_t zd,
                              Real_t *hourgam,Real_t coefficient,
                              Real_t &hgfx, Real_t &hgfy, Real_t &hgfz)
{
  hgfx=0;
  for (int i=0;i<4;i++) {
        Real_t h;
        h=hourgam[i]*xd;
        h=SumOverNodes(h);
        hgfx+=hourgam[i]*h;
  }
  hgfx *= coefficient;

  hgfy=0;
  for (int i=0;i<4;i++) {
        Real_t h;
        h=hourgam[i]*yd;
        h=SumOverNodes(h);
        hgfy+=hourgam[i]*h;
  }
  hgfy *= coefficient;

  hgfz=0;
  for (int i=0;i<4;i++) {
        Real_t h;
        h=hourgam[i]*zd;
        h=SumOverNodes(h);
        hgfz+=hourgam[i]*h;
  }
  hgfz *= coefficient;
}

__global__
void CalcFBHourglassForceForElems_kernel(
                                         Real_t *determ,
                                         Real_t *x8n,      Real_t *y8n,      Real_t *z8n,
                                         Real_t *dvdx,     Real_t *dvdy,     Real_t *dvdz,
                                         Real_t hourg,
                                         Index_t numElem, Index_t *nodelist,
                                         Real_t *ss, Real_t *elemMass,
                                         Real_t *xd, Real_t *yd, Real_t *zd,
                                         Real_t *fx_elem, Real_t *fy_elem, Real_t *fz_elem)
{
  /*************************************************
   *
   *     FUNCTION: Calculates the Flanagan-Belytschko anti-hourglass
   *               force.
   *
   *************************************************/

  Real_t hgfx, hgfy, hgfz;

  Real_t coefficient;

  Real_t hourgam[4];
  Real_t xd1, yd1, zd1;

  /*************************************************/
  /*    compute the hourglass modes */

  const Real_t posf = Real_t( 1.);
  const Real_t negf = Real_t(-1.);

  // Assume we will launch 256 threads, which we map to 32 elements, each
  // with 8 per-node threads. Organize so each warp of 32 consecutive
  // threads operates on the same node of different elements.


  // THESE ARE ALL GIVING ME DIFFERENT ANSWERS IN CUDA 4.0 !!?!!?!!
  unsigned int tid=hipThreadIdx_x;
  unsigned int bid=hipBlockIdx_x;
#if 0
  unsigned int node=tid>>5;
  unsigned int elem=bid<<5 + (tid - (node<<5));
#elif 1
  unsigned int node=tid/32;
  unsigned int elem=bid*32 + (tid-node*32);
#elif 0
  unsigned int node=tid/32;;
  unsigned int elem=bid*32 + (tid & 0x1F);
#elif 0
  unsigned int node=tid/32;
  unsigned int elem=bid<<5 + (tid & 0x1F);
#elif 0
  unsigned int node=tid>>5;
  unsigned int elem=bid*32 + (tid & 0x1F);
#else
  unsigned int node=tid>>5;
  unsigned int elem=bid<<5 + (tid & 0x1F);
#endif

  if (elem>=numElem) elem=numElem-1; // don't return -- need thread to participate in sync operations

  //if (elem<0) elem=0; // debugging test

  Real_t volinv=Real_t(1.0)/determ[elem];
  Real_t ss1, mass1, volume13 ;

  Real_t xn,yn,zn,dvdxn,dvdyn,dvdzn;
  Real_t hourmodx, hourmody, hourmodz;


#if 1
  xn=x8n[elem+numElem*node]; yn=y8n[elem+numElem*node]; zn=z8n[elem+numElem*node];
  dvdxn=dvdx[elem+numElem*node]; dvdyn=dvdy[elem+numElem*node]; dvdzn=dvdz[elem+numElem*node];
#else
  xn=yn=zn=posf; dvdxn=dvdyn=dvdzn=negf;
#endif

#if 1
  hourmodx=xn; hourmody=yn; hourmodz=zn;
  if (node==2 || node==3 || node==4 || node==5) {
        hourmodx *= negf; hourmody *= negf; hourmodz *= negf;
        hourgam[0] = negf;
  }
  else hourgam[0] = posf;
  hourmodx = SumOverNodes(hourmodx);
  hourmody = SumOverNodes(hourmody);
  hourmodz = SumOverNodes(hourmodz);
  hourgam[0] -= volinv*(dvdxn*hourmodx + dvdyn*hourmody + dvdzn*hourmodz);


  hourmodx=xn; hourmody=yn; hourmodz=zn;
  if (node==1 || node==2 || node==4 || node==7) {
        hourmodx *= negf; hourmody *= negf; hourmodz *= negf;
        hourgam[1] = negf;
  }
  else hourgam[1] = posf;
  hourmodx = SumOverNodes(hourmodx);
  hourmody = SumOverNodes(hourmody);
  hourmodz = SumOverNodes(hourmodz);
  hourgam[1] -= volinv*(dvdxn*hourmodx + dvdyn*hourmody + dvdzn*hourmodz);


  hourmodx=xn; hourmody=yn; hourmodz=zn;
  if (node==1 || node==3 || node==5 || node==7) {
        hourmodx *= negf; hourmody *= negf; hourmodz *= negf;
        hourgam[2] = negf;
  }
  else hourgam[2] = posf;
  hourmodx = SumOverNodes(hourmodx);
  hourmody = SumOverNodes(hourmody);
  hourmodz = SumOverNodes(hourmodz);
  hourgam[2] -= volinv*(dvdxn*hourmodx + dvdyn*hourmody + dvdzn*hourmodz);


  hourmodx=xn; hourmody=yn; hourmodz=zn;
  if (node==0 || node==2 || node==5 || node==7) {
        hourmodx *= negf; hourmody *= negf; hourmodz *= negf;
        hourgam[3] = negf;
  }
  else hourgam[3] = posf;
  hourmodx = SumOverNodes(hourmodx);
  hourmody = SumOverNodes(hourmody);
  hourmodz = SumOverNodes(hourmodz);
  hourgam[3] -= volinv*(dvdxn*hourmodx + dvdyn*hourmody + dvdzn*hourmodz);


  /* compute forces */
  /* store forces into h arrays (force arrays) */

  ss1=ss[elem];
  mass1=elemMass[elem];
  volume13=CBRT(determ[elem]);

  Index_t ni = nodelist[elem+numElem*node];
  xd1=xd[ni]; yd1=yd[ni]; zd1=zd[ni];

  coefficient = - hourg * Real_t(0.01) * ss1 * mass1 / volume13;

  CalcElemFBHourglassForce(xd1,yd1,zd1,hourgam,coefficient,hgfx,hgfy,hgfz);
#else
  hgfx=xn+dvdxn; hgfy=yn+dvdyn; hgfz=zn+dvdzn;
#endif
#if 1
  fx_elem[elem+numElem*node]=hgfx; fy_elem[elem+numElem*node]=hgfy; fz_elem[elem+numElem*node]=hgfz;
#else
  fx_elem[0]=hgfx; fy_elem[0]=hgfy; fz_elem[0]=hgfz;
#endif
}


static inline
void CalcFBHourglassForceForElems_cpu(Real_t *determ,
                                      Real_t *x8n,      Real_t *y8n,      Real_t *z8n,
                                      Real_t *dvdx,     Real_t *dvdy,     Real_t *dvdz,
                                      Real_t hourg)
{
  /*************************************************
   *
   *     FUNCTION: Calculates the Flanagan-Belytschko anti-hourglass
   *               force.
   *
   *************************************************/

  Index_t numElem = mesh.numElem() ;

  Real_t hgfx[8], hgfy[8], hgfz[8] ;

  Real_t coefficient;

  Real_t  gamma[4][8];
  Real_t hourgam0[4], hourgam1[4], hourgam2[4], hourgam3[4] ;
  Real_t hourgam4[4], hourgam5[4], hourgam6[4], hourgam7[4];
  Real_t xd1[8], yd1[8], zd1[8] ;

  gamma[0][0] = Real_t( 1.);
  gamma[0][1] = Real_t( 1.);
  gamma[0][2] = Real_t(-1.);
  gamma[0][3] = Real_t(-1.);
  gamma[0][4] = Real_t(-1.);
  gamma[0][5] = Real_t(-1.);
  gamma[0][6] = Real_t( 1.);
  gamma[0][7] = Real_t( 1.);
  gamma[1][0] = Real_t( 1.);
  gamma[1][1] = Real_t(-1.);
  gamma[1][2] = Real_t(-1.);
  gamma[1][3] = Real_t( 1.);
  gamma[1][4] = Real_t(-1.);
  gamma[1][5] = Real_t( 1.);
  gamma[1][6] = Real_t( 1.);
  gamma[1][7] = Real_t(-1.);
  gamma[2][0] = Real_t( 1.);
  gamma[2][1] = Real_t(-1.);
  gamma[2][2] = Real_t( 1.);
  gamma[2][3] = Real_t(-1.);
  gamma[2][4] = Real_t( 1.);
  gamma[2][5] = Real_t(-1.);
  gamma[2][6] = Real_t( 1.);
  gamma[2][7] = Real_t(-1.);
  gamma[3][0] = Real_t(-1.);
  gamma[3][1] = Real_t( 1.);
  gamma[3][2] = Real_t(-1.);
  gamma[3][3] = Real_t( 1.);
  gamma[3][4] = Real_t( 1.);
  gamma[3][5] = Real_t(-1.);
  gamma[3][6] = Real_t( 1.);
  gamma[3][7] = Real_t(-1.);

  /*************************************************/
  /*    compute the hourglass modes */


  for(Index_t i2=0;i2<numElem;++i2){
        Index_t i3=8*i2;
        Real_t volinv=Real_t(1.0)/determ[i2];
        Real_t ss1, mass1, volume13 ;
        for(Index_t i1=0;i1<4;++i1){

          Real_t hourmodx =
                x8n[i3] * gamma[i1][0] + x8n[i3+1] * gamma[i1][1] +
                x8n[i3+2] * gamma[i1][2] + x8n[i3+3] * gamma[i1][3] +
                x8n[i3+4] * gamma[i1][4] + x8n[i3+5] * gamma[i1][5] +
                x8n[i3+6] * gamma[i1][6] + x8n[i3+7] * gamma[i1][7];

          Real_t hourmody =
                y8n[i3] * gamma[i1][0] + y8n[i3+1] * gamma[i1][1] +
                y8n[i3+2] * gamma[i1][2] + y8n[i3+3] * gamma[i1][3] +
                y8n[i3+4] * gamma[i1][4] + y8n[i3+5] * gamma[i1][5] +
                y8n[i3+6] * gamma[i1][6] + y8n[i3+7] * gamma[i1][7];

          Real_t hourmodz =
                z8n[i3] * gamma[i1][0] + z8n[i3+1] * gamma[i1][1] +
                z8n[i3+2] * gamma[i1][2] + z8n[i3+3] * gamma[i1][3] +
                z8n[i3+4] * gamma[i1][4] + z8n[i3+5] * gamma[i1][5] +
                z8n[i3+6] * gamma[i1][6] + z8n[i3+7] * gamma[i1][7];

          hourgam0[i1] = gamma[i1][0] -  volinv*(dvdx[i3  ] * hourmodx +
                                                 dvdy[i3  ] * hourmody +
                                                 dvdz[i3  ] * hourmodz );

          hourgam1[i1] = gamma[i1][1] -  volinv*(dvdx[i3+1] * hourmodx +
                                                 dvdy[i3+1] * hourmody +
                                                 dvdz[i3+1] * hourmodz );

          hourgam2[i1] = gamma[i1][2] -  volinv*(dvdx[i3+2] * hourmodx +
                                                 dvdy[i3+2] * hourmody +
                                                 dvdz[i3+2] * hourmodz );

          hourgam3[i1] = gamma[i1][3] -  volinv*(dvdx[i3+3] * hourmodx +
                                                 dvdy[i3+3] * hourmody +
                                                 dvdz[i3+3] * hourmodz );

          hourgam4[i1] = gamma[i1][4] -  volinv*(dvdx[i3+4] * hourmodx +
                                                 dvdy[i3+4] * hourmody +
                                                 dvdz[i3+4] * hourmodz );

          hourgam5[i1] = gamma[i1][5] -  volinv*(dvdx[i3+5] * hourmodx +
                                                 dvdy[i3+5] * hourmody +
                                                 dvdz[i3+5] * hourmodz );

          hourgam6[i1] = gamma[i1][6] -  volinv*(dvdx[i3+6] * hourmodx +
                                                 dvdy[i3+6] * hourmody +
                                                 dvdz[i3+6] * hourmodz );

          hourgam7[i1] = gamma[i1][7] -  volinv*(dvdx[i3+7] * hourmodx +
                                                 dvdy[i3+7] * hourmody +
                                                 dvdz[i3+7] * hourmodz );

        }

        /* compute forces */
        /* store forces into h arrays (force arrays) */

        ss1=mesh.ss(i2);
        mass1=mesh.elemMass(i2);
        volume13=CBRT(determ[i2]);

        Index_t n0si2 = mesh.nodelist(i2,0);
        Index_t n1si2 = mesh.nodelist(i2,1);
        Index_t n2si2 = mesh.nodelist(i2,2);
        Index_t n3si2 = mesh.nodelist(i2,3);
        Index_t n4si2 = mesh.nodelist(i2,4);
        Index_t n5si2 = mesh.nodelist(i2,5);
        Index_t n6si2 = mesh.nodelist(i2,6);
        Index_t n7si2 = mesh.nodelist(i2,7);

        xd1[0] = mesh.xd(n0si2);
        xd1[1] = mesh.xd(n1si2);
        xd1[2] = mesh.xd(n2si2);
        xd1[3] = mesh.xd(n3si2);
        xd1[4] = mesh.xd(n4si2);
        xd1[5] = mesh.xd(n5si2);
        xd1[6] = mesh.xd(n6si2);
        xd1[7] = mesh.xd(n7si2);

        yd1[0] = mesh.yd(n0si2);
        yd1[1] = mesh.yd(n1si2);
        yd1[2] = mesh.yd(n2si2);
        yd1[3] = mesh.yd(n3si2);
        yd1[4] = mesh.yd(n4si2);
        yd1[5] = mesh.yd(n5si2);
        yd1[6] = mesh.yd(n6si2);
        yd1[7] = mesh.yd(n7si2);

        zd1[0] = mesh.zd(n0si2);
        zd1[1] = mesh.zd(n1si2);
        zd1[2] = mesh.zd(n2si2);
        zd1[3] = mesh.zd(n3si2);
        zd1[4] = mesh.zd(n4si2);
        zd1[5] = mesh.zd(n5si2);
        zd1[6] = mesh.zd(n6si2);
        zd1[7] = mesh.zd(n7si2);

        coefficient = - hourg * Real_t(0.01) * ss1 * mass1 / volume13;

        CalcElemFBHourglassForce(xd1,yd1,zd1,
                                 hourgam0,hourgam1,hourgam2,hourgam3,
                                 hourgam4,hourgam5,hourgam6,hourgam7,
                                 coefficient, hgfx, hgfy, hgfz);

        mesh.fx(n0si2) += hgfx[0];
        mesh.fy(n0si2) += hgfy[0];
        mesh.fz(n0si2) += hgfz[0];

        mesh.fx(n1si2) += hgfx[1];
        mesh.fy(n1si2) += hgfy[1];
        mesh.fz(n1si2) += hgfz[1];

        mesh.fx(n2si2) += hgfx[2];
        mesh.fy(n2si2) += hgfy[2];
        mesh.fz(n2si2) += hgfz[2];

        mesh.fx(n3si2) += hgfx[3];
        mesh.fy(n3si2) += hgfy[3];
        mesh.fz(n3si2) += hgfz[3];

        mesh.fx(n4si2) += hgfx[4];
        mesh.fy(n4si2) += hgfy[4];
        mesh.fz(n4si2) += hgfz[4];

        mesh.fx(n5si2) += hgfx[5];
        mesh.fy(n5si2) += hgfy[5];
        mesh.fz(n5si2) += hgfz[5];

        mesh.fx(n6si2) += hgfx[6];
        mesh.fy(n6si2) += hgfy[6];
        mesh.fz(n6si2) += hgfz[6];

        mesh.fx(n7si2) += hgfx[7];
        mesh.fy(n7si2) += hgfy[7];
        mesh.fz(n7si2) += hgfz[7];
  }
}

static inline
void CalcFBHourglassForceForElems_gpu(Real_t *determ,
                                      Real_t *x8n,      Real_t *y8n,      Real_t *z8n,
                                      Real_t *dvdx,     Real_t *dvdy,     Real_t *dvdz,
                                      Real_t hourg)
{
  Index_t numElem = mesh.numElem();
  Real_t *fx_elem,*fy_elem,*fz_elem;

  HIP( hipMalloc(&fx_elem,numElem*8*sizeof(Real_t)) );
  HIP( hipMalloc(&fy_elem,numElem*8*sizeof(Real_t)) );
  HIP( hipMalloc(&fz_elem,numElem*8*sizeof(Real_t)) );

  dim3 dimBlock=dim3(256,1,1);
  dim3 dimGrid=dim3(PAD_DIV(numElem*8,dimBlock.x),1,1);
  hipLaunchKernelGGL((CalcFBHourglassForceForElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0,
                     determ,x8n,y8n,z8n,dvdx,dvdy,dvdz,hourg,
                     numElem,meshGPU.m_nodelist,
                     meshGPU.m_ss,meshGPU.m_elemMass,
                     meshGPU.m_xd,meshGPU.m_yd,meshGPU.m_zd,
                     fx_elem,fy_elem,fz_elem);
  HIP_DEBUGSYNC;

  dimGrid=dim3(PAD_DIV(mesh.numNode(),dimBlock.x),1,1);
  hipLaunchKernelGGL((AddNodeForcesFromElems2_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, mesh.numNode(),meshGPU.m_nodeElemCount,meshGPU.m_nodeElemCornerList,
         fx_elem,fy_elem,fz_elem,meshGPU.m_fx,meshGPU.m_fy,meshGPU.m_fz);
  HIP_DEBUGSYNC;

  HIP( hipFree(fx_elem) );
  HIP( hipFree(fy_elem) );
  HIP( hipFree(fz_elem) );
}


__global__
void CalcHourglassControlForElems_kernel(Int_t numElem,Index_t *nodelist,
                                         Real_t *x,Real_t *y,Real_t *z,
                                         Real_t *determ,Real_t *volo,Real_t *v,
                                         Real_t *dvdx,Real_t *dvdy,Real_t *dvdz,
                                         Real_t *x8n,Real_t *y8n,Real_t *z8n)
{
  Real_t  x1,y1,z1;
  Real_t pfx,pfy,pfz;

  // THESE ARE ALL GIVING ME DIFFERENT ANSWERS IN CUDA 4.0 !!?!!?!!
  unsigned int tid=hipThreadIdx_x;
  unsigned int bid=hipBlockIdx_x;
#if 0
  unsigned int node=tid>>5;
  unsigned int elem=bid<<5 + (tid - (node<<5));
#elif 1
  unsigned int node=tid/32;
  unsigned int elem=bid*32 + (tid-node*32);
#elif 0
  unsigned int node=tid/32;;
  unsigned int elem=bid*32 + (tid & 0x1F);
#elif 0
  unsigned int node=tid/32;
  unsigned int elem=bid<<5 + (tid & 0x1F);
#elif 0
  unsigned int node=tid>>5;
  unsigned int elem=bid*32 + (tid & 0x1F);
#else
  unsigned int node=tid>>5;
  unsigned int elem=bid<<5 + (tid & 0x1F);
#endif

  if (elem>=numElem) elem=numElem-1; // don't return -- need thread to participate in sync operations

  Index_t idx=elem+numElem*node;

  Index_t ni = nodelist[idx];
  x1=x[ni]; y1=y[ni]; z1=z[ni];

  CalcElemVolumeDerivative(pfx, pfy, pfz, x1, y1, z1, node);

  /* load into temporary storage for FB Hour Glass control */

  dvdx[idx] = pfx;
  dvdy[idx] = pfy;
  dvdz[idx] = pfz;
  x8n[idx]  = x1;
  y8n[idx]  = y1;
  z8n[idx]  = z1;

  //if (node==0)
  determ[elem] = volo[elem] * v[elem];

#if 0 // JDC
  /* Do a check for negative volumes */
  if ( mesh.v(i) <= Real_t(0.0) ) {
        exit(VolumeError) ;
  }
#endif
}


static inline
void CalcHourglassControlForElems_gpu(Real_t determ[], Real_t hgcoef)
{
  Index_t numElem = mesh.numElem() ;
  Index_t numElem8 = numElem * 8 ;
  Real_t *dvdx,*dvdy,*dvdz;
  Real_t *x8n,*y8n,*z8n;

  HIP( hipMalloc(&dvdx,sizeof(Real_t)*numElem8) );
  HIP( hipMalloc(&dvdy,sizeof(Real_t)*numElem8) );
  HIP( hipMalloc(&dvdz,sizeof(Real_t)*numElem8) );
  HIP( hipMalloc(&x8n,sizeof(Real_t)*numElem8) );
  HIP( hipMalloc(&y8n,sizeof(Real_t)*numElem8) );
  HIP( hipMalloc(&z8n,sizeof(Real_t)*numElem8) );

  dim3 dimBlock=dim3(256,1,1);
  dim3 dimGrid=dim3(PAD_DIV(numElem*8,dimBlock.x),1,1);
  hipLaunchKernelGGL((CalcHourglassControlForElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, numElem, meshGPU.m_nodelist,
         meshGPU.m_x,meshGPU.m_y,meshGPU.m_z,
         determ,meshGPU.m_volo,meshGPU.m_v,
         dvdx,dvdy,dvdz,x8n,y8n,z8n);
  HIP_DEBUGSYNC;

  // JDC -- need a reduction to check for negative volumes

  if ( hgcoef > Real_t(0.) ) {
        CalcFBHourglassForceForElems_gpu(determ,x8n,y8n,z8n,dvdx,dvdy,dvdz,hgcoef) ;
  }

  HIP( hipFree(dvdx) );
  HIP( hipFree(dvdy) );
  HIP( hipFree(dvdz) );
  HIP( hipFree(x8n) );
  HIP( hipFree(y8n) );
  HIP( hipFree(z8n) );

  return ;
}


static inline
void CalcHourglassControlForElems_cpu(Real_t determ[], Real_t hgcoef)
{
  Index_t i, ii, jj ;
  Real_t  x1[8],  y1[8],  z1[8] ;
  Real_t pfx[8], pfy[8], pfz[8] ;
  Index_t numElem = mesh.numElem() ;
  Index_t numElem8 = numElem * 8 ;
  Real_t *dvdx = Allocate<Real_t>(numElem8) ;
  Real_t *dvdy = Allocate<Real_t>(numElem8) ;
  Real_t *dvdz = Allocate<Real_t>(numElem8) ;
  Real_t *x8n  = Allocate<Real_t>(numElem8) ;
  Real_t *y8n  = Allocate<Real_t>(numElem8) ;
  Real_t *z8n  = Allocate<Real_t>(numElem8) ;

  /* start loop over elements */
  for (i=0 ; i<numElem ; ++i){

        CollectDomainNodesToElemNodes(i, x1, y1, z1);

        CalcElemVolumeDerivative(pfx, pfy, pfz, x1, y1, z1);

        /* load into temporary storage for FB Hour Glass control */
        for(ii=0;ii<8;++ii){
          jj=8*i+ii;

          dvdx[jj] = pfx[ii];
          dvdy[jj] = pfy[ii];
          dvdz[jj] = pfz[ii];
          x8n[jj]  = x1[ii];
          y8n[jj]  = y1[ii];
          z8n[jj]  = z1[ii];
        }

        determ[i] = mesh.volo(i) * mesh.v(i);

        /* Do a check for negative volumes */
        if ( mesh.v(i) <= Real_t(0.0) ) {
          exit(VolumeError) ;
        }
  }

  if ( hgcoef > Real_t(0.) ) {
        CalcFBHourglassForceForElems_cpu(determ,x8n,y8n,z8n,dvdx,dvdy,dvdz,hgcoef) ;
  }

  Release(&z8n) ;
  Release(&y8n) ;
  Release(&x8n) ;
  Release(&dvdz) ;
  Release(&dvdy) ;
  Release(&dvdx) ;

  return ;
}


static inline
void CalcHourglassControlForElems(Real_t determ[], Real_t hgcoef, int useCPU)
{
  if (useCPU) {
        FC(x); FC(y); FC(z); FC(xd); FC(yd); FC(zd);
        FC(nodelist); FC(ss); FC(elemMass);
        FC(xd); FC(yd); FC(zd);
        FC(fx); FC(fy); FC(fz);
        CalcHourglassControlForElems_cpu(determ,hgcoef);
        SG(fx); SG(fy); SG(fz);
  }
  else {
        FG(x); FG(y); FG(z); FG(xd); FG(yd); FG(zd);
        FG(nodelist); FG(ss); FG(elemMass);
        FG(xd); FG(yd); FG(zd);
        FG(fx); FG(fy); FG(fz);
        CalcHourglassControlForElems_gpu(determ,hgcoef);
        SC(fx); SC(fy); SC(fz);
  }
}


static inline
void CalcVolumeForceForElems_gpu()
{
  Index_t numElem = mesh.numElem() ;
  if (numElem != 0) {
        Real_t  hgcoef = mesh.hgcoef() ;
        Real_t *sigxx, *sigyy, *sigzz, *determ;
        int badvol;

        HIP( hipMalloc(&sigxx,numElem*sizeof(Real_t)) );
        HIP( hipMalloc(&sigyy,numElem*sizeof(Real_t)) );
        HIP( hipMalloc(&sigzz,numElem*sizeof(Real_t)) );
        HIP( hipMalloc(&determ,numElem*sizeof(Real_t)) );

        /* Sum contributions to total stress tensor */
        InitStressTermsForElems(numElem, sigxx, sigyy, sigzz, 0);

        // call elemlib stress integration loop to produce nodal forces from
        // material stresses.
        IntegrateStressForElems( numElem, sigxx, sigyy, sigzz, determ, badvol, 0) ;

        HIP( hipFree(sigxx) );
        HIP( hipFree(sigyy) );
        HIP( hipFree(sigzz) );

        // check for negative element volume
        if (badvol) exit(VolumeError) ;

        CalcHourglassControlForElems(determ, hgcoef, 0) ;

        HIP( hipFree(determ) );
  }
}


static inline
void CalcVolumeForceForElems_cpu()
{
  Index_t numElem = mesh.numElem() ;
  if (numElem != 0) {
        Real_t  hgcoef = mesh.hgcoef() ;
        Real_t *sigxx  = Allocate<Real_t>(numElem) ;
        Real_t *sigyy  = Allocate<Real_t>(numElem) ;
        Real_t *sigzz  = Allocate<Real_t>(numElem) ;
        Real_t *determ = Allocate<Real_t>(numElem) ;
        int badvol;

        /* Sum contributions to total stress tensor */
        InitStressTermsForElems(numElem, sigxx, sigyy, sigzz, 1);

        // call elemlib stress integration loop to produce nodal forces from
        // material stresses.
        IntegrateStressForElems( numElem, sigxx, sigyy, sigzz, determ, badvol, 1) ;

        Release(&sigzz) ;
        Release(&sigyy) ;
        Release(&sigxx) ;

        // check for negative element volume
        if (badvol) exit(VolumeError);
#if 0
        for ( Index_t k=0 ; k<numElem ; ++k ) {
          if (determ[k] <= Real_t(0.0)) {
                exit(VolumeError) ;
          }
        }
#endif

        CalcHourglassControlForElems(determ, hgcoef, 1) ;

        Release(&determ) ;
  }
}

static inline void CalcForceForNodes_gpu()
{
  /* Calcforce calls partial, force, hourq */
  CalcVolumeForceForElems_gpu() ;

  /* Calculate Nodal Forces at domain boundaries */
  /* problem->commSBN->Transfer(CommSBN::forces); */


}

static inline void CalcForceForNodes_cpu()
{
  Index_t numNode = mesh.numNode() ;
  for (Index_t i=0; i<numNode; ++i) {
        mesh.fx(i) = Real_t(0.0) ;
        mesh.fy(i) = Real_t(0.0) ;
        mesh.fz(i) = Real_t(0.0) ;
  }

  /* Calcforce calls partial, force, hourq */
  CalcVolumeForceForElems_cpu() ;

  /* Calculate Nodal Forces at domain boundaries */
  /* problem->commSBN->Transfer(CommSBN::forces); */

}

static inline void CalcForceForNodes(int useCPU)
{
  if (useCPU) {
        CalcForceForNodes_cpu();
  }
  else {
        CalcForceForNodes_gpu();
  }
}

__global__
void CalcAccelerationForNodes_kernel(int numNode,
                                     Real_t *xdd, Real_t *ydd, Real_t *zdd,
                                     Real_t *fx, Real_t *fy, Real_t *fz,
                                     Real_t *nodalMass)
{
  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
  if (i<numNode) {
        xdd[i]=fx[i]/nodalMass[i];
        ydd[i]=fy[i]/nodalMass[i];
        zdd[i]=fz[i]/nodalMass[i];
  }
}

static inline
void CalcAccelerationForNodes_gpu()
{
  dim3 dimBlock = dim3(BLOCKSIZE,1,1);
  dim3 dimGrid = dim3(PAD_DIV(mesh.numNode(),dimBlock.x),1,1);
  hipLaunchKernelGGL((CalcAccelerationForNodes_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, mesh.numNode(),
         meshGPU.m_xdd,meshGPU.m_ydd,meshGPU.m_zdd,
         meshGPU.m_fx,meshGPU.m_fy,meshGPU.m_fz,
         meshGPU.m_nodalMass);
  HIP_DEBUGSYNC;
}


static inline
void CalcAccelerationForNodes_cpu()
{
  Index_t numNode = mesh.numNode() ;
  for (Index_t i = 0; i < numNode; ++i) {
        mesh.xdd(i) = mesh.fx(i) / mesh.nodalMass(i);
        mesh.ydd(i) = mesh.fy(i) / mesh.nodalMass(i);
        mesh.zdd(i) = mesh.fz(i) / mesh.nodalMass(i);
  }
}

static inline
void CalcAccelerationForNodes(int useCPU)
{
  if (useCPU) {
        FC(fx); FC(fy); FC(fz); FC(nodalMass);
        CalcAccelerationForNodes_cpu();
        SG(xdd); SG(ydd); SG(zdd);
  }
  else {
        FG(fx); FG(fy); FG(fz); FG(nodalMass);
        CalcAccelerationForNodes_gpu();
        SC(xdd); SC(ydd); SC(zdd);
  }
}

__global__
void ApplyAccelerationBoundaryConditionsForNodes_kernel(
                                                        int numNodeBC, Real_t *xdd, Real_t *ydd, Real_t *zdd,
                                                        Index_t *symmX, Index_t *symmY, Index_t *symmZ)
{
  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
  if (i<numNodeBC) {
        xdd[symmX[i]] = Real_t(0.0) ;
        ydd[symmY[i]] = Real_t(0.0) ;
        zdd[symmZ[i]] = Real_t(0.0) ;
  }
}

static inline
void ApplyAccelerationBoundaryConditionsForNodes_gpu()
{
  Index_t numNodeBC = (mesh.sizeX()+1)*(mesh.sizeX()+1) ;
  dim3 dimBlock(BLOCKSIZE,1,1);
  dim3 dimGrid(PAD_DIV(numNodeBC,dimBlock.x),1,1);
  hipLaunchKernelGGL((ApplyAccelerationBoundaryConditionsForNodes_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, numNodeBC,
         meshGPU.m_xdd,meshGPU.m_ydd,meshGPU.m_zdd,
         meshGPU.m_symmX,meshGPU.m_symmY,meshGPU.m_symmZ);
  HIP_DEBUGSYNC;
}

static inline
void ApplyAccelerationBoundaryConditionsForNodes_cpu()
{
  Index_t numNodeBC = (mesh.sizeX()+1)*(mesh.sizeX()+1) ;
  for(Index_t i=0 ; i<numNodeBC ; ++i)
        mesh.xdd(mesh.symmX(i)) = Real_t(0.0) ;

  for(Index_t i=0 ; i<numNodeBC ; ++i)
        mesh.ydd(mesh.symmY(i)) = Real_t(0.0) ;

  for(Index_t i=0 ; i<numNodeBC ; ++i)
        mesh.zdd(mesh.symmZ(i)) = Real_t(0.0) ;
}

static inline
void ApplyAccelerationBoundaryConditionsForNodes(int useCPU)
{
  if (useCPU) {
        FC(xdd); FC(ydd); FC(zdd); FC(symmX); FC(symmY); FC(symmZ);
        ApplyAccelerationBoundaryConditionsForNodes_cpu();
        SG(xdd); SG(ydd); SG(zdd);
  }
  else {
        FG(xdd); FG(ydd); FG(zdd); FG(symmX); FG(symmY); FG(symmZ);
        ApplyAccelerationBoundaryConditionsForNodes_gpu();
        SC(xdd); SC(ydd); SC(zdd);
  }
}


__global__
void CalcVelocityForNodes_kernel(int numNode, const Real_t dt, const Real_t u_cut,
                                 Real_t *xd, Real_t *yd, Real_t *zd,
                                 Real_t *xdd, Real_t *ydd, Real_t *zdd)
{
  int i = hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
  if (i<numNode) {
        Real_t xdtmp, ydtmp, zdtmp ;

        xdtmp = xd[i] + xdd[i] * dt ;
        if( FABS(xdtmp) < u_cut ) xdtmp = 0.0;//Real_t(0.0);
        xd[i] = xdtmp ;

        ydtmp = yd[i] + ydd[i] * dt ;
        if( FABS(ydtmp) < u_cut ) ydtmp = Real_t(0.0);
        yd[i] = ydtmp ;

        zdtmp = zd[i] + zdd[i] * dt ;
        if( FABS(zdtmp) < u_cut ) zdtmp = Real_t(0.0);
        zd[i] = zdtmp ;
  }
}

static inline
void CalcVelocityForNodes_gpu(const Real_t dt, const Real_t u_cut)
{
  dim3 dimBlock(BLOCKSIZE,1,1);
  dim3 dimGrid(PAD_DIV(mesh.numNode(),dimBlock.x),1,1);
  hipLaunchKernelGGL((CalcVelocityForNodes_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, mesh.numNode(),dt,u_cut,
         meshGPU.m_xd,meshGPU.m_yd,meshGPU.m_zd,
         meshGPU.m_xdd,meshGPU.m_ydd,meshGPU.m_zdd);
  HIP_DEBUGSYNC;
}

static inline
void CalcVelocityForNodes_cpu(const Real_t dt, const Real_t u_cut)
{
  Index_t numNode = mesh.numNode() ;

  for ( Index_t i = 0 ; i < numNode ; ++i )
  {
          Real_t xdtmp, ydtmp, zdtmp ;

          xdtmp = mesh.xd(i) + mesh.xdd(i) * dt ;
          if( FABS(xdtmp) < u_cut ) xdtmp = Real_t(0.0);
          mesh.xd(i) = xdtmp ;

          ydtmp = mesh.yd(i) + mesh.ydd(i) * dt ;
          if( FABS(ydtmp) < u_cut ) ydtmp = Real_t(0.0);
          mesh.yd(i) = ydtmp ;

          zdtmp = mesh.zd(i) + mesh.zdd(i) * dt ;
          if( FABS(zdtmp) < u_cut ) zdtmp = Real_t(0.0);
          mesh.zd(i) = zdtmp ;
  }
}

static inline
void CalcVelocityForNodes(const Real_t dt, const Real_t u_cut, int useCPU)
{
  if (useCPU) {
        FC(xd); FC(yd); FC(zd); FC(xdd); FC(ydd); FC(zdd);
        CalcVelocityForNodes_cpu(dt,u_cut);
        SG(xd); SG(yd); SG(zd);
  }
  else {
        FG(xd); FG(yd); FG(zd); FG(xdd); FG(ydd); FG(zdd);
        CalcVelocityForNodes_gpu(dt,u_cut);
        SC(xd); SC(yd); SC(zd);
  }
}

__global__
void CalcPositionForNodes_kernel(int numNode, Real_t dt,
                                 Real_t *x, Real_t *y, Real_t *z,
                                 Real_t *xd, Real_t *yd, Real_t *zd)
{
  int i = hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
  if (i<numNode) {
        x[i] += xd[i] * dt;
        y[i] += yd[i] * dt;
        z[i] += zd[i] * dt;
  }
}

static inline
void CalcPositionForNodes_gpu(const Real_t dt)
{
  dim3 dimBlock(BLOCKSIZE,1,1);
  dim3 dimGrid(PAD_DIV(mesh.numNode(),dimBlock.x),1,1);
  hipLaunchKernelGGL((CalcPositionForNodes_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, mesh.numNode(),dt,meshGPU.m_x,meshGPU.m_y,meshGPU.m_z,meshGPU.m_xd,meshGPU.m_yd,meshGPU.m_zd);
  HIP_DEBUGSYNC;
}

static inline
void CalcPositionForNodes_cpu(const Real_t dt)
{
  Index_t numNode = mesh.numNode() ;

  for ( Index_t i = 0 ; i < numNode ; ++i )
  {
          mesh.x(i) += mesh.xd(i) * dt ;
          mesh.y(i) += mesh.yd(i) * dt ;
          mesh.z(i) += mesh.zd(i) * dt ;
  }
}

static inline
void CalcPositionForNodes(const Real_t dt,int useCPU)
{
  if (useCPU) {
        FC(x); FC(y); FC(z); FC(xd); FC(yd); FC(zd);
        CalcPositionForNodes_cpu(dt);
        SG(x); SG(y); SG(z);
  }
  else {
        FG(x); FG(y); FG(z); FG(xd); FG(yd); FG(zd);
        CalcPositionForNodes_gpu(dt);
        SC(x); SC(y); SC(z);
  }
}

static inline
void LagrangeNodal(int useCPU)
{
  const Real_t delt = mesh.deltatime() ;
  Real_t u_cut = mesh.u_cut() ;

  /* time of boundary condition evaluation is beginning of step for force and
   * acceleration boundary conditions. */
  CalcForceForNodes(/*0*/useCPU);

  CalcAccelerationForNodes(useCPU);

  ApplyAccelerationBoundaryConditionsForNodes(useCPU);

  CalcVelocityForNodes( delt, u_cut, useCPU ) ;

  CalcPositionForNodes( delt, useCPU );

  return;
}

__host__ __device__
static inline
Real_t CalcElemVolume( const Real_t x0, const Real_t x1,
                       const Real_t x2, const Real_t x3,
                       const Real_t x4, const Real_t x5,
                       const Real_t x6, const Real_t x7,
                       const Real_t y0, const Real_t y1,
                       const Real_t y2, const Real_t y3,
                       const Real_t y4, const Real_t y5,
                       const Real_t y6, const Real_t y7,
                       const Real_t z0, const Real_t z1,
                       const Real_t z2, const Real_t z3,
                       const Real_t z4, const Real_t z5,
                       const Real_t z6, const Real_t z7 )
{
  Real_t twelveth = Real_t(1.0)/Real_t(12.0);

  Real_t dx61 = x6 - x1;
  Real_t dy61 = y6 - y1;
  Real_t dz61 = z6 - z1;

  Real_t dx70 = x7 - x0;
  Real_t dy70 = y7 - y0;
  Real_t dz70 = z7 - z0;

  Real_t dx63 = x6 - x3;
  Real_t dy63 = y6 - y3;
  Real_t dz63 = z6 - z3;

  Real_t dx20 = x2 - x0;
  Real_t dy20 = y2 - y0;
  Real_t dz20 = z2 - z0;

  Real_t dx50 = x5 - x0;
  Real_t dy50 = y5 - y0;
  Real_t dz50 = z5 - z0;

  Real_t dx64 = x6 - x4;
  Real_t dy64 = y6 - y4;
  Real_t dz64 = z6 - z4;

  Real_t dx31 = x3 - x1;
  Real_t dy31 = y3 - y1;
  Real_t dz31 = z3 - z1;

  Real_t dx72 = x7 - x2;
  Real_t dy72 = y7 - y2;
  Real_t dz72 = z7 - z2;

  Real_t dx43 = x4 - x3;
  Real_t dy43 = y4 - y3;
  Real_t dz43 = z4 - z3;

  Real_t dx57 = x5 - x7;
  Real_t dy57 = y5 - y7;
  Real_t dz57 = z5 - z7;

  Real_t dx14 = x1 - x4;
  Real_t dy14 = y1 - y4;
  Real_t dz14 = z1 - z4;

  Real_t dx25 = x2 - x5;
  Real_t dy25 = y2 - y5;
  Real_t dz25 = z2 - z5;

#define TRIPLE_PRODUCT(x1, y1, z1, x2, y2, z2, x3, y3, z3) \
  ((x1)*((y2)*(z3) - (z2)*(y3)) + (x2)*((z1)*(y3) - (y1)*(z3)) + (x3)*((y1)*(z2) - (z1)*(y2)))

    Real_t volume =
          TRIPLE_PRODUCT(dx31 + dx72, dx63, dx20,
                         dy31 + dy72, dy63, dy20,
                         dz31 + dz72, dz63, dz20) +
          TRIPLE_PRODUCT(dx43 + dx57, dx64, dx70,
                         dy43 + dy57, dy64, dy70,
                         dz43 + dz57, dz64, dz70) +
          TRIPLE_PRODUCT(dx14 + dx25, dx61, dx50,
                         dy14 + dy25, dy61, dy50,
                         dz14 + dz25, dz61, dz50);

        #undef TRIPLE_PRODUCT

        volume *= twelveth;

        return volume ;
}

__host__ __device__
static inline
Real_t CalcElemVolume( const Real_t x[8], const Real_t y[8], const Real_t z[8] )
{
  return CalcElemVolume( x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
                         y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7],
                         z[0], z[1], z[2], z[3], z[4], z[5], z[6], z[7]);
}

__host__ __device__
static inline
Real_t AreaFace( const Real_t x0, const Real_t x1,
                 const Real_t x2, const Real_t x3,
                 const Real_t y0, const Real_t y1,
                 const Real_t y2, const Real_t y3,
                 const Real_t z0, const Real_t z1,
                 const Real_t z2, const Real_t z3)
{
  Real_t fx = (x2 - x0) - (x3 - x1);
  Real_t fy = (y2 - y0) - (y3 - y1);
  Real_t fz = (z2 - z0) - (z3 - z1);
  Real_t gx = (x2 - x0) + (x3 - x1);
  Real_t gy = (y2 - y0) + (y3 - y1);
  Real_t gz = (z2 - z0) + (z3 - z1);
     Real_t area =
           (fx * fx + fy * fy + fz * fz) *
           (gx * gx + gy * gy + gz * gz) -
           (fx * gx + fy * gy + fz * gz) *
           (fx * gx + fy * gy + fz * gz);
         return area ;
}

__host__ __device__
static inline
Real_t CalcElemCharacteristicLength( const Real_t x[8],
                                     const Real_t y[8],
                                     const Real_t z[8],
                                     const Real_t volume)
{
  Real_t a, charLength = Real_t(0.0);

  a = AreaFace(x[0],x[1],x[2],x[3],
               y[0],y[1],y[2],y[3],
               z[0],z[1],z[2],z[3]) ;
  charLength = FMAX(a,charLength) ;

  a = AreaFace(x[4],x[5],x[6],x[7],
               y[4],y[5],y[6],y[7],
               z[4],z[5],z[6],z[7]) ;
  charLength = FMAX(a,charLength) ;

  a = AreaFace(x[0],x[1],x[5],x[4],
               y[0],y[1],y[5],y[4],
               z[0],z[1],z[5],z[4]) ;
  charLength = FMAX(a,charLength) ;

  a = AreaFace(x[1],x[2],x[6],x[5],
               y[1],y[2],y[6],y[5],
               z[1],z[2],z[6],z[5]) ;
  charLength = FMAX(a,charLength) ;

  a = AreaFace(x[2],x[3],x[7],x[6],
               y[2],y[3],y[7],y[6],
               z[2],z[3],z[7],z[6]) ;
  charLength = FMAX(a,charLength) ;

  a = AreaFace(x[3],x[0],x[4],x[7],
               y[3],y[0],y[4],y[7],
               z[3],z[0],z[4],z[7]) ;
  charLength = FMAX(a,charLength) ;

  charLength = Real_t(4.0) * volume / SQRT(charLength);

  return charLength;
}

__host__ __device__
static inline
void CalcElemVelocityGradient( const Real_t* const xvel,
                               const Real_t* const yvel,
                               const Real_t* const zvel,
                               const Real_t b[][8],
                               const Real_t detJ,
                               Real_t* const d )
{
  const Real_t inv_detJ = Real_t(1.0) / detJ ;
  Real_t dyddx, dxddy, dzddx, dxddz, dzddy, dyddz;
  const Real_t* const pfx = b[0];
  const Real_t* const pfy = b[1];
  const Real_t* const pfz = b[2];

  d[0] = inv_detJ * ( pfx[0] * (xvel[0]-xvel[6])
                                          + pfx[1] * (xvel[1]-xvel[7])
                                          + pfx[2] * (xvel[2]-xvel[4])
                                          + pfx[3] * (xvel[3]-xvel[5]) );

  d[1] = inv_detJ * ( pfy[0] * (yvel[0]-yvel[6])
                                          + pfy[1] * (yvel[1]-yvel[7])
                                          + pfy[2] * (yvel[2]-yvel[4])
                                          + pfy[3] * (yvel[3]-yvel[5]) );

  d[2] = inv_detJ * ( pfz[0] * (zvel[0]-zvel[6])
                                          + pfz[1] * (zvel[1]-zvel[7])
                                          + pfz[2] * (zvel[2]-zvel[4])
                                          + pfz[3] * (zvel[3]-zvel[5]) );

  dyddx  = inv_detJ * ( pfx[0] * (yvel[0]-yvel[6])
                                                + pfx[1] * (yvel[1]-yvel[7])
                                                + pfx[2] * (yvel[2]-yvel[4])
                                                + pfx[3] * (yvel[3]-yvel[5]) );

  dxddy  = inv_detJ * ( pfy[0] * (xvel[0]-xvel[6])
                                                + pfy[1] * (xvel[1]-xvel[7])
                                                + pfy[2] * (xvel[2]-xvel[4])
                                                + pfy[3] * (xvel[3]-xvel[5]) );

  dzddx  = inv_detJ * ( pfx[0] * (zvel[0]-zvel[6])
                                                + pfx[1] * (zvel[1]-zvel[7])
                                                + pfx[2] * (zvel[2]-zvel[4])
                                                + pfx[3] * (zvel[3]-zvel[5]) );

  dxddz  = inv_detJ * ( pfz[0] * (xvel[0]-xvel[6])
                                                + pfz[1] * (xvel[1]-xvel[7])
                                                + pfz[2] * (xvel[2]-xvel[4])
                                                + pfz[3] * (xvel[3]-xvel[5]) );

  dzddy  = inv_detJ * ( pfy[0] * (zvel[0]-zvel[6])
                                                + pfy[1] * (zvel[1]-zvel[7])
                                                + pfy[2] * (zvel[2]-zvel[4])
                                                + pfy[3] * (zvel[3]-zvel[5]) );

  dyddz  = inv_detJ * ( pfz[0] * (yvel[0]-yvel[6])
                                                + pfz[1] * (yvel[1]-yvel[7])
                                                + pfz[2] * (yvel[2]-yvel[4])
                                                + pfz[3] * (yvel[3]-yvel[5]) );
  d[5]  = Real_t( .5) * ( dxddy + dyddx );
  d[4]  = Real_t( .5) * ( dxddz + dzddx );
  d[3]  = Real_t( .5) * ( dzddy + dyddz );
}

__global__
void CalcKinematicsForElems_kernel(
                                   Index_t numElem, Real_t dt,
                                   Index_t *nodelist,Real_t *volo,Real_t *v,
                                   Real_t *x,Real_t *y,Real_t *z,Real_t *xd,Real_t *yd,Real_t *zd,
                                   Real_t *vnew,Real_t *delv,Real_t *arealg,Real_t *dxx,Real_t *dyy,Real_t *dzz
                                   )
{
  Real_t B[3][8] ; /** shape function derivatives */
  Real_t D[6] ;
  Real_t x_local[8] ;
  Real_t y_local[8] ;
  Real_t z_local[8] ;
  Real_t xd_local[8] ;
  Real_t yd_local[8] ;
  Real_t zd_local[8] ;
  Real_t detJ = Real_t(0.0) ;

  int k=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
  if (k<numElem) {

        Real_t volume ;
        Real_t relativeVolume ;

        // get nodal coordinates from global arrays and copy into local arrays.
        for( Index_t lnode=0 ; lnode<8 ; ++lnode )
          {
                Index_t gnode = nodelist[k+lnode*numElem];
                x_local[lnode] = x[gnode];
                y_local[lnode] = y[gnode];
                z_local[lnode] = z[gnode];
          }

        // volume calculations
        volume = CalcElemVolume(x_local, y_local, z_local );
        relativeVolume = volume / volo[k] ;
        vnew[k] = relativeVolume ;
        delv[k] = relativeVolume - v[k] ;

        // set characteristic length
        arealg[k] = CalcElemCharacteristicLength(x_local,y_local,z_local,volume);

        // get nodal velocities from global array and copy into local arrays.
        for( Index_t lnode=0 ; lnode<8 ; ++lnode )
          {
                Index_t gnode = nodelist[k+lnode*numElem];
                xd_local[lnode] = xd[gnode];
                yd_local[lnode] = yd[gnode];
                zd_local[lnode] = zd[gnode];
          }

        Real_t dt2 = Real_t(0.5) * dt;
        for ( Index_t j=0 ; j<8 ; ++j )
          {
                x_local[j] -= dt2 * xd_local[j];
                y_local[j] -= dt2 * yd_local[j];
                z_local[j] -= dt2 * zd_local[j];
          }

        CalcElemShapeFunctionDerivatives(x_local,y_local,z_local,B,&detJ );

        CalcElemVelocityGradient(xd_local,yd_local,zd_local,B,detJ,D);

        // put velocity gradient quantities into their global arrays.
        dxx[k] = D[0];
        dyy[k] = D[1];
        dzz[k] = D[2];
  }
}


static inline
void CalcKinematicsForElems_gpu( Index_t numElem, Real_t dt )
{
  dim3 dimBlock=dim3(BLOCKSIZE,1,1);
  dim3 dimGrid=dim3(PAD_DIV(numElem,dimBlock.x),1,1);
  hipLaunchKernelGGL((CalcKinematicsForElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, numElem,dt,meshGPU.m_nodelist,meshGPU.m_volo,meshGPU.m_v,
         meshGPU.m_x,meshGPU.m_y,meshGPU.m_z,meshGPU.m_xd,meshGPU.m_yd,meshGPU.m_zd,
         meshGPU.m_vnew,meshGPU.m_delv,meshGPU.m_arealg,meshGPU.m_dxx,meshGPU.m_dyy,meshGPU.m_dzz);
  HIP_DEBUGSYNC;
}


static inline
void CalcKinematicsForElems_cpu( Index_t numElem, Real_t dt )
{
  Real_t B[3][8] ; /** shape function derivatives */
  Real_t D[6] ;
  Real_t x_local[8] ;
  Real_t y_local[8] ;
  Real_t z_local[8] ;
  Real_t xd_local[8] ;
  Real_t yd_local[8] ;
  Real_t zd_local[8] ;
  Real_t detJ = Real_t(0.0) ;

  // loop over all elements
  for( Index_t k=0 ; k<numElem ; ++k )
        {
          Real_t volume ;
          Real_t relativeVolume ;

          // get nodal coordinates from global arrays and copy into local arrays.
          for( Index_t lnode=0 ; lnode<8 ; ++lnode )
                {
                  Index_t gnode = mesh.nodelist(k,lnode);
                  x_local[lnode] = mesh.x(gnode);
                  y_local[lnode] = mesh.y(gnode);
                  z_local[lnode] = mesh.z(gnode);
                }

          // volume calculations
          volume = CalcElemVolume(x_local, y_local, z_local );
          relativeVolume = volume / mesh.volo(k) ;
          mesh.vnew(k) = relativeVolume ;
          mesh.delv(k) = relativeVolume - mesh.v(k) ;

          // set characteristic length
          mesh.arealg(k) = CalcElemCharacteristicLength(x_local,
                                                        y_local,
                                                        z_local,
                                                        volume);

          // get nodal velocities from global array and copy into local arrays.
          for( Index_t lnode=0 ; lnode<8 ; ++lnode )
                {
                  Index_t gnode = mesh.nodelist(k,lnode);
                  xd_local[lnode] = mesh.xd(gnode);
                  yd_local[lnode] = mesh.yd(gnode);
                  zd_local[lnode] = mesh.zd(gnode);
                }

          Real_t dt2 = Real_t(0.5) * dt;
          for ( Index_t j=0 ; j<8 ; ++j )
                {
                  x_local[j] -= dt2 * xd_local[j];
                  y_local[j] -= dt2 * yd_local[j];
                  z_local[j] -= dt2 * zd_local[j];
                }

          CalcElemShapeFunctionDerivatives( x_local,
                                            y_local,
                                            z_local,
                                            B, &detJ );

          CalcElemVelocityGradient( xd_local,
                                    yd_local,
                                    zd_local,
                                    B, detJ, D );

          // put velocity gradient quantities into their global arrays.
          mesh.dxx(k) = D[0];
          mesh.dyy(k) = D[1];
          mesh.dzz(k) = D[2];
        }
}


static inline
void CalcKinematicsForElems( Index_t numElem, Real_t dt, int useCPU )
{
  if (useCPU) {
        FC(nodelist); FC(volo); FC(v); FC(x); FC(y); FC(z); FC(xd); FC(yd); FC(zd);
        CalcKinematicsForElems_cpu(numElem,dt);
        SG(vnew); SG(delv); SG(arealg); SG(dxx); SG(dyy); SG(dzz);
  }
  else {
        FG(nodelist); FG(volo); FG(v); FG(x); FG(y); FG(z); FG(xd); FG(yd); FG(zd);
        CalcKinematicsForElems_gpu(numElem,dt);
        SC(vnew); SC(delv); SC(arealg); SC(dxx); SC(dyy); SC(dzz);
  }
}


__global__
void CalcLagrangeElementsPart2_kernel(
                                      Index_t numElem,
                                      Real_t *dxx,Real_t *dyy, Real_t *dzz,
                                          Real_t *vdov
                                      )
{
  int k=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
  if (k<numElem) {

        // calc strain rate and apply as constraint (only done in FB element)
        Real_t vdovNew = dxx[k] + dyy[k] + dzz[k] ;
        Real_t vdovthird = vdovNew/Real_t(3.0) ;

        // make the rate of deformation tensor deviatoric
        vdov[k] = vdovNew ;
        dxx[k] -= vdovthird ;
        dyy[k] -= vdovthird ;
        dzz[k] -= vdovthird ;

        // See if any volumes are negative, and take appropriate action.
        //if (mesh.vnew(k) <= Real_t(0.0))
        //{
        //    exit(VolumeError) ;
        //}
  }
}


static inline
void CalcLagrangeElementsPart2_gpu()
{
  Index_t numElem = mesh.numElem();

  dim3 dimBlock=dim3(BLOCKSIZE,1,1);
  dim3 dimGrid=dim3(PAD_DIV(numElem,dimBlock.x),1,1);
  hipLaunchKernelGGL((CalcLagrangeElementsPart2_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, numElem,
         meshGPU.m_dxx,meshGPU.m_dyy,meshGPU.m_dzz,
         meshGPU.m_vdov);
  HIP_DEBUGSYNC;
}


static inline
void CalcLagrangeElementsPart2_cpu()
{
  Index_t numElem = mesh.numElem() ;

  // element loop to do some stuff not included in the elemlib function.
  for ( Index_t k=0 ; k<numElem ; ++k )
  {
          // calc strain rate and apply as constraint (only done in FB element)
          Real_t vdov = mesh.dxx(k) + mesh.dyy(k) + mesh.dzz(k) ;
          Real_t vdovthird = vdov/Real_t(3.0) ;

          // make the rate of deformation tensor deviatoric
          mesh.vdov(k) = vdov ;
          mesh.dxx(k) -= vdovthird ;
          mesh.dyy(k) -= vdovthird ;
          mesh.dzz(k) -= vdovthird ;

          // See if any volumes are negative, and take appropriate action.
          if (mesh.vnew(k) <= Real_t(0.0))
          {
                  exit(VolumeError) ;
          }
  }
}


static inline
void CalcLagrangeElementsPart2(int useCPU)
{
  if (useCPU) {
        FC(dxx); FC(dyy); FC(dzz);
        CalcLagrangeElementsPart2_cpu();
        SG(vdov); SG(dxx); SG(dyy); SG(dzz);
  }
  else {
        FG(dxx); FG(dyy); FG(dzz);
        CalcLagrangeElementsPart2_gpu();
        SC(vdov); SC(dxx); SC(dyy); SC(dzz);
  }
}

static inline
void CalcLagrangeElements(Real_t deltatime, int useCPU)
{
  Index_t numElem = mesh.numElem() ;
  if (numElem > 0) {
        CalcKinematicsForElems(numElem, deltatime, useCPU);
        CalcLagrangeElementsPart2(useCPU);
  }
}


__global__
void CalcMonotonicQGradientsForElems_kernel(
                                            Index_t numElem,
                                            Index_t *nodelist,
                                            Real_t *x,Real_t *y,Real_t *z,Real_t *xd,Real_t *yd,Real_t *zd,
                                            Real_t *volo,Real_t *vnew,
                                            Real_t *delx_zeta,Real_t *delv_zeta,
                                            Real_t *delx_xi,Real_t *delv_xi,
                                            Real_t *delx_eta,Real_t *delv_eta
                                            )
{
#define SUM4(a,b,c,d) (a + b + c + d)
  const Real_t ptiny = Real_t(1.e-36) ;

  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
  if (i<numElem) {
        Real_t ax,ay,az ;
        Real_t dxv,dyv,dzv ;

        Index_t n0 = nodelist[i+0*numElem] ;
        Index_t n1 = nodelist[i+1*numElem] ;
        Index_t n2 = nodelist[i+2*numElem] ;
        Index_t n3 = nodelist[i+3*numElem] ;
        Index_t n4 = nodelist[i+4*numElem] ;
        Index_t n5 = nodelist[i+5*numElem] ;
        Index_t n6 = nodelist[i+6*numElem] ;
        Index_t n7 = nodelist[i+7*numElem] ;

        Real_t x0 = x[n0] ;
        Real_t x1 = x[n1] ;
        Real_t x2 = x[n2] ;
        Real_t x3 = x[n3] ;
        Real_t x4 = x[n4] ;
        Real_t x5 = x[n5] ;
        Real_t x6 = x[n6] ;
        Real_t x7 = x[n7] ;

        Real_t y0 = y[n0] ;
        Real_t y1 = y[n1] ;
        Real_t y2 = y[n2] ;
        Real_t y3 = y[n3] ;
        Real_t y4 = y[n4] ;
        Real_t y5 = y[n5] ;
        Real_t y6 = y[n6] ;
        Real_t y7 = y[n7] ;

        Real_t z0 = z[n0] ;
        Real_t z1 = z[n1] ;
        Real_t z2 = z[n2] ;
        Real_t z3 = z[n3] ;
        Real_t z4 = z[n4] ;
        Real_t z5 = z[n5] ;
        Real_t z6 = z[n6] ;
        Real_t z7 = z[n7] ;

        Real_t xv0 = xd[n0] ;
        Real_t xv1 = xd[n1] ;
        Real_t xv2 = xd[n2] ;
        Real_t xv3 = xd[n3] ;
        Real_t xv4 = xd[n4] ;
        Real_t xv5 = xd[n5] ;
        Real_t xv6 = xd[n6] ;
        Real_t xv7 = xd[n7] ;

        Real_t yv0 = yd[n0] ;
        Real_t yv1 = yd[n1] ;
        Real_t yv2 = yd[n2] ;
        Real_t yv3 = yd[n3] ;
        Real_t yv4 = yd[n4] ;
        Real_t yv5 = yd[n5] ;
        Real_t yv6 = yd[n6] ;
        Real_t yv7 = yd[n7] ;

        Real_t zv0 = zd[n0] ;
        Real_t zv1 = zd[n1] ;
        Real_t zv2 = zd[n2] ;
        Real_t zv3 = zd[n3] ;
        Real_t zv4 = zd[n4] ;
        Real_t zv5 = zd[n5] ;
        Real_t zv6 = zd[n6] ;
        Real_t zv7 = zd[n7] ;

        Real_t vol = volo[i]*vnew[i] ;
        Real_t norm = Real_t(1.0) / ( vol + ptiny ) ;

        Real_t dxj = Real_t(-0.25)*(SUM4(x0,x1,x5,x4) - SUM4(x3,x2,x6,x7)) ;
        Real_t dyj = Real_t(-0.25)*(SUM4(y0,y1,y5,y4) - SUM4(y3,y2,y6,y7)) ;
        Real_t dzj = Real_t(-0.25)*(SUM4(z0,z1,z5,z4) - SUM4(z3,z2,z6,z7)) ;

        Real_t dxi = Real_t( 0.25)*(SUM4(x1,x2,x6,x5) - SUM4(x0,x3,x7,x4)) ;
        Real_t dyi = Real_t( 0.25)*(SUM4(y1,y2,y6,y5) - SUM4(y0,y3,y7,y4)) ;
        Real_t dzi = Real_t( 0.25)*(SUM4(z1,z2,z6,z5) - SUM4(z0,z3,z7,z4)) ;

        Real_t dxk = Real_t( 0.25)*(SUM4(x4,x5,x6,x7) - SUM4(x0,x1,x2,x3)) ;
        Real_t dyk = Real_t( 0.25)*(SUM4(y4,y5,y6,y7) - SUM4(y0,y1,y2,y3)) ;
        Real_t dzk = Real_t( 0.25)*(SUM4(z4,z5,z6,z7) - SUM4(z0,z1,z2,z3)) ;

        /* find delvk and delxk ( i cross j ) */

        ax = dyi*dzj - dzi*dyj ;
        ay = dzi*dxj - dxi*dzj ;
        az = dxi*dyj - dyi*dxj ;

        delx_zeta[i] = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;

        ax *= norm ;
        ay *= norm ;
        az *= norm ;

        dxv = Real_t(0.25)*(SUM4(xv4,xv5,xv6,xv7) - SUM4(xv0,xv1,xv2,xv3)) ;
        dyv = Real_t(0.25)*(SUM4(yv4,yv5,yv6,yv7) - SUM4(yv0,yv1,yv2,yv3)) ;
        dzv = Real_t(0.25)*(SUM4(zv4,zv5,zv6,zv7) - SUM4(zv0,zv1,zv2,zv3)) ;

        delv_zeta[i] = ax*dxv + ay*dyv + az*dzv ;

        /* find delxi and delvi ( j cross k ) */

        ax = dyj*dzk - dzj*dyk ;
        ay = dzj*dxk - dxj*dzk ;
        az = dxj*dyk - dyj*dxk ;

        delx_xi[i] = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;

        ax *= norm ;
        ay *= norm ;
        az *= norm ;

        dxv = Real_t(0.25)*(SUM4(xv1,xv2,xv6,xv5) - SUM4(xv0,xv3,xv7,xv4)) ;
        dyv = Real_t(0.25)*(SUM4(yv1,yv2,yv6,yv5) - SUM4(yv0,yv3,yv7,yv4)) ;
        dzv = Real_t(0.25)*(SUM4(zv1,zv2,zv6,zv5) - SUM4(zv0,zv3,zv7,zv4)) ;

        delv_xi[i] = ax*dxv + ay*dyv + az*dzv ;

        /* find delxj and delvj ( k cross i ) */

        ax = dyk*dzi - dzk*dyi ;
        ay = dzk*dxi - dxk*dzi ;
        az = dxk*dyi - dyk*dxi ;

        delx_eta[i] = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;

        ax *= norm ;
        ay *= norm ;
        az *= norm ;

        dxv = Real_t(-0.25)*(SUM4(xv0,xv1,xv5,xv4) - SUM4(xv3,xv2,xv6,xv7)) ;
        dyv = Real_t(-0.25)*(SUM4(yv0,yv1,yv5,yv4) - SUM4(yv3,yv2,yv6,yv7)) ;
        dzv = Real_t(-0.25)*(SUM4(zv0,zv1,zv5,zv4) - SUM4(zv3,zv2,zv6,zv7)) ;

        delv_eta[i] = ax*dxv + ay*dyv + az*dzv ;
  }
  #undef SUM4
}


static inline
void CalcMonotonicQGradientsForElems_gpu()
{
  Index_t numElem = mesh.numElem();

  dim3 dimBlock=dim3(BLOCKSIZE,1,1);
  dim3 dimGrid=dim3(PAD_DIV(numElem,dimBlock.x),1,1);
  hipLaunchKernelGGL((CalcMonotonicQGradientsForElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, numElem,
         meshGPU.m_nodelist,
         meshGPU.m_x,meshGPU.m_y,meshGPU.m_z,meshGPU.m_xd,meshGPU.m_yd,meshGPU.m_zd,
         meshGPU.m_volo,meshGPU.m_vnew,
         meshGPU.m_delx_zeta,meshGPU.m_delv_zeta,
         meshGPU.m_delx_xi,meshGPU.m_delv_xi,
         meshGPU.m_delx_eta,meshGPU.m_delv_eta);
  HIP_DEBUGSYNC;
}


static inline
void CalcMonotonicQGradientsForElems_cpu()
{
#define SUM4(a,b,c,d) (a + b + c + d)
  Index_t numElem = mesh.numElem() ;
  const Real_t ptiny = Real_t(1.e-36) ;

  for (Index_t i = 0 ; i < numElem ; ++i ) {
        Real_t ax,ay,az ;
        Real_t dxv,dyv,dzv ;

        Index_t n0 = mesh.nodelist(i,0) ;
        Index_t n1 = mesh.nodelist(i,1) ;
        Index_t n2 = mesh.nodelist(i,2) ;
        Index_t n3 = mesh.nodelist(i,3) ;
        Index_t n4 = mesh.nodelist(i,4) ;
        Index_t n5 = mesh.nodelist(i,5) ;
        Index_t n6 = mesh.nodelist(i,6) ;
        Index_t n7 = mesh.nodelist(i,7) ;

        Real_t x0 = mesh.x(n0) ;
        Real_t x1 = mesh.x(n1) ;
        Real_t x2 = mesh.x(n2) ;
        Real_t x3 = mesh.x(n3) ;
        Real_t x4 = mesh.x(n4) ;
        Real_t x5 = mesh.x(n5) ;
        Real_t x6 = mesh.x(n6) ;
        Real_t x7 = mesh.x(n7) ;

        Real_t y0 = mesh.y(n0) ;
        Real_t y1 = mesh.y(n1) ;
        Real_t y2 = mesh.y(n2) ;
        Real_t y3 = mesh.y(n3) ;
        Real_t y4 = mesh.y(n4) ;
        Real_t y5 = mesh.y(n5) ;
        Real_t y6 = mesh.y(n6) ;
        Real_t y7 = mesh.y(n7) ;

        Real_t z0 = mesh.z(n0) ;
        Real_t z1 = mesh.z(n1) ;
        Real_t z2 = mesh.z(n2) ;
        Real_t z3 = mesh.z(n3) ;
        Real_t z4 = mesh.z(n4) ;
        Real_t z5 = mesh.z(n5) ;
        Real_t z6 = mesh.z(n6) ;
        Real_t z7 = mesh.z(n7) ;

        Real_t xv0 = mesh.xd(n0) ;
        Real_t xv1 = mesh.xd(n1) ;
        Real_t xv2 = mesh.xd(n2) ;
        Real_t xv3 = mesh.xd(n3) ;
        Real_t xv4 = mesh.xd(n4) ;
        Real_t xv5 = mesh.xd(n5) ;
        Real_t xv6 = mesh.xd(n6) ;
        Real_t xv7 = mesh.xd(n7) ;

        Real_t yv0 = mesh.yd(n0) ;
        Real_t yv1 = mesh.yd(n1) ;
        Real_t yv2 = mesh.yd(n2) ;
        Real_t yv3 = mesh.yd(n3) ;
        Real_t yv4 = mesh.yd(n4) ;
        Real_t yv5 = mesh.yd(n5) ;
        Real_t yv6 = mesh.yd(n6) ;
        Real_t yv7 = mesh.yd(n7) ;

        Real_t zv0 = mesh.zd(n0) ;
        Real_t zv1 = mesh.zd(n1) ;
        Real_t zv2 = mesh.zd(n2) ;
        Real_t zv3 = mesh.zd(n3) ;
        Real_t zv4 = mesh.zd(n4) ;
        Real_t zv5 = mesh.zd(n5) ;
        Real_t zv6 = mesh.zd(n6) ;
        Real_t zv7 = mesh.zd(n7) ;

        Real_t vol = mesh.volo(i)*mesh.vnew(i) ;
        Real_t norm = Real_t(1.0) / ( vol + ptiny ) ;

        Real_t dxj = Real_t(-0.25)*(SUM4(x0,x1,x5,x4) - SUM4(x3,x2,x6,x7)) ;
        Real_t dyj = Real_t(-0.25)*(SUM4(y0,y1,y5,y4) - SUM4(y3,y2,y6,y7)) ;
        Real_t dzj = Real_t(-0.25)*(SUM4(z0,z1,z5,z4) - SUM4(z3,z2,z6,z7)) ;

        Real_t dxi = Real_t( 0.25)*(SUM4(x1,x2,x6,x5) - SUM4(x0,x3,x7,x4)) ;
        Real_t dyi = Real_t( 0.25)*(SUM4(y1,y2,y6,y5) - SUM4(y0,y3,y7,y4)) ;
        Real_t dzi = Real_t( 0.25)*(SUM4(z1,z2,z6,z5) - SUM4(z0,z3,z7,z4)) ;

        Real_t dxk = Real_t( 0.25)*(SUM4(x4,x5,x6,x7) - SUM4(x0,x1,x2,x3)) ;
        Real_t dyk = Real_t( 0.25)*(SUM4(y4,y5,y6,y7) - SUM4(y0,y1,y2,y3)) ;
        Real_t dzk = Real_t( 0.25)*(SUM4(z4,z5,z6,z7) - SUM4(z0,z1,z2,z3)) ;

        /* find delvk and delxk ( i cross j ) */

        ax = dyi*dzj - dzi*dyj ;
        ay = dzi*dxj - dxi*dzj ;
        az = dxi*dyj - dyi*dxj ;

        mesh.delx_zeta(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;

        ax *= norm ;
        ay *= norm ;
        az *= norm ;

        dxv = Real_t(0.25)*(SUM4(xv4,xv5,xv6,xv7) - SUM4(xv0,xv1,xv2,xv3)) ;
        dyv = Real_t(0.25)*(SUM4(yv4,yv5,yv6,yv7) - SUM4(yv0,yv1,yv2,yv3)) ;
        dzv = Real_t(0.25)*(SUM4(zv4,zv5,zv6,zv7) - SUM4(zv0,zv1,zv2,zv3)) ;

        mesh.delv_zeta(i) = ax*dxv + ay*dyv + az*dzv ;

        /* find delxi and delvi ( j cross k ) */

        ax = dyj*dzk - dzj*dyk ;
        ay = dzj*dxk - dxj*dzk ;
        az = dxj*dyk - dyj*dxk ;

        mesh.delx_xi(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;

        ax *= norm ;
        ay *= norm ;
        az *= norm ;

        dxv = Real_t(0.25)*(SUM4(xv1,xv2,xv6,xv5) - SUM4(xv0,xv3,xv7,xv4)) ;
        dyv = Real_t(0.25)*(SUM4(yv1,yv2,yv6,yv5) - SUM4(yv0,yv3,yv7,yv4)) ;
        dzv = Real_t(0.25)*(SUM4(zv1,zv2,zv6,zv5) - SUM4(zv0,zv3,zv7,zv4)) ;

        mesh.delv_xi(i) = ax*dxv + ay*dyv + az*dzv ;

        /* find delxj and delvj ( k cross i ) */

        ax = dyk*dzi - dzk*dyi ;
        ay = dzk*dxi - dxk*dzi ;
        az = dxk*dyi - dyk*dxi ;

        mesh.delx_eta(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;

        ax *= norm ;
        ay *= norm ;
        az *= norm ;

        dxv = Real_t(-0.25)*(SUM4(xv0,xv1,xv5,xv4) - SUM4(xv3,xv2,xv6,xv7)) ;
        dyv = Real_t(-0.25)*(SUM4(yv0,yv1,yv5,yv4) - SUM4(yv3,yv2,yv6,yv7)) ;
        dzv = Real_t(-0.25)*(SUM4(zv0,zv1,zv5,zv4) - SUM4(zv3,zv2,zv6,zv7)) ;

        mesh.delv_eta(i) = ax*dxv + ay*dyv + az*dzv ;
  }
  #undef SUM4
}


static inline
void CalcMonotonicQGradientsForElems(int useCPU)
{
  if (useCPU) {
        FC(nodelist); FC(x); FC(y); FC(z); FC(xd); FC(yd); FC(zd); FC(volo); FC(vnew);
        CalcMonotonicQGradientsForElems_cpu();
        SG(delx_zeta); SG(delv_zeta); SG(delx_xi); SG(delv_xi); SG(delx_eta); SG(delv_eta);
  }
  else {
        FG(nodelist); FG(x); FG(y); FG(z); FG(xd); FG(yd); FG(zd); FG(volo); FG(vnew);
        CalcMonotonicQGradientsForElems_gpu();
        SC(delx_zeta); SC(delv_zeta); SC(delx_xi); SC(delv_xi); SC(delx_eta); SC(delv_eta);
  }
}


__global__
void CalcMonotonicQRegionForElems_kernel(
                                         Real_t qlc_monoq,
                                         Real_t qqc_monoq,
                                         Real_t monoq_limiter_mult,
                                         Real_t monoq_max_slope,
                                         Real_t ptiny,

                                         // the elementset length
                                         Index_t elength,

                                         Index_t *matElemlist,Index_t *elemBC,
                                         Index_t *lxim,Index_t *lxip,
                                         Index_t *letam,Index_t *letap,
                                         Index_t *lzetam,Index_t *lzetap,
                                         Real_t *delv_xi,Real_t *delv_eta,Real_t *delv_zeta,
                                         Real_t *delx_xi,Real_t *delx_eta,Real_t *delx_zeta,
                                         Real_t *vdov,Real_t *elemMass,Real_t *volo,Real_t *vnew,
                                         Real_t *qq,Real_t *ql
                                         )
{
  int ielem=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
  if (ielem<elength) {
        Real_t qlin, qquad ;
        Real_t phixi, phieta, phizeta ;
        Index_t i = matElemlist[ielem];
        Int_t bcMask = elemBC[i] ;
        Real_t delvm, delvp ;

        /*  phixi     */
        Real_t norm = Real_t(1.) / ( delv_xi[i] + ptiny ) ;

        switch (bcMask & XI_M) {
        case 0:         delvm = delv_xi[lxim[i]] ; break ;
        case XI_M_SYMM: delvm = delv_xi[i] ;       break ;
        case XI_M_FREE: delvm = Real_t(0.0) ;      break ;
        default:        /* ERROR */ ;              break ;
        }
        switch (bcMask & XI_P) {
        case 0:         delvp = delv_xi[lxip[i]] ; break ;
        case XI_P_SYMM: delvp = delv_xi[i] ;       break ;
        case XI_P_FREE: delvp = Real_t(0.0) ;      break ;
        default:        /* ERROR */ ;              break ;
        }

        delvm = delvm * norm ;
        delvp = delvp * norm ;

        phixi = Real_t(.5) * ( delvm + delvp ) ;

        delvm *= monoq_limiter_mult ;
        delvp *= monoq_limiter_mult ;

        if ( delvm < phixi ) phixi = delvm ;
        if ( delvp < phixi ) phixi = delvp ;
        if ( phixi < Real_t(0.)) phixi = Real_t(0.) ;
        if ( phixi > monoq_max_slope) phixi = monoq_max_slope;


        /*  phieta     */
        norm = Real_t(1.) / ( delv_eta[i] + ptiny ) ;

        switch (bcMask & ETA_M) {
        case 0:          delvm = delv_eta[letam[i]] ; break ;
        case ETA_M_SYMM: delvm = delv_eta[i] ;        break ;
        case ETA_M_FREE: delvm = Real_t(0.0) ;        break ;
        default:         /* ERROR */ ;                break ;
        }
        switch (bcMask & ETA_P) {
        case 0:          delvp = delv_eta[letap[i]] ; break ;
        case ETA_P_SYMM: delvp = delv_eta[i] ;        break ;
        case ETA_P_FREE: delvp = Real_t(0.0) ;        break ;
        default:         /* ERROR */ ;                break ;
        }

        delvm = delvm * norm ;
        delvp = delvp * norm ;

        phieta = Real_t(.5) * ( delvm + delvp ) ;

        delvm *= monoq_limiter_mult ;
        delvp *= monoq_limiter_mult ;

        if ( delvm  < phieta ) phieta = delvm ;
        if ( delvp  < phieta ) phieta = delvp ;
        if ( phieta < Real_t(0.)) phieta = Real_t(0.) ;
        if ( phieta > monoq_max_slope)  phieta = monoq_max_slope;

        /*  phizeta     */
        norm = Real_t(1.) / ( delv_zeta[i] + ptiny ) ;

        switch (bcMask & ZETA_M) {
        case 0:           delvm = delv_zeta[lzetam[i]] ; break ;
        case ZETA_M_SYMM: delvm = delv_zeta[i] ;         break ;
        case ZETA_M_FREE: delvm = Real_t(0.0) ;          break ;
        default:          /* ERROR */ ;                  break ;
        }
        switch (bcMask & ZETA_P) {
        case 0:           delvp = delv_zeta[lzetap[i]] ; break ;
        case ZETA_P_SYMM: delvp = delv_zeta[i] ;         break ;
        case ZETA_P_FREE: delvp = Real_t(0.0) ;          break ;
        default:          /* ERROR */ ;                  break ;
        }

        delvm = delvm * norm ;
        delvp = delvp * norm ;

        phizeta = Real_t(.5) * ( delvm + delvp ) ;

        delvm *= monoq_limiter_mult ;
        delvp *= monoq_limiter_mult ;

        if ( delvm   < phizeta ) phizeta = delvm ;
        if ( delvp   < phizeta ) phizeta = delvp ;
        if ( phizeta < Real_t(0.)) phizeta = Real_t(0.);
        if ( phizeta > monoq_max_slope  ) phizeta = monoq_max_slope;

        /* Remove length scale */

        if ( vdov[i] > Real_t(0.) )  {
          qlin  = Real_t(0.) ;
          qquad = Real_t(0.) ;
        }
        else {
          Real_t delvxxi   = delv_xi[i]   * delx_xi[i]   ;
          Real_t delvxeta  = delv_eta[i]  * delx_eta[i]  ;
          Real_t delvxzeta = delv_zeta[i] * delx_zeta[i] ;

          if ( delvxxi   > Real_t(0.) ) delvxxi   = Real_t(0.) ;
          if ( delvxeta  > Real_t(0.) ) delvxeta  = Real_t(0.) ;
          if ( delvxzeta > Real_t(0.) ) delvxzeta = Real_t(0.) ;

          Real_t rho = elemMass[i] / (volo[i] * vnew[i]) ;

          qlin = -qlc_monoq * rho *
                (  delvxxi   * (Real_t(1.) - phixi) +
                   delvxeta  * (Real_t(1.) - phieta) +
                   delvxzeta * (Real_t(1.) - phizeta)  ) ;

          qquad = qqc_monoq * rho *
                (  delvxxi*delvxxi     * (Real_t(1.) - phixi*phixi) +
                   delvxeta*delvxeta   * (Real_t(1.) - phieta*phieta) +
                   delvxzeta*delvxzeta * (Real_t(1.) - phizeta*phizeta)  ) ;
        }

        qq[i] = qquad ;
        ql[i] = qlin  ;
  }
}


static inline
void CalcMonotonicQRegionForElems_gpu(// parameters
                                      Real_t qlc_monoq,
                                      Real_t qqc_monoq,
                                      Real_t monoq_limiter_mult,
                                      Real_t monoq_max_slope,
                                      Real_t ptiny,

                                      // the elementset length
                                      Index_t elength )
{
  dim3 dimBlock=dim3(BLOCKSIZE,1,1);
  dim3 dimGrid=dim3(PAD_DIV(elength,dimBlock.x),1,1);
  hipLaunchKernelGGL((CalcMonotonicQRegionForElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, qlc_monoq,qqc_monoq,monoq_limiter_mult,monoq_max_slope,ptiny,elength,
         meshGPU.m_matElemlist,meshGPU.m_elemBC,
         meshGPU.m_lxim,meshGPU.m_lxip,
         meshGPU.m_letam,meshGPU.m_letap,
         meshGPU.m_lzetam,meshGPU.m_lzetap,
         meshGPU.m_delv_xi,meshGPU.m_delv_eta,meshGPU.m_delv_zeta,
         meshGPU.m_delx_xi,meshGPU.m_delx_eta,meshGPU.m_delx_zeta,
         meshGPU.m_vdov,meshGPU.m_elemMass,meshGPU.m_volo,meshGPU.m_vnew,
         meshGPU.m_qq,meshGPU.m_ql);
  HIP_DEBUGSYNC;
}


static inline
void CalcMonotonicQRegionForElems_cpu(// parameters
                                      Real_t qlc_monoq,
                                      Real_t qqc_monoq,
                                      Real_t monoq_limiter_mult,
                                      Real_t monoq_max_slope,
                                      Real_t ptiny,

                                      // the elementset length
                                      Index_t elength )
{
  for ( Index_t ielem = 0 ; ielem < elength; ++ielem ) {
        Real_t qlin, qquad ;
        Real_t phixi, phieta, phizeta ;
        Index_t i = mesh.matElemlist(ielem);
        Int_t bcMask = mesh.elemBC(i) ;
        Real_t delvm, delvp ;

        /*  phixi     */
        Real_t norm = Real_t(1.) / ( mesh.delv_xi(i) + ptiny ) ;

        switch (bcMask & XI_M) {
        case 0:         delvm = mesh.delv_xi(mesh.lxim(i)) ; break ;
        case XI_M_SYMM: delvm = mesh.delv_xi(i) ;            break ;
        case XI_M_FREE: delvm = Real_t(0.0) ;                break ;
        default:        /* ERROR */ ;                        break ;
        }
        switch (bcMask & XI_P) {
        case 0:         delvp = mesh.delv_xi(mesh.lxip(i)) ; break ;
        case XI_P_SYMM: delvp = mesh.delv_xi(i) ;            break ;
        case XI_P_FREE: delvp = Real_t(0.0) ;                break ;
        default:        /* ERROR */ ;                        break ;
        }

        delvm = delvm * norm ;
        delvp = delvp * norm ;

        phixi = Real_t(.5) * ( delvm + delvp ) ;

        delvm *= monoq_limiter_mult ;
        delvp *= monoq_limiter_mult ;

        if ( delvm < phixi ) phixi = delvm ;
        if ( delvp < phixi ) phixi = delvp ;
        if ( phixi < Real_t(0.)) phixi = Real_t(0.) ;
        if ( phixi > monoq_max_slope) phixi = monoq_max_slope;


        /*  phieta     */
        norm = Real_t(1.) / ( mesh.delv_eta(i) + ptiny ) ;

        switch (bcMask & ETA_M) {
        case 0:          delvm = mesh.delv_eta(mesh.letam(i)) ; break ;
        case ETA_M_SYMM: delvm = mesh.delv_eta(i) ;             break ;
        case ETA_M_FREE: delvm = Real_t(0.0) ;                  break ;
        default:         /* ERROR */ ;                          break ;
        }
        switch (bcMask & ETA_P) {
        case 0:          delvp = mesh.delv_eta(mesh.letap(i)) ; break ;
        case ETA_P_SYMM: delvp = mesh.delv_eta(i) ;             break ;
        case ETA_P_FREE: delvp = Real_t(0.0) ;                  break ;
        default:         /* ERROR */ ;                          break ;
        }

        delvm = delvm * norm ;
        delvp = delvp * norm ;

        phieta = Real_t(.5) * ( delvm + delvp ) ;

        delvm *= monoq_limiter_mult ;
        delvp *= monoq_limiter_mult ;

        if ( delvm  < phieta ) phieta = delvm ;
        if ( delvp  < phieta ) phieta = delvp ;
        if ( phieta < Real_t(0.)) phieta = Real_t(0.) ;
        if ( phieta > monoq_max_slope)  phieta = monoq_max_slope;

        /*  phizeta     */
        norm = Real_t(1.) / ( mesh.delv_zeta(i) + ptiny ) ;

        switch (bcMask & ZETA_M) {
        case 0:           delvm = mesh.delv_zeta(mesh.lzetam(i)) ; break ;
        case ZETA_M_SYMM: delvm = mesh.delv_zeta(i) ;              break ;
        case ZETA_M_FREE: delvm = Real_t(0.0) ;                    break ;
        default:          /* ERROR */ ;                            break ;
        }
        switch (bcMask & ZETA_P) {
        case 0:           delvp = mesh.delv_zeta(mesh.lzetap(i)) ; break ;
        case ZETA_P_SYMM: delvp = mesh.delv_zeta(i) ;              break ;
        case ZETA_P_FREE: delvp = Real_t(0.0) ;                    break ;
        default:          /* ERROR */ ;                            break ;
        }

        delvm = delvm * norm ;
        delvp = delvp * norm ;

        phizeta = Real_t(.5) * ( delvm + delvp ) ;

        delvm *= monoq_limiter_mult ;
        delvp *= monoq_limiter_mult ;

        if ( delvm   < phizeta ) phizeta = delvm ;
        if ( delvp   < phizeta ) phizeta = delvp ;
        if ( phizeta < Real_t(0.)) phizeta = Real_t(0.);
        if ( phizeta > monoq_max_slope  ) phizeta = monoq_max_slope;

        /* Remove length scale */

        if ( mesh.vdov(i) > Real_t(0.) )  {
          qlin  = Real_t(0.) ;
          qquad = Real_t(0.) ;
        }
        else {
          Real_t delvxxi   = mesh.delv_xi(i)   * mesh.delx_xi(i)   ;
          Real_t delvxeta  = mesh.delv_eta(i)  * mesh.delx_eta(i)  ;
          Real_t delvxzeta = mesh.delv_zeta(i) * mesh.delx_zeta(i) ;

          if ( delvxxi   > Real_t(0.) ) delvxxi   = Real_t(0.) ;
          if ( delvxeta  > Real_t(0.) ) delvxeta  = Real_t(0.) ;
          if ( delvxzeta > Real_t(0.) ) delvxzeta = Real_t(0.) ;

          Real_t rho = mesh.elemMass(i) / (mesh.volo(i) * mesh.vnew(i)) ;

          qlin = -qlc_monoq * rho *
                (  delvxxi   * (Real_t(1.) - phixi) +
                   delvxeta  * (Real_t(1.) - phieta) +
                   delvxzeta * (Real_t(1.) - phizeta)  ) ;

          qquad = qqc_monoq * rho *
                (  delvxxi*delvxxi     * (Real_t(1.) - phixi*phixi) +
                   delvxeta*delvxeta   * (Real_t(1.) - phieta*phieta) +
                   delvxzeta*delvxzeta * (Real_t(1.) - phizeta*phizeta)  ) ;
        }

        mesh.qq(i) = qquad ;
        mesh.ql(i) = qlin  ;
  }
}

static inline
void CalcMonotonicQRegionForElems(// parameters
                                  Real_t qlc_monoq,
                                  Real_t qqc_monoq,
                                  Real_t monoq_limiter_mult,
                                  Real_t monoq_max_slope,
                                  Real_t ptiny,

                                  // the elementset length
                                  Index_t elength,
                                  int useCPU)
{
  if (useCPU) {
        FC(matElemlist); FC(elemBC); FC(lxim); FC(lxip); FC(letam); FC(letap); FC(lzetam); FC(lzetap);
        FC(delv_xi); FC(delv_eta); FC(delv_zeta); FC(delx_xi); FC(delx_eta); FC(delx_zeta);
        FC(vdov); FC(elemMass); FC(volo); FC(vnew);
        CalcMonotonicQRegionForElems_cpu(qlc_monoq,qqc_monoq,
                                         monoq_limiter_mult,monoq_max_slope,ptiny,
                                         elength);
        SG(qq); SG(ql);
  }
  else {
        FG(matElemlist); FG(elemBC); FG(lxim); FG(lxip); FG(letam); FG(letap); FG(lzetam); FG(lzetap);
        FG(delv_xi); FG(delv_eta); FG(delv_zeta); FG(delx_xi); FG(delx_eta); FG(delx_zeta);
        FG(vdov); FG(elemMass); FG(volo); FG(vnew);
        CalcMonotonicQRegionForElems_gpu(qlc_monoq,qqc_monoq,
                                         monoq_limiter_mult,monoq_max_slope,ptiny,
                                         elength);
        SC(qq); SC(ql);
  }
}

static inline
void CalcMonotonicQForElems(int useCPU)
{
  //
  // initialize parameters
  //
  const Real_t ptiny        = Real_t(1.e-36) ;
  Real_t monoq_max_slope    = mesh.monoq_max_slope() ;
  Real_t monoq_limiter_mult = mesh.monoq_limiter_mult() ;

  //
  // calculate the monotonic q for pure regions
  //
  Index_t elength = mesh.numElem() ;
  if (elength > 0) {
        Real_t qlc_monoq = mesh.qlc_monoq();
        Real_t qqc_monoq = mesh.qqc_monoq();
        CalcMonotonicQRegionForElems(// parameters
                                     qlc_monoq,
                                     qqc_monoq,
                                     monoq_limiter_mult,
                                     monoq_max_slope,
                                     ptiny,

                                     // the elemset length
                                     elength,
                                     useCPU);
  }
}

static inline
void CalcQForElems(int useCPU)
{
  Real_t qstop = mesh.qstop() ;
  Index_t numElem = mesh.numElem() ;

  //
  // MONOTONIC Q option
  //

  /* Calculate velocity gradients */
  CalcMonotonicQGradientsForElems(useCPU) ;

  /* Transfer veloctiy gradients in the first order elements */
  /* problem->commElements->Transfer(CommElements::monoQ) ; */
  CalcMonotonicQForElems(useCPU) ;

  /* Don't allow excessive artificial viscosity */
  /*
   if (numElem != 0) {
      Index_t idx = -1;
      for (Index_t i=0; i<numElem; ++i) {
         if ( mesh.q(i) > qstop ) {
            idx = i ;
            break ;
                        }
                        }

                        if(idx >= 0) {
                        exit(QStopError) ;
                        }
                        }
   */
}


__global__
void CalcPressureForElems_kernel(Real_t* p_new, Real_t* bvc,
                                 Real_t* pbvc, Real_t* e_old,
                                 Real_t* compression, Real_t *vnewc,
                                 Real_t pmin,
                                 Real_t p_cut, Real_t eosvmax,
                                 Index_t length, Real_t c1s)
{
  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
  if (i<length) {

        bvc[i] = c1s * (compression[i] + Real_t(1.));
        pbvc[i] = c1s;

        p_new[i] = bvc[i] * e_old[i] ;

        if    (FABS(p_new[i]) <  p_cut   )
          p_new[i] = Real_t(0.0) ;

        if    ( vnewc[i] >= eosvmax ) /* impossible condition here? */
          p_new[i] = Real_t(0.0) ;

        if    (p_new[i]       <  pmin)
          p_new[i]   = pmin ;
  }
}


static inline
void CalcPressureForElems_gpu(Real_t* p_new, Real_t* bvc,
                              Real_t* pbvc, Real_t* e_old,
                              Real_t* compression, Real_t *vnewc,
                              Real_t pmin,
                              Real_t p_cut, Real_t eosvmax,
                              Index_t length)
{
  Real_t c1s = Real_t(2.0)/Real_t(3.0) ;
  dim3 dimBlock=dim3(BLOCKSIZE,1,1);
  dim3 dimGrid=dim3(PAD_DIV(length,dimBlock.x),1,1);
  hipLaunchKernelGGL((CalcPressureForElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, p_new,bvc,pbvc,e_old,compression,vnewc,pmin,p_cut,eosvmax,length,c1s);
  HIP_DEBUGSYNC;
}


static inline
void CalcPressureForElems_cpu(Real_t* p_new, Real_t* bvc,
                              Real_t* pbvc, Real_t* e_old,
                              Real_t* compression, Real_t *vnewc,
                              Real_t pmin,
                              Real_t p_cut, Real_t eosvmax,
                              Index_t length)
{
  Real_t c1s = Real_t(2.0)/Real_t(3.0) ;
  for (Index_t i = 0; i < length ; ++i) {
        bvc[i] = c1s * (compression[i] + Real_t(1.));
        pbvc[i] = c1s;
  }

  for (Index_t i = 0 ; i < length ; ++i){
        p_new[i] = bvc[i] * e_old[i] ;

        if    (FABS(p_new[i]) <  p_cut   )
          p_new[i] = Real_t(0.0) ;

        if    ( vnewc[i] >= eosvmax ) /* impossible condition here? */
          p_new[i] = Real_t(0.0) ;

        if    (p_new[i]       <  pmin)
          p_new[i]   = pmin ;
  }
}

__global__
void CalcEnergyForElemsPart1_kernel(
                                    Index_t length,Real_t emin,
                                    Real_t *e_old,Real_t *delvc,Real_t *p_old,Real_t *q_old,Real_t *work,
                                    Real_t *e_new)
{
  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
  if (i<length) {
        e_new[i] = e_old[i] - Real_t(0.5) * delvc[i] * (p_old[i] + q_old[i])
          + Real_t(0.5) * work[i];

        if (e_new[i]  < emin ) {
          e_new[i] = emin ;
        }
  }
}


__global__
void CalcEnergyForElemsPart2_kernel(
                                    Index_t length,Real_t rho0,Real_t e_cut,Real_t emin,
                                    Real_t *compHalfStep,Real_t *delvc,Real_t *pbvc,Real_t *bvc,
                                    Real_t *pHalfStep,Real_t *ql,Real_t *qq,Real_t *p_old,Real_t *q_old,Real_t *work,
                                    Real_t *e_new,
                                    Real_t *q_new
                                    )
{
  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
  if (i<length) {

        Real_t vhalf = Real_t(1.) / (Real_t(1.) + compHalfStep[i]) ;

        if ( delvc[i] > Real_t(0.) ) {
          q_new[i] /* = qq[i] = ql[i] */ = Real_t(0.) ;
        }
        else {
          Real_t ssc = ( pbvc[i] * e_new[i]
                                         + vhalf * vhalf * bvc[i] * pHalfStep[i] ) / rho0 ;

          if ( ssc <= Real_t(0.) ) {
                ssc =Real_t(.333333e-36) ;
          } else {
                ssc = SQRT(ssc) ;
          }

          q_new[i] = (ssc*ql[i] + qq[i]) ;
        }

        e_new[i] = e_new[i] + Real_t(0.5) * delvc[i]
          * (  Real_t(3.0)*(p_old[i]     + q_old[i])
                   - Real_t(4.0)*(pHalfStep[i] + q_new[i])) ;

        e_new[i] += Real_t(0.5) * work[i];

        if (FABS(e_new[i]) < e_cut) {
          e_new[i] = Real_t(0.)  ;
        }
        if (     e_new[i]  < emin ) {
          e_new[i] = emin ;
        }
  }
}


__global__
void CalcEnergyForElemsPart3_kernel(
                                    Index_t length,Real_t rho0,Real_t sixth,Real_t e_cut,Real_t emin,
                                    Real_t *pbvc,Real_t *vnewc,Real_t *bvc,Real_t *p_new,Real_t *ql,Real_t *qq,
                                    Real_t *p_old,Real_t *q_old,Real_t *pHalfStep,Real_t *q_new,Real_t *delvc,
                                    Real_t *e_new)
{
  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
  if (i<length) {
        Real_t q_tilde ;

        if (delvc[i] > Real_t(0.)) {
          q_tilde = Real_t(0.) ;
        }
        else {
          Real_t ssc = ( pbvc[i] * e_new[i]
                                         + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ;

          if ( ssc <= Real_t(0.) ) {
                ssc = Real_t(.333333e-36) ;
          } else {
                ssc = SQRT(ssc) ;
          }

          q_tilde = (ssc*ql[i] + qq[i]) ;
        }

        e_new[i] = e_new[i] - (  Real_t(7.0)*(p_old[i]     + q_old[i])
                                                         - Real_t(8.0)*(pHalfStep[i] + q_new[i])
                                                         + (p_new[i] + q_tilde)) * delvc[i]*sixth ;

        if (FABS(e_new[i]) < e_cut) {
          e_new[i] = Real_t(0.)  ;
        }
        if (     e_new[i]  < emin ) {
          e_new[i] = emin ;
        }
  }
}


__global__
void CalcEnergyForElemsPart4_kernel(
                                    Index_t length,Real_t rho0,Real_t q_cut,
                                    Real_t *delvc,Real_t *pbvc,Real_t *e_new,Real_t *vnewc,Real_t *bvc,
                                    Real_t *p_new,Real_t *ql,Real_t *qq,
                                    Real_t *q_new)
{
  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
  if (i<length) {

        if ( delvc[i] <= Real_t(0.) ) {
          Real_t ssc = ( pbvc[i] * e_new[i]
                                         + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ;

          if ( ssc <= Real_t(0.) ) {
                ssc = Real_t(.333333e-36) ;
          } else {
                ssc = SQRT(ssc) ;
          }

          q_new[i] = (ssc*ql[i] + qq[i]) ;

          if (FABS(q_new[i]) < q_cut) q_new[i] = Real_t(0.) ;
        }
  }
}

static inline
void CalcEnergyForElems_gpu(Real_t* p_new, Real_t* e_new, Real_t* q_new,
                            Real_t* bvc, Real_t* pbvc,
                            Real_t* p_old, Real_t* e_old, Real_t* q_old,
                            Real_t* compression, Real_t* compHalfStep,
                            Real_t* vnewc, Real_t* work, Real_t* delvc, Real_t pmin,
                            Real_t p_cut, Real_t  e_cut, Real_t q_cut, Real_t emin,
                            Real_t* qq, Real_t* ql,
                            Real_t rho0,
                            Real_t eosvmax,
                            Index_t length)
{
  const Real_t sixth = Real_t(1.0) / Real_t(6.0) ;
  Real_t *pHalfStep;

  dim3 dimBlock=dim3(BLOCKSIZE,1,1);
  dim3 dimGrid=dim3(PAD_DIV(length,dimBlock.x),1,1);

  HIP( hipMalloc(&pHalfStep,sizeof(Real_t)*length) );

  hipLaunchKernelGGL((CalcEnergyForElemsPart1_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, length,emin,e_old,delvc,p_old,q_old,work,e_new);
  HIP_DEBUGSYNC;

  CalcPressureForElems_gpu(pHalfStep, bvc, pbvc, e_new, compHalfStep, vnewc,
                           pmin, p_cut, eosvmax, length);

  hipLaunchKernelGGL((CalcEnergyForElemsPart2_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, length,rho0,e_cut,emin,
         compHalfStep,delvc,pbvc,bvc,pHalfStep,ql,qq,p_old,q_old,work,
         e_new,
         q_new);
  HIP_DEBUGSYNC;

  CalcPressureForElems_gpu(p_new, bvc, pbvc, e_new, compression, vnewc,
                           pmin, p_cut, eosvmax, length);

  hipLaunchKernelGGL((CalcEnergyForElemsPart3_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, length,rho0,sixth,e_cut,emin,
         pbvc,vnewc,bvc,p_new,ql,qq,
         p_old,q_old,pHalfStep,q_new,delvc,
         e_new);
  HIP_DEBUGSYNC;

  CalcPressureForElems_gpu(p_new, bvc, pbvc, e_new, compression, vnewc,
                           pmin, p_cut, eosvmax, length);

  hipLaunchKernelGGL((CalcEnergyForElemsPart4_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, length,rho0,q_cut,
         delvc,pbvc,e_new,vnewc,bvc,
         p_new,ql,qq,
         q_new);
  HIP_DEBUGSYNC;

  HIP( hipFree(pHalfStep) );

  return ;
}

static inline
void CalcEnergyForElems_cpu(Real_t* p_new, Real_t* e_new, Real_t* q_new,
                            Real_t* bvc, Real_t* pbvc,
                            Real_t* p_old, Real_t* e_old, Real_t* q_old,
                            Real_t* compression, Real_t* compHalfStep,
                            Real_t* vnewc, Real_t* work, Real_t* delvc, Real_t pmin,
                            Real_t p_cut, Real_t  e_cut, Real_t q_cut, Real_t emin,
                            Real_t* qq, Real_t* ql,
                            Real_t rho0,
                            Real_t eosvmax,
                            Index_t length)
{
  const Real_t sixth = Real_t(1.0) / Real_t(6.0) ;
  Real_t *pHalfStep = Allocate<Real_t>(length) ;

  for (Index_t i = 0 ; i < length ; ++i) {
        e_new[i] = e_old[i] - Real_t(0.5) * delvc[i] * (p_old[i] + q_old[i])
          + Real_t(0.5) * work[i];

        if (e_new[i]  < emin ) {
          e_new[i] = emin ;
        }
  }

  CalcPressureForElems_cpu(pHalfStep, bvc, pbvc, e_new, compHalfStep, vnewc,
                           pmin, p_cut, eosvmax, length);

  for (Index_t i = 0 ; i < length ; ++i) {
        Real_t vhalf = Real_t(1.) / (Real_t(1.) + compHalfStep[i]) ;

        if ( delvc[i] > Real_t(0.) ) {
          q_new[i] /* = qq[i] = ql[i] */ = Real_t(0.) ;
        }
        else {
          Real_t ssc = ( pbvc[i] * e_new[i]
                                         + vhalf * vhalf * bvc[i] * pHalfStep[i] ) / rho0 ;

          if ( ssc <= Real_t(0.) ) {
                ssc =Real_t(.333333e-36) ;
          } else {
                ssc = SQRT(ssc) ;
          }

          q_new[i] = (ssc*ql[i] + qq[i]) ;
        }

        e_new[i] = e_new[i] + Real_t(0.5) * delvc[i]
          * (  Real_t(3.0)*(p_old[i]     + q_old[i])
                   - Real_t(4.0)*(pHalfStep[i] + q_new[i])) ;
  }

  for (Index_t i = 0 ; i < length ; ++i) {

        e_new[i] += Real_t(0.5) * work[i];

        if (FABS(e_new[i]) < e_cut) {
          e_new[i] = Real_t(0.)  ;
        }
        if (     e_new[i]  < emin ) {
          e_new[i] = emin ;
        }
  }

  CalcPressureForElems_cpu(p_new, bvc, pbvc, e_new, compression, vnewc,
                           pmin, p_cut, eosvmax, length);

  for (Index_t i = 0 ; i < length ; ++i){
        Real_t q_tilde ;

        if (delvc[i] > Real_t(0.)) {
          q_tilde = Real_t(0.) ;
        }
        else {
          Real_t ssc = ( pbvc[i] * e_new[i]
                                         + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ;

          if ( ssc <= Real_t(0.) ) {
                ssc = Real_t(.333333e-36) ;
          } else {
                ssc = SQRT(ssc) ;
          }

          q_tilde = (ssc*ql[i] + qq[i]) ;
        }

        e_new[i] = e_new[i] - (  Real_t(7.0)*(p_old[i]     + q_old[i])
                                                         - Real_t(8.0)*(pHalfStep[i] + q_new[i])
                                                         + (p_new[i] + q_tilde)) * delvc[i]*sixth ;

        if (FABS(e_new[i]) < e_cut) {
          e_new[i] = Real_t(0.)  ;
        }
        if (     e_new[i]  < emin ) {
          e_new[i] = emin ;
        }
  }

  CalcPressureForElems_cpu(p_new, bvc, pbvc, e_new, compression, vnewc,
                                                   pmin, p_cut, eosvmax, length);

  for (Index_t i = 0 ; i < length ; ++i){

        if ( delvc[i] <= Real_t(0.) ) {
          Real_t ssc = ( pbvc[i] * e_new[i]
                                         + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ;

          if ( ssc <= Real_t(0.) ) {
                ssc = Real_t(.333333e-36) ;
          } else {
                ssc = SQRT(ssc) ;
          }

          q_new[i] = (ssc*ql[i] + qq[i]) ;

          if (FABS(q_new[i]) < q_cut) q_new[i] = Real_t(0.) ;
        }
  }

  Release(&pHalfStep) ;

  return ;
}


__global__
void CalcSoundSpeedForElems_kernel(Real_t *vnewc, Real_t rho0, Real_t *enewc,
                                   Real_t *pnewc, Real_t *pbvc,
                                   Real_t *bvc, Real_t ss4o3, Index_t nz,Index_t *matElemlist,
                                   Real_t *ss)
{
  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
  if (i<nz) {

        Index_t iz = matElemlist[i];
        Real_t ssTmp = (pbvc[i] * enewc[i] + vnewc[i] * vnewc[i] *
                                        bvc[i] * pnewc[i]) / rho0;
        if (ssTmp <= Real_t(1.111111e-36)) {
          ssTmp = Real_t(1.111111e-36);
        }
        ss[iz] = SQRT(ssTmp);
  }
}


static inline
void CalcSoundSpeedForElems_gpu(Real_t *vnewc, Real_t rho0, Real_t *enewc,
                                Real_t *pnewc, Real_t *pbvc,
                                Real_t *bvc, Real_t ss4o3, Index_t nz)
{
  dim3 dimBlock=dim3(BLOCKSIZE,1,1);
  dim3 dimGrid=dim3(PAD_DIV(nz,dimBlock.x),1,1);
  hipLaunchKernelGGL((CalcSoundSpeedForElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, vnewc,rho0,enewc,pnewc,pbvc,bvc,ss4o3,nz,meshGPU.m_matElemlist,meshGPU.m_ss);
  HIP_DEBUGSYNC;

}

static inline
void CalcSoundSpeedForElems_cpu(Real_t *vnewc, Real_t rho0, Real_t *enewc,
                                Real_t *pnewc, Real_t *pbvc,
                                Real_t *bvc, Real_t ss4o3, Index_t nz)
{
  for (Index_t i = 0; i < nz ; ++i) {
        Index_t iz = mesh.matElemlist(i);
        Real_t ssTmp = (pbvc[i] * enewc[i] + vnewc[i] * vnewc[i] *
                                        bvc[i] * pnewc[i]) / rho0;
        if (ssTmp <= Real_t(1.111111e-36)) {
          ssTmp = Real_t(1.111111e-36);
        }
        mesh.ss(iz) = SQRT(ssTmp);
  }
}


__global__
void EvalEOSForElemsPart1_kernel(
                                 Index_t length,Real_t eosvmin,Real_t eosvmax,
                                 Index_t *matElemlist,
                                 Real_t *e,Real_t *delv,Real_t *p,Real_t *q,Real_t *qq,Real_t *ql,
                                 Real_t *vnewc,
                                 Real_t *e_old,Real_t *delvc,Real_t *p_old,Real_t *q_old,
                                 Real_t *compression,Real_t *compHalfStep,
                                 Real_t *qq_old,Real_t *ql_old,Real_t *work)
{
  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
  if (i<length) {
        Index_t zidx = matElemlist[i];
        e_old[i] = e[zidx];
        delvc[i] = delv[zidx];
        p_old[i] = p[zidx];
        q_old[i] = q[zidx];

        Real_t vchalf ;
        compression[i] = Real_t(1.) / vnewc[i] - Real_t(1.);
        vchalf = vnewc[i] - delvc[i] * Real_t(.5);
        compHalfStep[i] = Real_t(1.) / vchalf - Real_t(1.);

        if ( eosvmin != Real_t(0.) ) {
          if (vnewc[i] <= eosvmin) { /* impossible due to calling func? */
                compHalfStep[i] = compression[i] ;
          }
        }
        if ( eosvmax != Real_t(0.) ) {
          if (vnewc[i] >= eosvmax) { /* impossible due to calling func? */
                p_old[i]        = Real_t(0.) ;
                compression[i]  = Real_t(0.) ;
                compHalfStep[i] = Real_t(0.) ;
          }
        }

        qq_old[i] = qq[zidx] ;
        ql_old[i] = ql[zidx] ;
        work[i] = Real_t(0.) ;
  }
}


__global__
void EvalEOSForElemsPart2_kernel(
                                 Index_t length,
                                 Index_t *matElemlist,Real_t *p_new,Real_t *e_new,Real_t *q_new,
                                 Real_t *p,Real_t *e,Real_t *q)
{
  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
  if (i<length) {
        Index_t zidx = matElemlist[i] ;
        p[zidx] = p_new[i];
        e[zidx] = e_new[i];
        q[zidx] = q_new[i];
  }
}


static inline
void EvalEOSForElems_gpu(Real_t *vnewc, Index_t length)
{
  Real_t  e_cut = mesh.e_cut();
  Real_t  p_cut = mesh.p_cut();
  Real_t  ss4o3 = mesh.ss4o3();
  Real_t  q_cut = mesh.q_cut();

  Real_t eosvmax = mesh.eosvmax() ;
  Real_t eosvmin = mesh.eosvmin() ;
  Real_t pmin    = mesh.pmin() ;
  Real_t emin    = mesh.emin() ;
  Real_t rho0    = mesh.refdens() ;

  Real_t *e_old,*delvc,*p_old,*q_old;
  Real_t *compression,*compHalfStep;
  Real_t *qq,*ql,*work,*p_new,*e_new,*q_new,*bvc,*pbvc;

  HIP( hipMalloc(&e_old,sizeof(Real_t)*length) );
  HIP( hipMalloc(&delvc,sizeof(Real_t)*length) );
  HIP( hipMalloc(&p_old,sizeof(Real_t)*length) );
  HIP( hipMalloc(&q_old,sizeof(Real_t)*length) );
  HIP( hipMalloc(&compression,sizeof(Real_t)*length) );
  HIP( hipMalloc(&compHalfStep,sizeof(Real_t)*length) );
  HIP( hipMalloc(&qq,sizeof(Real_t)*length) );
  HIP( hipMalloc(&ql,sizeof(Real_t)*length) );
  HIP( hipMalloc(&work,sizeof(Real_t)*length) );
  HIP( hipMalloc(&p_new,sizeof(Real_t)*length) );
  HIP( hipMalloc(&e_new,sizeof(Real_t)*length) );
  HIP( hipMalloc(&q_new,sizeof(Real_t)*length) );
  HIP( hipMalloc(&bvc,sizeof(Real_t)*length) );
  HIP( hipMalloc(&pbvc,sizeof(Real_t)*length) );

  dim3 dimBlock=dim3(BLOCKSIZE,1,1);
  dim3 dimGrid=dim3(PAD_DIV(length,dimBlock.x),1,1);

  hipLaunchKernelGGL((EvalEOSForElemsPart1_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, length,eosvmin,eosvmax,
         meshGPU.m_matElemlist,
         meshGPU.m_e,meshGPU.m_delv,meshGPU.m_p,meshGPU.m_q,meshGPU.m_qq,meshGPU.m_ql,
         vnewc,
         e_old,delvc,p_old,q_old,
         compression,compHalfStep,qq,ql,work);
  HIP_DEBUGSYNC;

  CalcEnergyForElems_gpu(p_new, e_new, q_new, bvc, pbvc,
                         p_old, e_old,  q_old, compression, compHalfStep,
                         vnewc, work,  delvc, pmin,
                         p_cut, e_cut, q_cut, emin,
                         qq, ql, rho0, eosvmax, length);


  hipLaunchKernelGGL((EvalEOSForElemsPart2_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, length,
         meshGPU.m_matElemlist,p_new,e_new,q_new,
         meshGPU.m_p,meshGPU.m_e,meshGPU.m_q);
  HIP_DEBUGSYNC;

  CalcSoundSpeedForElems_gpu(vnewc, rho0, e_new, p_new,
                             pbvc, bvc, ss4o3, length) ;

  HIP( hipFree(pbvc) );
  HIP( hipFree(bvc) );
  HIP( hipFree(q_new) );
  HIP( hipFree(e_new) );
  HIP( hipFree(p_new) );
  HIP( hipFree(work) );
  HIP( hipFree(ql) );
  HIP( hipFree(qq) );
  HIP( hipFree(compHalfStep) );
  HIP( hipFree(compression) );
  HIP( hipFree(q_old) );
  HIP( hipFree(p_old) );
  HIP( hipFree(delvc) );
  HIP( hipFree(e_old) );
}


static inline
void EvalEOSForElems_cpu(Real_t *vnewc, Index_t length)
{
  Real_t  e_cut = mesh.e_cut();
  Real_t  p_cut = mesh.p_cut();
  Real_t  ss4o3 = mesh.ss4o3();
  Real_t  q_cut = mesh.q_cut();

  Real_t eosvmax = mesh.eosvmax() ;
  Real_t eosvmin = mesh.eosvmin() ;
  Real_t pmin    = mesh.pmin() ;
  Real_t emin    = mesh.emin() ;
  Real_t rho0    = mesh.refdens() ;

  Real_t *e_old = Allocate<Real_t>(length) ;
  Real_t *delvc = Allocate<Real_t>(length) ;
  Real_t *p_old = Allocate<Real_t>(length) ;
  Real_t *q_old = Allocate<Real_t>(length) ;
  Real_t *compression = Allocate<Real_t>(length) ;
  Real_t *compHalfStep = Allocate<Real_t>(length) ;
  Real_t *qq = Allocate<Real_t>(length) ;
  Real_t *ql = Allocate<Real_t>(length) ;
  Real_t *work = Allocate<Real_t>(length) ;
  Real_t *p_new = Allocate<Real_t>(length) ;
  Real_t *e_new = Allocate<Real_t>(length) ;
  Real_t *q_new = Allocate<Real_t>(length) ;
  Real_t *bvc = Allocate<Real_t>(length) ;
  Real_t *pbvc = Allocate<Real_t>(length) ;

  /* compress data, minimal set */
  for (Index_t i=0; i<length; ++i) {
        Index_t zidx = mesh.matElemlist(i) ;
        e_old[i] = mesh.e(zidx) ;
  }

  for (Index_t i=0; i<length; ++i) {
        Index_t zidx = mesh.matElemlist(i) ;
        delvc[i] = mesh.delv(zidx) ;
  }

  for (Index_t i=0; i<length; ++i) {
        Index_t zidx = mesh.matElemlist(i) ;
        p_old[i] = mesh.p(zidx) ;
  }

  for (Index_t i=0; i<length; ++i) {
        Index_t zidx = mesh.matElemlist(i) ;
        q_old[i] = mesh.q(zidx) ;
  }

  for (Index_t i = 0; i < length ; ++i) {
        Real_t vchalf ;
        compression[i] = Real_t(1.) / vnewc[i] - Real_t(1.);
        vchalf = vnewc[i] - delvc[i] * Real_t(.5);
        compHalfStep[i] = Real_t(1.) / vchalf - Real_t(1.);
  }

  /* Check for v > eosvmax or v < eosvmin */
  if ( eosvmin != Real_t(0.) ) {
        for(Index_t i=0 ; i<length ; ++i) {
          if (vnewc[i] <= eosvmin) { /* impossible due to calling func? */
                compHalfStep[i] = compression[i] ;
          }
        }
  }
  if ( eosvmax != Real_t(0.) ) {
        for(Index_t i=0 ; i<length ; ++i) {
          if (vnewc[i] >= eosvmax) { /* impossible due to calling func? */
                p_old[i]        = Real_t(0.) ;
                compression[i]  = Real_t(0.) ;
                compHalfStep[i] = Real_t(0.) ;
          }
        }
  }

  for (Index_t i = 0 ; i < length ; ++i) {
        Index_t zidx = mesh.matElemlist(i) ;
        qq[i] = mesh.qq(zidx) ;
        ql[i] = mesh.ql(zidx) ;
        work[i] = Real_t(0.) ;
  }

  CalcEnergyForElems_cpu(p_new, e_new, q_new, bvc, pbvc,
                         p_old, e_old,  q_old, compression, compHalfStep,
                         vnewc, work,  delvc, pmin,
                         p_cut, e_cut, q_cut, emin,
                         qq, ql, rho0, eosvmax, length);


  for (Index_t i=0; i<length; ++i) {
        Index_t zidx = mesh.matElemlist(i) ;
        mesh.p(zidx) = p_new[i] ;
  }

  for (Index_t i=0; i<length; ++i) {
        Index_t zidx = mesh.matElemlist(i) ;
        mesh.e(zidx) = e_new[i] ;
  }

  for (Index_t i=0; i<length; ++i) {
        Index_t zidx = mesh.matElemlist(i) ;
        mesh.q(zidx) = q_new[i] ;
  }

  CalcSoundSpeedForElems_cpu(vnewc, rho0, e_new, p_new,
                             pbvc, bvc, ss4o3, length) ;

  Release(&pbvc) ;
  Release(&bvc) ;
  Release(&q_new) ;
  Release(&e_new) ;
  Release(&p_new) ;
  Release(&work) ;
  Release(&ql) ;
  Release(&qq) ;
  Release(&compHalfStep) ;
  Release(&compression) ;
  Release(&q_old) ;
  Release(&p_old) ;
  Release(&delvc) ;
  Release(&e_old) ;
}


__global__
void ApplyMaterialPropertiesForElemsPart1_kernel(
                                                 Index_t length,Real_t eosvmin,Real_t eosvmax,
                                                 Index_t *matElemlist,Real_t *vnew,
                                                 Real_t *vnewc)
{
  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
  if (i<length) {
        Index_t zn = matElemlist[i] ;
        vnewc[i] = vnew[zn] ;

        if (eosvmin != Real_t(0.)) {
          if (vnewc[i] < eosvmin)
                vnewc[i] = eosvmin ;
        }

        if (eosvmax != Real_t(0.)) {
          if (vnewc[i] > eosvmax)
                vnewc[i] = eosvmax ;
        }
  }
}


static inline
void ApplyMaterialPropertiesForElems_gpu()
{
  Index_t length = mesh.numElem() ;

  if (length != 0) {
        /* Expose all of the variables needed for material evaluation */
        Real_t eosvmin = mesh.eosvmin() ;
        Real_t eosvmax = mesh.eosvmax() ;
        Real_t *vnewc;

        HIP( hipMalloc(&vnewc,sizeof(Real_t)*length) );

        dim3 dimBlock=dim3(BLOCKSIZE,1,1);
        dim3 dimGrid=dim3(PAD_DIV(length,dimBlock.x),1,1);
        hipLaunchKernelGGL((ApplyMaterialPropertiesForElemsPart1_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, length,eosvmin,eosvmax,
           meshGPU.m_matElemlist,meshGPU.m_vnew,
           vnewc);
        HIP_DEBUGSYNC;

        /*
    for (Index_t i=0; i<length; ++i) {
       Index_t zn = mesh.matElemlist(i) ;
       Real_t vc = mesh.v(zn) ;
       if (eosvmin != Real_t(0.)) {
          if (vc < eosvmin)
             vc = eosvmin ;
       }
       if (eosvmax != Real_t(0.)) {
          if (vc > eosvmax)
             vc = eosvmax ;
       }
       if (vc <= 0.) {
          exit(VolumeError) ;
       }
    }
    */

        EvalEOSForElems_gpu(vnewc, length);

        HIP( hipFree(vnewc) );
  }
}

static inline
void ApplyMaterialPropertiesForElems_cpu()
{
  Index_t length = mesh.numElem() ;

  if (length != 0) {
        /* Expose all of the variables needed for material evaluation */
        Real_t eosvmin = mesh.eosvmin() ;
        Real_t eosvmax = mesh.eosvmax() ;
        Real_t *vnewc = Allocate<Real_t>(length) ;

        for (Index_t i=0 ; i<length ; ++i) {
          Index_t zn = mesh.matElemlist(i) ;
          vnewc[i] = mesh.vnew(zn) ;
        }

        if (eosvmin != Real_t(0.)) {
          for(Index_t i=0 ; i<length ; ++i) {
                if (vnewc[i] < eosvmin)
                  vnewc[i] = eosvmin ;
          }
        }

        if (eosvmax != Real_t(0.)) {
          for(Index_t i=0 ; i<length ; ++i) {
                if (vnewc[i] > eosvmax)
                  vnewc[i] = eosvmax ;
          }
        }

        for (Index_t i=0; i<length; ++i) {
          Index_t zn = mesh.matElemlist(i) ;
          Real_t vc = mesh.v(zn) ;
          if (eosvmin != Real_t(0.)) {
                if (vc < eosvmin)
                  vc = eosvmin ;
          }
          if (eosvmax != Real_t(0.)) {
                if (vc > eosvmax)
                  vc = eosvmax ;
          }
          if (vc <= 0.) {
                exit(VolumeError) ;
          }
        }

        EvalEOSForElems_cpu(vnewc, length);

        Release(&vnewc) ;

  }
}

static inline
void ApplyMaterialPropertiesForElems(int useCPU)
{
  if (useCPU) {
        FC(matElemlist); FC(vnew); FC(v); FC(e); FC(delv); FC(p); FC(q); FC(qq); FC(ql);
        ApplyMaterialPropertiesForElems_cpu();
        SG(p); SG(e); SG(q); SG(ss);
  }
  else {
        FG(matElemlist); FG(vnew); FG(v); FG(e); FG(delv); FG(p); FG(q); FG(qq); FG(ql);
        ApplyMaterialPropertiesForElems_gpu();
        SC(p); SC(e); SC(q); SC(ss);
  }
}

__global__
void UpdateVolumesForElems_kernel(Index_t numElem,Real_t v_cut,
                                  Real_t *vnew,
                                  Real_t *v)
{
  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
  if (i<numElem) {
        Real_t tmpV ;
        tmpV = vnew[i] ;

        if ( FABS(tmpV - Real_t(1.0)) < v_cut )
          tmpV = Real_t(1.0) ;
        v[i] = tmpV ;
  }
}


static inline
void UpdateVolumesForElems_gpu()
{
  Index_t numElem = mesh.numElem();
  if (numElem != 0) {
        Real_t v_cut = mesh.v_cut();
        dim3 dimBlock=dim3(BLOCKSIZE,1,1);
        dim3 dimGrid=dim3(PAD_DIV(numElem,dimBlock.x),1,1);
        hipLaunchKernelGGL((UpdateVolumesForElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, numElem,v_cut,meshGPU.m_vnew,meshGPU.m_v);
  }
}


static inline
void UpdateVolumesForElems_cpu()
{
  Index_t numElem = mesh.numElem();
  if (numElem != 0) {
        Real_t v_cut = mesh.v_cut();

        for(Index_t i=0 ; i<numElem ; ++i) {
          Real_t tmpV ;
          tmpV = mesh.vnew(i) ;

          if ( FABS(tmpV - Real_t(1.0)) < v_cut )
                tmpV = Real_t(1.0) ;
          mesh.v(i) = tmpV ;
        }
  }

  return ;
}

static inline
void UpdateVolumesForElems(int useCPU)
{
  if (useCPU) {
        FC(vnew);
        UpdateVolumesForElems_cpu();
        SG(v);
  }
  else {
        FG(vnew);
        UpdateVolumesForElems_gpu();
        SC(v);
  }
}


static inline
void LagrangeElements(int useCPU)
{
  const Real_t deltatime = mesh.deltatime() ;

  CalcLagrangeElements(deltatime, useCPU) ;

  /* Calculate Q.  (Monotonic q option requires communication) */
  CalcQForElems(useCPU) ;

  ApplyMaterialPropertiesForElems(useCPU) ;

  UpdateVolumesForElems(useCPU) ;
}


__global__
void CalcCourantConstraintForElems_kernel(
                                          Index_t length,Real_t qqc2,
                                          Index_t *matElemlist,Real_t *ss,Real_t *vdov,Real_t *arealg,
                                          Real_t *mindtcourant)
{
  __shared__ Real_t minArray[BLOCKSIZE];

  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;

  Real_t dtcourant = Real_t(1.0e+20) ;
  if (i<length) {
        Index_t indx = matElemlist[i] ;
        Real_t dtf = ss[indx] * ss[indx] ;
        if ( vdov[indx] < Real_t(0.) ) {
                      dtf = dtf
                                                        + qqc2 * arealg[indx] * arealg[indx]
                                        * vdov[indx] * vdov[indx] ;
        }
        dtf = SQRT(dtf) ;
        dtf = arealg[indx] / dtf ;

        /* determine minimum timestep with its corresponding elem */
        if (vdov[indx] != Real_t(0.)) {
          if ( dtf < dtcourant ) {
                dtcourant = dtf ;
          }
        }
  }
  minArray[hipThreadIdx_x]=dtcourant;
  reduceMin<Real_t,BLOCKSIZE>(minArray,hipThreadIdx_x);
  if (hipThreadIdx_x==0)
        mindtcourant[hipBlockIdx_x]=minArray[0];
}


static inline
void CalcCourantConstraintForElems_gpu()
{
  Real_t qqc = mesh.qqc();
  Real_t qqc2 = Real_t(64.0) * qqc * qqc ;
  Index_t length = mesh.numElem() ;

  dim3 dimBlock=dim3(BLOCKSIZE,1,1);
  dim3 dimGrid=dim3(PAD_DIV(length,dimBlock.x),1,1);

  Real_t *dev_mindtcourant;
  HIP( hipMalloc(&dev_mindtcourant,sizeof(Real_t)*dimGrid.x) );

  hipLaunchKernelGGL((CalcCourantConstraintForElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, length,qqc2,
         meshGPU.m_matElemlist,meshGPU.m_ss,meshGPU.m_vdov,meshGPU.m_arealg,
         dev_mindtcourant);
  HIP_DEBUGSYNC;

  Real_t *mindtcourant = (Real_t*)malloc(sizeof(Real_t)*dimGrid.x);
  HIP( hipMemcpy(mindtcourant,dev_mindtcourant,sizeof(Real_t)*dimGrid.x,hipMemcpyDeviceToHost) );
  HIP( hipFree(dev_mindtcourant) );

  // finish the MIN computation over the thread blocks
  Real_t dtcourant;
  dtcourant=mindtcourant[0];
  for (int i=1; i<dimGrid.x; i++) {
        MINEQ(dtcourant,mindtcourant[i]);
  }
  free(mindtcourant);

  if (dtcourant < Real_t(1.0e+20))
        mesh.dtcourant() = dtcourant ;
}

static inline
void CalcCourantConstraintForElems_cpu()
{
  Real_t dtcourant = Real_t(1.0e+20) ;
  Index_t   courant_elem = -1 ;
  Real_t      qqc = mesh.qqc() ;
  Index_t length = mesh.numElem() ;

  Real_t  qqc2 = Real_t(64.0) * qqc * qqc ;

  for (Index_t i = 0 ; i < length ; ++i) {
        Index_t indx = mesh.matElemlist(i) ;

        Real_t dtf = mesh.ss(indx) * mesh.ss(indx) ;

        if ( mesh.vdov(indx) < Real_t(0.) ) {

                   dtf = dtf
                                 + qqc2 * mesh.arealg(indx) * mesh.arealg(indx)
                                 * mesh.vdov(indx) * mesh.vdov(indx) ;
        }

        dtf = SQRT(dtf) ;

        dtf = mesh.arealg(indx) / dtf ;

        /* determine minimum timestep with its corresponding elem */
        if (mesh.vdov(indx) != Real_t(0.)) {
          if ( dtf < dtcourant ) {
                dtcourant = dtf ;
                courant_elem = indx ;
          }
        }
  }

  /* Don't try to register a time constraint if none of the elements
   * were active */
  if (courant_elem != -1) {
        mesh.dtcourant() = dtcourant ;
  }

  return ;
}


static inline
void CalcCourantConstraintForElems(int useCPU)
{
  if (useCPU) {
        FC(matElemlist); FC(ss); FC(vdov); FC(arealg);
        CalcCourantConstraintForElems_cpu();
  }
  else {
        FG(matElemlist); FG(ss); FG(vdov); FG(arealg);
        CalcCourantConstraintForElems_gpu();
  }
}


__global__
void CalcHydroConstraintForElems_kernel(
                                        Index_t length,Real_t dvovmax,
                                        Index_t *matElemlist,Real_t *vdov,
                                        Real_t *mindthydro)
{
  __shared__ Real_t minArray[BLOCKSIZE];

  int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;

  Real_t dthydro = Real_t(1.0e+20) ;
  if (i<length) {
        Index_t indx = matElemlist[i] ;
        if (vdov[indx] != Real_t(0.)) {
          Real_t dtdvov = dvovmax / (FABS(vdov[indx])+Real_t(1.e-20)) ;
          if ( dthydro > dtdvov ) {
                dthydro = dtdvov ;
          }
        }
  }
  minArray[hipThreadIdx_x]=dthydro;
  reduceMin<Real_t,BLOCKSIZE>(minArray,hipThreadIdx_x);
  if (hipThreadIdx_x==0)
        mindthydro[hipBlockIdx_x]=minArray[0];
}


static inline
void CalcHydroConstraintForElems_gpu()
{
  Real_t dvovmax = mesh.dvovmax() ;
  Index_t length = mesh.numElem() ;

  dim3 dimBlock=dim3(BLOCKSIZE,1,1);
  dim3 dimGrid=dim3(PAD_DIV(length,dimBlock.x),1,1);

  Real_t *dev_mindthydro;
  HIP( hipMalloc(&dev_mindthydro,sizeof(Real_t)*dimGrid.x) );

  hipLaunchKernelGGL((CalcHydroConstraintForElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, length,dvovmax,
         meshGPU.m_matElemlist,meshGPU.m_vdov,
         dev_mindthydro);
  HIP_DEBUGSYNC;

  Real_t *mindthydro = (Real_t*)malloc(sizeof(Real_t)*dimGrid.x);
  HIP( hipMemcpy(mindthydro,dev_mindthydro,sizeof(Real_t)*dimGrid.x,hipMemcpyDeviceToHost) );
  HIP( hipFree(dev_mindthydro) );

  // finish the MIN computation over the thread blocks
  Real_t dthydro=mindthydro[0];
  for (int i=1; i<dimGrid.x; i++) {
        MINEQ(dthydro,mindthydro[i]);
  }
  free(mindthydro);

  if (dthydro < Real_t(1.0e+20))
        mesh.dthydro() = dthydro ;
}

static inline
void CalcHydroConstraintForElems_cpu()
{
  Real_t dthydro = Real_t(1.0e+20) ;
  Index_t hydro_elem = -1 ;
  Real_t dvovmax = mesh.dvovmax() ;
  Index_t length = mesh.numElem() ;

  for (Index_t i = 0 ; i < length ; ++i) {
        Index_t indx = mesh.matElemlist(i) ;

        if (mesh.vdov(indx) != Real_t(0.)) {
          Real_t dtdvov = dvovmax / (FABS(mesh.vdov(indx))+Real_t(1.e-20)) ;
          if ( dthydro > dtdvov ) {
                dthydro = dtdvov ;
                hydro_elem = indx ;
          }
        }
  }

  if (hydro_elem != -1) {
        mesh.dthydro() = dthydro ;
  }

  return ;
}


static inline
void CalcHydroConstraintForElems(int useCPU)
{
  if (useCPU) {
        FC(matElemlist); FC(vdov);
        CalcHydroConstraintForElems_cpu();
  }
  else {
        FG(matElemlist); FG(vdov);
        CalcHydroConstraintForElems_gpu();
  }
}



static inline
void CalcTimeConstraintsForElems(int useCPU) {
  /* evaluate time constraint */
  CalcCourantConstraintForElems(useCPU) ;

  /* check hydro constraint */
  CalcHydroConstraintForElems(useCPU) ;
}

static inline
void LagrangeLeapFrog(int useCPU)
{
  /* calculate nodal forces, accelerations, velocities, positions, with
   * applied boundary conditions and slide surface considerations */

  LagrangeNodal(useCPU);

  /* calculate element quantities (i.e. velocity gradient & q), and update
   * material states */
  LagrangeElements(useCPU);

  CalcTimeConstraintsForElems(useCPU);

  // LagrangeRelease() ;  Creation/destruction of temps may be important to capture
}

int main(int argc, char *argv[])
{
  Index_t edgeElems = 45 ;
  Index_t edgeNodes = edgeElems+1 ;
  // Real_t ds = Real_t(1.125)/Real_t(edgeElems) ; /* may accumulate roundoff */
  Real_t tx, ty, tz ;
  Index_t nidx, zidx ;
  Index_t meshElems ;

  /* get run options to measure various metrics */

  /* ... */

  hip_init();

  /****************************/
  /*   Initialize Sedov Mesh  */
  /****************************/

  /* construct a uniform box for this processor */

  mesh.sizeX()   = edgeElems ;
  mesh.sizeY()   = edgeElems ;
  mesh.sizeZ()   = edgeElems ;
  mesh.numElem() = edgeElems*edgeElems*edgeElems ;
  mesh.numNode() = edgeNodes*edgeNodes*edgeNodes ;

  meshElems = mesh.numElem() ;


  /* allocate field memory */

  mesh.AllocateElemPersistent(mesh.numElem()) ;
  mesh.AllocateElemTemporary (mesh.numElem()) ;

  mesh.AllocateNodalPersistent(mesh.numNode()) ;
  mesh.AllocateNodesets(edgeNodes*edgeNodes) ;


  /* initialize nodal coordinates */

  nidx = 0 ;
  tz  = Real_t(0.) ;
  for (Index_t plane=0; plane<edgeNodes; ++plane) {
        ty = Real_t(0.) ;
        for (Index_t row=0; row<edgeNodes; ++row) {
          tx = Real_t(0.) ;
          for (Index_t col=0; col<edgeNodes; ++col) {
                mesh.x(nidx) = tx ;
                mesh.y(nidx) = ty ;
                mesh.z(nidx) = tz ;
                ++nidx ;
                // tx += ds ; /* may accumulate roundoff... */
                tx = Real_t(1.125)*Real_t(col+1)/Real_t(edgeElems) ;
          }
          // ty += ds ;  /* may accumulate roundoff... */
          ty = Real_t(1.125)*Real_t(row+1)/Real_t(edgeElems) ;
        }
        // tz += ds ;  /* may accumulate roundoff... */
        tz = Real_t(1.125)*Real_t(plane+1)/Real_t(edgeElems) ;
  }


  /* embed hexehedral elements in nodal point lattice */

  nidx = 0 ;
  zidx = 0 ;
  for (Index_t plane=0; plane<edgeElems; ++plane) {
        for (Index_t row=0; row<edgeElems; ++row) {
          for (Index_t col=0; col<edgeElems; ++col) {
                mesh.nodelist(zidx,0) = nidx                                       ;
                mesh.nodelist(zidx,1) = nidx                                   + 1 ;
                mesh.nodelist(zidx,2) = nidx                       + edgeNodes + 1 ;
                mesh.nodelist(zidx,3) = nidx                       + edgeNodes     ;
                mesh.nodelist(zidx,4) = nidx + edgeNodes*edgeNodes                 ;
                mesh.nodelist(zidx,5) = nidx + edgeNodes*edgeNodes             + 1 ;
                mesh.nodelist(zidx,6) = nidx + edgeNodes*edgeNodes + edgeNodes + 1 ;
                mesh.nodelist(zidx,7) = nidx + edgeNodes*edgeNodes + edgeNodes     ;
                ++zidx ;
                ++nidx ;
          }
          ++nidx ;
        }
        nidx += edgeNodes ;
  }

  /* Create a material IndexSet (entire mesh same material for now) */
  for (Index_t i=0; i<meshElems; ++i) {
        mesh.matElemlist(i) = i ;
  }

  /* initialize material parameters */
  mesh.dtfixed() = Real_t(-1.0e-7) ;
  mesh.deltatime() = Real_t(1.0e-7) ;
  mesh.deltatimemultlb() = Real_t(1.1) ;
  mesh.deltatimemultub() = Real_t(1.2) ;
  int numIters = 0;
  if (argc == 2) {
        mesh.stoptime()  = Real_t(atof(argv[1])) ;
        numIters = 10;
  } else if (argc == 3) {
        mesh.stoptime()  = Real_t(atof(argv[1])) ;
        numIters = atoi(argv[2]);
  } else {
        mesh.stoptime()  = Real_t(1.0e-2) ;
        numIters = 10;
  }
  mesh.dtcourant() = Real_t(1.0e+20) ;
  mesh.dthydro()   = Real_t(1.0e+20) ;
  mesh.dtmax()     = Real_t(1.0e-2) ;
  mesh.time()    = Real_t(0.) ;
  mesh.cycle()   = 0 ;

  mesh.e_cut() = Real_t(1.0e-7) ;
  mesh.p_cut() = Real_t(1.0e-7) ;
  mesh.q_cut() = Real_t(1.0e-7) ;
  mesh.u_cut() = Real_t(1.0e-7) ;
  mesh.v_cut() = Real_t(1.0e-10) ;

  mesh.hgcoef()      = Real_t(3.0) ;
  mesh.ss4o3()       = Real_t(4.0)/Real_t(3.0) ;

  mesh.qstop()              =  Real_t(1.0e+12) ;
  mesh.monoq_max_slope()    =  Real_t(1.0) ;
  mesh.monoq_limiter_mult() =  Real_t(2.0) ;
  mesh.qlc_monoq()          = Real_t(0.5) ;
  mesh.qqc_monoq()          = Real_t(2.0)/Real_t(3.0) ;
  mesh.qqc()                = Real_t(2.0) ;

  mesh.pmin() =  Real_t(0.) ;
  mesh.emin() = Real_t(-1.0e+15) ;

  mesh.dvovmax() =  Real_t(0.1) ;

  mesh.eosvmax() =  Real_t(1.0e+9) ;
  mesh.eosvmin() =  Real_t(1.0e-9) ;

  mesh.refdens() =  Real_t(1.0) ;

  /* initialize field data */
  for (Index_t i=0; i<meshElems; ++i) {
        Real_t x_local[8], y_local[8], z_local[8] ;
        for( Index_t lnode=0 ; lnode<8 ; ++lnode )
          {
                Index_t gnode = mesh.nodelist(i,lnode);
                x_local[lnode] = mesh.x(gnode);
                y_local[lnode] = mesh.y(gnode);
                z_local[lnode] = mesh.z(gnode);
          }

        // volume calculations
        Real_t volume = CalcElemVolume(x_local, y_local, z_local );
        mesh.volo(i) = volume ;
        mesh.elemMass(i) = volume ;
        for (Index_t j=0; j<8; ++j) {
          Index_t idx = mesh.nodelist(i,j);
          mesh.nodalMass(idx) += volume / Real_t(8.0) ;
        }
  }

  /* deposit energy */
  mesh.e(0) = Real_t(3.948746e+7) ;

  /* set up symmetry nodesets */
  nidx = 0 ;
  for (Index_t i=0; i<edgeNodes; ++i) {
        Index_t planeInc = i*edgeNodes*edgeNodes ;
        Index_t rowInc   = i*edgeNodes ;
        for (Index_t j=0; j<edgeNodes; ++j) {
          mesh.symmX(nidx) = planeInc + j*edgeNodes ;
          mesh.symmY(nidx) = planeInc + j ;
          mesh.symmZ(nidx) = rowInc   + j ;
          ++nidx ;
        }
  }

  /* set up elemement connectivity information */
  mesh.lxim(0) = 0 ;
  for (Index_t i=1; i<meshElems; ++i) {
        mesh.lxim(i)   = i-1 ;
        mesh.lxip(i-1) = i ;
  }
  mesh.lxip(meshElems-1) = meshElems-1 ;

  for (Index_t i=0; i<edgeElems; ++i) {
        mesh.letam(i) = i ;
        mesh.letap(meshElems-edgeElems+i) = meshElems-edgeElems+i ;
  }
  for (Index_t i=edgeElems; i<meshElems; ++i) {
        mesh.letam(i) = i-edgeElems ;
        mesh.letap(i-edgeElems) = i ;
  }

  for (Index_t i=0; i<edgeElems*edgeElems; ++i) {
        mesh.lzetam(i) = i ;
        mesh.lzetap(meshElems-edgeElems*edgeElems+i) = meshElems-edgeElems*edgeElems+i ;
  }
  for (Index_t i=edgeElems*edgeElems; i<meshElems; ++i) {
        mesh.lzetam(i) = i - edgeElems*edgeElems ;
        mesh.lzetap(i-edgeElems*edgeElems) = i ;
  }

  /* set up boundary condition information */
  for (Index_t i=0; i<meshElems; ++i) {
        mesh.elemBC(i) = 0 ;  /* clear BCs by default */
  }

  /* faces on "external" boundaries will be */
  /* symmetry plane or free surface BCs */
  for (Index_t i=0; i<edgeElems; ++i) {
        Index_t planeInc = i*edgeElems*edgeElems ;
        Index_t rowInc   = i*edgeElems ;
        for (Index_t j=0; j<edgeElems; ++j) {
          mesh.elemBC(planeInc+j*edgeElems) |= XI_M_SYMM ;
          mesh.elemBC(planeInc+j*edgeElems+edgeElems-1) |= XI_P_FREE ;
          mesh.elemBC(planeInc+j) |= ETA_M_SYMM ;
          mesh.elemBC(planeInc+j+edgeElems*edgeElems-edgeElems) |= ETA_P_FREE ;
          mesh.elemBC(rowInc+j) |= ZETA_M_SYMM ;
          mesh.elemBC(rowInc+j+meshElems-edgeElems*edgeElems) |= ZETA_P_FREE ;
        }
  }

  mesh.AllocateNodeElemIndexes();



  /* initialize meshGPU */
  meshGPU.init(&mesh);
  meshGPU.freshenGPU();

  /* timestep to solution */
  int its=0;
#if 0
  while (its<50) {
#else
  //while(mesh.time() < mesh.stoptime() ) {
  while(its < numIters) {
#endif
          TimeIncrement() ;
          LagrangeLeapFrog(0) ;
          its++;
          /* problem->commNodes->Transfer(CommNodes::syncposvel) ; */
#if LULESH_SHOW_PROGRESS
          printf("time = %e, dt=%e\n",
                 double(mesh.time()), double(mesh.deltatime()) ) ;
#endif
  }
  printf("iterations: %d\n",its);

#if LULESH_WRITE_OUTPUT
     FC(x);
     FILE *fp = fopen("x.asc","wb");
     for (Index_t i=0; i<mesh.numElem(); i++)
         fprintf(fp,"%.6f\n",mesh.x(i));
     fclose(fp);
#endif

  return 0 ;
}
