/*
Copyright (c) 2010.
Lawrence Livermore National Security, LLC.
Produced at the Lawrence Livermore National Laboratory.
LLNL-CODE-461231
All rights reserved.
This file is part of LULESH, Version 1.0.
Please also read this link -- http://www.opensource.org/licenses/index.php
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the disclaimer below.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the disclaimer (as noted below)
in the documentation and/or other materials provided with the
distribution.
* Neither the name of the LLNS/LLNL nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC,
THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Additional BSD Notice
1. This notice is required to be provided under our contract with the U.S.
Department of Energy (DOE). This work was produced at Lawrence Livermore
National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE.
2. Neither the United States Government nor Lawrence Livermore National
Security, LLC nor any of their employees, makes any warranty, express
or implied, or assumes any liability or responsibility for the accuracy,
completeness, or usefulness of any information, apparatus, product, or
process disclosed, or represents that its use would not infringe
privately-owned rights.
3. Also, reference herein to any specific commercial products, process, or
services by trade name, trademark, manufacturer or otherwise does not
necessarily constitute or imply its endorsement, recommendation, or
favoring by the United States Government or Lawrence Livermore National
Security, LLC. The views and opinions of authors expressed herein do not
necessarily state or reflect those of the United States Government or
Lawrence Livermore National Security, LLC, and shall not be used for
advertising or product endorsement purposes.
*/
#include "hip/hip_runtime.h"
#include <vector>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#define LULESH_SHOW_PROGRESS 1
enum { VolumeError = -1, QStopError = -2 } ;
/****************************************************/
/* Allow flexibility for arithmetic representations */
/****************************************************/
/* Could also support fixed point and interval arithmetic types */
typedef float real4 ;
typedef double real8 ;
typedef long double real10 ; /* 10 bytes on x86 */
typedef int Index_t ; /* array subscript and loop index */
typedef real8 Real_t ; /* floating point representation */
typedef int Int_t ; /* integer representation */
__host__ __device__ inline real4 SQRT(real4 arg) { return sqrtf(arg) ; }
__host__ __device__ inline real8 SQRT(real8 arg) { return sqrt(arg) ; }
__host__ inline real10 SQRT(real10 arg) { return sqrtl(arg) ; }
__host__ __device__ inline real4 CBRT(real4 arg) { return cbrtf(arg) ; }
__host__ __device__ inline real8 CBRT(real8 arg) { return cbrt(arg) ; }
__host__ inline real10 CBRT(real10 arg) { return cbrtl(arg) ; }
__host__ __device__ inline real4 FABS(real4 arg) { return fabsf(arg) ; }
__host__ __device__ inline real8 FABS(real8 arg) { return fabs(arg) ; }
__host__ inline real10 FABS(real10 arg) { return fabsl(arg) ; }
__host__ __device__ inline real4 FMAX(real4 arg1,real4 arg2) { return fmaxf(arg1,arg2) ; }
__host__ __device__ inline real8 FMAX(real8 arg1,real8 arg2) { return fmax(arg1,arg2) ; }
__host__ inline real10 FMAX(real10 arg1,real10 arg2) { return fmaxl(arg1,arg2) ; }
#define HIP_SAFE_CALL( call) do { \
hipError_t err = call; \
if( hipSuccess != err) { \
fprintf(stderr, "HIP error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, hipGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} } while (0)
#define HIP(call) HIP_SAFE_CALL(call)
#ifdef HIP_SYNC_ALL
#define HIP_DEBUGSYNC HIP(hipDeviceSynchronize())
#else
#define HIP_DEBUGSYNC
#endif
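/* Compile with -DHIP_SYNC_ALL to force a checked hipDeviceSynchronize()
 * after each kernel launch, so asynchronous launch errors surface at the
 * offending call site instead of at some later synchronization point. */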
#define BLOCKSIZE 256
/* Given a number of bytes, nbytes, and a byte alignment, align (e.g., 2,
 * 4, 8, or 16), PAD returns the smallest multiple of align that is greater
 * than or equal to nbytes, and PAD_DIV returns the number of align-sized
 * chunks needed to hold nbytes.
 */
#define PAD_DIV(nbytes, align) (((nbytes) + (align) - 1) / (align))
#define PAD(nbytes, align) (PAD_DIV((nbytes),(align)) * (align))
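/* Worked example (illustrative): with align = 16,
 *   PAD_DIV(1000, 16) = (1000 + 15) / 16 = 63    -- number of 16-byte chunks
 *   PAD(1000, 16)     = 63 * 16        = 1008    -- 1000 rounded up to a multiple of 16
 * PAD_DIV is also used below to size kernel grids: PAD_DIV(numElem, BLOCKSIZE)
 * is the number of blocks needed to cover numElem threads. */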
/* Block-wide shared-memory sum reduction for arbitrary
 * numThreadsPerBlock <= 1024 (power of two or not). Because
 * numThreadsPerBlock is a template parameter, the conditionals on it
 * below are evaluated at compile time.
 */
template <class T, int numThreadsPerBlock>
__device__ void
reduceSum(T *sresult, const int threadID)
{
/* If the number of threads is not a power of two, first fold the entries
   past the last power of two back into the beginning. At most one of
   these conditionals is true for a given NPOT block size. (The first case
   uses <= 1024 so a full 1024-thread block is also folded to 512 here,
   since the power-of-two cascade below starts at 512.) */
if (numThreadsPerBlock > 512 && numThreadsPerBlock <= 1024)
{
__syncthreads();
if (threadID < numThreadsPerBlock-512)
sresult[threadID] += sresult[threadID + 512];
}
if (numThreadsPerBlock > 256 && numThreadsPerBlock < 512)
{
__syncthreads();
if (threadID < numThreadsPerBlock-256)
sresult[threadID] += sresult[threadID + 256];
}
if (numThreadsPerBlock > 128 && numThreadsPerBlock < 256)
{
__syncthreads();
if (threadID < numThreadsPerBlock-128)
sresult[threadID] += sresult[threadID + 128];
}
if (numThreadsPerBlock > 64 && numThreadsPerBlock < 128)
{
__syncthreads();
if (threadID < numThreadsPerBlock-64)
sresult[threadID] += sresult[threadID + 64];
}
if (numThreadsPerBlock > 32 && numThreadsPerBlock < 64)
{
__syncthreads();
if (threadID < numThreadsPerBlock-32)
sresult[threadID] += sresult[threadID + 32];
}
if (numThreadsPerBlock > 16 && numThreadsPerBlock < 32)
{
__syncthreads();
if (threadID < numThreadsPerBlock-16)
sresult[threadID] += sresult[threadID + 16];
}
if (numThreadsPerBlock > 8 && numThreadsPerBlock < 16)
{
__syncthreads();
if (threadID < numThreadsPerBlock-8)
sresult[threadID] += sresult[threadID + 8];
}
if (numThreadsPerBlock > 4 && numThreadsPerBlock < 8)
{
__syncthreads();
if (threadID < numThreadsPerBlock-4)
sresult[threadID] += sresult[threadID + 4];
}
if (numThreadsPerBlock > 2 && numThreadsPerBlock < 4)
{
__syncthreads();
if (threadID < numThreadsPerBlock-2)
sresult[threadID] += sresult[threadID + 2];
}
if (numThreadsPerBlock >= 512) {
__syncthreads();
if (threadID < 256)
sresult[threadID] += sresult[threadID + 256];
}
if (numThreadsPerBlock >= 256) {
__syncthreads();
if (threadID < 128)
sresult[threadID] += sresult[threadID + 128];
}
if (numThreadsPerBlock >= 128) {
__syncthreads();
if (threadID < 64)
sresult[threadID] += sresult[threadID + 64];
}
__syncthreads();
#ifdef _DEVICEEMU
if (numThreadsPerBlock >= 64) {
__syncthreads();
if (threadID < 32)
sresult[threadID] += sresult[threadID + 32];
}
if (numThreadsPerBlock >= 32) {
__syncthreads();
if (threadID < 16)
sresult[threadID] += sresult[threadID + 16];
}
if (numThreadsPerBlock >= 16) {
__syncthreads();
if (threadID < 8)
sresult[threadID] += sresult[threadID + 8];
}
if (numThreadsPerBlock >= 8) {
__syncthreads();
if (threadID < 4)
sresult[threadID] += sresult[threadID + 4];
}
if (numThreadsPerBlock >= 4) {
__syncthreads();
if (threadID < 2)
sresult[threadID] += sresult[threadID + 2];
}
if (numThreadsPerBlock >= 2) {
__syncthreads();
if (threadID < 1)
sresult[threadID] += sresult[threadID + 1];
}
#else
if (threadID < 32) {
volatile T *vol = sresult;
if (numThreadsPerBlock >= 64) vol[threadID] += vol[threadID + 32];
if (numThreadsPerBlock >= 32) vol[threadID] += vol[threadID + 16];
if (numThreadsPerBlock >= 16) vol[threadID] += vol[threadID + 8];
if (numThreadsPerBlock >= 8) vol[threadID] += vol[threadID + 4];
if (numThreadsPerBlock >= 4) vol[threadID] += vol[threadID + 2];
if (numThreadsPerBlock >= 2) vol[threadID] += vol[threadID + 1];
}
#endif
__syncthreads();
}
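/* Minimal usage sketch (illustrative only -- the kernel name and arguments
 * here are hypothetical, not part of LULESH): each block stages its values
 * in shared memory, calls reduceSum, and thread 0 writes the block's
 * partial sum.
 *
 *   template <int numThreadsPerBlock>
 *   __global__ void blockSum_example(const Real_t *in, Real_t *partial, int n)
 *   {
 *       __shared__ Real_t sresult[numThreadsPerBlock];
 *       int tid = hipThreadIdx_x;
 *       int i   = hipBlockIdx_x * numThreadsPerBlock + tid;
 *       sresult[tid] = (i < n) ? in[i] : Real_t(0.);
 *       reduceSum<Real_t, numThreadsPerBlock>(sresult, tid);
 *       if (tid == 0) partial[hipBlockIdx_x] = sresult[0];
 *   }
 */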
#define MINEQ(a,b) (a)=(((a)<(b))?(a):(b))
template <class T, int numThreadsPerBlock>
__device__ void
reduceMin(T *sresult, const int threadID)
{
/* If the number of threads is not a power of two, first fold (via min) the
   entries past the last power of two back into the beginning. At most one
   of these conditionals is true for a given NPOT block size. */
if (numThreadsPerBlock > 512 && numThreadsPerBlock <= 1024)
{
__syncthreads();
if (threadID < numThreadsPerBlock-512)
MINEQ(sresult[threadID],sresult[threadID + 512]);
}
if (numThreadsPerBlock > 256 && numThreadsPerBlock < 512)
{
__syncthreads();
if (threadID < numThreadsPerBlock-256)
MINEQ(sresult[threadID],sresult[threadID + 256]);
}
if (numThreadsPerBlock > 128 && numThreadsPerBlock < 256)
{
__syncthreads();
if (threadID < numThreadsPerBlock-128)
MINEQ(sresult[threadID],sresult[threadID + 128]);
}
if (numThreadsPerBlock > 64 && numThreadsPerBlock < 128)
{
__syncthreads();
if (threadID < numThreadsPerBlock-64)
MINEQ(sresult[threadID],sresult[threadID + 64]);
}
if (numThreadsPerBlock > 32 && numThreadsPerBlock < 64)
{
__syncthreads();
if (threadID < numThreadsPerBlock-32)
MINEQ(sresult[threadID],sresult[threadID + 32]);
}
if (numThreadsPerBlock > 16 && numThreadsPerBlock < 32)
{
__syncthreads();
if (threadID < numThreadsPerBlock-16)
MINEQ(sresult[threadID],sresult[threadID + 16]);
}
if (numThreadsPerBlock > 8 && numThreadsPerBlock < 16)
{
__syncthreads();
if (threadID < numThreadsPerBlock-8)
MINEQ(sresult[threadID],sresult[threadID + 8]);
}
if (numThreadsPerBlock > 4 && numThreadsPerBlock < 8)
{
__syncthreads();
if (threadID < numThreadsPerBlock-4)
MINEQ(sresult[threadID],sresult[threadID + 4]);
}
if (numThreadsPerBlock > 2 && numThreadsPerBlock < 4)
{
__syncthreads();
if (threadID < numThreadsPerBlock-2)
MINEQ(sresult[threadID],sresult[threadID + 2]);
}
if (numThreadsPerBlock >= 512) {
__syncthreads();
if (threadID < 256)
MINEQ(sresult[threadID],sresult[threadID + 256]);
}
if (numThreadsPerBlock >= 256) {
__syncthreads();
if (threadID < 128)
MINEQ(sresult[threadID],sresult[threadID + 128]);
}
if (numThreadsPerBlock >= 128) {
__syncthreads();
if (threadID < 64)
MINEQ(sresult[threadID],sresult[threadID + 64]);
}
__syncthreads();
#ifdef _DEVICEEMU
if (numThreadsPerBlock >= 64) {
__syncthreads();
if (threadID < 32)
MINEQ(sresult[threadID],sresult[threadID + 32]);
}
if (numThreadsPerBlock >= 32) {
__syncthreads();
if (threadID < 16)
MINEQ(sresult[threadID],sresult[threadID + 16]);
}
if (numThreadsPerBlock >= 16) {
__syncthreads();
if (threadID < 8)
MINEQ(sresult[threadID],sresult[threadID + 8]);
}
if (numThreadsPerBlock >= 8) {
__syncthreads();
if (threadID < 4)
MINEQ(sresult[threadID],sresult[threadID + 4]);
}
if (numThreadsPerBlock >= 4) {
__syncthreads();
if (threadID < 2)
MINEQ(sresult[threadID],sresult[threadID + 2]);
}
if (numThreadsPerBlock >= 2) {
__syncthreads();
if (threadID < 1)
MINEQ(sresult[threadID],sresult[threadID + 1]);
}
#else
if (threadID < 32) {
volatile T *vol = sresult;
if (numThreadsPerBlock >= 64) MINEQ(vol[threadID],vol[threadID + 32]);
if (numThreadsPerBlock >= 32) MINEQ(vol[threadID],vol[threadID + 16]);
if (numThreadsPerBlock >= 16) MINEQ(vol[threadID],vol[threadID + 8]);
if (numThreadsPerBlock >= 8) MINEQ(vol[threadID],vol[threadID + 4]);
if (numThreadsPerBlock >= 4) MINEQ(vol[threadID],vol[threadID + 2]);
if (numThreadsPerBlock >= 2) MINEQ(vol[threadID],vol[threadID + 1]);
}
#endif
__syncthreads();
}
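/* reduceMin follows the same call pattern as the reduceSum sketch above;
 * it is the building block for min-reductions such as finding the smallest
 * per-element timestep constraint in a block (sketch, assuming values are
 * staged in shared memory first and padded with a large sentinel):
 *   reduceMin<Real_t, BLOCKSIZE>(s_mindt, tid);
 *   if (tid == 0) blockMin[hipBlockIdx_x] = s_mindt[0];
 */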
void hip_init()
{
int deviceCount, dev;
hipDeviceProp_t hip_deviceProp;
char *s;
HIP( hipGetDeviceCount(&deviceCount) );
if (deviceCount == 0) {
fprintf(stderr, "hip_init(): no devices supporting HIP.\n");
exit(1);
}
s=getenv("HIP_DEVICE");
if (s != NULL) { dev=atoi(s); }
else dev=0;
if ((dev < 0) || (dev > deviceCount-1)) {
fprintf(stderr, "hip_init(): requested device (%d) out of range [%d,%d]\n",
dev, 0, deviceCount-1);
exit(1);
}
HIP( hipGetDeviceProperties(&hip_deviceProp, dev) );
if (hip_deviceProp.major < 1) {
fprintf(stderr, "hip_init(): device %d does not support HIP.\n", dev);
exit(1);
}
fprintf(stderr, "setting HIP device %d\n",dev);
HIP( hipSetDevice(dev) );
}
/************************************************************/
/* Allow for flexible data layout experiments by separating */
/* array interface from underlying implementation. */
/************************************************************/
struct Mesh {
/* This first implementation allows for runnable code */
/* and is not meant to be optimal. Final implementation */
/* should separate declaration and allocation phases */
/* so that allocation can be scheduled in a cache conscious */
/* manner. */
friend struct MeshGPU;
public:
/**************/
/* Allocation */
/**************/
void AllocateNodalPersistent(size_t size)
{
m_x.resize(size) ;
m_y.resize(size) ;
m_z.resize(size) ;
m_xd.resize(size, Real_t(0.)) ;
m_yd.resize(size, Real_t(0.)) ;
m_zd.resize(size, Real_t(0.)) ;
m_xdd.resize(size, Real_t(0.)) ;
m_ydd.resize(size, Real_t(0.)) ;
m_zdd.resize(size, Real_t(0.)) ;
m_fx.resize(size) ;
m_fy.resize(size) ;
m_fz.resize(size) ;
m_nodalMass.resize(size, Real_t(0.)) ;
}
void AllocateElemPersistent(size_t size)
{
m_matElemlist.resize(size) ;
m_nodelist.resize(8*size) ;
m_lxim.resize(size) ;
m_lxip.resize(size) ;
m_letam.resize(size) ;
m_letap.resize(size) ;
m_lzetam.resize(size) ;
m_lzetap.resize(size) ;
m_elemBC.resize(size) ;
m_e.resize(size, Real_t(0.)) ;
m_p.resize(size, Real_t(0.)) ;
m_q.resize(size) ;
m_ql.resize(size) ;
m_qq.resize(size) ;
m_v.resize(size, 1.0) ;
m_volo.resize(size) ;
m_delv.resize(size) ;
m_vdov.resize(size) ;
m_arealg.resize(size) ;
m_ss.resize(size) ;
m_elemMass.resize(size) ;
}
/* Temporaries should not be initialized in bulk but */
/* this is a runnable placeholder for now */
void AllocateElemTemporary(size_t size)
{
m_dxx.resize(size) ;
m_dyy.resize(size) ;
m_dzz.resize(size) ;
m_delv_xi.resize(size) ;
m_delv_eta.resize(size) ;
m_delv_zeta.resize(size) ;
m_delx_xi.resize(size) ;
m_delx_eta.resize(size) ;
m_delx_zeta.resize(size) ;
m_vnew.resize(size) ;
}
void AllocateNodesets(size_t size)
{
m_symmX.resize(size) ;
m_symmY.resize(size) ;
m_symmZ.resize(size) ;
}
void AllocateNodeElemIndexes()
{
Index_t i,j,nidx;
/* set up node-centered indexing of elements */
m_nodeElemCount.resize(m_numNode);
for (i=0;i<m_numNode;i++) m_nodeElemCount[i]=0;
m_nodeElemCornerList.resize(m_numNode*8);
for (i=0;i<m_numElem;i++) {
for (j=0;j<8;j++) {
nidx=nodelist(i,j);
m_nodeElemCornerList[nidx+m_numNode*m_nodeElemCount[nidx]++] = i+m_numElem*j;
if (m_nodeElemCount[nidx]>8) {
fprintf(stderr, "Node degree is higher than 8!\n");
exit(1);
}
}
}
}
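/* Layout note: both node-centered maps are stored node-major with stride
 * m_numNode, so corner j of node n lives at
 * m_nodeElemCornerList[n + m_numNode*j]. Each stored entry encodes
 * (element i, local corner j) as c = i + m_numElem*j, decodable as
 *   Index_t elem   = c % m_numElem ;
 *   Index_t corner = c / m_numElem ;
 * and, conveniently, c directly indexes the numElem*8 corner-major force
 * arrays (fx_elem etc.) that the GPU gather kernels read below. */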
/**********/
/* Access */
/**********/
/* Node-centered */
Real_t& x(Index_t idx) { return m_x[idx] ; }
Real_t& y(Index_t idx) { return m_y[idx] ; }
Real_t& z(Index_t idx) { return m_z[idx] ; }
Real_t& xd(Index_t idx) { return m_xd[idx] ; }
Real_t& yd(Index_t idx) { return m_yd[idx] ; }
Real_t& zd(Index_t idx) { return m_zd[idx] ; }
Real_t& xdd(Index_t idx) { return m_xdd[idx] ; }
Real_t& ydd(Index_t idx) { return m_ydd[idx] ; }
Real_t& zdd(Index_t idx) { return m_zdd[idx] ; }
Real_t& fx(Index_t idx) { return m_fx[idx] ; }
Real_t& fy(Index_t idx) { return m_fy[idx] ; }
Real_t& fz(Index_t idx) { return m_fz[idx] ; }
Real_t& nodalMass(Index_t idx) { return m_nodalMass[idx] ; }
Index_t& symmX(Index_t idx) { return m_symmX[idx] ; }
Index_t& symmY(Index_t idx) { return m_symmY[idx] ; }
Index_t& symmZ(Index_t idx) { return m_symmZ[idx] ; }
/* Element-centered */
Index_t& matElemlist(Index_t idx) { return m_matElemlist[idx] ; }
Index_t& nodelist(Index_t idx,Index_t nidx) { return m_nodelist[idx+nidx*m_numElem] ; }
Index_t& lxim(Index_t idx) { return m_lxim[idx] ; }
Index_t& lxip(Index_t idx) { return m_lxip[idx] ; }
Index_t& letam(Index_t idx) { return m_letam[idx] ; }
Index_t& letap(Index_t idx) { return m_letap[idx] ; }
Index_t& lzetam(Index_t idx) { return m_lzetam[idx] ; }
Index_t& lzetap(Index_t idx) { return m_lzetap[idx] ; }
Int_t& elemBC(Index_t idx) { return m_elemBC[idx] ; }
Real_t& dxx(Index_t idx) { return m_dxx[idx] ; }
Real_t& dyy(Index_t idx) { return m_dyy[idx] ; }
Real_t& dzz(Index_t idx) { return m_dzz[idx] ; }
Real_t& delv_xi(Index_t idx) { return m_delv_xi[idx] ; }
Real_t& delv_eta(Index_t idx) { return m_delv_eta[idx] ; }
Real_t& delv_zeta(Index_t idx) { return m_delv_zeta[idx] ; }
Real_t& delx_xi(Index_t idx) { return m_delx_xi[idx] ; }
Real_t& delx_eta(Index_t idx) { return m_delx_eta[idx] ; }
Real_t& delx_zeta(Index_t idx) { return m_delx_zeta[idx] ; }
Real_t& e(Index_t idx) { return m_e[idx] ; }
Real_t& p(Index_t idx) { return m_p[idx] ; }
Real_t& q(Index_t idx) { return m_q[idx] ; }
Real_t& ql(Index_t idx) { return m_ql[idx] ; }
Real_t& qq(Index_t idx) { return m_qq[idx] ; }
Real_t& v(Index_t idx) { return m_v[idx] ; }
Real_t& volo(Index_t idx) { return m_volo[idx] ; }
Real_t& vnew(Index_t idx) { return m_vnew[idx] ; }
Real_t& delv(Index_t idx) { return m_delv[idx] ; }
Real_t& vdov(Index_t idx) { return m_vdov[idx] ; }
Real_t& arealg(Index_t idx) { return m_arealg[idx] ; }
Real_t& ss(Index_t idx) { return m_ss[idx] ; }
Real_t& elemMass(Index_t idx) { return m_elemMass[idx] ; }
/* Params */
Real_t& dtfixed() { return m_dtfixed ; }
Real_t& time() { return m_time ; }
Real_t& deltatime() { return m_deltatime ; }
Real_t& deltatimemultlb() { return m_deltatimemultlb ; }
Real_t& deltatimemultub() { return m_deltatimemultub ; }
Real_t& stoptime() { return m_stoptime ; }
Real_t& u_cut() { return m_u_cut ; }
Real_t& hgcoef() { return m_hgcoef ; }
Real_t& qstop() { return m_qstop ; }
Real_t& monoq_max_slope() { return m_monoq_max_slope ; }
Real_t& monoq_limiter_mult() { return m_monoq_limiter_mult ; }
Real_t& e_cut() { return m_e_cut ; }
Real_t& p_cut() { return m_p_cut ; }
Real_t& ss4o3() { return m_ss4o3 ; }
Real_t& q_cut() { return m_q_cut ; }
Real_t& v_cut() { return m_v_cut ; }
Real_t& qlc_monoq() { return m_qlc_monoq ; }
Real_t& qqc_monoq() { return m_qqc_monoq ; }
Real_t& qqc() { return m_qqc ; }
Real_t& eosvmax() { return m_eosvmax ; }
Real_t& eosvmin() { return m_eosvmin ; }
Real_t& pmin() { return m_pmin ; }
Real_t& emin() { return m_emin ; }
Real_t& dvovmax() { return m_dvovmax ; }
Real_t& refdens() { return m_refdens ; }
Real_t& dtcourant() { return m_dtcourant ; }
Real_t& dthydro() { return m_dthydro ; }
Real_t& dtmax() { return m_dtmax ; }
Int_t& cycle() { return m_cycle ; }
Index_t& sizeX() { return m_sizeX ; }
Index_t& sizeY() { return m_sizeY ; }
Index_t& sizeZ() { return m_sizeZ ; }
Index_t& numElem() { return m_numElem ; }
Index_t& numNode() { return m_numNode ; }
//private:
/******************/
/* Implementation */
/******************/
/* Node-centered */
std::vector<Real_t> m_x ; /* coordinates */
std::vector<Real_t> m_y ;
std::vector<Real_t> m_z ;
std::vector<Real_t> m_xd ; /* velocities */
std::vector<Real_t> m_yd ;
std::vector<Real_t> m_zd ;
std::vector<Real_t> m_xdd ; /* accelerations */
std::vector<Real_t> m_ydd ;
std::vector<Real_t> m_zdd ;
std::vector<Real_t> m_fx ; /* forces */
std::vector<Real_t> m_fy ;
std::vector<Real_t> m_fz ;
std::vector<Real_t> m_nodalMass ; /* mass */
std::vector<Index_t> m_symmX ; /* symmetry plane nodesets */
std::vector<Index_t> m_symmY ;
std::vector<Index_t> m_symmZ ;
std::vector<Int_t> m_nodeElemCount ;
std::vector<Index_t> m_nodeElemCornerList ;
/* Element-centered */
std::vector<Index_t> m_matElemlist ; /* material indexset */
std::vector<Index_t> m_nodelist ; /* elemToNode connectivity */
std::vector<Index_t> m_lxim ; /* element connectivity across each face */
std::vector<Index_t> m_lxip ;
std::vector<Index_t> m_letam ;
std::vector<Index_t> m_letap ;
std::vector<Index_t> m_lzetam ;
std::vector<Index_t> m_lzetap ;
std::vector<Int_t> m_elemBC ; /* symmetry/free-surface flags for each elem face */
std::vector<Real_t> m_dxx ; /* principal strains -- temporary */
std::vector<Real_t> m_dyy ;
std::vector<Real_t> m_dzz ;
std::vector<Real_t> m_delv_xi ; /* velocity gradient -- temporary */
std::vector<Real_t> m_delv_eta ;
std::vector<Real_t> m_delv_zeta ;
std::vector<Real_t> m_delx_xi ; /* coordinate gradient -- temporary */
std::vector<Real_t> m_delx_eta ;
std::vector<Real_t> m_delx_zeta ;
std::vector<Real_t> m_e ; /* energy */
std::vector<Real_t> m_p ; /* pressure */
std::vector<Real_t> m_q ; /* q */
std::vector<Real_t> m_ql ; /* linear term for q */
std::vector<Real_t> m_qq ; /* quadratic term for q */
std::vector<Real_t> m_v ; /* relative volume */
std::vector<Real_t> m_volo ; /* reference volume */
std::vector<Real_t> m_vnew ; /* new relative volume -- temporary */
std::vector<Real_t> m_delv ; /* m_vnew - m_v */
std::vector<Real_t> m_vdov ; /* volume derivative over volume */
std::vector<Real_t> m_arealg ; /* characteristic length of an element */
std::vector<Real_t> m_ss ; /* "sound speed" */
std::vector<Real_t> m_elemMass ; /* mass */
/* Parameters */
Real_t m_dtfixed ; /* fixed time increment */
Real_t m_time ; /* current time */
Real_t m_deltatime ; /* variable time increment */
Real_t m_deltatimemultlb ;
Real_t m_deltatimemultub ;
Real_t m_stoptime ; /* end time for simulation */
Real_t m_u_cut ; /* velocity tolerance */
Real_t m_hgcoef ; /* hourglass control */
Real_t m_qstop ; /* excessive q indicator */
Real_t m_monoq_max_slope ;
Real_t m_monoq_limiter_mult ;
Real_t m_e_cut ; /* energy tolerance */
Real_t m_p_cut ; /* pressure tolerance */
Real_t m_ss4o3 ;
Real_t m_q_cut ; /* q tolerance */
Real_t m_v_cut ; /* relative volume tolerance */
Real_t m_qlc_monoq ; /* linear term coef for q */
Real_t m_qqc_monoq ; /* quadratic term coef for q */
Real_t m_qqc ;
Real_t m_eosvmax ;
Real_t m_eosvmin ;
Real_t m_pmin ; /* pressure floor */
Real_t m_emin ; /* energy floor */
Real_t m_dvovmax ; /* maximum allowable volume change */
Real_t m_refdens ; /* reference density */
Real_t m_dtcourant ; /* courant constraint */
Real_t m_dthydro ; /* volume change constraint */
Real_t m_dtmax ; /* maximum allowable time increment */
Int_t m_cycle ; /* iteration count for simulation */
Index_t m_sizeX ; /* X,Y,Z extent of this block */
Index_t m_sizeY ;
Index_t m_sizeZ ;
Index_t m_numElem ; /* Elements/Nodes in this domain */
Index_t m_numNode ;
} mesh ;
template <typename T>
T *Allocate(size_t size)
{
return static_cast<T *>(malloc(sizeof(T)*size)) ;
}
template <typename T>
void Release(T **ptr)
{
if (*ptr != NULL) {
free(*ptr) ;
*ptr = NULL ;
}
}
#define GPU_STALE 0
#define CPU_STALE 1
#define ALL_FRESH 2
template<typename T>
void freshenGPU(std::vector<T>&cpu,T **gpu,int& stale) {
if (stale!=GPU_STALE) return;
if (!(*gpu)) {HIP( hipMalloc(gpu,sizeof(T)*cpu.size()) );}
HIP( hipMemcpy(*gpu,&cpu[0],sizeof(T)*cpu.size(),hipMemcpyHostToDevice) );
stale=ALL_FRESH;
}
template<typename T>
void freshenCPU(std::vector<T>&cpu,T *gpu,int& stale) {
if (stale!=CPU_STALE) return;
if (!gpu) {fprintf(stderr,"freshenCPU(): NULL GPU data!\n");exit(1);}
HIP( hipMemcpy(&cpu[0],gpu,sizeof(T)*cpu.size(),hipMemcpyDeviceToHost) );
stale=ALL_FRESH;
}
// freshen helpers
#define FC(var) freshenCPU(mesh.m_ ## var , meshGPU.m_ ## var ,meshGPU.m_ ## var ## _stale ); // freshen CPU
#define FG(var) freshenGPU(mesh.m_ ## var , &meshGPU.m_ ## var ,meshGPU.m_ ## var ## _stale ); // freshen GPU
// stale helpers
#define SC(var) meshGPU.m_ ## var ## _stale = CPU_STALE; // stale CPU
#define SG(var) meshGPU.m_ ## var ## _stale = GPU_STALE; // stale GPU
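// Typical pattern (sketch): freshen the inputs on the side doing the work,
// then mark the other side's copy of each output stale, e.g. for a
// host-side update of pressure:
// FC(p); /* ... compute into mesh.m_p on the CPU ... */ SG(p);
// The next GPU consumer of p then re-copies it host-to-device via FG(p).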
struct MeshGPU {
Mesh *m_mesh;
/******************/
/* Implementation */
/******************/
/* Node-centered */
Real_t *m_x ; /* coordinates */
Real_t *m_y ;
Real_t *m_z ;
Real_t *m_xd ; /* velocities */
Real_t *m_yd ;
Real_t *m_zd ;
Real_t *m_xdd ; /* accelerations */
Real_t *m_ydd ;
Real_t *m_zdd ;
Real_t *m_fx ; /* forces */
Real_t *m_fy ;
Real_t *m_fz ;
Real_t *m_nodalMass ; /* mass */
Index_t *m_symmX ; /* symmetry plane nodesets */
Index_t *m_symmY ;
Index_t *m_symmZ ;
Int_t *m_nodeElemCount ;
Index_t *m_nodeElemCornerList ;
/* Element-centered */
Index_t * m_matElemlist ; /* material indexset */
Index_t * m_nodelist ; /* elemToNode connectivity */
Index_t * m_lxim ; /* element connectivity across each face */
Index_t * m_lxip ;
Index_t * m_letam ;
Index_t * m_letap ;
Index_t * m_lzetam ;
Index_t * m_lzetap ;
Int_t * m_elemBC ; /* symmetry/free-surface flags for each elem face */
Real_t *m_dxx ; /* principal strains -- temporary */
Real_t *m_dyy ;
Real_t *m_dzz ;
Real_t *m_delv_xi ; /* velocity gradient -- temporary */
Real_t *m_delv_eta ;
Real_t *m_delv_zeta ;
Real_t *m_delx_xi ; /* coordinate gradient -- temporary */
Real_t *m_delx_eta ;
Real_t *m_delx_zeta ;
Real_t *m_e ; /* energy */
Real_t *m_p ; /* pressure */
Real_t *m_q ; /* q */
Real_t *m_ql ; /* linear term for q */
Real_t *m_qq ; /* quadratic term for q */
Real_t *m_v ; /* relative volume */
Real_t *m_volo ; /* reference volume */
Real_t *m_vnew ; /* new relative volume -- temporary */
Real_t *m_delv ; /* m_vnew - m_v */
Real_t *m_vdov ; /* volume derivative over volume */
Real_t *m_arealg ; /* characteristic length of an element */
Real_t *m_ss ; /* "sound speed" */
Real_t *m_elemMass ; /* mass */
/* Stale flags */
int m_x_stale,m_y_stale,m_z_stale;
int m_xd_stale,m_yd_stale,m_zd_stale;
int m_xdd_stale,m_ydd_stale,m_zdd_stale;
int m_fx_stale,m_fy_stale,m_fz_stale;
int m_nodalMass_stale;
int m_symmX_stale,m_symmY_stale,m_symmZ_stale;
int m_nodeElemCount_stale,m_nodeElemCornerList_stale;
int m_matElemlist_stale,m_nodelist_stale;
int m_lxim_stale,m_lxip_stale,m_letam_stale,m_letap_stale,m_lzetam_stale,m_lzetap_stale;
int m_elemBC_stale;
int m_dxx_stale,m_dyy_stale,m_dzz_stale;
int m_delv_xi_stale,m_delv_eta_stale,m_delv_zeta_stale;
int m_delx_xi_stale,m_delx_eta_stale,m_delx_zeta_stale;
int m_e_stale;
int m_p_stale,m_q_stale,m_ql_stale,m_qq_stale;
int m_v_stale,m_volo_stale,m_vnew_stale,m_delv_stale,m_vdov_stale;
int m_arealg_stale;
int m_ss_stale;
int m_elemMass_stale;
void init(Mesh *mesh) {
m_mesh=mesh;
m_x=m_y=m_z=NULL;
m_xd=m_yd=m_zd=NULL;
m_xdd=m_ydd=m_zdd=NULL;
m_fx=m_fy=m_fz=NULL;
m_nodalMass=NULL;
m_symmX=m_symmY=m_symmZ=NULL;
m_nodeElemCount=m_nodeElemCornerList=NULL;
m_matElemlist=m_nodelist=NULL;
m_lxim=m_lxip=m_letam=m_letap=m_lzetam=m_lzetap=NULL;
m_elemBC=NULL;
m_dxx=m_dyy=m_dzz=NULL;
m_delv_xi=m_delv_eta=m_delv_zeta=NULL;
m_delx_xi=m_delx_eta=m_delx_zeta=NULL;
m_e=NULL;
m_p=m_q=m_ql=m_qq=NULL;
m_v=m_volo=m_vnew=m_delv=m_vdov=NULL;
m_arealg=NULL;
m_ss=NULL;
m_elemMass=NULL;
m_x_stale=m_y_stale=m_z_stale=
m_xd_stale=m_yd_stale=m_zd_stale=
m_xdd_stale=m_ydd_stale=m_zdd_stale=
m_fx_stale=m_fy_stale=m_fz_stale=
m_nodalMass_stale=
m_symmX_stale=m_symmY_stale=m_symmZ_stale=
m_nodeElemCount_stale=m_nodeElemCornerList_stale=
m_matElemlist_stale=m_nodelist_stale=
m_lxim_stale=m_lxip_stale=m_letam_stale=m_letap_stale=m_lzetam_stale=m_lzetap_stale=
m_elemBC_stale=
m_dxx_stale=m_dyy_stale=m_dzz_stale=
m_delv_xi_stale=m_delv_eta_stale=m_delv_zeta_stale=
m_delx_xi_stale=m_delx_eta_stale=m_delx_zeta_stale=
m_e_stale=
m_p_stale=m_q_stale=m_ql_stale=m_qq_stale=
m_v_stale=m_volo_stale=m_vnew_stale=m_delv_stale=m_vdov_stale=
m_arealg_stale=
m_ss_stale=
m_elemMass_stale=
GPU_STALE;
}
void freshenGPU() {
#define F(var) ::freshenGPU(m_mesh->m_ ## var , &m_ ## var ,m_ ## var ## _stale);
F(x); F(y); F(z);
F(xd); F(yd); F(zd);
F(xdd); F(ydd); F(zdd);
F(fx); F(fy); F(fz);
F(nodalMass);
F(symmX); F(symmY); F(symmZ);
F(nodeElemCount); F(nodeElemCornerList);
F(matElemlist); F(nodelist);
F(lxim); F(lxip); F(letam); F(letap); F(lzetam); F(lzetap);
F(elemBC);
F(dxx); F(dyy); F(dzz);
F(delv_xi); F(delv_eta); F(delv_zeta);
F(delx_xi); F(delx_eta); F(delx_zeta);
F(e);
F(p); F(q); F(ql); F(qq);
F(v); F(volo); F(vnew); F(delv); F(vdov);
F(arealg);
F(ss);
F(elemMass);
#undef F
}
void freshenCPU() {
#define F(var) ::freshenCPU(m_mesh->m_ ## var , m_ ## var ,m_ ## var ## _stale);
F(x); F(y); F(z);
F(xd); F(yd); F(zd);
F(xdd); F(ydd); F(zdd);
F(fx); F(fy); F(fz);
F(nodalMass);
F(symmX); F(symmY); F(symmZ);
F(nodeElemCount); F(nodeElemCornerList);
F(matElemlist); F(nodelist);
F(lxim); F(lxip); F(letam); F(letap); F(lzetam); F(lzetap);
F(elemBC);
F(dxx); F(dyy); F(dzz);
F(delv_xi); F(delv_eta); F(delv_zeta);
F(delx_xi); F(delx_eta); F(delx_zeta);
F(e);
F(p); F(q); F(ql); F(qq);
F(v); F(volo); F(vnew); F(delv); F(vdov);
F(arealg);
F(ss);
F(elemMass);
#undef F
}
} meshGPU;
/* Stuff needed for boundary conditions */
/* 2 BCs on each of 6 hexahedral faces (12 bits) */
#define XI_M 0x003
#define XI_M_SYMM 0x001
#define XI_M_FREE 0x002
#define XI_P 0x00c
#define XI_P_SYMM 0x004
#define XI_P_FREE 0x008
#define ETA_M 0x030
#define ETA_M_SYMM 0x010
#define ETA_M_FREE 0x020
#define ETA_P 0x0c0
#define ETA_P_SYMM 0x040
#define ETA_P_FREE 0x080
#define ZETA_M 0x300
#define ZETA_M_SYMM 0x100
#define ZETA_M_FREE 0x200
#define ZETA_P 0xc00
#define ZETA_P_SYMM 0x400
#define ZETA_P_FREE 0x800
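/* Example (illustrative): testing an element's -xi face:
 *   Int_t bc = mesh.elemBC(i) ;
 *   if (bc & XI_M_SYMM) { ... symmetry plane on the -xi face ... }
 *   else if (bc & XI_M_FREE) { ... free surface on the -xi face ... }
 *   else { ... interior face; the neighbor element is mesh.lxim(i) ... }
 */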
static inline
void TimeIncrement()
{
Real_t targetdt = mesh.stoptime() - mesh.time() ;
if ((mesh.dtfixed() <= Real_t(0.0)) && (mesh.cycle() != Int_t(0))) {
Real_t ratio ;
Real_t olddt = mesh.deltatime() ;
/* This will require a reduction in parallel */
Real_t newdt = Real_t(1.0e+20) ;
if (mesh.dtcourant() < newdt) {
newdt = mesh.dtcourant() / Real_t(2.0) ;
}
if (mesh.dthydro() < newdt) {
newdt = mesh.dthydro() * Real_t(2.0) / Real_t(3.0) ;
}
ratio = newdt / olddt ;
if (ratio >= Real_t(1.0)) {
if (ratio < mesh.deltatimemultlb()) {
newdt = olddt ;
}
else if (ratio > mesh.deltatimemultub()) {
newdt = olddt*mesh.deltatimemultub() ;
}
}
if (newdt > mesh.dtmax()) {
newdt = mesh.dtmax() ;
}
mesh.deltatime() = newdt ;
}
/* TRY TO PREVENT VERY SMALL SCALING ON THE NEXT CYCLE */
if ((targetdt > mesh.deltatime()) &&
(targetdt < (Real_t(4.0) * mesh.deltatime() / Real_t(3.0))) ) {
targetdt = Real_t(2.0) * mesh.deltatime() / Real_t(3.0) ;
}
if (targetdt < mesh.deltatime()) {
mesh.deltatime() = targetdt ;
}
mesh.time() += mesh.deltatime() ;
++mesh.cycle() ;
}
__global__
void InitStressTermsForElems_kernel(
int numElem,Real_t *sigxx, Real_t *sigyy, Real_t *sigzz, Real_t *p, Real_t *q)
{
int i = hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
if (i<numElem)
sigxx[i] = sigyy[i] = sigzz[i] = - p[i] - q[i] ;
}
static inline
void InitStressTermsForElems_gpu(Index_t numElem,
Real_t *sigxx, Real_t *sigyy, Real_t *sigzz)
{
dim3 dimBlock(BLOCKSIZE,1,1);
dim3 dimGrid(PAD_DIV(numElem,dimBlock.x),1,1);
//hipFuncSetCacheConfig(InitStressTermsForElems_kernel,hipFuncCachePreferL1); // set as default for all kernels after this one
hipLaunchKernelGGL((InitStressTermsForElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, numElem,sigxx,sigyy,sigzz,meshGPU.m_p,meshGPU.m_q);
HIP_DEBUGSYNC;
}
static inline
void InitStressTermsForElems_cpu(Index_t numElem,
Real_t *sigxx, Real_t *sigyy, Real_t *sigzz)
{
//
// pull in the stresses appropriate to the hydro integration
//
for (Index_t i = 0 ; i < numElem ; ++i){
sigxx[i] = sigyy[i] = sigzz[i] = - mesh.p(i) - mesh.q(i) ;
}
}
static inline
void InitStressTermsForElems(Index_t numElem,
Real_t *sigxx, Real_t *sigyy, Real_t *sigzz,
int useCPU)
{
if (useCPU) {
FC(p); FC(q);
InitStressTermsForElems_cpu(numElem,sigxx,sigyy,sigzz);
}
else {
FG(p); FG(q);
InitStressTermsForElems_gpu(numElem,sigxx,sigyy,sigzz);
}
}
__host__ __device__
static inline
void CalcElemShapeFunctionDerivatives( const Real_t* const x,
const Real_t* const y,
const Real_t* const z,
Real_t b[][8],
Real_t* const volume )
{
const Real_t x0 = x[0] ; const Real_t x1 = x[1] ;
const Real_t x2 = x[2] ; const Real_t x3 = x[3] ;
const Real_t x4 = x[4] ; const Real_t x5 = x[5] ;
const Real_t x6 = x[6] ; const Real_t x7 = x[7] ;
const Real_t y0 = y[0] ; const Real_t y1 = y[1] ;
const Real_t y2 = y[2] ; const Real_t y3 = y[3] ;
const Real_t y4 = y[4] ; const Real_t y5 = y[5] ;
const Real_t y6 = y[6] ; const Real_t y7 = y[7] ;
const Real_t z0 = z[0] ; const Real_t z1 = z[1] ;
const Real_t z2 = z[2] ; const Real_t z3 = z[3] ;
const Real_t z4 = z[4] ; const Real_t z5 = z[5] ;
const Real_t z6 = z[6] ; const Real_t z7 = z[7] ;
Real_t fjxxi, fjxet, fjxze;
Real_t fjyxi, fjyet, fjyze;
Real_t fjzxi, fjzet, fjzze;
Real_t cjxxi, cjxet, cjxze;
Real_t cjyxi, cjyet, cjyze;
Real_t cjzxi, cjzet, cjzze;
fjxxi = Real_t(.125) * ( (x6-x0) + (x5-x3) - (x7-x1) - (x4-x2) );
fjxet = Real_t(.125) * ( (x6-x0) - (x5-x3) + (x7-x1) - (x4-x2) );
fjxze = Real_t(.125) * ( (x6-x0) + (x5-x3) + (x7-x1) + (x4-x2) );
fjyxi = Real_t(.125) * ( (y6-y0) + (y5-y3) - (y7-y1) - (y4-y2) );
fjyet = Real_t(.125) * ( (y6-y0) - (y5-y3) + (y7-y1) - (y4-y2) );
fjyze = Real_t(.125) * ( (y6-y0) + (y5-y3) + (y7-y1) + (y4-y2) );
fjzxi = Real_t(.125) * ( (z6-z0) + (z5-z3) - (z7-z1) - (z4-z2) );
fjzet = Real_t(.125) * ( (z6-z0) - (z5-z3) + (z7-z1) - (z4-z2) );
fjzze = Real_t(.125) * ( (z6-z0) + (z5-z3) + (z7-z1) + (z4-z2) );
/* compute cofactors */
cjxxi = (fjyet * fjzze) - (fjzet * fjyze);
cjxet = - (fjyxi * fjzze) + (fjzxi * fjyze);
cjxze = (fjyxi * fjzet) - (fjzxi * fjyet);
cjyxi = - (fjxet * fjzze) + (fjzet * fjxze);
cjyet = (fjxxi * fjzze) - (fjzxi * fjxze);
cjyze = - (fjxxi * fjzet) + (fjzxi * fjxet);
cjzxi = (fjxet * fjyze) - (fjyet * fjxze);
cjzet = - (fjxxi * fjyze) + (fjyxi * fjxze);
cjzze = (fjxxi * fjyet) - (fjyxi * fjxet);
/* calculate partials :
this need only be done for l = 0,1,2,3 since , by symmetry ,
(6,7,4,5) = - (0,1,2,3) .
*/
b[0][0] = - cjxxi - cjxet - cjxze;
b[0][1] = cjxxi - cjxet - cjxze;
b[0][2] = cjxxi + cjxet - cjxze;
b[0][3] = - cjxxi + cjxet - cjxze;
b[0][4] = -b[0][2];
b[0][5] = -b[0][3];
b[0][6] = -b[0][0];
b[0][7] = -b[0][1];
b[1][0] = - cjyxi - cjyet - cjyze;
b[1][1] = cjyxi - cjyet - cjyze;
b[1][2] = cjyxi + cjyet - cjyze;
b[1][3] = - cjyxi + cjyet - cjyze;
b[1][4] = -b[1][2];
b[1][5] = -b[1][3];
b[1][6] = -b[1][0];
b[1][7] = -b[1][1];
b[2][0] = - cjzxi - cjzet - cjzze;
b[2][1] = cjzxi - cjzet - cjzze;
b[2][2] = cjzxi + cjzet - cjzze;
b[2][3] = - cjzxi + cjzet - cjzze;
b[2][4] = -b[2][2];
b[2][5] = -b[2][3];
b[2][6] = -b[2][0];
b[2][7] = -b[2][1];
/* calculate jacobian determinant (volume): expand the determinant along
   the 'et' row of the Jacobian; the factor 8 is the volume of the
   biunit reference cube */
*volume = Real_t(8.) * ( fjxet * cjxet + fjyet * cjyet + fjzet * cjzet);
}
__host__ __device__
static inline
void SumElemFaceNormal(Real_t *normalX0, Real_t *normalY0, Real_t *normalZ0,
Real_t *normalX1, Real_t *normalY1, Real_t *normalZ1,
Real_t *normalX2, Real_t *normalY2, Real_t *normalZ2,
Real_t *normalX3, Real_t *normalY3, Real_t *normalZ3,
const Real_t x0, const Real_t y0, const Real_t z0,
const Real_t x1, const Real_t y1, const Real_t z1,
const Real_t x2, const Real_t y2, const Real_t z2,
const Real_t x3, const Real_t y3, const Real_t z3)
{
Real_t bisectX0 = Real_t(0.5) * (x3 + x2 - x1 - x0);
Real_t bisectY0 = Real_t(0.5) * (y3 + y2 - y1 - y0);
Real_t bisectZ0 = Real_t(0.5) * (z3 + z2 - z1 - z0);
Real_t bisectX1 = Real_t(0.5) * (x2 + x1 - x3 - x0);
Real_t bisectY1 = Real_t(0.5) * (y2 + y1 - y3 - y0);
Real_t bisectZ1 = Real_t(0.5) * (z2 + z1 - z3 - z0);
Real_t areaX = Real_t(0.25) * (bisectY0 * bisectZ1 - bisectZ0 * bisectY1);
Real_t areaY = Real_t(0.25) * (bisectZ0 * bisectX1 - bisectX0 * bisectZ1);
Real_t areaZ = Real_t(0.25) * (bisectX0 * bisectY1 - bisectY0 * bisectX1);
*normalX0 += areaX;
*normalX1 += areaX;
*normalX2 += areaX;
*normalX3 += areaX;
*normalY0 += areaY;
*normalY1 += areaY;
*normalY2 += areaY;
*normalY3 += areaY;
*normalZ0 += areaZ;
*normalZ1 += areaZ;
*normalZ2 += areaZ;
*normalZ3 += areaZ;
}
__host__ __device__
static inline
void CalcElemNodeNormals(Real_t pfx[8],
Real_t pfy[8],
Real_t pfz[8],
const Real_t x[8],
const Real_t y[8],
const Real_t z[8])
{
for (Index_t i = 0 ; i < 8 ; ++i) {
pfx[i] = Real_t(0.0);
pfy[i] = Real_t(0.0);
pfz[i] = Real_t(0.0);
}
/* evaluate face one: nodes 0, 1, 2, 3 */
SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],
&pfx[1], &pfy[1], &pfz[1],
&pfx[2], &pfy[2], &pfz[2],
&pfx[3], &pfy[3], &pfz[3],
x[0], y[0], z[0], x[1], y[1], z[1],
x[2], y[2], z[2], x[3], y[3], z[3]);
/* evaluate face two: nodes 0, 4, 5, 1 */
SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],
&pfx[4], &pfy[4], &pfz[4],
&pfx[5], &pfy[5], &pfz[5],
&pfx[1], &pfy[1], &pfz[1],
x[0], y[0], z[0], x[4], y[4], z[4],
x[5], y[5], z[5], x[1], y[1], z[1]);
/* evaluate face three: nodes 1, 5, 6, 2 */
SumElemFaceNormal(&pfx[1], &pfy[1], &pfz[1],
&pfx[5], &pfy[5], &pfz[5],
&pfx[6], &pfy[6], &pfz[6],
&pfx[2], &pfy[2], &pfz[2],
x[1], y[1], z[1], x[5], y[5], z[5],
x[6], y[6], z[6], x[2], y[2], z[2]);
/* evaluate face four: nodes 2, 6, 7, 3 */
SumElemFaceNormal(&pfx[2], &pfy[2], &pfz[2],
&pfx[6], &pfy[6], &pfz[6],
&pfx[7], &pfy[7], &pfz[7],
&pfx[3], &pfy[3], &pfz[3],
x[2], y[2], z[2], x[6], y[6], z[6],
x[7], y[7], z[7], x[3], y[3], z[3]);
/* evaluate face five: nodes 3, 7, 4, 0 */
SumElemFaceNormal(&pfx[3], &pfy[3], &pfz[3],
&pfx[7], &pfy[7], &pfz[7],
&pfx[4], &pfy[4], &pfz[4],
&pfx[0], &pfy[0], &pfz[0],
x[3], y[3], z[3], x[7], y[7], z[7],
x[4], y[4], z[4], x[0], y[0], z[0]);
/* evaluate face six: nodes 4, 7, 6, 5 */
SumElemFaceNormal(&pfx[4], &pfy[4], &pfz[4],
&pfx[7], &pfy[7], &pfz[7],
&pfx[6], &pfy[6], &pfz[6],
&pfx[5], &pfy[5], &pfz[5],
x[4], y[4], z[4], x[7], y[7], z[7],
x[6], y[6], z[6], x[5], y[5], z[5]);
}
__host__ __device__
static inline
void SumElemStressesToNodeForces( const Real_t B[][8],
const Real_t stress_xx,
const Real_t stress_yy,
const Real_t stress_zz,
Real_t* const fx,
Real_t* const fy,
Real_t* const fz,
int stride)
{
Real_t pfx0 = B[0][0] ; Real_t pfx1 = B[0][1] ;
Real_t pfx2 = B[0][2] ; Real_t pfx3 = B[0][3] ;
Real_t pfx4 = B[0][4] ; Real_t pfx5 = B[0][5] ;
Real_t pfx6 = B[0][6] ; Real_t pfx7 = B[0][7] ;
Real_t pfy0 = B[1][0] ; Real_t pfy1 = B[1][1] ;
Real_t pfy2 = B[1][2] ; Real_t pfy3 = B[1][3] ;
Real_t pfy4 = B[1][4] ; Real_t pfy5 = B[1][5] ;
Real_t pfy6 = B[1][6] ; Real_t pfy7 = B[1][7] ;
Real_t pfz0 = B[2][0] ; Real_t pfz1 = B[2][1] ;
Real_t pfz2 = B[2][2] ; Real_t pfz3 = B[2][3] ;
Real_t pfz4 = B[2][4] ; Real_t pfz5 = B[2][5] ;
Real_t pfz6 = B[2][6] ; Real_t pfz7 = B[2][7] ;
fx[0*stride] = -( stress_xx * pfx0 );
fx[1*stride] = -( stress_xx * pfx1 );
fx[2*stride] = -( stress_xx * pfx2 );
fx[3*stride] = -( stress_xx * pfx3 );
fx[4*stride] = -( stress_xx * pfx4 );
fx[5*stride] = -( stress_xx * pfx5 );
fx[6*stride] = -( stress_xx * pfx6 );
fx[7*stride] = -( stress_xx * pfx7 );
fy[0*stride] = -( stress_yy * pfy0 );
fy[1*stride] = -( stress_yy * pfy1 );
fy[2*stride] = -( stress_yy * pfy2 );
fy[3*stride] = -( stress_yy * pfy3 );
fy[4*stride] = -( stress_yy * pfy4 );
fy[5*stride] = -( stress_yy * pfy5 );
fy[6*stride] = -( stress_yy * pfy6 );
fy[7*stride] = -( stress_yy * pfy7 );
fz[0*stride] = -( stress_zz * pfz0 );
fz[1*stride] = -( stress_zz * pfz1 );
fz[2*stride] = -( stress_zz * pfz2 );
fz[3*stride] = -( stress_zz * pfz3 );
fz[4*stride] = -( stress_zz * pfz4 );
fz[5*stride] = -( stress_zz * pfz5 );
fz[6*stride] = -( stress_zz * pfz6 );
fz[7*stride] = -( stress_zz * pfz7 );
}
__global__
void IntegrateStressForElems_kernel( Index_t numElem, Index_t *nodelist,
Real_t *x, Real_t *y, Real_t *z,
Real_t *fx_elem, Real_t *fy_elem, Real_t *fz_elem,
Real_t *sigxx, Real_t *sigyy, Real_t *sigzz,
Real_t *determ)
{
Real_t B[3][8] ;// shape function derivatives
Real_t x_local[8] ;
Real_t y_local[8] ;
Real_t z_local[8] ;
int k=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
if (k<numElem) {
// get nodal coordinates from global arrays and copy into local arrays.
for( Index_t lnode=0 ; lnode<8 ; ++lnode )
{
Index_t gnode = nodelist[k+lnode*numElem];
x_local[lnode] = x[gnode];
y_local[lnode] = y[gnode];
z_local[lnode] = z[gnode];
}
/* Volume calculation involves extra work for numerical consistency. */
CalcElemShapeFunctionDerivatives(x_local, y_local, z_local,
B, &determ[k]);
CalcElemNodeNormals( B[0] , B[1], B[2],
x_local, y_local, z_local );
SumElemStressesToNodeForces( B, sigxx[k], sigyy[k], sigzz[k],
&fx_elem[k], &fy_elem[k], &fz_elem[k], numElem ) ;
}
}
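/* The kernel above writes per-corner forces into fx_elem/fy_elem/fz_elem
 * (size numElem*8, corner-major with stride numElem) instead of scattering
 * into the nodal arrays. The gather kernels below then sum each node's
 * contributions via nodeElemCornerList; this two-pass scheme avoids
 * atomics or write conflicts when multiple elements share a node. */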
__global__
void AddNodeForcesFromElems_kernel( Index_t numNode,
Int_t *nodeElemCount, Index_t *nodeElemCornerList,
Real_t *fx_elem, Real_t *fy_elem, Real_t *fz_elem,
Real_t *fx_node, Real_t *fy_node, Real_t *fz_node)
{
int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
if (i<numNode) {
Int_t count=nodeElemCount[i];
Real_t fx,fy,fz;
fx=fy=fz=Real_t(0.0);
for (int j=0;j<count;j++) {
Index_t elem=nodeElemCornerList[i+numNode*j];
fx+=fx_elem[elem]; fy+=fy_elem[elem]; fz+=fz_elem[elem];
}
fx_node[i]=fx; fy_node[i]=fy; fz_node[i]=fz;
}
}
__global__
void AddNodeForcesFromElems2_kernel( Index_t numNode,
Int_t *nodeElemCount, Index_t *nodeElemCornerList,
Real_t *fx_elem, Real_t *fy_elem, Real_t *fz_elem,
Real_t *fx_node, Real_t *fy_node, Real_t *fz_node)
{
int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
if (i<numNode) {
Int_t count=nodeElemCount[i];
Real_t fx,fy,fz;
fx=fy=fz=Real_t(0.0);
for (int j=0;j<count;j++) {
Index_t elem=nodeElemCornerList[i+numNode*j];
fx+=fx_elem[elem]; fy+=fy_elem[elem]; fz+=fz_elem[elem];
}
fx_node[i]+=fx; fy_node[i]+=fy; fz_node[i]+=fz;
}
}
static inline
void IntegrateStressForElems_gpu( Index_t numElem,
Real_t *sigxx, Real_t *sigyy, Real_t *sigzz,
Real_t *determ, int& badvol)
{
Real_t *fx_elem,*fy_elem,*fz_elem;
HIP( hipMalloc(&fx_elem,numElem*8*sizeof(Real_t)) );
HIP( hipMalloc(&fy_elem,numElem*8*sizeof(Real_t)) );
HIP( hipMalloc(&fz_elem,numElem*8*sizeof(Real_t)) );
dim3 dimBlock=dim3(BLOCKSIZE,1,1);
dim3 dimGrid=dim3(PAD_DIV(numElem,dimBlock.x),1,1);
hipLaunchKernelGGL((IntegrateStressForElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, numElem, meshGPU.m_nodelist, meshGPU.m_x, meshGPU.m_y, meshGPU.m_z,
fx_elem, fy_elem, fz_elem, sigxx, sigyy, sigzz, determ);
HIP_DEBUGSYNC;
dimGrid=dim3(PAD_DIV(mesh.numNode(),dimBlock.x),1,1);
hipLaunchKernelGGL((AddNodeForcesFromElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, mesh.numNode(),meshGPU.m_nodeElemCount,meshGPU.m_nodeElemCornerList,
fx_elem,fy_elem,fz_elem,meshGPU.m_fx,meshGPU.m_fy,meshGPU.m_fz);
HIP_DEBUGSYNC;
HIP( hipFree(fx_elem) );
HIP( hipFree(fy_elem) );
HIP( hipFree(fz_elem) );
// JDC -- need a reduction step to check for non-positive element volumes
badvol=0;
}
static inline
void IntegrateStressForElems_cpu( Index_t numElem,
Real_t *sigxx, Real_t *sigyy, Real_t *sigzz,
Real_t *determ, int& badvol)
{
Real_t B[3][8] ;// shape function derivatives
Real_t x_local[8] ;
Real_t y_local[8] ;
Real_t z_local[8] ;
Real_t fx_local[8] ;
Real_t fy_local[8] ;
Real_t fz_local[8] ;
// loop over all elements
for( Index_t k=0 ; k<numElem ; ++k )
{
// get nodal coordinates from global arrays and copy into local arrays.
for( Index_t lnode=0 ; lnode<8 ; ++lnode )
{
Index_t gnode = mesh.nodelist(k,lnode);
x_local[lnode] = mesh.x(gnode);
y_local[lnode] = mesh.y(gnode);
z_local[lnode] = mesh.z(gnode);
}
/* Volume calculation involves extra work for numerical consistency. */
CalcElemShapeFunctionDerivatives(x_local, y_local, z_local,
B, &determ[k]);
CalcElemNodeNormals( B[0] , B[1], B[2],
x_local, y_local, z_local );
SumElemStressesToNodeForces( B, sigxx[k], sigyy[k], sigzz[k],
fx_local, fy_local, fz_local, 1 ) ;
// copy nodal force contributions to global force array.
for( Index_t lnode=0 ; lnode<8 ; ++lnode )
{
Index_t gnode = mesh.nodelist(k,lnode);
mesh.fx(gnode) += fx_local[lnode];
mesh.fy(gnode) += fy_local[lnode];
mesh.fz(gnode) += fz_local[lnode];
}
}
badvol=0;
for ( Index_t k=0 ; k<numElem ; ++k ) {
if (determ[k] <= Real_t(0.0)) {
badvol=1;
}
}
}
static inline
void IntegrateStressForElems( Index_t numElem,
Real_t *sigxx, Real_t *sigyy, Real_t *sigzz,
Real_t *determ, int& badvol, int useCPU)
{
if (useCPU) {
FC(nodelist); FC(x); FC(y); FC(z);
IntegrateStressForElems_cpu(numElem,sigxx,sigyy,sigzz,determ,badvol);
SG(fx); SG(fy); SG(fz);
}
else {
FG(nodelist); FG(nodeElemCount); FG(nodeElemCornerList);
FG(x); FG(y); FG(z);
IntegrateStressForElems_gpu(numElem,sigxx,sigyy,sigzz,determ,badvol);
SC(fx); SC(fy); SC(fz);
}
}
static inline
void CollectDomainNodesToElemNodes(const Index_t elemNum,
Real_t elemX[8],
Real_t elemY[8],
Real_t elemZ[8])
{
Index_t nd0i = mesh.nodelist(elemNum,0) ;
Index_t nd1i = mesh.nodelist(elemNum,1) ;
Index_t nd2i = mesh.nodelist(elemNum,2) ;
Index_t nd3i = mesh.nodelist(elemNum,3) ;
Index_t nd4i = mesh.nodelist(elemNum,4) ;
Index_t nd5i = mesh.nodelist(elemNum,5) ;
Index_t nd6i = mesh.nodelist(elemNum,6) ;
Index_t nd7i = mesh.nodelist(elemNum,7) ;
elemX[0] = mesh.x(nd0i);
elemX[1] = mesh.x(nd1i);
elemX[2] = mesh.x(nd2i);
elemX[3] = mesh.x(nd3i);
elemX[4] = mesh.x(nd4i);
elemX[5] = mesh.x(nd5i);
elemX[6] = mesh.x(nd6i);
elemX[7] = mesh.x(nd7i);
elemY[0] = mesh.y(nd0i);
elemY[1] = mesh.y(nd1i);
elemY[2] = mesh.y(nd2i);
elemY[3] = mesh.y(nd3i);
elemY[4] = mesh.y(nd4i);
elemY[5] = mesh.y(nd5i);
elemY[6] = mesh.y(nd6i);
elemY[7] = mesh.y(nd7i);
elemZ[0] = mesh.z(nd0i);
elemZ[1] = mesh.z(nd1i);
elemZ[2] = mesh.z(nd2i);
elemZ[3] = mesh.z(nd3i);
elemZ[4] = mesh.z(nd4i);
elemZ[5] = mesh.z(nd5i);
elemZ[6] = mesh.z(nd6i);
elemZ[7] = mesh.z(nd7i);
}
__host__
static inline
void VoluDer(const Real_t x0, const Real_t x1, const Real_t x2,
const Real_t x3, const Real_t x4, const Real_t x5,
const Real_t y0, const Real_t y1, const Real_t y2,
const Real_t y3, const Real_t y4, const Real_t y5,
const Real_t z0, const Real_t z1, const Real_t z2,
const Real_t z3, const Real_t z4, const Real_t z5,
Real_t* dvdx, Real_t* dvdy, Real_t* dvdz)
{
const Real_t twelfth = Real_t(1.0) / Real_t(12.0) ;
*dvdx =
(y1 + y2) * (z0 + z1) - (y0 + y1) * (z1 + z2) +
(y0 + y4) * (z3 + z4) - (y3 + y4) * (z0 + z4) -
(y2 + y5) * (z3 + z5) + (y3 + y5) * (z2 + z5);
*dvdy =
- (x1 + x2) * (z0 + z1) + (x0 + x1) * (z1 + z2) -
(x0 + x4) * (z3 + z4) + (x3 + x4) * (z0 + z4) +
(x2 + x5) * (z3 + z5) - (x3 + x5) * (z2 + z5);
*dvdz =
- (y1 + y2) * (x0 + x1) + (y0 + y1) * (x1 + x2) -
(y0 + y4) * (x3 + x4) + (y3 + y4) * (x0 + x4) +
(y2 + y5) * (x3 + x5) - (y3 + y5) * (x2 + x5);
*dvdx *= twelfth;
*dvdy *= twelfth;
*dvdz *= twelfth;
}
#if 0
__device__
static inline
void VOLUDER(const Real_t a0, const Real_t a1, const Real_t a2,
const Real_t a3, const Real_t a4, const Real_t a5,
const Real_t b0, const Real_t b1, const Real_t b2,
const Real_t b3, const Real_t b4, const Real_t b5,
Real_t& dvdc)
{
const Real_t twelfth = Real_t(1.0) / Real_t(12.0) ;
dvdc=
(a1 + a2) * (b0 + b1) - (a0 + a1) * (b1 + b2) +
(a0 + a4) * (b3 + b4) - (a3 + a4) * (b0 + b4) -
(a2 + a5) * (b3 + b5) + (a3 + a5) * (b2 + b5);
dvdc *= twelfth;
}
#else
// Even though the above version is inlined, it seems to prohibit some kind of compiler optimization.
// This macro version uses many fewer registers and avoids spill-over into local memory.
#define VOLUDER(a0,a1,a2,a3,a4,a5,b0,b1,b2,b3,b4,b5,dvdc)\
{\
const Real_t twelfth = Real_t(1.0) / Real_t(12.0) ;\
\
dvdc= \
((a1) + (a2)) * ((b0) + (b1)) - ((a0) + (a1)) * ((b1) + (b2)) +\
((a0) + (a4)) * ((b3) + (b4)) - ((a3) + (a4)) * ((b0) + (b4)) -\
((a2) + (a5)) * ((b3) + (b5)) + ((a3) + (a5)) * ((b2) + (b5));\
dvdc *= twelfth;\
}
#endif
__host__
static inline
void CalcElemVolumeDerivative(Real_t dvdx[8],
Real_t dvdy[8],
Real_t dvdz[8],
const Real_t x[8],
const Real_t y[8],
const Real_t z[8])
{
VoluDer(x[1], x[2], x[3], x[4], x[5], x[7],
y[1], y[2], y[3], y[4], y[5], y[7],
z[1], z[2], z[3], z[4], z[5], z[7],
&dvdx[0], &dvdy[0], &dvdz[0]);
VoluDer(x[0], x[1], x[2], x[7], x[4], x[6],
y[0], y[1], y[2], y[7], y[4], y[6],
z[0], z[1], z[2], z[7], z[4], z[6],
&dvdx[3], &dvdy[3], &dvdz[3]);
VoluDer(x[3], x[0], x[1], x[6], x[7], x[5],
y[3], y[0], y[1], y[6], y[7], y[5],
z[3], z[0], z[1], z[6], z[7], z[5],
&dvdx[2], &dvdy[2], &dvdz[2]);
VoluDer(x[2], x[3], x[0], x[5], x[6], x[4],
y[2], y[3], y[0], y[5], y[6], y[4],
z[2], z[3], z[0], z[5], z[6], z[4],
&dvdx[1], &dvdy[1], &dvdz[1]);
VoluDer(x[7], x[6], x[5], x[0], x[3], x[1],
y[7], y[6], y[5], y[0], y[3], y[1],
z[7], z[6], z[5], z[0], z[3], z[1],
&dvdx[4], &dvdy[4], &dvdz[4]);
VoluDer(x[4], x[7], x[6], x[1], x[0], x[2],
y[4], y[7], y[6], y[1], y[0], y[2],
z[4], z[7], z[6], z[1], z[0], z[2],
&dvdx[5], &dvdy[5], &dvdz[5]);
VoluDer(x[5], x[4], x[7], x[2], x[1], x[3],
y[5], y[4], y[7], y[2], y[1], y[3],
z[5], z[4], z[7], z[2], z[1], z[3],
&dvdx[6], &dvdy[6], &dvdz[6]);
VoluDer(x[6], x[5], x[4], x[3], x[2], x[0],
y[6], y[5], y[4], y[3], y[2], y[0],
z[6], z[5], z[4], z[3], z[2], z[0],
&dvdx[7], &dvdy[7], &dvdz[7]);
}
__device__
static inline
void CalcElemVolumeDerivative(Real_t& dvdx,
Real_t& dvdy,
Real_t& dvdz,
const Real_t x,
const Real_t y,
const Real_t z,
unsigned int node)
{
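/* Assumed thread mapping (per this file's 256-thread blocks):
 * hipThreadIdx_x == node*32 + elem, i.e. 32 elements per block with one
 * thread per (element, node) pair. The shared arrays stage one coordinate
 * per thread so each thread can read the six other-node coordinates
 * (ind0..ind5) of its own element at stride 32. */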
__shared__ Real_t array1[256],array2[256];
volatile Real_t *va1;
volatile Real_t *va2;
unsigned int idx,elem;
unsigned int ind0,ind1,ind2,ind3,ind4,ind5;
switch(node) {
case 0:
{ind0=1; ind1=2; ind2=3; ind3=4; ind4=5; ind5=7;
break;}
case 1:
{ind0=2; ind1=3; ind2=0; ind3=5; ind4=6; ind5=4;
break;}
case 2:
{ind0=3; ind1=0; ind2=1; ind3=6; ind4=7; ind5=5;
break;}
case 3:
{ind0=0; ind1=1; ind2=2; ind3=7; ind4=4; ind5=6;
break;}
case 4:
{ind0=7; ind1=6; ind2=5; ind3=0; ind4=3; ind5=1;
break;}
case 5:
{ind0=4; ind1=7; ind2=6; ind3=1; ind4=0; ind5=2;
break;}
case 6:
{ind0=5; ind1=4; ind2=7; ind3=2; ind4=1; ind5=3;
break;}
case 7:
{ind0=6; ind1=5; ind2=4; ind3=3; ind4=2; ind5=0;
break;}
default:
{ind0=ind1=ind2=ind3=ind4=ind5=0xFFFFFFFF;
break;}
}
idx=hipThreadIdx_x;
elem=idx /*& 0x1F*/ - node*32;
va1=&array1[0];
va2=&array2[0];
// load y and z
__syncthreads();
va1[idx]=y; va2[idx]=z;
__syncthreads();
VOLUDER(va1[ind0*32+elem],va1[ind1*32+elem],va1[ind2*32+elem],
va1[ind3*32+elem],va1[ind4*32+elem],va1[ind5*32+elem],
va2[ind0*32+elem],va2[ind1*32+elem],va2[ind2*32+elem],
va2[ind3*32+elem],va2[ind4*32+elem],va2[ind5*32+elem],
dvdx);
// load x
__syncthreads();
va1[idx]=x;
__syncthreads();
VOLUDER(va2[ind0*32+elem],va2[ind1*32+elem],va2[ind2*32+elem],
va2[ind3*32+elem],va2[ind4*32+elem],va2[ind5*32+elem],
va1[ind0*32+elem],va1[ind1*32+elem],va1[ind2*32+elem],
va1[ind3*32+elem],va1[ind4*32+elem],va1[ind5*32+elem],
dvdy);
__syncthreads();
// load y
__syncthreads();
va2[idx]=y;
__syncthreads();
VOLUDER(va1[ind0*32+elem],va1[ind1*32+elem],va1[ind2*32+elem],
va1[ind3*32+elem],va1[ind4*32+elem],va1[ind5*32+elem],
va2[ind0*32+elem],va2[ind1*32+elem],va2[ind2*32+elem],
va2[ind3*32+elem],va2[ind4*32+elem],va2[ind5*32+elem],
dvdz);
__syncthreads();
}
__host__
static inline
void CalcElemFBHourglassForce(Real_t *xd, Real_t *yd, Real_t *zd, Real_t *hourgam0,
Real_t *hourgam1, Real_t *hourgam2, Real_t *hourgam3,
Real_t *hourgam4, Real_t *hourgam5, Real_t *hourgam6,
Real_t *hourgam7, Real_t coefficient,
Real_t *hgfx, Real_t *hgfy, Real_t *hgfz )
{
Index_t i00=0;
Index_t i01=1;
Index_t i02=2;
Index_t i03=3;
Real_t h00 =
hourgam0[i00] * xd[0] + hourgam1[i00] * xd[1] +
hourgam2[i00] * xd[2] + hourgam3[i00] * xd[3] +
hourgam4[i00] * xd[4] + hourgam5[i00] * xd[5] +
hourgam6[i00] * xd[6] + hourgam7[i00] * xd[7];
Real_t h01 =
hourgam0[i01] * xd[0] + hourgam1[i01] * xd[1] +
hourgam2[i01] * xd[2] + hourgam3[i01] * xd[3] +
hourgam4[i01] * xd[4] + hourgam5[i01] * xd[5] +
hourgam6[i01] * xd[6] + hourgam7[i01] * xd[7];
Real_t h02 =
hourgam0[i02] * xd[0] + hourgam1[i02] * xd[1]+
hourgam2[i02] * xd[2] + hourgam3[i02] * xd[3]+
hourgam4[i02] * xd[4] + hourgam5[i02] * xd[5]+
hourgam6[i02] * xd[6] + hourgam7[i02] * xd[7];
Real_t h03 =
hourgam0[i03] * xd[0] + hourgam1[i03] * xd[1] +
hourgam2[i03] * xd[2] + hourgam3[i03] * xd[3] +
hourgam4[i03] * xd[4] + hourgam5[i03] * xd[5] +
hourgam6[i03] * xd[6] + hourgam7[i03] * xd[7];
hgfx[0] = coefficient *
(hourgam0[i00] * h00 + hourgam0[i01] * h01 +
hourgam0[i02] * h02 + hourgam0[i03] * h03);
hgfx[1] = coefficient *
(hourgam1[i00] * h00 + hourgam1[i01] * h01 +
hourgam1[i02] * h02 + hourgam1[i03] * h03);
hgfx[2] = coefficient *
(hourgam2[i00] * h00 + hourgam2[i01] * h01 +
hourgam2[i02] * h02 + hourgam2[i03] * h03);
hgfx[3] = coefficient *
(hourgam3[i00] * h00 + hourgam3[i01] * h01 +
hourgam3[i02] * h02 + hourgam3[i03] * h03);
hgfx[4] = coefficient *
(hourgam4[i00] * h00 + hourgam4[i01] * h01 +
hourgam4[i02] * h02 + hourgam4[i03] * h03);
hgfx[5] = coefficient *
(hourgam5[i00] * h00 + hourgam5[i01] * h01 +
hourgam5[i02] * h02 + hourgam5[i03] * h03);
hgfx[6] = coefficient *
(hourgam6[i00] * h00 + hourgam6[i01] * h01 +
hourgam6[i02] * h02 + hourgam6[i03] * h03);
hgfx[7] = coefficient *
(hourgam7[i00] * h00 + hourgam7[i01] * h01 +
hourgam7[i02] * h02 + hourgam7[i03] * h03);
h00 =
hourgam0[i00] * yd[0] + hourgam1[i00] * yd[1] +
hourgam2[i00] * yd[2] + hourgam3[i00] * yd[3] +
hourgam4[i00] * yd[4] + hourgam5[i00] * yd[5] +
hourgam6[i00] * yd[6] + hourgam7[i00] * yd[7];
h01 =
hourgam0[i01] * yd[0] + hourgam1[i01] * yd[1] +
hourgam2[i01] * yd[2] + hourgam3[i01] * yd[3] +
hourgam4[i01] * yd[4] + hourgam5[i01] * yd[5] +
hourgam6[i01] * yd[6] + hourgam7[i01] * yd[7];
h02 =
hourgam0[i02] * yd[0] + hourgam1[i02] * yd[1]+
hourgam2[i02] * yd[2] + hourgam3[i02] * yd[3]+
hourgam4[i02] * yd[4] + hourgam5[i02] * yd[5]+
hourgam6[i02] * yd[6] + hourgam7[i02] * yd[7];
h03 =
hourgam0[i03] * yd[0] + hourgam1[i03] * yd[1] +
hourgam2[i03] * yd[2] + hourgam3[i03] * yd[3] +
hourgam4[i03] * yd[4] + hourgam5[i03] * yd[5] +
hourgam6[i03] * yd[6] + hourgam7[i03] * yd[7];
hgfy[0] = coefficient *
(hourgam0[i00] * h00 + hourgam0[i01] * h01 +
hourgam0[i02] * h02 + hourgam0[i03] * h03);
hgfy[1] = coefficient *
(hourgam1[i00] * h00 + hourgam1[i01] * h01 +
hourgam1[i02] * h02 + hourgam1[i03] * h03);
hgfy[2] = coefficient *
(hourgam2[i00] * h00 + hourgam2[i01] * h01 +
hourgam2[i02] * h02 + hourgam2[i03] * h03);
hgfy[3] = coefficient *
(hourgam3[i00] * h00 + hourgam3[i01] * h01 +
hourgam3[i02] * h02 + hourgam3[i03] * h03);
hgfy[4] = coefficient *
(hourgam4[i00] * h00 + hourgam4[i01] * h01 +
hourgam4[i02] * h02 + hourgam4[i03] * h03);
hgfy[5] = coefficient *
(hourgam5[i00] * h00 + hourgam5[i01] * h01 +
hourgam5[i02] * h02 + hourgam5[i03] * h03);
hgfy[6] = coefficient *
(hourgam6[i00] * h00 + hourgam6[i01] * h01 +
hourgam6[i02] * h02 + hourgam6[i03] * h03);
hgfy[7] = coefficient *
(hourgam7[i00] * h00 + hourgam7[i01] * h01 +
hourgam7[i02] * h02 + hourgam7[i03] * h03);
h00 =
hourgam0[i00] * zd[0] + hourgam1[i00] * zd[1] +
hourgam2[i00] * zd[2] + hourgam3[i00] * zd[3] +
hourgam4[i00] * zd[4] + hourgam5[i00] * zd[5] +
hourgam6[i00] * zd[6] + hourgam7[i00] * zd[7];
h01 =
hourgam0[i01] * zd[0] + hourgam1[i01] * zd[1] +
hourgam2[i01] * zd[2] + hourgam3[i01] * zd[3] +
hourgam4[i01] * zd[4] + hourgam5[i01] * zd[5] +
hourgam6[i01] * zd[6] + hourgam7[i01] * zd[7];
h02 =
hourgam0[i02] * zd[0] + hourgam1[i02] * zd[1]+
hourgam2[i02] * zd[2] + hourgam3[i02] * zd[3]+
hourgam4[i02] * zd[4] + hourgam5[i02] * zd[5]+
hourgam6[i02] * zd[6] + hourgam7[i02] * zd[7];
h03 =
hourgam0[i03] * zd[0] + hourgam1[i03] * zd[1] +
hourgam2[i03] * zd[2] + hourgam3[i03] * zd[3] +
hourgam4[i03] * zd[4] + hourgam5[i03] * zd[5] +
hourgam6[i03] * zd[6] + hourgam7[i03] * zd[7];
hgfz[0] = coefficient *
(hourgam0[i00] * h00 + hourgam0[i01] * h01 +
hourgam0[i02] * h02 + hourgam0[i03] * h03);
hgfz[1] = coefficient *
(hourgam1[i00] * h00 + hourgam1[i01] * h01 +
hourgam1[i02] * h02 + hourgam1[i03] * h03);
hgfz[2] = coefficient *
(hourgam2[i00] * h00 + hourgam2[i01] * h01 +
hourgam2[i02] * h02 + hourgam2[i03] * h03);
hgfz[3] = coefficient *
(hourgam3[i00] * h00 + hourgam3[i01] * h01 +
hourgam3[i02] * h02 + hourgam3[i03] * h03);
hgfz[4] = coefficient *
(hourgam4[i00] * h00 + hourgam4[i01] * h01 +
hourgam4[i02] * h02 + hourgam4[i03] * h03);
hgfz[5] = coefficient *
(hourgam5[i00] * h00 + hourgam5[i01] * h01 +
hourgam5[i02] * h02 + hourgam5[i03] * h03);
hgfz[6] = coefficient *
(hourgam6[i00] * h00 + hourgam6[i01] * h01 +
hourgam6[i02] * h02 + hourgam6[i03] * h03);
hgfz[7] = coefficient *
(hourgam7[i00] * h00 + hourgam7[i01] * h01 +
hourgam7[i02] * h02 + hourgam7[i03] * h03);
}
__device__
static inline
Real_t SumOverNodes(Real_t val) {
__shared__ Real_t shm_array[32*8];
// Sum up 8 node values for each element
// Assumes 256 threads: 32 elements, 8 nodes per element.
// NOTE: we could probably avoid some of the __syncthreads() if we map 8 nodes
// of an element to the same warp.
unsigned int tid=hipThreadIdx_x;
#if 1
#if 0
unsigned int node=tid>>5;
unsigned int elem=tid-(node<<5);
#elif 1
unsigned int node=tid/32;
unsigned int elem=tid-(node*32);
#else
unsigned int elem=tid & 0x1F;
#endif
__syncthreads();
shm_array[tid]=val;
__syncthreads();
if (tid<128) shm_array[tid]+=shm_array[tid+128];
__syncthreads();
if (tid<64) shm_array[tid]+=shm_array[tid+64];
__syncthreads();
if (tid<32) shm_array[tid]+=shm_array[tid+32];
__syncthreads();
Real_t ret=shm_array[elem];
__syncthreads();
return ret;
#else
#if 0
unsigned int node=tid>>5;
unsigned int elem=tid-(node<<5);
#else
unsigned int node=tid/32;
unsigned int elem=tid-(node*32);
#endif
unsigned int idx=elem*8+node;
__syncthreads();
shm_array[idx]=val;
__syncthreads();
if (node<4) shm_array[idx]+=shm_array[idx+4];
if (node<2) shm_array[idx]+=shm_array[idx+2];
if (node<1) shm_array[idx]+=shm_array[idx+1];
__syncthreads();
return shm_array[elem*8];
#endif
}
__device__
static inline
void CalcElemFBHourglassForce(Real_t xd,Real_t yd,Real_t zd,
Real_t *hourgam,Real_t coefficient,
Real_t &hgfx, Real_t &hgfy, Real_t &hgfz)
{
hgfx=0;
for (int i=0;i<4;i++) {
Real_t h;
h=hourgam[i]*xd;
h=SumOverNodes(h);
hgfx+=hourgam[i]*h;
}
hgfx *= coefficient;
hgfy=0;
for (int i=0;i<4;i++) {
Real_t h;
h=hourgam[i]*yd;
h=SumOverNodes(h);
hgfy+=hourgam[i]*h;
}
hgfy *= coefficient;
hgfz=0;
for (int i=0;i<4;i++) {
Real_t h;
h=hourgam[i]*zd;
h=SumOverNodes(h);
hgfz+=hourgam[i]*h;
}
hgfz *= coefficient;
}
__global__
void CalcFBHourglassForceForElems_kernel(
Real_t *determ,
Real_t *x8n, Real_t *y8n, Real_t *z8n,
Real_t *dvdx, Real_t *dvdy, Real_t *dvdz,
Real_t hourg,
Index_t numElem, Index_t *nodelist,
Real_t *ss, Real_t *elemMass,
Real_t *xd, Real_t *yd, Real_t *zd,
Real_t *fx_elem, Real_t *fy_elem, Real_t *fz_elem)
{
/*************************************************
*
* FUNCTION: Calculates the Flanagan-Belytschko anti-hourglass
* force.
*
*************************************************/
Real_t hgfx, hgfy, hgfz;
Real_t coefficient;
Real_t hourgam[4];
Real_t xd1, yd1, zd1;
/*************************************************/
/* compute the hourglass modes */
const Real_t posf = Real_t( 1.);
const Real_t negf = Real_t(-1.);
// Assume we will launch 256 threads, which we map to 32 elements, each
// with 8 per-node threads. Organize so each warp of 32 consecutive
// threads operates on the same node of different elements.
// THESE ARE ALL GIVING ME DIFFERENT ANSWERS IN CUDA 4.0 !!?!!?!!
unsigned int tid=hipThreadIdx_x;
unsigned int bid=hipBlockIdx_x;
#if 0
unsigned int node=tid>>5;
unsigned int elem=(bid<<5) + (tid - (node<<5)); /* parenthesized: '<<' binds looser than '+' */
#elif 1
unsigned int node=tid/32;
unsigned int elem=bid*32 + (tid-node*32);
#elif 0
unsigned int node=tid/32;
unsigned int elem=bid*32 + (tid & 0x1F);
#elif 0
unsigned int node=tid/32;
unsigned int elem=(bid<<5) + (tid & 0x1F);
#elif 0
unsigned int node=tid>>5;
unsigned int elem=bid*32 + (tid & 0x1F);
#else
unsigned int node=tid>>5;
unsigned int elem=(bid<<5) + (tid & 0x1F);
#endif
if (elem>=numElem) elem=numElem-1; // don't return -- need thread to participate in sync operations
//if (elem<0) elem=0; // debugging test
Real_t volinv=Real_t(1.0)/determ[elem];
Real_t ss1, mass1, volume13 ;
Real_t xn,yn,zn,dvdxn,dvdyn,dvdzn;
Real_t hourmodx, hourmody, hourmodz;
#if 1
xn=x8n[elem+numElem*node]; yn=y8n[elem+numElem*node]; zn=z8n[elem+numElem*node];
dvdxn=dvdx[elem+numElem*node]; dvdyn=dvdy[elem+numElem*node]; dvdzn=dvdz[elem+numElem*node];
#else
xn=yn=zn=posf; dvdxn=dvdyn=dvdzn=negf;
#endif
#if 1
hourmodx=xn; hourmody=yn; hourmodz=zn;
if (node==2 || node==3 || node==4 || node==5) {
hourmodx *= negf; hourmody *= negf; hourmodz *= negf;
hourgam[0] = negf;
}
else hourgam[0] = posf;
hourmodx = SumOverNodes(hourmodx);
hourmody = SumOverNodes(hourmody);
hourmodz = SumOverNodes(hourmodz);
hourgam[0] -= volinv*(dvdxn*hourmodx + dvdyn*hourmody + dvdzn*hourmodz);
hourmodx=xn; hourmody=yn; hourmodz=zn;
if (node==1 || node==2 || node==4 || node==7) {
hourmodx *= negf; hourmody *= negf; hourmodz *= negf;
hourgam[1] = negf;
}
else hourgam[1] = posf;
hourmodx = SumOverNodes(hourmodx);
hourmody = SumOverNodes(hourmody);
hourmodz = SumOverNodes(hourmodz);
hourgam[1] -= volinv*(dvdxn*hourmodx + dvdyn*hourmody + dvdzn*hourmodz);
hourmodx=xn; hourmody=yn; hourmodz=zn;
if (node==1 || node==3 || node==5 || node==7) {
hourmodx *= negf; hourmody *= negf; hourmodz *= negf;
hourgam[2] = negf;
}
else hourgam[2] = posf;
hourmodx = SumOverNodes(hourmodx);
hourmody = SumOverNodes(hourmody);
hourmodz = SumOverNodes(hourmodz);
hourgam[2] -= volinv*(dvdxn*hourmodx + dvdyn*hourmody + dvdzn*hourmodz);
hourmodx=xn; hourmody=yn; hourmodz=zn;
if (node==0 || node==2 || node==5 || node==7) {
hourmodx *= negf; hourmody *= negf; hourmodz *= negf;
hourgam[3] = negf;
}
else hourgam[3] = posf;
hourmodx = SumOverNodes(hourmodx);
hourmody = SumOverNodes(hourmody);
hourmodz = SumOverNodes(hourmodz);
hourgam[3] -= volinv*(dvdxn*hourmodx + dvdyn*hourmody + dvdzn*hourmodz);
/* compute forces */
/* store forces into h arrays (force arrays) */
ss1=ss[elem];
mass1=elemMass[elem];