| /* |
| |
| Copyright (c) 2010. |
| Lawrence Livermore National Security, LLC. |
| Produced at the Lawrence Livermore National Laboratory. |
| LLNL-CODE-461231 |
| All rights reserved. |
| |
| This file is part of LULESH, Version 1.0. |
| Please also read this link -- http://www.opensource.org/licenses/index.php |
| |
| Redistribution and use in source and binary forms, with or without |
| modification, are permitted provided that the following conditions |
| are met: |
| |
| * Redistributions of source code must retain the above copyright |
| notice, this list of conditions and the disclaimer below. |
| |
| * Redistributions in binary form must reproduce the above copyright |
| notice, this list of conditions and the disclaimer (as noted below) |
| in the documentation and/or other materials provided with the |
| distribution. |
| |
| * Neither the name of the LLNS/LLNL nor the names of its contributors |
| may be used to endorse or promote products derived from this software |
| without specific prior written permission. |
| |
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, |
| THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, |
| INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, |
| BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
| OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, |
| EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| |
| Additional BSD Notice |
| |
| 1. This notice is required to be provided under our contract with the U.S. |
| Department of Energy (DOE). This work was produced at Lawrence Livermore |
| National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE. |
| |
| 2. Neither the United States Government nor Lawrence Livermore National |
| Security, LLC nor any of their employees, makes any warranty, express |
| or implied, or assumes any liability or responsibility for the accuracy, |
| completeness, or usefulness of any information, apparatus, product, or |
| process disclosed, or represents that its use would not infringe |
| privately-owned rights. |
| |
| 3. Also, reference herein to any specific commercial products, process, or |
| services by trade name, trademark, manufacturer or otherwise does not |
| necessarily constitute or imply its endorsement, recommendation, or |
| favoring by the United States Government or Lawrence Livermore National |
| Security, LLC. The views and opinions of authors expressed herein do not |
| necessarily state or reflect those of the United States Government or |
| Lawrence Livermore National Security, LLC, and shall not be used for |
| advertising or product endorsement purposes. |
| |
| */ |
| |
| #include "hip/hip_runtime.h" |
| #include <vector> |
| #include <math.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| |
| #define LULESH_SHOW_PROGRESS 1 |
| |
/* Fatal simulation error conditions (negative so they are distinguishable
   from normal exit codes). */
enum { VolumeError = -1, QStopError = -2 } ;

/****************************************************/
/* Allow flexibility for arithmetic representations */
/****************************************************/

/* Could also support fixed point and interval arithmetic types */
typedef float real4 ;
typedef double real8 ;
typedef long double real10 ; /* 10 bytes on x86 */

typedef int Index_t ; /* array subscript and loop index */
typedef real8 Real_t ; /* floating point representation */
typedef int Int_t ; /* integer representation */
| |
| __host__ __device__ inline real4 SQRT(real4 arg) { return sqrtf(arg) ; } |
| __host__ __device__ inline real8 SQRT(real8 arg) { return sqrt(arg) ; } |
| __host__ inline real10 SQRT(real10 arg) { return sqrtl(arg) ; } |
| |
| __host__ __device__ inline real4 CBRT(real4 arg) { return cbrtf(arg) ; } |
| __host__ __device__ inline real8 CBRT(real8 arg) { return cbrt(arg) ; } |
| __host__ inline real10 CBRT(real10 arg) { return cbrtl(arg) ; } |
| |
| __host__ __device__ inline real4 FABS(real4 arg) { return fabsf(arg) ; } |
| __host__ __device__ inline real8 FABS(real8 arg) { return fabs(arg) ; } |
| __host__ inline real10 FABS(real10 arg) { return fabsl(arg) ; } |
| |
| __host__ __device__ inline real4 FMAX(real4 arg1,real4 arg2) { return fmaxf(arg1,arg2) ; } |
| __host__ __device__ inline real8 FMAX(real8 arg1,real8 arg2) { return fmax(arg1,arg2) ; } |
| __host__ inline real10 FMAX(real10 arg1,real10 arg2) { return fmaxl(arg1,arg2) ; } |
| |
/* Abort with file/line diagnostics if a HIP runtime call fails.  The
   do/while(0) wrapper makes the macro behave as a single statement. */
#define HIP_SAFE_CALL( call) do {                                         \
    hipError_t err = call;                                                \
    if( hipSuccess != err) {                                              \
        fprintf(stderr, "HIP error in file '%s' in line %i : %s.\n",        \
                __FILE__, __LINE__, hipGetErrorString( err) );              \
        exit(EXIT_FAILURE);                                               \
    } } while (0)

/* Short alias used throughout the file. */
#define HIP(call) HIP_SAFE_CALL(call)

/* Optional debugging aid: force a device-wide sync after kernel launches
   when compiled with -DHIP_SYNC_ALL so asynchronous errors surface at the
   launch site instead of at a later call. */
#ifdef HIP_SYNC_ALL
#define HIP_DEBUGSYNC HIP(hipDeviceSynchronize())
#else
#define HIP_DEBUGSYNC
#endif

#define BLOCKSIZE 256

/* Given a number of bytes, nbytes, and a byte alignment, align, (e.g., 2,
 * 4, 8, or 16), return the smallest multiple of align that is greater than
 * or equal to nbytes.  PAD_DIV is the corresponding ceiling division.
 */
#define PAD_DIV(nbytes, align) (((nbytes) + (align) - 1) / (align))
#define PAD(nbytes, align) (PAD_DIV((nbytes),(align)) * (align))
| |
| /* More general version of reduceInPlacePOT (this works for arbitrary |
| * numThreadsPerBlock <= 1024). Again, conditionals on |
| * numThreadsPerBlock are evaluated at compile time. |
| */ |
| template <class T, int numThreadsPerBlock> |
| __device__ void |
| reduceSum(T *sresult, const int threadID) |
| { |
| /* If number of threads is not a power of two, first add the ones |
| after the last power of two into the beginning. At most one of |
| these conditionals will be true for a given NPOT block size. */ |
| if (numThreadsPerBlock > 512 && numThreadsPerBlock <= 1024) |
| { |
| __syncthreads(); |
| if (threadID < numThreadsPerBlock-512) |
| sresult[threadID] += sresult[threadID + 512]; |
| } |
| |
| if (numThreadsPerBlock > 256 && numThreadsPerBlock < 512) |
| { |
| __syncthreads(); |
| if (threadID < numThreadsPerBlock-256) |
| sresult[threadID] += sresult[threadID + 256]; |
| } |
| |
| if (numThreadsPerBlock > 128 && numThreadsPerBlock < 256) |
| { |
| __syncthreads(); |
| if (threadID < numThreadsPerBlock-128) |
| sresult[threadID] += sresult[threadID + 128]; |
| } |
| |
| if (numThreadsPerBlock > 64 && numThreadsPerBlock < 128) |
| { |
| __syncthreads(); |
| if (threadID < numThreadsPerBlock-64) |
| sresult[threadID] += sresult[threadID + 64]; |
| } |
| |
| if (numThreadsPerBlock > 32 && numThreadsPerBlock < 64) |
| { |
| __syncthreads(); |
| if (threadID < numThreadsPerBlock-32) |
| sresult[threadID] += sresult[threadID + 32]; |
| } |
| |
| if (numThreadsPerBlock > 16 && numThreadsPerBlock < 32) |
| { |
| __syncthreads(); |
| if (threadID < numThreadsPerBlock-16) |
| sresult[threadID] += sresult[threadID + 16]; |
| } |
| |
| if (numThreadsPerBlock > 8 && numThreadsPerBlock < 16) |
| { |
| __syncthreads(); |
| if (threadID < numThreadsPerBlock-8) |
| sresult[threadID] += sresult[threadID + 8]; |
| } |
| |
| if (numThreadsPerBlock > 4 && numThreadsPerBlock < 8) |
| { |
| __syncthreads(); |
| if (threadID < numThreadsPerBlock-4) |
| sresult[threadID] += sresult[threadID + 4]; |
| } |
| |
| if (numThreadsPerBlock > 2 && numThreadsPerBlock < 4) |
| { |
| __syncthreads(); |
| if (threadID < numThreadsPerBlock-2) |
| sresult[threadID] += sresult[threadID + 2]; |
| } |
| |
| if (numThreadsPerBlock >= 512) { |
| __syncthreads(); |
| if (threadID < 256) |
| sresult[threadID] += sresult[threadID + 256]; |
| } |
| |
| if (numThreadsPerBlock >= 256) { |
| __syncthreads(); |
| if (threadID < 128) |
| sresult[threadID] += sresult[threadID + 128]; |
| } |
| if (numThreadsPerBlock >= 128) { |
| __syncthreads(); |
| if (threadID < 64) |
| sresult[threadID] += sresult[threadID + 64]; |
| } |
| __syncthreads(); |
| #ifdef _DEVICEEMU |
| if (numThreadsPerBlock >= 64) { |
| __syncthreads(); |
| if (threadID < 32) |
| sresult[threadID] += sresult[threadID + 32]; |
| } |
| if (numThreadsPerBlock >= 32) { |
| __syncthreads(); |
| if (threadID < 16) |
| sresult[threadID] += sresult[threadID + 16]; |
| } |
| if (numThreadsPerBlock >= 16) { |
| __syncthreads(); |
| if (threadID < 8) |
| sresult[threadID] += sresult[threadID + 8]; |
| } |
| if (numThreadsPerBlock >= 8) { |
| __syncthreads(); |
| if (threadID < 4) |
| sresult[threadID] += sresult[threadID + 4]; |
| } |
| if (numThreadsPerBlock >= 4) { |
| __syncthreads(); |
| if (threadID < 2) |
| sresult[threadID] += sresult[threadID + 2]; |
| } |
| if (numThreadsPerBlock >= 2) { |
| __syncthreads(); |
| if (threadID < 1) |
| sresult[threadID] += sresult[threadID + 1]; |
| } |
| #else |
| if (threadID < 32) { |
| volatile T *vol = sresult; |
| if (numThreadsPerBlock >= 64) vol[threadID] += vol[threadID + 32]; |
| if (numThreadsPerBlock >= 32) vol[threadID] += vol[threadID + 16]; |
| if (numThreadsPerBlock >= 16) vol[threadID] += vol[threadID + 8]; |
| if (numThreadsPerBlock >= 8) vol[threadID] += vol[threadID + 4]; |
| if (numThreadsPerBlock >= 4) vol[threadID] += vol[threadID + 2]; |
| if (numThreadsPerBlock >= 2) vol[threadID] += vol[threadID + 1]; |
| } |
| #endif |
| __syncthreads(); |
| } |
| |
| #define MINEQ(a,b) (a)=(((a)<(b))?(a):(b)) |
| |
| template <class T, int numThreadsPerBlock> |
| __device__ void |
| reduceMin(T *sresult, const int threadID) |
| { |
| /* If number of threads is not a power of two, first add the ones |
| after the last power of two into the beginning. At most one of |
| these conditionals will be true for a given NPOT block size. */ |
| if (numThreadsPerBlock > 512 && numThreadsPerBlock <= 1024) |
| { |
| __syncthreads(); |
| if (threadID < numThreadsPerBlock-512) |
| MINEQ(sresult[threadID],sresult[threadID + 512]); |
| } |
| |
| if (numThreadsPerBlock > 256 && numThreadsPerBlock < 512) |
| { |
| __syncthreads(); |
| if (threadID < numThreadsPerBlock-256) |
| MINEQ(sresult[threadID],sresult[threadID + 256]); |
| } |
| |
| if (numThreadsPerBlock > 128 && numThreadsPerBlock < 256) |
| { |
| __syncthreads(); |
| if (threadID < numThreadsPerBlock-128) |
| MINEQ(sresult[threadID],sresult[threadID + 128]); |
| } |
| |
| if (numThreadsPerBlock > 64 && numThreadsPerBlock < 128) |
| { |
| __syncthreads(); |
| if (threadID < numThreadsPerBlock-64) |
| MINEQ(sresult[threadID],sresult[threadID + 64]); |
| } |
| |
| if (numThreadsPerBlock > 32 && numThreadsPerBlock < 64) |
| { |
| __syncthreads(); |
| if (threadID < numThreadsPerBlock-32) |
| MINEQ(sresult[threadID],sresult[threadID + 32]); |
| } |
| |
| if (numThreadsPerBlock > 16 && numThreadsPerBlock < 32) |
| { |
| __syncthreads(); |
| if (threadID < numThreadsPerBlock-16) |
| MINEQ(sresult[threadID],sresult[threadID + 16]); |
| } |
| |
| if (numThreadsPerBlock > 8 && numThreadsPerBlock < 16) |
| { |
| __syncthreads(); |
| if (threadID < numThreadsPerBlock-8) |
| MINEQ(sresult[threadID],sresult[threadID + 8]); |
| } |
| |
| if (numThreadsPerBlock > 4 && numThreadsPerBlock < 8) |
| { |
| __syncthreads(); |
| if (threadID < numThreadsPerBlock-4) |
| MINEQ(sresult[threadID],sresult[threadID + 4]); |
| } |
| |
| if (numThreadsPerBlock > 2 && numThreadsPerBlock < 4) |
| { |
| __syncthreads(); |
| if (threadID < numThreadsPerBlock-2) |
| MINEQ(sresult[threadID],sresult[threadID + 2]); |
| } |
| |
| if (numThreadsPerBlock >= 512) { |
| __syncthreads(); |
| if (threadID < 256) |
| MINEQ(sresult[threadID],sresult[threadID + 256]); |
| } |
| |
| if (numThreadsPerBlock >= 256) { |
| __syncthreads(); |
| if (threadID < 128) |
| MINEQ(sresult[threadID],sresult[threadID + 128]); |
| } |
| if (numThreadsPerBlock >= 128) { |
| __syncthreads(); |
| if (threadID < 64) |
| MINEQ(sresult[threadID],sresult[threadID + 64]); |
| } |
| __syncthreads(); |
| #ifdef _DEVICEEMU |
| if (numThreadsPerBlock >= 64) { |
| __syncthreads(); |
| if (threadID < 32) |
| MINEQ(sresult[threadID],sresult[threadID + 32]); |
| } |
| if (numThreadsPerBlock >= 32) { |
| __syncthreads(); |
| if (threadID < 16) |
| MINEQ(sresult[threadID],sresult[threadID + 16]); |
| } |
| if (numThreadsPerBlock >= 16) { |
| __syncthreads(); |
| if (threadID < 8) |
| MINEQ(sresult[threadID],sresult[threadID + 8]); |
| } |
| if (numThreadsPerBlock >= 8) { |
| __syncthreads(); |
| if (threadID < 4) |
| MINEQ(sresult[threadID],sresult[threadID + 4]); |
| } |
| if (numThreadsPerBlock >= 4) { |
| __syncthreads(); |
| if (threadID < 2) |
| MINEQ(sresult[threadID],sresult[threadID + 2]); |
| } |
| if (numThreadsPerBlock >= 2) { |
| __syncthreads(); |
| if (threadID < 1) |
| MINEQ(sresult[threadID],sresult[threadID + 1]); |
| } |
| #else |
| if (threadID < 32) { |
| volatile T *vol = sresult; |
| if (numThreadsPerBlock >= 64) MINEQ(vol[threadID],vol[threadID + 32]); |
| if (numThreadsPerBlock >= 32) MINEQ(vol[threadID],vol[threadID + 16]); |
| if (numThreadsPerBlock >= 16) MINEQ(vol[threadID],vol[threadID + 8]); |
| if (numThreadsPerBlock >= 8) MINEQ(vol[threadID],vol[threadID + 4]); |
| if (numThreadsPerBlock >= 4) MINEQ(vol[threadID],vol[threadID + 2]); |
| if (numThreadsPerBlock >= 2) MINEQ(vol[threadID],vol[threadID + 1]); |
| } |
| #endif |
| __syncthreads(); |
| } |
| |
| void hip_init() |
| { |
| int deviceCount, dev; |
| hipDeviceProp_t hip_deviceProp; |
| char *s; |
| |
| HIP( hipGetDeviceCount(&deviceCount) ); |
| if (deviceCount == 0) { |
| fprintf(stderr, "hip_init(): no devices supporting HIP.\n"); |
| exit(1); |
| } |
| s=getenv("HIP_DEVICE"); |
| if (s != NULL) { dev=atoi(s); } |
| else dev=0; |
| if ((dev < 0) || (dev > deviceCount-1)) { |
| fprintf(stderr, "hip_init(): requested device (%d) out of range [%d,%d]\n", |
| dev, 0, deviceCount-1); |
| exit(1); |
| } |
| HIP( hipGetDeviceProperties(&hip_deviceProp, dev) ); |
| if (hip_deviceProp.major < 1) { |
| fprintf(stderr, "hip_init(): device %d does not support HIP.\n", dev); |
| exit(1); |
| } |
| fprintf(stderr, "setting HIP device %d\n",dev); |
| HIP( hipSetDevice(dev) ); |
| } |
| |
| /************************************************************/ |
| /* Allow for flexible data layout experiments by separating */ |
| /* array interface from underlying implementation. */ |
| /************************************************************/ |
| |
| struct Mesh { |
| |
| /* This first implementation allows for runnable code */ |
| /* and is not meant to be optimal. Final implementation */ |
| /* should separate declaration and allocation phases */ |
| /* so that allocation can be scheduled in a cache conscious */ |
| /* manner. */ |
| |
| friend struct MeshGPU; |
| |
| public: |
| |
| /**************/ |
| /* Allocation */ |
| /**************/ |
| |
| void AllocateNodalPersistent(size_t size) |
| { |
| m_x.resize(size) ; |
| m_y.resize(size) ; |
| m_z.resize(size) ; |
| |
| m_xd.resize(size, Real_t(0.)) ; |
| m_yd.resize(size, Real_t(0.)) ; |
| m_zd.resize(size, Real_t(0.)) ; |
| |
| m_xdd.resize(size, Real_t(0.)) ; |
| m_ydd.resize(size, Real_t(0.)) ; |
| m_zdd.resize(size, Real_t(0.)) ; |
| |
| m_fx.resize(size) ; |
| m_fy.resize(size) ; |
| m_fz.resize(size) ; |
| |
| m_nodalMass.resize(size, Real_t(0.)) ; |
| } |
| |
| void AllocateElemPersistent(size_t size) |
| { |
| m_matElemlist.resize(size) ; |
| m_nodelist.resize(8*size) ; |
| |
| m_lxim.resize(size) ; |
| m_lxip.resize(size) ; |
| m_letam.resize(size) ; |
| m_letap.resize(size) ; |
| m_lzetam.resize(size) ; |
| m_lzetap.resize(size) ; |
| |
| m_elemBC.resize(size) ; |
| |
| m_e.resize(size, Real_t(0.)) ; |
| |
| m_p.resize(size, Real_t(0.)) ; |
| m_q.resize(size) ; |
| m_ql.resize(size) ; |
| m_qq.resize(size) ; |
| |
| m_v.resize(size, 1.0) ; |
| m_volo.resize(size) ; |
| m_delv.resize(size) ; |
| m_vdov.resize(size) ; |
| |
| m_arealg.resize(size) ; |
| |
| m_ss.resize(size) ; |
| |
| m_elemMass.resize(size) ; |
| } |
| |
| /* Temporaries should not be initialized in bulk but */ |
| /* this is a runnable placeholder for now */ |
| void AllocateElemTemporary(size_t size) |
| { |
| m_dxx.resize(size) ; |
| m_dyy.resize(size) ; |
| m_dzz.resize(size) ; |
| |
| m_delv_xi.resize(size) ; |
| m_delv_eta.resize(size) ; |
| m_delv_zeta.resize(size) ; |
| |
| m_delx_xi.resize(size) ; |
| m_delx_eta.resize(size) ; |
| m_delx_zeta.resize(size) ; |
| |
| m_vnew.resize(size) ; |
| } |
| |
| void AllocateNodesets(size_t size) |
| { |
| m_symmX.resize(size) ; |
| m_symmY.resize(size) ; |
| m_symmZ.resize(size) ; |
| } |
| |
| void AllocateNodeElemIndexes() |
| { |
| Index_t i,j,nidx; |
| |
| /* set up node-centered indexing of elements */ |
| m_nodeElemCount.resize(m_numNode); |
| for (i=0;i<m_numNode;i++) m_nodeElemCount[i]=0; |
| m_nodeElemCornerList.resize(m_numNode*8); |
| for (i=0;i<m_numElem;i++) { |
| for (j=0;j<8;j++) { |
| nidx=nodelist(i,j); |
| m_nodeElemCornerList[nidx+m_numNode*m_nodeElemCount[nidx]++] = i+m_numElem*j; |
| if (m_nodeElemCount[nidx]>8) { |
| fprintf(stderr, "Node degree is higher than 8!\n"); |
| exit(1); |
| } |
| } |
| } |
| } |
| |
| /**********/ |
| /* Access */ |
| /**********/ |
| |
| /* Node-centered */ |
| |
| Real_t& x(Index_t idx) { return m_x[idx] ; } |
| Real_t& y(Index_t idx) { return m_y[idx] ; } |
| Real_t& z(Index_t idx) { return m_z[idx] ; } |
| |
| Real_t& xd(Index_t idx) { return m_xd[idx] ; } |
| Real_t& yd(Index_t idx) { return m_yd[idx] ; } |
| Real_t& zd(Index_t idx) { return m_zd[idx] ; } |
| |
| Real_t& xdd(Index_t idx) { return m_xdd[idx] ; } |
| Real_t& ydd(Index_t idx) { return m_ydd[idx] ; } |
| Real_t& zdd(Index_t idx) { return m_zdd[idx] ; } |
| |
| Real_t& fx(Index_t idx) { return m_fx[idx] ; } |
| Real_t& fy(Index_t idx) { return m_fy[idx] ; } |
| Real_t& fz(Index_t idx) { return m_fz[idx] ; } |
| |
| Real_t& nodalMass(Index_t idx) { return m_nodalMass[idx] ; } |
| |
| Index_t& symmX(Index_t idx) { return m_symmX[idx] ; } |
| Index_t& symmY(Index_t idx) { return m_symmY[idx] ; } |
| Index_t& symmZ(Index_t idx) { return m_symmZ[idx] ; } |
| |
| /* Element-centered */ |
| |
| Index_t& matElemlist(Index_t idx) { return m_matElemlist[idx] ; } |
| Index_t& nodelist(Index_t idx,Index_t nidx) { return m_nodelist[idx+nidx*m_numElem] ; } |
| |
| Index_t& lxim(Index_t idx) { return m_lxim[idx] ; } |
| Index_t& lxip(Index_t idx) { return m_lxip[idx] ; } |
| Index_t& letam(Index_t idx) { return m_letam[idx] ; } |
| Index_t& letap(Index_t idx) { return m_letap[idx] ; } |
| Index_t& lzetam(Index_t idx) { return m_lzetam[idx] ; } |
| Index_t& lzetap(Index_t idx) { return m_lzetap[idx] ; } |
| |
| Int_t& elemBC(Index_t idx) { return m_elemBC[idx] ; } |
| |
| Real_t& dxx(Index_t idx) { return m_dxx[idx] ; } |
| Real_t& dyy(Index_t idx) { return m_dyy[idx] ; } |
| Real_t& dzz(Index_t idx) { return m_dzz[idx] ; } |
| |
| Real_t& delv_xi(Index_t idx) { return m_delv_xi[idx] ; } |
| Real_t& delv_eta(Index_t idx) { return m_delv_eta[idx] ; } |
| Real_t& delv_zeta(Index_t idx) { return m_delv_zeta[idx] ; } |
| |
| Real_t& delx_xi(Index_t idx) { return m_delx_xi[idx] ; } |
| Real_t& delx_eta(Index_t idx) { return m_delx_eta[idx] ; } |
| Real_t& delx_zeta(Index_t idx) { return m_delx_zeta[idx] ; } |
| |
| Real_t& e(Index_t idx) { return m_e[idx] ; } |
| |
| Real_t& p(Index_t idx) { return m_p[idx] ; } |
| Real_t& q(Index_t idx) { return m_q[idx] ; } |
| Real_t& ql(Index_t idx) { return m_ql[idx] ; } |
| Real_t& qq(Index_t idx) { return m_qq[idx] ; } |
| |
| Real_t& v(Index_t idx) { return m_v[idx] ; } |
| Real_t& volo(Index_t idx) { return m_volo[idx] ; } |
| Real_t& vnew(Index_t idx) { return m_vnew[idx] ; } |
| Real_t& delv(Index_t idx) { return m_delv[idx] ; } |
| Real_t& vdov(Index_t idx) { return m_vdov[idx] ; } |
| |
| Real_t& arealg(Index_t idx) { return m_arealg[idx] ; } |
| |
| Real_t& ss(Index_t idx) { return m_ss[idx] ; } |
| |
| Real_t& elemMass(Index_t idx) { return m_elemMass[idx] ; } |
| |
| /* Params */ |
| |
| Real_t& dtfixed() { return m_dtfixed ; } |
| Real_t& time() { return m_time ; } |
| Real_t& deltatime() { return m_deltatime ; } |
| Real_t& deltatimemultlb() { return m_deltatimemultlb ; } |
| Real_t& deltatimemultub() { return m_deltatimemultub ; } |
| Real_t& stoptime() { return m_stoptime ; } |
| |
| Real_t& u_cut() { return m_u_cut ; } |
| Real_t& hgcoef() { return m_hgcoef ; } |
| Real_t& qstop() { return m_qstop ; } |
| Real_t& monoq_max_slope() { return m_monoq_max_slope ; } |
| Real_t& monoq_limiter_mult() { return m_monoq_limiter_mult ; } |
| Real_t& e_cut() { return m_e_cut ; } |
| Real_t& p_cut() { return m_p_cut ; } |
| Real_t& ss4o3() { return m_ss4o3 ; } |
| Real_t& q_cut() { return m_q_cut ; } |
| Real_t& v_cut() { return m_v_cut ; } |
| Real_t& qlc_monoq() { return m_qlc_monoq ; } |
| Real_t& qqc_monoq() { return m_qqc_monoq ; } |
| Real_t& qqc() { return m_qqc ; } |
| Real_t& eosvmax() { return m_eosvmax ; } |
| Real_t& eosvmin() { return m_eosvmin ; } |
| Real_t& pmin() { return m_pmin ; } |
| Real_t& emin() { return m_emin ; } |
| Real_t& dvovmax() { return m_dvovmax ; } |
| Real_t& refdens() { return m_refdens ; } |
| |
| Real_t& dtcourant() { return m_dtcourant ; } |
| Real_t& dthydro() { return m_dthydro ; } |
| Real_t& dtmax() { return m_dtmax ; } |
| |
| Int_t& cycle() { return m_cycle ; } |
| |
| Index_t& sizeX() { return m_sizeX ; } |
| Index_t& sizeY() { return m_sizeY ; } |
| Index_t& sizeZ() { return m_sizeZ ; } |
| Index_t& numElem() { return m_numElem ; } |
| Index_t& numNode() { return m_numNode ; } |
| |
| |
| //private: |
| |
| /******************/ |
| /* Implementation */ |
| /******************/ |
| |
| /* Node-centered */ |
| |
| std::vector<Real_t> m_x ; /* coordinates */ |
| std::vector<Real_t> m_y ; |
| std::vector<Real_t> m_z ; |
| |
| std::vector<Real_t> m_xd ; /* velocities */ |
| std::vector<Real_t> m_yd ; |
| std::vector<Real_t> m_zd ; |
| |
| std::vector<Real_t> m_xdd ; /* accelerations */ |
| std::vector<Real_t> m_ydd ; |
| std::vector<Real_t> m_zdd ; |
| |
| std::vector<Real_t> m_fx ; /* forces */ |
| std::vector<Real_t> m_fy ; |
| std::vector<Real_t> m_fz ; |
| |
| std::vector<Real_t> m_nodalMass ; /* mass */ |
| |
| std::vector<Index_t> m_symmX ; /* symmetry plane nodesets */ |
| std::vector<Index_t> m_symmY ; |
| std::vector<Index_t> m_symmZ ; |
| |
| std::vector<Int_t> m_nodeElemCount ; |
| std::vector<Index_t> m_nodeElemCornerList ; |
| |
| /* Element-centered */ |
| |
| std::vector<Index_t> m_matElemlist ; /* material indexset */ |
| std::vector<Index_t> m_nodelist ; /* elemToNode connectivity */ |
| |
| std::vector<Index_t> m_lxim ; /* element connectivity across each face */ |
| std::vector<Index_t> m_lxip ; |
| std::vector<Index_t> m_letam ; |
| std::vector<Index_t> m_letap ; |
| std::vector<Index_t> m_lzetam ; |
| std::vector<Index_t> m_lzetap ; |
| |
| std::vector<Int_t> m_elemBC ; /* symmetry/free-surface flags for each elem face */ |
| |
| std::vector<Real_t> m_dxx ; /* principal strains -- temporary */ |
| std::vector<Real_t> m_dyy ; |
| std::vector<Real_t> m_dzz ; |
| |
| std::vector<Real_t> m_delv_xi ; /* velocity gradient -- temporary */ |
| std::vector<Real_t> m_delv_eta ; |
| std::vector<Real_t> m_delv_zeta ; |
| |
| std::vector<Real_t> m_delx_xi ; /* coordinate gradient -- temporary */ |
| std::vector<Real_t> m_delx_eta ; |
| std::vector<Real_t> m_delx_zeta ; |
| |
| std::vector<Real_t> m_e ; /* energy */ |
| |
| std::vector<Real_t> m_p ; /* pressure */ |
| std::vector<Real_t> m_q ; /* q */ |
| std::vector<Real_t> m_ql ; /* linear term for q */ |
| std::vector<Real_t> m_qq ; /* quadratic term for q */ |
| |
| std::vector<Real_t> m_v ; /* relative volume */ |
| std::vector<Real_t> m_volo ; /* reference volume */ |
| std::vector<Real_t> m_vnew ; /* new relative volume -- temporary */ |
| std::vector<Real_t> m_delv ; /* m_vnew - m_v */ |
| std::vector<Real_t> m_vdov ; /* volume derivative over volume */ |
| |
| std::vector<Real_t> m_arealg ; /* characteristic length of an element */ |
| |
| std::vector<Real_t> m_ss ; /* "sound speed" */ |
| |
| std::vector<Real_t> m_elemMass ; /* mass */ |
| |
| /* Parameters */ |
| |
| Real_t m_dtfixed ; /* fixed time increment */ |
| Real_t m_time ; /* current time */ |
| Real_t m_deltatime ; /* variable time increment */ |
| Real_t m_deltatimemultlb ; |
| Real_t m_deltatimemultub ; |
| Real_t m_stoptime ; /* end time for simulation */ |
| |
| Real_t m_u_cut ; /* velocity tolerance */ |
| Real_t m_hgcoef ; /* hourglass control */ |
| Real_t m_qstop ; /* excessive q indicator */ |
| Real_t m_monoq_max_slope ; |
| Real_t m_monoq_limiter_mult ; |
| Real_t m_e_cut ; /* energy tolerance */ |
| Real_t m_p_cut ; /* pressure tolerance */ |
| Real_t m_ss4o3 ; |
| Real_t m_q_cut ; /* q tolerance */ |
| Real_t m_v_cut ; /* relative volume tolerance */ |
| Real_t m_qlc_monoq ; /* linear term coef for q */ |
| Real_t m_qqc_monoq ; /* quadratic term coef for q */ |
| Real_t m_qqc ; |
| Real_t m_eosvmax ; |
| Real_t m_eosvmin ; |
| Real_t m_pmin ; /* pressure floor */ |
| Real_t m_emin ; /* energy floor */ |
| Real_t m_dvovmax ; /* maximum allowable volume change */ |
| Real_t m_refdens ; /* reference density */ |
| |
| Real_t m_dtcourant ; /* courant constraint */ |
| Real_t m_dthydro ; /* volume change constraint */ |
| Real_t m_dtmax ; /* maximum allowable time increment */ |
| |
| Int_t m_cycle ; /* iteration count for simulation */ |
| |
| Index_t m_sizeX ; /* X,Y,Z extent of this block */ |
| Index_t m_sizeY ; |
| Index_t m_sizeZ ; |
| |
| Index_t m_numElem ; /* Elements/Nodes in this domain */ |
| Index_t m_numNode ; |
| } mesh ; |
| |
/* Allocate an uninitialized C-style array of `size` elements of type T.
 * Returns NULL on allocation failure (malloc semantics); release with
 * Release() or free(). */
template <typename T>
T *Allocate(size_t size)
{
   void *mem = malloc(size * sizeof(T)) ;
   return static_cast<T *>(mem) ;
}
| |
/* Free the array *ptr obtained from Allocate() and null the caller's
 * pointer so a double Release() is harmless. */
template <typename T>
void Release(T **ptr)
{
   if (*ptr == NULL) return ;
   free(*ptr) ;
   *ptr = NULL ;
}
| |
| |
| #define GPU_STALE 0 |
| #define CPU_STALE 1 |
| #define ALL_FRESH 2 |
| |
| template<typename T> |
| void freshenGPU(std::vector<T>&cpu,T **gpu,int& stale) { |
| if (stale!=GPU_STALE) return; |
| if (!(*gpu)) {HIP( hipHostMalloc(gpu,sizeof(T)*cpu.size()) );} |
| HIP( hipMemcpy(*gpu,&cpu[0],sizeof(T)*cpu.size(),hipMemcpyHostToDevice) ); |
| stale=ALL_FRESH; |
| } |
| |
| template<typename T> |
| void freshenCPU(std::vector<T>&cpu,T *gpu,int& stale) { |
| if (stale!=CPU_STALE) return; |
| if (!gpu) {fprintf(stderr,"freshenCPU(): NULL GPU data!\n");exit(1);} |
| HIP( hipMemcpy(&cpu[0],gpu,sizeof(T)*cpu.size(),hipMemcpyDeviceToHost) ); |
| stale=ALL_FRESH; |
| } |
| |
| // freshen helpers |
| #define FC(var) freshenCPU(mesh.m_ ## var , meshGPU.m_ ## var ,meshGPU.m_ ## var ## _stale ); // freshen CPU |
| #define FG(var) freshenGPU(mesh.m_ ## var , &meshGPU.m_ ## var ,meshGPU.m_ ## var ## _stale ); // freshen GPU |
| // stale helpers |
| #define SC(var) meshGPU.m_ ## var ## _stale = CPU_STALE; // stale CPU |
| #define SG(var) meshGPU.m_ ## var ## _stale = GPU_STALE; // stale GPU |
| |
| struct MeshGPU { |
| Mesh *m_mesh; |
| |
| /******************/ |
| /* Implementation */ |
| /******************/ |
| |
| /* Node-centered */ |
| |
| Real_t *m_x ; /* coordinates */ |
| Real_t *m_y ; |
| Real_t *m_z ; |
| |
| Real_t *m_xd ; /* velocities */ |
| Real_t *m_yd ; |
| Real_t *m_zd ; |
| |
| Real_t *m_xdd ; /* accelerations */ |
| Real_t *m_ydd ; |
| Real_t *m_zdd ; |
| |
| Real_t *m_fx ; /* forces */ |
| Real_t *m_fy ; |
| Real_t *m_fz ; |
| |
| Real_t *m_nodalMass ; /* mass */ |
| |
| Index_t *m_symmX ; /* symmetry plane nodesets */ |
| Index_t *m_symmY ; |
| Index_t *m_symmZ ; |
| |
| Int_t *m_nodeElemCount ; |
| Index_t *m_nodeElemCornerList ; |
| |
| /* Element-centered */ |
| |
| Index_t * m_matElemlist ; /* material indexset */ |
| Index_t * m_nodelist ; /* elemToNode connectivity */ |
| |
| Index_t * m_lxim ; /* element connectivity across each face */ |
| Index_t * m_lxip ; |
| Index_t * m_letam ; |
| Index_t * m_letap ; |
| Index_t * m_lzetam ; |
| Index_t * m_lzetap ; |
| |
| Int_t * m_elemBC ; /* symmetry/free-surface flags for each elem face */ |
| |
| Real_t *m_dxx ; /* principal strains -- temporary */ |
| Real_t *m_dyy ; |
| Real_t *m_dzz ; |
| |
| Real_t *m_delv_xi ; /* velocity gradient -- temporary */ |
| Real_t *m_delv_eta ; |
| Real_t *m_delv_zeta ; |
| |
| Real_t *m_delx_xi ; /* coordinate gradient -- temporary */ |
| Real_t *m_delx_eta ; |
| Real_t *m_delx_zeta ; |
| |
| Real_t *m_e ; /* energy */ |
| |
| Real_t *m_p ; /* pressure */ |
| Real_t *m_q ; /* q */ |
| Real_t *m_ql ; /* linear term for q */ |
| Real_t *m_qq ; /* quadratic term for q */ |
| |
| Real_t *m_v ; /* relative volume */ |
| Real_t *m_volo ; /* reference volume */ |
| Real_t *m_vnew ; /* new relative volume -- temporary */ |
| Real_t *m_delv ; /* m_vnew - m_v */ |
| Real_t *m_vdov ; /* volume derivative over volume */ |
| |
| Real_t *m_arealg ; /* characteristic length of an element */ |
| |
| Real_t *m_ss ; /* "sound speed" */ |
| |
| Real_t *m_elemMass ; /* mass */ |
| |
| /* Stale flags */ |
| int m_x_stale,m_y_stale,m_z_stale; |
| int m_xd_stale,m_yd_stale,m_zd_stale; |
| int m_xdd_stale,m_ydd_stale,m_zdd_stale; |
| int m_fx_stale,m_fy_stale,m_fz_stale; |
| int m_nodalMass_stale; |
| int m_symmX_stale,m_symmY_stale,m_symmZ_stale; |
| int m_nodeElemCount_stale,m_nodeElemCornerList_stale; |
| int m_matElemlist_stale,m_nodelist_stale; |
| int m_lxim_stale,m_lxip_stale,m_letam_stale,m_letap_stale,m_lzetam_stale,m_lzetap_stale; |
| int m_elemBC_stale; |
| int m_dxx_stale,m_dyy_stale,m_dzz_stale; |
| int m_delv_xi_stale,m_delv_eta_stale,m_delv_zeta_stale; |
| int m_delx_xi_stale,m_delx_eta_stale,m_delx_zeta_stale; |
| int m_e_stale; |
| int m_p_stale,m_q_stale,m_ql_stale,m_qq_stale; |
| int m_v_stale,m_volo_stale,m_vnew_stale,m_delv_stale,m_vdov_stale; |
| int m_arealg_stale; |
| int m_ss_stale; |
| int m_elemMass_stale; |
| |
| void init(Mesh *mesh) { |
| m_mesh=mesh; |
| m_x=m_y=m_z=NULL; |
| m_xd=m_yd=m_zd=NULL; |
| m_xdd=m_ydd=m_zdd=NULL; |
| m_fx=m_fy=m_fz=NULL; |
| m_nodalMass=NULL; |
| m_symmX=m_symmY=m_symmZ=NULL; |
| m_nodeElemCount=m_nodeElemCornerList=NULL; |
| m_matElemlist=m_nodelist=NULL; |
| m_lxim=m_lxip=m_letam=m_letap=m_lzetam=m_lzetap=NULL; |
| m_elemBC=NULL; |
| m_dxx=m_dyy=m_dzz=NULL; |
| m_delv_xi=m_delv_eta=m_delv_zeta=NULL; |
| m_delx_xi=m_delx_eta=m_delx_zeta=NULL; |
| m_e=NULL; |
| m_p=m_q=m_ql=m_qq=NULL; |
| m_v=m_volo=m_vnew=m_delv=m_vdov=NULL; |
| m_arealg=NULL; |
| m_ss=NULL; |
| m_elemMass=NULL; |
| m_x_stale=m_y_stale=m_z_stale= |
| m_xd_stale=m_yd_stale=m_zd_stale= |
| m_xdd_stale=m_ydd_stale=m_zdd_stale= |
| m_fx_stale=m_fy_stale=m_fz_stale= |
| m_nodalMass_stale= |
| m_symmX_stale=m_symmY_stale=m_symmZ_stale= |
| m_nodeElemCount_stale=m_nodeElemCornerList_stale= |
| m_matElemlist_stale=m_nodelist_stale= |
| m_lxim_stale=m_lxip_stale=m_letam_stale=m_letap_stale=m_lzetam_stale=m_lzetap_stale= |
| m_elemBC_stale= |
| m_dxx_stale=m_dyy_stale=m_dzz_stale= |
| m_delv_xi_stale=m_delv_eta_stale=m_delv_zeta_stale= |
| m_delx_xi_stale=m_delx_eta_stale=m_delx_zeta_stale= |
| m_e_stale= |
| m_p_stale=m_q_stale=m_ql_stale=m_qq_stale= |
| m_v_stale=m_volo_stale=m_vnew_stale=m_delv_stale=m_vdov_stale= |
| m_arealg_stale= |
| m_ss_stale= |
| m_elemMass_stale= |
| GPU_STALE; |
| } |
    void freshenGPU() {
        // Bring every device-side field up to date with its host copy.
        // The free function ::freshenGPU(hostField, &devPtr, staleFlag)
        // presumably copies host->device only when the flag marks the GPU
        // copy stale, and clears the flag -- confirm against its
        // definition (not visible here).  Note the device pointer is
        // passed by address so the helper can (re)allocate it.
#define F(var) ::freshenGPU(m_mesh->m_ ## var , &m_ ## var ,m_ ## var ## _stale);
        F(x); F(y); F(z);
        F(xd); F(yd); F(zd);
        F(xdd); F(ydd); F(zdd);
        F(fx); F(fy); F(fz);
        F(nodalMass);
        F(symmX); F(symmY); F(symmZ);
        F(nodeElemCount); F(nodeElemCornerList);
        F(matElemlist); F(nodelist);
        F(lxim); F(lxip); F(letam); F(letap); F(lzetam); F(lzetap);
        F(elemBC);
        F(dxx); F(dyy); F(dzz);
        F(delv_xi); F(delv_eta); F(delv_zeta);
        F(delx_xi); F(delx_eta); F(delx_zeta);
        F(e);
        F(p); F(q); F(ql); F(qq);
        F(v); F(volo); F(vnew); F(delv); F(vdov);
        F(arealg);
        F(ss);
        F(elemMass);
#undef F
    }
    void freshenCPU() {
        // Bring every host-side field up to date with its device copy.
        // Mirror of freshenGPU(), but the device pointer is passed by
        // value -- ::freshenCPU only reads from it (device->host copy),
        // it never reallocates.  Copy/flag semantics live in the
        // ::freshenCPU helper (not visible here).
#define F(var) ::freshenCPU(m_mesh->m_ ## var , m_ ## var ,m_ ## var ## _stale);
        F(x); F(y); F(z);
        F(xd); F(yd); F(zd);
        F(xdd); F(ydd); F(zdd);
        F(fx); F(fy); F(fz);
        F(nodalMass);
        F(symmX); F(symmY); F(symmZ);
        F(nodeElemCount); F(nodeElemCornerList);
        F(matElemlist); F(nodelist);
        F(lxim); F(lxip); F(letam); F(letap); F(lzetam); F(lzetap);
        F(elemBC);
        F(dxx); F(dyy); F(dzz);
        F(delv_xi); F(delv_eta); F(delv_zeta);
        F(delx_xi); F(delx_eta); F(delx_zeta);
        F(e);
        F(p); F(q); F(ql); F(qq);
        F(v); F(volo); F(vnew); F(delv); F(vdov);
        F(arealg);
        F(ss);
        F(elemMass);
#undef F
    }
| } meshGPU; |
| |
| |
| |
| /* Stuff needed for boundary conditions */ |
| /* 2 BCs on each of 6 hexahedral faces (12 bits) */ |
| #define XI_M 0x003 |
| #define XI_M_SYMM 0x001 |
| #define XI_M_FREE 0x002 |
| |
| #define XI_P 0x00c |
| #define XI_P_SYMM 0x004 |
| #define XI_P_FREE 0x008 |
| |
| #define ETA_M 0x030 |
| #define ETA_M_SYMM 0x010 |
| #define ETA_M_FREE 0x020 |
| |
| #define ETA_P 0x0c0 |
| #define ETA_P_SYMM 0x040 |
| #define ETA_P_FREE 0x080 |
| |
| #define ZETA_M 0x300 |
| #define ZETA_M_SYMM 0x100 |
| #define ZETA_M_FREE 0x200 |
| |
| #define ZETA_P 0xc00 |
| #define ZETA_P_SYMM 0x400 |
| #define ZETA_P_FREE 0x800 |
| |
| |
| static inline |
| void TimeIncrement() |
| { |
| Real_t targetdt = mesh.stoptime() - mesh.time() ; |
| |
| if ((mesh.dtfixed() <= Real_t(0.0)) && (mesh.cycle() != Int_t(0))) { |
| Real_t ratio ; |
| Real_t olddt = mesh.deltatime() ; |
| |
| /* This will require a reduction in parallel */ |
| Real_t newdt = Real_t(1.0e+20) ; |
| if (mesh.dtcourant() < newdt) { |
| newdt = mesh.dtcourant() / Real_t(2.0) ; |
| } |
| if (mesh.dthydro() < newdt) { |
| newdt = mesh.dthydro() * Real_t(2.0) / Real_t(3.0) ; |
| } |
| |
| ratio = newdt / olddt ; |
| if (ratio >= Real_t(1.0)) { |
| if (ratio < mesh.deltatimemultlb()) { |
| newdt = olddt ; |
| } |
| else if (ratio > mesh.deltatimemultub()) { |
| newdt = olddt*mesh.deltatimemultub() ; |
| } |
| } |
| |
| if (newdt > mesh.dtmax()) { |
| newdt = mesh.dtmax() ; |
| } |
| mesh.deltatime() = newdt ; |
| } |
| |
| /* TRY TO PREVENT VERY SMALL SCALING ON THE NEXT CYCLE */ |
| if ((targetdt > mesh.deltatime()) && |
| (targetdt < (Real_t(4.0) * mesh.deltatime() / Real_t(3.0))) ) { |
| targetdt = Real_t(2.0) * mesh.deltatime() / Real_t(3.0) ; |
| } |
| |
| if (targetdt < mesh.deltatime()) { |
| mesh.deltatime() = targetdt ; |
| } |
| |
| mesh.time() += mesh.deltatime() ; |
| |
| ++mesh.cycle() ; |
| } |
| |
| __global__ |
| void InitStressTermsForElems_kernel( |
| int numElem,Real_t *sigxx, Real_t *sigyy, Real_t *sigzz, Real_t *p, Real_t *q) |
| { |
| int i = hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x; |
| if (i<numElem) |
| sigxx[i] = sigyy[i] = sigzz[i] = - p[i] - q[i] ; |
| } |
| |
static inline
void InitStressTermsForElems_gpu(Index_t numElem,
                                 Real_t *sigxx, Real_t *sigyy, Real_t *sigzz)
{
    // Launch wrapper: one thread per element, 1-D grid padded up to a
    // whole number of BLOCKSIZE blocks (the kernel bounds-checks the tail).
    // Reads the device-resident pressure/viscosity arrays from meshGPU;
    // caller must have freshened them (see FG in the dispatcher).
    dim3 dimBlock(BLOCKSIZE,1,1);
    dim3 dimGrid(PAD_DIV(numElem,dimBlock.x),1,1);
    //hipFuncSetCacheConfig(InitStressTermsForElems_kernel,hipFuncCachePreferL1); // set as default for all kernels after this one
    hipLaunchKernelGGL((InitStressTermsForElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, numElem,sigxx,sigyy,sigzz,meshGPU.m_p,meshGPU.m_q);
    HIP_DEBUGSYNC;
}
| |
| static inline |
| void InitStressTermsForElems_cpu(Index_t numElem, |
| Real_t *sigxx, Real_t *sigyy, Real_t *sigzz) |
| { |
| // |
| // pull in the stresses appropriate to the hydro integration |
| // |
| for (Index_t i = 0 ; i < numElem ; ++i){ |
| sigxx[i] = sigyy[i] = sigzz[i] = - mesh.p(i) - mesh.q(i) ; |
| } |
| } |
| |
| static inline |
| void InitStressTermsForElems(Index_t numElem, |
| Real_t *sigxx, Real_t *sigyy, Real_t *sigzz, |
| int useCPU) |
| { |
| if (useCPU) { |
| FC(p); FC(q); |
| InitStressTermsForElems_cpu(numElem,sigxx,sigyy,sigzz); |
| } |
| else { |
| FG(p); FG(q); |
| InitStressTermsForElems_gpu(numElem,sigxx,sigyy,sigzz); |
| } |
| } |
| |
__host__ __device__
static inline
void CalcElemShapeFunctionDerivatives( const Real_t* const x,
                                       const Real_t* const y,
                                       const Real_t* const z,
                                       Real_t b[][8],
                                       Real_t* const volume )
{
  // Compute the trilinear hex shape-function derivative matrix b[3][8]
  // (b[0]=d/dx, b[1]=d/dy, b[2]=d/dz terms per node) and the jacobian
  // determinant (element volume) from the eight nodal coordinates.
  // Kept byte-identical to the reference: summation order matters for
  // the "numerical consistency" the callers rely on.
  const Real_t x0 = x[0] ;   const Real_t x1 = x[1] ;
  const Real_t x2 = x[2] ;   const Real_t x3 = x[3] ;
  const Real_t x4 = x[4] ;   const Real_t x5 = x[5] ;
  const Real_t x6 = x[6] ;   const Real_t x7 = x[7] ;

  const Real_t y0 = y[0] ;   const Real_t y1 = y[1] ;
  const Real_t y2 = y[2] ;   const Real_t y3 = y[3] ;
  const Real_t y4 = y[4] ;   const Real_t y5 = y[5] ;
  const Real_t y6 = y[6] ;   const Real_t y7 = y[7] ;

  const Real_t z0 = z[0] ;   const Real_t z1 = z[1] ;
  const Real_t z2 = z[2] ;   const Real_t z3 = z[3] ;
  const Real_t z4 = z[4] ;   const Real_t z5 = z[5] ;
  const Real_t z6 = z[6] ;   const Real_t z7 = z[7] ;

  Real_t fjxxi, fjxet, fjxze;
  Real_t fjyxi, fjyet, fjyze;
  Real_t fjzxi, fjzet, fjzze;
  Real_t cjxxi, cjxet, cjxze;
  Real_t cjyxi, cjyet, cjyze;
  Real_t cjzxi, cjzet, cjzze;

  // Jacobian entries fj{x,y,z}{xi,et,ze}: mean coordinate differences
  // across the element in each natural direction.
  fjxxi = Real_t(.125) * ( (x6-x0) + (x5-x3) - (x7-x1) - (x4-x2) );
  fjxet = Real_t(.125) * ( (x6-x0) - (x5-x3) + (x7-x1) - (x4-x2) );
  fjxze = Real_t(.125) * ( (x6-x0) + (x5-x3) + (x7-x1) + (x4-x2) );

  fjyxi = Real_t(.125) * ( (y6-y0) + (y5-y3) - (y7-y1) - (y4-y2) );
  fjyet = Real_t(.125) * ( (y6-y0) - (y5-y3) + (y7-y1) - (y4-y2) );
  fjyze = Real_t(.125) * ( (y6-y0) + (y5-y3) + (y7-y1) + (y4-y2) );

  fjzxi = Real_t(.125) * ( (z6-z0) + (z5-z3) - (z7-z1) - (z4-z2) );
  fjzet = Real_t(.125) * ( (z6-z0) - (z5-z3) + (z7-z1) - (z4-z2) );
  fjzze = Real_t(.125) * ( (z6-z0) + (z5-z3) + (z7-z1) + (z4-z2) );

  /* compute cofactors */
  cjxxi =    (fjyet * fjzze) - (fjzet * fjyze);
  cjxet =  - (fjyxi * fjzze) + (fjzxi * fjyze);
  cjxze =    (fjyxi * fjzet) - (fjzxi * fjyet);

  cjyxi =  - (fjxet * fjzze) + (fjzet * fjxze);
  cjyet =    (fjxxi * fjzze) - (fjzxi * fjxze);
  cjyze =  - (fjxxi * fjzet) + (fjzxi * fjxet);

  cjzxi =    (fjxet * fjyze) - (fjyet * fjxze);
  cjzet =  - (fjxxi * fjyze) + (fjyxi * fjxze);
  cjzze =    (fjxxi * fjyet) - (fjyxi * fjxet);

  /* calculate partials :
     this need only be done for l = 0,1,2,3   since , by symmetry ,
     (6,7,4,5) = - (0,1,2,3) .
  */
  b[0][0] =   -  cjxxi  -  cjxet  -  cjxze;
  b[0][1] =      cjxxi  -  cjxet  -  cjxze;
  b[0][2] =      cjxxi  +  cjxet  -  cjxze;
  b[0][3] =   -  cjxxi  +  cjxet  -  cjxze;
  b[0][4] = -b[0][2];
  b[0][5] = -b[0][3];
  b[0][6] = -b[0][0];
  b[0][7] = -b[0][1];

  b[1][0] =   -  cjyxi  -  cjyet  -  cjyze;
  b[1][1] =      cjyxi  -  cjyet  -  cjyze;
  b[1][2] =      cjyxi  +  cjyet  -  cjyze;
  b[1][3] =   -  cjyxi  +  cjyet  -  cjyze;
  b[1][4] = -b[1][2];
  b[1][5] = -b[1][3];
  b[1][6] = -b[1][0];
  b[1][7] = -b[1][1];

  b[2][0] =   -  cjzxi  -  cjzet  -  cjzze;
  b[2][1] =      cjzxi  -  cjzet  -  cjzze;
  b[2][2] =      cjzxi  +  cjzet  -  cjzze;
  b[2][3] =   -  cjzxi  +  cjzet  -  cjzze;
  b[2][4] = -b[2][2];
  b[2][5] = -b[2][3];
  b[2][6] = -b[2][0];
  b[2][7] = -b[2][1];

  /* calculate jacobian determinant (volume) */
  *volume = Real_t(8.) * ( fjxet * cjxet + fjyet * cjyet + fjzet * cjzet);
}
| |
| __host__ __device__ |
| static inline |
| void SumElemFaceNormal(Real_t *normalX0, Real_t *normalY0, Real_t *normalZ0, |
| Real_t *normalX1, Real_t *normalY1, Real_t *normalZ1, |
| Real_t *normalX2, Real_t *normalY2, Real_t *normalZ2, |
| Real_t *normalX3, Real_t *normalY3, Real_t *normalZ3, |
| const Real_t x0, const Real_t y0, const Real_t z0, |
| const Real_t x1, const Real_t y1, const Real_t z1, |
| const Real_t x2, const Real_t y2, const Real_t z2, |
| const Real_t x3, const Real_t y3, const Real_t z3) |
| { |
| Real_t bisectX0 = Real_t(0.5) * (x3 + x2 - x1 - x0); |
| Real_t bisectY0 = Real_t(0.5) * (y3 + y2 - y1 - y0); |
| Real_t bisectZ0 = Real_t(0.5) * (z3 + z2 - z1 - z0); |
| Real_t bisectX1 = Real_t(0.5) * (x2 + x1 - x3 - x0); |
| Real_t bisectY1 = Real_t(0.5) * (y2 + y1 - y3 - y0); |
| Real_t bisectZ1 = Real_t(0.5) * (z2 + z1 - z3 - z0); |
| Real_t areaX = Real_t(0.25) * (bisectY0 * bisectZ1 - bisectZ0 * bisectY1); |
| Real_t areaY = Real_t(0.25) * (bisectZ0 * bisectX1 - bisectX0 * bisectZ1); |
| Real_t areaZ = Real_t(0.25) * (bisectX0 * bisectY1 - bisectY0 * bisectX1); |
| |
| *normalX0 += areaX; |
| *normalX1 += areaX; |
| *normalX2 += areaX; |
| *normalX3 += areaX; |
| |
| *normalY0 += areaY; |
| *normalY1 += areaY; |
| *normalY2 += areaY; |
| *normalY3 += areaY; |
| |
| *normalZ0 += areaZ; |
| *normalZ1 += areaZ; |
| *normalZ2 += areaZ; |
| *normalZ3 += areaZ; |
| } |
| |
| __host__ __device__ |
| static inline |
| void CalcElemNodeNormals(Real_t pfx[8], |
| Real_t pfy[8], |
| Real_t pfz[8], |
| const Real_t x[8], |
| const Real_t y[8], |
| const Real_t z[8]) |
| { |
| for (Index_t i = 0 ; i < 8 ; ++i) { |
| pfx[i] = Real_t(0.0); |
| pfy[i] = Real_t(0.0); |
| pfz[i] = Real_t(0.0); |
| } |
| /* evaluate face one: nodes 0, 1, 2, 3 */ |
| SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0], |
| &pfx[1], &pfy[1], &pfz[1], |
| &pfx[2], &pfy[2], &pfz[2], |
| &pfx[3], &pfy[3], &pfz[3], |
| x[0], y[0], z[0], x[1], y[1], z[1], |
| x[2], y[2], z[2], x[3], y[3], z[3]); |
| /* evaluate face two: nodes 0, 4, 5, 1 */ |
| SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0], |
| &pfx[4], &pfy[4], &pfz[4], |
| &pfx[5], &pfy[5], &pfz[5], |
| &pfx[1], &pfy[1], &pfz[1], |
| x[0], y[0], z[0], x[4], y[4], z[4], |
| x[5], y[5], z[5], x[1], y[1], z[1]); |
| /* evaluate face three: nodes 1, 5, 6, 2 */ |
| SumElemFaceNormal(&pfx[1], &pfy[1], &pfz[1], |
| &pfx[5], &pfy[5], &pfz[5], |
| &pfx[6], &pfy[6], &pfz[6], |
| &pfx[2], &pfy[2], &pfz[2], |
| x[1], y[1], z[1], x[5], y[5], z[5], |
| x[6], y[6], z[6], x[2], y[2], z[2]); |
| /* evaluate face four: nodes 2, 6, 7, 3 */ |
| SumElemFaceNormal(&pfx[2], &pfy[2], &pfz[2], |
| &pfx[6], &pfy[6], &pfz[6], |
| &pfx[7], &pfy[7], &pfz[7], |
| &pfx[3], &pfy[3], &pfz[3], |
| x[2], y[2], z[2], x[6], y[6], z[6], |
| x[7], y[7], z[7], x[3], y[3], z[3]); |
| /* evaluate face five: nodes 3, 7, 4, 0 */ |
| SumElemFaceNormal(&pfx[3], &pfy[3], &pfz[3], |
| &pfx[7], &pfy[7], &pfz[7], |
| &pfx[4], &pfy[4], &pfz[4], |
| &pfx[0], &pfy[0], &pfz[0], |
| x[3], y[3], z[3], x[7], y[7], z[7], |
| x[4], y[4], z[4], x[0], y[0], z[0]); |
| /* evaluate face six: nodes 4, 7, 6, 5 */ |
| SumElemFaceNormal(&pfx[4], &pfy[4], &pfz[4], |
| &pfx[7], &pfy[7], &pfz[7], |
| &pfx[6], &pfy[6], &pfz[6], |
| &pfx[5], &pfy[5], &pfz[5], |
| x[4], y[4], z[4], x[7], y[7], z[7], |
| x[6], y[6], z[6], x[5], y[5], z[5]); |
| } |
| |
| __host__ __device__ |
| static inline |
| void SumElemStressesToNodeForces( const Real_t B[][8], |
| const Real_t stress_xx, |
| const Real_t stress_yy, |
| const Real_t stress_zz, |
| Real_t* const fx, |
| Real_t* const fy, |
| Real_t* const fz, |
| int stride) |
| { |
| Real_t pfx0 = B[0][0] ; Real_t pfx1 = B[0][1] ; |
| Real_t pfx2 = B[0][2] ; Real_t pfx3 = B[0][3] ; |
| Real_t pfx4 = B[0][4] ; Real_t pfx5 = B[0][5] ; |
| Real_t pfx6 = B[0][6] ; Real_t pfx7 = B[0][7] ; |
| |
| Real_t pfy0 = B[1][0] ; Real_t pfy1 = B[1][1] ; |
| Real_t pfy2 = B[1][2] ; Real_t pfy3 = B[1][3] ; |
| Real_t pfy4 = B[1][4] ; Real_t pfy5 = B[1][5] ; |
| Real_t pfy6 = B[1][6] ; Real_t pfy7 = B[1][7] ; |
| |
| Real_t pfz0 = B[2][0] ; Real_t pfz1 = B[2][1] ; |
| Real_t pfz2 = B[2][2] ; Real_t pfz3 = B[2][3] ; |
| Real_t pfz4 = B[2][4] ; Real_t pfz5 = B[2][5] ; |
| Real_t pfz6 = B[2][6] ; Real_t pfz7 = B[2][7] ; |
| |
| fx[0*stride] = -( stress_xx * pfx0 ); |
| fx[1*stride] = -( stress_xx * pfx1 ); |
| fx[2*stride] = -( stress_xx * pfx2 ); |
| fx[3*stride] = -( stress_xx * pfx3 ); |
| fx[4*stride] = -( stress_xx * pfx4 ); |
| fx[5*stride] = -( stress_xx * pfx5 ); |
| fx[6*stride] = -( stress_xx * pfx6 ); |
| fx[7*stride] = -( stress_xx * pfx7 ); |
| |
| fy[0*stride] = -( stress_yy * pfy0 ); |
| fy[1*stride] = -( stress_yy * pfy1 ); |
| fy[2*stride] = -( stress_yy * pfy2 ); |
| fy[3*stride] = -( stress_yy * pfy3 ); |
| fy[4*stride] = -( stress_yy * pfy4 ); |
| fy[5*stride] = -( stress_yy * pfy5 ); |
| fy[6*stride] = -( stress_yy * pfy6 ); |
| fy[7*stride] = -( stress_yy * pfy7 ); |
| |
| fz[0*stride] = -( stress_zz * pfz0 ); |
| fz[1*stride] = -( stress_zz * pfz1 ); |
| fz[2*stride] = -( stress_zz * pfz2 ); |
| fz[3*stride] = -( stress_zz * pfz3 ); |
| fz[4*stride] = -( stress_zz * pfz4 ); |
| fz[5*stride] = -( stress_zz * pfz5 ); |
| fz[6*stride] = -( stress_zz * pfz6 ); |
| fz[7*stride] = -( stress_zz * pfz7 ); |
| } |
| |
__global__
void IntegrateStressForElems_kernel( Index_t numElem, Index_t *nodelist,
                                     Real_t *x, Real_t *y, Real_t *z,
                                     Real_t *fx_elem, Real_t *fy_elem, Real_t *fz_elem,
                                     Real_t *sigxx, Real_t *sigyy, Real_t *sigzz,
                                     Real_t *determ)
{
  // One thread per element: gather its 8 nodal coordinates, compute shape
  // function derivatives (element volume goes to determ[k]), and write the
  // per-element-corner stress forces into the fx/fy/fz_elem scratch arrays
  // (element-major layout: slot k + lnode*numElem).  A later gather kernel
  // sums these per node, avoiding write conflicts between elements.
  Real_t B[3][8] ;// shape function derivatives
  Real_t x_local[8] ;
  Real_t y_local[8] ;
  Real_t z_local[8] ;

  int k=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
  if (k<numElem) {
    // get nodal coordinates from global arrays and copy into local arrays.
    // nodelist is stored node-major: entry for (elem k, local node l) is
    // nodelist[k + l*numElem], giving coalesced loads across the warp.
    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
    {
      Index_t gnode = nodelist[k+lnode*numElem];
      x_local[lnode] = x[gnode];
      y_local[lnode] = y[gnode];
      z_local[lnode] = z[gnode];
    }

    /* Volume calculation involves extra work for numerical consistency. */
    CalcElemShapeFunctionDerivatives(x_local, y_local, z_local,
                                     B, &determ[k]);

    CalcElemNodeNormals( B[0] , B[1], B[2],
                         x_local, y_local, z_local );

    // stride = numElem scatters this element's 8 corner forces into the
    // element-major scratch arrays starting at slot k.
    SumElemStressesToNodeForces( B, sigxx[k], sigyy[k], sigzz[k],
                                 &fx_elem[k], &fy_elem[k], &fz_elem[k], numElem ) ;
  }
}
| |
| __global__ |
| void AddNodeForcesFromElems_kernel( Index_t numNode, |
| Int_t *nodeElemCount, Index_t *nodeElemCornerList, |
| Real_t *fx_elem, Real_t *fy_elem, Real_t *fz_elem, |
| Real_t *fx_node, Real_t *fy_node, Real_t *fz_node) |
| { |
| int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x; |
| if (i<numNode) { |
| Int_t count=nodeElemCount[i]; |
| Real_t fx,fy,fz; |
| fx=fy=fz=Real_t(0.0); |
| for (int j=0;j<count;j++) { |
| Index_t elem=nodeElemCornerList[i+numNode*j]; |
| fx+=fx_elem[elem]; fy+=fy_elem[elem]; fz+=fz_elem[elem]; |
| } |
| fx_node[i]=fx; fy_node[i]=fy; fz_node[i]=fz; |
| } |
| } |
| |
| __global__ |
| void AddNodeForcesFromElems2_kernel( Index_t numNode, |
| Int_t *nodeElemCount, Index_t *nodeElemCornerList, |
| Real_t *fx_elem, Real_t *fy_elem, Real_t *fz_elem, |
| Real_t *fx_node, Real_t *fy_node, Real_t *fz_node) |
| { |
| int i=hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x; |
| if (i<numNode) { |
| Int_t count=nodeElemCount[i]; |
| Real_t fx,fy,fz; |
| fx=fy=fz=Real_t(0.0); |
| for (int j=0;j<count;j++) { |
| Index_t elem=nodeElemCornerList[i+numNode*j]; |
| fx+=fx_elem[elem]; fy+=fy_elem[elem]; fz+=fz_elem[elem]; |
| } |
| fx_node[i]+=fx; fy_node[i]+=fy; fz_node[i]+=fz; |
| } |
| } |
| |
| static inline |
| void IntegrateStressForElems_gpu( Index_t numElem, |
| Real_t *sigxx, Real_t *sigyy, Real_t *sigzz, |
| Real_t *determ, int& badvol) |
| { |
| Real_t *fx_elem,*fy_elem,*fz_elem; |
| |
| HIP( hipHostMalloc(&fx_elem,numElem*8*sizeof(Real_t)) ); |
| HIP( hipHostMalloc(&fy_elem,numElem*8*sizeof(Real_t)) ); |
| HIP( hipHostMalloc(&fz_elem,numElem*8*sizeof(Real_t)) ); |
| |
| dim3 dimBlock=dim3(BLOCKSIZE,1,1); |
| dim3 dimGrid=dim3(PAD_DIV(numElem,dimBlock.x),1,1); |
| hipLaunchKernelGGL((IntegrateStressForElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, numElem, meshGPU.m_nodelist, meshGPU.m_x, meshGPU.m_y, meshGPU.m_z, |
| fx_elem, fy_elem, fz_elem, sigxx, sigyy, sigzz, determ); |
| HIP_DEBUGSYNC; |
| |
| dimGrid=dim3(PAD_DIV(mesh.numNode(),dimBlock.x),1,1); |
| hipLaunchKernelGGL((AddNodeForcesFromElems_kernel), dim3(dimGrid), dim3(dimBlock), 0, 0, mesh.numNode(),meshGPU.m_nodeElemCount,meshGPU.m_nodeElemCornerList, |
| fx_elem,fy_elem,fz_elem,meshGPU.m_fx,meshGPU.m_fy,meshGPU.m_fz); |
| HIP_DEBUGSYNC; |
| |
| HIP( hipFree(fx_elem) ); |
| HIP( hipFree(fy_elem) ); |
| HIP( hipFree(fz_elem) ); |
| |
| // JDC -- need a reduction step to check for non-positive element volumes |
| badvol=0; |
| } |
| |
| static inline |
| void IntegrateStressForElems_cpu( Index_t numElem, |
| Real_t *sigxx, Real_t *sigyy, Real_t *sigzz, |
| Real_t *determ, int& badvol) |
| { |
| Real_t B[3][8] ;// shape function derivatives |
| Real_t x_local[8] ; |
| Real_t y_local[8] ; |
| Real_t z_local[8] ; |
| Real_t fx_local[8] ; |
| Real_t fy_local[8] ; |
| Real_t fz_local[8] ; |
| |
| // loop over all elements |
| for( Index_t k=0 ; k<numElem ; ++k ) |
| { |
| // get nodal coordinates from global arrays and copy into local arrays. |
| for( Index_t lnode=0 ; lnode<8 ; ++lnode ) |
| { |
| Index_t gnode = mesh.nodelist(k,lnode); |
| x_local[lnode] = mesh.x(gnode); |
| y_local[lnode] = mesh.y(gnode); |
| z_local[lnode] = mesh.z(gnode); |
| } |
| |
| /* Volume calculation involves extra work for numerical consistency. */ |
| CalcElemShapeFunctionDerivatives(x_local, y_local, z_local, |
| B, &determ[k]); |
| |
| CalcElemNodeNormals( B[0] , B[1], B[2], |
| x_local, y_local, z_local ); |
| |
| SumElemStressesToNodeForces( B, sigxx[k], sigyy[k], sigzz[k], |
| fx_local, fy_local, fz_local, 1 ) ; |
| |
| // copy nodal force contributions to global force arrray. |
| for( Index_t lnode=0 ; lnode<8 ; ++lnode ) |
| { |
| Index_t gnode = mesh.nodelist(k,lnode); |
| mesh.fx(gnode) += fx_local[lnode]; |
| mesh.fy(gnode) += fy_local[lnode]; |
| mesh.fz(gnode) += fz_local[lnode]; |
| } |
| } |
| |
| badvol=0; |
| for ( Index_t k=0 ; k<numElem ; ++k ) { |
| if (determ[k] <= Real_t(0.0)) { |
| badvol=1; |
| } |
| } |
| } |
| |
| static inline |
| void IntegrateStressForElems( Index_t numElem, |
| Real_t *sigxx, Real_t *sigyy, Real_t *sigzz, |
| Real_t *determ, int& badvol, int useCPU) |
| { |
| if (useCPU) { |
| FC(nodelist); FC(x); FC(y); FC(z); |
| IntegrateStressForElems_cpu(numElem,sigxx,sigyy,sigzz,determ,badvol); |
| SG(fx); SG(fy); SG(fz); |
| } |
| else { |
| FG(nodelist); FG(nodeElemCount); FG(nodeElemCornerList); |
| FG(x); FG(y); FG(z); |
| IntegrateStressForElems_gpu(numElem,sigxx,sigyy,sigzz,determ,badvol); |
| SC(fx); SC(fy); SC(fz); |
| } |
| |
| } |
| |
| |
| static inline |
| void CollectDomainNodesToElemNodes(const Index_t elemNum, |
| Real_t elemX[8], |
| Real_t elemY[8], |
| Real_t elemZ[8]) |
| { |
| Index_t nd0i = mesh.nodelist(elemNum,0) ; |
| Index_t nd1i = mesh.nodelist(elemNum,1) ; |
| Index_t nd2i = mesh.nodelist(elemNum,2) ; |
| Index_t nd3i = mesh.nodelist(elemNum,3) ; |
| Index_t nd4i = mesh.nodelist(elemNum,4) ; |
| Index_t nd5i = mesh.nodelist(elemNum,5) ; |
| Index_t nd6i = mesh.nodelist(elemNum,6) ; |
| Index_t nd7i = mesh.nodelist(elemNum,7) ; |
| |
| elemX[0] = mesh.x(nd0i); |
| elemX[1] = mesh.x(nd1i); |
| elemX[2] = mesh.x(nd2i); |
| elemX[3] = mesh.x(nd3i); |
| elemX[4] = mesh.x(nd4i); |
| elemX[5] = mesh.x(nd5i); |
| elemX[6] = mesh.x(nd6i); |
| elemX[7] = mesh.x(nd7i); |
| |
| elemY[0] = mesh.y(nd0i); |
| elemY[1] = mesh.y(nd1i); |
| elemY[2] = mesh.y(nd2i); |
| elemY[3] = mesh.y(nd3i); |
| elemY[4] = mesh.y(nd4i); |
| elemY[5] = mesh.y(nd5i); |
| elemY[6] = mesh.y(nd6i); |
| elemY[7] = mesh.y(nd7i); |
| |
| elemZ[0] = mesh.z(nd0i); |
| elemZ[1] = mesh.z(nd1i); |
| elemZ[2] = mesh.z(nd2i); |
| elemZ[3] = mesh.z(nd3i); |
| elemZ[4] = mesh.z(nd4i); |
| elemZ[5] = mesh.z(nd5i); |
| elemZ[6] = mesh.z(nd6i); |
| elemZ[7] = mesh.z(nd7i); |
| |
| } |
| |
| |
__host__
static inline
void VoluDer(const Real_t x0, const Real_t x1, const Real_t x2,
             const Real_t x3, const Real_t x4, const Real_t x5,
             const Real_t y0, const Real_t y1, const Real_t y2,
             const Real_t y3, const Real_t y4, const Real_t y5,
             const Real_t z0, const Real_t z1, const Real_t z2,
             const Real_t z3, const Real_t z4, const Real_t z5,
             Real_t* dvdx, Real_t* dvdy, Real_t* dvdz)
{
   // Partial derivative of the hex volume with respect to one node's
   // position, expressed via the coordinates of that node's six
   // neighbouring nodes (as ordered by CalcElemVolumeDerivative).
   const Real_t twelfth = Real_t(1.0) / Real_t(12.0) ;

   *dvdx =
      (y1 + y2) * (z0 + z1) - (y0 + y1) * (z1 + z2) +
      (y0 + y4) * (z3 + z4) - (y3 + y4) * (z0 + z4) -
      (y2 + y5) * (z3 + z5) + (y3 + y5) * (z2 + z5);
   // dvdy and dvdz are the same pattern with cyclic (x,y,z) swaps and
   // the sign flipped where the cyclic order reverses.
   *dvdy =
      - (x1 + x2) * (z0 + z1) + (x0 + x1) * (z1 + z2) -
      (x0 + x4) * (z3 + z4) + (x3 + x4) * (z0 + z4) +
      (x2 + x5) * (z3 + z5) - (x3 + x5) * (z2 + z5);

   *dvdz =
      - (y1 + y2) * (x0 + x1) + (y0 + y1) * (x1 + x2) -
      (y0 + y4) * (x3 + x4) + (y3 + y4) * (x0 + x4) +
      (y2 + y5) * (x3 + x5) - (y3 + y5) * (x2 + x5);

   *dvdx *= twelfth;
   *dvdy *= twelfth;
   *dvdz *= twelfth;
}
| |
// Device-side single-component volume derivative.  The inline-function
// form below is disabled ('#if 0'); the macro after '#else' is the live
// definition -- see the original note there for why.
#if 0
__device__
static inline
void VOLUDER(const Real_t a0, const Real_t a1, const Real_t a2,
             const Real_t a3, const Real_t a4, const Real_t a5,
             const Real_t b0, const Real_t b1, const Real_t b2,
             const Real_t b3, const Real_t b4, const Real_t b5,
             Real_t& dvdc)
{
   const Real_t twelfth = Real_t(1.0) / Real_t(12.0) ;

   dvdc=
      (a1 + a2) * (b0 + b1) - (a0 + a1) * (b1 + b2) +
      (a0 + a4) * (b3 + b4) - (a3 + a4) * (b0 + b4) -
      (a2 + a5) * (b3 + b5) + (a3 + a5) * (b2 + b5);
   dvdc *= twelfth;
}
#else
// Even though the above version is inlined, it seems to prohibit some kind of compiler optimization.
// This macro version uses many fewer registers and avoids spill-over into local memory.
#define VOLUDER(a0,a1,a2,a3,a4,a5,b0,b1,b2,b3,b4,b5,dvdc)\
{\
  const Real_t twelfth = Real_t(1.0) / Real_t(12.0) ;\
\
  dvdc= \
    ((a1) + (a2)) * ((b0) + (b1)) - ((a0) + (a1)) * ((b1) + (b2)) +\
    ((a0) + (a4)) * ((b3) + (b4)) - ((a3) + (a4)) * ((b0) + (b4)) -\
    ((a2) + (a5)) * ((b3) + (b5)) + ((a3) + (a5)) * ((b2) + (b5));\
  dvdc *= twelfth;\
}
#endif
| |
| __host__ |
| static inline |
| void CalcElemVolumeDerivative(Real_t dvdx[8], |
| Real_t dvdy[8], |
| Real_t dvdz[8], |
| const Real_t x[8], |
| const Real_t y[8], |
| const Real_t z[8]) |
| { |
| VoluDer(x[1], x[2], x[3], x[4], x[5], x[7], |
| y[1], y[2], y[3], y[4], y[5], y[7], |
| z[1], z[2], z[3], z[4], z[5], z[7], |
| &dvdx[0], &dvdy[0], &dvdz[0]); |
| VoluDer(x[0], x[1], x[2], x[7], x[4], x[6], |
| y[0], y[1], y[2], y[7], y[4], y[6], |
| z[0], z[1], z[2], z[7], z[4], z[6], |
| &dvdx[3], &dvdy[3], &dvdz[3]); |
| VoluDer(x[3], x[0], x[1], x[6], x[7], x[5], |
| y[3], y[0], y[1], y[6], y[7], y[5], |
| z[3], z[0], z[1], z[6], z[7], z[5], |
| &dvdx[2], &dvdy[2], &dvdz[2]); |
| VoluDer(x[2], x[3], x[0], x[5], x[6], x[4], |
| y[2], y[3], y[0], y[5], y[6], y[4], |
| z[2], z[3], z[0], z[5], z[6], z[4], |
| &dvdx[1], &dvdy[1], &dvdz[1]); |
| VoluDer(x[7], x[6], x[5], x[0], x[3], x[1], |
| y[7], y[6], y[5], y[0], y[3], y[1], |
| z[7], z[6], z[5], z[0], z[3], z[1], |
| &dvdx[4], &dvdy[4], &dvdz[4]); |
| VoluDer(x[4], x[7], x[6], x[1], x[0], x[2], |
| y[4], y[7], y[6], y[1], y[0], y[2], |
| z[4], z[7], z[6], z[1], z[0], z[2], |
| &dvdx[5], &dvdy[5], &dvdz[5]); |
| VoluDer(x[5], x[4], x[7], x[2], x[1], x[3], |
| y[5], y[4], y[7], y[2], y[1], y[3], |
| z[5], z[4], z[7], z[2], z[1], z[3], |
| &dvdx[6], &dvdy[6], &dvdz[6]); |
| VoluDer(x[6], x[5], x[4], x[3], x[2], x[0], |
| y[6], y[5], y[4], y[3], y[2], y[0], |
| z[6], z[5], z[4], z[3], z[2], z[0], |
| &dvdx[7], &dvdy[7], &dvdz[7]); |
| } |
| |
__device__
static inline
void CalcElemVolumeDerivative(Real_t& dvdx,
                              Real_t& dvdy,
                              Real_t& dvdz,
                              const Real_t x,
                              const Real_t y,
                              const Real_t z,
                              unsigned int node)
{
    // Device variant: each thread holds ONE node's (x,y,z) and receives
    // that node's volume derivative.  Threads cooperate through shared
    // memory staging arrays so each thread can read its six neighbours'
    // coordinates.  The 'node*32 + elem' indexing implies a 256-thread
    // block laid out as 8 node-groups of 32 elements (node = tid/32) --
    // NOTE(review): confirm blockDim.x == 256 at the launch site; any
    // other layout would index out of the staging arrays.
    __shared__ Real_t array1[256],array2[256];
    volatile Real_t *va1;
    volatile Real_t *va2;

    unsigned int idx,elem;
    unsigned int ind0,ind1,ind2,ind3,ind4,ind5;

    // Neighbour-node orderings; these match the per-node argument lists
    // of the host CalcElemVolumeDerivative / VoluDer calls.
    switch(node) {
    case 0:
      {ind0=1; ind1=2; ind2=3; ind3=4; ind4=5; ind5=7;
      break;}
    case 1:
      {ind0=2; ind1=3; ind2=0; ind3=5; ind4=6; ind5=4;
      break;}
    case 2:
      {ind0=3; ind1=0; ind2=1; ind3=6; ind4=7; ind5=5;
      break;}
    case 3:
      {ind0=0; ind1=1; ind2=2; ind3=7; ind4=4; ind5=6;
      break;}
    case 4:
      {ind0=7; ind1=6; ind2=5; ind3=0; ind4=3; ind5=1;
      break;}
    case 5:
      {ind0=4; ind1=7; ind2=6; ind3=1; ind4=0; ind5=2;
      break;}
    case 6:
      {ind0=5; ind1=4; ind2=7; ind3=2; ind4=1; ind5=3;
      break;}
    case 7:
      {ind0=6; ind1=5; ind2=4; ind3=3; ind4=2; ind5=0;
      break;}
    default:
      {ind0=ind1=ind2=ind3=ind4=ind5=0xFFFFFFFF;
      break;}
    }

    idx=hipThreadIdx_x;
    // Position of this thread's element within its 32-wide node group.
    elem=idx /*& 0x1F*/ - node*32;

    va1=&array1[0];
    va2=&array2[0];

    // Stage y and z for the whole block, then evaluate dV/dx from the
    // six neighbours.  Barriers bracket every shared write/read phase.
    __syncthreads();
    va1[idx]=y; va2[idx]=z;
    __syncthreads();
    VOLUDER(va1[ind0*32+elem],va1[ind1*32+elem],va1[ind2*32+elem],
            va1[ind3*32+elem],va1[ind4*32+elem],va1[ind5*32+elem],
            va2[ind0*32+elem],va2[ind1*32+elem],va2[ind2*32+elem],
            va2[ind3*32+elem],va2[ind4*32+elem],va2[ind5*32+elem],
            dvdx);

    // Overwrite the y staging area with x (z stays in va2) for dV/dy.
    __syncthreads();
    va1[idx]=x;
    __syncthreads();
    VOLUDER(va2[ind0*32+elem],va2[ind1*32+elem],va2[ind2*32+elem],
            va2[ind3*32+elem],va2[ind4*32+elem],va2[ind5*32+elem],
            va1[ind0*32+elem],va1[ind1*32+elem],va1[ind2*32+elem],
            va1[ind3*32+elem],va1[ind4*32+elem],va1[ind5*32+elem],
            dvdy);
    __syncthreads();

    // Overwrite the z staging area with y (x stays in va1) for dV/dz.
    __syncthreads();
    va2[idx]=y;
    __syncthreads();
    VOLUDER(va1[ind0*32+elem],va1[ind1*32+elem],va1[ind2*32+elem],
            va1[ind3*32+elem],va1[ind4*32+elem],va1[ind5*32+elem],
            va2[ind0*32+elem],va2[ind1*32+elem],va2[ind2*32+elem],
            va2[ind3*32+elem],va2[ind4*32+elem],va2[ind5*32+elem],
            dvdz);
    __syncthreads();
}
| |
| __host__ |
| static inline |
| void CalcElemFBHourglassForce(Real_t *xd, Real_t *yd, Real_t *zd, Real_t *hourgam0, |
| Real_t *hourgam1, Real_t *hourgam2, Real_t *hourgam3, |
| Real_t *hourgam4, Real_t *hourgam5, Real_t *hourgam6, |
| Real_t *hourgam7, Real_t coefficient, |
| Real_t *hgfx, Real_t *hgfy, Real_t *hgfz ) |
| { |
| Index_t i00=0; |
| Index_t i01=1; |
| Index_t i02=2; |
| Index_t i03=3; |
| |
| Real_t h00 = |
| hourgam0[i00] * xd[0] + hourgam1[i00] * xd[1] + |
| hourgam2[i00] * xd[2] + hourgam3[i00] * xd[3] + |
| hourgam4[i00] * xd[4] + hourgam5[i00] * xd[5] + |
| hourgam6[i00] * xd[6] + hourgam7[i00] * xd[7]; |
| |
| Real_t h01 = |
| hourgam0[i01] * xd[0] + hourgam1[i01] * xd[1] + |
| hourgam2[i01] * xd[2] + hourgam3[i01] * xd[3] + |
| hourgam4[i01] * xd[4] + hourgam5[i01] * xd[5] + |
| hourgam6[i01] * xd[6] + hourgam7[i01] * xd[7]; |
| |
| Real_t h02 = |
| hourgam0[i02] * xd[0] + hourgam1[i02] * xd[1]+ |
| hourgam2[i02] * xd[2] + hourgam3[i02] * xd[3]+ |
| hourgam4[i02] * xd[4] + hourgam5[i02] * xd[5]+ |
| hourgam6[i02] * xd[6] + hourgam7[i02] * xd[7]; |
| |
| Real_t h03 = |
| hourgam0[i03] * xd[0] + hourgam1[i03] * xd[1] + |
| hourgam2[i03] * xd[2] + hourgam3[i03] * xd[3] + |
| hourgam4[i03] * xd[4] + hourgam5[i03] * xd[5] + |
| hourgam6[i03] * xd[6] + hourgam7[i03] * xd[7]; |
| |
| hgfx[0] = coefficient * |
| (hourgam0[i00] * h00 + hourgam0[i01] * h01 + |
| hourgam0[i02] * h02 + hourgam0[i03] * h03); |
| |
| hgfx[1] = coefficient * |
| (hourgam1[i00] * h00 + hourgam1[i01] * h01 + |
| hourgam1[i02] * h02 + hourgam1[i03] * h03); |
| |
| hgfx[2] = coefficient * |
| (hourgam2[i00] * h00 + hourgam2[i01] * h01 + |
| hourgam2[i02] * h02 + hourgam2[i03] * h03); |
| |
| hgfx[3] = coefficient * |
| (hourgam3[i00] * h00 + hourgam3[i01] * h01 + |
| hourgam3[i02] * h02 + hourgam3[i03] * h03); |
| |
| hgfx[4] = coefficient * |
| (hourgam4[i00] * h00 + hourgam4[i01] * h01 + |
| hourgam4[i02] * h02 + hourgam4[i03] * h03); |
| |
| hgfx[5] = coefficient * |
| (hourgam5[i00] * h00 + hourgam5[i01] * h01 + |
| hourgam5[i02] * h02 + hourgam5[i03] * h03); |
| |
| hgfx[6] = coefficient * |
| (hourgam6[i00] * h00 + hourgam6[i01] * h01 + |
| hourgam6[i02] * h02 + hourgam6[i03] * h03); |
| |
| hgfx[7] = coefficient * |
| (hourgam7[i00] * h00 + hourgam7[i01] * h01 + |
| hourgam7[i02] * h02 + hourgam7[i03] * h03); |
| |
| h00 = |
| hourgam0[i00] * yd[0] + hourgam1[i00] * yd[1] + |
| hourgam2[i00] * yd[2] + hourgam3[i00] * yd[3] + |
| hourgam4[i00] * yd[4] + hourgam5[i00] * yd[5] + |
| hourgam6[i00] * yd[6] + hourgam7[i00] * yd[7]; |
| |
| h01 = |
| hourgam0[i01] * yd[0] + hourgam1[i01] * yd[1] + |
| hourgam2[i01] * yd[2] + hourgam3[i01] * yd[3] + |
| hourgam4[i01] * yd[4] + hourgam5[i01] * yd[5] + |
| hourgam6[i01] * yd[6] + hourgam7[i01] * yd[7]; |
| |
| h02 = |
| hourgam0[i02] * yd[0] + hourgam1[i02] * yd[1]+ |
| hourgam2[i02] * yd[2] + hourgam3[i02] * yd[3]+ |
| hourgam4[i02] * yd[4] + hourgam5[i02] * yd[5]+ |
| hourgam6[i02] * yd[6] + hourgam7[i02] * yd[7]; |
| |
| h03 = |
| hourgam0[i03] * yd[0] + hourgam1[i03] * yd[1] + |
| hourgam2[i03] * yd[2] + hourgam3[i03] * yd[3] + |
| hourgam4[i03] * yd[4] + hourgam5[i03] * yd[5] + |
| hourgam6[i03] * yd[6] + hourgam7[i03] * yd[7]; |
| |
| |
| hgfy[0] = coefficient * |
| (hourgam0[i00] * h00 + hourgam0[i01] * h01 + |
| hourgam0[i02] * h02 + hourgam0[i03] * h03); |
| |
| hgfy[1] = coefficient * |
| (hourgam1[i00] * h00 + hourgam1[i01] * h01 + |
| hourgam1[i02] * h02 + hourgam1[i03] * h03); |
| |
| hgfy[2] = coefficient * |
| (hourgam2[i00] * h00 + hourgam2[i01] * h01 + |
| hourgam2[i02] * h02 + hourgam2[i03] * h03); |
| |
| hgfy[3] = coefficient * |
| (hourgam3[i00] * h00 + hourgam3[i01] * h01 + |
| hourgam3[i02] * h02 + hourgam3[i03] * h03); |
| |
| hgfy[4] = coefficient * |
| (hourgam4[i00] * h00 + hourgam4[i01] * h01 + |
| hourgam4[i02] * h02 + hourgam4[i03] * h03); |
| |
| hgfy[5] = coefficient * |
| (hourgam5[i00] * h00 + hourgam5[i01] * h01 + |
| hourgam5[i02] * h02 + hourgam5[i03] * h03); |
| |
| hgfy[6] = coefficient * |
| (hourgam6[i00] * h00 + hourgam6[i01] * h01 + |
| hourgam6[i02] * h02 + hourgam6[i03] * h03); |
| |
| hgfy[7] = coefficient * |
| (hourgam7[i00] * h00 + hourgam7[i01] * h01 + |
| hourgam7[i02] * h02 + hourgam7[i03] * h03); |
| |
| h00 = |
| hourgam0[i00] * zd[0] + hourgam1[i00] * zd[1] + |
| hourgam2[i00] * zd[2] + hourgam3[i00] * zd[3] + |
| hourgam4[i00] * zd[4] + hourgam5[i00] * zd[5] + |
| hourgam6[i00] * zd[6] + hourgam7[i00] * zd[7]; |
| |
| h01 = |
| hourgam0[i01] * zd[0] + hourgam1[i01] * zd[1] + |
| hourgam2[i01] * zd[2] + hourgam3[i01] * zd[3] + |
| hourgam4[i01] * zd[4] + hourgam5[i01] * zd[5] + |
| hourgam6[i01] * zd[6] + hourgam7[i01] * zd[7]; |
| |
| h02 = |
| hourgam0[i02] * zd[0] + hourgam1[i02] * zd[1]+ |
| hourgam2[i02] * zd[2] + hourgam3[i02] * zd[3]+ |
| hourgam4[i02] * zd[4] + hourgam5[i02] * zd[5]+ |
| hourgam6[i02] * zd[6] + hourgam7[i02] * zd[7]; |
| |
| h03 = |
| hourgam0[i03] * zd[0] + hourgam1[i03] * zd[1] + |
| hourgam2[i03] * zd[2] + hourgam3[i03] * zd[3] + |
| hourgam4[i03] * zd[4] + hourgam5[i03] * zd[5] + |
| hourgam6[i03] * zd[6] + hourgam7[i03] * zd[7]; |
| |
| |
| hgfz[0] = coefficient * |
| (hourgam0[i00] * h00 + hourgam0[i01] * h01 + |
| hourgam0[i02] * h02 + hourgam0[i03] * h03); |
| |
| hgfz[1] = coefficient * |
| (hourgam1[i00] * h00 + hourgam1[i01] * h01 + |
| hourgam1[i02] * h02 + hourgam1[i03] * h03); |
| |
| hgfz[2] = coefficient * |
| (hourgam2[i00] * h00 + hourgam2[i01] * h01 + |
| hourgam2[i02] * h02 + hourgam2[i03] * h03); |
| |
| hgfz[3] = coefficient * |
| (hourgam3[i00] * h00 + hourgam3[i01] * h01 + |
| hourgam3[i02] * h02 + hourgam3[i03] * h03); |
| |
| hgfz[4] = coefficient * |
| (hourgam4[i00] * h00 + hourgam4[i01] * h01 + |
| hourgam4[i02] * h02 + hourgam4[i03] * h03); |
| |
| hgfz[5] = coefficient * |
| (hourgam5[i00] * h00 + hourgam5[i01] * h01 + |
| hourgam5[i02] * h02 + hourgam5[i03] * h03); |
| |
| hgfz[6] = coefficient * |
| (hourgam6[i00] * h00 + hourgam6[i01] * h01 + |
| hourgam6[i02] * h02 + hourgam6[i03] * h03); |
| |
| hgfz[7] = coefficient * |
| (hourgam7[i00] * h00 + hourgam7[i01] * h01 + |
| hourgam7[i02] * h02 + hourgam7[i03] * h03); |
| } |
| |
| |
__device__
static inline
Real_t SumOverNodes(Real_t val) {
  // For each of 32 elements, sum the per-node 'val' of its 8 node-threads
  // and broadcast the result back to every caller.  Shared staging array
  // is tid-indexed: 8 node-groups of 32 elements.
  __shared__ Real_t shm_array[32*8];

  // Sum up 8 node values for each element
  // Assumes 256 threads: 32 elements, 8 nodes per element.
  // NOTE: we could probably avoid some of the __syncthreads() if we map 8 nodes
  // of an element to the same warp.
  unsigned int tid=hipThreadIdx_x;

#if 1
  // Live path: tree reduction across the node dimension (stride 128/64/32),
  // leaving the per-element totals in shm_array[0..31].
#if 0
  unsigned int node=tid>>5;
  unsigned int elem=tid-(node<<5);
#elif 1
  unsigned int node=tid/32;
  unsigned int elem=tid-(node*32);
#else
  unsigned int elem=tid & 0x1F;
#endif
  __syncthreads();
  shm_array[tid]=val;
  __syncthreads();
  if (tid<128) shm_array[tid]+=shm_array[tid+128];
  __syncthreads();
  if (tid<64)  shm_array[tid]+=shm_array[tid+64];
  __syncthreads();
  if (tid<32)  shm_array[tid]+=shm_array[tid+32];
  __syncthreads();
  // All 8 node-threads of an element read the same slot; the trailing
  // barrier keeps the array safe for the next caller's writes.
  Real_t ret=shm_array[elem];
  __syncthreads();
  return ret;
#else
  // Alternative (disabled): element-major staging with an in-warp-style
  // reduction over the 8 nodes of each element.
#if 0
  unsigned int node=tid>>5;
  unsigned int elem=tid-(node<<5);
#else
  unsigned int node=tid/32;
  unsigned int elem=tid-(node*32);
#endif
  unsigned int idx=elem*8+node;
  __syncthreads();
  shm_array[idx]=val;
  __syncthreads();
  if (node<4) shm_array[idx]+=shm_array[idx+4];
  if (node<2) shm_array[idx]+=shm_array[idx+2];
  if (node<1) shm_array[idx]+=shm_array[idx+1];
  __syncthreads();
  return shm_array[elem*8];
#endif
}
| |
__device__
static inline
void CalcElemFBHourglassForce(Real_t xd,Real_t yd,Real_t zd,
                              Real_t *hourgam,Real_t coefficient,
                              Real_t &hgfx, Real_t &hgfy, Real_t &hgfz)
{
   // Flanagan-Belytschko hourglass force for one node of one element.
   // For each velocity component: project this node's velocity onto each of
   // the 4 hourglass modes, sum the projection over the element's 8 nodes
   // (block-wide SumOverNodes reduction), fold the mode sums back through
   // the same per-node mode weights, and scale by 'coefficient'.
   //
   // SumOverNodes() contains __syncthreads(), so all threads of the block
   // must execute the 12 calls below in the same order -- keep this
   // function free of divergent early exits.
   Real_t acc;

   // x component: modes 0..3
   acc = Real_t(0.);
   for (int mode = 0; mode < 4; ++mode) {
      Real_t modeSum = SumOverNodes(hourgam[mode] * xd);
      acc += hourgam[mode] * modeSum;
   }
   hgfx = acc * coefficient;

   // y component: modes 0..3
   acc = Real_t(0.);
   for (int mode = 0; mode < 4; ++mode) {
      Real_t modeSum = SumOverNodes(hourgam[mode] * yd);
      acc += hourgam[mode] * modeSum;
   }
   hgfy = acc * coefficient;

   // z component: modes 0..3
   acc = Real_t(0.);
   for (int mode = 0; mode < 4; ++mode) {
      Real_t modeSum = SumOverNodes(hourgam[mode] * zd);
      acc += hourgam[mode] * modeSum;
   }
   hgfz = acc * coefficient;
}
| |
| __global__ |
| void CalcFBHourglassForceForElems_kernel( |
| Real_t *determ, |
| Real_t *x8n, Real_t *y8n, Real_t *z8n, |
| Real_t *dvdx, Real_t *dvdy, Real_t *dvdz, |
| Real_t hourg, |
| Index_t numElem, Index_t *nodelist, |
| Real_t *ss, Real_t *elemMass, |
| Real_t *xd, Real_t *yd, Real_t *zd, |
| Real_t *fx_elem, Real_t *fy_elem, Real_t *fz_elem) |
| { |
| /************************************************* |
| * |
| * FUNCTION: Calculates the Flanagan-Belytschko anti-hourglass |
| * force. |
| * |
| *************************************************/ |
| |
| Real_t hgfx, hgfy, hgfz; |
| |
| Real_t coefficient; |
| |
| Real_t hourgam[4]; |
| Real_t xd1, yd1, zd1; |
| |
| /*************************************************/ |
| /* compute the hourglass modes */ |
| |
| const Real_t posf = Real_t( 1.); |
| const Real_t negf = Real_t(-1.); |
| |
| // Assume we will launch 256 threads, which we map to 32 elements, each |
| // with 8 per-node threads. Organize so each warp of 32 consecutive |
| // threads operates on the same node of different elements. |
| |
| |
| // THESE ARE ALL GIVING ME DIFFERENT ANSWERS IN CUDA 4.0 !!?!!?!! |
| unsigned int tid=hipThreadIdx_x; |
| unsigned int bid=hipBlockIdx_x; |
| #if 0 |
| unsigned int node=tid>>5; |
| unsigned int elem=bid<<5 + (tid - (node<<5)); |
| #elif 1 |
| unsigned int node=tid/32; |
| unsigned int elem=bid*32 + (tid-node*32); |
| #elif 0 |
| unsigned int node=tid/32;; |
| unsigned int elem=bid*32 + (tid & 0x1F); |
| #elif 0 |
| unsigned int node=tid/32; |
| unsigned int elem=bid<<5 + (tid & 0x1F); |
| #elif 0 |
| unsigned int node=tid>>5; |
| unsigned int elem=bid*32 + (tid & 0x1F); |
| #else |
| unsigned int node=tid>>5; |
| unsigned int elem=bid<<5 + (tid & 0x1F); |
| #endif |
| |
| if (elem>=numElem) elem=numElem-1; // don't return -- need thread to participate in sync operations |
| |
| //if (elem<0) elem=0; // debugging test |
| |
| Real_t volinv=Real_t(1.0)/determ[elem]; |
| Real_t ss1, mass1, volume13 ; |
| |
| Real_t xn,yn,zn,dvdxn,dvdyn,dvdzn; |
| Real_t hourmodx, hourmody, hourmodz; |
| |
| |
| #if 1 |
| xn=x8n[elem+numElem*node]; yn=y8n[elem+numElem*node]; zn=z8n[elem+numElem*node]; |
| dvdxn=dvdx[elem+numElem*node]; dvdyn=dvdy[elem+numElem*node]; dvdzn=dvdz[elem+numElem*node]; |
| #else |
| xn=yn=zn=posf; dvdxn=dvdyn=dvdzn=negf; |
| #endif |
| |
| #if 1 |
| hourmodx=xn; hourmody=yn; hourmodz=zn; |
| if (node==2 || node==3 || node==4 || node==5) { |
| hourmodx *= negf; hourmody *= negf; hourmodz *= negf; |
| hourgam[0] = negf; |
| } |
| else hourgam[0] = posf; |
| hourmodx = SumOverNodes(hourmodx); |
| hourmody = SumOverNodes(hourmody); |
| hourmodz = SumOverNodes(hourmodz); |
| hourgam[0] -= volinv*(dvdxn*hourmodx + dvdyn*hourmody + dvdzn*hourmodz); |
| |
| |
| hourmodx=xn; hourmody=yn; hourmodz=zn; |
| if (node==1 || node==2 || node==4 || node==7) { |
| hourmodx *= negf; hourmody *= negf; hourmodz *= negf; |
| hourgam[1] = negf; |
| } |
| else hourgam[1] = posf; |
| hourmodx = SumOverNodes(hourmodx); |
| hourmody = SumOverNodes(hourmody); |
| hourmodz = SumOverNodes(hourmodz); |
| hourgam[1] -= volinv*(dvdxn*hourmodx + dvdyn*hourmody + dvdzn*hourmodz); |
| |
| |
| hourmodx=xn; hourmody=yn; hourmodz=zn; |
| if (node==1 || node==3 || node==5 || node==7) { |
| hourmodx *= negf; hourmody *= negf; hourmodz *= negf; |
| hourgam[2] = negf; |
| } |
| else hourgam[2] = posf; |
| hourmodx = SumOverNodes(hourmodx); |
| hourmody = SumOverNodes(hourmody); |
| hourmodz = SumOverNodes(hourmodz); |
| hourgam[2] -= volinv*(dvdxn*hourmodx + dvdyn*hourmody + dvdzn*hourmodz); |
| |
| |
| hourmodx=xn; hourmody=yn; hourmodz=zn; |
| if (node==0 || node==2 || node==5 || node==7) { |
| hourmodx *= negf; hourmody *= negf; hourmodz *= negf; |
| hourgam[3] = negf; |
| } |
| else hourgam[3] = posf; |
| hourmodx = SumOverNodes(hourmodx); |
| hourmody = SumOverNodes(hourmody); |
| hourmodz = SumOverNodes(hourmodz); |
| hourgam[3] -= volinv*(dvdxn*hourmodx + dvdyn*hourmody + dvdzn*hourmodz); |
| |
| |
| /* compute forces */ |
| /* store forces into h arrays (force arrays) */ |
| |
| ss1=ss[elem]; |
| |