resources: Update HACC for ROCm 4.0 and gfx803
This patch updates HACC to build under ROCm 4 and Ubuntu 20 by
updating the environment variables in the Dockerfile to reflect the
new ROCm, MPI, and OpenMP paths.
Additionally, this sets the HCC_AMDGPU_TARGET environment variable to
build HACC for both gfx801 and gfx803 at the same time.
Finally, this patch updates the HACC source code to be compatible with
ROCm 4 and to work around bugs between gem5 and ROCm 4 by replacing
malloc and hipMalloc calls with hipHostMalloc, and hipDeviceSynchronize
calls with hipStreamSynchronize.
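For reference, a minimal sketch of the replacement pattern (hypothetical
kernel and buffer names; the actual changes are in the diff below):

    #include "hip/hip_runtime.h"

    __global__ void scaleKernel(float* buf, int n) {
        int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
        if (i < n) buf[i] *= 2.0f;
    }

    void runExample(hipStream_t stream, int n) {
        float* buf;
        // Was: buf = (float*)malloc(...) or hipMalloc(&buf, ...).
        // Now: pinned host memory visible to both the host and the GPU.
        hipHostMalloc(&buf, n * sizeof(float));
        // The patch also zero-fills each new allocation.
        for (int i = 0; i < n; i++)
            buf[i] = 0;

        hipLaunchKernelGGL(scaleKernel, dim3((n + 63) / 64), dim3(64), 0,
                           stream, buf, n);
        // Was: hipDeviceSynchronize().
        // Now: synchronize only the stream the work was launched on.
        hipStreamSynchronize(stream);

        // Was: free(buf) / delete[] buf / hipFree(d_buf).
        hipHostFree(buf);
    }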
The output has been validated against, and matches, a run done on a
real GPU.
Change-Id: I91929abd3fdb58ff2a022b3f41c0053435c183ba
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5-resources/+/50987
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Reviewed-by: Matthew Poremba <matthew.poremba@amd.com>
Reviewed-by: Alex Dutu <alexandru.dutu@amd.com>
Reviewed-by: Bobby R. Bruce <bbruce@ucdavis.edu>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Bobby R. Bruce <bbruce@ucdavis.edu>
Tested-by: Jason Lowe-Power <power.jg@gmail.com>
Tested-by: Bobby R. Bruce <bbruce@ucdavis.edu>
diff --git a/src/gpu/halo-finder/Dockerfile b/src/gpu/halo-finder/Dockerfile
index f02483e..e3ddf6b 100644
--- a/src/gpu/halo-finder/Dockerfile
+++ b/src/gpu/halo-finder/Dockerfile
@@ -1,20 +1,22 @@
-FROM gcr.io/gem5-test/gcn-gpu
+FROM gcr.io/gem5-test/gcn-gpu:latest
RUN apt-get update && apt-get -y install libopenmpi-dev libomp-dev
+ENV HCC_AMDGPU_TARGET="gfx801,gfx803"
+
ENV HIPCC_BIN=/opt/rocm/bin
-ENV MPI_INCLUDE=/usr/lib/openmpi/include
+ENV MPI_INCLUDE=/usr/lib/x86_64-linux-gnu/openmpi/include
ENV OPT="-O3 -g -DRCB_UNTHREADED_BUILD -DUSE_SERIAL_COSMO"
-ENV OMP="-fopenmp"
+ENV OMP="-I/usr/lib/llvm-10/include/openmp -L/usr/lib/llvm-10/lib -fopenmp"
ENV HIPCC_FLAGS="-v -ffast_math -DINLINE_FORCE -I${MPI_INCLUDE}"
-ENV HIPCC_FLAGS="-v -I${MPI_INCLUDE} -I/opt/rocm/hip/include -I/opt/rocm/hcc-1.0/include"
+ENV HIPCC_FLAGS="-v -I${MPI_INCLUDE} -I/opt/rocm/hip/include"
ENV HACC_PLATFORM="hip"
ENV HACC_OBJDIR="${HACC_PLATFORM}"
ENV HACC_CFLAGS="$OPT $OMP $HIPCC_FLAGS"
-ENV HACC_CC="${HIPCC_BIN}/hcc -x c -Xclang -std=c99"
+ENV HACC_CC="${HIPCC_BIN}/hipcc -x c -Xclang -std=c99"
ENV HACC_CXXFLAGS="$OPT $OMP $HIPCC_FLAGS"
ENV HACC_CXX="${HIPCC_BIN}/hipcc -Xclang"
@@ -26,7 +28,7 @@
ENV USE_SERIAL_COSMO="1"
ENV HACC_NUM_CUDA_DEV="1"
ENV HACC_MPI_CFLAGS="$OPT $OMP $HIPCC_FLAGS"
-ENV HACC_MPI_CC="${HIPCC_BIN}/hcc -x c -Xclang -std=c99 -Xclang -pthread"
+ENV HACC_MPI_CC="${HIPCC_BIN}/hipcc -x c -Xclang -std=c99 -Xclang -pthread"
ENV HACC_MPI_CXXFLAGS="$OPT $OMP $HIPCC_FLAGS"
ENV HACC_MPI_CXX="${HIPCC_BIN}/hipcc -Xclang -pthread"
diff --git a/src/gpu/halo-finder/src/ForceTreeTest.cxx b/src/gpu/halo-finder/src/ForceTreeTest.cxx
index 176fb1f..9a5e4ef 100644
--- a/src/gpu/halo-finder/src/ForceTreeTest.cxx
+++ b/src/gpu/halo-finder/src/ForceTreeTest.cxx
@@ -5,6 +5,7 @@
//newton should match theory prediction
//short range may not due to lack of gauss law
+#include "hip/hip_runtime.h"
#include <cassert>
#include <cmath>
#include <iostream>
@@ -79,13 +80,30 @@
float m_rsm = 0.1;
int Np = nSphere+1;
- POSVEL_T* m_xArr = new POSVEL_T[Np];
- POSVEL_T* m_yArr = new POSVEL_T[Np];
- POSVEL_T* m_zArr = new POSVEL_T[Np];
- POSVEL_T* m_vxArr = new POSVEL_T[Np];
- POSVEL_T* m_vyArr = new POSVEL_T[Np];
- POSVEL_T* m_vzArr = new POSVEL_T[Np];
- POSVEL_T* m_massArr = new POSVEL_T[Np];
+
+ POSVEL_T* m_xArr;
+ hipHostMalloc(&m_xArr, Np*sizeof(POSVEL_T));
+ POSVEL_T* m_yArr;
+ hipHostMalloc(&m_yArr, Np*sizeof(POSVEL_T));
+ POSVEL_T* m_zArr;
+ hipHostMalloc(&m_zArr, Np*sizeof(POSVEL_T));
+ POSVEL_T* m_vxArr;
+ hipHostMalloc(&m_vxArr, Np*sizeof(POSVEL_T));
+ POSVEL_T* m_vyArr;
+ hipHostMalloc(&m_vyArr, Np*sizeof(POSVEL_T));
+ POSVEL_T* m_vzArr;
+ hipHostMalloc(&m_vzArr, Np*sizeof(POSVEL_T));
+ POSVEL_T* m_massArr;
+ hipHostMalloc(&m_massArr, Np*sizeof(POSVEL_T));
+ for(int i = 0; i < Np; i++) {
+ m_xArr[i] = 0;
+ m_yArr[i] = 0;
+ m_zArr[i] = 0;
+ m_vxArr[i] = 0;
+ m_vyArr[i] = 0;
+ m_vzArr[i] = 0;
+ m_massArr[i] = 0;
+ }
FGrid *m_fg = new FGrid();
FGridEval *m_fgore = new FGridEvalFit(m_fg);
@@ -308,13 +326,13 @@
delete m_fgore;
delete m_fg;
- delete [] m_xArr;
- delete [] m_yArr;
- delete [] m_zArr;
- delete [] m_vxArr;
- delete [] m_vyArr;
- delete [] m_vzArr;
- delete [] m_massArr;
+ hipHostFree(m_xArr);
+ hipHostFree(m_yArr);
+ hipHostFree(m_zArr);
+ hipHostFree(m_vxArr);
+ hipHostFree(m_vyArr);
+ hipHostFree(m_vzArr);
+ hipHostFree(m_massArr);
#ifndef USE_SERIAL_COSMO
MPI_Finalize();
diff --git a/src/gpu/halo-finder/src/RCBForceTree.cxx b/src/gpu/halo-finder/src/RCBForceTree.cxx
index 8774902..0c18dd3 100644
--- a/src/gpu/halo-finder/src/RCBForceTree.cxx
+++ b/src/gpu/halo-finder/src/RCBForceTree.cxx
@@ -405,37 +405,47 @@
// static size for the interaction list
#define VMAX ALIGNY(16384)
- nx_v=(POSVEL_T*)malloc(VMAX*sizeof(POSVEL_T)*numThreads);
- ny_v=(POSVEL_T*)malloc(VMAX*sizeof(POSVEL_T)*numThreads);
- nz_v=(POSVEL_T*)malloc(VMAX*sizeof(POSVEL_T)*numThreads);
- nm_v=(POSVEL_T*)malloc(VMAX*sizeof(POSVEL_T)*numThreads);
+ hipHostMalloc(&nx_v, VMAX*sizeof(POSVEL_T)*numThreads);
+ hipHostMalloc(&ny_v, VMAX*sizeof(POSVEL_T)*numThreads);
+ hipHostMalloc(&nz_v, VMAX*sizeof(POSVEL_T)*numThreads);
+ hipHostMalloc(&nm_v, VMAX*sizeof(POSVEL_T)*numThreads);
+ for(int i = 0; i < VMAX*numThreads; i++) {
+ nx_v[i] = 0;
+ ny_v[i] = 0;
+ nz_v[i] = 0;
+ nm_v[i] = 0;
+ }
+
#ifdef __HIPCC__
- hipHostRegister(nx_v,VMAX*sizeof(POSVEL_T)*numThreads,0);
- hipHostRegister(ny_v,VMAX*sizeof(POSVEL_T)*numThreads,0);
- hipHostRegister(nz_v,VMAX*sizeof(POSVEL_T)*numThreads,0);
- hipHostRegister(nm_v,VMAX*sizeof(POSVEL_T)*numThreads,0);
- hipHostRegister(xx,count*sizeof(POSVEL_T),0);
- hipHostRegister(yy,count*sizeof(POSVEL_T),0);
- hipHostRegister(zz,count*sizeof(POSVEL_T),0);
- hipHostRegister(mass,count*sizeof(POSVEL_T),0);
- hipHostRegister(vx,count*sizeof(POSVEL_T),0);
- hipHostRegister(vy,count*sizeof(POSVEL_T),0);
- hipHostRegister(vz,count*sizeof(POSVEL_T),0);
-
int size=ALIGNY(nd);
- hipMalloc(&d_xx,size*sizeof(POSVEL_T)*numThreads);
- hipMalloc(&d_yy,size*sizeof(POSVEL_T)*numThreads);
- hipMalloc(&d_zz,size*sizeof(POSVEL_T)*numThreads);
- hipMalloc(&d_vx,size*sizeof(POSVEL_T)*numThreads);
- hipMalloc(&d_vy,size*sizeof(POSVEL_T)*numThreads);
- hipMalloc(&d_vz,size*sizeof(POSVEL_T)*numThreads);
- hipMalloc(&d_mass,size*sizeof(POSVEL_T)*numThreads);
+ hipHostMalloc(&d_xx,size*sizeof(POSVEL_T)*numThreads);
+ hipHostMalloc(&d_yy,size*sizeof(POSVEL_T)*numThreads);
+ hipHostMalloc(&d_zz,size*sizeof(POSVEL_T)*numThreads);
+ hipHostMalloc(&d_vx,size*sizeof(POSVEL_T)*numThreads);
+ hipHostMalloc(&d_vy,size*sizeof(POSVEL_T)*numThreads);
+ hipHostMalloc(&d_vz,size*sizeof(POSVEL_T)*numThreads);
+ hipHostMalloc(&d_mass,size*sizeof(POSVEL_T)*numThreads);
+ for(int i = 0; i < size*numThreads; i++) {
+ d_xx[i] = 0;
+ d_yy[i] = 0;
+ d_zz[i] = 0;
+ d_vx[i] = 0;
+ d_vy[i] = 0;
+ d_vz[i] = 0;
+ d_mass[i] = 0;
+ }
- hipMalloc(&d_nx_v,VMAX*sizeof(POSVEL_T)*numThreads);
- hipMalloc(&d_ny_v,VMAX*sizeof(POSVEL_T)*numThreads);
- hipMalloc(&d_nz_v,VMAX*sizeof(POSVEL_T)*numThreads);
- hipMalloc(&d_nm_v,VMAX*sizeof(POSVEL_T)*numThreads);
+ hipHostMalloc(&d_nx_v,VMAX*sizeof(POSVEL_T)*numThreads);
+ hipHostMalloc(&d_ny_v,VMAX*sizeof(POSVEL_T)*numThreads);
+ hipHostMalloc(&d_nz_v,VMAX*sizeof(POSVEL_T)*numThreads);
+ hipHostMalloc(&d_nm_v,VMAX*sizeof(POSVEL_T)*numThreads);
+ for(int i = 0; i < VMAX*numThreads; i++) {
+ d_nx_v[i] = 0;
+ d_ny_v[i] = 0;
+ d_nz_v[i] = 0;
+ d_nm_v[i] = 0;
+ }
cudaCheckError();
@@ -545,29 +555,17 @@
delete m_fl;
}
#ifdef __HIPCC__
- hipHostUnregister(xx);
- hipHostUnregister(yy);
- hipHostUnregister(zz);
- hipHostUnregister(mass);
- hipHostUnregister(vx);
- hipHostUnregister(vy);
- hipHostUnregister(vz);
- hipHostUnregister(nx_v);
- hipHostUnregister(ny_v);
- hipHostUnregister(nz_v);
- hipHostUnregister(nm_v);
-
- hipFree(d_xx);
- hipFree(d_yy);
- hipFree(d_zz);
- hipFree(d_vx);
- hipFree(d_vy);
- hipFree(d_vz);
- hipFree(d_mass);
- hipFree(d_nx_v);
- hipFree(d_ny_v);
- hipFree(d_nz_v);
- hipFree(d_nm_v);
+ hipHostFree(d_xx);
+ hipHostFree(d_yy);
+ hipHostFree(d_zz);
+ hipHostFree(d_vx);
+ hipHostFree(d_vy);
+ hipHostFree(d_vz);
+ hipHostFree(d_mass);
+ hipHostFree(d_nx_v);
+ hipHostFree(d_ny_v);
+ hipHostFree(d_nz_v);
+ hipHostFree(d_nm_v);
cudaCheckError();
for(int i=0;i<numThreads;i++) {
@@ -580,10 +578,10 @@
free(stream_v);
#endif
- free(nx_v);
- free(ny_v);
- free(nz_v);
- free(nm_v);
+ hipHostFree(nx_v);
+ hipHostFree(ny_v);
+ hipHostFree(nz_v);
+ hipHostFree(nm_v);
#ifdef __HIPCC__
//nvtxRangeEnd(r0);
#endif
@@ -859,9 +857,13 @@
POSVEL_T zzj[TILEX];
POSVEL_T massj[TILEX];
+ // Consolidate variables to help fit within the register limit
+ int x_idx = hipBlockIdx_x*hipBlockDim_x+hipThreadIdx_x;
+ int y_idx = hipBlockIdx_y*hipBlockDim_y+hipThreadIdx_y;
+
//loop over interior region and calculate forces.
//for each tile i
- for(int i=hipBlockIdx_y*hipBlockDim_y+hipThreadIdx_y;i<count/TILEY;i+=hipBlockDim_y*hipGridDim_y) //1 ISETP
+ for(int i=y_idx;i<count/TILEY;i+=hipBlockDim_y*hipGridDim_y) //1 ISETP
{
POSVEL_T xi[TILEY]={0}; //TILEY MOV
POSVEL_T yi[TILEY]={0}; //TILEY MOV
@@ -871,7 +873,7 @@
loadTile<false,false,TILEY>(i,count,xx,yy,zz,NULL,xxi,yyi,zzi,NULL);
//for each tile j
- for (int j=hipBlockIdx_x*hipBlockDim_x+hipThreadIdx_x;j<count1/TILEX;j+=hipBlockDim_x*hipGridDim_x) //1 ISETP
+ for (int j=x_idx;j<count1/TILEX;j+=hipBlockDim_x*hipGridDim_x) //1 ISETP
{
//load tile j, bounds check is not needed
loadTile<false,true,TILEX>(j,count1,xx1,yy1,zz1,mass1,xxj,yyj,zzj,massj);
@@ -881,7 +883,7 @@
}
//process remaining elements at the end, use TILEX=1
- for (int j=count1/TILEX*TILEX+hipBlockIdx_x*hipBlockDim_x+hipThreadIdx_x;j<count1;j+=hipBlockDim_x*hipGridDim_x) //1 ISETP
+ for (int j=count1/TILEX*TILEX+x_idx;j<count1;j+=hipBlockDim_x*hipGridDim_x) //1 ISETP
{
//load tile j, bounds check is needed, mass is needed
loadTile<true,true,1>(j,count1,xx1,yy1,zz1,mass1,xxj,yyj,zzj,massj);
@@ -900,17 +902,19 @@
#if 1
//process ramining elements in set TILEY=1
//for each tile i
- for(int i=count/TILEY*TILEY+hipBlockIdx_y*hipBlockDim_y+hipThreadIdx_y;i<count;i+=hipBlockDim_y*hipGridDim_y) //1 ISETP
+ for(int i=y_idx;i<count - count/TILEY*TILEY;i+=hipBlockDim_y*hipGridDim_y) //1 ISETP
{
+ // Taken out of the loop condition to help fit within the register limit
+ int k = i + count/TILEY*TILEY;
POSVEL_T xi[1]={0}; //1 MOV
POSVEL_T yi[1]={0}; //1 MOV
POSVEL_T zi[1]={0}; //1 MOV
//load xxi, yyi, zzi tiles, mass is not needed, bounds check is needed
- loadTile<true,false,1>(i,count,xx,yy,zz,NULL,xxi,yyi,zzi,NULL);
+ loadTile<true,false,1>(k,count,xx,yy,zz,NULL,xxi,yyi,zzi,NULL);
//for each tile j
- for (int j=hipBlockIdx_x*hipBlockDim_x+hipThreadIdx_x;j<count1/TILEX;j+=hipBlockDim_x*hipGridDim_x) //1 ISETP
+ for (int j=x_idx;j<count1/TILEX;j+=hipBlockDim_x*hipGridDim_x) //1 ISETP
{
//load tile j, bounds check is not needed
loadTile<false,true,TILEX>(j,count1,xx1,yy1,zz1,mass1,xxj,yyj,zzj,massj);
@@ -920,7 +924,7 @@
}
//process remaining elements at the end, use TILEX=1
- for (int j=count1/TILEX*TILEX+hipBlockIdx_x*hipBlockDim_x+hipThreadIdx_x;j<count1;j+=hipBlockDim_x*hipGridDim_x) //1 ISETP
+ for (int j=count1/TILEX*TILEX+x_idx;j<count1;j+=hipBlockDim_x*hipGridDim_x) //1 ISETP
{
//load tile j, bounds check is needed, mass is needed
loadTile<true,true,1>(j,count1,xx1,yy1,zz1,mass1,xxj,yyj,zzj,massj);
@@ -929,7 +933,7 @@
computeForces<1,1>(xxi,yyi,zzi,xxj,yyj,zzj,massj,xi,yi,zi,ma0,ma1,ma2,ma3,ma4,ma5,mp_rsm2,fsrrmax2);
}
- applyForce<true,1>(i,count,fcoeff,xi,yi,zi,vx,vy,vz);
+ applyForce<true,1>(k,count,fcoeff,xi,yi,zi,vx,vy,vz);
}
#endif
@@ -998,10 +1002,11 @@
checkCudaPtr(vy,"vy");
checkCudaPtr(vz,"vz");
#endif
+
hipLaunchKernelGGL(Step10_cuda_kernel, dim3(blocks), dim3(threads), 0, stream, count,count1,xx,yy,zz,mass,xx1,yy1,zz1,mass1, vx, vy, vz, fsrrmax2, rsm2, fcoeff);
cudaCheckError();
- //hipDeviceSynchronize();
+ hipStreamSynchronize(stream);
//exit(0);
#else
@@ -1051,7 +1056,7 @@
}
}
-#pragma unroll (4)
+#pragma unroll 4
for ( j = 0; j < is; j++ )
{
i = idx[j];
@@ -1062,7 +1067,7 @@
i0 = id [i]; id [i] = id [j]; id [j] = i0;
}
-#pragma unroll (4)
+#pragma unroll 4
for ( j = 0; j < is; j++ )
{
i = idx[j];
@@ -1500,7 +1505,7 @@
hipMemcpyAsync(d_nm,nm,sizeof(POSVEL_T)*SIZE,hipMemcpyHostToDevice,stream);
hipEventRecord(event,stream); //mark when transfers have finished
cudaCheckError();
- hipDeviceSynchronize();
+ hipStreamSynchronize(stream);
hipMemcpyAsync(d_xxl,xx+off,sizeof(POSVEL_T)*cnt,hipMemcpyHostToDevice,stream);
@@ -1511,13 +1516,13 @@
hipMemcpyAsync(d_vyl,vy+off,sizeof(POSVEL_T)*cnt,hipMemcpyHostToDevice,stream);
hipMemcpyAsync(d_vzl,vz+off,sizeof(POSVEL_T)*cnt,hipMemcpyHostToDevice,stream);
cudaCheckError();
- hipDeviceSynchronize();
+ hipStreamSynchronize(stream);
#endif
// Process the interaction list...
#ifdef __HIPCC__
::nbody1(cnt, SIZE, d_xxl, d_yyl, d_zzl, d_massl, d_nx, d_ny, d_nz, d_nm, d_vxl, d_vyl, d_vzl, m_fl, m_fcoeff, fsrrmax, rsm, stream);
- hipDeviceSynchronize();
+ hipStreamSynchronize(stream);
#else
::nbody1(cnt, SIZE, xx + off, yy + off, zz + off, mass + off, nx, ny, nz, nm, vx + off, vy + off, vz + off, m_fl, m_fcoeff, fsrrmax, rsm);
#endif
@@ -1528,7 +1533,7 @@
hipMemcpyAsync(vy+off,d_vyl,sizeof(POSVEL_T)*cnt,hipMemcpyDeviceToHost,stream);
hipMemcpyAsync(vz+off,d_vzl,sizeof(POSVEL_T)*cnt,hipMemcpyDeviceToHost,stream);
cudaCheckError();
- hipDeviceSynchronize();
+ hipStreamSynchronize(stream);
#endif
}