resources: Update HACC for ROCm 4.0 and gfx803

This patch updates HACC to build under ROCm 4.0 and Ubuntu 20.04 by
updating the appropriate environment variables in the Dockerfile to
account for the changes to both the ROCm paths and the MPI/OpenMP paths.

Additionally, this sets the HCC_AMDGPU_TARGET environment variable to
build HACC for both gfx801 and gfx803 at the same time.

Finally, this patch updates the HACC source code to be compatible with
ROCm 4.0 and to work around bugs in the interaction between gem5 and
ROCm 4.0, by replacing malloc and hipMalloc calls with hipHostMalloc,
and hipDeviceSynchronize calls with hipStreamSynchronize.
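
As a minimal sketch (not part of the patch; the buffer and stream names
below are illustrative), the substitution pattern looks like this:

    // Sketch of the allocation/synchronization pattern used by this patch:
    // pinned host memory via hipHostMalloc instead of malloc/hipMalloc, and
    // per-stream synchronization instead of hipDeviceSynchronize.
    #include <hip/hip_runtime.h>

    int main() {
        const size_t n = 1024;
        float *buf = nullptr;

        // Before: buf = (float *)malloc(n * sizeof(float)); or hipMalloc(&buf, ...);
        // After: pinned host memory, accessible to both host and device.
        hipHostMalloc(&buf, n * sizeof(float));
        for (size_t i = 0; i < n; ++i)
            buf[i] = 0.0f;

        hipStream_t stream;
        hipStreamCreate(&stream);

        // ... hipMemcpyAsync calls / kernel launches issued on `stream` ...

        // Before: hipDeviceSynchronize();
        // After: wait only on the stream the work was issued to.
        hipStreamSynchronize(stream);

        hipStreamDestroy(stream);
        hipHostFree(buf);
        return 0;
    }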

The output has been validated against a run done on a real GPU and
matches it.

Change-Id: I91929abd3fdb58ff2a022b3f41c0053435c183ba
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5-resources/+/50987
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Reviewed-by: Matthew Poremba <matthew.poremba@amd.com>
Reviewed-by: Alex Dutu <alexandru.dutu@amd.com>
Reviewed-by: Bobby R. Bruce <bbruce@ucdavis.edu>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Bobby R. Bruce <bbruce@ucdavis.edu>
Tested-by: Jason Lowe-Power <power.jg@gmail.com>
Tested-by: Bobby R. Bruce <bbruce@ucdavis.edu>
diff --git a/src/gpu/halo-finder/Dockerfile b/src/gpu/halo-finder/Dockerfile
index f02483e..e3ddf6b 100644
--- a/src/gpu/halo-finder/Dockerfile
+++ b/src/gpu/halo-finder/Dockerfile
@@ -1,20 +1,22 @@
-FROM gcr.io/gem5-test/gcn-gpu
+FROM gcr.io/gem5-test/gcn-gpu:latest
 RUN apt-get update && apt-get -y install libopenmpi-dev libomp-dev
 
+ENV HCC_AMDGPU_TARGET="gfx801,gfx803"
+
 ENV HIPCC_BIN=/opt/rocm/bin
-ENV MPI_INCLUDE=/usr/lib/openmpi/include
+ENV MPI_INCLUDE=/usr/lib/x86_64-linux-gnu/openmpi/include
 
 ENV OPT="-O3 -g -DRCB_UNTHREADED_BUILD -DUSE_SERIAL_COSMO"
-ENV OMP="-fopenmp"
+ENV OMP="-I/usr/lib/llvm-10/include/openmp -L/usr/lib/llvm-10/lib -fopenmp"
 
 ENV HIPCC_FLAGS="-v -ffast_math -DINLINE_FORCE -I${MPI_INCLUDE}"
-ENV HIPCC_FLAGS="-v -I${MPI_INCLUDE} -I/opt/rocm/hip/include -I/opt/rocm/hcc-1.0/include"
+ENV HIPCC_FLAGS="-v -I${MPI_INCLUDE} -I/opt/rocm/hip/include"
 
 ENV HACC_PLATFORM="hip"
 ENV HACC_OBJDIR="${HACC_PLATFORM}"
 
 ENV HACC_CFLAGS="$OPT $OMP $HIPCC_FLAGS"
-ENV HACC_CC="${HIPCC_BIN}/hcc -x c -Xclang -std=c99"
+ENV HACC_CC="${HIPCC_BIN}/hipcc -x c -Xclang -std=c99"
 
 ENV HACC_CXXFLAGS="$OPT $OMP $HIPCC_FLAGS"
 ENV HACC_CXX="${HIPCC_BIN}/hipcc -Xclang"
@@ -26,7 +28,7 @@
 ENV USE_SERIAL_COSMO="1"
 ENV HACC_NUM_CUDA_DEV="1"
 ENV HACC_MPI_CFLAGS="$OPT $OMP $HIPCC_FLAGS"
-ENV HACC_MPI_CC="${HIPCC_BIN}/hcc -x c -Xclang -std=c99 -Xclang -pthread"
+ENV HACC_MPI_CC="${HIPCC_BIN}/hipcc -x c -Xclang -std=c99 -Xclang -pthread"
 
 ENV HACC_MPI_CXXFLAGS="$OPT $OMP $HIPCC_FLAGS"
 ENV HACC_MPI_CXX="${HIPCC_BIN}/hipcc -Xclang -pthread"
diff --git a/src/gpu/halo-finder/src/ForceTreeTest.cxx b/src/gpu/halo-finder/src/ForceTreeTest.cxx
index 176fb1f..9a5e4ef 100644
--- a/src/gpu/halo-finder/src/ForceTreeTest.cxx
+++ b/src/gpu/halo-finder/src/ForceTreeTest.cxx
@@ -5,6 +5,7 @@
 //newton should match theory prediction
 //short range may not due to lack of gauss law
 
+#include "hip/hip_runtime.h"
 #include <cassert>
 #include <cmath>
 #include <iostream>
@@ -79,13 +80,30 @@
   float m_rsm = 0.1;
 
   int Np = nSphere+1;
-  POSVEL_T* m_xArr = new POSVEL_T[Np];
-  POSVEL_T* m_yArr = new POSVEL_T[Np];
-  POSVEL_T* m_zArr = new POSVEL_T[Np];
-  POSVEL_T* m_vxArr = new POSVEL_T[Np];
-  POSVEL_T* m_vyArr = new POSVEL_T[Np];
-  POSVEL_T* m_vzArr = new POSVEL_T[Np];
-  POSVEL_T* m_massArr = new POSVEL_T[Np];
+
+  POSVEL_T* m_xArr;
+  hipHostMalloc(&m_xArr, Np*sizeof(POSVEL_T));
+  POSVEL_T* m_yArr;
+  hipHostMalloc(&m_yArr, Np*sizeof(POSVEL_T));
+  POSVEL_T* m_zArr;
+  hipHostMalloc(&m_zArr, Np*sizeof(POSVEL_T));
+  POSVEL_T* m_vxArr;
+  hipHostMalloc(&m_vxArr, Np*sizeof(POSVEL_T));
+  POSVEL_T* m_vyArr;
+  hipHostMalloc(&m_vyArr, Np*sizeof(POSVEL_T));
+  POSVEL_T* m_vzArr;
+  hipHostMalloc(&m_vzArr, Np*sizeof(POSVEL_T));
+  POSVEL_T* m_massArr;
+  hipHostMalloc(&m_massArr, Np*sizeof(POSVEL_T));
+  for(int i = 0; i < Np; i++) {
+        m_xArr[i] = 0;
+        m_yArr[i] = 0;
+        m_zArr[i] = 0;
+        m_vxArr[i] = 0;
+        m_vyArr[i] = 0;
+        m_vzArr[i] = 0;
+        m_massArr[i] = 0;
+  }
 
   FGrid *m_fg = new FGrid();
   FGridEval *m_fgore = new FGridEvalFit(m_fg);
@@ -308,13 +326,13 @@
   delete m_fgore;
   delete m_fg;
 
-  delete [] m_xArr;
-  delete [] m_yArr;
-  delete [] m_zArr;
-  delete [] m_vxArr;
-  delete [] m_vyArr;
-  delete [] m_vzArr;
-  delete [] m_massArr;
+  hipHostFree(m_xArr);
+  hipHostFree(m_yArr);
+  hipHostFree(m_zArr);
+  hipHostFree(m_vxArr);
+  hipHostFree(m_vyArr);
+  hipHostFree(m_vzArr);
+  hipHostFree(m_massArr);
 
 #ifndef USE_SERIAL_COSMO
   MPI_Finalize();
diff --git a/src/gpu/halo-finder/src/RCBForceTree.cxx b/src/gpu/halo-finder/src/RCBForceTree.cxx
index 8774902..0c18dd3 100644
--- a/src/gpu/halo-finder/src/RCBForceTree.cxx
+++ b/src/gpu/halo-finder/src/RCBForceTree.cxx
@@ -405,37 +405,47 @@
 
   // static size for the interaction list
   #define VMAX ALIGNY(16384)
-  nx_v=(POSVEL_T*)malloc(VMAX*sizeof(POSVEL_T)*numThreads);
-  ny_v=(POSVEL_T*)malloc(VMAX*sizeof(POSVEL_T)*numThreads);
-  nz_v=(POSVEL_T*)malloc(VMAX*sizeof(POSVEL_T)*numThreads);
-  nm_v=(POSVEL_T*)malloc(VMAX*sizeof(POSVEL_T)*numThreads);
+  hipHostMalloc(&nx_v, VMAX*sizeof(POSVEL_T)*numThreads);
+  hipHostMalloc(&ny_v, VMAX*sizeof(POSVEL_T)*numThreads);
+  hipHostMalloc(&nz_v, VMAX*sizeof(POSVEL_T)*numThreads);
+  hipHostMalloc(&nm_v, VMAX*sizeof(POSVEL_T)*numThreads);
+  for(int i = 0; i < VMAX*numThreads; i++) {
+    nx_v[i] = 0;
+    ny_v[i] = 0;
+    nz_v[i] = 0;
+    nm_v[i] = 0;
+  }
+
 
 #ifdef __HIPCC__
-  hipHostRegister(nx_v,VMAX*sizeof(POSVEL_T)*numThreads,0);
-  hipHostRegister(ny_v,VMAX*sizeof(POSVEL_T)*numThreads,0);
-  hipHostRegister(nz_v,VMAX*sizeof(POSVEL_T)*numThreads,0);
-  hipHostRegister(nm_v,VMAX*sizeof(POSVEL_T)*numThreads,0);
-  hipHostRegister(xx,count*sizeof(POSVEL_T),0);
-  hipHostRegister(yy,count*sizeof(POSVEL_T),0);
-  hipHostRegister(zz,count*sizeof(POSVEL_T),0);
-  hipHostRegister(mass,count*sizeof(POSVEL_T),0);
-  hipHostRegister(vx,count*sizeof(POSVEL_T),0);
-  hipHostRegister(vy,count*sizeof(POSVEL_T),0);
-  hipHostRegister(vz,count*sizeof(POSVEL_T),0);
-
   int size=ALIGNY(nd);
-  hipMalloc(&d_xx,size*sizeof(POSVEL_T)*numThreads);
-  hipMalloc(&d_yy,size*sizeof(POSVEL_T)*numThreads);
-  hipMalloc(&d_zz,size*sizeof(POSVEL_T)*numThreads);
-  hipMalloc(&d_vx,size*sizeof(POSVEL_T)*numThreads);
-  hipMalloc(&d_vy,size*sizeof(POSVEL_T)*numThreads);
-  hipMalloc(&d_vz,size*sizeof(POSVEL_T)*numThreads);
-  hipMalloc(&d_mass,size*sizeof(POSVEL_T)*numThreads);
+  hipHostMalloc(&d_xx,size*sizeof(POSVEL_T)*numThreads);
+  hipHostMalloc(&d_yy,size*sizeof(POSVEL_T)*numThreads);
+  hipHostMalloc(&d_zz,size*sizeof(POSVEL_T)*numThreads);
+  hipHostMalloc(&d_vx,size*sizeof(POSVEL_T)*numThreads);
+  hipHostMalloc(&d_vy,size*sizeof(POSVEL_T)*numThreads);
+  hipHostMalloc(&d_vz,size*sizeof(POSVEL_T)*numThreads);
+  hipHostMalloc(&d_mass,size*sizeof(POSVEL_T)*numThreads);
+  for(int i = 0; i < size*numThreads; i++) {
+    d_xx[i] = 0;
+    d_yy[i] = 0;
+    d_zz[i] = 0;
+    d_vx[i] = 0;
+    d_vy[i] = 0;
+    d_vz[i] = 0;
+    d_mass[i] = 0;
+  }
 
-  hipMalloc(&d_nx_v,VMAX*sizeof(POSVEL_T)*numThreads);
-  hipMalloc(&d_ny_v,VMAX*sizeof(POSVEL_T)*numThreads);
-  hipMalloc(&d_nz_v,VMAX*sizeof(POSVEL_T)*numThreads);
-  hipMalloc(&d_nm_v,VMAX*sizeof(POSVEL_T)*numThreads);
+  hipHostMalloc(&d_nx_v,VMAX*sizeof(POSVEL_T)*numThreads);
+  hipHostMalloc(&d_ny_v,VMAX*sizeof(POSVEL_T)*numThreads);
+  hipHostMalloc(&d_nz_v,VMAX*sizeof(POSVEL_T)*numThreads);
+  hipHostMalloc(&d_nm_v,VMAX*sizeof(POSVEL_T)*numThreads);
+  for(int i = 0; i < VMAX*numThreads; i++) {
+    d_nx_v[i] = 0;
+    d_ny_v[i] = 0;
+    d_nz_v[i] = 0;
+    d_nm_v[i] = 0;
+  }
   cudaCheckError();
 
 
@@ -545,29 +555,17 @@
     delete m_fl;
   }
 #ifdef __HIPCC__
-  hipHostUnregister(xx);
-  hipHostUnregister(yy);
-  hipHostUnregister(zz);
-  hipHostUnregister(mass);
-  hipHostUnregister(vx);
-  hipHostUnregister(vy);
-  hipHostUnregister(vz);
-  hipHostUnregister(nx_v);
-  hipHostUnregister(ny_v);
-  hipHostUnregister(nz_v);
-  hipHostUnregister(nm_v);
-
-  hipFree(d_xx);
-  hipFree(d_yy);
-  hipFree(d_zz);
-  hipFree(d_vx);
-  hipFree(d_vy);
-  hipFree(d_vz);
-  hipFree(d_mass);
-  hipFree(d_nx_v);
-  hipFree(d_ny_v);
-  hipFree(d_nz_v);
-  hipFree(d_nm_v);
+  hipHostFree(d_xx);
+  hipHostFree(d_yy);
+  hipHostFree(d_zz);
+  hipHostFree(d_vx);
+  hipHostFree(d_vy);
+  hipHostFree(d_vz);
+  hipHostFree(d_mass);
+  hipHostFree(d_nx_v);
+  hipHostFree(d_ny_v);
+  hipHostFree(d_nz_v);
+  hipHostFree(d_nm_v);
   cudaCheckError();
 
   for(int i=0;i<numThreads;i++) {
@@ -580,10 +578,10 @@
   free(stream_v);
 
 #endif
-  free(nx_v);
-  free(ny_v);
-  free(nz_v);
-  free(nm_v);
+  hipHostFree(nx_v);
+  hipHostFree(ny_v);
+  hipHostFree(nz_v);
+  hipHostFree(nm_v);
 #ifdef __HIPCC__
   //nvtxRangeEnd(r0);
 #endif
@@ -859,9 +857,13 @@
   POSVEL_T zzj[TILEX];
   POSVEL_T massj[TILEX];
 
+  // Consolidate variables to help fit within the register limit
+  int x_idx = hipBlockIdx_x*hipBlockDim_x+hipThreadIdx_x;
+  int y_idx = hipBlockIdx_y*hipBlockDim_y+hipThreadIdx_y;
+
   //loop over interior region and calculate forces.
   //for each tile i
- for(int i=hipBlockIdx_y*hipBlockDim_y+hipThreadIdx_y;i<count/TILEY;i+=hipBlockDim_y*hipGridDim_y)                                //1 ISETP
+ for(int i=y_idx;i<count/TILEY;i+=hipBlockDim_y*hipGridDim_y)                                //1 ISETP
   {
     POSVEL_T xi[TILEY]={0};                                                                                //TILEY MOV
     POSVEL_T yi[TILEY]={0};                                                                                //TILEY MOV
@@ -871,7 +873,7 @@
     loadTile<false,false,TILEY>(i,count,xx,yy,zz,NULL,xxi,yyi,zzi,NULL);
 
     //for each tile j
-    for (int j=hipBlockIdx_x*hipBlockDim_x+hipThreadIdx_x;j<count1/TILEX;j+=hipBlockDim_x*hipGridDim_x)                                  //1 ISETP
+    for (int j=x_idx;j<count1/TILEX;j+=hipBlockDim_x*hipGridDim_x)                                  //1 ISETP
     {
       //load tile j, bounds check is not needed
       loadTile<false,true,TILEX>(j,count1,xx1,yy1,zz1,mass1,xxj,yyj,zzj,massj);
@@ -881,7 +883,7 @@
     }
 
     //process remaining elements at the end, use TILEX=1
-    for (int j=count1/TILEX*TILEX+hipBlockIdx_x*hipBlockDim_x+hipThreadIdx_x;j<count1;j+=hipBlockDim_x*hipGridDim_x)                                  //1 ISETP
+    for (int j=count1/TILEX*TILEX+x_idx;j<count1;j+=hipBlockDim_x*hipGridDim_x)                                  //1 ISETP
     {
       //load tile j, bounds check is needed, mass is needed
       loadTile<true,true,1>(j,count1,xx1,yy1,zz1,mass1,xxj,yyj,zzj,massj);
@@ -900,17 +902,19 @@
 #if 1
   //process ramining elements in set TILEY=1
   //for each tile i
-  for(int i=count/TILEY*TILEY+hipBlockIdx_y*hipBlockDim_y+hipThreadIdx_y;i<count;i+=hipBlockDim_y*hipGridDim_y)                             //1 ISETP
+  for(int i=y_idx;i<count - count/TILEY*TILEY;i+=hipBlockDim_y*hipGridDim_y)                             //1 ISETP
   {
+    // Taken out of the loop condition to help fit within the register limit
+    int k = i + count/TILEY*TILEY;
     POSVEL_T xi[1]={0};                                                                                //1 MOV
     POSVEL_T yi[1]={0};                                                                                //1 MOV
     POSVEL_T zi[1]={0};                                                                                //1 MOV
 
     //load xxi, yyi, zzi tiles, mass is not needed, bounds check is needed
-    loadTile<true,false,1>(i,count,xx,yy,zz,NULL,xxi,yyi,zzi,NULL);
+    loadTile<true,false,1>(k,count,xx,yy,zz,NULL,xxi,yyi,zzi,NULL);
 
     //for each tile j
-    for (int j=hipBlockIdx_x*hipBlockDim_x+hipThreadIdx_x;j<count1/TILEX;j+=hipBlockDim_x*hipGridDim_x)                                  //1 ISETP
+    for (int j=x_idx;j<count1/TILEX;j+=hipBlockDim_x*hipGridDim_x)                                  //1 ISETP
     {
       //load tile j, bounds check is not needed
       loadTile<false,true,TILEX>(j,count1,xx1,yy1,zz1,mass1,xxj,yyj,zzj,massj);
@@ -920,7 +924,7 @@
     }
 
     //process remaining elements at the end, use TILEX=1
-    for (int j=count1/TILEX*TILEX+hipBlockIdx_x*hipBlockDim_x+hipThreadIdx_x;j<count1;j+=hipBlockDim_x*hipGridDim_x)                                  //1 ISETP
+    for (int j=count1/TILEX*TILEX+x_idx;j<count1;j+=hipBlockDim_x*hipGridDim_x)                                  //1 ISETP
     {
       //load tile j, bounds check is needed, mass is needed
       loadTile<true,true,1>(j,count1,xx1,yy1,zz1,mass1,xxj,yyj,zzj,massj);
@@ -929,7 +933,7 @@
       computeForces<1,1>(xxi,yyi,zzi,xxj,yyj,zzj,massj,xi,yi,zi,ma0,ma1,ma2,ma3,ma4,ma5,mp_rsm2,fsrrmax2);
     }
 
-    applyForce<true,1>(i,count,fcoeff,xi,yi,zi,vx,vy,vz);
+    applyForce<true,1>(k,count,fcoeff,xi,yi,zi,vx,vy,vz);
   }
 #endif
 
@@ -998,10 +1002,11 @@
   checkCudaPtr(vy,"vy");
   checkCudaPtr(vz,"vz");
 #endif
+
   hipLaunchKernelGGL(Step10_cuda_kernel, dim3(blocks), dim3(threads), 0, stream, count,count1,xx,yy,zz,mass,xx1,yy1,zz1,mass1, vx, vy, vz, fsrrmax2, rsm2, fcoeff);
   cudaCheckError();
 
-  //hipDeviceSynchronize();
+  hipStreamSynchronize(stream);
   //exit(0);
 #else
 
@@ -1051,7 +1056,7 @@
     }
   }
 
-#pragma unroll (4)
+#pragma unroll 4
   for ( j = 0; j < is; j++ )
   {
       i = idx[j];
@@ -1062,7 +1067,7 @@
       i0 = id  [i]; id  [i] = id  [j]; id  [j] = i0;
   }
 
-#pragma unroll (4)
+#pragma unroll 4
   for ( j = 0; j < is; j++ )
   {
       i = idx[j];
@@ -1500,7 +1505,7 @@
   hipMemcpyAsync(d_nm,nm,sizeof(POSVEL_T)*SIZE,hipMemcpyHostToDevice,stream);
   hipEventRecord(event,stream);  //mark when transfers have finished
   cudaCheckError();
-  hipDeviceSynchronize();
+  hipStreamSynchronize(stream);
 
 
   hipMemcpyAsync(d_xxl,xx+off,sizeof(POSVEL_T)*cnt,hipMemcpyHostToDevice,stream);
@@ -1511,13 +1516,13 @@
   hipMemcpyAsync(d_vyl,vy+off,sizeof(POSVEL_T)*cnt,hipMemcpyHostToDevice,stream);
   hipMemcpyAsync(d_vzl,vz+off,sizeof(POSVEL_T)*cnt,hipMemcpyHostToDevice,stream);
   cudaCheckError();
-  hipDeviceSynchronize();
+  hipStreamSynchronize(stream);
 #endif
 
   // Process the interaction list...
 #ifdef __HIPCC__
   ::nbody1(cnt, SIZE, d_xxl, d_yyl, d_zzl, d_massl, d_nx, d_ny, d_nz, d_nm, d_vxl, d_vyl, d_vzl, m_fl, m_fcoeff, fsrrmax, rsm, stream);
-  hipDeviceSynchronize();
+  hipStreamSynchronize(stream);
 #else
   ::nbody1(cnt, SIZE, xx + off, yy + off, zz + off, mass + off, nx, ny, nz, nm, vx + off, vy + off, vz + off, m_fl, m_fcoeff, fsrrmax, rsm);
 #endif
@@ -1528,7 +1533,7 @@
   hipMemcpyAsync(vy+off,d_vyl,sizeof(POSVEL_T)*cnt,hipMemcpyDeviceToHost,stream);
   hipMemcpyAsync(vz+off,d_vzl,sizeof(POSVEL_T)*cnt,hipMemcpyDeviceToHost,stream);
   cudaCheckError();
-  hipDeviceSynchronize();
+  hipStreamSynchronize(stream);
 #endif
 
 }