resources: Update heterosync for ROCm 4 and dGPU support

This patch implements workarounds for the memory management bugs that
exist in gem5 as a result of the ROCm 4 update, and it also updates the
Makefile to build for both supported GPU targets in the same binary.

The patch replaces calls to hipMalloc with hipHostMalloc, as
hipHostMalloc allocates pinned memory that is directly accessible from
both the host and the device. This removes the need for hipMemcpy,
which currently does not work for hipMalloc'd memory in gem5.
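
For illustration, the change applied throughout hipLocksImpl.h and
main.hip.cpp follows the pattern in the minimal sketch below (storage
and numLocs are stand-ins for the benchmark's real buffers and sizes;
error checking is elided, as in the original code):

    #include <hip/hip_runtime.h>

    int main() {
      const int numLocs = 256;  // stand-in for the benchmark's sizes

      // Old pattern: device-only allocation plus an explicit copy.
      // The hipMemcpy step is what currently fails in gem5:
      //   float *storage_d;
      //   hipMalloc(&storage_d, sizeof(float) * numLocs);
      //   hipMemcpy(storage_d, storage, sizeof(float) * numLocs,
      //             hipMemcpyHostToDevice);

      // New pattern: one pinned allocation that both the host and the
      // device access directly, so no hipMemcpy is needed.
      float *storage;
      hipHostMalloc(&storage, sizeof(float) * numLocs);
      for (int i = 0; i < numLocs; ++i) { storage[i] = 0.0f; }

      // ... launch kernels that read/write storage directly ...

      hipHostFree(storage);  // hipHostMalloc pairs with hipHostFree
      return 0;
    }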

This patch also updates the Makefile to build for both gfx801 and
gfx803 in the same binary.
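
Concretely, the two per-target build rules collapse into a single rule
whose one hipcc invocation emits code objects for both ISAs into a fat
binary (this is the rule from the Makefile change below; HIP_PATH, SRC,
BIN_DIR, and EXECUTABLE are the Makefile's existing variables):

    release-gfx8: $(SRC) | $(BIN_DIR)
        $(HIP_PATH)/bin/hipcc --amdgpu-target=gfx803,gfx801 $(SRC) -o $(BIN_DIR)/$(EXECUTABLE)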

Change-Id: I0e289fa89baefa09ba360c151f9c7ba70041d961
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5-resources/+/48484
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Reviewed-by: Bobby R. Bruce <bbruce@ucdavis.edu>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Bobby R. Bruce <bbruce@ucdavis.edu>
Tested-by: Bobby R. Bruce <bbruce@ucdavis.edu>
diff --git a/README.md b/README.md
index a0308d9..a5c7a0e 100644
--- a/README.md
+++ b/README.md
@@ -418,12 +418,12 @@
 ## Compilation
 ```
 cd src/gpu/heterosync
-docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make release-gfx8-apu
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make release-gfx8
 ```
 
-The release-gfx8-apu target builds for gfx801, a GCN3-based APU. There are other
-targets (release-gfx8, release) that build for GPU types that are currently unsupported
-in gem5.
+The release-gfx8 target builds for gfx801, a GCN3-based APU, and gfx803, a
+GCN3-based dGPU. The other target (release) builds for GPU types that are
+currently unsupported in gem5.
 
 ## Pre-built binary
 
diff --git a/src/gpu/heterosync/Makefile b/src/gpu/heterosync/Makefile
index a1aaad4..4eb34cf 100644
--- a/src/gpu/heterosync/Makefile
+++ b/src/gpu/heterosync/Makefile
@@ -13,10 +13,7 @@
 
 # gfx8 has a different number of bits it uses for sleeps, so compile accordingly
 release-gfx8: $(SRC) | $(BIN_DIR)
-	$(HIP_PATH)/bin/hipcc --amdgpu-target=gfx803 $(SRC) -o $(BIN_DIR)/$(EXECUTABLE)
-
-release-gfx8-apu: $(SRC) | $(BIN_DIR)
-	$(HIP_PATH)/bin/hipcc --amdgpu-target=gfx801 $(SRC) -o $(BIN_DIR)/$(EXECUTABLE)
+	$(HIP_PATH)/bin/hipcc --amdgpu-target=gfx803,gfx801 $(SRC) -o $(BIN_DIR)/$(EXECUTABLE)
 
 debug: $(SRC) | $(BIN_DIR)
 	$(HIP_PATH)/bin/hipcc -DDEBUG -g -O0 $(SRC) -o $(BIN_DIR)/$(EXECUTABLE).debug
diff --git a/src/gpu/heterosync/README.md b/src/gpu/heterosync/README.md
index 9d8468e..1fc7308 100644
--- a/src/gpu/heterosync/README.md
+++ b/src/gpu/heterosync/README.md
@@ -21,12 +21,12 @@
 ## Compilation
 ```
 cd src/gpu/heterosync
-docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make release-gfx8-apu
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make release-gfx8
 ```
 
-The release-gfx8-apu target builds for gfx801, a GCN3-based APU. There are other
-targets (release-gfx8, release) that build for GPU types that are currently unsupported
-in gem5.
+The release-gfx8 target builds for gfx801, a GCN3-based APU, and gfx803, a
+GCN3-based dGPU. The other target (release) builds for GPU types that are
+currently unsupported in gem5.
 
 ## Pre-built binary
 
diff --git a/src/gpu/heterosync/src/hipLocksImpl.h b/src/gpu/heterosync/src/hipLocksImpl.h
index b04fb37..750de8f 100644
--- a/src/gpu/heterosync/src/hipLocksImpl.h
+++ b/src/gpu/heterosync/src/hipLocksImpl.h
@@ -28,13 +28,13 @@
   cpuLockData->mutexCount             = numMutexes;
   cpuLockData->semaphoreCount         = numSemaphores;
 
-  hipMalloc(&cpuLockData->barrierBuffers,   sizeof(unsigned int) * cpuLockData->arrayStride * 2);
+  hipHostMalloc(&cpuLockData->barrierBuffers,   sizeof(unsigned int) * cpuLockData->arrayStride * 2);
 
-  hipMalloc(&cpuLockData->mutexBuffers,     sizeof(int) * cpuLockData->arrayStride * cpuLockData->mutexCount);
-  hipMalloc(&cpuLockData->mutexBufferHeads, sizeof(unsigned int) * cpuLockData->mutexCount);
-  hipMalloc(&cpuLockData->mutexBufferTails, sizeof(unsigned int) * cpuLockData->mutexCount);
+  hipHostMalloc(&cpuLockData->mutexBuffers,     sizeof(int) * cpuLockData->arrayStride * cpuLockData->mutexCount);
+  hipHostMalloc(&cpuLockData->mutexBufferHeads, sizeof(unsigned int) * cpuLockData->mutexCount);
+  hipHostMalloc(&cpuLockData->mutexBufferTails, sizeof(unsigned int) * cpuLockData->mutexCount);
 
-  hipMalloc(&cpuLockData->semaphoreBuffers, sizeof(unsigned int) * 4 * cpuLockData->semaphoreCount);
+  hipHostMalloc(&cpuLockData->semaphoreBuffers, sizeof(unsigned int) * 4 * cpuLockData->semaphoreCount);
 
   hipErr = hipGetLastError();
   checkError(hipErr, "Before memsets");
@@ -81,11 +81,11 @@
 hipError_t hipLocksDestroy()
 {
   if (cpuLockData == NULL) { return hipErrorInitializationError; }
-  hipFree(cpuLockData->mutexBuffers);
-  hipFree(cpuLockData->mutexBufferHeads);
-  hipFree(cpuLockData->mutexBufferTails);
+  hipHostFree(cpuLockData->mutexBuffers);
+  hipHostFree(cpuLockData->mutexBufferHeads);
+  hipHostFree(cpuLockData->mutexBufferTails);
 
-  hipFree(cpuLockData->semaphoreBuffers);
+  hipHostFree(cpuLockData->semaphoreBuffers);
 
   hipHostFree(cpuLockData);
 
diff --git a/src/gpu/heterosync/src/main.hip.cpp b/src/gpu/heterosync/src/main.hip.cpp
index e4a2d0f..db38cb5 100644
--- a/src/gpu/heterosync/src/main.hip.cpp
+++ b/src/gpu/heterosync/src/main.hip.cpp
@@ -1269,7 +1269,8 @@
     The atomic barrier per-CU synchronization fits inside the lock-free size
     requirements so we can reuse the same locations.
   */
-  unsigned int * perCUBarriers = (unsigned int *)malloc(sizeof(unsigned int) * (NUM_CU * MAX_WGS * 2));
+  unsigned int * perCUBarriers;
+  hipHostMalloc(&perCUBarriers, sizeof(unsigned int) * (NUM_CU * MAX_WGS * 2));
 
   int numLocsMult = 0;
   // barriers and unique semaphores have numWGs WGs accessing unique locations
@@ -1298,7 +1299,8 @@
   assert(numUniqLocsAccPerWG > 0);
   int numStorageLocs = (numLocsMult * numUniqLocsAccPerWG);
   assert(numStorageLocs > 0);
-  float * storage = (float *)malloc(sizeof(float) * numStorageLocs);
+  float * storage;
+  hipHostMalloc(&storage, sizeof(float) * numStorageLocs);
 
   fprintf(stdout, "# WGs: %d, # Ld/St: %d, # Locs Mult: %d, # Uniq Locs/WG: %d, # Storage Locs: %d\n", numWGs, NUM_LDST, numLocsMult, numUniqLocsAccPerWG, numStorageLocs);
 
@@ -1307,17 +1309,6 @@
   // initialize per-CU barriers to 0's
   for (int i = 0; i < (NUM_CU * MAX_WGS * 2); ++i) { perCUBarriers[i] = 0; }
 
-  // gpu copies of storage and perCUBarriers
-  //float elapsedTime = 0.0f;
-  unsigned int * perCUBarriers_d = NULL;
-  float * storage_d = NULL;
-
-  hipMalloc(&perCUBarriers_d, sizeof(unsigned int) * (NUM_CU * MAX_WGS * 2));
-  hipMalloc(&storage_d, sizeof(float) * numStorageLocs);
-
-  hipMemcpy(perCUBarriers_d, perCUBarriers, sizeof(unsigned int) * (NUM_CU * MAX_WGS * 2), hipMemcpyHostToDevice);
-  hipMemcpy(storage_d, storage, sizeof(float) * numStorageLocs, hipMemcpyHostToDevice);
-
   // lock variables
   hipMutex_t spinMutex, faMutex, sleepMutex, eboMutex;
   hipMutex_t spinMutex_uniq, faMutex_uniq, sleepMutex_uniq, eboMutex_uniq;
@@ -1479,52 +1470,52 @@
 
   switch (syncPrim) {
     case 0: // atomic tree barrier
-      invokeAtomicTreeBarrier(storage_d, perCUBarriers_d, NUM_ITERS);
+      invokeAtomicTreeBarrier(storage, perCUBarriers, NUM_ITERS);
       break;
     case 1: // atomic tree barrier with local exchange
-      invokeAtomicTreeBarrierLocalExch(storage_d, perCUBarriers_d, NUM_ITERS);
+      invokeAtomicTreeBarrierLocalExch(storage, perCUBarriers, NUM_ITERS);
       break;
     case 2: // lock-free barrier
-      invokeFBSTreeBarrier(storage_d, perCUBarriers_d, NUM_ITERS);
+      invokeFBSTreeBarrier(storage, perCUBarriers, NUM_ITERS);
       break;
     case 3: // lock-free barrier with local exchange
-      invokeFBSTreeBarrierLocalExch(storage_d, perCUBarriers_d, NUM_ITERS);
+      invokeFBSTreeBarrierLocalExch(storage, perCUBarriers, NUM_ITERS);
       break;
     case 4: // Spin Lock Mutex
-      invokeSpinLockMutex   (spinMutex,  storage_d, NUM_ITERS);
+      invokeSpinLockMutex   (spinMutex,  storage, NUM_ITERS);
       break;
     case 5: // Spin Lock Mutex with backoff
-      invokeEBOMutex        (eboMutex,   storage_d, NUM_ITERS);
+      invokeEBOMutex        (eboMutex,   storage, NUM_ITERS);
       break;
     case 6: // Sleeping Mutex
-      invokeSleepingMutex   (sleepMutex, storage_d, NUM_ITERS);
+      invokeSleepingMutex   (sleepMutex, storage, NUM_ITERS);
       break;
     case 7: // fetch-and-add mutex
-      invokeFetchAndAddMutex(faMutex,    storage_d, NUM_ITERS);
+      invokeFetchAndAddMutex(faMutex,    storage, NUM_ITERS);
       break;
     case 8: // spin semaphore (1)
-      invokeSpinLockSemaphore(spinSem1,   storage_d,   1, NUM_ITERS, numStorageLocs);
+      invokeSpinLockSemaphore(spinSem1,   storage,   1, NUM_ITERS, numStorageLocs);
       break;
     case 9: // spin semaphore (2)
-      invokeSpinLockSemaphore(spinSem2,   storage_d,   2, NUM_ITERS, numStorageLocs);
+      invokeSpinLockSemaphore(spinSem2,   storage,   2, NUM_ITERS, numStorageLocs);
       break;
     case 10: // spin semaphore (10)
-      invokeSpinLockSemaphore(spinSem10,   storage_d,   10, NUM_ITERS, numStorageLocs);
+      invokeSpinLockSemaphore(spinSem10,   storage,   10, NUM_ITERS, numStorageLocs);
       break;
     case 11: // spin semaphore (120)
-      invokeSpinLockSemaphore(spinSem120,   storage_d,   120, NUM_ITERS, numStorageLocs);
+      invokeSpinLockSemaphore(spinSem120,   storage,   120, NUM_ITERS, numStorageLocs);
       break;
     case 12: // spin semaphore with backoff (1)
-      invokeEBOSemaphore(eboSem1,   storage_d,     1, NUM_ITERS, numStorageLocs);
+      invokeEBOSemaphore(eboSem1,   storage,     1, NUM_ITERS, numStorageLocs);
       break;
     case 13: // spin semaphore with backoff (2)
-      invokeEBOSemaphore(eboSem2,   storage_d,     2, NUM_ITERS, numStorageLocs);
+      invokeEBOSemaphore(eboSem2,   storage,     2, NUM_ITERS, numStorageLocs);
       break;
     case 14: // spin semaphore with backoff (10)
-      invokeEBOSemaphore(eboSem10,   storage_d,   10, NUM_ITERS, numStorageLocs);
+      invokeEBOSemaphore(eboSem10,   storage,   10, NUM_ITERS, numStorageLocs);
       break;
     case 15: // spin semaphore with backoff (120)
-      invokeEBOSemaphore(eboSem120,   storage_d, 120, NUM_ITERS, numStorageLocs);
+      invokeEBOSemaphore(eboSem120,   storage, 120, NUM_ITERS, numStorageLocs);
       break;
     // cases 16-19 reserved
     case 16:
@@ -1536,40 +1527,40 @@
     case 19:
       break;
     case 20: // Spin Lock Mutex (uniq)
-      invokeSpinLockMutex_uniq   (spinMutex_uniq,  storage_d, NUM_ITERS);
+      invokeSpinLockMutex_uniq   (spinMutex_uniq,  storage, NUM_ITERS);
       break;
     case 21: // Spin Lock Mutex with backoff (uniq)
-      invokeEBOMutex_uniq        (eboMutex_uniq,   storage_d, NUM_ITERS);
+      invokeEBOMutex_uniq        (eboMutex_uniq,   storage, NUM_ITERS);
       break;
     case 22: // Sleeping Mutex (uniq)
-      invokeSleepingMutex_uniq   (sleepMutex_uniq, storage_d, NUM_ITERS);
+      invokeSleepingMutex_uniq   (sleepMutex_uniq, storage, NUM_ITERS);
       break;
     case 23: // fetch-and-add mutex (uniq)
-      invokeFetchAndAddMutex_uniq(faMutex_uniq,    storage_d, NUM_ITERS);
+      invokeFetchAndAddMutex_uniq(faMutex_uniq,    storage, NUM_ITERS);
       break;
     case 24: // spin semaphore (1) (uniq)
-      invokeSpinLockSemaphore_uniq(spinSem1_uniq,   storage_d,   1, NUM_ITERS);
+      invokeSpinLockSemaphore_uniq(spinSem1_uniq,   storage,   1, NUM_ITERS);
       break;
     case 25: // spin semaphore (2) (uniq)
-      invokeSpinLockSemaphore_uniq(spinSem2_uniq,   storage_d,   2, NUM_ITERS);
+      invokeSpinLockSemaphore_uniq(spinSem2_uniq,   storage,   2, NUM_ITERS);
       break;
     case 26: // spin semaphore (10) (uniq)
-      invokeSpinLockSemaphore_uniq(spinSem10_uniq,   storage_d,   10, NUM_ITERS);
+      invokeSpinLockSemaphore_uniq(spinSem10_uniq,   storage,   10, NUM_ITERS);
       break;
     case 27: // spin semaphore (120) (uniq)
-      invokeSpinLockSemaphore_uniq(spinSem120_uniq,   storage_d,   120, NUM_ITERS);
+      invokeSpinLockSemaphore_uniq(spinSem120_uniq,   storage,   120, NUM_ITERS);
       break;
     case 28: // spin semaphore with backoff (1) (uniq)
-      invokeEBOSemaphore_uniq(eboSem1_uniq,   storage_d,     1, NUM_ITERS);
+      invokeEBOSemaphore_uniq(eboSem1_uniq,   storage,     1, NUM_ITERS);
       break;
     case 29: // spin semaphore with backoff (2) (uniq)
-      invokeEBOSemaphore_uniq(eboSem2_uniq,   storage_d,     2, NUM_ITERS);
+      invokeEBOSemaphore_uniq(eboSem2_uniq,   storage,     2, NUM_ITERS);
       break;
     case 30: // spin semaphore with backoff (10) (uniq)
-      invokeEBOSemaphore_uniq(eboSem10_uniq,   storage_d,   10, NUM_ITERS);
+      invokeEBOSemaphore_uniq(eboSem10_uniq,   storage,   10, NUM_ITERS);
       break;
     case 31: // spin semaphore with backoff (120) (uniq)
-      invokeEBOSemaphore_uniq(eboSem120_uniq,   storage_d, 120, NUM_ITERS);
+      invokeEBOSemaphore_uniq(eboSem120_uniq,   storage, 120, NUM_ITERS);
       break;
     // cases 32-36 reserved
     case 32:
@@ -1594,9 +1585,6 @@
   // NOTE: Can end simulation here if don't care about output checking
   hipDeviceSynchronize();
 
-  // copy results back to compare to golden
-  hipMemcpy(storage, storage_d, sizeof(float) * numStorageLocs, hipMemcpyDeviceToHost);
-
   // get golden results
   float storageGolden[numStorageLocs];
   int numLocsAccessed = 0, currLoc = 0;
@@ -1777,10 +1765,8 @@
 
   // free arrays
   hipLocksDestroy();
-  hipFree(storage_d);
-  hipFree(perCUBarriers_d);
-  free(storage);
-  free(perCUBarriers);
+  hipHostFree(storage);
+  hipHostFree(perCUBarriers);
 
   return 0;
 }