resources: Update square for ROCm 4 and dGPU support

ROCm 4 made it so previous methods of GPU memory management
used in ROCm 1.6 don't work in gem5. To get around that, we
use hipHostMalloc, which allocates host and device accessible
memory at the same time, all in a single pointer.

Additionally, because APU and dGPU code now don't allocate
memory differently, the #DGPU define is removed.

Finally, the Makefile is updated to build all supported GPU
versions into the same binary.

Change-Id: I89dd2403da0181f53961f9c05f93f46c087603a8
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5-resources/+/47979
Reviewed-by: Matthew Poremba <matthew.poremba@amd.com>
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Reviewed-by: Alex Dutu <alexandru.dutu@amd.com>
Reviewed-by: Bobby R. Bruce <bbruce@ucdavis.edu>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Bobby R. Bruce <bbruce@ucdavis.edu>
Tested-by: Bobby R. Bruce <bbruce@ucdavis.edu>
diff --git a/src/gpu/square/Makefile b/src/gpu/square/Makefile
index 4f4eb82..48164aa 100644
--- a/src/gpu/square/Makefile
+++ b/src/gpu/square/Makefile
@@ -3,10 +3,10 @@
 
 BIN_DIR?= ./bin
 
-gfx8-apu: $(BIN_DIR)/square.o
+square: $(BIN_DIR)/square
 
-$(BIN_DIR)/square.o: square.cpp $(BIN_DIR)
-	$(HIPCC) --amdgpu-target=gfx801 $(CXXFLAGS) square.cpp -o $(BIN_DIR)/square.o
+$(BIN_DIR)/square: square.cpp $(BIN_DIR)
+	$(HIPCC) --amdgpu-target=gfx801,gfx803 $(CXXFLAGS) square.cpp -o $(BIN_DIR)/square
 
 $(BIN_DIR):
 	mkdir -p $(BIN_DIR)
@@ -14,4 +14,4 @@
 clean:
 	rm -rf $(BIN_DIR)
 
-.PHONY: gfx8-apu clean
+.PHONY: square clean
diff --git a/src/gpu/square/README.md b/src/gpu/square/README.md
index 3a764c1..5e3c64d 100644
--- a/src/gpu/square/README.md
+++ b/src/gpu/square/README.md
@@ -15,14 +15,15 @@
 
 ## Compiling Square
 
+By default, square will build for all supported GPU types (gfx801, gfx803)
 ```
 cd src/gpu/square
-docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make gfx8-apu
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make
 ```
 
 The compiled binary can be found in the `bin` directory.
 
-A pre-built binary can be found at <http://dist.gem5.org/dist/develop/test-progs/square/square.o>.
+A pre-built binary can be found at <http://dist.gem5.org/dist/develop/test-progs/square/square>.
 
 ## Compiling GN3_X86/gem5.opt
 
@@ -37,5 +38,5 @@
 ## Running Square on GCN3_X86/gem5.opt
 
 ```
-docker run -u $UID:$GUID --volume $(pwd):$(pwd) -w $(pwd) gcr.io/gem5-test/gcn-gpu:latest gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n <num cores> -c bin/square.o
+docker run -u $UID:$GUID --volume $(pwd):$(pwd) -w $(pwd) gcr.io/gem5-test/gcn-gpu:latest gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n 3 -c bin/square
 ```
diff --git a/src/gpu/square/square.cpp b/src/gpu/square/square.cpp
index 87bf597..cd1ce72 100644
--- a/src/gpu/square/square.cpp
+++ b/src/gpu/square/square.cpp
@@ -50,9 +50,6 @@
 
 int main(int argc, char *argv[])
 {
-#ifdef DGPU
-    float *A_d, *C_d;
-#endif
     float *A_h, *C_h;
     size_t N = 1000000;
     size_t Nbytes = N * sizeof(float);
@@ -63,38 +60,21 @@
     #ifdef __HIP_PLATFORM_HCC__
       printf ("info: architecture on AMD GPU device is: %d\n",props.gcnArch);
     #endif
-    printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
-    A_h = (float*)malloc(Nbytes);
-    CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
-    C_h = (float*)malloc(Nbytes);
-    CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
+    printf ("info: allocate host and device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+    CHECK(hipHostMalloc(&A_h, Nbytes));
+    CHECK(hipHostMalloc(&C_h, Nbytes));
     // Fill with Phi + i
     for (size_t i=0; i<N; i++)
     {
         A_h[i] = 1.618f + i;
     }
 
-#ifdef DGPU
-    printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
-    CHECK(hipMalloc(&A_d, Nbytes));
-    CHECK(hipMalloc(&C_d, Nbytes));
-
-    printf ("info: copy Host2Device\n");
-    CHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
-#endif
-
     const unsigned blocks = 512;
     const unsigned threadsPerBlock = 256;
 
     printf ("info: launch 'vector_square' kernel\n");
-#ifdef DGPU
-    hipLaunchKernelGGL(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
-
-    printf ("info: copy Device2Host\n");
-    CHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
-#else
     hipLaunchKernelGGL(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_h, A_h, N);
-#endif
+    hipDeviceSynchronize();
 
     printf ("info: check result\n");
     for (size_t i=0; i<N; i++)  {