resources: Update square for ROCm 4 and dGPU support ROCm 4 made it so previous methods of GPU memory management used in ROCm 1.6 don't work in gem5. To get around that, we use hipHostMalloc, which allocates host and device accessible memory at the same time, all in a single pointer. Additionally, because APU and dGPU code now don't allocate memory differently, the #DGPU define is removed. Finally, the Makefile is updated to build all supported GPU versions into the same binary. Change-Id: I89dd2403da0181f53961f9c05f93f46c087603a8 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5-resources/+/47979 Reviewed-by: Matthew Poremba <matthew.poremba@amd.com> Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com> Reviewed-by: Alex Dutu <alexandru.dutu@amd.com> Reviewed-by: Bobby R. Bruce <bbruce@ucdavis.edu> Maintainer: Matt Sinclair <mattdsinclair@gmail.com> Maintainer: Bobby R. Bruce <bbruce@ucdavis.edu> Tested-by: Bobby R. Bruce <bbruce@ucdavis.edu>

commit: b9976264fcf6624a40a9b758ae19b6d6b64e9dda [log] [tgz]
author: Kyle Roarty <kyleroarty1716@gmail.com> Mon Jul 12 18:42:39 2021 -0500
committer: Matt Sinclair <mattdsinclair@gmail.com> Thu Jul 22 20:50:11 2021 +0000
tree: 0b4d4bf6c63567f39c0b8e9c031e4baa6c905e2a
parent: 24bbfc3bc8a7fbbf9bc59e7807f36b8f7a94699c [diff]
diff --git a/src/gpu/square/Makefile b/src/gpu/square/Makefile
index 4f4eb82..48164aa 100644
--- a/src/gpu/square/Makefile
+++ b/src/gpu/square/Makefile

@@ -3,10 +3,10 @@
 
 BIN_DIR?= ./bin
 
-gfx8-apu: $(BIN_DIR)/square.o
+square: $(BIN_DIR)/square
 
-$(BIN_DIR)/square.o: square.cpp $(BIN_DIR)
-	$(HIPCC) --amdgpu-target=gfx801 $(CXXFLAGS) square.cpp -o $(BIN_DIR)/square.o
+$(BIN_DIR)/square: square.cpp $(BIN_DIR)
+	$(HIPCC) --amdgpu-target=gfx801,gfx803 $(CXXFLAGS) square.cpp -o $(BIN_DIR)/square
 
 $(BIN_DIR):
 	mkdir -p $(BIN_DIR)
@@ -14,4 +14,4 @@
 clean:
 	rm -rf $(BIN_DIR)
 
-.PHONY: gfx8-apu clean
+.PHONY: square clean

diff --git a/src/gpu/square/README.md b/src/gpu/square/README.md
index 3a764c1..5e3c64d 100644
--- a/src/gpu/square/README.md
+++ b/src/gpu/square/README.md

@@ -15,14 +15,15 @@
 
 ## Compiling Square
 
+By default, square will build for all supported GPU types (gfx801, gfx803)
 ```
 cd src/gpu/square
-docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make gfx8-apu
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make
 ```
 
 The compiled binary can be found in the `bin` directory.
 
-A pre-built binary can be found at <http://dist.gem5.org/dist/develop/test-progs/square/square.o>.
+A pre-built binary can be found at <http://dist.gem5.org/dist/develop/test-progs/square/square>.
 
 ## Compiling GN3_X86/gem5.opt
 
@@ -37,5 +38,5 @@
 ## Running Square on GCN3_X86/gem5.opt
 
 ```
-docker run -u $UID:$GUID --volume $(pwd):$(pwd) -w $(pwd) gcr.io/gem5-test/gcn-gpu:latest gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n <num cores> -c bin/square.o
+docker run -u $UID:$GUID --volume $(pwd):$(pwd) -w $(pwd) gcr.io/gem5-test/gcn-gpu:latest gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n 3 -c bin/square
 ```

diff --git a/src/gpu/square/square.cpp b/src/gpu/square/square.cpp
index 87bf597..cd1ce72 100644
--- a/src/gpu/square/square.cpp
+++ b/src/gpu/square/square.cpp

@@ -50,9 +50,6 @@
 
 int main(int argc, char *argv[])
 {
-#ifdef DGPU
-    float *A_d, *C_d;
-#endif
     float *A_h, *C_h;
     size_t N = 1000000;
     size_t Nbytes = N * sizeof(float);
@@ -63,38 +60,21 @@
     #ifdef __HIP_PLATFORM_HCC__
       printf ("info: architecture on AMD GPU device is: %d\n",props.gcnArch);
     #endif
-    printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
-    A_h = (float*)malloc(Nbytes);
-    CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
-    C_h = (float*)malloc(Nbytes);
-    CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
+    printf ("info: allocate host and device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+    CHECK(hipHostMalloc(&A_h, Nbytes));
+    CHECK(hipHostMalloc(&C_h, Nbytes));
     // Fill with Phi + i
     for (size_t i=0; i<N; i++)
     {
         A_h[i] = 1.618f + i;
     }
 
-#ifdef DGPU
-    printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
-    CHECK(hipMalloc(&A_d, Nbytes));
-    CHECK(hipMalloc(&C_d, Nbytes));
-
-    printf ("info: copy Host2Device\n");
-    CHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
-#endif
-
     const unsigned blocks = 512;
     const unsigned threadsPerBlock = 256;
 
     printf ("info: launch 'vector_square' kernel\n");
-#ifdef DGPU
-    hipLaunchKernelGGL(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
-
-    printf ("info: copy Device2Host\n");
-    CHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
-#else
     hipLaunchKernelGGL(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_h, A_h, N);
-#endif
+    hipDeviceSynchronize();
 
     printf ("info: check result\n");
     for (size_t i=0; i<N; i++)  {
commit	b9976264fcf6624a40a9b758ae19b6d6b64e9dda	[log] [tgz]
author	Kyle Roarty <kyleroarty1716@gmail.com>	Mon Jul 12 18:42:39 2021 -0500
committer	Matt Sinclair <mattdsinclair@gmail.com>	Thu Jul 22 20:50:11 2021 +0000
tree	0b4d4bf6c63567f39c0b8e9c031e4baa6c905e2a
parent	24bbfc3bc8a7fbbf9bc59e7807f36b8f7a94699c [diff]