resources: Merge branch 'release-staging-v21-1' into stable

Change-Id: Ib7089a0ccae8a15b8338887ac3aa945fc9e03836
diff --git a/README.md b/README.md
index 53c2f7f..88d8abc 100644
--- a/README.md
+++ b/README.md
@@ -35,7 +35,7 @@
 
 The compiled resources for gem5 can be found under
 http://dist.gem5.org/dist/{VERSION}. E.g. compiled resources for gem5 v20.2
-are under http://dist.gem5.org/dist/v20-1 and are compiled from
+are under http://dist.gem5.org/dist/v20-2 and are compiled from
 gem5-resources v20.2. http://dist.gem5.org/dist/develop is kept in sync
 with the develop branch, and therefore should not be depended upon for stable,
 regular usage.
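+
+For example, a specific pre-built resource can be downloaded directly over HTTP (a minimal sketch; any download tool works):
+
+```
+wget http://dist.gem5.org/dist/v21-1/test-progs/riscv-tests/dhrystone.riscv
+```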
@@ -96,29 +96,29 @@
 
 ### RISCV Tests Pre-built binaries
 
-<http://dist.gem5.org/dist/v21-0/test-progs/riscv-tests/dhrystone.riscv>
+<http://dist.gem5.org/dist/v21-1/test-progs/riscv-tests/dhrystone.riscv>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/riscv-tests/median.riscv>
+<http://dist.gem5.org/dist/v21-1/test-progs/riscv-tests/median.riscv>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/riscv-tests/mm.riscv>
+<http://dist.gem5.org/dist/v21-1/test-progs/riscv-tests/mm.riscv>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/riscv-tests/mt-matmul.riscv>
+<http://dist.gem5.org/dist/v21-1/test-progs/riscv-tests/mt-matmul.riscv>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/riscv-tests/mt-vvadd.riscv>
+<http://dist.gem5.org/dist/v21-1/test-progs/riscv-tests/mt-vvadd.riscv>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/riscv-tests/multiply.riscv>
+<http://dist.gem5.org/dist/v21-1/test-progs/riscv-tests/multiply.riscv>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/riscv-tests/pmp.riscv>
+<http://dist.gem5.org/dist/v21-1/test-progs/riscv-tests/pmp.riscv>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/riscv-tests/qsort.riscv>
+<http://dist.gem5.org/dist/v21-1/test-progs/riscv-tests/qsort.riscv>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/riscv-tests/rsort.riscv>
+<http://dist.gem5.org/dist/v21-1/test-progs/riscv-tests/rsort.riscv>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/riscv-tests/spmv.riscv>
+<http://dist.gem5.org/dist/v21-1/test-progs/riscv-tests/spmv.riscv>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/riscv-tests/towers.riscv>
+<http://dist.gem5.org/dist/v21-1/test-progs/riscv-tests/towers.riscv>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/riscv-tests/vvadd.riscv>
+<http://dist.gem5.org/dist/v21-1/test-progs/riscv-tests/vvadd.riscv>
 
 ## Resource: simple
 
@@ -241,86 +241,85 @@
 
 ### simple Pre-built binaries
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/x86/test_pthread_create_seq>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/x86/test_pthread_create_seq>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/x86/test_pthread_create_para>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/x86/test_pthread_create_para>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/x86/test_pthread_mutex>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/x86/test_pthread_mutex>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/x86/test_atomic>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/x86/test_atomic>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/x86/test_pthread_cond>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/x86/test_pthread_cond>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/x86/test_std_thread>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/x86/test_std_thread>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/x86/test_std_mutex>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/x86/test_std_mutex>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/x86/test_std_condition_variable>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/x86/test_std_condition_variable>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/aarch32/test_pthread_create_seq>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch32/test_pthread_create_seq>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/aarch32/test_pthread_create_para>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch32/test_pthread_create_para>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/aarch32/test_pthread_mutex>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch32/test_pthread_mutex>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/aarch32/test_atomic>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch32/test_atomic>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/aarch32/test_pthread_cond>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch32/test_pthread_cond>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/aarch32/test_std_thread>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch32/test_std_thread>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/aarch32/test_std_mutex>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch32/test_std_mutex>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/aarch32/test_std_condition_variable>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch32/test_std_condition_variable>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/aarch64/test_pthread_create_seq>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch64/test_pthread_create_seq>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/aarch64/test_pthread_create_para>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch64/test_pthread_create_para>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/aarch64/test_pthread_mutex>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch64/test_pthread_mutex>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/aarch64/test_atomic>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch64/test_atomic>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/aarch64/test_pthread_cond>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch64/test_pthread_cond>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/aarch64/test_std_thread>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch64/test_std_thread>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/aarch64/test_std_mutex>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch64/test_std_mutex>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/aarch64/test_std_condition_variable>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch64/test_std_condition_variable>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/riscv64/test_pthread_create_seq>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/riscv64/test_pthread_create_seq>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/riscv64/test_pthread_create_para>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/riscv64/test_pthread_create_para>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/riscv64/test_pthread_mutex>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/riscv64/test_pthread_mutex>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/riscv64/test_atomic>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/riscv64/test_atomic>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/riscv64/test_pthread_cond>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/riscv64/test_pthread_cond>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/riscv64/test_std_thread>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/riscv64/test_std_thread>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/riscv64/test_std_mutex>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/riscv64/test_std_mutex>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/riscv64/test_std_condition_variable>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/riscv64/test_std_condition_variable>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/sparc64/test_pthread_create_seq>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/sparc64/test_pthread_create_seq>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/sparc64/test_pthread_create_para>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/sparc64/test_pthread_create_para>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/sparc64/test_pthread_mutex>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/sparc64/test_pthread_mutex>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/sparc64/test_atomic>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/sparc64/test_atomic>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/sparc64/test_pthread_cond>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/sparc64/test_pthread_cond>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/sparc64/test_std_thread>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/sparc64/test_std_thread>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/sparc64/test_std_mutex>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/sparc64/test_std_mutex>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pthreads/sparc64/test_std_condition_variable>
-
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/sparc64/test_std_condition_variable>
 
 ## Resource: Square
 
@@ -329,15 +328,15 @@
 To compile:
 
 ```
-cd src/square
+cd src/gpu/square
 docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make gfx8-apu
 ```
 
-The compiled binary can be found in `src/square/bin`
+The compiled binary can be found in `src/gpu/square/bin`
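+
+A minimal sketch of running the compiled kernel in gem5 (assuming gem5 has been built with the GCN3_X86 architecture, as described for the other GPU resources below; the binary name is assumed to match the pre-built `square.o` linked underneath):
+
+```
+# Assuming gem5 and gem5-resources are in your working directory
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n3 -c gem5-resources/src/gpu/square/bin/square.o
+```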
 
 ### Square Pre-built binary
 
-<http://dist.gem5.org/dist/v21-0/test-progs/square/square.o>
+<http://dist.gem5.org/dist/v21-1/test-progs/square/square.o>
 
 # Resource: HSA Agent Packet Example
 
@@ -358,11 +357,11 @@
 To compile:
 
 ```
-cd src/hsa-agent-pkt
+cd src/gpu/hsa-agent-pkt
 docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make gfx8-apu
 ```
 
-The compiled binary can be found in `src/hsa-agent-pkt/bin`
+The compiled binary can be found in `src/gpu/hsa-agent-pkt/bin`
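+
+The binary can be run in gem5 analogously to the square example above (a sketch; the exact binary name under `src/gpu/hsa-agent-pkt/bin` is an assumption here):
+
+```
+# Assuming gem5 and gem5-resources are in your working directory; the binary
+# name agent_packet_test is an assumption, check src/gpu/hsa-agent-pkt/bin
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n3 -c gem5-resources/src/gpu/hsa-agent-pkt/bin/agent_packet_test
+```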
 
 # Resource: HIP Sample Applications
 
@@ -380,7 +379,7 @@
 ## Compilation
 
 ```
-cd src/hip-samples
+cd src/gpu/hip-samples
 docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make
 ```
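+
+Individual samples can be built by passing the sample name to make (an assumption about the Makefile targets, which appear to mirror the binaries listed below):
+
+```
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make MatrixTranspose
+```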
 
@@ -391,21 +390,21 @@
 
 ## Pre-built binary
 
-<http://dist.gem5.org/dist/v21-0/test-progs/hip-samples/2dshfl>
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/2dshfl>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/hip-samples/dynamic_shared>
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/dynamic_shared>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/hip-samples/inline_asm>
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/inline_asm>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/hip-samples/MatrixTranspose>
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/MatrixTranspose>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/hip-samples/sharedMemory>
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/sharedMemory>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/hip-samples/shfl>
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/shfl>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/hip-samples/stream>
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/stream>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/hip-samples/unroll>
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/unroll>
 
 # Resource: Heterosync
 
@@ -418,17 +417,17 @@
 
 ## Compilation
 ```
-cd src/heterosync
-docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make release-gfx8-apu
+cd src/gpu/heterosync
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make release-gfx8
 ```
 
-The release-gfx8-apu target builds for gfx801, a GCN3-based APU. There are other
-targets (release-gfx8, release) that build for GPU types that are currently unsupported
-in gem5.
+The release-gfx8 target builds for gfx801, a GCN3-based APU, and gfx803, a
+GCN3-based dGPU. There are other targets (release) that build for GPU types
+that are currently unsupported in gem5.
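+
+A minimal sketch of running heterosync in gem5 (assuming a GCN3_X86 build of gem5; the binary location and the program arguments are assumptions here, consult `src/gpu/heterosync` for the exact usage):
+
+```
+# Assuming gem5 and gem5-resources are in your working directory
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n3 --benchmark-root=gem5-resources/src/gpu/heterosync/bin -callSyncPrims-1kernel --options="sleepMutex 10 16 4"
+```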
 
 ## Pre-built binary
 
-<http://dist.gem5.org/dist/v21-0/test-progs/heterosync/gcn3/allSyncPrims-1kernel>
+<http://dist.gem5.org/dist/v21-1/test-progs/heterosync/gcn3/allSyncPrims-1kernel>
 
 # Resource: lulesh
 
@@ -438,11 +437,11 @@
 
 ## Compilation and Running
 ```
-cd src/lulesh
+cd src/gpu/lulesh
 docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make
 ```
 
-By default, the makefile builds for gfx801, and is placed in the `src/lulesh/bin` folder.
+By default, the makefile builds for gfx801, and is placed in the `src/gpu/lulesh/bin` folder.
 
 lulesh is a GPU application, which requires that gem5 is built with the GCN3_X86 architecture.
 To build GCN3_X86:
@@ -461,12 +460,12 @@
 
 ```
 # Assuming gem5 and gem5-resources are in your working directory
-docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n2 --mem-size=8GB --benchmark-root=gem5-resources/src/lulesh/bin -clulesh
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n3 --mem-size=8GB --benchmark-root=gem5-resources/src/gpu/lulesh/bin -clulesh
 ```
 
 ## Pre-built binary
 
-<http://dist.gem5.org/dist/v21-0/test-progs/lulesh/lulesh>
+<http://dist.gem5.org/dist/v21-1/test-progs/lulesh/lulesh>
 
 # Resource: halo-finder (HACC)
 
@@ -475,8 +474,8 @@
 under the influence of gravity. The halo-finder code can be GPU accelerated by using
 the code in RCBForceTree.cxx
 
-`src/halo-finder/src` contains the code required to build and run ForceTreeTest from `src/halo_finder` in the main HACC codebase.
-`src/halo-finder/src/dfft` contains the dfft code from `src/dfft` in the main HACC codebase.
+`src/gpu/halo-finder/src` contains the code required to build and run ForceTreeTest from `src/halo_finder` in the main HACC codebase.
+`src/gpu/halo-finder/src/dfft` contains the dfft code from `src/dfft` in the main HACC codebase.
 
 ## Compilation and Running
 
@@ -489,12 +488,12 @@
 
 To build the Docker image and the benchmark:
 ```
-cd src/halo-finder
+cd src/gpu/halo-finder
 docker build -t <image_name> .
 docker run --rm -v ${PWD}:${PWD} -w ${PWD}/src -u $UID:$GID <image_name> make hip/ForceTreeTest
 ```
 
-The binary is built for gfx801 by default and is placed at `src/halo-finder/src/hip/ForceTreeTest`
+The binary is built for gfx801 by default and is placed at `src/gpu/halo-finder/src/hip/ForceTreeTest`
 
 ForceTreeTest is a GPU application, which requires that gem5 is built with the GCN3_X86 architecture.
 To build GCN3_X86:
@@ -506,12 +505,12 @@
 To run ForceTreeTest:
 ```
 # Assuming gem5 and gem5-resources are in the working directory
-docker run --rm -v $PWD:$PWD -w $PWD -u $UID:$GID <image_name> gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n3 --benchmark-root=gem5-resources/src/halo-finder/src/hip -cForceTreeTest --options="0.5 0.1 64 0.1 1 N 12 rcb"
+docker run --rm -v $PWD:$PWD -w $PWD -u $UID:$GID <image_name> gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n3 --benchmark-root=gem5-resources/src/gpu/halo-finder/src/hip -cForceTreeTest --options="0.5 0.1 64 0.1 1 N 12 rcb"
 ```
 
 ## Pre-built binary
 
-<http://dist.gem5.org/dist/v21-0/test-progs/halo-finder/ForceTreeTest>
+<http://dist.gem5.org/dist/v21-1/test-progs/halo-finder/ForceTreeTest>
 
 # Resource: DNNMark
 
@@ -520,30 +519,25 @@
 
 ## Compilation and Running
 
-DNNMark requires additional programs that aren't installed in the standard GCN
-docker image. There is a Dockerfile in `src/DNNMark` that installs the additional
-software.
-
-To build DNNMark (Including the new docker image):
+To build DNNMark:
 **NOTE**: Due to DNNMark building a library, it's important to mount gem5-resources
 to the same directory within the docker container when building and running, as otherwise the benchmarks
 won't be able to link against the library. The example commands do this by using
 `-v ${PWD}:${PWD}` in the docker run commands.
 ```
-cd src/DNNMark
-docker build -t <image_name> .
-docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID <image_name> ./setup.sh HIP
-docker run --rm -v ${PWD}:${PWD} -w ${PWD}/build -u $UID:$GID <image_name> make
+cd src/gpu/DNNMark
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu ./setup.sh HIP
+docker run --rm -v ${PWD}:${PWD} -w ${PWD}/build -u $UID:$GID gcr.io/gem5-test/gcn-gpu make
 ```
 
 DNNMark uses MIOpen kernels, which cannot be compiled on the fly in gem5.
-We have provided a shell script to generate these kernels for a subset of the
-benchmarks.
+We have provided a Python script to generate these kernels for a subset of the
+benchmarks for a gfx801 GPU with 4 CUs by default.
 
 To generate the MIOpen kernels:
 ```
-cd src/DNNMark
-docker run --rm -v ${PWD}:${PWD} -v${PWD}/cachefiles:/.cache/miopen/1.7.0 -w ${PWD} <image_name> ./generate_cachefiles.sh
+cd src/gpu/DNNMark
+docker run --rm -v ${PWD}:${PWD} -v${PWD}/cachefiles:/root/.cache/miopen/2.9.0 -w ${PWD} gcr.io/gem5-test/gcn-gpu python3 generate_cachefiles.py cachefiles.csv [--gfx-version={gfx801,gfx803}] [--num-cus=N]
 ```
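+
+For example, to generate the kernels for a gfx803 dGPU with 4 compute units:
+
+```
+cd src/gpu/DNNMark
+docker run --rm -v ${PWD}:${PWD} -v${PWD}/cachefiles:/root/.cache/miopen/2.9.0 -w ${PWD} gcr.io/gem5-test/gcn-gpu python3 generate_cachefiles.py cachefiles.csv --gfx-version=gfx803 --num-cus=4
+```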
 
 Due to the large amounts of memory that need to be set up for DNNMark, we have
@@ -552,7 +546,7 @@
 
 To make the MMAP file:
 ```
-cd src/DNNMark
+cd src/gpu/DNNMark
 g++ -std=c++0x generate_rand_data.cpp -o generate_rand_data
 ./generate_rand_data
 ```
@@ -561,13 +555,13 @@
 To build GCN3_X86:
 ```
 # Working directory is your gem5 directory
-docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID <image_name> scons -sQ -j$(nproc) build/GCN3_X86/gem5.opt
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu scons -sQ -j$(nproc) build/GCN3_X86/gem5.opt
 ```
 
 To run one of the benchmarks (fwd softmax) in gem5:
 ```
 # Assuming gem5 and gem5-resources are sub-directories of the current directory
-docker run --rm -u $UID:$GID -v ${PWD}:${PWD} -v ${PWD}/gem5-resources/src/DNNMark/cachefiles:/.cache/miopen/1.7.0 -w ${PWD} <image_name> gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n2 --benchmark-root=gem5-resources/src/DNNMark/build/benchmarks/test_fwd_softmax -cdnnmark_test_fwd_softmax --options="-config gem5-resources/src/DNNMark/config_example/softmax_config.dnnmark -mmap gem5-resources/src/DNNMark/mmap.bin"
+docker run --rm -v ${PWD}:${PWD} -v ${PWD}/gem5-resources/src/gpu/DNNMark/cachefiles:/root/.cache/miopen/2.9.0 -w ${PWD} gcr.io/gem5-test/gcn-gpu gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n3 --benchmark-root=gem5-resources/src/gpu/DNNMark/build/benchmarks/test_fwd_softmax -cdnnmark_test_fwd_softmax --options="-config gem5-resources/src/gpu/DNNMark/config_example/softmax_config.dnnmark -mmap gem5-resources/src/gpu/DNNMark/mmap.bin"
 ```
 
 
@@ -581,28 +575,28 @@
 ## Compiling and Running
 
 ```
-cd src/pennant
+cd src/gpu/pennant
 docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make
 ```
 
-By default, the binary is built for gfx801 and is placed in `src/pennant/build`
+By default, the binary is built for gfx801 and is placed in `src/gpu/pennant/build`
 
 pennant is a GPU application, which requires that gem5 is built with the GCN3_X86 architecture.
 
-pennant has sample input files located at `src/pennant/test`. The following command shows how to run the sample `noh`
+pennant has sample input files located at `src/gpu/pennant/test`. The following command shows how to run the sample `noh`
 
 ```
 # Assuming gem5 and gem5-resources are in your working directory
-docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n2 --benchmark-root=gem5-resources/src/pennant/build -cpennant --options="gem5-resources/src/pennant/test/noh/noh.pnt"
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n3 --benchmark-root=gem5-resources/src/gpu/pennant/build -cpennant --options="gem5-resources/src/gpu/pennant/test/noh/noh.pnt"
 ```
 
-The output gets placed in `src/pennant/test/noh/`, and the file `noh.xy`
+The output gets placed in `src/gpu/pennant/test/noh/`, and the file `noh.xy`
 can be compared against the `noh.xy.std` file. Note: Only some tests have `.xy.std` files to
 compare against, and there may be slight differences due to floating-point rounding.
 
 ## Pre-built binary
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pennant/pennant>
+<http://dist.gem5.org/dist/v21-1/test-progs/pennant/pennant>
 
 ## Resource: SPEC 2006
 
@@ -652,7 +646,7 @@
 
 ### GAPBS Pre-built disk image
 
-<http://dist.gem5.org/dist/v21-0/images/x86/ubuntu-18-04/gapbs.img.gz>.
+<http://dist.gem5.org/dist/v21-1/images/x86/ubuntu-18-04/gapbs.img.gz>.
 
 ## Resource: PARSEC Benchmark Suite
 
@@ -673,7 +667,7 @@
 
 ### PARSEC Pre-built disk image
 
-<http://dist.gem5.org/dist/v21-0/images/x86/ubuntu-18-04/parsec.img.gz>.
+<http://dist.gem5.org/dist/v21-1/images/x86/ubuntu-18-04/parsec.img.gz>.
 
 ## Resource: NAS Parallel Benchmarks (NPB) Tests
 
@@ -700,7 +694,7 @@
 
 ### NPB Pre-built disk image
 
-<http://dist.gem5.org/dist/v21-0/images/x86/ubuntu-18-04/npb.img.gz>
+<http://dist.gem5.org/dist/v21-1/images/x86/ubuntu-18-04/npb.img.gz>
 
 
 ## Resource: Linux Boot Tests
@@ -715,16 +709,20 @@
 
 The instructions to build the boot-tests disk image (`boot-exit`) and the Linux binaries, and how to use gem5 run scripts to run boot-tests, are available in this [README](src/boot-exit/README.md) file.
 
-# Resource: RISCV Full System Test
+## Resource: RISCV Full System
 
-This resource refers to a simple setup for a riscv based full system simulation of Linux kernel.
+The RISCV Full System resource includes a RISCV boot loader (`berkeley bootloader (bbl)`) to boot the Linux 5.10 kernel on a RISCV system, and a disk image which includes the BusyBox software suite.
+The resource also contains simple gem5 run/config scripts to run Linux full system simulations into which a user may telnet.
 
-Main components include:
-- a disk image
-- a riscv boot loader with linux kernel as payload and a device tree compiled in
-- gem5 run/config scripts
+Further information on building a riscv disk image and a riscv boot loader, and on using gem5 scripts to run riscv Linux full system simulations, is available in the [README](src/riscv-fs/README.md) file.
 
-The instructions to build a riscv disk image, a riscv boot loader (`berkeley bootloader (bbl)`) and how to use gem5 scripts to run riscv Linux full system simulations are available in this [README](src/riscv-fs/README.md) file.
+### RISCV Full System pre-built disk image
+
+<http://dist.gem5.org/dist/develop/images/riscv/busybox/riscv-disk.img.gz>
+
+### RISCV Full System pre-built Linux bootloader
+
+<http://dist.gem5.org/dist/develop/kernels/riscv/static/bootloader-vmlinux-5.10>
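+
+Once a full system simulation is running, the guest console can be reached through gem5's terminal service (a sketch, assuming gem5's default terminal port):
+
+```
+telnet localhost 3456
+```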
 
 ## Resource: Insttest
 
@@ -755,7 +753,7 @@
 
 ### Insttest Pre-built binary
 
-<http://dist.gem5.org/dist/v21-0/test-progs/insttest/bin/sparc/linux/insttest>
+<http://dist.gem5.org/dist/v21-1/test-progs/insttest/bin/sparc/linux/insttest>
 
 ## Resource: Linux Kernel Binary
 
@@ -768,10 +766,10 @@
 
 ### Linux Kernel Pre-built binaries
 
-<http://dist.gem5.org/dist/v21-0/kernels/x86/static/vmlinux-4.4.186>
-<http://dist.gem5.org/dist/v21-0/kernels/x86/static/vmlinux-4.9.186>
-<http://dist.gem5.org/dist/v21-0/kernels/x86/static/vmlinux-4.14.134>
-<http://dist.gem5.org/dist/v21-0/kernels/x86/static/vmlinux-4.19.83>
+<http://dist.gem5.org/dist/v21-1/kernels/x86/static/vmlinux-4.4.186>
+<http://dist.gem5.org/dist/v21-1/kernels/x86/static/vmlinux-4.9.186>
+<http://dist.gem5.org/dist/v21-1/kernels/x86/static/vmlinux-4.14.134>
+<http://dist.gem5.org/dist/v21-1/kernels/x86/static/vmlinux-4.19.83>
 
 ## Licensing
 
@@ -784,21 +782,21 @@
 * **riscv-tests** : [`src/riscv-tests/LICENSE`](
 https://gem5.googlesource.com/public/gem5-resources/+/refs/heads/stable/src/riscv-tests/LICENSE).
 * **square**: Consult individual copyright notices of source files in
-`src/square`.
-* **hsa-agent-pkt**: `src/hsa-agent-pkt/square.cpp` is licensed under the
-same licence as 'src/square/square.cpp'.
-`src/hsa-agent-pkt/HSA_Interface.[h|.cpp]` are licensed under a BSD Lisense
+[src/gpu/square](https://gem5.googlesource.com/public/gem5-resources/+/refs/heads/stable/src/gpu/square).
+* **hsa-agent-pkt**: `src/gpu/hsa-agent-pkt/square.cpp` is licensed under the
+same license as `src/gpu/square/square.cpp`.
+`src/gpu/hsa-agent-pkt/HSA_Interface.[h|.cpp]` are licensed under a BSD License
 (A University of Maryland copyright).
 * **hip-samples**: Consult individual copyright notices of the source files in
-'src/hip-samples/src'
-* **heterosync**: Consult `src/heterosync/LICENSE.txt`
-* **lulesh**: Consult the copyright notice in `src/lulesh/src/lulesh.hip.cc`
+`src/gpu/hip-samples/src`
+* **heterosync**: Consult `src/gpu/heterosync/LICENSE.txt`
+* **lulesh**: Consult the copyright notice in `src/gpu/lulesh/src/lulesh.hip.cc`
 * **halo-finder**: halo-finder is a subcomponent of HACC, which is licensed under
 a BSD license.
-* **DNNMark**: DNNMark is licensed under an MIT license, see `src/DNNMark/LICENSE`
-* **pennant**: pennant is licensed under a BSD license, see `src/pennant/LICENSE`
-[src/square](
-https://gem5.googlesource.com/public/gem5-resources/+/refs/heads/stable/src/square).
+* **DNNMark**: DNNMark is licensed under an MIT license, see `src/gpu/DNNMark/LICENSE`
+* **pennant**: pennant is licensed under a BSD license, see `src/gpu/pennant/LICENSE`
 * **spec 2006**: SPEC CPU 2006 requires purchase of the benchmark suite from
 [SPEC](https://www.spec.org/cpu2006/); thus, it cannot be freely distributed.
 Consult individual copyright notices of source files in [`src/spec-2006`](
diff --git a/src/DNNMark/Dockerfile b/src/DNNMark/Dockerfile
deleted file mode 100644
index 5299b26..0000000
--- a/src/DNNMark/Dockerfile
+++ /dev/null
@@ -1,2 +0,0 @@
-FROM gcr.io/gem5-test/gcn-gpu
-RUN apt-get update && apt-get -y install libgflags-dev libgoogle-glog-dev
diff --git a/src/DNNMark/generate_cachefiles.sh b/src/DNNMark/generate_cachefiles.sh
deleted file mode 100755
index 1f2a2e3..0000000
--- a/src/DNNMark/generate_cachefiles.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-
-cd /MIOpen/src/kernels
-
-# test_fwd_softmax/test_bwd_softmax
-mkdir -p /.cache/miopen/1.7.0/5c3130f7e6d7b29bb65a02f5de0084a6
-/opt/rocm/bin/clang-ocl  -DNUM_BATCH=1 -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -mcpu=gfx801 -Wno-everything MIOpenSoftmax.cl -o /.cache/miopen/1.7.0/5c3130f7e6d7b29bb65a02f5de0084a6/MIOpenSoftmax.cl.o
-
-# test_fwd_bn
-mkdir -p /.cache/miopen/1.7.0/f8850ed3a540a1e8eb258b582f554d57
-/opt/rocm/bin/clang-ocl  -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FPMIX=0 -DMIO_SAVE_MEAN_VARIANCE=1 -DMIO_RUNNING_RESULT=1 -DMIO_BN_N=100 -DMIO_BN_C=1000 -DMIO_BN_HW=1 -DMIO_BN_NHW=100 -DMIO_BN_CHW=1000 -DMIO_BN_LDS_SIZE=256 -DMIO_BN_GRP0=1 -DMIO_BN_GRP1=256 -DMIO_BN_GRP2=1 -DMIO_BN_NCHW=100000 -mcpu=gfx801 -Wno-everything MIOpenBatchNormFwdTrainPerAct.cl -o  /.cache/miopen/1.7.0/f8850ed3a540a1e8eb258b582f554d57/MIOpenBatchNormFwdTrainPerAct.cl.o
-
-# test_bwd_bn
-mkdir -p /.cache/miopen/1.7.0/2d295f7887fee4bec3c01ac73f8a25cd
-/opt/rocm/bin/clang-ocl  -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FPMIX=0 -DMIO_BN_N=100 -DMIO_BN_C=1000 -DMIO_BN_HW=1 -DMIO_BN_NHW=100 -DMIO_BN_CHW=1000 -DMIO_BN_NCHW=100000 -DMIO_BN_NGRPS=1 -DMIO_BN_GRP0=1 -DMIO_BN_GRP1=64 -DMIO_BN_GRP2=1 -mcpu=gfx801 -Wno-everything MIOpenBatchNormBwdPerAct.cl -o /.cache/miopen/1.7.0/2d295f7887fee4bec3c01ac73f8a25cd/MIOpenBatchNormBwdPerAct.cl.o
-
-# test_fwd_bypass/test_bwd_bypass
-mkdir -p /.cache/miopen/1.7.0/e213d754468ef6732bb836ed186f5783
-/opt/rocm/bin/clang-ocl  -DLITE -DMIOPEN_READ_UNIT=4 -DMIOPEN_READ_TYPE=_FLOAT4 -DMIOPEN_NRN_OP_ID=0 -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -mcpu=gfx801 -Wno-everything MIOpenNeuron.cl -o /.cache/miopen/1.7.0/e213d754468ef6732bb836ed186f5783/MIOpenNeuron.cl.o
-
-# test_fwd_composed_model/test_bwd_composed_model
-mkdir -p /.cache/miopen/1.7.0/86de626b159aea830f0ba2f1788e0f40
-/opt/rocm/bin/clang-ocl  -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -mcpu=gfx801 -Wno-everything MIOpenUtilKernels2.cl -o /.cache/miopen/1.7.0/86de626b159aea830f0ba2f1788e0f40/MIOpenUtilKernels2.cl.o
-
-mkdir -p /.cache/miopen/1.7.0/ac0046008721a79b06896f9a5a3ca2cc
-/opt/rocm/bin/clang-ocl  -DSUBTENSOR_OP_WITH_SCALAR=SUBTENSOR_OP_WITH_SCALAR_SET -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DWORK_LENGTH_0=4096 -mcpu=gfx801 -Wno-everything MIOpenSubTensorOpWithScalarKernel.cl -o /.cache/miopen/1.7.0/ac0046008721a79b06896f9a5a3ca2cc/MIOpenSubTensorOpWithScalarKernel.cl.o
-
-mkdir -p /.cache/miopen/1.7.0/7a58553f312474aa3cf449e5d9969a51
-/opt/rocm/bin/clang-ocl  -DMLO_POOLING_OP_ID=1 -DMLO_POOLING_KERNEL_SZ1=3 -DMLO_POOLING_PAD1=0 -DMLO_POOLING_STRIDE1=2 -DMLO_POOLING_KERNEL_SZ0=3 -DMLO_POOLING_PAD0=0 -DMLO_POOLING_STRIDE0=2 -DMLO_POOLING_N_OUTPUTS=32 -DMLO_POOLING_N_CHANNELS=32 -DMLO_POOLING_N_HORIZ_OUT_PIX=4 -DMLO_POOLING_N_VERT_OUT_PIX=4 -DMLO_POOLING_GROUP_SZ0=8 -DMLO_POOLING_GROUP_SZ1=8 -DMLO_POOLING_BOT_BATCH_STRIDE=32768 -DMLO_POOLING_BOT_CHANNEL_STRIDE=1024 -DMLO_POOLING_BOT_STRIDE=32 -DMLO_POOLING_TOP_BATCH_STRIDE=8192 -DMLO_POOLING_TOP_CHANNEL_STRIDE=256 -DMLO_POOLING_TOP_STRIDE=16 -DMLO_POOLING_BOT_WIDTH=32 -DMLO_POOLING_BOT_HEIGHT=32 -DMLO_POOLING_TOP_WIDTH=16 -DMLO_POOLING_TOP_HEIGHT=16 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FP16=0 -mcpu=gfx801 -Wno-everything MIOpenPooling.cl -o /.cache/miopen/1.7.0/7a58553f312474aa3cf449e5d9969a51/MIOpenPooling.cl.o
-
-mkdir -p /.cache/miopen/1.7.0/f0ed53d85baef9414aa97b0b15b78a4d
-/opt/rocm/bin/clang-ocl  -DMLO_POOLING_KERNEL_SZ1=3 -DMLO_POOLING_PAD1=0 -DMLO_POOLING_STRIDE1=2 -DMLO_POOLING_KERNEL_SZ0=3 -DMLO_POOLING_PAD0=0 -DMLO_POOLING_STRIDE0=2 -DMLO_POOLING_N_OUTPUTS=32 -DMLO_POOLBWD_N_HORIZ_OUT_PIX=2 -DMLO_POOLBWD_N_VERT_OUT_PIX=2 -DMLO_POOLBWD_GROUP_SZ0=8 -DMLO_POOLBWD_GROUP_SZ1=8 -DMLO_POOLBWD_BOT_WIDTH=32 -DMLO_POOLBWD_BOT_HEIGHT=32 -DMLO_POOLBWD_TOP_WIDTH=16 -DMLO_POOLBWD_TOP_HEIGHT=16 -DMLO_POOLBWD_BOTDF_BATCH_STRIDE=32768 -DMLO_POOLBWD_BOTDF_CHANNEL_STRIDE=1024 -DMLO_POOLBWD_BOTDF_STRIDE=32 -DMLO_POOLBWD_TOPDF_BATCH_STRIDE=8192 -DMLO_POOLBWD_TOPDF_CHANNEL_STRIDE=256 -DMLO_POOLBWD_TOPDF_STRIDE=16 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FP16=0 -mcpu=gfx801 -Wno-everything MIOpenPoolingBwd.cl -o /.cache/miopen/1.7.0/f0ed53d85baef9414aa97b0b15b78a4d/MIOpenPoolingBwd.cl.o
-
-mkdir -p /.cache/miopen/1.7.0/ae3e81b4f4b5968e01343ac25026c938
-/opt/rocm/bin/clang-ocl  -DNUM_CH_PER_WG=1 -DNUM_IM_BLKS_X=1 -DNUM_IM_BLKS=4 -DLOCAL_MEM_SIZE=432 -DSTRIDE_GT_1=0 -DTILE_SZ_X=32 -DTILE_SZ_Y=8 -DUSE_IM_OFF_GUARD=1 -DMIOPEN_USE_FP32=1 -mcpu=gfx801 -Wno-everything MIOpenUtilKernels.cl -o /.cache/miopen/1.7.0/ae3e81b4f4b5968e01343ac25026c938/MIOpenUtilKernels.cl.o
-
-# test_fwd_pool
-mkdir -p /.cache/miopen/1.7.0/7cda2f346ecf0e84b50181f05e75480b
-/opt/rocm/bin/clang-ocl  -DMLO_POOLING_OP_ID=1 -DMLO_POOLING_KERNEL_SZ1=3 -DMLO_POOLING_PAD1=0 -DMLO_POOLING_STRIDE1=2 -DMLO_POOLING_KERNEL_SZ0=3 -DMLO_POOLING_PAD0=0 -DMLO_POOLING_STRIDE0=2 -DMLO_POOLING_N_OUTPUTS=3 -DMLO_POOLING_N_CHANNELS=3 -DMLO_POOLING_N_HORIZ_OUT_PIX=4 -DMLO_POOLING_N_VERT_OUT_PIX=4 -DMLO_POOLING_GROUP_SZ0=8 -DMLO_POOLING_GROUP_SZ1=8 -DMLO_POOLING_BOT_BATCH_STRIDE=196608 -DMLO_POOLING_BOT_CHANNEL_STRIDE=65536 -DMLO_POOLING_BOT_STRIDE=256 -DMLO_POOLING_TOP_BATCH_STRIDE=49152 -DMLO_POOLING_TOP_CHANNEL_STRIDE=16384 -DMLO_POOLING_TOP_STRIDE=128 -DMLO_POOLING_BOT_WIDTH=256 -DMLO_POOLING_BOT_HEIGHT=256 -DMLO_POOLING_TOP_WIDTH=128 -DMLO_POOLING_TOP_HEIGHT=128 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FP16=0 -mcpu=gfx801 -Wno-everything MIOpenPooling.cl -o /.cache/miopen/1.7.0/7cda2f346ecf0e84b50181f05e75480b/MIOpenPooling.cl.o
-
-# test_bwd_pool
-mkdir -p /.cache/miopen/1.7.0/e33e7c33bfa58bc339c2c0ed6e8d29ad
-/opt/rocm/bin/clang-ocl  -DMLO_POOLING_KERNEL_SZ1=3 -DMLO_POOLING_PAD1=0 -DMLO_POOLING_STRIDE1=2 -DMLO_POOLING_KERNEL_SZ0=3 -DMLO_POOLING_PAD0=0 -DMLO_POOLING_STRIDE0=2 -DMLO_POOLING_N_OUTPUTS=3 -DMLO_POOLBWD_N_HORIZ_OUT_PIX=2 -DMLO_POOLBWD_N_VERT_OUT_PIX=2 -DMLO_POOLBWD_GROUP_SZ0=8 -DMLO_POOLBWD_GROUP_SZ1=8 -DMLO_POOLBWD_BOT_WIDTH=256 -DMLO_POOLBWD_BOT_HEIGHT=256 -DMLO_POOLBWD_TOP_WIDTH=128 -DMLO_POOLBWD_TOP_HEIGHT=128 -DMLO_POOLBWD_BOTDF_BATCH_STRIDE=196608 -DMLO_POOLBWD_BOTDF_CHANNEL_STRIDE=65536 -DMLO_POOLBWD_BOTDF_STRIDE=256 -DMLO_POOLBWD_TOPDF_BATCH_STRIDE=49152 -DMLO_POOLBWD_TOPDF_CHANNEL_STRIDE=16384 -DMLO_POOLBWD_TOPDF_STRIDE=128 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FP16=0 -mcpu=gfx801 -Wno-everything MIOpenPoolingBwd.cl -o /.cache/miopen/1.7.0/e33e7c33bfa58bc339c2c0ed6e8d29ad/MIOpenPoolingBwd.cl.o
diff --git a/src/boot-exit/README.md b/src/boot-exit/README.md
index 82965f4..a5d2160 100644
--- a/src/boot-exit/README.md
+++ b/src/boot-exit/README.md
@@ -85,7 +85,7 @@
 ```
 
 Once this process succeeds, the disk image can be found at `boot-exit/boot-exit-image/boot-exit`.
-A disk image already created following the above instructions can be found, gzipped, [here](http://dist.gem5.org/dist/v21-0/images/x86/ubuntu-18-04/boot-exit.img.gz).
+A disk image already created following the above instructions can be found, gzipped, [here](http://dist.gem5.org/dist/v21-1/images/x86/ubuntu-18-04/boot-exit.img.gz).
 
 
 ## gem5 Run Scripts
diff --git a/src/boot-exit/configs/system/caches.py b/src/boot-exit/configs/system/caches.py
index 4316aa1..7d60733 100755
--- a/src/boot-exit/configs/system/caches.py
+++ b/src/boot-exit/configs/system/caches.py
@@ -27,14 +27,10 @@
 """ Caches with options for a simple gem5 configuration script
 
 This file contains L1 I/D and L2 caches to be used in the simple
-gem5 configuration script. It uses the SimpleOpts wrapper to set up command
-line options from each individual class.
+gem5 configuration script.
 """
 
-import m5
-from m5.objects import Cache, L2XBar, StridePrefetcher, SubSystem
-from m5.params import AddrRange, AllMemory, MemorySize
-from m5.util.convert import toMemorySize
+from m5.objects import Cache, L2XBar, StridePrefetcher
 
 # Some specific options for caches
 # For all options see src/mem/cache/BaseCache.py
diff --git a/src/boot-exit/configs/system/ruby_system.py b/src/boot-exit/configs/system/ruby_system.py
index 30eebd4..3959a71 100755
--- a/src/boot-exit/configs/system/ruby_system.py
+++ b/src/boot-exit/configs/system/ruby_system.py
@@ -26,7 +26,6 @@
 
 import m5
 from m5.objects import *
-from m5.util import convert
 from .fs_tools import *
 
 
@@ -147,8 +146,6 @@
         # so the port isn't connected twice.
         self.pc.attachIO(self.iobus, [self.pc.south_bridge.ide.dma])
 
-        self.intrctrl = IntrControl()
-
         ###############################################
 
         # Add in a Bios information structure.
diff --git a/src/boot-exit/configs/system/system.py b/src/boot-exit/configs/system/system.py
index d92cd72..55875f2 100755
--- a/src/boot-exit/configs/system/system.py
+++ b/src/boot-exit/configs/system/system.py
@@ -26,7 +26,6 @@
 
 import m5
 from m5.objects import *
-from m5.util import convert
 from .fs_tools import *
 from .caches import *
 
@@ -233,8 +232,6 @@
         self.iocache.cpu_side = self.iobus.mem_side_ports
         self.iocache.mem_side = self.membus.cpu_side_ports
 
-        self.intrctrl = IntrControl()
-
         ###############################################
 
         # Add in a Bios information structure.
diff --git a/src/gapbs/README.md b/src/gapbs/README.md
index a1344f7..4a5d2d6 100644
--- a/src/gapbs/README.md
+++ b/src/gapbs/README.md
@@ -12,7 +12,7 @@
 
 This document provides instructions to create a GAP Benchmark Suite (GAPBS) disk image, which, along with provided configuration scripts, may be used to run GAPBS within gem5 simulations.
 
-A pre-build disk image, for X86, can be found, gzipped, here: <http://dist.gem5.org/dist/v21-0/images/x86/ubuntu-18-04/gapbs.img.gz>.
+A pre-built disk image, for X86, can be found, gzipped, here: <http://dist.gem5.org/dist/v21-1/images/x86/ubuntu-18-04/gapbs.img.gz>.
 
 ## Building the Disk Image
 
@@ -45,7 +45,7 @@
 gem5 scripts which configure the system and run the simulation are available in `configs/`.
 The main script `run_gapbs.py` expects the following arguments:
 
-* **kernel** : A mandatory positional argument. The path to the Linux kernel. GAPBS has been tested with [vmlinux-5.2.3](http://dist.gem5.org/dist/v21-0/kernels/x86/static/vmlinux-5.2.3). See `src/linux-kernel` for information on building a linux kernel for gem5.
+* **kernel** : A mandatory positional argument. The path to the Linux kernel. GAPBS has been tested with [vmlinux-5.2.3](http://dist.gem5.org/dist/v21-1/kernels/x86/static/vmlinux-5.2.3). See `src/linux-kernel` for information on building a linux kernel for gem5.
 
 * **disk** : A mandatory positional argument. The path to the disk image.
 
diff --git a/src/gapbs/configs/run_gapbs.py b/src/gapbs/configs/run_gapbs.py
index 2e9848c..997d12c 100644
--- a/src/gapbs/configs/run_gapbs.py
+++ b/src/gapbs/configs/run_gapbs.py
@@ -30,9 +30,6 @@
     The workloads have two modes: synthetic and real graphs.
 """
 
-import sys
-import time
-
 import m5
 import m5.ticks
 from m5.objects import *
diff --git a/src/gapbs/configs/system/caches.py b/src/gapbs/configs/system/caches.py
index a58f3af..049a695 100644
--- a/src/gapbs/configs/system/caches.py
+++ b/src/gapbs/configs/system/caches.py
@@ -30,15 +30,10 @@
 """ Caches with options for a simple gem5 configuration script
 
 This file contains L1 I/D and L2 caches to be used in the simple
-gem5 configuration script. It uses the SimpleOpts wrapper to set up command
-line options from each individual class.
+gem5 configuration script.
 """
 
-import m5
-from m5.objects import Cache, L2XBar, StridePrefetcher, SubSystem
-from m5.params import AddrRange, AllMemory, MemorySize
-from m5.util.convert import toMemorySize
-
+from m5.objects import Cache, L2XBar, StridePrefetcher
 
 # Some specific options for caches
 # For all options see src/mem/cache/BaseCache.py
diff --git a/src/gapbs/configs/system/ruby_system.py b/src/gapbs/configs/system/ruby_system.py
index e7c1135..c2a2b58 100644
--- a/src/gapbs/configs/system/ruby_system.py
+++ b/src/gapbs/configs/system/ruby_system.py
@@ -29,7 +29,6 @@
 
 import m5
 from m5.objects import *
-from m5.util import convert
 from .fs_tools import *
 
 
@@ -150,8 +149,6 @@
         # so the port isn't connected twice.
         self.pc.attachIO(self.iobus, [self.pc.south_bridge.ide.dma])
 
-        self.intrctrl = IntrControl()
-
         ###############################################
 
         # Add in a Bios information structure.
diff --git a/src/gapbs/configs/system/system.py b/src/gapbs/configs/system/system.py
index 3287c3e..dbb11b9 100644
--- a/src/gapbs/configs/system/system.py
+++ b/src/gapbs/configs/system/system.py
@@ -29,7 +29,6 @@
 
 import m5
 from m5.objects import *
-from m5.util import convert
 from .fs_tools import *
 from .caches import *
 
@@ -259,8 +258,6 @@
         self.iocache.cpu_side = self.iobus.mem_side_ports
         self.iocache.mem_side = self.membus.cpu_side_ports
 
-        self.intrctrl = IntrControl()
-
         ###############################################
 
         # Add in a Bios information structure.
diff --git a/src/DNNMark/.gitignore b/src/gpu/DNNMark/.gitignore
similarity index 100%
rename from src/DNNMark/.gitignore
rename to src/gpu/DNNMark/.gitignore
diff --git a/src/DNNMark/CMakeLists.txt b/src/gpu/DNNMark/CMakeLists.txt
similarity index 96%
rename from src/DNNMark/CMakeLists.txt
rename to src/gpu/DNNMark/CMakeLists.txt
index 352c08c..7b1adc7 100644
--- a/src/DNNMark/CMakeLists.txt
+++ b/src/gpu/DNNMark/CMakeLists.txt
@@ -40,8 +40,8 @@
   endif()
 endif()
 
-# Detect HCC
-find_program(HCC_FOUND hcc)
+# Detect HIPCC
+find_program(HIPCC_FOUND hipcc)
 
 option (double-test "Make data type double" OFF)
 option (enable-cudnnv6 "Enable cuDNN version 6" OFF)
@@ -124,7 +124,7 @@
                         ${GLOG_LIBRARY}
                         m)
 
-elseif(HCC_FOUND AND ${HCC_ENABLE})
+elseif(HIPCC_FOUND AND ${HCC_ENABLE})
 
   # Cover the include and linkage requirement here
   execute_process(COMMAND hcc-config  --cxxflags
@@ -188,9 +188,6 @@
 
   message(${ROCBLAS_LIBRARY} ${MIOPEN_LIBRARY})
 
-  # Find other libraries
-  find_library(HIP_HCC hip_hcc /opt/rocm/hip/lib)
-
   # Find glog libraries
   find_library(GLOG_LIBRARY glog)
 
@@ -200,7 +197,6 @@
   target_link_libraries(${PROJECT_NAME}
                         ${ROCBLAS_LIBRARY}
                         ${MIOPEN_LIBRARY}
-                        ${HIP_HCC}
                         ${GLOG_LIBRARY}
                         m)
   set_target_properties(${PROJECT_NAME} PROPERTIES
diff --git a/src/DNNMark/LICENSE b/src/gpu/DNNMark/LICENSE
similarity index 100%
rename from src/DNNMark/LICENSE
rename to src/gpu/DNNMark/LICENSE
diff --git a/src/DNNMark/Makefile b/src/gpu/DNNMark/Makefile
similarity index 100%
rename from src/DNNMark/Makefile
rename to src/gpu/DNNMark/Makefile
diff --git a/src/DNNMark/README.md b/src/gpu/DNNMark/README.md
similarity index 71%
rename from src/DNNMark/README.md
rename to src/gpu/DNNMark/README.md
index 6fe17f0..79256bc 100644
--- a/src/DNNMark/README.md
+++ b/src/gpu/DNNMark/README.md
@@ -1,14 +1,72 @@
 ---
-title: DNN Mark
+title: GCN3 DNNMark Tests
 tags:
-    - gpu
     - x86
+    - amdgpu
 layout: default
 permalink: resources/dnn-mark
 author: ["Kyle Roarty"]
 license: MIT License
+shortdoc: >
+    Resources to build and run the GCN3 DNNMark workloads with gem5.
 ---
 
+[DNNMark](https://github.com/shidong-ai/DNNMark) is a benchmark framework used
+to characterize the performance of deep neural network (DNN) primitive workloads.
+
+The gem5 DNNMark tests can be used to test the GCN3-GPU model.
+
+Compiling DNNMark, compiling gem5 for GCN3_X86, and running DNNMark in gem5 all depend on the gcn-gpu docker image, built from the `util/dockerfiles/gcn-gpu/Dockerfile` on the [gem5 stable branch](https://gem5.googlesource.com/public/gem5/+/refs/heads/stable).
+
+## Compilation and Running
+
+To build DNNMark:
+**NOTE**: Due to DNNMark building a library, it's important to mount gem5-resources
+to the same directory within the docker container when building and running, as otherwise the benchmarks
+won't be able to link against the library. The example commands do this by using
+`-v ${PWD}:${PWD}` in the docker run commands.
+```
+cd src/gpu/DNNMark
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu ./setup.sh HIP
+docker run --rm -v ${PWD}:${PWD} -w ${PWD}/build -u $UID:$GID gcr.io/gem5-test/gcn-gpu make
+```
+
+DNNMark uses MIOpen kernels, which cannot be compiled on the fly in gem5.
+We have provided a Python script to generate these kernels for a subset of the
+benchmarks for a gfx801 GPU with 4 CUs by default.
+
+To generate the MIOpen kernels:
+```
+cd src/gpu/DNNMark
+docker run --rm -v ${PWD}:${PWD} -v${PWD}/cachefiles:/root/.cache/miopen/2.9.0 -w ${PWD} gcr.io/gem5-test/gcn-gpu python3 generate_cachefiles.py cachefiles.csv [--gfx-version={gfx801,gfx803}] [--num-cus=N]
+```
+
+Due to the large amounts of memory that need to be set up for DNNMark, we have
+added the ability to MMAP a file to reduce setup time, as well as a program
+that can generate a 2GB file of floats.
+
+To make the MMAP file:
+```
+cd src/gpu/DNNMark
+g++ -std=c++0x generate_rand_data.cpp -o generate_rand_data
+./generate_rand_data
+```
+
+DNNMark is a GPU application, which requires that gem5 is built with the GCN3_X86 architecture.
+To build GCN3_X86:
+```
+# Working directory is your gem5 directory
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu scons -sQ -j$(nproc) build/GCN3_X86/gem5.opt
+```
+
+To run one of the benchmarks (fwd softmax) in gem5:
+```
+# Assuming gem5 and gem5-resources are sub-directories of the current directory
+docker run --rm -v ${PWD}:${PWD} -v ${PWD}/gem5-resources/src/gpu/DNNMark/cachefiles:/root/.cache/miopen/2.9.0 -w ${PWD} gcr.io/gem5-test/gcn-gpu gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n3 --benchmark-root=gem5-resources/src/gpu/DNNMark/build/benchmarks/test_fwd_softmax -cdnnmark_test_fwd_softmax --options="-config gem5-resources/src/gpu/DNNMark/config_example/softmax_config.dnnmark -mmap gem5-resources/src/gpu/DNNMark/mmap.bin"
+```
+
+Information from the original DNNMark README is included below.
+
 # Announcement
 DNNMark is now supporting MIOpen. Right now DNNMark can run on both AMD and Nvidia platform.
 HCC, HIP, MIOpen and miopengemm are required in order to build MIOpen version of DNNMark.
diff --git a/src/DNNMark/benchmarks/CMakeLists.txt b/src/gpu/DNNMark/benchmarks/CMakeLists.txt
similarity index 100%
rename from src/DNNMark/benchmarks/CMakeLists.txt
rename to src/gpu/DNNMark/benchmarks/CMakeLists.txt
diff --git a/src/DNNMark/benchmarks/test_VGG/CMakeLists.txt b/src/gpu/DNNMark/benchmarks/test_VGG/CMakeLists.txt
similarity index 100%
rename from src/DNNMark/benchmarks/test_VGG/CMakeLists.txt
rename to src/gpu/DNNMark/benchmarks/test_VGG/CMakeLists.txt
diff --git a/src/DNNMark/benchmarks/test_VGG/test_VGG.cc b/src/gpu/DNNMark/benchmarks/test_VGG/test_VGG.cc
similarity index 100%
rename from src/DNNMark/benchmarks/test_VGG/test_VGG.cc
rename to src/gpu/DNNMark/benchmarks/test_VGG/test_VGG.cc
diff --git a/src/DNNMark/benchmarks/test_alexnet/CMakeLists.txt b/src/gpu/DNNMark/benchmarks/test_alexnet/CMakeLists.txt
similarity index 100%
rename from src/DNNMark/benchmarks/test_alexnet/CMakeLists.txt
rename to src/gpu/DNNMark/benchmarks/test_alexnet/CMakeLists.txt
diff --git a/src/DNNMark/benchmarks/test_alexnet/test_alexnet.cc b/src/gpu/DNNMark/benchmarks/test_alexnet/test_alexnet.cc
similarity index 100%
rename from src/DNNMark/benchmarks/test_alexnet/test_alexnet.cc
rename to src/gpu/DNNMark/benchmarks/test_alexnet/test_alexnet.cc
diff --git a/src/DNNMark/benchmarks/test_bwd_activation/CMakeLists.txt b/src/gpu/DNNMark/benchmarks/test_bwd_activation/CMakeLists.txt
similarity index 100%
rename from src/DNNMark/benchmarks/test_bwd_activation/CMakeLists.txt
rename to src/gpu/DNNMark/benchmarks/test_bwd_activation/CMakeLists.txt
diff --git a/src/DNNMark/benchmarks/test_bwd_activation/test_bwd_activation.cc b/src/gpu/DNNMark/benchmarks/test_bwd_activation/test_bwd_activation.cc
similarity index 100%
rename from src/DNNMark/benchmarks/test_bwd_activation/test_bwd_activation.cc
rename to src/gpu/DNNMark/benchmarks/test_bwd_activation/test_bwd_activation.cc
diff --git a/src/DNNMark/benchmarks/test_bwd_bn/CMakeLists.txt b/src/gpu/DNNMark/benchmarks/test_bwd_bn/CMakeLists.txt
similarity index 100%
rename from src/DNNMark/benchmarks/test_bwd_bn/CMakeLists.txt
rename to src/gpu/DNNMark/benchmarks/test_bwd_bn/CMakeLists.txt
diff --git a/src/DNNMark/benchmarks/test_bwd_bn/test_bwd_bn.cc b/src/gpu/DNNMark/benchmarks/test_bwd_bn/test_bwd_bn.cc
similarity index 100%
rename from src/DNNMark/benchmarks/test_bwd_bn/test_bwd_bn.cc
rename to src/gpu/DNNMark/benchmarks/test_bwd_bn/test_bwd_bn.cc
diff --git a/src/DNNMark/benchmarks/test_bwd_bypass/CMakeLists.txt b/src/gpu/DNNMark/benchmarks/test_bwd_bypass/CMakeLists.txt
similarity index 100%
rename from src/DNNMark/benchmarks/test_bwd_bypass/CMakeLists.txt
rename to src/gpu/DNNMark/benchmarks/test_bwd_bypass/CMakeLists.txt
diff --git a/src/DNNMark/benchmarks/test_bwd_bypass/test_bwd_bypass.cc b/src/gpu/DNNMark/benchmarks/test_bwd_bypass/test_bwd_bypass.cc
similarity index 100%
rename from src/DNNMark/benchmarks/test_bwd_bypass/test_bwd_bypass.cc
rename to src/gpu/DNNMark/benchmarks/test_bwd_bypass/test_bwd_bypass.cc
diff --git a/src/DNNMark/benchmarks/test_bwd_composed_model/CMakeLists.txt b/src/gpu/DNNMark/benchmarks/test_bwd_composed_model/CMakeLists.txt
similarity index 100%
rename from src/DNNMark/benchmarks/test_bwd_composed_model/CMakeLists.txt
rename to src/gpu/DNNMark/benchmarks/test_bwd_composed_model/CMakeLists.txt
diff --git a/src/DNNMark/benchmarks/test_bwd_composed_model/test_bwd_composed_model.cc b/src/gpu/DNNMark/benchmarks/test_bwd_composed_model/test_bwd_composed_model.cc
similarity index 100%
rename from src/DNNMark/benchmarks/test_bwd_composed_model/test_bwd_composed_model.cc
rename to src/gpu/DNNMark/benchmarks/test_bwd_composed_model/test_bwd_composed_model.cc
diff --git a/src/DNNMark/benchmarks/test_bwd_conv/CMakeLists.txt b/src/gpu/DNNMark/benchmarks/test_bwd_conv/CMakeLists.txt
similarity index 100%
rename from src/DNNMark/benchmarks/test_bwd_conv/CMakeLists.txt
rename to src/gpu/DNNMark/benchmarks/test_bwd_conv/CMakeLists.txt
diff --git a/src/DNNMark/benchmarks/test_bwd_conv/test_bwd_conv.cc b/src/gpu/DNNMark/benchmarks/test_bwd_conv/test_bwd_conv.cc
similarity index 100%
rename from src/DNNMark/benchmarks/test_bwd_conv/test_bwd_conv.cc
rename to src/gpu/DNNMark/benchmarks/test_bwd_conv/test_bwd_conv.cc
diff --git a/src/DNNMark/benchmarks/test_bwd_dropout/CMakeLists.txt b/src/gpu/DNNMark/benchmarks/test_bwd_dropout/CMakeLists.txt
similarity index 100%
rename from src/DNNMark/benchmarks/test_bwd_dropout/CMakeLists.txt
rename to src/gpu/DNNMark/benchmarks/test_bwd_dropout/CMakeLists.txt
diff --git a/src/DNNMark/benchmarks/test_bwd_dropout/test_bwd_dropout.cc b/src/gpu/DNNMark/benchmarks/test_bwd_dropout/test_bwd_dropout.cc
similarity index 100%
rename from src/DNNMark/benchmarks/test_bwd_dropout/test_bwd_dropout.cc
rename to src/gpu/DNNMark/benchmarks/test_bwd_dropout/test_bwd_dropout.cc
diff --git a/src/DNNMark/benchmarks/test_bwd_fc/CMakeLists.txt b/src/gpu/DNNMark/benchmarks/test_bwd_fc/CMakeLists.txt
similarity index 100%
rename from src/DNNMark/benchmarks/test_bwd_fc/CMakeLists.txt
rename to src/gpu/DNNMark/benchmarks/test_bwd_fc/CMakeLists.txt
diff --git a/src/DNNMark/benchmarks/test_bwd_fc/test_bwd_fc.cc b/src/gpu/DNNMark/benchmarks/test_bwd_fc/test_bwd_fc.cc
similarity index 100%
rename from src/DNNMark/benchmarks/test_bwd_fc/test_bwd_fc.cc
rename to src/gpu/DNNMark/benchmarks/test_bwd_fc/test_bwd_fc.cc
diff --git a/src/DNNMark/benchmarks/test_bwd_lrn/CMakeLists.txt b/src/gpu/DNNMark/benchmarks/test_bwd_lrn/CMakeLists.txt
similarity index 100%
rename from src/DNNMark/benchmarks/test_bwd_lrn/CMakeLists.txt
rename to src/gpu/DNNMark/benchmarks/test_bwd_lrn/CMakeLists.txt
diff --git a/src/DNNMark/benchmarks/test_bwd_lrn/test_bwd_lrn.cc b/src/gpu/DNNMark/benchmarks/test_bwd_lrn/test_bwd_lrn.cc
similarity index 100%
rename from src/DNNMark/benchmarks/test_bwd_lrn/test_bwd_lrn.cc
rename to src/gpu/DNNMark/benchmarks/test_bwd_lrn/test_bwd_lrn.cc
diff --git a/src/DNNMark/benchmarks/test_bwd_pool/CMakeLists.txt b/src/gpu/DNNMark/benchmarks/test_bwd_pool/CMakeLists.txt
similarity index 100%
rename from src/DNNMark/benchmarks/test_bwd_pool/CMakeLists.txt
rename to src/gpu/DNNMark/benchmarks/test_bwd_pool/CMakeLists.txt
diff --git a/src/DNNMark/benchmarks/test_bwd_pool/test_bwd_pool.cc b/src/gpu/DNNMark/benchmarks/test_bwd_pool/test_bwd_pool.cc
similarity index 100%
rename from src/DNNMark/benchmarks/test_bwd_pool/test_bwd_pool.cc
rename to src/gpu/DNNMark/benchmarks/test_bwd_pool/test_bwd_pool.cc
diff --git a/src/DNNMark/benchmarks/test_bwd_softmax/CMakeLists.txt b/src/gpu/DNNMark/benchmarks/test_bwd_softmax/CMakeLists.txt
similarity index 100%
rename from src/DNNMark/benchmarks/test_bwd_softmax/CMakeLists.txt
rename to src/gpu/DNNMark/benchmarks/test_bwd_softmax/CMakeLists.txt
diff --git a/src/DNNMark/benchmarks/test_bwd_softmax/test_bwd_softmax.cc b/src/gpu/DNNMark/benchmarks/test_bwd_softmax/test_bwd_softmax.cc
similarity index 100%
rename from src/DNNMark/benchmarks/test_bwd_softmax/test_bwd_softmax.cc
rename to src/gpu/DNNMark/benchmarks/test_bwd_softmax/test_bwd_softmax.cc
diff --git a/src/DNNMark/benchmarks/test_composed_model/CMakeLists.txt b/src/gpu/DNNMark/benchmarks/test_composed_model/CMakeLists.txt
similarity index 100%
rename from src/DNNMark/benchmarks/test_composed_model/CMakeLists.txt
rename to src/gpu/DNNMark/benchmarks/test_composed_model/CMakeLists.txt
diff --git a/src/DNNMark/benchmarks/test_composed_model/test_composed_model.cc b/src/gpu/DNNMark/benchmarks/test_composed_model/test_composed_model.cc
similarity index 100%
rename from src/DNNMark/benchmarks/test_composed_model/test_composed_model.cc
rename to src/gpu/DNNMark/benchmarks/test_composed_model/test_composed_model.cc
diff --git a/src/DNNMark/benchmarks/test_fwd_activation/CMakeLists.txt b/src/gpu/DNNMark/benchmarks/test_fwd_activation/CMakeLists.txt
similarity index 100%
rename from src/DNNMark/benchmarks/test_fwd_activation/CMakeLists.txt
rename to src/gpu/DNNMark/benchmarks/test_fwd_activation/CMakeLists.txt
diff --git a/src/DNNMark/benchmarks/test_fwd_activation/test_fwd_activation.cc b/src/gpu/DNNMark/benchmarks/test_fwd_activation/test_fwd_activation.cc
similarity index 100%
rename from src/DNNMark/benchmarks/test_fwd_activation/test_fwd_activation.cc
rename to src/gpu/DNNMark/benchmarks/test_fwd_activation/test_fwd_activation.cc
diff --git a/src/DNNMark/benchmarks/test_fwd_bn/CMakeLists.txt b/src/gpu/DNNMark/benchmarks/test_fwd_bn/CMakeLists.txt
similarity index 100%
rename from src/DNNMark/benchmarks/test_fwd_bn/CMakeLists.txt
rename to src/gpu/DNNMark/benchmarks/test_fwd_bn/CMakeLists.txt
diff --git a/src/DNNMark/benchmarks/test_fwd_bn/test_fwd_bn.cc b/src/gpu/DNNMark/benchmarks/test_fwd_bn/test_fwd_bn.cc
similarity index 100%
rename from src/DNNMark/benchmarks/test_fwd_bn/test_fwd_bn.cc
rename to src/gpu/DNNMark/benchmarks/test_fwd_bn/test_fwd_bn.cc
diff --git a/src/DNNMark/benchmarks/test_fwd_bypass/CMakeLists.txt b/src/gpu/DNNMark/benchmarks/test_fwd_bypass/CMakeLists.txt
similarity index 100%
rename from src/DNNMark/benchmarks/test_fwd_bypass/CMakeLists.txt
rename to src/gpu/DNNMark/benchmarks/test_fwd_bypass/CMakeLists.txt
diff --git a/src/DNNMark/benchmarks/test_fwd_bypass/test_fwd_bypass.cc b/src/gpu/DNNMark/benchmarks/test_fwd_bypass/test_fwd_bypass.cc
similarity index 100%
rename from src/DNNMark/benchmarks/test_fwd_bypass/test_fwd_bypass.cc
rename to src/gpu/DNNMark/benchmarks/test_fwd_bypass/test_fwd_bypass.cc
diff --git a/src/DNNMark/benchmarks/test_fwd_composed_model/CMakeLists.txt b/src/gpu/DNNMark/benchmarks/test_fwd_composed_model/CMakeLists.txt
similarity index 100%
rename from src/DNNMark/benchmarks/test_fwd_composed_model/CMakeLists.txt
rename to src/gpu/DNNMark/benchmarks/test_fwd_composed_model/CMakeLists.txt
diff --git a/src/DNNMark/benchmarks/test_fwd_composed_model/test_fwd_composed_model.cc b/src/gpu/DNNMark/benchmarks/test_fwd_composed_model/test_fwd_composed_model.cc
similarity index 100%
rename from src/DNNMark/benchmarks/test_fwd_composed_model/test_fwd_composed_model.cc
rename to src/gpu/DNNMark/benchmarks/test_fwd_composed_model/test_fwd_composed_model.cc
diff --git a/src/DNNMark/benchmarks/test_fwd_conv/CMakeLists.txt b/src/gpu/DNNMark/benchmarks/test_fwd_conv/CMakeLists.txt
similarity index 100%
rename from src/DNNMark/benchmarks/test_fwd_conv/CMakeLists.txt
rename to src/gpu/DNNMark/benchmarks/test_fwd_conv/CMakeLists.txt
diff --git a/src/DNNMark/benchmarks/test_fwd_conv/test_fwd_conv.cc b/src/gpu/DNNMark/benchmarks/test_fwd_conv/test_fwd_conv.cc
similarity index 100%
rename from src/DNNMark/benchmarks/test_fwd_conv/test_fwd_conv.cc
rename to src/gpu/DNNMark/benchmarks/test_fwd_conv/test_fwd_conv.cc
diff --git a/src/DNNMark/benchmarks/test_fwd_dropout/CMakeLists.txt b/src/gpu/DNNMark/benchmarks/test_fwd_dropout/CMakeLists.txt
similarity index 100%
rename from src/DNNMark/benchmarks/test_fwd_dropout/CMakeLists.txt
rename to src/gpu/DNNMark/benchmarks/test_fwd_dropout/CMakeLists.txt
diff --git a/src/DNNMark/benchmarks/test_fwd_dropout/test_fwd_dropout.cc b/src/gpu/DNNMark/benchmarks/test_fwd_dropout/test_fwd_dropout.cc
similarity index 100%
rename from src/DNNMark/benchmarks/test_fwd_dropout/test_fwd_dropout.cc
rename to src/gpu/DNNMark/benchmarks/test_fwd_dropout/test_fwd_dropout.cc
diff --git a/src/DNNMark/benchmarks/test_fwd_fc/CMakeLists.txt b/src/gpu/DNNMark/benchmarks/test_fwd_fc/CMakeLists.txt
similarity index 100%
rename from src/DNNMark/benchmarks/test_fwd_fc/CMakeLists.txt
rename to src/gpu/DNNMark/benchmarks/test_fwd_fc/CMakeLists.txt
diff --git a/src/DNNMark/benchmarks/test_fwd_fc/test_fwd_fc.cc b/src/gpu/DNNMark/benchmarks/test_fwd_fc/test_fwd_fc.cc
similarity index 100%
rename from src/DNNMark/benchmarks/test_fwd_fc/test_fwd_fc.cc
rename to src/gpu/DNNMark/benchmarks/test_fwd_fc/test_fwd_fc.cc
diff --git a/src/DNNMark/benchmarks/test_fwd_lrn/CMakeLists.txt b/src/gpu/DNNMark/benchmarks/test_fwd_lrn/CMakeLists.txt
similarity index 100%
rename from src/DNNMark/benchmarks/test_fwd_lrn/CMakeLists.txt
rename to src/gpu/DNNMark/benchmarks/test_fwd_lrn/CMakeLists.txt
diff --git a/src/DNNMark/benchmarks/test_fwd_lrn/test_fwd_lrn.cc b/src/gpu/DNNMark/benchmarks/test_fwd_lrn/test_fwd_lrn.cc
similarity index 100%
rename from src/DNNMark/benchmarks/test_fwd_lrn/test_fwd_lrn.cc
rename to src/gpu/DNNMark/benchmarks/test_fwd_lrn/test_fwd_lrn.cc
diff --git a/src/DNNMark/benchmarks/test_fwd_pool/CMakeLists.txt b/src/gpu/DNNMark/benchmarks/test_fwd_pool/CMakeLists.txt
similarity index 100%
rename from src/DNNMark/benchmarks/test_fwd_pool/CMakeLists.txt
rename to src/gpu/DNNMark/benchmarks/test_fwd_pool/CMakeLists.txt
diff --git a/src/DNNMark/benchmarks/test_fwd_pool/test_fwd_pool.cc b/src/gpu/DNNMark/benchmarks/test_fwd_pool/test_fwd_pool.cc
similarity index 100%
rename from src/DNNMark/benchmarks/test_fwd_pool/test_fwd_pool.cc
rename to src/gpu/DNNMark/benchmarks/test_fwd_pool/test_fwd_pool.cc
diff --git a/src/DNNMark/benchmarks/test_fwd_softmax/CMakeLists.txt b/src/gpu/DNNMark/benchmarks/test_fwd_softmax/CMakeLists.txt
similarity index 100%
rename from src/DNNMark/benchmarks/test_fwd_softmax/CMakeLists.txt
rename to src/gpu/DNNMark/benchmarks/test_fwd_softmax/CMakeLists.txt
diff --git a/src/DNNMark/benchmarks/test_fwd_softmax/test_fwd_softmax.cc b/src/gpu/DNNMark/benchmarks/test_fwd_softmax/test_fwd_softmax.cc
similarity index 100%
rename from src/DNNMark/benchmarks/test_fwd_softmax/test_fwd_softmax.cc
rename to src/gpu/DNNMark/benchmarks/test_fwd_softmax/test_fwd_softmax.cc
diff --git a/src/DNNMark/benchmarks/usage.cc b/src/gpu/DNNMark/benchmarks/usage.cc
similarity index 100%
rename from src/DNNMark/benchmarks/usage.cc
rename to src/gpu/DNNMark/benchmarks/usage.cc
diff --git a/src/DNNMark/benchmarks/usage.h b/src/gpu/DNNMark/benchmarks/usage.h
similarity index 100%
rename from src/DNNMark/benchmarks/usage.h
rename to src/gpu/DNNMark/benchmarks/usage.h
diff --git a/src/gpu/DNNMark/cachefiles.csv b/src/gpu/DNNMark/cachefiles.csv
new file mode 100644
index 0000000..af971ab
--- /dev/null
+++ b/src/gpu/DNNMark/cachefiles.csv
@@ -0,0 +1,11 @@
+MIOpenBatchNormFwdTrainPerAct.cl, -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FPMIX=0 -DMIO_SAVE_MEAN_VARIANCE=1 -DMIO_RUNNING_RESULT=1 -DMIO_BN_N=100 -DMIO_BN_C=1000 -DMIO_BN_HW=1 -DMIO_BN_NHW=100 -DMIO_BN_CHW=1000 -DMIO_BN_LDS_SIZE=256 -DMIO_BN_GRP0=1 -DMIO_BN_GRP1=256 -DMIO_BN_GRP2=1 -DMIO_BN_NCHW=100000 -mcpu=gfx803
+MIOpenBatchNormBwdPerAct.cl, -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FPMIX=0 -DMIO_BN_N=100 -DMIO_BN_C=1000 -DMIO_BN_HW=1 -DMIO_BN_NHW=100 -DMIO_BN_CHW=1000 -DMIO_BN_NCHW=100000 -DMIO_BN_NGRPS=1 -DMIO_BN_GRP0=1 -DMIO_BN_GRP1=64 -DMIO_BN_GRP2=1 -mcpu=gfx803
+MIOpenNeuron.cl, -DLITE -DMIOPEN_READ_UNIT=4 -DMIOPEN_READ_TYPE=_FLOAT4 -DMIOPEN_NRN_OP_ID=0 -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -mcpu=gfx803
+MIOpenSoftmax.cl,-DNUM_BATCH=1 -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DUSE_SOFTMAX_ACCURATE=1 -DUSE_SOFTMAX_MODE_CHANNEL=1 -DRUN_FORWARD=0 -DIS_OUTPUT_PACKED=1 -DIS_DOUTPUT_PACKED=1 -DIS_DINPUT_PACKED=1 -mcpu=gfx803
+MIOpenSoftmax.cl,-DNUM_BATCH=1 -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DUSE_SOFTMAX_ACCURATE=1 -DUSE_SOFTMAX_MODE_CHANNEL=1 -DRUN_FORWARD=1 -DIS_INPUT_PACKED=1 -DIS_OUTPUT_PACKED=1 -mcpu=gfx803
+MIOpenIm2d2Col.cl, -DNUM_CH_PER_WG=1 -DNUM_IM_BLKS_X=1 -DNUM_IM_BLKS=4 -DLOCAL_MEM_SIZE=432 -DSTRIDE_GT_1=0 -DTILE_SZ_X=32 -DTILE_SZ_Y=8 -DUSE_IM_OFF_GUARD=1 -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_INT8=0 -DMIOPEN_USE_INT8x4=0 -DMIOPEN_USE_BFP16=0 -DMIOPEN_USE_INT32=0 -DMIOPEN_USE_RNE_BFLOAT16=1 -mcpu=gfx803
+MIOpenPooling.cl, -DMLO_POOLING_OP_ID=1 -DMLO_POOLING_KERNEL_SZ1=3 -DMLO_POOLING_STRIDE1=2 -DMLO_POOLING_KERNEL_SZ0=3 -DMLO_POOLING_STRIDE0=2 -DMLO_POOLING_N_HORIZ_OUT_PIX=1 -DMLO_POOLING_N_VERT_OUT_PIX=4 -DMLO_POOLING_GROUP_SZ0=16 -DMLO_POOLING_GROUP_SZ1=8 -DMLO_POOLING_INDEX_TYPE=uchar -DMLO_POOLING_INDEX_MAX=UCHAR_MAX -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FP16=0 -mcpu=gfx803
+MIOpenPoolingBwd.cl, -DMLO_POOLING_OP_ID=1 -DMLO_POOLING_KERNEL_SZ1=3 -DMLO_POOLING_STRIDE1=2 -DMLO_POOLING_KERNEL_SZ0=3 -DMLO_POOLING_STRIDE0=2 -DMLO_POOLBWD_N_HORIZ_OUT_PIX=1 -DMLO_POOLBWD_N_VERT_OUT_PIX=8 -DMLO_POOLBWD_GROUP_SZ0=32 -DMLO_POOLBWD_GROUP_SZ1=4 -DMLO_POOLING_INDEX_TYPE=uchar -DMLO_POOLING_INDEX_MAX=UCHAR_MAX -DUSE_IMG_INDEX=0 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FP16=0 -mcpu=gfx803
+MIOpenSubTensorOpWithScalarKernel.cl,-DSUBTENSOR_OP_WITH_SCALAR=SUBTENSOR_OP_WITH_SCALAR_SET -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_INT8=0 -DMIOPEN_USE_INT8x4=0 -DMIOPEN_USE_BFP16=0 -DMIOPEN_USE_INT32=0 -DMIOPEN_USE_RNE_BFLOAT16=1 -DWORK_LENGTH_0=4096 -mcpu=gfx803
+MIOpenCol2Im2d.cl, -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_INT8=0 -DMIOPEN_USE_INT8x4=0 -DMIOPEN_USE_BFP16=0 -DMIOPEN_USE_INT32=0 -DMIOPEN_USE_RNE_BFLOAT16=1 -mcpu=gfx803
+MIOpenPooling.cl, -DMLO_POOLING_OP_ID=1 -DMLO_POOLING_KERNEL_SZ1=3 -DMLO_POOLING_STRIDE1=2 -DMLO_POOLING_KERNEL_SZ0=3 -DMLO_POOLING_STRIDE0=2 -DMLO_POOLING_N_HORIZ_OUT_PIX=1 -DMLO_POOLING_N_VERT_OUT_PIX=8 -DMLO_POOLING_GROUP_SZ0=16 -DMLO_POOLING_GROUP_SZ1=16 -DMLO_POOLING_INDEX_TYPE=uchar -DMLO_POOLING_INDEX_MAX=UCHAR_MAX -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FP16=0 -mcpu=gfx803
diff --git a/src/DNNMark/cmake/gflag.cmake b/src/gpu/DNNMark/cmake/gflag.cmake
similarity index 100%
rename from src/DNNMark/cmake/gflag.cmake
rename to src/gpu/DNNMark/cmake/gflag.cmake
diff --git a/src/DNNMark/config_example/VGG.dnnmark b/src/gpu/DNNMark/config_example/VGG.dnnmark
similarity index 100%
rename from src/DNNMark/config_example/VGG.dnnmark
rename to src/gpu/DNNMark/config_example/VGG.dnnmark
diff --git a/src/DNNMark/config_example/VGG.dnntemplate b/src/gpu/DNNMark/config_example/VGG.dnntemplate
similarity index 100%
rename from src/DNNMark/config_example/VGG.dnntemplate
rename to src/gpu/DNNMark/config_example/VGG.dnntemplate
diff --git a/src/DNNMark/config_example/activation_config.dnnmark b/src/gpu/DNNMark/config_example/activation_config.dnnmark
similarity index 100%
rename from src/DNNMark/config_example/activation_config.dnnmark
rename to src/gpu/DNNMark/config_example/activation_config.dnnmark
diff --git a/src/DNNMark/config_example/alexnet.dnnmark b/src/gpu/DNNMark/config_example/alexnet.dnnmark
similarity index 100%
rename from src/DNNMark/config_example/alexnet.dnnmark
rename to src/gpu/DNNMark/config_example/alexnet.dnnmark
diff --git a/src/DNNMark/config_example/bn_config.dnnmark b/src/gpu/DNNMark/config_example/bn_config.dnnmark
similarity index 100%
rename from src/DNNMark/config_example/bn_config.dnnmark
rename to src/gpu/DNNMark/config_example/bn_config.dnnmark
diff --git a/src/DNNMark/config_example/bypass_config.dnnmark b/src/gpu/DNNMark/config_example/bypass_config.dnnmark
similarity index 100%
rename from src/DNNMark/config_example/bypass_config.dnnmark
rename to src/gpu/DNNMark/config_example/bypass_config.dnnmark
diff --git a/src/DNNMark/config_example/composed_model_config.dnnmark b/src/gpu/DNNMark/config_example/composed_model_config.dnnmark
similarity index 100%
rename from src/DNNMark/config_example/composed_model_config.dnnmark
rename to src/gpu/DNNMark/config_example/composed_model_config.dnnmark
diff --git a/src/DNNMark/config_example/conf_convolution_block.dnnmark b/src/gpu/DNNMark/config_example/conf_convolution_block.dnnmark
similarity index 100%
rename from src/DNNMark/config_example/conf_convolution_block.dnnmark
rename to src/gpu/DNNMark/config_example/conf_convolution_block.dnnmark
diff --git a/src/DNNMark/config_example/conf_convolution_block.dnntemplate b/src/gpu/DNNMark/config_example/conf_convolution_block.dnntemplate
similarity index 100%
rename from src/DNNMark/config_example/conf_convolution_block.dnntemplate
rename to src/gpu/DNNMark/config_example/conf_convolution_block.dnntemplate
diff --git a/src/DNNMark/config_example/conv_config.dnnmark b/src/gpu/DNNMark/config_example/conv_config.dnnmark
similarity index 100%
rename from src/DNNMark/config_example/conv_config.dnnmark
rename to src/gpu/DNNMark/config_example/conv_config.dnnmark
diff --git a/src/DNNMark/config_example/dropout_config.dnnmark b/src/gpu/DNNMark/config_example/dropout_config.dnnmark
similarity index 100%
rename from src/DNNMark/config_example/dropout_config.dnnmark
rename to src/gpu/DNNMark/config_example/dropout_config.dnnmark
diff --git a/src/DNNMark/config_example/fc_config.dnnmark b/src/gpu/DNNMark/config_example/fc_config.dnnmark
similarity index 100%
rename from src/DNNMark/config_example/fc_config.dnnmark
rename to src/gpu/DNNMark/config_example/fc_config.dnnmark
diff --git a/src/DNNMark/config_example/lrn_config.dnnmark b/src/gpu/DNNMark/config_example/lrn_config.dnnmark
similarity index 100%
rename from src/DNNMark/config_example/lrn_config.dnnmark
rename to src/gpu/DNNMark/config_example/lrn_config.dnnmark
diff --git a/src/DNNMark/config_example/pool_config.dnnmark b/src/gpu/DNNMark/config_example/pool_config.dnnmark
similarity index 100%
rename from src/DNNMark/config_example/pool_config.dnnmark
rename to src/gpu/DNNMark/config_example/pool_config.dnnmark
diff --git a/src/DNNMark/config_example/softmax_config.dnnmark b/src/gpu/DNNMark/config_example/softmax_config.dnnmark
similarity index 100%
rename from src/DNNMark/config_example/softmax_config.dnnmark
rename to src/gpu/DNNMark/config_example/softmax_config.dnnmark
diff --git a/src/DNNMark/core/include/common.h b/src/gpu/DNNMark/core/include/common.h
similarity index 100%
rename from src/DNNMark/core/include/common.h
rename to src/gpu/DNNMark/core/include/common.h
diff --git a/src/DNNMark/core/include/data_manager.h b/src/gpu/DNNMark/core/include/data_manager.h
similarity index 96%
rename from src/DNNMark/core/include/data_manager.h
rename to src/gpu/DNNMark/core/include/data_manager.h
index 8a4c10a..56c064a 100644
--- a/src/DNNMark/core/include/data_manager.h
+++ b/src/gpu/DNNMark/core/include/data_manager.h
@@ -46,7 +46,7 @@
     CUDA_CALL(cudaMalloc(&gpu_ptr_, size * sizeof(T)));
 #endif
 #ifdef AMD_MIOPEN
-    HIP_CALL(hipMalloc(&gpu_ptr_, size * sizeof(T)));
+    HIP_CALL(hipHostMalloc(&gpu_ptr_, size * sizeof(T)));
 #endif
   }
   ~Data() {
@@ -56,7 +56,7 @@
       CUDA_CALL(cudaFree(gpu_ptr_));
 #endif
 #ifdef AMD_MIOPEN
-      HIP_CALL(hipFree(gpu_ptr_));
+      HIP_CALL(hipHostFree(gpu_ptr_));
 #endif
     }
   }
diff --git a/src/DNNMark/core/include/data_png.h b/src/gpu/DNNMark/core/include/data_png.h
similarity index 95%
rename from src/DNNMark/core/include/data_png.h
rename to src/gpu/DNNMark/core/include/data_png.h
index 4a8d4d7..3f15d79 100644
--- a/src/DNNMark/core/include/data_png.h
+++ b/src/gpu/DNNMark/core/include/data_png.h
@@ -109,8 +109,7 @@
                         (static_cast <float> (RAND_MAX/seed));
     }
 
-    HIP_CALL(hipMemcpy(dev_ptr, host_ptr, size * sizeof(float),
-                       hipMemcpyHostToDevice));
+    memcpy(dev_ptr, host_ptr, size * sizeof(float));
     if (use_mmap) {
         munmap(host_ptr, size*sizeof(float));
     } else {
@@ -146,8 +145,7 @@
                         (static_cast <double> (RAND_MAX/seed));
     }
 
-    HIP_CALL(hipMemcpy(dev_ptr, host_ptr, size * sizeof(double),
-                       hipMemcpyHostToDevice));
+    memcpy(dev_ptr, host_ptr, size * sizeof(double));
 
     if (use_mmap) {
         munmap(host_ptr, size*sizeof(double));
diff --git a/src/DNNMark/core/include/dnn_config_keywords.h b/src/gpu/DNNMark/core/include/dnn_config_keywords.h
similarity index 100%
rename from src/DNNMark/core/include/dnn_config_keywords.h
rename to src/gpu/DNNMark/core/include/dnn_config_keywords.h
diff --git a/src/DNNMark/core/include/dnn_layer.h b/src/gpu/DNNMark/core/include/dnn_layer.h
similarity index 100%
rename from src/DNNMark/core/include/dnn_layer.h
rename to src/gpu/DNNMark/core/include/dnn_layer.h
diff --git a/src/DNNMark/core/include/dnn_param.h b/src/gpu/DNNMark/core/include/dnn_param.h
similarity index 100%
rename from src/DNNMark/core/include/dnn_param.h
rename to src/gpu/DNNMark/core/include/dnn_param.h
diff --git a/src/DNNMark/core/include/dnn_utility.h b/src/gpu/DNNMark/core/include/dnn_utility.h
similarity index 100%
rename from src/DNNMark/core/include/dnn_utility.h
rename to src/gpu/DNNMark/core/include/dnn_utility.h
diff --git a/src/DNNMark/core/include/dnn_wrapper.h b/src/gpu/DNNMark/core/include/dnn_wrapper.h
similarity index 100%
rename from src/DNNMark/core/include/dnn_wrapper.h
rename to src/gpu/DNNMark/core/include/dnn_wrapper.h
diff --git a/src/DNNMark/core/include/dnnmark.h b/src/gpu/DNNMark/core/include/dnnmark.h
similarity index 100%
rename from src/DNNMark/core/include/dnnmark.h
rename to src/gpu/DNNMark/core/include/dnnmark.h
diff --git a/src/DNNMark/core/include/gemm_wrapper.h b/src/gpu/DNNMark/core/include/gemm_wrapper.h
similarity index 100%
rename from src/DNNMark/core/include/gemm_wrapper.h
rename to src/gpu/DNNMark/core/include/gemm_wrapper.h
diff --git a/src/DNNMark/core/include/layers/activation_layer.h b/src/gpu/DNNMark/core/include/layers/activation_layer.h
similarity index 100%
rename from src/DNNMark/core/include/layers/activation_layer.h
rename to src/gpu/DNNMark/core/include/layers/activation_layer.h
diff --git a/src/DNNMark/core/include/layers/bn_layer.h b/src/gpu/DNNMark/core/include/layers/bn_layer.h
similarity index 100%
rename from src/DNNMark/core/include/layers/bn_layer.h
rename to src/gpu/DNNMark/core/include/layers/bn_layer.h
diff --git a/src/DNNMark/core/include/layers/bypass_layer.h b/src/gpu/DNNMark/core/include/layers/bypass_layer.h
similarity index 100%
rename from src/DNNMark/core/include/layers/bypass_layer.h
rename to src/gpu/DNNMark/core/include/layers/bypass_layer.h
diff --git a/src/DNNMark/core/include/layers/conv_layer.h b/src/gpu/DNNMark/core/include/layers/conv_layer.h
similarity index 100%
rename from src/DNNMark/core/include/layers/conv_layer.h
rename to src/gpu/DNNMark/core/include/layers/conv_layer.h
diff --git a/src/DNNMark/core/include/layers/dropout_layer.h b/src/gpu/DNNMark/core/include/layers/dropout_layer.h
similarity index 100%
rename from src/DNNMark/core/include/layers/dropout_layer.h
rename to src/gpu/DNNMark/core/include/layers/dropout_layer.h
diff --git a/src/DNNMark/core/include/layers/fc_layer.h b/src/gpu/DNNMark/core/include/layers/fc_layer.h
similarity index 100%
rename from src/DNNMark/core/include/layers/fc_layer.h
rename to src/gpu/DNNMark/core/include/layers/fc_layer.h
diff --git a/src/DNNMark/core/include/layers/lrn_layer.h b/src/gpu/DNNMark/core/include/layers/lrn_layer.h
similarity index 100%
rename from src/DNNMark/core/include/layers/lrn_layer.h
rename to src/gpu/DNNMark/core/include/layers/lrn_layer.h
diff --git a/src/DNNMark/core/include/layers/pool_layer.h b/src/gpu/DNNMark/core/include/layers/pool_layer.h
similarity index 100%
rename from src/DNNMark/core/include/layers/pool_layer.h
rename to src/gpu/DNNMark/core/include/layers/pool_layer.h
diff --git a/src/DNNMark/core/include/layers/softmax_layer.h b/src/gpu/DNNMark/core/include/layers/softmax_layer.h
similarity index 100%
rename from src/DNNMark/core/include/layers/softmax_layer.h
rename to src/gpu/DNNMark/core/include/layers/softmax_layer.h
diff --git a/src/DNNMark/core/include/timer.h b/src/gpu/DNNMark/core/include/timer.h
similarity index 100%
rename from src/DNNMark/core/include/timer.h
rename to src/gpu/DNNMark/core/include/timer.h
diff --git a/src/DNNMark/core/include/utility.h b/src/gpu/DNNMark/core/include/utility.h
similarity index 100%
rename from src/DNNMark/core/include/utility.h
rename to src/gpu/DNNMark/core/include/utility.h
diff --git a/src/DNNMark/core/src/common.cc b/src/gpu/DNNMark/core/src/common.cc
similarity index 100%
rename from src/DNNMark/core/src/common.cc
rename to src/gpu/DNNMark/core/src/common.cc
diff --git a/src/DNNMark/core/src/dnn_config_keywords.cc b/src/gpu/DNNMark/core/src/dnn_config_keywords.cc
similarity index 100%
rename from src/DNNMark/core/src/dnn_config_keywords.cc
rename to src/gpu/DNNMark/core/src/dnn_config_keywords.cc
diff --git a/src/DNNMark/core/src/dnn_utility.cc b/src/gpu/DNNMark/core/src/dnn_utility.cc
similarity index 100%
rename from src/DNNMark/core/src/dnn_utility.cc
rename to src/gpu/DNNMark/core/src/dnn_utility.cc
diff --git a/src/DNNMark/core/src/dnnmark.cc b/src/gpu/DNNMark/core/src/dnnmark.cc
similarity index 100%
rename from src/DNNMark/core/src/dnnmark.cc
rename to src/gpu/DNNMark/core/src/dnnmark.cc
diff --git a/src/DNNMark/core/src/gemm_wrapper.cc b/src/gpu/DNNMark/core/src/gemm_wrapper.cc
similarity index 100%
rename from src/DNNMark/core/src/gemm_wrapper.cc
rename to src/gpu/DNNMark/core/src/gemm_wrapper.cc
diff --git a/src/DNNMark/core/src/utility.cc b/src/gpu/DNNMark/core/src/utility.cc
similarity index 100%
rename from src/DNNMark/core/src/utility.cc
rename to src/gpu/DNNMark/core/src/utility.cc
diff --git a/src/DNNMark/data/cifar/download.py b/src/gpu/DNNMark/data/cifar/download.py
similarity index 100%
rename from src/DNNMark/data/cifar/download.py
rename to src/gpu/DNNMark/data/cifar/download.py
diff --git a/src/gpu/DNNMark/generate_cachefiles.py b/src/gpu/DNNMark/generate_cachefiles.py
new file mode 100755
index 0000000..dc151d0
--- /dev/null
+++ b/src/gpu/DNNMark/generate_cachefiles.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+
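+# Example usage (a hypothetical invocation; assumes the MIOpen cache and
+# kernel paths hardcoded below, which exist in the targeted Docker image):
+#   ./generate_cachefiles.py cachefiles.csv --num-cus 4 --gfx-version gfx803
+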
+import argparse
+import bz2
+import csv
+import hashlib
+import os
+import shlex
+import sqlite3
+import subprocess
+import tempfile
+from pathlib import Path
+
+
+def parseArgs():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('csv_file', type=str,
+                        help='File containing cache files to compile '
+                             'in the format of: filename, args')
+    parser.add_argument('--num-cus', default=4, type=int,
+                        help='Number of CUs in simulated GPU')
+    parser.add_argument('--gfx-version', default='gfx801',
+                        choices=['gfx801', 'gfx803'],
+                        help='gfx version of simulated GPU')
+
+    return parser.parse_args()
+
+
+def getDb(options):
+    db_name = f'{options.gfx_version}_{options.num_cus}.ukdb'
+    db_path = '/root/.cache/miopen/2.9.0/'
+
+    full_db_path = os.path.join(db_path, db_name)
+    # Should create file if it doesn't exist
+    # Does assume db_path exists, which it should in the Docker image
+    con = sqlite3.connect(full_db_path)
+
+    cur = con.cursor()
+
+    # Ripped from src/include/miopen/kern_db.hpp
+    cur.execute('''CREATE TABLE IF NOT EXISTS kern_db (
+                        id INTEGER PRIMARY KEY ASC,
+                        kernel_name TEXT NOT NULL,
+                        kernel_args TEXT NOT NULL,
+                        kernel_blob BLOB NOT NULL,
+                        kernel_hash TEXT NOT NULL,
+                        uncompressed_size INT NOT NULL);''')
+    cur.execute('''CREATE UNIQUE INDEX IF NOT EXISTS
+                    idx_kern_db ON kern_db (kernel_name, kernel_args);''')
+
+    return con
+
+
+def insertFiles(con, options):
+    miopen_kern_path = '/MIOpen/src/kernels'
+
+    extra_args = {'gfx801': '-Wno-everything -Xclang '
+                            '-target-feature -Xclang +code-object-v3',
+                  'gfx803': '-Wno-everything -Xclang '
+                            '-target-feature -Xclang +code-object-v3'}
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        with open(options.csv_file) as csvfile:
+            reader = csv.reader(csvfile)
+            for row in reader:
+                miopen_kern = row[0]
+                miopen_kern_full = os.path.join(miopen_kern_path, miopen_kern)
+                # We want to manually add the gfx version
+                # Additionally, everything after the gfx version isn't
+                # used in the database
+                # Explicitly add the leading space because that's used
+                # in the database
+                args = (f' {row[1].split("-mcpu")[0].strip()} '
+                        f'-mcpu={options.gfx_version}')
+
+                # Hash to generate unique output files
+                file_hash = hashlib.md5(args.encode('utf-8')).hexdigest()
+                outfile = f'{miopen_kern}-{file_hash}.o'
+                full_outfile = os.path.join(tmpdir, outfile)
+
+                # Compile the kernel
+                cmd_str = (f'/opt/rocm/bin/clang-ocl {args} '
+                           f'{extra_args[options.gfx_version]} '
+                           f'{miopen_kern_full} -o {full_outfile}')
+                cmd_args = shlex.split(cmd_str)
+                subprocess.run(cmd_args, check=True)
+
+                # Get other params needed for db
+                uncompressed_file = open(full_outfile, 'rb').read()
+                uncompressed_size = Path(full_outfile).stat().st_size
+                uncompressed_hash = hashlib.md5(uncompressed_file).hexdigest()
+                compressed_blob = bz2.compress(uncompressed_file)
+
+                cur = con.cursor()
+                cur.execute('''INSERT OR IGNORE INTO kern_db
+                               (kernel_name, kernel_args, kernel_blob, kernel_hash, uncompressed_size)
+                               VALUES(?, ?, ?, ?, ?)''',
+                            (f'{miopen_kern}.o', args, compressed_blob,
+                                uncompressed_hash, uncompressed_size))
+
+
+def main():
+
+    args = parseArgs()
+
+    con = getDb(args)
+
+    insertFiles(con, args)
+
+    con.commit()
+    con.close()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/DNNMark/generate_rand_data.cpp b/src/gpu/DNNMark/generate_rand_data.cpp
similarity index 100%
rename from src/DNNMark/generate_rand_data.cpp
rename to src/gpu/DNNMark/generate_rand_data.cpp
diff --git a/src/DNNMark/generate_rand_data.py b/src/gpu/DNNMark/generate_rand_data.py
similarity index 100%
rename from src/DNNMark/generate_rand_data.py
rename to src/gpu/DNNMark/generate_rand_data.py
diff --git a/src/DNNMark/run_dnnmark_template.sh b/src/gpu/DNNMark/run_dnnmark_template.sh
similarity index 100%
rename from src/DNNMark/run_dnnmark_template.sh
rename to src/gpu/DNNMark/run_dnnmark_template.sh
diff --git a/src/DNNMark/setup.sh b/src/gpu/DNNMark/setup.sh
similarity index 81%
rename from src/DNNMark/setup.sh
rename to src/gpu/DNNMark/setup.sh
index 30baf95..ebd2afc 100755
--- a/src/DNNMark/setup.sh
+++ b/src/gpu/DNNMark/setup.sh
@@ -23,9 +23,10 @@
 then
   MIOPEN_PATH=/opt/rocm/miopen
   ROCBLAS_PATH=/opt/rocm/rocblas
-  CXX=/opt/rocm/hcc/bin/hcc cmake \
+  CXX=/opt/rocm/bin/hipcc cmake \
     -DHCC_ENABLE=ON \
     -DMIOPEN_ROOT=${MIOPEN_PATH} \
     -DROCBLAS_ROOT=${ROCBLAS_PATH} \
+    -DCMAKE_PREFIX_PATH="/opt/rocm;/opt/rocm/lib/cmake/AMDDeviceLibs/;/opt/rocm/lib/cmake/amd_comgr/" \
     ..
 fi
diff --git a/src/DNNMark/tools/has_cuda_gpu.c b/src/gpu/DNNMark/tools/has_cuda_gpu.c
similarity index 100%
rename from src/DNNMark/tools/has_cuda_gpu.c
rename to src/gpu/DNNMark/tools/has_cuda_gpu.c
diff --git a/src/DNNMark/tools/parse_nvprof_csv_metrics_with_plot.py b/src/gpu/DNNMark/tools/parse_nvprof_csv_metrics_with_plot.py
similarity index 100%
rename from src/DNNMark/tools/parse_nvprof_csv_metrics_with_plot.py
rename to src/gpu/DNNMark/tools/parse_nvprof_csv_metrics_with_plot.py
diff --git a/src/DNNMark/tools/sanity_test.sh b/src/gpu/DNNMark/tools/sanity_test.sh
similarity index 100%
rename from src/DNNMark/tools/sanity_test.sh
rename to src/gpu/DNNMark/tools/sanity_test.sh
diff --git a/src/halo-finder/.gitignore b/src/gpu/halo-finder/.gitignore
similarity index 100%
rename from src/halo-finder/.gitignore
rename to src/gpu/halo-finder/.gitignore
diff --git a/src/halo-finder/Dockerfile b/src/gpu/halo-finder/Dockerfile
similarity index 100%
rename from src/halo-finder/Dockerfile
rename to src/gpu/halo-finder/Dockerfile
diff --git a/src/gpu/halo-finder/README.md b/src/gpu/halo-finder/README.md
new file mode 100644
index 0000000..cbfb685
--- /dev/null
+++ b/src/gpu/halo-finder/README.md
@@ -0,0 +1,61 @@
+---
+title: GCN3 HACC Test
+tags:
+    - x86
+    - amdgpu
+layout: default
+permalink: resources/hacc
+shortdoc: >
+    Resources to build a disk image with the GCN3 HACC (halo-finder) workload.
+---
+
+# Resource: halo-finder (HACC)
+
+[HACC](https://asc.llnl.gov/coral-2-benchmarks) is a DoE application designed to simulate the
+evolution of the universe by simulating the formation of structure in collisionless fluids
+under the influence of gravity. The halo-finder code can be GPU accelerated by using
+the code in RCBForceTree.cxx
+
+`src/gpu/halo-finder/src` contains the code required to build and run ForceTreeTest from `src/halo_finder` in the main HACC codebase.
+`src/gpu/halo-finder/src/dfft` contains the dfft code from `src/dfft` in the main HACC codebase.
+
+HACC can be used to test the GCN3-GPU model.
+
+Compiling HACC, compiling the GCN3_X86 gem5, and running HACC on gem5 depend on the gcn-gpu docker image, built from the `util/dockerfiles/gcn-gpu/Dockerfile` on the [gem5 stable branch](https://gem5.googlesource.com/public/gem5/+/refs/heads/stable).
+
+## Compilation and Running
+
+halo-finder requires certain libraries that aren't installed by default in the
+GCN3 docker container provided by gem5, as well as a properly configured
+environment, in order to build. We provide a Dockerfile that installs those
+libraries and sets up the environment.
+
+In order to test the GPU code in halo-finder, we compile and run ForceTreeTest.
+
+To build the Docker image and the benchmark:
+```
+cd src/gpu/halo-finder
+docker build -t <image_name> .
+docker run --rm -v ${PWD}:${PWD} -w ${PWD}/src -u $UID:$GID <image_name> make hip/ForceTreeTest
+```
+
+The binary is built for gfx801 by default and is placed at `src/gpu/halo-finder/src/hip/ForceTreeTest`.
+
+ForceTreeTest is a GPU application, which requires gem5 to be built with the GCN3_X86 architecture.
+To build GCN3_X86:
+```
+# Working directory is your gem5 directory
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID <image_name> scons -sQ -j$(nproc) build/GCN3_X86/gem5.opt
+```
+
+To run ForceTreeTest:
+```
+# Assuming gem5 and gem5-resources are in the working directory
+docker run --rm -v $PWD:$PWD -w $PWD -u $UID:$GID <image_name> gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n3 --benchmark-root=gem5-resources/src/gpu/halo-finder/src/hip -cForceTreeTest --options="0.5 0.1 64 0.1 1 N 12 rcb"
+```
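+
+In the command above (our reading of `apu_se.py`, stated as an assumption rather than documentation): `-n3` appears to set the number of simulated CPU cores, `--benchmark-root` points at the directory containing the binary, `-c` names the binary, and `--options` passes its command-line arguments.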
+
+## Pre-built binary
+
+<http://dist.gem5.org/dist/v21-1/test-progs/halo-finder/ForceTreeTest>
diff --git a/src/halo-finder/src/BGQCM.c b/src/gpu/halo-finder/src/BGQCM.c
similarity index 100%
rename from src/halo-finder/src/BGQCM.c
rename to src/gpu/halo-finder/src/BGQCM.c
diff --git a/src/halo-finder/src/BGQStep16.c b/src/gpu/halo-finder/src/BGQStep16.c
similarity index 100%
rename from src/halo-finder/src/BGQStep16.c
rename to src/gpu/halo-finder/src/BGQStep16.c
diff --git a/src/halo-finder/src/BHForceTree.cxx b/src/gpu/halo-finder/src/BHForceTree.cxx
similarity index 100%
rename from src/halo-finder/src/BHForceTree.cxx
rename to src/gpu/halo-finder/src/BHForceTree.cxx
diff --git a/src/halo-finder/src/BHForceTree.h b/src/gpu/halo-finder/src/BHForceTree.h
similarity index 100%
rename from src/halo-finder/src/BHForceTree.h
rename to src/gpu/halo-finder/src/BHForceTree.h
diff --git a/src/halo-finder/src/BasicDefinition.h b/src/gpu/halo-finder/src/BasicDefinition.h
similarity index 100%
rename from src/halo-finder/src/BasicDefinition.h
rename to src/gpu/halo-finder/src/BasicDefinition.h
diff --git a/src/halo-finder/src/CMakeLists.txt b/src/gpu/halo-finder/src/CMakeLists.txt
similarity index 100%
rename from src/halo-finder/src/CMakeLists.txt
rename to src/gpu/halo-finder/src/CMakeLists.txt
diff --git a/src/halo-finder/src/CosmoDefinition.h.in b/src/gpu/halo-finder/src/CosmoDefinition.h.in
similarity index 100%
rename from src/halo-finder/src/CosmoDefinition.h.in
rename to src/gpu/halo-finder/src/CosmoDefinition.h.in
diff --git a/src/halo-finder/src/CosmoHalo.h b/src/gpu/halo-finder/src/CosmoHalo.h
similarity index 100%
rename from src/halo-finder/src/CosmoHalo.h
rename to src/gpu/halo-finder/src/CosmoHalo.h
diff --git a/src/halo-finder/src/CosmoToGadget2.cxx b/src/gpu/halo-finder/src/CosmoToGadget2.cxx
similarity index 100%
rename from src/halo-finder/src/CosmoToGadget2.cxx
rename to src/gpu/halo-finder/src/CosmoToGadget2.cxx
diff --git a/src/halo-finder/src/Definition.h b/src/gpu/halo-finder/src/Definition.h
similarity index 100%
rename from src/halo-finder/src/Definition.h
rename to src/gpu/halo-finder/src/Definition.h
diff --git a/src/halo-finder/src/ForceLaw.cxx b/src/gpu/halo-finder/src/ForceLaw.cxx
similarity index 100%
rename from src/halo-finder/src/ForceLaw.cxx
rename to src/gpu/halo-finder/src/ForceLaw.cxx
diff --git a/src/halo-finder/src/ForceLaw.h b/src/gpu/halo-finder/src/ForceLaw.h
similarity index 100%
rename from src/halo-finder/src/ForceLaw.h
rename to src/gpu/halo-finder/src/ForceLaw.h
diff --git a/src/halo-finder/src/ForceTree.cxx b/src/gpu/halo-finder/src/ForceTree.cxx
similarity index 100%
rename from src/halo-finder/src/ForceTree.cxx
rename to src/gpu/halo-finder/src/ForceTree.cxx
diff --git a/src/halo-finder/src/ForceTree.h b/src/gpu/halo-finder/src/ForceTree.h
similarity index 100%
rename from src/halo-finder/src/ForceTree.h
rename to src/gpu/halo-finder/src/ForceTree.h
diff --git a/src/halo-finder/src/ForceTreeTest.cxx b/src/gpu/halo-finder/src/ForceTreeTest.cxx
similarity index 100%
rename from src/halo-finder/src/ForceTreeTest.cxx
rename to src/gpu/halo-finder/src/ForceTreeTest.cxx
diff --git a/src/halo-finder/src/InitialExchange.cxx b/src/gpu/halo-finder/src/InitialExchange.cxx
similarity index 100%
rename from src/halo-finder/src/InitialExchange.cxx
rename to src/gpu/halo-finder/src/InitialExchange.cxx
diff --git a/src/halo-finder/src/InitialExchange.h b/src/gpu/halo-finder/src/InitialExchange.h
similarity index 100%
rename from src/halo-finder/src/InitialExchange.h
rename to src/gpu/halo-finder/src/InitialExchange.h
diff --git a/src/halo-finder/src/Makefile b/src/gpu/halo-finder/src/Makefile
similarity index 100%
rename from src/halo-finder/src/Makefile
rename to src/gpu/halo-finder/src/Makefile
diff --git a/src/halo-finder/src/Message.cxx b/src/gpu/halo-finder/src/Message.cxx
similarity index 100%
rename from src/halo-finder/src/Message.cxx
rename to src/gpu/halo-finder/src/Message.cxx
diff --git a/src/halo-finder/src/Message.h b/src/gpu/halo-finder/src/Message.h
similarity index 100%
rename from src/halo-finder/src/Message.h
rename to src/gpu/halo-finder/src/Message.h
diff --git a/src/halo-finder/src/ParticleDistribute.cxx b/src/gpu/halo-finder/src/ParticleDistribute.cxx
similarity index 100%
rename from src/halo-finder/src/ParticleDistribute.cxx
rename to src/gpu/halo-finder/src/ParticleDistribute.cxx
diff --git a/src/halo-finder/src/ParticleDistribute.h b/src/gpu/halo-finder/src/ParticleDistribute.h
similarity index 100%
rename from src/halo-finder/src/ParticleDistribute.h
rename to src/gpu/halo-finder/src/ParticleDistribute.h
diff --git a/src/halo-finder/src/ParticleExchange.cxx b/src/gpu/halo-finder/src/ParticleExchange.cxx
similarity index 100%
rename from src/halo-finder/src/ParticleExchange.cxx
rename to src/gpu/halo-finder/src/ParticleExchange.cxx
diff --git a/src/halo-finder/src/ParticleExchange.h b/src/gpu/halo-finder/src/ParticleExchange.h
similarity index 100%
rename from src/halo-finder/src/ParticleExchange.h
rename to src/gpu/halo-finder/src/ParticleExchange.h
diff --git a/src/halo-finder/src/Partition.cxx b/src/gpu/halo-finder/src/Partition.cxx
similarity index 100%
rename from src/halo-finder/src/Partition.cxx
rename to src/gpu/halo-finder/src/Partition.cxx
diff --git a/src/halo-finder/src/Partition.h b/src/gpu/halo-finder/src/Partition.h
similarity index 100%
rename from src/halo-finder/src/Partition.h
rename to src/gpu/halo-finder/src/Partition.h
diff --git a/src/halo-finder/src/RCBForceTree.cxx b/src/gpu/halo-finder/src/RCBForceTree.cxx
similarity index 100%
rename from src/halo-finder/src/RCBForceTree.cxx
rename to src/gpu/halo-finder/src/RCBForceTree.cxx
diff --git a/src/halo-finder/src/RCBForceTree.h b/src/gpu/halo-finder/src/RCBForceTree.h
similarity index 100%
rename from src/halo-finder/src/RCBForceTree.h
rename to src/gpu/halo-finder/src/RCBForceTree.h
diff --git a/src/halo-finder/src/RCOForceTree.cxx b/src/gpu/halo-finder/src/RCOForceTree.cxx
similarity index 100%
rename from src/halo-finder/src/RCOForceTree.cxx
rename to src/gpu/halo-finder/src/RCOForceTree.cxx
diff --git a/src/halo-finder/src/RCOForceTree.h b/src/gpu/halo-finder/src/RCOForceTree.h
similarity index 100%
rename from src/halo-finder/src/RCOForceTree.h
rename to src/gpu/halo-finder/src/RCOForceTree.h
diff --git a/src/halo-finder/src/README b/src/gpu/halo-finder/src/README
similarity index 100%
rename from src/halo-finder/src/README
rename to src/gpu/halo-finder/src/README
diff --git a/src/halo-finder/src/Timer.cxx b/src/gpu/halo-finder/src/Timer.cxx
similarity index 100%
rename from src/halo-finder/src/Timer.cxx
rename to src/gpu/halo-finder/src/Timer.cxx
diff --git a/src/halo-finder/src/Timer.h b/src/gpu/halo-finder/src/Timer.h
similarity index 100%
rename from src/halo-finder/src/Timer.h
rename to src/gpu/halo-finder/src/Timer.h
diff --git a/src/halo-finder/src/Timings.cxx b/src/gpu/halo-finder/src/Timings.cxx
similarity index 100%
rename from src/halo-finder/src/Timings.cxx
rename to src/gpu/halo-finder/src/Timings.cxx
diff --git a/src/halo-finder/src/Timings.h b/src/gpu/halo-finder/src/Timings.h
similarity index 100%
rename from src/halo-finder/src/Timings.h
rename to src/gpu/halo-finder/src/Timings.h
diff --git a/src/halo-finder/src/bigchunk.c b/src/gpu/halo-finder/src/bigchunk.c
similarity index 100%
rename from src/halo-finder/src/bigchunk.c
rename to src/gpu/halo-finder/src/bigchunk.c
diff --git a/src/halo-finder/src/bigchunk.h b/src/gpu/halo-finder/src/bigchunk.h
similarity index 100%
rename from src/halo-finder/src/bigchunk.h
rename to src/gpu/halo-finder/src/bigchunk.h
diff --git a/src/halo-finder/src/cm_int.c b/src/gpu/halo-finder/src/cm_int.c
similarity index 100%
rename from src/halo-finder/src/cm_int.c
rename to src/gpu/halo-finder/src/cm_int.c
diff --git a/src/halo-finder/src/cudaUtil.h b/src/gpu/halo-finder/src/cudaUtil.h
similarity index 100%
rename from src/halo-finder/src/cudaUtil.h
rename to src/gpu/halo-finder/src/cudaUtil.h
diff --git a/src/halo-finder/src/dfft/Makefile b/src/gpu/halo-finder/src/dfft/Makefile
similarity index 100%
rename from src/halo-finder/src/dfft/Makefile
rename to src/gpu/halo-finder/src/dfft/Makefile
diff --git a/src/halo-finder/src/dfft/README b/src/gpu/halo-finder/src/dfft/README
similarity index 100%
rename from src/halo-finder/src/dfft/README
rename to src/gpu/halo-finder/src/dfft/README
diff --git a/src/halo-finder/src/dfft/active-schedule.c b/src/gpu/halo-finder/src/dfft/active-schedule.c
similarity index 100%
rename from src/halo-finder/src/dfft/active-schedule.c
rename to src/gpu/halo-finder/src/dfft/active-schedule.c
diff --git a/src/halo-finder/src/dfft/active-schedule.h b/src/gpu/halo-finder/src/dfft/active-schedule.h
similarity index 100%
rename from src/halo-finder/src/dfft/active-schedule.h
rename to src/gpu/halo-finder/src/dfft/active-schedule.h
diff --git a/src/halo-finder/src/dfft/allocator.hpp b/src/gpu/halo-finder/src/dfft/allocator.hpp
similarity index 100%
rename from src/halo-finder/src/dfft/allocator.hpp
rename to src/gpu/halo-finder/src/dfft/allocator.hpp
diff --git a/src/halo-finder/src/dfft/comm-schedule.c b/src/gpu/halo-finder/src/dfft/comm-schedule.c
similarity index 100%
rename from src/halo-finder/src/dfft/comm-schedule.c
rename to src/gpu/halo-finder/src/dfft/comm-schedule.c
diff --git a/src/halo-finder/src/dfft/comm-schedule.h b/src/gpu/halo-finder/src/dfft/comm-schedule.h
similarity index 100%
rename from src/halo-finder/src/dfft/comm-schedule.h
rename to src/gpu/halo-finder/src/dfft/comm-schedule.h
diff --git a/src/halo-finder/src/dfft/complex-type.h b/src/gpu/halo-finder/src/dfft/complex-type.h
similarity index 100%
rename from src/halo-finder/src/dfft/complex-type.h
rename to src/gpu/halo-finder/src/dfft/complex-type.h
diff --git a/src/halo-finder/src/dfft/cross.hpp b/src/gpu/halo-finder/src/dfft/cross.hpp
similarity index 100%
rename from src/halo-finder/src/dfft/cross.hpp
rename to src/gpu/halo-finder/src/dfft/cross.hpp
diff --git a/src/halo-finder/src/dfft/cycle.h b/src/gpu/halo-finder/src/dfft/cycle.h
similarity index 100%
rename from src/halo-finder/src/dfft/cycle.h
rename to src/gpu/halo-finder/src/dfft/cycle.h
diff --git a/src/halo-finder/src/dfft/dfft.hpp b/src/gpu/halo-finder/src/dfft/dfft.hpp
similarity index 100%
rename from src/halo-finder/src/dfft/dfft.hpp
rename to src/gpu/halo-finder/src/dfft/dfft.hpp
diff --git a/src/halo-finder/src/dfft/dims.c b/src/gpu/halo-finder/src/dfft/dims.c
similarity index 100%
rename from src/halo-finder/src/dfft/dims.c
rename to src/gpu/halo-finder/src/dfft/dims.c
diff --git a/src/halo-finder/src/dfft/dims.h b/src/gpu/halo-finder/src/dfft/dims.h
similarity index 100%
rename from src/halo-finder/src/dfft/dims.h
rename to src/gpu/halo-finder/src/dfft/dims.h
diff --git a/src/halo-finder/src/dfft/distribution.c b/src/gpu/halo-finder/src/dfft/distribution.c
similarity index 100%
rename from src/halo-finder/src/dfft/distribution.c
rename to src/gpu/halo-finder/src/dfft/distribution.c
diff --git a/src/halo-finder/src/dfft/distribution.h b/src/gpu/halo-finder/src/dfft/distribution.h
similarity index 100%
rename from src/halo-finder/src/dfft/distribution.h
rename to src/gpu/halo-finder/src/dfft/distribution.h
diff --git a/src/halo-finder/src/dfft/distribution.hpp b/src/gpu/halo-finder/src/dfft/distribution.hpp
similarity index 100%
rename from src/halo-finder/src/dfft/distribution.hpp
rename to src/gpu/halo-finder/src/dfft/distribution.hpp
diff --git a/src/halo-finder/src/dfft/fp.h b/src/gpu/halo-finder/src/dfft/fp.h
similarity index 100%
rename from src/halo-finder/src/dfft/fp.h
rename to src/gpu/halo-finder/src/dfft/fp.h
diff --git a/src/halo-finder/src/dfft/include.mk b/src/gpu/halo-finder/src/dfft/include.mk
similarity index 100%
rename from src/halo-finder/src/dfft/include.mk
rename to src/gpu/halo-finder/src/dfft/include.mk
diff --git a/src/halo-finder/src/dfft/pencil.mk b/src/gpu/halo-finder/src/dfft/pencil.mk
similarity index 100%
rename from src/halo-finder/src/dfft/pencil.mk
rename to src/gpu/halo-finder/src/dfft/pencil.mk
diff --git a/src/halo-finder/src/dfft/plot.sh b/src/gpu/halo-finder/src/dfft/plot.sh
similarity index 100%
rename from src/halo-finder/src/dfft/plot.sh
rename to src/gpu/halo-finder/src/dfft/plot.sh
diff --git a/src/halo-finder/src/dfft/solver.hpp b/src/gpu/halo-finder/src/dfft/solver.hpp
similarity index 100%
rename from src/halo-finder/src/dfft/solver.hpp
rename to src/gpu/halo-finder/src/dfft/solver.hpp
diff --git a/src/halo-finder/src/dfft/subarray.c b/src/gpu/halo-finder/src/dfft/subarray.c
similarity index 100%
rename from src/halo-finder/src/dfft/subarray.c
rename to src/gpu/halo-finder/src/dfft/subarray.c
diff --git a/src/halo-finder/src/dims-local.c b/src/gpu/halo-finder/src/dims-local.c
similarity index 100%
rename from src/halo-finder/src/dims-local.c
rename to src/gpu/halo-finder/src/dims-local.c
diff --git a/src/halo-finder/src/include.mk b/src/gpu/halo-finder/src/include.mk
similarity index 100%
rename from src/halo-finder/src/include.mk
rename to src/gpu/halo-finder/src/include.mk
diff --git a/src/halo-finder/src/log.txt b/src/gpu/halo-finder/src/log.txt
similarity index 100%
rename from src/halo-finder/src/log.txt
rename to src/gpu/halo-finder/src/log.txt
diff --git a/src/halo-finder/src/rru_mpi.h b/src/gpu/halo-finder/src/rru_mpi.h
similarity index 100%
rename from src/halo-finder/src/rru_mpi.h
rename to src/gpu/halo-finder/src/rru_mpi.h
diff --git a/src/halo-finder/src/winDirent.h b/src/gpu/halo-finder/src/winDirent.h
similarity index 100%
rename from src/halo-finder/src/winDirent.h
rename to src/gpu/halo-finder/src/winDirent.h
diff --git a/src/heterosync/.gitignore b/src/gpu/heterosync/.gitignore
similarity index 100%
rename from src/heterosync/.gitignore
rename to src/gpu/heterosync/.gitignore
diff --git a/src/heterosync/LICENSE.txt b/src/gpu/heterosync/LICENSE.txt
similarity index 100%
rename from src/heterosync/LICENSE.txt
rename to src/gpu/heterosync/LICENSE.txt
diff --git a/src/heterosync/Makefile b/src/gpu/heterosync/Makefile
similarity index 72%
rename from src/heterosync/Makefile
rename to src/gpu/heterosync/Makefile
index a1aaad4..4eb34cf 100644
--- a/src/heterosync/Makefile
+++ b/src/gpu/heterosync/Makefile
@@ -13,10 +13,7 @@
 
 # gfx8 has a different number of bits it uses for sleeps, so compile accordingly
 release-gfx8: $(SRC) | $(BIN_DIR)
-	$(HIP_PATH)/bin/hipcc --amdgpu-target=gfx803 $(SRC) -o $(BIN_DIR)/$(EXECUTABLE)
-
-release-gfx8-apu: $(SRC) | $(BIN_DIR)
-	$(HIP_PATH)/bin/hipcc --amdgpu-target=gfx801 $(SRC) -o $(BIN_DIR)/$(EXECUTABLE)
+	$(HIP_PATH)/bin/hipcc --amdgpu-target=gfx803,gfx801 $(SRC) -o $(BIN_DIR)/$(EXECUTABLE)
 
 debug: $(SRC) | $(BIN_DIR)
 	$(HIP_PATH)/bin/hipcc -DDEBUG -g -O0 $(SRC) -o $(BIN_DIR)/$(EXECUTABLE).debug
diff --git a/src/gpu/heterosync/README.md b/src/gpu/heterosync/README.md
new file mode 100644
index 0000000..bba6547
--- /dev/null
+++ b/src/gpu/heterosync/README.md
@@ -0,0 +1,185 @@
+---
+title: GCN3 HeteroSync Tests
+tags:
+    - x86
+    - amdgpu
+layout: default
+permalink: resources/heterosync
+shortdoc: >
+    Resources to build a disk image with the GCN3 HeteroSync workloads.
+---
+
+# Resource: HeteroSync
+
+[HeteroSync](https://github.com/mattsinc/heterosync) is a benchmark suite used
+to test the performance of various types of fine-grained synchronization on
+tightly-coupled GPUs. The version in gem5-resources contains only the HIP code.
+
+Below, the README details the various synchronization primitives and the
+command-line arguments for use with HeteroSync.
+
+## Compilation
+```
+cd src/gpu/heterosync
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make release-gfx8
+```
+
+The release-gfx8 target builds for gfx801, a GCN3-based APU, and gfx803, a
+GCN3-based dGPU. There are other targets (release) that build for GPU types
+that are currently unsupported in gem5.
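+
+Like the other GPU resources in this repository, HeteroSync runs in gem5 through `apu_se.py`. A hypothetical invocation is sketched below; the binary location and the workload arguments are illustrative assumptions, not a tested recipe:
+```
+# Assuming gem5 and gem5-resources are in the working directory, and that
+# the Makefile's BIN_DIR resolves to src/gpu/heterosync/bin
+docker run --rm -v $PWD:$PWD -w $PWD -u $UID:$GID gcr.io/gem5-test/gcn-gpu gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n3 --benchmark-root=gem5-resources/src/gpu/heterosync/bin -callSyncPrims-1kernel --options="sleepMutex 10 16 10"
+```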
+
+## Pre-built binary
+
+<http://dist.gem5.org/dist/v21-1/test-progs/heterosync/gcn3/allSyncPrims-1kernel>
+
+Information from original HeteroSync README included below:
+
+These files are provided AS IS, and can be improved in many aspects. While we performed some performance optimization, there is more to be done. We do not claim that this is the most optimal implementation. The code is presented as a representative case of CUDA and HIP implementations of these workloads only.  It is NOT meant to be interpreted as a definitive answer to how well this application can perform on GPUs, CUDA, or HIP.  If any of you are interested in improving the performance of these benchmarks, please let us know or submit a pull request on GitHub.
+
+BACKGROUND INFORMATION
+----------------------
+
+Structure: All of the HeteroSync microbenchmarks are run from a single main function.  Each of the microbenchmarks has a separate .cu (CUDA) file that contains the code for its lock and unlock functions.  In the HIP version, these files are header files, because of HIP's requirements for compilation.
+
+Contents: The following Synchronization Primitives (SyncPrims) microbenchmarks are included in HeteroSync:
+
+- Centralized Mutexes:
+	1.  Spin Mutex Lock: A fairly standard spin-lock implementation.  It repeatedly tries to obtain the lock.  This version has high contention and a lot of atomic accesses since all TBs are spinning on the same lock variable.
+	2.  Spin Mutex Lock with Backoff: Standard backoff version of a spin lock where they “sleep” for a short period of time between each unsuccessful acquire.  They use a linear backoff instead of exponential backoff.  On the first failed acquire they will “sleep” for I_min; every subsequent failed read will increase the “sleep” time (up to I_max).
+	3.  Fetch-and-Add (FA) Mutex Lock (similar to Ticket/Queue-style Locks): To make their spin lock fair and have a deterministic number of atomic accesses per operation they also implement this queue-style spin lock.  Every TB uses an atomic to get a "ticket" for when they'll get the lock.  The TBs poll the “current ticket” location until their turn arrives (when it does they acquire the lock).  FAMutex uses backoff in the polling section of this lock to reduce contention.
+	4.  Ring Buffer-based Sleeping Mutex Lock: Each TB places itself on the end of the buffer and repeatedly checks if it is now at the front of the buffer.  To unlock, they increment the head pointer.  In the original paper they found that performance is bad for this one because it requires more reads and writes, and accesses to the head pointer are serialized.
+- Centralized Semaphores:
+	1.  Spin Lock Semaphore: To approximate the "perform OP if > 0" feature of semaphores (on CPUs) they use atomicExch's to block the TB until the exchange returns true.  Requires more reads and writes on a GPU than a mutex.  Each TB sets the semaphore to the appropriate new values in the post and wait phases depending on the current capacity of the semaphore.
+	2.  Spin Lock Semaphore with Backoff: As with the mutexes, they add a linear backoff to decrease contention.  The backoff is only in the wait() phase because usually more TBs are waiting, not posting.
+- Barriers:
+	1.  Atomic Barrier: a two-stage atomic counter barrier.  There are several versions of this barrier: a tree barrier and a second version that exchanges data locally on a CU before joining the global tree barrier.
+	2.  Lock-Free Barrier: a decentralized, sleep-based approach that doesn't require atomics.  Each TB sets a flag in a distinct memory location.  Once all TBs have set their flag, each TB does an intra-block barrier between its warps.  Like the atomic barrier, there are two versions.
+
+All microbenchmarks access shared data that requires synchronization.
+
+A subsequent commit will add the Relaxed Atomics microbenchmarks discussed in our paper.
+
+USAGE
+-----
+
+Compilation:
+
+Since all of the microbenchmarks run from a single main function, users only need to compile the entire suite once in order to use any of the microbenchmarks.  You will need to set CUDA_DIR in the Makefile in order to properly compile the code.  To use HIP, you will need to set HIP_PATH for compilation to work correctly.
+
+Running:
+
+The usage of the microbenchmarks is as follows:
+
+./allSyncPrims-1kernel <syncPrim> <numLdSt> <numTBs> <numCSIters>
+
+<syncPrim> is a string that differs for each synchronization primitive to be run:
+	// Barriers use hybrid local-global synchronization
+	- atomicTreeBarrUniq - atomic tree barrier
+	- atomicTreeBarrUniqLocalExch - atomic tree barrier with local exchange
+	- lfTreeBarrUniq - lock-free tree barrier
+	- lfTreeBarrUniqLocalExch - lock-free tree barrier with local exchange
+	// global synchronization versions
+	- spinMutex - spin lock mutex
+	- spinMutexEBO - spin lock mutex with exponential backoff
+	- sleepMutex - decentralized ticket lock
+	- faMutex - centralized ticket lock (aka, fetch-and-add mutex)
+	- spinSem1 - spin lock semaphore, semaphore size 1
+	- spinSem2 - spin lock semaphore, semaphore size 2
+	- spinSem10 - spin lock semaphore, semaphore size 10
+	- spinSem120 - spin lock semaphore, semaphore size 120
+	- spinSemEBO1 - spin lock semaphore with exponential backoff, semaphore size 1
+	- spinSemEBO2 - spin lock semaphore with exponential backoff, semaphore size 2
+	- spinSemEBO10 - spin lock semaphore with exponential backoff, semaphore size 10
+	- spinSemEBO120 - spin lock semaphore with exponential backoff, semaphore size 120
+	// local synchronization versions
+	- spinMutexUniq - local spin lock mutex
+	- spinMutexEBOUniq - local spin lock mutex with exponential backoff
+	- sleepMutexUniq - local decentralized ticket lock
+	- faMutexUniq - local centralized ticket lock
+	- spinSemUniq1 - local spin lock semaphore, semaphore size 1
+	- spinSemUniq2 - local spin lock semaphore, semaphore size 2
+	- spinSemUniq10 - local spin lock semaphore, semaphore size 10
+	- spinSemUniq120 - local spin lock semaphore, semaphore size 120
+	- spinSemEBOUniq1 - local spin lock semaphore with exponential backoff, semaphore size 1
+	- spinSemEBOUniq2 - local spin lock semaphore with exponential backoff, semaphore size 2
+	- spinSemEBOUniq10 - local spin lock semaphore with exponential backoff, semaphore size 10
+	- spinSemEBOUniq120 - local spin lock semaphore with exponential backoff, semaphore size 120
+
+<numLdSt> is a positive integer representing how many loads and stores each thread will perform.  For the mutexes and semaphores, these accesses are all performed in the critical section.  For the barriers, these accesses use barriers to ensure that multiple threads are not accessing the same data.
+
+<numTBs> is a positive integer representing the number of thread blocks (TBs) to execute.  For many of the microbenchmarks (especially the barriers), this number needs to be divisible by the number of SMs on the GPU.
+
+<numCSIters> is a positive integer representing the number of iterations of the critical section.
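+
+For example, a hypothetical invocation (argument values chosen purely for illustration):
+
+./allSyncPrims-1kernel spinMutexEBO 10 16 100
+
+This runs the spin lock mutex with exponential backoff, with 16 TBs each performing 10 loads and stores in the critical section, for 100 critical section iterations.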
+
+IISWC '17 VERSION
+-----------------
+
+The version used in our IISWC '17 paper assumes a unified address space between the CPU and GPU.  Thus, it does not require any copies.  Moreover, this version is based on CUDA SDK 3.1 and HIP version 1.6, as these are the last versions of CUDA and HIP that are fully supported by GPGPU-Sim and gem5, respectively, as of the release.  Later versions of CUDA and HIP allow additional C++ features, which may simplify the code or allow other optimizations.  Finally, this version is designed to run in the DeNovo ecosystem, which simulates a unified address space with multiple CPU cores and GPU CUs using a combination of Simics, GEMS, Garnet, and GPGPU-Sim.  In this ecosystem, we assume an SC-for-DRF style memory consistency model.  SC-for-DRF's ordering requirements are enforced by the epilogues and atomic operations.  We assume that the epilogues will self-invalidate all valid data in the local (L1) caches and flush per-CU/core store buffers to write through or obtain ownership for dirty data.
+
+Similarly, to enforce the appropriate ordering requirements, we assume that the CUDA and HIP atomic operations have specific semantics:
+
+| Atomic     | Reprogrammed? | Load Acquire | Store Release | Unpaired   |
+|------------|---------------|--------------|---------------|------------|
+| atomicAdd  |               |              |               | X (LD, ST) |
+| atomicSub  |               |              |               | X (LD, ST) |
+| atomicExch | X             |              | X (ST)        |            |
+| atomicMin  |               |              |               | X (LD, ST) |
+| atomicMax  |               |              |               | X (LD, ST) |
+| atomicInc  |               |              | X (ST)        | X (LD)     |
+| atomicDec  |               |              | X (ST)        | X (LD)     |
+| atomicCAS  |               | X (LD)       |               | X (ST)     |
+| atomicAnd  | X             |              |               | X (LD, ST) |
+| atomicOr   | X             |              |               | X (LD, ST) |
+| atomicXor  | X             | X (LD)       |               |            |
+
+If your ecosystem does not make the same assumptions, then you will need to add the appropriate fences (e.g., CUDA's __threadfence() and __threadfence_block()) to ensure the proper ordering of requests in the memory system.  In the case of the HIP implementation, you may be able to use some OpenCL atomics with the desired orderings, but we left it as is to ensure portability and correctness with future versions of HIP that may not support this feature.
+
+Reprogrammed Atomics:
+
+In addition to the above assumptions about semantics for a given atomic, we have also reprogrammed some of the CUDA atomics to provide certain functionality we needed that CUDA doesn't provide:
+
+- atomicAnd() was reprogrammed to have the same functionality as an atomicInc() but without store release semantics (i.e., atomicInc has store release semantics, atomicAnd does not).  We chose atomicAnd() for this because it was not used in any of our applications.  This change was necessary because atomicInc() sometimes needs store release semantics.
+- atomicXor() was reprogrammed to do an atomic load (instead of an atomic RMW).
+- atomicOr() was reprogrammed to do an (unpaired) atomic store (instead of an atomic RMW).  We chose atomicOr for the symmetry with atomicXor and because no applications used it.
+- atomicExch() was not reprogrammed in the simulator, but we have re-purposed it assuming that the value returned by the atomicExch() is never returned or used in the program.  This allows us to treat atomicExch() as if it were an atomic store.  Thus, the programmer should consider an atomicExch() to be an atomic store.  All of the applications we have encountered thus far already did this.  In the simulator, we account for the read on the timing and functional sides.
+
+Instruction-Centric vs. Data-Centric:
+
+Common programming languages like C++ and OpenCL use a data-centric approach.  These languages identify atomic accesses by “tagging” a variable with the atomic qualifier.  These languages use an instruction-centric method for identifying which atomic accesses can/should use relaxed atomics instead of SC atomics; the accesses that can be relaxed have “memory_order_relaxed” appended to their accesses.  Since CUDA does not provide support for the same framework as C++ and OpenCL, we had to make a design decision about how to identify atomic accesses and how to identify which of those atomic accesses can use relaxed atomics vs. SC atomics.  We chose to use an instruction-centric method for identifying atomic vs. non-atomic accesses.  In this method, we designate certain CUDA atomic instructions as being load acquires, store releases, or unpaired (as denoted above).  Moreover, note that CUDA does not have direct support for atomic loads or stores.  HIP does support these, but only with OpenCL commands.
+
+CUDA UVM VERSION
+----------------
+
+The CUDA UVM version is based on CUDA SDK 6.0, and uses CUDA's unified virtual memory to avoid making explicit copies of some of the arrays and structures.  Unlike the IISWC '17 version, this version does not make any assumptions about the ordering atomics provide.  Nor does it require epilogues.  Instead, it adds the appropriate CUDA fence commands around atomic accesses to ensure the SC-for-DRF ordering is provided.  This version has been tested on a Pascal P100 GPU, but has not been tested as rigorously as the IISWC '17 version.
+
+HIP UVM VERSION
+----------------
+
+The HIP UVM version is based on HIP 1.6, and uses HIP's unified virtual memory to avoid making explicit copies of some of the arrays and structures.  Unlike the IISWC '17 version, this version does not make any assumptions about the ordering atomics provide.  Nor does it require epilogues.  Instead, it adds the appropriate HIP fence commands around atomic accesses to ensure the SC-for-DRF ordering is provided.  This version has been tested on a Vega 56 GPU, but has not been tested as rigorously as the IISWC '17 version.
+
+CITATION
+--------
+
+If you publish work that uses these benchmarks, please cite the following papers:
+
+1.  M. D. Sinclair, J. Alsop, and S. V. Adve, HeteroSync: A Benchmark Suite for Fine-Grained Synchronization on Tightly Coupled GPUs, in the IEEE International Symposium on Workload Characterization (IISWC), October 2017
+
+2.  J. A. Stuart and J. D. Owens, “Efficient Synchronization Primitives for GPUs,” CoRR, vol. abs/1110.4623, 2011
+
+ACKNOWLEDGEMENTS
+----------------
+
+This work was supported in part by a Qualcomm Innovation Fellowship for Sinclair, the National Science Foundation under grants CCF 13-02641 and CCF 16-19245, the Center for Future Architectures Research (C-FAR), a Semiconductor Research Corporation program sponsored by MARCO and DARPA, and the Center for Applications Driving Architectures (ADA), one of six centers of JUMP, a Semiconductor Research Corporation program co-sponsored by DARPA.
diff --git a/src/heterosync/README.txt b/src/gpu/heterosync/README.txt
similarity index 100%
rename from src/heterosync/README.txt
rename to src/gpu/heterosync/README.txt
diff --git a/src/heterosync/src/hipLocks.h b/src/gpu/heterosync/src/hipLocks.h
similarity index 93%
rename from src/heterosync/src/hipLocks.h
rename to src/gpu/heterosync/src/hipLocks.h
index 690ce4f..2a8dafd 100644
--- a/src/heterosync/src/hipLocks.h
+++ b/src/gpu/heterosync/src/hipLocks.h
@@ -7,9 +7,6 @@
 /*
   Shared sleep function.  Since s_sleep only takes in constants (between 1 and 128),
   need code to handle long tail.
-
-  Currently s_sleep is unsupported in gem5, so sleepFunc is commented out and
-  replaced with a spin in the lock implementations
  */
 inline __device__ void sleepFunc(int backoff) {
   int backoffCopy = backoff;
diff --git a/src/heterosync/src/hipLocksBarrier.h b/src/gpu/heterosync/src/hipLocksBarrier.h
similarity index 100%
rename from src/heterosync/src/hipLocksBarrier.h
rename to src/gpu/heterosync/src/hipLocksBarrier.h
diff --git a/src/heterosync/src/hipLocksBarrierAtomic.h b/src/gpu/heterosync/src/hipLocksBarrierAtomic.h
similarity index 98%
rename from src/heterosync/src/hipLocksBarrierAtomic.h
rename to src/gpu/heterosync/src/hipLocksBarrierAtomic.h
index a51a77f..38fbc9d 100644
--- a/src/heterosync/src/hipLocksBarrierAtomic.h
+++ b/src/gpu/heterosync/src/hipLocksBarrierAtomic.h
@@ -56,8 +56,7 @@
     // do exponential backoff to reduce the number of times we pound the global
     // barrier
     if (!*done) {
-      //sleepFunc(backoff);
-      for (int j = 0; j < backoff; ++j) { ; }
+      sleepFunc(backoff);
       __syncthreads();
     }
   }
diff --git a/src/heterosync/src/hipLocksBarrierFast.h b/src/gpu/heterosync/src/hipLocksBarrierFast.h
similarity index 100%
rename from src/heterosync/src/hipLocksBarrierFast.h
rename to src/gpu/heterosync/src/hipLocksBarrierFast.h
diff --git a/src/heterosync/src/hipLocksImpl.h b/src/gpu/heterosync/src/hipLocksImpl.h
similarity index 81%
rename from src/heterosync/src/hipLocksImpl.h
rename to src/gpu/heterosync/src/hipLocksImpl.h
index b04fb37..750de8f 100644
--- a/src/heterosync/src/hipLocksImpl.h
+++ b/src/gpu/heterosync/src/hipLocksImpl.h
@@ -28,13 +28,13 @@
   cpuLockData->mutexCount             = numMutexes;
   cpuLockData->semaphoreCount         = numSemaphores;
 
-  hipMalloc(&cpuLockData->barrierBuffers,   sizeof(unsigned int) * cpuLockData->arrayStride * 2);
+  hipHostMalloc(&cpuLockData->barrierBuffers,   sizeof(unsigned int) * cpuLockData->arrayStride * 2);
 
-  hipMalloc(&cpuLockData->mutexBuffers,     sizeof(int) * cpuLockData->arrayStride * cpuLockData->mutexCount);
-  hipMalloc(&cpuLockData->mutexBufferHeads, sizeof(unsigned int) * cpuLockData->mutexCount);
-  hipMalloc(&cpuLockData->mutexBufferTails, sizeof(unsigned int) * cpuLockData->mutexCount);
+  hipHostMalloc(&cpuLockData->mutexBuffers,     sizeof(int) * cpuLockData->arrayStride * cpuLockData->mutexCount);
+  hipHostMalloc(&cpuLockData->mutexBufferHeads, sizeof(unsigned int) * cpuLockData->mutexCount);
+  hipHostMalloc(&cpuLockData->mutexBufferTails, sizeof(unsigned int) * cpuLockData->mutexCount);
 
-  hipMalloc(&cpuLockData->semaphoreBuffers, sizeof(unsigned int) * 4 * cpuLockData->semaphoreCount);
+  hipHostMalloc(&cpuLockData->semaphoreBuffers, sizeof(unsigned int) * 4 * cpuLockData->semaphoreCount);
 
   hipErr = hipGetLastError();
   checkError(hipErr, "Before memsets");
@@ -81,11 +81,11 @@
 hipError_t hipLocksDestroy()
 {
   if (cpuLockData == NULL) { return hipErrorInitializationError; }
-  hipFree(cpuLockData->mutexBuffers);
-  hipFree(cpuLockData->mutexBufferHeads);
-  hipFree(cpuLockData->mutexBufferTails);
+  hipHostFree(cpuLockData->mutexBuffers);
+  hipHostFree(cpuLockData->mutexBufferHeads);
+  hipHostFree(cpuLockData->mutexBufferTails);
 
-  hipFree(cpuLockData->semaphoreBuffers);
+  hipHostFree(cpuLockData->semaphoreBuffers);
 
   hipHostFree(cpuLockData);
 
diff --git a/src/heterosync/src/hipLocksMutex.h b/src/gpu/heterosync/src/hipLocksMutex.h
similarity index 100%
rename from src/heterosync/src/hipLocksMutex.h
rename to src/gpu/heterosync/src/hipLocksMutex.h
diff --git a/src/heterosync/src/hipLocksMutexEBO.h b/src/gpu/heterosync/src/hipLocksMutexEBO.h
similarity index 95%
rename from src/heterosync/src/hipLocksMutexEBO.h
rename to src/gpu/heterosync/src/hipLocksMutexEBO.h
index 0adaac0..69ab38d 100644
--- a/src/heterosync/src/hipLocksMutexEBO.h
+++ b/src/gpu/heterosync/src/hipLocksMutexEBO.h
@@ -43,8 +43,7 @@
       {
         // if we failed in acquiring the lock, wait for a little while before
         // trying again
-        //sleepFunc(backoff);
-        for (int j = 0; j < backoff; ++j) { ; }
+        sleepFunc(backoff);
         // (capped) exponential backoff
         backoff = (((backoff << 1) + 1) & (MAX_BACKOFF-1));
       }
@@ -100,8 +99,7 @@
       {
         // if we failed in acquiring the lock, wait for a little while before
         // trying again
-        //sleepFunc(backoff);
-        for (int j = 0; j < backoff; ++j) { ; }
+        sleepFunc(backoff);
         // (capped) exponential backoff
         backoff = (((backoff << 1) + 1) & (MAX_BACKOFF-1));
       }
diff --git a/src/heterosync/src/hipLocksMutexFA.h b/src/gpu/heterosync/src/hipLocksMutexFA.h
similarity index 100%
rename from src/heterosync/src/hipLocksMutexFA.h
rename to src/gpu/heterosync/src/hipLocksMutexFA.h
diff --git a/src/heterosync/src/hipLocksMutexSleep.h b/src/gpu/heterosync/src/hipLocksMutexSleep.h
similarity index 98%
rename from src/heterosync/src/hipLocksMutexSleep.h
rename to src/gpu/heterosync/src/hipLocksMutexSleep.h
index b9a1461..c49d401 100644
--- a/src/heterosync/src/hipLocksMutexSleep.h
+++ b/src/gpu/heterosync/src/hipLocksMutexSleep.h
@@ -79,8 +79,7 @@
       {
         // if we failed in acquiring the lock, wait for a little while before
         // trying again
-        //sleepFunc(backoff);
-        for (int j = 0; j < backoff; ++j) { ; }
+        sleepFunc(backoff);
         // (capped) exponential backoff
         backoff = (((backoff << 1) + 1) & (MAX_BACKOFF-1));
       }
diff --git a/src/heterosync/src/hipLocksMutexSpin.h b/src/gpu/heterosync/src/hipLocksMutexSpin.h
similarity index 100%
rename from src/heterosync/src/hipLocksMutexSpin.h
rename to src/gpu/heterosync/src/hipLocksMutexSpin.h
diff --git a/src/heterosync/src/hipLocksSemaphore.h b/src/gpu/heterosync/src/hipLocksSemaphore.h
similarity index 100%
rename from src/heterosync/src/hipLocksSemaphore.h
rename to src/gpu/heterosync/src/hipLocksSemaphore.h
diff --git a/src/heterosync/src/hipLocksSemaphoreEBO.h b/src/gpu/heterosync/src/hipLocksSemaphoreEBO.h
similarity index 98%
rename from src/heterosync/src/hipLocksSemaphoreEBO.h
rename to src/gpu/heterosync/src/hipLocksSemaphoreEBO.h
index 0128de3..69520be 100644
--- a/src/heterosync/src/hipLocksSemaphoreEBO.h
+++ b/src/gpu/heterosync/src/hipLocksSemaphoreEBO.h
@@ -162,8 +162,7 @@
     {
       // if we failed to enter the semaphore, wait for a little while before
       // trying again
-      //sleepFunc(backoff);
-      for (int j = 0; j < backoff; ++j) { ; }
+      sleepFunc(backoff);
       /*
         for writers increase backoff a lot because failing means readers are in
         the CS currently -- most important for non-unique because all WGs on
@@ -385,8 +384,7 @@
         if we failed to enter the semaphore, wait for a little while before
         trying again
       */
-      //sleepFunc(backoff);
-      for (int j = 0; j < backoff; ++j) { ; }
+      sleepFunc(backoff);
       // (capped) exponential backoff
       backoff = (((backoff << 1) + 1) & (MAX_BACKOFF-1));
     }
diff --git a/src/heterosync/src/hipLocksSemaphoreSpin.h b/src/gpu/heterosync/src/hipLocksSemaphoreSpin.h
similarity index 100%
rename from src/heterosync/src/hipLocksSemaphoreSpin.h
rename to src/gpu/heterosync/src/hipLocksSemaphoreSpin.h
diff --git a/src/heterosync/src/hip_error.h b/src/gpu/heterosync/src/hip_error.h
similarity index 100%
rename from src/heterosync/src/hip_error.h
rename to src/gpu/heterosync/src/hip_error.h
diff --git a/src/heterosync/src/main.hip.cpp b/src/gpu/heterosync/src/main.hip.cpp
similarity index 95%
rename from src/heterosync/src/main.hip.cpp
rename to src/gpu/heterosync/src/main.hip.cpp
index e4a2d0f..db38cb5 100644
--- a/src/heterosync/src/main.hip.cpp
+++ b/src/gpu/heterosync/src/main.hip.cpp
@@ -1269,7 +1269,8 @@
     The atomic barrier per-CU synchronization fits inside the lock-free size
     requirements so we can reuse the same locations.
   */
-  unsigned int * perCUBarriers = (unsigned int *)malloc(sizeof(unsigned int) * (NUM_CU * MAX_WGS * 2));
+  unsigned int * perCUBarriers;
+  hipHostMalloc(&perCUBarriers, sizeof(unsigned int) * (NUM_CU * MAX_WGS * 2));
 
   int numLocsMult = 0;
   // barriers and unique semaphores have numWGs WGs accessing unique locations
@@ -1298,7 +1299,8 @@
   assert(numUniqLocsAccPerWG > 0);
   int numStorageLocs = (numLocsMult * numUniqLocsAccPerWG);
   assert(numStorageLocs > 0);
-  float * storage = (float *)malloc(sizeof(float) * numStorageLocs);
+  float * storage;
+  hipHostMalloc(&storage, sizeof(float) * numStorageLocs);
 
   fprintf(stdout, "# WGs: %d, # Ld/St: %d, # Locs Mult: %d, # Uniq Locs/WG: %d, # Storage Locs: %d\n", numWGs, NUM_LDST, numLocsMult, numUniqLocsAccPerWG, numStorageLocs);
 
@@ -1307,17 +1309,6 @@
   // initialize per-CU barriers to 0's
   for (int i = 0; i < (NUM_CU * MAX_WGS * 2); ++i) { perCUBarriers[i] = 0; }
 
-  // gpu copies of storage and perCUBarriers
-  //float elapsedTime = 0.0f;
-  unsigned int * perCUBarriers_d = NULL;
-  float * storage_d = NULL;
-
-  hipMalloc(&perCUBarriers_d, sizeof(unsigned int) * (NUM_CU * MAX_WGS * 2));
-  hipMalloc(&storage_d, sizeof(float) * numStorageLocs);
-
-  hipMemcpy(perCUBarriers_d, perCUBarriers, sizeof(unsigned int) * (NUM_CU * MAX_WGS * 2), hipMemcpyHostToDevice);
-  hipMemcpy(storage_d, storage, sizeof(float) * numStorageLocs, hipMemcpyHostToDevice);
-
   // lock variables
   hipMutex_t spinMutex, faMutex, sleepMutex, eboMutex;
   hipMutex_t spinMutex_uniq, faMutex_uniq, sleepMutex_uniq, eboMutex_uniq;
@@ -1479,52 +1470,52 @@
 
   switch (syncPrim) {
     case 0: // atomic tree barrier
-      invokeAtomicTreeBarrier(storage_d, perCUBarriers_d, NUM_ITERS);
+      invokeAtomicTreeBarrier(storage, perCUBarriers, NUM_ITERS);
       break;
     case 1: // atomic tree barrier with local exchange
-      invokeAtomicTreeBarrierLocalExch(storage_d, perCUBarriers_d, NUM_ITERS);
+      invokeAtomicTreeBarrierLocalExch(storage, perCUBarriers, NUM_ITERS);
       break;
     case 2: // lock-free barrier
-      invokeFBSTreeBarrier(storage_d, perCUBarriers_d, NUM_ITERS);
+      invokeFBSTreeBarrier(storage, perCUBarriers, NUM_ITERS);
       break;
     case 3: // lock-free barrier with local exchange
-      invokeFBSTreeBarrierLocalExch(storage_d, perCUBarriers_d, NUM_ITERS);
+      invokeFBSTreeBarrierLocalExch(storage, perCUBarriers, NUM_ITERS);
       break;
     case 4: // Spin Lock Mutex
-      invokeSpinLockMutex   (spinMutex,  storage_d, NUM_ITERS);
+      invokeSpinLockMutex   (spinMutex,  storage, NUM_ITERS);
       break;
     case 5: // Spin Lock Mutex with backoff
-      invokeEBOMutex        (eboMutex,   storage_d, NUM_ITERS);
+      invokeEBOMutex        (eboMutex,   storage, NUM_ITERS);
       break;
     case 6: // Sleeping Mutex
-      invokeSleepingMutex   (sleepMutex, storage_d, NUM_ITERS);
+      invokeSleepingMutex   (sleepMutex, storage, NUM_ITERS);
       break;
     case 7: // fetch-and-add mutex
-      invokeFetchAndAddMutex(faMutex,    storage_d, NUM_ITERS);
+      invokeFetchAndAddMutex(faMutex,    storage, NUM_ITERS);
       break;
     case 8: // spin semaphore (1)
-      invokeSpinLockSemaphore(spinSem1,   storage_d,   1, NUM_ITERS, numStorageLocs);
+      invokeSpinLockSemaphore(spinSem1,   storage,   1, NUM_ITERS, numStorageLocs);
       break;
     case 9: // spin semaphore (2)
-      invokeSpinLockSemaphore(spinSem2,   storage_d,   2, NUM_ITERS, numStorageLocs);
+      invokeSpinLockSemaphore(spinSem2,   storage,   2, NUM_ITERS, numStorageLocs);
       break;
     case 10: // spin semaphore (10)
-      invokeSpinLockSemaphore(spinSem10,   storage_d,   10, NUM_ITERS, numStorageLocs);
+      invokeSpinLockSemaphore(spinSem10,   storage,   10, NUM_ITERS, numStorageLocs);
       break;
     case 11: // spin semaphore (120)
-      invokeSpinLockSemaphore(spinSem120,   storage_d,   120, NUM_ITERS, numStorageLocs);
+      invokeSpinLockSemaphore(spinSem120,   storage,   120, NUM_ITERS, numStorageLocs);
       break;
     case 12: // spin semaphore with backoff (1)
-      invokeEBOSemaphore(eboSem1,   storage_d,     1, NUM_ITERS, numStorageLocs);
+      invokeEBOSemaphore(eboSem1,   storage,     1, NUM_ITERS, numStorageLocs);
       break;
     case 13: // spin semaphore with backoff (2)
-      invokeEBOSemaphore(eboSem2,   storage_d,     2, NUM_ITERS, numStorageLocs);
+      invokeEBOSemaphore(eboSem2,   storage,     2, NUM_ITERS, numStorageLocs);
       break;
     case 14: // spin semaphore with backoff (10)
-      invokeEBOSemaphore(eboSem10,   storage_d,   10, NUM_ITERS, numStorageLocs);
+      invokeEBOSemaphore(eboSem10,   storage,   10, NUM_ITERS, numStorageLocs);
       break;
     case 15: // spin semaphore with backoff (120)
-      invokeEBOSemaphore(eboSem120,   storage_d, 120, NUM_ITERS, numStorageLocs);
+      invokeEBOSemaphore(eboSem120,   storage, 120, NUM_ITERS, numStorageLocs);
       break;
     // cases 16-19 reserved
     case 16:
@@ -1536,40 +1527,40 @@
     case 19:
       break;
     case 20: // Spin Lock Mutex (uniq)
-      invokeSpinLockMutex_uniq   (spinMutex_uniq,  storage_d, NUM_ITERS);
+      invokeSpinLockMutex_uniq   (spinMutex_uniq,  storage, NUM_ITERS);
       break;
     case 21: // Spin Lock Mutex with backoff (uniq)
-      invokeEBOMutex_uniq        (eboMutex_uniq,   storage_d, NUM_ITERS);
+      invokeEBOMutex_uniq        (eboMutex_uniq,   storage, NUM_ITERS);
       break;
     case 22: // Sleeping Mutex (uniq)
-      invokeSleepingMutex_uniq   (sleepMutex_uniq, storage_d, NUM_ITERS);
+      invokeSleepingMutex_uniq   (sleepMutex_uniq, storage, NUM_ITERS);
       break;
     case 23: // fetch-and-add mutex (uniq)
-      invokeFetchAndAddMutex_uniq(faMutex_uniq,    storage_d, NUM_ITERS);
+      invokeFetchAndAddMutex_uniq(faMutex_uniq,    storage, NUM_ITERS);
       break;
     case 24: // spin semaphore (1) (uniq)
-      invokeSpinLockSemaphore_uniq(spinSem1_uniq,   storage_d,   1, NUM_ITERS);
+      invokeSpinLockSemaphore_uniq(spinSem1_uniq,   storage,   1, NUM_ITERS);
       break;
     case 25: // spin semaphore (2) (uniq)
-      invokeSpinLockSemaphore_uniq(spinSem2_uniq,   storage_d,   2, NUM_ITERS);
+      invokeSpinLockSemaphore_uniq(spinSem2_uniq,   storage,   2, NUM_ITERS);
       break;
     case 26: // spin semaphore (10) (uniq)
-      invokeSpinLockSemaphore_uniq(spinSem10_uniq,   storage_d,   10, NUM_ITERS);
+      invokeSpinLockSemaphore_uniq(spinSem10_uniq,   storage,   10, NUM_ITERS);
       break;
     case 27: // spin semaphore (120) (uniq)
-      invokeSpinLockSemaphore_uniq(spinSem120_uniq,   storage_d,   120, NUM_ITERS);
+      invokeSpinLockSemaphore_uniq(spinSem120_uniq,   storage,   120, NUM_ITERS);
       break;
     case 28: // spin semaphore with backoff (1) (uniq)
-      invokeEBOSemaphore_uniq(eboSem1_uniq,   storage_d,     1, NUM_ITERS);
+      invokeEBOSemaphore_uniq(eboSem1_uniq,   storage,     1, NUM_ITERS);
       break;
     case 29: // spin semaphore with backoff (2) (uniq)
-      invokeEBOSemaphore_uniq(eboSem2_uniq,   storage_d,     2, NUM_ITERS);
+      invokeEBOSemaphore_uniq(eboSem2_uniq,   storage,     2, NUM_ITERS);
       break;
     case 30: // spin semaphore with backoff (10) (uniq)
-      invokeEBOSemaphore_uniq(eboSem10_uniq,   storage_d,   10, NUM_ITERS);
+      invokeEBOSemaphore_uniq(eboSem10_uniq,   storage,   10, NUM_ITERS);
       break;
     case 31: // spin semaphore with backoff (120) (uniq)
-      invokeEBOSemaphore_uniq(eboSem120_uniq,   storage_d, 120, NUM_ITERS);
+      invokeEBOSemaphore_uniq(eboSem120_uniq,   storage, 120, NUM_ITERS);
       break;
     // cases 32-36 reserved
     case 32:
@@ -1594,9 +1585,6 @@
   // NOTE: Can end simulation here if don't care about output checking
   hipDeviceSynchronize();
 
-  // copy results back to compare to golden
-  hipMemcpy(storage, storage_d, sizeof(float) * numStorageLocs, hipMemcpyDeviceToHost);
-
   // get golden results
   float storageGolden[numStorageLocs];
   int numLocsAccessed = 0, currLoc = 0;
@@ -1777,10 +1765,8 @@
 
   // free arrays
   hipLocksDestroy();
-  hipFree(storage_d);
-  hipFree(perCUBarriers_d);
-  free(storage);
-  free(perCUBarriers);
+  hipHostFree(storage);
+  hipHostFree(perCUBarriers);
 
   return 0;
 }
diff --git a/src/heterosync/src/syncPrims_results.sh b/src/gpu/heterosync/src/syncPrims_results.sh
similarity index 100%
rename from src/heterosync/src/syncPrims_results.sh
rename to src/gpu/heterosync/src/syncPrims_results.sh
diff --git a/src/hip-samples/.gitignore b/src/gpu/hip-samples/.gitignore
similarity index 100%
rename from src/hip-samples/.gitignore
rename to src/gpu/hip-samples/.gitignore
diff --git a/src/hip-samples/Makefile b/src/gpu/hip-samples/Makefile
similarity index 83%
rename from src/hip-samples/Makefile
rename to src/gpu/hip-samples/Makefile
index da12ce5..ee863b8 100644
--- a/src/hip-samples/Makefile
+++ b/src/gpu/hip-samples/Makefile
@@ -12,7 +12,7 @@
 all: $(EXECUTABLES)
 
 $(EXECUTABLES): %: $(SRC_DIR)/%.cpp | $(BIN_DIR)
-	$(HIPCC) $< -o $(BIN_DIR)/$@
+	$(HIPCC) --amdgpu-target=gfx801,gfx803 $< -o $(BIN_DIR)/$@
 
 $(BIN_DIR):
 	mkdir -p $@
diff --git a/src/gpu/hip-samples/README.md b/src/gpu/hip-samples/README.md
new file mode 100644
index 0000000..467705b
--- /dev/null
+++ b/src/gpu/hip-samples/README.md
@@ -0,0 +1,55 @@
+---
+title: GCN3 HIP-Samples Tests
+tags:
+    - x86
+    - amdgpu
+layout: default
+permalink: resources/hip-samples
+shortdoc: >
+    Resources to build a disk image with the GCN3 HIP-Sample-Applications workloads.
+---
+
+# Resource: HIP Sample Applications
+
+The [HIP sample apps](
+https://github.com/ROCm-Developer-Tools/HIP/tree/roc-1.6.0/samples) contain
+applications that introduce various GPU programming concepts that are usable
+in HIP.
+
+The samples cover topics such as using and accessing different parts of GPU
+memory, running multiple GPU streams, and optimization techniques for GPU code.
+
+Certain apps aren't included due to complexities with either ROCm or Docker
+(hipEvent, profiler), or due to a lack of feature support in gem5 (peer2peer).
+
+Compiling the HIP samples, compiling the GCN3_X86 gem5, and running the HIP samples on gem5 all depend on the gcn-gpu docker image, built from the `util/dockerfiles/gcn-gpu/Dockerfile` on the [gem5 stable branch](https://gem5.googlesource.com/public/gem5/+/refs/heads/stable).
+
+## Compilation
+
+```
+cd src/gpu/hip-samples
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make
+```
+
+Individual programs can be made by specifying the name of the program.
+
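+For example, to build only the MatrixTranspose sample:
+
+```
+cd src/gpu/hip-samples
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make MatrixTranspose
+```
+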
+By default, the apps are built for all supported GPU types (gfx801, gfx803).
+This can be changed by editing the --amdgpu-target argument in the Makefile.
+
+## Pre-built binary
+
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/2dshfl>
+
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/dynamic_shared>
+
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/inline_asm>
+
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/MatrixTranspose>
+
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/sharedMemory>
+
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/shfl>
+
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/stream>
+
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/unroll>
diff --git a/src/hip-samples/src/2dshfl.cpp b/src/gpu/hip-samples/src/2dshfl.cpp
similarity index 74%
rename from src/hip-samples/src/2dshfl.cpp
rename to src/gpu/hip-samples/src/2dshfl.cpp
index 1b22a0c..4e58cfb 100644
--- a/src/hip-samples/src/2dshfl.cpp
+++ b/src/gpu/hip-samples/src/2dshfl.cpp
@@ -36,8 +36,7 @@
 
 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
+__global__ void matrixTranspose(float *out,
                                 float *in,
                                 const int width)
 {
@@ -66,10 +65,8 @@
 int main() {
 
   float* Matrix;
-  float* TransposeMatrix;
   float* cpuTransposeMatrix;
 
-  float* gpuMatrix;
   float* gpuTransposeMatrix;
 
   hipDeviceProp_t devProp;
@@ -80,8 +77,7 @@
   int i;
   int errors;
 
-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  hipHostMalloc(&Matrix, NUM * sizeof(float));
   cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
 
   // initialize the input data
@@ -90,21 +86,15 @@
   }
 
   // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+  hipHostMalloc(&gpuTransposeMatrix, NUM * sizeof(float));
 
   // Launching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(1),
-                  dim3(THREADS_PER_BLOCK_X , THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+  hipLaunchKernelGGL(matrixTranspose,
+                     dim3(1),
+                     dim3(THREADS_PER_BLOCK_X , THREADS_PER_BLOCK_Y),
+                     0, 0,
+                     gpuTransposeMatrix , Matrix, WIDTH);
+  hipDeviceSynchronize();
 
   // CPU MatrixTranspose computation
   matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
@@ -113,8 +103,8 @@
   errors = 0;
   double eps = 1.0E-6;
   for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
-    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+    if (std::abs(gpuTransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
+    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],gpuTransposeMatrix[i]);
       errors++;
     }
   }
@@ -125,12 +115,10 @@
   }
 
   //free the resources on device side
-  hipFree(gpuMatrix);
   hipFree(gpuTransposeMatrix);
 
   //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
+  hipFree(Matrix);
   free(cpuTransposeMatrix);
 
   return errors;
diff --git a/src/hip-samples/src/MatrixTranspose.cpp b/src/gpu/hip-samples/src/MatrixTranspose.cpp
similarity index 74%
rename from src/hip-samples/src/MatrixTranspose.cpp
rename to src/gpu/hip-samples/src/MatrixTranspose.cpp
index 264fcbe..68741e2 100644
--- a/src/hip-samples/src/MatrixTranspose.cpp
+++ b/src/gpu/hip-samples/src/MatrixTranspose.cpp
@@ -37,8 +37,7 @@
 
 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
+__global__ void matrixTranspose(float *out,
                                 float *in,
                                 const int width)
 {
@@ -66,10 +65,8 @@
 int main() {
 
   float* Matrix;
-  float* TransposeMatrix;
   float* cpuTransposeMatrix;
 
-  float* gpuMatrix;
   float* gpuTransposeMatrix;
 
   hipDeviceProp_t devProp;
@@ -80,8 +77,7 @@
   int i;
   int errors;
 
-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  hipHostMalloc(&Matrix, NUM * sizeof(float));
   cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
 
   // initialize the input data
@@ -90,21 +86,15 @@
   }
 
   // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+  hipHostMalloc(&gpuTransposeMatrix, NUM * sizeof(float));
 
   // Launching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+  hipLaunchKernelGGL(matrixTranspose,
+                     dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                     0, 0,
+                     gpuTransposeMatrix , Matrix, WIDTH);
+  hipDeviceSynchronize();
 
   // CPU MatrixTranspose computation
   matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
@@ -113,7 +103,7 @@
   errors = 0;
   double eps = 1.0E-6;
   for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    if (std::abs(gpuTransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
       errors++;
     }
   }
@@ -124,12 +114,10 @@
   }
 
   //free the resources on device side
-  hipFree(gpuMatrix);
   hipFree(gpuTransposeMatrix);
 
   //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
+  hipFree(Matrix);
   free(cpuTransposeMatrix);
 
   return errors;
diff --git a/src/hip-samples/src/dynamic_shared.cpp b/src/gpu/hip-samples/src/dynamic_shared.cpp
similarity index 73%
rename from src/hip-samples/src/dynamic_shared.cpp
rename to src/gpu/hip-samples/src/dynamic_shared.cpp
index 22d7eb9..9627d3b 100644
--- a/src/hip-samples/src/dynamic_shared.cpp
+++ b/src/gpu/hip-samples/src/dynamic_shared.cpp
@@ -34,8 +34,7 @@
 
 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
+__global__ void matrixTranspose(float *out,
                                 float *in,
                                 const int width)
 {
@@ -70,10 +69,8 @@
 int main() {
 
   float* Matrix;
-  float* TransposeMatrix;
   float* cpuTransposeMatrix;
 
-  float* gpuMatrix;
   float* gpuTransposeMatrix;
 
   hipDeviceProp_t devProp;
@@ -84,8 +81,7 @@
   int i;
   int errors;
 
-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  hipHostMalloc(&Matrix, NUM * sizeof(float));
   cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
 
   // initialize the input data
@@ -94,21 +90,15 @@
   }
 
   // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+  hipHostMalloc(&gpuTransposeMatrix, NUM * sizeof(float));
 
   // Launching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                  sizeof(float)*WIDTH*WIDTH, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+  hipLaunchKernelGGL(matrixTranspose,
+                     dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                     sizeof(float)*WIDTH*WIDTH, 0,
+                     gpuTransposeMatrix , Matrix, WIDTH);
+  hipDeviceSynchronize();
 
   // CPU MatrixTranspose computation
   matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
@@ -117,8 +107,8 @@
   errors = 0;
   double eps = 1.0E-6;
   for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
-    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+    if (std::abs(gpuTransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],gpuTransposeMatrix[i]);
       errors++;
     }
   }
@@ -129,12 +119,10 @@
   }
 
   //free the resources on device side
-  hipFree(gpuMatrix);
   hipFree(gpuTransposeMatrix);
 
   //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
+  hipFree(Matrix);
   free(cpuTransposeMatrix);
 
   return errors;
diff --git a/src/hip-samples/src/inline_asm.cpp b/src/gpu/hip-samples/src/inline_asm.cpp
similarity index 73%
rename from src/hip-samples/src/inline_asm.cpp
rename to src/gpu/hip-samples/src/inline_asm.cpp
index f2345e5..5a8b628 100644
--- a/src/hip-samples/src/inline_asm.cpp
+++ b/src/gpu/hip-samples/src/inline_asm.cpp
@@ -35,8 +35,7 @@
 
 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
+__global__ void matrixTranspose(float *out,
                                 float *in,
                                 const int width)
 {
@@ -65,10 +64,8 @@
 int main() {
 
   float* Matrix;
-  float* TransposeMatrix;
   float* cpuTransposeMatrix;
 
-  float* gpuMatrix;
   float* gpuTransposeMatrix;
 
   hipDeviceProp_t devProp;
@@ -79,8 +76,7 @@
   int i;
   int errors;
 
-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  hipHostMalloc(&Matrix, NUM * sizeof(float));
   cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
 
   // initialize the input data
@@ -89,21 +85,15 @@
   }
 
   // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+  hipHostMalloc(&gpuTransposeMatrix, NUM * sizeof(float));
 
   // Launching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+  hipLaunchKernelGGL(matrixTranspose,
+                     dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                     0, 0,
+                     gpuTransposeMatrix , Matrix, WIDTH);
+  hipDeviceSynchronize();
 
   // CPU MatrixTranspose computation
   matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
@@ -112,8 +102,8 @@
   errors = 0;
   double eps = 1.0E-6;
   for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
-    printf("gpu%f cpu %f \n",TransposeMatrix[i],cpuTransposeMatrix[i]);
+    if (std::abs(gpuTransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    printf("gpu%f cpu %f \n",gpuTransposeMatrix[i],cpuTransposeMatrix[i]);
       errors++;
     }
   }
@@ -124,12 +114,10 @@
   }
 
   //free the resources on device side
-  hipFree(gpuMatrix);
   hipFree(gpuTransposeMatrix);
 
   //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
+  hipFree(Matrix);
   free(cpuTransposeMatrix);
 
   return errors;
diff --git a/src/hip-samples/src/sharedMemory.cpp b/src/gpu/hip-samples/src/sharedMemory.cpp
similarity index 73%
rename from src/hip-samples/src/sharedMemory.cpp
rename to src/gpu/hip-samples/src/sharedMemory.cpp
index 9b51aba..d88d18e 100644
--- a/src/hip-samples/src/sharedMemory.cpp
+++ b/src/gpu/hip-samples/src/sharedMemory.cpp
@@ -36,8 +36,7 @@
 
 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
+__global__ void matrixTranspose(float *out,
                                 float *in,
                                 const int width)
 {
@@ -71,10 +70,8 @@
 int main() {
 
   float* Matrix;
-  float* TransposeMatrix;
   float* cpuTransposeMatrix;
 
-  float* gpuMatrix;
   float* gpuTransposeMatrix;
 
   hipDeviceProp_t devProp;
@@ -85,8 +82,7 @@
   int i;
   int errors;
 
-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  hipHostMalloc(&Matrix, NUM * sizeof(float));
   cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
 
   // initialize the input data
@@ -95,21 +91,15 @@
   }
 
   // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+  hipHostMalloc(&gpuTransposeMatrix, NUM * sizeof(float));
 
   // Launching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+  hipLaunchKernelGGL(matrixTranspose,
+                     dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                     0, 0,
+                     gpuTransposeMatrix , Matrix, WIDTH);
+  hipDeviceSynchronize();
 
   // CPU MatrixTranspose computation
   matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
@@ -118,8 +108,8 @@
   errors = 0;
   double eps = 1.0E-6;
   for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
-    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+    if (std::abs(gpuTransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],gpuTransposeMatrix[i]);
       errors++;
     }
   }
@@ -130,12 +120,10 @@
   }
 
   //free the resources on device side
-  hipFree(gpuMatrix);
   hipFree(gpuTransposeMatrix);
 
   //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
+  hipFree(Matrix);
   free(cpuTransposeMatrix);
 
   return errors;
diff --git a/src/hip-samples/src/shfl.cpp b/src/gpu/hip-samples/src/shfl.cpp
similarity index 74%
rename from src/hip-samples/src/shfl.cpp
rename to src/gpu/hip-samples/src/shfl.cpp
index e0f4c21..d523ffb 100644
--- a/src/hip-samples/src/shfl.cpp
+++ b/src/gpu/hip-samples/src/shfl.cpp
@@ -36,8 +36,7 @@
 
 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
+__global__ void matrixTranspose(float *out,
                                 float *in,
                                 const int width)
 {
@@ -70,10 +69,8 @@
 int main() {
 
   float* Matrix;
-  float* TransposeMatrix;
   float* cpuTransposeMatrix;
 
-  float* gpuMatrix;
   float* gpuTransposeMatrix;
 
   hipDeviceProp_t devProp;
@@ -84,8 +81,7 @@
   int i;
   int errors;
 
-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  hipHostMalloc(&Matrix, NUM * sizeof(float));
   cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
 
   // initialize the input data
@@ -94,21 +90,15 @@
   }
 
   // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+  hipHostMalloc(&gpuTransposeMatrix, NUM * sizeof(float));
 
   // Launching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(1),
-                  dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+  hipLaunchKernelGGL(matrixTranspose,
+                     dim3(1),
+                     dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y),
+                     0, 0,
+                     gpuTransposeMatrix , Matrix, WIDTH);
+  hipDeviceSynchronize();
 
   // CPU MatrixTranspose computation
   matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
@@ -117,8 +107,8 @@
   errors = 0;
   double eps = 1.0E-6;
   for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
-    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+    if (std::abs(gpuTransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],gpuTransposeMatrix[i]);
       errors++;
     }
   }
@@ -129,12 +119,10 @@
   }
 
   //free the resources on device side
-  hipFree(gpuMatrix);
   hipFree(gpuTransposeMatrix);
 
   //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
+  hipFree(Matrix);
   free(cpuTransposeMatrix);
 
   return errors;
diff --git a/src/hip-samples/src/stream.cpp b/src/gpu/hip-samples/src/stream.cpp
similarity index 60%
rename from src/hip-samples/src/stream.cpp
rename to src/gpu/hip-samples/src/stream.cpp
index 2dc7544..c14759a 100644
--- a/src/hip-samples/src/stream.cpp
+++ b/src/gpu/hip-samples/src/stream.cpp
@@ -30,10 +30,9 @@
 
 using namespace std;
 
-__global__ void matrixTranspose_static_shared(hipLaunchParm lp,
-                                float *out,
-                                float *in,
-                                const int width)
+__global__ void matrixTranspose_static_shared(float *out,
+                                              float *in,
+                                              const int width)
 {
     __shared__ float sharedMem[WIDTH*WIDTH];
 
@@ -47,10 +46,9 @@
     out[y * width + x] = sharedMem[y * width + x];
 }
 
-__global__ void matrixTranspose_dynamic_shared(hipLaunchParm lp,
-                                float *out,
-                                float *in,
-                                const int width)
+__global__ void matrixTranspose_dynamic_shared(float *out,
+                                               float *in,
+                                               const int width)
 {
     // declare dynamic shared memory
     HIP_DYNAMIC_SHARED(float, sharedMem)
@@ -65,7 +63,7 @@
     out[y * width + x] = sharedMem[y * width + x];
 }
 
-void MultipleStream (float **data, float *randArray, float **gpuTransposeMatrix, float **TransposeMatrix, int width)
+void MultipleStream (float **data, float *randArray, float **gpuTransposeMatrix, int width)
 {
     const int num_streams = 2;
     hipStream_t streams[num_streams];
@@ -75,48 +73,42 @@
 
     for(int i=0;i<num_streams;i++)
     {
-        hipMalloc((void**)&data[i], NUM * sizeof(float));
+        hipHostMalloc(&data[i], NUM * sizeof(float));
         hipMemcpyAsync(data[i], randArray, NUM * sizeof(float), hipMemcpyHostToDevice,streams[i]);
     }
 
-    hipLaunchKernel(matrixTranspose_static_shared,
-                    dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                    0, streams[0],
-                    gpuTransposeMatrix[0], data[0], width);
+    hipLaunchKernelGGL(matrixTranspose_static_shared,
+                       dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                       dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                       0, streams[0],
+                       gpuTransposeMatrix[0], data[0], width);
 
-    hipLaunchKernel(matrixTranspose_dynamic_shared,
-                    dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                    sizeof(float)*WIDTH*WIDTH, streams[1],
-                    gpuTransposeMatrix[1], data[1], width);
-
-    for(int i=0;i<num_streams;i++)
-        hipMemcpyAsync(TransposeMatrix[i], gpuTransposeMatrix[i], NUM*sizeof(float), hipMemcpyDeviceToHost, streams[i]);
+    hipLaunchKernelGGL(matrixTranspose_dynamic_shared,
+                       dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                       dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                       sizeof(float)*WIDTH*WIDTH, streams[1],
+                       gpuTransposeMatrix[1], data[1], width);
 }
 
 int main(){
 
     hipSetDevice(0);
 
-    float *data[2], *TransposeMatrix[2], *gpuTransposeMatrix[2], *randArray;
+    float *data[2], *gpuTransposeMatrix[2], *randArray;
 
     int width = WIDTH;
 
     randArray = (float*)malloc(NUM * sizeof(float));
 
-    TransposeMatrix[0] = (float*)malloc(NUM * sizeof(float));
-    TransposeMatrix[1] = (float*)malloc(NUM * sizeof(float));
-
-    hipMalloc((void**)&gpuTransposeMatrix[0], NUM * sizeof(float));
-    hipMalloc((void**)&gpuTransposeMatrix[1], NUM * sizeof(float));
+    hipHostMalloc(&gpuTransposeMatrix[0], NUM * sizeof(float));
+    hipHostMalloc(&gpuTransposeMatrix[1], NUM * sizeof(float));
 
     for(int i = 0; i < NUM; i++)
     {
         randArray[i] = (float)i*1.0f;
     }
 
-    MultipleStream(data, randArray, gpuTransposeMatrix, TransposeMatrix, width);
+    MultipleStream(data, randArray, gpuTransposeMatrix, width);
 
     hipDeviceSynchronize();
 
@@ -124,9 +116,9 @@
     int errors = 0;
     double eps = 1.0E-6;
     for (int i = 0; i < NUM; i++) {
-        if (std::abs(TransposeMatrix[0][i] - TransposeMatrix[1][i]) > eps ) {
-        printf("%d stream0: %f stream1  %f\n",i,TransposeMatrix[0][i],TransposeMatrix[1][i]);
-        errors++;
+        if (std::abs(gpuTransposeMatrix[0][i] - gpuTransposeMatrix[1][i]) > eps ) {
+            printf("%d stream0: %f stream1  %f\n",i,gpuTransposeMatrix[0][i],gpuTransposeMatrix[1][i]);
+            errors++;
         }
     }
     if (errors!=0) {
@@ -139,7 +131,6 @@
     for(int i=0;i<2;i++){
        hipFree(data[i]);
        hipFree(gpuTransposeMatrix[i]);
-       free(TransposeMatrix[i]);
     }
 
     hipDeviceReset();
diff --git a/src/hip-samples/src/shfl.cpp b/src/gpu/hip-samples/src/unroll.cpp
similarity index 74%
copy from src/hip-samples/src/shfl.cpp
copy to src/gpu/hip-samples/src/unroll.cpp
index e0f4c21..6935c03 100644
--- a/src/hip-samples/src/shfl.cpp
+++ b/src/gpu/hip-samples/src/unroll.cpp
@@ -36,15 +36,14 @@
 
 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
+__global__ void matrixTranspose(float *out,
                                 float *in,
                                 const int width)
 {
     int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
-
     float val = in[x];
 
+#pragma unroll
     for(int i=0;i<width;i++)
     {
         for(int j=0;j<width;j++)
@@ -70,10 +69,8 @@
 int main() {
 
   float* Matrix;
-  float* TransposeMatrix;
   float* cpuTransposeMatrix;
 
-  float* gpuMatrix;
   float* gpuTransposeMatrix;
 
   hipDeviceProp_t devProp;
@@ -84,8 +81,7 @@
   int i;
   int errors;
 
-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  hipHostMalloc(&Matrix, NUM * sizeof(float));
   cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
 
   // initialize the input data
@@ -94,21 +90,15 @@
   }
 
   // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+  hipHostMalloc(&gpuTransposeMatrix, NUM * sizeof(float));
 
   // Launching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(1),
-                  dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+  hipLaunchKernelGGL(matrixTranspose,
+                     dim3(1),
+                     dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y),
+                     0, 0,
+                     gpuTransposeMatrix , Matrix, WIDTH);
+  hipDeviceSynchronize();
 
   // CPU MatrixTranspose computation
   matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
@@ -117,8 +107,8 @@
   errors = 0;
   double eps = 1.0E-6;
   for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
-    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+    if (std::abs(gpuTransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],gpuTransposeMatrix[i]);
       errors++;
     }
   }
@@ -129,12 +119,10 @@
   }
 
   //free the resources on device side
-  hipFree(gpuMatrix);
   hipFree(gpuTransposeMatrix);
 
   //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
+  hipFree(Matrix);
   free(cpuTransposeMatrix);
 
   return errors;
diff --git a/src/hsa-agent-pkt/HSA_Interface.cpp b/src/gpu/hsa-agent-pkt/HSA_Interface.cpp
similarity index 100%
rename from src/hsa-agent-pkt/HSA_Interface.cpp
rename to src/gpu/hsa-agent-pkt/HSA_Interface.cpp
diff --git a/src/hsa-agent-pkt/HSA_Interface.h b/src/gpu/hsa-agent-pkt/HSA_Interface.h
similarity index 100%
rename from src/hsa-agent-pkt/HSA_Interface.h
rename to src/gpu/hsa-agent-pkt/HSA_Interface.h
diff --git a/src/hsa-agent-pkt/Makefile b/src/gpu/hsa-agent-pkt/Makefile
similarity index 100%
rename from src/hsa-agent-pkt/Makefile
rename to src/gpu/hsa-agent-pkt/Makefile
diff --git a/src/gpu/hsa-agent-pkt/README.md b/src/gpu/hsa-agent-pkt/README.md
new file mode 100644
index 0000000..4d20e10
--- /dev/null
+++ b/src/gpu/hsa-agent-pkt/README.md
@@ -0,0 +1,35 @@
+---
+title: GCN3 HSA Agent Packet Test
+tags:
+    - x86
+    - amdgpu
+layout: default
+permalink: resources/hsa-agent-pkt
+shortdoc: >
+    Resources to build a disk image with the GCN3 HSA Agent Packet workload.
+---
+
+# Resource: HSA Agent Packet Example
+
+Based on the Square resource in this repository, this resource serves as
+an example for using an HSA Agent Packet to send commands to the GPU command
+processor included in the GCN_X86 build of gem5.
+
+The example command extracts the kernel's completion signal from the domain
+of the command processor and the GPU's dispatcher. Initially this was a
+workaround for a hipDeviceSynchronize bug that has since been fixed, but the
+method of waiting on a signal can be applied to other agent packet commands.
+
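+As a rough sketch of that signal wait (assumed function name; the HSA runtime
+calls are standard, but this is not the exact code from square.cpp):
+
+```
+#include <hsa/hsa.h>
+
+// Block until the kernel's completion signal has been decremented to 0.
+void waitOnCompletionSignal(hsa_signal_t signal) {
+    while (hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_EQ, 0,
+                                   UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0) {
+        // woke early or timed out; keep waiting
+    }
+}
+```
+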
+Custom commands can be added to the command processor in gem5 to control
+the GPU in novel ways.
+
+## Compilation
+
+To compile:
+
+```
+cd src/gpu/hsa-agent-pkt
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make gfx8-apu
+```
+
+The compiled binary can be found in `src/gpu/hsa-agent-pkt/bin`
diff --git a/src/hsa-agent-pkt/square.cpp b/src/gpu/hsa-agent-pkt/square.cpp
similarity index 100%
rename from src/hsa-agent-pkt/square.cpp
rename to src/gpu/hsa-agent-pkt/square.cpp
diff --git a/src/lulesh/.gitignore b/src/gpu/lulesh/.gitignore
similarity index 100%
rename from src/lulesh/.gitignore
rename to src/gpu/lulesh/.gitignore
diff --git a/src/lulesh/Makefile b/src/gpu/lulesh/Makefile
similarity index 100%
rename from src/lulesh/Makefile
rename to src/gpu/lulesh/Makefile
diff --git a/src/gpu/lulesh/README.md b/src/gpu/lulesh/README.md
new file mode 100644
index 0000000..a0726c4
--- /dev/null
+++ b/src/gpu/lulesh/README.md
@@ -0,0 +1,50 @@
+---
+title: GCN3 LULESH Test
+tags:
+    - x86
+    - amdgpu
+layout: default
+permalink: resources/lulesh
+shortdoc: >
+    Resources to build a disk image with the GCN3 LULESH workload.
+---
+
+# Resource: lulesh
+
+[lulesh](https://computing.llnl.gov/projects/co-design/lulesh) is a DOE proxy
+application that is used as an example of hydrodynamics modeling. The version
+provided is for use with the gpu-compute model of gem5.
+
+Compiling LULESH, compiling the GCN3_X86 gem5, and running LULESH on gem5 all depend on the gcn-gpu docker image, built from the `util/dockerfiles/gcn-gpu/Dockerfile` on the [gem5 stable branch](https://gem5.googlesource.com/public/gem5/+/refs/heads/stable).
+
+## Compilation and Running
+```
+cd src/gpu/lulesh
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make
+```
+
+By default, the makefile builds for gfx801, and the binary is placed in the `src/gpu/lulesh/bin` folder.
+
+lulesh is a GPU application, which requires that gem5 be built with the GCN3_X86 architecture.
+To build GCN3_X86:
+
+```
+# Working directory is your gem5 directory
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu scons -sQ -j$(nproc) build/GCN3_X86/gem5.opt
+```
+
+The following command shows how to run lulesh.
+
+Note: lulesh takes two optional command-line arguments that specify the stop time and the
+number of iterations. To set them, add `--options="<stop_time> <num_iters>"` to the run
+command. The default arguments are equivalent to `--options="1.0e-2 10"`.
+
+
+```
+# Assuming gem5 and gem5-resources are in your working directory
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n3 --mem-size=8GB --benchmark-root=gem5-resources/src/gpu/lulesh/bin -clulesh
+```
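+
+For example, to run with a (hypothetical) stop time of 1.0e-3 and 5 iterations:
+
+```
+# Assuming gem5 and gem5-resources are in your working directory
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n3 --mem-size=8GB --benchmark-root=gem5-resources/src/gpu/lulesh/bin -clulesh --options="1.0e-3 5"
+```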
+
+## Pre-built binary
+
+<http://dist.gem5.org/dist/v21-1/test-progs/lulesh/lulesh>
diff --git a/src/lulesh/src/lulesh.hip.cc b/src/gpu/lulesh/src/lulesh.hip.cc
similarity index 100%
rename from src/lulesh/src/lulesh.hip.cc
rename to src/gpu/lulesh/src/lulesh.hip.cc
diff --git a/src/pennant/.gitignore b/src/gpu/pennant/.gitignore
similarity index 100%
rename from src/pennant/.gitignore
rename to src/gpu/pennant/.gitignore
diff --git a/src/pennant/LICENSE b/src/gpu/pennant/LICENSE
similarity index 100%
rename from src/pennant/LICENSE
rename to src/gpu/pennant/LICENSE
diff --git a/src/pennant/Makefile b/src/gpu/pennant/Makefile
similarity index 100%
rename from src/pennant/Makefile
rename to src/gpu/pennant/Makefile
diff --git a/src/pennant/README b/src/gpu/pennant/README
similarity index 100%
rename from src/pennant/README
rename to src/gpu/pennant/README
diff --git a/src/gpu/pennant/README.md b/src/gpu/pennant/README.md
new file mode 100644
index 0000000..5c1108f
--- /dev/null
+++ b/src/gpu/pennant/README.md
@@ -0,0 +1,85 @@
+---
+title: GCN3 PENNANT Test
+tags:
+    - x86
+    - amdgpu
+layout: default
+permalink: resources/pennant
+shortdoc: >
+    Resources to build a disk image with the GCN3 PENNANT workload.
+---
+
+# Resource: PENNANT
+
+PENNANT is an unstructured mesh physics mini-app designed for advanced
+architecture research.  It contains mesh data structures and a few
+physics algorithms adapted from the LANL rad-hydro code FLAG, and gives
+a sample of the typical memory access patterns of FLAG.
+
+## Compiling and Running
+
+```
+cd src/gpu/pennant
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make
+```
+
+By default, the binary is built for gfx801 and is placed in `src/gpu/pennant/build`
+
+pennant is a GPU application, which requires that gem5 be built with the GCN3_X86 architecture.
+
+pennant has sample input files located at `src/gpu/pennant/test`. The following command shows how to run the sample `noh`.
+
+```
+# Assuming gem5 and gem5-resources are in your working directory
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n3 --benchmark-root=gem5-resources/src/gpu/pennant/build -cpennant --options="gem5-resources/src/gpu/pennant/test/noh/noh.pnt"
+```
+
+The output gets placed in `src/gpu/pennant/test/noh/`, and the resulting `noh.xy`
+file can be compared against the `noh.xy.std` file. Note: only some tests have
+`.xy.std` files to compare against, and there may be slight differences due to
+floating-point rounding.
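+
+A quick way to make that comparison (assuming gem5-resources is in your working directory):
+
+```
+diff gem5-resources/src/gpu/pennant/test/noh/noh.xy gem5-resources/src/gpu/pennant/test/noh/noh.xy.std
+```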
+
+## Pre-built binary
+
+<http://dist.gem5.org/dist/v21-1/test-progs/pennant/pennant>
+
+The information from the original PENNANT README is included below.
+
+PENNANT Description:
+
+PENNANT is an unstructured mesh physics mini-app designed for advanced
+architecture research.  It contains mesh data structures and a few
+physics algorithms adapted from the LANL rad-hydro code FLAG, and gives
+a sample of the typical memory access patterns of FLAG.
+
+Further documentation can be found in the 'doc' directory of the
+PENNANT distribution.
+
+
+Version Log:
+
+0.6, February 2014:
+     Replaced GMV mesh reader with internal mesh generators.
+     Added QCS velocity difference routine to reflect a recent
+     bugfix in FLAG.  Increased size of big test problems.
+     [ Master branch contained this change but CUDA branch does not:
+     First MPI version.  MPI capability is working and mostly
+     optimized; MPI+OpenMP is working but needs optimization. ]
+
+0.5, May 2013:
+     Further optimizations.
+
+0.4, January 2013:
+     First open-source release.  Fixed a bug in QCS and added some
+     optimizations.  Added Sedov and Leblanc test problems, and some
+     new input keywords to support them.
+
+0.3, July 2012:
+     Added OpenMP pragmas and point chunk processing.  Modified physics
+     state arrays to be flat arrays instead of STL vectors.
+
+0.2, June 2012:
+     Added side chunk processing.  Miscellaneous minor cleanup.
+
+0.1, March 2012:
+     Initial release, internal LANL only.
+
diff --git a/src/pennant/doc/mesh-entities.png b/src/gpu/pennant/doc/mesh-entities.png
similarity index 100%
rename from src/pennant/doc/mesh-entities.png
rename to src/gpu/pennant/doc/mesh-entities.png
Binary files differ
diff --git a/src/pennant/doc/noh-result.png b/src/gpu/pennant/doc/noh-result.png
similarity index 100%
rename from src/pennant/doc/noh-result.png
rename to src/gpu/pennant/doc/noh-result.png
Binary files differ
diff --git a/src/pennant/doc/pennantdoc.pdf b/src/gpu/pennant/doc/pennantdoc.pdf
similarity index 100%
rename from src/pennant/doc/pennantdoc.pdf
rename to src/gpu/pennant/doc/pennantdoc.pdf
Binary files differ
diff --git a/src/pennant/doc/pennantdoc.tex b/src/gpu/pennant/doc/pennantdoc.tex
similarity index 100%
rename from src/pennant/doc/pennantdoc.tex
rename to src/gpu/pennant/doc/pennantdoc.tex
diff --git a/src/pennant/doc/sedov-result.png b/src/gpu/pennant/doc/sedov-result.png
similarity index 100%
rename from src/pennant/doc/sedov-result.png
rename to src/gpu/pennant/doc/sedov-result.png
Binary files differ
diff --git a/src/pennant/doc/side-maps.png b/src/gpu/pennant/doc/side-maps.png
similarity index 100%
rename from src/pennant/doc/side-maps.png
rename to src/gpu/pennant/doc/side-maps.png
Binary files differ
diff --git a/src/pennant/src/Driver.cc b/src/gpu/pennant/src/Driver.cc
similarity index 100%
rename from src/pennant/src/Driver.cc
rename to src/gpu/pennant/src/Driver.cc
diff --git a/src/pennant/src/Driver.hh b/src/gpu/pennant/src/Driver.hh
similarity index 100%
rename from src/pennant/src/Driver.hh
rename to src/gpu/pennant/src/Driver.hh
diff --git a/src/pennant/src/ExportGold.cc b/src/gpu/pennant/src/ExportGold.cc
similarity index 100%
rename from src/pennant/src/ExportGold.cc
rename to src/gpu/pennant/src/ExportGold.cc
diff --git a/src/pennant/src/ExportGold.hh b/src/gpu/pennant/src/ExportGold.hh
similarity index 100%
rename from src/pennant/src/ExportGold.hh
rename to src/gpu/pennant/src/ExportGold.hh
diff --git a/src/pennant/src/GenMesh.cc b/src/gpu/pennant/src/GenMesh.cc
similarity index 100%
rename from src/pennant/src/GenMesh.cc
rename to src/gpu/pennant/src/GenMesh.cc
diff --git a/src/pennant/src/GenMesh.hh b/src/gpu/pennant/src/GenMesh.hh
similarity index 100%
rename from src/pennant/src/GenMesh.hh
rename to src/gpu/pennant/src/GenMesh.hh
diff --git a/src/pennant/src/Hydro.cc b/src/gpu/pennant/src/Hydro.cc
similarity index 100%
rename from src/pennant/src/Hydro.cc
rename to src/gpu/pennant/src/Hydro.cc
diff --git a/src/pennant/src/Hydro.hh b/src/gpu/pennant/src/Hydro.hh
similarity index 100%
rename from src/pennant/src/Hydro.hh
rename to src/gpu/pennant/src/Hydro.hh
diff --git a/src/pennant/src/HydroBC.cc b/src/gpu/pennant/src/HydroBC.cc
similarity index 100%
rename from src/pennant/src/HydroBC.cc
rename to src/gpu/pennant/src/HydroBC.cc
diff --git a/src/pennant/src/HydroBC.hh b/src/gpu/pennant/src/HydroBC.hh
similarity index 100%
rename from src/pennant/src/HydroBC.hh
rename to src/gpu/pennant/src/HydroBC.hh
diff --git a/src/pennant/src/HydroGPU.cxx b/src/gpu/pennant/src/HydroGPU.cxx
similarity index 100%
rename from src/pennant/src/HydroGPU.cxx
rename to src/gpu/pennant/src/HydroGPU.cxx
diff --git a/src/pennant/src/HydroGPU.hh b/src/gpu/pennant/src/HydroGPU.hh
similarity index 100%
rename from src/pennant/src/HydroGPU.hh
rename to src/gpu/pennant/src/HydroGPU.hh
diff --git a/src/pennant/src/ImportGMV.cc b/src/gpu/pennant/src/ImportGMV.cc
similarity index 100%
rename from src/pennant/src/ImportGMV.cc
rename to src/gpu/pennant/src/ImportGMV.cc
diff --git a/src/pennant/src/ImportGMV.hh b/src/gpu/pennant/src/ImportGMV.hh
similarity index 100%
rename from src/pennant/src/ImportGMV.hh
rename to src/gpu/pennant/src/ImportGMV.hh
diff --git a/src/pennant/src/InputFile.cc b/src/gpu/pennant/src/InputFile.cc
similarity index 100%
rename from src/pennant/src/InputFile.cc
rename to src/gpu/pennant/src/InputFile.cc
diff --git a/src/pennant/src/InputFile.hh b/src/gpu/pennant/src/InputFile.hh
similarity index 100%
rename from src/pennant/src/InputFile.hh
rename to src/gpu/pennant/src/InputFile.hh
diff --git a/src/pennant/src/Memory.hh b/src/gpu/pennant/src/Memory.hh
similarity index 100%
rename from src/pennant/src/Memory.hh
rename to src/gpu/pennant/src/Memory.hh
diff --git a/src/pennant/src/Mesh.cc b/src/gpu/pennant/src/Mesh.cc
similarity index 100%
rename from src/pennant/src/Mesh.cc
rename to src/gpu/pennant/src/Mesh.cc
diff --git a/src/pennant/src/Mesh.hh b/src/gpu/pennant/src/Mesh.hh
similarity index 100%
rename from src/pennant/src/Mesh.hh
rename to src/gpu/pennant/src/Mesh.hh
diff --git a/src/pennant/src/Parallel.cc b/src/gpu/pennant/src/Parallel.cc
similarity index 100%
rename from src/pennant/src/Parallel.cc
rename to src/gpu/pennant/src/Parallel.cc
diff --git a/src/pennant/src/Parallel.hh b/src/gpu/pennant/src/Parallel.hh
similarity index 100%
rename from src/pennant/src/Parallel.hh
rename to src/gpu/pennant/src/Parallel.hh
diff --git a/src/pennant/src/PolyGas.cc b/src/gpu/pennant/src/PolyGas.cc
similarity index 100%
rename from src/pennant/src/PolyGas.cc
rename to src/gpu/pennant/src/PolyGas.cc
diff --git a/src/pennant/src/PolyGas.hh b/src/gpu/pennant/src/PolyGas.hh
similarity index 100%
rename from src/pennant/src/PolyGas.hh
rename to src/gpu/pennant/src/PolyGas.hh
diff --git a/src/pennant/src/QCS.cc b/src/gpu/pennant/src/QCS.cc
similarity index 100%
rename from src/pennant/src/QCS.cc
rename to src/gpu/pennant/src/QCS.cc
diff --git a/src/pennant/src/QCS.hh b/src/gpu/pennant/src/QCS.hh
similarity index 100%
rename from src/pennant/src/QCS.hh
rename to src/gpu/pennant/src/QCS.hh
diff --git a/src/pennant/src/TTS.cc b/src/gpu/pennant/src/TTS.cc
similarity index 100%
rename from src/pennant/src/TTS.cc
rename to src/gpu/pennant/src/TTS.cc
diff --git a/src/pennant/src/TTS.hh b/src/gpu/pennant/src/TTS.hh
similarity index 100%
rename from src/pennant/src/TTS.hh
rename to src/gpu/pennant/src/TTS.hh
diff --git a/src/pennant/src/Vec2.hh b/src/gpu/pennant/src/Vec2.hh
similarity index 100%
rename from src/pennant/src/Vec2.hh
rename to src/gpu/pennant/src/Vec2.hh
diff --git a/src/pennant/src/WriteXY.cc b/src/gpu/pennant/src/WriteXY.cc
similarity index 100%
rename from src/pennant/src/WriteXY.cc
rename to src/gpu/pennant/src/WriteXY.cc
diff --git a/src/pennant/src/WriteXY.hh b/src/gpu/pennant/src/WriteXY.hh
similarity index 100%
rename from src/pennant/src/WriteXY.hh
rename to src/gpu/pennant/src/WriteXY.hh
diff --git a/src/pennant/src/main.cc b/src/gpu/pennant/src/main.cc
similarity index 100%
rename from src/pennant/src/main.cc
rename to src/gpu/pennant/src/main.cc
diff --git a/src/pennant/test/leblanc/leblanc.pnt b/src/gpu/pennant/test/leblanc/leblanc.pnt
similarity index 100%
rename from src/pennant/test/leblanc/leblanc.pnt
rename to src/gpu/pennant/test/leblanc/leblanc.pnt
diff --git a/src/pennant/test/leblanc/leblanc.xy.std b/src/gpu/pennant/test/leblanc/leblanc.xy.std
similarity index 100%
rename from src/pennant/test/leblanc/leblanc.xy.std
rename to src/gpu/pennant/test/leblanc/leblanc.xy.std
diff --git a/src/pennant/test/leblanc/rect10x90.gmv b/src/gpu/pennant/test/leblanc/rect10x90.gmv
similarity index 100%
rename from src/pennant/test/leblanc/rect10x90.gmv
rename to src/gpu/pennant/test/leblanc/rect10x90.gmv
diff --git a/src/pennant/test/leblancbig/leblancbig.pnt b/src/gpu/pennant/test/leblancbig/leblancbig.pnt
similarity index 100%
rename from src/pennant/test/leblancbig/leblancbig.pnt
rename to src/gpu/pennant/test/leblancbig/leblancbig.pnt
diff --git a/src/pennant/test/leblancbig/rect80x720.gmv b/src/gpu/pennant/test/leblancbig/rect80x720.gmv
similarity index 100%
rename from src/pennant/test/leblancbig/rect80x720.gmv
rename to src/gpu/pennant/test/leblancbig/rect80x720.gmv
diff --git a/src/pennant/test/noh/noh.pnt b/src/gpu/pennant/test/noh/noh.pnt
similarity index 100%
rename from src/pennant/test/noh/noh.pnt
rename to src/gpu/pennant/test/noh/noh.pnt
diff --git a/src/pennant/test/noh/noh.xy.std b/src/gpu/pennant/test/noh/noh.xy.std
similarity index 100%
rename from src/pennant/test/noh/noh.xy.std
rename to src/gpu/pennant/test/noh/noh.xy.std
diff --git a/src/pennant/test/nohpoly/hex150x150.gmv b/src/gpu/pennant/test/nohpoly/hex150x150.gmv
similarity index 100%
rename from src/pennant/test/nohpoly/hex150x150.gmv
rename to src/gpu/pennant/test/nohpoly/hex150x150.gmv
diff --git a/src/pennant/test/nohpoly/nohpoly.pnt b/src/gpu/pennant/test/nohpoly/nohpoly.pnt
similarity index 100%
rename from src/pennant/test/nohpoly/nohpoly.pnt
rename to src/gpu/pennant/test/nohpoly/nohpoly.pnt
diff --git a/src/pennant/test/nohsmall/nohsmall.pnt b/src/gpu/pennant/test/nohsmall/nohsmall.pnt
similarity index 100%
rename from src/pennant/test/nohsmall/nohsmall.pnt
rename to src/gpu/pennant/test/nohsmall/nohsmall.pnt
diff --git a/src/pennant/test/nohsmall/nohsmall.xy.std b/src/gpu/pennant/test/nohsmall/nohsmall.xy.std
similarity index 100%
rename from src/pennant/test/nohsmall/nohsmall.xy.std
rename to src/gpu/pennant/test/nohsmall/nohsmall.xy.std
diff --git a/src/pennant/test/nohsmall/radial2x20.gmv b/src/gpu/pennant/test/nohsmall/radial2x20.gmv
similarity index 100%
rename from src/pennant/test/nohsmall/radial2x20.gmv
rename to src/gpu/pennant/test/nohsmall/radial2x20.gmv
diff --git a/src/pennant/test/nohsquare/nohsquare.pnt b/src/gpu/pennant/test/nohsquare/nohsquare.pnt
similarity index 100%
rename from src/pennant/test/nohsquare/nohsquare.pnt
rename to src/gpu/pennant/test/nohsquare/nohsquare.pnt
diff --git a/src/pennant/test/nohsquare/square180x180.gmv b/src/gpu/pennant/test/nohsquare/square180x180.gmv
similarity index 100%
rename from src/pennant/test/nohsquare/square180x180.gmv
rename to src/gpu/pennant/test/nohsquare/square180x180.gmv
diff --git a/src/pennant/test/sedov/sedov.pnt b/src/gpu/pennant/test/sedov/sedov.pnt
similarity index 100%
rename from src/pennant/test/sedov/sedov.pnt
rename to src/gpu/pennant/test/sedov/sedov.pnt
diff --git a/src/pennant/test/sedov/sedov.xy.std b/src/gpu/pennant/test/sedov/sedov.xy.std
similarity index 100%
rename from src/pennant/test/sedov/sedov.xy.std
rename to src/gpu/pennant/test/sedov/sedov.xy.std
diff --git a/src/pennant/test/sedov/square45x45.gmv b/src/gpu/pennant/test/sedov/square45x45.gmv
similarity index 100%
rename from src/pennant/test/sedov/square45x45.gmv
rename to src/gpu/pennant/test/sedov/square45x45.gmv
diff --git a/src/pennant/test/sedovbig/sedovbig.pnt b/src/gpu/pennant/test/sedovbig/sedovbig.pnt
similarity index 100%
rename from src/pennant/test/sedovbig/sedovbig.pnt
rename to src/gpu/pennant/test/sedovbig/sedovbig.pnt
diff --git a/src/pennant/test/sedovbig/square270x270.gmv b/src/gpu/pennant/test/sedovbig/square270x270.gmv
similarity index 100%
rename from src/pennant/test/sedovbig/square270x270.gmv
rename to src/gpu/pennant/test/sedovbig/square270x270.gmv
diff --git a/src/pennant/test/sedovsmall/sedovsmall.pnt b/src/gpu/pennant/test/sedovsmall/sedovsmall.pnt
similarity index 100%
rename from src/pennant/test/sedovsmall/sedovsmall.pnt
rename to src/gpu/pennant/test/sedovsmall/sedovsmall.pnt
diff --git a/src/pennant/test/sedovsmall/sedovsmall.xy.std b/src/gpu/pennant/test/sedovsmall/sedovsmall.xy.std
similarity index 100%
rename from src/pennant/test/sedovsmall/sedovsmall.xy.std
rename to src/gpu/pennant/test/sedovsmall/sedovsmall.xy.std
diff --git a/src/pennant/test/sedovsmall/square9x9.gmv b/src/gpu/pennant/test/sedovsmall/square9x9.gmv
similarity index 100%
rename from src/pennant/test/sedovsmall/square9x9.gmv
rename to src/gpu/pennant/test/sedovsmall/square9x9.gmv
diff --git a/src/pennant/tools/gmvrect.py b/src/gpu/pennant/tools/gmvrect.py
similarity index 100%
rename from src/pennant/tools/gmvrect.py
rename to src/gpu/pennant/tools/gmvrect.py
diff --git a/src/square/.gitignore b/src/gpu/square/.gitignore
similarity index 100%
rename from src/square/.gitignore
rename to src/gpu/square/.gitignore
diff --git a/src/gpu/square/Makefile b/src/gpu/square/Makefile
new file mode 100644
index 0000000..48164aa
--- /dev/null
+++ b/src/gpu/square/Makefile
@@ -0,0 +1,17 @@
+HIP_PATH?= /opt/rocm/hip
+HIPCC=$(HIP_PATH)/bin/hipcc
+
+BIN_DIR?= ./bin
+
+square: $(BIN_DIR)/square
+
+$(BIN_DIR)/square: square.cpp | $(BIN_DIR)
+	$(HIPCC) --amdgpu-target=gfx801,gfx803 $(CXXFLAGS) square.cpp -o $(BIN_DIR)/square
+
+$(BIN_DIR):
+	mkdir -p $(BIN_DIR)
+
+clean:
+	rm -rf $(BIN_DIR)
+
+.PHONY: square clean
diff --git a/src/square/README.md b/src/gpu/square/README.md
similarity index 68%
rename from src/square/README.md
rename to src/gpu/square/README.md
index 84039ab..104f434 100644
--- a/src/square/README.md
+++ b/src/gpu/square/README.md
@@ -11,19 +11,19 @@
 
 The square test is used to test the GCN3-GPU model.
 
-Compiling square, compiling the GCN3_X86 gem5, and runing square on gem5is dependent on the gcn-gpu docker image, built from the `util/dockerfiles/gcn-gpu/Dockerfile` on the [gem5 stable branch](https://gem5.googlesource.com/public/gem5/+/refs/heads/stable).
+Compiling square, compiling the GCN3_X86 gem5, and running square on gem5 is dependent on the gcn-gpu docker image, built from the `util/dockerfiles/gcn-gpu/Dockerfile` on the [gem5 stable branch](https://gem5.googlesource.com/public/gem5/+/refs/heads/stable).
 
 ## Compiling Square
 
+By default, square will build for all supported GPU types (gfx801, gfx803).
 ```
-cd src/square
-docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make gfx8-apu
+cd src/gpu/square
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make
 ```
 
 The compiled binary can be found in the `bin` directory.
 
-A pre-built binary can be found at <http://dist.gem5.org/dist/v21-0/test-progs/square/square.o>.
-
+A pre-built binary can be found at <http://dist.gem5.org/dist/v21-1/test-progs/square/square>.
 
 ## Compiling GCN3_X86/gem5.opt
 
@@ -38,5 +38,5 @@
 ## Running Square on GCN3_X86/gem5.opt
 
 ```
-docker run -u $UID:$GUID --volume $(pwd):$(pwd) -w $(pwd) gcr.io/gem5-test/gcn-gpu:latest gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n <num cores> -c bin/square.o
+docker run -u $UID:$GID --volume $(pwd):$(pwd) -w $(pwd) gcr.io/gem5-test/gcn-gpu:latest gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n 3 -c bin/square
 ```
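If you would rather not compile square, the pre-built binary linked above can be dropped straight into `bin/`, where the run command expects it. A minimal sketch, assuming a POSIX shell with `wget` available:

```sh
# Fetch the pre-built square binary into the path used by the run command.
mkdir -p bin
wget -O bin/square http://dist.gem5.org/dist/v21-1/test-progs/square/square
chmod +x bin/square
```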
diff --git a/src/square/square.cpp b/src/gpu/square/square.cpp
similarity index 76%
rename from src/square/square.cpp
rename to src/gpu/square/square.cpp
index 87bf597..cd1ce72 100644
--- a/src/square/square.cpp
+++ b/src/gpu/square/square.cpp
@@ -50,9 +50,6 @@
 
 int main(int argc, char *argv[])
 {
-#ifdef DGPU
-    float *A_d, *C_d;
-#endif
     float *A_h, *C_h;
     size_t N = 1000000;
     size_t Nbytes = N * sizeof(float);
@@ -63,38 +60,21 @@
     #ifdef __HIP_PLATFORM_HCC__
       printf ("info: architecture on AMD GPU device is: %d\n",props.gcnArch);
     #endif
-    printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
-    A_h = (float*)malloc(Nbytes);
-    CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
-    C_h = (float*)malloc(Nbytes);
-    CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
+    printf ("info: allocate host and device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+    CHECK(hipHostMalloc(&A_h, Nbytes));
+    CHECK(hipHostMalloc(&C_h, Nbytes));
     // Fill with Phi + i
     for (size_t i=0; i<N; i++)
     {
         A_h[i] = 1.618f + i;
     }
 
-#ifdef DGPU
-    printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
-    CHECK(hipMalloc(&A_d, Nbytes));
-    CHECK(hipMalloc(&C_d, Nbytes));
-
-    printf ("info: copy Host2Device\n");
-    CHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
-#endif
-
     const unsigned blocks = 512;
     const unsigned threadsPerBlock = 256;
 
     printf ("info: launch 'vector_square' kernel\n");
-#ifdef DGPU
-    hipLaunchKernelGGL(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
-
-    printf ("info: copy Device2Host\n");
-    CHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
-#else
     hipLaunchKernelGGL(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_h, A_h, N);
-#endif
+    hipDeviceSynchronize();
 
     printf ("info: check result\n");
     for (size_t i=0; i<N; i++)  {
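The square.cpp hunks above drop the discrete-GPU (`DGPU`) path entirely: buffers now come from `hipHostMalloc`, so the same pointers are valid on both host and device, and the explicit `hipMemcpy` calls go away. Since `hipLaunchKernelGGL` returns before the kernel finishes, the added `hipDeviceSynchronize()` is what makes it safe for the host to read `C_h` afterwards. A minimal self-contained sketch of the same pattern (error checking omitted; sizes and launch dimensions mirror the patched source):

```cpp
#include <cstdio>
#include "hip/hip_runtime.h"

__global__ void vector_square(float* C, const float* A, size_t N)
{
    size_t offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    size_t stride = hipBlockDim_x * hipGridDim_x;
    for (size_t i = offset; i < N; i += stride)
        C[i] = A[i] * A[i];
}

int main()
{
    const size_t N = 1000000;
    float *A_h, *C_h;
    hipHostMalloc(&A_h, N * sizeof(float));  // visible to host and device
    hipHostMalloc(&C_h, N * sizeof(float));
    for (size_t i = 0; i < N; i++)
        A_h[i] = 1.618f + i;

    // The launch is asynchronous; synchronize before reading C_h.
    hipLaunchKernelGGL(vector_square, dim3(512), dim3(256), 0, 0, C_h, A_h, N);
    hipDeviceSynchronize();

    printf("C_h[0] = %f\n", C_h[0]);
    hipHostFree(A_h);
    hipHostFree(C_h);
    return 0;
}
```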
diff --git a/src/hip-samples/src/unroll.cpp b/src/hip-samples/src/unroll.cpp
deleted file mode 100644
index 22f1c75..0000000
--- a/src/hip-samples/src/unroll.cpp
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#include<iostream>
-
-// hip header file
-#include "hip/hip_runtime.h"
-
-
-#define WIDTH     4
-
-#define NUM       (WIDTH*WIDTH)
-
-#define THREADS_PER_BLOCK_X  4
-#define THREADS_PER_BLOCK_Y  4
-#define THREADS_PER_BLOCK_Z  1
-
-// Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
-                                float *in,
-                                const int width)
-{
-    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
-    float val = in[x];
-
-#pragma unroll
-    for(int i=0;i<width;i++)
-    {
-        for(int j=0;j<width;j++)
-            out[i*width + j] = __shfl(val,j*width + i);
-    }
-}
-
-// CPU implementation of matrix transpose
-void matrixTransposeCPUReference(
-    float * output,
-    float * input,
-    const unsigned int width)
-{
-    for(unsigned int j=0; j < width; j++)
-    {
-        for(unsigned int i=0; i < width; i++)
-        {
-            output[i*width + j] = input[j*width + i];
-        }
-    }
-}
-
-int main() {
-
-  float* Matrix;
-  float* TransposeMatrix;
-  float* cpuTransposeMatrix;
-
-  float* gpuMatrix;
-  float* gpuTransposeMatrix;
-
-  hipDeviceProp_t devProp;
-  hipGetDeviceProperties(&devProp, 0);
-
-  std::cout << "Device name " << devProp.name << std::endl;
-
-  int i;
-  int errors;
-
-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
-  cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
-
-  // initialize the input data
-  for (i = 0; i < NUM; i++) {
-    Matrix[i] = (float)i*10.0f;
-  }
-
-  // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
-
-  // Lauching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(1),
-                  dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
-
-  // CPU MatrixTranspose computation
-  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
-
-  // verify the results
-  errors = 0;
-  double eps = 1.0E-6;
-  for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
-    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
-      errors++;
-    }
-  }
-  if (errors!=0) {
-    printf("FAILED: %d errors\n",errors);
-  } else {
-    printf ("PASSED!\n");
-  }
-
-  //free the resources on device side
-  hipFree(gpuMatrix);
-  hipFree(gpuTransposeMatrix);
-
-  //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
-  free(cpuTransposeMatrix);
-
-  return errors;
-}
diff --git a/src/linux-kernel/README.md b/src/linux-kernel/README.md
index 1ffdd5e..8a7c438 100644
--- a/src/linux-kernel/README.md
+++ b/src/linux-kernel/README.md
@@ -66,11 +66,11 @@
 

 The pre-build compiled Linux binaries can be downloaded from the following links:

 

-- [vmlinux-4.4.186](http://dist.gem5.org/dist/v21-0/kernels/x86/static/vmlinux-4.4.186)

-- [vmlinux-4.9.186](http://dist.gem5.org/dist/v21-0/kernels/x86/static/vmlinux-4.9.186)

-- [vmlinux-4.14.134](http://dist.gem5.org/dist/v21-0/kernels/x86/static/vmlinux-4.14.134)

-- [vmlinux-4.19.83](http://dist.gem5.org/dist/v21-0/kernels/x86/static/vmlinux-4.19.83)

-- [vmlinux-5.4.49](http://dist.gem5.org/dist/v21-0/kernels/x86/static/vmlinux-5.4.49)

+- [vmlinux-4.4.186](http://dist.gem5.org/dist/v21-1/kernels/x86/static/vmlinux-4.4.186)

+- [vmlinux-4.9.186](http://dist.gem5.org/dist/v21-1/kernels/x86/static/vmlinux-4.9.186)

+- [vmlinux-4.14.134](http://dist.gem5.org/dist/v21-1/kernels/x86/static/vmlinux-4.14.134)

+- [vmlinux-4.19.83](http://dist.gem5.org/dist/v21-1/kernels/x86/static/vmlinux-4.19.83)

+- [vmlinux-5.4.49](http://dist.gem5.org/dist/v21-1/kernels/x86/static/vmlinux-5.4.49)

 

 

 **Licensing:**

diff --git a/src/npb/README.md b/src/npb/README.md
index 85e7a71..a4a71c2 100644
--- a/src/npb/README.md
+++ b/src/npb/README.md
@@ -81,15 +81,14 @@
 ```
 
 Once this process succeeds, the created disk image can be found on `npb/npb-image/npb`.
-A disk image already created following the above instructions can be found, gzipped, [here](http://dist.gem5.org/dist/v21-0/images/x86/ubuntu-18-04/npb.img.gz).
-
+A disk image already created following the above instructions can be found, gzipped, [here](http://dist.gem5.org/dist/v21-1/images/x86/ubuntu-18-04/npb.img.gz).
 
 ## gem5 Run Scripts
 
 The gem5 scripts which configure the system and run simulation are available in configs-npb-tests/.
 The main script `run_npb.py` expects the following arguments:
 
-**kernel:** path to the Linux kernel. This disk image has been tested with version 4.19.83, available at <http://dist.gem5.org/dist/v21-0/kernels/x86/static/vmlinux-4.19.83>. More info on building Linux Kernels can be found in the `src/linux-kernels` directory.
+**kernel:** path to the Linux kernel. This disk image has been tested with version 4.19.83, available at <http://dist.gem5.org/dist/v21-1/kernels/x86/static/vmlinux-4.19.83>. More info on building Linux kernels can be found in the `src/linux-kernel` directory.
 
 **disk:** path to the npb disk image.
 
diff --git a/src/npb/configs/run_npb.py b/src/npb/configs/run_npb.py
index 12b43de..7aa3dca 100755
--- a/src/npb/configs/run_npb.py
+++ b/src/npb/configs/run_npb.py
@@ -35,17 +35,12 @@
     number of instructions executed in the ROI. It also tracks how much
     wallclock and simulated time.
 """
-import errno
-import os
-import sys
+import argparse
 import time
 import m5
 import m5.ticks
 from m5.objects import *
 
-sys.path.append('gem5/configs/common/') # For the next line...
-import SimpleOpts
-
 from system import *
 
 def writeBenchScript(dir, bench):
@@ -66,32 +61,80 @@
     bench_file.close()
     return file_name
 
-if __name__ == "__m5_main__":
-    (opts, args) = SimpleOpts.parse_args()
-    kernel, disk, cpu, mem_sys, benchmark, num_cpus = args
+supported_protocols = ["classic", "MI_example", "MESI_Two_Level",
+                        "MOESI_CMP_directory"]
+supported_cpu_types = ['kvm', 'atomic', 'timing']
+benchmark_choices = ['bt.A.x', 'cg.A.x', 'ep.A.x', 'ft.A.x',
+                     'is.A.x', 'lu.A.x', 'mg.A.x', 'sp.A.x',
+                     'bt.B.x', 'cg.B.x', 'ep.B.x', 'ft.B.x',
+                     'is.B.x', 'lu.B.x', 'mg.B.x', 'sp.B.x',
+                     'bt.C.x', 'cg.C.x', 'ep.C.x', 'ft.C.x',
+                     'is.C.x', 'lu.C.x', 'mg.C.x', 'sp.C.x',
+                     'bt.D.x', 'cg.D.x', 'ep.D.x', 'ft.D.x',
+                     'is.D.x', 'lu.D.x', 'mg.D.x', 'sp.D.x',
+                     'bt.F.x', 'cg.F.x', 'ep.F.x', 'ft.F.x',
+                     'is.F.x', 'lu.F.x', 'mg.F.x', 'sp.F.x']
 
-    if not cpu in ['atomic', 'kvm', 'timing']:
-        m5.fatal("cpu not supported")
+def parse_options():
+
+    parser = argparse.ArgumentParser(description='For use with gem5. This '
+                'runs a NAS Parallel Benchmark application. This only works '
+                'with the x86 ISA.')
+
+    # The mandatory positional arguments.
+    parser.add_argument("kernel", type=str,
+                        help="Path to the kernel binary to boot")
+    parser.add_argument("disk", type=str,
+                        help="Path to the disk image to boot")
+    parser.add_argument("cpu", type=str, choices=supported_cpu_types,
+                        help="The type of CPU to use in the system")
+    parser.add_argument("mem_sys", type=str, choices=supported_protocols,
+                        help="Type of memory system or coherence protocol")
+    parser.add_argument("benchmark", type=str, choices=benchmark_choices,
+                        help="The NPB application to run")
+    parser.add_argument("num_cpus", type=int, help="Number of CPU cores")
+
+    # The optional arguments.
+    parser.add_argument("--no_host_parallel", action="store_true",
+                        help="Do NOT run gem5 on multiple host threads "
+                              "(kvm only)")
+    parser.add_argument("--second_disk", type=str,
+                        help="The second disk image to mount (/dev/hdb)")
+    parser.add_argument("--no_prefetchers", action="store_true",
+                        help="Enable prefectchers on the caches")
+    parser.add_argument("--l1i_size", type=str, default='32kB',
+                        help="L1 instruction cache size. Default: 32kB")
+    parser.add_argument("--l1d_size", type=str, default='32kB',
+                        help="L1 data cache size. Default: 32kB")
+    parser.add_argument("--l2_size", type=str, default = "256kB",
+                        help="L2 cache size. Default: 256kB")
+    parser.add_argument("--l3_size", type=str, default = "4MB",
+                        help="L2 cache size. Default: 4MB")
+
+    return parser.parse_args()
+
+if __name__ == "__m5_main__":
+    args = parse_options()
+
 
     # create the system we are going to simulate
-    system = MySystem(kernel, disk, int(num_cpus), opts, no_kvm=False)
 
 
-    ruby_protocols = [ "MI_example", "MESI_Two_Level", "MOESI_CMP_directory"]
-
-    if mem_sys == "classic":
-        system = MySystem(kernel, disk, int(num_cpus), opts, no_kvm=False)
-    elif mem_sys in ruby_protocols:
-        system = MyRubySystem(kernel, disk, mem_sys, int(num_cpus), opts)
+    if args.mem_sys == "classic":
+        system = MySystem(args.kernel, args.disk, args.num_cpus, args,
+                          no_kvm=False)
     else:
-        m5.fatal("Bad option for mem_sys")
+        system = MyRubySystem(args.kernel, args.disk, args.mem_sys,
+                              args.num_cpus, args)
 
     # Exit from guest on workbegin/workend
     system.exit_on_work_items = True
 
     # Create and pass a script to the simulated system to run the required
     # benchmark
-    system.readfile = writeBenchScript(m5.options.outdir, benchmark)
+    system.readfile = writeBenchScript(m5.options.outdir, args.benchmark)
 
     # set up the root SimObject and start the simulation
     root = Root(full_system = True, system = system)
@@ -111,7 +154,7 @@
     globalStart = time.time()
 
     print("Running the simulation")
-    print("Using cpu: {}".format(cpu))
+    print("Using cpu: {}".format(args.cpu))
     exit_event = m5.simulate()
 
     if exit_event.getCause() == "workbegin":
@@ -124,9 +167,9 @@
         start_tick = m5.curTick()
         start_insts = system.totalInsts()
         # switching cpu if argument cpu == atomic or timing
-        if cpu == 'atomic':
+        if args.cpu == 'atomic':
             system.switchCpus(system.cpu, system.atomicCpu)
-        if cpu == 'timing':
+        if args.cpu == 'timing':
             system.switchCpus(system.cpu, system.timingCpu)
     else:
         print("Unexpected termination of simulation !")
@@ -149,18 +192,17 @@
     # switching back to simulate the remaining
     # part
 
-    if mem_sys in ruby_protocols:
-        print("Ruby Mem: Not Switching back to KVM!")
-
-    if mem_sys == 'classic':
+    if args.mem_sys == 'classic':
         # switch cpu back to kvm if atomic/timing was used for ROI
-        if cpu == 'atomic':
+        if args.cpu == 'atomic':
             system.switchCpus(system.atomicCpu, system.cpu)
-        if cpu == 'timing':
+        if args.cpu == 'timing':
             system.switchCpus(system.timingCpu, system.cpu)
 
         # Simulate the remaning part of the benchmark
         exit_event = m5.simulate()
+    else:
+        print("Ruby Mem: Not Switching back to KVM!")
 
     print("Done with the simulation")
     print()
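With SimpleOpts gone, `run_npb.py` now takes six positional arguments in a fixed order: kernel, disk, cpu, mem_sys, benchmark, num_cpus. A hypothetical invocation, with placeholder paths standing in for the kernel and disk image described in `src/npb/README.md`:

```sh
# Placeholder paths; substitute your local kernel and NPB disk image.
gem5/build/X86/gem5.opt -re src/npb/configs/run_npb.py \
    vmlinux-4.19.83 npb.img kvm classic bt.A.x 4
```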
diff --git a/src/npb/configs/system/caches.py b/src/npb/configs/system/caches.py
index 3751926..9e44211 100755
--- a/src/npb/configs/system/caches.py
+++ b/src/npb/configs/system/caches.py
@@ -30,26 +30,16 @@
 """ Caches with options for a simple gem5 configuration script
 
 This file contains L1 I/D and L2 caches to be used in the simple
-gem5 configuration script. It uses the SimpleOpts wrapper to set up command
-line options from each individual class.
+gem5 configuration script.
 """
 
-import m5
-from m5.objects import Cache, L2XBar, StridePrefetcher, SubSystem
-from m5.params import AddrRange, AllMemory, MemorySize
-from m5.util.convert import toMemorySize
-
-import SimpleOpts
+from m5.objects import Cache, L2XBar, StridePrefetcher
 
 # Some specific options for caches
 # For all options see src/mem/cache/BaseCache.py
 
 class PrefetchCache(Cache):
 
-    SimpleOpts.add_option("--no_prefetchers", default=False,
-                          action="store_true",
-                          help="Enable prefectchers on the caches")
-
     def __init__(self, options):
         super(PrefetchCache, self).__init__()
         if not options or options.no_prefetchers:
@@ -83,12 +73,6 @@
 class L1ICache(L1Cache):
     """Simple L1 instruction cache with default values"""
 
-    # Set the default size
-    size = '32kB'
-
-    SimpleOpts.add_option('--l1i_size',
-                        help="L1 instruction cache size. Default: %s" % size)
-
     def __init__(self, opts=None):
         super(L1ICache, self).__init__(opts)
         if not opts or not opts.l1i_size:
@@ -102,12 +86,6 @@
 class L1DCache(L1Cache):
     """Simple L1 data cache with default values"""
 
-    # Set the default size
-    size = '32kB'
-
-    SimpleOpts.add_option('--l1d_size',
-                          help="L1 data cache size. Default: %s" % size)
-
     def __init__(self, opts=None):
         super(L1DCache, self).__init__(opts)
         if not opts or not opts.l1d_size:
@@ -149,7 +127,6 @@
     """Simple L2 Cache with default values"""
 
     # Default parameters
-    size = '256kB'
     assoc = 16
     tag_latency = 10
     data_latency = 10
@@ -158,9 +135,6 @@
     tgts_per_mshr = 12
     writeback_clean = True
 
-    SimpleOpts.add_option('--l2_size',
-                          help="L2 cache size. Default: %s" % size)
-
     def __init__(self, opts=None):
         super(L2Cache, self).__init__(opts)
         if not opts or not opts.l2_size:
@@ -179,9 +153,6 @@
        be used as a standalone L3 cache.
     """
 
-    SimpleOpts.add_option('--l3_size', default = '4MB',
-                          help="L3 cache size. Default: 4MB")
-
     # Default parameters
     assoc = 32
     tag_latency = 40
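The caches.py change follows directly from the argparse migration: each cache class used to register its own flag via `SimpleOpts.add_option`, and now it simply reads the option object built by the central parser in `run_npb.py`, keeping a hard-coded fallback. A stripped-down sketch of that pattern (plain Python stand-ins; the real classes derive from gem5's `Cache`):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--l1d_size", type=str, default="32kB")
args = parser.parse_args([])  # [] -> use the declared defaults

class L1DCache:
    def __init__(self, opts=None):
        # No per-class option registration; read the centrally parsed
        # value and fall back to a local default when it is absent.
        if not opts or not opts.l1d_size:
            self.size = "32kB"
        else:
            self.size = opts.l1d_size

print(L1DCache(args).size)  # -> 32kB
```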
diff --git a/src/npb/configs/system/ruby_system.py b/src/npb/configs/system/ruby_system.py
index 4d313bb..a6d7fcb 100755
--- a/src/npb/configs/system/ruby_system.py
+++ b/src/npb/configs/system/ruby_system.py
@@ -29,7 +29,6 @@
 
 import m5
 from m5.objects import *
-from m5.util import convert
 from .fs_tools import *
 
 
@@ -157,8 +156,6 @@
         # so the port isn't connected twice.
         self.pc.attachIO(self.iobus, [self.pc.south_bridge.ide.dma])
 
-        self.intrctrl = IntrControl()
-
         ###############################################
 
         # Add in a Bios information structure.
diff --git a/src/npb/configs/system/system.py b/src/npb/configs/system/system.py
index 23b8b60..f0e71c2 100755
--- a/src/npb/configs/system/system.py
+++ b/src/npb/configs/system/system.py
@@ -29,20 +29,12 @@
 
 import m5
 from m5.objects import *
-from m5.util import convert
 from .fs_tools import *
 from .caches import *
 
 
 class MySystem(System):
 
-    SimpleOpts.add_option("--no_host_parallel", default=False,
-                action="store_true",
-                help="Do NOT run gem5 on multiple host threads (kvm only)")
-
-    SimpleOpts.add_option("--second_disk", default='',
-                          help="The second disk image to mount (/dev/hdb)")
-
     def __init__(self, kernel, disk, num_cpus, opts, no_kvm=False):
         super(MySystem, self).__init__()
         self._opts = opts
@@ -306,8 +298,6 @@
         self.iocache.cpu_side = self.iobus.mem_side_ports
         self.iocache.mem_side = self.membus.cpu_side_ports
 
-        self.intrctrl = IntrControl()
-
         ###############################################
 
         # Add in a Bios information structure.
diff --git a/src/parsec/README.md b/src/parsec/README.md
index dfc1c8d..0665fd2 100644
--- a/src/parsec/README.md
+++ b/src/parsec/README.md
@@ -79,8 +79,8 @@
 
 There are two sets of run scripts and system configuration files in the directory. The scripts found in `configs` use the classic memory system while the scripts in `configs-mesi-two-level` use the ruby memory system with the MESI_Two_Level cache coherency protocol. The parameters used in both sets of experiments are explained below:
 
-* **kernel**: The path to the linux kernel. We have verified capatibility with kernel version 4.19.83 which you can download at <http://dist.gem5.org/dist/v21-0/kernels/x86/static/vmlinux-4.19.83>. More information on building kernels for gem5 can be around in `src/linux-kernel`.
-* **disk**: The path to the PARSEC disk-image.
+* **kernel**: The path to the Linux kernel. We have verified compatibility with kernel version 4.19.83, which you can download at <http://dist.gem5.org/dist/v21-1/kernels/x86/static/vmlinux-4.19.83>. More information on building kernels for gem5 can be found in `src/linux-kernel`.
+* **disk**: The path to the PARSEC disk-image. This can be downloaded, gzipped, from <http://dist.gem5.org/dist/v21-1/images/x86/ubuntu-18-04/parsec.img.gz>.
 * **cpu**: The type of cpu to use. There are two supported options: `kvm` (KvmCPU) and `timing` (TimingSimpleCPU).
 * **benchmark**: The PARSEC workload to run. They include `blackscholes`, `bodytrack`, `canneal`, `dedup`, `facesim`, `ferret`, `fluidanimate`, `freqmine`, `raytrace`, `streamcluster`, `swaptions`, `vips`, `x264`. More information on the workloads can be found at <https://parsec.cs.princeton.edu/>.
 * **size**: The size of the chosen workload. Valid sizes are `simsmall`, `simmedium`, and `simlarge`.
diff --git a/src/parsec/configs-mesi-two-level/run_parsec_mesi_two_level.py b/src/parsec/configs-mesi-two-level/run_parsec_mesi_two_level.py
index e95c8e1..311d8ca 100755
--- a/src/parsec/configs-mesi-two-level/run_parsec_mesi_two_level.py
+++ b/src/parsec/configs-mesi-two-level/run_parsec_mesi_two_level.py
@@ -37,19 +37,42 @@
     disk-image for this script.
 
 """
-import errno
-import os
-import sys
+import argparse
 import time
 import m5
 import m5.ticks
 from m5.objects import *
 
-sys.path.append('gem5/configs/common/') # For the next line...
-import SimpleOpts
-
 from system import *
 
+supported_cpu_types = ["kvm", "timing"]
+benchmark_choices = ["blackscholes", "bodytrack", "canneal", "dedup",
+                     "facesim", "ferret", "fluidanimate", "freqmine",
+                     "raytrace", "streamcluster", "swaptions", "vips", "x264"]
+size_choices = ["simsmall", "simmedium", "simlarge"]
+
+
+def parse_options():
+
+    parser = argparse.ArgumentParser(description='For use with gem5. This '
+                'runs a PARSEC benchmark application. This only works '
+                'with the x86 ISA.')
+
+    parser.add_argument("kernel", type=str,
+                        help="Path to the kernel binary to boot")
+    parser.add_argument("disk", type=str, help="Path to the PARSEC disk image")
+    parser.add_argument("cpu", type=str, choices=supported_cpu_types,
+                        help="The type of CPU to use in the system")
+    parser.add_argument("benchmark", type=str, choices=benchmark_choices,
+                        help="The PARSEC benchmark application to run")
+    parser.add_argument("size", type=str, choices=size_choices,
+                        help="The input size to the PARSEC benchmark "
+                             "application")
+    parser.add_argument("num_cpus", type=int, choices=[1,2,8],
+                        help="The number of CPU cores")
+
+    return parser.parse_args()
+
 def writeBenchScript(dir, bench, size, num_cpus):
     """
     This method creates a script in dir which will be eventually
@@ -72,21 +95,19 @@
     return file_name
 
 if __name__ == "__m5_main__":
-    (opts, args) = SimpleOpts.parse_args()
-    kernel, disk, cpu, benchmark, size, num_cpus = args
 
-    if not cpu in ['kvm', 'timing']:
-        m5.fatal("cpu not supported")
+    args = parse_options()
 
     # create the system we are going to simulate
-    system = MyRubySystem(kernel, disk, int(num_cpus), opts)
+    system = MyRubySystem(args.kernel, args.disk, args.num_cpus, args)
 
     # Exit from guest on workbegin/workend
     system.exit_on_work_items = True
 
     # Create and pass a script to the simulated system to run the required
     # benchmark
-    system.readfile = writeBenchScript(m5.options.outdir, benchmark, size, num_cpus)
+    system.readfile = writeBenchScript(m5.options.outdir, args.benchmark,
+                                       args.size, args.num_cpus)
 
     # set up the root SimObject and start the simulation
     root = Root(full_system = True, system = system)
@@ -106,7 +127,7 @@
     globalStart = time.time()
 
     print("Running the simulation")
-    print("Using cpu: {}".format(cpu))
+    print("Using cpu: {}".format(args.cpu))
 
     start_tick = m5.curTick()
     end_tick = m5.curTick()
@@ -126,7 +147,7 @@
         start_tick = m5.curTick()
         start_insts = system.totalInsts()
         # switching to timing cpu if argument cpu == timing
-        if cpu == 'timing':
+        if args.cpu == 'timing':
             system.switchCpus(system.cpu, system.timingCpu)
     else:
         print("Unexpected termination of simulation!")
@@ -159,7 +180,7 @@
         end_insts = system.totalInsts()
         m5.stats.reset()
         # switching to timing cpu if argument cpu == timing
-        if cpu == 'timing':
+        if args.cpu == 'timing':
             # This line is commented due to an unimplemented
             # flush request in MESI_Two_Level that results in
             # the crashing of simulation. There will be a patch
diff --git a/src/parsec/configs-mesi-two-level/system/fs_tools.py b/src/parsec/configs-mesi-two-level/system/fs_tools.py
index 91f6646..9c02722 100755
--- a/src/parsec/configs-mesi-two-level/system/fs_tools.py
+++ b/src/parsec/configs-mesi-two-level/system/fs_tools.py
@@ -27,7 +27,9 @@
 #
 
 from m5.objects import IdeDisk, CowDiskImage, RawDiskImage
-
+import errno
+import os
+import sys
+
 class CowDisk(IdeDisk):
 
     def __init__(self, filename):
diff --git a/src/parsec/configs-mesi-two-level/system/ruby_system.py b/src/parsec/configs-mesi-two-level/system/ruby_system.py
index d0adcc7..e4e9d1f 100755
--- a/src/parsec/configs-mesi-two-level/system/ruby_system.py
+++ b/src/parsec/configs-mesi-two-level/system/ruby_system.py
@@ -29,7 +29,6 @@
 import m5
 import math
 from m5.objects import *
-from m5.util import convert
 from .fs_tools import *
 
 class MyRubySystem(System):
@@ -161,8 +160,6 @@
         # so the port isn't connected twice.
         self.pc.attachIO(self.iobus, [self.pc.south_bridge.ide.dma])
 
-        self.intrctrl = IntrControl()
-
         ###############################################
 
         # Add in a Bios information structure.
diff --git a/src/parsec/configs/run_parsec.py b/src/parsec/configs/run_parsec.py
index bab584e..5dac66d 100644
--- a/src/parsec/configs/run_parsec.py
+++ b/src/parsec/configs/run_parsec.py
@@ -34,19 +34,42 @@
     https://github.com/darchr/parsec-benchmark.git to create a working
     disk-image for this script.
 """
-import errno
-import os
-import sys
+import argparse
 import time
 import m5
 import m5.ticks
 from m5.objects import *
 
-sys.path.append('gem5/configs/common/') # For the next line...
-import SimpleOpts
-
 from system import *
 
+supported_cpu_types = ["kvm", "timing"]
+benchmark_choices = ["blackscholes", "bodytrack", "canneal", "dedup",
+                     "facesim", "ferret", "fluidanimate", "freqmine",
+                     "raytrace", "streamcluster", "swaptions", "vips", "x264"]
+size_choices = ["simsmall", "simmedium", "simlarge"]
+
+
+def parse_options():
+
+    parser = argparse.ArgumentParser(description='For use with gem5. This '
+                'runs a PARSEC benchmark application. This only works '
+                'with the x86 ISA.')
+
+    parser.add_argument("kernel", type=str,
+                        help="Path to the kernel binary to boot")
+    parser.add_argument("disk", type=str, help="Path to the PARSEC disk image")
+    parser.add_argument("cpu", type=str, choices=supported_cpu_types,
+                        help="The type of CPU to use in the system")
+    parser.add_argument("benchmark", type=str, choices=benchmark_choices,
+                        help="The PARSEC benchmark application to run")
+    parser.add_argument("size", type=str, choices=size_choices,
+                        help="The input size to the PARSEC benchmark "
+                             "application")
+    parser.add_argument("num_cpus", type=int, choices=[1,2,8],
+                        help="The number of CPU cores")
+
+    return parser.parse_args()
+
 def writeBenchScript(dir, bench, size, num_cpus):
     """
     This method creates a script in dir which will be eventually
@@ -69,22 +92,19 @@
     return file_name
 
 if __name__ == "__m5_main__":
-    (opts, args) = SimpleOpts.parse_args()
-    kernel, disk, cpu, benchmark, size, num_cpus = args
 
-    if not cpu in ['kvm', 'timing']:
-        m5.fatal("cpu not supported")
+    args = parse_options()
 
     # create the system
-    system = MySystem(kernel, disk, cpu, int(num_cpus))
+    system = MySystem(args.kernel, args.disk, args.cpu, args.num_cpus)
 
     # Exit from guest on workbegin/workend
     system.exit_on_work_items = True
 
     # Create and pass a script to the simulated system to run the required
     # benchmark
-    system.readfile = writeBenchScript(m5.options.outdir, benchmark, size,
-                                       num_cpus)
+    system.readfile = writeBenchScript(m5.options.outdir, args.benchmark,
+                                       args.size, args.num_cpus)
 
     # set up the root SimObject and start the simulation
     root = Root(full_system = True, system = system)
@@ -104,7 +124,7 @@
     globalStart = time.time()
 
     print("Running the simulation")
-    print("Using cpu: {}".format(cpu))
+    print("Using cpu: {}".format(args.cpu))
 
     start_tick = m5.curTick()
     end_tick = m5.curTick()
@@ -124,7 +144,7 @@
         start_tick = m5.curTick()
         start_insts = system.totalInsts()
         # switching to timing cpu if argument cpu == timing
-        if cpu == 'timing':
+        if args.cpu == 'timing':
             system.switchCpus(system.cpu, system.detailedCpu)
     else:
         print("Unexpected termination of simulation!")
@@ -157,7 +177,7 @@
         end_insts = system.totalInsts()
         m5.stats.reset()
         # switching to timing cpu if argument cpu == timing
-        if cpu == 'timing':
+        if args.cpu == 'timing':
             system.switchCpus(system.timingCpu, system.cpu)
     else:
         print("Unexpected termination of simulation!")
diff --git a/src/parsec/configs/system/caches.py b/src/parsec/configs/system/caches.py
index 4316aa1..7d60733 100644
--- a/src/parsec/configs/system/caches.py
+++ b/src/parsec/configs/system/caches.py
@@ -27,14 +27,10 @@
 """ Caches with options for a simple gem5 configuration script
 
 This file contains L1 I/D and L2 caches to be used in the simple
-gem5 configuration script. It uses the SimpleOpts wrapper to set up command
-line options from each individual class.
+gem5 configuration script.
 """
 
-import m5
-from m5.objects import Cache, L2XBar, StridePrefetcher, SubSystem
-from m5.params import AddrRange, AllMemory, MemorySize
-from m5.util.convert import toMemorySize
+from m5.objects import Cache, L2XBar, StridePrefetcher
 
 # Some specific options for caches
 # For all options see src/mem/cache/BaseCache.py
diff --git a/src/parsec/configs/system/ruby_system.py b/src/parsec/configs/system/ruby_system.py
index 30eebd4..3959a71 100644
--- a/src/parsec/configs/system/ruby_system.py
+++ b/src/parsec/configs/system/ruby_system.py
@@ -26,7 +26,6 @@
 
 import m5
 from m5.objects import *
-from m5.util import convert
 from .fs_tools import *
 
 
@@ -147,8 +146,6 @@
         # so the port isn't connected twice.
         self.pc.attachIO(self.iobus, [self.pc.south_bridge.ide.dma])
 
-        self.intrctrl = IntrControl()
-
         ###############################################
 
         # Add in a Bios information structure.
diff --git a/src/parsec/configs/system/system.py b/src/parsec/configs/system/system.py
index fe4d198..09030c2 100644
--- a/src/parsec/configs/system/system.py
+++ b/src/parsec/configs/system/system.py
@@ -26,7 +26,6 @@
 
 import m5
 from m5.objects import *
-from m5.util import convert
 from .fs_tools import *
 from .caches import *
 
@@ -250,8 +249,6 @@
         self.iocache.cpu_side = self.iobus.mem_side_ports
         self.iocache.mem_side = self.membus.cpu_side_ports
 
-        self.intrctrl = IntrControl()
-
         ###############################################
 
         # Add in a Bios information structure.
diff --git a/src/riscv-fs/README.md b/src/riscv-fs/README.md
index 08b386e..e5bad0e 100644
--- a/src/riscv-fs/README.md
+++ b/src/riscv-fs/README.md
@@ -17,7 +17,7 @@
 
 The used disk image is based on [busybox](https://busybox.net/) and [UCanLinux](https://github.com/UCanLinux/). It is built using the instructions, mostly from [here](https://github.com/UCanLinux/riscv64-sample).
 
-All components are cross compiled on an x86 host using a riscv tool chain.
+**Note:** All components are cross-compiled on an x86 host using a RISC-V toolchain. We used commit `88b004d4c2a7d4e4f08b17ee32d2` of the riscv-gnu-toolchain repository while building (riscv gcc version 10.2.0).
 
 We assume the following directory structure while following the instructions in this README file:
 
@@ -27,8 +27,6 @@
   |
   |___ riscv-disk                              # built disk image will go here
   |
-  |___ device.dts                              # device tree file to use with bbl
-  |
   |___ riscv-gnu-toolchain                     # riscv tool chain for cross compilation
   |
   |___ riscv64-sample                          # UCanLinux source
@@ -38,7 +36,7 @@
   |       |__RootFS                            # root file system for disk image
   |
   |
-  |___ configs-riscv-fs
+  |___ configs
   |      |___ system                           # gem5 system config files
   |      |___ run_riscv.py                     # gem5 run script
   |
@@ -57,6 +55,7 @@
 # clone riscv gnu toolchain source
 git clone https://github.com/riscv/riscv-gnu-toolchain
 cd riscv-gnu-toolchain
+git checkout 88b004d4c2a7d4e4f08b17ee32d2
 
 # change the prefix to your directory
 # of choice for installation of the
@@ -73,7 +72,7 @@
 export PATH=$PATH:/opt/riscv/bin/
 ```
 
-***Note:** The above step is necessary and might cause errors while cross compiling different components for riscv if other methods are used to point to the toolchain.
+**Note:** The above step is necessary and might cause errors while cross compiling different components for riscv if other methods are used to point to the toolchain.
 
 ## UCanLinux Source
 
@@ -86,7 +85,7 @@
 git clone https://github.com/UCanLinux/riscv64-sample
 ```
 
-This source contains already built bootloader and disk images as well. Though the given disk image might be usable with gem5, the `bbl` (bootloader image) will not work with gem5 and we need to compile `bbl` with an input device tree (`.dts`) file separately. The following sections provide instructions to build both `bbl` and disk images.
+The following sections provide instructions to build both the `bbl` bootloader and the disk image.
 
 ## Linux Kernel
 
@@ -113,6 +112,7 @@
 ```
 
 This should generate a `vmlinux` image in the `linux` directory.
+A pre-built RISC-V Linux 5.10 kernel can be downloaded [here](http://dist.gem5.org/dist/v21-1/kernels/riscv/static/vmlinux-5.10).
 
 ## Bootloader (bbl)
 
@@ -130,10 +130,9 @@
 
 apt-get install device-tree-compiler
 
-# copy the device tree file from riscv-fs
-cp ../../../device.dts .
+# configure bbl build
+../configure --host=riscv64-unknown-linux-gnu --with-payload=../../linux/vmlinux --prefix=/opt/riscv/
 
-../configure --host=riscv64-unknown-linux-gnu --with-payload=../../linux/vmlinux --prefix=/opt/riscv/ --with-dts=device.dts
 make -j$(nproc)
 
 chmod 755 bbl
@@ -143,6 +142,7 @@
 ```
 
 This will produce a `bbl` bootloader binary with linux kernel in `riscv-pk/build` directory.
+A pre-built copy of this bootloader binary, with the Linux kernel payload, can be downloaded [here](http://dist.gem5.org/dist/develop/kernels/riscv/static/bootloader-vmlinux-5.10).
 
 ## Busy Box
 
@@ -155,7 +155,7 @@
 cd busybox
 git checkout 1_30_stable  # checkout the latest stable branch
 make menuconfig
-cp ../sample/busybox.config .config  # optional
+cp ../busybox.config .config  # optional
 make menuconfig
 make CROSS_COMPILE=riscv64-unknown-linux-gnu- all -j$(nproc)
 make CROSS_COMPILE=riscv64-unknown-linux-gnu- install
@@ -167,7 +167,7 @@
 
 ```sh
 # going back to riscv64-sample directory
-cd ../..
+cd ../
 
 mkdir RootFS
 cd RootFS
@@ -191,12 +191,16 @@
 
 # build m5 util for riscv and move
 # it to the root file system as well
-cd ../../../
+cd ../../../../
 cd gem5/util/m5
-scons -C util/m5 build/riscv/out/m5
-cp build/riscv/out/m5 ../../../RootFS/sbin/
+scons build/riscv/out/m5
+cp build/riscv/out/m5 ../../../riscv64-sample/RootFS/sbin/
 ```
 
+**Note:** The default cross-compiler is `riscv64-unknown-linux-gnu-`. To change it, set the scons sticky variable `riscv.CROSS_COMPILE`. For example,
+
+```sh
+scons riscv.CROSS_COMPILE=riscv64-linux-gnu- build/riscv/out/m5
+```
+
 ## Disk Image
 
 Create a disk of 512MB size.
@@ -214,16 +218,15 @@
 sudo mkdir /mnt/rootfs
 sudo mount riscv_disk /mnt/rootfs
 
-sudo cp -a RootFS/* /mnt/rootfs
+sudo cp -a riscv64-sample/RootFS/* /mnt/rootfs
 
 sudo chown -R -h root:root /mnt/rootfs/
 df /mnt/rootfs
-# make sure you are in riscv64-sample dir
-cd ../riscv64-sample
 sudo umount /mnt/rootfs
 ```
 
 The disk image `riscv_disk` is ready to use.
+A pre-built, gzipped disk image can be downloaded [here](http://dist.gem5.org/dist/develop/images/riscv/busybox/riscv-disk.img.gz).
 
 **Note:** If you need to resize the disk image once it is created, you can do the following:
 
@@ -240,27 +243,34 @@
 
 ## gem5 Run Scripts
 
-gem5 scripts which can configure a riscv full system and run simulation are available in configs-riscv-fs/.
+gem5 scripts that configure a RISC-V full system and run the simulation are available in configs/.
 The main script `run_riscv.py` expects the following arguments:
 
-**bbl:** path to the bbl (berkeley bootloader) binary with kernel payload.
+**bbl:** path to the bbl (Berkeley bootloader) binary with kernel payload (located at `riscv64-sample/riscv-pk/build/bbl`).
 
-**disk:** path to the disk image to use.
+**disk:** path to the disk image to use (located at `riscv64-sample/riscv_disk`).
 
-**cpu_type:** cpu model (`atomic`, `simple`).
+**cpu_type:** cpu model (`atomic` for AtomicSimpleCPU, `simple` for TimingSimpleCPU, `minor` for MinorCPU).
 
 **num_cpus:** number of cpu cores.
 
 An example use of this script is the following:
 
 ```sh
-[gem5 binary] -re configs/run_exit.py [path to bbl] [path to the disk image] atomic 4
+[gem5 binary] configs/run_riscv.py [path to bbl] [path to the disk image] atomic 1
 ```
 
-To interact with the simulated system's console:
+To interact with the simulated system's console, you can use `telnet` (the port number is printed in the gem5 `simerr` output):
 
 ```sh
-telnet localhost 3457 (this port number comes from `simerr` file)
+telnet localhost <port>
+```
+
+Another option is to use `m5term`, provided by gem5. To compile and launch `m5term`:
+```sh
+cd gem5/util/term
+make                         # compiling
+./m5term localhost <port>    # launching the terminal
 ```
 
 The default Linux system built from this README has both the login and the password set to `root`.
diff --git a/src/riscv-fs/configs-riscv-fs/run_riscv.py b/src/riscv-fs/configs/run_riscv.py
similarity index 96%
rename from src/riscv-fs/configs-riscv-fs/run_riscv.py
rename to src/riscv-fs/configs/run_riscv.py
index 1ca8665..3e3b747 100755
--- a/src/riscv-fs/configs-riscv-fs/run_riscv.py
+++ b/src/riscv-fs/configs/run_riscv.py
@@ -26,8 +26,8 @@
 
 """
 This script is supposed to run full system simulation for RISCV targets.
-It has been tested with classic memory system and Atomic
-and TimingSimpleCPU so far.
+It has been tested with the classic memory system and AtomicSimpleCPU,
+TimingSimpleCPU, and MinorCPU so far.
 """
 
 import time
diff --git a/src/riscv-fs/configs-riscv-fs/system/__init__.py b/src/riscv-fs/configs/system/__init__.py
similarity index 100%
rename from src/riscv-fs/configs-riscv-fs/system/__init__.py
rename to src/riscv-fs/configs/system/__init__.py
diff --git a/src/riscv-fs/configs-riscv-fs/system/system.py b/src/riscv-fs/configs/system/system.py
similarity index 73%
rename from src/riscv-fs/configs-riscv-fs/system/system.py
rename to src/riscv-fs/configs/system/system.py
index 5032664..3f545f1 100755
--- a/src/riscv-fs/configs-riscv-fs/system/system.py
+++ b/src/riscv-fs/configs/system/system.py
@@ -23,10 +23,12 @@
 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
 
 import m5
 from m5.objects import *
 from m5.util import convert
+from os import path
 
 '''
 This class creates a bare bones RISCV full system.
@@ -37,6 +39,43 @@
 d3c2-44ea-85fb-acc1df282e21_FU540-C000-v1p3.pdf
 '''
 
+# Dtb generation code from configs/example/riscv/fs_linux.py
+def generateMemNode(state, mem_range):
+    node = FdtNode("memory@%x" % int(mem_range.start))
+    node.append(FdtPropertyStrings("device_type", ["memory"]))
+    node.append(FdtPropertyWords("reg",
+        state.addrCells(mem_range.start) +
+        state.sizeCells(mem_range.size()) ))
+    return node
+
+def generateDtb(system):
+    """
+    Autogenerate a DTB for the given system; the DTS and DTB files
+    are written to the m5 output directory.
+    """
+    state = FdtState(addr_cells=2, size_cells=2, cpu_cells=1)
+    root = FdtNode('/')
+    root.append(state.addrCellsProperty())
+    root.append(state.sizeCellsProperty())
+    root.appendCompatible(["riscv-virtio"])
+
+    for mem_range in system.mem_ranges:
+        root.append(generateMemNode(state, mem_range))
+
+    sections = [*system.cpu, system.platform]
+
+    for section in sections:
+        for node in section.generateDeviceTree(state):
+            if node.get_name() == root.get_name():
+                root.merge(node)
+            else:
+                root.append(node)
+
+    fdt = Fdt()
+    fdt.add_rootnode(root)
+    fdt.writeDtsFile(path.join(m5.options.outdir, 'device.dts'))
+    fdt.writeDtbFile(path.join(m5.options.outdir, 'device.dtb'))
+
 class RiscvSystem(System):
 
     def __init__(self, bbl, disk, cpu_type, num_cpus):
@@ -55,41 +94,24 @@
         # This connects to main memory
         self.membus = SystemXBar(width = 64) # 64-byte width
 
+        # Add a bad addr responder
+        self.membus.badaddr_responder = BadAddr()
+        self.membus.default = self.membus.badaddr_responder.pio
+
         # Set up the system port for functional access from the simulator
         self.system_port = self.membus.cpu_side_ports
 
         # Create the CPUs for our system.
         self.createCPU(cpu_type, num_cpus)
 
-        # using RISCV bare metal as the base full system workload
-        self.workload = RiscvBareMetal()
-
-        # this is user passed berkeley boot loader binary
-        # currently the Linux kernel payload is compiled into this
-        # as well
-        self.workload.bootloader = bbl
-
         # HiFive platform
         # This is based on a HiFive RISCV board and has
         # only a limited number of devices so far i.e.
         # PLIC, CLINT, UART, VirtIOMMIO
         self.platform = HiFive()
 
-        # Next, create and intialize devices
-        # currently supported for RISCV
-
-        # add a disk image
-        self.attachDisk(disk)
-
-        # set up core and platform
-        # level interrupt controllers
-        self.setupIntrCtrl()
-
-        # set up PMA checker
-        self.pmaChecker()
-
-        # attach off and on chip IO
-        self.attachIO(self.membus)
+        # create and initialize devices currently supported for RISCV
+        self.initDevices(self.membus, disk)
 
         # Create the cache hierarchy for the system.
         self.createCacheHierarchy()
@@ -99,6 +121,28 @@
 
         self.setupInterrupts()
 
+        # using RiscvLinux as the base full system workload
+        self.workload = RiscvLinux()
+
+        # this is user passed berkeley boot loader binary
+        # currently the Linux kernel payload is compiled into this
+        # as well
+        self.workload.object_file = bbl
+
+        # Generate DTB (from configs/example/riscv/fs_linux.py)
+        generateDtb(self)
+        self.workload.dtb_filename = path.join(m5.options.outdir, 'device.dtb')
+        # Default DTB address if bbl is built with the --with-dts option
+        self.workload.dtb_addr = 0x87e00000
+
+        # Linux boot command flags
+        kernel_cmd = [
+            "console=ttyS0",
+            "root=/dev/vda",
+            "ro"
+        ]
+        self.workload.command_line = " ".join(kernel_cmd)
+
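A quick sanity check on the magic constant above: 0x87e00000 sits 126 MiB past the DRAM base at 0x80000000, i.e. near the top of a 128 MiB memory (matching the memory node in the checked-in DTS this change deletes below). A plain-Python sketch of that arithmetic:

```python
# Plain-Python arithmetic check for the default DTB address used above.
dram_base = 0x80000000        # canonical RISC-V DRAM base address
dtb_addr = 0x87e00000         # default load address for a --with-dts bbl
assert dtb_addr - dram_base == 126 * 1024 * 1024  # 126 MiB into DRAM
```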
     def createCPU(self, cpu_type, num_cpus):
         if cpu_type == "atomic":
             self.cpu = [AtomicSimpleCPU(cpu_id = i)
@@ -108,6 +152,10 @@
             self.cpu = [TimingSimpleCPU(cpu_id = i)
                         for i in range(num_cpus)]
             self.mem_mode = 'timing'
+        elif cpu_type == "minor":
+            self.cpu = [MinorCPU(cpu_id = i)
+                        for i in range(num_cpus)]
+            self.mem_mode = 'timing'
         else:
             m5.fatal("No CPU type {}".format(cpu_type))
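The dispatch above extends naturally to other core models. As a hypothetical illustration (not part of this change, and assuming `DerivO3CPU` is available from `m5.objects` as in mainline gem5 builds of this era), an out-of-order branch would follow the same pattern, sitting alongside the "atomic"/"timing"/"minor" cases:

```python
# Hypothetical sketch, not part of this change: the same dispatch
# pattern extended with gem5's out-of-order model.
import m5
from m5.objects import DerivO3CPU

def createCPU(self, cpu_type, num_cpus):
    if cpu_type == "o3":
        self.cpu = [DerivO3CPU(cpu_id=i) for i in range(num_cpus)]
        self.mem_mode = 'timing'  # O3 also runs in timing memory mode
    else:
        m5.fatal("No CPU type {}".format(cpu_type))
```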
 
@@ -170,22 +218,10 @@
                     port = self.membus.mem_side_ports)
         ]
 
-    def attachIO(self, membus):
+    def initDevices(self, membus, disk):
         self.iobus = IOXBar()
 
-        self.bridge = Bridge(delay='50ns')
-        self.bridge.master = self.iobus.slave
-        self.bridge.slave = self.membus.master
-        self.bridge.ranges = self.platform._off_chip_ranges()
-
-        # Connecting on chip and off chip IO to the mem
-        # and IO bus
-        self.platform.attachOnChipIO(self.membus)
-        self.platform.attachOffChipIO(self.iobus)
-
-    def setupIntrCtrl(self):
-        self.intrctrl = IntrControl()
-
         # Set the frequency of RTC (real time clock) used by
         # CLINT (core level interrupt controller).
         # This frequency is 1MHz in SiFive's U54MC.
@@ -195,12 +231,17 @@
         # RTC sends the clock signal to CLINT via an interrupt pin.
         self.platform.clint.int_pin = self.platform.rtc.int_pin
 
-        # Attach the PLIC (platform level interrupt controller)
-        # to the platform. This initializes the PLIC with
-        # interrupt sources coming from off chip devices
-        self.platform.attachPlic()
+        # VirtIOMMIO
+        image = CowDiskImage(child=RawDiskImage(read_only=True), read_only=False)
+        image.child.image_file = disk
+        # using reserved memory space
+        self.platform.disk = MmioVirtIO(
+            vio=VirtIOBlock(image=image),
+            interrupt_id=0x8,
+            pio_size=4096,
+            pio_addr=0x10008000
+        )
 
-    def pmaChecker(self):
         # From riscv/fs_linux.py
         uncacheable_range = [
             *self.platform._on_chip_ranges(),
@@ -213,16 +254,20 @@
         # or MMU-level (system.cpu[0].mmu.pma_checker). It will be resolved
         # by RiscvTLB's Parent.any proxy
 
-        self.pma_checker =  PMAChecker(uncacheable=uncacheable_range)
+        for cpu in self.cpu:
+            cpu.mmu.pma_checker = PMAChecker(uncacheable=uncacheable_range)
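Because the checker now hangs off each core's MMU rather than off the system, a config can confirm the per-CPU placement directly; an illustrative check, not code from this change:

```python
# Illustrative only (assumes a constructed RiscvSystem instance named
# `system`): each core should now carry its own MMU-level PMAChecker.
from m5.objects import PMAChecker

for cpu in system.cpu:
    assert isinstance(cpu.mmu.pma_checker, PMAChecker)
```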
 
-    def attachDisk(self, disk):
-        # VirtIOMMIO
-        image = CowDiskImage(child=RawDiskImage(read_only=True), read_only=False)
-        image.child.image_file = disk
-        # using reserved memory space
-        self.platform.disk = MmioVirtIO(
-            vio=VirtIOBlock(image=image),
-            interrupt_id=0x8,
-            pio_size = 4096,
-            pio_addr=0x10008000
-        )
+        self.bridge = Bridge(delay='50ns')
+        self.bridge.mem_side_port = self.iobus.cpu_side_ports
+        self.bridge.cpu_side_port = self.membus.mem_side_ports
+        self.bridge.ranges = self.platform._off_chip_ranges()
+
+        # Connecting on chip and off chip IO to the mem
+        # and IO bus
+        self.platform.attachOnChipIO(self.membus)
+        self.platform.attachOffChipIO(self.iobus)
+
+        # Attach the PLIC (platform level interrupt controller)
+        # to the platform. This initializes the PLIC with
+        # interrupt sources coming from off chip devices
+        self.platform.attachPlic()
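The bridge wiring above also reflects this release's renaming of the old `master`/`slave` ports (compare the deleted lines) to `mem_side_port`/`cpu_side_port`. A minimal sketch of the same pattern in isolation, with illustrative placeholder bus instances:

```python
# Minimal sketch of the v21 port-naming convention used above; the bus
# and bridge instances here are illustrative placeholders.
from m5.objects import Bridge, IOXBar, SystemXBar

membus = SystemXBar(width=64)
iobus = IOXBar()

bridge = Bridge(delay='50ns')
bridge.cpu_side_port = membus.mem_side_ports  # requests arrive from the membus
bridge.mem_side_port = iobus.cpu_side_ports   # and are forwarded to the iobus
```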
diff --git a/src/riscv-fs/device.dts b/src/riscv-fs/device.dts
deleted file mode 100644
index 7181c6c..0000000
--- a/src/riscv-fs/device.dts
+++ /dev/null
@@ -1,80 +0,0 @@
-/dts-v1/;
-
-/ {
-	#address-cells = <0x2>;
-	#size-cells = <0x2>;
-	compatible = "riscv-virtio";
-	model = "riscv-virtio,qemu";
-
-	chosen {
-		bootargs = "root=/dev/vda ro console=ttyS0";
-		stdout-path = "/soc/uart@10000000";
-	};
-
-	memory@80000000 {
-		device_type = "memory";
-		reg = <0x0 0x80000000 0x0 0x8000000>;
-	};
-
-	cpus {
-		#address-cells = <0x1>;
-		#size-cells = <0x0>;
-		timebase-frequency = <0x989680>;
-
-		cpu@0 {
-			phandle = <0x1>;
-			device_type = "cpu";
-			reg = <0x0>;
-			status = "okay";
-			compatible = "riscv";
-			riscv,isa = "rv64imafdcsu";
-			mmu-type = "riscv,sv48";
-
-			interrupt-controller {
-				#interrupt-cells = <0x1>;
-				interrupt-controller;
-				compatible = "riscv,cpu-intc";
-				phandle = <0x2>;
-			};
-		};
-	};
-
-	soc {
-		#address-cells = <0x2>;
-		#size-cells = <0x2>;
-		compatible = "simple-bus";
-		ranges;
-
-		uart@10000000 {
-			interrupts = <0xa>;
-			interrupt-parent = <0x3>;
-			clock-frequency = <0x384000>;
-			reg = <0x0 0x10000000 0x0 0x008>;
-			compatible = "ns8250";
-		};
-
-		plic@c000000 {
-			phandle = <0x3>;
-			riscv,ndev = <0xa>;
-			reg = <0x0 0xc000000 0x0 0x210000>;
-			interrupts-extended = <0x2 0xb 0x2 0x9>;
-			interrupt-controller;
-			compatible = "riscv,plic0";
-			#interrupt-cells = <0x1>;
-			#address-cells = <0x0>;
-		};
-
-		virtio_mmio@10008000 {
-			interrupts = <0x8>;
-			interrupt-parent = <0x3>;
-			reg = <0x0 0x10008000 0x0 0x1000>;
-			compatible = "virtio,mmio";
-		};
-
-		clint@2000000 {
-			interrupts-extended = <0x2 0x3 0x2 0x7>;
-			reg = <0x0 0x2000000 0x0 0x10000>;
-			compatible = "riscv,clint0";
-		};
-	};
-};
diff --git a/src/spec-2006/README.md b/src/spec-2006/README.md
index 1bbf527..30f3fd5 100644
--- a/src/spec-2006/README.md
+++ b/src/spec-2006/README.md
@@ -85,7 +85,7 @@
 
 `kernel`: required, a positional argument specifying the path to the Linux
 kernel. We have tested using version 4.19.83, which can be downloaded from
-<http://dist.gem5.org/dist/v21-0/kernels/x86/static/vmlinux-4.19.83>. Info on
+<http://dist.gem5.org/dist/v21-1/kernels/x86/static/vmlinux-4.19.83>. Info on
 building Linux kernels for gem5 can be found in `src/linux-kernel`
 
 `disk`: required, a positional argument specifying the path to the disk image
diff --git a/src/spec-2006/configs/run_spec.py b/src/spec-2006/configs/run_spec.py
index 461e956..6d17bd9 100644
--- a/src/spec-2006/configs/run_spec.py
+++ b/src/spec-2006/configs/run_spec.py
@@ -69,7 +69,6 @@
                   By default, the ports are off.
 """
 import os
-import sys
 
 import m5
 import m5.ticks
diff --git a/src/spec-2006/configs/system/caches.py b/src/spec-2006/configs/system/caches.py
index 9932ecf..3e786b7 100644
--- a/src/spec-2006/configs/system/caches.py
+++ b/src/spec-2006/configs/system/caches.py
@@ -30,14 +30,11 @@
 """ Caches with options for a simple gem5 configuration script
 
 This file contains L1 I/D and L2 caches to be used in the simple
-gem5 configuration script.  It uses the SimpleOpts wrapper to set up command
-line options from each individual class.
+gem5 configuration script.
 """
 
 import m5
-from m5.objects import Cache, L2XBar, StridePrefetcher, SubSystem
-from m5.params import AddrRange, AllMemory, MemorySize
-from m5.util.convert import toMemorySize
+from m5.objects import Cache, L2XBar, StridePrefetcher
 
 # Some specific options for caches
 # For all options see src/mem/cache/BaseCache.py
diff --git a/src/spec-2006/configs/system/ruby_system.py b/src/spec-2006/configs/system/ruby_system.py
index 4860504..d1ddb07 100755
--- a/src/spec-2006/configs/system/ruby_system.py
+++ b/src/spec-2006/configs/system/ruby_system.py
@@ -29,7 +29,6 @@
 
 import m5
 from m5.objects import *
-from m5.util import convert
 from .fs_tools import *
 
 
@@ -163,8 +162,6 @@
         # so the port isn't connected twice.
         self.pc.attachIO(self.iobus, [self.pc.south_bridge.ide.dma])
 
-        self.intrctrl = IntrControl()
-
         ###############################################
 
         # Add in a Bios information structure.
diff --git a/src/spec-2006/configs/system/system.py b/src/spec-2006/configs/system/system.py
index a5fbe16..35f0721 100644
--- a/src/spec-2006/configs/system/system.py
+++ b/src/spec-2006/configs/system/system.py
@@ -29,7 +29,6 @@
 
 import m5
 from m5.objects import *
-from m5.util import convert
 from .fs_tools import *
 from .caches import *
 
@@ -281,8 +280,6 @@
         self.iocache.cpu_side = self.iobus.mem_side_ports
         self.iocache.mem_side = self.membus.cpu_side_ports
 
-        self.intrctrl = IntrControl()
-
         ###############################################
 
         # Add in a Bios information structure.
diff --git a/src/spec-2017/README.md b/src/spec-2017/README.md
index 63eade4..4a355c5 100644
--- a/src/spec-2017/README.md
+++ b/src/spec-2017/README.md
@@ -86,7 +86,7 @@
 
 `kernel`: required, a positional argument specifying the path to the Linux
 kernel. This has been tested with version 4.19.83, available at
-<http://dist.gem5.org/dist/v21-0/kernels/x86/static/vmlinux-4.19.83>. Info on
+<http://dist.gem5.org/dist/v21-1/kernels/x86/static/vmlinux-4.19.83>. Info on
 building Linux kernels can be found in `src/linux-kernels`.
 
 `disk`: required, a positional argument specifying the path to the disk image
diff --git a/src/spec-2017/configs/run_spec.py b/src/spec-2017/configs/run_spec.py
index f00913c..63f3934 100644
--- a/src/spec-2017/configs/run_spec.py
+++ b/src/spec-2017/configs/run_spec.py
@@ -68,7 +68,6 @@
                   By default, the ports are off.
 """
 import os
-import sys
 
 import m5
 import m5.ticks
diff --git a/src/spec-2017/configs/system/caches.py b/src/spec-2017/configs/system/caches.py
index 9932ecf..84f63e7 100644
--- a/src/spec-2017/configs/system/caches.py
+++ b/src/spec-2017/configs/system/caches.py
@@ -30,15 +30,10 @@
 """ Caches with options for a simple gem5 configuration script
 
 This file contains L1 I/D and L2 caches to be used in the simple
-gem5 configuration script.  It uses the SimpleOpts wrapper to set up command
-line options from each individual class.
+gem5 configuration script.
 """
 
-import m5
-from m5.objects import Cache, L2XBar, StridePrefetcher, SubSystem
-from m5.params import AddrRange, AllMemory, MemorySize
-from m5.util.convert import toMemorySize
-
+from m5.objects import Cache, L2XBar, StridePrefetcher
+
 # Some specific options for caches
 # For all options see src/mem/cache/BaseCache.py
 
diff --git a/src/spec-2017/configs/system/ruby_system.py b/src/spec-2017/configs/system/ruby_system.py
index 4860504..d1ddb07 100755
--- a/src/spec-2017/configs/system/ruby_system.py
+++ b/src/spec-2017/configs/system/ruby_system.py
@@ -29,7 +29,6 @@
 
 import m5
 from m5.objects import *
-from m5.util import convert
 from .fs_tools import *
 
 
@@ -163,8 +162,6 @@
         # so the port isn't connected twice.
         self.pc.attachIO(self.iobus, [self.pc.south_bridge.ide.dma])
 
-        self.intrctrl = IntrControl()
-
         ###############################################
 
         # Add in a Bios information structure.
diff --git a/src/spec-2017/configs/system/system.py b/src/spec-2017/configs/system/system.py
index de2afc8..b07596c 100644
--- a/src/spec-2017/configs/system/system.py
+++ b/src/spec-2017/configs/system/system.py
@@ -29,7 +29,6 @@
 
 import m5
 from m5.objects import *
-from m5.util import convert
 from .fs_tools import *
 from .caches import *
 
@@ -283,8 +282,6 @@
         self.iocache.cpu_side = self.iobus.mem_side_ports
         self.iocache.mem_side = self.membus.cpu_side_ports
 
-        self.intrctrl = IntrControl()
-
         ###############################################
 
         # Add in a Bios information structure.
diff --git a/src/square/Makefile b/src/square/Makefile
deleted file mode 100644
index 4f4eb82..0000000
--- a/src/square/Makefile
+++ /dev/null
@@ -1,17 +0,0 @@
-HIP_PATH?= /opt/rocm/hip
-HIPCC=$(HIP_PATH)/bin/hipcc
-
-BIN_DIR?= ./bin
-
-gfx8-apu: $(BIN_DIR)/square.o
-
-$(BIN_DIR)/square.o: square.cpp $(BIN_DIR)
-	$(HIPCC) --amdgpu-target=gfx801 $(CXXFLAGS) square.cpp -o $(BIN_DIR)/square.o
-
-$(BIN_DIR):
-	mkdir -p $(BIN_DIR)
-
-clean:
-	rm -rf $(BIN_DIR)
-
-.PHONY: gfx8-apu clean