resources: Merge branch 'release-staging-v21-1' into develop

Change-Id: I7b2b3d136c07faafd656cc8c058ad106d5ecc447
diff --git a/README.md b/README.md
index 1c62e24..88d8abc 100644
--- a/README.md
+++ b/README.md
@@ -96,29 +96,29 @@
 
 ### RISCV Tests Pre-built binaries
 
-<http://dist.gem5.org/dist/develop/test-progs/riscv-tests/dhrystone.riscv>
+<http://dist.gem5.org/dist/v21-1/test-progs/riscv-tests/dhrystone.riscv>
 
-<http://dist.gem5.org/dist/develop/test-progs/riscv-tests/median.riscv>
+<http://dist.gem5.org/dist/v21-1/test-progs/riscv-tests/median.riscv>
 
-<http://dist.gem5.org/dist/develop/test-progs/riscv-tests/mm.riscv>
+<http://dist.gem5.org/dist/v21-1/test-progs/riscv-tests/mm.riscv>
 
-<http://dist.gem5.org/dist/develop/test-progs/riscv-tests/mt-matmul.riscv>
+<http://dist.gem5.org/dist/v21-1/test-progs/riscv-tests/mt-matmul.riscv>
 
-<http://dist.gem5.org/dist/develop/test-progs/riscv-tests/mt-vvadd.riscv>
+<http://dist.gem5.org/dist/v21-1/test-progs/riscv-tests/mt-vvadd.riscv>
 
-<http://dist.gem5.org/dist/develop/test-progs/riscv-tests/multiply.riscv>
+<http://dist.gem5.org/dist/v21-1/test-progs/riscv-tests/multiply.riscv>
 
-<http://dist.gem5.org/dist/develop/test-progs/riscv-tests/pmp.riscv>
+<http://dist.gem5.org/dist/v21-1/test-progs/riscv-tests/pmp.riscv>
 
-<http://dist.gem5.org/dist/develop/test-progs/riscv-tests/qsort.riscv>
+<http://dist.gem5.org/dist/v21-1/test-progs/riscv-tests/qsort.riscv>
 
-<http://dist.gem5.org/dist/develop/test-progs/riscv-tests/rsort.riscv>
+<http://dist.gem5.org/dist/v21-1/test-progs/riscv-tests/rsort.riscv>
 
-<http://dist.gem5.org/dist/develop/test-progs/riscv-tests/spmv.riscv>
+<http://dist.gem5.org/dist/v21-1/test-progs/riscv-tests/spmv.riscv>
 
-<http://dist.gem5.org/dist/develop/test-progs/riscv-tests/towers.riscv>
+<http://dist.gem5.org/dist/v21-1/test-progs/riscv-tests/towers.riscv>
 
-<http://dist.gem5.org/dist/develop/test-progs/riscv-tests/vvadd.riscv>
+<http://dist.gem5.org/dist/v21-1/test-progs/riscv-tests/vvadd.riscv>
 
 ## Resource: simple
 
@@ -241,85 +241,85 @@
 
 ### simple Pre-built binaries
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/x86/test_pthread_create_seq>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/x86/test_pthread_create_seq>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/x86/test_pthread_create_para>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/x86/test_pthread_create_para>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/x86/test_pthread_mutex>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/x86/test_pthread_mutex>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/x86/test_atomic>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/x86/test_atomic>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/x86/test_pthread_cond>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/x86/test_pthread_cond>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/x86/test_std_thread>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/x86/test_std_thread>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/x86/test_std_mutex>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/x86/test_std_mutex>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/x86/test_std_condition_variable>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/x86/test_std_condition_variable>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/aarch32/test_pthread_create_seq>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch32/test_pthread_create_seq>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/aarch32/test_pthread_create_para>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch32/test_pthread_create_para>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/aarch32/test_pthread_mutex>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch32/test_pthread_mutex>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/aarch32/test_atomic>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch32/test_atomic>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/aarch32/test_pthread_cond>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch32/test_pthread_cond>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/aarch32/test_std_thread>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch32/test_std_thread>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/aarch32/test_std_mutex>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch32/test_std_mutex>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/aarch32/test_std_condition_variable>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch32/test_std_condition_variable>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/aarch64/test_pthread_create_seq>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch64/test_pthread_create_seq>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/aarch64/test_pthread_create_para>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch64/test_pthread_create_para>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/aarch64/test_pthread_mutex>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch64/test_pthread_mutex>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/aarch64/test_atomic>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch64/test_atomic>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/aarch64/test_pthread_cond>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch64/test_pthread_cond>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/aarch64/test_std_thread>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch64/test_std_thread>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/aarch64/test_std_mutex>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch64/test_std_mutex>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/aarch64/test_std_condition_variable>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/aarch64/test_std_condition_variable>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/riscv64/test_pthread_create_seq>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/riscv64/test_pthread_create_seq>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/riscv64/test_pthread_create_para>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/riscv64/test_pthread_create_para>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/riscv64/test_pthread_mutex>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/riscv64/test_pthread_mutex>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/riscv64/test_atomic>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/riscv64/test_atomic>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/riscv64/test_pthread_cond>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/riscv64/test_pthread_cond>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/riscv64/test_std_thread>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/riscv64/test_std_thread>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/riscv64/test_std_mutex>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/riscv64/test_std_mutex>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/riscv64/test_std_condition_variable>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/riscv64/test_std_condition_variable>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/sparc64/test_pthread_create_seq>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/sparc64/test_pthread_create_seq>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/sparc64/test_pthread_create_para>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/sparc64/test_pthread_create_para>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/sparc64/test_pthread_mutex>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/sparc64/test_pthread_mutex>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/sparc64/test_atomic>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/sparc64/test_atomic>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/sparc64/test_pthread_cond>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/sparc64/test_pthread_cond>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/sparc64/test_std_thread>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/sparc64/test_std_thread>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/sparc64/test_std_mutex>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/sparc64/test_std_mutex>
 
-<http://dist.gem5.org/dist/develop/test-progs/pthreads/sparc64/test_std_condition_variable>
+<http://dist.gem5.org/dist/v21-1/test-progs/pthreads/sparc64/test_std_condition_variable>
 
 ## Resource: Square
 
@@ -336,7 +336,7 @@
 
 ### Square Pre-built binary
 
-<http://dist.gem5.org/dist/develop/test-progs/square/square.o>
+<http://dist.gem5.org/dist/v21-1/test-progs/square/square.o>
 
 # Resource: HSA Agent Packet Example
 
@@ -390,21 +390,21 @@
 
 ## Pre-built binary
 
-<http://dist.gem5.org/dist/develop/test-progs/hip-samples/2dshfl>
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/2dshfl>
 
-<http://dist.gem5.org/dist/develop/test-progs/hip-samples/dynamic_shared>
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/dynamic_shared>
 
-<http://dist.gem5.org/dist/develop/test-progs/hip-samples/inline_asm>
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/inline_asm>
 
-<http://dist.gem5.org/dist/develop/test-progs/hip-samples/MatrixTranspose>
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/MatrixTranspose>
 
-<http://dist.gem5.org/dist/develop/test-progs/hip-samples/sharedMemory>
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/sharedMemory>
 
-<http://dist.gem5.org/dist/develop/test-progs/hip-samples/shfl>
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/shfl>
 
-<http://dist.gem5.org/dist/develop/test-progs/hip-samples/stream>
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/stream>
 
-<http://dist.gem5.org/dist/develop/test-progs/hip-samples/unroll>
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/unroll>
 
 # Resource: Heterosync
 
@@ -418,16 +418,16 @@
 ## Compilation
 ```
 cd src/gpu/heterosync
-docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make release-gfx8-apu
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make release-gfx8
 ```
 
-The release-gfx8-apu target builds for gfx801, a GCN3-based APU. There are other
-targets (release-gfx8, release) that build for GPU types that are currently unsupported
-in gem5.
+The release-gfx8 target builds for gfx801, a GCN3-based APU, and gfx803, a
+GCN3-based dGPU. Another target (release) builds for GPU types that are
+currently unsupported in gem5.
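+
+For reference, the `release-gfx8` rule in `src/gpu/heterosync/Makefile` passes both gfx targets to hipcc in a single build:
+
+```
+release-gfx8: $(SRC) | $(BIN_DIR)
+	$(HIP_PATH)/bin/hipcc --amdgpu-target=gfx803,gfx801 $(SRC) -o $(BIN_DIR)/$(EXECUTABLE)
+```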
 
 ## Pre-built binary
 
-<http://dist.gem5.org/dist/develop/test-progs/heterosync/gcn3/allSyncPrims-1kernel>
+<http://dist.gem5.org/dist/v21-1/test-progs/heterosync/gcn3/allSyncPrims-1kernel>
 
 # Resource: lulesh
 
@@ -465,7 +465,7 @@
 
 ## Pre-built binary
 
-<http://dist.gem5.org/dist/develop/test-progs/lulesh/lulesh>
+<http://dist.gem5.org/dist/v21-1/test-progs/lulesh/lulesh>
 
 # Resource: halo-finder (HACC)
 
@@ -510,7 +510,7 @@
 
 ## Pre-built binary
 
-<http://dist.gem5.org/dist/develop/test-progs/halo-finder/ForceTreeTest>
+<http://dist.gem5.org/dist/v21-1/test-progs/halo-finder/ForceTreeTest>
 
 # Resource: DNNMark
 
@@ -519,30 +519,25 @@
 
 ## Compilation and Running
 
-DNNMark requires additional programs that aren't installed in the standard GCN
-docker image. There is a Dockerfile in `src/gpu/DNNMark` that installs the additional
-software.
-
-To build DNNMark (Including the new docker image):
+To build DNNMark:
 **NOTE**: Because DNNMark builds a library, it's important to mount gem5-resources
 to the same directory within the docker container when building and running; otherwise
 the benchmarks won't be able to link against the library. The example commands do this
 by using `-v ${PWD}:${PWD}` in the docker run commands.
 ```
 cd src/gpu/DNNMark
-docker build -t <image_name> .
-docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID <image_name> ./setup.sh HIP
-docker run --rm -v ${PWD}:${PWD} -w ${PWD}/build -u $UID:$GID <image_name> make
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu ./setup.sh HIP
+docker run --rm -v ${PWD}:${PWD} -w ${PWD}/build -u $UID:$GID gcr.io/gem5-test/gcn-gpu make
 ```
 
 DNNMark uses MIOpen kernels, which cannot be compiled on-the-fly in gem5.
-We have provided a shell script to generate these kernels for a subset of the
-benchmarks.
+We have provided a Python script to generate these kernels for a subset of the
+benchmarks, targeting a gfx801 GPU with 4 CUs by default.
 
 To generate the MIOpen kernels:
 ```
 cd src/gpu/DNNMark
-docker run --rm -v ${PWD}:${PWD} -v${PWD}/cachefiles:/.cache/miopen/1.7.0 -w ${PWD} <image_name> ./generate_cachefiles.sh
+docker run --rm -v ${PWD}:${PWD} -v${PWD}/cachefiles:/root/.cache/miopen/2.9.0 -w ${PWD} gcr.io/gem5-test/gcn-gpu python3 generate_cachefiles.py cachefiles.csv [--gfx-version={gfx801,gfx803}] [--num-cus=N]
 ```
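+
+The script writes the compiled kernels into a MIOpen kernel database named `<gfx-version>_<num-cus>.ukdb` under the mounted `cachefiles` directory. As a quick sanity check (a sketch, assuming the default gfx801, 4 CU configuration), the generated database can be inspected with sqlite3:
+
+```
+sqlite3 cachefiles/gfx801_4.ukdb "SELECT kernel_name, kernel_args FROM kern_db;"
+```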
 
 Due to the large amounts of memory that need to be set up for DNNMark, we have
@@ -560,13 +555,13 @@
 To build GCN3_X86:
 ```
 # Working directory is your gem5 directory
-docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID <image_name> scons -sQ -j$(nproc) build/GCN3_X86/gem5.opt
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu scons -sQ -j$(nproc) build/GCN3_X86/gem5.opt
 ```
 
 To run one of the benchmarks (fwd softmax) in gem5:
 ```
 # Assuming gem5 and gem5-resources are sub-directories of the current directory
-docker run --rm -u $UID:$GID -v ${PWD}:${PWD} -v ${PWD}/gem5-resources/src/gpu/DNNMark/cachefiles:/.cache/miopen/1.7.0 -w ${PWD} <image_name> gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n3 --benchmark-root=gem5-resources/src/gpu/DNNMark/build/benchmarks/test_fwd_softmax -cdnnmark_test_fwd_softmax --options="-config gem5-resources/src/gpu/DNNMark/config_example/softmax_config.dnnmark -mmap gem5-resources/src/gpu/DNNMark/mmap.bin"
+docker run --rm -v ${PWD}:${PWD} -v ${PWD}/gem5-resources/src/gpu/DNNMark/cachefiles:/root/.cache/miopen/2.9.0 -w ${PWD} gcr.io/gem5-test/gcn-gpu gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n3 --benchmark-root=gem5-resources/src/gpu/DNNMark/build/benchmarks/test_fwd_softmax -cdnnmark_test_fwd_softmax --options="-config gem5-resources/src/gpu/DNNMark/config_example/softmax_config.dnnmark -mmap gem5-resources/src/gpu/DNNMark/mmap.bin"
 ```
 
 
@@ -601,7 +596,7 @@
 
 ## Pre-built binary
 
-<http://dist.gem5.org/dist/develop/test-progs/pennant/pennant>
+<http://dist.gem5.org/dist/v21-1/test-progs/pennant/pennant>
 
 ## Resource: SPEC 2006
 
@@ -651,7 +646,7 @@
 
 ### GAPBS Pre-built disk image
 
-<http://dist.gem5.org/dist/develop/images/x86/ubuntu-18-04/gapbs.img.gz>.
+<http://dist.gem5.org/dist/v21-1/images/x86/ubuntu-18-04/gapbs.img.gz>.
 
 ## Resource: PARSEC Benchmark Suite
 
@@ -672,7 +667,7 @@
 
 ### PARSEC Pre-built disk image
 
-<http://dist.gem5.org/dist/develop/images/x86/ubuntu-18-04/parsec.img.gz>.
+<http://dist.gem5.org/dist/v21-1/images/x86/ubuntu-18-04/parsec.img.gz>.
 
 ## Resource: NAS Parallel Benchmarks (NPB) Tests
 
@@ -699,7 +694,7 @@
 
 ### NPB Pre-built disk image
 
-<http://dist.gem5.org/dist/develop/images/x86/ubuntu-18-04/npb.img.gz>
+<http://dist.gem5.org/dist/v21-1/images/x86/ubuntu-18-04/npb.img.gz>
 
 
 ## Resource: Linux Boot Tests
@@ -714,16 +709,20 @@
 
 The instructions to build the boot-tests disk image (`boot-exit`), the Linux binaries, and how to use gem5 run scripts to run boot-tests are available in this [README](src/boot-tests/README.md) file.
 
-# Resource: RISCV Full System Test
+## Resource: RISCV Full System
 
-This resource refers to a simple setup for a riscv based full system simulation of Linux kernel.
+The RISCV Full System resource includes a RISCV boot loader (`berkeley bootloader (bbl)`) to boot the Linux 5.10 kernel on a RISCV system, and a disk image containing the BusyBox software suite.
+The resource also contains simple gem5 run/config scripts to run Linux full system simulations which a user may telnet into.
 
-Main components include:
-- a disk image
-- a riscv boot loader with linux kernel as payload and a device tree compiled in
-- gem5 run/config scripts
+Further information on building a riscv disk image and boot loader, and on using gem5 scripts to run riscv Linux full system simulations, is available in the [README](src/riscv-fs/README.md) file.
 
-The instructions to build a riscv disk image, a riscv boot loader (`berkeley bootloader (bbl)`) and how to use gem5 scripts to run riscv Linux full system simulations are available in this [README](src/riscv-fs/README.md) file.
+### RISCV Full System pre-built disk image
+
+<http://dist.gem5.org/dist/develop/images/riscv/busybox/riscv-disk.img.gz>
+
+### RISCV Full System pre-built Linux bootloader
+
+<http://dist.gem5.org/dist/develop/kernels/riscv/static/bootloader-vmlinux-5.10>
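+
+As a quick start (a sketch; the full run instructions are in the linked README), the pre-built artifacts can be fetched and unpacked directly:
+
+```
+wget http://dist.gem5.org/dist/develop/images/riscv/busybox/riscv-disk.img.gz
+gunzip riscv-disk.img.gz
+wget http://dist.gem5.org/dist/develop/kernels/riscv/static/bootloader-vmlinux-5.10
+```
+
+Once a simulation is running, gem5 prints the port its terminal listens on, which can be used to telnet into the simulated system.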
 
 ## Resource: Insttest
 
@@ -754,7 +753,7 @@
 
 ### Insttest Pre-built binary
 
-<http://dist.gem5.org/dist/develop/test-progs/insttest/bin/sparc/linux/insttest>
+<http://dist.gem5.org/dist/v21-1/test-progs/insttest/bin/sparc/linux/insttest>
 
 ## Resource: Linux Kernel Binary
 
@@ -767,10 +766,10 @@
 
 ### Linux Kernel Pre-built binaries
 
-<http://dist.gem5.org/dist/develop/kernels/x86/static/vmlinux-4.4.186>
-<http://dist.gem5.org/dist/develop/kernels/x86/static/vmlinux-4.9.186>
-<http://dist.gem5.org/dist/develop/kernels/x86/static/vmlinux-4.14.134>
-<http://dist.gem5.org/dist/develop/kernels/x86/static/vmlinux-4.19.83>
+<http://dist.gem5.org/dist/v21-1/kernels/x86/static/vmlinux-4.4.186>
+<http://dist.gem5.org/dist/v21-1/kernels/x86/static/vmlinux-4.9.186>
+<http://dist.gem5.org/dist/v21-1/kernels/x86/static/vmlinux-4.14.134>
+<http://dist.gem5.org/dist/v21-1/kernels/x86/static/vmlinux-4.19.83>
 
 ## Licensing
 
diff --git a/src/boot-exit/README.md b/src/boot-exit/README.md
index 9a54bed..a5d2160 100644
--- a/src/boot-exit/README.md
+++ b/src/boot-exit/README.md
@@ -85,7 +85,7 @@
 ```
 
 Once this process succeeds, the disk image can be found on `boot-exit/boot-exit-image/boot-exit`.
-A disk image already created following the above instructions can be found, gzipped, [here](http://dist.gem5.org/dist/develop/images/x86/ubuntu-18-04/boot-exit.img.gz).
+A disk image already created following the above instructions can be found, gzipped, [here](http://dist.gem5.org/dist/v21-1/images/x86/ubuntu-18-04/boot-exit.img.gz).
 
 
 ## gem5 Run Scripts
diff --git a/src/boot-exit/configs/system/caches.py b/src/boot-exit/configs/system/caches.py
index 4316aa1..7d60733 100755
--- a/src/boot-exit/configs/system/caches.py
+++ b/src/boot-exit/configs/system/caches.py
@@ -27,14 +27,10 @@
 """ Caches with options for a simple gem5 configuration script
 
 This file contains L1 I/D and L2 caches to be used in the simple
-gem5 configuration script. It uses the SimpleOpts wrapper to set up command
-line options from each individual class.
+gem5 configuration script.
 """
 
-import m5
-from m5.objects import Cache, L2XBar, StridePrefetcher, SubSystem
-from m5.params import AddrRange, AllMemory, MemorySize
-from m5.util.convert import toMemorySize
+from m5.objects import Cache, L2XBar, StridePrefetcher
 
 # Some specific options for caches
 # For all options see src/mem/cache/BaseCache.py
diff --git a/src/boot-exit/configs/system/ruby_system.py b/src/boot-exit/configs/system/ruby_system.py
index 687e252..3959a71 100755
--- a/src/boot-exit/configs/system/ruby_system.py
+++ b/src/boot-exit/configs/system/ruby_system.py
@@ -26,7 +26,6 @@
 
 import m5
 from m5.objects import *
-from m5.util import convert
 from .fs_tools import *
 
 
diff --git a/src/boot-exit/configs/system/system.py b/src/boot-exit/configs/system/system.py
index b6ad6b9..55875f2 100755
--- a/src/boot-exit/configs/system/system.py
+++ b/src/boot-exit/configs/system/system.py
@@ -26,7 +26,6 @@
 
 import m5
 from m5.objects import *
-from m5.util import convert
 from .fs_tools import *
 from .caches import *
 
diff --git a/src/gapbs/README.md b/src/gapbs/README.md
index a877aa6..6a864ae 100644
--- a/src/gapbs/README.md
+++ b/src/gapbs/README.md
@@ -12,7 +12,7 @@
 
 This document provides instructions to create a GAP Benchmark Suite (GAPBS) disk image, which, along with provided configuration scripts, may be used to run GAPBS within gem5 simulations.
 
-A pre-build disk image, for X86, can be found, gzipped, here: <http://dist.gem5.org/dist/develop/images/x86/ubuntu-18-04/gapbs.img.gz>.
+A pre-built disk image, for X86, can be found, gzipped, here: <http://dist.gem5.org/dist/v21-1/images/x86/ubuntu-18-04/gapbs.img.gz>.
 
 ## Building the Disk Image
 
@@ -45,7 +45,7 @@
 gem5 scripts which configure the system and run the simulation are available in `configs/`.
 The main script `run_gapbs.py` expects following arguments:
 
-* **kernel** : A manditory positional argument. The path to the Linux kernel. GAPBS has been tested with [vmlinux-5.2.3](http://dist.gem5.org/dist/develop/kernels/x86/static/vmlinux-5.2.3). See `src/linux-kernel` for information on building a linux kernel for gem5.
+* **kernel** : A mandatory positional argument. The path to the Linux kernel. GAPBS has been tested with [vmlinux-5.2.3](http://dist.gem5.org/dist/v21-1/kernels/x86/static/vmlinux-5.2.3). See `src/linux-kernel` for information on building a Linux kernel for gem5.
 
 * **disk** : A mandatory positional argument. The path to the disk image.
 
diff --git a/src/gapbs/configs/run_gapbs.py b/src/gapbs/configs/run_gapbs.py
index 2e9848c..997d12c 100644
--- a/src/gapbs/configs/run_gapbs.py
+++ b/src/gapbs/configs/run_gapbs.py
@@ -30,9 +30,6 @@
     The workloads have two modes: synthetic and real graphs.
 """
 
-import sys
-import time
-
 import m5
 import m5.ticks
 from m5.objects import *
diff --git a/src/gapbs/configs/system/caches.py b/src/gapbs/configs/system/caches.py
index a58f3af..049a695 100644
--- a/src/gapbs/configs/system/caches.py
+++ b/src/gapbs/configs/system/caches.py
@@ -30,15 +30,10 @@
 """ Caches with options for a simple gem5 configuration script
 
 This file contains L1 I/D and L2 caches to be used in the simple
-gem5 configuration script. It uses the SimpleOpts wrapper to set up command
-line options from each individual class.
+gem5 configuration script.
 """
 
-import m5
-from m5.objects import Cache, L2XBar, StridePrefetcher, SubSystem
-from m5.params import AddrRange, AllMemory, MemorySize
-from m5.util.convert import toMemorySize
-
+from m5.objects import Cache, L2XBar, StridePrefetcher
 
 # Some specific options for caches
 # For all options see src/mem/cache/BaseCache.py
diff --git a/src/gapbs/configs/system/ruby_system.py b/src/gapbs/configs/system/ruby_system.py
index 38960f9..c2a2b58 100644
--- a/src/gapbs/configs/system/ruby_system.py
+++ b/src/gapbs/configs/system/ruby_system.py
@@ -29,7 +29,6 @@
 
 import m5
 from m5.objects import *
-from m5.util import convert
 from .fs_tools import *
 
 
diff --git a/src/gapbs/configs/system/system.py b/src/gapbs/configs/system/system.py
index 70a6255..dbb11b9 100644
--- a/src/gapbs/configs/system/system.py
+++ b/src/gapbs/configs/system/system.py
@@ -29,7 +29,6 @@
 
 import m5
 from m5.objects import *
-from m5.util import convert
 from .fs_tools import *
 from .caches import *
 
diff --git a/src/gpu/DNNMark/CMakeLists.txt b/src/gpu/DNNMark/CMakeLists.txt
index 352c08c..7b1adc7 100644
--- a/src/gpu/DNNMark/CMakeLists.txt
+++ b/src/gpu/DNNMark/CMakeLists.txt
@@ -40,8 +40,8 @@
   endif()
 endif()
 
-# Detect HCC
-find_program(HCC_FOUND hcc)
+# Detect HIPCC
+find_program(HIPCC_FOUND hipcc)
 
 option (double-test "Make data type double" OFF)
 option (enable-cudnnv6 "Enable cuDNN version 6" OFF)
@@ -124,7 +124,7 @@
                         ${GLOG_LIBRARY}
                         m)
 
-elseif(HCC_FOUND AND ${HCC_ENABLE})
+elseif(HIPCC_FOUND AND ${HCC_ENABLE})
 
   # Cover the include and linkage requirement here
   execute_process(COMMAND hcc-config  --cxxflags
@@ -188,9 +188,6 @@
 
   message(${ROCBLAS_LIBRARY} ${MIOPEN_LIBRARY})
 
-  # Find other libraries
-  find_library(HIP_HCC hip_hcc /opt/rocm/hip/lib)
-
   # Find glog libraries
   find_library(GLOG_LIBRARY glog)
 
@@ -200,7 +197,6 @@
   target_link_libraries(${PROJECT_NAME}
                         ${ROCBLAS_LIBRARY}
                         ${MIOPEN_LIBRARY}
-                        ${HIP_HCC}
                         ${GLOG_LIBRARY}
                         m)
   set_target_properties(${PROJECT_NAME} PROPERTIES
diff --git a/src/gpu/DNNMark/Dockerfile b/src/gpu/DNNMark/Dockerfile
deleted file mode 100644
index 5299b26..0000000
--- a/src/gpu/DNNMark/Dockerfile
+++ /dev/null
@@ -1,2 +0,0 @@
-FROM gcr.io/gem5-test/gcn-gpu
-RUN apt-get update && apt-get -y install libgflags-dev libgoogle-glog-dev
diff --git a/src/gpu/DNNMark/README.md b/src/gpu/DNNMark/README.md
index 763dc27..79256bc 100644
--- a/src/gpu/DNNMark/README.md
+++ b/src/gpu/DNNMark/README.md
@@ -20,30 +20,25 @@
 
 ## Compilation and Running
 
-DNNMark requires additional programs that aren't installed in the standard GCN
-docker image. There is a Dockerfile in `src/gpu/DNNMark` that installs the additional
-software.
-
-To build DNNMark (Including the new docker image):
+To build DNNMark:
 **NOTE**: Because DNNMark builds a library, it's important to mount gem5-resources
 to the same directory within the docker container when building and running; otherwise
 the benchmarks won't be able to link against the library. The example commands do this
 by using `-v ${PWD}:${PWD}` in the docker run commands.
 ```
 cd src/gpu/DNNMark
-docker build -t <image_name> .
-docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID <image_name> ./setup.sh HIP
-docker run --rm -v ${PWD}:${PWD} -w ${PWD}/build -u $UID:$GID <image_name> make
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu ./setup.sh HIP
+docker run --rm -v ${PWD}:${PWD} -w ${PWD}/build -u $UID:$GID gcr.io/gem5-test/gcn-gpu make
 ```
 
 DNNMark uses MIOpen kernels, which cannot be compiled on-the-fly in gem5.
-We have provided a shell script to generate these kernels for a subset of the
-benchmarks.
+We have provided a Python script to generate these kernels for a subset of the
+benchmarks, targeting a gfx801 GPU with 4 CUs by default.
 
 To generate the MIOpen kernels:
 ```
 cd src/gpu/DNNMark
-docker run --rm -v ${PWD}:${PWD} -v${PWD}/cachefiles:/.cache/miopen/1.7.0 -w ${PWD} <image_name> ./generate_cachefiles.sh
+docker run --rm -v ${PWD}:${PWD} -v${PWD}/cachefiles:/root/.cache/miopen/2.9.0 -w ${PWD} gcr.io/gem5-test/gcn-gpu python3 generate_cachefiles.py cachefiles.csv [--gfx-version={gfx801,gfx803}] [--num-cus=N]
 ```
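+
+The script writes the compiled kernels into a MIOpen kernel database named `<gfx-version>_<num-cus>.ukdb` under the mounted `cachefiles` directory; as a quick sanity check (a sketch, assuming the default gfx801, 4 CU configuration), the generated database can be inspected with sqlite3:
+
+```
+sqlite3 cachefiles/gfx801_4.ukdb "SELECT kernel_name, kernel_args FROM kern_db;"
+```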
 
 Due to the large amounts of memory that need to be set up for DNNMark, we have
@@ -61,13 +56,13 @@
 To build GCN3_X86:
 ```
 # Working directory is your gem5 directory
-docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID <image_name> scons -sQ -j$(nproc) build/GCN3_X86/gem5.opt
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu scons -sQ -j$(nproc) build/GCN3_X86/gem5.opt
 ```
 
 To run one of the benchmarks (fwd softmax) in gem5:
 ```
 # Assuming gem5 and gem5-resources are sub-directories of the current directory
-docker run --rm -u $UID:$GID -v ${PWD}:${PWD} -v ${PWD}/gem5-resources/src/gpu/DNNMark/cachefiles:/.cache/miopen/1.7.0 -w ${PWD} <image_name> gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n3 --benchmark-root=gem5-resources/src/gpu/DNNMark/build/benchmarks/test_fwd_softmax -cdnnmark_test_fwd_softmax --options="-config gem5-resources/src/gpu/DNNMark/config_example/softmax_config.dnnmark -mmap gem5-resources/src/gpu/DNNMark/mmap.bin"
+docker run --rm -v ${PWD}:${PWD} -v ${PWD}/gem5-resources/src/gpu/DNNMark/cachefiles:/root/.cache/miopen/2.9.0 -w ${PWD} gcr.io/gem5-test/gcn-gpu gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n3 --benchmark-root=gem5-resources/src/gpu/DNNMark/build/benchmarks/test_fwd_softmax -cdnnmark_test_fwd_softmax --options="-config gem5-resources/src/gpu/DNNMark/config_example/softmax_config.dnnmark -mmap gem5-resources/src/gpu/DNNMark/mmap.bin"
 ```
 
 Information from the original DNNMark README included below.
diff --git a/src/gpu/DNNMark/cachefiles.csv b/src/gpu/DNNMark/cachefiles.csv
new file mode 100644
index 0000000..af971ab
--- /dev/null
+++ b/src/gpu/DNNMark/cachefiles.csv
@@ -0,0 +1,11 @@
+MIOpenBatchNormFwdTrainPerAct.cl, -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FPMIX=0 -DMIO_SAVE_MEAN_VARIANCE=1 -DMIO_RUNNING_RESULT=1 -DMIO_BN_N=100 -DMIO_BN_C=1000 -DMIO_BN_HW=1 -DMIO_BN_NHW=100 -DMIO_BN_CHW=1000 -DMIO_BN_LDS_SIZE=256 -DMIO_BN_GRP0=1 -DMIO_BN_GRP1=256 -DMIO_BN_GRP2=1 -DMIO_BN_NCHW=100000 -mcpu=gfx803
+MIOpenBatchNormBwdPerAct.cl, -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FPMIX=0 -DMIO_BN_N=100 -DMIO_BN_C=1000 -DMIO_BN_HW=1 -DMIO_BN_NHW=100 -DMIO_BN_CHW=1000 -DMIO_BN_NCHW=100000 -DMIO_BN_NGRPS=1 -DMIO_BN_GRP0=1 -DMIO_BN_GRP1=64 -DMIO_BN_GRP2=1 -mcpu=gfx803
+MIOpenNeuron.cl, -DLITE -DMIOPEN_READ_UNIT=4 -DMIOPEN_READ_TYPE=_FLOAT4 -DMIOPEN_NRN_OP_ID=0 -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -mcpu=gfx803
+MIOpenSoftmax.cl,-DNUM_BATCH=1 -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DUSE_SOFTMAX_ACCURATE=1 -DUSE_SOFTMAX_MODE_CHANNEL=1 -DRUN_FORWARD=0 -DIS_OUTPUT_PACKED=1 -DIS_DOUTPUT_PACKED=1 -DIS_DINPUT_PACKED=1 -mcpu=gfx803
+MIOpenSoftmax.cl,-DNUM_BATCH=1 -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DUSE_SOFTMAX_ACCURATE=1 -DUSE_SOFTMAX_MODE_CHANNEL=1 -DRUN_FORWARD=1 -DIS_INPUT_PACKED=1 -DIS_OUTPUT_PACKED=1 -mcpu=gfx803
+MIOpenIm2d2Col.cl, -DNUM_CH_PER_WG=1 -DNUM_IM_BLKS_X=1 -DNUM_IM_BLKS=4 -DLOCAL_MEM_SIZE=432 -DSTRIDE_GT_1=0 -DTILE_SZ_X=32 -DTILE_SZ_Y=8 -DUSE_IM_OFF_GUARD=1 -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_INT8=0 -DMIOPEN_USE_INT8x4=0 -DMIOPEN_USE_BFP16=0 -DMIOPEN_USE_INT32=0 -DMIOPEN_USE_RNE_BFLOAT16=1 -mcpu=gfx803
+MIOpenPooling.cl, -DMLO_POOLING_OP_ID=1 -DMLO_POOLING_KERNEL_SZ1=3 -DMLO_POOLING_STRIDE1=2 -DMLO_POOLING_KERNEL_SZ0=3 -DMLO_POOLING_STRIDE0=2 -DMLO_POOLING_N_HORIZ_OUT_PIX=1 -DMLO_POOLING_N_VERT_OUT_PIX=4 -DMLO_POOLING_GROUP_SZ0=16 -DMLO_POOLING_GROUP_SZ1=8 -DMLO_POOLING_INDEX_TYPE=uchar -DMLO_POOLING_INDEX_MAX=UCHAR_MAX -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FP16=0 -mcpu=gfx803
+MIOpenPoolingBwd.cl, -DMLO_POOLING_OP_ID=1 -DMLO_POOLING_KERNEL_SZ1=3 -DMLO_POOLING_STRIDE1=2 -DMLO_POOLING_KERNEL_SZ0=3 -DMLO_POOLING_STRIDE0=2 -DMLO_POOLBWD_N_HORIZ_OUT_PIX=1 -DMLO_POOLBWD_N_VERT_OUT_PIX=8 -DMLO_POOLBWD_GROUP_SZ0=32 -DMLO_POOLBWD_GROUP_SZ1=4 -DMLO_POOLING_INDEX_TYPE=uchar -DMLO_POOLING_INDEX_MAX=UCHAR_MAX -DUSE_IMG_INDEX=0 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FP16=0 -mcpu=gfx803
+MIOpenSubTensorOpWithScalarKernel.cl,-DSUBTENSOR_OP_WITH_SCALAR=SUBTENSOR_OP_WITH_SCALAR_SET -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_INT8=0 -DMIOPEN_USE_INT8x4=0 -DMIOPEN_USE_BFP16=0 -DMIOPEN_USE_INT32=0 -DMIOPEN_USE_RNE_BFLOAT16=1 -DWORK_LENGTH_0=4096 -mcpu=gfx803
+MIOpenCol2Im2d.cl, -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_INT8=0 -DMIOPEN_USE_INT8x4=0 -DMIOPEN_USE_BFP16=0 -DMIOPEN_USE_INT32=0 -DMIOPEN_USE_RNE_BFLOAT16=1 -mcpu=gfx803
+MIOpenPooling.cl, -DMLO_POOLING_OP_ID=1 -DMLO_POOLING_KERNEL_SZ1=3 -DMLO_POOLING_STRIDE1=2 -DMLO_POOLING_KERNEL_SZ0=3 -DMLO_POOLING_STRIDE0=2 -DMLO_POOLING_N_HORIZ_OUT_PIX=1 -DMLO_POOLING_N_VERT_OUT_PIX=8 -DMLO_POOLING_GROUP_SZ0=16 -DMLO_POOLING_GROUP_SZ1=16 -DMLO_POOLING_INDEX_TYPE=uchar -DMLO_POOLING_INDEX_MAX=UCHAR_MAX -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FP16=0 -mcpu=gfx803
diff --git a/src/gpu/DNNMark/core/include/data_manager.h b/src/gpu/DNNMark/core/include/data_manager.h
index 8a4c10a..56c064a 100644
--- a/src/gpu/DNNMark/core/include/data_manager.h
+++ b/src/gpu/DNNMark/core/include/data_manager.h
@@ -46,7 +46,7 @@
     CUDA_CALL(cudaMalloc(&gpu_ptr_, size * sizeof(T)));
 #endif
 #ifdef AMD_MIOPEN
-    HIP_CALL(hipMalloc(&gpu_ptr_, size * sizeof(T)));
+    HIP_CALL(hipHostMalloc(&gpu_ptr_, size * sizeof(T)));
 #endif
   }
   ~Data() {
@@ -56,7 +56,7 @@
       CUDA_CALL(cudaFree(gpu_ptr_));
 #endif
 #ifdef AMD_MIOPEN
-      HIP_CALL(hipFree(gpu_ptr_));
+      HIP_CALL(hipHostFree(gpu_ptr_));
 #endif
     }
   }
diff --git a/src/gpu/DNNMark/core/include/data_png.h b/src/gpu/DNNMark/core/include/data_png.h
index 4a8d4d7..3f15d79 100644
--- a/src/gpu/DNNMark/core/include/data_png.h
+++ b/src/gpu/DNNMark/core/include/data_png.h
@@ -109,8 +109,7 @@
                         (static_cast <float> (RAND_MAX/seed));
     }
 
-    HIP_CALL(hipMemcpy(dev_ptr, host_ptr, size * sizeof(float),
-                       hipMemcpyHostToDevice));
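+    // gpu_ptr_ is now allocated with hipHostMalloc (see data_manager.h in this
+    // patch), so the destination buffer is host-visible and a plain memcpy suffices.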
+    memcpy(dev_ptr, host_ptr, size * sizeof(float));
     if (use_mmap) {
         munmap(host_ptr, size*sizeof(float));
     } else {
@@ -146,8 +145,7 @@
                         (static_cast <double> (RAND_MAX/seed));
     }
 
-    HIP_CALL(hipMemcpy(dev_ptr, host_ptr, size * sizeof(double),
-                       hipMemcpyHostToDevice));
+    memcpy(dev_ptr, host_ptr, size * sizeof(double));
 
     if (use_mmap) {
         munmap(host_ptr, size*sizeof(double));
diff --git a/src/gpu/DNNMark/generate_cachefiles.py b/src/gpu/DNNMark/generate_cachefiles.py
new file mode 100755
index 0000000..dc151d0
--- /dev/null
+++ b/src/gpu/DNNMark/generate_cachefiles.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python3
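+"""Generate pre-compiled MIOpen kernel cachefiles for DNNMark.
+
+Reads (kernel filename, compile args) rows from a csv file, compiles each
+kernel with clang-ocl for the requested gfx version, and inserts the
+compressed binaries into MIOpen's sqlite kernel database (kern_db).
+"""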
+
+import argparse
+import bz2
+import csv
+import hashlib
+import os
+import shlex
+import sqlite3
+import subprocess
+import tempfile
+from pathlib import Path
+
+
+def parseArgs():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('csv_file', type=str,
+                        help='File containing cache files to compile '
+                             'in the format of: filename, args')
+    parser.add_argument('--num-cus', default=4, type=int,
+                        help='Number of CUs in simulated GPU')
+    parser.add_argument('--gfx-version', default='gfx801',
+                        choices=['gfx801', 'gfx803'],
+                        help='gfx version of simulated GPU')
+
+    return parser.parse_args()
+
+
+def getDb(options):
+    db_name = f'{options.gfx_version}_{options.num_cus}.ukdb'
+    db_path = '/root/.cache/miopen/2.9.0/'
+
+    full_db_path = os.path.join(db_path, db_name)
+    # Should create file if it doesn't exist
+    # Does assume db_path exists, which it should in the Docker image
+    con = sqlite3.connect(full_db_path)
+
+    cur = con.cursor()
+
+    # Ripped from src/include/miopen/kern_db.hpp
+    cur.execute('''CREATE TABLE IF NOT EXISTS kern_db (
+                        id INTEGER PRIMARY KEY ASC,
+                        kernel_name TEXT NOT NULL,
+                        kernel_args TEXT NOT NULL,
+                        kernel_blob BLOB NOT NULL,
+                        kernel_hash TEXT NOT NULL,
+                        uncompressed_size INT NOT NULL);''')
+    cur.execute('''CREATE UNIQUE INDEX IF NOT EXISTS
+                    idx_kern_db ON kern_db (kernel_name, kernel_args);''')
+
+    return con
+
+
+def insertFiles(con, options):
+    miopen_kern_path = '/MIOpen/src/kernels'
+
+    extra_args = {'gfx801': '-Wno-everything -Xclang '
+                            '-target-feature -Xclang +code-object-v3',
+                  'gfx803': '-Wno-everything -Xclang '
+                            '-target-feature -Xclang +code-object-v3'}
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        with open(options.csv_file) as csvfile:
+            reader = csv.reader(csvfile)
+            for row in reader:
+                miopen_kern = row[0]
+                miopen_kern_full = os.path.join(miopen_kern_path, miopen_kern)
+                # We want to manually add the gfx version
+                # Additionally, everything after the gfx version isn't
+                # used in the database
+                # Explicitly add the leading space because that's used
+                # in the database
+                args = (f' {row[1].split("-mcpu")[0].strip()} '
+                        f'-mcpu={options.gfx_version}')
+
+                # Hash to generate unique output files
+                file_hash = hashlib.md5(args.encode('utf-8')).hexdigest()
+                outfile = f'{miopen_kern}-{file_hash}.o'
+                full_outfile = os.path.join(tmpdir, outfile)
+
+                # Compile the kernel
+                cmd_str = (f'/opt/rocm/bin/clang-ocl {args} '
+                           f'{extra_args[options.gfx_version]} '
+                           f'{miopen_kern_full} -o {full_outfile}')
+                cmd_args = shlex.split(cmd_str)
+                subprocess.run(cmd_args, check=True)
+
+                # Get other params needed for db
+                uncompressed_file = open(full_outfile, 'rb').read()
+                uncompressed_size = Path(full_outfile).stat().st_size
+                uncompressed_hash = hashlib.md5(uncompressed_file).hexdigest()
+                compressed_blob = bz2.compress(uncompressed_file)
+
+                cur = con.cursor()
+                cur.execute('''INSERT OR IGNORE INTO kern_db
+                               (kernel_name, kernel_args, kernel_blob, kernel_hash, uncompressed_size)
+                               VALUES(?, ?, ?, ?, ?)''',
+                            (f'{miopen_kern}.o', args, compressed_blob,
+                                uncompressed_hash, uncompressed_size))
+
+
+def main():
+
+    args = parseArgs()
+
+    con = getDb(args)
+
+    insertFiles(con, args)
+
+    con.commit()
+    con.close()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/gpu/DNNMark/generate_cachefiles.sh b/src/gpu/DNNMark/generate_cachefiles.sh
deleted file mode 100755
index 1f2a2e3..0000000
--- a/src/gpu/DNNMark/generate_cachefiles.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-
-cd /MIOpen/src/kernels
-
-# test_fwd_softmax/test_bwd_softmax
-mkdir -p /.cache/miopen/1.7.0/5c3130f7e6d7b29bb65a02f5de0084a6
-/opt/rocm/bin/clang-ocl  -DNUM_BATCH=1 -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -mcpu=gfx801 -Wno-everything MIOpenSoftmax.cl -o /.cache/miopen/1.7.0/5c3130f7e6d7b29bb65a02f5de0084a6/MIOpenSoftmax.cl.o
-
-# test_fwd_bn
-mkdir -p /.cache/miopen/1.7.0/f8850ed3a540a1e8eb258b582f554d57
-/opt/rocm/bin/clang-ocl  -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FPMIX=0 -DMIO_SAVE_MEAN_VARIANCE=1 -DMIO_RUNNING_RESULT=1 -DMIO_BN_N=100 -DMIO_BN_C=1000 -DMIO_BN_HW=1 -DMIO_BN_NHW=100 -DMIO_BN_CHW=1000 -DMIO_BN_LDS_SIZE=256 -DMIO_BN_GRP0=1 -DMIO_BN_GRP1=256 -DMIO_BN_GRP2=1 -DMIO_BN_NCHW=100000 -mcpu=gfx801 -Wno-everything MIOpenBatchNormFwdTrainPerAct.cl -o  /.cache/miopen/1.7.0/f8850ed3a540a1e8eb258b582f554d57/MIOpenBatchNormFwdTrainPerAct.cl.o
-
-# test_bwd_bn
-mkdir -p /.cache/miopen/1.7.0/2d295f7887fee4bec3c01ac73f8a25cd
-/opt/rocm/bin/clang-ocl  -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FPMIX=0 -DMIO_BN_N=100 -DMIO_BN_C=1000 -DMIO_BN_HW=1 -DMIO_BN_NHW=100 -DMIO_BN_CHW=1000 -DMIO_BN_NCHW=100000 -DMIO_BN_NGRPS=1 -DMIO_BN_GRP0=1 -DMIO_BN_GRP1=64 -DMIO_BN_GRP2=1 -mcpu=gfx801 -Wno-everything MIOpenBatchNormBwdPerAct.cl -o /.cache/miopen/1.7.0/2d295f7887fee4bec3c01ac73f8a25cd/MIOpenBatchNormBwdPerAct.cl.o
-
-# test_fwd_bypass/test_bwd_bypass
-mkdir -p /.cache/miopen/1.7.0/e213d754468ef6732bb836ed186f5783
-/opt/rocm/bin/clang-ocl  -DLITE -DMIOPEN_READ_UNIT=4 -DMIOPEN_READ_TYPE=_FLOAT4 -DMIOPEN_NRN_OP_ID=0 -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -mcpu=gfx801 -Wno-everything MIOpenNeuron.cl -o /.cache/miopen/1.7.0/e213d754468ef6732bb836ed186f5783/MIOpenNeuron.cl.o
-
-# test_fwd_composed_model/test_bwd_composed_model
-mkdir -p /.cache/miopen/1.7.0/86de626b159aea830f0ba2f1788e0f40
-/opt/rocm/bin/clang-ocl  -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -mcpu=gfx801 -Wno-everything MIOpenUtilKernels2.cl -o /.cache/miopen/1.7.0/86de626b159aea830f0ba2f1788e0f40/MIOpenUtilKernels2.cl.o
-
-mkdir -p /.cache/miopen/1.7.0/ac0046008721a79b06896f9a5a3ca2cc
-/opt/rocm/bin/clang-ocl  -DSUBTENSOR_OP_WITH_SCALAR=SUBTENSOR_OP_WITH_SCALAR_SET -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DWORK_LENGTH_0=4096 -mcpu=gfx801 -Wno-everything MIOpenSubTensorOpWithScalarKernel.cl -o /.cache/miopen/1.7.0/ac0046008721a79b06896f9a5a3ca2cc/MIOpenSubTensorOpWithScalarKernel.cl.o
-
-mkdir -p /.cache/miopen/1.7.0/7a58553f312474aa3cf449e5d9969a51
-/opt/rocm/bin/clang-ocl  -DMLO_POOLING_OP_ID=1 -DMLO_POOLING_KERNEL_SZ1=3 -DMLO_POOLING_PAD1=0 -DMLO_POOLING_STRIDE1=2 -DMLO_POOLING_KERNEL_SZ0=3 -DMLO_POOLING_PAD0=0 -DMLO_POOLING_STRIDE0=2 -DMLO_POOLING_N_OUTPUTS=32 -DMLO_POOLING_N_CHANNELS=32 -DMLO_POOLING_N_HORIZ_OUT_PIX=4 -DMLO_POOLING_N_VERT_OUT_PIX=4 -DMLO_POOLING_GROUP_SZ0=8 -DMLO_POOLING_GROUP_SZ1=8 -DMLO_POOLING_BOT_BATCH_STRIDE=32768 -DMLO_POOLING_BOT_CHANNEL_STRIDE=1024 -DMLO_POOLING_BOT_STRIDE=32 -DMLO_POOLING_TOP_BATCH_STRIDE=8192 -DMLO_POOLING_TOP_CHANNEL_STRIDE=256 -DMLO_POOLING_TOP_STRIDE=16 -DMLO_POOLING_BOT_WIDTH=32 -DMLO_POOLING_BOT_HEIGHT=32 -DMLO_POOLING_TOP_WIDTH=16 -DMLO_POOLING_TOP_HEIGHT=16 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FP16=0 -mcpu=gfx801 -Wno-everything MIOpenPooling.cl -o /.cache/miopen/1.7.0/7a58553f312474aa3cf449e5d9969a51/MIOpenPooling.cl.o
-
-mkdir -p /.cache/miopen/1.7.0/f0ed53d85baef9414aa97b0b15b78a4d
-/opt/rocm/bin/clang-ocl  -DMLO_POOLING_KERNEL_SZ1=3 -DMLO_POOLING_PAD1=0 -DMLO_POOLING_STRIDE1=2 -DMLO_POOLING_KERNEL_SZ0=3 -DMLO_POOLING_PAD0=0 -DMLO_POOLING_STRIDE0=2 -DMLO_POOLING_N_OUTPUTS=32 -DMLO_POOLBWD_N_HORIZ_OUT_PIX=2 -DMLO_POOLBWD_N_VERT_OUT_PIX=2 -DMLO_POOLBWD_GROUP_SZ0=8 -DMLO_POOLBWD_GROUP_SZ1=8 -DMLO_POOLBWD_BOT_WIDTH=32 -DMLO_POOLBWD_BOT_HEIGHT=32 -DMLO_POOLBWD_TOP_WIDTH=16 -DMLO_POOLBWD_TOP_HEIGHT=16 -DMLO_POOLBWD_BOTDF_BATCH_STRIDE=32768 -DMLO_POOLBWD_BOTDF_CHANNEL_STRIDE=1024 -DMLO_POOLBWD_BOTDF_STRIDE=32 -DMLO_POOLBWD_TOPDF_BATCH_STRIDE=8192 -DMLO_POOLBWD_TOPDF_CHANNEL_STRIDE=256 -DMLO_POOLBWD_TOPDF_STRIDE=16 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FP16=0 -mcpu=gfx801 -Wno-everything MIOpenPoolingBwd.cl -o /.cache/miopen/1.7.0/f0ed53d85baef9414aa97b0b15b78a4d/MIOpenPoolingBwd.cl.o
-
-mkdir -p /.cache/miopen/1.7.0/ae3e81b4f4b5968e01343ac25026c938
-/opt/rocm/bin/clang-ocl  -DNUM_CH_PER_WG=1 -DNUM_IM_BLKS_X=1 -DNUM_IM_BLKS=4 -DLOCAL_MEM_SIZE=432 -DSTRIDE_GT_1=0 -DTILE_SZ_X=32 -DTILE_SZ_Y=8 -DUSE_IM_OFF_GUARD=1 -DMIOPEN_USE_FP32=1 -mcpu=gfx801 -Wno-everything MIOpenUtilKernels.cl -o /.cache/miopen/1.7.0/ae3e81b4f4b5968e01343ac25026c938/MIOpenUtilKernels.cl.o
-
-# test_fwd_pool
-mkdir -p /.cache/miopen/1.7.0/7cda2f346ecf0e84b50181f05e75480b
-/opt/rocm/bin/clang-ocl  -DMLO_POOLING_OP_ID=1 -DMLO_POOLING_KERNEL_SZ1=3 -DMLO_POOLING_PAD1=0 -DMLO_POOLING_STRIDE1=2 -DMLO_POOLING_KERNEL_SZ0=3 -DMLO_POOLING_PAD0=0 -DMLO_POOLING_STRIDE0=2 -DMLO_POOLING_N_OUTPUTS=3 -DMLO_POOLING_N_CHANNELS=3 -DMLO_POOLING_N_HORIZ_OUT_PIX=4 -DMLO_POOLING_N_VERT_OUT_PIX=4 -DMLO_POOLING_GROUP_SZ0=8 -DMLO_POOLING_GROUP_SZ1=8 -DMLO_POOLING_BOT_BATCH_STRIDE=196608 -DMLO_POOLING_BOT_CHANNEL_STRIDE=65536 -DMLO_POOLING_BOT_STRIDE=256 -DMLO_POOLING_TOP_BATCH_STRIDE=49152 -DMLO_POOLING_TOP_CHANNEL_STRIDE=16384 -DMLO_POOLING_TOP_STRIDE=128 -DMLO_POOLING_BOT_WIDTH=256 -DMLO_POOLING_BOT_HEIGHT=256 -DMLO_POOLING_TOP_WIDTH=128 -DMLO_POOLING_TOP_HEIGHT=128 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FP16=0 -mcpu=gfx801 -Wno-everything MIOpenPooling.cl -o /.cache/miopen/1.7.0/7cda2f346ecf0e84b50181f05e75480b/MIOpenPooling.cl.o
-
-# test_bwd_pool
-mkdir -p /.cache/miopen/1.7.0/e33e7c33bfa58bc339c2c0ed6e8d29ad
-/opt/rocm/bin/clang-ocl  -DMLO_POOLING_KERNEL_SZ1=3 -DMLO_POOLING_PAD1=0 -DMLO_POOLING_STRIDE1=2 -DMLO_POOLING_KERNEL_SZ0=3 -DMLO_POOLING_PAD0=0 -DMLO_POOLING_STRIDE0=2 -DMLO_POOLING_N_OUTPUTS=3 -DMLO_POOLBWD_N_HORIZ_OUT_PIX=2 -DMLO_POOLBWD_N_VERT_OUT_PIX=2 -DMLO_POOLBWD_GROUP_SZ0=8 -DMLO_POOLBWD_GROUP_SZ1=8 -DMLO_POOLBWD_BOT_WIDTH=256 -DMLO_POOLBWD_BOT_HEIGHT=256 -DMLO_POOLBWD_TOP_WIDTH=128 -DMLO_POOLBWD_TOP_HEIGHT=128 -DMLO_POOLBWD_BOTDF_BATCH_STRIDE=196608 -DMLO_POOLBWD_BOTDF_CHANNEL_STRIDE=65536 -DMLO_POOLBWD_BOTDF_STRIDE=256 -DMLO_POOLBWD_TOPDF_BATCH_STRIDE=49152 -DMLO_POOLBWD_TOPDF_CHANNEL_STRIDE=16384 -DMLO_POOLBWD_TOPDF_STRIDE=128 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FP16=0 -mcpu=gfx801 -Wno-everything MIOpenPoolingBwd.cl -o /.cache/miopen/1.7.0/e33e7c33bfa58bc339c2c0ed6e8d29ad/MIOpenPoolingBwd.cl.o
diff --git a/src/gpu/DNNMark/setup.sh b/src/gpu/DNNMark/setup.sh
index 30baf95..ebd2afc 100755
--- a/src/gpu/DNNMark/setup.sh
+++ b/src/gpu/DNNMark/setup.sh
@@ -23,9 +23,10 @@
 then
   MIOPEN_PATH=/opt/rocm/miopen
   ROCBLAS_PATH=/opt/rocm/rocblas
-  CXX=/opt/rocm/hcc/bin/hcc cmake \
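+  # Build with hipcc; the CMAKE_PREFIX_PATH entries below let CMake locate the
+  # ROCm AMDDeviceLibs and amd_comgr packages needed when building with hipcc.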
+  CXX=/opt/rocm/bin/hipcc cmake \
     -DHCC_ENABLE=ON \
     -DMIOPEN_ROOT=${MIOPEN_PATH} \
     -DROCBLAS_ROOT=${ROCBLAS_PATH} \
+    -DCMAKE_PREFIX_PATH="/opt/rocm;/opt/rocm/lib/cmake/AMDDeviceLibs/;/opt/rocm/lib/cmake/amd_comgr/" \
     ..
 fi
diff --git a/src/gpu/halo-finder/README.md b/src/gpu/halo-finder/README.md
index 543c22e..cbfb685 100644
--- a/src/gpu/halo-finder/README.md
+++ b/src/gpu/halo-finder/README.md
@@ -56,4 +56,4 @@
 
 ## Pre-built binary
 
-<http://dist.gem5.org/dist/v21-0/test-progs/halo-finder/ForceTreeTest>
+<http://dist.gem5.org/dist/v21-1/test-progs/halo-finder/ForceTreeTest>
diff --git a/src/gpu/heterosync/Makefile b/src/gpu/heterosync/Makefile
index a1aaad4..4eb34cf 100644
--- a/src/gpu/heterosync/Makefile
+++ b/src/gpu/heterosync/Makefile
@@ -13,10 +13,7 @@
 
 # gfx8 has a different number of bits it uses for sleeps, so compile accordingly
 release-gfx8: $(SRC) | $(BIN_DIR)
-	$(HIP_PATH)/bin/hipcc --amdgpu-target=gfx803 $(SRC) -o $(BIN_DIR)/$(EXECUTABLE)
-
-release-gfx8-apu: $(SRC) | $(BIN_DIR)
-	$(HIP_PATH)/bin/hipcc --amdgpu-target=gfx801 $(SRC) -o $(BIN_DIR)/$(EXECUTABLE)
+	$(HIP_PATH)/bin/hipcc --amdgpu-target=gfx803,gfx801 $(SRC) -o $(BIN_DIR)/$(EXECUTABLE)
 
 debug: $(SRC) | $(BIN_DIR)
 	$(HIP_PATH)/bin/hipcc -DDEBUG -g -O0 $(SRC) -o $(BIN_DIR)/$(EXECUTABLE).debug
diff --git a/src/gpu/heterosync/README.md b/src/gpu/heterosync/README.md
index 9d8468e..bba6547 100644
--- a/src/gpu/heterosync/README.md
+++ b/src/gpu/heterosync/README.md
@@ -21,16 +21,16 @@
 ## Compilation
 ```
 cd src/gpu/heterosync
-docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make release-gfx8-apu
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make release-gfx8
 ```
 
-The release-gfx8-apu target builds for gfx801, a GCN3-based APU. There are other
-targets (release-gfx8, release) that build for GPU types that are currently unsupported
-in gem5.
+The release-gfx8 target builds for gfx801, a GCN3-based APU, and gfx803, a
+GCN3-based dGPU. Another target (release) builds for GPU types that are
+currently unsupported in gem5.
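+
+For reference, the `release-gfx8` rule in the `Makefile` passes both gfx targets to hipcc in a single build:
+
+```
+release-gfx8: $(SRC) | $(BIN_DIR)
+	$(HIP_PATH)/bin/hipcc --amdgpu-target=gfx803,gfx801 $(SRC) -o $(BIN_DIR)/$(EXECUTABLE)
+```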
 
 ## Pre-built binary
 
-<http://dist.gem5.org/dist/v21-0/test-progs/heterosync/gcn3/allSyncPrims-1kernel>
+<http://dist.gem5.org/dist/v21-1/test-progs/heterosync/gcn3/allSyncPrims-1kernel>
 
 Information from original HeteroSync README included below:
 
diff --git a/src/gpu/heterosync/src/hipLocks.h b/src/gpu/heterosync/src/hipLocks.h
index 690ce4f..2a8dafd 100644
--- a/src/gpu/heterosync/src/hipLocks.h
+++ b/src/gpu/heterosync/src/hipLocks.h
@@ -7,9 +7,6 @@
 /*
   Shared sleep function.  Since s_sleep only takes in constants (between 1 and 128),
   need code to handle long tail.
-
-  Currently s_sleep is unsupported in gem5, so sleepFunc is commented out and
-  replaced with a spin in the lock implementations
  */
 inline __device__ void sleepFunc(int backoff) {
   int backoffCopy = backoff;
diff --git a/src/gpu/heterosync/src/hipLocksBarrierAtomic.h b/src/gpu/heterosync/src/hipLocksBarrierAtomic.h
index a51a77f..38fbc9d 100644
--- a/src/gpu/heterosync/src/hipLocksBarrierAtomic.h
+++ b/src/gpu/heterosync/src/hipLocksBarrierAtomic.h
@@ -56,8 +56,7 @@
     // do exponential backoff to reduce the number of times we pound the global
     // barrier
     if (!*done) {
-      //sleepFunc(backoff);
-      for (int j = 0; j < backoff; ++j) { ; }
+      sleepFunc(backoff);
       __syncthreads();
     }
   }
diff --git a/src/gpu/heterosync/src/hipLocksImpl.h b/src/gpu/heterosync/src/hipLocksImpl.h
index b04fb37..750de8f 100644
--- a/src/gpu/heterosync/src/hipLocksImpl.h
+++ b/src/gpu/heterosync/src/hipLocksImpl.h
@@ -28,13 +28,13 @@
   cpuLockData->mutexCount             = numMutexes;
   cpuLockData->semaphoreCount         = numSemaphores;
 
-  hipMalloc(&cpuLockData->barrierBuffers,   sizeof(unsigned int) * cpuLockData->arrayStride * 2);
+  hipHostMalloc(&cpuLockData->barrierBuffers,   sizeof(unsigned int) * cpuLockData->arrayStride * 2);
 
-  hipMalloc(&cpuLockData->mutexBuffers,     sizeof(int) * cpuLockData->arrayStride * cpuLockData->mutexCount);
-  hipMalloc(&cpuLockData->mutexBufferHeads, sizeof(unsigned int) * cpuLockData->mutexCount);
-  hipMalloc(&cpuLockData->mutexBufferTails, sizeof(unsigned int) * cpuLockData->mutexCount);
+  hipHostMalloc(&cpuLockData->mutexBuffers,     sizeof(int) * cpuLockData->arrayStride * cpuLockData->mutexCount);
+  hipHostMalloc(&cpuLockData->mutexBufferHeads, sizeof(unsigned int) * cpuLockData->mutexCount);
+  hipHostMalloc(&cpuLockData->mutexBufferTails, sizeof(unsigned int) * cpuLockData->mutexCount);
 
-  hipMalloc(&cpuLockData->semaphoreBuffers, sizeof(unsigned int) * 4 * cpuLockData->semaphoreCount);
+  hipHostMalloc(&cpuLockData->semaphoreBuffers, sizeof(unsigned int) * 4 * cpuLockData->semaphoreCount);
 
   hipErr = hipGetLastError();
   checkError(hipErr, "Before memsets");
@@ -81,11 +81,11 @@
 hipError_t hipLocksDestroy()
 {
   if (cpuLockData == NULL) { return hipErrorInitializationError; }
-  hipFree(cpuLockData->mutexBuffers);
-  hipFree(cpuLockData->mutexBufferHeads);
-  hipFree(cpuLockData->mutexBufferTails);
+  hipHostFree(cpuLockData->mutexBuffers);
+  hipHostFree(cpuLockData->mutexBufferHeads);
+  hipHostFree(cpuLockData->mutexBufferTails);
 
-  hipFree(cpuLockData->semaphoreBuffers);
+  hipHostFree(cpuLockData->semaphoreBuffers);
 
   hipHostFree(cpuLockData);
 
diff --git a/src/gpu/heterosync/src/hipLocksMutexEBO.h b/src/gpu/heterosync/src/hipLocksMutexEBO.h
index 0adaac0..69ab38d 100644
--- a/src/gpu/heterosync/src/hipLocksMutexEBO.h
+++ b/src/gpu/heterosync/src/hipLocksMutexEBO.h
@@ -43,8 +43,7 @@
       {
         // if we failed in acquiring the lock, wait for a little while before
         // trying again
-        //sleepFunc(backoff);
-        for (int j = 0; j < backoff; ++j) { ; }
+        sleepFunc(backoff);
         // (capped) exponential backoff
         backoff = (((backoff << 1) + 1) & (MAX_BACKOFF-1));
       }
@@ -100,8 +99,7 @@
       {
         // if we failed in acquiring the lock, wait for a little while before
         // trying again
-        //sleepFunc(backoff);
-        for (int j = 0; j < backoff; ++j) { ; }
+        sleepFunc(backoff);
         // (capped) exponential backoff
         backoff = (((backoff << 1) + 1) & (MAX_BACKOFF-1));
       }
diff --git a/src/gpu/heterosync/src/hipLocksMutexSleep.h b/src/gpu/heterosync/src/hipLocksMutexSleep.h
index b9a1461..c49d401 100644
--- a/src/gpu/heterosync/src/hipLocksMutexSleep.h
+++ b/src/gpu/heterosync/src/hipLocksMutexSleep.h
@@ -79,8 +79,7 @@
       {
         // if we failed in acquiring the lock, wait for a little while before
         // trying again
-        //sleepFunc(backoff);
-        for (int j = 0; j < backoff; ++j) { ; }
+        sleepFunc(backoff);
         // (capped) exponential backoff
         backoff = (((backoff << 1) + 1) & (MAX_BACKOFF-1));
       }
diff --git a/src/gpu/heterosync/src/hipLocksSemaphoreEBO.h b/src/gpu/heterosync/src/hipLocksSemaphoreEBO.h
index 0128de3..69520be 100644
--- a/src/gpu/heterosync/src/hipLocksSemaphoreEBO.h
+++ b/src/gpu/heterosync/src/hipLocksSemaphoreEBO.h
@@ -162,8 +162,7 @@
     {
       // if we failed to enter the semaphore, wait for a little while before
       // trying again
-      //sleepFunc(backoff);
-      for (int j = 0; j < backoff; ++j) { ; }
+      sleepFunc(backoff);
       /*
         for writers increase backoff a lot because failing means readers are in
         the CS currently -- most important for non-unique because all WGs on
@@ -385,8 +384,7 @@
         if we failed to enter the semaphore, wait for a little while before
         trying again
       */
-      //sleepFunc(backoff);
-      for (int j = 0; j < backoff; ++j) { ; }
+      sleepFunc(backoff);
       // (capped) exponential backoff
       backoff = (((backoff << 1) + 1) & (MAX_BACKOFF-1));
     }
diff --git a/src/gpu/heterosync/src/main.hip.cpp b/src/gpu/heterosync/src/main.hip.cpp
index e4a2d0f..db38cb5 100644
--- a/src/gpu/heterosync/src/main.hip.cpp
+++ b/src/gpu/heterosync/src/main.hip.cpp
@@ -1269,7 +1269,8 @@
     The atomic barrier per-CU synchronization fits inside the lock-free size
     requirements so we can reuse the same locations.
   */
-  unsigned int * perCUBarriers = (unsigned int *)malloc(sizeof(unsigned int) * (NUM_CU * MAX_WGS * 2));
+  unsigned int * perCUBarriers;
+  hipHostMalloc(&perCUBarriers, sizeof(unsigned int) * (NUM_CU * MAX_WGS * 2));
 
   int numLocsMult = 0;
   // barriers and unique semaphores have numWGs WGs accessing unique locations
@@ -1298,7 +1299,8 @@
   assert(numUniqLocsAccPerWG > 0);
   int numStorageLocs = (numLocsMult * numUniqLocsAccPerWG);
   assert(numStorageLocs > 0);
-  float * storage = (float *)malloc(sizeof(float) * numStorageLocs);
+  float * storage;
+  hipHostMalloc(&storage, sizeof(float) * numStorageLocs);
 
   fprintf(stdout, "# WGs: %d, # Ld/St: %d, # Locs Mult: %d, # Uniq Locs/WG: %d, # Storage Locs: %d\n", numWGs, NUM_LDST, numLocsMult, numUniqLocsAccPerWG, numStorageLocs);
 
@@ -1307,17 +1309,6 @@
   // initialize per-CU barriers to 0's
   for (int i = 0; i < (NUM_CU * MAX_WGS * 2); ++i) { perCUBarriers[i] = 0; }
 
-  // gpu copies of storage and perCUBarriers
-  //float elapsedTime = 0.0f;
-  unsigned int * perCUBarriers_d = NULL;
-  float * storage_d = NULL;
-
-  hipMalloc(&perCUBarriers_d, sizeof(unsigned int) * (NUM_CU * MAX_WGS * 2));
-  hipMalloc(&storage_d, sizeof(float) * numStorageLocs);
-
-  hipMemcpy(perCUBarriers_d, perCUBarriers, sizeof(unsigned int) * (NUM_CU * MAX_WGS * 2), hipMemcpyHostToDevice);
-  hipMemcpy(storage_d, storage, sizeof(float) * numStorageLocs, hipMemcpyHostToDevice);
-
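+  // perCUBarriers and storage are allocated with hipHostMalloc above, so the
+  // GPU can access them directly and separate device copies are unnecessary.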
   // lock variables
   hipMutex_t spinMutex, faMutex, sleepMutex, eboMutex;
   hipMutex_t spinMutex_uniq, faMutex_uniq, sleepMutex_uniq, eboMutex_uniq;
@@ -1479,52 +1470,52 @@
 
   switch (syncPrim) {
     case 0: // atomic tree barrier
-      invokeAtomicTreeBarrier(storage_d, perCUBarriers_d, NUM_ITERS);
+      invokeAtomicTreeBarrier(storage, perCUBarriers, NUM_ITERS);
       break;
     case 1: // atomic tree barrier with local exchange
-      invokeAtomicTreeBarrierLocalExch(storage_d, perCUBarriers_d, NUM_ITERS);
+      invokeAtomicTreeBarrierLocalExch(storage, perCUBarriers, NUM_ITERS);
       break;
     case 2: // lock-free barrier
-      invokeFBSTreeBarrier(storage_d, perCUBarriers_d, NUM_ITERS);
+      invokeFBSTreeBarrier(storage, perCUBarriers, NUM_ITERS);
       break;
     case 3: // lock-free barrier with local exchange
-      invokeFBSTreeBarrierLocalExch(storage_d, perCUBarriers_d, NUM_ITERS);
+      invokeFBSTreeBarrierLocalExch(storage, perCUBarriers, NUM_ITERS);
       break;
     case 4: // Spin Lock Mutex
-      invokeSpinLockMutex   (spinMutex,  storage_d, NUM_ITERS);
+      invokeSpinLockMutex   (spinMutex,  storage, NUM_ITERS);
       break;
     case 5: // Spin Lock Mutex with backoff
-      invokeEBOMutex        (eboMutex,   storage_d, NUM_ITERS);
+      invokeEBOMutex        (eboMutex,   storage, NUM_ITERS);
       break;
     case 6: // Sleeping Mutex
-      invokeSleepingMutex   (sleepMutex, storage_d, NUM_ITERS);
+      invokeSleepingMutex   (sleepMutex, storage, NUM_ITERS);
       break;
     case 7: // fetch-and-add mutex
-      invokeFetchAndAddMutex(faMutex,    storage_d, NUM_ITERS);
+      invokeFetchAndAddMutex(faMutex,    storage, NUM_ITERS);
       break;
     case 8: // spin semaphore (1)
-      invokeSpinLockSemaphore(spinSem1,   storage_d,   1, NUM_ITERS, numStorageLocs);
+      invokeSpinLockSemaphore(spinSem1,   storage,   1, NUM_ITERS, numStorageLocs);
       break;
     case 9: // spin semaphore (2)
-      invokeSpinLockSemaphore(spinSem2,   storage_d,   2, NUM_ITERS, numStorageLocs);
+      invokeSpinLockSemaphore(spinSem2,   storage,   2, NUM_ITERS, numStorageLocs);
       break;
     case 10: // spin semaphore (10)
-      invokeSpinLockSemaphore(spinSem10,   storage_d,   10, NUM_ITERS, numStorageLocs);
+      invokeSpinLockSemaphore(spinSem10,   storage,   10, NUM_ITERS, numStorageLocs);
       break;
     case 11: // spin semaphore (120)
-      invokeSpinLockSemaphore(spinSem120,   storage_d,   120, NUM_ITERS, numStorageLocs);
+      invokeSpinLockSemaphore(spinSem120,   storage,   120, NUM_ITERS, numStorageLocs);
       break;
     case 12: // spin semaphore with backoff (1)
-      invokeEBOSemaphore(eboSem1,   storage_d,     1, NUM_ITERS, numStorageLocs);
+      invokeEBOSemaphore(eboSem1,   storage,     1, NUM_ITERS, numStorageLocs);
       break;
     case 13: // spin semaphore with backoff (2)
-      invokeEBOSemaphore(eboSem2,   storage_d,     2, NUM_ITERS, numStorageLocs);
+      invokeEBOSemaphore(eboSem2,   storage,     2, NUM_ITERS, numStorageLocs);
       break;
     case 14: // spin semaphore with backoff (10)
-      invokeEBOSemaphore(eboSem10,   storage_d,   10, NUM_ITERS, numStorageLocs);
+      invokeEBOSemaphore(eboSem10,   storage,   10, NUM_ITERS, numStorageLocs);
       break;
     case 15: // spin semaphore with backoff (120)
-      invokeEBOSemaphore(eboSem120,   storage_d, 120, NUM_ITERS, numStorageLocs);
+      invokeEBOSemaphore(eboSem120,   storage, 120, NUM_ITERS, numStorageLocs);
       break;
     // cases 16-19 reserved
     case 16:
@@ -1536,40 +1527,40 @@
     case 19:
       break;
     case 20: // Spin Lock Mutex (uniq)
-      invokeSpinLockMutex_uniq   (spinMutex_uniq,  storage_d, NUM_ITERS);
+      invokeSpinLockMutex_uniq   (spinMutex_uniq,  storage, NUM_ITERS);
       break;
     case 21: // Spin Lock Mutex with backoff (uniq)
-      invokeEBOMutex_uniq        (eboMutex_uniq,   storage_d, NUM_ITERS);
+      invokeEBOMutex_uniq        (eboMutex_uniq,   storage, NUM_ITERS);
       break;
     case 22: // Sleeping Mutex (uniq)
-      invokeSleepingMutex_uniq   (sleepMutex_uniq, storage_d, NUM_ITERS);
+      invokeSleepingMutex_uniq   (sleepMutex_uniq, storage, NUM_ITERS);
       break;
     case 23: // fetch-and-add mutex (uniq)
-      invokeFetchAndAddMutex_uniq(faMutex_uniq,    storage_d, NUM_ITERS);
+      invokeFetchAndAddMutex_uniq(faMutex_uniq,    storage, NUM_ITERS);
       break;
     case 24: // spin semaphore (1) (uniq)
-      invokeSpinLockSemaphore_uniq(spinSem1_uniq,   storage_d,   1, NUM_ITERS);
+      invokeSpinLockSemaphore_uniq(spinSem1_uniq,   storage,   1, NUM_ITERS);
       break;
     case 25: // spin semaphore (2) (uniq)
-      invokeSpinLockSemaphore_uniq(spinSem2_uniq,   storage_d,   2, NUM_ITERS);
+      invokeSpinLockSemaphore_uniq(spinSem2_uniq,   storage,   2, NUM_ITERS);
       break;
     case 26: // spin semaphore (10) (uniq)
-      invokeSpinLockSemaphore_uniq(spinSem10_uniq,   storage_d,   10, NUM_ITERS);
+      invokeSpinLockSemaphore_uniq(spinSem10_uniq,   storage,   10, NUM_ITERS);
       break;
     case 27: // spin semaphore (120) (uniq)
-      invokeSpinLockSemaphore_uniq(spinSem120_uniq,   storage_d,   120, NUM_ITERS);
+      invokeSpinLockSemaphore_uniq(spinSem120_uniq,   storage,   120, NUM_ITERS);
       break;
     case 28: // spin semaphore with backoff (1) (uniq)
-      invokeEBOSemaphore_uniq(eboSem1_uniq,   storage_d,     1, NUM_ITERS);
+      invokeEBOSemaphore_uniq(eboSem1_uniq,   storage,     1, NUM_ITERS);
       break;
     case 29: // spin semaphore with backoff (2) (uniq)
-      invokeEBOSemaphore_uniq(eboSem2_uniq,   storage_d,     2, NUM_ITERS);
+      invokeEBOSemaphore_uniq(eboSem2_uniq,   storage,     2, NUM_ITERS);
       break;
     case 30: // spin semaphore with backoff (10) (uniq)
-      invokeEBOSemaphore_uniq(eboSem10_uniq,   storage_d,   10, NUM_ITERS);
+      invokeEBOSemaphore_uniq(eboSem10_uniq,   storage,   10, NUM_ITERS);
       break;
     case 31: // spin semaphore with backoff (120) (uniq)
-      invokeEBOSemaphore_uniq(eboSem120_uniq,   storage_d, 120, NUM_ITERS);
+      invokeEBOSemaphore_uniq(eboSem120_uniq,   storage, 120, NUM_ITERS);
       break;
     // cases 32-36 reserved
     case 32:
@@ -1594,9 +1585,6 @@
   // NOTE: Can end simulation here if don't care about output checking
   hipDeviceSynchronize();
 
-  // copy results back to compare to golden
-  hipMemcpy(storage, storage_d, sizeof(float) * numStorageLocs, hipMemcpyDeviceToHost);
-
   // get golden results
   float storageGolden[numStorageLocs];
   int numLocsAccessed = 0, currLoc = 0;
@@ -1777,10 +1765,8 @@
 
   // free arrays
   hipLocksDestroy();
-  hipFree(storage_d);
-  hipFree(perCUBarriers_d);
-  free(storage);
-  free(perCUBarriers);
+  hipHostFree(storage);
+  hipHostFree(perCUBarriers);
 
   return 0;
 }
diff --git a/src/gpu/hip-samples/Makefile b/src/gpu/hip-samples/Makefile
index da12ce5..ee863b8 100644
--- a/src/gpu/hip-samples/Makefile
+++ b/src/gpu/hip-samples/Makefile
@@ -12,7 +12,7 @@
 all: $(EXECUTABLES)
 
 $(EXECUTABLES): %: $(SRC_DIR)/%.cpp | $(BIN_DIR)
-	$(HIPCC) $< -o $(BIN_DIR)/$@
+	$(HIPCC) --amdgpu-target=gfx801,gfx803 $< -o $(BIN_DIR)/$@
 
 $(BIN_DIR):
 	mkdir -p $@
diff --git a/src/gpu/hip-samples/README.md b/src/gpu/hip-samples/README.md
index 3b17193..467705b 100644
--- a/src/gpu/hip-samples/README.md
+++ b/src/gpu/hip-samples/README.md
@@ -33,23 +33,23 @@
 
 Individual programs can be made by specifying the name of the program
 
-By default, this code builds for gfx801, a GCN3-based APU. This can be
-overridden by specifying `-e HCC_AMDGPU_TARGET=<target>` in the build command.
+By default, the apps are built for all supported GPU types (gfx801, gfx803).
+This can be changed by editing the --amdgpu-target argument in the Makefile.
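+
+For example, to build the samples for a single GPU type, the compile rule in
+the Makefile could be edited as follows (an illustrative sketch; gfx801 is
+one of the two supported targets):
+
+```
+$(EXECUTABLES): %: $(SRC_DIR)/%.cpp | $(BIN_DIR)
+	$(HIPCC) --amdgpu-target=gfx801 $< -o $(BIN_DIR)/$@
+```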
 
 ## Pre-built binary
 
-<http://dist.gem5.org/dist/v21-0/test-progs/hip-samples/2dshfl>
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/2dshfl>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/hip-samples/dynamic_shared>
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/dynamic_shared>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/hip-samples/inline_asm>
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/inline_asm>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/hip-samples/MatrixTranspose>
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/MatrixTranspose>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/hip-samples/sharedMemory>
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/sharedMemory>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/hip-samples/shfl>
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/shfl>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/hip-samples/stream>
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/stream>
 
-<http://dist.gem5.org/dist/v21-0/test-progs/hip-samples/unroll>
+<http://dist.gem5.org/dist/v21-1/test-progs/hip-samples/unroll>
diff --git a/src/gpu/hip-samples/src/2dshfl.cpp b/src/gpu/hip-samples/src/2dshfl.cpp
index 1b22a0c..4e58cfb 100644
--- a/src/gpu/hip-samples/src/2dshfl.cpp
+++ b/src/gpu/hip-samples/src/2dshfl.cpp
@@ -36,8 +36,7 @@
 
 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
+__global__ void matrixTranspose(float *out,
                                 float *in,
                                 const int width)
 {
@@ -66,10 +65,8 @@
 int main() {
 
   float* Matrix;
-  float* TransposeMatrix;
   float* cpuTransposeMatrix;
 
-  float* gpuMatrix;
   float* gpuTransposeMatrix;
 
   hipDeviceProp_t devProp;
@@ -80,8 +77,7 @@
   int i;
   int errors;
 
-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  hipHostMalloc(&Matrix, NUM * sizeof(float));
   cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
 
   // initialize the input data
@@ -90,21 +86,15 @@
   }
 
   // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+  hipHostMalloc(&gpuTransposeMatrix, NUM * sizeof(float));
 
   // Launching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(1),
-                  dim3(THREADS_PER_BLOCK_X , THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+  hipLaunchKernelGGL(matrixTranspose,
+                     dim3(1),
+                     dim3(THREADS_PER_BLOCK_X , THREADS_PER_BLOCK_Y),
+                     0, 0,
+                     gpuTransposeMatrix , Matrix, WIDTH);
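+  // hipLaunchKernelGGL is asynchronous; wait for the kernel so the host can
+  // safely read the results directly from gpuTransposeMatrix (there is no
+  // longer a device-to-host copy).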
+  hipDeviceSynchronize();
 
   // CPU MatrixTranspose computation
   matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
@@ -113,8 +103,8 @@
   errors = 0;
   double eps = 1.0E-6;
   for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
-    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+    if (std::abs(gpuTransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
+    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],gpuTransposeMatrix[i]);
       errors++;
     }
   }
@@ -125,12 +115,10 @@
   }
 
   //free the resources on device side
-  hipFree(gpuMatrix);
   hipFree(gpuTransposeMatrix);
 
   //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
+  hipFree(Matrix);
   free(cpuTransposeMatrix);
 
   return errors;
diff --git a/src/gpu/hip-samples/src/MatrixTranspose.cpp b/src/gpu/hip-samples/src/MatrixTranspose.cpp
index 264fcbe..68741e2 100644
--- a/src/gpu/hip-samples/src/MatrixTranspose.cpp
+++ b/src/gpu/hip-samples/src/MatrixTranspose.cpp
@@ -37,8 +37,7 @@
 
 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
+__global__ void matrixTranspose(float *out,
                                 float *in,
                                 const int width)
 {
@@ -66,10 +65,8 @@
 int main() {
 
   float* Matrix;
-  float* TransposeMatrix;
   float* cpuTransposeMatrix;
 
-  float* gpuMatrix;
   float* gpuTransposeMatrix;
 
   hipDeviceProp_t devProp;
@@ -80,8 +77,7 @@
   int i;
   int errors;
 
-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  hipHostMalloc(&Matrix, NUM * sizeof(float));
   cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
 
   // initialize the input data
@@ -90,21 +86,15 @@
   }
 
   // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+  hipHostMalloc(&gpuTransposeMatrix, NUM * sizeof(float));
 
   // Launching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+  hipLaunchKernelGGL(matrixTranspose,
+                     dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                     0, 0,
+                     gpuTransposeMatrix , Matrix, WIDTH);
+  hipDeviceSynchronize();
 
   // CPU MatrixTranspose computation
   matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
@@ -113,7 +103,7 @@
   errors = 0;
   double eps = 1.0E-6;
   for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    if (std::abs(gpuTransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
       errors++;
     }
   }
@@ -124,12 +114,10 @@
   }
 
   //free the resources on device side
-  hipFree(gpuMatrix);
   hipFree(gpuTransposeMatrix);
 
   //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
+  hipFree(Matrix);
   free(cpuTransposeMatrix);
 
   return errors;
diff --git a/src/gpu/hip-samples/src/dynamic_shared.cpp b/src/gpu/hip-samples/src/dynamic_shared.cpp
index 22d7eb9..9627d3b 100644
--- a/src/gpu/hip-samples/src/dynamic_shared.cpp
+++ b/src/gpu/hip-samples/src/dynamic_shared.cpp
@@ -34,8 +34,7 @@
 
 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
+__global__ void matrixTranspose(float *out,
                                 float *in,
                                 const int width)
 {
@@ -70,10 +69,8 @@
 int main() {
 
   float* Matrix;
-  float* TransposeMatrix;
   float* cpuTransposeMatrix;
 
-  float* gpuMatrix;
   float* gpuTransposeMatrix;
 
   hipDeviceProp_t devProp;
@@ -84,8 +81,7 @@
   int i;
   int errors;
 
-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  hipHostMalloc(&Matrix, NUM * sizeof(float));
   cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
 
   // initialize the input data
@@ -94,21 +90,15 @@
   }
 
   // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+  hipHostMalloc(&gpuTransposeMatrix, NUM * sizeof(float));
 
   // Launching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                  sizeof(float)*WIDTH*WIDTH, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+  hipLaunchKernelGGL(matrixTranspose,
+                     dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                     sizeof(float)*WIDTH*WIDTH, 0,
+                     gpuTransposeMatrix , Matrix, WIDTH);
+  hipDeviceSynchronize();
 
   // CPU MatrixTranspose computation
   matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
@@ -117,8 +107,8 @@
   errors = 0;
   double eps = 1.0E-6;
   for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
-    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+    if (std::abs(gpuTransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],gpuTransposeMatrix[i]);
       errors++;
     }
   }
@@ -129,12 +119,10 @@
   }
 
   //free the resources on device side
-  hipFree(gpuMatrix);
   hipFree(gpuTransposeMatrix);
 
   //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
+  hipFree(Matrix);
   free(cpuTransposeMatrix);
 
   return errors;
diff --git a/src/gpu/hip-samples/src/inline_asm.cpp b/src/gpu/hip-samples/src/inline_asm.cpp
index f2345e5..5a8b628 100644
--- a/src/gpu/hip-samples/src/inline_asm.cpp
+++ b/src/gpu/hip-samples/src/inline_asm.cpp
@@ -35,8 +35,7 @@
 
 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
+__global__ void matrixTranspose(float *out,
                                 float *in,
                                 const int width)
 {
@@ -65,10 +64,8 @@
 int main() {
 
   float* Matrix;
-  float* TransposeMatrix;
   float* cpuTransposeMatrix;
 
-  float* gpuMatrix;
   float* gpuTransposeMatrix;
 
   hipDeviceProp_t devProp;
@@ -79,8 +76,7 @@
   int i;
   int errors;
 
-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  hipHostMalloc(&Matrix, NUM * sizeof(float));
   cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
 
   // initialize the input data
@@ -89,21 +85,15 @@
   }
 
   // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+  hipHostMalloc(&gpuTransposeMatrix, NUM * sizeof(float));
 
   // Launching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+  hipLaunchKernelGGL(matrixTranspose,
+                     dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                     0, 0,
+                     gpuTransposeMatrix , Matrix, WIDTH);
+  hipDeviceSynchronize();
 
   // CPU MatrixTranspose computation
   matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
@@ -112,8 +102,8 @@
   errors = 0;
   double eps = 1.0E-6;
   for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
-    printf("gpu%f cpu %f \n",TransposeMatrix[i],cpuTransposeMatrix[i]);
+    if (std::abs(gpuTransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    printf("gpu%f cpu %f \n",gpuTransposeMatrix[i],cpuTransposeMatrix[i]);
       errors++;
     }
   }
@@ -124,12 +114,10 @@
   }
 
   //free the resources on device side
-  hipFree(gpuMatrix);
   hipFree(gpuTransposeMatrix);
 
   //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
+  hipFree(Matrix);
   free(cpuTransposeMatrix);
 
   return errors;
diff --git a/src/gpu/hip-samples/src/sharedMemory.cpp b/src/gpu/hip-samples/src/sharedMemory.cpp
index 9b51aba..d88d18e 100644
--- a/src/gpu/hip-samples/src/sharedMemory.cpp
+++ b/src/gpu/hip-samples/src/sharedMemory.cpp
@@ -36,8 +36,7 @@
 
 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
+__global__ void matrixTranspose(float *out,
                                 float *in,
                                 const int width)
 {
@@ -71,10 +70,8 @@
 int main() {
 
   float* Matrix;
-  float* TransposeMatrix;
   float* cpuTransposeMatrix;
 
-  float* gpuMatrix;
   float* gpuTransposeMatrix;
 
   hipDeviceProp_t devProp;
@@ -85,8 +82,7 @@
   int i;
   int errors;
 
-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  hipHostMalloc(&Matrix, NUM * sizeof(float));
   cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
 
   // initialize the input data
@@ -95,21 +91,15 @@
   }
 
   // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+  hipHostMalloc(&gpuTransposeMatrix, NUM * sizeof(float));
 
   // Launching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+  hipLaunchKernelGGL(matrixTranspose,
+                     dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                     0, 0,
+                     gpuTransposeMatrix , Matrix, WIDTH);
+  hipDeviceSynchronize();
 
   // CPU MatrixTranspose computation
   matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
@@ -118,8 +108,8 @@
   errors = 0;
   double eps = 1.0E-6;
   for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
-    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+    if (std::abs(gpuTransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],gpuTransposeMatrix[i]);
       errors++;
     }
   }
@@ -130,12 +120,10 @@
   }
 
   //free the resources on device side
-  hipFree(gpuMatrix);
   hipFree(gpuTransposeMatrix);
 
   //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
+  hipFree(Matrix);
   free(cpuTransposeMatrix);
 
   return errors;
diff --git a/src/gpu/hip-samples/src/shfl.cpp b/src/gpu/hip-samples/src/shfl.cpp
index e0f4c21..d523ffb 100644
--- a/src/gpu/hip-samples/src/shfl.cpp
+++ b/src/gpu/hip-samples/src/shfl.cpp
@@ -36,8 +36,7 @@
 
 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
+__global__ void matrixTranspose(float *out,
                                 float *in,
                                 const int width)
 {
@@ -70,10 +69,8 @@
 int main() {
 
   float* Matrix;
-  float* TransposeMatrix;
   float* cpuTransposeMatrix;
 
-  float* gpuMatrix;
   float* gpuTransposeMatrix;
 
   hipDeviceProp_t devProp;
@@ -84,8 +81,7 @@
   int i;
   int errors;
 
-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  hipHostMalloc(&Matrix, NUM * sizeof(float));
   cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
 
   // initialize the input data
@@ -94,21 +90,15 @@
   }
 
   // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+  hipHostMalloc(&gpuTransposeMatrix, NUM * sizeof(float));
 
   // Launching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(1),
-                  dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+  hipLaunchKernelGGL(matrixTranspose,
+                     dim3(1),
+                     dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y),
+                     0, 0,
+                     gpuTransposeMatrix , Matrix, WIDTH);
+  hipDeviceSynchronize();
 
   // CPU MatrixTranspose computation
   matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
@@ -117,8 +107,8 @@
   errors = 0;
   double eps = 1.0E-6;
   for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
-    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+    if (std::abs(gpuTransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],gpuTransposeMatrix[i]);
       errors++;
     }
   }
@@ -129,12 +119,10 @@
   }
 
   //free the resources on device side
-  hipFree(gpuMatrix);
   hipFree(gpuTransposeMatrix);
 
   //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
+  hipFree(Matrix);
   free(cpuTransposeMatrix);
 
   return errors;
diff --git a/src/gpu/hip-samples/src/stream.cpp b/src/gpu/hip-samples/src/stream.cpp
index 2dc7544..c14759a 100644
--- a/src/gpu/hip-samples/src/stream.cpp
+++ b/src/gpu/hip-samples/src/stream.cpp
@@ -30,10 +30,9 @@
 
 using namespace std;
 
-__global__ void matrixTranspose_static_shared(hipLaunchParm lp,
-                                float *out,
-                                float *in,
-                                const int width)
+__global__ void matrixTranspose_static_shared(float *out,
+                                              float *in,
+                                              const int width)
 {
     __shared__ float sharedMem[WIDTH*WIDTH];
 
@@ -47,10 +46,9 @@
     out[y * width + x] = sharedMem[y * width + x];
 }
 
-__global__ void matrixTranspose_dynamic_shared(hipLaunchParm lp,
-                                float *out,
-                                float *in,
-                                const int width)
+__global__ void matrixTranspose_dynamic_shared(float *out,
+                                               float *in,
+                                               const int width)
 {
     // declare dynamic shared memory
     HIP_DYNAMIC_SHARED(float, sharedMem)
@@ -65,7 +63,7 @@
     out[y * width + x] = sharedMem[y * width + x];
 }
 
-void MultipleStream (float **data, float *randArray, float **gpuTransposeMatrix, float **TransposeMatrix, int width)
+void MultipleStream (float **data, float *randArray, float **gpuTransposeMatrix, int width)
 {
     const int num_streams = 2;
     hipStream_t streams[num_streams];
@@ -75,48 +73,42 @@
 
     for(int i=0;i<num_streams;i++)
     {
-        hipMalloc((void**)&data[i], NUM * sizeof(float));
+        hipHostMalloc(&data[i], NUM * sizeof(float));
         hipMemcpyAsync(data[i], randArray, NUM * sizeof(float), hipMemcpyHostToDevice,streams[i]);
     }
 
-    hipLaunchKernel(matrixTranspose_static_shared,
-                    dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                    0, streams[0],
-                    gpuTransposeMatrix[0], data[0], width);
+    hipLaunchKernelGGL(matrixTranspose_static_shared,
+                       dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                       dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                       0, streams[0],
+                       gpuTransposeMatrix[0], data[0], width);
 
-    hipLaunchKernel(matrixTranspose_dynamic_shared,
-                    dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                    sizeof(float)*WIDTH*WIDTH, streams[1],
-                    gpuTransposeMatrix[1], data[1], width);
-
-    for(int i=0;i<num_streams;i++)
-        hipMemcpyAsync(TransposeMatrix[i], gpuTransposeMatrix[i], NUM*sizeof(float), hipMemcpyDeviceToHost, streams[i]);
+    hipLaunchKernelGGL(matrixTranspose_dynamic_shared,
+                       dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                       dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                       sizeof(float)*WIDTH*WIDTH, streams[1],
+                       gpuTransposeMatrix[1], data[1], width);
 }
 
 int main(){
 
     hipSetDevice(0);
 
-    float *data[2], *TransposeMatrix[2], *gpuTransposeMatrix[2], *randArray;
+    float *data[2], *gpuTransposeMatrix[2], *randArray;
 
     int width = WIDTH;
 
     randArray = (float*)malloc(NUM * sizeof(float));
 
-    TransposeMatrix[0] = (float*)malloc(NUM * sizeof(float));
-    TransposeMatrix[1] = (float*)malloc(NUM * sizeof(float));
-
-    hipMalloc((void**)&gpuTransposeMatrix[0], NUM * sizeof(float));
-    hipMalloc((void**)&gpuTransposeMatrix[1], NUM * sizeof(float));
+    hipHostMalloc(&gpuTransposeMatrix[0], NUM * sizeof(float));
+    hipHostMalloc(&gpuTransposeMatrix[1], NUM * sizeof(float));
 
     for(int i = 0; i < NUM; i++)
     {
         randArray[i] = (float)i*1.0f;
     }
 
-    MultipleStream(data, randArray, gpuTransposeMatrix, TransposeMatrix, width);
+    MultipleStream(data, randArray, gpuTransposeMatrix, width);
 
     hipDeviceSynchronize();
 
@@ -124,9 +116,9 @@
     int errors = 0;
     double eps = 1.0E-6;
     for (int i = 0; i < NUM; i++) {
-        if (std::abs(TransposeMatrix[0][i] - TransposeMatrix[1][i]) > eps ) {
-        printf("%d stream0: %f stream1  %f\n",i,TransposeMatrix[0][i],TransposeMatrix[1][i]);
-        errors++;
+        if (std::abs(gpuTransposeMatrix[0][i] - gpuTransposeMatrix[1][i]) > eps ) {
+            printf("%d stream0: %f stream1  %f\n",i,gpuTransposeMatrix[0][i],gpuTransposeMatrix[1][i]);
+            errors++;
         }
     }
     if (errors!=0) {
@@ -139,7 +131,6 @@
     for(int i=0;i<2;i++){
        hipFree(data[i]);
        hipFree(gpuTransposeMatrix[i]);
-       free(TransposeMatrix[i]);
     }
 
     hipDeviceReset();
diff --git a/src/gpu/hip-samples/src/unroll.cpp b/src/gpu/hip-samples/src/unroll.cpp
index 22f1c75..6935c03 100644
--- a/src/gpu/hip-samples/src/unroll.cpp
+++ b/src/gpu/hip-samples/src/unroll.cpp
@@ -36,8 +36,7 @@
 
 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
+__global__ void matrixTranspose(float *out,
                                 float *in,
                                 const int width)
 {
@@ -70,10 +69,8 @@
 int main() {
 
   float* Matrix;
-  float* TransposeMatrix;
   float* cpuTransposeMatrix;
 
-  float* gpuMatrix;
   float* gpuTransposeMatrix;
 
   hipDeviceProp_t devProp;
@@ -84,8 +81,7 @@
   int i;
   int errors;
 
-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  hipHostMalloc(&Matrix, NUM * sizeof(float));
   cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
 
   // initialize the input data
@@ -94,21 +90,15 @@
   }
 
   // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+  hipHostMalloc(&gpuTransposeMatrix, NUM * sizeof(float));
 
   // Launching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(1),
-                  dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+  hipLaunchKernelGGL(matrixTranspose,
+                     dim3(1),
+                     dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y),
+                     0, 0,
+                     gpuTransposeMatrix , Matrix, WIDTH);
+  hipDeviceSynchronize();
 
   // CPU MatrixTranspose computation
   matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
@@ -117,8 +107,8 @@
   errors = 0;
   double eps = 1.0E-6;
   for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
-    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+    if (std::abs(gpuTransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],gpuTransposeMatrix[i]);
       errors++;
     }
   }
@@ -129,12 +119,10 @@
   }
 
   //free the resources on device side
-  hipFree(gpuMatrix);
   hipFree(gpuTransposeMatrix);
 
   //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
+  hipFree(Matrix);
   free(cpuTransposeMatrix);
 
   return errors;
diff --git a/src/gpu/lulesh/README.md b/src/gpu/lulesh/README.md
index dbcc867..a0726c4 100644
--- a/src/gpu/lulesh/README.md
+++ b/src/gpu/lulesh/README.md
@@ -47,4 +47,4 @@
 
 ## Pre-built binary
 
-<http://dist.gem5.org/dist/v21-0/test-progs/lulesh/lulesh>
+<http://dist.gem5.org/dist/v21-1/test-progs/lulesh/lulesh>
diff --git a/src/gpu/pennant/README.md b/src/gpu/pennant/README.md
index 5542214..5c1108f 100644
--- a/src/gpu/pennant/README.md
+++ b/src/gpu/pennant/README.md
@@ -40,7 +40,7 @@
 
 ## Pre-built binary
 
-<http://dist.gem5.org/dist/v21-0/test-progs/pennant/pennant>
+<http://dist.gem5.org/dist/v21-1/test-progs/pennant/pennant>
 
 The information from the original PENNANT README is included below.
 
diff --git a/src/gpu/square/Makefile b/src/gpu/square/Makefile
index 4f4eb82..48164aa 100644
--- a/src/gpu/square/Makefile
+++ b/src/gpu/square/Makefile
@@ -3,10 +3,10 @@
 
 BIN_DIR?= ./bin
 
-gfx8-apu: $(BIN_DIR)/square.o
+square: $(BIN_DIR)/square
 
-$(BIN_DIR)/square.o: square.cpp $(BIN_DIR)
-	$(HIPCC) --amdgpu-target=gfx801 $(CXXFLAGS) square.cpp -o $(BIN_DIR)/square.o
+$(BIN_DIR)/square: square.cpp $(BIN_DIR)
+	$(HIPCC) --amdgpu-target=gfx801,gfx803 $(CXXFLAGS) square.cpp -o $(BIN_DIR)/square
 
 $(BIN_DIR):
 	mkdir -p $(BIN_DIR)
@@ -14,4 +14,4 @@
 clean:
 	rm -rf $(BIN_DIR)
 
-.PHONY: gfx8-apu clean
+.PHONY: square clean
diff --git a/src/gpu/square/README.md b/src/gpu/square/README.md
index 3a764c1..104f434 100644
--- a/src/gpu/square/README.md
+++ b/src/gpu/square/README.md
@@ -15,14 +15,15 @@
 
 ## Compiling Square
 
+By default, square will build for all supported GPU types (gfx801, gfx803).
+
 ```
 cd src/gpu/square
-docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make gfx8-apu
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make
 ```
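+
+To target a single GPU type instead, the compile rule in the Makefile can be
+edited accordingly (illustrative sketch):
+
+```
+$(BIN_DIR)/square: square.cpp $(BIN_DIR)
+	$(HIPCC) --amdgpu-target=gfx801 $(CXXFLAGS) square.cpp -o $(BIN_DIR)/square
+```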
 
 The compiled binary can be found in the `bin` directory.
 
-A pre-built binary can be found at <http://dist.gem5.org/dist/develop/test-progs/square/square.o>.
+A pre-built binary can be found at <http://dist.gem5.org/dist/v21-1/test-progs/square/square>.
 
 ## Compiling GCN3_X86/gem5.opt
 
@@ -37,5 +38,5 @@
 ## Running Square on GCN3_X86/gem5.opt
 
 ```
-docker run -u $UID:$GUID --volume $(pwd):$(pwd) -w $(pwd) gcr.io/gem5-test/gcn-gpu:latest gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n <num cores> -c bin/square.o
+docker run -u $UID:$GID --volume $(pwd):$(pwd) -w $(pwd) gcr.io/gem5-test/gcn-gpu:latest gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n 3 -c bin/square
 ```
diff --git a/src/gpu/square/square.cpp b/src/gpu/square/square.cpp
index 87bf597..cd1ce72 100644
--- a/src/gpu/square/square.cpp
+++ b/src/gpu/square/square.cpp
@@ -50,9 +50,6 @@
 
 int main(int argc, char *argv[])
 {
-#ifdef DGPU
-    float *A_d, *C_d;
-#endif
     float *A_h, *C_h;
     size_t N = 1000000;
     size_t Nbytes = N * sizeof(float);
@@ -63,38 +60,21 @@
     #ifdef __HIP_PLATFORM_HCC__
       printf ("info: architecture on AMD GPU device is: %d\n",props.gcnArch);
     #endif
-    printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
-    A_h = (float*)malloc(Nbytes);
-    CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
-    C_h = (float*)malloc(Nbytes);
-    CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
+    printf ("info: allocate host and device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+    CHECK(hipHostMalloc(&A_h, Nbytes));
+    CHECK(hipHostMalloc(&C_h, Nbytes));
     // Fill with Phi + i
     for (size_t i=0; i<N; i++)
     {
         A_h[i] = 1.618f + i;
     }
 
-#ifdef DGPU
-    printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
-    CHECK(hipMalloc(&A_d, Nbytes));
-    CHECK(hipMalloc(&C_d, Nbytes));
-
-    printf ("info: copy Host2Device\n");
-    CHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
-#endif
-
     const unsigned blocks = 512;
     const unsigned threadsPerBlock = 256;
 
     printf ("info: launch 'vector_square' kernel\n");
-#ifdef DGPU
-    hipLaunchKernelGGL(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
-
-    printf ("info: copy Device2Host\n");
-    CHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
-#else
     hipLaunchKernelGGL(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_h, A_h, N);
-#endif
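+    // The launch is asynchronous; block until the kernel completes before
+    // the host checks the results in the shared C_h allocation.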
+    hipDeviceSynchronize();
 
     printf ("info: check result\n");
     for (size_t i=0; i<N; i++)  {
diff --git a/src/linux-kernel/README.md b/src/linux-kernel/README.md
index 445051a..8a7c438 100644
--- a/src/linux-kernel/README.md
+++ b/src/linux-kernel/README.md
@@ -66,11 +66,11 @@
 

 The pre-built Linux binaries can be downloaded from the following links:

 

-- [vmlinux-4.4.186](http://dist.gem5.org/dist/develop/kernels/x86/static/vmlinux-4.4.186)

-- [vmlinux-4.9.186](http://dist.gem5.org/dist/develop/kernels/x86/static/vmlinux-4.9.186)

-- [vmlinux-4.14.134](http://dist.gem5.org/dist/develop/kernels/x86/static/vmlinux-4.14.134)

-- [vmlinux-4.19.83](http://dist.gem5.org/dist/develop/kernels/x86/static/vmlinux-4.19.83)

-- [vmlinux-5.4.49](http://dist.gem5.org/dist/develop/kernels/x86/static/vmlinux-5.4.49)

+- [vmlinux-4.4.186](http://dist.gem5.org/dist/v21-1/kernels/x86/static/vmlinux-4.4.186)

+- [vmlinux-4.9.186](http://dist.gem5.org/dist/v21-1/kernels/x86/static/vmlinux-4.9.186)

+- [vmlinux-4.14.134](http://dist.gem5.org/dist/v21-1/kernels/x86/static/vmlinux-4.14.134)

+- [vmlinux-4.19.83](http://dist.gem5.org/dist/v21-1/kernels/x86/static/vmlinux-4.19.83)

+- [vmlinux-5.4.49](http://dist.gem5.org/dist/v21-1/kernels/x86/static/vmlinux-5.4.49)

 

 

 **Licensing:**

diff --git a/src/npb/README.md b/src/npb/README.md
index e8c8f0f..a4a71c2 100644
--- a/src/npb/README.md
+++ b/src/npb/README.md
@@ -81,14 +81,14 @@
 ```
 
 Once this process succeeds, the created disk image can be found on `npb/npb-image/npb`.
-A disk image already created following the above instructions can be found, gzipped, [here](http://dist.gem5.org/dist/develop/images/x86/ubuntu-18-04/npb.img.gz).
+A disk image already created following the above instructions can be found, gzipped, [here](http://dist.gem5.org/dist/v21-1/images/x86/ubuntu-18-04/npb.img.gz).
 
 ## gem5 Run Scripts
 
 The gem5 scripts which configure the system and run simulation are available in configs-npb-tests/.
 The main script `run_npb.py` expects the following arguments:
 
-**kernel:** path to the Linux kernel. This disk image has been tested with version 4.19.83, available at <http://dist.gem5.org/dist/develop/kernels/x86/static/vmlinux-4.19.83>. More info on building Linux Kernels can be found in the `src/linux-kernels` directory.
+**kernel:** path to the Linux kernel. The disk image has been tested with kernel version 4.19.83, available at <http://dist.gem5.org/dist/v21-1/kernels/x86/static/vmlinux-4.19.83>. More info on building Linux kernels can be found in the `src/linux-kernel` directory.
 
 **disk:** path to the npb disk image.
 
diff --git a/src/npb/configs/run_npb.py b/src/npb/configs/run_npb.py
index 12b43de..7aa3dca 100755
--- a/src/npb/configs/run_npb.py
+++ b/src/npb/configs/run_npb.py
@@ -35,17 +35,12 @@
     number of instructions executed in the ROI. It also tracks how much
     wallclock and simulated time the run takes.
 """
-import errno
-import os
-import sys
+import argparse
 import time
 import m5
 import m5.ticks
 from m5.objects import *
 
-sys.path.append('gem5/configs/common/') # For the next line...
-import SimpleOpts
-
 from system import *
 
 def writeBenchScript(dir, bench):
@@ -66,32 +61,80 @@
     bench_file.close()
     return file_name
 
-if __name__ == "__m5_main__":
-    (opts, args) = SimpleOpts.parse_args()
-    kernel, disk, cpu, mem_sys, benchmark, num_cpus = args
+supported_protocols = ["classic", "MI_example", "MESI_Two_Level",
+                        "MOESI_CMP_directory"]
+supported_cpu_types = ['kvm', 'atomic', 'timing']
+benchmark_choices = ['bt.A.x', 'cg.A.x', 'ep.A.x', 'ft.A.x',
+                     'is.A.x', 'lu.A.x', 'mg.A.x', 'sp.A.x',
+                     'bt.B.x', 'cg.B.x', 'ep.B.x', 'ft.B.x',
+                     'is.B.x', 'lu.B.x', 'mg.B.x', 'sp.B.x',
+                     'bt.C.x', 'cg.C.x', 'ep.C.x', 'ft.C.x',
+                     'is.C.x', 'lu.C.x', 'mg.C.x', 'sp.C.x',
+                     'bt.D.x', 'cg.D.x', 'ep.D.x', 'ft.D.x',
+                     'is.D.x', 'lu.D.x', 'mg.D.x', 'sp.D.x',
+                     'bt.F.x', 'cg.F.x', 'ep.F.x', 'ft.F.x',
+                     'is.F.x', 'lu.F.x', 'mg.F.x', 'sp.F.x']
 
-    if not cpu in ['atomic', 'kvm', 'timing']:
-        m5.fatal("cpu not supported")
+def parse_options():
+
+    parser = argparse.ArgumentParser(description='For use with gem5. This '
+                'runs a NAS Parallel Benchmark application. This only works '
+                'with the x86 ISA.')
+
+    # The mandatory positional arguments.
+    parser.add_argument("kernel", type=str,
+                        help="Path to the kernel binary to boot")
+    parser.add_argument("disk", type=str,
+                        help="Path to the disk image to boot")
+    parser.add_argument("cpu", type=str, choices=supported_cpu_types,
+                        help="The type of CPU to use in the system")
+    parser.add_argument("mem_sys", type=str, choices=supported_protocols,
+                        help="Type of memory system or coherence protocol")
+    parser.add_argument("benchmark", type=str, choices=benchmark_choices,
+                        help="The NPB application to run")
+    parser.add_argument("num_cpus", type=int, help="Number of CPU cores")
+
+    # The optional arguments.
+    parser.add_argument("--no_host_parallel", action="store_true",
+                        help="Do NOT run gem5 on multiple host threads "
+                              "(kvm only)")
+    parser.add_argument("--second_disk", type=str,
+                        help="The second disk image to mount (/dev/hdb)")
+    parser.add_argument("--no_prefetchers", action="store_true",
+                        help="Enable prefectchers on the caches")
+    parser.add_argument("--l1i_size", type=str, default='32kB',
+                        help="L1 instruction cache size. Default: 32kB")
+    parser.add_argument("--l1d_size", type=str, default='32kB',
+                        help="L1 data cache size. Default: 32kB")
+    parser.add_argument("--l2_size", type=str, default = "256kB",
+                        help="L2 cache size. Default: 256kB")
+    parser.add_argument("--l3_size", type=str, default = "4MB",
+                        help="L2 cache size. Default: 4MB")
+
+    return parser.parse_args()
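+
+# Example invocation (paths are placeholders; the positional order is
+# kernel, disk, cpu, mem_sys, benchmark, num_cpus):
+#   gem5.opt run_npb.py vmlinux-4.19.83 npb-image/npb kvm MESI_Two_Level \
+#       bt.A.x 4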
+
+if __name__ == "__m5_main__":
+    args = parse_options()
+
 
     # create the system we are going to simulate
-    system = MySystem(kernel, disk, int(num_cpus), opts, no_kvm=False)
 
 
-    ruby_protocols = [ "MI_example", "MESI_Two_Level", "MOESI_CMP_directory"]
-
-    if mem_sys == "classic":
-        system = MySystem(kernel, disk, int(num_cpus), opts, no_kvm=False)
-    elif mem_sys in ruby_protocols:
-        system = MyRubySystem(kernel, disk, mem_sys, int(num_cpus), opts)
+    if args.mem_sys == "classic":
+        system = MySystem(args.kernel, args.disk, args.num_cpus, args,
+                          no_kvm=False)
     else:
-        m5.fatal("Bad option for mem_sys")
+        system = MyRubySystem(args.kernel, args.disk, args.mem_sys,
+                              args.num_cpus, args)
 
     # Exit from guest on workbegin/workend
     system.exit_on_work_items = True
 
     # Create and pass a script to the simulated system to run the required
     # benchmark
-    system.readfile = writeBenchScript(m5.options.outdir, benchmark)
+    system.readfile = writeBenchScript(m5.options.outdir, args.benchmark)
 
     # set up the root SimObject and start the simulation
     root = Root(full_system = True, system = system)
@@ -111,7 +154,7 @@
     globalStart = time.time()
 
     print("Running the simulation")
-    print("Using cpu: {}".format(cpu))
+    print("Using cpu: {}".format(args.cpu))
     exit_event = m5.simulate()
 
     if exit_event.getCause() == "workbegin":
@@ -124,9 +167,9 @@
         start_tick = m5.curTick()
         start_insts = system.totalInsts()
         # switching cpu if argument cpu == atomic or timing
-        if cpu == 'atomic':
+        if args.cpu == 'atomic':
             system.switchCpus(system.cpu, system.atomicCpu)
-        if cpu == 'timing':
+        if args.cpu == 'timing':
             system.switchCpus(system.cpu, system.timingCpu)
     else:
         print("Unexpected termination of simulation !")
@@ -149,18 +192,17 @@
     # switching back to simulate the remaining
     # part
 
-    if mem_sys in ruby_protocols:
-        print("Ruby Mem: Not Switching back to KVM!")
-
-    if mem_sys == 'classic':
+    if args.mem_sys == 'classic':
         # switch cpu back to kvm if atomic/timing was used for ROI
-        if cpu == 'atomic':
+        if args.cpu == 'atomic':
             system.switchCpus(system.atomicCpu, system.cpu)
-        if cpu == 'timing':
+        if args.cpu == 'timing':
             system.switchCpus(system.timingCpu, system.cpu)
 
         # Simulate the remaning part of the benchmark
         exit_event = m5.simulate()
+    else:
+        print("Ruby Mem: Not Switching back to KVM!")
 
     print("Done with the simulation")
     print()
diff --git a/src/npb/configs/system/caches.py b/src/npb/configs/system/caches.py
index 3751926..9e44211 100755
--- a/src/npb/configs/system/caches.py
+++ b/src/npb/configs/system/caches.py
@@ -30,26 +30,16 @@
 """ Caches with options for a simple gem5 configuration script
 
 This file contains L1 I/D and L2 caches to be used in the simple
-gem5 configuration script. It uses the SimpleOpts wrapper to set up command
-line options from each individual class.
+gem5 configuration script.
 """
 
-import m5
-from m5.objects import Cache, L2XBar, StridePrefetcher, SubSystem
-from m5.params import AddrRange, AllMemory, MemorySize
-from m5.util.convert import toMemorySize
-
-import SimpleOpts
+from m5.objects import Cache, L2XBar, StridePrefetcher
 
 # Some specific options for caches
 # For all options see src/mem/cache/BaseCache.py
 
 class PrefetchCache(Cache):
 
-    SimpleOpts.add_option("--no_prefetchers", default=False,
-                          action="store_true",
-                          help="Enable prefectchers on the caches")
-
     def __init__(self, options):
         super(PrefetchCache, self).__init__()
         if not options or options.no_prefetchers:
@@ -83,12 +73,6 @@
 class L1ICache(L1Cache):
     """Simple L1 instruction cache with default values"""
 
-    # Set the default size
-    size = '32kB'
-
-    SimpleOpts.add_option('--l1i_size',
-                        help="L1 instruction cache size. Default: %s" % size)
-
     def __init__(self, opts=None):
         super(L1ICache, self).__init__(opts)
         if not opts or not opts.l1i_size:
@@ -102,12 +86,6 @@
 class L1DCache(L1Cache):
     """Simple L1 data cache with default values"""
 
-    # Set the default size
-    size = '32kB'
-
-    SimpleOpts.add_option('--l1d_size',
-                          help="L1 data cache size. Default: %s" % size)
-
     def __init__(self, opts=None):
         super(L1DCache, self).__init__(opts)
         if not opts or not opts.l1d_size:
@@ -149,7 +127,6 @@
     """Simple L2 Cache with default values"""
 
     # Default parameters
-    size = '256kB'
     assoc = 16
     tag_latency = 10
     data_latency = 10
@@ -158,9 +135,6 @@
     tgts_per_mshr = 12
     writeback_clean = True
 
-    SimpleOpts.add_option('--l2_size',
-                          help="L2 cache size. Default: %s" % size)
-
     def __init__(self, opts=None):
         super(L2Cache, self).__init__(opts)
         if not opts or not opts.l2_size:
@@ -179,9 +153,6 @@
        be used as a standalone L3 cache.
     """
 
-    SimpleOpts.add_option('--l3_size', default = '4MB',
-                          help="L3 cache size. Default: 4MB")
-
     # Default parameters
     assoc = 32
     tag_latency = 40
diff --git a/src/npb/configs/system/ruby_system.py b/src/npb/configs/system/ruby_system.py
index 6b3435a..a6d7fcb 100755
--- a/src/npb/configs/system/ruby_system.py
+++ b/src/npb/configs/system/ruby_system.py
@@ -29,7 +29,6 @@
 
 import m5
 from m5.objects import *
-from m5.util import convert
 from .fs_tools import *
 
 
diff --git a/src/npb/configs/system/system.py b/src/npb/configs/system/system.py
index e7a0753..f0e71c2 100755
--- a/src/npb/configs/system/system.py
+++ b/src/npb/configs/system/system.py
@@ -29,20 +29,12 @@
 
 import m5
 from m5.objects import *
-from m5.util import convert
 from .fs_tools import *
 from .caches import *
 
 
 class MySystem(System):
 
-    SimpleOpts.add_option("--no_host_parallel", default=False,
-                action="store_true",
-                help="Do NOT run gem5 on multiple host threads (kvm only)")
-
-    SimpleOpts.add_option("--second_disk", default='',
-                          help="The second disk image to mount (/dev/hdb)")
-
     def __init__(self, kernel, disk, num_cpus, opts, no_kvm=False):
         super(MySystem, self).__init__()
         self._opts = opts
diff --git a/src/parsec/README.md b/src/parsec/README.md
index 336e8ef..0665fd2 100644
--- a/src/parsec/README.md
+++ b/src/parsec/README.md
@@ -79,8 +79,8 @@
 
 There are two sets of run scripts and system configuration files in the directory. The scripts found in `configs` use the classic memory system, while the scripts in `configs-mesi-two-level` use the ruby memory system with the MESI_Two_Level cache coherency protocol. The parameters used in both sets of experiments are explained below:
 
-* **kernel**: The path to the linux kernel. We have verified capatibility with kernel version 4.19.83 which you can download at <http://dist.gem5.org/dist/develop/kernels/x86/static/vmlinux-4.19.83>. More information on building kernels for gem5 can be around in `src/linux-kernel`.
-* **disk**: The path to the PARSEC disk-image.
+* **kernel**: The path to the Linux kernel. We have verified compatibility with kernel version 4.19.83, which you can download at <http://dist.gem5.org/dist/v21-1/kernels/x86/static/vmlinux-4.19.83>. More information on building kernels for gem5 can be found in `src/linux-kernel`.
+* **disk**: The path to the PARSEC disk-image. This can be downloaded, gzipped, from <http://dist.gem5.org/dist/v21-1/images/x86/ubuntu-18-04/parsec.img.gz>.
 * **cpu**: The type of cpu to use. There are two supported options: `kvm` (KvmCPU) and `timing` (TimingSimpleCPU).
 * **benchmark**: The PARSEC workload to run. They include `blackscholes`, `bodytrack`, `canneal`, `dedup`, `facesim`, `ferret`, `fluidanimate`, `freqmine`, `raytrace`, `streamcluster`, `swaptions`, `vips`, `x264`. For more information on the workloads can be found at <https://parsec.cs.princeton.edu/>.
 * **size**: The size of the chosen workload. Valid sizes are `simsmall`, `simmedium`, and `simlarge`.
diff --git a/src/parsec/configs-mesi-two-level/run_parsec_mesi_two_level.py b/src/parsec/configs-mesi-two-level/run_parsec_mesi_two_level.py
index e95c8e1..311d8ca 100755
--- a/src/parsec/configs-mesi-two-level/run_parsec_mesi_two_level.py
+++ b/src/parsec/configs-mesi-two-level/run_parsec_mesi_two_level.py
@@ -37,19 +37,42 @@
     disk-image for this script.
 
 """
-import errno
-import os
-import sys
+import argparse
 import time
 import m5
 import m5.ticks
 from m5.objects import *
 
-sys.path.append('gem5/configs/common/') # For the next line...
-import SimpleOpts
-
 from system import *
 
+supported_cpu_types = ["kvm", "timing"]
+benchmark_choices = ["blackscholes", "bodytrack", "canneal", "dedup",
+                     "facesim", "ferret", "fluidanimate", "freqmine",
+                     "raytrace", "streamcluster", "swaptions", "vips", "x264"]
+size_choices=["simsmall", "simmedium", "simlarge"]
+
+
+def parse_options():
+
+    parser = argparse.ArgumentParser(description='For use with gem5. This '
+                'runs a PARSEC benchmark application. This only works '
+                'with the x86 ISA.')
+
+    parser.add_argument("kernel", type=str,
+                        help="Path to the kernel binary to boot")
+    parser.add_argument("disk", type=str, help="Path to the PARSEC disk image")
+    parser.add_argument("cpu", type=str, choices=supported_cpu_types,
+                        help="The type of CPU to use in the system")
+    parser.add_argument("benchmark", type=str, choices=benchmark_choices,
+                        help="The PARSEC benchmark application to run")
+    parser.add_argument("size", type=str, choices=size_choices,
+                        help="The input size to the PARSEC benchmark "
+                             "application")
+    parser.add_argument("num_cpus", type=int, choices=[1,2,8],
+                        help="The number of CPU cores")
+
+    return parser.parse_args()
+
 def writeBenchScript(dir, bench, size, num_cpus):
     """
     This method creates a script in dir which will be eventually
@@ -72,21 +95,19 @@
     return file_name
 
 if __name__ == "__m5_main__":
-    (opts, args) = SimpleOpts.parse_args()
-    kernel, disk, cpu, benchmark, size, num_cpus = args
 
-    if not cpu in ['kvm', 'timing']:
-        m5.fatal("cpu not supported")
+    args = parse_options()
 
     # create the system we are going to simulate
-    system = MyRubySystem(kernel, disk, int(num_cpus), opts)
+    system = MyRubySystem(args.kernel, args.disk, args.num_cpus, args)
 
     # Exit from guest on workbegin/workend
     system.exit_on_work_items = True
 
     # Create and pass a script to the simulated system to run the required
     # benchmark
-    system.readfile = writeBenchScript(m5.options.outdir, benchmark, size, num_cpus)
+    system.readfile = writeBenchScript(m5.options.outdir, args.benchmark,
+                                      args.size, args.num_cpus)
 
     # set up the root SimObject and start the simulation
     root = Root(full_system = True, system = system)
@@ -106,7 +127,7 @@
     globalStart = time.time()
 
     print("Running the simulation")
-    print("Using cpu: {}".format(cpu))
+    print("Using cpu: {}".format(args.cpu))
 
     start_tick = m5.curTick()
     end_tick = m5.curTick()
@@ -126,7 +147,7 @@
         start_tick = m5.curTick()
         start_insts = system.totalInsts()
         # switching to timing cpu if argument cpu == timing
-        if cpu == 'timing':
+        if args.cpu == 'timing':
             system.switchCpus(system.cpu, system.timingCpu)
     else:
         print("Unexpected termination of simulation!")
@@ -159,7 +180,7 @@
         end_insts = system.totalInsts()
         m5.stats.reset()
         # switching to timing cpu if argument cpu == timing
-        if cpu == 'timing':
+        if args.cpu == 'timing':
             # This line is commented due to an unimplemented
             # flush request in MESI_Two_Level that results in
             # the crashing of simulation. There will be a patch
diff --git a/src/parsec/configs-mesi-two-level/system/fs_tools.py b/src/parsec/configs-mesi-two-level/system/fs_tools.py
index 91f6646..9c02722 100755
--- a/src/parsec/configs-mesi-two-level/system/fs_tools.py
+++ b/src/parsec/configs-mesi-two-level/system/fs_tools.py
@@ -27,7 +27,9 @@
 #
 
 from m5.objects import IdeDisk, CowDiskImage, RawDiskImage
+import errno
+import os
+import sys
 
 class CowDisk(IdeDisk):
 
     def __init__(self, filename):
diff --git a/src/parsec/configs-mesi-two-level/system/ruby_system.py b/src/parsec/configs-mesi-two-level/system/ruby_system.py
index b6a647e..e4e9d1f 100755
--- a/src/parsec/configs-mesi-two-level/system/ruby_system.py
+++ b/src/parsec/configs-mesi-two-level/system/ruby_system.py
@@ -29,7 +29,6 @@
 import m5
 import math
 from m5.objects import *
-from m5.util import convert
 from .fs_tools import *
 
 class MyRubySystem(System):
diff --git a/src/parsec/configs/run_parsec.py b/src/parsec/configs/run_parsec.py
index bab584e..5dac66d 100644
--- a/src/parsec/configs/run_parsec.py
+++ b/src/parsec/configs/run_parsec.py
@@ -34,19 +34,42 @@
     https://github.com/darchr/parsec-benchmark.git to create a working
     disk-image for this script.
 """
-import errno
-import os
-import sys
+import argparse
 import time
 import m5
 import m5.ticks
 from m5.objects import *
 
-sys.path.append('gem5/configs/common/') # For the next line...
-import SimpleOpts
-
 from system import *
 
+supported_cpu_types = ["kvm", "timing"]
+benchmark_choices = ["blackscholes", "bodytrack", "canneal", "dedup",
+                     "facesim", "ferret", "fluidanimate", "freqmine",
+                     "raytrace", "streamcluster", "swaptions", "vips", "x264"]
+size_choices = ["simsmall", "simmedium", "simlarge"]
+
+
+def parse_options():
+
+    parser = argparse.ArgumentParser(description='For use with gem5. This '
+                'runs a PARSEC benchmark application. This only works '
+                'with the x86 ISA.')
+
+    parser.add_argument("kernel", type=str,
+                        help="Path to the kernel binary to boot")
+    parser.add_argument("disk", type=str, help="Path to the PARSEC disk image")
+    parser.add_argument("cpu", type=str, choices=supported_cpu_types,
+                        help="The type of CPU to use in the system")
+    parser.add_argument("benchmark", type=str, choices=benchmark_choices,
+                        help="The PARSEC benchmark application to run")
+    parser.add_argument("size", type=str, choices=size_choices,
+                        help="The input size to the PARSEC benchmark "
+                             "application")
+    parser.add_argument("num_cpus", type=int, choices=[1, 2, 8],
+                        help="The number of CPU cores")
+
+    return parser.parse_args()
+
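+# Example invocation (the gem5 binary and resource paths below are
+# placeholders):
+#   <gem5 binary> run_parsec.py <kernel> <disk image> timing blackscholes simsmall 2
+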
 def writeBenchScript(dir, bench, size, num_cpus):
     """
     This method creates a script in dir which will be eventually
@@ -69,22 +92,19 @@
     return file_name
 
 if __name__ == "__m5_main__":
-    (opts, args) = SimpleOpts.parse_args()
-    kernel, disk, cpu, benchmark, size, num_cpus = args
 
-    if not cpu in ['kvm', 'timing']:
-        m5.fatal("cpu not supported")
+    args = parse_options()
 
     # create the system
-    system = MySystem(kernel, disk, cpu, int(num_cpus))
+    system = MySystem(args.kernel, args.disk, args.cpu, args.num_cpus)
 
     # Exit from guest on workbegin/workend
     system.exit_on_work_items = True
 
     # Create and pass a script to the simulated system to run the required
     # benchmark
-    system.readfile = writeBenchScript(m5.options.outdir, benchmark, size,
-                                       num_cpus)
+    system.readfile = writeBenchScript(m5.options.outdir, args.benchmark,
+                                       args.size, args.num_cpus)
 
     # set up the root SimObject and start the simulation
     root = Root(full_system = True, system = system)
@@ -104,7 +124,7 @@
     globalStart = time.time()
 
     print("Running the simulation")
-    print("Using cpu: {}".format(cpu))
+    print("Using cpu: {}".format(args.cpu))
 
     start_tick = m5.curTick()
     end_tick = m5.curTick()
@@ -124,7 +144,7 @@
         start_tick = m5.curTick()
         start_insts = system.totalInsts()
         # switching to timing cpu if argument cpu == timing
-        if cpu == 'timing':
+        if args.cpu == 'timing':
             system.switchCpus(system.cpu, system.detailedCpu)
     else:
         print("Unexpected termination of simulation!")
@@ -157,7 +177,7 @@
         end_insts = system.totalInsts()
         m5.stats.reset()
         # switching to timing cpu if argument cpu == timing
-        if cpu == 'timing':
+        if args.cpu == 'timing':
             system.switchCpus(system.timingCpu, system.cpu)
     else:
         print("Unexpected termination of simulation!")
diff --git a/src/parsec/configs/system/caches.py b/src/parsec/configs/system/caches.py
index 4316aa1..7d60733 100644
--- a/src/parsec/configs/system/caches.py
+++ b/src/parsec/configs/system/caches.py
@@ -27,14 +27,10 @@
 """ Caches with options for a simple gem5 configuration script
 
 This file contains L1 I/D and L2 caches to be used in the simple
-gem5 configuration script. It uses the SimpleOpts wrapper to set up command
-line options from each individual class.
+gem5 configuration script.
 """
 
-import m5
-from m5.objects import Cache, L2XBar, StridePrefetcher, SubSystem
-from m5.params import AddrRange, AllMemory, MemorySize
-from m5.util.convert import toMemorySize
+from m5.objects import Cache, L2XBar, StridePrefetcher
 
 # Some specific options for caches
 # For all options see src/mem/cache/BaseCache.py
diff --git a/src/parsec/configs/system/ruby_system.py b/src/parsec/configs/system/ruby_system.py
index 687e252..3959a71 100644
--- a/src/parsec/configs/system/ruby_system.py
+++ b/src/parsec/configs/system/ruby_system.py
@@ -26,7 +26,6 @@
 
 import m5
 from m5.objects import *
-from m5.util import convert
 from .fs_tools import *
 
 
diff --git a/src/parsec/configs/system/system.py b/src/parsec/configs/system/system.py
index c2360c0..09030c2 100644
--- a/src/parsec/configs/system/system.py
+++ b/src/parsec/configs/system/system.py
@@ -26,7 +26,6 @@
 
 import m5
 from m5.objects import *
-from m5.util import convert
 from .fs_tools import *
 from .caches import *
 
diff --git a/src/riscv-fs/README.md b/src/riscv-fs/README.md
index 08b386e..e5bad0e 100644
--- a/src/riscv-fs/README.md
+++ b/src/riscv-fs/README.md
@@ -17,7 +17,7 @@
 
 The disk image used is based on [busybox](https://busybox.net/) and [UCanLinux](https://github.com/UCanLinux/). It is built mostly following the instructions from [here](https://github.com/UCanLinux/riscv64-sample).
 
-All components are cross compiled on an x86 host using a riscv tool chain.
+**Note:** All components are cross compiled on an x86 host using a riscv tool chain. We used commit `88b004d4c2a7d4e4f08b17ee32d2` of the riscv tool chain source to build the toolchain (riscv gcc version 10.2.0).
 
 We assume the following directory structure while working through the instructions in this README file:
 
@@ -27,8 +27,6 @@
   |
   |___ riscv-disk                              # built disk image will go here
   |
-  |___ device.dts                              # device tree file to use with bbl
-  |
   |___ riscv-gnu-toolchain                     # riscv tool chain for cross compilation
   |
   |___ riscv64-sample                          # UCanLinux source
@@ -38,7 +36,7 @@
   |       |__RootFS                            # root file system for disk image
   |
   |
-  |___ configs-riscv-fs
+  |___ configs
   |      |___ system                           # gem5 system config files
   |      |___ run_riscv.py                     # gem5 run script
   |
@@ -57,6 +55,7 @@
 # clone riscv gnu toolchain source
 git clone https://github.com/riscv/riscv-gnu-toolchain
 cd riscv-gnu-toolchain
+git checkout 88b004d4c2a7d4e4f08b17ee32d2
 
 # change the prefix to your directory
 # of choice for installation of the
@@ -73,7 +72,7 @@
 export PATH=$PATH:/opt/riscv/bin/
 ```
 
-***Note:** The above step is necessary and might cause errors while cross compiling different components for riscv if other methods are used to point to the toolchain.
+**Note:** The above step is necessary; pointing to the toolchain by other means might cause errors while cross compiling the different components for riscv.
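+
+As a quick sanity check (assuming the toolchain built above is on your PATH), the cross compiler should report riscv gcc version 10.2.0:
+
+```sh
+riscv64-unknown-linux-gnu-gcc --version
+```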
 
 ## UCanLinux Source
 
@@ -86,7 +85,7 @@
 git clone https://github.com/UCanLinux/riscv64-sample
 ```
 
-This source contains already built bootloader and disk images as well. Though the given disk image might be usable with gem5, the `bbl` (bootloader image) will not work with gem5 and we need to compile `bbl` with an input device tree (`.dts`) file separately. The following sections provide instructions to build both `bbl` and disk images.
+The following sections provide instructions to build both the `bbl` bootloader and the disk image.
 
 ## Linux Kernel
 
@@ -113,6 +112,7 @@
 ```
 
 This should generate a `vmlinux` image in the `linux` directory.
+A pre-built RISC-V 5.10 Linux kernel can be downloaded [here](http://dist.gem5.org/dist/v21-1/kernels/riscv/static/vmlinux-5.10).
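+
+For example, to fetch the pre-built kernel (assuming `wget` is available):
+
+```sh
+wget http://dist.gem5.org/dist/v21-1/kernels/riscv/static/vmlinux-5.10
+```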
 
 ## Bootloader (bbl)
 
@@ -130,10 +130,9 @@
 
 apt-get install device-tree-compiler
 
-# copy the device tree file from riscv-fs
-cp ../../../device.dts .
+# configure bbl build
+../configure --host=riscv64-unknown-linux-gnu --with-payload=../../linux/vmlinux --prefix=/opt/riscv/
 
-../configure --host=riscv64-unknown-linux-gnu --with-payload=../../linux/vmlinux --prefix=/opt/riscv/ --with-dts=device.dts
 make -j$(nproc)
 
 chmod 755 bbl
@@ -143,6 +142,7 @@
 ```
 
 This will produce a `bbl` bootloader binary with the Linux kernel in the `riscv-pk/build` directory.
+A pre-built copy of this bootloader binary, with the Linux kernel, can be downloaded [here](http://dist.gem5.org/dist/develop/kernels/riscv/static/bootloader-vmlinux-5.10).
 
 ## Busy Box
 
@@ -155,7 +155,7 @@
 cd busybox
 git checkout 1_30_stable  # checkout the latest stable branch
 make menuconfig
-cp ../sample/busybox.config .config  # optional
+cp ../busybox.config .config  # optional
 make menuconfig
 make CROSS_COMPILE=riscv64-unknown-linux-gnu- all -j$(nproc)
 make CROSS_COMPILE=riscv64-unknown-linux-gnu- install
@@ -167,7 +167,7 @@
 
 ```sh
 # going back to riscv64-sample directory
-cd ../..
+cd ../
 
 mkdir RootFS
 cd RootFS
@@ -191,12 +191,16 @@
 
 # build m5 util for riscv and move
 # it to the root file system as well
-cd ../../../
+cd ../../../../
 cd gem5/util/m5
-scons -C util/m5 build/riscv/out/m5
-cp build/riscv/out/m5 ../../../RootFS/sbin/
+scons build/riscv/out/m5
+cp build/riscv/out/m5 ../../../riscv64-sample/RootFS/sbin/
 ```
 
+**Note:** The default cross-compiler is `riscv64-unknown-linux-gnu-`. To change it, set the scons sticky variable `riscv.CROSS_COMPILE`. For example,
+
+```sh
+scons riscv.CROSS_COMPILE=riscv64-linux-gnu- build/riscv/out/m5
+```
+
 ## Disk Image
 
 Create a disk image of 512MB.
@@ -214,16 +218,15 @@
 sudo mkdir /mnt/rootfs
 sudo mount riscv_disk /mnt/rootfs
 
-sudo cp -a RootFS/* /mnt/rootfs
+sudo cp -a riscv64-sample/RootFS/* /mnt/rootfs
 
 sudo chown -R -h root:root /mnt/rootfs/
 df /mnt/rootfs
-# make sure you are in riscv64-sample dir
-cd ../riscv64-sample
 sudo umount /mnt/rootfs
 ```
 
 The disk image `riscv_disk` is ready to use.
+A pre-built, gzipped disk image can be downloaded [here](http://dist.gem5.org/dist/develop/images/riscv/busybox/riscv-disk.img.gz).
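+
+For example, to fetch and decompress it (assuming `wget` and `gunzip` are available):
+
+```sh
+wget http://dist.gem5.org/dist/develop/images/riscv/busybox/riscv-disk.img.gz
+gunzip riscv-disk.img.gz
+```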
 
 **Note:** If you need to resize the disk image once it is created, you can do the following:
 
@@ -240,27 +243,34 @@
 
 ## gem5 Run Scripts
 
-gem5 scripts which can configure a riscv full system and run simulation are available in configs-riscv-fs/.
+gem5 scripts which can configure a riscv full system and run a simulation are available in `configs/`.
 The main script `run_riscv.py` expects the following arguments:
 
-**bbl:** path to the bbl (berkeley bootloader) binary with kernel payload.
+**bbl:** path to the bbl (Berkeley bootloader) binary with the kernel payload (located at `riscv64-sample/riscv-pk/build/bbl`).
 
-**disk:** path to the disk image to use.
+**disk:** path to the disk image to use (located at `riscv64-sample/riscv_disk`).
 
-**cpu_type:** cpu model (`atomic`, `simple`).
+**cpu_type:** cpu model (`atomic` for AtomicSimpleCPU, `simple` for TimingSimpleCPU, `minor` for MinorCPU).
 
 **num_cpus:** number of cpu cores.
 
 An example use of this script is the following:
 
 ```sh
-[gem5 binary] -re configs/run_exit.py [path to bbl] [path to the disk image] atomic 4
+[gem5 binary] configs/run_riscv.py [path to bbl] [path to the disk image] atomic 1
 ```
 
-To interact with the simulated system's console:
+To interact with the simulated system's console, you can use `telnet` (the port number can be found in the `simerr` file),
 
 ```sh
-telnet localhost 3457 (this port number comes from `simerr` file)
+telnet localhost <port>
+```
+
+Another option is to use `m5term`, provided by gem5. To compile and launch `m5term`,
+
+```sh
+cd gem5/util/term
+make                         # compiling
+./m5term localhost <port>    # launching the terminal
 ```
 
 The default Linux system based on this README has both `login` and `password` set to `root`.
diff --git a/src/riscv-fs/configs-riscv-fs/run_riscv.py b/src/riscv-fs/configs/run_riscv.py
similarity index 96%
rename from src/riscv-fs/configs-riscv-fs/run_riscv.py
rename to src/riscv-fs/configs/run_riscv.py
index 1ca8665..3e3b747 100755
--- a/src/riscv-fs/configs-riscv-fs/run_riscv.py
+++ b/src/riscv-fs/configs/run_riscv.py
@@ -26,8 +26,8 @@
 
 """
 This script is supposed to run full system simulation for RISCV targets.
-It has been tested with classic memory system and Atomic
-and TimingSimpleCPU so far.
+It has been tested with the classic memory system and AtomicSimpleCPU,
+TimingSimpleCPU, and MinorCPU so far.
 """
 
 import time
diff --git a/src/riscv-fs/configs-riscv-fs/system/__init__.py b/src/riscv-fs/configs/system/__init__.py
similarity index 100%
rename from src/riscv-fs/configs-riscv-fs/system/__init__.py
rename to src/riscv-fs/configs/system/__init__.py
diff --git a/src/riscv-fs/configs-riscv-fs/system/system.py b/src/riscv-fs/configs/system/system.py
similarity index 73%
rename from src/riscv-fs/configs-riscv-fs/system/system.py
rename to src/riscv-fs/configs/system/system.py
index 0150972..3f545f1 100755
--- a/src/riscv-fs/configs-riscv-fs/system/system.py
+++ b/src/riscv-fs/configs/system/system.py
@@ -23,10 +23,12 @@
 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
 
 import m5
 from m5.objects import *
 from m5.util import convert
+from os import path
 
 '''
 This class creates a bare bones RISCV full system.
@@ -37,6 +39,43 @@
 d3c2-44ea-85fb-acc1df282e21_FU540-C000-v1p3.pdf
 '''
 
+# Dtb generation code from configs/example/riscv/fs_linux.py
+def generateMemNode(state, mem_range):
+    node = FdtNode("memory@%x" % int(mem_range.start))
+    node.append(FdtPropertyStrings("device_type", ["memory"]))
+    node.append(FdtPropertyWords("reg",
+        state.addrCells(mem_range.start) +
+        state.sizeCells(mem_range.size()) ))
+    return node
+
+def generateDtb(system):
+    """
+    Autogenerate a DTB for the given system. The resulting device.dts and
+    device.dtb files are written to the m5 output directory.
+    """
+    state = FdtState(addr_cells=2, size_cells=2, cpu_cells=1)
+    root = FdtNode('/')
+    root.append(state.addrCellsProperty())
+    root.append(state.sizeCellsProperty())
+    root.appendCompatible(["riscv-virtio"])
+
+    for mem_range in system.mem_ranges:
+        root.append(generateMemNode(state, mem_range))
+
+    sections = [*system.cpu, system.platform]
+
+    for section in sections:
+        for node in section.generateDeviceTree(state):
+            if node.get_name() == root.get_name():
+                root.merge(node)
+            else:
+                root.append(node)
+
+    fdt = Fdt()
+    fdt.add_rootnode(root)
+    fdt.writeDtsFile(path.join(m5.options.outdir, 'device.dts'))
+    fdt.writeDtbFile(path.join(m5.options.outdir, 'device.dtb'))
+
 class RiscvSystem(System):
 
     def __init__(self, bbl, disk, cpu_type, num_cpus):
@@ -55,41 +94,24 @@
         # This connects to main memory
         self.membus = SystemXBar(width = 64) # 64-byte width
 
+        # Add a bad addr responder
+        self.membus.badaddr_responder = BadAddr()
+        self.membus.default = self.membus.badaddr_responder.pio
+
         # Set up the system port for functional access from the simulator
         self.system_port = self.membus.cpu_side_ports
 
         # Create the CPUs for our system.
         self.createCPU(cpu_type, num_cpus)
 
-        # using RISCV bare metal as the base full system workload
-        self.workload = RiscvBareMetal()
-
-        # this is user passed berkeley boot loader binary
-        # currently the Linux kernel payload is compiled into this
-        # as well
-        self.workload.bootloader = bbl
-
         # HiFive platform
         # This is based on a HiFive RISCV board and has
         # only a limited number of devices so far i.e.
         # PLIC, CLINT, UART, VirtIOMMIO
         self.platform = HiFive()
 
-        # Next, create and intialize devices
-        # currently supported for RISCV
-
-        # add a disk image
-        self.attachDisk(disk)
-
-        # set up core and platform
-        # level interrupt controllers
-        self.setupIntrCtrl()
-
-        # set up PMA checker
-        self.pmaChecker()
-
-        # attach off and on chip IO
-        self.attachIO(self.membus)
+        # create and initialize devices currently supported for RISCV
+        self.initDevices(self.membus, disk)
 
         # Create the cache hierarchy for the system.
         self.createCacheHierarchy()
@@ -99,6 +121,28 @@
 
         self.setupInterrupts()
 
+        # using RiscvLinux as the base full system workload
+        self.workload = RiscvLinux()
+
+        # this is the user-passed Berkeley bootloader binary;
+        # currently the Linux kernel payload is compiled into it as well
+        self.workload.object_file = bbl
+
+        # Generate DTB (from configs/example/riscv/fs_linux.py)
+        generateDtb(self)
+        self.workload.dtb_filename = path.join(m5.options.outdir, 'device.dtb')
+        # Default DTB address if bbl is built with --with-dts option
+        self.workload.dtb_addr = 0x87e00000
+
+        # Linux boot command flags
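+        # (console on the UART, root filesystem on the VirtIO disk,
+        # mounted read-only)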
+        kernel_cmd = [
+            "console=ttyS0",
+            "root=/dev/vda",
+            "ro"
+        ]
+        self.workload.command_line = " ".join(kernel_cmd)
+
     def createCPU(self, cpu_type, num_cpus):
         if cpu_type == "atomic":
             self.cpu = [AtomicSimpleCPU(cpu_id = i)
@@ -108,6 +152,10 @@
             self.cpu = [TimingSimpleCPU(cpu_id = i)
                         for i in range(num_cpus)]
             self.mem_mode = 'timing'
+        elif cpu_type == "minor":
+            self.cpu = [MinorCPU(cpu_id = i)
+                        for i in range(num_cpus)]
+            self.mem_mode = 'timing'
         else:
             m5.fatal("No CPU type {}".format(cpu_type))
 
@@ -170,20 +218,10 @@
                     port = self.membus.mem_side_ports)
         ]
 
-    def attachIO(self, membus):
+    def initDevices(self, membus, disk):
+
         self.iobus = IOXBar()
 
-        self.bridge = Bridge(delay='50ns')
-        self.bridge.master = self.iobus.slave
-        self.bridge.slave = self.membus.master
-        self.bridge.ranges = self.platform._off_chip_ranges()
-
-        # Connecting on chip and off chip IO to the mem
-        # and IO bus
-        self.platform.attachOnChipIO(self.membus)
-        self.platform.attachOffChipIO(self.iobus)
-
-    def setupIntrCtrl(self):
         # Set the frequency of RTC (real time clock) used by
         # CLINT (core level interrupt controller).
         # This frequency is 1MHz in SiFive's U54MC.
@@ -193,12 +231,17 @@
         # RTC sends the clock signal to CLINT via an interrupt pin.
         self.platform.clint.int_pin = self.platform.rtc.int_pin
 
-        # Attach the PLIC (platform level interrupt controller)
-        # to the platform. This initializes the PLIC with
-        # interrupt sources coming from off chip devices
-        self.platform.attachPlic()
+        # VirtIOMMIO
+        image = CowDiskImage(child=RawDiskImage(read_only=True), read_only=False)
+        image.child.image_file = disk
+        # using reserved memory space
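+        # (interrupt id 0x8 at pio address 0x10008000, matching the
+        # virtio_mmio node in the device tree)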
+        self.platform.disk = MmioVirtIO(
+            vio=VirtIOBlock(image=image),
+            interrupt_id=0x8,
+            pio_size=4096,
+            pio_addr=0x10008000
+        )
 
-    def pmaChecker(self):
         # From riscv/fs_linux.py
         uncacheable_range = [
             *self.platform._on_chip_ranges(),
@@ -211,16 +254,20 @@
         # or MMU-level (system.cpu[0].mmu.pma_checker). It will be resolved
         # by RiscvTLB's Parent.any proxy
 
-        self.pma_checker =  PMAChecker(uncacheable=uncacheable_range)
+        for cpu in self.cpu:
+            cpu.mmu.pma_checker = PMAChecker(uncacheable=uncacheable_range)
 
-    def attachDisk(self, disk):
-        # VirtIOMMIO
-        image = CowDiskImage(child=RawDiskImage(read_only=True), read_only=False)
-        image.child.image_file = disk
-        # using reserved memory space
-        self.platform.disk = MmioVirtIO(
-            vio=VirtIOBlock(image=image),
-            interrupt_id=0x8,
-            pio_size = 4096,
-            pio_addr=0x10008000
-        )
+        self.bridge = Bridge(delay='50ns')
+        self.bridge.mem_side_port = self.iobus.cpu_side_ports
+        self.bridge.cpu_side_port = self.membus.mem_side_ports
+        self.bridge.ranges = self.platform._off_chip_ranges()
+
+        # Connecting on chip and off chip IO to the mem
+        # and IO bus
+        self.platform.attachOnChipIO(self.membus)
+        self.platform.attachOffChipIO(self.iobus)
+
+        # Attach the PLIC (platform level interrupt controller)
+        # to the platform. This initializes the PLIC with
+        # interrupt sources coming from off chip devices
+        self.platform.attachPlic()
diff --git a/src/riscv-fs/device.dts b/src/riscv-fs/device.dts
deleted file mode 100644
index 7181c6c..0000000
--- a/src/riscv-fs/device.dts
+++ /dev/null
@@ -1,80 +0,0 @@
-/dts-v1/;
-
-/ {
-	#address-cells = <0x2>;
-	#size-cells = <0x2>;
-	compatible = "riscv-virtio";
-	model = "riscv-virtio,qemu";
-
-	chosen {
-		bootargs = "root=/dev/vda ro console=ttyS0";
-		stdout-path = "/soc/uart@10000000";
-	};
-
-	memory@80000000 {
-		device_type = "memory";
-		reg = <0x0 0x80000000 0x0 0x8000000>;
-	};
-
-	cpus {
-		#address-cells = <0x1>;
-		#size-cells = <0x0>;
-		timebase-frequency = <0x989680>;
-
-		cpu@0 {
-			phandle = <0x1>;
-			device_type = "cpu";
-			reg = <0x0>;
-			status = "okay";
-			compatible = "riscv";
-			riscv,isa = "rv64imafdcsu";
-			mmu-type = "riscv,sv48";
-
-			interrupt-controller {
-				#interrupt-cells = <0x1>;
-				interrupt-controller;
-				compatible = "riscv,cpu-intc";
-				phandle = <0x2>;
-			};
-		};
-	};
-
-	soc {
-		#address-cells = <0x2>;
-		#size-cells = <0x2>;
-		compatible = "simple-bus";
-		ranges;
-
-		uart@10000000 {
-			interrupts = <0xa>;
-			interrupt-parent = <0x3>;
-			clock-frequency = <0x384000>;
-			reg = <0x0 0x10000000 0x0 0x008>;
-			compatible = "ns8250";
-		};
-
-		plic@c000000 {
-			phandle = <0x3>;
-			riscv,ndev = <0xa>;
-			reg = <0x0 0xc000000 0x0 0x210000>;
-			interrupts-extended = <0x2 0xb 0x2 0x9>;
-			interrupt-controller;
-			compatible = "riscv,plic0";
-			#interrupt-cells = <0x1>;
-			#address-cells = <0x0>;
-		};
-
-		virtio_mmio@10008000 {
-			interrupts = <0x8>;
-			interrupt-parent = <0x3>;
-			reg = <0x0 0x10008000 0x0 0x1000>;
-			compatible = "virtio,mmio";
-		};
-
-		clint@2000000 {
-			interrupts-extended = <0x2 0x3 0x2 0x7>;
-			reg = <0x0 0x2000000 0x0 0x10000>;
-			compatible = "riscv,clint0";
-		};
-	};
-};
diff --git a/src/spec-2006/README.md b/src/spec-2006/README.md
index 15706be..30f3fd5 100644
--- a/src/spec-2006/README.md
+++ b/src/spec-2006/README.md
@@ -85,7 +85,7 @@
 
 `kernel`: required, a positional argument specifying the path to the Linux
 kernel. We have tested using version 4.19.83, which can be downloaded from
-<http://dist.gem5.org/dist/develop/kernels/x86/static/vmlinux-4.19.83>. Info on
+<http://dist.gem5.org/dist/v21-1/kernels/x86/static/vmlinux-4.19.83>. Info on
 building Linux kernels for gem5 can be found in `src/linux-kernel`
 
 `disk`: required, a positional argument specifying the path to the disk image
diff --git a/src/spec-2006/configs/run_spec.py b/src/spec-2006/configs/run_spec.py
index 461e956..6d17bd9 100644
--- a/src/spec-2006/configs/run_spec.py
+++ b/src/spec-2006/configs/run_spec.py
@@ -69,7 +69,6 @@
                   By default, the ports are off.
 """
 import os
-import sys
 
 import m5
 import m5.ticks
diff --git a/src/spec-2006/configs/system/caches.py b/src/spec-2006/configs/system/caches.py
index 9932ecf..3e786b7 100644
--- a/src/spec-2006/configs/system/caches.py
+++ b/src/spec-2006/configs/system/caches.py
@@ -30,14 +30,11 @@
 """ Caches with options for a simple gem5 configuration script
 
 This file contains L1 I/D and L2 caches to be used in the simple
-gem5 configuration script.  It uses the SimpleOpts wrapper to set up command
-line options from each individual class.
+gem5 configuration script.
 """
 
 import m5
-from m5.objects import Cache, L2XBar, StridePrefetcher, SubSystem
-from m5.params import AddrRange, AllMemory, MemorySize
-from m5.util.convert import toMemorySize
+from m5.objects import Cache, L2XBar, StridePrefetcher
 
 # Some specific options for caches
 # For all options see src/mem/cache/BaseCache.py
diff --git a/src/spec-2006/configs/system/ruby_system.py b/src/spec-2006/configs/system/ruby_system.py
index 1ffea57..d1ddb07 100755
--- a/src/spec-2006/configs/system/ruby_system.py
+++ b/src/spec-2006/configs/system/ruby_system.py
@@ -29,7 +29,6 @@
 
 import m5
 from m5.objects import *
-from m5.util import convert
 from .fs_tools import *
 
 
diff --git a/src/spec-2006/configs/system/system.py b/src/spec-2006/configs/system/system.py
index b6e434e..35f0721 100644
--- a/src/spec-2006/configs/system/system.py
+++ b/src/spec-2006/configs/system/system.py
@@ -29,7 +29,6 @@
 
 import m5
 from m5.objects import *
-from m5.util import convert
 from .fs_tools import *
 from .caches import *
 
diff --git a/src/spec-2017/README.md b/src/spec-2017/README.md
index a51ba89..4a355c5 100644
--- a/src/spec-2017/README.md
+++ b/src/spec-2017/README.md
@@ -86,7 +86,7 @@
 
 `kernel`: required, a positional argument specifying the path to the Linux
 kernel. This has been tested with version 4.19.83, available at
-<http://dist.gem5.org/dist/develop/kernels/x86/static/vmlinux-4.19.83>. Info on
+<http://dist.gem5.org/dist/v21-1/kernels/x86/static/vmlinux-4.19.83>. Info on
 building Linux kernels can be found in `src/linux-kernels`.
 
 `disk`: required, a positional argument specifying the path to the disk image
diff --git a/src/spec-2017/configs/run_spec.py b/src/spec-2017/configs/run_spec.py
index f00913c..63f3934 100644
--- a/src/spec-2017/configs/run_spec.py
+++ b/src/spec-2017/configs/run_spec.py
@@ -68,7 +68,6 @@
                   By default, the ports are off.
 """
 import os
-import sys
 
 import m5
 import m5.ticks
diff --git a/src/spec-2017/configs/system/caches.py b/src/spec-2017/configs/system/caches.py
index 9932ecf..84f63e7 100644
--- a/src/spec-2017/configs/system/caches.py
+++ b/src/spec-2017/configs/system/caches.py
@@ -30,15 +30,10 @@
 """ Caches with options for a simple gem5 configuration script
 
 This file contains L1 I/D and L2 caches to be used in the simple
-gem5 configuration script.  It uses the SimpleOpts wrapper to set up command
-line options from each individual class.
+gem5 configuration script.
 """
 
-import m5
-from m5.objects import Cache, L2XBar, StridePrefetcher, SubSystem
-from m5.params import AddrRange, AllMemory, MemorySize
-from m5.util.convert import toMemorySize
+from m5.objects import Cache, L2XBar, StridePrefetcher
 
 # Some specific options for caches
 # For all options see src/mem/cache/BaseCache.py
 
diff --git a/src/spec-2017/configs/system/ruby_system.py b/src/spec-2017/configs/system/ruby_system.py
index 1ffea57..d1ddb07 100755
--- a/src/spec-2017/configs/system/ruby_system.py
+++ b/src/spec-2017/configs/system/ruby_system.py
@@ -29,7 +29,6 @@
 
 import m5
 from m5.objects import *
-from m5.util import convert
 from .fs_tools import *
 
 
diff --git a/src/spec-2017/configs/system/system.py b/src/spec-2017/configs/system/system.py
index a35e570..b07596c 100644
--- a/src/spec-2017/configs/system/system.py
+++ b/src/spec-2017/configs/system/system.py
@@ -29,7 +29,6 @@
 
 import m5
 from m5.objects import *
-from m5.util import convert
 from .fs_tools import *
 from .caches import *