resources: Update DNNMark for ROCm 4 and dGPU support

This patch implements fixes for building DNNMark, building MIOpen
cachefiles, and workarounds for memory management bugs in gem5 that
were a result of the upgrade to ROCm 4.

Previously, DNNMark used the hcc compiler, which is now removed from
ROCm. This patch updates DNNMark's build process to use hipcc instead.

Previously, MIOpen cachefiles were located in a directory that contained
all of the files individually. ROCm 4 instead uses a SQLite database.
This patch removes the old method used for generating cachefiles, and
implements a new method in python that handles creating and inserting
the cachefiles into the database.

This patch also updates the memory management code in DNNMark to use
hipHostMalloc instead of hipMalloc, as hipMemcpys on hipMalloc'd memory
are currently not working in gem5.

This patch also removes the Dockerfile previously used to create an
image with glog and gflags, as they are installed by default in the
current gcn-gpu Docker image.

Change-Id: I3045fbd2adfbf69904732a52f04d963c0cd62482
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5-resources/+/48485
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Reviewed-by: Bobby R. Bruce <bbruce@ucdavis.edu>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Bobby R. Bruce <bbruce@ucdavis.edu>
Tested-by: Bobby R. Bruce <bbruce@ucdavis.edu>
diff --git a/README.md b/README.md
index a5c7a0e..9bd8929 100644
--- a/README.md
+++ b/README.md
@@ -519,30 +519,25 @@
 
 ## Compilation and Running
 
-DNNMark requires additional programs that aren't installed in the standard GCN
-docker image. There is a Dockerfile in `src/gpu/DNNMark` that installs the additional
-software.
-
-To build DNNMark (Including the new docker image):
+To build DNNMark:
 **NOTE**: Due to DNNMark building a library, it's important to mount gem5-resources
 to the same directory within the docker container when building and running, as otherwise the benchmarks
 won't be able to link against the library. The example commands do this by using
 `-v ${PWD}:${PWD}` in the docker run commands
 ```
 cd src/gpu/DNNMark
-docker build -t <image_name> .
-docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID <image_name> ./setup.sh HIP
-docker run --rm -v ${PWD}:${PWD} -w ${PWD}/build -u $UID:$GID <image_name> make
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu ./setup.sh HIP
+docker run --rm -v ${PWD}:${PWD} -w ${PWD}/build -u $UID:$GID gcr.io/gem5-test/gcn-gpu make
 ```
 
 DNNMark uses MIOpen kernels, which are unable to be compiled on-the-fly in gem5.
-We have provided a shell script to generate these kernels for a subset of the
-benchmarks.
+We have provided a Python script to generate these kernels for a subset of the
+benchmarks; by default it targets a gfx801 GPU with 4 CUs.
 
 To generate the MIOpen kernels:
 ```
 cd src/gpu/DNNMark
-docker run --rm -v ${PWD}:${PWD} -v${PWD}/cachefiles:/.cache/miopen/1.7.0 -w ${PWD} <image_name> ./generate_cachefiles.sh
+docker run --rm -v ${PWD}:${PWD} -v${PWD}/cachefiles:/root/.cache/miopen/2.9.0 -w ${PWD} gcr.io/gem5-test/gcn-gpu python3 generate_cachefiles.py cachefiles.csv [--gfx-version={gfx801,gfx803}] [--num-cus=N]
 ```
 
 Due to the large amounts of memory that need to be set up for DNNMark, we have
@@ -560,13 +555,13 @@
 To build GCN3_X86:
 ```
 # Working directory is your gem5 directory
-docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID <image_name> scons -sQ -j$(nproc) build/GCN3_X86/gem5.opt
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu scons -sQ -j$(nproc) build/GCN3_X86/gem5.opt
 ```
 
 To run one of the benchmarks (fwd softmax) in gem5:
 ```
 # Assuming gem5 and gem5-resources are sub-directories of the current directory
-docker run --rm -u $UID:$GID -v ${PWD}:${PWD} -v ${PWD}/gem5-resources/src/gpu/DNNMark/cachefiles:/.cache/miopen/1.7.0 -w ${PWD} <image_name> gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n3 --benchmark-root=gem5-resources/src/gpu/DNNMark/build/benchmarks/test_fwd_softmax -cdnnmark_test_fwd_softmax --options="-config gem5-resources/src/gpu/DNNMark/config_example/softmax_config.dnnmark -mmap gem5-resources/src/gpu/DNNMark/mmap.bin"
+docker run --rm -v ${PWD}:${PWD} -v ${PWD}/gem5-resources/src/gpu/DNNMark/cachefiles:/root/.cache/miopen/2.9.0 -w ${PWD} gcr.io/gem5-test/gcn-gpu gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n3 --benchmark-root=gem5-resources/src/gpu/DNNMark/build/benchmarks/test_fwd_softmax -cdnnmark_test_fwd_softmax --options="-config gem5-resources/src/gpu/DNNMark/config_example/softmax_config.dnnmark -mmap gem5-resources/src/gpu/DNNMark/mmap.bin"
 ```
 
 
diff --git a/src/gpu/DNNMark/CMakeLists.txt b/src/gpu/DNNMark/CMakeLists.txt
index 352c08c..7b1adc7 100644
--- a/src/gpu/DNNMark/CMakeLists.txt
+++ b/src/gpu/DNNMark/CMakeLists.txt
@@ -40,8 +40,8 @@
   endif()
 endif()
 
-# Detect HCC
-find_program(HCC_FOUND hcc)
+# Detect HIPCC
+find_program(HIPCC_FOUND hipcc)
 
 option (double-test "Make data type double" OFF)
 option (enable-cudnnv6 "Enable cuDNN version 6" OFF)
@@ -124,7 +124,7 @@
                         ${GLOG_LIBRARY}
                         m)
 
-elseif(HCC_FOUND AND ${HCC_ENABLE})
+elseif(HIPCC_FOUND AND ${HCC_ENABLE})
 
   # Cover the include and linkage requirement here
   execute_process(COMMAND hcc-config  --cxxflags
@@ -188,9 +188,6 @@
 
   message(${ROCBLAS_LIBRARY} ${MIOPEN_LIBRARY})
 
-  # Find other libraries
-  find_library(HIP_HCC hip_hcc /opt/rocm/hip/lib)
-
   # Find glog libraries
   find_library(GLOG_LIBRARY glog)
 
@@ -200,7 +197,6 @@
   target_link_libraries(${PROJECT_NAME}
                         ${ROCBLAS_LIBRARY}
                         ${MIOPEN_LIBRARY}
-                        ${HIP_HCC}
                         ${GLOG_LIBRARY}
                         m)
   set_target_properties(${PROJECT_NAME} PROPERTIES
diff --git a/src/gpu/DNNMark/Dockerfile b/src/gpu/DNNMark/Dockerfile
deleted file mode 100644
index 5299b26..0000000
--- a/src/gpu/DNNMark/Dockerfile
+++ /dev/null
@@ -1,2 +0,0 @@
-FROM gcr.io/gem5-test/gcn-gpu
-RUN apt-get update && apt-get -y install libgflags-dev libgoogle-glog-dev
diff --git a/src/gpu/DNNMark/README.md b/src/gpu/DNNMark/README.md
index 763dc27..79256bc 100644
--- a/src/gpu/DNNMark/README.md
+++ b/src/gpu/DNNMark/README.md
@@ -20,30 +20,25 @@
 
 ## Compilation and Running
 
-DNNMark requires additional programs that aren't installed in the standard GCN
-docker image. There is a Dockerfile in `src/gpu/DNNMark` that installs the additional
-software.
-
-To build DNNMark (Including the new docker image):
+To build DNNMark:
 **NOTE**: Due to DNNMark building a library, it's important to mount gem5-resources
 to the same directory within the docker container when building and running, as otherwise the benchmarks
 won't be able to link against the library. The example commands do this by using
 `-v ${PWD}:${PWD}` in the docker run commands
 ```
 cd src/gpu/DNNMark
-docker build -t <image_name> .
-docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID <image_name> ./setup.sh HIP
-docker run --rm -v ${PWD}:${PWD} -w ${PWD}/build -u $UID:$GID <image_name> make
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu ./setup.sh HIP
+docker run --rm -v ${PWD}:${PWD} -w ${PWD}/build -u $UID:$GID gcr.io/gem5-test/gcn-gpu make
 ```
 
 DNNMark uses MIOpen kernels, which are unable to be compiled on-the-fly in gem5.
-We have provided a shell script to generate these kernels for a subset of the
-benchmarks.
+We have provided a Python script to generate these kernels for a subset of the
+benchmarks; by default it targets a gfx801 GPU with 4 CUs.
 
 To generate the MIOpen kernels:
 ```
 cd src/gpu/DNNMark
-docker run --rm -v ${PWD}:${PWD} -v${PWD}/cachefiles:/.cache/miopen/1.7.0 -w ${PWD} <image_name> ./generate_cachefiles.sh
+docker run --rm -v ${PWD}:${PWD} -v${PWD}/cachefiles:/root/.cache/miopen/2.9.0 -w ${PWD} gcr.io/gem5-test/gcn-gpu python3 generate_cachefiles.py cachefiles.csv [--gfx-version={gfx801,gfx803}] [--num-cus=N]
 ```
 
 Due to the large amounts of memory that need to be set up for DNNMark, we have
@@ -61,13 +56,13 @@
 To build GCN3_X86:
 ```
 # Working directory is your gem5 directory
-docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID <image_name> scons -sQ -j$(nproc) build/GCN3_X86/gem5.opt
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu scons -sQ -j$(nproc) build/GCN3_X86/gem5.opt
 ```
 
 To run one of the benchmarks (fwd softmax) in gem5:
 ```
 # Assuming gem5 and gem5-resources are sub-directories of the current directory
-docker run --rm -u $UID:$GID -v ${PWD}:${PWD} -v ${PWD}/gem5-resources/src/gpu/DNNMark/cachefiles:/.cache/miopen/1.7.0 -w ${PWD} <image_name> gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n3 --benchmark-root=gem5-resources/src/gpu/DNNMark/build/benchmarks/test_fwd_softmax -cdnnmark_test_fwd_softmax --options="-config gem5-resources/src/gpu/DNNMark/config_example/softmax_config.dnnmark -mmap gem5-resources/src/gpu/DNNMark/mmap.bin"
+docker run --rm -v ${PWD}:${PWD} -v ${PWD}/gem5-resources/src/gpu/DNNMark/cachefiles:/root/.cache/miopen/2.9.0 -w ${PWD} gcr.io/gem5-test/gcn-gpu gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n3 --benchmark-root=gem5-resources/src/gpu/DNNMark/build/benchmarks/test_fwd_softmax -cdnnmark_test_fwd_softmax --options="-config gem5-resources/src/gpu/DNNMark/config_example/softmax_config.dnnmark -mmap gem5-resources/src/gpu/DNNMark/mmap.bin"
 ```
 
 Information from the original DNNMark README included below.
diff --git a/src/gpu/DNNMark/cachefiles.csv b/src/gpu/DNNMark/cachefiles.csv
new file mode 100644
index 0000000..af971ab
--- /dev/null
+++ b/src/gpu/DNNMark/cachefiles.csv
@@ -0,0 +1,11 @@
+MIOpenBatchNormFwdTrainPerAct.cl, -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FPMIX=0 -DMIO_SAVE_MEAN_VARIANCE=1 -DMIO_RUNNING_RESULT=1 -DMIO_BN_N=100 -DMIO_BN_C=1000 -DMIO_BN_HW=1 -DMIO_BN_NHW=100 -DMIO_BN_CHW=1000 -DMIO_BN_LDS_SIZE=256 -DMIO_BN_GRP0=1 -DMIO_BN_GRP1=256 -DMIO_BN_GRP2=1 -DMIO_BN_NCHW=100000 -mcpu=gfx803
+MIOpenBatchNormBwdPerAct.cl, -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FPMIX=0 -DMIO_BN_N=100 -DMIO_BN_C=1000 -DMIO_BN_HW=1 -DMIO_BN_NHW=100 -DMIO_BN_CHW=1000 -DMIO_BN_NCHW=100000 -DMIO_BN_NGRPS=1 -DMIO_BN_GRP0=1 -DMIO_BN_GRP1=64 -DMIO_BN_GRP2=1 -mcpu=gfx803
+MIOpenNeuron.cl, -DLITE -DMIOPEN_READ_UNIT=4 -DMIOPEN_READ_TYPE=_FLOAT4 -DMIOPEN_NRN_OP_ID=0 -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -mcpu=gfx803
+MIOpenSoftmax.cl,-DNUM_BATCH=1 -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DUSE_SOFTMAX_ACCURATE=1 -DUSE_SOFTMAX_MODE_CHANNEL=1 -DRUN_FORWARD=0 -DIS_OUTPUT_PACKED=1 -DIS_DOUTPUT_PACKED=1 -DIS_DINPUT_PACKED=1 -mcpu=gfx803
+MIOpenSoftmax.cl,-DNUM_BATCH=1 -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DUSE_SOFTMAX_ACCURATE=1 -DUSE_SOFTMAX_MODE_CHANNEL=1 -DRUN_FORWARD=1 -DIS_INPUT_PACKED=1 -DIS_OUTPUT_PACKED=1 -mcpu=gfx803
+MIOpenIm2d2Col.cl, -DNUM_CH_PER_WG=1 -DNUM_IM_BLKS_X=1 -DNUM_IM_BLKS=4 -DLOCAL_MEM_SIZE=432 -DSTRIDE_GT_1=0 -DTILE_SZ_X=32 -DTILE_SZ_Y=8 -DUSE_IM_OFF_GUARD=1 -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_INT8=0 -DMIOPEN_USE_INT8x4=0 -DMIOPEN_USE_BFP16=0 -DMIOPEN_USE_INT32=0 -DMIOPEN_USE_RNE_BFLOAT16=1 -mcpu=gfx803
+MIOpenPooling.cl, -DMLO_POOLING_OP_ID=1 -DMLO_POOLING_KERNEL_SZ1=3 -DMLO_POOLING_STRIDE1=2 -DMLO_POOLING_KERNEL_SZ0=3 -DMLO_POOLING_STRIDE0=2 -DMLO_POOLING_N_HORIZ_OUT_PIX=1 -DMLO_POOLING_N_VERT_OUT_PIX=4 -DMLO_POOLING_GROUP_SZ0=16 -DMLO_POOLING_GROUP_SZ1=8 -DMLO_POOLING_INDEX_TYPE=uchar -DMLO_POOLING_INDEX_MAX=UCHAR_MAX -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FP16=0 -mcpu=gfx803
+MIOpenPoolingBwd.cl, -DMLO_POOLING_OP_ID=1 -DMLO_POOLING_KERNEL_SZ1=3 -DMLO_POOLING_STRIDE1=2 -DMLO_POOLING_KERNEL_SZ0=3 -DMLO_POOLING_STRIDE0=2 -DMLO_POOLBWD_N_HORIZ_OUT_PIX=1 -DMLO_POOLBWD_N_VERT_OUT_PIX=8 -DMLO_POOLBWD_GROUP_SZ0=32 -DMLO_POOLBWD_GROUP_SZ1=4 -DMLO_POOLING_INDEX_TYPE=uchar -DMLO_POOLING_INDEX_MAX=UCHAR_MAX -DUSE_IMG_INDEX=0 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FP16=0 -mcpu=gfx803
+MIOpenSubTensorOpWithScalarKernel.cl,-DSUBTENSOR_OP_WITH_SCALAR=SUBTENSOR_OP_WITH_SCALAR_SET -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_INT8=0 -DMIOPEN_USE_INT8x4=0 -DMIOPEN_USE_BFP16=0 -DMIOPEN_USE_INT32=0 -DMIOPEN_USE_RNE_BFLOAT16=1 -DWORK_LENGTH_0=4096 -mcpu=gfx803
+MIOpenCol2Im2d.cl, -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_INT8=0 -DMIOPEN_USE_INT8x4=0 -DMIOPEN_USE_BFP16=0 -DMIOPEN_USE_INT32=0 -DMIOPEN_USE_RNE_BFLOAT16=1 -mcpu=gfx803
+MIOpenPooling.cl, -DMLO_POOLING_OP_ID=1 -DMLO_POOLING_KERNEL_SZ1=3 -DMLO_POOLING_STRIDE1=2 -DMLO_POOLING_KERNEL_SZ0=3 -DMLO_POOLING_STRIDE0=2 -DMLO_POOLING_N_HORIZ_OUT_PIX=1 -DMLO_POOLING_N_VERT_OUT_PIX=8 -DMLO_POOLING_GROUP_SZ0=16 -DMLO_POOLING_GROUP_SZ1=16 -DMLO_POOLING_INDEX_TYPE=uchar -DMLO_POOLING_INDEX_MAX=UCHAR_MAX -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FP16=0 -mcpu=gfx803
diff --git a/src/gpu/DNNMark/core/include/data_manager.h b/src/gpu/DNNMark/core/include/data_manager.h
index 8a4c10a..56c064a 100644
--- a/src/gpu/DNNMark/core/include/data_manager.h
+++ b/src/gpu/DNNMark/core/include/data_manager.h
@@ -46,7 +46,7 @@
     CUDA_CALL(cudaMalloc(&gpu_ptr_, size * sizeof(T)));
 #endif
 #ifdef AMD_MIOPEN
-    HIP_CALL(hipMalloc(&gpu_ptr_, size * sizeof(T)));
+    HIP_CALL(hipHostMalloc(&gpu_ptr_, size * sizeof(T)));
 #endif
   }
   ~Data() {
@@ -56,7 +56,7 @@
       CUDA_CALL(cudaFree(gpu_ptr_));
 #endif
 #ifdef AMD_MIOPEN
-      HIP_CALL(hipFree(gpu_ptr_));
+      HIP_CALL(hipHostFree(gpu_ptr_));
 #endif
     }
   }
diff --git a/src/gpu/DNNMark/core/include/data_png.h b/src/gpu/DNNMark/core/include/data_png.h
index 4a8d4d7..3f15d79 100644
--- a/src/gpu/DNNMark/core/include/data_png.h
+++ b/src/gpu/DNNMark/core/include/data_png.h
@@ -109,8 +109,7 @@
                         (static_cast <float> (RAND_MAX/seed));
     }
 
-    HIP_CALL(hipMemcpy(dev_ptr, host_ptr, size * sizeof(float),
-                       hipMemcpyHostToDevice));
+    memcpy(dev_ptr, host_ptr, size * sizeof(float));
     if (use_mmap) {
         munmap(host_ptr, size*sizeof(float));
     } else {
@@ -146,8 +145,7 @@
                         (static_cast <double> (RAND_MAX/seed));
     }
 
-    HIP_CALL(hipMemcpy(dev_ptr, host_ptr, size * sizeof(double),
-                       hipMemcpyHostToDevice));
+    memcpy(dev_ptr, host_ptr, size * sizeof(double));
 
     if (use_mmap) {
         munmap(host_ptr, size*sizeof(double));
diff --git a/src/gpu/DNNMark/generate_cachefiles.py b/src/gpu/DNNMark/generate_cachefiles.py
new file mode 100755
index 0000000..dc151d0
--- /dev/null
+++ b/src/gpu/DNNMark/generate_cachefiles.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python3
+"""Compile MIOpen kernels and store them in a MIOpen kernel database.
+
+Reads a CSV file whose rows are ``kernel filename, compiler args``, compiles
+each kernel with clang-ocl, and inserts the bz2-compressed object file into
+the ``kern_db`` table of an SQLite database under /root/.cache/miopen/2.9.0/.
+"""
+
+import argparse
+import bz2
+import csv
+import hashlib
+import os
+import shlex
+import sqlite3
+import subprocess
+import tempfile
+from pathlib import Path
+
+
+def parseArgs():
+    """Parse the command line: CSV path, number of CUs, and gfx version."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('csv_file', type=str,
+                        help='File containing cache files to compile '
+                             'in the format of: filename, args')
+    parser.add_argument('--num-cus', default=4, type=int,
+                        help='Number of CUs in simulated GPU')
+    parser.add_argument('--gfx-version', default='gfx801',
+                        choices=['gfx801', 'gfx803'],
+                        help='gfx version of simulated GPU')
+
+    return parser.parse_args()
+
+
+def getDb(options):
+    """Open the kernel database for the selected GPU, creating it if needed.
+
+    The file is named ``<gfx_version>_<num_cus>.ukdb``. The ``kern_db`` table
+    and its unique (kernel_name, kernel_args) index are created when missing.
+    Returns the open sqlite3 connection; the caller commits and closes it.
+    """
+    db_name = f'{options.gfx_version}_{options.num_cus}.ukdb'
+    db_path = '/root/.cache/miopen/2.9.0/'
+
+    # sqlite3.connect() creates the database file but not its directory,
+    # so make sure the directory is there before connecting
+    os.makedirs(db_path, exist_ok=True)
+    con = sqlite3.connect(os.path.join(db_path, db_name))
+
+    cur = con.cursor()
+
+    # Ripped from src/include/miopen/kern_db.hpp
+    cur.execute('''CREATE TABLE IF NOT EXISTS kern_db (
+                        id INTEGER PRIMARY KEY ASC,
+                        kernel_name TEXT NOT NULL,
+                        kernel_args TEXT NOT NULL,
+                        kernel_blob BLOB NOT NULL,
+                        kernel_hash TEXT NOT NULL,
+                        uncompressed_size INT NOT NULL);''')
+    cur.execute('''CREATE UNIQUE INDEX IF NOT EXISTS
+                    idx_kern_db ON kern_db (kernel_name, kernel_args);''')
+
+    return con
+
+
+def insertFiles(con, options):
+    """Compile each kernel listed in the CSV file and insert it into ``con``.
+
+    Everything from ``-mcpu`` onwards in a row's args is replaced by
+    ``-mcpu=<gfx_version>``. A kernel whose (kernel_name, kernel_args) pair
+    is already stored is left untouched (INSERT OR IGNORE). Nothing is
+    committed here. Raises subprocess.CalledProcessError when clang-ocl
+    fails.
+    """
+    miopen_kern_path = '/MIOpen/src/kernels'
+
+    extra_args = {'gfx801': '-Wno-everything -Xclang '
+                            '-target-feature -Xclang +code-object-v3',
+                  'gfx803': '-Wno-everything -Xclang '
+                            '-target-feature -Xclang +code-object-v3'}
+
+    cur = con.cursor()
+    with tempfile.TemporaryDirectory() as tmpdir:
+        with open(options.csv_file, newline='') as csvfile:
+            for row in csv.reader(csvfile):
+                # Blank lines come back as empty rows; skip them
+                if not row:
+                    continue
+
+                miopen_kern = row[0]
+                miopen_kern_full = os.path.join(miopen_kern_path, miopen_kern)
+                # We want to manually add the gfx version
+                # Additionally, everything after the gfx version isn't
+                # used in the database
+                # Explicitly add the leading space because that's used
+                # in the database
+                args = (f' {row[1].split("-mcpu")[0].strip()} '
+                        f'-mcpu={options.gfx_version}')
+
+                # Hash to generate unique output files
+                file_hash = hashlib.md5(args.encode('utf-8')).hexdigest()
+                outfile = f'{miopen_kern}-{file_hash}.o'
+                full_outfile = os.path.join(tmpdir, outfile)
+
+                # Compile the kernel. Only the flag strings are split;
+                # the paths are passed as single arguments so they survive
+                # spaces or quotes
+                cmd_args = ['/opt/rocm/bin/clang-ocl',
+                            *shlex.split(args),
+                            *shlex.split(extra_args[options.gfx_version]),
+                            miopen_kern_full, '-o', full_outfile]
+                subprocess.run(cmd_args, check=True)
+
+                # Get other params needed for db; read_bytes() closes the
+                # file again instead of leaking the handle
+                uncompressed_file = Path(full_outfile).read_bytes()
+                uncompressed_size = len(uncompressed_file)
+                uncompressed_hash = hashlib.md5(uncompressed_file).hexdigest()
+                compressed_blob = bz2.compress(uncompressed_file)
+
+                cur.execute('''INSERT OR IGNORE INTO kern_db
+                               (kernel_name, kernel_args, kernel_blob, kernel_hash, uncompressed_size)
+                               VALUES(?, ?, ?, ?, ?)''',
+                            (f'{miopen_kern}.o', args, compressed_blob,
+                                uncompressed_hash, uncompressed_size))
+
+
+def main():
+    """Generate the kernel database described by the command line."""
+    args = parseArgs()
+
+    con = getDb(args)
+    try:
+        insertFiles(con, args)
+        con.commit()
+    finally:
+        # Close the database even when a kernel fails to compile
+        con.close()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/gpu/DNNMark/generate_cachefiles.sh b/src/gpu/DNNMark/generate_cachefiles.sh
deleted file mode 100755
index 1f2a2e3..0000000
--- a/src/gpu/DNNMark/generate_cachefiles.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-
-cd /MIOpen/src/kernels
-
-# test_fwd_softmax/test_bwd_softmax
-mkdir -p /.cache/miopen/1.7.0/5c3130f7e6d7b29bb65a02f5de0084a6
-/opt/rocm/bin/clang-ocl  -DNUM_BATCH=1 -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -mcpu=gfx801 -Wno-everything MIOpenSoftmax.cl -o /.cache/miopen/1.7.0/5c3130f7e6d7b29bb65a02f5de0084a6/MIOpenSoftmax.cl.o
-
-# test_fwd_bn
-mkdir -p /.cache/miopen/1.7.0/f8850ed3a540a1e8eb258b582f554d57
-/opt/rocm/bin/clang-ocl  -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FPMIX=0 -DMIO_SAVE_MEAN_VARIANCE=1 -DMIO_RUNNING_RESULT=1 -DMIO_BN_N=100 -DMIO_BN_C=1000 -DMIO_BN_HW=1 -DMIO_BN_NHW=100 -DMIO_BN_CHW=1000 -DMIO_BN_LDS_SIZE=256 -DMIO_BN_GRP0=1 -DMIO_BN_GRP1=256 -DMIO_BN_GRP2=1 -DMIO_BN_NCHW=100000 -mcpu=gfx801 -Wno-everything MIOpenBatchNormFwdTrainPerAct.cl -o  /.cache/miopen/1.7.0/f8850ed3a540a1e8eb258b582f554d57/MIOpenBatchNormFwdTrainPerAct.cl.o
-
-# test_bwd_bn
-mkdir -p /.cache/miopen/1.7.0/2d295f7887fee4bec3c01ac73f8a25cd
-/opt/rocm/bin/clang-ocl  -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FPMIX=0 -DMIO_BN_N=100 -DMIO_BN_C=1000 -DMIO_BN_HW=1 -DMIO_BN_NHW=100 -DMIO_BN_CHW=1000 -DMIO_BN_NCHW=100000 -DMIO_BN_NGRPS=1 -DMIO_BN_GRP0=1 -DMIO_BN_GRP1=64 -DMIO_BN_GRP2=1 -mcpu=gfx801 -Wno-everything MIOpenBatchNormBwdPerAct.cl -o /.cache/miopen/1.7.0/2d295f7887fee4bec3c01ac73f8a25cd/MIOpenBatchNormBwdPerAct.cl.o
-
-# test_fwd_bypass/test_bwd_bypass
-mkdir -p /.cache/miopen/1.7.0/e213d754468ef6732bb836ed186f5783
-/opt/rocm/bin/clang-ocl  -DLITE -DMIOPEN_READ_UNIT=4 -DMIOPEN_READ_TYPE=_FLOAT4 -DMIOPEN_NRN_OP_ID=0 -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -mcpu=gfx801 -Wno-everything MIOpenNeuron.cl -o /.cache/miopen/1.7.0/e213d754468ef6732bb836ed186f5783/MIOpenNeuron.cl.o
-
-# test_fwd_composed_model/test_bwd_composed_model
-mkdir -p /.cache/miopen/1.7.0/86de626b159aea830f0ba2f1788e0f40
-/opt/rocm/bin/clang-ocl  -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -mcpu=gfx801 -Wno-everything MIOpenUtilKernels2.cl -o /.cache/miopen/1.7.0/86de626b159aea830f0ba2f1788e0f40/MIOpenUtilKernels2.cl.o
-
-mkdir -p /.cache/miopen/1.7.0/ac0046008721a79b06896f9a5a3ca2cc
-/opt/rocm/bin/clang-ocl  -DSUBTENSOR_OP_WITH_SCALAR=SUBTENSOR_OP_WITH_SCALAR_SET -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DWORK_LENGTH_0=4096 -mcpu=gfx801 -Wno-everything MIOpenSubTensorOpWithScalarKernel.cl -o /.cache/miopen/1.7.0/ac0046008721a79b06896f9a5a3ca2cc/MIOpenSubTensorOpWithScalarKernel.cl.o
-
-mkdir -p /.cache/miopen/1.7.0/7a58553f312474aa3cf449e5d9969a51
-/opt/rocm/bin/clang-ocl  -DMLO_POOLING_OP_ID=1 -DMLO_POOLING_KERNEL_SZ1=3 -DMLO_POOLING_PAD1=0 -DMLO_POOLING_STRIDE1=2 -DMLO_POOLING_KERNEL_SZ0=3 -DMLO_POOLING_PAD0=0 -DMLO_POOLING_STRIDE0=2 -DMLO_POOLING_N_OUTPUTS=32 -DMLO_POOLING_N_CHANNELS=32 -DMLO_POOLING_N_HORIZ_OUT_PIX=4 -DMLO_POOLING_N_VERT_OUT_PIX=4 -DMLO_POOLING_GROUP_SZ0=8 -DMLO_POOLING_GROUP_SZ1=8 -DMLO_POOLING_BOT_BATCH_STRIDE=32768 -DMLO_POOLING_BOT_CHANNEL_STRIDE=1024 -DMLO_POOLING_BOT_STRIDE=32 -DMLO_POOLING_TOP_BATCH_STRIDE=8192 -DMLO_POOLING_TOP_CHANNEL_STRIDE=256 -DMLO_POOLING_TOP_STRIDE=16 -DMLO_POOLING_BOT_WIDTH=32 -DMLO_POOLING_BOT_HEIGHT=32 -DMLO_POOLING_TOP_WIDTH=16 -DMLO_POOLING_TOP_HEIGHT=16 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FP16=0 -mcpu=gfx801 -Wno-everything MIOpenPooling.cl -o /.cache/miopen/1.7.0/7a58553f312474aa3cf449e5d9969a51/MIOpenPooling.cl.o
-
-mkdir -p /.cache/miopen/1.7.0/f0ed53d85baef9414aa97b0b15b78a4d
-/opt/rocm/bin/clang-ocl  -DMLO_POOLING_KERNEL_SZ1=3 -DMLO_POOLING_PAD1=0 -DMLO_POOLING_STRIDE1=2 -DMLO_POOLING_KERNEL_SZ0=3 -DMLO_POOLING_PAD0=0 -DMLO_POOLING_STRIDE0=2 -DMLO_POOLING_N_OUTPUTS=32 -DMLO_POOLBWD_N_HORIZ_OUT_PIX=2 -DMLO_POOLBWD_N_VERT_OUT_PIX=2 -DMLO_POOLBWD_GROUP_SZ0=8 -DMLO_POOLBWD_GROUP_SZ1=8 -DMLO_POOLBWD_BOT_WIDTH=32 -DMLO_POOLBWD_BOT_HEIGHT=32 -DMLO_POOLBWD_TOP_WIDTH=16 -DMLO_POOLBWD_TOP_HEIGHT=16 -DMLO_POOLBWD_BOTDF_BATCH_STRIDE=32768 -DMLO_POOLBWD_BOTDF_CHANNEL_STRIDE=1024 -DMLO_POOLBWD_BOTDF_STRIDE=32 -DMLO_POOLBWD_TOPDF_BATCH_STRIDE=8192 -DMLO_POOLBWD_TOPDF_CHANNEL_STRIDE=256 -DMLO_POOLBWD_TOPDF_STRIDE=16 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FP16=0 -mcpu=gfx801 -Wno-everything MIOpenPoolingBwd.cl -o /.cache/miopen/1.7.0/f0ed53d85baef9414aa97b0b15b78a4d/MIOpenPoolingBwd.cl.o
-
-mkdir -p /.cache/miopen/1.7.0/ae3e81b4f4b5968e01343ac25026c938
-/opt/rocm/bin/clang-ocl  -DNUM_CH_PER_WG=1 -DNUM_IM_BLKS_X=1 -DNUM_IM_BLKS=4 -DLOCAL_MEM_SIZE=432 -DSTRIDE_GT_1=0 -DTILE_SZ_X=32 -DTILE_SZ_Y=8 -DUSE_IM_OFF_GUARD=1 -DMIOPEN_USE_FP32=1 -mcpu=gfx801 -Wno-everything MIOpenUtilKernels.cl -o /.cache/miopen/1.7.0/ae3e81b4f4b5968e01343ac25026c938/MIOpenUtilKernels.cl.o
-
-# test_fwd_pool
-mkdir -p /.cache/miopen/1.7.0/7cda2f346ecf0e84b50181f05e75480b
-/opt/rocm/bin/clang-ocl  -DMLO_POOLING_OP_ID=1 -DMLO_POOLING_KERNEL_SZ1=3 -DMLO_POOLING_PAD1=0 -DMLO_POOLING_STRIDE1=2 -DMLO_POOLING_KERNEL_SZ0=3 -DMLO_POOLING_PAD0=0 -DMLO_POOLING_STRIDE0=2 -DMLO_POOLING_N_OUTPUTS=3 -DMLO_POOLING_N_CHANNELS=3 -DMLO_POOLING_N_HORIZ_OUT_PIX=4 -DMLO_POOLING_N_VERT_OUT_PIX=4 -DMLO_POOLING_GROUP_SZ0=8 -DMLO_POOLING_GROUP_SZ1=8 -DMLO_POOLING_BOT_BATCH_STRIDE=196608 -DMLO_POOLING_BOT_CHANNEL_STRIDE=65536 -DMLO_POOLING_BOT_STRIDE=256 -DMLO_POOLING_TOP_BATCH_STRIDE=49152 -DMLO_POOLING_TOP_CHANNEL_STRIDE=16384 -DMLO_POOLING_TOP_STRIDE=128 -DMLO_POOLING_BOT_WIDTH=256 -DMLO_POOLING_BOT_HEIGHT=256 -DMLO_POOLING_TOP_WIDTH=128 -DMLO_POOLING_TOP_HEIGHT=128 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FP16=0 -mcpu=gfx801 -Wno-everything MIOpenPooling.cl -o /.cache/miopen/1.7.0/7cda2f346ecf0e84b50181f05e75480b/MIOpenPooling.cl.o
-
-# test_bwd_pool
-mkdir -p /.cache/miopen/1.7.0/e33e7c33bfa58bc339c2c0ed6e8d29ad
-/opt/rocm/bin/clang-ocl  -DMLO_POOLING_KERNEL_SZ1=3 -DMLO_POOLING_PAD1=0 -DMLO_POOLING_STRIDE1=2 -DMLO_POOLING_KERNEL_SZ0=3 -DMLO_POOLING_PAD0=0 -DMLO_POOLING_STRIDE0=2 -DMLO_POOLING_N_OUTPUTS=3 -DMLO_POOLBWD_N_HORIZ_OUT_PIX=2 -DMLO_POOLBWD_N_VERT_OUT_PIX=2 -DMLO_POOLBWD_GROUP_SZ0=8 -DMLO_POOLBWD_GROUP_SZ1=8 -DMLO_POOLBWD_BOT_WIDTH=256 -DMLO_POOLBWD_BOT_HEIGHT=256 -DMLO_POOLBWD_TOP_WIDTH=128 -DMLO_POOLBWD_TOP_HEIGHT=128 -DMLO_POOLBWD_BOTDF_BATCH_STRIDE=196608 -DMLO_POOLBWD_BOTDF_CHANNEL_STRIDE=65536 -DMLO_POOLBWD_BOTDF_STRIDE=256 -DMLO_POOLBWD_TOPDF_BATCH_STRIDE=49152 -DMLO_POOLBWD_TOPDF_CHANNEL_STRIDE=16384 -DMLO_POOLBWD_TOPDF_STRIDE=128 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FP16=0 -mcpu=gfx801 -Wno-everything MIOpenPoolingBwd.cl -o /.cache/miopen/1.7.0/e33e7c33bfa58bc339c2c0ed6e8d29ad/MIOpenPoolingBwd.cl.o
diff --git a/src/gpu/DNNMark/setup.sh b/src/gpu/DNNMark/setup.sh
index 30baf95..ebd2afc 100755
--- a/src/gpu/DNNMark/setup.sh
+++ b/src/gpu/DNNMark/setup.sh
@@ -23,9 +23,10 @@
 then
   MIOPEN_PATH=/opt/rocm/miopen
   ROCBLAS_PATH=/opt/rocm/rocblas
-  CXX=/opt/rocm/hcc/bin/hcc cmake \
+  CXX=/opt/rocm/bin/hipcc cmake \
     -DHCC_ENABLE=ON \
     -DMIOPEN_ROOT=${MIOPEN_PATH} \
     -DROCBLAS_ROOT=${ROCBLAS_PATH} \
+    -DCMAKE_PREFIX_PATH="/opt/rocm;/opt/rocm/lib/cmake/AMDDeviceLibs/;/opt/rocm/lib/cmake/amd_comgr/" \
     ..
 fi