resources: Add DNNMark benchmark

This commit adds DNNMark, a benchmark framework used for measuring the
performance of various deep neural network primitives on GPUs.

Change-Id: I39bd57f4fe674aab3a52e487756f8f3205a8a93c
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5-resources/+/40197
Reviewed-by: Bobby R. Bruce <bbruce@ucdavis.edu>
Maintainer: Bobby R. Bruce <bbruce@ucdavis.edu>
Tested-by: Bobby R. Bruce <bbruce@ucdavis.edu>
diff --git a/README.md b/README.md
index 48434eb..283f745 100644
--- a/README.md
+++ b/README.md
@@ -569,6 +569,63 @@
 
 <http://dist.gem5.org/dist/develop/test-progs/halo-finder/ForceTreeTest>
 
+# Resource: DNNMark
+
+[DNNMark](https://github.com/shidong-ai/DNNMark) is a benchmark framework used
+to characterize the performance of deep neural network (DNN) primitive workloads.
+
+## Compilation and Running
+
+DNNMark requires additional packages that aren't installed in the standard GCN
+docker image. There is a Dockerfile in `src/DNNMark` that installs the additional
+software.
+
+To build DNNMark (including the new docker image):
+
+**NOTE**: Because DNNMark builds a library, it is important to mount gem5-resources
+at the same path inside the docker container when building and running; otherwise the
+benchmarks won't be able to link against the library. The example commands do this by
+using `-v ${PWD}:${PWD}` in the docker run commands.
+```
+cd src/DNNMark
+docker build -t <image_name> .
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID <image_name> ./setup.sh HIP
+docker run --rm -v ${PWD}:${PWD} -w ${PWD}/build -u $UID:$GID <image_name> make
+```
+
+DNNMark uses MIOpen kernels, which cannot be compiled on the fly in gem5.
+We provide a shell script that generates these kernels for a subset of the
+benchmarks.
+
+To generate the MIOpen kernels:
+```
+cd src/DNNMark
+docker run --rm -v ${PWD}:${PWD} -v${PWD}/cachefiles:/.cache/miopen/1.7.0 -w ${PWD} <image_name> ./generate_cachefiles.sh
+```
+
+Because DNNMark needs to set up large amounts of memory, we have added the
+ability to mmap a file to reduce setup time, as well as a program that can
+generate a 2GB file of random floats.
+
+To make the MMAP file:
+```
+cd src/DNNMark
+g++ -std=c++0x generate_rand_data.cpp -o generate_rand_data
+./generate_rand_data
+```
+
+DNNMark is a GPU application, which requires gem5 to be built with the GCN3_X86 architecture.
+To build GCN3_X86:
+```
+# Working directory is your gem5 directory
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID <image_name> scons -sQ -j$(nproc) build/GCN3_X86/gem5.opt
+```
+
+To run one of the benchmarks (fwd softmax) in gem5:
+```
+# Assuming gem5 and gem5-resources are sub-directories of the current directory
+docker run --rm -u $UID:$GID -v ${PWD}:${PWD} -v ${PWD}/gem5-resources/src/DNNMark/cachefiles:/.cache/miopen/1.7.0 -w ${PWD} <image_name> gem5/build/GCN3_X86/gem5.opt gem5/configs/example/apu_se.py -n2 --benchmark-root=gem5-resources/src/DNNMark/build/benchmarks/test_fwd_softmax -cdnnmark_test_fwd_softmax --options="-config gem5-resources/src/DNNMark/config_example/softmax_config.dnnmark -mmap gem5-resources/src/DNNMark/mmap.bin"
+```
+
 # Resource: SPEC 2006
 
 The [Standard Performance Evaluation Corporation](
@@ -707,6 +764,7 @@
 * **lulesh**: Consult the copyright notice in `src/lulesh/src/lulesh.hip.cc`
 * **halo-finder**: halo-finder is a subcomponent of HACC, which is licensed under
 a BSD license.
+* **DNNMark**: DNNMark is licensed under the MIT license; see `src/DNNMark/LICENSE`.
 * **spec 2006**: SPEC CPU 2006 requires purchase of benchmark suite from
 [SPEC](https://www.spec.org/cpu2006/) thus, it cannot be freely distributed.
 Consult individual copyright notices of source files in `src/spec-2006`.
diff --git a/src/DNNMark/.gitignore b/src/DNNMark/.gitignore
new file mode 100644
index 0000000..1e9234b
--- /dev/null
+++ b/src/DNNMark/.gitignore
@@ -0,0 +1,23 @@
+####################
+# Temp files #
+####################
+*.swp
+.project
+.cproject
+.settings
+.*
+!.gitignore
+*_tmp.*
+*.pyc
+*.nvvp
+####################
+# Unfavorable dir #
+####################
+libs/
+build/
+logs/
+
+# Others
+generate_rand_data
+*.bin
+cachefiles
diff --git a/src/DNNMark/CMakeLists.txt b/src/DNNMark/CMakeLists.txt
new file mode 100644
index 0000000..352c08c
--- /dev/null
+++ b/src/DNNMark/CMakeLists.txt
@@ -0,0 +1,227 @@
+# CMake version related
+cmake_minimum_required(VERSION 2.8.9)
+if (${CMAKE_VERSION} VERSION_GREATER 3.0)
+  cmake_policy(SET CMP0046 OLD)
+endif()
+
+# Set project
+project(dnnmark)
+
+# Set path of sub-directory with other cmake file
+set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake)
+
+# Set device-selection options
+option (CUDA_ENABLE "Only run benchmark on Nvidia device" OFF)
+option (HCC_ENABLE "Only run benchmark on AMD device" OFF)
+
+# Locate CUDA package
+find_package(CUDA)
+
+# Detect GPU device
+if(CUDA_FOUND AND ${CUDA_ENABLE})
+  try_run(RUN_RESULT_VAR COMPILE_RESULT_VAR
+    ${CMAKE_BINARY_DIR}
+    ${CMAKE_SOURCE_DIR}/tools/has_cuda_gpu.c
+    CMAKE_FLAGS
+      -DINCLUDE_DIRECTORIES:STRING=${CUDA_TOOLKIT_INCLUDE}
+      -DLINK_LIBRARIES:STRING=${CUDA_CUDART_LIBRARY}
+    COMPILE_OUTPUT_VARIABLE COMPILE_OUTPUT_VAR
+    RUN_OUTPUT_VARIABLE RUN_OUTPUT_VAR)
+
+  # Display number of GPUs found
+  message("${RUN_OUTPUT_VAR}")
+
+  # COMPILE_RESULT_VAR is TRUE when compile succeeds
+  # RUN_RESULT_VAR is zero when a GPU is found
+  if(COMPILE_RESULT_VAR AND NOT RUN_RESULT_VAR)
+    set(CUDA_HAVE_GPU TRUE)
+  else()
+    set(CUDA_HAVE_GPU FALSE)
+  endif()
+endif()
+
+# Detect HCC
+find_program(HCC_FOUND hcc)
+
+option (double-test "Make data type double" OFF)
+option (enable-cudnnv6 "Enable cuDNN version 6" OFF)
+
+# Build the DNNMark library only with CUDA
+if (CUDA_FOUND AND ${CUDA_ENABLE})
+
+  # Print message to indicate whether a GPU device exists
+  if (CUDA_HAVE_GPU)
+    message(STATUS "GPU device exists")
+    add_definitions(-DNVIDIA_CUDNN)
+  else()
+    message(WARNING "GPU device does not exist")
+  endif()
+
+  # Include the built-in FindCUDA cmake scripts
+  include(FindCUDA)
+
+  # Stop if CUDNN_ROOT is not defined
+  if (NOT DEFINED CUDNN_ROOT)
+    message(FATAL_ERROR "Path to CuDNN is not defined")
+  endif()
+
+  if (EXISTS ${CUDNN_ROOT})
+    message(STATUS "CuDNN library root path: " ${CUDNN_ROOT})
+    include_directories(${CUDNN_ROOT}/include)
+  else()
+    message(FATAL_ERROR "Path to CuDNN library not exists")
+  endif()
+
+  # Enable double data type
+  if (double-test)
+    add_definitions(-DDOUBLE_TEST)
+  endif()
+
+  # Add definition of C++11 standard
+  add_definitions(-std=c++11)
+
+  # Enable cuDNN v6
+  if (enable-cudnnv6)
+    add_definitions(-DCUDNNV6)
+  endif()
+
+  # Set path of DNNMark include files
+  set(DNNMARK_INCLUDES ${CMAKE_SOURCE_DIR}/core/include)
+  include_directories(${DNNMARK_INCLUDES})
+
+  # Set path of DNNMark layer files
+  set(DNNMARK_LAYERS ${DNNMARK_INCLUDES}/layers)
+  include_directories(${DNNMARK_LAYERS})
+
+  # Set source files
+  file(GLOB_RECURSE DNNMARK_SOURCES RELATIVE ${CMAKE_SOURCE_DIR} core/src/*.cc)
+  message(STATUS "DNNMark Source files: " ${DNNMARK_SOURCES})
+
+  # Set NVCC flags
+  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -O3 -gencode arch=compute_50,code=sm_50)
+
+  # Find cuda libraries
+  find_library(CUDNN_LIBRARY cudnn ${CUDNN_ROOT}/lib64)
+  find_library(CUDA_BLAS_LIBRARY cublas HINTS ${CUDA_TOOLKIT_ROOT_DIR}/lib64)
+  find_library(CUDA_RAND_LIBRARY curand HINTS ${CUDA_TOOLKIT_ROOT_DIR}/lib64)
+
+  # Find glog library
+  find_library(GLOG_LIBRARY glog)
+
+  # Add the DNNMark library together with CUDA
+  include_directories(${CUDA_INCLUDE_DIR})
+  cuda_add_library(${PROJECT_NAME} SHARED ${DNNMARK_SOURCES})
+  add_dependencies(${PROJECT_NAME} ${CUDNN_LIBRARY})
+  add_dependencies(${PROJECT_NAME} ${CUDA_BLAS_LIBRARY})
+  add_dependencies(${PROJECT_NAME} ${CUDA_RAND_LIBRARY})
+  add_dependencies(${PROJECT_NAME} ${CUDA_LIBRARIES})
+  add_dependencies(${PROJECT_NAME} ${GLOG_LIBRARY})
+  target_link_libraries(${PROJECT_NAME}
+                        ${CUDNN_LIBRARY}
+                        ${CUDA_BLAS_LIBRARY}
+                        ${CUDA_RAND_LIBRARY}
+                        ${CUDA_LIBRARIES}
+                        ${GLOG_LIBRARY}
+                        m)
+
+elseif(HCC_FOUND AND ${HCC_ENABLE})
+
+  # Obtain the compile and link flags from hcc-config
+  execute_process(COMMAND hcc-config  --cxxflags
+                  OUTPUT_VARIABLE HCC_COMPILE_FLAGS)
+  set(COMPILER_FLAGS "${COMPILER_FLAGS} ${HCC_COMPILE_FLAGS}")
+
+  execute_process(COMMAND hcc-config  --ldflags
+                  OUTPUT_VARIABLE HCC_LINKER_FLAGS)
+  set(LINKER_FLAGS "${LINKER_FLAGS} ${HCC_LINKER_FLAGS}")
+
+  set(ROCM_ROOT /opt/rocm)
+  if (EXISTS ${MIOPEN_ROOT})
+    message(STATUS "MIOpen root path: " ${MIOPEN_ROOT})
+    include_directories(${MIOPEN_ROOT}/include)
+    include_directories(${MIOPEN_ROOT}/src/include)
+    include_directories(${MIOPEN_ROOT}/build/include)
+    include_directories(${ROCM_ROOT}/hip/include)
+  else()
+    message(FATAL_ERROR "Path to MIOpen library not exists")
+  endif()
+
+  if (EXISTS ${ROCBLAS_ROOT})
+    message(STATUS "ROCBLAS root path: " ${ROCBLAS_ROOT})
+    include_directories(${ROCBLAS_ROOT}/include)
+  else()
+    message(FATAL_ERROR "Path to ROCBLAS library not exists")
+  endif()
+
+  # Locate HIP for this project
+  list(APPEND CMAKE_PREFIX_PATH ${ROCM_ROOT}/hcc ${ROCM_ROOT}/hip)
+  find_package(hip REQUIRED PATHS ${ROCM_ROOT})
+
+  # Add a guard for AMD MIOpen
+  add_definitions(-DAMD_MIOPEN)
+
+  # Add definition of C++11 standard
+  add_definitions(-std=c++11 -Wno-attributes)
+
+  # Enable double data type
+  if (double-test)
+    add_definitions(-DDOUBLE_TEST)
+  endif()
+
+  # Set path of DNNMark include files
+  set(DNNMARK_INCLUDES ${CMAKE_SOURCE_DIR}/core/include)
+  include_directories(${DNNMARK_INCLUDES})
+
+  # Set path of DNNMark layer files
+  set(DNNMARK_LAYERS ${DNNMARK_INCLUDES}/layers)
+  include_directories(${DNNMARK_LAYERS})
+
+  # Set source files
+  file(GLOB_RECURSE DNNMARK_SOURCES RELATIVE ${CMAKE_SOURCE_DIR} core/src/*.cc)
+  message(STATUS "DNNMark Source files: " ${DNNMARK_SOURCES})
+
+  # Find rocBLAS libraries
+  find_library(ROCBLAS_LIBRARY rocblas ${ROCBLAS_ROOT}/lib)
+
+  # Find MIOpen libraries
+  find_library(MIOPEN_LIBRARY MIOpen ${MIOPEN_ROOT}/lib)
+
+  message(${ROCBLAS_LIBRARY} ${MIOPEN_LIBRARY})
+
+  # Find other libraries
+  find_library(HIP_HCC hip_hcc /opt/rocm/hip/lib)
+
+  # Find glog libraries
+  find_library(GLOG_LIBRARY glog)
+
+  # Add DNNMark library
+  add_library(${PROJECT_NAME} SHARED ${DNNMARK_SOURCES})
+  add_dependencies(${PROJECT_NAME} ${MIOPEN_LIBRARY})
+  target_link_libraries(${PROJECT_NAME}
+                        ${ROCBLAS_LIBRARY}
+                        ${MIOPEN_LIBRARY}
+                        ${HIP_HCC}
+                        ${GLOG_LIBRARY}
+                        m)
+  set_target_properties(${PROJECT_NAME} PROPERTIES
+      COMPILE_FLAGS ${COMPILER_FLAGS}
+      LINK_FLAGS ${LINKER_FLAGS})
+
+else()
+
+  message(FATAL_ERROR "CUDA or HCC not exists")
+
+endif()
+
+# Benchmarking
+option (enable-benchmarking "Compile all benchmarks of DNNMark" ON)
+
+if (enable-benchmarking)
+  # Find the gflags library
+  find_library(GFLAGS_LIBRARY gflags)
+
+  # Invoke cmake files of each benchmark
+  set(BENCHMARKS benchmarks)
+  add_subdirectory(${BENCHMARKS})
+
+endif()
diff --git a/src/DNNMark/Dockerfile b/src/DNNMark/Dockerfile
new file mode 100644
index 0000000..5299b26
--- /dev/null
+++ b/src/DNNMark/Dockerfile
@@ -0,0 +1,2 @@
+FROM gcr.io/gem5-test/gcn-gpu
+RUN apt-get update && apt-get -y install libgflags-dev libgoogle-glog-dev
diff --git a/src/DNNMark/LICENSE b/src/DNNMark/LICENSE
new file mode 100644
index 0000000..6e2b86d
--- /dev/null
+++ b/src/DNNMark/LICENSE
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2016 Northeastern University
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/src/DNNMark/Makefile b/src/DNNMark/Makefile
new file mode 100644
index 0000000..064dc66
--- /dev/null
+++ b/src/DNNMark/Makefile
@@ -0,0 +1,2 @@
+data:
+	g++ -std=c++14 -g generate_rand_data.cpp -o generate_rand_data
diff --git a/src/DNNMark/README.md b/src/DNNMark/README.md
new file mode 100644
index 0000000..c19a2c5
--- /dev/null
+++ b/src/DNNMark/README.md
@@ -0,0 +1,103 @@
+# Announcement
+DNNMark now supports MIOpen, so it can run on both AMD and Nvidia platforms.
+HCC, HIP, MIOpen, and miopengemm are required in order to build the MIOpen version of DNNMark.
+
+# Cite Us
+Bibtex is as below:
+
+```
+@inproceedings{Dong:2017:DDN:3038228.3038239,
+  author = {Dong, Shi and Kaeli, David},
+  title = {DNNMark: A Deep Neural Network Benchmark Suite for GPUs},
+  booktitle = {Proceedings of the General Purpose GPUs},
+  series = {GPGPU-10},
+  year = {2017},
+  isbn = {978-1-4503-4915-4},
+  location = {Austin, TX, USA},
+  pages = {63--72},
+  numpages = {10},
+  url = {http://doi.acm.org/10.1145/3038228.3038239},
+  doi = {10.1145/3038228.3038239},
+  acmid = {3038239},
+  publisher = {ACM},
+  address = {New York, NY, USA},
+  keywords = {Benchmark Suite, Deep Neural Network, GPU, cuDNN},
+}
+```
+
+
+# Publications
+[1] Shi Dong and David Kaeli. 2017. *DNNMark: A Deep Neural Network Benchmark Suite for GPUs*. In Proceedings of the General Purpose GPUs (GPGPU-10). ACM, New York, NY, USA, 63-72.
+
+[2] Shi Dong, Xiang Gong, Yifan Sun, Trinayan Baruah, and David Kaeli. 2018. *Characterizing the Microarchitectural Implications of a Convolutional Neural Network (CNN) Execution on GPUs*. In Proceedings of the 2018 ACM/SPEC International Conference on Performance Engineering (ICPE '18). ACM, New York, NY, USA, 96-106. DOI: https://doi.org/10.1145/3184407.3184423 __*(Best Paper Award)*__
+
+[3] Yifan Sun, Saoni Mukherjee, Trinayan Baruah, Shi Dong, Julian Gutierrez, Prannoy Mohan, David Kaeli. 2018. *Evaluating Performance Tradeoffs on the Radeon Open Compute Platform*. ISPASS 2018
+
+[4] Shi Dong, Pu Zhao, Xue Lin, David Kaeli. 2020. *Exploring GPU acceleration of Deep Neural Networks using Block Circulant Matrices*. Parallel Computing
+
+# DNNMark
+Configurable benchmark suite of Deep Neural Networks
+
+DNNMark is a highly configurable, extensible, and flexible deep neural network benchmark framework. In this benchmark suite, each DNN primitive workload can be invoked separately, without sacrificing configurability. One can specify any type of scenario for benchmarking at an algorithm-specific level. In addition, the actual measurement can be done during the execution of any specific kernel. Our framework allows us to ignore the setup stages and focus only on the training steps. Unlike other deep learning frameworks, attaching a real database
+for training purposes is no longer mandatory. This capability greatly aids the computer architecture community, which is
+more interested in designing/tuning hardware/software, and less interested in the details or configuration of the deep neural net.
+
+Depending on the specific configuration, deep neural networks can involve combinations of DNN primitives. A model composed of two or more primitive functions may be more desirable in terms of performance evaluation. In such cases, a composed model, rather than standalone primitives, is preferred. To provide this capability, DNNMark can be extended to more sophisticated DNN models, where layers are connected to, and dependent upon, each other.
+
+## Configurability
+This framework provides configurability of both general and algorithm-specific parameters. Users configure these through a plain-text configuration file; several examples are provided in the `config_example` directory. A sketch of such a file is shown below.
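+
+As a rough sketch only (the section and parameter names below are illustrative
+assumptions, not copied verbatim from `config_example`), a softmax configuration
+file might look like:
+
+```
+[DNNMark]
+run_mode=standalone
+
+[Softmax]
+n=100
+c=1000
+h=1
+w=1
+algo=accurate
+mode=instance
+```
+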
+## Extensibility
+New DNN models/scenarios can be easily built for benchmarking through configuration files.
+## Convenience
+Designing benchmarks takes little effort thanks to the centralized library.
+## Diversity
+DNNMark contains commonly-used DNN primitives and also provides an easy way to compose a model.
+
+# Features
+
+1. Configurable
+2. Provides insight into GPU execution behavior using detailed GPU metrics/events
+3. Supports benchmarking DNN primitives separately or in combination
+
+# Supported DNN primitives:
+
+1. Convolution forward and backward
+2. Pooling forward and backward
+3. LRN forward and backward
+4. Activation forward and backward
+5. Fully Connected forward and backward
+6. Softmax forward and backward
+7. Batch Normalization forward and backward
+8. Dropout forward and backward
+
+# Build and Usage
+
+## OS, Library, and Software Prerequisites
+OS: Ubuntu 16.04
+
+CUDA-related libraries: CUDA toolkit v8.0; cuDNN v5.0
+
+Other Software: CMake v3.5.1; g++ v5.4.0
+
+Google libraries: gflags (sudo apt-get install libgflags-dev); glog (sudo apt-get install libgoogle-glog-dev)
+
+There is one known problem regarding gflags. Sometimes the compilation process complains that 'gflags' has not been declared. This is likely related to the gflags version used, so the solution is either to install a more recent version or to change the namespace name from 'gflags' to 'google'.
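+
+As a hypothetical illustration (which namespace compiles depends on the installed
+gflags version), the flag-parsing call differs only in its namespace:
+
+```
+#include <gflags/gflags.h>
+
+int main(int argc, char **argv) {
+  // Newer gflags releases export the 'gflags' namespace:
+  // gflags::ParseCommandLineFlags(&argc, &argv, true);
+  // Older releases only provide the 'google' namespace:
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  return 0;
+}
+```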
+
+## Build
+After you download and unzip DNNMark, go to its root directory and edit `setup.sh` to set the path to cuDNN, then run `./setup.sh`. This will create a build directory and run cmake automatically. To build the code, go to the `build` directory and run `make`.
+
+## Usage
+To run the benchmarks that have been built, go to the `build` directory, where you will see a `benchmarks` directory. Go inside and select the benchmark you want to run, then run `./[name of benchmark] -config [path to config file] -debuginfo [1 or 0]` to execute it.
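+
+For example, a run of the forward softmax benchmark might look like the following
+(the paths are illustrative and depend on where DNNMark was checked out and built):
+
+```
+cd build/benchmarks/test_fwd_softmax
+./dnnmark_test_fwd_softmax -config ../../../config_example/softmax_config.dnnmark -debuginfo 1
+```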
+
+If you want to include a warm-up phase in a benchmark, one suggestion is to add it manually in the source code and use `-warmup [1 or 0]` to toggle it. You can find an example in the test_alexnet benchmark; a minimal sketch of that pattern is shown below.
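+
+A minimal sketch of the warm-up pattern, mirroring what `test_alexnet.cc` in this
+repository does (five untimed iterations, then the timer is cleared before the
+measured run):
+
+```
+// Optional warm-up phase guarded by the -warmup flag
+if (FLAGS_warmup) {
+  for (int i = 0; i < 5; i++) {
+    dnnmark.Forward();
+    dnnmark.Backward();
+  }
+}
+dnnmark.GetTimer()->Clear();  // discard warm-up timing before the measured run
+```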
+
+# For Contributors
+1. Fork the repository to your own remote repository.
+2. Clone the repository: `git clone git@github.com:your_account_name/DNNMark.git`
+3. Add this project as an upstream to your local repository: `git remote add upstream https://github.com/doody1986/DNNMark.git`. You can use `git remote -v` to view the upstream.
+4. Create your own local feature branch: `git checkout -b your-own-feature-branch develop`
+5. Make your own feature branch visible by pushing it to your own remote repository (DO NOT PUSH IT TO THE DEVELOP BRANCH): `git push --set-upstream origin your-own-feature-branch`
+6. Develop your own feature branch in your local repository: `git add`, `git commit`, etc.
+7. After your own branch is completed, make sure to merge the latest changes from the upstream develop branch into your own origin develop branch: 1) `git checkout develop` 2) `git pull upstream develop` 3) `git push origin develop`
+8. Now that you have the latest changes from the upstream develop branch in your own origin develop branch, you can update your own feature branch on your own remote repository: 1) `git checkout your-own-feature-branch` 2) `git pull origin develop` 3) `git push origin your-own-feature-branch`
+9. Make a pull request from your own feature branch on your own remote repository on GitHub to the develop branch of this repository.
+10. After the pull request is merged, you can delete your own feature branch by 1) `git push origin --delete your-own-feature-branch` to delete the remote branch and 2) `git branch -d your-own-feature-branch` to delete your local branch.
+11. More instructions on using forks can be found [here](https://help.github.com/articles/fork-a-repo/).
diff --git a/src/DNNMark/benchmarks/CMakeLists.txt b/src/DNNMark/benchmarks/CMakeLists.txt
new file mode 100644
index 0000000..34c31fe
--- /dev/null
+++ b/src/DNNMark/benchmarks/CMakeLists.txt
@@ -0,0 +1,35 @@
+set(SUBDIRECTORIES
+  test_fwd_conv
+  test_bwd_conv
+  test_fwd_pool
+  test_bwd_pool
+  test_fwd_lrn
+  test_bwd_lrn
+  test_fwd_activation
+  test_bwd_activation
+  test_fwd_fc
+  test_bwd_fc
+  test_fwd_softmax
+  test_bwd_softmax
+  test_fwd_bn
+  test_bwd_bn
+  test_fwd_dropout
+  test_bwd_dropout
+  test_fwd_bypass
+  test_bwd_bypass
+  test_fwd_composed_model
+  test_bwd_composed_model
+  test_composed_model
+  test_alexnet
+  test_VGG
+)
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+file(GLOB COMMON_FILE RELATIVE
+     ${CMAKE_CURRENT_SOURCE_DIR}
+     *.cc)
+get_filename_component(COMMON_SRC ${COMMON_FILE} ABSOLUTE)
+
+foreach(subdir ${SUBDIRECTORIES})
+  add_subdirectory(${subdir})
+endforeach(subdir)
diff --git a/src/DNNMark/benchmarks/test_VGG/CMakeLists.txt b/src/DNNMark/benchmarks/test_VGG/CMakeLists.txt
new file mode 100644
index 0000000..2f9828c
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_VGG/CMakeLists.txt
@@ -0,0 +1,32 @@
+# Include the DNNMark include path
+include_directories(${DNNMARK_INCLUDES})
+
+# Include the gflag include path
+#include_directories(${})
+
+# Obtain the name of the directory(also part of the benchmark name)
+get_filename_component(BENCHMARK_DIR ${CMAKE_CURRENT_SOURCE_DIR} NAME)
+
+# Set the benchmark name
+set(BENCHMARK_NAME ${PROJECT_NAME}_${BENCHMARK_DIR})
+
+# Add source file
+file(GLOB_RECURSE BENCHMARK_SRC_FILE RELATIVE
+     ${CMAKE_CURRENT_SOURCE_DIR}
+     *.cc)
+
+# Print out the source file information
+message(STATUS ${BENCHMARK_NAME} ": Benchmark source files: " ${BENCHMARK_SRC_FILE})
+
+# Add benchmark executable
+add_executable(${BENCHMARK_NAME} ${COMMON_SRC} ${BENCHMARK_SRC_FILE})
+
+# Add dependencies of current benchmark
+add_dependencies(${BENCHMARK_NAME} ${PROJECT_NAME})
+add_dependencies(${BENCHMARK_NAME} ${GFLAGS_LIBRARY})
+
+# Link libraries
+target_link_libraries(${BENCHMARK_NAME}
+    ${PROJECT_NAME}
+    ${GFLAGS_LIBRARY})
+
diff --git a/src/DNNMark/benchmarks/test_VGG/test_VGG.cc b/src/DNNMark/benchmarks/test_VGG/test_VGG.cc
new file mode 100644
index 0000000..a4a9df0
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_VGG/test_VGG.cc
@@ -0,0 +1,39 @@
+#include <iostream>
+#include "common.h"
+#include "dnnmark.h"
+#include "usage.h"
+
+using namespace dnnmark;
+
+int main(int argc, char **argv) {
+  INIT_FLAGS(argc, argv);
+  INIT_LOG(argv);
+  LOG(INFO) << "DNNMark suites: Start...";
+  DNNMark<TestType> dnnmark(58, FLAGS_mmap);
+  dnnmark.ParseAllConfig(FLAGS_config);
+
+  dnnmark.Initialize();
+  // Warm up
+  if (FLAGS_warmup) {
+    for (int i = 0; i < 5; i++) {
+      dnnmark.Forward();
+      dnnmark.Backward();
+    }
+  }
+  dnnmark.GetTimer()->Clear();
+
+  // Real benchmark
+  for (int i = 0; i < FLAGS_iterations; i++) {
+    LOG(INFO) << "Iteration " << i;
+    dnnmark.Forward();
+    dnnmark.Backward();
+  }
+  dnnmark.GetTimer()->SumRecords();
+
+  dnnmark.TearDown();
+
+  LOG(INFO) << "Total running time(ms): " << dnnmark.GetTimer()->GetTotalTime();
+  LOG(INFO) << "DNNMark suites: Tear down...";
+  printf("Total running time(ms): %f\n", dnnmark.GetTimer()->GetTotalTime());
+  return 0;
+}
diff --git a/src/DNNMark/benchmarks/test_alexnet/CMakeLists.txt b/src/DNNMark/benchmarks/test_alexnet/CMakeLists.txt
new file mode 100644
index 0000000..2f9828c
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_alexnet/CMakeLists.txt
@@ -0,0 +1,32 @@
+# Include the DNNMark include path
+include_directories(${DNNMARK_INCLUDES})
+
+# Include the gflag include path
+#include_directories(${})
+
+# Obtain the name of the directory(also part of the benchmark name)
+get_filename_component(BENCHMARK_DIR ${CMAKE_CURRENT_SOURCE_DIR} NAME)
+
+# Set the benchmark name
+set(BENCHMARK_NAME ${PROJECT_NAME}_${BENCHMARK_DIR})
+
+# Add source file
+file(GLOB_RECURSE BENCHMARK_SRC_FILE RELATIVE
+     ${CMAKE_CURRENT_SOURCE_DIR}
+     *.cc)
+
+# Print out the source file information
+message(STATUS ${BENCHMARK_NAME} ": Benchmark source files: " ${BENCHMARK_SRC_FILE})
+
+# Add benchmark executable
+add_executable(${BENCHMARK_NAME} ${COMMON_SRC} ${BENCHMARK_SRC_FILE})
+
+# Add dependencies of current benchmark
+add_dependencies(${BENCHMARK_NAME} ${PROJECT_NAME})
+add_dependencies(${BENCHMARK_NAME} ${GFLAGS_LIBRARY})
+
+# Link libraries
+target_link_libraries(${BENCHMARK_NAME}
+    ${PROJECT_NAME}
+    ${GFLAGS_LIBRARY})
+
diff --git a/src/DNNMark/benchmarks/test_alexnet/test_alexnet.cc b/src/DNNMark/benchmarks/test_alexnet/test_alexnet.cc
new file mode 100644
index 0000000..1624382
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_alexnet/test_alexnet.cc
@@ -0,0 +1,35 @@
+#include <iostream>
+#include "common.h"
+#include "dnnmark.h"
+#include "usage.h"
+
+using namespace dnnmark;
+
+int main(int argc, char **argv) {
+  INIT_FLAGS(argc, argv);
+  INIT_LOG(argv);
+  LOG(INFO) << "DNNMark suites: Start...";
+  DNNMark<TestType> dnnmark(21, FLAGS_mmap);
+  dnnmark.ParseAllConfig(FLAGS_config);
+
+  dnnmark.Initialize();
+  // Warm up
+  if (FLAGS_warmup) {
+    for (int i = 0; i < 5; i++) {
+      dnnmark.Forward();
+      dnnmark.Backward();
+    }
+  }
+  dnnmark.GetTimer()->Clear();
+
+  // Real benchmark
+  dnnmark.Forward();
+  dnnmark.Backward();
+  dnnmark.GetTimer()->SumRecords();
+
+  dnnmark.TearDown();
+
+  LOG(INFO) << "Total running time(ms): " << dnnmark.GetTimer()->GetTotalTime();
+  LOG(INFO) << "DNNMark suites: Tear down...";
+  return 0;
+}
diff --git a/src/DNNMark/benchmarks/test_bwd_activation/CMakeLists.txt b/src/DNNMark/benchmarks/test_bwd_activation/CMakeLists.txt
new file mode 100644
index 0000000..9f49e41
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_bwd_activation/CMakeLists.txt
@@ -0,0 +1,29 @@
+# Include the DNNMark include path
+include_directories(${DNNMARK_INCLUDES})
+
+# Obtain the name of the directory(also part of the benchmark name)
+get_filename_component(BENCHMARK_DIR ${CMAKE_CURRENT_SOURCE_DIR} NAME)
+
+# Set the benchmark name
+set(BENCHMARK_NAME ${PROJECT_NAME}_${BENCHMARK_DIR})
+
+# Add source file
+file(GLOB_RECURSE BENCHMARK_SRC_FILE RELATIVE
+     ${CMAKE_CURRENT_SOURCE_DIR}
+     *.cc)
+
+# Print out the source file information
+message(STATUS ${BENCHMARK_NAME} ": Benchmark source files: " ${BENCHMARK_SRC_FILE})
+
+# Add benchmark executable
+add_executable(${BENCHMARK_NAME} ${COMMON_SRC} ${BENCHMARK_SRC_FILE})
+
+# Add dependencies of current benchmark
+add_dependencies(${BENCHMARK_NAME} ${PROJECT_NAME})
+add_dependencies(${BENCHMARK_NAME} ${GFLAGS_LIBRARY})
+
+# Link libraries
+target_link_libraries(${BENCHMARK_NAME}
+    ${PROJECT_NAME}
+    ${GFLAGS_LIBRARY})
+
diff --git a/src/DNNMark/benchmarks/test_bwd_activation/test_bwd_activation.cc b/src/DNNMark/benchmarks/test_bwd_activation/test_bwd_activation.cc
new file mode 100644
index 0000000..b0ca1c7
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_bwd_activation/test_bwd_activation.cc
@@ -0,0 +1,22 @@
+#include <iostream>
+#include "common.h"
+#include "dnnmark.h"
+#include "usage.h"
+
+using namespace dnnmark;
+
+int main(int argc, char **argv) {
+  INIT_FLAGS(argc, argv);
+  INIT_LOG(argv);
+  LOG(INFO) << "DNNMark suites: Start...";
+  DNNMark<TestType> dnnmark(FLAGS_mmap);
+  dnnmark.ParseGeneralConfig(FLAGS_config);
+  dnnmark.ParseLayerConfig(FLAGS_config);
+  dnnmark.Initialize();
+  dnnmark.Backward();
+  dnnmark.GetTimer()->SumRecords();
+  dnnmark.TearDown();
+  LOG(INFO) << "Total running time(ms): " << dnnmark.GetTimer()->GetTotalTime();
+  LOG(INFO) << "DNNMark suites: Tear down...";
+  return 0;
+}
diff --git a/src/DNNMark/benchmarks/test_bwd_bn/CMakeLists.txt b/src/DNNMark/benchmarks/test_bwd_bn/CMakeLists.txt
new file mode 100644
index 0000000..9f49e41
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_bwd_bn/CMakeLists.txt
@@ -0,0 +1,29 @@
+# Include the DNNMark include path
+include_directories(${DNNMARK_INCLUDES})
+
+# Obtain the name of the directory(also part of the benchmark name)
+get_filename_component(BENCHMARK_DIR ${CMAKE_CURRENT_SOURCE_DIR} NAME)
+
+# Set the benchmark name
+set(BENCHMARK_NAME ${PROJECT_NAME}_${BENCHMARK_DIR})
+
+# Add source file
+file(GLOB_RECURSE BENCHMARK_SRC_FILE RELATIVE
+     ${CMAKE_CURRENT_SOURCE_DIR}
+     *.cc)
+
+# Print out the source file information
+message(STATUS ${BENCHMARK_NAME} ": Benchmark source files: " ${BENCHMARK_SRC_FILE})
+
+# Add benchmark executable
+add_executable(${BENCHMARK_NAME} ${COMMON_SRC} ${BENCHMARK_SRC_FILE})
+
+# Add dependencies of current benchmark
+add_dependencies(${BENCHMARK_NAME} ${PROJECT_NAME})
+add_dependencies(${BENCHMARK_NAME} ${GFLAGS_LIBRARY})
+
+# Link libraries
+target_link_libraries(${BENCHMARK_NAME}
+    ${PROJECT_NAME}
+    ${GFLAGS_LIBRARY})
+
diff --git a/src/DNNMark/benchmarks/test_bwd_bn/test_bwd_bn.cc b/src/DNNMark/benchmarks/test_bwd_bn/test_bwd_bn.cc
new file mode 100644
index 0000000..b0ca1c7
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_bwd_bn/test_bwd_bn.cc
@@ -0,0 +1,22 @@
+#include <iostream>
+#include "common.h"
+#include "dnnmark.h"
+#include "usage.h"
+
+using namespace dnnmark;
+
+int main(int argc, char **argv) {
+  INIT_FLAGS(argc, argv);
+  INIT_LOG(argv);
+  LOG(INFO) << "DNNMark suites: Start...";
+  DNNMark<TestType> dnnmark(FLAGS_mmap);
+  dnnmark.ParseGeneralConfig(FLAGS_config);
+  dnnmark.ParseLayerConfig(FLAGS_config);
+  dnnmark.Initialize();
+  dnnmark.Backward();
+  dnnmark.GetTimer()->SumRecords();
+  dnnmark.TearDown();
+  LOG(INFO) << "Total running time(ms): " << dnnmark.GetTimer()->GetTotalTime();
+  LOG(INFO) << "DNNMark suites: Tear down...";
+  return 0;
+}
diff --git a/src/DNNMark/benchmarks/test_bwd_bypass/CMakeLists.txt b/src/DNNMark/benchmarks/test_bwd_bypass/CMakeLists.txt
new file mode 100644
index 0000000..9f49e41
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_bwd_bypass/CMakeLists.txt
@@ -0,0 +1,29 @@
+# Include the DNNMark include path
+include_directories(${DNNMARK_INCLUDES})
+
+# Obtain the name of the directory(also part of the benchmark name)
+get_filename_component(BENCHMARK_DIR ${CMAKE_CURRENT_SOURCE_DIR} NAME)
+
+# Set the benchmark name
+set(BENCHMARK_NAME ${PROJECT_NAME}_${BENCHMARK_DIR})
+
+# Add source file
+file(GLOB_RECURSE BENCHMARK_SRC_FILE RELATIVE
+     ${CMAKE_CURRENT_SOURCE_DIR}
+     *.cc)
+
+# Print out the source file information
+message(STATUS ${BENCHMARK_NAME} ": Benchmark source files: " ${BENCHMARK_SRC_FILE})
+
+# Add benchmark executable
+add_executable(${BENCHMARK_NAME} ${COMMON_SRC} ${BENCHMARK_SRC_FILE})
+
+# Add dependencies of current benchmark
+add_dependencies(${BENCHMARK_NAME} ${PROJECT_NAME})
+add_dependencies(${BENCHMARK_NAME} ${GFLAGS_LIBRARY})
+
+# Link libraries
+target_link_libraries(${BENCHMARK_NAME}
+    ${PROJECT_NAME}
+    ${GFLAGS_LIBRARY})
+
diff --git a/src/DNNMark/benchmarks/test_bwd_bypass/test_bwd_bypass.cc b/src/DNNMark/benchmarks/test_bwd_bypass/test_bwd_bypass.cc
new file mode 100644
index 0000000..b0ca1c7
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_bwd_bypass/test_bwd_bypass.cc
@@ -0,0 +1,22 @@
+#include <iostream>
+#include "common.h"
+#include "dnnmark.h"
+#include "usage.h"
+
+using namespace dnnmark;
+
+int main(int argc, char **argv) {
+  INIT_FLAGS(argc, argv);
+  INIT_LOG(argv);
+  LOG(INFO) << "DNNMark suites: Start...";
+  DNNMark<TestType> dnnmark(FLAGS_mmap);
+  dnnmark.ParseGeneralConfig(FLAGS_config);
+  dnnmark.ParseLayerConfig(FLAGS_config);
+  dnnmark.Initialize();
+  dnnmark.Backward();
+  dnnmark.GetTimer()->SumRecords();
+  dnnmark.TearDown();
+  LOG(INFO) << "Total running time(ms): " << dnnmark.GetTimer()->GetTotalTime();
+  LOG(INFO) << "DNNMark suites: Tear down...";
+  return 0;
+}
diff --git a/src/DNNMark/benchmarks/test_bwd_composed_model/CMakeLists.txt b/src/DNNMark/benchmarks/test_bwd_composed_model/CMakeLists.txt
new file mode 100644
index 0000000..2f9828c
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_bwd_composed_model/CMakeLists.txt
@@ -0,0 +1,32 @@
+# Include the DNNMark include path
+include_directories(${DNNMARK_INCLUDES})
+
+# Include the gflag include path
+#include_directories(${})
+
+# Obtain the name of the directory(also part of the benchmark name)
+get_filename_component(BENCHMARK_DIR ${CMAKE_CURRENT_SOURCE_DIR} NAME)
+
+# Set the benchmark name
+set(BENCHMARK_NAME ${PROJECT_NAME}_${BENCHMARK_DIR})
+
+# Add source file
+file(GLOB_RECURSE BENCHMARK_SRC_FILE RELATIVE
+     ${CMAKE_CURRENT_SOURCE_DIR}
+     *.cc)
+
+# Print out the source file information
+message(STATUS ${BENCHMARK_NAME} ": Benchmark source files: " ${BENCHMARK_SRC_FILE})
+
+# Add benchmark executable
+add_executable(${BENCHMARK_NAME} ${COMMON_SRC} ${BENCHMARK_SRC_FILE})
+
+# Add dependencies of current benchmark
+add_dependencies(${BENCHMARK_NAME} ${PROJECT_NAME})
+add_dependencies(${BENCHMARK_NAME} ${GFLAGS_LIBRARY})
+
+# Link libraries
+target_link_libraries(${BENCHMARK_NAME}
+    ${PROJECT_NAME}
+    ${GFLAGS_LIBRARY})
+
diff --git a/src/DNNMark/benchmarks/test_bwd_composed_model/test_bwd_composed_model.cc b/src/DNNMark/benchmarks/test_bwd_composed_model/test_bwd_composed_model.cc
new file mode 100644
index 0000000..cc668eb
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_bwd_composed_model/test_bwd_composed_model.cc
@@ -0,0 +1,21 @@
+#include <iostream>
+#include "common.h"
+#include "dnnmark.h"
+#include "usage.h"
+
+using namespace dnnmark;
+
+int main(int argc, char **argv) {
+  INIT_FLAGS(argc, argv);
+  INIT_LOG(argv);
+  LOG(INFO) << "DNNMark suites: Start...";
+  DNNMark<TestType> dnnmark(3, FLAGS_mmap);
+  dnnmark.ParseAllConfig(FLAGS_config);
+  dnnmark.Initialize();
+  dnnmark.Backward();
+  dnnmark.GetTimer()->SumRecords();
+  dnnmark.TearDown();
+  LOG(INFO) << "Total running time(ms): " << dnnmark.GetTimer()->GetTotalTime();
+  LOG(INFO) << "DNNMark suites: Tear down...";
+  return 0;
+}
diff --git a/src/DNNMark/benchmarks/test_bwd_conv/CMakeLists.txt b/src/DNNMark/benchmarks/test_bwd_conv/CMakeLists.txt
new file mode 100644
index 0000000..2f9828c
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_bwd_conv/CMakeLists.txt
@@ -0,0 +1,32 @@
+# Include the DNNMark include path
+include_directories(${DNNMARK_INCLUDES})
+
+# Include the gflag include path
+#include_directories(${})
+
+# Obtain the name of the directory(also part of the benchmark name)
+get_filename_component(BENCHMARK_DIR ${CMAKE_CURRENT_SOURCE_DIR} NAME)
+
+# Set the benchmark name
+set(BENCHMARK_NAME ${PROJECT_NAME}_${BENCHMARK_DIR})
+
+# Add source file
+file(GLOB_RECURSE BENCHMARK_SRC_FILE RELATIVE
+     ${CMAKE_CURRENT_SOURCE_DIR}
+     *.cc)
+
+# Print out the source file information
+message(STATUS ${BENCHMARK_NAME} ": Benchmark source files: " ${BENCHMARK_SRC_FILE})
+
+# Add benchmark executable
+add_executable(${BENCHMARK_NAME} ${COMMON_SRC} ${BENCHMARK_SRC_FILE})
+
+# Add dependencies of current benchmark
+add_dependencies(${BENCHMARK_NAME} ${PROJECT_NAME})
+add_dependencies(${BENCHMARK_NAME} ${GFLAGS_LIBRARY})
+
+# Link libraries
+target_link_libraries(${BENCHMARK_NAME}
+    ${PROJECT_NAME}
+    ${GFLAGS_LIBRARY})
+
diff --git a/src/DNNMark/benchmarks/test_bwd_conv/test_bwd_conv.cc b/src/DNNMark/benchmarks/test_bwd_conv/test_bwd_conv.cc
new file mode 100644
index 0000000..b0ca1c7
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_bwd_conv/test_bwd_conv.cc
@@ -0,0 +1,22 @@
+#include <iostream>
+#include "common.h"
+#include "dnnmark.h"
+#include "usage.h"
+
+using namespace dnnmark;
+
+int main(int argc, char **argv) {
+  INIT_FLAGS(argc, argv);
+  INIT_LOG(argv);
+  LOG(INFO) << "DNNMark suites: Start...";
+  DNNMark<TestType> dnnmark(FLAGS_mmap);
+  dnnmark.ParseGeneralConfig(FLAGS_config);
+  dnnmark.ParseLayerConfig(FLAGS_config);
+  dnnmark.Initialize();
+  dnnmark.Backward();
+  dnnmark.GetTimer()->SumRecords();
+  dnnmark.TearDown();
+  LOG(INFO) << "Total running time(ms): " << dnnmark.GetTimer()->GetTotalTime();
+  LOG(INFO) << "DNNMark suites: Tear down...";
+  return 0;
+}
diff --git a/src/DNNMark/benchmarks/test_bwd_dropout/CMakeLists.txt b/src/DNNMark/benchmarks/test_bwd_dropout/CMakeLists.txt
new file mode 100644
index 0000000..2f9828c
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_bwd_dropout/CMakeLists.txt
@@ -0,0 +1,32 @@
+# Include the DNNMark include path
+include_directories(${DNNMARK_INCLUDES})
+
+# Include the gflag include path
+#include_directories(${})
+
+# Obtain the name of the directory(also part of the benchmark name)
+get_filename_component(BENCHMARK_DIR ${CMAKE_CURRENT_SOURCE_DIR} NAME)
+
+# Set the benchmark name
+set(BENCHMARK_NAME ${PROJECT_NAME}_${BENCHMARK_DIR})
+
+# Add source file
+file(GLOB_RECURSE BENCHMARK_SRC_FILE RELATIVE
+     ${CMAKE_CURRENT_SOURCE_DIR}
+     *.cc)
+
+# Print out the source file information
+message(STATUS ${BENCHMARK_NAME} ": Benchmark source files: " ${BENCHMARK_SRC_FILE})
+
+# Add benchmark executable
+add_executable(${BENCHMARK_NAME} ${COMMON_SRC} ${BENCHMARK_SRC_FILE})
+
+# Add dependencies of current benchmark
+add_dependencies(${BENCHMARK_NAME} ${PROJECT_NAME})
+add_dependencies(${BENCHMARK_NAME} ${GFLAGS_LIBRARY})
+
+# Link libraries
+target_link_libraries(${BENCHMARK_NAME}
+    ${PROJECT_NAME}
+    ${GFLAGS_LIBRARY})
+
diff --git a/src/DNNMark/benchmarks/test_bwd_dropout/test_bwd_dropout.cc b/src/DNNMark/benchmarks/test_bwd_dropout/test_bwd_dropout.cc
new file mode 100644
index 0000000..8aa8991
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_bwd_dropout/test_bwd_dropout.cc
@@ -0,0 +1,25 @@
+#include <iostream>
+#include "common.h"
+#include "dnnmark.h"
+#include "usage.h"
+
+using namespace dnnmark;
+
+int main(int argc, char **argv) {
+  //google::ParseCommandLineFlags(&argc, &argv, true);
+  //google::InitGoogleLogging(argv[0]);
+  //FLAGS_logtostderr = FLAGS_debuginfo;
+  INIT_FLAGS(argc, argv);
+  INIT_LOG(argv);
+  LOG(INFO) << "DNNMark suites: Start...";
+  DNNMark<TestType> dnnmark(FLAGS_mmap);
+  dnnmark.ParseGeneralConfig(FLAGS_config);
+  dnnmark.ParseLayerConfig(FLAGS_config);
+  dnnmark.Initialize();
+  dnnmark.Backward();
+  dnnmark.GetTimer()->SumRecords();
+  dnnmark.TearDown();
+  LOG(INFO) << "Total running time(ms): " << dnnmark.GetTimer()->GetTotalTime();
+  LOG(INFO) << "DNNMark suites: Tear down...";
+  return 0;
+}
diff --git a/src/DNNMark/benchmarks/test_bwd_fc/CMakeLists.txt b/src/DNNMark/benchmarks/test_bwd_fc/CMakeLists.txt
new file mode 100644
index 0000000..9f49e41
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_bwd_fc/CMakeLists.txt
@@ -0,0 +1,29 @@
+# Include the DNNMark include path
+include_directories(${DNNMARK_INCLUDES})
+
+# Obtain the name of the directory(also part of the benchmark name)
+get_filename_component(BENCHMARK_DIR ${CMAKE_CURRENT_SOURCE_DIR} NAME)
+
+# Set the benchmark name
+set(BENCHMARK_NAME ${PROJECT_NAME}_${BENCHMARK_DIR})
+
+# Add source file
+file(GLOB_RECURSE BENCHMARK_SRC_FILE RELATIVE
+     ${CMAKE_CURRENT_SOURCE_DIR}
+     *.cc)
+
+# Print out the source file information
+message(STATUS ${BENCHMARK_NAME} ": Benchmark source files: " ${BENCHMARK_SRC_FILE})
+
+# Add benchmark executable
+add_executable(${BENCHMARK_NAME} ${COMMON_SRC} ${BENCHMARK_SRC_FILE})
+
+# Add dependencies of current benchmark
+add_dependencies(${BENCHMARK_NAME} ${PROJECT_NAME})
+add_dependencies(${BENCHMARK_NAME} ${GFLAGS_LIBRARY})
+
+# Link libraries
+target_link_libraries(${BENCHMARK_NAME}
+    ${PROJECT_NAME}
+    ${GFLAGS_LIBRARY})
+
diff --git a/src/DNNMark/benchmarks/test_bwd_fc/test_bwd_fc.cc b/src/DNNMark/benchmarks/test_bwd_fc/test_bwd_fc.cc
new file mode 100644
index 0000000..b0ca1c7
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_bwd_fc/test_bwd_fc.cc
@@ -0,0 +1,22 @@
+#include <iostream>
+#include "common.h"
+#include "dnnmark.h"
+#include "usage.h"
+
+using namespace dnnmark;
+
+int main(int argc, char **argv) {
+  INIT_FLAGS(argc, argv);
+  INIT_LOG(argv);
+  LOG(INFO) << "DNNMark suites: Start...";
+  DNNMark<TestType> dnnmark(FLAGS_mmap);
+  dnnmark.ParseGeneralConfig(FLAGS_config);
+  dnnmark.ParseLayerConfig(FLAGS_config);
+  dnnmark.Initialize();
+  dnnmark.Backward();
+  dnnmark.GetTimer()->SumRecords();
+  dnnmark.TearDown();
+  LOG(INFO) << "Total running time(ms): " << dnnmark.GetTimer()->GetTotalTime();
+  LOG(INFO) << "DNNMark suites: Tear down...";
+  return 0;
+}
diff --git a/src/DNNMark/benchmarks/test_bwd_lrn/CMakeLists.txt b/src/DNNMark/benchmarks/test_bwd_lrn/CMakeLists.txt
new file mode 100644
index 0000000..9f49e41
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_bwd_lrn/CMakeLists.txt
@@ -0,0 +1,29 @@
+# Include the DNNMark include path
+include_directories(${DNNMARK_INCLUDES})
+
+# Obtain the name of the directory(also part of the benchmark name)
+get_filename_component(BENCHMARK_DIR ${CMAKE_CURRENT_SOURCE_DIR} NAME)
+
+# Set the benchmark name
+set(BENCHMARK_NAME ${PROJECT_NAME}_${BENCHMARK_DIR})
+
+# Add source file
+file(GLOB_RECURSE BENCHMARK_SRC_FILE RELATIVE
+     ${CMAKE_CURRENT_SOURCE_DIR}
+     *.cc)
+
+# Print out the source file information
+message(STATUS ${BENCHMARK_NAME} ": Benchmark source files: " ${BENCHMARK_SRC_FILE})
+
+# Add benchmark executable
+add_executable(${BENCHMARK_NAME} ${COMMON_SRC} ${BENCHMARK_SRC_FILE})
+
+# Add dependencies of current benchmark
+add_dependencies(${BENCHMARK_NAME} ${PROJECT_NAME})
+add_dependencies(${BENCHMARK_NAME} ${GFLAGS_LIBRARY})
+
+# Link libraries
+target_link_libraries(${BENCHMARK_NAME}
+    ${PROJECT_NAME}
+    ${GFLAGS_LIBRARY})
+
diff --git a/src/DNNMark/benchmarks/test_bwd_lrn/test_bwd_lrn.cc b/src/DNNMark/benchmarks/test_bwd_lrn/test_bwd_lrn.cc
new file mode 100644
index 0000000..b0ca1c7
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_bwd_lrn/test_bwd_lrn.cc
@@ -0,0 +1,22 @@
+#include <iostream>
+#include "common.h"
+#include "dnnmark.h"
+#include "usage.h"
+
+using namespace dnnmark;
+
+int main(int argc, char **argv) {
+  INIT_FLAGS(argc, argv);
+  INIT_LOG(argv);
+  LOG(INFO) << "DNNMark suites: Start...";
+  DNNMark<TestType> dnnmark(FLAGS_mmap);
+  dnnmark.ParseGeneralConfig(FLAGS_config);
+  dnnmark.ParseLayerConfig(FLAGS_config);
+  dnnmark.Initialize();
+  dnnmark.Backward();
+  dnnmark.GetTimer()->SumRecords();
+  dnnmark.TearDown();
+  LOG(INFO) << "Total running time(ms): " << dnnmark.GetTimer()->GetTotalTime();
+  LOG(INFO) << "DNNMark suites: Tear down...";
+  return 0;
+}
diff --git a/src/DNNMark/benchmarks/test_bwd_pool/CMakeLists.txt b/src/DNNMark/benchmarks/test_bwd_pool/CMakeLists.txt
new file mode 100644
index 0000000..9f49e41
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_bwd_pool/CMakeLists.txt
@@ -0,0 +1,29 @@
+# Include the DNNMark include path
+include_directories(${DNNMARK_INCLUDES})
+
+# Obtain the name of the directory(also part of the benchmark name)
+get_filename_component(BENCHMARK_DIR ${CMAKE_CURRENT_SOURCE_DIR} NAME)
+
+# Set the benchmark name
+set(BENCHMARK_NAME ${PROJECT_NAME}_${BENCHMARK_DIR})
+
+# Add source file
+file(GLOB_RECURSE BENCHMARK_SRC_FILE RELATIVE
+     ${CMAKE_CURRENT_SOURCE_DIR}
+     *.cc)
+
+# Print out the source file information
+message(STATUS ${BENCHMARK_NAME} ": Benchmark source files: " ${BENCHMARK_SRC_FILE})
+
+# Add benchmark executable
+add_executable(${BENCHMARK_NAME} ${COMMON_SRC} ${BENCHMARK_SRC_FILE})
+
+# Add dependencies of current benchmark
+add_dependencies(${BENCHMARK_NAME} ${PROJECT_NAME})
+add_dependencies(${BENCHMARK_NAME} ${GFLAGS_LIBRARY})
+
+# Link libraries
+target_link_libraries(${BENCHMARK_NAME}
+    ${PROJECT_NAME}
+    ${GFLAGS_LIBRARY})
+
diff --git a/src/DNNMark/benchmarks/test_bwd_pool/test_bwd_pool.cc b/src/DNNMark/benchmarks/test_bwd_pool/test_bwd_pool.cc
new file mode 100644
index 0000000..b0ca1c7
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_bwd_pool/test_bwd_pool.cc
@@ -0,0 +1,22 @@
+#include <iostream>
+#include "common.h"
+#include "dnnmark.h"
+#include "usage.h"
+
+using namespace dnnmark;
+
+int main(int argc, char **argv) {
+  INIT_FLAGS(argc, argv);
+  INIT_LOG(argv);
+  LOG(INFO) << "DNNMark suites: Start...";
+  DNNMark<TestType> dnnmark(FLAGS_mmap);
+  dnnmark.ParseGeneralConfig(FLAGS_config);
+  dnnmark.ParseLayerConfig(FLAGS_config);
+  dnnmark.Initialize();
+  dnnmark.Backward();
+  dnnmark.GetTimer()->SumRecords();
+  dnnmark.TearDown();
+  LOG(INFO) << "Total running time(ms): " << dnnmark.GetTimer()->GetTotalTime();
+  LOG(INFO) << "DNNMark suites: Tear down...";
+  return 0;
+}
diff --git a/src/DNNMark/benchmarks/test_bwd_softmax/CMakeLists.txt b/src/DNNMark/benchmarks/test_bwd_softmax/CMakeLists.txt
new file mode 100644
index 0000000..9f49e41
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_bwd_softmax/CMakeLists.txt
@@ -0,0 +1,29 @@
+# Include the DNNMark include path
+include_directories(${DNNMARK_INCLUDES})
+
+# Obtain the name of the directory(also part of the benchmark name)
+get_filename_component(BENCHMARK_DIR ${CMAKE_CURRENT_SOURCE_DIR} NAME)
+
+# Set the benchmark name
+set(BENCHMARK_NAME ${PROJECT_NAME}_${BENCHMARK_DIR})
+
+# Add source file
+file(GLOB_RECURSE BENCHMARK_SRC_FILE RELATIVE
+     ${CMAKE_CURRENT_SOURCE_DIR}
+     *.cc)
+
+# Print out the source file information
+message(STATUS ${BENCHMARK_NAME} ": Benchmark source files: " ${BENCHMARK_SRC_FILE})
+
+# Add benchmark executable
+add_executable(${BENCHMARK_NAME} ${COMMON_SRC} ${BENCHMARK_SRC_FILE})
+
+# Add dependencies of current benchmark
+add_dependencies(${BENCHMARK_NAME} ${PROJECT_NAME})
+add_dependencies(${BENCHMARK_NAME} ${GFLAGS_LIBRARY})
+
+# Link libraries
+target_link_libraries(${BENCHMARK_NAME}
+    ${PROJECT_NAME}
+    ${GFLAGS_LIBRARY})
+
diff --git a/src/DNNMark/benchmarks/test_bwd_softmax/test_bwd_softmax.cc b/src/DNNMark/benchmarks/test_bwd_softmax/test_bwd_softmax.cc
new file mode 100644
index 0000000..b0ca1c7
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_bwd_softmax/test_bwd_softmax.cc
@@ -0,0 +1,22 @@
+#include <iostream>
+#include "common.h"
+#include "dnnmark.h"
+#include "usage.h"
+
+using namespace dnnmark;
+
+int main(int argc, char **argv) {
+  INIT_FLAGS(argc, argv);
+  INIT_LOG(argv);
+  LOG(INFO) << "DNNMark suites: Start...";
+  DNNMark<TestType> dnnmark(FLAGS_mmap);
+  dnnmark.ParseGeneralConfig(FLAGS_config);
+  dnnmark.ParseLayerConfig(FLAGS_config);
+  dnnmark.Initialize();
+  dnnmark.Backward();
+  dnnmark.GetTimer()->SumRecords();
+  dnnmark.TearDown();
+  LOG(INFO) << "Total running time(ms): " << dnnmark.GetTimer()->GetTotalTime();
+  LOG(INFO) << "DNNMark suites: Tear down...";
+  return 0;
+}
diff --git a/src/DNNMark/benchmarks/test_composed_model/CMakeLists.txt b/src/DNNMark/benchmarks/test_composed_model/CMakeLists.txt
new file mode 100644
index 0000000..2f9828c
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_composed_model/CMakeLists.txt
@@ -0,0 +1,32 @@
+# Include the DNNMark include path
+include_directories(${DNNMARK_INCLUDES})
+
+# Include the gflag include path
+#include_directories(${})
+
+# Obtain the name of the directory(also part of the benchmark name)
+get_filename_component(BENCHMARK_DIR ${CMAKE_CURRENT_SOURCE_DIR} NAME)
+
+# Set the benchmark name
+set(BENCHMARK_NAME ${PROJECT_NAME}_${BENCHMARK_DIR})
+
+# Add source file
+file(GLOB_RECURSE BENCHMARK_SRC_FILE RELATIVE
+     ${CMAKE_CURRENT_SOURCE_DIR}
+     *.cc)
+
+# Print out the source file information
+message(STATUS ${BENCHMARK_NAME} ": Benchmark source files: " ${BENCHMARK_SRC_FILE})
+
+# Add benchmark executable
+add_executable(${BENCHMARK_NAME} ${COMMON_SRC} ${BENCHMARK_SRC_FILE})
+
+# Add dependencies of current benchmark
+add_dependencies(${BENCHMARK_NAME} ${PROJECT_NAME})
+add_dependencies(${BENCHMARK_NAME} ${GFLAGS_LIBRARY})
+
+# Link libraries
+target_link_libraries(${BENCHMARK_NAME}
+    ${PROJECT_NAME}
+    ${GFLAGS_LIBRARY})
+
diff --git a/src/DNNMark/benchmarks/test_composed_model/test_composed_model.cc b/src/DNNMark/benchmarks/test_composed_model/test_composed_model.cc
new file mode 100644
index 0000000..ddf6888
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_composed_model/test_composed_model.cc
@@ -0,0 +1,39 @@
+#include <iostream>
+#include "stdio.h"
+#include "common.h"
+#include "dnnmark.h"
+#include "usage.h"
+#include <gflags/gflags.h>
+
+using namespace dnnmark;
+
+int main(int argc, char **argv) {
+  INIT_FLAGS(argc, argv);
+  INIT_LOG(argv);
+  LOG(INFO) << "DNNMark suites: Start...";
+  DNNMark<TestType> dnnmark(3, FLAGS_mmap);
+  dnnmark.ParseAllConfig(FLAGS_config);
+  dnnmark.Initialize();
+  if (FLAGS_warmup) {
+    for (int i = 0; i < 5; i++) {
+      LOG(INFO) << "Warming up...";
+      dnnmark.Forward();
+      dnnmark.Backward();
+    }
+  }
+  dnnmark.GetTimer()->Clear();
+
+  // Real benchmark
+  for (int i = 0; i < FLAGS_iterations; i++) {
+    LOG(INFO) << "Iteration " << i;
+    dnnmark.Forward();
+    dnnmark.Backward();
+  }
+  dnnmark.GetTimer()->SumRecords();
+  dnnmark.TearDown();
+
+  LOG(INFO) << "Total running time(ms): " << dnnmark.GetTimer()->GetTotalTime();
+  printf("Total running time(ms): %f\n", dnnmark.GetTimer()->GetTotalTime());
+  LOG(INFO) << "DNNMark suites: Tear down...";
+  return 0;
+}
diff --git a/src/DNNMark/benchmarks/test_fwd_activation/CMakeLists.txt b/src/DNNMark/benchmarks/test_fwd_activation/CMakeLists.txt
new file mode 100644
index 0000000..9f49e41
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_fwd_activation/CMakeLists.txt
@@ -0,0 +1,29 @@
+# Include the DNNMark include path
+include_directories(${DNNMARK_INCLUDES})
+
+# Obtain the name of the directory(also part of the benchmark name)
+get_filename_component(BENCHMARK_DIR ${CMAKE_CURRENT_SOURCE_DIR} NAME)
+
+# Set the benchmark name
+set(BENCHMARK_NAME ${PROJECT_NAME}_${BENCHMARK_DIR})
+
+# Add source file
+file(GLOB_RECURSE BENCHMARK_SRC_FILE RELATIVE
+     ${CMAKE_CURRENT_SOURCE_DIR}
+     *.cc)
+
+# Print out the source file information
+message(STATUS ${BENCHMARK_NAME} ": Benchmark source files: " ${BENCHMARK_SRC_FILE})
+
+# Add benchmark executable
+add_executable(${BENCHMARK_NAME} ${COMMON_SRC} ${BENCHMARK_SRC_FILE})
+
+# Add dependencies of current benchmark
+add_dependencies(${BENCHMARK_NAME} ${PROJECT_NAME})
+add_dependencies(${BENCHMARK_NAME} ${GFLAGS_LIBRARY})
+
+# Link libraries
+target_link_libraries(${BENCHMARK_NAME}
+    ${PROJECT_NAME}
+    ${GFLAGS_LIBRARY})
+
diff --git a/src/DNNMark/benchmarks/test_fwd_activation/test_fwd_activation.cc b/src/DNNMark/benchmarks/test_fwd_activation/test_fwd_activation.cc
new file mode 100644
index 0000000..df13087
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_fwd_activation/test_fwd_activation.cc
@@ -0,0 +1,22 @@
+#include <iostream>
+#include "common.h"
+#include "dnnmark.h"
+#include "usage.h"
+
+using namespace dnnmark;
+
+int main(int argc, char **argv) {
+  INIT_FLAGS(argc, argv);
+  INIT_LOG(argv);
+  LOG(INFO) << "DNNMark suites: Start...";
+  DNNMark<TestType> dnnmark(FLAGS_mmap);
+  dnnmark.ParseGeneralConfig(FLAGS_config);
+  dnnmark.ParseLayerConfig(FLAGS_config);
+  dnnmark.Initialize();
+  dnnmark.Forward();
+  dnnmark.GetTimer()->SumRecords();
+  dnnmark.TearDown();
+  LOG(INFO) << "Total running time(ms): " << dnnmark.GetTimer()->GetTotalTime();
+  LOG(INFO) << "DNNMark suites: Tear down...";
+  return 0;
+}
diff --git a/src/DNNMark/benchmarks/test_fwd_bn/CMakeLists.txt b/src/DNNMark/benchmarks/test_fwd_bn/CMakeLists.txt
new file mode 100644
index 0000000..9f49e41
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_fwd_bn/CMakeLists.txt
@@ -0,0 +1,29 @@
+# Include the DNNMark include path
+include_directories(${DNNMARK_INCLUDES})
+
+# Obtain the name of the directory(also part of the benchmark name)
+get_filename_component(BENCHMARK_DIR ${CMAKE_CURRENT_SOURCE_DIR} NAME)
+
+# Set the benchmark name
+set(BENCHMARK_NAME ${PROJECT_NAME}_${BENCHMARK_DIR})
+
+# Add source file
+file(GLOB_RECURSE BENCHMARK_SRC_FILE RELATIVE
+     ${CMAKE_CURRENT_SOURCE_DIR}
+     *.cc)
+
+# Print out the source file information
+message(STATUS ${BENCHMARK_NAME} ": Benchmark source files: " ${BENCHMARK_SRC_FILE})
+
+# Add benchmark executable
+add_executable(${BENCHMARK_NAME} ${COMMON_SRC} ${BENCHMARK_SRC_FILE})
+
+# Add dependencies of current benchmark
+add_dependencies(${BENCHMARK_NAME} ${PROJECT_NAME})
+add_dependencies(${BENCHMARK_NAME} ${GFLAGS_LIBRARY})
+
+# Link libraries
+target_link_libraries(${BENCHMARK_NAME}
+    ${PROJECT_NAME}
+    ${GFLAGS_LIBRARY})
+
diff --git a/src/DNNMark/benchmarks/test_fwd_bn/test_fwd_bn.cc b/src/DNNMark/benchmarks/test_fwd_bn/test_fwd_bn.cc
new file mode 100644
index 0000000..df13087
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_fwd_bn/test_fwd_bn.cc
@@ -0,0 +1,22 @@
+#include <iostream>
+#include "common.h"
+#include "dnnmark.h"
+#include "usage.h"
+
+using namespace dnnmark;
+
+int main(int argc, char **argv) {
+  INIT_FLAGS(argc, argv);
+  INIT_LOG(argv);
+  LOG(INFO) << "DNNMark suites: Start...";
+  DNNMark<TestType> dnnmark(FLAGS_mmap);
+  dnnmark.ParseGeneralConfig(FLAGS_config);
+  dnnmark.ParseLayerConfig(FLAGS_config);
+  dnnmark.Initialize();
+  dnnmark.Forward();
+  dnnmark.GetTimer()->SumRecords();
+  dnnmark.TearDown();
+  LOG(INFO) << "Total running time(ms): " << dnnmark.GetTimer()->GetTotalTime();
+  LOG(INFO) << "DNNMark suites: Tear down...";
+  return 0;
+}
diff --git a/src/DNNMark/benchmarks/test_fwd_bypass/CMakeLists.txt b/src/DNNMark/benchmarks/test_fwd_bypass/CMakeLists.txt
new file mode 100644
index 0000000..9f49e41
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_fwd_bypass/CMakeLists.txt
@@ -0,0 +1,29 @@
+# Include the DNNMark include path
+include_directories(${DNNMARK_INCLUDES})
+
+# Obtain the name of the directory(also part of the benchmark name)
+get_filename_component(BENCHMARK_DIR ${CMAKE_CURRENT_SOURCE_DIR} NAME)
+
+# Set the benchmark name
+set(BENCHMARK_NAME ${PROJECT_NAME}_${BENCHMARK_DIR})
+
+# Add source file
+file(GLOB_RECURSE BENCHMARK_SRC_FILE RELATIVE
+     ${CMAKE_CURRENT_SOURCE_DIR}
+     *.cc)
+
+# Print out the source file information
+message(STATUS ${BENCHMARK_NAME} ": Benchmark source files: " ${BENCHMARK_SRC_FILE})
+
+# Add benchmark executable
+add_executable(${BENCHMARK_NAME} ${COMMON_SRC} ${BENCHMARK_SRC_FILE})
+
+# Add dependencies of current benchmark
+add_dependencies(${BENCHMARK_NAME} ${PROJECT_NAME})
+add_dependencies(${BENCHMARK_NAME} ${GFLAGS_LIBRARY})
+
+# Link libraries
+target_link_libraries(${BENCHMARK_NAME}
+    ${PROJECT_NAME}
+    ${GFLAGS_LIBRARY})
+
diff --git a/src/DNNMark/benchmarks/test_fwd_bypass/test_fwd_bypass.cc b/src/DNNMark/benchmarks/test_fwd_bypass/test_fwd_bypass.cc
new file mode 100644
index 0000000..df13087
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_fwd_bypass/test_fwd_bypass.cc
@@ -0,0 +1,22 @@
+#include <iostream>
+#include "common.h"
+#include "dnnmark.h"
+#include "usage.h"
+
+using namespace dnnmark;
+
+int main(int argc, char **argv) {
+  INIT_FLAGS(argc, argv);
+  INIT_LOG(argv);
+  LOG(INFO) << "DNNMark suites: Start...";
+  DNNMark<TestType> dnnmark(FLAGS_mmap);
+  dnnmark.ParseGeneralConfig(FLAGS_config);
+  dnnmark.ParseLayerConfig(FLAGS_config);
+  dnnmark.Initialize();
+  dnnmark.Forward();
+  dnnmark.GetTimer()->SumRecords();
+  dnnmark.TearDown();
+  LOG(INFO) << "Total running time(ms): " << dnnmark.GetTimer()->GetTotalTime();
+  LOG(INFO) << "DNNMark suites: Tear down...";
+  return 0;
+}
diff --git a/src/DNNMark/benchmarks/test_fwd_composed_model/CMakeLists.txt b/src/DNNMark/benchmarks/test_fwd_composed_model/CMakeLists.txt
new file mode 100644
index 0000000..2f9828c
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_fwd_composed_model/CMakeLists.txt
@@ -0,0 +1,32 @@
+# Include the DNNMark include path
+include_directories(${DNNMARK_INCLUDES})
+
+# Include the gflag include path
+#include_directories(${})
+
+# Obtain the name of the directory(also part of the benchmark name)
+get_filename_component(BENCHMARK_DIR ${CMAKE_CURRENT_SOURCE_DIR} NAME)
+
+# Set the benchmark name
+set(BENCHMARK_NAME ${PROJECT_NAME}_${BENCHMARK_DIR})
+
+# Add source file
+file(GLOB_RECURSE BENCHMARK_SRC_FILE RELATIVE
+     ${CMAKE_CURRENT_SOURCE_DIR}
+     *.cc)
+
+# Print out the source file information
+message(STATUS ${BENCHMARK_NAME} ": Benchmark source files: " ${BENCHMARK_SRC_FILE})
+
+# Add benchmark executable
+add_executable(${BENCHMARK_NAME} ${COMMON_SRC} ${BENCHMARK_SRC_FILE})
+
+# Add dependencies of current benchmark
+add_dependencies(${BENCHMARK_NAME} ${PROJECT_NAME})
+add_dependencies(${BENCHMARK_NAME} ${GFLAGS_LIBRARY})
+
+# Link libraries
+target_link_libraries(${BENCHMARK_NAME}
+    ${PROJECT_NAME}
+    ${GFLAGS_LIBRARY})
+
diff --git a/src/DNNMark/benchmarks/test_fwd_composed_model/test_fwd_composed_model.cc b/src/DNNMark/benchmarks/test_fwd_composed_model/test_fwd_composed_model.cc
new file mode 100644
index 0000000..b703c4a
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_fwd_composed_model/test_fwd_composed_model.cc
@@ -0,0 +1,21 @@
+#include <iostream>
+#include "common.h"
+#include "dnnmark.h"
+#include "usage.h"
+
+using namespace dnnmark;
+
+int main(int argc, char **argv) {
+  INIT_FLAGS(argc, argv);
+  INIT_LOG(argv);
+  LOG(INFO) << "DNNMark suites: Start...";
+  DNNMark<TestType> dnnmark(3, FLAGS_mmap);
+  dnnmark.ParseAllConfig(FLAGS_config);
+  dnnmark.Initialize();
+  dnnmark.Forward();
+  dnnmark.GetTimer()->SumRecords();
+  dnnmark.TearDown();
+  LOG(INFO) << "Total running time(ms): " << dnnmark.GetTimer()->GetTotalTime();
+  LOG(INFO) << "DNNMark suites: Tear down...";
+  return 0;
+}
diff --git a/src/DNNMark/benchmarks/test_fwd_conv/CMakeLists.txt b/src/DNNMark/benchmarks/test_fwd_conv/CMakeLists.txt
new file mode 100644
index 0000000..2f9828c
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_fwd_conv/CMakeLists.txt
@@ -0,0 +1,32 @@
+# Include the DNNMark include path
+include_directories(${DNNMARK_INCLUDES})
+
+# Include the gflag include path
+#include_directories(${})
+
+# Obtain the name of the directory (also part of the benchmark name)
+get_filename_component(BENCHMARK_DIR ${CMAKE_CURRENT_SOURCE_DIR} NAME)
+
+# Set the benchmark name
+set(BENCHMARK_NAME ${PROJECT_NAME}_${BENCHMARK_DIR})
+
+# Add source file
+file(GLOB_RECURSE BENCHMARK_SRC_FILE RELATIVE
+     ${CMAKE_CURRENT_SOURCE_DIR}
+     *.cc)
+
+# Print out the source file information
+message(STATUS ${BENCHMARK_NAME} ": Benchmark source files: " ${BENCHMARK_SRC_FILE})
+
+# Add benchmark executable
+add_executable(${BENCHMARK_NAME} ${COMMON_SRC} ${BENCHMARK_SRC_FILE})
+
+# Add dependencies of current benchmark
+add_dependencies(${BENCHMARK_NAME} ${PROJECT_NAME})
+add_dependencies(${BENCHMARK_NAME} ${GFLAGS_LIBRARY})
+
+# Link libraries
+target_link_libraries(${BENCHMARK_NAME}
+    ${PROJECT_NAME}
+    ${GFLAGS_LIBRARY})
+
diff --git a/src/DNNMark/benchmarks/test_fwd_conv/test_fwd_conv.cc b/src/DNNMark/benchmarks/test_fwd_conv/test_fwd_conv.cc
new file mode 100644
index 0000000..df13087
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_fwd_conv/test_fwd_conv.cc
@@ -0,0 +1,22 @@
+#include <iostream>
+#include "common.h"
+#include "dnnmark.h"
+#include "usage.h"
+
+using namespace dnnmark;
+
+int main(int argc, char **argv) {
+  INIT_FLAGS(argc, argv);
+  INIT_LOG(argv);
+  LOG(INFO) << "DNNMark suites: Start...";
+  DNNMark<TestType> dnnmark(FLAGS_mmap);
+  dnnmark.ParseGeneralConfig(FLAGS_config);
+  dnnmark.ParseLayerConfig(FLAGS_config);
+  dnnmark.Initialize();
+  dnnmark.Forward();
+  dnnmark.GetTimer()->SumRecords();
+  dnnmark.TearDown();
+  LOG(INFO) << "Total running time(ms): " << dnnmark.GetTimer()->GetTotalTime();
+  LOG(INFO) << "DNNMark suites: Tear down...";
+  return 0;
+}
diff --git a/src/DNNMark/benchmarks/test_fwd_dropout/CMakeLists.txt b/src/DNNMark/benchmarks/test_fwd_dropout/CMakeLists.txt
new file mode 100644
index 0000000..2f9828c
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_fwd_dropout/CMakeLists.txt
@@ -0,0 +1,32 @@
+# Include the DNNMark include path
+include_directories(${DNNMARK_INCLUDES})
+
+# Include the gflag include path
+#include_directories(${})
+
+# Obtain the name of the directory (also part of the benchmark name)
+get_filename_component(BENCHMARK_DIR ${CMAKE_CURRENT_SOURCE_DIR} NAME)
+
+# Set the benchmark name
+set(BENCHMARK_NAME ${PROJECT_NAME}_${BENCHMARK_DIR})
+
+# Add source file
+file(GLOB_RECURSE BENCHMARK_SRC_FILE RELATIVE
+     ${CMAKE_CURRENT_SOURCE_DIR}
+     *.cc)
+
+# Print out the source file information
+message(STATUS ${BENCHMARK_NAME} ": Benchmark source files: " ${BENCHMARK_SRC_FILE})
+
+# Add benchmark executable
+add_executable(${BENCHMARK_NAME} ${COMMON_SRC} ${BENCHMARK_SRC_FILE})
+
+# Add dependencies of current benchmark
+add_dependencies(${BENCHMARK_NAME} ${PROJECT_NAME})
+add_dependencies(${BENCHMARK_NAME} ${GFLAGS_LIBRARY})
+
+# Link libraries
+target_link_libraries(${BENCHMARK_NAME}
+    ${PROJECT_NAME}
+    ${GFLAGS_LIBRARY})
+
diff --git a/src/DNNMark/benchmarks/test_fwd_dropout/test_fwd_dropout.cc b/src/DNNMark/benchmarks/test_fwd_dropout/test_fwd_dropout.cc
new file mode 100644
index 0000000..df13087
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_fwd_dropout/test_fwd_dropout.cc
@@ -0,0 +1,22 @@
+#include <iostream>
+#include "common.h"
+#include "dnnmark.h"
+#include "usage.h"
+
+using namespace dnnmark;
+
+int main(int argc, char **argv) {
+  INIT_FLAGS(argc, argv);
+  INIT_LOG(argv);
+  LOG(INFO) << "DNNMark suites: Start...";
+  DNNMark<TestType> dnnmark(FLAGS_mmap);
+  dnnmark.ParseGeneralConfig(FLAGS_config);
+  dnnmark.ParseLayerConfig(FLAGS_config);
+  dnnmark.Initialize();
+  dnnmark.Forward();
+  dnnmark.GetTimer()->SumRecords();
+  dnnmark.TearDown();
+  LOG(INFO) << "Total running time(ms): " << dnnmark.GetTimer()->GetTotalTime();
+  LOG(INFO) << "DNNMark suites: Tear down...";
+  return 0;
+}
diff --git a/src/DNNMark/benchmarks/test_fwd_fc/CMakeLists.txt b/src/DNNMark/benchmarks/test_fwd_fc/CMakeLists.txt
new file mode 100644
index 0000000..9f49e41
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_fwd_fc/CMakeLists.txt
@@ -0,0 +1,29 @@
+# Include the DNNMark include path
+include_directories(${DNNMARK_INCLUDES})
+
+# Obtain the name of the directory (also part of the benchmark name)
+get_filename_component(BENCHMARK_DIR ${CMAKE_CURRENT_SOURCE_DIR} NAME)
+
+# Set the benchmark name
+set(BENCHMARK_NAME ${PROJECT_NAME}_${BENCHMARK_DIR})
+
+# Add source file
+file(GLOB_RECURSE BENCHMARK_SRC_FILE RELATIVE
+     ${CMAKE_CURRENT_SOURCE_DIR}
+     *.cc)
+
+# Print out the source file information
+message(STATUS ${BENCHMARK_NAME} ": Benchmark source files: " ${BENCHMARK_SRC_FILE})
+
+# Add benchmark executable
+add_executable(${BENCHMARK_NAME} ${COMMON_SRC} ${BENCHMARK_SRC_FILE})
+
+# Add dependencies of current benchmark
+add_dependencies(${BENCHMARK_NAME} ${PROJECT_NAME})
+add_dependencies(${BENCHMARK_NAME} ${GFLAGS_LIBRARY})
+
+# Link libraries
+target_link_libraries(${BENCHMARK_NAME}
+    ${PROJECT_NAME}
+    ${GFLAGS_LIBRARY})
+
diff --git a/src/DNNMark/benchmarks/test_fwd_fc/test_fwd_fc.cc b/src/DNNMark/benchmarks/test_fwd_fc/test_fwd_fc.cc
new file mode 100644
index 0000000..df13087
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_fwd_fc/test_fwd_fc.cc
@@ -0,0 +1,22 @@
+#include <iostream>
+#include "common.h"
+#include "dnnmark.h"
+#include "usage.h"
+
+using namespace dnnmark;
+
+int main(int argc, char **argv) {
+  INIT_FLAGS(argc, argv);
+  INIT_LOG(argv);
+  LOG(INFO) << "DNNMark suites: Start...";
+  DNNMark<TestType> dnnmark(FLAGS_mmap);
+  dnnmark.ParseGeneralConfig(FLAGS_config);
+  dnnmark.ParseLayerConfig(FLAGS_config);
+  dnnmark.Initialize();
+  dnnmark.Forward();
+  dnnmark.GetTimer()->SumRecords();
+  dnnmark.TearDown();
+  LOG(INFO) << "Total running time(ms): " << dnnmark.GetTimer()->GetTotalTime();
+  LOG(INFO) << "DNNMark suites: Tear down...";
+  return 0;
+}
diff --git a/src/DNNMark/benchmarks/test_fwd_lrn/CMakeLists.txt b/src/DNNMark/benchmarks/test_fwd_lrn/CMakeLists.txt
new file mode 100644
index 0000000..9f49e41
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_fwd_lrn/CMakeLists.txt
@@ -0,0 +1,29 @@
+# Include the DNNMark include path
+include_directories(${DNNMARK_INCLUDES})
+
+# Obtain the name of the directory (also part of the benchmark name)
+get_filename_component(BENCHMARK_DIR ${CMAKE_CURRENT_SOURCE_DIR} NAME)
+
+# Set the benchmark name
+set(BENCHMARK_NAME ${PROJECT_NAME}_${BENCHMARK_DIR})
+
+# Add source file
+file(GLOB_RECURSE BENCHMARK_SRC_FILE RELATIVE
+     ${CMAKE_CURRENT_SOURCE_DIR}
+     *.cc)
+
+# Print out the source file information
+message(STATUS ${BENCHMARK_NAME} ": Benchmark source files: " ${BENCHMARK_SRC_FILE})
+
+# Add benchmark executable
+add_executable(${BENCHMARK_NAME} ${COMMON_SRC} ${BENCHMARK_SRC_FILE})
+
+# Add dependencies of current benchmark
+add_dependencies(${BENCHMARK_NAME} ${PROJECT_NAME})
+add_dependencies(${BENCHMARK_NAME} ${GFLAGS_LIBRARY})
+
+# Link libraries
+target_link_libraries(${BENCHMARK_NAME}
+    ${PROJECT_NAME}
+    ${GFLAGS_LIBRARY})
+
diff --git a/src/DNNMark/benchmarks/test_fwd_lrn/test_fwd_lrn.cc b/src/DNNMark/benchmarks/test_fwd_lrn/test_fwd_lrn.cc
new file mode 100644
index 0000000..df13087
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_fwd_lrn/test_fwd_lrn.cc
@@ -0,0 +1,22 @@
+#include <iostream>
+#include "common.h"
+#include "dnnmark.h"
+#include "usage.h"
+
+using namespace dnnmark;
+
+int main(int argc, char **argv) {
+  INIT_FLAGS(argc, argv);
+  INIT_LOG(argv);
+  LOG(INFO) << "DNNMark suites: Start...";
+  DNNMark<TestType> dnnmark(FLAGS_mmap);
+  dnnmark.ParseGeneralConfig(FLAGS_config);
+  dnnmark.ParseLayerConfig(FLAGS_config);
+  dnnmark.Initialize();
+  dnnmark.Forward();
+  dnnmark.GetTimer()->SumRecords();
+  dnnmark.TearDown();
+  LOG(INFO) << "Total running time(ms): " << dnnmark.GetTimer()->GetTotalTime();
+  LOG(INFO) << "DNNMark suites: Tear down...";
+  return 0;
+}
diff --git a/src/DNNMark/benchmarks/test_fwd_pool/CMakeLists.txt b/src/DNNMark/benchmarks/test_fwd_pool/CMakeLists.txt
new file mode 100644
index 0000000..9f49e41
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_fwd_pool/CMakeLists.txt
@@ -0,0 +1,29 @@
+# Include the DNNMark include path
+include_directories(${DNNMARK_INCLUDES})
+
+# Obtain the name of the directory (also part of the benchmark name)
+get_filename_component(BENCHMARK_DIR ${CMAKE_CURRENT_SOURCE_DIR} NAME)
+
+# Set the benchmark name
+set(BENCHMARK_NAME ${PROJECT_NAME}_${BENCHMARK_DIR})
+
+# Add source file
+file(GLOB_RECURSE BENCHMARK_SRC_FILE RELATIVE
+     ${CMAKE_CURRENT_SOURCE_DIR}
+     *.cc)
+
+# Print out the source file information
+message(STATUS ${BENCHMARK_NAME} ": Benchmark source files: " ${BENCHMARK_SRC_FILE})
+
+# Add benchmark executable
+add_executable(${BENCHMARK_NAME} ${COMMON_SRC} ${BENCHMARK_SRC_FILE})
+
+# Add dependencies of current benchmark
+add_dependencies(${BENCHMARK_NAME} ${PROJECT_NAME})
+add_dependencies(${BENCHMARK_NAME} ${GFLAGS_LIBRARY})
+
+# Link libraries
+target_link_libraries(${BENCHMARK_NAME}
+    ${PROJECT_NAME}
+    ${GFLAGS_LIBRARY})
+
diff --git a/src/DNNMark/benchmarks/test_fwd_pool/test_fwd_pool.cc b/src/DNNMark/benchmarks/test_fwd_pool/test_fwd_pool.cc
new file mode 100644
index 0000000..df13087
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_fwd_pool/test_fwd_pool.cc
@@ -0,0 +1,22 @@
+#include <iostream>
+#include "common.h"
+#include "dnnmark.h"
+#include "usage.h"
+
+using namespace dnnmark;
+
+int main(int argc, char **argv) {
+  INIT_FLAGS(argc, argv);
+  INIT_LOG(argv);
+  LOG(INFO) << "DNNMark suites: Start...";
+  DNNMark<TestType> dnnmark(FLAGS_mmap);
+  dnnmark.ParseGeneralConfig(FLAGS_config);
+  dnnmark.ParseLayerConfig(FLAGS_config);
+  dnnmark.Initialize();
+  dnnmark.Forward();
+  dnnmark.GetTimer()->SumRecords();
+  dnnmark.TearDown();
+  LOG(INFO) << "Total running time(ms): " << dnnmark.GetTimer()->GetTotalTime();
+  LOG(INFO) << "DNNMark suites: Tear down...";
+  return 0;
+}
diff --git a/src/DNNMark/benchmarks/test_fwd_softmax/CMakeLists.txt b/src/DNNMark/benchmarks/test_fwd_softmax/CMakeLists.txt
new file mode 100644
index 0000000..9f49e41
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_fwd_softmax/CMakeLists.txt
@@ -0,0 +1,29 @@
+# Include the DNNMark include path
+include_directories(${DNNMARK_INCLUDES})
+
+# Obtain the name of the directory (also part of the benchmark name)
+get_filename_component(BENCHMARK_DIR ${CMAKE_CURRENT_SOURCE_DIR} NAME)
+
+# Set the benchmark name
+set(BENCHMARK_NAME ${PROJECT_NAME}_${BENCHMARK_DIR})
+
+# Add source file
+file(GLOB_RECURSE BENCHMARK_SRC_FILE RELATIVE
+     ${CMAKE_CURRENT_SOURCE_DIR}
+     *.cc)
+
+# Print out the source file information
+message(STATUS ${BENCHMARK_NAME} ": Benchmark source files: " ${BENCHMARK_SRC_FILE})
+
+# Add benchmark executable
+add_executable(${BENCHMARK_NAME} ${COMMON_SRC} ${BENCHMARK_SRC_FILE})
+
+# Add dependencies of current benchmark
+add_dependencies(${BENCHMARK_NAME} ${PROJECT_NAME})
+add_dependencies(${BENCHMARK_NAME} ${GFLAGS_LIBRARY})
+
+# Link libraries
+target_link_libraries(${BENCHMARK_NAME}
+    ${PROJECT_NAME}
+    ${GFLAGS_LIBRARY})
+
diff --git a/src/DNNMark/benchmarks/test_fwd_softmax/test_fwd_softmax.cc b/src/DNNMark/benchmarks/test_fwd_softmax/test_fwd_softmax.cc
new file mode 100644
index 0000000..df13087
--- /dev/null
+++ b/src/DNNMark/benchmarks/test_fwd_softmax/test_fwd_softmax.cc
@@ -0,0 +1,22 @@
+#include <iostream>
+#include "common.h"
+#include "dnnmark.h"
+#include "usage.h"
+
+using namespace dnnmark;
+
+int main(int argc, char **argv) {
+  INIT_FLAGS(argc, argv);
+  INIT_LOG(argv);
+  LOG(INFO) << "DNNMark suites: Start...";
+  DNNMark<TestType> dnnmark(FLAGS_mmap);
+  dnnmark.ParseGeneralConfig(FLAGS_config);
+  dnnmark.ParseLayerConfig(FLAGS_config);
+  dnnmark.Initialize();
+  dnnmark.Forward();
+  dnnmark.GetTimer()->SumRecords();
+  dnnmark.TearDown();
+  LOG(INFO) << "Total running time(ms): " << dnnmark.GetTimer()->GetTotalTime();
+  LOG(INFO) << "DNNMark suites: Tear down...";
+  return 0;
+}
diff --git a/src/DNNMark/benchmarks/usage.cc b/src/DNNMark/benchmarks/usage.cc
new file mode 100644
index 0000000..8df937c
--- /dev/null
+++ b/src/DNNMark/benchmarks/usage.cc
@@ -0,0 +1,34 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Northeastern University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include "usage.h"
+
+DEFINE_string(config, "",
+    "The self defined DNN config file.");
+DEFINE_int32(debuginfo, 0,
+    "The debug info switch to turn on/off debug information.");
+DEFINE_int32(warmup, 0,
+    "The warmup switch to turn on/off warmup phases.");
+DEFINE_int32(iterations, 1,
+    "Number of FWD+BWD passes.");
+DEFINE_string(mmap, "",
+    "The path to a binary data file to use for mmap-ing");
diff --git a/src/DNNMark/benchmarks/usage.h b/src/DNNMark/benchmarks/usage.h
new file mode 100644
index 0000000..263d226
--- /dev/null
+++ b/src/DNNMark/benchmarks/usage.h
@@ -0,0 +1,47 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Northeastern University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#ifndef BENCHMARKS_USAGE_H_
+#define BENCHMARKS_USAGE_H_
+
+#include <gflags/gflags.h>
+
+DECLARE_string(config);
+DECLARE_int32(debuginfo);
+DECLARE_int32(warmup);
+DECLARE_int32(iterations);
+DECLARE_string(mmap);
+
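+// Set the gflags usage message and parse the command line flags.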
+#define INIT_FLAGS(X, Y) \
+gflags::SetUsageMessage(\
+      "\n[DNNMark benchmark usage]\n"\
+      "./<benchmark> <args>\n"\
+      );\
+google::ParseCommandLineFlags(&X, &Y, true)
+
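+// Initialize glog, send logs to stderr when -debuginfo is set, and require
+// that a configuration file was given via -config.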
+#define INIT_LOG(X) \
+google::InitGoogleLogging(X[0]);\
+FLAGS_logtostderr = FLAGS_debuginfo;\
+CHECK_GT(FLAGS_config.size(), 0) << "Configuration file is needed."
+
+#endif // BENCHMARKS_USAGE_H_
+
diff --git a/src/DNNMark/cmake/gflag.cmake b/src/DNNMark/cmake/gflag.cmake
new file mode 100644
index 0000000..900195f
--- /dev/null
+++ b/src/DNNMark/cmake/gflag.cmake
@@ -0,0 +1,16 @@
+include(ExternalProject)
+
+ExternalProject_Add(gflags
+    GIT_REPOSITORY https://github.com/gflags/gflags.git
+    PREFIX "${CMAKE_CURRENT_BINARY_DIR}/gflags"
+# Disable install step
+    INSTALL_COMMAND ""
+)
+
+# Specify include dir
+ExternalProject_Get_Property(gflags SOURCE_DIR)
+set(GFLAGS_INCLUDE_DIRS ${SOURCE_DIR}/gflags/include)
+
+# Specify the gflags library directory
+ExternalProject_Get_Property(gflags binary_dir)
+set(GFLAGS_LIBS_DIR ${binary_dir}/lib)
diff --git a/src/DNNMark/config_example/VGG.dnnmark b/src/DNNMark/config_example/VGG.dnnmark
new file mode 100644
index 0000000..14db38c
--- /dev/null
+++ b/src/DNNMark/config_example/VGG.dnnmark
@@ -0,0 +1,512 @@
+[DNNMark]
+run_mode=composed
+
+[Convolution]
+name=block1_1_conv
+n=100
+c=3
+h=32
+w=32
+previous_layer=null
+conv_mode=convolution
+kernel_size=3
+num_output=64
+pad=1
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+
+[BatchNorm]
+previous_layer=block1_1_conv
+name=block1_1_bn
+batchnorm_mode=per_activation
+save_intermediates=true
+exp_avg_factor=1
+epsilon=2e-5
+
+[Activation]
+previous_layer=block1_1_bn
+name=block1_1_relu
+activation_mode=relu
+
+[Dropout]
+previous_layer=block1_1_relu
+name=dropout1_1
+dropout_probability=.3
+random_seed=0
+
+
+
+[Convolution]
+previous_layer=dropout1_1
+name=block1_2_conv
+conv_mode=convolution
+kernel_size=3
+num_output=64
+pad=1
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+
+[BatchNorm]
+previous_layer=block1_2_conv
+name=block1_2_bn
+batchnorm_mode=per_activation
+save_intermediates=true
+exp_avg_factor=1
+epsilon=2e-5
+
+[Activation]
+previous_layer=block1_2_bn
+name=block1_2_relu
+activation_mode=relu
+
+
+[Pooling]
+previous_layer=block1_2_relu
+name=pool1_2
+pool_mode=max
+kernel_size=2
+pad=0
+stride=2
+
+
+[Convolution]
+previous_layer=pool1_2
+name=block2_1_conv
+conv_mode=convolution
+kernel_size=3
+num_output=128
+pad=1
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+
+[BatchNorm]
+previous_layer=block2_1_conv
+name=block2_1_bn
+batchnorm_mode=per_activation
+save_intermediates=true
+exp_avg_factor=1
+epsilon=2e-5
+
+[Activation]
+previous_layer=block2_1_bn
+name=block2_1_relu
+activation_mode=relu
+
+
+[Dropout]
+previous_layer=block2_1_relu
+name=dropout2_1
+dropout_probability=.4
+random_seed=0
+
+
+[Convolution]
+previous_layer=dropout2_1
+name=block2_2_conv
+conv_mode=convolution
+kernel_size=3
+num_output=128
+pad=1
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+
+[BatchNorm]
+previous_layer=block2_2_conv
+name=block2_2_bn
+batchnorm_mode=per_activation
+save_intermediates=true
+exp_avg_factor=1
+epsilon=2e-5
+
+[Activation]
+previous_layer=block2_2_bn
+name=block2_2_relu
+activation_mode=relu
+
+
+[Pooling]
+previous_layer=block2_2_relu
+name=pool2_2
+pool_mode=max
+kernel_size=2
+pad=0
+stride=2
+
+
+
+
+
+
+[Convolution]
+previous_layer=pool2_2
+name=block3_1_conv
+conv_mode=convolution
+kernel_size=3
+num_output=256
+pad=1
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+
+[BatchNorm]
+previous_layer=block3_1_conv
+name=block3_1_bn
+batchnorm_mode=per_activation
+save_intermediates=true
+exp_avg_factor=1
+epsilon=2e-5
+
+[Activation]
+previous_layer=block3_1_bn
+name=block3_1_relu
+activation_mode=relu
+
+
+[Dropout]
+previous_layer=block3_1_relu
+name=dropout3_1
+dropout_probability=.4
+random_seed=0
+
+
+
+
+[Convolution]
+previous_layer=dropout3_1
+name=block3_2_conv
+conv_mode=convolution
+kernel_size=3
+num_output=256
+pad=1
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+
+[BatchNorm]
+previous_layer=block3_2_conv
+name=block3_2_bn
+batchnorm_mode=per_activation
+save_intermediates=true
+exp_avg_factor=1
+epsilon=2e-5
+
+[Activation]
+previous_layer=block3_2_bn
+name=block3_2_relu
+activation_mode=relu
+
+
+[Dropout]
+previous_layer=block3_2_relu
+name=dropout3_2
+dropout_probability=.4
+random_seed=0
+
+
+
+
+[Convolution]
+previous_layer=dropout3_2
+name=block3_3_conv
+conv_mode=convolution
+kernel_size=3
+num_output=256
+pad=1
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+
+[BatchNorm]
+previous_layer=block3_3_conv
+name=block3_3_bn
+batchnorm_mode=per_activation
+save_intermediates=true
+exp_avg_factor=1
+epsilon=2e-5
+
+[Activation]
+previous_layer=block3_3_bn
+name=block3_3_relu
+activation_mode=relu
+
+
+[Pooling]
+previous_layer=block3_3_relu
+name=pool3_3
+pool_mode=max
+kernel_size=2
+pad=0
+stride=2
+
+
+
+
+
+
+
+[Convolution]
+previous_layer=pool3_3
+name=block4_1_conv
+conv_mode=convolution
+kernel_size=3
+num_output=512
+pad=1
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+
+[BatchNorm]
+previous_layer=block4_1_conv
+name=block4_1_bn
+batchnorm_mode=per_activation
+save_intermediates=true
+exp_avg_factor=1
+epsilon=2e-5
+
+[Activation]
+previous_layer=block4_1_bn
+name=block4_1_relu
+activation_mode=relu
+
+
+[Dropout]
+previous_layer=block4_1_relu
+name=dropout4_1
+dropout_probability=.4
+random_seed=0
+
+
+
+
+[Convolution]
+previous_layer=dropout4_1
+name=block4_2_conv
+conv_mode=convolution
+kernel_size=3
+num_output=512
+pad=1
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+
+[BatchNorm]
+previous_layer=block4_2_conv
+name=block4_2_bn
+batchnorm_mode=per_activation
+save_intermediates=true
+exp_avg_factor=1
+epsilon=2e-5
+
+[Activation]
+previous_layer=block4_2_bn
+name=block4_2_relu
+activation_mode=relu
+
+
+[Dropout]
+previous_layer=block4_2_relu
+name=dropout4_2
+dropout_probability=.4
+random_seed=0
+
+
+
+
+[Convolution]
+previous_layer=dropout4_2
+name=block4_3_conv
+conv_mode=convolution
+kernel_size=3
+num_output=512
+pad=1
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+
+[BatchNorm]
+previous_layer=block4_3_conv
+name=block4_3_bn
+batchnorm_mode=per_activation
+save_intermediates=true
+exp_avg_factor=1
+epsilon=2e-5
+
+[Activation]
+previous_layer=block4_3_bn
+name=block4_3_relu
+activation_mode=relu
+
+
+[Pooling]
+previous_layer=block4_3_relu
+name=pool4_3
+pool_mode=max
+kernel_size=2
+pad=0
+stride=2
+
+
+
+
+
+
+
+[Convolution]
+previous_layer=pool4_3
+name=block5_1_conv
+conv_mode=convolution
+kernel_size=3
+num_output=512
+pad=1
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+
+[BatchNorm]
+previous_layer=block5_1_conv
+name=block5_1_bn
+batchnorm_mode=per_activation
+save_intermediates=true
+exp_avg_factor=1
+epsilon=2e-5
+
+[Activation]
+previous_layer=block5_1_bn
+name=block5_1_relu
+activation_mode=relu
+
+
+[Dropout]
+previous_layer=block5_1_relu
+name=dropout5_1
+dropout_probability=.4
+random_seed=0
+
+
+
+
+[Convolution]
+previous_layer=dropout5_1
+name=block5_2_conv
+conv_mode=convolution
+kernel_size=3
+num_output=512
+pad=1
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+
+[BatchNorm]
+previous_layer=block5_2_conv
+name=block5_2_bn
+batchnorm_mode=per_activation
+save_intermediates=true
+exp_avg_factor=1
+epsilon=2e-5
+
+[Activation]
+previous_layer=block5_2_bn
+name=block5_2_relu
+activation_mode=relu
+
+
+[Dropout]
+previous_layer=block5_2_relu
+name=dropout5_2
+dropout_probability=.4
+random_seed=0
+
+
+
+
+[Convolution]
+previous_layer=dropout5_2
+name=block5_3_conv
+conv_mode=convolution
+kernel_size=3
+num_output=512
+pad=1
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+
+[BatchNorm]
+previous_layer=block5_3_conv
+name=block5_3_bn
+batchnorm_mode=per_activation
+save_intermediates=true
+exp_avg_factor=1
+epsilon=2e-5
+
+[Activation]
+previous_layer=block5_3_bn
+name=block5_3_relu
+activation_mode=relu
+
+
+[Pooling]
+previous_layer=block5_3_relu
+name=pool5_3
+pool_mode=max
+kernel_size=2
+pad=0
+stride=2
+
+
+
+[Dropout]
+previous_layer=pool5_3
+name=dropout5_3
+dropout_probability=.5
+random_seed=0
+
+
+[FullyConnected]
+previous_layer=dropout5_3
+name=fc1
+num_output=512
+
+
+[BatchNorm]
+previous_layer=fc1
+name=bn_fc1
+batchnorm_mode=per_activation
+save_intermediates=true
+exp_avg_factor=1
+epsilon=2e-5
+
+[Activation]
+previous_layer=bn_fc1
+name=relu_fc1
+activation_mode=relu
+
+
+
+[Dropout]
+previous_layer=relu_fc1
+name=dropout_fc1
+dropout_probability=.5
+random_seed=0
+
+
+[FullyConnected]
+previous_layer=dropout_fc1
+name=fc2
+num_output=100
diff --git a/src/DNNMark/config_example/VGG.dnntemplate b/src/DNNMark/config_example/VGG.dnntemplate
new file mode 100644
index 0000000..d38dd45
--- /dev/null
+++ b/src/DNNMark/config_example/VGG.dnntemplate
@@ -0,0 +1,538 @@
+[DNNMark]
+run_mode=composed
+
+[Convolution]
+name=block1_1_conv
+n=$N
+c=3
+h=32
+w=32
+previous_layer=null
+conv_mode=convolution
+kernel_size=3
+num_output=64
+pad=1
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+$CUDNN_CBFA
+$CUDNN_CBDA
+
+[BatchNorm]
+previous_layer=block1_1_conv
+name=block1_1_bn
+batchnorm_mode=per_activation
+save_intermediates=true
+exp_avg_factor=1
+epsilon=2e-5
+
+[Activation]
+previous_layer=block1_1_bn
+name=block1_1_relu
+activation_mode=relu
+
+[Dropout]
+previous_layer=block1_1_relu
+name=dropout1_1
+dropout_probability=.3
+random_seed=0
+
+
+
+[Convolution]
+previous_layer=dropout1_1
+name=block1_2_conv
+conv_mode=convolution
+kernel_size=3
+num_output=64
+pad=1
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+$CUDNN_CBFA
+$CUDNN_CBDA
+
+[BatchNorm]
+previous_layer=block1_2_conv
+name=block1_2_bn
+batchnorm_mode=per_activation
+save_intermediates=true
+exp_avg_factor=1
+epsilon=2e-5
+
+[Activation]
+previous_layer=block1_2_bn
+name=block1_2_relu
+activation_mode=relu
+
+
+[Pooling]
+previous_layer=block1_2_relu
+name=pool1_2
+pool_mode=max
+kernel_size=2
+pad=0
+stride=2
+
+
+[Convolution]
+previous_layer=pool1_2
+name=block2_1_conv
+conv_mode=convolution
+kernel_size=3
+num_output=128
+pad=1
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+$CUDNN_CBFA
+$CUDNN_CBDA
+
+[BatchNorm]
+previous_layer=block2_1_conv
+name=block2_1_bn
+batchnorm_mode=per_activation
+save_intermediates=true
+exp_avg_factor=1
+epsilon=2e-5
+
+[Activation]
+previous_layer=block2_1_bn
+name=block2_1_relu
+activation_mode=relu
+
+
+[Dropout]
+previous_layer=block2_1_relu
+name=dropout2_1
+dropout_probability=.4
+random_seed=0
+
+
+[Convolution]
+previous_layer=dropout2_1
+name=block2_2_conv
+conv_mode=convolution
+kernel_size=3
+num_output=128
+pad=1
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+$CUDNN_CBFA
+$CUDNN_CBDA
+
+[BatchNorm]
+previous_layer=block2_2_conv
+name=block2_2_bn
+batchnorm_mode=per_activation
+save_intermediates=true
+exp_avg_factor=1
+epsilon=2e-5
+
+[Activation]
+previous_layer=block2_2_bn
+name=block2_2_relu
+activation_mode=relu
+
+
+[Pooling]
+previous_layer=block2_2_relu
+name=pool2_2
+pool_mode=max
+kernel_size=2
+pad=0
+stride=2
+
+
+
+
+
+
+[Convolution]
+previous_layer=pool2_2
+name=block3_1_conv
+conv_mode=convolution
+kernel_size=3
+num_output=256
+pad=1
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+$CUDNN_CBFA
+$CUDNN_CBDA
+
+[BatchNorm]
+previous_layer=block3_1_conv
+name=block3_1_bn
+batchnorm_mode=per_activation
+save_intermediates=true
+exp_avg_factor=1
+epsilon=2e-5
+
+[Activation]
+previous_layer=block3_1_bn
+name=block3_1_relu
+activation_mode=relu
+
+
+[Dropout]
+previous_layer=block3_1_relu
+name=dropout3_1
+dropout_probability=.4
+random_seed=0
+
+
+
+
+[Convolution]
+previous_layer=dropout3_1
+name=block3_2_conv
+conv_mode=convolution
+kernel_size=3
+num_output=256
+pad=1
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+$CUDNN_CBFA
+$CUDNN_CBDA
+
+[BatchNorm]
+previous_layer=block3_2_conv
+name=block3_2_bn
+batchnorm_mode=per_activation
+save_intermediates=true
+exp_avg_factor=1
+epsilon=2e-5
+
+[Activation]
+previous_layer=block3_2_bn
+name=block3_2_relu
+activation_mode=relu
+
+
+[Dropout]
+previous_layer=block3_2_relu
+name=dropout3_2
+dropout_probability=.4
+random_seed=0
+
+
+
+
+[Convolution]
+previous_layer=dropout3_2
+name=block3_3_conv
+conv_mode=convolution
+kernel_size=3
+num_output=256
+pad=1
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+$CUDNN_CBFA
+$CUDNN_CBDA
+
+[BatchNorm]
+previous_layer=block3_3_conv
+name=block3_3_bn
+batchnorm_mode=per_activation
+save_intermediates=true
+exp_avg_factor=1
+epsilon=2e-5
+
+[Activation]
+previous_layer=block3_3_bn
+name=block3_3_relu
+activation_mode=relu
+
+
+[Pooling]
+previous_layer=block3_3_relu
+name=pool3_3
+pool_mode=max
+kernel_size=2
+pad=0
+stride=2
+
+
+
+
+
+
+
+[Convolution]
+previous_layer=pool3_3
+name=block4_1_conv
+conv_mode=convolution
+kernel_size=3
+num_output=512
+pad=1
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+$CUDNN_CBFA
+$CUDNN_CBDA
+
+[BatchNorm]
+previous_layer=block4_1_conv
+name=block4_1_bn
+batchnorm_mode=per_activation
+save_intermediates=true
+exp_avg_factor=1
+epsilon=2e-5
+
+[Activation]
+previous_layer=block4_1_bn
+name=block4_1_relu
+activation_mode=relu
+
+
+[Dropout]
+previous_layer=block4_1_relu
+name=dropout4_1
+dropout_probability=.4
+random_seed=0
+
+
+
+
+[Convolution]
+previous_layer=dropout4_1
+name=block4_2_conv
+conv_mode=convolution
+kernel_size=3
+num_output=512
+pad=1
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+$CUDNN_CBFA
+$CUDNN_CBDA
+
+[BatchNorm]
+previous_layer=block4_2_conv
+name=block4_2_bn
+batchnorm_mode=per_activation
+save_intermediates=true
+exp_avg_factor=1
+epsilon=2e-5
+
+[Activation]
+previous_layer=block4_2_bn
+name=block4_2_relu
+activation_mode=relu
+
+
+[Dropout]
+previous_layer=block4_2_relu
+name=dropout4_2
+dropout_probability=.4
+random_seed=0
+
+
+
+
+[Convolution]
+previous_layer=dropout4_2
+name=block4_3_conv
+conv_mode=convolution
+kernel_size=3
+num_output=512
+pad=1
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+$CUDNN_CBFA
+$CUDNN_CBDA
+
+[BatchNorm]
+previous_layer=block4_3_conv
+name=block4_3_bn
+batchnorm_mode=per_activation
+save_intermediates=true
+exp_avg_factor=1
+epsilon=2e-5
+
+[Activation]
+previous_layer=block4_3_bn
+name=block4_3_relu
+activation_mode=relu
+
+
+[Pooling]
+previous_layer=block4_3_relu
+name=pool4_3
+pool_mode=max
+kernel_size=2
+pad=0
+stride=2
+
+
+
+
+
+
+
+[Convolution]
+previous_layer=pool4_3
+name=block5_1_conv
+conv_mode=convolution
+kernel_size=3
+num_output=512
+pad=1
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+$CUDNN_CBFA
+$CUDNN_CBDA
+
+[BatchNorm]
+previous_layer=block5_1_conv
+name=block5_1_bn
+batchnorm_mode=per_activation
+save_intermediates=true
+exp_avg_factor=1
+epsilon=2e-5
+
+[Activation]
+previous_layer=block5_1_bn
+name=block5_1_relu
+activation_mode=relu
+
+
+[Dropout]
+previous_layer=block5_1_relu
+name=dropout5_1
+dropout_probability=.4
+random_seed=0
+
+
+
+
+[Convolution]
+previous_layer=dropout5_1
+name=block5_2_conv
+conv_mode=convolution
+kernel_size=3
+num_output=512
+pad=1
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+$CUDNN_CBFA
+$CUDNN_CBDA
+
+[BatchNorm]
+previous_layer=block5_2_conv
+name=block5_2_bn
+batchnorm_mode=per_activation
+save_intermediates=true
+exp_avg_factor=1
+epsilon=2e-5
+
+[Activation]
+previous_layer=block5_2_bn
+name=block5_2_relu
+activation_mode=relu
+
+
+[Dropout]
+previous_layer=block5_2_relu
+name=dropout5_2
+dropout_probability=.4
+random_seed=0
+
+
+
+
+[Convolution]
+previous_layer=dropout5_2
+name=block5_3_conv
+conv_mode=convolution
+kernel_size=3
+num_output=512
+pad=1
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+$CUDNN_CBFA
+$CUDNN_CBDA
+
+[BatchNorm]
+previous_layer=block5_3_conv
+name=block5_3_bn
+batchnorm_mode=per_activation
+save_intermediates=true
+exp_avg_factor=1
+epsilon=2e-5
+
+[Activation]
+previous_layer=block5_3_bn
+name=block5_3_relu
+activation_mode=relu
+
+
+[Pooling]
+previous_layer=block5_3_relu
+name=pool5_3
+pool_mode=max
+kernel_size=2
+pad=0
+stride=2
+
+
+
+[Dropout]
+previous_layer=pool5_3
+name=dropout5_3
+dropout_probability=.5
+random_seed=0
+
+
+[FullyConnected]
+previous_layer=dropout5_3
+name=fc1
+num_output=512
+
+
+[BatchNorm]
+previous_layer=fc1
+name=bn_fc1
+batchnorm_mode=per_activation
+save_intermediates=true
+exp_avg_factor=1
+epsilon=2e-5
+
+[Activation]
+previous_layer=bn_fc1
+name=relu_fc1
+activation_mode=relu
+
+
+
+[Dropout]
+previous_layer=relu_fc1
+name=dropout_fc1
+dropout_probability=.5
+random_seed=0
+
+
+[FullyConnected]
+previous_layer=dropout_fc1
+name=fc2
+num_output=100
diff --git a/src/DNNMark/config_example/activation_config.dnnmark b/src/DNNMark/config_example/activation_config.dnnmark
new file mode 100644
index 0000000..d4d0753
--- /dev/null
+++ b/src/DNNMark/config_example/activation_config.dnnmark
@@ -0,0 +1,11 @@
+[DNNMark]
+run_mode=standalone
+
+[Activation]
+name=relu1
+n=100
+c=32
+h=256
+w=256
+activation_mode=relu
+
diff --git a/src/DNNMark/config_example/alexnet.dnnmark b/src/DNNMark/config_example/alexnet.dnnmark
new file mode 100644
index 0000000..e32b529
--- /dev/null
+++ b/src/DNNMark/config_example/alexnet.dnnmark
@@ -0,0 +1,165 @@
+[DNNMark]
+run_mode=composed
+
+[Convolution]
+name=conv1
+n=16
+c=3
+h=227
+w=227
+previous_layer=null
+conv_mode=convolution
+propagation=false
+num_output=96
+kernel_size=11
+pad=0
+stride=4
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+
+[Activation]
+name=relu1
+previous_layer=conv1
+activation_mode=relu
+
+[LRN]
+name=lrn1
+previous_layer=relu1
+lrn_mode=cross_channel_dim1
+local_size=5
+alpha=0.0001
+beta=0.75
+k=1.0
+
+[Pooling]
+name=pool1
+previous_layer=lrn1
+pool_mode=max
+kernel_size=3
+pad=0
+stride=2
+
+[Convolution]
+name=conv2
+previous_layer=pool1
+conv_mode=convolution
+num_output=256
+kernel_size=5
+pad=2
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+
+[Activation]
+name=relu2
+previous_layer=conv2
+activation_mode=relu
+
+[LRN]
+name=lrn2
+previous_layer=relu2
+lrn_mode=cross_channel_dim1
+local_size=5
+alpha=0.0001
+beta=0.75
+k=1.0
+
+[Pooling]
+name=pool2
+previous_layer=lrn2
+pool_mode=max
+kernel_size=3
+pad=0
+stride=2
+
+[Convolution]
+name=conv3
+previous_layer=pool2
+conv_mode=convolution
+num_output=384
+kernel_size=3
+pad=1
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+
+[Activation]
+name=relu3
+previous_layer=conv3
+activation_mode=relu
+
+[Convolution]
+name=conv4
+previous_layer=relu3
+conv_mode=convolution
+num_output=384
+kernel_size=3
+pad=1
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+
+[Activation]
+name=relu4
+previous_layer=conv4
+activation_mode=relu
+
+[Convolution]
+name=conv5
+previous_layer=relu4
+conv_mode=convolution
+num_output=256
+kernel_size=3
+pad=1
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+
+[Activation]
+name=relu5
+previous_layer=conv5
+activation_mode=relu
+
+[Pooling]
+name=pool5
+previous_layer=relu5
+pool_mode=max
+kernel_size=3
+pad=0
+stride=2
+
+[FullyConnected]
+name=fc6
+previous_layer=pool5
+num_output=4096
+
+[Activation]
+name=relu6
+previous_layer=fc6
+activation_mode=relu
+
+[FullyConnected]
+name=fc7
+previous_layer=relu6
+num_output=4096
+
+[Activation]
+name=relu7
+previous_layer=fc7
+activation_mode=relu
+
+[FullyConnected]
+name=fc8
+previous_layer=relu7
+num_output=1000
+
+[Softmax]
+name=softmax
+previous_layer=fc8
+softmax_algo=accurate
+softmax_mode=channel
diff --git a/src/DNNMark/config_example/bn_config.dnnmark b/src/DNNMark/config_example/bn_config.dnnmark
new file mode 100644
index 0000000..4a94634
--- /dev/null
+++ b/src/DNNMark/config_example/bn_config.dnnmark
@@ -0,0 +1,13 @@
+[DNNMark]
+run_mode=standalone
+
+[BatchNorm]
+name=batchnorm
+n=100
+c=1000
+h=1
+w=1
+batchnorm_mode=per_activation
+save_intermediates=true
+exp_avg_factor=0.5
+epsilon=1e-5
diff --git a/src/DNNMark/config_example/bypass_config.dnnmark b/src/DNNMark/config_example/bypass_config.dnnmark
new file mode 100644
index 0000000..49a199a
--- /dev/null
+++ b/src/DNNMark/config_example/bypass_config.dnnmark
@@ -0,0 +1,9 @@
+[DNNMark]
+run_mode=standalone
+
+[Bypass]
+name=bypass
+n=100
+c=1000
+h=1
+w=1
diff --git a/src/DNNMark/config_example/composed_model_config.dnnmark b/src/DNNMark/config_example/composed_model_config.dnnmark
new file mode 100644
index 0000000..8837948
--- /dev/null
+++ b/src/DNNMark/config_example/composed_model_config.dnnmark
@@ -0,0 +1,31 @@
+[DNNMark]
+run_mode=composed
+
+[Convolution]
+name=conv1
+n=1
+c=3
+h=32
+w=32
+previous_layer=null
+conv_mode=cross_correlation
+num_output=32
+kernel_size=5
+pad=2
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+
+[Pooling]
+name=pool1
+previous_layer=conv1
+pool_mode=max
+kernel_size=3
+pad=0
+stride=2
+
+[FullyConnected]
+name=relu1
+previous_layer=pool1
+num_output=32
diff --git a/src/DNNMark/config_example/conf_convolution_block.dnnmark b/src/DNNMark/config_example/conf_convolution_block.dnnmark
new file mode 100644
index 0000000..2ab23be
--- /dev/null
+++ b/src/DNNMark/config_example/conf_convolution_block.dnnmark
@@ -0,0 +1,33 @@
+    [DNNMark]
+    run_mode=composed
+
+    [Convolution]
+    name=conv1
+    n=10
+    c=3
+    h=32
+    w=32
+    previous_layer=null
+    conv_mode=convolution
+    num_output=32
+    kernel_size=3
+    pad=1
+    stride=1
+    conv_fwd_pref=fastest
+    conv_bwd_filter_pref=$conv_bwd_filter_pref
+    conv_bwd_data_pref=fastest
+
+
+    [BatchNorm]
+    name=batchnorm1
+    previous_layer=conv1
+    batchnorm_mode=per_activation
+    save_intermediates=true
+    exp_avg_factor=0.5
+    epsilon=1e-5
+
+    [Activation]
+    name=relu1
+    previous_layer=batchnorm1
+    activation_mode=relu
+
diff --git a/src/DNNMark/config_example/conf_convolution_block.dnntemplate b/src/DNNMark/config_example/conf_convolution_block.dnntemplate
new file mode 100644
index 0000000..6e0e8dc
--- /dev/null
+++ b/src/DNNMark/config_example/conf_convolution_block.dnntemplate
@@ -0,0 +1,33 @@
+    [DNNMark]
+    run_mode=composed
+
+    [Convolution]
+    name=conv1
+    n=$N
+    c=$C
+    h=$H
+    w=$W
+    previous_layer=null
+    conv_mode=convolution
+    num_output=$K
+    kernel_size=$S
+    pad=$P
+    stride=$U
+    conv_fwd_pref=fastest
+    conv_bwd_filter_pref=$conv_bwd_filter_pref
+    conv_bwd_data_pref=fastest
+    $CUDNN_CBFA
+    $CUDNN_CBDA
+
+    [BatchNorm]
+    name=batchnorm1
+    previous_layer=conv1
+    batchnorm_mode=per_activation
+    save_intermediates=true
+    exp_avg_factor=0.5
+    epsilon=1e-5
+
+    [Activation]
+    name=relu1
+    previous_layer=batchnorm1
+    activation_mode=relu
diff --git a/src/DNNMark/config_example/conv_config.dnnmark b/src/DNNMark/config_example/conv_config.dnnmark
new file mode 100644
index 0000000..1185fe0
--- /dev/null
+++ b/src/DNNMark/config_example/conv_config.dnnmark
@@ -0,0 +1,19 @@
+[DNNMark]
+run_mode=standalone
+
+[Convolution]
+name=conv1
+n=126
+c=3
+h=256
+w=256
+previous_layer=null
+conv_mode=convolution
+num_output=32
+kernel_size=5
+pad=2
+stride=1
+conv_fwd_pref=fastest
+conv_bwd_filter_pref=fastest
+conv_bwd_data_pref=fastest
+
diff --git a/src/DNNMark/config_example/dropout_config.dnnmark b/src/DNNMark/config_example/dropout_config.dnnmark
new file mode 100644
index 0000000..2a9f448
--- /dev/null
+++ b/src/DNNMark/config_example/dropout_config.dnnmark
@@ -0,0 +1,11 @@
+[DNNMark]
+run_mode=standalone
+
+[Dropout]
+name=dropout
+n=100
+c=1000
+h=1
+w=1
+dropout_probability=.5
+random_seed=0
diff --git a/src/DNNMark/config_example/fc_config.dnnmark b/src/DNNMark/config_example/fc_config.dnnmark
new file mode 100644
index 0000000..da08d57
--- /dev/null
+++ b/src/DNNMark/config_example/fc_config.dnnmark
@@ -0,0 +1,11 @@
+[DNNMark]
+run_mode=standalone
+
+[FullyConnected]
+name=relu1
+n=100
+c=9162
+h=1
+w=1
+num_output=4096
+
diff --git a/src/DNNMark/config_example/lrn_config.dnnmark b/src/DNNMark/config_example/lrn_config.dnnmark
new file mode 100644
index 0000000..da5ff46
--- /dev/null
+++ b/src/DNNMark/config_example/lrn_config.dnnmark
@@ -0,0 +1,16 @@
+[DNNMark]
+run_mode=standalone
+
+[LRN]
+name=lrn1
+n=100
+c=32
+h=256
+w=256
+previous_layer=null
+lrn_mode=cross_channel_dim1
+local_size=5
+alpha=0.001
+beta=0.75
+k=1.0
+
diff --git a/src/DNNMark/config_example/pool_config.dnnmark b/src/DNNMark/config_example/pool_config.dnnmark
new file mode 100644
index 0000000..0302db1
--- /dev/null
+++ b/src/DNNMark/config_example/pool_config.dnnmark
@@ -0,0 +1,15 @@
+[DNNMark]
+run_mode=standalone
+
+[Pooling]
+name=pool1
+n=100
+c=3
+h=256
+w=256
+previous_layer=null
+pool_mode=max
+kernel_size=3
+pad=0
+stride=2
+
diff --git a/src/DNNMark/config_example/softmax_config.dnnmark b/src/DNNMark/config_example/softmax_config.dnnmark
new file mode 100644
index 0000000..1d4a227
--- /dev/null
+++ b/src/DNNMark/config_example/softmax_config.dnnmark
@@ -0,0 +1,11 @@
+[DNNMark]
+run_mode=standalone
+
+[Softmax]
+name=softmax
+n=100
+c=1000
+h=1
+w=1
+softmax_algo=accurate
+softmax_mode=channel
diff --git a/src/DNNMark/core/include/common.h b/src/DNNMark/core/include/common.h
new file mode 100644
index 0000000..2f4f6a3
--- /dev/null
+++ b/src/DNNMark/core/include/common.h
@@ -0,0 +1,182 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Northeastern University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#ifndef CORE_INCLUDE_COMMON_H_
+#define CORE_INCLUDE_COMMON_H_
+
+#include <iostream>
+
+#ifdef NVIDIA_CUDNN
+#include <cuda.h>
+#include <cuda_profiler_api.h>
+#include <curand.h>
+#include <cudnn.h>
+#include <cublas_v2.h>
+#endif
+
+#ifdef AMD_MIOPEN
+#include <miopen/miopen.h>
+#include <hip/hip_runtime_api.h>
+#endif
+
+namespace dnnmark {
+
+#ifdef NVIDIA_CUDNN
+
+#define CUDA_CALL(x) \
+do {\
+  cudaError_t ret = x;\
+  if(ret != cudaSuccess) {\
+    std::cout << "CUDA Error at " << __FILE__ << __LINE__ << std::endl;\
+    std::cout << cudaGetErrorString(ret) << std::endl;\
+    exit(EXIT_FAILURE);\
+  }\
+} while(0)\
+
+#define CURAND_CALL(x) \
+do {\
+  if((x) != CURAND_STATUS_SUCCESS) {\
+    std::cout << "CURAND Error at " << __FILE__ << __LINE__;\
+    exit(EXIT_FAILURE);\
+  }\
+} while(0)\
+
+#define CUDNN_CALL(x) \
+do {\
+  cudnnStatus_t ret = x;\
+  if(ret != CUDNN_STATUS_SUCCESS) {\
+    std::cout << "CUDNN Error at " << __FILE__ << __LINE__;\
+    std::cout << cudnnGetErrorString(ret) << std::endl;\
+    exit(EXIT_FAILURE);\
+  }\
+} while(0)\
+
+#define CUBLAS_CALL(x) \
+do {\
+  if((x) != CUBLAS_STATUS_SUCCESS) {\
+    std::cout << "CUDNN Error at " << __FILE__ << __LINE__;\
+    exit(EXIT_FAILURE);\
+  }\
+} while(0)\
+
+#endif
+
+#ifdef AMD_MIOPEN
+
+#define HIP_CALL(x) \
+do {\
+  hipError_t ret = x;\
+  if(ret != hipSuccess) {\
+    std::cout << "HIP Error at " << __FILE__ << __LINE__ << std::endl;\
+    std::cout << hipGetErrorString(ret) << std::endl;\
+    exit(EXIT_FAILURE);\
+  }\
+} while(0)\
+
+#define MIOPEN_CALL(x) \
+do {\
+  miopenStatus_t ret = x;\
+  if(ret != miopenStatusSuccess) {\
+    std::cout << "MIOpen Error: " << ret << " at " << __FILE__ << __LINE__;\
+    exit(EXIT_FAILURE);\
+  }\
+} while(0)\
+
+#define ROCBLAS_CALL(x) \
+do {\
+  rocblas_status ret = x;\
+  if (ret != rocblas_status_success) {\
+    std::cout << "ROCBLAS Error: " << ret << " at " << __FILE__ << __LINE__;\
+    exit(EXIT_FAILURE);\
+  }\
+} while(0)\
+
+#endif
+
+#define CONFIG_CHECK(x) \
+do {\
+  if ((x) != 0) {\
+    std::cout << "Parse configuration Error at " << __FILE__ << __LINE__;\
+    exit(EXIT_FAILURE);\
+  }\
+} while(0)\
+
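+// Select the scalar type used by the benchmarks at compile time.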
+#ifdef DOUBLE_TEST
+#define TestType double
+#else
+#define TestType float
+#endif
+
+// Code courtesy of Caffe
+template <typename T>
+class DataType;
+template <> class DataType<float>  {
+ public:
+#ifdef NVIDIA_CUDNN
+  static const cudnnDataType_t type = CUDNN_DATA_FLOAT;
+#endif
+#ifdef AMD_MIOPEN
+  static const miopenDataType_t type = miopenFloat;
+#endif
+  static float oneval, zeroval;
+  static const void *one, *zero;
+};
+
+#ifdef NVIDIA_CUDNN
+template <> class DataType<double> {
+ public:
+  static const cudnnDataType_t type = CUDNN_DATA_DOUBLE;
+  static double oneval, zeroval;
+  static const void *one, *zero;
+};
+#endif
+
+// Min epsilon for BN
+#define BN_MIN_EPSILON 1e-5
+
+// Benchmark running mode
+// None: the benchmark hasn't been set up
+// Standalone: only one layer will be benchmarked
+// Composed: multiple layers will be benchmarked, possibly a whole network
+enum RunMode {
+  NONE = 0,
+  STANDALONE,
+  COMPOSED
+};
+
+// Layer type
+enum LayerType {
+  CONVOLUTION = 1,
+  POOLING,
+  LRN,
+  ACTIVATION,
+  FC,
+  SOFTMAX,
+  BN,
+  DROPOUT,
+  BYPASS
+};
+
+} // namespace dnnmark
+
+#endif // CORE_INCLUDE_COMMON_H_
+
diff --git a/src/DNNMark/core/include/data_manager.h b/src/DNNMark/core/include/data_manager.h
new file mode 100644
index 0000000..8a4c10a
--- /dev/null
+++ b/src/DNNMark/core/include/data_manager.h
@@ -0,0 +1,120 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Northeastern University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#ifndef CORE_INCLUDE_DATA_MANAGER_H_
+#define CORE_INCLUDE_DATA_MANAGER_H_
+
+#include <memory>
+#include <map>
+#include <glog/logging.h>
+
+#include "common.h"
+#include "data_png.h"
+
+namespace dnnmark {
+
+template <typename T>
+class Data {
+ private:
+  PseudoNumGenerator *png_;
+  size_t size_;
+  T *gpu_ptr_;
+ public:
+  Data(size_t size)
+  : size_(size) {
+    LOG(INFO) << "Create Data chunk of size " << size_;
+#ifdef NVIDIA_CUDNN
+    CUDA_CALL(cudaMalloc(&gpu_ptr_, size * sizeof(T)));
+#endif
+#ifdef AMD_MIOPEN
+    HIP_CALL(hipMalloc(&gpu_ptr_, size * sizeof(T)));
+#endif
+  }
+  ~Data() {
+    LOG(INFO) << "Free Data chunk of size " << size_;
+    if (size_ > 0) {
+#ifdef NVIDIA_CUDNN
+      CUDA_CALL(cudaFree(gpu_ptr_));
+#endif
+#ifdef AMD_MIOPEN
+      HIP_CALL(hipFree(gpu_ptr_));
+#endif
+    }
+  }
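+  // Fill the GPU buffer using the pseudo-random number generator.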
+  void Filler() {
+    png_ = PseudoNumGenerator::GetInstance();
+    png_->GenerateUniformData(gpu_ptr_, size_);
+  }
+  T *Get() { return gpu_ptr_; }
+};
+
+
+template <typename T>
+class DataManager {
+ private:
+  // Memory pool indexed by chunk id
+  std::map<int, std::shared_ptr<Data<T>>> gpu_data_pool_;
+  int num_data_chunks_;
+
+  // Constructor
+  DataManager()
+  : num_data_chunks_(0) {
+  }
+
+  // Memory manager instance
+  static std::unique_ptr<DataManager<T>> instance_;
+ public:
+  static DataManager<T> *GetInstance() {
+    if (instance_.get())
+      return instance_.get();
+    instance_.reset(new DataManager());
+    return instance_.get();
+  }
+
+  ~DataManager() {
+    gpu_data_pool_.clear();
+  }
+
+  int CreateData(size_t size) {
+    int gen_chunk_id = num_data_chunks_;
+    num_data_chunks_++;
+    gpu_data_pool_.emplace(gen_chunk_id, std::make_shared<Data<T>>(size));
+    LOG(INFO) << "Create data with ID: " << gen_chunk_id;
+    return gen_chunk_id;
+  }
+
+  void RemoveData(int chunk_id) {
+    gpu_data_pool_.erase(chunk_id);
+  }
+
+  Data<T> *GetData(int chunk_id) {
+    return gpu_data_pool_[chunk_id].get();
+  }
+};
+
+template <typename T>
+std::unique_ptr<DataManager<T>> DataManager<T>::instance_ = nullptr;
+
+} // namespace dnnmark
+
+#endif // CORE_INCLUDE_DATA_MANAGER_H_
+
diff --git a/src/DNNMark/core/include/data_png.h b/src/DNNMark/core/include/data_png.h
new file mode 100644
index 0000000..4a8d4d7
--- /dev/null
+++ b/src/DNNMark/core/include/data_png.h
@@ -0,0 +1,169 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Northeastern University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#ifndef CORE_INCLUDE_DATA_PNG_H_
+#define CORE_INCLUDE_DATA_PNG_H_
+
+#include <fcntl.h>
+#include <map>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <vector>
+
+namespace dnnmark {
+
+// Seed of random number generator
+static unsigned long long int seed = 1234;
+
+class PseudoNumGenerator {
+ private:
+#ifdef NVIDIA_CUDNN
+  curandGenerator_t gen_;
+#endif
+
+  // Constructor
+  PseudoNumGenerator(const std::string &mmap_file) :
+    mmap_file_(mmap_file.c_str()), use_mmap(!mmap_file.empty()) {
+#ifdef NVIDIA_CUDNN
+    CURAND_CALL(curandCreateGenerator(&gen_, CURAND_RNG_PSEUDO_DEFAULT));
+    CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen_, seed));
+#endif
+  }
+
+  PseudoNumGenerator() {
+#ifdef NVIDIA_CUDNN
+    CURAND_CALL(curandCreateGenerator(&gen_, CURAND_RNG_PSEUDO_DEFAULT));
+    CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen_, seed));
+#endif
+  }
+
+  // PNG instance
+  static std::unique_ptr<PseudoNumGenerator> instance_;
+  static uint64_t offset;
+  const char *mmap_file_;
+  bool use_mmap;
+ public:
+
+  ~PseudoNumGenerator() {
+#ifdef NVIDIA_CUDNN
+    CURAND_CALL(curandDestroyGenerator(gen_));
+#endif
+  }
+
+  static void CreateInstance(const std::string &mmap_file) {
+    if (instance_.get())
+      return;
+    instance_.reset(new PseudoNumGenerator(mmap_file));
+  }
+
+  static PseudoNumGenerator *GetInstance() {
+    if (instance_.get())
+      return instance_.get();
+    instance_.reset(new PseudoNumGenerator());
+    return instance_.get();
+  }
+  void GenerateUniformData(float *dev_ptr, int size) {
+#ifdef NVIDIA_CUDNN
+    CURAND_CALL(curandGenerateUniform(gen_, dev_ptr, size));
+#endif
+#ifdef AMD_MIOPEN
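+    // MIOpen path: fill a host buffer either from the mmap-ed binary file
+    // (when one is provided) or with host-generated random values, then copy
+    // it to the GPU.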
+    float *host_ptr;
+    if (use_mmap) {
+        int fd = open(mmap_file_, O_RDONLY);
+        LOG_IF(FATAL, size*sizeof(float) > lseek(fd, 0, SEEK_END)) <<
+            "Requested data larger than binary file size";
+
+        if (offset + (size * sizeof(float)) > lseek(fd, 0, SEEK_END)) {
+            LOG(INFO) << "Mapped binary data insufficient, rolling over";
+            offset = 0;
+        }
+
+        host_ptr = (float *)mmap(NULL, size*sizeof(float), PROT_READ, MAP_SHARED,
+                                fd, offset);
+        // Advance the running offset and round it down to a page boundary,
+        // since mmap requires page-aligned offsets.
+        offset += size*sizeof(float);
+        offset -= offset % sysconf(_SC_PAGESIZE);
+        close(fd);
+    } else {
+        host_ptr = new float[size];
+        for (int i = 0; i < size; i++)
+          host_ptr[i] = static_cast <float> (rand()) /
+                        (static_cast <float> (RAND_MAX/seed));
+    }
+
+    HIP_CALL(hipMemcpy(dev_ptr, host_ptr, size * sizeof(float),
+                       hipMemcpyHostToDevice));
+    if (use_mmap) {
+        munmap(host_ptr, size*sizeof(float));
+    } else {
+        delete []host_ptr;
+    }
+
+#endif
+  }
+  void GenerateUniformData(double *dev_ptr, int size) {
+#ifdef NVIDIA_CUDNN
+    CURAND_CALL(curandGenerateUniformDouble(gen_, dev_ptr, size));
+#endif
+#ifdef AMD_MIOPEN
+    double *host_ptr;
+    if (use_mmap) {
+        int fd = open(mmap_file_, O_RDONLY);
+        LOG_IF(FATAL, size*sizeof(double) > lseek(fd, 0, SEEK_END)) <<
+            "Requested data larger than binary file size";
+
+        if (offset + (size * sizeof(double)) > lseek(fd, 0, SEEK_END)) {
+            LOG(INFO) << "Mapped binary data insufficient, rolling over";
+            offset = 0;
+        }
+
+        host_ptr = (double *)mmap(NULL, size*sizeof(double), PROT_READ, MAP_SHARED,
+                                  fd, offset);
+        offset += size*sizeof(double);
+        offset -= offset % sysconf(_SC_PAGESIZE);
+        close(fd);
+    } else {
+        host_ptr = new double[size];
+        for (int i = 0; i < size; i++)
+          host_ptr[i] = static_cast <double> (rand()) /
+                        (static_cast <double> (RAND_MAX/seed));
+    }
+
+    HIP_CALL(hipMemcpy(dev_ptr, host_ptr, size * sizeof(double),
+                       hipMemcpyHostToDevice));
+
+    if (use_mmap) {
+        munmap(host_ptr, size*sizeof(double));
+    } else {
+        delete []host_ptr;
+    }
+
+#endif
+  }
+};
+
+uint64_t PseudoNumGenerator::offset = 0;
+
+std::unique_ptr<PseudoNumGenerator> PseudoNumGenerator::instance_ = nullptr;
+
+} // namespace dnnmark
+
+#endif // CORE_INCLUDE_DATA_PNG_H_
+
diff --git a/src/DNNMark/core/include/dnn_config_keywords.h b/src/DNNMark/core/include/dnn_config_keywords.h
new file mode 100644
index 0000000..723ed41
--- /dev/null
+++ b/src/DNNMark/core/include/dnn_config_keywords.h
@@ -0,0 +1,150 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Northeastern University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#ifndef CORE_INCLUDE_CONFIG_KEYWORDS_H_
+#define CORE_INCLUDE_CONFIG_KEYWORDS_H_
+
+#include <vector>
+#include <string>
+#include <algorithm>
+
+namespace dnnmark {
+
+// Configuration section keywords
+const std::vector<std::string> general_section_keywords = {
+  "[DNNMark]"
+};
+const std::vector<std::string> layer_section_keywords = {
+  "[Convolution]",
+  "[Pooling]",
+  "[LRN]",
+  "[Activation]",
+  "[FullyConnected]",
+  "[Softmax]",
+  "[BatchNorm]",
+  "[Dropout]",
+  "[Bypass]"
+};
+
+// DNNMark keywords
+const std::vector<std::string> dnnmark_config_keywords = {
+  "run_mode"
+};
+
+// Data config keywords
+const std::vector<std::string> data_config_keywords = {
+  "name",
+  "n",
+  "c",
+  "h",
+  "w",
+  "previous_layer"
+};
+
+// Convolution layer keywords
+const std::vector<std::string> conv_config_keywords = {
+  "conv_mode",
+  "algo",
+  "algod",
+  "propagation",
+  "num_output",
+  "kernel_size",
+  "pad",
+  "stride",
+  "kernel_size_h",
+  "kernel_size_w",
+  "pad_h",
+  "pad_w",
+  "stride_h",
+  "stride_w",
+  "conv_fwd_pref",
+  "conv_bwd_filter_pref",
+  "conv_bwd_data_pref"
+};
+
+// Pooling layer keywords
+const std::vector<std::string> pool_config_keywords = {
+  "pool_mode",
+  "kernel_size",
+  "pad",
+  "stride",
+  "kernel_size_h",
+  "kernel_size_w",
+  "pad_h",
+  "pad_w",
+  "stride_h",
+  "stride_w"
+};
+
+// LRN layer keywords
+const std::vector<std::string> lrn_config_keywords = {
+  "lrn_mode",
+  "local_size",
+  "alpha",
+  "beta",
+  "k"
+};
+
+// Activation layer keywords
+const std::vector<std::string> activation_config_keywords = {
+  "activation_mode"
+};
+
+// FC layer keywords
+const std::vector<std::string> fc_config_keywords = {
+  "num_output"
+};
+
+// Softmax layer keywords
+const std::vector<std::string> softmax_config_keywords = {
+  "softmax_algo",
+  "softmax_mode"
+};
+
+// BN layer keywords
+const std::vector<std::string> bn_config_keywords = {
+  "batchnorm_mode",
+  "save_intermediates",
+  "exp_avg_factor",
+  "epsilon"
+};
+
+// DROPOUT layer keywords
+const std::vector<std::string> dropout_config_keywords = {
+  "dropout_probability",
+  "random_seed"
+};
+
+// BYPASS layer keywords
+const std::vector<std::string> bypass_config_keywords = {
+};
+
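+// Helpers (defined elsewhere in DNNMark) for checking whether a config line
+// opens a section and whether a variable is a recognized keyword, e.g.
+// isKeywordExist("kernel_size", conv_config_keywords) returns true.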
+bool isSection(const std::string &s);
+bool isGeneralSection(const std::string &s);
+bool isLayerSection(const std::string &s);
+bool isKeywordExist(const std::string &s,
+                    const std::vector<std::string> &config_keywords);
+
+} // namespace dnnmark
+
+#endif // CORE_INCLUDE_CONFIG_KEYWORDS_H_
+
diff --git a/src/DNNMark/core/include/dnn_layer.h b/src/DNNMark/core/include/dnn_layer.h
new file mode 100644
index 0000000..3345688
--- /dev/null
+++ b/src/DNNMark/core/include/dnn_layer.h
@@ -0,0 +1,198 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Northeastern University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#ifndef CORE_INCLUDE_DNN_LAYER_H_
+#define CORE_INCLUDE_DNN_LAYER_H_
+
+#include <vector>
+#include <glog/logging.h>
+#include "common.h"
+#include "dnn_param.h"
+#include "dnn_utility.h"
+#include "data_manager.h"
+
+namespace dnnmark {
+
+// Forward declaration
+template <typename T> class DNNMark;
+
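+// Base class for all DNNMark layers. It owns the bottom/top tensor
+// descriptors and obtains the corresponding data chunks from the
+// DataManager singleton during Setup().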
+template <typename T>
+class Layer {
+ protected:
+  // DNNMark pointer
+  DNNMark<T> *p_dnnmark_;
+
+  bool has_learnable_params_;
+  LayerType type_;
+  int layer_id_;
+  std::string layer_name_;
+  std::string previous_layer_name_;
+  DataDim input_dim_;
+  DataDim output_dim_;
+  DataTensor<T> bottom_desc_;
+  DataTensor<T> top_desc_;
+  DataManager<T> *data_manager_;
+
+  int num_bottoms_;
+  // Layer bottom data
+  std::vector<Data<T> *> bottoms_;
+  std::vector<int> bottom_chunk_ids_;
+  std::vector<Data<T> *> bottom_diffs_;
+  std::vector<int> bottom_diff_chunk_ids_;
+
+  int num_tops_;
+  // Layer top data
+  std::vector<Data<T> *> tops_;
+  std::vector<int> top_chunk_ids_;
+  std::vector<Data<T> *> top_diffs_;
+  std::vector<int> top_diff_chunk_ids_;
+ public:
+  Layer(DNNMark<T> *p_dnnmark)
+  : p_dnnmark_(p_dnnmark),
+    layer_id_(0), has_learnable_params_(false),
+    input_dim_(), bottom_desc_(),
+    output_dim_(), top_desc_(),
+    num_bottoms_(1), num_tops_(1) {
+    data_manager_ = DataManager<T>::GetInstance();
+  }
+  ~Layer() {
+  }
+  DataDim *getInputDim() { return &input_dim_; }
+  DataDim *getOutputDim() { return &output_dim_; }
+  void setLayerName(const char *layer_name) {
+    layer_name_.assign(layer_name);
+    // Debug info
+    LOG(INFO) << "Layer name: " << layer_name_;
+  }
+  void setPrevLayerName(const char *previous_layer_name) {
+    previous_layer_name_.assign(previous_layer_name);
+    // Debug info
+    LOG(INFO) << "Previous layer: " << previous_layer_name_;
+  }
+  void setLayerId(int layer_id) { layer_id_ = layer_id; }
+  int getLayerId() { return layer_id_; }
+  void setLayerType(LayerType type) { type_ = type; }
+  LayerType getLayerType() { return type_; }
+
+  // Functions used to communicate with the successor layer
+  int getNumTops() { return num_tops_; }
+  int getTopChunkID(int index) { return top_chunk_ids_[index]; }
+  int getTopDiffChunkID(int index) { return top_diff_chunk_ids_[index]; }
+  int getTopDimN() { return output_dim_.n_; }
+  int getTopDimC() { return output_dim_.c_; }
+  int getTopDimH() { return output_dim_.h_; }
+  int getTopDimW() { return output_dim_.w_; }
+
+  // Base layer setup function
+  virtual void Setup() {
+    if (input_dim_.n_ != 0 && input_dim_.c_ != 0 &&
+        input_dim_.h_ != 0 && input_dim_.w_ != 0) {
+      // Debug info
+      LOG(INFO) << "Bottom dimension: "
+                << "N: " << input_dim_.n_ << " "
+                << "C: " << input_dim_.c_ << " "
+                << "H: " << input_dim_.h_ << " "
+                << "W: " << input_dim_.w_;
+      //
+      // Standalone mode or the first layer in composed mode
+      //
+      if (p_dnnmark_->getRunMode() == COMPOSED)
+        if (previous_layer_name_.compare("null")!=0) {
+          LOG(INFO) << "Problems with "<< layer_name_ << " <- "
+                    << previous_layer_name_ << " "
+                    << previous_layer_name_.compare("null");
+          LOG(FATAL) << "When composed as composed mode, the first layer "
+                     << "should set data dimension "
+                     << "and have a <null> previous layer";
+        }
+      // Set bottom tensor
+      bottom_desc_.Set(input_dim_.n_,
+                       input_dim_.c_,
+                       input_dim_.h_,
+                       input_dim_.w_);
+
+      // Prepare bottom data
+      int bottom_size = input_dim_.n_ *
+                        input_dim_.c_ *
+                        input_dim_.h_ *
+                        input_dim_.w_;
+      for (int i = 0; i < num_bottoms_; i++) {
+        bottom_chunk_ids_.push_back(
+          data_manager_->CreateData(bottom_size));
+        bottoms_.push_back(
+          data_manager_->GetData(bottom_chunk_ids_[i]));
+        bottom_diff_chunk_ids_.push_back(
+          data_manager_->CreateData(bottom_size));
+        bottom_diffs_.push_back(
+          data_manager_->GetData(bottom_diff_chunk_ids_[i]));
+      }
+    } else {
+      //
+      // Composed mode
+      //
+      CHECK_EQ(p_dnnmark_->getRunMode(), COMPOSED);
+      if (p_dnnmark_->isLayerExist(previous_layer_name_)) {
+        Layer<T> *previous_layer =
+          p_dnnmark_->GetLayerByName(previous_layer_name_);
+        num_bottoms_ = previous_layer->getNumTops();
+        num_tops_ = num_bottoms_;
+        input_dim_.n_ = previous_layer->getTopDimN();
+        input_dim_.c_ = previous_layer->getTopDimC();
+        input_dim_.h_ = previous_layer->getTopDimH();
+        input_dim_.w_ = previous_layer->getTopDimW();
+
+        // Debug info
+        LOG(INFO) << "Bottom dimension: "
+                  << "N: " << input_dim_.n_ << " "
+                  << "C: " << input_dim_.c_ << " "
+                  << "H: " << input_dim_.h_ << " "
+                  << "W: " << input_dim_.w_;
+
+        // Set bottom tensor
+        bottom_desc_.Set(input_dim_.n_,
+                         input_dim_.c_,
+                         input_dim_.h_,
+                         input_dim_.w_);
+        for (int i = 0; i < num_bottoms_; i++) {
+          bottom_chunk_ids_.push_back(
+            previous_layer->getTopChunkID(i));
+          bottoms_.push_back(
+            data_manager_->GetData(bottom_chunk_ids_[i]));
+          bottom_diff_chunk_ids_.push_back(
+            previous_layer->getTopDiffChunkID(i));
+          bottom_diffs_.push_back(
+            data_manager_->GetData(bottom_diff_chunk_ids_[i]));
+        }
+      } else {
+        LOG(FATAL) << "Wrong previous layer name!!!";
+      }
+    }
+  }
+
+  virtual void ForwardPropagation() {}
+  virtual void BackwardPropagation() {}
+
+};
+
+} // namespace dnnmark
+
+#endif // CORE_INCLUDE_DNN_LAYER_H_
diff --git a/src/DNNMark/core/include/dnn_param.h b/src/DNNMark/core/include/dnn_param.h
new file mode 100644
index 0000000..d8aaa05
--- /dev/null
+++ b/src/DNNMark/core/include/dnn_param.h
@@ -0,0 +1,641 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Northeastern University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#ifndef CORE_INCLUDE_DNN_PARAM_H_
+#define CORE_INCLUDE_DNN_PARAM_H_
+
+#include <iostream>
+#include <string>
+#include <glog/logging.h>
+
+#ifdef NVIDIA_CUDNN
+#include <cudnn.h>
+#endif
+
+#ifdef AMD_MIOPEN
+#include <miopen/miopen.h>
+#endif
+
+#include "common.h"
+#include "dnn_config_keywords.h"
+
+namespace dnnmark {
+
+struct DataDim {
+  int n_;
+  int c_;
+  int h_;
+  int w_;
+
+  DataDim()
+  : n_(0), c_(0), h_(0), w_(0) {}
+};
+
+inline std::ostream &operator<<(std::ostream &os, const DataDim &data_dim) {
+  os << std::endl;
+  os << "[Data Dim] N: " << data_dim.n_ << std::endl;
+  os << "[Data Dim] C: " << data_dim.c_ << std::endl;
+  os << "[Data Dim] H: " << data_dim.h_ << std::endl;
+  os << "[Data Dim] W: " << data_dim.w_ << std::endl;
+  return os;
+}
+
+struct ConvolutionParam {
+#ifdef NVIDIA_CUDNN
+  cudnnConvolutionMode_t mode_;
+#endif
+#ifdef AMD_MIOPEN
+  miopenConvolutionMode_t mode_;
+#endif
+  bool propagation_;
+  int output_num_;
+  int pad_h_;
+  int pad_w_;
+  int stride_u_;
+  int stride_v_;
+  int upscale_x_;
+  int upscale_y_;
+  int kernel_size_h_;
+  int kernel_size_w_;
+  bool algo_set_;
+  std::string algo_;
+  std::string algod_;
+#ifdef NVIDIA_CUDNN
+  cudnnConvolutionFwdPreference_t conv_fwd_pref_;
+  cudnnConvolutionBwdFilterPreference_t conv_bwd_filter_pref_;
+  cudnnConvolutionBwdDataPreference_t conv_bwd_data_pref_;
+#endif
+#ifdef AMD_MIOPEN
+  miopenConvAlgoPerf_t *pref_;
+#endif
+  ConvolutionParam()
+#ifdef NVIDIA_CUDNN
+  : mode_(CUDNN_CROSS_CORRELATION),
+#endif
+#ifdef AMD_MIOPEN
+  : mode_(miopenConvolution),
+#endif
+    output_num_(32),
+    pad_h_(2), pad_w_(2),
+    stride_u_(1), stride_v_(1),
+    upscale_x_(1), upscale_y_(1),
+    kernel_size_h_(5), kernel_size_w_(5), propagation_(true),
+    algo_set_(false), algo_(""), algod_(""),
+#ifdef NVIDIA_CUDNN
+    conv_fwd_pref_(CUDNN_CONVOLUTION_FWD_PREFER_FASTEST),
+    conv_bwd_filter_pref_(CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST),
+    conv_bwd_data_pref_(CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST) {}
+#endif
+#ifdef AMD_MIOPEN
+    pref_(nullptr) {}
+#endif
+
+};
+
+inline std::ostream &operator<<(std::ostream &os,
+                         const ConvolutionParam &conv_param) {
+  os << std::endl;
+  os << "[Convolution Param] Output Num: "
+     << conv_param.output_num_ << std::endl;
+  os << "[Convolution Param] Pad H: "
+     << conv_param.pad_h_ << std::endl;
+  os << "[Convolution Param] Pad W: "
+     << conv_param.pad_w_ << std::endl;
+  os << "[Convolution Param] Stride U: "
+     << conv_param.stride_u_ << std::endl;
+  os << "[Convolution Param] Stride V: "
+     << conv_param.stride_v_ << std::endl;
+  os << "[Convolution Param] Kernel Size H: "
+     << conv_param.kernel_size_h_ << std::endl;
+  os << "[Convolution Param] Kernel Size W: "
+     << conv_param.kernel_size_w_ << std::endl;
+
+  return os;
+}
+
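+// Applies a single (variable, value) pair from a [Convolution] section of
+// the config file to conv_param.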
+inline void SetupConvParam(const std::string &var, const std::string &val,
+                           ConvolutionParam *conv_param) {
+  // Process all the corresponding keywords in config
+  if(isKeywordExist(var, conv_config_keywords)) {
+    if (!var.compare("conv_mode")) {
+      if (!val.compare("convolution"))
+#ifdef NVIDIA_CUDNN
+        conv_param->mode_ = CUDNN_CONVOLUTION;
+#endif
+#ifdef AMD_MIOPEN
+        conv_param->mode_ = miopenTranspose;
+#endif
+      else if (!val.compare("cross_correlation"))
+#ifdef NVIDIA_CUDNN
+        conv_param->mode_ = CUDNN_CROSS_CORRELATION;
+#endif
+#ifdef AMD_MIOPEN
+        conv_param->mode_ = miopenConvolution;
+#endif
+#ifdef AMD_MIOPEN
+      else if (!val.compare("transpose"))
+        conv_param->mode_ = miopenTranspose;
+#endif
+      else
+        LOG(FATAL) << "Invalid conv mode: " << val << std::endl;
+    } else if (!var.compare("num_output")) {
+      conv_param->output_num_ = atoi(val.c_str());
+    } else if (!var.compare("kernel_size")) {
+      conv_param->kernel_size_h_ = atoi(val.c_str());
+      conv_param->kernel_size_w_ = atoi(val.c_str());
+    } else if (!var.compare("pad")) {
+      conv_param->pad_h_ = atoi(val.c_str());
+      conv_param->pad_w_ = atoi(val.c_str());
+    } else if (!var.compare("stride")) {
+      conv_param->stride_u_ = atoi(val.c_str());
+      conv_param->stride_v_ = atoi(val.c_str());
+    } else if (!var.compare("kernel_size_h")) {
+      conv_param->kernel_size_h_ = atoi(val.c_str());
+    } else if (!var.compare("kernel_size_w")) {
+      conv_param->kernel_size_w_ = atoi(val.c_str());
+    } else if (!var.compare("pad_h")) {
+      conv_param->pad_h_ = atoi(val.c_str());
+    } else if (!var.compare("pad_w")) {
+      conv_param->pad_w_ = atoi(val.c_str());
+    } else if (!var.compare("stride_h")) {
+      conv_param->stride_u_ = atoi(val.c_str());
+    } else if (!var.compare("stride_w")) {
+      conv_param->stride_v_ = atoi(val.c_str());
+    } else if (!var.compare("propagation")) {
+      if (!val.compare("false"))
+        conv_param->propagation_ = false;
+    } else if (!var.compare("algo")) {
+        conv_param->algo_set_ = true;
+        conv_param->algo_ = val;
+    } else if (!var.compare("algod")) {
+        conv_param->algod_ = val;
+    } else if (!var.compare("conv_fwd_pref")) {
+#ifdef NVIDIA_CUDNN
+      if (!val.compare("no_workspace"))
+        conv_param->conv_fwd_pref_ = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+      else if (!val.compare("fastest"))
+        conv_param->conv_fwd_pref_ = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
+      else if (!val.compare("specify_workspace_limit"))
+        conv_param->conv_fwd_pref_ =
+          CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
+#endif
+    } else if (!var.compare("conv_bwd_filter_pref")) {
+#ifdef NVIDIA_CUDNN
+      if (!val.compare("no_workspace"))
+        conv_param->conv_bwd_filter_pref_ =
+          CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
+      else if (!val.compare("fastest"))
+        conv_param->conv_bwd_filter_pref_ =
+          CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
+      else if (!val.compare("specify_workspace_limit"))
+        conv_param->conv_bwd_filter_pref_ =
+          CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
+#endif
+    } else if (!var.compare("conv_bwd_data_pref")) {
+#ifdef NVIDIA_CUDNN
+      if (!val.compare("no_workspace"))
+        conv_param->conv_bwd_data_pref_ =
+          CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE;
+      else if (!val.compare("fastest"))
+        conv_param->conv_bwd_data_pref_ =
+          CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
+      else if (!val.compare("specify_workspace_limit"))
+        conv_param->conv_bwd_data_pref_ =
+          CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
+#endif
+    }
+  } else {
+    LOG(FATAL) << var << ": Keywords not exists" << std::endl;
+  }
+}
+
+struct PoolingParam {
+#ifdef NVIDIA_CUDNN
+  cudnnPoolingMode_t mode_;
+#endif
+#ifdef AMD_MIOPEN
+  miopenPoolingMode_t mode_;
+#endif
+  int pad_h_;
+  int pad_w_;
+  int stride_h_;
+  int stride_w_;
+  int kernel_size_h_;
+  int kernel_size_w_;
+  PoolingParam()
+#ifdef NVIDIA_CUDNN
+  : mode_(CUDNN_POOLING_MAX),
+#endif
+#ifdef AMD_MIOPEN
+  : mode_(miopenPoolingMax),
+#endif
+    pad_h_(0), pad_w_(0),
+    stride_h_(2), stride_w_(2),
+    kernel_size_h_(3), kernel_size_w_(3) {}
+};
+
+inline std::ostream &operator<<(std::ostream &os,
+                         const PoolingParam &pool_param) {
+  os << std::endl;
+  os << "[Pooling Param] Pad H: "
+     << pool_param.pad_h_ << std::endl;
+  os << "[Pooling Param] Pad W: "
+     << pool_param.pad_w_ << std::endl;
+  os << "[Pooling Param] Stride H: "
+     << pool_param.stride_h_ << std::endl;
+  os << "[Pooling Param] Stride W: "
+     << pool_param.stride_w_ << std::endl;
+  os << "[Pooling Param] Kernel Size H: "
+     << pool_param.kernel_size_h_ << std::endl;
+  os << "[Pooling Param] Kernel Size W: "
+     << pool_param.kernel_size_w_ << std::endl;
+
+  return os;
+}
+
+inline void SetupPoolingParam(const std::string &var, const std::string &val,
+                              PoolingParam *pool_param) {
+  // Process all the keywords in config
+  if(isKeywordExist(var, pool_config_keywords)) {
+    if (!var.compare("pool_mode")) {
+      if (!val.compare("max"))
+#ifdef NVIDIA_CUDNN
+        pool_param->mode_ = CUDNN_POOLING_MAX;
+#endif
+#ifdef AMD_MIOPEN
+        pool_param->mode_ = miopenPoolingMax;
+#endif
+#ifdef NVIDIA_CUDNN
+      else if (!val.compare("avg_include_padding"))
+        pool_param->mode_ = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
+      else if (!val.compare("avg_exclude_padding"))
+        pool_param->mode_ = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+#endif
+#ifdef AMD_MIOPEN
+      else if (!val.compare("avg"))
+        pool_param->mode_ = miopenPoolingAverage;
+#endif
+      else
+        LOG(FATAL) << "Invalid pool mode" << std::endl;
+    } else if (!var.compare("kernel_size")) {
+      pool_param->kernel_size_h_ = atoi(val.c_str());
+      pool_param->kernel_size_w_ = atoi(val.c_str());
+    } else if (!var.compare("pad")) {
+      pool_param->pad_h_ = atoi(val.c_str());
+      pool_param->pad_w_ = atoi(val.c_str());
+    } else if (!var.compare("stride")) {
+      pool_param->stride_h_ = atoi(val.c_str());
+      pool_param->stride_w_ = atoi(val.c_str());
+    } else if (!var.compare("kernel_size_h")) {
+      pool_param->kernel_size_h_ = atoi(val.c_str());
+    } else if (!var.compare("kernel_size_w")) {
+      pool_param->kernel_size_w_ = atoi(val.c_str());
+    } else if (!var.compare("pad_h")) {
+      pool_param->pad_h_ = atoi(val.c_str());
+    } else if (!var.compare("pad_w")) {
+      pool_param->pad_w_ = atoi(val.c_str());
+    } else if (!var.compare("stride_h")) {
+      pool_param->stride_h_ = atoi(val.c_str());
+    } else if (!var.compare("stride_w")) {
+      pool_param->stride_w_ = atoi(val.c_str());
+    }
+  } else {
+    LOG(FATAL) << var << ": Keywords not exists" << std::endl;
+  }
+}
+
+struct LRNParam {
+#ifdef NVIDIA_CUDNN
+  cudnnLRNMode_t mode_;
+#endif
+#ifdef AMD_MIOPEN
+  miopenLRNMode_t mode_;
+#endif
+  int local_size_;
+  double alpha_;
+  double beta_;
+  double k_;
+  LRNParam()
+#ifdef NVIDIA_CUDNN
+  : mode_(CUDNN_LRN_CROSS_CHANNEL_DIM1),
+#endif
+#ifdef AMD_MIOPEN
+  : mode_(miopenLRNCrossChannel),
+#endif
+    local_size_(5),
+    alpha_(0.0001), beta_(0.75), k_(2.0) {}
+};
+
+inline std::ostream &operator<<(std::ostream &os,
+                         const LRNParam &lrn_param) {
+  os << std::endl;
+  os << "[LRN Param] Local size: "
+     << lrn_param.local_size_ << std::endl;
+  os << "[LRN Param] Alpha: "
+     << lrn_param.alpha_ << std::endl;
+  os << "[LRN Param] Beta: "
+     << lrn_param.beta_ << std::endl;
+  os << "[LRN Param] K: "
+     << lrn_param.k_ << std::endl;
+
+  return os;
+}
+
+inline void SetupLrnParam(const std::string &var, const std::string &val,
+                          LRNParam *lrn_param) {
+  // Process all the keywords in config
+  if(isKeywordExist(var, lrn_config_keywords)) {
+    if (!var.compare("lrn_mode")) {
+      if (!val.compare("cross_channel_dim1"))
+#ifdef NVIDIA_CUDNN
+        lrn_param->mode_ = CUDNN_LRN_CROSS_CHANNEL_DIM1;
+#endif
+#ifdef AMD_MIOPEN
+        lrn_param->mode_ = miopenLRNCrossChannel;
+      else if (!val.compare("within_channel"))
+        lrn_param->mode_ = miopenLRNWithinChannel;
+#endif
+      else
+        LOG(FATAL) << "Invalid lrn mode" << std::endl;
+    } else if (!var.compare("local_size")) {
+      lrn_param->local_size_ = atoi(val.c_str());
+    } else if (!var.compare("alpha")) {
+      lrn_param->alpha_ = atof(val.c_str());
+    } else if (!var.compare("beta")) {
+      lrn_param->beta_ = atof(val.c_str());
+    } else if (!var.compare("k")) {
+      lrn_param->k_ = atof(val.c_str());
+    }
+  } else {
+    LOG(FATAL) << var << ": Keywords not exists" << std::endl;
+  }
+}
+
+struct ActivationParam {
+#ifdef NVIDIA_CUDNN
+  cudnnActivationMode_t mode_;
+  ActivationParam()
+  : mode_(CUDNN_ACTIVATION_RELU) {}
+#endif
+#ifdef AMD_MIOPEN
+  miopenActivationMode_t mode_;
+  double alpha_;
+  double beta_;
+  double power_;
+  ActivationParam()
+  : mode_(miopenActivationRELU),
+    alpha_(0.0), beta_(0.0), power_(0.0) {}
+#endif
+};
+
+inline std::ostream &operator<<(std::ostream &os,
+                                const ActivationParam &activation_param) {
+  os << std::endl;
+  os << "[Activation Param] Mode: "
+     << activation_param.mode_ << std::endl;
+  return os;
+}
+
+inline void SetupActivationParam(const std::string &var, const std::string &val,
+                                 ActivationParam *activation_param) {
+  // Process all the keywords in config
+  if(isKeywordExist(var, activation_config_keywords)) {
+    if (!var.compare("activation_mode")) {
+#ifdef NVIDIA_CUDNN
+      if (!val.compare("sigmoid"))
+        activation_param->mode_ = CUDNN_ACTIVATION_SIGMOID;
+      else if (!val.compare("relu"))
+        activation_param->mode_ = CUDNN_ACTIVATION_RELU;
+      else if (!val.compare("tanh"))
+        activation_param->mode_ = CUDNN_ACTIVATION_TANH;
+      else if (!val.compare("clipped_relu"))
+        activation_param->mode_ = CUDNN_ACTIVATION_CLIPPED_RELU;
+      else
+        LOG(FATAL) << "Invalid activation mode" << std::endl;
+#endif
+#ifdef AMD_MIOPEN
+      if (!val.compare("sigmoid"))
+        activation_param->mode_ = miopenActivationLOGISTIC;
+      else if (!val.compare("relu"))
+        activation_param->mode_ = miopenActivationRELU;
+      else if (!val.compare("tanh"))
+        activation_param->mode_ = miopenActivationTANH;
+      else if (!val.compare("soft_relu"))
+        activation_param->mode_ = miopenActivationSOFTRELU;
+      else
+        LOG(FATAL) << "Invalid activation mode" << std::endl;
+#endif
+    }
+  } else {
+    LOG(FATAL) << var << ": Keywords not exists" << std::endl;
+  }
+}
+
+struct FullyConnectedParam {
+  int output_num_;
+  FullyConnectedParam()
+  : output_num_(4096) {}
+};
+
+inline void SetupFcParam(const std::string &var, const std::string &val,
+                         FullyConnectedParam *fc_param) {
+  // Process all the keywords in config
+  if(isKeywordExist(var, fc_config_keywords)) {
+    if (!var.compare("num_output")) {
+      fc_param->output_num_ = atoi(val.c_str());
+    }
+  } else {
+    LOG(FATAL) << var << ": Keywords not exists" << std::endl;
+  }
+}
+
+struct SoftmaxParam {
+#ifdef NVIDIA_CUDNN
+  cudnnSoftmaxAlgorithm_t algo_;
+  cudnnSoftmaxMode_t mode_;
+  SoftmaxParam()
+  : algo_(CUDNN_SOFTMAX_ACCURATE),
+    mode_(CUDNN_SOFTMAX_MODE_CHANNEL) {}
+#endif
+#ifdef AMD_MIOPEN
+  SoftmaxParam() {}
+#endif
+};
+
+inline void SetupSoftmaxParam(const std::string &var, const std::string &val,
+                              SoftmaxParam *softmax_param) {
+  // Process all the keywords in config
+  if(isKeywordExist(var, softmax_config_keywords)) {
+    if (!var.compare("softmax_algo")) {
+#ifdef NVIDIA_CUDNN
+      if (!val.compare("fast"))
+        softmax_param->algo_ = CUDNN_SOFTMAX_FAST;
+      else if (!val.compare("accurate"))
+        softmax_param->algo_ = CUDNN_SOFTMAX_ACCURATE;
+      else if (!val.compare("log"))
+        softmax_param->algo_ = CUDNN_SOFTMAX_LOG;
+#endif
+    }
+    if (!var.compare("softmax_mode")) {
+#ifdef NVIDIA_CUDNN
+      if (!val.compare("instance"))
+        softmax_param->mode_ = CUDNN_SOFTMAX_MODE_INSTANCE;
+      else if (!val.compare("channel"))
+        softmax_param->mode_ = CUDNN_SOFTMAX_MODE_CHANNEL;
+#endif
+    }
+  } else {
+    LOG(FATAL) << var << ": Keywords not exists" << std::endl;
+  }
+}
+
+enum BatchNormMode {
+#ifdef NVIDIA_CUDNN
+  PerActivation = CUDNN_BATCHNORM_PER_ACTIVATION,
+  Spatial = CUDNN_BATCHNORM_SPATIAL
+#endif
+#ifdef AMD_MIOPEN
+  PerActivation = miopenBNPerActivation,
+  Spatial = miopenBNSpatial
+#endif
+};
+
+struct BatchNormParam {
+#ifdef NVIDIA_CUDNN
+  cudnnBatchNormMode_t mode_;
+#endif
+#ifdef AMD_MIOPEN
+  miopenBatchNormMode_t mode_;
+#endif
+  bool save_intermediates_;
+  double exp_avg_factor_;
+  double epsilon_;
+  BatchNormParam()
+#ifdef NVIDIA_CUDNN
+  : mode_((cudnnBatchNormMode_t)PerActivation),
+#endif
+#ifdef AMD_MIOPEN
+  : mode_((miopenBatchNormMode_t)PerActivation),
+#endif
+    save_intermediates_(true),
+    exp_avg_factor_(1),
+    epsilon_(BN_MIN_EPSILON) {}
+};
+
+inline std::ostream &operator<<(std::ostream &os,
+                                const BatchNormParam &bn_param) {
+  os << std::endl;
+  os << "[BatchNorm Param] Mode: "
+     << bn_param.mode_ << std::endl;
+  os << "[BatchNorm Param] Save Intermediates: "
+     << bn_param.save_intermediates_ << std::endl;
+  os << "[BatchNorm Param] Exponential Average Factor: "
+     << bn_param.exp_avg_factor_ << std::endl;
+  os << "[BatchNorm Param] Epsilon: "
+     << bn_param.epsilon_ << std::endl;
+  return os;
+}
+
+inline void SetupBatchNormParam(const std::string &var, const std::string &val,
+                                BatchNormParam *bn_param) {
+  // Process all the keywords in config
+  if(isKeywordExist(var, bn_config_keywords)) {
+    if(!var.compare("batchnorm_mode")) {
+      if(!val.compare("per_activation"))
+#ifdef NVIDIA_CUDNN
+        bn_param->mode_ = (cudnnBatchNormMode_t)PerActivation;
+#endif
+#ifdef AMD_MIOPEN
+        bn_param->mode_ = (miopenBatchNormMode_t)PerActivation;
+#endif
+      else if (!val.compare("spatial"))
+#ifdef NVIDIA_CUDNN
+        bn_param->mode_ = (cudnnBatchNormMode_t)Spatial;
+#endif
+#ifdef AMD_MIOPEN
+        bn_param->mode_ = (miopenBatchNormMode_t)Spatial;
+#endif
+    }
+    if(!var.compare("save_intermediates")) {
+      if(!val.compare("true"))
+        bn_param->save_intermediates_ = true;
+      else if (!val.compare("false"))
+        bn_param->save_intermediates_ = false;
+    }
+    if(!var.compare("exp_avg_factor")) {
+      bn_param->exp_avg_factor_ = atof(val.c_str());
+    }
+    if(!var.compare("epsilon")) {
+      bn_param->epsilon_ = atof(val.c_str());
+    }
+  } else {
+    LOG(FATAL) << var << ": Keywords not exists" << std::endl;
+  }
+}
+
+struct DropoutParam {
+  float dropout_p_;
+  unsigned long long random_seed_;
+  DropoutParam()
+  : dropout_p_(.5),
+    random_seed_(0) {}
+};
+
+inline std::ostream &operator<<(std::ostream &os,
+                                const DropoutParam &dropout_param) {
+  os << std::endl;
+  os << "[Dropout Param] Dropout Probability: "
+     << dropout_param.dropout_p_ << std::endl;
+  os << "[Dropout Param] Random Seed: "
+     << dropout_param.random_seed_ << std::endl;
+  return os;
+}
+
+inline void SetupDropoutParam(const std::string &var, const std::string &val,
+                              DropoutParam * dropout_param) {
+  // Process all the keywords in config
+  if(isKeywordExist(var, dropout_config_keywords)) {
+    if(!var.compare("dropout_probability")) {
+      dropout_param->dropout_p_ = atof(val.c_str());
+    }
+    if(!var.compare("random_seed")) {
+      dropout_param->random_seed_ = atoi(val.c_str());
+    }
+  } else {
+    LOG(FATAL) << var << ": Keywords not exists" << std::endl;
+  }
+}
+
+struct BypassParam {
+  BypassParam() {}
+};
+
+inline std::ostream &operator<<(std::ostream &os,
+                                const BypassParam &bypass_param) {
+  os << std::endl;
+  os << "[Bypass Param]" << std::endl;
+  return os;
+}
+
+} // namespace dnnmark
+
+#endif // CORE_INCLUDE_DNN_PARAM_H_
diff --git a/src/DNNMark/core/include/dnn_utility.h b/src/DNNMark/core/include/dnn_utility.h
new file mode 100644
index 0000000..e271b3d
--- /dev/null
+++ b/src/DNNMark/core/include/dnn_utility.h
@@ -0,0 +1,1107 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Northeastern University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#ifndef CORE_INCLUDE_DNN_UTILITY_H_
+#define CORE_INCLUDE_DNN_UTILITY_H_
+
+#include <iostream>
+#include <map>
+#include <string>
+
+#ifdef NVIDIA_CUDNN
+#include "cudnn.h"
+#endif
+
+#ifdef AMD_MIOPEN
+#include <miopen/miopen.h>
+#include <rocblas.h>
+#endif
+
+#include "common.h"
+#include "dnn_param.h"
+#include "timer.h"
+
+namespace dnnmark {
+
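+// Wraps the backend library handles (cuDNN/cuBLAS or MIOpen/rocBLAS).
+// Composed mode selects a handle by index, while standalone mode uses the
+// zero-argument getters (see the mode == COMPOSED ? ... patterns below).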
+class Handle {
+#ifdef NVIDIA_CUDNN
+ private:
+  cudnnHandle_t *cudnn_handles_;
+  cublasHandle_t *blas_handles_;
+  int num_cudnn_handles_;
+  int num_blas_handles_;
+ public:
+  Handle();
+  Handle(int num);
+  ~Handle();
+  cudnnHandle_t GetCudnn() const;
+  cudnnHandle_t GetCudnn(int index) const;
+  cublasHandle_t GetBlas() const;
+  cublasHandle_t GetBlas(int index) const;
+  int num_cudnn() const { return num_cudnn_handles_; }
+  int num_blas() const { return num_blas_handles_; }
+#endif
+#ifdef AMD_MIOPEN
+ private:
+  miopenHandle_t *miopen_handles_;
+  rocblas_handle *rocblas_handles_;
+  int num_miopen_handles_;
+  int num_rocblas_handles_;
+ public:
+  Handle();
+  Handle(int num);
+  ~Handle();
+  miopenHandle_t GetMIOpen() const;
+  miopenHandle_t GetMIOpen(int index) const;
+  rocblas_handle GetBlas() const;
+  rocblas_handle GetBlas(int index) const;
+  int num_miopen() const { return num_miopen_handles_; }
+  int num_blas() const { return num_rocblas_handles_; }
+#endif
+};
+
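+// Common base for the descriptor wrappers below; tracks whether the
+// underlying cuDNN/MIOpen descriptor has already been populated via Set().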
+class Descriptor {
+ protected:
+  bool set_;
+ public:
+  Descriptor();
+  ~Descriptor();
+  bool isSet();
+};
+
+template <typename T>
+class DataTensor : public Descriptor {
+ private:
+#ifdef NVIDIA_CUDNN
+  cudnnTensorDescriptor_t desc_;
+#endif
+#ifdef AMD_MIOPEN
+  miopenTensorDescriptor_t desc_;
+#endif
+
+ public:
+  DataTensor()
+  : Descriptor() {
+#ifdef NVIDIA_CUDNN
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&desc_));
+#endif
+#ifdef AMD_MIOPEN
+    MIOPEN_CALL(miopenCreateTensorDescriptor(&desc_));
+#endif
+  }
+
+  ~DataTensor() {
+#ifdef NVIDIA_CUDNN
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(desc_));
+#endif
+#ifdef AMD_MIOPEN
+    MIOPEN_CALL(miopenDestroyTensorDescriptor(desc_));
+#endif
+  }
+
+  void Set(int n, int c, int h, int w) {
+    if (!set_) {
+#ifdef NVIDIA_CUDNN
+      CUDNN_CALL(cudnnSetTensor4dDescriptor(desc_,
+                                            CUDNN_TENSOR_NCHW,
+                                            DataType<T>::type,
+                                            n, c, h, w));
+#endif
+#ifdef AMD_MIOPEN
+      MIOPEN_CALL(miopenSet4dTensorDescriptor(desc_,
+                                              DataType<T>::type,
+                                              n, c, h, w));
+#endif
+    }
+    set_ = true;
+  }
+
+#ifdef NVIDIA_CUDNN
+  cudnnTensorDescriptor_t Get() const {
+#endif
+#ifdef AMD_MIOPEN
+  miopenTensorDescriptor_t Get() const {
+#endif
+    if (set_)
+      return desc_;
+    return nullptr;
+  }
+
+};
+
+template <typename T>
+class ConvolutionDesc : public Descriptor {
+ private:
+#ifdef NVIDIA_CUDNN
+  cudnnFilterDescriptor_t filter_desc_;
+  cudnnConvolutionDescriptor_t conv_desc_;
+#endif
+#ifdef AMD_MIOPEN
+  miopenTensorDescriptor_t filter_desc_;
+  miopenConvolutionDescriptor_t conv_desc_;
+#endif
+
+ public:
+  ConvolutionDesc()
+  : Descriptor() {
+#ifdef NVIDIA_CUDNN
+    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&conv_desc_));
+    CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_));
+#endif
+#ifdef AMD_MIOPEN
+    MIOPEN_CALL(miopenCreateConvolutionDescriptor(&conv_desc_));
+    MIOPEN_CALL(miopenCreateTensorDescriptor(&filter_desc_));
+#endif
+  }
+
+  ~ConvolutionDesc() {
+#ifdef NVIDIA_CUDNN
+    CUDNN_CALL(cudnnDestroyConvolutionDescriptor(conv_desc_));
+    CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_));
+#endif
+#ifdef AMD_MIOPEN
+    MIOPEN_CALL(miopenDestroyConvolutionDescriptor(conv_desc_));
+    MIOPEN_CALL(miopenDestroyTensorDescriptor(filter_desc_));
+#endif
+  }
+
+  void Set(const ConvolutionParam &param, int num_channel) {
+    if (!set_) {
+#ifdef NVIDIA_CUDNN
+      CUDNN_CALL(cudnnSetConvolution2dDescriptor(conv_desc_,
+                 param.pad_h_, param.pad_w_,
+                 param.stride_u_, param.stride_v_,
+                 param.upscale_x_, param.upscale_y_,
+                 param.mode_, DataType<T>::type));
+
+      CUDNN_CALL(cudnnSetFilter4dDescriptor(filter_desc_,
+                 DataType<T>::type, CUDNN_TENSOR_NCHW,
+                 param.output_num_, num_channel,
+                 param.kernel_size_h_, param.kernel_size_w_));
+#endif
+#ifdef AMD_MIOPEN
+      MIOPEN_CALL(miopenInitConvolutionDescriptor(conv_desc_,
+                 param.mode_,
+                 param.pad_h_, param.pad_w_,
+                 param.stride_u_, param.stride_v_,
+                 param.upscale_x_, param.upscale_y_));
+
+      MIOPEN_CALL(miopenSet4dTensorDescriptor(filter_desc_,
+                 DataType<T>::type,
+                 param.output_num_, num_channel,
+                 param.kernel_size_h_, param.kernel_size_w_));
+#endif
+    }
+    set_ = true;
+  }
+
+#ifdef NVIDIA_CUDNN
+  cudnnFilterDescriptor_t GetFilter() const {
+#endif
+#ifdef AMD_MIOPEN
+  miopenTensorDescriptor_t GetFilter() const {
+#endif
+    if (set_)
+      return filter_desc_;
+    return nullptr;
+  }
+
+#ifdef NVIDIA_CUDNN
+  cudnnConvolutionDescriptor_t GetConv() const {
+#endif
+#ifdef AMD_MIOPEN
+  miopenConvolutionDescriptor_t GetConv() const {
+#endif
+    if (set_)
+      return conv_desc_;
+    return nullptr;
+  }
+
+
+};
+
+template <typename T>
+class PoolingDesc : public Descriptor {
+ private:
+#ifdef NVIDIA_CUDNN
+  cudnnPoolingDescriptor_t pooling_desc_;
+#endif
+#ifdef AMD_MIOPEN
+  miopenPoolingDescriptor_t pooling_desc_;
+#endif
+ public:
+  PoolingDesc()
+  : Descriptor() {
+#ifdef NVIDIA_CUDNN
+    CUDNN_CALL(cudnnCreatePoolingDescriptor(&pooling_desc_));
+#endif
+#ifdef AMD_MIOPEN
+    MIOPEN_CALL(miopenCreatePoolingDescriptor(&pooling_desc_));
+#endif
+  }
+
+  ~PoolingDesc() {
+#ifdef NVIDIA_CUDNN
+    CUDNN_CALL(cudnnDestroyPoolingDescriptor(pooling_desc_));
+#endif
+#ifdef AMD_MIOPEN
+    MIOPEN_CALL(miopenDestroyPoolingDescriptor(pooling_desc_));
+#endif
+  }
+
+  void Set(const PoolingParam &param) {
+    if (!set_) {
+#ifdef NVIDIA_CUDNN
+      CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_,
+                 param.mode_, CUDNN_PROPAGATE_NAN,
+                 param.kernel_size_h_, param.kernel_size_w_,
+                 param.pad_h_, param.pad_w_,
+                 param.stride_h_, param.stride_w_));
+#endif
+#ifdef AMD_MIOPEN
+      MIOPEN_CALL(miopenSet2dPoolingDescriptor(pooling_desc_,
+                 param.mode_,
+                 param.kernel_size_h_, param.kernel_size_w_,
+                 param.pad_h_, param.pad_w_,
+                 param.stride_h_, param.stride_w_));
+#endif
+    }
+
+    set_ = true;
+  }
+
+#ifdef NVIDIA_CUDNN
+  cudnnPoolingDescriptor_t Get() const {
+#endif
+#ifdef AMD_MIOPEN
+  miopenPoolingDescriptor_t Get() const {
+#endif
+    if (set_)
+      return pooling_desc_;
+    return nullptr;
+  }
+
+  void GetWorkspaceSize(const DataTensor<T> &y_desc,
+                        size_t *workspace_size) {
+#ifdef AMD_MIOPEN
+    if (set_)
+      MIOPEN_CALL(miopenPoolingGetWorkSpaceSize(y_desc.Get(), workspace_size));
+    else
+      LOG(FATAL) << "Pooling descriptor NOT set";
+#endif
+  }
+};
+
+template <typename T>
+class LRNDesc : public Descriptor {
+ private:
+#ifdef NVIDIA_CUDNN
+  cudnnLRNDescriptor_t lrn_desc_;
+#endif
+#ifdef AMD_MIOPEN
+  miopenLRNDescriptor_t lrn_desc_;
+#endif
+ public:
+  LRNDesc()
+  : Descriptor() {
+#ifdef NVIDIA_CUDNN
+    CUDNN_CALL(cudnnCreateLRNDescriptor(&lrn_desc_));
+#endif
+#ifdef AMD_MIOPEN
+    MIOPEN_CALL(miopenCreateLRNDescriptor(&lrn_desc_));
+#endif
+  }
+
+  ~LRNDesc() {
+#ifdef NVIDIA_CUDNN
+    CUDNN_CALL(cudnnDestroyLRNDescriptor(lrn_desc_));
+#endif
+#ifdef AMD_MIOPEN
+    MIOPEN_CALL(miopenDestroyLRNDescriptor(lrn_desc_));
+#endif
+  }
+
+  void Set(const LRNParam &param) {
+    if (!set_) {
+#ifdef NVIDIA_CUDNN
+      CUDNN_CALL(cudnnSetLRNDescriptor(lrn_desc_,
+                 param.local_size_,
+                 param.alpha_, param.beta_,
+                 param.k_));
+#endif
+#ifdef AMD_MIOPEN
+      MIOPEN_CALL(miopenSetLRNDescriptor(lrn_desc_,
+                  param.mode_,
+                  param.local_size_,
+                  param.alpha_, param.beta_,
+                  param.k_));
+#endif
+    }
+
+    set_ = true;
+  }
+
+#ifdef NVIDIA_CUDNN
+  cudnnLRNDescriptor_t Get() const {
+#endif
+#ifdef AMD_MIOPEN
+  miopenLRNDescriptor_t Get() const {
+#endif
+    if (set_)
+      return lrn_desc_;
+    return nullptr;
+  }
+
+  void GetWorkspaceSize(const DataTensor<T> &y_desc,
+                        size_t *workspace_size) {
+#ifdef AMD_MIOPEN
+    if (set_)
+      MIOPEN_CALL(miopenLRNGetWorkSpaceSize(y_desc.Get(), workspace_size));
+    else
+      LOG(FATAL) << "LRN descriptor NOT set";
+#endif
+  }
+
+};
+
+template <typename T>
+class ActivationDesc : public Descriptor {
+ private:
+#ifdef NVIDIA_CUDNN
+  cudnnActivationDescriptor_t activation_desc_;
+#endif
+#ifdef AMD_MIOPEN
+  miopenActivationDescriptor_t activation_desc_;
+#endif
+ public:
+  ActivationDesc()
+  : Descriptor() {
+#ifdef NVIDIA_CUDNN
+    CUDNN_CALL(cudnnCreateActivationDescriptor(&activation_desc_));
+#endif
+#ifdef AMD_MIOPEN
+    MIOPEN_CALL(miopenCreateActivationDescriptor(&activation_desc_));
+#endif
+  }
+
+  ~ActivationDesc() {
+#ifdef NVIDIA_CUDNN
+    CUDNN_CALL(cudnnDestroyActivationDescriptor(activation_desc_));
+#endif
+#ifdef AMD_MIOPEN
+    MIOPEN_CALL(miopenDestroyActivationDescriptor(activation_desc_));
+#endif
+  }
+
+  void Set(const ActivationParam &param) {
+    if (!set_) {
+#ifdef NVIDIA_CUDNN
+      CUDNN_CALL(cudnnSetActivationDescriptor(activation_desc_,
+                 param.mode_,
+                 CUDNN_PROPAGATE_NAN,
+                 double(0.0)));
+#endif
+#ifdef AMD_MIOPEN
+      MIOPEN_CALL(miopenSetActivationDescriptor(activation_desc_,
+                 param.mode_,
+                 param.alpha_,
+                 param.beta_,
+                 param.power_));
+#endif
+    }
+
+    set_ = true;
+  }
+
+#ifdef NVIDIA_CUDNN
+  cudnnActivationDescriptor_t Get() const {
+#endif
+#ifdef AMD_MIOPEN
+  miopenActivationDescriptor_t Get() const {
+#endif
+    if (set_)
+      return activation_desc_;
+    return nullptr;
+  }
+
+};
+
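+// Bypass layer descriptor. With MIOpen it is realized as a pass-through
+// activation (miopenActivationPASTHRU); with cuDNN only the data
+// dimensions are recorded.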
+template <typename T>
+class BypassDesc : public Descriptor {
+ private:
+#ifdef NVIDIA_CUDNN
+  DataDim dim_;
+#endif
+#ifdef AMD_MIOPEN
+  miopenActivationDescriptor_t activation_desc_;
+#endif
+ public:
+  BypassDesc()
+  : Descriptor() {
+#ifdef AMD_MIOPEN
+    MIOPEN_CALL(miopenCreateActivationDescriptor(&activation_desc_));
+#endif
+  }
+
+  ~BypassDesc() {
+#ifdef AMD_MIOPEN
+    MIOPEN_CALL(miopenDestroyActivationDescriptor(activation_desc_));
+#endif
+  }
+
+  void Set(DataDim dim) {
+    if (!set_) {
+#ifdef NVIDIA_CUDNN
+      dim_.n_ = dim.n_;
+      dim_.c_ = dim.c_;
+      dim_.h_ = dim.h_;
+      dim_.w_ = dim.w_;
+#endif
+#ifdef AMD_MIOPEN
+      MIOPEN_CALL(miopenSetActivationDescriptor(activation_desc_,
+                 miopenActivationPASTHRU,
+                 0, 0, 0));
+#endif
+    }
+
+    set_ = true;
+  }
+
+#ifdef NVIDIA_CUDNN
+  DataDim Get() const {
+    return dim_;
+  }
+#endif
+#ifdef AMD_MIOPEN
+  miopenActivationDescriptor_t Get() const {
+    if (set_)
+      return activation_desc_;
+    return nullptr;
+  }
+#endif
+};
+
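+// Dropout descriptor. Only the cuDNN backend is implemented here; the
+// MIOpen branches are left empty.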
+template <typename T>
+class DropoutDesc : public Descriptor {
+ private:
+#ifdef NVIDIA_CUDNN
+  cudnnDropoutDescriptor_t dropout_desc_;
+#endif
+#ifdef AMD_MIOPEN
+#endif
+ public:
+  DropoutDesc()
+  : Descriptor() {
+#ifdef NVIDIA_CUDNN
+    CUDNN_CALL(cudnnCreateDropoutDescriptor(&dropout_desc_));
+#endif
+#ifdef AMD_MIOPEN
+#endif
+  }
+
+  ~DropoutDesc() {
+#ifdef NVIDIA_CUDNN
+    CUDNN_CALL(cudnnDestroyDropoutDescriptor(dropout_desc_));
+#endif
+#ifdef AMD_MIOPEN
+#endif
+  }
+
+  void SetStatesSize(const Handle &handle, RunMode mode, int idx,
+                     size_t *state_size) {
+#ifdef NVIDIA_CUDNN
+    CUDNN_CALL(cudnnDropoutGetStatesSize(mode ?
+                                         handle.GetCudnn(idx):
+                                         handle.GetCudnn(),
+                                         state_size));
+#endif
+#ifdef AMD_MIOPEN
+#endif
+  }
+
+  void SetReserveSpaceSize(DataTensor<T> &bottom_desc,
+                           size_t *reserve_space_size) {
+#ifdef NVIDIA_CUDNN
+    CUDNN_CALL(cudnnDropoutGetReserveSpaceSize(bottom_desc.Get(),
+                                               reserve_space_size));
+#endif
+#ifdef AMD_MIOPEN
+#endif
+  }
+
+  void Set(const Handle &handle, RunMode mode, int idx,
+           const DropoutParam &dropout_param,
+           void *states, size_t state_size) {
+    if (!set_) {
+#ifdef NVIDIA_CUDNN
+      if (state_size > 0)
+        CUDNN_CALL(cudnnSetDropoutDescriptor(dropout_desc_,
+                                       mode == COMPOSED ?
+                                       handle.GetCudnn(idx):
+                                       handle.GetCudnn(),
+                                       dropout_param.dropout_p_,
+                                       states,
+                                       state_size,
+                                       dropout_param.random_seed_));
+      else
+        LOG(FATAL) << "The size is ZERO";
+#endif
+#ifdef AMD_MIOPEN
+#endif
+    }
+
+    set_ = true;
+  }
+
+#ifdef NVIDIA_CUDNN
+  cudnnDropoutDescriptor_t Get() const {
+    return dropout_desc_;
+  }
+#endif
+#ifdef AMD_MIOPEN
+#endif
+};
+
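+// Selects and stores the cuDNN convolution algorithms (forward, backward
+// filter, backward data), either by name from the config, via cuDNN's
+// preference-based getters, or by running the Find*Algorithm searches.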
+template <typename T>
+class ConvAlgo {
+#ifdef NVIDIA_CUDNN
+ private:
+  cudnnConvolutionFwdAlgo_t fwd_algo_;
+  cudnnConvolutionBwdFilterAlgo_t bwd_filter_algo_;
+  cudnnConvolutionBwdDataAlgo_t bwd_data_algo_;
+  std::string bwd_filter_algo_par;
+
+ public:
+  ConvAlgo()
+  : fwd_algo_(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM),
+    bwd_filter_algo_(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0),
+    bwd_data_algo_(CUDNN_CONVOLUTION_BWD_DATA_ALGO_0),
+    bwd_filter_algo_par("") {}
+
+
+  cudnnConvolutionBwdDataAlgo_t getDataAlgo(){
+    return (bwd_data_algo_);
+  }
+
+
+  void SetFwdAlgo(std::string algo) {
+    if (!algo.compare("fft")) {
+      fwd_algo_ = CUDNN_CONVOLUTION_FWD_ALGO_FFT;
+    } else if (!algo.compare("winograd")) {
+      fwd_algo_ = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD;
+    }
+  }
+  void SetFwdAlgo(cudnnConvolutionFwdAlgo_t fwd_algo) {
+    fwd_algo_ = fwd_algo;
+  }
+  void SetFwdAlgo(const Handle &handle, RunMode mode, int idx,
+                  const DataTensor<T> &bottom_desc,
+                  const ConvolutionDesc<T> &conv_desc,
+                  const DataTensor<T> &top_desc,
+                  cudnnConvolutionFwdPreference_t pref) {
+    CUDNN_CALL(cudnnGetConvolutionForwardAlgorithm(
+               mode == COMPOSED ?
+               handle.GetCudnn(idx) : handle.GetCudnn(),
+               bottom_desc.Get(),
+               conv_desc.GetFilter(),
+               conv_desc.GetConv(),
+               top_desc.Get(),
+               pref,
+               -1,
+               &fwd_algo_));
+  }
+  void FindFwdAlgo(const Handle &handle, RunMode mode, int idx,
+                   const DataTensor<T> &bottom_desc,
+                   const ConvolutionDesc<T> &conv_desc,
+                   const DataTensor<T> &top_desc) {
+    cudnnConvolutionFwdAlgoPerf_t perf_results;
+    int returned_algo_count = 0;
+    CUDNN_CALL(cudnnFindConvolutionForwardAlgorithm(
+               mode == COMPOSED ?
+               handle.GetCudnn(idx) : handle.GetCudnn(),
+               bottom_desc.Get(),
+               conv_desc.GetFilter(),
+               conv_desc.GetConv(),
+               top_desc.Get(),
+               1, &returned_algo_count,
+               &perf_results));
+    if (returned_algo_count > 0) {
+      fwd_algo_ = perf_results.algo;
+    } else {
+      fwd_algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+    }
+  }
+  void SetBwdFilterAlgo(cudnnConvolutionBwdFilterAlgo_t bwd_filter_algo) {
+    bwd_filter_algo_ = bwd_filter_algo;
+  }
+  void SetBwdFilterAlgo(std::string algo) {
+    if (algo.empty()) {
+        return;
+    }
+    if (!algo.compare("fft")) {
+      bwd_filter_algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT;
+    } else if (!algo.compare("winograd")) {
+      bwd_filter_algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD;
+    } else if (stoi(algo) == 0) {
+      bwd_filter_algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
+    } else if (stoi(algo) == 1) {
+      bwd_filter_algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
+    } else if (stoi(algo) == 2) {
+      bwd_filter_algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT;
+    } else if (stoi(algo) == 3) {
+      bwd_filter_algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3;
+    } else if (stoi(algo) == 4) {
+      bwd_filter_algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD;
+    } else if (stoi(algo) == 5) {
+      bwd_filter_algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED;
+    } else if (stoi(algo) == 6) {
+      bwd_filter_algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING;
+    } else if (algo.compare("")) {
+      std::cout << "Using algo "<< algo << "\n";
+      bwd_filter_algo_par = algo;
+    }
+    LOG(INFO) << "cuDNN algos: " << CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 << " "
+             << CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 << " "
+             << "FFT:" << CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT << " "
+             << CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 << " "
+             << "WIN:" << CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD << " "
+             << "WIN_NONFUSED:" << CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED << " "
+             << "FFT_TILING:" << CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING << "\n";
+    LOG(INFO) << "Set Bwd Filter Algo to " << bwd_filter_algo_ << " with " << algo;
+  }
+  void SetBwdFilterAlgo(const Handle &handle, RunMode mode, int idx,
+                        const DataTensor<T> &bottom_desc,
+                        const DataTensor<T> &top_desc,
+                        const ConvolutionDesc<T> &conv_desc,
+                        cudnnConvolutionBwdFilterPreference_t pref) {
+     CUDNN_CALL(cudnnGetConvolutionBackwardFilterAlgorithm(
+                mode == COMPOSED ?
+                handle.GetCudnn(idx) : handle.GetCudnn(),
+                bottom_desc.Get(),
+                top_desc.Get(),
+                conv_desc.GetConv(),
+                conv_desc.GetFilter(),
+                pref,
+                -1,
+                &bwd_filter_algo_));
+  }
+  void FindBwdFilterAlgo(const Handle &handle, RunMode mode, int idx,
+                         const DataTensor<T> &bottom_desc,
+                         const ConvolutionDesc<T> &conv_desc,
+                         const DataTensor<T> &top_desc) {
+    cudnnConvolutionBwdFilterAlgoPerf_t perf_results[3];
+    int returned_algo_count = 0;
+    CUDNN_CALL(cudnnFindConvolutionBackwardFilterAlgorithm(
+               mode == COMPOSED ?
+               handle.GetCudnn(idx) : handle.GetCudnn(),
+               bottom_desc.Get(),
+               top_desc.Get(),
+               conv_desc.GetConv(),
+               conv_desc.GetFilter(),
+               3, &returned_algo_count,
+               perf_results));
+    std::cout << "cuDNN call returned_algo_count :" << returned_algo_count << "\n";
+    cudnnConvolutionBwdFilterAlgo_t algo = perf_results[0].algo;
+    std::cout << "cuDNN call result :" << perf_results[0].algo << "\n";
+    std::cout << "cuDNN casted result :" << algo << "\n";
+    if (returned_algo_count > 0) {
+      bwd_filter_algo_ = perf_results[0].algo;
+    } else {
+      bwd_filter_algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
+    }
+  }
+
+  void FindBwdFilterAlgoEx(const Handle &handle, RunMode mode, int idx,
+                         const DataTensor<T> &bottom_desc,
+                         const ConvolutionDesc<T> &conv_desc,
+                         const DataTensor<T> &top_desc,
+                         const void *w,
+                         const void *dy,
+                         void       *dx,
+                         void       *workSpace,
+                         size_t     workspace_size)
+  {
+    cudnnConvolutionBwdFilterAlgoPerf_t perf_results;
+    int returned_algo_count = 0;
+    CUDNN_CALL(cudnnFindConvolutionBackwardFilterAlgorithmEx(
+               mode == COMPOSED ?
+               handle.GetCudnn(idx) : handle.GetCudnn(),
+               bottom_desc.Get(), w,
+               top_desc.Get(), dy,
+               conv_desc.GetConv(),
+               conv_desc.GetFilter(), dx,
+               1, &returned_algo_count,
+               &perf_results,
+               workSpace, workspace_size));
+    if (returned_algo_count > 0) {
+      bwd_filter_algo_ = perf_results.algo;
+    } else {
+      bwd_filter_algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
+    }
+  }
+
+
+
+
+  void SetBwdDataAlgo(cudnnConvolutionBwdDataAlgo_t bwd_data_algo) {
+    bwd_data_algo_ = bwd_data_algo;
+  }
+  void SetBwdDataAlgo(std::string algo) {
+    if (algo.empty()) {
+        return;
+    }
+    if (!algo.compare("fft")) {
+      bwd_data_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT;
+    } else if (!algo.compare("winograd")) {
+      bwd_data_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD;
+    } else if (!algo.compare("0")) {
+      bwd_data_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
+    } else if (!algo.compare("1")) {
+      bwd_data_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
+    } else if (!algo.compare("winograd_nonfused")) {
+      bwd_data_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED;
+    } else if (!algo.compare("fft_tiling")) {
+      bwd_data_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING;
+    }
+  }
+  void SetBwdDataAlgo(const Handle &handle, RunMode mode, int idx,
+                      const DataTensor<T> &bottom_desc,
+                      const DataTensor<T> &top_desc,
+                      const ConvolutionDesc<T> &conv_desc,
+                      cudnnConvolutionBwdDataPreference_t pref) {
+     CUDNN_CALL(cudnnGetConvolutionBackwardDataAlgorithm(
+                mode == COMPOSED ?
+                handle.GetCudnn(idx) : handle.GetCudnn(),
+                conv_desc.GetFilter(),
+                top_desc.Get(),
+                conv_desc.GetConv(),
+                bottom_desc.Get(),
+                pref,
+                -1,
+                &bwd_data_algo_));
+  }
+  void FindBwdDataAlgo(const Handle &handle, RunMode mode, int idx,
+                       const DataTensor<T> &bottom_desc,
+                       const ConvolutionDesc<T> &conv_desc,
+                       const DataTensor<T> &top_desc) {
+    cudnnConvolutionBwdDataAlgoPerf_t perf_results;
+    int returned_algo_count = 0;
+    CUDNN_CALL(cudnnFindConvolutionBackwardDataAlgorithm(
+               mode == COMPOSED ?
+               handle.GetCudnn(idx) : handle.GetCudnn(),
+               conv_desc.GetFilter(),
+               top_desc.Get(),
+               conv_desc.GetConv(),
+               bottom_desc.Get(),
+               1, &returned_algo_count,
+               &perf_results));
+    if (returned_algo_count > 0) {
+      bwd_data_algo_ = perf_results.algo;
+    } else {
+      bwd_data_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
+    }
+  }
+
+  cudnnConvolutionFwdAlgo_t GetFwdAlgo() const {
+    return fwd_algo_;
+  }
+  void GetFwdWorkspaceSize(const Handle &handle, RunMode mode, int idx,
+                           const DataTensor<T> &bottom_desc,
+                           const DataTensor<T> &top_desc,
+                           const ConvolutionDesc<T> &conv_desc,
+                           size_t *workspace_size) {
+    CUDNN_CALL(cudnnGetConvolutionForwardWorkspaceSize(
+               mode == COMPOSED ?
+               handle.GetCudnn(idx) : handle.GetCudnn(),
+               bottom_desc.Get(),
+               conv_desc.GetFilter(),
+               conv_desc.GetConv(),
+               top_desc.Get(),
+               fwd_algo_,
+               workspace_size));
+  }
+
+  std::string GetBwdFilterAlgoParameter() {
+    return bwd_filter_algo_par;
+  }
+
+
+  cudnnConvolutionBwdFilterAlgo_t GetBwdFilterAlgo() const {
+    return bwd_filter_algo_;
+  }
+  void GetBwdFilterWorkspaceSize(const Handle &handle, RunMode mode, int idx,
+                                 const DataTensor<T> &bottom_desc,
+                                 const DataTensor<T> &top_desc,
+                                 const ConvolutionDesc<T> &conv_desc,
+                                 size_t *workspace_size) {
+    CUDNN_CALL(cudnnGetConvolutionBackwardFilterWorkspaceSize(
+               mode == COMPOSED ?
+               handle.GetCudnn(idx) : handle.GetCudnn(),
+               bottom_desc.Get(),
+               top_desc.Get(),
+               conv_desc.GetConv(),
+               conv_desc.GetFilter(),
+               bwd_filter_algo_,
+               workspace_size));
+  }
+  cudnnConvolutionBwdDataAlgo_t GetBwdDataAlgo() const {
+    return bwd_data_algo_;
+  }
+  void GetBwdDataWorkspaceSize(const Handle &handle, RunMode mode, int idx,
+                               const DataTensor<T> &bottom_desc,
+                               const DataTensor<T> &top_desc,
+                               const ConvolutionDesc<T> &conv_desc,
+                               size_t *workspace_size) {
+    CUDNN_CALL(cudnnGetConvolutionBackwardDataWorkspaceSize(
+               mode == COMPOSED ?
+               handle.GetCudnn(idx) : handle.GetCudnn(),
+               conv_desc.GetFilter(),
+               top_desc.Get(),
+               conv_desc.GetConv(),
+               bottom_desc.Get(),
+               bwd_data_algo_,
+               workspace_size));
+  }
+
+  // Dictionary for storing best bwd convolution algorithms
+  std::map<std::string, int> Algo4DataShape;
+
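+  // Debug helper: prints the x/dy/dw shapes seen by a backward-filter call.
+  // The shape-hash lookup into Algo4DataShape is sketched in the commented-out
+  // lines below but is not currently used.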
+  void checkAlgo4DataShape(const cudnnTensorDescriptor_t x,
+                           const cudnnTensorDescriptor_t dy,
+                           const cudnnFilterDescriptor_t dw)
+                           // size_t workspace_in_bytes)
+  {
+    int n, c, h, w, k, nStride, cStride, hStride, wStride;
+    cudnnDataType_t datat;
+    cudnnTensorFormat_t format;
+    std::cout << "Call to checkAlgo4DataShape \n";
+    CUDNN_CALL(cudnnGetTensor4dDescriptor(x,
+                                          &datat,
+                                          &n, &c, &h, &w,
+                                          &nStride, &cStride, &hStride, &wStride));
+    std::cout << "x shape: " << n <<" "<< c << " " << h << "x" << w << "\n";
+    CUDNN_CALL(cudnnGetTensor4dDescriptor(dy,
+                                          &datat,
+                                          &n, &c, &h, &w,
+                                          &nStride, &cStride, &hStride, &wStride));
+    std::cout << "dy shape: " << n <<" "<< c << " " << h << "x" << w << "\n";
+    CUDNN_CALL(cudnnGetFilter4dDescriptor(dw,
+                                          &datat, &format,
+                                          &k, &c, &h, &w));
+    std::cout << "dw shape: " << k <<" "<< c << " " << h << "x" << w << "\n";
+    // std::string hash = std::to_string(x)+"/"+std::to_string(*dy)+"/"+std::to_string(*dw)+"/"+std::to_string(workspace_in_bytes);
+    // std::cout << "datashape hash:" << hash << "x:" << x << "dy:" << y << "w:" << w  "\n";
+  }
+
+
+#endif
+#ifdef AMD_MIOPEN
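+  // MIOpen specialization of ConvAlgo. All algorithms default to GEMM, and
+  // algorithm selection by name is not supported. The Find* searches below
+  // are only compiled in when NOSIM is defined (native runs), presumably
+  // because MIOpen's exhaustive search executes candidate kernels and is
+  // skipped when running under simulation.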
+ private:
+  miopenConvFwdAlgorithm_t fwd_algo_;
+  miopenConvBwdWeightsAlgorithm_t bwd_filter_algo_;
+  miopenConvBwdDataAlgorithm_t bwd_data_algo_;
+
+ public:
+  ConvAlgo()
+  : fwd_algo_(miopenConvolutionFwdAlgoGEMM),
+    bwd_filter_algo_(miopenConvolutionBwdWeightsAlgoGEMM),
+    bwd_data_algo_(miopenConvolutionBwdDataAlgoGEMM) {}
+
+  void SetFwdAlgo(miopenConvFwdAlgorithm_t fwd_algo) {
+    fwd_algo_ = fwd_algo;
+  }
+  void SetFwdAlgo(std::string algo) {
+  }
+  void FindFwdAlgo(const Handle &handle, RunMode mode, int idx,
+                   const DataTensor<T> &bottom_desc,
+                   const ConvolutionDesc<T> &conv_desc,
+                   const DataTensor<T> &top_desc,
+                   const void *x,
+                   const void *w,
+                   void *y,
+                   void *workspace,
+                   size_t workspace_size) {
+    fwd_algo_ = miopenConvolutionFwdAlgoGEMM;
+#ifdef NOSIM
+    miopenConvAlgoPerf_t perf_results;
+    int returned_algo_count;
+    MIOPEN_CALL(miopenFindConvolutionForwardAlgorithm(
+                mode == COMPOSED ?
+                handle.GetMIOpen(idx) : handle.GetMIOpen(),
+                bottom_desc.Get(), x,
+                conv_desc.GetFilter(), w,
+                conv_desc.GetConv(),
+                top_desc.Get(), y,
+                1, &returned_algo_count,
+                &perf_results, workspace, workspace_size, false));
+    if (returned_algo_count > 0) {
+      fwd_algo_ = perf_results.fwd_algo;
+    }
+#endif
+  }
+  void SetBwdFilterAlgo(miopenConvBwdWeightsAlgorithm_t bwd_filter_algo) {
+    bwd_filter_algo_ = bwd_filter_algo;
+  }
+  void SetBwdFilterAlgo(std::string algo) {
+  }
+  void FindBwdFilterAlgo(const Handle &handle, RunMode mode, int idx,
+                         const DataTensor<T> &bottom_desc,
+                         const ConvolutionDesc<T> &conv_desc,
+                         const DataTensor<T> &top_desc,
+                         const void *x,
+                         const void *dy,
+                         void *dw,
+                         void *workspace,
+                         size_t workspace_size) {
+    bwd_filter_algo_ = miopenConvolutionBwdWeightsAlgoGEMM;
+#ifdef NOSIM
+    miopenConvAlgoPerf_t perf_results;
+    int returned_algo_count;
+    MIOPEN_CALL(miopenFindConvolutionBackwardWeightsAlgorithm(
+                mode == COMPOSED ?
+                handle.GetMIOpen(idx) : handle.GetMIOpen(),
+                top_desc.Get(), dy,
+                bottom_desc.Get(), x,
+                conv_desc.GetConv(),
+                conv_desc.GetFilter(), dw,
+                1, &returned_algo_count,
+                &perf_results, workspace, workspace_size, false));
+    if (returned_algo_count > 0) {
+      bwd_filter_algo_ = perf_results.bwd_weights_algo;
+    }
+#endif
+  }
+  void SetBwdDataAlgo(miopenConvBwdDataAlgorithm_t bwd_data_algo) {
+    bwd_data_algo_ = bwd_data_algo;
+  }
+  void SetBwdDataAlgo(std::string algo) {
+  }
+  void FindBwdDataAlgo(const Handle &handle, RunMode mode, int idx,
+                       const DataTensor<T> &bottom_desc,
+                       const ConvolutionDesc<T> &conv_desc,
+                       const DataTensor<T> &top_desc,
+                       const void *dy,
+                       const void *w,
+                       void *dx,
+                       void *workspace,
+                       size_t workspace_size) {
+    bwd_data_algo_ = miopenConvolutionBwdDataAlgoGEMM;
+#ifdef NOSIM
+    miopenConvAlgoPerf_t perf_results;
+    int returned_algo_count;
+    MIOPEN_CALL(miopenFindConvolutionBackwardDataAlgorithm(
+                mode == COMPOSED ?
+                handle.GetMIOpen(idx) : handle.GetMIOpen(),
+                top_desc.Get(), dy,
+                conv_desc.GetFilter(), w,
+                conv_desc.GetConv(),
+                bottom_desc.Get(), dx,
+                1, &returned_algo_count,
+                &perf_results, workspace, workspace_size, false));
+    if (returned_algo_count > 0) {
+      bwd_data_algo_ = perf_results.bwd_data_algo;
+    }
+#endif
+  }
+
+  miopenConvFwdAlgorithm_t GetFwdAlgo() const {
+    return fwd_algo_;
+  }
+  void GetFwdWorkspaceSize(const Handle &handle, RunMode mode, int idx,
+                           const DataTensor<T> &bottom_desc,
+                           const DataTensor<T> &top_desc,
+                           const ConvolutionDesc<T> &conv_desc,
+                           size_t *workspace_size) {
+    MIOPEN_CALL(miopenConvolutionForwardGetWorkSpaceSize(
+                mode == COMPOSED ?
+                handle.GetMIOpen(idx) : handle.GetMIOpen(),
+                conv_desc.GetFilter(),
+                bottom_desc.Get(),
+                conv_desc.GetConv(),
+                top_desc.Get(),
+                workspace_size));
+  }
+  miopenConvBwdWeightsAlgorithm_t GetBwdFilterAlgo() const {
+    return bwd_filter_algo_;
+  }
+  void GetBwdFilterWorkspaceSize(const Handle &handle, RunMode mode, int idx,
+                                 const DataTensor<T> &bottom_desc,
+                                 const DataTensor<T> &top_desc,
+                                 const ConvolutionDesc<T> &conv_desc,
+                                 size_t *workspace_size) {
+    MIOPEN_CALL(miopenConvolutionBackwardWeightsGetWorkSpaceSize(
+                mode == COMPOSED ?
+                handle.GetMIOpen(idx) : handle.GetMIOpen(),
+                top_desc.Get(),
+                bottom_desc.Get(),
+                conv_desc.GetConv(),
+                conv_desc.GetFilter(),
+                workspace_size));
+  }
+  miopenConvBwdDataAlgorithm_t GetBwdDataAlgo() const {
+    return bwd_data_algo_;
+  }
+  void GetBwdDataWorkspaceSize(const Handle &handle, RunMode mode, int idx,
+                               const DataTensor<T> &bottom_desc,
+                               const DataTensor<T> &top_desc,
+                               const ConvolutionDesc<T> &conv_desc,
+                               size_t *workspace_size) {
+    MIOPEN_CALL(miopenConvolutionBackwardDataGetWorkSpaceSize(
+                mode == COMPOSED ?
+                handle.GetMIOpen(idx) : handle.GetMIOpen(),
+                top_desc.Get(),
+                conv_desc.GetFilter(),
+                conv_desc.GetConv(),
+                bottom_desc.Get(),
+                workspace_size));
+  }
+#endif
+};
+
+// Profiling marker
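+// ProfilerStart/Stop bracket a layer's library call: they toggle the vendor
+// profiler (cudaProfilerStart/Stop or miopenEnableProfiling), synchronize the
+// device on stop, and start/stop the per-layer timer entry "<layer>_<idx>".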
+inline void ProfilerStart(const Handle &handle, RunMode mode, int idx,
+                          Timer *timer, const std::string &layer) {
+#ifdef NVIDIA_CUDNN
+  cudaProfilerStart();
+#endif
+#ifdef AMD_MIOPEN
+  miopenEnableProfiling(mode == COMPOSED ?
+                        handle.GetMIOpen(idx) : handle.GetMIOpen(), true);
+#endif
+  timer->Start(layer + "_" + std::to_string(idx));
+}
+inline void ProfilerStop(const Handle &handle, RunMode mode, int idx,
+                         Timer *timer, const std::string &layer) {
+#ifdef NVIDIA_CUDNN
+  cudaProfilerStop();
+  CUDA_CALL(cudaDeviceSynchronize());
+#endif
+#ifdef AMD_MIOPEN
+  miopenEnableProfiling(mode == COMPOSED ?
+                        handle.GetMIOpen(idx) : handle.GetMIOpen(), false);
+  HIP_CALL(hipDeviceSynchronize());
+#endif
+  timer->Stop(layer + "_" + std::to_string(idx));
+}
+
+} // namespace dnnmark
+
+#endif // CORE_INCLUDE_DNN_UTILITY_H_
diff --git a/src/DNNMark/core/include/dnn_wrapper.h b/src/DNNMark/core/include/dnn_wrapper.h
new file mode 100644
index 0000000..cdbb748
--- /dev/null
+++ b/src/DNNMark/core/include/dnn_wrapper.h
@@ -0,0 +1,768 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Northeastern University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#ifndef CORE_INCLUDE_DNN_WRAPPER_H_
+#define CORE_INCLUDE_DNN_WRAPPER_H_
+
+#include <iostream>
+#include <string>
+#include "common.h"
+#include "dnn_utility.h"
+#include "data_manager.h"
+#include "timer.h"
+
+namespace dnnmark {
+
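+// Each wrapper below hides the cuDNN/MIOpen API differences behind a single
+// call per primitive; the NVIDIA_CUDNN and AMD_MIOPEN build flags select the
+// vendor path. The MIOpen convolution paths run the corresponding Find*Algo
+// search immediately before the compute call, since MIOpen expects an
+// algorithm chosen through its Find API.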
+//
+// Convolution forward/backward functions
+//
+
+template <typename T>
+inline void dnnmarkConvolutionForward(const Handle &handle,
+                                      RunMode mode, int idx, Timer *timer,
+                                      const void *alpha,
+                                      const DataTensor<T> &bottom_desc,
+                                      const void *x,
+                                      const ConvolutionDesc<T> &conv_desc,
+                                      const void *w,
+                                      ConvAlgo<T> *conv_algo,
+                                      void *workspace,
+                                      size_t workspace_in_bytes,
+                                      const void *beta,
+                                      const DataTensor<T> &top_desc,
+                                      void *y) {
+#ifdef NVIDIA_CUDNN
+  ProfilerStart(handle, mode, idx, timer, "ConvFwd");
+  CUDNN_CALL(cudnnConvolutionForward(
+             mode == COMPOSED ?
+             handle.GetCudnn(idx) : handle.GetCudnn(),
+             alpha,
+             bottom_desc.Get(), x,
+             conv_desc.GetFilter(), w,
+             conv_desc.GetConv(),
+             conv_algo->GetFwdAlgo(), workspace, workspace_in_bytes,
+             beta,
+             top_desc.Get(), y));
+  ProfilerStop(handle, mode, idx, timer, "ConvFwd");
+#endif
+#ifdef AMD_MIOPEN
+
+  conv_algo->FindFwdAlgo(handle, mode, idx,
+                        bottom_desc,
+                        conv_desc,
+                        top_desc,
+                        x, w, y,
+                        workspace, workspace_in_bytes);
+  ProfilerStart(handle, mode, idx, timer, "ConvFwd");
+  MIOPEN_CALL(miopenConvolutionForward(
+              mode == COMPOSED ?
+              handle.GetMIOpen(idx) : handle.GetMIOpen(),
+              alpha,
+              bottom_desc.Get(), x,
+              conv_desc.GetFilter(), w,
+              conv_desc.GetConv(),
+              conv_algo->GetFwdAlgo(),
+              beta,
+              top_desc.Get(), y,
+              workspace, workspace_in_bytes));
+  ProfilerStop(handle, mode, idx, timer, "ConvFwd");
+#endif
+
+}
+
+template <typename T>
+inline void dnnmarkConvolutionBackwardData(const Handle &handle,
+                                           RunMode mode, int idx, Timer *timer,
+                                           const void *alpha,
+                                           const DataTensor<T> &top_desc,
+                                           const void *dy,
+                                           const ConvolutionDesc<T> &conv_desc,
+                                           const void *w,
+                                           ConvAlgo<T> *conv_algo,
+                                           void *workspace,
+                                           size_t workspace_in_bytes,
+                                           const void *beta,
+                                           const DataTensor<T> &bottom_desc,
+                                           void *dx) {
+#ifdef NVIDIA_CUDNN
+  ProfilerStart(handle, mode, idx, timer, "ConvBwdData");
+  CUDNN_CALL(cudnnConvolutionBackwardData(
+             mode == COMPOSED ?
+             handle.GetCudnn(idx) : handle.GetCudnn(),
+             alpha,
+             conv_desc.GetFilter(), w,
+             top_desc.Get(), dy,
+             conv_desc.GetConv(),
+             conv_algo->GetBwdDataAlgo(),
+             workspace, workspace_in_bytes,
+             beta,
+             bottom_desc.Get(), dx));
+  ProfilerStop(handle, mode, idx, timer, "ConvBwdData");
+#endif
+#ifdef AMD_MIOPEN
+  conv_algo->FindBwdDataAlgo(handle, mode, idx,
+                            bottom_desc,
+                            conv_desc,
+                            top_desc,
+                            dy, w, dx,
+                            workspace, workspace_in_bytes);
+  ProfilerStart(handle, mode, idx, timer, "ConvBwdData");
+  MIOPEN_CALL(miopenConvolutionBackwardData(
+              mode == COMPOSED ?
+              handle.GetMIOpen(idx) : handle.GetMIOpen(),
+              alpha,
+              top_desc.Get(), dy,
+              conv_desc.GetFilter(), w,
+              conv_desc.GetConv(),
+              conv_algo->GetBwdDataAlgo(),
+              beta,
+              bottom_desc.Get(), dx,
+              workspace, workspace_in_bytes));
+  ProfilerStop(handle, mode, idx, timer, "ConvBwdData");
+#endif
+}
+
+template <typename T>
+inline void dnnmarkConvolutionBackwardFilter(const Handle &handle,
+                                             RunMode mode, int idx, Timer *timer,
+                                             const void *alpha,
+                                             const DataTensor<T> &bottom_desc,
+                                             const void *x,
+                                             const DataTensor<T> &top_desc,
+                                             const void *dy,
+                                             const ConvolutionDesc<T> &conv_desc,
+                                             ConvAlgo<T> *conv_algo,
+                                             void *workspace,
+                                             size_t workspace_in_bytes,
+                                             const void *beta,
+                                             void *dw) {
+#ifdef NVIDIA_CUDNN
+  // std::string conv_algo_param;
+  cudnnFilterDescriptor_t filter_t = conv_desc.GetFilter();
+  ProfilerStart(handle, mode, idx, timer, "ConvBwdFilter");
+  // conv_algo_param = conv_algo->GetBwdFilterAlgoParameter();
+  // // std::cout << "algo_param "<< conv_algo_param <<"\n";
+  // if (conv_algo_param == "autoex") {
+  //   conv_algo->checkAlgo4DataShape(bottom_desc.Get(),top_desc.Get(), filter_t);
+  //   // ,workspace_in_bytes);
+  //   conv_algo->FindBwdFilterAlgoEx(handle, mode, idx,
+  //                             bottom_desc,
+  //                             conv_desc,
+  //                             top_desc,
+  //                             x, dy, dw,
+  //                             workspace, workspace_in_bytes);
+
+  //   LOG(INFO) << "cuDNN AUTO selected conv. bwd filter alg. to " << conv_algo->GetBwdFilterAlgo();
+  //   std::cout << "cuDNN AUTO selected bwd convolution filter algorithm:"<<conv_algo->GetBwdFilterAlgo()<<"\n";
+  // }
+  CUDNN_CALL(cudnnConvolutionBackwardFilter(
+             mode == COMPOSED ?
+             handle.GetCudnn(idx) : handle.GetCudnn(),
+             alpha,
+             bottom_desc.Get(), x,
+             top_desc.Get(), dy,
+             conv_desc.GetConv(),
+             conv_algo->GetBwdFilterAlgo(),
+             workspace, workspace_in_bytes,
+             beta,
+             filter_t, dw));
+  ProfilerStop(handle, mode, idx, timer, "ConvBwdFilter");
+#endif
+#ifdef AMD_MIOPEN
+  conv_algo->FindBwdFilterAlgo(handle, mode, idx,
+                              bottom_desc,
+                              conv_desc,
+                              top_desc,
+                              x, dy, dw,
+                              workspace, workspace_in_bytes);
+  ProfilerStart(handle, mode, idx, timer, "ConvBwdFilter");
+  MIOPEN_CALL(miopenConvolutionBackwardWeights(
+              mode == COMPOSED ?
+              handle.GetMIOpen(idx) : handle.GetMIOpen(),
+              alpha,
+              top_desc.Get(), dy,
+              bottom_desc.Get(), x,
+              conv_desc.GetConv(),
+              conv_algo->GetBwdFilterAlgo(),
+              beta,
+              conv_desc.GetFilter(), dw,
+              workspace, workspace_in_bytes));
+  ProfilerStop(handle, mode, idx, timer, "ConvBwdFilter");
+#endif
+}
+
+//
+// Pooling forward/backward functions
+//
+
+template <typename T>
+inline void dnnmarkPoolingForward(const Handle &handle,
+                                  RunMode mode, int idx,
+                                  const PoolingDesc<T> &pooling_desc,
+                                  const void *alpha,
+                                  const DataTensor<T> &x_desc,
+                                  const void *x,
+                                  const void *beta,
+                                  const DataTensor<T> &y_desc,
+                                  void *y,
+                                  Data<T> *workspace,
+                                  size_t workspace_in_bytes) {
+#ifdef NVIDIA_CUDNN
+  CUDNN_CALL(cudnnPoolingForward(
+             mode == COMPOSED ? handle.GetCudnn(idx) : handle.GetCudnn(),
+             pooling_desc.Get(),
+             alpha,
+             x_desc.Get(), x,
+             beta,
+             y_desc.Get(), y));
+#endif
+#ifdef AMD_MIOPEN
+  LOG(INFO) << "Before MIOpen call";
+  MIOPEN_CALL(miopenPoolingForward(
+              mode == COMPOSED ? handle.GetMIOpen(idx) : handle.GetMIOpen(),
+              pooling_desc.Get(),
+              alpha,
+              x_desc.Get(), x,
+              beta,
+              y_desc.Get(), y,
+              false,
+              workspace->Get(), workspace_in_bytes));
+  LOG(INFO) << "Done with MIOpen call";
+#endif
+}
+
+template <typename T>
+inline void dnnmarkPoolingBackward(const Handle &handle,
+                                   RunMode mode, int idx,
+                                   const PoolingDesc<T> &pooling_desc,
+                                   const void *alpha,
+                                   const DataTensor<T> &y_desc,
+                                   const void *y,
+                                   const DataTensor<T> &dy_desc,
+                                   const void *dy,
+                                   const DataTensor<T> &x_desc,
+                                   const void *x,
+                                   const void *beta,
+                                   const DataTensor<T> &dx_desc,
+                                   void *dx,
+                                   Data<T> *workspace) {
+#ifdef NVIDIA_CUDNN
+  CUDNN_CALL(cudnnPoolingBackward(
+             mode == COMPOSED ? handle.GetCudnn(idx) : handle.GetCudnn(),
+             pooling_desc.Get(),
+             alpha,
+             y_desc.Get(), y,
+             dy_desc.Get(), dy,
+             x_desc.Get(), x,
+             beta,
+             dx_desc.Get(), dx));
+#endif
+#ifdef AMD_MIOPEN
+  MIOPEN_CALL(miopenPoolingBackward(
+              mode == COMPOSED ? handle.GetMIOpen(idx) : handle.GetMIOpen(),
+              pooling_desc.Get(),
+              alpha,
+              y_desc.Get(), y,
+              dy_desc.Get(), dy,
+              x_desc.Get(), x,
+              beta,
+              dx_desc.Get(), dx,
+              workspace->Get()));
+#endif
+}
+
+//
+// Activation forward/backward functions
+//
+
+template <typename T>
+inline void dnnmarkActivationForward(const Handle &handle,
+                         RunMode mode, int idx,
+                         const ActivationDesc<T> &activation_desc,
+                         const void *alpha,
+                         const DataTensor<T> &bottom_desc,
+                         const void *x,
+                         const void *beta,
+                         const DataTensor<T> &top_desc,
+                         void *y) {
+#ifdef NVIDIA_CUDNN
+  CUDNN_CALL(cudnnActivationForward(
+             mode == COMPOSED ?
+             handle.GetCudnn(idx) : handle.GetCudnn(),
+             activation_desc.Get(),
+             alpha,
+             bottom_desc.Get(), x,
+             beta,
+             top_desc.Get(), y));
+#endif
+#ifdef AMD_MIOPEN
+  MIOPEN_CALL(miopenActivationForward(
+              mode == COMPOSED ?
+              handle.GetMIOpen(idx) : handle.GetMIOpen(),
+              activation_desc.Get(),
+              alpha,
+              bottom_desc.Get(), x,
+              beta,
+              top_desc.Get(), y));
+#endif
+}
+
+template <typename T>
+inline void dnnmarkActivationBackward(const Handle &handle,
+                         RunMode mode, int idx,
+                         const ActivationDesc<T> &activation_desc,
+                         const void *alpha,
+                         const DataTensor<T> &top_desc,
+                         const void *y,
+                         const void *dy,
+                         const void *beta,
+                         const DataTensor<T> &bottom_desc,
+                         const void *x,
+                         void *dx) {
+#ifdef NVIDIA_CUDNN
+  CUDNN_CALL(cudnnActivationBackward(
+             mode == COMPOSED ?
+             handle.GetCudnn(idx) : handle.GetCudnn(),
+             activation_desc.Get(),
+             alpha,
+             top_desc.Get(), y,
+             top_desc.Get(), dy,
+             bottom_desc.Get(), x,
+             beta,
+             bottom_desc.Get(), dx));
+#endif
+#ifdef AMD_MIOPEN
+  MIOPEN_CALL(miopenActivationBackward(
+              mode == COMPOSED ?
+              handle.GetMIOpen(idx) : handle.GetMIOpen(),
+              activation_desc.Get(),
+              alpha,
+              top_desc.Get(), y,
+              top_desc.Get(), dy,
+              bottom_desc.Get(), x,
+              beta,
+              bottom_desc.Get(), dx));
+#endif
+}
+
+//
+// LRN forward/backward functions
+//
+
+template <typename T>
+inline void dnnmarkLRNForward(const Handle &handle,
+                         RunMode mode, int idx,
+                         const LRNDesc<T> &lrn_desc,
+                         const LRNParam &lrn_param,
+                         const void *alpha,
+                         const DataTensor<T> &bottom_desc,
+                         const void *x,
+                         const void *beta,
+                         const DataTensor<T> &top_desc,
+                         void *y,
+                         Data<T> *workspace) {
+#ifdef NVIDIA_CUDNN
+  CUDNN_CALL(cudnnLRNCrossChannelForward(
+             mode == COMPOSED ?
+             handle.GetCudnn(idx) : handle.GetCudnn(),
+             lrn_desc.Get(),
+             lrn_param.mode_,
+             alpha,
+             bottom_desc.Get(), x,
+             beta,
+             top_desc.Get(), y));
+#endif
+#ifdef AMD_MIOPEN
+  MIOPEN_CALL(miopenLRNForward(
+              mode == COMPOSED ?
+              handle.GetMIOpen(idx) : handle.GetMIOpen(),
+              lrn_desc.Get(),
+              alpha,
+              bottom_desc.Get(), x,
+              beta,
+              top_desc.Get(), y,
+              true, workspace->Get()));
+#endif
+}
+
+template <typename T>
+inline void dnnmarkLRNBackward(const Handle &handle,
+                         RunMode mode, int idx,
+                         const LRNDesc<T> &lrn_desc,
+                         const LRNParam &lrn_param,
+                         const void *alpha,
+                         const DataTensor<T> &top_desc,
+                         const void *y,
+                         const void *dy,
+                         const void *beta,
+                         const DataTensor<T> &bottom_desc,
+                         const void *x,
+                         void *dx,
+                         Data<T> *workspace) {
+#ifdef NVIDIA_CUDNN
+  CUDNN_CALL(cudnnLRNCrossChannelBackward(
+             mode == COMPOSED ?
+             handle.GetCudnn(idx) : handle.GetCudnn(),
+             lrn_desc.Get(),
+             lrn_param.mode_,
+             alpha,
+             top_desc.Get(), y,
+             top_desc.Get(), dy,
+             bottom_desc.Get(), x,
+             beta,
+             bottom_desc.Get(), dx));
+#endif
+#ifdef AMD_MIOPEN
+  MIOPEN_CALL(miopenLRNBackward(
+              mode == COMPOSED ?
+              handle.GetMIOpen(idx) : handle.GetMIOpen(),
+              lrn_desc.Get(),
+              alpha,
+              top_desc.Get(), y,
+              top_desc.Get(), dy,
+              bottom_desc.Get(), x,
+              beta,
+              bottom_desc.Get(), dx,
+              workspace->Get()));
+#endif
+}
+
+//
+// Fully Connected forward/backward functions
+//
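+// (No wrapper is defined here: the fully connected layer uses the GEMM
+// wrapper from gemm_wrapper.h instead.)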
+
+//
+// Softmax forward/backward functions
+//
+
+template <typename T>
+inline void dnnmarkSoftmaxForward(const Handle &handle,
+                         RunMode mode, int idx,
+                         const SoftmaxParam &softmax_param,
+                         const void *alpha,
+                         const DataTensor<T> &bottom_desc,
+                         const void *x,
+                         const void *beta,
+                         const DataTensor<T> &top_desc,
+                         void *y) {
+#ifdef NVIDIA_CUDNN
+  CUDNN_CALL(cudnnSoftmaxForward(
+             mode == COMPOSED ?
+             handle.GetCudnn(idx) : handle.GetCudnn(),
+             softmax_param.algo_,
+             softmax_param.mode_,
+             alpha,
+             bottom_desc.Get(), x,
+             beta,
+             top_desc.Get(), y));
+#endif
+#ifdef AMD_MIOPEN
+  MIOPEN_CALL(miopenSoftmaxForward(
+              mode == COMPOSED ?
+              handle.GetMIOpen(idx) : handle.GetMIOpen(),
+              alpha,
+              bottom_desc.Get(), x,
+              beta,
+              top_desc.Get(), y));
+#endif
+}
+
+template <typename T>
+inline void dnnmarkSoftmaxBackward(const Handle &handle,
+                         RunMode mode, int idx,
+                         const SoftmaxParam &softmax_param,
+                         const void *alpha,
+                         const DataTensor<T> &top_desc,
+                         const void *y,
+                         const void *dy,
+                         const void *beta,
+                         const DataTensor<T> &bottom_desc,
+                         void *dx) {
+#ifdef NVIDIA_CUDNN
+  CUDNN_CALL(cudnnSoftmaxBackward(
+             mode == COMPOSED ?
+             handle.GetCudnn(idx) : handle.GetCudnn(),
+             softmax_param.algo_,
+             softmax_param.mode_,
+             alpha,
+             top_desc.Get(), y,
+             top_desc.Get(), dy,
+             beta,
+             bottom_desc.Get(), dx));
+#endif
+#ifdef AMD_MIOPEN
+  MIOPEN_CALL(miopenSoftmaxBackward(
+             mode == COMPOSED ?
+             handle.GetMIOpen(idx) : handle.GetMIOpen(),
+             alpha,
+             top_desc.Get(), y,
+             top_desc.Get(), dy,
+             beta,
+             bottom_desc.Get(), dx));
+
+#endif
+}
+
+//
+// Batch Normalization forward/backward functions
+//
+
+template <typename T>
+inline void dnnmarkBatchNormalizationForwardTraining(
+            const Handle &handle,
+            RunMode mode, int idx,
+            const BatchNormParam &bn_param,
+            void *alpha,
+            void *beta,
+            const DataTensor<T> &bottom_desc,
+            const void *x,
+            const DataTensor<T> &top_desc,
+            void *y,
+            const DataTensor<T> &scale_bias_mean_var_desc,
+            void *bn_scale,
+            void *bn_bias,
+            double exp_avg_factor,
+            void *result_running_mean,
+            void *result_running_var,
+            double epsilon,
+            void *result_save_mean,
+            void *result_save_var) {
+#ifdef NVIDIA_CUDNN
+  CUDNN_CALL(cudnnBatchNormalizationForwardTraining(
+             mode == COMPOSED ?
+             handle.GetCudnn(idx) : handle.GetCudnn(),
+             bn_param.mode_,
+             alpha,
+             beta,
+             bottom_desc.Get(), x,
+             top_desc.Get(), y,
+             scale_bias_mean_var_desc.Get(),
+             bn_scale, bn_bias,
+             exp_avg_factor,
+             result_running_mean, result_running_var,
+             epsilon,
+             result_save_mean, result_save_var));
+#endif
+#ifdef AMD_MIOPEN
+  MIOPEN_CALL(miopenBatchNormalizationForwardTraining(
+              mode == COMPOSED ?
+              handle.GetMIOpen(idx) : handle.GetMIOpen(),
+              bn_param.mode_,
+              alpha,
+              beta,
+              bottom_desc.Get(), x,
+              top_desc.Get(), y,
+              scale_bias_mean_var_desc.Get(),
+              bn_scale, bn_bias,
+              exp_avg_factor,
+              result_running_mean, result_running_var,
+              epsilon,
+              result_save_mean, result_save_var));
+#endif
+}
+
+template <typename T>
+inline void dnnmarkBatchNormalizationBackward(
+            const Handle &handle,
+            RunMode mode, int idx,
+            const BatchNormParam &bn_param,
+            const void *alpha_data_diff,
+            const void *beta_data_diff,
+            const void *alpha_param_diff,
+            const void *beta_param_diff,
+            const DataTensor<T> &bottom_desc,
+            const void *x,
+            void *dx,
+            const DataTensor<T> &top_desc,
+            const void *dy,
+            const DataTensor<T> &scale_bias_mean_var_desc,
+            const void *bn_scale,
+            void *result_bn_scale_diff,
+            void *result_bn_bias_diff,
+            double epsilon,
+            const void *saved_mean,
+            const void *saved_var) {
+#ifdef NVIDIA_CUDNN
+  CUDNN_CALL(cudnnBatchNormalizationBackward(
+             mode == COMPOSED ?
+             handle.GetCudnn(idx) : handle.GetCudnn(),
+             bn_param.mode_,
+             alpha_data_diff,
+             beta_data_diff,
+             alpha_param_diff,
+             beta_param_diff,
+             bottom_desc.Get(), x,
+             top_desc.Get(), dy,
+             bottom_desc.Get(), dx,
+             scale_bias_mean_var_desc.Get(),
+             bn_scale,
+             result_bn_scale_diff, result_bn_bias_diff,
+             epsilon,
+             saved_mean, saved_var));
+#endif
+#ifdef AMD_MIOPEN
+  MIOPEN_CALL(miopenBatchNormalizationBackward(
+              mode == COMPOSED ?
+              handle.GetMIOpen(idx) : handle.GetMIOpen(),
+              bn_param.mode_,
+              alpha_data_diff,
+              beta_data_diff,
+              alpha_param_diff,
+              beta_param_diff,
+              bottom_desc.Get(), x,
+              top_desc.Get(), dy,
+              bottom_desc.Get(), dx,
+              scale_bias_mean_var_desc.Get(),
+              bn_scale,
+              result_bn_scale_diff, result_bn_bias_diff,
+              epsilon,
+              saved_mean, saved_var));
+#endif
+}
+
+//
+// Bypass layer
+//
+
+template <typename T>
+inline void dnnmarkBypassForward(const Handle &handle,
+                         RunMode mode, int idx,
+                         const BypassDesc<T> &bypass_desc,
+                         const void *alpha,
+                         const DataTensor<T> &bottom_desc,
+                         const void *x,
+                         const void *beta,
+                         const DataTensor<T> &top_desc,
+                         void *y) {
+#ifdef NVIDIA_CUDNN
+  CUDA_CALL(cudaMemcpy(y,
+                       x,
+                       sizeof(T) * bypass_desc.Get().n_
+                                 * bypass_desc.Get().c_
+                                 * bypass_desc.Get().h_
+                                 * bypass_desc.Get().w_,
+                       cudaMemcpyDeviceToDevice
+                       ));
+#endif
+#ifdef AMD_MIOPEN
+  MIOPEN_CALL(miopenActivationForward(
+              mode == COMPOSED ?
+              handle.GetMIOpen(idx) : handle.GetMIOpen(),
+              bypass_desc.Get(),
+              alpha,
+              bottom_desc.Get(), x,
+              beta,
+              top_desc.Get(), y));
+#endif
+}
+
+template <typename T>
+inline void dnnmarkBypassBackward(const Handle &handle,
+                         RunMode mode, int idx,
+                         const BypassDesc<T> &bypass_desc,
+                         const void *alpha,
+                         const DataTensor<T> &top_desc,
+                         const void *y,
+                         const void *dy,
+                         const void *beta,
+                         const DataTensor<T> &bottom_desc,
+                         const void *x,
+                         void *dx) {
+#ifdef NVIDIA_CUDNN
+  CUDA_CALL(cudaMemcpy(dx,
+                       dy,
+                       sizeof(T) * bypass_desc.Get().n_
+                                 * bypass_desc.Get().c_
+                                 * bypass_desc.Get().h_
+                                 * bypass_desc.Get().w_,
+                       cudaMemcpyDeviceToDevice
+                       ));
+#endif
+#ifdef AMD_MIOPEN
+  MIOPEN_CALL(miopenActivationBackward(
+              mode == COMPOSED ?
+              handle.GetMIOpen(idx) : handle.GetMIOpen(),
+              bypass_desc.Get(),
+              alpha,
+              top_desc.Get(), y,
+              top_desc.Get(), dy,
+              bottom_desc.Get(), x,
+              beta,
+              bottom_desc.Get(), dx));
+#endif
+}
+
+//
+// Dropout layer
+//
+
+template <typename T>
+inline void dnnmarkDropoutForward(const Handle &handle,
+                         RunMode mode, int idx,
+                         const DropoutDesc<T> &dropout_desc,
+                         const DataTensor<T> &bottom_desc,
+                         const void *x,
+                         const DataTensor<T> &top_desc,
+                         void *y,
+                         void *reserve_space, size_t reserve_space_size) {
+#ifdef NVIDIA_CUDNN
+  CUDNN_CALL(cudnnDropoutForward(
+          mode == COMPOSED ?
+          handle.GetCudnn(idx) : handle.GetCudnn(),
+          dropout_desc.Get(),
+          bottom_desc.Get(), x,
+          top_desc.Get(), y,
+          reserve_space,
+          reserve_space_size
+          ));
+#endif
+#ifdef AMD_MIOPEN
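+  // Dropout is not implemented for the MIOpen path, so this call is a no-op
+  // on AMD builds.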
+#endif
+}
+
+template <typename T>
+inline void dnnmarkDropoutBackward(const Handle &handle,
+                         RunMode mode, int idx,
+                         const DropoutDesc<T> &dropout_desc,
+                         const DataTensor<T> &top_desc,
+                         const void *dy,
+                         const DataTensor<T> &bottom_desc,
+                         void *dx,
+                         void *reserve_space, size_t reserve_space_size) {
+#ifdef NVIDIA_CUDNN
+  CUDNN_CALL(cudnnDropoutBackward(
+          mode == COMPOSED ?
+          handle.GetCudnn(idx) : handle.GetCudnn(),
+          dropout_desc.Get(),
+          top_desc.Get(), dy,
+          bottom_desc.Get(), dx,
+          reserve_space,
+          reserve_space_size
+          ));
+#endif
+#ifdef AMD_MIOPEN
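+  // As above, there is no MIOpen dropout implementation; this is a no-op on
+  // AMD builds.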
+#endif
+}
+
+} // namespace dnnmark
+
+#endif // CORE_INCLUDE_DNN_WRAPPER_H_
diff --git a/src/DNNMark/core/include/dnnmark.h b/src/DNNMark/core/include/dnnmark.h
new file mode 100644
index 0000000..cda7812
--- /dev/null
+++ b/src/DNNMark/core/include/dnnmark.h
@@ -0,0 +1,123 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Northeastern University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#ifndef CORE_INCLUDE_DNNMARK_H_
+#define CORE_INCLUDE_DNNMARK_H_
+
+#include <cstdlib>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <list>
+#include <map>
+#include <memory>
+#include <glog/logging.h>
+
+#include "common.h"
+#include "utility.h"
+#include "timer.h"
+#include "gemm_wrapper.h"
+#include "dnn_wrapper.h"
+#include "dnn_config_keywords.h"
+#include "dnn_param.h"
+#include "dnn_utility.h"
+#include "data_manager.h"
+#include "dnn_layer.h"
+
+#include "activation_layer.h"
+#include "bn_layer.h"
+#include "bypass_layer.h"
+#include "conv_layer.h"
+#include "dropout_layer.h"
+#include "fc_layer.h"
+#include "lrn_layer.h"
+#include "pool_layer.h"
+#include "softmax_layer.h"
+
+namespace dnnmark {
+
+const std::map<std::string, LayerType> layer_type_map = {
+  {layer_section_keywords[0], CONVOLUTION},
+  {layer_section_keywords[1], POOLING},
+  {layer_section_keywords[2], LRN},
+  {layer_section_keywords[3], ACTIVATION},
+  {layer_section_keywords[4], FC},
+  {layer_section_keywords[5], SOFTMAX},
+  {layer_section_keywords[6], BN},
+  {layer_section_keywords[7], DROPOUT},
+  {layer_section_keywords[8], BYPASS}
+};
+
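+// Top-level benchmark driver. A typical flow (a sketch of what the benchmark
+// mains do): construct DNNMark with an mmap file, call ParseAllConfig() on a
+// .dnnmark config, then Initialize(), Forward() and/or Backward(), and
+// finally TearDown().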
+template <typename T>
+class DNNMark {
+ private:
+  RunMode run_mode_;
+  Handle handle_;
+  // The map is ordered, so we don't need other container to store the layers
+  std::map<int, std::shared_ptr<Layer<T>>> layers_map_;
+  std::map<std::string, int> name_id_map_;
+  int num_layers_added_;
+
+  // Timer
+  Timer timer_;
+
+  // Private functions
+  void SetLayerParams(LayerType layer_type,
+                      int current_layer_id,
+                      const std::string &var,
+                      const std::string &val);
+
+ public:
+
+  DNNMark(const std::string &mmap_file);
+  DNNMark(int num_layers, const std::string &mmap_file);
+  void ParseAllConfig(const std::string &config_file);
+  int ParseGeneralConfig(const std::string &config_file);
+  int ParseLayerConfig(const std::string &config_file);
+  int Initialize();
+  int RunAll();
+  int Forward();
+  int Backward();
+
+  int TearDown() {
+    DataManager<T>::GetInstance()->DataManager<T>::~DataManager();
+    layers_map_.clear();
+    return 0;
+  };
+
+  Handle *GetHandle() { return &handle_; }
+  Layer<T> *GetLayerByID(int layer_id) { return layers_map_[layer_id].get(); }
+  Layer<T> *GetLayerByName(const std::string &name) {
+    return layers_map_[name_id_map_[name]].get();
+  }
+  bool isLayerExist(const std::string &name) {
+    return name_id_map_.find(name) != name_id_map_.end();
+  }
+  RunMode getRunMode() { return run_mode_; }
+
+  Timer *GetTimer() { return &timer_; }
+
+};
+
+} // namespace dnnmark
+
+#endif // CORE_INCLUDE_DNNMARK_H_
diff --git a/src/DNNMark/core/include/gemm_wrapper.h b/src/DNNMark/core/include/gemm_wrapper.h
new file mode 100644
index 0000000..d7db2d9
--- /dev/null
+++ b/src/DNNMark/core/include/gemm_wrapper.h
@@ -0,0 +1,44 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Northeastern University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#ifndef CORE_INCLUDE_GEMM_H_
+#define CORE_INCLUDE_GEMM_H_
+
+#include "common.h"
+#include "dnn_utility.h"
+
+namespace dnnmark {
+
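+// Declaration of the GEMM helper used by the fully connected layer; the
+// definition lives outside this header.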
+template <typename T>
+void dnnmarkGEMM(const Handle &handle, RunMode mode, int idx,
+                 bool is_a_transpose, bool is_b_transpose,
+                 int m, int n, int k,
+                 T *alpha,
+                 T *a, int lda,
+                 T *b, int ldb,
+                 T *beta,
+                 T *c, int ldc);
+
+} // namespace dnnmark
+
+#endif // CORE_INCLUDE_GEMM_H_
+
diff --git a/src/DNNMark/core/include/layers/activation_layer.h b/src/DNNMark/core/include/layers/activation_layer.h
new file mode 100644
index 0000000..976d400
--- /dev/null
+++ b/src/DNNMark/core/include/layers/activation_layer.h
@@ -0,0 +1,177 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Northeastern University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#ifndef CORE_INCLUDE_LAYERS_ACTIVATION_LAYER_H_
+#define CORE_INCLUDE_LAYERS_ACTIVATION_LAYER_H_
+
+#include "dnn_layer.h"
+
+namespace dnnmark {
+
+template <typename T>
+class ActivationLayer : public Layer<T> {
+  // using declaration for calling member from base class
+  using Layer<T>::p_dnnmark_;
+  using Layer<T>::layer_id_;
+  using Layer<T>::previous_layer_name_;
+  using Layer<T>::input_dim_;
+  using Layer<T>::output_dim_;
+  using Layer<T>::bottom_desc_;
+  using Layer<T>::top_desc_;
+  using Layer<T>::data_manager_;
+
+  using Layer<T>::num_bottoms_;
+  using Layer<T>::bottoms_;
+  using Layer<T>::bottom_chunk_ids_;
+  using Layer<T>::bottom_diffs_;
+  using Layer<T>::bottom_diff_chunk_ids_;
+
+  using Layer<T>::num_tops_;
+  using Layer<T>::tops_;
+  using Layer<T>::top_chunk_ids_;
+  using Layer<T>::top_diffs_;
+  using Layer<T>::top_diff_chunk_ids_;
+
+ private:
+  ActivationParam activation_param_;
+
+  // Activation specific descriptor
+  ActivationDesc<T> desc_;
+
+ public:
+  ActivationLayer(DNNMark<T> *p_dnnmark)
+  : Layer<T>(p_dnnmark),
+    activation_param_(), desc_() {
+  }
+
+  ActivationParam *getActivationParam() { return &activation_param_; }
+
+  void Setup() {
+    // Set up indispensable stuff here
+    Layer<T>::Setup();
+
+    // Set activation-related descriptors
+    desc_.Set(activation_param_);
+
+    // Set up activation-related data
+    if (input_dim_.n_ != 0 && input_dim_.c_ != 0 &&
+        input_dim_.h_ != 0 && input_dim_.w_ != 0) {
+      //
+      // Standalone mode
+      //
+
+      // Compute dimension of output data
+      ComputeOutputDim();
+
+      // Set top tensor
+      top_desc_.Set(output_dim_.n_,
+                    output_dim_.c_,
+                    output_dim_.h_,
+                    output_dim_.w_);
+
+      // Prepare top data
+      int top_size = output_dim_.n_ *
+                     output_dim_.c_ *
+                     output_dim_.h_ *
+                     output_dim_.w_;
+      for (int i = 0; i < num_tops_; i++) {
+        top_chunk_ids_.push_back(
+          data_manager_->CreateData(top_size));
+        tops_.push_back(
+          data_manager_->GetData(top_chunk_ids_[i]));
+        top_diff_chunk_ids_.push_back(
+          data_manager_->CreateData(top_size));
+        top_diffs_.push_back(
+          data_manager_->GetData(top_diff_chunk_ids_[i]));
+      }
+    }
+  }
+
+  void ComputeOutputDim() {
+    output_dim_.n_ = input_dim_.n_;
+    output_dim_.c_ = input_dim_.c_;
+    output_dim_.h_ = input_dim_.h_;
+    output_dim_.w_ = input_dim_.w_;
+  }
+
+  void ForwardPropagation() {
+    if (p_dnnmark_->getRunMode() == STANDALONE ||
+        !previous_layer_name_.compare("null")) {
+      // Fill the bottom data
+      for (int i = 0; i < num_bottoms_; i++) {
+        bottoms_[i]->Filler();
+      }
+    }
+
+    // Activation forward computation
+    ProfilerStart(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "ActFwd");
+    for (int i = 0; i < num_bottoms_; i++) {
+      dnnmarkActivationForward(
+             *(p_dnnmark_->GetHandle()),
+             p_dnnmark_->getRunMode(), layer_id_,
+             desc_,
+             DataType<T>::one,
+             bottom_desc_, bottoms_[i]->Get(),
+             DataType<T>::zero,
+             top_desc_, tops_[i]->Get());
+    }
+    ProfilerStop(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "ActFwd");
+
+  }
+  void BackwardPropagation() {
+    if (p_dnnmark_->getRunMode() == STANDALONE ||
+        !previous_layer_name_.compare("null")) {
+      // Fill the top and top diff data
+      for (int i = 0; i < num_tops_; i++) {
+        tops_[i]->Filler();
+        top_diffs_[i]->Filler();
+      }
+      // Fill the bottom data
+      for (int i = 0; i < num_bottoms_; i++) {
+        bottoms_[i]->Filler();
+      }
+    }
+
+    // Activation backward computation
+    ProfilerStart(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "ActBwd");
+    for (int i = 0; i < num_bottoms_; i++) {
+      dnnmarkActivationBackward(
+             *(p_dnnmark_->GetHandle()),
+             p_dnnmark_->getRunMode(), layer_id_,
+             desc_,
+             DataType<T>::one,
+             top_desc_, tops_[i]->Get(), top_diffs_[i]->Get(),
+             DataType<T>::zero,
+             bottom_desc_, bottoms_[i]->Get(), bottom_diffs_[i]->Get());
+    }
+    ProfilerStop(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "ActBwd");
+  }
+
+};
+
+} // namespace dnnmark
+
+#endif // CORE_INCLUDE_LAYERS_ACTIVATION_LAYER_H_
diff --git a/src/DNNMark/core/include/layers/bn_layer.h b/src/DNNMark/core/include/layers/bn_layer.h
new file mode 100644
index 0000000..5529d18
--- /dev/null
+++ b/src/DNNMark/core/include/layers/bn_layer.h
@@ -0,0 +1,269 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Northeastern University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#ifndef CORE_INCLUDE_LAYERS_BN_LAYER_H_
+#define CORE_INCLUDE_LAYERS_BN_LAYER_H_
+
+#include "dnn_layer.h"
+
+namespace dnnmark {
+
+template <typename T>
+class BatchNormLayer : public Layer<T> {
+  // using declaration for calling member from base class
+  using Layer<T>::p_dnnmark_;
+  using Layer<T>::layer_id_;
+  using Layer<T>::previous_layer_name_;
+  using Layer<T>::input_dim_;
+  using Layer<T>::output_dim_;
+  using Layer<T>::bottom_desc_;
+  using Layer<T>::top_desc_;
+  using Layer<T>::data_manager_;
+
+  using Layer<T>::num_bottoms_;
+  using Layer<T>::bottoms_;
+  using Layer<T>::bottom_chunk_ids_;
+  using Layer<T>::bottom_diffs_;
+  using Layer<T>::bottom_diff_chunk_ids_;
+
+  using Layer<T>::num_tops_;
+  using Layer<T>::tops_;
+  using Layer<T>::top_chunk_ids_;
+  using Layer<T>::top_diffs_;
+  using Layer<T>::top_diff_chunk_ids_;
+
+ private:
+  BatchNormParam bn_param_;
+  DataTensor<T> bn_specifics_desc_;
+  int bn_specifics_size_;
+  Data<T> *bn_scale_;
+  int bn_scale_chunk_id_;
+  Data<T> *bn_scale_diffs_;
+  int bn_scale_diffs_chunk_id_;
+  Data<T> *bn_bias_;
+  int bn_bias_chunk_id_;
+  Data<T> *bn_bias_diffs_;
+  int bn_bias_diffs_chunk_id_;
+  Data<T> *bn_running_mean_;
+  int bn_running_mean_chunk_id_;
+  Data<T> *bn_running_inv_variance_;
+  int bn_running_inv_variance_chunk_id_;
+  Data<T> *bn_saved_mean_;
+  int bn_saved_mean_chunk_id_;
+  Data<T> *bn_saved_inv_variance_;
+  int bn_saved_inv_variance_chunk_id_;
+
+  // Work around for MIOpen library
+  T alpha_;
+  T beta_;
+
+ public:
+  BatchNormLayer(DNNMark<T> *p_dnnmark)
+  : Layer<T>(p_dnnmark),
+    bn_param_() {
+      alpha_ = 1.0;
+      beta_ = 0.0;
+  }
+
+  BatchNormParam *getBatchNormParam() { return &bn_param_; }
+
+  void Setup() {
+    // Set up indispensable stuff here
+    Layer<T>::Setup();
+
+    // Set up batch normalization related data
+    if(bn_param_.epsilon_ < BN_MIN_EPSILON) {
+      LOG(FATAL) << "The value of epsilon cannot be less than BN_MIN_EPSILON."
+                 << "This value is defined as " << BN_MIN_EPSILON;
+    }
+    if((BatchNormMode)(bn_param_.mode_) == PerActivation) {
+      bn_specifics_desc_.Set(1, input_dim_.c_, input_dim_.h_, input_dim_.w_);
+      bn_specifics_size_ = input_dim_.c_ * input_dim_.h_ * input_dim_.w_;
+    }
+    else if ((BatchNormMode)(bn_param_.mode_) == Spatial) {
+      bn_specifics_desc_.Set(1, input_dim_.c_, 1, 1);
+      bn_specifics_size_ = input_dim_.c_;
+    }
+
+    // Initialize bn_scale_, bn_scale_diffs_, bn_bias_, bn_bias_diffs_,
+    // bn_running_mean_, and bn_running_inv_variance_
+    bn_scale_chunk_id_ = data_manager_->CreateData(bn_specifics_size_);
+    bn_scale_ = data_manager_->GetData(bn_scale_chunk_id_);
+    bn_scale_diffs_chunk_id_ = data_manager_->CreateData(bn_specifics_size_);
+    bn_scale_diffs_ = data_manager_->GetData(bn_scale_diffs_chunk_id_);
+    bn_bias_chunk_id_ = data_manager_->CreateData(bn_specifics_size_);
+    bn_bias_ = data_manager_->GetData(bn_bias_chunk_id_);
+    bn_bias_diffs_chunk_id_ = data_manager_->CreateData(bn_specifics_size_);
+    bn_bias_diffs_ = data_manager_->GetData(bn_bias_diffs_chunk_id_);
+    bn_running_mean_chunk_id_ = data_manager_->CreateData(bn_specifics_size_);
+    bn_running_mean_ = data_manager_->GetData(bn_running_mean_chunk_id_);
+    bn_running_inv_variance_chunk_id_ = data_manager_->CreateData(bn_specifics_size_);
+    bn_running_inv_variance_ = data_manager_->GetData(bn_running_inv_variance_chunk_id_);
+
+    bn_scale_->Filler();
+    bn_bias_->Filler();
+    bn_running_mean_->Filler();
+    bn_running_inv_variance_->Filler();
+
+    // All of these tensors use the bn_specifics_ tensor descriptor
+    if(bn_param_.save_intermediates_) {
+      bn_saved_mean_chunk_id_ = data_manager_->CreateData(bn_specifics_size_);
+      bn_saved_mean_ = data_manager_->GetData(bn_saved_mean_chunk_id_);
+      bn_saved_inv_variance_chunk_id_ = data_manager_->CreateData(bn_specifics_size_);
+      bn_saved_inv_variance_ = data_manager_->GetData(bn_saved_inv_variance_chunk_id_);
+
+      bn_saved_mean_->Filler();
+      bn_saved_inv_variance_->Filler();
+    }
+    else {
+      bn_saved_mean_ = nullptr;
+      bn_saved_inv_variance_ = nullptr;
+    }
+
+    if (input_dim_.n_ != 0 && input_dim_.c_ != 0 &&
+        input_dim_.h_ != 0 && input_dim_.w_ != 0) {
+      //
+      // Standalone mode
+      //
+
+      // Compute dimension of output data
+      ComputeOutputDim();
+
+      // Set top tensor
+      top_desc_.Set(output_dim_.n_,
+                    output_dim_.c_,
+                    output_dim_.h_,
+                    output_dim_.w_);
+
+      // Prepare top data
+      int top_size = output_dim_.n_ *
+                     output_dim_.c_ *
+                     output_dim_.h_ *
+                     output_dim_.w_;
+      for (int i = 0; i < num_tops_; i++) {
+        top_chunk_ids_.push_back(
+          data_manager_->CreateData(top_size));
+        tops_.push_back(
+          data_manager_->GetData(top_chunk_ids_[i]));
+        top_diff_chunk_ids_.push_back(
+          data_manager_->CreateData(top_size));
+        top_diffs_.push_back(
+          data_manager_->GetData(top_diff_chunk_ids_[i]));
+      }
+
+    }
+  }
+
+  void ComputeOutputDim() {
+    output_dim_.n_ = input_dim_.n_;
+    output_dim_.c_ = input_dim_.c_;
+    output_dim_.h_ = input_dim_.h_;
+    output_dim_.w_ = input_dim_.w_;
+  }
+
+  void ForwardPropagation() {
+    if (p_dnnmark_->getRunMode() == STANDALONE ||
+        !previous_layer_name_.compare("null")) {
+      // Fill the bottom data
+      for (int i = 0; i < num_bottoms_; i++) {
+        bottoms_[i]->Filler();
+      }
+    }
+
+    // Batch normalization forward computation
+    ProfilerStart(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "BnFwd");
+    for (int i = 0; i < num_bottoms_; i++) {
+      dnnmarkBatchNormalizationForwardTraining(
+              *(p_dnnmark_->GetHandle()),
+              p_dnnmark_->getRunMode(), layer_id_,
+              bn_param_,
+              //DataType<T>::one,
+              //DataType<T>::zero,
+              &alpha_,
+              &beta_,
+              bottom_desc_, bottoms_[i]->Get(),
+              top_desc_, tops_[i]->Get(),
+              bn_specifics_desc_,
+              bn_scale_->Get(),
+              bn_bias_->Get(),
+              bn_param_.exp_avg_factor_,
+              bn_running_mean_->Get(),
+              bn_running_inv_variance_->Get(),
+              bn_param_.epsilon_,
+              bn_saved_mean_ ? bn_saved_mean_->Get() : nullptr,
+              bn_saved_inv_variance_ ? bn_saved_inv_variance_->Get() : nullptr
+              );
+    }
+    ProfilerStop(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "BnFwd");
+  }
+
+  void BackwardPropagation() {
+    if (p_dnnmark_->getRunMode() == STANDALONE ||
+        !previous_layer_name_.compare("null")) {
+      // Fill the top and top diff data
+      for (int i = 0; i < num_tops_; i++) {
+        tops_[i]->Filler();
+        top_diffs_[i]->Filler();
+      }
+      // Fill the bottom data
+      for (int i = 0; i < num_bottoms_; i++) {
+        bottoms_[i]->Filler();
+      }
+    }
+
+    // Batch normalization backward computation
+    ProfilerStart(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "BnBwd");
+    for (int i = 0; i < num_tops_; i++) {
+      dnnmarkBatchNormalizationBackward(
+              *(p_dnnmark_->GetHandle()),
+              p_dnnmark_->getRunMode(), layer_id_,
+              bn_param_,
+              //DataType<T>::one,
+              //DataType<T>::zero,
+              //DataType<T>::one,
+              //DataType<T>::zero,
+              &alpha_,
+              &beta_,
+              &alpha_,
+              &beta_,
+              bottom_desc_, bottoms_[i]->Get(), bottom_diffs_[i]->Get(),
+              top_desc_, top_diffs_[i]->Get(),
+              bn_specifics_desc_,
+              bn_scale_->Get(),
+              bn_scale_diffs_->Get(),
+              bn_bias_diffs_->Get(),
+              bn_param_.epsilon_,
+              bn_saved_mean_ ? bn_saved_mean_->Get() : nullptr,
+              bn_saved_inv_variance_ ? bn_saved_inv_variance_->Get() : nullptr
+              );
+    }
+    ProfilerStop(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "BnBwd");
+  }
+
+};
+
+} // namespace dnnmark
+
+#endif // CORE_INCLUDE_LAYERS_BN_LAYER_H_
diff --git a/src/DNNMark/core/include/layers/bypass_layer.h b/src/DNNMark/core/include/layers/bypass_layer.h
new file mode 100644
index 0000000..a73614a
--- /dev/null
+++ b/src/DNNMark/core/include/layers/bypass_layer.h
@@ -0,0 +1,180 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Northeastern University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#ifndef CORE_INCLUDE_LAYERS_BYPASS_LAYER_H_
+#define CORE_INCLUDE_LAYERS_BYPASS_LAYER_H_
+
+#include "dnn_layer.h"
+
+namespace dnnmark {
+
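+// Bypass (identity) layer: forward copies the bottom data to the top, and
+// backward copies the top diffs to the bottom diffs via the wrapper calls below.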
+template <typename T>
+class BypassLayer : public Layer<T> {
+  // using declaration for calling member from base class
+  using Layer<T>::p_dnnmark_;
+  using Layer<T>::layer_id_;
+  using Layer<T>::previous_layer_name_;
+  using Layer<T>::input_dim_;
+  using Layer<T>::output_dim_;
+  using Layer<T>::bottom_desc_;
+  using Layer<T>::top_desc_;
+  using Layer<T>::data_manager_;
+
+  using Layer<T>::num_bottoms_;
+  using Layer<T>::bottoms_;
+  using Layer<T>::bottom_chunk_ids_;
+  using Layer<T>::bottom_diffs_;
+  using Layer<T>::bottom_diff_chunk_ids_;
+
+  using Layer<T>::num_tops_;
+  using Layer<T>::tops_;
+  using Layer<T>::top_chunk_ids_;
+  using Layer<T>::top_diffs_;
+  using Layer<T>::top_diff_chunk_ids_;
+
+ private:
+  BypassParam bypass_param_;
+
+  BypassDesc<T> desc_;
+
+ public:
+  BypassLayer(DNNMark<T> *p_dnnmark)
+  : Layer<T>(p_dnnmark),
+    bypass_param_() {
+  }
+
+  BypassParam *getBypassParam() { return &bypass_param_; }
+
+  void Setup() {
+    // Set up indispensable stuff here
+    Layer<T>::Setup();
+
+    // Set Bypass related descriptors
+    desc_.Set(input_dim_);
+
+    if (input_dim_.n_ != 0 && input_dim_.c_ != 0 &&
+        input_dim_.h_ != 0 && input_dim_.w_ != 0) {
+      //
+      // Standalone mode
+      //
+
+      // Compute dimension of output data
+      ComputeOutputDim();
+
+      // Set top tensor
+      top_desc_.Set(output_dim_.n_,
+                    output_dim_.c_,
+                    output_dim_.h_,
+                    output_dim_.w_);
+
+      // Prepare top data
+      int top_size = output_dim_.n_ *
+                     output_dim_.c_ *
+                     output_dim_.h_ *
+                     output_dim_.w_;
+      for (int i = 0; i < num_tops_; i++) {
+        top_chunk_ids_.push_back(
+          data_manager_->CreateData(top_size));
+        tops_.push_back(
+          data_manager_->GetData(top_chunk_ids_[i]));
+        top_diff_chunk_ids_.push_back(
+          data_manager_->CreateData(top_size));
+        top_diffs_.push_back(
+          data_manager_->GetData(top_diff_chunk_ids_[i]));
+      }
+
+    }
+  }
+
+  void ComputeOutputDim() {
+    output_dim_.n_ = input_dim_.n_;
+    output_dim_.c_ = input_dim_.c_;
+    output_dim_.h_ = input_dim_.h_;
+    output_dim_.w_ = input_dim_.w_;
+  }
+
+  void ForwardPropagation() {
+    if (p_dnnmark_->getRunMode() == STANDALONE ||
+        !previous_layer_name_.compare("null")) {
+      // Fill the bottom data
+      for (int i = 0; i < num_bottoms_; i++) {
+        bottoms_[i]->Filler();
+      }
+    }
+
+    // Bypass forwards - copy bottom data to top.
+    ProfilerStart(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "BypassFwd");
+    for (int i = 0; i < num_bottoms_; i++) {
+      dnnmarkBypassForward(*(p_dnnmark_->GetHandle()),
+                           p_dnnmark_->getRunMode(), layer_id_,
+                           desc_,
+                           DataType<T>::one,
+                           bottom_desc_,
+                           bottoms_[i]->Get(),
+                           DataType<T>::zero,
+                           top_desc_,
+                           tops_[i]->Get());
+    }
+    ProfilerStop(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "BypassFwd");
+  }
+
+  void BackwardPropagation() {
+    if (p_dnnmark_->getRunMode() == STANDALONE ||
+        !previous_layer_name_.compare("null")) {
+      // Fill the top and top diff data
+      for (int i = 0; i < num_tops_; i++) {
+        tops_[i]->Filler();
+        top_diffs_[i]->Filler();
+      }
+      // Fill the bottom data
+      for (int i = 0; i < num_bottoms_; i++) {
+        bottoms_[i]->Filler();
+      }
+    }
+
+    // Bypass backwards - copy top_diff data to bottom_diff
+    ProfilerStart(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "BypassBwd");
+    for (int i = 0; i < num_tops_; i++) {
+      dnnmarkBypassBackward(*(p_dnnmark_->GetHandle()),
+                           p_dnnmark_->getRunMode(), layer_id_,
+                           desc_,
+                           DataType<T>::one,
+                           top_desc_,
+                           tops_[i]->Get(),
+                           top_diffs_[i]->Get(),
+                           DataType<T>::zero,
+                           bottom_desc_,
+                           bottoms_[i]->Get(),
+                           bottom_diffs_[i]->Get());
+    }
+    ProfilerStop(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "BypassBwd");
+  }
+
+};
+
+} // namespace dnnmark
+
+#endif // CORE_INCLUDE_LAYERS_BYPASS_LAYER_H_
diff --git a/src/DNNMark/core/include/layers/conv_layer.h b/src/DNNMark/core/include/layers/conv_layer.h
new file mode 100644
index 0000000..cc1c13b
--- /dev/null
+++ b/src/DNNMark/core/include/layers/conv_layer.h
@@ -0,0 +1,349 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Northeastern University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#ifndef CORE_INCLUDE_LAYERS_CONV_LAYER_H_
+#define CORE_INCLUDE_LAYERS_CONV_LAYER_H_
+
+#include "dnn_layer.h"
+#include <iostream>
+
+namespace dnnmark {
+
+template <typename T>
+class ConvolutionLayer : public Layer<T> {
+  // using declaration for calling member from base class
+  using Layer<T>::p_dnnmark_;
+  using Layer<T>::layer_id_;
+  using Layer<T>::previous_layer_name_;
+  using Layer<T>::input_dim_;
+  using Layer<T>::output_dim_;
+  using Layer<T>::bottom_desc_;
+  using Layer<T>::top_desc_;
+  using Layer<T>::data_manager_;
+
+  using Layer<T>::num_bottoms_;
+  using Layer<T>::bottoms_;
+  using Layer<T>::bottom_chunk_ids_;
+  using Layer<T>::bottom_diffs_;
+  using Layer<T>::bottom_diff_chunk_ids_;
+
+  using Layer<T>::num_tops_;
+  using Layer<T>::tops_;
+  using Layer<T>::top_chunk_ids_;
+  using Layer<T>::top_diffs_;
+  using Layer<T>::top_diff_chunk_ids_;
+
+ private:
+  ConvolutionParam conv_param_;
+
+  // Convolution specific descriptor
+  ConvolutionDesc<T> desc_;
+
+  // Layer weights
+  Data<T> *weights_;
+  int weights_chunk_id_;
+  Data<T> *weights_diff_;
+  int weights_diff_chunk_id_;
+
+  // Algorithm specific parameters
+  ConvAlgo<T> conv_algo_;
+  size_t fwd_workspace_size_;
+  size_t bwd_data_workspace_size_;
+  size_t bwd_filter_workspace_size_;
+  Data<T> *fwd_workspace_;
+  int fwd_workspace_id_;
+  Data<T> *bwd_data_workspace_;
+  int bwd_data_workspace_id_;
+  Data<T> *bwd_filter_workspace_;
+  int bwd_filter_workspace_id_;
+  bool has_fwd_workspace_;
+  bool has_bwd_data_workspace_;
+  bool has_bwd_filter_workspace_;
+ public:
+  ConvolutionLayer(DNNMark<T> *p_dnnmark)
+  : Layer<T>(p_dnnmark),
+    conv_param_(), desc_(), conv_algo_() {
+    Layer<T>::has_learnable_params_ = true;
+    fwd_workspace_size_ = 0;
+    bwd_data_workspace_size_ = 0;
+    bwd_filter_workspace_size_ = 0;
+    has_fwd_workspace_ = false;
+    has_bwd_data_workspace_ = false;
+    has_bwd_filter_workspace_ = false;
+  }
+
+  ~ConvolutionLayer() {
+    // Free the workspace
+    if (has_fwd_workspace_) {
+      data_manager_->RemoveData(fwd_workspace_id_);
+      has_fwd_workspace_ = false;
+    }
+    if (has_bwd_data_workspace_) {
+      data_manager_->RemoveData(bwd_data_workspace_id_);
+      has_bwd_data_workspace_ = false;
+    }
+    if (has_bwd_filter_workspace_) {
+      data_manager_->RemoveData(bwd_filter_workspace_id_);
+      has_bwd_filter_workspace_ = false;
+    }
+  }
+
+  ConvolutionParam *getConvParam() { return &conv_param_; }
+
+  void Setup() {
+    // Set up indispensable stuff here
+    Layer<T>::Setup();
+
+    // Set convolution related descriptors
+    desc_.Set(conv_param_, input_dim_.c_);
+
+    // Set up convolution related data
+    if (input_dim_.n_ != 0 && input_dim_.c_ != 0 &&
+        input_dim_.h_ != 0 && input_dim_.w_ != 0) {
+      //
+      // Standalone mode
+      //
+
+      // Compute dimension of output data
+      ComputeOutputDim();
+
+      // Set top tensor
+      top_desc_.Set(output_dim_.n_,
+                    output_dim_.c_,
+                    output_dim_.h_,
+                    output_dim_.w_);
+
+      // Prepare top data
+      int top_size = output_dim_.n_ *
+                     output_dim_.c_ *
+                     output_dim_.h_ *
+                     output_dim_.w_;
+      for (int i = 0; i < num_tops_; i++) {
+        top_chunk_ids_.push_back(
+          data_manager_->CreateData(top_size));
+        tops_.push_back(
+          data_manager_->GetData(top_chunk_ids_[i]));
+        top_diff_chunk_ids_.push_back(
+          data_manager_->CreateData(top_size));
+        top_diffs_.push_back(
+          data_manager_->GetData(top_diff_chunk_ids_[i]));
+      }
+
+    }
+
+    // Only one set of weights is considered
+
+    int weights_size = conv_param_.output_num_ *
+                       input_dim_.c_ *
+                       conv_param_.kernel_size_h_ *
+                       conv_param_.kernel_size_w_;
+    weights_chunk_id_ = data_manager_->CreateData(weights_size);
+    weights_ = data_manager_->GetData(weights_chunk_id_);
+    weights_diff_chunk_id_ =
+      data_manager_->CreateData(weights_size);
+    weights_diff_ = data_manager_->GetData(weights_diff_chunk_id_);
+
+    // Fill the weight data
+    weights_->Filler();
+
+    // Set convolution forward algorithm
+    // Use default algorithm for now
+    conv_algo_.SetFwdAlgo(conv_param_.algo_);
+
+    // Allocate workspace
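+    // The workspace is scratch GPU memory whose size depends on the chosen
+    // algorithm; it is allocated through the data manager like any other chunk.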
+    conv_algo_.GetFwdWorkspaceSize(*(p_dnnmark_->GetHandle()),
+                                   p_dnnmark_->getRunMode(), layer_id_,
+                                   bottom_desc_,
+                                   top_desc_,
+                                   desc_,
+                                   &fwd_workspace_size_);
+    if (fwd_workspace_size_ > 0) {
+      fwd_workspace_id_ = data_manager_->CreateData(fwd_workspace_size_);
+      fwd_workspace_ = data_manager_->GetData(fwd_workspace_id_);
+      has_fwd_workspace_ = true;
+    }
+
+#ifdef NVIDIA_CUDNN
+    // Set convolution backward filter/weights algorithm
+    if (!conv_param_.algo_.compare("cudnn")) {
+        // Chainer default behaviour
+        // Use cuDNN function cudnnGetConvolutionBackwardFilterAlgorithm
+        conv_algo_.SetBwdFilterAlgo(*(p_dnnmark_->GetHandle()),
+                                         p_dnnmark_->getRunMode(), layer_id_,
+                                         bottom_desc_,
+                                         top_desc_,
+                                         desc_,
+                                         conv_param_.conv_bwd_filter_pref_);
+        LOG(INFO) << "Set cuDNN recommended conv. bwd filter alg. to " << conv_algo_.GetBwdFilterAlgo();
+        std::cout << "cuDNN recommended bwd convolution filter algorithm:"<<conv_algo_.GetBwdFilterAlgo()<<"\n";
+    } else if (conv_param_.algo_ == "auto" ) {
+        // Query cuDNN for the fastest BWD convolution filter gradient algorithm.
+        // Use cuDNN function cudnnFindConvolutionBackwardFilterAlgorithm (called inside FindBwdFilterAlgo())
+
+        // NOTE: The below code selects algorithms prior to run, during setup phase.
+        // FindBwdFilterAlgoEx must be called during run phase through dnn_wrapper.
+        //conv_algo_.SetBwdFilterAlgo("autoex");
+        conv_algo_.FindBwdFilterAlgo(*(p_dnnmark_->GetHandle()),
+                                         p_dnnmark_->getRunMode(), layer_id_,
+                                         bottom_desc_,
+                                         desc_,
+                                         top_desc_);
+        LOG(INFO) << "cuDNN fastest bwd conv. filter algo.:" << conv_algo_.GetBwdFilterAlgo();
+        std::cout << "cuDNN fastest bwd conv. filter algorithm:"<<conv_algo_.GetBwdFilterAlgo()<<"\n";
+    } else {
+      // Use default algorithm for now
+      LOG(INFO) << "Setting Bwd Filter Algo to " << conv_param_.algo_;
+      conv_algo_.SetBwdFilterAlgo(conv_param_.algo_);
+    }
+#endif
+#ifdef AMD_MIOPEN
+    // Use default algorithm for now
+    LOG(INFO) << "Setting Bwd Filter Algo to " << conv_param_.algo_;
+    conv_algo_.SetBwdFilterAlgo(conv_param_.algo_);
+#endif
+
+    // Allocate workspace
+    conv_algo_.GetBwdFilterWorkspaceSize(*(p_dnnmark_->GetHandle()),
+                                         p_dnnmark_->getRunMode(), layer_id_,
+                                         bottom_desc_,
+                                         top_desc_,
+                                         desc_,
+                                         &bwd_filter_workspace_size_);
+    if (bwd_filter_workspace_size_ > 0) {
+      bwd_filter_workspace_id_ = data_manager_->
+                                 CreateData(bwd_filter_workspace_size_);
+      bwd_filter_workspace_ = data_manager_->GetData(bwd_filter_workspace_id_);
+      has_bwd_filter_workspace_ = true;
+    }
+
+    // Set convolution backward data algorithm
+    // Use default algorithm for now
+    conv_algo_.SetBwdDataAlgo(conv_param_.algod_);
+#ifdef NVIDIA_CUDNN
+    LOG(INFO) << "BWD conv. data algo set to:"<< static_cast<int>(conv_algo_.getDataAlgo());
+    // std::cout << "cuDNN recommended BWD convolution data algorithm:"<<conv_algo_.GetBwdDataAlgo()<<"\n";
+#endif
+
+    // Allocate workspace
+    conv_algo_.GetBwdDataWorkspaceSize(*(p_dnnmark_->GetHandle()),
+                                       p_dnnmark_->getRunMode(), layer_id_,
+                                       bottom_desc_,
+                                       top_desc_,
+                                       desc_,
+                                       &bwd_data_workspace_size_);
+    if (bwd_data_workspace_size_ > 0) {
+      bwd_data_workspace_id_ = data_manager_->
+                                 CreateData(bwd_data_workspace_size_);
+      bwd_data_workspace_ = data_manager_->GetData(bwd_data_workspace_id_);
+      has_bwd_data_workspace_ = true;
+    }
+
+  }
+
+  void ComputeOutputDim() {
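+    // Standard convolution output size (assuming unit dilation):
+    //   out = (in + 2 * pad - kernel) / stride + 1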
+    output_dim_.n_ = input_dim_.n_;
+    output_dim_.c_ = conv_param_.output_num_;
+    output_dim_.h_ = (input_dim_.h_ +
+      2 * conv_param_.pad_h_ - conv_param_.kernel_size_h_) /
+      conv_param_.stride_u_ + 1;
+    output_dim_.w_ = (input_dim_.w_ +
+      2 * conv_param_.pad_w_ - conv_param_.kernel_size_w_) /
+      conv_param_.stride_v_ + 1;
+  }
+
+  void ForwardPropagation() {
+    // Fill the bottom data
+    if (p_dnnmark_->getRunMode() == STANDALONE ||
+        !previous_layer_name_.compare("null")) {
+      for (int i = 0; i < num_bottoms_; i++) {
+        bottoms_[i]->Filler();
+      }
+    }
+    // Convolution forward computation
+    for (int i = 0; i < num_bottoms_; i++) {
+      dnnmarkConvolutionForward(
+                *(p_dnnmark_->GetHandle()),
+                p_dnnmark_->getRunMode(), layer_id_,
+                p_dnnmark_->GetTimer(),
+                DataType<T>::one,
+                bottom_desc_, bottoms_[i]->Get(),
+                desc_, weights_->Get(),
+                &conv_algo_,
+                has_fwd_workspace_? fwd_workspace_->Get() : nullptr,
+                fwd_workspace_size_,
+                DataType<T>::zero,
+                top_desc_, tops_[i]->Get());
+    }
+  }
+
+  void BackwardPropagation() {
+    if (p_dnnmark_->getRunMode() == STANDALONE ||
+        !previous_layer_name_.compare("null")) {
+      // Fill the top data and top diff data
+      for (int i = 0; i < num_tops_; i++) {
+        tops_[i]->Filler();
+        top_diffs_[i]->Filler();
+      }
+      // Fill the bottom data
+      for (int i = 0; i < num_bottoms_; i++) {
+        bottoms_[i]->Filler();
+      }
+    }
+
+    // Convolution backward computation: filter gradient, then data gradient
+    // if back-propagation to the bottom is enabled
+    for (int i = 0; i < num_tops_; i++) {
+      dnnmarkConvolutionBackwardFilter(
+                  *(p_dnnmark_->GetHandle()),
+                  p_dnnmark_->getRunMode(), layer_id_,
+                  p_dnnmark_->GetTimer(),
+                  DataType<T>::one,
+                  bottom_desc_, bottoms_[i]->Get(),
+                  top_desc_, top_diffs_[i]->Get(),
+                  desc_,
+                  &conv_algo_,
+                  has_bwd_filter_workspace_? bwd_filter_workspace_->Get() : nullptr,
+                  bwd_filter_workspace_size_,
+                  DataType<T>::zero,
+                  weights_diff_->Get());
+      if (conv_param_.propagation_) {
+        dnnmarkConvolutionBackwardData(
+                  *(p_dnnmark_->GetHandle()),
+                  p_dnnmark_->getRunMode(), layer_id_,
+                  p_dnnmark_->GetTimer(),
+                  DataType<T>::one,
+                  top_desc_, top_diffs_[i]->Get(),
+                  desc_, weights_->Get(),
+                  &conv_algo_,
+                  has_bwd_data_workspace_? bwd_data_workspace_->Get() : nullptr,
+                  bwd_data_workspace_size_,
+                  DataType<T>::zero,
+                  bottom_desc_, bottoms_[i]->Get());
+      }
+    }
+  }
+
+};
+
+} // namespace dnnmark
+
+#endif // CORE_INCLUDE_LAYERS_CONV_LAYER_H_
diff --git a/src/DNNMark/core/include/layers/dropout_layer.h b/src/DNNMark/core/include/layers/dropout_layer.h
new file mode 100644
index 0000000..eb8afcd
--- /dev/null
+++ b/src/DNNMark/core/include/layers/dropout_layer.h
@@ -0,0 +1,195 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Northeastern University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#ifndef CORE_INCLUDE_LAYERS_DROPOUT_LAYER_H_
+#define CORE_INCLUDE_LAYERS_DROPOUT_LAYER_H_
+
+#include "dnn_layer.h"
+
+namespace dnnmark {
+
+template <typename T>
+class DropoutLayer : public Layer<T> {
+  // using declaration for calling member from base class
+  using Layer<T>::p_dnnmark_;
+  using Layer<T>::layer_id_;
+  using Layer<T>::previous_layer_name_;
+  using Layer<T>::input_dim_;
+  using Layer<T>::output_dim_;
+  using Layer<T>::bottom_desc_;
+  using Layer<T>::top_desc_;
+  using Layer<T>::data_manager_;
+
+  using Layer<T>::num_bottoms_;
+  using Layer<T>::bottoms_;
+  using Layer<T>::bottom_chunk_ids_;
+  using Layer<T>::bottom_diffs_;
+  using Layer<T>::bottom_diff_chunk_ids_;
+
+  using Layer<T>::num_tops_;
+  using Layer<T>::tops_;
+  using Layer<T>::top_chunk_ids_;
+  using Layer<T>::top_diffs_;
+  using Layer<T>::top_diff_chunk_ids_;
+
+ private:
+  DropoutParam dropout_param_;
+  DropoutDesc<T> desc_;
+  size_t random_states_size_;
+  int random_states_id_;
+  Data<T> *random_states_;
+  size_t reserve_space_size_;
+  int reserve_space_id_;
+  Data<T> *reserve_space_;
+
+ public:
+  DropoutLayer(DNNMark<T> *p_dnnmark)
+  : Layer<T>(p_dnnmark),
+    dropout_param_(), desc_() {
+  }
+
+  DropoutParam *getDropoutParam() { return &dropout_param_; }
+
+  void Setup() {
+    // Set up indispensable stuff here
+    Layer<T>::Setup();
+
+    // Set up dropout related data
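+    // In the cuDNN/MIOpen dropout API the reserve space keeps the dropout
+    // mask between the forward and backward passes, while the states buffer
+    // holds the RNG state; both sizes are queried from the library below.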
+
+    desc_.SetReserveSpaceSize(bottom_desc_, &reserve_space_size_);
+    desc_.SetStatesSize(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                        layer_id_, &random_states_size_);
+
+    if (random_states_size_ > 0) {
+      random_states_id_ = data_manager_->CreateData(random_states_size_);
+      random_states_ = data_manager_->GetData(random_states_id_);
+    }
+
+    desc_.Set(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(), layer_id_,
+              dropout_param_, random_states_->Get(), random_states_size_);
+
+    if (reserve_space_size_ > 0) {
+      reserve_space_id_ = data_manager_->CreateData(reserve_space_size_);
+      reserve_space_ = data_manager_->GetData(reserve_space_id_);
+    }
+
+    if (input_dim_.n_ != 0 && input_dim_.c_ != 0 &&
+        input_dim_.h_ != 0 && input_dim_.w_ != 0) {
+      //
+      // Standalone mode
+      //
+
+      // Compute dimension of output data
+      ComputeOutputDim();
+
+      // Set top tensor
+      top_desc_.Set(output_dim_.n_,
+                    output_dim_.c_,
+                    output_dim_.h_,
+                    output_dim_.w_);
+
+      // Prepare top data
+      int top_size = output_dim_.n_ *
+                     output_dim_.c_ *
+                     output_dim_.h_ *
+                     output_dim_.w_;
+      for (int i = 0; i < num_tops_; i++) {
+        top_chunk_ids_.push_back(
+          data_manager_->CreateData(top_size));
+        tops_.push_back(
+          data_manager_->GetData(top_chunk_ids_[i]));
+        top_diff_chunk_ids_.push_back(
+          data_manager_->CreateData(top_size));
+        top_diffs_.push_back(
+          data_manager_->GetData(top_diff_chunk_ids_[i]));
+      }
+
+    }
+  }
+
+  void ComputeOutputDim() {
+    output_dim_.n_ = input_dim_.n_;
+    output_dim_.c_ = input_dim_.c_;
+    output_dim_.h_ = input_dim_.h_;
+    output_dim_.w_ = input_dim_.w_;
+  }
+
+  void ForwardPropagation() {
+    if (p_dnnmark_->getRunMode() == STANDALONE ||
+        !previous_layer_name_.compare("null")) {
+      // Fill the bottom data
+      for (int i = 0; i < num_bottoms_; i++) {
+        bottoms_[i]->Filler();
+      }
+    }
+
+    // Dropout forwards
+    ProfilerStart(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "DropoutFwd");
+    for (int i = 0; i < num_bottoms_; i++) {
+      dnnmarkDropoutForward(
+              *(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+              layer_id_,
+              desc_,
+              bottom_desc_, bottoms_[i]->Get(),
+              top_desc_, tops_[i]->Get(),
+              reserve_space_->Get(),
+              reserve_space_size_
+              );
+    }
+    ProfilerStop(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "DropoutFwd");
+  }
+
+  void BackwardPropagation() {
+    if (p_dnnmark_->getRunMode() == STANDALONE ||
+        !previous_layer_name_.compare("null")) {
+      // Fill the top and top diff data
+      for (int i = 0; i < num_tops_; i++) {
+        top_diffs_[i]->Filler();
+      }
+    }
+
+    // Dropout backwards
+    ProfilerStart(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "DropoutBwd");
+    for (int i = 0; i < num_tops_; i++) {
+      dnnmarkDropoutBackward(
+              *(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+              layer_id_,
+              desc_,
+              top_desc_, top_diffs_[i]->Get(),
+              bottom_desc_, bottom_diffs_[i]->Get(),
+              reserve_space_->Get(),
+              reserve_space_size_
+              );
+    }
+    ProfilerStop(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "DropoutBwd");
+  }
+
+};
+
+} // namespace dnnmark
+
+#endif // CORE_INCLUDE_LAYERS_DROPOUT_LAYER_H_
diff --git a/src/DNNMark/core/include/layers/fc_layer.h b/src/DNNMark/core/include/layers/fc_layer.h
new file mode 100644
index 0000000..95c3627
--- /dev/null
+++ b/src/DNNMark/core/include/layers/fc_layer.h
@@ -0,0 +1,253 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Northeastern University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#ifndef CORE_INCLUDE_LAYERS_FC_LAYER_H_
+#define CORE_INCLUDE_LAYERS_FC_LAYER_H_
+
+#include "dnn_layer.h"
+
+namespace dnnmark {
+
+template <typename T>
+class FullyConnectedLayer : public Layer<T> {
+  // using declaration for calling member from base class
+  using Layer<T>::p_dnnmark_;
+  using Layer<T>::layer_id_;
+  using Layer<T>::previous_layer_name_;
+  using Layer<T>::input_dim_;
+  using Layer<T>::output_dim_;
+  using Layer<T>::bottom_desc_;
+  using Layer<T>::top_desc_;
+  using Layer<T>::data_manager_;
+
+  using Layer<T>::num_bottoms_;
+  using Layer<T>::bottoms_;
+  using Layer<T>::bottom_chunk_ids_;
+  using Layer<T>::bottom_diffs_;
+  using Layer<T>::bottom_diff_chunk_ids_;
+
+  using Layer<T>::num_tops_;
+  using Layer<T>::tops_;
+  using Layer<T>::top_chunk_ids_;
+  using Layer<T>::top_diffs_;
+  using Layer<T>::top_diff_chunk_ids_;
+
+ private:
+  FullyConnectedParam fc_param_;
+
+  // Weights dimensions
+  int num_rows_weights_;
+  int num_cols_weights_;
+  T scale_alpha_;
+  T scale_beta_;
+
+  // Layer weights
+  Data<T> *weights_;
+  int weights_chunk_id_;
+  Data<T> *weights_diff_;
+  int weights_diff_chunk_id_;
+
+ public:
+  FullyConnectedLayer(DNNMark<T> *p_dnnmark)
+  : Layer<T>(p_dnnmark),
+    fc_param_() {
+    Layer<T>::has_learnable_params_ = true;
+  }
+
+  FullyConnectedParam *getFullyConnectedParam() { return &fc_param_; }
+
+  void Setup() {
+    // Set up indispensable stuff here
+    Layer<T>::Setup();
+
+    // Set up fully connected related data
+    if (input_dim_.n_ != 0 && input_dim_.c_ != 0 &&
+        input_dim_.h_ != 0 && input_dim_.w_ != 0) {
+      //
+      // Standalone mode
+      //
+
+      // Compute dimension of output data
+      ComputeOutputDim();
+
+      // Set top tensor
+      top_desc_.Set(output_dim_.n_,
+                    output_dim_.c_,
+                    output_dim_.h_,
+                    output_dim_.w_);
+
+      // Prepare top data
+      int top_size = output_dim_.n_ *
+                     output_dim_.c_ *
+                     output_dim_.h_ *
+                     output_dim_.w_;
+      for (int i = 0; i < num_tops_; i++) {
+        top_chunk_ids_.push_back(
+          data_manager_->CreateData(top_size));
+        tops_.push_back(
+          data_manager_->GetData(top_chunk_ids_[i]));
+        top_diff_chunk_ids_.push_back(
+          data_manager_->CreateData(top_size));
+        top_diffs_.push_back(
+          data_manager_->GetData(top_diff_chunk_ids_[i]));
+      }
+    }
+
+    // Only one set of weights is considered
+    num_rows_weights_ = input_dim_.c_ *
+                           input_dim_.h_ *
+                           input_dim_.w_;
+    num_cols_weights_ = fc_param_.output_num_;
+    int weights_size = num_rows_weights_ * num_cols_weights_;
+    weights_chunk_id_ = data_manager_->CreateData(weights_size);
+    weights_ = data_manager_->GetData(weights_chunk_id_);
+    weights_diff_chunk_id_ =
+      data_manager_->CreateData(weights_size);
+    weights_diff_ = data_manager_->GetData(weights_diff_chunk_id_);
+
+    // Fill the weight data
+    weights_->Filler();
+
+    scale_alpha_ = (T)1.0;
+    scale_beta_ = (T)0.0;
+  }
+
+  void ComputeOutputDim() {
+    output_dim_.n_ = input_dim_.n_;
+    output_dim_.c_ = fc_param_.output_num_;
+    output_dim_.h_ = 1;
+    output_dim_.w_ = 1;
+  }
+
+  void ForwardPropagation() {
+    if (p_dnnmark_->getRunMode() == STANDALONE ||
+        !previous_layer_name_.compare("null")) {
+      // Fill the bottom data
+      for (int i = 0; i < num_bottoms_; i++) {
+        bottoms_[i]->Filler();
+      }
+    }
+
+    // Prepare CuBLAS parameters
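+    // Column-major GEMM shapes: W is K x M (K = flattened input size,
+    // M = output_num_) and X is K x N (N = batch size), so Y = T(W) * X
+    // is M x N.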
+    int M = fc_param_.output_num_;
+    int N = input_dim_.n_;
+    int K = num_rows_weights_;
+    int lda = K;
+    int ldb = K;
+    int ldc = M;
+    bool is_a_transpose = true;
+    bool is_b_transpose = false;
+
+    // Fully connected forward computation
+    ProfilerStart(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "FcFwd");
+    for (int i = 0; i < num_bottoms_; i++) {
+      // Y = T(W) * X
+      dnnmarkGEMM(*(p_dnnmark_->GetHandle()),
+                  p_dnnmark_->getRunMode(), layer_id_,
+                  is_a_transpose, is_b_transpose,
+                  M, N, K,
+                  &scale_alpha_,
+                  weights_->Get(), lda,
+                  bottoms_[i]->Get(), ldb,
+                  &scale_beta_,
+                  tops_[i]->Get(), ldc);
+    }
+    ProfilerStop(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "FcFwd");
+
+  }
+
+  void BackwardPropagation() {
+    if (p_dnnmark_->getRunMode() == STANDALONE ||
+        !previous_layer_name_.compare("null")) {
+      // Fill the top diff data
+      for (int i = 0; i < num_tops_; i++) {
+        top_diffs_[i]->Filler();
+      }
+      // Fill the bottom data
+      for (int i = 0; i < num_bottoms_; i++) {
+        bottoms_[i]->Filler();
+      }
+    }
+
+    // Prepare CuBLAS parameters for calculating d(W)
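+    // d(W) (M x N = flattened input size x output_num_) is computed as
+    // X (M x K, K = batch size) times T(d(Y)) (K x N).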
+    int M = num_rows_weights_;
+    int N = fc_param_.output_num_;
+    int K = input_dim_.n_;
+    int lda = M;
+    int ldb = N;
+    int ldc = M;
+    bool is_a_transpose = false;
+    bool is_b_transpose = true;
+
+    // Fully connected backward weights computation
+    ProfilerStart(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "FcBwdFilter");
+    for (int i = 0; i < num_tops_; i++) {
+      // d(W) = X * T(d(Y))
+      dnnmarkGEMM(*(p_dnnmark_->GetHandle()),
+                  p_dnnmark_->getRunMode(), layer_id_,
+                  is_a_transpose, is_b_transpose,
+                  M, N, K,
+                  &scale_alpha_,
+                  bottoms_[i]->Get(), lda,
+                  top_diffs_[i]->Get(), ldb,
+                  &scale_beta_,
+                  weights_diff_->Get(), ldc);
+    }
+    ProfilerStop(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "FcBwdFilter");
+
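+    // Prepare CuBLAS parameters for calculating d(X):
+    // W (M x K, K = output_num_) times d(Y) (K x N) gives d(X) (M x N).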
+    M = num_rows_weights_;
+    N = input_dim_.n_;
+    K = fc_param_.output_num_;
+    lda = M;
+    ldb = K;
+    ldc = M;
+    is_a_transpose = false;
+    is_b_transpose = false;
+
+    // Fully connected backward data computation
+    ProfilerStart(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "FcBwdData");
+    for (int i = 0; i < num_tops_; i++) {
+      // d(X) = W * d(Y)
+      dnnmarkGEMM(*(p_dnnmark_->GetHandle()),
+                  p_dnnmark_->getRunMode(), layer_id_,
+                  is_a_transpose, is_b_transpose,
+                  M, N, K,
+                  &scale_alpha_,
+                  weights_->Get(), lda,
+                  top_diffs_[i]->Get(), ldb,
+                  &scale_beta_,
+                  bottom_diffs_[i]->Get(), ldc);
+    }
+    ProfilerStop(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "FcBwdData");
+  }
+
+};
+
+} // namespace dnnmark
+
+#endif // CORE_INCLUDE_LAYERS_FC_LAYER_H_
diff --git a/src/DNNMark/core/include/layers/lrn_layer.h b/src/DNNMark/core/include/layers/lrn_layer.h
new file mode 100644
index 0000000..98d9c05
--- /dev/null
+++ b/src/DNNMark/core/include/layers/lrn_layer.h
@@ -0,0 +1,195 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Northeastern University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#ifndef CORE_INCLUDE_LAYERS_LRN_LAYER_H_
+#define CORE_INCLUDE_LAYERS_LRN_LAYER_H_
+
+#include "dnn_layer.h"
+
+namespace dnnmark {
+
+template <typename T>
+class LRNLayer : public Layer<T> {
+  // using declaration for calling member from base class
+  using Layer<T>::p_dnnmark_;
+  using Layer<T>::layer_id_;
+  using Layer<T>::previous_layer_name_;
+  using Layer<T>::input_dim_;
+  using Layer<T>::output_dim_;
+  using Layer<T>::bottom_desc_;
+  using Layer<T>::top_desc_;
+  using Layer<T>::data_manager_;
+
+  using Layer<T>::num_bottoms_;
+  using Layer<T>::bottoms_;
+  using Layer<T>::bottom_chunk_ids_;
+  using Layer<T>::bottom_diffs_;
+  using Layer<T>::bottom_diff_chunk_ids_;
+
+  using Layer<T>::num_tops_;
+  using Layer<T>::tops_;
+  using Layer<T>::top_chunk_ids_;
+  using Layer<T>::top_diffs_;
+  using Layer<T>::top_diff_chunk_ids_;
+
+ private:
+  LRNParam lrn_param_;
+
+  // LRN specific descriptor
+  LRNDesc<T> desc_;
+
+  // Workspace
+  size_t workspace_size_;
+  Data<T> *workspace_;
+  int workspace_id_;
+
+ public:
+  LRNLayer(DNNMark<T> *p_dnnmark)
+  : Layer<T>(p_dnnmark),
+    lrn_param_(), desc_() {
+    workspace_size_ = 0;
+  }
+
+  LRNParam *getLRNParam() { return &lrn_param_; }
+
+  void Setup() {
+    // Set up indispensable stuff here
+    Layer<T>::Setup();
+
+    // Set LRN related descriptors
+    desc_.Set(lrn_param_);
+
+    // Set up LRN related data
+    if (input_dim_.n_ != 0 && input_dim_.c_ != 0 &&
+        input_dim_.h_ != 0 && input_dim_.w_ != 0) {
+      //
+      // Standalone mode
+      //
+
+      // Compute dimension of output data
+      ComputeOutputDim();
+
+      // Set top tensor
+      top_desc_.Set(output_dim_.n_,
+                    output_dim_.c_,
+                    output_dim_.h_,
+                    output_dim_.w_);
+
+      // Prepare top data
+      int top_size = output_dim_.n_ *
+                     output_dim_.c_ *
+                     output_dim_.h_ *
+                     output_dim_.w_;
+      for (int i = 0; i < num_tops_; i++) {
+        top_chunk_ids_.push_back(
+          data_manager_->CreateData(top_size));
+        tops_.push_back(
+          data_manager_->GetData(top_chunk_ids_[i]));
+        top_diff_chunk_ids_.push_back(
+          data_manager_->CreateData(top_size));
+        top_diffs_.push_back(
+          data_manager_->GetData(top_diff_chunk_ids_[i]));
+      }
+    }
+
+    // Allocate workspace
+    desc_.GetWorkspaceSize(top_desc_, &workspace_size_);
+    if (workspace_size_ > 0) {
+      workspace_id_ = data_manager_->CreateData(workspace_size_);
+      workspace_ = data_manager_->GetData(workspace_id_);
+    }
+  }
+
+  void ComputeOutputDim() {
+    output_dim_.n_ = input_dim_.n_;
+    output_dim_.c_ = input_dim_.c_;
+    output_dim_.h_ = input_dim_.h_;
+    output_dim_.w_ = input_dim_.w_;
+  }
+
+  void ForwardPropagation() {
+    if (p_dnnmark_->getRunMode() == STANDALONE ||
+        !previous_layer_name_.compare("null")) {
+      // Fill the bottom data
+      for (int i = 0; i < num_bottoms_; i++) {
+        bottoms_[i]->Filler();
+      }
+    }
+
+    // lrn forward computation
+    ProfilerStart(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "LrnFwd");
+    for (int i = 0; i < num_bottoms_; i++) {
+      dnnmarkLRNForward(
+             *(p_dnnmark_->GetHandle()),
+             p_dnnmark_->getRunMode(), layer_id_,
+             desc_,
+             lrn_param_,
+             DataType<T>::one,
+             bottom_desc_, bottoms_[i]->Get(),
+             DataType<T>::zero,
+             top_desc_, tops_[i]->Get(), workspace_);
+    }
+    ProfilerStop(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "LrnFwd");
+
+  }
+  void BackwardPropagation() {
+    if (p_dnnmark_->getRunMode() == STANDALONE ||
+        !previous_layer_name_.compare("null")) {
+      // Fill the top and top diff data
+      for (int i = 0; i < num_tops_; i++) {
+        tops_[i]->Filler();
+        top_diffs_[i]->Filler();
+      }
+      // Fill the bottom data
+      for (int i = 0; i < num_bottoms_; i++) {
+        bottoms_[i]->Filler();
+      }
+    }
+
+    // lrn backward computation
+    ProfilerStart(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "LrnBwd");
+    for (int i = 0; i < num_tops_; i++) {
+      dnnmarkLRNBackward(
+             *(p_dnnmark_->GetHandle()),
+             p_dnnmark_->getRunMode(), layer_id_,
+             desc_,
+             lrn_param_,
+             DataType<T>::one,
+             top_desc_, tops_[i]->Get(), top_diffs_[i]->Get(),
+             DataType<T>::zero,
+             bottom_desc_,
+             bottoms_[i]->Get(),
+             bottom_diffs_[i]->Get(),
+             workspace_);
+    }
+    ProfilerStop(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "LrnBwd");
+  }
+
+};
+
+} // namespace dnnmark
+
+#endif // CORE_INCLUDE_LAYERS_LRN_LAYER_H_
diff --git a/src/DNNMark/core/include/layers/pool_layer.h b/src/DNNMark/core/include/layers/pool_layer.h
new file mode 100644
index 0000000..c2054a7
--- /dev/null
+++ b/src/DNNMark/core/include/layers/pool_layer.h
@@ -0,0 +1,208 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Northeastern University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#ifndef CORE_INCLUDE_LAYERS_POOL_LAYER_H_
+#define CORE_INCLUDE_LAYERS_POOL_LAYER_H_
+
+#include "dnn_layer.h"
+#include "dnn_wrapper.h"
+
+#include <cmath>
+
+namespace dnnmark {
+
+template <typename T>
+class PoolingLayer : public Layer<T> {
+  // using declaration for calling member from base class
+  using Layer<T>::p_dnnmark_;
+  using Layer<T>::layer_id_;
+  using Layer<T>::previous_layer_name_;
+  using Layer<T>::input_dim_;
+  using Layer<T>::output_dim_;
+  using Layer<T>::bottom_desc_;
+  using Layer<T>::top_desc_;
+  using Layer<T>::data_manager_;
+
+  using Layer<T>::num_bottoms_;
+  using Layer<T>::bottoms_;
+  using Layer<T>::bottom_chunk_ids_;
+  using Layer<T>::bottom_diffs_;
+  using Layer<T>::bottom_diff_chunk_ids_;
+
+  using Layer<T>::num_tops_;
+  using Layer<T>::tops_;
+  using Layer<T>::top_chunk_ids_;
+  using Layer<T>::top_diffs_;
+  using Layer<T>::top_diff_chunk_ids_;
+
+ private:
+  PoolingParam pool_param_;
+
+  // Pooling specific descriptor
+  PoolingDesc<T> desc_;
+
+  // Workspace
+  size_t workspace_size_;
+  Data<T> *workspace_;
+  int workspace_id_;
+
+ public:
+  PoolingLayer(DNNMark<T> *p_dnnmark)
+      : Layer<T>(p_dnnmark), pool_param_(), desc_() {
+    workspace_size_ = 0;
+  }
+
+  PoolingParam *getPoolParam() { return &pool_param_; }
+
+  void Setup() {
+    // Set up indispensable stuff here
+    Layer<T>::Setup();
+
+    // Set pooling related descriptors
+    desc_.Set(pool_param_);
+
+    // Set up pooling related data
+    if (input_dim_.n_ != 0 && input_dim_.c_ != 0 && input_dim_.h_ != 0 &&
+        input_dim_.w_ != 0) {
+      //
+      // Standalone mode
+      //
+
+      // Compute dimension of output data
+      ComputeOutputDim();
+
+      // Set top tensor
+      top_desc_.Set(output_dim_.n_, output_dim_.c_, output_dim_.h_,
+                    output_dim_.w_);
+
+      // Prepare top data
+      int top_size =
+          output_dim_.n_ * output_dim_.c_ * output_dim_.h_ * output_dim_.w_;
+      for (int i = 0; i < num_tops_; i++) {
+        top_chunk_ids_.push_back(data_manager_->CreateData(top_size));
+        tops_.push_back(data_manager_->GetData(top_chunk_ids_[i]));
+        top_diff_chunk_ids_.push_back(data_manager_->CreateData(top_size));
+        top_diffs_.push_back(data_manager_->GetData(top_diff_chunk_ids_[i]));
+      }
+
+      // Allocate workspace
+      desc_.GetWorkspaceSize(top_desc_, &workspace_size_);
+      if (workspace_size_ > 0) {
+        workspace_id_ = data_manager_->CreateData(workspace_size_);
+        workspace_ = data_manager_->GetData(workspace_id_);
+      }
+    }
+  }
+
+  void ComputeOutputDim() {
+    // Courtesy of Caffe
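+    // Ceil-mode pooling: the window count is rounded up, and the last window
+    // is dropped if it would start entirely inside the padded region.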
+    output_dim_.n_ = input_dim_.n_;
+    output_dim_.c_ = input_dim_.c_;
+    output_dim_.h_ =
+        static_cast<int>(
+            ceil(static_cast<float>(input_dim_.h_ + 2 * pool_param_.pad_h_ -
+                                    pool_param_.kernel_size_h_) /
+                 pool_param_.stride_h_)) +
+        1;
+    output_dim_.w_ =
+        static_cast<int>(
+            ceil(static_cast<float>(input_dim_.w_ + 2 * pool_param_.pad_w_ -
+                                    pool_param_.kernel_size_w_) /
+                 pool_param_.stride_w_)) +
+        1;
+    if (pool_param_.pad_h_ > 0 && pool_param_.pad_w_ > 0) {
+      if ((output_dim_.h_ - 1) * pool_param_.stride_h_ >=
+          input_dim_.h_ + pool_param_.pad_h_) {
+        --output_dim_.h_;
+      }
+      if ((output_dim_.w_ - 1) * pool_param_.stride_w_ >=
+          input_dim_.w_ + pool_param_.pad_w_) {
+        --output_dim_.w_;
+      }
+    }
+  }
+
+  void ForwardPropagation() {
+    if (p_dnnmark_->getRunMode() == STANDALONE ||
+        !previous_layer_name_.compare("null")) {
+      // Fill the bottom data
+      for (int i = 0; i < num_bottoms_; i++) {
+        bottoms_[i]->Filler();
+      }
+    }
+
+    // pooling forward computation
+    ProfilerStart(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "PoolFwd");
+    for (int i = 0; i < num_bottoms_; i++) {
+      dnnmarkPoolingForward(*(p_dnnmark_->GetHandle()),
+                            p_dnnmark_->getRunMode(), layer_id_,
+                            desc_,
+                            DataType<T>::one,
+                            bottom_desc_,
+                            bottoms_[i]->Get(),
+                            DataType<T>::zero,
+                            top_desc_,
+                            tops_[i]->Get(),
+                            workspace_, workspace_size_);
+    }
+    ProfilerStop(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "PoolFwd");
+  }
+
+  void BackwardPropagation() {
+    if (p_dnnmark_->getRunMode() == STANDALONE ||
+        !previous_layer_name_.compare("null")) {
+      // Fill the top and top diff data
+      for (int i = 0; i < num_tops_; i++) {
+        tops_[i]->Filler();
+        top_diffs_[i]->Filler();
+      }
+      // Fill the bottom data
+      for (int i = 0; i < num_bottoms_; i++) {
+        bottoms_[i]->Filler();
+      }
+    }
+
+    // pooling backward computation
+    ProfilerStart(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "PoolBwd");
+    for (int i = 0; i < num_tops_; i++) {
+      dnnmarkPoolingBackward(*(p_dnnmark_->GetHandle()),
+                             p_dnnmark_->getRunMode(), layer_id_,
+                             desc_,
+                             DataType<T>::one,
+                             top_desc_, tops_[i]->Get(),
+                             top_desc_, top_diffs_[i]->Get(),
+                             bottom_desc_, bottoms_[i]->Get(),
+                             DataType<T>::zero,
+                             bottom_desc_, bottom_diffs_[i]->Get(),
+                             workspace_);
+    }
+    ProfilerStop(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "PoolBwd");
+  }
+};
+
+}  // namespace dnnmark
+
+#endif  // CORE_INCLUDE_LAYERS_POOL_LAYER_H_
diff --git a/src/DNNMark/core/include/layers/softmax_layer.h b/src/DNNMark/core/include/layers/softmax_layer.h
new file mode 100644
index 0000000..31dbbdc
--- /dev/null
+++ b/src/DNNMark/core/include/layers/softmax_layer.h
@@ -0,0 +1,173 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Northeastern University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#ifndef CORE_INCLUDE_LAYERS_SOFTMAX_LAYER_H_
+#define CORE_INCLUDE_LAYERS_SOFTMAX_LAYER_H_
+
+#include "dnn_layer.h"
+
+namespace dnnmark {
+
+template <typename T>
+class SoftmaxLayer : public Layer<T> {
+  // using declaration for calling member from base class
+  using Layer<T>::p_dnnmark_;
+  using Layer<T>::layer_id_;
+  using Layer<T>::previous_layer_name_;
+  using Layer<T>::input_dim_;
+  using Layer<T>::output_dim_;
+  using Layer<T>::bottom_desc_;
+  using Layer<T>::top_desc_;
+  using Layer<T>::data_manager_;
+
+  using Layer<T>::num_bottoms_;
+  using Layer<T>::bottoms_;
+  using Layer<T>::bottom_chunk_ids_;
+  using Layer<T>::bottom_diffs_;
+  using Layer<T>::bottom_diff_chunk_ids_;
+
+  using Layer<T>::num_tops_;
+  using Layer<T>::tops_;
+  using Layer<T>::top_chunk_ids_;
+  using Layer<T>::top_diffs_;
+  using Layer<T>::top_diff_chunk_ids_;
+
+ private:
+  SoftmaxParam softmax_param_;
+
+ public:
+  SoftmaxLayer(DNNMark<T> *p_dnnmark)
+  : Layer<T>(p_dnnmark),
+    softmax_param_() {
+  }
+
+  SoftmaxParam *getSoftmaxParam() { return &softmax_param_; }
+
+  void Setup() {
+    // Set up indispensable stuff here
+    Layer<T>::Setup();
+
+    // Set up softmax related data
+    if (input_dim_.n_ != 0 && input_dim_.c_ != 0 &&
+        input_dim_.h_ != 0 && input_dim_.w_ != 0) {
+      //
+      // Standalone mode
+      //
+
+      // Compute dimension of output data
+      ComputeOutputDim();
+
+      // Set top tensor
+      top_desc_.Set(output_dim_.n_,
+                    output_dim_.c_,
+                    output_dim_.h_,
+                    output_dim_.w_);
+
+      // Prepare top data
+      int top_size = output_dim_.n_ *
+                     output_dim_.c_ *
+                     output_dim_.h_ *
+                     output_dim_.w_;
+      for (int i = 0; i < num_tops_; i++) {
+        top_chunk_ids_.push_back(
+          data_manager_->CreateData(top_size));
+        tops_.push_back(
+          data_manager_->GetData(top_chunk_ids_[i]));
+        top_diff_chunk_ids_.push_back(
+          data_manager_->CreateData(top_size));
+        top_diffs_.push_back(
+          data_manager_->GetData(top_diff_chunk_ids_[i]));
+      }
+
+    }
+  }
+
+  void ComputeOutputDim() {
+    output_dim_.n_ = input_dim_.n_;
+    output_dim_.c_ = input_dim_.c_;
+    output_dim_.h_ = input_dim_.h_;
+    output_dim_.w_ = input_dim_.w_;
+  }
+
+  void ForwardPropagation() {
+    if (p_dnnmark_->getRunMode() == STANDALONE ||
+        !previous_layer_name_.compare("null")) {
+      // Fill the bottom data
+      for (int i = 0; i < num_bottoms_; i++) {
+        bottoms_[i]->Filler();
+      }
+    }
+
+    // Softmax forward computation
+    ProfilerStart(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "SoftmaxFwd");
+    for (int i = 0; i < num_bottoms_; i++) {
+      dnnmarkSoftmaxForward(
+              *(p_dnnmark_->GetHandle()),
+              p_dnnmark_->getRunMode(), layer_id_,
+              softmax_param_,
+              DataType<T>::one,
+              bottom_desc_, bottoms_[i]->Get(),
+              DataType<T>::zero,
+              top_desc_, tops_[i]->Get());
+    }
+    ProfilerStop(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "SoftmaxFwd");
+  }
+
+  void BackwardPropagation() {
+    if (p_dnnmark_->getRunMode() == STANDALONE ||
+        !previous_layer_name_.compare("null")) {
+      // Fill the top and top diff data
+      for (int i = 0; i < num_tops_; i++) {
+        tops_[i]->Filler();
+        top_diffs_[i]->Filler();
+      }
+      // Fill the bottom data
+      for (int i = 0; i < num_bottoms_; i++) {
+        bottoms_[i]->Filler();
+      }
+    }
+
+    // Softmax backward computation
+    ProfilerStart(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "SoftmaxBwd");
+    for (int i = 0; i < num_tops_; i++) {
+      dnnmarkSoftmaxBackward(
+              *(p_dnnmark_->GetHandle()),
+              p_dnnmark_->getRunMode(), layer_id_,
+              softmax_param_,
+              DataType<T>::one,
+              top_desc_, tops_[i]->Get(), top_diffs_[i]->Get(),
+              DataType<T>::zero,
+              bottom_desc_,
+              bottom_diffs_[i]->Get());
+    }
+    ProfilerStop(*(p_dnnmark_->GetHandle()), p_dnnmark_->getRunMode(),
+                  layer_id_, p_dnnmark_->GetTimer(), "SoftmaxBwd");
+  }
+
+};
+
+} // namespace dnnmark
+
+#endif // CORE_INCLUDE_LAYERS_SOFTMAX_LAYER_H_
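For reference, a minimal sketch of how a softmax layer could be driven through this interface outside the shipped benchmark binaries. It assumes `TestType` maps to `float` and that the input dimensions are set directly instead of being parsed from a `.dnnmark` config file; the real drivers go through the `DNNMark` class below.

```
// Hedged sketch only, not the shipped driver.
#include "dnnmark.h"

using namespace dnnmark;

void run_softmax_once(const std::string &mmap_file) {
  DNNMark<float> mark(mmap_file);      // single-handle setup, backed by the mmap file
  SoftmaxLayer<float> softmax(&mark);  // layer keeps a back-pointer to DNNMark

  DataDim *dim = softmax.getInputDim();  // N x C x H x W of the bottom tensor
  dim->n_ = 100; dim->c_ = 1000; dim->h_ = 1; dim->w_ = 1;

  softmax.Setup();                // creates the top tensor and its data chunks
  softmax.ForwardPropagation();   // SoftmaxFwd, wrapped in ProfilerStart/Stop
  softmax.BackwardPropagation();  // SoftmaxBwd, wrapped in ProfilerStart/Stop
}
```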
diff --git a/src/DNNMark/core/include/timer.h b/src/DNNMark/core/include/timer.h
new file mode 100644
index 0000000..31b45bc
--- /dev/null
+++ b/src/DNNMark/core/include/timer.h
@@ -0,0 +1,120 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Northeastern University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#ifndef CORE_INCLUDE_TIMER_H_
+#define CORE_INCLUDE_TIMER_H_
+
+#include <sys/time.h>
+#include <numeric>
+#include <map>
+
+namespace dnnmark {
+
+// Class StopWatch
+class StopWatch {
+ private:
+  struct timeval start_;
+  struct timeval end_;
+  bool started_;
+
+ public:
+  StopWatch() :
+    started_(false) {}
+
+  void Start() {
+    if (!started_)
+      gettimeofday(&start_, NULL);
+    else
+      LOG(FATAL) << "The Stop Watch has already started";
+    started_ = true;
+  }
+  void Stop() {
+    if (started_) {
+      gettimeofday(&end_, NULL);
+      started_ = false;
+    } else {
+      LOG(FATAL) << "No Stop Watch has been started yet";
+    }
+  }
+  double DiffInMs() {
+    return static_cast<double>(end_.tv_sec * 1000 +
+        static_cast<double>(end_.tv_usec) / 1000) -
+        static_cast<double>(start_.tv_sec * 1000 +
+            static_cast<double>(start_.tv_usec) / 1000);
+  }
+};
+
+class Timer {
+ private:
+  StopWatch watch_;
+  std::vector<std::string> layer_table_;
+  std::vector<double> timing_table_;
+  int num_records_;
+  double total_time_;
+
+ public:
+  Timer()
+  : watch_(), num_records_(0), total_time_(0.0) {}
+
+  void Start(const std::string &layer) {
+    watch_.Start();
+    layer_table_.push_back(layer);
+  }
+
+  // Stop the watch and record the elapsed time for the given layer
+  void Stop(const std::string &layer) {
+    watch_.Stop();
+    if (!layer.compare(layer_table_.back()))
+      timing_table_.push_back(watch_.DiffInMs());
+    else
+      LOG(FATAL) << "Layer to measure doesn't match";
+  }
+
+  // Reset all recorded values
+  void Clear() {
+    layer_table_.clear();
+    timing_table_.clear();
+  }
+
+  // Log each recorded time and accumulate the sum into total_time_
+  void SumRecords() {
+    int index = 0;
+    for (auto const &i : layer_table_) {
+      LOG(INFO) << i << ": " << timing_table_[index] << "ms";
+      total_time_ += timing_table_[index];
+      index++;
+    }
+  }
+
+  double GetTotalTime() {
+    return total_time_;
+  }
+
+  int GetNumRecords() {
+    return num_records_;
+  }
+
+};
+
+} // namespace dnnmark
+
+#endif // CORE_INCLUDE_TIMER_H_
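As a rough illustration (not part of the upstream sources), the `Timer` above is used in start/stop pairs keyed by layer name, with `SumRecords()` logging and accumulating the per-record times:

```
#include <iostream>
#include "timer.h"

// Sketch of the intended pairing; the name passed to Stop() must match the
// most recent Start(), otherwise the Timer aborts with LOG(FATAL).
void time_two_passes(dnnmark::Timer &timer) {
  timer.Start("SoftmaxFwd");
  // ... launch the forward kernel and wait for it to finish ...
  timer.Stop("SoftmaxFwd");

  timer.Start("SoftmaxBwd");
  // ... launch the backward kernel and wait for it to finish ...
  timer.Stop("SoftmaxBwd");

  timer.SumRecords();  // logs every record and accumulates the total
  std::cout << "Total time: " << timer.GetTotalTime() << " ms" << std::endl;
}
```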
diff --git a/src/DNNMark/core/include/utility.h b/src/DNNMark/core/include/utility.h
new file mode 100644
index 0000000..6ad9356
--- /dev/null
+++ b/src/DNNMark/core/include/utility.h
@@ -0,0 +1,58 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Northeastern University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#ifndef CORE_INCLUDE_UTILITY_H_
+#define CORE_INCLUDE_UTILITY_H_
+
+#include <string>
+#include <algorithm>
+#include <cctype>
+#include <functional>
+
+namespace dnnmark {
+
+//
+// In-place trim of a string, removing whitespace characters from both ends
+//
+
+void TrimStr(std::string *s);
+void TrimStrLeft(std::string *s);
+void TrimStrRight(std::string *s);
+
+//
+// Split a "variable=value" configuration entry into its variable and value
+//
+
+void SplitStr(const std::string &s, std::string *var, std::string *val,
+              std::string delimiter = "=");
+
+//
+// Detect comment and empty strings
+//
+
+bool isCommentStr(const std::string &s, char comment_marker = '#');
+
+bool isEmptyStr(const std::string &s);
+
+} // namespace dnnmark
+
+#endif // CORE_INCLUDE_UTILITY_H_
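A short sketch of how these helpers fit together when handling one line of a `.dnnmark` config file; the real parsing loop lives in `dnnmark.cc`, this is illustrative only:

```
#include <string>
#include "utility.h"

// Parse a single "var=val" config entry, skipping comments and blank lines.
// Returns false when the line carries no configuration data.
bool parse_config_line(std::string line, std::string *var, std::string *val) {
  dnnmark::TrimStr(&line);            // strip surrounding whitespace
  if (dnnmark::isCommentStr(line) ||  // lines starting with '#'
      dnnmark::isEmptyStr(line))
    return false;
  dnnmark::SplitStr(line, var, val);  // split on the '=' delimiter
  return true;
}

// Example: parse_config_line("n=100", &var, &val) yields var == "n",
// val == "100".
```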
diff --git a/src/DNNMark/core/src/common.cc b/src/DNNMark/core/src/common.cc
new file mode 100644
index 0000000..b2ab7ce
--- /dev/null
+++ b/src/DNNMark/core/src/common.cc
@@ -0,0 +1,47 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Northeastern University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include "common.h"
+
+namespace dnnmark {
+
+//
+// Internal data type. Code courtesy of Caffe
+//
+
+float DataType<float>::oneval = 1.0;
+float DataType<float>::zeroval = 0.0;
+const void* DataType<float>::one =
+    static_cast<void *>(&DataType<float>::oneval);
+const void* DataType<float>::zero =
+    static_cast<void *>(&DataType<float>::zeroval);
+#ifdef NVIDIA_CUDNN
+double DataType<double>::oneval = 1.0;
+double DataType<double>::zeroval = 0.0;
+const void* DataType<double>::one =
+    static_cast<void *>(&DataType<double>::oneval);
+const void* DataType<double>::zero =
+    static_cast<void *>(&DataType<double>::zeroval);
+#endif
+
+} // namespace dnnmark
+
diff --git a/src/DNNMark/core/src/dnn_config_keywords.cc b/src/DNNMark/core/src/dnn_config_keywords.cc
new file mode 100644
index 0000000..2f1b173
--- /dev/null
+++ b/src/DNNMark/core/src/dnn_config_keywords.cc
@@ -0,0 +1,57 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Northeastern University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include "dnn_config_keywords.h"
+
+namespace dnnmark {
+
+bool isSection(const std::string &s) {
+  return (std::find(general_section_keywords.begin(),
+                    general_section_keywords.end(), s)
+         != general_section_keywords.end()) ||
+         (std::find(layer_section_keywords.begin(),
+                    layer_section_keywords.end(), s)
+         != layer_section_keywords.end());
+}
+
+bool isGeneralSection(const std::string &s) {
+  return std::find(general_section_keywords.begin(),
+                   general_section_keywords.end(), s)
+         != general_section_keywords.end();
+}
+
+bool isLayerSection(const std::string &s) {
+  return std::find(layer_section_keywords.begin(),
+                   layer_section_keywords.end(), s)
+         != layer_section_keywords.end();
+}
+
+bool isKeywordExist(const std::string &s,
+                    const std::vector<std::string> &config_keywords) {
+  return std::find(config_keywords.begin(),
+                   config_keywords.end(), s)
+         != config_keywords.end();
+}
+
+} // namespace dnnmark
+
+
diff --git a/src/DNNMark/core/src/dnn_utility.cc b/src/DNNMark/core/src/dnn_utility.cc
new file mode 100644
index 0000000..20ca497
--- /dev/null
+++ b/src/DNNMark/core/src/dnn_utility.cc
@@ -0,0 +1,120 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Northeastern University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include "dnn_utility.h"
+
+namespace dnnmark {
+
+Handle::Handle() {
+#ifdef NVIDIA_CUDNN
+  cudnn_handles_ = new cudnnHandle_t[1];
+  CUDNN_CALL(cudnnCreate(&cudnn_handles_[0]));
+  blas_handles_ = new cublasHandle_t[1];
+  CUBLAS_CALL(cublasCreate(&blas_handles_[0]));
+  num_cudnn_handles_ = 1;
+  num_blas_handles_ = 1;
+#endif
+#ifdef AMD_MIOPEN
+  miopen_handles_ = new miopenHandle_t[1];
+  rocblas_handles_ = new rocblas_handle[1];
+  MIOPEN_CALL(miopenCreate(&miopen_handles_[0]));
+  ROCBLAS_CALL(rocblas_create_handle(&rocblas_handles_[0]));
+  num_miopen_handles_ = 1;
+  num_rocblas_handles_ = 1;
+#endif
+}
+
+Handle::Handle(int num) {
+#ifdef NVIDIA_CUDNN
+  cudnn_handles_ = new cudnnHandle_t[num];
+  for (int i = 0; i < num; i++)
+    CUDNN_CALL(cudnnCreate(&cudnn_handles_[i]));
+  num_cudnn_handles_ = num;
+
+  blas_handles_ = new cublasHandle_t[num];
+  for (int i = 0; i < num; i++)
+    CUBLAS_CALL(cublasCreate(&blas_handles_[i]));
+  num_blas_handles_ = num;
+#endif
+#ifdef AMD_MIOPEN
+  miopen_handles_ = new miopenHandle_t[num];
+  rocblas_handles_ = new rocblas_handle[num];
+  for (int i = 0; i < num; i++) {
+    MIOPEN_CALL(miopenCreate(&miopen_handles_[i]));
+    ROCBLAS_CALL(rocblas_create_handle(&rocblas_handles_[i]));
+  }
+  num_miopen_handles_ = num;
+  num_rocblas_handles_ = num;
+#endif
+}
+
+Handle::~Handle() {
+#ifdef NVIDIA_CUDNN
+  for (int i = 0; i < num_cudnn_handles_; i++)
+    CUDNN_CALL(cudnnDestroy(cudnn_handles_[i]));
+  delete []cudnn_handles_;
+  for (int i = 0; i < num_blas_handles_; i++)
+    CUBLAS_CALL(cublasDestroy(blas_handles_[i]));
+  delete []blas_handles_;
+#endif
+#ifdef AMD_MIOPEN
+  for (int i = 0; i < num_miopen_handles_; i++) {
+    MIOPEN_CALL(miopenDestroy(miopen_handles_[i]));
+  }
+  delete []miopen_handles_;
+  for (int i = 0; i < num_rocblas_handles_; i++) {
+    ROCBLAS_CALL(rocblas_destroy_handle(rocblas_handles_[i]));
+  }
+  delete []rocblas_handles_;
+#endif
+}
+
+#ifdef NVIDIA_CUDNN
+cudnnHandle_t Handle::GetCudnn() const { return cudnn_handles_[0]; }
+cudnnHandle_t Handle::GetCudnn(int index) const {
+  return cudnn_handles_[index];
+}
+cublasHandle_t Handle::GetBlas() const { return blas_handles_[0]; }
+cublasHandle_t Handle::GetBlas(int index) const { return blas_handles_[index]; }
+#endif
+#ifdef AMD_MIOPEN
+miopenHandle_t Handle::GetMIOpen() const { return miopen_handles_[0]; }
+miopenHandle_t Handle::GetMIOpen(int index) const {
+  return miopen_handles_[index];
+}
+rocblas_handle Handle::GetBlas() const { return rocblas_handles_[0]; }
+rocblas_handle Handle::GetBlas(int index) const {
+  return rocblas_handles_[index];
+}
+#endif
+
+Descriptor::Descriptor()
+: set_(false) {}
+
+Descriptor::~Descriptor() {
+  set_ = false;
+}
+
+bool Descriptor::isSet() { return set_; }
+
+} // namespace dnnmark
+
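For context, a hedged sketch of how the `Handle` class above is exercised: composed-mode runs construct one library handle per layer and index into them, while standalone runs fall back to the single default handle. The snippet assumes an `AMD_MIOPEN` build with a working ROCm runtime.

```
#include "dnn_utility.h"

using namespace dnnmark;

void handle_example() {
  Handle per_layer_handles(4);  // one MIOpen/rocBLAS handle per layer
#ifdef AMD_MIOPEN
  miopenHandle_t h2 = per_layer_handles.GetMIOpen(2);  // handles for layer 2
  rocblas_handle b2 = per_layer_handles.GetBlas(2);
  (void)h2; (void)b2;  // would be passed to miopen*/rocblas_* calls
#endif
  // The destructor destroys every handle it created.
}
```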
diff --git a/src/DNNMark/core/src/dnnmark.cc b/src/DNNMark/core/src/dnnmark.cc
new file mode 100644
index 0000000..9029268
--- /dev/null
+++ b/src/DNNMark/core/src/dnnmark.cc
@@ -0,0 +1,594 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Northeastern University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#ifdef NVIDIA_CUDNN
+#include "cudnn.h"
+#endif
+#ifdef AMD_MIOPEN
+#include <miopen/miopen.h>
+#endif
+
+#include "dnnmark.h"
+#include "data_png.h"
+
+namespace dnnmark {
+
+//
+// DNNMark class definition
+//
+
+template <typename T>
+DNNMark<T>::DNNMark(const std::string &mmap_file)
+: run_mode_(NONE), handle_(), timer_(), num_layers_added_(0)
+{
+  PseudoNumGenerator::CreateInstance(mmap_file);
+}
+
+template <typename T>
+DNNMark<T>::DNNMark(int num_layers, const std::string &mmap_file)
+: run_mode_(NONE), handle_(num_layers), timer_(), num_layers_added_(0)
+{
+  PseudoNumGenerator::CreateInstance(mmap_file);
+}
+template <typename T>
+void DNNMark<T>::SetLayerParams(LayerType layer_type,
+                    int current_layer_id,
+                    const std::string &var,
+                    const std::string &val) {
+  DataDim *input_dim;
+  ConvolutionParam *conv_param;
+  PoolingParam *pool_param;
+  LRNParam *lrn_param;
+  ActivationParam *activation_param;
+  FullyConnectedParam *fc_param;
+  SoftmaxParam *softmax_param;
+  BatchNormParam *bn_param;
+  DropoutParam *dropout_param;
+  BypassParam *bypass_param;
+  CHECK_GT(num_layers_added_, 0);
+
+  switch(layer_type) {
+    case CONVOLUTION: {
+      // Obtain the data dimension and parameters variable
+      // within specified layer
+      input_dim = std::dynamic_pointer_cast<ConvolutionLayer<T>>
+                  (layers_map_[current_layer_id])->getInputDim();
+      conv_param = std::dynamic_pointer_cast<ConvolutionLayer<T>>
+                   (layers_map_[current_layer_id])->getConvParam();
+
+      if(isKeywordExist(var, data_config_keywords))
+        break;
+
+      SetupConvParam(var, val, conv_param);
+      break;
+    } // End of case CONVOLUTION
+    case POOLING: {
+      // Obtain the data dimension and parameters variable within layer class
+      input_dim = std::dynamic_pointer_cast<PoolingLayer<T>>
+                  (layers_map_[current_layer_id])->getInputDim();
+      pool_param = std::dynamic_pointer_cast<PoolingLayer<T>>
+                   (layers_map_[current_layer_id])->getPoolParam();
+
+      if(isKeywordExist(var, data_config_keywords))
+        break;
+
+      SetupPoolingParam(var, val, pool_param);
+      break;
+    } // End of case POOLING
+    case LRN: {
+      // Obtain the data dimension and parameters variable within layer class
+      input_dim = std::dynamic_pointer_cast<LRNLayer<T>>
+                  (layers_map_[current_layer_id])->getInputDim();
+      lrn_param = std::dynamic_pointer_cast<LRNLayer<T>>
+                   (layers_map_[current_layer_id])->getLRNParam();
+
+      if(isKeywordExist(var, data_config_keywords))
+        break;
+
+      SetupLrnParam(var, val, lrn_param);
+      break;
+    } // End of case LRN
+    case ACTIVATION: {
+      // Obtain the data dimension and parameters variable within layer class
+      input_dim = std::dynamic_pointer_cast<ActivationLayer<T>>
+                  (layers_map_[current_layer_id])->getInputDim();
+      activation_param = std::dynamic_pointer_cast<ActivationLayer<T>>
+                   (layers_map_[current_layer_id])->getActivationParam();
+
+      if(isKeywordExist(var, data_config_keywords))
+        break;
+
+      // Process all the keywords in config
+      SetupActivationParam(var, val, activation_param);
+      break;
+    } // End of case ACTIVATION
+    case FC: {
+      // Obtain the data dimension and parameters variable within layer class
+      input_dim = std::dynamic_pointer_cast<FullyConnectedLayer<T>>
+                  (layers_map_[current_layer_id])->getInputDim();
+      fc_param = std::dynamic_pointer_cast<FullyConnectedLayer<T>>
+                 (layers_map_[current_layer_id])->getFullyConnectedParam();
+
+      if(isKeywordExist(var, data_config_keywords))
+        break;
+
+      SetupFcParam(var, val, fc_param);
+      break;
+    } // End of case FC
+    case SOFTMAX: {
+      // Obtain the data dimension and parameters variable within layer class
+      input_dim = std::dynamic_pointer_cast<SoftmaxLayer<T>>
+                  (layers_map_[current_layer_id])->getInputDim();
+      softmax_param = std::dynamic_pointer_cast<SoftmaxLayer<T>>
+                 (layers_map_[current_layer_id])->getSoftmaxParam();
+
+      if(isKeywordExist(var, data_config_keywords))
+        break;
+
+      SetupSoftmaxParam(var, val, softmax_param);
+      break;
+    } // End of case SOFTMAX
+    case BN: {
+      // Obtain the data dimension and parameters variable within layer class
+      input_dim = std::dynamic_pointer_cast<BatchNormLayer<T>>
+                  (layers_map_[current_layer_id])->getInputDim();
+      bn_param = std::dynamic_pointer_cast<BatchNormLayer<T>>
+                 (layers_map_[current_layer_id])->getBatchNormParam();
+
+      if(isKeywordExist(var, data_config_keywords))
+        break;
+
+      SetupBatchNormParam(var, val, bn_param);
+      break;
+    } // End of case BN
+    case DROPOUT: {
+      // Obtain the data dimension and parameters variable within layer class
+      input_dim = std::dynamic_pointer_cast<DropoutLayer<T>>
+                  (layers_map_[current_layer_id])->getInputDim();
+      dropout_param = std::dynamic_pointer_cast<DropoutLayer<T>>
+                 (layers_map_[current_layer_id])->getDropoutParam();
+
+      if(isKeywordExist(var, data_config_keywords))
+        break;
+
+      SetupDropoutParam(var, val, dropout_param);
+      break;
+    } // End of case DROPOUT
+    case BYPASS: {
+      // Obtain the data dimension and parameters variable within layer class
+      input_dim = std::dynamic_pointer_cast<BypassLayer<T>>
+                  (layers_map_[current_layer_id])->getInputDim();
+      bypass_param = std::dynamic_pointer_cast<BypassLayer<T>>
+                 (layers_map_[current_layer_id])->getBypassParam();
+
+      if(isKeywordExist(var, data_config_keywords))
+        break;
+
+      // Process all the keywords in config
+      if(!isKeywordExist(var, bypass_config_keywords)) {
+        LOG(FATAL) << var << ": Keyword does not exist" << std::endl;
+      }
+      break;
+    } // End of case BYPASS
+    default: {
+      LOG(WARNING) << "NOT supported layer";
+      break;
+    } // End of case default
+
+  }
+
+  // Set the data configuration last, since all layers share the same parameters
+  if(isKeywordExist(var, data_config_keywords)) {
+    if (!var.compare("n")) {
+      input_dim->n_ = atoi(val.c_str());
+    } else if (!var.compare("c")) {
+      input_dim->c_ = atoi(val.c_str());
+    } else if (!var.compare("h")) {
+      input_dim->h_ = atoi(val.c_str());
+    } else if (!var.compare("w")) {
+      input_dim->w_ = atoi(val.c_str());
+    } else if (!var.compare("name")) {
+      layers_map_[current_layer_id]->setLayerName(val.c_str());
+      name_id_map_[val] = current_layer_id;
+    } else if (!var.compare("previous_layer")) {
+      layers_map_[current_layer_id]->setPrevLayerName(val.c_str());
+    }
+  }
+}
+
+template <typename T>
+void DNNMark<T>::ParseAllConfig(const std::string &config_file) {
+  // TODO: use multithread in the future
+  // Parse DNNMark specific config
+  ParseGeneralConfig(config_file);
+
+  // Parse Layers config
+  ParseLayerConfig(config_file);
+}
+
+template <typename T>
+int DNNMark<T>::ParseGeneralConfig(const std::string &config_file) {
+  std::ifstream is;
+  is.open(config_file.c_str(), std::ifstream::in);
+  LOG(INFO) << "Search and parse general DNNMark configuration";
+
+  // TODO: insert assert regarding run_mode_
+
+  // Parse DNNMark config
+  std::string s;
+  bool is_general_section = false;
+  while (!is.eof()) {
+    // Obtain the string in one line
+    std::getline(is, s);
+    TrimStr(&s);
+
+    // Check the specific configuration section markers
+    if (isCommentStr(s) || isEmptyStr(s)) {
+      continue;
+    } else if (isGeneralSection(s)) {
+      is_general_section = true;
+      continue;
+    } else if (isLayerSection(s)) {
+      is_general_section = false;
+      break;
+    } else if (is_general_section) {
+      // Obtain the actual variable and value
+      std::string var;
+      std::string val;
+      SplitStr(s, &var, &val);
+
+      // Process all the keywords in config
+      if(isKeywordExist(var, dnnmark_config_keywords)) {
+        if (!var.compare("run_mode")) {
+          if (!val.compare("none"))
+            run_mode_ = NONE;
+          else if(!val.compare("standalone"))
+            run_mode_ = STANDALONE;
+          else if(!val.compare("composed"))
+            run_mode_ = COMPOSED;
+          else
+            std::cerr << "Unknown run mode" << std::endl;
+        }
+      } else {
+        LOG(FATAL) << var << ": Keyword does not exist" << std::endl;
+      }
+    }
+  }
+
+  is.close();
+  return 0;
+}
+
+template <typename T>
+int DNNMark<T>::ParseLayerConfig(const std::string &config_file) {
+  std::ifstream is;
+  is.open(config_file.c_str(), std::ifstream::in);
+
+  // Parse DNNMark config
+  std::string s;
+  int current_layer_id;
+  LayerType layer_type;
+  bool is_layer_section = false;
+
+  LOG(INFO) << "Search and parse layer configuration";
+  while (!is.eof()) {
+    // Obtain the string in one line
+    std::getline(is, s);
+    TrimStr(&s);
+
+    // Check the specific configuration section markers
+    if (isCommentStr(s) || isEmptyStr(s)){
+      continue;
+    } else if (isGeneralSection(s)) {
+      is_layer_section = false;
+    } else if (isLayerSection(s)) {
+      is_layer_section = true;
+      layer_type = layer_type_map.at(s);
+      LOG(INFO) << "Add "
+                << s
+                << " layer";
+      // Create a layer in the main class
+      current_layer_id = num_layers_added_;
+      if (layer_type == CONVOLUTION)
+        layers_map_.emplace(current_layer_id,
+          std::make_shared<ConvolutionLayer<T>>(this));
+      else if (layer_type == POOLING)
+        layers_map_.emplace(current_layer_id,
+          std::make_shared<PoolingLayer<T>>(this));
+      else if (layer_type == LRN)
+        layers_map_.emplace(current_layer_id,
+          std::make_shared<LRNLayer<T>>(this));
+      else if (layer_type == ACTIVATION)
+        layers_map_.emplace(current_layer_id,
+          std::make_shared<ActivationLayer<T>>(this));
+      else if (layer_type == FC)
+        layers_map_.emplace(current_layer_id,
+          std::make_shared<FullyConnectedLayer<T>>(this));
+      else if (layer_type == SOFTMAX)
+        layers_map_.emplace(current_layer_id,
+          std::make_shared<SoftmaxLayer<T>>(this));
+      else if (layer_type == BN)
+        layers_map_.emplace(current_layer_id,
+          std::make_shared<BatchNormLayer<T>>(this));
+      else if (layer_type == DROPOUT)
+        layers_map_.emplace(current_layer_id,
+          std::make_shared<DropoutLayer<T>>(this));
+      else if (layer_type == BYPASS)
+        layers_map_.emplace(current_layer_id,
+          std::make_shared<BypassLayer<T>>(this));
+      layers_map_[current_layer_id]->setLayerId(current_layer_id);
+      layers_map_[current_layer_id]->setLayerType(layer_type);
+      num_layers_added_++;
+      continue;
+    } else if (is_layer_section) {
+      // Obtain the actual variable and value
+      std::string var;
+      std::string val;
+      SplitStr(s, &var, &val);
+
+      // Set the layer parameters from the parsed variable and value
+      SetLayerParams(layer_type,
+                     current_layer_id,
+                     var, val);
+    }
+  }
+
+  is.close();
+  return 0;
+}
+
+template <typename T>
+int DNNMark<T>::Initialize() {
+  LOG(INFO) << "DNNMark: Initialize...";
+  LOG(INFO) << "Running mode: " << run_mode_;
+  LOG(INFO) << "Number of Layers: " << layers_map_.size();
+  for (auto it = layers_map_.begin(); it != layers_map_.end(); it++) {
+    LOG(INFO) << "Layer type: " << it->second->getLayerType();
+    if (it->second->getLayerType() == CONVOLUTION) {
+      LOG(INFO) << "DNNMark: Setup parameters of Convolution layer";
+      std::dynamic_pointer_cast<ConvolutionLayer<T>>(it->second)->Setup();
+    }
+    if (it->second->getLayerType() == POOLING) {
+      LOG(INFO) << "DNNMark: Setup parameters of Pooling layer";
+      std::dynamic_pointer_cast<PoolingLayer<T>>(it->second)->Setup();
+    }
+    if (it->second->getLayerType() == LRN) {
+      LOG(INFO) << "DNNMark: Setup parameters of LRN layer";
+      std::dynamic_pointer_cast<LRNLayer<T>>(it->second)->Setup();
+    }
+    if (it->second->getLayerType() == ACTIVATION) {
+      LOG(INFO) << "DNNMark: Setup parameters of Activation layer";
+      std::dynamic_pointer_cast<ActivationLayer<T>>(it->second)->Setup();
+    }
+    if (it->second->getLayerType() == FC) {
+      LOG(INFO) << "DNNMark: Setup parameters of Fully Connected layer";
+      std::dynamic_pointer_cast<FullyConnectedLayer<T>>(it->second)->Setup();
+    }
+    if (it->second->getLayerType() == SOFTMAX) {
+      LOG(INFO) << "DNNMark: Setup parameters of Softmax layer";
+      std::dynamic_pointer_cast<SoftmaxLayer<T>>(it->second)->Setup();
+    }
+    if (it->second->getLayerType() == BN) {
+      LOG(INFO) << "DNNMark: Setup parameters of Batch Normalization layer";
+      std::dynamic_pointer_cast<BatchNormLayer<T>>(it->second)->Setup();
+    }
+    if (it->second->getLayerType() == DROPOUT) {
+      LOG(INFO) << "DNNMark: Setup parameters of Dropout layer";
+      std::dynamic_pointer_cast<DropoutLayer<T>>(it->second)->Setup();
+    }
+    if (it->second->getLayerType() == BYPASS) {
+      LOG(INFO) << "DNNMark: Setup parameters of Bypass layer";
+      std::dynamic_pointer_cast<BypassLayer<T>>(it->second)->Setup();
+    }
+  }
+  return 0;
+}
+
+template <typename T>
+int DNNMark<T>::RunAll() {
+  for (auto it = layers_map_.begin(); it != layers_map_.end(); it++) {
+    if (it->second->getLayerType() == CONVOLUTION) {
+      std::dynamic_pointer_cast<ConvolutionLayer<T>>(it->second)
+        ->ForwardPropagation();
+      std::dynamic_pointer_cast<ConvolutionLayer<T>>(it->second)
+        ->BackwardPropagation();
+    }
+    if (it->second->getLayerType() == POOLING) {
+      std::dynamic_pointer_cast<PoolingLayer<T>>(it->second)
+        ->ForwardPropagation();
+      std::dynamic_pointer_cast<PoolingLayer<T>>(it->second)
+        ->BackwardPropagation();
+    }
+    if (it->second->getLayerType() == LRN) {
+      std::dynamic_pointer_cast<LRNLayer<T>>(it->second)
+        ->ForwardPropagation();
+      std::dynamic_pointer_cast<LRNLayer<T>>(it->second)
+        ->BackwardPropagation();
+    }
+    if (it->second->getLayerType() == ACTIVATION) {
+      std::dynamic_pointer_cast<ActivationLayer<T>>(it->second)
+        ->ForwardPropagation();
+      std::dynamic_pointer_cast<ActivationLayer<T>>(it->second)
+        ->BackwardPropagation();
+    }
+    if (it->second->getLayerType() == FC) {
+      std::dynamic_pointer_cast<FullyConnectedLayer<T>>(it->second)
+        ->ForwardPropagation();
+      std::dynamic_pointer_cast<FullyConnectedLayer<T>>(it->second)
+        ->BackwardPropagation();
+    }
+    if (it->second->getLayerType() == SOFTMAX) {
+      std::dynamic_pointer_cast<SoftmaxLayer<T>>(it->second)
+        ->ForwardPropagation();
+      std::dynamic_pointer_cast<SoftmaxLayer<T>>(it->second)
+        ->BackwardPropagation();
+    }
+    if (it->second->getLayerType() == BN) {
+      std::dynamic_pointer_cast<BatchNormLayer<T>>(it->second)
+        ->ForwardPropagation();
+      std::dynamic_pointer_cast<BatchNormLayer<T>>(it->second)
+        ->BackwardPropagation();
+    }
+    if (it->second->getLayerType() == DROPOUT) {
+      std::dynamic_pointer_cast<DropoutLayer<T>>(it->second)
+        ->ForwardPropagation();
+      std::dynamic_pointer_cast<DropoutLayer<T>>(it->second)
+        ->BackwardPropagation();
+    }
+    if (it->second->getLayerType() == BYPASS) {
+      std::dynamic_pointer_cast<BypassLayer<T>>(it->second)
+        ->ForwardPropagation();
+      std::dynamic_pointer_cast<BypassLayer<T>>(it->second)
+        ->BackwardPropagation();
+    }
+  }
+  return 0;
+}
+
+template <typename T>
+int DNNMark<T>::Forward() {
+  for (auto it = layers_map_.begin(); it != layers_map_.end(); it++) {
+    if (it->second->getLayerType() == CONVOLUTION) {
+      LOG(INFO) << "DNNMark: Running convolution forward: STARTED";
+      std::dynamic_pointer_cast<ConvolutionLayer<T>>(it->second)
+        ->ForwardPropagation();
+      LOG(INFO) << "DNNMark: Running convolution forward: FINISHED";
+    }
+    if (it->second->getLayerType() == POOLING) {
+      LOG(INFO) << "DNNMark: Running pooling forward: STARTED";
+      std::dynamic_pointer_cast<PoolingLayer<T>>(it->second)
+        ->ForwardPropagation();
+      LOG(INFO) << "DNNMark: Running pooling forward: FINISHED";
+    }
+    if (it->second->getLayerType() == LRN) {
+      LOG(INFO) << "DNNMark: Running LRN forward: STARTED";
+      std::dynamic_pointer_cast<LRNLayer<T>>(it->second)
+        ->ForwardPropagation();
+      LOG(INFO) << "DNNMark: Running LRN forward: FINISHED";
+    }
+    if (it->second->getLayerType() == ACTIVATION) {
+      LOG(INFO) << "DNNMark: Running Activation forward: STARTED";
+      std::dynamic_pointer_cast<ActivationLayer<T>>(it->second)
+        ->ForwardPropagation();
+      LOG(INFO) << "DNNMark: Running Activation forward: FINISHED";
+    }
+    if (it->second->getLayerType() == FC) {
+      LOG(INFO) << "DNNMark: Running FullyConnected forward: STARTED";
+      std::dynamic_pointer_cast<FullyConnectedLayer<T>>(it->second)
+        ->ForwardPropagation();
+      LOG(INFO) << "DNNMark: Running FullyConnected forward: FINISHED";
+    }
+    if (it->second->getLayerType() == SOFTMAX) {
+      LOG(INFO) << "DNNMark: Running Softmax forward: STARTED";
+      std::dynamic_pointer_cast<SoftmaxLayer<T>>(it->second)
+        ->ForwardPropagation();
+      LOG(INFO) << "DNNMark: Running Softmax forward: FINISHED";
+    }
+    if (it->second->getLayerType() == BN) {
+      LOG(INFO) << "DNNMark: Running BatchNormalization forward: STARTED";
+      std::dynamic_pointer_cast<BatchNormLayer<T>>(it->second)
+        ->ForwardPropagation();
+      LOG(INFO) << "DNNMark: Running BatchNormalization forward: FINISHED";
+    }
+    if (it->second->getLayerType() == DROPOUT) {
+      LOG(INFO) << "DNNMark: Running Dropout forward: STARTED";
+      std::dynamic_pointer_cast<DropoutLayer<T>>(it->second)
+        ->ForwardPropagation();
+      LOG(INFO) << "DNNMark: Running Dropout forward: FINISHED";
+    }
+    if (it->second->getLayerType() == BYPASS) {
+      LOG(INFO) << "DNNMark: Running Bypass forward: STARTED";
+      std::dynamic_pointer_cast<BypassLayer<T>>(it->second)
+        ->ForwardPropagation();
+      LOG(INFO) << "DNNMark: Running Bypass forward: FINISHED";
+    }
+  }
+  return 0;
+}
+
+template <typename T>
+int DNNMark<T>::Backward() {
+  for (auto it = layers_map_.rbegin(); it != layers_map_.rend(); it++) {
+    if (it->second->getLayerType() == CONVOLUTION) {
+      LOG(INFO) << "DNNMark: Running convolution backward: STARTED";
+      std::dynamic_pointer_cast<ConvolutionLayer<T>>(it->second)
+        ->BackwardPropagation();
+      LOG(INFO) << "DNNMark: Running convolution backward: FINISHED";
+    }
+    if (it->second->getLayerType() == POOLING) {
+      LOG(INFO) << "DNNMark: Running pooling backward: STARTED";
+      std::dynamic_pointer_cast<PoolingLayer<T>>(it->second)
+        ->BackwardPropagation();
+      LOG(INFO) << "DNNMark: Running pooling backward: FINISHED";
+    }
+    if (it->second->getLayerType() == LRN) {
+      LOG(INFO) << "DNNMark: Running LRN backward: STARTED";
+      std::dynamic_pointer_cast<LRNLayer<T>>(it->second)
+        ->BackwardPropagation();
+      LOG(INFO) << "DNNMark: Running LRN backward: FINISHED";
+    }
+    if (it->second->getLayerType() == ACTIVATION) {
+      LOG(INFO) << "DNNMark: Running Activation backward: STARTED";
+      std::dynamic_pointer_cast<ActivationLayer<T>>(it->second)
+        ->BackwardPropagation();
+      LOG(INFO) << "DNNMark: Running Activation backward: FINISHED";
+    }
+    if (it->second->getLayerType() == FC) {
+      LOG(INFO) << "DNNMark: Running FullyConnected backward: STARTED";
+      std::dynamic_pointer_cast<FullyConnectedLayer<T>>(it->second)
+        ->BackwardPropagation();
+      LOG(INFO) << "DNNMark: Running FullyConnected backward: FINISHED";
+    }
+    if (it->second->getLayerType() == SOFTMAX) {
+      LOG(INFO) << "DNNMark: Running Softmax backward: STARTED";
+      std::dynamic_pointer_cast<SoftmaxLayer<T>>(it->second)
+        ->BackwardPropagation();
+      LOG(INFO) << "DNNMark: Running Softmax backward: FINISHED";
+    }
+    if (it->second->getLayerType() == BN) {
+      LOG(INFO) << "DNNMark: Running BatchNormalization backward: STARTED";
+      std::dynamic_pointer_cast<BatchNormLayer<T>>(it->second)
+        ->BackwardPropagation();
+      LOG(INFO) << "DNNMark: Running BatchNormalization backward: FINISHED";
+    }
+    if (it->second->getLayerType() == DROPOUT) {
+      LOG(INFO) << "DNNMark: Running Dropout backward: STARTED";
+      std::dynamic_pointer_cast<DropoutLayer<T>>(it->second)
+        ->BackwardPropagation();
+      LOG(INFO) << "DNNMark: Running Dropout backward: FINISHED";
+    }
+    if (it->second->getLayerType() == BYPASS) {
+      LOG(INFO) << "DNNMark: Running Bypass backward: STARTED";
+      std::dynamic_pointer_cast<BypassLayer<T>>(it->second)
+        ->BackwardPropagation();
+      LOG(INFO) << "DNNMark: Running Bypass backward: FINISHED";
+    }
+  }
+  return 0;
+}
+
+
+// Explicit instantiation
+template class DNNMark<TestType>;
+
+} // namespace dnnmark
+
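Putting the pieces together, the per-benchmark binaries are thin drivers around this class. A hedged sketch of the typical flow, assuming `TestType` maps to `float` (the shipped binaries additionally parse command-line options such as the config file and mmap file paths):

```
// Sketch of a benchmark driver built on DNNMark<T>; option parsing and error
// handling are omitted.
#include "dnnmark.h"

using namespace dnnmark;

int run_benchmark(const std::string &config, const std::string &mmap_file) {
  DNNMark<float> mark(mmap_file);  // data generation backed by the mmap file
  mark.ParseAllConfig(config);     // general section first, then layer sections
  mark.Initialize();               // per-layer Setup(): descriptors and buffers
  mark.Forward();                  // forward pass over layers in config order
  mark.Backward();                 // backward pass in reverse order
  return 0;
}
```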
diff --git a/src/DNNMark/core/src/gemm_wrapper.cc b/src/DNNMark/core/src/gemm_wrapper.cc
new file mode 100644
index 0000000..add2cde
--- /dev/null
+++ b/src/DNNMark/core/src/gemm_wrapper.cc
@@ -0,0 +1,109 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Northeastern University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include "gemm_wrapper.h"
+
+namespace dnnmark {
+
+template <>
+void dnnmarkGEMM(const Handle &handle, RunMode mode, int idx,
+                 bool is_a_transpose, bool is_b_transpose,
+                 int m, int n, int k,
+                 float *alpha,
+                 float *a, int lda,
+                 float *b, int ldb,
+                 float *beta,
+                 float *c, int ldc) {
+#ifdef NVIDIA_CUDNN
+  cublasOperation_t transa = is_a_transpose ? CUBLAS_OP_T : CUBLAS_OP_N;
+  cublasOperation_t transb = is_b_transpose ? CUBLAS_OP_T : CUBLAS_OP_N;
+  CUBLAS_CALL(cublasSgemm(mode == COMPOSED ?
+                          handle.GetBlas(idx) : handle.GetBlas(),
+                          transa, transb,
+                          m, n, k,
+                          alpha,
+                          a, lda,
+                          b, ldb,
+                          beta,
+                          c, ldc));
+#endif
+#ifdef AMD_MIOPEN
+  rocblas_operation transa = is_a_transpose ? rocblas_operation_transpose :
+                             rocblas_operation_none;
+  rocblas_operation transb = is_b_transpose ? rocblas_operation_transpose :
+                             rocblas_operation_none;
+
+  ROCBLAS_CALL(rocblas_sgemm(mode == COMPOSED ?
+                             handle.GetBlas(idx) : handle.GetBlas(),
+                             transa, transb,
+                             m, n, k,
+                             alpha,
+                             a, lda,
+                             b, ldb,
+                             beta,
+                             c, ldc));
+#endif
+
+}
+
+template <>
+void dnnmarkGEMM(const Handle &handle, RunMode mode, int idx,
+                 bool is_a_transpose, bool is_b_transpose,
+                 int m, int n, int k,
+                 double *alpha,
+                 double *a, int lda,
+                 double *b, int ldb,
+                 double *beta,
+                 double *c, int ldc) {
+#ifdef NVIDIA_CUDNN
+  cublasOperation_t transa = is_a_transpose ? CUBLAS_OP_T : CUBLAS_OP_N;
+  cublasOperation_t transb = is_b_transpose ? CUBLAS_OP_T : CUBLAS_OP_N;
+  CUBLAS_CALL(cublasDgemm(mode == COMPOSED ?
+                          handle.GetBlas(idx) : handle.GetBlas(),
+                          transa, transb,
+                          m, n, k,
+                          alpha,
+                          a, lda,
+                          b, ldb,
+                          beta,
+                          c, ldc));
+#endif
+#ifdef AMD_MIOPEN
+  rocblas_operation transa = is_a_transpose ? rocblas_operation_transpose :
+                             rocblas_operation_none;
+  rocblas_operation transb = is_b_transpose ? rocblas_operation_transpose :
+                             rocblas_operation_none;
+
+  ROCBLAS_CALL(rocblas_dgemm(mode == COMPOSED ?
+                             handle.GetBlas(idx) : handle.GetBlas(),
+                             transa, transb,
+                             m, n, k,
+                             alpha,
+                             a, lda,
+                             b, ldb,
+                             beta,
+                             c, ldc));
+#endif
+}
+
+} // namespace dnnmark
+
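For illustration, a hedged example of calling the single-precision wrapper directly. `d_a`, `d_b`, and `d_c` are assumed to be device buffers holding column-major matrices, as cuBLAS/rocBLAS expect; in standalone mode the `idx` argument is ignored and the default handle is used.

```
#include "gemm_wrapper.h"

using namespace dnnmark;

// Computes C = alpha * A * B + beta * C for column-major MxK and KxN inputs.
void gemm_example(const Handle &handle,
                  float *d_a, float *d_b, float *d_c,
                  int m, int n, int k) {
  float alpha = 1.0f;
  float beta  = 0.0f;
  dnnmarkGEMM(handle, STANDALONE, /*idx=*/0,
              /*is_a_transpose=*/false, /*is_b_transpose=*/false,
              m, n, k,
              &alpha,
              d_a, /*lda=*/m,
              d_b, /*ldb=*/k,
              &beta,
              d_c, /*ldc=*/m);
}
```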
diff --git a/src/DNNMark/core/src/utility.cc b/src/DNNMark/core/src/utility.cc
new file mode 100644
index 0000000..a00a0d9
--- /dev/null
+++ b/src/DNNMark/core/src/utility.cc
@@ -0,0 +1,71 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Northeastern University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include <glog/logging.h>
+#include "utility.h"
+
+namespace dnnmark {
+
+void TrimStr(std::string *s) {
+  TrimStrLeft(s);
+  TrimStrRight(s);
+}
+
+void TrimStrLeft(std::string *s) {
+  s->erase(s->begin(), std::find_if(s->begin(), s->end(),
+           std::not1(std::ptr_fun<int, int>(std::isspace))));
+}
+
+void TrimStrRight(std::string *s) {
+  s->erase(std::find_if(s->rbegin(), s->rend(),
+           std::not1(std::ptr_fun<int, int>(std::isspace))).base(), s->end());
+}
+
+void SplitStr(const std::string &s, std::string *var, std::string *val,
+              std::string delimiter) {
+  // Obtain the position of the delimiter (the equal sign by default)
+  std::size_t pos = s.find_first_of(delimiter);
+
+  // Error detection: the delimiter must be present
+  LOG_IF(FATAL, pos == std::string::npos) << "Illegal configuration!!!";
+
+  // Obtain the substring of variable and value
+  *var = s.substr(0, pos);
+  TrimStr(var);
+  *val = s.substr(pos+1, std::string::npos);
+  TrimStr(val);
+}
+
+bool isCommentStr(const std::string &s, char comment_marker) {
+  std::string local_s = s;
+  TrimStr(&local_s);
+  return local_s[0] == comment_marker;
+}
+
+bool isEmptyStr(const std::string &s) {
+  std::string local_s = s;
+  TrimStr(&local_s);
+  return !local_s.compare("");
+}
+
+} // namespace dnnmark
+
diff --git a/src/DNNMark/data/cifar/download.py b/src/DNNMark/data/cifar/download.py
new file mode 100755
index 0000000..97377fd
--- /dev/null
+++ b/src/DNNMark/data/cifar/download.py
@@ -0,0 +1,22 @@
+#! /usr/bin/env python
+
+import urllib
+import tarfile
+import os
+
+print("Downloading...")
+
+testfile = urllib.URLopener()
+testfile.retrieve("http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz", "cifar-10-binary.tar.gz")
+
+print("Unzipping...")
+
+tar = tarfile.open("cifar-10-binary.tar.gz")
+tar.extractall()
+tar.close()
+
+os.system("rm -f cifar-10-binary.tar.gz")
+os.system("mv cifar-10-batches-bin/* .")
+os.system("rm -rf cifar-10-batches-bin")
+
+print("Done.")
diff --git a/src/DNNMark/generate_cachefiles.sh b/src/DNNMark/generate_cachefiles.sh
new file mode 100755
index 0000000..1f2a2e3
--- /dev/null
+++ b/src/DNNMark/generate_cachefiles.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+cd /MIOpen/src/kernels
+
+# test_fwd_softmax/test_bwd_softmax
+mkdir -p /.cache/miopen/1.7.0/5c3130f7e6d7b29bb65a02f5de0084a6
+/opt/rocm/bin/clang-ocl  -DNUM_BATCH=1 -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -mcpu=gfx801 -Wno-everything MIOpenSoftmax.cl -o /.cache/miopen/1.7.0/5c3130f7e6d7b29bb65a02f5de0084a6/MIOpenSoftmax.cl.o
+
+# test_fwd_bn
+mkdir -p /.cache/miopen/1.7.0/f8850ed3a540a1e8eb258b582f554d57
+/opt/rocm/bin/clang-ocl  -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FPMIX=0 -DMIO_SAVE_MEAN_VARIANCE=1 -DMIO_RUNNING_RESULT=1 -DMIO_BN_N=100 -DMIO_BN_C=1000 -DMIO_BN_HW=1 -DMIO_BN_NHW=100 -DMIO_BN_CHW=1000 -DMIO_BN_LDS_SIZE=256 -DMIO_BN_GRP0=1 -DMIO_BN_GRP1=256 -DMIO_BN_GRP2=1 -DMIO_BN_NCHW=100000 -mcpu=gfx801 -Wno-everything MIOpenBatchNormFwdTrainPerAct.cl -o  /.cache/miopen/1.7.0/f8850ed3a540a1e8eb258b582f554d57/MIOpenBatchNormFwdTrainPerAct.cl.o
+
+# test_bwd_bn
+mkdir -p /.cache/miopen/1.7.0/2d295f7887fee4bec3c01ac73f8a25cd
+/opt/rocm/bin/clang-ocl  -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FPMIX=0 -DMIO_BN_N=100 -DMIO_BN_C=1000 -DMIO_BN_HW=1 -DMIO_BN_NHW=100 -DMIO_BN_CHW=1000 -DMIO_BN_NCHW=100000 -DMIO_BN_NGRPS=1 -DMIO_BN_GRP0=1 -DMIO_BN_GRP1=64 -DMIO_BN_GRP2=1 -mcpu=gfx801 -Wno-everything MIOpenBatchNormBwdPerAct.cl -o /.cache/miopen/1.7.0/2d295f7887fee4bec3c01ac73f8a25cd/MIOpenBatchNormBwdPerAct.cl.o
+
+# test_fwd_bypass/test_bwd_bypass
+mkdir -p /.cache/miopen/1.7.0/e213d754468ef6732bb836ed186f5783
+/opt/rocm/bin/clang-ocl  -DLITE -DMIOPEN_READ_UNIT=4 -DMIOPEN_READ_TYPE=_FLOAT4 -DMIOPEN_NRN_OP_ID=0 -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -mcpu=gfx801 -Wno-everything MIOpenNeuron.cl -o /.cache/miopen/1.7.0/e213d754468ef6732bb836ed186f5783/MIOpenNeuron.cl.o
+
+# test_fwd_composed_model/test_bwd_composed_model
+mkdir -p /.cache/miopen/1.7.0/86de626b159aea830f0ba2f1788e0f40
+/opt/rocm/bin/clang-ocl  -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -mcpu=gfx801 -Wno-everything MIOpenUtilKernels2.cl -o /.cache/miopen/1.7.0/86de626b159aea830f0ba2f1788e0f40/MIOpenUtilKernels2.cl.o
+
+mkdir -p /.cache/miopen/1.7.0/ac0046008721a79b06896f9a5a3ca2cc
+/opt/rocm/bin/clang-ocl  -DSUBTENSOR_OP_WITH_SCALAR=SUBTENSOR_OP_WITH_SCALAR_SET -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1 -DWORK_LENGTH_0=4096 -mcpu=gfx801 -Wno-everything MIOpenSubTensorOpWithScalarKernel.cl -o /.cache/miopen/1.7.0/ac0046008721a79b06896f9a5a3ca2cc/MIOpenSubTensorOpWithScalarKernel.cl.o
+
+mkdir -p /.cache/miopen/1.7.0/7a58553f312474aa3cf449e5d9969a51
+/opt/rocm/bin/clang-ocl  -DMLO_POOLING_OP_ID=1 -DMLO_POOLING_KERNEL_SZ1=3 -DMLO_POOLING_PAD1=0 -DMLO_POOLING_STRIDE1=2 -DMLO_POOLING_KERNEL_SZ0=3 -DMLO_POOLING_PAD0=0 -DMLO_POOLING_STRIDE0=2 -DMLO_POOLING_N_OUTPUTS=32 -DMLO_POOLING_N_CHANNELS=32 -DMLO_POOLING_N_HORIZ_OUT_PIX=4 -DMLO_POOLING_N_VERT_OUT_PIX=4 -DMLO_POOLING_GROUP_SZ0=8 -DMLO_POOLING_GROUP_SZ1=8 -DMLO_POOLING_BOT_BATCH_STRIDE=32768 -DMLO_POOLING_BOT_CHANNEL_STRIDE=1024 -DMLO_POOLING_BOT_STRIDE=32 -DMLO_POOLING_TOP_BATCH_STRIDE=8192 -DMLO_POOLING_TOP_CHANNEL_STRIDE=256 -DMLO_POOLING_TOP_STRIDE=16 -DMLO_POOLING_BOT_WIDTH=32 -DMLO_POOLING_BOT_HEIGHT=32 -DMLO_POOLING_TOP_WIDTH=16 -DMLO_POOLING_TOP_HEIGHT=16 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FP16=0 -mcpu=gfx801 -Wno-everything MIOpenPooling.cl -o /.cache/miopen/1.7.0/7a58553f312474aa3cf449e5d9969a51/MIOpenPooling.cl.o
+
+mkdir -p /.cache/miopen/1.7.0/f0ed53d85baef9414aa97b0b15b78a4d
+/opt/rocm/bin/clang-ocl  -DMLO_POOLING_KERNEL_SZ1=3 -DMLO_POOLING_PAD1=0 -DMLO_POOLING_STRIDE1=2 -DMLO_POOLING_KERNEL_SZ0=3 -DMLO_POOLING_PAD0=0 -DMLO_POOLING_STRIDE0=2 -DMLO_POOLING_N_OUTPUTS=32 -DMLO_POOLBWD_N_HORIZ_OUT_PIX=2 -DMLO_POOLBWD_N_VERT_OUT_PIX=2 -DMLO_POOLBWD_GROUP_SZ0=8 -DMLO_POOLBWD_GROUP_SZ1=8 -DMLO_POOLBWD_BOT_WIDTH=32 -DMLO_POOLBWD_BOT_HEIGHT=32 -DMLO_POOLBWD_TOP_WIDTH=16 -DMLO_POOLBWD_TOP_HEIGHT=16 -DMLO_POOLBWD_BOTDF_BATCH_STRIDE=32768 -DMLO_POOLBWD_BOTDF_CHANNEL_STRIDE=1024 -DMLO_POOLBWD_BOTDF_STRIDE=32 -DMLO_POOLBWD_TOPDF_BATCH_STRIDE=8192 -DMLO_POOLBWD_TOPDF_CHANNEL_STRIDE=256 -DMLO_POOLBWD_TOPDF_STRIDE=16 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FP16=0 -mcpu=gfx801 -Wno-everything MIOpenPoolingBwd.cl -o /.cache/miopen/1.7.0/f0ed53d85baef9414aa97b0b15b78a4d/MIOpenPoolingBwd.cl.o
+
+mkdir -p /.cache/miopen/1.7.0/ae3e81b4f4b5968e01343ac25026c938
+/opt/rocm/bin/clang-ocl  -DNUM_CH_PER_WG=1 -DNUM_IM_BLKS_X=1 -DNUM_IM_BLKS=4 -DLOCAL_MEM_SIZE=432 -DSTRIDE_GT_1=0 -DTILE_SZ_X=32 -DTILE_SZ_Y=8 -DUSE_IM_OFF_GUARD=1 -DMIOPEN_USE_FP32=1 -mcpu=gfx801 -Wno-everything MIOpenUtilKernels.cl -o /.cache/miopen/1.7.0/ae3e81b4f4b5968e01343ac25026c938/MIOpenUtilKernels.cl.o
+
+# test_fwd_pool
+mkdir -p /.cache/miopen/1.7.0/7cda2f346ecf0e84b50181f05e75480b
+/opt/rocm/bin/clang-ocl  -DMLO_POOLING_OP_ID=1 -DMLO_POOLING_KERNEL_SZ1=3 -DMLO_POOLING_PAD1=0 -DMLO_POOLING_STRIDE1=2 -DMLO_POOLING_KERNEL_SZ0=3 -DMLO_POOLING_PAD0=0 -DMLO_POOLING_STRIDE0=2 -DMLO_POOLING_N_OUTPUTS=3 -DMLO_POOLING_N_CHANNELS=3 -DMLO_POOLING_N_HORIZ_OUT_PIX=4 -DMLO_POOLING_N_VERT_OUT_PIX=4 -DMLO_POOLING_GROUP_SZ0=8 -DMLO_POOLING_GROUP_SZ1=8 -DMLO_POOLING_BOT_BATCH_STRIDE=196608 -DMLO_POOLING_BOT_CHANNEL_STRIDE=65536 -DMLO_POOLING_BOT_STRIDE=256 -DMLO_POOLING_TOP_BATCH_STRIDE=49152 -DMLO_POOLING_TOP_CHANNEL_STRIDE=16384 -DMLO_POOLING_TOP_STRIDE=128 -DMLO_POOLING_BOT_WIDTH=256 -DMLO_POOLING_BOT_HEIGHT=256 -DMLO_POOLING_TOP_WIDTH=128 -DMLO_POOLING_TOP_HEIGHT=128 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FP16=0 -mcpu=gfx801 -Wno-everything MIOpenPooling.cl -o /.cache/miopen/1.7.0/7cda2f346ecf0e84b50181f05e75480b/MIOpenPooling.cl.o
+
+# test_bwd_pool
+mkdir -p /.cache/miopen/1.7.0/e33e7c33bfa58bc339c2c0ed6e8d29ad
+/opt/rocm/bin/clang-ocl  -DMLO_POOLING_KERNEL_SZ1=3 -DMLO_POOLING_PAD1=0 -DMLO_POOLING_STRIDE1=2 -DMLO_POOLING_KERNEL_SZ0=3 -DMLO_POOLING_PAD0=0 -DMLO_POOLING_STRIDE0=2 -DMLO_POOLING_N_OUTPUTS=3 -DMLO_POOLBWD_N_HORIZ_OUT_PIX=2 -DMLO_POOLBWD_N_VERT_OUT_PIX=2 -DMLO_POOLBWD_GROUP_SZ0=8 -DMLO_POOLBWD_GROUP_SZ1=8 -DMLO_POOLBWD_BOT_WIDTH=256 -DMLO_POOLBWD_BOT_HEIGHT=256 -DMLO_POOLBWD_TOP_WIDTH=128 -DMLO_POOLBWD_TOP_HEIGHT=128 -DMLO_POOLBWD_BOTDF_BATCH_STRIDE=196608 -DMLO_POOLBWD_BOTDF_CHANNEL_STRIDE=65536 -DMLO_POOLBWD_BOTDF_STRIDE=256 -DMLO_POOLBWD_TOPDF_BATCH_STRIDE=49152 -DMLO_POOLBWD_TOPDF_CHANNEL_STRIDE=16384 -DMLO_POOLBWD_TOPDF_STRIDE=128 -DMIOPEN_USE_FP32=1 -DMIOPEN_USE_FP16=0 -mcpu=gfx801 -Wno-everything MIOpenPoolingBwd.cl -o /.cache/miopen/1.7.0/e33e7c33bfa58bc339c2c0ed6e8d29ad/MIOpenPoolingBwd.cl.o
diff --git a/src/DNNMark/generate_rand_data.cpp b/src/DNNMark/generate_rand_data.cpp
new file mode 100644
index 0000000..b4e614d
--- /dev/null
+++ b/src/DNNMark/generate_rand_data.cpp
@@ -0,0 +1,32 @@
+#include <cstdlib>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <fstream>
+#include <string>
+#include <ios>
+#include <iostream>
+#include <vector>
+
+int main(int argc, char *argv[])
+{
+    uint64_t SIZE = (uint64_t)2*1024*1024*1024/sizeof(float); // 2GB
+    std::vector<std::vector<float>> vecs(1);
+    for(auto &vec : vecs)
+        vec.reserve(SIZE);
+
+    srand(1);
+    for(int j = 0; j < vecs.size(); j++)
+        for(int i = 0; i < SIZE; i++) {
+            float r = rand();
+            vecs[j].push_back(r);
+        }
+
+    for(int i = 0; i < vecs.size(); i++)
+    {
+        std::ofstream fout("mmap.bin", std::ios::out | std::ios::binary);
+        fout.write((char *)&vecs[i][0], vecs[i].size() * sizeof(float));
+        fout.close();
+    }
+
+    return 0;
+}
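On the consumer side, the point of this file is that the simulated run can map it instead of generating gigabytes of random floats at setup time; DNNMark presumably does something similar through `PseudoNumGenerator` when given the mmap path. A standalone, assumption-laden sketch of mapping `mmap.bin` and viewing it as floats:

```
// Illustration only: map the 2 GB file produced above as a read-only float array.
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <cstdio>

int main() {
  int fd = open("mmap.bin", O_RDONLY);
  if (fd < 0) { perror("open"); return 1; }

  struct stat st;
  fstat(fd, &st);

  void *base = mmap(nullptr, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
  if (base == MAP_FAILED) { perror("mmap"); return 1; }

  const float *data = static_cast<const float *>(base);
  size_t count = st.st_size / sizeof(float);
  std::printf("first=%f last=%f count=%zu\n", data[0], data[count - 1], count);

  munmap(base, st.st_size);
  close(fd);
  return 0;
}
```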
diff --git a/src/DNNMark/generate_rand_data.py b/src/DNNMark/generate_rand_data.py
new file mode 100644
index 0000000..ea2be80
--- /dev/null
+++ b/src/DNNMark/generate_rand_data.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+
+import argparse
+import random
+import struct
+
+def parse_arguments():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('--output', type=str, default='DNNMark_data.dat',
+                        help='Name of the binary data file to generate')
+    parser.add_argument('--size', type=int, default=2,
+                        help='Size of binary file to generate (in GB)')
+    parser.add_argument('--seed', type=int, default=777,
+                        help='Seed for RNG')
+    parser.add_argument('--double', action='store_true',
+                        help='Store values as doubles (floats by default)')
+
+    return parser.parse_args()
+
+def main():
+    args = parse_arguments()
+
+    num_nums = int(args.size*1024*1024*1024/4) # Size in GB, divided by sizeof float
+    print(num_nums)
+
+    random.seed(args.seed)
+
+    with open(args.output, "wb") as f:
+        for i in range(num_nums):
+            f.write(struct.pack(f'{"d" if args.double else "f"}', random.random()))
+
+if __name__ == '__main__':
+    main()
diff --git a/src/DNNMark/run_dnnmark_template.sh b/src/DNNMark/run_dnnmark_template.sh
new file mode 100755
index 0000000..70eb948
--- /dev/null
+++ b/src/DNNMark/run_dnnmark_template.sh
@@ -0,0 +1,152 @@
+#!/bin/bash
+
+# Wrapper API for DNNMark
+# 2018 (C) Peter Bryzgalov @ CHITECH Stair Lab
+
+usage=$(cat <<- USAGEBLOCK
+Run DNNMark with parameters from CLI.
+Usage:
+$(basename $0)  [-n <number of images, batch size>]
+                [-c <number of channels in input images>]
+                [-h <height of input images>]
+                [-w <width of input images>]
+                [-k <number of filters, output channels>]
+                [-s <size of filter kernel>]
+                [-u <stride>]
+                [-p <padding>]
+                [ --algo <cudnnConvolutionBwdFilterAlgo_t> - cuDNN algorithm for backward filter convolution]
+                [ --bwd_filter_pref <fastest/no_workspace/specify_workspace_limit> - cuDNN backward filter algorithm selection preference]
+                [ --algod <cudnnConvolutionBwdDataAlgo_t> - cuDNN algorithm for backward data convolution]
+                [-b <benchmark executable, default=test_composed_model>]
+                [ --iter <int> - number of FWD+BWD passes to measure time]
+                [ --template - benchmark configuration template file]
+                [ --debug - debug info ]
+                [ --help  - usage info ]
+                [ -d <dataset size> - number of samples in dataset; the number of iterations is derived from the batch size and dataset size]
+
+The configuration is saved in the temporary file conf_tmp.dnnmark.
+USAGEBLOCK
+)
+
+template="conf_convolution_block.dnntemplate"
+config_file="conf_tmp.dnnmark"
+conv_bwd_filter_pref="fastest"
+# Defaults
+N=64
+C=3
+H=32
+W=32
+K=128
+S=3
+U=1
+P=1
+BENCH="test_composed_model"
+ITER=1
+debug=0
+datasetsize=0
+
+
+while test $# -gt 0; do
+    case "$1" in
+        --help)
+            echo "$usage"
+            exit 0
+            ;;
+        -n)
+            N="$2";shift;
+            ;;
+        -c)
+            C="$2";shift;
+            ;;
+        -h)
+            H="$2";shift;
+            ;;
+        -w)
+            W="$2";shift;
+            ;;
+        -k)
+            K="$2";shift;
+            ;;
+        -s)
+            S="$2";shift;
+            ;;
+        -u)
+            U="$2";shift;
+            ;;
+        -p)
+            P="$2";shift;
+            ;;
+        -b)
+            BENCH="$2";shift;
+            ;;
+        -d)
+            datasetsize="$2";shift;
+            ;;
+        --algo)
+            CBFA="$2";shift;
+            ;;
+        --bwd_filter_pref)
+            conv_bwd_filter_pref="$2";shift;
+            ;;
+        --algod)
+            CBDA="$2";shift;
+            ;;
+        --iter)
+            ITER="$2";shift;
+            ;;
+        --debug)
+            debug=1
+            ;;
+        --template)
+            template="$2";shift;
+            ;;
+        --)
+            shift
+            break;;
+        -*)
+            echo "Unknown option $1";
+            echo "$usage"
+            exit 1
+            ;;
+        *)
+            break;;
+    esac
+    shift
+done
+
+if [ -n "$CBFA" ]; then
+    CUDNN_CBFA="algo=$CBFA"
+fi
+
+if [ -n "$CBDA" ]; then
+    CUDNN_CBDA="algod=$CBDA"
+fi
+
+divide_ceil() {
+    echo "($1 + $2 - 1)/$2" | bc
+}
+
+# Calculate number of iterations from BS ($N) and dataset size
+if [ $datasetsize -gt 0 ]; then
+    echo "datasetsize=$datasetsize"
+    # echo "$datasetsize / $N = "
+    # echo "$(divide_ceil $datasetsize $N)"
+    ITER=$(divide_ceil $datasetsize $N)
+fi
+
+echo "Using template $template"
+conf="$(echo EOF;cat $template;echo EOF)"
+
+eval "cat <<$conf" >$config_file
+echo "Config: ---"
+cat $config_file
+echo "-----------"
+echo "Benchmark: $BENCH"
+echo "Iterations:$ITER"
+
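+# Launch the selected benchmark binary against the generated configuration.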
+com="./build/benchmarks/${BENCH}/dnnmark_${BENCH} -config $config_file --warmup 0 --iterations $ITER --debuginfo $debug"
+echo $com
+$com
+
+
+
diff --git a/src/DNNMark/setup.sh b/src/DNNMark/setup.sh
new file mode 100755
index 0000000..30baf95
--- /dev/null
+++ b/src/DNNMark/setup.sh
@@ -0,0 +1,31 @@
+#! /bin/sh
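+# Configures the DNNMark build directory with cmake, targeting either cuDNN
+# (CUDA) or MIOpen/rocBLAS (HIP).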
+
+if [ $# -ne 1 ]
+then
+  echo "[Error] The setup script requires one additional parameter specifying whether CUDA or HCC is used"
+  echo "Options: [CUDA, HIP]"
+  exit
+fi
+
+OPTION=$1
+
+BUILD_DIR=build
+if [ ! -d ${BUILD_DIR} ]; then
+  mkdir ${BUILD_DIR}
+fi
+cd ${BUILD_DIR}
+
+if [ ${OPTION} = "CUDA" ]
+then
+  CUDNN_PATH=${HOME}/cudnn
+  cmake -DCUDA_ENABLE=ON -DCUDNN_ROOT=${CUDNN_PATH} ..
+elif [ ${OPTION} = "HIP" ]
+then
+  MIOPEN_PATH=/opt/rocm/miopen
+  ROCBLAS_PATH=/opt/rocm/rocblas
+  CXX=/opt/rocm/hcc/bin/hcc cmake \
+    -DHCC_ENABLE=ON \
+    -DMIOPEN_ROOT=${MIOPEN_PATH} \
+    -DROCBLAS_ROOT=${ROCBLAS_PATH} \
+    ..
+fi
diff --git a/src/DNNMark/tools/has_cuda_gpu.c b/src/DNNMark/tools/has_cuda_gpu.c
new file mode 100644
index 0000000..41496e0
--- /dev/null
+++ b/src/DNNMark/tools/has_cuda_gpu.c
@@ -0,0 +1,25 @@
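+/* Reports how many CUDA-capable (non-emulation) GPUs are visible and returns
+   0 if at least one is found, 1 otherwise, so it can be used in shell tests. */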
+#include <stdio.h>
+#include <cuda_runtime.h>
+
+int main() {
+  int deviceCount, device;
+  int gpuDeviceCount = 0;
+  struct cudaDeviceProp properties;
+  cudaError_t cudaResultCode = cudaGetDeviceCount(&deviceCount);
+  if (cudaResultCode != cudaSuccess)
+    deviceCount = 0;
+  /* machines with no GPUs can still report one emulation device */
+  for (device = 0; device < deviceCount; ++device) {
+    cudaGetDeviceProperties(&properties, device);
+    if (properties.major != 9999) /* 9999 means emulation only */
+      ++gpuDeviceCount;
+  }
+  printf("%d GPU CUDA device(s) found\n", gpuDeviceCount);
+
+  /* don't just return the number of gpus, because other runtime cuda
+     errors can also yield non-zero return values */
+  if (gpuDeviceCount > 0)
+    return 0; /* success */
+  else
+    return 1; /* failure */
+}
diff --git a/src/DNNMark/tools/parse_nvprof_csv_metrics_with_plot.py b/src/DNNMark/tools/parse_nvprof_csv_metrics_with_plot.py
new file mode 100755
index 0000000..a3ea90b
--- /dev/null
+++ b/src/DNNMark/tools/parse_nvprof_csv_metrics_with_plot.py
@@ -0,0 +1,437 @@
+#! /usr/bin/python
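+#
+# Parses one or more nvprof "*_metrics.csv" files, collects the average value
+# of each metric per kernel, and writes bar charts (IPC, occupancy, and several
+# utilization metrics) plus stall-reason pie charts as PDF files.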
+
+import numpy as np
+import matplotlib.pyplot as plt
+import re
+import sys
+import collections
+import csv
+
+if len(sys.argv) == 1:
+  print "Input file name is empty"
+  exit()
+
+num_files = len(sys.argv) - 1
+
+# Obtain all trace files if there are many
+filename_list = []
+for i in range(num_files):
+  # Obtain file name
+  filename_list.append(sys.argv[i+1])
+
+regex_kernel = re.compile(r"(?:void)?\s?(?:\w+(?:::))*(\w+)(?:<.*>)*\(?.*\)?$")
+regex_int = re.compile(r"^(?:\w+)*\s*\(*(\d+)\)*$")
+regex_float = re.compile(r"(\d+\.\d+(e\+)*\d*)(?:\w+\/\w+|%)*")
+
+kernel_name = ""
+filename_kernel_dict = collections.OrderedDict()
+metric_dict = collections.OrderedDict()
+kernel_idx = 1
+metric_name_idx = 3
+metric_avg_value_idx = 7
+
+# Extract occupancy of each trace file
+for filename in filename_list:
+  # Obtain csv object
+  log_file = csv.reader(open(filename, "rb"))
+
+  # The file name has to be something like *_metrics.csv
+  simplified_filename = filename[0:-12]
+  if simplified_filename not in filename_kernel_dict:
+    filename_kernel_dict[simplified_filename] = []
+
+  # Number of useless lines
+  num_useless_lines = 6
+
+  for i in range(num_useless_lines):
+    next(log_file)
+
+  for row in log_file:
+    if len(row) < 8:
+      continue
+    if regex_kernel.match(row[kernel_idx]):
+      content = row[kernel_idx]
+      kernel_name = regex_kernel.match(content).group(1)
+      if kernel_name not in filename_kernel_dict[simplified_filename]:
+        filename_kernel_dict[simplified_filename].append(kernel_name)
+      if kernel_name not in metric_dict:
+        metric_dict[kernel_name] = collections.OrderedDict()
+    if regex_int.match(row[metric_avg_value_idx]):
+      content = row[metric_avg_value_idx]
+      value = int(regex_int.match(content).group(1))
+    elif regex_float.match(row[metric_avg_value_idx]):
+      content = row[metric_avg_value_idx]
+      value = float(regex_float.match(content).group(1))
+    metric_dict[kernel_name][row[metric_name_idx]] = value
+
+# General information
+regex_type = re.compile(r"(fwd|bwd)_(\w+)")
+benchmark_list = []
+fwd_benchmark_list = []
+bwd_benchmark_list = []
+for key in filename_kernel_dict:
+  if regex_type.match(key):
+    if regex_type.match(key).group(2) not in benchmark_list:
+      benchmark_list.append(regex_type.match(key).group(2))
+    if "fwd" == regex_type.match(key).group(1):
+      fwd_benchmark_list.append(key)
+    elif "bwd" == regex_type.match(key).group(1):
+      bwd_benchmark_list.append(key)
+
+benchmark_num = len(benchmark_list)
+n_groups = benchmark_num
+opacity = 0.8
+index = np.arange(n_groups)
+
+bar_width = 0.35
+
+# Collect data and generate plot for IPC
+metric = "ipc"
+prefix = metric
+fwd_ipc = []
+bwd_ipc = []
+for i in range(0, benchmark_num):
+  fwd_kernel = filename_kernel_dict[fwd_benchmark_list[i]]
+  print fwd_kernel
+  fwd_ipc.append(metric_dict[fwd_kernel[-1]][metric])
+  bwd_kernel = filename_kernel_dict[bwd_benchmark_list[i]]
+  print bwd_kernel
+  bwd_ipc.append(metric_dict[bwd_kernel[-1]][metric])
+
+plt.figure(1)
+fig, ax = plt.subplots()
+rects1 = plt.bar(index, fwd_ipc, bar_width,
+                 alpha=opacity,
+                 color='b',
+                 label='Forward propagation')
+
+rects2 = plt.bar(index + bar_width, bwd_ipc, bar_width,
+                 alpha=opacity,
+                 color='r',
+                 label='Backward propagation')
+plt.xlabel('Benchmarks')
+plt.ylabel('IPC')
+
+plt.grid()
+plt.xticks(index + bar_width, benchmark_list)
+plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
+           ncol=3, mode="expand", borderaxespad=0.)
+plt.tight_layout()
+plt.savefig(prefix + '.pdf', format='pdf', bbox_inches='tight')
+
+# Collect data and generate plot for occupancy
+metric = "achieved_occupancy"
+prefix = metric
+fwd_occupancy = []
+bwd_occupancy = []
+for i in range(0, benchmark_num):
+  fwd_kernel = filename_kernel_dict[fwd_benchmark_list[i]]
+  fwd_occupancy.append(metric_dict[fwd_kernel[-1]][metric])
+  bwd_kernel = filename_kernel_dict[bwd_benchmark_list[i]]
+  bwd_occupancy.append(metric_dict[bwd_kernel[-1]][metric])
+
+plt.figure(1)
+fig, ax = plt.subplots()
+rects1 = plt.bar(index, fwd_occupancy, bar_width,
+                 alpha=opacity,
+                 color='b',
+                 label='Forward propagation')
+
+rects2 = plt.bar(index + bar_width, bwd_occupancy, bar_width,
+                 alpha=opacity,
+                 color='r',
+                 label='Backward propagation')
+plt.xlabel('Benchmarks')
+plt.ylabel('Achieved Occupancy')
+plt.ylim((0, 1))
+
+plt.grid()
+plt.xticks(index + bar_width, benchmark_list)
+plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
+           ncol=3, mode="expand", borderaxespad=0.)
+plt.tight_layout()
+plt.savefig(prefix + '.pdf', format='pdf', bbox_inches='tight')
+
+# Collect data and generate plot for L1/shared utilization
+metric = "l1_shared_utilization"
+prefix = metric
+fwd_l1_shared_utilization = []
+bwd_l1_shared_utilization = []
+for i in range(0, benchmark_num):
+  fwd_kernel = filename_kernel_dict[fwd_benchmark_list[i]]
+  fwd_l1_shared_utilization.append(metric_dict[fwd_kernel[-1]][metric])
+  bwd_kernel = filename_kernel_dict[bwd_benchmark_list[i]]
+  bwd_l1_shared_utilization.append(metric_dict[bwd_kernel[-1]][metric])
+
+plt.figure(1)
+fig, ax = plt.subplots()
+rects1 = plt.bar(index, fwd_l1_shared_utilization, bar_width,
+                 alpha=opacity,
+                 color='b',
+                 label='Forward propagation')
+
+rects2 = plt.bar(index + bar_width, bwd_l1_shared_utilization, bar_width,
+                 alpha=opacity,
+                 color='r',
+                 label='Backward propagation')
+plt.xlabel('Benchmarks')
+plt.ylabel('L1/Shared Utilization')
+plt.ylim((0, 10))
+
+plt.grid()
+plt.xticks(index + bar_width, benchmark_list)
+plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
+           ncol=3, mode="expand", borderaxespad=0., fontsize='x-large')
+plt.tight_layout()
+plt.savefig(prefix + '.pdf', format='pdf', bbox_inches='tight')
+
+
+# Collect data and generate plot for L2 utilization
+metric = "l2_utilization"
+prefix = metric
+fwd_l2_utilization = []
+bwd_l2_utilization = []
+for i in range(0, benchmark_num):
+  fwd_kernel = filename_kernel_dict[fwd_benchmark_list[i]]
+  fwd_l2_utilization.append(metric_dict[fwd_kernel[-1]][metric])
+  bwd_kernel = filename_kernel_dict[bwd_benchmark_list[i]]
+  bwd_l2_utilization.append(metric_dict[bwd_kernel[-1]][metric])
+
+plt.figure(1)
+fig, ax = plt.subplots()
+
+rects1 = plt.bar(index, fwd_l2_utilization, bar_width,
+                 alpha=opacity,
+                 color='b',
+                 label='Forward propagation')
+
+rects2 = plt.bar(index + bar_width, bwd_l2_utilization, bar_width,
+                 alpha=opacity,
+                 color='r',
+                 label='Backward propagation')
+plt.xlabel('Benchmarks')
+plt.ylabel('L2 Utilization')
+plt.ylim((0, 10))
+
+plt.grid()
+plt.xticks(index + bar_width, benchmark_list)
+plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
+           ncol=3, mode="expand", borderaxespad=0., fontsize='x-large')
+plt.tight_layout()
+plt.savefig(prefix + '.pdf', format='pdf', bbox_inches='tight')
+
+# Collect data and generate plot for DRAM utilization
+metric = "dram_utilization"
+prefix = metric
+fwd_dram_utilization = []
+bwd_dram_utilization = []
+for i in range(0, benchmark_num):
+  fwd_kernel = filename_kernel_dict[fwd_benchmark_list[i]]
+  fwd_dram_utilization.append(metric_dict[fwd_kernel[-1]][metric])
+  bwd_kernel = filename_kernel_dict[bwd_benchmark_list[i]]
+  bwd_dram_utilization.append(metric_dict[bwd_kernel[-1]][metric])
+
+plt.figure(1)
+fig, ax = plt.subplots()
+
+rects1 = plt.bar(index, fwd_dram_utilization, bar_width,
+                 alpha=opacity,
+                 color='b',
+                 label='Forward propagation')
+
+rects2 = plt.bar(index + bar_width, bwd_dram_utilization, bar_width,
+                 alpha=opacity,
+                 color='r',
+                 label='Backward propagation')
+plt.xlabel('Benchmarks')
+plt.ylabel('DRAM Utilization')
+plt.ylim((0, 10))
+
+plt.grid()
+plt.xticks(index + bar_width, benchmark_list)
+plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
+           ncol=3, mode="expand", borderaxespad=0., fontsize='x-large')
+plt.tight_layout()
+plt.savefig(prefix + '.pdf', format='pdf', bbox_inches='tight')
+
+# Collect data and generate plot for ALU functional unit utilization
+metric = "alu_fu_utilization"
+prefix = metric
+fwd_alu_fu_utilization = []
+bwd_alu_fu_utilization = []
+for i in range(0, benchmark_num):
+  fwd_kernel = filename_kernel_dict[fwd_benchmark_list[i]]
+  fwd_alu_fu_utilization.append(metric_dict[fwd_kernel[-1]][metric])
+  bwd_kernel = filename_kernel_dict[bwd_benchmark_list[i]]
+  bwd_alu_fu_utilization.append(metric_dict[bwd_kernel[-1]][metric])
+
+plt.figure(1)
+fig, ax = plt.subplots()
+
+rects1 = plt.bar(index, fwd_alu_fu_utilization, bar_width,
+                 alpha=opacity,
+                 color='b',
+                 label='Forward propagation')
+
+rects2 = plt.bar(index + bar_width, bwd_alu_fu_utilization, bar_width,
+                 alpha=opacity,
+                 color='r',
+                 label='Backward propagation')
+plt.xlabel('Benchmarks')
+plt.ylabel('ALU Utilization')
+plt.ylim((0, 10))
+
+plt.grid()
+plt.xticks(index + bar_width, benchmark_list)
+plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
+           ncol=3, mode="expand", borderaxespad=0., fontsize='x-large')
+plt.tight_layout()
+plt.savefig(prefix + '.pdf', format='pdf', bbox_inches='tight')
+
+# Collect data and generate plot for load/store functional unit utilization
+metric = "ldst_fu_utilization"
+prefix = metric
+fwd_ldst_fu_utilization = []
+bwd_ldst_fu_utilization = []
+for i in range(0, benchmark_num):
+  fwd_kernel = filename_kernel_dict[fwd_benchmark_list[i]]
+  fwd_ldst_fu_utilization.append(metric_dict[fwd_kernel[-1]][metric])
+  bwd_kernel = filename_kernel_dict[bwd_benchmark_list[i]]
+  bwd_ldst_fu_utilization.append(metric_dict[bwd_kernel[-1]][metric])
+
+plt.figure(1)
+fig, ax = plt.subplots()
+
+rects1 = plt.bar(index, fwd_ldst_fu_utilization, bar_width,
+                 alpha=opacity,
+                 color='b',
+                 label='Forward propagation')
+
+rects2 = plt.bar(index + bar_width, bwd_ldst_fu_utilization, bar_width,
+                 alpha=opacity,
+                 color='r',
+                 label='Backward propagation')
+plt.xlabel('Benchmarks')
+plt.ylabel('Load/Store Unit Utilization')
+plt.ylim((0, 10))
+
+plt.grid()
+plt.xticks(index + bar_width, benchmark_list)
+plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
+           ncol=3, mode="expand", borderaxespad=0., fontsize='x-large')
+plt.tight_layout()
+plt.savefig(prefix + '.pdf', format='pdf', bbox_inches='tight')
+
+
+# Collect data and generate stall-reason pie charts for the forward and backward passes
+prefix = "stall_reason_"
+fwd_stall_reason_rate = collections.OrderedDict()
+bwd_stall_reason_rate = collections.OrderedDict()
+for i in range(0, benchmark_num):
+  benchmark_name = fwd_benchmark_list[i]
+  if benchmark_name not in fwd_stall_reason_rate:
+    fwd_stall_reason_rate[benchmark_name] = collections.OrderedDict()
+  fwd_kernel = filename_kernel_dict[benchmark_name]
+  fwd_stall_reason_rate[benchmark_name]["stall_inst_fetch"] = metric_dict[fwd_kernel[-1]]["stall_inst_fetch"]
+  fwd_stall_reason_rate[benchmark_name]["stall_exec_dependency"] = metric_dict[fwd_kernel[-1]]["stall_exec_dependency"]
+  fwd_stall_reason_rate[benchmark_name]["stall_texture"] = metric_dict[fwd_kernel[-1]]["stall_texture"]
+  fwd_stall_reason_rate[benchmark_name]["stall_sync"] = metric_dict[fwd_kernel[-1]]["stall_sync"]
+  fwd_stall_reason_rate[benchmark_name]["stall_other"] = metric_dict[fwd_kernel[-1]]["stall_other"]
+  fwd_stall_reason_rate[benchmark_name]["stall_memory_dependency"] = metric_dict[fwd_kernel[-1]]["stall_memory_dependency"]
+  fwd_stall_reason_rate[benchmark_name]["stall_pipe_busy"] = metric_dict[fwd_kernel[-1]]["stall_pipe_busy"]
+  fwd_stall_reason_rate[benchmark_name]["stall_constant_memory_dependency"] = metric_dict[fwd_kernel[-1]]["stall_constant_memory_dependency"]
+  fwd_stall_reason_rate[benchmark_name]["stall_memory_throttle"] = metric_dict[fwd_kernel[-1]]["stall_memory_throttle"]
+  fwd_stall_reason_rate[benchmark_name]["stall_not_selected"] = metric_dict[fwd_kernel[-1]]["stall_not_selected"]
+  benchmark_name = bwd_benchmark_list[i]
+  if benchmark_name not in bwd_stall_reason_rate:
+    bwd_stall_reason_rate[benchmark_name] = collections.OrderedDict()
+  bwd_kernel = filename_kernel_dict[benchmark_name]
+  bwd_stall_reason_rate[benchmark_name]["stall_inst_fetch"] = metric_dict[bwd_kernel[-1]]["stall_inst_fetch"]
+  bwd_stall_reason_rate[benchmark_name]["stall_exec_dependency"] = metric_dict[bwd_kernel[-1]]["stall_exec_dependency"]
+  bwd_stall_reason_rate[benchmark_name]["stall_texture"] = metric_dict[bwd_kernel[-1]]["stall_texture"]
+  bwd_stall_reason_rate[benchmark_name]["stall_sync"] = metric_dict[bwd_kernel[-1]]["stall_sync"]
+  bwd_stall_reason_rate[benchmark_name]["stall_other"] = metric_dict[bwd_kernel[-1]]["stall_other"]
+  bwd_stall_reason_rate[benchmark_name]["stall_memory_dependency"] = metric_dict[bwd_kernel[-1]]["stall_memory_dependency"]
+  bwd_stall_reason_rate[benchmark_name]["stall_pipe_busy"] = metric_dict[bwd_kernel[-1]]["stall_pipe_busy"]
+  bwd_stall_reason_rate[benchmark_name]["stall_constant_memory_dependency"] = metric_dict[bwd_kernel[-1]]["stall_constant_memory_dependency"]
+  bwd_stall_reason_rate[benchmark_name]["stall_memory_throttle"] = metric_dict[bwd_kernel[-1]]["stall_memory_throttle"]
+  bwd_stall_reason_rate[benchmark_name]["stall_not_selected"] = metric_dict[bwd_kernel[-1]]["stall_not_selected"]
+
+color_map = ['r', 'g', 'b', 'y', 'grey', 'gold', 'purple', 'brown', 'orange', 'ivory']
+for benchmark in fwd_stall_reason_rate:
+  rates = []
+  labels = []
+  colors = []
+  count = 0
+  for reason in fwd_stall_reason_rate[benchmark]:
+    if fwd_stall_reason_rate[benchmark][reason] > 0.00000:
+      rates.append(fwd_stall_reason_rate[benchmark][reason])
+      #labels.append(reason+"("+str(fwd_stall_reason_rate[benchmark][reason])+")")
+      labels.append(reason)
+      colors.append(color_map[count])
+    count += 1
+  plt.figure(2)
+  fig, ax = plt.subplots()
+  ax.pie(rates, labels=labels, colors=colors, autopct='%1.1f%%', pctdistance=1.1, labeldistance=1.2, startangle=90)
+  #ax.pie(rates, labels=labels, autopct='%1.1f%%', startangle=90)
+  plt.axis('equal')
+  plt.tight_layout()
+  plt.savefig(prefix + benchmark + '.pdf', format='pdf', bbox_inches='tight')
+
+for benchmark in bwd_stall_reason_rate:
+  rates = []
+  labels = []
+  colors = []
+  count = 0
+  for reason in bwd_stall_reason_rate[benchmark]:
+    if bwd_stall_reason_rate[benchmark][reason] > 0.00000:
+      rates.append(bwd_stall_reason_rate[benchmark][reason])
+      #labels.append(reason+"("+str(bwd_stall_reason_rate[benchmark][reason])+")")
+      labels.append(reason)
+      colors.append(color_map[count])
+    count += 1
+  plt.figure(2)
+  fig, ax = plt.subplots()
+  ax.pie(rates, labels=labels, colors=colors, autopct='%1.1f%%', pctdistance=1.1, labeldistance=1.2, startangle=90)
+  #ax.pie(rates, labels=labels, autopct='%1.1f%%', startangle=90)
+  plt.axis('equal')
+  plt.tight_layout()
+  plt.savefig(prefix + benchmark + '.pdf', format='pdf', bbox_inches='tight')
+
+# Collect data and generate plot for eligible warps per cycle
+metric = "eligible_warps_per_cycle"
+prefix = metric
+fwd_eligible_warps = []
+bwd_eligible_warps = []
+for i in range(0, benchmark_num):
+  fwd_kernel = filename_kernel_dict[fwd_benchmark_list[i]]
+  fwd_eligible_warps.append(metric_dict[fwd_kernel[-1]][metric])
+  bwd_kernel = filename_kernel_dict[bwd_benchmark_list[i]]
+  bwd_eligible_warps.append(metric_dict[bwd_kernel[-1]][metric])
+
+plt.figure(1)
+fig, ax = plt.subplots()
+
+rects1 = plt.bar(index, fwd_eligible_warps, bar_width,
+                 alpha=opacity,
+                 color='b',
+                 label='Forward propagation')
+
+rects2 = plt.bar(index + bar_width, bwd_eligible_warps, bar_width,
+                 alpha=opacity,
+                 color='r',
+                 label='Backward propagation')
+plt.xlabel('Benchmarks')
+plt.ylabel('Eligible Warps Per Cycle')
+plt.ylim((0, 10))
+
+plt.grid()
+plt.xticks(index + bar_width, benchmark_list)
+plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
+           ncol=3, mode="expand", borderaxespad=0.)
+plt.tight_layout()
+plt.savefig(prefix + '.pdf', format='pdf', bbox_inches='tight')
+
+exit()
+
+
diff --git a/src/DNNMark/tools/sanity_test.sh b/src/DNNMark/tools/sanity_test.sh
new file mode 100755
index 0000000..7c81d11
--- /dev/null
+++ b/src/DNNMark/tools/sanity_test.sh
@@ -0,0 +1,30 @@
+#! /bin/bash
+
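+# Runs each built DNNMark benchmark with its matching example configuration as
+# a quick sanity check; assumes the benchmarks have been built under ../build.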
+# Setup directories
+WORK_DIR="$(pwd)"/..
+CONFIG_DIR=${WORK_DIR}/config_example/
+BENCHMARK_DIR=${WORK_DIR}/build/benchmarks/
+
+BENCHMARK_LIST="$(ls ${BENCHMARK_DIR} | grep "test")"
+CONFIG_LIST="$(ls ${CONFIG_DIR})"
+PROFILER=nvprof
+
+for bm in ${BENCHMARK_LIST[@]}
+do
+  EXE="$(find ${BENCHMARK_DIR}${bm} -executable -type f)"
+  echo $bm
+  trimmed_bm="$(echo $bm | cut -d "_" -f2)"
+  if [ ${trimmed_bm} == "fwd" ] || [ ${trimmed_bm} == "bwd" ]; then
+    trimmed_bm="$(echo $bm | cut -d "_" -f3)"
+  fi
+  for config in ${CONFIG_LIST[@]}
+  do
+    if [[ $config == *"$trimmed_bm"* ]]; then
+      echo "Configure file: " $config
+      echo "${EXE} -config ${config} -debuginfo 1 -warmup 1"
+      ${EXE} -config ${CONFIG_DIR}${config} -debuginfo 1 -warmup 1
+    fi
+  done
+done
+
+