Enable the Ruby tester for the GPU_VIPER protocol

Added an example script 'viper_ruby_test.py' to test VIPER,
and slightly changed the flags in the packet/port interface.

Change-Id: Ic5fc551e294687bc4838671dd36fac92673f958b
diff --git a/configs/example/viper_ruby_test.py b/configs/example/viper_ruby_test.py
new file mode 100644
index 0000000..2a69ffd
--- /dev/null
+++ b/configs/example/viper_ruby_test.py
@@ -0,0 +1,359 @@
+#
+# Copyright (c) 2018 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from this
+# software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Authors: Tuan Ta, Xianwei Zhang
+#
+
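+# Example invocation (the build target and paths below are illustrative;
+# adjust them to your own GPU_VIPER build):
+#   build/GCN3_X86/gem5.opt configs/example/viper_ruby_test.py \
+#       --system-size=1 --cache-size=0 --test-length=10
+#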
+import m5
+from m5.objects import *
+from m5.defines import buildEnv
+from m5.util import addToPath
+import os, optparse, sys
+
+addToPath('../')
+
+from common import Options
+from ruby import Ruby
+
+# Get paths we might need.  It's expected this file is in m5/configs/example.
+config_path = os.path.dirname(os.path.abspath(__file__))
+config_root = os.path.dirname(config_path)
+m5_root = os.path.dirname(config_root)
+
+parser = optparse.OptionParser()
+Options.addNoISAOptions(parser)
+
+# GPU Ruby tester options
+parser.add_option("--cache-size", type="int", default=0,
+                  help="Cache sizes to use. Small encourages races between \
+                        requests and writebacks. Large stresses write-through \
+                        and/or write-back GPU caches. Range [0..1]")
+parser.add_option("--system-size", type="int", default=0,
+                  help="This option defines how many CUs, CPUs and cache \
+                        components in the test system. Range[0..1]")
+parser.add_option("--address-range", type="int", default=0,
+                  help="This option defines the number of atomic \
+                        locations that affects the working set's size. \
+                        A small number of atomic locations encourage more \
+                        races among threads. The large option stresses cache \
+                        resources. Range [0..1]")
+parser.add_option("--episode-length", type="int", default=0,
+                  help="This option defines the number of LDs and \
+                        STs in an episode. The small option encourages races \
+                        between the start and end of an episode. The long \
+                        option encourages races between LDs and STs in the \
+                        same episode. Range [0..2]")
+parser.add_option("--test-length", type="int", default=1,
+                  help="The number of episodes to be executed by each \
+                        wavefront. This determines the maximum number, i.e., \
+                        val X #WFs, of episodes to be executed in the test.")
+parser.add_option("--debug-tester", action='store_true',
+                  help="This option will turn on DRF checker")
+parser.add_option("--random-seed", type="int", default=0,
+                  help="Random seed number. Default value (i.e., 0) means \
+                        using a runtime-specific value")
+parser.add_option("--log-file", type="string", default="gpu-ruby-test.log")
+
+# GPU configurations
+parser.add_option("--wf-size", type="int", default=64, help="wavefront size")
+
+parser.add_option("-w", "--wavefronts-per-cu", type="int", default=1,
+                  help="Number of wavefronts per cu")
+
+parser.add_option("--cu-per-sqc", type="int", default=4,
+                  help="number of CUs sharing an SQC")
+
+parser.add_option("--cu-per-scalar-cache", type="int", default=4,
+                  help="number of CUs sharing an scalar cache")
+
+parser.add_option("--cu-per-sa", type="int", default=4,
+                  help="number of CUs per shader array \
+                        This must be a multiple of options.cu-per-sqc and \
+                        options.cu-per-scalar")
+#
+# Add the ruby specific and protocol specific options
+#
+Ruby.define_options(parser)
+
+execfile(os.path.join(config_root, "common", "Options.py"))
+
+(options, args) = parser.parse_args()
+
+#
+# Set the default cache size and associativity to be very small to encourage
+# races between requests and writebacks.
+#
+options.l1d_size="256B"
+options.l1i_size="256B"
+options.l2_size="512B"
+options.l3_size="1kB"
+options.l1d_assoc=2
+options.l1i_assoc=2
+options.l2_assoc=2
+options.l3_assoc=2
+
+#
+# Set up cache size - 2 options
+#   0: small cache
+#   1: large cache
+#
+if (options.cache_size == 0):
+    options.tcp_size="256B"
+    options.tcp_assoc=2
+    options.tcc_size="1kB"
+    options.tcc_assoc=2
+elif (options.cache_size == 1):
+    options.tcp_size="256kB"
+    options.tcp_assoc=16
+    options.tcc_size="1024kB"
+    options.tcc_assoc=16
+else:
+     print "Error: option cache_size '%s' not recognized" % options.cache_size
+     sys.exit(1)
+
+#
+# Set up system size - 2 options
+#   0: small system (1 CU, 1 CPU)
+#   1: large system (32 CUs, 4 CPUs)
+#
+if (options.system_size == 0):
+    # 1 CU, 1 CPU, 1 SQC, 1 Scalar
+    options.wf_size = 1
+    options.wavefronts_per_cu = 1
+    options.num_cpus = 1
+    options.cu_per_sqc = 1
+    options.cu_per_scalar_cache = 1
+    options.num_compute_units = 1
+elif (options.system_size == 1):
+    # 32 CUs, 4 CPUs, 8 SQCs, 8 Scalars
+    options.wf_size = 16
+    options.wavefronts_per_cu = 8
+    options.num_cpus = 4
+    options.cu_per_sqc = 4
+    options.cu_per_scalar_cache = 4
+    options.num_compute_units = 32
+else:
+    print "Error: option system size '%s' not recognized" \
+                % options.system_size
+    sys.exit(1)
+
+#
+# set address range - 2 options
+#   level 0: small
+#   level 1: large
+# each location corresponds to a 4-byte piece of data
+#
+options.mem_size = '1024MB'
+num_atomic_locs = 10
+num_regular_locs_per_atomic_loc = 10000
+if (options.address_range == 1):
+    num_atomic_locs = 100
+    num_regular_locs_per_atomic_loc = 100000
+elif (options.address_range != 0):
+    print "Error: option address_range '%s' not recognized" \
+            % options.address_range
+    sys.exit(1)
+
+#
+# set episode length (# of actions per episode) - 3 options
+#   0: 10 actions
+#   1: 100 actions
+#   2: 500 actions
+#
+eps_length = 10
+if (options.episode_length == 1):
+    eps_length = 100
+elif (options.episode_length == 2):
+    eps_length = 500
+elif (options.episode_length != 0):
+    print "Error: option episode_length '%s' not recognized" \
+            % options.episode_length
+    sys.exit(1)
+
+# set Ruby's and the tester's deadlock thresholds
+# Ruby's deadlock detection is the primary check for deadlock;
+# the tester's deadlock threshold is a secondary check: if a bug in
+# RubyPort causes a packet not to return to the tester properly, the
+# tester will throw a deadlock exception.
+# we set cache_deadlock_threshold < tester_deadlock_threshold to detect
+# a deadlock caused by the Ruby protocol before one caused by the coalescer
+options.cache_deadlock_threshold = 100000000
+tester_deadlock_threshold = 1000000000
+
+# for now, we're testing only the GPU protocol, so we set num_cpus to 0
+options.num_cpus = 0
+# number of CPUs and CUs
+n_CPUs = options.num_cpus
+n_CUs = options.num_compute_units
+# set the test length, i.e., the number of episodes per wavefront;
+# the total number of episodes is then test_length x #WFs
+# (1x#WFs, 10x#WFs, 100x#WFs, ...)
+n_WFs = n_CUs * options.wavefronts_per_cu
+max_episodes = options.test_length * n_WFs
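+# e.g. (illustrative): with --system-size=1 (32 CUs x 8 WFs/CU = 256 WFs)
+# and --test-length=10, the tester executes at most 10 x 256 = 2560 episodes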
+# number of SQC and Scalar caches
+assert(n_CUs % options.cu_per_sqc == 0)
+n_SQCs = int(n_CUs/options.cu_per_sqc)
+options.num_sqc = n_SQCs
+assert(n_CUs % options.cu_per_scalar_cache == 0)
+n_Scalars = int(n_CUs/options.cu_per_scalar_cache)
+
+# for now, we only set CUs and SQCs
+# TODO: add scalars if necessary
+n_Scalars = 0
+options.num_scalar_cache = n_Scalars
+if n_Scalars == 0:
+    options.cu_per_scalar_cache = 0
+
+if args:
+     print "Error: script doesn't take any positional arguments"
+     sys.exit(1)
+
+#
+# Create GPU Ruby random tester
+#
+tester = ProtocolTester(cus_per_sqc = options.cu_per_sqc,
+                        cus_per_scalar = options.cu_per_scalar_cache,
+                        wavefronts_per_cu = options.wavefronts_per_cu,
+                        workitems_per_wavefront = options.wf_size,
+                        num_atomic_locations = num_atomic_locs,
+                        num_normal_locs_per_atomic = \
+                                          num_regular_locs_per_atomic_loc,
+                        max_num_episodes = max_episodes,
+                        episode_length = eps_length,
+                        debug_tester = options.debug_tester,
+                        random_seed = options.random_seed,
+                        log_file = options.log_file)
+
+#
+# Create the M5 system. Note that the memory object isn't actually
+# used by the tester, but is included to support
+# the M5 memory size == Ruby memory size checks
+#
+# The system doesn't have real CPUs or CUs.
+# It just has a tester that has physical ports to be connected to Ruby
+#
+system = System(cpu = tester,
+                mem_ranges = [AddrRange(options.mem_size)],
+                cache_line_size = options.cacheline_size,
+                mem_mode = 'timing')
+
+system.voltage_domain = VoltageDomain(voltage = options.sys_voltage)
+system.clk_domain = SrcClockDomain(clock = options.sys_clock,
+                                   voltage_domain = system.voltage_domain)
+
+options.num_cp = 0
+
+#
+# Create the Ruby system
+#
+Ruby.create_system(options, False, system)
+
+#
+# The tester is most effective when randomization is turned on and
+# artificial delay is randomly inserted on messages
+#
+system.ruby.randomization = True
+
+# assert that we got the right number of Ruby ports
+assert(len(system.ruby._cpu_ports) == n_CPUs + n_CUs + n_SQCs + n_Scalars)
+
+#
+# attach Ruby ports to the tester
+# in the order: cpu_sequencers,
+#               vector_coalescers,
+#               sqc_sequencers,
+#               scalar_sequencers
+#
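+# e.g. (illustrative): with 0 CPUs, 4 CUs, 1 SQC, and 0 Scalars, ports
+# [0..3] attach to cu_vector_ports and port 4 attaches to cu_sqc_ports
+#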
+print "Attaching ruby ports to the tester"
+i = 0
+for ruby_port in system.ruby._cpu_ports:
+    ruby_port.no_retry_on_stall = True
+    ruby_port.using_ruby_tester = False
+
+    if i < n_CPUs:
+        tester.cpu_ports = ruby_port.slave
+    elif i < (n_CPUs + n_CUs):
+        tester.cu_vector_ports = ruby_port.slave
+    elif i < (n_CPUs + n_CUs + n_SQCs):
+        tester.cu_sqc_ports = ruby_port.slave
+    else:
+        tester.cu_scalar_ports = ruby_port.slave
+
+    i += 1
+
+#
+# Create CPU threads
+#
+thread_clock = SrcClockDomain(clock = '1GHz',
+                              voltage_domain = system.voltage_domain)
+
+cpu_threads = []
+print "Creating %i CpuThreads" % (n_CPUs)
+for cpu_idx in range(n_CPUs):
+    cpu_threads.append(CpuThread(thread_id = cpu_idx,
+                                 num_lanes = 1,     # CPU thread is scalar
+                                 clk_domain = thread_clock,
+                                 deadlock_threshold = \
+                                        tester_deadlock_threshold))
+tester.cpu_threads = cpu_threads
+
+#
+# Create GPU wavefronts
+#
+wavefronts = []
+g_thread_idx = n_CPUs
+print "Creating %i WFs attached to %i CUs" % \
+                (n_CUs * tester.wavefronts_per_cu, n_CUs)
+for cu_idx in range(n_CUs):
+    for wf_idx in range(tester.wavefronts_per_cu):
+        wavefronts.append(GpuWavefront(thread_id = g_thread_idx,
+                                         cu_id = cu_idx,
+                                         num_lanes = options.wf_size,
+                                         clk_domain = thread_clock,
+                                         deadlock_threshold = \
+                                                tester_deadlock_threshold))
+        g_thread_idx += 1
+tester.wavefronts = wavefronts
+
+# -----------------------
+# run simulation
+# -----------------------
+
+root = Root( full_system = False, system = system )
+
+# Not much point in this being higher than the L1 latency
+m5.ticks.setGlobalFrequency('1ns')
+
+# instantiate configuration
+m5.instantiate()
+
+# simulate until program terminates
+exit_event = m5.simulate(options.abs_max_tick)
+
+print 'Exiting @ tick', m5.curTick(), 'because', exit_event.getCause()
diff --git a/configs/ruby/GPU_VIPER.py b/configs/ruby/GPU_VIPER.py
index e4ba180..7432a4b 100644
--- a/configs/ruby/GPU_VIPER.py
+++ b/configs/ruby/GPU_VIPER.py
@@ -311,24 +311,15 @@
         self.probeToL3 = probe_to_l3
         self.respToL3 = resp_to_l3
 
-class DirMem(RubyDirectoryMemory, CntrlBase):
-    def create(self, options, ruby_system, system):
-        self.version = self.versionCount()
-
-        phys_mem_size = AddrRange(options.mem_size).size()
-        mem_module_size = phys_mem_size / options.num_dirs
-        dir_size = MemorySize('0B')
-        dir_size.value = mem_module_size
-        self.size = dir_size
-
 class DirCntrl(Directory_Controller, CntrlBase):
-    def create(self, options, ruby_system, system):
+    def create(self, options, dir_ranges, ruby_system, system):
         self.version = self.versionCount()
 
         self.response_latency = 30
 
-        self.directory = DirMem()
-        self.directory.create(options, ruby_system, system)
+        self.addr_ranges = dir_ranges
+
+        self.directory = RubyDirectoryMemory()
 
         self.L3CacheMemory = L3Cache()
         self.L3CacheMemory.create(options, ruby_system, system)
@@ -389,6 +380,8 @@
                       help = "tcp assoc")
     parser.add_option("--noL1", action = "store_true", default = False,
                       help = "bypassL1")
+    parser.add_option("--buffers-size", type="int", default=128,
+                      help="Size of MessageBuffers at the controller")
 
 def create_system(options, full_system, system, dma_devices, ruby_system):
     if buildEnv['PROTOCOL'] != 'GPU_VIPER':
@@ -427,10 +420,35 @@
         mainCluster = Cluster(intBW=crossbar_bw)
     else:
         mainCluster = Cluster(intBW=8) # 16 GB/s
+
+    # See comment in configs/common/MemConfig.py for explanation of this value
+    xor_low_bit = 20
+
+    if options.numa_high_bit:
+        numa_bit = options.numa_high_bit
+        dir_bits = int(math.log(options.num_dirs, 2))
+        xor_high_bit = xor_low_bit + dir_bits - 1
+    else:
+        # if the numa_bit is not specified, set the directory bits as the
+        # lowest bits above the block offset bits, and the numa_bit as the
+        # highest of those directory bits
+        dir_bits = int(math.log(options.num_dirs, 2))
+        block_size_bits = int(math.log(options.cacheline_size, 2))
+        numa_bit = block_size_bits + dir_bits - 1
+        xor_high_bit = xor_low_bit + dir_bits - 1
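+
+    # e.g. (illustrative): in the default case with num_dirs = 4 and a 64B
+    # cache line, dir_bits = 2 and block_size_bits = 6, so numa_bit = 7;
+    # each directory then matches address bits [7:6] XORed with bits
+    # [21:20] (xor_low_bit = 20, xor_high_bit = 21)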
     for i in xrange(options.num_dirs):
+        dir_ranges = []
+        for r in system.mem_ranges:
+            addr_range = m5.objects.AddrRange(r.start, size = r.size(),
+                                              intlvHighBit = numa_bit,
+                                              intlvBits = dir_bits,
+                                              intlvMatch = i,
+                                              xorHighBit = xor_high_bit)
+            dir_ranges.append(addr_range)
 
         dir_cntrl = DirCntrl(noTCCdir = True, TCC_select_num_bits = TCC_bits)
-        dir_cntrl.create(options, ruby_system, system)
+        dir_cntrl.create(options, dir_ranges, ruby_system, system)
         dir_cntrl.number_of_TBEs = options.num_tbes
         dir_cntrl.useL3OnWT = options.use_L3_on_WT
         # the number_of_TBEs is inclusive of TBEs below
@@ -492,8 +510,9 @@
         cp_cntrl.responseToCore = MessageBuffer()
         cp_cntrl.responseToCore.slave = ruby_system.network.master
 
-        cp_cntrl.mandatoryQueue = MessageBuffer()
         cp_cntrl.triggerQueue = MessageBuffer(ordered = True)
+        cp_cntrl.mandatoryQueue = \
+            MessageBuffer(buffer_size=options.buffers_size)
 
         cpuCluster.add(cp_cntrl)
 
@@ -537,7 +556,8 @@
         tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
         tcp_cntrl.responseToTCP.slave = ruby_system.network.master
 
-        tcp_cntrl.mandatoryQueue = MessageBuffer()
+        tcp_cntrl.mandatoryQueue = \
+            MessageBuffer(buffer_size=options.buffers_size)
 
         gpuCluster.add(tcp_cntrl)
 
@@ -562,7 +582,8 @@
         sqc_cntrl.responseToSQC = MessageBuffer(ordered = True)
         sqc_cntrl.responseToSQC.slave = ruby_system.network.master
 
-        sqc_cntrl.mandatoryQueue = MessageBuffer()
+        sqc_cntrl.mandatoryQueue = \
+            MessageBuffer(buffer_size=options.buffers_size)
 
         # SQC also in GPU cluster
         gpuCluster.add(sqc_cntrl)
@@ -605,7 +626,8 @@
         tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
         tcp_cntrl.responseToTCP.slave = ruby_system.network.master
 
-        tcp_cntrl.mandatoryQueue = MessageBuffer()
+        tcp_cntrl.mandatoryQueue = \
+            MessageBuffer(buffer_size=options.buffers_size)
 
         gpuCluster.add(tcp_cntrl)
 
diff --git a/src/cpu/testers/gpu_ruby_test/AddressManager.cc b/src/cpu/testers/gpu_ruby_test/AddressManager.cc
new file mode 100644
index 0000000..6ef52fc
--- /dev/null
+++ b/src/cpu/testers/gpu_ruby_test/AddressManager.cc
@@ -0,0 +1,425 @@
+/*
+ * Copyright (c) 2017 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Tuan Ta
+ */
+
+#include "cpu/testers/gpu_ruby_test/AddressManager.hh"
+
+#include <algorithm>
+
+#include "base/intmath.hh"
+#include "base/random.hh"
+#include "base/trace.hh"
+
+const int AddressManager::INVALID_VALUE = -1;
+const int AddressManager::INVALID_LOCATION = -1;
+
+AddressManager::AddressManager(int n_atomic_locs, int n_normal_locs_per_atomic)
+      : numAtomicLocs(n_atomic_locs),
+        numLocsPerAtomic(n_normal_locs_per_atomic)
+{
+    assert(numAtomicLocs > 0 && numLocsPerAtomic > 0);
+    numNormalLocs = numAtomicLocs * numLocsPerAtomic;
+
+    // generate random address map
+    randAddressMap.resize(numAtomicLocs + numNormalLocs);
+    for (Location i = 0; i < numAtomicLocs + numNormalLocs; ++i) {
+        // all addresses are sizeof(Value) (i.e., 4-byte) aligned
+        randAddressMap[i] = (Addr)((i + 128) << floorLog2(sizeof(Value)));
+    }
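+    // e.g., with sizeof(Value) == 4 (floorLog2 == 2), location 0 maps to
+    // byte address (0 + 128) << 2 = 0x200 and location 1 to 0x204, before
+    // the shuffle below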
+
+    // randomly shuffle randAddressMap
+    std::random_shuffle(randAddressMap.begin(), randAddressMap.end());
+
+    // initialize atomic locations
+    // first and last normal location per atomic location
+    Location first, last;
+    for (Location atomic_loc = 0; atomic_loc < numAtomicLocs; ++atomic_loc) {
+        first = numAtomicLocs + numLocsPerAtomic * atomic_loc;
+        last = first + numLocsPerAtomic - 1;
+        atomicStructs.push_back(new AtomicStruct(atomic_loc, first, last));
+    }
+
+    // initialize log table
+    for (Location loc = 0; loc < numAtomicLocs + numNormalLocs; ++loc) {
+        logTable.push_back(new LastWriter());
+    }
+}
+
+AddressManager::~AddressManager()
+{
+    for (AtomicStruct* atomic_struct : atomicStructs)
+        delete atomic_struct;
+    for (LastWriter* lw : logTable)
+        delete lw;
+}
+
+Addr
+AddressManager::getAddress(Location loc)
+{
+    assert(loc < numAtomicLocs + numNormalLocs && loc >= 0);
+    return randAddressMap[loc];
+}
+
+AddressManager::Location
+AddressManager::getAtomicLoc()
+{
+    Location ret_atomic_loc = random() % numAtomicLocs;
+    atomicStructs[ret_atomic_loc]->startLocSelection();
+    return ret_atomic_loc;
+}
+
+AddressManager::Location
+AddressManager::getLoadLoc(Location atomic_loc)
+{
+    assert(atomic_loc >= 0 && atomic_loc < numAtomicLocs);
+    return atomicStructs[atomic_loc]->getLoadLoc();
+}
+
+AddressManager::Location
+AddressManager::getStoreLoc(Location atomic_loc)
+{
+    assert(atomic_loc >= 0 && atomic_loc < numAtomicLocs);
+    return atomicStructs[atomic_loc]->getStoreLoc();
+}
+
+void
+AddressManager::finishLocSelection(Location atomic_loc)
+{
+    assert(atomic_loc >= 0 && atomic_loc < numAtomicLocs);
+    atomicStructs[atomic_loc]->endLocSelection();
+}
+
+void
+AddressManager::releaseLocation(Location atomic_loc, Location loc)
+{
+    assert(atomic_loc >= 0 && atomic_loc < numAtomicLocs);
+    atomicStructs[atomic_loc]->releaseLoc(loc);
+}
+
+std::string
+AddressManager::printLastWriter(Location loc) const
+{
+    return logTable[loc]->print();
+}
+
+// ------------------- AtomicStruct --------------------------
+AddressManager::AtomicStruct::AtomicStruct(Location atomic_loc,
+                                           Location loc_begin,
+                                           Location loc_end)
+{
+    // the location range must have at least 1 location
+    assert(loc_begin <= loc_end);
+
+    atomicLoc = atomic_loc;
+    arraySize = loc_end - loc_begin + 1;
+    locationBase = loc_begin;
+
+    // allocate an array of arraySize locations
+    locArray = new Location[arraySize];
+
+    // initialize locArray & locProps
+    Location loc;
+    for (int offset = 0; offset < arraySize; ++offset) {
+        loc = locationBase + offset;
+        locArray[offset] = loc;
+        locProps.push_back(LocProperty(offset, 0));
+    }
+
+    // regions (1) and (3) are initially empty
+    firstMark = 0;
+    secondMark = arraySize;
+    // no requests have been made to this location so far
+    requestCount = 0;
+}
+
+AddressManager::AtomicStruct::~AtomicStruct()
+{
+    delete[] locArray;
+}
+
+void
+AddressManager::AtomicStruct::startLocSelection()
+{
+    assert(firstMark >= 0);
+    assert(firstMark <= secondMark);
+    assert(secondMark <= arraySize);
+    // make sure loadStoreMap has been cleared
+    assert(loadStoreMap.empty());
+
+    // this atomic location is picked for Atomic_ACQ
+    // and Atomic_REL in an episode
+    requestCount += 2;
+    // add two expected values to the expectedValues set
+    expectedValues.insert(requestCount - 1);
+    expectedValues.insert(requestCount - 2);
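+    // e.g., the first episode to pick this atomic location sets
+    // requestCount to 2 and expects the two atomic requests to return
+    // 0 and 1 (in some order)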
+}
+
+AddressManager::Location
+AddressManager::AtomicStruct::getLoadLoc()
+{
+    assert(firstMark >= 0);
+    assert(firstMark <= secondMark);
+    assert(secondMark <= arraySize);
+
+    if (firstMark == arraySize) {
+        // no location can be picked for a LD now; return an invalid location
+        return INVALID_LOCATION;
+    } else {
+        // we can pick any location in locArray[firstMark : arraySize-1]
+        int range_size = arraySize - firstMark;
+        Location ret_loc = locArray[firstMark + random() % range_size];
+
+        // update loadStoreMap
+        LdStMap::iterator it = loadStoreMap.find(ret_loc);
+
+        if (it == loadStoreMap.end()) {
+            // insert a new entry into the map, because it's not there yet,
+            // to mark that this location has been picked for a LD
+            loadStoreMap.insert(std::pair<Location, LdStBits>
+                                            (ret_loc, LdStBits(true,false)));
+        } else {
+            // otherwise, just update the LD bit
+            (it->second).first = true;
+        }
+
+        return ret_loc;
+    }
+}
+
+AddressManager::Location
+AddressManager::AtomicStruct::getStoreLoc()
+{
+    assert(firstMark >= 0);
+    assert(firstMark <= secondMark);
+    assert(secondMark <= arraySize);
+
+    if (firstMark == secondMark) {
+        // no location can be picked for a ST now, return an invalid location
+        return INVALID_LOCATION;
+    } else {
+        // we can pick any location in locArray[firstMark : secondMark-1]
+        int range_size = secondMark - firstMark;
+        Location ret_loc = locArray[firstMark + random() % range_size];
+
+        // update loadStoreMap
+        LdStMap::iterator it = loadStoreMap.find(ret_loc);
+
+        if (it == loadStoreMap.end()) {
+            // insert a new entry into the map, because it's not there yet,
+            // to mark that this location has been picked for a ST
+            loadStoreMap.insert(std::pair<Location, LdStBits>
+                                            (ret_loc, LdStBits(false,true)));
+        } else {
+            // otherwise, just update the ST bit
+            (it->second).second = true;
+        }
+
+        return ret_loc;
+    }
+}
+
+// for each entry in loadStoreMap,
+//  if <LD_bit, ST_bit> == <1,0>
+//    - if the location is in (2), then move it to (3)
+//    - if the location is in (3), no move
+//    - otherwise, throw an error
+//  if <LD_bit, ST_bit> == <0,1> or <1,1>
+//    - move it from (2) to (1)
+void
+AddressManager::AtomicStruct::endLocSelection()
+{
+    assert(firstMark >= 0);
+    assert(firstMark <= secondMark);
+    assert(secondMark <= arraySize);
+
+    for (auto& it : loadStoreMap) {
+        Location loc = it.first;
+        LdStBits p = it.second;
+
+        assert(loc >= locationBase && loc < locationBase + arraySize);
+        LocProperty& loc_prop = locProps[loc - locationBase];
+
+        if (p.first && !p.second) {
+            // this location has been picked for LD(s) but not ST
+            // it must be in either region (2) or (3)
+            assert(inSecondRegion(loc_prop.first) ||
+                   inThirdRegion(loc_prop.first));
+
+            if (inSecondRegion(loc_prop.first)) {
+                // there is no owner of this location yet
+                assert(loc_prop.second == 0);
+
+                // pick the last location in (2) to swap
+                Location swapped_loc = locArray[secondMark - 1];
+                LocProperty& swapped_loc_prop =
+                                         locProps[swapped_loc - locationBase];
+
+                // swap loc and swapped_loc
+                swap(loc_prop, swapped_loc_prop);
+
+                // then, expand (3)
+                secondMark--;
+            }
+
+            // increment the location's number of owners
+            loc_prop.second++;
+        } else if (p.second) {
+            // this location has been picked for ST(s) and/or LD(s)
+            // it must be in region (2)
+            assert(inSecondRegion(loc_prop.first) && loc_prop.second == 0);
+
+            // pick the first location in (2) to swap
+            Location swapped_loc = locArray[firstMark];
+            LocProperty& swapped_loc_prop =
+                                        locProps[swapped_loc - locationBase];
+
+            // swap loc and swapped_loc
+            swap(loc_prop, swapped_loc_prop);
+
+            // then, expand (1)
+            firstMark++;
+
+            // increment the location's number of owners
+            loc_prop.second++;
+        } else {
+            panic("Location in loadStoreMap but wasn't picked "
+                            "in any action\n");
+        }
+    }
+
+    // clear the loadStoreMap
+    loadStoreMap.clear();
+}
+
+void
+AddressManager::AtomicStruct::releaseLoc(Location loc)
+{
+    assert(loc >= locationBase && loc < locationBase + arraySize);
+
+    LocProperty& loc_prop = locProps[loc - locationBase];
+
+    if (inFirstRegion(loc_prop.first)) {
+        // this location must have exactly 1 owner
+        assert(loc_prop.second == 1);
+
+        // pick the last location in region 1 to swap
+        Location swapped_loc = locArray[firstMark - 1];
+        LocProperty& swapped_loc_prop = locProps[swapped_loc - locationBase];
+
+        // swap loc and swapped_loc
+        swap(loc_prop, swapped_loc_prop);
+
+        // then shrink (1)
+        firstMark--;
+
+        // reset the location's number of owners
+        loc_prop.second = 0;
+    } else if (inThirdRegion(loc_prop.first)) {
+        // this location must have at least 1 owner
+        assert(loc_prop.second >= 1);
+
+        if (loc_prop.second == 1) {
+            // pick the first location in region 3 to swap
+            Location swapped_loc = locArray[secondMark];
+            LocProperty& swapped_loc_prop =
+                                        locProps[swapped_loc - locationBase];
+
+            // swap loc and swapped_loc
+            swap(loc_prop, swapped_loc_prop);
+
+            // then shrink (3)
+            secondMark++;
+        }
+        // decrement the loc's number of owners
+        loc_prop.second--;
+    } else {
+        // someone else must have already reset this counter
+        assert(inSecondRegion(loc_prop.first) && loc_prop.second == 0);
+    }
+}
+
+bool
+AddressManager::AtomicStruct::isExpectedValue(Value val)
+{
+    ExpectedValueSet::iterator it = expectedValues.find(val);
+
+    if (it == expectedValues.end()) {
+        return false;
+    }
+
+    // erase this value because it's done
+    expectedValues.erase(it);
+
+    return true;
+}
+
+void
+AddressManager::AtomicStruct::swap(LocProperty& prop_1, LocProperty& prop_2)
+{
+    int new_idx_1 = prop_2.first;
+    int new_idx_2 = prop_1.first;
+
+    // swap the two locations in locArray
+    Location tmp = locArray[prop_1.first];
+    locArray[prop_1.first] = locArray[prop_2.first];
+    locArray[prop_2.first] = tmp;
+
+    // update their new indices
+    prop_1.first = new_idx_1;
+    prop_2.first = new_idx_2;
+}
+
+// ------------------ log table ---------------------
+void
+AddressManager::updateLogTable(Location loc, int thread_id, int episode_id,
+                               Value new_value, Tick cur_tick, int cu_id)
+{
+    assert(loc >= 0 && loc < numAtomicLocs + numNormalLocs);
+    logTable[loc]->update(thread_id, cu_id, episode_id, new_value, cur_tick);
+}
+
+AddressManager::Value
+AddressManager::getLoggedValue(Location loc) const
+{
+    assert(loc >= 0 && loc < numAtomicLocs + numNormalLocs);
+    return logTable[loc]->getLastStoredValue();
+}
+
+bool
+AddressManager::validateAtomicResp(Location loc, Value ret_val)
+{
+    assert(loc >= 0 && loc < numAtomicLocs);
+    return atomicStructs[loc]->isExpectedValue(ret_val);
+}
diff --git a/src/cpu/testers/gpu_ruby_test/AddressManager.hh b/src/cpu/testers/gpu_ruby_test/AddressManager.hh
new file mode 100644
index 0000000..b7ccbf3
--- /dev/null
+++ b/src/cpu/testers/gpu_ruby_test/AddressManager.hh
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2017 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Tuan Ta
+ */
+
+#ifndef CPU_TESTERS_PROTOCOL_TESTER_ADDRESSMANAGER_HH_
+#define CPU_TESTERS_PROTOCOL_TESTER_ADDRESSMANAGER_HH_
+
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "base/types.hh"
+#include "sim/eventq.hh"
+
+/*
+ * --- AddressManager has 3 main tasks ---
+ *    (1) generate DRF request sequences
+ *    (2) maintain internal log table
+ *    (3) validate return values against ones in the log table
+ *
+ * A location is an abstract index of a unique real address.
+ *    It's used internally within the tester only.
+ *    randAddressMap has the mapping between a location and its real address.
+ *
+ * A value is an integer that a location in real memory can store.
+ *    for now, we assume a value is 4 bytes
+ *
+ * The location range (randAddressMap) has two distinct parts:
+ *    atomic locations, in the 1st part of randAddressMap, and
+ *    non-atomic locations (or just locations), in the 2nd part
+ */
+
+/*
+ * --- DRF request sequence generation ---
+ *    Each lane of an episode starts selecting its location by calling:
+ *      (1) getAtomicLoc
+ *      (2) getLoadLoc/getStoreLoc
+ *      (3) finishLocSelection
+ *
+ *    Each lane of an episode completes its execution by calling:
+ *      releaseLocation for all locations it selected
+ */
+
+/*
+ * --- Internal structures ---
+ *  There are multiple atomic structures, each of which corresponds
+ *    to an atomic location.
+ *
+ *  Each atomic structure manages a distinct range of locations in locArray
+ *  This array is partitioned into 3 parts that are used to select locations
+ *  for LDs and STs. Here is the location selecting rule:
+ *                  |    (1)    |    (2)    |    (3)    |
+ *    - all locations in (1) cannot be picked for any LD or ST action
+ *    - all locations in (2) can be picked for either LD or ST action
+ *    - all locations in (3) can be picked for LD action only
+ *
+ *  We maintain the 3 parts with 2 indices, firstMark and secondMark.
+ *  As locations are moved between partitions, both indices are updated
+ *  accordingly.
+ *    [0 .. firstMark-1]             part (1)
+ *    [firstMark .. secondMark-1]    part (2)
+ *    [secondMark .. arraySize-1]    part (3)
+ *
+ *  Each location has its context/property. locProps maintains
+ *  contexts/properties of all locations. Context/property includes
+ *      - current index of a location in locArray
+ *      - the number of owners who are currently using the location
+ *
+ *  To guarantee DRF constraints, the following conditions must hold
+ *    - all locations in (1) have exactly 1 owner
+ *    - all locations in (2) have exactly 0 owners
+ *    - all locations in (3) have at least 1 owner
+ *    - A LD request can randomly pick any location in (2) & (3)
+ *    - A ST request can randomly pick any location in (2)
+ *
+ *  loadStoreMap maintains all locations already selected for LDs/STs so far
+ *
+ *  When endLocSelection is called (i.e., we've picked all locations for an
+ *  episode), we need to move each selected location to its right partition.
+ *    if LD_bit == 1 && ST_bit == 0 (i.e., picked for LDs), then move the
+ *          location to (3) -> future LDs can pick it.
+ *    if LD_bit == 0 && ST_bit == 1, then move the location to (1) -> NO future
+ *          action can pick it until this episode is done.
+ *    if LD_bit == 1 && ST_bit == 1, then move the location to (1) -> NO future
+ *          action can pick it until this episode is done.
+ *    clear the loadStoreMap
+ */
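+
+/*
+ * A small worked example (illustrative only): suppose one atomic structure
+ * manages 4 locations and currently firstMark = 1, secondMark = 3:
+ *
+ *      locArray:  | L2 | L0 | L3 | L1 |
+ *      partition:  (1)  (2)  (2)  (3)
+ *
+ *    - L2 was picked for a ST by an in-flight episode (1 owner);
+ *      nothing else may pick it until that episode releases it
+ *    - L0 and L3 are free (0 owners); a LD or ST may pick either
+ *    - L1 was picked for LDs only (>= 1 owner); further LDs may still
+ *      pick it, but STs may not
+ */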
+
+class AddressManager
+{
+  public:
+    AddressManager(int n_atomic_locs, int n_normal_locs_per_atomic);
+    ~AddressManager();
+
+    typedef int32_t Value;
+    typedef int32_t Location;
+
+    // return the unique address mapped to a location
+    Addr getAddress(Location loc);
+    // return a unique atomic location & start picking locations
+    Location getAtomicLoc();
+    // return a random location for LD
+    Location getLoadLoc(Location atomic_loc);
+    // return a random location for ST
+    Location getStoreLoc(Location atomic_loc);
+    // finish picking locations
+    void finishLocSelection(Location atomic_loc);
+    // an episode is done; release the locations it has picked
+    void releaseLocation(Location atomic_loc, Location loc);
+    // update a log table entry with a given set of values
+    void updateLogTable(Location loc, int thread_id, int episode_id,
+                        Value new_value, Tick cur_tick, int cu_id = -1);
+    // return the current value in the log table
+    Value getLoggedValue(Location loc) const;
+    // validate atomic response
+    bool validateAtomicResp(Location loc, Value ret_val);
+
+    std::string printLastWriter(Location loc) const;
+
+    static const int INVALID_VALUE;
+    static const int INVALID_LOCATION;
+
+  private:
+    class LastWriter
+    {
+      public:
+        LastWriter()
+            : threadId(-1), cuId(-1), episodeId(-1), value(0),
+              writeTick(0)
+        { }
+
+        const std::string print() const
+        {
+            return "(Thread ID " + std::to_string(threadId) +
+                   ", CU ID " + std::to_string(cuId) +
+                   ", Episode ID " + std::to_string(episodeId) +
+                   ", Value " + std::to_string(value) +
+                   ", Tick " + std::to_string(writeTick) +
+                   ")";
+        }
+
+        void update(int _thread, int _cu, int _episode, Value _value,
+                    Tick _tick)
+        {
+            threadId = _thread;
+            cuId = _cu;
+            episodeId = _episode;
+            value = _value;
+            writeTick = _tick;
+        }
+
+        Value getLastStoredValue() const { return value; }
+
+      private:
+        int threadId;
+        int cuId;
+        int episodeId;
+        Value value;
+        Tick writeTick;
+    };
+
+    class AtomicStruct
+    {
+      public:
+        AtomicStruct(Location atom_loc, Location loc_begin, Location loc_end);
+        ~AtomicStruct();
+
+        // functions picking locations for LD/ST/ATOMIC ops
+        void startLocSelection();
+        Location getLoadLoc();
+        Location getStoreLoc();
+        void endLocSelection();
+
+        // an episode completed its actions
+        // return locations to their correct positions
+        void releaseLoc(Location loc);
+        // is the value what we expect?
+        bool isExpectedValue(Value val);
+
+      private:
+        Location atomicLoc;
+        Location locationBase;
+
+        // array storing all locations this structure is managing
+        Location* locArray;
+        int firstMark, secondMark;
+        int arraySize;
+
+        // a vector of location's properties
+        typedef std::pair<int, int> LocProperty;
+        typedef std::vector<LocProperty> LocPropTable;
+        LocPropTable locProps;
+
+        // a temporary map of location and its LD/ST selection
+        typedef std::pair<bool, bool> LdStBits;
+        typedef std::unordered_map<Location, LdStBits> LdStMap;
+        LdStMap loadStoreMap;
+
+        // number of atomic requests at this location so far
+        int requestCount;
+        // a set of expected values
+        // when we request the first n atomic ops, we expect to receive n
+        // return values from [0 .. n-1]
+        typedef std::unordered_set<Value> ExpectedValueSet;
+        ExpectedValueSet expectedValues;
+
+        // swap two locations in locArray
+        void swap(LocProperty& prop_1, LocProperty& prop_2);
+
+        bool inFirstRegion(int idx) const
+        {
+            return (idx >= 0 && idx < firstMark);
+        }
+        bool inSecondRegion(int idx) const
+        {
+            return (idx >= firstMark && idx < secondMark);
+        }
+        bool inThirdRegion(int idx) const
+        {
+            return (idx >= secondMark && idx < arraySize);
+        }
+    };
+
+    // number of atomic locations
+    int numAtomicLocs;
+    // number of normal/non-atomic locations per atomic structure
+    int numLocsPerAtomic;
+    // total number of non-atomic locations
+    int numNormalLocs;
+
+    // location - address mapping
+    typedef std::vector<Addr> AddressMap;
+    AddressMap randAddressMap;
+
+    // a list of atomic structures
+    typedef std::vector<AtomicStruct*> AtomicStructTable;
+    AtomicStructTable atomicStructs;
+
+    // internal log table
+    typedef std::vector<LastWriter*> LogTable;
+    LogTable logTable;
+};
+
+#endif /* CPU_TESTERS_PROTOCOL_TESTER_ADDRESSMANAGER_HH_ */
diff --git a/src/cpu/testers/gpu_ruby_test/CpuThread.cc b/src/cpu/testers/gpu_ruby_test/CpuThread.cc
new file mode 100644
index 0000000..0e814cf
--- /dev/null
+++ b/src/cpu/testers/gpu_ruby_test/CpuThread.cc
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2017 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Tuan Ta
+ */
+
+#include "cpu/testers/gpu_ruby_test/CpuThread.hh"
+
+#include "debug/ProtocolTest.hh"
+
+CpuThread::CpuThread(const Params *p)
+    : Thread(p)
+{
+    threadName = "CpuThread(Thread ID " + std::to_string(threadId) + ")";
+    threadEvent.setDesc("CpuThread tick");
+    assert(numLanes == 1);
+}
+
+CpuThread::~CpuThread()
+{
+}
+
+CpuThread*
+CpuThreadParams::create()
+{
+    return new CpuThread(this);
+}
+
+void
+CpuThread::issueLoadOps()
+{
+    assert(curAction);
+    assert(curAction->getType() == Episode::Action::Type::LOAD);
+    // we should not have any outstanding fence or atomic op at this point
+    assert(pendingFenceCount == 0);
+    assert(pendingAtomicCount == 0);
+
+    fatal("CpuThread::issueLoadOps - not yet implemented");
+}
+
+void
+CpuThread::issueStoreOps()
+{
+    assert(curAction);
+    assert(curAction->getType() == Episode::Action::Type::STORE);
+    // we should not have any outstanding fence or atomic op at this point
+    assert(pendingFenceCount == 0);
+    assert(pendingAtomicCount == 0);
+
+    fatal("CpuThread::issueStoreOps - not yet implemented");
+}
+
+void
+CpuThread::issueAtomicOps()
+{
+    assert(curAction);
+    assert(curAction->getType() == Episode::Action::Type::ATOMIC);
+    // we should not have any outstanding ops at this point
+    assert(pendingFenceCount == 0);
+    assert(pendingLdStCount == 0);
+    assert(pendingAtomicCount == 0);
+
+    fatal("CpuThread::issueAtomicOps - not yet implemented");
+}
+
+void
+CpuThread::issueAcquireOp()
+{
+    DPRINTF(ProtocolTest, "Issuing Acquire Op ...\n");
+
+    assert(curAction);
+    assert(curAction->getType() == Episode::Action::Type::ACQUIRE);
+    // we should not have any outstanding ops at this point
+    assert(pendingFenceCount == 0);
+    assert(pendingLdStCount == 0);
+    assert(pendingAtomicCount == 0);
+
+    // no-op: Acquire does not apply to CPU threads
+}
+
+void
+CpuThread::issueReleaseOp()
+{
+    DPRINTF(ProtocolTest, "Issuing Release Op ...\n");
+
+    assert(curAction);
+    assert(curAction->getType() == Episode::Action::Type::RELEASE);
+    // we should not have any outstanding ops at this point
+    assert(pendingFenceCount == 0);
+    assert(pendingLdStCount == 0);
+    assert(pendingAtomicCount == 0);
+
+    // no-op: Release does not apply to CPU threads
+}
+
+void
+CpuThread::hitCallback(PacketPtr pkt)
+{
+    fatal("CpuThread::hitCallback - not yet implemented");
+}
diff --git a/src/cpu/testers/gpu_ruby_test/CpuThread.hh b/src/cpu/testers/gpu_ruby_test/CpuThread.hh
new file mode 100644
index 0000000..42441af
--- /dev/null
+++ b/src/cpu/testers/gpu_ruby_test/CpuThread.hh
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2017 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Tuan Ta
+ */
+
+#ifndef CPU_TESTERS_PROTOCOL_TESTER_CPUTHREAD_HH_
+#define CPU_TESTERS_PROTOCOL_TESTER_CPUTHREAD_HH_
+
+#include "cpu/testers/gpu_ruby_test/Thread.hh"
+#include "params/CpuThread.hh"
+#include "sim/clocked_object.hh"
+
+class CpuThread : public Thread
+{
+  public:
+    typedef CpuThreadParams Params;
+    CpuThread(const Params *p);
+    virtual ~CpuThread();
+
+    typedef AddressManager::Location Location;
+    typedef AddressManager::Value Value;
+
+    void hitCallback(PacketPtr pkt);
+
+  protected:
+    void issueLoadOps();
+    void issueStoreOps();
+    void issueAtomicOps();
+    void issueAcquireOp();
+    void issueReleaseOp();
+};
+
+#endif /* CPU_TESTERS_PROTOCOL_TESTER_CPUTHREAD_HH_ */
diff --git a/src/cpu/testers/gpu_ruby_test/CpuThread.py b/src/cpu/testers/gpu_ruby_test/CpuThread.py
new file mode 100644
index 0000000..a0f04e7
--- /dev/null
+++ b/src/cpu/testers/gpu_ruby_test/CpuThread.py
@@ -0,0 +1,43 @@
+#
+# Copyright (c) 2017 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from this
+# software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Authors: Tuan Ta
+#
+
+from m5.params import *
+from m5.proxy import *
+
+from Thread import Thread
+
+class CpuThread(Thread):
+    type = 'CpuThread'
+    cxx_header = "cpu/testers/gpu_ruby_test/CpuThread.hh"
diff --git a/src/cpu/testers/gpu_ruby_test/Episode.cc b/src/cpu/testers/gpu_ruby_test/Episode.cc
new file mode 100644
index 0000000..fba21f5
--- /dev/null
+++ b/src/cpu/testers/gpu_ruby_test/Episode.cc
@@ -0,0 +1,323 @@
+/*
+ * Copyright (c) 2017 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Tuan Ta
+ */
+
+#include "cpu/testers/gpu_ruby_test/Episode.hh"
+
+#include <fstream>
+#include <unordered_set>
+
+#include "cpu/testers/gpu_ruby_test/ProtocolTester.hh"
+#include "cpu/testers/gpu_ruby_test/Thread.hh"
+
+Episode::Episode(ProtocolTester* _tester, Thread* _thread, int num_loads,
+                 int num_stores)
+      : tester(_tester),
+        thread(_thread),
+        numLoads(num_loads),
+        numStores(num_stores),
+        nextActionIdx(0)
+{
+    assert(tester && thread);
+
+    episodeId = tester->getNextEpisodeID();
+    numLanes = thread->getNumLanes();
+    assert(numLanes > 0);
+
+    addrManager = tester->getAddressManager();
+    assert(addrManager);
+
+    atomicLocs.resize(numLanes, AddressManager::INVALID_LOCATION);
+    // generate a sequence of actions
+    initActions();
+    isActive = true;
+
+    DPRINTFN("Episode %d\n", episodeId);
+}
+
+Episode::~Episode()
+{
+    for (Episode::Action* action : actions) {
+        assert(action);
+        delete action;
+    }
+}
+
+const Episode::Action*
+Episode::peekCurAction() const
+{
+    if (nextActionIdx < actions.size())
+        return actions[nextActionIdx];
+    else
+        return nullptr;
+}
+
+void
+Episode::popAction()
+{
+    assert(nextActionIdx < actions.size());
+    nextActionIdx++;
+}
+
+void
+Episode::initActions()
+{
+    // first, push an Atomic and then an Acquire action
+    actions.push_back(new Action(Action::Type::ATOMIC, numLanes));
+    actions.push_back(new Action(Action::Type::ACQUIRE, numLanes));
+
+    // second, push a number of LD/ST actions
+    int num_loads = numLoads;
+    int num_stores = numStores;
+    while ((num_loads + num_stores) > 0) {
+        switch (random() % 2) {
+            case 0: // Load
+                if (num_loads > 0) {
+                    actions.push_back(new Action(Action::Type::LOAD,
+                                                   numLanes));
+                    num_loads--;
+                }
+                break;
+            case 1: // Store
+                if (num_stores > 0) {
+                    actions.push_back(new Action(Action::Type::STORE,
+                                                   numLanes));
+                    num_stores--;
+                }
+                break;
+            default:
+                assert(false);
+        }
+    }
+
+    // last, push a Release and then an Atomic action
+    actions.push_back(new Action(Action::Type::RELEASE, numLanes));
+    actions.push_back(new Action(Action::Type::ATOMIC, numLanes));
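+
+    // the resulting sequence is, e.g. (illustrative):
+    //   ATOMIC, ACQUIRE, <numLoads LDs and numStores STs randomly
+    //   interleaved>, RELEASE, ATOMIC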
+
+    // for each lane, pick a list of locations
+    Location normal_loc;
+
+    for (int lane = 0; lane < numLanes; ++lane) {
+        normal_loc = AddressManager::INVALID_LOCATION;
+
+        // first, we select atomic loc for this lane
+        // atomic loc for this lane should not have been picked yet
+        assert(atomicLocs[lane] == AddressManager::INVALID_LOCATION);
+        // randomly pick an atomic location
+        atomicLocs[lane] = addrManager->getAtomicLoc();
+        assert(atomicLocs[lane] >= 0);
+
+        // go through each action in this lane and set its location
+        for (Action* action : actions) {
+            assert(action);
+
+            switch (action->getType()) {
+                case Action::Type::ATOMIC:
+                    action->setLocation(lane, atomicLocs[lane]);
+                    break;
+                case Action::Type::LOAD:
+                    // randomly pick a normal location
+                    normal_loc = addrManager->getLoadLoc(atomicLocs[lane]);
+                    assert(normal_loc >= AddressManager::INVALID_LOCATION);
+
+                    if (normal_loc != AddressManager::INVALID_LOCATION) {
+                        // check DRF
+                        if (!tester->checkDRF(atomicLocs[lane],
+                                                normal_loc, false) ||
+                            !this->checkDRF(atomicLocs[lane], normal_loc,
+                                            false, lane)) {
+                            panic("Thread %d - Data race detected. STOPPED!\n",
+                                  thread->getThreadId());
+                        }
+                    }
+
+                    action->setLocation(lane, normal_loc);
+                    break;
+                case Action::Type::STORE:
+                    // randomly pick a normal location
+                    normal_loc = addrManager->getStoreLoc(atomicLocs[lane]);
+                    assert(normal_loc >= AddressManager::INVALID_LOCATION);
+
+                    if (normal_loc != AddressManager::INVALID_LOCATION) {
+                        // check DRF
+                        if (!tester->checkDRF(atomicLocs[lane],
+                                                normal_loc, true) ||
+                            !this->checkDRF(atomicLocs[lane], normal_loc,
+                                            true, lane)) {
+                            panic("Thread %d - Data race detected. STOPPED!\n",
+                                  thread->getThreadId());
+                        }
+                    }
+
+                    action->setLocation(lane, normal_loc);
+                    break;
+                case Action::Type::ACQUIRE:
+                case Action::Type::RELEASE:
+                    // no op
+                    break;
+                default:
+                    panic("Invalid action type\n");
+            }
+        }
+
+        addrManager->finishLocSelection(atomicLocs[lane]);
+    }
+}
+
+void
+Episode::completeEpisode()
+{
+    // release all locations this episode has picked and used
+    Location atomic_loc, normal_loc;
+    for (int lane = 0; lane < numLanes; ++lane) {
+        atomic_loc = AddressManager::INVALID_LOCATION;
+        normal_loc = AddressManager::INVALID_LOCATION;
+
+        std::unordered_set<Location> unique_loc_set;
+
+        for (Action* action : actions) {
+            assert(action);
+
+            if (action->isAtomicAction()) {
+                if (atomic_loc == AddressManager::INVALID_LOCATION) {
+                    atomic_loc = action->getLocation(lane);
+                } else {
+                    // both atomic ops in the same lane must be
+                    // at the same location
+                    assert(atomic_loc == action->getLocation(lane));
+                }
+            } else if (!action->isMemFenceAction()) {
+                assert(atomic_loc >= 0);
+                normal_loc = action->getLocation(lane);
+
+                if (normal_loc >= 0)
+                    unique_loc_set.insert(normal_loc);
+            }
+        }
+
+        // each unique loc can be released only once
+        for (Location loc : unique_loc_set)
+            addrManager->releaseLocation(atomic_loc, loc);
+    }
+
+    // this episode is no longer active
+    isActive = false;
+}
+
+bool
+Episode::checkDRF(Location atomic_loc, Location loc, bool isStore,
+                  int max_lane) const
+{
+    assert(atomic_loc != AddressManager::INVALID_LOCATION);
+    assert(loc != AddressManager::INVALID_LOCATION);
+    assert(max_lane <= numLanes);
+
+    for (int lane = 0; lane < max_lane; ++lane) {
+        if (atomic_loc == atomicLocs[lane]) {
+            for (const Action* action : actions) {
+                if (!action->isAtomicAction() &&
+                    !action->isMemFenceAction()) {
+                    if (isStore && loc == action->getLocation(lane)) {
+                        warn("ST at location %d races against thread %d\n",
+                             loc, thread->getThreadId());
+                        return false;
+                    } else if (!isStore &&
+                               action->getType() == Action::Type::STORE &&
+                               loc == action->getLocation(lane)) {
+                        warn("LD at location %d races against thread %d\n",
+                             loc, thread->getThreadId());
+                        return false;
+                    }
+                }
+            }
+        }
+    }
+
+    return true;
+}
+
+// -------------------- Action class ----------------------------
+Episode::Action::Action(Type t, int num_lanes)
+    : type(t),
+      numLanes(num_lanes)
+{
+    assert(numLanes > 0);
+    locations.resize(numLanes, AddressManager::INVALID_LOCATION);
+}
+
+void
+Episode::Action::setLocation(int lane, Location loc)
+{
+    assert(lane >= 0 && lane < numLanes);
+    locations[lane] = loc;
+}
+
+AddressManager::Location
+Episode::Action::getLocation(int lane) const
+{
+    assert(lane >= 0 && lane < numLanes);
+    return locations[lane];
+}
+
+bool
+Episode::Action::isAtomicAction() const
+{
+    return (type == Type::ATOMIC);
+}
+
+bool
+Episode::Action::isMemFenceAction() const
+{
+    return (type == Type::ACQUIRE || type == Type::RELEASE);
+}
+
+const std::string
+Episode::Action::printType() const
+{
+    if (type == Type::ACQUIRE)
+        return "ACQUIRE";
+    else if (type == Type::RELEASE)
+        return "RELEASE";
+    else if (type == Type::ATOMIC)
+        return "ATOMIC";
+    else if (type == Type::LOAD)
+        return "LOAD";
+    else if (type == Type::STORE)
+        return "STORE";
+    else
+        panic("Invalid action type\n");
+}
diff --git a/src/cpu/testers/gpu_ruby_test/Episode.hh b/src/cpu/testers/gpu_ruby_test/Episode.hh
new file mode 100644
index 0000000..8edd803
--- /dev/null
+++ b/src/cpu/testers/gpu_ruby_test/Episode.hh
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2017 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Tuan Ta
+ */
+
+#ifndef CPU_TESTERS_PROTOCOL_TESTER_EPISODE_HH_
+#define CPU_TESTERS_PROTOCOL_TESTER_EPISODE_HH_
+
+#include <vector>
+
+#include "cpu/testers/gpu_ruby_test/AddressManager.hh"
+
+class ProtocolTester;
+class Thread;
+
+class Episode
+{
+  public:
+    typedef AddressManager::Location Location;
+    typedef AddressManager::Value Value;
+
+    class Action {
+      public:
+        enum class Type {
+            ACQUIRE,
+            RELEASE,
+            ATOMIC,
+            LOAD,
+            STORE,
+        };
+
+        Action(Type t, int num_lanes);
+        ~Action() {}
+
+        Type getType() const { return type; }
+        void setLocation(int lane, Location loc);
+        Location getLocation(int lane) const;
+        bool isAtomicAction() const;
+        bool isMemFenceAction() const;
+        const std::string printType() const;
+
+      private:
+        Type type;
+        int numLanes;
+        typedef std::vector<Location> LocationList;
+        LocationList locations;
+    };
+
+    Episode(ProtocolTester* tester, Thread* thread, int num_loads,
+            int num_stores);
+    ~Episode();
+
+    // return episode id
+    int getEpisodeId() const { return episodeId; }
+    // return the action at the head of the action queue
+    const Action* peekCurAction() const;
+    // pop the action at the head of the action queue
+    void popAction();
+    // check if there are more actions to be issued in this episode
+    bool hasMoreActions() const { return nextActionIdx < actions.size(); }
+    // complete this episode: release all locations & update store effects
+    void completeEpisode();
+    // check if this episode is executing
+    bool isEpsActive() const { return isActive; }
+    // check if the input episode and this one have any data race
+    bool checkDRF(Location atomic_loc, Location loc, bool isStore,
+                  int max_lane) const;
+
+  private:
+    // pointers to tester, thread and address manager structures
+    ProtocolTester *tester;
+    Thread *thread;
+    AddressManager *addrManager;
+
+    // a unique episode id
+    int episodeId;
+    // list of actions in this episode
+    typedef std::vector<Action*> ActionList;
+    ActionList actions;
+    // list of atomic locations picked for this episode
+    typedef std::vector<Location> AtomicLocationList;
+    AtomicLocationList atomicLocs;
+
+    // is a thread running this episode?
+    bool isActive;
+    // episode length = num_loads + num_stores
+    int numLoads;
+    int numStores;
+    // index of the next action in actions
+    int nextActionIdx;
+    // number of lanes in this thread
+    int numLanes;
+
+    // randomly generate actions in this episode
+    void initActions();
+};
+
+#endif /* CPU_TESTERS_PROTOCOL_TESTER_EPISODE_HH_ */
diff --git a/src/cpu/testers/gpu_ruby_test/GpuWavefront.cc b/src/cpu/testers/gpu_ruby_test/GpuWavefront.cc
new file mode 100644
index 0000000..82b0d3b
--- /dev/null
+++ b/src/cpu/testers/gpu_ruby_test/GpuWavefront.cc
@@ -0,0 +1,417 @@
+/*
+ * Copyright (c) 2017 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Tuan Ta
+ */
+
+#include "cpu/testers/gpu_ruby_test/GpuWavefront.hh"
+
+#include "debug/ProtocolTest.hh"
+
+GpuWavefront::GpuWavefront(const Params *p)
+      : Thread(p)
+{
+    cuId = p->cu_id;
+    threadName = "GpuWavefront(Thread ID = " + std::to_string(threadId) +
+                 ", CU ID = " + std::to_string(cuId) + ")";
+    threadEvent.setDesc("GpuWavefront tick");
+}
+
+GpuWavefront::~GpuWavefront()
+{
+}
+
+GpuWavefront*
+GpuWavefrontParams::create()
+{
+    return new GpuWavefront(this);
+}
+
+void
+GpuWavefront::issueLoadOps()
+{
+    assert(curAction);
+    assert(curAction->getType() == Episode::Action::Type::LOAD);
+    // we should not have any outstanding fence or atomic op at this point
+    assert(pendingFenceCount == 0);
+    assert(pendingAtomicCount == 0);
+
+    for (int lane = 0; lane < numLanes; ++lane) {
+        Location location = curAction->getLocation(lane);
+        assert(location >= AddressManager::INVALID_LOCATION);
+
+        if (location >= 0) {
+            Addr address = addrManager->getAddress(location);
+            DPRINTF(ProtocolTest, "%s Episode %d: Issuing Load - Addr %s\n",
+                    this->getName(), curEpisode->getEpisodeId(),
+                    printAddress(address));
+
+            int load_size = sizeof(Value);
+
+            // for now, assert address is 4-byte aligned
+            assert(address % load_size == 0);
+
+            Request *req = new Request(0,                   // asid
+                                       address,             // virtual addr
+                                       load_size,           // size in bytes
+                                       0,                   // flags
+                                       tester->masterId(),  // master id
+                                       0,                   // pc
+                                       threadId,            // thread id
+                                       0);                  // no atomic op
+            req->setPaddr(address);
+            req->setReqInstSeqNum(tester->getActionSeqNum());
+            // set protocol-specific flags
+            setExtraRequestFlags(req);
+
+            PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
+            uint8_t* data = new uint8_t[load_size];
+            pkt->dataDynamic(data);
+            pkt->senderState = new ProtocolTester::SenderState(this);
+
+            if (!port->sendTimingReq(pkt)) {
+                panic("Not expected failed sendTimingReq\n");
+            }
+
+            // insert an outstanding load
+            addOutstandingReqs(outstandingLoads, address, lane, location);
+
+            // increment the number of outstanding ld_st requests
+            pendingLdStCount++;
+        }
+    }
+}
+
+void
+GpuWavefront::issueStoreOps()
+{
+    assert(curAction);
+    assert(curAction->getType() == Episode::Action::Type::STORE);
+    // we should not have any outstanding fence or atomic op at this point
+    assert(pendingFenceCount == 0);
+    assert(pendingAtomicCount == 0);
+
+    for (int lane = 0; lane < numLanes; ++lane) {
+        Location location = curAction->getLocation(lane);
+        assert(location >= AddressManager::INVALID_LOCATION);
+
+        if (location >= 0) {
+            // prepare the next value to store
+            Value new_value = addrManager->getLoggedValue(location) + 1;
+
+            Addr address = addrManager->getAddress(location);
+            // must be aligned with store size
+            assert(address % sizeof(Value) == 0);
+
+            DPRINTF(ProtocolTest, "%s Episode %d: Issuing Store - Addr %s - "
+                    "Value %d\n", this->getName(),
+                    curEpisode->getEpisodeId(), printAddress(address),
+                    new_value);
+
+            Request *req = new Request(0,                   // asid
+                                       address,             // virtual addr
+                                       sizeof(Value),       // size in bytes
+                                       0,                   // flags
+                                       tester->masterId(),  // master id
+                                       0,                   // pc
+                                       threadId,            // thread id
+                                       0);                  // no atomic op
+            req->setPaddr(address);
+            req->setReqInstSeqNum(tester->getActionSeqNum());
+            // set protocol-specific flags
+            setExtraRequestFlags(req);
+
+            PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
+            uint8_t *writeData = new uint8_t[sizeof(Value)];
+            for (int j = 0; j < sizeof(Value); ++j) {
+                writeData[j] = ((uint8_t*)&new_value)[j];
+            }
+            pkt->dataDynamic(writeData);
+            pkt->senderState = new ProtocolTester::SenderState(this);
+
+            if (!port->sendTimingReq(pkt)) {
+                panic("Not expecting a failed sendTimingReq\n");
+            }
+
+            // add an outstanding store
+            addOutstandingReqs(outstandingStores, address, lane, location,
+                               new_value);
+
+            // increment the number of outstanding ld_st requests
+            pendingLdStCount++;
+        }
+    }
+}
+
+void
+GpuWavefront::issueAtomicOps()
+{
+    assert(curAction);
+    assert(curAction->getType() == Episode::Action::Type::ATOMIC);
+    // we should not have any outstanding ops at this point
+    assert(pendingFenceCount == 0);
+    assert(pendingLdStCount == 0);
+    assert(pendingAtomicCount == 0);
+
+    // we use atomic_inc in the tester
+    Request::Flags flags = Request::ATOMIC_RETURN_OP;
+
+    for (int lane = 0; lane < numLanes; ++lane) {
+        Location location = curAction->getLocation(lane);
+        assert(location >= 0);
+
+        Addr address = addrManager->getAddress(location);
+
+        DPRINTF(ProtocolTest, "%s Episode %d: Issuing Atomic_Inc - Addr %s\n",
+                this->getName(), curEpisode->getEpisodeId(),
+                printAddress(address));
+
+        // must be aligned with store size
+        assert(address % sizeof(Value) == 0);
+        Request *req = new Request(0,                         // asid
+                                   address,                   // virtual addr
+                                   sizeof(Value),             // size in bytes
+                                   flags,                     // flags
+                                   tester->masterId(),        // master id
+                                   0,                         // pc
+                                   threadId,                  // thread id
+                                   new AtomicOpInc<Value>()); // atomic op
+        req->setPaddr(address);
+        req->setReqInstSeqNum(tester->getActionSeqNum());
+        // set protocol-specific flags
+        setExtraRequestFlags(req);
+
+        PacketPtr pkt = new Packet(req, MemCmd::SwapReq);
+        uint8_t* data = new uint8_t[sizeof(Value)];
+        pkt->dataDynamic(data);
+        pkt->senderState = new ProtocolTester::SenderState(this);
+
+        if (!port->sendTimingReq(pkt)) {
+            panic("Not expecting failed sendTimingReq\n");
+        }
+
+        // add an outstanding atomic
+        addOutstandingReqs(outstandingAtomics, address, lane, location);
+
+        // increment the number of outstanding atomic ops
+        pendingAtomicCount++;
+    }
+}
+
+void
+GpuWavefront::issueAcquireOp()
+{
+    DPRINTF(ProtocolTest, "%s Episode %d: Issuing Acquire\n", this->getName(),
+            curEpisode->getEpisodeId());
+
+    assert(curAction);
+    assert(curAction->getType() == Episode::Action::Type::ACQUIRE);
+    // we should not have any outstanding ops at this point
+    assert(pendingFenceCount == 0);
+    assert(pendingLdStCount == 0);
+    assert(pendingAtomicCount == 0);
+
+    Request *acq_req = new Request(0,                   // asid
+                                   0,                   // vaddr
+                                   0,                   // request size
+                                   0,                   // flags
+                                   tester->masterId(),  // master id
+                                   0,                   // pc
+                                   threadId,            // thread id
+                                   0);                  // no atomic op
+    acq_req->setPaddr(0);
+    acq_req->setReqInstSeqNum(tester->getActionSeqNum());
+    // set protocol-specific flags
+    setExtraRequestFlags(acq_req);
+
+    PacketPtr pkt = new Packet(acq_req, MemCmd::MemSyncReq);
+    pkt->senderState = new ProtocolTester::SenderState(this);
+
+    if (!port->sendTimingReq(pkt)) {
+        panic("Not expecting failed sendTimingReq\n");
+    }
+
+    // increment the number of outstanding fence requests
+    pendingFenceCount++;
+}
+
+void
+GpuWavefront::issueReleaseOp()
+{
+    DPRINTF(ProtocolTest, "%s Episode %d: Issuing Release\n", this->getName(),
+            curEpisode->getEpisodeId());
+
+    assert(curAction);
+    assert(curAction->getType() == Episode::Action::Type::RELEASE);
+    // we should not have any outstanding ops at this point
+    assert(pendingFenceCount == 0);
+    assert(pendingLdStCount == 0);
+    assert(pendingAtomicCount == 0);
+
+    Request *rel_req = new Request(0,                   // asid
+                                   0,                   // vaddr
+                                   0,                   // request size
+                                   0,                   // flags
+                                   tester->masterId(),  // master id
+                                   0,                   // pc
+                                   threadId,            // thread id
+                                   0);                  // no atomic op
+    rel_req->setPaddr(0);
+    rel_req->setReqInstSeqNum(tester->getActionSeqNum());
+    // set protocol-specific flags
+    setExtraRequestFlags(rel_req);
+
+    PacketPtr pkt = new Packet(rel_req, MemCmd::MemSyncReq);
+    pkt->senderState = new ProtocolTester::SenderState(this);
+
+    if (!port->sendTimingReq(pkt)) {
+        panic("Not expecting failed sendTimingReq\n");
+    }
+
+    // increment the number of outstanding fence requests
+    pendingFenceCount++;
+}
+
+void
+GpuWavefront::hitCallback(PacketPtr pkt)
+{
+    assert(pkt);
+    MemCmd resp_cmd = pkt->cmd;
+    Addr addr = pkt->getAddr();
+
+    DPRINTF(ProtocolTest, "%s Episode %d: hitCallback - Command %s - "
+                    "Addr %s\n", this->getName(),
+                    curEpisode->getEpisodeId(), resp_cmd.toString(),
+                    printAddress(addr));
+
+    // whether the transaction is done after this hitCallback
+    bool isTransactionDone = true;
+
+    if (resp_cmd == MemCmd::MemSyncResp) {
+        // response to a pending fence
+        // no validation needed for fence responses
+        assert(pendingFenceCount > 0);
+        assert(pendingLdStCount == 0);
+        assert(pendingAtomicCount == 0);
+        pendingFenceCount--;
+    } else if (resp_cmd == MemCmd::ReadResp) {
+        // response to a pending read
+        assert(pendingLdStCount > 0);
+        assert(pendingAtomicCount == 0);
+        assert(outstandingLoads.count(addr) > 0);
+
+        // get return data
+        Value value = *(pkt->getPtr<Value>());
+        OutstandingReq req = popOutstandingReq(outstandingLoads, addr);
+        validateLoadResp(req.origLoc, req.lane, value);
+
+        // this Read is done
+        pendingLdStCount--;
+    } else if (resp_cmd == MemCmd::WriteResp) {
+        // response to a pending write
+        assert(pendingLdStCount > 0);
+        assert(pendingAtomicCount == 0);
+
+        // no need to validate Write response
+        // just pop it from the outstanding req table so that subsequent
+        // requests dependent on this write can proceed
+        // note that we don't decrement pendingLdStCount here yet since
+        // the write is not yet completed in downstream memory. Instead, we
+        // decrement the counter when we receive the write completion ack
+        assert(outstandingStores.count(addr) > 0);
+        OutstandingReq req = popOutstandingReq(outstandingStores, addr);
+        assert(req.storedValue != AddressManager::INVALID_VALUE);
+
+        // update log table
+        addrManager->updateLogTable(req.origLoc, threadId,
+                                    curEpisode->getEpisodeId(),
+                                    req.storedValue,
+                                    curTick(),
+                                    cuId);
+
+        // the transaction is not done yet. Waiting for write completion ack
+        isTransactionDone = false;
+    } else if (resp_cmd == MemCmd::SwapResp) {
+        // response to a pending atomic
+        assert(pendingAtomicCount > 0);
+        assert(pendingLdStCount == 0);
+        assert(outstandingAtomics.count(addr) > 0);
+
+        // get return data
+        Value value = *(pkt->getPtr<Value>());
+
+        // validate atomic op return
+        OutstandingReq req = popOutstandingReq(outstandingAtomics, addr);
+        validateAtomicResp(req.origLoc, req.lane, value);
+
+        // update log table
+        addrManager->updateLogTable(req.origLoc, threadId,
+                                    curEpisode->getEpisodeId(), value,
+                                    curTick(),
+                                    cuId);
+
+        // this Atomic is done
+        pendingAtomicCount--;
+    } else if (resp_cmd == MemCmd::MessageResp) {
+        // write completion ACK
+        assert(pendingLdStCount > 0);
+        assert(pendingAtomicCount == 0);
+
+        // the Write is now done
+        pendingLdStCount--;
+    } else {
+        panic("Unsupported MemCmd response type");
+    }
+
+    if (isTransactionDone) {
+        // no need to keep senderState and request around
+        delete pkt->senderState;
+        delete pkt->req;
+    }
+
+    delete pkt;
+
+    // record the last active cycle to check for deadlock
+    lastActiveCycle = curCycle();
+
+    // we may be able to issue an action. Let's check
+    if (!threadEvent.scheduled()) {
+        scheduleWakeup();
+    }
+}
+
+void
+GpuWavefront::setExtraRequestFlags(Request* req)
+{
+    // No extra request flag is set
+}
diff --git a/src/cpu/testers/gpu_ruby_test/GpuWavefront.hh b/src/cpu/testers/gpu_ruby_test/GpuWavefront.hh
new file mode 100644
index 0000000..96c9f37
--- /dev/null
+++ b/src/cpu/testers/gpu_ruby_test/GpuWavefront.hh
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2017 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Tuan Ta
+ */
+
+#ifndef CPU_TESTERS_PROTOCOL_TESTER_GPUWAVEFRONT_HH_
+#define CPU_TESTERS_PROTOCOL_TESTER_GPUWAVEFRONT_HH_
+
+#include "cpu/testers/gpu_ruby_test/Thread.hh"
+#include "params/GpuWavefront.hh"
+#include "sim/clocked_object.hh"
+
+class GpuWavefront : public Thread
+{
+  public:
+    typedef GpuWavefrontParams Params;
+    GpuWavefront(const Params *p);
+    virtual ~GpuWavefront();
+
+    typedef AddressManager::Location Location;
+    typedef AddressManager::Value Value;
+
+    virtual void hitCallback(PacketPtr pkt);
+
+  protected:
+    void issueLoadOps();
+    void issueStoreOps();
+    void issueAtomicOps();
+    // acquire and release ops are protocol-specific, so their issue functions
+    // may be redefined by a child class of GpuWavefront
+    virtual void issueAcquireOp();
+    virtual void issueReleaseOp();
+    // set extra request flags that are specific to a target protocol
+    virtual void setExtraRequestFlags(Request* req);
+
+  protected:
+    int cuId;    // compute unit associated with this wavefront
+};
+
+#endif /* CPU_TESTERS_PROTOCOL_TESTER_GPUWAVEFRONT_HH_ */
diff --git a/src/cpu/testers/gpu_ruby_test/GpuWavefront.py b/src/cpu/testers/gpu_ruby_test/GpuWavefront.py
new file mode 100644
index 0000000..a55f7f0
--- /dev/null
+++ b/src/cpu/testers/gpu_ruby_test/GpuWavefront.py
@@ -0,0 +1,44 @@
+#
+# Copyright (c) 2017 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from this
+# software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Authors: Tuan Ta
+#
+
+from m5.params import *
+from m5.proxy import *
+
+from Thread import Thread
+
+class GpuWavefront(Thread):
+    type = 'GpuWavefront'
+    cxx_header = "cpu/testers/gpu_ruby_test/GpuWavefront.hh"
+    cu_id = Param.Int("Compute Unit ID")
diff --git a/src/cpu/testers/gpu_ruby_test/ProtocolTester.cc b/src/cpu/testers/gpu_ruby_test/ProtocolTester.cc
new file mode 100644
index 0000000..5184df7
--- /dev/null
+++ b/src/cpu/testers/gpu_ruby_test/ProtocolTester.cc
@@ -0,0 +1,318 @@
+/*
+ * Copyright (c) 2017 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Tuan Ta
+ */
+
+#include "cpu/testers/gpu_ruby_test/ProtocolTester.hh"
+
+#include <algorithm>
+#include <ctime>
+#include <fstream>
+#include <random>
+
+#include "cpu/testers/gpu_ruby_test/CpuThread.hh"
+#include "cpu/testers/gpu_ruby_test/GpuWavefront.hh"
+#include "cpu/testers/gpu_ruby_test/Thread.hh"
+#include "debug/ProtocolTest.hh"
+#include "mem/request.hh"
+#include "sim/sim_exit.hh"
+#include "sim/system.hh"
+
+ProtocolTester::ProtocolTester(const Params *p)
+      : MemObject(p),
+        _masterId(p->system->getMasterId(name())),
+        numCpuPorts(p->port_cpu_ports_connection_count),
+        numVectorPorts(p->port_cu_vector_ports_connection_count),
+        numSqcPorts(p->port_cu_sqc_ports_connection_count),
+        numScalarPorts(p->port_cu_scalar_ports_connection_count),
+        numCusPerSqc(p->cus_per_sqc),
+        numCusPerScalar(p->cus_per_scalar),
+        numWfsPerCu(p->wavefronts_per_cu),
+        numWisPerWf(p->workitems_per_wavefront),
+        numAtomicLocs(p->num_atomic_locations),
+        numNormalLocsPerAtomic(p->num_normal_locs_per_atomic),
+        episodeLength(p->episode_length),
+        maxNumEpisodes(p->max_num_episodes),
+        debugTester(p->debug_tester),
+        cpuThreads(p->cpu_threads),
+        wfs(p->wavefronts)
+{
+    int idx = 0;  // global port index
+
+    numCpus = numCpuPorts;     // 1 cpu port per CPU
+    numCus = numVectorPorts;   // 1 vector port per CU
+
+    // create all physical cpu's data ports
+    for (int i = 0; i < numCpuPorts; ++i) {
+        DPRINTF(ProtocolTest, "Creating %s\n",
+                csprintf("%s-cpuPort%d", name(), i));
+        cpuPorts.push_back(new SeqPort(csprintf("%s-cpuPort%d", name(), i),
+                                       this, i, idx));
+        idx++;
+    }
+
+    // create all physical gpu's data ports
+    for (int i = 0; i < numVectorPorts; ++i) {
+        DPRINTF(ProtocolTest, "Creating %s\n",
+                csprintf("%s-cuVectorPort%d", name(), i));
+        cuVectorPorts.push_back(new SeqPort(csprintf("%s-cuVectorPort%d",
+                                                     name(), i),
+                                            this, i, idx));
+        idx++;
+    }
+
+    for (int i = 0; i < numScalarPorts; ++i) {
+        DPRINTF(ProtocolTest, "Creating %s\n",
+                              csprintf("%s-cuScalarPort%d", name(), i));
+        cuScalarPorts.push_back(new SeqPort(csprintf("%s-cuScalarPort%d",
+                                                     name(), i),
+                                            this, i, idx));
+        idx++;
+    }
+
+    for (int i = 0; i < numSqcPorts; ++i) {
+        DPRINTF(ProtocolTest, "Creating %s\n",
+                              csprintf("%s-cuSqcPort%d", name(), i));
+        cuSqcPorts.push_back(new SeqPort(csprintf("%s-cuSqcPort%d",
+                                                  name(), i),
+                                         this, i, idx));
+        idx++;
+    }
+
+    // create an address manager
+    addrManager = new AddressManager(numAtomicLocs,
+                                     numNormalLocsPerAtomic);
+    nextEpisodeId = 0;
+
+    if (!debugTester)
+        warn("Data race check is not enabled\n");
+
+    sentExitSignal = false;
+
+    // set random seed number
+    if (p->random_seed != 0) {
+        srand(p->random_seed);
+    } else {
+        srand(time(NULL));
+    }
+
+    actionCount = 0;
+
+    // create a new log file
+    logFile = simout.create(p->log_file);
+    assert(logFile);
+
+    // print test configs
+    std::stringstream ss;
+    ss << "GPU Ruby test's configurations" << std::endl
+       << "\tNumber of CPUs: " << numCpus << std::endl
+       << "\tNumber of CUs: " << numCus << std::endl
+       << "\tNumber of wavefronts per CU: " << numWfsPerCu << std::endl
+       << "\tWavefront size: " << numWisPerWf << std::endl
+       << "\tNumber of atomic locations: " << numAtomicLocs << std::endl
+       << "\tNumber of non-atomic locations: "
+       << numNormalLocsPerAtomic * numAtomicLocs << std::endl
+       << "\tEpisode length: " << episodeLength << std::endl
+       << "\tTest length (max number of episodes): " << maxNumEpisodes
+       << std::endl
+       << "\tRandom seed: " << p->random_seed
+       << std::endl;
+
+    ccprintf(*(logFile->stream()), "%s", ss.str());
+    logFile->stream()->flush();
+}
+
+ProtocolTester::~ProtocolTester()
+{
+    for (int i = 0; i < cpuPorts.size(); ++i)
+        delete cpuPorts[i];
+    for (int i = 0; i < cuVectorPorts.size(); ++i)
+        delete cuVectorPorts[i];
+    for (int i = 0; i < cuScalarPorts.size(); ++i)
+        delete cuScalarPorts[i];
+    for (int i = 0; i < cuSqcPorts.size(); ++i)
+        delete cuSqcPorts[i];
+    delete addrManager;
+
+    // close the log file
+    simout.close(logFile);
+}
+
+void
+ProtocolTester::init()
+{
+    DPRINTF(ProtocolTest, "Attach threads to ports\n");
+
+    // connect cpu threads to cpu's ports
+    for (int cpu_id = 0; cpu_id < numCpus; ++cpu_id) {
+        cpuThreads[cpu_id]->attachThreadToPorts(this,
+                                      static_cast<SeqPort*>(cpuPorts[cpu_id]));
+        cpuThreads[cpu_id]->scheduleWakeup();
+        cpuThreads[cpu_id]->scheduleDeadlockCheckEvent();
+    }
+
+    // connect gpu wavefronts to gpu's ports
+    int wfId = 0;
+    int vectorPortId = 0;
+    int sqcPortId = 0;
+    int scalarPortId = 0;
+
+    for (int cu_id = 0; cu_id < numCus; ++cu_id) {
+        vectorPortId = cu_id;
+        sqcPortId = cu_id / numCusPerSqc;
+        // no scalar port if 'numCusPerScalar' is '0'
+        if (numCusPerScalar != 0)
+            scalarPortId = cu_id / numCusPerScalar;
+
+        for (int i = 0; i < numWfsPerCu; ++i) {
+            wfId = cu_id * numWfsPerCu + i;
+            wfs[wfId]->attachThreadToPorts(this,
+                           static_cast<SeqPort*>(cuVectorPorts[vectorPortId]),
+                           static_cast<SeqPort*>(cuSqcPorts[sqcPortId]),
+                           !numCusPerScalar ? nullptr :
+                           static_cast<SeqPort*>(cuScalarPorts[scalarPortId]));
+            wfs[wfId]->scheduleWakeup();
+            wfs[wfId]->scheduleDeadlockCheckEvent();
+        }
+    }
+}
+
+BaseMasterPort &
+ProtocolTester::getMasterPort(const std::string & if_name, PortID idx)
+{
+    if (if_name != "cpu_ports" && if_name != "cu_vector_ports" &&
+        if_name != "cu_sqc_ports" && if_name != "cu_scalar_ports") {
+        // pass along to super class
+        return MemObject::getMasterPort(if_name, idx);
+    } else {
+        if (if_name == "cpu_ports") {
+            if (idx >= numCpuPorts)
+                panic("ProtocolTester: unknown cpu port %d\n", idx);
+            return *cpuPorts[idx];
+        } else if (if_name == "cu_vector_ports") {
+            if (idx >= numVectorPorts)
+                panic("ProtocolTester: unknown cu vect port %d\n", idx);
+            return *cuVectorPorts[idx];
+        } else if (if_name == "cu_sqc_ports") {
+            if (idx >= numSqcPorts)
+                panic("ProtocolTester: unknown cu sqc port %d\n", idx);
+            return *cuSqcPorts[idx];
+        } else {
+            assert(if_name == "cu_scalar_ports");
+            if (idx >= numScalarPorts)
+                panic("ProtocolTester: unknown cu scal port %d\n", idx);
+            return *cuScalarPorts[idx];
+        }
+    }
+
+    assert(false);
+}
+
+bool
+ProtocolTester::checkExit()
+{
+    if (nextEpisodeId > maxNumEpisodes) {
+        if (!sentExitSignal) {
+            // all done
+            inform("Total completed episodes: %d\n", nextEpisodeId - 1);
+            inform("Protocol Test: Passed!\n");
+            exitSimLoop("ProtocolTester completed!");
+            sentExitSignal = true;
+        }
+        return true;
+    }
+    return false;
+}
+
+bool
+ProtocolTester::checkDRF(Location atomic_loc,
+                         Location loc, bool isStore) const
+{
+    if (debugTester) {
+        // go through all active episodes in all threads
+        for (const Thread* th : wfs) {
+            if (!th->checkDRF(atomic_loc, loc, isStore))
+                return false;
+        }
+
+        for (const Thread* th : cpuThreads) {
+            if (!th->checkDRF(atomic_loc, loc, isStore))
+                return false;
+        }
+    }
+
+    return true;
+}
+
+void
+ProtocolTester::dumpErrorLog(std::stringstream& ss)
+{
+    if (!sentExitSignal) {
+        // go through all threads and dump their outstanding requests
+        for (auto t : cpuThreads) {
+            t->printAllOutstandingReqs(ss);
+        }
+
+        for (auto t : wfs) {
+            t->printAllOutstandingReqs(ss);
+        }
+
+        // dump error log into a file
+        assert(logFile);
+        ccprintf(*(logFile->stream()), "%s", ss.str());
+        logFile->stream()->flush();
+
+        // exit the sim loop
+        exitSimLoop("GPU Ruby Tester: Failed!", -1);
+        sentExitSignal = true;
+    }
+}
+
+bool
+ProtocolTester::SeqPort::recvTimingResp(PacketPtr pkt)
+{
+    // get the requesting thread from the original sender state
+    ProtocolTester::SenderState* senderState =
+                    safe_cast<ProtocolTester::SenderState*>(pkt->senderState);
+    Thread *th = senderState->th;
+
+    th->hitCallback(pkt);
+
+    return true;
+}
+
+ProtocolTester*
+ProtocolTesterParams::create()
+{
+    return new ProtocolTester(this);
+}
diff --git a/src/cpu/testers/gpu_ruby_test/ProtocolTester.hh b/src/cpu/testers/gpu_ruby_test/ProtocolTester.hh
new file mode 100644
index 0000000..d6b356d
--- /dev/null
+++ b/src/cpu/testers/gpu_ruby_test/ProtocolTester.hh
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2017 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Tuan Ta
+ */
+
+#ifndef CPU_TESTERS_PROTOCOL_TESTER_PROTOCOLTESTER_HH_
+#define CPU_TESTERS_PROTOCOL_TESTER_PROTOCOLTESTER_HH_
+
+/*
+ * The tester includes the main ProtocolTester that manages all ports to the
+ * memory system.
+ * Threads are mapped to certain data port(s).
+ *
+ * Threads inject memory requests through their data ports.
+ * The tester receives and validates responses from the memory.
+ *
+ * Main components
+ *    - AddressManager: generates DRF request streams &
+ *                      validates data responses against an internal log table
+ *    - Episode: a sequence of requests
+ *    - Thread: either a GPU wavefront or a CPU thread
+ */
+
+#include <iostream>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "base/types.hh"
+#include "cpu/testers/gpu_ruby_test/AddressManager.hh"
+#include "mem/mem_object.hh"
+#include "mem/packet.hh"
+#include "mem/ruby/system/RubyPort.hh"
+#include "params/ProtocolTester.hh"
+
+class Thread;
+class CpuThread;
+class GpuWavefront;
+
+class ProtocolTester : public MemObject
+{
+  public:
+    class SeqPort : public MasterPort
+    {
+      public:
+        SeqPort(const std::string &_name, ProtocolTester *_tester, PortID _id,
+                PortID _index)
+            : MasterPort(_name, _tester, _id)
+        {}
+
+      protected:
+        virtual bool recvTimingResp(PacketPtr pkt);
+        virtual void recvReqRetry()
+            { panic("%s does not expect a retry\n", name()); }
+    };
+
+    struct SenderState : public Packet::SenderState
+    {
+        Thread* th;
+        SenderState(Thread* _th)
+        {
+            assert(_th);
+            th = _th;
+        }
+
+        ~SenderState()
+        {}
+    };
+
+  public:
+    typedef ProtocolTesterParams Params;
+    ProtocolTester(const Params *p);
+    ~ProtocolTester();
+
+    typedef AddressManager::Location Location;
+    typedef AddressManager::Value Value;
+
+    void init();
+    MasterID masterId() { return _masterId; }
+    virtual BaseMasterPort &getMasterPort(const std::string &if_name,
+                                          PortID idx = InvalidPortID);
+
+    int getEpisodeLength() const { return episodeLength; }
+    // return pointer to the address manager
+    AddressManager* getAddressManager() const { return addrManager; }
+    // return true if the tester should stop issuing new episodes
+    bool checkExit();
+    // verify that a location to be picked for a LD/ST satisfies the
+    // data-race-free requirement
+    bool checkDRF(Location atomic_loc, Location loc, bool isStore) const;
+    // return the next episode id and increment it
+    int getNextEpisodeID() { return nextEpisodeId++; }
+    // get action sequence number
+    int getActionSeqNum() { return actionCount++; }
+
+    // dump error log into a file and exit the simulation
+    void dumpErrorLog(std::stringstream& ss);
+
+  private:
+    MasterID _masterId;
+
+    // list of parameters taken from python scripts
+    int numCpuPorts;
+    int numVectorPorts;
+    int numSqcPorts;
+    int numScalarPorts;
+    int numCusPerSqc;
+    int numCusPerScalar;
+    int numWfsPerCu;
+    int numWisPerWf;
+    // parameters controlling the address range that the tester can access
+    int numAtomicLocs;
+    int numNormalLocsPerAtomic;
+    // the number of actions in an episode (episodeLength +- random number)
+    int episodeLength;
+    // the maximum number of episodes to be completed by this tester
+    int maxNumEpisodes;
+    // are we debugging the tester?
+    bool debugTester;
+
+    // all available master ports connected to Ruby
+    std::vector<MasterPort*> cpuPorts;      // cpu data ports
+    std::vector<MasterPort*> cuVectorPorts; // ports to GPU vector cache
+    std::vector<MasterPort*> cuSqcPorts;    // ports to GPU instruction cache
+    std::vector<MasterPort*> cuScalarPorts; // ports to GPU scalar cache
+    // all CPU and GPU threads
+    std::vector<CpuThread*> cpuThreads;
+    std::vector<GpuWavefront*> wfs;
+
+    // address manager that (1) generates DRF sequences of requests,
+    //                      (2) manages an internal log table and
+    //                      (3) validates response data
+    AddressManager* addrManager;
+
+    // number of CPUs and CUs
+    int numCpus;
+    int numCus;
+    // unique id of the next episode
+    int nextEpisodeId;
+
+    // global action count. Overflow is fine. It's used to uniquely identify
+    // per-wave & per-instruction memory requests in the coalescer
+    int actionCount;
+
+    // if an exit signal was already sent
+    bool sentExitSignal;
+
+    OutputStream* logFile;
+};
+
+#endif /* CPU_TESTERS_PROTOCOL_TESTER_PROTOCOLTESTER_HH_ */
diff --git a/src/cpu/testers/gpu_ruby_test/ProtocolTester.py b/src/cpu/testers/gpu_ruby_test/ProtocolTester.py
new file mode 100644
index 0000000..0ee6ca5
--- /dev/null
+++ b/src/cpu/testers/gpu_ruby_test/ProtocolTester.py
@@ -0,0 +1,68 @@
+#
+# Copyright (c) 2017 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from this
+# software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Authors: Tuan Ta
+#
+
+from MemObject import MemObject
+from m5.params import *
+from m5.proxy import *
+
+class ProtocolTester(MemObject):
+    type = 'ProtocolTester'
+    cxx_header = "cpu/testers/gpu_ruby_test/ProtocolTester.hh"
+
+    cpu_ports = VectorMasterPort("Ports for CPUs")
+    cu_vector_ports = VectorMasterPort("Vector ports for GPUs")
+    cu_sqc_ports = VectorMasterPort("SQC ports for GPUs")
+    cu_scalar_ports = VectorMasterPort("Scalar ports for GPUs")
+
+    cus_per_sqc = Param.Int(4, "Number of CUs per SQC")
+    cus_per_scalar = Param.Int(4, "Number of CUs per scalar cache")
+
+    wavefronts_per_cu = Param.Int(1, "Number of wavefronts per CU")
+    workitems_per_wavefront = Param.Int(64, "Number of workitems per wf")
+
+    cpu_threads = VectorParam.CpuThread("All cpus")
+    wavefronts = VectorParam.GpuWavefront("All wavefronts")
+
+    num_atomic_locations = Param.Int(2, "Number of atomic locations")
+    num_normal_locs_per_atomic = Param.Int(1000,
+                                "Number of normal locations per atomic")
+
+    episode_length = Param.Int(10, "Number of actions per episode")
+    max_num_episodes = Param.Int(20, "Maximum number of episodes")
+    debug_tester = Param.Bool(False, "Are we debugging the tester?")
+    random_seed = Param.Int(0, "Random seed number. Default value (0) means "
+                               "using a runtime-specific value.")
+    log_file = Param.String("Log file's name")
+    system = Param.System(Parent.any, "System we belong to")
diff --git a/src/cpu/testers/gpu_ruby_test/README b/src/cpu/testers/gpu_ruby_test/README
new file mode 100644
index 0000000..964adb9
--- /dev/null
+++ b/src/cpu/testers/gpu_ruby_test/README
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2017 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Tuan Ta
+ */
+
+This directory contains a tester for gem5 GPU protocols. Unlike the Ruby
+random tester, this tester does not rely on sequential consistency. Instead,
+it assumes the tested protocol supports release consistency.
+
+----- Theory Overview -----
+
+The GPU Ruby tester creates a system consisting of both CPU threads and GPU
+wavefronts. CPU threads are scalar, so there is one lane per CPU thread. A
+GPU wavefront may have multiple lanes. The number of lanes is set when a
+thread/wavefront is created.
+
+Each thread/wavefront executes a number of episodes. Each episode is a series
+of memory actions (i.e., atomic, load, store, acquire and release). In a
+wavefront, all lanes execute the same sequence of actions, but they may target
+different addresses. One can think of an episode as a critical section
+bounded by a lock acquire at the beginning and a lock release at the end. An
+episode consists of actions in the following order:
+
+1 - Atomic action
+2 - Acquire action
+3 - A number of load and store actions
+4 - Release action
+5 - Atomic action that targets the same address as (1) does
+
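+As an illustration only -- the type and function names below are invented for
+this README, not taken from the tester's actual API -- one episode's action
+list could be built like this:
+
+    #include <cstdlib>
+    #include <vector>
+
+    enum class ActionType { ATOMIC, ACQUIRE, LOAD, STORE, RELEASE };
+
+    // Hypothetical sketch of the episode skeleton described above.
+    std::vector<ActionType> makeEpisode(int num_loads, int num_stores)
+    {
+        std::vector<ActionType> actions;
+        actions.push_back(ActionType::ATOMIC);   // (1) opens the episode
+        actions.push_back(ActionType::ACQUIRE);  // (2)
+        while (num_loads > 0 || num_stores > 0) {         // (3)
+            bool pick_load = num_stores == 0 ||
+                             (num_loads > 0 && std::rand() % 2 == 0);
+            if (pick_load) {
+                actions.push_back(ActionType::LOAD);
+                --num_loads;
+            } else {
+                actions.push_back(ActionType::STORE);
+                --num_stores;
+            }
+        }
+        actions.push_back(ActionType::RELEASE);  // (4)
+        actions.push_back(ActionType::ATOMIC);   // (5) same address as (1)
+        return actions;
+    }
+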
+There are two separate sets of addresses: atomic and non-atomic. Atomic actions
+target only atomic addresses. Load and store actions target only non-atomic
+addresses. Memory addresses are all 4-byte aligned in the tester.
+
+To test false sharing cases in which both atomic and non-atomic addresses are
+placed in the same cache line, we abstract out the concept of memory addresses
+from the tester's perspective by introducing the concept of location. Locations
+are numbered from 0 to N-1 (if there are N addresses). The first X locations
+[0..X-1] are atomic locations, and the rest are non-atomic locations.
+The 1-1 mapping between locations and addresses is randomly created when the
+tester is initialized.
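+
+For example, such a random 1-1 mapping could be built as in the following
+sketch (all names here are invented for this README, not the
+AddressManager's actual code):
+
+    #include <algorithm>
+    #include <cstdint>
+    #include <random>
+    #include <vector>
+
+    using Addr = uint64_t;
+
+    // Hypothetical sketch: give locations [0..num_locs-1] distinct
+    // 4-byte-aligned addresses in random order, so atomic and non-atomic
+    // locations may land next to each other in the same cache line.
+    std::vector<Addr> makeLocationMap(int num_locs, Addr base)
+    {
+        std::vector<Addr> loc_to_addr(num_locs);
+        for (int loc = 0; loc < num_locs; ++loc)
+            loc_to_addr[loc] = base + 4 * loc;    // 4-byte aligned
+        std::mt19937 rng(std::random_device{}());
+        std::shuffle(loc_to_addr.begin(), loc_to_addr.end(), rng);
+        return loc_to_addr;
+    }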
+
+For each load and store action, the target location is selected so that there
+is no data race in the generated stream of memory requests at any time during
+the test. Since the memory system's behavior is undefined under data races in
+the Data-Race-Free model, we exclude data race scenarios from the protocol
+test.
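+
+The selection rule behaves like the sketch below (EpisodeFootprint and its
+members are invented helpers for this README, not the real Episode
+interface):
+
+    #include <algorithm>
+    #include <vector>
+
+    using Location = int;
+
+    // Invented stand-in for the set of locations an episode touches.
+    struct EpisodeFootprint {
+        std::vector<Location> loadLocs;
+        std::vector<Location> storeLocs;
+
+        bool writes(Location loc) const {
+            return std::find(storeLocs.begin(), storeLocs.end(), loc) !=
+                   storeLocs.end();
+        }
+        bool reads(Location loc) const {
+            return std::find(loadLocs.begin(), loadLocs.end(), loc) !=
+                   loadLocs.end();
+        }
+    };
+
+    // A new load/store may use a candidate location only if no concurrently
+    // active episode races with it: two accesses to the same location
+    // conflict when at least one of them is a store.
+    bool isRaceFree(Location loc, bool is_store,
+                    const std::vector<EpisodeFootprint>& active_episodes)
+    {
+        for (const EpisodeFootprint& ep : active_episodes) {
+            if (ep.writes(loc) || (is_store && ep.reads(loc)))
+                return false;
+        }
+        return true;
+    }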
+
+Once the location of a load/store action is determined, each thread/wavefront
+either loads the current value at the location or stores an incremented value
+to that location. The tester maintains a table tracking the last writer and
+the last written value per location, so it knows what value should be
+returned by a load and what value should be written next at a particular
+location. The value returned by a load must match the value written by the
+last writer.
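+
+In spirit, this bookkeeping behaves like the sketch below (the names are
+invented for this README):
+
+    #include <unordered_map>
+
+    using Location = int;
+    using Value = int;
+
+    // Hypothetical sketch of the per-location last-writer log.
+    struct LastWriterLog {
+        std::unordered_map<Location, Value> lastValue;
+
+        // A store logs the value it wrote; the next store to the same
+        // location would write lastValue[loc] + 1.
+        void logStore(Location loc, Value val) { lastValue[loc] = val; }
+
+        // A load must observe exactly what the last writer stored, or the
+        // initial value if the location has never been written.
+        bool validateLoad(Location loc, Value ret, Value init = 0) const {
+            auto it = lastValue.find(loc);
+            return ret == (it == lastValue.end() ? init : it->second);
+        }
+    };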
+
+----- Directory Structure -----
+
+ProtocolTester.hh/cc -- This is the main tester class that orchestrates the
+                        entire test.
+AddressManager.hh/cc -- This manages the address space, randomly maps
+                        addresses to locations, generates locations for all
+                        episodes, maintains the per-location last writer, and
+                        validates values returned from load actions.
+Thread.hh/cc         -- This is the abstract base class for CPU threads and
+                        GPU wavefronts. It generates and executes a series of
+                        episodes.
+CpuThread.hh/cc      -- Thread class for CPU threads. Not fully implemented
+                        yet.
+GpuWavefront.hh/cc   -- Thread class for GPU wavefronts.
+Episode.hh/cc        -- Class to encapsulate an episode, notably including
+                        episode load/store structure and ordering.
diff --git a/src/cpu/testers/gpu_ruby_test/SConscript b/src/cpu/testers/gpu_ruby_test/SConscript
new file mode 100644
index 0000000..6e7a815
--- /dev/null
+++ b/src/cpu/testers/gpu_ruby_test/SConscript
@@ -0,0 +1,53 @@
+#
+# Copyright (c) 2017 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from this
+# software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Authors: Tuan Ta
+#
+
+Import('*')
+
+if env['PROTOCOL'] == 'None':
+    Return()
+
+SimObject('ProtocolTester.py')
+SimObject('Thread.py')
+SimObject('CpuThread.py')
+SimObject('GpuWavefront.py')
+
+Source('AddressManager.cc')
+Source('Episode.cc')
+Source('ProtocolTester.cc')
+Source('Thread.cc')
+Source('CpuThread.cc')
+Source('GpuWavefront.cc')
+
+DebugFlag('ProtocolTest')
diff --git a/src/cpu/testers/gpu_ruby_test/Thread.cc b/src/cpu/testers/gpu_ruby_test/Thread.cc
new file mode 100644
index 0000000..b499c9b
--- /dev/null
+++ b/src/cpu/testers/gpu_ruby_test/Thread.cc
@@ -0,0 +1,442 @@
+/*
+ * Copyright (c) 2017 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Tuan Ta
+ */
+
+#include "cpu/testers/gpu_ruby_test/Thread.hh"
+
+#include <fstream>
+
+#include "debug/ProtocolTest.hh"
+
+Thread::Thread(const Params* p)
+      : ClockedObject(p),
+        threadEvent(this, "Thread tick"),
+        deadlockCheckEvent(this),
+        threadId(p->thread_id),
+        numLanes(p->num_lanes),
+        deadlockThreshold(p->deadlock_threshold)
+{
+    tester = nullptr;       // set by attachThreadToPorts()
+    addrManager = nullptr;  // set by attachThreadToPorts()
+    port = nullptr;         // set by attachThreadToPorts()
+    scalarPort = nullptr;   // set by attachThreadToPorts()
+    sqcPort = nullptr;      // set by attachThreadToPorts()
+
+    curEpisode = nullptr;
+    curAction = nullptr;
+
+    pendingLdStCount = 0;
+    pendingFenceCount = 0;
+    pendingAtomicCount = 0;
+
+    lastActiveCycle = Cycles(0);
+}
+
+Thread::~Thread()
+{
+    for (auto ep : episodeHistory) {
+        assert(ep != nullptr);
+        delete ep;
+    }
+}
+
+void
+Thread::wakeup()
+{
+    // this thread is woken up by one of the following events:
+    //      - hitCallback is called
+    //      - a new episode is created
+
+    // check if this is the first episode in this thread
+    if (curEpisode == nullptr) {
+        issueNewEpisode();
+        assert(curEpisode);
+    }
+
+    if (isNextActionReady()) {
+        // isNextActionReady should check if the action list is empty
+        assert(curAction != nullptr);
+
+        // issue the next action
+        issueNextAction();
+    } else {
+        // check for completion of the current episode
+        // completion = no outstanding requests + not having more actions
+        if (!curEpisode->hasMoreActions() &&
+            pendingLdStCount == 0 &&
+            pendingFenceCount == 0 &&
+            pendingAtomicCount == 0) {
+
+            curEpisode->completeEpisode();
+
+            // check if it's time to stop the tester
+            if (tester->checkExit()) {
+                // no more event is scheduled for this thread
+                return;
+            }
+
+            // issue the next episode
+            issueNewEpisode();
+            assert(curEpisode);
+
+            // now we get a new episode
+            // let's wake up the thread in the next cycle
+            if (!threadEvent.scheduled()) {
+                scheduleWakeup();
+            }
+        }
+    }
+}
+
+void
+Thread::scheduleWakeup()
+{
+    assert(!threadEvent.scheduled());
+    schedule(threadEvent, nextCycle());
+}
+
+void
+Thread::scheduleDeadlockCheckEvent()
+{
+    // after this first schedule, the deadlock event is scheduled by itself
+    assert(!deadlockCheckEvent.scheduled());
+    schedule(deadlockCheckEvent, nextCycle());
+}
+
+void
+Thread::attachThreadToPorts(ProtocolTester *_tester,
+                            ProtocolTester::SeqPort *_port,
+                            ProtocolTester::SeqPort *_scalarPort,
+                            ProtocolTester::SeqPort *_sqcPort)
+{
+    tester = _tester;
+    port = _port;
+    scalarPort = _scalarPort;
+    sqcPort = _sqcPort;
+
+    assert(tester && port);
+    addrManager = tester->getAddressManager();
+    assert(addrManager);
+}
+
+void
+Thread::issueNewEpisode()
+{
+    int num_reg_loads = random() % tester->getEpisodeLength();
+    int num_reg_stores = tester->getEpisodeLength() - num_reg_loads;
+
+    // create a new episode
+    curEpisode = new Episode(tester, this, num_reg_loads, num_reg_stores);
+    episodeHistory.push_back(curEpisode);
+}
+
+bool
+Thread::isNextActionReady()
+{
+    if (!curEpisode->hasMoreActions()) {
+        return false;
+    } else {
+        curAction = curEpisode->peekCurAction();
+
+        switch(curAction->getType()) {
+            case Episode::Action::Type::ATOMIC:
+                // an atomic action must wait for all previous requests
+                // to complete
+                if (pendingLdStCount == 0 &&
+                    pendingFenceCount == 0 &&
+                    pendingAtomicCount == 0) {
+                    return true;
+                }
+
+                return false;
+            case Episode::Action::Type::ACQUIRE:
+                // we should not see any outstanding ld_st or fence here
+                assert(pendingLdStCount == 0 &&
+                       pendingFenceCount == 0);
+
+                // an acquire action must wait for all previous atomic
+                // requests to complete
+                if (pendingAtomicCount == 0) {
+                    return true;
+                }
+
+                return false;
+            case Episode::Action::Type::RELEASE:
+                // we should not see any outstanding atomic or fence here
+                assert(pendingAtomicCount == 0 &&
+                       pendingFenceCount == 0);
+
+                // a release action must wait for all previous ld/st
+                // requests to complete
+                if (pendingLdStCount == 0) {
+                    return true;
+                }
+
+                return false;
+            case Episode::Action::Type::LOAD:
+            case Episode::Action::Type::STORE:
+                // we should not see any outstanding atomic here
+                assert(pendingAtomicCount == 0);
+
+                // can't issue if there is a pending fence
+                if (pendingFenceCount > 0) {
+                    return false;
+                }
+
+                // a Load or Store is ready if it doesn't overlap
+                // with any outstanding request
+                for (int lane = 0; lane < numLanes; ++lane) {
+                    Location loc = curAction->getLocation(lane);
+
+                    if (loc != AddressManager::INVALID_LOCATION) {
+                        Addr addr = addrManager->getAddress(loc);
+
+                        if (outstandingLoads.find(addr) !=
+                            outstandingLoads.end()) {
+                            return false;
+                        }
+
+                        if (outstandingStores.find(addr) !=
+                            outstandingStores.end()) {
+                            return false;
+                        }
+
+                        if (outstandingAtomics.find(addr) !=
+                            outstandingAtomics.end()) {
+                            // this is not an atomic action, so the address
+                            // should not be in outstandingAtomics list
+                            assert(false);
+                        }
+                    }
+                }
+
+                return true;
+            default:
+                panic("The tester got an invalid action\n");
+        }
+    }
+}
+
+void
+Thread::issueNextAction()
+{
+    switch(curAction->getType()) {
+        case Episode::Action::Type::ATOMIC:
+            issueAtomicOps();
+            break;
+        case Episode::Action::Type::ACQUIRE:
+            issueAcquireOp();
+            break;
+        case Episode::Action::Type::RELEASE:
+            issueReleaseOp();
+            break;
+        case Episode::Action::Type::LOAD:
+            issueLoadOps();
+            break;
+        case Episode::Action::Type::STORE:
+            issueStoreOps();
+            break;
+        default:
+            panic("The tester got an invalid action\n");
+    }
+
+    // the current action has been issued, pop it from the action list
+    curEpisode->popAction();
+    lastActiveCycle = curCycle();
+
+    // we may be able to schedule the next action
+    // just wake up this thread in the next cycle
+    if (!threadEvent.scheduled()) {
+        scheduleWakeup();
+    }
+}
+
+void
+Thread::addOutstandingReqs(OutstandingReqTable& req_table, Addr address,
+                           int lane, Location loc, Value stored_val)
+{
+    OutstandingReqTable::iterator it = req_table.find(address);
+    OutstandingReq req(lane, loc, stored_val, curCycle());
+
+    if (it == req_table.end()) {
+        // insert a new list of requests for this address
+        req_table.insert(std::pair<Addr, OutstandingReqList>(address,
+                                                OutstandingReqList(1, req)));
+    } else {
+        // add a new request
+        (it->second).push_back(req);
+    }
+}
+
+Thread::OutstandingReq
+Thread::popOutstandingReq(OutstandingReqTable& req_table, Addr addr)
+{
+    OutstandingReqTable::iterator it = req_table.find(addr);
+
+    // there must be exactly one list of requests for this address in the table
+    assert(it != req_table.end());
+
+    // get the request list
+    OutstandingReqList& req_list = it->second;
+    assert(!req_list.empty());
+
+    // save a request
+    OutstandingReq ret_req = req_list.back();
+
+    // remove the request from the list
+    req_list.pop_back();
+
+    // if the list is now empty, remove it from req_table
+    if (req_list.empty()) {
+        req_table.erase(it);
+    }
+
+    return ret_req;
+}
+
+void
+Thread::validateAtomicResp(Location loc, int lane, Value ret_val)
+{
+    if (!addrManager->validateAtomicResp(loc, ret_val)) {
+        std::stringstream ss;
+        Addr addr = addrManager->getAddress(loc);
+
+        // basic info
+        ss << threadName << ": Atomic Op returned unexpected value\n"
+           << "\tEpisode " << curEpisode->getEpisodeId() << "\n"
+           << "\tLane ID " << lane << "\n"
+           << "\tAddress " << printAddress(addr) << "\n"
+           << "\tAtomic Op's return value " << ret_val << "\n";
+
+        // print out basic info
+        warn("%s\n", ss.str());
+
+        // TODO add more detailed info
+
+        // dump all error info and exit the simulation
+        tester->dumpErrorLog(ss);
+    }
+}
+
+void
+Thread::validateLoadResp(Location loc, int lane, Value ret_val)
+{
+    if (ret_val != addrManager->getLoggedValue(loc)) {
+        std::stringstream ss;
+        Addr addr = addrManager->getAddress(loc);
+
+        // basic info
+        ss << threadName << ": Loaded value is not consistent with "
+           << "the last stored value\n"
+           << "\tThread " << threadId << "\n"
+           << "\tEpisode " << curEpisode->getEpisodeId() << "\n"
+           << "\tLane ID " << lane << "\n"
+           << "\tAddress " << printAddress(addr) << "\n"
+           << "\tLoaded value " << ret_val << "\n"
+           << "\tLast writer " << addrManager->printLastWriter(loc) << "\n";
+
+        // print out basic info
+        warn("%s\n", ss.str());
+
+        // TODO add more detailed info
+
+        // dump all error info and exit the simulation
+        tester->dumpErrorLog(ss);
+    }
+}
+
+bool
+Thread::checkDRF(Location atomic_loc, Location loc, bool isStore) const
+{
+    if (curEpisode && curEpisode->isEpsActive()) {
+        // check against the current episode this thread is executing
+        return curEpisode->checkDRF(atomic_loc, loc, isStore, numLanes);
+    }
+
+    return true;
+}
+
+void
+Thread::checkDeadlock()
+{
+    if ((curCycle() - lastActiveCycle) > deadlockThreshold) {
+        // deadlock detected
+        std::stringstream ss;
+
+        ss << threadName << ": Deadlock detected\n"
+           << "\tLast active cycle: " << lastActiveCycle << "\n"
+           << "\tCurrent cycle: " << curCycle() << "\n"
+           << "\tDeadlock threshold: " << deadlockThreshold << "\n";
+
+        // print out basic info
+        warn("%s\n", ss.str());
+
+        // dump all error info and exit the simulation
+        tester->dumpErrorLog(ss);
+    } else if (!tester->checkExit()) {
+        // schedule a future deadlock check event
+        assert(!deadlockCheckEvent.scheduled());
+        schedule(deadlockCheckEvent,
+                 deadlockThreshold * clockPeriod() + curTick());
+    }
+}
+
+void
+Thread::printOutstandingReqs(const OutstandingReqTable& table,
+                             std::stringstream& ss) const
+{
+    Cycles cur_cycle = curCycle();
+
+    for (const auto& m : table) {
+        for (const auto& req : m.second) {
+            ss << "\t\t\tAddr " << printAddress(m.first)
+               << ": delta (curCycle - issueCycle) = "
+               << (cur_cycle - req.issueCycle) << std::endl;
+        }
+    }
+}
+
+void
+Thread::printAllOutstandingReqs(std::stringstream& ss) const
+{
+    // dump all outstanding requests of this thread
+    ss << "\t\tOutstanding Loads:\n";
+    printOutstandingReqs(outstandingLoads, ss);
+    ss << "\t\tOutstanding Stores:\n";
+    printOutstandingReqs(outstandingStores, ss);
+    ss << "\t\tOutstanding Atomics:\n";
+    printOutstandingReqs(outstandingAtomics, ss);
+    ss << "\t\tNumber of outstanding acquires & releases: "
+       << pendingFenceCount << std::endl;
+}
diff --git a/src/cpu/testers/gpu_ruby_test/Thread.hh b/src/cpu/testers/gpu_ruby_test/Thread.hh
new file mode 100644
index 0000000..6816bb8
--- /dev/null
+++ b/src/cpu/testers/gpu_ruby_test/Thread.hh
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2017 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Tuan Ta
+ */
+
+/*
+ * Thread issues requests to and receives responses from Ruby memory
+ */
+
+#ifndef CPU_TESTERS_PROTOCOL_TESTER_THREAD_HH_
+#define CPU_TESTERS_PROTOCOL_TESTER_THREAD_HH_
+
+#include "cpu/testers/gpu_ruby_test/AddressManager.hh"
+#include "cpu/testers/gpu_ruby_test/Episode.hh"
+#include "cpu/testers/gpu_ruby_test/ProtocolTester.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "sim/clocked_object.hh"
+
+class Thread : public ClockedObject
+{
+  public:
+    typedef ThreadParams Params;
+    Thread(const Params *p);
+    virtual ~Thread();
+
+    typedef AddressManager::Location Location;
+    typedef AddressManager::Value Value;
+
+    void wakeup();
+    void scheduleWakeup();
+    void checkDeadlock();
+    void scheduleDeadlockCheckEvent();
+
+    void attachThreadToPorts(ProtocolTester *_tester,
+                             ProtocolTester::SeqPort *_port,
+                             ProtocolTester::SeqPort *_scalarPort = nullptr,
+                             ProtocolTester::SeqPort *_sqcPort = nullptr);
+
+    const std::string& getName() const { return threadName; }
+
+    // must be implemented by a child class
+    virtual void hitCallback(PacketPtr pkt) = 0;
+
+    int getThreadId() const { return threadId; }
+    int getNumLanes() const { return numLanes; }
+    // check if the input location would satisfy DRF constraint
+    bool checkDRF(Location atomic_loc, Location loc, bool isStore) const;
+
+    void printAllOutstandingReqs(std::stringstream& ss) const;
+
+  protected:
+    class ThreadEvent : public Event
+    {
+      private:
+        Thread* thread;
+        std::string desc;
+
+      public:
+        ThreadEvent(Thread* _thread, std::string _description)
+            : Event(CPU_Tick_Pri), thread(_thread), desc(_description)
+        {}
+        void setDesc(std::string _description) { desc = _description; }
+        void process() { thread->wakeup(); }
+        const std::string name() const { return desc; }
+    };
+
+    ThreadEvent threadEvent;
+
+    class DeadlockCheckEvent : public Event
+    {
+      private:
+        Thread* thread;
+
+      public:
+        DeadlockCheckEvent(Thread* _thread)
+            : Event(CPU_Tick_Pri), thread(_thread)
+        {}
+        void process() { thread->checkDeadlock(); }
+        const std::string name() const { return "Tester deadlock check"; }
+    };
+
+    DeadlockCheckEvent deadlockCheckEvent;
+
+    struct OutstandingReq
+    {
+        int lane;
+        Location origLoc;
+        Value storedValue;
+        Cycles issueCycle;
+
+        OutstandingReq(int _lane, Location _loc, Value _val, Cycles _cycle)
+            : lane(_lane), origLoc(_loc), storedValue(_val), issueCycle(_cycle)
+        {}
+
+        ~OutstandingReq()
+        {}
+    };
+
+    // the unique global id of this thread
+    int threadId;
+    // width of this thread (1 for cpu thread & wf size for gpu wavefront)
+    int numLanes;
+    // thread name
+    std::string threadName;
+    // pointer to the main tester
+    ProtocolTester *tester;
+    // pointer to the address manager
+    AddressManager *addrManager;
+
+    ProtocolTester::SeqPort *port;       // main data port (GPU-vector data)
+    ProtocolTester::SeqPort *scalarPort; // nullptr for CPU
+    ProtocolTester::SeqPort *sqcPort;    // nullptr for CPU
+
+    // a list of issued episodes sorted by time
+    // the last episode in the list is the current episode
+    typedef std::vector<Episode*> EpisodeHistory;
+    EpisodeHistory episodeHistory;
+    // pointer to the current episode
+    Episode *curEpisode;
+    // pointer to the current action
+    const Episode::Action *curAction;
+
+    // number of outstanding requests that are waiting for their responses
+    int pendingLdStCount;
+    int pendingFenceCount;
+    int pendingAtomicCount;
+
+    // last cycle when there is an event in this thread
+    Cycles lastActiveCycle;
+    Cycles deadlockThreshold;
+
+    // a per-address list of outstanding requests
+    typedef std::vector<OutstandingReq> OutstandingReqList;
+    typedef std::unordered_map<Addr, OutstandingReqList> OutstandingReqTable;
+    OutstandingReqTable outstandingLoads;
+    OutstandingReqTable outstandingStores;
+    OutstandingReqTable outstandingAtomics;
+
+    void issueNewEpisode();
+    // check if the next action in the current episode satisfies all wait_cnt
+    // constraints and is ready to issue
+    bool isNextActionReady();
+    void issueNextAction();
+
+    // issue Ops to Ruby memory
+    // must be implemented by a child class
+    virtual void issueLoadOps() = 0;
+    virtual void issueStoreOps() = 0;
+    virtual void issueAtomicOps() = 0;
+    virtual void issueAcquireOp() = 0;
+    virtual void issueReleaseOp() = 0;
+
+    // add an outstanding request to its corresponding table
+    void addOutstandingReqs(OutstandingReqTable& req_table, Addr addr,
+                            int lane, Location loc,
+                            Value stored_val = AddressManager::INVALID_VALUE);
+
+    // pop an outstanding request from the input table
+    OutstandingReq popOutstandingReq(OutstandingReqTable& req_table,
+                                     Addr address);
+
+    // validate all atomic responses
+    void validateAtomicResp(Location loc, int lane, Value ret_val);
+    // validate all Load responses
+    void validateLoadResp(Location loc, int lane, Value ret_val);
+
+    void printOutstandingReqs(const OutstandingReqTable& table,
+                              std::stringstream& ss) const;
+};
+
+#endif /* CPU_TESTERS_PROTOCOL_TESTER_THREAD_HH_ */
diff --git a/src/cpu/testers/gpu_ruby_test/Thread.py b/src/cpu/testers/gpu_ruby_test/Thread.py
new file mode 100644
index 0000000..22d2b78
--- /dev/null
+++ b/src/cpu/testers/gpu_ruby_test/Thread.py
@@ -0,0 +1,46 @@
+#
+# Copyright (c) 2017 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from this
+# software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Authors: Tuan Ta
+#
+
+from ClockedObject import ClockedObject
+from m5.params import *
+from m5.proxy import *
+
+class Thread(ClockedObject):
+    type = 'Thread'
+    abstract = True
+    cxx_header = "cpu/testers/gpu_ruby_test/Thread.hh"
+    thread_id = Param.Int("Unique Thread ID")
+    num_lanes = Param.Int("Number of lanes this thread has")
+    deadlock_threshold = Param.Cycles(1000000000, "Deadlock threshold")
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index 87f29eb..7fd0167 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -878,7 +878,7 @@
                    cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr);
         }
     } else {
-        if (pkt->cmd == MemCmd::MemFenceReq) {
+        if (pkt->cmd == MemCmd::MemSyncReq) {
             gpuDynInst->statusBitVector = VectorMask(0);
         } else {
             gpuDynInst->statusBitVector &= (~(1ll << index));
@@ -951,7 +951,7 @@
         req->setFlags(Request::KERNEL);
     }
 
-    // for non-kernel MemFence operations, memorder flags are set depending
+    // for non-kernel MemSync operations, memorder flags are set depending
     // on which type of request is currently being sent, so this
     // should be set by the caller (e.g. if an inst has acq-rel
     // semantics, it will send one acquire req and one release req)
@@ -961,7 +961,7 @@
     assert(req->isAcquire() || req->isRelease());
 
     // create packet
-    PacketPtr pkt = new Packet(req, MemCmd::MemFenceReq);
+    PacketPtr pkt = new Packet(req, MemCmd::MemSyncReq);
 
     // set packet's sender state
     pkt->senderState =
@@ -988,7 +988,7 @@
 
     Addr paddr = pkt->req->getPaddr();
 
-    if (pkt->cmd != MemCmd::MemFenceResp) {
+    if (pkt->cmd != MemCmd::MemSyncResp) {
         int index = gpuDynInst->memStatusVector[paddr].back();
 
         DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
diff --git a/src/mem/packet.cc b/src/mem/packet.cc
index ffda3d5..0058df3 100644
--- a/src/mem/packet.cc
+++ b/src/mem/packet.cc
@@ -178,12 +178,12 @@
     /* IntReq -- for interrupts */
     { SET4(IsWrite, IsRequest, NeedsResponse, HasData),
         MessageResp, "MessageReq" },
-    /* IntResp -- for interrupts */
+    /* MessageResp -- for interrupts or memory ACKs */
     { SET2(IsWrite, IsResponse), InvalidCmd, "MessageResp" },
-    /* MemFenceReq -- for synchronization requests */
-    {SET2(IsRequest, NeedsResponse), MemFenceResp, "MemFenceReq"},
-    /* MemFenceResp -- for synchronization responses */
-    {SET1(IsResponse), InvalidCmd, "MemFenceResp"},
+    /* MemSyncReq -- for synchronization requests */
+    {SET2(IsRequest, NeedsResponse), MemSyncResp, "MemSyncReq"},
+    /* MemSyncResp -- for synchronization responses */
+    {SET1(IsResponse), InvalidCmd, "MemSyncResp"},
     /* Cache Clean Request -- Update with the latest data all existing
        copies of the block down to the point indicated by the
        request */
diff --git a/src/mem/packet.hh b/src/mem/packet.hh
index 66625b3..28f4cfc 100644
--- a/src/mem/packet.hh
+++ b/src/mem/packet.hh
@@ -114,8 +114,8 @@
         SwapResp,
         MessageReq,
         MessageResp,
-        MemFenceReq,
-        MemFenceResp,
+        MemSyncReq,
+        MemSyncResp,
         CleanSharedReq,
         CleanSharedResp,
         CleanInvalidReq,
diff --git a/src/mem/protocol/GPU_VIPER-TCC.sm b/src/mem/protocol/GPU_VIPER-TCC.sm
index e21e980..42367d4 100644
--- a/src/mem/protocol/GPU_VIPER-TCC.sm
+++ b/src/mem/protocol/GPU_VIPER-TCC.sm
@@ -126,6 +126,7 @@
   void wakeUpAllBuffers();
   void wakeUpBuffers(Addr a);
 
+  MachineID mapAddressToMachine(Addr addr, MachineType mtype);
 
   // FUNCTION DEFINITIONS
   Tick clockEdge();
diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc
index 5977ce9..5162015 100644
--- a/src/mem/ruby/system/RubyPort.cc
+++ b/src/mem/ruby/system/RubyPort.cc
@@ -267,7 +267,7 @@
     }
     // Check for pio requests and directly send them to the dedicated
     // pio port.
-    if (pkt->cmd != MemCmd::MemFenceReq) {
+    if (pkt->cmd != MemCmd::MemSyncReq) {
         if (!isPhysMemAddress(pkt->getAddr())) {
             assert(ruby_port->memMasterPort.isConnected());
             DPRINTF(RubyPort, "Request address %#x assumed to be a "
@@ -304,7 +304,7 @@
         return true;
     }
 
-    if (pkt->cmd != MemCmd::MemFenceReq) {
+    if (pkt->cmd != MemCmd::MemSyncReq) {
         DPRINTF(RubyPort,
                 "Request for address %#x did not issued because %s\n",
                 pkt->getAddr(), RequestStatus_to_string(requestStatus));
@@ -326,7 +326,7 @@
 
     // Check for pio requests and directly send them to the dedicated
     // pio port.
-    if (pkt->cmd != MemCmd::MemFenceReq) {
+    if (pkt->cmd != MemCmd::MemSyncReq) {
         if (!isPhysMemAddress(pkt->getAddr())) {
             assert(ruby_port->memMasterPort.isConnected());
             DPRINTF(RubyPort, "Request address %#x assumed to be a "
@@ -544,7 +544,7 @@
     }
 
     // Flush, acquire, release requests don't access physical memory
-    if (pkt->isFlush() || pkt->cmd == MemCmd::MemFenceReq) {
+    if (pkt->isFlush() || pkt->cmd == MemCmd::MemSyncReq) {
         accessPhysMem = false;
     }