misc: Merge branch version update into develop
diff --git a/configs/common/MemConfig.py b/configs/common/MemConfig.py
index 9443520..b530145 100644
--- a/configs/common/MemConfig.py
+++ b/configs/common/MemConfig.py
@@ -40,7 +40,8 @@
 from common import ObjectList
 from common import HMC
 
-def create_mem_ctrl(cls, r, i, nbr_mem_ctrls, intlv_bits, intlv_size):
+def create_mem_ctrl(cls, r, i, nbr_mem_ctrls, intlv_bits, intlv_size,\
+                    xor_low_bit):
     """
     Helper function for creating a single memory controller from the given
     options.  This function is invoked multiple times in the config_mem function
@@ -55,7 +56,10 @@
     # the details of the caches here, make an educated guess. 4 MByte
     # 4-way associative with 64 byte cache lines is 6 offset bits and
     # 14 index bits.
-    xor_low_bit = 20
+    if (xor_low_bit):
+        xor_high_bit = xor_low_bit + intlv_bits - 1
+    else:
+        xor_high_bit = 0
 
     # Create an instance so we can figure out the address
     # mapping and row-buffer size
@@ -81,8 +85,7 @@
     ctrl.range = m5.objects.AddrRange(r.start, size = r.size(),
                                       intlvHighBit = \
                                           intlv_low_bit + intlv_bits - 1,
-                                      xorHighBit = \
-                                          xor_low_bit + intlv_bits - 1,
+                                      xorHighBit = xor_high_bit,
                                       intlvBits = intlv_bits,
                                       intlvMatch = i)
     return ctrl
@@ -110,6 +113,7 @@
     opt_mem_ranks = getattr(options, "mem_ranks", None)
     opt_dram_powerdown = getattr(options, "enable_dram_powerdown", None)
     opt_mem_channels_intlv = getattr(options, "mem_channels_intlv", 128)
+    opt_xor_low_bit = getattr(options, "xor_low_bit", 0)
 
     if opt_mem_type == "HMC_2500_1x32":
         HMChost = HMC.config_hmc_host_ctrl(options, system)
@@ -163,7 +167,7 @@
     for r in system.mem_ranges:
         for i in range(nbr_mem_ctrls):
             mem_ctrl = create_mem_ctrl(cls, r, i, nbr_mem_ctrls, intlv_bits,
-                                       intlv_size)
+                                       intlv_size, opt_xor_low_bit)
             # Set the number of ranks based on the command-line
             # options if it was explicitly set
             if issubclass(cls, m5.objects.DRAMCtrl) and opt_mem_ranks:
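
For reference, the channel selection these AddrRange arguments drive works
roughly as follows (a standalone C++ sketch assuming gem5's legacy
interleaved-AddrRange semantics; the helper names are hypothetical, not part
of the patch). When xor_low_bit is 0, xor_high_bit is forced to 0 above and
the hash term drops out:

    #include <cstdint>
    #include <cstdio>

    // Extract bits [hi:lo] of an address.
    static uint64_t bits(uint64_t val, int hi, int lo)
    {
        return (val >> lo) & ((1ULL << (hi - lo + 1)) - 1);
    }

    // Channel index for one address, given the parameters computed in
    // create_mem_ctrl().
    static unsigned selectChannel(uint64_t addr, int intlv_high_bit,
                                  int xor_high_bit, int intlv_bits)
    {
        uint64_t sel = bits(addr, intlv_high_bit,
                            intlv_high_bit - intlv_bits + 1);
        if (xor_high_bit)
            sel ^= bits(addr, xor_high_bit, xor_high_bit - intlv_bits + 1);
        return (unsigned)sel;
    }

    int main()
    {
        // 4 channels (intlv_bits = 2), 128 B granularity -> intlvHighBit 8;
        // xor_low_bit = 20 gives xorHighBit = 21.
        std::printf("%u\n", selectChannel(0x240000, 8, 21, 2)); // prints 2
        return 0;
    }
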
diff --git a/configs/ruby/MOESI_CMP_directory.py b/configs/ruby/MOESI_CMP_directory.py
index 8778b61..a78f73c 100644
--- a/configs/ruby/MOESI_CMP_directory.py
+++ b/configs/ruby/MOESI_CMP_directory.py
@@ -211,6 +211,7 @@
         dir_cntrl.forwardFromDir.master = ruby_system.network.slave
         dir_cntrl.requestToMemory = MessageBuffer()
         dir_cntrl.responseFromMemory = MessageBuffer()
+        dir_cntrl.triggerQueue = MessageBuffer(ordered = True)
 
 
     for i, dma_port in enumerate(dma_ports):
diff --git a/configs/ruby/Ruby.py b/configs/ruby/Ruby.py
index e69784f..9bceaa3 100644
--- a/configs/ruby/Ruby.py
+++ b/configs/ruby/Ruby.py
@@ -76,6 +76,15 @@
     parser.add_option("--numa-high-bit", type="int", default=0,
                       help="high order address bit to use for numa mapping. " \
                            "0 = highest bit, not specified = lowest bit")
+    parser.add_option("--interleaving-bits", type="int", default=0,
+                      help="number of bits to specify interleaving " \
+                           "in directory, memory controllers and caches. "
+                           "0 = not specified")
+    parser.add_option("--xor-low-bit", type="int", default=20,
+                      help="hashing bit for channel selection" \
+                           "see MemConfig for explanation of the default"\
+                           "parameter. If set to 0, xor_high_bit is also"\
+                           "set to 0.")
 
     parser.add_option("--recycle-latency", type="int", default=10,
                       help="Recycle latency for ruby controller input buffers")
@@ -86,7 +95,13 @@
     Network.define_options(parser)
 
 def setup_memory_controllers(system, ruby, dir_cntrls, options):
-    ruby.block_size_bytes = options.cacheline_size
+    if (options.numa_high_bit):
+        block_size_bits = options.numa_high_bit + 1 - \
+                          int(math.log(options.num_dirs, 2))
+        ruby.block_size_bytes = 2 ** (block_size_bits)
+    else:
+        ruby.block_size_bytes = options.cacheline_size
+
     ruby.memory_size_bits = 48
 
     index = 0
@@ -117,7 +132,7 @@
             mem_type = ObjectList.mem_list.get(options.mem_type)
             mem_ctrl = MemConfig.create_mem_ctrl(mem_type, r, index,
                 options.num_dirs, int(math.log(options.num_dirs, 2)),
-                intlv_size)
+                intlv_size, options.xor_low_bit)
 
             if options.access_backing_store:
                 mem_ctrl.kvm_map=False
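
The block_size_bytes arithmetic above is easiest to see with numbers (a
minimal sketch of the same computation; the values are examples, not from
the patch): the log2(num_dirs) directory-select bits end at numa_high_bit,
and everything below them is the intra-block offset.

    #include <cmath>
    #include <cstdio>

    int main()
    {
        int numa_high_bit = 11;   // example value
        int num_dirs = 4;         // example value
        // Bits [11:10] select the directory; bits [9:0] are block offset.
        int block_size_bits = numa_high_bit + 1 -
                              (int)std::log2(num_dirs);
        std::printf("%d\n", 1 << block_size_bits); // 1024 byte blocks
        return 0;
    }
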
diff --git a/src/arch/arm/freebsd/fs_workload.cc b/src/arch/arm/freebsd/fs_workload.cc
index 91a1d89..e3660d9 100644
--- a/src/arch/arm/freebsd/fs_workload.cc
+++ b/src/arch/arm/freebsd/fs_workload.cc
@@ -83,13 +83,12 @@
     if (params()->early_kernel_symbols) {
         kernelObj->loadGlobalSymbols(kernelSymtab, 0, 0, _loadAddrMask);
         kernelObj->loadGlobalSymbols(
-                Loader::debugSymbolTable, 0, 0, _loadAddrMask);
+                &Loader::debugSymbolTable, 0, 0, _loadAddrMask);
     }
 
     // Check if the kernel image has a symbol that tells us it supports
     // device trees.
-    Addr addr;
-    fatal_if(!kernelSymtab->findAddress("fdt_get_range", addr),
+    fatal_if(kernelSymtab->find("fdt_get_range") == kernelSymtab->end(),
              "Kernel must have fdt support.");
     fatal_if(params()->dtb_filename == "", "dtb file is not specified.");
 
diff --git a/src/arch/arm/fs_workload.cc b/src/arch/arm/fs_workload.cc
index 09c7bb2..4c654b8 100644
--- a/src/arch/arm/fs_workload.cc
+++ b/src/arch/arm/fs_workload.cc
@@ -91,7 +91,7 @@
              "Can't find a matching boot loader / kernel combination!");
 
     if (bootldr)
-        bootldr->loadGlobalSymbols(Loader::debugSymbolTable);
+        bootldr->loadGlobalSymbols(&Loader::debugSymbolTable);
 }
 
 void
diff --git a/src/arch/arm/insts/static_inst.cc b/src/arch/arm/insts/static_inst.cc
index b84aa81..9c686f6 100644
--- a/src/arch/arm/insts/static_inst.cc
+++ b/src/arch/arm/insts/static_inst.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010-2014, 2016-2019 ARM Limited
+ * Copyright (c) 2010-2014, 2016-2020 ARM Limited
  * Copyright (c) 2013 Advanced Micro Devices, Inc.
  * All rights reserved
  *
@@ -393,18 +393,19 @@
 ArmStaticInst::printTarget(std::ostream &os, Addr target,
                            const Loader::SymbolTable *symtab) const
 {
-    Addr symbolAddr;
-    std::string symbol;
-
-    if (symtab && symtab->findNearestSymbol(target, symbol, symbolAddr)) {
-        ccprintf(os, "<%s", symbol);
-        if (symbolAddr != target)
-            ccprintf(os, "+%d>", target - symbolAddr);
-        else
-            ccprintf(os, ">");
-    } else {
-        ccprintf(os, "%#x", target);
+    if (symtab) {
+        auto it = symtab->findNearest(target);
+        if (it != symtab->end()) {
+            ccprintf(os, "<%s", it->name);
+            Addr delta = target - it->address;
+            if (delta)
+                ccprintf(os, "+%d>", delta);
+            else
+                ccprintf(os, ">");
+            return;
+        }
     }
+    ccprintf(os, "%#x", target);
 }
 
 void
@@ -477,13 +478,14 @@
                               const Addr addr,
                               const std::string &suffix) const
 {
-    Addr symbolAddr;
-    std::string symbol;
-    if (symtab && symtab->findNearestSymbol(addr, symbol, symbolAddr)) {
-        ccprintf(os, "%s%s", prefix, symbol);
-        if (symbolAddr != addr)
-            ccprintf(os, "+%d", addr - symbolAddr);
-        ccprintf(os, suffix);
+    if (symtab) {
+        auto it = symtab->findNearest(addr);
+        if (it != symtab->end()) {
+            ccprintf(os, "%s%s", prefix, it->name);
+            if (it->address != addr)
+                ccprintf(os, "+%d", addr - it->address);
+            ccprintf(os, suffix);
+        }
     }
 }
 
@@ -984,37 +986,41 @@
 }
 
 Fault
-ArmStaticInst::checkSveTrap(ThreadContext *tc, CPSR cpsr) const
+ArmStaticInst::checkSveEnabled(ThreadContext *tc, CPSR cpsr, CPACR cpacr) const
 {
     const ExceptionLevel el = (ExceptionLevel) (uint8_t) cpsr.el;
+    // Check if access disabled in CPACR_EL1
+    if (el <= EL1 && !ELIsInHost(tc, el)) {
+        if ((el == EL0 && cpacr.zen == 0x1) ||
+            (!(cpacr.zen & 0x1)))
+            return sveAccessTrap(EL1);
 
-    if (ArmSystem::haveVirtualization(tc) && el <= EL2) {
-        CPTR cptrEnCheck = tc->readMiscReg(MISCREG_CPTR_EL2);
-        if (cptrEnCheck.tz)
-            return sveAccessTrap(EL2);
+        if ((el == EL0 && cpacr.fpen == 0x1) ||
+            (!(cpacr.fpen & 0x1)))
+            return advSIMDFPAccessTrap64(EL1);
     }
 
+    // Check if access disabled in CPTR_EL2
+    if (el <= EL2 && EL2Enabled(tc)) {
+        CPTR cptr_en_check = tc->readMiscReg(MISCREG_CPTR_EL2);
+        if (cptr_en_check.tz)
+            return sveAccessTrap(EL2);
+        if (cptr_en_check.tfp)
+            return advSIMDFPAccessTrap64(EL2);
+    }
+
+    // Check if access disabled in CPTR_EL3
     if (ArmSystem::haveSecurity(tc)) {
-        CPTR cptrEnCheck = tc->readMiscReg(MISCREG_CPTR_EL3);
-        if (!cptrEnCheck.ez)
+        CPTR cptr_en_check = tc->readMiscReg(MISCREG_CPTR_EL3);
+        if (!cptr_en_check.ez)
             return sveAccessTrap(EL3);
+        if (cptr_en_check.tfp)
+            return advSIMDFPAccessTrap64(EL3);
     }
 
     return NoFault;
 }
 
-Fault
-ArmStaticInst::checkSveEnabled(ThreadContext *tc, CPSR cpsr, CPACR cpacr) const
-{
-    const ExceptionLevel el = (ExceptionLevel) (uint8_t) cpsr.el;
-    if ((el == EL0 && cpacr.zen != 0x3) ||
-        (el == EL1 && !(cpacr.zen & 0x1)))
-        return sveAccessTrap(EL1);
-
-    return checkSveTrap(tc, cpsr);
-}
-
-
 static uint8_t
 getRestoredITBits(ThreadContext *tc, CPSR spsr)
 {
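
The CPACR_EL1 logic above encodes the architectural 2-bit ZEN/FPEN fields;
as a quick reference, this standalone sketch (illustrative, not gem5 code)
spells out the truth table it implements:

    #include <cassert>

    // CPACR_EL1.ZEN / .FPEN: 0b00 and 0b10 trap accesses from EL0 and EL1,
    // 0b01 traps EL0 only, 0b11 traps nothing.
    static bool trapsAt(unsigned enc, bool at_el0)
    {
        if (!(enc & 0x1))
            return true;               // 0b00, 0b10
        return at_el0 && enc == 0x1;   // 0b01
    }

    int main()
    {
        assert(trapsAt(0b00, true) && trapsAt(0b00, false));
        assert(trapsAt(0b01, true) && !trapsAt(0b01, false));
        assert(!trapsAt(0b11, true) && !trapsAt(0b11, false));
        return 0;
    }
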
diff --git a/src/arch/arm/insts/static_inst.hh b/src/arch/arm/insts/static_inst.hh
index ce6569a..bee3903 100644
--- a/src/arch/arm/insts/static_inst.hh
+++ b/src/arch/arm/insts/static_inst.hh
@@ -476,11 +476,6 @@
     Fault sveAccessTrap(ExceptionLevel el) const;
 
     /**
-     * Check an SVE access against CPTR_EL2 and CPTR_EL3.
-     */
-    Fault checkSveTrap(ThreadContext *tc, CPSR cpsr) const;
-
-    /**
      * Check an SVE access against CPACR_EL1, CPTR_EL2, and CPTR_EL3.
      */
     Fault checkSveEnabled(ThreadContext *tc, CPSR cpsr, CPACR cpacr) const;
diff --git a/src/arch/arm/isa.cc b/src/arch/arm/isa.cc
index b3d6726..b18bbb0 100644
--- a/src/arch/arm/isa.cc
+++ b/src/arch/arm/isa.cc
@@ -787,12 +787,12 @@
     if (upper > 0) {
         miscRegs[lower] = bits(v, 31, 0);
         miscRegs[upper] = bits(v, 63, 32);
-        DPRINTF(MiscRegs, "Writing to misc reg %d (%d:%d) : %#x\n",
-                misc_reg, lower, upper, v);
+        DPRINTF(MiscRegs, "Writing MiscReg %s (%d %d:%d) : %#x\n",
+                miscRegName[misc_reg], misc_reg, lower, upper, v);
     } else {
         miscRegs[lower] = v;
-        DPRINTF(MiscRegs, "Writing to misc reg %d (%d) : %#x\n",
-                misc_reg, lower, v);
+        DPRINTF(MiscRegs, "Writing MiscReg %s (%d %d) : %#x\n",
+                miscRegName[misc_reg], misc_reg, lower, v);
     }
 }
 
diff --git a/src/arch/arm/isa/formats/branch.isa b/src/arch/arm/isa/formats/branch.isa
index b7360fc..7c726ef 100644
--- a/src/arch/arm/isa/formats/branch.isa
+++ b/src/arch/arm/isa/formats/branch.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010,2012-2013,2017-2018 ARM Limited
+// Copyright (c) 2010,2012-2013,2017-2018, 2020 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -187,8 +187,7 @@
                                   case 0x4:
                                     return new SevInst(machInst);
                                   case 0x5:
-                                    return new WarnUnimplemented(
-                                            "sevl", machInst);
+                                    return new SevlInst(machInst);
                                 }
                                 break;
                               case 0x1:
diff --git a/src/arch/arm/isa/formats/data.isa b/src/arch/arm/isa/formats/data.isa
index a927f2b..b742951 100644
--- a/src/arch/arm/isa/formats/data.isa
+++ b/src/arch/arm/isa/formats/data.isa
@@ -1,4 +1,4 @@
-// Copyright (c) 2010,2017-2018 ARM Limited
+// Copyright (c) 2010,2017-2018, 2020 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -1136,8 +1136,7 @@
                       case 0x4:
                         return new SevInst(machInst);
                       case 0x5:
-                        return new WarnUnimplemented(
-                                "sevl", machInst);
+                        return new SevlInst(machInst);
                       case 0x10:
                         return new WarnUnimplemented(
                                 "esb", machInst);
@@ -1283,6 +1282,8 @@
                 return new WfiInst(machInst);
               case 0x4:
                 return new SevInst(machInst);
+              case 0x5:
+                return new SevlInst(machInst);
               default:
                 return new WarnUnimplemented("unallocated_hint", machInst);
             }
diff --git a/src/arch/arm/isa/insts/ldr.isa b/src/arch/arm/isa/insts/ldr.isa
index dc1d650..d828fcf 100644
--- a/src/arch/arm/isa/insts/ldr.isa
+++ b/src/arch/arm/isa/insts/ldr.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010-2011,2019 ARM Limited
+// Copyright (c) 2010-2011,2019-2020 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -182,6 +182,7 @@
                 self.instFlags.extend(["IsMemBarrier",
                                        "IsWriteBarrier",
                                        "IsReadBarrier"])
+                self.memFlags.append("Request::ACQUIRE")
 
             # Disambiguate the class name for different flavors of loads
             if self.flavor != "normal":
@@ -256,6 +257,7 @@
                 self.instFlags.extend(["IsMemBarrier",
                                        "IsWriteBarrier",
                                        "IsReadBarrier"])
+                self.memFlags.append("Request::ACQUIRE")
 
         def emit(self):
             # Address computation code
diff --git a/src/arch/arm/isa/insts/ldr64.isa b/src/arch/arm/isa/insts/ldr64.isa
index 4f12509..fc4f34f 100644
--- a/src/arch/arm/isa/insts/ldr64.isa
+++ b/src/arch/arm/isa/insts/ldr64.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2011-2014, 2017, 2019 ARM Limited
+// Copyright (c) 2011-2014, 2017, 2019-2020 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -94,6 +94,8 @@
                 self.instFlags.extend(["IsMemBarrier",
                                        "IsWriteBarrier",
                                        "IsReadBarrier"])
+                self.memFlags.append("Request::ACQUIRE")
+
             if self.flavor in ("acex", "exclusive", "exp", "acexp"):
                 self.memFlags.append("Request::LLSC")
 
diff --git a/src/arch/arm/isa/insts/str.isa b/src/arch/arm/isa/insts/str.isa
index f542478..e99f6ad 100644
--- a/src/arch/arm/isa/insts/str.isa
+++ b/src/arch/arm/isa/insts/str.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010-2011,2017,2019 ARM Limited
+// Copyright (c) 2010-2011,2017,2019-2020 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -190,6 +190,7 @@
                 self.instFlags.extend(["IsMemBarrier",
                                        "IsWriteBarrier",
                                        "IsReadBarrier"])
+                self.memFlags.append("Request::RELEASE")
 
             # Disambiguate the class name for different flavors of stores
             if self.flavor != "normal":
@@ -271,6 +272,7 @@
                 self.instFlags.extend(["IsMemBarrier",
                                        "IsWriteBarrier",
                                        "IsReadBarrier"])
+                self.memFlags.append("Request::RELEASE")
 
             # Disambiguate the class name for different flavors of stores
             if self.flavor != "normal":
diff --git a/src/arch/arm/isa/insts/str64.isa b/src/arch/arm/isa/insts/str64.isa
index 22d1456..7ad1cad 100644
--- a/src/arch/arm/isa/insts/str64.isa
+++ b/src/arch/arm/isa/insts/str64.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2011-2013,2017,2019 ARM Limited
+// Copyright (c) 2011-2013,2017,2019-2020 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -82,6 +82,8 @@
                 self.instFlags.extend(["IsMemBarrier",
                                        "IsWriteBarrier",
                                        "IsReadBarrier"])
+                self.memFlags.append("Request::RELEASE")
+
             if self.flavor in ("relex", "exclusive", "exp", "relexp"):
                 self.instFlags.append("IsStoreConditional")
                 self.memFlags.append("Request::LLSC")
diff --git a/src/arch/arm/isa/insts/sve.isa b/src/arch/arm/isa/insts/sve.isa
index aa4f194..9314ba9 100644
--- a/src/arch/arm/isa/insts/sve.isa
+++ b/src/arch/arm/isa/insts/sve.isa
@@ -1516,27 +1516,49 @@
     # Generates definitions for SVE floating-point conversions (always
     # unary, constructive, merging)
     def sveCvtInst(name, Name, opClass, types, op, direction=CvtDir.Narrow,
-                   decoder='Generic'):
+                   decoder='Generic', signed=False):
         global header_output, exec_output, decoders
+
+        if signed:
+            mask = "SElement msk = mask(sizeof(DElement)*8);"
+            assign_code = '''
+                int sign_bit = bits(destElem, sizeof(DElement)*8 -1);
+                AA64FpDest_x%(bigElemSuffix)s[i] =
+                                    sign_bit? (destElem|~msk): destElem;
+                          '''  % {
+               'bigElemSuffix': 's' if direction == CvtDir.Narrow else 'd'
+               }
+        else:
+            mask = "";
+            assign_code = '''
+                AA64FpDest_x%(bigElemSuffix)s[i] = destElem;
+            '''  % {
+               'bigElemSuffix': 's' if direction == CvtDir.Narrow else 'd'
+                   }
+
         code = sveEnabledCheckCode + '''
         unsigned eCount = ArmStaticInst::getCurSveVecLen<%(bigElemType)s>(
                 xc->tcBase());
+        %(mask)s
         for (unsigned i = 0; i < eCount; i++) {
             SElement srcElem1 = AA64FpOp1_x%(bigElemSuffix)s[i] &
                     mask(sizeof(SElement) * 8);
             DElement destElem = 0;
             if (GpOp_x%(bigElemSuffix)s[i]) {
                 %(op)s
-                AA64FpDest_x%(bigElemSuffix)s[i] = destElem;
+                %(assign)s;
             } else {
                 AA64FpDest_x%(bigElemSuffix)s[i] =
                         AA64FpDestMerge_x%(bigElemSuffix)s[i];
             }
         }
-        ''' % {'op': op,
-               'bigElemType': 'SElement' if direction == CvtDir.Narrow
+        ''' % {'bigElemType': 'SElement' if direction == CvtDir.Narrow
                                          else 'DElement',
-               'bigElemSuffix': 's' if direction == CvtDir.Narrow else 'd'}
+               'op': op, 'mask': mask,
+               'bigElemSuffix': 's' if direction == CvtDir.Narrow else 'd',
+               'assign': assign_code
+               }
+
         iop = InstObjParams(name, 'Sve' + Name, 'SveUnaryPredOp',
                             {'code': code, 'op_class': opClass}, [])
         header_output += SveWideningUnaryPredOpDeclare.subst(iop)
@@ -1813,26 +1835,25 @@
                 xc->tcBase());
 
         // Number of elements in a 128 bit segment
-        constexpr unsigned ePerSegment = 128 / sizeof(Element);
+        constexpr unsigned ePerSegment = 16 / sizeof(Element);
 
-        '''
-
-        code += '''
+        ArmISA::VecRegContainer tmpC;
+        auto auxDest = tmpC.as<TPElem>();
         for (unsigned i = 0; i < eCount; i++) {
-                const auto segmentBase = i - i % ePerSegment;
-                const auto segmentIdx = segmentBase + index;
+            const auto segmentBase = i - i %% ePerSegment;
+            const auto segmentIdx = segmentBase + index;
 
-                const Element& srcElem1 = AA64FpOp1_x[i];
-                const Element& srcElem2 = AA64FpOp2_x[segmentIdx];
-                Element destElem = 0;
+            const Element& srcElem1 = AA64FpOp1_x[i];
+            const Element& srcElem2 = AA64FpOp2_x[segmentIdx];
+            Element destElem = 0;
 
-        '''
-
-        code += '''
-        %(op)s
-        AA64FpDest_x[i] = destElem;
+            %(op)s
+            auxDest[i] = destElem;
         }
-        ''' % {'op': op}
+
+        for (unsigned i = 0; i < eCount; i++) {
+            AA64FpDest_x[i] = auxDest[i];
+        }''' % {'op': op}
 
         baseClass = 'SveBinIdxUnpredOp'
 
@@ -2045,8 +2066,10 @@
                 xc->tcBase());
 
         // Number of elements in a 128 bit segment
-        constexpr unsigned ePerSegment = 128 / sizeof(Element);
+        constexpr unsigned ePerSegment = 16 / sizeof(Element);
 
+        ArmISA::VecRegContainer tmpC;
+        auto auxDest = tmpC.as<TPElem>();
         for (unsigned i = 0; i < eCount; i++) {
             const auto segmentBase = i - i % ePerSegment;
             const auto segmentIdx = segmentBase + index;
@@ -2055,10 +2078,13 @@
             const Element& srcElem2 = AA64FpOp2_x[segmentIdx];
             Element destElem = AA64FpDestMerge_x[i];
         '''
-
         code += '''
             %(op)s
-            AA64FpDest_x[i] = destElem;
+            auxDest[i] = destElem;
+        }
+
+        for (unsigned i = 0; i < eCount; i++) {
+            AA64FpDest_x[i] = auxDest[i];
         }''' % {'op': op}
 
         iop = InstObjParams(name, 'Sve' + Name, 'SveBinIdxUnpredOp',
@@ -2743,6 +2769,7 @@
         code = sveEnabledCheckCode + '''
         unsigned eCount = ArmStaticInst::getCurSveVecLen<Element>(
                 xc->tcBase());
+
         ArmISA::VecRegContainer tmpVecC;
         auto auxDest = tmpVecC.as<Element>();
         int firstelem = -1, lastelem = -2;
@@ -3001,6 +3028,9 @@
             code += '''
         uint32_t eltspersegment = 16 / (2 * sizeof(Element));'''
         code += '''
+        ArmISA::VecRegContainer tmpC;
+        auto auxDest = tmpC.as<TPElem>();
+
         for (int i = 0; i < eCount / 2; ++i) {'''
         if predType == PredType.NONE:
             code += '''
@@ -3044,9 +3074,14 @@
             code += '''
             }'''
         code += '''
-            AA64FpDest_x[2 * i] = addend_r;
-            AA64FpDest_x[2 * i + 1] = addend_i;
-        }'''
+            auxDest[2 * i] = addend_r;
+            auxDest[2 * i + 1] = addend_i;
+        }
+
+        for (unsigned i = 0; i < eCount; i++) {
+            AA64FpDest_x[i] = auxDest[i];
+        }
+        '''
         iop = InstObjParams(name, 'Sve' + Name,
                 'SveComplexIdxOp' if predType == PredType.NONE
                                   else 'SveComplexOp',
@@ -3596,7 +3631,7 @@
                 'uint32_t, uint32_t',
                 'uint64_t, uint32_t',
                 'uint64_t, uint64_t'),
-               fcvtzsCode, CvtDir.Narrow)
+               fcvtzsCode, CvtDir.Narrow, signed=True)
     sveCvtInst('fcvtzs', 'FcvtzsWiden', 'SimdCvtOp',
                ('uint16_t, uint32_t',
                 'uint16_t, uint64_t',
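
The signed=True path added above matters for narrowing conversions such as
FCVTZS from a 64-bit source to a 32-bit result held in a 64-bit lane: the
narrow result's sign bit must be replicated into the upper lane bits. In
plain C++ (a sketch of the generated code's effect, not the template
itself):

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        uint64_t destElem = 0xFFFFFFF6;  // 32-bit two's-complement -10
        uint64_t msk = 0xFFFFFFFFULL;    // mask(sizeof(DElement) * 8)
        bool sign_bit = (destElem >> 31) & 1;
        uint64_t lane = sign_bit ? (destElem | ~msk) : destElem;
        std::printf("%#llx\n", (unsigned long long)lane);
        // prints 0xfffffffffffffff6, i.e. -10 in the full 64-bit lane
        return 0;
    }
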
diff --git a/src/arch/arm/linux/fs_workload.cc b/src/arch/arm/linux/fs_workload.cc
index cc28193..7f0853f 100644
--- a/src/arch/arm/linux/fs_workload.cc
+++ b/src/arch/arm/linux/fs_workload.cc
@@ -78,15 +78,14 @@
     if (params()->early_kernel_symbols) {
         kernelObj->loadGlobalSymbols(kernelSymtab, 0, 0, _loadAddrMask);
         kernelObj->loadGlobalSymbols(
-                Loader::debugSymbolTable, 0, 0, _loadAddrMask);
+                &Loader::debugSymbolTable, 0, 0, _loadAddrMask);
     }
 
     // Setup boot data structure
-    Addr addr;
     // Check if the kernel image has a symbol that tells us it supports
     // device trees.
     bool kernel_has_fdt_support =
-        kernelSymtab->findAddress("unflatten_device_tree", addr);
+        kernelSymtab->find("unflatten_device_tree") != kernelSymtab->end();
     bool dtb_file_specified = params()->dtb_filename != "";
 
     if (kernel_has_fdt_support && dtb_file_specified) {
diff --git a/src/arch/arm/linux/process.cc b/src/arch/arm/linux/process.cc
index b5b6553..4c679b3 100644
--- a/src/arch/arm/linux/process.cc
+++ b/src/arch/arm/linux/process.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010-2013, 2015 ARM Limited
+ * Copyright (c) 2010-2013, 2015, 2020 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -290,14 +290,14 @@
         { base + 151, "munlock" },
         { base + 152, "mlockall" },
         { base + 153, "munlockall" },
-        { base + 154, "sched_setparam" },
-        { base + 155, "sched_getparam" },
-        { base + 156, "sched_setscheduler" },
-        { base + 157, "sched_getscheduler" },
-        { base + 158, "sched_yield" },
-        { base + 159, "sched_get_priority_max" },
-        { base + 160, "sched_get_priority_min" },
-        { base + 161, "sched_rr_get_interval" },
+        { base + 154, "sched_setparam", ignoreWarnOnceFunc },
+        { base + 155, "sched_getparam", ignoreWarnOnceFunc },
+        { base + 156, "sched_setscheduler", ignoreWarnOnceFunc },
+        { base + 157, "sched_getscheduler", ignoreWarnOnceFunc },
+        { base + 158, "sched_yield", ignoreWarnOnceFunc },
+        { base + 159, "sched_get_priority_max", ignoreWarnOnceFunc },
+        { base + 160, "sched_get_priority_min", ignoreWarnOnceFunc },
+        { base + 161, "sched_rr_get_interval", ignoreWarnOnceFunc },
         { base + 162, "nanosleep", ignoreWarnOnceFunc },
         { base + 163, "mremap", mremapFunc<ArmLinux32> }, // ARM-specific
         { base + 164, "setresuid" },
@@ -375,7 +375,7 @@
         { base + 238, "tkill" },
         { base + 239, "sendfile64" },
         { base + 240, "futex", futexFunc<ArmLinux32> },
-        { base + 241, "sched_setaffinity" },
+        { base + 241, "sched_setaffinity", ignoreWarnOnceFunc },
         { base + 242, "sched_getaffinity", ignoreFunc },
         { base + 243, "io_setup" },
         { base + 244, "io_destroy" },
@@ -476,7 +476,7 @@
         { base + 342, "tee" },
         { base + 343, "vmsplice" },
         { base + 344, "move_pages" },
-        { base + 345, "getcpu" },
+        { base + 345, "getcpu", getcpuFunc },
         { base + 346, "epoll_pwait" },
         { base + 347, "sys_kexec_load" },
         { base + 348, "sys_utimensat" },
@@ -631,16 +631,16 @@
         {  base + 115, "clock_nanosleep" },
         {  base + 116, "syslog" },
         {  base + 117, "ptrace" },
-        {  base + 118, "sched_setparam" },
-        {  base + 119, "sched_setscheduler" },
-        {  base + 120, "sched_getscheduler" },
-        {  base + 121, "sched_getparam" },
-        {  base + 122, "sched_setaffinity" },
+        {  base + 118, "sched_setparam", ignoreWarnOnceFunc },
+        {  base + 119, "sched_setscheduler", ignoreWarnOnceFunc },
+        {  base + 120, "sched_getscheduler", ignoreWarnOnceFunc },
+        {  base + 121, "sched_getparam", ignoreWarnOnceFunc },
+        {  base + 122, "sched_setaffinity", ignoreWarnOnceFunc },
         {  base + 123, "sched_getaffinity", ignoreFunc },
-        {  base + 124, "sched_yield" },
-        {  base + 125, "sched_get_priority_max" },
-        {  base + 126, "sched_get_priority_min" },
-        {  base + 127, "sched_rr_get_interval" },
+        {  base + 124, "sched_yield", ignoreWarnOnceFunc },
+        {  base + 125, "sched_get_priority_max", ignoreWarnOnceFunc },
+        {  base + 126, "sched_get_priority_min", ignoreWarnOnceFunc },
+        {  base + 127, "sched_rr_get_interval", ignoreWarnOnceFunc },
         {  base + 128, "restart_syscall" },
         {  base + 129, "kill", ignoreFunc },
         {  base + 130, "tkill" },
@@ -681,7 +681,7 @@
         {  base + 165, "getrusage", getrusageFunc<ArmLinux64> },
         {  base + 166, "umask" },
         {  base + 167, "prctl" },
-        {  base + 168, "getcpu" },
+        {  base + 168, "getcpu", getcpuFunc },
         {  base + 169, "gettimeofday", gettimeofdayFunc<ArmLinux64> },
         {  base + 170, "settimeofday" },
         {  base + 171, "adjtimex" },
diff --git a/src/arch/arm/semihosting.cc b/src/arch/arm/semihosting.cc
index a7a4d2a..ab21dda 100644
--- a/src/arch/arm/semihosting.cc
+++ b/src/arch/arm/semihosting.cc
@@ -696,18 +696,6 @@
 namespace GuestABI
 {
 
-// Ignore return values since those will be handled by semihosting.
-template <typename T>
-struct Result<SemiPseudoAbi32, T>
-{
-    static void store(ThreadContext *tc, const T &ret) {}
-};
-template <typename T>
-struct Result<SemiPseudoAbi64, T>
-{
-    static void store(ThreadContext *tc, const T &ret) {}
-};
-
 // Handle arguments the same as for semihosting operations. Skipping the first
 // slot is handled internally by the State type.
 template <typename T>
diff --git a/src/arch/arm/stacktrace.cc b/src/arch/arm/stacktrace.cc
index d0e702c..535cddd 100644
--- a/src/arch/arm/stacktrace.cc
+++ b/src/arch/arm/stacktrace.cc
@@ -47,11 +47,10 @@
     PortProxy &vp = tc->getVirtProxy();
     const auto *symtab = tc->getSystemPtr()->workload->symtab(tc);
 
-    Addr addr;
-    if (!symtab->findAddress(name, addr))
-        panic("thread info not compiled into kernel\n");
+    auto it = symtab->find(name);
+    panic_if(it == symtab->end(), "Thread info not compiled into kernel.");
 
-    return vp.read<int32_t>(addr, GuestByteOrder);
+    return vp.read<int32_t>(it->address, GuestByteOrder);
 }
 
 ProcessInfo::ProcessInfo(ThreadContext *_tc) : tc(_tc)
diff --git a/src/arch/generic/linux/threadinfo.hh b/src/arch/generic/linux/threadinfo.hh
index 83e41b9..5e058cf 100644
--- a/src/arch/generic/linux/threadinfo.hh
+++ b/src/arch/generic/linux/threadinfo.hh
@@ -46,15 +46,16 @@
     bool
     get_data(const char *symbol, T &data)
     {
-        Addr addr = 0;
-        if (!sys->workload->symtab(tc)->findAddress(symbol, addr)) {
+        auto *symtab = sys->workload->symtab(tc);
+        auto it = symtab->find(symbol);
+        if (it == symtab->end()) {
             warn_once("Unable to find kernel symbol %s\n", symbol);
             warn_once("Kernel not compiled with task_struct info; can't get "
                       "currently executing task/process/thread name/ids!\n");
             return false;
         }
 
-        data = tc->getVirtProxy().read<T>(addr, TheISA::GuestByteOrder);
+        data = tc->getVirtProxy().read<T>(it->address, TheISA::GuestByteOrder);
 
         return true;
     }
diff --git a/src/arch/mips/isa/formats/branch.isa b/src/arch/mips/isa/formats/branch.isa
index 06662f0..4975a13 100644
--- a/src/arch/mips/isa/formats/branch.isa
+++ b/src/arch/mips/isa/formats/branch.isa
@@ -193,11 +193,11 @@
 
         Addr target = pc + 4 + disp;
 
-        std::string str;
-        if (symtab && symtab->findSymbol(target, str))
-            ss << str;
+        Loader::SymbolTable::const_iterator it;
+        if (symtab && (it = symtab->find(target)) != symtab->end())
+            ss << it->name;
         else
-            ccprintf(ss, "0x%x", target);
+            ccprintf(ss, "%#x", target);
 
         return ss.str();
     }
@@ -213,9 +213,9 @@
             Addr npc = pc + 4;
             ccprintf(ss,"0x%x",(npc & 0xF0000000) | disp);
         } else if (_numSrcRegs == 0) {
-            std::string str;
-            if (symtab && symtab->findSymbol(disp, str))
-                ss << str;
+            Loader::SymbolTable::const_iterator it;
+            if (symtab && (it = symtab->find(disp)) != symtab->end())
+                ss << it->name;
             else
                 ccprintf(ss, "0x%x", disp);
         } else if (_numSrcRegs == 1) {
diff --git a/src/arch/power/insts/branch.cc b/src/arch/power/insts/branch.cc
index 5fe0c4e..3511b6b 100644
--- a/src/arch/power/insts/branch.cc
+++ b/src/arch/power/insts/branch.cc
@@ -68,11 +68,11 @@
 
     Addr target = pc + disp;
 
-    std::string str;
-    if (symtab && symtab->findSymbol(target, str))
-        ss << str;
+    Loader::SymbolTable::const_iterator it;
+    if (symtab && (it = symtab->find(target)) != symtab->end())
+        ss << it->name;
     else
-        ccprintf(ss, "0x%x", target);
+        ccprintf(ss, "%#x", target);
 
     return ss.str();
 }
@@ -91,11 +91,11 @@
 
     ccprintf(ss, "%-10s ", mnemonic);
 
-    std::string str;
-    if (symtab && symtab->findSymbol(targetAddr, str))
-        ss << str;
+    Loader::SymbolTable::const_iterator it;
+    if (symtab && (it = symtab->find(targetAddr)) != symtab->end())
+        ss << it->name;
     else
-        ccprintf(ss, "0x%x", targetAddr);
+        ccprintf(ss, "%#x", targetAddr);
 
     return ss.str();
 }
@@ -118,11 +118,11 @@
 
     Addr target = pc + disp;
 
-    std::string str;
-    if (symtab && symtab->findSymbol(target, str))
-        ss << str;
+    Loader::SymbolTable::const_iterator it;
+    if (symtab && (it = symtab->find(target)) != symtab->end())
+        ss << it->name;
     else
-        ccprintf(ss, "0x%x", target);
+        ccprintf(ss, "%#x", target);
 
     return ss.str();
 }
@@ -143,11 +143,11 @@
 
     ss << bo << ", " << bi << ", ";
 
-    std::string str;
-    if (symtab && symtab->findSymbol(targetAddr, str))
-        ss << str;
+    Loader::SymbolTable::const_iterator it;
+    if (symtab && (it = symtab->find(targetAddr)) != symtab->end())
+        ss << it->name;
     else
-        ccprintf(ss, "0x%x", targetAddr);
+        ccprintf(ss, "%#x", targetAddr);
 
     return ss.str();
 }
diff --git a/src/arch/riscv/bare_metal/fs_workload.hh b/src/arch/riscv/bare_metal/fs_workload.hh
index 2e26ad1..a142d47 100644
--- a/src/arch/riscv/bare_metal/fs_workload.hh
+++ b/src/arch/riscv/bare_metal/fs_workload.hh
@@ -55,9 +55,9 @@
         return bootloaderSymtab;
     }
     bool
-    insertSymbol(Addr address, const std::string &symbol) override
+    insertSymbol(const Loader::Symbol &symbol) override
     {
-        return bootloaderSymtab->insert(address, symbol);
+        return bootloaderSymtab->insert(symbol);
     }
 };
 
diff --git a/src/arch/sparc/fs_workload.hh b/src/arch/sparc/fs_workload.hh
index 0323714..650ed37 100644
--- a/src/arch/sparc/fs_workload.hh
+++ b/src/arch/sparc/fs_workload.hh
@@ -61,9 +61,9 @@
     }
 
     bool
-    insertSymbol(Addr address, const std::string &symbol) override
+    insertSymbol(const Loader::Symbol &symbol) override
     {
-        return defaultSymtab.insert(address, symbol);
+        return defaultSymtab.insert(symbol);
     }
 };
 
diff --git a/src/arch/sparc/insts/branch.cc b/src/arch/sparc/insts/branch.cc
index 8ffa241..52517e6 100644
--- a/src/arch/sparc/insts/branch.cc
+++ b/src/arch/sparc/insts/branch.cc
@@ -77,18 +77,17 @@
         Addr pc, const Loader::SymbolTable *symtab) const
 {
     std::stringstream response;
-    std::string symbol;
-    Addr symbol_addr;
 
     Addr target = disp + pc;
 
     printMnemonic(response, mnemonic);
-    ccprintf(response, "0x%x", target);
+    ccprintf(response, "%#x", target);
 
-    if (symtab && symtab->findNearestSymbol(target, symbol, symbol_addr)) {
-        ccprintf(response, " <%s", symbol);
-        if (symbol_addr != target)
-            ccprintf(response, "+%d>", target - symbol_addr);
+    Loader::SymbolTable::const_iterator it;
+    if (symtab && (it = symtab->findNearest(target)) != symtab->end()) {
+        ccprintf(response, " <%s", it->name);
+        if (it->address != target)
+            ccprintf(response, "+%d>", target - it->address);
         else
             ccprintf(response, ">");
     }
diff --git a/src/arch/x86/faults.cc b/src/arch/x86/faults.cc
index 98bd107..0754da3 100644
--- a/src/arch/x86/faults.cc
+++ b/src/arch/x86/faults.cc
@@ -169,7 +169,7 @@
                 panic("Tried to %s unmapped address %#x.\nPC: %#x, Instr: %s",
                       modeStr, addr, tc->pcState().pc(),
                       inst->disassemble(tc->pcState().pc(),
-                          Loader::debugSymbolTable));
+                          &Loader::debugSymbolTable));
             }
         }
     }
diff --git a/src/arch/x86/linux/process.cc b/src/arch/x86/linux/process.cc
index 6b50dbf..2c594e7 100644
--- a/src/arch/x86/linux/process.cc
+++ b/src/arch/x86/linux/process.cc
@@ -566,7 +566,7 @@
     { 306, "syncfs" },
     { 307, "sendmmsg" },
     { 308, "setns" },
-    { 309, "getcpu" },
+    { 309, "getcpu", getcpuFunc },
     { 310, "proess_vm_readv" },
     { 311, "proess_vm_writev" },
     { 312, "kcmp" },
@@ -914,7 +914,7 @@
     { 315, "tee" },
     { 316, "vmsplice" },
     { 317, "move_pages" },
-    { 318, "getcpu" },
+    { 318, "getcpu", getcpuFunc },
     { 319, "epoll_pwait" },
     { 320, "utimensat" },
     { 321, "signalfd" },
diff --git a/src/arch/x86/stacktrace.cc b/src/arch/x86/stacktrace.cc
index a7c548e..0e5341c 100644
--- a/src/arch/x86/stacktrace.cc
+++ b/src/arch/x86/stacktrace.cc
@@ -47,11 +47,10 @@
     PortProxy &vp = tc->getVirtProxy();
     const auto *symtab = tc->getSystemPtr()->workload->symtab(tc);
 
-    Addr addr;
-    if (!symtab->findAddress(name, addr))
-        panic("thread info not compiled into kernel\n");
+    auto it = symtab->find(name);
+    panic_if(it == symtab->end(), "Thread info not compiled into kernel.");
 
-    return vp.read<int32_t>(addr, GuestByteOrder);
+    return vp.read<int32_t>(it->address, GuestByteOrder);
 }
 
 ProcessInfo::ProcessInfo(ThreadContext *_tc) : tc(_tc)
@@ -196,14 +195,15 @@
     std::string symbol;
     for (int i = 0, size = stack.size(); i < size; ++i) {
         Addr addr = stack[size - i - 1];
+        Loader::SymbolTable::const_iterator it;
         if (addr == user)
             symbol = "user";
         else if (addr == console)
             symbol = "console";
         else if (addr == unknown)
             symbol = "unknown";
-        else
-            symtab->findSymbol(addr, symbol);
+        else if ((it = symtab->find(addr)) != symtab->end())
+            symbol = it->name;
 
         DPRINTFN("%#x: %s\n", addr, symbol);
     }
diff --git a/src/arch/x86/tlb.cc b/src/arch/x86/tlb.cc
index 8068423..4e9d4be 100644
--- a/src/arch/x86/tlb.cc
+++ b/src/arch/x86/tlb.cc
@@ -268,7 +268,7 @@
             [func, mode](ThreadContext *tc, PacketPtr pkt) -> Cycles
             {
                 uint64_t ret;
-                PseudoInst::pseudoInst<X86PseudoInstABI>(tc, func, ret);
+                PseudoInst::pseudoInst<X86PseudoInstABI, true>(tc, func, ret);
                 if (mode == Read)
                     pkt->setLE(ret);
                 return Cycles(1);
diff --git a/src/base/cp_annotate.cc b/src/base/cp_annotate.cc
index c886e39..159e6e0 100644
--- a/src/base/cp_annotate.cc
+++ b/src/base/cp_annotate.cc
@@ -163,7 +163,9 @@
-    Addr junk;
     char sm[50];
-    if (!TheISA::inUserMode(tc))
-        Loader::debugSymbolTable->findNearestSymbol(
-            tc->readIntReg(ReturnAddressReg), st, junk);
+    if (!TheISA::inUserMode(tc)) {
+        auto it = Loader::debugSymbolTable.findNearest(
+            tc->readIntReg(ReturnAddressReg));
+        if (it != Loader::debugSymbolTable.end())
+            st = it->name;
+    }
 
     tc->getVirtProxy().readString(sm, sm_string, 50);
@@ -337,7 +337,11 @@
     Addr sym_addr = 0;
 
     if (!TheISA::inUserMode(tc)) {
-        Loader::debugSymbolTable->findNearestSymbol(next_pc, sym, sym_addr);
+        auto it = Loader::debugSymbolTable.findNearest(next_pc);
+        if (it != Loader::debugSymbolTable.end()) {
+            sym = it->name;
+            sym_addr = it->address;
+        }
     } else {
         Linux::ThreadInfo ti(tc);
         string app = ti.curTaskName();
@@ -390,7 +390,9 @@
     std::string st;
-    Addr junk;
-    if (!TheISA::inUserMode(tc))
-        Loader::debugSymbolTable->findNearestSymbol(
-            tc->readIntReg(ReturnAddressReg), st, junk);
+    if (!TheISA::inUserMode(tc)) {
+        auto it = Loader::debugSymbolTable.findNearest(
+            tc->readIntReg(ReturnAddressReg));
+        if (it != Loader::debugSymbolTable.end())
+            st = it->name;
+    }
     System *sys = tc->getSystemPtr();
     StringWrap name(sys->name());
diff --git a/src/base/loader/elf_object.cc b/src/base/loader/elf_object.cc
index 8876a87..ceafc53 100644
--- a/src/base/loader/elf_object.cc
+++ b/src/base/loader/elf_object.cc
@@ -341,7 +341,23 @@
                         elf_strptr(elf, shdr.sh_link, sym.st_name);
                     if (sym_name && sym_name[0] != '$') {
                         Addr value = sym.st_value - base + offset;
-                        if (symtab->insert(value & mask, sym_name)) {
+                        Loader::Symbol symbol;
+                        symbol.address = value & mask;
+                        symbol.name = sym_name;
+                        switch (binding) {
+                          case STB_GLOBAL:
+                            symbol.binding = Loader::Symbol::Binding::Global;
+                            break;
+                          case STB_LOCAL:
+                            symbol.binding = Loader::Symbol::Binding::Local;
+                            break;
+                          case STB_WEAK:
+                            symbol.binding = Loader::Symbol::Binding::Weak;
+                            break;
+                          default:
+                            panic("Unrecognized binding type");
+                        }
+                        if (symtab->insert(symbol)) {
                             DPRINTF(Loader, "Symbol: %-40s value %#x\n",
                                     sym_name, value);
                         }
diff --git a/src/base/loader/symtab.cc b/src/base/loader/symtab.cc
index 9e0f1f6..eaada22 100644
--- a/src/base/loader/symtab.cc
+++ b/src/base/loader/symtab.cc
@@ -35,6 +35,7 @@
 
 #include "base/logging.hh"
 #include "base/str.hh"
+#include "base/trace.hh"
 #include "base/types.hh"
 #include "sim/serialize.hh"
 
@@ -43,31 +44,53 @@
 namespace Loader
 {
 
-SymbolTable *debugSymbolTable = NULL;
+SymbolTable debugSymbolTable;
 
 void
 SymbolTable::clear()
 {
-    addrTable.clear();
-    symbolTable.clear();
+    addrMap.clear();
+    nameMap.clear();
+    symbols.clear();
 }
 
 bool
-SymbolTable::insert(Addr address, string symbol)
+SymbolTable::insert(const Symbol &symbol)
 {
-    if (symbol.empty())
+    if (symbol.name.empty())
         return false;
 
-    if (!symbolTable.insert(make_pair(symbol, address)).second)
+    int idx = symbols.size();
+
+    if (!nameMap.insert({ symbol.name, idx }).second)
         return false;
 
     // There can be multiple symbols for the same address, so always
     // update the address multimap when we see a new symbol name.
-    addrTable.insert(make_pair(address, symbol));
+    addrMap.insert({ symbol.address, idx });
+
+    symbols.emplace_back(symbol);
 
     return true;
 }
 
+bool
+SymbolTable::insert(const SymbolTable &other)
+{
+    // Check if any symbol in other already exists in our table.
+    NameMap intersection;
+    std::set_intersection(other.nameMap.begin(), other.nameMap.end(),
+                          nameMap.begin(), nameMap.end(),
+                          std::inserter(intersection, intersection.begin()),
+                          nameMap.value_comp());
+    if (!intersection.empty())
+        return false;
+
+    for (const Symbol &symbol: other)
+        insert(symbol);
+
+    return true;
+}
 
 bool
 SymbolTable::load(const string &filename)
@@ -92,51 +115,54 @@
         if (address.empty())
             return false;
 
-        string symbol = buffer.substr(idx + 1);
-        eat_white(symbol);
-        if (symbol.empty())
+        string name = buffer.substr(idx + 1);
+        eat_white(name);
+        if (name.empty())
             return false;
 
         Addr addr;
         if (!to_number(address, addr))
             return false;
 
-        if (!insert(addr, symbol))
+        if (!insert({ Symbol::Binding::Global, name, addr }))
             return false;
     }
 
     file.close();
-
     return true;
 }
 
 void
 SymbolTable::serialize(const string &base, CheckpointOut &cp) const
 {
-    paramOut(cp, base + ".size", addrTable.size());
+    paramOut(cp, base + ".size", symbols.size());
 
     int i = 0;
-    ATable::const_iterator p, end = addrTable.end();
-    for (p = addrTable.begin(); p != end; ++p) {
-        paramOut(cp, csprintf("%s.addr_%d", base, i), p->first);
-        paramOut(cp, csprintf("%s.symbol_%d", base, i), p->second);
-        ++i;
+    for (auto &symbol: symbols) {
+        paramOut(cp, csprintf("%s.addr_%d", base, i), symbol.address);
+        paramOut(cp, csprintf("%s.symbol_%d", base, i), symbol.name);
+        paramOut(cp, csprintf("%s.binding_%d", base, i), (int)symbol.binding);
+        i++;
     }
 }
 
 void
-SymbolTable::unserialize(const string &base, CheckpointIn &cp)
+SymbolTable::unserialize(const string &base, CheckpointIn &cp,
+                         Symbol::Binding default_binding)
 {
     clear();
     int size;
     paramIn(cp, base + ".size", size);
     for (int i = 0; i < size; ++i) {
-        Addr addr;
-        std::string symbol;
+        Addr address;
+        std::string name;
+        Symbol::Binding binding = default_binding;
 
-        paramIn(cp, csprintf("%s.addr_%d", base, i), addr);
-        paramIn(cp, csprintf("%s.symbol_%d", base, i), symbol);
-        insert(addr, symbol);
+        paramIn(cp, csprintf("%s.addr_%d", base, i), address);
+        paramIn(cp, csprintf("%s.symbol_%d", base, i), name);
+        if (!optParamIn(cp, csprintf("%s.binding_%d", base, i), binding))
+            binding = default_binding;
+        insert({binding, name, address});
     }
 }
 
diff --git a/src/base/loader/symtab.hh b/src/base/loader/symtab.hh
index b09d854..1e99fec 100644
--- a/src/base/loader/symtab.hh
+++ b/src/base/loader/symtab.hh
@@ -29,143 +29,205 @@
 #ifndef __SYMTAB_HH__
 #define __SYMTAB_HH__
 
+#include <functional>
 #include <iosfwd>
 #include <map>
+#include <memory>
 #include <string>
+#include <vector>
 
+#include "base/trace.hh"
 #include "base/types.hh"
 #include "sim/serialize.hh"
 
 namespace Loader
 {
 
+struct Symbol
+{
+    enum class Binding {
+        Global,
+        Local,
+        Weak
+    };
+
+    Binding binding;
+    std::string name;
+    Addr address;
+};
+
 class SymbolTable
 {
   public:
-    typedef std::multimap<Addr, std::string> ATable;
-    typedef std::map<std::string, Addr> STable;
+    typedef std::shared_ptr<SymbolTable> SymbolTablePtr;
 
   private:
-    ATable addrTable;
-    STable symbolTable;
+    typedef std::vector<Symbol> SymbolVector;
+    // Map addresses to an index into the symbol vector.
+    typedef std::multimap<Addr, int> AddrMap;
+    // Map a symbol name to an index into the symbol vector.
+    typedef std::map<std::string, int> NameMap;
 
-  private:
+    SymbolVector symbols;
+    AddrMap addrMap;
+    NameMap nameMap;
+
     bool
-    upperBound(Addr addr, ATable::const_iterator &iter) const
+    upperBound(Addr addr, AddrMap::const_iterator &iter) const
     {
         // find first key *larger* than desired address
-        iter = addrTable.upper_bound(addr);
+        iter = addrMap.upper_bound(addr);
 
         // if very first key is larger, we're out of luck
-        if (iter == addrTable.begin())
+        if (iter == addrMap.begin())
             return false;
 
         return true;
     }
 
+    typedef std::function<void(SymbolTable &symtab,
+                               const Symbol &symbol)> SymTabOp;
+    SymbolTablePtr
+    operate(SymTabOp op) const
+    {
+        SymbolTablePtr symtab(new SymbolTable);
+        for (const auto &symbol: symbols)
+            op(*symtab, symbol);
+        return symtab;
+    }
+
+    typedef std::function<bool(const Symbol &symbol)> SymTabFilter;
+    SymbolTablePtr
+    filter(SymTabFilter filter) const
+    {
+        SymTabOp apply_filter =
+            [filter](SymbolTable &symtab, const Symbol &symbol) {
+            if (filter(symbol)) {
+                symtab.insert(symbol);
+            }
+        };
+        return operate(apply_filter);
+    }
+
+    SymbolTablePtr
+    filterByBinding(Symbol::Binding binding) const
+    {
+        auto filt = [binding](const Symbol &symbol) {
+            return symbol.binding == binding;
+        };
+        return filter(filt);
+    }
+
   public:
-    SymbolTable() {}
-    SymbolTable(const std::string &file) { load(file); }
-    ~SymbolTable() {}
+    typedef SymbolVector::iterator iterator;
+    typedef SymbolVector::const_iterator const_iterator;
+
+    const_iterator begin() const { return symbols.begin(); }
+    const_iterator end() const { return symbols.end(); }
 
     void clear();
-    bool insert(Addr address, std::string symbol);
+    // Insert either a single symbol or the contents of an entire symbol table
+    // into this one.
+    bool insert(const Symbol &symbol);
+    bool insert(const SymbolTable &other);
     bool load(const std::string &file);
+    bool empty() const { return symbols.empty(); }
 
-    const ATable &getAddrTable() const { return addrTable; }
-    const STable &getSymbolTable() const { return symbolTable; }
-
-  public:
-    void serialize(const std::string &base, CheckpointOut &cp) const;
-    void unserialize(const std::string &base, CheckpointIn &cp);
-
-  public:
-    bool
-    findSymbol(Addr address, std::string &symbol) const
+    SymbolTablePtr
+    offset(Addr by) const
     {
-        ATable::const_iterator i = addrTable.find(address);
-        if (i == addrTable.end())
-            return false;
+        SymTabOp op = [by](SymbolTable &symtab, const Symbol &symbol) {
+            Symbol sym = symbol;
+            sym.address += by;
+            symtab.insert(sym);
+        };
+        return operate(op);
+    }
+
+    SymbolTablePtr
+    mask(Addr m) const
+    {
+        SymTabOp op = [m](SymbolTable &symtab, const Symbol &symbol) {
+            Symbol sym = symbol;
+            sym.address &= m;
+            symtab.insert(sym);
+        };
+        return operate(op);
+    }
+
+    SymbolTablePtr
+    globals() const
+    {
+        return filterByBinding(Symbol::Binding::Global);
+    }
+
+    SymbolTablePtr
+    locals() const
+    {
+        return filterByBinding(Symbol::Binding::Local);
+    }
+
+    SymbolTablePtr
+    weaks() const
+    {
+        return filterByBinding(Symbol::Binding::Weak);
+    }
+
+    void serialize(const std::string &base, CheckpointOut &cp) const;
+    void unserialize(const std::string &base, CheckpointIn &cp,
+                     Symbol::Binding default_binding=Symbol::Binding::Global);
+
+    const_iterator
+    find(Addr address) const
+    {
+        AddrMap::const_iterator i = addrMap.find(address);
+        if (i == addrMap.end())
+            return end();
 
         // There are potentially multiple symbols that map to the same
         // address. For simplicity, just return the first one.
-        symbol = (*i).second;
-        return true;
+        return symbols.begin() + i->second;
     }
 
-    bool
-    findAddress(const std::string &symbol, Addr &address) const
+    const_iterator
+    find(const std::string &name) const
     {
-        STable::const_iterator i = symbolTable.find(symbol);
-        if (i == symbolTable.end())
-            return false;
+        NameMap::const_iterator i = nameMap.find(name);
+        if (i == nameMap.end())
+            return end();
 
-        address = (*i).second;
-        return true;
+        return symbols.begin() + i->second;
     }
 
     /// Find the nearest symbol equal to or less than the supplied
     /// address (e.g., the label for the enclosing function).
     /// @param addr     The address to look up.
-    /// @param symbol   Return reference for symbol string.
-    /// @param symaddr  Return reference for symbol address.
     /// @param nextaddr Address of following symbol (for
     ///                 determining valid range of symbol).
-    /// @retval True if a symbol was found.
-    bool
-    findNearestSymbol(Addr addr, std::string &symbol, Addr &symaddr,
-                      Addr &nextaddr) const
+    /// @retval A const_iterator which points to the symbol if found, or end.
+    const_iterator
+    findNearest(Addr addr, Addr &nextaddr) const
     {
-        ATable::const_iterator i;
+        AddrMap::const_iterator i = addrMap.end();
         if (!upperBound(addr, i))
-            return false;
+            return end();
 
         nextaddr = i->first;
         --i;
-        symaddr = i->first;
-        symbol = i->second;
-        return true;
+        return symbols.begin() + i->second;
     }
 
     /// Overload for callers who don't care
     /// about nextaddr.
-    bool
-    findNearestSymbol(Addr addr, std::string &symbol, Addr &symaddr) const
+    const_iterator
+    findNearest(Addr addr) const
     {
-        ATable::const_iterator i;
+        AddrMap::const_iterator i = addrMap.end();
         if (!upperBound(addr, i))
-            return false;
+            return end();
 
         --i;
-        symaddr = i->first;
-        symbol = i->second;
-        return true;
-    }
-
-
-    bool
-    findNearestAddr(Addr addr, Addr &symaddr, Addr &nextaddr) const
-    {
-        ATable::const_iterator i;
-        if (!upperBound(addr, i))
-            return false;
-
-        nextaddr = i->first;
-        --i;
-        symaddr = i->first;
-        return true;
-    }
-
-    bool
-    findNearestAddr(Addr addr, Addr &symaddr) const
-    {
-        ATable::const_iterator i;
-        if (!upperBound(addr, i))
-            return false;
-
-        --i;
-        symaddr = i->first;
-        return true;
+        return symbols.begin() + i->second;
     }
 };
 
@@ -173,7 +235,7 @@
 /// there should be one of these per System object for full system,
 /// and per Process object for non-full-system, but so far one big
 /// global one has worked well enough.
-extern SymbolTable *debugSymbolTable;
+extern SymbolTable debugSymbolTable;
 
 } // namespace Loader
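
Taken together, the new interface replaces the bool/out-parameter lookups
with iterator-returning ones. A short usage sketch against the header above
(the symbol name and addresses are made up for illustration):

    #include <cstdio>

    #include "base/loader/symtab.hh"

    void
    example()
    {
        Loader::SymbolTable symtab;
        symtab.insert({ Loader::Symbol::Binding::Global, "foo", 0x1000 });

        // find() returns an iterator instead of filling out-parameters.
        auto it = symtab.find("foo");
        if (it != symtab.end())
            std::printf("foo @ %#llx\n", (unsigned long long)it->address);

        // findNearest() supports "which function contains this PC" queries.
        auto near = symtab.findNearest(0x1010);
        if (near != symtab.end())
            std::printf("%s+%llu\n", near->name.c_str(),
                        (unsigned long long)(0x1010 - near->address));
    }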
 
diff --git a/src/cpu/BaseCPU.py b/src/cpu/BaseCPU.py
index ab70d1d..e487cbb 100644
--- a/src/cpu/BaseCPU.py
+++ b/src/cpu/BaseCPU.py
@@ -149,7 +149,6 @@
         "enable statistics pseudo instructions")
 
     profile = Param.Latency('0ns', "trace the kernel stack")
-    do_quiesce = Param.Bool(True, "enable quiesce instructions")
 
     wait_for_remote_gdb = Param.Bool(False,
         "Wait for a remote GDB connection");
diff --git a/src/cpu/base.cc b/src/cpu/base.cc
index 3647482..dc3cbf0 100644
--- a/src/cpu/base.cc
+++ b/src/cpu/base.cc
@@ -751,21 +751,24 @@
 void
 BaseCPU::traceFunctionsInternal(Addr pc)
 {
-    if (!Loader::debugSymbolTable)
+    if (Loader::debugSymbolTable.empty())
         return;
 
     // if pc enters different function, print new function symbol and
     // update saved range.  Otherwise do nothing.
     if (pc < currentFunctionStart || pc >= currentFunctionEnd) {
-        string sym_str;
-        bool found = Loader::debugSymbolTable->findNearestSymbol(
-                pc, sym_str, currentFunctionStart, currentFunctionEnd);
+        auto it = Loader::debugSymbolTable.findNearest(
+                pc, currentFunctionEnd);
 
-        if (!found) {
+        string sym_str;
+        if (it == Loader::debugSymbolTable.end()) {
             // no symbol found: use addr as label
-            sym_str = csprintf("0x%x", pc);
+            sym_str = csprintf("%#x", pc);
             currentFunctionStart = pc;
             currentFunctionEnd = pc + 1;
+        } else {
+            sym_str = it->name;
+            currentFunctionStart = it->address;
         }
 
         ccprintf(*functionTraceStream, " (%d)\n%d: %s",
diff --git a/src/cpu/exetrace.cc b/src/cpu/exetrace.cc
index 06154da..c9c8b68 100644
--- a/src/cpu/exetrace.cc
+++ b/src/cpu/exetrace.cc
@@ -76,34 +76,34 @@
     if (Debug::ExecThread)
         outs << "T" << thread->threadId() << " : ";
 
-    std::string sym_str;
-    Addr sym_addr;
     Addr cur_pc = pc.instAddr();
-    if (Loader::debugSymbolTable && Debug::ExecSymbol &&
-            (!FullSystem || !inUserMode(thread)) &&
-            Loader::debugSymbolTable->findNearestSymbol(
-                cur_pc, sym_str, sym_addr)) {
-        if (cur_pc != sym_addr)
-            sym_str += csprintf("+%d",cur_pc - sym_addr);
-        outs << "@" << sym_str;
+    Loader::SymbolTable::const_iterator it;
+    if (Debug::ExecSymbol && (!FullSystem || !inUserMode(thread)) &&
+            (it = Loader::debugSymbolTable.findNearest(cur_pc)) !=
+                Loader::debugSymbolTable.end()) {
+        Addr delta = cur_pc - it->address;
+        if (delta)
+            ccprintf(outs, "@%s+%d", it->name, delta);
+        else
+            ccprintf(outs, "@%s", it->name);
     } else {
-        outs << "0x" << hex << cur_pc;
+        ccprintf(outs, "%#x", cur_pc);
     }
 
     if (inst->isMicroop()) {
-        outs << "." << setw(2) << dec << pc.microPC();
+        ccprintf(outs, ".%2d", pc.microPC());
     } else {
-        outs << "   ";
+        ccprintf(outs, "   ");
     }
 
-    outs << " : ";
+    ccprintf(outs, " : ");
 
     //
     //  Print decoded instruction
     //
 
     outs << setw(26) << left;
-    outs << inst->disassemble(cur_pc, Loader::debugSymbolTable);
+    outs << inst->disassemble(cur_pc, &Loader::debugSymbolTable);
 
     if (ran) {
         outs << " : ";
diff --git a/src/cpu/kvm/base.cc b/src/cpu/kvm/base.cc
index 38fea19..d44bb3d 100644
--- a/src/cpu/kvm/base.cc
+++ b/src/cpu/kvm/base.cc
@@ -223,7 +223,7 @@
 void
 BaseKvmCPU::finishMMIOPending()
 {
-    assert(_status = RunningMMIOPending);
+    assert(_status == RunningMMIOPending);
     assert(!tickEvent.scheduled());
 
     _status = RunningServiceCompletion;
diff --git a/src/cpu/minor/lsq.cc b/src/cpu/minor/lsq.cc
index e50d498..e4a9dc0 100644
--- a/src/cpu/minor/lsq.cc
+++ b/src/cpu/minor/lsq.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2014,2017-2018 ARM Limited
+ * Copyright (c) 2013-2014,2017-2018,2020 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -1029,10 +1029,11 @@
 
     bool is_load = request->isLoad;
     bool is_llsc = request->request->isLLSC();
+    bool is_release = request->request->isRelease();
     bool is_swap = request->request->isSwap();
     bool is_atomic = request->request->isAtomic();
     bool bufferable = !(request->request->isStrictlyOrdered() ||
-                        is_llsc || is_swap || is_atomic);
+                        is_llsc || is_swap || is_atomic || is_release);
 
     if (is_load) {
         if (numStoresInTransfers != 0) {
@@ -1050,6 +1051,15 @@
         }
     }
 
+    // Process store conditionals or store releases only after all
+    // previous stores have completed
+    if (((!is_load && is_llsc) || is_release) &&
+        !storeBuffer.isDrained()) {
+        DPRINTF(MinorMem, "Memory access needs to wait for store buffer"
+                          " to drain\n");
+        return;
+    }
+
     /* Check if this is the head instruction (and so must be executable as
      *  its stream sequence number was checked above) for loads which must
      *  not be speculatively issued and stores which must be issued here */
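
Note: the minor LSQ change above gates store conditionals and store releases on the store buffer being fully drained. A reduced model of the predicate, with assumed names (AccessFlags, mustWaitForDrain) rather than the gem5 types:

    // A store conditional or a release access may only issue once every
    // earlier store has left the store buffer.
    struct AccessFlags { bool isLoad; bool isLLSC; bool isRelease; };

    bool
    mustWaitForDrain(const AccessFlags &req, bool storeBufferDrained)
    {
        bool ordered_store = (!req.isLoad && req.isLLSC) || req.isRelease;
        return ordered_store && !storeBufferDrained;
    }
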
diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh
index f7fb3fe..7383c6f 100644
--- a/src/cpu/o3/lsq_unit_impl.hh
+++ b/src/cpu/o3/lsq_unit_impl.hh
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2010-2014, 2017-2019 ARM Limited
+ * Copyright (c) 2010-2014, 2017-2020 ARM Limited
  * Copyright (c) 2013 Advanced Micro Devices, Inc.
  * All rights reserved
  *
@@ -753,6 +753,21 @@
 
         DynInstPtr inst = storeWBIt->instruction();
         LSQRequest* req = storeWBIt->request();
+
+        // Process store conditionals or store releases only after all
+        // previous stores have completed
+        if ((req->mainRequest()->isLLSC() ||
+             req->mainRequest()->isRelease()) &&
+             (storeWBIt.idx() != storeQueue.head())) {
+            DPRINTF(LSQUnit, "Store idx:%i PC:%s to Addr:%#x "
+                "[sn:%lli] is %s%s and not head of the queue\n",
+                storeWBIt.idx(), inst->pcState(),
+                req->request()->getPaddr(), inst->seqNum,
+                req->mainRequest()->isLLSC() ? "SC" : "",
+                req->mainRequest()->isRelease() ? "/Release" : "");
+            break;
+        }
+
         storeWBIt->committed() = true;
 
         assert(!inst->memData);
diff --git a/src/cpu/o3/mem_dep_unit.hh b/src/cpu/o3/mem_dep_unit.hh
index c4a3310..3d24b1f 100644
--- a/src/cpu/o3/mem_dep_unit.hh
+++ b/src/cpu/o3/mem_dep_unit.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012, 2014 ARM Limited
+ * Copyright (c) 2012, 2014, 2020 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -45,6 +45,7 @@
 #include <memory>
 #include <set>
 #include <unordered_map>
+#include <unordered_set>
 
 #include "base/statistics.hh"
 #include "cpu/inst_seq.hh"
@@ -177,7 +178,7 @@
       public:
         /** Constructs a memory dependence entry. */
         MemDepEntry(const DynInstPtr &new_inst)
-            : inst(new_inst), regsReady(false), memDepReady(false),
+            : inst(new_inst), regsReady(false), memDeps(0),
               completed(false), squashed(false)
         {
 #ifdef DEBUG
@@ -216,8 +217,8 @@
 
         /** If the registers are ready or not. */
         bool regsReady;
-        /** If all memory dependencies have been satisfied. */
-        bool memDepReady;
+        /** Number of memory dependencies that need to be satisfied. */
+        int memDeps;
         /** If the instruction is completed. */
         bool completed;
         /** If the instruction is squashed. */
@@ -257,14 +258,20 @@
      */
     MemDepPred depPred;
 
+    /** Sequence numbers of outstanding load barriers. */
+    std::unordered_set<InstSeqNum> loadBarrierSNs;
+
+    /** Sequence numbers of outstanding store barriers. */
+    std::unordered_set<InstSeqNum> storeBarrierSNs;
+
     /** Is there an outstanding load barrier that loads must wait on. */
-    bool loadBarrier;
-    /** The sequence number of the load barrier. */
-    InstSeqNum loadBarrierSN;
+    bool hasLoadBarrier() const { return !loadBarrierSNs.empty(); }
+
     /** Is there an outstanding store barrier that loads must wait on. */
-    bool storeBarrier;
-    /** The sequence number of the store barrier. */
-    InstSeqNum storeBarrierSN;
+    bool hasStoreBarrier() const { return !storeBarrierSNs.empty(); }
+
+    /** Inserts the SN of a barrier inst. into the list of tracked barriers */
+    void insertBarrierSN(const DynInstPtr &barr_inst);
 
     /** Pointer to the IQ. */
     InstructionQueue<Impl> *iqPtr;
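
Note: the header change replaces the single loadBarrier/storeBarrier flag-and-SN pairs with sets of outstanding barrier sequence numbers, so several barriers can be in flight at once. A self-contained sketch of that bookkeeping (BarrierTracker is an illustrative name, not the gem5 class):

    #include <cstdint>
    #include <unordered_set>

    using InstSeqNum = uint64_t;

    struct BarrierTracker
    {
        std::unordered_set<InstSeqNum> loadBarrierSNs;
        std::unordered_set<InstSeqNum> storeBarrierSNs;

        bool hasLoadBarrier() const { return !loadBarrierSNs.empty(); }
        bool hasStoreBarrier() const { return !storeBarrierSNs.empty(); }

        // A full memory barrier orders loads and stores; a write
        // barrier only orders stores.
        void insert(InstSeqNum sn, bool memBarrier, bool writeBarrier)
        {
            if (memBarrier) {
                loadBarrierSNs.insert(sn);
                storeBarrierSNs.insert(sn);
            } else if (writeBarrier) {
                storeBarrierSNs.insert(sn);
            }
        }

        void complete(InstSeqNum sn)
        {
            loadBarrierSNs.erase(sn);
            storeBarrierSNs.erase(sn);
        }
    };
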
diff --git a/src/cpu/o3/mem_dep_unit_impl.hh b/src/cpu/o3/mem_dep_unit_impl.hh
index c712965..d1eac29 100644
--- a/src/cpu/o3/mem_dep_unit_impl.hh
+++ b/src/cpu/o3/mem_dep_unit_impl.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012, 2014 ARM Limited
+ * Copyright (c) 2012, 2014, 2020 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -42,6 +42,7 @@
 #define __CPU_O3_MEM_DEP_UNIT_IMPL_HH__
 
 #include <map>
+#include <vector>
 
 #include "cpu/o3/inst_queue.hh"
 #include "cpu/o3/mem_dep_unit.hh"
@@ -50,8 +51,7 @@
 
 template <class MemDepPred, class Impl>
 MemDepUnit<MemDepPred, Impl>::MemDepUnit()
-    : loadBarrier(false), loadBarrierSN(0), storeBarrier(false),
-      storeBarrierSN(0), iqPtr(NULL)
+    : iqPtr(NULL)
 {
 }
 
@@ -60,8 +60,7 @@
     : _name(params->name + ".memdepunit"),
       depPred(params->store_set_clear_period, params->SSITSize,
               params->LFSTSize),
-      loadBarrier(false), loadBarrierSN(0), storeBarrier(false),
-      storeBarrierSN(0), iqPtr(NULL)
+      iqPtr(NULL)
 {
     DPRINTF(MemDepUnit, "Creating MemDepUnit object.\n");
 }
@@ -155,8 +154,8 @@
 MemDepUnit<MemDepPred, Impl>::takeOverFrom()
 {
     // Be sure to reset all state.
-    loadBarrier = storeBarrier = false;
-    loadBarrierSN = storeBarrierSN = 0;
+    loadBarrierSNs.clear();
+    storeBarrierSNs.clear();
     depPred.clear();
 }
 
@@ -169,6 +168,29 @@
 
 template <class MemDepPred, class Impl>
 void
+MemDepUnit<MemDepPred, Impl>::insertBarrierSN(const DynInstPtr &barr_inst)
+{
+    InstSeqNum barr_sn = barr_inst->seqNum;
+    // Memory barriers block loads and stores, write barriers only stores.
+    if (barr_inst->isMemBarrier()) {
+        loadBarrierSNs.insert(barr_sn);
+        storeBarrierSNs.insert(barr_sn);
+        DPRINTF(MemDepUnit, "Inserted a memory barrier %s SN:%lli\n",
+                barr_inst->pcState(), barr_sn);
+    } else if (barr_inst->isWriteBarrier()) {
+        storeBarrierSNs.insert(barr_sn);
+        DPRINTF(MemDepUnit, "Inserted a write barrier %s SN:%lli\n",
+                barr_inst->pcState(), barr_sn);
+    }
+    if (loadBarrierSNs.size() || storeBarrierSNs.size()) {
+        DPRINTF(MemDepUnit, "Outstanding load barriers = %d; "
+                            "store barriers = %d\n",
+                loadBarrierSNs.size(), storeBarrierSNs.size());
+    }
+}
+
+template <class MemDepPred, class Impl>
+void
 MemDepUnit<MemDepPred, Impl>::insert(const DynInstPtr &inst)
 {
     ThreadID tid = inst->threadNumber;
@@ -188,39 +210,46 @@
 
     // Check any barriers and the dependence predictor for any
     // producing memrefs/stores.
-    InstSeqNum producing_store;
-    if ((inst->isLoad() || inst->isAtomic()) && loadBarrier) {
-        DPRINTF(MemDepUnit, "Load barrier [sn:%lli] in flight\n",
-                loadBarrierSN);
-        producing_store = loadBarrierSN;
-    } else if ((inst->isStore() || inst->isAtomic()) && storeBarrier) {
-        DPRINTF(MemDepUnit, "Store barrier [sn:%lli] in flight\n",
-                storeBarrierSN);
-        producing_store = storeBarrierSN;
+    std::vector<InstSeqNum>  producing_stores;
+    if ((inst->isLoad() || inst->isAtomic()) && hasLoadBarrier()) {
+        DPRINTF(MemDepUnit, "%d load barriers in flight\n",
+                loadBarrierSNs.size());
+        producing_stores.insert(std::end(producing_stores),
+                                std::begin(loadBarrierSNs),
+                                std::end(loadBarrierSNs));
+    } else if ((inst->isStore() || inst->isAtomic()) && hasStoreBarrier()) {
+        DPRINTF(MemDepUnit, "%d store barriers in flight\n",
+                storeBarrierSNs.size());
+        producing_stores.insert(std::end(producing_stores),
+                                std::begin(storeBarrierSNs),
+                                std::end(storeBarrierSNs));
     } else {
-        producing_store = depPred.checkInst(inst->instAddr());
+        InstSeqNum dep = depPred.checkInst(inst->instAddr());
+        if (dep != 0)
+            producing_stores.push_back(dep);
     }
 
-    MemDepEntryPtr store_entry = NULL;
+    std::vector<MemDepEntryPtr> store_entries;
 
     // If there is a producing store, try to find the entry.
-    if (producing_store != 0) {
-        DPRINTF(MemDepUnit, "Searching for producer\n");
+    for (auto producing_store : producing_stores) {
+        DPRINTF(MemDepUnit, "Searching for producer [sn:%lli]\n",
+                            producing_store);
         MemDepHashIt hash_it = memDepHash.find(producing_store);
 
         if (hash_it != memDepHash.end()) {
-            store_entry = (*hash_it).second;
-            DPRINTF(MemDepUnit, "Proucer found\n");
+            store_entries.push_back((*hash_it).second);
+            DPRINTF(MemDepUnit, "Producer found\n");
         }
     }
 
     // If no store entry, then instruction can issue as soon as the registers
     // are ready.
-    if (!store_entry) {
+    if (store_entries.empty()) {
         DPRINTF(MemDepUnit, "No dependency for inst PC "
                 "%s [sn:%lli].\n", inst->pcState(), inst->seqNum);
 
-        inst_entry->memDepReady = true;
+        assert(inst_entry->memDeps == 0);
 
         if (inst->readyToIssue()) {
             inst_entry->regsReady = true;
@@ -229,8 +258,9 @@
         }
     } else {
         // Otherwise make the instruction dependent on the store/barrier.
-        DPRINTF(MemDepUnit, "Adding to dependency list; "
-                "inst PC %s is dependent on [sn:%lli].\n",
+        DPRINTF(MemDepUnit, "Adding to dependency list\n");
+        for (auto M5_VAR_USED producing_store : producing_stores)
+            DPRINTF(MemDepUnit, "\tinst PC %s is dependent on [sn:%lli].\n",
                 inst->pcState(), producing_store);
 
         if (inst->readyToIssue()) {
@@ -241,7 +271,10 @@
         inst->clearCanIssue();
 
         // Add this instruction to the list of dependents.
-        store_entry->dependInsts.push_back(inst_entry);
+        for (auto store_entry : store_entries)
+            store_entry->dependInsts.push_back(inst_entry);
+
+        inst_entry->memDeps = store_entries.size();
 
         if (inst->isLoad()) {
             ++conflictingLoads;
@@ -250,6 +283,9 @@
         }
     }
 
+    // For load-acquire/store-release insts that can also act as a barrier
+    insertBarrierSN(inst);
+
     if (inst->isStore() || inst->isAtomic()) {
         DPRINTF(MemDepUnit, "Inserting store/atomic PC %s [sn:%lli].\n",
                 inst->pcState(), inst->seqNum);
@@ -268,21 +304,7 @@
 void
 MemDepUnit<MemDepPred, Impl>::insertNonSpec(const DynInstPtr &inst)
 {
-    ThreadID tid = inst->threadNumber;
-
-    MemDepEntryPtr inst_entry = std::make_shared<MemDepEntry>(inst);
-
-    // Insert the MemDepEntry into the hash.
-    memDepHash.insert(
-        std::pair<InstSeqNum, MemDepEntryPtr>(inst->seqNum, inst_entry));
-#ifdef DEBUG
-    MemDepEntry::memdep_insert++;
-#endif
-
-    // Add the instruction to the list.
-    instList[tid].push_back(inst);
-
-    inst_entry->listIt = --(instList[tid].end());
+    insertBarrier(inst);
 
     // Might want to turn this part into an inline function or something.
     // It's shared between both insert functions.
@@ -304,28 +326,13 @@
 void
 MemDepUnit<MemDepPred, Impl>::insertBarrier(const DynInstPtr &barr_inst)
 {
-    InstSeqNum barr_sn = barr_inst->seqNum;
-    // Memory barriers block loads and stores, write barriers only stores.
-    if (barr_inst->isMemBarrier()) {
-        loadBarrier = true;
-        loadBarrierSN = barr_sn;
-        storeBarrier = true;
-        storeBarrierSN = barr_sn;
-        DPRINTF(MemDepUnit, "Inserted a memory barrier %s SN:%lli\n",
-                barr_inst->pcState(),barr_sn);
-    } else if (barr_inst->isWriteBarrier()) {
-        storeBarrier = true;
-        storeBarrierSN = barr_sn;
-        DPRINTF(MemDepUnit, "Inserted a write barrier\n");
-    }
-
     ThreadID tid = barr_inst->threadNumber;
 
     MemDepEntryPtr inst_entry = std::make_shared<MemDepEntry>(barr_inst);
 
     // Add the MemDepEntry to the hash.
     memDepHash.insert(
-        std::pair<InstSeqNum, MemDepEntryPtr>(barr_sn, inst_entry));
+        std::pair<InstSeqNum, MemDepEntryPtr>(barr_inst->seqNum, inst_entry));
 #ifdef DEBUG
     MemDepEntry::memdep_insert++;
 #endif
@@ -334,6 +341,8 @@
     instList[tid].push_back(barr_inst);
 
     inst_entry->listIt = --(instList[tid].end());
+
+    insertBarrierSN(barr_inst);
 }
 
 template <class MemDepPred, class Impl>
@@ -348,7 +357,7 @@
 
     inst_entry->regsReady = true;
 
-    if (inst_entry->memDepReady) {
+    if (inst_entry->memDeps == 0) {
         DPRINTF(MemDepUnit, "Instruction has its memory "
                 "dependencies resolved, adding it to the ready list.\n");
 
@@ -430,18 +439,19 @@
 {
     wakeDependents(inst);
     completed(inst);
-
     InstSeqNum barr_sn = inst->seqNum;
-    DPRINTF(MemDepUnit, "barrier completed: %s SN:%lli\n", inst->pcState(),
-            inst->seqNum);
     if (inst->isMemBarrier()) {
-        if (loadBarrierSN == barr_sn)
-            loadBarrier = false;
-        if (storeBarrierSN == barr_sn)
-            storeBarrier = false;
+        assert(hasLoadBarrier());
+        assert(hasStoreBarrier());
+        loadBarrierSNs.erase(barr_sn);
+        storeBarrierSNs.erase(barr_sn);
+        DPRINTF(MemDepUnit, "Memory barrier completed: %s SN:%lli\n",
+                            inst->pcState(), inst->seqNum);
     } else if (inst->isWriteBarrier()) {
-        if (storeBarrierSN == barr_sn)
-            storeBarrier = false;
+        assert(hasStoreBarrier());
+        storeBarrierSNs.erase(barr_sn);
+        DPRINTF(MemDepUnit, "Write barrier completed: %s SN:%lli\n",
+                            inst->pcState(), inst->seqNum);
     }
 }
 
@@ -469,10 +479,13 @@
                 "[sn:%lli].\n",
                 woken_inst->inst->seqNum);
 
-        if (woken_inst->regsReady && !woken_inst->squashed) {
+        assert(woken_inst->memDeps > 0);
+        woken_inst->memDeps -= 1;
+
+        if ((woken_inst->memDeps == 0) &&
+            woken_inst->regsReady &&
+            !woken_inst->squashed) {
             moveToReady(woken_inst);
-        } else {
-            woken_inst->memDepReady = true;
         }
     }
 
@@ -507,11 +520,9 @@
         DPRINTF(MemDepUnit, "Squashing inst [sn:%lli]\n",
                 (*squash_it)->seqNum);
 
-        if ((*squash_it)->seqNum == loadBarrierSN)
-              loadBarrier = false;
+        loadBarrierSNs.erase((*squash_it)->seqNum);
 
-        if ((*squash_it)->seqNum == storeBarrierSN)
-              storeBarrier = false;
+        storeBarrierSNs.erase((*squash_it)->seqNum);
 
         hash_it = memDepHash.find((*squash_it)->seqNum);
 
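Note: throughout the implementation above, the boolean memDepReady becomes a memDeps counter: an instruction may now wait on several producers (every outstanding barrier plus a predicted store) and becomes ready only when the counter reaches zero. A toy model of that invariant, under assumed names:

    #include <cassert>

    struct DepEntry
    {
        int memDeps = 0;        // outstanding memory-order producers
        bool regsReady = false;
        bool squashed = false;

        bool readyToIssue() const
        {
            return memDeps == 0 && regsReady && !squashed;
        }

        void producerCompleted()
        {
            assert(memDeps > 0);
            --memDeps;
        }
    };

    int main()
    {
        DepEntry e;
        e.memDeps = 2;          // e.g. two barriers in flight
        e.regsReady = true;
        e.producerCompleted();
        assert(!e.readyToIssue());
        e.producerCompleted();
        assert(e.readyToIssue());
        return 0;
    }
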
diff --git a/src/cpu/profile.cc b/src/cpu/profile.cc
index aa2cff8..fee0681 100644
--- a/src/cpu/profile.cc
+++ b/src/cpu/profile.cc
@@ -57,6 +57,7 @@
 
     ccprintf(os, "\n");
 
+    Loader::SymbolTable::const_iterator it;
     for (i = children.begin(); i != end; ++i) {
         Addr addr = i->first;
         string symbol;
@@ -66,7 +67,9 @@
             symbol = "console";
         else if (addr == 3)
             symbol = "unknown";
-        else if (!symtab->findSymbol(addr, symbol))
+        else if ((it = symtab->find(addr)) != symtab->end())
+            symbol = it->name;
+        else
             panic("could not find symbol for address %#x\n", addr);
 
         const ProfileNode *node = i->second;
@@ -127,13 +130,15 @@
         Addr pc = i->first;
         Counter count = i->second;
 
-        std::string symbol;
-        if (pc == 1)
+        Loader::SymbolTable::const_iterator it;
+        if (pc == 1) {
             ccprintf(os, "user %d\n", count);
-        else if (symtab->findSymbol(pc, symbol) && !symbol.empty())
-            ccprintf(os, "%s %d\n", symbol, count);
-        else
+        } else if ((it = symtab->find(pc)) != symtab->end() &&
+                !it->name.empty()) {
+            ccprintf(os, "%s %d\n", it->name, count);
+        } else {
             ccprintf(os, "%#x %d\n", pc, count);
+        }
     }
 
     ccprintf(os, ">>>function data\n");
@@ -145,9 +150,9 @@
 {
     node->count++;
 
-    Addr symaddr;
-    if (symtab->findNearestAddr(pc, symaddr)) {
-        pc_count[symaddr]++;
+    auto it = symtab->findNearest(pc);
+    if (it != symtab->end()) {
+        pc_count[it->address]++;
     } else {
         // record PC even if we don't have a symbol to avoid
         // silently biasing the histogram
diff --git a/src/cpu/thread_context.cc b/src/cpu/thread_context.cc
index de6997a..9b93d75 100644
--- a/src/cpu/thread_context.cc
+++ b/src/cpu/thread_context.cc
@@ -130,9 +130,6 @@
 void
 ThreadContext::quiesce()
 {
-    if (!getCpuPtr()->params()->do_quiesce)
-        return;
-
     DPRINTF(Quiesce, "%s: quiesce()\n", getCpuPtr()->name());
 
     suspend();
@@ -146,9 +143,6 @@
 {
     BaseCPU *cpu = getCpuPtr();
 
-    if (!cpu->params()->do_quiesce)
-        return;
-
     EndQuiesceEvent *quiesceEvent = getQuiesceEvent();
 
     cpu->reschedule(quiesceEvent, resume, true);
diff --git a/src/dev/net/etherswitch.cc b/src/dev/net/etherswitch.cc
index e7a60a1..972cf56 100644
--- a/src/dev/net/etherswitch.cc
+++ b/src/dev/net/etherswitch.cc
@@ -184,7 +184,7 @@
     if (!sendPacket(outputFifo.front())) {
         DPRINTF(Ethernet, "output port busy...retry later\n");
         if (!txEvent.scheduled())
-            parent->schedule(txEvent, curTick() + retryTime);
+            parent->schedule(txEvent, curTick() + SimClock::Int::ns);
     } else {
         DPRINTF(Ethernet, "packet sent: len=%d\n", outputFifo.front()->length);
         outputFifo.pop();
diff --git a/src/dev/net/ethertap.cc b/src/dev/net/ethertap.cc
index 33bafb7..f4aba21 100644
--- a/src/dev/net/ethertap.cc
+++ b/src/dev/net/ethertap.cc
@@ -196,7 +196,7 @@
         DPRINTF(Ethernet, "bus busy...buffer for retransmission\n");
         packetBuffer.push(packet);
         if (!txEvent.scheduled())
-            schedule(txEvent, curTick() + retryTime);
+            schedule(txEvent, curTick() + SimClock::Int::ns);
     } else if (dump) {
         dump->dump(packet);
     }
@@ -218,7 +218,7 @@
     }
 
     if (!packetBuffer.empty() && !txEvent.scheduled())
-        schedule(txEvent, curTick() + retryTime);
+        schedule(txEvent, curTick() + SimClock::Int::ns);
 }
 
 
diff --git a/src/dev/net/ns_gige.cc b/src/dev/net/ns_gige.cc
index 68c4497..71f449a 100644
--- a/src/dev/net/ns_gige.cc
+++ b/src/dev/net/ns_gige.cc
@@ -1395,7 +1395,7 @@
 
    if (!txFifo.empty() && !txEvent.scheduled()) {
        DPRINTF(Ethernet, "reschedule transmit\n");
-       schedule(txEvent, curTick() + retryTime);
+       schedule(txEvent, curTick() + SimClock::Int::ns);
    }
 }
 
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py
index fee0254..7eaf65f 100644
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -129,6 +129,8 @@
                                       "memory pipeline's queues")
     local_mem_queue_size = Param.Int(256, "Number of entries in the local "
                                       "memory pipeline's queues")
+    max_cu_tokens = Param.Int(4, "Maximum number of tokens, i.e., the number"\
+                            " of instructions that can be sent to the coalescer")
     ldsBus = Bridge() # the bridge between the CU and its LDS
     ldsPort = MasterPort("The port that goes to the LDS")
     localDataStore = Param.LdsState("the LDS for this CU")
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index 59bc6a0..cd880d6 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -74,9 +74,9 @@
     req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()),
     resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()),
     _masterId(p->system->getMasterId(this, "ComputeUnit")),
-    lds(*p->localDataStore), _cacheLineSize(p->system->cacheLineSize()),
-    globalSeqNum(0), wavefrontSize(p->wfSize),
-    kernelLaunchInst(new KernelLaunchStaticInst())
+    lds(*p->localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
+    _cacheLineSize(p->system->cacheLineSize()), globalSeqNum(0),
+    wavefrontSize(p->wfSize), kernelLaunchInst(new KernelLaunchStaticInst())
 {
     /**
      * This check is necessary because std::bitset only provides conversion
@@ -139,6 +139,10 @@
 
     memPort.resize(wfSize());
 
+    // Set up tokens for slave ports. The number of tokens in memPortTokens
+    // is the total token count for the entire vector port (i.e., this CU).
+    memPortTokens = new TokenManager(p->max_cu_tokens);
+
     // resize the tlbPort vectorArray
     int tlbPort_width = perLaneTLB ? wfSize() : 1;
     tlbPort.resize(tlbPort_width);
@@ -612,6 +616,8 @@
     vectorAluInstAvail.resize(numSIMDs, false);
     shrMemInstAvail = 0;
     glbMemInstAvail = 0;
+
+    gmTokenPort.setTokenManager(memPortTokens);
 }
 
 bool
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
index a023cb2..49713e9 100644
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -51,6 +51,7 @@
 #include "gpu-compute/schedule_stage.hh"
 #include "gpu-compute/scoreboard_check_stage.hh"
 #include "mem/port.hh"
+#include "mem/token_port.hh"
 #include "sim/clocked_object.hh"
 
 static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1;
@@ -415,6 +416,26 @@
 
     CUExitCallback *cuExitCallback;
 
+    class GMTokenPort : public TokenMasterPort
+    {
+      public:
+        GMTokenPort(const std::string& name, SimObject *owner,
+                    PortID id = InvalidPortID)
+            : TokenMasterPort(name, owner, id)
+        { }
+        ~GMTokenPort() { }
+
+      protected:
+        bool recvTimingResp(PacketPtr) { return false; }
+        void recvReqRetry() { }
+    };
+
+    // Manager for the number of tokens available to this compute unit to
+    // send global memory request packets to the coalescer. This is only
+    // used between the global memory pipe and the TCP coalescer.
+    TokenManager *memPortTokens;
+    GMTokenPort gmTokenPort;
+
     /** Data access Port **/
     class DataPort : public MasterPort
     {
@@ -677,6 +698,12 @@
         return ldsPort;
     }
 
+    TokenManager *
+    getTokenManager()
+    {
+        return memPortTokens;
+    }
+
     /** The memory port for SIMD data accesses.
      *  Can be connected to PhysMem for Ruby for timing simulations
      */
@@ -712,6 +739,8 @@
             }
             ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx);
             return *ldsPort;
+        } else if (if_name == "gmTokenPort") {
+            return gmTokenPort;
         } else {
             panic("incorrect port name");
         }
diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc
index d8e6d47..64778f0 100644
--- a/src/gpu-compute/global_memory_pipeline.cc
+++ b/src/gpu-compute/global_memory_pipeline.cc
@@ -33,6 +33,7 @@
 
 #include "gpu-compute/global_memory_pipeline.hh"
 
+#include "debug/GPUCoalescer.hh"
 #include "debug/GPUMem.hh"
 #include "debug/GPUReg.hh"
 #include "gpu-compute/compute_unit.hh"
@@ -56,6 +57,25 @@
     _name = computeUnit->name() + ".GlobalMemPipeline";
 }
 
+bool
+GlobalMemPipeline::coalescerReady(GPUDynInstPtr mp) const
+{
+    // We require one token from the coalescer's uncoalesced table to
+    // proceed
+    int token_count = 1;
+
+    // Make sure the vector port has tokens. There is a single pool
+    // of tokens so only one port in the vector port needs to be checked.
+    // Lane 0 is chosen arbitrarily.
+    DPRINTF(GPUCoalescer, "Checking for %d tokens\n", token_count);
+    if (!mp->computeUnit()->getTokenManager()->haveTokens(token_count)) {
+        DPRINTF(GPUCoalescer, "Stalling inst because coalescer is busy!\n");
+        return false;
+    }
+
+    return true;
+}
+
 void
 GlobalMemPipeline::exec()
 {
@@ -124,6 +144,14 @@
             }
         }
 
+        DPRINTF(GPUCoalescer, "initiateAcc for %s seqNum %d\n",
+                mp->disassemble(), mp->seqNum());
+        // Memfences never return tokens yet must be issued, so requesting
+        // one would deplete the token count and eventually deadlock.
+        if (!mp->isMemFence()) {
+            assert(mp->computeUnit()->getTokenManager()->haveTokens(1));
+            mp->computeUnit()->getTokenManager()->acquireTokens(1);
+        }
         mp->initiateAcc(mp);
 
         if (!outOfOrderDataDelivery && !mp->isMemFence()) {
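
Note: the pipeline now acquires one token per global memory instruction before calling initiateAcc, and stalls in coalescerReady() when none are available; fences bypass the pool because the coalescer never returns tokens for them. A toy version of that handshake (TokenPool and tryIssue are illustrative, not the gem5 TokenManager API):

    #include <cassert>

    class TokenPool
    {
      public:
        explicit TokenPool(int n) : available(n) {}
        bool haveTokens(int n) const { return available >= n; }
        void acquire(int n) { assert(available >= n); available -= n; }
        void release(int n) { available += n; }
      private:
        int available;
    };

    bool
    tryIssue(TokenPool &pool, bool isMemFence)
    {
        if (isMemFence)
            return true;        // fences are never throttled
        if (!pool.haveTokens(1))
            return false;       // stall: the coalescer is full
        pool.acquire(1);        // given back when the access retires
        return true;
    }
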
diff --git a/src/gpu-compute/global_memory_pipeline.hh b/src/gpu-compute/global_memory_pipeline.hh
index 0bc8596..2f83185 100644
--- a/src/gpu-compute/global_memory_pipeline.hh
+++ b/src/gpu-compute/global_memory_pipeline.hh
@@ -121,6 +121,8 @@
         loadVrfBankConflictCycles += num_cycles;
     }
 
+    bool coalescerReady(GPUDynInstPtr mp) const;
+
   private:
     ComputeUnit *computeUnit;
     std::string _name;
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
index e70a874..46cce9c 100644
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -434,6 +434,11 @@
             return 0;
         }
 
+        // Does the coalescer have space for our instruction?
+        if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) {
+            return 0;
+        }
+
         if (!computeUnit->globalMemoryPipe.
             isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
             // Can we insert a new request to the Global Mem Request FIFO?
@@ -504,6 +509,12 @@
         if (!locMemIssueRdy) {
             return 0;
         }
+
+        // Does the coalescer have space for our instruction?
+        if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) {
+            return 0;
+        }
+
         if (!computeUnit->globalMemoryPipe.
             isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
             // Can we insert a new request to the Global Mem Request FIFO?
diff --git a/src/kern/linux/helpers.cc b/src/kern/linux/helpers.cc
index 9286ab0..80ff526 100644
--- a/src/kern/linux/helpers.cc
+++ b/src/kern/linux/helpers.cc
@@ -96,24 +96,25 @@
     const auto *symtab = system->workload->symtab(tc);
     PortProxy &proxy = tc->getVirtProxy();
 
-    Addr addr_lb = 0, addr_lb_len = 0, addr_first = 0, addr_next = 0;
-    const bool found_symbols =
-        symtab->findAddress("__log_buf", addr_lb) &&
-        symtab->findAddress("log_buf_len", addr_lb_len) &&
-        symtab->findAddress("log_first_idx", addr_first) &&
-        symtab->findAddress("log_next_idx", addr_next);
+    auto lb = symtab->find("__log_buf");
+    auto lb_len = symtab->find("log_buf_len");
+    auto first = symtab->find("log_first_idx");
+    auto next = symtab->find("log_next_idx");
 
-    if (!found_symbols) {
+    auto end_it = symtab->end();
+
+    if (lb == end_it || lb_len == end_it ||
+            first == end_it || next == end_it) {
         warn("Failed to find kernel dmesg symbols.\n");
         return;
     }
 
     uint32_t log_buf_len =
-        proxy.read<uint32_t>(addr_lb_len, TheISA::GuestByteOrder);
+        proxy.read<uint32_t>(lb_len->address, TheISA::GuestByteOrder);
     uint32_t log_first_idx =
-        proxy.read<uint32_t>(addr_first, TheISA::GuestByteOrder);
+        proxy.read<uint32_t>(first->address, TheISA::GuestByteOrder);
     uint32_t log_next_idx =
-        proxy.read<uint32_t>(addr_next, TheISA::GuestByteOrder);
+        proxy.read<uint32_t>(next->address, TheISA::GuestByteOrder);
 
     if (log_first_idx >= log_buf_len || log_next_idx >= log_buf_len) {
         warn("dmesg pointers/length corrupted\n");
@@ -129,7 +130,7 @@
             warn("Unexpected dmesg buffer length\n");
             return;
         }
-        proxy.readBlob(addr_lb + log_first_idx, log_buf.data(), length);
+        proxy.readBlob(lb->address + log_first_idx, log_buf.data(), length);
     } else {
         const int length_2 = log_buf_len - log_first_idx;
         if (length_2 < 0 || length_2 + log_next_idx > log_buf.size()) {
@@ -137,8 +138,8 @@
             return;
         }
         length = log_buf_len;
-        proxy.readBlob(addr_lb + log_first_idx, log_buf.data(), length_2);
-        proxy.readBlob(addr_lb, log_buf.data() + length_2, log_next_idx);
+        proxy.readBlob(lb->address + log_first_idx, log_buf.data(), length_2);
+        proxy.readBlob(lb->address, log_buf.data() + length_2, log_next_idx);
     }
 
     // Print dmesg buffer content
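
Note: the two readBlob calls above reassemble the kernel log when it wraps around the end of the ring buffer. A standalone model of the same copy logic, assuming a plain byte buffer of buf_len bytes in host memory:

    #include <cstdint>
    #include <vector>

    std::vector<uint8_t>
    readRing(const uint8_t *buf, uint32_t buf_len,
             uint32_t first, uint32_t next)
    {
        std::vector<uint8_t> out;
        if (first < next) {
            // contiguous region [first, next)
            out.assign(buf + first, buf + next);
        } else {
            // wrapped: tail [first, buf_len) followed by head [0, next)
            out.assign(buf + first, buf + buf_len);
            out.insert(out.end(), buf, buf + next);
        }
        return out;
    }
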
diff --git a/src/mem/abstract_mem.cc b/src/mem/abstract_mem.cc
index 9323f61..678527e 100644
--- a/src/mem/abstract_mem.cc
+++ b/src/mem/abstract_mem.cc
@@ -80,7 +80,7 @@
     auto *object = Loader::createObjectFile(file, true);
     fatal_if(!object, "%s: Could not load %s.", name(), file);
 
-    panic_if(!object->loadGlobalSymbols(Loader::debugSymbolTable),
+    panic_if(!object->loadGlobalSymbols(&Loader::debugSymbolTable),
              "%s: Could not load symbols from %s.", name(), file);
 
     Loader::MemoryImage image = object->buildImage();
diff --git a/src/mem/cache/prefetch/pif.cc b/src/mem/cache/prefetch/pif.cc
index 6a2983b..c557bd2 100644
--- a/src/mem/cache/prefetch/pif.cc
+++ b/src/mem/cache/prefetch/pif.cc
@@ -75,12 +75,12 @@
     Addr blk_distance = distanceFromTrigger(pc, log_blk_size);
 
     bool hit = (pc > trigger) ?
-        (succ.size() >= blk_distance) : (prec.size() >= blk_distance);
+        (succ.size() > blk_distance) : (prec.size() > blk_distance);
     if (hit && update) {
         if (pc > trigger) {
-            succ[blk_distance - 1] = true;
+            succ[blk_distance] = true;
         } else if (pc < trigger) {
-            prec[blk_distance - 1] = true;
+            prec[blk_distance] = true;
         }
     }
     return hit;
@@ -93,9 +93,9 @@
     Addr blk_distance = distanceFromTrigger(target, log_blk_size);
     bool hit = false;
     if (target > trigger) {
-        hit = blk_distance <= succ.size() && succ[blk_distance - 1];
+        hit = blk_distance < succ.size() && succ[blk_distance];
     } else if (target < trigger) {
-        hit = blk_distance <= prec.size() && succ[blk_distance - 1];
+        hit = blk_distance < prec.size() && prec[blk_distance];
     } else {
         hit = true;
     }
@@ -134,6 +134,7 @@
     // First access to the prefetcher
     if (temporalCompactor.size() == 0) {
         spatialCompactor = CompactorEntry(pc, precSize, succSize);
+        temporalCompactor.push_back(spatialCompactor);
     } else {
         // If the PC of the instruction retired is in the same spatial region
-        // than the last trigger address, update the bit vectors based on the
+        // as the last trigger address, update the bit vectors based on the
@@ -195,12 +196,16 @@
 PIF::calculatePrefetch(const PrefetchInfo &pfi,
     std::vector<AddrPriority> &addresses)
 {
-    const Addr addr = pfi.getAddr();
+    if (!pfi.hasPC()) {
+        return;
+    }
+
+    const Addr pc = pfi.getPC();
 
     // First check if the access has been prefetched, this is done by
     // comparing the access against the active Stream Address Buffers
     for (auto &sabEntry : streamAddressBuffer) {
-        if (sabEntry->hasAddress(addr, lBlkSize)) {
+        if (sabEntry->hasAddress(pc, lBlkSize)) {
             sabEntry++;
             sabEntry->getPredictedAddresses(lBlkSize, addresses);
             // We are done
@@ -210,7 +215,7 @@
 
     // Check if a valid entry in the 'index' table is found and allocate a new
     // active prediction stream
-    IndexEntry *idx_entry = index.findEntry(addr, /* unused */ false);
+    IndexEntry *idx_entry = index.findEntry(pc, /* unused */ false);
 
     if (idx_entry != nullptr) {
         index.accessEntry(idx_entry);
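
Note: the pif.cc fix treats blk_distance as a zero-based index into the succ/prec bit vectors (the old code required distance <= size and indexed at distance - 1, and the prec branch even tested succ by mistake). A reduced model of the corrected bounds check:

    #include <cstddef>
    #include <vector>

    // blk_distance is valid only when strictly inside the vector
    bool
    inRegion(const std::vector<bool> &vec, std::size_t blk_distance)
    {
        return blk_distance < vec.size() && vec[blk_distance];
    }
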
diff --git a/src/mem/ruby/common/Address.cc b/src/mem/ruby/common/Address.cc
index 40ce0fe..39de974 100644
--- a/src/mem/ruby/common/Address.cc
+++ b/src/mem/ruby/common/Address.cc
@@ -56,6 +56,12 @@
     return mbits<Addr>(addr, 63, RubySystem::getBlockSizeBits());
 }
 
+Addr
+makeLineAddress(Addr addr, int cacheLineBits)
+{
+    return maskLowOrderBits(addr, cacheLineBits);
+}
+
 // returns the next stride address based on line address
 Addr
 makeNextStrideAddress(Addr addr, int stride)
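
Note: the new makeLineAddress overload masks with a caller-supplied line size instead of the global RubySystem block size. A standalone equivalent of the masking, assuming the usual power-of-two line size of (1 << cacheLineBits) bytes:

    #include <cstdint>

    using Addr = uint64_t;

    Addr
    makeLineAddress(Addr addr, int cacheLineBits)
    {
        // clear the low-order offset bits, keeping the aligned part
        return addr & ~((Addr(1) << cacheLineBits) - 1);
    }
    // e.g. makeLineAddress(0x12345, 6) == 0x12340
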
diff --git a/src/mem/ruby/common/Address.hh b/src/mem/ruby/common/Address.hh
index 30682fa..e5e320f 100644
--- a/src/mem/ruby/common/Address.hh
+++ b/src/mem/ruby/common/Address.hh
@@ -40,6 +40,7 @@
 Addr maskLowOrderBits(Addr addr, unsigned int number);
 Addr getOffset(Addr addr);
 Addr makeLineAddress(Addr addr);
+Addr makeLineAddress(Addr addr, int cacheLineBits);
 Addr makeNextStrideAddress(Addr addr, int stride);
 std::string printAddress(Addr addr);
 
diff --git a/src/mem/ruby/network/MessageBuffer.cc b/src/mem/ruby/network/MessageBuffer.cc
index f5562dc..3db8515 100644
--- a/src/mem/ruby/network/MessageBuffer.cc
+++ b/src/mem/ruby/network/MessageBuffer.cc
@@ -394,6 +394,42 @@
     m_stall_count++;
 }
 
+bool
+MessageBuffer::hasStalledMsg(Addr addr) const
+{
+    return (m_stall_msg_map.count(addr) != 0);
+}
+
+void
+MessageBuffer::deferEnqueueingMessage(Addr addr, MsgPtr message)
+{
+    DPRINTF(RubyQueue, "Deferring enqueueing message: %s, Address %#x\n",
+            *(message.get()), addr);
+    (m_deferred_msg_map[addr]).push_back(message);
+}
+
+void
+MessageBuffer::enqueueDeferredMessages(Addr addr, Tick curTime, Tick delay)
+{
+    assert(!isDeferredMsgMapEmpty(addr));
+    std::vector<MsgPtr>& msg_vec = m_deferred_msg_map[addr];
+    assert(msg_vec.size() > 0);
+
+    // enqueue all deferred messages associated with this address
+    for (MsgPtr m : msg_vec) {
+        enqueue(m, curTime, delay);
+    }
+
+    msg_vec.clear();
+    m_deferred_msg_map.erase(addr);
+}
+
+bool
+MessageBuffer::isDeferredMsgMapEmpty(Addr addr) const
+{
+    return m_deferred_msg_map.count(addr) == 0;
+}
+
 void
 MessageBuffer::print(ostream& out) const
 {
diff --git a/src/mem/ruby/network/MessageBuffer.hh b/src/mem/ruby/network/MessageBuffer.hh
index 0e11529..8abf3bd 100644
--- a/src/mem/ruby/network/MessageBuffer.hh
+++ b/src/mem/ruby/network/MessageBuffer.hh
@@ -51,6 +51,7 @@
 #include <functional>
 #include <iostream>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 #include "base/trace.hh"
@@ -73,6 +74,8 @@
     void reanalyzeMessages(Addr addr, Tick current_time);
     void reanalyzeAllMessages(Tick current_time);
     void stallMessage(Addr addr, Tick current_time);
+    // return true if the stall map has a message for this address
+    bool hasStalledMsg(Addr addr) const;
 
     // TRUE if head of queue timestamp <= SystemTime
     bool isReady(Tick current_time) const;
@@ -113,6 +116,18 @@
 
     void enqueue(MsgPtr message, Tick curTime, Tick delta);
 
+    // Defer enqueueing a message to a later cycle by setting it aside
+    // rather than enqueueing it in the current cycle.
+    // The corresponding controller must explicitly enqueue the deferred
+    // message into the message buffer; otherwise, the message will be
+    // lost.
+    void deferEnqueueingMessage(Addr addr, MsgPtr message);
+
+    // enqueue all previously deferred messages that are associated with the
+    // input address
+    void enqueueDeferredMessages(Addr addr, Tick curTime, Tick delay);
+    bool isDeferredMsgMapEmpty(Addr addr) const;
+
     //! Updates the delay cycles of the message at the head of the queue,
     //! removes it from the queue and returns its total delay.
     Tick dequeue(Tick current_time, bool decrement_messages = true);
@@ -192,6 +207,14 @@
     StallMsgMapType m_stall_msg_map;
 
     /**
+     * A map from line addresses to corresponding vectors of messages that
+     * are deferred for enqueueing. Messages in this map are waiting to be
+     * enqueued into the message buffer.
+     */
+    typedef std::unordered_map<Addr, std::vector<MsgPtr>> DeferredMsgMapType;
+    DeferredMsgMapType m_deferred_msg_map;
+
+    /**
      * Current size of the stall map.
      * Track the number of messages held in stall map lists. This is used to
      * ensure that if the buffer is finite-sized, it blocks further requests
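
Note: the deferred-message API parks messages per line address without enqueueing them; the controller must later drain them explicitly via enqueueDeferredMessages or they are lost. A sketch of that contract with simplified types (DeferredMap and the string payload are stand-ins, not the gem5 types):

    #include <cstdint>
    #include <memory>
    #include <string>
    #include <unordered_map>
    #include <vector>

    using Addr = uint64_t;
    using MsgPtr = std::shared_ptr<std::string>;

    struct DeferredMap
    {
        std::unordered_map<Addr, std::vector<MsgPtr>> deferred;

        void defer(Addr a, MsgPtr m) { deferred[a].push_back(std::move(m)); }

        // Drain every message parked for one address; the caller's
        // enqueue callable does the actual buffer insertion.
        template <class EnqueueFn>
        void enqueueDeferred(Addr a, EnqueueFn enqueue)
        {
            auto it = deferred.find(a);
            if (it == deferred.end())
                return;
            for (auto &m : it->second)
                enqueue(m);
            deferred.erase(it);
        }
    };
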
diff --git a/src/mem/ruby/network/Network.cc b/src/mem/ruby/network/Network.cc
index 57834f2..982b57e 100644
--- a/src/mem/ruby/network/Network.cc
+++ b/src/mem/ruby/network/Network.cc
@@ -55,9 +55,35 @@
     m_virtual_networks = p->number_of_virtual_networks;
     m_control_msg_size = p->control_msg_size;
 
-    // Total nodes/controllers in network
+    // Populate localNodeVersions with the version of each MachineType in
+    // this network. This will be used to compute a global-to-local ID
+    // mapping. Do this by looking at the ext_node for each ext_link; there
+    // is one ext_node per ext_link, and it points to an AbstractController.
+    // For RubySystems with one network, global and local IDs are the same.
+    std::unordered_map<MachineType, std::vector<NodeID>> localNodeVersions;
+    for (auto &it : params()->ext_links) {
+        AbstractController *cntrl = it->params()->ext_node;
+        localNodeVersions[cntrl->getType()].push_back(cntrl->getVersion());
+    }
+
+    // Compute a local ID for each MachineType using the same order as SLICC
+    NodeID local_node_id = 0;
+    for (int i = 0; i < MachineType_base_level(MachineType_NUM); ++i) {
+        MachineType mach = static_cast<MachineType>(i);
+        if (localNodeVersions.count(mach)) {
+            for (auto &ver : localNodeVersions.at(mach)) {
+                // Get the global ID Ruby will pass around
+                NodeID global_node_id = MachineType_base_number(mach) + ver;
+                globalToLocalMap.emplace(global_node_id, local_node_id);
+                ++local_node_id;
+            }
+        }
+    }
+
+    // Total nodes/controllers in the network equals the local node count
     // Must make sure this is called after the State Machine constructors
-    m_nodes = MachineType_base_number(MachineType_NUM);
+    m_nodes = local_node_id;
+
     assert(m_nodes != 0);
     assert(m_virtual_networks != 0);
 
@@ -158,11 +184,11 @@
 }
 
 void
-Network::checkNetworkAllocation(NodeID id, bool ordered,
+Network::checkNetworkAllocation(NodeID local_id, bool ordered,
                                         int network_num,
                                         std::string vnet_type)
 {
-    fatal_if(id >= m_nodes, "Node ID is out of range");
+    fatal_if(local_id >= m_nodes, "Node ID is out of range");
     fatal_if(network_num >= m_virtual_networks, "Network id is out of range");
 
     if (ordered) {
@@ -174,25 +200,29 @@
 
 
 void
-Network::setToNetQueue(NodeID id, bool ordered, int network_num,
+Network::setToNetQueue(NodeID global_id, bool ordered, int network_num,
                                  std::string vnet_type, MessageBuffer *b)
 {
-    checkNetworkAllocation(id, ordered, network_num, vnet_type);
-    while (m_toNetQueues[id].size() <= network_num) {
-        m_toNetQueues[id].push_back(nullptr);
+    NodeID local_id = getLocalNodeID(global_id);
+    checkNetworkAllocation(local_id, ordered, network_num, vnet_type);
+
+    while (m_toNetQueues[local_id].size() <= network_num) {
+        m_toNetQueues[local_id].push_back(nullptr);
     }
-    m_toNetQueues[id][network_num] = b;
+    m_toNetQueues[local_id][network_num] = b;
 }
 
 void
-Network::setFromNetQueue(NodeID id, bool ordered, int network_num,
+Network::setFromNetQueue(NodeID global_id, bool ordered, int network_num,
                                    std::string vnet_type, MessageBuffer *b)
 {
-    checkNetworkAllocation(id, ordered, network_num, vnet_type);
-    while (m_fromNetQueues[id].size() <= network_num) {
-        m_fromNetQueues[id].push_back(nullptr);
+    NodeID local_id = getLocalNodeID(global_id);
+    checkNetworkAllocation(local_id, ordered, network_num, vnet_type);
+
+    while (m_fromNetQueues[local_id].size() <= network_num) {
+        m_fromNetQueues[local_id].push_back(nullptr);
     }
-    m_fromNetQueues[id][network_num] = b;
+    m_fromNetQueues[local_id][network_num] = b;
 }
 
 NodeID
@@ -212,3 +242,10 @@
     }
     return MachineType_base_count(mtype);
 }
+
+NodeID
+Network::getLocalNodeID(NodeID global_id) const
+{
+    assert(globalToLocalMap.count(global_id));
+    return globalToLocalMap.at(global_id);
+}
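
Note: with multiple networks per RubySystem, each network numbers its own endpoints densely from zero while Ruby continues to pass global NodeIDs around; getLocalNodeID translates between the two. A reduced model of building that mapping (function and variable names are illustrative, not the gem5 API):

    #include <cassert>
    #include <cstdint>
    #include <unordered_map>
    #include <vector>

    using NodeID = uint32_t;

    std::unordered_map<NodeID, NodeID>
    buildGlobalToLocal(const std::vector<NodeID> &global_ids)
    {
        std::unordered_map<NodeID, NodeID> map;
        NodeID local = 0;
        for (NodeID global : global_ids)
            map.emplace(global, local++);
        return map;
    }

    int main()
    {
        // a network owning the controllers with global IDs 4, 7 and 9
        auto m = buildGlobalToLocal({4, 7, 9});
        assert(m.at(7) == 1);   // dense local numbering
        return 0;
    }
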
diff --git a/src/mem/ruby/network/Network.hh b/src/mem/ruby/network/Network.hh
index 606e670..bba0c5e 100644
--- a/src/mem/ruby/network/Network.hh
+++ b/src/mem/ruby/network/Network.hh
@@ -90,13 +90,13 @@
     static uint32_t MessageSizeType_to_int(MessageSizeType size_type);
 
     // returns the queue requested for the given component
-    void setToNetQueue(NodeID id, bool ordered, int netNumber,
+    void setToNetQueue(NodeID global_id, bool ordered, int netNumber,
                                std::string vnet_type, MessageBuffer *b);
-    virtual void setFromNetQueue(NodeID id, bool ordered, int netNumber,
+    virtual void setFromNetQueue(NodeID global_id, bool ordered, int netNumber,
                                  std::string vnet_type, MessageBuffer *b);
 
-    virtual void checkNetworkAllocation(NodeID id, bool ordered,
-        int network_num, std::string vnet_type);
+    virtual void checkNetworkAllocation(NodeID local_id, bool ordered,
+                                       int network_num, std::string vnet_type);
 
     virtual void makeExtOutLink(SwitchID src, NodeID dest, BasicLink* link,
                              const NetDest& routing_table_entry) = 0;
@@ -140,6 +140,8 @@
         return RubyDummyPort::instance();
     }
 
+    NodeID getLocalNodeID(NodeID global_id) const;
+
   protected:
     // Private copy constructor and assignment operator
     Network(const Network& obj);
@@ -182,6 +184,10 @@
         AddrRangeList ranges;
     };
     std::unordered_multimap<MachineType, AddrMapNode> addrMap;
+
+    // Global NodeID to local node map. If there is only one network in the
+    // RubySystem, this is a one-to-one mapping of global to local IDs.
+    std::unordered_map<NodeID, NodeID> globalToLocalMap;
 };
 
 inline std::ostream&
diff --git a/src/mem/ruby/network/garnet2.0/GarnetNetwork.cc b/src/mem/ruby/network/garnet2.0/GarnetNetwork.cc
index 1eff921..a88302b 100644
--- a/src/mem/ruby/network/garnet2.0/GarnetNetwork.cc
+++ b/src/mem/ruby/network/garnet2.0/GarnetNetwork.cc
@@ -146,10 +146,11 @@
 */
 
 void
-GarnetNetwork::makeExtInLink(NodeID src, SwitchID dest, BasicLink* link,
+GarnetNetwork::makeExtInLink(NodeID global_src, SwitchID dest, BasicLink* link,
                             const NetDest& routing_table_entry)
 {
-    assert(src < m_nodes);
+    NodeID local_src = getLocalNodeID(global_src);
+    assert(local_src < m_nodes);
 
     GarnetExtLink* garnet_link = safe_cast<GarnetExtLink*>(link);
 
@@ -163,7 +164,7 @@
 
     PortDirection dst_inport_dirn = "Local";
     m_routers[dest]->addInPort(dst_inport_dirn, net_link, credit_link);
-    m_nis[src]->addOutPort(net_link, credit_link, dest);
+    m_nis[local_src]->addOutPort(net_link, credit_link, dest);
 }
 
 /*
@@ -173,10 +174,12 @@
 */
 
 void
-GarnetNetwork::makeExtOutLink(SwitchID src, NodeID dest, BasicLink* link,
-                             const NetDest& routing_table_entry)
+GarnetNetwork::makeExtOutLink(SwitchID src, NodeID global_dest,
+                              BasicLink* link,
+                              const NetDest& routing_table_entry)
 {
-    assert(dest < m_nodes);
+    NodeID local_dest = getLocalNodeID(global_dest);
+    assert(local_dest < m_nodes);
     assert(src < m_routers.size());
     assert(m_routers[src] != NULL);
 
@@ -194,7 +197,7 @@
     m_routers[src]->addOutPort(src_outport_dirn, net_link,
                                routing_table_entry,
                                link->m_weight, credit_link);
-    m_nis[dest]->addInPort(net_link, credit_link);
+    m_nis[local_dest]->addInPort(net_link, credit_link);
 }
 
 /*
@@ -233,9 +236,11 @@
 
 // Get ID of router connected to a NI.
 int
-GarnetNetwork::get_router_id(int ni)
+GarnetNetwork::get_router_id(int global_ni)
 {
-    return m_nis[ni]->get_router_id();
+    NodeID local_ni = getLocalNodeID(global_ni);
+
+    return m_nis[local_ni]->get_router_id();
 }
 
 void
@@ -415,6 +420,20 @@
 }
 
 void
+GarnetNetwork::resetStats()
+{
+    for (int i = 0; i < m_routers.size(); i++) {
+        m_routers[i]->resetStats();
+    }
+    for (int i = 0; i < m_networklinks.size(); i++) {
+        m_networklinks[i]->resetStats();
+    }
+    for (int i = 0; i < m_creditlinks.size(); i++) {
+        m_creditlinks[i]->resetStats();
+    }
+}
+
+void
 GarnetNetwork::print(ostream& out) const
 {
     out << "[GarnetNetwork]";
diff --git a/src/mem/ruby/network/garnet2.0/GarnetNetwork.hh b/src/mem/ruby/network/garnet2.0/GarnetNetwork.hh
index 9acbeef..3821dd8 100644
--- a/src/mem/ruby/network/garnet2.0/GarnetNetwork.hh
+++ b/src/mem/ruby/network/garnet2.0/GarnetNetwork.hh
@@ -101,6 +101,7 @@
     // Stats
     void collateStats();
     void regStats();
+    void resetStats();
     void print(std::ostream& out) const;
 
     // increment counters
diff --git a/src/mem/ruby/network/garnet2.0/Router.cc b/src/mem/ruby/network/garnet2.0/Router.cc
index 14c0e84..73b7dce 100644
--- a/src/mem/ruby/network/garnet2.0/Router.cc
+++ b/src/mem/ruby/network/garnet2.0/Router.cc
@@ -215,10 +215,8 @@
 void
 Router::resetStats()
 {
-    for (int j = 0; j < m_virtual_networks; j++) {
-        for (int i = 0; i < m_input_unit.size(); i++) {
-            m_input_unit[i]->resetStats();
-        }
+    for (int i = 0; i < m_input_unit.size(); i++) {
+        m_input_unit[i]->resetStats();
     }
 
     crossbarSwitch.resetStats();
diff --git a/src/mem/ruby/network/simple/SimpleNetwork.cc b/src/mem/ruby/network/simple/SimpleNetwork.cc
index 84817e4..d3b5515 100644
--- a/src/mem/ruby/network/simple/SimpleNetwork.cc
+++ b/src/mem/ruby/network/simple/SimpleNetwork.cc
@@ -83,27 +83,30 @@
 
 // From a switch to an endpoint node
 void
-SimpleNetwork::makeExtOutLink(SwitchID src, NodeID dest, BasicLink* link,
-                           const NetDest& routing_table_entry)
+SimpleNetwork::makeExtOutLink(SwitchID src, NodeID global_dest,
+                              BasicLink* link,
+                              const NetDest& routing_table_entry)
 {
-    assert(dest < m_nodes);
+    NodeID local_dest = getLocalNodeID(global_dest);
+    assert(local_dest < m_nodes);
     assert(src < m_switches.size());
     assert(m_switches[src] != NULL);
 
     SimpleExtLink *simple_link = safe_cast<SimpleExtLink*>(link);
 
-    m_switches[src]->addOutPort(m_fromNetQueues[dest], routing_table_entry,
-                                simple_link->m_latency,
+    m_switches[src]->addOutPort(m_fromNetQueues[local_dest],
+                                routing_table_entry, simple_link->m_latency,
                                 simple_link->m_bw_multiplier);
 }
 
 // From an endpoint node to a switch
 void
-SimpleNetwork::makeExtInLink(NodeID src, SwitchID dest, BasicLink* link,
+SimpleNetwork::makeExtInLink(NodeID global_src, SwitchID dest, BasicLink* link,
                           const NetDest& routing_table_entry)
 {
-    assert(src < m_nodes);
-    m_switches[dest]->addInPort(m_toNetQueues[src]);
+    NodeID local_src = getLocalNodeID(global_src);
+    assert(local_src < m_nodes);
+    m_switches[dest]->addInPort(m_toNetQueues[local_src]);
 }
 
 // From a switch to a switch
diff --git a/src/mem/ruby/protocol/MOESI_CMP_directory-L1cache.sm b/src/mem/ruby/protocol/MOESI_CMP_directory-L1cache.sm
index 5a31d28..15bbdd3 100644
--- a/src/mem/ruby/protocol/MOESI_CMP_directory-L1cache.sm
+++ b/src/mem/ruby/protocol/MOESI_CMP_directory-L1cache.sm
@@ -74,19 +74,20 @@
     I, AccessPermission:Invalid, desc="Idle";
     S, AccessPermission:Read_Only, desc="Shared";
     O, AccessPermission:Read_Only, desc="Owned";
-    M, AccessPermission:Read_Only, desc="Modified (dirty)";
-    M_W, AccessPermission:Read_Only, desc="Modified (dirty)";
+    M, AccessPermission:Read_Write, desc="Modified (dirty)";
+    M_W, AccessPermission:Read_Write, desc="Modified (dirty)";
     MM, AccessPermission:Read_Write, desc="Modified (dirty and locally modified)";
     MM_W, AccessPermission:Read_Write, desc="Modified (dirty and locally modified)";
 
     // Transient States
+    // Notice we still have a valid copy of the block in most states
     IM, AccessPermission:Busy, "IM", desc="Issued GetX";
+    IS, AccessPermission:Busy, "IS", desc="Issued GetS";
     SM, AccessPermission:Read_Only, "SM", desc="Issued GetX, we still have an old copy of the line";
     OM, AccessPermission:Read_Only, "SM", desc="Issued GetX, received data";
-    IS, AccessPermission:Busy, "IS", desc="Issued GetS";
-    SI, AccessPermission:Busy, "OI", desc="Issued PutS, waiting for ack";
-    OI, AccessPermission:Busy, "OI", desc="Issued PutO, waiting for ack";
-    MI, AccessPermission:Busy, "MI", desc="Issued PutX, waiting for ack";
+    SI, AccessPermission:Read_Only, "OI", desc="Issued PutS, waiting for ack";
+    OI, AccessPermission:Read_Only, "OI", desc="Issued PutO, waiting for ack";
+    MI, AccessPermission:Read_Write, "MI", desc="Issued PutX, waiting for ack";
     II, AccessPermission:Busy, "II", desc="Issued PutX/O, saw Fwd_GETS or Fwd_GETX, waiting for ack";
   }
 
@@ -215,7 +216,6 @@
          ((cache_entry.CacheState != State:O) && (state == State:O)) ) {
 
         cache_entry.CacheState := state;
-        sequencer.checkCoherence(addr);
       }
       else {
         cache_entry.CacheState := state;
@@ -226,13 +226,13 @@
   AccessPermission getAccessPermission(Addr addr) {
     TBE tbe := TBEs[addr];
     if(is_valid(tbe)) {
-      DPRINTF(RubySlicc, "%s\n", L1Cache_State_to_permission(tbe.TBEState));
+      DPRINTF(RubySlicc, "%s,%s\n", tbe.TBEState, L1Cache_State_to_permission(tbe.TBEState));
       return L1Cache_State_to_permission(tbe.TBEState);
     }
 
     Entry cache_entry := getCacheEntry(addr);
     if(is_valid(cache_entry)) {
-      DPRINTF(RubySlicc, "%s\n", L1Cache_State_to_permission(cache_entry.CacheState));
+      DPRINTF(RubySlicc, "%s,%s\n", cache_entry.CacheState, L1Cache_State_to_permission(cache_entry.CacheState));
       return L1Cache_State_to_permission(cache_entry.CacheState);
     }
 
@@ -271,8 +271,10 @@
     }
 
     TBE tbe := TBEs[addr];
-    num_functional_writes := num_functional_writes +
-        testAndWrite(addr, tbe.DataBlk, pkt);
+    if (is_valid(tbe)) {
+      num_functional_writes := num_functional_writes +
+          testAndWrite(addr, tbe.DataBlk, pkt);
+    }
     return num_functional_writes;
   }
 
diff --git a/src/mem/ruby/protocol/MOESI_CMP_directory-L2cache.sm b/src/mem/ruby/protocol/MOESI_CMP_directory-L2cache.sm
index 18e3b89..9894107 100644
--- a/src/mem/ruby/protocol/MOESI_CMP_directory-L2cache.sm
+++ b/src/mem/ruby/protocol/MOESI_CMP_directory-L2cache.sm
@@ -66,14 +66,13 @@
   state_declaration(State, desc="L2 Cache states", default="L2Cache_State_I") {
 
     // Stable states
-    NP, AccessPermission:Invalid, desc="Not Present";
     I, AccessPermission:Invalid, desc="Invalid";
-    ILS, AccessPermission:Invalid, desc="Idle/NP, but local sharers exist";
-    ILX, AccessPermission:Invalid, desc="Idle/NP, but local exclusive exists";
-    ILO, AccessPermission:Invalid, desc="Idle/NP, but local owner exists";
-    ILOX, AccessPermission:Invalid, desc="Idle/NP, but local owner exists and chip is exclusive";
-    ILOS, AccessPermission:Invalid, desc="Idle/NP, but local owner exists and local sharers as well";
-    ILOSX, AccessPermission:Invalid, desc="Idle/NP, but local owner exists, local sharers exist, chip is exclusive ";
+    ILS, AccessPermission:Invalid, desc="Not present, but local sharers exist";
+    ILX, AccessPermission:Invalid, desc="Not present, but local exclusive exists";
+    ILO, AccessPermission:Invalid, desc="Not present, but local owner exists";
+    ILOX, AccessPermission:Invalid, desc="Not present, but local owner exists and chip is exclusive";
+    ILOS, AccessPermission:Invalid, desc="Not present, but local owner exists and local sharers as well";
+    ILOSX, AccessPermission:Invalid, desc="Not present, but local owner exists, local sharers exist, chip is exclusive ";
     S, AccessPermission:Read_Only, desc="Shared, no local sharers";
     O, AccessPermission:Read_Only, desc="Owned, no local sharers";
     OLS, AccessPermission:Read_Only, desc="Owned with local sharers";
@@ -86,23 +85,25 @@
     IFGX, AccessPermission:Busy, desc="Blocked, forwarded global GETX to local owner/exclusive.  No other on-chip invs needed";
     IFGS, AccessPermission:Busy, desc="Blocked, forwarded global GETS to local owner";
     ISFGS, AccessPermission:Busy, desc="Blocked, forwarded global GETS to local owner, local sharers exist";
-    IFGXX, AccessPermission:Busy, desc="Blocked, forwarded global GETX to local owner but may need acks from other sharers";
-    OLSF, AccessPermission:Busy, desc="Blocked, got Fwd_GETX with local sharers, waiting for local inv acks";
+    IFGXX, AccessPermission:Busy,       desc="Blocked, forwarded global GETX to local owner, waiting for data and acks from other sharers";
+    IFGXXD, AccessPermission:Read_Only, desc="Blocked, was IFGXX and received data, still waiting for acks";
+    OLSF, AccessPermission:Read_Only, desc="Blocked, got Fwd_GETX with local sharers, waiting for local inv acks";
 
-    // writebacks
+    // Writebacks
+    // Notice we still have a valid copy of the block in some states
     ILOW, AccessPermission:Busy, desc="local WB request, was ILO";
     ILOXW, AccessPermission:Busy, desc="local WB request, was ILOX";
     ILOSW, AccessPermission:Busy, desc="local WB request, was ILOS";
     ILOSXW, AccessPermission:Busy, desc="local WB request, was ILOSX";
-    SLSW, AccessPermission:Busy, desc="local WB request, was SLS";
-    OLSW, AccessPermission:Busy, desc="local WB request, was OLS";
     ILSW, AccessPermission:Busy, desc="local WB request, was ILS";
     IW, AccessPermission:Busy, desc="local WB request from only sharer, was ILS";
-    OW, AccessPermission:Busy, desc="local WB request from only sharer, was OLS";
-    SW, AccessPermission:Busy, desc="local WB request from only sharer, was SLS";
-    OXW, AccessPermission:Busy, desc="local WB request from only sharer, was OLSX";
-    OLSXW, AccessPermission:Busy, desc="local WB request from sharer, was OLSX";
     ILXW, AccessPermission:Busy, desc="local WB request, was ILX";
+    SLSW, AccessPermission:Read_Only, desc="local WB request, was SLS";
+    OLSW, AccessPermission:Read_Only, desc="local WB request, was OLS";
+    OW, AccessPermission:Read_Only, desc="local WB request from only sharer, was OLS";
+    SW, AccessPermission:Read_Only, desc="local WB request from only sharer, was SLS";
+    OXW, AccessPermission:Read_Only, desc="local WB request from only sharer, was OLSX";
+    OLSXW, AccessPermission:Read_Only, desc="local WB request from sharer, was OLSX";
 
     IFLS, AccessPermission:Busy, desc="Blocked, forwarded local GETS to _some_ local sharer";
     IFLO, AccessPermission:Busy, desc="Blocked, forwarded local GETS to local owner";
@@ -111,29 +112,34 @@
     IFLOSX, AccessPermission:Busy, desc="Blocked, forwarded local GETS to local owner w/ other sharers, chip is exclusive";
     IFLXO, AccessPermission:Busy, desc="Blocked, forwarded local GETX to local owner with other sharers, chip is exclusive";
 
+    // Some states hold valid data while waiting for acks
     IGS, AccessPermission:Busy, desc="Semi-blocked, issued local GETS to directory";
     IGM, AccessPermission:Busy, desc="Blocked, issued local GETX to directory. Need global acks and data";
     IGMLS, AccessPermission:Busy, desc="Blocked, issued local GETX to directory but may need to INV local sharers";
-    IGMO, AccessPermission:Busy, desc="Blocked, have data for local GETX but need all acks";
+    IGMO, AccessPermission:Read_Only, desc="Blocked, have data for local GETX but need all acks";
+    IGMOU, AccessPermission:Busy, desc="Blocked, responded to GETX, waiting for unblock";
     IGMIO, AccessPermission:Busy, desc="Blocked, issued local GETX, local owner with possible local sharer, may need to INV";
     OGMIO, AccessPermission:Busy, desc="Blocked, issued local GETX, was owner, may need to INV";
-    IGMIOF, AccessPermission:Busy, desc="Blocked, issued local GETX, local owner, waiting for global acks, got Fwd_GETX";
+    IGMIOF, AccessPermission:Busy,       desc="Blocked, issued local GETX, local owner, waiting for global acks, got Fwd_GETX";
+    IGMIOFD, AccessPermission:Read_Only, desc="Blocked, was IGMIOF but received data, still waiting for acks";
     IGMIOFS, AccessPermission:Busy, desc="Blocked, issued local GETX, local owner, waiting for global acks, got Fwd_GETS";
     OGMIOF, AccessPermission:Busy, desc="Blocked, issued local GETX, was owner, waiting for global acks, got Fwd_GETX";
 
+    // Have valid data in some of these transient states
     II, AccessPermission:Busy, desc="Blocked, handling invalidations";
     MM, AccessPermission:Busy, desc="Blocked, was M satisfying local GETX";
     SS, AccessPermission:Busy, desc="Blocked, was S satisfying local GETS";
     OO, AccessPermission:Busy, desc="Blocked, was O satisfying local GETS";
-    OLSS, AccessPermission:Busy, desc="Blocked, satisfying local GETS";
-    OLSXS, AccessPermission:Busy, desc="Blocked, satisfying local GETS";
-    SLSS, AccessPermission:Busy, desc="Blocked, satisfying local GETS";
+    OLSS, AccessPermission:Read_Only, desc="Blocked, satisfying local GETS";
+    OLSXS, AccessPermission:Read_Only, desc="Blocked, satisfying local GETS";
+    SLSS, AccessPermission:Read_Only, desc="Blocked, satisfying local GETS";
 
-    OI, AccessPermission:Busy, desc="Blocked, doing writeback, was O";
-    MI, AccessPermission:Busy, desc="Blocked, doing writeback, was M";
+    // Have valid data in most of these states
+    OI, AccessPermission:Read_Only, desc="Blocked, doing writeback, was O";
+    MI, AccessPermission:Read_Write, desc="Blocked, doing writeback, was M";
     MII, AccessPermission:Busy, desc="Blocked, doing writeback, was M, got Fwd_GETX";
-    OLSI, AccessPermission:Busy, desc="Blocked, doing writeback, was OLS";
-    ILSI, AccessPermission:Busy, desc="Blocked, doing writeback, was OLS got Fwd_GETX";
+    OLSI, AccessPermission:Read_Only, desc="Blocked, doing writeback, was OLS";
+    ILSI, AccessPermission:Read_Only, desc="Blocked, doing writeback, was OLS got Fwd_GETX";
 
     // DMA blocking states
     ILOSD, AccessPermission:Busy, desc="Blocked, waiting for DMA ack";
@@ -324,11 +330,22 @@
   void copyDirToCache(Entry cache_entry, Addr addr) {
     assert(is_valid(cache_entry));
     DirEntry dir_entry := getDirEntry(addr);
+    assert(is_valid(dir_entry));
     cache_entry.Sharers := dir_entry.Sharers;
     cache_entry.Owner := dir_entry.Owner;
     cache_entry.OwnerValid := dir_entry.OwnerValid;
   }
 
+  bool isDirEntryClean(DirEntry dir_entry) {
+    assert(is_valid(dir_entry));
+    return (dir_entry.Sharers.count() == 0) &&
+           (dir_entry.OwnerValid == false);
+  }
+
+  bool isCacheEntryClean(Entry cache_entry) {
+    return (cache_entry.Sharers.count() == 0) &&
+           (cache_entry.OwnerValid == false);
+  }
 
   void recordLocalSharerInDir(Entry cache_entry, Addr addr, MachineID shar_id) {
     if (is_valid(cache_entry)) {
@@ -478,7 +495,6 @@
   }
 
   State getState(TBE tbe, Entry cache_entry, Addr addr) {
-
     if (is_valid(tbe)) {
       return tbe.TBEState;
     } else if (is_valid(cache_entry)) {
@@ -487,7 +503,7 @@
       DirEntry dir_entry := getDirEntry(addr);
       return dir_entry.DirState;
     } else {
-      return State:NP;
+      return State:I;
     }
   }
 
@@ -496,45 +512,39 @@
   }
 
   void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
+    // Consistency checks
+
+    // Block is either in the cache, in the directory, or invalid
     assert((localDirectory.isTagPresent(addr) && L2cache.isTagPresent(addr)) == false);
 
+    if (state == State:I) {
+      assert(L2cache.isTagPresent(addr) == false);
+      assert(is_valid(cache_entry) == false);
+      assert(localDirectory.isTagPresent(addr) == false);
+    } else if ( (state == State:M) ||
+                (state == State:O) ||
+                (state == State:S) ||
+                (state == State:OLS) ||
+                (state == State:OLSX) ||
+                (state == State:SLS)) {
+       assert(is_valid(cache_entry));
+       assert(L2cache.isTagPresent(addr));
+    } else if ( (state == State:ILS) ||
+                (state == State:ILX) ||
+                (state == State:ILO) ||
+                (state == State:ILOX) ||
+                (state == State:ILOS) ||
+                (state == State:ILOSX)) {
+       assert(localDirectory.isTagPresent(addr));
+    }
+
+    // Update state
     if (is_valid(tbe)) {
       tbe.TBEState := state;
     }
 
-    if (
-         (state == State:M) ||
-         (state == State:O) ||
-         (state == State:S) ||
-         (state == State:OLS) ||
-         (state == State:SLS) ||
-         (state == State:OLSX) ||
-         (state == State:SLS)
-       ) {
-       assert(is_valid(cache_entry));
-    }
-    else if (
-         (state == State:ILS) ||
-         (state == State:ILX) ||
-         (state == State:ILO) ||
-         (state == State:ILOX) ||
-         (state == State:ILOS) ||
-         (state == State:ILOSX)
-       ) {
-       // assert(isCacheTagPresent(addr) == false);
-    }
-
     if (is_valid(cache_entry)) {
-      if ( ((cache_entry.CacheState != State:M) && (state == State:M)) ||
-           ((cache_entry.CacheState != State:S) && (state == State:S)) ||
-           ((cache_entry.CacheState != State:O) && (state == State:O)) ) {
-        cache_entry.CacheState := state;
-        // disable Coherence Checker for now
-        // sequencer.checkCoherence(addr);
-      }
-      else {
-        cache_entry.CacheState := state;
-      }
+      cache_entry.CacheState := state;
     }
     else if (localDirectory.isTagPresent(addr)) {
       DirEntry dir_entry := getDirEntry(addr);
@@ -545,13 +555,13 @@
   AccessPermission getAccessPermission(Addr addr) {
     TBE tbe := TBEs[addr];
     if(is_valid(tbe)) {
-      DPRINTF(RubySlicc, "%s\n", L2Cache_State_to_permission(tbe.TBEState));
+      DPRINTF(RubySlicc, "%s,%s\n", tbe.TBEState, L2Cache_State_to_permission(tbe.TBEState));
       return L2Cache_State_to_permission(tbe.TBEState);
     }
 
     Entry cache_entry := getCacheEntry(addr);
     if(is_valid(cache_entry)) {
-      DPRINTF(RubySlicc, "%s\n", L2Cache_State_to_permission(cache_entry.CacheState));
+      DPRINTF(RubySlicc, "%s,%s\n", cache_entry.CacheState, L2Cache_State_to_permission(cache_entry.CacheState));
       return L2Cache_State_to_permission(cache_entry.CacheState);
     }
 
@@ -567,10 +577,13 @@
 
   void functionalRead(Addr addr, Packet *pkt) {
     TBE tbe := TBEs[addr];
+    Entry cache_entry := getCacheEntry(addr);
     if(is_valid(tbe)) {
       testAndRead(addr, tbe.DataBlk, pkt);
+    } else if (is_valid(cache_entry)) {
+      testAndRead(addr, cache_entry.DataBlk, pkt);
     } else {
-      testAndRead(addr, getCacheEntry(addr).DataBlk, pkt);
+      error("Block not present!");
     }
   }
 
@@ -581,11 +594,14 @@
     if(is_valid(tbe)) {
       num_functional_writes := num_functional_writes +
         testAndWrite(addr, tbe.DataBlk, pkt);
-      return num_functional_writes;
     }
 
-    num_functional_writes := num_functional_writes +
-        testAndWrite(addr, getCacheEntry(addr).DataBlk, pkt);
+    Entry cache_entry := getCacheEntry(addr);
+    if (is_valid(cache_entry)) {
+      num_functional_writes := num_functional_writes +
+        testAndWrite(addr, cache_entry.DataBlk, pkt);
+    }
+
     return num_functional_writes;
   }
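
For reference, a minimal C++ sketch (illustrative names only; this is not
gem5 code) of the lookup order the new functionalRead/functionalWrite
enforce: in-flight data in a TBE wins over the cache copy, a read that
misses both is now a hard error, and a write updates every copy that exists
instead of returning early.

    #include <cstdint>
    #include <stdexcept>
    #include <unordered_map>

    using Addr = uint64_t;
    struct DataBlock { uint8_t bytes[64] = {}; };

    struct L2Model {
        std::unordered_map<Addr, DataBlock> tbes;   // in-flight transactions
        std::unordered_map<Addr, DataBlock> cache;  // stable cache entries

        // Read: TBE first, then cache; missing both is an error now.
        const DataBlock &functionalRead(Addr addr) const {
            if (auto it = tbes.find(addr); it != tbes.end())
                return it->second;
            if (auto it = cache.find(addr); it != cache.end())
                return it->second;
            throw std::runtime_error("Block not present!");
        }

        // Write: touch every copy we hold, report how many were updated.
        int functionalWrite(Addr addr, const DataBlock &data) {
            int writes = 0;
            if (auto it = tbes.find(addr); it != tbes.end()) {
                it->second = data;
                ++writes;
            }
            if (auto it = cache.find(addr); it != cache.end()) {
                it->second = data;
                ++writes;
            }
            return writes;
        }
    };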
 
@@ -1560,6 +1576,16 @@
     localDirectory.deallocate(address);
   }
 
+  action(checkCacheNoSharersNoOwner, "/ckcache", desc="Check cache entry has no sharers or owner") {
+    assert(is_valid(cache_entry));
+    assert(isCacheEntryClean(cache_entry));
+  }
+
+  action(removeFromDir, "/rmdir", desc="Remove dir state") {
+    assert(isDirEntryClean(getDirEntry(address)));
+    localDirectory.deallocate(address);
+  }
+
   action(zz_recycleGlobalRequestQueue, "\zglb", desc="Send the head of the mandatory queue to the back of the queue.") {
     peek(requestNetwork_in, RequestMsg) {
       APPEND_TRANSITION_COMMENT(in_msg.Requestor);
@@ -1599,27 +1625,27 @@
   // TRANSITIONS
   //*****************************************************
 
-  transition({II, IFGX, IFGS, ISFGS, IFGXX, IFLXO, ILOW, ILOXW, ILOSW, ILOSXW, SLSW, OLSW, ILSW, IW, OW, SW, OXW, OLSXW, ILXW, IFLS, IFLO, IFLOX, IFLOXX, IFLOSX, OLSXS, IGS, IGM, IGMLS, IGMO, IGMIO, OGMIO, IGMIOF, OGMIOF, MM, SS, OO, OI, MI, MII, OLSI, ILSI, SLSS, OLSS, OLSF, IGMIOFS, ILOSD, ILOSXD, ILOD, ILXD, ILOXD}, {L1_PUTO, L1_PUTS, L1_PUTS_only, L1_PUTX}) {
+  transition({II, IFGX, IFGS, ISFGS, IFGXX, IFGXXD, IFLXO, ILOW, ILOXW, ILOSW, ILOSXW, SLSW, OLSW, ILSW, IW, OW, SW, OXW, OLSXW, ILXW, IFLS, IFLO, IFLOX, IFLOXX, IFLOSX, OLSXS, IGS, IGM, IGMLS, IGMO, IGMOU, IGMIO, OGMIO, IGMIOF, IGMIOFD, OGMIOF, MM, SS, OO, OI, MI, MII, OLSI, ILSI, SLSS, OLSS, OLSF, IGMIOFS, ILOSD, ILOSXD, ILOD, ILXD, ILOXD}, {L1_PUTO, L1_PUTS, L1_PUTS_only, L1_PUTX}) {
     st_stallAndWaitL1RequestQueue;
   }
 
-  transition({II, IFGX, IFGS, ISFGS, IFGXX, IFLXO, ILOW, ILOXW, ILOSW, ILOSXW, SLSW, OLSW, ILSW, IW, OW, SW, OXW, OLSXW, ILXW, IFLS, IFLO, IFLOX, IFLOXX, IFLOSX, OLSXS, IGS, IGM, IGMLS, IGMO, IGMIO, OGMIO, IGMIOF, OGMIOF, MM, SS, OO, OI, MI, MII, OLSI, ILSI, SLSS, OLSS, OLSF, IGMIOFS, ILOSD, ILOSXD, ILOD, ILXD, ILOXD}, {L1_GETX, L1_GETS}) {
+  transition({II, IFGX, IFGS, ISFGS, IFGXX, IFGXXD, IFLXO, ILOW, ILOXW, ILOSW, ILOSXW, SLSW, OLSW, ILSW, IW, OW, SW, OXW, OLSXW, ILXW, IFLS, IFLO, IFLOX, IFLOXX, IFLOSX, OLSXS, IGS, IGM, IGMLS, IGMO, IGMOU, IGMIO, OGMIO, IGMIOF, IGMIOFD, OGMIOF, MM, SS, OO, OI, MI, MII, OLSI, ILSI, SLSS, OLSS, OLSF, IGMIOFS, ILOSD, ILOSXD, ILOD, ILXD, ILOXD}, {L1_GETX, L1_GETS}) {
     st_stallAndWaitL1RequestQueue;
   }
 
-  transition({IFGX, IFGS, ISFGS, IFGXX, IFLXO, ILOW, ILOXW, ILOSW, ILOSXW, SLSW, OLSW, ILSW, IW, ILXW, OW, SW, OXW, OLSXW, IFLS, IFLO, IFLOX, IFLOXX, IFLOSX,OLSXS,  IGS, IGM, IGMLS, IGMO, MM, SS, OO, OI, MI, MII, OLSI, ILSI, SLSS, OLSS, OLSF, IGMIOFS, ILOSD, ILOSXD, ILOD, ILXD, ILOXD}, L2_Replacement) {
+  transition({IFGX, IFGS, ISFGS, IFGXX, IFGXXD, IFLXO, ILOW, ILOXW, ILOSW, ILOSXW, SLSW, OLSW, ILSW, IW, ILXW, OW, SW, OXW, OLSXW, IFLS, IFLO, IFLOX, IFLOXX, IFLOSX,OLSXS,  IGS, IGM, IGMLS, IGMO, IGMOU, MM, SS, OO, OI, MI, MII, OLSI, ILSI, SLSS, OLSS, OLSF, IGMIOFS, ILOSD, ILOSXD, ILOD, ILXD, ILOXD}, L2_Replacement) {
     zz_recycleL1RequestQueue;
   }
 
-  transition({IFGX, IFGS, ISFGS, IFGXX, IFLXO, ILOW, ILOXW, ILOSW, ILOSXW, SLSW, OLSW, ILSW, IW, OW, SW, OXW, OLSXW, ILXW, IFLS, IFLO, IFLOX, IFLOXX, IFLOSX,OLSXS, IGS, IGM, MM, SS, OO, SLSS, OLSS, OLSF, IGMIOFS, ILOSD, ILOSXD, ILOD, ILXD, ILOXD}, {Fwd_GETX, Fwd_GETS, Fwd_DMA}) {
+  transition({IFGX, IFGS, ISFGS, IFGXX, IFGXXD, IFLXO, ILOW, ILOXW, ILOSW, ILOSXW, SLSW, OLSW, ILSW, IW, OW, SW, OXW, OLSXW, ILXW, IFLS, IFLO, IFLOX, IFLOXX, IFLOSX,OLSXS, IGS, IGM, MM, SS, OO, SLSS, OLSS, OLSF, IGMIOFS, ILOSD, ILOSXD, ILOD, ILXD, ILOXD}, {Fwd_GETX, Fwd_GETS, Fwd_DMA}) {
     zz_recycleGlobalRequestQueue;
   }
 
-  transition({OGMIO, IGMIO, IGMO}, Fwd_DMA) {
+  transition({OGMIO, IGMIO, IGMO, IGMOU}, Fwd_DMA) {
     zz_recycleGlobalRequestQueue;
   }
 
-  transition({IFGX, IFGS, ISFGS, IFGXX, IFLXO, ILOW, ILOXW, ILOSW, ILOSXW, SLSW, OLSW, ILSW, IW, OW, SW, OXW, OLSXW, ILXW, IFLS, IFLO, IFLOX, IFLOXX, IFLOSX,OLSXS, MM, SS, OO, SLSS, OLSS, OLSF, IGMIOFS, ILOSD, ILOSXD, ILOD, ILXD, ILOXD}, {Inv}) {
+  transition({IFGX, IFGS, ISFGS, IFGXX, IFGXXD, IFLXO, ILOW, ILOXW, ILOSW, ILOSXW, SLSW, OLSW, ILSW, IW, OW, SW, OXW, OLSXW, ILXW, IFLS, IFLO, IFLOX, IFLOXX, IFLOSX,OLSXS, MM, SS, OO, SLSS, OLSS, OLSF, IGMIOFS, ILOSD, ILOSXD, ILOD, ILXD, ILOXD}, {Inv}) {
     zz_recycleGlobalRequestQueue;
   }
 
@@ -1628,7 +1654,7 @@
   }
 
   // must happened because we forwarded GETX to local exclusive trying to do wb
-  transition({I, M, O, ILS, ILOX, OLS, OLSX, SLS, S}, L1_PUTX) {
+  transition({M, O, ILS, ILOX, OLS, OLSX, SLS, S}, L1_PUTX) {
     ll_writebackNack;
     o_popL1RequestQueue;
   }
@@ -1661,16 +1687,6 @@
     o_popL1RequestQueue;
   }
 
-  // must happened because we got Inv when L1 attempted PUTS
-  transition(I, L1_PUTS) {
-    ll_writebackNack;
-    o_popL1RequestQueue;
-  }
-
-  transition(I, L1_PUTO) {
-    ll_writebackNack;
-    o_popL1RequestQueue;
-  }
 
   // FORWARDED REQUESTS
 
@@ -1784,6 +1800,7 @@
     i_copyDataToTBE;
     c_sendExclusiveDataFromTBEToFwdGETS;
     gg_clearLocalSharers;
+    removeFromDir;
     s_deallocateTBE;
     n_popResponseQueue;
     wa_wakeUpDependents;
@@ -1801,6 +1818,7 @@
     i_copyDataToTBE;
     c_sendDataFromTBEToFwdGETX;
     gg_clearLocalSharers;
+    removeFromDir;
     s_deallocateTBE;
     n_popResponseQueue;
     wa_wakeUpDependents;
@@ -1816,32 +1834,31 @@
   }
 
 
-  transition(IFGXX, IntAck) {
+  transition({IFGXX, IFGXXD}, IntAck) {
     m_decrementNumberOfMessagesInt;
     o_checkForIntCompletion;
     n_popResponseQueue;
   }
 
-  transition(IFGXX, Data_Exclusive) {
+  transition(IFGXX, Data_Exclusive, IFGXXD) {
     i_copyDataToTBE;
     m_decrementNumberOfMessagesInt;
     o_checkForIntCompletion;
     n_popResponseQueue;
   }
 
-  transition(IFGXX, All_Acks, I) {
+  transition(IFGXXD, All_Acks, I) {
     c_sendDataFromTBEToFwdGETX;
     gg_clearLocalSharers;
+    removeFromDir;
     s_deallocateTBE;
     n_popTriggerQueue;
     wa_wakeUpDependents;
   }
 
-
-  // transition({O, OX}, Fwd_GETX, I) {
   transition(O, Fwd_GETX, I) {
     dd_sendDataToFwdGETX;
-    y_copyCacheStateToDir;
+    checkCacheNoSharersNoOwner;
     rr_deallocateL2CacheBlock;
     m_popRequestQueue;
   }
@@ -1871,19 +1888,14 @@
 
   transition(M, Fwd_GETX, I) {
     dd_sendDataToFwdGETX;
+    checkCacheNoSharersNoOwner;
     rr_deallocateL2CacheBlock;
     m_popRequestQueue;
   }
 
-  // MAKE THIS THE SAME POLICY FOR NOW
-
-  // transition(M, Fwd_GETS, O) {
-  //   dd_sendDataToFwdGETS;
-  //   m_popRequestQueue;
-  // }
-
   transition(M, Fwd_GETS, I) {
      dd_sendExclusiveDataToFwdGETS;
+     checkCacheNoSharersNoOwner;
      rr_deallocateL2CacheBlock;
      m_popRequestQueue;
   }
@@ -1898,6 +1910,9 @@
     i_allocateTBE;
     t_recordFwdXID;
     ee_sendLocalInv;
+    gg_clearLocalSharers;
+    checkCacheNoSharersNoOwner;
+    rr_deallocateL2CacheBlock;
     m_popRequestQueue;
   }
 
@@ -1909,9 +1924,7 @@
 
   transition(OLSF, All_Acks, I) {
     c_sendDataFromTBEToFwdGETX;
-    gg_clearLocalSharers;
     s_deallocateTBE;
-    rr_deallocateL2CacheBlock;
     n_popTriggerQueue;
     wa_wakeUpDependents;
   }
@@ -1926,7 +1939,7 @@
     m_popRequestQueue;
   }
 
-  transition({I,NP}, Inv) {
+  transition(I, Inv) {
     i_allocateTBE;
     t_recordFwdXID;
     e_sendAck;
@@ -1941,6 +1954,7 @@
     t_recordFwdXID;
     ee_sendLocalInv;
     gg_clearLocalSharers;
+    removeFromDir;
     m_popRequestQueue;
   }
 
@@ -1970,6 +1984,7 @@
     t_recordFwdXID;
     e_sendAck;
     s_deallocateTBE;
+    checkCacheNoSharersNoOwner;
     rr_deallocateL2CacheBlock;
     m_popRequestQueue;
   }
@@ -2115,12 +2130,12 @@
 
   // LOCAL REQUESTS THAT MUST ISSUE
 
-  transition(NP, {L1_PUTS, L1_PUTX, L1_PUTO}) {
+  transition(I, {L1_PUTS, L1_PUTX, L1_PUTO}) {
     ll_writebackNack;
     o_popL1RequestQueue;
   }
 
-  transition({NP, I}, L1_GETS, IGS) {
+  transition(I, L1_GETS, IGS) {
     i_allocateTBE;
     s_recordGetSL1ID;
     a_issueGETS;
@@ -2128,7 +2143,7 @@
     o_popL1RequestQueue;
   }
 
-  transition({NP, I}, L1_GETX, IGM) {
+  transition(I, L1_GETX, IGM) {
     i_allocateTBE;
     s_recordGetXL1ID;
     a_issueGETX;
@@ -2176,7 +2191,6 @@
     n_popTriggerQueue;
   }
 
-  // transition(IGMLS, ExtAck, IGMO) {
   transition(IGMLS, ExtAck) {
     m_decrementNumberOfMessagesExt;
     o_checkForExtCompletion;
@@ -2283,26 +2297,26 @@
     n_popTriggerQueue;
   }
 
-  transition(IGMIOF, IntAck) {
+  transition({IGMIOF, IGMIOFD}, IntAck) {
     m_decrementNumberOfMessagesInt;
     o_checkForIntCompletion;
     n_popResponseQueue;
   }
 
-  transition(IGMIOF, Data_Exclusive) {
+  transition(IGMIOF, Data_Exclusive, IGMIOFD) {
     i_copyDataToTBE;
     m_decrementNumberOfMessagesInt;
     o_checkForIntCompletion;
     n_popResponseQueue;
   }
 
-  transition(IGMIOF, All_Acks, IGM) {
+  transition(IGMIOFD, All_Acks, IGM) {
     gg_clearLocalSharers;
     c_sendDataFromTBEToFwdGETX;
     n_popTriggerQueue;
   }
 
-  transition(IGMIO, All_Acks, IGMO) {
+  transition(IGMIO, All_Acks, IGMOU) {
     hh_countLocalSharersExceptL1GETXRequestorInTBE;
     ee_issueLocalInvExceptL1RequestorInTBE;
     k_forwardLocalGETXToLocalOwner;
@@ -2310,7 +2324,7 @@
     n_popTriggerQueue;
   }
 
-  transition(OGMIO, All_Acks, IGMO) {
+  transition(OGMIO, All_Acks, IGMOU) {
     ee_issueLocalInvExceptL1RequestorInTBE;
     c_sendDataFromTBEToL1GETX;
     n_popTriggerQueue;
@@ -2372,12 +2386,12 @@
     wa_wakeUpDependents;
   }
 
-  transition(IGMO, All_Acks) {
+  transition(IGMO, All_Acks, IGMOU) {
     c_sendDataFromTBEToL1GETX;
     n_popTriggerQueue;
   }
 
-  transition(IGMO, Exclusive_Unblock, ILX) {
+  transition(IGMOU, Exclusive_Unblock, ILX) {
     g_recordLocalExclusive;
     f_sendExclusiveUnblock;
     s_deallocateTBE;
@@ -2791,7 +2805,8 @@
 
 
   // L2 WRITEBACKS
-  transition({I, S}, L2_Replacement, I) {
+  transition(S, L2_Replacement, I) {
+    checkCacheNoSharersNoOwner;
     rr_deallocateL2CacheBlock;
   }
 
@@ -2885,12 +2900,14 @@
 
   transition({MI, OI}, Writeback_Ack, I) {
     qq_sendDataFromTBEToMemory;
+    removeFromDir;
     s_deallocateTBE;
     n_popResponseQueue;
     wa_wakeUpDependents;
   }
 
   transition(MII, Writeback_Nack, I) {
+    removeFromDir;
     s_deallocateTBE;
     n_popResponseQueue;
     wa_wakeUpDependents;
@@ -2910,6 +2927,7 @@
 
   transition(MII, Writeback_Ack, I) {
     f_sendUnblock;
+    removeFromDir;
     s_deallocateTBE;
     n_popResponseQueue;
     wa_wakeUpDependents;
diff --git a/src/mem/ruby/protocol/MOESI_CMP_directory-dir.sm b/src/mem/ruby/protocol/MOESI_CMP_directory-dir.sm
index 7faa8e0..03010d5 100644
--- a/src/mem/ruby/protocol/MOESI_CMP_directory-dir.sm
+++ b/src/mem/ruby/protocol/MOESI_CMP_directory-dir.sm
@@ -56,30 +56,40 @@
 
    MessageBuffer * requestToMemory;
    MessageBuffer * responseFromMemory;
+
+   MessageBuffer * triggerQueue;
 {
   // STATES
   state_declaration(State, desc="Directory states", default="Directory_State_I") {
     // Base states
     I, AccessPermission:Read_Write, desc="Invalid";
-    S, AccessPermission:Read_Only, desc="Shared";
+    S, AccessPermission:Read_Write, desc="Shared";
     O, AccessPermission:Maybe_Stale, desc="Owner";
     M, AccessPermission:Maybe_Stale, desc="Modified";
 
-    IS, AccessPermission:Busy, desc="Blocked, was in idle";
+    // Transient states
+    // The memory has valid data in some of these states
+    IS_M, AccessPermission:Read_Write, desc="Blocked, was in I, waiting for mem";
+    IS, AccessPermission:Read_Write, desc="Blocked, was in I, data forwarded";
     SS, AccessPermission:Read_Only, desc="Blocked, was in shared";
     OO, AccessPermission:Busy, desc="Blocked, was in owned";
     MO, AccessPermission:Busy, desc="Blocked, going to owner or maybe modified";
-    MM, AccessPermission:Busy, desc="Blocked, going to modified";
+    MM_M, AccessPermission:Read_Only, desc="Blocked, fetching from memory, going to MM";
+    MM, AccessPermission:Busy,      desc="Blocked, req or mem data forwarded, going to modified";
 
     MI, AccessPermission:Busy, desc="Blocked on a writeback";
     MIS, AccessPermission:Busy, desc="Blocked on a writeback, but don't remove from sharers when received";
     OS, AccessPermission:Busy, desc="Blocked on a writeback";
     OSS, AccessPermission:Busy, desc="Blocked on a writeback, but don't remove from sharers when received";
 
-    XI_M, AccessPermission:Busy, desc="In a stable state, going to I, waiting for the memory controller";
-    XI_U, AccessPermission:Busy, desc="In a stable state, going to I, waiting for an unblock";
-    OI_D, AccessPermission:Busy, desc="In O, going to I, waiting for data";
+    // We have valid data in a TBE
+    WBI, AccessPermission:Read_Only, desc="Sent writeback, waiting for memory; will be I";
+    WBS, AccessPermission:Read_Only, desc="Sent writeback, waiting for memory; will be S";
+    XI_M, AccessPermission:Read_Only, desc="Blocked, going to I, waiting for the memory controller";
+    XI_M_U, AccessPermission:Read_Only, desc="Blocked, going to XI_U, waiting for the memory controller";
+    XI_U, AccessPermission:Read_Only, desc="Blocked, going to I, waiting for an unblock";
 
+    OI_D, AccessPermission:Busy, desc="In O, going to I, waiting for data";
     OD, AccessPermission:Busy, desc="In O, waiting for dma ack from L2";
     MD, AccessPermission:Busy, desc="In M, waiting for dma ack from L2";
   }
@@ -96,12 +106,15 @@
     Exclusive_Unblock, desc="The processor become the exclusive owner (E or M) of the line";
     Clean_Writeback, desc="The final message as part of a PutX/PutS, no data";
     Dirty_Writeback, desc="The final message as part of a PutX/PutS, contains data";
-    Memory_Data,   desc="Fetched data from memory arrives";
+    Memory_Data_DMA,   desc="Fetched data from memory arrives; original requestor is DMA";
+    Memory_Data_Cache, desc="Fetched data from memory arrives; original requestor is Cache";
     Memory_Ack,    desc="Writeback Ack from memory arrives";
     DMA_READ,      desc="DMA Read";
-    DMA_WRITE,     desc="DMA Write";
+    DMA_WRITE_LINE,    desc="DMA Write full line";
+    DMA_WRITE_PARTIAL, desc="DMA Write partial line";
     DMA_ACK,       desc="DMA Ack";
     Data,          desc="Data to directory";
+    All_Acks,      desc="All pending acks, unblocks, etc. have been received";
   }
 
   // TYPES
@@ -119,6 +132,8 @@
     int Len,           desc="Length of request";
     DataBlock DataBlk, desc="DataBlk";
     MachineID Requestor, desc="original requestor";
+    bool WaitingWBAck, desc="DataBlk WB request sent, but no ack from mem yet";
+    bool WaitingDMAAck, desc="DMA ack sent, waiting for unblock";
   }
 
   structure(TBETable, external = "yes") {
@@ -128,6 +143,8 @@
     bool isPresent(Addr);
   }
 
+  int blockSize, default="RubySystem::getBlockSizeBytes()";
+
   // ** OBJECTS **
   TBETable TBEs, template="<Directory_TBE>", constructor="m_number_of_TBEs";
 
@@ -138,79 +155,113 @@
 
   Entry getDirectoryEntry(Addr addr), return_by_pointer="yes" {
     Entry dir_entry := static_cast(Entry, "pointer", directory[addr]);
+    assert(is_valid(dir_entry));
+    return dir_entry;
+  }
 
-    if (is_valid(dir_entry)) {
-      return dir_entry;
-    }
-
-    dir_entry :=  static_cast(Entry, "pointer",
+  Entry allocateDirectoryEntry(Addr addr), return_by_pointer="yes" {
+    Entry dir_entry := static_cast(Entry, "pointer",
                               directory.allocate(addr, new Entry));
     return dir_entry;
   }
 
+  void deallocateDirectoryEntry(Addr addr) {
+    // Always going to transition from a valid state to I when deallocating
+    // Owner and sharers must be clear
+    assert(getDirectoryEntry(addr).DirectoryState != State:I);
+    assert(getDirectoryEntry(addr).Owner.count() == 0);
+    assert(getDirectoryEntry(addr).Sharers.count() == 0);
+
+    directory.deallocate(addr);
+  }
+
   State getState(TBE tbe, Addr addr) {
-    return getDirectoryEntry(addr).DirectoryState;
+    Entry dir_entry := static_cast(Entry, "pointer", directory[addr]);
+    if (is_valid(dir_entry)) {
+      return dir_entry.DirectoryState;
+    }
+    else {
+      return State:I;
+    }
   }
 
   void setState(TBE tbe, Addr addr, State state) {
     if (directory.isPresent(addr)) {
 
-      if (state == State:I) {
-        assert(getDirectoryEntry(addr).Owner.count() == 0);
-        assert(getDirectoryEntry(addr).Sharers.count() == 0);
-      }
+      Entry dir_entry := static_cast(Entry, "pointer", directory[addr]);
 
-      if (state == State:S) {
-        assert(getDirectoryEntry(addr).Owner.count() == 0);
-      }
+      if (is_valid(dir_entry)) {
 
-      if (state == State:O) {
-        assert(getDirectoryEntry(addr).Owner.count() == 1);
-        assert(getDirectoryEntry(addr).Sharers.isSuperset(getDirectoryEntry(addr).Owner) == false);
-      }
+        assert(state != State:I);
 
-      if (state == State:M) {
-        assert(getDirectoryEntry(addr).Owner.count() == 1);
-        assert(getDirectoryEntry(addr).Sharers.count() == 0);
-      }
+        if (state == State:S) {
+          assert(dir_entry.Owner.count() == 0);
+        }
 
-      if ((state != State:SS) && (state != State:OO)) {
-        assert(getDirectoryEntry(addr).WaitingUnblocks == 0);
-      }
+        if (state == State:O) {
+          assert(dir_entry.Owner.count() == 1);
+          assert(dir_entry.Sharers.isSuperset(dir_entry.Owner) == false);
+        }
 
-      if ( (getDirectoryEntry(addr).DirectoryState != State:I) && (state == State:I) ) {
-        getDirectoryEntry(addr).DirectoryState := state;
-         // disable coherence checker
-        // sequencer.checkCoherence(addr);
-      }
-      else {
-        getDirectoryEntry(addr).DirectoryState := state;
+        if (state == State:M) {
+          assert(dir_entry.Owner.count() == 1);
+          assert(dir_entry.Sharers.count() == 0);
+        }
+
+        if ((state != State:SS) && (state != State:OO)) {
+          assert(dir_entry.WaitingUnblocks == 0);
+        }
+
+        dir_entry.DirectoryState := state;
+
+      } else {
+        assert(state == State:I);
       }
     }
   }
 
   AccessPermission getAccessPermission(Addr addr) {
     if (directory.isPresent(addr)) {
-      DPRINTF(RubySlicc, "%s\n", Directory_State_to_permission(getDirectoryEntry(addr).DirectoryState));
-      return Directory_State_to_permission(getDirectoryEntry(addr).DirectoryState);
+      Entry dir_entry := static_cast(Entry, "pointer", directory[addr]);
+      if (is_valid(dir_entry)) {
+        DPRINTF(RubySlicc, "%s,%s\n", dir_entry.DirectoryState, Directory_State_to_permission(dir_entry.DirectoryState));
+        return Directory_State_to_permission(dir_entry.DirectoryState);
+      } else {
+        DPRINTF(RubySlicc, "%s,%s\n", State:I, Directory_State_to_permission(State:I));
+        return Directory_State_to_permission(State:I);
+      }
     }
-
     DPRINTF(RubySlicc, "AccessPermission_NotPresent\n");
     return AccessPermission:NotPresent;
   }
 
   void setAccessPermission(Addr addr, State state) {
     if (directory.isPresent(addr)) {
-      getDirectoryEntry(addr).changePermission(Directory_State_to_permission(state));
+      Entry dir_entry := static_cast(Entry, "pointer", directory[addr]);
+      if (is_valid(dir_entry)) {
+        dir_entry.changePermission(Directory_State_to_permission(state));
+      } else {
+        assert(state == State:I);
+      }
     }
   }
 
   void functionalRead(Addr addr, Packet *pkt) {
-    functionalMemoryRead(pkt);
+    TBE tbe := TBEs[addr];
+    if (is_valid(tbe) && tbe.WaitingWBAck) {
+      testAndRead(addr, tbe.DataBlk, pkt);
+    } else {
+      functionalMemoryRead(pkt);
+    }
   }
 
   int functionalWrite(Addr addr, Packet *pkt) {
     int num_functional_writes := 0;
+    TBE tbe := TBEs[addr];
+    if (is_valid(tbe)) {
+      num_functional_writes := num_functional_writes +
+        testAndWrite(addr, tbe.DataBlk, pkt);
+    }
     num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt);
     return num_functional_writes;
   }
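
The hunk above makes the directory follow an "absent entry means Invalid"
convention: getDirectoryEntry() asserts the entry exists, getState()
tolerates absence and reports I, and deallocation requires the owner and
sharer sets to be clean. A C++ sketch of that contract, under assumed
simplified types (not the gem5 classes):

    #include <cassert>
    #include <cstdint>
    #include <memory>
    #include <set>
    #include <unordered_map>

    using Addr = uint64_t;
    enum class State { I, S, O, M };

    struct DirEntry {
        State state = State::S;          // never I while allocated
        std::set<int> owner, sharers;
    };

    struct Directory {
        std::unordered_map<Addr, std::unique_ptr<DirEntry>> entries;

        DirEntry &get(Addr a) {          // callers must know it exists
            auto it = entries.find(a);
            assert(it != entries.end());
            return *it->second;
        }
        DirEntry &allocate(Addr a) {
            auto &slot = entries[a];
            assert(!slot);               // never allocate twice
            slot = std::make_unique<DirEntry>();
            return *slot;
        }
        State getState(Addr a) const {   // absence is implicit Invalid
            auto it = entries.find(a);
            return it == entries.end() ? State::I : it->second->state;
        }
        void deallocate(Addr a) {        // only clean entries may go
            DirEntry &e = get(a);
            assert(e.state != State::I);
            assert(e.owner.empty() && e.sharers.empty());
            entries.erase(a);
        }
    };
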
@@ -240,8 +291,26 @@
   out_port(responseNetwork_out, ResponseMsg, responseFromDir);
   out_port(memQueue_out, MemoryMsg, requestToMemory);
 
+  // For inserting internal unblocks only
+  out_port(unblockNetwork_out_internal, ResponseMsg, responseToDir);
+
+  out_port(triggerQueue_out, TriggerMsg, triggerQueue);
+
   // ** IN_PORTS **
 
+  // Trigger Queue
+  in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=3) {
+    if (triggerQueue_in.isReady(clockEdge())) {
+      peek(triggerQueue_in, TriggerMsg) {
+        if (in_msg.Type == TriggerType:ALL_ACKS) {
+          trigger(Event:All_Acks, in_msg.addr, TBEs[in_msg.addr]);
+        } else {
+          error("Unexpected message");
+        }
+      }
+    }
+  }
+
   in_port(unblockNetwork_in, ResponseMsg, responseToDir, rank=2) {
     if (unblockNetwork_in.isReady(clockEdge())) {
       peek(unblockNetwork_in, ResponseMsg) {
@@ -292,8 +361,13 @@
           trigger(Event:DMA_READ, makeLineAddress(in_msg.addr),
                   TBEs[makeLineAddress(in_msg.addr)]);
         } else if (in_msg.Type == CoherenceRequestType:DMA_WRITE) {
-          trigger(Event:DMA_WRITE, makeLineAddress(in_msg.addr),
+          if (in_msg.Len == blockSize) {
+            assert(makeLineAddress(in_msg.addr) == in_msg.addr);
+            trigger(Event:DMA_WRITE_LINE, in_msg.addr, TBEs[in_msg.addr]);
+          } else {
+            trigger(Event:DMA_WRITE_PARTIAL, makeLineAddress(in_msg.addr),
                   TBEs[makeLineAddress(in_msg.addr)]);
+          }
         } else {
           error("Invalid message");
         }
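
The in_port change above splits DMA writes by length: a write covering a
whole, aligned line can go straight to memory, while anything shorter must
take a read-modify-write path. A short C++ sketch of the dispatch, where the
block size constant is an assumption standing in for the RubySystem value:

    #include <cassert>
    #include <cstdint>

    using Addr = uint64_t;
    constexpr int kBlockSize = 64;   // stands in for RubySystem block size

    inline Addr makeLineAddress(Addr a) { return a & ~Addr(kBlockSize - 1); }

    enum class Event { DMA_WRITE_LINE, DMA_WRITE_PARTIAL };

    Event classifyDmaWrite(Addr addr, int len) {
        if (len == kBlockSize) {
            // A full-line write must also be line aligned to skip the fetch.
            assert(makeLineAddress(addr) == addr);
            return Event::DMA_WRITE_LINE;
        }
        return Event::DMA_WRITE_PARTIAL;
    }
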
@@ -306,7 +380,12 @@
     if (memQueue_in.isReady(clockEdge())) {
       peek(memQueue_in, MemoryMsg) {
         if (in_msg.Type == MemoryRequestType:MEMORY_READ) {
-          trigger(Event:Memory_Data, in_msg.addr, TBEs[in_msg.addr]);
+          if (machineIDToMachineType(in_msg.OriginalRequestorMachId) ==
+              MachineType:L2Cache) {
+            trigger(Event:Memory_Data_Cache, in_msg.addr, TBEs[in_msg.addr]);
+          } else {
+            trigger(Event:Memory_Data_DMA, in_msg.addr, TBEs[in_msg.addr]);
+          }
         } else if (in_msg.Type == MemoryRequestType:MEMORY_WB) {
           trigger(Event:Memory_Ack, in_msg.addr, TBEs[in_msg.addr]);
         } else {
@@ -319,6 +398,14 @@
 
   // Actions
 
+  action(allocDirEntry, "alloc", desc="Allocate directory entry") {
+    allocateDirectoryEntry(address);
+  }
+
+  action(deallocDirEntry, "dealloc", desc="Deallocate directory entry") {
+    deallocateDirectoryEntry(address);
+  }
+
   action(a_sendWriteBackAck, "a", desc="Send writeback ack to requestor") {
     peek(requestQueue_in, RequestMsg) {
       enqueue(responseNetwork_out, ResponseMsg, directory_latency) {
@@ -345,6 +432,18 @@
     }
   }
 
+  action(clearDMA, "cD", desc="Clear DMA flag in TBE") {
+    assert(is_valid(tbe));
+    assert(tbe.WaitingDMAAck);
+    tbe.WaitingDMAAck := false;
+  }
+
+  action(clearWBAck, "cWb", desc="Clear WB ack flag in TBE") {
+    assert(is_valid(tbe));
+    assert(tbe.WaitingWBAck);
+    tbe.WaitingWBAck := false;
+  }
+
   action(c_clearOwner, "c", desc="Clear the owner field") {
     getDirectoryEntry(address).Owner.clear();
   }
@@ -360,6 +459,9 @@
 
   action(d_sendDataMsg, "d", desc="Send data to requestor") {
     peek(memQueue_in, MemoryMsg) {
+      // Not using the tbe here, but one must have been allocated for the
+      // memory request
+      assert(is_valid(tbe));
       enqueue(responseNetwork_out, ResponseMsg, 1) {
         out_msg.addr := address;
         out_msg.Sender := machineID;
@@ -367,7 +469,11 @@
         out_msg.Destination.add(in_msg.OriginalRequestorMachId);
         out_msg.DataBlk := in_msg.DataBlk;
         out_msg.Dirty := false; // By definition, the block is now clean
-        out_msg.Acks := in_msg.Acks;
+        if (getDirectoryEntry(in_msg.addr).Sharers.isElement(in_msg.OriginalRequestorMachId) == true) {
+          out_msg.Acks := (getDirectoryEntry(in_msg.addr).Sharers.count()) - 1;
+        } else {
+          out_msg.Acks := getDirectoryEntry(in_msg.addr).Sharers.count();
+        }
         if (in_msg.ReadX) {
           out_msg.Type := CoherenceResponseType:DATA_EXCLUSIVE;
         } else {
@@ -378,16 +484,15 @@
     }
   }
 
-  action(p_fwdDataToDMA, "\d", desc="Send data to requestor") {
-    peek(requestQueue_in, RequestMsg) {
-      enqueue(responseNetwork_out, ResponseMsg, 1) {
+  action(insertDMAUnblock, "idu", desc="Insert dummy DMA unblock") {
+    peek(memQueue_in, MemoryMsg) {
+      enqueue(unblockNetwork_out_internal, ResponseMsg, 1) {
         out_msg.addr := address;
-        out_msg.Sender := machineID;
-        out_msg.SenderMachine := MachineType:Directory;
-        out_msg.Destination.add(in_msg.Requestor);
-        out_msg.Dirty := false; // By definition, the block is now clean
-        out_msg.Type := CoherenceResponseType:DATA_EXCLUSIVE;
-        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.Type := CoherenceResponseType:UNBLOCK;
+        out_msg.Destination.add(machineID);
+        out_msg.Sender := in_msg.OriginalRequestorMachId;
+        out_msg.SenderMachine := MachineType:DMA;
+        out_msg.MessageSize := MessageSizeType:Writeback_Control;
       }
     }
   }
@@ -460,9 +565,26 @@
     unblockNetwork_in.dequeue(clockEdge());
   }
 
+  action(popTriggerQueue, "pt", desc="Pop trigger queue.") {
+    triggerQueue_in.dequeue(clockEdge());
+  }
+
+  action(checkForCompletion, "\o", desc="Check if we have received all the messages required for completion") {
+    assert(is_valid(tbe));
+    if ((tbe.WaitingDMAAck == false) &&
+        (tbe.WaitingWBAck == false)) {
+      enqueue(triggerQueue_out, TriggerMsg) {
+        out_msg.addr := address;
+        out_msg.Type := TriggerType:ALL_ACKS;
+      }
+    }
+  }
+
   action(m_addUnlockerToSharers, "m", desc="Add the unlocker to the sharer list") {
     peek(unblockNetwork_in, ResponseMsg) {
-      getDirectoryEntry(address).Sharers.add(in_msg.Sender);
+      if (in_msg.SenderMachine == MachineType:L2Cache) {
+        getDirectoryEntry(address).Sharers.add(in_msg.Sender);
+      }
     }
   }
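
checkForCompletion above implements a small join: each incoming message
clears its TBE flag and re-runs the check, so the ALL_ACKS trigger fires
exactly once, from whichever message arrives last. A minimal C++ sketch of
the pattern with illustrative names (not the SLICC/gem5 API):

    #include <functional>

    struct Tbe {
        bool waitingDmaAck = false;  // DMA ack sent, unblock still pending
        bool waitingWbAck = false;   // writeback sent, memory ack pending
    };

    // 'fire' stands in for enqueuing a TriggerType:ALL_ACKS message.
    void checkForCompletion(const Tbe &tbe,
                            const std::function<void()> &fire) {
        if (!tbe.waitingDmaAck && !tbe.waitingWbAck)
            fire();
    }

    // Usage: on Memory_Ack       -> waitingWbAck = false;  re-check.
    //        on Exclusive_Unblock -> waitingDmaAck = false; re-check.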
 
@@ -481,6 +603,7 @@
 
   action(qf_queueMemoryFetchRequest, "qf", desc="Queue off-chip fetch request") {
     peek(requestQueue_in, RequestMsg) {
+      assert(is_valid(tbe));
       enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) {
         out_msg.addr := address;
         out_msg.Type := MemoryRequestType:MEMORY_READ;
@@ -493,31 +616,24 @@
 
   action(qw_queueMemoryWBFromCacheRequest, "qw", desc="Queue off-chip writeback request") {
     peek(requestQueue_in, RequestMsg) {
-      if (is_valid(tbe)) {
-        enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) {
-          out_msg.addr := address;
-          out_msg.Type := MemoryRequestType:MEMORY_WB;
-          out_msg.Sender := tbe.Requestor;
-          out_msg.MessageSize := MessageSizeType:Writeback_Data;
-          out_msg.DataBlk := in_msg.DataBlk;
-          out_msg.Len := 0;
-        }
-      } else {
-        enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) {
-          out_msg.addr := address;
-          out_msg.Type := MemoryRequestType:MEMORY_WB;
-          out_msg.Sender := in_msg.Requestor;
-          out_msg.MessageSize := MessageSizeType:Writeback_Data;
-          out_msg.DataBlk := in_msg.DataBlk;
-          out_msg.Len := 0;
-        }
+      assert(is_valid(tbe));
+      enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) {
+        out_msg.addr := address;
+        out_msg.Type := MemoryRequestType:MEMORY_WB;
+        out_msg.Sender := in_msg.Requestor;
+        out_msg.MessageSize := MessageSizeType:Writeback_Data;
+        out_msg.DataBlk := in_msg.DataBlk;
+        out_msg.Len := 0;
       }
+      tbe.DataBlk := in_msg.DataBlk;
+      tbe.WaitingWBAck := true;
     }
   }
 
-  action(qw_queueMemoryWBRequestFromMessageAndTBE, "qwmt",
-    desc="Queue off-chip writeback request") {
+  action(qw_queueMemoryWBFromCacheResp, "qwcmt",
+    desc="Queue partial off-chip writeback request") {
     peek(unblockNetwork_in, ResponseMsg) {
+      assert(is_valid(tbe));
       DataBlock DataBlk := in_msg.DataBlk;
       DataBlk.copyPartial(tbe.DataBlk, getOffset(tbe.PhysicalAddress),
                           tbe.Len);
@@ -529,11 +645,34 @@
         out_msg.DataBlk := DataBlk;
         out_msg.Len := 0;
       }
+      tbe.DataBlk := DataBlk;
+      tbe.WaitingWBAck := true;
+    }
+  }
+
+  action(qw_queueMemoryWBFromMemResp, "qwmmt",
+    desc="Queue partial off-chip writeback request") {
+    peek(memQueue_in, MemoryMsg) {
+      assert(is_valid(tbe));
+      DataBlock DataBlk := in_msg.DataBlk;
+      DataBlk.copyPartial(tbe.DataBlk, getOffset(tbe.PhysicalAddress),
+                          tbe.Len);
+      enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) {
+        out_msg.addr := address;
+        out_msg.Type := MemoryRequestType:MEMORY_WB;
+        out_msg.Sender := tbe.Requestor;
+        out_msg.MessageSize := MessageSizeType:Writeback_Data;
+        out_msg.DataBlk := DataBlk;
+        out_msg.Len := 0;
+      }
+      tbe.DataBlk := DataBlk;
+      tbe.WaitingWBAck := true;
     }
   }
 
   action(qw_queueMemoryWBFromDMARequest, "/qw", desc="Queue off-chip writeback request") {
     peek(requestQueue_in, RequestMsg) {
+      assert(is_valid(tbe));
       enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) {
         out_msg.addr := address;
         out_msg.Type := MemoryRequestType:MEMORY_WB;
@@ -542,6 +681,8 @@
         out_msg.DataBlk := in_msg.DataBlk;
         out_msg.Len := 0;
       }
+      tbe.DataBlk := in_msg.DataBlk;
+      tbe.WaitingWBAck := true;
     }
   }
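
Both partial-writeback actions above perform the same read-modify-write
merge: splice the DMA's bytes (held in the TBE) into the full line obtained
from the cache or from memory, then write the merged line back. A C++
sketch of the merge, with assumed block size and offsets:

    #include <cstdint>
    #include <cstring>

    constexpr int kBlockSize = 64;
    struct DataBlock { uint8_t bytes[kBlockSize] = {}; };

    // Equivalent of SLICC's dst.copyPartial(src, offset, len): splice the
    // bytes [offset, offset + len) of src into the same range of dst.
    void copyPartial(DataBlock &dst, const DataBlock &src,
                     int offset, int len) {
        std::memcpy(dst.bytes + offset, src.bytes + offset, len);
    }

    // Overlay the DMA's partial bytes onto the fetched line; the merged
    // result is what gets queued for the memory writeback.
    DataBlock mergePartialWrite(const DataBlock &fullLine,
                                const DataBlock &tbeData,
                                int offset, int len) {
        DataBlock merged = fullLine;
        copyPartial(merged, tbeData, offset, len);
        return merged;
    }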
 
@@ -549,115 +690,166 @@
     requestQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
   }
 
-  action(a_sendDMAAck, "\a", desc="Send DMA Ack that write completed, along with Inv Ack count") {
+  action(a_sendDMAAckFromReq, "\a", desc="Send DMA Ack that write completed, along with Inv Ack count") {
     peek(requestQueue_in, RequestMsg) {
       enqueue(responseNetwork_out, ResponseMsg, 1) {
-      out_msg.addr := address;
-      out_msg.Sender := machineID;
-      out_msg.SenderMachine := MachineType:Directory;
-      out_msg.Destination.add(in_msg.Requestor);
-      out_msg.DataBlk := in_msg.DataBlk;
-      out_msg.Acks := getDirectoryEntry(address).Sharers.count();  // for dma requests
-      out_msg.Type := CoherenceResponseType:DMA_ACK;
-      out_msg.MessageSize := MessageSizeType:Writeback_Control;
+        assert(is_valid(tbe));
+        out_msg.addr := address;
+        out_msg.Sender := machineID;
+        out_msg.SenderMachine := MachineType:Directory;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.Acks := getDirectoryEntry(address).Sharers.count();  // for dma requests
+        out_msg.Type := CoherenceResponseType:DMA_ACK;
+        out_msg.MessageSize := MessageSizeType:Writeback_Control;
+        tbe.WaitingDMAAck := true;
       }
     }
   }
 
-  action(a_sendDMAAck2, "\aa", desc="Send DMA Ack that write completed, along with Inv Ack count") {
-    peek(unblockNetwork_in, ResponseMsg) {
-      enqueue(responseNetwork_out, ResponseMsg, 1) {
+  action(a_sendDMAAckFromTBE, "\aa", desc="Send DMA Ack that write completed, along with Inv Ack count") {
+    enqueue(responseNetwork_out, ResponseMsg, 1) {
+      assert(is_valid(tbe));
       out_msg.addr := address;
       out_msg.Sender := machineID;
       out_msg.SenderMachine := MachineType:Directory;
-      if (is_valid(tbe)) {
-        out_msg.Destination.add(tbe.Requestor);
-      }
-      out_msg.DataBlk := in_msg.DataBlk;
+      out_msg.Destination.add(tbe.Requestor);
       out_msg.Acks := getDirectoryEntry(address).Sharers.count();  // for dma requests
       out_msg.Type := CoherenceResponseType:DMA_ACK;
       out_msg.MessageSize := MessageSizeType:Writeback_Control;
-      }
+      tbe.WaitingDMAAck := true;
     }
   }
 
   action(v_allocateTBE, "v", desc="Allocate TBE entry") {
+    check_allocate(TBEs);
     peek (requestQueue_in, RequestMsg) {
+      assert(is_valid(tbe) == false);
       TBEs.allocate(address);
       set_tbe(TBEs[address]);
       tbe.PhysicalAddress := in_msg.addr;
       tbe.Len := in_msg.Len;
       tbe.DataBlk := in_msg.DataBlk;
       tbe.Requestor := in_msg.Requestor;
+      tbe.WaitingWBAck := false;
+      tbe.WaitingDMAAck := false;
     }
   }
 
   action(w_deallocateTBE, "w", desc="Deallocate TBE entry") {
+    assert(is_valid(tbe));
+    assert(tbe.WaitingWBAck == false);
+    assert(tbe.WaitingDMAAck == false);
     TBEs.deallocate(address);
     unset_tbe();
   }
 
 
   // TRANSITIONS
-  transition(I, GETX, MM) {
+  transition(I, GETX, MM_M) {
+    allocDirEntry;
+    v_allocateTBE;
     qf_queueMemoryFetchRequest;
     i_popIncomingRequestQueue;
   }
 
   transition(I, DMA_READ, XI_M) {
+    allocDirEntry;
+    v_allocateTBE;
     qf_queueMemoryFetchRequest;
     i_popIncomingRequestQueue;
   }
 
-  transition(I, DMA_WRITE, XI_U) {
+  transition(I, DMA_WRITE_LINE, XI_U) {
+    allocDirEntry;
+    v_allocateTBE;
     qw_queueMemoryWBFromDMARequest;
-    a_sendDMAAck;  // ack count may be zero
+    a_sendDMAAckFromReq;  // ack count may be zero
     i_popIncomingRequestQueue;
   }
 
-  transition(XI_M, Memory_Data, I) {
-    d_sendDataMsg;  // ack count may be zero
+  transition(I, DMA_WRITE_PARTIAL, XI_M_U) {
+    allocDirEntry;
+    v_allocateTBE;
+    qf_queueMemoryFetchRequest;
+    i_popIncomingRequestQueue;
+  }
+
+  transition(XI_M_U, Memory_Data_DMA, XI_U) {
+    qw_queueMemoryWBFromMemResp;
+    a_sendDMAAckFromTBE;  // ack count may be zero
     q_popMemQueue;
   }
 
-  transition(XI_U, Exclusive_Unblock, I) {
+  transition(XI_M, Memory_Data_DMA, I) {
+    d_sendDataMsg;  // ack count may be zero
+    deallocDirEntry;
+    w_deallocateTBE;
+    q_popMemQueue;
+  }
+
+  transition(XI_U, Exclusive_Unblock, XI_U) {
     cc_clearSharers;
     c_clearOwner;
+    clearDMA;
+    checkForCompletion;
     j_popIncomingUnblockQueue;
   }
 
-  transition(S, GETX, MM) {
+  transition(XI_U, Memory_Ack, XI_U) {
+    clearWBAck;
+    checkForCompletion;
+    q_popMemQueue;
+  }
+
+  transition(XI_U, All_Acks, I) {
+    deallocDirEntry;
+    w_deallocateTBE;
+    popTriggerQueue;
+  }
+
+  transition(S, GETX, MM_M) {
+    v_allocateTBE;
     qf_queueMemoryFetchRequest;
     g_sendInvalidations;
     i_popIncomingRequestQueue;
   }
 
-  transition(S, DMA_READ) {
-    //qf_queueMemoryFetchRequest;
-    p_fwdDataToDMA;
-    //g_sendInvalidations;  // the DMA will collect the invalidations then send an Unblock Exclusive
-    i_popIncomingRequestQueue;
-  }
-
-  transition(S, DMA_WRITE, XI_U) {
+  transition(S, DMA_WRITE_LINE, XI_U) {
+    v_allocateTBE;
     qw_queueMemoryWBFromDMARequest;
-    a_sendDMAAck;  // ack count may be zero
+    a_sendDMAAckFromReq;  // ack count may be zero
     g_sendInvalidations;  // the DMA will collect invalidations
     i_popIncomingRequestQueue;
   }
 
-  transition(I, GETS, IS) {
+  transition(S, DMA_WRITE_PARTIAL, XI_M_U) {
+    v_allocateTBE;
+    qf_queueMemoryFetchRequest;
+    g_sendInvalidations;
+    i_popIncomingRequestQueue;
+  }
+
+  transition(I, GETS, IS_M) {
+    allocDirEntry;
+    v_allocateTBE;
     qf_queueMemoryFetchRequest;
     i_popIncomingRequestQueue;
   }
 
-  transition({S, SS}, GETS, SS) {
+  transition(S, {GETS, DMA_READ}, SS) {
+    v_allocateTBE;
     qf_queueMemoryFetchRequest;
     n_incrementOutstanding;
     i_popIncomingRequestQueue;
   }
 
-  transition({I, S}, PUTO) {
+  transition(SS, {GETS, DMA_READ}) {
+    qf_queueMemoryFetchRequest;
+    n_incrementOutstanding;
+    i_popIncomingRequestQueue;
+  }
+
+  transition({I, S}, {PUTO, PUTO_SHARERS}) {
     b_sendWriteBackNack;
     i_popIncomingRequestQueue;
   }
@@ -675,7 +867,6 @@
 
   transition(O, DMA_READ, OD) {
     f_forwardRequest;     // this will cause the data to go to DMA directly
-    //g_sendInvalidations;  // this will cause acks to be sent to the DMA
     i_popIncomingRequestQueue;
   }
 
@@ -683,7 +874,7 @@
     j_popIncomingUnblockQueue;
   }
 
-  transition({O,M}, DMA_WRITE, OI_D) {
+  transition({O,M}, {DMA_WRITE_LINE, DMA_WRITE_PARTIAL}, OI_D) {
     f_forwardRequestDirIsRequestor;    // need the modified data before we can proceed
     g_sendInvalidations;               // these go to the DMA Controller
     v_allocateTBE;
@@ -691,9 +882,8 @@
   }
 
   transition(OI_D, Data, XI_U) {
-    qw_queueMemoryWBRequestFromMessageAndTBE;
-    a_sendDMAAck2;  // ack count may be zero
-    w_deallocateTBE;
+    qw_queueMemoryWBFromCacheResp;
+    a_sendDMAAckFromTBE;  // ack count may be zero
     j_popIncomingUnblockQueue;
   }
 
@@ -719,22 +909,26 @@
   }
 
   transition(M, GETS, MO) {
+    v_allocateTBE;
     f_forwardRequest;
     i_popIncomingRequestQueue;
   }
 
   transition(M, PUTX, MI) {
+    v_allocateTBE;
     a_sendWriteBackAck;
     i_popIncomingRequestQueue;
   }
 
   // happens if M->O transition happens on-chip
   transition(M, PUTO, MI) {
+    v_allocateTBE;
     a_sendWriteBackAck;
     i_popIncomingRequestQueue;
   }
 
   transition(M, PUTO_SHARERS, MIS) {
+    v_allocateTBE;
     a_sendWriteBackAck;
     i_popIncomingRequestQueue;
   }
@@ -750,35 +944,39 @@
   }
 
 
-  transition({MM, MO, MI, MIS, OS, OSS, XI_M, XI_U, OI_D, OD, MD}, {GETS, GETX, PUTO, PUTO_SHARERS, PUTX, DMA_READ, DMA_WRITE}) {
+  transition({MM_M, MM, MO, MI, MIS, OS, OSS, WBI, WBS, XI_M, XI_M_U, XI_U, OI_D, OD, MD}, {GETS, GETX, PUTO, PUTO_SHARERS, PUTX, DMA_READ, DMA_WRITE_LINE, DMA_WRITE_PARTIAL}) {
     zz_recycleRequest;
   }
 
   transition({MM, MO}, Exclusive_Unblock, M) {
+    w_deallocateTBE;
     cc_clearSharers;
     e_ownerIsUnblocker;
     j_popIncomingUnblockQueue;
   }
 
   transition(MO, Unblock, O) {
+    w_deallocateTBE;
     m_addUnlockerToSharers;
     j_popIncomingUnblockQueue;
   }
 
-  transition({IS, SS, OO}, {GETX, PUTO, PUTO_SHARERS, PUTX, DMA_READ, DMA_WRITE}) {
+  transition({IS, IS_M, SS, OO}, {GETX, PUTO, PUTO_SHARERS, PUTX, DMA_WRITE_LINE, DMA_WRITE_PARTIAL}) {
     zz_recycleRequest;
   }
 
-  transition(IS, GETS) {
+  transition({IS, IS_M}, {GETS, DMA_READ}) {
     zz_recycleRequest;
   }
 
   transition(IS, Unblock, S) {
+    w_deallocateTBE;
     m_addUnlockerToSharers;
     j_popIncomingUnblockQueue;
   }
 
   transition(IS, Exclusive_Unblock, M) {
+    w_deallocateTBE;
     cc_clearSharers;
     e_ownerIsUnblocker;
     j_popIncomingUnblockQueue;
@@ -791,6 +989,7 @@
   }
 
   transition(SS, Last_Unblock, S) {
+    w_deallocateTBE;
     m_addUnlockerToSharers;
     o_decrementOutstanding;
     j_popIncomingUnblockQueue;
@@ -808,14 +1007,21 @@
     j_popIncomingUnblockQueue;
   }
 
-  transition(MI, Dirty_Writeback, I) {
+  transition(MI, Dirty_Writeback, WBI) {
     c_clearOwner;
     cc_clearSharers;
     qw_queueMemoryWBFromCacheRequest;
     i_popIncomingRequestQueue;
   }
 
-  transition(MIS, Dirty_Writeback, S) {
+  transition(WBI, Memory_Ack, I) {
+    clearWBAck;
+    w_deallocateTBE;
+    deallocDirEntry;
+    q_popMemQueue;
+  }
+
+  transition(MIS, Dirty_Writeback, WBS) {
     c_moveOwnerToSharer;
     qw_queueMemoryWBFromCacheRequest;
     i_popIncomingRequestQueue;
@@ -823,21 +1029,30 @@
 
   transition(MIS, Clean_Writeback, S) {
     c_moveOwnerToSharer;
+    w_deallocateTBE;
     i_popIncomingRequestQueue;
   }
 
-  transition(OS, Dirty_Writeback, S) {
+  transition(OS, Dirty_Writeback, WBS) {
     c_clearOwner;
+    v_allocateTBE;
     qw_queueMemoryWBFromCacheRequest;
     i_popIncomingRequestQueue;
   }
 
-  transition(OSS, Dirty_Writeback, S) {
+  transition(OSS, Dirty_Writeback, WBS) {
     c_moveOwnerToSharer;
+    v_allocateTBE;
     qw_queueMemoryWBFromCacheRequest;
     i_popIncomingRequestQueue;
   }
 
+  transition(WBS, Memory_Ack, S) {
+    clearWBAck;
+    w_deallocateTBE;
+    q_popMemQueue;
+  }
+
   transition(OSS, Clean_Writeback, S) {
     c_moveOwnerToSharer;
     i_popIncomingRequestQueue;
@@ -846,6 +1061,8 @@
   transition(MI, Clean_Writeback, I) {
     c_clearOwner;
     cc_clearSharers;
+    w_deallocateTBE;
+    deallocDirEntry;
     i_popIncomingRequestQueue;
   }
 
@@ -854,21 +1071,24 @@
     i_popIncomingRequestQueue;
   }
 
-  transition({MI, MIS}, Unblock, M) {
-    j_popIncomingUnblockQueue;
-  }
-
-  transition({OS, OSS}, Unblock, O) {
-    j_popIncomingUnblockQueue;
-  }
-
-  transition({I, S, O, M, IS, SS, OO, MO, MM, MI, MIS, OS, OSS}, Memory_Data) {
+  transition({S, SS}, Memory_Data_Cache) {
     d_sendDataMsg;
     q_popMemQueue;
   }
 
-  transition({I, S, O, M, IS, SS, OO, MO, MM, MI, MIS, OS, OSS, XI_U, XI_M}, Memory_Ack) {
-    //a_sendAck;
+  transition(IS_M, Memory_Data_Cache, IS) {
+    d_sendDataMsg;
+    q_popMemQueue;
+  }
+
+  transition(MM_M, Memory_Data_Cache, MM) {
+    d_sendDataMsg;
+    q_popMemQueue;
+  }
+
+  transition(SS, Memory_Data_DMA) {
+    d_sendDataMsg;
+    insertDMAUnblock; // DMA will not send unblocks in response to reads
     q_popMemQueue;
   }
 
diff --git a/src/mem/ruby/protocol/MOESI_CMP_directory-dma.sm b/src/mem/ruby/protocol/MOESI_CMP_directory-dma.sm
index a3a9f63..c2eb593 100644
--- a/src/mem/ruby/protocol/MOESI_CMP_directory-dma.sm
+++ b/src/mem/ruby/protocol/MOESI_CMP_directory-dma.sm
@@ -100,6 +100,7 @@
   }
 
   AccessPermission getAccessPermission(Addr addr) {
+    DPRINTF(RubySlicc, "AccessPermission_NotPresent\n");
     return AccessPermission:NotPresent;
   }
 
@@ -192,7 +193,7 @@
           out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
           out_msg.Requestor := machineID;
           out_msg.RequestorMachine := MachineType:DMA;
-          out_msg.MessageSize := MessageSizeType:Writeback_Control;
+          out_msg.MessageSize := MessageSizeType:Data;
         }
       }
   }
@@ -254,6 +255,7 @@
   }
 
   action(v_allocateTBE, "v", desc="Allocate TBE entry") {
+    check_allocate(TBEs);
     TBEs.allocate(address);
     set_tbe(TBEs[address]);
   }
diff --git a/src/mem/ruby/protocol/MOESI_CMP_directory-msg.sm b/src/mem/ruby/protocol/MOESI_CMP_directory-msg.sm
index 7dc5822..2dd34e4 100644
--- a/src/mem/ruby/protocol/MOESI_CMP_directory-msg.sm
+++ b/src/mem/ruby/protocol/MOESI_CMP_directory-msg.sm
@@ -109,9 +109,7 @@
 
   bool functionalRead(Packet *pkt) {
     // Read only those messages that contain the data
-    if (Type == CoherenceRequestType:DMA_READ ||
-        Type == CoherenceRequestType:DMA_WRITE ||
-        Type == CoherenceRequestType:WRITEBACK_CLEAN_DATA ||
+    if (Type == CoherenceRequestType:WRITEBACK_CLEAN_DATA ||
         Type == CoherenceRequestType:WRITEBACK_DIRTY_DATA) {
         return testAndRead(addr, DataBlk, pkt);
     }
diff --git a/src/mem/ruby/protocol/RubySlicc_Types.sm b/src/mem/ruby/protocol/RubySlicc_Types.sm
index e3a136f..b59cf97 100644
--- a/src/mem/ruby/protocol/RubySlicc_Types.sm
+++ b/src/mem/ruby/protocol/RubySlicc_Types.sm
@@ -49,9 +49,13 @@
 //
 
 external_type(MessageBuffer, buffer="yes", inport="yes", outport="yes");
-external_type(OutPort, primitive="yes");
 external_type(Scalar, primitive="yes");
 
+structure(OutPort, external = "yes", primitive="yes") {
+    void enqueueDeferredMessages(Addr addr, Tick curTime, Tick delay);
+    bool isDeferredMsgMapEmpty(Addr addr);
+}
+
 structure(InPort, external = "yes", primitive="yes") {
   bool isReady(Tick current_time);
   Tick dequeue(Tick current_time);
@@ -59,6 +63,7 @@
   bool isEmpty();
   bool isStallMapEmpty();
   int getStallMapSize();
+  bool hasStalledMsg(Addr addr);
 }
 
 external_type(NodeID, default="0", primitive="yes");
@@ -128,7 +133,6 @@
   void writeCallbackScFail(Addr, DataBlock);
   bool llscCheckMonitor(Addr);
 
-  void checkCoherence(Addr);
   void evictionCallback(Addr);
   void recordRequestType(SequencerRequestType);
   bool checkResourceAvailable(CacheResourceType, Addr);
@@ -147,7 +151,6 @@
                      Cycles, Cycles, Cycles);
   void writeCallback(Addr, MachineType, DataBlock,
                      Cycles, Cycles, Cycles, bool);
-  void checkCoherence(Addr);
   void evictionCallback(Addr);
   void recordCPReadCallBack(MachineID, MachineID);
   void recordCPWriteCallBack(MachineID, MachineID);
@@ -168,7 +171,6 @@
                      Cycles, Cycles, Cycles, bool);
   void invCallback(Addr);
   void wbCallback(Addr);
-  void checkCoherence(Addr);
   void evictionCallback(Addr);
 }
 
@@ -196,6 +198,7 @@
 structure (DirectoryMemory, external = "yes") {
   AbstractCacheEntry allocate(Addr, AbstractCacheEntry);
   AbstractCacheEntry lookup(Addr);
+  void deallocate(Addr);
   bool isPresent(Addr);
   void invalidateBlock(Addr);
   void recordRequestType(DirectoryRequestType);
diff --git a/src/mem/ruby/slicc_interface/AbstractController.cc b/src/mem/ruby/slicc_interface/AbstractController.cc
index 59611ae..b729d26 100644
--- a/src/mem/ruby/slicc_interface/AbstractController.cc
+++ b/src/mem/ruby/slicc_interface/AbstractController.cc
@@ -56,7 +56,7 @@
       m_transitions_per_cycle(p->transitions_per_cycle),
       m_buffer_size(p->buffer_size), m_recycle_latency(p->recycle_latency),
       m_mandatory_queue_latency(p->mandatory_queue_latency),
-      memoryPort(csprintf("%s.memory", name()), this, ""),
+      memoryPort(csprintf("%s.memory", name()), this),
       addrRanges(p->addr_ranges.begin(), p->addr_ranges.end())
 {
     if (m_version == 0) {
@@ -250,12 +250,15 @@
         // to make more progress. Make sure it wakes up
         scheduleEvent(Cycles(1));
         recvTimingResp(pkt);
-    } else {
+    } else if (memoryPort.sendTimingReq(pkt)) {
         mem_queue->dequeue(clockEdge());
-        memoryPort.schedTimingReq(pkt, clockEdge());
         // Since the queue was popped the controller may be able
         // to make more progress. Make sure it wakes up
         scheduleEvent(Cycles(1));
+    } else {
+        scheduleEvent(Cycles(1));
+        delete pkt;
+        delete s;
     }
 
     return true;
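
With the move from a queued port to a bare master port, sendTimingReq() may
be refused, and the controller must retry from recvReqRetry(). A simplified
C++ sketch of that handshake; note the real change above frees the refused
packet and rebuilds it later, whereas this sketch keeps the head request
queued for brevity (names are illustrative, not the gem5 Port API):

    #include <deque>

    struct Packet {};
    struct Peer {
        bool busy = false;
        bool accept(Packet *) { return !busy; }
    };

    struct Controller {
        Peer peer;
        std::deque<Packet *> memQueue;

        // Issue from the head; only dequeue on acceptance, so a refused
        // request is naturally re-issued on the next attempt.
        void serviceMemoryQueue() {
            while (!memQueue.empty()) {
                if (!peer.accept(memQueue.front()))
                    return;              // wait for recvReqRetry()
                memQueue.pop_front();
            }
        }

        // The peer calls this once it can sink another request.
        void recvReqRetry() { serviceMemoryQueue(); }
    };
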
@@ -306,11 +309,6 @@
 {
     int num_functional_writes = 0;
 
-    // Check the buffer from the controller to the memory.
-    if (memoryPort.trySatisfyFunctional(pkt)) {
-        num_functional_writes++;
-    }
-
     // Update memory itself.
     memoryPort.sendFunctional(pkt);
     return num_functional_writes + 1;
@@ -369,12 +367,15 @@
     return true;
 }
 
+void
+AbstractController::MemoryPort::recvReqRetry()
+{
+    controller->serviceMemoryQueue();
+}
+
 AbstractController::MemoryPort::MemoryPort(const std::string &_name,
                                            AbstractController *_controller,
-                                           const std::string &_label)
-    : QueuedMasterPort(_name, _controller, reqQueue, snoopRespQueue),
-      reqQueue(*_controller, *this, _label),
-      snoopRespQueue(*_controller, *this, false, _label),
-      controller(_controller)
+                                           PortID id)
+    : MasterPort(_name, _controller, id), controller(_controller)
 {
 }
diff --git a/src/mem/ruby/slicc_interface/AbstractController.hh b/src/mem/ruby/slicc_interface/AbstractController.hh
index 15aff12..1577cfa 100644
--- a/src/mem/ruby/slicc_interface/AbstractController.hh
+++ b/src/mem/ruby/slicc_interface/AbstractController.hh
@@ -228,26 +228,25 @@
 
     /**
      * Port that forwards requests and receives responses from the
-     * memory controller.  It has a queue of packets not yet sent.
+     * memory controller.
      */
-    class MemoryPort : public QueuedMasterPort
+    class MemoryPort : public MasterPort
     {
       private:
-        // Packet queues used to store outgoing requests and snoop responses.
-        ReqPacketQueue reqQueue;
-        SnoopRespPacketQueue snoopRespQueue;
-
         // Controller that operates this port.
         AbstractController *controller;
 
       public:
         MemoryPort(const std::string &_name, AbstractController *_controller,
-                   const std::string &_label);
+                   PortID id = InvalidPortID);
 
+      protected:
         // Function for receiving a timing response from the peer port.
         // Currently the pkt is handed to the coherence controller
         // associated with this port.
         bool recvTimingResp(PacketPtr pkt);
+
+        void recvReqRetry();
     };
 
     /* Master port to the memory controller. */
diff --git a/src/mem/ruby/structures/DirectoryMemory.cc b/src/mem/ruby/structures/DirectoryMemory.cc
index e2ee0fc..c6e3ccf 100644
--- a/src/mem/ruby/structures/DirectoryMemory.cc
+++ b/src/mem/ruby/structures/DirectoryMemory.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited
+ * Copyright (c) 2017,2019 ARM Limited
  * All rights reserved.
  *
  * The license below extends only to copyright in the software and shall
@@ -127,6 +127,7 @@
 
     idx = mapAddressToLocalIdx(address);
     assert(idx < m_num_entries);
+    assert(m_entries[idx] == NULL);
     entry->changePermission(AccessPermission_Read_Only);
     m_entries[idx] = entry;
 
@@ -134,6 +135,20 @@
 }
 
 void
+DirectoryMemory::deallocate(Addr address)
+{
+    assert(isPresent(address));
+    uint64_t idx;
+    DPRINTF(RubyCache, "Removing entry for address: %#x\n", address);
+
+    idx = mapAddressToLocalIdx(address);
+    assert(idx < m_num_entries);
+    assert(m_entries[idx] != NULL);
+    delete m_entries[idx];
+    m_entries[idx] = NULL;
+}
+
+void
 DirectoryMemory::print(ostream& out) const
 {
 }
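Both allocate() and the new deallocate() now assert their preconditions, so a
protocol that double-allocates or frees an absent directory entry fails fast.
A small sketch of the same ownership discipline, with hypothetical Entry and
Directory types standing in for AbstractCacheEntry and DirectoryMemory:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    struct Entry { /* permission, owner, sharers, ... */ };

    class Directory {
        // One owned pointer per directory-mapped line; nullptr == free.
        std::vector<Entry*> entries;
      public:
        explicit Directory(uint64_t n) : entries(n, nullptr) {}

        void allocate(uint64_t idx, Entry* e) {
            assert(idx < entries.size());
            assert(entries[idx] == nullptr);   // no double allocation
            entries[idx] = e;
        }

        // Mirrors DirectoryMemory::deallocate: the entry must exist and
        // the directory owns (and deletes) it.
        void deallocate(uint64_t idx) {
            assert(idx < entries.size());
            assert(entries[idx] != nullptr);
            delete entries[idx];
            entries[idx] = nullptr;
        }
    };

    int main() {
        Directory dir(16);
        dir.allocate(3, new Entry);
        dir.deallocate(3);   // slot 3 can now be reallocated safely
    }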
diff --git a/src/mem/ruby/structures/DirectoryMemory.hh b/src/mem/ruby/structures/DirectoryMemory.hh
index f879b29..3dd0e95 100644
--- a/src/mem/ruby/structures/DirectoryMemory.hh
+++ b/src/mem/ruby/structures/DirectoryMemory.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited
+ * Copyright (c) 2017,2019 ARM Limited
  * All rights reserved.
  *
  * The license below extends only to copyright in the software and shall
@@ -79,6 +79,9 @@
     AbstractCacheEntry *lookup(Addr address);
     AbstractCacheEntry *allocate(Addr address, AbstractCacheEntry* new_entry);
 
+    // Explicitly free the entry for this address
+    void deallocate(Addr address);
+
     void print(std::ostream& out) const;
     void recordRequestType(DirectoryRequestType requestType);
 
diff --git a/src/mem/ruby/structures/PerfectCacheMemory.hh b/src/mem/ruby/structures/PerfectCacheMemory.hh
index 363e3e8..9898995 100644
--- a/src/mem/ruby/structures/PerfectCacheMemory.hh
+++ b/src/mem/ruby/structures/PerfectCacheMemory.hh
@@ -1,4 +1,16 @@
 /*
+ * Copyright (c) 2019 ARM Limited
+ * All rights reserved.
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder.  You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
  * Copyright (c) 1999-2008 Mark D. Hill and David A. Wood
  * All rights reserved.
  *
@@ -138,7 +150,8 @@
 inline void
 PerfectCacheMemory<ENTRY>::deallocate(Addr address)
 {
-    m_map.erase(makeLineAddress(address));
+    auto num_erased M5_VAR_USED = m_map.erase(makeLineAddress(address));
+    assert(num_erased == 1);
 }
 
 // Returns with the physical address of the conflicting cache line
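The stricter deallocate() asserts that exactly one line was erased, with
M5_VAR_USED silencing the unused-variable warning in builds where the assert
compiles away. The same pattern in standalone C++, using [[maybe_unused]] in
place of gem5's macro:

    #include <cassert>
    #include <map>

    int main() {
        std::map<long, int> lines{{0x40, 1}};
        // map::erase(key) returns how many elements were removed (0 or 1),
        // so asserting == 1 catches freeing a line never allocated.
        [[maybe_unused]] auto num_erased = lines.erase(0x40);
        assert(num_erased == 1);
    }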
diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc
index 93275cb..0153b4c 100644
--- a/src/mem/ruby/system/GPUCoalescer.cc
+++ b/src/mem/ruby/system/GPUCoalescer.cc
@@ -113,11 +113,95 @@
     return accessSegment;
 }
 
+UncoalescedTable::UncoalescedTable(GPUCoalescer *gc)
+    : coalescer(gc)
+{
+}
+
+void
+UncoalescedTable::insertPacket(PacketPtr pkt)
+{
+    uint64_t seqNum = pkt->req->getReqInstSeqNum();
+
+    instMap[seqNum].push_back(pkt);
+    DPRINTF(GPUCoalescer, "Adding 0x%X seqNum %d to map. (map %d vec %d)\n",
+            pkt->getAddr(), seqNum, instMap.size(), instMap[seqNum].size());
+}
+
+bool
+UncoalescedTable::packetAvailable()
+{
+    return !instMap.empty();
+}
+
+PerInstPackets*
+UncoalescedTable::getInstPackets(int offset)
+{
+    if (offset >= instMap.size()) {
+        return nullptr;
+    }
+
+    auto instMapIter = instMap.begin();
+    std::advance(instMapIter, offset);
+
+    return &(instMapIter->second);
+}
+
+void
+UncoalescedTable::updateResources()
+{
+    for (auto iter = instMap.begin(); iter != instMap.end(); ) {
+        if (iter->second.empty()) {
+            instMap.erase(iter++);
+            coalescer->getGMTokenPort().sendTokens(1);
+        } else {
+            ++iter;
+        }
+    }
+}
+
+void
+UncoalescedTable::printRequestTable(std::stringstream& ss)
+{
+    ss << "UncoalescedTable contains " << instMap.size()
+       << " address entries." << std::endl;
+    for (auto& inst : instMap) {
+        ss << "Addr 0x" << std::hex << inst.first << std::dec
+           << " with " << inst.second.size() << " packets"
+           << std::endl;
+    }
+}
+
+void
+UncoalescedTable::checkDeadlock(Tick threshold)
+{
+    Tick current_time = curTick();
+
+    for (auto &it : instMap) {
+        for (auto &pkt : it.second) {
+            if (current_time - pkt->req->time() > threshold) {
+                std::stringstream ss;
+                printRequestTable(ss);
+
+                panic("Possible Deadlock detected. Aborting!\n"
+                     "version: %d request.paddr: 0x%x uncoalescedTable: %d "
+                     "current time: %u issue_time: %d difference: %d\n"
+                     "Request Tables:\n\n%s", coalescer->getId(),
+                      pkt->getAddr(), instMap.size(), current_time,
+                      pkt->req->time(), current_time - pkt->req->time(),
+                      ss.str());
+            }
+        }
+    }
+}
+
 GPUCoalescer::GPUCoalescer(const Params *p)
     : RubyPort(p),
       issueEvent([this]{ completeIssue(); }, "Issue coalesced request",
                  false, Event::Progress_Event_Pri),
-      deadlockCheckEvent([this]{ wakeup(); }, "GPUCoalescer deadlock check")
+      uncoalescedTable(this),
+      deadlockCheckEvent([this]{ wakeup(); }, "GPUCoalescer deadlock check"),
+      gmTokenPort(name() + ".gmTokenPort", this)
 {
     m_store_waiting_on_load_cycles = 0;
     m_store_waiting_on_store_cycles = 0;
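The uncoalesced table keys packets by instruction sequence number, hands them
out oldest-first, and frees one token per fully drained instruction. A
compressed sketch of that map discipline using standard containers only (the
integer packet ids are stand-ins for PacketPtr):

    #include <cstdint>
    #include <iostream>
    #include <list>
    #include <map>

    using PerInstPackets = std::list<int>;

    int main() {
        // An ordered map makes begin() the oldest sequence number, which
        // is what getInstPackets(offset) relies on for age-order issue.
        std::map<uint64_t, PerInstPackets> instMap;
        instMap[7].push_back(100);        // insertPacket
        instMap[7].push_back(101);
        instMap[9].push_back(200);

        instMap.begin()->second.clear();  // pretend seqNum 7 coalesced

        // updateResources: erase drained instructions, one token apiece.
        int tokens = 0;
        for (auto iter = instMap.begin(); iter != instMap.end(); ) {
            if (iter->second.empty()) { instMap.erase(iter++); ++tokens; }
            else ++iter;
        }
        std::cout << "tokens returned: " << tokens
                  << ", instructions left: " << instMap.size() << "\n";
    }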
@@ -126,8 +210,9 @@
 
     m_outstanding_count = 0;
 
+    coalescingWindow = p->max_coalesces_per_cycle;
+
     m_max_outstanding_requests = 0;
-    m_deadlock_threshold = 0;
     m_instCache_ptr = nullptr;
     m_dataCache_ptr = nullptr;
 
@@ -149,52 +234,46 @@
 {
 }
 
+Port &
+GPUCoalescer::getPort(const std::string &if_name, PortID idx)
+{
+    if (if_name == "gmTokenPort") {
+        return gmTokenPort;
+    }
+
+    // delegate to RubyPort otherwise
+    return RubyPort::getPort(if_name, idx);
+}
+
 void
 GPUCoalescer::wakeup()
 {
-    // Check for deadlock of any of the requests
     Cycles current_time = curCycle();
+    for (auto& requestList : coalescedTable) {
+        for (auto& req : requestList.second) {
+            if (current_time - req->getIssueTime() > m_deadlock_threshold) {
+                std::stringstream ss;
+                printRequestTable(ss);
+                ss << "Outstanding requests: " << m_outstanding_count
+                   << std::endl;
 
-    // Check across all outstanding requests
-    int total_outstanding = 0;
-
-    RequestTable::iterator read = m_readRequestTable.begin();
-    RequestTable::iterator read_end = m_readRequestTable.end();
-    for (; read != read_end; ++read) {
-        GPUCoalescerRequest* request = read->second;
-        if (current_time - request->issue_time < m_deadlock_threshold)
-            continue;
-
-        panic("Possible Deadlock detected. Aborting!\n"
-             "version: %d request.paddr: 0x%x m_readRequestTable: %d "
-             "current time: %u issue_time: %d difference: %d\n", m_version,
-              request->pkt->getAddr(), m_readRequestTable.size(),
-              current_time * clockPeriod(), request->issue_time * clockPeriod(),
-              (current_time - request->issue_time)*clockPeriod());
+                panic("Possible Deadlock detected. Aborting!\n"
+                     "version: %d request.paddr: 0x%x coalescedTable: %d "
+                     "current time: %u issue_time: %d difference: %d\n"
+                     "Request Tables:\n %s", m_version,
+                      req->getFirstPkt()->getAddr(),
+                      coalescedTable.size(), cyclesToTicks(current_time),
+                      cyclesToTicks(req->getIssueTime()),
+                      cyclesToTicks(current_time - req->getIssueTime()),
+                      ss.str());
+            }
+        }
     }
 
-    RequestTable::iterator write = m_writeRequestTable.begin();
-    RequestTable::iterator write_end = m_writeRequestTable.end();
-    for (; write != write_end; ++write) {
-        GPUCoalescerRequest* request = write->second;
-        if (current_time - request->issue_time < m_deadlock_threshold)
-            continue;
-
-        panic("Possible Deadlock detected. Aborting!\n"
-             "version: %d request.paddr: 0x%x m_writeRequestTable: %d "
-             "current time: %u issue_time: %d difference: %d\n", m_version,
-              request->pkt->getAddr(), m_writeRequestTable.size(),
-              current_time * clockPeriod(), request->issue_time * clockPeriod(),
-              (current_time - request->issue_time) * clockPeriod());
-    }
-
-    total_outstanding += m_writeRequestTable.size();
-    total_outstanding += m_readRequestTable.size();
-
-    assert(m_outstanding_count == total_outstanding);
+    Tick tick_threshold = cyclesToTicks(m_deadlock_threshold);
+    uncoalescedTable.checkDeadlock(tick_threshold);
 
     if (m_outstanding_count > 0) {
-        // If there are still outstanding requests, keep checking
         schedule(deadlockCheckEvent,
                  m_deadlock_threshold * clockPeriod() +
                  curTick());
@@ -202,6 +281,26 @@
 }
 
 void
+GPUCoalescer::printRequestTable(std::stringstream& ss)
+{
+    uncoalescedTable.printRequestTable(ss);
+
+    ss << "CoalescedTable contains " << coalescedTable.size()
+       << " address entries." << std::endl;
+    for (auto& requestList : coalescedTable) {
+        ss << "Addr 0x" << std::hex << requestList.first << std::dec
+           << ": type-";
+        for (auto& request : requestList.second) {
+            ss << RubyRequestType_to_string(request->getRubyType())
+               << " pkts-" << request->getPackets().size()
+               << " issued-" << request->getIssueTime() << " seqNum-"
+               << request->getSeqNum() << "; ";
+        }
+        ss << std::endl;
+    }
+}
+
+void
 GPUCoalescer::resetStats()
 {
     m_latencyHist.reset();
@@ -229,65 +328,6 @@
 {
 }
 
-RequestStatus
-GPUCoalescer::getRequestStatus(PacketPtr pkt, RubyRequestType request_type)
-{
-    Addr line_addr = makeLineAddress(pkt->getAddr());
-
-    if (!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())) {
-        return RequestStatus_BufferFull;
-    }
-
-    if (m_controller->isBlocked(line_addr) &&
-       request_type != RubyRequestType_Locked_RMW_Write) {
-        return RequestStatus_Aliased;
-    }
-
-    if ((request_type == RubyRequestType_ST) ||
-        (request_type == RubyRequestType_ATOMIC) ||
-        (request_type == RubyRequestType_ATOMIC_RETURN) ||
-        (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
-        (request_type == RubyRequestType_RMW_Read) ||
-        (request_type == RubyRequestType_RMW_Write) ||
-        (request_type == RubyRequestType_Load_Linked) ||
-        (request_type == RubyRequestType_Store_Conditional) ||
-        (request_type == RubyRequestType_Locked_RMW_Read) ||
-        (request_type == RubyRequestType_Locked_RMW_Write) ||
-        (request_type == RubyRequestType_FLUSH)) {
-
-        // Check if there is any outstanding read request for the same
-        // cache line.
-        if (m_readRequestTable.count(line_addr) > 0) {
-            m_store_waiting_on_load_cycles++;
-            return RequestStatus_Aliased;
-        }
-
-        if (m_writeRequestTable.count(line_addr) > 0) {
-          // There is an outstanding write request for the cache line
-          m_store_waiting_on_store_cycles++;
-          return RequestStatus_Aliased;
-        }
-    } else {
-        // Check if there is any outstanding write request for the same
-        // cache line.
-        if (m_writeRequestTable.count(line_addr) > 0) {
-            m_load_waiting_on_store_cycles++;
-            return RequestStatus_Aliased;
-        }
-
-        if (m_readRequestTable.count(line_addr) > 0) {
-            // There is an outstanding read request for the cache line
-            m_load_waiting_on_load_cycles++;
-            return RequestStatus_Aliased;
-        }
-    }
-
-    return RequestStatus_Ready;
-
-}
-
-
-
 // sets the kernelEndList
 void
 GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
@@ -303,153 +343,6 @@
             kernelEndList.size());
 }
 
-
-// Insert the request on the correct request table.  Return true if
-// the entry was already present.
-bool
-GPUCoalescer::insertRequest(PacketPtr pkt, RubyRequestType request_type)
-{
-    assert(getRequestStatus(pkt, request_type) == RequestStatus_Ready ||
-           pkt->req->isLockedRMW() ||
-           !m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge()));
-
-    int total_outstanding M5_VAR_USED =
-        m_writeRequestTable.size() + m_readRequestTable.size();
-
-    assert(m_outstanding_count == total_outstanding);
-
-    // See if we should schedule a deadlock check
-    if (!deadlockCheckEvent.scheduled()) {
-        schedule(deadlockCheckEvent, m_deadlock_threshold + curTick());
-    }
-
-    Addr line_addr = makeLineAddress(pkt->getAddr());
-    if ((request_type == RubyRequestType_ST) ||
-        (request_type == RubyRequestType_ATOMIC) ||
-        (request_type == RubyRequestType_ATOMIC_RETURN) ||
-        (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
-        (request_type == RubyRequestType_RMW_Read) ||
-        (request_type == RubyRequestType_RMW_Write) ||
-        (request_type == RubyRequestType_Load_Linked) ||
-        (request_type == RubyRequestType_Store_Conditional) ||
-        (request_type == RubyRequestType_Locked_RMW_Read) ||
-        (request_type == RubyRequestType_Locked_RMW_Write) ||
-        (request_type == RubyRequestType_FLUSH)) {
-
-        pair<RequestTable::iterator, bool> r =
-          m_writeRequestTable.insert(RequestTable::value_type(line_addr,
-                                       (GPUCoalescerRequest*) NULL));
-        if (r.second) {
-            RequestTable::iterator i = r.first;
-            i->second = new GPUCoalescerRequest(pkt, request_type,
-                                                curCycle());
-            DPRINTF(GPUCoalescer,
-                    "Inserting write request for paddr %#x for type %d\n",
-                    pkt->req->getPaddr(), i->second->m_type);
-            m_outstanding_count++;
-        } else {
-            return true;
-        }
-    } else {
-        pair<RequestTable::iterator, bool> r =
-            m_readRequestTable.insert(RequestTable::value_type(line_addr,
-                                        (GPUCoalescerRequest*) NULL));
-
-        if (r.second) {
-            RequestTable::iterator i = r.first;
-            i->second = new GPUCoalescerRequest(pkt, request_type,
-                                             curCycle());
-            DPRINTF(GPUCoalescer,
-                    "Inserting read request for paddr %#x for type %d\n",
-                    pkt->req->getPaddr(), i->second->m_type);
-            m_outstanding_count++;
-        } else {
-            return true;
-        }
-    }
-
-    m_outstandReqHist.sample(m_outstanding_count);
-
-    total_outstanding = m_writeRequestTable.size() + m_readRequestTable.size();
-    assert(m_outstanding_count == total_outstanding);
-
-    return false;
-}
-
-void
-GPUCoalescer::markRemoved()
-{
-    m_outstanding_count--;
-    assert(m_outstanding_count ==
-           m_writeRequestTable.size() + m_readRequestTable.size());
-}
-
-void
-GPUCoalescer::removeRequest(GPUCoalescerRequest* srequest)
-{
-    assert(m_outstanding_count ==
-           m_writeRequestTable.size() + m_readRequestTable.size());
-
-    Addr line_addr = makeLineAddress(srequest->pkt->getAddr());
-    if ((srequest->m_type == RubyRequestType_ST) ||
-        (srequest->m_type == RubyRequestType_RMW_Read) ||
-        (srequest->m_type == RubyRequestType_RMW_Write) ||
-        (srequest->m_type == RubyRequestType_Load_Linked) ||
-        (srequest->m_type == RubyRequestType_Store_Conditional) ||
-        (srequest->m_type == RubyRequestType_Locked_RMW_Read) ||
-        (srequest->m_type == RubyRequestType_Locked_RMW_Write)) {
-        m_writeRequestTable.erase(line_addr);
-    } else {
-        m_readRequestTable.erase(line_addr);
-    }
-
-    markRemoved();
-}
-
-bool
-GPUCoalescer::handleLlsc(Addr address, GPUCoalescerRequest* request)
-{
-    //
-    // The success flag indicates whether the LLSC operation was successful.
-    // LL ops will always succeed, but SC may fail if the cache line is no
-    // longer locked.
-    //
-    bool success = true;
-    if (request->m_type == RubyRequestType_Store_Conditional) {
-        if (!m_dataCache_ptr->isLocked(address, m_version)) {
-            //
-            // For failed SC requests, indicate the failure to the cpu by
-            // setting the extra data to zero.
-            //
-            request->pkt->req->setExtraData(0);
-            success = false;
-        } else {
-            //
-            // For successful SC requests, indicate the success to the cpu by
-            // setting the extra data to one.
-            //
-            request->pkt->req->setExtraData(1);
-        }
-        //
-        // Independent of success, all SC operations must clear the lock
-        //
-        m_dataCache_ptr->clearLocked(address);
-    } else if (request->m_type == RubyRequestType_Load_Linked) {
-        //
-        // Note: To fully follow Alpha LLSC semantics, should the LL clear any
-        // previously locked cache lines?
-        //
-        m_dataCache_ptr->setLocked(address, m_version);
-    } else if ((m_dataCache_ptr->isTagPresent(address)) &&
-               (m_dataCache_ptr->isLocked(address, m_version))) {
-        //
-        // Normal writes should clear the locked address
-        //
-        m_dataCache_ptr->clearLocked(address);
-    }
-    return success;
-}
-
 void
 GPUCoalescer::writeCallback(Addr address, DataBlock& data)
 {
@@ -487,49 +380,22 @@
                          bool isRegion)
 {
     assert(address == makeLineAddress(address));
+    assert(coalescedTable.count(address));
 
-    DPRINTF(GPUCoalescer, "write callback for address %#x\n", address);
-    assert(m_writeRequestTable.count(makeLineAddress(address)));
+    auto crequest = coalescedTable.at(address).front();
 
-    RequestTable::iterator i = m_writeRequestTable.find(address);
-    assert(i != m_writeRequestTable.end());
-    GPUCoalescerRequest* request = i->second;
+    hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
+                forwardRequestTime, firstResponseTime, isRegion);
 
-    m_writeRequestTable.erase(i);
-    markRemoved();
+    delete crequest;
+    coalescedTable.at(address).pop_front();
 
-    assert((request->m_type == RubyRequestType_ST) ||
-           (request->m_type == RubyRequestType_ATOMIC) ||
-           (request->m_type == RubyRequestType_ATOMIC_RETURN) ||
-           (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) ||
-           (request->m_type == RubyRequestType_RMW_Read) ||
-           (request->m_type == RubyRequestType_RMW_Write) ||
-           (request->m_type == RubyRequestType_Load_Linked) ||
-           (request->m_type == RubyRequestType_Store_Conditional) ||
-           (request->m_type == RubyRequestType_Locked_RMW_Read) ||
-           (request->m_type == RubyRequestType_Locked_RMW_Write) ||
-           (request->m_type == RubyRequestType_FLUSH));
-
-
-    //
-    // For Alpha, properly handle LL, SC, and write requests with respect to
-    // locked cache blocks.
-    //
-    // Not valid for Garnet_standalone protocl
-    //
-    bool success = true;
-    if (!m_runningGarnetStandalone)
-        success = handleLlsc(address, request);
-
-    if (request->m_type == RubyRequestType_Locked_RMW_Read) {
-        m_controller->blockOnQueue(address, m_mandatory_q_ptr);
-    } else if (request->m_type == RubyRequestType_Locked_RMW_Write) {
-        m_controller->unblock(address);
+    if (coalescedTable.at(address).empty()) {
+        coalescedTable.erase(address);
+    } else {
+        auto nextRequest = coalescedTable.at(address).front();
+        issueRequest(nextRequest);
     }
-
-    hitCallback(request, mach, data, success,
-                request->issue_time, forwardRequestTime, firstResponseTime,
-                isRegion);
 }
 
 void
@@ -570,26 +436,37 @@
                         bool isRegion)
 {
     assert(address == makeLineAddress(address));
-    assert(m_readRequestTable.count(makeLineAddress(address)));
+    assert(coalescedTable.count(address));
 
-    DPRINTF(GPUCoalescer, "read callback for address %#x\n", address);
-    RequestTable::iterator i = m_readRequestTable.find(address);
-    assert(i != m_readRequestTable.end());
-    GPUCoalescerRequest* request = i->second;
+    auto crequest = coalescedTable.at(address).front();
+    fatal_if(crequest->getRubyType() != RubyRequestType_LD,
+             "readCallback received non-read type response\n");
 
-    m_readRequestTable.erase(i);
-    markRemoved();
+    // Iterate over the coalesced requests to respond to as many loads as
+    // possible until another request type is seen. This models MSHR-like
+    // behavior for the TCP.
+    while (crequest->getRubyType() == RubyRequestType_LD) {
+        hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
+                    forwardRequestTime, firstResponseTime, isRegion);
 
-    assert((request->m_type == RubyRequestType_LD) ||
-           (request->m_type == RubyRequestType_IFETCH));
+        delete crequest;
+        coalescedTable.at(address).pop_front();
+        if (coalescedTable.at(address).empty()) {
+            break;
+        }
 
-    hitCallback(request, mach, data, true,
-                request->issue_time, forwardRequestTime, firstResponseTime,
-                isRegion);
+        crequest = coalescedTable.at(address).front();
+    }
+
+    if (coalescedTable.at(address).empty()) {
+        coalescedTable.erase(address);
+    } else {
+        auto nextRequest = coalescedTable.at(address).front();
+        issueRequest(nextRequest);
+    }
 }
 
 void
-GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest,
+GPUCoalescer::hitCallback(CoalescedRequest* crequest,
                        MachineType mach,
                        DataBlock& data,
                        bool success,
@@ -598,22 +475,15 @@
                        Cycles firstResponseTime,
                        bool isRegion)
 {
-    PacketPtr pkt = srequest->pkt;
+    PacketPtr pkt = crequest->getFirstPkt();
     Addr request_address = pkt->getAddr();
     Addr request_line_address = makeLineAddress(request_address);
 
-    RubyRequestType type = srequest->m_type;
+    RubyRequestType type = crequest->getRubyType();
 
-    // Set this cache entry to the most recently used
-    if (type == RubyRequestType_IFETCH) {
-        if (m_instCache_ptr->isTagPresent(request_line_address))
-            m_instCache_ptr->setMRU(request_line_address);
-    } else {
-        if (m_dataCache_ptr->isTagPresent(request_line_address))
-            m_dataCache_ptr->setMRU(request_line_address);
-    }
+    DPRINTF(GPUCoalescer, "Got hitCallback for 0x%X\n", request_line_address);
 
-    recordMissLatency(srequest, mach,
+    recordMissLatency(crequest, mach,
                       initialRequestTime,
                       forwardRequestTime,
                       firstResponseTime,
@@ -621,13 +491,11 @@
     // update the data
     //
     // MUST DO THIS FOR EACH REQUEST IN THE COALESCER
-    int len = reqCoalescer[request_line_address].size();
-    std::vector<PacketPtr> mylist;
-    for (int i = 0; i < len; ++i) {
-        PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
-        assert(type == reqCoalescer[request_line_address][i].primaryType);
+    std::vector<PacketPtr> pktList = crequest->getPackets();
+    DPRINTF(GPUCoalescer, "Responding to %d packets for addr 0x%X\n",
+            pktList.size(), request_line_address);
+    for (auto& pkt : pktList) {
         request_address = pkt->getAddr();
-        request_line_address = makeLineAddress(pkt->getAddr());
         if (pkt->getPtr<uint8_t>()) {
             if ((type == RubyRequestType_LD) ||
                 (type == RubyRequestType_ATOMIC) ||
@@ -658,36 +526,56 @@
             RubyPort::SenderState *requestSenderState =
                 safe_cast<RubyPort::SenderState*>(pkt->senderState);
             RubyTester::SenderState* testerSenderState =
-                safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
+                safe_cast<RubyTester::SenderState*>
+                    (requestSenderState->predecessor);
             testerSenderState->subBlock.mergeFrom(data);
         }
-
-        mylist.push_back(pkt);
     }
-    delete srequest;
-    reqCoalescer.erase(request_line_address);
-    assert(!reqCoalescer.count(request_line_address));
 
 
 
-    completeHitCallback(mylist, len);
+    m_outstanding_count--;
+    assert(m_outstanding_count >= 0);
+
+    completeHitCallback(pktList);
 }
 
 bool
 GPUCoalescer::empty() const
 {
-    return m_writeRequestTable.empty() && m_readRequestTable.empty();
+    return coalescedTable.empty();
 }
 
-// Analyzes the packet to see if this request can be coalesced.
-// If request can be coalesced, this request is added to the reqCoalescer table
-// and makeRequest returns RequestStatus_Issued;
-// If this is the first request to a cacheline, request is added to both
-// newRequests queue and to the reqCoalescer table; makeRequest
-// returns RequestStatus_Issued.
-// If there is a pending request to this cacheline and this request
-// can't be coalesced, RequestStatus_Aliased is returned and
-// the packet needs to be reissued.
+RubyRequestType
+GPUCoalescer::getRequestType(PacketPtr pkt)
+{
+    RubyRequestType req_type = RubyRequestType_NULL;
+
+    // These types are either not supported or not used in GPU caches.
+    assert(!pkt->req->isLLSC());
+    assert(!pkt->req->isLockedRMW());
+    assert(!pkt->req->isInstFetch());
+    assert(!pkt->isFlush());
+
+    if (pkt->req->isAtomicReturn()) {
+        req_type = RubyRequestType_ATOMIC_RETURN;
+    } else if (pkt->req->isAtomicNoReturn()) {
+        req_type = RubyRequestType_ATOMIC_NO_RETURN;
+    } else if (pkt->isRead()) {
+        req_type = RubyRequestType_LD;
+    } else if (pkt->isWrite()) {
+        req_type = RubyRequestType_ST;
+    } else {
+        // Acquire and release packets will have been issued by
+        // makeRequest, so we do not need to check for them here.
+        panic("Unsupported ruby packet type\n");
+    }
+
+    return req_type;
+}
+
+// Places an uncoalesced packet in uncoalescedTable. If the packet is a
+// special type (MemFence, scoping, etc.), it is issued immediately.
 RequestStatus
 GPUCoalescer::makeRequest(PacketPtr pkt)
 {
@@ -719,147 +607,37 @@
         }
     }
 
-    // If number of outstanding requests greater than the max allowed,
-    // return RequestStatus_BufferFull. This logic can be extended to
-    // support proper backpressure.
-    if (m_outstanding_count >= m_max_outstanding_requests) {
-        return RequestStatus_BufferFull;
-    }
-
-    RubyRequestType primary_type = RubyRequestType_NULL;
-    RubyRequestType secondary_type = RubyRequestType_NULL;
-
-    if (pkt->isLLSC()) {
-        //
-        // Alpha LL/SC instructions need to be handled carefully by the cache
-        // coherence protocol to ensure they follow the proper semantics. In
-        // particular, by identifying the operations as atomic, the protocol
-        // should understand that migratory sharing optimizations should not
-        // be performed (i.e. a load between the LL and SC should not steal
-        // away exclusive permission).
-        //
-        if (pkt->isWrite()) {
-            primary_type = RubyRequestType_Store_Conditional;
-        } else {
-            assert(pkt->isRead());
-            primary_type = RubyRequestType_Load_Linked;
-        }
-        secondary_type = RubyRequestType_ATOMIC;
-    } else if (pkt->req->isLockedRMW()) {
-        //
-        // x86 locked instructions are translated to store cache coherence
-        // requests because these requests should always be treated as read
-        // exclusive operations and should leverage any migratory sharing
-        // optimization built into the protocol.
-        //
-        if (pkt->isWrite()) {
-            primary_type = RubyRequestType_Locked_RMW_Write;
-        } else {
-            assert(pkt->isRead());
-            primary_type = RubyRequestType_Locked_RMW_Read;
-        }
-        secondary_type = RubyRequestType_ST;
-    } else if (pkt->isAtomicOp()) {
-        //
-        // GPU Atomic Operation
-        //
-        primary_type = RubyRequestType_ATOMIC;
-        secondary_type = RubyRequestType_ATOMIC;
-    } else {
-        if (pkt->isRead()) {
-            if (pkt->req->isInstFetch()) {
-                primary_type = secondary_type = RubyRequestType_IFETCH;
-            } else {
-#if THE_ISA == X86_ISA
-                uint32_t flags = pkt->req->getFlags();
-                bool storeCheck = flags &
-                        (TheISA::StoreCheck << TheISA::FlagShift);
-#else
-                bool storeCheck = false;
-#endif // X86_ISA
-                if (storeCheck) {
-                    primary_type = RubyRequestType_RMW_Read;
-                    secondary_type = RubyRequestType_ST;
-                } else {
-                    primary_type = secondary_type = RubyRequestType_LD;
-                }
+    if (!pkt->isLLSC() && !pkt->req->isLockedRMW() && !pkt->isAtomicOp() &&
+        !pkt->isRead() && !pkt->isWrite() && !pkt->isFlush() &&
+        (pkt->req->isRelease() || pkt->req->isAcquire())) {
+        if (assumingRfOCoherence) {
+            // If we reached here, this request must be a memFence
+            // and the protocol implements RfO, so the coalescer can
+            // assume sequential consistency and schedule the callback
+            // immediately.
+            // Currently the code implements fence callbacks
+            // by reusing the mechanism for kernel completions.
+            // This should be fixed.
+            int wf_id = 0;
+            if (pkt->req->hasContextId()) {
+                wf_id = pkt->req->contextId();
             }
-        } else if (pkt->isWrite()) {
-            //
-            // Note: M5 packets do not differentiate ST from RMW_Write
-            //
-            primary_type = secondary_type = RubyRequestType_ST;
-        } else if (pkt->isFlush()) {
-            primary_type = secondary_type = RubyRequestType_FLUSH;
-        } else if (pkt->req->isRelease() || pkt->req->isAcquire()) {
-            if (assumingRfOCoherence) {
-                // If we reached here, this request must be a memFence
-                // and the protocol implements RfO, the coalescer can
-                // assume sequentially consistency and schedule the callback
-                // immediately.
-                // Currently the code implements fence callbacks
-                // by reusing the mechanism for kernel completions.
-                // This should be fixed.
-                int wf_id = 0;
-                if (pkt->req->hasContextId()) {
-                    wf_id = pkt->req->contextId();
-                }
-                insertKernel(wf_id, pkt);
-                newKernelEnds.push_back(wf_id);
-                if (!issueEvent.scheduled()) {
-                    schedule(issueEvent, curTick());
-                }
-                return RequestStatus_Issued;
-            } else {
-                // If not RfO, return issued here and let the child coalescer
-                // take care of it.
-                return RequestStatus_Issued;
+            insertKernel(wf_id, pkt);
+            newKernelEnds.push_back(wf_id);
+            if (!issueEvent.scheduled()) {
+                schedule(issueEvent, curTick());
             }
+            return RequestStatus_Issued;
         } else {
-            panic("Unsupported ruby packet type\n");
+            // If not RfO, return issued here and let the child coalescer
+            // take care of it.
+            return RequestStatus_Issued;
         }
     }
 
-    // Check if there is any pending request to this cache line from
-    // previous cycles.
-    // If there is a pending request, return aliased. Since coalescing
-    // across time is not permitted, aliased requests are not coalesced.
-    // If a request for this address has already been issued, we must block
-    RequestStatus status = getRequestStatus(pkt, primary_type);
-    if (status != RequestStatus_Ready)
-        return status;
+    uncoalescedTable.insertPacket(pkt);
+    DPRINTF(GPUCoalescer, "UC insertPacket 0x%X\n", pkt->getAddr());
 
-    Addr line_addr = makeLineAddress(pkt->getAddr());
-
-    // Check if this request can be coalesced with previous
-    // requests from this cycle.
-    if (!reqCoalescer.count(line_addr)) {
-        // This is the first access to this cache line.
-        // A new request to the memory subsystem has to be
-        // made in the next cycle for this cache line, so
-        // add this line addr to the "newRequests" queue
-        newRequests.push_back(line_addr);
-
-    // There was a request to this cache line in this cycle,
-    // let us see if we can coalesce this request with the previous
-    // requests from this cycle
-    } else if (primary_type !=
-               reqCoalescer[line_addr][0].primaryType) {
-        // can't coalesce loads, stores and atomics!
-        return RequestStatus_Aliased;
-    } else if (pkt->req->isLockedRMW() ||
-               reqCoalescer[line_addr][0].pkt->req->isLockedRMW()) {
-        // can't coalesce locked accesses, but can coalesce atomics!
-        return RequestStatus_Aliased;
-    } else if (pkt->req->hasContextId() && pkt->req->isRelease() &&
-               pkt->req->contextId() !=
-               reqCoalescer[line_addr][0].pkt->req->contextId()) {
-        // can't coalesce releases from different wavefronts
-        return RequestStatus_Aliased;
-    }
-
-    // in addition to the packet, we need to save both request types
-    reqCoalescer[line_addr].emplace_back(pkt, primary_type, secondary_type);
     if (!issueEvent.scheduled())
         schedule(issueEvent, curTick());
     // TODO: issue hardware prefetches here
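With classification gone, makeRequest reduces to two paths: fences complete
through the kernel-end mechanism, and every other packet is parked in the
uncoalesced table until the issue event runs. A rough control-flow sketch
under those assumptions (all names below are hypothetical):

    #include <iostream>
    #include <queue>

    struct Pkt { bool isFence; };

    std::queue<Pkt*> uncoalesced;   // stand-in for UncoalescedTable
    bool issueScheduled = false;

    void makeRequest(Pkt* pkt) {
        if (pkt->isFence) {
            // RfO lets the coalescer complete the fence inline.
            std::cout << "fence completed inline\n";
            return;
        }
        uncoalesced.push(pkt);
        if (!issueScheduled) {      // schedule(issueEvent, curTick())
            issueScheduled = true;
            std::cout << "issue event scheduled\n";
        }
    }

    int main() {
        Pkt load{false}, fence{true};
        makeRequest(&load);         // queued for completeIssue
        makeRequest(&fence);        // completed immediately
    }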
@@ -867,8 +645,9 @@
 }
 
 void
-GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
+GPUCoalescer::issueRequest(CoalescedRequest* crequest)
 {
+    PacketPtr pkt = crequest->getFirstPkt();
 
     int proc_id = -1;
     if (pkt != NULL && pkt->req->hasContextId()) {
@@ -901,9 +680,9 @@
     uint32_t blockSize = RubySystem::getBlockSizeBytes();
     std::vector<bool> accessMask(blockSize,false);
     std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps;
-    uint32_t tableSize = reqCoalescer[line_addr].size();
+    uint32_t tableSize = crequest->getPackets().size();
     for (int i = 0; i < tableSize; i++) {
-        PacketPtr tmpPkt = reqCoalescer[line_addr][i].pkt;
+        PacketPtr tmpPkt = crequest->getPackets()[i];
         uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
         uint32_t tmpSize = tmpPkt->getSize();
         if (tmpPkt->isAtomicOp()) {
@@ -922,7 +701,7 @@
     if (pkt->isAtomicOp()) {
         msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
                               pkt->getPtr<uint8_t>(),
-                              pkt->getSize(), pc, secondary_type,
+                              pkt->getSize(), pc, crequest->getRubyType(),
                               RubyAccessMode_Supervisor, pkt,
                               PrefetchBit_No, proc_id, 100,
                               blockSize, accessMask,
@@ -931,7 +710,7 @@
     } else {
         msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
                               pkt->getPtr<uint8_t>(),
-                              pkt->getSize(), pc, secondary_type,
+                              pkt->getSize(), pc, crequest->getRubyType(),
                               RubyAccessMode_Supervisor, pkt,
                               PrefetchBit_No, proc_id, 100,
                               blockSize, accessMask,
@@ -941,15 +720,21 @@
     DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
              curTick(), m_version, "Coal", "Begin", "", "",
              printAddress(msg->getPhysicalAddress()),
-             RubyRequestType_to_string(secondary_type));
+             RubyRequestType_to_string(crequest->getRubyType()));
 
-    fatal_if(secondary_type == RubyRequestType_IFETCH,
+    fatal_if(crequest->getRubyType() == RubyRequestType_IFETCH,
              "there should not be any I-Fetch requests in the GPU Coalescer");
 
     Tick latency = cyclesToTicks(
-                        m_controller->mandatoryQueueLatency(secondary_type));
+                m_controller->mandatoryQueueLatency(crequest->getRubyType()));
     assert(latency > 0);
 
+    if (!deadlockCheckEvent.scheduled()) {
+        schedule(deadlockCheckEvent,
+                 m_deadlock_threshold * clockPeriod() +
+                 curTick());
+    }
+
     assert(m_mandatory_q_ptr);
     m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
 }
@@ -971,18 +756,9 @@
 {
     out << "[GPUCoalescer: " << m_version
         << ", outstanding requests: " << m_outstanding_count
-        << ", read request table: " << m_readRequestTable
-        << ", write request table: " << m_writeRequestTable
         << "]";
 }
 
-// this can be called from setState whenever coherence permissions are
-// upgraded when invoked, coherence violations will be checked for the
-// given block
-void
-GPUCoalescer::checkCoherence(Addr addr)
-{
-}
 
 void
 GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
@@ -990,40 +766,96 @@
             SequencerRequestType_to_string(requestType));
 }
 
+bool
+GPUCoalescer::coalescePacket(PacketPtr pkt)
+{
+    uint64_t seqNum = pkt->req->getReqInstSeqNum();
+    Addr line_addr = makeLineAddress(pkt->getAddr());
+
+    // If the packet has the same line address as a request already in the
+    // coalescedTable and has the same sequence number, it can be coalesced.
+    if (coalescedTable.count(line_addr)) {
+        // Search for a previous coalesced request with the same seqNum.
+        auto& creqQueue = coalescedTable.at(line_addr);
+        auto citer = std::find_if(creqQueue.begin(), creqQueue.end(),
+            [&](CoalescedRequest* c) { return c->getSeqNum() == seqNum; }
+        );
+        if (citer != creqQueue.end()) {
+            (*citer)->insertPacket(pkt);
+            return true;
+        }
+    }
+
+    if (m_outstanding_count < m_max_outstanding_requests) {
+        // This is an "aliased" or new request. Create a RubyRequest and
+        // append it to the list of "targets" in the coalescing table.
+        DPRINTF(GPUCoalescer, "Creating new or aliased request for 0x%X\n",
+                line_addr);
+
+        CoalescedRequest *creq = new CoalescedRequest(seqNum);
+        creq->insertPacket(pkt);
+        creq->setRubyType(getRequestType(pkt));
+        creq->setIssueTime(curCycle());
+
+        if (!coalescedTable.count(line_addr)) {
+            // If there is no outstanding request for this line address,
+            // create a new coalesced request and issue it immediately.
+            auto reqList = std::deque<CoalescedRequest*> { creq };
+            coalescedTable.insert(std::make_pair(line_addr, reqList));
+
+            DPRINTF(GPUCoalescer, "Issued req type %s seqNum %d\n",
+                    RubyRequestType_to_string(creq->getRubyType()), seqNum);
+            issueRequest(creq);
+        } else {
+            // The request is for a line address that is already outstanding
+            // but for a different instruction. Add it as a new request to be
+            // issued when the current outstanding request is completed.
+            coalescedTable.at(line_addr).push_back(creq);
+            DPRINTF(GPUCoalescer, "found address 0x%X with new seqNum %d\n",
+                    line_addr, seqNum);
+        }
+
+        // In both cases, requests are added to the coalescing table and will
+        // be counted as outstanding requests.
+        m_outstanding_count++;
+
+        return true;
+    }
+
+    // The maximum number of outstanding requests has been reached.
+    return false;
+}
 
 void
 GPUCoalescer::completeIssue()
 {
-    // newRequests has the cacheline addresses of all the
-    // requests which need to be issued to the memory subsystem
-    // in this cycle
-    int len = newRequests.size();
-    DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len);
-    for (int i = 0; i < len; ++i) {
-        // Get the requests from reqCoalescer table. Get only the
-        // first request for each cacheline, the remaining requests
-        // can be coalesced with the first request. So, only
-        // one request is issued per cacheline.
-        RequestDesc info = reqCoalescer[newRequests[i]][0];
-        PacketPtr pkt = info.pkt;
-        DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n",
-                i, pkt->req->getPaddr());
-        // Insert this request to the read/writeRequestTables. These tables
-        // are used to track aliased requests in makeRequest subroutine
-        bool found = insertRequest(pkt, info.primaryType);
+    // Iterate over the maximum number of instructions we can coalesce
+    // per cycle (coalescingWindow).
+    for (int instIdx = 0; instIdx < coalescingWindow; ++instIdx) {
+        PerInstPackets *pktList =
+            uncoalescedTable.getInstPackets(instIdx);
 
-        if (found) {
-            panic("GPUCoalescer::makeRequest should never be called if the "
-                  "request is already outstanding\n");
+        // getInstPackets will return nullptr if no instruction
+        // exists at the current offset.
+        if (!pktList) {
+            break;
+        } else {
+            // Since we have a pointer to the list of packets in the inst,
+            // erase them from the list if coalescing is successful and
+            // leave them in the list otherwise. This aggressively attempts
+            // to coalesce as many packets as possible from the current inst.
+            pktList->remove_if(
+                [&](PacketPtr pkt) { return coalescePacket(pkt); }
+            );
         }
-
-        // Issue request to ruby subsystem
-        issueRequest(pkt, info.secondaryType);
     }
-    newRequests.clear();
+
+    // Clean up any instructions in the uncoalesced table that have had
+    // all of their packets coalesced, returning a token for each column.
+    uncoalescedTable.updateResources();
 
     // have Kernel End releases been issued this cycle
-    len = newKernelEnds.size();
+    int len = newKernelEnds.size();
     for (int i = 0; i < len; i++) {
         kernelCallback(newKernelEnds[i]);
     }
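completeIssue now visits at most coalescingWindow instructions per cycle and
relies on list::remove_if, so packets that fail to coalesce (table full)
simply stay behind for a later cycle. The core of that loop, reduced to
standard containers with a stubbed-out coalescePacket:

    #include <iostream>
    #include <iterator>
    #include <list>
    #include <map>

    int outstanding = 0;
    const int maxOutstanding = 2;

    // Stub: succeeds only while there is coalescing-table capacity.
    bool coalescePacket(int /*pktId*/) {
        if (outstanding >= maxOutstanding) return false;
        ++outstanding;
        return true;
    }

    int main() {
        std::map<uint64_t, std::list<int>> instMap{{1, {10, 11, 12}}};
        const int coalescingWindow = 1;

        for (int idx = 0; idx < coalescingWindow; ++idx) {
            if (idx >= (int)instMap.size()) break;
            auto it = std::next(instMap.begin(), idx);
            // Erase exactly the packets that coalesced; the rest wait.
            it->second.remove_if([](int p) { return coalescePacket(p); });
            std::cout << it->second.size() << " packet(s) left over\n";
        }
    }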
@@ -1052,71 +884,27 @@
                              const DataBlock& data)
 {
     assert(address == makeLineAddress(address));
+    assert(coalescedTable.count(address));
 
-    DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address);
-    assert(m_writeRequestTable.count(makeLineAddress(address)));
+    auto crequest = coalescedTable.at(address).front();
 
-    RequestTable::iterator i = m_writeRequestTable.find(address);
-    assert(i != m_writeRequestTable.end());
-    GPUCoalescerRequest* srequest = i->second;
+    fatal_if((crequest->getRubyType() != RubyRequestType_ATOMIC &&
+              crequest->getRubyType() != RubyRequestType_ATOMIC_RETURN &&
+              crequest->getRubyType() != RubyRequestType_ATOMIC_NO_RETURN),
+             "atomicCallback saw non-atomic type response\n");
 
-    m_writeRequestTable.erase(i);
-    markRemoved();
+    hitCallback(crequest, mach, (DataBlock&)data, true,
+                crequest->getIssueTime(), Cycles(0), Cycles(0), false);
 
-    assert((srequest->m_type == RubyRequestType_ATOMIC) ||
-           (srequest->m_type == RubyRequestType_ATOMIC_RETURN) ||
-           (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN));
+    delete crequest;
+    coalescedTable.at(address).pop_front();
 
-
-    // Atomics don't write to cache, so there is no MRU update...
-
-    recordMissLatency(srequest, mach,
-                      srequest->issue_time, Cycles(0), Cycles(0), true, false);
-
-    PacketPtr pkt = srequest->pkt;
-    Addr request_address = pkt->getAddr();
-    Addr request_line_address = makeLineAddress(pkt->getAddr());
-
-    int len = reqCoalescer[request_line_address].size();
-    std::vector<PacketPtr> mylist;
-    for (int i = 0; i < len; ++i) {
-        PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
-        assert(srequest->m_type ==
-               reqCoalescer[request_line_address][i].primaryType);
-        request_address = (pkt->getAddr());
-        request_line_address = makeLineAddress(request_address);
-        if (pkt->getPtr<uint8_t>() &&
-            srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) {
-            /* atomics are done in memory, and return the data *before* the atomic op... */
-            pkt->setData(
-                data.getData(getOffset(request_address), pkt->getSize()));
-        } else {
-            DPRINTF(MemoryAccess,
-                    "WARNING.  Data not transfered from Ruby to M5 for type " \
-                    "%s\n",
-                    RubyRequestType_to_string(srequest->m_type));
-        }
-
-        // If using the RubyTester, update the RubyTester sender state's
-        // subBlock with the recieved data.  The tester will later access
-        // this state.
-        // Note: RubyPort will access it's sender state before the
-        // RubyTester.
-        if (m_usingRubyTester) {
-            RubyPort::SenderState *requestSenderState =
-                safe_cast<RubyPort::SenderState*>(pkt->senderState);
-            RubyTester::SenderState* testerSenderState =
-                safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
-            testerSenderState->subBlock.mergeFrom(data);
-        }
-
-        mylist.push_back(pkt);
+    if (coalescedTable.at(address).empty()) {
+        coalescedTable.erase(address);
+    } else {
+        auto nextRequest = coalescedTable.at(address).front();
+        issueRequest(nextRequest);
     }
-    delete srequest;
-    reqCoalescer.erase(request_line_address);
-    assert(!reqCoalescer.count(request_line_address));
-
-    completeHitCallback(mylist, len);
 }
 
 void
@@ -1148,42 +936,42 @@
 }
 
 void
-GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist, int len)
+GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist)
 {
-    for (int i = 0; i < len; ++i) {
+    for (auto& pkt : mylist) {
         RubyPort::SenderState *ss =
-            safe_cast<RubyPort::SenderState *>(mylist[i]->senderState);
+            safe_cast<RubyPort::SenderState *>(pkt->senderState);
         MemSlavePort *port = ss->port;
         assert(port != NULL);
 
-        mylist[i]->senderState = ss->predecessor;
+        pkt->senderState = ss->predecessor;
         delete ss;
-        port->hitCallback(mylist[i]);
+        port->hitCallback(pkt);
         trySendRetries();
     }
 
+    // We schedule an event in the same tick as hitCallback (similar to
+    // makeRequest) rather than calling completeIssue directly, to avoid
+    // redundant calls into completeIssue. This path is only taken if the
+    // maximum number of outstanding requests is smaller than the number
+    // of slots in the uncoalesced table and makeRequest is not called
+    // again.
+    if (uncoalescedTable.packetAvailable() && !issueEvent.scheduled()) {
+        schedule(issueEvent, curTick());
+    }
+
     testDrainComplete();
 }
 
-PacketPtr
-GPUCoalescer::mapAddrToPkt(Addr address)
-{
-    RequestTable::iterator i = m_readRequestTable.find(address);
-    assert(i != m_readRequestTable.end());
-    GPUCoalescerRequest* request = i->second;
-    return request->pkt;
-}
-
 void
-GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest,
+GPUCoalescer::recordMissLatency(CoalescedRequest* crequest,
                                 MachineType mach,
                                 Cycles initialRequestTime,
                                 Cycles forwardRequestTime,
                                 Cycles firstResponseTime,
                                 bool success, bool isRegion)
 {
-    RubyRequestType type = srequest->m_type;
-    Cycles issued_time = srequest->issue_time;
+    RubyRequestType type = crequest->getRubyType();
+    Cycles issued_time = crequest->getIssueTime();
     Cycles completion_time = curCycle();
     assert(completion_time >= issued_time);
     Cycles total_lat = completion_time - issued_time;
@@ -1249,7 +1037,7 @@
     DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
              curTick(), m_version, "Coal",
              success ? "Done" : "SC_Failed", "", "",
-             printAddress(srequest->pkt->getAddr()), total_lat);
+             printAddress(crequest->getFirstPkt()->getAddr()), total_lat);
 }
 
 void
diff --git a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh
index 1321173..56a2079 100644
--- a/src/mem/ruby/system/GPUCoalescer.hh
+++ b/src/mem/ruby/system/GPUCoalescer.hh
@@ -48,6 +48,7 @@
 #include "mem/ruby/protocol/RubyRequestType.hh"
 #include "mem/ruby/protocol/SequencerRequestType.hh"
 #include "mem/ruby/system/Sequencer.hh"
+#include "mem/token_port.hh"
 
 class DataBlock;
 class CacheMsg;
@@ -59,47 +60,99 @@
 HSAScope reqScopeToHSAScope(const RequestPtr &req);
 HSASegment reqSegmentToHSASegment(const RequestPtr &req);
 
-struct GPUCoalescerRequest
-{
-    PacketPtr pkt;
-    RubyRequestType m_type;
-    Cycles issue_time;
+// List of packets that belong to a specific instruction.
+typedef std::list<PacketPtr> PerInstPackets;
 
-    GPUCoalescerRequest(PacketPtr _pkt, RubyRequestType _m_type,
-                        Cycles _issue_time)
-        : pkt(_pkt), m_type(_m_type), issue_time(_issue_time)
-    {}
-};
-
-class RequestDesc
+class UncoalescedTable
 {
   public:
-    RequestDesc(PacketPtr pkt, RubyRequestType p_type, RubyRequestType s_type)
-        : pkt(pkt), primaryType(p_type), secondaryType(s_type)
-    {
-    }
+    UncoalescedTable(GPUCoalescer *gc);
+    ~UncoalescedTable() {}
 
-    RequestDesc() : pkt(nullptr), primaryType(RubyRequestType_NULL),
-        secondaryType(RubyRequestType_NULL)
-    {
-    }
+    void insertPacket(PacketPtr pkt);
+    bool packetAvailable();
+    void printRequestTable(std::stringstream& ss);
 
-    PacketPtr pkt;
-    RubyRequestType primaryType;
-    RubyRequestType secondaryType;
+    // Returns a pointer to the list of packets corresponding to an
+    // instruction in the instruction map, or nullptr if there is no
+    // instruction at the given offset.
+    PerInstPackets* getInstPackets(int offset);
+    void updateResources();
+
+    // Check if any packet has been stuck in instMap for longer than the
+    // given threshold. Panics if a deadlock is detected; otherwise returns.
+    void checkDeadlock(Tick threshold);
+
+  private:
+    GPUCoalescer *coalescer;
+
+    // Maps an instruction's unique sequence number to a queue of packets
+    // which need responses. This data structure assumes the sequence number
+    // is monotonically increasing (which is true for the CU class) in order
+    // to issue packets in age order.
+    std::map<uint64_t, PerInstPackets> instMap;
 };
 
-std::ostream& operator<<(std::ostream& out, const GPUCoalescerRequest& obj);
+class CoalescedRequest
+{
+  public:
+    CoalescedRequest(uint64_t _seqNum)
+        : seqNum(_seqNum), issueTime(Cycles(0)),
+          rubyType(RubyRequestType_NULL)
+    {}
+    ~CoalescedRequest() {}
+
+    void insertPacket(PacketPtr pkt) { pkts.push_back(pkt); }
+    void setSeqNum(uint64_t _seqNum) { seqNum = _seqNum; }
+    void setIssueTime(Cycles _issueTime) { issueTime = _issueTime; }
+    void setRubyType(RubyRequestType type) { rubyType = type; }
+
+    uint64_t getSeqNum() const { return seqNum; }
+    PacketPtr getFirstPkt() const { return pkts[0]; }
+    Cycles getIssueTime() const { return issueTime; }
+    RubyRequestType getRubyType() const { return rubyType; }
+    std::vector<PacketPtr>& getPackets() { return pkts; }
+
+  private:
+    uint64_t seqNum;
+    Cycles issueTime;
+    RubyRequestType rubyType;
+    std::vector<PacketPtr> pkts;
+};
 
 class GPUCoalescer : public RubyPort
 {
   public:
+    class GMTokenPort : public TokenSlavePort
+    {
+      public:
+        GMTokenPort(const std::string& name, ClockedObject *owner,
+                    PortID id = InvalidPortID)
+            : TokenSlavePort(name, owner, id)
+        { }
+        ~GMTokenPort() { }
+
+      protected:
+        Tick recvAtomic(PacketPtr) { return Tick(0); }
+        void recvFunctional(PacketPtr) { }
+        bool recvTimingReq(PacketPtr) { return false; }
+        AddrRangeList getAddrRanges() const
+        {
+            AddrRangeList ranges;
+            return ranges;
+        }
+    };
+
     typedef RubyGPUCoalescerParams Params;
     GPUCoalescer(const Params *);
     ~GPUCoalescer();
 
+    Port &getPort(const std::string &if_name,
+                  PortID idx = InvalidPortID) override;
+
     // Public Methods
     void wakeup(); // Used only for deadlock detection
+    void printRequestTable(std::stringstream& ss);
 
     void printProgress(std::ostream& out) const;
     void resetStats() override;
@@ -176,15 +229,14 @@
     bool empty() const;
 
     void print(std::ostream& out) const;
-    void checkCoherence(Addr address);
 
-    void markRemoved();
-    void removeRequest(GPUCoalescerRequest* request);
     void evictionCallback(Addr address);
     void completeIssue();
 
     void insertKernel(int wavefront_id, PacketPtr pkt);
 
+    GMTokenPort& getGMTokenPort() { return gmTokenPort; }
+
     void recordRequestType(SequencerRequestType requestType);
     Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; }
 
@@ -225,11 +277,11 @@
                         Addr pc, RubyAccessMode access_mode,
                         int size, DataBlock*& data_ptr);
     // Alternate implementations in VIPER Coalescer
-    virtual void issueRequest(PacketPtr pkt, RubyRequestType type);
+    virtual void issueRequest(CoalescedRequest* crequest);
 
     void kernelCallback(int wavfront_id);
 
-    void hitCallback(GPUCoalescerRequest* request,
+    void hitCallback(CoalescedRequest* crequest,
                      MachineType mach,
                      DataBlock& data,
                      bool success,
@@ -237,21 +289,23 @@
                      Cycles forwardRequestTime,
                      Cycles firstResponseTime,
                      bool isRegion);
-    void recordMissLatency(GPUCoalescerRequest* request,
+    void recordMissLatency(CoalescedRequest* crequest,
                            MachineType mach,
                            Cycles initialRequestTime,
                            Cycles forwardRequestTime,
                            Cycles firstResponseTime,
                            bool success, bool isRegion);
-    void completeHitCallback(std::vector<PacketPtr> & mylist, int len);
-    PacketPtr mapAddrToPkt(Addr address);
+    void completeHitCallback(std::vector<PacketPtr> & mylist);
 
 
-    RequestStatus getRequestStatus(PacketPtr pkt,
-                                   RubyRequestType request_type);
-    bool insertRequest(PacketPtr pkt, RubyRequestType request_type);
+    virtual RubyRequestType getRequestType(PacketPtr pkt);
 
-    bool handleLlsc(Addr address, GPUCoalescerRequest* request);
+    // Attempt to remove a packet from the uncoalescedTable and coalesce
+    // with a previous request from the same instruction. If there is no
+    // previous instruction and the max number of outstanding requests has
+    // not been reached, a new coalesced request is created and added to the
+    // "target" list of the coalescedTable.
+    bool coalescePacket(PacketPtr pkt);
 
     EventFunctionWrapper issueEvent;
 
@@ -259,22 +313,27 @@
   // Changed to protected to enable inheritance by VIPER Coalescer
   protected:
     int m_max_outstanding_requests;
-    int m_deadlock_threshold;
+    Cycles m_deadlock_threshold;
 
     CacheMemory* m_dataCache_ptr;
     CacheMemory* m_instCache_ptr;
 
-    // We need to track both the primary and secondary request types.
-    // The secondary request type comprises a subset of RubyRequestTypes that
-    // are understood by the L1 Controller. A primary request type can be any
-    // RubyRequestType.
-    typedef std::unordered_map<Addr, std::vector<RequestDesc>> CoalescingTable;
-    CoalescingTable reqCoalescer;
-    std::vector<Addr> newRequests;
+    // coalescingWindow is the maximum number of instructions that are
+    // allowed to be coalesced in a single cycle.
+    int coalescingWindow;
 
-    typedef std::unordered_map<Addr, GPUCoalescerRequest*> RequestTable;
-    RequestTable m_writeRequestTable;
-    RequestTable m_readRequestTable;
+    // The uncoalescedTable contains several "columns" which hold memory
+    // request packets for an instruction. The maximum size is the number of
+    // columns * the wavefront size.
+    UncoalescedTable uncoalescedTable;
+
+    // An MSHR-like struct for holding coalesced requests. The requests in
+    // this table may or may not be outstanding in the memory hierarchy. The
+    // maximum size is equal to the maximum outstanding requests for a CU
+    // (typically the number of blocks in TCP). If there are duplicates of
+    // an address, they are serviced in age order.
+    std::map<Addr, std::deque<CoalescedRequest*>> coalescedTable;
+
     // Global outstanding request count, across all request tables
     int m_outstanding_count;
     bool m_deadlock_check_scheduled;
@@ -335,7 +394,12 @@
     std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist;
     std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist;
 
-private:
+  private:
+    // The token port is used to send/receive tokens to/from the GPU's
+    // global memory pipeline across the port boundary. There is one token
+    // port for every <wave size> data ports in the CU.
+    GMTokenPort gmTokenPort;
+
     // Private copy constructor and assignment operator
     GPUCoalescer(const GPUCoalescer& obj);
     GPUCoalescer& operator=(const GPUCoalescer& obj);
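
The restructured request path above replaces the per-address read/write
request tables with a two-stage flow: packets first land in the
uncoalescedTable, and coalescePacket() then either merges a packet into the
youngest CoalescedRequest for its cache line or opens a new one. A rough
sketch of that flow, assuming makeLineAddress() and the request's instruction
sequence number are available as in other Ruby code; this is illustrative,
not the patch's actual implementation:

    bool
    GPUCoalescer::coalescePacket(PacketPtr pkt)
    {
        Addr line_addr = makeLineAddress(pkt->getAddr());
        uint64_t seq_num = pkt->req->getReqInstSeqNum();

        auto it = coalescedTable.find(line_addr);
        if (it != coalescedTable.end() &&
            it->second.back()->getSeqNum() == seq_num) {
            // Same instruction already touched this line: merge.
            it->second.back()->insertPacket(pkt);
            return true;
        }

        if (m_outstanding_count >= m_max_outstanding_requests)
            return false; // table full, try again next cycle

        // Open a new coalesced request; duplicates of an address queue up
        // behind older ones and are serviced in age order.
        CoalescedRequest *creq = new CoalescedRequest(seq_num);
        creq->insertPacket(pkt);
        creq->setRubyType(getRequestType(pkt));
        coalescedTable[line_addr].push_back(creq);
        m_outstanding_count++;
        return true;
    }
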
diff --git a/src/mem/ruby/system/GPUCoalescer.py b/src/mem/ruby/system/GPUCoalescer.py
index c02fb75..0335981 100644
--- a/src/mem/ruby/system/GPUCoalescer.py
+++ b/src/mem/ruby/system/GPUCoalescer.py
@@ -42,6 +42,8 @@
    # max_outstanding_requests = (wave front slots) x (wave front size)
    max_outstanding_requests = Param.Int(40*64,
                                 "max requests (incl. prefetches) outstanding")
+   max_coalesces_per_cycle = Param.Int(1, "max instructions that can be " \
+                                "coalesced in a single cycle")
   assume_rfo = Param.Bool(True, "assume protocol implements Read for "
                            "Ownership coherence");
 
diff --git a/src/mem/ruby/system/RubySystem.cc b/src/mem/ruby/system/RubySystem.cc
index c28f880..9fa4736 100644
--- a/src/mem/ruby/system/RubySystem.cc
+++ b/src/mem/ruby/system/RubySystem.cc
@@ -94,7 +94,7 @@
 void
 RubySystem::registerNetwork(Network* network_ptr)
 {
-    m_network = network_ptr;
+    m_networks.emplace_back(network_ptr);
 }
 
 void
@@ -108,7 +108,6 @@
 
 RubySystem::~RubySystem()
 {
-    delete m_network;
     delete m_profiler;
 }
 
@@ -407,6 +406,9 @@
 RubySystem::resetStats()
 {
     m_start_cycle = curCycle();
+    for (auto& network : m_networks) {
+        network->resetStats();
+    }
 }
 
 bool
@@ -511,8 +513,10 @@
         DPRINTF(RubySystem, "Network functionalRead lookup "
                             "(num_maybe_stale=%d, num_busy = %d)\n",
                 num_maybe_stale, num_busy);
-        if (m_network->functionalRead(pkt))
-            return true;
+        for (auto& network : m_networks) {
+            if (network->functionalRead(pkt))
+                return true;
+        }
     }
 
     return false;
@@ -557,7 +561,9 @@
         }
     }
 
-    num_functional_writes += m_network->functionalWrite(pkt);
+    for (auto& network : m_networks) {
+        num_functional_writes += network->functionalWrite(pkt);
+    }
     DPRINTF(RubySystem, "Messages written = %u\n", num_functional_writes);
 
     return true;
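
With m_networks now a vector (see the RubySystem.hh hunk below), a system can
register more than one network, and every functional access probes each of
them in turn. A hypothetical registration sequence, for illustration only:

    // Construction details elided; registerNetwork() takes ownership, so
    // the explicit `delete m_network` in the destructor is no longer needed.
    ruby_system->registerNetwork(network_a);
    ruby_system->registerNetwork(network_b);
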
diff --git a/src/mem/ruby/system/RubySystem.hh b/src/mem/ruby/system/RubySystem.hh
index 5d10991..2407072 100644
--- a/src/mem/ruby/system/RubySystem.hh
+++ b/src/mem/ruby/system/RubySystem.hh
@@ -130,7 +130,8 @@
     SimpleMemory *m_phys_mem;
     const bool m_access_backing_store;
 
-    Network* m_network;
+    std::vector<std::unique_ptr<Network>> m_networks;
     std::vector<AbstractController *> m_abs_cntrl_vec;
     Cycles m_start_cycle;
 
diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc
index de7941a..aa134f4 100644
--- a/src/mem/ruby/system/Sequencer.cc
+++ b/src/mem/ruby/system/Sequencer.cc
@@ -738,14 +738,6 @@
         << "]";
 }
 
-// this can be called from setState whenever coherence permissions are
-// upgraded when invoked, coherence violations will be checked for the
-// given block
-void
-Sequencer::checkCoherence(Addr addr)
-{
-}
-
 void
 Sequencer::recordRequestType(SequencerRequestType requestType) {
     DPRINTF(RubyStats, "Recorded statistic: %s\n",
diff --git a/src/mem/ruby/system/Sequencer.hh b/src/mem/ruby/system/Sequencer.hh
index bb93607..ebca568 100644
--- a/src/mem/ruby/system/Sequencer.hh
+++ b/src/mem/ruby/system/Sequencer.hh
@@ -124,7 +124,6 @@
     { deschedule(deadlockCheckEvent); }
 
     void print(std::ostream& out) const;
-    void checkCoherence(Addr address);
 
     void markRemoved();
     void evictionCallback(Addr address);
diff --git a/src/mem/ruby/system/VIPERCoalescer.cc b/src/mem/ruby/system/VIPERCoalescer.cc
index feb13c5..d8977ac 100644
--- a/src/mem/ruby/system/VIPERCoalescer.cc
+++ b/src/mem/ruby/system/VIPERCoalescer.cc
@@ -76,15 +76,8 @@
 {
 }
 
-// Analyzes the packet to see if this request can be coalesced.
-// If request can be coalesced, this request is added to the reqCoalescer table
-// and makeRequest returns RequestStatus_Issued;
-// If this is the first request to a cacheline, request is added to both
-// newRequests queue and to the reqCoalescer table; makeRequest
-// returns RequestStatus_Issued.
-// If there is a pending request to this cacheline and this request
-// can't be coalesced, RequestStatus_Aliased is returned and
-// the packet needs to be reissued.
+// Places an uncoalesced packet in uncoalescedTable. If the packet is a
+// special type (MemFence, scoping, etc.), it is issued immediately.
 RequestStatus
 VIPERCoalescer::makeRequest(PacketPtr pkt)
 {
@@ -109,7 +102,6 @@
 
             return RequestStatus_Issued;
         }
-//        return RequestStatus_Aliased;
     } else if (pkt->req->isKernel() && pkt->req->isRelease()) {
         // Flush Dirty Data on Kernel End
         // isKernel + isRelease
@@ -123,13 +115,10 @@
         }
         return RequestStatus_Issued;
     }
-    RequestStatus requestStatus = GPUCoalescer::makeRequest(pkt);
-    if (requestStatus!=RequestStatus_Issued) {
-        // Request not isssued
-        // enqueue Retry
-        DPRINTF(GPUCoalescer, "Request not issued by GPUCoaleser\n");
-        return requestStatus;
-    } else if (pkt->req->isKernel() && pkt->req->isAcquire()) {
+
+    GPUCoalescer::makeRequest(pkt);
+
+    if (pkt->req->isKernel() && pkt->req->isAcquire()) {
         // Invalidate clean Data on Kernel Begin
         // isKernel + isAcquire
         invL1();
diff --git a/src/mem/slicc/ast/DeferEnqueueingStatementAST.py b/src/mem/slicc/ast/DeferEnqueueingStatementAST.py
new file mode 100644
index 0000000..40b9a4c
--- /dev/null
+++ b/src/mem/slicc/ast/DeferEnqueueingStatementAST.py
@@ -0,0 +1,82 @@
+#
+# Copyright (c) 2017 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from this
+# software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Authors: Tuan Ta
+#
+
+from slicc.ast.StatementAST import StatementAST
+from slicc.symbols import Var
+
+class DeferEnqueueingStatementAST(StatementAST):
+    def __init__(self, slicc, queue_name, type_ast, statements):
+        super(DeferEnqueueingStatementAST, self).__init__(slicc)
+
+        self.queue_name = queue_name
+        self.type_ast = type_ast
+        self.statements = statements
+
+    def __repr__(self):
+        return "[DeferEnqueueingStatementAst: %s %s %s]" % \
+               (self.queue_name, self.type_ast.ident, self.statements)
+
+    def generate(self, code, return_type):
+        code("{")
+        code.indent()
+        self.symtab.pushFrame()
+
+        msg_type = self.type_ast.type
+
+        # Add new local var to symbol table
+        v = Var(self.symtab, "out_msg", self.location, msg_type, "*out_msg",
+                self.pairs)
+        self.symtab.newSymbol(v)
+
+        # Declare message
+        code("std::shared_ptr<${{msg_type.c_ident}}> out_msg = "\
+             "std::make_shared<${{msg_type.c_ident}}>(clockEdge());")
+
+        # The other statements
+        t = self.statements.generate(code, None)
+        self.queue_name.assertType("OutPort")
+
+        code("(${{self.queue_name.var.code}}).deferEnqueueingMessage(addr, "\
+             "out_msg);")
+
+        # End scope
+        self.symtab.popFrame()
+        code.dedent()
+        code("}")
+
+    def findResources(self, resources):
+        var = self.queue_name.var
+        res_count = int(resources.get(var, 0))
+        resources[var] = str(res_count + 1)
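
For a hypothetical message type ResponseMsg, the generate() method above
expands a defer_enqueueing block into C++ of roughly this shape (the exact
out-port expression comes from the SLICC symbol table):

    {
        std::shared_ptr<ResponseMsg> out_msg =
            std::make_shared<ResponseMsg>(clockEdge());
        // ...generated body statements fill in out_msg's fields...
        (out_port).deferEnqueueingMessage(addr, out_msg);
    }
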
diff --git a/src/mem/slicc/ast/__init__.py b/src/mem/slicc/ast/__init__.py
index e3169e8..c410104 100644
--- a/src/mem/slicc/ast/__init__.py
+++ b/src/mem/slicc/ast/__init__.py
@@ -33,6 +33,7 @@
 from slicc.ast.CheckNextCycleAST import *
 from slicc.ast.DeclAST import *
 from slicc.ast.DeclListAST import *
+from slicc.ast.DeferEnqueueingStatementAST import *
 from slicc.ast.EnqueueStatementAST import *
 from slicc.ast.EnumDeclAST import *
 from slicc.ast.EnumExprAST import *
diff --git a/src/mem/slicc/parser.py b/src/mem/slicc/parser.py
index 846a74f..643eec6 100644
--- a/src/mem/slicc/parser.py
+++ b/src/mem/slicc/parser.py
@@ -120,6 +120,7 @@
         'void' : 'VOID',
         'new' : 'NEW',
         'OOD' : 'OOD',
+        'defer_enqueueing' : 'DEFER_ENQUEUEING',
     }
 
     literals = ':[]{}(),='
@@ -583,6 +584,10 @@
         "statement : ENQUEUE '(' var ',' type ',' expr ')' statements"
         p[0] = ast.EnqueueStatementAST(self, p[3], p[5], p[7], p[9])
 
+    def p_statement__defer_enqueueing(self, p):
+        "statement : DEFER_ENQUEUEING '(' var ',' type ')' statements"
+        p[0] = ast.DeferEnqueueingStatementAST(self, p[3], p[5], p[7])
+
     def p_statement__stall_and_wait(self, p):
         "statement : STALL_AND_WAIT '(' var ',' var ')' SEMI"
         p[0] = ast.StallAndWaitStatementAST(self, p[3], p[5])
diff --git a/src/python/m5/simulate.py b/src/python/m5/simulate.py
index 3317ae8..698dfbc 100644
--- a/src/python/m5/simulate.py
+++ b/src/python/m5/simulate.py
@@ -107,8 +107,9 @@
         except ImportError:
             pass
 
-    do_dot(root, options.outdir, options.dot_config)
-    do_ruby_dot(root, options.outdir, options.dot_config)
+    if options.dot_config:
+        do_dot(root, options.outdir, options.dot_config)
+        do_ruby_dot(root, options.outdir, options.dot_config)
 
     # Initialize the global statistics
     stats.initSimStats()
diff --git a/src/sim/core.hh b/src/sim/core.hh
index 7a7e911..48b7096 100644
--- a/src/sim/core.hh
+++ b/src/sim/core.hh
@@ -43,8 +43,6 @@
 /// The universal simulation clock.
 inline Tick curTick() { return _curEventQueue->getCurTick(); }
 
-const Tick retryTime = 1000;
-
 /// These are variables that are set based on the simulator frequency
 ///@{
 namespace SimClock {
diff --git a/src/sim/guest_abi/definition.hh b/src/sim/guest_abi/definition.hh
index becdb3c..4928b93 100644
--- a/src/sim/guest_abi/definition.hh
+++ b/src/sim/guest_abi/definition.hh
@@ -57,27 +57,28 @@
 template <typename ABI, typename Ret, typename Enabled=void>
 struct Result
 {
-  private:
     /*
      * Store result "ret" into the state accessible through tc. Optionally
      * accept "state" in case it holds some signature wide information.
      *
      * Note that the declaration below is only to document the expected
-     * signature and is private so it won't be used by accident.
+     * signature and is commented out so it won't be used by accident.
      * Specializations of this Result class should define their own version
-     * of this method which actually does something and is public.
+     * of this method which actually does something.
+     *
+     * static void store(ThreadContext *tc, const Ret &ret);
+     * static void store(ThreadContext *tc, const Ret &ret,
+     *                   typename ABI::State &state);
      */
-    static void store(ThreadContext *tc, const Ret &ret);
-    static void store(ThreadContext *tc, const Ret &ret,
-                      typename ABI::State &state);
 
     /*
      * Prepare for a result of type Ret. This might mean, for instance,
      * allocating an argument register for a result pointer.
      *
      * This method can be excluded if no preparation is necessary.
+     *
+     * static void prepare(ThreadContext *tc, typename ABI::State &state);
      */
-    static void prepare(ThreadContext *tc, typename ABI::State &state);
 };
 
 /*
@@ -98,16 +99,18 @@
      *
      * Like Result::store above, the declaration below is only to document
      * the expected method signature.
+     *
+     * static Arg get(ThreadContext *tc, typename ABI::State &state);
      */
-    static Arg get(ThreadContext *tc, typename ABI::State &state);
 
     /*
      * Prepare for an argument of type Arg. This might mean, for instance,
      * allocating an argument register for a result pointer.
      *
      * This method can be excluded if no preparation is necessary.
+     *
+     * static void allocate(ThreadContext *tc, typename ABI::State &state);
      */
-    static void allocate(ThreadContext *tc, typename ABI::State &state);
 };
 
 } // namespace GuestABI
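
The specialization pattern itself is unchanged: users still define their own
public store()/prepare() (and get()/allocate()) with the signatures shown in
the comments. A minimal sketch for a hypothetical ABI (MyABI and its int
State are illustrative, not part of this patch):

    class ThreadContext; // defined in cpu/thread_context.hh

    struct MyABI { using State = int; };

    namespace GuestABI
    {
    template <>
    struct Result<MyABI, int>
    {
        static void
        store(ThreadContext *tc, const int &ret,
              typename MyABI::State &state)
        {
            // For this toy ABI the "state" is just an int slot.
            state = ret;
        }
    };
    } // namespace GuestABI
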
diff --git a/src/sim/kernel_workload.cc b/src/sim/kernel_workload.cc
index 415ff96..74f9fc7 100644
--- a/src/sim/kernel_workload.cc
+++ b/src/sim/kernel_workload.cc
@@ -35,9 +35,6 @@
     _loadAddrMask(p.load_addr_mask), _loadAddrOffset(p.load_addr_offset),
     kernelSymtab(new Loader::SymbolTable), commandLine(p.command_line)
 {
-    if (!Loader::debugSymbolTable)
-        Loader::debugSymbolTable = new Loader::SymbolTable;
-
     if (params().object_file == "") {
         inform("No kernel set for full system simulation. "
                "Assuming you know what you're doing.");
@@ -70,10 +67,10 @@
         fatal_if(!kernelObj->loadLocalSymbols(kernelSymtab),
                 "Could not load kernel local symbols.");
 
-        fatal_if(!kernelObj->loadGlobalSymbols(Loader::debugSymbolTable),
+        fatal_if(!kernelObj->loadGlobalSymbols(&Loader::debugSymbolTable),
                 "Could not load kernel symbols.");
 
-        fatal_if(!kernelObj->loadLocalSymbols(Loader::debugSymbolTable),
+        fatal_if(!kernelObj->loadLocalSymbols(&Loader::debugSymbolTable),
                 "Could not load kernel local symbols.");
     }
 
diff --git a/src/sim/kernel_workload.hh b/src/sim/kernel_workload.hh
index b88051a..34406eb 100644
--- a/src/sim/kernel_workload.hh
+++ b/src/sim/kernel_workload.hh
@@ -98,9 +98,9 @@
     }
 
     bool
-    insertSymbol(Addr address, const std::string &symbol) override
+    insertSymbol(const Loader::Symbol &symbol) override
     {
-        return kernelSymtab->insert(address, symbol);
+        return kernelSymtab->insert(symbol);
     }
 
     void initState() override;
diff --git a/src/sim/process.cc b/src/sim/process.cc
index a55362d..9a88163 100644
--- a/src/sim/process.cc
+++ b/src/sim/process.cc
@@ -155,13 +155,11 @@
 
     image = objFile->buildImage();
 
-    if (!::Loader::debugSymbolTable) {
-        ::Loader::debugSymbolTable = new ::Loader::SymbolTable();
-        if (!objFile->loadGlobalSymbols(::Loader::debugSymbolTable) ||
-            !objFile->loadLocalSymbols(::Loader::debugSymbolTable) ||
-            !objFile->loadWeakSymbols(::Loader::debugSymbolTable)) {
-            delete ::Loader::debugSymbolTable;
-            ::Loader::debugSymbolTable = nullptr;
+    if (::Loader::debugSymbolTable.empty()) {
+        if (!objFile->loadGlobalSymbols(&::Loader::debugSymbolTable) ||
+            !objFile->loadLocalSymbols(&::Loader::debugSymbolTable) ||
+            !objFile->loadWeakSymbols(&::Loader::debugSymbolTable)) {
+            ::Loader::debugSymbolTable.clear();
         }
     }
 }
diff --git a/src/sim/pseudo_inst.cc b/src/sim/pseudo_inst.cc
index 203afc0..2d87b05 100644
--- a/src/sim/pseudo_inst.cc
+++ b/src/sim/pseudo_inst.cc
@@ -242,8 +242,10 @@
         if (!to_number(address, addr))
             continue;
 
-        if (!tc->getSystemPtr()->workload->insertSymbol(addr, symbol))
+        if (!tc->getSystemPtr()->workload->insertSymbol(
+                    { Loader::Symbol::Binding::Global, symbol, addr })) {
             continue;
+        }
 
 
         DPRINTF(Loader, "Loaded symbol: %s @ %#llx\n", symbol, addr);
@@ -264,8 +266,10 @@
 
     DPRINTF(Loader, "Loaded symbol: %s @ %#llx\n", symbol, addr);
 
-    tc->getSystemPtr()->workload->insertSymbol(addr, symbol);
-    Loader::debugSymbolTable->insert(addr, symbol);
+    tc->getSystemPtr()->workload->insertSymbol(
+            { Loader::Symbol::Binding::Global, symbol, addr });
+    Loader::debugSymbolTable.insert(
+            { Loader::Symbol::Binding::Global, symbol, addr });
 }
 
 uint64_t
diff --git a/src/sim/pseudo_inst.hh b/src/sim/pseudo_inst.hh
index 6a63812..982d6c8 100644
--- a/src/sim/pseudo_inst.hh
+++ b/src/sim/pseudo_inst.hh
@@ -59,16 +59,6 @@
 namespace GuestABI
 {
 
-template <typename T>
-struct Result<PseudoInstABI, T>
-{
-    static void
-    store(ThreadContext *tc, const T &ret)
-    {
-        // Don't do anything with the pseudo inst results by default.
-    }
-};
-
 template <>
 struct Argument<PseudoInstABI, uint64_t>
 {
@@ -134,9 +124,9 @@
  * @return Whether the pseudo instruction was recognized/handled.
  */
 
-template <typename ABI>
+template <typename ABI, bool store_ret>
 bool
-pseudoInst(ThreadContext *tc, uint8_t func, uint64_t &result)
+pseudoInstWork(ThreadContext *tc, uint8_t func, uint64_t &result)
 {
     DPRINTF(PseudoInst, "PseudoInst::pseudoInst(%i)\n", func);
 
@@ -160,11 +150,11 @@
         return true;
 
       case M5OP_QUIESCE_TIME:
-        result = invokeSimcall<ABI>(tc, quiesceTime);
+        result = invokeSimcall<ABI, store_ret>(tc, quiesceTime);
         return true;
 
       case M5OP_RPNS:
-        result = invokeSimcall<ABI>(tc, rpns);
+        result = invokeSimcall<ABI, store_ret>(tc, rpns);
         return true;
 
       case M5OP_WAKE_CPU:
@@ -180,7 +170,7 @@
         return true;
 
       case M5OP_INIT_PARAM:
-        result = invokeSimcall<ABI>(tc, initParam);
+        result = invokeSimcall<ABI, store_ret>(tc, initParam);
         return true;
 
       case M5OP_LOAD_SYMBOL:
@@ -204,11 +194,11 @@
         return true;
 
       case M5OP_WRITE_FILE:
-        result = invokeSimcall<ABI>(tc, writefile);
+        result = invokeSimcall<ABI, store_ret>(tc, writefile);
         return true;
 
       case M5OP_READ_FILE:
-        result = invokeSimcall<ABI>(tc, readfile);
+        result = invokeSimcall<ABI, store_ret>(tc, readfile);
         return true;
 
       case M5OP_DEBUG_BREAK:
@@ -262,6 +252,21 @@
     }
 }
 
+template <typename ABI, bool store_ret=false>
+bool
+pseudoInst(ThreadContext *tc, uint8_t func, uint64_t &result)
+{
+    return pseudoInstWork<ABI, store_ret>(tc, func, result);
+}
+
+template <typename ABI, bool store_ret=true>
+bool
+pseudoInst(ThreadContext *tc, uint8_t func)
+{
+    uint64_t result;
+    return pseudoInstWork<ABI, store_ret>(tc, func, result);
+}
+
 } // namespace PseudoInst
 
 #endif // __SIM_PSEUDO_INST_HH__
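
The split gives callers two entry points: the new two-argument overload
defaults store_ret to true, so the return value is stored back through the
ABI, while the three-argument overload keeps the old behaviour and hands the
result to the caller. A usage sketch with a hypothetical ABI name:

    uint64_t result;
    // Caller consumes the result itself; nothing stored via the ABI.
    PseudoInst::pseudoInst<MyISA::PseudoInstABI>(tc, func, result);
    // No result reference; the ABI's Result specialization stores it.
    PseudoInst::pseudoInst<MyISA::PseudoInstABI>(tc, func);
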
diff --git a/src/sim/syscall_emul.cc b/src/sim/syscall_emul.cc
index bffedfd..6d39823 100644
--- a/src/sim/syscall_emul.cc
+++ b/src/sim/syscall_emul.cc
@@ -1633,3 +1633,26 @@
     return (status == -1) ? -errno : status;
 }
 
+SyscallReturn
+getcpuFunc(SyscallDesc *desc, ThreadContext *tc,
+           Addr cpu_ptr, Addr node_ptr, Addr tcache_ptr)
+{
+    bool error = false;
+
+    // unsigned is the same size (4 bytes) on all Linux-supported ISAs.
+    if (cpu_ptr != 0) {
+        TypedBufferArg<uint32_t> result(cpu_ptr);
+        *result = htog(tc->contextId(),
+            tc->getSystemPtr()->getGuestByteOrder());
+        error |= !result.copyOut(tc->getVirtProxy());
+    }
+
+    // Set a fixed NUMA node 0.
+    if (node_ptr != 0) {
+        TypedBufferArg<uint32_t> result(node_ptr);
+        *result = 0;
+        error |= !result.copyOut(tc->getVirtProxy());
+    }
+
+    return error ? -EFAULT : 0;
+}
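
From the guest's point of view the handler behaves like Linux getcpu(2) with
the CPU fixed to the thread's context id and the NUMA node always 0. A
hypothetical guest-side check (using the glibc wrapper, available since
glibc 2.29):

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main()
    {
        unsigned cpu, node;
        if (getcpu(&cpu, &node) == 0)
            printf("cpu %u, node %u\n", cpu, node); // node is always 0 here
        return 0;
    }
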
diff --git a/src/sim/syscall_emul.hh b/src/sim/syscall_emul.hh
index 290c48e..5b966cd 100644
--- a/src/sim/syscall_emul.hh
+++ b/src/sim/syscall_emul.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012-2013, 2015, 2019 ARM Limited
+ * Copyright (c) 2012-2013, 2015, 2019-2020 ARM Limited
  * Copyright (c) 2015 Advanced Micro Devices, Inc.
  * All rights reserved
  *
@@ -354,6 +354,9 @@
                              int tgt_fd, int level, int optname,
                              Addr valPtr, socklen_t len);
 
+SyscallReturn getcpuFunc(SyscallDesc *desc, ThreadContext *tc,
+                         Addr cpu_ptr, Addr node_ptr, Addr tcache_ptr);
+
 // Target getsockname() handler.
 SyscallReturn getsocknameFunc(SyscallDesc *desc, ThreadContext *tc,
                               int tgt_fd, Addr addrPtr, Addr lenPtr);
@@ -674,7 +677,8 @@
     tgt->f_frsize = htog(host->f_frsize, bo);
 #endif
 #if defined(__linux__)
-    memcpy(&tgt->f_spare, &host->f_spare, sizeof(host->f_spare));
+    memcpy(&tgt->f_spare, &host->f_spare,
+            std::min(sizeof(host->f_spare), sizeof(tgt->f_spare)));
 #else
     /*
      * The fields are different sizes per OS. Don't bother with
@@ -1718,7 +1722,7 @@
                 ffdp->getFileName());
 
             if (lib) {
-                lib->loadAllSymbols(Loader::debugSymbolTable,
+                lib->loadAllSymbols(&Loader::debugSymbolTable,
                                 lib->buildImage().minAddr(), start);
             }
         }
diff --git a/src/sim/ticked_object.cc b/src/sim/ticked_object.cc
index c6d1f98..7af439c 100644
--- a/src/sim/ticked_object.cc
+++ b/src/sim/ticked_object.cc
@@ -44,7 +44,7 @@
     Stats::Scalar *imported_num_cycles,
     Event::Priority priority) :
     object(object_),
-    event([this]{ processClockEvent(); }, name(), false, priority),
+    event([this]{ processClockEvent(); }, object_.name(), false, priority),
     running(false),
     lastStopped(0),
     /* Allocate numCycles if an external stat wasn't passed in */
diff --git a/src/sim/workload.hh b/src/sim/workload.hh
index ca2dffb..e24aa74 100644
--- a/src/sim/workload.hh
+++ b/src/sim/workload.hh
@@ -50,7 +50,7 @@
     virtual Loader::Arch getArch() const = 0;
 
     virtual const Loader::SymbolTable *symtab(ThreadContext *tc) = 0;
-    virtual bool insertSymbol(Addr address, const std::string &symbol) = 0;
+    virtual bool insertSymbol(const Loader::Symbol &symbol) = 0;
 
     /** @{ */
     /**
@@ -70,14 +70,12 @@
     addFuncEvent(const Loader::SymbolTable *symtab, const char *lbl,
                  const std::string &desc, Args... args)
     {
-        Addr addr M5_VAR_USED = 0; // initialize only to avoid compiler warning
+        auto it = symtab->find(lbl);
+        if (it == symtab->end())
+            return nullptr;
 
-        if (symtab->findAddress(lbl, addr)) {
-            return new T(system, desc, fixFuncEventAddr(addr),
-                          std::forward<Args>(args)...);
-        }
-
-        return nullptr;
+        return new T(system, desc, fixFuncEventAddr(it->address),
+                      std::forward<Args>(args)...);
     }
 
     template <class T>
diff --git a/src/systemc/dt/int/SConscript b/src/systemc/dt/int/SConscript
index 7b97d5f..26bb6ae 100644
--- a/src/systemc/dt/int/SConscript
+++ b/src/systemc/dt/int/SConscript
@@ -25,13 +25,22 @@
 
 Import('*')
 
+from m5.util import compareVersions
+
 if env['USE_SYSTEMC']:
+    if main['GCC'] and compareVersions(main['GCC_VERSION'], '10.1') >= 0:
+        disable_false_positives = {
+            "CCFLAGS": [ "-Wno-array-bounds",
+                         "-Wno-stringop-overflow" ]
+        }
+    else:
+        disable_false_positives = {}
     Source('messages.cc')
     Source('sc_int_base.cc')
     Source('sc_int_mask.cc')
     Source('sc_length_param.cc')
     Source('sc_nbexterns.cc')
     Source('sc_nbutils.cc')
-    Source('sc_signed.cc')
+    Source('sc_signed.cc', append=disable_false_positives)
     Source('sc_uint_base.cc')
-    Source('sc_unsigned.cc')
+    Source('sc_unsigned.cc', append=disable_false_positives)
diff --git a/src/unittest/nmtest.cc b/src/unittest/nmtest.cc
index f444a90..3601fa8 100644
--- a/src/unittest/nmtest.cc
+++ b/src/unittest/nmtest.cc
@@ -52,27 +52,24 @@
     obj->loadLocalSymbols(&symtab);
 
     if (argc == 2) {
-        Loader::SymbolTable::ATable::const_iterator i =
-            symtab.getAddrTable().begin();
-        Loader::SymbolTable::ATable::const_iterator end =
-            symtab.getAddrTable().end();
-        while (i != end) {
-            cprintf("%#x %s\n", i->first, i->second);
-            ++i;
-        }
+        for (const Loader::Symbol &symbol: symtab)
+            cprintf("%#x %s\n", symbol.address, symbol.name);
     } else {
         string symbol = argv[2];
         Addr address;
 
         if (symbol[0] == '0' && symbol[1] == 'x') {
+            Loader::SymbolTable::const_iterator it;
             if (to_number(symbol, address) &&
-                symtab.findSymbol(address, symbol))
-                cprintf("address = %#x, symbol = %s\n", address, symbol);
-            else
+                (it = symtab.find(address)) != symtab.end()) {
+                cprintf("address = %#x, symbol = %s\n", address, it->name);
+            } else {
                 cprintf("address = %#x was not found\n", address);
+            }
         } else {
-            if (symtab.findAddress(symbol, address))
-                cprintf("symbol = %s address = %#x\n", symbol, address);
+            auto it = symtab.find(symbol);
+            if (it != symtab.end())
+                cprintf("symbol = %s address = %#x\n", symbol, it->address);
             else
                 cprintf("symbol = %s was not found\n", symbol);
         }
diff --git a/src/unittest/symtest.cc b/src/unittest/symtest.cc
index fd006b4..369e1a4 100644
--- a/src/unittest/symtest.cc
+++ b/src/unittest/symtest.cc
@@ -60,19 +60,21 @@
     Addr address;
 
     if (!to_number(symbol, address)) {
-        if (!symtab.findAddress(symbol, address)) {
+        auto it = symtab.find(symbol);
+        if (it == symtab.end()) {
             cout << "could not find symbol: " << symbol << endl;
             exit(1);
         }
 
-        cout << symbol << " -> " << "0x" << hex << address << endl;
+        cout << symbol << " -> " << "0x" << hex << it->address << endl;
     } else {
-        if (!symtab.findSymbol(address, symbol)) {
+        auto it = symtab.find(address);
+        if (it == symtab.end()) {
             cout << "could not find address: " << address << endl;
             exit(1);
         }
 
-        cout << "0x" << hex << address << " -> " << symbol<< endl;
+        cout << "0x" << hex << address << " -> " << it->name << endl;
     }
 
     return 0;
diff --git a/tests/gem5/memory/test.py b/tests/gem5/memory/test.py
index bf87a27..beed084 100644
--- a/tests/gem5/memory/test.py
+++ b/tests/gem5/memory/test.py
@@ -71,6 +71,8 @@
 null_tests = [
     ('garnet_synth_traffic', ['--sim-cycles', '5000000']),
     ('memcheck', ['--maxtick', '2000000000', '--prefetchers']),
+    ('ruby_mem_test', ['--abs-max-tick', '20000000',
+        '--functional', '10']),
     ('ruby_random_test', ['--maxloads', '5000']),
     ('ruby_direct_test', ['--requests', '50000']),
 ]
diff --git a/util/dockerfiles/gcn-gpu/Dockerfile b/util/dockerfiles/gcn-gpu/Dockerfile
new file mode 100644
index 0000000..485a406
--- /dev/null
+++ b/util/dockerfiles/gcn-gpu/Dockerfile
@@ -0,0 +1,132 @@
+FROM ubuntu:16.04
+
+# Should be the minimal set of required packages
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    findutils \
+    file \
+    libunwind8 \
+    libunwind-dev \
+    pkg-config \
+    build-essential \
+    gcc-multilib \
+    g++-multilib \
+    git \
+    ca-certificates \
+    m4 \
+    scons \
+    zlib1g \
+    zlib1g-dev \
+    libprotobuf-dev \
+    protobuf-compiler \
+    libprotoc-dev \
+    libgoogle-perftools-dev \
+    python-dev \
+    python \
+    python-yaml \
+    wget \
+    libpci3 \
+    libelf1 \
+    libelf-dev \
+    cmake \
+    openssl \
+    libssl-dev \
+    libboost-filesystem-dev \
+    libboost-system-dev \
+    libboost-dev
+
+ARG gem5_dist=http://dist.gem5.org/dist/current
+
+# Install ROCm 1.6 binaries
+RUN wget -qO- ${gem5_dist}/apt_1.6.2.tar.bz2 \
+    | tar -xjv \
+    && cd apt_1.6.2/pool/main/ \
+    && dpkg -i h/hsakmt-roct-dev/* \
+    && dpkg -i h/hsa-ext-rocr-dev/* \
+    && dpkg -i h/hsa-rocr-dev/* \
+    && dpkg -i r/rocm-utils/* \
+    && dpkg -i h/hcc/* \
+    && dpkg -i r/rocm-opencl/* \
+    && dpkg -i r/rocm-opencl-dev/*
+
+# Get ROCm libraries we need to compile from source (and ROCm-profiler)
+RUN git clone --single-branch https://github.com/ROCm-Developer-Tools/HIP/ && \
+    git clone --single-branch https://github.com/ROCmSoftwarePlatform/hipBLAS/ && \
+    git clone --single-branch https://github.com/ROCmSoftwarePlatform/rocBLAS/ && \
+    git clone --single-branch https://github.com/ROCmSoftwarePlatform/MIOpenGEMM/ && \
+    git clone --single-branch https://github.com/ROCmSoftwarePlatform/MIOpen/ && \
+    git clone --single-branch https://github.com/RadeonOpenCompute/rocm-cmake/ && \
+    git clone --single-branch https://github.com/rocmarchive/ROCm-Profiler.git
+
+# Apply patches to various repos
+RUN mkdir -p /patch && cd /patch && \
+    wget ${gem5_dist}/rocm_patches/hipBLAS.patch && \
+    wget ${gem5_dist}/rocm_patches/hip.patch && \
+    wget ${gem5_dist}/rocm_patches/miopen.patch && \
+    wget ${gem5_dist}/rocm_patches/rocBLAS.patch
+
+RUN git -C /HIP/ checkout 0e3d824e && git -C /HIP/ apply /patch/hip.patch && \
+    git -C /hipBLAS/ checkout ee57787e && git -C /hipBLAS/ apply /patch/hipBLAS.patch && \
+    git -C /rocBLAS/ checkout cbff4b4e && git -C /rocBLAS/ apply /patch/rocBLAS.patch && \
+    git -C /MIOpenGEMM/ checkout 9547fb9e && \
+    git -C /MIOpen/ checkout a9949e30 && git -C /MIOpen/ apply /patch/miopen.patch
+
+ENV ROCM_PATH /opt/rocm
+ENV HCC_HOME ${ROCM_PATH}/hcc
+ENV HSA_PATH ${ROCM_PATH}/hsa
+ENV HIP_PATH ${ROCM_PATH}/hip
+ENV HIP_PLATFORM hcc
+ENV PATH ${ROCM_PATH}/bin:${HCC_HOME}/bin:${HSA_PATH}/bin:${HIP_PATH}/bin:${PATH}
+ENV HCC_AMDGPU_TARGET gfx801
+
+# Create build dirs for machine learning ROCm installs
+RUN mkdir -p /HIP/build && \
+    mkdir -p /rocBLAS/build && \
+    mkdir -p /hipBLAS/build && \
+    mkdir -p /rocm-cmake/build && \
+    mkdir -p /MIOpenGEMM/build && \
+    mkdir -p /MIOpen/build
+
+# Do the builds, empty build dir to trim image size
+WORKDIR /HIP/build
+RUN cmake .. && make -j$(nproc) && make install && rm -rf *
+
+WORKDIR /rocBLAS/build
+RUN CXX=/opt/rocm/bin/hcc cmake -DCMAKE_CXX_FLAGS="--amdgpu-target=gfx801" .. && \
+    make -j$(nproc) && make install && rm -rf *
+
+WORKDIR /hipBLAS/build
+RUN CXX=/opt/rocm/bin/hcc cmake -DCMAKE_CXX_FLAGS="--amdgpu-target=gfx801" .. && \
+    make -j$(nproc) && make install && rm -rf *
+
+WORKDIR /rocm-cmake/build
+RUN cmake .. && cmake --build . --target install && rm -rf *
+
+WORKDIR /MIOpenGEMM/build
+RUN cmake .. && make miopengemm && make install && rm -rf *
+
+# Should link this in as a volume if at all possible
+RUN mkdir -p /.cache/miopen && chmod 777 /.cache/miopen
+
+WORKDIR /MIOpen/build
+RUN CXX=/opt/rocm/hcc/bin/hcc cmake \
+    -DCMAKE_BUILD_TYPE=Debug \
+    -DCMAKE_INSTALL_PREFIX=/opt/rocm \
+    -DMIOPEN_BACKEND=HIP \
+    -DCMAKE_PREFIX_PATH="/opt/rocm/hip;/opt/rocm/hcc;/opt/rocm/rocdl;/opt/rocm/miopengemm;/opt/rocm/hsa" \
+    -DMIOPEN_CACHE_DIR=/.cache/miopen \
+    -DMIOPEN_AMDGCN_ASSEMBLER_PATH=/opt/rocm/opencl/bin \
+    -DCMAKE_CXX_FLAGS="-isystem /usr/include/x86_64-linux-gnu" .. && \
+    make -j$(nproc) && make install && rm -rf *
+
+# Create performance DB for gfx801. Per-user DBs may still be needed.
+WORKDIR /opt/rocm/miopen/share/miopen/db
+RUN ln -s gfx803_64.cd.pdb.txt gfx801_8.cd.pdb.txt && \
+    ln -s gfx803_64.cd.pdb.txt gfx801_16.cd.pdb.txt && \
+    ln -s gfx803_64.cd.pdb.txt gfx801_32.cd.pdb.txt && \
+    ln -s gfx803_64.cd.pdb.txt gfx801_64.cd.pdb.txt
+
+# Install profiler from .deb file, works for 1.6.2
+WORKDIR /ROCm-Profiler
+RUN dpkg -i package/rocm-profiler_4.0.6036_amd64.deb
+
+WORKDIR /
diff --git a/util/dockerfiles/gcn-gpu/README.md b/util/dockerfiles/gcn-gpu/README.md
new file mode 100644
index 0000000..0764cad
--- /dev/null
+++ b/util/dockerfiles/gcn-gpu/README.md
@@ -0,0 +1,27 @@
+## gcn3-gpu dockerfile
+This Dockerfile contains all the dependencies necessary to run GPU applications in gem5 using the gcn3 APU model
+
+### Building the image
+```
+docker build -t <image_name> .
+```
+
+### Building gem5 using the image
+The following command assumes the gem5 directory is a subdirectory of your current directory
+```
+docker run --rm -v $PWD/gem5:/gem5 -w /gem5 <image_name> scons -sQ -j$(nproc) build/GCN3_X86/gem5.opt
+```
+
+### Testing gem5 using a prebuilt application
+```
+wget http://dist.gem5.org/dist/current/test-progs/hip_sample_bins/MatrixTranspose
+docker run --rm -v $PWD/MatrixTranspose:/MatrixTranspose -v $PWD/gem5:/gem5 -w /gem5 \
+        <image_name> build/GCN3_X86/gem5.opt configs/example/apu_se.py -n2 --benchmark-root=/ -cMatrixTranspose
+```
+
+### Notes
+* When using the `-v` flag, the path to the input file/directory needs to be the absolute path; symlinks don't work
+* Currently linking in an AFS volume is not supported, as it uses ACLs instead of owner/group IDs
+
+### ToDo
+* Add square to the gem5-resources GitHub repo and add directions for building and running an application
diff --git a/util/gem5img.py b/util/gem5img.py
index 607f034..51a5487 100755
--- a/util/gem5img.py
+++ b/util/gem5img.py
@@ -135,11 +135,17 @@
         exit(returncode)
     lines = out.splitlines()
     # Make sure the first few lines of the output look like what we expect.
-    assert(lines[0][0] == '#')
-    assert(lines[1] == 'unit: sectors')
-    assert(lines[2] == '')
-    # This line has information about the first partition.
-    chunks = lines[3].split()
+    assert(lines[0][0] == '#' or lines[0].startswith('label:'))
+    assert(lines[1] == 'unit: sectors' or lines[1].startswith('label-id:'))
+    assert(lines[2] == '' or lines[2].startswith('device:'))
+    if lines[0][0] == '#':
+        # Parsing an 'old style' dump output.
+        # Line 4 has information about the first partition.
+        chunks = lines[3].split()
+    else:
+        # Parsing a 'new style' dump output.
+        # Line 6 has information about the first partition.
+        chunks = lines[5].split()
     # The fourth chunk is the offset of the partition in sectors followed by
     # a comma. We drop the comma and convert that to an integer.
     sectors = string.atoi(chunks[3][:-1])
@@ -282,12 +288,11 @@
                        [('file', 'Name of the image file.')])
 
 def partition(dev, cylinders, heads, sectors):
-    # Use fdisk to partition the device
-    comStr = '0,\n;\n;\n;\n'
-    return runPriv([findProg('sfdisk'), '--no-reread', '-D', \
-                   '-C', "%d" % cylinders, \
-                   '-H', "%d" % heads, \
-                   '-S', "%d" % sectors, \
+    # Use sfdisk to partition the device
+    # The specified options are intended to work with both new and old
+    # versions of sfdisk (see https://askubuntu.com/a/819614)
+    comStr = ';'
+    return runPriv([findProg('sfdisk'), '--no-reread', '-u', 'S', '-L', \
                    str(dev)], inputVal=comStr)
 
 def partitionComFunc(options, args):
diff --git a/util/m5/src/SConscript b/util/m5/src/SConscript
index c2a3ede..1bf781c 100644
--- a/util/m5/src/SConscript
+++ b/util/m5/src/SConscript
@@ -29,6 +29,7 @@
 
 # Raw source files.
 m5_mmap = 'm5_mmap.c'
+args = 'args.c'
 m5 = 'm5.c'
 jni = 'jni_gem5Op.c'
 lua = 'lua_gem5Op.c'
@@ -63,7 +64,8 @@
 # The m5 stand alone command line utility.
 #
 ct_support = list([ File('%s_call_type.c' % ct.name) for ct in call_types ])
-m5_bin = static_env.Program('out/m5', ct_support + [ m5, m5_mmap, libm5 ])
+m5_bin = static_env.Program('out/m5',
+        ct_support + [ args, m5, m5_mmap, libm5 ])
 
 
 # The shared version of the m5 op call sites, used by multiple targets below.
diff --git a/util/m5/src/args.c b/util/m5/src/args.c
new file mode 100644
index 0000000..f1896c8
--- /dev/null
+++ b/util/m5/src/args.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2011, 2017 ARM Limited
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder.  You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Copyright (c) 2003-2005 The Regents of The University of Michigan
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <inttypes.h>
+#include <stdlib.h>
+#include <string.h>
+
+int
+parse_int_args(int argc, char *argv[], uint64_t ints[], int len)
+{
+    if (argc > len)
+        return 0;
+
+// On 32 bit platforms we need to use strtoull to do the conversion
+#ifdef __LP64__
+#define strto64 strtoul
+#else
+#define strto64 strtoull
+#endif
+    int i;
+    for (i = 0; i < len; ++i)
+        ints[i] = (i < argc) ? strto64(argv[i], NULL, 0) : 0;
+
+#undef strto64
+    return 1;
+}
+
+int
+pack_str_into_regs(const char *str, uint64_t regs[], int num_regs)
+{
+    const size_t RegSize = sizeof(regs[0]);
+    const size_t MaxLen = num_regs * RegSize;
+
+    size_t len = strlen(str);
+
+    if (len > MaxLen)
+        return 0;
+
+    memset(regs, 0, MaxLen);
+
+    while (len) {
+        for (int offset = 0; offset < RegSize && len; offset++, len--) {
+            int shift = offset * 8;
+            *regs |= (uint64_t)(uint8_t)*str++ << shift;
+        }
+        regs++;
+    }
+    return 1;
+}
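
The packing above is little-endian within each register. A tiny self-check
of that behaviour (hypothetical test, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int pack_str_into_regs(const char *str, uint64_t regs[], int num_regs);

    int main()
    {
        uint64_t regs[2];
        assert(pack_str_into_regs("key", regs, 2));
        assert(regs[0] == 0x79656bULL); // 'k' | 'e' << 8 | 'y' << 16
        assert(regs[1] == 0);
        return 0;
    }
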
diff --git a/util/m5/src/args.h b/util/m5/src/args.h
new file mode 100644
index 0000000..530462e
--- /dev/null
+++ b/util/m5/src/args.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2011, 2017 ARM Limited
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder.  You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Copyright (c) 2003-2005 The Regents of The University of Michigan
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __ARGS_H__
+#define __ARGS_H__
+
+#include <stdint.h>
+
+int parse_int_args(int argc, char *argv[], uint64_t ints[], int len);
+int pack_str_into_regs(const char *str, uint64_t regs[], int num_regs);
+
+#endif // __ARGS_H__
diff --git a/util/m5/src/m5.c b/util/m5/src/m5.c
index cda6bf6..11e7d60 100644
--- a/util/m5/src/m5.c
+++ b/util/m5/src/m5.c
@@ -51,6 +51,8 @@
 
 #include <gem5/asm/generic/m5ops.h>
 #include <gem5/m5ops.h>
+
+#include "args.h"
 #include "call_type.h"
 #include "dispatch_table.h"
 #include "m5_mmap.h"
@@ -59,47 +61,6 @@
 char *command = "unspecified";
 void usage();
 
-void
-parse_int_args(int argc, char *argv[], uint64_t ints[], int len)
-{
-    if (argc > len)
-        usage();
-
-// On 32 bit platforms we need to use strtoull to do the conversion
-#ifdef __LP64__
-#define strto64 strtoul
-#else
-#define strto64 strtoull
-#endif
-    int i;
-    for (i = 0; i < len; ++i)
-        ints[i] = (i < argc) ? strto64(argv[i], NULL, 0) : 0;
-
-#undef strto64
-}
-
-void
-pack_str_into_regs(const char *str, uint64_t regs[], int num_regs)
-{
-    const size_t RegSize = sizeof(regs[0]);
-    const size_t MaxLen = num_regs * RegSize;
-
-    size_t len = strlen(str);
-
-    if (len > MaxLen)
-        usage();
-
-    memset(regs, 0, MaxLen);
-
-    while (len) {
-        for (int offset = 0; offset < RegSize && len; offset++, len--) {
-            int shift = offset * 8;
-            *regs |= (uint64_t)(uint8_t)*str++ << shift;
-        }
-        regs++;
-    }
-}
-
 int
 read_file(DispatchTable *dt, int dest_fid)
 {
@@ -168,7 +129,8 @@
         usage();
 
     uint64_t ints[1];
-    parse_int_args(argc, argv, ints, 1);
+    if (!parse_int_args(argc, argv, ints, 1))
+        usage();
     (*dt->m5_exit)(ints[0]);
 }
 
@@ -179,7 +141,8 @@
         usage();
 
     uint64_t ints[2] = {0,0};
-    parse_int_args(argc, argv, ints, argc);
+    if (!parse_int_args(argc, argv, ints, argc))
+        usage();
     (*dt->m5_fail)(ints[1], ints[0]);
 }
 
@@ -187,7 +150,8 @@
 do_reset_stats(DispatchTable *dt, int argc, char *argv[])
 {
     uint64_t ints[2];
-    parse_int_args(argc, argv, ints, 2);
+    if (!parse_int_args(argc, argv, ints, 2))
+        usage();
     (*dt->m5_reset_stats)(ints[0], ints[1]);
 }
 
@@ -195,7 +159,8 @@
 do_dump_stats(DispatchTable *dt, int argc, char *argv[])
 {
     uint64_t ints[2];
-    parse_int_args(argc, argv, ints, 2);
+    if (!parse_int_args(argc, argv, ints, 2))
+        usage();
     (*dt->m5_dump_stats)(ints[0], ints[1]);
 }
 
@@ -203,7 +168,8 @@
 do_dump_reset_stats(DispatchTable *dt, int argc, char *argv[])
 {
     uint64_t ints[2];
-    parse_int_args(argc, argv, ints, 2);
+    if (!parse_int_args(argc, argv, ints, 2))
+        usage();
     (*dt->m5_dump_reset_stats)(ints[0], ints[1]);
 }
 
@@ -232,7 +198,8 @@
 do_checkpoint(DispatchTable *dt, int argc, char *argv[])
 {
     uint64_t ints[2];
-    parse_int_args(argc, argv, ints, 2);
+    if (!parse_int_args(argc, argv, ints, 2))
+        usage();
     (*dt->m5_checkpoint)(ints[0], ints[1]);
 }
 
@@ -264,7 +231,8 @@
         usage();
 
     uint64_t key_str[2];
-    pack_str_into_regs(argc == 0 ? "" : argv[0], key_str, 2);
+    if (!pack_str_into_regs(argc == 0 ? "" : argv[0], key_str, 2))
+        usage();
     uint64_t val = (*dt->m5_init_param)(key_str[0], key_str[1]);
     printf("%"PRIu64, val);
 }