mem-ruby: HTM mem implementation

This patch augments the MESI_Three_Level Ruby protocol with hardware
transactional memory support.

The HTM implementation relies on buffering speculative memory updates.
The core notifies the L0 cache controller that a new transaction has
started, and the controller in turn places itself in a transactional
state (htmTransactionalState := true).

When operating in transactional state, the usual MESI protocol changes
slightly. Lines that are loaded or stored are marked as part of the
transaction's read and write sets, respectively. If an invalidation
request targets a cache line in the read or write set, the transaction
is marked as failed. Similarly, if another core issues a read request
to a speculatively written cache line, i.e. one in the write set, the
transaction is marked as failed. Once failed, all subsequent loads and
stores from the core are made benign, i.e. turned into NOPs at the
cache controller, and responses are marked to indicate that the
transactional state has failed. When the core receives these marked
responses, it generates an HtmFailureFault with the reason for the
transaction failure. Servicing this fault does two things:

(a) Restores the architectural checkpoint
(b) Sends an HTM abort signal to the cache controller

The restoration includes all registers in the checkpoint as well as the
program counter of the instruction before the transaction started.

The abort signal is sent to the L0 cache controller and clears the
failed transactional state. It resets the transactional read and write
sets, invalidates any speculatively written cache lines, and exits the
transactional state so that the MESI protocol operates as usual.
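
The controller-side bookkeeping described above can be summarised with
a small Python sketch. This is only an illustration for this commit
message: the names (ToyL0Controller and its methods) are invented here,
and the real logic lives in the SLICC L0 cache controller added by this
patch.

  class ToyL0Controller:
      def __init__(self):
          self.transactional = False   # htmTransactionalState
          self.failed = False          # htmFailed
          self.read_set = set()
          self.write_set = set()
          self.spec_data = {}          # buffered speculative writes

      def htm_start(self):
          self.transactional = True

      def load(self, addr):
          if self.failed:
              return "benign"          # NOP; response flagged as failed
          if self.transactional:
              self.read_set.add(addr)
          return "ok"

      def store(self, addr, data):
          if self.failed:
              return "benign"
          if self.transactional:
              self.write_set.add(addr)
              self.spec_data[addr] = data  # not yet globally visible
          return "ok"

      def remote_invalidate(self, addr):
          # An invalidation hitting the read or write set fails the transaction.
          if self.transactional and (addr in self.read_set or
                                     addr in self.write_set):
              self.failed = True

      def remote_read(self, addr):
          # A remote read of a speculatively written line also fails it.
          if self.transactional and addr in self.write_set:
              self.failed = True

      def htm_abort(self):
          # Drop all speculative state and leave transactional mode.
          self.read_set.clear()
          self.write_set.clear()
          self.spec_data.clear()
          self.failed = False
          self.transactional = False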

Alternatively, if the instructions within a transaction complete without
triggering an HtmFailureFault, the transaction can be committed. The core
is responsible for notifying the cache controller that the transaction
is complete and the cache controller makes all speculative writes
visible to the rest of the system and exits the transactional state.

Notifying the cache controller is done through HtmCmd Requests, which
are a subtype of Load Requests.
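
The core-side flow (checkpoint, abort on failure, commit otherwise) can
be sketched in the same hedged way; again, none of these names are gem5
APIs, and the fault servicing and HtmCmd plumbing are stand-ins:

  class ToyController:
      # Stand-ins for the notifications sent via HtmCmd requests.
      def htm_start(self):  pass   # enter transactional state
      def htm_abort(self):  pass   # reset read/write sets, drop speculative lines
      def htm_commit(self): pass   # make speculative writes visible, exit txn state

  def run_transaction(regs, controller, body):
      checkpoint = dict(regs)          # architectural checkpoint (incl. PC)
      controller.htm_start()
      failed = body(regs, controller)  # loads/stores; True if any response
                                       # came back flagged as failed
      if failed:
          regs.clear()
          regs.update(checkpoint)      # (a) restore the checkpoint
          controller.htm_abort()       # (b) send the abort signal
          return False
      controller.htm_commit()
      return True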

KUDOS:
The code is based on a previous pull request by Pradip Vallathol who
developed HTM and TSX support in Gem5 as part of his master’s thesis:

http://reviews.gem5.org/r/2308/index.html

JIRA: https://gem5.atlassian.net/browse/GEM5-587

Change-Id: Icc328df93363486e923b8bd54f4d77741d8f5650
Signed-off-by: Giacomo Travaglini <giacomo.travaglini@arm.com>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/30319
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Jason Lowe-Power <power.jg@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
diff --git a/build_opts/ARM_MESI_Three_Level_HTM b/build_opts/ARM_MESI_Three_Level_HTM
new file mode 100644
index 0000000..fd7c164
--- /dev/null
+++ b/build_opts/ARM_MESI_Three_Level_HTM
@@ -0,0 +1,6 @@
+# Copyright (c) 2019 ARM Limited
+# All rights reserved.
+
+TARGET_ISA = 'arm'
+CPU_MODELS = 'TimingSimpleCPU,O3CPU'
+PROTOCOL = 'MESI_Three_Level_HTM'
diff --git a/configs/ruby/MESI_Three_Level_HTM.py b/configs/ruby/MESI_Three_Level_HTM.py
new file mode 100644
index 0000000..89ca93c
--- /dev/null
+++ b/configs/ruby/MESI_Three_Level_HTM.py
@@ -0,0 +1,337 @@
+# Copyright (c) 2006-2007 The Regents of The University of Michigan
+# Copyright (c) 2009,2015 Advanced Micro Devices, Inc.
+# Copyright (c) 2013 Mark D. Hill and David A. Wood
+# Copyright (c) 2020 ARM Limited
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import math
+import m5
+from m5.objects import *
+from m5.defines import buildEnv
+from .Ruby import create_topology, create_directories
+from .Ruby import send_evicts
+from common import FileSystemConfig
+
+#
+# Declare caches used by the protocol
+#
+class L0Cache(RubyCache): pass
+class L1Cache(RubyCache): pass
+class L2Cache(RubyCache): pass
+
+def define_options(parser):
+    parser.add_option("--num-clusters", type = "int", default = 1,
+            help = "number of clusters in a design in which there are shared\
+            caches private to clusters")
+    parser.add_option("--l0i_size", type="string", default="4096B")
+    parser.add_option("--l0d_size", type="string", default="4096B")
+    parser.add_option("--l0i_assoc", type="int", default=1)
+    parser.add_option("--l0d_assoc", type="int", default=1)
+    parser.add_option("--l0_transitions_per_cycle", type="int", default=32)
+    parser.add_option("--l1_transitions_per_cycle", type="int", default=32)
+    parser.add_option("--l2_transitions_per_cycle", type="int", default=4)
+    parser.add_option("--enable-prefetch", action="store_true", default=False,\
+                        help="Enable Ruby hardware prefetcher")
+    return
+
+def create_system(options, full_system, system, dma_ports, bootmem,
+                  ruby_system):
+
+    if buildEnv['PROTOCOL'] != 'MESI_Three_Level_HTM':
+        fatal("This script requires the MESI_Three_Level protocol to be\
+               built.")
+
+    cpu_sequencers = []
+
+    #
+    # The ruby network creation expects the list of nodes in the system to be
+    # consistent with the NetDest list.  Therefore the l1 controller nodes
+    # must be listed before the directory nodes and directory nodes before
+    # dma nodes, etc.
+    #
+    l0_cntrl_nodes = []
+    l1_cntrl_nodes = []
+    l2_cntrl_nodes = []
+    dma_cntrl_nodes = []
+
+    assert (options.num_cpus % options.num_clusters == 0)
+    num_cpus_per_cluster = options.num_cpus // options.num_clusters
+
+    assert (options.num_l2caches % options.num_clusters == 0)
+    num_l2caches_per_cluster = options.num_l2caches // options.num_clusters
+
+    l2_bits = int(math.log(num_l2caches_per_cluster, 2))
+    block_size_bits = int(math.log(options.cacheline_size, 2))
+    l2_index_start = block_size_bits + l2_bits
+
+    #
+    # Must create the individual controllers before the network to ensure the
+    # controller constructors are called before the network constructor
+    #
+    for i in range(options.num_clusters):
+        for j in range(num_cpus_per_cluster):
+            #
+            # First create the Ruby objects associated with this cpu
+            #
+            l0i_cache = L0Cache(size = options.l0i_size,
+                assoc = options.l0i_assoc,
+                is_icache = True,
+                start_index_bit = block_size_bits,
+                replacement_policy = LRURP())
+
+            l0d_cache = L0Cache(size = options.l0d_size,
+                assoc = options.l0d_assoc,
+                is_icache = False,
+                start_index_bit = block_size_bits,
+                replacement_policy = LRURP())
+
+            # the ruby random tester reuses num_cpus to specify the
+            # number of cpu ports connected to the tester object, which
+            # is stored in system.cpu. because there is only ever one
+            # tester object, num_cpus is not necessarily equal to the
+            # size of system.cpu; therefore if len(system.cpu) == 1
+            # we use system.cpu[0] to set the clk_domain, thereby ensuring
+            # we don't index off the end of the cpu list.
+            if len(system.cpu) == 1:
+                clk_domain = system.cpu[0].clk_domain
+            else:
+                clk_domain = system.cpu[i].clk_domain
+
+            # Ruby prefetcher
+            prefetcher = RubyPrefetcher(
+                num_streams=16,
+                unit_filter = 256,
+                nonunit_filter = 256,
+                train_misses = 5,
+                num_startup_pfs = 4,
+                cross_page = True
+            )
+
+            l0_cntrl = L0Cache_Controller(
+                   version = i * num_cpus_per_cluster + j,
+                   Icache = l0i_cache, Dcache = l0d_cache,
+                   transitions_per_cycle = options.l0_transitions_per_cycle,
+                   prefetcher = prefetcher,
+                   enable_prefetch = options.enable_prefetch,
+                   send_evictions = send_evicts(options),
+                   clk_domain = clk_domain,
+                   ruby_system = ruby_system)
+
+            cpu_seq = RubyHTMSequencer(version = i * num_cpus_per_cluster + j,
+                                       icache = l0i_cache,
+                                       clk_domain = clk_domain,
+                                       dcache = l0d_cache,
+                                       ruby_system = ruby_system)
+
+            l0_cntrl.sequencer = cpu_seq
+
+            l1_cache = L1Cache(size = options.l1d_size,
+                               assoc = options.l1d_assoc,
+                               start_index_bit = block_size_bits,
+                               is_icache = False)
+
+            l1_cntrl = L1Cache_Controller(
+                    version = i * num_cpus_per_cluster + j,
+                    cache = l1_cache, l2_select_num_bits = l2_bits,
+                    cluster_id = i,
+                    transitions_per_cycle = options.l1_transitions_per_cycle,
+                    ruby_system = ruby_system)
+
+            exec("ruby_system.l0_cntrl%d = l0_cntrl"
+                 % ( i * num_cpus_per_cluster + j))
+            exec("ruby_system.l1_cntrl%d = l1_cntrl"
+                 % ( i * num_cpus_per_cluster + j))
+
+            #
+            # Add controllers and sequencers to the appropriate lists
+            #
+            cpu_sequencers.append(cpu_seq)
+            l0_cntrl_nodes.append(l0_cntrl)
+            l1_cntrl_nodes.append(l1_cntrl)
+
+            # Connect the L0 and L1 controllers
+            l0_cntrl.prefetchQueue = MessageBuffer()
+            l0_cntrl.mandatoryQueue = MessageBuffer()
+            l0_cntrl.bufferToL1 = MessageBuffer(ordered = True)
+            l1_cntrl.bufferFromL0 = l0_cntrl.bufferToL1
+            l0_cntrl.bufferFromL1 = MessageBuffer(ordered = True)
+            l1_cntrl.bufferToL0 = l0_cntrl.bufferFromL1
+
+            # Connect the L1 controllers and the network
+            l1_cntrl.requestToL2 = MessageBuffer()
+            l1_cntrl.requestToL2.master = ruby_system.network.slave
+            l1_cntrl.responseToL2 = MessageBuffer()
+            l1_cntrl.responseToL2.master = ruby_system.network.slave
+            l1_cntrl.unblockToL2 = MessageBuffer()
+            l1_cntrl.unblockToL2.master = ruby_system.network.slave
+
+            l1_cntrl.requestFromL2 = MessageBuffer()
+            l1_cntrl.requestFromL2.slave = ruby_system.network.master
+            l1_cntrl.responseFromL2 = MessageBuffer()
+            l1_cntrl.responseFromL2.slave = ruby_system.network.master
+
+
+        for j in range(num_l2caches_per_cluster):
+            l2_cache = L2Cache(size = options.l2_size,
+                               assoc = options.l2_assoc,
+                               start_index_bit = l2_index_start)
+
+            l2_cntrl = L2Cache_Controller(
+                        version = i * num_l2caches_per_cluster + j,
+                        L2cache = l2_cache, cluster_id = i,
+                        transitions_per_cycle =\
+                         options.l2_transitions_per_cycle,
+                        ruby_system = ruby_system)
+
+            exec("ruby_system.l2_cntrl%d = l2_cntrl"
+                 % (i * num_l2caches_per_cluster + j))
+            l2_cntrl_nodes.append(l2_cntrl)
+
+            # Connect the L2 controllers and the network
+            l2_cntrl.DirRequestFromL2Cache = MessageBuffer()
+            l2_cntrl.DirRequestFromL2Cache.master = ruby_system.network.slave
+            l2_cntrl.L1RequestFromL2Cache = MessageBuffer()
+            l2_cntrl.L1RequestFromL2Cache.master = ruby_system.network.slave
+            l2_cntrl.responseFromL2Cache = MessageBuffer()
+            l2_cntrl.responseFromL2Cache.master = ruby_system.network.slave
+
+            l2_cntrl.unblockToL2Cache = MessageBuffer()
+            l2_cntrl.unblockToL2Cache.slave = ruby_system.network.master
+            l2_cntrl.L1RequestToL2Cache = MessageBuffer()
+            l2_cntrl.L1RequestToL2Cache.slave = ruby_system.network.master
+            l2_cntrl.responseToL2Cache = MessageBuffer()
+            l2_cntrl.responseToL2Cache.slave = ruby_system.network.master
+
+    # Run each of the ruby memory controllers at a ratio of the frequency of
+    # the ruby system
+    # clk_divider value is a fix to pass regression.
+    ruby_system.memctrl_clk_domain = DerivedClockDomain(
+            clk_domain = ruby_system.clk_domain, clk_divider = 3)
+
+    mem_dir_cntrl_nodes, rom_dir_cntrl_node = create_directories(
+        options, bootmem, ruby_system, system)
+    dir_cntrl_nodes = mem_dir_cntrl_nodes[:]
+    if rom_dir_cntrl_node is not None:
+        dir_cntrl_nodes.append(rom_dir_cntrl_node)
+    for dir_cntrl in dir_cntrl_nodes:
+        # Connect the directory controllers and the network
+        dir_cntrl.requestToDir = MessageBuffer()
+        dir_cntrl.requestToDir.slave = ruby_system.network.master
+        dir_cntrl.responseToDir = MessageBuffer()
+        dir_cntrl.responseToDir.slave = ruby_system.network.master
+        dir_cntrl.responseFromDir = MessageBuffer()
+        dir_cntrl.responseFromDir.master = ruby_system.network.slave
+        dir_cntrl.requestToMemory = MessageBuffer()
+        dir_cntrl.responseFromMemory = MessageBuffer()
+
+    for i, dma_port in enumerate(dma_ports):
+        #
+        # Create the Ruby objects associated with the dma controller
+        #
+        dma_seq = DMASequencer(version = i, ruby_system = ruby_system)
+
+        dma_cntrl = DMA_Controller(version = i,
+                                   dma_sequencer = dma_seq,
+                                   transitions_per_cycle = options.ports,
+                                   ruby_system = ruby_system)
+
+        exec("ruby_system.dma_cntrl%d = dma_cntrl" % i)
+        exec("ruby_system.dma_cntrl%d.dma_sequencer.slave = dma_port" % i)
+        dma_cntrl_nodes.append(dma_cntrl)
+
+        # Connect the dma controller to the network
+        dma_cntrl.mandatoryQueue = MessageBuffer()
+        dma_cntrl.responseFromDir = MessageBuffer(ordered = True)
+        dma_cntrl.responseFromDir.slave = ruby_system.network.master
+        dma_cntrl.requestToDir = MessageBuffer()
+        dma_cntrl.requestToDir.master = ruby_system.network.slave
+
+    all_cntrls = l0_cntrl_nodes + \
+                 l1_cntrl_nodes + \
+                 l2_cntrl_nodes + \
+                 dir_cntrl_nodes + \
+                 dma_cntrl_nodes
+
+    # Create the io controller and the sequencer
+    if full_system:
+        io_seq = DMASequencer(version=len(dma_ports), ruby_system=ruby_system)
+        ruby_system._io_port = io_seq
+        io_controller = DMA_Controller(version = len(dma_ports),
+                                       dma_sequencer = io_seq,
+                                       ruby_system = ruby_system)
+        ruby_system.io_controller = io_controller
+
+        # Connect the dma controller to the network
+        io_controller.mandatoryQueue = MessageBuffer()
+        io_controller.responseFromDir = MessageBuffer(ordered = True)
+        io_controller.responseFromDir.slave = ruby_system.network.master
+        io_controller.requestToDir = MessageBuffer()
+        io_controller.requestToDir.master = ruby_system.network.slave
+
+        all_cntrls = all_cntrls + [io_controller]
+    # Register configuration with filesystem
+    else:
+        for i in range(options.num_clusters):
+            for j in range(num_cpus_per_cluster):
+                FileSystemConfig.register_cpu(physical_package_id = 0,
+                                              core_siblings = range(options.num_cpus),
+                                              core_id = i*num_cpus_per_cluster+j,
+                                              thread_siblings = [])
+
+                FileSystemConfig.register_cache(level = 0,
+                                                idu_type = 'Instruction',
+                                                size = options.l0i_size,
+                                                line_size =\
+                                                 options.cacheline_size,
+                                                assoc = 1,
+                                                cpus = [i*num_cpus_per_cluster+j])
+                FileSystemConfig.register_cache(level = 0,
+                                                idu_type = 'Data',
+                                                size = options.l0d_size,
+                                                line_size =\
+                                                 options.cacheline_size,
+                                                assoc = 1,
+                                                cpus = [i*num_cpus_per_cluster+j])
+
+                FileSystemConfig.register_cache(level = 1,
+                                                idu_type = 'Unified',
+                                                size = options.l1d_size,
+                                                line_size = options.cacheline_size,
+                                                assoc = options.l1d_assoc,
+                                                cpus = [i*num_cpus_per_cluster+j])
+
+            FileSystemConfig.register_cache(level = 2,
+                                            idu_type = 'Unified',
+                                            size = str(MemorySize(options.l2_size) * \
+                                                   num_l2caches_per_cluster)+'B',
+                                            line_size = options.cacheline_size,
+                                            assoc = options.l2_assoc,
+                                            cpus = [n for n in range(i*num_cpus_per_cluster, \
+                                                                     (i+1)*num_cpus_per_cluster)])
+
+    ruby_system.network.number_of_virtual_networks = 3
+    topology = create_topology(all_cntrls, options)
+    return (cpu_sequencers, mem_dir_cntrl_nodes, topology)
diff --git a/src/mem/ruby/SConscript b/src/mem/ruby/SConscript
index fc90f8a..b31416d 100644
--- a/src/mem/ruby/SConscript
+++ b/src/mem/ruby/SConscript
@@ -138,4 +138,5 @@
 # <# include "mem/ruby/protocol/header.hh"> in any file
 # generated_dir = Dir('protocol')
 MakeInclude('system/GPUCoalescer.hh')
+MakeInclude('system/HTMSequencer.hh')
 MakeInclude('system/VIPERCoalescer.hh')
diff --git a/src/mem/ruby/protocol/MESI_Three_Level-L1cache.sm b/src/mem/ruby/protocol/MESI_Three_Level-L1cache.sm
index 0f5a7ac..7344ca1 100644
--- a/src/mem/ruby/protocol/MESI_Three_Level-L1cache.sm
+++ b/src/mem/ruby/protocol/MESI_Three_Level-L1cache.sm
@@ -130,6 +130,14 @@
     Ack_all,      desc="Last ack for processor";
 
     WB_Ack,        desc="Ack for replacement";
+
+    // hardware transactional memory
+    L0_DataCopy,     desc="Data Block from L0. Should remain in M state.";
+
+    // L0 cache received the invalidation message and has
+    // sent a NAK (because of htm abort) saying that the data
+    // in L1 is the latest value.
+    L0_DataNak,      desc="L0 received INV message, specifies its data is also stale";
   }
 
   // TYPES
@@ -361,6 +369,10 @@
 
         if(in_msg.Class == CoherenceClass:INV_DATA) {
             trigger(Event:L0_DataAck, in_msg.addr, cache_entry, tbe);
+        }  else if (in_msg.Class == CoherenceClass:NAK) {
+              trigger(Event:L0_DataNak, in_msg.addr, cache_entry, tbe);
+        }  else if (in_msg.Class == CoherenceClass:PUTX_COPY) {
+              trigger(Event:L0_DataCopy, in_msg.addr, cache_entry, tbe);
         }  else if (in_msg.Class == CoherenceClass:INV_ACK) {
             trigger(Event:L0_Ack, in_msg.addr, cache_entry, tbe);
         }  else {
@@ -808,18 +820,6 @@
     k_popL0RequestQueue;
   }
 
-  transition(EE, Load, E) {
-    hh_xdata_to_l0;
-    uu_profileHit;
-    k_popL0RequestQueue;
-  }
-
-  transition(MM, Load, M) {
-    hh_xdata_to_l0;
-    uu_profileHit;
-    k_popL0RequestQueue;
-  }
-
   transition({S,SS}, Store, SM) {
     i_allocateTBE;
     c_issueUPGRADE;
@@ -1034,7 +1034,7 @@
     kd_wakeUpDependents;
   }
 
-  transition(SM, L0_Invalidate_Else, SM_IL0) {
+  transition(SM, {Inv,L0_Invalidate_Else}, SM_IL0) {
     forward_eviction_to_L0_else;
   }
 
@@ -1093,4 +1093,55 @@
   transition({S_IL0, M_IL0, E_IL0, MM_IL0}, {Inv, Fwd_GETX, Fwd_GETS}) {
     z2_stallAndWaitL2Queue;
   }
+
+  // hardware transactional memory
+
+  // If a transaction has aborted, the L0 could re-request
+  // data which is in E or EE state in L1.
+  transition({EE,E}, Load, E) {
+    hh_xdata_to_l0;
+    uu_profileHit;
+    k_popL0RequestQueue;
+  }
+
+  // If a transaction has aborted, the L0 could re-request
+  // data which is in M or MM state in L1.
+  transition({MM,M}, Load, M) {
+    hh_xdata_to_l0;
+    uu_profileHit;
+    k_popL0RequestQueue;
+  }
+
+  // If a transaction has aborted, the L0 could re-request
+  // data which is in M state in L1.
+  transition({E,M}, Store, M) {
+    hh_xdata_to_l0;
+    uu_profileHit;
+    k_popL0RequestQueue;
+  }
+
+  // A transaction may have tried to modify a cache block in M state with
+  // non-speculative (pre-transactional) data. This needs to be copied
+  // to the L1 before any further modifications occur at the L0.
+  transition({M,E}, L0_DataCopy, M) {
+    u_writeDataFromL0Request;
+    k_popL0RequestQueue;
+  }
+
+  transition({M_IL0, E_IL0}, L0_DataCopy, M_IL0) {
+    u_writeDataFromL0Request;
+    k_popL0RequestQueue;
+  }
+
+  // A NAK from the L0 means that the L0 invalidated its
+  // modified line (due to an abort) so it is therefore necessary
+  // to use the L1's correct version instead
+  transition({M_IL0, E_IL0}, L0_DataNak, MM) {
+    k_popL0RequestQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(I, L1_Replacement) {
+    ff_deallocateCacheBlock;
+  }
 }
diff --git a/src/mem/ruby/protocol/MESI_Three_Level-msg.sm b/src/mem/ruby/protocol/MESI_Three_Level-msg.sm
index e738b8a..a16e374 100644
--- a/src/mem/ruby/protocol/MESI_Three_Level-msg.sm
+++ b/src/mem/ruby/protocol/MESI_Three_Level-msg.sm
@@ -48,6 +48,7 @@
   INV_OWN,   desc="Invalidate (own)";
   INV_ELSE,  desc="Invalidate (else)";
   PUTX,      desc="Replacement message";
+  PUTX_COPY, desc="Data block to be copied in L1. L0 will still be in M state";
 
   WB_ACK,    desc="Writeback ack";
 
@@ -59,6 +60,7 @@
   DATA, desc="Data block for L1 cache in S state";
   DATA_EXCLUSIVE, desc="Data block for L1 cache in M/E state";
   ACK, desc="Generic invalidate ack";
+  NAK, desc="Used by L0 to tell L1 that it cannot provide the latest value";
 
   // This is a special case in which the L1 cache lost permissions to the
   // shared block before it got the data. So the L0 cache can use the data
diff --git a/src/mem/ruby/protocol/MESI_Three_Level_HTM-L0cache.sm b/src/mem/ruby/protocol/MESI_Three_Level_HTM-L0cache.sm
new file mode 100644
index 0000000..a6e4faf
--- /dev/null
+++ b/src/mem/ruby/protocol/MESI_Three_Level_HTM-L0cache.sm
@@ -0,0 +1,1606 @@
+/*
+ * Copyright (c) 2020 ARM Limited
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder.  You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Copyright (c) 2013 Mark D. Hill and David A. Wood
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+machine(MachineType:L0Cache, "MESI Directory L0 Cache")
+ : HTMSequencer * sequencer;
+   CacheMemory * Icache;
+   CacheMemory * Dcache;
+   Cycles request_latency := 2;
+   Cycles response_latency := 2;
+   bool send_evictions;
+
+   RubyPrefetcher * prefetcher;
+   bool enable_prefetch := "False";
+
+   // From this node's L0 cache to the network
+   MessageBuffer * bufferToL1, network="To";
+
+   // To this node's L0 cache FROM the network
+   MessageBuffer * bufferFromL1, network="From";
+
+   // Message queue between this controller and the processor
+   MessageBuffer * mandatoryQueue;
+
+   // Request Buffer for prefetches
+   MessageBuffer * prefetchQueue;
+{
+  // hardware transactional memory
+  bool htmTransactionalState, default="false";
+  bool htmFailed, default="false";
+  int htmUid, default=0;
+  HtmFailedInCacheReason htmFailedRc, default=HtmFailedInCacheReason_NO_FAIL;
+
+  // STATES
+  state_declaration(State, desc="Cache states", default="L0Cache_State_I") {
+    // Base states
+
+    // The cache entry has not been allocated.
+    I, AccessPermission:Invalid, desc="Invalid";
+
+    // The cache entry is in shared mode. The processor can read this entry
+    // but it cannot write to it.
+    S, AccessPermission:Read_Only, desc="Shared";
+
+    // The cache entry is in exclusive mode. The processor can read this
+    // entry. It can write to this entry without informing the directory.
+    // On writing, the entry moves to M state.
+    E, AccessPermission:Read_Only, desc="Exclusive";
+
+    // The processor has read and write permissions on this entry.
+    M, AccessPermission:Read_Write, desc="Modified";
+
+    // Transient States
+
+    // The cache controller has requested an instruction.  It will be stored
+    // in the shared state so that the processor can read it.
+    Inst_IS, AccessPermission:Busy, desc="Issued GETS, have not seen response yet";
+
+    // The cache controller has requested that this entry be fetched in
+    // shared state so that the processor can read it.
+    IS, AccessPermission:Busy, desc="Issued GETS, have not seen response yet";
+
+    // The cache controller has requested that this entry be fetched in
+    // modify state so that the processor can read/write it.
+    IM, AccessPermission:Busy, desc="Issued GETX, have not seen response yet";
+
+    // The cache controller had read permission over the entry. But now the
+    // processor needs to write to it. So, the controller has requested for
+    // write permission.
+    SM, AccessPermission:Read_Only, desc="Issued GETX, have not seen response yet";
+
+    // Transient states in which block is being prefetched
+    PF_Inst_IS, AccessPermission:Busy, desc="Issued GETS, have not seen response yet";
+    PF_IS, AccessPermission:Busy, desc="Issued GETS, have not seen response yet";
+    PF_IE, AccessPermission:Busy, desc="Issued GETX, have not seen response yet";
+  }
+
+  // EVENTS
+  enumeration(Event, desc="Cache events") {
+    // Events from core
+    Load,            desc="Load request from the home processor";
+    Ifetch,          desc="I-fetch request from the home processor";
+    Store,           desc="Store request from the home processor";
+
+    // invalidations from L1 (due to self or other core)
+    InvOwn,          desc="Invalidate request from L1 (own)";
+    InvElse,         desc="Invalidate request from L1 (else)";
+
+    // internal generated request
+    L0_Replacement,  desc="L0 Replacement", format="!r";
+
+    // requests forwarded from other processors
+    Fwd_GETX,   desc="GETX from other processor";
+    Fwd_GETS,   desc="GETS from other processor";
+    Fwd_GET_INSTR,   desc="GET_INSTR from other processor";
+
+    // data arrives from L1 cache
+    Data,               desc="Data for processor";
+    Data_Exclusive,     desc="Data for processor";
+    Data_Stale,         desc="Data for processor, but not for storage";
+
+    Ack,        desc="Ack for processor";
+
+    WB_Ack,        desc="Ack for replacement";
+
+    Failed_SC,        desc="Store conditional request that will fail";
+
+    // Prefetch events (generated by prefetcher)
+    PF_L0_Replacement, desc="L0 Replacement caused by prefetcher", format="!pr";
+    PF_Load,         desc="Load request from prefetcher";
+    PF_Ifetch,       desc="Instruction fetch request from prefetcher";
+    PF_Store,        desc="Exclusive load request from prefetcher";
+    PF_Bad_Addr,     desc="Throw away prefetch request due to bad address generation";
+
+    // hardware transactional memory
+    HTM_Abort,        desc="Abort HTM transaction and rollback cache to pre-transactional state";
+    HTM_Start,        desc="Place cache in HTM transactional state";
+    HTM_Commit,       desc="Commit speculative loads/stores and place cache in normal state";
+    HTM_Cancel,       desc="Fail HTM transaction explicitly without aborting";
+    HTM_notifyCMD,    desc="Notify core via HTM CMD that HTM transaction has failed";
+    HTM_notifyLD,     desc="Notify core via LD that HTM transaction has failed";
+    HTM_notifyST,     desc="Notify core via ST that HTM transaction has failed";
+  }
+
+  // TYPES
+
+  // CacheEntry
+  structure(Entry, desc="...", interface="AbstractCacheEntry" ) {
+    State CacheState,        desc="cache state";
+    DataBlock DataBlk,       desc="data for the block";
+    bool Dirty, default="false",   desc="data is dirty";
+    bool isPrefetched, default="false", desc="Set if this block was prefetched";
+
+    // hardware transactional memory
+    // read/write set state
+    void setInHtmReadSet(bool), external="yes";
+    void setInHtmWriteSet(bool), external="yes";
+    bool getInHtmReadSet(), external="yes";
+    bool getInHtmWriteSet(), external="yes";
+
+    // override invalidateEntry
+    void invalidateEntry() {
+      CacheState := State:I;
+      Dirty := false;
+    }
+  }
+
+  // TBE fields
+  structure(TBE, desc="...") {
+    Addr addr,              desc="Physical address for this TBE";
+    State TBEState,        desc="Transient state";
+    DataBlock DataBlk,                desc="Buffer for the data block";
+    bool Dirty, default="false",   desc="data is dirty";
+    int pendingAcks, default="0", desc="number of pending acks";
+  }
+
+  structure(TBETable, external="yes") {
+    TBE lookup(Addr);
+    void allocate(Addr);
+    void deallocate(Addr);
+    bool isPresent(Addr);
+    TBE getNullEntry();
+  }
+
+  TBETable TBEs, template="<L0Cache_TBE>", constructor="m_number_of_TBEs";
+
+  Tick clockEdge();
+  Cycles ticksToCycles(Tick t);
+  void set_cache_entry(AbstractCacheEntry a);
+  void unset_cache_entry();
+  void set_tbe(TBE a);
+  void unset_tbe();
+  void wakeUpBuffers(Addr a);
+  void wakeUpAllBuffers(Addr a);
+  void profileMsgDelay(int virtualNetworkType, Cycles c);
+  MachineID mapAddressToMachine(Addr addr, MachineType mtype);
+
+  // inclusive cache returns L0 entries only
+  Entry getCacheEntry(Addr addr), return_by_pointer="yes" {
+    Entry Dcache_entry := static_cast(Entry, "pointer", Dcache[addr]);
+    if(is_valid(Dcache_entry)) {
+      return Dcache_entry;
+    }
+
+    Entry Icache_entry := static_cast(Entry, "pointer", Icache[addr]);
+    return Icache_entry;
+  }
+
+  Entry getDCacheEntry(Addr addr), return_by_pointer="yes" {
+    Entry Dcache_entry := static_cast(Entry, "pointer", Dcache[addr]);
+    return Dcache_entry;
+  }
+
+  Entry getICacheEntry(Addr addr), return_by_pointer="yes" {
+    Entry Icache_entry := static_cast(Entry, "pointer", Icache[addr]);
+    return Icache_entry;
+  }
+
+  State getState(TBE tbe, Entry cache_entry, Addr addr) {
+    assert((Dcache.isTagPresent(addr) && Icache.isTagPresent(addr)) == false);
+
+    if(is_valid(tbe)) {
+      return tbe.TBEState;
+    } else if (is_valid(cache_entry)) {
+      return cache_entry.CacheState;
+    }
+    return State:I;
+  }
+
+  void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
+    assert((Dcache.isTagPresent(addr) && Icache.isTagPresent(addr)) == false);
+
+    // MUST CHANGE
+    if(is_valid(tbe)) {
+      tbe.TBEState := state;
+    }
+
+    if (is_valid(cache_entry)) {
+      cache_entry.CacheState := state;
+    }
+  }
+
+  AccessPermission getAccessPermission(Addr addr) {
+    TBE tbe := TBEs[addr];
+    if(is_valid(tbe)) {
+      DPRINTF(RubySlicc, "%s\n", L0Cache_State_to_permission(tbe.TBEState));
+      return L0Cache_State_to_permission(tbe.TBEState);
+    }
+
+    Entry cache_entry := getCacheEntry(addr);
+    if(is_valid(cache_entry)) {
+      DPRINTF(RubySlicc, "%s\n", L0Cache_State_to_permission(cache_entry.CacheState));
+      return L0Cache_State_to_permission(cache_entry.CacheState);
+    }
+
+    DPRINTF(RubySlicc, "%s\n", AccessPermission:NotPresent);
+    return AccessPermission:NotPresent;
+  }
+
+  void functionalRead(Addr addr, Packet *pkt) {
+    TBE tbe := TBEs[addr];
+    if(is_valid(tbe)) {
+      testAndRead(addr, tbe.DataBlk, pkt);
+    } else {
+      testAndRead(addr, getCacheEntry(addr).DataBlk, pkt);
+    }
+  }
+
+  int functionalWrite(Addr addr, Packet *pkt) {
+    int num_functional_writes := 0;
+
+    TBE tbe := TBEs[addr];
+    if(is_valid(tbe)) {
+      num_functional_writes := num_functional_writes +
+        testAndWrite(addr, tbe.DataBlk, pkt);
+      return num_functional_writes;
+    }
+
+    num_functional_writes := num_functional_writes +
+        testAndWrite(addr, getCacheEntry(addr).DataBlk, pkt);
+    return num_functional_writes;
+  }
+
+  void setAccessPermission(Entry cache_entry, Addr addr, State state) {
+    if (is_valid(cache_entry)) {
+      cache_entry.changePermission(L0Cache_State_to_permission(state));
+    }
+  }
+
+  Event mandatory_request_type_to_event(RubyRequestType type) {
+    if (type == RubyRequestType:LD) {
+      return Event:Load;
+    } else if (type == RubyRequestType:IFETCH) {
+      return Event:Ifetch;
+    } else if ((type == RubyRequestType:ST) || (type == RubyRequestType:ATOMIC)
+               || (type == RubyRequestType:Store_Conditional)) {
+      return Event:Store;
+    } else {
+      error("Invalid RubyRequestType");
+    }
+  }
+
+  Event prefetch_request_type_to_event(RubyRequestType type) {
+    if (type == RubyRequestType:LD) {
+      return Event:PF_Load;
+    } else if (type == RubyRequestType:IFETCH) {
+      return Event:PF_Ifetch;
+    } else if (type == RubyRequestType:ST) {
+      return Event:PF_Store;
+    } else {
+      error("Invalid RubyRequestType");
+    }
+  }
+
+  int getPendingAcks(TBE tbe) {
+    return tbe.pendingAcks;
+  }
+
+  out_port(requestNetwork_out, CoherenceMsg, bufferToL1);
+  out_port(optionalQueue_out, RubyRequest, prefetchQueue);
+
+  void enqueuePrefetch(Addr address, RubyRequestType type) {
+    enqueue(optionalQueue_out, RubyRequest, 1) {
+      out_msg.LineAddress := address;
+      out_msg.Type := type;
+      out_msg.Prefetch := PrefetchBit:Yes;
+      out_msg.AccessMode := RubyAccessMode:Supervisor;
+    }
+  }
+
+  // Prefetch queue between the controller and the prefetcher
+  // As per Spracklen et al. (HPCA 2005), the prefetch queue should be
+  // implemented as a LIFO structure.  The structure would allow for fast
+  // searches of all entries in the queue, not just the head msg. All
+  // msgs in the structure can be invalidated if a demand miss matches.
+  in_port(optionalQueue_in, RubyRequest, prefetchQueue, desc="...", rank = 2) {
+    if (optionalQueue_in.isReady(clockEdge())) {
+      peek(optionalQueue_in, RubyRequest) {
+        // first check for valid address
+        MachineID mid := mapAddressToMachine(in_msg.LineAddress, MachineType:Directory);
+        NodeID nid := machineIDToNodeID(mid);
+        int nidint := IDToInt(nid);
+        int numDirs := machineCount(MachineType:Directory);
+        if (nidint >= numDirs) {
+          Entry cache_entry := static_cast(Entry, "pointer", Dcache.getNullEntry());
+          TBE tbe := TBEs.getNullEntry();
+          trigger(Event:PF_Bad_Addr, in_msg.LineAddress, cache_entry, tbe);
+        } else if (in_msg.Type == RubyRequestType:IFETCH) {
+          // Instruction Prefetch
+          Entry icache_entry := getICacheEntry(in_msg.LineAddress);
+          if (is_valid(icache_entry)) {
+            // The block to be prefetched is already present in the
+            // cache. This request will be made benign and cause the
+            // prefetch queue to be popped.
+            trigger(prefetch_request_type_to_event(in_msg.Type),
+                    in_msg.LineAddress,
+                    icache_entry, TBEs[in_msg.LineAddress]);
+          }
+
+          // Check to see if it is in the L0-D
+          Entry cache_entry := getDCacheEntry(in_msg.LineAddress);
+          if (is_valid(cache_entry)) {
+            // The block is in the wrong L0 cache. We should drop
+            // this request.
+            trigger(prefetch_request_type_to_event(in_msg.Type),
+                    in_msg.LineAddress,
+                    cache_entry, TBEs[in_msg.LineAddress]);
+          }
+
+          if (Icache.cacheAvail(in_msg.LineAddress)) {
+            // L0-I doesn't have the line, but we have space for it
+            // in the L0-I, so let's see if the L1 has it
+            trigger(prefetch_request_type_to_event(in_msg.Type),
+                    in_msg.LineAddress,
+                    icache_entry, TBEs[in_msg.LineAddress]);
+          } else {
+            // No room in the L0-I, so we need to make room in the L0-I
+            Addr addr := Icache.cacheProbe(in_msg.LineAddress);
+            check_on_cache_probe(optionalQueue_in, addr);
+
+            trigger(Event:PF_L0_Replacement, addr,
+                    getICacheEntry(addr),
+                    TBEs[addr]);
+          }
+        } else {
+          // Data prefetch
+          Entry cache_entry := getDCacheEntry(in_msg.LineAddress);
+          if (is_valid(cache_entry)) {
+            // The block to be prefetched is already present in the
+            // cache. This request will be made benign and cause the
+            // prefetch queue to be popped.
+            trigger(prefetch_request_type_to_event(in_msg.Type),
+                    in_msg.LineAddress,
+                    cache_entry, TBEs[in_msg.LineAddress]);
+          }
+
+          // Check to see if it is in the L0-I
+          Entry icache_entry := getICacheEntry(in_msg.LineAddress);
+          if (is_valid(icache_entry)) {
+            // The block is in the wrong L0. Just drop the prefetch
+            // request.
+            trigger(prefetch_request_type_to_event(in_msg.Type),
+                    in_msg.LineAddress,
+                    icache_entry, TBEs[in_msg.LineAddress]);
+          }
+
+          if (Dcache.cacheAvail(in_msg.LineAddress)) {
+            // L0-D doesn't have the line, but we have space for it in
+            // the L0-D, so let's see if the L1 has it
+            trigger(prefetch_request_type_to_event(in_msg.Type),
+                    in_msg.LineAddress,
+                    cache_entry, TBEs[in_msg.LineAddress]);
+          } else {
+            // No room in the L0-D, so we need to make room in the L0-D
+            Addr addr := Dcache.cacheProbe(in_msg.LineAddress);
+            check_on_cache_probe(optionalQueue_in, addr);
+
+            trigger(Event:PF_L0_Replacement, addr,
+                    getDCacheEntry(addr),
+                    TBEs[addr]);
+          }
+        }
+      }
+    }
+  }
+
+  // Messages for this L0 cache from the L1 cache
+  in_port(messgeBuffer_in, CoherenceMsg, bufferFromL1, rank = 1) {
+    if (messgeBuffer_in.isReady(clockEdge())) {
+      peek(messgeBuffer_in, CoherenceMsg, block_on="addr") {
+        assert(in_msg.Dest == machineID);
+
+        Entry cache_entry := getCacheEntry(in_msg.addr);
+        TBE tbe := TBEs[in_msg.addr];
+
+        if(in_msg.Class == CoherenceClass:DATA_EXCLUSIVE) {
+            trigger(Event:Data_Exclusive, in_msg.addr, cache_entry, tbe);
+        } else if(in_msg.Class == CoherenceClass:DATA) {
+            trigger(Event:Data, in_msg.addr, cache_entry, tbe);
+        } else if(in_msg.Class == CoherenceClass:STALE_DATA) {
+            trigger(Event:Data_Stale, in_msg.addr, cache_entry, tbe);
+        } else if (in_msg.Class == CoherenceClass:ACK) {
+            trigger(Event:Ack, in_msg.addr, cache_entry, tbe);
+        } else if (in_msg.Class == CoherenceClass:WB_ACK) {
+            trigger(Event:WB_Ack, in_msg.addr, cache_entry, tbe);
+        } else if (in_msg.Class == CoherenceClass:INV_OWN) {
+          trigger(Event:InvOwn, in_msg.addr, cache_entry, tbe);
+        } else if (in_msg.Class == CoherenceClass:INV_ELSE) {
+          trigger(Event:InvElse, in_msg.addr, cache_entry, tbe);
+        } else if (in_msg.Class == CoherenceClass:GETX ||
+                   in_msg.Class == CoherenceClass:UPGRADE) {
+          // upgrade transforms to GETX due to race
+          trigger(Event:Fwd_GETX, in_msg.addr, cache_entry, tbe);
+        } else if (in_msg.Class == CoherenceClass:GETS) {
+          trigger(Event:Fwd_GETS, in_msg.addr, cache_entry, tbe);
+        } else if (in_msg.Class == CoherenceClass:GET_INSTR) {
+          trigger(Event:Fwd_GET_INSTR, in_msg.addr, cache_entry, tbe);
+        } else {
+          error("Invalid forwarded request type");
+        }
+      }
+    }
+  }
+
+  // Mandatory queue between the node's CPU and its L0 caches
+  in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...", rank = 0) {
+    if (mandatoryQueue_in.isReady(clockEdge())) {
+      peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") {
+
+        // hardware transactional memory support begins here
+
+        // If this cache controller is in a transactional state/mode,
+        // ensure that its failure status is something recognisable.
+        if (htmFailed) {
+          assert(htmFailedRc == HtmFailedInCacheReason:FAIL_SELF ||
+            htmFailedRc == HtmFailedInCacheReason:FAIL_REMOTE ||
+            htmFailedRc == HtmFailedInCacheReason:FAIL_OTHER);
+        }
+
+        // HTM_Start commands set a new htmUid
+        // This is used for debugging and sanity checks
+        if (in_msg.Type == RubyRequestType:HTM_Start) {
+            assert (htmUid != in_msg.htmTransactionUid);
+            htmUid := in_msg.htmTransactionUid;
+        }
+
+        // If the incoming memory request was generated within a transaction,
+        // ensure that the request's htmUid matches the htmUid of this
+        // cache controller. A mismatch here is fatal and implies there was
+        // a reordering that should never have taken place.
+        if (in_msg.htmFromTransaction &&
+            (htmUid != in_msg.htmTransactionUid)) {
+          DPRINTF(HtmMem,
+            "mandatoryQueue_in: (%u) 0x%lx mismatch between cache htmUid=%u and message htmUid=%u\n",
+            in_msg.Type, in_msg.LineAddress, htmUid, in_msg.htmTransactionUid);
+        }
+
+        // special/rare case which hopefully won't occur
+        if (htmFailed && in_msg.Type == RubyRequestType:HTM_Start) {
+          error("cannot handle this special HTM case yet");
+        }
+
+        // The transaction is to be aborted--
+        // Aborting a transaction returns the cache to a non-transactional
+        // state/mode, resets the read/write sets, and invalidates any
+        // speculatively written lines.
+        if (in_msg.Type == RubyRequestType:HTM_Abort) {
+          Entry cache_entry := static_cast(Entry, "pointer", Dcache.getNullEntry());
+          TBE tbe := TBEs.getNullEntry();
+          trigger(Event:HTM_Abort, in_msg.LineAddress, cache_entry, tbe);
+        }
+        // The transaction has failed but not yet aborted--
+        // case 1:
+        // If memory request is transactional but the transaction has failed,
+        // it is necessary to inform the CPU of the failure.
+        // case 2:
+        // If load/store memory request is transactional and cache is not
+        // in transactional state, it's likely that the transaction aborted
+        // and Ruby is still receiving scheduled memory operations.
+        // The solution is to make these requests benign.
+        else if ((in_msg.htmFromTransaction && htmFailed) || (in_msg.htmFromTransaction && !isHtmCmdRequest(in_msg.Type) && !htmTransactionalState)) {
+          if (isHtmCmdRequest(in_msg.Type)) {
+            Entry cache_entry := static_cast(Entry, "pointer", Dcache.getNullEntry());
+            TBE tbe := TBEs.getNullEntry();
+            trigger(Event:HTM_notifyCMD, in_msg.LineAddress, cache_entry, tbe);
+          } else if (isDataReadRequest(in_msg.Type)) {
+            Entry cache_entry := getDCacheEntry(in_msg.LineAddress);
+            TBE tbe := TBEs[in_msg.LineAddress];
+            trigger(Event:HTM_notifyLD, in_msg.LineAddress, cache_entry, tbe);
+          } else if (isWriteRequest(in_msg.Type)) {
+            Entry cache_entry := getDCacheEntry(in_msg.LineAddress);
+            TBE tbe := TBEs[in_msg.LineAddress];
+            trigger(Event:HTM_notifyST, in_msg.LineAddress, cache_entry, tbe);
+          } else {
+            error("unknown message type");
+          }
+        }
+        // The transaction has not failed and this is
+        // one of three HTM commands--
+        // (1) start a transaction
+        // (2) commit a transaction
+        // (3) cancel/fail a transaction (but don't yet abort it)
+        else if (isHtmCmdRequest(in_msg.Type) && in_msg.Type != RubyRequestType:HTM_Abort) {
+          Entry cache_entry := static_cast(Entry, "pointer", Dcache.getNullEntry());
+          TBE tbe := TBEs.getNullEntry();
+          if (in_msg.Type == RubyRequestType:HTM_Start) {
+            DPRINTF(HtmMem,
+              "mandatoryQueue_in: Starting htm transaction htmUid=%u\n",
+              htmUid);
+            trigger(Event:HTM_Start, in_msg.LineAddress, cache_entry, tbe);
+          } else if (in_msg.Type == RubyRequestType:HTM_Commit) {
+            DPRINTF(HtmMem,
+              "mandatoryQueue_in: Committing transaction htmUid=%d\n",
+              htmUid);
+            trigger(Event:HTM_Commit, in_msg.LineAddress, cache_entry, tbe);
+          } else if (in_msg.Type == RubyRequestType:HTM_Cancel) {
+            DPRINTF(HtmMem,
+              "mandatoryQueue_in: Cancelling transaction htmUid=%d\n",
+              htmUid);
+            trigger(Event:HTM_Cancel, in_msg.LineAddress, cache_entry, tbe);
+          }
+        }
+        // end: hardware transactional memory
+        else if (in_msg.Type == RubyRequestType:IFETCH) {
+          // Check for data access to blocks in I-cache and ifetches to blocks in D-cache
+          // ** INSTRUCTION ACCESS ***
+
+          Entry Icache_entry := getICacheEntry(in_msg.LineAddress);
+          if (is_valid(Icache_entry)) {
+            // The tag matches for the L0, so the L0 asks the L2 for it.
+            trigger(mandatory_request_type_to_event(in_msg.Type), in_msg.LineAddress,
+                    Icache_entry, TBEs[in_msg.LineAddress]);
+          } else {
+
+            // Check to see if it is in the OTHER L0
+            Entry Dcache_entry := getDCacheEntry(in_msg.LineAddress);
+            if (is_valid(Dcache_entry)) {
+              // The block is in the wrong L0, put the request on the queue to the shared L2
+              trigger(Event:L0_Replacement, in_msg.LineAddress,
+                      Dcache_entry, TBEs[in_msg.LineAddress]);
+            }
+
+            if (Icache.cacheAvail(in_msg.LineAddress)) {
+              // L0 doesn't have the line, but we have space for it
+              // in the L0 so let's see if the L2 has it
+              trigger(mandatory_request_type_to_event(in_msg.Type), in_msg.LineAddress,
+                      Icache_entry, TBEs[in_msg.LineAddress]);
+            } else {
+              // No room in the L0, so we need to make room in the L0
+              // Check if the line we want to evict is not locked
+              Addr addr := Icache.cacheProbe(in_msg.LineAddress);
+              check_on_cache_probe(mandatoryQueue_in, addr);
+              trigger(Event:L0_Replacement, addr,
+                      getICacheEntry(addr),
+                      TBEs[addr]);
+            }
+          }
+        } else {
+          // *** DATA ACCESS ***
+          Entry Dcache_entry := getDCacheEntry(in_msg.LineAddress);
+
+          // early out for failed store conditionals
+
+          if (in_msg.Type == RubyRequestType:Store_Conditional) {
+              if (!sequencer.llscCheckMonitor(in_msg.LineAddress)) {
+                trigger(Event:Failed_SC, in_msg.LineAddress,
+                        Dcache_entry, TBEs[in_msg.LineAddress]);
+            }
+          }
+
+          if (is_valid(Dcache_entry)) {
+            // The tag matches for the L0, so the L0 asks the L1 for it
+            trigger(mandatory_request_type_to_event(in_msg.Type), in_msg.LineAddress,
+                    Dcache_entry, TBEs[in_msg.LineAddress]);
+          } else {
+            // if the request is not valid, the store conditional will fail
+            if (in_msg.Type == RubyRequestType:Store_Conditional) {
+                // if the line is not valid, it can't be locked
+                trigger(Event:Failed_SC, in_msg.LineAddress,
+                        Dcache_entry, TBEs[in_msg.LineAddress]);
+            } else {
+              // Check to see if it is in the OTHER L0
+              Entry Icache_entry := getICacheEntry(in_msg.LineAddress);
+              if (is_valid(Icache_entry)) {
+                // The block is in the wrong L0, put the request on the queue to the private L1
+                trigger(Event:L0_Replacement, in_msg.LineAddress,
+                        Icache_entry, TBEs[in_msg.LineAddress]);
+              }
+
+              if (Dcache.cacheAvail(in_msg.LineAddress)) {
+                // L0 doesn't have the line, but we have space for it
+                // in the L0, so let's see if the L1 has it
+                trigger(mandatory_request_type_to_event(in_msg.Type), in_msg.LineAddress,
+                        Dcache_entry, TBEs[in_msg.LineAddress]);
+              } else {
+                // No room in the L0, so we need to make room in the L0
+                // Check if the line we want to evict is not locked
+                Addr addr := Dcache.cacheProbe(in_msg.LineAddress);
+                check_on_cache_probe(mandatoryQueue_in, addr);
+                trigger(Event:L0_Replacement, addr,
+                        getDCacheEntry(addr),
+                        TBEs[addr]);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // ACTIONS
+  action(a_issueGETS, "a", desc="Issue GETS") {
+    peek(mandatoryQueue_in, RubyRequest) {
+      enqueue(requestNetwork_out, CoherenceMsg, request_latency) {
+        out_msg.addr := address;
+        out_msg.Class := CoherenceClass:GETS;
+        out_msg.Sender := machineID;
+        out_msg.Dest := createMachineID(MachineType:L1Cache, version);
+        DPRINTF(RubySlicc, "address: %#x, destination: %s\n",
+                address, out_msg.Dest);
+        out_msg.MessageSize := MessageSizeType:Control;
+        out_msg.AccessMode := in_msg.AccessMode;
+      }
+    }
+  }
+
+  action(b_issueGETX, "b", desc="Issue GETX") {
+    peek(mandatoryQueue_in, RubyRequest) {
+      enqueue(requestNetwork_out, CoherenceMsg, request_latency) {
+        out_msg.addr := address;
+        out_msg.Class := CoherenceClass:GETX;
+        out_msg.Sender := machineID;
+        DPRINTF(RubySlicc, "%s\n", machineID);
+        out_msg.Dest := createMachineID(MachineType:L1Cache, version);
+
+        DPRINTF(RubySlicc, "address: %#x, destination: %s\n",
+                address, out_msg.Dest);
+        out_msg.MessageSize := MessageSizeType:Control;
+        out_msg.AccessMode := in_msg.AccessMode;
+      }
+    }
+  }
+
+  action(c_issueUPGRADE, "c", desc="Issue GETX") {
+    peek(mandatoryQueue_in, RubyRequest) {
+      enqueue(requestNetwork_out, CoherenceMsg, request_latency) {
+        out_msg.addr := address;
+        out_msg.Class := CoherenceClass:UPGRADE;
+        out_msg.Sender := machineID;
+        out_msg.Dest := createMachineID(MachineType:L1Cache, version);
+
+        DPRINTF(RubySlicc, "address: %#x, destination: %s\n",
+                address, out_msg.Dest);
+        out_msg.MessageSize := MessageSizeType:Control;
+        out_msg.AccessMode := in_msg.AccessMode;
+      }
+    }
+  }
+
+  action(f_sendDataToL1, "f", desc="Send data to the L1 cache") {
+    // hardware transactional memory
+    // Cannot write speculative data to L1 cache
+    if (cache_entry.getInHtmWriteSet()) {
+      // If in HTM write set then send NAK to L1
+      enqueue(requestNetwork_out, CoherenceMsg, response_latency) {
+        assert(is_valid(cache_entry));
+        out_msg.addr := address;
+        out_msg.Class := CoherenceClass:NAK;
+        out_msg.Sender := machineID;
+        out_msg.Dest := createMachineID(MachineType:L1Cache, version);
+        out_msg.MessageSize := MessageSizeType:Response_Control;
+      }
+    } else {
+      enqueue(requestNetwork_out, CoherenceMsg, response_latency) {
+        assert(is_valid(cache_entry));
+        out_msg.addr := address;
+        out_msg.Class := CoherenceClass:INV_DATA;
+        out_msg.DataBlk := cache_entry.DataBlk;
+        out_msg.Dirty := cache_entry.Dirty;
+        out_msg.Sender := machineID;
+        out_msg.Dest := createMachineID(MachineType:L1Cache, version);
+        out_msg.MessageSize := MessageSizeType:Writeback_Data;
+      }
+      cache_entry.Dirty := false;
+    }
+  }
+
+  action(fi_sendInvAck, "fi", desc="Send data to the L1 cache") {
+    peek(messgeBuffer_in, CoherenceMsg) {
+      enqueue(requestNetwork_out, CoherenceMsg, response_latency) {
+        out_msg.addr := address;
+        out_msg.Class := CoherenceClass:INV_ACK;
+        out_msg.Sender := machineID;
+        out_msg.Dest := createMachineID(MachineType:L1Cache, version);
+        out_msg.MessageSize := MessageSizeType:Response_Control;
+      }
+    }
+  }
+
+  action(forward_eviction_to_cpu, "\cc", desc="Send eviction information to the processor") {
+    if (send_evictions) {
+      DPRINTF(RubySlicc, "Sending invalidation for %#x to the CPU\n", address);
+      sequencer.evictionCallback(address);
+    }
+  }
+
+  action(g_issuePUTE, "\ge", desc="Relinquish line to the L1 cache") {
+    enqueue(requestNetwork_out, CoherenceMsg, response_latency) {
+      assert(is_valid(cache_entry));
+      out_msg.addr := address;
+      out_msg.Class := CoherenceClass:PUTX;
+      out_msg.Dirty := cache_entry.Dirty;
+      out_msg.Sender:= machineID;
+      out_msg.Dest := createMachineID(MachineType:L1Cache, version);
+      out_msg.MessageSize := MessageSizeType:Writeback_Control;
+    }
+  }
+
+  action(g_issuePUTM, "\gm", desc="Send modified line to the L1 cache") {
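+    // a line in the HTM write set holds speculative data and is not
+    // written back; the L0_Replacement transition also fails the
+    // transaction (hfts_htmFailTransactionSize) in that case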
+    if (!cache_entry.getInHtmWriteSet()) {
+      enqueue(requestNetwork_out, CoherenceMsg, response_latency) {
+        assert(is_valid(cache_entry));
+        out_msg.addr := address;
+        out_msg.Class := CoherenceClass:PUTX;
+        out_msg.Dirty := cache_entry.Dirty;
+        out_msg.Sender:= machineID;
+        out_msg.Dest := createMachineID(MachineType:L1Cache, version);
+        out_msg.MessageSize := MessageSizeType:Writeback_Data;
+        out_msg.DataBlk := cache_entry.DataBlk;
+      }
+    }
+  }
+
+  action(h_load_hit, "hd", desc="Notify sequencer the load completed (cache hit)") {
+    assert(is_valid(cache_entry));
+    DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+    Dcache.setMRU(cache_entry);
+    sequencer.readCallback(address, cache_entry.DataBlk);
+  }
+
+  action(h_ifetch_hit, "hi", desc="Notify sequencer the ifetch completed (cache hit)") {
+    assert(is_valid(cache_entry));
+    DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+    Icache.setMRU(cache_entry);
+    sequencer.readCallback(address, cache_entry.DataBlk);
+  }
+
+  // The action name uses a counterintuitive _hit prefix when it is only
+  // called due to a cache miss. It is technically now a hit after having
+  // serviced the miss.
+  action(hx_load_hit, "hxd", desc="Notify sequencer the load completed (cache miss)") {
+    assert(is_valid(cache_entry));
+    DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+    Dcache.setMRU(cache_entry);
+    sequencer.readCallback(address, cache_entry.DataBlk, true);
+  }
+
+  // The action name uses a counterintuitive _hit prefix when it is only
+  // called due to a cache miss. It is technically now a hit after having
+  // serviced the miss.
+  action(hx_ifetch_hit, "hxi", desc="Notify sequencer the ifetch completed (cache miss)") {
+    assert(is_valid(cache_entry));
+    DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+    Icache.setMRU(cache_entry);
+    sequencer.readCallback(address, cache_entry.DataBlk, true);
+  }
+
+  action(hh_store_hit, "\h", desc="Notify sequencer that store completed (cache hit)") {
+    assert(is_valid(cache_entry));
+    DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+    Dcache.setMRU(cache_entry);
+    sequencer.writeCallback(address, cache_entry.DataBlk);
+    cache_entry.Dirty := true;
+  }
+
+  // The action name uses a counterintuitive _hit prefix when it is only
+  // called due to a cache miss. It is technically now a hit after having
+  // serviced the miss.
+  action(hhx_store_hit, "\hx", desc="Notify sequencer that store completed (cache miss)") {
+    assert(is_valid(cache_entry));
+    DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+    Dcache.setMRU(cache_entry);
+    sequencer.writeCallback(address, cache_entry.DataBlk, true);
+    cache_entry.Dirty := true;
+  }
+
+  action(i_allocateTBE, "i", desc="Allocate TBE (number of invalidates=0)") {
+    check_allocate(TBEs);
+    assert(is_valid(cache_entry));
+    TBEs.allocate(address);
+    set_tbe(TBEs[address]);
+    tbe.Dirty := cache_entry.Dirty;
+    tbe.DataBlk := cache_entry.DataBlk;
+  }
+
+  action(k_popMandatoryQueue, "k", desc="Pop mandatory queue") {
+    mandatoryQueue_in.dequeue(clockEdge());
+  }
+
+  action(l_popRequestQueue, "l",
+         desc="Pop incoming request queue and profile the delay within this virtual network") {
+    Tick delay := messgeBuffer_in.dequeue(clockEdge());
+    profileMsgDelay(2, ticksToCycles(delay));
+  }
+
+  action(o_popIncomingResponseQueue, "o",
+         desc="Pop Incoming Response queue and profile the delay within this virtual network") {
+    Tick delay := messgeBuffer_in.dequeue(clockEdge());
+    profileMsgDelay(1, ticksToCycles(delay));
+  }
+
+  action(s_deallocateTBE, "s", desc="Deallocate TBE") {
+    TBEs.deallocate(address);
+    unset_tbe();
+  }
+
+  action(u_writeDataToCache, "u", desc="Write data to cache") {
+    peek(messgeBuffer_in, CoherenceMsg) {
+      assert(is_valid(cache_entry));
+      cache_entry.DataBlk := in_msg.DataBlk;
+    }
+  }
+
+  action(u_writeInstToCache, "ui", desc="Write instruction data to cache") {
+    peek(messgeBuffer_in, CoherenceMsg) {
+      assert(is_valid(cache_entry));
+      cache_entry.DataBlk := in_msg.DataBlk;
+    }
+  }
+
+  action(ff_deallocateCacheBlock, "\f",
+         desc="Deallocate L0 cache block.") {
+    if (Dcache.isTagPresent(address)) {
+      Dcache.deallocate(address);
+    } else {
+      Icache.deallocate(address);
+    }
+    unset_cache_entry();
+  }
+
+  action(oo_allocateDCacheBlock, "\o", desc="Set L0 D-cache tag equal to tag of block B") {
+    if (is_invalid(cache_entry)) {
+      set_cache_entry(Dcache.allocate(address, new Entry));
+    }
+  }
+
+  action(pp_allocateICacheBlock, "\p", desc="Set L0 I-cache tag equal to tag of block B") {
+    if (is_invalid(cache_entry)) {
+      set_cache_entry(Icache.allocate(address, new Entry));
+    }
+  }
+
+  action(z_stallAndWaitMandatoryQueue, "\z", desc="Stall cpu request queue") {
+    stall_and_wait(mandatoryQueue_in, address);
+  }
+
+  action(kd_wakeUpDependents, "kd", desc="Wake-up dependents") {
+    wakeUpAllBuffers(address);
+  }
+
+  action(uu_profileInstMiss, "\ui", desc="Profile the demand miss") {
+    ++Icache.demand_misses;
+  }
+
+  action(uu_profileInstHit, "\uih", desc="Profile the demand hit") {
+    ++Icache.demand_hits;
+  }
+
+  action(uu_profileDataMiss, "\ud", desc="Profile the demand miss") {
+    ++Dcache.demand_misses;
+  }
+
+  action(uu_profileDataHit, "\udh", desc="Profile the demand hit") {
+    ++Dcache.demand_hits;
+  }
+
+  // store conditionals
+
+  action(hhc_storec_fail, "\hc",
+         desc="Notify sequencer that store conditional failed") {
+    sequencer.writeCallbackScFail(address, cache_entry.DataBlk);
+  }
+
+  // prefetching
+
+  action(pa_issuePfGETS, "pa", desc="Issue prefetch GETS") {
+    peek(optionalQueue_in, RubyRequest) {
+      enqueue(requestNetwork_out, CoherenceMsg, request_latency) {
+        out_msg.addr := address;
+        out_msg.Class := CoherenceClass:GETS;
+        out_msg.Sender := machineID;
+        out_msg.Dest := createMachineID(MachineType:L1Cache, version);
+        DPRINTF(RubySlicc, "address: %#x, destination: %s\n",
+                address, out_msg.Dest);
+        out_msg.MessageSize := MessageSizeType:Control;
+        out_msg.Prefetch := in_msg.Prefetch;
+        out_msg.AccessMode := in_msg.AccessMode;
+      }
+    }
+  }
+
+  action(pb_issuePfGETX, "pb", desc="Issue prefetch GETX") {
+    peek(optionalQueue_in, RubyRequest) {
+      enqueue(requestNetwork_out, CoherenceMsg, request_latency) {
+        out_msg.addr := address;
+        out_msg.Class := CoherenceClass:GETX;
+        out_msg.Sender := machineID;
+        DPRINTF(RubySlicc, "%s\n", machineID);
+        out_msg.Dest := createMachineID(MachineType:L1Cache, version);
+
+        DPRINTF(RubySlicc, "address: %#x, destination: %s\n",
+                address, out_msg.Dest);
+        out_msg.MessageSize := MessageSizeType:Control;
+        out_msg.Prefetch := in_msg.Prefetch;
+        out_msg.AccessMode := in_msg.AccessMode;
+      }
+    }
+  }
+
+  action(pq_popPrefetchQueue, "\pq", desc="Pop the prefetch request queue") {
+    optionalQueue_in.dequeue(clockEdge());
+  }
+
+  action(mp_markPrefetched, "mp", desc="Mark cache entry as prefetched") {
+    assert(is_valid(cache_entry));
+    cache_entry.isPrefetched := true;
+  }
+
+  action(po_observeMiss, "\po", desc="Inform the prefetcher about a cache miss") {
+    peek(mandatoryQueue_in, RubyRequest) {
+      if (enable_prefetch) {
+        prefetcher.observeMiss(in_msg.LineAddress, in_msg.Type);
+      }
+    }
+  }
+
+  action(ppm_observePfMiss, "\ppm",
+         desc="Inform the prefetcher about a cache miss with in-flight prefetch") {
+    peek(mandatoryQueue_in, RubyRequest) {
+      prefetcher.observePfMiss(in_msg.LineAddress);
+    }
+  }
+
+  action(pph_observePfHit, "\pph",
+         desc="Inform the prefetcher if a cache hit was the result of a prefetch") {
+    peek(mandatoryQueue_in, RubyRequest) {
+      if (cache_entry.isPrefetched) {
+        prefetcher.observePfHit(in_msg.LineAddress);
+        cache_entry.isPrefetched := false;
+      }
+    }
+  }
+
+  action(z_stallAndWaitOptionalQueue, "\pz", desc="Stall prefetch request queue") {
+    stall_and_wait(optionalQueue_in, address);
+  }
+
+  // hardware transactional memory
+
+  action(hars_htmAddToReadSet, "\hars", desc="add to HTM read set") {
+    peek(mandatoryQueue_in, RubyRequest) {
+      if (htmTransactionalState && in_msg.htmFromTransaction) {
+        assert(!htmFailed);
+        if (!cache_entry.getInHtmReadSet()) {
+          DPRINTF(HtmMem,
+              "Adding 0x%lx to transactional read set htmUid=%u.\n",
+              address, htmUid);
+          cache_entry.setInHtmReadSet(true);
+        }
+      }
+    }
+  }
+
+  action(haws_htmAddToWriteSet, "\haws", desc="add to HTM write set") {
+    peek(mandatoryQueue_in, RubyRequest) {
+      if (htmTransactionalState && in_msg.htmFromTransaction) {
+        assert(!htmFailed);
+        assert(!((cache_entry.getInHtmWriteSet() == false) &&
+          (cache_entry.CacheState == State:IM)));
+        assert(!((cache_entry.getInHtmWriteSet() == false) &&
+          (cache_entry.CacheState == State:SM)));
+        // ON DEMAND write-back
+        // if modified and not in write set,
+        // write back and retain M state
+        if((cache_entry.CacheState == State:M) &&
+           !cache_entry.getInHtmWriteSet()) {
+          // code copied from issuePUTX
+          enqueue(requestNetwork_out, CoherenceMsg, response_latency) {
+            assert(is_valid(cache_entry));
+            out_msg.addr := address;
+            out_msg.Class := CoherenceClass:PUTX_COPY;
+            out_msg.DataBlk := cache_entry.DataBlk;
+            out_msg.Dirty := cache_entry.Dirty;
+            out_msg.Sender:= machineID;
+            out_msg.Dest := createMachineID(MachineType:L1Cache, version);
+            out_msg.MessageSize := MessageSizeType:Writeback_Data;
+          }
+        }
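+        // with the pre-transactional value written back to L1, an abort
+        // can invalidate the speculative L0 copy (see htmAbortTransaction)
+        // without losing the last committed data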
+        if (!cache_entry.getInHtmWriteSet()) {
+          DPRINTF(HtmMem,
+              "Adding 0x%lx to transactional write set htmUid=%u.\n",
+              address, htmUid);
+          cache_entry.setInHtmWriteSet(true);
+        }
+      }
+    }
+  }
+
+  action(hfts_htmFailTransactionSize, "\hfts^",
+  desc="Fail transaction due to cache associativity/capacity conflict") {
+    if (htmTransactionalState &&
+        (cache_entry.getInHtmReadSet() || cache_entry.getInHtmWriteSet())) {
+      DPRINTF(HtmMem,
+          "Failure of a transaction due to cache associativity/capacity: rs=%s, ws=%s, addr=0x%lx, htmUid=%u\n",
+          cache_entry.getInHtmReadSet(), cache_entry.getInHtmWriteSet(),
+          address, htmUid);
+      htmFailed := true;
+      htmFailedRc := HtmFailedInCacheReason:FAIL_SELF;
+    }
+  }
+
+  action(hftm_htmFailTransactionMem, "\hftm^",
+  desc="Fail transaction due to memory conflict") {
+    if (htmTransactionalState &&
+        (cache_entry.getInHtmReadSet() || cache_entry.getInHtmWriteSet())) {
+      DPRINTF(HtmMem,
+          "Failure of a transaction due to memory conflict: rs=%s, ws=%s, addr=0x%lx, htmUid=%u\n",
+          cache_entry.getInHtmReadSet(), cache_entry.getInHtmWriteSet(),
+          address, htmUid);
+      htmFailed := true;
+      htmFailedRc := HtmFailedInCacheReason:FAIL_REMOTE;
+    }
+  }
+
+  action(hvu_htmVerifyUid, "\hvu",
+  desc="Ensure cache htmUid is equivalent to message htmUid") {
+    peek(mandatoryQueue_in, RubyRequest) {
+      if (htmUid != in_msg.htmTransactionUid) {
+        DPRINTF(HtmMem, "cache's htmUid=%u and request's htmUid=%u\n",
+            htmUid, in_msg.htmTransactionUid);
+        error("mismatch between cache's htmUid and request's htmUid");
+      }
+    }
+  }
+
+  action(hcs_htmCommandSucceed, "\hcs",
+  desc="Notify sequencer HTM command succeeded") {
+    peek(mandatoryQueue_in, RubyRequest) {
+      assert(is_invalid(cache_entry) && is_invalid(tbe));
+      DPRINTF(RubySlicc, "htm command successful\n");
+      sequencer.htmCallback(in_msg.LineAddress,
+        HtmCallbackMode:HTM_CMD, HtmFailedInCacheReason:NO_FAIL);
+    }
+  }
+
+  action(hcs_htmCommandFail, "\hcf",
+  desc="Notify sequencer HTM command failed") {
+    peek(mandatoryQueue_in, RubyRequest) {
+      assert(is_invalid(cache_entry) && is_invalid(tbe));
+      DPRINTF(RubySlicc, "htm command failure\n");
+      sequencer.htmCallback(in_msg.LineAddress,
+        HtmCallbackMode:HTM_CMD, htmFailedRc);
+    }
+  }
+
+  action(hcs_htmLoadFail, "\hlf",
+  desc="Notify sequencer HTM transactional load failed") {
+    peek(mandatoryQueue_in, RubyRequest) {
+      DPRINTF(RubySlicc, "htm transactional load failure\n");
+      sequencer.htmCallback(in_msg.LineAddress,
+        HtmCallbackMode:LD_FAIL, htmFailedRc);
+    }
+  }
+
+  action(hcs_htmStoreFail, "\hsf",
+  desc="Notify sequencer HTM transactional store failed") {
+    peek(mandatoryQueue_in, RubyRequest) {
+      DPRINTF(RubySlicc, "htm transactional store failure\n");
+      sequencer.htmCallback(in_msg.LineAddress,
+        HtmCallbackMode:ST_FAIL, htmFailedRc);
+    }
+  }
+
+  action(hat_htmAbortTransaction, "\hat",
+  desc="Abort HTM transaction and rollback cache to pre-transactional state") {
+    assert(is_invalid(cache_entry) && is_invalid(tbe));
+    assert (htmTransactionalState);
+    Dcache.htmAbortTransaction();
+    htmTransactionalState := false;
+    htmFailed := false;
+    sequencer.llscClearLocalMonitor();
+    DPRINTF(RubySlicc, "Aborted htm transaction\n");
+  }
+
+  action(hst_htmStartTransaction, "\hst",
+  desc="Place cache in HTM transactional state") {
+    assert(is_invalid(cache_entry) && is_invalid(tbe));
+    assert (!htmTransactionalState);
+    htmTransactionalState := true;
+    htmFailedRc := HtmFailedInCacheReason:NO_FAIL;
+    sequencer.llscClearLocalMonitor();
+    DPRINTF(RubySlicc, "Started htm transaction\n");
+  }
+
+  action(hct_htmCommitTransaction, "\hct",
+  desc="Commit speculative loads/stores and place cache in normal state") {
+    assert(is_invalid(cache_entry) && is_invalid(tbe));
+    assert (htmTransactionalState);
+    assert (!htmFailed);
+    Dcache.htmCommitTransaction();
+    sequencer.llscClearLocalMonitor();
+    htmTransactionalState := false;
+    DPRINTF(RubySlicc, "Committed htm transaction\n");
+  }
+
+  action(hcnt_htmCancelTransaction, "\hcnt",
+  desc="Fail HTM transaction explicitely without aborting") {
+    assert(is_invalid(cache_entry) && is_invalid(tbe));
+    assert (htmTransactionalState);
+    htmFailed := true;
+    htmFailedRc := HtmFailedInCacheReason:FAIL_OTHER;
+    DPRINTF(RubySlicc, "Cancelled htm transaction\n");
+  }
+
+  //*****************************************************
+  // TRANSITIONS
+  //*****************************************************
+
+  // Transitions for Load/Store/Replacement/WriteBack from transient states
+  transition({Inst_IS, IS, IM, SM}, {Load, Ifetch, Store, L0_Replacement}) {
+    z_stallAndWaitMandatoryQueue;
+  }
+
+  // Transitions from Idle
+  transition(I, Load, IS) {
+    oo_allocateDCacheBlock;
+    i_allocateTBE;
+    hars_htmAddToReadSet;
+    a_issueGETS;
+    uu_profileDataMiss;
+    po_observeMiss;
+    k_popMandatoryQueue;
+  }
+
+  transition(I, Ifetch, Inst_IS) {
+    pp_allocateICacheBlock;
+    i_allocateTBE;
+    a_issueGETS;
+    uu_profileInstMiss;
+    po_observeMiss;
+    k_popMandatoryQueue;
+  }
+
+  transition(I, Store, IM) {
+    oo_allocateDCacheBlock;
+    i_allocateTBE;
+    haws_htmAddToWriteSet;
+    b_issueGETX;
+    uu_profileDataMiss;
+    po_observeMiss;
+    k_popMandatoryQueue;
+  }
+
+  transition({I, Inst_IS}, {InvOwn, InvElse}) {
+    forward_eviction_to_cpu;
+    fi_sendInvAck;
+    l_popRequestQueue;
+  }
+
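+  // While a line is in the read/write set, InvOwn is treated as a
+  // capacity/associativity conflict (hfts -> FAIL_SELF) and InvElse as a
+  // remote memory conflict (hftm -> FAIL_REMOTE).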
+  transition({IS, IM}, InvOwn) {
+    hfts_htmFailTransactionSize;
+    forward_eviction_to_cpu;
+    fi_sendInvAck;
+    l_popRequestQueue;
+  }
+
+  transition({IS, IM}, InvElse) {
+    hftm_htmFailTransactionMem;
+    forward_eviction_to_cpu;
+    fi_sendInvAck;
+    l_popRequestQueue;
+  }
+
+  transition(SM, InvOwn, IM) {
+    hfts_htmFailTransactionSize;
+    forward_eviction_to_cpu;
+    fi_sendInvAck;
+    l_popRequestQueue;
+  }
+
+  transition(SM, InvElse, IM) {
+    hftm_htmFailTransactionMem;
+    forward_eviction_to_cpu;
+    fi_sendInvAck;
+    l_popRequestQueue;
+  }
+
+  // Transitions from Shared
+  transition({S,E,M}, Load) {
+    hars_htmAddToReadSet;
+    h_load_hit;
+    uu_profileDataHit;
+    pph_observePfHit;
+    k_popMandatoryQueue;
+  }
+
+  transition({S,E,M}, Ifetch) {
+    h_ifetch_hit;
+    uu_profileInstHit;
+    pph_observePfHit;
+    k_popMandatoryQueue;
+  }
+
+  transition(S, Store, SM) {
+    i_allocateTBE;
+    haws_htmAddToWriteSet;
+    c_issueUPGRADE;
+    uu_profileDataMiss;
+    k_popMandatoryQueue;
+  }
+
+  transition(S, {L0_Replacement,PF_L0_Replacement}, I) {
+    hfts_htmFailTransactionSize;
+    forward_eviction_to_cpu;
+    ff_deallocateCacheBlock;
+  }
+
+  transition(S, InvOwn, I) {
+    hfts_htmFailTransactionSize;
+    forward_eviction_to_cpu;
+    fi_sendInvAck;
+    ff_deallocateCacheBlock;
+    l_popRequestQueue;
+  }
+
+  transition(S, InvElse, I) {
+    hftm_htmFailTransactionMem;
+    forward_eviction_to_cpu;
+    fi_sendInvAck;
+    ff_deallocateCacheBlock;
+    l_popRequestQueue;
+  }
+
+  // Transitions from Exclusive
+  transition({E,M}, Store, M) {
+    haws_htmAddToWriteSet;
+    hh_store_hit;
+    uu_profileDataHit;
+    pph_observePfHit;
+    k_popMandatoryQueue;
+  }
+
+  transition(E, {L0_Replacement,PF_L0_Replacement}, I) {
+    hfts_htmFailTransactionSize;
+    forward_eviction_to_cpu;
+    g_issuePUTE;
+    ff_deallocateCacheBlock;
+  }
+
+  transition(E, {InvElse, Fwd_GETX}, I) {
+    hftm_htmFailTransactionMem;
+    // don't send data
+    forward_eviction_to_cpu;
+    fi_sendInvAck;
+    ff_deallocateCacheBlock;
+    l_popRequestQueue;
+  }
+
+  transition(E, InvOwn, I) {
+    hfts_htmFailTransactionSize;
+    // don't send data
+    forward_eviction_to_cpu;
+    fi_sendInvAck;
+    ff_deallocateCacheBlock;
+    l_popRequestQueue;
+  }
+
+  transition(E, {Fwd_GETS, Fwd_GET_INSTR}, S) {
+    f_sendDataToL1;
+    l_popRequestQueue;
+  }
+
+  // Transitions from Modified
+  transition(M, {L0_Replacement,PF_L0_Replacement}, I) {
+    hfts_htmFailTransactionSize;
+    forward_eviction_to_cpu;
+    g_issuePUTM;
+    ff_deallocateCacheBlock;
+  }
+
+  transition(M, InvOwn, I) {
+    hfts_htmFailTransactionSize;
+    forward_eviction_to_cpu;
+    f_sendDataToL1;
+    ff_deallocateCacheBlock;
+    l_popRequestQueue;
+  }
+
+  transition(M, {InvElse, Fwd_GETX}, I) {
+    hftm_htmFailTransactionMem;
+    forward_eviction_to_cpu;
+    f_sendDataToL1;
+    ff_deallocateCacheBlock;
+    l_popRequestQueue;
+  }
+
+  transition(M, {Fwd_GETS, Fwd_GET_INSTR}, S) {
+    hftm_htmFailTransactionMem;
+    f_sendDataToL1;
+    l_popRequestQueue;
+  }
+
+  transition(IS, Data, S) {
+    u_writeDataToCache;
+    hx_load_hit;
+    s_deallocateTBE;
+    o_popIncomingResponseQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(IS, Data_Exclusive, E) {
+    u_writeDataToCache;
+    hx_load_hit;
+    s_deallocateTBE;
+    o_popIncomingResponseQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(IS, Data_Stale, I) {
+    hftm_htmFailTransactionMem;
+    u_writeDataToCache;
+    forward_eviction_to_cpu;
+    hx_load_hit;
+    s_deallocateTBE;
+    ff_deallocateCacheBlock;
+    o_popIncomingResponseQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(Inst_IS, Data, S) {
+    u_writeInstToCache;
+    hx_ifetch_hit;
+    s_deallocateTBE;
+    o_popIncomingResponseQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(Inst_IS, Data_Exclusive, E) {
+    u_writeInstToCache;
+    hx_ifetch_hit;
+    s_deallocateTBE;
+    o_popIncomingResponseQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(Inst_IS, Data_Stale, I) {
+    u_writeInstToCache;
+    hx_ifetch_hit;
+    s_deallocateTBE;
+    ff_deallocateCacheBlock;
+    o_popIncomingResponseQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition({IM,SM}, Data_Exclusive, M) {
+    u_writeDataToCache;
+    hhx_store_hit;
+    s_deallocateTBE;
+    o_popIncomingResponseQueue;
+    kd_wakeUpDependents;
+  }
+
+  // store conditionals
+
+  transition({I,S,E,M}, Failed_SC) {
+    // IS,IM,SM don't handle store conditionals
+    hhc_storec_fail;
+    k_popMandatoryQueue;
+  }
+
+  // prefetcher
+
+  transition({Inst_IS, IS, IM, SM, PF_Inst_IS, PF_IS, PF_IE}, PF_L0_Replacement) {
+     z_stallAndWaitOptionalQueue;
+  }
+
+  transition({PF_Inst_IS, PF_IS}, {Store, L0_Replacement}) {
+    z_stallAndWaitMandatoryQueue;
+  }
+
+  transition({PF_IE}, {Load, Ifetch, L0_Replacement}) {
+    z_stallAndWaitMandatoryQueue;
+  }
+
+  transition({S,E,M,Inst_IS,IS,IM,SM,PF_Inst_IS,PF_IS,PF_IE},
+             {PF_Load, PF_Store, PF_Ifetch}) {
+      pq_popPrefetchQueue;
+  }
+
+  transition(I, PF_Load, PF_IS) {
+    oo_allocateDCacheBlock;
+    i_allocateTBE;
+    pa_issuePfGETS;
+    pq_popPrefetchQueue;
+  }
+
+  transition(PF_IS, Load, IS) {
+    hars_htmAddToReadSet;
+    uu_profileDataMiss;
+    ppm_observePfMiss;
+    k_popMandatoryQueue;
+  }
+
+  transition(I, PF_Ifetch, PF_Inst_IS) {
+    pp_allocateICacheBlock;
+    i_allocateTBE;
+    pa_issuePfGETS;
+    pq_popPrefetchQueue;
+  }
+
+  transition(PF_Inst_IS, Ifetch, Inst_IS) {
+    uu_profileInstMiss;
+    ppm_observePfMiss;
+    k_popMandatoryQueue;
+  }
+
+  transition(I, PF_Store, PF_IE) {
+    oo_allocateDCacheBlock;
+    i_allocateTBE;
+    pb_issuePfGETX;
+    pq_popPrefetchQueue;
+  }
+
+  transition(PF_IE, Store, IM) {
+    haws_htmAddToWriteSet;
+    uu_profileDataMiss;
+    ppm_observePfMiss;
+    k_popMandatoryQueue;
+  }
+
+  transition({PF_Inst_IS, PF_IS, PF_IE}, {InvOwn, InvElse}) {
+    fi_sendInvAck;
+    l_popRequestQueue;
+  }
+
+  transition(PF_IS, Data, S) {
+    u_writeDataToCache;
+    s_deallocateTBE;
+    mp_markPrefetched;
+    o_popIncomingResponseQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(PF_IS, Data_Exclusive, E) {
+    u_writeDataToCache;
+    s_deallocateTBE;
+    mp_markPrefetched;
+    o_popIncomingResponseQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(PF_IS, Data_Stale, I) {
+    u_writeDataToCache;
+    s_deallocateTBE;
+    mp_markPrefetched;
+    ff_deallocateCacheBlock;
+    o_popIncomingResponseQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(PF_Inst_IS, Data, S) {
+    u_writeInstToCache;
+    s_deallocateTBE;
+    mp_markPrefetched;
+    o_popIncomingResponseQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(PF_Inst_IS, Data_Exclusive, E) {
+    u_writeInstToCache;
+    s_deallocateTBE;
+    mp_markPrefetched;
+    o_popIncomingResponseQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(PF_IE, Data_Exclusive, E) {
+    u_writeDataToCache;
+    s_deallocateTBE;
+    mp_markPrefetched;
+    o_popIncomingResponseQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(I, PF_Bad_Addr) {
+    pq_popPrefetchQueue;
+  }
+
+  // hardware transactional memory
+
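+  // HTM commands arrive on the mandatory queue and are handled only from
+  // state I; the command actions assert that no cache entry or TBE exists
+  // for the command's line address.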
+  transition(I, HTM_Abort) {
+    hvu_htmVerifyUid;
+    hat_htmAbortTransaction;
+    hcs_htmCommandSucceed;
+    k_popMandatoryQueue;
+  }
+
+  transition(I, HTM_Start) {
+    hvu_htmVerifyUid;
+    hst_htmStartTransaction;
+    hcs_htmCommandSucceed;
+    k_popMandatoryQueue;
+  }
+
+  transition(I, HTM_Commit) {
+    hvu_htmVerifyUid;
+    hct_htmCommitTransaction;
+    hcs_htmCommandSucceed;
+    k_popMandatoryQueue;
+  }
+
+  transition(I, HTM_Cancel) {
+    hvu_htmVerifyUid;
+    hcnt_htmCancelTransaction;
+    hcs_htmCommandSucceed;
+    k_popMandatoryQueue;
+  }
+
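+  // The notify events report an already-failed transaction back to the
+  // sequencer together with the recorded failure reason (htmFailedRc).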
+  transition(I, HTM_notifyCMD) {
+    hvu_htmVerifyUid;
+    hcs_htmCommandFail;
+    k_popMandatoryQueue;
+  }
+
+  transition({I,S,E,M,IS,IM,SM,PF_IS,PF_IE}, HTM_notifyLD) {
+    hvu_htmVerifyUid;
+    hcs_htmLoadFail;
+    k_popMandatoryQueue;
+  }
+
+  transition({I,S,E,M,IS,IM,SM,PF_IS,PF_IE}, HTM_notifyST) {
+    hvu_htmVerifyUid;
+    hcs_htmStoreFail;
+    k_popMandatoryQueue;
+  }
+
+  transition(I, {L0_Replacement,PF_L0_Replacement}) {
+    ff_deallocateCacheBlock;
+  }
+}
diff --git a/src/mem/ruby/protocol/MESI_Three_Level_HTM.slicc b/src/mem/ruby/protocol/MESI_Three_Level_HTM.slicc
new file mode 100644
index 0000000..4ec31b5
--- /dev/null
+++ b/src/mem/ruby/protocol/MESI_Three_Level_HTM.slicc
@@ -0,0 +1,9 @@
+protocol "MESI_Three_Level_HTM";
+include "RubySlicc_interfaces.slicc";
+include "MESI_Two_Level-msg.sm";
+include "MESI_Three_Level-msg.sm";
+include "MESI_Three_Level_HTM-L0cache.sm";
+include "MESI_Three_Level-L1cache.sm";
+include "MESI_Two_Level-L2cache.sm";
+include "MESI_Two_Level-dir.sm";
+include "MESI_Two_Level-dma.sm";
diff --git a/src/mem/ruby/protocol/RubySlicc_Exports.sm b/src/mem/ruby/protocol/RubySlicc_Exports.sm
index f1d17c8..ea61350 100644
--- a/src/mem/ruby/protocol/RubySlicc_Exports.sm
+++ b/src/mem/ruby/protocol/RubySlicc_Exports.sm
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited
+ * Copyright (c) 2020 ARM Limited
  * All rights reserved.
  *
  * The license below extends only to copyright in the software and shall
@@ -167,6 +167,31 @@
   Release,           desc="Release operation";
   Acquire,           desc="Acquire opertion";
   AcquireRelease,    desc="Acquire and Release opertion";
+  HTM_Start,         desc="hardware memory transaction: begin";
+  HTM_Commit,        desc="hardware memory transaction: commit";
+  HTM_Cancel,        desc="hardware memory transaction: cancel";
+  HTM_Abort,         desc="hardware memory transaction: abort";
+}
+
+bool isWriteRequest(RubyRequestType type);
+bool isDataReadRequest(RubyRequestType type);
+bool isReadRequest(RubyRequestType type);
+bool isHtmCmdRequest(RubyRequestType type);
+
+// hardware transactional memory
+RubyRequestType htmCmdToRubyRequestType(Packet *pkt);
+
+enumeration(HtmCallbackMode, desc="...", default="HtmCallbackMode_NULL") {
+  HTM_CMD,          desc="htm command";
+  LD_FAIL,          desc="htm transaction failed - inform via read";
+  ST_FAIL,          desc="htm transaction failed - inform via write";
+}
+
+enumeration(HtmFailedInCacheReason, desc="...", default="HtmFailedInCacheReason_NO_FAIL") {
+  NO_FAIL,          desc="no failure in cache";
+  FAIL_SELF,        desc="failed due to local cache's replacement policy";
+  FAIL_REMOTE,      desc="failed due to remote invalidation";
+  FAIL_OTHER,       desc="failed due to other circumstances";
 }
 
 enumeration(SequencerRequestType, desc="...", default="SequencerRequestType_NULL") {
diff --git a/src/mem/ruby/protocol/RubySlicc_Types.sm b/src/mem/ruby/protocol/RubySlicc_Types.sm
index 71716f9..9c64732 100644
--- a/src/mem/ruby/protocol/RubySlicc_Types.sm
+++ b/src/mem/ruby/protocol/RubySlicc_Types.sm
@@ -132,12 +132,18 @@
   // ll/sc support
   void writeCallbackScFail(Addr, DataBlock);
   bool llscCheckMonitor(Addr);
+  void llscClearLocalMonitor();
 
   void evictionCallback(Addr);
   void recordRequestType(SequencerRequestType);
   bool checkResourceAvailable(CacheResourceType, Addr);
 }
 
+structure (HTMSequencer, interface="Sequencer", external = "yes") {
+  // hardware transactional memory
+  void htmCallback(Addr, HtmCallbackMode, HtmFailedInCacheReason);
+}
+
 structure(RubyRequest, desc="...", interface="Message", external="yes") {
   Addr LineAddress,       desc="Line address for this request";
   Addr PhysicalAddress,   desc="Physical address for this request";
@@ -152,6 +158,8 @@
   int wfid,                  desc="Writethrough wavefront";
   uint64_t instSeqNum,       desc="Instruction sequence number";
   PacketPtr pkt,             desc="Packet associated with this request";
+  bool htmFromTransaction,   desc="Memory request originates within a HTM transaction";
+  int htmTransactionUid,     desc="Used to identify the unique HTM transaction that produced this request";
 }
 
 structure(AbstractCacheEntry, primitive="yes", external = "yes") {
@@ -185,6 +193,10 @@
   void recordRequestType(CacheRequestType, Addr);
   bool checkResourceAvailable(CacheResourceType, Addr);
 
+  // hardware transactional memory
+  void htmCommitTransaction();
+  void htmAbortTransaction();
+
   int getCacheSize();
   int getNumBlocks();
   Addr getAddressAtIdx(int);
diff --git a/src/mem/ruby/protocol/SConsopts b/src/mem/ruby/protocol/SConsopts
index 4a309e6..104d425 100644
--- a/src/mem/ruby/protocol/SConsopts
+++ b/src/mem/ruby/protocol/SConsopts
@@ -38,6 +38,7 @@
     'MOESI_AMD_Base',
     'MESI_Two_Level',
     'MESI_Three_Level',
+    'MESI_Three_Level_HTM',
     'MI_example',
     'MOESI_CMP_directory',
     'MOESI_CMP_token',
diff --git a/src/mem/ruby/slicc_interface/AbstractCacheEntry.cc b/src/mem/ruby/slicc_interface/AbstractCacheEntry.cc
index b98425d..c669b1c 100644
--- a/src/mem/ruby/slicc_interface/AbstractCacheEntry.cc
+++ b/src/mem/ruby/slicc_interface/AbstractCacheEntry.cc
@@ -1,4 +1,16 @@
 /*
+ * Copyright (c) 2020 ARM Limited
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder.  You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
  * Copyright (c) 1999-2008 Mark D. Hill and David A. Wood
  * All rights reserved.
  *
@@ -37,6 +49,8 @@
     m_Address = 0;
     m_locked = -1;
     m_last_touch_tick = 0;
+    m_htmInReadSet = false;
+    m_htmInWriteSet = false;
 }
 
 AbstractCacheEntry::~AbstractCacheEntry()
@@ -81,3 +95,27 @@
             m_Address, m_locked, context);
     return m_locked == context;
 }
+
+void
+AbstractCacheEntry::setInHtmReadSet(bool val)
+{
+    m_htmInReadSet = val;
+}
+
+void
+AbstractCacheEntry::setInHtmWriteSet(bool val)
+{
+    m_htmInWriteSet = val;
+}
+
+bool
+AbstractCacheEntry::getInHtmReadSet() const
+{
+    return m_htmInReadSet;
+}
+
+bool
+AbstractCacheEntry::getInHtmWriteSet() const
+{
+    return m_htmInWriteSet;
+}
diff --git a/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh b/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh
index 056486c..aa37cc5 100644
--- a/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh
+++ b/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh
@@ -1,4 +1,16 @@
 /*
+ * Copyright (c) 2020 ARM Limited
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder.  You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
  * Copyright (c) 1999-2008 Mark D. Hill and David A. Wood
  * All rights reserved.
  *
@@ -90,6 +102,18 @@
 
     // Set the last access Tick.
     void setLastAccess(Tick tick) { m_last_touch_tick = tick; }
+
+    // hardware transactional memory
+    void setInHtmReadSet(bool val);
+    void setInHtmWriteSet(bool val);
+    bool getInHtmReadSet() const;
+    bool getInHtmWriteSet() const;
+    virtual void invalidateEntry() {}
+
+  private:
+    // hardware transactional memory
+    bool m_htmInReadSet;
+    bool m_htmInWriteSet;
 };
 
 inline std::ostream&
diff --git a/src/mem/ruby/slicc_interface/RubyRequest.hh b/src/mem/ruby/slicc_interface/RubyRequest.hh
index b3e2396..ed8dbbb 100644
--- a/src/mem/ruby/slicc_interface/RubyRequest.hh
+++ b/src/mem/ruby/slicc_interface/RubyRequest.hh
@@ -1,4 +1,16 @@
 /*
+ * Copyright (c) 2020 ARM Limited
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder.  You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
  * Copyright (c) 2009 Mark D. Hill and David A. Wood
  * All rights reserved.
  *
@@ -57,6 +69,8 @@
     DataBlock m_WTData;
     int m_wfid;
     uint64_t m_instSeqNum;
+    bool m_htmFromTransaction;
+    uint64_t m_htmTransactionUid;
 
     RubyRequest(Tick curTime, uint64_t _paddr, uint8_t* _data, int _len,
         uint64_t _pc, RubyRequestType _type, RubyAccessMode _access_mode,
@@ -71,7 +85,9 @@
           m_Prefetch(_pb),
           data(_data),
           m_pkt(_pkt),
-          m_contextId(_core_id)
+          m_contextId(_core_id),
+          m_htmFromTransaction(false),
+          m_htmTransactionUid(0)
     {
         m_LineAddress = makeLineAddress(m_PhysicalAddress);
     }
@@ -96,7 +112,9 @@
           m_writeMask(_wm_size,_wm_mask),
           m_WTData(_Data),
           m_wfid(_proc_id),
-          m_instSeqNum(_instSeqNum)
+          m_instSeqNum(_instSeqNum),
+          m_htmFromTransaction(false),
+          m_htmTransactionUid(0)
     {
         m_LineAddress = makeLineAddress(m_PhysicalAddress);
     }
@@ -122,7 +140,9 @@
           m_writeMask(_wm_size,_wm_mask,_atomicOps),
           m_WTData(_Data),
           m_wfid(_proc_id),
-          m_instSeqNum(_instSeqNum)
+          m_instSeqNum(_instSeqNum),
+          m_htmFromTransaction(false),
+          m_htmTransactionUid(0)
     {
         m_LineAddress = makeLineAddress(m_PhysicalAddress);
     }
diff --git a/src/mem/ruby/slicc_interface/RubySlicc_Util.hh b/src/mem/ruby/slicc_interface/RubySlicc_Util.hh
index e3d4f0b..8ff8884 100644
--- a/src/mem/ruby/slicc_interface/RubySlicc_Util.hh
+++ b/src/mem/ruby/slicc_interface/RubySlicc_Util.hh
@@ -1,4 +1,16 @@
 /*
+ * Copyright (c) 2020 ARM Limited
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder.  You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
  * Copyright (c) 1999-2008 Mark D. Hill and David A. Wood
  * Copyright (c) 2013 Advanced Micro Devices, Inc.
  * All rights reserved.
@@ -85,6 +97,75 @@
   return 1024;
 }
 
+inline bool
+isWriteRequest(RubyRequestType type)
+{
+    if ((type == RubyRequestType_ST) ||
+        (type == RubyRequestType_ATOMIC) ||
+        (type == RubyRequestType_RMW_Read) ||
+        (type == RubyRequestType_RMW_Write) ||
+        (type == RubyRequestType_Store_Conditional) ||
+        (type == RubyRequestType_Locked_RMW_Read) ||
+        (type == RubyRequestType_Locked_RMW_Write) ||
+        (type == RubyRequestType_FLUSH)) {
+            return true;
+    } else {
+            return false;
+    }
+}
+
+inline bool
+isDataReadRequest(RubyRequestType type)
+{
+    if ((type == RubyRequestType_LD) ||
+        (type == RubyRequestType_Load_Linked)) {
+            return true;
+    } else {
+            return false;
+    }
+}
+
+inline bool
+isReadRequest(RubyRequestType type)
+{
+    if (isDataReadRequest(type) ||
+        (type == RubyRequestType_IFETCH)) {
+            return true;
+    } else {
+            return false;
+    }
+}
+
+inline bool
+isHtmCmdRequest(RubyRequestType type)
+{
+    if ((type == RubyRequestType_HTM_Start)  ||
+        (type == RubyRequestType_HTM_Commit) ||
+        (type == RubyRequestType_HTM_Cancel) ||
+        (type == RubyRequestType_HTM_Abort)) {
+            return true;
+    } else {
+            return false;
+    }
+}
+
+inline RubyRequestType
+htmCmdToRubyRequestType(const Packet *pkt)
+{
+    if (pkt->req->isHTMStart()) {
+        return RubyRequestType_HTM_Start;
+    } else if (pkt->req->isHTMCommit()) {
+        return RubyRequestType_HTM_Commit;
+    } else if (pkt->req->isHTMCancel()) {
+        return RubyRequestType_HTM_Cancel;
+    } else if (pkt->req->isHTMAbort()) {
+        return RubyRequestType_HTM_Abort;
+    }
+    else {
+        panic("invalid ruby packet type\n");
+    }
+}
+
 /**
  * This function accepts an address, a data block and a packet. If the address
  * range for the data block contains the address which the packet needs to
diff --git a/src/mem/ruby/structures/CacheMemory.cc b/src/mem/ruby/structures/CacheMemory.cc
index efdd77d..b3f2c61 100644
--- a/src/mem/ruby/structures/CacheMemory.cc
+++ b/src/mem/ruby/structures/CacheMemory.cc
@@ -1,4 +1,16 @@
 /*
+ * Copyright (c) 2020 ARM Limited
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder.  You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
  * Copyright (c) 1999-2012 Mark D. Hill and David A. Wood
  * Copyright (c) 2013 Advanced Micro Devices, Inc.
  * All rights reserved.
@@ -31,6 +43,7 @@
 
 #include "base/intmath.hh"
 #include "base/logging.hh"
+#include "debug/HtmMem.hh"
 #include "debug/RubyCache.hh"
 #include "debug/RubyCacheTrace.hh"
 #include "debug/RubyResourceStalls.hh"
@@ -479,6 +492,23 @@
     entry->clearLocked();
 }
 
+void
+CacheMemory::clearLockedAll(int context)
+{
+    // iterate through every set and way to get a cache line
+    for (auto i = m_cache.begin(); i != m_cache.end(); ++i) {
+        std::vector<AbstractCacheEntry*> set = *i;
+        for (auto j = set.begin(); j != set.end(); ++j) {
+            AbstractCacheEntry *line = *j;
+            if (line && line->isLocked(context)) {
+                DPRINTF(RubyCache, "Clear Lock for addr: %#x\n",
+                    line->m_Address);
+                line->clearLocked();
+            }
+        }
+    }
+}
+
 bool
 CacheMemory::isLocked(Addr address, int context)
 {
@@ -578,6 +608,34 @@
         .desc("number of stalls caused by data array")
         .flags(Stats::nozero)
         ;
+
+    htmTransCommitReadSet
+        .init(8)
+        .name(name() + ".htm_transaction_committed_read_set")
+        .desc("read set size of a committed transaction")
+        .flags(Stats::pdf | Stats::dist | Stats::nozero | Stats::nonan)
+        ;
+
+    htmTransCommitWriteSet
+        .init(8)
+        .name(name() + ".htm_transaction_committed_write_set")
+        .desc("write set size of a committed transaction")
+        .flags(Stats::pdf | Stats::dist | Stats::nozero | Stats::nonan)
+        ;
+
+    htmTransAbortReadSet
+        .init(8)
+        .name(name() + ".htm_transaction_aborted_read_set")
+        .desc("read set size of a aborted transaction")
+        .flags(Stats::pdf | Stats::dist | Stats::nozero | Stats::nonan)
+        ;
+
+    htmTransAbortWriteSet
+        .init(8)
+        .name(name() + ".htm_transaction_aborted_write_set")
+        .desc("write set size of a aborted transaction")
+        .flags(Stats::pdf | Stats::dist | Stats::nozero | Stats::nonan)
+        ;
 }
 
 // assumption: SLICC generated files will only call this function
@@ -655,3 +713,69 @@
 {
   return (m_cache[cache_set][loc]->m_Permission != AccessPermission_Busy);
 }
+
+/* hardware transactional memory */
+
+void
+CacheMemory::htmAbortTransaction()
+{
+    uint64_t htmReadSetSize = 0;
+    uint64_t htmWriteSetSize = 0;
+
+    // iterate through every set and way to get a cache line
+    for (auto i = m_cache.begin(); i != m_cache.end(); ++i)
+    {
+        std::vector<AbstractCacheEntry*> set = *i;
+
+        for (auto j = set.begin(); j != set.end(); ++j)
+        {
+            AbstractCacheEntry *line = *j;
+
+            if (line != nullptr) {
+                htmReadSetSize += (line->getInHtmReadSet() ? 1 : 0);
+                htmWriteSetSize += (line->getInHtmWriteSet() ? 1 : 0);
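+                // only speculatively written lines are discarded; read-set
+                // lines were never modified and keep their data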
+                if (line->getInHtmWriteSet()) {
+                    line->invalidateEntry();
+                }
+                line->setInHtmWriteSet(false);
+                line->setInHtmReadSet(false);
+                line->clearLocked();
+            }
+        }
+    }
+
+    htmTransAbortReadSet.sample(htmReadSetSize);
+    htmTransAbortWriteSet.sample(htmWriteSetSize);
+    DPRINTF(HtmMem, "htmAbortTransaction: read set=%u write set=%u\n",
+        htmReadSetSize, htmWriteSetSize);
+}
+
+void
+CacheMemory::htmCommitTransaction()
+{
+    uint64_t htmReadSetSize = 0;
+    uint64_t htmWriteSetSize = 0;
+
+    // iterate through every set and way to get a cache line
+    for (auto i = m_cache.begin(); i != m_cache.end(); ++i)
+    {
+        std::vector<AbstractCacheEntry*> set = *i;
+
+        for (auto j = set.begin(); j != set.end(); ++j)
+        {
+            AbstractCacheEntry *line = *j;
+            if (line != nullptr) {
+                htmReadSetSize += (line->getInHtmReadSet() ? 1 : 0);
+                htmWriteSetSize += (line->getInHtmWriteSet() ? 1 : 0);
+                line->setInHtmWriteSet(false);
+                line->setInHtmReadSet(false);
+                line->clearLocked();
+            }
+        }
+    }
+
+    htmTransCommitReadSet.sample(htmReadSetSize);
+    htmTransCommitWriteSet.sample(htmWriteSetSize);
+    DPRINTF(HtmMem, "htmCommitTransaction: read set=%u write set=%u\n",
+        htmReadSetSize, htmWriteSetSize);
+}
diff --git a/src/mem/ruby/structures/CacheMemory.hh b/src/mem/ruby/structures/CacheMemory.hh
index 7d82d5f..0899e68 100644
--- a/src/mem/ruby/structures/CacheMemory.hh
+++ b/src/mem/ruby/structures/CacheMemory.hh
@@ -1,4 +1,16 @@
 /*
+ * Copyright (c) 2020 ARM Limited
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder.  You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
  * Copyright (c) 1999-2012 Mark D. Hill and David A. Wood
  * Copyright (c) 2013 Advanced Micro Devices, Inc.
  * All rights reserved.
@@ -121,6 +133,7 @@
     // provided by the AbstractCacheEntry class.
     void setLocked (Addr addr, int context);
     void clearLocked (Addr addr);
+    void clearLockedAll (int context);
     bool isLocked (Addr addr, int context);
 
     // Print cache contents
@@ -131,6 +144,10 @@
     bool checkResourceAvailable(CacheResourceType res, Addr addr);
     void recordRequestType(CacheRequestType requestType, Addr addr);
 
+    // hardware transactional memory
+    void htmAbortTransaction();
+    void htmCommitTransaction();
+
   public:
     Stats::Scalar m_demand_hits;
     Stats::Scalar m_demand_misses;
@@ -150,6 +167,12 @@
     Stats::Scalar numTagArrayStalls;
     Stats::Scalar numDataArrayStalls;
 
+    // hardware transactional memory
+    Stats::Histogram htmTransCommitReadSet;
+    Stats::Histogram htmTransCommitWriteSet;
+    Stats::Histogram htmTransAbortReadSet;
+    Stats::Histogram htmTransAbortWriteSet;
+
     int getCacheSize() const { return m_cache_size; }
     int getCacheAssoc() const { return m_cache_assoc; }
     int getNumBlocks() const { return m_cache_num_sets * m_cache_assoc; }
diff --git a/src/mem/ruby/system/HTMSequencer.cc b/src/mem/ruby/system/HTMSequencer.cc
new file mode 100644
index 0000000..d2cfa07
--- /dev/null
+++ b/src/mem/ruby/system/HTMSequencer.cc
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2020 ARM Limited
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder.  You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "mem/ruby/system/HTMSequencer.hh"
+
+#include "debug/HtmMem.hh"
+#include "debug/RubyPort.hh"
+#include "mem/ruby/slicc_interface/RubySlicc_Util.hh"
+#include "sim/system.hh"
+
+using namespace std;
+
+HtmCacheFailure
+HTMSequencer::htmRetCodeConversion(
+    const HtmFailedInCacheReason ruby_ret_code)
+{
+    switch (ruby_ret_code) {
+      case HtmFailedInCacheReason_NO_FAIL:
+        return HtmCacheFailure::NO_FAIL;
+      case HtmFailedInCacheReason_FAIL_SELF:
+        return HtmCacheFailure::FAIL_SELF;
+      case HtmFailedInCacheReason_FAIL_REMOTE:
+        return HtmCacheFailure::FAIL_REMOTE;
+      case HtmFailedInCacheReason_FAIL_OTHER:
+        return HtmCacheFailure::FAIL_OTHER;
+      default:
+        panic("Invalid htm return code\n");
+    }
+}
+
+HTMSequencer *
+RubyHTMSequencerParams::create()
+{
+    return new HTMSequencer(this);
+}
+
+HTMSequencer::HTMSequencer(const RubyHTMSequencerParams *p)
+    : Sequencer(p)
+{
+    m_htmstart_tick = 0;
+    m_htmstart_instruction = 0;
+}
+
+HTMSequencer::~HTMSequencer()
+{
+}
+
+void
+HTMSequencer::htmCallback(Addr address,
+                       const HtmCallbackMode mode,
+                       const HtmFailedInCacheReason htm_return_code)
+{
+    // mode=0: HTM command
+    // mode=1: transaction failed - inform via LD
+    // mode=2: transaction failed - inform via ST
+
+    if (mode == HtmCallbackMode_HTM_CMD) {
+        SequencerRequest* request = nullptr;
+
+        assert(m_htmCmdRequestTable.size() > 0);
+
+        request = m_htmCmdRequestTable.front();
+        m_htmCmdRequestTable.pop_front();
+
+        assert(isHtmCmdRequest(request->m_type));
+
+        PacketPtr pkt = request->pkt;
+        delete request;
+
+        // valid responses have zero as the payload
+        uint8_t* dataptr = pkt->getPtr<uint8_t>();
+        memset(dataptr, 0, pkt->getSize());
+        *dataptr = (uint8_t) htm_return_code;
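+        // the leading byte carries the HtmFailedInCacheReason return code
+        // (NO_FAIL on success, matching the zero payload set above)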
+
+        // record stats
+        if (htm_return_code == HtmFailedInCacheReason_NO_FAIL) {
+            if (pkt->req->isHTMStart()) {
+                m_htmstart_tick = pkt->req->time();
+                m_htmstart_instruction = pkt->req->getInstCount();
+                DPRINTF(HtmMem, "htmStart - htmUid=%u\n",
+                        pkt->getHtmTransactionUid());
+            } else if (pkt->req->isHTMCommit()) {
+                Tick transaction_ticks = pkt->req->time() - m_htmstart_tick;
+                Cycles transaction_cycles = ticksToCycles(transaction_ticks);
+                m_htm_transaction_cycles.sample(transaction_cycles);
+                m_htmstart_tick = 0;
+                Counter transaction_instructions =
+                    pkt->req->getInstCount() - m_htmstart_instruction;
+                m_htm_transaction_instructions.sample(
+                  transaction_instructions);
+                m_htmstart_instruction = 0;
+                DPRINTF(HtmMem, "htmCommit - htmUid=%u\n",
+                        pkt->getHtmTransactionUid());
+            } else if (pkt->req->isHTMAbort()) {
+                HtmFailureFaultCause cause = pkt->req->getHtmAbortCause();
+                assert(cause != HtmFailureFaultCause::INVALID);
+                auto cause_idx = static_cast<int>(cause);
+                m_htm_transaction_abort_cause[cause_idx]++;
+                DPRINTF(HtmMem, "htmAbort - reason=%s - htmUid=%u\n",
+                        htmFailureToStr(cause),
+                        pkt->getHtmTransactionUid());
+            }
+        } else {
+            DPRINTF(HtmMem, "HTM_CMD: fail - htmUid=%u\n",
+                pkt->getHtmTransactionUid());
+        }
+
+        rubyHtmCallback(pkt, htm_return_code);
+        testDrainComplete();
+    } else if (mode == HtmCallbackMode_LD_FAIL ||
+               mode == HtmCallbackMode_ST_FAIL) {
+        // transaction failed
+        assert(address == makeLineAddress(address));
+        assert(m_RequestTable.find(address) != m_RequestTable.end());
+
+        auto &seq_req_list = m_RequestTable[address];
+        while (!seq_req_list.empty()) {
+            SequencerRequest &request = seq_req_list.front();
+
+            PacketPtr pkt = request.pkt;
+            markRemoved();
+
+            // TODO - atomics
+
+            // store conditionals should indicate failure
+            if (request.m_type == RubyRequestType_Store_Conditional) {
+                pkt->req->setExtraData(0);
+            }
+
+            DPRINTF(HtmMem, "%s_FAIL: size=%d - "
+                            "addr=0x%lx - htmUid=%d\n",
+                            (mode == HtmCallbackMode_LD_FAIL) ? "LD" : "ST",
+                            pkt->getSize(),
+                            address, pkt->getHtmTransactionUid());
+
+            rubyHtmCallback(pkt, htm_return_code);
+            testDrainComplete();
+            pkt = nullptr;
+            seq_req_list.pop_front();
+        }
+        // free all outstanding requests corresponding to this address
+        if (seq_req_list.empty()) {
+            m_RequestTable.erase(address);
+        }
+    } else {
+        panic("unrecognised HTM callback mode\n");
+    }
+}
+
+void
+HTMSequencer::regStats()
+{
+    Sequencer::regStats();
+
+    // hardware transactional memory
+    m_htm_transaction_cycles
+        .init(10)
+        .name(name() + ".htm_transaction_cycles")
+        .desc("number of cycles spent in an outer transaction")
+        .flags(Stats::pdf | Stats::dist | Stats::nozero | Stats::nonan)
+        ;
+    m_htm_transaction_instructions
+        .init(10)
+        .name(name() + ".htm_transaction_instructions")
+        .desc("number of instructions spent in an outer transaction")
+        .flags(Stats::pdf | Stats::dist | Stats::nozero | Stats::nonan)
+        ;
+    auto num_causes = static_cast<int>(HtmFailureFaultCause::NUM_CAUSES);
+    m_htm_transaction_abort_cause
+        .init(num_causes)
+        .name(name() + ".htm_transaction_abort_cause")
+        .desc("cause of htm transaction abort")
+        .flags(Stats::total | Stats::pdf | Stats::dist | Stats::nozero)
+        ;
+
+    for (unsigned cause_idx = 0; cause_idx < num_causes; ++cause_idx) {
+        m_htm_transaction_abort_cause.subname(
+            cause_idx,
+            htmFailureToStr(HtmFailureFaultCause(cause_idx)));
+    }
+}
+
+void
+HTMSequencer::rubyHtmCallback(PacketPtr pkt,
+                          const HtmFailedInCacheReason htm_return_code)
+{
+    // The packet was destined for memory and has not yet been turned
+    // into a response
+    assert(system->isMemAddr(pkt->getAddr()) || system->isDeviceMemAddr(pkt));
+    assert(pkt->isRequest());
+
+    // First retrieve the request port from the sender State
+    RubyPort::SenderState *senderState =
+        safe_cast<RubyPort::SenderState *>(pkt->popSenderState());
+
+    MemSlavePort *port = safe_cast<MemSlavePort*>(senderState->port);
+    assert(port != nullptr);
+    delete senderState;
+
+    //port->htmCallback(pkt, htm_return_code);
+    DPRINTF(HtmMem, "HTM callback: start=%d, commit=%d, "
+                    "cancel=%d, rc=%d\n",
+            pkt->req->isHTMStart(), pkt->req->isHTMCommit(),
+            pkt->req->isHTMCancel(), htm_return_code);
+
+    // turn packet around to go back to requester if response expected
+    if (pkt->needsResponse()) {
+        DPRINTF(RubyPort, "Sending packet back over port\n");
+        pkt->makeHtmTransactionalReqResponse(
+            htmRetCodeConversion(htm_return_code));
+        port->schedTimingResp(pkt, curTick());
+    } else {
+        delete pkt;
+    }
+
+    trySendRetries();
+}
+
+void
+HTMSequencer::wakeup()
+{
+    Sequencer::wakeup();
+
+    // Check for deadlock of any of the requests
+    Cycles current_time = curCycle();
+
+    // hardware transactional memory commands
+    std::deque<SequencerRequest*>::iterator htm =
+      m_htmCmdRequestTable.begin();
+    std::deque<SequencerRequest*>::iterator htm_end =
+      m_htmCmdRequestTable.end();
+
+    for (; htm != htm_end; ++htm) {
+        SequencerRequest* request = *htm;
+        if (current_time - request->issue_time < m_deadlock_threshold)
+            continue;
+
+        panic("Possible Deadlock detected. Aborting!\n"
+              "version: %d m_htmCmdRequestTable: %d "
+              "current time: %u issue_time: %d difference: %d\n",
+              m_version, m_htmCmdRequestTable.size(),
+              current_time * clockPeriod(),
+              request->issue_time * clockPeriod(),
+              (current_time * clockPeriod()) -
+              (request->issue_time * clockPeriod()));
+    }
+}
+
+bool
+HTMSequencer::empty() const
+{
+    return Sequencer::empty() && m_htmCmdRequestTable.empty();
+}
+
+template <class VALUE>
+std::ostream &
+operator<<(ostream &out, const std::deque<VALUE> &queue)
+{
+    auto i = queue.begin();
+    auto end = queue.end();
+
+    out << "[";
+    for (; i != end; ++i)
+        out << " " << *i;
+    out << " ]";
+
+    return out;
+}
+
+void
+HTMSequencer::print(ostream& out) const
+{
+    Sequencer::print(out);
+
+    out << "+ [HTMSequencer: " << m_version
+        << ", htm cmd request table: " << m_htmCmdRequestTable
+        << "]";
+}
+
+// Insert the request into the appropriate request table. HTM commands are
+// queued in m_htmCmdRequestTable; all other requests are handled by
+// Sequencer::insertRequest, which returns RequestStatus_Aliased if an entry
+// for the line was already present.
+RequestStatus
+HTMSequencer::insertRequest(PacketPtr pkt, RubyRequestType primary_type,
+                            RubyRequestType secondary_type)
+{
+    if (isHtmCmdRequest(primary_type)) {
+        // for the moment, allow just one HTM cmd into the cache controller.
+        // Later this can be adjusted for optimization, e.g.
+        // back-to-back HTM_Starts.
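+        // HTM aborts are exempt: they cannot be retried, so they must
+        // always be accepted (see Sequencer::makeRequest).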
+        if ((m_htmCmdRequestTable.size() > 0) && !pkt->req->isHTMAbort())
+            return RequestStatus_BufferFull;
+
+        // insert request into HtmCmd queue
+        SequencerRequest* htmReq =
+            new SequencerRequest(pkt, primary_type, secondary_type,
+                curCycle());
+        assert(htmReq);
+        m_htmCmdRequestTable.push_back(htmReq);
+        return RequestStatus_Ready;
+    } else {
+        return Sequencer::insertRequest(pkt, primary_type, secondary_type);
+    }
+}
diff --git a/src/mem/ruby/system/HTMSequencer.hh b/src/mem/ruby/system/HTMSequencer.hh
new file mode 100644
index 0000000..5add836
--- /dev/null
+++ b/src/mem/ruby/system/HTMSequencer.hh
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2020 ARM Limited
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder.  You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __MEM_RUBY_SYSTEM_HTMSEQUENCER_HH__
+#define __MEM_RUBY_SYSTEM_HTMSEQUENCER_HH__
+
+#include <cassert>
+#include <iostream>
+
+#include "mem/htm.hh"
+#include "mem/ruby/protocol/HtmCallbackMode.hh"
+#include "mem/ruby/protocol/HtmFailedInCacheReason.hh"
+#include "mem/ruby/system/RubyPort.hh"
+#include "mem/ruby/system/Sequencer.hh"
+#include "params/RubyHTMSequencer.hh"
+
+class HTMSequencer : public Sequencer
+{
+  public:
+    HTMSequencer(const RubyHTMSequencerParams *p);
+    ~HTMSequencer();
+
+    // Callback to acknowledge HTM requests and to notify the CPU core
+    // when an HTM transaction fails in the cache
+    void htmCallback(Addr,
+                     const HtmCallbackMode,
+                     const HtmFailedInCacheReason);
+
+    bool empty() const override;
+    void print(std::ostream& out) const override;
+    void regStats() override;
+    void wakeup() override;
+
+  private:
+    /**
+     * Htm return code conversion
+     *
+     * This helper is a hack meant to convert the autogenerated ruby
+     * enum (HtmFailedInCacheReason) to the manually defined one
+     * (HtmCacheFailure). This is needed since the cpu code would
+     * otherwise have to include the ruby generated headers in order
+     * to handle the htm return code.
+     */
+    HtmCacheFailure htmRetCodeConversion(const HtmFailedInCacheReason rc);
+
+    void rubyHtmCallback(PacketPtr pkt, const HtmFailedInCacheReason fail_r);
+
+    RequestStatus insertRequest(PacketPtr pkt,
+                                RubyRequestType primary_type,
+                                RubyRequestType secondary_type) override;
+
+    // Private copy constructor and assignment operator
+    HTMSequencer(const HTMSequencer& obj);
+    HTMSequencer& operator=(const HTMSequencer& obj);
+
+    // Queue for hardware transactional memory commands; these carry no
+    // address, so a deque is used instead of the address-keyed request table.
+    std::deque<SequencerRequest*> m_htmCmdRequestTable;
+
+    Tick m_htmstart_tick;
+    Counter m_htmstart_instruction;
+
+    //! Histogram of cycle latencies of HTM transactions
+    Stats::Histogram m_htm_transaction_cycles;
+    //! Histogram of instruction lengths of HTM transactions
+    Stats::Histogram m_htm_transaction_instructions;
+    //! Causes for HTM transaction aborts
+    Stats::Vector m_htm_transaction_abort_cause;
+};
+
+inline std::ostream&
+operator<<(std::ostream& out, const HTMSequencer& obj)
+{
+    obj.print(out);
+    out << std::flush;
+    return out;
+}
+
+#endif // __MEM_RUBY_SYSTEM_HTMSEQUENCER_HH__
diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc
index 4510e3a..bb86e60 100644
--- a/src/mem/ruby/system/RubyPort.cc
+++ b/src/mem/ruby/system/RubyPort.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012-2013,2019 ARM Limited
+ * Copyright (c) 2012-2013,2020 ARM Limited
  * All rights reserved.
  *
  * The license below extends only to copyright in the software and shall
@@ -169,6 +169,7 @@
 {
     // got a response from a device
     assert(pkt->isResponse());
+    assert(!pkt->htmTransactionFailedInCache());
 
     // First we must retrieve the request port from the sender State
     RubyPort::SenderState *senderState =
@@ -253,6 +254,7 @@
     // pio port.
     if (pkt->cmd != MemCmd::MemSyncReq) {
         if (!isPhysMemAddress(pkt)) {
+            assert(!pkt->req->isHTMCmd());
             assert(ruby_port->memMasterPort.isConnected());
             DPRINTF(RubyPort, "Request address %#x assumed to be a "
                     "pio address\n", pkt->getAddr());
@@ -638,7 +640,6 @@
     }
 }
 
-
 int
 RubyPort::functionalWrite(Packet *func_pkt)
 {
diff --git a/src/mem/ruby/system/SConscript b/src/mem/ruby/system/SConscript
index 7496971..a5d2fb1 100644
--- a/src/mem/ruby/system/SConscript
+++ b/src/mem/ruby/system/SConscript
@@ -56,6 +56,7 @@
 Source('DMASequencer.cc')
 if env['BUILD_GPU']:
     Source('GPUCoalescer.cc')
+Source('HTMSequencer.cc')
 Source('RubyPort.cc')
 Source('RubyPortProxy.cc')
 Source('RubySystem.cc')
diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc
index 42398e2..75c58d6 100644
--- a/src/mem/ruby/system/Sequencer.cc
+++ b/src/mem/ruby/system/Sequencer.cc
@@ -55,6 +55,7 @@
 #include "mem/ruby/protocol/PrefetchBit.hh"
 #include "mem/ruby/protocol/RubyAccessMode.hh"
 #include "mem/ruby/slicc_interface/RubyRequest.hh"
+#include "mem/ruby/slicc_interface/RubySlicc_Util.hh"
 #include "mem/ruby/system/RubySystem.hh"
 #include "sim/system.hh"
 
@@ -149,6 +150,12 @@
 }
 
 void
+Sequencer::llscClearLocalMonitor()
+{
+    m_dataCache_ptr->clearLockedAll(m_version);
+}
+
+void
 Sequencer::wakeup()
 {
     assert(drainState() != DrainState::Draining);
@@ -243,7 +250,8 @@
     // Check if there is any outstanding request for the same cache line.
     auto &seq_req_list = m_RequestTable[line_addr];
     // Create a default entry
-    seq_req_list.emplace_back(pkt, primary_type, secondary_type, curCycle());
+    seq_req_list.emplace_back(pkt, primary_type,
+        secondary_type, curCycle());
     m_outstanding_count++;
 
     if (seq_req_list.size() > 1) {
@@ -569,7 +577,10 @@
 RequestStatus
 Sequencer::makeRequest(PacketPtr pkt)
 {
-    if (m_outstanding_count >= m_max_outstanding_requests) {
+    // HTM abort signals must be allowed to reach the Sequencer
+    // the same cycle they are issued. They cannot be retried.
+    if ((m_outstanding_count >= m_max_outstanding_requests) &&
+        !pkt->req->isHTMAbort()) {
         return RequestStatus_BufferFull;
     }
 
@@ -590,7 +601,7 @@
         if (pkt->isWrite()) {
             DPRINTF(RubySequencer, "Issuing SC\n");
             primary_type = RubyRequestType_Store_Conditional;
-#ifdef PROTOCOL_MESI_Three_Level
+#if defined (PROTOCOL_MESI_Three_Level) || defined (PROTOCOL_MESI_Three_Level_HTM)
             secondary_type = RubyRequestType_Store_Conditional;
 #else
             secondary_type = RubyRequestType_ST;
@@ -629,7 +640,10 @@
             //
             primary_type = secondary_type = RubyRequestType_ST;
         } else if (pkt->isRead()) {
-            if (pkt->req->isInstFetch()) {
+            // hardware transactional memory commands
+            if (pkt->req->isHTMCmd()) {
+                primary_type = secondary_type = htmCmdToRubyRequestType(pkt);
+            } else if (pkt->req->isInstFetch()) {
                 primary_type = secondary_type = RubyRequestType_IFETCH;
             } else {
                 bool storeCheck = false;
@@ -706,6 +720,14 @@
             printAddress(msg->getPhysicalAddress()),
             RubyRequestType_to_string(secondary_type));
 
+    // hardware transactional memory
+    // If the request originates in a transaction,
+    // then mark the Ruby message as such.
+    if (pkt->isHtmTransactional()) {
+        msg->m_htmFromTransaction = true;
+        msg->m_htmTransactionUid = pkt->getHtmTransactionUid();
+    }
+
     Tick latency = cyclesToTicks(
                         m_controller->mandatoryQueueLatency(secondary_type));
     assert(latency > 0);
diff --git a/src/mem/ruby/system/Sequencer.hh b/src/mem/ruby/system/Sequencer.hh
index ebca568..92fdab6 100644
--- a/src/mem/ruby/system/Sequencer.hh
+++ b/src/mem/ruby/system/Sequencer.hh
@@ -92,7 +92,7 @@
                         DataBlock& data);
 
     // Public Methods
-    void wakeup(); // Used only for deadlock detection
+    virtual void wakeup(); // Used only for deadlock detection
     void resetStats() override;
     void collateStats();
     void regStats() override;
@@ -114,7 +114,7 @@
                       const Cycles firstResponseTime = Cycles(0));
 
     RequestStatus makeRequest(PacketPtr pkt) override;
-    bool empty() const;
+    virtual bool empty() const;
     int outstandingCount() const override { return m_outstanding_count; }
 
     bool isDeadlockEventScheduled() const override
@@ -123,7 +123,7 @@
     void descheduleDeadlockEvent() override
     { deschedule(deadlockCheckEvent); }
 
-    void print(std::ostream& out) const;
+    virtual void print(std::ostream& out) const;
 
     void markRemoved();
     void evictionCallback(Addr address);
@@ -194,16 +194,22 @@
                            Cycles forwardRequestTime,
                            Cycles firstResponseTime);
 
-    RequestStatus insertRequest(PacketPtr pkt, RubyRequestType primary_type,
-                                RubyRequestType secondary_type);
-
     // Private copy constructor and assignment operator
     Sequencer(const Sequencer& obj);
     Sequencer& operator=(const Sequencer& obj);
 
+  protected:
+    // RequestTable contains both read and write requests, handles aliasing
+    std::unordered_map<Addr, std::list<SequencerRequest>> m_RequestTable;
+
+    Cycles m_deadlock_threshold;
+
+    virtual RequestStatus insertRequest(PacketPtr pkt,
+                                        RubyRequestType primary_type,
+                                        RubyRequestType secondary_type);
+
   private:
     int m_max_outstanding_requests;
-    Cycles m_deadlock_threshold;
 
     CacheMemory* m_dataCache_ptr;
     CacheMemory* m_instCache_ptr;
@@ -215,9 +221,6 @@
     Cycles m_data_cache_hit_latency;
     Cycles m_inst_cache_hit_latency;
 
-    // RequestTable contains both read and write requests, handles aliasing
-    std::unordered_map<Addr, std::list<SequencerRequest>> m_RequestTable;
-
     // Global outstanding request count, across all request tables
     int m_outstanding_count;
     bool m_deadlock_check_scheduled;
@@ -294,6 +297,13 @@
      * @return a boolean indicating if the line address was found.
      */
     bool llscCheckMonitor(const Addr);
+
+
+    /**
+     * Removes all addresses from the local monitor.
+     * This is independent of this Sequencer object's version id.
+     */
+    void llscClearLocalMonitor();
 };
 
 inline std::ostream&
diff --git a/src/mem/ruby/system/Sequencer.py b/src/mem/ruby/system/Sequencer.py
index f97224d..47c5b41 100644
--- a/src/mem/ruby/system/Sequencer.py
+++ b/src/mem/ruby/system/Sequencer.py
@@ -1,4 +1,5 @@
 # Copyright (c) 2009 Advanced Micro Devices, Inc.
+# Copyright (c) 2020 ARM Limited
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -70,6 +71,11 @@
    # 99 is the dummy default value
    coreid = Param.Int(99, "CorePair core id")
 
+class RubyHTMSequencer(RubySequencer):
+   type = 'RubyHTMSequencer'
+   cxx_class = 'HTMSequencer'
+   cxx_header = "mem/ruby/system/HTMSequencer.hh"
+
 class DMASequencer(RubyPort):
    type = 'DMASequencer'
    cxx_header = "mem/ruby/system/DMASequencer.hh"
diff --git a/src/mem/slicc/symbols/StateMachine.py b/src/mem/slicc/symbols/StateMachine.py
index 987f3b5..7f92d87 100644
--- a/src/mem/slicc/symbols/StateMachine.py
+++ b/src/mem/slicc/symbols/StateMachine.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019 ARM Limited
+# Copyright (c) 2019-2020 ARM Limited
 # All rights reserved.
 #
 # The license below extends only to copyright in the software and shall
@@ -54,6 +54,7 @@
                     "CacheMemory": "RubyCache",
                     "WireBuffer": "RubyWireBuffer",
                     "Sequencer": "RubySequencer",
+                    "HTMSequencer": "RubyHTMSequencer",
                     "GPUCoalescer" : "RubyGPUCoalescer",
                     "VIPERCoalescer" : "VIPERCoalescer",
                     "DirectoryMemory": "RubyDirectoryMemory",