cpu: HTM Implementation for O3CPU

JIRA: https://gem5.atlassian.net/browse/GEM5-587

Change-Id: I83787f4594963a15d856b81ad283b4f032d1c007
Signed-off-by: Giacomo Travaglini <giacomo.travaglini@arm.com>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/30328
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Jason Lowe-Power <power.jg@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
diff --git a/src/cpu/base_dyn_inst.hh b/src/cpu/base_dyn_inst.hh
index 31dee6c..b0e9ef2 100644
--- a/src/cpu/base_dyn_inst.hh
+++ b/src/cpu/base_dyn_inst.hh
@@ -61,6 +61,7 @@
 #include "cpu/op_class.hh"
 #include "cpu/static_inst.hh"
 #include "cpu/translation.hh"
+#include "debug/HtmCpu.hh"
 #include "mem/packet.hh"
 #include "mem/request.hh"
 #include "sim/byteswap.hh"
@@ -140,6 +141,7 @@
         IsStrictlyOrdered,
         ReqMade,
         MemOpDone,
+        HtmFromTransaction,
         MaxFlags
     };
 
@@ -240,6 +242,11 @@
     // Need a copy of main request pointer to verify on writes.
     RequestPtr reqToVerify;
 
+  private:
+    // hardware transactional memory
+    uint64_t htmUid;
+    uint64_t htmDepth;
+
   protected:
     /** Flattened register index of the destination registers of this
      *  instruction.
@@ -548,8 +555,8 @@
 
     uint64_t getHtmTransactionUid() const override
     {
-        panic("Not yet implemented\n");
-        return 0;
+        assert(instFlags[HtmFromTransaction]);
+        return this->htmUid;
     }
 
     uint64_t newHtmTransactionUid() const override
@@ -560,14 +567,35 @@
 
     bool inHtmTransactionalState() const override
     {
-        panic("Not yet implemented\n");
-        return false;
+        return instFlags[HtmFromTransaction];
     }
 
     uint64_t getHtmTransactionalDepth() const override
     {
-        panic("Not yet implemented\n");
-        return 0;
+        if (inHtmTransactionalState())
+            return this->htmDepth;
+        else
+            return 0;
+    }
+
+    void setHtmTransactionalState(uint64_t htm_uid, uint64_t htm_depth)
+    {
+        instFlags.set(HtmFromTransaction);
+        htmUid = htm_uid;
+        htmDepth = htm_depth;
+    }
+
+    /** Drop the transactional tag from this instruction, if it has one. */
+    void clearHtmTransactionalState()
+    {
+        if (inHtmTransactionalState()) {
+            DPRINTF(HtmCpu,
+                "clearing instruction's transactional state htmUid=%u\n",
+                getHtmTransactionUid());
+            instFlags.reset(HtmFromTransaction);
+            htmUid = -1;
+            htmDepth = 0;
+        }
     }
 
     /** Temporarily sets this instruction as a serialize before instruction. */
@@ -997,8 +1025,9 @@
 Fault
 BaseDynInst<Impl>::initiateHtmCmd(Request::Flags flags)
 {
-    panic("Not yet implemented\n");
-    return NoFault;
+    return cpu->pushRequest(
+            dynamic_cast<typename DynInstPtr::PtrType>(this),
+            /* ld */ true, nullptr, 8, 0x0ul, flags, nullptr, nullptr);
 }
 
 template<class Impl>
diff --git a/src/cpu/base_dyn_inst_impl.hh b/src/cpu/base_dyn_inst_impl.hh
index 45b938d..bfe8ff5 100644
--- a/src/cpu/base_dyn_inst_impl.hh
+++ b/src/cpu/base_dyn_inst_impl.hh
@@ -95,6 +95,9 @@
     physEffAddr = 0;
     readyRegs = 0;
     memReqFlags = 0;
+    // hardware transactional memory
+    htmUid = -1;
+    htmDepth = 0;
 
     status.reset();
 
diff --git a/src/cpu/o3/commit.hh b/src/cpu/o3/commit.hh
index 69d1c86..85d00a9 100644
--- a/src/cpu/o3/commit.hh
+++ b/src/cpu/o3/commit.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010-2012, 2014 ARM Limited
+ * Copyright (c) 2010-2012, 2014, 2019 ARM Limited
  * All rights reserved.
  *
  * The license below extends only to copyright in the software and shall
@@ -205,6 +205,12 @@
     /** Deschedules a thread from scheduling */
     void deactivateThread(ThreadID tid);
 
+    /** Is the CPU currently processing a HTM transaction? */
+    bool executingHtmTransaction(ThreadID) const;
+
+    /* Reset HTM tracking, e.g. after an abort */
+    void resetHtmStartsStops(ThreadID);
+
     /** Ticks the commit stage, which tries to commit instructions. */
     void tick();
 
@@ -473,6 +479,11 @@
     /** Updates commit stats based on this instruction. */
     void updateComInstStats(const DynInstPtr &inst);
 
+
+    // HTM
+    int htmStarts[Impl::MaxThreads];
+    int htmStops[Impl::MaxThreads];
+
     /** Stat for the total number of squashed instructions discarded by commit.
      */
     Stats::Scalar commitSquashedInsts;
diff --git a/src/cpu/o3/commit_impl.hh b/src/cpu/o3/commit_impl.hh
index de79206..73041ba 100644
--- a/src/cpu/o3/commit_impl.hh
+++ b/src/cpu/o3/commit_impl.hh
@@ -60,6 +60,7 @@
 #include "debug/CommitRate.hh"
 #include "debug/Drain.hh"
 #include "debug/ExecFaulting.hh"
+#include "debug/HtmCpu.hh"
 #include "debug/O3PipeView.hh"
 #include "params/DerivO3CPU.hh"
 #include "sim/faults.hh"
@@ -121,6 +122,8 @@
         committedStores[tid] = false;
         checkEmptyROB[tid] = false;
         renameMap[tid] = nullptr;
+        htmStarts[tid] = 0;
+        htmStops[tid] = 0;
     }
     interrupt = NoFault;
 }
@@ -404,6 +407,14 @@
 {
     assert(isDrained());
     rob->drainSanityCheck();
+
+    // hardware transactional memory
+    // cannot drain partially through a transaction
+    for (ThreadID tid = 0; tid < numThreads; tid++) {
+        if (executingHtmTransaction(tid)) {
+            panic("cannot drain partially through a HTM transaction");
+        }
+    }
 }
 
 template <class Impl>
@@ -462,6 +473,27 @@
     }
 }
 
+/** True while @p tid has committed more HTM starts than HTM stops. */
+template <class Impl>
+bool
+DefaultCommit<Impl>::executingHtmTransaction(ThreadID tid) const
+{
+    // An invalid thread is never reported as inside a transaction.
+    const bool valid_thread = (tid != InvalidThreadID);
+    return valid_thread && htmStops[tid] < htmStarts[tid];
+}
+
+/** Zero the HTM start/stop counters of @p tid, e.g. after an abort. */
+template <class Impl>
+void
+DefaultCommit<Impl>::resetHtmStartsStops(ThreadID tid)
+{
+    // Nothing to reset for an invalid thread.
+    if (tid == InvalidThreadID)
+        return;
+    htmStarts[tid] = htmStops[tid] = 0;
+}
+
 
 template <class Impl>
 void
@@ -532,6 +564,14 @@
     Cycles latency = dynamic_pointer_cast<SyscallRetryFault>(inst_fault) ?
                      cpu->syscallRetryLatency : trapLatency;
 
+    // hardware transactional memory
+    if (inst_fault != nullptr &&
+        std::dynamic_pointer_cast<GenericHtmFailureFault>(inst_fault)) {
+        // TODO
+        // latency = default abort/restore latency
+        // could also do some kind of exponential back off if desired
+    }
+
     cpu->schedule(trap, cpu->clockEdge(latency));
     trapInFlight[tid] = true;
     thread[tid]->trapPending = true;
@@ -991,13 +1031,28 @@
     // Commit as many instructions as possible until the commit bandwidth
     // limit is reached, or it becomes impossible to commit any more.
     while (num_committed < commitWidth) {
-        // Check for any interrupt that we've already squashed for
-        // and start processing it.
-        if (interrupt != NoFault)
-            handleInterrupt();
+        // hardware transactional memory
+        // If executing within a transaction,
+        // need to handle interrupts specially
 
         ThreadID commit_thread = getCommittingThread();
 
+        // Check for any interrupt that we've already squashed for
+        // and start processing it.
+        if (interrupt != NoFault) {
+            // If inside a transaction, postpone interrupts
+            if (executingHtmTransaction(commit_thread)) {
+                cpu->clearInterrupts(0);
+                toIEW->commitInfo[0].clearInterrupt = true;
+                interrupt = NoFault;
+                avoidQuiesceLiveLock = true;
+            } else {
+                handleInterrupt();
+            }
+        }
+
+        // commit_thread was fetched above so the HTM check could use it.
+
         if (commit_thread == -1 || !rob->isHeadReady(commit_thread))
             break;
 
@@ -1044,6 +1099,23 @@
                 statCommittedInstType[tid][head_inst->opClass()]++;
                 ppCommit->notify(head_inst);
 
+                // hardware transactional memory
+
+                // update nesting depth
+                if (head_inst->isHtmStart())
+                    htmStarts[tid]++;
+
+                // sanity check
+                if (head_inst->inHtmTransactionalState()) {
+                    assert(executingHtmTransaction(tid));
+                } else {
+                    assert(!executingHtmTransaction(tid));
+                }
+
+                // update nesting depth
+                if (head_inst->isHtmStop())
+                    htmStops[tid]++;
+
                 changedROBNumEntries[tid] = true;
 
                 // Set the doneSeqNum to the youngest committed instruction.
@@ -1206,6 +1278,23 @@
     // Check if the instruction caused a fault.  If so, trap.
     Fault inst_fault = head_inst->getFault();
 
+    // hardware transactional memory
+    // if a fault occurred within a HTM transaction
+    // ensure that the transaction aborts
+    if (inst_fault != NoFault && head_inst->inHtmTransactionalState()) {
+        // There exists a generic HTM fault common to all ISAs
+        if (!std::dynamic_pointer_cast<GenericHtmFailureFault>(inst_fault)) {
+            DPRINTF(HtmCpu, "%s - fault (%s) encountered within transaction"
+                            " - converting to GenericHtmFailureFault\n",
+            head_inst->staticInst->getName(), inst_fault->name());
+            inst_fault = std::make_shared<GenericHtmFailureFault>(
+                head_inst->getHtmTransactionUid(),
+                HtmFailureFaultCause::EXCEPTION);
+        }
+        // If this point is reached and the fault inherits from the HTM fault,
+        // then there is no need to raise a new fault
+    }
+
     // Stores mark themselves as completed.
     if (!head_inst->isStore() && inst_fault == NoFault) {
         head_inst->setCompleted();
@@ -1301,6 +1390,11 @@
                                  head_inst->renamedDestRegIdx(i));
     }
 
+    // hardware transactional memory
+    // the HTM UID is purely for correctness and debugging purposes
+    if (head_inst->isHtmStart())
+        iewStage->setLastRetiredHtmUid(tid, head_inst->getHtmTransactionUid());
+
     // Finally clear the head ROB entry.
     rob->retireHead(tid);
 
diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc
index 613ffd1..ed69b1a 100644
--- a/src/cpu/o3/cpu.cc
+++ b/src/cpu/o3/cpu.cc
@@ -623,6 +623,10 @@
 void
 FullO3CPU<Impl>::deactivateThread(ThreadID tid)
 {
+    // hardware transactional memory
+    // shouldn't deactivate thread in the middle of a transaction
+    assert(!commit.executingHtmTransaction(tid));
+
     //Remove From Active List, if Active
     list<ThreadID>::iterator thread_it =
         std::find(activeThreads.begin(), activeThreads.end(), tid);
@@ -1829,10 +1833,38 @@
 
 template <class Impl>
 void
-FullO3CPU<Impl>::htmSendAbortSignal(ThreadID tid, uint64_t htmUid,
+FullO3CPU<Impl>::htmSendAbortSignal(ThreadID tid, uint64_t htm_uid,
      HtmFailureFaultCause cause)
 {
-    panic("not yet supported!");
+    const Addr addr = 0x0ul;
+    const int size = 8;
+    const Request::Flags flags =
+      Request::PHYSICAL|Request::STRICT_ORDER|Request::HTM_ABORT;
+
+    // O3-specific actions: drop the nesting depth tracked by LSQ and commit
+    this->iew.ldstQueue.resetHtmStartsStops(tid);
+    this->commit.resetHtmStartsStops(tid);
+
+    // notify l1 d-cache (ruby) that core has aborted transaction
+    RequestPtr req =
+        std::make_shared<Request>(addr, size, flags, _dataMasterId);
+
+    req->taskId(taskId());
+    req->setContext(this->thread[tid]->contextId());
+    req->setHtmAbortCause(cause);
+
+    assert(req->isHTMAbort());
+
+    PacketPtr abort_pkt = Packet::createRead(req);
+    // Hand the zeroed dummy payload to the packet (dataDynamic) so it is
+    // freed together with the packet; dataStatic would leak it per abort.
+    abort_pkt->dataDynamic(new uint8_t[size]());
+    abort_pkt->setHtmTransactional(htm_uid);
+
+    // TODO include correct error handling here
+    if (!this->iew.ldstQueue.getDataPort().sendTimingReq(abort_pkt)) {
+        panic("HTM abort signal was not sent to the memory subsystem.");
+    }
 }
 
 // Forward declaration of FullO3CPU.
diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh
index 137fbc8..0447275 100644
--- a/src/cpu/o3/cpu.hh
+++ b/src/cpu/o3/cpu.hh
@@ -61,7 +61,6 @@
 #include "cpu/base.hh"
 #include "cpu/simple_thread.hh"
 #include "cpu/timebuf.hh"
-//#include "cpu/o3/thread_context.hh"
 #include "params/DerivO3CPU.hh"
 #include "sim/process.hh"
 
diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh
index 7f3409e..4dbb9ef 100644
--- a/src/cpu/o3/iew.hh
+++ b/src/cpu/o3/iew.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010-2012, 2014 ARM Limited
+ * Copyright (c) 2010-2012, 2014, 2019 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -233,6 +233,16 @@
     /** Check misprediction  */
     void checkMisprediction(const DynInstPtr &inst);
 
+    // hardware transactional memory
+    // For debugging purposes, it is useful to keep track of the most recent
+    // htmUid that has been committed (architecturally, not transactionally)
+    // to ensure that the core and the memory subsystem are observing
+    // correct ordering constraints.
+    void setLastRetiredHtmUid(ThreadID tid, uint64_t htmUid)
+    {
+        ldstQueue.setLastRetiredHtmUid(tid, htmUid);
+    }
+
   private:
     /** Sends commit proper information for a squash due to a branch
      * mispredict.
diff --git a/src/cpu/o3/iew_impl.hh b/src/cpu/o3/iew_impl.hh
index 99dfd19..9a04fe6 100644
--- a/src/cpu/o3/iew_impl.hh
+++ b/src/cpu/o3/iew_impl.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010-2013, 2018 ARM Limited
+ * Copyright (c) 2010-2013, 2018-2019 ARM Limited
  * Copyright (c) 2013 Advanced Micro Devices, Inc.
  * All rights reserved.
  *
@@ -1051,6 +1051,20 @@
             break;
         }
 
+        // hardware transactional memory
+        // CPU needs to track transactional state in program order.
+        const int numHtmStarts = ldstQueue.numHtmStarts(tid);
+        const int numHtmStops = ldstQueue.numHtmStops(tid);
+        const int htmDepth = numHtmStarts - numHtmStops;
+
+        if (htmDepth > 0) {
+            inst->setHtmTransactionalState(ldstQueue.getLatestHtmUid(tid),
+                                            htmDepth);
+        } else {
+            inst->clearHtmTransactionalState();
+        }
+
+
         // Otherwise issue the instruction just fine.
         if (inst->isAtomic()) {
             DPRINTF(IEW, "[tid:%i] Issue: Memory instruction "
diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh
index 9ef3b0c..35c2873 100644
--- a/src/cpu/o3/lsq.hh
+++ b/src/cpu/o3/lsq.hh
@@ -687,6 +687,8 @@
         {
             flags.set(Flag::Complete);
         }
+
+        virtual std::string name() const { return "LSQRequest"; }
     };
 
     class SingleDataRequest : public LSQRequest
@@ -739,6 +741,35 @@
         virtual void buildPackets();
         virtual Cycles handleLocalAccess(ThreadContext *thread, PacketPtr pkt);
         virtual bool isCacheBlockHit(Addr blockAddr, Addr cacheBlockMask);
+        virtual std::string name() const { return "SingleDataRequest"; }
+    };
+
+    // hardware transactional memory
+    // This class extends SingleDataRequest for the sole purpose
+    // of encapsulating hardware transactional memory command requests
+    class HtmCmdRequest : public SingleDataRequest
+    {
+    protected:
+      /* Given that we are inside templates, children need explicit
+       * declaration of the names in the parent class. */
+      using Flag = typename LSQRequest::Flag;
+      using State = typename LSQRequest::State;
+      using LSQRequest::_addr;
+      using LSQRequest::_size;
+      using LSQRequest::_byteEnable;
+      using LSQRequest::_requests;
+      using LSQRequest::_inst;
+      using LSQRequest::_taskId;
+      using LSQRequest::flags;
+      using LSQRequest::setState;
+    public:
+      HtmCmdRequest(LSQUnit* port, const DynInstPtr& inst,
+                        const Request::Flags& flags_);
+      inline virtual ~HtmCmdRequest() {}
+      virtual void initiateTranslation();
+      virtual void finish(const Fault &fault, const RequestPtr &req,
+              ThreadContext* tc, BaseTLB::Mode mode);
+      virtual std::string name() const { return "HtmCmdRequest"; }
     };
 
     class SplitDataRequest : public LSQRequest
@@ -815,6 +846,7 @@
 
         virtual RequestPtr mainRequest();
         virtual PacketPtr mainPacket();
+        virtual std::string name() const { return "SplitDataRequest"; }
     };
 
     /** Constructs an LSQ with the given parameters. */
@@ -933,6 +965,44 @@
     /** Returns the total number of stores for a single thread. */
     int numStores(ThreadID tid) { return thread.at(tid).numStores(); }
 
+
+    // hardware transactional memory
+
+    /** HTM starts counted by @p tid's LSQ unit (0 if tid is invalid). */
+    int numHtmStarts(ThreadID tid) const
+    {
+        if (tid == InvalidThreadID)
+            return 0;
+        return thread[tid].numHtmStarts();
+    }
+    /** HTM stops counted by @p tid's LSQ unit (0 if tid is invalid). */
+    int numHtmStops(ThreadID tid) const
+    {
+        if (tid == InvalidThreadID)
+            return 0;
+        return thread[tid].numHtmStops();
+    }
+
+    void resetHtmStartsStops(ThreadID tid)
+    {
+        if (tid != InvalidThreadID)
+            thread[tid].resetHtmStartsStops();
+    }
+
+    /** Latest HTM uid from @p tid's LSQ unit (0 if tid is invalid). */
+    uint64_t getLatestHtmUid(ThreadID tid) const
+    {
+        if (tid == InvalidThreadID)
+            return 0;
+        return thread[tid].getLatestHtmUid();
+    }
+
+    void setLastRetiredHtmUid(ThreadID tid, uint64_t htmUid)
+    {
+        if (tid != InvalidThreadID)
+            thread[tid].setLastRetiredHtmUid(htmUid);
+    }
+
     /** Returns the number of free load entries. */
     unsigned numFreeLoadEntries();
 
diff --git a/src/cpu/o3/lsq_impl.hh b/src/cpu/o3/lsq_impl.hh
index 1ca7d53..a535dcc 100644
--- a/src/cpu/o3/lsq_impl.hh
+++ b/src/cpu/o3/lsq_impl.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2012, 2014, 2017-2018 ARM Limited
+ * Copyright (c) 2011-2012, 2014, 2017-2019 ARM Limited
  * Copyright (c) 2013 Advanced Micro Devices, Inc.
  * All rights reserved
  *
@@ -51,6 +51,7 @@
 #include "cpu/o3/lsq.hh"
 #include "debug/Drain.hh"
 #include "debug/Fetch.hh"
+#include "debug/HtmCpu.hh"
 #include "debug/LSQ.hh"
 #include "debug/Writeback.hh"
 #include "params/DerivO3CPU.hh"
@@ -706,11 +707,17 @@
     // lines. For now, such cross-line update is not supported.
     assert(!isAtomic || (isAtomic && !needs_burst));
 
+    const bool htm_cmd = isLoad && (flags & Request::HTM_CMD);
+
     if (inst->translationStarted()) {
         req = inst->savedReq;
         assert(req);
     } else {
-        if (needs_burst) {
+        if (htm_cmd) {
+            assert(addr == 0x0lu);
+            assert(size == 8);
+            req = new HtmCmdRequest(&thread[tid], inst, flags);
+        } else if (needs_burst) {
             req = new SplitDataRequest(&thread[tid], inst, isLoad, addr,
                     size, flags, data, res);
         } else {
@@ -1033,6 +1040,23 @@
                     :  Packet::createWrite(request()));
         _packets.back()->dataStatic(_inst->memData);
         _packets.back()->senderState = _senderState;
+
+        // hardware transactional memory
+        // If request originates in a transaction (not necessarily a HtmCmd),
+        // then the packet should be marked as such.
+        if (_inst->inHtmTransactionalState()) {
+            _packets.back()->setHtmTransactional(
+                _inst->getHtmTransactionUid());
+
+            DPRINTF(HtmCpu,
+              "HTM %s pc=0x%lx - vaddr=0x%lx - paddr=0x%lx - htmUid=%u\n",
+              isLoad() ? "LD" : "ST",
+              _inst->instAddr(),
+              _packets.back()->req->hasVaddr() ?
+                  _packets.back()->req->getVaddr() : 0lu,
+              _packets.back()->getAddr(),
+              _inst->getHtmTransactionUid());
+        }
     }
     assert(_packets.size() == 1);
 }
@@ -1049,6 +1073,21 @@
         if (isLoad()) {
             _mainPacket = Packet::createRead(mainReq);
             _mainPacket->dataStatic(_inst->memData);
+
+            // hardware transactional memory
+            // If request originates in a transaction,
+            // packet should be marked as such
+            if (_inst->inHtmTransactionalState()) {
+                _mainPacket->setHtmTransactional(
+                    _inst->getHtmTransactionUid());
+                DPRINTF(HtmCpu,
+                  "HTM LD.0 pc=0x%lx-vaddr=0x%lx-paddr=0x%lx-htmUid=%u\n",
+                  _inst->instAddr(),
+                  _mainPacket->req->hasVaddr() ?
+                      _mainPacket->req->getVaddr() : 0lu,
+                  _mainPacket->getAddr(),
+                  _inst->getHtmTransactionUid());
+            }
         }
         for (int i = 0; i < _requests.size() && _fault[i] == NoFault; i++) {
             RequestPtr r = _requests[i];
@@ -1066,6 +1105,23 @@
             }
             pkt->senderState = _senderState;
             _packets.push_back(pkt);
+
+            // hardware transactional memory
+            // If request originates in a transaction,
+            // packet should be marked as such
+            if (_inst->inHtmTransactionalState()) {
+                _packets.back()->setHtmTransactional(
+                    _inst->getHtmTransactionUid());
+                DPRINTF(HtmCpu,
+                  "HTM %s.%d pc=0x%lx-vaddr=0x%lx-paddr=0x%lx-htmUid=%u\n",
+                  isLoad() ? "LD" : "ST",
+                  i+1,
+                  _inst->instAddr(),
+                  _packets.back()->req->hasVaddr() ?
+                      _packets.back()->req->getVaddr() : 0lu,
+                  _packets.back()->getAddr(),
+                  _inst->getHtmTransactionUid());
+            }
         }
     }
     assert(_packets.size() > 0);
@@ -1192,4 +1248,59 @@
     lsq->recvReqRetry();
 }
 
+template<class Impl>
+LSQ<Impl>::HtmCmdRequest::HtmCmdRequest(LSQUnit* port,
+                  const DynInstPtr& inst,
+                  const Request::Flags& flags_) :
+    SingleDataRequest(port, inst, true, 0x0lu, 8, flags_,
+        nullptr, nullptr, nullptr)
+{
+    assert(_requests.size() == 0);
+
+    // addRequest() must leave at least one request behind.
+    this->addRequest(_addr, _size, _byteEnable);
+    if (_requests.size() == 0)
+        panic("unexpected behaviour");
+    // No translation happens: the dummy address doubles as the paddr.
+    const auto &htm_req = _requests.back();
+    htm_req->setReqInstSeqNum(_inst->seqNum);
+    htm_req->taskId(_taskId);
+    htm_req->setPaddr(_addr);
+    htm_req->setInstCount(_inst->getCpuPtr()->totalInsts());
+
+    // Record the request's properties on the issuing instruction.
+    _inst->strictlyOrdered(htm_req->isStrictlyOrdered());
+    _inst->fault = NoFault;
+    _inst->physEffAddr = htm_req->getPaddr();
+    _inst->memReqFlags = htm_req->getFlags();
+    _inst->savedReq = this;
+    setState(State::Translation);
+}
+
+template<class Impl>
+void
+LSQ<Impl>::HtmCmdRequest::initiateTranslation()
+{
+    // Transaction commands are implemented as loads to avoid significant
+    // changes to the cpu and memory interfaces. Their virtual and physical
+    // address is the dummy value 0x0, so no address translation takes
+    // place: mark it as started and finished straight away.
+
+    flags.set(Flag::TranslationStarted);
+    flags.set(Flag::TranslationFinished);
+
+    _inst->translationStarted(true);
+    _inst->translationCompleted(true);
+
+    setState(State::Request);
+}
+
+template<class Impl>
+void
+LSQ<Impl>::HtmCmdRequest::finish(const Fault &fault, const RequestPtr &req,
+        ThreadContext* tc, BaseTLB::Mode mode)
+{
+    panic("unexpected behaviour");
+}
+
 #endif//__CPU_O3_LSQ_IMPL_HH__
diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
index 06d43ee..70995d6 100644
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -53,6 +53,7 @@
 #include "config/the_isa.hh"
 #include "cpu/inst_seq.hh"
 #include "cpu/timebuf.hh"
+#include "debug/HtmCpu.hh"
 #include "debug/LSQUnit.hh"
 #include "mem/packet.hh"
 #include "mem/port.hh"
@@ -312,6 +313,21 @@
     /** Returns the number of stores in the SQ. */
     int numStores() { return stores; }
 
+    // hardware transactional memory
+    int numHtmStarts() const { return htmStarts; }
+    int numHtmStops() const { return htmStops; }
+    void resetHtmStartsStops() { htmStarts = htmStops = 0; }
+    uint64_t getLatestHtmUid() const
+    {
+        const auto& htm_cpt = cpu->tcBase(lsqID)->getHtmCheckpointPtr();
+        return htm_cpt->getHtmUid();
+    }
+    void setLastRetiredHtmUid(uint64_t htm_uid)
+    {
+        assert(htm_uid >= lastRetiredHtmUid);
+        lastRetiredHtmUid = htm_uid;
+    }
+
     /** Returns if either the LQ or SQ is full. */
     bool isFull() { return lqFull() || sqFull(); }
 
@@ -496,6 +512,13 @@
     /** The number of store instructions in the SQ waiting to writeback. */
     int storesToWB;
 
+    // hardware transactional memory
+    // nesting depth
+    int htmStarts;
+    int htmStops;
+    // sanity checks and debugging
+    uint64_t lastRetiredHtmUid;
+
     /** The index of the first instruction that may be ready to be
      * written back, and has not yet been written back.
      */
@@ -665,6 +688,7 @@
 
     if (req->mainRequest()->isLocalAccess()) {
         assert(!load_inst->memData);
+        assert(!load_inst->inHtmTransactionalState());
         load_inst->memData = new uint8_t[MaxDataBytes];
 
         ThreadContext *thread = cpu->tcBase(lsqID);
@@ -679,6 +703,37 @@
         return NoFault;
     }
 
+    // hardware transactional memory
+    if (req->mainRequest()->isHTMStart() || req->mainRequest()->isHTMCommit())
+    {
+        // don't want to send nested transactionStarts and
+        // transactionStops outside of core, e.g. to Ruby
+        if (req->mainRequest()->getFlags().isSet(Request::NO_ACCESS)) {
+            Cycles delay(0);
+            PacketPtr data_pkt =
+                new Packet(req->mainRequest(), MemCmd::ReadReq);
+
+            // Allocate memory if this is the first time a load is issued.
+            if (!load_inst->memData) {
+                load_inst->memData =
+                    new uint8_t[req->mainRequest()->getSize()];
+                // sanity checks espect zero in request's data
+                memset(load_inst->memData, 0, req->mainRequest()->getSize());
+            }
+
+            data_pkt->dataStatic(load_inst->memData);
+            if (load_inst->inHtmTransactionalState()) {
+                data_pkt->setHtmTransactional(
+                    load_inst->getHtmTransactionUid());
+            }
+            data_pkt->makeResponse();
+
+            WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt, this);
+            cpu->schedule(wb, cpu->clockEdge(delay));
+            return NoFault;
+        }
+    }
+
     // Check the SQ for any previous stores that might lead to forwarding
     auto store_it = load_inst->sqIt;
     assert (store_it >= storeWBIt);
@@ -771,6 +826,35 @@
                         MemCmd::ReadReq);
                 data_pkt->dataStatic(load_inst->memData);
 
+                // hardware transactional memory
+                // Store to load forwarding within a transaction
+                // This should be okay because the store will be sent to
+                // the memory subsystem and subsequently get added to the
+                // write set of the transaction. The write set has a stronger
+                // property than the read set, so the load doesn't necessarily
+                // have to be there.
+                assert(!req->mainRequest()->isHTMCmd());
+                if (load_inst->inHtmTransactionalState()) {
+                    assert (!storeQueue[store_it._idx].completed());
+                    assert (
+                        storeQueue[store_it._idx].instruction()->
+                          inHtmTransactionalState());
+                    assert (
+                        load_inst->getHtmTransactionUid() ==
+                        storeQueue[store_it._idx].instruction()->
+                          getHtmTransactionUid());
+                    data_pkt->setHtmTransactional(
+                        load_inst->getHtmTransactionUid());
+                    DPRINTF(HtmCpu, "HTM LD (ST2LDF) "
+                      "pc=0x%lx - vaddr=0x%lx - "
+                      "paddr=0x%lx - htmUid=%u\n",
+                      load_inst->instAddr(),
+                      data_pkt->req->hasVaddr() ?
+                        data_pkt->req->getVaddr() : 0lu,
+                      data_pkt->getAddr(),
+                      load_inst->getHtmTransactionUid());
+                }
+
                 if (req->isAnyOutstandingRequest()) {
                     assert(req->_numOutstandingPackets > 0);
                     // There are memory requests packets in flight already.
@@ -841,6 +925,15 @@
         load_inst->memData = new uint8_t[req->mainRequest()->getSize()];
     }
 
+
+    // hardware transactional memory
+    if (req->mainRequest()->isHTMCmd()) {
+        // this is a simple sanity check
+        // the Ruby cache controller will set
+        // memData to 0x0ul if successful.
+        *load_inst->memData = (uint64_t) 0x1ull;
+    }
+
     // For now, load throughput is constrained by the number of
     // load FUs only, and loads do not consume a cache port (only
     // stores do).
diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh
index c39f894..fcbfc9c 100644
--- a/src/cpu/o3/lsq_unit_impl.hh
+++ b/src/cpu/o3/lsq_unit_impl.hh
@@ -51,6 +51,7 @@
 #include "cpu/o3/lsq.hh"
 #include "cpu/o3/lsq_unit.hh"
 #include "debug/Activity.hh"
+#include "debug/HtmCpu.hh"
 #include "debug/IEW.hh"
 #include "debug/LSQUnit.hh"
 #include "debug/O3PipeView.hh"
@@ -112,6 +113,59 @@
     LSQSenderState *state = dynamic_cast<LSQSenderState *>(pkt->senderState);
     DynInstPtr inst = state->inst;
 
+    // hardware transactional memory
+    // sanity check
+    if (pkt->isHtmTransactional() && !inst->isSquashed()) {
+        assert(inst->getHtmTransactionUid() == pkt->getHtmTransactionUid());
+    }
+
+    // if in a HTM transaction, it's possible
+    // to abort within the cache hierarchy.
+    // This is signalled back to the processor
+    // through responses to memory requests.
+    if (pkt->htmTransactionFailedInCache()) {
+        // cannot do this for write requests because
+        // they cannot tolerate faults
+        const HtmCacheFailure htm_rc =
+            pkt->getHtmTransactionFailedInCacheRC();
+        if(pkt->isWrite()) {
+            DPRINTF(HtmCpu,
+                "store notification (ignored) of HTM transaction failure "
+                "in cache - addr=0x%lx - rc=%s - htmUid=%d\n",
+                pkt->getAddr(), htmFailureToStr(htm_rc),
+                pkt->getHtmTransactionUid());
+        } else {
+            HtmFailureFaultCause fail_reason =
+                HtmFailureFaultCause::INVALID;
+
+            if (htm_rc == HtmCacheFailure::FAIL_SELF) {
+                fail_reason = HtmFailureFaultCause::SIZE;
+            } else if (htm_rc == HtmCacheFailure::FAIL_REMOTE) {
+                fail_reason = HtmFailureFaultCause::MEMORY;
+            } else if (htm_rc == HtmCacheFailure::FAIL_OTHER) {
+                // these are likely loads that were issued out of order
+                // they are faulted here, but it's unlikely that these will
+                // ever reach the commit head.
+                fail_reason = HtmFailureFaultCause::OTHER;
+            } else {
+                panic("HTM error - unhandled return code from cache (%s)",
+                      htmFailureToStr(htm_rc));
+            }
+
+            inst->fault =
+            std::make_shared<GenericHtmFailureFault>(
+                inst->getHtmTransactionUid(),
+                fail_reason);
+
+            DPRINTF(HtmCpu,
+                "load notification of HTM transaction failure "
+                "in cache - pc=%s - addr=0x%lx - "
+                "rc=%u - htmUid=%d\n",
+                inst->pcState(), pkt->getAddr(),
+                htmFailureToStr(htm_rc), pkt->getHtmTransactionUid());
+        }
+    }
+
     cpu->ppDataAccessComplete->notify(std::make_pair(inst, pkt));
 
     /* Notify the sender state that the access is complete (for ownership
@@ -125,6 +179,13 @@
             // after receving the response from the memory
             assert(inst->isLoad() || inst->isStoreConditional() ||
                    inst->isAtomic());
+
+            // HTM: forward a cache-reported failure code to the main packet
+            if (pkt->htmTransactionFailedInCache()) {
+                state->request()->mainPacket()->setHtmTransactionFailedInCache(
+                    pkt->getHtmTransactionFailedInCacheRC() );
+            }
+
             writeback(inst, state->request()->mainPacket());
             if (inst->isStore() || inst->isAtomic()) {
                 auto ss = dynamic_cast<SQSenderState*>(state);
@@ -142,7 +203,10 @@
 template <class Impl>
 LSQUnit<Impl>::LSQUnit(uint32_t lqEntries, uint32_t sqEntries)
     : lsqID(-1), storeQueue(sqEntries+1), loadQueue(lqEntries+1),
-      loads(0), stores(0), storesToWB(0), cacheBlockMask(0), stalled(false),
+      loads(0), stores(0), storesToWB(0),
+      htmStarts(0), htmStops(0),
+      lastRetiredHtmUid(0),
+      cacheBlockMask(0), stalled(false),
       isStoreBlocked(false), storeInFlight(false), hasPendingRequest(false),
       pendingRequest(nullptr)
 {
@@ -176,6 +240,9 @@
 {
     loads = stores = storesToWB = 0;
 
+    // hardware transactional memory: clear the HtmStart/HtmStop counters
+    // whose difference gives the transaction nesting depth
+    htmStarts = htmStops = 0;
 
     storeWBIt = storeQueue.begin();
 
@@ -306,6 +373,45 @@
     load_inst->lqIt = loadQueue.getIterator(load_inst->lqIdx);
 
     ++loads;
+
+    // hardware transactional memory
+    // Transactional state and nesting depth have to be maintained by the
+    // in-order part of the core, hence the bookkeeping done here.
+    if (load_inst->isHtmStart()) {
+        ++htmStarts;
+        DPRINTF(HtmCpu, ">> htmStarts++ (%d) : htmStops (%d)\n",
+                htmStarts, htmStops);
+
+        const auto& checkpoint = cpu->tcBase(lsqID)->getHtmCheckpointPtr();
+        const int depth = htmStarts - htmStops;
+        auto uid = checkpoint->getHtmUid();
+
+        // An HtmStart not already inside a transaction gets a fresh uid.
+        if (!load_inst->inHtmTransactionalState()) {
+            uid = checkpoint->newHtmUid();
+            DPRINTF(HtmCpu, "generating new htmUid=%u\n", uid);
+            if (depth != 1) {
+                DPRINTF(HtmCpu,
+                    "unusual HTM transactional depth (%d)"
+                    " possibly caused by mispeculation - htmUid=%u\n",
+                    depth, uid);
+            }
+        }
+        load_inst->setHtmTransactionalState(uid, depth);
+    }
+
+    if (load_inst->isHtmStop()) {
+        ++htmStops;
+        DPRINTF(HtmCpu, ">> htmStarts (%d) : htmStops++ (%d)\n",
+                htmStarts, htmStops);
+
+        if (htmStarts == 0 && htmStops == 1) {
+            DPRINTF(HtmCpu,
+            "htmStops==1 && htmStarts==0. "
+            "This generally shouldn't happen "
+            "(unless due to misspeculation)\n");
+        }
+    }
 }
 
 template <class Impl>
@@ -831,6 +937,7 @@
 
         if (req->request()->isLocalAccess()) {
             assert(!inst->isStoreConditional());
+            assert(!inst->inHtmTransactionalState());
             ThreadContext *thread = cpu->tcBase(lsqID);
             PacketPtr main_pkt = new Packet(req->mainRequest(),
                                             MemCmd::WriteReq);
@@ -876,6 +983,21 @@
             stallingLoadIdx = 0;
         }
 
+        // hardware transactional memory
+        // Squashing instructions can alter the transaction nesting depth
+        // and must be corrected before fetching resumes.
+        if (loadQueue.back().instruction()->isHtmStart()) {
+            // Take back one HtmStart from the count, clamping at zero.
+            htmStarts = (htmStarts > 0) ? htmStarts - 1 : 0;
+            DPRINTF(HtmCpu, ">> htmStarts-- (%d) : htmStops (%d)\n",
+              htmStarts, htmStops);
+        }
+        if (loadQueue.back().instruction()->isHtmStop()) {
+            // Take back one HtmStop from the count, clamping at zero.
+            htmStops = (htmStops > 0) ? htmStops - 1 : 0;
+            DPRINTF(HtmCpu, ">> htmStarts (%d) : htmStops-- (%d)\n",
+              htmStarts, htmStops);
+        }
         // Clear the smart pointer to make sure it is decremented.
         loadQueue.back().instruction()->setSquashed();
         loadQueue.back().clear();
@@ -886,6 +1008,40 @@
         ++lsqSquashedLoads;
     }
 
+    // hardware transactional memory
+    // Walk the load queue from its oldest entry to its youngest and keep
+    // the htmUid of the last HtmStart seen that has not been squashed.
+    uint64_t in_flight_uid = 0;
+    for (auto lq_it = loadQueue.begin(); lq_it != loadQueue.end(); lq_it++) {
+        if (!lq_it->instruction()->isHtmStart())
+            continue;
+        if (lq_it->instruction()->isSquashed())
+            continue;
+
+        in_flight_uid = lq_it->instruction()->getHtmTransactionUid();
+        DPRINTF(HtmCpu, "loadQueue[%d]: found valid HtmStart htmUid=%u\n",
+            lq_it._idx, in_flight_uid);
+    }
+
+    // Prefer the uid of an in-flight HtmStart; with none in the pipeline,
+    // fall back to the most recently committed uid.
+    const auto& htm_cpt = cpu->tcBase(lsqID)->getHtmCheckpointPtr();
+    if (htm_cpt) {
+        const uint64_t current_uid = htm_cpt->getHtmUid();
+        const uint64_t restored_uid =
+            (in_flight_uid > 0) ? in_flight_uid : lastRetiredHtmUid;
+
+        // Only write the checkpoint when the uid actually changes.
+        if (current_uid != restored_uid) {
+            DPRINTF(HtmCpu, "flush: lastRetiredHtmUid=%u\n",
+                lastRetiredHtmUid);
+            DPRINTF(HtmCpu, "flush: resetting localHtmUid=%u\n",
+                restored_uid);
+
+            htm_cpt->setHtmUid(restored_uid);
+        }
+    }
+
     if (memDepViolator && squashed_num < memDepViolator->seqNum) {
         memDepViolator = NULL;
     }
@@ -965,7 +1121,7 @@
 
     // Squashed instructions do not need to complete their access.
     if (inst->isSquashed()) {
-        assert(!inst->isStore());
+        assert (!inst->isStore() || inst->isStoreConditional());
         ++lsqIgnoredResponses;
         return;
     }
@@ -983,8 +1139,27 @@
             // If we have an outstanding fault, the fault should only be of
             // type ReExec or - in case of a SplitRequest - a partial
             // translation fault
-            assert(dynamic_cast<ReExec*>(inst->fault.get()) != nullptr ||
-                   inst->savedReq->isPartialFault());
+
+            // Unless it is a hardware transactional memory failure fault.
+            const auto htm_fault = std::dynamic_pointer_cast<
+                GenericHtmFailureFault>(inst->fault);
+
+            if (htm_fault) {
+                // The instruction is faulted but this packet may not be
+                // flagged: ldp_uop microops spread over multiple packets.
+                if (!pkt->htmTransactionFailedInCache()) {
+                    DPRINTF(HtmCpu,
+                        "%s writeback with HTM failure fault, "
+                        "however, completing packet is not aware of "
+                        "transaction failure. cause=%s htmUid=%u\n",
+                        inst->staticInst->getName(),
+                        htmFailureToStr(htm_fault->getHtmFailureFaultCause()),
+                        htm_fault->getHtmUid());
+                }
+            } else {
+                assert(dynamic_cast<ReExec*>(inst->fault.get()) != nullptr ||
+                       inst->savedReq->isPartialFault());
+            }
 
             DPRINTF(LSQUnit, "Not completing instruction [sn:%lli] access "
                     "due to pending fault.\n", inst->seqNum);
diff --git a/src/cpu/o3/mem_dep_unit_impl.hh b/src/cpu/o3/mem_dep_unit_impl.hh
index 3a7ad36..4be98c5 100644
--- a/src/cpu/o3/mem_dep_unit_impl.hh
+++ b/src/cpu/o3/mem_dep_unit_impl.hh
@@ -172,7 +172,9 @@
 {
     InstSeqNum barr_sn = barr_inst->seqNum;
     // Memory barriers block loads and stores, write barriers only stores.
-    if (barr_inst->isMemBarrier()) {
+    // Required also for hardware transactional memory commands which
+    // can have strict ordering semantics
+    if (barr_inst->isMemBarrier() || barr_inst->isHtmCmd()) {
         loadBarrierSNs.insert(barr_sn);
         storeBarrierSNs.insert(barr_sn);
         DPRINTF(MemDepUnit, "Inserted a memory barrier %s SN:%lli\n",
@@ -182,6 +184,7 @@
         DPRINTF(MemDepUnit, "Inserted a write barrier %s SN:%lli\n",
                 barr_inst->pcState(), barr_sn);
     }
+
     if (loadBarrierSNs.size() || storeBarrierSNs.size()) {
         DPRINTF(MemDepUnit, "Outstanding load barriers = %d; "
                             "store barriers = %d\n",
@@ -440,7 +443,8 @@
     wakeDependents(inst);
     completed(inst);
     InstSeqNum barr_sn = inst->seqNum;
-    if (inst->isMemBarrier()) {
+
+    if (inst->isMemBarrier() || inst->isHtmCmd()) {
         assert(hasLoadBarrier());
         assert(hasStoreBarrier());
         loadBarrierSNs.erase(barr_sn);
@@ -459,9 +463,10 @@
 void
 MemDepUnit<MemDepPred, Impl>::wakeDependents(const DynInstPtr &inst)
 {
-    // Only stores, atomics and barriers have dependents.
+    // Only stores, atomics, barriers and
+    // hardware transactional memory commands have dependents.
     if (!inst->isStore() && !inst->isAtomic() && !inst->isMemBarrier() &&
-        !inst->isWriteBarrier()) {
+        !inst->isWriteBarrier() && !inst->isHtmCmd()) {
         return;
     }
 
diff --git a/src/cpu/o3/thread_context_impl.hh b/src/cpu/o3/thread_context_impl.hh
index 014b0f5..005aa57 100644
--- a/src/cpu/o3/thread_context_impl.hh
+++ b/src/cpu/o3/thread_context_impl.hh
@@ -331,21 +331,24 @@
 O3ThreadContext<Impl>::htmAbortTransaction(uint64_t htmUid,
                                            HtmFailureFaultCause cause)
 {
-    panic("function not implemented\n");
+    // Send the abort for htmUid/cause to the CPU on behalf of this thread.
+    cpu->htmSendAbortSignal(thread->threadId(), htmUid, cause);
+    conditionalSquash();
 }
 
 template <class Impl>
 BaseHTMCheckpointPtr&
 O3ThreadContext<Impl>::getHtmCheckpointPtr()
 {
-    panic("function not implemented\n");
+    return thread->htmCheckpoint; // reference to the thread's own pointer
 }
 
 template <class Impl>
 void
 O3ThreadContext<Impl>::setHtmCheckpointPtr(BaseHTMCheckpointPtr new_cpt)
 {
-    panic("function not implemented\n");
+    assert(!thread->htmCheckpoint || !thread->htmCheckpoint->valid());
+    thread->htmCheckpoint = std::move(new_cpt); // old: unset or invalid
 }
 
 #endif //__CPU_O3_THREAD_CONTEXT_IMPL_HH__
diff --git a/src/cpu/o3/thread_state.hh b/src/cpu/o3/thread_state.hh
index 6420da9..3226832 100644
--- a/src/cpu/o3/thread_state.hh
+++ b/src/cpu/o3/thread_state.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012 ARM Limited
+ * Copyright (c) 2012, 2019 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -92,6 +92,9 @@
      */
     bool trapPending;
 
+    /** Pointer to the hardware transactional memory checkpoint. */
+    std::unique_ptr<BaseHTMCheckpoint> htmCheckpoint;
+
     O3ThreadState(O3CPU *_cpu, int _thread_num, Process *_process)
         : ThreadState(_cpu, _thread_num, _process), cpu(_cpu),
           comInstEventQueue("instruction-based event queue"),