cpu-o3: O3 LSQ Generalisation

This patch does a large modification of the LSQ in the O3 model. The
main goal of the patch is to remove the assumption that 'an operation
can be served with one or two memory requests', which is present in the
LSQ and in the instruction through the req, reqLow, reqHigh triplet, and
to generalise it to operations that can be addressed with one request
and operations that require many requests, embodied in the
SingleDataRequest and the SplitDataRequest respectively.

This modification has been done mimicking the minor model to an extent,
shifting the responsibilities of dealing with VtoP translation and
tracking the status and resources from the DynInst to the LSQ via the
LSQRequest. The LSQRequest models the information concerning the
operation, handles the creation of fragments for translation and request
as well as assembling/splitting the data accordingly.

With these modifications, the implementation of vector ISAs,
particularly on the memory side, becomes richer, as the new model
permits a dissociation of ISA characteristics, such as vector length,
from the microarchitectural characteristics that govern how contiguous
loads are executed, allowing exploration of different LSQ to DL1 bus
widths to understand the tradeoffs in complexity and performance.

Part of the complexity introduced stems from the fact that gem5 keeps a
large amount of metadata regarding, in particular, memory operations.
Thus, when an instruction is squashed while some operation such as a TLB
lookup or cache access is ongoing, and the relevant structure later
communicates to the LSQ that the operation is over, it tries to access
some pieces of data that should have died when the instruction was
squashed, leading to asserts, panics, or memory corruption. To ensure
the correct behaviour, the LSQRequests rely on assessing who their owner
is, and self-destruct if they detect that their owner is done with the
request and there will be no subsequent action. For example, in the case
of an instruction squashed while the TLB is doing a walk to serve the
translation, when the translation is served by the TLB, the LSQRequest
detects that the instruction was squashed and, as the translation is
done, no one else expects to access its information; therefore, it
self-destructs. Destroying the LSQRequest earlier would lead to wrong
behaviour, as the TLB walk may access some of its fields.

Additional authors:
- Gabor Dozsa <gabor.dozsa@arm.com>

Change-Id: I9578a1a3f6b899c390cdd886856a24db68ff7d0c
Signed-off-by: Giacomo Gabrielli <giacomo.gabrielli@arm.com>
Reviewed-on: https://gem5-review.googlesource.com/c/13516
Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com>
Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com>
diff --git a/src/base/refcnt.hh b/src/base/refcnt.hh
index 197e418..53bb1ae 100644
--- a/src/base/refcnt.hh
+++ b/src/base/refcnt.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited
+ * Copyright (c) 2017-2018 ARM Limited
  * All rights reserved.
  *
  * The license below extends only to copyright in the software and shall
diff --git a/src/cpu/base_dyn_inst.hh b/src/cpu/base_dyn_inst.hh
index c2a1408..d81b58b 100644
--- a/src/cpu/base_dyn_inst.hh
+++ b/src/cpu/base_dyn_inst.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011,2013,2016 ARM Limited
+ * Copyright (c) 2011, 2013, 2016-2018 ARM Limited
  * Copyright (c) 2013 Advanced Micro Devices, Inc.
  * All rights reserved.
  *
@@ -84,6 +84,10 @@
     typedef typename ImplCPU::ImplState ImplState;
     using VecRegContainer = TheISA::VecRegContainer;
 
+    using LSQRequestPtr = typename Impl::CPUPol::LSQ::LSQRequest*;
+    using LQIterator = typename Impl::CPUPol::LSQUnit::LQIterator;
+    using SQIterator = typename Impl::CPUPol::LSQUnit::SQIterator;
+
     // The DynInstPtr type.
     typedef typename Impl::DynInstPtr DynInstPtr;
     typedef RefCountingPtr<BaseDynInst<Impl> > BaseDynInstPtr;
@@ -203,12 +207,7 @@
     Addr effAddr;
 
     /** The effective physical address. */
-    Addr physEffAddrLow;
-
-    /** The effective physical address
-     *  of the second request for a split request
-     */
-    Addr physEffAddrHigh;
+    Addr physEffAddr;
 
     /** The memory request flags (from translation). */
     unsigned memReqFlags;
@@ -224,19 +223,19 @@
 
     /** Load queue index. */
     int16_t lqIdx;
+    LQIterator lqIt;
 
     /** Store queue index. */
     int16_t sqIdx;
+    SQIterator sqIt;
 
 
     /////////////////////// TLB Miss //////////////////////
     /**
-     * Saved memory requests (needed when the DTB address translation is
+     * Saved memory request (needed when the DTB address translation is
      * delayed due to a hw page table walk).
      */
-    RequestPtr savedReq;
-    RequestPtr savedSreqLow;
-    RequestPtr savedSreqHigh;
+    LSQRequestPtr savedReq;
 
     /////////////////////// Checker //////////////////////
     // Need a copy of main request pointer to verify on writes.
@@ -270,6 +269,7 @@
 
     /** Is the effective virtual address valid. */
     bool effAddrValid() const { return instFlags[EffAddrValid]; }
+    void effAddrValid(bool b) { instFlags[EffAddrValid] = b; }
 
     /** Whether or not the memory operation is done. */
     bool memOpDone() const { return instFlags[MemOpDone]; }
@@ -303,18 +303,6 @@
     Fault writeMem(uint8_t *data, unsigned size, Addr addr,
                    Request::Flags flags, uint64_t *res);
 
-    /** Splits a request in two if it crosses a dcache block. */
-    void splitRequest(const RequestPtr &req, RequestPtr &sreqLow,
-                      RequestPtr &sreqHigh);
-
-    /** Initiate a DTB address translation. */
-    void initiateTranslation(const RequestPtr &req, const RequestPtr &sreqLow,
-                             const RequestPtr &sreqHigh, uint64_t *res,
-                             BaseTLB::Mode mode);
-
-    /** Finish a DTB address translation. */
-    void finishTranslation(WholeTranslationState *state);
-
     /** True if the DTB address translation has started. */
     bool translationStarted() const { return instFlags[TranslationStarted]; }
     void translationStarted(bool f) { instFlags[TranslationStarted] = f; }
@@ -454,6 +442,9 @@
 
     /** Returns the fault type. */
     Fault getFault() const { return fault; }
+    /** TODO: This I added for the LSQRequest side to be able to modify the
+     * fault. There should be a better mechanism in place. */
+    Fault& getFault() { return fault; }
 
     /** Checks whether or not this instruction has had its branch target
      *  calculated yet.  For now it is not utilized and is hacked to be
@@ -589,7 +580,8 @@
     int8_t numIntDestRegs() const { return staticInst->numIntDestRegs(); }
     int8_t numCCDestRegs() const { return staticInst->numCCDestRegs(); }
     int8_t numVecDestRegs() const { return staticInst->numVecDestRegs(); }
-    int8_t numVecElemDestRegs() const {
+    int8_t numVecElemDestRegs() const
+    {
         return staticInst->numVecElemDestRegs();
     }
 
@@ -837,6 +829,7 @@
 
     /** Sets the ASID. */
     void setASID(short addr_space_id) { asid = addr_space_id; }
+    short getASID() { return asid; }
 
     /** Sets the thread id. */
     void setTid(ThreadID tid) { threadNumber = tid; }
@@ -853,9 +846,12 @@
 
     /** Is this instruction's memory access strictly ordered? */
     bool strictlyOrdered() const { return instFlags[IsStrictlyOrdered]; }
+    void strictlyOrdered(bool so) { instFlags[IsStrictlyOrdered] = so; }
 
     /** Has this instruction generated a memory request. */
     bool hasRequest() const { return instFlags[ReqMade]; }
+    /** Assert this instruction has generated a memory request. */
+    void setRequest() { instFlags[ReqMade] = true; }
 
     /** Returns iterator to this instruction in the list of all insts. */
     ListIt &getInstListIt() { return instListIt; }
@@ -887,50 +883,9 @@
 BaseDynInst<Impl>::initiateMemRead(Addr addr, unsigned size,
                                    Request::Flags flags)
 {
-    instFlags[ReqMade] = true;
-    RequestPtr req = NULL;
-    RequestPtr sreqLow = NULL;
-    RequestPtr sreqHigh = NULL;
-
-    if (instFlags[ReqMade] && translationStarted()) {
-        req = savedReq;
-        sreqLow = savedSreqLow;
-        sreqHigh = savedSreqHigh;
-    } else {
-        req = std::make_shared<Request>(
-            asid, addr, size, flags, masterId(),
-            this->pc.instAddr(), thread->contextId());
-
-        req->taskId(cpu->taskId());
-
-        // Only split the request if the ISA supports unaligned accesses.
-        if (TheISA::HasUnalignedMemAcc) {
-            splitRequest(req, sreqLow, sreqHigh);
-        }
-        initiateTranslation(req, sreqLow, sreqHigh, NULL, BaseTLB::Read);
-    }
-
-    if (translationCompleted()) {
-        if (fault == NoFault) {
-            effAddr = req->getVaddr();
-            effSize = size;
-            instFlags[EffAddrValid] = true;
-
-            if (cpu->checker) {
-                reqToVerify = std::make_shared<Request>(*req);
-            }
-            fault = cpu->read(req, sreqLow, sreqHigh, lqIdx);
-        } else {
-            // Commit will have to clean up whatever happened.  Set this
-            // instruction as executed.
-            this->setExecuted();
-        }
-    }
-
-    if (traceData)
-        traceData->setMem(addr, size, flags);
-
-    return fault;
+    return cpu->pushRequest(
+            dynamic_cast<typename DynInstPtr::PtrType>(this),
+            /* ld */ true, nullptr, size, addr, flags, nullptr);
 }
 
 template<class Impl>
@@ -938,154 +893,9 @@
 BaseDynInst<Impl>::writeMem(uint8_t *data, unsigned size, Addr addr,
                             Request::Flags flags, uint64_t *res)
 {
-    if (traceData)
-        traceData->setMem(addr, size, flags);
-
-    instFlags[ReqMade] = true;
-    RequestPtr req = NULL;
-    RequestPtr sreqLow = NULL;
-    RequestPtr sreqHigh = NULL;
-
-    if (instFlags[ReqMade] && translationStarted()) {
-        req = savedReq;
-        sreqLow = savedSreqLow;
-        sreqHigh = savedSreqHigh;
-    } else {
-        req = std::make_shared<Request>(
-            asid, addr, size, flags, masterId(),
-            this->pc.instAddr(), thread->contextId());
-
-        req->taskId(cpu->taskId());
-
-        // Only split the request if the ISA supports unaligned accesses.
-        if (TheISA::HasUnalignedMemAcc) {
-            splitRequest(req, sreqLow, sreqHigh);
-        }
-        initiateTranslation(req, sreqLow, sreqHigh, res, BaseTLB::Write);
-    }
-
-    if (fault == NoFault && translationCompleted()) {
-        effAddr = req->getVaddr();
-        effSize = size;
-        instFlags[EffAddrValid] = true;
-
-        if (cpu->checker) {
-            reqToVerify = std::make_shared<Request>(*req);
-        }
-        fault = cpu->write(req, sreqLow, sreqHigh, data, sqIdx);
-    }
-
-    return fault;
-}
-
-template<class Impl>
-inline void
-BaseDynInst<Impl>::splitRequest(const RequestPtr &req, RequestPtr &sreqLow,
-                                RequestPtr &sreqHigh)
-{
-    // Check to see if the request crosses the next level block boundary.
-    unsigned block_size = cpu->cacheLineSize();
-    Addr addr = req->getVaddr();
-    Addr split_addr = roundDown(addr + req->getSize() - 1, block_size);
-    assert(split_addr <= addr || split_addr - addr < block_size);
-
-    // Spans two blocks.
-    if (split_addr > addr) {
-        req->splitOnVaddr(split_addr, sreqLow, sreqHigh);
-    }
-}
-
-template<class Impl>
-inline void
-BaseDynInst<Impl>::initiateTranslation(const RequestPtr &req,
-                                       const RequestPtr &sreqLow,
-                                       const RequestPtr &sreqHigh,
-                                       uint64_t *res,
-                                       BaseTLB::Mode mode)
-{
-    translationStarted(true);
-
-    if (!TheISA::HasUnalignedMemAcc || sreqLow == NULL) {
-        WholeTranslationState *state =
-            new WholeTranslationState(req, NULL, res, mode);
-
-        // One translation if the request isn't split.
-        DataTranslation<BaseDynInstPtr> *trans =
-            new DataTranslation<BaseDynInstPtr>(this, state);
-
-        cpu->dtb->translateTiming(req, thread->getTC(), trans, mode);
-
-        if (!translationCompleted()) {
-            // The translation isn't yet complete, so we can't possibly have a
-            // fault. Overwrite any existing fault we might have from a previous
-            // execution of this instruction (e.g. an uncachable load that
-            // couldn't execute because it wasn't at the head of the ROB).
-            fault = NoFault;
-
-            // Save memory requests.
-            savedReq = state->mainReq;
-            savedSreqLow = state->sreqLow;
-            savedSreqHigh = state->sreqHigh;
-        }
-    } else {
-        WholeTranslationState *state =
-            new WholeTranslationState(req, sreqLow, sreqHigh, NULL, res, mode);
-
-        // Two translations when the request is split.
-        DataTranslation<BaseDynInstPtr> *stransLow =
-            new DataTranslation<BaseDynInstPtr>(this, state, 0);
-        DataTranslation<BaseDynInstPtr> *stransHigh =
-            new DataTranslation<BaseDynInstPtr>(this, state, 1);
-
-        cpu->dtb->translateTiming(sreqLow, thread->getTC(), stransLow, mode);
-        cpu->dtb->translateTiming(sreqHigh, thread->getTC(), stransHigh, mode);
-
-        if (!translationCompleted()) {
-            // The translation isn't yet complete, so we can't possibly have a
-            // fault. Overwrite any existing fault we might have from a previous
-            // execution of this instruction (e.g. an uncachable load that
-            // couldn't execute because it wasn't at the head of the ROB).
-            fault = NoFault;
-
-            // Save memory requests.
-            savedReq = state->mainReq;
-            savedSreqLow = state->sreqLow;
-            savedSreqHigh = state->sreqHigh;
-        }
-    }
-}
-
-template<class Impl>
-inline void
-BaseDynInst<Impl>::finishTranslation(WholeTranslationState *state)
-{
-    fault = state->getFault();
-
-    instFlags[IsStrictlyOrdered] = state->isStrictlyOrdered();
-
-    if (fault == NoFault) {
-        // save Paddr for a single req
-        physEffAddrLow = state->getPaddr();
-
-        // case for the request that has been split
-        if (state->isSplit) {
-          physEffAddrLow = state->sreqLow->getPaddr();
-          physEffAddrHigh = state->sreqHigh->getPaddr();
-        }
-
-        memReqFlags = state->getFlags();
-
-        if (state->mainReq->isCondSwap()) {
-            assert(state->res);
-            state->mainReq->setExtraData(*state->res);
-        }
-
-    } else {
-        state->deleteReqs();
-    }
-    delete state;
-
-    translationCompleted(true);
+    return cpu->pushRequest(
+            dynamic_cast<typename DynInstPtr::PtrType>(this),
+            /* st */ false, data, size, addr, flags, res);
 }
 
 #endif // __CPU_BASE_DYN_INST_HH__
diff --git a/src/cpu/base_dyn_inst_impl.hh b/src/cpu/base_dyn_inst_impl.hh
index cd4740d..d8473f7 100644
--- a/src/cpu/base_dyn_inst_impl.hh
+++ b/src/cpu/base_dyn_inst_impl.hh
@@ -69,8 +69,6 @@
     macroop(_macroop),
     memData(nullptr),
     savedReq(nullptr),
-    savedSreqLow(nullptr),
-    savedSreqHigh(nullptr),
     reqToVerify(nullptr)
 {
     seqNum = seq_num;
@@ -96,8 +94,7 @@
 {
     memData = NULL;
     effAddr = 0;
-    physEffAddrLow = 0;
-    physEffAddrHigh = 0;
+    physEffAddr = 0;
     readyRegs = 0;
     memReqFlags = 0;
 
diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc
index 600c89a..7261f0c 100644
--- a/src/cpu/o3/cpu.cc
+++ b/src/cpu/o3/cpu.cc
@@ -850,7 +850,6 @@
 
     //Reset ROB/IQ/LSQ Entries
     commit.rob->resetEntries();
-    iew.resetEntries();
 }
 
 template <class Impl>
diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh
index 90024bc..1159850 100644
--- a/src/cpu/o3/cpu.hh
+++ b/src/cpu/o3/cpu.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2013, 2016 ARM Limited
+ * Copyright (c) 2011-2013, 2016-2018 ARM Limited
  * Copyright (c) 2013 Advanced Micro Devices, Inc.
  * All rights reserved
  *
@@ -125,6 +125,7 @@
 
     BaseTLB *itb;
     BaseTLB *dtb;
+    using LSQRequest = typename LSQ<Impl>::LSQRequest;
 
     /** Overall CPU status. */
     Status _status;
@@ -733,21 +734,25 @@
     /** Available thread ids in the cpu*/
     std::vector<ThreadID> tids;
 
-    /** CPU read function, forwards read to LSQ. */
-    Fault read(const RequestPtr &req,
-               RequestPtr &sreqLow, RequestPtr &sreqHigh,
-               int load_idx)
+    /** CPU pushRequest function, forwards request to LSQ. */
+    Fault pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data,
+                      unsigned int size, Addr addr, Request::Flags flags,
+                      uint64_t *res)
     {
-        return this->iew.ldstQueue.read(req, sreqLow, sreqHigh, load_idx);
+        return iew.ldstQueue.pushRequest(inst, isLoad, data, size, addr,
+                flags, res);
+    }
+
+    /** CPU read function, forwards read to LSQ. */
+    Fault read(LSQRequest* req, int load_idx)
+    {
+        return this->iew.ldstQueue.read(req, load_idx);
     }
 
     /** CPU write function, forwards write to LSQ. */
-    Fault write(const RequestPtr &req,
-                const RequestPtr &sreqLow, const RequestPtr &sreqHigh,
-                uint8_t *data, int store_idx)
+    Fault write(LSQRequest* req, uint8_t *data, int store_idx)
     {
-        return this->iew.ldstQueue.write(req, sreqLow, sreqHigh,
-                                         data, store_idx);
+        return this->iew.ldstQueue.write(req, data, store_idx);
     }
 
     /** Used by the fetch unit to get a hold of the instruction port. */
diff --git a/src/cpu/o3/iew_impl.hh b/src/cpu/o3/iew_impl.hh
index e706b09..3d5d848 100644
--- a/src/cpu/o3/iew_impl.hh
+++ b/src/cpu/o3/iew_impl.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010-2013 ARM Limited
+ * Copyright (c) 2010-2013, 2018 ARM Limited
  * Copyright (c) 2013 Advanced Micro Devices, Inc.
  * All rights reserved.
  *
@@ -744,14 +744,6 @@
 }
 
 template <class Impl>
-void
-DefaultIEW<Impl>::resetEntries()
-{
-    instQueue.resetEntries();
-    ldstQueue.resetEntries();
-}
-
-template <class Impl>
 bool
 DefaultIEW<Impl>::checkStall(ThreadID tid)
 {
@@ -1353,7 +1345,7 @@
                 DPRINTF(IEW, "LDSTQ detected a violation. Violator PC: %s "
                         "[sn:%lli], inst PC: %s [sn:%lli]. Addr is: %#x.\n",
                         violator->pcState(), violator->seqNum,
-                        inst->pcState(), inst->seqNum, inst->physEffAddrLow);
+                        inst->pcState(), inst->seqNum, inst->physEffAddr);
 
                 fetchRedirect[tid] = true;
 
@@ -1376,7 +1368,7 @@
                 DPRINTF(IEW, "LDSTQ detected a violation.  Violator PC: "
                         "%s, inst PC: %s.  Addr is: %#x.\n",
                         violator->pcState(), inst->pcState(),
-                        inst->physEffAddrLow);
+                        inst->physEffAddr);
                 DPRINTF(IEW, "Violation will not be handled because "
                         "already squashing\n");
 
@@ -1460,6 +1452,8 @@
     wroteToTimeBuffer = false;
     updatedQueues = false;
 
+    ldstQueue.tick();
+
     sortInsts();
 
     // Free function units marked as being freed this cycle.
diff --git a/src/cpu/o3/inst_queue_impl.hh b/src/cpu/o3/inst_queue_impl.hh
index a8895f8..4a55a91 100644
--- a/src/cpu/o3/inst_queue_impl.hh
+++ b/src/cpu/o3/inst_queue_impl.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2014 ARM Limited
+ * Copyright (c) 2011-2014, 2017-2018 ARM Limited
  * Copyright (c) 2013 Advanced Micro Devices, Inc.
  * All rights reserved.
  *
@@ -1140,9 +1140,6 @@
 void
 InstructionQueue<Impl>::blockMemInst(const DynInstPtr &blocked_inst)
 {
-    blocked_inst->translationStarted(false);
-    blocked_inst->translationCompleted(false);
-
     blocked_inst->clearIssued();
     blocked_inst->clearCanIssue();
     blockedMemInsts.push_back(blocked_inst);
@@ -1285,9 +1282,9 @@
                                            squashed_inst);
                     }
 
-
                     ++iqSquashedOperandsExamined;
                 }
+
             } else if (!squashed_inst->isStoreConditional() ||
                        !squashed_inst->isCompleted()) {
                 NonSpecMapIt ns_inst_it =
diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh
index 2b2d39b..003726c 100644
--- a/src/cpu/o3/lsq.hh
+++ b/src/cpu/o3/lsq.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2012, 2014 ARM Limited
+ * Copyright (c) 2011-2012, 2014, 2018 ARM Limited
  * Copyright (c) 2013 Advanced Micro Devices, Inc.
  * All rights reserved
  *
@@ -47,8 +47,9 @@
 #include <map>
 #include <queue>
 
-#include "cpu/o3/lsq_unit.hh"
+#include "arch/generic/tlb.hh"
 #include "cpu/inst_seq.hh"
+#include "cpu/o3/lsq_unit.hh"
 #include "enums/SMTQueuePolicy.hh"
 #include "mem/port.hh"
 #include "sim/sim_object.hh"
@@ -56,13 +57,659 @@
 struct DerivO3CPUParams;
 
 template <class Impl>
-class LSQ {
+class LSQ
+
+{
   public:
     typedef typename Impl::O3CPU O3CPU;
     typedef typename Impl::DynInstPtr DynInstPtr;
     typedef typename Impl::CPUPol::IEW IEW;
     typedef typename Impl::CPUPol::LSQUnit LSQUnit;
 
+    class LSQRequest;
+    /** Derived class to hold any sender state the LSQ needs. */
+    class LSQSenderState : public Packet::SenderState
+    {
+      protected:
+        /** The senderState needs to know the LSQRequest who owns it. */
+        LSQRequest* _request;
+
+        /** Constructor; binds the sender state to its owning LSQRequest. */
+        LSQSenderState(LSQRequest* request, bool isLoad_)
+            : _request(request), mainPkt(nullptr), pendingPacket(nullptr),
+              outstanding(0), isLoad(isLoad_), needWB(isLoad_), isSplit(false),
+              pktToSend(false), deleted(false)
+          { }
+      public:
+
+        /** Instruction which initiated the access to memory. */
+        DynInstPtr inst;
+        /** The main packet from a split load, used during writeback. */
+        PacketPtr mainPkt;
+        /** A second packet from a split store that needs sending. */
+        PacketPtr pendingPacket;
+        /** Number of outstanding packets to complete. */
+        uint8_t outstanding;
+        /** Whether or not it is a load. */
+        bool isLoad;
+        /** Whether or not the instruction will need to writeback. */
+        bool needWB;
+        /** Whether or not this access is split in two. */
+        bool isSplit;
+        /** Whether or not there is a packet that needs sending. */
+        bool pktToSend;
+        /** Has the request been deleted?
+         * LSQ entries can be squashed before the response comes back. In that
+         * case the SenderState knows.
+         */
+        bool deleted;
+        ContextID contextId() { return inst->contextId(); }
+
+        /** Returns whether there are no outstanding packets left. */
+        inline bool isComplete() { return outstanding == 0; }
+        inline void deleteRequest() { deleted = true; }
+        inline bool alive() { return !deleted; }
+        LSQRequest* request() { return _request; }
+        virtual void complete() = 0;
+        void writebackDone() { _request->writebackDone(); }
+    };
+
+    /** Memory operation metadata.
+     * This class holds the information about a memory operation. It lives
+     * from initiateAcc to resource deallocation at commit or squash.
+     * LSQRequest objects are owned by the LQ/SQ Entry in the LSQUnit that
+     * holds the operation. It is also used by the LSQSenderState. In addition,
+     * the LSQRequest is a TranslationState, therefore, upon squash, there must
+     * be a defined ownership transferal in case the LSQ resources are
+     * deallocated before the TLB is done using the TranslationState. If that
+     * happens, the LSQRequest will be self-owned, and responsible to detect
+     * that its services are no longer required and self-destruct.
+     *
+     * Lifetime of a LSQRequest:
+     *                 +--------------------+
+     *                 |LSQ creates and owns|
+     *                 +--------------------+
+     *                           |
+     *                 +--------------------+
+     *                 |Initiate translation|
+     *                 +--------------------+
+     *                           |
+     *                        ___^___
+     *                    ___/       \___
+     *             ______/   Squashed?   \
+     *            |      \___         ___/
+     *            |          \___ ___/
+     *            |              v
+     *            |              |
+     *            |    +--------------------+
+     *            |    |  Translation done  |
+     *            |    +--------------------+
+     *            |              |
+     *            |    +--------------------+
+     *            |    |     Send packet    |<------+
+     *            |    +--------------------+       |
+     *            |              |                  |
+     *            |           ___^___               |
+     *            |       ___/       \___           |
+     *            |  ____/   Squashed?   \          |
+     *            | |    \___         ___/          |
+     *            | |        \___ ___/              |
+     *            | |            v                  |
+     *            | |            |                  |
+     *            | |         ___^___               |
+     *            | |     ___/       \___           |
+     *            | |    /     Done?     \__________|
+     *            | |    \___         ___/
+     *            | |        \___ ___/
+     *            | |            v
+     *            | |            |
+     *            | |  +--------------------+
+     *            | |  |    Manage stuff    |
+     *            | |  |   Free resources   |
+     *            | |  +--------------------+
+     *            | |
+     *            | |  +--------------------+
+     *            | |  |  senderState owns  |
+     *            | +->|  onRecvTimingResp  |
+     *            |    |   free resources   |
+     *            |    +--------------------+
+     *            |
+     *            |   +----------------------+
+     *            |   |  self owned (Trans)  |
+     *            +-->| on TranslationFinish |
+     *                |    free resources    |
+     *                +----------------------+
+     *
+     *
+     */
+    class LSQRequest : public BaseTLB::Translation
+    {
+      protected:
+        typedef uint32_t FlagsStorage;
+        typedef ::Flags<FlagsStorage> FlagsType;
+
+        enum Flag : FlagsStorage
+        {
+            IsLoad              = 0x00000001,
+            /** True if this is a store that writes registers (SC). */
+            WbStore             = 0x00000002,
+            Delayed             = 0x00000004,
+            IsSplit             = 0x00000008,
+            /** True if any translation has been sent to TLB. */
+            TranslationStarted  = 0x00000010,
+            /** True if there are un-replied outbound translations.. */
+            TranslationFinished = 0x00000020,
+            Sent                = 0x00000040,
+            Retry               = 0x00000080,
+            Complete            = 0x00000100,
+            /** Ownership tracking flags. */
+            /** Translation squashed. */
+            TranslationSquashed = 0x00000200,
+            /** Request discarded */
+            Discarded           = 0x00000400,
+            /** LSQ resources freed. */
+            LSQEntryFreed       = 0x00000800,
+            /** Store written back. */
+            WritebackScheduled  = 0x00001000,
+            WritebackDone       = 0x00002000
+        };
+        FlagsType flags;
+
+        enum class State
+        {
+            NotIssued,
+            Translation,
+            Request,
+            Complete,
+            Squashed,
+            Fault,
+        };
+        State _state;
+        LSQSenderState* _senderState;
+        void setState(const State& newState) { _state = newState; }
+
+        uint32_t numTranslatedFragments;
+        uint32_t numInTranslationFragments;
+
+        /** LQ/SQ entry idx. */
+        uint32_t _entryIdx;
+
+        void markDelayed() { flags.set(Flag::Delayed); }
+        bool isDelayed() { return flags.isSet(Flag::Delayed); }
+
+      public:
+        LSQUnit& _port;
+        const DynInstPtr _inst;
+        uint32_t _taskId;
+        PacketDataPtr _data;
+        std::vector<PacketPtr> _packets;
+        std::vector<RequestPtr> _requests;
+        std::vector<Fault> _fault;
+        uint64_t* _res;
+        const Addr _addr;
+        const uint32_t _size;
+        const Request::Flags _flags;
+        uint32_t _numOutstandingPackets;
+      protected:
+        LSQUnit* lsqUnit() { return &_port; }
+        LSQRequest(LSQUnit* port, const DynInstPtr& inst, bool isLoad) :
+            _state(State::NotIssued), _senderState(nullptr),
+            _port(*port), _inst(inst), _data(nullptr),
+            _res(nullptr), _addr(0), _size(0), _flags(0),
+            _numOutstandingPackets(0)
+        {
+            flags.set(Flag::IsLoad, isLoad);
+            flags.set(Flag::WbStore, _inst->isStoreConditional());
+            install();
+        }
+        LSQRequest(LSQUnit* port, const DynInstPtr& inst, bool isLoad,
+                   const Addr& addr, const uint32_t& size,
+                   const Request::Flags& flags_,
+                   PacketDataPtr data = nullptr, uint64_t* res = nullptr)
+            : _state(State::NotIssued), _senderState(nullptr),
+            numTranslatedFragments(0),
+            numInTranslationFragments(0),
+            _port(*port), _inst(inst), _data(data),
+            _res(res), _addr(addr), _size(size),
+            _flags(flags_),
+            _numOutstandingPackets(0)
+        {
+            flags.set(Flag::IsLoad, isLoad);
+            flags.set(Flag::WbStore, _inst->isStoreConditional());
+            install();
+        }
+
+        bool
+        isLoad() const
+        {
+            return flags.isSet(Flag::IsLoad);
+        }
+
+        /** Install the request in the LQ/SQ. */
+        void install()
+        {
+            if (isLoad()) {
+                _port.loadQueue[_inst->lqIdx].setRequest(this);
+            } else {
+                _port.storeQueue[_inst->sqIdx].setRequest(this);
+            }
+        }
+        virtual bool
+        squashed() const override
+        {
+            return _inst->isSquashed();
+        }
+
+        /**
+         * Test if the LSQRequest has been released, i.e. self-owned.
+         * An LSQRequest manages itself when the resources on the LSQ are freed
+         * but the translation is still going on and the LSQEntry was freed.
+         */
+        bool
+        isReleased()
+        {
+            return flags.isSet(Flag::LSQEntryFreed) ||
+                flags.isSet(Flag::Discarded);
+        }
+
+        /** Release the LSQRequest.
+         * Notify the sender state that the request it points to is not valid
+         * anymore. Understand if the request is orphan (self-managed) and if
+         * so, mark it as freed, else destroy it, as this means
+         * the end of its life cycle.
+         * An LSQRequest is orphaned when its resources are released while
+         * there is still an in-flight translation request to the TLB or
+         * access request to the memory.
+         */
+        void release(Flag reason)
+        {
+            assert(reason == Flag::LSQEntryFreed || reason == Flag::Discarded);
+            if (!isAnyOutstandingRequest()) {
+                delete this;
+            } else {
+                if (_senderState) {
+                    _senderState->deleteRequest();
+                }
+                flags.set(reason);
+            }
+        }
+
+        /** Destructor.
+         * The LSQRequest owns the request. If the packet has already been
+         * sent, the sender state will be deleted upon receiving the reply.
+         */
+        virtual ~LSQRequest()
+        {
+            assert(!isAnyOutstandingRequest());
+            _inst->savedReq = nullptr;
+            if (_senderState)
+                delete _senderState;
+
+            for (auto r: _packets)
+                delete r;
+        };
+
+
+      public:
+        /** Convenience getters/setters. */
+        /** @{ */
+        /** Set up Context numbers. */
+        void
+        setContext(const ContextID& context_id)
+        {
+            request()->setContext(context_id);
+        }
+
+        const DynInstPtr&
+        instruction()
+        {
+            return _inst;
+        }
+
+        /** Set up virtual request.
+         * For a previously allocated Request object.
+         */
+        void
+        setVirt(int asid, Addr vaddr, unsigned size, Request::Flags flags_,
+                MasterID mid, Addr pc)
+        {
+            request()->setVirt(asid, vaddr, size, flags_, mid, pc);
+        }
+
+        void
+        taskId(const uint32_t& v)
+        {
+            _taskId = v;
+            for (auto& r: _requests)
+                r->taskId(v);
+        }
+
+        uint32_t taskId() const { return _taskId; }
+        RequestPtr request(int idx = 0) { return _requests.at(idx); }
+
+        const RequestPtr
+        request(int idx = 0) const
+        {
+            return _requests.at(idx);
+        }
+
+        Addr getVaddr(int idx = 0) const { return request(idx)->getVaddr(); }
+        virtual void initiateTranslation() = 0;
+
+        PacketPtr packet(int idx = 0) { return _packets.at(idx); }
+
+        virtual PacketPtr
+        mainPacket()
+        {
+            assert (_packets.size() == 1);
+            return packet();
+        }
+
+        virtual RequestPtr
+        mainRequest()
+        {
+            assert (_requests.size() == 1);
+            return request();
+        }
+
+        void
+        senderState(LSQSenderState* st)
+        {
+            _senderState = st;
+            for (auto& pkt: _packets) {
+                if (pkt)
+                    pkt->senderState = st;
+            }
+        }
+
+        const LSQSenderState*
+        senderState() const
+        {
+            return _senderState;
+        }
+
+        /**
+         * Mark senderState as discarded. This will cause response packets
+         * from the cache to be discarded.
+         */
+        void
+        discardSenderState()
+        {
+            assert(_senderState);
+            _senderState->deleteRequest();
+        }
+
+        /**
+         * Test if there is any in-flight translation or mem access request
+         */
+        bool
+        isAnyOutstandingRequest()
+        {
+            return numInTranslationFragments > 0 ||
+                _numOutstandingPackets > 0 ||
+                (flags.isSet(Flag::WritebackScheduled) &&
+                 !flags.isSet(Flag::WritebackDone));
+        }
+
+        bool
+        isSplit() const
+        {
+            return flags.isSet(Flag::IsSplit);
+        }
+        /** @} */
+        virtual bool recvTimingResp(PacketPtr pkt) = 0;
+        virtual void sendPacketToCache() = 0;
+        virtual void buildPackets() = 0;
+
+        /**
+         * Memory mapped IPR accesses
+         */
+        virtual void handleIprWrite(ThreadContext *thread, PacketPtr pkt) = 0;
+        virtual Cycles handleIprRead(ThreadContext *thread, PacketPtr pkt) = 0;
+
+        /**
+         * Test if the request accesses a particular cache line.
+         */
+        virtual bool isCacheBlockHit(Addr blockAddr, Addr cacheBlockMask) = 0;
+
+        /** Update the status to reflect that a packet was sent. */
+        void
+        packetSent()
+        {
+            flags.set(Flag::Sent);
+        }
+        /** Update the status to reflect that a packet was not sent.
+         * When a packet fails to be sent, we mark the request as needing a
+         * retry. Note that Retry flag is sticky.
+         */
+        void
+        packetNotSent()
+        {
+            flags.set(Flag::Retry);
+            flags.clear(Flag::Sent);
+        }
+
+        void sendFragmentToTranslation(int i);
+        bool
+        isComplete()
+        {
+            return flags.isSet(Flag::Complete);
+        }
+
+        bool
+        isInTranslation()
+        {
+            return _state == State::Translation;
+        }
+
+        bool
+        isTranslationComplete()
+        {
+            return flags.isSet(Flag::TranslationStarted) &&
+                   !isInTranslation();
+        }
+
+        bool
+        isTranslationBlocked()
+        {
+            return _state == State::Translation &&
+                flags.isSet(Flag::TranslationStarted) &&
+                !flags.isSet(Flag::TranslationFinished);
+        }
+
+        bool
+        isSent()
+        {
+            return flags.isSet(Flag::Sent);
+        }
+
+        /**
+         * The LSQ entry is cleared
+         */
+        void
+        freeLSQEntry()
+        {
+            release(Flag::LSQEntryFreed);
+        }
+
+        /**
+         * The request is discarded (e.g. partial store-load forwarding)
+         */
+        void
+        discard()
+        {
+            release(Flag::Discarded);
+        }
+
+        void
+        packetReplied()
+        {
+            assert(_numOutstandingPackets > 0);
+            _numOutstandingPackets--;
+            if (_numOutstandingPackets == 0 && isReleased())
+                delete this;
+        }
+
+        void
+        writebackScheduled()
+        {
+            assert(!flags.isSet(Flag::WritebackScheduled));
+            flags.set(Flag::WritebackScheduled);
+        }
+
+        void
+        writebackDone()
+        {
+            flags.set(Flag::WritebackDone);
+            /* If the lsq resources are already free */
+            if (isReleased()) {
+                delete this;
+            }
+        }
+
+        void
+        squashTranslation()
+        {
+            assert(numInTranslationFragments == 0);
+            flags.set(Flag::TranslationSquashed);
+            /* If we are on our own, self-destruct. */
+            if (isReleased()) {
+                delete this;
+            }
+        }
+
+        void
+        complete()
+        {
+            flags.set(Flag::Complete);
+        }
+    };
+
+    class SingleDataRequest : public LSQRequest
+    {
+      protected:
+        /* Given that we are inside templates, children need explicit
+         * declaration of the names in the parent class. */
+        using Flag = typename LSQRequest::Flag;
+        using State = typename LSQRequest::State;
+        using LSQRequest::_fault;
+        using LSQRequest::_inst;
+        using LSQRequest::_packets;
+        using LSQRequest::_port;
+        using LSQRequest::_res;
+        using LSQRequest::_senderState;
+        using LSQRequest::_state;
+        using LSQRequest::flags;
+        using LSQRequest::isLoad;
+        using LSQRequest::isTranslationComplete;
+        using LSQRequest::lsqUnit;
+        using LSQRequest::request;
+        using LSQRequest::sendFragmentToTranslation;
+        using LSQRequest::setState;
+        using LSQRequest::numInTranslationFragments;
+        using LSQRequest::numTranslatedFragments;
+        using LSQRequest::_numOutstandingPackets;
+      public:
+        SingleDataRequest(LSQUnit* port, const DynInstPtr& inst, bool isLoad,
+                          const Addr& addr, const uint32_t& size,
+                          const Request::Flags& flags_,
+                          PacketDataPtr data = nullptr,
+                          uint64_t* res = nullptr) :
+            LSQRequest(port, inst, isLoad, addr, size, flags_, data, res)
+        {
+            LSQRequest::_requests.push_back(
+                std::make_shared<Request>(inst->getASID(), addr, size, flags_,
+                    inst->masterId(), inst->instAddr(), inst->contextId()));
+            LSQRequest::_requests.back()->setReqInstSeqNum(inst->seqNum);
+        }
+        inline virtual ~SingleDataRequest() {}
+        virtual void initiateTranslation();
+        virtual void finish(const Fault &fault, const RequestPtr &req,
+                ThreadContext* tc, BaseTLB::Mode mode);
+        virtual bool recvTimingResp(PacketPtr pkt);
+        virtual void sendPacketToCache();
+        virtual void buildPackets();
+        virtual void handleIprWrite(ThreadContext *thread, PacketPtr pkt);
+        virtual Cycles handleIprRead(ThreadContext *thread, PacketPtr pkt);
+        virtual bool isCacheBlockHit(Addr blockAddr, Addr cacheBlockMask);
+    };
+
+    class SplitDataRequest : public LSQRequest
+    {
+      protected:
+        /* Given that we are inside templates, children need explicit
+         * declaration of the names in the parent class. */
+        using Flag = typename LSQRequest::Flag;
+        using State = typename LSQRequest::State;
+        using LSQRequest::_addr;
+        using LSQRequest::_data;
+        using LSQRequest::_fault;
+        using LSQRequest::_flags;
+        using LSQRequest::_inst;
+        using LSQRequest::_packets;
+        using LSQRequest::_port;
+        using LSQRequest::_requests;
+        using LSQRequest::_res;
+        using LSQRequest::_senderState;
+        using LSQRequest::_size;
+        using LSQRequest::_state;
+        using LSQRequest::_taskId;
+        using LSQRequest::flags;
+        using LSQRequest::isLoad;
+        using LSQRequest::isTranslationComplete;
+        using LSQRequest::lsqUnit;
+        using LSQRequest::numInTranslationFragments;
+        using LSQRequest::numTranslatedFragments;
+        using LSQRequest::request;
+        using LSQRequest::sendFragmentToTranslation;
+        using LSQRequest::setState;
+        using LSQRequest::_numOutstandingPackets;
+
+        uint32_t numFragments;
+        uint32_t numReceivedPackets;
+        RequestPtr mainReq;
+        PacketPtr _mainPacket;
+
+
+      public:
+        SplitDataRequest(LSQUnit* port, const DynInstPtr& inst, bool isLoad,
+                         const Addr& addr, const uint32_t& size,
+                         const Request::Flags & flags_,
+                         PacketDataPtr data = nullptr,
+                         uint64_t* res = nullptr) :
+            LSQRequest(port, inst, isLoad, addr, size, flags_, data, res),
+            numFragments(0),
+            numReceivedPackets(0),
+            mainReq(nullptr),
+            _mainPacket(nullptr)
+        {
+            flags.set(Flag::IsSplit);
+        }
+        virtual ~SplitDataRequest()
+        {
+            if (mainReq) {
+                mainReq = nullptr;
+            }
+            if (_mainPacket) {
+                delete _mainPacket;
+                _mainPacket = nullptr;
+            }
+        }
+        virtual void finish(const Fault &fault, const RequestPtr &req,
+                ThreadContext* tc, BaseTLB::Mode mode);
+        virtual bool recvTimingResp(PacketPtr pkt);
+        virtual void initiateTranslation();
+        virtual void sendPacketToCache();
+        virtual void buildPackets();
+
+        virtual void handleIprWrite(ThreadContext *thread, PacketPtr pkt);
+        virtual Cycles handleIprRead(ThreadContext *thread, PacketPtr pkt);
+        virtual bool isCacheBlockHit(Addr blockAddr, Addr cacheBlockMask);
+
+        virtual RequestPtr mainRequest();
+        virtual PacketPtr mainPacket();
+    };
+
     /** Constructs an LSQ with the given parameters. */
     LSQ(O3CPU *cpu_ptr, IEW *iew_ptr, DerivO3CPUParams *params);
     ~LSQ() { }
@@ -85,17 +732,9 @@
 
     /** Number of entries needed for the given amount of threads.*/
     int entryAmount(ThreadID num_threads);
-    void removeEntries(ThreadID tid);
-    /** Reset the max entries for each thread. */
-    void resetEntries();
-    /** Resize the max entries for a thread. */
-    void resizeEntries(unsigned size, ThreadID tid);
 
     /** Ticks the LSQ. */
-    void tick();
-    /** Ticks a specific LSQ Unit. */
-    void tick(ThreadID tid)
-    { thread[tid].tick(); }
+    void tick() { usedStorePorts = 0; }
 
     /** Inserts a load into the LSQ. */
     void insertLoad(const DynInstPtr &load_inst);
@@ -112,13 +751,13 @@
      * Commits loads up until the given sequence number for a specific thread.
      */
     void commitLoads(InstSeqNum &youngest_inst, ThreadID tid)
-    { thread[tid].commitLoads(youngest_inst); }
+    { thread.at(tid).commitLoads(youngest_inst); }
 
     /**
      * Commits stores up until the given sequence number for a specific thread.
      */
     void commitStores(InstSeqNum &youngest_inst, ThreadID tid)
-    { thread[tid].commitStores(youngest_inst); }
+    { thread.at(tid).commitStores(youngest_inst); }
 
     /**
      * Attempts to write back stores until all cache ports are used or the
@@ -131,8 +770,11 @@
     /**
      * Squash instructions from a thread until the specified sequence number.
      */
-    void squash(const InstSeqNum &squashed_num, ThreadID tid)
-    { thread[tid].squash(squashed_num); }
+    void
+    squash(const InstSeqNum &squashed_num, ThreadID tid)
+    {
+        thread.at(tid).squash(squashed_num);
+    }
 
     /** Returns whether or not there was a memory ordering violation. */
     bool violation();
@@ -140,50 +782,49 @@
      * Returns whether or not there was a memory ordering violation for a
      * specific thread.
      */
-    bool violation(ThreadID tid)
-    { return thread[tid].violation(); }
+    bool violation(ThreadID tid) { return thread.at(tid).violation(); }
 
     /** Gets the instruction that caused the memory ordering violation. */
-    DynInstPtr getMemDepViolator(ThreadID tid)
-    { return thread[tid].getMemDepViolator(); }
+    DynInstPtr
+    getMemDepViolator(ThreadID tid)
+    {
+        return thread.at(tid).getMemDepViolator();
+    }
 
     /** Returns the head index of the load queue for a specific thread. */
-    int getLoadHead(ThreadID tid)
-    { return thread[tid].getLoadHead(); }
+    int getLoadHead(ThreadID tid) { return thread.at(tid).getLoadHead(); }
 
     /** Returns the sequence number of the head of the load queue. */
-    InstSeqNum getLoadHeadSeqNum(ThreadID tid)
+    InstSeqNum
+    getLoadHeadSeqNum(ThreadID tid)
     {
-        return thread[tid].getLoadHeadSeqNum();
+        return thread.at(tid).getLoadHeadSeqNum();
     }
 
     /** Returns the head index of the store queue. */
-    int getStoreHead(ThreadID tid)
-    { return thread[tid].getStoreHead(); }
+    int getStoreHead(ThreadID tid) { return thread.at(tid).getStoreHead(); }
 
     /** Returns the sequence number of the head of the store queue. */
-    InstSeqNum getStoreHeadSeqNum(ThreadID tid)
+    InstSeqNum
+    getStoreHeadSeqNum(ThreadID tid)
     {
-        return thread[tid].getStoreHeadSeqNum();
+        return thread.at(tid).getStoreHeadSeqNum();
     }
 
     /** Returns the number of instructions in all of the queues. */
     int getCount();
     /** Returns the number of instructions in the queues of one thread. */
-    int getCount(ThreadID tid)
-    { return thread[tid].getCount(); }
+    int getCount(ThreadID tid) { return thread.at(tid).getCount(); }
 
     /** Returns the total number of loads in the load queue. */
     int numLoads();
     /** Returns the total number of loads for a single thread. */
-    int numLoads(ThreadID tid)
-    { return thread[tid].numLoads(); }
+    int numLoads(ThreadID tid) { return thread.at(tid).numLoads(); }
 
     /** Returns the total number of stores in the store queue. */
     int numStores();
     /** Returns the total number of stores for a single thread. */
-    int numStores(ThreadID tid)
-    { return thread[tid].numStores(); }
+    int numStores(ThreadID tid) { return thread.at(tid).numStores(); }
 
     /** Returns the number of free load entries. */
     unsigned numFreeLoadEntries();
@@ -242,46 +883,39 @@
     /** Returns whether or not a specific thread has any stores to write back
      * to memory.
      */
-    bool hasStoresToWB(ThreadID tid)
-    { return thread[tid].hasStoresToWB(); }
+    bool hasStoresToWB(ThreadID tid) { return thread.at(tid).hasStoresToWB(); }
 
     /** Returns the number of stores a specific thread has to write back. */
-    int numStoresToWB(ThreadID tid)
-    { return thread[tid].numStoresToWB(); }
+    int numStoresToWB(ThreadID tid) { return thread.at(tid).numStoresToWB(); }
 
     /** Returns if the LSQ will write back to memory this cycle. */
     bool willWB();
     /** Returns if the LSQ of a specific thread will write back to memory this
      * cycle.
      */
-    bool willWB(ThreadID tid)
-    { return thread[tid].willWB(); }
+    bool willWB(ThreadID tid) { return thread.at(tid).willWB(); }
 
     /** Debugging function to print out all instructions. */
     void dumpInsts() const;
     /** Debugging function to print out instructions from a specific thread. */
-    void dumpInsts(ThreadID tid) const
-    { thread[tid].dumpInsts(); }
+    void dumpInsts(ThreadID tid) const { thread.at(tid).dumpInsts(); }
 
     /** Executes a read operation, using the load specified at the load
      * index.
      */
-    Fault read(const RequestPtr &req,
-               RequestPtr &sreqLow, RequestPtr &sreqHigh,
-               int load_idx);
+    Fault read(LSQRequest* req, int load_idx);
 
     /** Executes a store operation, using the store specified at the store
      * index.
      */
-    Fault write(const RequestPtr &req,
-                const RequestPtr &sreqLow, const RequestPtr &sreqHigh,
-                uint8_t *data, int store_idx);
+    Fault write(LSQRequest* req, uint8_t *data, int store_idx);
 
     /**
      * Retry the previous send that failed.
      */
     void recvReqRetry();
 
+    void completeDataAccess(PacketPtr pkt);
     /**
      * Handles writing back and completing the load or store that has
      * returned from memory.
@@ -292,13 +926,34 @@
 
     void recvTimingSnoopReq(PacketPtr pkt);
 
+    Fault pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data,
+                      unsigned int size, Addr addr, Request::Flags flags,
+                      uint64_t *res);
+
     /** The CPU pointer. */
     O3CPU *cpu;
 
     /** The IEW stage pointer. */
     IEW *iewStage;
 
+    /** Is D-cache blocked? */
+    bool cacheBlocked() const;
+    /** Set D-cache blocked status */
+    void cacheBlocked(bool v);
+    /** Is any store port available to use? */
+    bool storePortAvailable() const;
+    /** Another store port is in use */
+    void storePortBusy();
+
   protected:
+    /** D-cache is blocked */
+    bool _cacheBlocked;
+    /** The number of cache ports available each cycle (stores only). */
+    int cacheStorePorts;
+    /** The number of used cache ports in this cycle by stores. */
+    int usedStorePorts;
+
+
     /** The LSQ policy for SMT mode. */
     SMTQueuePolicy lsqPolicy;
 
@@ -307,8 +962,10 @@
      * and threshold, this function calculates how many resources each thread
      * can occupy at most.
      */
-    static uint32_t maxLSQAllocation(SMTQueuePolicy pol, uint32_t entries,
-            uint32_t numThreads, uint32_t SMTThreshold) {
+    static uint32_t
+    maxLSQAllocation(SMTQueuePolicy pol, uint32_t entries,
+            uint32_t numThreads, uint32_t SMTThreshold)
+    {
         if (pol == SMTQueuePolicy::Dynamic) {
             return entries;
         } else if (pol == SMTQueuePolicy::Partitioned) {
@@ -346,24 +1003,20 @@
 
 template <class Impl>
 Fault
-LSQ<Impl>::read(const RequestPtr &req,
-                RequestPtr &sreqLow, RequestPtr &sreqHigh,
-                int load_idx)
+LSQ<Impl>::read(LSQRequest* req, int load_idx)
 {
-    ThreadID tid = cpu->contextToThread(req->contextId());
+    ThreadID tid = cpu->contextToThread(req->request()->contextId());
 
-    return thread[tid].read(req, sreqLow, sreqHigh, load_idx);
+    return thread.at(tid).read(req, load_idx);
 }
 
 template <class Impl>
 Fault
-LSQ<Impl>::write(const RequestPtr &req,
-                 const RequestPtr &sreqLow, const RequestPtr &sreqHigh,
-                 uint8_t *data, int store_idx)
+LSQ<Impl>::write(LSQRequest* req, uint8_t *data, int store_idx)
 {
-    ThreadID tid = cpu->contextToThread(req->contextId());
+    ThreadID tid = cpu->contextToThread(req->request()->contextId());
 
-    return thread[tid].write(req, sreqLow, sreqHigh, data, store_idx);
+    return thread.at(tid).write(req, data, store_idx);
 }
 
 #endif // __CPU_O3_LSQ_HH__
diff --git a/src/cpu/o3/lsq_impl.hh b/src/cpu/o3/lsq_impl.hh
index edc3f46..8a221a8 100644
--- a/src/cpu/o3/lsq_impl.hh
+++ b/src/cpu/o3/lsq_impl.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2012, 2014 ARM Limited
+ * Copyright (c) 2011-2012, 2014, 2017-2018 ARM Limited
  * Copyright (c) 2013 Advanced Micro Devices, Inc.
  * All rights reserved
  *
@@ -61,6 +61,8 @@
 template <class Impl>
 LSQ<Impl>::LSQ(O3CPU *cpu_ptr, IEW *iew_ptr, DerivO3CPUParams *params)
     : cpu(cpu_ptr), iewStage(iew_ptr),
+      _cacheBlocked(false),
+      cacheStorePorts(params->cacheStorePorts), usedStorePorts(0),
       lsqPolicy(params->smtLSQPolicy),
       LQEntries(params->LQEntries),
       SQEntries(params->SQEntries),
@@ -76,8 +78,8 @@
     //************ Handle SMT Parameters ***********/
     //**********************************************/
 
-    //Figure out fetch policy
-    if (lsqPolicy == SMTQueuePolicy::Dynamic) {
+    /* Run SMT policy checks. */
+        if (lsqPolicy == SMTQueuePolicy::Dynamic) {
         DPRINTF(LSQ, "LSQ sharing policy set to Dynamic\n");
     } else if (lsqPolicy == SMTQueuePolicy::Partitioned) {
         DPRINTF(Fetch, "LSQ sharing policy set to Partitioned: "
@@ -85,8 +87,8 @@
                 maxLQEntries,maxSQEntries);
     } else if (lsqPolicy == SMTQueuePolicy::Threshold) {
 
-        assert(params->smtLSQThreshold > LQEntries);
-        assert(params->smtLSQThreshold > SQEntries);
+        assert(params->smtLSQThreshold > params->LQEntries);
+        assert(params->smtLSQThreshold > params->SQEntries);
 
         DPRINTF(LSQ, "LSQ sharing policy set to Threshold: "
                 "%i entries per LQ | %i entries per SQ\n",
@@ -163,79 +165,41 @@
 void
 LSQ<Impl>::takeOverFrom()
 {
+    usedStorePorts = 0;
+    _cacheBlocked = false;
+
     for (ThreadID tid = 0; tid < numThreads; tid++) {
         thread[tid].takeOverFrom();
     }
 }
 
-template <class Impl>
-int
-LSQ<Impl>::entryAmount(ThreadID num_threads)
+template<class Impl>
+bool
+LSQ<Impl>::cacheBlocked() const
 {
-    if (lsqPolicy == SMTQueuePolicy::Partitioned) {
-        return LQEntries / num_threads;
-    } else {
-        return 0;
-    }
-}
-
-template <class Impl>
-void
-LSQ<Impl>::resetEntries()
-{
-    if (lsqPolicy != SMTQueuePolicy::Dynamic || numThreads > 1) {
-        int active_threads = activeThreads->size();
-
-        int maxEntries;
-
-        if (lsqPolicy == SMTQueuePolicy::Partitioned) {
-            maxEntries = LQEntries / active_threads;
-        } else if (lsqPolicy == SMTQueuePolicy::Threshold &&
-                   active_threads == 1) {
-            maxEntries = LQEntries;
-        } else {
-            maxEntries = LQEntries;
-        }
-
-        list<ThreadID>::iterator threads  = activeThreads->begin();
-        list<ThreadID>::iterator end = activeThreads->end();
-
-        while (threads != end) {
-            ThreadID tid = *threads++;
-
-            resizeEntries(maxEntries, tid);
-        }
-    }
+    return _cacheBlocked;
 }
 
 template<class Impl>
 void
-LSQ<Impl>::removeEntries(ThreadID tid)
+LSQ<Impl>::cacheBlocked(bool v)
 {
-    thread[tid].clearLQ();
-    thread[tid].clearSQ();
+    _cacheBlocked = v;
+}
+
+template<class Impl>
+bool
+LSQ<Impl>::storePortAvailable() const
+{
+    return usedStorePorts < cacheStorePorts;
 }
 
 template<class Impl>
 void
-LSQ<Impl>::resizeEntries(unsigned size, ThreadID tid)
+LSQ<Impl>::storePortBusy()
 {
-    thread[tid].resizeLQ(size);
-    thread[tid].resizeSQ(size);
-}
-
-template<class Impl>
-void
-LSQ<Impl>::tick()
-{
-    list<ThreadID>::iterator threads = activeThreads->begin();
-    list<ThreadID>::iterator end = activeThreads->end();
-
-    while (threads != end) {
-        ThreadID tid = *threads++;
-
-        thread[tid].tick();
-    }
+    usedStorePorts++;
+    assert(usedStorePorts <= cacheStorePorts);
 }
 
 template<class Impl>
@@ -316,6 +280,7 @@
 LSQ<Impl>::recvReqRetry()
 {
     iewStage->cacheUnblocked();
+    cacheBlocked(false);
 
     for (ThreadID tid : *activeThreads) {
         thread[tid].recvRetry();
@@ -323,6 +288,15 @@
 }
 
 template <class Impl>
+void
+LSQ<Impl>::completeDataAccess(PacketPtr pkt)
+{ // Forward pkt to the LSQ unit of the thread named by its sender state.
+    auto state = dynamic_cast<LSQSenderState*>(pkt->senderState);
+    panic_if(!state, "Got packet back with unknown sender state\n");
+    thread[cpu->contextToThread(state->contextId())].completeDataAccess(pkt);
+}
+
+template <class Impl>
 bool
 LSQ<Impl>::recvTimingResp(PacketPtr pkt)
 {
@@ -330,8 +304,10 @@
         DPRINTF(LSQ, "Got error packet back for address: %#X\n",
                 pkt->getAddr());
 
-    thread[cpu->contextToThread(pkt->req->contextId())]
-        .completeDataAccess(pkt);
+    auto senderState = dynamic_cast<LSQSenderState*>(pkt->senderState);
+    panic_if(!senderState, "Got packet back with unknown sender state\n");
+
+    thread[cpu->contextToThread(senderState->contextId())].recvTimingResp(pkt);
 
     if (pkt->isInvalidate()) {
         // This response also contains an invalidate; e.g. this can be the case
@@ -352,8 +328,9 @@
             thread[tid].checkSnoop(pkt);
         }
     }
+    // Update the LSQRequest state (this may delete the request)
+    senderState->request()->packetReplied();
 
-    delete pkt;
     return true;
 }
 
@@ -681,4 +658,442 @@
     }
 }
 
+static Addr
+addrBlockOffset(Addr addr, unsigned int block_size)
+{
+    return addr & (block_size - 1);
+}
+
+static Addr
+addrBlockAlign(Addr addr, uint64_t block_size)
+{
+    return addr & ~(block_size - 1);
+}
+
+static bool
+transferNeedsBurst(Addr addr, uint64_t size, uint64_t block_size)
+{
+    return (addrBlockOffset(addr, block_size) + size) > block_size;
+}
+
+template<class Impl>
+Fault
+LSQ<Impl>::pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data,
+                       unsigned int size, Addr addr, Request::Flags flags,
+                       uint64_t *res)
+{
+    ThreadID tid = cpu->contextToThread(inst->contextId());
+    auto cacheLineSize = cpu->cacheLineSize();
+    bool needs_burst = transferNeedsBurst(addr, size, cacheLineSize);
+    LSQRequest* req = nullptr;
+
+    if (inst->translationStarted()) {
+        req = inst->savedReq;
+        assert(req);
+    } else {
+        if (needs_burst) {
+            req = new SplitDataRequest(&thread[tid], inst, isLoad, addr,
+                    size, flags, data, res);
+        } else {
+            req = new SingleDataRequest(&thread[tid], inst, isLoad, addr,
+                    size, flags, data, res);
+        }
+        assert(req);
+        inst->setRequest();
+        req->taskId(cpu->taskId());
+
+        req->initiateTranslation();
+    }
+
+    /* This is the place where instructions get the effAddr. */
+    if (req->isTranslationComplete()) {
+        if (inst->getFault() == NoFault) {
+            inst->effAddr = req->getVaddr();
+            inst->effSize = size;
+            inst->effAddrValid(true);
+
+            if (cpu->checker) {
+                inst->reqToVerify = std::make_shared<Request>(*req->request());
+            }
+            if (isLoad)
+                inst->getFault() = cpu->read(req, inst->lqIdx);
+            else
+                inst->getFault() = cpu->write(req, data, inst->sqIdx);
+        } else if (isLoad) {
+            // Commit will have to clean up whatever happened.  Set this
+            // instruction as executed.
+            inst->setExecuted();
+        }
+    }
+
+    if (inst->traceData)
+        inst->traceData->setMem(addr, size, flags);
+
+    return inst->getFault();
+}
+
+template<class Impl>
+void
+LSQ<Impl>::SingleDataRequest::finish(const Fault &fault, const RequestPtr &req,
+        ThreadContext* tc, BaseTLB::Mode mode)
+{
+    _fault.push_back(fault);
+    numInTranslationFragments = 0;
+    numTranslatedFragments = 1;
+    /* If the instruction has been squashed, let the request know
+     * as it may have to self-destruct. */
+    if (_inst->isSquashed()) {
+        this->squashTranslation();
+    } else {
+        _inst->strictlyOrdered(req->isStrictlyOrdered());
+
+        flags.set(Flag::TranslationFinished);
+        if (fault == NoFault) {
+            _inst->physEffAddr = req->getPaddr();
+            _inst->memReqFlags = req->getFlags();
+            if (req->isCondSwap()) {
+                assert(_res);
+                req->setExtraData(*_res);
+            }
+            setState(State::Request);
+        } else {
+            setState(State::Fault);
+        }
+
+        LSQRequest::_inst->fault = fault;
+        LSQRequest::_inst->translationCompleted(true);
+    }
+}
+
+template<class Impl>
+void
+LSQ<Impl>::SplitDataRequest::finish(const Fault &fault, const RequestPtr &req,
+        ThreadContext* tc, BaseTLB::Mode mode)
+{
+    _fault.push_back(fault);
+    assert(req == _requests[numTranslatedFragments] || this->isDelayed());
+
+    numInTranslationFragments--;
+    numTranslatedFragments++;
+
+    mainReq->setFlags(req->getFlags());
+
+    if (numTranslatedFragments == _requests.size()) {
+        if (_inst->isSquashed()) {
+            this->squashTranslation();
+        } else {
+            _inst->strictlyOrdered(mainReq->isStrictlyOrdered());
+            flags.set(Flag::TranslationFinished);
+            auto fault_it = _fault.begin();
+            /* Skip the leading NoFaults, stopping at the first fault. */
+            while (fault_it != _fault.end() && *fault_it == NoFault)
+                fault_it++;
+            /* If none of the fragments faulted: */
+            if (fault_it == _fault.end()) {
+                _inst->physEffAddr = request(0)->getPaddr();
+
+                _inst->memReqFlags = mainReq->getFlags();
+                if (mainReq->isCondSwap()) {
+                    assert(_res);
+                    mainReq->setExtraData(*_res);
+                }
+                setState(State::Request);
+                _inst->fault = NoFault;
+            } else {
+                setState(State::Fault);
+                _inst->fault = *fault_it;
+            }
+            _inst->translationCompleted(true);
+        }
+    }
+}
+
+template<class Impl>
+void
+LSQ<Impl>::SingleDataRequest::initiateTranslation()
+{
+    // Record the start of translation on the instruction and the request.
+    _inst->translationStarted(true);
+    setState(State::Translation);
+    flags.set(Flag::TranslationStarted);
+
+    // Nothing to do after the hand-off: the dTLB gets this request as its
+    // translation object, and pushRequest() polls isTranslationComplete().
+    _inst->savedReq = this;
+    sendFragmentToTranslation(0);
+}
+
+template<class Impl>
+PacketPtr
+LSQ<Impl>::SplitDataRequest::mainPacket()
+{
+    return _mainPacket;
+}
+
+template<class Impl>
+RequestPtr
+LSQ<Impl>::SplitDataRequest::mainRequest()
+{
+    return mainReq;
+}
+
+template<class Impl>
+void
+LSQ<Impl>::SplitDataRequest::initiateTranslation()
+{
+    _inst->translationStarted(true);
+    setState(State::Translation);
+    flags.set(Flag::TranslationStarted);
+    // Split [_addr, _addr + _size) into fragments at cache-line boundaries.
+    unsigned int cacheLineSize = _port.cacheLineSize();
+    Addr base_addr = _addr;
+    Addr next_addr = addrBlockAlign(_addr + cacheLineSize, cacheLineSize);
+    Addr final_addr = addrBlockAlign(_addr + _size, cacheLineSize);
+    uint32_t size_so_far = 0;
+
+    mainReq = std::make_shared<Request>(_inst->getASID(), base_addr,
+                _size, _flags, _inst->masterId(),
+                _inst->instAddr(), _inst->contextId());
+
+    // Paddr is not used in mainReq. However, we will accumulate the flags
+    // from the sub requests into mainReq by calling setFlags() in finish().
+    // setFlags() assumes that paddr is set so flip the paddr valid bit here to
+    // avoid a potential assert in setFlags() when we call it from  finish().
+    mainReq->setPaddr(0);
+
+    /* Get the pre-fix, possibly unaligned. */
+    _requests.push_back(std::make_shared<Request>(_inst->getASID(), base_addr,
+                next_addr - base_addr, _flags, _inst->masterId(),
+                _inst->instAddr(), _inst->contextId()));
+    size_so_far = next_addr - base_addr;
+
+    /* We are block aligned now, reading whole blocks. */
+    base_addr = next_addr;
+    while (base_addr != final_addr) {
+        _requests.push_back(std::make_shared<Request>(_inst->getASID(),
+                    base_addr, cacheLineSize, _flags, _inst->masterId(),
+                    _inst->instAddr(), _inst->contextId()));
+        size_so_far += cacheLineSize;
+        base_addr += cacheLineSize;
+    }
+
+    /* Deal with the tail. */
+    if (size_so_far < _size) {
+        _requests.push_back(std::make_shared<Request>(_inst->getASID(),
+                    base_addr, _size - size_so_far, _flags, _inst->masterId(),
+                    _inst->instAddr(), _inst->contextId()));
+    }
+
+    /* Setup the requests and send them to translation. */
+    for (auto& r: _requests) {
+        r->setReqInstSeqNum(_inst->seqNum);
+        r->taskId(_taskId);
+    }
+    this->_inst->savedReq = this;
+    numInTranslationFragments = 0;
+    numTranslatedFragments = 0;
+    // Issue every fragment; sendFragmentToTranslation() bumps the counter.
+    for (uint32_t i = 0; i < _requests.size(); i++) {
+        sendFragmentToTranslation(i);
+    }
+}
+
+template<class Impl>
+void
+LSQ<Impl>::LSQRequest::sendFragmentToTranslation(int i)
+{
+    // Count fragment i as in flight and hand it to the TLB (timing mode).
+    const auto mode = this->isLoad() ? BaseTLB::Read : BaseTLB::Write;
+    ++numInTranslationFragments;
+    _port.dTLB()->translateTiming(this->request(i),
+                                  this->_inst->thread->getTC(), this, mode);
+}
+
+template<class Impl>
+bool
+LSQ<Impl>::SingleDataRequest::recvTimingResp(PacketPtr pkt)
+{
+    assert(_numOutstandingPackets == 1);  // exactly one packet in flight
+    auto state = dynamic_cast<LSQSenderState*>(pkt->senderState);
+    setState(State::Complete);
+    flags.set(Flag::Complete);
+    state->outstanding--;  // NOTE(review): cast result is not null-checked
+    assert(pkt == _packets.front());
+    _port.completeDataAccess(pkt);
+    return true;  // this function has no other return value
+}
+
+template<class Impl>
+bool
+LSQ<Impl>::SplitDataRequest::recvTimingResp(PacketPtr pkt)
+{
+    auto state = dynamic_cast<LSQSenderState*>(pkt->senderState);
+    uint32_t pktIdx = 0;  // position of pkt within _packets
+    while (pktIdx < _packets.size() && pkt != _packets[pktIdx])
+        pktIdx++;
+    assert(pktIdx < _packets.size());
+    assert(pkt->req == _requests[pktIdx]);
+    assert(pkt == _packets[pktIdx]);
+    numReceivedPackets++;
+    state->outstanding--;  // NOTE(review): cast result is not null-checked
+    if (numReceivedPackets == _packets.size()) {  // last fragment arrived
+        setState(State::Complete);
+        flags.set(Flag::Complete);
+        /* Present the fragments as one response built on mainReq. */
+        PacketPtr resp = isLoad()
+            ? Packet::createRead(mainReq)
+            : Packet::createWrite(mainReq);
+        if (isLoad())
+            resp->dataStatic(_inst->memData);
+        else
+            resp->dataStatic(_data);
+        resp->senderState = _senderState;
+        _port.completeDataAccess(resp);
+        delete resp;  // temporary; entries of _packets are not freed here
+    }
+    return true;
+}
+
+template<class Impl>
+void
+LSQ<Impl>::SingleDataRequest::buildPackets()
+{
+    assert(_senderState);
+    /* The packet is built once; later calls (retries) find it in
+     * _packets and leave it untouched. */
+    if (_packets.size() == 0) {
+        PacketPtr pkt = isLoad() ? Packet::createRead(request())
+                                 : Packet::createWrite(request());
+        pkt->dataStatic(_inst->memData);
+        pkt->senderState = _senderState;
+        _packets.push_back(pkt);
+    }
+    assert(_packets.size() == 1);
+}
+
+template<class Impl>
+void
+LSQ<Impl>::SplitDataRequest::buildPackets()
+{
+    /* Byte offset of the current fragment within _inst->memData. */
+    ptrdiff_t offset = 0;
+    if (_packets.size() == 0) {
+        /* Packets are only built when none exist yet. */
+        if (isLoad()) {
+            _mainPacket = Packet::createRead(mainReq);
+            _mainPacket->dataStatic(_inst->memData);
+        }
+        for (auto& r: _requests) {
+            PacketPtr pkt = isLoad() ? Packet::createRead(r)
+                                    : Packet::createWrite(r);
+            if (isLoad()) {
+                pkt->dataStatic(_inst->memData + offset);
+            } else {
+                uint8_t* req_data = new uint8_t[r->getSize()];
+                std::memcpy(req_data,
+                        _inst->memData + offset,
+                        r->getSize());
+                pkt->dataDynamic(req_data);
+            }
+            offset += r->getSize();
+            pkt->senderState = _senderState;
+            _packets.push_back(pkt);
+        }
+    }
+    assert(_packets.size() == _requests.size());
+}
+
+template<class Impl>
+void
+LSQ<Impl>::SingleDataRequest::sendPacketToCache()
+{
+    assert(_numOutstandingPackets == 0);  // nothing may be in flight yet
+    if (lsqUnit()->trySendPacket(isLoad(), _packets.at(0)))
+        _numOutstandingPackets = 1;  // accepted: one packet now in flight
+}
+
+template<class Impl>
+void
+LSQ<Impl>::SplitDataRequest::sendPacketToCache()
+{
+    /* Send not-yet-sent packets in order; stop at the first refusal. */
+    while (numReceivedPackets + _numOutstandingPackets < _packets.size() &&
+            lsqUnit()->trySendPacket(isLoad(),
+                _packets.at(numReceivedPackets + _numOutstandingPackets))) {
+        _numOutstandingPackets++;
+    }
+}
+
+template<class Impl>
+void
+LSQ<Impl>::SingleDataRequest::handleIprWrite(ThreadContext *thread,
+                                             PacketPtr pkt)
+{
+    TheISA::handleIprWrite(thread, pkt);  // packet is forwarded unchanged
+}
+
+template<class Impl>
+void
+LSQ<Impl>::SplitDataRequest::handleIprWrite(ThreadContext *thread,
+                                            PacketPtr mainPkt)
+{
+    unsigned offset = 0;  // byte offset of the fragment inside mainPkt
+    for (const auto& r: _requests) {  // by reference: no RequestPtr copy
+        PacketPtr pkt = new Packet(r, MemCmd::WriteReq);
+        pkt->dataStatic(mainPkt->getPtr<uint8_t>() + offset);
+        TheISA::handleIprWrite(thread, pkt);  // one IPR write per fragment
+        offset += r->getSize();
+        delete pkt;  // per-fragment packet is temporary
+    }
+}
+
+template<class Impl>
+Cycles
+LSQ<Impl>::SingleDataRequest::handleIprRead(ThreadContext *thread,
+                                            PacketPtr pkt)
+{
+    return TheISA::handleIprRead(thread, pkt);  // forwarded unchanged
+}
+
+template<class Impl>
+Cycles
+LSQ<Impl>::SplitDataRequest::handleIprRead(ThreadContext *thread,
+                                           PacketPtr mainPkt)
+{
+    Cycles delay(0);
+    unsigned offset = 0;  // byte offset of the fragment inside mainPkt
+    // Read each fragment into its slice of mainPkt; report the worst delay.
+    for (const auto& r: _requests) {  // by reference: no RequestPtr copy
+        PacketPtr pkt = new Packet(r, MemCmd::ReadReq);
+        pkt->dataStatic(mainPkt->getPtr<uint8_t>() + offset);
+        Cycles d = TheISA::handleIprRead(thread, pkt);
+        if (d > delay)
+            delay = d;
+        offset += r->getSize();
+        delete pkt;  // per-fragment packet is temporary
+    }
+    return delay;
+}
+
+template<class Impl>
+bool
+LSQ<Impl>::SingleDataRequest::isCacheBlockHit(Addr blockAddr, Addr blockMask)
+{
+    return (LSQRequest::_requests[0]->getPaddr() & blockMask) == blockAddr;
+}
+
+template<class Impl>
+bool
+LSQ<Impl>::SplitDataRequest::isCacheBlockHit(Addr blockAddr, Addr blockMask)
+{
+    // The access hits the block as soon as one fragment's physical
+    // address, masked with blockMask, equals blockAddr.
+    for (const auto &fragment: _requests) {
+        if ((fragment->getPaddr() & blockMask) == blockAddr)
+            return true;
+    }
+    // None of the fragments falls in the requested block.
+    return false;
+}
+
 #endif//__CPU_O3_LSQ_IMPL_HH__
diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
index 48a06b3..5b90da4 100644
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012-2014,2017 ARM Limited
+ * Copyright (c) 2012-2014,2017-2018 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -62,6 +62,7 @@
 #include "mem/port.hh"
 
 struct DerivO3CPUParams;
+#include "base/circular_queue.hh"
 
 /**
  * Class that implements the actual LQ and SQ for each specific
@@ -76,7 +77,8 @@
  * replayed.
  */
 template <class Impl>
-class LSQUnit {
+class LSQUnit
+{
   public:
     typedef typename Impl::O3CPU O3CPU;
     typedef typename Impl::DynInstPtr DynInstPtr;
@@ -84,6 +86,130 @@
     typedef typename Impl::CPUPol::LSQ LSQ;
     typedef typename Impl::CPUPol::IssueStruct IssueStruct;
 
+    using LSQSenderState = typename LSQ::LSQSenderState;
+    using LSQRequest = typename Impl::CPUPol::LSQ::LSQRequest;
+  private:
+    class LSQEntry
+    {
+      private:
+        /** The instruction. */
+        DynInstPtr inst;
+        /** The request; released through freeLSQEntry(), not deleted. */
+        LSQRequest* req;
+        /** The size of the operation. */
+        uint8_t _size;
+        /** True between set() and clear(). */
+        bool _valid;
+      public:
+        /** Constructs an empty, invalid entry with no request. */
+        LSQEntry()
+            : inst(nullptr), req(nullptr), _size(0), _valid(false)
+        {
+        }
+        /** Drops the instruction and calls freeLSQEntry() on any request. */
+        ~LSQEntry()
+        {
+            inst = nullptr;
+            if (req != nullptr) {
+                req->freeLSQEntry();
+                req = nullptr;
+            }
+        }
+        /** Resets the entry; any request gets freeLSQEntry() first. */
+        void
+        clear()
+        {
+            inst = nullptr;
+            if (req != nullptr) {
+                req->freeLSQEntry();
+            }
+            req = nullptr;
+            _valid = false;
+            _size = 0;
+        }
+        /** Binds an instruction to an entry that must not be valid yet. */
+        void
+        set(const DynInstPtr& inst)
+        {
+            assert(!_valid);
+            this->inst = inst;
+            _valid = true;
+            _size = 0;
+        }
+        LSQRequest* request() { return req; }
+        void setRequest(LSQRequest* r) { req = r; }
+        bool hasRequest() { return req != nullptr; }
+        /** Member accessors. */
+        /** @{ */
+        bool valid() const { return _valid; }
+        uint8_t& size() { return _size; }
+        const uint8_t& size() const { return _size; }
+        const DynInstPtr& instruction() const { return inst; }
+        /** @} */
+    };
+
+    class SQEntry : public LSQEntry
+    {
+      private:
+        /** The store data. */
+        char _data[64];  // TODO: 64 should become a parameter
+        /** Whether or not the store can writeback. */
+        bool _canWB;
+        /** Whether or not the store is committed. */
+        bool _committed;
+        /** Whether or not the store is completed. */
+        bool _completed;
+        /** Does this request write all zeros and thus doesn't
+         * have any data attached to it. Used for cache block zero
+         * style instructs (ARM DC ZVA; ALPHA WH64)
+         */
+        bool _isAllZeros;
+      public:
+        static constexpr size_t DataSize = sizeof(_data);
+        /** Constructs an empty store queue entry with zeroed data. */
+        SQEntry()
+            : _canWB(false), _committed(false), _completed(false),
+              _isAllZeros(false)
+        {
+            std::memset(_data, 0, DataSize);
+        }
+        /** Empty body; the request is released by ~LSQEntry(). */
+        ~SQEntry()
+        {
+        }
+        /** Forwards to LSQEntry::set(); the store flags are not touched. */
+        void
+        set(const DynInstPtr& inst)
+        {
+            LSQEntry::set(inst);
+        }
+        /** Clears the entry and the store flags; _data is left as is. */
+        void
+        clear()
+        {
+            LSQEntry::clear();
+            _canWB = _completed = _committed = _isAllZeros = false;
+        }
+        /** Member accessors. */
+        /** @{ */
+        bool& canWB() { return _canWB; }
+        const bool& canWB() const { return _canWB; }
+        bool& completed() { return _completed; }
+        const bool& completed() const { return _completed; }
+        bool& committed() { return _committed; }
+        const bool& committed() const { return _committed; }
+        bool& isAllZeros() { return _isAllZeros; }
+        const bool& isAllZeros() const { return _isAllZeros; }
+        char* data() { return _data; }
+        const char* data() const { return _data; }
+        /** @} */
+    };
+    using LQEntry = LSQEntry;
+
+  public:
+    using LoadQueue = CircularQueue<LQEntry>;
+    using StoreQueue = CircularQueue<SQEntry>;
+
   public:
     /** Constructs an LSQ unit. init() must be called prior to use. */
     LSQUnit(uint32_t lqEntries, uint32_t sqEntries);
@@ -113,13 +239,6 @@
     /** Takes over from another CPU's thread. */
     void takeOverFrom();
 
-    /** Ticks the LSQ unit, which in this case only resets the number of
-     * used cache ports.
-     * @todo: Move the number of used ports up to the LSQ level so it can
-     * be shared by all LSQ units.
-     */
-    void tick() { usedStorePorts = 0; }
-
     /** Inserts an instruction. */
     void insert(const DynInstPtr &inst);
     /** Inserts a load instruction. */
@@ -133,7 +252,8 @@
      * @param load_idx index to start checking at
      * @param inst the instruction to check
      */
-    Fault checkViolations(int load_idx, const DynInstPtr &inst);
+    Fault checkViolations(typename LoadQueue::iterator& loadIt,
+            const DynInstPtr& inst);
 
     /** Check if an incoming invalidate hits in the lsq on a load
      * that might have issued out of order wrt another load beacuse
@@ -163,18 +283,6 @@
      * memory system. */
     void completeDataAccess(PacketPtr pkt);
 
-    /** Clears all the entries in the LQ. */
-    void clearLQ();
-
-    /** Clears all the entries in the SQ. */
-    void clearSQ();
-
-    /** Resizes the LQ to a given size. */
-    void resizeLQ(unsigned size);
-
-    /** Resizes the SQ to a given size. */
-    void resizeSQ(unsigned size);
-
     /** Squashes all instructions younger than a specific sequence number. */
     void squash(const InstSeqNum &squashed_num);
 
@@ -205,10 +313,10 @@
     bool isEmpty() const { return lqEmpty() && sqEmpty(); }
 
     /** Returns if the LQ is full. */
-    bool lqFull() { return loads >= (LQEntries - 1); }
+    bool lqFull() { return loadQueue.full(); }
 
     /** Returns if the SQ is full. */
-    bool sqFull() { return stores >= (SQEntries - 1); }
+    bool sqFull() { return storeQueue.full(); }
 
     /** Returns if the LQ is empty. */
     bool lqEmpty() const { return loads == 0; }
@@ -226,13 +334,20 @@
     int numStoresToWB() { return storesToWB; }
 
     /** Returns if the LSQ unit will writeback on this cycle. */
-    bool willWB() { return storeQueue[storeWBIdx].canWB &&
-                        !storeQueue[storeWBIdx].completed &&
-                        !isStoreBlocked; }
+    bool
+    willWB()
+    {
+        // True when storeWBIt is dereferenceable and its entry is valid,
+        // can write back, is not yet completed, and stores are not
+        // blocked.
+        return storeWBIt.dereferenceable() && storeWBIt->valid() &&
+            storeWBIt->canWB() && !storeWBIt->completed() && !isStoreBlocked;
+    }
 
     /** Handles doing the retry. */
     void recvRetry();
 
+    unsigned int cacheLineSize();
   private:
     /** Reset the LSQ state */
     void resetState();
@@ -240,31 +355,31 @@
     /** Writes back the instruction, sending it to IEW. */
     void writeback(const DynInstPtr &inst, PacketPtr pkt);
 
-    /** Writes back a store that couldn't be completed the previous cycle. */
-    void writebackPendingStore();
-
-    /** Handles completing the send of a store to memory. */
-    void storePostSend(PacketPtr pkt);
+    /** Try to finish a previously blocked write back attempt */
+    void writebackBlockedStore();
 
     /** Completes the store at the specified index. */
-    void completeStore(int store_idx);
+    void completeStore(typename StoreQueue::iterator store_idx);
 
-    /** Attempts to send a store to the cache. */
-    bool sendStore(PacketPtr data_pkt);
-
-    /** Increments the given store index (circular queue). */
-    inline void incrStIdx(int &store_idx) const;
-    /** Decrements the given store index (circular queue). */
-    inline void decrStIdx(int &store_idx) const;
-    /** Increments the given load index (circular queue). */
-    inline void incrLdIdx(int &load_idx) const;
-    /** Decrements the given load index (circular queue). */
-    inline void decrLdIdx(int &load_idx) const;
+    /** Handles completing the send of a store to memory. */
+    void storePostSend();
 
   public:
+    /** Attempts to send a packet to the cache.
+     * Check if there are ports available. Return true if
+     * there are, false if there are not.
+     */
+    bool trySendPacket(bool isLoad, PacketPtr data_pkt);
+
+
     /** Debugging function to dump instructions in the LSQ. */
     void dumpInsts() const;
 
+    /** Schedule event for the cpu. */
+    void schedule(Event& ev, Tick when) { cpu->schedule(ev, when); }
+
+    BaseTLB* dTLB() { return cpu->dtb; }
+
   private:
     /** Pointer to the CPU. */
     O3CPU *cpu;
@@ -278,44 +393,46 @@
     /** Pointer to the dcache port.  Used only for sending. */
     MasterPort *dcachePort;
 
-    /** Derived class to hold any sender state the LSQ needs. */
-    class LSQSenderState : public Packet::SenderState
+    /** Particularisation of the LSQSenderState to the LQ. */
+    class LQSenderState : public LSQSenderState
     {
+        using LSQSenderState::alive;
       public:
-        /** Default constructor. */
-        LSQSenderState()
-            : mainPkt(NULL), pendingPacket(NULL), idx(0), outstanding(1),
-              isLoad(false), noWB(false), isSplit(false),
-              pktToSend(false), cacheBlocked(false)
-          { }
+        LQSenderState(typename LoadQueue::iterator idx_)
+            : LSQSenderState(idx_->request(), true), idx(idx_) { }
 
-        /** Instruction who initiated the access to memory. */
-        DynInstPtr inst;
-        /** The main packet from a split load, used during writeback. */
-        PacketPtr mainPkt;
-        /** A second packet from a split store that needs sending. */
-        PacketPtr pendingPacket;
-        /** The LQ/SQ index of the instruction. */
-        uint8_t idx;
-        /** Number of outstanding packets to complete. */
-        uint8_t outstanding;
-        /** Whether or not it is a load. */
-        bool isLoad;
-        /** Whether or not the instruction will need to writeback. */
-        bool noWB;
-        /** Whether or not this access is split in two. */
-        bool isSplit;
-        /** Whether or not there is a packet that needs sending. */
-        bool pktToSend;
-        /** Whether or not the second packet of this split load was blocked */
-        bool cacheBlocked;
+        /** Iterator to the LQ entry of the instruction. */
+        typename LoadQueue::iterator idx;
+        //virtual LSQRequest* request() { return idx->request(); }
+        virtual void
+        complete()
+        {
+            //if (alive())
+            //  idx->request()->senderState(nullptr);
+        }
+    };
 
-        /** Completes a packet and returns whether the access is finished. */
-        inline bool complete() { return --outstanding == 0; }
+    /** Particularisation of the LSQSenderState to the SQ. */
+    class SQSenderState : public LSQSenderState
+    {
+        using LSQSenderState::alive;
+      public:
+        SQSenderState(typename StoreQueue::iterator idx_)
+            : LSQSenderState(idx_->request(), false), idx(idx_) { }
+        /** Iterator to the SQ entry of the instruction. */
+        typename StoreQueue::iterator idx;
+        //virtual LSQRequest* request() { return idx->request(); }
+        virtual void
+        complete()
+        {
+            //if (alive())
+            //   idx->request()->senderState(nullptr);
+        }
     };
 
     /** Writeback event, specifically for when stores forward data to loads. */
-    class WritebackEvent : public Event {
+    class WritebackEvent : public Event
+    {
       public:
         /** Constructs a writeback event. */
         WritebackEvent(const DynInstPtr &_inst, PacketPtr pkt,
@@ -339,72 +456,25 @@
     };
 
   public:
-    struct SQEntry {
-        /** Constructs an empty store queue entry. */
-        SQEntry()
-            : inst(NULL), req(NULL), size(0),
-              canWB(0), committed(0), completed(0)
-        {
-            std::memset(data, 0, sizeof(data));
-        }
-
-        ~SQEntry()
-        {
-            inst = NULL;
-        }
-
-        /** Constructs a store queue entry for a given instruction. */
-        SQEntry(const DynInstPtr &_inst)
-            : inst(_inst), req(NULL), sreqLow(NULL), sreqHigh(NULL), size(0),
-              isSplit(0), canWB(0), committed(0), completed(0), isAllZeros(0)
-        {
-            std::memset(data, 0, sizeof(data));
-        }
-        /** The store data. */
-        char data[16];
-        /** The store instruction. */
-        DynInstPtr inst;
-        /** The request for the store. */
-        RequestPtr req;
-        /** The split requests for the store. */
-        RequestPtr sreqLow;
-        RequestPtr sreqHigh;
-        /** The size of the store. */
-        uint8_t size;
-        /** Whether or not the store is split into two requests. */
-        bool isSplit;
-        /** Whether or not the store can writeback. */
-        bool canWB;
-        /** Whether or not the store is committed. */
-        bool committed;
-        /** Whether or not the store is completed. */
-        bool completed;
-        /** Does this request write all zeros and thus doesn't
-         * have any data attached to it. Used for cache block zero
-         * style instructs (ARM DC ZVA; ALPHA WH64)
-         */
-        bool isAllZeros;
-    };
+    /**
+     * Handles writing back and completing the load or store that has
+     * returned from memory.
+     *
+     * @param pkt Response packet from the memory sub-system
+     */
+    bool recvTimingResp(PacketPtr pkt);
 
   private:
     /** The LSQUnit thread id. */
     ThreadID lsqID;
-
+  public:
     /** The store queue. */
-    std::vector<SQEntry> storeQueue;
+    StoreQueue storeQueue;
 
     /** The load queue. */
-    std::vector<DynInstPtr> loadQueue;
+    LoadQueue loadQueue;
 
-    /** The number of LQ entries, plus a sentinel entry (circular queue).
-     *  @todo: Consider having var that records the true number of LQ entries.
-     */
-    unsigned LQEntries;
-    /** The number of SQ entries, plus a sentinel entry (circular queue).
-     *  @todo: Consider having var that records the true number of SQ entries.
-     */
-    unsigned SQEntries;
-
+  private:
     /** The number of places to shift addresses in the LSQ before checking
      * for dependency violations
      */
@@ -420,28 +490,10 @@
     /** The number of store instructions in the SQ waiting to writeback. */
     int storesToWB;
 
-    /** The index of the head instruction in the LQ. */
-    int loadHead;
-    /** The index of the tail instruction in the LQ. */
-    int loadTail;
-
-    /** The index of the head instruction in the SQ. */
-    int storeHead;
     /** The index of the first instruction that may be ready to be
      * written back, and has not yet been written back.
      */
-    int storeWBIdx;
-    /** The index of the tail instruction in the SQ. */
-    int storeTail;
-
-    /// @todo Consider moving to a more advanced model with write vs read ports
-    /** The number of cache ports available each cycle (stores only). */
-    int cacheStorePorts;
-
-    /** The number of used cache ports in this cycle by stores. */
-    int usedStorePorts;
-
-    //list<InstSeqNum> mshrSeqNums;
+    typename StoreQueue::iterator storeWBIt;
 
     /** Address Mask for a cache block (e.g. ~(cache_block_size-1)) */
     Addr cacheBlockMask;
@@ -472,10 +524,10 @@
 
     /** Whether or not there is a packet that couldn't be sent because of
      * a lack of cache ports. */
-    bool hasPendingPkt;
+    bool hasPendingRequest;
 
     /** The packet that is pending free cache ports. */
-    PacketPtr pendingPkt;
+    LSQRequest* pendingRequest;
 
     /** Flag for memory model. */
     bool needsTSO;
@@ -516,53 +568,51 @@
 
   public:
     /** Executes the load at the given index. */
-    Fault read(const RequestPtr &req,
-               RequestPtr &sreqLow, RequestPtr &sreqHigh,
-               int load_idx);
+    Fault read(LSQRequest *req, int load_idx);
 
     /** Executes the store at the given index. */
-    Fault write(const RequestPtr &req,
-                const RequestPtr &sreqLow, const RequestPtr &sreqHigh,
-                uint8_t *data, int store_idx);
+    Fault write(LSQRequest *req, uint8_t *data, int store_idx);
 
     /** Returns the index of the head load instruction. */
-    int getLoadHead() { return loadHead; }
-    /** Returns the sequence number of the head load instruction. */
-    InstSeqNum getLoadHeadSeqNum()
-    {
-        if (loadQueue[loadHead]) {
-            return loadQueue[loadHead]->seqNum;
-        } else {
-            return 0;
-        }
+    int getLoadHead() { return loadQueue.head(); }
 
+    /** Returns the sequence number of the head load instruction. */
+    InstSeqNum
+    getLoadHeadSeqNum()
+    {
+        return loadQueue.front().valid()
+            ? loadQueue.front().instruction()->seqNum
+            : 0;
     }
 
     /** Returns the index of the head store instruction. */
-    int getStoreHead() { return storeHead; }
+    int getStoreHead() { return storeQueue.head(); }
     /** Returns the sequence number of the head store instruction. */
-    InstSeqNum getStoreHeadSeqNum()
+    InstSeqNum
+    getStoreHeadSeqNum()
     {
-        if (storeQueue[storeHead].inst) {
-            return storeQueue[storeHead].inst->seqNum;
-        } else {
-            return 0;
-        }
-
+        return storeQueue.front().valid()
+            ? storeQueue.front().instruction()->seqNum
+            : 0;
     }
 
     /** Returns whether or not the LSQ unit is stalled. */
     bool isStalled()  { return stalled; }
+  public:
+    using LQIterator = typename LoadQueue::iterator;
+    using SQIterator = typename StoreQueue::iterator;
+    using LQueue = LoadQueue;
+    using SQueue = StoreQueue;
 };
 
 template <class Impl>
 Fault
-LSQUnit<Impl>::read(const RequestPtr &req,
-                    RequestPtr &sreqLow, RequestPtr &sreqHigh,
-                    int load_idx)
+LSQUnit<Impl>::read(LSQRequest *req, int load_idx)
 {
-    DynInstPtr load_inst = loadQueue[load_idx];
+    LQEntry& load_req = loadQueue[load_idx];
+    const DynInstPtr& load_inst = load_req.instruction();
 
+    load_req.setRequest(req);
     assert(load_inst);
 
     assert(!load_inst->isExecuted());
@@ -571,184 +621,188 @@
     // A bit of a hackish way to get strictly ordered accesses to work
     // only if they're at the head of the LSQ and are ready to commit
     // (at the head of the ROB too).
-    if (req->isStrictlyOrdered() &&
-        (load_idx != loadHead || !load_inst->isAtCommit())) {
+
+    if (req->mainRequest()->isStrictlyOrdered() &&
+        (load_idx != loadQueue.head() || !load_inst->isAtCommit())) {
+        // Tell IQ/mem dep unit that this instruction will need to be
+        // rescheduled eventually
         iewStage->rescheduleMemInst(load_inst);
+        load_inst->clearIssued();
+        load_inst->effAddrValid(false);
         ++lsqRescheduledLoads;
         DPRINTF(LSQUnit, "Strictly ordered load [sn:%lli] PC %s\n",
                 load_inst->seqNum, load_inst->pcState());
 
+        // Must delete request now that it wasn't handed off to
+        // memory.  This is quite ugly.  @todo: Figure out the proper
+        // place to really handle request deletes.
+        load_req.setRequest(nullptr);
+        req->discard();
         return std::make_shared<GenericISA::M5PanicFault>(
             "Strictly ordered load [sn:%llx] PC %s\n",
             load_inst->seqNum, load_inst->pcState());
     }
 
-    // Check the SQ for any previous stores that might lead to forwarding
-    int store_idx = load_inst->sqIdx;
-
-    int store_size = 0;
-
     DPRINTF(LSQUnit, "Read called, load idx: %i, store idx: %i, "
             "storeHead: %i addr: %#x%s\n",
-            load_idx, store_idx, storeHead, req->getPaddr(),
-            sreqLow ? " split" : "");
+            load_idx - 1, load_inst->sqIt._idx, storeQueue.head() - 1,
+            req->mainRequest()->getPaddr(), req->isSplit() ? " split" : "");
 
-    if (req->isLLSC()) {
-        assert(!sreqLow);
+    if (req->mainRequest()->isLLSC()) {
         // Disable recording the result temporarily.  Writing to misc
         // regs normally updates the result, but this is not the
         // desired behavior when handling store conditionals.
         load_inst->recordResult(false);
-        TheISA::handleLockedRead(load_inst.get(), req);
+        TheISA::handleLockedRead(load_inst.get(), req->mainRequest());
         load_inst->recordResult(true);
     }
 
-    if (req->isMmappedIpr()) {
+    if (req->mainRequest()->isMmappedIpr()) {
         assert(!load_inst->memData);
         load_inst->memData = new uint8_t[64];
 
         ThreadContext *thread = cpu->tcBase(lsqID);
-        Cycles delay(0);
-        PacketPtr data_pkt = new Packet(req, MemCmd::ReadReq);
+        PacketPtr main_pkt = new Packet(req->mainRequest(), MemCmd::ReadReq);
 
-        data_pkt->dataStatic(load_inst->memData);
-        if (!TheISA::HasUnalignedMemAcc || !sreqLow) {
-            delay = TheISA::handleIprRead(thread, data_pkt);
-        } else {
-            assert(sreqLow->isMmappedIpr() && sreqHigh->isMmappedIpr());
-            PacketPtr fst_data_pkt = new Packet(sreqLow, MemCmd::ReadReq);
-            PacketPtr snd_data_pkt = new Packet(sreqHigh, MemCmd::ReadReq);
+        Cycles delay = req->handleIprRead(thread, main_pkt);
 
-            fst_data_pkt->dataStatic(load_inst->memData);
-            snd_data_pkt->dataStatic(load_inst->memData + sreqLow->getSize());
-
-            delay = TheISA::handleIprRead(thread, fst_data_pkt);
-            Cycles delay2 = TheISA::handleIprRead(thread, snd_data_pkt);
-            if (delay2 > delay)
-                delay = delay2;
-
-            delete fst_data_pkt;
-            delete snd_data_pkt;
-        }
-        WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt, this);
+        WritebackEvent *wb = new WritebackEvent(load_inst, main_pkt, this);
         cpu->schedule(wb, cpu->clockEdge(delay));
         return NoFault;
     }
 
-    while (store_idx != -1) {
-        // End once we've reached the top of the LSQ
-        if (store_idx == storeWBIdx) {
-            break;
-        }
-
+    // Check the SQ for any previous stores that might lead to forwarding
+    auto store_it = load_inst->sqIt;
+    assert (store_it >= storeWBIt);
+    // End once we've reached the top of the LSQ
+    while (store_it != storeWBIt) {
         // Move the index to one younger
-        if (--store_idx < 0)
-            store_idx += SQEntries;
+        store_it--;
+        assert(store_it->valid());
+        assert(store_it->instruction()->seqNum < load_inst->seqNum);
+        int store_size = store_it->size();
 
-        assert(storeQueue[store_idx].inst);
+        // Cache maintenance instructions go down via the store
+        // path but they carry no data and they shouldn't be
+        // considered for forwarding
+        if (store_size != 0 && !store_it->instruction()->strictlyOrdered() &&
+            !(store_it->request()->mainRequest() &&
+              store_it->request()->mainRequest()->isCacheMaintenance())) {
+            assert(store_it->instruction()->effAddrValid());
 
-        store_size = storeQueue[store_idx].size;
+            // Check if the store data is within the lower and upper bounds of
+            // addresses that the request needs.
+            auto req_s = req->mainRequest()->getVaddr();
+            auto req_e = req_s + req->mainRequest()->getSize();
+            auto st_s = store_it->instruction()->effAddr;
+            auto st_e = st_s + store_size;
 
-        if (!store_size || storeQueue[store_idx].inst->strictlyOrdered() ||
-            (storeQueue[store_idx].req &&
-             storeQueue[store_idx].req->isCacheMaintenance())) {
-            // Cache maintenance instructions go down via the store
-            // path but they carry no data and they shouldn't be
-            // considered for forwarding
-            continue;
-        }
+            bool store_has_lower_limit = req_s >= st_s;
+            bool store_has_upper_limit = req_e <= st_e;
+            bool lower_load_has_store_part = req_s < st_e;
+            bool upper_load_has_store_part = req_e > st_s;
 
-        assert(storeQueue[store_idx].inst->effAddrValid());
+            // If the store's data has all of the data needed and the load
+            // isn't LLSC then
+            // we can forward.
+            if (store_has_lower_limit && store_has_upper_limit &&
+                !req->mainRequest()->isLLSC()) {
 
-        // Check if the store data is within the lower and upper bounds of
-        // addresses that the request needs.
-        bool store_has_lower_limit =
-            req->getVaddr() >= storeQueue[store_idx].inst->effAddr;
-        bool store_has_upper_limit =
-            (req->getVaddr() + req->getSize()) <=
-            (storeQueue[store_idx].inst->effAddr + store_size);
-        bool lower_load_has_store_part =
-            req->getVaddr() < (storeQueue[store_idx].inst->effAddr +
-                           store_size);
-        bool upper_load_has_store_part =
-            (req->getVaddr() + req->getSize()) >
-            storeQueue[store_idx].inst->effAddr;
+                // Get shift amount for offset into the store's data.
+                int shift_amt = req->mainRequest()->getVaddr() -
+                    store_it->instruction()->effAddr;
 
-        // If the store's data has all of the data needed and the load isn't
-        // LLSC, we can forward.
-        if (store_has_lower_limit && store_has_upper_limit && !req->isLLSC()) {
-            // Get shift amount for offset into the store's data.
-            int shift_amt = req->getVaddr() - storeQueue[store_idx].inst->effAddr;
+                // Allocate memory if this is the first time a load is issued.
+                if (!load_inst->memData) {
+                    load_inst->memData =
+                        new uint8_t[req->mainRequest()->getSize()];
+                }
+                if (store_it->isAllZeros())
+                    memset(load_inst->memData, 0,
+                            req->mainRequest()->getSize());
+                else
+                    memcpy(load_inst->memData,
+                        store_it->data() + shift_amt,
+                        req->mainRequest()->getSize());
 
-            // Allocate memory if this is the first time a load is issued.
-            if (!load_inst->memData) {
-                load_inst->memData = new uint8_t[req->getSize()];
-            }
-            if (storeQueue[store_idx].isAllZeros)
-                memset(load_inst->memData, 0, req->getSize());
-            else
-                memcpy(load_inst->memData,
-                    storeQueue[store_idx].data + shift_amt, req->getSize());
+                DPRINTF(LSQUnit, "Forwarding from store idx %i to load to "
+                        "addr %#x\n", store_it._idx,
+                        req->mainRequest()->getVaddr());
 
-            DPRINTF(LSQUnit, "Forwarding from store idx %i to load to "
-                    "addr %#x\n", store_idx, req->getVaddr());
+                PacketPtr data_pkt = new Packet(req->mainRequest(),
+                        MemCmd::ReadReq);
+                data_pkt->dataStatic(load_inst->memData);
 
-            PacketPtr data_pkt = new Packet(req, MemCmd::ReadReq);
-            data_pkt->dataStatic(load_inst->memData);
+                if (req->isAnyOutstandingRequest()) {
+                    assert(req->_numOutstandingPackets > 0);
+                    // There are memory request packets in flight already.
+                    // This may happen if the store was not complete the
+                    // first time this load got executed. Signal the
+                    // senderState that response packets should be discarded.
+                    req->discardSenderState();
+                }
 
-            WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt, this);
+                WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt,
+                        this);
 
-            // We'll say this has a 1 cycle load-store forwarding latency
-            // for now.
-            // @todo: Need to make this a parameter.
-            cpu->schedule(wb, curTick());
+                // We'll say this has a 1 cycle load-store forwarding latency
+                // for now.
+                // @todo: Need to make this a parameter.
+                cpu->schedule(wb, curTick());
 
-            ++lsqForwLoads;
-            return NoFault;
-        } else if (
-                (!req->isLLSC() &&
+                // Don't need to do anything special for split loads.
+                ++lsqForwLoads;
+
+                return NoFault;
+            } else if (
+                (!req->mainRequest()->isLLSC() &&
                  ((store_has_lower_limit && lower_load_has_store_part) ||
                   (store_has_upper_limit && upper_load_has_store_part) ||
                   (lower_load_has_store_part && upper_load_has_store_part))) ||
-                (req->isLLSC() &&
+                (req->mainRequest()->isLLSC() &&
                  ((store_has_lower_limit || upper_load_has_store_part) &&
                   (store_has_upper_limit || lower_load_has_store_part)))) {
-            // This is the partial store-load forwarding case where a store
-            // has only part of the load's data and the load isn't LLSC or
-            // the load is LLSC and the store has all or part of the load's
-            // data
+                // This is the partial store-load forwarding case where a store
+                // has only part of the load's data and the load isn't LLSC or
+                // the load is LLSC and the store has all or part of the load's
+                // data
 
-            // If it's already been written back, then don't worry about
-            // stalling on it.
-            if (storeQueue[store_idx].completed) {
-                panic("Should not check one of these");
-                continue;
+                // If it's already been written back, then don't worry about
+                // stalling on it.
+                if (store_it->completed()) {
+                    panic("Should not check one of these");
+                    continue;
+                }
+
+                // Must stall load and force it to retry, so long as it's the
+                // oldest load that needs to do so.
+                if (!stalled ||
+                    (stalled &&
+                     load_inst->seqNum <
+                     loadQueue[stallingLoadIdx].instruction()->seqNum)) {
+                    stalled = true;
+                    stallingStoreIsn = store_it->instruction()->seqNum;
+                    stallingLoadIdx = load_idx;
+                }
+
+                // Tell IQ/mem dep unit that this instruction will need to be
+                // rescheduled eventually
+                iewStage->rescheduleMemInst(load_inst);
+                load_inst->clearIssued();
+                load_inst->effAddrValid(false);
+                ++lsqRescheduledLoads;
+
+                // Do not generate a writeback event as this instruction is not
+                // complete.
+                DPRINTF(LSQUnit, "Load-store forwarding mis-match. "
+                        "Store idx %i to load addr %#x\n",
+                        store_it._idx, req->mainRequest()->getVaddr());
+
+                // Must discard the request.
+                req->discard();
+                load_req.setRequest(nullptr);
+                return NoFault;
             }
-
-            // Must stall load and force it to retry, so long as it's the oldest
-            // load that needs to do so.
-            if (!stalled ||
-                (stalled &&
-                 load_inst->seqNum <
-                 loadQueue[stallingLoadIdx]->seqNum)) {
-                stalled = true;
-                stallingStoreIsn = storeQueue[store_idx].inst->seqNum;
-                stallingLoadIdx = load_idx;
-            }
-
-            // Tell IQ/mem dep unit that this instruction will need to be
-            // rescheduled eventually
-            iewStage->rescheduleMemInst(load_inst);
-            load_inst->clearIssued();
-            ++lsqRescheduledLoads;
-
-            // Do not generate a writeback event as this instruction is not
-            // complete.
-            DPRINTF(LSQUnit, "Load-store forwarding mis-match. "
-                    "Store idx %i to load addr %#x\n",
-                    store_idx, req->getVaddr());
-
-            return NoFault;
         }
     }
 
@@ -758,40 +812,7 @@
 
     // Allocate memory if this is the first time a load is issued.
     if (!load_inst->memData) {
-        load_inst->memData = new uint8_t[req->getSize()];
-    }
-
-    // if we the cache is not blocked, do cache access
-    bool completedFirst = false;
-    PacketPtr data_pkt = Packet::createRead(req);
-    PacketPtr fst_data_pkt = NULL;
-    PacketPtr snd_data_pkt = NULL;
-
-    data_pkt->dataStatic(load_inst->memData);
-
-    LSQSenderState *state = new LSQSenderState;
-    state->isLoad = true;
-    state->idx = load_idx;
-    state->inst = load_inst;
-    data_pkt->senderState = state;
-
-    if (!TheISA::HasUnalignedMemAcc || !sreqLow) {
-        // Point the first packet at the main data packet.
-        fst_data_pkt = data_pkt;
-    } else {
-        // Create the split packets.
-        fst_data_pkt = Packet::createRead(sreqLow);
-        snd_data_pkt = Packet::createRead(sreqHigh);
-
-        fst_data_pkt->dataStatic(load_inst->memData);
-        snd_data_pkt->dataStatic(load_inst->memData + sreqLow->getSize());
-
-        fst_data_pkt->senderState = state;
-        snd_data_pkt->senderState = state;
-
-        state->isSplit = true;
-        state->outstanding = 2;
-        state->mainPkt = data_pkt;
+        load_inst->memData = new uint8_t[req->mainRequest()->getSize()];
     }
 
     // For now, load throughput is constrained by the number of
@@ -799,97 +820,46 @@
     // stores do).
     // @todo We should account for cache port contention
     // and arbitrate between loads and stores.
-    bool successful_load = true;
-    if (!dcachePort->sendTimingReq(fst_data_pkt)) {
-        successful_load = false;
-    } else if (TheISA::HasUnalignedMemAcc && sreqLow) {
-        completedFirst = true;
 
-        // The first packet was sent without problems, so send this one
-        // too. If there is a problem with this packet then the whole
-        // load will be squashed, so indicate this to the state object.
-        // The first packet will return in completeDataAccess and be
-        // handled there.
-        // @todo We should also account for cache port contention
-        // here.
-        if (!dcachePort->sendTimingReq(snd_data_pkt)) {
-            // The main packet will be deleted in completeDataAccess.
-            state->complete();
-            // Signify to 1st half that the 2nd half was blocked via state
-            state->cacheBlocked = true;
-            successful_load = false;
-        }
+    // If the cache is not blocked, do cache access
+    if (req->senderState() == nullptr) {
+        LQSenderState *state = new LQSenderState(
+                loadQueue.getIterator(load_idx));
+        state->isLoad = true;
+        state->inst = load_inst;
+        state->isSplit = req->isSplit();
+        req->senderState(state);
     }
-
-    // If the cache was blocked, or has become blocked due to the access,
-    // handle it.
-    if (!successful_load) {
-        if (!sreqLow) {
-            // Packet wasn't split, just delete main packet info
-            delete state;
-            delete data_pkt;
-        }
-
-        if (TheISA::HasUnalignedMemAcc && sreqLow) {
-            if (!completedFirst) {
-                // Split packet, but first failed.  Delete all state.
-                delete state;
-                delete data_pkt;
-                delete fst_data_pkt;
-                delete snd_data_pkt;
-                sreqLow.reset();
-                sreqHigh.reset();
-            } else {
-                // Can't delete main packet data or state because first packet
-                // was sent to the memory system
-                delete data_pkt;
-                delete snd_data_pkt;
-                sreqHigh.reset();
-            }
-        }
-
-        ++lsqCacheBlocked;
-
+    req->buildPackets();
+    req->sendPacketToCache();
+    if (!req->isSent())
         iewStage->blockMemInst(load_inst);
 
-        // No fault occurred, even though the interface is blocked.
-        return NoFault;
-    }
-
     return NoFault;
 }
 
 template <class Impl>
 Fault
-LSQUnit<Impl>::write(const RequestPtr &req,
-                     const RequestPtr &sreqLow, const RequestPtr &sreqHigh,
-                     uint8_t *data, int store_idx)
+LSQUnit<Impl>::write(LSQRequest *req, uint8_t *data, int store_idx)
 {
-    assert(storeQueue[store_idx].inst);
+    assert(storeQueue[store_idx].valid());
 
-    DPRINTF(LSQUnit, "Doing write to store idx %i, addr %#x"
-            " | storeHead:%i [sn:%i]\n",
-            store_idx, req->getPaddr(), storeHead,
-            storeQueue[store_idx].inst->seqNum);
+    DPRINTF(LSQUnit, "Doing write to store idx %i, addr %#x | storeHead:%i "
+            "[sn:%i]\n",
+            store_idx - 1, req->request()->getPaddr(), storeQueue.head() - 1,
+            storeQueue[store_idx].instruction()->seqNum);
 
-    storeQueue[store_idx].req = req;
-    storeQueue[store_idx].sreqLow = sreqLow;
-    storeQueue[store_idx].sreqHigh = sreqHigh;
-    unsigned size = req->getSize();
-    storeQueue[store_idx].size = size;
-    bool store_no_data = req->getFlags() & Request::STORE_NO_DATA;
-    storeQueue[store_idx].isAllZeros = store_no_data;
-    assert(size <= sizeof(storeQueue[store_idx].data) || store_no_data);
+    storeQueue[store_idx].setRequest(req);
+    unsigned size = req->_size;
+    storeQueue[store_idx].size() = size;
+    bool store_no_data =
+        req->mainRequest()->getFlags() & Request::STORE_NO_DATA;
+    storeQueue[store_idx].isAllZeros() = store_no_data;
+    assert(size <= SQEntry::DataSize || store_no_data);
 
-    // Split stores can only occur in ISAs with unaligned memory accesses.  If
-    // a store request has been split, sreqLow and sreqHigh will be non-null.
-    if (TheISA::HasUnalignedMemAcc && sreqLow) {
-        storeQueue[store_idx].isSplit = true;
-    }
-
-    if (!(req->getFlags() & Request::CACHE_BLOCK_ZERO) && \
-        !req->isCacheMaintenance())
-        memcpy(storeQueue[store_idx].data, data, size);
+    if (!(req->request()->getFlags() & Request::CACHE_BLOCK_ZERO) &&
+        !req->request()->isCacheMaintenance())
+        memcpy(storeQueue[store_idx].data(), data, size);
 
     // This function only writes the data to the store queue, so no fault
     // can happen here.
diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh
index 13b1487..9756a9e 100644
--- a/src/cpu/o3/lsq_unit_impl.hh
+++ b/src/cpu/o3/lsq_unit_impl.hh
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2010-2014, 2017 ARM Limited
+ * Copyright (c) 2010-2014, 2017-2018 ARM Limited
  * Copyright (c) 2013 Advanced Micro Devices, Inc.
  * All rights reserved
  *
@@ -66,6 +66,8 @@
     : Event(Default_Pri, AutoDelete),
       inst(_inst), pkt(_pkt), lsqPtr(lsq_ptr)
 {
+    assert(_inst->savedReq);
+    _inst->savedReq->writebackScheduled();
 }
 
 template<class Impl>
@@ -76,9 +78,8 @@
 
     lsqPtr->writeback(inst, pkt);
 
-    if (pkt->senderState)
-        delete pkt->senderState;
-
+    assert(inst->savedReq);
+    inst->savedReq->writebackDone();
     delete pkt;
 }
 
@@ -89,65 +90,61 @@
     return "Store writeback";
 }
 
+template <class Impl>
+bool
+LSQUnit<Impl>::recvTimingResp(PacketPtr pkt)
+{
+    auto senderState = dynamic_cast<LSQSenderState*>(pkt->senderState);
+    LSQRequest* req = senderState->request();
+    assert(req != nullptr);
+    bool ret = true;
+    /* Check that the request is still alive before any further action. */
+    if (senderState->alive()) {
+        ret = req->recvTimingResp(pkt);
+    } else {
+        senderState->outstanding--;
+    }
+    return ret;
+
+}
+
 template<class Impl>
 void
 LSQUnit<Impl>::completeDataAccess(PacketPtr pkt)
 {
     LSQSenderState *state = dynamic_cast<LSQSenderState *>(pkt->senderState);
     DynInstPtr inst = state->inst;
-    DPRINTF(IEW, "Writeback event [sn:%lli].\n", inst->seqNum);
-    DPRINTF(Activity, "Activity: Writeback event [sn:%lli].\n", inst->seqNum);
 
-    if (state->cacheBlocked) {
-        // This is the first half of a previous split load,
-        // where the 2nd half blocked, ignore this response
-        DPRINTF(IEW, "[sn:%lli]: Response from first half of earlier "
-                "blocked split load recieved. Ignoring.\n", inst->seqNum);
-        delete state;
-        return;
-    }
+    cpu->ppDataAccessComplete->notify(std::make_pair(inst, pkt));
 
-    // If this is a split access, wait until all packets are received.
-    if (TheISA::HasUnalignedMemAcc && !state->complete()) {
-        return;
-    }
+    /* Notify the sender state that the access is complete (for ownership
+     * tracking). */
+    state->complete();
 
     assert(!cpu->switchedOut());
     if (!inst->isSquashed()) {
-        if (!state->noWB) {
+        if (state->needWB) {
             // Only loads and store conditionals perform the writeback
             // after receving the response from the memory
             assert(inst->isLoad() || inst->isStoreConditional());
-            if (!TheISA::HasUnalignedMemAcc || !state->isSplit ||
-                !state->isLoad) {
-                writeback(inst, pkt);
-            } else {
-                writeback(inst, state->mainPkt);
+            writeback(inst, state->request()->mainPacket());
+            if (inst->isStore()) {
+                auto ss = dynamic_cast<SQSenderState*>(state);
+                ss->writebackDone();
+                completeStore(ss->idx);
             }
-        }
-
-        if (inst->isStore()) {
-            completeStore(state->idx);
+        } else if (inst->isStore()) {
+            completeStore(dynamic_cast<SQSenderState*>(state)->idx);
         }
     }
-
-    if (TheISA::HasUnalignedMemAcc && state->isSplit && state->isLoad) {
-        delete state->mainPkt;
-    }
-
-    pkt->req->setAccessLatency();
-    cpu->ppDataAccessComplete->notify(std::make_pair(inst, pkt));
-
-    delete state;
 }
 
 template <class Impl>
 LSQUnit<Impl>::LSQUnit(uint32_t lqEntries, uint32_t sqEntries)
     : lsqID(-1), storeQueue(sqEntries+1), loadQueue(lqEntries+1),
-      LQEntries(lqEntries+1), SQEntries(sqEntries+1),
       loads(0), stores(0), storesToWB(0), cacheBlockMask(0), stalled(false),
-      isStoreBlocked(false), storeInFlight(false), hasPendingPkt(false),
-      pendingPkt(nullptr)
+      isStoreBlocked(false), storeInFlight(false), hasPendingRequest(false),
+      pendingRequest(nullptr)
 {
 }
 
@@ -167,7 +164,6 @@
 
     depCheckShift = params->LSQDepCheckShift;
     checkLoads = params->LSQCheckLoads;
-    cacheStorePorts = params->cacheStorePorts;
     needsTSO = params->needsTSO;
 
     resetState();
@@ -180,11 +176,8 @@
 {
     loads = stores = storesToWB = 0;
 
-    loadHead = loadTail = 0;
 
-    storeHead = storeWBIdx = storeTail = 0;
-
-    usedStorePorts = 0;
+    storeWBIt = storeQueue.begin();
 
     retryPkt = NULL;
     memDepViolator = NULL;
@@ -259,24 +252,10 @@
 
 template<class Impl>
 void
-LSQUnit<Impl>::clearLQ()
-{
-    loadQueue.clear();
-}
-
-template<class Impl>
-void
-LSQUnit<Impl>::clearSQ()
-{
-    storeQueue.clear();
-}
-
-template<class Impl>
-void
 LSQUnit<Impl>::drainSanityCheck() const
 {
-    for (int i = 0; i < loadQueue.size(); ++i)
-        assert(!loadQueue[i]);
+    for (int i = 0; i < loadQueue.capacity(); ++i)
+        assert(!loadQueue[i].valid());
 
     assert(storesToWB == 0);
     assert(!retryPkt);
@@ -289,44 +268,6 @@
     resetState();
 }
 
-template<class Impl>
-void
-LSQUnit<Impl>::resizeLQ(unsigned size)
-{
-    unsigned size_plus_sentinel = size + 1;
-    assert(size_plus_sentinel >= LQEntries);
-
-    if (size_plus_sentinel > LQEntries) {
-        while (size_plus_sentinel > loadQueue.size()) {
-            DynInstPtr dummy;
-            loadQueue.push_back(dummy);
-            LQEntries++;
-        }
-    } else {
-        LQEntries = size_plus_sentinel;
-    }
-
-    assert(LQEntries <= 256);
-}
-
-template<class Impl>
-void
-LSQUnit<Impl>::resizeSQ(unsigned size)
-{
-    unsigned size_plus_sentinel = size + 1;
-    if (size_plus_sentinel > SQEntries) {
-        while (size_plus_sentinel > storeQueue.size()) {
-            SQEntry dummy;
-            storeQueue.push_back(dummy);
-            SQEntries++;
-        }
-    } else {
-        SQEntries = size_plus_sentinel;
-    }
-
-    assert(SQEntries <= 256);
-}
-
 template <class Impl>
 void
 LSQUnit<Impl>::insert(const DynInstPtr &inst)
@@ -348,44 +289,42 @@
 void
 LSQUnit<Impl>::insertLoad(const DynInstPtr &load_inst)
 {
-    assert((loadTail + 1) % LQEntries != loadHead);
-    assert(loads < LQEntries);
+    assert(!loadQueue.full());
+    assert(loads < loadQueue.capacity());
 
     DPRINTF(LSQUnit, "Inserting load PC %s, idx:%i [sn:%lli]\n",
-            load_inst->pcState(), loadTail, load_inst->seqNum);
+            load_inst->pcState(), loadQueue.tail(), load_inst->seqNum);
 
-    load_inst->lqIdx = loadTail;
+    /* Grow the queue. */
+    loadQueue.advance_tail();
 
-    if (stores == 0) {
-        load_inst->sqIdx = -1;
-    } else {
-        load_inst->sqIdx = storeTail;
-    }
+    load_inst->sqIt = storeQueue.end();
 
-    loadQueue[loadTail] = load_inst;
-
-    incrLdIdx(loadTail);
+    assert(!loadQueue.back().valid());
+    loadQueue.back().set(load_inst);
+    load_inst->lqIdx = loadQueue.tail();
+    load_inst->lqIt = loadQueue.getIterator(load_inst->lqIdx);
 
     ++loads;
 }
 
 template <class Impl>
 void
-LSQUnit<Impl>::insertStore(const DynInstPtr &store_inst)
+LSQUnit<Impl>::insertStore(const DynInstPtr& store_inst)
 {
     // Make sure it is not full before inserting an instruction.
-    assert((storeTail + 1) % SQEntries != storeHead);
-    assert(stores < SQEntries);
+    assert(!storeQueue.full());
+    assert(stores < storeQueue.capacity());
 
     DPRINTF(LSQUnit, "Inserting store PC %s, idx:%i [sn:%lli]\n",
-            store_inst->pcState(), storeTail, store_inst->seqNum);
+            store_inst->pcState(), storeQueue.tail(), store_inst->seqNum);
+    storeQueue.advance_tail();
 
-    store_inst->sqIdx = storeTail;
-    store_inst->lqIdx = loadTail;
+    store_inst->sqIdx = storeQueue.tail();
+    store_inst->lqIdx = loadQueue.moduloAdd(loadQueue.tail(), 1);
+    store_inst->lqIt = loadQueue.end();
 
-    storeQueue[storeTail] = SQEntry(store_inst);
-
-    incrStIdx(storeTail);
+    storeQueue.back().set(store_inst);
 
     ++stores;
 }
@@ -407,8 +346,9 @@
 {
         //LQ has an extra dummy entry to differentiate
         //empty/full conditions. Subtract 1 from the free entries.
-        DPRINTF(LSQUnit, "LQ size: %d, #loads occupied: %d\n", LQEntries, loads);
-        return LQEntries - loads - 1;
+        DPRINTF(LSQUnit, "LQ size: %d, #loads occupied: %d\n",
+                1 + loadQueue.capacity(), loads);
+        return loadQueue.capacity() - loads;
 }
 
 template <class Impl>
@@ -417,8 +357,9 @@
 {
         //SQ has an extra dummy entry to differentiate
         //empty/full conditions. Subtract 1 from the free entries.
-        DPRINTF(LSQUnit, "SQ size: %d, #stores occupied: %d\n", SQEntries, stores);
-        return SQEntries - stores - 1;
+        DPRINTF(LSQUnit, "SQ size: %d, #stores occupied: %d\n",
+                1 + storeQueue.capacity(), stores);
+        return storeQueue.capacity() - stores;
 
  }
 
@@ -429,11 +370,8 @@
     // Should only ever get invalidations in here
     assert(pkt->isInvalidate());
 
-    int load_idx = loadHead;
     DPRINTF(LSQUnit, "Got snoop for address %#x\n", pkt->getAddr());
 
-    // Only Invalidate packet calls checkSnoop
-    assert(pkt->isInvalidate());
     for (int x = 0; x < cpu->numContexts(); x++) {
         ThreadContext *tc = cpu->getContext(x);
         bool no_squash = cpu->thread[x]->noSquashFromTC;
@@ -442,44 +380,37 @@
         cpu->thread[x]->noSquashFromTC = no_squash;
     }
 
-    Addr invalidate_addr = pkt->getAddr() & cacheBlockMask;
-
-    DynInstPtr ld_inst = loadQueue[load_idx];
-    if (ld_inst) {
-        Addr load_addr_low = ld_inst->physEffAddrLow & cacheBlockMask;
-        Addr load_addr_high = ld_inst->physEffAddrHigh & cacheBlockMask;
-
-        // Check that this snoop didn't just invalidate our lock flag
-        if (ld_inst->effAddrValid() && (load_addr_low == invalidate_addr
-                                        || load_addr_high == invalidate_addr)
-            && ld_inst->memReqFlags & Request::LLSC)
-            TheISA::handleLockedSnoopHit(ld_inst.get());
-    }
-
-    // If this is the only load in the LSQ we don't care
-    if (load_idx == loadTail)
+    if (loadQueue.empty())
         return;
 
-    incrLdIdx(load_idx);
+    auto iter = loadQueue.begin();
+
+    Addr invalidate_addr = pkt->getAddr() & cacheBlockMask;
+
+    DynInstPtr ld_inst = iter->instruction();
+    assert(ld_inst);
+    LSQRequest *req = iter->request();
+
+    // Check that this snoop didn't just invalidate our lock flag
+    if (ld_inst->effAddrValid() &&
+        req->isCacheBlockHit(invalidate_addr, cacheBlockMask)
+        && ld_inst->memReqFlags & Request::LLSC)
+        TheISA::handleLockedSnoopHit(ld_inst.get());
 
     bool force_squash = false;
 
-    while (load_idx != loadTail) {
-        DynInstPtr ld_inst = loadQueue[load_idx];
-
-        if (!ld_inst->effAddrValid() || ld_inst->strictlyOrdered()) {
-            incrLdIdx(load_idx);
+    while (++iter != loadQueue.end()) {
+        ld_inst = iter->instruction();
+        assert(ld_inst);
+        req = iter->request();
+        if (!ld_inst->effAddrValid() || ld_inst->strictlyOrdered())
             continue;
-        }
 
-        Addr load_addr_low = ld_inst->physEffAddrLow & cacheBlockMask;
-        Addr load_addr_high = ld_inst->physEffAddrHigh & cacheBlockMask;
+        DPRINTF(LSQUnit, "-- inst [sn:%lli] to pktAddr:%#x\n",
+                    ld_inst->seqNum, invalidate_addr);
 
-        DPRINTF(LSQUnit, "-- inst [sn:%lli] load_addr: %#x to pktAddr:%#x\n",
-                    ld_inst->seqNum, load_addr_low, invalidate_addr);
-
-        if ((load_addr_low == invalidate_addr
-             || load_addr_high == invalidate_addr) || force_squash) {
+        if (force_squash ||
+            req->isCacheBlockHit(invalidate_addr, cacheBlockMask)) {
             if (needsTSO) {
                 // If we have a TSO system, as all loads must be ordered with
                 // all other loads, this load as well as *all* subsequent loads
@@ -508,14 +439,14 @@
                 ld_inst->hitExternalSnoop(true);
             }
         }
-        incrLdIdx(load_idx);
     }
     return;
 }
 
 template <class Impl>
 Fault
-LSQUnit<Impl>::checkViolations(int load_idx, const DynInstPtr &inst)
+LSQUnit<Impl>::checkViolations(typename LoadQueue::iterator& loadIt,
+        const DynInstPtr& inst)
 {
     Addr inst_eff_addr1 = inst->effAddr >> depCheckShift;
     Addr inst_eff_addr2 = (inst->effAddr + inst->effSize - 1) >> depCheckShift;
@@ -525,10 +456,10 @@
      * all instructions that will execute before the store writes back. Thus,
      * like the implementation that came before it, we're overly conservative.
      */
-    while (load_idx != loadTail) {
-        DynInstPtr ld_inst = loadQueue[load_idx];
+    while (loadIt != loadQueue.end()) {
+        DynInstPtr ld_inst = loadIt->instruction();
         if (!ld_inst->effAddrValid() || ld_inst->strictlyOrdered()) {
-            incrLdIdx(load_idx);
+            ++loadIt;
             continue;
         }
 
@@ -585,7 +516,7 @@
             }
         }
 
-        incrLdIdx(load_idx);
+        ++loadIt;
     }
     return NoFault;
 }
@@ -608,8 +539,7 @@
 
     load_fault = inst->initiateAcc();
 
-    if (inst->isTranslationDelayed() &&
-        load_fault == NoFault)
+    if (inst->isTranslationDelayed() && load_fault == NoFault)
         return load_fault;
 
     // If the instruction faulted or predicated false, then we need to send it
@@ -631,12 +561,13 @@
         iewStage->instToCommit(inst);
         iewStage->activityThisCycle();
     } else {
-        assert(inst->effAddrValid());
-        int load_idx = inst->lqIdx;
-        incrLdIdx(load_idx);
+        if (inst->effAddrValid()) {
+            auto it = inst->lqIt;
+            ++it;
 
-        if (checkLoads)
-            return checkViolations(load_idx, inst);
+            if (checkLoads)
+                return checkViolations(it, inst);
+        }
     }
 
     return load_fault;
@@ -659,7 +590,7 @@
 
     // Check the recently completed loads to see if any match this store's
     // address.  If so, then we have a memory ordering violation.
-    int load_idx = store_inst->lqIdx;
+    typename LoadQueue::iterator loadIt = store_inst->lqIt;
 
     Fault store_fault = store_inst->initiateAcc();
 
@@ -674,7 +605,7 @@
         return store_fault;
     }
 
-    if (storeQueue[store_idx].size == 0) {
+    if (storeQueue[store_idx].size() == 0) {
         DPRINTF(LSQUnit,"Fault on Store PC %s, [sn:%lli], Size = 0\n",
                 store_inst->pcState(), store_inst->seqNum);
 
@@ -686,12 +617,12 @@
     if (store_inst->isStoreConditional()) {
         // Store conditionals need to set themselves as able to
         // writeback if we haven't had a fault by here.
-        storeQueue[store_idx].canWB = true;
+        storeQueue[store_idx].canWB() = true;
 
         ++storesToWB;
     }
 
-    return checkViolations(load_idx, store_inst);
+    return checkViolations(loadIt, store_inst);
 
 }
 
@@ -699,14 +630,13 @@
 void
 LSQUnit<Impl>::commitLoad()
 {
-    assert(loadQueue[loadHead]);
+    assert(loadQueue.front().valid());
 
     DPRINTF(LSQUnit, "Committing head load instruction, PC %s\n",
-            loadQueue[loadHead]->pcState());
+            loadQueue.front().instruction()->pcState());
 
-    loadQueue[loadHead] = NULL;
-
-    incrLdIdx(loadHead);
+    loadQueue.front().clear();
+    loadQueue.pop_front();
 
     --loads;
 }
@@ -715,9 +645,10 @@
 void
 LSQUnit<Impl>::commitLoads(InstSeqNum &youngest_inst)
 {
-    assert(loads == 0 || loadQueue[loadHead]);
+    assert(loads == 0 || loadQueue.front().valid());
 
-    while (loads != 0 && loadQueue[loadHead]->seqNum <= youngest_inst) {
+    while (loads != 0 && loadQueue.front().instruction()->seqNum
+            <= youngest_inst) {
         commitLoad();
     }
 }
@@ -726,45 +657,37 @@
 void
 LSQUnit<Impl>::commitStores(InstSeqNum &youngest_inst)
 {
-    assert(stores == 0 || storeQueue[storeHead].inst);
+    assert(stores == 0 || storeQueue.front().valid());
 
-    int store_idx = storeHead;
-
-    while (store_idx != storeTail) {
-        assert(storeQueue[store_idx].inst);
+    /* Forward iterate the store queue (age order). */
+    for (auto& x : storeQueue) {
+        assert(x.valid());
         // Mark any stores that are now committed and have not yet
         // been marked as able to write back.
-        if (!storeQueue[store_idx].canWB) {
-            if (storeQueue[store_idx].inst->seqNum > youngest_inst) {
+        if (!x.canWB()) {
+            if (x.instruction()->seqNum > youngest_inst) {
                 break;
             }
             DPRINTF(LSQUnit, "Marking store as able to write back, PC "
                     "%s [sn:%lli]\n",
-                    storeQueue[store_idx].inst->pcState(),
-                    storeQueue[store_idx].inst->seqNum);
+                    x.instruction()->pcState(),
+                    x.instruction()->seqNum);
 
-            storeQueue[store_idx].canWB = true;
+            x.canWB() = true;
 
             ++storesToWB;
         }
-
-        incrStIdx(store_idx);
     }
 }
 
 template <class Impl>
 void
-LSQUnit<Impl>::writebackPendingStore()
+LSQUnit<Impl>::writebackBlockedStore()
 {
-    if (hasPendingPkt) {
-        assert(pendingPkt != NULL);
-
-        // If the cache is blocked, this will store the packet for retry.
-        if (sendStore(pendingPkt)) {
-            storePostSend(pendingPkt);
-        }
-        pendingPkt = NULL;
-        hasPendingPkt = false;
+    assert(isStoreBlocked);
+    storeWBIt->request()->sendPacketToCache();
+    if (storeWBIt->request()->isSent()){
+        storePostSend();
     }
 }
 
@@ -772,18 +695,17 @@
 void
 LSQUnit<Impl>::writebackStores()
 {
-    // First writeback the second packet from any split store that didn't
-    // complete last cycle because there weren't enough cache ports available.
-    if (TheISA::HasUnalignedMemAcc) {
-        writebackPendingStore();
+    if (isStoreBlocked) {
+        DPRINTF(LSQUnit, "Writing back  blocked store\n");
+        writebackBlockedStore();
     }
 
     while (storesToWB > 0 &&
-           storeWBIdx != storeTail &&
-           storeQueue[storeWBIdx].inst &&
-           storeQueue[storeWBIdx].canWB &&
+           storeWBIt.dereferenceable() &&
+           storeWBIt->valid() &&
+           storeWBIt->canWB() &&
            ((!needsTSO) || (!storeInFlight)) &&
-           usedStorePorts < cacheStorePorts) {
+           lsq->storePortAvailable()) {
 
         if (isStoreBlocked) {
             DPRINTF(LSQUnit, "Unable to write back any more stores, cache"
@@ -793,188 +715,112 @@
 
         // Store didn't write any data so no need to write it back to
         // memory.
-        if (storeQueue[storeWBIdx].size == 0) {
-            completeStore(storeWBIdx);
-
-            incrStIdx(storeWBIdx);
-
+        if (storeWBIt->size() == 0) {
+            /* The post-increment advances storeWBIt before the body of
+             * completeStore runs (it receives the old position), which
+             * matters because the code of completeStore checks storeWBIt. */
+            completeStore(storeWBIt++);
             continue;
         }
 
-        ++usedStorePorts;
-
-        if (storeQueue[storeWBIdx].inst->isDataPrefetch()) {
-            incrStIdx(storeWBIdx);
-
+        if (storeWBIt->instruction()->isDataPrefetch()) {
+            storeWBIt++;
             continue;
         }
 
-        assert(storeQueue[storeWBIdx].req);
-        assert(!storeQueue[storeWBIdx].committed);
+        assert(storeWBIt->hasRequest());
+        assert(!storeWBIt->committed());
 
-        if (TheISA::HasUnalignedMemAcc && storeQueue[storeWBIdx].isSplit) {
-            assert(storeQueue[storeWBIdx].sreqLow);
-            assert(storeQueue[storeWBIdx].sreqHigh);
-        }
-
-        DynInstPtr inst = storeQueue[storeWBIdx].inst;
-
-        RequestPtr &req = storeQueue[storeWBIdx].req;
-        const RequestPtr &sreqLow = storeQueue[storeWBIdx].sreqLow;
-        const RequestPtr &sreqHigh = storeQueue[storeWBIdx].sreqHigh;
-
-        storeQueue[storeWBIdx].committed = true;
+        DynInstPtr inst = storeWBIt->instruction();
+        LSQRequest* req = storeWBIt->request();
+        storeWBIt->committed() = true;
 
         assert(!inst->memData);
-        inst->memData = new uint8_t[req->getSize()];
+        inst->memData = new uint8_t[req->_size];
 
-        if (storeQueue[storeWBIdx].isAllZeros)
-            memset(inst->memData, 0, req->getSize());
+        if (storeWBIt->isAllZeros())
+            memset(inst->memData, 0, req->_size);
         else
-            memcpy(inst->memData, storeQueue[storeWBIdx].data, req->getSize());
+            memcpy(inst->memData, storeWBIt->data(), req->_size);
 
-        PacketPtr data_pkt;
-        PacketPtr snd_data_pkt = NULL;
 
-        LSQSenderState *state = new LSQSenderState;
-        state->isLoad = false;
-        state->idx = storeWBIdx;
-        state->inst = inst;
+        if (req->senderState() == nullptr) {
+            SQSenderState *state = new SQSenderState(storeWBIt);
+            state->isLoad = false;
+            state->needWB = false;
+            state->inst = inst;
 
-        if (!TheISA::HasUnalignedMemAcc || !storeQueue[storeWBIdx].isSplit) {
-
-            // Build a single data packet if the store isn't split.
-            data_pkt = Packet::createWrite(req);
-            data_pkt->dataStatic(inst->memData);
-            data_pkt->senderState = state;
-        } else {
-            // Create two packets if the store is split in two.
-            data_pkt = Packet::createWrite(sreqLow);
-            snd_data_pkt = Packet::createWrite(sreqHigh);
-
-            data_pkt->dataStatic(inst->memData);
-            snd_data_pkt->dataStatic(inst->memData + sreqLow->getSize());
-
-            data_pkt->senderState = state;
-            snd_data_pkt->senderState = state;
-
-            state->isSplit = true;
-            state->outstanding = 2;
-
-            // Can delete the main request now.
-            req = sreqLow;
+            req->senderState(state);
+            if (inst->isStoreConditional()) {
+                /* Only store conditionals need a writeback. */
+                state->needWB = true;
+            }
         }
+        req->buildPackets();
 
         DPRINTF(LSQUnit, "D-Cache: Writing back store idx:%i PC:%s "
                 "to Addr:%#x, data:%#x [sn:%lli]\n",
-                storeWBIdx, inst->pcState(),
-                req->getPaddr(), (int)*(inst->memData),
+                storeWBIt.idx(), inst->pcState(),
+                req->request()->getPaddr(), (int)*(inst->memData),
                 inst->seqNum);
 
         // @todo: Remove this SC hack once the memory system handles it.
         if (inst->isStoreConditional()) {
-            assert(!storeQueue[storeWBIdx].isSplit);
             // Disable recording the result temporarily.  Writing to
             // misc regs normally updates the result, but this is not
             // the desired behavior when handling store conditionals.
             inst->recordResult(false);
-            bool success = TheISA::handleLockedWrite(inst.get(), req, cacheBlockMask);
+            bool success = TheISA::handleLockedWrite(inst.get(),
+                    req->request(), cacheBlockMask);
             inst->recordResult(true);
+            req->packetSent();
 
             if (!success) {
+                req->complete();
                 // Instantly complete this store.
                 DPRINTF(LSQUnit, "Store conditional [sn:%lli] failed.  "
                         "Instantly completing it.\n",
                         inst->seqNum);
-                WritebackEvent *wb = new WritebackEvent(inst, data_pkt, this);
+                PacketPtr new_pkt = new Packet(*req->packet());
+                WritebackEvent *wb = new WritebackEvent(inst,
+                        new_pkt, this);
                 cpu->schedule(wb, curTick() + 1);
-                completeStore(storeWBIdx);
-                incrStIdx(storeWBIdx);
+                completeStore(storeWBIt);
+                if (!storeQueue.empty())
+                    storeWBIt++;
+                else
+                    storeWBIt = storeQueue.end();
                 continue;
             }
-        } else {
-            // Non-store conditionals do not need a writeback.
-            state->noWB = true;
         }
 
-        bool split =
-            TheISA::HasUnalignedMemAcc && storeQueue[storeWBIdx].isSplit;
-
-        ThreadContext *thread = cpu->tcBase(lsqID);
-
-        if (req->isMmappedIpr()) {
+        if (req->request()->isMmappedIpr()) {
             assert(!inst->isStoreConditional());
-            TheISA::handleIprWrite(thread, data_pkt);
-            delete data_pkt;
-            if (split) {
-                assert(snd_data_pkt->req->isMmappedIpr());
-                TheISA::handleIprWrite(thread, snd_data_pkt);
-                delete snd_data_pkt;
-            }
-            delete state;
-            completeStore(storeWBIdx);
-            incrStIdx(storeWBIdx);
-        } else if (!sendStore(data_pkt)) {
-            DPRINTF(IEW, "D-Cache became blocked when writing [sn:%lli], will"
-                    "retry later\n",
-                    inst->seqNum);
+            ThreadContext *thread = cpu->tcBase(lsqID);
+            PacketPtr main_pkt = new Packet(req->mainRequest(),
+                                            MemCmd::WriteReq);
+            main_pkt->dataStatic(inst->memData);
+            req->handleIprWrite(thread, main_pkt);
+            delete main_pkt;
+            completeStore(storeWBIt);
+            storeWBIt++;
+            continue;
+        }
+        /* Send to cache */
+        req->sendPacketToCache();
 
-            // Need to store the second packet, if split.
-            if (split) {
-                state->pktToSend = true;
-                state->pendingPacket = snd_data_pkt;
-            }
+        /* If successful, do the post send */
+        if (req->isSent()) {
+            storePostSend();
         } else {
-
-            // If split, try to send the second packet too
-            if (split) {
-                assert(snd_data_pkt);
-
-                // Ensure there are enough ports to use.
-                if (usedStorePorts < cacheStorePorts) {
-                    ++usedStorePorts;
-                    if (sendStore(snd_data_pkt)) {
-                        storePostSend(snd_data_pkt);
-                    } else {
-                        DPRINTF(IEW, "D-Cache became blocked when writing"
-                                " [sn:%lli] second packet, will retry later\n",
-                                inst->seqNum);
-                    }
-                } else {
-
-                    // Store the packet for when there's free ports.
-                    assert(pendingPkt == NULL);
-                    pendingPkt = snd_data_pkt;
-                    hasPendingPkt = true;
-                }
-            } else {
-
-                // Not a split store.
-                storePostSend(data_pkt);
-            }
+            DPRINTF(LSQUnit, "D-Cache became blocked when writing [sn:%lli], "
+                    "will retry later\n",
+                    inst->seqNum);
         }
     }
-
-    // Not sure this should set it to 0.
-    usedStorePorts = 0;
-
     assert(stores >= 0 && storesToWB >= 0);
 }
 
-/*template <class Impl>
-void
-LSQUnit<Impl>::removeMSHR(InstSeqNum seqNum)
-{
-    list<InstSeqNum>::iterator mshr_it = find(mshrSeqNums.begin(),
-                                              mshrSeqNums.end(),
-                                              seqNum);
-
-    if (mshr_it != mshrSeqNums.end()) {
-        mshrSeqNums.erase(mshr_it);
-        DPRINTF(LSQUnit, "Removing MSHR. count = %i\n",mshrSeqNums.size());
-    }
-}*/
-
 template <class Impl>
 void
 LSQUnit<Impl>::squash(const InstSeqNum &squashed_num)
@@ -982,30 +828,26 @@
     DPRINTF(LSQUnit, "Squashing until [sn:%lli]!"
             "(Loads:%i Stores:%i)\n", squashed_num, loads, stores);
 
-    int load_idx = loadTail;
-    decrLdIdx(load_idx);
-
-    while (loads != 0 && loadQueue[load_idx]->seqNum > squashed_num) {
+    while (loads != 0 &&
+            loadQueue.back().instruction()->seqNum > squashed_num) {
         DPRINTF(LSQUnit,"Load Instruction PC %s squashed, "
                 "[sn:%lli]\n",
-                loadQueue[load_idx]->pcState(),
-                loadQueue[load_idx]->seqNum);
+                loadQueue.back().instruction()->pcState(),
+                loadQueue.back().instruction()->seqNum);
 
-        if (isStalled() && load_idx == stallingLoadIdx) {
+        if (isStalled() && loadQueue.tail() == stallingLoadIdx) {
             stalled = false;
             stallingStoreIsn = 0;
             stallingLoadIdx = 0;
         }
 
         // Clear the smart pointer to make sure it is decremented.
-        loadQueue[load_idx]->setSquashed();
-        loadQueue[load_idx] = NULL;
+        loadQueue.back().instruction()->setSquashed();
+        loadQueue.back().clear();
+
         --loads;
 
-        // Inefficient!
-        loadTail = load_idx;
-
-        decrLdIdx(load_idx);
+        loadQueue.pop_back();
         ++lsqSquashedLoads;
     }
 
@@ -1013,76 +855,63 @@
         memDepViolator = NULL;
     }
 
-    int store_idx = storeTail;
-    decrStIdx(store_idx);
-
     while (stores != 0 &&
-           storeQueue[store_idx].inst->seqNum > squashed_num) {
+           storeQueue.back().instruction()->seqNum > squashed_num) {
         // Instructions marked as can WB are already committed.
-        if (storeQueue[store_idx].canWB) {
+        if (storeQueue.back().canWB()) {
             break;
         }
 
         DPRINTF(LSQUnit,"Store Instruction PC %s squashed, "
                 "idx:%i [sn:%lli]\n",
-                storeQueue[store_idx].inst->pcState(),
-                store_idx, storeQueue[store_idx].inst->seqNum);
+                storeQueue.back().instruction()->pcState(),
+                storeQueue.tail(), storeQueue.back().instruction()->seqNum);
 
         // I don't think this can happen.  It should have been cleared
         // by the stalling load.
         if (isStalled() &&
-            storeQueue[store_idx].inst->seqNum == stallingStoreIsn) {
+            storeQueue.back().instruction()->seqNum == stallingStoreIsn) {
             panic("Is stalled should have been cleared by stalling load!\n");
             stalled = false;
             stallingStoreIsn = 0;
         }
 
         // Clear the smart pointer to make sure it is decremented.
-        storeQueue[store_idx].inst->setSquashed();
-        storeQueue[store_idx].inst = NULL;
-        storeQueue[store_idx].canWB = 0;
+        storeQueue.back().instruction()->setSquashed();
 
         // Must delete request now that it wasn't handed off to
         // memory.  This is quite ugly.  @todo: Figure out the proper
         // place to really handle request deletes.
-        storeQueue[store_idx].req.reset();
-        if (TheISA::HasUnalignedMemAcc && storeQueue[store_idx].isSplit) {
-            storeQueue[store_idx].sreqLow.reset();
-            storeQueue[store_idx].sreqHigh.reset();
-        }
-
+        storeQueue.back().clear();
         --stores;
 
-        // Inefficient!
-        storeTail = store_idx;
-
-        decrStIdx(store_idx);
+        storeQueue.pop_back();
         ++lsqSquashedStores;
     }
 }
 
 template <class Impl>
 void
-LSQUnit<Impl>::storePostSend(PacketPtr pkt)
+LSQUnit<Impl>::storePostSend()
 {
     if (isStalled() &&
-        storeQueue[storeWBIdx].inst->seqNum == stallingStoreIsn) {
+        storeWBIt->instruction()->seqNum == stallingStoreIsn) {
         DPRINTF(LSQUnit, "Unstalling, stalling store [sn:%lli] "
                 "load idx:%i\n",
                 stallingStoreIsn, stallingLoadIdx);
         stalled = false;
         stallingStoreIsn = 0;
-        iewStage->replayMemInst(loadQueue[stallingLoadIdx]);
+        iewStage->replayMemInst(loadQueue[stallingLoadIdx].instruction());
     }
 
-    if (!storeQueue[storeWBIdx].inst->isStoreConditional()) {
+    if (!storeWBIt->instruction()->isStoreConditional()) {
         // The store is basically completed at this time. This
         // only works so long as the checker doesn't try to
         // verify the value in memory for stores.
-        storeQueue[storeWBIdx].inst->setCompleted();
+        storeWBIt->instruction()->setCompleted();
 
         if (cpu->checker) {
-            cpu->checker->verify(storeQueue[storeWBIdx].inst);
+            cpu->checker->verify(storeWBIt->instruction());
         }
     }
 
@@ -1090,7 +919,7 @@
         storeInFlight = true;
     }
 
-    incrStIdx(storeWBIdx);
+    storeWBIt++;
 }
 
 template <class Impl>
@@ -1136,10 +965,10 @@
 
 template <class Impl>
 void
-LSQUnit<Impl>::completeStore(int store_idx)
+LSQUnit<Impl>::completeStore(typename StoreQueue::iterator store_idx)
 {
-    assert(storeQueue[store_idx].inst);
-    storeQueue[store_idx].completed = true;
+    assert(store_idx->valid());
+    store_idx->completed() = true;
     --storesToWB;
     // A bit conservative because a store completion may not free up entries,
     // but hopefully avoids two store completions in one cycle from making
@@ -1147,39 +976,42 @@
     cpu->wakeCPU();
     cpu->activityThisCycle();
 
-    if (store_idx == storeHead) {
+    /* Keep our own reference: if this store is the queue head, its entry is
+     * cleared and popped below; do not dereference store_idx afterwards. */
+    DynInstPtr store_inst = store_idx->instruction();
+    if (store_idx == storeQueue.begin()) {
         do {
-            incrStIdx(storeHead);
-
+            storeQueue.front().clear();
+            storeQueue.pop_front();
             --stores;
-        } while (storeQueue[storeHead].completed &&
-                 storeHead != storeTail);
+        } while (!storeQueue.empty() &&
+                 storeQueue.front().completed());
 
         iewStage->updateLSQNextCycle = true;
     }
 
     DPRINTF(LSQUnit, "Completing store [sn:%lli], idx:%i, store head "
             "idx:%i\n",
-            storeQueue[store_idx].inst->seqNum, store_idx, storeHead);
+            store_inst->seqNum, store_idx.idx() - 1, storeQueue.head() - 1);
 
 #if TRACING_ON
     if (DTRACE(O3PipeView)) {
-        storeQueue[store_idx].inst->storeTick =
-            curTick() - storeQueue[store_idx].inst->fetchTick;
+        store_inst->storeTick =
+            curTick() - store_inst->fetchTick;
     }
 #endif
 
     if (isStalled() &&
-        storeQueue[store_idx].inst->seqNum == stallingStoreIsn) {
+        store_inst->seqNum == stallingStoreIsn) {
         DPRINTF(LSQUnit, "Unstalling, stalling store [sn:%lli] "
                 "load idx:%i\n",
                 stallingStoreIsn, stallingLoadIdx);
         stalled = false;
         stallingStoreIsn = 0;
-        iewStage->replayMemInst(loadQueue[stallingLoadIdx]);
+        iewStage->replayMemInst(loadQueue[stallingLoadIdx].instruction());
     }
 
-    storeQueue[store_idx].inst->setCompleted();
+    store_inst->setCompleted();
 
     if (needsTSO) {
         storeInFlight = false;
@@ -1188,28 +1020,52 @@
     // Tell the checker we've completed this instruction.  Some stores
     // may get reported twice to the checker, but the checker can
     // handle that case.
-
     // Store conditionals cannot be sent to the checker yet, they have
     // to update the misc registers first which should take place
     // when they commit
-    if (cpu->checker && !storeQueue[store_idx].inst->isStoreConditional()) {
-        cpu->checker->verify(storeQueue[store_idx].inst);
+    if (cpu->checker &&  !store_inst->isStoreConditional()) {
+        cpu->checker->verify(store_inst);
     }
 }
 
 template <class Impl>
 bool
-LSQUnit<Impl>::sendStore(PacketPtr data_pkt)
+LSQUnit<Impl>::trySendPacket(bool isLoad, PacketPtr data_pkt)
 {
-    if (!dcachePort->sendTimingReq(data_pkt)) {
-        // Need to handle becoming blocked on a store.
-        isStoreBlocked = true;
-        ++lsqCacheBlocked;
-        assert(retryPkt == NULL);
-        retryPkt = data_pkt;
-        return false;
+    bool ret = true;
+    bool cache_got_blocked = false;
+
+    auto state = dynamic_cast<LSQSenderState*>(data_pkt->senderState);
+
+    if (!lsq->cacheBlocked() && (isLoad || lsq->storePortAvailable())) {
+        if (!dcachePort->sendTimingReq(data_pkt)) {
+            ret = false;
+            cache_got_blocked = true;
+        }
+    } else {
+        ret = false;
     }
-    return true;
+
+    if (ret) {
+        if (!isLoad) {
+            lsq->storePortBusy();
+            isStoreBlocked = false;
+        }
+        state->outstanding++;
+        state->request()->packetSent();
+    } else {
+        if (cache_got_blocked) {
+            lsq->cacheBlocked(true);
+            ++lsqCacheBlocked;
+        }
+        if (!isLoad) {
+            assert(state->request() == storeWBIt->request());
+            isStoreBlocked = true;
+        }
+        state->request()->packetNotSent();
+    }
+
+    return ret;
 }
 
 template <class Impl>
@@ -1217,69 +1073,12 @@
 LSQUnit<Impl>::recvRetry()
 {
     if (isStoreBlocked) {
-        DPRINTF(LSQUnit, "Receiving retry: store blocked\n");
-        assert(retryPkt != NULL);
-
-        LSQSenderState *state =
-            dynamic_cast<LSQSenderState *>(retryPkt->senderState);
-
-        if (dcachePort->sendTimingReq(retryPkt)) {
-            // Don't finish the store unless this is the last packet.
-            if (!TheISA::HasUnalignedMemAcc || !state->pktToSend ||
-                    state->pendingPacket == retryPkt) {
-                state->pktToSend = false;
-                storePostSend(retryPkt);
-            }
-            retryPkt = NULL;
-            isStoreBlocked = false;
-
-            // Send any outstanding packet.
-            if (TheISA::HasUnalignedMemAcc && state->pktToSend) {
-                assert(state->pendingPacket);
-                if (sendStore(state->pendingPacket)) {
-                    storePostSend(state->pendingPacket);
-                }
-            }
-        } else {
-            // Still blocked!
-            ++lsqCacheBlocked;
-        }
+        DPRINTF(LSQUnit, "Receiving retry: blocked store\n");
+        writebackBlockedStore();
     }
 }
 
 template <class Impl>
-inline void
-LSQUnit<Impl>::incrStIdx(int &store_idx) const
-{
-    if (++store_idx >= SQEntries)
-        store_idx = 0;
-}
-
-template <class Impl>
-inline void
-LSQUnit<Impl>::decrStIdx(int &store_idx) const
-{
-    if (--store_idx < 0)
-        store_idx += SQEntries;
-}
-
-template <class Impl>
-inline void
-LSQUnit<Impl>::incrLdIdx(int &load_idx) const
-{
-    if (++load_idx >= LQEntries)
-        load_idx = 0;
-}
-
-template <class Impl>
-inline void
-LSQUnit<Impl>::decrLdIdx(int &load_idx) const
-{
-    if (--load_idx < 0)
-        load_idx += LQEntries;
-}
-
-template <class Impl>
 void
 LSQUnit<Impl>::dumpInsts() const
 {
@@ -1287,29 +1086,28 @@
     cprintf("Load queue size: %i\n", loads);
     cprintf("Load queue: ");
 
-    int load_idx = loadHead;
-
-    while (load_idx != loadTail && loadQueue[load_idx]) {
-        const DynInstPtr &inst(loadQueue[load_idx]);
+    for (const auto& e: loadQueue) {
+        const DynInstPtr &inst(e.instruction());
         cprintf("%s.[sn:%i] ", inst->pcState(), inst->seqNum);
-
-        incrLdIdx(load_idx);
     }
     cprintf("\n");
 
     cprintf("Store queue size: %i\n", stores);
     cprintf("Store queue: ");
 
-    int store_idx = storeHead;
-
-    while (store_idx != storeTail && storeQueue[store_idx].inst) {
-        const DynInstPtr &inst(storeQueue[store_idx].inst);
+    for (const auto& e: storeQueue) {
+        const DynInstPtr &inst(e.instruction());
         cprintf("%s.[sn:%i] ", inst->pcState(), inst->seqNum);
-
-        incrStIdx(store_idx);
     }
 
     cprintf("\n");
 }
 
+template <class Impl>
+unsigned int
+LSQUnit<Impl>::cacheLineSize()
+{
+    return cpu->cacheLineSize();
+}
+
 #endif//__CPU_O3_LSQ_UNIT_IMPL_HH__
diff --git a/src/cpu/o3/probe/elastic_trace.cc b/src/cpu/o3/probe/elastic_trace.cc
index a4a2013..36d8297 100644
--- a/src/cpu/o3/probe/elastic_trace.cc
+++ b/src/cpu/o3/probe/elastic_trace.cc
@@ -409,7 +409,7 @@
     new_record->reqFlags = head_inst->memReqFlags;
     new_record->virtAddr = head_inst->effAddr;
     new_record->asid = head_inst->asid;
-    new_record->physAddr = head_inst->physEffAddrLow;
+    new_record->physAddr = head_inst->physEffAddr;
     // Currently the tracing does not support split requests.
     new_record->size = head_inst->effSize;
     new_record->pc = head_inst->instAddr();