cpu-o3: Add support for pinned writes

This patch adds support for pinning registers for a certain number of
consecutive writes.  This is only relevant for timing CPU models
(functional-only models are unaffected), and it is primarily needed to
provide a realistic execution model for micro-coded operations whose
microops can write to non-overlapping portions of a destination
register, e.g. vector gather loads.  In those cases, this mechanism
can disable renaming for a sequence of consecutive writes, thus making
the resulting execution more efficient: allocating a new physical
register for each microop would introduce a read-modify-write chain of
dependencies, while with these modifications the microops can write
back in parallel.

Please note that this new feature is only leveraged by O3CPU for the
time being.

Additional authors:
- Gabor Dozsa <gabor.dozsa@arm.com>

Change-Id: I07eb5fdbd1fa0b748c9bdc1174d9f330fda34f81
Signed-off-by: Giacomo Gabrielli <giacomo.gabrielli@arm.com>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/13520
Reviewed-by: Andreas Sandberg <andreas.sandberg@arm.com>
Maintainer: Andreas Sandberg <andreas.sandberg@arm.com>
Tested-by: kokoro <noreply+kokoro@google.com>
diff --git a/src/cpu/base_dyn_inst.hh b/src/cpu/base_dyn_inst.hh
index 22a32ec..6f9555a 100644
--- a/src/cpu/base_dyn_inst.hh
+++ b/src/cpu/base_dyn_inst.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011, 2013, 2016-2018 ARM Limited
+ * Copyright (c) 2011, 2013, 2016-2019 ARM Limited
  * Copyright (c) 2013 Advanced Micro Devices, Inc.
  * All rights reserved.
  *
@@ -116,6 +116,9 @@
         SquashedInIQ,            /// Instruction is squashed in the IQ
         SquashedInLSQ,           /// Instruction is squashed in the LSQ
         SquashedInROB,           /// Instruction is squashed in the ROB
+        PinnedRegsRenamed,       /// Pinned registers are renamed
+        PinnedRegsWritten,       /// Pinned registers are written back
+        PinnedRegsSquashDone,    /// Regs pinning status updated after squash
         RecoverInst,             /// Is a recover instruction
         BlockingInst,            /// Is a blocking instruction
         ThreadsyncWait,          /// Is a thread synchronization instruction
@@ -173,12 +176,14 @@
     /** PC state for this instruction. */
     TheISA::PCState pc;
 
+  private:
     /* An amalgamation of a lot of boolean values into one */
     std::bitset<MaxFlags> instFlags;
 
     /** The status of this BaseDynInst.  Several bits can be set. */
     std::bitset<NumStatus> status;
 
+  protected:
      /** Whether or not the source register is ready.
      *  @todo: Not sure this should be here vs the derived class.
      */
@@ -385,6 +390,8 @@
     {
         _destRegIdx[idx] = renamed_dest;
         _prevDestRegIdx[idx] = previous_rename;
+        if (renamed_dest->isPinned())
+            setPinnedRegsRenamed();
     }
 
     /** Renames a source logical register to the physical register which
@@ -767,7 +774,7 @@
     bool isCommitted() const { return status[Committed]; }
 
     /** Sets this instruction as squashed. */
-    void setSquashed() { status.set(Squashed); }
+    void setSquashed();
 
     /** Returns whether or not this instruction is squashed. */
     bool isSquashed() const { return status[Squashed]; }
@@ -802,7 +809,7 @@
     bool isInLSQ() const { return status[LsqEntry]; }
 
     /** Sets this instruction as squashed in the LSQ. */
-    void setSquashedInLSQ() { status.set(SquashedInLSQ);}
+    void setSquashedInLSQ() { status.set(SquashedInLSQ); status.set(Squashed);}
 
     /** Returns whether or not this instruction is squashed in the LSQ. */
     bool isSquashedInLSQ() const { return status[SquashedInLSQ]; }
@@ -825,6 +832,41 @@
     /** Returns whether or not this instruction is squashed in the ROB. */
     bool isSquashedInROB() const { return status[SquashedInROB]; }
 
+    /** Returns whether pinned registers are renamed */
+    bool isPinnedRegsRenamed() const { return status[PinnedRegsRenamed]; }
+
+    /** Marks the pinned destination registers as renamed */
+    void
+    setPinnedRegsRenamed()
+    {
+        assert(!status[PinnedRegsSquashDone]);
+        assert(!status[PinnedRegsWritten]);
+        status.set(PinnedRegsRenamed);
+    }
+
+    /** Returns whether pinned destination registers are written back */
+    bool isPinnedRegsWritten() const { return status[PinnedRegsWritten]; }
+
+    /** Marks the pinned destination registers as written back */
+    void
+    setPinnedRegsWritten()
+    {
+        assert(!status[PinnedRegsSquashDone]);
+        assert(status[PinnedRegsRenamed]);
+        status.set(PinnedRegsWritten);
+    }
+
+    /** Returns whether pinning status was already updated after a squash */
+    bool
+    isPinnedRegsSquashDone() const { return status[PinnedRegsSquashDone]; }
+
+    /** Records that pinning status was updated after a squash (once only) */
+    void
+    setPinnedRegsSquashDone() {
+        assert(!status[PinnedRegsSquashDone]);
+        status.set(PinnedRegsSquashDone);
+    }
+
     /** Read the PC state of this instruction. */
     TheISA::PCState pcState() const { return pc; }
 
diff --git a/src/cpu/base_dyn_inst_impl.hh b/src/cpu/base_dyn_inst_impl.hh
index 6d3a3ac..41eb64c 100644
--- a/src/cpu/base_dyn_inst_impl.hh
+++ b/src/cpu/base_dyn_inst_impl.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011, 2018 ARM Limited
+ * Copyright (c) 2011, 2019 ARM Limited
  * All rights reserved.
  *
  * The license below extends only to copyright in the software and shall
@@ -237,4 +237,34 @@
     return true;
 }
 
+/** Marks this instruction as squashed and, if its pinned destination
+ *  registers were already renamed, re-increments their write counters. */
+template <class Impl>
+void
+BaseDynInst<Impl>::setSquashed()
+{
+    status.set(Squashed);
+
+    if (!isPinnedRegsRenamed() || isPinnedRegsSquashDone())
+        return;
+
+    // This inst has been renamed already so it may go through rename
+    // again (e.g. if the squash is due to memory access order violation).
+    // Reset the write counters for all pinned destination registers to ensure
+    // that they are in a consistent state for a possible re-rename. This also
+    // ensures that dest regs will be pinned to the same phys register if
+    // re-rename happens.
+    for (int idx = 0; idx < numDestRegs(); idx++) {
+        PhysRegIdPtr phys_dest_reg = renamedDestRegIdx(idx);
+        if (phys_dest_reg->isPinned()) {
+            phys_dest_reg->incrNumPinnedWrites();
+            if (isPinnedRegsWritten())
+                phys_dest_reg->incrNumPinnedWritesToComplete();
+        }
+    }
+    setPinnedRegsSquashDone();
+}
+
+
+
 #endif//__CPU_BASE_DYN_INST_IMPL_HH__
diff --git a/src/cpu/o3/free_list.hh b/src/cpu/o3/free_list.hh
index 46bebf3..82ff25d 100644
--- a/src/cpu/o3/free_list.hh
+++ b/src/cpu/o3/free_list.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2017 ARM Limited
+ * Copyright (c) 2016-2018 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -79,10 +79,9 @@
     template<class InputIt>
     void
     addRegs(InputIt first, InputIt last) {
-        std::for_each(first, last,
-            [this](const typename InputIt::value_type& reg) {
-                this->freeRegs.push(&reg);
-            });
+        std::for_each(first, last, [this](typename InputIt::value_type& reg) {
+            this->freeRegs.push(&reg);
+        });
     }
 
     /** Get the next available register from the free list */
diff --git a/src/cpu/o3/iew_impl.hh b/src/cpu/o3/iew_impl.hh
index 56c1825..5872f90 100644
--- a/src/cpu/o3/iew_impl.hh
+++ b/src/cpu/o3/iew_impl.hh
@@ -1480,11 +1480,14 @@
             int dependents = instQueue.wakeDependents(inst);
 
             for (int i = 0; i < inst->numDestRegs(); i++) {
-                //mark as Ready
-                DPRINTF(IEW,"Setting Destination Register %i (%s)\n",
-                        inst->renamedDestRegIdx(i)->index(),
-                        inst->renamedDestRegIdx(i)->className());
-                scoreboard->setReg(inst->renamedDestRegIdx(i));
+                // Mark ready only when no pinned writes are still pending
+                if (inst->renamedDestRegIdx(i)->
+                        getNumPinnedWritesToComplete() == 0) {
+                    DPRINTF(IEW,"Setting Destination Register %i (%s)\n",
+                            inst->renamedDestRegIdx(i)->index(),
+                            inst->renamedDestRegIdx(i)->className());
+                    scoreboard->setReg(inst->renamedDestRegIdx(i));
+                }
             }
 
             if (dependents) {
diff --git a/src/cpu/o3/inst_queue_impl.hh b/src/cpu/o3/inst_queue_impl.hh
index 20b41e5..c3e3fdf 100644
--- a/src/cpu/o3/inst_queue_impl.hh
+++ b/src/cpu/o3/inst_queue_impl.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2014, 2017-2018 ARM Limited
+ * Copyright (c) 2011-2014, 2017-2019 ARM Limited
  * Copyright (c) 2013 Advanced Micro Devices, Inc.
  * All rights reserved.
  *
@@ -1033,6 +1033,17 @@
             continue;
         }
 
+        // Count this write; skip wakeup while pinned writes are pending
+        dest_reg->decrNumPinnedWritesToComplete();
+        if (dest_reg->isPinned())
+            completed_inst->setPinnedRegsWritten();
+
+        if (dest_reg->getNumPinnedWritesToComplete() != 0) {
+            DPRINTF(IQ, "Reg %d [%s] is pinned, skipping\n",
+                    dest_reg->index(), dest_reg->className());
+            continue;
+        }
+
         DPRINTF(IQ, "Waking any dependents on register %i (%s).\n",
                 dest_reg->index(),
                 dest_reg->className());
diff --git a/src/cpu/o3/regfile.cc b/src/cpu/o3/regfile.cc
index cc4bba6..afed8f9 100644
--- a/src/cpu/o3/regfile.cc
+++ b/src/cpu/o3/regfile.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2017 ARM Limited
+ * Copyright (c) 2016-2018 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -216,8 +216,8 @@
         return std::make_pair(miscRegIds.begin(), miscRegIds.end());
     }
     /* There is no way to make an empty iterator */
-    return std::make_pair(PhysIds::const_iterator(),
-                          PhysIds::const_iterator());
+    return std::make_pair(PhysIds::iterator(),
+                          PhysIds::iterator());
 }
 
 PhysRegIdPtr
diff --git a/src/cpu/o3/regfile.hh b/src/cpu/o3/regfile.hh
index d4b6602..352e6ac 100644
--- a/src/cpu/o3/regfile.hh
+++ b/src/cpu/o3/regfile.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2017 ARM Limited
+ * Copyright (c) 2016-2018 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -70,8 +70,8 @@
     using VecMode = Enums::VecRegRenameMode;
     using VecPredRegContainer = TheISA::VecPredRegContainer;
   public:
-    using IdRange = std::pair<PhysIds::const_iterator,
-                              PhysIds::const_iterator>;
+    using IdRange = std::pair<PhysIds::iterator,
+                              PhysIds::iterator>;
   private:
     static constexpr auto NumVecElemPerVecReg = TheISA::NumVecElemPerVecReg;
 
diff --git a/src/cpu/o3/rename_impl.hh b/src/cpu/o3/rename_impl.hh
index 43f0b27..d55bed6 100644
--- a/src/cpu/o3/rename_impl.hh
+++ b/src/cpu/o3/rename_impl.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010-2012, 2014-2016 ARM Limited
+ * Copyright (c) 2010-2012, 2014-2019 ARM Limited
  * Copyright (c) 2013 Advanced Micro Devices, Inc.
  * All rights reserved.
  *
@@ -976,7 +976,9 @@
         assert(hb_it != historyBuffer[tid].end());
 
         DPRINTF(Rename, "[tid:%i] Removing history entry with sequence "
-                "number %i.\n", tid, hb_it->instSeqNum);
+                "number %i (archReg: %d, newPhysReg: %d, prevPhysReg: %d).\n",
+                tid, hb_it->instSeqNum, hb_it->archReg.index(),
+                hb_it->newPhysReg->index(), hb_it->prevPhysReg->index());
 
         // Undo the rename mapping only if it was really a change.
         // Special regs that are not really renamed (like misc regs
@@ -1140,12 +1142,12 @@
         typename RenameMap::RenameInfo rename_result;
 
         RegId flat_dest_regid = tc->flattenRegId(dest_reg);
+        flat_dest_regid.setNumPinnedWrites(dest_reg.getNumPinnedWrites());
 
         rename_result = map->rename(flat_dest_regid);
 
         inst->flattenDestReg(dest_idx, flat_dest_regid);
 
-        // Mark Scoreboard entry as not ready
         scoreboard->unsetReg(rename_result.first);
 
         DPRINTF(Rename,
diff --git a/src/cpu/o3/rename_map.cc b/src/cpu/o3/rename_map.cc
index 9d912e5..64f3dbf 100644
--- a/src/cpu/o3/rename_map.cc
+++ b/src/cpu/o3/rename_map.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2017,2019 ARM Limited
+ * Copyright (c) 2016-2018,2019 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -78,16 +78,23 @@
     // requested architected register.
     PhysRegIdPtr prev_reg = map[arch_reg.flatIndex()];
 
-    // If it's not referencing the zero register, then rename the
-    // register.
-    if (arch_reg != zeroReg) {
-        renamed_reg = freeList->getReg();
-
-        map[arch_reg.flatIndex()] = renamed_reg;
-    } else {
-        // Otherwise return the zero register so nothing bad happens.
+    if (arch_reg == zeroReg) {
         assert(prev_reg->isZeroReg());
         renamed_reg = prev_reg;
+    } else if (prev_reg->getNumPinnedWrites() > 0) {
+        // Do not rename if the register is pinned
+        assert(arch_reg.getNumPinnedWrites() == 0);  // Prevent pinning the
+                                                     // same register twice
+        DPRINTF(Rename, "Renaming pinned reg, numPinnedWrites %d\n",
+                prev_reg->getNumPinnedWrites());
+        renamed_reg = prev_reg;
+        renamed_reg->decrNumPinnedWrites();
+    } else {
+        renamed_reg = freeList->getReg();
+        map[arch_reg.flatIndex()] = renamed_reg;
+        renamed_reg->setNumPinnedWrites(arch_reg.getNumPinnedWrites());
+        renamed_reg->setNumPinnedWritesToComplete(
+            arch_reg.getNumPinnedWrites() + 1);
     }
 
     DPRINTF(Rename, "Renamed reg %d to physical reg %d (%d) old mapping was"
diff --git a/src/cpu/reg_class.hh b/src/cpu/reg_class.hh
index 63a6c86..bd49d15 100644
--- a/src/cpu/reg_class.hh
+++ b/src/cpu/reg_class.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2017 ARM Limited
+ * Copyright (c) 2016-2018 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -77,24 +77,29 @@
  * index 3 is represented by Regid(IntRegClass, 3).
  */
 class RegId {
-  private:
+  protected:
     static const char* regClassStrings[];
     RegClass regClass;
     RegIndex regIdx;
     ElemIndex elemIdx;
     static constexpr size_t Scale = TheISA::NumVecElemPerVecReg;
+    int numPinnedWrites;
+
     friend struct std::hash<RegId>;
+
   public:
     RegId() : regClass(IntRegClass), regIdx(0), elemIdx(-1) {}
     RegId(RegClass reg_class, RegIndex reg_idx)
-        : regClass(reg_class), regIdx(reg_idx), elemIdx(-1)
+        : regClass(reg_class), regIdx(reg_idx), elemIdx(-1),
+          numPinnedWrites(0)
     {
         panic_if(regClass == VecElemClass,
                 "Creating vector physical index w/o element index");
     }
 
     explicit RegId(RegClass reg_class, RegIndex reg_idx, ElemIndex elem_idx)
-        : regClass(reg_class), regIdx(reg_idx), elemIdx(elem_idx)
+        : regClass(reg_class), regIdx(reg_idx), elemIdx(elem_idx),
+          numPinnedWrites(0)
     {
         panic_if(regClass != VecElemClass,
                 "Creating non-vector physical index w/ element index");
@@ -202,6 +207,9 @@
     /** Return a const char* with the register class name. */
     const char* className() const { return regClassStrings[regClass]; }
 
+    int getNumPinnedWrites() const { return numPinnedWrites; }
+    void setNumPinnedWrites(int num_writes) { numPinnedWrites = num_writes; }
+
     friend std::ostream&
     operator<<(std::ostream& os, const RegId& rid) {
         return os << rid.className() << "{" << rid.index() << "}";
@@ -221,20 +229,27 @@
 class PhysRegId : private RegId {
   private:
     PhysRegIndex flatIdx;
+    int numPinnedWritesToComplete;
+    bool pinned;
 
   public:
-    explicit PhysRegId() : RegId(IntRegClass, -1), flatIdx(-1) {}
+    /** Default ctor; clears pinning state so isPinned() is well-defined. */
+    explicit PhysRegId() : RegId(IntRegClass, -1), flatIdx(-1),
+        numPinnedWritesToComplete(0), pinned(false) {}
 
     /** Scalar PhysRegId constructor. */
     explicit PhysRegId(RegClass _regClass, PhysRegIndex _regIdx,
               PhysRegIndex _flatIdx)
-        : RegId(_regClass, _regIdx), flatIdx(_flatIdx)
+        : RegId(_regClass, _regIdx), flatIdx(_flatIdx),
+          numPinnedWritesToComplete(0), pinned(false)
     {}
 
     /** Vector PhysRegId constructor (w/ elemIndex). */
     explicit PhysRegId(RegClass _regClass, PhysRegIndex _regIdx,
               ElemIndex elem_idx, PhysRegIndex flat_idx)
-        : RegId(_regClass, _regIdx, elem_idx), flatIdx(flat_idx) { }
+        : RegId(_regClass, _regIdx, elem_idx), flatIdx(flat_idx),
+          numPinnedWritesToComplete(0), pinned(false)
+    {}
 
     /** Visible RegId methods */
     /** @{ */
@@ -295,17 +310,46 @@
     /** Flat index accessor */
     const PhysRegIndex& flatIndex() const { return flatIdx; }
 
-    static PhysRegId elemId(const PhysRegId* vid, ElemIndex elem)
+    static PhysRegId elemId(PhysRegId* vid, ElemIndex elem)
     {
         assert(vid->isVectorPhysReg());
         return PhysRegId(VecElemClass, vid->index(), elem);
     }
+
+    int getNumPinnedWrites() const { return numPinnedWrites; }
+
+    void setNumPinnedWrites(int numWrites)
+    {
+        // An instruction with a pinned destination reg can get
+        // squashed. The numPinnedWrites counter may be zero when
+        // the squash happens but we need to know if the dest reg
+        // was pinned originally in order to reset counters properly
+        // for a possible re-rename using the same physical reg (which
+        // may be required in case of a mem access order violation).
+        pinned = (numWrites != 0);
+        numPinnedWrites = numWrites;
+    }
+
+    void decrNumPinnedWrites() { --numPinnedWrites; }
+    void incrNumPinnedWrites() { ++numPinnedWrites; }
+
+    bool isPinned() const { return pinned; }
+
+    int getNumPinnedWritesToComplete() const
+    {
+        return numPinnedWritesToComplete;
+    }
+
+    void setNumPinnedWritesToComplete(int numWrites)
+    {
+        numPinnedWritesToComplete = numWrites;
+    }
+
+    void decrNumPinnedWritesToComplete() { --numPinnedWritesToComplete; }
+    void incrNumPinnedWritesToComplete() { ++numPinnedWritesToComplete; }
 };
 
-/** Constant pointer definition.
- * PhysRegIds only need to be created once and then we can just share
- * pointers */
-using PhysRegIdPtr = const PhysRegId*;
+using PhysRegIdPtr = PhysRegId*;
 
 namespace std
 {