mem-cache: A Best-Offset Prefetcher

Michaud, P. (2015, June). A best-offset prefetcher.
In 2nd Data Prefetching Championship.

Change-Id: I61bb89ca5639356d54aeb04e856d5bf6e8805c22
Reviewed-on: https://gem5-review.googlesource.com/c/14820
Reviewed-by: Nikos Nikoleris <nikos.nikoleris@arm.com>
Maintainer: Nikos Nikoleris <nikos.nikoleris@arm.com>
diff --git a/src/mem/cache/base.cc b/src/mem/cache/base.cc
index 6049ca6..5b4307b 100644
--- a/src/mem/cache/base.cc
+++ b/src/mem/cache/base.cc
@@ -478,6 +478,7 @@
             writeAllocator->allocate() : mshr->allocOnFill();
         blk = handleFill(pkt, blk, writebacks, allocate);
         assert(blk != nullptr);
+        ppFill->notify(pkt);
     }
 
     if (blk && blk->isValid() && pkt->isClean() && !pkt->isInvalidate()) {
@@ -2223,6 +2224,7 @@
 {
     ppHit = new ProbePointArg<PacketPtr>(this->getProbeManager(), "Hit");
     ppMiss = new ProbePointArg<PacketPtr>(this->getProbeManager(), "Miss");
+    ppFill = new ProbePointArg<PacketPtr>(this->getProbeManager(), "Fill");
 }
 
 ///////////////
diff --git a/src/mem/cache/base.hh b/src/mem/cache/base.hh
index 6144fd5..1137241 100644
--- a/src/mem/cache/base.hh
+++ b/src/mem/cache/base.hh
@@ -331,6 +331,9 @@
     /** To probe when a cache miss occurs */
     ProbePointArg<PacketPtr> *ppMiss;
 
+    /** To probe when a cache fill occurs */
+    ProbePointArg<PacketPtr> *ppFill;
+
     /**
      * The writeAllocator drive optimizations for streaming writes.
      * It first determines whether a WriteReq MSHR should be delayed,
diff --git a/src/mem/cache/prefetch/Prefetcher.py b/src/mem/cache/prefetch/Prefetcher.py
index 36e2541..31d7486 100644
--- a/src/mem/cache/prefetch/Prefetcher.py
+++ b/src/mem/cache/prefetch/Prefetcher.py
@@ -341,3 +341,25 @@
     dcpt = Param.DeltaCorrelatingPredictionTables(
         SlimDeltaCorrelatingPredictionTables(),
         "Delta Correlating Prediction Tables object")
+
+class BOPPrefetcher(QueuedPrefetcher):
+    type = "BOPPrefetcher"
+    cxx_class = "BOPPrefetcher"
+    cxx_header = "mem/cache/prefetch/bop.hh"
+    score_max = Param.Unsigned(31, "Max. score to update the best offset")
+    round_max = Param.Unsigned(100, "Max. round to update the best offset")
+    bad_score = Param.Unsigned(10, "Score at which the HWP is disabled")
+    rr_size = Param.Unsigned(64, "Number of entries of each RR bank")
+    tag_bits = Param.Unsigned(12, "Bits used to store the tag")
+    offset_list_size = Param.Unsigned(46,
+                "Number of entries in the offsets list")
+    negative_offsets_enable = Param.Bool(True,
+                "Initialize the offsets list also with negative values \
+                (i.e. the table will have half of the entries with positive \
+                offsets and the other half with negative ones)")
+    delay_queue_enable = Param.Bool(True, "Enable the delay queue")
+    delay_queue_size = Param.Unsigned(15,
+                "Number of entries in the delay queue")
+    delay_queue_cycles = Param.Cycles(60,
+                "Cycles to delay a write in the left RR table from the delay \
+                queue")
diff --git a/src/mem/cache/prefetch/SConscript b/src/mem/cache/prefetch/SConscript
index 48ee218..d87b694 100644
--- a/src/mem/cache/prefetch/SConscript
+++ b/src/mem/cache/prefetch/SConscript
@@ -34,6 +34,7 @@
 
 Source('access_map_pattern_matching.cc')
 Source('base.cc')
+Source('bop.cc')
 Source('delta_correlating_prediction_tables.cc')
 Source('irregular_stream_buffer.cc')
 Source('queued.cc')
diff --git a/src/mem/cache/prefetch/base.cc b/src/mem/cache/prefetch/base.cc
index cd3eade..52f5d1a 100644
--- a/src/mem/cache/prefetch/base.cc
+++ b/src/mem/cache/prefetch/base.cc
@@ -72,7 +72,11 @@
 void
 BasePrefetcher::PrefetchListener::notify(const PacketPtr &pkt)
 {
-    parent.probeNotify(pkt);
+    if (isFill) {
+        parent.notifyFill(pkt);
+    } else {
+        parent.probeNotify(pkt);
+    }
 }
 
 BasePrefetcher::BasePrefetcher(const BasePrefetcherParams *p)
@@ -224,6 +228,7 @@
     if (listeners.empty() && cache != nullptr) {
         ProbeManager *pm(cache->getProbeManager());
         listeners.push_back(new PrefetchListener(*this, pm, "Miss"));
+        listeners.push_back(new PrefetchListener(*this, pm, "Fill", true));
         if (prefetchOnAccess) {
             listeners.push_back(new PrefetchListener(*this, pm, "Hit"));
         }
diff --git a/src/mem/cache/prefetch/base.hh b/src/mem/cache/prefetch/base.hh
index de275f8..4df8428 100644
--- a/src/mem/cache/prefetch/base.hh
+++ b/src/mem/cache/prefetch/base.hh
@@ -67,12 +67,13 @@
     {
       public:
         PrefetchListener(BasePrefetcher &_parent, ProbeManager *pm,
-                         const std::string &name)
+                         const std::string &name, bool _isFill = false)
             : ProbeListenerArgBase(pm, name),
-              parent(_parent) {}
+              parent(_parent), isFill(_isFill) {}
         void notify(const PacketPtr &pkt) override;
       protected:
         BasePrefetcher &parent;
+        bool isFill;
     };
 
     std::vector<PrefetchListener *> listeners;
@@ -253,6 +254,10 @@
      */
     virtual void notify(const PacketPtr &pkt, const PrefetchInfo &pfi) = 0;
 
+    /** Notify prefetcher of cache fill */
+    virtual void notifyFill(const PacketPtr &pkt)
+    {}
+
     virtual PacketPtr getPacket() = 0;
 
     virtual Tick nextPrefetchReadyTime() const = 0;
diff --git a/src/mem/cache/prefetch/bop.cc b/src/mem/cache/prefetch/bop.cc
new file mode 100644
index 0000000..b79082f
--- /dev/null
+++ b/src/mem/cache/prefetch/bop.cc
@@ -0,0 +1,267 @@
+/**
+ * Copyright (c) 2018 Metempsy Technology Consulting
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Ivan Pizarro
+ */
+
+#include "mem/cache/prefetch/bop.hh"
+
+#include "debug/HWPrefetch.hh"
+#include "params/BOPPrefetcher.hh"
+
+BOPPrefetcher::BOPPrefetcher(const BOPPrefetcherParams *p)
+    : QueuedPrefetcher(p),
+      scoreMax(p->score_max), roundMax(p->round_max),
+      badScore(p->bad_score), rrEntries(p->rr_size),
+      tagMask((1 << p->tag_bits) - 1),
+      delayQueueEnabled(p->delay_queue_enable),
+      delayQueueSize(p->delay_queue_size),
+      delayTicks(cyclesToTicks(p->delay_queue_cycles)),
+      delayQueueEvent([this]{ delayQueueEventWrapper(); }, name()),
+      issuePrefetchRequests(false), bestOffset(1), phaseBestOffset(0),
+      bestScore(0), round(0)
+{
+    if (!isPowerOf2(rrEntries)) {
+        fatal("%s: number of RR entries is not power of 2\n", name());
+    }
+    if (!isPowerOf2(blkSize)) {
+        fatal("%s: cache line size is not power of 2\n", name());
+    }
+    if (!(p->negative_offsets_enable && (p->offset_list_size % 2 == 0))) {
+        fatal("%s: negative offsets enabled with odd offset list size\n",
+              name());
+    }
+
+    rrLeft.resize(rrEntries);
+    rrRight.resize(rrEntries);
+
+    // Following the paper implementation, a list with the specified number
+    // of offsets which are of the form 2^i * 3^j * 5^k with i,j,k >= 0
+    const int factors[] = { 2, 3, 5 };
+    unsigned int i = 0;
+    int64_t offset_i = 1;
+
+    while (i < p->offset_list_size)
+    {
+        int64_t offset = offset_i;
+
+        for (int n : factors) {
+            while ((offset % n) == 0) {
+                offset /= n;
+            }
+        }
+
+        if (offset == 1) {
+            offsetsList.push_back(OffsetListEntry(offset_i, 0));
+            i++;
+            // If we want to use negative offsets, add also the negative value
+            // of the offset just calculated
+            if (p->negative_offsets_enable)  {
+                offsetsList.push_back(OffsetListEntry(-offset_i, 0));
+                i++;
+            }
+        }
+
+        offset_i++;
+    }
+
+    offsetsListIterator = offsetsList.begin();
+}
+
+void
+BOPPrefetcher::delayQueueEventWrapper()
+{
+    while (!delayQueue.empty() &&
+            delayQueue.front().processTick <= curTick())
+    {
+        Addr addr_x = delayQueue.front().baseAddr;
+        insertIntoRR(addr_x, RRWay::Left);
+        delayQueue.pop_front();
+    }
+
+    // Schedule an event for the next element if there is one
+    if (!delayQueue.empty()) {
+        schedule(delayQueueEvent, delayQueue.front().processTick);
+    }
+}
+
+unsigned int
+BOPPrefetcher::hash(Addr addr, unsigned int way) const
+{
+    Addr hash1 = addr >> way;
+    Addr hash2 = hash1 >> floorLog2(rrEntries);
+    return (hash1 ^ hash2) & (Addr)(rrEntries - 1);
+}
+
+void
+BOPPrefetcher::insertIntoRR(Addr addr, unsigned int way)
+{
+    switch (way) {
+        case RRWay::Left:
+            rrLeft[hash(addr, RRWay::Left)] = addr;
+            break;
+        case RRWay::Right:
+            rrRight[hash(addr, RRWay::Right)] = addr;
+            break;
+    }
+}
+
+void
+BOPPrefetcher::insertIntoDelayQueue(Addr x)
+{
+    if (delayQueue.size() == delayQueueSize) {
+        return;
+    }
+
+    // Add the address to the delay queue and schedule an event to process
+    // it after the specified delay cycles
+    Tick process_tick = curTick() + delayTicks;
+
+    delayQueue.push_back(DelayQueueEntry(x, process_tick));
+
+    if (!delayQueueEvent.scheduled()) {
+        schedule(delayQueueEvent, process_tick);
+    }
+}
+
+void
+BOPPrefetcher::resetScores()
+{
+    for (auto& it : offsetsList) {
+        it.second = 0;
+    }
+}
+
+inline Addr
+BOPPrefetcher::tag(Addr addr) const
+{
+    return (addr >> blkSize) & tagMask;
+}
+
+bool
+BOPPrefetcher::testRR(Addr addr) const
+{
+    for (auto& it : rrLeft) {
+        if (it == addr) {
+            return true;
+        }
+    }
+
+    for (auto& it : rrRight) {
+        if (it == addr) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+void
+BOPPrefetcher::bestOffsetLearning(Addr x)
+{
+    Addr offset_addr = (*offsetsListIterator).first;
+    Addr lookup_addr = x - offset_addr;
+
+    // There was a hit in the RR table, increment the score for this offset
+    if (testRR(lookup_addr)) {
+        DPRINTF(HWPrefetch, "Address %#lx found in the RR table\n", x);
+        (*offsetsListIterator).second++;
+        if ((*offsetsListIterator).second > bestScore) {
+            bestScore = (*offsetsListIterator).second;
+            phaseBestOffset = (*offsetsListIterator).first;
+            DPRINTF(HWPrefetch, "New best score is %lu\n", bestScore);
+        }
+    }
+
+    offsetsListIterator++;
+
+    // All the offsets in the list were visited meaning that a learning
+    // phase finished. Check if
+    if (offsetsListIterator == offsetsList.end()) {
+        offsetsListIterator = offsetsList.begin();
+        round++;
+
+        // Check if the best offset must be updated if:
+        // (1) One of the scores equals SCORE_MAX
+        // (2) The number of rounds equals ROUND_MAX
+        if ((bestScore >= scoreMax) || (round == roundMax)) {
+            bestOffset = phaseBestOffset;
+            round = 0;
+            bestScore = 0;
+            phaseBestOffset = 0;
+            resetScores();
+            issuePrefetchRequests = true;
+        } else if (phaseBestOffset <= badScore) {
+            issuePrefetchRequests = false;
+        }
+    }
+}
+
+void
+BOPPrefetcher::calculatePrefetch(const PrefetchInfo &pfi,
+        std::vector<AddrPriority> &addresses)
+{
+    Addr addr = pfi.getAddr();
+    Addr tag_x = tag(addr);
+
+    if (delayQueueEnabled) {
+        insertIntoDelayQueue(tag_x);
+    } else {
+        insertIntoRR(tag_x, RRWay::Left);
+    }
+
+    // Go through the nth offset and update the score, the best score and the
+    // current best offset if a better one is found
+    bestOffsetLearning(tag_x);
+
+    // This prefetcher is a degree 1 prefetch, so it will only generate one
+    // prefetch at most per access
+    if (issuePrefetchRequests) {
+        Addr prefetch_addr = addr + (bestOffset << lBlkSize);
+        addresses.push_back(AddrPriority(prefetch_addr, 0));
+        DPRINTF(HWPrefetch, "Generated prefetch %#lx\n", prefetch_addr);
+    }
+}
+
+void
+BOPPrefetcher::notifyFill(const PacketPtr& pkt)
+{
+    // Only insert into the RR right way if it's the pkt is a HWP
+    if (!pkt->cmd.isHWPrefetch()) return;
+
+    Addr tag_y = tag(pkt->getAddr());
+
+    if (issuePrefetchRequests) {
+        insertIntoRR(tag_y - bestOffset, RRWay::Right);
+    }
+}
+
+BOPPrefetcher*
+BOPPrefetcherParams::create()
+{
+   return new BOPPrefetcher(this);
+}
diff --git a/src/mem/cache/prefetch/bop.hh b/src/mem/cache/prefetch/bop.hh
new file mode 100644
index 0000000..e8fd770
--- /dev/null
+++ b/src/mem/cache/prefetch/bop.hh
@@ -0,0 +1,157 @@
+/**
+ * Copyright (c) 2018 Metempsy Technology Consulting
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Ivan Pizarro
+ */
+
+/**
+ * Implementation of the 'A Best-Offset Prefetcher'
+ * Reference:
+ *   Michaud, P. (2015, June). A best-offset prefetcher.
+ *   In 2nd Data Prefetching Championship.
+ */
+
+#ifndef __MEM_CACHE_PREFETCH_BOP_HH__
+#define __MEM_CACHE_PREFETCH_BOP_HH__
+
+#include <queue>
+
+#include "mem/cache/prefetch/queued.hh"
+#include "mem/packet.hh"
+
+struct BOPPrefetcherParams;
+
+class BOPPrefetcher : public QueuedPrefetcher
+{
+    private:
+
+        enum RRWay {
+            Left,
+            Right
+        };
+
+        /** Learning phase parameters */
+        const unsigned int scoreMax;
+        const unsigned int roundMax;
+        const unsigned int badScore;
+        /** Recent requests table parameteres */
+        const unsigned int rrEntries;
+        const unsigned int tagMask;
+        /** Delay queue parameters */
+        const bool         delayQueueEnabled;
+        const unsigned int delayQueueSize;
+        const unsigned int delayTicks;
+
+        std::vector<Addr> rrLeft;
+        std::vector<Addr> rrRight;
+
+        /** Structure to save the offset and the score */
+        typedef std::pair<int16_t, uint8_t> OffsetListEntry;
+        std::vector<OffsetListEntry> offsetsList;
+
+        /** In a first implementation of the BO prefetcher, both banks of the
+         *  RR were written simultaneously when a prefetched line is inserted
+         *  into the cache. Adding the delay queue tries to avoid always
+         *  striving for timeless prefetches, which has been found to not
+         *  always being optimal.
+         */
+        struct DelayQueueEntry
+        {
+            Addr baseAddr;
+            Tick processTick;
+
+            DelayQueueEntry(Addr x, Tick t) : baseAddr(x), processTick(t)
+            {}
+        };
+
+        std::deque<DelayQueueEntry> delayQueue;
+
+        /** Event to handle the delay queue processing */
+        void delayQueueEventWrapper();
+        EventFunctionWrapper delayQueueEvent;
+
+        /** Hardware prefetcher enabled */
+        bool issuePrefetchRequests;
+        /** Current best offset to issue prefetches */
+        Addr bestOffset;
+        /** Current best offset found in the learning phase */
+        Addr phaseBestOffset;
+        /** Current test offset index */
+        std::vector<OffsetListEntry>::iterator offsetsListIterator;
+        /** Max score found so far */
+        unsigned int bestScore;
+        /** Current round */
+        unsigned int round;
+
+        /** Generate a hash for the specified address to index the RR table
+         *  @param addr: address to hash
+         *  @param way:  RR table to which is addressed (left/right)
+         */
+        unsigned int hash(Addr addr, unsigned int way) const;
+
+        /** Insert the specified address into the RR table
+         *  @param addr: address to insert
+         *  @param way: RR table to which the address will be inserted
+         */
+        void insertIntoRR(Addr addr, unsigned int way);
+
+        /** Insert the specified address into the delay queue. This will
+         *  trigger an event after the delay cycles pass
+         *  @param addr: address to insert into the delay queue
+         */
+        void insertIntoDelayQueue(Addr addr);
+
+        /** Reset all the scores from the offset list */
+        void resetScores();
+
+        /** Generate the tag for the specified address based on the tag bits
+         *  and the block size
+         *  @param addr: address to get the tag from
+        */
+        Addr tag(Addr addr) const;
+
+        /** Test if @X-O is hitting in the RR table to update the
+            offset score */
+        bool testRR(Addr) const;
+
+        /** Learning phase of the BOP. Update the intermediate values of the
+            round and update the best offset if found */
+        void bestOffsetLearning(Addr);
+
+        /** Update the RR right table after a prefetch fill */
+        void notifyFill(const PacketPtr& pkt) override;
+
+    public:
+
+        BOPPrefetcher(const BOPPrefetcherParams *p);
+        ~BOPPrefetcher() {}
+
+        void calculatePrefetch(const PrefetchInfo &pfi,
+                               std::vector<AddrPriority> &addresses);
+};
+
+#endif /* __MEM_CACHE_PREFETCH_BOP_HH__ */