mem: Add support for CMOs in the cache

This change adds support for cache maintenance operations (CMOs) in the
cache. The supported memory operations clean and/or invalidate a cache
block, specified by its VA, down to the specified xbar (PoU, PoC).

A cache maintenance packet visits all memories down to the specified
xbar. Caches need to invalidate their copy if it is an invalidating
CMO. If it is (additionally) a cleaning CMO and a dirty copy exists,
the cache cleans it with a WriteClean request.

Change-Id: Ibf31daa7213925898f3408738b11b1dd76c90b79
Reviewed-by: Stephan Diestelhorst <stephan.diestelhorst@arm.com>
Reviewed-on: https://gem5-review.googlesource.com/5049
Maintainer: Nikos Nikoleris <nikos.nikoleris@arm.com>
Reviewed-by: Jason Lowe-Power <jason@lowepower.com>
diff --git a/src/mem/cache/cache.cc b/src/mem/cache/cache.cc
index 48974c7..b0e904d 100644
--- a/src/mem/cache/cache.cc
+++ b/src/mem/cache/cache.cc
@@ -44,6 +44,7 @@
  *          Steve Reinhardt
  *          Ron Dreslinski
  *          Andreas Sandberg
+ *          Nikos Nikoleris
  */
 
 /**
@@ -322,6 +323,15 @@
     DPRINTF(Cache, "%s %s\n", pkt->print(),
             blk ? "hit " + blk->print() : "miss");
 
+    if (pkt->req->isCacheMaintenance()) {
+        // A cache maintenance operation is always forwarded to the
+        // memory below even if the block is found in dirty state.
+
+        // We defer any changes to the state of the block until we
+        // create and mark as in service the mshr for the downstream
+        // packet.
+        return false;
+    }
 
     if (pkt->isEviction()) {
         // We check for presence of block in above caches before issuing
@@ -650,6 +660,10 @@
 
     promoteWholeLineWrites(pkt);
 
+    // Cache maintenance operations have to visit all the caches down
+    // to the specified xbar (PoC, PoU, etc.). Even if a cache above
+    // is responding we forward the packet to the memory below rather
+    // than creating an express snoop.
     if (pkt->cacheResponding()) {
         // a cache above us (but not where the packet came from) is
         // responding to the request, in other words it has the line
@@ -763,8 +777,10 @@
                 blk->status &= ~BlkHWPrefetched;
 
             // Don't notify on SWPrefetch
-            if (!pkt->cmd.isSWPrefetch())
+            if (!pkt->cmd.isSWPrefetch()) {
+                assert(!pkt->req->isCacheMaintenance());
                 next_pf_time = prefetcher->notify(pkt);
+            }
         }
 
         if (needsResponse) {
@@ -855,6 +871,13 @@
                 // outstanding requests in MSHRs are simply sunk here
                 if (pkt->cmd == MemCmd::CleanEvict) {
                     pendingDelete.reset(pkt);
+                } else if (pkt->cmd == MemCmd::WriteClean) {
+                    // A WriteClean should never coalesce with any
+                    // outstanding cache maintenance requests.
+
+                    // We use forward_time here because there is an
+                    // uncached memory write, forwarded to WriteBuffer.
+                    allocateWriteBuffer(pkt, forward_time);
                 } else {
                     DPRINTF(Cache, "%s coalescing MSHR for %s\n", __func__,
                             pkt->print());
@@ -888,7 +911,8 @@
                 // know about the request
                 if (prefetcher) {
                     // Don't notify on SWPrefetch
-                    if (!pkt->cmd.isSWPrefetch())
+                    if (!pkt->cmd.isSWPrefetch() &&
+                        !pkt->req->isCacheMaintenance())
                         next_pf_time = prefetcher->notify(pkt);
                 }
             }
@@ -926,8 +950,8 @@
                     // internally, and have a sufficiently weak memory
                     // model, this is probably unnecessary, but at some
                     // point it must have seemed like we needed it...
-                    assert(pkt->needsWritable());
-                    assert(!blk->isWritable());
+                    assert((pkt->needsWritable() && !blk->isWritable()) ||
+                           pkt->req->isCacheMaintenance());
                     blk->status &= ~BlkReadable;
                 }
                 // Here we are using forward_time, modelling the latency of
@@ -938,7 +962,8 @@
 
             if (prefetcher) {
                 // Don't notify on SWPrefetch
-                if (!pkt->cmd.isSWPrefetch())
+                if (!pkt->cmd.isSWPrefetch() &&
+                    !pkt->req->isCacheMaintenance())
                     next_pf_time = prefetcher->notify(pkt);
             }
         }
@@ -961,7 +986,7 @@
 
     if (cpu_pkt->req->isUncacheable() ||
         (!blkValid && cpu_pkt->isUpgrade()) ||
-        cpu_pkt->cmd == MemCmd::InvalidateReq) {
+        cpu_pkt->cmd == MemCmd::InvalidateReq || cpu_pkt->isClean()) {
         // uncacheable requests and upgrades from upper-level caches
         // that missed completely just go through as is
         return nullptr;
@@ -1038,7 +1063,8 @@
 
     // follow the same flow as in recvTimingReq, and check if a cache
     // above us is responding
-    if (pkt->cacheResponding()) {
+    if (pkt->cacheResponding() && !pkt->isClean()) {
+        assert(!pkt->req->isCacheInvalidate());
         DPRINTF(Cache, "Cache above responding to %s: not responding\n",
                 pkt->print());
 
@@ -1059,6 +1085,18 @@
     PacketList writebacks;
     bool satisfied = access(pkt, blk, lat, writebacks);
 
+    if (pkt->isClean() && blk && blk->isDirty()) {
+        // A cache clean operation is looking for a dirty
+        // block. If a dirty block is encountered a WriteClean
+        // will update any copies to the path to the memory
+        // until the point of reference.
+        DPRINTF(CacheVerbose, "%s: packet %s found block: %s\n",
+                __func__, pkt->print(), blk->print());
+        PacketPtr wb_pkt = writecleanBlk(blk, pkt->req->getDest());
+        writebacks.push_back(wb_pkt);
+        pkt->setSatisfied();
+    }
+
     // handle writebacks resulting from the access here to ensure they
     // logically proceed anything happening below
     doWritebacksAtomic(writebacks);
@@ -1344,7 +1382,8 @@
     // If the response indicates that there are no sharers and we
     // either had the block already or the response is filling we can
     // promote our copy to writable
-    if (!pkt->hasSharers() && (is_fill || valid_blk)) {
+    if (!pkt->hasSharers() &&
+        (is_fill || (valid_blk && !pkt->req->isCacheInvalidate()))) {
         mshr->promoteWritable();
     }
 
@@ -1360,6 +1399,12 @@
     // requests to be discarded
     bool is_invalidate = pkt->isInvalidate();
 
+    // The block was marked as not readable while there was a pending
+    // cache maintenance operation, restore its flag.
+    if (pkt->isClean() && !is_invalidate && valid_blk) {
+        blk->status |= BlkReadable;
+    }
+
     // First offset for critical word first calculations
     int initial_offset = initial_tgt->pkt->getOffset(blkSize);
 
@@ -1502,16 +1547,16 @@
             DPRINTF(Cache, "processing deferred snoop...\n");
             // If the response is invalidating, a snooping target can
             // be satisfied if it is also invalidating. If the reponse is, not
-            // only invalidating, but more specifically an InvalidateResp, the
-            // MSHR was created due to an InvalidateReq and a cache above is
-            // waiting to satisfy a WriteLineReq. In this case even an
+            // only invalidating, but more specifically an InvalidateResp and
+            // the MSHR was created due to an InvalidateReq then a cache above
+            // is waiting to satisfy a WriteLineReq. In this case even an
             // non-invalidating snoop is added as a target here since this is
             // the ordering point. When the InvalidateResp reaches this cache,
             // the snooping target will snoop further the cache above with the
             // WriteLineReq.
-            assert(!(is_invalidate &&
-                     pkt->cmd != MemCmd::InvalidateResp &&
-                     !mshr->hasPostInvalidate()));
+            assert(!is_invalidate || pkt->cmd == MemCmd::InvalidateResp ||
+                   pkt->req->isCacheMaintenance() ||
+                   mshr->hasPostInvalidate());
             handleSnoop(tgt_pkt, blk, true, true, mshr->hasPostInvalidate());
             break;
 
@@ -2027,6 +2072,11 @@
             if (snoopPkt.isBlockCached()) {
                 pkt->setBlockCached();
             }
+            // If the request was satisfied by snooping the cache
+            // above, mark the original packet as satisfied too.
+            if (snoopPkt.satisfied()) {
+                pkt->setSatisfied();
+            }
         } else {
             cpuSidePort->sendAtomicSnoop(pkt);
             if (!alreadyResponded && pkt->cacheResponding()) {
@@ -2037,7 +2087,28 @@
         }
     }
 
-    if (!blk || !blk->isValid()) {
+    bool respond = false;
+    bool blk_valid = blk && blk->isValid();
+    if (pkt->isClean()) {
+        if (blk_valid && blk->isDirty()) {
+            DPRINTF(CacheVerbose, "%s: packet (snoop) %s found block: %s\n",
+                    __func__, pkt->print(), blk->print());
+            PacketPtr wb_pkt = writecleanBlk(blk, pkt->req->getDest());
+            PacketList writebacks;
+            writebacks.push_back(wb_pkt);
+
+            if (is_timing) {
+                // anything that is merely forwarded pays for the forward
+                // latency and the delay provided by the crossbar
+                Tick forward_time = clockEdge(forwardLatency) +
+                    pkt->headerDelay;
+                doWritebacks(writebacks, forward_time);
+            } else {
+                doWritebacksAtomic(writebacks);
+            }
+            pkt->setSatisfied();
+        }
+    } else if (!blk_valid) {
         DPRINTF(CacheVerbose, "%s: snoop miss for %s\n", __func__,
                 pkt->print());
         if (is_deferred) {
@@ -2056,20 +2127,20 @@
     } else {
         DPRINTF(Cache, "%s: snoop hit for %s, old state is %s\n", __func__,
                 pkt->print(), blk->print());
+
+        // We may end up modifying both the block state and the packet (if
+        // we respond in atomic mode), so just figure out what to do now
+        // and then do it later. We respond to all snoops that need
+        // responses provided we have the block in dirty state. The
+        // invalidation itself is taken care of below. We don't respond to
+        // cache maintenance operations as this is done by the destination
+        // xbar.
+        respond = blk->isDirty() && pkt->needsResponse();
+
+        chatty_assert(!(isReadOnly && blk->isDirty()), "Should never have "
+                      "a dirty block in a read-only cache %s\n", name());
     }
 
-    chatty_assert(!(isReadOnly && blk->isDirty()),
-                  "Should never have a dirty block in a read-only cache %s\n",
-                  name());
-
-    // We may end up modifying both the block state and the packet (if
-    // we respond in atomic mode), so just figure out what to do now
-    // and then do it later. We respond to all snoops that need
-    // responses provided we have the block in dirty state. The
-    // invalidation itself is taken care of below.
-    bool respond = blk->isDirty() && pkt->needsResponse();
-    bool have_writable = blk->isWritable();
-
     // Invalidate any prefetch's from below that would strip write permissions
     // MemCmd::HardPFReq is only observed by upstream caches.  After missing
     // above and in it's own cache, a new MemCmd::ReadReq is created that
@@ -2093,6 +2164,7 @@
         // Exclusive to Shared, or remain in Shared
         if (!pkt->req->isUncacheable())
             blk->status &= ~BlkWritable;
+        DPRINTF(Cache, "new state is %s\n", blk->print());
     }
 
     if (respond) {
@@ -2100,7 +2172,7 @@
         // memory, and also prevent any memory from even seeing the
         // request
         pkt->setCacheResponding();
-        if (have_writable) {
+        if (!pkt->isClean() && blk->isWritable()) {
             // inform the cache hierarchy that this cache had the line
             // in the Modified state so that we avoid unnecessary
             // invalidations (see Packet::setResponderHadWritable)
@@ -2149,12 +2221,11 @@
 
     // Do this last in case it deallocates block data or something
     // like that
-    if (invalidate) {
+    if (blk_valid && invalidate) {
         invalidateBlock(blk);
+        DPRINTF(Cache, "new state is %s\n", blk->print());
     }
 
-    DPRINTF(Cache, "new state is %s\n", blk->print());
-
     return snoop_delay;
 }
 
@@ -2195,6 +2266,13 @@
         return;
     }
 
+    // Bypass any existing cache maintenance requests if the request
+    // has been satisfied already (i.e., the dirty block has been
+    // found).
+    if (mshr && pkt->req->isCacheMaintenance() && pkt->satisfied()) {
+        return;
+    }
+
     // Let the MSHR itself track the snoop and decide whether we want
     // to go ahead and do the regular cache snoop
     if (mshr && mshr->handleSnoop(pkt, order++)) {
@@ -2259,7 +2337,7 @@
                                    false, false);
         }
 
-        if (invalidate) {
+        if (invalidate && wb_pkt->cmd != MemCmd::WriteClean) {
             // Invalidation trumps our writeback... discard here
             // Note: markInService will remove entry from writeback buffer.
             markInService(wb_entry);
@@ -2528,6 +2606,13 @@
     // as forwarded packets may already have existing state
     pkt->pushSenderState(mshr);
 
+    if (pkt->isClean() && blk && blk->isDirty()) {
+        // A cache clean operation is looking for a dirty block. Mark
+        // the packet so that the destination xbar can determine that
+        // there will be a follow-up write packet as well.
+        pkt->setSatisfied();
+    }
+
     if (!memSidePort->sendTimingReq(pkt)) {
         // we are awaiting a retry, but we
         // delete the packet and will be creating a new packet
@@ -2551,6 +2636,19 @@
         bool pending_modified_resp = !pkt->hasSharers() &&
             pkt->cacheResponding();
         markInService(mshr, pending_modified_resp);
+        if (pkt->isClean() && blk && blk->isDirty()) {
+            // A cache clean operation is looking for a dirty
+            // block. If a dirty block is encountered a WriteClean
+            // will update any copies to the path to the memory
+            // until the point of reference.
+            DPRINTF(CacheVerbose, "%s: packet %s found block: %s\n",
+                    __func__, pkt->print(), blk->print());
+            PacketPtr wb_pkt = writecleanBlk(blk, pkt->req->getDest());
+            PacketList writebacks;
+            writebacks.push_back(wb_pkt);
+            doWritebacks(writebacks, 0);
+        }
+
         return false;
     }
 }
diff --git a/src/mem/packet.hh b/src/mem/packet.hh
index fcdd73a..88829d3 100644
--- a/src/mem/packet.hh
+++ b/src/mem/packet.hh
@@ -42,6 +42,7 @@
  *          Steve Reinhardt
  *          Ali Saidi
  *          Andreas Hansson
+ *          Nikos Nikoleris
  */
 
 /**
@@ -259,7 +260,7 @@
 
     enum : FlagsType {
         // Flags to transfer across when copying a packet
-        COPY_FLAGS             = 0x0000001F,
+        COPY_FLAGS             = 0x0000003F,
 
         // Does this packet have sharers (which means it should not be
         // considered writable) or not. See setHasSharers below.
@@ -282,6 +283,10 @@
         // downstream by the receiver
         WRITE_THROUGH          = 0x00000010,
 
+        // Response co-ordination flag for cache maintenance
+        // operations
+        SATISFIED              = 0x00000020,
+
         /// Are the 'addr' and 'size' fields valid?
         VALID_ADDR             = 0x00000100,
         VALID_SIZE             = 0x00000200,
@@ -643,6 +648,19 @@
     void clearWriteThrough() { flags.clear(WRITE_THROUGH); }
     bool writeThrough() const { return flags.isSet(WRITE_THROUGH); }
 
+    /**
+     * Set when a request hits in a cache and the cache is not going
+     * to respond. This is used by the crossbar to coordinate
+     * responses for cache maintenance operations.
+     */
+    void setSatisfied()
+    {
+        assert(cmd.isClean());
+        assert(!flags.isSet(SATISFIED));
+        flags.set(SATISFIED);
+    }
+    bool satisfied() const { return flags.isSet(SATISFIED); }
+
     void setSuppressFuncError()     { flags.set(SUPPRESS_FUNC_ERROR); }
     bool suppressFuncError() const  { return flags.isSet(SUPPRESS_FUNC_ERROR); }
     void setBlockCached()          { flags.set(BLOCK_CACHED); }