mem: Co-ordination of CMOs in the xbar

A clean packet request serving a cache maintenance operation (CMO)
visits all memories down to the specified xbar. The visited caches
invalidate their copy (if the CMO is invalidating) and if a dirty copy
is found a write packet writes the dirty data to the memory level
below the specified xbar. A response is send back when all the caches
are clean and/or invalidated and the specified xbar has seen the write
packet.

This patch adds the following functionality in the xbar:
1) Accounts for the cache clean requests that go through the xbar
2) Generates the cache clean response when both the cache clean
request and the corresponding writeclean packet has crossed the
destination xbar.

Previously transactions in the xbar were identified using the pointer
of the original request. Cache clean transactions comprise of two
different packets, the clean request and the writeclean, and therefore
have different request pointers. This patch adds support for custom
transaction IDs that by default take the value of the request pointer
but can be overriden by the contructor. This allows the clean request
and writeclean share the same id which the coherent xbar uses to
co-ordinate them and send the response in a timely manner.

Change-Id: I80db76386a1caded38dc66e6e18f930c3bb800ff
Reviewed-by: Stephan Diestelhorst <stephan.diestelhorst@arm.com>
Reviewed-on: https://gem5-review.googlesource.com/5051
Maintainer: Nikos Nikoleris <nikos.nikoleris@arm.com>
Reviewed-by: Jason Lowe-Power <jason@lowepower.com>
diff --git a/src/mem/cache/cache.cc b/src/mem/cache/cache.cc
index b0e904d..a83f8ab 100644
--- a/src/mem/cache/cache.cc
+++ b/src/mem/cache/cache.cc
@@ -1092,7 +1092,7 @@
         // until the point of reference.
         DPRINTF(CacheVerbose, "%s: packet %s found block: %s\n",
                 __func__, pkt->print(), blk->print());
-        PacketPtr wb_pkt = writecleanBlk(blk, pkt->req->getDest());
+        PacketPtr wb_pkt = writecleanBlk(blk, pkt->req->getDest(), pkt->id);
         writebacks.push_back(wb_pkt);
         pkt->setSatisfied();
     }
@@ -1679,7 +1679,7 @@
 }
 
 PacketPtr
-Cache::writecleanBlk(CacheBlk *blk, Request::Flags dest)
+Cache::writecleanBlk(CacheBlk *blk, Request::Flags dest, PacketId id)
 {
     Request *req = new Request(tags->regenerateBlkAddr(blk->tag, blk->set),
                                blkSize, 0, Request::wbMasterId);
@@ -1688,7 +1688,7 @@
     }
     req->taskId(blk->task_id);
     blk->task_id = ContextSwitchTaskId::Unknown;
-    PacketPtr pkt = new Packet(req, MemCmd::WriteClean);
+    PacketPtr pkt = new Packet(req, MemCmd::WriteClean, blkSize, id);
     DPRINTF(Cache, "Create %s writable: %d, dirty: %d\n", pkt->print(),
             blk->isWritable(), blk->isDirty());
     // make sure the block is not marked dirty
@@ -2093,7 +2093,7 @@
         if (blk_valid && blk->isDirty()) {
             DPRINTF(CacheVerbose, "%s: packet (snoop) %s found block: %s\n",
                     __func__, pkt->print(), blk->print());
-            PacketPtr wb_pkt = writecleanBlk(blk, pkt->req->getDest());
+            PacketPtr wb_pkt = writecleanBlk(blk, pkt->req->getDest(), pkt->id);
             PacketList writebacks;
             writebacks.push_back(wb_pkt);
 
@@ -2643,7 +2643,8 @@
             // until the point of reference.
             DPRINTF(CacheVerbose, "%s: packet %s found block: %s\n",
                     __func__, pkt->print(), blk->print());
-            PacketPtr wb_pkt = writecleanBlk(blk, pkt->req->getDest());
+            PacketPtr wb_pkt = writecleanBlk(blk, pkt->req->getDest(),
+                                             pkt->id);
             PacketList writebacks;
             writebacks.push_back(wb_pkt);
             doWritebacks(writebacks, 0);
diff --git a/src/mem/cache/cache.hh b/src/mem/cache/cache.hh
index cd3a1d8..4d840be 100644
--- a/src/mem/cache/cache.hh
+++ b/src/mem/cache/cache.hh
@@ -458,7 +458,7 @@
      * @param dest The destination of this clean operation
      * @return The write clean packet for the block.
      */
-    PacketPtr writecleanBlk(CacheBlk *blk, Request::Flags dest = 0);
+    PacketPtr writecleanBlk(CacheBlk *blk, Request::Flags dest, PacketId id);
 
     /**
      * Create a CleanEvict request for the given block.
diff --git a/src/mem/cache/mshr.cc b/src/mem/cache/mshr.cc
index d89adef..f1a9b98 100644
--- a/src/mem/cache/mshr.cc
+++ b/src/mem/cache/mshr.cc
@@ -426,7 +426,7 @@
         // the packet and the request as part of handling the deferred
         // snoop.
         PacketPtr cp_pkt = will_respond ? new Packet(pkt, true, true) :
-            new Packet(new Request(*pkt->req), pkt->cmd, blkSize);
+            new Packet(new Request(*pkt->req), pkt->cmd, blkSize, pkt->id);
 
         if (will_respond) {
             // we are the ordering point, and will consequently
diff --git a/src/mem/coherent_xbar.cc b/src/mem/coherent_xbar.cc
index e946134..02b3122 100644
--- a/src/mem/coherent_xbar.cc
+++ b/src/mem/coherent_xbar.cc
@@ -40,6 +40,7 @@
  * Authors: Ali Saidi
  *          Andreas Hansson
  *          William Wang
+ *          Nikos Nikoleris
  */
 
 /**
@@ -194,6 +195,22 @@
     if (snoop_caches) {
         assert(pkt->snoopDelay == 0);
 
+        if (pkt->isClean() && !is_destination) {
+            // before snooping we need to make sure that the memory
+            // below is not busy and the cache clean request can be
+            // forwarded to it
+            if (!masterPorts[master_port_id]->tryTiming(pkt)) {
+                DPRINTF(CoherentXBar, "%s: src %s packet %s RETRY\n", __func__,
+                        src_port->name(), pkt->print());
+
+                // update the layer state and schedule an idle event
+                reqLayers[master_port_id]->failedTiming(src_port,
+                                                        clockEdge(Cycles(1)));
+                return false;
+            }
+        }
+
+
         // the packet is a memory-mapped request and should be
         // broadcasted to our snoopers but the source
         if (snoopFilter) {
@@ -342,21 +359,76 @@
         // queue the packet for deletion
         pendingDelete.reset(pkt);
 
+    // normally we respond to the packet we just received if we need to
+    PacketPtr rsp_pkt = pkt;
+    PortID rsp_port_id = slave_port_id;
+
+    // If this is the destination of the cache clean operation the
+    // crossbar is responsible for responding. This crossbar will
+    // respond when the cache clean is complete. A cache clean
+    // is complete either:
+    // * direcly, if no cache above had a dirty copy of the block
+    //   as indicated by the satisfied flag of the packet, or
+    // * when the crossbar has seen both the cache clean request
+    //   (CleanSharedReq, CleanInvalidReq) and the corresponding
+    //   write (WriteClean) which updates the block in the memory
+    //   below.
+    if (success &&
+        ((pkt->isClean() && pkt->satisfied()) ||
+         pkt->cmd == MemCmd::WriteClean) &&
+        is_destination) {
+        PacketPtr deferred_rsp = pkt->isWrite() ? nullptr : pkt;
+        auto cmo_lookup = outstandingCMO.find(pkt->id);
+        if (cmo_lookup != outstandingCMO.end()) {
+            // the cache clean request has already reached this xbar
+            respond_directly = true;
+            if (pkt->isWrite()) {
+                rsp_pkt = cmo_lookup->second;
+                assert(rsp_pkt);
+
+                // determine the destination
+                const auto route_lookup = routeTo.find(rsp_pkt->req);
+                assert(route_lookup != routeTo.end());
+                rsp_port_id = route_lookup->second;
+                assert(rsp_port_id != InvalidPortID);
+                assert(rsp_port_id < respLayers.size());
+                // remove the request from the routing table
+                routeTo.erase(route_lookup);
+            }
+            outstandingCMO.erase(cmo_lookup);
+        } else {
+            respond_directly = false;
+            outstandingCMO.emplace(pkt->id, deferred_rsp);
+            if (!pkt->isWrite()) {
+                assert(routeTo.find(pkt->req) == routeTo.end());
+                routeTo[pkt->req] = slave_port_id;
+
+                panic_if(routeTo.size() > 512,
+                         "Routing table exceeds 512 packets\n");
+            }
+        }
+    }
+
+
     if (respond_directly) {
-        assert(pkt->needsResponse());
+        assert(rsp_pkt->needsResponse());
         assert(success);
 
-        pkt->makeResponse();
+        rsp_pkt->makeResponse();
 
         if (snoopFilter && !system->bypassCaches()) {
             // let the snoop filter inspect the response and update its state
-            snoopFilter->updateResponse(pkt, *slavePorts[slave_port_id]);
+            snoopFilter->updateResponse(rsp_pkt, *slavePorts[rsp_port_id]);
         }
 
+        // we send the response after the current packet, even if the
+        // response is not for this packet (e.g. cache clean operation
+        // where both the request and the write packet have to cross
+        // the destination xbar before the response is sent.)
         Tick response_time = clockEdge() + pkt->headerDelay;
-        pkt->headerDelay = 0;
+        rsp_pkt->headerDelay = 0;
 
-        slavePorts[slave_port_id]->schedTimingResp(pkt, response_time);
+        slavePorts[rsp_port_id]->schedTimingResp(rsp_pkt, response_time);
     }
 
     return success;
@@ -754,6 +826,30 @@
         response_latency = snoop_response_latency;
     }
 
+    // If this is the destination of the cache clean operation the
+    // crossbar is responsible for responding. This crossbar will
+    // respond when the cache clean is complete. An atomic cache clean
+    // is complete when the crossbars receives the cache clean
+    // request (CleanSharedReq, CleanInvalidReq), as either:
+    // * no cache above had a dirty copy of the block as indicated by
+    //   the satisfied flag of the packet, or
+    // * the crossbar has already seen the corresponding write
+    //   (WriteClean) which updates the block in the memory below.
+    if (pkt->isClean() && isDestination(pkt) && pkt->satisfied()) {
+        auto it = outstandingCMO.find(pkt->id);
+        assert(it != outstandingCMO.end());
+        // we are responding right away
+        outstandingCMO.erase(it);
+    } else if (pkt->cmd == MemCmd::WriteClean && isDestination(pkt)) {
+        // if this is the destination of the operation, the xbar
+        // sends the responce to the cache clean operation only
+        // after having encountered the cache clean request
+        auto M5_VAR_USED ret = outstandingCMO.emplace(pkt->id, nullptr);
+        // in atomic mode we know that the WriteClean packet should
+        // precede the clean request
+        assert(ret.second);
+    }
+
     // add the response data
     if (pkt->isResponse()) {
         pkt_size = pkt->hasData() ? pkt->getSize() : 0;
@@ -988,8 +1084,13 @@
 CoherentXBar::forwardPacket(const PacketPtr pkt)
 {
     // we are forwarding the packet if:
-    // 1) this is a read or a write
-    // 2) this crossbar is above the point of coherency
+    // 1) this is a cache clean request to the PoU/PoC and this
+    //    crossbar is above the PoU/PoC
+    // 2) this is a read or a write
+    // 3) this crossbar is above the point of coherency
+    if (pkt->isClean()) {
+        return !isDestination(pkt);
+    }
     return pkt->isRead() || pkt->isWrite() || !pointOfCoherency;
 }
 
diff --git a/src/mem/coherent_xbar.hh b/src/mem/coherent_xbar.hh
index 0c2907f..79777b9 100644
--- a/src/mem/coherent_xbar.hh
+++ b/src/mem/coherent_xbar.hh
@@ -51,6 +51,7 @@
 #ifndef __MEM_COHERENT_XBAR_HH__
 #define __MEM_COHERENT_XBAR_HH__
 
+#include <unordered_map>
 #include <unordered_set>
 
 #include "mem/snoop_filter.hh"
@@ -263,6 +264,13 @@
     std::unordered_set<RequestPtr> outstandingSnoop;
 
     /**
+     * Store the outstanding cache maintenance that we are expecting
+     * snoop responses from so we can determine when we received all
+     * snoop responses and if any of the agents satisfied the request.
+     */
+    std::unordered_map<PacketId, PacketPtr> outstandingCMO;
+
+    /**
      * Keep a pointer to the system to be allow to querying memory system
      * properties.
      */
diff --git a/src/mem/packet.hh b/src/mem/packet.hh
index 88829d3..66625b3 100644
--- a/src/mem/packet.hh
+++ b/src/mem/packet.hh
@@ -70,6 +70,7 @@
 typedef Packet *PacketPtr;
 typedef uint8_t* PacketDataPtr;
 typedef std::list<PacketPtr> PacketList;
+typedef uint64_t PacketId;
 
 class MemCmd
 {
@@ -316,6 +317,8 @@
     /// The command field of the packet.
     MemCmd cmd;
 
+    const PacketId id;
+
     /// A pointer to the original request.
     const RequestPtr req;
 
@@ -743,9 +746,9 @@
      * not be valid. The command must be supplied.
      */
     Packet(const RequestPtr _req, MemCmd _cmd)
-        :  cmd(_cmd), req(_req), data(nullptr), addr(0), _isSecure(false),
-           size(0), headerDelay(0), snoopDelay(0), payloadDelay(0),
-           senderState(NULL)
+        :  cmd(_cmd), id((PacketId)_req), req(_req), data(nullptr), addr(0),
+           _isSecure(false), size(0), headerDelay(0), snoopDelay(0),
+           payloadDelay(0), senderState(NULL)
     {
         if (req->hasPaddr()) {
             addr = req->getPaddr();
@@ -763,10 +766,10 @@
      * a request that is for a whole block, not the address from the
      * req.  this allows for overriding the size/addr of the req.
      */
-    Packet(const RequestPtr _req, MemCmd _cmd, int _blkSize)
-        :  cmd(_cmd), req(_req), data(nullptr), addr(0), _isSecure(false),
-           headerDelay(0), snoopDelay(0), payloadDelay(0),
-           senderState(NULL)
+    Packet(const RequestPtr _req, MemCmd _cmd, int _blkSize, PacketId _id = 0)
+        :  cmd(_cmd), id(_id ? _id : (PacketId)_req), req(_req), data(nullptr),
+           addr(0), _isSecure(false), headerDelay(0), snoopDelay(0),
+           payloadDelay(0), senderState(NULL)
     {
         if (req->hasPaddr()) {
             addr = req->getPaddr() & ~(_blkSize - 1);
@@ -785,7 +788,7 @@
      * packet should allocate its own data.
      */
     Packet(const PacketPtr pkt, bool clear_flags, bool alloc_data)
-        :  cmd(pkt->cmd), req(pkt->req),
+        :  cmd(pkt->cmd), id(pkt->id), req(pkt->req),
            data(nullptr),
            addr(pkt->addr), _isSecure(pkt->_isSecure), size(pkt->size),
            bytesValid(pkt->bytesValid),