Revert "mem-cache: Remove writebacks packet list"

This reverts commit bf0a722acdd8247602e83720a5f81a0b69c76250.

Reason for revert: This patch introduces a bug:

The problem here is that the insertion of block A may cause the
eviction of block B, which on the lower level may cause the
eviction of block A. Since A is not marked as present yet, A is
"safely" removed from the snoop filter.

However, reverting it breaks the combination of Atomic mode and a
Tags sub-class that can generate multiple evictions at once; this
shall be fixed in a future patch.

Change-Id: I5b27e54b54ae5b50255588835c1a2ebf3015f002
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/19088
Reviewed-by: Nikos Nikoleris <nikos.nikoleris@arm.com>
Maintainer: Nikos Nikoleris <nikos.nikoleris@arm.com>
Tested-by: kokoro <noreply+kokoro@google.com>
diff --git a/src/mem/cache/base.cc b/src/mem/cache/base.cc
index 1536ada..082650c 100644
--- a/src/mem/cache/base.cc
+++ b/src/mem/cache/base.cc
@@ -342,11 +342,20 @@
     // the delay provided by the crossbar
     Tick forward_time = clockEdge(forwardLatency) + pkt->headerDelay;
 
-    // Note that lat is passed by reference here. The function
-    // access() will set the lat value.
     Cycles lat;
     CacheBlk *blk = nullptr;
-    bool satisfied = access(pkt, blk, lat);
+    bool satisfied = false;
+    {
+        PacketList writebacks;
+        // Note that lat is passed by reference here. The function
+        // access() will set the lat value.
+        satisfied = access(pkt, blk, lat, writebacks);
+
+        // After the evicted blocks are selected, they must be forwarded
+        // to the write buffer to ensure they logically precede anything
+        // happening below
+        doWritebacks(writebacks, clockEdge(lat + forwardLatency));
+    }
 
     // Here we charge the headerDelay that takes into account the latencies
     // of the bus, if the packet comes from it.
@@ -448,6 +457,8 @@
             miss_latency;
     }
 
+    PacketList writebacks;
+
     bool is_fill = !mshr->isForward &&
         (pkt->isRead() || pkt->cmd == MemCmd::UpgradeResp ||
          mshr->wasWholeLineWrite);
@@ -464,7 +475,7 @@
 
         const bool allocate = (writeAllocator && mshr->wasWholeLineWrite) ?
             writeAllocator->allocate() : mshr->allocOnFill();
-        blk = handleFill(pkt, blk, allocate);
+        blk = handleFill(pkt, blk, writebacks, allocate);
         assert(blk != nullptr);
         ppFill->notify(pkt);
     }
@@ -520,9 +531,13 @@
 
     // if we used temp block, check to see if its valid and then clear it out
     if (blk == tempBlock && tempBlock->isValid()) {
-        evictBlock(blk, clockEdge(forwardLatency) + pkt->headerDelay);
+        evictBlock(blk, writebacks);
     }
 
+    const Tick forward_time = clockEdge(forwardLatency) + pkt->headerDelay;
+    // copy writebacks to write buffer
+    doWritebacks(writebacks, forward_time);
+
     DPRINTF(CacheVerbose, "%s: Leaving with %s\n", __func__, pkt->print());
     delete pkt;
 }
@@ -540,7 +555,8 @@
     Cycles lat = lookupLatency;
 
     CacheBlk *blk = nullptr;
-    bool satisfied = access(pkt, blk, lat);
+    PacketList writebacks;
+    bool satisfied = access(pkt, blk, lat, writebacks);
 
     if (pkt->isClean() && blk && blk->isDirty()) {
         // A cache clean opearation is looking for a dirty
@@ -550,12 +566,17 @@
         DPRINTF(CacheVerbose, "%s: packet %s found block: %s\n",
                 __func__, pkt->print(), blk->print());
         PacketPtr wb_pkt = writecleanBlk(blk, pkt->req->getDest(), pkt->id);
+        writebacks.push_back(wb_pkt);
         pkt->setSatisfied();
-        doWritebacksAtomic(wb_pkt);
     }
 
+    // handle writebacks resulting from the access here to ensure they
+    // logically precede anything happening below
+    doWritebacksAtomic(writebacks);
+    assert(writebacks.empty());
+
     if (!satisfied) {
-        lat += handleAtomicReqMiss(pkt, blk);
+        lat += handleAtomicReqMiss(pkt, blk, writebacks);
     }
 
     // Note that we don't invoke the prefetcher at all in atomic mode.
@@ -569,6 +590,9 @@
     // immediately rather than calling requestMemSideBus() as we do
     // there).
 
+    // do any writebacks resulting from the response handling
+    doWritebacksAtomic(writebacks);
+
     // if we used temp block, check to see if its valid and if so
     // clear it out, but only do so after the call to recvAtomic is
     // finished so that any downstream observers (such as a snoop
@@ -776,7 +800,7 @@
 
 bool
 BaseCache::updateCompressionData(CacheBlk *blk, const uint64_t* data,
-    uint32_t delay, Cycles tag_latency)
+                                 PacketList &writebacks)
 {
     // tempBlock does not exist in the tags, so don't do anything for it.
     if (blk == tempBlock) {
@@ -866,8 +890,7 @@
             if (evict_blk->wasPrefetched()) {
                 unusedPrefetches++;
             }
-            Cycles lat = calculateAccessLatency(evict_blk, delay, tag_latency);
-            evictBlock(evict_blk, clockEdge(lat + forwardLatency));
+            evictBlock(evict_blk, writebacks);
         }
     }
 
@@ -1001,7 +1024,8 @@
 }
 
 bool
-BaseCache::access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat)
+BaseCache::access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat,
+                  PacketList &writebacks)
 {
     // sanity check
     assert(pkt->isRequest());
@@ -1100,7 +1124,7 @@
 
         if (!blk) {
             // need to do a replacement
-            blk = allocateBlock(pkt, tag_latency);
+            blk = allocateBlock(pkt, writebacks);
             if (!blk) {
                 // no replaceable block available: give up, fwd to next level.
                 incMissCount(pkt);
@@ -1119,7 +1143,7 @@
             // a smaller size, and now it doesn't fit the entry anymore).
             // If that is the case we might need to evict blocks.
             if (!updateCompressionData(blk, pkt->getConstPtr<uint64_t>(),
-                pkt->headerDelay, tag_latency)) {
+                writebacks)) {
                 // This is a failed data expansion (write), which happened
                 // after finding the replacement entries and accessing the
                 // block's data. There were no replaceable entries available
@@ -1195,7 +1219,7 @@
                 return false;
             } else {
                 // a writeback that misses needs to allocate a new block
-                blk = allocateBlock(pkt, tag_latency);
+                blk = allocateBlock(pkt, writebacks);
                 if (!blk) {
                     // no replaceable block available: give up, fwd to
                     // next level.
@@ -1218,7 +1242,7 @@
             // a smaller size, and now it doesn't fit the entry anymore).
             // If that is the case we might need to evict blocks.
             if (!updateCompressionData(blk, pkt->getConstPtr<uint64_t>(),
-                pkt->headerDelay, tag_latency)) {
+                writebacks)) {
                 // This is a failed data expansion (write), which happened
                 // after finding the replacement entries and accessing the
                 // block's data. There were no replaceable entries available
@@ -1311,7 +1335,8 @@
 }
 
 CacheBlk*
-BaseCache::handleFill(PacketPtr pkt, CacheBlk *blk, bool allocate)
+BaseCache::handleFill(PacketPtr pkt, CacheBlk *blk, PacketList &writebacks,
+                      bool allocate)
 {
     assert(pkt->isResponse());
     Addr addr = pkt->getAddr();
@@ -1328,12 +1353,9 @@
         // better have read new data...
         assert(pkt->hasData() || pkt->cmd == MemCmd::InvalidateResp);
 
-        // Need to do a replacement if allocating, otherwise we stick
-        // with the temporary storage. The tag lookup has already been
-        // done to decide the eviction victims, so it is set to 0 here.
-        // The eviction itself, however, is delayed until the new data
-        // for the block that is requesting the replacement arrives.
-        blk = allocate ? allocateBlock(pkt, Cycles(0)) : nullptr;
+        // need to do a replacement if allocating, otherwise we stick
+        // with the temporary storage
+        blk = allocate ? allocateBlock(pkt, writebacks) : nullptr;
 
         if (!blk) {
             // No replaceable block or a mostly exclusive
@@ -1434,7 +1456,7 @@
 }
 
 CacheBlk*
-BaseCache::allocateBlock(const PacketPtr pkt, Cycles tag_latency)
+BaseCache::allocateBlock(const PacketPtr pkt, PacketList &writebacks)
 {
     // Get address
     const Addr addr = pkt->getAddr();
@@ -1507,9 +1529,7 @@
                     unusedPrefetches++;
                 }
 
-                Cycles lat =
-                    calculateAccessLatency(blk, pkt->headerDelay, tag_latency);
-                evictBlock(blk, clockEdge(lat + forwardLatency));
+                evictBlock(blk, writebacks);
             }
         }
 
@@ -1542,15 +1562,11 @@
 }
 
 void
-BaseCache::evictBlock(CacheBlk *blk, Tick forward_timing)
+BaseCache::evictBlock(CacheBlk *blk, PacketList &writebacks)
 {
     PacketPtr pkt = evictBlock(blk);
     if (pkt) {
-        if (system->isTimingMode()) {
-            doWritebacks(pkt, forward_timing);
-        } else {
-            doWritebacksAtomic(pkt);
-        }
+        writebacks.push_back(pkt);
     }
 }
 
@@ -1819,7 +1835,9 @@
                     __func__, pkt->print(), blk->print());
             PacketPtr wb_pkt = writecleanBlk(blk, pkt->req->getDest(),
                                              pkt->id);
-            doWritebacks(wb_pkt, 0);
+            PacketList writebacks;
+            writebacks.push_back(wb_pkt);
+            doWritebacks(writebacks, 0);
         }
 
         return false;
diff --git a/src/mem/cache/base.hh b/src/mem/cache/base.hh
index c4e3971..8794829 100644
--- a/src/mem/cache/base.hh
+++ b/src/mem/cache/base.hh
@@ -454,9 +454,11 @@
      * @param pkt The memory request to perform.
      * @param blk The cache block to be updated.
      * @param lat The latency of the access.
+     * @param writebacks List for any writebacks that need to be performed.
      * @return Boolean indicating whether the request was satisfied.
      */
-    virtual bool access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat);
+    virtual bool access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat,
+                        PacketList &writebacks);
 
     /*
      * Handle a timing request that hit in the cache
@@ -549,9 +551,11 @@
      *
      * @param pkt The packet with the requests
      * @param blk The referenced block
+     * @param writebacks A list with packets for any performed writebacks
      * @return Cycles for handling the request
      */
-    virtual Cycles handleAtomicReqMiss(PacketPtr pkt, CacheBlk *&blk) = 0;
+    virtual Cycles handleAtomicReqMiss(PacketPtr pkt, CacheBlk *&blk,
+                                       PacketList &writebacks) = 0;
 
     /**
      * Performs the access specified by the request.
@@ -591,18 +595,13 @@
 
     /**
      * Insert writebacks into the write buffer
-     *
-     * @param pkt The writeback packet.
-     * @param forward_time Tick to which the writeback should be scheduled.
      */
-    virtual void doWritebacks(PacketPtr pkt, Tick forward_time) = 0;
+    virtual void doWritebacks(PacketList& writebacks, Tick forward_time) = 0;
 
     /**
-     * Send writebacks down the memory hierarchy in atomic mode.
-     *
-     * @param pkt The writeback packet.
+     * Send writebacks down the memory hierarchy in atomic mode
      */
-    virtual void doWritebacksAtomic(PacketPtr pkt) = 0;
+    virtual void doWritebacksAtomic(PacketList& writebacks) = 0;
 
     /**
      * Create an appropriate downstream bus request packet.
@@ -648,7 +647,8 @@
      */
     void writebackTempBlockAtomic() {
         assert(tempBlockWriteback != nullptr);
-        doWritebacksAtomic(tempBlockWriteback);
+        PacketList writebacks{tempBlockWriteback};
+        doWritebacksAtomic(writebacks);
         tempBlockWriteback = nullptr;
     }
 
@@ -680,12 +680,11 @@
      *
      * @param blk The block to be overwriten.
      * @param data A pointer to the data to be compressed (blk's new data).
-     * @param delay The delay until the packet's metadata is present.
-     * @param tag_latency Latency to access the tags of the replacement victim.
+     * @param writebacks List for any writebacks that need to be performed.
      * @return Whether operation is successful or not.
      */
     bool updateCompressionData(CacheBlk *blk, const uint64_t* data,
-        uint32_t delay, Cycles tag_latency);
+                               PacketList &writebacks);
 
     /**
      * Perform any necessary updates to the block and perform any data
@@ -718,27 +717,34 @@
      * Populates a cache block and handles all outstanding requests for the
      * satisfied fill request. This version takes two memory requests. One
      * contains the fill data, the other is an optional target to satisfy.
+     * Note that the reason we return a list of writebacks rather than
+     * inserting them directly in the write buffer is that this function
+     * is called by both atomic and timing-mode accesses, and in atomic
+     * mode we don't mess with the write buffer (we just perform the
+     * writebacks atomically once the original request is complete).
      *
      * @param pkt The memory request with the fill data.
      * @param blk The cache block if it already exists.
+     * @param writebacks List for any writebacks that need to be performed.
      * @param allocate Whether to allocate a block or use the temp block
      * @return Pointer to the new cache block.
      */
-    CacheBlk *handleFill(PacketPtr pkt, CacheBlk *blk, bool allocate);
+    CacheBlk *handleFill(PacketPtr pkt, CacheBlk *blk,
+                         PacketList &writebacks, bool allocate);
 
     /**
-     * Allocate a new block for the packet's data. The victim block might be
-     * valid, and thus the necessary writebacks are done. May return nullptr
-     * if there are no replaceable blocks. If a replaceable block is found,
-     * it inserts the new block in its place. The new block, however, is not
-     * set as valid yet.
+     * Allocate a new block and perform any necessary writebacks
+     *
+     * Find a victim block and if necessary prepare writebacks for any
+     * existing data. May return nullptr if there are no replaceable
+     * blocks. If a replaceable block is found, it inserts the new block in
+     * its place. The new block, however, is not set as valid yet.
      *
      * @param pkt Packet holding the address to update
-     * @param tag_latency Latency to access the tags of the replacement victim.
+     * @param writebacks A list of writeback packets for the evicted blocks
      * @return the allocated block
      */
-    CacheBlk *allocateBlock(const PacketPtr pkt, Cycles tag_latency);
-
+    CacheBlk *allocateBlock(const PacketPtr pkt, PacketList &writebacks);
     /**
      * Evict a cache block.
      *
@@ -755,10 +761,9 @@
      * Performs a writeback if necesssary and invalidates the block
      *
      * @param blk Block to invalidate
-     * @param forward_time Tick to which the writeback should be scheduled if
-     *                     in timing mode.
+     * @param writebacks Return a list of packets with writebacks
      */
-    void evictBlock(CacheBlk *blk, Tick forward_time);
+    void evictBlock(CacheBlk *blk, PacketList &writebacks);
 
     /**
      * Invalidate a cache block.
diff --git a/src/mem/cache/cache.cc b/src/mem/cache/cache.cc
index b72ff42..bded746 100644
--- a/src/mem/cache/cache.cc
+++ b/src/mem/cache/cache.cc
@@ -161,7 +161,8 @@
 /////////////////////////////////////////////////////
 
 bool
-Cache::access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat)
+Cache::access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat,
+              PacketList &writebacks)
 {
 
     if (pkt->req->isUncacheable()) {
@@ -173,91 +174,98 @@
 
         DPRINTF(Cache, "%s for %s\n", __func__, pkt->print());
 
-        // lookupLatency is the latency in case the request is uncacheable.
-        lat = lookupLatency;
-
         // flush and invalidate any existing block
         CacheBlk *old_blk(tags->findBlock(pkt->getAddr(), pkt->isSecure()));
         if (old_blk && old_blk->isValid()) {
-            BaseCache::evictBlock(old_blk, clockEdge(lat + forwardLatency));
+            BaseCache::evictBlock(old_blk, writebacks);
         }
 
         blk = nullptr;
+        // lookupLatency is the latency in case the request is uncacheable.
+        lat = lookupLatency;
         return false;
     }
 
-    return BaseCache::access(pkt, blk, lat);
+    return BaseCache::access(pkt, blk, lat, writebacks);
 }
 
 void
-Cache::doWritebacks(PacketPtr pkt, Tick forward_time)
+Cache::doWritebacks(PacketList& writebacks, Tick forward_time)
 {
-    // We use forwardLatency here because we are copying writebacks to
-    // write buffer.
+    while (!writebacks.empty()) {
+        PacketPtr wbPkt = writebacks.front();
+        // We use forwardLatency here because we are copying writebacks to
+        // write buffer.
 
-    // Call isCachedAbove for Writebacks, CleanEvicts and
-    // WriteCleans to discover if the block is cached above.
-    if (isCachedAbove(pkt)) {
-        if (pkt->cmd == MemCmd::CleanEvict) {
-            // Delete CleanEvict because cached copies exist above. The
-            // packet destructor will delete the request object because
-            // this is a non-snoop request packet which does not require a
-            // response.
-            delete pkt;
-        } else if (pkt->cmd == MemCmd::WritebackClean) {
-            // clean writeback, do not send since the block is
-            // still cached above
-            assert(writebackClean);
-            delete pkt;
+        // Call isCachedAbove for Writebacks, CleanEvicts and
+        // WriteCleans to discover if the block is cached above.
+        if (isCachedAbove(wbPkt)) {
+            if (wbPkt->cmd == MemCmd::CleanEvict) {
+                // Delete CleanEvict because cached copies exist above. The
+                // packet destructor will delete the request object because
+                // this is a non-snoop request packet which does not require a
+                // response.
+                delete wbPkt;
+            } else if (wbPkt->cmd == MemCmd::WritebackClean) {
+                // clean writeback, do not send since the block is
+                // still cached above
+                assert(writebackClean);
+                delete wbPkt;
+            } else {
+                assert(wbPkt->cmd == MemCmd::WritebackDirty ||
+                       wbPkt->cmd == MemCmd::WriteClean);
+                // Set BLOCK_CACHED flag in Writeback and send below, so that
+                // the Writeback does not reset the bit corresponding to this
+                // address in the snoop filter below.
+                wbPkt->setBlockCached();
+                allocateWriteBuffer(wbPkt, forward_time);
+            }
         } else {
-            assert(pkt->cmd == MemCmd::WritebackDirty ||
-                   pkt->cmd == MemCmd::WriteClean);
-            // Set BLOCK_CACHED flag in Writeback and send below, so that
-            // the Writeback does not reset the bit corresponding to this
-            // address in the snoop filter below.
-            pkt->setBlockCached();
-            allocateWriteBuffer(pkt, forward_time);
+            // If the block is not cached above, send packet below. Both
+            // CleanEvict and Writeback with BLOCK_CACHED flag cleared will
+            // reset the bit corresponding to this address in the snoop filter
+            // below.
+            allocateWriteBuffer(wbPkt, forward_time);
         }
-    } else {
-        // If the block is not cached above, send packet below. Both
-        // CleanEvict and Writeback with BLOCK_CACHED flag cleared will
-        // reset the bit corresponding to this address in the snoop filter
-        // below.
-        allocateWriteBuffer(pkt, forward_time);
+        writebacks.pop_front();
     }
 }
 
 void
-Cache::doWritebacksAtomic(PacketPtr pkt)
+Cache::doWritebacksAtomic(PacketList& writebacks)
 {
-    // Call isCachedAbove for both Writebacks and CleanEvicts. If
-    // isCachedAbove returns true we set BLOCK_CACHED flag in Writebacks
-    // and discard CleanEvicts.
-    if (isCachedAbove(pkt, false)) {
-        if (pkt->cmd == MemCmd::WritebackDirty ||
-            pkt->cmd == MemCmd::WriteClean) {
-            // Set BLOCK_CACHED flag in Writeback and send below,
-            // so that the Writeback does not reset the bit
-            // corresponding to this address in the snoop filter
-            // below. We can discard CleanEvicts because cached
-            // copies exist above. Atomic mode isCachedAbove
-            // modifies packet to set BLOCK_CACHED flag
-            memSidePort.sendAtomic(pkt);
+    while (!writebacks.empty()) {
+        PacketPtr wbPkt = writebacks.front();
+        // Call isCachedAbove for both Writebacks and CleanEvicts. If
+        // isCachedAbove returns true we set BLOCK_CACHED flag in Writebacks
+        // and discard CleanEvicts.
+        if (isCachedAbove(wbPkt, false)) {
+            if (wbPkt->cmd == MemCmd::WritebackDirty ||
+                wbPkt->cmd == MemCmd::WriteClean) {
+                // Set BLOCK_CACHED flag in Writeback and send below,
+                // so that the Writeback does not reset the bit
+                // corresponding to this address in the snoop filter
+                // below. We can discard CleanEvicts because cached
+                // copies exist above. Atomic mode isCachedAbove
+                // modifies packet to set BLOCK_CACHED flag
+                memSidePort.sendAtomic(wbPkt);
+            }
+        } else {
+            // If the block is not cached above, send packet below. Both
+            // CleanEvict and Writeback with BLOCK_CACHED flag cleared will
+            // reset the bit corresponding to this address in the snoop filter
+            // below.
+            memSidePort.sendAtomic(wbPkt);
         }
-    } else {
-        // If the block is not cached above, send packet below. Both
-        // CleanEvict and Writeback with BLOCK_CACHED flag cleared will
-        // reset the bit corresponding to this address in the snoop filter
-        // below.
-        memSidePort.sendAtomic(pkt);
+        writebacks.pop_front();
+        // In case of CleanEvicts, the packet destructor will delete the
+        // request object because this is a non-snoop request packet which
+        // does not require a response.
+        delete wbPkt;
     }
-
-    // In case of CleanEvicts, the packet destructor will delete the
-    // request object because this is a non-snoop request packet which
-    // does not require a response.
-    delete pkt;
 }
 
+
 void
 Cache::recvTimingSnoopResp(PacketPtr pkt)
 {
@@ -555,7 +563,8 @@
 
 
 Cycles
-Cache::handleAtomicReqMiss(PacketPtr pkt, CacheBlk *&blk)
+Cache::handleAtomicReqMiss(PacketPtr pkt, CacheBlk *&blk,
+                           PacketList &writebacks)
 {
     // deal with the packets that go through the write path of
     // the cache, i.e. any evictions and writes
@@ -617,7 +626,7 @@
                 // the write to a whole line
                 const bool allocate = allocOnFill(pkt->cmd) &&
                     (!writeAllocator || writeAllocator->allocate());
-                blk = handleFill(bus_pkt, blk, allocate);
+                blk = handleFill(bus_pkt, blk, writebacks, allocate);
                 assert(blk != NULL);
                 is_invalidate = false;
                 satisfyRequest(pkt, blk);
@@ -625,7 +634,8 @@
                        bus_pkt->cmd == MemCmd::UpgradeResp) {
                 // we're updating cache state to allow us to
                 // satisfy the upstream request from the cache
-                blk = handleFill(bus_pkt, blk, allocOnFill(pkt->cmd));
+                blk = handleFill(bus_pkt, blk, writebacks,
+                                 allocOnFill(pkt->cmd));
                 satisfyRequest(pkt, blk);
                 maintainClusivity(pkt->fromCache(), blk);
             } else {
@@ -1011,15 +1021,17 @@
             DPRINTF(CacheVerbose, "%s: packet (snoop) %s found block: %s\n",
                     __func__, pkt->print(), blk->print());
             PacketPtr wb_pkt = writecleanBlk(blk, pkt->req->getDest(), pkt->id);
+            PacketList writebacks;
+            writebacks.push_back(wb_pkt);
 
             if (is_timing) {
                 // anything that is merely forwarded pays for the forward
                 // latency and the delay provided by the crossbar
                 Tick forward_time = clockEdge(forwardLatency) +
                     pkt->headerDelay;
-                doWritebacks(wb_pkt, forward_time);
+                doWritebacks(writebacks, forward_time);
             } else {
-                doWritebacksAtomic(wb_pkt);
+                doWritebacksAtomic(writebacks);
             }
             pkt->setSatisfied();
         }
diff --git a/src/mem/cache/cache.hh b/src/mem/cache/cache.hh
index d1b876e..33c5a24 100644
--- a/src/mem/cache/cache.hh
+++ b/src/mem/cache/cache.hh
@@ -87,7 +87,8 @@
      */
     void promoteWholeLineWrites(PacketPtr pkt);
 
-    bool access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat) override;
+    bool access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat,
+                PacketList &writebacks) override;
 
     void handleTimingReqHit(PacketPtr pkt, CacheBlk *blk,
                             Tick request_time) override;
@@ -98,9 +99,9 @@
 
     void recvTimingReq(PacketPtr pkt) override;
 
-    void doWritebacks(PacketPtr pkt, Tick forward_time) override;
+    void doWritebacks(PacketList& writebacks, Tick forward_time) override;
 
-    void doWritebacksAtomic(PacketPtr pkt) override;
+    void doWritebacksAtomic(PacketList& writebacks) override;
 
     void serviceMSHRTargets(MSHR *mshr, const PacketPtr pkt,
                             CacheBlk *blk) override;
@@ -109,7 +110,8 @@
 
     void recvTimingSnoopResp(PacketPtr pkt) override;
 
-    Cycles handleAtomicReqMiss(PacketPtr pkt, CacheBlk *&blk) override;
+    Cycles handleAtomicReqMiss(PacketPtr pkt, CacheBlk *&blk,
+                               PacketList &writebacks) override;
 
     Tick recvAtomic(PacketPtr pkt) override;
 
diff --git a/src/mem/cache/noncoherent_cache.cc b/src/mem/cache/noncoherent_cache.cc
index 5ad75ee..9a2a1db 100644
--- a/src/mem/cache/noncoherent_cache.cc
+++ b/src/mem/cache/noncoherent_cache.cc
@@ -80,9 +80,10 @@
 }
 
 bool
-NoncoherentCache::access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat)
+NoncoherentCache::access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat,
+                         PacketList &writebacks)
 {
-    bool success = BaseCache::access(pkt, blk, lat);
+    bool success = BaseCache::access(pkt, blk, lat, writebacks);
 
     if (pkt->isWriteback() || pkt->cmd == MemCmd::WriteClean) {
         assert(blk && blk->isValid());
@@ -97,16 +98,24 @@
 }
 
 void
-NoncoherentCache::doWritebacks(PacketPtr pkt, Tick forward_time)
+NoncoherentCache::doWritebacks(PacketList& writebacks, Tick forward_time)
 {
-    allocateWriteBuffer(pkt, forward_time);
+    while (!writebacks.empty()) {
+        PacketPtr wb_pkt = writebacks.front();
+        allocateWriteBuffer(wb_pkt, forward_time);
+        writebacks.pop_front();
+    }
 }
 
 void
-NoncoherentCache::doWritebacksAtomic(PacketPtr pkt)
+NoncoherentCache::doWritebacksAtomic(PacketList& writebacks)
 {
-    memSidePort.sendAtomic(pkt);
-    delete pkt;
+    while (!writebacks.empty()) {
+        PacketPtr wb_pkt = writebacks.front();
+        memSidePort.sendAtomic(wb_pkt);
+        writebacks.pop_front();
+        delete wb_pkt;
+    }
 }
 
 void
@@ -162,7 +171,8 @@
 
 
 Cycles
-NoncoherentCache::handleAtomicReqMiss(PacketPtr pkt, CacheBlk *&blk)
+NoncoherentCache::handleAtomicReqMiss(PacketPtr pkt, CacheBlk *&blk,
+                                      PacketList &writebacks)
 {
     PacketPtr bus_pkt = createMissPacket(pkt, blk, true,
                                          pkt->isWholeLineWrite(blkSize));
@@ -187,7 +197,7 @@
         // afterall it is a read response
         DPRINTF(Cache, "Block for addr %#llx being updated in Cache\n",
                 bus_pkt->getAddr());
-        blk = handleFill(bus_pkt, blk, allocOnFill(bus_pkt->cmd));
+        blk = handleFill(bus_pkt, blk, writebacks, allocOnFill(bus_pkt->cmd));
         assert(blk);
     }
     satisfyRequest(pkt, blk);
diff --git a/src/mem/cache/noncoherent_cache.hh b/src/mem/cache/noncoherent_cache.hh
index d909746..3da87d9 100644
--- a/src/mem/cache/noncoherent_cache.hh
+++ b/src/mem/cache/noncoherent_cache.hh
@@ -71,7 +71,8 @@
 class NoncoherentCache : public BaseCache
 {
   protected:
-    bool access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat) override;
+    bool access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat,
+                PacketList &writebacks) override;
 
     void handleTimingReqMiss(PacketPtr pkt, CacheBlk *blk,
                              Tick forward_time,
@@ -79,10 +80,10 @@
 
     void recvTimingReq(PacketPtr pkt) override;
 
-    void doWritebacks(PacketPtr pkt,
+    void doWritebacks(PacketList& writebacks,
                       Tick forward_time) override;
 
-    void doWritebacksAtomic(PacketPtr pkt) override;
+    void doWritebacksAtomic(PacketList& writebacks) override;
 
     void serviceMSHRTargets(MSHR *mshr, const PacketPtr pkt,
                             CacheBlk *blk) override;
@@ -97,7 +98,8 @@
         panic("Unexpected timing snoop response %s", pkt->print());
     }
 
-    Cycles handleAtomicReqMiss(PacketPtr pkt, CacheBlk *&blk) override;
+    Cycles handleAtomicReqMiss(PacketPtr pkt, CacheBlk *&blk,
+                               PacketList &writebacks) override;
 
     Tick recvAtomic(PacketPtr pkt) override;