dev-amdgpu: Write back RLC queue MQD when unmapped

Currently when RLC queues (user mode queues) are mapped, the read/write
pointers of the ring buffer are set to zero. However, these queues can
be unmapped and then remapped later. In that situation the read/write
pointers should keep the values they had before the unmapping occurred.
Since the read pointer gets reset to zero, the queue begins reading from
the start of the ring, which usually contains older packets. There is a
99% chance those packets contain addresses that are no longer in the
page tables, which will cause a page fault.

To fix this we update the MQD with the current read/write pointer values
and then write the MQD back to memory when the queue is unmapped. This
requires adding a pointer to the MQD and the host address to which the
MQD should be written back. The interface for registering an RLC queue
is also simplified: since we need to pass the MQD anyway, we can get the
other values from it as well.

Fixes b+tree and streamcluster from rodinia (when using RLC queues).

Change-Id: Ie5dad4d7d90ea240c3e9f0cddf3e844a3cd34c4f
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/65791
Tested-by: kokoro <noreply+kokoro@google.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
diff --git a/src/dev/amdgpu/pm4_packet_processor.cc b/src/dev/amdgpu/pm4_packet_processor.cc
index f78f833..152fd4d 100644
--- a/src/dev/amdgpu/pm4_packet_processor.cc
+++ b/src/dev/amdgpu/pm4_packet_processor.cc
@@ -458,9 +458,7 @@
     SDMAEngine *sdma_eng = gpuDevice->getSDMAById(pkt->engineSel - 2);
 
     // Register RLC queue with SDMA
-    sdma_eng->registerRLCQueue(pkt->doorbellOffset << 2,
-                               mqd->rb_base << 8, rlc_size,
-                               rptr_wb_addr);
+    sdma_eng->registerRLCQueue(pkt->doorbellOffset << 2, addr, mqd);
 
     // Register doorbell with GPU device
     gpuDevice->setSDMAEngine(pkt->doorbellOffset << 2, sdma_eng);
diff --git a/src/dev/amdgpu/pm4_queues.hh b/src/dev/amdgpu/pm4_queues.hh
index 8b6626d..ddadd65 100644
--- a/src/dev/amdgpu/pm4_queues.hh
+++ b/src/dev/amdgpu/pm4_queues.hh
@@ -33,6 +33,8 @@
 #ifndef __DEV_AMDGPU_PM4_QUEUES_HH__
 #define __DEV_AMDGPU_PM4_QUEUES_HH__
 
+#include "dev/amdgpu/pm4_defines.hh"
+
 namespace gem5
 {
 
@@ -201,10 +203,24 @@
         };
         uint64_t rb_base;
     };
-    uint32_t sdmax_rlcx_rb_rptr;
-    uint32_t sdmax_rlcx_rb_rptr_hi;
-    uint32_t sdmax_rlcx_rb_wptr;
-    uint32_t sdmax_rlcx_rb_wptr_hi;
+    union
+    {
+        struct
+        {
+            uint32_t sdmax_rlcx_rb_rptr;
+            uint32_t sdmax_rlcx_rb_rptr_hi;
+        };
+        uint64_t rptr;
+    };
+    union
+    {
+        struct
+        {
+            uint32_t sdmax_rlcx_rb_wptr;
+            uint32_t sdmax_rlcx_rb_wptr_hi;
+        };
+        uint64_t wptr;
+    };
     uint32_t sdmax_rlcx_rb_wptr_poll_cntl;
     uint32_t sdmax_rlcx_rb_rptr_addr_hi;
     uint32_t sdmax_rlcx_rb_rptr_addr_lo;
diff --git a/src/dev/amdgpu/sdma_engine.cc b/src/dev/amdgpu/sdma_engine.cc
index 02203c8..4c03bf5 100644
--- a/src/dev/amdgpu/sdma_engine.cc
+++ b/src/dev/amdgpu/sdma_engine.cc
@@ -165,30 +165,40 @@
 }
 
 void
-SDMAEngine::registerRLCQueue(Addr doorbell, Addr rb_base, uint32_t size,
-                             Addr rptr_wb_addr)
+SDMAEngine::registerRLCQueue(Addr doorbell, Addr mqdAddr, SDMAQueueDesc *mqd)
 {
+    uint32_t rlc_size = 4UL << bits(mqd->sdmax_rlcx_rb_cntl, 6, 1);
+    Addr rptr_wb_addr = mqd->sdmax_rlcx_rb_rptr_addr_hi;
+    rptr_wb_addr <<= 32;
+    rptr_wb_addr |= mqd->sdmax_rlcx_rb_rptr_addr_lo;
+
     // Get first free RLC
     if (!rlc0.valid()) {
         DPRINTF(SDMAEngine, "Doorbell %lx mapped to RLC0\n", doorbell);
         rlcInfo[0] = doorbell;
         rlc0.valid(true);
-        rlc0.base(rb_base);
+        rlc0.base(mqd->rb_base << 8);
+        rlc0.size(rlc_size);
         rlc0.rptr(0);
-        rlc0.wptr(0);
+        rlc0.incRptr(mqd->rptr);
+        rlc0.setWptr(mqd->wptr);
         rlc0.rptrWbAddr(rptr_wb_addr);
         rlc0.processing(false);
-        rlc0.size(size);
+        rlc0.setMQD(mqd);
+        rlc0.setMQDAddr(mqdAddr);
     } else if (!rlc1.valid()) {
         DPRINTF(SDMAEngine, "Doorbell %lx mapped to RLC1\n", doorbell);
         rlcInfo[1] = doorbell;
         rlc1.valid(true);
-        rlc1.base(rb_base);
+        rlc1.base(mqd->rb_base << 8);
+        rlc1.size(rlc_size);
         rlc1.rptr(0);
-        rlc1.wptr(0);
+        rlc1.incRptr(mqd->rptr);
+        rlc1.setWptr(mqd->wptr);
         rlc1.rptrWbAddr(rptr_wb_addr);
         rlc1.processing(false);
-        rlc1.size(size);
+        rlc1.setMQD(mqd);
+        rlc1.setMQDAddr(mqdAddr);
     } else {
         panic("No free RLCs. Check they are properly unmapped.");
     }
@@ -199,9 +209,37 @@
 {
     DPRINTF(SDMAEngine, "Unregistering RLC queue at %#lx\n", doorbell);
     if (rlcInfo[0] == doorbell) {
+        SDMAQueueDesc *mqd = rlc0.getMQD();
+        if (mqd) {
+            DPRINTF(SDMAEngine, "Writing RLC0 SDMAMQD back to %#lx\n",
+                    rlc0.getMQDAddr());
+
+            mqd->rptr = rlc0.globalRptr();
+            mqd->wptr = rlc0.getWptr();
+
+            auto cb = new DmaVirtCallback<uint32_t>(
+                [ = ] (const uint32_t &) { });
+            dmaWriteVirt(rlc0.getMQDAddr(), sizeof(SDMAQueueDesc), cb, mqd);
+        } else {
+            warn("RLC0 SDMAMQD address invalid\n");
+        }
         rlc0.valid(false);
         rlcInfo[0] = 0;
     } else if (rlcInfo[1] == doorbell) {
+        SDMAQueueDesc *mqd = rlc1.getMQD();
+        if (mqd) {
+            DPRINTF(SDMAEngine, "Writing RLC1 SDMAMQD back to %#lx\n",
+                    rlc1.getMQDAddr());
+
+            mqd->rptr = rlc1.globalRptr();
+            mqd->wptr = rlc1.getWptr();
+
+            auto cb = new DmaVirtCallback<uint32_t>(
+                [ = ] (const uint32_t &) { });
+            dmaWriteVirt(rlc1.getMQDAddr(), sizeof(SDMAQueueDesc), cb, mqd);
+        } else {
+            warn("RLC1 SDMAMQD address invalid\n");
+        }
         rlc1.valid(false);
         rlcInfo[1] = 0;
     } else {
@@ -213,7 +251,9 @@
 SDMAEngine::deallocateRLCQueues()
 {
     for (auto doorbell: rlcInfo) {
-        unregisterRLCQueue(doorbell);
+        if (doorbell) {
+            unregisterRLCQueue(doorbell);
+        }
     }
 }
 
diff --git a/src/dev/amdgpu/sdma_engine.hh b/src/dev/amdgpu/sdma_engine.hh
index 0bfee12..27c1691 100644
--- a/src/dev/amdgpu/sdma_engine.hh
+++ b/src/dev/amdgpu/sdma_engine.hh
@@ -34,6 +34,7 @@
 
 #include "base/bitunion.hh"
 #include "dev/amdgpu/amdgpu_device.hh"
+#include "dev/amdgpu/pm4_queues.hh"
 #include "dev/amdgpu/sdma_packets.hh"
 #include "dev/dma_virt_device.hh"
 #include "params/SDMAEngine.hh"
@@ -65,9 +66,11 @@
         SDMAQueue *_parent;
         SDMAQueue *_ib;
         SDMAType _type;
+        SDMAQueueDesc *_mqd;
+        Addr _mqd_addr = 0;
       public:
         SDMAQueue() : _rptr(0), _wptr(0), _valid(false), _processing(false),
-            _parent(nullptr), _ib(nullptr), _type(SDMAGfx) {}
+            _parent(nullptr), _ib(nullptr), _type(SDMAGfx), _mqd(nullptr) {}
 
         Addr base() { return _base; }
         Addr rptr() { return _base + _rptr; }
@@ -82,6 +85,8 @@
         SDMAQueue* parent() { return _parent; }
         SDMAQueue* ib() { return _ib; }
         SDMAType queueType() { return _type; }
+        SDMAQueueDesc* getMQD() { return _mqd; }
+        Addr getMQDAddr() { return _mqd_addr; }
 
         void base(Addr value) { _base = value; }
 
@@ -114,6 +119,8 @@
         void parent(SDMAQueue* q) { _parent = q; }
         void ib(SDMAQueue* ib) { _ib = ib; }
         void queueType(SDMAType type) { _type = type; }
+        void setMQD(SDMAQueueDesc *mqd) { _mqd = mqd; }
+        void setMQDAddr(Addr mqdAddr) { _mqd_addr = mqdAddr; }
     };
 
     /* SDMA Engine ID */
@@ -280,8 +287,7 @@
     /**
      * Methods for RLC queues
      */
-    void registerRLCQueue(Addr doorbell, Addr rb_base, uint32_t size,
-                          Addr rptr_wb_addr);
+    void registerRLCQueue(Addr doorbell, Addr mqdAddr, SDMAQueueDesc *mqd);
     void unregisterRLCQueue(Addr doorbell);
     void deallocateRLCQueues();