dev-amdgpu: Fix issues with PM4 queue map, fences

The PM4 release_mem packet is used as a DMA fence in the driver. It
specifies which queue the interrupt came from by encoding the me, pipe,
and queue fields from the map_queue packet into the interrupt ring ID.
Currently these fields are incorrect because (1) the order in the
bitfield is backwards, (2) the queue constructor assigns a pointer to
the PM4MapQueue packet containing this data to the dmaBuffer which gets
deleted in short order, and (3) the order of the encoding of ring ID is
incorrect.

This change fixes these issues by (1) placing the struct vales in
correct order, (2) creating a const copy of the dmaBuffer on
construction, and (3) using the ring ID encoding expected by the driver:
https://github.com/RadeonOpenCompute/ROCK-Kernel-Driver/blob/roc-4.3.x/
     drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c#L5989

Change-Id: I72c382980e57573f8a8a6879912c4139c7e2f505
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/65095
Tested-by: kokoro <noreply+kokoro@google.com>
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
diff --git a/src/dev/amdgpu/pm4_defines.hh b/src/dev/amdgpu/pm4_defines.hh
index b690e54..42832d5 100644
--- a/src/dev/amdgpu/pm4_defines.hh
+++ b/src/dev/amdgpu/pm4_defines.hh
@@ -124,9 +124,9 @@
     uint32_t reserved2 : 2;
     uint32_t vmid : 4;
     uint32_t reserved3 : 1;
-    uint32_t me : 1;
-    uint32_t pipe : 2;
     uint32_t queueSlot : 3;
+    uint32_t pipe : 2;
+    uint32_t me : 1;
     uint32_t reserved6 : 2;
     uint32_t queueType : 3;
     uint32_t allocFormat : 2;
diff --git a/src/dev/amdgpu/pm4_packet_processor.cc b/src/dev/amdgpu/pm4_packet_processor.cc
index 48496f6..404beab 100644
--- a/src/dev/amdgpu/pm4_packet_processor.cc
+++ b/src/dev/amdgpu/pm4_packet_processor.cc
@@ -116,14 +116,14 @@
 PM4PacketProcessor::mapKiq(Addr offset)
 {
     DPRINTF(PM4PacketProcessor, "Mapping KIQ\n");
-    newQueue((QueueDesc *)&kiq, offset);
+    newQueue((QueueDesc *)&kiq, offset, &kiq_pkt);
 }
 
 void
 PM4PacketProcessor::mapPq(Addr offset)
 {
     DPRINTF(PM4PacketProcessor, "Mapping PQ\n");
-    newQueue((QueueDesc *)&pq, offset);
+    newQueue((QueueDesc *)&pq, offset, &pq_pkt);
 }
 
 void
@@ -146,8 +146,9 @@
                   : QueueType::Compute;
     gpuDevice->setDoorbellType(offset, qt);
 
-    DPRINTF(PM4PacketProcessor, "New PM4 queue %d, base: %p offset: %p\n",
-            id, q->base(), q->offset());
+    DPRINTF(PM4PacketProcessor, "New PM4 queue %d, base: %p offset: %p, me: "
+            "%d, pipe %d queue: %d\n", id, q->base(), q->offset(), q->me(),
+            q->pipe(), q->queue());
 }
 
 void
@@ -490,14 +491,16 @@
     DPRINTF(PM4PacketProcessor, "PM4 release_mem wrote %d to %p\n",
             pkt->dataLo, addr);
     if (pkt->intSelect == 2) {
-        DPRINTF(PM4PacketProcessor, "PM4 interrupt, ctx: %d, me: %d, pipe: "
-                "%d, queueSlot:%d\n", pkt->intCtxId, q->me(), q->pipe(),
-                q->queue());
-        // Rearranging the queue field of PM4MapQueues as the interrupt RingId
-        // format specified in PM4ReleaseMem pkt.
-        uint32_t ringId = (q->me() << 6) | (q->pipe() << 4) | q->queue();
+        DPRINTF(PM4PacketProcessor, "PM4 interrupt, id: %d ctx: %d, me: %d, "
+                "pipe: %d, queueSlot:%d\n", q->id(), pkt->intCtxId, q->me(),
+                q->pipe(), q->queue());
+
+        uint8_t ringId = 0;
+        if (q->id() != 0) {
+            ringId = (q->queue() << 4) | (q->me() << 2) | q->pipe();
+        }
         gpuDevice->getIH()->prepareInterruptCookie(pkt->intCtxId, ringId,
-            SOC15_IH_CLIENTID_GRBM_CP, CP_EOP);
+                                            SOC15_IH_CLIENTID_GRBM_CP, CP_EOP);
         gpuDevice->getIH()->submitInterruptCookie();
     }
 
diff --git a/src/dev/amdgpu/pm4_packet_processor.hh b/src/dev/amdgpu/pm4_packet_processor.hh
index c77edd2..4806671 100644
--- a/src/dev/amdgpu/pm4_packet_processor.hh
+++ b/src/dev/amdgpu/pm4_packet_processor.hh
@@ -54,8 +54,10 @@
     AMDGPUDevice *gpuDevice;
     /* First graphics queue */
     PrimaryQueue pq;
+    PM4MapQueues pq_pkt;
     /* First compute queue */
     QueueDesc kiq;
+    PM4MapQueues kiq_pkt;
 
     /* All PM4 queues, indexed by VMID */
     std::unordered_map<uint16_t, PM4Queue *> queues;
diff --git a/src/dev/amdgpu/pm4_queues.hh b/src/dev/amdgpu/pm4_queues.hh
index 4e8638b..19973b1 100644
--- a/src/dev/amdgpu/pm4_queues.hh
+++ b/src/dev/amdgpu/pm4_queues.hh
@@ -375,16 +375,16 @@
     Addr _offset;
     bool _processing;
     bool _ib;
-    PM4MapQueues *_pkt;
+    const PM4MapQueues _pkt;
   public:
     PM4Queue() : _id(0), q(nullptr), _wptr(0), _offset(0), _processing(false),
-        _ib(false), _pkt(nullptr) {}
+        _ib(false), _pkt() {}
     PM4Queue(int id, QueueDesc *queue, Addr offset) :
         _id(id), q(queue), _wptr(queue->rptr), _ibWptr(0), _offset(offset),
-        _processing(false), _ib(false), _pkt(nullptr) {}
+        _processing(false), _ib(false), _pkt() {}
     PM4Queue(int id, QueueDesc *queue, Addr offset, PM4MapQueues *pkt) :
         _id(id), q(queue), _wptr(queue->rptr), _ibWptr(0), _offset(offset),
-        _processing(false), _ib(false), _pkt(pkt) {}
+        _processing(false), _ib(false), _pkt(*pkt) {}
 
     QueueDesc *getMQD() { return q; }
     int id() { return _id; }
@@ -466,10 +466,10 @@
     void offset(Addr value) { _offset = value; }
     void processing(bool value) { _processing = value; }
     void ib(bool value) { _ib = value; }
-    uint32_t me() { if (_pkt) return _pkt->me; else return 0; }
-    uint32_t pipe() { if (_pkt) return _pkt->pipe; else return 0; }
-    uint32_t queue() { if (_pkt) return _pkt->queueSlot; else return 0; }
-    bool privileged() { assert(_pkt); return _pkt->queueSel == 0 ? 1 : 0; }
+    uint32_t me() { return _pkt.me + 1; }
+    uint32_t pipe() { return _pkt.pipe; }
+    uint32_t queue() { return _pkt.queueSlot; }
+    bool privileged() { return _pkt.queueSel == 0 ? 1 : 0; }
 };
 
 } // namespace gem5