dev-amdgpu: Chunkify SDMA copies that use device memory

The current SDMA copy implementation calls the GPU memory manager's
read/write method once, passing a physical address as the
source/destination. This implicitly assumes the physical addresses are
contiguous, which is generally not true for large allocations, and
results in reading from/writing to the wrong addresses.

This changeset fixes the problem by performing large copies in chunks
of the minimum possible page size on the GPU (4 KiB). Each page is
translated separately to ensure the correct physical address. The copy
"done" callback is attached only to the last transfer. Since the
transfers complete in order, the copy command will not complete until
all chunks have been copied. Tested and verified on an application with
a large (~5 GB) allocation.
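
For reference, a minimal standalone sketch of the chunking pattern used
in the diff below, assuming 4 KiB pages. Here issueRead is a
hypothetical stand-in for the memory manager's readRequest, and the
per-chunk address translation is elided:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <functional>

    // Hypothetical stand-in for the memory manager's readRequest().
    void issueRead(uint64_t paddr, size_t size, std::function<void()> *cb)
    {
        std::printf("read %zu bytes at %#lx%s\n", size,
                    (unsigned long)paddr, cb ? " (last chunk)" : "");
        if (cb) (*cb)();
    }

    int main()
    {
        const uint64_t pageSize = 0x1000; // AMDGPU_MMHUB_PAGE_SIZE (4 KiB)
        uint64_t vaddr = 0x10000800;      // example device virtual address
        size_t count = 0x2800;            // 10 KiB copy spanning pages

        std::function<void()> done = []{ std::puts("copy complete"); };

        // Walk the copy one page at a time, as ChunkGenerator does.
        while (count > 0) {
            // Bytes to the next page boundary, capped by what remains.
            size_t chunk =
                std::min<size_t>(count, pageSize - (vaddr % pageSize));
            bool last = (chunk == count);
            uint64_t paddr = vaddr; // real code translates each chunk
            issueRead(paddr, chunk, last ? &done : nullptr);
            vaddr += chunk;
            count -= chunk;
        }
    }

Attaching the callback only to the final chunk works because the
requests are serviced in issue order, so the last completion implies
all earlier chunks have finished.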

Change-Id: I27018a963da7133f5e49dec13b0475c3637c8765
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/64752
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
diff --git a/src/dev/amdgpu/sdma_engine.cc b/src/dev/amdgpu/sdma_engine.cc
index cafef6c..1cd6ff2 100644
--- a/src/dev/amdgpu/sdma_engine.cc
+++ b/src/dev/amdgpu/sdma_engine.cc
@@ -589,8 +589,22 @@
         DPRINTF(SDMAEngine, "Copying from device address %#lx\n", device_addr);
         auto cb = new EventFunctionWrapper(
             [ = ]{ copyReadData(q, pkt, dmaBuffer); }, name());
-        gpuDevice->getMemMgr()->readRequest(device_addr, dmaBuffer, pkt->count,
-                                            0, cb);
+
+        // Copy the minimum page size at a time in case the physical addresses
+        // are not contiguous.
+        ChunkGenerator gen(pkt->source, pkt->count, AMDGPU_MMHUB_PAGE_SIZE);
+        for (; !gen.done(); gen.next()) {
+            Addr chunk_addr = getDeviceAddress(gen.addr());
+            assert(chunk_addr);
+
+            DPRINTF(SDMAEngine, "Copying chunk of %d bytes from %#lx (%#lx)\n",
+                    gen.size(), gen.addr(), chunk_addr);
+
+            gpuDevice->getMemMgr()->readRequest(chunk_addr, dmaBuffer,
+                                                gen.size(), 0,
+                                                gen.last() ? cb : nullptr);
+            dmaBuffer += gen.size();
+        }
     } else {
         auto cb = new DmaVirtCallback<uint64_t>(
             [ = ] (const uint64_t &) { copyReadData(q, pkt, dmaBuffer); });
@@ -620,8 +634,23 @@
         DPRINTF(SDMAEngine, "Copying to device address %#lx\n", device_addr);
         auto cb = new EventFunctionWrapper(
             [ = ]{ copyDone(q, pkt, dmaBuffer); }, name());
-        gpuDevice->getMemMgr()->writeRequest(device_addr, dmaBuffer,
-                                             pkt->count, 0, cb);
+
+        // Copy the minimum page size at a time in case the physical addresses
+        // are not contiguous.
+        ChunkGenerator gen(pkt->dest, pkt->count, AMDGPU_MMHUB_PAGE_SIZE);
+        for (; !gen.done(); gen.next()) {
+            Addr chunk_addr = getDeviceAddress(gen.addr());
+            assert(chunk_addr);
+
+            DPRINTF(SDMAEngine, "Copying chunk of %d bytes to %#lx (%#lx)\n",
+                    gen.size(), gen.addr(), chunk_addr);
+
+            gpuDevice->getMemMgr()->writeRequest(chunk_addr, dmaBuffer,
+                                                 gen.size(), 0,
+                                                 gen.last() ? cb : nullptr);
+
+            dmaBuffer += gen.size();
+        }
     } else {
         auto cb = new DmaVirtCallback<uint64_t>(
             [ = ] (const uint64_t &) { copyDone(q, pkt, dmaBuffer); });