dev-amdgpu: Chunkify SDMA copies that use device memory

The current SDMA copy implementation calls the GPU memory manager's
read/write method once, passing a physical address as the
source/destination. This implicitly assumes the physical addresses are
contiguous, which is generally not true for large allocations, and
results in reading from/writing to the wrong addresses.

This changeset fixes the problem by performing large copies in chunks
of the minimum possible page size on the GPU (4 KiB). Each page is
translated separately to ensure the correct physical address. The copy
"done" callback is attached only to the last transfer. Since the
transfers complete in order, the copy command will not complete until
all chunks have been copied. Tested and verified on an application with
a large (~5 GB) allocation.
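
For reference, a minimal standalone sketch of the chunking pattern used
in the diff below, assuming 4 KiB pages. Here issueRead is a
hypothetical stand-in for the memory manager's readRequest, and the
per-chunk address translation is elided:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <functional>

    // Hypothetical stand-in for the memory manager's readRequest().
    void issueRead(uint64_t paddr, size_t size, std::function<void()> *cb)
    {
        std::printf("read %zu bytes at %#lx%s\n", size,
                    (unsigned long)paddr, cb ? " (last chunk)" : "");
        if (cb) (*cb)();
    }

    int main()
    {
        const uint64_t pageSize = 0x1000; // AMDGPU_MMHUB_PAGE_SIZE (4 KiB)
        uint64_t vaddr = 0x10000800;      // example device virtual address
        size_t count = 0x2800;            // 10 KiB copy spanning pages

        std::function<void()> done = []{ std::puts("copy complete"); };

        // Walk the copy one page at a time, as ChunkGenerator does.
        while (count > 0) {
            // Bytes to the next page boundary, capped by what remains.
            size_t chunk =
                std::min<size_t>(count, pageSize - (vaddr % pageSize));
            bool last = (chunk == count);
            uint64_t paddr = vaddr; // real code translates each chunk
            issueRead(paddr, chunk, last ? &done : nullptr);
            vaddr += chunk;
            count -= chunk;
        }
    }

Attaching the callback only to the final chunk works because the
requests are serviced in issue order, so the last completion implies
all earlier chunks have finished.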

Change-Id: I27018a963da7133f5e49dec13b0475c3637c8765
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/64752
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
diff --git a/src/dev/amdgpu/sdma_engine.cc b/src/dev/amdgpu/sdma_engine.cc
index cafef6c..1cd6ff2 100644
--- a/src/dev/amdgpu/sdma_engine.cc
+++ b/src/dev/amdgpu/sdma_engine.cc
@@ -589,8 +589,22 @@
         DPRINTF(SDMAEngine, "Copying from device address %#lx\n", device_addr);
         auto cb = new EventFunctionWrapper(
             [ = ]{ copyReadData(q, pkt, dmaBuffer); }, name());
-        gpuDevice->getMemMgr()->readRequest(device_addr, dmaBuffer, pkt->count,
-                                            0, cb);
+
+        // Copy the minimum page size at a time in case the physical addresses
+        // are not contiguous.
+        ChunkGenerator gen(pkt->source, pkt->count, AMDGPU_MMHUB_PAGE_SIZE);
+        for (; !gen.done(); gen.next()) {
+            Addr chunk_addr = getDeviceAddress(gen.addr());
+            assert(chunk_addr);
+
+            DPRINTF(SDMAEngine, "Copying chunk of %d bytes from %#lx (%#lx)\n",
+                    gen.size(), gen.addr(), chunk_addr);
+
+            gpuDevice->getMemMgr()->readRequest(chunk_addr, dmaBuffer,
+                                                gen.size(), 0,
+                                                gen.last() ? cb : nullptr);
+            dmaBuffer += gen.size();
+        }
     } else {
         auto cb = new DmaVirtCallback<uint64_t>(
             [ = ] (const uint64_t &) { copyReadData(q, pkt, dmaBuffer); });
@@ -620,8 +634,23 @@
         DPRINTF(SDMAEngine, "Copying to device address %#lx\n", device_addr);
         auto cb = new EventFunctionWrapper(
             [ = ]{ copyDone(q, pkt, dmaBuffer); }, name());
-        gpuDevice->getMemMgr()->writeRequest(device_addr, dmaBuffer,
-                                             pkt->count, 0, cb);
+
+        // Copy the minimum page size at a time in case the physical addresses
+        // are not contiguous.
+        ChunkGenerator gen(pkt->dest, pkt->count, AMDGPU_MMHUB_PAGE_SIZE);
+        for (; !gen.done(); gen.next()) {
+            Addr chunk_addr = getDeviceAddress(gen.addr());
+            assert(chunk_addr);
+
+            DPRINTF(SDMAEngine, "Copying chunk of %d bytes to %#lx (%#lx)\n",
+                    gen.size(), gen.addr(), chunk_addr);
+
+            gpuDevice->getMemMgr()->writeRequest(chunk_addr, dmaBuffer,
+                                                 gen.size(), 0,
+                                                 gen.last() ? cb : nullptr);
+
+            dmaBuffer += gen.size();
+        }
     } else {
         auto cb = new DmaVirtCallback<uint64_t>(
             [ = ] (const uint64_t &) { copyDone(q, pkt, dmaBuffer); });