blob: e9a4c176d8ebc2737fd73e0c0a0166d482e048c6 [file] [log] [blame]
/*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "dev/amdgpu/sdma_engine.hh"
#include "arch/amdgpu/vega/pagetable_walker.hh"
#include "arch/generic/mmu.hh"
#include "debug/SDMAData.hh"
#include "debug/SDMAEngine.hh"
#include "dev/amdgpu/interrupt_handler.hh"
#include "dev/amdgpu/sdma_commands.hh"
#include "dev/amdgpu/sdma_mmio.hh"
#include "mem/packet.hh"
#include "mem/packet_access.hh"
#include "params/SDMAEngine.hh"
namespace gem5
{
/**
 * Build an SDMA engine with every ring-buffer register cleared and wire
 * each queue to its indirect-buffer (IB) queue.
 *
 * @param p simulation parameters; p.walker supplies the page table walker
 */
SDMAEngine::SDMAEngine(const SDMAEngineParams &p)
    : DmaVirtDevice(p), id(0), gfxBase(0), gfxRptr(0),
      gfxDoorbell(0), gfxDoorbellOffset(0), gfxWptr(0), pageBase(0),
      pageRptr(0), pageDoorbell(0), pageDoorbellOffset(0),
      pageWptr(0), gpuDevice(nullptr), walker(p.walker)
{
    // Gfx queue and its IB: linked both ways and usable immediately.
    gfx.ib(&gfxIb);
    gfxIb.parent(&gfx);
    gfx.valid(true);
    gfxIb.valid(true);

    // Page queue and its IB: same treatment as gfx.
    page.ib(&pageIb);
    pageIb.parent(&page);
    page.valid(true);
    pageIb.valid(true);

    // RLC queues are only linked here; they are marked valid later by
    // registerRLCQueue().
    rlc0.ib(&rlc0Ib);
    rlc0Ib.parent(&rlc0);

    rlc1.ib(&rlc1Ib);
    rlc1Ib.parent(&rlc1);
}
/**
 * Remember the GPU device this engine belongs to and hand the device's
 * VRAM requestor id to the page table walker.
 *
 * @param gpu_device owning device; dereferenced here, so must not be null
 */
void
SDMAEngine::setGPUDevice(AMDGPUDevice *gpu_device)
{
    gpuDevice = gpu_device;
    walker->setDevRequestor(gpu_device->vramRequestorId());
}
/**
 * Map this engine's id to the interrupt handler client id.
 *
 * @return SOC15_IH_CLIENTID_SDMA0 for id 0, SOC15_IH_CLIENTID_SDMA1 for
 *         id 1; any other id panics.
 */
int
SDMAEngine::getIHClientId()
{
    if (id == 0) {
        return SOC15_IH_CLIENTID_SDMA0;
    }
    if (id == 1) {
        return SOC15_IH_CLIENTID_SDMA1;
    }
    panic("Unknown SDMA id");
}
/**
 * Convert an address into the form used for GART translation.
 *
 * Addresses inside the AGP aperture are returned untouched. For any other
 * address the 4 KiB page number (bits above 11) is multiplied by 8 while
 * the 12-bit page offset is preserved.
 *
 * @param addr address taken from a packet or register
 * @return the (possibly rescaled) address
 */
Addr
SDMAEngine::getGARTAddr(Addr addr) const
{
    if (gpuDevice->getVM().inAGP(addr)) {
        return addr;
    }

    const Addr page_offset = bits(addr, 11, 0);
    const Addr page_number = addr >> 12;
    // presumably the factor of 8 is the byte size of one page table
    // entry -- TODO confirm against the GART translation generator.
    return ((page_number << 3) << 12) | page_offset;
}
/**
 * Work out whether an SDMA address refers to device memory and, if so,
 * return its offset from the MMHUB aperture base.
 *
 * A packet's source or destination may live in host or device memory, and
 * that is only known after translation, so a 64-byte translation is
 * performed purely to obtain the physical address of the first range.
 *
 * - cur_vmid == 0: the raw packet address is tested against the MMHUB
 *   aperture directly.
 * - cur_vmid != 0: the translated address is tested instead.
 *
 * @param raw_addr address as it appears in the packet
 * @return offset into the MMHUB aperture, or 0 when the address is not in
 *         the aperture (i.e. it is treated as host memory)
 */
Addr
SDMAEngine::getDeviceAddress(Addr raw_addr)
{
    auto tgen = translate(raw_addr, 64);
    auto first_range = *(tgen->begin());
    Addr translated = first_range.paddr;

    DPRINTF(SDMAEngine, "getDeviceAddress raw_addr %#lx -> %#lx\n",
            raw_addr, translated);

    // Pick which form of the address is meaningful for the current vmid.
    const Addr candidate = (cur_vmid == 0) ? raw_addr : translated;

    if (!gpuDevice->getVM().inMMHUB(candidate)) {
        return 0;
    }
    return candidate - gpuDevice->getVM().getMMHUBBase();
}
/**
 * Create the translation generator appropriate for an address.
 *
 * Selection order:
 *  1. cur_vmid > 0: user translation through the page table walker.
 *  2. Address in the AGP aperture: AGP translation.
 *  3. Address in the MMHUB aperture: MMHUB translation.
 *  4. Otherwise: GART translation, the only remaining aperture available
 *     to the SDMA engine.
 *
 * @param vaddr start address of the region to translate
 * @param size length of the region in bytes
 * @return generator yielding the translated ranges
 */
TranslationGenPtr
SDMAEngine::translate(Addr vaddr, Addr size)
{
    auto *vm = &gpuDevice->getVM();

    // User queues (vmid > 0) may only use user translation.
    if (cur_vmid > 0) {
        return TranslationGenPtr(new AMDGPUVM::UserTranslationGen(
            vm, walker, cur_vmid, vaddr, size));
    }

    if (vm->inAGP(vaddr)) {
        return TranslationGenPtr(
            new AMDGPUVM::AGPTranslationGen(vm, vaddr, size));
    }

    if (vm->inMMHUB(vaddr)) {
        return TranslationGenPtr(
            new AMDGPUVM::MMHUBTranslationGen(vm, vaddr, size));
    }

    return TranslationGenPtr(
        new AMDGPUVM::GARTTranslationGen(vm, vaddr, size));
}
/**
 * Map a doorbell to the first free RLC queue and reset that queue.
 *
 * Both RLC slots are initialized through the same code path so that a
 * freshly mapped queue always starts with read and write pointers at 0
 * (previously RLC1 started at 1, unlike RLC0).
 *
 * @param doorbell doorbell offset that will identify the queue
 * @param rb_base base address of the queue's ring buffer
 *
 * Panics when both RLC queues are already in use.
 */
void
SDMAEngine::registerRLCQueue(Addr doorbell, Addr rb_base)
{
    // Get first free RLC
    SDMAQueue *rlc = nullptr;
    int slot = -1;
    if (!rlc0.valid()) {
        rlc = &rlc0;
        slot = 0;
    } else if (!rlc1.valid()) {
        rlc = &rlc1;
        slot = 1;
    } else {
        panic("No free RLCs. Check they are properly unmapped.");
    }

    DPRINTF(SDMAEngine, "Doorbell %lx mapped to RLC%d\n", doorbell, slot);

    rlcInfo[slot] = doorbell;
    rlc->valid(true);
    rlc->base(rb_base);
    rlc->rptr(0);
    rlc->wptr(0);
    rlc->processing(false);
    // TODO: size - I think pull from MQD 2^rb_cntrl[6:1]-1
    rlc->size(1024*1024);
}
/**
 * Release the RLC queue mapped to a doorbell.
 *
 * The matching queue is marked invalid and its doorbell slot is zeroed.
 * RLC0 is checked before RLC1. Panics if neither slot holds the doorbell.
 *
 * @param doorbell doorbell offset the queue was registered with
 */
void
SDMAEngine::unregisterRLCQueue(Addr doorbell)
{
    DPRINTF(SDMAEngine, "Unregistering RLC queue at %#lx\n", doorbell);

    if (rlcInfo[0] == doorbell) {
        rlc0.valid(false);
        rlcInfo[0] = 0;
        return;
    }

    if (rlcInfo[1] == doorbell) {
        rlc1.valid(false);
        rlcInfo[1] = 0;
        return;
    }

    panic("Cannot unregister: no RLC queue at %#lx\n", doorbell);
}
/**
 * Unregister every doorbell currently recorded in rlcInfo.
 *
 * Each entry is copied before the call because unregisterRLCQueue()
 * writes to rlcInfo while this loop walks it.
 */
void
SDMAEngine::deallocateRLCQueues()
{
    for (auto mapped_doorbell : rlcInfo) {
        unregisterRLCQueue(mapped_doorbell);
    }
}
/**
 * Update the Gfx queue's write pointer and, if the queue is idle, start
 * decoding packets from it.
 *
 * @param wptrOffset new write pointer offset
 */
void
SDMAEngine::processGfx(Addr wptrOffset)
{
    gfx.setWptr(wptrOffset);

    // A decode chain already in flight will pick up the new write pointer.
    if (gfx.processing()) {
        return;
    }

    gfx.processing(true);
    decodeNext(&gfx);
}
/**
 * Update the Page queue's write pointer and, if the queue is idle, start
 * decoding packets from it.
 *
 * @param wptrOffset new write pointer offset
 */
void
SDMAEngine::processPage(Addr wptrOffset)
{
    page.setWptr(wptrOffset);

    // A decode chain already in flight will pick up the new write pointer.
    if (page.processing()) {
        return;
    }

    page.processing(true);
    decodeNext(&page);
}
/**
 * Route a doorbell write to the RLC queue registered for that doorbell.
 *
 * @param doorbellOffset doorbell that was rung
 * @param wptrOffset new write pointer offset for the queue
 *
 * Panics if no RLC queue is registered at doorbellOffset.
 */
void
SDMAEngine::processRLC(Addr doorbellOffset, Addr wptrOffset)
{
    if (rlcInfo[0] == doorbellOffset) {
        processRLC0(wptrOffset);
        return;
    }

    if (rlcInfo[1] == doorbellOffset) {
        processRLC1(wptrOffset);
        return;
    }

    panic("Cannot process: no RLC queue at %#lx\n", doorbellOffset);
}
/**
 * Update the RLC0 queue's write pointer and, if the queue is idle, start
 * decoding packets from it with cur_vmid set to 1.
 *
 * @param wptrOffset new write pointer offset
 */
void
SDMAEngine::processRLC0(Addr wptrOffset)
{
    assert(rlc0.valid());

    rlc0.setWptr(wptrOffset);
    if (rlc0.processing()) {
        return;
    }

    // RLC packets are decoded with a non-zero vmid so that translate()
    // takes the user translation path.
    cur_vmid = 1;
    rlc0.processing(true);
    decodeNext(&rlc0);
}
/**
 * Update the RLC1 queue's write pointer and, if the queue is idle, start
 * decoding packets from it with cur_vmid set to 1.
 *
 * @param wptrOffset new write pointer offset
 */
void
SDMAEngine::processRLC1(Addr wptrOffset)
{
    assert(rlc1.valid());

    rlc1.setWptr(wptrOffset);
    if (rlc1.processing()) {
        return;
    }

    // RLC packets are decoded with a non-zero vmid so that translate()
    // takes the user translation path.
    cur_vmid = 1;
    rlc1.processing(true);
    decodeNext(&rlc1);
}
/**
 * Fetch the next packet header from a queue, or wind the queue down when
 * it is empty.
 *
 * When the read pointer has caught up with the write pointer the queue
 * stops processing; if it has a parent (i.e. it is an indirect buffer),
 * decoding resumes in the parent. cur_vmid is then reset to 0.
 *
 * @param q queue to decode from
 */
void
SDMAEngine::decodeNext(SDMAQueue *q)
{
    DPRINTF(SDMAEngine, "SDMA decode rptr %p wptr %p\n", q->rptr(), q->wptr());

    if (q->rptr() == q->wptr()) {
        // Nothing left in this queue.
        q->processing(false);
        if (q->parent()) {
            DPRINTF(SDMAEngine, "SDMA switching queues\n");
            decodeNext(q->parent());
        }
        cur_vmid = 0;
        return;
    }

    // The DmaVirtCallback owns a dmaBuffer member that receives the data
    // read; once the read completes the lambda is invoked with that
    // buffer (the header) and forwards it to the actual handler.
    auto cb = new DmaVirtCallback<uint32_t>(
        [ = ] (const uint32_t &header) { decodeHeader(q, header); });
    dmaReadVirt(q->rptr(), sizeof(uint32_t), cb, &cb->dmaBuffer);
}
/**
 * Decode the header dword of a packet and dispatch on its opcode.
 *
 * The read pointer is first advanced past the header. For implemented
 * packets a buffer for the packet body is allocated and a DMA read of the
 * body is issued; the matching handler runs from the read's callback and
 * is responsible for freeing the buffer. Packets that are recognized but
 * not implemented are either skipped with a warning or cause a panic.
 *
 * @param q queue the header was read from
 * @param header first dword of the packet (opcode in bits 7:0, sub-opcode
 *        in bits 15:8)
 */
void
SDMAEngine::decodeHeader(SDMAQueue *q, uint32_t header)
{
    q->incRptr(sizeof(header));

    int opcode = bits(header, 7, 0);
    int sub_opcode = bits(header, 15, 8);

    DPRINTF(SDMAEngine, "SDMA opcode %p sub-opcode %p\n", opcode, sub_opcode);

    switch(opcode) {
      case SDMA_OP_NOP: {
        // Bits 29:16 hold the number of dwords to skip after the header.
        uint32_t NOP_count = (header >> 16) & 0x3FFF;
        DPRINTF(SDMAEngine, "SDMA NOP packet with count %d\n", NOP_count);
        if (NOP_count > 0) q->incRptr(NOP_count * 4);
        decodeNext(q);
        } break;
      case SDMA_OP_COPY: {
        DPRINTF(SDMAEngine, "SDMA Copy packet\n");
        switch (sub_opcode) {
          case SDMA_SUBOP_COPY_LINEAR: {
            auto *pkt = new sdmaCopy();
            auto cb = new DmaVirtCallback<uint64_t>(
                [ = ] (const uint64_t &) { copy(q, pkt); });
            dmaReadVirt(q->rptr(), sizeof(sdmaCopy), cb, pkt);
            } break;
          case SDMA_SUBOP_COPY_LINEAR_SUB_WIND: {
            panic("SDMA_SUBOP_COPY_LINEAR_SUB_WIND not implemented");
            } break;
          case SDMA_SUBOP_COPY_TILED: {
            panic("SDMA_SUBOP_COPY_TILED not implemented");
            } break;
          case SDMA_SUBOP_COPY_TILED_SUB_WIND: {
            panic("SDMA_SUBOP_COPY_TILED_SUB_WIND not implemented");
            } break;
          case SDMA_SUBOP_COPY_T2T_SUB_WIND: {
            panic("SDMA_SUBOP_COPY_T2T_SUB_WIND not implemented");
            } break;
          case SDMA_SUBOP_COPY_SOA: {
            panic("SDMA_SUBOP_COPY_SOA not implemented");
            } break;
          case SDMA_SUBOP_COPY_DIRTY_PAGE: {
            panic("SDMA_SUBOP_COPY_DIRTY_PAGE not implemented");
            } break;
          case SDMA_SUBOP_COPY_LINEAR_PHY: {
            panic("SDMA_SUBOP_COPY_LINEAR_PHY not implemented");
            } break;
          default: {
            panic("SDMA unknown copy sub-opcode.");
            } break;
        }
        } break;
      case SDMA_OP_WRITE: {
        DPRINTF(SDMAEngine, "SDMA Write packet\n");
        switch (sub_opcode) {
          case SDMA_SUBOP_WRITE_LINEAR: {
            auto *pkt = new sdmaWrite();
            auto cb = new DmaVirtCallback<uint64_t>(
                [ = ] (const uint64_t &) { write(q, pkt); });
            dmaReadVirt(q->rptr(), sizeof(sdmaWrite), cb, pkt);
            } break;
          case SDMA_SUBOP_WRITE_TILED: {
            panic("SDMA_SUBOP_WRITE_TILED not implemented.\n");
            } break;
          default:
            // NOTE(review): an unknown write sub-opcode is dropped
            // without calling decodeNext(), so decoding of this queue
            // stops here. Behavior kept as-is; confirm whether a panic
            // is preferable.
            break;
        }
        } break;
      case SDMA_OP_INDIRECT: {
        DPRINTF(SDMAEngine, "SDMA IndirectBuffer packet\n");
        auto *pkt = new sdmaIndirectBuffer();
        auto cb = new DmaVirtCallback<uint64_t>(
            [ = ] (const uint64_t &) { indirectBuffer(q, pkt); });
        dmaReadVirt(q->rptr(), sizeof(sdmaIndirectBuffer), cb, pkt);
        } break;
      case SDMA_OP_FENCE: {
        DPRINTF(SDMAEngine, "SDMA Fence packet\n");
        auto *pkt = new sdmaFence();
        auto cb = new DmaVirtCallback<uint64_t>(
            [ = ] (const uint64_t &) { fence(q, pkt); });
        dmaReadVirt(q->rptr(), sizeof(sdmaFence), cb, pkt);
        } break;
      case SDMA_OP_TRAP: {
        DPRINTF(SDMAEngine, "SDMA Trap packet\n");
        auto *pkt = new sdmaTrap();
        auto cb = new DmaVirtCallback<uint64_t>(
            [ = ] (const uint64_t &) { trap(q, pkt); });
        dmaReadVirt(q->rptr(), sizeof(sdmaTrap), cb, pkt);
        } break;
      case SDMA_OP_SEM: {
        q->incRptr(sizeof(sdmaSemaphore));
        warn("SDMA_OP_SEM not implemented");
        decodeNext(q);
        } break;
      case SDMA_OP_POLL_REGMEM: {
        DPRINTF(SDMAEngine, "SDMA PollRegMem packet\n");
        // Reject the unimplemented sub-opcodes before allocating anything.
        switch (sub_opcode) {
          case SDMA_SUBOP_POLL_REG_WRITE_MEM: {
            panic("SDMA_SUBOP_POLL_REG_WRITE_MEM not implemented");
            } break;
          case SDMA_SUBOP_POLL_DBIT_WRITE_MEM: {
            panic("SDMA_SUBOP_POLL_DBIT_WRITE_MEM not implemented");
            } break;
          case SDMA_SUBOP_POLL_MEM_VERIFY: {
            panic("SDMA_SUBOP_POLL_MEM_VERIFY not implemented");
            } break;
          default:
            break;
        }
        // The handler needs fields from the header dword, so keep a copy.
        sdmaPollRegMemHeader *h = new sdmaPollRegMemHeader();
        *h = *(sdmaPollRegMemHeader *)&header;
        auto *pkt = new sdmaPollRegMem();
        auto cb = new DmaVirtCallback<uint64_t>(
            [ = ] (const uint64_t &) { pollRegMem(q, h, pkt); });
        dmaReadVirt(q->rptr(), sizeof(sdmaPollRegMem), cb, pkt);
        } break;
      case SDMA_OP_COND_EXE: {
        q->incRptr(sizeof(sdmaCondExec));
        warn("SDMA_OP_COND_EXE not implemented");
        decodeNext(q);
        } break;
      case SDMA_OP_ATOMIC: {
        DPRINTF(SDMAEngine, "SDMA Atomic packet\n");
        auto *pkt = new sdmaAtomic();
        // The handler needs fields from the header dword, so keep a copy.
        sdmaAtomicHeader *h = new sdmaAtomicHeader();
        *h = *(sdmaAtomicHeader *)&header;
        auto cb = new DmaVirtCallback<uint64_t>(
            [ = ] (const uint64_t &) { atomic(q, h, pkt); });
        dmaReadVirt(q->rptr(), sizeof(sdmaAtomic), cb, pkt);
        } break;
      case SDMA_OP_CONST_FILL: {
        q->incRptr(sizeof(sdmaConstFill));
        warn("SDMA_OP_CONST_FILL not implemented");
        decodeNext(q);
        } break;
      case SDMA_OP_PTEPDE: {
        DPRINTF(SDMAEngine, "SDMA PTEPDE packet\n");
        switch (sub_opcode) {
          case SDMA_SUBOP_PTEPDE_GEN: {
            DPRINTF(SDMAEngine, "SDMA PTEPDE_GEN sub-opcode\n");
            auto *pkt = new sdmaPtePde();
            auto cb = new DmaVirtCallback<uint64_t>(
                [ = ] (const uint64_t &) { ptePde(q, pkt); });
            dmaReadVirt(q->rptr(), sizeof(sdmaPtePde), cb, pkt);
            } break;
          case SDMA_SUBOP_PTEPDE_COPY:
            panic("SDMA_SUBOP_PTEPDE_COPY not implemented");
            break;
          case SDMA_SUBOP_PTEPDE_COPY_BACKWARDS:
            panic("SDMA_SUBOP_PTEPDE_COPY_BACKWARDS not implemented");
            break;
          case SDMA_SUBOP_PTEPDE_RMW: {
            panic("SDMA_SUBOP_PTEPDE_RMW not implemented");
            } break;
          default:
            DPRINTF(SDMAEngine, "Unsupported PTEPDE sub-opcode %d\n",
                    sub_opcode);
            decodeNext(q);
            break;
        }
        } break;
      case SDMA_OP_TIMESTAMP: {
        // No timestamp sub-opcode (SET, GET, GET_GLOBAL) is implemented;
        // the packet body is skipped regardless of sub-opcode.
        q->incRptr(sizeof(sdmaTimestamp));
        warn("SDMA_OP_TIMESTAMP not implemented");
        decodeNext(q);
        } break;
      case SDMA_OP_SRBM_WRITE: {
        DPRINTF(SDMAEngine, "SDMA SRBMWrite packet\n");
        // Copy the header dword. The local must not be named `header`:
        // that would shadow the parameter and copy the pointer's own
        // bytes instead of the packet header.
        sdmaSRBMWriteHeader *h = new sdmaSRBMWriteHeader();
        *h = *(sdmaSRBMWriteHeader *)&header;
        auto *pkt = new sdmaSRBMWrite();
        auto cb = new DmaVirtCallback<uint64_t>(
            [ = ] (const uint64_t &) { srbmWrite(q, h, pkt); });
        dmaReadVirt(q->rptr(), sizeof(sdmaSRBMWrite), cb, pkt);
        } break;
      case SDMA_OP_PRE_EXE: {
        q->incRptr(sizeof(sdmaPredExec));
        warn("SDMA_OP_PRE_EXE not implemented");
        decodeNext(q);
        } break;
      case SDMA_OP_DUMMY_TRAP: {
        q->incRptr(sizeof(sdmaDummyTrap));
        warn("SDMA_OP_DUMMY_TRAP not implemented");
        decodeNext(q);
        } break;
      default: {
        panic("Invalid SDMA packet.\n");
        } break;
    }
}
/**
 * First stage of a linear write packet: read the payload that follows the
 * packet body in the queue.
 *
 * @param q queue the packet came from
 * @param pkt decoded packet body; its count field is incremented in place
 *        and the packet is freed in writeDone()
 */
void
SDMAEngine::write(SDMAQueue *q, sdmaWrite *pkt)
{
    q->incRptr(sizeof(sdmaWrite));

    // The packet stores (number of dwords - 1).
    pkt->count++;
    DPRINTF(SDMAEngine, "Write %d dwords to %lx\n", pkt->count, pkt->dest);

    // The data to write sits in the queue right after the packet; fetch
    // it before touching the destination.
    uint32_t *payload = new uint32_t[pkt->count];
    auto cb = new DmaVirtCallback<uint64_t>(
        [ = ] (const uint64_t &) { writeReadData(q, pkt, payload); });
    dmaReadVirt(q->rptr(), sizeof(uint32_t) * pkt->count, cb,
                (void *)payload);
}
/**
 * Second stage of a linear write packet: the payload has been read from
 * the queue, now write it to the destination.
 *
 * Destinations inside the MMHUB aperture are written through the device
 * memory manager; all other destinations are converted with
 * getGARTAddr() and written with a virtual DMA.
 *
 * @param q queue the packet came from
 * @param pkt decoded packet body (count already holds the dword count)
 * @param dmaBuffer payload dwords; freed in writeDone()
 */
void
SDMAEngine::writeReadData(SDMAQueue *q, sdmaWrite *pkt, uint32_t *dmaBuffer)
{
    int bufferSize = sizeof(uint32_t) * pkt->count;
    q->incRptr(bufferSize);

    DPRINTF(SDMAEngine, "Write packet data:\n");
    for (int i = 0; i < pkt->count; ++i) {
        DPRINTF(SDMAEngine, "%08x\n", dmaBuffer[i]);
    }

    // lastly we write read data to the destination address
    if (gpuDevice->getVM().inMMHUB(pkt->dest)) {
        Addr mmhubAddr = pkt->dest - gpuDevice->getVM().getMMHUBBase();
        auto cb = new EventFunctionWrapper(
            [ = ]{ writeDone(q, pkt, dmaBuffer); }, name());
        gpuDevice->getMemMgr()->writeRequest(mmhubAddr, (uint8_t *)dmaBuffer,
                                             bufferSize, 0, cb);
    } else {
        pkt->dest = getGARTAddr(pkt->dest);
        // The callback's value argument is unused; the template argument
        // matches the lambda's parameter type, as elsewhere in this file.
        auto cb = new DmaVirtCallback<uint64_t>(
            [ = ] (const uint64_t &) { writeDone(q, pkt, dmaBuffer); });
        dmaWriteVirt(pkt->dest, bufferSize, cb, (void *)dmaBuffer);
    }
}
/**
 * Final stage of a linear write packet: release the payload and packet,
 * then continue decoding the queue.
 *
 * @param q queue the packet came from
 * @param pkt packet body to free
 * @param dmaBuffer payload array to free
 */
void
SDMAEngine::writeDone(SDMAQueue *q, sdmaWrite *pkt, uint32_t *dmaBuffer)
{
    DPRINTF(SDMAEngine, "Write packet completed to %p, %d dwords\n",
            pkt->dest, pkt->count);

    delete [] dmaBuffer;
    delete pkt;

    decodeNext(q);
}
/**
 * First stage of a linear copy packet: read the source data into a
 * staging buffer.
 *
 * The source address is first converted with getGARTAddr(). If it then
 * resolves to device memory, the data is read through the device memory
 * manager one MMHUB page at a time; otherwise a single virtual DMA read
 * is issued. copyReadData() runs when the (last) read completes.
 *
 * @param q queue the packet came from
 * @param pkt decoded packet body; count and source are updated in place
 */
void
SDMAEngine::copy(SDMAQueue *q, sdmaCopy *pkt)
{
    DPRINTF(SDMAEngine, "Copy src: %lx -> dest: %lx count %d\n",
            pkt->source, pkt->dest, pkt->count);
    q->incRptr(sizeof(sdmaCopy));

    // The packet stores (number of bytes - 1).
    pkt->count++;

    DPRINTF(SDMAEngine, "Getting GART addr for %lx\n", pkt->source);
    pkt->source = getGARTAddr(pkt->source);
    DPRINTF(SDMAEngine, "GART addr %lx\n", pkt->source);

    uint8_t *staging = new uint8_t[pkt->count];
    Addr device_addr = getDeviceAddress(pkt->source);

    if (!device_addr) {
        // Source is not device memory: one virtual DMA read covers it.
        auto cb = new DmaVirtCallback<uint64_t>(
            [ = ] (const uint64_t &) { copyReadData(q, pkt, staging); });
        dmaReadVirt(pkt->source, pkt->count, cb, (void *)staging);
        return;
    }

    DPRINTF(SDMAEngine, "Copying from device address %#lx\n", device_addr);
    auto cb = new EventFunctionWrapper(
        [ = ]{ copyReadData(q, pkt, staging); }, name());

    // Physical addresses may not be contiguous, so go one minimum-size
    // page at a time. Only the final chunk carries the completion event.
    uint8_t *cursor = staging;
    ChunkGenerator gen(pkt->source, pkt->count, AMDGPU_MMHUB_PAGE_SIZE);
    while (!gen.done()) {
        Addr chunk_addr = getDeviceAddress(gen.addr());
        assert(chunk_addr);

        DPRINTF(SDMAEngine, "Copying chunk of %d bytes from %#lx (%#lx)\n",
                gen.size(), gen.addr(), chunk_addr);

        gpuDevice->getMemMgr()->readRequest(chunk_addr, cursor,
                                            gen.size(), 0,
                                            gen.last() ? cb : nullptr);
        cursor += gen.size();
        gen.next();
    }
}
/**
 * Second stage of a linear copy packet: the source data is in the staging
 * buffer, now write it to the destination.
 *
 * If the destination resolves to device memory, the data is written
 * through the device memory manager one MMHUB page at a time; otherwise a
 * single virtual DMA write is issued. copyDone() runs when the (last)
 * write completes.
 *
 * @param q queue the packet came from
 * @param pkt decoded packet body (count already holds the byte count)
 * @param dmaBuffer staging buffer of pkt->count bytes; freed in copyDone()
 */
void
SDMAEngine::copyReadData(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer)
{
    // Trace the data as whole qwords. Copies shorter than 8 bytes have
    // no complete qword, so nothing may be read through the 64-bit view
    // in that case (doing so would run past the staging buffer).
    uint64_t *dmaBuffer64 = reinterpret_cast<uint64_t *>(dmaBuffer);
    const uint64_t num_qwords = pkt->count / 8;

    DPRINTF(SDMAEngine, "Copy packet last/first qwords:\n");
    if (num_qwords > 0) {
        DPRINTF(SDMAEngine, "First: %016lx\n", dmaBuffer64[0]);
        DPRINTF(SDMAEngine, "Last: %016lx\n", dmaBuffer64[num_qwords - 1]);
    }

    DPRINTF(SDMAData, "Copy packet data:\n");
    for (uint64_t i = 0; i < num_qwords; ++i) {
        DPRINTF(SDMAData, "%016lx\n", dmaBuffer64[i]);
    }

    Addr device_addr = getDeviceAddress(pkt->dest);

    // Write read data to the destination address then call the copyDone method
    if (device_addr) {
        DPRINTF(SDMAEngine, "Copying to device address %#lx\n", device_addr);
        auto cb = new EventFunctionWrapper(
            [ = ]{ copyDone(q, pkt, dmaBuffer); }, name());

        // Copy the minimum page size at a time in case the physical
        // addresses are not contiguous. The callback above keeps the
        // buffer's base pointer; only the cursor advances.
        uint8_t *cursor = dmaBuffer;
        ChunkGenerator gen(pkt->dest, pkt->count, AMDGPU_MMHUB_PAGE_SIZE);
        for (; !gen.done(); gen.next()) {
            Addr chunk_addr = getDeviceAddress(gen.addr());
            assert(chunk_addr);

            DPRINTF(SDMAEngine, "Copying chunk of %d bytes to %#lx (%#lx)\n",
                    gen.size(), gen.addr(), chunk_addr);

            gpuDevice->getMemMgr()->writeRequest(chunk_addr, cursor,
                                                 gen.size(), 0,
                                                 gen.last() ? cb : nullptr);
            cursor += gen.size();
        }
    } else {
        auto cb = new DmaVirtCallback<uint64_t>(
            [ = ] (const uint64_t &) { copyDone(q, pkt, dmaBuffer); });
        dmaWriteVirt(pkt->dest, pkt->count, cb, (void *)dmaBuffer);
    }
}
/**
 * Final stage of a linear copy packet: release the staging buffer and
 * packet, then continue decoding the queue.
 *
 * @param q queue the packet came from
 * @param pkt packet body to free
 * @param dmaBuffer staging buffer to free
 */
void
SDMAEngine::copyDone(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer)
{
    // NOTE: the trace labels the count as dwords, although copy() treats
    // pkt->count as a byte count.
    DPRINTF(SDMAEngine, "Copy completed to %p, %d dwords\n",
            pkt->dest, pkt->count);

    delete [] dmaBuffer;
    delete pkt;

    decodeNext(q);
}
/**
 * Handle an indirect buffer packet: point the queue's IB queue at the
 * buffer described by the packet and start decoding from it.
 *
 * @param q queue the packet came from
 * @param pkt decoded packet body; freed here
 */
void
SDMAEngine::indirectBuffer(SDMAQueue *q, sdmaIndirectBuffer *pkt)
{
    SDMAQueue *ib = q->ib();
    // pkt->size is scaled by the dword size to obtain a byte count.
    const auto ib_bytes = pkt->size * sizeof(uint32_t);

    ib->base(getGARTAddr(pkt->base));
    ib->rptr(0);
    ib->size(ib_bytes + 1);
    ib->setWptr(ib_bytes);

    q->incRptr(sizeof(sdmaIndirectBuffer));

    delete pkt;
    decodeNext(ib);
}
/**
 * Handle a fence packet: write the packet's data value to its destination
 * address (after getGARTAddr() conversion).
 *
 * @param q queue the packet came from
 * @param pkt decoded packet body; freed in fenceDone()
 */
void
SDMAEngine::fence(SDMAQueue *q, sdmaFence *pkt)
{
    q->incRptr(sizeof(sdmaFence));
    pkt->dest = getGARTAddr(pkt->dest);

    // The callback's own dmaBuffer is seeded with the fence value and
    // used as the source of the DMA write.
    auto done_cb = new DmaVirtCallback<uint32_t>(
        [ = ] (const uint32_t &) { fenceDone(q, pkt); }, pkt->data);
    dmaWriteVirt(pkt->dest, sizeof(pkt->data), done_cb, &done_cb->dmaBuffer);
}
/**
 * Completion of a fence packet: free the packet and continue decoding.
 *
 * @param q queue the packet came from
 * @param pkt packet body to free
 */
void
SDMAEngine::fenceDone(SDMAQueue *q, sdmaFence *pkt)
{
    DPRINTF(SDMAEngine, "Fence completed to %p, data 0x%x\n",
            pkt->dest, pkt->data);

    delete pkt;

    decodeNext(q);
}
/**
 * Handle a trap packet: raise an interrupt through the device's interrupt
 * handler using the packet's interrupt context.
 *
 * The ring id is 3 when the page queue is processing and 0 otherwise. The
 * assertion requires exactly one of the page and gfx queues to be
 * processing at this point.
 *
 * @param q queue the packet came from
 * @param pkt decoded packet body; freed here
 */
void
SDMAEngine::trap(SDMAQueue *q, sdmaTrap *pkt)
{
    q->incRptr(sizeof(sdmaTrap));

    DPRINTF(SDMAEngine, "Trap contextId: %p\n", pkt->intrContext);

    assert(page.processing() ^ gfx.processing());
    const uint32_t ring_id = page.processing() ? 3 : 0;

    auto *ih = gpuDevice->getIH();
    ih->prepareInterruptCookie(pkt->intrContext, ring_id,
                               getIHClientId(), TRAP_ID);
    gpuDevice->getIH()->submitInterruptCookie();

    delete pkt;
    decodeNext(q);
}
/**
 * Handle an SRBM write packet. No SRBM model exists, so the write is not
 * performed: the data is masked according to the header's byte enables,
 * traced, and the packet is discarded.
 *
 * @param q queue the packet came from
 * @param header copy of the packet header; freed here
 * @param pkt decoded packet body; freed here
 */
void
SDMAEngine::srbmWrite(SDMAQueue *q, sdmaSRBMWriteHeader *header,
                      sdmaSRBMWrite *pkt)
{
    q->incRptr(sizeof(sdmaSRBMWrite));

    [[maybe_unused]] uint32_t reg_addr = pkt->regAddr << 2;

    // Each of the four byte-enable bits selects one byte lane of the data.
    uint32_t reg_mask = 0x00000000;
    for (int lane = 0; lane < 4; ++lane) {
        if (header->byteEnable & (0x1 << lane)) {
            reg_mask |= 0xFFu << (8 * lane);
        }
    }
    pkt->data &= reg_mask;

    DPRINTF(SDMAEngine, "SRBM write to %#x with data %#x\n",
            reg_addr, pkt->data);

    warn_once("SRBM write not performed, no SRBM model. This needs to be fixed"
              " if correct system simulation is relying on SRBM registers.");

    delete header;
    delete pkt;
    decodeNext(q);
}
/**
 * Handle a poll reg/mem packet, which polls an SRBM register or a memory
 * location and compares the value read against a reference value. When
 * the comparison is unsuccessful the poll is retried, either indefinitely
 * or a limited number of times (see pollRegMemRead()).
 *
 * Only memory polling (mode == 1) with op == 0 is implemented. Other
 * memory operations panic; register polling is skipped with a warning.
 *
 * @param q queue the packet came from
 * @param header copy of the packet header; freed once polling ends
 * @param pkt decoded packet body; freed once polling ends
 */
void
SDMAEngine::pollRegMem(SDMAQueue *q, sdmaPollRegMemHeader *header,
                       sdmaPollRegMem *pkt)
{
    q->incRptr(sizeof(sdmaPollRegMem));

    DPRINTF(SDMAEngine, "POLL_REGMEM: M=%d, func=%d, op=%d, addr=%p, ref=%d, "
            "mask=%p, retry=%d, pinterval=%d\n", header->mode, header->func,
            header->op, pkt->address, pkt->ref, pkt->mask, pkt->retryCount,
            pkt->pollInt);

    const bool poll_memory = (header->mode == 1);

    if (poll_memory && header->op == 0) {
        // Issue the first read; pollRegMemRead() takes over from here and
        // owns header and pkt.
        // NOTE(review): this first read uses pkt->address >> 3 whereas
        // the retries in pollRegMemRead() use pkt->address unshifted --
        // confirm which form is intended.
        auto cb = new DmaVirtCallback<uint32_t>(
            [ = ] (const uint32_t &value) {
                pollRegMemRead(q, header, pkt, value, 0); });
        dmaReadVirt(pkt->address >> 3, sizeof(uint32_t), cb,
                    (void *)&cb->dmaBuffer);
        return;
    }

    if (poll_memory) {
        panic("SDMA poll mem operation not implemented.");
    } else {
        warn_once("SDMA poll reg is not implemented. If this is required for "
                  "correctness, an SRBM model needs to be implemented.");
    }

    // Packet is being skipped: release it and move on.
    delete header;
    delete pkt;
    decodeNext(q);
}
/**
 * Completion of one memory read for a poll reg/mem packet.
 *
 * If the comparison selected by header->func fails and retries remain,
 * another read of pkt->address is issued. A retryCount of 0xfff means
 * retry indefinitely; otherwise polling continues while count is less
 * than retryCount + 1. When polling ends, header and pkt are freed and
 * decoding of the queue continues.
 *
 * @param q queue the packet came from
 * @param header copy of the packet header
 * @param pkt decoded packet body
 * @param dma_buffer value returned by the read that just completed
 * @param count number of retries performed so far
 */
void
SDMAEngine::pollRegMemRead(SDMAQueue *q, sdmaPollRegMemHeader *header,
                           sdmaPollRegMem *pkt, uint32_t dma_buffer, int count)
{
    assert(header->mode == 1 && header->op == 0);

    const bool satisfied =
        pollRegMemFunc(dma_buffer, pkt->ref, header->func);
    const bool retry_forever = (pkt->retryCount == 0xfff);
    const bool retries_left = count < (pkt->retryCount + 1);

    if (!satisfied && (retry_forever || retries_left)) {
        DPRINTF(SDMAEngine, "SDMA polling mem addr %p, val %d ref %d.\n",
                pkt->address, dma_buffer, pkt->ref);

        auto cb = new DmaVirtCallback<uint32_t>(
            [ = ] (const uint32_t &value) {
                pollRegMemRead(q, header, pkt, value, count + 1); });
        dmaReadVirt(pkt->address, sizeof(uint32_t), cb,
                    (void *)&cb->dmaBuffer);
        return;
    }

    DPRINTF(SDMAEngine, "SDMA polling mem addr %p, val %d ref %d done.\n",
            pkt->address, dma_buffer, pkt->ref);

    delete header;
    delete pkt;
    decodeNext(q);
}
/**
 * Evaluate the comparison selected by a poll reg/mem packet.
 *
 * @param value value that was read
 * @param reference reference value from the packet
 * @param func comparison selector: 0 always true, 1 <, 2 <=, 3 ==, 4 !=,
 *        5 >=, 6 >; any other selector panics
 * @return result of comparing value against reference
 */
bool
SDMAEngine::pollRegMemFunc(uint32_t value, uint32_t reference, uint32_t func)
{
    switch (func) {
      case 0: return true;
      case 1: return value < reference;
      case 2: return value <= reference;
      case 3: return value == reference;
      case 4: return value != reference;
      case 5: return value >= reference;
      case 6: return value > reference;
      default:
        panic("SDMA POLL_REGMEM unknown comparison function.");
    }
}
/**
 * Handle a PTE/PDE generation packet: build a table of 64-bit entries
 * and write it to the destination.
 *
 * Entry i is mask | (initValue + i * increment). Destinations inside the
 * MMHUB aperture are written through the device memory manager; all
 * others use a virtual DMA write. ptePdeDone() runs on completion.
 *
 * @param q queue the packet came from
 * @param pkt decoded packet body; count is incremented in place and the
 *        packet is freed in ptePdeDone()
 */
void
SDMAEngine::ptePde(SDMAQueue *q, sdmaPtePde *pkt)
{
    q->incRptr(sizeof(sdmaPtePde));
    pkt->count++;

    DPRINTF(SDMAEngine, "PTEPDE init: %d inc: %d count: %d\n",
            pkt->initValue, pkt->increment, pkt->count);

    // Generate the entries.
    uint64_t *entries = new uint64_t[pkt->count];
    for (int i = 0; i < pkt->count; i++) {
        entries[i] = (pkt->mask | (pkt->initValue + (i * pkt->increment)));
    }

    // Write them out.
    if (!gpuDevice->getVM().inMMHUB(pkt->dest)) {
        auto cb = new DmaVirtCallback<uint64_t>(
            [ = ] (const uint64_t &) { ptePdeDone(q, pkt, entries); });
        dmaWriteVirt(pkt->dest, sizeof(uint64_t) * pkt->count, cb,
                     (void *)entries);
        return;
    }

    Addr mmhubAddr = pkt->dest - gpuDevice->getVM().getMMHUBBase();
    auto cb = new EventFunctionWrapper(
        [ = ]{ ptePdeDone(q, pkt, entries); }, name());
    gpuDevice->getMemMgr()->writeRequest(mmhubAddr, (uint8_t *)entries,
                                         sizeof(uint64_t) * pkt->count, 0,
                                         cb);
}
/**
 * Completion of a PTE/PDE generation packet: free the generated entries
 * and the packet, then continue decoding the queue.
 *
 * @param q queue the packet came from
 * @param pkt packet body to free
 * @param dmaBuffer generated entry array to free
 */
void
SDMAEngine::ptePdeDone(SDMAQueue *q, sdmaPtePde *pkt, uint64_t *dmaBuffer)
{
    DPRINTF(SDMAEngine, "PtePde packet completed to %p, %d 2dwords\n",
            pkt->dest, pkt->count);

    delete [] dmaBuffer;
    delete pkt;

    decodeNext(q);
}
/**
 * First stage of an atomic packet: read the current 64-bit value at the
 * target address. atomicData() runs when the read completes.
 *
 * @param q queue the packet came from
 * @param header copy of the packet header; freed in atomicDone()
 * @param pkt decoded packet body; freed in atomicDone()
 */
void
SDMAEngine::atomic(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt)
{
    q->incRptr(sizeof(sdmaAtomic));

    DPRINTF(SDMAEngine, "Atomic op %d on addr %#lx, src: %ld, cmp: %ld, loop?"
            " %d loopInt: %d\n", header->opcode, pkt->addr, pkt->srcData,
            pkt->cmpData, header->loop, pkt->loopInt);

    // Holds the value read now and, later, the value written back.
    uint64_t *scratch = new uint64_t;
    auto cb = new DmaVirtCallback<uint64_t>(
        [ = ] (const uint64_t &) { atomicData(q, header, pkt, scratch); });
    dmaReadVirt(pkt->addr, sizeof(uint64_t), cb, (void *)scratch);
}
/**
 * Second stage of an atomic packet: apply the operation to the value that
 * was read and write the result back.
 *
 * Only SDMA_ATOMIC_ADD64 is supported (dst = dst + src); any other opcode
 * panics. atomicDone() runs when the write completes.
 *
 * @param q queue the packet came from
 * @param header copy of the packet header
 * @param pkt decoded packet body
 * @param dmaBuffer value read from pkt->addr; overwritten with the result
 */
void
SDMAEngine::atomicData(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt,
                       uint64_t *dmaBuffer)
{
    DPRINTF(SDMAEngine, "Atomic op %d on addr %#lx got data %#lx\n",
            header->opcode, pkt->addr, *dmaBuffer);

    if (header->opcode != SDMA_ATOMIC_ADD64) {
        panic("Unsupported SDMA atomic opcode: %d\n", header->opcode);
    }

    // Atomic add with return -- dst = dst + src
    int64_t dst_data = *dmaBuffer;
    int64_t src_data = pkt->srcData;

    DPRINTF(SDMAEngine, "Atomic ADD_RTN: %ld + %ld = %ld\n", dst_data,
            src_data, dst_data + src_data);

    // The read buffer doubles as the source of the write-back.
    *dmaBuffer = dst_data + src_data;

    auto cb = new DmaVirtCallback<uint64_t>(
        [ = ] (const uint64_t &) { atomicDone(q, header, pkt, dmaBuffer); });
    dmaWriteVirt(pkt->addr, sizeof(uint64_t), cb, (void *)dmaBuffer);
}
/**
 * Completion of an atomic packet: free the scratch value, header copy and
 * packet, then continue decoding the queue.
 *
 * @param q queue the packet came from
 * @param header header copy to free
 * @param pkt packet body to free
 * @param dmaBuffer scratch value to free
 */
void
SDMAEngine::atomicDone(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt,
                       uint64_t *dmaBuffer)
{
    DPRINTF(SDMAEngine, "Atomic op %d op addr %#lx complete (sent %lx)\n",
            header->opcode, pkt->addr, *dmaBuffer);

    delete dmaBuffer;
    delete header;
    delete pkt;

    decodeNext(q);
}
/**
 * Report the address ranges this device responds to.
 *
 * @return always an empty list
 */
AddrRangeList
SDMAEngine::getAddrRanges() const
{
    return AddrRangeList();
}
/**
 * Write the engine state to a checkpoint.
 *
 * Saved state: the DmaVirtDevice base class, the gfx/page ring-buffer
 * registers, and per-queue state (base, raw read/write pointers, size,
 * processing flag) for the gfx, page, gfxIb and pageIb queues, in that
 * order. The RLC queues are not written by this function.
 *
 * @param cp checkpoint to write to
 */
void
SDMAEngine::serialize(CheckpointOut &cp) const
{
    // Serialize the DmaVirtDevice base class
    DmaVirtDevice::serialize(cp);

    SERIALIZE_SCALAR(gfxBase);
    SERIALIZE_SCALAR(gfxRptr);
    SERIALIZE_SCALAR(gfxDoorbell);
    SERIALIZE_SCALAR(gfxDoorbellOffset);
    SERIALIZE_SCALAR(gfxWptr);
    SERIALIZE_SCALAR(pageBase);
    SERIALIZE_SCALAR(pageRptr);
    SERIALIZE_SCALAR(pageDoorbell);
    SERIALIZE_SCALAR(pageDoorbellOffset);
    SERIALIZE_SCALAR(pageWptr);

    // Compile-time constant so the arrays below are ordinary fixed-size
    // arrays rather than variable-length arrays (a compiler extension).
    constexpr int num_queues = 4;

    // The queue accessors are reached through non-const pointers, hence
    // the casts from this const member function.
    SDMAQueue *queues[num_queues] = {
        (SDMAQueue *)&gfx,
        (SDMAQueue *)&page,
        (SDMAQueue *)&gfxIb,
        (SDMAQueue *)&pageIb,
    };

    Addr base[num_queues];
    Addr rptr[num_queues];
    Addr wptr[num_queues];
    Addr size[num_queues];
    bool processing[num_queues];

    for (int i = 0; i < num_queues; i++) {
        base[i] = queues[i]->base();
        rptr[i] = queues[i]->getRptr();
        wptr[i] = queues[i]->getWptr();
        size[i] = queues[i]->size();
        processing[i] = queues[i]->processing();
    }

    SERIALIZE_ARRAY(base, num_queues);
    SERIALIZE_ARRAY(rptr, num_queues);
    SERIALIZE_ARRAY(wptr, num_queues);
    SERIALIZE_ARRAY(size, num_queues);
    SERIALIZE_ARRAY(processing, num_queues);
}
/**
 * Restore the engine state from a checkpoint written by serialize().
 *
 * Restored state: the DmaVirtDevice base class, the gfx/page ring-buffer
 * registers, and per-queue state (base, read/write pointers, size,
 * processing flag) for the gfx, page, gfxIb and pageIb queues, in that
 * order.
 *
 * @param cp checkpoint to read from
 */
void
SDMAEngine::unserialize(CheckpointIn &cp)
{
    // Unserialize the DmaVirtDevice base class
    DmaVirtDevice::unserialize(cp);

    UNSERIALIZE_SCALAR(gfxBase);
    UNSERIALIZE_SCALAR(gfxRptr);
    UNSERIALIZE_SCALAR(gfxDoorbell);
    UNSERIALIZE_SCALAR(gfxDoorbellOffset);
    UNSERIALIZE_SCALAR(gfxWptr);
    UNSERIALIZE_SCALAR(pageBase);
    UNSERIALIZE_SCALAR(pageRptr);
    UNSERIALIZE_SCALAR(pageDoorbell);
    UNSERIALIZE_SCALAR(pageDoorbellOffset);
    UNSERIALIZE_SCALAR(pageWptr);

    // Compile-time constant so the arrays below are ordinary fixed-size
    // arrays rather than variable-length arrays (a compiler extension).
    constexpr int num_queues = 4;

    Addr base[num_queues];
    Addr rptr[num_queues];
    Addr wptr[num_queues];
    Addr size[num_queues];
    bool processing[num_queues];

    UNSERIALIZE_ARRAY(base, num_queues);
    UNSERIALIZE_ARRAY(rptr, num_queues);
    UNSERIALIZE_ARRAY(wptr, num_queues);
    UNSERIALIZE_ARRAY(size, num_queues);
    UNSERIALIZE_ARRAY(processing, num_queues);

    // Same queue order as in serialize().
    SDMAQueue *queues[num_queues] = {
        (SDMAQueue *)&gfx,
        (SDMAQueue *)&page,
        (SDMAQueue *)&gfxIb,
        (SDMAQueue *)&pageIb,
    };

    for (int i = 0; i < num_queues; i++) {
        queues[i]->base(base[i]);
        queues[i]->rptr(rptr[i]);
        queues[i]->wptr(wptr[i]);
        queues[i]->size(size[i]);
        queues[i]->processing(processing[i]);
    }
}
/**
 * Handle a write to one of the engine's MMIO registers.
 *
 * The 32-bit little-endian payload of the packet is stored into the
 * register selected by mmio_offset. Writing a doorbell offset register
 * additionally registers the doorbell with the GPU device when bit 28 of
 * the matching doorbell register is set. Unknown offsets are only traced.
 *
 * @param pkt packet carrying the value to write
 * @param mmio_offset register offset within the SDMA MMIO space
 */
void
SDMAEngine::writeMMIO(PacketPtr pkt, Addr mmio_offset)
{
    const uint32_t data = pkt->getLE<uint32_t>();

    DPRINTF(SDMAEngine, "Writing offset %#x with data %x\n", mmio_offset,
            data);

    // In Vega10 headers, the offsets are the same for both SDMAs
    switch (mmio_offset) {
      case mmSDMA_GFX_RB_BASE:
        setGfxBaseLo(data);
        break;
      case mmSDMA_GFX_RB_BASE_HI:
        setGfxBaseHi(data);
        break;
      case mmSDMA_GFX_RB_RPTR_ADDR_LO:
        setGfxRptrLo(data);
        break;
      case mmSDMA_GFX_RB_RPTR_ADDR_HI:
        setGfxRptrHi(data);
        break;
      case mmSDMA_GFX_DOORBELL:
        setGfxDoorbellLo(data);
        break;
      case mmSDMA_GFX_DOORBELL_OFFSET:
        setGfxDoorbellOffsetLo(data);
        // Bit 28 of doorbell indicates that doorbell is enabled.
        if (bits(getGfxDoorbell(), 28, 28)) {
            gpuDevice->setDoorbellType(getGfxDoorbellOffset(),
                                       QueueType::SDMAGfx);
            gpuDevice->setSDMAEngine(getGfxDoorbellOffset(), this);
        }
        break;
      case mmSDMA_GFX_RB_CNTL: {
        uint32_t rb_size = bits(data, 6, 1);
        assert(rb_size >= 6 && rb_size <= 62);
        // Shift a 64-bit one: the exponent can far exceed the width of
        // int, where the shift would be undefined.
        // NOTE(review): rb_size == 62 passes the assert but needs a
        // 65-bit result -- confirm the intended upper bound.
        setGfxSize(uint64_t(1) << (rb_size + 2));
        } break;
      case mmSDMA_GFX_RB_WPTR_POLL_ADDR_LO:
        setGfxWptrLo(data);
        break;
      case mmSDMA_GFX_RB_WPTR_POLL_ADDR_HI:
        setGfxWptrHi(data);
        break;
      case mmSDMA_PAGE_RB_BASE:
        setPageBaseLo(data);
        break;
      case mmSDMA_PAGE_RB_RPTR_ADDR_LO:
        setPageRptrLo(data);
        break;
      case mmSDMA_PAGE_RB_RPTR_ADDR_HI:
        setPageRptrHi(data);
        break;
      case mmSDMA_PAGE_DOORBELL:
        setPageDoorbellLo(data);
        break;
      case mmSDMA_PAGE_DOORBELL_OFFSET:
        setPageDoorbellOffsetLo(data);
        // Bit 28 of doorbell indicates that doorbell is enabled.
        if (bits(getPageDoorbell(), 28, 28)) {
            gpuDevice->setDoorbellType(getPageDoorbellOffset(),
                                       QueueType::SDMAPage);
            gpuDevice->setSDMAEngine(getPageDoorbellOffset(), this);
        }
        break;
      case mmSDMA_PAGE_RB_CNTL: {
        uint32_t rb_size = bits(data, 6, 1);
        assert(rb_size >= 6 && rb_size <= 62);
        // See mmSDMA_GFX_RB_CNTL above for why the one is 64 bits wide.
        setPageSize(uint64_t(1) << (rb_size + 2));
        } break;
      case mmSDMA_PAGE_RB_WPTR_POLL_ADDR_LO:
        setPageWptrLo(data);
        break;
      default:
        DPRINTF(SDMAEngine, "Unknown SDMA MMIO %#x\n", mmio_offset);
        break;
    }
}
/**
 * Replace bits 31:0 of the gfx ring-buffer base register and refresh the
 * gfx queue's base address, computed as (register >> 1) << 12.
 */
void
SDMAEngine::setGfxBaseLo(uint32_t data)
{
    gfxBase = insertBits(gfxBase, 31, 0, 0) | data;
    gfx.base((gfxBase >> 1) << 12);
}
/**
 * Replace bits 63:32 of the gfx ring-buffer base register and refresh the
 * gfx queue's base address, computed as (register >> 1) << 12.
 */
void
SDMAEngine::setGfxBaseHi(uint32_t data)
{
    gfxBase = insertBits(gfxBase, 63, 32, 0) | (((uint64_t)data) << 32);
    gfx.base((gfxBase >> 1) << 12);
}
/** Replace bits 31:0 of the gfx read-pointer address register. */
void
SDMAEngine::setGfxRptrLo(uint32_t data)
{
    gfxRptr = insertBits(gfxRptr, 31, 0, 0) | data;
}
/** Replace bits 63:32 of the gfx read-pointer address register. */
void
SDMAEngine::setGfxRptrHi(uint32_t data)
{
    gfxRptr = insertBits(gfxRptr, 63, 32, 0) | (((uint64_t)data) << 32);
}
/** Replace bits 31:0 of the gfx doorbell register. */
void
SDMAEngine::setGfxDoorbellLo(uint32_t data)
{
    gfxDoorbell = insertBits(gfxDoorbell, 31, 0, 0) | data;
}
/** Replace bits 63:32 of the gfx doorbell register. */
void
SDMAEngine::setGfxDoorbellHi(uint32_t data)
{
    gfxDoorbell =
        insertBits(gfxDoorbell, 63, 32, 0) | (((uint64_t)data) << 32);
}
/** Replace bits 31:0 of the gfx doorbell offset register. */
void
SDMAEngine::setGfxDoorbellOffsetLo(uint32_t data)
{
    gfxDoorbellOffset = insertBits(gfxDoorbellOffset, 31, 0, 0) | data;
}
/** Replace bits 63:32 of the gfx doorbell offset register. */
void
SDMAEngine::setGfxDoorbellOffsetHi(uint32_t data)
{
    gfxDoorbellOffset =
        insertBits(gfxDoorbellOffset, 63, 32, 0) | (((uint64_t)data) << 32);
}
/**
 * Set the size of the gfx queue.
 *
 * @param data size value forwarded unchanged to the gfx queue
 */
void
SDMAEngine::setGfxSize(uint64_t data)
{
gfx.size(data);
}
/** Replace bits 31:0 of the gfx write-pointer poll address register. */
void
SDMAEngine::setGfxWptrLo(uint32_t data)
{
    gfxWptr = insertBits(gfxWptr, 31, 0, 0) | data;
}
/**
 * Replace bits 63:32 of the gfx write-pointer poll address register.
 *
 * Only the upper half is cleared before the new value is merged in, so
 * the lower half written by setGfxWptrLo() is preserved.
 *
 * @param data new upper 32 bits
 */
void
SDMAEngine::setGfxWptrHi(uint32_t data)
{
    gfxWptr = insertBits(gfxWptr, 63, 32, 0);
    gfxWptr |= ((uint64_t)data) << 32;
}
/**
 * Replace bits 31:0 of the page ring-buffer base register and refresh the
 * page queue's base address, computed as (register >> 1) << 12.
 */
void
SDMAEngine::setPageBaseLo(uint32_t data)
{
    pageBase = insertBits(pageBase, 31, 0, 0) | data;
    page.base((pageBase >> 1) << 12);
}
/**
 * Replace bits 63:32 of the page ring-buffer base register and refresh
 * the page queue's base address, computed as (register >> 1) << 12.
 */
void
SDMAEngine::setPageBaseHi(uint32_t data)
{
    pageBase = insertBits(pageBase, 63, 32, 0) | (((uint64_t)data) << 32);
    page.base((pageBase >> 1) << 12);
}
/** Replace bits 31:0 of the page read-pointer address register. */
void
SDMAEngine::setPageRptrLo(uint32_t data)
{
    pageRptr = insertBits(pageRptr, 31, 0, 0) | data;
}
/** Replace bits 63:32 of the page read-pointer address register. */
void
SDMAEngine::setPageRptrHi(uint32_t data)
{
    pageRptr = insertBits(pageRptr, 63, 32, 0) | (((uint64_t)data) << 32);
}
/** Replace bits 31:0 of the page doorbell register. */
void
SDMAEngine::setPageDoorbellLo(uint32_t data)
{
    pageDoorbell = insertBits(pageDoorbell, 31, 0, 0) | data;
}
/** Replace bits 63:32 of the page doorbell register. */
void
SDMAEngine::setPageDoorbellHi(uint32_t data)
{
    pageDoorbell =
        insertBits(pageDoorbell, 63, 32, 0) | (((uint64_t)data) << 32);
}
/** Replace bits 31:0 of the page doorbell offset register. */
void
SDMAEngine::setPageDoorbellOffsetLo(uint32_t data)
{
    pageDoorbellOffset = insertBits(pageDoorbellOffset, 31, 0, 0) | data;
}
/** Replace bits 63:32 of the page doorbell offset register. */
void
SDMAEngine::setPageDoorbellOffsetHi(uint32_t data)
{
    pageDoorbellOffset =
        insertBits(pageDoorbellOffset, 63, 32, 0) | (((uint64_t)data) << 32);
}
/**
 * Set the size of the page queue.
 *
 * @param data size value forwarded unchanged to the page queue
 */
void
SDMAEngine::setPageSize(uint64_t data)
{
page.size(data);
}
/** Replace bits 31:0 of the page write-pointer poll address register. */
void
SDMAEngine::setPageWptrLo(uint32_t data)
{
    pageWptr = insertBits(pageWptr, 31, 0, 0) | data;
}
/** Replace bits 63:32 of the page write-pointer poll address register. */
void
SDMAEngine::setPageWptrHi(uint32_t data)
{
    pageWptr = insertBits(pageWptr, 63, 32, 0) | (((uint64_t)data) << 32);
}
} // namespace gem5