blob: b3cc702de5934e7f1c560d2ecc08638e686fd0fc [file] [log] [blame]
/*
* Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "dev/hsa/hsa_packet_processor.hh"
#include <cassert>
#include <cstring>
#include "base/chunk_generator.hh"
#include "base/compiler.hh"
#include "base/logging.hh"
#include "base/trace.hh"
#include "debug/HSAPacketProcessor.hh"
#include "dev/dma_device.hh"
#include "dev/hsa/hsa_packet.hh"
#include "dev/hsa/hw_scheduler.hh"
#include "gpu-compute/gpu_command_processor.hh"
#include "mem/packet_access.hh"
#include "mem/page_table.hh"
#include "sim/process.hh"
#include "sim/proxy_ptr.hh"
#include "sim/system.hh"
#define HSAPP_EVENT_DESCRIPTION_GENERATOR(XEVENT) \
const char* \
HSAPacketProcessor::XEVENT::description() const \
{ \
return #XEVENT; \
}
#define PKT_TYPE(PKT) ((hsa_packet_type_t)(((PKT->header) >> \
HSA_PACKET_HEADER_TYPE) & (HSA_PACKET_HEADER_WIDTH_TYPE - 1)))
// checks if the barrier bit is set in the header -- shift the barrier bit
// to LSB, then bitwise "and" to mask off all other bits
#define IS_BARRIER(PKT) ((hsa_packet_header_t)(((PKT->header) >> \
HSA_PACKET_HEADER_BARRIER) & HSA_PACKET_HEADER_WIDTH_BARRIER))
namespace gem5
{
HSAPP_EVENT_DESCRIPTION_GENERATOR(QueueProcessEvent)
HSAPacketProcessor::HSAPacketProcessor(const Params &p)
: DmaVirtDevice(p), numHWQueues(p.numHWQueues), pioAddr(p.pioAddr),
pioSize(PAGE_SIZE), pioDelay(10), pktProcessDelay(p.pktProcessDelay)
{
DPRINTF(HSAPacketProcessor, "%s:\n", __FUNCTION__);
hwSchdlr = new HWScheduler(this, p.wakeupDelay);
regdQList.resize(numHWQueues);
for (int i = 0; i < numHWQueues; i++) {
regdQList[i] = new RQLEntry(this, i);
}
}
HSAPacketProcessor::~HSAPacketProcessor()
{
for (auto &queue : regdQList) {
delete queue;
}
}
void
HSAPacketProcessor::unsetDeviceQueueDesc(uint64_t queue_id, int doorbellSize)
{
hwSchdlr->unregisterQueue(queue_id, doorbellSize);
}
void
HSAPacketProcessor::setDeviceQueueDesc(uint64_t hostReadIndexPointer,
uint64_t basePointer,
uint64_t queue_id,
uint32_t size, int doorbellSize)
{
DPRINTF(HSAPacketProcessor,
"%s:base = %p, qID = %d, ze = %d\n", __FUNCTION__,
(void *)basePointer, queue_id, size);
hwSchdlr->registerNewQueue(hostReadIndexPointer,
basePointer, queue_id, size, doorbellSize);
}
AddrRangeList
HSAPacketProcessor::getAddrRanges() const
{
assert(pioSize != 0);
AddrRangeList ranges;
ranges.push_back(RangeSize(pioAddr, pioSize));
return ranges;
}
// Basically only processes writes to the queue doorbell register.
Tick
HSAPacketProcessor::write(Packet *pkt)
{
assert(pkt->getAddr() >= pioAddr && pkt->getAddr() < pioAddr + pioSize);
// TODO: How to get pid??
GEM5_VAR_USED Addr daddr = pkt->getAddr() - pioAddr;
DPRINTF(HSAPacketProcessor,
"%s: write of size %d to reg-offset %d (0x%x)\n",
__FUNCTION__, pkt->getSize(), daddr, daddr);
assert(gpu_device->driver()->doorbellSize() == pkt->getSize());
uint64_t doorbell_reg(0);
if (pkt->getSize() == 8)
doorbell_reg = pkt->getLE<uint64_t>() + 1;
else if (pkt->getSize() == 4)
doorbell_reg = pkt->getLE<uint32_t>();
else
fatal("invalid db size");
DPRINTF(HSAPacketProcessor,
"%s: write data 0x%x to offset %d (0x%x)\n",
__FUNCTION__, doorbell_reg, daddr, daddr);
hwSchdlr->write(daddr, doorbell_reg);
pkt->makeAtomicResponse();
return pioDelay;
}
Tick
HSAPacketProcessor::read(Packet *pkt)
{
pkt->makeAtomicResponse();
pkt->setBadAddress();
return pioDelay;
}
void
HSAPacketProcessor::translateOrDie(Addr vaddr, Addr &paddr)
{
// Grab the process and try to translate the virtual address with it; with
// new extensions, it will likely be wrong to just arbitrarily grab context
// zero.
auto process = sys->threads[0]->getProcessPtr();
if (!process->pTable->translate(vaddr, paddr))
fatal("failed translation: vaddr 0x%x\n", vaddr);
}
/**
* this event is used to update the read_disp_id field (the read pointer)
* of the MQD, which is how the host code knows the status of the HQD's
* read pointer
*/
void
HSAPacketProcessor::updateReadDispIdDma()
{
DPRINTF(HSAPacketProcessor, "updateReaddispId\n");
}
void
HSAPacketProcessor::updateReadIndex(int pid, uint32_t rl_idx)
{
AQLRingBuffer* aqlbuf = regdQList[rl_idx]->qCntxt.aqlBuf;
HSAQueueDescriptor* qDesc = regdQList[rl_idx]->qCntxt.qDesc;
auto cb = new DmaVirtCallback<uint64_t>(
[ = ] (const uint32_t &dma_data) { this->updateReadDispIdDma(); }, 0);
DPRINTF(HSAPacketProcessor,
"%s: read-pointer offset [0x%x]\n", __FUNCTION__, aqlbuf->rdIdx());
dmaWriteVirt((Addr)qDesc->hostReadIndexPtr, sizeof(aqlbuf->rdIdx()),
cb, aqlbuf->rdIdxPtr());
DPRINTF(HSAPacketProcessor,
"%s: rd-ptr offset [0x%x], wr-ptr offset [0x%x], space used = %d," \
" q size = %d, is_empty = %s, active list ID = %d\n", __FUNCTION__,
qDesc->readIndex, qDesc->writeIndex, qDesc->spaceUsed(),
qDesc->numElts, qDesc->isEmpty()? "true" : "false", rl_idx);
if (qDesc->writeIndex != aqlbuf->wrIdx()) {
getCommandsFromHost(pid, rl_idx);
}
}
void
HSAPacketProcessor::cmdQueueCmdDma(HSAPacketProcessor *hsaPP, int pid,
bool isRead, uint32_t ix_start, unsigned num_pkts,
dma_series_ctx *series_ctx, void *dest_4debug)
{
uint32_t rl_idx = series_ctx->rl_idx;
GEM5_VAR_USED AQLRingBuffer *aqlRingBuffer =
hsaPP->regdQList[rl_idx]->qCntxt.aqlBuf;
HSAQueueDescriptor* qDesc =
hsaPP->regdQList[rl_idx]->qCntxt.qDesc;
DPRINTF(HSAPacketProcessor, ">%s, ix = %d, npkts = %d," \
" pktsRemaining = %d, active list ID = %d\n", __FUNCTION__,
ix_start, num_pkts, series_ctx->pkts_2_go,
rl_idx);
if (isRead) {
series_ctx->pkts_2_go -= num_pkts;
if (series_ctx->pkts_2_go == 0) {
// Mark DMA as completed
qDesc->dmaInProgress = false;
DPRINTF(HSAPacketProcessor,
"%s: schedule Qwakeup next cycle, rdIdx %d, wrIdx %d," \
" dispIdx %d, active list ID = %d\n",
__FUNCTION__, aqlRingBuffer->rdIdx(),
aqlRingBuffer->wrIdx(), aqlRingBuffer->dispIdx(), rl_idx);
// schedule queue wakeup
hsaPP->schedAQLProcessing(rl_idx);
delete series_ctx;
}
}
}
void
HSAPacketProcessor::schedAQLProcessing(uint32_t rl_idx, Tick delay)
{
RQLEntry *queue = regdQList[rl_idx];
if (!queue->aqlProcessEvent.scheduled()) {
Tick processingTick = curTick() + delay;
schedule(queue->aqlProcessEvent, processingTick);
DPRINTF(HSAPacketProcessor, "AQL processing scheduled at tick: %d\n",
processingTick);
} else {
DPRINTF(HSAPacketProcessor, "AQL processing already scheduled\n");
}
}
void
HSAPacketProcessor::schedAQLProcessing(uint32_t rl_idx)
{
schedAQLProcessing(rl_idx, pktProcessDelay);
}
Q_STATE
HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr)
{
Q_STATE is_submitted = BLOCKED_BPKT;
SignalState *dep_sgnl_rd_st = &(regdQList[rl_idx]->depSignalRdState);
// Dependency signals are not read yet. And this can only be a retry.
// The retry logic will schedule the packet processor wakeup
if (dep_sgnl_rd_st->pendingReads != 0) {
return BLOCKED_BPKT;
}
// `pkt` can be typecasted to any type of AQL packet since they all
// have header information at offset zero
auto disp_pkt = (_hsa_dispatch_packet_t *)pkt;
hsa_packet_type_t pkt_type = PKT_TYPE(disp_pkt);
if (IS_BARRIER(disp_pkt) &&
regdQList[rl_idx]->compltnPending() > 0) {
// If this packet is using the "barrier bit" to enforce ordering with
// previous packets, and if there are outstanding packets, set the
// barrier bit for this queue and block the queue.
DPRINTF(HSAPacketProcessor, "%s: setting barrier bit for active" \
" list ID = %d\n", __FUNCTION__, rl_idx);
regdQList[rl_idx]->setBarrierBit(true);
return BLOCKED_BBIT;
}
if (pkt_type == HSA_PACKET_TYPE_VENDOR_SPECIFIC) {
DPRINTF(HSAPacketProcessor, "%s: submitting vendor specific pkt" \
" active list ID = %d\n", __FUNCTION__, rl_idx);
// Submit packet to HSA device (dispatcher)
gpu_device->submitVendorPkt((void *)disp_pkt, rl_idx, host_pkt_addr);
is_submitted = UNBLOCKED;
} else if (pkt_type == HSA_PACKET_TYPE_KERNEL_DISPATCH) {
DPRINTF(HSAPacketProcessor, "%s: submitting kernel dispatch pkt" \
" active list ID = %d\n", __FUNCTION__, rl_idx);
// Submit packet to HSA device (dispatcher)
gpu_device->submitDispatchPkt((void *)disp_pkt, rl_idx, host_pkt_addr);
is_submitted = UNBLOCKED;
/*
If this packet is using the "barrier bit" to enforce ordering with
subsequent kernels, set the bit for this queue now, after
dispatching.
*/
if (IS_BARRIER(disp_pkt)) {
DPRINTF(HSAPacketProcessor, "%s: setting barrier bit for active" \
" list ID = %d\n", __FUNCTION__, rl_idx);
regdQList[rl_idx]->setBarrierBit(true);
}
} else if (pkt_type == HSA_PACKET_TYPE_BARRIER_AND) {
DPRINTF(HSAPacketProcessor, "%s: Processing barrier packet" \
" active list ID = %d\n", __FUNCTION__, rl_idx);
auto bar_and_pkt = (_hsa_barrier_and_packet_t *)pkt;
bool isReady = true;
// Loop thorugh all the completion signals to see if this barrier
// packet is ready.
for (int i = 0; i < NumSignalsPerBarrier; i++) {
// dep_signal = zero imply no signal connected
if (bar_and_pkt->dep_signal[i]) {
// The signal value is aligned 8 bytes from
// the actual handle in the runtime
uint64_t signal_addr =
(uint64_t) (((uint64_t *) bar_and_pkt->dep_signal[i]) + 1);
hsa_signal_value_t *signal_val =
&(dep_sgnl_rd_st->values[i]);
DPRINTF(HSAPacketProcessor, "%s: Barrier pkt dep sgnl[%d]" \
" , sig addr %x, value %d active list ID = %d\n",
__FUNCTION__, i, signal_addr,
*signal_val, rl_idx);
// The if condition will be executed everytime except the
// very first time this barrier packet is encounteresd.
if (dep_sgnl_rd_st->allRead) {
if (*signal_val != 0) {
// This signal is not yet ready, read it again
isReady = false;
auto cb = new DmaVirtCallback<int64_t>(
[ = ] (const uint32_t &dma_data)
{ dep_sgnl_rd_st->handleReadDMA(); }, 0);
dmaReadVirt(signal_addr, sizeof(hsa_signal_value_t),
cb, signal_val);
dep_sgnl_rd_st->pendingReads++;
DPRINTF(HSAPacketProcessor, "%s: Pending reads %d," \
" active list %d\n", __FUNCTION__,
dep_sgnl_rd_st->pendingReads, rl_idx);
}
} else {
// This signal is not yet ready, read it again
isReady = false;
auto cb = new DmaVirtCallback<int64_t>(
[ = ] (const uint32_t &dma_data)
{ dep_sgnl_rd_st->handleReadDMA(); }, 0);
dmaReadVirt(signal_addr, sizeof(hsa_signal_value_t),
cb, signal_val);
dep_sgnl_rd_st->pendingReads++;
DPRINTF(HSAPacketProcessor, "%s: Pending reads %d," \
" active list %d\n", __FUNCTION__,
dep_sgnl_rd_st->pendingReads, rl_idx);
}
}
}
if (isReady) {
assert(dep_sgnl_rd_st->pendingReads == 0);
DPRINTF(HSAPacketProcessor, "%s: Barrier packet completed" \
" active list ID = %d\n", __FUNCTION__, rl_idx);
// TODO: Completion signal of barrier packet to be
// atomically decremented here
finishPkt((void*)bar_and_pkt, rl_idx);
is_submitted = UNBLOCKED;
// Reset signal values
dep_sgnl_rd_st->resetSigVals();
// The completion signal is connected
if (bar_and_pkt->completion_signal != 0) {
// HACK: The semantics of the HSA signal is to
// decrement the current signal value
// I'm going to cheat here and read out
// the value from main memory using functional
// access, and then just DMA the decremented value.
uint64_t signal_value = gpu_device->functionalReadHsaSignal(\
bar_and_pkt->completion_signal);
DPRINTF(HSAPacketProcessor, "Triggering barrier packet" \
" completion signal! Addr: %x\n",
bar_and_pkt->completion_signal);
gpu_device->updateHsaSignal(bar_and_pkt->completion_signal,
signal_value - 1);
}
}
if (dep_sgnl_rd_st->pendingReads > 0) {
// Atleast one DepSignalsReadDmaEvent is scheduled this cycle
dep_sgnl_rd_st->allRead = false;
dep_sgnl_rd_st->discardRead = false;
}
} else if (pkt_type == HSA_PACKET_TYPE_BARRIER_OR) {
fatal("Unsupported packet type HSA_PACKET_TYPE_BARRIER_OR");
} else if (pkt_type == HSA_PACKET_TYPE_INVALID) {
fatal("Unsupported packet type HSA_PACKET_TYPE_INVALID");
} else if (pkt_type == HSA_PACKET_TYPE_AGENT_DISPATCH) {
DPRINTF(HSAPacketProcessor, "%s: submitting agent dispatch pkt" \
" active list ID = %d\n", __FUNCTION__, rl_idx);
// Submit packet to HSA device (dispatcher)
gpu_device->submitAgentDispatchPkt(
(void *)disp_pkt, rl_idx, host_pkt_addr);
is_submitted = UNBLOCKED;
sendAgentDispatchCompletionSignal((void *)disp_pkt,0);
} else {
fatal("Unsupported packet type %d\n", pkt_type);
}
return is_submitted;
}
// Wakes up every fixed time interval (pktProcessDelay) and processes a single
// packet from the queue that scheduled this wakeup. If there are more
// packets in that queue, the next wakeup is scheduled.
void
HSAPacketProcessor::QueueProcessEvent::process()
{
AQLRingBuffer *aqlRingBuffer = hsaPP->regdQList[rqIdx]->qCntxt.aqlBuf;
DPRINTF(HSAPacketProcessor,
"%s: Qwakeup , rdIdx %d, wrIdx %d," \
" dispIdx %d, active list ID = %d\n",
__FUNCTION__, aqlRingBuffer->rdIdx(),
aqlRingBuffer->wrIdx(), aqlRingBuffer->dispIdx(), rqIdx);
// If barrier bit is set, then this wakeup is a dummy wakeup
// just to model the processing time. Do nothing.
if (hsaPP->regdQList[rqIdx]->getBarrierBit()) {
DPRINTF(HSAPacketProcessor,
"Dummy wakeup with barrier bit for rdIdx %d\n", rqIdx);
return;
}
// In the future, we may support batch processing of packets.
// Then, we can just remove the break statements and the code
// will support batch processing. That is why we are using a
// "while loop" here instead on an "if" condition.
while (hsaPP->regdQList[rqIdx]->dispPending()) {
void *pkt = aqlRingBuffer->ptr(aqlRingBuffer->dispIdx());
DPRINTF(HSAPacketProcessor, "%s: Attempting dispatch @ dispIdx[%d]\n",
__FUNCTION__, aqlRingBuffer->dispIdx());
Addr host_addr = aqlRingBuffer->hostDispAddr();
Q_STATE q_state = hsaPP->processPkt(pkt, rqIdx, host_addr);
if (q_state == UNBLOCKED) {
aqlRingBuffer->incDispIdx(1);
DPRINTF(HSAPacketProcessor, "%s: Increment dispIdx[%d]\n",
__FUNCTION__, aqlRingBuffer->dispIdx());
if (hsaPP->regdQList[rqIdx]->dispPending()) {
hsaPP->schedAQLProcessing(rqIdx);
}
break;
} else if (q_state == BLOCKED_BPKT) {
// This queue is blocked by barrier packet,
// schedule a processing event
hsaPP->schedAQLProcessing(rqIdx);
break;
} else if (q_state == BLOCKED_BBIT) {
// This queue is blocked by barrier bit, and processing event
// should be scheduled from finishPkt(). However, to elapse
// "pktProcessDelay" processing time, let us schedule a dummy
// wakeup once which will just wakeup and will do nothing.
hsaPP->schedAQLProcessing(rqIdx);
break;
} else {
panic("Unknown queue state\n");
}
}
}
void
HSAPacketProcessor::SignalState::handleReadDMA()
{
assert(pendingReads > 0);
pendingReads--;
if (pendingReads == 0) {
allRead = true;
if (discardRead) {
resetSigVals();
}
}
}
void
HSAPacketProcessor::getCommandsFromHost(int pid, uint32_t rl_idx)
{
HSAQueueDescriptor* qDesc = regdQList[rl_idx]->qCntxt.qDesc;
AQLRingBuffer *aqlRingBuffer = regdQList[rl_idx]->qCntxt.aqlBuf;
DPRINTF(HSAPacketProcessor,
"%s: read-pointer offset[0x%x], write-pointer offset[0x%x]"
" doorbell(%d)[0x%x] \n",
__FUNCTION__, qDesc->readIndex,
qDesc->writeIndex, pid, qDesc->doorbellPointer);
if (qDesc->dmaInProgress) {
// we'll try again when this dma transfer completes in updateReadIndex
return;
}
uint32_t num_umq = qDesc->spaceUsed();
if (num_umq == 0)
return; // nothing to be gotten
uint32_t umq_nxt = qDesc->readIndex;
// Total AQL buffer size
uint32_t ttl_aql_buf = aqlRingBuffer->numObjs();
// Available AQL buffer size. If the available buffer is less than
// demanded, number of available buffer is returned
uint32_t got_aql_buf = aqlRingBuffer->allocEntry(num_umq);
qDesc->readIndex += got_aql_buf;
uint32_t dma_start_ix = (aqlRingBuffer->wrIdx() - got_aql_buf) %
ttl_aql_buf;
dma_series_ctx *series_ctx = NULL;
DPRINTF(HSAPacketProcessor, "%s: umq_nxt = %d, ttl_aql_buf = %d, "
"dma_start_ix = %d, num_umq = %d\n", __FUNCTION__, umq_nxt,
ttl_aql_buf, dma_start_ix, num_umq);
if (got_aql_buf == 0) {
// we'll try again when some dma bufs are freed in freeEntry
qDesc->stalledOnDmaBufAvailability = true;
return;
} else {
qDesc->stalledOnDmaBufAvailability = false;
}
uint32_t dma_b4_wrap = ttl_aql_buf - dma_start_ix;
while (got_aql_buf != 0 && num_umq != 0) {
uint32_t umq_b4_wrap = qDesc->numObjs() -
(umq_nxt % qDesc->objSize());
uint32_t num_2_xfer
= std::min({umq_b4_wrap, dma_b4_wrap, num_umq, got_aql_buf});
if (!series_ctx) {
qDesc->dmaInProgress = true;
series_ctx = new dma_series_ctx(got_aql_buf, got_aql_buf,
dma_start_ix, rl_idx);
}
void *aql_buf = aqlRingBuffer->ptr(dma_start_ix);
auto cb = new DmaVirtCallback<uint64_t>(
[ = ] (const uint32_t &dma_data)
{ this->cmdQueueCmdDma(this, pid, true, dma_start_ix,
num_2_xfer, series_ctx, aql_buf); }, 0);
dmaReadVirt(qDesc->ptr(umq_nxt), num_2_xfer * qDesc->objSize(),
cb, aql_buf);
aqlRingBuffer->saveHostDispAddr(qDesc->ptr(umq_nxt), num_2_xfer,
dma_start_ix);
DPRINTF(HSAPacketProcessor,
"%s: aql_buf = %p, umq_nxt = %d, dma_ix = %d, num2xfer = %d\n",
__FUNCTION__, aql_buf, umq_nxt, dma_start_ix, num_2_xfer);
num_umq -= num_2_xfer;
got_aql_buf -= num_2_xfer;
dma_start_ix = (dma_start_ix + num_2_xfer) % ttl_aql_buf;
umq_nxt = (umq_nxt + num_2_xfer) % qDesc->numObjs();
if (got_aql_buf == 0 && num_umq != 0) {
// There are more packets in the queue but
// not enough DMA buffers. Set the stalledOnDmaBufAvailability,
// we will try again in freeEntry
qDesc->stalledOnDmaBufAvailability = true;
}
}
}
void
HSAPacketProcessor::displayQueueDescriptor(int pid, uint32_t rl_idx)
{
GEM5_VAR_USED HSAQueueDescriptor* qDesc = regdQList[rl_idx]->qCntxt.qDesc;
DPRINTF(HSAPacketProcessor,
"%s: pid[%d], basePointer[0x%lx], dBPointer[0x%lx], "
"writeIndex[0x%x], readIndex[0x%x], size(bytes)[0x%x]\n",
__FUNCTION__, pid, qDesc->basePointer,
qDesc->doorbellPointer, qDesc->writeIndex,
qDesc->readIndex, qDesc->numElts);
}
AQLRingBuffer::AQLRingBuffer(uint32_t size,
const std::string name)
: _name(name), _wrIdx(0), _rdIdx(0), _dispIdx(0)
{
_aqlBuf.resize(size);
_aqlComplete.resize(size);
_hostDispAddresses.resize(size);
// Mark all packets as invalid and incomplete
for (auto& it : _aqlBuf)
it.header = HSA_PACKET_TYPE_INVALID;
std::fill(_aqlComplete.begin(), _aqlComplete.end(), false);
}
bool
AQLRingBuffer::freeEntry(void *pkt)
{
_aqlComplete[(hsa_kernel_dispatch_packet_t *) pkt - _aqlBuf.data()] = true;
DPRINTF(HSAPacketProcessor, "%s: pkt_ix = %d; "\
" # free entries = %d, wrIdx = %d, rdIdx = %d\n", __FUNCTION__,
(hsa_kernel_dispatch_packet_t *) pkt - _aqlBuf.data(),
nFree(), wrIdx(), rdIdx());
// Packets can complete out-of-order. This code "retires" packets in-order
// by updating the read pointer in the MQD when a contiguous chunk of
// packets have finished.
uint32_t old_rdIdx = rdIdx();
while (_aqlComplete[rdIdx() % numObjs()]) {
_aqlComplete[rdIdx() % numObjs()] = false;
_aqlBuf[rdIdx() % numObjs()].header = HSA_PACKET_TYPE_INVALID;
incRdIdx(1);
}
return (old_rdIdx != rdIdx());
}
void
HSAPacketProcessor::setDevice(GPUCommandProcessor *dev)
{
this->gpu_device = dev;
}
int
AQLRingBuffer::allocEntry(uint32_t nBufReq)
{
DPRINTF(HSAPacketProcessor, "%s: nReq = %d\n", __FUNCTION__, nBufReq);
if (nFree() == 0) {
DPRINTF(HSAPacketProcessor, "%s: return = %d\n", __FUNCTION__, 0);
return 0;
}
if (nBufReq > nFree())
nBufReq = nFree();
DPRINTF(HSAPacketProcessor, "%s: ix1stFree = %d\n", __FUNCTION__, wrIdx());
incWrIdx(nBufReq);
DPRINTF(HSAPacketProcessor, "%s: return = %d, wrIdx = %d\n",
__FUNCTION__, nBufReq, wrIdx());
return nBufReq;
}
void
HSAPacketProcessor::finishPkt(void *pvPkt, uint32_t rl_idx)
{
HSAQueueDescriptor* qDesc = regdQList[rl_idx]->qCntxt.qDesc;
// if barrier bit was set and this is the last
// outstanding packet from that queue,
// unset it here
if (regdQList[rl_idx]->getBarrierBit() &&
regdQList[rl_idx]->isLastOutstandingPkt()) {
DPRINTF(HSAPacketProcessor,
"Unset barrier bit for active list ID %d\n", rl_idx);
regdQList[rl_idx]->setBarrierBit(false);
// if pending kernels in the queue after this kernel, reschedule
if (regdQList[rl_idx]->dispPending()) {
DPRINTF(HSAPacketProcessor,
"Rescheduling active list ID %d after unsetting barrier "
"bit\n", rl_idx);
schedAQLProcessing(rl_idx);
}
}
// If set, then blocked schedule, so need to reschedule
if (regdQList[rl_idx]->qCntxt.aqlBuf->freeEntry(pvPkt))
updateReadIndex(0, rl_idx);
DPRINTF(HSAPacketProcessor,
"%s: rd-ptr offset [0x%x], wr-ptr offset [0x%x], space used = %d," \
" q size = %d, stalled = %s, empty = %s, active list ID = %d\n",
__FUNCTION__, qDesc->readIndex, qDesc->writeIndex,
qDesc->spaceUsed(), qDesc->numElts,
qDesc->stalledOnDmaBufAvailability? "true" : "false",
qDesc->isEmpty()? "true" : "false", rl_idx);
// DMA buffer is freed, check the queue to see if there are DMA
// accesses blocked becasue of non-availability of DMA buffer
if (qDesc->stalledOnDmaBufAvailability) {
assert(!qDesc->isEmpty());
getCommandsFromHost(0, rl_idx); // TODO:assign correct pid
// when implementing
// multi-process support
}
}
void
HSAPacketProcessor::sendAgentDispatchCompletionSignal(
void *pkt, hsa_signal_value_t signal)
{
auto agent_pkt = (_hsa_agent_dispatch_packet_t *)pkt;
uint64_t signal_addr =
(uint64_t) (((uint64_t *)agent_pkt->completion_signal) + 1);
DPRINTF(HSAPacketProcessor, "Triggering Agent Dispatch packet" \
" completion signal: %x!\n", signal_addr);
/**
* HACK: The semantics of the HSA signal is to
* decrement the current signal value.
* I'm going to cheat here and read out
* the value from main memory using functional
* access, and then just DMA the decremented value.
* The reason for this is that the DMASequencer does
* not support atomic operations.
*/
VPtr<uint64_t> prev_signal(signal_addr, sys->threads[0]);
DPRINTF(HSAPacketProcessor,"HSADriver: Sending signal to %lu\n",
(uint64_t)sys->threads[0]->cpuId());
hsa_signal_value_t *new_signal = new hsa_signal_value_t;
*new_signal = (hsa_signal_value_t) *prev_signal - 1;
dmaWriteVirt(signal_addr, sizeof(hsa_signal_value_t), nullptr, new_signal, 0);
}
void
HSAPacketProcessor::sendCompletionSignal(hsa_signal_value_t signal)
{
uint64_t signal_addr = (uint64_t) (((uint64_t *)signal) + 1);
DPRINTF(HSAPacketProcessor, "Triggering completion signal: %x!\n",
signal_addr);
/**
* HACK: The semantics of the HSA signal is to
* decrement the current signal value.
* I'm going to cheat here and read out
* the value from main memory using functional
* access, and then just DMA the decremented value.
* The reason for this is that the DMASequencer does
* not support atomic operations.
*/
VPtr<uint64_t> prev_signal(signal_addr, sys->threads[0]);
hsa_signal_value_t *new_signal = new hsa_signal_value_t;
*new_signal = (hsa_signal_value_t) *prev_signal - 1;
dmaWriteVirt(signal_addr, sizeof(hsa_signal_value_t), nullptr, new_signal, 0);
}
} // namespace gem5