/*
* Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __DEV_HSA_HSA_PACKET_PROCESSOR__
#define __DEV_HSA_HSA_PACKET_PROCESSOR__
#include <algorithm>
#include <cstdint>
#include <vector>
#include "base/types.hh"
#include "debug/HSAPacketProcessor.hh"
#include "dev/dma_virt_device.hh"
#include "dev/hsa/hsa.h"
#include "dev/hsa/hsa_queue.hh"
#include "enums/GfxVersion.hh"
#include "params/HSAPacketProcessor.hh"
#include "sim/eventq.hh"
#define AQL_PACKET_SIZE 64
#define PAGE_SIZE 4096
#define NUM_DMA_BUFS 16
#define DMA_BUF_SIZE (AQL_PACKET_SIZE * NUM_DMA_BUFS)
// HSA runtime supports only 5 signals per barrier packet
#define NumSignalsPerBarrier 5
namespace gem5
{
class AMDGPUDevice;
// Ideally, each queue should store this status, and
// processPkt() should make its decisions based on that
// status variable.
enum Q_STATE
{
UNBLOCKED = 0, // Unblocked queue, can submit packets.
BLOCKED_BBIT, // Queue blocked by barrier bit.
// Can submit packets after the
// previous packet completes.
BLOCKED_BPKT, // Queue blocked by barrier packet.
// Can submit packets after the
// barrier packet completes.
};
class GPUCommandProcessor;
class HWScheduler;
// Our internal representation of an HSA queue
class HSAQueueDescriptor
{
public:
uint64_t basePointer;
uint64_t doorbellPointer;
uint64_t writeIndex;
uint64_t readIndex;
uint32_t numElts;
uint64_t hostReadIndexPtr;
bool stalledOnDmaBufAvailability;
bool dmaInProgress;
GfxVersion gfxVersion;
HSAQueueDescriptor(uint64_t base_ptr, uint64_t db_ptr,
uint64_t hri_ptr, uint32_t size,
GfxVersion gfxVersion)
: basePointer(base_ptr), doorbellPointer(db_ptr),
writeIndex(0), readIndex(0),
numElts(size / AQL_PACKET_SIZE), hostReadIndexPtr(hri_ptr),
stalledOnDmaBufAvailability(false),
dmaInProgress(false), gfxVersion(gfxVersion)
{ }
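// Note: writeIndex and readIndex increase monotonically (ptr() applies
// the wrap), so their difference is the current occupancy. For example,
// with numElts = 8, writeIndex = 10, and readIndex = 6, spaceUsed()
// returns 4 and spaceRemaining() returns 4.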
uint64_t spaceRemaining() { return numElts - (writeIndex - readIndex); }
uint64_t spaceUsed() { return writeIndex - readIndex; }
uint32_t objSize() { return AQL_PACKET_SIZE; }
uint32_t numObjs() { return numElts; }
bool isFull() { return spaceRemaining() == 0; }
bool isEmpty() { return spaceRemaining() == numElts; }
uint64_t ptr(uint64_t ix)
{
/*
* Based on ROCm Documentation:
* - https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/
10ca0a99bbd0252f5bf6f08d1503e59f1129df4a/ROCm_Libraries/
rocr/src/core/runtime/amd_aql_queue.cpp#L99
* - https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/
10ca0a99bbd0252f5bf6f08d1503e59f1129df4a/ROCm_Libraries/
rocr/src/core/runtime/amd_aql_queue.cpp#L624
*
* GFX7 and GFX8 will allocate twice as much space for their HSA
* queues as they actually access (using mod operations to map the
* virtual addresses from the upper half of the queue to the same
* virtual addresses as the lower half). Thus, we need to check if
* the ISA is GFX8 and mod the address by half of the queue size if
* so.
*/
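/*
* Example (hypothetical values): a gfx803 queue created with
* size = 512 bytes has numElts = 8, of which only the lower 4
* slots are actually addressed. ptr(5) therefore returns
* basePointer + ((5 % 4) * 64) = basePointer + 64, i.e., the
* same slot as ptr(1).
*/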
uint64_t retAddr = 0ll;
if ((gfxVersion == GfxVersion::gfx801) ||
(gfxVersion == GfxVersion::gfx803)) {
retAddr = basePointer + ((ix % (numElts/2)) * objSize());
DPRINTF(HSAPacketProcessor, "ptr() gfx8: base: 0x%x, "
"index: 0x%x, numElts: 0x%x, numElts/2: 0x%x, "
"objSize: 0x%x, retAddr: 0x%x\n", basePointer, ix,
numElts, numElts/2, objSize(), retAddr);
} else {
retAddr = basePointer + ((ix % numElts) * objSize());
DPRINTF(HSAPacketProcessor, "ptr() gfx9: base: 0x%x, "
"index: 0x%x, numElts: 0x%x, objSize: 0x%x, "
"retAddr: 0x%x\n", basePointer, ix, numElts, objSize(),
retAddr);
}
return retAddr;
}
};
/**
* Internal ring buffer which is used to prefetch/store copies of the
* in-memory HSA ring buffer. Each packet in the queue has three implicit
* states tracked by a packet's relative location to the write, read, and
* dispatch pointers.
*
* FREE: Entry is empty
* ALLOCATED: Entry has been allocated for a packet, but the DMA has not
* yet completed
* SUBMITTED: Packet has been submitted to the GPUCommandProcessor, but has not
* yet completed
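*
* A slot's state can be inferred from the relative order of the
* indices (each increases monotonically; wrapping is applied only
* on access):
* [_rdIdx, _dispIdx) -> SUBMITTED
* [_dispIdx, _wrIdx) -> ALLOCATED
* all other slots -> FREE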
*/
class AQLRingBuffer
{
private:
std::vector<hsa_kernel_dispatch_packet_t> _aqlBuf;
std::string _name;
std::vector<Addr> _hostDispAddresses;
std::vector<bool> _aqlComplete;
uint64_t _wrIdx; // Points to next write location
uint64_t _rdIdx; // Read pointer of AQL buffer
uint64_t _dispIdx; // Dispatch pointer of AQL buffer
public:
std::string name() {return _name;}
AQLRingBuffer(uint32_t size, const std::string name);
int allocEntry(uint32_t nBufReq);
bool freeEntry(void *pkt);
/**
* The kernel may try to read from the dispatch packet,
* so we need to keep the host address that corresponds
* to each of the dispatch packets this AQL buffer is
* storing. When we call submitPkt(), we send along the
* corresponding host address for the packet so the
* wavefront can properly initialize its SGPRs, which
* may include a pointer to the dispatch packet.
*/
void
saveHostDispAddr(Addr host_pkt_addr, int num_pkts, int ix)
{
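// Record the host-side address of each consecutive packet slot,
// wrapping modulo the ring size.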
for (int i = 0; i < num_pkts; ++i) {
_hostDispAddresses[ix % numObjs()] = host_pkt_addr + i * objSize();
++ix;
}
}
Addr
hostDispAddr() const
{
return _hostDispAddresses[dispIdx() % numObjs()];
}
bool
dispPending() const
{
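// Extract the packet-type bitfield from the AQL header; a slot
// whose type is still HSA_PACKET_TYPE_INVALID has not yet been
// filled by DMA, even if _dispIdx is behind _wrIdx.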
int packet_type = (_aqlBuf[_dispIdx % _aqlBuf.size()].header
>> HSA_PACKET_HEADER_TYPE) &
((1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1);
return (_dispIdx < _wrIdx) && packet_type != HSA_PACKET_TYPE_INVALID;
}
/**
* Packets aren't guaranteed to be completed in-order, and we need
* to know when the last packet is finished in order to un-set
* the barrier bit. In order to confirm if the packet at _rdIdx
* is the last packet, we check if the packets ahead of _rdIdx
* are finished. If they are, _rdIdx is the last packet. If not,
* there are other outstanding packets.
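*
* For example, with _rdIdx = 3 and _dispIdx = 6, this returns true
* only if slots 4 and 5 have completed while slot 3 itself has not.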
*/
bool
isLastOutstandingPkt() const
{
for (int i = _rdIdx + 1; i < _dispIdx; i++) {
if (!_aqlComplete[i % _aqlBuf.size()]) {
return false;
}
}
return !_aqlComplete[_rdIdx % _aqlBuf.size()] && _rdIdx != _dispIdx;
}
uint32_t nFree() const { return _aqlBuf.size() - (_wrIdx - _rdIdx); }
void *ptr(uint32_t ix) { return _aqlBuf.data() + (ix % _aqlBuf.size()); }
uint32_t numObjs() const { return _aqlBuf.size(); }
uint32_t objSize() const { return AQL_PACKET_SIZE; }
uint64_t dispIdx() const { return _dispIdx; }
uint64_t wrIdx() const { return _wrIdx; }
uint64_t rdIdx() const { return _rdIdx; }
uint64_t* rdIdxPtr() { return &_rdIdx; }
void incRdIdx(uint64_t value) { _rdIdx += value; }
void incWrIdx(uint64_t value) { _wrIdx += value; }
void incDispIdx(uint64_t value) { _dispIdx += value; }
uint64_t compltnPending() { return (_dispIdx - _rdIdx); }
void setRdIdx(uint64_t value);
void setWrIdx(uint64_t value);
void setDispIdx(uint64_t value);
};
struct QCntxt
{
HSAQueueDescriptor* qDesc;
AQLRingBuffer* aqlBuf;
// used for HSA packets that enforce synchronization with barrier bit
bool barrierBit;
QCntxt(HSAQueueDescriptor* q_desc, AQLRingBuffer* aql_buf) :
qDesc(q_desc), aqlBuf(aql_buf), barrierBit(false)
{}
QCntxt() : qDesc(nullptr), aqlBuf(nullptr), barrierBit(false) {}
};
class HSAPacketProcessor: public DmaVirtDevice
{
friend class HWScheduler;
protected:
typedef void (DmaDevice::*DmaFnPtr)(Addr, int, Event*, uint8_t*, Tick);
GPUCommandProcessor *gpu_device;
HWScheduler *hwSchdlr;
AMDGPUDevice *gpuDevice;
VegaISA::Walker *walker;
// Structure to store the read values of dependency signals
// from shared memory. Also used for tracking the status of
// those reads while they are in progress
class SignalState
{
public:
SignalState()
: pendingReads(0), allRead(false), discardRead(false)
{
values.resize(NumSignalsPerBarrier);
}
void handleReadDMA();
int pendingReads;
bool allRead;
// If this queue is unmapped while there are pending reads,
// the pending reads have to be discarded.
bool discardRead;
// values stores the values of the dependency signals
// that have already been read
std::vector<hsa_signal_value_t> values;
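// Reset each value to a non-zero sentinel so every dependency must
// be re-read before it is treated as satisfied (per HSA barrier
// semantics, a dependency is satisfied once its signal reads 0).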
void
resetSigVals()
{
std::fill(values.begin(), values.end(), 1);
}
};
class QueueProcessEvent : public Event
{
private:
HSAPacketProcessor *hsaPP;
uint32_t rqIdx;
public:
QueueProcessEvent(HSAPacketProcessor *_hsaPP, uint32_t _rqIdx)
: Event(Default_Pri), hsaPP(_hsaPP), rqIdx(_rqIdx)
{}
virtual void process();
virtual const char *description() const;
};
// Registered queue list entry; each entry has one queueDescriptor
// and an associated AQL buffer
class RQLEntry
{
public:
RQLEntry(HSAPacketProcessor *hsaPP, uint32_t rqIdx)
: aqlProcessEvent(hsaPP, rqIdx) {}
QCntxt qCntxt;
bool dispPending() { return qCntxt.aqlBuf->dispPending(); }
uint64_t compltnPending() { return qCntxt.aqlBuf->compltnPending(); }
SignalState depSignalRdState;
QueueProcessEvent aqlProcessEvent;
void setBarrierBit(bool set_val) { qCntxt.barrierBit = set_val; }
bool getBarrierBit() const { return qCntxt.barrierBit; }
bool isLastOutstandingPkt() const
{
return qCntxt.aqlBuf->isLastOutstandingPkt();
}
};
// Keeps track of queueDescriptors of registered queues
std::vector<class RQLEntry *> regdQList;
Q_STATE processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr);
void displayQueueDescriptor(int pid, uint32_t rl_idx);
public:
HSAQueueDescriptor*
getQueueDesc(uint32_t queId)
{
return regdQList.at(queId)->qCntxt.qDesc;
}
class RQLEntry*
getRegdListEntry(uint32_t queId)
{
return regdQList.at(queId);
}
uint64_t
inFlightPkts(uint32_t queId)
{
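// Packets dispatched to the command processor but not yet retired,
// i.e., those that _dispIdx has advanced past while _rdIdx has not.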
auto aqlBuf = regdQList.at(queId)->qCntxt.aqlBuf;
return aqlBuf->dispIdx() - aqlBuf->rdIdx();
}
int numHWQueues;
Addr pioAddr;
Addr pioSize;
Tick pioDelay;
const Tick pktProcessDelay;
typedef HSAPacketProcessorParams Params;
HSAPacketProcessor(const Params &p);
~HSAPacketProcessor();
TranslationGenPtr translate(Addr vaddr, Addr size) override;
void setDeviceQueueDesc(uint64_t hostReadIndexPointer,
uint64_t basePointer,
uint64_t queue_id,
uint32_t size, int doorbellSize,
GfxVersion gfxVersion,
Addr offset = 0, uint64_t rd_idx = 0);
void unsetDeviceQueueDesc(uint64_t queue_id, int doorbellSize);
void setDevice(GPUCommandProcessor * dev);
void setGPUDevice(AMDGPUDevice *gpu_device);
void updateReadIndex(int, uint32_t);
void getCommandsFromHost(int pid, uint32_t rl_idx);
HWScheduler *hwScheduler() { return hwSchdlr; }
// PIO interface
virtual Tick read(Packet*) override;
virtual Tick write(Packet*) override;
virtual AddrRangeList getAddrRanges() const override;
void finishPkt(void *pkt, uint32_t rl_idx);
void finishPkt(void *pkt) { finishPkt(pkt, 0); }
void schedAQLProcessing(uint32_t rl_idx);
void schedAQLProcessing(uint32_t rl_idx, Tick delay);
void sendAgentDispatchCompletionSignal(void *pkt,
hsa_signal_value_t signal);
void sendCompletionSignal(hsa_signal_value_t signal);
/**
* Calls getCurrentEntry once the queueEntry has been dmaRead.
*/
struct dma_series_ctx
{
// deal with the fact dma ops can complete out of issue order
uint32_t pkts_ttl; // total number of packets in this DMA series
uint32_t pkts_2_go; // packets in the series still awaiting DMA completion
uint32_t start_ix; // AQL buffer index of the first packet in the series
uint32_t rl_idx; // index into the registered queue list
dma_series_ctx(uint32_t _pkts_ttl,
uint32_t _pkts_2_go,
uint32_t _start_ix,
uint32_t _rl_idx)
: pkts_ttl(_pkts_ttl), pkts_2_go(_pkts_2_go),
start_ix(_start_ix), rl_idx(_rl_idx)
{}
~dma_series_ctx() {}
};
void updateReadDispIdDma();
void cmdQueueCmdDma(HSAPacketProcessor *hsaPP, int pid, bool isRead,
uint32_t ix_start, unsigned num_pkts,
dma_series_ctx *series_ctx, void *dest_4debug);
void handleReadDMA();
};
} // namespace gem5
#endif // __DEV_HSA_HSA_PACKET_PROCESSOR__