| /* |
| * Copyright (c) 2015-2018 Advanced Micro Devices, Inc. |
| * All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright notice, |
| * this list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * |
| * 3. Neither the name of the copyright holder nor the names of its |
| * contributors may be used to endorse or promote products derived from this |
| * software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| * POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #ifndef __DEV_HSA_HSA_PACKET_PROCESSOR__ |
| #define __DEV_HSA_HSA_PACKET_PROCESSOR__ |
| |
| #include <algorithm> |
| #include <cstdint> |
| #include <vector> |
| |
| #include "base/types.hh" |
| #include "debug/HSAPacketProcessor.hh" |
| #include "dev/dma_virt_device.hh" |
| #include "dev/hsa/hsa.h" |
| #include "dev/hsa/hsa_queue.hh" |
| #include "enums/GfxVersion.hh" |
| #include "params/HSAPacketProcessor.hh" |
| #include "sim/eventq.hh" |
| |
| #define AQL_PACKET_SIZE 64 |
| #define PAGE_SIZE 4096 |
| #define NUM_DMA_BUFS 16 |
| #define DMA_BUF_SIZE (AQL_PACKET_SIZE * NUM_DMA_BUFS) |
| // HSA runtime supports only 5 signals per barrier packet |
| #define NumSignalsPerBarrier 5 |
| |
| namespace gem5 |
| { |
| |
| class AMDGPUDevice; |
| |
| namespace VegaISA |
| { |
| class Walker; |
| } // namespace VegaISA |
| |
| // Ideally, each queue should store this status, and |
| // processPkt() should make its decisions based on that |
| // status variable. |
| enum Q_STATE |
| { |
| UNBLOCKED = 0, // Unblocked queue, can submit packets. |
| BLOCKED_BBIT, // Queue blocked by barrier bit. |
| // Can submit packets after the |
| // previous packet completes. |
| BLOCKED_BPKT, // Queue blocked by barrier packet. |
| // Can submit packets after the |
| // barrier packet completes. |
| }; |
| |
| class GPUCommandProcessor; |
| class HWScheduler; |
| |
| // Our internal representation of an HSA queue |
| class HSAQueueDescriptor |
| { |
| public: |
| uint64_t basePointer; |
| uint64_t doorbellPointer; |
| uint64_t writeIndex; |
| uint64_t readIndex; |
| uint32_t numElts; |
| uint64_t hostReadIndexPtr; |
| bool stalledOnDmaBufAvailability; |
| bool dmaInProgress; |
| GfxVersion gfxVersion; |
| |
| HSAQueueDescriptor(uint64_t base_ptr, uint64_t db_ptr, |
| uint64_t hri_ptr, uint32_t size, |
| GfxVersion gfxVersion) |
| : basePointer(base_ptr), doorbellPointer(db_ptr), |
| writeIndex(0), readIndex(0), |
| numElts(size / AQL_PACKET_SIZE), hostReadIndexPtr(hri_ptr), |
| stalledOnDmaBufAvailability(false), |
| dmaInProgress(false), gfxVersion(gfxVersion) |
| { } |
| uint64_t spaceRemaining() { return numElts - (writeIndex - readIndex); } |
| uint64_t spaceUsed() { return writeIndex - readIndex; } |
| uint32_t objSize() { return AQL_PACKET_SIZE; } |
| uint32_t numObjs() { return numElts; } |
| bool isFull() { return spaceRemaining() == 0; } |
| bool isEmpty() { return spaceRemaining() == numElts; } |
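| // Note that writeIndex and readIndex increase monotonically; they are |
| // only wrapped modulo the queue size when ptr() computes an address. |
| // Illustrative values: numElts = 64, writeIndex = 70, readIndex = 10 |
| // gives spaceUsed() = 60 and spaceRemaining() = 4. |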
| |
| uint64_t ptr(uint64_t ix) |
| { |
| /* |
| * Based on ROCm Documentation: |
| * - https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/ |
| 10ca0a99bbd0252f5bf6f08d1503e59f1129df4a/ROCm_Libraries/ |
| rocr/src/core/runtime/amd_aql_queue.cpp#L99 |
| * - https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/ |
| 10ca0a99bbd0252f5bf6f08d1503e59f1129df4a/ROCm_Libraries/ |
| rocr/src/core/runtime/amd_aql_queue.cpp#L624 |
| * |
| * GFX7 and GFX8 will allocate twice as much space for their HSA |
| * queues as they actually access (using mod operations to map the |
| * virtual addresses from the upper half of the queue to the same |
| * virtual addresses as the lower half). Thus, we need to check if |
| * the ISA is GFX8 and mod the address by half of the queue size if |
| * so. |
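| * |
| * Worked example (illustrative values): for a 4 KiB gfx803 queue, |
| * numElts = 4096 / 64 = 64, so only slots [0, 32) are addressed and |
| * ix = 40 maps to basePointer + (40 % 32) * 64. |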
| */ |
| uint64_t retAddr = 0ll; |
| if ((gfxVersion == GfxVersion::gfx801) || |
| (gfxVersion == GfxVersion::gfx803)) { |
| retAddr = basePointer + ((ix % (numElts/2)) * objSize()); |
| DPRINTF(HSAPacketProcessor, "ptr() gfx8: base: 0x%x, " |
| "index: 0x%x, numElts: 0x%x, numElts/2: 0x%x, " |
| "objSize: 0x%x, retAddr: 0x%x\n", basePointer, ix, |
| numElts, numElts/2, objSize(), retAddr); |
| } else { |
| retAddr = basePointer + ((ix % numElts) * objSize()); |
| DPRINTF(HSAPacketProcessor, "ptr() gfx9: base: 0x%x, " |
| "index: 0x%x, numElts: 0x%x, objSize: 0x%x, " |
| "retAddr: 0x%x\n", basePointer, ix, numElts, objSize(), |
| retAddr); |
| } |
| return retAddr; |
| } |
| }; |
| |
| /** |
| * Internal ring buffer which is used to prefetch/store copies of the |
| * in-memory HSA ring buffer. Each packet in the queue is in one of three |
| * implicit states, tracked by the packet's position relative to the |
| * write, read, and dispatch pointers. |
| * |
| * FREE: Entry is empty |
| * ALLOCATED: Entry has been allocated for a packet, but the DMA has not |
| * yet completed |
| * SUBMITTED: Packet has been submitted to the GPUCommandProcessor, but has not |
| * yet completed |
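| * |
| * Illustrative snapshot: with _rdIdx = 2, _dispIdx = 5, and _wrIdx = 7, |
| * entries [2, 5) are SUBMITTED, entries [5, 7) are ALLOCATED, and the |
| * remaining slots are FREE. |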
| */ |
| class AQLRingBuffer |
| { |
| private: |
| std::vector<hsa_kernel_dispatch_packet_t> _aqlBuf; |
| std::string _name; |
| std::vector<Addr> _hostDispAddresses; |
| std::vector<bool> _aqlComplete; |
| uint64_t _wrIdx; // Points to next write location |
| uint64_t _rdIdx; // Read pointer of AQL buffer |
| uint64_t _dispIdx; // Dispatch pointer of AQL buffer |
| |
| public: |
| std::string name() { return _name; } |
| AQLRingBuffer(uint32_t size, const std::string name); |
| int allocEntry(uint32_t nBufReq); |
| bool freeEntry(void *pkt); |
| |
| /** |
| * The kernel may try to read from the dispatch packet, so we need to |
| * keep the host address that corresponds to each of the dispatch |
| * packets this AQL buffer is storing. When we call submitPkt(), we |
| * send along the corresponding host address for the packet so the |
| * wavefront can properly initialize its SGPRs, which may include a |
| * pointer to the dispatch packet. |
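| * |
| * For example (illustrative values): saveHostDispAddr(0x1000, 2, 6) |
| * records host address 0x1000 for slot 6 % numObjs() and |
| * 0x1000 + objSize() = 0x1040 for slot 7 % numObjs(). |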
| */ |
| void |
| saveHostDispAddr(Addr host_pkt_addr, int num_pkts, int ix) |
| { |
| for (int i = 0; i < num_pkts; ++i) { |
| _hostDispAddresses[ix % numObjs()] = host_pkt_addr + i * objSize(); |
| ++ix; |
| } |
| } |
| |
| Addr |
| hostDispAddr() const |
| { |
| return _hostDispAddresses[dispIdx() % numObjs()]; |
| } |
| |
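| /** |
| * Returns true if the packet at the dispatch index has been written |
| * by the host (its type field is no longer INVALID) and has not yet |
| * been dispatched. |
| */ |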
| bool |
| dispPending() const |
| { |
| int packet_type = (_aqlBuf[_dispIdx % _aqlBuf.size()].header |
| >> HSA_PACKET_HEADER_TYPE) & |
| ((1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1); |
| return (_dispIdx < _wrIdx) && packet_type != HSA_PACKET_TYPE_INVALID; |
| } |
| |
| /** |
| * Packets aren't guaranteed to complete in order, and we need to know |
| * when the last packet has finished so that we can unset the barrier |
| * bit. To confirm that the packet at _rdIdx is the last outstanding |
| * packet, we check whether the packets ahead of _rdIdx have finished. |
| * If they have, _rdIdx is the last packet; if not, there are other |
| * outstanding packets. |
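| * |
| * Illustrative snapshot: with _rdIdx = 3 and _dispIdx = 6, if entries |
| * 4 and 5 have completed but entry 3 has not, then entry 3 is the |
| * last outstanding packet and this method returns true. |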
| */ |
| bool |
| isLastOutstandingPkt() const |
| { |
| for (uint64_t i = _rdIdx + 1; i < _dispIdx; i++) { |
| if (!_aqlComplete[i % _aqlBuf.size()]) { |
| return false; |
| } |
| } |
| return !_aqlComplete[_rdIdx % _aqlBuf.size()] && _rdIdx != _dispIdx; |
| } |
| |
| uint32_t nFree() const { return _aqlBuf.size() - (_wrIdx - _rdIdx); } |
| void *ptr(uint32_t ix) { return _aqlBuf.data() + (ix % _aqlBuf.size()); } |
| uint32_t numObjs() const { return _aqlBuf.size(); } |
| uint32_t objSize() const { return AQL_PACKET_SIZE; } |
| uint64_t dispIdx() const { return _dispIdx; } |
| uint64_t wrIdx() const { return _wrIdx; } |
| uint64_t rdIdx() const { return _rdIdx; } |
| uint64_t* rdIdxPtr() { return &_rdIdx; } |
| void incRdIdx(uint64_t value) { _rdIdx += value; } |
| void incWrIdx(uint64_t value) { _wrIdx += value; } |
| void incDispIdx(uint64_t value) { _dispIdx += value; } |
| uint64_t compltnPending() { return (_dispIdx - _rdIdx); } |
| void setRdIdx(uint64_t value); |
| }; |
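| |
| /* |
| * Rough sketch of an AQLRingBuffer entry's life cycle, inferred from |
| * the interface above (not a normative description): allocEntry() |
| * reserves FREE slots and advances _wrIdx; a DMA read then fills those |
| * slots with AQL packets from the in-memory queue; packets become |
| * SUBMITTED as _dispIdx advances past them; and completed packets are |
| * retired via freeEntry(), allowing _rdIdx to advance. |
| */ |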
| |
| struct QCntxt |
| { |
| HSAQueueDescriptor* qDesc; |
| AQLRingBuffer* aqlBuf; |
| // used for HSA packets that enforce synchronization with barrier bit |
| bool barrierBit; |
| QCntxt(HSAQueueDescriptor* q_desc, AQLRingBuffer* aql_buf) : |
| qDesc(q_desc), aqlBuf(aql_buf), barrierBit(false) |
| {} |
| QCntxt() : qDesc(nullptr), aqlBuf(nullptr), barrierBit(false) {} |
| }; |
| |
| class HSAPacketProcessor: public DmaVirtDevice |
| { |
| friend class HWScheduler; |
| protected: |
| typedef void (DmaDevice::*DmaFnPtr)(Addr, int, Event*, uint8_t*, Tick); |
| GPUCommandProcessor *gpu_device; |
| HWScheduler *hwSchdlr; |
| AMDGPUDevice *gpuDevice; |
| VegaISA::Walker *walker; |
| |
| // Structure to store the read values of dependency signals |
| // from shared memory. Also used for tracking the status of |
| // those reads while they are in progress. |
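| // A barrier packet can depend on up to NumSignalsPerBarrier signals; |
| // values holds one slot per signal, pendingReads counts the signal |
| // reads still in flight, and allRead indicates that every value has |
| // arrived. |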
| class SignalState |
| { |
| public: |
| SignalState() |
| : pendingReads(0), allRead(false), discardRead(false) |
| { |
| values.resize(NumSignalsPerBarrier); |
| } |
| void handleReadDMA(); |
| int pendingReads; |
| bool allRead; |
| // If this queue is unmapped when there are pending reads, then |
| // the pending reads have to be discarded. |
| bool discardRead; |
| // values stores the values of the already-read dependency signals |
| std::vector<hsa_signal_value_t> values; |
| void |
| resetSigVals() |
| { |
| std::fill(values.begin(), values.end(), 1); |
| } |
| }; |
| |
| class QueueProcessEvent : public Event |
| { |
| private: |
| HSAPacketProcessor *hsaPP; |
| uint32_t rqIdx; |
| public: |
| QueueProcessEvent(HSAPacketProcessor *_hsaPP, uint32_t _rqIdx) |
| : Event(Default_Pri), hsaPP(_hsaPP), rqIdx(_rqIdx) |
| {} |
| virtual void process(); |
| virtual const char *description() const; |
| }; |
| |
| // Registered queue list entry; each entry has one queueDescriptor and |
| // associated AQL buffer |
| class RQLEntry |
| { |
| public: |
| RQLEntry(HSAPacketProcessor *hsaPP, uint32_t rqIdx) |
| : aqlProcessEvent(hsaPP, rqIdx) {} |
| QCntxt qCntxt; |
| bool dispPending() { return qCntxt.aqlBuf->dispPending(); } |
| uint64_t compltnPending() { return qCntxt.aqlBuf->compltnPending(); } |
| SignalState depSignalRdState; |
| QueueProcessEvent aqlProcessEvent; |
| void setBarrierBit(bool set_val) { qCntxt.barrierBit = set_val; } |
| bool getBarrierBit() const { return qCntxt.barrierBit; } |
| bool isLastOutstandingPkt() const |
| { |
| return qCntxt.aqlBuf->isLastOutstandingPkt(); |
| } |
| }; |
| // Keeps track of queueDescriptors of registered queues |
| std::vector<class RQLEntry *> regdQList; |
| |
| Q_STATE processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr); |
| void displayQueueDescriptor(int pid, uint32_t rl_idx); |
| |
| public: |
| HSAQueueDescriptor* |
| getQueueDesc(uint32_t queId) |
| { |
| return regdQList.at(queId)->qCntxt.qDesc; |
| } |
| class RQLEntry* |
| getRegdListEntry(uint32_t queId) |
| { |
| return regdQList.at(queId); |
| } |
| |
| uint64_t |
| inFlightPkts(uint32_t queId) |
| { |
| auto aqlBuf = regdQList.at(queId)->qCntxt.aqlBuf; |
| return aqlBuf->dispIdx() - aqlBuf->rdIdx(); |
| } |
| |
| int numHWQueues; |
| Addr pioAddr; |
| Addr pioSize; |
| Tick pioDelay; |
| const Tick pktProcessDelay; |
| |
| typedef HSAPacketProcessorParams Params; |
| HSAPacketProcessor(const Params &p); |
| ~HSAPacketProcessor(); |
| TranslationGenPtr translate(Addr vaddr, Addr size) override; |
| void setDeviceQueueDesc(uint64_t hostReadIndexPointer, |
| uint64_t basePointer, |
| uint64_t queue_id, |
| uint32_t size, int doorbellSize, |
| GfxVersion gfxVersion, |
| Addr offset = 0, uint64_t rd_idx = 0); |
| void unsetDeviceQueueDesc(uint64_t queue_id, int doorbellSize); |
| void setDevice(GPUCommandProcessor * dev); |
| void setGPUDevice(AMDGPUDevice *gpu_device); |
| void updateReadIndex(int, uint32_t); |
| void getCommandsFromHost(int pid, uint32_t rl_idx); |
| HWScheduler *hwScheduler() { return hwSchdlr; } |
| |
| // PIO interface |
| virtual Tick read(Packet*) override; |
| virtual Tick write(Packet*) override; |
| virtual AddrRangeList getAddrRanges() const override; |
| void finishPkt(void *pkt, uint32_t rl_idx); |
| void finishPkt(void *pkt) { finishPkt(pkt, 0); } |
| void schedAQLProcessing(uint32_t rl_idx); |
| void schedAQLProcessing(uint32_t rl_idx, Tick delay); |
| |
| void sendAgentDispatchCompletionSignal(void *pkt, |
| hsa_signal_value_t signal); |
| void sendCompletionSignal(hsa_signal_value_t signal); |
| |
| /** |
| * Calls getCurrentEntry once the queueEntry has been dmaRead. |
| */ |
| struct dma_series_ctx |
| { |
| // Deals with the fact that DMA ops can complete out of issue order. |
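| // pkts_ttl: total packets in this DMA series; pkts_2_go: packets |
| // whose DMAs are still outstanding; start_ix: AQL buffer index of |
| // the first packet in the series; rl_idx: registered queue list |
| // index (as used for regdQList). |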
| uint32_t pkts_ttl; |
| uint32_t pkts_2_go; |
| uint32_t start_ix; |
| uint32_t rl_idx; |
| |
| dma_series_ctx(uint32_t _pkts_ttl, |
| uint32_t _pkts_2_go, |
| uint32_t _start_ix, |
| uint32_t _rl_idx) |
| : pkts_ttl(_pkts_ttl), pkts_2_go(_pkts_2_go), |
| start_ix(_start_ix), rl_idx(_rl_idx) |
| {} |
| ~dma_series_ctx() {} |
| }; |
| |
| void updateReadDispIdDma(); |
| void cmdQueueCmdDma(HSAPacketProcessor *hsaPP, int pid, bool isRead, |
| uint32_t ix_start, unsigned num_pkts, |
| dma_series_ctx *series_ctx, void *dest_4debug); |
| void handleReadDMA(); |
| }; |
| |
| } // namespace gem5 |
| |
| #endif // __DEV_HSA_HSA_PACKET_PROCESSOR__ |