| /* |
| * Copyright (c) 2018 Advanced Micro Devices, Inc. |
| * All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright notice, |
| * this list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * |
| * 3. Neither the name of the copyright holder nor the names of its |
| * contributors may be used to endorse or promote products derived from this |
| * software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| * POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "gpu-compute/gpu_command_processor.hh" |
| |
| #include <cassert> |
| |
| #include "arch/amdgpu/vega/pagetable_walker.hh" |
| #include "base/chunk_generator.hh" |
| #include "debug/GPUCommandProc.hh" |
| #include "debug/GPUKernelInfo.hh" |
| #include "dev/amdgpu/amdgpu_device.hh" |
| #include "gpu-compute/dispatcher.hh" |
| #include "mem/abstract_mem.hh" |
| #include "mem/packet_access.hh" |
| #include "mem/se_translating_port_proxy.hh" |
| #include "mem/translating_port_proxy.hh" |
| #include "params/GPUCommandProcessor.hh" |
| #include "sim/full_system.hh" |
| #include "sim/process.hh" |
| #include "sim/proxy_ptr.hh" |
| #include "sim/syscall_emul_buf.hh" |
| |
| namespace gem5 |
| { |
| |
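| /** |
| * The command processor is a DMA-capable virtual device. At construction |
| * it registers itself with its HSA packet processor and with the |
| * dispatcher so that both can route work and callbacks back to the CP. |
| */ |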
| GPUCommandProcessor::GPUCommandProcessor(const Params &p) |
| : DmaVirtDevice(p), dispatcher(*p.dispatcher), _driver(nullptr), |
| walker(p.walker), hsaPP(p.hsapp) |
| { |
| assert(hsaPP); |
| hsaPP->setDevice(this); |
| dispatcher.setCommandProcessor(this); |
| } |
| |
| HSAPacketProcessor& |
| GPUCommandProcessor::hsaPacketProc() |
| { |
| return *hsaPP; |
| } |
| |
| /** |
| * Forward the VRAM requestor ID needed for device memory accesses from |
| * the GPU device. |
| */ |
| RequestorID |
| GPUCommandProcessor::vramRequestorId() |
| { |
| return gpuDevice->vramRequestorId(); |
| } |
| |
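| /** |
| * Return a translation generator for the given virtual address range. |
| * In SE mode the range is translated with the CPU process's page table; |
| * in FS mode it is translated through the GPU's page table walker using |
| * the page tables set up by the kernel driver. |
| */ |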
| TranslationGenPtr |
| GPUCommandProcessor::translate(Addr vaddr, Addr size) |
| { |
| if (!FullSystem) { |
| // Grab the process and try to translate the virtual address with it; |
| // with new extensions, it will likely be wrong to just arbitrarily |
| // grab context zero. |
| auto process = sys->threads[0]->getProcessPtr(); |
| |
| return process->pTable->translateRange(vaddr, size); |
| } |
| |
| // In full system use the page tables setup by the kernel driver rather |
| // than the CPU page tables. |
| return TranslationGenPtr( |
| new AMDGPUVM::UserTranslationGen(&gpuDevice->getVM(), walker, |
| 1 /* vmid */, vaddr, size)); |
| } |
| |
| /** |
| * submitDispatchPkt() is the entry point into the CP from the HSAPP |
| * and is only meant to be used with AQL kernel dispatch packets. |
| * After the HSAPP receives and extracts an AQL packet, it sends |
| * it to the CP, which is responsible for gathering all relevant |
| * information about a task, initializing CU state, and sending |
| * it to the dispatcher for WG creation and dispatch. |
| * |
| * First we need to capture all information from the AQL pkt and |
| * the code object, then store it in an HSAQueueEntry. Once the |
| * packet and code are extracted, we extract information from the |
| * queue descriptor that the CP needs to perform state initialization |
| * on the CU. Finally we call dispatch() to send the task to the |
| * dispatcher. When the task completely finishes, we call finishPkt() |
| * on the HSA packet processor in order to remove the packet from the |
| * queue, and notify the runtime that the task has completed. |
| */ |
| void |
| GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id, |
| Addr host_pkt_addr) |
| { |
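| // Each dispatch is assigned a monotonically increasing task ID that |
| // identifies the kernel in the dispatcher and in debug output. |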
| static int dynamic_task_id = 0; |
| _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt; |
| assert(!(disp_pkt->kernel_object & (system()->cacheLineSize() - 1))); |
| |
| /** |
| * we need to read a pointer in the application's address |
| * space to pull out the kernel code descriptor. |
| */ |
| auto *tc = sys->threads[0]; |
| |
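| // Construct both flavors of proxy up front; the FullSystem flag below |
| // selects which one is used to read application memory. |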
| TranslatingPortProxy fs_proxy(tc); |
| SETranslatingPortProxy se_proxy(tc); |
| PortProxy &virt_proxy = FullSystem ? fs_proxy : se_proxy; |
| |
| /** |
| * In full system mode, the page table entry may point to a system page |
| * or a device page. System pages use the proxy as normal, but a device |
| * page needs to be read from device memory. Check what type it is here. |
| */ |
| bool is_system_page = true; |
| Addr phys_addr = disp_pkt->kernel_object; |
| if (FullSystem) { |
| /** |
| * Full system currently only supports running on single VMID (one |
| * virtual memory space), i.e., one application running on GPU at a |
| * time. Because of this, for now we know the VMID is always 1. Later, |
| * the VMID will have to be passed on to the command processor. |
| */ |
| int vmid = 1; |
| unsigned tmp_bytes; |
| walker->startFunctional(gpuDevice->getVM().getPageTableBase(vmid), |
| phys_addr, tmp_bytes, BaseMMU::Mode::Read, |
| is_system_page); |
| } |
| |
| DPRINTF(GPUCommandProc, "kernobj vaddr %#lx paddr %#lx size %d s:%d\n", |
| disp_pkt->kernel_object, phys_addr, sizeof(AMDKernelCode), |
| is_system_page); |
| |
| /** |
| * The kernel_object is a pointer to the machine code, whose entry |
| * point is an 'amd_kernel_code_t' type, which is included in the |
| * kernel binary, and describes various aspects of the kernel. The |
| * desired entry is the 'kernel_code_entry_byte_offset' field, |
| * which provides the byte offset (positive or negative) from the |
| * address of the amd_kernel_code_t to the start of the machine |
| * instructions. |
| */ |
| AMDKernelCode akc; |
| if (is_system_page) { |
| DPRINTF(GPUCommandProc, "kernel_object in system, using proxy\n"); |
| virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)&akc, |
| sizeof(AMDKernelCode)); |
| } else { |
| assert(FullSystem); |
| DPRINTF(GPUCommandProc, "kernel_object in device, using device mem\n"); |
| |
| // Read from the GPU memory manager one cache line at a time to handle |
| // the rare case where the AKC spans two memory pages. |
| ChunkGenerator gen(disp_pkt->kernel_object, sizeof(AMDKernelCode), |
| system()->cacheLineSize()); |
| for (; !gen.done(); gen.next()) { |
| Addr chunk_addr = gen.addr(); |
| int vmid = 1; |
| unsigned dummy; |
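| // The page table walker translates chunk_addr in place from a |
| // virtual to a physical address, so the request built below can be |
| // sent directly to device memory with the PHYSICAL flag set. |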
| walker->startFunctional(gpuDevice->getVM().getPageTableBase(vmid), |
| chunk_addr, dummy, BaseMMU::Mode::Read, |
| is_system_page); |
| |
| Request::Flags flags = Request::PHYSICAL; |
| RequestPtr request = std::make_shared<Request>(chunk_addr, |
| system()->cacheLineSize(), flags, walker->getDevRequestor()); |
| Packet *readPkt = new Packet(request, MemCmd::ReadReq); |
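| // gen.complete() is the number of bytes read so far, i.e., the |
| // offset into the AKC struct where this cache line belongs. |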
| readPkt->dataStatic((uint8_t *)&akc + gen.complete()); |
| system()->getDeviceMemory(readPkt)->access(readPkt); |
| delete readPkt; |
| } |
| } |
| |
| DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the " |
| "kernel object\n", akc.kernel_code_entry_byte_offset); |
| |
| DPRINTF(GPUCommandProc, "GPUCommandProc: Sending dispatch pkt to %lu\n", |
| (uint64_t)tc->cpuId()); |
| |
| Addr machine_code_addr = (Addr)disp_pkt->kernel_object |
| + akc.kernel_code_entry_byte_offset; |
| |
| DPRINTF(GPUCommandProc, "Machine code starts at addr: %#x\n", |
| machine_code_addr); |
| |
| std::string kernel_name; |
| |
| /** |
| * BLIT kernels don't have symbol names. BLIT kernels are built-in compute |
| * kernels issued by ROCm to handle DMAs for dGPUs when the SDMA |
| * hardware engines are unavailable or explicitly disabled. They can also |
| * be used to do copies that ROCm thinks would be better performed |
| * by the shader than by the SDMA engines. They are also sometimes used on |
| * APUs to implement asynchronous memcpy operations between two pointers |
| * in host memory. The name comes from "blit", i.e., block transfer. |
| */ |
| if (akc.runtime_loader_kernel_symbol) { |
| kernel_name = "Some kernel"; |
| } else { |
| kernel_name = "Blit kernel"; |
| } |
| |
| DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str()); |
| |
| HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id, |
| dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr); |
| |
| DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), " |
| "grid size (%dx%dx%d) kernarg addr: %#x, completion " |
| "signal addr:%#x\n", dynamic_task_id, disp_pkt->workgroup_size_x, |
| disp_pkt->workgroup_size_y, disp_pkt->workgroup_size_z, |
| disp_pkt->grid_size_x, disp_pkt->grid_size_y, |
| disp_pkt->grid_size_z, disp_pkt->kernarg_address, |
| disp_pkt->completion_signal); |
| |
| DPRINTF(GPUCommandProc, "Extracted code object: %s (num vector regs: %d, " |
| "num scalar regs: %d, code addr: %#x, kernarg size: %d, " |
| "LDS size: %d)\n", kernel_name, task->numVectorRegs(), |
| task->numScalarRegs(), task->codeAddr(), 0, 0); |
| |
| initABI(task); |
| ++dynamic_task_id; |
| } |
| |
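| /** |
| * Functionally read the current 64-bit value of an HSA signal, i.e., |
| * the value field located at the signal handle's value offset. |
| */ |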
| uint64_t |
| GPUCommandProcessor::functionalReadHsaSignal(Addr signal_handle) |
| { |
| Addr value_addr = getHsaSignalValueAddr(signal_handle); |
| auto tc = system()->threads[0]; |
| ConstVPtr<Addr> prev_value(value_addr, tc); |
| return *prev_value; |
| } |
| |
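| /** |
| * Write a new value to an HSA signal with a virtual DMA write. If the |
| * signal is interruptible (it has a valid mailbox pointer), also notify |
| * the driver (SE mode) or write the event value to the mailbox (FS mode). |
| */ |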
| void |
| GPUCommandProcessor::updateHsaSignal(Addr signal_handle, uint64_t signal_value, |
| HsaSignalCallbackFunction function) |
| { |
| // The signal value is stored 8 bytes past |
| // the actual handle in the runtime |
| Addr value_addr = getHsaSignalValueAddr(signal_handle); |
| Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle); |
| Addr event_addr = getHsaSignalEventAddr(signal_handle); |
| DPRINTF(GPUCommandProc, "Triggering completion signal: %x!\n", value_addr); |
| |
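| // Write the new signal value to the value field via a virtual DMA |
| // write; the caller-provided callback runs once the write completes. |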
| auto cb = new DmaVirtCallback<uint64_t>(function, signal_value); |
| |
| dmaWriteVirt(value_addr, sizeof(Addr), cb, &cb->dmaBuffer, 0); |
| |
| auto tc = system()->threads[0]; |
| ConstVPtr<uint64_t> mailbox_ptr(mailbox_addr, tc); |
| |
| // Notifying an event with its mailbox pointer is |
| // not supported in the current implementation. Just use |
| // mailbox pointer to distinguish between interruptible |
| // and default signal. Interruptible signal will have |
| // a valid mailbox pointer. |
| if (*mailbox_ptr != 0) { |
| // This is an interruptible signal. Now, read the |
| // event ID and directly communicate with the driver |
| // about that event notification. |
| ConstVPtr<uint32_t> event_val(event_addr, tc); |
| |
| DPRINTF(GPUCommandProc, "Calling signal wakeup event on " |
| "signal event value %d\n", *event_val); |
| |
| // The mailbox/wakeup signal uses the SE mode proxy port to write |
| // the event value. This is not available in full system mode so |
| // instead we need to issue a DMA write to the address. The value of |
| // *event_val clears the event. |
| if (FullSystem) { |
| auto cb = new DmaVirtCallback<uint64_t>(function, *event_val); |
| dmaWriteVirt(mailbox_addr, sizeof(Addr), cb, &cb->dmaBuffer, 0); |
| } else { |
| signalWakeupEvent(*event_val); |
| } |
| } |
| } |
| |
| void |
| GPUCommandProcessor::attachDriver(GPUComputeDriver *gpu_driver) |
| { |
| fatal_if(_driver, "Should not overwrite driver."); |
| // TODO: GPU Driver inheritance hierarchy doesn't really make sense. |
| // Should get rid of the base class. |
| _driver = gpu_driver; |
| assert(_driver); |
| } |
| |
| GPUComputeDriver* |
| GPUCommandProcessor::driver() |
| { |
| return _driver; |
| } |
| |
| /** |
| * submitVendorPkt() is for accepting vendor-specific packets from |
| * the HSAPP. Vendor-specific packets may be used by the runtime to |
| * send commands to the HSA device that are specific to a particular |
| * vendor. The vendor-specific packets should be defined by the vendor |
| * in the runtime. |
| */ |
| |
| /** |
| * TODO: For now we simply tell the HSAPP to finish the packet; |
| * however, a future patch will update this method to provide |
| * the proper handling of any required vendor-specific packets. |
| * In the version of ROCm that is currently supported (1.6) |
| * the runtime will send packets that direct the CP to |
| * invalidate the GPU's caches. We do this automatically on |
| * each kernel launch in the CU, so this is safe for now. |
| */ |
| void |
| GPUCommandProcessor::submitVendorPkt(void *raw_pkt, uint32_t queue_id, |
| Addr host_pkt_addr) |
| { |
| hsaPP->finishPkt(raw_pkt, queue_id); |
| } |
| |
| /** |
| * submitAgentDispatchPkt() is for accepting agent dispatch packets. |
| * These packets will control the dispatch of WGs on the device, and inform |
| * the host when a specified number of WGs have been executed on the device. |
| * |
| * For now it simply finishes the pkt. |
| */ |
| void |
| GPUCommandProcessor::submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id, |
| Addr host_pkt_addr) |
| { |
| // Parse the packet to see what it wants us to do. |
| _hsa_agent_dispatch_packet_t *agent_pkt = |
| (_hsa_agent_dispatch_packet_t *)raw_pkt; |
| |
| if (agent_pkt->type == AgentCmd::Nop) { |
| DPRINTF(GPUCommandProc, "Agent Dispatch Packet NOP\n"); |
| } else if (agent_pkt->type == AgentCmd::Steal) { |
| // This is where we steal the HSA task's completion signal. |
| int kid = agent_pkt->arg[0]; |
| DPRINTF(GPUCommandProc, |
| "Agent Dispatch Packet Stealing signal handle for kernel %d\n", |
| kid); |
| |
| HSAQueueEntry *task = dispatcher.hsaTask(kid); |
| uint64_t signal_addr = task->completionSignal();// + sizeof(uint64_t); |
| |
| uint64_t return_address = agent_pkt->return_address; |
| DPRINTF(GPUCommandProc, "Return Addr: %p\n", return_address); |
| //*return_address = signal_addr; |
| Addr *new_signal_addr = new Addr; |
| *new_signal_addr = (Addr)signal_addr; |
| dmaWriteVirt(return_address, sizeof(Addr), nullptr, new_signal_addr, 0); |
| |
| DPRINTF(GPUCommandProc, |
| "Agent Dispatch Packet Stealing signal handle from kid %d :" |
| "(%x:%x) writing into %x\n", |
| kid, signal_addr, new_signal_addr, return_address); |
| |
| } else { |
| panic("The agent dispatch packet provided an unknown argument in " |
| "arg[0]; currently only 0 (nop) or 1 (return kernel signal) is " |
| "accepted"); |
| } |
| |
| hsaPP->finishPkt(raw_pkt, queue_id); |
| } |
| |
| /** |
| * Once the CP has finished extracting all relevant information about |
| * a task and has initialized the ABI state, we send a description of |
| * the task to the dispatcher. The dispatcher will create and dispatch |
| * WGs to the CUs. |
| */ |
| void |
| GPUCommandProcessor::dispatchPkt(HSAQueueEntry *task) |
| { |
| dispatcher.dispatch(task); |
| } |
| |
| void |
| GPUCommandProcessor::signalWakeupEvent(uint32_t event_id) |
| { |
| _driver->signalWakeupEvent(event_id); |
| } |
| |
| /** |
| * The CP is responsible for traversing all HSA-ABI-related data |
| * structures from memory and initializing the ABI state. |
| * Information provided by the MQD, AQL packet, and code object |
| * metadata will be used to initialize register file state. |
| */ |
| void |
| GPUCommandProcessor::initABI(HSAQueueEntry *task) |
| { |
| auto cb = new DmaVirtCallback<uint32_t>( |
| [ = ] (const uint32_t &readDispIdOffset) |
| { ReadDispIdOffsetDmaEvent(task, readDispIdOffset); }, 0); |
| |
| Addr hostReadIdxPtr |
| = hsaPP->getQueueDesc(task->queueId())->hostReadIndexPtr; |
| |
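| // Read the value stored one pointer-size past the host read index |
| // pointer; the callback above treats it as the dispatch ID offset and |
| // continues ABI initialization from there. |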
| dmaReadVirt(hostReadIdxPtr + sizeof(hostReadIdxPtr), |
| sizeof(uint32_t), cb, &cb->dmaBuffer); |
| } |
| |
| System* |
| GPUCommandProcessor::system() |
| { |
| return sys; |
| } |
| |
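| /** |
| * The command processor does not expose any memory-mapped (PIO) ranges |
| * of its own, so the address range list is empty. |
| */ |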
| AddrRangeList |
| GPUCommandProcessor::getAddrRanges() const |
| { |
| AddrRangeList ranges; |
| return ranges; |
| } |
| |
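| /** |
| * Record the GPU device this CP belongs to and direct the page table |
| * walker's device-side requests at the GPU's VRAM requestor ID. |
| */ |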
| void |
| GPUCommandProcessor::setGPUDevice(AMDGPUDevice *gpu_device) |
| { |
| gpuDevice = gpu_device; |
| walker->setDevRequestor(gpuDevice->vramRequestorId()); |
| } |
| |
| void |
| GPUCommandProcessor::setShader(Shader *shader) |
| { |
| _shader = shader; |
| } |
| |
| Shader* |
| GPUCommandProcessor::shader() |
| { |
| return _shader; |
| } |
| |
| } // namespace gem5 |