| /* |
| * Copyright (c) 2018 Advanced Micro Devices, Inc. |
| * All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright notice, |
| * this list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * |
| * 3. Neither the name of the copyright holder nor the names of its |
| * contributors may be used to endorse or promote products derived from this |
| * software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| * POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "gpu-compute/gpu_command_processor.hh" |
| |
| #include <cassert> |
| |
| #include "arch/amdgpu/vega/pagetable_walker.hh" |
| #include "base/chunk_generator.hh" |
| #include "debug/GPUCommandProc.hh" |
| #include "debug/GPUKernelInfo.hh" |
| #include "dev/amdgpu/amdgpu_device.hh" |
| #include "gpu-compute/dispatcher.hh" |
| #include "mem/abstract_mem.hh" |
| #include "mem/packet_access.hh" |
| #include "mem/se_translating_port_proxy.hh" |
| #include "mem/translating_port_proxy.hh" |
| #include "params/GPUCommandProcessor.hh" |
| #include "sim/full_system.hh" |
| #include "sim/process.hh" |
| #include "sim/proxy_ptr.hh" |
| #include "sim/syscall_emul_buf.hh" |
| |
| namespace gem5 |
| { |
| |
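| /** |
| * The command processor is a DMA-capable virtual device. At construction |
| * it registers itself with its HSA packet processor and with the |
| * dispatcher so that both can route work and callbacks back to the CP. |
| */ |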
| GPUCommandProcessor::GPUCommandProcessor(const Params &p) |
| : DmaVirtDevice(p), dispatcher(*p.dispatcher), _driver(nullptr), |
| walker(p.walker), hsaPP(p.hsapp) |
| { |
| assert(hsaPP); |
| hsaPP->setDevice(this); |
| dispatcher.setCommandProcessor(this); |
| } |
| |
| HSAPacketProcessor& |
| GPUCommandProcessor::hsaPacketProc() |
| { |
| return *hsaPP; |
| } |
| |
| /** |
| * Forward the VRAM requestor ID needed for device memory accesses from |
| * the GPU device. |
| */ |
| RequestorID |
| GPUCommandProcessor::vramRequestorId() |
| { |
| return gpuDevice->vramRequestorId(); |
| } |
| |
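| /** |
| * Return a translation generator for the given virtual address range. |
| * In SE mode the range is translated with the CPU process's page table; |
| * in FS mode it is translated through the GPU's page table walker using |
| * the page tables set up by the kernel driver. |
| */ |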
| TranslationGenPtr |
| GPUCommandProcessor::translate(Addr vaddr, Addr size) |
| { |
| if (!FullSystem) { |
| // Grab the process and try to translate the virtual address with it; |
| // with new extensions, it will likely be wrong to just arbitrarily |
| // grab context zero. |
| auto process = sys->threads[0]->getProcessPtr(); |
| |
| return process->pTable->translateRange(vaddr, size); |
| } |
| |
| // In full system use the page tables setup by the kernel driver rather |
| // than the CPU page tables. |
| return TranslationGenPtr( |
| new AMDGPUVM::UserTranslationGen(&gpuDevice->getVM(), walker, |
| 1 /* vmid */, vaddr, size)); |
| } |
| |
| /** |
| * submitDispatchPkt() is the entry point into the CP from the HSAPP |
| * and is only meant to be used with AQL kernel dispatch packets. |
| * After the HSAPP receives and extracts an AQL packet, it sends |
| * it to the CP, which is responsible for gathering all relevant |
| * information about a task, initializing CU state, and sending |
| * it to the dispatcher for WG creation and dispatch. |
| * |
| * First we need to capture all information from the AQL pkt and |
| * the code object, then store it in an HSAQueueEntry. Once the |
| * packet and code are extracted, we extract information from the |
| * queue descriptor that the CP needs to perform state initialization |
| * on the CU. Finally we call dispatch() to send the task to the |
| * dispatcher. When the task completely finishes, we call finishPkt() |
| * on the HSA packet processor in order to remove the packet from the |
| * queue, and notify the runtime that the task has completed. |
| */ |
| void |
| GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id, |
| Addr host_pkt_addr) |
| { |
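| // Each dispatch is assigned a monotonically increasing task ID that |
| // identifies the kernel in the dispatcher and in debug output. |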
| static int dynamic_task_id = 0; |
| _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt; |
| assert(!(disp_pkt->kernel_object & (system()->cacheLineSize() - 1))); |
| |
| /** |
| * we need to read a pointer in the application's address |
| * space to pull out the kernel code descriptor. |
| */ |
| auto *tc = sys->threads[0]; |
| |
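| // Construct both flavors of proxy up front; the FullSystem flag below |
| // selects which one is used to read application memory. |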
| TranslatingPortProxy fs_proxy(tc); |
| SETranslatingPortProxy se_proxy(tc); |
| PortProxy &virt_proxy = FullSystem ? fs_proxy : se_proxy; |
| |
| /** |
| * In full system mode, the page table entry may point to a system page |
| * or a device page. System pages use the proxy as normal, but a device |
| * page needs to be read from device memory. Check what type it is here. |
| */ |
| bool is_system_page = true; |
| Addr phys_addr = disp_pkt->kernel_object; |
| if (FullSystem) { |
| /** |
| * Full system currently only supports running on single VMID (one |
| * virtual memory space), i.e., one application running on GPU at a |
| * time. Because of this, for now we know the VMID is always 1. Later, |
| * the VMID will have to be passed on to the command processor. |
| */ |
| int vmid = 1; |
| unsigned tmp_bytes; |
| walker->startFunctional(gpuDevice->getVM().getPageTableBase(vmid), |
| phys_addr, tmp_bytes, BaseMMU::Mode::Read, |
| is_system_page); |
| } |
| |
| DPRINTF(GPUCommandProc, "kernobj vaddr %#lx paddr %#lx size %d s:%d\n", |
| disp_pkt->kernel_object, phys_addr, sizeof(AMDKernelCode), |
| is_system_page); |
| |
| /** |
| * The kernel_object is a pointer to the machine code, whose entry |
| * point is an 'amd_kernel_code_t' type, which is included in the |
| * kernel binary, and describes various aspects of the kernel. The |
| * desired entry is the 'kernel_code_entry_byte_offset' field, |
| * which provides the byte offset (positive or negative) from the |
| * address of the amd_kernel_code_t to the start of the machine |
| * instructions. |
| */ |
| AMDKernelCode akc; |
| if (is_system_page) { |
| DPRINTF(GPUCommandProc, "kernel_object in system, using proxy\n"); |
| virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)&akc, |
| sizeof(AMDKernelCode)); |
| } else { |
| assert(FullSystem); |
| DPRINTF(GPUCommandProc, "kernel_object in device, using device mem\n"); |
| |
| // Read from the GPU memory manager one cache line at a time to handle |
| // the rare case where the AKC spans two memory pages. |
| ChunkGenerator gen(disp_pkt->kernel_object, sizeof(AMDKernelCode), |
| system()->cacheLineSize()); |
| for (; !gen.done(); gen.next()) { |
| Addr chunk_addr = gen.addr(); |
| int vmid = 1; |
| unsigned dummy; |
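| // The page table walker translates chunk_addr in place from a |
| // virtual to a physical address, so the request built below can be |
| // sent directly to device memory with the PHYSICAL flag set. |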
| walker->startFunctional(gpuDevice->getVM().getPageTableBase(vmid), |
| chunk_addr, dummy, BaseMMU::Mode::Read, |
| is_system_page); |
| |
| Request::Flags flags = Request::PHYSICAL; |
| RequestPtr request = std::make_shared<Request>(chunk_addr, |
| system()->cacheLineSize(), flags, walker->getDevRequestor()); |
| Packet *readPkt = new Packet(request, MemCmd::ReadReq); |
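| // gen.complete() is the number of bytes read so far, i.e., the |
| // offset into the AKC struct where this cache line belongs. |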
| readPkt->dataStatic((uint8_t *)&akc + gen.complete()); |
| system()->getDeviceMemory(readPkt)->access(readPkt); |
| delete readPkt; |
| } |
| } |
| |
| DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the " |
| "kernel object\n", akc.kernel_code_entry_byte_offset); |
| |
| DPRINTF(GPUCommandProc, "GPUCommandProc: Sending dispatch pkt to %lu\n", |
| (uint64_t)tc->cpuId()); |
| |
| Addr machine_code_addr = (Addr)disp_pkt->kernel_object |
| + akc.kernel_code_entry_byte_offset; |
| |
| DPRINTF(GPUCommandProc, "Machine code starts at addr: %#x\n", |
| machine_code_addr); |
| |
| std::string kernel_name; |
| |
| /** |
| * BLIT kernels don't have symbol names. BLIT kernels are built-in compute |
| * kernels issued by ROCm to handle DMAs for dGPUs when the SDMA |
| * hardware engines are unavailable or explicitly disabled. They can also |
| * be used to do copies that ROCm thinks would be better performed |
| * by the shader than by the SDMA engines. They are also sometimes used on |
| * APUs to implement asynchronous memcpy operations between two pointers |
| * in host memory. The name comes from "blit", i.e., block transfer. |
| */ |
| if (akc.runtime_loader_kernel_symbol) { |
| kernel_name = "Some kernel"; |
| } else { |
| kernel_name = "Blit kernel"; |
| } |
| |
| DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str()); |
| |
| HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id, |
| dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr); |
| |
| DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), " |
| "grid size (%dx%dx%d) kernarg addr: %#x, completion " |
| "signal addr:%#x\n", dynamic_task_id, disp_pkt->workgroup_size_x, |
| disp_pkt->workgroup_size_y, disp_pkt->workgroup_size_z, |
| disp_pkt->grid_size_x, disp_pkt->grid_size_y, |
| disp_pkt->grid_size_z, disp_pkt->kernarg_address, |
| disp_pkt->completion_signal); |
| |
| DPRINTF(GPUCommandProc, "Extracted code object: %s (num vector regs: %d, " |
| "num scalar regs: %d, code addr: %#x, kernarg size: %d, " |
| "LDS size: %d)\n", kernel_name, task->numVectorRegs(), |
| task->numScalarRegs(), task->codeAddr(), 0, 0); |
| |
| initABI(task); |
| ++dynamic_task_id; |
| } |
| |
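| /** |
| * Functionally read the current 64-bit value of an HSA signal, i.e., |
| * the value field located at the signal handle's value offset. |
| */ |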
| uint64_t |
| GPUCommandProcessor::functionalReadHsaSignal(Addr signal_handle) |
| { |
| Addr value_addr = getHsaSignalValueAddr(signal_handle); |
| auto tc = system()->threads[0]; |
| ConstVPtr<Addr> prev_value(value_addr, tc); |
| return *prev_value; |
| } |
| |
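| /** |
| * Write a new value to an HSA signal with a virtual DMA write. If the |
| * signal is interruptible (it has a valid mailbox pointer), also notify |
| * the driver (SE mode) or write the event value to the mailbox (FS mode). |
| */ |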
| void |
| GPUCommandProcessor::updateHsaSignal(Addr signal_handle, uint64_t signal_value, |
| HsaSignalCallbackFunction function) |
| { |
| // The signal value is stored 8 bytes past |
| // the actual handle in the runtime |
| Addr value_addr = getHsaSignalValueAddr(signal_handle); |
| Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle); |
| Addr event_addr = getHsaSignalEventAddr(signal_handle); |
| DPRINTF(GPUCommandProc, "Triggering completion signal: %x!\n", value_addr); |
| |
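| // Write the new signal value to the value field via a virtual DMA |
| // write; the caller-provided callback runs once the write completes. |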
| auto cb = new DmaVirtCallback<uint64_t>(function, signal_value); |
| |
| dmaWriteVirt(value_addr, sizeof(Addr), cb, &cb->dmaBuffer, 0); |
| |
| auto tc = system()->threads[0]; |
| ConstVPtr<uint64_t> mailbox_ptr(mailbox_addr, tc); |
| |
| // Notifying an event with its mailbox pointer is |
| // not supported in the current implementation. Just use |
| // mailbox pointer to distinguish between interruptible |
| // and default signal. Interruptible signal will have |
| // a valid mailbox pointer. |
| if (*mailbox_ptr != 0) { |
| // This is an interruptible signal. Now, read the |
| // event ID and directly communicate with the driver |
| // about that event notification. |
| ConstVPtr<uint32_t> event_val(event_addr, tc); |
| |
| DPRINTF(GPUCommandProc, "Calling signal wakeup event on " |
| "signal event value %d\n", *event_val); |
| |
| // The mailbox/wakeup signal uses the SE mode proxy port to write |
| // the event value. This is not available in full system mode so |
| // instead we need to issue a DMA write to the address. The value of |
| // *event_val clears the event. |
| if (FullSystem) { |
| auto cb = new DmaVirtCallback<uint64_t>(function, *event_val); |
| dmaWriteVirt(mailbox_addr, sizeof(Addr), cb, &cb->dmaBuffer, 0); |
| } else { |
| signalWakeupEvent(*event_val); |
| } |
| } |
| } |
| |
| void |
| GPUCommandProcessor::attachDriver(GPUComputeDriver *gpu_driver) |
| { |
| fatal_if(_driver, "Should not overwrite driver."); |
| // TODO: GPU Driver inheritance hierarchy doesn't really make sense. |
| // Should get rid of the base class. |
| _driver = gpu_driver; |
| assert(_driver); |
| } |
| |
| GPUComputeDriver* |
| GPUCommandProcessor::driver() |
| { |
| return _driver; |
| } |
| |
| /** |
| * submitVendorPkt() is for accepting vendor-specific packets from |
| * the HSAPP. Vendor-specific packets may be used by the runtime to |
| * send commands to the HSA device that are specific to a particular |
| * vendor. The vendor-specific packets should be defined by the vendor |
| * in the runtime. |
| */ |
| |
| /** |
| * TODO: For now we simply tell the HSAPP to finish the packet; |
| * however, a future patch will update this method to provide |
| * the proper handling of any required vendor-specific packets. |
| * In the version of ROCm that is currently supported (1.6) |
| * the runtime will send packets that direct the CP to |
| * invalidate the GPU's caches. We do this automatically on |
| * each kernel launch in the CU, so this is safe for now. |
| */ |
| void |
| GPUCommandProcessor::submitVendorPkt(void *raw_pkt, uint32_t queue_id, |
| Addr host_pkt_addr) |
| { |
| hsaPP->finishPkt(raw_pkt, queue_id); |
| } |
| |
| /** |
| * submitAgentDispatchPkt() is for accepting agent dispatch packets. |
| * These packets will control the dispatch of WGs on the device, and inform |
| * the host when a specified number of WGs have been executed on the device. |
| * |
| * For now it simply finishes the pkt. |
| */ |
| void |
| GPUCommandProcessor::submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id, |
| Addr host_pkt_addr) |
| { |
| // Parse the packet to see what it wants us to do. |
| _hsa_agent_dispatch_packet_t *agent_pkt = |
| (_hsa_agent_dispatch_packet_t *)raw_pkt; |
| |
| if (agent_pkt->type == AgentCmd::Nop) { |
| DPRINTF(GPUCommandProc, "Agent Dispatch Packet NOP\n"); |
| } else if (agent_pkt->type == AgentCmd::Steal) { |
| // This is where we steal the HSA task's completion signal. |
| int kid = agent_pkt->arg[0]; |
| DPRINTF(GPUCommandProc, |
| "Agent Dispatch Packet Stealing signal handle for kernel %d\n", |
| kid); |
| |
| HSAQueueEntry *task = dispatcher.hsaTask(kid); |
| uint64_t signal_addr = task->completionSignal();// + sizeof(uint64_t); |
| |
| uint64_t return_address = agent_pkt->return_address; |
| DPRINTF(GPUCommandProc, "Return Addr: %p\n", return_address); |
| //*return_address = signal_addr; |
| Addr *new_signal_addr = new Addr; |
| *new_signal_addr = (Addr)signal_addr; |
| dmaWriteVirt(return_address, sizeof(Addr), nullptr, new_signal_addr, 0); |
| |
| DPRINTF(GPUCommandProc, |
| "Agent Dispatch Packet Stealing signal handle from kid %d :" |
| "(%x:%x) writing into %x\n", |
| kid, signal_addr, new_signal_addr, return_address); |
| |
| } else { |
| panic("The agent dispatch packet provided an unknown argument in " |
| "arg[0]; currently only 0 (nop) or 1 (return kernel signal) is " |
| "accepted"); |
| } |
| |
| hsaPP->finishPkt(raw_pkt, queue_id); |
| } |
| |
| /** |
| * Once the CP has finished extracting all relevant information about |
| * a task and has initialized the ABI state, we send a description of |
| * the task to the dispatcher. The dispatcher will create and dispatch |
| * WGs to the CUs. |
| */ |
| void |
| GPUCommandProcessor::dispatchPkt(HSAQueueEntry *task) |
| { |
| dispatcher.dispatch(task); |
| } |
| |
| void |
| GPUCommandProcessor::signalWakeupEvent(uint32_t event_id) |
| { |
| _driver->signalWakeupEvent(event_id); |
| } |
| |
| /** |
| * The CP is responsible for traversing all HSA-ABI-related data |
| * structures from memory and initializing the ABI state. |
| * Information provided by the MQD, AQL packet, and code object |
| * metadata will be used to initialize register file state. |
| */ |
| void |
| GPUCommandProcessor::initABI(HSAQueueEntry *task) |
| { |
| auto cb = new DmaVirtCallback<uint32_t>( |
| [ = ] (const uint32_t &readDispIdOffset) |
| { ReadDispIdOffsetDmaEvent(task, readDispIdOffset); }, 0); |
| |
| Addr hostReadIdxPtr |
| = hsaPP->getQueueDesc(task->queueId())->hostReadIndexPtr; |
| |
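| // Read the value stored one pointer-size past the host read index |
| // pointer; the callback above treats it as the dispatch ID offset and |
| // continues ABI initialization from there. |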
| dmaReadVirt(hostReadIdxPtr + sizeof(hostReadIdxPtr), |
| sizeof(uint32_t), cb, &cb->dmaBuffer); |
| } |
| |
| System* |
| GPUCommandProcessor::system() |
| { |
| return sys; |
| } |
| |
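| /** |
| * The command processor does not expose any memory-mapped (PIO) ranges |
| * of its own, so the address range list is empty. |
| */ |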
| AddrRangeList |
| GPUCommandProcessor::getAddrRanges() const |
| { |
| AddrRangeList ranges; |
| return ranges; |
| } |
| |
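| /** |
| * Record the GPU device this CP belongs to and direct the page table |
| * walker's device-side requests at the GPU's VRAM requestor ID. |
| */ |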
| void |
| GPUCommandProcessor::setGPUDevice(AMDGPUDevice *gpu_device) |
| { |
| gpuDevice = gpu_device; |
| walker->setDevRequestor(gpuDevice->vramRequestorId()); |
| } |
| |
| void |
| GPUCommandProcessor::setShader(Shader *shader) |
| { |
| _shader = shader; |
| } |
| |
| Shader* |
| GPUCommandProcessor::shader() |
| { |
| return _shader; |
| } |
| |
| } // namespace gem5 |