| /* |
| * Copyright (c) 2011-2015,2018 Advanced Micro Devices, Inc. |
| * All rights reserved. |
| * |
| * For use for simulation and test purposes only |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright notice, |
| * this list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * |
| * 3. Neither the name of the copyright holder nor the names of its |
| * contributors may be used to endorse or promote products derived from this |
| * software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| * POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| |
| #include "gpu-compute/dispatcher.hh" |
| |
| #include "debug/GPUDisp.hh" |
| #include "debug/GPUKernelInfo.hh" |
| #include "debug/GPUWgLatency.hh" |
| #include "gpu-compute/gpu_command_processor.hh" |
| #include "gpu-compute/hsa_queue_entry.hh" |
| #include "gpu-compute/shader.hh" |
| #include "gpu-compute/wavefront.hh" |
| #include "sim/syscall_emul_buf.hh" |
| #include "sim/system.hh" |
| |
| GPUDispatcher::GPUDispatcher(const Params *p) |
| : SimObject(p), shader(nullptr), gpuCmdProc(nullptr), |
| tickEvent([this]{ exec(); }, |
| "GPU Dispatcher tick", false, Event::CPU_Tick_Pri), |
| dispatchActive(false) |
| { |
| schedule(&tickEvent, 0); |
| } |
| |
| GPUDispatcher::~GPUDispatcher() |
| { |
| } |
| |
| void |
| GPUDispatcher::regStats() |
| { |
| numKernelLaunched |
| .name(name() + ".num_kernel_launched") |
| .desc("number of kernel launched") |
| ; |
| |
| cyclesWaitingForDispatch |
| .name(name() + ".cycles_wait_dispatch") |
| .desc("number of cycles with outstanding wavefronts " |
| "that are waiting to be dispatched") |
| ; |
| } |
| |
| HSAQueueEntry* |
| GPUDispatcher::hsaTask(int disp_id) |
| { |
| assert(hsaQueueEntries.find(disp_id) != hsaQueueEntries.end()); |
| return hsaQueueEntries[disp_id]; |
| } |
| |
| void |
| GPUDispatcher::setCommandProcessor(GPUCommandProcessor *gpu_cmd_proc) |
| { |
| gpuCmdProc = gpu_cmd_proc; |
| } |
| |
| void |
| GPUDispatcher::setShader(Shader *new_shader) |
| { |
| shader = new_shader; |
| } |
| |
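/**
 * Checkpoint the dispatcher's pending tick event (if any); this is the
 * only dispatcher state that is saved across a checkpoint.
 */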
| void |
| GPUDispatcher::serialize(CheckpointOut &cp) const |
| { |
| Tick event_tick = 0; |
| |
| if (tickEvent.scheduled()) |
| event_tick = tickEvent.when(); |
| |
| SERIALIZE_SCALAR(event_tick); |
| } |
| |
| void |
| GPUDispatcher::unserialize(CheckpointIn &cp) |
| { |
| Tick event_tick; |
| |
| if (tickEvent.scheduled()) |
| deschedule(&tickEvent); |
| |
| UNSERIALIZE_SCALAR(event_tick); |
| |
| if (event_tick) { |
| schedule(&tickEvent, event_tick); |
| } |
| } |
| |
| /** |
| * After all relevant HSA data structures have been traversed/extracted |
| * from memory by the CP, dispatch() is called on the dispatcher. This will |
| * schedule a dispatch event that, when triggered, will attempt to dispatch |
| * the WGs associated with the given task to the CUs. |
| */ |
| void |
| GPUDispatcher::dispatch(HSAQueueEntry *task) |
| { |
| ++numKernelLaunched; |
| |
| DPRINTF(GPUDisp, "launching kernel: %s, dispatch ID: %d\n", |
| task->kernelName(), task->dispatchId()); |
| |
| execIds.push(task->dispatchId()); |
| dispatchActive = true; |
| hsaQueueEntries.emplace(task->dispatchId(), task); |
| |
| if (!tickEvent.scheduled()) { |
| schedule(&tickEvent, curTick() + shader->clockPeriod()); |
| } |
| } |
| |
| void |
| GPUDispatcher::exec() |
| { |
| int fail_count(0); |
| |
| /** |
| * There are potentially multiple outstanding kernel launches. |
| * It is possible that the workgroups in a different kernel |
     * can fit on the GPU even if another kernel's workgroups cannot.
| */ |
| DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size()); |
| |
| if (execIds.size() > 0) { |
| ++cyclesWaitingForDispatch; |
| } |
| |
| /** |
     * Dispatch work cannot start until the kernel's invalidate is
     * completely finished; hence, the kernel always initiates the
     * invalidate first and waits until it is done.
| */ |
| while (execIds.size() > fail_count) { |
| int exec_id = execIds.front(); |
| auto task = hsaQueueEntries[exec_id]; |
| bool launched(false); |
| |
| // acq is needed before starting dispatch |
| if (shader->impl_kern_launch_acq) { |
| // try to invalidate cache |
| shader->prepareInvalidate(task); |
| } else { |
| // kern launch acquire is not set, skip invalidate |
| task->markInvDone(); |
| } |
| |
| /** |
| * invalidate is still ongoing, put the kernel on the queue to |
| * retry later |
| */ |
        if (!task->isInvDone()) {
| execIds.push(exec_id); |
| ++fail_count; |
| |
| DPRINTF(GPUDisp, "kernel %d failed to launch, due to [%d] pending" |
| " invalidate requests\n", exec_id, task->outstandingInvs()); |
| |
| // try the next kernel_id |
| execIds.pop(); |
| continue; |
| } |
| |
| // kernel invalidate is done, start workgroup dispatch |
| while (!task->dispComplete()) { |
| // update the thread context |
| shader->updateContext(task->contextId()); |
| |
| // attempt to dispatch workgroup |
| DPRINTF(GPUWgLatency, "Attempt Kernel Launch cycle:%d kernel:%d\n", |
| curTick(), exec_id); |
| |
| if (!shader->dispatchWorkgroups(task)) { |
| /** |
                 * If the dispatch failed, try the next kernel since it
                 * may have smaller workgroups; put this one back on the
                 * queue to retry later.
| */ |
| DPRINTF(GPUDisp, "kernel %d failed to launch\n", exec_id); |
| execIds.push(exec_id); |
| ++fail_count; |
| break; |
| } else if (!launched) { |
| launched = true; |
| DPRINTF(GPUKernelInfo, "Launched kernel %d\n", exec_id); |
| } |
| } |
| |
| // try the next kernel_id |
| execIds.pop(); |
| } |
| |
| DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size()); |
| |
| while (doneIds.size()) { |
| DPRINTF(GPUDisp, "Kernel %d completed\n", doneIds.front()); |
| doneIds.pop(); |
| } |
| } |
| |
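/**
 * Return true if completing one more workgroup would finish the kernel
 * that this wavefront belongs to.
 */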
| bool |
| GPUDispatcher::isReachingKernelEnd(Wavefront *wf) |
| { |
| int kern_id = wf->kernId; |
| assert(hsaQueueEntries.find(kern_id) != hsaQueueEntries.end()); |
| auto task = hsaQueueEntries[kern_id]; |
| assert(task->dispatchId() == kern_id); |
| |
| /** |
     * Check whether the next workgroup is the final one in the kernel;
     * +1 because this check happens before the workgroup completes.
| */ |
| return (task->numWgCompleted() + 1 == task->numWgTotal()); |
| } |
| |
| /** |
 * update the counter of outstanding inv requests for the kernel
| * kern_id: kernel id |
| * val: +1/-1, increment or decrement the counter (default: -1) |
| */ |
| void |
GPUDispatcher::updateInvCounter(int kern_id, int val)
{
| assert(val == -1 || val == 1); |
| |
| auto task = hsaQueueEntries[kern_id]; |
| task->updateOutstandingInvs(val); |
| |
| // kernel invalidate is done, schedule dispatch work |
| if (task->isInvDone() && !tickEvent.scheduled()) { |
| schedule(&tickEvent, curTick() + shader->clockPeriod()); |
| } |
| } |
| |
| /** |
 * update the counter of outstanding wb requests for the kernel
| * kern_id: kernel id |
| * val: +1/-1, increment or decrement the counter (default: -1) |
| * |
| * return true if all wbs are done for the kernel |
| */ |
| bool |
GPUDispatcher::updateWbCounter(int kern_id, int val)
{
| assert(val == -1 || val == 1); |
| |
| auto task = hsaQueueEntries[kern_id]; |
| task->updateOutstandingWbs(val); |
| |
| // true: WB is done, false: WB is still ongoing |
| return (task->outstandingWbs() == 0); |
| } |
| |
| /** |
| * get kernel's outstanding cache writeback requests |
| */ |
| int |
GPUDispatcher::getOutstandingWbs(int kernId)
{
| auto task = hsaQueueEntries[kernId]; |
| |
| return task->outstandingWbs(); |
| } |
| |
| /** |
| * When an end program instruction detects that the last WF in |
| * a WG has completed it will call this method on the dispatcher. |
| * If we detect that this is the last WG for the given task, then |
| * we ring the completion signal, which is used by the CPU to |
| * synchronize with the GPU. The HSAPP is also notified that the |
| * task has completed so it can be removed from its task queues. |
| */ |
| void |
| GPUDispatcher::notifyWgCompl(Wavefront *wf) |
| { |
| int kern_id = wf->kernId; |
| DPRINTF(GPUDisp, "notify WgCompl %d\n", wf->wgId); |
| auto task = hsaQueueEntries[kern_id]; |
| assert(task->dispatchId() == kern_id); |
| task->notifyWgCompleted(); |
| |
| DPRINTF(GPUWgLatency, "WG Complete cycle:%d wg:%d kernel:%d cu:%d\n", |
| curTick(), wf->wgId, kern_id, wf->computeUnit->cu_id); |
| |
| if (task->numWgCompleted() == task->numWgTotal()) { |
| // Notify the HSA PP that this kernel is complete |
| gpuCmdProc->hsaPacketProc() |
| .finishPkt(task->dispPktPtr(), task->queueId()); |
| if (task->completionSignal()) { |
                // The signal value is stored 8 bytes after the
                // signal handle address in the runtime
| Addr signal_addr = task->completionSignal() + sizeof(Addr); |
| DPRINTF(GPUDisp, "HSA AQL Kernel Complete! Triggering " |
| "completion signal: %x!\n", signal_addr); |
| |
| /** |
             * HACK: The semantics of the HSA signal are to decrement
             * the current signal value. We cheat here and read out
             * the value from main memory using a functional access and
| * then just DMA the decremented value. This is because |
| * the DMA controller does not currently support GPU |
| * atomics. |
| */ |
| auto *tc = gpuCmdProc->system()->threads[0]; |
| auto &virt_proxy = tc->getVirtProxy(); |
| TypedBufferArg<Addr> prev_signal(signal_addr); |
| prev_signal.copyIn(virt_proxy); |
| |
| Addr *new_signal = new Addr; |
| *new_signal = (Addr)*prev_signal - 1; |
| |
| gpuCmdProc->dmaWriteVirt(signal_addr, sizeof(Addr), nullptr, |
| new_signal, 0); |
| } else { |
| DPRINTF(GPUDisp, "HSA AQL Kernel Complete! No completion " |
| "signal\n"); |
| } |
| |
| DPRINTF(GPUWgLatency, "Kernel Complete ticks:%d kernel:%d\n", |
| curTick(), kern_id); |
| DPRINTF(GPUKernelInfo, "Completed kernel %d\n", kern_id); |
| } |
| |
| if (!tickEvent.scheduled()) { |
| schedule(&tickEvent, curTick() + shader->clockPeriod()); |
| } |
| } |
| |
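/**
 * Schedule the dispatcher's tick event for the next shader cycle unless
 * one is already pending.
 */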
| void |
| GPUDispatcher::scheduleDispatch() |
| { |
| if (!tickEvent.scheduled()) { |
| schedule(&tickEvent, curTick() + shader->clockPeriod()); |
| } |
| } |
| |
GPUDispatcher*
GPUDispatcherParams::create()
| { |
| return new GPUDispatcher(this); |
| } |