/*
* Copyright (c) 2011-2015,2018 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Brad Beckmann,
* Marc Orr,
* Anthony Gutierrez
*/
#include "gpu-compute/dispatcher.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUKernelInfo.hh"
#include "debug/GPUWgLatency.hh"
#include "gpu-compute/gpu_command_processor.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
#include "sim/syscall_emul_buf.hh"
#include "sim/system.hh"
GPUDispatcher::GPUDispatcher(const Params *p)
: SimObject(p), shader(nullptr), gpuCmdProc(nullptr),
tickEvent([this]{ exec(); },
"GPU Dispatcher tick", false, Event::CPU_Tick_Pri),
dispatchActive(false)
{
schedule(&tickEvent, 0);
}
GPUDispatcher::~GPUDispatcher()
{
}
void
GPUDispatcher::regStats()
{
numKernelLaunched
.name(name() + ".num_kernel_launched")
.desc("number of kernel launched")
;
cyclesWaitingForDispatch
.name(name() + ".cycles_wait_dispatch")
.desc("number of cycles with outstanding wavefronts "
"that are waiting to be dispatched")
;
}
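/**
 * Return the task (HSA queue entry) for a given dispatch ID. The entry
 * must have been registered by a prior call to dispatch().
 */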
HSAQueueEntry*
GPUDispatcher::hsaTask(int disp_id)
{
assert(hsaQueueEntries.find(disp_id) != hsaQueueEntries.end());
return hsaQueueEntries[disp_id];
}
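/**
 * The command processor and shader are not known at construction time;
 * they are wired up after construction through the two setters below.
 */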
void
GPUDispatcher::setCommandProcessor(GPUCommandProcessor *gpu_cmd_proc)
{
gpuCmdProc = gpu_cmd_proc;
}
void
GPUDispatcher::setShader(Shader *new_shader)
{
shader = new_shader;
}
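/**
 * Checkpointing only records whether the dispatch tick event is
 * scheduled and, if so, the tick at which it will fire.
 */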
void
GPUDispatcher::serialize(CheckpointOut &cp) const
{
Tick event_tick = 0;
if (tickEvent.scheduled())
event_tick = tickEvent.when();
SERIALIZE_SCALAR(event_tick);
}
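/**
 * Restore the tick event saved above: any event scheduled at restore
 * time is descheduled first, then the checkpointed tick, if non-zero,
 * is rescheduled.
 */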
void
GPUDispatcher::unserialize(CheckpointIn &cp)
{
Tick event_tick;
if (tickEvent.scheduled())
deschedule(&tickEvent);
UNSERIALIZE_SCALAR(event_tick);
if (event_tick) {
schedule(&tickEvent, event_tick);
}
}
/**
* After all relevant HSA data structures have been traversed/extracted
* from memory by the CP, dispatch() is called on the dispatcher. This will
* schedule a dispatch event that, when triggered, will attempt to dispatch
* the WGs associated with the given task to the CUs.
*/
void
GPUDispatcher::dispatch(HSAQueueEntry *task)
{
++numKernelLaunched;
DPRINTF(GPUDisp, "launching kernel: %s, dispatch ID: %d\n",
task->kernelName(), task->dispatchId());
execIds.push(task->dispatchId());
dispatchActive = true;
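// remember the task so exec(), hsaTask(), and the completion path can
// look it up by dispatch ID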
hsaQueueEntries.emplace(task->dispatchId(), task);
if (!tickEvent.scheduled()) {
schedule(&tickEvent, curTick() + shader->clockPeriod());
}
}
void
GPUDispatcher::exec()
{
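// number of kernels in execIds that could not make progress this pass;
// once every queued kernel has failed we stop trying until the next tick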
int fail_count(0);
/**
 * There are potentially multiple outstanding kernel launches.
 * It is possible that the workgroups of a different kernel can
 * fit on the GPU even if another kernel's workgroups cannot.
 */
DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size());
if (execIds.size() > 0) {
++cyclesWaitingForDispatch;
}
/**
 * Dispatch work cannot start until the kernel's invalidate is
 * completely finished; hence, the kernel always initiates the
 * invalidate first and waits until it is done.
 */
while (execIds.size() > fail_count) {
int exec_id = execIds.front();
auto task = hsaQueueEntries[exec_id];
bool launched(false);
// an acquire (cache invalidate) is needed before dispatch can start
if (shader->impl_kern_launch_acq) {
// try to invalidate cache
shader->prepareInvalidate(task);
} else {
// kern launch acquire is not set, skip invalidate
task->markInvDone();
}
/**
* invalidate is still ongoing, put the kernel on the queue to
* retry later
*/
if (!task->isInvDone()) {
execIds.push(exec_id);
++fail_count;
DPRINTF(GPUDisp, "kernel %d failed to launch, due to [%d] pending"
" invalidate requests\n", exec_id, task->outstandingInvs());
// try the next kernel_id
execIds.pop();
continue;
}
// kernel invalidate is done, start workgroup dispatch
while (!task->dispComplete()) {
// update the thread context
shader->updateContext(task->contextId());
// attempt to dispatch workgroup
DPRINTF(GPUWgLatency, "Attempt Kernel Launch cycle:%d kernel:%d\n",
curTick(), exec_id);
if (!shader->dispatchWorkgroups(task)) {
/**
 * If we failed, try the next kernel; it may have smaller
 * workgroups. Put this one back on the queue to retry later.
 */
DPRINTF(GPUDisp, "kernel %d failed to launch\n", exec_id);
execIds.push(exec_id);
++fail_count;
break;
} else if (!launched) {
launched = true;
DPRINTF(GPUKernelInfo, "Launched kernel %d\n", exec_id);
}
}
// try the next kernel_id
execIds.pop();
}
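// drain the queue of completed kernel IDs, logging each one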
DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size());
while (doneIds.size()) {
DPRINTF(GPUDisp, "Kernel %d completed\n", doneIds.front());
doneIds.pop();
}
}
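/**
 * Return true when the given wavefront's kernel has exactly one
 * workgroup left to complete, i.e., the workgroup currently being
 * handled is the kernel's last.
 */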
bool
GPUDispatcher::isReachingKernelEnd(Wavefront *wf)
{
int kern_id = wf->kernId;
assert(hsaQueueEntries.find(kern_id) != hsaQueueEntries.end());
auto task = hsaQueueEntries[kern_id];
assert(task->dispatchId() == kern_id);
/**
 * Whether the next workgroup is the final one in the kernel;
 * +1 because we check before the completion is recorded.
 */
return (task->numWgCompleted() + 1 == task->numWgTotal());
}
/**
 * Update the counter of outstanding invalidate requests for the kernel.
 * kern_id: kernel id
 * val: +1/-1, increment or decrement the counter (default: -1)
 */
void
GPUDispatcher::updateInvCounter(int kern_id, int val) {
assert(val == -1 || val == 1);
auto task = hsaQueueEntries[kern_id];
task->updateOutstandingInvs(val);
// kernel invalidate is done, schedule dispatch work
if (task->isInvDone() && !tickEvent.scheduled()) {
schedule(&tickEvent, curTick() + shader->clockPeriod());
}
}
/**
 * Update the counter of outstanding writeback (WB) requests for the kernel.
 * kern_id: kernel id
 * val: +1/-1, increment or decrement the counter (default: -1)
 *
 * Returns true if all WBs are done for the kernel.
 */
bool
GPUDispatcher::updateWbCounter(int kern_id, int val) {
assert(val == -1 || val == 1);
auto task = hsaQueueEntries[kern_id];
task->updateOutstandingWbs(val);
// true: WB is done, false: WB is still ongoing
return (task->outstandingWbs() == 0);
}
/**
* get kernel's outstanding cache writeback requests
*/
int
GPUDispatcher::getOutstandingWbs(int kernId) {
auto task = hsaQueueEntries[kernId];
return task->outstandingWbs();
}
/**
* When an end program instruction detects that the last WF in
* a WG has completed it will call this method on the dispatcher.
* If we detect that this is the last WG for the given task, then
* we ring the completion signal, which is used by the CPU to
* synchronize with the GPU. The HSAPP is also notified that the
* task has completed so it can be removed from its task queues.
*/
void
GPUDispatcher::notifyWgCompl(Wavefront *wf)
{
int kern_id = wf->kernId;
DPRINTF(GPUDisp, "notify WgCompl %d\n", wf->wgId);
auto task = hsaQueueEntries[kern_id];
assert(task->dispatchId() == kern_id);
task->notifyWgCompleted();
DPRINTF(GPUWgLatency, "WG Complete cycle:%d wg:%d kernel:%d cu:%d\n",
curTick(), wf->wgId, kern_id, wf->computeUnit->cu_id);
if (task->numWgCompleted() == task->numWgTotal()) {
// Notify the HSA PP that this kernel is complete
gpuCmdProc->hsaPacketProc()
.finishPkt(task->dispPktPtr(), task->queueId());
if (task->completionSignal()) {
// The signal value lives 8 bytes past the actual
// signal handle in the runtime
Addr signal_addr = task->completionSignal() + sizeof(Addr);
DPRINTF(GPUDisp, "HSA AQL Kernel Complete! Triggering "
"completion signal: %x!\n", signal_addr);
/**
 * HACK: The semantics of the HSA signal are to decrement
 * the current signal value. We cheat here and read out
 * the value from main memory using a functional access,
 * then simply DMA back the decremented value. This is
 * because the DMA controller does not currently support
 * GPU atomics.
 */
auto tc = gpuCmdProc->system()->getThreadContext(0);
auto process = tc->getProcessPtr();
auto mem_state = process->getMemState();
auto &virt_proxy = mem_state->getVirtProxy();
TypedBufferArg<Addr> prev_signal(signal_addr);
prev_signal.copyIn(virt_proxy);
Addr *new_signal = new Addr;
*new_signal = (Addr)*prev_signal - 1;
gpuCmdProc->dmaWriteVirt(signal_addr, sizeof(Addr), nullptr,
new_signal, 0);
} else {
DPRINTF(GPUDisp, "HSA AQL Kernel Complete! No completion "
"signal\n");
}
DPRINTF(GPUWgLatency, "Kernel Complete ticks:%d kernel:%d\n",
curTick(), kern_id);
DPRINTF(GPUKernelInfo, "Completed kernel %d\n", kern_id);
}
if (!tickEvent.scheduled()) {
schedule(&tickEvent, curTick() + shader->clockPeriod());
}
}
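/**
 * Schedule a dispatch attempt one shader cycle from now unless a tick
 * is already pending, giving queued kernels another chance to launch.
 */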
void
GPUDispatcher::scheduleDispatch()
{
if (!tickEvent.scheduled()) {
schedule(&tickEvent, curTick() + shader->clockPeriod());
}
}
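/**
 * SimObject factory hook: instantiate the dispatcher from its generated
 * params class.
 */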
GPUDispatcher *GPUDispatcherParams::create()
{
return new GPUDispatcher(this);
}