/*
* Copyright (c) 2011-2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "gpu-compute/wavefront.hh"
#include "base/bitfield.hh"
#include "debug/GPUExec.hh"
#include "debug/GPUInitAbi.hh"
#include "debug/WavefrontStack.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/simple_pool_manager.hh"
#include "gpu-compute/vector_register_file.hh"
namespace gem5
{
Wavefront::Wavefront(const Params &p)
    : SimObject(p), wfSlotId(p.wf_slot_id), simdId(p.simd_id),
maxIbSize(p.max_ib_size), _gpuISA(*this),
vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
vmemInstsIssued(0), expInstsIssued(0), lgkmInstsIssued(0),
sleepCnt(0), barId(WFBarrier::InvalidID), stats(this)
{
lastTrace = 0;
execUnitId = -1;
status = S_STOPPED;
reservedVectorRegs = 0;
reservedScalarRegs = 0;
startVgprIndex = 0;
startSgprIndex = 0;
outstandingReqs = 0;
outstandingReqsWrGm = 0;
outstandingReqsWrLm = 0;
outstandingReqsRdGm = 0;
outstandingReqsRdLm = 0;
rdLmReqsInPipe = 0;
rdGmReqsInPipe = 0;
wrLmReqsInPipe = 0;
wrGmReqsInPipe = 0;
scalarRdGmReqsInPipe = 0;
scalarWrGmReqsInPipe = 0;
scalarOutstandingReqsRdGm = 0;
scalarOutstandingReqsWrGm = 0;
lastNonIdleTick = 0;
ldsChunk = nullptr;
memTraceBusy = 0;
oldVgprTcnt = 0xffffffffffffffffll;
oldDgprTcnt = 0xffffffffffffffffll;
oldVgpr.resize(p.wf_size);
pendingFetch = false;
dropFetch = false;
maxVgprs = 0;
maxSgprs = 0;
lastAddr.resize(p.wf_size);
workItemFlatId.resize(p.wf_size);
oldDgpr.resize(p.wf_size);
for (int i = 0; i < 3; ++i) {
workItemId[i].resize(p.wf_size);
}
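    // all lanes begin execution active; the exec mask is narrowed later
    // under divergent control flow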
_execMask.set();
rawDist.clear();
lastInstExec = 0;
vecReads.clear();
}
void
Wavefront::init()
{
reservedVectorRegs = 0;
reservedScalarRegs = 0;
startVgprIndex = 0;
startSgprIndex = 0;
scalarAlu = computeUnit->mapWaveToScalarAlu(this);
scalarAluGlobalIdx = computeUnit->mapWaveToScalarAluGlobalIdx(this);
globalMem = computeUnit->mapWaveToGlobalMem(this);
localMem = computeUnit->mapWaveToLocalMem(this);
scalarMem = computeUnit->mapWaveToScalarMem(this);
}
void
Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
{
int regInitIdx = 0;
// Iterate over all the init fields and check which
// bits are enabled. Useful information can be found here:
// https://github.com/ROCm-Developer-Tools/ROCm-ComputeABI-Doc/
// blob/master/AMDGPU-ABI.md
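    // Each enabled field below is written into one or more consecutive
    // 32-bit SGPRs; regInitIdx advances once per SGPR written, so 64-bit
    // values such as pointers consume two registers.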
for (int en_bit = 0; en_bit < NumScalarInitFields; ++en_bit) {
if (task->sgprBitEnabled(en_bit)) {
int physSgprIdx = 0;
uint32_t wiCount = 0;
uint32_t firstWave = 0;
int orderedAppendTerm = 0;
int numWfsInWg = 0;
uint32_t finalValue = 0;
Addr host_disp_pkt_addr = task->hostDispPktAddr();
Addr kernarg_addr = task->kernargAddr();
Addr hidden_priv_base(0);
switch (en_bit) {
case PrivateSegBuf:
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
task->amdQueue.scratch_resource_descriptor[0]);
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting PrivateSegBuffer: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx,
task->amdQueue.scratch_resource_descriptor[0]);
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
task->amdQueue.scratch_resource_descriptor[1]);
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting PrivateSegBuffer: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx,
task->amdQueue.scratch_resource_descriptor[1]);
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
task->amdQueue.scratch_resource_descriptor[2]);
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting PrivateSegBuffer: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx,
task->amdQueue.scratch_resource_descriptor[2]);
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
task->amdQueue.scratch_resource_descriptor[3]);
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting PrivateSegBuffer: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx,
task->amdQueue.scratch_resource_descriptor[3]);
break;
case DispatchPtr:
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
bits(host_disp_pkt_addr, 31, 0));
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting DispatchPtr: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx,
bits(host_disp_pkt_addr, 31, 0));
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
bits(host_disp_pkt_addr, 63, 32));
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting DispatchPtr: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx,
bits(host_disp_pkt_addr, 63, 32));
++regInitIdx;
break;
case QueuePtr:
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
bits(task->hostAMDQueueAddr, 31, 0));
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting QueuePtr: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx,
bits(task->hostAMDQueueAddr, 31, 0));
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
bits(task->hostAMDQueueAddr, 63, 32));
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting QueuePtr: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx,
bits(task->hostAMDQueueAddr, 63, 32));
++regInitIdx;
break;
case KernargSegPtr:
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
bits(kernarg_addr, 31, 0));
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting KernargSegPtr: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx,
bits(kernarg_addr, 31, 0));
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
bits(kernarg_addr, 63, 32));
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting KernargSegPtr: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx,
bits(kernarg_addr, 63, 32));
++regInitIdx;
break;
case DispatchId:
physSgprIdx
= computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
task->dispatchId());
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting DispatchId: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx,
task->dispatchId());
// Dispatch ID in gem5 is an int. Set upper 32-bits to zero.
physSgprIdx
= computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx, 0);
++regInitIdx;
break;
case FlatScratchInit:
physSgprIdx
= computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
(TheGpuISA::ScalarRegU32)(task->amdQueue
.scratch_backing_memory_location & 0xffffffff));
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting FlatScratch Addr: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx,
(TheGpuISA::ScalarRegU32)(task->amdQueue
.scratch_backing_memory_location & 0xffffffff));
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
                // This value should be sizeof(DWORD) aligned, that is,
                // 4-byte aligned
computeUnit->srf[simdId]->write(physSgprIdx,
task->amdQueue.scratch_workitem_byte_size);
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting FlatScratch size: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx,
task->amdQueue.scratch_workitem_byte_size);
/**
* Since flat scratch init is needed for this kernel, this
* kernel is going to have flat memory instructions and we
* need to initialize the hidden private base for this queue.
* scratch_resource_descriptor[0] has this queue's scratch
* base address. scratch_backing_memory_location has the
* offset to this queue's scratch base address from the
* SH_HIDDEN_PRIVATE_BASE_VMID. Ideally, we only require this
* queue's scratch base address for address calculation
* (stored in scratch_resource_descriptor[0]). But that
             * address calculation should be done by first finding the
* queue's scratch base address using the calculation
* "SH_HIDDEN_PRIVATE_BASE_VMID + offset". So, we initialize
* SH_HIDDEN_PRIVATE_BASE_VMID.
*
* For more details see:
* http://rocm-documentation.readthedocs.io/en/latest/
* ROCm_Compiler_SDK/ROCm-Native-ISA.html#flat-scratch
*
* https://github.com/ROCm-Developer-Tools/
* ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md
* #flat-addressing
*/
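                // Illustrative example (hypothetical values): with
                // scratch_resource_descriptor[0] = 0x05000000 and
                // scratch_resource_descriptor[1] = 0x12345678, only the
                // low 16 bits of descriptor[1] survive the mask, so
                // hidden_priv_base = 0x0000567805000000.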
hidden_priv_base =
(uint64_t)task->amdQueue.scratch_resource_descriptor[0] |
(((uint64_t)task->amdQueue.scratch_resource_descriptor[1]
& 0x000000000000ffff) << 32);
computeUnit->shader->initShHiddenPrivateBase(
hidden_priv_base,
task->amdQueue.scratch_backing_memory_location);
break;
case PrivateSegSize:
physSgprIdx
= computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
task->privMemPerItem());
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting private segment size: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx,
task->privMemPerItem());
break;
case GridWorkgroupCountX:
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
wiCount = ((task->gridSize(0) +
task->wgSize(0) - 1) /
task->wgSize(0));
computeUnit->srf[simdId]->write(physSgprIdx, wiCount);
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting num WG X: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx, wiCount);
break;
case GridWorkgroupCountY:
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
wiCount = ((task->gridSize(1) +
task->wgSize(1) - 1) /
task->wgSize(1));
computeUnit->srf[simdId]->write(physSgprIdx, wiCount);
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting num WG Y: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx, wiCount);
break;
case GridWorkgroupCountZ:
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
wiCount = ((task->gridSize(2) +
task->wgSize(2) - 1) /
task->wgSize(2));
computeUnit->srf[simdId]->write(physSgprIdx, wiCount);
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting num WG Z: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx, wiCount);
break;
case WorkgroupIdX:
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
workGroupId[0]);
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting WG ID X: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx, workGroupId[0]);
break;
case WorkgroupIdY:
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
workGroupId[1]);
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting WG ID Y: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx, workGroupId[1]);
break;
case WorkgroupIdZ:
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
workGroupId[2]);
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting WG ID Z: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx, workGroupId[2]);
break;
case PrivSegWaveByteOffset:
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
/**
* the compute_tmpring_size_wavesize specifies the number of
* kB allocated per wavefront, hence the multiplication by
* 1024.
*
* to get the per wavefront offset into the scratch
* memory, we also multiply this by the wfId. the wfId stored
* in the Wavefront class, however, is the wave ID within the
* WG, whereas here we need the global WFID because the
* scratch space will be divided amongst all waves in the
* kernel. to get the global ID we multiply the WGID by
* the WG size, then add the WFID of the wave within its WG.
*/
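                // Illustrative example (hypothetical values): a WG of
                // wgSz = 256 work-items holds four 64-lane waves; with
                // wgId = 2 and wfId = 1 the global WFID is 2 * 4 + 1 = 9,
                // and with compute_tmpring_size_wavesize = 2 the offset
                // is 1024 * 9 * 2 = 18432 bytes.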
computeUnit->srf[simdId]->write(physSgprIdx, 1024 *
(wgId * (wgSz / 64) + wfId) *
task->amdQueue.compute_tmpring_size_wavesize);
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting Private Seg Offset: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx,
1024 * (wgId * (wgSz / 64) + wfId) *
task->amdQueue.compute_tmpring_size_wavesize);
break;
case WorkgroupInfo:
firstWave = (wfId == 0) ? 1 : 0;
numWfsInWg = divCeil(wgSizeInWorkItems,
computeUnit->wfSize());
finalValue = firstWave << ((sizeof(uint32_t) * 8) - 1);
finalValue |= (orderedAppendTerm << 6);
finalValue |= numWfsInWg;
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->
write(physSgprIdx, finalValue);
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting WG Info: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx, finalValue);
break;
default:
fatal("SGPR enable bit %i not supported\n", en_bit);
break;
}
}
}
regInitIdx = 0;
// iterate over all the init fields and check which
// bits are enabled
for (int en_bit = 0; en_bit < NumVectorInitFields; ++en_bit) {
if (task->vgprBitEnabled(en_bit)) {
uint32_t physVgprIdx = 0;
TheGpuISA::VecRegContainerU32 raw_vgpr;
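            // raw_vgpr is a single wave-wide register container, reused
            // below to stage the per-lane work-item IDs before they are
            // written into the VRF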
switch (en_bit) {
case WorkitemIdX:
{
physVgprIdx = computeUnit->registerManager
->mapVgpr(this, regInitIdx);
TheGpuISA::VecElemU32 *vgpr_x
= raw_vgpr.as<TheGpuISA::VecElemU32>();
for (int lane = 0; lane < workItemId[0].size(); ++lane) {
vgpr_x[lane] = workItemId[0][lane];
}
computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
rawDist[regInitIdx] = 0;
++regInitIdx;
}
break;
case WorkitemIdY:
{
physVgprIdx = computeUnit->registerManager
->mapVgpr(this, regInitIdx);
TheGpuISA::VecElemU32 *vgpr_y
= raw_vgpr.as<TheGpuISA::VecElemU32>();
for (int lane = 0; lane < workItemId[1].size(); ++lane) {
vgpr_y[lane] = workItemId[1][lane];
}
computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
rawDist[regInitIdx] = 0;
++regInitIdx;
}
break;
case WorkitemIdZ:
{
physVgprIdx = computeUnit->registerManager->
mapVgpr(this, regInitIdx);
TheGpuISA::VecElemU32 *vgpr_z
= raw_vgpr.as<TheGpuISA::VecElemU32>();
for (int lane = 0; lane < workItemId[2].size(); ++lane) {
vgpr_z[lane] = workItemId[2][lane];
}
computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
rawDist[regInitIdx] = 0;
++regInitIdx;
}
break;
}
}
}
}
void
Wavefront::resizeRegFiles(int num_vregs, int num_sregs)
{
maxVgprs = num_vregs;
maxSgprs = num_sregs;
}
Wavefront::~Wavefront()
{
}
void
Wavefront::setStatus(status_e newStatus)
{
if (computeUnit->idleCUTimeout > 0) {
// Wavefront's status transitions to stalled or stopped
if ((newStatus == S_STOPPED || newStatus == S_STALLED ||
newStatus == S_WAITCNT || newStatus == S_BARRIER) &&
(status != newStatus)) {
computeUnit->idleWfs++;
assert(computeUnit->idleWfs <=
(computeUnit->shader->n_wf * computeUnit->numVectorALUs));
if (computeUnit->idleWfs ==
(computeUnit->shader->n_wf * computeUnit->numVectorALUs)) {
lastNonIdleTick = curTick();
}
// Wavefront's status transitions to an active state (from
// a stopped or stalled state)
} else if ((status == S_STOPPED || status == S_STALLED ||
status == S_WAITCNT || status == S_BARRIER) &&
(status != newStatus)) {
// if all WFs in the CU were idle then check if the idleness
// period exceeded the timeout threshold
if (computeUnit->idleWfs ==
(computeUnit->shader->n_wf * computeUnit->numVectorALUs)) {
panic_if((curTick() - lastNonIdleTick) >=
computeUnit->idleCUTimeout,
"CU%d has been idle for %d ticks at tick %d",
computeUnit->cu_id, computeUnit->idleCUTimeout,
curTick());
}
computeUnit->idleWfs--;
assert(computeUnit->idleWfs >= 0);
}
}
status = newStatus;
}
void
Wavefront::start(uint64_t _wf_dyn_id, Addr init_pc)
{
wfDynId = _wf_dyn_id;
_pc = init_pc;
status = S_RUNNING;
vecReads.resize(maxVgprs, 0);
}
bool
Wavefront::isGmInstruction(GPUDynInstPtr ii)
{
if (ii->isGlobalMem() ||
(ii->isFlat() && ii->executedAs() == enums::SC_GLOBAL)) {
return true;
}
return false;
}
bool
Wavefront::isLmInstruction(GPUDynInstPtr ii)
{
if (ii->isLocalMem() ||
(ii->isFlat() && ii->executedAs() == enums::SC_GROUP)) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstSleep()
{
if (instructionBuffer.empty())
return false;
GPUDynInstPtr ii = instructionBuffer.front();
if (ii->isSleep()) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstWaitcnt()
{
if (instructionBuffer.empty())
return false;
GPUDynInstPtr ii = instructionBuffer.front();
if (ii->isWaitcnt()) {
// waitcnt is a scalar
assert(ii->isScalar());
return true;
}
return false;
}
bool
Wavefront::isOldestInstScalarALU()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && ii->isScalar() && (ii->isNop() || ii->isReturn()
|| ii->isEndOfKernel() || ii->isBranch() || ii->isALU() ||
(ii->isKernArgSeg() && ii->isLoad()))) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstVectorALU()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && !ii->isScalar() && (ii->isNop() ||
ii->isReturn() || ii->isBranch() || ii->isALU() || ii->isEndOfKernel()
|| (ii->isKernArgSeg() && ii->isLoad()))) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstBarrier()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && ii->isBarrier()) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstGMem()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && !ii->isScalar() && ii->isGlobalMem()) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstScalarMem()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && ii->isScalar() && ii->isGlobalMem()) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstLMem()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && ii->isLocalMem()) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstPrivMem()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && ii->isPrivateSeg()) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstFlatMem()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && ii->isFlat()) {
return true;
}
return false;
}
bool
Wavefront::stopFetch()
{
for (auto it : instructionBuffer) {
GPUDynInstPtr ii = it;
if (ii->isReturn() || ii->isBranch() ||
ii->isEndOfKernel()) {
return true;
}
}
return false;
}
void
Wavefront::freeResources()
{
execUnitId = -1;
}
void
Wavefront::validateRequestCounters()
{
panic_if(wrGmReqsInPipe < 0 || rdGmReqsInPipe < 0 ||
wrLmReqsInPipe < 0 || rdLmReqsInPipe < 0 ||
outstandingReqs < 0,
"Negative requests in pipe for WF%d for slot%d"
" and SIMD%d: Rd GlobalMem Reqs=%d, Wr GlobalMem Reqs=%d,"
" Rd LocalMem Reqs=%d, Wr LocalMem Reqs=%d,"
" Outstanding Reqs=%d\n",
wfDynId, wfSlotId, simdId, rdGmReqsInPipe, wrGmReqsInPipe,
rdLmReqsInPipe, wrLmReqsInPipe, outstandingReqs);
}
void
Wavefront::reserveGmResource(GPUDynInstPtr ii)
{
if (!ii->isScalar()) {
if (ii->isLoad()) {
rdGmReqsInPipe++;
} else if (ii->isStore()) {
wrGmReqsInPipe++;
} else if (ii->isAtomic() || ii->isMemSync()) {
rdGmReqsInPipe++;
wrGmReqsInPipe++;
} else {
panic("Invalid memory operation!\n");
}
execUnitId = globalMem;
} else {
if (ii->isLoad()) {
scalarRdGmReqsInPipe++;
} else if (ii->isStore()) {
scalarWrGmReqsInPipe++;
} else if (ii->isAtomic() || ii->isMemSync()) {
scalarWrGmReqsInPipe++;
scalarRdGmReqsInPipe++;
} else {
panic("Invalid memory operation!\n");
}
execUnitId = scalarMem;
}
}
void
Wavefront::reserveLmResource(GPUDynInstPtr ii)
{
fatal_if(ii->isScalar(),
"Scalar instructions can not access Shared memory!!!");
if (ii->isLoad()) {
rdLmReqsInPipe++;
} else if (ii->isStore()) {
wrLmReqsInPipe++;
} else if (ii->isAtomic() || ii->isMemSync()) {
wrLmReqsInPipe++;
rdLmReqsInPipe++;
} else {
panic("Invalid memory operation!\n");
}
execUnitId = localMem;
}
std::vector<int>
Wavefront::reserveResources()
{
// vector of execution unit IDs to return to schedule stage
// this return is only used for debugging and an assertion...
std::vector<int> execUnitIds;
// Get current instruction
GPUDynInstPtr ii = instructionBuffer.front();
assert(ii);
// Single precision ALU or Branch or Return or Special instruction
if (ii->isALU() || ii->isSpecialOp() ||
ii->isBranch() || ii->isNop() ||
(ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
ii->isReturn() || ii->isEndOfKernel()) {
if (!ii->isScalar()) {
execUnitId = simdId;
} else {
execUnitId = scalarAluGlobalIdx;
}
// this is to enforce a fixed number of cycles per issue slot per SIMD
} else if (ii->isBarrier()) {
execUnitId = ii->isScalar() ? scalarAluGlobalIdx : simdId;
} else if (ii->isFlat()) {
assert(!ii->isScalar());
reserveLmResource(ii);
        // add execUnitId (reserved by reserveLmResource) to the list
        // before it is overwritten by reserveGmResource
execUnitIds.push_back(execUnitId);
flatLmUnitId = execUnitId;
reserveGmResource(ii);
flatGmUnitId = execUnitId;
execUnitIds.push_back(flatGmUnitId);
execUnitId = -1;
} else if (ii->isGlobalMem()) {
reserveGmResource(ii);
} else if (ii->isLocalMem()) {
reserveLmResource(ii);
} else if (ii->isPrivateSeg()) {
fatal_if(ii->isScalar(),
"Scalar instructions can not access Private memory!!!");
reserveGmResource(ii);
} else {
panic("reserveResources -> Couldn't process op!\n");
}
if (execUnitId != -1) {
execUnitIds.push_back(execUnitId);
}
assert(execUnitIds.size());
return execUnitIds;
}
void
Wavefront::exec()
{
// ---- Exit if wavefront is inactive ----------------------------- //
if (status == S_STOPPED || status == S_RETURNING ||
        status == S_STALLED || instructionBuffer.empty()) {
return;
}
if (status == S_WAITCNT) {
/**
* if this wave is in S_WAITCNT state, then
* it should enter exec() precisely one time
* before the waitcnts are satisfied, in order
* to execute the waitcnt instruction itself
* thus we assert that the waitcnt is the
* oldest instruction. if we enter exec() with
* active waitcnts, and we're not executing
* the waitcnt instruction, something must be
* wrong
*/
assert(isOldestInstWaitcnt());
}
// Get current instruction
GPUDynInstPtr ii = instructionBuffer.front();
const Addr old_pc = pc();
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
"(pc: %#x; seqNum: %d)\n", computeUnit->cu_id, simdId, wfSlotId,
wfDynId, ii->disassemble(), old_pc, ii->seqNum());
ii->execute(ii);
// delete the dynamic instruction from the pipeline map
computeUnit->deleteFromPipeMap(this);
// update the instruction stats in the CU
computeUnit->updateInstStats(ii);
// inform VRF of instruction execution to schedule write-back
// and scoreboard ready for registers
if (!ii->isScalar()) {
computeUnit->vrf[simdId]->waveExecuteInst(this, ii);
}
computeUnit->srf[simdId]->waveExecuteInst(this, ii);
computeUnit->shader->incVectorInstSrcOperand(ii->numSrcVecRegOperands());
computeUnit->shader->incVectorInstDstOperand(ii->numDstVecRegOperands());
computeUnit->stats.numInstrExecuted++;
stats.numInstrExecuted++;
computeUnit->instExecPerSimd[simdId]++;
computeUnit->stats.execRateDist.sample(
computeUnit->stats.totalCycles.value() -
computeUnit->lastExecCycle[simdId]);
computeUnit->lastExecCycle[simdId] =
computeUnit->stats.totalCycles.value();
if (lastInstExec) {
computeUnit->stats.instInterleave[simdId].
sample(computeUnit->instExecPerSimd[simdId] - lastInstExec);
}
lastInstExec = computeUnit->instExecPerSimd[simdId];
// want to track:
// number of reads that occur per value written
// vector RAW dependency tracking
for (const auto& srcVecOp : ii->srcVecRegOperands()) {
for (const auto& virtIdx : srcVecOp.virtIndices()) {
            // sample the RAW distance only if there is a recorded last
            // write for this register; registers never written by this
            // wave have no entry in rawDist
if (rawDist.find(virtIdx) != rawDist.end()) {
stats.vecRawDistance.sample(stats.numInstrExecuted.value() -
rawDist[virtIdx]);
}
// increment number of reads to this register
vecReads[virtIdx]++;
}
}
for (const auto& dstVecOp : ii->dstVecRegOperands()) {
for (const auto& virtIdx : dstVecOp.virtIndices()) {
// rawDist is set on writes, but will not be set for the first
// write to each physical register
if (rawDist.find(virtIdx) != rawDist.end()) {
// Sample the number of reads that were performed
stats.readsPerWrite.sample(vecReads[virtIdx]);
}
// on a write, reset count of reads to 0
vecReads[virtIdx] = 0;
rawDist[virtIdx] = stats.numInstrExecuted.value();
}
}
if (pc() == old_pc) {
// PC not modified by instruction, proceed to next
_gpuISA.advancePC(ii);
instructionBuffer.pop_front();
} else {
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave%d %s taken branch\n",
computeUnit->cu_id, simdId, wfSlotId, wfDynId,
ii->disassemble());
discardFetch();
}
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] (pc: %#x)\n",
computeUnit->cu_id, simdId, wfSlotId, wfDynId, pc());
    if (computeUnit->shader->hsail_mode == Shader::SIMT) {
const int num_active_lanes = execMask().count();
computeUnit->stats.controlFlowDivergenceDist.sample(num_active_lanes);
computeUnit->stats.numVecOpsExecuted += num_active_lanes;
if (ii->isF16() && ii->isALU()) {
if (ii->isF32() || ii->isF64()) {
fatal("Instruction is tagged as both (1) F16, and (2)"
"either F32 or F64.");
}
computeUnit->stats.numVecOpsExecutedF16 += num_active_lanes;
if (ii->isFMA()) {
computeUnit->stats.numVecOpsExecutedFMA16 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMAC()) {
computeUnit->stats.numVecOpsExecutedMAC16 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMAD()) {
computeUnit->stats.numVecOpsExecutedMAD16 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
}
if (ii->isF32() && ii->isALU()) {
if (ii->isF16() || ii->isF64()) {
fatal("Instruction is tagged as both (1) F32, and (2)"
"either F16 or F64.");
}
computeUnit->stats.numVecOpsExecutedF32 += num_active_lanes;
if (ii->isFMA()) {
computeUnit->stats.numVecOpsExecutedFMA32 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMAC()) {
computeUnit->stats.numVecOpsExecutedMAC32 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMAD()) {
computeUnit->stats.numVecOpsExecutedMAD32 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
}
if (ii->isF64() && ii->isALU()) {
if (ii->isF16() || ii->isF32()) {
fatal("Instruction is tagged as both (1) F64, and (2)"
"either F16 or F32.");
}
computeUnit->stats.numVecOpsExecutedF64 += num_active_lanes;
if (ii->isFMA()) {
computeUnit->stats.numVecOpsExecutedFMA64 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMAC()) {
computeUnit->stats.numVecOpsExecutedMAC64 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMAD()) {
computeUnit->stats.numVecOpsExecutedMAD64 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
}
if (isGmInstruction(ii)) {
computeUnit->stats.activeLanesPerGMemInstrDist.sample(
num_active_lanes);
} else if (isLmInstruction(ii)) {
computeUnit->stats.activeLanesPerLMemInstrDist.sample(
num_active_lanes);
}
}
/**
* we return here to avoid spurious errors related to flat insts
* and their address segment resolution.
*/
if (execMask().none() && ii->isFlat()) {
computeUnit->getTokenManager()->recvTokens(1);
return;
}
// Update Vector ALU pipeline and other resources
bool flat_as_gm = false;
bool flat_as_lm = false;
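    // flat instructions are charged to the global or local memory
    // pipeline according to the segment they resolved to at execute time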
if (ii->isFlat()) {
flat_as_gm = (ii->executedAs() == enums::SC_GLOBAL) ||
(ii->executedAs() == enums::SC_PRIVATE);
flat_as_lm = (ii->executedAs() == enums::SC_GROUP);
}
// Single precision ALU or Branch or Return or Special instruction
// Note, we use the same timing regardless of SP or DP ALU operation.
if (ii->isALU() || ii->isSpecialOp() ||
ii->isBranch() || ii->isNop() ||
(ii->isKernArgSeg() && ii->isLoad()) ||
ii->isArgSeg() || ii->isEndOfKernel() || ii->isReturn()) {
// this is to enforce a fixed number of cycles per issue slot per SIMD
if (!ii->isScalar()) {
computeUnit->vectorALUs[simdId].set(computeUnit->
cyclesToTicks(computeUnit->issuePeriod));
} else {
computeUnit->scalarALUs[scalarAlu].set(computeUnit->
cyclesToTicks(computeUnit->issuePeriod));
}
// Barrier on Scalar ALU
} else if (ii->isBarrier()) {
computeUnit->scalarALUs[scalarAlu].set(computeUnit->
cyclesToTicks(computeUnit->issuePeriod));
// GM or Flat as GM Load
} else if (ii->isLoad() && (ii->isGlobalMem() || flat_as_gm)) {
if (!ii->isScalar()) {
computeUnit->vrfToGlobalMemPipeBus.set(
computeUnit->cyclesToTicks(computeUnit->vrf_gm_bus_latency));
computeUnit->vectorGlobalMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
computeUnit->vrf_gm_bus_latency;
} else {
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
cyclesToTicks(computeUnit->srf_scm_bus_latency));
computeUnit->scalarMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
computeUnit->srf_scm_bus_latency;
}
// GM or Flat as GM Store
} else if (ii->isStore() && (ii->isGlobalMem() || flat_as_gm)) {
if (!ii->isScalar()) {
computeUnit->vrfToGlobalMemPipeBus.set(computeUnit->
cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
computeUnit->vectorGlobalMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
(2 * computeUnit->vrf_gm_bus_latency);
} else {
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
computeUnit->scalarMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
(2 * computeUnit->srf_scm_bus_latency);
}
} else if ((ii->isAtomic() || ii->isMemSync()) &&
(ii->isGlobalMem() || flat_as_gm)) {
if (!ii->isScalar()) {
computeUnit->vrfToGlobalMemPipeBus.set(computeUnit->
cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
computeUnit->vectorGlobalMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
(2 * computeUnit->vrf_gm_bus_latency);
} else {
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
computeUnit->scalarMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
(2 * computeUnit->srf_scm_bus_latency);
}
// LM or Flat as LM Load
} else if (ii->isLoad() && (ii->isLocalMem() || flat_as_lm)) {
computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
cyclesToTicks(computeUnit->vrf_lm_bus_latency));
computeUnit->vectorSharedMemUnit.
            set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
computeUnit->vrf_lm_bus_latency;
// LM or Flat as LM Store
} else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) {
computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
computeUnit->vectorSharedMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
(2 * computeUnit->vrf_lm_bus_latency);
// LM or Flat as LM, Atomic or MemFence
} else if ((ii->isAtomic() || ii->isMemSync()) &&
(ii->isLocalMem() || flat_as_lm)) {
computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
computeUnit->vectorSharedMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
(2 * computeUnit->vrf_lm_bus_latency);
} else {
panic("Bad instruction type!\n");
}
}
GPUDynInstPtr
Wavefront::nextInstr()
{
// Read next instruction from instruction buffer
GPUDynInstPtr ii = instructionBuffer.front();
// if the WF has been dispatched in the schedule stage then
// check the next oldest instruction for readiness
if (computeUnit->pipeMap.find(ii->seqNum()) !=
computeUnit->pipeMap.end()) {
if (instructionBuffer.size() > 1) {
auto it = instructionBuffer.begin() + 1;
return *it;
} else { // No new instructions to check
return nullptr;
}
}
return ii;
}
void
Wavefront::discardFetch()
{
instructionBuffer.clear();
dropFetch |= pendingFetch;
/**
* clear the fetch buffer for this wave in order to
* remove any stale inst data
*/
computeUnit->fetchStage.fetchUnit(simdId).flushBuf(wfSlotId);
}
bool
Wavefront::waitCntsSatisfied()
{
    // vmWaitCnt, expWaitCnt, and lgkmWaitCnt all uninitialized means the
    // waitcnt instruction has been dispatched but not executed yet: the
    // next instruction should be blocked until the waitcnt executes.
if (vmWaitCnt == -1 && expWaitCnt == -1 && lgkmWaitCnt == -1) {
return false;
}
/**
* If we reach here, that means an s_waitcnt instruction was executed
* and the waitcnts are set by the execute method. Check if waitcnts
* are satisfied.
*/
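    /**
     * Illustrative example: after "s_waitcnt vmcnt(0)" executes,
     * vmWaitCnt is 0, so the wave stays blocked here until every
     * outstanding vector memory instruction has completed
     * (vmemInstsIssued <= 0).
     */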
if (vmWaitCnt != -1) {
if (vmemInstsIssued > vmWaitCnt) {
// vmWaitCnt not satisfied
return false;
}
}
if (expWaitCnt != -1) {
if (expInstsIssued > expWaitCnt) {
// expWaitCnt not satisfied
return false;
}
}
if (lgkmWaitCnt != -1) {
if (lgkmInstsIssued > lgkmWaitCnt) {
// lgkmWaitCnt not satisfied
return false;
}
}
// if we get here all outstanding waitcnts must
// be satisfied, so we resume normal operation
clearWaitCnts();
return true;
}
bool
Wavefront::sleepDone()
{
assert(status == S_STALLED_SLEEP);
    // if the sleep count has not been set, then the sleep instruction has
    // not been executed yet, so we return false without changing the
    // wavefront status
if (sleepCnt == 0)
return false;
sleepCnt--;
if (sleepCnt != 0)
return false;
status = S_RUNNING;
return true;
}
void
Wavefront::setSleepTime(int sleep_time)
{
assert(sleepCnt == 0);
sleepCnt = sleep_time;
}
void
Wavefront::setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt)
{
// the scoreboard should have set the status
// to S_WAITCNT once a waitcnt instruction
// was marked as ready
assert(status == S_WAITCNT);
// waitcnt instruction shouldn't be sending
// negative counts
assert(vm_wait_cnt >= 0);
assert(exp_wait_cnt >= 0);
assert(lgkm_wait_cnt >= 0);
    // each count is capped by its field width in the instruction
    // encoding: vmcnt is 4 bits, expcnt 3 bits, and lgkmcnt 5 bits
assert(vm_wait_cnt <= 0xf);
assert(exp_wait_cnt <= 0x7);
assert(lgkm_wait_cnt <= 0x1f);
/**
* prior waitcnts should be satisfied,
* at which time the WF resets them
* back to -1, indicating they are no
* longer active
*/
assert(vmWaitCnt == -1);
assert(expWaitCnt == -1);
assert(lgkmWaitCnt == -1);
    /**
     * a count left at its maximum encodable
     * value (0xf for vmcnt, 0x7 for expcnt,
     * 0x1f for lgkmcnt) means that counter
     * is not being used by this waitcnt
     */
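    // Example: "s_waitcnt vmcnt(0) lgkmcnt(0)" leaves exp_wait_cnt at its
    // maximum of 0x7, so only vmWaitCnt and lgkmWaitCnt become active
    // below.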
if (vm_wait_cnt != 0xf)
vmWaitCnt = vm_wait_cnt;
if (exp_wait_cnt != 0x7)
expWaitCnt = exp_wait_cnt;
if (lgkm_wait_cnt != 0x1f)
lgkmWaitCnt = lgkm_wait_cnt;
}
void
Wavefront::clearWaitCnts()
{
// reset the waitcnts back to
// -1, indicating they are no
// longer valid
vmWaitCnt = -1;
expWaitCnt = -1;
lgkmWaitCnt = -1;
// resume running normally
status = S_RUNNING;
}
void
Wavefront::incVMemInstsIssued()
{
++vmemInstsIssued;
}
void
Wavefront::incExpInstsIssued()
{
++expInstsIssued;
}
void
Wavefront::incLGKMInstsIssued()
{
++lgkmInstsIssued;
}
void
Wavefront::decVMemInstsIssued()
{
--vmemInstsIssued;
}
void
Wavefront::decExpInstsIssued()
{
--expInstsIssued;
}
void
Wavefront::decLGKMInstsIssued()
{
--lgkmInstsIssued;
}
Addr
Wavefront::pc() const
{
return _pc;
}
void
Wavefront::pc(Addr new_pc)
{
_pc = new_pc;
}
VectorMask&
Wavefront::execMask()
{
return _execMask;
}
bool
Wavefront::execMask(int lane) const
{
return _execMask[lane];
}
void
Wavefront::freeRegisterFile()
{
/* clear busy registers */
for (int i=0; i < maxVgprs; i++) {
int vgprIdx = computeUnit->registerManager->mapVgpr(this, i);
computeUnit->vrf[simdId]->markReg(vgprIdx, false);
}
/* Free registers used by this wavefront */
uint32_t endIndex = (startVgprIndex + reservedVectorRegs - 1) %
computeUnit->vrf[simdId]->numRegs();
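    /* the VRF pool is allocated circularly, so the reserved region may
       wrap around the end of the physical register file */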
computeUnit->registerManager->vrfPoolMgrs[simdId]->
freeRegion(startVgprIndex, endIndex);
}
void
Wavefront::computeActualWgSz(HSAQueueEntry *task)
{
actualWgSzTotal = 1;
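    // clamp each dimension: a partial work-group at the edge of a grid
    // whose size is not a multiple of the WG size gets its actual,
    // smaller size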
for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
actualWgSz[d] = std::min(workGroupSz[d], gridSz[d]
- task->wgId(d) * workGroupSz[d]);
actualWgSzTotal *= actualWgSz[d];
}
}
void
Wavefront::barrierId(int bar_id)
{
assert(bar_id >= WFBarrier::InvalidID);
assert(bar_id < computeUnit->numBarrierSlots());
barId = bar_id;
}
int
Wavefront::barrierId() const
{
return barId;
}
bool
Wavefront::hasBarrier() const
{
return barId > WFBarrier::InvalidID;
}
void
Wavefront::releaseBarrier()
{
barId = WFBarrier::InvalidID;
}
Wavefront::WavefrontStats::WavefrontStats(statistics::Group *parent)
: statistics::Group(parent),
ADD_STAT(numInstrExecuted,
"number of instructions executed by this WF slot"),
ADD_STAT(schCycles, "number of cycles spent in schedule stage"),
ADD_STAT(schStalls, "number of cycles WF is stalled in SCH stage"),
ADD_STAT(schRfAccessStalls, "number of cycles wave selected in SCH but "
"RF denied adding instruction"),
ADD_STAT(schResourceStalls, "number of cycles stalled in sch by resource"
" not available"),
ADD_STAT(schOpdNrdyStalls, "number of cycles stalled in sch waiting for "
"RF reads to complete"),
ADD_STAT(schLdsArbStalls,
"number of cycles wave stalled due to LDS-VRF arbitration"),
// FIXME: the name of the WF needs to be unique
ADD_STAT(numTimesBlockedDueWAXDependencies, "number of times the wf's "
"instructions are blocked due to WAW or WAR dependencies"),
// FIXME: the name of the WF needs to be unique
ADD_STAT(numTimesBlockedDueRAWDependencies, "number of times the wf's "
"instructions are blocked due to RAW dependencies"),
ADD_STAT(vecRawDistance,
"Count of RAW distance in dynamic instructions for this WF"),
ADD_STAT(readsPerWrite, "Count of Vector reads per write for this WF")
{
vecRawDistance.init(0, 20, 1);
readsPerWrite.init(0, 4, 1);
}
} // namespace gem5