| /* |
| * Copyright (c) 2011-2017 Advanced Micro Devices, Inc. |
| * All rights reserved. |
| * |
| * For use for simulation and test purposes only |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright notice, |
| * this list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * |
| * 3. Neither the name of the copyright holder nor the names of its |
| * contributors may be used to endorse or promote products derived from this |
| * software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| * POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "gpu-compute/wavefront.hh" |
| |
| #include "base/bitfield.hh" |
| #include "debug/GPUExec.hh" |
| #include "debug/GPUInitAbi.hh" |
| #include "debug/WavefrontStack.hh" |
| #include "gpu-compute/compute_unit.hh" |
| #include "gpu-compute/gpu_dyn_inst.hh" |
| #include "gpu-compute/scalar_register_file.hh" |
| #include "gpu-compute/shader.hh" |
| #include "gpu-compute/simple_pool_manager.hh" |
| #include "gpu-compute/vector_register_file.hh" |
| |
| Wavefront::Wavefront(const Params &p) |
| : SimObject(p), wfSlotId(p.wf_slot_id), simdId(p.simdId), |
| maxIbSize(p.max_ib_size), _gpuISA(*this), |
| vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1), |
| vmemInstsIssued(0), expInstsIssued(0), lgkmInstsIssued(0), |
| sleepCnt(0), barId(WFBarrier::InvalidID), stats(this) |
| { |
| lastTrace = 0; |
| execUnitId = -1; |
| status = S_STOPPED; |
| reservedVectorRegs = 0; |
| reservedScalarRegs = 0; |
| startVgprIndex = 0; |
| startSgprIndex = 0; |
| outstandingReqs = 0; |
| outstandingReqsWrGm = 0; |
| outstandingReqsWrLm = 0; |
| outstandingReqsRdGm = 0; |
| outstandingReqsRdLm = 0; |
| rdLmReqsInPipe = 0; |
| rdGmReqsInPipe = 0; |
| wrLmReqsInPipe = 0; |
| wrGmReqsInPipe = 0; |
| scalarRdGmReqsInPipe = 0; |
| scalarWrGmReqsInPipe = 0; |
| scalarOutstandingReqsRdGm = 0; |
| scalarOutstandingReqsWrGm = 0; |
| lastNonIdleTick = 0; |
| ldsChunk = nullptr; |
| |
| memTraceBusy = 0; |
| oldVgprTcnt = 0xffffffffffffffffll; |
| oldDgprTcnt = 0xffffffffffffffffll; |
| oldVgpr.resize(p.wf_size); |
| |
| pendingFetch = false; |
| dropFetch = false; |
| maxVgprs = 0; |
| maxSgprs = 0; |
| |
| lastAddr.resize(p.wf_size); |
| workItemFlatId.resize(p.wf_size); |
| oldDgpr.resize(p.wf_size); |
| for (int i = 0; i < 3; ++i) { |
| workItemId[i].resize(p.wf_size); |
| } |
| |
| _execMask.set(); |
| rawDist.clear(); |
| lastInstExec = 0; |
| vecReads.clear(); |
| } |
| |
| void |
| Wavefront::init() |
| { |
| reservedVectorRegs = 0; |
| reservedScalarRegs = 0; |
| startVgprIndex = 0; |
| startSgprIndex = 0; |
| |
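| // ask the compute unit which execution resources (scalar ALU and |
| // global/local/scalar memory pipes) this wave slot maps to |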
| scalarAlu = computeUnit->mapWaveToScalarAlu(this); |
| scalarAluGlobalIdx = computeUnit->mapWaveToScalarAluGlobalIdx(this); |
| globalMem = computeUnit->mapWaveToGlobalMem(this); |
| localMem = computeUnit->mapWaveToLocalMem(this); |
| scalarMem = computeUnit->mapWaveToScalarMem(this); |
| } |
| |
| void |
| Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems) |
| { |
| int regInitIdx = 0; |
| |
| // iterate over all the init fields and check which |
| // bits are enabled |
| for (int en_bit = 0; en_bit < NumScalarInitFields; ++en_bit) { |
| |
| if (task->sgprBitEnabled(en_bit)) { |
| int physSgprIdx = 0; |
| uint32_t wiCount = 0; |
| uint32_t firstWave = 0; |
| int orderedAppendTerm = 0; |
| int numWfsInWg = 0; |
| uint32_t finalValue = 0; |
| Addr host_disp_pkt_addr = task->hostDispPktAddr(); |
| Addr kernarg_addr = task->kernargAddr(); |
| Addr hidden_priv_base(0); |
| |
| switch (en_bit) { |
| case PrivateSegBuf: |
| physSgprIdx = |
| computeUnit->registerManager->mapSgpr(this, regInitIdx); |
| computeUnit->srf[simdId]->write(physSgprIdx, |
| task->amdQueue.scratch_resource_descriptor[0]); |
| ++regInitIdx; |
| DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " |
| "Setting PrivateSegBuffer: s[%d] = %x\n", |
| computeUnit->cu_id, simdId, |
| wfSlotId, wfDynId, physSgprIdx, |
| task->amdQueue.scratch_resource_descriptor[0]); |
| |
| physSgprIdx = |
| computeUnit->registerManager->mapSgpr(this, regInitIdx); |
| computeUnit->srf[simdId]->write(physSgprIdx, |
| task->amdQueue.scratch_resource_descriptor[1]); |
| ++regInitIdx; |
| DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " |
| "Setting PrivateSegBuffer: s[%d] = %x\n", |
| computeUnit->cu_id, simdId, |
| wfSlotId, wfDynId, physSgprIdx, |
| task->amdQueue.scratch_resource_descriptor[1]); |
| |
| physSgprIdx = |
| computeUnit->registerManager->mapSgpr(this, regInitIdx); |
| computeUnit->srf[simdId]->write(physSgprIdx, |
| task->amdQueue.scratch_resource_descriptor[2]); |
| ++regInitIdx; |
| DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " |
| "Setting PrivateSegBuffer: s[%d] = %x\n", |
| computeUnit->cu_id, simdId, |
| wfSlotId, wfDynId, physSgprIdx, |
| task->amdQueue.scratch_resource_descriptor[2]); |
| |
| physSgprIdx = |
| computeUnit->registerManager->mapSgpr(this, regInitIdx); |
| computeUnit->srf[simdId]->write(physSgprIdx, |
| task->amdQueue.scratch_resource_descriptor[3]); |
| |
| ++regInitIdx; |
| DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " |
| "Setting PrivateSegBuffer: s[%d] = %x\n", |
| computeUnit->cu_id, simdId, |
| wfSlotId, wfDynId, physSgprIdx, |
| task->amdQueue.scratch_resource_descriptor[3]); |
| break; |
| case DispatchPtr: |
| physSgprIdx = |
| computeUnit->registerManager->mapSgpr(this, regInitIdx); |
| computeUnit->srf[simdId]->write(physSgprIdx, |
| bits(host_disp_pkt_addr, 31, 0)); |
| ++regInitIdx; |
| DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " |
| "Setting DispatchPtr: s[%d] = %x\n", |
| computeUnit->cu_id, simdId, |
| wfSlotId, wfDynId, physSgprIdx, |
| bits(host_disp_pkt_addr, 31, 0)); |
| |
| physSgprIdx = |
| computeUnit->registerManager->mapSgpr(this, regInitIdx); |
| computeUnit->srf[simdId]->write(physSgprIdx, |
| bits(host_disp_pkt_addr, 63, 32)); |
| DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " |
| "Setting DispatchPtr: s[%d] = %x\n", |
| computeUnit->cu_id, simdId, |
| wfSlotId, wfDynId, physSgprIdx, |
| bits(host_disp_pkt_addr, 63, 32)); |
| |
| ++regInitIdx; |
| break; |
| case QueuePtr: |
| physSgprIdx = |
| computeUnit->registerManager->mapSgpr(this, regInitIdx); |
| computeUnit->srf[simdId]->write(physSgprIdx, |
| bits(task->hostAMDQueueAddr, 31, 0)); |
| ++regInitIdx; |
| DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " |
| "Setting QueuePtr: s[%d] = %x\n", |
| computeUnit->cu_id, simdId, |
| wfSlotId, wfDynId, physSgprIdx, |
| bits(task->hostAMDQueueAddr, 31, 0)); |
| |
| physSgprIdx = |
| computeUnit->registerManager->mapSgpr(this, regInitIdx); |
| computeUnit->srf[simdId]->write(physSgprIdx, |
| bits(task->hostAMDQueueAddr, 63, 32)); |
| DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " |
| "Setting QueuePtr: s[%d] = %x\n", |
| computeUnit->cu_id, simdId, |
| wfSlotId, wfDynId, physSgprIdx, |
| bits(task->hostAMDQueueAddr, 63, 32)); |
| |
| ++regInitIdx; |
| break; |
| case KernargSegPtr: |
| physSgprIdx = |
| computeUnit->registerManager->mapSgpr(this, regInitIdx); |
| computeUnit->srf[simdId]->write(physSgprIdx, |
| bits(kernarg_addr, 31, 0)); |
| ++regInitIdx; |
| DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " |
| "Setting KernargSegPtr: s[%d] = %x\n", |
| computeUnit->cu_id, simdId, |
| wfSlotId, wfDynId, physSgprIdx, |
| bits(kernarg_addr, 31, 0)); |
| |
| physSgprIdx = |
| computeUnit->registerManager->mapSgpr(this, regInitIdx); |
| computeUnit->srf[simdId]->write(physSgprIdx, |
| bits(kernarg_addr, 63, 32)); |
| DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " |
| "Setting KernargSegPtr: s[%d] = %x\n", |
| computeUnit->cu_id, simdId, |
| wfSlotId, wfDynId, physSgprIdx, |
| bits(kernarg_addr, 63, 32)); |
| |
| ++regInitIdx; |
| break; |
| case FlatScratchInit: |
| physSgprIdx |
| = computeUnit->registerManager->mapSgpr(this, regInitIdx); |
| computeUnit->srf[simdId]->write(physSgprIdx, |
| (TheGpuISA::ScalarRegU32)(task->amdQueue |
| .scratch_backing_memory_location & 0xffffffff)); |
| ++regInitIdx; |
| DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " |
| "Setting FlatScratch Addr: s[%d] = %x\n", |
| computeUnit->cu_id, simdId, |
| wfSlotId, wfDynId, physSgprIdx, |
| (TheGpuISA::ScalarRegU32)(task->amdQueue |
| .scratch_backing_memory_location & 0xffffffff)); |
| |
| physSgprIdx = |
| computeUnit->registerManager->mapSgpr(this, regInitIdx); |
| // This value should be sizeof(DWORD) aligned, that is |
| // 4 byte aligned |
| computeUnit->srf[simdId]->write(physSgprIdx, |
| task->amdQueue.scratch_workitem_byte_size); |
| ++regInitIdx; |
| DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " |
| "Setting FlatScratch size: s[%d] = %x\n", |
| computeUnit->cu_id, simdId, |
| wfSlotId, wfDynId, physSgprIdx, |
| task->amdQueue.scratch_workitem_byte_size); |
| /** |
| * Since flat scratch init is needed for this kernel, this |
| * kernel is going to have flat memory instructions and we |
| * need to initialize the hidden private base for this queue. |
| * scratch_resource_descriptor[0] has this queue's scratch |
| * base address. scratch_backing_memory_location has the |
| * offset to this queue's scratch base address from the |
| * SH_HIDDEN_PRIVATE_BASE_VMID. Ideally, we only require this |
| * queue's scratch base address for address calculation |
| * (stored in scratch_resource_descriptor[0]). But that |
| * address calculation should be done by first finding the |
| * queue's scratch base address using the calculation |
| * "SH_HIDDEN_PRIVATE_BASE_VMID + offset". So, we initialize |
| * SH_HIDDEN_PRIVATE_BASE_VMID. |
| * |
| * For more details see: |
| * http://rocm-documentation.readthedocs.io/en/latest/ |
| * ROCm_Compiler_SDK/ROCm-Native-ISA.html#flat-scratch |
| * |
| * https://github.com/ROCm-Developer-Tools/ |
| * ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md |
| * #flat-addressing |
| */ |
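| // the 48-bit base is assembled from all 32 bits of descriptor |
| // word 0 plus the low 16 bits of descriptor word 1 |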
| hidden_priv_base = |
| (uint64_t)task->amdQueue.scratch_resource_descriptor[0] | |
| (((uint64_t)task->amdQueue.scratch_resource_descriptor[1] |
| & 0x000000000000ffff) << 32); |
| computeUnit->shader->initShHiddenPrivateBase( |
| hidden_priv_base, |
| task->amdQueue.scratch_backing_memory_location); |
| break; |
| case GridWorkgroupCountX: |
| physSgprIdx = |
| computeUnit->registerManager->mapSgpr(this, regInitIdx); |
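| // ceiling division: the number of workgroups needed to cover |
| // the grid in the X dimension |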
| wiCount = ((task->gridSize(0) + |
| task->wgSize(0) - 1) / |
| task->wgSize(0)); |
| computeUnit->srf[simdId]->write(physSgprIdx, wiCount); |
| |
| ++regInitIdx; |
| DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " |
| "Setting num WG X: s[%d] = %x\n", |
| computeUnit->cu_id, simdId, |
| wfSlotId, wfDynId, physSgprIdx, wiCount); |
| break; |
| case GridWorkgroupCountY: |
| physSgprIdx = |
| computeUnit->registerManager->mapSgpr(this, regInitIdx); |
| wiCount = ((task->gridSize(1) + |
| task->wgSize(1) - 1) / |
| task->wgSize(1)); |
| computeUnit->srf[simdId]->write(physSgprIdx, wiCount); |
| |
| ++regInitIdx; |
| DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " |
| "Setting num WG Y: s[%d] = %x\n", |
| computeUnit->cu_id, simdId, |
| wfSlotId, wfDynId, physSgprIdx, wiCount); |
| break; |
| case GridWorkgroupCountZ: |
| physSgprIdx = |
| computeUnit->registerManager->mapSgpr(this, regInitIdx); |
| wiCount = ((task->gridSize(2) + |
| task->wgSize(2) - 1) / |
| task->wgSize(2)); |
| computeUnit->srf[simdId]->write(physSgprIdx, wiCount); |
| |
| ++regInitIdx; |
| DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " |
| "Setting num WG Z: s[%d] = %x\n", |
| computeUnit->cu_id, simdId, |
| wfSlotId, wfDynId, physSgprIdx, wiCount); |
| break; |
| case WorkgroupIdX: |
| physSgprIdx = |
| computeUnit->registerManager->mapSgpr(this, regInitIdx); |
| computeUnit->srf[simdId]->write(physSgprIdx, |
| workGroupId[0]); |
| |
| ++regInitIdx; |
| DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " |
| "Setting WG ID X: s[%d] = %x\n", |
| computeUnit->cu_id, simdId, |
| wfSlotId, wfDynId, physSgprIdx, workGroupId[0]); |
| break; |
| case WorkgroupIdY: |
| physSgprIdx = |
| computeUnit->registerManager->mapSgpr(this, regInitIdx); |
| computeUnit->srf[simdId]->write(physSgprIdx, |
| workGroupId[1]); |
| |
| ++regInitIdx; |
| DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " |
| "Setting WG ID Y: s[%d] = %x\n", |
| computeUnit->cu_id, simdId, |
| wfSlotId, wfDynId, physSgprIdx, workGroupId[1]); |
| break; |
| case WorkgroupIdZ: |
| physSgprIdx = |
| computeUnit->registerManager->mapSgpr(this, regInitIdx); |
| computeUnit->srf[simdId]->write(physSgprIdx, |
| workGroupId[2]); |
| |
| ++regInitIdx; |
| DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " |
| "Setting WG ID Z: s[%d] = %x\n", |
| computeUnit->cu_id, simdId, |
| wfSlotId, wfDynId, physSgprIdx, workGroupId[2]); |
| break; |
| case PrivSegWaveByteOffset: |
| physSgprIdx = |
| computeUnit->registerManager->mapSgpr(this, regInitIdx); |
| /** |
| * the compute_tmpring_size_wavesize specifies the number of |
| * kB allocated per wavefront, hence the multiplication by |
| * 1024. |
| * |
| * to get the per wavefront offset into the scratch |
| * memory, we also multiply this by the wfId. the wfId stored |
| * in the Wavefront class, however, is the wave ID within the |
| * WG, whereas here we need the global WFID because the |
| * scratch space will be divided amongst all waves in the |
| * kernel. to get the global ID we multiply the WGID by |
| * the WG size, then add the WFID of the wave within its WG. |
| */ |
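| // e.g., with 64-lane waves and wgSz = 256, each WG holds 4 |
| // waves; wave 2 of WG 3 is global wave 3 * 4 + 2 = 14, so its |
| // offset is 1024 * 14 * compute_tmpring_size_wavesize bytes |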
| computeUnit->srf[simdId]->write(physSgprIdx, 1024 * |
| (wgId * (wgSz / 64) + wfId) * |
| task->amdQueue.compute_tmpring_size_wavesize); |
| |
| ++regInitIdx; |
| DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " |
| "Setting Private Seg Offset: s[%d] = %x\n", |
| computeUnit->cu_id, simdId, |
| wfSlotId, wfDynId, physSgprIdx, |
| 1024 * (wgId * (wgSz / 64) + wfId) * |
| task->amdQueue.compute_tmpring_size_wavesize); |
| break; |
| case WorkgroupInfo: |
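| // pack the workgroup info word: the MSB flags the first wave |
| // of the WG, bits [6+] carry the ordered-append term, and the |
| // low bits hold the number of waves in the WG |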
| firstWave = (wfId == 0) ? 1 : 0; |
| numWfsInWg = divCeil(wgSizeInWorkItems, |
| computeUnit->wfSize()); |
| finalValue = firstWave << ((sizeof(uint32_t) * 8) - 1); |
| finalValue |= (orderedAppendTerm << 6); |
| finalValue |= numWfsInWg; |
| physSgprIdx = |
| computeUnit->registerManager->mapSgpr(this, regInitIdx); |
| computeUnit->srf[simdId]-> |
| write(physSgprIdx, finalValue); |
| |
| ++regInitIdx; |
| DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " |
| "Setting WG Info: s[%d] = %x\n", |
| computeUnit->cu_id, simdId, |
| wfSlotId, wfDynId, physSgprIdx, finalValue); |
| break; |
| default: |
| fatal("SGPR enable bit %i not supported\n", en_bit); |
| break; |
| } |
| } |
| } |
| |
| regInitIdx = 0; |
| |
| // iterate over all the init fields and check which |
| // bits are enabled |
| for (int en_bit = 0; en_bit < NumVectorInitFields; ++en_bit) { |
| if (task->vgprBitEnabled(en_bit)) { |
| uint32_t physVgprIdx = 0; |
| TheGpuISA::VecRegContainerU32 raw_vgpr; |
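| // raw_vgpr is a staging container; as<VecElemU32>() views it |
| // as a lane-indexed array so the per-lane work-item IDs can be |
| // filled in before the full register is written to the VRF |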
| |
| switch (en_bit) { |
| case WorkitemIdX: |
| { |
| physVgprIdx = computeUnit->registerManager |
| ->mapVgpr(this, regInitIdx); |
| TheGpuISA::VecRegU32 vgpr_x |
| = raw_vgpr.as<TheGpuISA::VecElemU32>(); |
| |
| for (int lane = 0; lane < workItemId[0].size(); ++lane) { |
| vgpr_x[lane] = workItemId[0][lane]; |
| } |
| |
| computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr); |
| rawDist[regInitIdx] = 0; |
| ++regInitIdx; |
| } |
| break; |
| case WorkitemIdY: |
| { |
| physVgprIdx = computeUnit->registerManager |
| ->mapVgpr(this, regInitIdx); |
| TheGpuISA::VecRegU32 vgpr_y |
| = raw_vgpr.as<TheGpuISA::VecElemU32>(); |
| |
| for (int lane = 0; lane < workItemId[1].size(); ++lane) { |
| vgpr_y[lane] = workItemId[1][lane]; |
| } |
| |
| computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr); |
| rawDist[regInitIdx] = 0; |
| ++regInitIdx; |
| } |
| break; |
| case WorkitemIdZ: |
| { |
| physVgprIdx = computeUnit->registerManager-> |
| mapVgpr(this, regInitIdx); |
| TheGpuISA::VecRegU32 vgpr_z |
| = raw_vgpr.as<TheGpuISA::VecElemU32>(); |
| |
| for (int lane = 0; lane < workItemId[2].size(); ++lane) { |
| vgpr_z[lane] = workItemId[2][lane]; |
| } |
| |
| computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr); |
| rawDist[regInitIdx] = 0; |
| ++regInitIdx; |
| } |
| break; |
| } |
| } |
| } |
| } |
| |
| void |
| Wavefront::resizeRegFiles(int num_vregs, int num_sregs) |
| { |
| maxVgprs = num_vregs; |
| maxSgprs = num_sregs; |
| } |
| |
| Wavefront::~Wavefront() |
| { |
| } |
| |
| void |
| Wavefront::setStatus(status_e newStatus) |
| { |
| if (computeUnit->idleCUTimeout > 0) { |
| // Wavefront's status transitions to stalled or stopped |
| if ((newStatus == S_STOPPED || newStatus == S_STALLED || |
| newStatus == S_WAITCNT || newStatus == S_BARRIER) && |
| (status != newStatus)) { |
| computeUnit->idleWfs++; |
| assert(computeUnit->idleWfs <= |
| (computeUnit->shader->n_wf * computeUnit->numVectorALUs)); |
| if (computeUnit->idleWfs == |
| (computeUnit->shader->n_wf * computeUnit->numVectorALUs)) { |
| lastNonIdleTick = curTick(); |
| } |
| // Wavefront's status transitions to an active state (from |
| // a stopped or stalled state) |
| } else if ((status == S_STOPPED || status == S_STALLED || |
| status == S_WAITCNT || status == S_BARRIER) && |
| (status != newStatus)) { |
| // if all WFs in the CU were idle then check if the idleness |
| // period exceeded the timeout threshold |
| if (computeUnit->idleWfs == |
| (computeUnit->shader->n_wf * computeUnit->numVectorALUs)) { |
| panic_if((curTick() - lastNonIdleTick) >= |
| computeUnit->idleCUTimeout, |
| "CU%d has been idle for %d ticks at tick %d", |
| computeUnit->cu_id, computeUnit->idleCUTimeout, |
| curTick()); |
| } |
| computeUnit->idleWfs--; |
| assert(computeUnit->idleWfs >= 0); |
| } |
| } |
| status = newStatus; |
| } |
| |
| void |
| Wavefront::start(uint64_t _wf_dyn_id, Addr init_pc) |
| { |
| wfDynId = _wf_dyn_id; |
| _pc = init_pc; |
| |
| status = S_RUNNING; |
| |
| vecReads.resize(maxVgprs, 0); |
| } |
| |
| bool |
| Wavefront::isGmInstruction(GPUDynInstPtr ii) |
| { |
| if (ii->isGlobalMem() || |
| (ii->isFlat() && ii->executedAs() == Enums::SC_GLOBAL)) { |
| return true; |
| } |
| |
| return false; |
| } |
| |
| bool |
| Wavefront::isLmInstruction(GPUDynInstPtr ii) |
| { |
| if (ii->isLocalMem() || |
| (ii->isFlat() && ii->executedAs() == Enums::SC_GROUP)) { |
| return true; |
| } |
| |
| return false; |
| } |
| |
| bool |
| Wavefront::isOldestInstSleep() |
| { |
| if (instructionBuffer.empty()) |
| return false; |
| |
| GPUDynInstPtr ii = instructionBuffer.front(); |
| |
| if (ii->isSleep()) { |
| return true; |
| } |
| return false; |
| } |
| |
| bool |
| Wavefront::isOldestInstWaitcnt() |
| { |
| if (instructionBuffer.empty()) |
| return false; |
| |
| GPUDynInstPtr ii = instructionBuffer.front(); |
| |
| if (ii->isWaitcnt()) { |
| // waitcnt is a scalar |
| assert(ii->isScalar()); |
| return true; |
| } |
| |
| return false; |
| } |
| |
| bool |
| Wavefront::isOldestInstScalarALU() |
| { |
| assert(!instructionBuffer.empty()); |
| GPUDynInstPtr ii = instructionBuffer.front(); |
| |
| if (status != S_STOPPED && ii->isScalar() && (ii->isNop() || ii->isReturn() |
| || ii->isEndOfKernel() || ii->isBranch() || ii->isALU() || |
| (ii->isKernArgSeg() && ii->isLoad()))) { |
| return true; |
| } |
| |
| return false; |
| } |
| |
| bool |
| Wavefront::isOldestInstVectorALU() |
| { |
| assert(!instructionBuffer.empty()); |
| GPUDynInstPtr ii = instructionBuffer.front(); |
| |
| if (status != S_STOPPED && !ii->isScalar() && (ii->isNop() || |
| ii->isReturn() || ii->isBranch() || ii->isALU() || ii->isEndOfKernel() |
| || (ii->isKernArgSeg() && ii->isLoad()))) { |
| return true; |
| } |
| |
| return false; |
| } |
| |
| bool |
| Wavefront::isOldestInstBarrier() |
| { |
| assert(!instructionBuffer.empty()); |
| GPUDynInstPtr ii = instructionBuffer.front(); |
| |
| if (status != S_STOPPED && ii->isBarrier()) { |
| return true; |
| } |
| |
| return false; |
| } |
| |
| bool |
| Wavefront::isOldestInstGMem() |
| { |
| assert(!instructionBuffer.empty()); |
| GPUDynInstPtr ii = instructionBuffer.front(); |
| |
| if (status != S_STOPPED && !ii->isScalar() && ii->isGlobalMem()) { |
| return true; |
| } |
| |
| return false; |
| } |
| |
| bool |
| Wavefront::isOldestInstScalarMem() |
| { |
| assert(!instructionBuffer.empty()); |
| GPUDynInstPtr ii = instructionBuffer.front(); |
| |
| if (status != S_STOPPED && ii->isScalar() && ii->isGlobalMem()) { |
| return true; |
| } |
| |
| return false; |
| } |
| |
| bool |
| Wavefront::isOldestInstLMem() |
| { |
| assert(!instructionBuffer.empty()); |
| GPUDynInstPtr ii = instructionBuffer.front(); |
| |
| if (status != S_STOPPED && ii->isLocalMem()) { |
| return true; |
| } |
| |
| return false; |
| } |
| |
| bool |
| Wavefront::isOldestInstPrivMem() |
| { |
| assert(!instructionBuffer.empty()); |
| GPUDynInstPtr ii = instructionBuffer.front(); |
| |
| if (status != S_STOPPED && ii->isPrivateSeg()) { |
| return true; |
| } |
| |
| return false; |
| } |
| |
| bool |
| Wavefront::isOldestInstFlatMem() |
| { |
| assert(!instructionBuffer.empty()); |
| GPUDynInstPtr ii = instructionBuffer.front(); |
| |
| if (status != S_STOPPED && ii->isFlat()) { |
| return true; |
| } |
| |
| return false; |
| } |
| |
| bool |
| Wavefront::stopFetch() |
| { |
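| // fetch must stop once the buffer holds an instruction that may |
| // redirect the PC, since anything fetched past it could be from |
| // the wrong path |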
| for (auto it : instructionBuffer) { |
| GPUDynInstPtr ii = it; |
| if (ii->isReturn() || ii->isBranch() || |
| ii->isEndOfKernel()) { |
| return true; |
| } |
| } |
| |
| return false; |
| } |
| |
| void |
| Wavefront::freeResources() |
| { |
| execUnitId = -1; |
| } |
| |
| void |
| Wavefront::validateRequestCounters() |
| { |
| panic_if(wrGmReqsInPipe < 0 || rdGmReqsInPipe < 0 || |
| wrLmReqsInPipe < 0 || rdLmReqsInPipe < 0 || |
| outstandingReqs < 0, |
| "Negative requests in pipe for WF%d for slot%d" |
| " and SIMD%d: Rd GlobalMem Reqs=%d, Wr GlobalMem Reqs=%d," |
| " Rd LocalMem Reqs=%d, Wr LocalMem Reqs=%d," |
| " Outstanding Reqs=%d\n", |
| wfDynId, wfSlotId, simdId, rdGmReqsInPipe, wrGmReqsInPipe, |
| rdLmReqsInPipe, wrLmReqsInPipe, outstandingReqs); |
| } |
| |
| void |
| Wavefront::reserveGmResource(GPUDynInstPtr ii) |
| { |
| if (!ii->isScalar()) { |
| if (ii->isLoad()) { |
| rdGmReqsInPipe++; |
| } else if (ii->isStore()) { |
| wrGmReqsInPipe++; |
| } else if (ii->isAtomic() || ii->isMemSync()) { |
| rdGmReqsInPipe++; |
| wrGmReqsInPipe++; |
| } else { |
| panic("Invalid memory operation!\n"); |
| } |
| execUnitId = globalMem; |
| } else { |
| if (ii->isLoad()) { |
| scalarRdGmReqsInPipe++; |
| } else if (ii->isStore()) { |
| scalarWrGmReqsInPipe++; |
| } else if (ii->isAtomic() || ii->isMemSync()) { |
| scalarWrGmReqsInPipe++; |
| scalarRdGmReqsInPipe++; |
| } else { |
| panic("Invalid memory operation!\n"); |
| } |
| execUnitId = scalarMem; |
| } |
| } |
| |
| void |
| Wavefront::reserveLmResource(GPUDynInstPtr ii) |
| { |
| fatal_if(ii->isScalar(), |
| "Scalar instructions can not access Shared memory!!!"); |
| if (ii->isLoad()) { |
| rdLmReqsInPipe++; |
| } else if (ii->isStore()) { |
| wrLmReqsInPipe++; |
| } else if (ii->isAtomic() || ii->isMemSync()) { |
| wrLmReqsInPipe++; |
| rdLmReqsInPipe++; |
| } else { |
| panic("Invalid memory operation!\n"); |
| } |
| execUnitId = localMem; |
| } |
| |
| std::vector<int> |
| Wavefront::reserveResources() |
| { |
| // vector of execution unit IDs to return to schedule stage |
| // this return is only used for debugging and an assertion... |
| std::vector<int> execUnitIds; |
| |
| // Get current instruction |
| GPUDynInstPtr ii = instructionBuffer.front(); |
| assert(ii); |
| |
| // Single precision ALU or Branch or Return or Special instruction |
| if (ii->isALU() || ii->isSpecialOp() || |
| ii->isBranch() || ii->isNop() || |
| (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() || |
| ii->isReturn() || ii->isEndOfKernel()) { |
| if (!ii->isScalar()) { |
| execUnitId = simdId; |
| } else { |
| execUnitId = scalarAluGlobalIdx; |
| } |
| // this is to enforce a fixed number of cycles per issue slot per SIMD |
| } else if (ii->isBarrier()) { |
| execUnitId = ii->isScalar() ? scalarAluGlobalIdx : simdId; |
| } else if (ii->isFlat()) { |
| assert(!ii->isScalar()); |
| reserveLmResource(ii); |
| // add the execUnitId reserved by reserveLmResource to the list |
| // before it is overwritten by reserveGmResource |
| execUnitIds.push_back(execUnitId); |
| flatLmUnitId = execUnitId; |
| reserveGmResource(ii); |
| flatGmUnitId = execUnitId; |
| execUnitIds.push_back(flatGmUnitId); |
| execUnitId = -1; |
| } else if (ii->isGlobalMem()) { |
| reserveGmResource(ii); |
| } else if (ii->isLocalMem()) { |
| reserveLmResource(ii); |
| } else if (ii->isPrivateSeg()) { |
| fatal_if(ii->isScalar(), |
| "Scalar instructions can not access Private memory!!!"); |
| reserveGmResource(ii); |
| } else { |
| panic("reserveResources -> Couldn't process op!\n"); |
| } |
| |
| if (execUnitId != -1) { |
| execUnitIds.push_back(execUnitId); |
| } |
| assert(execUnitIds.size()); |
| return execUnitIds; |
| } |
| |
| void |
| Wavefront::exec() |
| { |
| // ---- Exit if wavefront is inactive ----------------------------- // |
| |
| if (status == S_STOPPED || status == S_RETURNING || |
| status == S_STALLED || instructionBuffer.empty()) { |
| return; |
| } |
| |
| if (status == S_WAITCNT) { |
| /** |
| * if this wave is in S_WAITCNT state, then |
| * it should enter exec() precisely one time |
| * before the waitcnts are satisfied, in order |
| * to execute the waitcnt instruction itself |
| * thus we assert that the waitcnt is the |
| * oldest instruction. if we enter exec() with |
| * active waitcnts, and we're not executing |
| * the waitcnt instruction, something must be |
| * wrong |
| */ |
| assert(isOldestInstWaitcnt()); |
| } |
| |
| // Get current instruction |
| |
| GPUDynInstPtr ii = instructionBuffer.front(); |
| |
| const Addr old_pc = pc(); |
| DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s " |
| "(pc: %#x; seqNum: %d)\n", computeUnit->cu_id, simdId, wfSlotId, |
| wfDynId, ii->disassemble(), old_pc, ii->seqNum()); |
| |
| ii->execute(ii); |
| // delete the dynamic instruction from the pipeline map |
| computeUnit->deleteFromPipeMap(this); |
| // update the instruction stats in the CU |
| computeUnit->updateInstStats(ii); |
| |
| // inform VRF of instruction execution to schedule write-back |
| // and scoreboard ready for registers |
| if (!ii->isScalar()) { |
| computeUnit->vrf[simdId]->waveExecuteInst(this, ii); |
| } |
| computeUnit->srf[simdId]->waveExecuteInst(this, ii); |
| |
| computeUnit->shader->incVectorInstSrcOperand(ii->numSrcVecOperands()); |
| computeUnit->shader->incVectorInstDstOperand(ii->numDstVecOperands()); |
| computeUnit->stats.numInstrExecuted++; |
| stats.numInstrExecuted++; |
| computeUnit->instExecPerSimd[simdId]++; |
| computeUnit->stats.execRateDist.sample( |
| computeUnit->stats.totalCycles.value() - |
| computeUnit->lastExecCycle[simdId]); |
| computeUnit->lastExecCycle[simdId] = |
| computeUnit->stats.totalCycles.value(); |
| |
| if (lastInstExec) { |
| computeUnit->stats.instInterleave[simdId]. |
| sample(computeUnit->instExecPerSimd[simdId] - lastInstExec); |
| } |
| lastInstExec = computeUnit->instExecPerSimd[simdId]; |
| |
| // want to track: |
| // number of reads that occur per value written |
| |
| // vector RAW dependency tracking |
| for (int i = 0; i < ii->getNumOperands(); i++) { |
| if (ii->isVectorRegister(i)) { |
| int vgpr = ii->getRegisterIndex(i, ii); |
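| // operands wider than 4 bytes span multiple consecutive 32-bit |
| // VGPRs, so track each physical register separately |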
| int nReg = ii->getOperandSize(i) <= 4 ? 1 : |
| ii->getOperandSize(i) / 4; |
| for (int n = 0; n < nReg; n++) { |
| if (ii->isSrcOperand(i)) { |
| // This check should never fail, but to be safe we check |
| if (rawDist.find(vgpr+n) != rawDist.end()) { |
| stats.vecRawDistance.sample( |
| stats.numInstrExecuted.value() - rawDist[vgpr+n]); |
| } |
| // increment number of reads to this register |
| vecReads[vgpr+n]++; |
| } else if (ii->isDstOperand(i)) { |
| // rawDist is set on writes, but will not be set |
| // for the first write to each physical register |
| if (rawDist.find(vgpr+n) != rawDist.end()) { |
| // sample the number of reads that were performed |
| stats.readsPerWrite.sample(vecReads[vgpr+n]); |
| } |
| // on a write, reset count of reads to 0 |
| vecReads[vgpr+n] = 0; |
| |
| rawDist[vgpr+n] = stats.numInstrExecuted.value(); |
| } |
| } |
| } |
| } |
| |
| if (pc() == old_pc) { |
| // PC not modified by instruction, proceed to next |
| _gpuISA.advancePC(ii); |
| instructionBuffer.pop_front(); |
| } else { |
| DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave%d %s taken branch\n", |
| computeUnit->cu_id, simdId, wfSlotId, wfDynId, |
| ii->disassemble()); |
| discardFetch(); |
| } |
| DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] (pc: %#x)\n", |
| computeUnit->cu_id, simdId, wfSlotId, wfDynId, pc()); |
| |
| if (computeUnit->shader->hsail_mode == Shader::SIMT) { |
| const int num_active_lanes = execMask().count(); |
| computeUnit->stats.controlFlowDivergenceDist.sample(num_active_lanes); |
| computeUnit->stats.numVecOpsExecuted += num_active_lanes; |
| |
| if (ii->isF16() && ii->isALU()) { |
| if (ii->isF32() || ii->isF64()) { |
| fatal("Instruction is tagged as both (1) F16, and (2)" |
| "either F32 or F64."); |
| } |
| computeUnit->stats.numVecOpsExecutedF16 += num_active_lanes; |
| if (ii->isFMA()) { |
| computeUnit->stats.numVecOpsExecutedFMA16 += num_active_lanes; |
| computeUnit->stats.numVecOpsExecutedTwoOpFP |
| += num_active_lanes; |
| } |
| else if (ii->isMAC()) { |
| computeUnit->stats.numVecOpsExecutedMAC16 += num_active_lanes; |
| computeUnit->stats.numVecOpsExecutedTwoOpFP |
| += num_active_lanes; |
| } |
| else if (ii->isMAD()) { |
| computeUnit->stats.numVecOpsExecutedMAD16 += num_active_lanes; |
| computeUnit->stats.numVecOpsExecutedTwoOpFP |
| += num_active_lanes; |
| } |
| } |
| if (ii->isF32() && ii->isALU()) { |
| if (ii->isF16() || ii->isF64()) { |
| fatal("Instruction is tagged as both (1) F32, and (2)" |
| "either F16 or F64."); |
| } |
| computeUnit->stats.numVecOpsExecutedF32 += num_active_lanes; |
| if (ii->isFMA()) { |
| computeUnit->stats.numVecOpsExecutedFMA32 += num_active_lanes; |
| computeUnit->stats.numVecOpsExecutedTwoOpFP |
| += num_active_lanes; |
| } |
| else if (ii->isMAC()) { |
| computeUnit->stats.numVecOpsExecutedMAC32 += num_active_lanes; |
| computeUnit->stats.numVecOpsExecutedTwoOpFP |
| += num_active_lanes; |
| } |
| else if (ii->isMAD()) { |
| computeUnit->stats.numVecOpsExecutedMAD32 += num_active_lanes; |
| computeUnit->stats.numVecOpsExecutedTwoOpFP |
| += num_active_lanes; |
| } |
| } |
| if (ii->isF64() && ii->isALU()) { |
| if (ii->isF16() || ii->isF32()) { |
| fatal("Instruction is tagged as both (1) F64, and (2)" |
| "either F16 or F32."); |
| } |
| computeUnit->stats.numVecOpsExecutedF64 += num_active_lanes; |
| if (ii->isFMA()) { |
| computeUnit->stats.numVecOpsExecutedFMA64 += num_active_lanes; |
| computeUnit->stats.numVecOpsExecutedTwoOpFP |
| += num_active_lanes; |
| } |
| else if (ii->isMAC()) { |
| computeUnit->stats.numVecOpsExecutedMAC64 += num_active_lanes; |
| computeUnit->stats.numVecOpsExecutedTwoOpFP |
| += num_active_lanes; |
| } |
| else if (ii->isMAD()) { |
| computeUnit->stats.numVecOpsExecutedMAD64 += num_active_lanes; |
| computeUnit->stats.numVecOpsExecutedTwoOpFP |
| += num_active_lanes; |
| } |
| } |
| if (isGmInstruction(ii)) { |
| computeUnit->stats.activeLanesPerGMemInstrDist.sample( |
| num_active_lanes); |
| } else if (isLmInstruction(ii)) { |
| computeUnit->stats.activeLanesPerLMemInstrDist.sample( |
| num_active_lanes); |
| } |
| } |
| |
| /** |
| * we return here to avoid spurious errors related to flat insts |
| * and their address segment resolution. |
| */ |
| if (execMask().none() && ii->isFlat()) { |
| computeUnit->getTokenManager()->recvTokens(1); |
| return; |
| } |
| |
| // Update Vector ALU pipeline and other resources |
| bool flat_as_gm = false; |
| bool flat_as_lm = false; |
| if (ii->isFlat()) { |
| flat_as_gm = (ii->executedAs() == Enums::SC_GLOBAL) || |
| (ii->executedAs() == Enums::SC_PRIVATE); |
| flat_as_lm = (ii->executedAs() == Enums::SC_GROUP); |
| } |
| |
| // Single precision ALU or Branch or Return or Special instruction |
| // Note, we use the same timing regardless of SP or DP ALU operation. |
| if (ii->isALU() || ii->isSpecialOp() || |
| ii->isBranch() || ii->isNop() || |
| (ii->isKernArgSeg() && ii->isLoad()) || |
| ii->isArgSeg() || ii->isEndOfKernel() || ii->isReturn()) { |
| // this is to enforce a fixed number of cycles per issue slot per SIMD |
| if (!ii->isScalar()) { |
| computeUnit->vectorALUs[simdId].set(computeUnit-> |
| cyclesToTicks(computeUnit->issuePeriod)); |
| } else { |
| computeUnit->scalarALUs[scalarAlu].set(computeUnit-> |
| cyclesToTicks(computeUnit->issuePeriod)); |
| } |
| // Barrier on Scalar ALU |
| } else if (ii->isBarrier()) { |
| computeUnit->scalarALUs[scalarAlu].set(computeUnit-> |
| cyclesToTicks(computeUnit->issuePeriod)); |
| // GM or Flat as GM Load |
| } else if (ii->isLoad() && (ii->isGlobalMem() || flat_as_gm)) { |
| if (!ii->isScalar()) { |
| computeUnit->vrfToGlobalMemPipeBus.set( |
| computeUnit->cyclesToTicks(computeUnit->vrf_gm_bus_latency)); |
| computeUnit->vectorGlobalMemUnit. |
| set(computeUnit->cyclesToTicks(computeUnit->issuePeriod)); |
| computeUnit->stats.instCyclesVMemPerSimd[simdId] += |
| computeUnit->vrf_gm_bus_latency; |
| } else { |
| computeUnit->srfToScalarMemPipeBus.set(computeUnit-> |
| cyclesToTicks(computeUnit->srf_scm_bus_latency)); |
| computeUnit->scalarMemUnit. |
| set(computeUnit->cyclesToTicks(computeUnit->issuePeriod)); |
| computeUnit->stats.instCyclesScMemPerSimd[simdId] += |
| computeUnit->srf_scm_bus_latency; |
| } |
| // GM or Flat as GM Store |
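| // stores and atomics occupy the bus for twice the base latency, |
| // presumably to account for moving the write data as well |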
| } else if (ii->isStore() && (ii->isGlobalMem() || flat_as_gm)) { |
| if (!ii->isScalar()) { |
| computeUnit->vrfToGlobalMemPipeBus.set(computeUnit-> |
| cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency))); |
| computeUnit->vectorGlobalMemUnit. |
| set(computeUnit->cyclesToTicks(computeUnit->issuePeriod)); |
| computeUnit->stats.instCyclesVMemPerSimd[simdId] += |
| (2 * computeUnit->vrf_gm_bus_latency); |
| } else { |
| computeUnit->srfToScalarMemPipeBus.set(computeUnit-> |
| cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency))); |
| computeUnit->scalarMemUnit. |
| set(computeUnit->cyclesToTicks(computeUnit->issuePeriod)); |
| computeUnit->stats.instCyclesScMemPerSimd[simdId] += |
| (2 * computeUnit->srf_scm_bus_latency); |
| } |
| } else if ((ii->isAtomic() || ii->isMemSync()) && |
| (ii->isGlobalMem() || flat_as_gm)) { |
| if (!ii->isScalar()) { |
| computeUnit->vrfToGlobalMemPipeBus.set(computeUnit-> |
| cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency))); |
| computeUnit->vectorGlobalMemUnit. |
| set(computeUnit->cyclesToTicks(computeUnit->issuePeriod)); |
| computeUnit->stats.instCyclesVMemPerSimd[simdId] += |
| (2 * computeUnit->vrf_gm_bus_latency); |
| } else { |
| computeUnit->srfToScalarMemPipeBus.set(computeUnit-> |
| cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency))); |
| computeUnit->scalarMemUnit. |
| set(computeUnit->cyclesToTicks(computeUnit->issuePeriod)); |
| computeUnit->stats.instCyclesScMemPerSimd[simdId] += |
| (2 * computeUnit->srf_scm_bus_latency); |
| } |
| // LM or Flat as LM Load |
| } else if (ii->isLoad() && (ii->isLocalMem() || flat_as_lm)) { |
| computeUnit->vrfToLocalMemPipeBus.set(computeUnit-> |
| cyclesToTicks(computeUnit->vrf_lm_bus_latency)); |
| computeUnit->vectorSharedMemUnit. |
| set(computeUnit->cyclesToTicks(computeUnit->issuePeriod)); |
| computeUnit->stats.instCyclesLdsPerSimd[simdId] += |
| computeUnit->vrf_lm_bus_latency; |
| // LM or Flat as LM Store |
| } else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) { |
| computeUnit->vrfToLocalMemPipeBus.set(computeUnit-> |
| cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency))); |
| computeUnit->vectorSharedMemUnit. |
| set(computeUnit->cyclesToTicks(computeUnit->issuePeriod)); |
| computeUnit->stats.instCyclesLdsPerSimd[simdId] += |
| (2 * computeUnit->vrf_lm_bus_latency); |
| // LM or Flat as LM, Atomic or MemFence |
| } else if ((ii->isAtomic() || ii->isMemSync()) && |
| (ii->isLocalMem() || flat_as_lm)) { |
| computeUnit->vrfToLocalMemPipeBus.set(computeUnit-> |
| cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency))); |
| computeUnit->vectorSharedMemUnit. |
| set(computeUnit->cyclesToTicks(computeUnit->issuePeriod)); |
| computeUnit->stats.instCyclesLdsPerSimd[simdId] += |
| (2 * computeUnit->vrf_lm_bus_latency); |
| } else { |
| panic("Bad instruction type!\n"); |
| } |
| } |
| |
| GPUDynInstPtr |
| Wavefront::nextInstr() |
| { |
| // Read next instruction from instruction buffer |
| GPUDynInstPtr ii = instructionBuffer.front(); |
| // if the WF has been dispatched in the schedule stage then |
| // check the next oldest instruction for readiness |
| if (computeUnit->pipeMap.find(ii->seqNum()) != |
| computeUnit->pipeMap.end()) { |
| if (instructionBuffer.size() > 1) { |
| auto it = instructionBuffer.begin() + 1; |
| return *it; |
| } else { // No new instructions to check |
| return nullptr; |
| } |
| } |
| return ii; |
| } |
| |
| void |
| Wavefront::discardFetch() |
| { |
| instructionBuffer.clear(); |
| dropFetch |= pendingFetch; |
| |
| /** |
| * clear the fetch buffer for this wave in order to |
| * remove any stale inst data |
| */ |
| computeUnit->fetchStage.fetchUnit(simdId).flushBuf(wfSlotId); |
| } |
| |
| bool |
| Wavefront::waitCntsSatisfied() |
| { |
| // All three waitcnts uninitialized means the waitcnt instruction |
| // has been dispatched but not executed yet: the next instruction |
| // should be blocked until the waitcnt is executed. |
| if (vmWaitCnt == -1 && expWaitCnt == -1 && lgkmWaitCnt == -1) { |
| return false; |
| } |
| |
| /** |
| * If we reach here, that means an s_waitcnt instruction was executed |
| * and the waitcnts are set by the execute method. Check if waitcnts |
| * are satisfied. |
| */ |
| if (vmWaitCnt != -1) { |
| if (vmemInstsIssued > vmWaitCnt) { |
| // vmWaitCnt not satisfied |
| return false; |
| } |
| } |
| |
| if (expWaitCnt != -1) { |
| if (expInstsIssued > expWaitCnt) { |
| // expWaitCnt not satisfied |
| return false; |
| } |
| } |
| |
| if (lgkmWaitCnt != -1) { |
| if (lgkmInstsIssued > lgkmWaitCnt) { |
| // lgkmWaitCnt not satisfied |
| return false; |
| } |
| } |
| |
| // if we get here all outstanding waitcnts must |
| // be satisfied, so we resume normal operation |
| clearWaitCnts(); |
| |
| return true; |
| } |
| |
| bool |
| Wavefront::sleepDone() |
| { |
| assert(status == S_STALLED_SLEEP); |
| |
| // if the sleep count has not been set, then the sleep instruction has not |
| // been executed yet, so we return false without changing the wavefront |
| // status |
| if (sleepCnt == 0) |
| return false; |
| |
| sleepCnt--; |
| if (sleepCnt != 0) |
| return false; |
| |
| status = S_RUNNING; |
| return true; |
| } |
| |
| void |
| Wavefront::setSleepTime(int sleep_time) |
| { |
| assert(sleepCnt == 0); |
| sleepCnt = sleep_time; |
| } |
| |
| void |
| Wavefront::setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt) |
| { |
| // the scoreboard should have set the status |
| // to S_WAITCNT once a waitcnt instruction |
| // was marked as ready |
| assert(status == S_WAITCNT); |
| |
| // waitcnt instruction shouldn't be sending |
| // negative counts |
| assert(vm_wait_cnt >= 0); |
| assert(exp_wait_cnt >= 0); |
| assert(lgkm_wait_cnt >= 0); |
| // the counts have different widths in the encoding: vm_cnt is |
| // 4 bits (max 0xf), exp_cnt is 3 bits (max 0x7), and lgkm_cnt |
| // is 5 bits (max 0x1f) |
| assert(vm_wait_cnt <= 0xf); |
| assert(exp_wait_cnt <= 0x7); |
| assert(lgkm_wait_cnt <= 0x1f); |
| |
| /** |
| * prior waitcnts should be satisfied, |
| * at which time the WF resets them |
| * back to -1, indicating they are no |
| * longer active |
| */ |
| assert(vmWaitCnt == -1); |
| assert(expWaitCnt == -1); |
| assert(lgkmWaitCnt == -1); |
| |
| /** |
| * a count equal to the maximum encodable value of its field |
| * (0xf for vm, 0x7 for exp, 0x1f for lgkm) means that count |
| * is not being used |
| */ |
| if (vm_wait_cnt != 0xf) |
| vmWaitCnt = vm_wait_cnt; |
| |
| if (exp_wait_cnt != 0x7) |
| expWaitCnt = exp_wait_cnt; |
| |
| if (lgkm_wait_cnt != 0x1f) |
| lgkmWaitCnt = lgkm_wait_cnt; |
| } |
| |
| void |
| Wavefront::clearWaitCnts() |
| { |
| // reset the waitcnts back to |
| // -1, indicating they are no |
| // longer valid |
| vmWaitCnt = -1; |
| expWaitCnt = -1; |
| lgkmWaitCnt = -1; |
| |
| // resume running normally |
| status = S_RUNNING; |
| } |
| |
| void |
| Wavefront::incVMemInstsIssued() |
| { |
| ++vmemInstsIssued; |
| } |
| |
| void |
| Wavefront::incExpInstsIssued() |
| { |
| ++expInstsIssued; |
| } |
| |
| void |
| Wavefront::incLGKMInstsIssued() |
| { |
| ++lgkmInstsIssued; |
| } |
| |
| void |
| Wavefront::decVMemInstsIssued() |
| { |
| --vmemInstsIssued; |
| } |
| |
| void |
| Wavefront::decExpInstsIssued() |
| { |
| --expInstsIssued; |
| } |
| |
| void |
| Wavefront::decLGKMInstsIssued() |
| { |
| --lgkmInstsIssued; |
| } |
| |
| Addr |
| Wavefront::pc() const |
| { |
| return _pc; |
| } |
| |
| void |
| Wavefront::pc(Addr new_pc) |
| { |
| _pc = new_pc; |
| } |
| |
| VectorMask& |
| Wavefront::execMask() |
| { |
| return _execMask; |
| } |
| |
| bool |
| Wavefront::execMask(int lane) const |
| { |
| return _execMask[lane]; |
| } |
| |
| void |
| Wavefront::freeRegisterFile() |
| { |
| /* clear busy registers */ |
| for (int i=0; i < maxVgprs; i++) { |
| int vgprIdx = computeUnit->registerManager->mapVgpr(this, i); |
| computeUnit->vrf[simdId]->markReg(vgprIdx, false); |
| } |
| |
| /* Free registers used by this wavefront */ |
| uint32_t endIndex = (startVgprIndex + reservedVectorRegs - 1) % |
| computeUnit->vrf[simdId]->numRegs(); |
| computeUnit->registerManager->vrfPoolMgrs[simdId]-> |
| freeRegion(startVgprIndex, endIndex); |
| } |
| |
| void |
| Wavefront::computeActualWgSz(HSAQueueEntry *task) |
| { |
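| // clamp each dimension so workgroups at the edge of the grid |
| // only cover the remaining work items (partial workgroups) |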
| actualWgSzTotal = 1; |
| for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) { |
| actualWgSz[d] = std::min(workGroupSz[d], gridSz[d] |
| - task->wgId(d) * workGroupSz[d]); |
| actualWgSzTotal *= actualWgSz[d]; |
| } |
| } |
| |
| void |
| Wavefront::barrierId(int bar_id) |
| { |
| assert(bar_id >= WFBarrier::InvalidID); |
| assert(bar_id < computeUnit->numBarrierSlots()); |
| barId = bar_id; |
| } |
| |
| int |
| Wavefront::barrierId() const |
| { |
| return barId; |
| } |
| |
| bool |
| Wavefront::hasBarrier() const |
| { |
| return barId > WFBarrier::InvalidID; |
| } |
| |
| void |
| Wavefront::releaseBarrier() |
| { |
| barId = WFBarrier::InvalidID; |
| } |
| |
| Wavefront::WavefrontStats::WavefrontStats(Stats::Group *parent) |
| : Stats::Group(parent), |
| ADD_STAT(numInstrExecuted, |
| "number of instructions executed by this WF slot"), |
| ADD_STAT(schCycles, "number of cycles spent in schedule stage"), |
| ADD_STAT(schStalls, "number of cycles WF is stalled in SCH stage"), |
| ADD_STAT(schRfAccessStalls, "number of cycles wave selected in SCH but " |
| "RF denied adding instruction"), |
| ADD_STAT(schResourceStalls, "number of cycles stalled in sch by resource" |
| " not available"), |
| ADD_STAT(schOpdNrdyStalls, "number of cycles stalled in sch waiting for " |
| "RF reads to complete"), |
| ADD_STAT(schLdsArbStalls, |
| "number of cycles wave stalled due to LDS-VRF arbitration"), |
| // FIXME: the name of the WF needs to be unique |
| ADD_STAT(numTimesBlockedDueWAXDependencies, "number of times the wf's " |
| "instructions are blocked due to WAW or WAR dependencies"), |
| // FIXME: the name of the WF needs to be unique |
| ADD_STAT(numTimesBlockedDueRAWDependencies, "number of times the wf's " |
| "instructions are blocked due to RAW dependencies"), |
| ADD_STAT(vecRawDistance, |
| "Count of RAW distance in dynamic instructions for this WF"), |
| ADD_STAT(readsPerWrite, "Count of Vector reads per write for this WF") |
| { |
| vecRawDistance.init(0, 20, 1); |
| readsPerWrite.init(0, 4, 1); |
| } |