| /* |
| * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. |
| * All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright notice, |
| * this list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * |
| * 3. Neither the name of the copyright holder nor the names of its |
| * contributors may be used to endorse or promote products derived from this |
| * software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| * POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "gpu-compute/schedule_stage.hh" |
| |
| #include <unordered_set> |
| |
| #include "base/compiler.hh" |
| #include "debug/GPUSched.hh" |
| #include "debug/GPUVRF.hh" |
| #include "gpu-compute/compute_unit.hh" |
| #include "gpu-compute/gpu_static_inst.hh" |
| #include "gpu-compute/scalar_register_file.hh" |
| #include "gpu-compute/vector_register_file.hh" |
| #include "gpu-compute/wavefront.hh" |
| |
| namespace gem5 |
| { |
| |
| ScheduleStage::ScheduleStage(const ComputeUnitParams &p, ComputeUnit &cu, |
| ScoreboardCheckToSchedule &from_scoreboard_check, |
| ScheduleToExecute &to_execute) |
| : computeUnit(cu), fromScoreboardCheck(from_scoreboard_check), |
| toExecute(to_execute), |
| _name(cu.name() + ".ScheduleStage"), |
| vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false), |
| scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false), |
| locMemBusRdy(false), locMemIssueRdy(false), stats(&cu, cu.numExeUnits()) |
| { |
| for (int j = 0; j < cu.numExeUnits(); ++j) { |
| scheduler.emplace_back(p); |
| } |
| wavesInSch.clear(); |
| schList.resize(cu.numExeUnits()); |
| for (auto &dq : schList) { |
| dq.clear(); |
| } |
| } |
| |
| ScheduleStage::~ScheduleStage() |
| { |
| scheduler.clear(); |
| wavesInSch.clear(); |
| schList.clear(); |
| } |
| |
| void |
| ScheduleStage::init() |
| { |
| fatal_if(scheduler.size() != fromScoreboardCheck.numReadyLists(), |
| "Scheduler should have same number of entries as CU's readyList"); |
| for (int j = 0; j < computeUnit.numExeUnits(); ++j) { |
| scheduler[j].bindList(&fromScoreboardCheck.readyWFs(j)); |
| } |
| |
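| // The scheduling logic below assumes exactly one global and one |
| // shared (LDS) vector memory pipe per CU. |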
| assert(computeUnit.numVectorGlobalMemUnits == 1); |
| assert(computeUnit.numVectorSharedMemUnits == 1); |
| } |
| |
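| // Perform the per-cycle work of the schedule stage: prune the ready |
| // lists, pick waves and schedule their register-file reads into |
| // schList, promote waves whose reads have completed, fill the |
| // dispatch list, arbitrate the VRF->LDS bus, schedule destination |
| // operand writes, and finally reserve execution resources. |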
| void |
| ScheduleStage::exec() |
| { |
| toExecute.reset(); |
| |
| // Update readyList |
| for (int j = 0; j < computeUnit.numExeUnits(); ++j) { |
| /** |
| * Remove any wave that already has an instruction present in SCH |
| * waiting for RF reads to complete. This prevents out-of-order |
| * execution within a wave. |
| */ |
| fromScoreboardCheck.updateReadyList(j); |
| for (auto wIt = fromScoreboardCheck.readyWFs(j).begin(); |
| wIt != fromScoreboardCheck.readyWFs(j).end();) { |
| if (wavesInSch.find((*wIt)->wfDynId) != wavesInSch.end()) { |
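| // This wave already has an instruction in flight in SCH; |
| // null out and erase its ready-list entry so it cannot be |
| // picked again this cycle. |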
| *wIt = nullptr; |
| wIt = fromScoreboardCheck.readyWFs(j).erase(wIt); |
| } else { |
| wIt++; |
| } |
| } |
| } |
| |
| // Attempt to add another wave for each EXE type to the schList |
| // queues. VMEM resources are iterated first, effectively giving |
| // VMEM priority over VALU for scheduling reads of operands to the |
| // RFs. Scalar memory units are iterated after VMEM. |
| |
| // Iterate VMEM and SMEM |
| int firstMemUnit = computeUnit.firstMemUnit(); |
| int lastMemUnit = computeUnit.lastMemUnit(); |
| for (int j = firstMemUnit; j <= lastMemUnit; j++) { |
| int readyListSize = fromScoreboardCheck.readyWFs(j).size(); |
| // If no wave is ready to be scheduled on the execution resource |
| // then skip scheduling for this execution resource |
| if (!readyListSize) { |
| stats.rdyListEmpty[j]++; |
| continue; |
| } |
| stats.rdyListNotEmpty[j]++; |
| |
| // Pick a wave and attempt to add it to schList |
| Wavefront *wf = scheduler[j].chooseWave(); |
| GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front(); |
| assert(gpu_dyn_inst); |
| if (!addToSchList(j, gpu_dyn_inst)) { |
| // For waves not added to schList, increment count of cycles |
| // this wave spends in SCH stage. |
| wf->stats.schCycles++; |
| stats.addToSchListStalls[j]++; |
| } else { |
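| // Track issued-instruction counts used for waitcnt-style |
| // accounting: scalar and LDS ops count toward LGKM, vector |
| // global ops toward VMEM (FLAT ops count toward both), and |
| // global stores additionally toward the EXP count below. |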
| if (gpu_dyn_inst->isScalar() || gpu_dyn_inst->isGroupSeg()) { |
| wf->incLGKMInstsIssued(); |
| } else { |
| wf->incVMemInstsIssued(); |
| if (gpu_dyn_inst->isFlat()) { |
| wf->incLGKMInstsIssued(); |
| } |
| } |
| if (gpu_dyn_inst->isStore() && gpu_dyn_inst->isGlobalSeg()) { |
| wf->incExpInstsIssued(); |
| } |
| } |
| } |
| |
| // Iterate everything else |
| for (int j = 0; j < computeUnit.numExeUnits(); ++j) { |
| // skip the memory units; they were handled above |
| if (j >= firstMemUnit && j <= lastMemUnit) { |
| continue; |
| } |
| int readyListSize = fromScoreboardCheck.readyWFs(j).size(); |
| // If no wave is ready to be scheduled on the execution resource |
| // then skip scheduling for this execution resource |
| if (!readyListSize) { |
| stats.rdyListEmpty[j]++; |
| continue; |
| } |
| stats.rdyListNotEmpty[j]++; |
| |
| // Pick a wave and attempt to add it to schList |
| Wavefront *wf = scheduler[j].chooseWave(); |
| GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front(); |
| assert(gpu_dyn_inst); |
| if (!addToSchList(j, gpu_dyn_inst)) { |
| // For waves not added to schList, increment count of cycles |
| // this wave spends in SCH stage. |
| wf->stats.schCycles++; |
| stats.addToSchListStalls[j]++; |
| } |
| } |
| |
| // At this point, the schList queue for each EXE type may contain |
| // multiple waves, in order of age (oldest to youngest). |
| // Waves may be in RFBUSY, indicating they are waiting for their |
| // register reads to complete, or in RFREADY, indicating they are |
| // candidates for the dispatchList and execution. |
| |
| // Iterate schList queues and check if any of the waves have finished |
| // reading their operands, moving those waves to RFREADY status |
| checkRfOperandReadComplete(); |
| |
| // Fill the dispatch list with the oldest wave of each EXE type |
| // that is ready to execute. A wave is picked if its schList status |
| // is RFREADY and it passes resource-ready checks similar to those |
| // performed in the scoreboard check (SCB) stage. |
| fillDispatchList(); |
| |
| // Resource arbitration on waves in the dispatchList. |
| // Losing waves are re-inserted into the schList at a position |
| // determined by wave age. |
| |
| // Arbitrate access to the VRF->LDS bus |
| arbitrateVrfToLdsBus(); |
| |
| // Schedule write operations to the register files |
| scheduleRfDestOperands(); |
| |
| // Lastly, reserve resources for waves that are ready to execute. |
| reserveResources(); |
| } |
| |
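| // Transition the dispatchList slot for unitId to status s, |
| // recording the instruction that will occupy it. |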
| void |
| ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s, |
| const GPUDynInstPtr &gpu_dyn_inst) |
| { |
| toExecute.dispatchTransition(gpu_dyn_inst, unitId, s); |
| } |
| |
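| // Transition the dispatchList slot for unitId to status s with no |
| // associated instruction (used here for EMPTY transitions). |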
| void |
| ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s) |
| { |
| toExecute.dispatchTransition(unitId, s); |
| } |
| |
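| // Attempt to schedule the destination (write) operands of |
| // gpu_dyn_inst to the VRF and SRF. Returns true and schedules the |
| // writes if both register files can accept them; otherwise records |
| // the stall reason and returns false. |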
| bool |
| ScheduleStage::schedRfWrites(int exeType, const GPUDynInstPtr &gpu_dyn_inst) |
| { |
| assert(gpu_dyn_inst); |
| Wavefront *wf = gpu_dyn_inst->wavefront(); |
| bool accessVrfWr = true; |
| if (!gpu_dyn_inst->isScalar()) { |
| accessVrfWr = computeUnit.vrf[wf->simdId] |
| ->canScheduleWriteOperands(wf, gpu_dyn_inst); |
| } |
| bool accessSrfWr = computeUnit.srf[wf->simdId] |
| ->canScheduleWriteOperands(wf, gpu_dyn_inst); |
| bool accessRf = accessVrfWr && accessSrfWr; |
| if (accessRf) { |
| if (!gpu_dyn_inst->isScalar()) { |
| computeUnit.vrf[wf->simdId]->scheduleWriteOperands(wf, |
| gpu_dyn_inst); |
| } |
| computeUnit.srf[wf->simdId]->scheduleWriteOperands(wf, gpu_dyn_inst); |
| return true; |
| } else { |
| stats.rfAccessStalls[SCH_RF_ACCESS_NRDY]++; |
| if (!accessSrfWr) { |
| stats.rfAccessStalls[SCH_SRF_WR_ACCESS_NRDY]++; |
| } |
| if (!accessVrfWr) { |
| stats.rfAccessStalls[SCH_VRF_WR_ACCESS_NRDY]++; |
| } |
| |
| // Increment stall counts for WF |
| wf->stats.schStalls++; |
| wf->stats.schRfAccessStalls++; |
| } |
| return false; |
| } |
| |
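| // For each non-EMPTY, non-SKIP dispatchList slot, try to allocate |
| // register-file write resources for the wave's instruction. Waves |
| // that fail are pushed back into schList and their slot is emptied. |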
| void |
| ScheduleStage::scheduleRfDestOperands() |
| { |
| for (int j = 0; j < computeUnit.numExeUnits(); ++j) { |
| if (toExecute.dispatchStatus(j) == EMPTY || |
| toExecute.dispatchStatus(j) == SKIP) { |
| continue; |
| } |
| |
| // get the wave on dispatch list and attempt to allocate write |
| // resources in the RFs |
| const GPUDynInstPtr &gpu_dyn_inst = toExecute.readyInst(j); |
| assert(gpu_dyn_inst); |
| Wavefront *wf = gpu_dyn_inst->wavefront(); |
| if (!schedRfWrites(j, gpu_dyn_inst)) { |
| reinsertToSchList(j, gpu_dyn_inst); |
| doDispatchListTransition(j, EMPTY); |
| // if this is a flat inst, also transition the LM pipe to empty |
| // Note: since FLAT/LM arbitration occurs before scheduling |
| // destination operands to the RFs, it is possible that an LM |
| // instruction lost arbitration, but would have been able to |
| // pass the RF destination operand check here, and execute |
| // instead of the FLAT. |
| if (wf->instructionBuffer.front()->isFlat()) { |
| assert(toExecute.dispatchStatus(wf->localMem) |
| == SKIP); |
| doDispatchListTransition(wf->localMem, EMPTY); |
| } |
| } |
| } |
| } |
| |
| bool |
| ScheduleStage::addToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst) |
| { |
| // Attempt to add the wave to the schList if the VRF can support the |
| // wave's next instruction |
| assert(gpu_dyn_inst); |
| Wavefront *wf = gpu_dyn_inst->wavefront(); |
| bool accessVrf = true; |
| if (!gpu_dyn_inst->isScalar()) { |
| accessVrf = computeUnit.vrf[wf->simdId] |
| ->canScheduleReadOperands(wf, gpu_dyn_inst); |
| } |
| bool accessSrf = computeUnit.srf[wf->simdId] |
| ->canScheduleReadOperands(wf, gpu_dyn_inst); |
| // If RFs can support instruction, add to schList in RFBUSY state, |
| // place wave in wavesInSch and pipeMap, and schedule Rd/Wr operands |
| // to the VRF |
| bool accessRf = accessVrf && accessSrf; |
| if (accessRf) { |
| DPRINTF(GPUSched, "schList[%d]: Adding: SIMD[%d] WV[%d]: %d: %s\n", |
| exeType, wf->simdId, wf->wfDynId, |
| gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble()); |
| |
| computeUnit.insertInPipeMap(wf); |
| wavesInSch.emplace(wf->wfDynId); |
| schList.at(exeType).push_back(std::make_pair(gpu_dyn_inst, RFBUSY)); |
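| // If the wave's oldest instruction is a barrier, waitcnt, or |
| // sleep, update the wave's status accordingly. |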
| if (wf->isOldestInstBarrier() && wf->hasBarrier()) { |
| wf->setStatus(Wavefront::S_BARRIER); |
| } |
| if (wf->isOldestInstWaitcnt()) { |
| wf->setStatus(Wavefront::S_WAITCNT); |
| } |
| if (wf->isOldestInstSleep()) { |
| wf->setStatus(Wavefront::S_STALLED_SLEEP); |
| } |
| if (!gpu_dyn_inst->isScalar()) { |
| computeUnit.vrf[wf->simdId] |
| ->scheduleReadOperands(wf, gpu_dyn_inst); |
| } |
| computeUnit.srf[wf->simdId]->scheduleReadOperands(wf, gpu_dyn_inst); |
| |
| DPRINTF(GPUSched, "schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n", |
| exeType, wf->simdId, wf->wfDynId, |
| gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble()); |
| return true; |
| } else { |
| // Number of stall cycles due to RF access denied |
| stats.rfAccessStalls[SCH_RF_ACCESS_NRDY]++; |
| // Count number of denials due to each reason |
| // Multiple items may contribute to the denied request |
| if (!accessVrf) { |
| stats.rfAccessStalls[SCH_VRF_RD_ACCESS_NRDY]++; |
| } |
| if (!accessSrf) { |
| stats.rfAccessStalls[SCH_SRF_RD_ACCESS_NRDY]++; |
| } |
| |
| // Increment stall counts for WF |
| wf->stats.schStalls++; |
| wf->stats.schRfAccessStalls++; |
| DPRINTF(GPUSched, "schList[%d]: Could not add: " |
| "SIMD[%d] WV[%d]: %d: %s\n", |
| exeType, wf->simdId, wf->wfDynId, |
| gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble()); |
| } |
| return false; |
| } |
| |
| void |
| ScheduleStage::reinsertToSchList(int exeType, |
| const GPUDynInstPtr &gpu_dyn_inst) |
| { |
| // Insert the wave's instruction into the schList for the specified |
| // exeType. Waves are kept in age order, with the oldest wave at |
| // the front of the schList. |
| assert(gpu_dyn_inst); |
| auto schIter = schList.at(exeType).begin(); |
| while (schIter != schList.at(exeType).end() |
| && schIter->first->wfDynId < gpu_dyn_inst->wfDynId) { |
| schIter++; |
| } |
| schList.at(exeType).insert(schIter, std::make_pair(gpu_dyn_inst, RFREADY)); |
| } |
| |
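| // Snapshot, for the next cycle, the readiness of the scalar, |
| // global, and local memory pipelines and their register-file |
| // buses. The cached *Rdy flags are consumed by dispatchReady(). |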
| void |
| ScheduleStage::checkMemResources() |
| { |
| // Check for resource availability in the next cycle |
| scalarMemBusRdy = false; |
| scalarMemIssueRdy = false; |
| // check if there is an SRF->Global Memory bus available |
| if (computeUnit.srfToScalarMemPipeBus.rdy(Cycles(1))) { |
| scalarMemBusRdy = true; |
| } |
| // check if we can issue a scalar memory instruction |
| if (computeUnit.scalarMemUnit.rdy(Cycles(1))) { |
| scalarMemIssueRdy = true; |
| } |
| |
| glbMemBusRdy = false; |
| glbMemIssueRdy = false; |
| // check if there is a VRF->Global Memory bus available |
| if (computeUnit.vrfToGlobalMemPipeBus.rdy(Cycles(1))) { |
| glbMemBusRdy = true; |
| } |
| // check if we can issue a Global memory instruction |
| if (computeUnit.vectorGlobalMemUnit.rdy(Cycles(1))) { |
| glbMemIssueRdy = true; |
| } |
| |
| locMemBusRdy = false; |
| locMemIssueRdy = false; |
| // check if there is a VRF->LDS bus available |
| if (computeUnit.vrfToLocalMemPipeBus.rdy(Cycles(1))) { |
| locMemBusRdy = true; |
| } |
| // check if we can issue a LDS instruction |
| if (computeUnit.vectorSharedMemUnit.rdy(Cycles(1))) { |
| locMemIssueRdy = true; |
| } |
| } |
| |
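| // Check whether gpu_dyn_inst can be dispatched next cycle, based on |
| // its instruction type and the resource readiness computed by |
| // checkMemResources(). Each resource that is not ready increments |
| // the matching dispNrdyStalls bucket. |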
| bool |
| ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst) |
| { |
| assert(gpu_dyn_inst); |
| Wavefront *wf = gpu_dyn_inst->wavefront(); |
| vectorAluRdy = false; |
| scalarAluRdy = false; |
| // check for available vector/scalar ALUs in the next cycle |
| if (computeUnit.vectorALUs[wf->simdId].rdy(Cycles(1))) { |
| vectorAluRdy = true; |
| } |
| if (computeUnit.scalarALUs[wf->scalarAlu].rdy(Cycles(1))) { |
| scalarAluRdy = true; |
| } |
| |
| if (gpu_dyn_inst->isNop()) { |
| // S_NOP requires SALU. V_NOP requires VALU. |
| // TODO: Scalar NOP does not require SALU in hardware, |
| // and is executed out of IB directly. |
| if (gpu_dyn_inst->isScalar() && !scalarAluRdy) { |
| stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++; |
| return false; |
| } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) { |
| stats.dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++; |
| return false; |
| } |
| } else if (gpu_dyn_inst->isEndOfKernel()) { |
| // EndPgm instruction |
| if (gpu_dyn_inst->isScalar() && !scalarAluRdy) { |
| stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++; |
| return false; |
| } |
| } else if (gpu_dyn_inst->isBarrier() || gpu_dyn_inst->isBranch() |
| || gpu_dyn_inst->isALU()) { |
| // Barrier, Branch, or ALU instruction |
| if (gpu_dyn_inst->isScalar() && !scalarAluRdy) { |
| stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++; |
| return false; |
| } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) { |
| stats.dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++; |
| return false; |
| } |
| } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) { |
| // Vector Global Memory instruction |
| bool rdy = true; |
| if (!glbMemIssueRdy) { |
| rdy = false; |
| stats.dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++; |
| } |
| if (!glbMemBusRdy) { |
| rdy = false; |
| stats.dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++; |
| } |
| if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) { |
| rdy = false; |
| stats.dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++; |
| } |
| if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) { |
| rdy = false; |
| stats.dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++; |
| } |
| if (!rdy) { |
| return false; |
| } |
| } else if (gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) { |
| // Scalar Global Memory instruction |
| bool rdy = true; |
| if (!scalarMemIssueRdy) { |
| rdy = false; |
| stats.dispNrdyStalls[SCH_SCALAR_MEM_ISSUE_NRDY]++; |
| } |
| if (!scalarMemBusRdy) { |
| rdy = false; |
| stats.dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++; |
| } |
| if (!computeUnit.scalarMemoryPipe |
| .isGMReqFIFOWrRdy(wf->scalarRdGmReqsInPipe |
| + wf->scalarWrGmReqsInPipe)) |
| { |
| rdy = false; |
| stats.dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++; |
| } |
| if (!rdy) { |
| return false; |
| } |
| } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isLocalMem()) { |
| // Vector Local Memory instruction |
| bool rdy = true; |
| if (!locMemIssueRdy) { |
| rdy = false; |
| stats.dispNrdyStalls[SCH_LOCAL_MEM_ISSUE_NRDY]++; |
| } |
| if (!locMemBusRdy) { |
| rdy = false; |
| stats.dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++; |
| } |
| if (!computeUnit.localMemoryPipe. |
| isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) { |
| rdy = false; |
| stats.dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++; |
| } |
| if (!rdy) { |
| return false; |
| } |
| } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isFlat()) { |
| // Vector Flat memory instruction |
| bool rdy = true; |
| if (!glbMemIssueRdy || !locMemIssueRdy) { |
| rdy = false; |
| stats.dispNrdyStalls[SCH_FLAT_MEM_ISSUE_NRDY]++; |
| } |
| if (!glbMemBusRdy || !locMemBusRdy) { |
| rdy = false; |
| stats.dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++; |
| } |
| if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) { |
| rdy = false; |
| stats.dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++; |
| } |
| if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) { |
| rdy = false; |
| stats.dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++; |
| } |
| if (!computeUnit.localMemoryPipe. |
| isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) { |
| rdy = false; |
| stats.dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++; |
| } |
| if (!rdy) { |
| return false; |
| } |
| } else { |
| panic("%s: unknown instr checked for readiness", |
| gpu_dyn_inst->disassemble()); |
| return false; |
| } |
| stats.dispNrdyStalls[SCH_RDY]++; |
| return true; |
| } |
| |
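| // Scan each execution resource's schList in age order and move the |
| // first RFREADY wave that passes dispatchReady() to the dispatch |
| // list; all other waves accumulate stall statistics. |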
| void |
| ScheduleStage::fillDispatchList() |
| { |
| // update execution resource status |
| checkMemResources(); |
| // iterate execution resources |
| for (int j = 0; j < computeUnit.numExeUnits(); j++) { |
| assert(toExecute.dispatchStatus(j) == EMPTY); |
| |
| // iterate waves in schList to pick one for dispatch |
| auto schIter = schList.at(j).begin(); |
| bool dispatched = false; |
| while (schIter != schList.at(j).end()) { |
| // only attempt to dispatch if status is RFREADY |
| if (schIter->second == RFREADY) { |
| // Check if this wave is ready for dispatch |
| bool dispRdy = dispatchReady(schIter->first); |
| if (!dispatched && dispRdy) { |
| // No other wave has been dispatched for this exe |
| // resource, and this wave is ready. Place this wave |
| // on dispatchList and make it ready for execution |
| // next cycle. |
| |
| // Acquire a coalescer token if it is a global mem |
| // operation. |
| GPUDynInstPtr mp = schIter->first; |
| if (!mp->isMemSync() && !mp->isScalar() && |
| (mp->isGlobalMem() || mp->isFlat())) { |
| computeUnit.globalMemoryPipe.acqCoalescerToken(mp); |
| } |
| |
| // Set instruction's exec_mask if it's a mem operation |
| if (mp->isMemRef()) { |
| mp->exec_mask = mp->wavefront()->execMask(); |
| } |
| |
| doDispatchListTransition(j, EXREADY, schIter->first); |
| DPRINTF(GPUSched, "dispatchList[%d]: fillDispatchList: " |
| "EMPTY->EXREADY\n", j); |
| schIter->first = nullptr; |
| schIter = schList.at(j).erase(schIter); |
| dispatched = true; |
| } else { |
| // Either another wave has been dispatched, or this wave |
| // was not ready, so it is stalled this cycle |
| schIter->first->wavefront()->stats.schStalls++; |
| if (!dispRdy) { |
| // not ready for dispatch, increment stall stat |
| schIter->first->wavefront()->stats.schResourceStalls++; |
| } |
| // Examine next wave for this resource |
| schIter++; |
| } |
| } else { |
| // Wave not in RFREADY, try next wave |
| schIter++; |
| } |
| } |
| |
| // Increment stall count if no wave sent to dispatchList for |
| // current execution resource |
| if (!dispatched) { |
| stats.schListToDispListStalls[j]++; |
| } else { |
| stats.schListToDispList[j]++; |
| } |
| } |
| } |
| |
| void |
| ScheduleStage::arbitrateVrfToLdsBus() |
| { |
| // Arbitrate the VRF->GM and VRF->LDS buses for FLAT memory ops. |
| // Note: a FLAT instruction in GFX8 reserves both the VRF->Global |
| // memory bus and a VRF->LDS bus. In GFX9, this is not the case. |
| |
| // iterate the GM pipelines |
| for (int i = 0; i < computeUnit.numVectorGlobalMemUnits; i++) { |
| // get the GM pipe index in the dispatchList |
| int gm_exe_unit = computeUnit.firstMemUnit() + i; |
| // get the wave in the dispatchList |
| GPUDynInstPtr &gpu_dyn_inst |
| = toExecute.readyInst(gm_exe_unit); |
| // If the WF is valid, ready to execute, and the instruction |
| // is a flat access, arbitrate with the WF's assigned LM pipe |
| if (gpu_dyn_inst && toExecute.dispatchStatus(gm_exe_unit) |
| == EXREADY && gpu_dyn_inst->isFlat()) { |
| Wavefront *wf = gpu_dyn_inst->wavefront(); |
| // If the associated LM pipe also has a wave selected, block |
| // that wave and let the Flat instruction issue. The WF in the |
| // LM pipe is added back to the schList for consideration next |
| // cycle. |
| if (toExecute.dispatchStatus(wf->localMem) == EXREADY) { |
| reinsertToSchList(wf->localMem, toExecute |
| .readyInst(wf->localMem)); |
| // Increment stall stats for LDS-VRF arbitration |
| stats.ldsBusArbStalls++; |
| toExecute.readyInst(wf->localMem) |
| ->wavefront()->stats.schLdsArbStalls++; |
| } |
| // With arbitration of LM pipe complete, transition the |
| // LM pipe to SKIP state in the dispatchList to inform EX stage |
| // that a Flat instruction is executing next cycle |
| doDispatchListTransition(wf->localMem, SKIP, gpu_dyn_inst); |
| DPRINTF(GPUSched, "dispatchList[%d]: arbVrfLds: " |
| "EXREADY->SKIP\n", wf->localMem); |
| } |
| } |
| } |
| |
| void |
| ScheduleStage::checkRfOperandReadComplete() |
| { |
| // Iterate the schList queues and check if operand reads |
| // have completed in the RFs. If so, mark the wave as ready for |
| // selection for dispatchList |
| for (int j = 0; j < computeUnit.numExeUnits(); ++j) { |
| for (auto &p : schList.at(j)) { |
| const GPUDynInstPtr &gpu_dyn_inst = p.first; |
| assert(gpu_dyn_inst); |
| Wavefront *wf = gpu_dyn_inst->wavefront(); |
| |
| // Increment the number of cycles the wave spends in the |
| // SCH stage, since this loop visits every wave in SCH. |
| wf->stats.schCycles++; |
| |
| bool vrfRdy = true; |
| if (!gpu_dyn_inst->isScalar()) { |
| vrfRdy = computeUnit.vrf[wf->simdId] |
| ->operandReadComplete(wf, gpu_dyn_inst); |
| } |
| bool srfRdy = computeUnit.srf[wf->simdId] |
| ->operandReadComplete(wf, gpu_dyn_inst); |
| bool operandsReady = vrfRdy && srfRdy; |
| if (operandsReady) { |
| DPRINTF(GPUSched, "schList[%d]: WV[%d] operands ready for: " |
| "%d: %s\n", j, wf->wfDynId, gpu_dyn_inst->seqNum(), |
| gpu_dyn_inst->disassemble()); |
| DPRINTF(GPUSched, "schList[%d]: WV[%d] RFBUSY->RFREADY\n", |
| j, wf->wfDynId); |
| p.second = RFREADY; |
| } else { |
| DPRINTF(GPUSched, "schList[%d]: WV[%d] operands not ready " |
| "for: %d: %s\n", j, wf->wfDynId, |
| gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble()); |
| |
| // Operands not ready yet; keep the wave in RFBUSY and |
| // increment the SCH stage stall stats, aggregated over |
| // all wavefronts on the CU. |
| p.second = RFBUSY; |
| |
| // Increment stall stats |
| wf->stats.schStalls++; |
| wf->stats.schOpdNrdyStalls++; |
| |
| stats.opdNrdyStalls[SCH_RF_OPD_NRDY]++; |
| if (!vrfRdy) { |
| stats.opdNrdyStalls[SCH_VRF_OPD_NRDY]++; |
| } |
| if (!srfRdy) { |
| stats.opdNrdyStalls[SCH_SRF_OPD_NRDY]++; |
| } |
| } |
| } |
| } |
| } |
| |
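| // For each EXREADY dispatchList slot, reserve the wave's execution |
| // resources and notify the register files that the instruction is |
| // dispatching. SKIP slots are sanity-checked against the FLAT |
| // instruction occupying the paired global memory pipe. |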
| void |
| ScheduleStage::reserveResources() |
| { |
| std::vector<bool> exeUnitReservations; |
| exeUnitReservations.resize(computeUnit.numExeUnits(), false); |
| |
| for (int j = 0; j < computeUnit.numExeUnits(); ++j) { |
| GPUDynInstPtr &gpu_dyn_inst = toExecute.readyInst(j); |
| if (gpu_dyn_inst) { |
| DISPATCH_STATUS s = toExecute.dispatchStatus(j); |
| Wavefront *wf = gpu_dyn_inst->wavefront(); |
| if (s == EMPTY) { |
| continue; |
| } else if (s == EXREADY) { |
| // Wave is ready for execution |
| std::vector<int> execUnitIds = wf->reserveResources(); |
| |
| if (!gpu_dyn_inst->isScalar()) { |
| computeUnit.vrf[wf->simdId] |
| ->dispatchInstruction(gpu_dyn_inst); |
| } |
| computeUnit.srf[wf->simdId]->dispatchInstruction(gpu_dyn_inst); |
| |
| std::stringstream ss; |
| for (auto id : execUnitIds) { |
| ss << id << " "; |
| } |
| DPRINTF(GPUSched, "dispatchList[%d]: SIMD[%d] WV[%d]: %d: %s" |
| " Reserving ExeRes[ %s]\n", |
| j, wf->simdId, wf->wfDynId, gpu_dyn_inst->seqNum(), |
| gpu_dyn_inst->disassemble(), ss.str()); |
| // mark the resources as reserved for this cycle |
| for (auto execUnitId : execUnitIds) { |
| panic_if(exeUnitReservations.at(execUnitId), |
| "Execution unit %d is reserved!!!\n" |
| "SIMD[%d] WV[%d]: %d: %s", |
| execUnitId, wf->simdId, wf->wfDynId, |
| gpu_dyn_inst->seqNum(), |
| gpu_dyn_inst->disassemble()); |
| exeUnitReservations.at(execUnitId) = true; |
| } |
| |
| // If wavefront::reserveResources reserved multiple resources, |
| // then we're executing a flat memory instruction. This means |
| // that we've reserved a global and a local memory unit. The |
| // latter must already have been marked SKIP by |
| // arbitrateVrfToLdsBus(). |
| if (execUnitIds.size() > 1) { |
| [[maybe_unused]] int lm_exec_unit = wf->localMem; |
| assert(toExecute.dispatchStatus(lm_exec_unit) |
| == SKIP); |
| } |
| } else if (s == SKIP) { |
| // Shared Memory pipe reserved for FLAT instruction. |
| // Verify the GM pipe for this wave is ready to execute |
| // and the wave in the GM pipe is the same as the wave |
| // in the LM pipe |
| [[maybe_unused]] int gm_exec_unit = wf->globalMem; |
| assert(wf->wfDynId == toExecute |
| .readyInst(gm_exec_unit)->wfDynId); |
| assert(toExecute.dispatchStatus(gm_exec_unit) |
| == EXREADY); |
| } |
| } |
| } |
| } |
| |
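| // Remove a wave from the wavesInSch tracking set so that its next |
| // instruction is no longer filtered out of the ready lists in |
| // exec(). |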
| void |
| ScheduleStage::deleteFromSch(Wavefront *w) |
| { |
| wavesInSch.erase(w->wfDynId); |
| } |
| |
| ScheduleStage::ScheduleStageStats::ScheduleStageStats( |
| statistics::Group *parent, int num_exec_units) |
| : statistics::Group(parent, "ScheduleStage"), |
| ADD_STAT(rdyListEmpty, "number of cycles no wave on ready list per " |
| "execution resource"), |
| ADD_STAT(rdyListNotEmpty, "number of cycles one or more wave on ready " |
| "list per execution resource"), |
| ADD_STAT(addToSchListStalls, "number of cycles a wave is not added to " |
| "schList per execution resource when ready list is not empty"), |
| ADD_STAT(schListToDispList, "number of cycles a wave is added to " |
| "dispatchList per execution resource"), |
| ADD_STAT(schListToDispListStalls, "number of cycles no wave is added to" |
| " dispatchList per execution resource"), |
| ADD_STAT(rfAccessStalls, "number of stalls due to RF access denied"), |
| ADD_STAT(ldsBusArbStalls, "number of stalls due to VRF->LDS bus " |
| "conflicts"), |
| ADD_STAT(opdNrdyStalls, "number of stalls in SCH due to operands not " |
| "ready"), |
| ADD_STAT(dispNrdyStalls, "number of stalls in SCH due to resource not " |
| "ready") |
| { |
| rdyListNotEmpty.init(num_exec_units); |
| rdyListEmpty.init(num_exec_units); |
| addToSchListStalls.init(num_exec_units); |
| schListToDispList.init(num_exec_units); |
| schListToDispListStalls.init(num_exec_units); |
| opdNrdyStalls.init(SCH_RF_OPD_NRDY_CONDITIONS); |
| dispNrdyStalls.init(SCH_NRDY_CONDITIONS); |
| rfAccessStalls.init(SCH_RF_ACCESS_NRDY_CONDITIONS); |
| |
| opdNrdyStalls.subname(SCH_VRF_OPD_NRDY, csprintf("VRF")); |
| opdNrdyStalls.subname(SCH_SRF_OPD_NRDY, csprintf("SRF")); |
| opdNrdyStalls.subname(SCH_RF_OPD_NRDY, csprintf("RF")); |
| |
| dispNrdyStalls.subname(SCH_SCALAR_ALU_NRDY, csprintf("ScalarAlu")); |
| dispNrdyStalls.subname(SCH_VECTOR_ALU_NRDY, csprintf("VectorAlu")); |
| dispNrdyStalls.subname(SCH_VECTOR_MEM_ISSUE_NRDY, |
| csprintf("VectorMemIssue")); |
| dispNrdyStalls.subname(SCH_VECTOR_MEM_BUS_BUSY_NRDY, |
| csprintf("VectorMemBusBusy")); |
| dispNrdyStalls.subname(SCH_VECTOR_MEM_COALESCER_NRDY, |
| csprintf("VectorMemCoalescer")); |
| dispNrdyStalls.subname(SCH_CEDE_SIMD_NRDY, csprintf("CedeSimd")); |
| dispNrdyStalls.subname(SCH_SCALAR_MEM_ISSUE_NRDY, |
| csprintf("ScalarMemIssue")); |
| dispNrdyStalls.subname(SCH_SCALAR_MEM_BUS_BUSY_NRDY, |
| csprintf("ScalarMemBusBusy")); |
| dispNrdyStalls.subname(SCH_SCALAR_MEM_FIFO_NRDY, |
| csprintf("ScalarMemFIFO")); |
| dispNrdyStalls.subname(SCH_LOCAL_MEM_ISSUE_NRDY, |
| csprintf("LocalMemIssue")); |
| dispNrdyStalls.subname(SCH_LOCAL_MEM_BUS_BUSY_NRDY, |
| csprintf("LocalMemBusBusy")); |
| dispNrdyStalls.subname(SCH_LOCAL_MEM_FIFO_NRDY, |
| csprintf("LocalMemFIFO")); |
| dispNrdyStalls.subname(SCH_FLAT_MEM_ISSUE_NRDY, |
| csprintf("FlatMemIssue")); |
| dispNrdyStalls.subname(SCH_FLAT_MEM_BUS_BUSY_NRDY, |
| csprintf("FlatMemBusBusy")); |
| dispNrdyStalls.subname(SCH_FLAT_MEM_COALESCER_NRDY, |
| csprintf("FlatMemCoalescer")); |
| dispNrdyStalls.subname(SCH_FLAT_MEM_FIFO_NRDY, |
| csprintf("FlatMemFIFO")); |
| dispNrdyStalls.subname(SCH_RDY, csprintf("Ready")); |
| |
| rfAccessStalls.subname(SCH_VRF_RD_ACCESS_NRDY, csprintf("VrfRd")); |
| rfAccessStalls.subname(SCH_VRF_WR_ACCESS_NRDY, csprintf("VrfWr")); |
| rfAccessStalls.subname(SCH_SRF_RD_ACCESS_NRDY, csprintf("SrfRd")); |
| rfAccessStalls.subname(SCH_SRF_WR_ACCESS_NRDY, csprintf("SrfWr")); |
| rfAccessStalls.subname(SCH_RF_ACCESS_NRDY, csprintf("Any")); |
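| // In the aggregated stats output these vector entries appear under |
| // their subnames, e.g. a path ending in |
| // ScheduleStage.rfAccessStalls::Any (the exact prefix depends on |
| // the CU's position in the system hierarchy). |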
| } |
| |
| } // namespace gem5 |