/*
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "gpu-compute/schedule_stage.hh"
#include <unordered_set>
#include "base/compiler.hh"
#include "debug/GPUSched.hh"
#include "debug/GPUVRF.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
namespace gem5
{
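// Construct the schedule stage for a compute unit: one scheduler instance
// is created per execution unit, and the per-unit schList queues are sized
// to match.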
ScheduleStage::ScheduleStage(const ComputeUnitParams &p, ComputeUnit &cu,
ScoreboardCheckToSchedule &from_scoreboard_check,
ScheduleToExecute &to_execute)
: computeUnit(cu), fromScoreboardCheck(from_scoreboard_check),
toExecute(to_execute),
_name(cu.name() + ".ScheduleStage"),
vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
locMemBusRdy(false), locMemIssueRdy(false), stats(&cu, cu.numExeUnits())
{
for (int j = 0; j < cu.numExeUnits(); ++j) {
scheduler.emplace_back(p);
}
wavesInSch.clear();
schList.resize(cu.numExeUnits());
for (auto &dq : schList) {
dq.clear();
}
}
ScheduleStage::~ScheduleStage()
{
scheduler.clear();
wavesInSch.clear();
schList.clear();
}
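// Bind each per-execution-unit scheduler to the corresponding ready list
// provided by the scoreboard-check stage.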
void
ScheduleStage::init()
{
fatal_if(scheduler.size() != fromScoreboardCheck.numReadyLists(),
"Scheduler should have same number of entries as CU's readyList");
for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
scheduler[j].bindList(&fromScoreboardCheck.readyWFs(j));
}
assert(computeUnit.numVectorGlobalMemUnits == 1);
assert(computeUnit.numVectorSharedMemUnits == 1);
}
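// Per-cycle work of the schedule stage: prune the ready lists, add newly
// ready waves to the schList queues, promote waves whose RF operand reads
// have completed, fill the dispatch list, arbitrate the VRF->LDS bus for
// Flat instructions, schedule RF writes, and reserve execution resources.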
void
ScheduleStage::exec()
{
toExecute.reset();
// Update readyList
for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
/**
* Remove any wave that already has an instruction present in SCH
* waiting for RF reads to complete. This prevents out of order
* execution within a wave.
*/
fromScoreboardCheck.updateReadyList(j);
for (auto wIt = fromScoreboardCheck.readyWFs(j).begin();
wIt != fromScoreboardCheck.readyWFs(j).end();) {
if (wavesInSch.find((*wIt)->wfDynId) != wavesInSch.end()) {
*wIt = nullptr;
wIt = fromScoreboardCheck.readyWFs(j).erase(wIt);
} else {
wIt++;
}
}
}
// Attempt to add another wave for each EXE type to the schList queues.
// VMEM resources are iterated first, effectively giving VMEM priority
// over VALU when scheduling operand reads from the RFs. Scalar memory
// units are iterated after VMEM.
// Iterate VMEM and SMEM
int firstMemUnit = computeUnit.firstMemUnit();
int lastMemUnit = computeUnit.lastMemUnit();
for (int j = firstMemUnit; j <= lastMemUnit; j++) {
int readyListSize = fromScoreboardCheck.readyWFs(j).size();
// If no wave is ready to be scheduled on the execution resource
// then skip scheduling for this execution resource
if (!readyListSize) {
stats.rdyListEmpty[j]++;
continue;
}
stats.rdyListNotEmpty[j]++;
// Pick a wave and attempt to add it to schList
Wavefront *wf = scheduler[j].chooseWave();
GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front();
assert(gpu_dyn_inst);
if (!addToSchList(j, gpu_dyn_inst)) {
// For a wave not added to the schList, increment the count of
// cycles it spends in the SCH stage.
wf->stats.schCycles++;
stats.addToSchListStalls[j]++;
} else {
if (gpu_dyn_inst->isScalar() || gpu_dyn_inst->isGroupSeg()) {
wf->incLGKMInstsIssued();
} else {
wf->incVMemInstsIssued();
if (gpu_dyn_inst->isFlat()) {
wf->incLGKMInstsIssued();
}
}
if (gpu_dyn_inst->isStore() && gpu_dyn_inst->isGlobalSeg()) {
wf->incExpInstsIssued();
}
}
}
// Iterate everything else
for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
// skip the VMEM resources
if (j >= firstMemUnit && j <= lastMemUnit) {
continue;
}
int readyListSize = fromScoreboardCheck.readyWFs(j).size();
// If no wave is ready to be scheduled on the execution resource
// then skip scheduling for this execution resource
if (!readyListSize) {
stats.rdyListEmpty[j]++;
continue;
}
stats.rdyListNotEmpty[j]++;
// Pick a wave and attempt to add it to schList
Wavefront *wf = scheduler[j].chooseWave();
GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front();
assert(gpu_dyn_inst);
if (!addToSchList(j, gpu_dyn_inst)) {
// For a wave not added to the schList, increment the count of
// cycles it spends in the SCH stage.
wf->stats.schCycles++;
stats.addToSchListStalls[j]++;
}
}
// At this point, the schList queue per EXE type may contain
// multiple waves, in order of age (oldest to youngest).
// Waves may be in RFBUSY, indicating they are waiting for registers
// to be read, or in RFREADY, indicating they are candidates for
// the dispatchList and execution.
// Iterate schList queues and check if any of the waves have finished
// reading their operands, moving those waves to RFREADY status
checkRfOperandReadComplete();
// Fill the dispatch list with the oldest wave of each EXE type that
// is ready to execute
// Wave is picked if status in schList is RFREADY and it passes resource
// ready checks similar to those currently in SCB
fillDispatchList();
// Resource arbitration on waves in dispatchList
// Losing waves are reinserted into the schList at a location
// determined by wave age
// Arbitrate access to the VRF->LDS bus
arbitrateVrfToLdsBus();
// Schedule write operations to the register files
scheduleRfDestOperands();
// Lastly, reserve resources for waves that are ready to execute.
reserveResources();
}
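// Update the dispatch list status for an execution unit in the
// schedule-to-execute communication structure, optionally attaching the
// instruction being dispatched.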
void
ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s,
const GPUDynInstPtr &gpu_dyn_inst)
{
toExecute.dispatchTransition(gpu_dyn_inst, unitId, s);
}
void
ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s)
{
toExecute.dispatchTransition(unitId, s);
}
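// Attempt to schedule writes of an instruction's destination operands to
// the VRF/SRF. Returns true if both register files can accept the writes;
// otherwise records the stall reason and returns false.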
bool
ScheduleStage::schedRfWrites(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
{
assert(gpu_dyn_inst);
Wavefront *wf = gpu_dyn_inst->wavefront();
bool accessVrfWr = true;
if (!gpu_dyn_inst->isScalar()) {
accessVrfWr = computeUnit.vrf[wf->simdId]
->canScheduleWriteOperands(wf, gpu_dyn_inst);
}
bool accessSrfWr = computeUnit.srf[wf->simdId]
->canScheduleWriteOperands(wf, gpu_dyn_inst);
bool accessRf = accessVrfWr && accessSrfWr;
if (accessRf) {
if (!gpu_dyn_inst->isScalar()) {
computeUnit.vrf[wf->simdId]->scheduleWriteOperands(wf,
gpu_dyn_inst);
}
computeUnit.srf[wf->simdId]->scheduleWriteOperands(wf, gpu_dyn_inst);
return true;
} else {
stats.rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
if (!accessSrfWr) {
stats.rfAccessStalls[SCH_SRF_WR_ACCESS_NRDY]++;
}
if (!accessVrfWr) {
stats.rfAccessStalls[SCH_VRF_WR_ACCESS_NRDY]++;
}
// Increment stall counts for WF
wf->stats.schStalls++;
wf->stats.schRfAccessStalls++;
}
return false;
}
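// For each instruction on the dispatch list, try to allocate RF write
// resources for its destination operands; instructions that are denied
// are returned to the schList and their dispatch slot is set to EMPTY.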
void
ScheduleStage::scheduleRfDestOperands()
{
for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
if (toExecute.dispatchStatus(j) == EMPTY ||
toExecute.dispatchStatus(j) == SKIP) {
continue;
}
// get the wave on dispatch list and attempt to allocate write
// resources in the RFs
const GPUDynInstPtr &gpu_dyn_inst = toExecute.readyInst(j);
assert(gpu_dyn_inst);
Wavefront *wf = gpu_dyn_inst->wavefront();
if (!schedRfWrites(j, gpu_dyn_inst)) {
reinsertToSchList(j, gpu_dyn_inst);
doDispatchListTransition(j, EMPTY);
// if this is a flat inst, also transition the LM pipe to empty
// Note: since FLAT/LM arbitration occurs before scheduling
// destination operands to the RFs, it is possible that an LM
// instruction lost arbitration, but would have been able to
// pass the RF destination operand check here, and execute
// instead of the FLAT.
if (wf->instructionBuffer.front()->isFlat()) {
assert(toExecute.dispatchStatus(wf->localMem)
== SKIP);
doDispatchListTransition(wf->localMem, EMPTY);
}
}
}
}
bool
ScheduleStage::addToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
{
// Attempt to add the wave to the schList if the VRF can support the
// wave's next instruction
assert(gpu_dyn_inst);
Wavefront *wf = gpu_dyn_inst->wavefront();
bool accessVrf = true;
if (!gpu_dyn_inst->isScalar()) {
accessVrf = computeUnit.vrf[wf->simdId]
->canScheduleReadOperands(wf, gpu_dyn_inst);
}
bool accessSrf = computeUnit.srf[wf->simdId]
->canScheduleReadOperands(wf, gpu_dyn_inst);
// If the RFs can support the instruction, add it to the schList in
// RFBUSY state, place the wave in wavesInSch and the pipeMap, and
// schedule its operand reads in the VRF/SRF
bool accessRf = accessVrf && accessSrf;
if (accessRf) {
DPRINTF(GPUSched, "schList[%d]: Adding: SIMD[%d] WV[%d]: %d: %s\n",
exeType, wf->simdId, wf->wfDynId,
gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
computeUnit.insertInPipeMap(wf);
wavesInSch.emplace(wf->wfDynId);
schList.at(exeType).push_back(std::make_pair(gpu_dyn_inst, RFBUSY));
if (wf->isOldestInstBarrier() && wf->hasBarrier()) {
wf->setStatus(Wavefront::S_BARRIER);
}
if (wf->isOldestInstWaitcnt()) {
wf->setStatus(Wavefront::S_WAITCNT);
}
if (wf->isOldestInstSleep()) {
wf->setStatus(Wavefront::S_STALLED_SLEEP);
}
if (!gpu_dyn_inst->isScalar()) {
computeUnit.vrf[wf->simdId]
->scheduleReadOperands(wf, gpu_dyn_inst);
}
computeUnit.srf[wf->simdId]->scheduleReadOperands(wf, gpu_dyn_inst);
DPRINTF(GPUSched, "schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n",
exeType, wf->simdId, wf->wfDynId,
gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
return true;
} else {
// Number of stall cycles due to RF access denied
stats.rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
// Count number of denials due to each reason
// Multiple items may contribute to the denied request
if (!accessVrf) {
stats.rfAccessStalls[SCH_VRF_RD_ACCESS_NRDY]++;
}
if (!accessSrf) {
stats.rfAccessStalls[SCH_SRF_RD_ACCESS_NRDY]++;
}
// Increment stall counts for WF
wf->stats.schStalls++;
wf->stats.schRfAccessStalls++;
DPRINTF(GPUSched, "schList[%d]: Could not add: "
"SIMD[%d] WV[%d]: %d: %s\n",
exeType, wf->simdId, wf->wfDynId,
gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
}
return false;
}
void
ScheduleStage::reinsertToSchList(int exeType,
const GPUDynInstPtr &gpu_dyn_inst)
{
// Insert wave w into schList for specified exeType.
// Wave is inserted in age order, with oldest wave being at the
// front of the schList
assert(gpu_dyn_inst);
auto schIter = schList.at(exeType).begin();
while (schIter != schList.at(exeType).end()
&& schIter->first->wfDynId < gpu_dyn_inst->wfDynId) {
schIter++;
}
schList.at(exeType).insert(schIter, std::make_pair(gpu_dyn_inst, RFREADY));
}
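// Sample whether the scalar, global, and local memory buses and issue
// pipelines will be available next cycle; the results are consumed by
// dispatchReady().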
void
ScheduleStage::checkMemResources()
{
// Check for resource availability in the next cycle
scalarMemBusRdy = false;
scalarMemIssueRdy = false;
// check if there is an SRF->Global Memory bus available
if (computeUnit.srfToScalarMemPipeBus.rdy(Cycles(1))) {
scalarMemBusRdy = true;
}
// check if we can issue a scalar memory instruction
if (computeUnit.scalarMemUnit.rdy(Cycles(1))) {
scalarMemIssueRdy = true;
}
glbMemBusRdy = false;
glbMemIssueRdy = false;
// check if there is a VRF->Global Memory bus available
if (computeUnit.vrfToGlobalMemPipeBus.rdy(Cycles(1))) {
glbMemBusRdy = true;
}
// check if we can issue a Global memory instruction
if (computeUnit.vectorGlobalMemUnit.rdy(Cycles(1))) {
glbMemIssueRdy = true;
}
locMemBusRdy = false;
locMemIssueRdy = false;
// check if there is a VRF->LDS bus available
if (computeUnit.vrfToLocalMemPipeBus.rdy(Cycles(1))) {
locMemBusRdy = true;
}
// check if we can issue a LDS instruction
if (computeUnit.vectorSharedMemUnit.rdy(Cycles(1))) {
locMemIssueRdy = true;
}
}
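// Check whether the execution resources needed by an instruction (ALUs,
// memory buses, issue pipelines, coalescer slots, request FIFOs) will be
// available next cycle, recording the reason for any denial in the
// dispNrdyStalls statistics.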
bool
ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
{
assert(gpu_dyn_inst);
Wavefront *wf = gpu_dyn_inst->wavefront();
vectorAluRdy = false;
scalarAluRdy = false;
// check for available vector/scalar ALUs in the next cycle
if (computeUnit.vectorALUs[wf->simdId].rdy(Cycles(1))) {
vectorAluRdy = true;
}
if (computeUnit.scalarALUs[wf->scalarAlu].rdy(Cycles(1))) {
scalarAluRdy = true;
}
if (gpu_dyn_inst->isNop()) {
// S_NOP requires SALU. V_NOP requires VALU.
// TODO: Scalar NOP does not require SALU in hardware,
// and is executed out of IB directly.
if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
return false;
} else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
stats.dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
return false;
}
} else if (gpu_dyn_inst->isEndOfKernel()) {
// EndPgm instruction
if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
return false;
}
} else if (gpu_dyn_inst->isBarrier() || gpu_dyn_inst->isBranch()
|| gpu_dyn_inst->isALU()) {
// Barrier, Branch, or ALU instruction
if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
return false;
} else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
stats.dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
return false;
}
} else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
// Vector Global Memory instruction
bool rdy = true;
if (!glbMemIssueRdy) {
rdy = false;
stats.dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++;
}
if (!glbMemBusRdy) {
rdy = false;
stats.dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
}
if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
rdy = false;
stats.dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
}
if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
rdy = false;
stats.dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
}
if (!rdy) {
return false;
}
} else if (gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
// Scalar Global Memory instruction
bool rdy = true;
if (!scalarMemIssueRdy) {
rdy = false;
stats.dispNrdyStalls[SCH_SCALAR_MEM_ISSUE_NRDY]++;
}
if (!scalarMemBusRdy) {
rdy = false;
stats.dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
}
if (!computeUnit.scalarMemoryPipe
.isGMReqFIFOWrRdy(wf->scalarRdGmReqsInPipe
+ wf->scalarWrGmReqsInPipe))
{
rdy = false;
stats.dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++;
}
if (!rdy) {
return false;
}
} else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isLocalMem()) {
// Vector Local Memory instruction
bool rdy = true;
if (!locMemIssueRdy) {
rdy = false;
stats.dispNrdyStalls[SCH_LOCAL_MEM_ISSUE_NRDY]++;
}
if (!locMemBusRdy) {
rdy = false;
stats.dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
}
if (!computeUnit.localMemoryPipe.
isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
rdy = false;
stats.dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
}
if (!rdy) {
return false;
}
} else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isFlat()) {
// Vector Flat memory instruction
bool rdy = true;
if (!glbMemIssueRdy || !locMemIssueRdy) {
rdy = false;
stats.dispNrdyStalls[SCH_FLAT_MEM_ISSUE_NRDY]++;
}
if (!glbMemBusRdy || !locMemBusRdy) {
rdy = false;
stats.dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
}
if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
rdy = false;
stats.dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
}
if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
rdy = false;
stats.dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
}
if (!computeUnit.localMemoryPipe.
isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
rdy = false;
stats.dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
}
if (!rdy) {
return false;
}
} else {
panic("%s: unknown instr checked for readiness",
gpu_dyn_inst->disassemble());
return false;
}
stats.dispNrdyStalls[SCH_RDY]++;
return true;
}
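// For each execution unit, pick the oldest RFREADY wave in its schList
// that passes dispatchReady() and move it to the dispatch list for
// execution next cycle.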
void
ScheduleStage::fillDispatchList()
{
// update execution resource status
checkMemResources();
// iterate execution resources
for (int j = 0; j < computeUnit.numExeUnits(); j++) {
assert(toExecute.dispatchStatus(j) == EMPTY);
// iterate waves in schList to pick one for dispatch
auto schIter = schList.at(j).begin();
bool dispatched = false;
while (schIter != schList.at(j).end()) {
// only attempt to dispatch if status is RFREADY
if (schIter->second == RFREADY) {
// Check if this wave is ready for dispatch
bool dispRdy = dispatchReady(schIter->first);
if (!dispatched && dispRdy) {
// No other wave has been dispatched for this exe
// resource, and this wave is ready. Place this wave
// on dispatchList and make it ready for execution
// next cycle.
// Acquire a coalescer token if it is a global mem
// operation.
GPUDynInstPtr mp = schIter->first;
if (!mp->isMemSync() && !mp->isScalar() &&
(mp->isGlobalMem() || mp->isFlat())) {
computeUnit.globalMemoryPipe.acqCoalescerToken(mp);
}
// Set instruction's exec_mask if it's a mem operation
if (mp->isMemRef()) {
mp->exec_mask = mp->wavefront()->execMask();
}
doDispatchListTransition(j, EXREADY, schIter->first);
DPRINTF(GPUSched, "dispatchList[%d]: fillDispatchList: "
"EMPTY->EXREADY\n", j);
schIter->first = nullptr;
schIter = schList.at(j).erase(schIter);
dispatched = true;
} else {
// Either another wave has been dispatched, or this wave
// was not ready, so it is stalled this cycle
schIter->first->wavefront()->stats.schStalls++;
if (!dispRdy) {
// not ready for dispatch, increment stall stat
schIter->first->wavefront()->stats.schResourceStalls++;
}
// Examine next wave for this resource
schIter++;
}
} else {
// Wave not in RFREADY, try next wave
schIter++;
}
}
// Increment stall count if no wave sent to dispatchList for
// current execution resource
if (!dispatched) {
stats.schListToDispListStalls[j]++;
} else {
stats.schListToDispList[j]++;
}
}
}
void
ScheduleStage::arbitrateVrfToLdsBus()
{
// Arbitrate the VRF->GM and VRF->LDS buses for Flat memory ops
// Note: a Flat instruction in GFX8 reserves both the VRF->Global memory
// bus and a VRF->LDS bus. In GFX9, this is not the case.
// iterate the GM pipelines
for (int i = 0; i < computeUnit.numVectorGlobalMemUnits; i++) {
// get the GM pipe index in the dispatchList
int gm_exe_unit = computeUnit.firstMemUnit() + i;
// get the wave in the dispatchList
GPUDynInstPtr &gpu_dyn_inst
= toExecute.readyInst(gm_exe_unit);
// If the WF is valid, ready to execute, and the instruction
// is a flat access, arbitrate with the WF's assigned LM pipe
if (gpu_dyn_inst && toExecute.dispatchStatus(gm_exe_unit)
== EXREADY && gpu_dyn_inst->isFlat()) {
Wavefront *wf = gpu_dyn_inst->wavefront();
// If the associated LM pipe also has a wave selected, block
// that wave and let the Flat instruction issue. The WF in the
// LM pipe is added back to the schList for consideration next
// cycle.
if (toExecute.dispatchStatus(wf->localMem) == EXREADY) {
reinsertToSchList(wf->localMem, toExecute
.readyInst(wf->localMem));
// Increment stall stats for LDS-VRF arbitration
stats.ldsBusArbStalls++;
toExecute.readyInst(wf->localMem)
->wavefront()->stats.schLdsArbStalls++;
}
// With arbitration of LM pipe complete, transition the
// LM pipe to SKIP state in the dispatchList to inform EX stage
// that a Flat instruction is executing next cycle
doDispatchListTransition(wf->localMem, SKIP, gpu_dyn_inst);
DPRINTF(GPUSched, "dispatchList[%d]: arbVrfLds: "
"EXREADY->SKIP\n", wf->localMem);
}
}
}
void
ScheduleStage::checkRfOperandReadComplete()
{
// Iterate the schList queues and check if operand reads
// have completed in the RFs. If so, mark the wave as ready for
// selection for dispatchList
for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
for (auto &p : schList.at(j)) {
const GPUDynInstPtr &gpu_dyn_inst = p.first;
assert(gpu_dyn_inst);
Wavefront *wf = gpu_dyn_inst->wavefront();
// Increment the number of cycles the wave spends in the
// SCH stage, since this loop visits every wave in SCH.
wf->stats.schCycles++;
bool vrfRdy = true;
if (!gpu_dyn_inst->isScalar()) {
vrfRdy = computeUnit.vrf[wf->simdId]
->operandReadComplete(wf, gpu_dyn_inst);
}
bool srfRdy = computeUnit.srf[wf->simdId]
->operandReadComplete(wf, gpu_dyn_inst);
bool operandsReady = vrfRdy && srfRdy;
if (operandsReady) {
DPRINTF(GPUSched, "schList[%d]: WV[%d] operands ready for: "
"%d: %s\n", j, wf->wfDynId, gpu_dyn_inst->seqNum(),
gpu_dyn_inst->disassemble());
DPRINTF(GPUSched, "schList[%d]: WV[%d] RFBUSY->RFREADY\n",
j, wf->wfDynId);
p.second = RFREADY;
} else {
DPRINTF(GPUSched, "schList[%d]: WV[%d] operands not ready "
"for: %d: %s\n", j, wf->wfDynId,
gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
// Operands not ready yet; increment the SCH stage stats
// aggregated over all wavefronts on the CU
p.second = RFBUSY;
// Increment stall stats
wf->stats.schStalls++;
wf->stats.schOpdNrdyStalls++;
stats.opdNrdyStalls[SCH_RF_OPD_NRDY]++;
if (!vrfRdy) {
stats.opdNrdyStalls[SCH_VRF_OPD_NRDY]++;
}
if (!srfRdy) {
stats.opdNrdyStalls[SCH_SRF_OPD_NRDY]++;
}
}
}
}
}
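// For each instruction on the dispatch list that is ready to execute,
// reserve its execution unit(s) and notify the register files, checking
// that no unit is reserved twice in the same cycle.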
void
ScheduleStage::reserveResources()
{
std::vector<bool> exeUnitReservations;
exeUnitReservations.resize(computeUnit.numExeUnits(), false);
for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
GPUDynInstPtr &gpu_dyn_inst = toExecute.readyInst(j);
if (gpu_dyn_inst) {
DISPATCH_STATUS s = toExecute.dispatchStatus(j);
Wavefront *wf = gpu_dyn_inst->wavefront();
if (s == EMPTY) {
continue;
} else if (s == EXREADY) {
// Wave is ready for execution
std::vector<int> execUnitIds = wf->reserveResources();
if (!gpu_dyn_inst->isScalar()) {
computeUnit.vrf[wf->simdId]
->dispatchInstruction(gpu_dyn_inst);
}
computeUnit.srf[wf->simdId]->dispatchInstruction(gpu_dyn_inst);
std::stringstream ss;
for (auto id : execUnitIds) {
ss << id << " ";
}
DPRINTF(GPUSched, "dispatchList[%d]: SIMD[%d] WV[%d]: %d: %s"
" Reserving ExeRes[ %s]\n",
j, wf->simdId, wf->wfDynId, gpu_dyn_inst->seqNum(),
gpu_dyn_inst->disassemble(), ss.str());
// mark the resources as reserved for this cycle
for (auto execUnitId : execUnitIds) {
panic_if(exeUnitReservations.at(execUnitId),
"Execution unit %d is reserved!!!\n"
"SIMD[%d] WV[%d]: %d: %s",
execUnitId, wf->simdId, wf->wfDynId,
gpu_dyn_inst->seqNum(),
gpu_dyn_inst->disassemble());
exeUnitReservations.at(execUnitId) = true;
}
// If wavefront::reserveResources reserved multiple resources,
// then we're executing a flat memory instruction. This means
// that we've reserved a global and local memory unit. Thus,
// we need to mark the latter execution unit as not available.
if (execUnitIds.size() > 1) {
[[maybe_unused]] int lm_exec_unit = wf->localMem;
assert(toExecute.dispatchStatus(lm_exec_unit)
== SKIP);
}
} else if (s == SKIP) {
// Shared Memory pipe reserved for FLAT instruction.
// Verify the GM pipe for this wave is ready to execute
// and the wave in the GM pipe is the same as the wave
// in the LM pipe
[[maybe_unused]] int gm_exec_unit = wf->globalMem;
assert(wf->wfDynId == toExecute
.readyInst(gm_exec_unit)->wfDynId);
assert(toExecute.dispatchStatus(gm_exec_unit)
== EXREADY);
}
}
}
}
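// Remove a wavefront from the set of waves tracked in SCH, allowing it to
// be considered again when the scoreboard-check stage marks it ready.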
void
ScheduleStage::deleteFromSch(Wavefront *w)
{
wavesInSch.erase(w->wfDynId);
}
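// Schedule stage statistics: per-execution-unit counters are sized by the
// number of execution units, and stall counters are broken down by cause.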
ScheduleStage::ScheduleStageStats::ScheduleStageStats(
statistics::Group *parent, int num_exec_units)
: statistics::Group(parent, "ScheduleStage"),
ADD_STAT(rdyListEmpty, "number of cycles no wave on ready list per "
"execution resource"),
ADD_STAT(rdyListNotEmpty, "number of cycles one or more wave on ready "
"list per execution resource"),
ADD_STAT(addToSchListStalls, "number of cycles a wave is not added to "
"schList per execution resource when ready list is not empty"),
ADD_STAT(schListToDispList, "number of cycles a wave is added to "
"dispatchList per execution resource"),
ADD_STAT(schListToDispListStalls, "number of cycles no wave is added to"
" dispatchList per execution resource"),
ADD_STAT(rfAccessStalls, "number of stalls due to RF access denied"),
ADD_STAT(ldsBusArbStalls, "number of stalls due to VRF->LDS bus "
"conflicts"),
ADD_STAT(opdNrdyStalls, "number of stalls in SCH due to operands not "
"ready"),
ADD_STAT(dispNrdyStalls, "number of stalls in SCH due to resource not "
"ready")
{
rdyListNotEmpty.init(num_exec_units);
rdyListEmpty.init(num_exec_units);
addToSchListStalls.init(num_exec_units);
schListToDispList.init(num_exec_units);
schListToDispListStalls.init(num_exec_units);
opdNrdyStalls.init(SCH_RF_OPD_NRDY_CONDITIONS);
dispNrdyStalls.init(SCH_NRDY_CONDITIONS);
rfAccessStalls.init(SCH_RF_ACCESS_NRDY_CONDITIONS);
opdNrdyStalls.subname(SCH_VRF_OPD_NRDY, csprintf("VRF"));
opdNrdyStalls.subname(SCH_SRF_OPD_NRDY, csprintf("SRF"));
opdNrdyStalls.subname(SCH_RF_OPD_NRDY, csprintf("RF"));
dispNrdyStalls.subname(SCH_SCALAR_ALU_NRDY, csprintf("ScalarAlu"));
dispNrdyStalls.subname(SCH_VECTOR_ALU_NRDY, csprintf("VectorAlu"));
dispNrdyStalls.subname(SCH_VECTOR_MEM_ISSUE_NRDY,
csprintf("VectorMemIssue"));
dispNrdyStalls.subname(SCH_VECTOR_MEM_BUS_BUSY_NRDY,
csprintf("VectorMemBusBusy"));
dispNrdyStalls.subname(SCH_VECTOR_MEM_COALESCER_NRDY,
csprintf("VectorMemCoalescer"));
dispNrdyStalls.subname(SCH_CEDE_SIMD_NRDY, csprintf("CedeSimd"));
dispNrdyStalls.subname(SCH_SCALAR_MEM_ISSUE_NRDY,
csprintf("ScalarMemIssue"));
dispNrdyStalls.subname(SCH_SCALAR_MEM_BUS_BUSY_NRDY,
csprintf("ScalarMemBusBusy"));
dispNrdyStalls.subname(SCH_SCALAR_MEM_FIFO_NRDY,
csprintf("ScalarMemFIFO"));
dispNrdyStalls.subname(SCH_LOCAL_MEM_ISSUE_NRDY,
csprintf("LocalMemIssue"));
dispNrdyStalls.subname(SCH_LOCAL_MEM_BUS_BUSY_NRDY,
csprintf("LocalMemBusBusy"));
dispNrdyStalls.subname(SCH_LOCAL_MEM_FIFO_NRDY,
csprintf("LocalMemFIFO"));
dispNrdyStalls.subname(SCH_FLAT_MEM_ISSUE_NRDY,
csprintf("FlatMemIssue"));
dispNrdyStalls.subname(SCH_FLAT_MEM_BUS_BUSY_NRDY,
csprintf("FlatMemBusBusy"));
dispNrdyStalls.subname(SCH_FLAT_MEM_COALESCER_NRDY,
csprintf("FlatMemCoalescer"));
dispNrdyStalls.subname(SCH_FLAT_MEM_FIFO_NRDY,
csprintf("FlatMemFIFO"));
dispNrdyStalls.subname(SCH_RDY, csprintf("Ready"));
rfAccessStalls.subname(SCH_VRF_RD_ACCESS_NRDY, csprintf("VrfRd"));
rfAccessStalls.subname(SCH_VRF_WR_ACCESS_NRDY, csprintf("VrfWr"));
rfAccessStalls.subname(SCH_SRF_RD_ACCESS_NRDY, csprintf("SrfRd"));
rfAccessStalls.subname(SCH_SRF_WR_ACCESS_NRDY, csprintf("SrfWr"));
rfAccessStalls.subname(SCH_RF_ACCESS_NRDY, csprintf("Any"));
}
} // namespace gem5