// blob: 8eed0021588c57c103012230c118d658df450ff9 [file] [log] [blame] [edit]
/*
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __SCHEDULE_STAGE_HH__
#define __SCHEDULE_STAGE_HH__
#include <deque>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "base/statistics.hh"
#include "base/stats/group.hh"
#include "gpu-compute/exec_stage.hh"
#include "gpu-compute/misc.hh"
#include "gpu-compute/scheduler.hh"
namespace gem5
{
// Schedule (execution arbitration) stage.
// From the pool of ready waves in the ready list,
// one wave is selected for each execution resource.
// The selection is made based on a scheduling policy.
// (The description above applies to the ScheduleStage class in this header.)
//
// Forward declarations: this header uses the types below only through
// references, pointers and function signatures, so their full definitions
// are not required here.
class ComputeUnit;
class ScheduleToExecute;
class ScoreboardCheckToSchedule;
class Wavefront;
struct ComputeUnitParams;
// Schedule stage: keeps, per execution resource, an age-ordered list of
// waves (schList) together with their register-file read status, and
// declares the helpers that move waves from the ready list to schList and
// from schList to the dispatchList.
// Only declarations appear in this header (apart from name()); the member
// functions are defined elsewhere.
class ScheduleStage
{
public:
// The class stores ComputeUnit&, ScoreboardCheckToSchedule& and
// ScheduleToExecute& members, so the objects bound to them must outlive
// this stage.
// NOTE(review): p is not kept in any member declared here; presumably it is
// only read during construction -- confirm in the definition.
ScheduleStage(const ComputeUnitParams &p, ComputeUnit &cu,
ScoreboardCheckToSchedule &from_scoreboard_check,
ScheduleToExecute &to_execute);
~ScheduleStage();
// Initialization hook; its behavior is not visible from this header.
void init();
// Runs the stage. Per the comment on addToSchList(), exec() is the caller
// that adds waves to schList.
// NOTE(review): presumably invoked once per compute-unit cycle -- confirm.
void exec();
// Stats related variables and methods
// Returns the stage's name (_name) by const reference.
// NOTE(review): <string> is not included directly by this header;
// std::string is presumably reached through another include.
const std::string& name() const { return _name; }
// Conditions under which a wave is not ready (judging by the enumerator
// names), plus SCH_RDY. SCH_NRDY_CONDITIONS is the last enumerator, so its
// value equals the number of enumerators that precede it.
// NOTE(review): presumably used to index/size stats.dispNrdyStalls --
// confirm against the definition of the stats.
enum SchNonRdyType
{
SCH_SCALAR_ALU_NRDY,
SCH_VECTOR_ALU_NRDY,
SCH_VECTOR_MEM_ISSUE_NRDY,
SCH_VECTOR_MEM_BUS_BUSY_NRDY,
SCH_VECTOR_MEM_COALESCER_NRDY,
SCH_VECTOR_MEM_REQS_NRDY,
SCH_CEDE_SIMD_NRDY,
SCH_SCALAR_MEM_ISSUE_NRDY,
SCH_SCALAR_MEM_BUS_BUSY_NRDY,
SCH_SCALAR_MEM_FIFO_NRDY,
SCH_LOCAL_MEM_ISSUE_NRDY,
SCH_LOCAL_MEM_BUS_BUSY_NRDY,
SCH_LOCAL_MEM_FIFO_NRDY,
SCH_FLAT_MEM_ISSUE_NRDY,
SCH_FLAT_MEM_BUS_BUSY_NRDY,
SCH_FLAT_MEM_COALESCER_NRDY,
SCH_FLAT_MEM_REQS_NRDY,
SCH_FLAT_MEM_FIFO_NRDY,
SCH_RDY,
SCH_NRDY_CONDITIONS
};
// Operand-not-ready categories (VRF, SRF, and a combined RF entry).
// SCH_RF_OPD_NRDY_CONDITIONS is last and therefore equals the number of
// categories before it.
// NOTE(review): presumably indexes stats.opdNrdyStalls -- confirm.
enum schopdnonrdytype_e
{
SCH_VRF_OPD_NRDY,
SCH_SRF_OPD_NRDY,
SCH_RF_OPD_NRDY,
SCH_RF_OPD_NRDY_CONDITIONS
};
// Register-file access-not-ready categories (VRF/SRF, read/write, and a
// combined RF entry). SCH_RF_ACCESS_NRDY_CONDITIONS is last and therefore
// equals the number of categories before it.
// NOTE(review): presumably indexes stats.rfAccessStalls -- confirm.
enum schrfaccessnonrdytype_e
{
SCH_VRF_RD_ACCESS_NRDY,
SCH_VRF_WR_ACCESS_NRDY,
SCH_SRF_RD_ACCESS_NRDY,
SCH_SRF_WR_ACCESS_NRDY,
SCH_RF_ACCESS_NRDY,
SCH_RF_ACCESS_NRDY_CONDITIONS
};
// Called by ExecStage to inform SCH of instruction execution
void deleteFromSch(Wavefront *w);
// Schedule List status: the state stored next to each instruction in
// schList.
enum SCH_STATUS
{
RFBUSY = 0, // RF busy reading operands
RFREADY, // ready for exec
};
private:
// NOTE: the data members below are declared in a specific order; C++
// initializes members in declaration order, so keep this order in sync with
// the constructor's initializer list.
ComputeUnit &computeUnit;
ScoreboardCheckToSchedule &fromScoreboardCheck;
ScheduleToExecute &toExecute;
// Each execution resource will have its own
// scheduler and a dispatch list
std::vector<Scheduler> scheduler;
// Backing storage for name().
const std::string _name;
// called by exec() to add a wave to schList if the RFs can support it
bool addToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst);
// re-insert a wave to schList if wave lost arbitration
// wave is inserted such that age order (oldest to youngest) is preserved
void reinsertToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst);
// check waves in schList to see if RF reads complete
void checkRfOperandReadComplete();
// check execution resources for readiness
// (one flag per resource; where they are set is not visible in this header)
bool vectorAluRdy;
bool scalarAluRdy;
bool scalarMemBusRdy;
bool scalarMemIssueRdy;
bool glbMemBusRdy;
bool glbMemIssueRdy;
bool locMemBusRdy;
bool locMemIssueRdy;
// check status of memory pipes and RF to Mem buses
void checkMemResources();
// resource ready check called by fillDispatchList
bool dispatchReady(const GPUDynInstPtr &gpu_dyn_inst);
// pick waves from schList and populate dispatchList with one wave
// per EXE resource type
void fillDispatchList();
// arbitrate Shared Mem Pipe VRF/LDS bus for waves in dispatchList
void arbitrateVrfToLdsBus();
// schedule destination operand writes to register files for waves in
// dispatchList
void scheduleRfDestOperands();
// invoked by scheduleRfDestOperands to schedule RF writes for a wave
bool schedRfWrites(int exeType, const GPUDynInstPtr &gpu_dyn_inst);
// reserve resources for waves surviving arbitration in dispatchList
void reserveResources();
// Two overloads that differ only in whether an instruction is supplied
// along with the unit id and the new dispatch status.
// NOTE(review): DISPATCH_STATUS is not declared in this header; presumably
// it comes from one of the included gpu-compute headers.
void doDispatchListTransition(int unitId, DISPATCH_STATUS s,
const GPUDynInstPtr &gpu_dyn_inst);
void doDispatchListTransition(int unitId, DISPATCH_STATUS s);
// Set tracking wfDynId for each wave present in schedule stage
// Used to allow only one instruction per wave in schedule
std::unordered_set<uint64_t> wavesInSch;
// List of waves (one list per exe resource) that are in schedule
// stage. Waves are added to this list after selected by scheduler
// from readyList. Waves are removed from this list and placed on
// dispatchList when status reaches RFREADY (an earlier version of this
// comment said SCHREADY, which is not a SCH_STATUS value in this header).
// Waves are kept ordered by age for each resource, always favoring
// forward progress for the oldest wave.
// The maximum number of waves per resource can be determined by either
// the VRF/SRF availability or limits imposed by parameters (to be added)
// of the SCH stage or CU.
std::vector<std::deque<std::pair<GPUDynInstPtr, SCH_STATUS>>> schList;
protected:
// Statistics for this stage, grouped under a parent statistics::Group.
// The constructor also receives the number of execution units.
// NOTE(review): presumably num_exec_units sizes the per-resource vectors --
// confirm in the constructor definition.
struct ScheduleStageStats : public statistics::Group
{
ScheduleStageStats(statistics::Group *parent, int num_exec_units);
// Number of cycles with empty (or not empty) readyList, per execution
// resource, when the CU is active (not sleeping)
statistics::Vector rdyListEmpty;
statistics::Vector rdyListNotEmpty;
// Number of cycles, per execution resource, when at least one wave
// was on the readyList and picked by scheduler, but was unable to be
// added to the schList, when the CU is active (not sleeping)
statistics::Vector addToSchListStalls;
// Number of cycles, per execution resource, when a wave is selected
// as candidate for dispatchList from schList
// Note: may be arbitrated off dispatchList (e.g., LDS arbitration)
statistics::Vector schListToDispList;
// Per execution resource stat, incremented once per cycle if no wave
// was selected as candidate for dispatch and moved to dispatchList
statistics::Vector schListToDispListStalls;
// Number of times a wave is selected by the scheduler but cannot
// be added to the schList due to register files not being able to
// support reads or writes of operands. RF_ACCESS_NRDY condition is
// always incremented if at least one read/write not supported, other
// conditions are incremented independently from each other.
statistics::Vector rfAccessStalls;
// Number of times a wave is executing FLAT instruction and
// forces another wave occupying its required local memory resource
// to be deselected for execution, and placed back on schList
statistics::Scalar ldsBusArbStalls;
// Count of times VRF and/or SRF blocks waves on schList from
// performing RFBUSY->RFREADY transition
statistics::Vector opdNrdyStalls;
// Count of times resource required for dispatch is not ready and
// blocks wave in RFREADY state on schList from potentially moving
// to dispatchList
statistics::Vector dispNrdyStalls;
} stats;
};
} // namespace gem5
#endif // __SCHEDULE_STAGE_HH__