| /* |
| * Copyright (c) 2011-2017 Advanced Micro Devices, Inc. |
| * All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright notice, |
| * this list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * |
| * 3. Neither the name of the copyright holder nor the names of its |
| * contributors may be used to endorse or promote products derived from this |
| * software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| * POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #ifndef __GPU_COMPUTE_WAVEFRONT_HH__ |
| #define __GPU_COMPUTE_WAVEFRONT_HH__ |
| |
| #include <cassert> |
| #include <deque> |
| #include <list> |
| #include <memory> |
| #include <unordered_map> |
| #include <vector> |
| |
| #include "arch/gpu_isa.hh" |
| #include "base/logging.hh" |
| #include "base/statistics.hh" |
| #include "base/stats/group.hh" |
| #include "base/types.hh" |
| #include "config/the_gpu_isa.hh" |
| #include "gpu-compute/compute_unit.hh" |
| #include "gpu-compute/dispatcher.hh" |
| #include "gpu-compute/gpu_dyn_inst.hh" |
| #include "gpu-compute/hsa_queue_entry.hh" |
| #include "gpu-compute/lds_state.hh" |
| #include "gpu-compute/misc.hh" |
| #include "params/Wavefront.hh" |
| #include "sim/sim_object.hh" |
| |
| namespace gem5 |
| { |
| |
| class Wavefront : public SimObject |
| { |
| public: |
| enum status_e |
| { |
| // wavefront is stalled |
| S_STOPPED, |
| // wavefront is returning from a kernel |
| S_RETURNING, |
| // wavefront is running normally |
| S_RUNNING, |
| // wavefront is stalled |
| S_STALLED, |
| |
| S_STALLED_SLEEP, |
| |
| /** |
| * wavefront has unsatisfied wait counts |
| * |
| * while in this state the WF will only execute if |
| * the oldest instruction is the waitcnt. while in |
| * S_WAITCNT, the wavefront will not be ready until |
| * all of its waitcnts have been satisfied. the |
| * scoreboard ready() function will check the status |
| * of the waitcnts whenever the WF is in S_WAITCNT, |
| * and once they are satisfied, it will resume normal |
| * operation. |
| */ |
| S_WAITCNT, |
| /** |
| * WF is stalled at a barrier. |
| */ |
| S_BARRIER |
| }; |
| |
| // HW slot id where the WF is mapped to inside a SIMD unit |
| const int wfSlotId; |
| int kernId; |
| // SIMD unit where the WV has been scheduled |
| const int simdId; |
| // id of the execution unit (or pipeline) where the oldest instruction |
| // of the WF is scheduled |
| int execUnitId; |
| int flatLmUnitId; |
| int flatGmUnitId; |
| // pointer to parent CU |
| ComputeUnit *computeUnit; |
| int maxIbSize; |
| |
| std::deque<GPUDynInstPtr> instructionBuffer; |
| |
| bool pendingFetch; |
| bool dropFetch; |
| // last tick during which all WFs in the CU are not idle |
| Tick lastNonIdleTick; |
| |
| // Execution unit resource ID's associated with this WF |
| // These are static mappings set at WF slot construction and |
| // based off of the simdId and wfSlotId. |
| |
| // Index to scalarALUs resource vector in CU |
| int scalarAlu; |
| |
| // Indices into readyList/dispatchList of resources used by this |
| // wavefront |
| int scalarAluGlobalIdx; |
| int globalMem; |
| int localMem; |
| int scalarMem; |
| |
| // number of VGPRs required by WF |
| uint32_t maxVgprs; |
| // number of SGPRs required by WF |
| uint32_t maxSgprs; |
| void freeResources(); |
| GPUDynInstPtr nextInstr(); |
| void setStatus(status_e newStatus); |
| status_e getStatus() { return status; } |
| void resizeRegFiles(int num_vregs, int num_sregs); |
| bool isGmInstruction(GPUDynInstPtr ii); |
| bool isLmInstruction(GPUDynInstPtr ii); |
| bool isOldestInstWaitcnt(); |
| bool isOldestInstSleep(); |
| bool isOldestInstGMem(); |
| bool isOldestInstLMem(); |
| bool isOldestInstPrivMem(); |
| bool isOldestInstFlatMem(); |
| bool isOldestInstVectorALU(); |
| bool isOldestInstScalarALU(); |
| bool isOldestInstScalarMem(); |
| bool isOldestInstBarrier(); |
| |
| // used for passing spill address to DDInstGPU |
| std::vector<Addr> lastAddr; |
| std::vector<uint32_t> workItemId[3]; |
| std::vector<uint32_t> workItemFlatId; |
| /* kernel launch parameters */ |
| uint32_t workGroupId[3]; |
| uint32_t workGroupSz[3]; |
| uint32_t gridSz[3]; |
| uint32_t wgId; |
| uint32_t wgSz; |
| /* the actual WG size can differ than the maximum size */ |
| uint32_t actualWgSz[3]; |
| uint32_t actualWgSzTotal; |
| void computeActualWgSz(HSAQueueEntry *task); |
| // wavefront id within a workgroup |
| uint32_t wfId; |
| uint32_t maxDynWaveId; |
| uint32_t dispatchId; |
| // vector and scalar memory requests pending in memory system |
| int outstandingReqs; |
| // outstanding global memory write requests |
| int outstandingReqsWrGm; |
| // outstanding local memory write requests |
| int outstandingReqsWrLm; |
| // outstanding global memory read requests |
| int outstandingReqsRdGm; |
| // outstanding local memory read requests |
| int outstandingReqsRdLm; |
| // outstanding scalar memory read requests |
| int scalarOutstandingReqsRdGm; |
| // outstanding scalar memory write requests |
| int scalarOutstandingReqsWrGm; |
| int rdLmReqsInPipe; |
| int rdGmReqsInPipe; |
| int wrLmReqsInPipe; |
| int wrGmReqsInPipe; |
| int scalarRdGmReqsInPipe; |
| int scalarWrGmReqsInPipe; |
| |
| int memTraceBusy; |
| uint64_t lastTrace; |
| // number of virtual vector registers reserved by WF |
| int reservedVectorRegs; |
| // number of virtual scalar registers reserved by WF |
| int reservedScalarRegs; |
| // Index into the Vector Register File's namespace where the WF's registers |
| // will live while the WF is executed |
| uint32_t startVgprIndex; |
| // Index into the Scalar Register File's namespace where the WF's registers |
| // will live while the WF is executed |
| uint32_t startSgprIndex; |
| |
| // Old value of destination gpr (for trace) |
| std::vector<uint32_t> oldVgpr; |
| // Id of destination gpr (for trace) |
| uint32_t oldVgprId; |
| // Tick count of last old_vgpr copy |
| uint64_t oldVgprTcnt; |
| |
| // Old value of destination gpr (for trace) |
| std::vector<uint64_t> oldDgpr; |
| // Id of destination gpr (for trace) |
| uint32_t oldDgprId; |
| // Tick count of last old_vgpr copy |
| uint64_t oldDgprTcnt; |
| |
| // Execution mask at wavefront start |
| VectorMask initMask; |
| |
| // a pointer to the fraction of the LDS allocated |
| // to this workgroup (thus this wavefront) |
| LdsChunk *ldsChunk; |
| |
| // unique WF id over all WFs executed across all CUs |
| uint64_t wfDynId; |
| |
| // dyn inst id (per SIMD) of last instruction exec from this wave |
| uint64_t lastInstExec; |
| |
| // Map to track the dyn instruction id of each vector register value |
| // produced, indexed by physical vector register ID |
| std::unordered_map<int,uint64_t> rawDist; |
| |
| // Counts the number of reads performed to each physical register |
| // - counts are reset to 0 for each dynamic wavefront launched |
| std::vector<int> vecReads; |
| |
| void initRegState(HSAQueueEntry *task, int wgSizeInWorkItems); |
| |
| // context for save/restore |
| uint8_t *context; |
| |
| typedef WavefrontParams Params; |
| Wavefront(const Params &p); |
| ~Wavefront(); |
| virtual void init(); |
| |
| void |
| setParent(ComputeUnit *cu) |
| { |
| computeUnit = cu; |
| } |
| |
| void validateRequestCounters(); |
| void start(uint64_t _wfDynId, uint64_t _base_ptr); |
| void exec(); |
| // called by SCH stage to reserve |
| std::vector<int> reserveResources(); |
| bool stopFetch(); |
| |
| Addr pc() const; |
| void pc(Addr new_pc); |
| |
| VectorMask& execMask(); |
| bool execMask(int lane) const; |
| |
| |
| void discardFetch(); |
| |
| bool waitCntsSatisfied(); |
| void setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt); |
| void clearWaitCnts(); |
| |
| void incVMemInstsIssued(); |
| void incExpInstsIssued(); |
| void incLGKMInstsIssued(); |
| void decVMemInstsIssued(); |
| void decExpInstsIssued(); |
| void decLGKMInstsIssued(); |
| |
| /** Freeing VRF space */ |
| void freeRegisterFile(); |
| |
| bool sleepDone(); |
| void setSleepTime(int sleep_time); |
| |
| TheGpuISA::GPUISA& |
| gpuISA() |
| { |
| return _gpuISA; |
| } |
| |
| void barrierId(int bar_id); |
| int barrierId() const; |
| bool hasBarrier() const; |
| void releaseBarrier(); |
| |
| private: |
| TheGpuISA::GPUISA _gpuISA; |
| |
| void reserveGmResource(GPUDynInstPtr ii); |
| void reserveLmResource(GPUDynInstPtr ii); |
| |
| /** |
| * the following are used for waitcnt instructions |
| * vmWaitCnt: once set, we wait for the oustanding |
| * number of vector mem instructions to be |
| * at, or below vmWaitCnt. |
| * |
| * expWaitCnt: once set, we wait for the outstanding |
| * number outstanding VM writes or EXP |
| * insts to be at, or below expWaitCnt. |
| * |
| * lgkmWaitCnt: once set, we wait for the oustanding |
| * number of LDS, GDS, scalar memory, |
| * and message instructions to be at, or |
| * below lgkmCount. we currently do not |
| * support GDS/message ops. |
| */ |
| int vmWaitCnt; |
| int expWaitCnt; |
| int lgkmWaitCnt; |
| int vmemInstsIssued; |
| int expInstsIssued; |
| int lgkmInstsIssued; |
| int sleepCnt; |
| status_e status; |
| Addr _pc; |
| VectorMask _execMask; |
| int barId; |
| |
| public: |
| struct WavefrontStats : public statistics::Group |
| { |
| WavefrontStats(statistics::Group *parent); |
| |
| // Number of instructions executed by this wavefront slot across all |
| // dynamic wavefronts |
| statistics::Scalar numInstrExecuted; |
| |
| // Number of cycles this WF spends in SCH stage |
| statistics::Scalar schCycles; |
| |
| // Number of stall cycles encounterd by this WF in SCH stage |
| statistics::Scalar schStalls; |
| |
| // The following stats sum to the value of schStalls, and record, per |
| // WF slot, what the cause of each stall was at a coarse granularity. |
| |
| // Cycles WF is selected by scheduler, but RFs cannot support |
| // instruction |
| statistics::Scalar schRfAccessStalls; |
| // Cycles spent waiting for execution resources |
| statistics::Scalar schResourceStalls; |
| // cycles spent waiting for RF reads to complete in SCH stage |
| statistics::Scalar schOpdNrdyStalls; |
| // LDS arbitration stall cycles. WF attempts to execute LM instruction, |
| // but another wave is executing FLAT, which requires LM and GM and |
| // forces this WF to stall. |
| statistics::Scalar schLdsArbStalls; |
| |
| // number of times an instruction of a WF is blocked from being issued |
| // due to WAR and WAW dependencies |
| statistics::Scalar numTimesBlockedDueWAXDependencies; |
| // number of times an instruction of a WF is blocked from being issued |
| // due to WAR and WAW dependencies |
| statistics::Scalar numTimesBlockedDueRAWDependencies; |
| |
| // Distribution to track the distance between producer and consumer |
| // for vector register values |
| statistics::Distribution vecRawDistance; |
| |
| // Distribution to track the number of times every vector register |
| // value produced is consumed. |
| statistics::Distribution readsPerWrite; |
| } stats; |
| }; |
| |
| } // namespace gem5 |
| |
| #endif // __GPU_COMPUTE_WAVEFRONT_HH__ |