/*
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __COMPUTE_UNIT_HH__
#define __COMPUTE_UNIT_HH__
#include <deque>
#include <map>
#include <unordered_set>
#include <vector>
#include "base/callback.hh"
#include "base/compiler.hh"
#include "base/statistics.hh"
#include "base/stats/group.hh"
#include "base/types.hh"
#include "config/the_gpu_isa.hh"
#include "enums/PrefetchType.hh"
#include "gpu-compute/comm.hh"
#include "gpu-compute/exec_stage.hh"
#include "gpu-compute/fetch_stage.hh"
#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/local_memory_pipeline.hh"
#include "gpu-compute/register_manager.hh"
#include "gpu-compute/scalar_memory_pipeline.hh"
#include "gpu-compute/schedule_stage.hh"
#include "gpu-compute/scoreboard_check_stage.hh"
#include "mem/port.hh"
#include "mem/token_port.hh"
#include "sim/clocked_object.hh"
namespace gem5
{
class HSAQueueEntry;
class LdsChunk;
class ScalarRegisterFile;
class Shader;
class VectorRegisterFile;
struct ComputeUnitParams;
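// Wavefront-selection policy for execution arbitration: OLDEST issues the
// oldest ready wavefront first; RR rotates round-robin among ready
// wavefronts.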
enum EXEC_POLICY
{
OLDEST = 0,
RR
};
enum TLB_CACHE
{
TLB_MISS_CACHE_MISS = 0,
TLB_MISS_CACHE_HIT,
TLB_HIT_CACHE_MISS,
TLB_HIT_CACHE_HIT
};
/**
* WF barrier slots. This represents the barrier resource for
* WF-level barriers (i.e., barriers to sync WFs within a WG).
*/
class WFBarrier
{
public:
WFBarrier() : _numAtBarrier(0), _maxBarrierCnt(0)
{
}
static const int InvalidID = -1;
int
numAtBarrier() const
{
return _numAtBarrier;
}
/**
* Number of WFs that have not yet reached the barrier.
*/
int
numYetToReachBarrier() const
{
return _maxBarrierCnt - _numAtBarrier;
}
int
maxBarrierCnt() const
{
return _maxBarrierCnt;
}
/**
* Set the maximum barrier count (i.e., the number of WFs that are
* participating in the barrier).
*/
void
setMaxBarrierCnt(int max_barrier_cnt)
{
_maxBarrierCnt = max_barrier_cnt;
}
/**
* Mark that a WF has reached the barrier.
*/
void
incNumAtBarrier()
{
assert(_numAtBarrier < _maxBarrierCnt);
++_numAtBarrier;
}
/**
* Have all WFs participating in this barrier reached the barrier?
* If so, then the barrier is satisfied and WFs may proceed past
* the barrier.
*/
bool
allAtBarrier() const
{
return _numAtBarrier == _maxBarrierCnt;
}
/**
* Decrement the number of WFs that are participating in this barrier.
* This should be called when a WF exits.
*/
void
decMaxBarrierCnt()
{
assert(_maxBarrierCnt > 0);
--_maxBarrierCnt;
}
/**
* Release this barrier resource so it can be used by other WGs. This
* is generally called when a WG has finished.
*/
void
release()
{
_numAtBarrier = 0;
_maxBarrierCnt = 0;
}
/**
* Reset the barrier, which is typically done once a dynamic instance
* of the barrier has been satisfied (i.e., all participating WFs have
* reached it).
*/
void
reset()
{
_numAtBarrier = 0;
}
private:
/**
* The number of WFs in the WG that have reached the barrier. Once
* the number of WFs that reach a barrier matches the number of WFs
* in the WG, the barrier is satisfied.
*/
int _numAtBarrier;
/**
* The maximum number of WFs that can reach this barrier. This is
* essentially the number of WFs in the WG, and a barrier is satisfied
* when the number of WFs that reach the barrier equal this value. If
* a WF exits early it must decrement this value so that it is no
* longer considered for this barrier.
*/
int _maxBarrierCnt;
};
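/*
 * Illustrative sketch (not part of the interface): a plausible lifecycle
 * of a WFBarrier slot, pieced together from the methods above. The actual
 * call sites live in the ComputeUnit implementation; 'bar' and 'wfs_in_wg'
 * below are hypothetical names.
 *
 *   WFBarrier &bar = ...;             // a free slot owned by the CU
 *   bar.setMaxBarrierCnt(wfs_in_wg);  // WG dispatch: # of participating WFs
 *   bar.incNumAtBarrier();            // a WF reaches the barrier
 *   if (bar.allAtBarrier())
 *       bar.reset();                  // dynamic barrier instance satisfied
 *   bar.decMaxBarrierCnt();           // a WF exits early
 *   bar.release();                    // WG finished; slot can be reused
 */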
class ComputeUnit : public ClockedObject
{
public:
// Execution resources
//
// The ordering of units is:
// Vector ALUs
// Scalar ALUs
// GM Pipe
// LM Pipe
// Scalar Mem Pipe
//
// Note: the ordering of units is important and the code assumes the
// above ordering. However, there may be more than one resource of
// each type (e.g., 4 VALUs or 2 SALUs)
int numVectorGlobalMemUnits;
// Resource control for global memory to VRF data/address bus
WaitClass glbMemToVrfBus;
// Resource control for Vector Register File->Global Memory pipe buses
WaitClass vrfToGlobalMemPipeBus;
// Resource control for Vector Global Memory execution unit
WaitClass vectorGlobalMemUnit;
int numVectorSharedMemUnits;
// Resource control for local memory to VRF data/address bus
WaitClass locMemToVrfBus;
// Resource control for Vector Register File->Local Memory pipe buses
WaitClass vrfToLocalMemPipeBus;
// Resource control for Vector Shared/Local Memory execution unit
WaitClass vectorSharedMemUnit;
int numScalarMemUnits;
// Resource control for scalar memory to SRF data/address bus
WaitClass scalarMemToSrfBus;
// Resource control for Scalar Register File->Scalar Memory pipe buses
WaitClass srfToScalarMemPipeBus;
// Resource control for Scalar Memory execution unit
WaitClass scalarMemUnit;
// vector ALU execution resources
int numVectorALUs;
std::vector<WaitClass> vectorALUs;
// scalar ALU execution resources
int numScalarALUs;
std::vector<WaitClass> scalarALUs;
// Return total number of execution units on this CU
int numExeUnits() const;
// index into readyList of the first memory unit
int firstMemUnit() const;
// index into readyList of the last memory unit
int lastMemUnit() const;
// index into scalarALUs vector of SALU used by the wavefront
int mapWaveToScalarAlu(Wavefront *w) const;
// index into readyList of SALU used by wavefront
int mapWaveToScalarAluGlobalIdx(Wavefront *w) const;
// index into readyList of Global Memory unit used by wavefront
int mapWaveToGlobalMem(Wavefront *w) const;
// index into readyList of Local Memory unit used by wavefront
int mapWaveToLocalMem(Wavefront *w) const;
// index into readyList of Scalar Memory unit used by wavefront
int mapWaveToScalarMem(Wavefront *w) const;
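// A hedged sketch of the global execution-unit indexing implied by the
// ordering comment above (illustrative only; the accessor definitions in
// the .cc file are authoritative):
//
//   int numExeUnits() const
//   {
//       return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits +
//           numVectorSharedMemUnits + numScalarMemUnits;
//   }
//   int firstMemUnit() const { return numVectorALUs + numScalarALUs; }
//   int lastMemUnit() const  { return numExeUnits() - 1; }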
int vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
int coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
int numCyclesPerStoreTransfer; // number of cycles per vector store
int numCyclesPerLoadTransfer; // number of cycles per vector load
// track presence of dynamic instructions in the Schedule pipeline
// stage. This is used to check the readiness of the oldest,
// non-dispatched instruction of every WF in the Scoreboard stage.
std::unordered_set<uint64_t> pipeMap;
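// A hedged sketch of how pipeMap is used (illustrative only): the Schedule
// stage records the sequence number of each dynamic instruction it accepts
// and removes it when the instruction leaves the stage, so the Scoreboard
// stage can test membership before re-considering a WF's oldest
// non-dispatched instruction. The seqNum() accessor name is an assumption.
//
//   pipeMap.emplace(gpuDynInst->seqNum());    // see insertInPipeMap()
//   bool in_flight = pipeMap.count(gpuDynInst->seqNum());
//   pipeMap.erase(gpuDynInst->seqNum());      // see deleteFromPipeMap()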
RegisterManager* registerManager;
FetchStage fetchStage;
ScoreboardCheckStage scoreboardCheckStage;
ScheduleStage scheduleStage;
ExecStage execStage;
GlobalMemPipeline globalMemoryPipe;
LocalMemPipeline localMemoryPipe;
ScalarMemPipeline scalarMemoryPipe;
EventFunctionWrapper tickEvent;
typedef ComputeUnitParams Params;
std::vector<std::vector<Wavefront*>> wfList;
int cu_id;
// array of vector register files, one per SIMD
std::vector<VectorRegisterFile*> vrf;
// array of scalar register files, one per SIMD
std::vector<ScalarRegisterFile*> srf;
// Width per VALU/SIMD unit: number of work items that can be executed
// on the vector ALU simultaneously in a SIMD unit
int simdWidth;
// number of pipe stages for bypassing data to next dependent single
// precision vector instruction inside the vector ALU pipeline
int spBypassPipeLength;
// number of pipe stages for bypassing data to next dependent double
// precision vector instruction inside the vector ALU pipeline
int dpBypassPipeLength;
// number of pipe stages for scalar ALU
int scalarPipeStages;
// number of pipe stages for operand collection & distribution network
int operandNetworkLength;
// number of cycles per instruction issue period
Cycles issuePeriod;
// VRF to GM Bus latency
Cycles vrf_gm_bus_latency;
// SRF to Scalar Mem Bus latency
Cycles srf_scm_bus_latency;
// VRF to LM Bus latency
Cycles vrf_lm_bus_latency;
// tracks the last cycle a vector instruction was executed on a SIMD
std::vector<uint64_t> lastExecCycle;
// tracks the number of dyn inst executed per SIMD
std::vector<uint64_t> instExecPerSimd;
// true if we allow a separate TLB per lane
bool perLaneTLB;
// if 0, TLB prefetching is off.
int prefetchDepth;
// if fixed-stride prefetching, this is the stride.
int prefetchStride;
std::vector<Addr> lastVaddrCU;
std::vector<std::vector<Addr>> lastVaddrSimd;
std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
enums::PrefetchType prefetchType;
EXEC_POLICY exec_policy;
bool debugSegFault;
// Idle CU timeout in ticks
Tick idleCUTimeout;
int idleWfs;
bool functionalTLB;
bool localMemBarrier;
/*
* For counting page accesses
*/
bool countPages;
Shader *shader;
Tick req_tick_latency;
Tick resp_tick_latency;
Tick scalar_req_tick_latency;
Tick scalar_resp_tick_latency;
/**
* Number of WFs to schedule to each SIMD. This vector is populated
* by hasDispResources(), and consumed by the subsequent call to
* dispWorkgroup(), to schedule the specified number of WFs to the
* SIMD units. Entry I provides the number of WFs to schedule to SIMD I.
*/
std::vector<int> numWfsToSched;
// number of currently reserved vector registers per SIMD unit
std::vector<int> vectorRegsReserved;
// number of currently reserved scalar registers per SIMD unit
std::vector<int> scalarRegsReserved;
// number of vector registers per SIMD unit
int numVecRegsPerSimd;
// number of available scalar registers per SIMD unit
int numScalarRegsPerSimd;
// this hash map will keep track of page divergence
// per memory instruction per wavefront. The hash map
// is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
std::map<Addr, int> pagesTouched;
void insertInPipeMap(Wavefront *w);
void deleteFromPipeMap(Wavefront *w);
ComputeUnit(const Params &p);
~ComputeUnit();
// Timing Functions
int oprNetPipeLength() const { return operandNetworkLength; }
int simdUnitWidth() const { return simdWidth; }
int spBypassLength() const { return spBypassPipeLength; }
int dpBypassLength() const { return dpBypassPipeLength; }
int scalarPipeLength() const { return scalarPipeStages; }
int storeBusLength() const { return numCyclesPerStoreTransfer; }
int loadBusLength() const { return numCyclesPerLoadTransfer; }
int wfSize() const { return wavefrontSize; }
void exec();
void initiateFetch(Wavefront *wavefront);
void fetch(PacketPtr pkt, Wavefront *wavefront);
void fillKernelState(Wavefront *w, HSAQueueEntry *task);
void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
HSAQueueEntry *task, int bar_id,
bool fetchContext=false);
void doInvalidate(RequestPtr req, int kernId);
void doFlush(GPUDynInstPtr gpuDynInst);
void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg);
bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg);
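// Illustrative calling pattern (a sketch, not the dispatcher's actual
// code): per the numWfsToSched comment above, hasDispResources() is
// queried first and, only if it succeeds, dispWorkgroup() is called with
// the same task. 'cu' and 'task' are hypothetical names.
//
//   int num_wfs_in_wg = 0;
//   if (cu->hasDispResources(task, num_wfs_in_wg)) {
//       cu->dispWorkgroup(task, num_wfs_in_wg);
//   }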
int cacheLineSize() const { return _cacheLineSize; }
int getCacheLineBits() const { return cacheLineBits; }
void resetRegisterPool();
private:
WFBarrier&
barrierSlot(int bar_id)
{
assert(bar_id > WFBarrier::InvalidID);
return wfBarrierSlots.at(bar_id);
}
int
getFreeBarrierId()
{
assert(freeBarrierIds.size());
auto free_bar_id = freeBarrierIds.begin();
int bar_id = *free_bar_id;
freeBarrierIds.erase(free_bar_id);
return bar_id;
}
public:
int numYetToReachBarrier(int bar_id);
bool allAtBarrier(int bar_id);
void incNumAtBarrier(int bar_id);
int numAtBarrier(int bar_id);
int maxBarrierCnt(int bar_id);
void resetBarrier(int bar_id);
void decMaxBarrierCnt(int bar_id);
void releaseBarrier(int bar_id);
void releaseWFsFromBarrier(int bar_id);
int numBarrierSlots() const { return _numBarrierSlots; }
template<typename c0, typename c1>
void doSmReturn(GPUDynInstPtr gpuDynInst);
virtual void init() override;
void sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt);
void sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt);
void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
bool kernelMemSync,
RequestPtr req=nullptr);
void handleMemPacket(PacketPtr pkt, int memport_index);
bool processTimingPacket(PacketPtr pkt);
void processFetchReturn(PacketPtr pkt);
void updatePageDivergenceDist(Addr addr);
RequestorID requestorId() { return _requestorId; }
RequestorID vramRequestorId();
bool isDone() const;
bool isVectorAluIdle(uint32_t simdId) const;
void handleSQCReturn(PacketPtr pkt);
protected:
RequestorID _requestorId;
LdsState &lds;
public:
LdsState &
getLds() const
{
return lds;
}
int32_t
getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;
[[nodiscard]] bool sendToLds(GPUDynInstPtr gpuDynInst);
typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
pageDataStruct pageAccesses;
void exitCallback();
class GMTokenPort : public TokenRequestPort
{
public:
GMTokenPort(const std::string& name, SimObject *owner,
PortID id = InvalidPortID)
: TokenRequestPort(name, owner, id)
{ }
~GMTokenPort() { }
protected:
bool recvTimingResp(PacketPtr) { return false; }
void recvReqRetry() { }
};
// Manager for the number of tokens available to this compute unit to
// send global memory request packets to the coalescer. This is only used
// between the global memory pipe and the TCP coalescer.
TokenManager *memPortTokens;
GMTokenPort gmTokenPort;
/** Data access Port **/
class DataPort : public RequestPort
{
public:
DataPort(const std::string &_name, ComputeUnit *_cu, PortID id)
: RequestPort(_name, id), computeUnit(_cu) { }
bool snoopRangeSent;
struct SenderState : public Packet::SenderState
{
GPUDynInstPtr _gpuDynInst;
PortID port_index;
Packet::SenderState *saved;
SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
Packet::SenderState *sender_state=nullptr)
: _gpuDynInst(gpuDynInst),
port_index(_port_index),
saved(sender_state) { }
};
class SystemHubEvent : public Event
{
DataPort *dataPort;
PacketPtr reqPkt;
public:
SystemHubEvent(PacketPtr pkt, DataPort *_dataPort)
: dataPort(_dataPort), reqPkt(pkt)
{
setFlags(Event::AutoDelete);
}
void
process()
{
// DMAs do not operate on packets and therefore do not
// convert to a response. Do that here instead.
reqPkt->makeResponse();
dataPort->handleResponse(reqPkt);
}
};
void processMemReqEvent(PacketPtr pkt);
EventFunctionWrapper *createMemReqEvent(PacketPtr pkt);
void processMemRespEvent(PacketPtr pkt);
EventFunctionWrapper *createMemRespEvent(PacketPtr pkt);
std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;
bool handleResponse(PacketPtr pkt);
protected:
ComputeUnit *computeUnit;
virtual bool recvTimingResp(PacketPtr pkt);
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
virtual void recvFunctional(PacketPtr pkt) { }
virtual void recvRangeChange() { }
virtual void recvReqRetry();
virtual void
getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
{
resp.clear();
snoop = true;
}
};
// Scalar data cache access port
class ScalarDataPort : public RequestPort
{
public:
ScalarDataPort(const std::string &_name, ComputeUnit *_cu)
: RequestPort(_name), computeUnit(_cu)
{
}
bool recvTimingResp(PacketPtr pkt) override;
void recvReqRetry() override;
struct SenderState : public Packet::SenderState
{
SenderState(GPUDynInstPtr gpuDynInst,
Packet::SenderState *sender_state=nullptr)
: _gpuDynInst(gpuDynInst), saved(sender_state)
{
}
GPUDynInstPtr _gpuDynInst;
Packet::SenderState *saved;
};
class MemReqEvent : public Event
{
private:
ScalarDataPort &scalarDataPort;
PacketPtr pkt;
public:
MemReqEvent(ScalarDataPort &_scalar_data_port, PacketPtr _pkt)
: Event(), scalarDataPort(_scalar_data_port), pkt(_pkt)
{
setFlags(Event::AutoDelete);
}
void process();
const char *description() const;
};
class SystemHubEvent : public Event
{
ScalarDataPort *dataPort;
PacketPtr reqPkt;
public:
SystemHubEvent(PacketPtr pkt, ScalarDataPort *_dataPort)
: dataPort(_dataPort), reqPkt(pkt)
{
setFlags(Event::AutoDelete);
}
void
process()
{
// DMAs do not operate on packets and therefore do not
// convert to a response. Do that here instead.
reqPkt->makeResponse();
dataPort->handleResponse(reqPkt);
}
};
bool handleResponse(PacketPtr pkt);
std::deque<PacketPtr> retries;
private:
ComputeUnit *computeUnit;
};
// Instruction cache access port
class SQCPort : public RequestPort
{
public:
SQCPort(const std::string &_name, ComputeUnit *_cu)
: RequestPort(_name), computeUnit(_cu) { }
bool snoopRangeSent;
struct SenderState : public Packet::SenderState
{
Wavefront *wavefront;
Packet::SenderState *saved;
// kernel id to be used in handling I-Cache invalidate response
int kernId;
SenderState(Wavefront *_wavefront, Packet::SenderState
*sender_state=nullptr, int _kernId=-1)
: wavefront(_wavefront), saved(sender_state),
kernId(_kernId){ }
};
std::deque<std::pair<PacketPtr, Wavefront*>> retries;
protected:
ComputeUnit *computeUnit;
virtual bool recvTimingResp(PacketPtr pkt);
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
virtual void recvFunctional(PacketPtr pkt) { }
virtual void recvRangeChange() { }
virtual void recvReqRetry();
virtual void
getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
{
resp.clear();
snoop = true;
}
};
/** Data TLB port **/
class DTLBPort : public RequestPort
{
public:
DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID id)
: RequestPort(_name, id), computeUnit(_cu),
stalled(false)
{ }
bool isStalled() { return stalled; }
void stallPort() { stalled = true; }
void unstallPort() { stalled = false; }
/**
* here we queue all the translation requests that were
* not successfully sent.
*/
std::deque<PacketPtr> retries;
/** SenderState is information carried along with the packet
* throughout the TLB hierarchy
*/
struct SenderState: public Packet::SenderState
{
// the memInst that this is associated with
GPUDynInstPtr _gpuDynInst;
// the lane in the memInst this is associated with, so we send
// the memory request down the right port
PortID portIndex;
// constructor used for packets involved in timing accesses
SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
: _gpuDynInst(gpuDynInst), portIndex(port_index) { }
};
protected:
ComputeUnit *computeUnit;
bool stalled;
virtual bool recvTimingResp(PacketPtr pkt);
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
virtual void recvFunctional(PacketPtr pkt) { }
virtual void recvRangeChange() { }
virtual void recvReqRetry();
};
class ScalarDTLBPort : public RequestPort
{
public:
ScalarDTLBPort(const std::string &_name, ComputeUnit *_cu)
: RequestPort(_name), computeUnit(_cu), stalled(false)
{
}
struct SenderState : public Packet::SenderState
{
SenderState(GPUDynInstPtr gpuDynInst) : _gpuDynInst(gpuDynInst) { }
GPUDynInstPtr _gpuDynInst;
};
bool recvTimingResp(PacketPtr pkt) override;
void recvReqRetry() override { assert(false); }
bool isStalled() const { return stalled; }
void stallPort() { stalled = true; }
void unstallPort() { stalled = false; }
std::deque<PacketPtr> retries;
private:
ComputeUnit *computeUnit;
bool stalled;
};
class ITLBPort : public RequestPort
{
public:
ITLBPort(const std::string &_name, ComputeUnit *_cu)
: RequestPort(_name), computeUnit(_cu), stalled(false) { }
bool isStalled() { return stalled; }
void stallPort() { stalled = true; }
void unstallPort() { stalled = false; }
/**
* here we queue all the translation requests that were
* not successfully sent.
*/
std::deque<PacketPtr> retries;
/** SenderState is information carried along with the packet
* throughout the TLB hierarchy
*/
struct SenderState: public Packet::SenderState
{
// The wavefront associated with this request
Wavefront *wavefront;
SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
};
protected:
ComputeUnit *computeUnit;
bool stalled;
virtual bool recvTimingResp(PacketPtr pkt);
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
virtual void recvFunctional(PacketPtr pkt) { }
virtual void recvRangeChange() { }
virtual void recvReqRetry();
};
/**
* the port intended to communicate between the CU and its LDS
*/
class LDSPort : public RequestPort
{
public:
LDSPort(const std::string &_name, ComputeUnit *_cu)
: RequestPort(_name), computeUnit(_cu)
{
}
bool isStalled() const { return stalled; }
void stallPort() { stalled = true; }
void unstallPort() { stalled = false; }
/**
* here we queue all the requests that were
* not successfully sent.
*/
std::queue<PacketPtr> retries;
/**
* SenderState is information carried along with the packet, esp. the
* GPUDynInstPtr
*/
class SenderState: public Packet::SenderState
{
protected:
// The actual read/write/atomic request that goes with this command
GPUDynInstPtr _gpuDynInst = nullptr;
public:
SenderState(GPUDynInstPtr gpuDynInst):
_gpuDynInst(gpuDynInst)
{
}
GPUDynInstPtr
getMemInst() const
{
return _gpuDynInst;
}
};
virtual bool
sendTimingReq(PacketPtr pkt);
protected:
bool stalled = false; ///< whether or not it is stalled
ComputeUnit *computeUnit;
virtual bool
recvTimingResp(PacketPtr pkt);
virtual Tick
recvAtomic(PacketPtr pkt) { return 0; }
virtual void
recvFunctional(PacketPtr pkt)
{
}
virtual void
recvRangeChange()
{
}
virtual void
recvReqRetry();
};
/** The port to access the Local Data Store
* Can be connected to an LDS object
*/
LDSPort ldsPort;
TokenManager *
getTokenManager()
{
return memPortTokens;
}
/** The memory port for SIMD data accesses.
* Can be connected to PhysMem or Ruby for timing simulations
*/
std::vector<DataPort> memPort;
// port to the TLB hierarchy (i.e., the L1 TLB)
std::vector<DTLBPort> tlbPort;
// port to the scalar data cache
ScalarDataPort scalarDataPort;
// port to the scalar data TLB
ScalarDTLBPort scalarDTLBPort;
// port to the SQC (i.e. the I-cache)
SQCPort sqcPort;
// port to the SQC TLB (there's a separate TLB for each I-cache)
ITLBPort sqcTLBPort;
Port &
getPort(const std::string &if_name, PortID idx) override
{
if (if_name == "memory_port" && idx < memPort.size()) {
return memPort[idx];
} else if (if_name == "translation_port" && idx < tlbPort.size()) {
return tlbPort[idx];
} else if (if_name == "scalar_port") {
return scalarDataPort;
} else if (if_name == "scalar_tlb_port") {
return scalarDTLBPort;
} else if (if_name == "sqc_port") {
return sqcPort;
} else if (if_name == "sqc_tlb_port") {
return sqcTLBPort;
} else if (if_name == "ldsPort") {
return ldsPort;
} else if (if_name == "gmTokenPort") {
return gmTokenPort;
} else {
return ClockedObject::getPort(if_name, idx);
}
}
InstSeqNum getAndIncSeqNum() { return globalSeqNum++; }
private:
const int _cacheLineSize;
const int _numBarrierSlots;
int cacheLineBits;
InstSeqNum globalSeqNum;
int wavefrontSize;
/**
* TODO: Update these comments once the pipe stage interface has
* been fully refactored.
*
* Pipeline stage interfaces.
*
* Buffers used to communicate between the various pipeline stages.
*
* dispatchList: the list of waves to be dispatched to each execution
* resource. An EXREADY state implies the dispatch list is non-empty and
* the execution unit has something to execute this cycle. Currently, the
* dispatch list of an execution resource can hold only one wave, because
* an execution resource can execute only one wave per cycle. dispatchList
* is used to communicate between the schedule and exec stages.
*
* At a high level, the following intra-/inter-stage communication occurs:
* SCB to SCH: readyList provides a per-exec-resource list of waves that
* passed dependency and readiness checks. If selected by the
* scheduler, an attempt is made to add the wave to schList,
* conditional on RF support.
* SCH: schList holds waves that are gathering operands or waiting
* for execution resource availability. Once ready, waves are
* placed on the dispatchList as candidates for execution. A wave
* may spend multiple cycles on the schList in the SCH stage due to
* RF access conflicts or execution resource contention.
* SCH to EX: dispatchList holds waves that are ready to be executed.
* LM/FLAT arbitration may remove an LM wave and place it
* back on the schList. The RF model may also force a wave back
* to the schList when using the detailed model.
*/
ScoreboardCheckToSchedule scoreboardCheckToSchedule;
ScheduleToExecute scheduleToExecute;
/**
* The barrier slots for this CU.
*/
std::vector<WFBarrier> wfBarrierSlots;
/**
* A set used to easily retrieve a free barrier ID.
*/
std::unordered_set<int> freeBarrierIds;
// holds the arrival time of the first cache block related to a
// particular GPUDynInst. This is used to calculate the difference
// between the first and last cache block arrival times.
std::unordered_map<GPUDynInstPtr, Tick> headTailMap;
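// A hedged sketch of how headTailMap might feed the headTailLatency stat
// below (illustrative only; not the actual .cc code):
//
//   // first cache block for this request arrives
//   headTailMap[gpuDynInst] = curTick();
//   ...
//   // last cache block arrives: sample the spread and drop the entry
//   stats.headTailLatency.sample(curTick() - headTailMap[gpuDynInst]);
//   headTailMap.erase(gpuDynInst);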
public:
void updateInstStats(GPUDynInstPtr gpuDynInst);
int activeWaves;
struct ComputeUnitStats : public statistics::Group
{
ComputeUnitStats(statistics::Group *parent, int n_wf);
statistics::Scalar vALUInsts;
statistics::Formula vALUInstsPerWF;
statistics::Scalar sALUInsts;
statistics::Formula sALUInstsPerWF;
statistics::Scalar instCyclesVALU;
statistics::Scalar instCyclesSALU;
statistics::Scalar threadCyclesVALU;
statistics::Formula vALUUtilization;
statistics::Scalar ldsNoFlatInsts;
statistics::Formula ldsNoFlatInstsPerWF;
statistics::Scalar flatVMemInsts;
statistics::Formula flatVMemInstsPerWF;
statistics::Scalar flatLDSInsts;
statistics::Formula flatLDSInstsPerWF;
statistics::Scalar vectorMemWrites;
statistics::Formula vectorMemWritesPerWF;
statistics::Scalar vectorMemReads;
statistics::Formula vectorMemReadsPerWF;
statistics::Scalar scalarMemWrites;
statistics::Formula scalarMemWritesPerWF;
statistics::Scalar scalarMemReads;
statistics::Formula scalarMemReadsPerWF;
statistics::Formula vectorMemReadsPerKiloInst;
statistics::Formula vectorMemWritesPerKiloInst;
statistics::Formula vectorMemInstsPerKiloInst;
statistics::Formula scalarMemReadsPerKiloInst;
statistics::Formula scalarMemWritesPerKiloInst;
statistics::Formula scalarMemInstsPerKiloInst;
// Cycles required to send register source (addr and data) from
// register files to memory pipeline, per SIMD.
statistics::Vector instCyclesVMemPerSimd;
statistics::Vector instCyclesScMemPerSimd;
statistics::Vector instCyclesLdsPerSimd;
statistics::Scalar globalReads;
statistics::Scalar globalWrites;
statistics::Formula globalMemInsts;
statistics::Scalar argReads;
statistics::Scalar argWrites;
statistics::Formula argMemInsts;
statistics::Scalar spillReads;
statistics::Scalar spillWrites;
statistics::Formula spillMemInsts;
statistics::Scalar groupReads;
statistics::Scalar groupWrites;
statistics::Formula groupMemInsts;
statistics::Scalar privReads;
statistics::Scalar privWrites;
statistics::Formula privMemInsts;
statistics::Scalar readonlyReads;
statistics::Scalar readonlyWrites;
statistics::Formula readonlyMemInsts;
statistics::Scalar kernargReads;
statistics::Scalar kernargWrites;
statistics::Formula kernargMemInsts;
statistics::Distribution waveLevelParallelism;
// the following stats compute the avg. TLB access latency per
// uncoalesced request (only for data)
statistics::Scalar tlbRequests;
statistics::Scalar tlbCycles;
statistics::Formula tlbLatency;
// hitsPerTLBLevel[x] are the hits in Level x TLB.
// x = 0 is the page table.
statistics::Vector hitsPerTLBLevel;
statistics::Scalar ldsBankAccesses;
statistics::Distribution ldsBankConflictDist;
// over all memory instructions executed over all wavefronts
// how many touched 0-4 pages, 4-8, ..., 60-64 pages
statistics::Distribution pageDivergenceDist;
// count of non-flat global memory vector instructions executed
statistics::Scalar dynamicGMemInstrCnt;
// count of flat global memory vector instructions executed
statistics::Scalar dynamicFlatMemInstrCnt;
statistics::Scalar dynamicLMemInstrCnt;
statistics::Scalar wgBlockedDueBarrierAllocation;
statistics::Scalar wgBlockedDueLdsAllocation;
// Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
// active when the instruction is committed, this number is still
// incremented by 1
statistics::Scalar numInstrExecuted;
// Number of cycles between successive instruction executions across all
// wavefronts of the same CU
statistics::Distribution execRateDist;
// number of individual vector operations executed
statistics::Scalar numVecOpsExecuted;
// number of individual f16 vector operations executed
statistics::Scalar numVecOpsExecutedF16;
// number of individual f32 vector operations executed
statistics::Scalar numVecOpsExecutedF32;
// number of individual f64 vector operations executed
statistics::Scalar numVecOpsExecutedF64;
// number of individual FMA 16,32,64 vector operations executed
statistics::Scalar numVecOpsExecutedFMA16;
statistics::Scalar numVecOpsExecutedFMA32;
statistics::Scalar numVecOpsExecutedFMA64;
// number of individual MAC 16,32,64 vector operations executed
statistics::Scalar numVecOpsExecutedMAC16;
statistics::Scalar numVecOpsExecutedMAC32;
statistics::Scalar numVecOpsExecutedMAC64;
// number of individual MAD 16,32,64 vector operations executed
statistics::Scalar numVecOpsExecutedMAD16;
statistics::Scalar numVecOpsExecutedMAD32;
statistics::Scalar numVecOpsExecutedMAD64;
// total number of two op FP vector operations executed
statistics::Scalar numVecOpsExecutedTwoOpFP;
// Total cycles that something is running on the GPU
statistics::Scalar totalCycles;
statistics::Formula vpc; // vector ops per cycle
statistics::Formula vpc_f16; // vector ops per cycle
statistics::Formula vpc_f32; // vector ops per cycle
statistics::Formula vpc_f64; // vector ops per cycle
statistics::Formula ipc; // vector instructions per cycle
statistics::Distribution controlFlowDivergenceDist;
statistics::Distribution activeLanesPerGMemInstrDist;
statistics::Distribution activeLanesPerLMemInstrDist;
// number of vector ALU instructions received
statistics::Formula numALUInstsExecuted;
// number of times a WG cannot start due to lack of free VGPRs in SIMDs
statistics::Scalar numTimesWgBlockedDueVgprAlloc;
// number of times a WG cannot start due to lack of free SGPRs in SIMDs
statistics::Scalar numTimesWgBlockedDueSgprAlloc;
statistics::Scalar numCASOps;
statistics::Scalar numFailedCASOps;
statistics::Scalar completedWfs;
statistics::Scalar completedWGs;
// distribution of the latency difference between the first and last
// cache block arrival ticks
statistics::Distribution headTailLatency;
// Track the amount of interleaving between wavefronts on each SIMD.
// This stat is sampled using instExecPerSimd to compute the number
// of instructions that have been executed on a SIMD between a WF
// executing two successive instructions.
statistics::VectorDistribution instInterleave;
} stats;
};
} // namespace gem5
#endif // __COMPUTE_UNIT_HH__