/*
* Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __GPU_DYN_INST_HH__
#define __GPU_DYN_INST_HH__
#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "base/amo.hh"
#include "base/logging.hh"
#include "base/trace.hh"
#include "debug/GPUMem.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"
#include "gpu-compute/operand_info.hh"
namespace gem5
{
class GPUStaticInst;
template<typename T>
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
{
public:
T c;
T s;
ComputeUnit *computeUnit;
AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
: c(_c), s(_s), computeUnit(compute_unit) { }
void
execute(T *b)
{
computeUnit->stats.numCASOps++;
if (*b == c) {
*b = s;
} else {
computeUnit->stats.numFailedCASOps++;
}
}
AtomicOpFunctor* clone () { return new AtomicOpCAS(c, s, computeUnit); }
};
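// Illustrative sketch, not part of the interface: assuming the
// TypedAtomicOpFunctor interface from base/amo.hh, where operator()
// casts the raw byte pointer to T* and forwards to execute(), a CAS
// functor could be exercised on a memory word roughly as follows
// (cu_ptr is a hypothetical ComputeUnit*):
//
//     uint32_t mem_val = 5;
//     AtomicOpCAS<uint32_t> cas(5, 9, cu_ptr);    // compare 5, swap in 9
//     cas(reinterpret_cast<uint8_t *>(&mem_val)); // mem_val becomes 9
//
// A mismatched compare value leaves memory unchanged and instead bumps
// the compute unit's numFailedCASOps counter.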
class RegisterOperandInfo
{
public:
RegisterOperandInfo() = delete;
RegisterOperandInfo(int op_idx, int num_dwords,
const std::vector<int> &virt_indices,
const std::vector<int> &phys_indices)
: opIdx(op_idx), numDWORDs(num_dwords), virtIndices(virt_indices),
physIndices(phys_indices)
{
}
/**
* The number of registers required to store this operand.
*/
int numRegisters() const { return numDWORDs / TheGpuISA::RegSizeDWords; }
int operandIdx() const { return opIdx; }
/**
* We typically only need the first virtual register for the operand
* regardless of its size.
*/
int virtIdx(int reg_num=0) const { return virtIndices.at(reg_num); }
private:
/**
* Index of this operand within the set of its parent instruction's
* operand list.
*/
const int opIdx;
/**
* Size of this operand in DWORDs.
*/
const int numDWORDs;
const std::vector<int> virtIndices;
const std::vector<int> physIndices;
};
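// Illustrative note: numRegisters() divides the operand's DWORD count by
// the ISA register width. Assuming TheGpuISA::RegSizeDWords is 1 (one
// 32-bit DWORD per register), a 64-bit operand occupies 2 DWORDs and
// therefore 2 consecutive registers, starting at virtIdx(0).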
class GPUDynInst : public GPUExecContext
{
public:
GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst,
uint64_t instSeqNum);
~GPUDynInst();
void execute(GPUDynInstPtr gpuDynInst);
const std::vector<OperandInfo>& srcVecRegOperands() const;
const std::vector<OperandInfo>& dstVecRegOperands() const;
const std::vector<OperandInfo>& srcScalarRegOperands() const;
const std::vector<OperandInfo>& dstScalarRegOperands() const;
int numSrcRegOperands();
int numDstRegOperands();
int numSrcVecRegOperands() const;
int numDstVecRegOperands() const;
int maxSrcVecRegOperandSize();
int numSrcVecDWords();
int numDstVecDWords();
int numSrcScalarRegOperands() const;
int numDstScalarRegOperands() const;
int maxSrcScalarRegOperandSize();
int numSrcScalarDWords();
int numDstScalarDWords();
int maxOperandSize();
int getNumOperands() const;
bool hasSourceSgpr() const;
bool hasDestinationSgpr() const;
bool hasSourceVgpr() const;
bool hasDestinationVgpr() const;
// returns true if the string "opcodeStr" is found in the
// opcode of the instruction
bool isOpcode(const std::string& opcodeStr) const;
bool isOpcode(const std::string& opcodeStr,
const std::string& extStr) const;
const std::string &disassemble() const;
InstSeqNum seqNum() const;
Addr pc();
void pc(Addr _pc);
enums::StorageClassType executedAs();
// virtual address for scalar memory operations
Addr scalarAddr;
// virtual addresses for vector memory operations
std::vector<Addr> addr;
Addr pAddr;
// vector data to be written
uint8_t *d_data;
// scalar data to be transferred
uint8_t *scalar_data;
// Additional data (for atomics)
uint8_t *a_data;
// Additional data (for atomics)
uint8_t *x_data;
// The execution mask
VectorMask exec_mask;
// SIMD unit to which the WF issuing the memory instruction is mapped
int simdId;
// unique id of the WF to which the memory instruction belongs
int wfDynId;
// The kernel id of the requesting wf
int kern_id;
// The CU id of the requesting wf
int cu_id;
// The workgroup id of the requesting wf
int wg_id;
// HW slot id to which the WF is mapped inside a SIMD unit
int wfSlotId;
// execution pipeline id where the memory instruction has been scheduled
int execUnitId;
// The execution time of this operation
Tick time;
// The latency of this operation
WaitClass latency;
// Initiate the specified memory operation by creating a
// memory request and sending it off to the memory system.
void initiateAcc(GPUDynInstPtr gpuDynInst);
// Complete the specified memory operation: for a load or an
// atomic with return, write the value back to the RF; for a
// store, nothing further needs to be done.
void completeAcc(GPUDynInstPtr gpuDynInst);
void updateStats();
GPUStaticInst* staticInstruction() { return _staticInst; }
TheGpuISA::ScalarRegU32 srcLiteral() const;
bool isALU() const;
bool isBranch() const;
bool isCondBranch() const;
bool isNop() const;
bool isReturn() const;
bool isEndOfKernel() const;
bool isKernelLaunch() const;
bool isSDWAInst() const;
bool isDPPInst() const;
bool isUnconditionalJump() const;
bool isSpecialOp() const;
bool isWaitcnt() const;
bool isSleep() const;
bool isBarrier() const;
bool isMemSync() const;
bool isMemRef() const;
bool isFlat() const;
bool isFlatGlobal() const;
bool isLoad() const;
bool isStore() const;
bool isAtomic() const;
bool isAtomicNoRet() const;
bool isAtomicRet() const;
bool isScalar() const;
bool isVector() const;
bool readsSCC() const;
bool writesSCC() const;
bool readsVCC() const;
bool writesVCC() const;
bool readsExec() const;
bool writesExec() const;
bool readsMode() const;
bool writesMode() const;
bool ignoreExec() const;
bool readsFlatScratch() const;
bool writesFlatScratch() const;
bool readsExecMask() const;
bool writesExecMask() const;
bool isAtomicAnd() const;
bool isAtomicOr() const;
bool isAtomicXor() const;
bool isAtomicCAS() const;
bool isAtomicExch() const;
bool isAtomicAdd() const;
bool isAtomicSub() const;
bool isAtomicInc() const;
bool isAtomicDec() const;
bool isAtomicMax() const;
bool isAtomicMin() const;
bool isArgLoad() const;
bool isGlobalMem() const;
bool isLocalMem() const;
bool isArgSeg() const;
bool isGlobalSeg() const;
bool isGroupSeg() const;
bool isKernArgSeg() const;
bool isPrivateSeg() const;
bool isReadOnlySeg() const;
bool isSpillSeg() const;
bool isGloballyCoherent() const;
bool isSystemCoherent() const;
bool isF16() const;
bool isF32() const;
bool isF64() const;
bool isFMA() const;
bool isMAC() const;
bool isMAD() const;
// For FLAT memory ops, check the segment address against the
// APE registers to see if it falls within one of the APE ranges
// for LDS/SCRATCH/GPUVM. If it does not fall into one of the
// three APEs, it is treated as a regular global access.
void doApertureCheck(const VectorMask &mask);
// Resolve a flat access to its memory segment during the execution stage.
void resolveFlatSegment(const VectorMask &mask);
template<typename c0> AtomicOpFunctorPtr
makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
{
if (isAtomicAnd()) {
return std::make_unique<AtomicOpAnd<c0>>(*reg0);
} else if (isAtomicOr()) {
return std::make_unique<AtomicOpOr<c0>>(*reg0);
} else if (isAtomicXor()) {
return std::make_unique<AtomicOpXor<c0>>(*reg0);
} else if (isAtomicCAS()) {
return std::make_unique<AtomicOpCAS<c0>>(*reg0, *reg1, cu);
} else if (isAtomicExch()) {
return std::make_unique<AtomicOpExch<c0>>(*reg0);
} else if (isAtomicAdd()) {
return std::make_unique<AtomicOpAdd<c0>>(*reg0);
} else if (isAtomicSub()) {
return std::make_unique<AtomicOpSub<c0>>(*reg0);
} else if (isAtomicInc()) {
return std::make_unique<AtomicOpInc<c0>>();
} else if (isAtomicDec()) {
return std::make_unique<AtomicOpDec<c0>>();
} else if (isAtomicMax()) {
return std::make_unique<AtomicOpMax<c0>>(*reg0);
} else if (isAtomicMin()) {
return std::make_unique<AtomicOpMin<c0>>(*reg0);
} else {
fatal("Unrecognized atomic operation");
}
}
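// Illustrative sketch (hypothetical caller): for a 32-bit atomic, the
// issuing code would typically point reg0/reg1 at the per-lane slots of
// the atomic operand buffers and attach the resulting functor to that
// lane's memory request, e.g.
//
//     auto *op0 = &(reinterpret_cast<uint32_t *>(a_data))[lane];
//     auto *op1 = &(reinterpret_cast<uint32_t *>(x_data))[lane];
//     AtomicOpFunctorPtr amo = makeAtomicOpFunctor<uint32_t>(op0, op1);
//
// Only CAS consumes reg1; which buffer supplies the compare value and
// which the swap value is decided by the ISA implementation.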
void
setRequestFlags(RequestPtr req) const
{
if (isGloballyCoherent()) {
req->setCacheCoherenceFlags(Request::GLC_BIT);
}
if (isSystemCoherent()) {
req->setCacheCoherenceFlags(Request::SLC_BIT);
}
if (isAtomicRet()) {
req->setFlags(Request::ATOMIC_RETURN_OP);
} else if (isAtomicNoRet()) {
req->setFlags(Request::ATOMIC_NO_RETURN_OP);
}
if (isMemSync()) {
// the path for kernel launch and kernel end is different
// from non-kernel mem sync.
assert(!isKernelLaunch());
assert(!isEndOfKernel());
// must be wbinv inst if not kernel launch/end
req->setCacheCoherenceFlags(Request::INV_L1);
}
}
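// Illustrative sketch (hypothetical caller, assuming Request's
// (addr, size, flags, requestor id) constructor and a requestorId()
// accessor on the compute unit): a memory pipeline would build one
// request per active lane and let the instruction stamp its coherence
// and atomic flags onto it:
//
//     RequestPtr req = std::make_shared<Request>(
//         gpuDynInst->addr[lane], req_size, 0, cu->requestorId());
//     gpuDynInst->setRequestFlags(req);
//
// where req_size is the per-lane access size in bytes.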
// reset the number of pending memory requests for all lanes
void
resetEntireStatusVector()
{
assert(statusVector.size() == TheGpuISA::NumVecElemPerVecReg);
for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
resetStatusVector(lane);
}
}
// reset the number of pending memory requests for the given lane
void
resetStatusVector(int lane)
{
setStatusVector(lane, 0);
}
// set the number of pending memory requests for the given lane
void
setStatusVector(int lane, int newVal)
{
// currently we can have up to 2 memory requests per lane (if the
// lane's request goes across multiple cache lines)
assert((newVal >= 0) && (newVal <= 2));
statusVector[lane] = newVal;
}
// decrement the number of pending memory requests for the given lane
// by one
void
decrementStatusVector(int lane)
{
// this lane may have multiple requests, so only subtract one for
// this request
assert(statusVector[lane] >= 1);
statusVector[lane]--;
}
// return the current number of pending memory requests for the given
// lane
int
getLaneStatus(int lane) const
{
return statusVector[lane];
}
// returns true if the responses to all outstanding memory requests, for
// all lanes, have been received; else returns false
bool
allLanesZero() const
{
// local variables
bool allZero = true;
// iterate over all lanes, checking the number of pending memory
// requests they have
for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
// if any lane still has pending requests, return false
if (statusVector[lane] > 0) {
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: lane: %d has %d pending "
"request(s) for %#x\n", cu_id, simdId, wfSlotId, lane,
statusVector[lane], addr[lane]);
allZero = false;
}
}
if (allZero) {
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: all lanes have no pending"
" requests for %#x\n", cu_id, simdId, wfSlotId, addr[0]);
}
return allZero;
}
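// Illustrative lifecycle of the per-lane bookkeeping above (a sketch of
// how an issuing pipeline is expected to drive it, not a prescribed
// call sequence):
//
//     resetEntireStatusVector();    // before issuing the vector access
//     setStatusVector(lane, 2);     // this lane's access spans two lines
//     ...                           // responses return from memory
//     decrementStatusVector(lane);  // once per returning response
//     if (allLanesZero()) { /* the memory instruction may complete */ }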
// returns a string representing the current state of the statusVector
std::string
printStatusVector() const
{
std::string statusVec_str = "[";
// iterate over all lanes, adding the current number of pending
// requests for this lane to the string
for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
statusVec_str += std::to_string(statusVector[lane]);
}
statusVec_str += "]";
return statusVec_str;
}
// Maps each address satisfied by a returned packet to the lanes that
// requested that address
typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
StatusVector memStatusVector;
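// For example (hypothetical values): if lanes 3 and 4 both access the
// cache line at 0x1000, memStatusVector[0x1000] holds {3, 4}, letting the
// returned packet for that line be matched back to both lanes.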
// Track the number of pending memory requests per lane; an int (rather
// than a bit) per lane allows an unaligned access that spans two cache
// lines to be counted
std::vector<int> statusVector;
// for ld_v# or st_v#
std::vector<int> tlbHitLevel;
// for misaligned scalar ops, the number of outstanding requests is
// tracked here
int numScalarReqs;
Tick getAccessTime() const { return accessTime; }
void setAccessTime(Tick currentTime) { accessTime = currentTime; }
void profileRoundTripTime(Tick currentTime, int hopId);
std::vector<Tick> getRoundTripTime() const { return roundTripTime; }
void profileLineAddressTime(Addr addr, Tick currentTime, int hopId);
const std::map<Addr, std::vector<Tick>>& getLineAddressTime() const
{ return lineAddressTime; }
// inst used to save/restore a wavefront context
bool isSaveRestore;
private:
GPUStaticInst *_staticInst;
const InstSeqNum _seqNum;
int maxSrcVecRegOpSize;
int maxSrcScalarRegOpSize;
// the time the request was started
Tick accessTime = -1;
// holds the tick at which the instruction arrives at certain hop points
// on its way to main memory
std::vector<Tick> roundTripTime;
// holds, for each cache block address touched by the instruction, the
// ticks at which the block arrives at certain hop points
std::map<Addr, std::vector<Tick>> lineAddressTime;
};
} // namespace gem5
#endif // __GPU_DYN_INST_HH__