| /* |
| * Copyright (c) 2015-2017 Advanced Micro Devices, Inc. |
| * All rights reserved. |
| * |
| * For use for simulation and test purposes only |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright notice, |
| * this list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * |
| * 3. Neither the name of the copyright holder nor the names of its |
| * contributors may be used to endorse or promote products derived from this |
| * software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| * POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #ifndef __GPU_DYN_INST_HH__ |
| #define __GPU_DYN_INST_HH__ |
| |
| #include <cstdint> |
| #include <memory> |
| #include <string> |
| |
| #include "base/amo.hh" |
| #include "base/logging.hh" |
| #include "base/trace.hh" |
| #include "debug/GPUMem.hh" |
| #include "enums/StorageClassType.hh" |
| #include "gpu-compute/compute_unit.hh" |
| #include "gpu-compute/gpu_exec_context.hh" |
| #include "gpu-compute/operand_info.hh" |
| |
| namespace gem5 |
| { |
| |
| class GPUStaticInst; |
| |
| template<typename T> |
| class AtomicOpCAS : public TypedAtomicOpFunctor<T> |
| { |
| public: |
| T c; |
| T s; |
| |
| ComputeUnit *computeUnit; |
| |
| AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit) |
| : c(_c), s(_s), computeUnit(compute_unit) { } |
| |
| void |
| execute(T *b) |
| { |
| computeUnit->stats.numCASOps++; |
| |
| if (*b == c) { |
| *b = s; |
| } else { |
| computeUnit->stats.numFailedCASOps++; |
| } |
| } |
| AtomicOpFunctor* clone () { return new AtomicOpCAS(c, s, computeUnit); } |
| }; |
| |
| class RegisterOperandInfo |
| { |
| public: |
| RegisterOperandInfo() = delete; |
| RegisterOperandInfo(int op_idx, int num_dwords, |
| const std::vector<int> &virt_indices, |
| const std::vector<int> &phys_indices) |
| : opIdx(op_idx), numDWORDs(num_dwords), virtIndices(virt_indices), |
| physIndices(phys_indices) |
| { |
| } |
| |
| /** |
| * The number of registers required to store this operand. |
| */ |
| int numRegisters() const { return numDWORDs / TheGpuISA::RegSizeDWords; } |
| int operandIdx() const { return opIdx; } |
| /** |
| * We typically only need the first virtual register for the operand |
| * regardless of its size. |
| */ |
| int virtIdx(int reg_num=0) const { return virtIndices.at(reg_num); } |
| |
| private: |
| /** |
| * Index of this operand within the set of its parent instruction's |
| * operand list. |
| */ |
| const int opIdx; |
| /** |
| * Size of this operand in DWORDs. |
| */ |
| const int numDWORDs; |
| const std::vector<int> virtIndices; |
| const std::vector<int> physIndices; |
| }; |
| |
| class GPUDynInst : public GPUExecContext |
| { |
| public: |
| GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst, |
| uint64_t instSeqNum); |
| ~GPUDynInst(); |
| void execute(GPUDynInstPtr gpuDynInst); |
| |
| const std::vector<OperandInfo>& srcVecRegOperands() const; |
| const std::vector<OperandInfo>& dstVecRegOperands() const; |
| const std::vector<OperandInfo>& srcScalarRegOperands() const; |
| const std::vector<OperandInfo>& dstScalarRegOperands() const; |
| |
| int numSrcRegOperands(); |
| int numDstRegOperands(); |
| |
| int numSrcVecRegOperands() const; |
| int numDstVecRegOperands() const; |
| int maxSrcVecRegOperandSize(); |
| int numSrcVecDWords(); |
| int numDstVecDWords(); |
| |
| int numSrcScalarRegOperands() const; |
| int numDstScalarRegOperands() const; |
| int maxSrcScalarRegOperandSize(); |
| int numSrcScalarDWords(); |
| int numDstScalarDWords(); |
| |
| int maxOperandSize(); |
| |
| int getNumOperands() const; |
| |
| bool hasSourceSgpr() const; |
| bool hasDestinationSgpr() const; |
| bool hasSourceVgpr() const; |
| bool hasDestinationVgpr() const; |
| |
| // returns true if the string "opcodeStr" is found in the |
| // opcode of the instruction |
| bool isOpcode(const std::string& opcodeStr) const; |
| bool isOpcode(const std::string& opcodeStr, |
| const std::string& extStr) const; |
| |
| const std::string &disassemble() const; |
| |
| InstSeqNum seqNum() const; |
| |
| Addr pc(); |
| void pc(Addr _pc); |
| |
| enums::StorageClassType executedAs(); |
| |
| // virtual address for scalar memory operations |
| Addr scalarAddr; |
| // virtual addressies for vector memory operations |
| std::vector<Addr> addr; |
| Addr pAddr; |
| |
| // vector data to get written |
| uint8_t *d_data; |
| // scalar data to be transferred |
| uint8_t *scalar_data; |
| // Additional data (for atomics) |
| uint8_t *a_data; |
| // Additional data (for atomics) |
| uint8_t *x_data; |
| // The execution mask |
| VectorMask exec_mask; |
| |
| // SIMD where the WF of the memory instruction has been mapped to |
| int simdId; |
| // unique id of the WF where the memory instruction belongs to |
| int wfDynId; |
| // The kernel id of the requesting wf |
| int kern_id; |
| // The CU id of the requesting wf |
| int cu_id; |
| // The workgroup id of the requesting wf |
| int wg_id; |
| // HW slot id where the WF is mapped to inside a SIMD unit |
| int wfSlotId; |
| // execution pipeline id where the memory instruction has been scheduled |
| int execUnitId; |
| // The execution time of this operation |
| Tick time; |
| // The latency of this operation |
| WaitClass latency; |
| |
| // Initiate the specified memory operation, by creating a |
| // memory request and sending it off to the memory system. |
| void initiateAcc(GPUDynInstPtr gpuDynInst); |
| // Complete the specified memory operation, by writing |
| // value back to the RF in the case of a load or atomic |
| // return or, in the case of a store, we do nothing |
| void completeAcc(GPUDynInstPtr gpuDynInst); |
| |
| void updateStats(); |
| |
| GPUStaticInst* staticInstruction() { return _staticInst; } |
| |
| TheGpuISA::ScalarRegU32 srcLiteral() const; |
| |
| bool isALU() const; |
| bool isBranch() const; |
| bool isCondBranch() const; |
| bool isNop() const; |
| bool isReturn() const; |
| bool isEndOfKernel() const; |
| bool isKernelLaunch() const; |
| bool isSDWAInst() const; |
| bool isDPPInst() const; |
| bool isUnconditionalJump() const; |
| bool isSpecialOp() const; |
| bool isWaitcnt() const; |
| bool isSleep() const; |
| |
| bool isBarrier() const; |
| bool isMemSync() const; |
| bool isMemRef() const; |
| bool isFlat() const; |
| bool isFlatGlobal() const; |
| bool isLoad() const; |
| bool isStore() const; |
| |
| bool isAtomic() const; |
| bool isAtomicNoRet() const; |
| bool isAtomicRet() const; |
| |
| bool isScalar() const; |
| bool isVector() const; |
| bool readsSCC() const; |
| bool writesSCC() const; |
| bool readsVCC() const; |
| bool writesVCC() const; |
| bool readsExec() const; |
| bool writesExec() const; |
| bool readsMode() const; |
| bool writesMode() const; |
| bool ignoreExec() const; |
| bool readsFlatScratch() const; |
| bool writesFlatScratch() const; |
| bool readsExecMask() const; |
| bool writesExecMask() const; |
| |
| bool isAtomicAnd() const; |
| bool isAtomicOr() const; |
| bool isAtomicXor() const; |
| bool isAtomicCAS() const; |
| bool isAtomicExch() const; |
| bool isAtomicAdd() const; |
| bool isAtomicSub() const; |
| bool isAtomicInc() const; |
| bool isAtomicDec() const; |
| bool isAtomicMax() const; |
| bool isAtomicMin() const; |
| |
| bool isArgLoad() const; |
| bool isGlobalMem() const; |
| bool isLocalMem() const; |
| |
| bool isArgSeg() const; |
| bool isGlobalSeg() const; |
| bool isGroupSeg() const; |
| bool isKernArgSeg() const; |
| bool isPrivateSeg() const; |
| bool isReadOnlySeg() const; |
| bool isSpillSeg() const; |
| |
| bool isGloballyCoherent() const; |
| bool isSystemCoherent() const; |
| |
| bool isF16() const; |
| bool isF32() const; |
| bool isF64() const; |
| |
| bool isFMA() const; |
| bool isMAC() const; |
| bool isMAD() const; |
| |
| // for FLAT memory ops. check the segment address |
| // against the APE registers to see if it falls |
| // within one of the APE ranges for LDS/SCRATCH/GPUVM. |
| // if it does not fall into one of the three APEs, it |
| // will be a regular global access. |
| void doApertureCheck(const VectorMask &mask); |
| // Function to resolve a flat accesses during execution stage. |
| void resolveFlatSegment(const VectorMask &mask); |
| |
| template<typename c0> AtomicOpFunctorPtr |
| makeAtomicOpFunctor(c0 *reg0, c0 *reg1) |
| { |
| if (isAtomicAnd()) { |
| return std::make_unique<AtomicOpAnd<c0>>(*reg0); |
| } else if (isAtomicOr()) { |
| return std::make_unique<AtomicOpOr<c0>>(*reg0); |
| } else if (isAtomicXor()) { |
| return std::make_unique<AtomicOpXor<c0>>(*reg0); |
| } else if (isAtomicCAS()) { |
| return std::make_unique<AtomicOpCAS<c0>>(*reg0, *reg1, cu); |
| } else if (isAtomicExch()) { |
| return std::make_unique<AtomicOpExch<c0>>(*reg0); |
| } else if (isAtomicAdd()) { |
| return std::make_unique<AtomicOpAdd<c0>>(*reg0); |
| } else if (isAtomicSub()) { |
| return std::make_unique<AtomicOpSub<c0>>(*reg0); |
| } else if (isAtomicInc()) { |
| return std::make_unique<AtomicOpInc<c0>>(); |
| } else if (isAtomicDec()) { |
| return std::make_unique<AtomicOpDec<c0>>(); |
| } else if (isAtomicMax()) { |
| return std::make_unique<AtomicOpMax<c0>>(*reg0); |
| } else if (isAtomicMin()) { |
| return std::make_unique<AtomicOpMin<c0>>(*reg0); |
| } else { |
| fatal("Unrecognized atomic operation"); |
| } |
| } |
| |
| void |
| setRequestFlags(RequestPtr req) const |
| { |
| if (isGloballyCoherent()) { |
| req->setCacheCoherenceFlags(Request::GLC_BIT); |
| } |
| |
| if (isSystemCoherent()) { |
| req->setCacheCoherenceFlags(Request::SLC_BIT); |
| } |
| |
| if (isAtomicRet()) { |
| req->setFlags(Request::ATOMIC_RETURN_OP); |
| } else if (isAtomicNoRet()) { |
| req->setFlags(Request::ATOMIC_NO_RETURN_OP); |
| } |
| |
| if (isMemSync()) { |
| // the path for kernel launch and kernel end is different |
| // from non-kernel mem sync. |
| assert(!isKernelLaunch()); |
| assert(!isEndOfKernel()); |
| |
| // must be wbinv inst if not kernel launch/end |
| req->setCacheCoherenceFlags(Request::INV_L1); |
| } |
| } |
| |
| // reset the number of pending memory requests for all lanes |
| void |
| resetEntireStatusVector() |
| { |
| assert(statusVector.size() == TheGpuISA::NumVecElemPerVecReg); |
| for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) { |
| resetStatusVector(lane); |
| } |
| } |
| |
| // reset the number of pending memory requests for the inputted lane |
| void |
| resetStatusVector(int lane) |
| { |
| setStatusVector(lane, 0); |
| } |
| |
| // set the number of pending memory requests for the inputted lane |
| void |
| setStatusVector(int lane, int newVal) |
| { |
| // currently we can have up to 2 memory requests per lane (if the |
| // lane's request goes across multiple cache lines) |
| assert((newVal >= 0) && (newVal <= 2)); |
| statusVector[lane] = newVal; |
| } |
| |
| // subtracts the number of pending memory requests for the inputted lane |
| // by 1 |
| void |
| decrementStatusVector(int lane) |
| { |
| // this lane may have multiple requests, so only subtract one for |
| // this request |
| assert(statusVector[lane] >= 1); |
| statusVector[lane]--; |
| } |
| |
| // return the current number of pending memory requests for the inputted |
| // lane |
| int |
| getLaneStatus(int lane) const |
| { |
| return statusVector[lane]; |
| } |
| |
| // returns true if all memory requests from all lanes have been received, |
| // else returns false |
| bool |
| allLanesZero() const |
| { |
| // local variables |
| bool allZero = true; |
| |
| // iterate over all lanes, checking the number of pending memory |
| // requests they have |
| for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) { |
| // if any lane still has pending requests, return false |
| if (statusVector[lane] > 0) { |
| DPRINTF(GPUMem, "CU%d: WF[%d][%d]: lane: %d has %d pending " |
| "request(s) for %#x\n", cu_id, simdId, wfSlotId, lane, |
| statusVector[lane], addr[lane]); |
| allZero = false; |
| } |
| } |
| |
| if (allZero) { |
| DPRINTF(GPUMem, "CU%d: WF[%d][%d]: all lanes have no pending" |
| " requests for %#x\n", cu_id, simdId, wfSlotId, addr[0]); |
| } |
| return allZero; |
| } |
| |
| // returns a string representing the current state of the statusVector |
| std::string |
| printStatusVector() const |
| { |
| std::string statusVec_str = "["; |
| |
| // iterate over all lanes, adding the current number of pending |
| // requests for this lane to the string |
| for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) { |
| statusVec_str += std::to_string(statusVector[lane]); |
| } |
| statusVec_str += "]"; |
| |
| return statusVec_str; |
| } |
| |
| // Map returned packets and the addresses they satisfy with which lane they |
| // were requested from |
| typedef std::unordered_map<Addr, std::vector<int>> StatusVector; |
| StatusVector memStatusVector; |
| |
| // Track the status of memory requests per lane, an int per lane to allow |
| // unaligned accesses |
| std::vector<int> statusVector; |
| // for ld_v# or st_v# |
| std::vector<int> tlbHitLevel; |
| |
| // for misaligned scalar ops we track the number |
| // of outstanding reqs here |
| int numScalarReqs; |
| |
| Tick getAccessTime() const { return accessTime; } |
| |
| void setAccessTime(Tick currentTime) { accessTime = currentTime; } |
| |
| void profileRoundTripTime(Tick currentTime, int hopId); |
| std::vector<Tick> getRoundTripTime() const { return roundTripTime; } |
| |
| void profileLineAddressTime(Addr addr, Tick currentTime, int hopId); |
| const std::map<Addr, std::vector<Tick>>& getLineAddressTime() const |
| { return lineAddressTime; } |
| |
| // inst used to save/restore a wavefront context |
| bool isSaveRestore; |
| private: |
| GPUStaticInst *_staticInst; |
| const InstSeqNum _seqNum; |
| int maxSrcVecRegOpSize; |
| int maxSrcScalarRegOpSize; |
| |
| // the time the request was started |
| Tick accessTime = -1; |
| |
| // hold the tick when the instruction arrives at certain hop points |
| // on it's way to main memory |
| std::vector<Tick> roundTripTime; |
| |
| // hold each cache block address for the instruction and a vector |
| // to hold the tick when the block arrives at certain hop points |
| std::map<Addr, std::vector<Tick>> lineAddressTime; |
| }; |
| |
| } // namespace gem5 |
| |
| #endif // __GPU_DYN_INST_HH__ |