| /* |
| * Copyright (c) 2015-2017 Advanced Micro Devices, Inc. |
| * All rights reserved. |
| * |
| * For use for simulation and test purposes only |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright notice, |
| * this list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * |
| * 3. Neither the name of the copyright holder nor the names of its |
| * contributors may be used to endorse or promote products derived from this |
| * software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| * POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "gpu-compute/gpu_dyn_inst.hh" |
| |
| #include "debug/GPUInst.hh" |
| #include "debug/GPUMem.hh" |
| #include "gpu-compute/gpu_static_inst.hh" |
| #include "gpu-compute/scalar_register_file.hh" |
| #include "gpu-compute/shader.hh" |
| #include "gpu-compute/wavefront.hh" |
| |
| GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, |
| GPUStaticInst *static_inst, InstSeqNum instSeqNum) |
| : GPUExecContext(_cu, _wf), scalarAddr(0), addr(computeUnit()->wfSize(), |
| (Addr)0), numScalarReqs(0), isSaveRestore(false), |
| _staticInst(static_inst), _seqNum(instSeqNum), |
| maxSrcVecRegOpSize(-1), maxSrcScalarRegOpSize(-1) |
| { |
| _staticInst->initOperandInfo(); |
| statusVector.assign(TheGpuISA::NumVecElemPerVecReg, 0); |
| tlbHitLevel.assign(computeUnit()->wfSize(), -1); |
| // vector instructions can have up to 4 source/destination operands |
| d_data = new uint8_t[computeUnit()->wfSize() * 4 * sizeof(double)]; |
| a_data = new uint8_t[computeUnit()->wfSize() * 8]; |
| x_data = new uint8_t[computeUnit()->wfSize() * 8]; |
| // scalar loads can read up to 16 Dwords of data (see publicly |
| // available GCN3 ISA manual) |
| scalar_data = new uint8_t[16 * sizeof(uint32_t)]; |
| for (int i = 0; i < (16 * sizeof(uint32_t)); ++i) { |
| scalar_data[i] = 0; |
| } |
| for (int i = 0; i < (computeUnit()->wfSize() * 8); ++i) { |
| a_data[i] = 0; |
| x_data[i] = 0; |
| } |
| for (int i = 0; i < (computeUnit()->wfSize() * 4 * sizeof(double)); ++i) { |
| d_data[i] = 0; |
| } |
| time = 0; |
| |
| cu_id = _cu->cu_id; |
| if (_wf) { |
| simdId = _wf->simdId; |
| wfDynId = _wf->wfDynId; |
| kern_id = _wf->kernId; |
| wg_id = _wf->wgId; |
| wfSlotId = _wf->wfSlotId; |
| } else { |
| simdId = -1; |
| wfDynId = -1; |
| kern_id = -1; |
| wg_id = -1; |
| wfSlotId = -1; |
| } |
| |
| |
| DPRINTF(GPUInst, "%s: generating operand info for %d operands\n", |
| disassemble(), getNumOperands()); |
| |
| _staticInst->initDynOperandInfo(wavefront(), computeUnit()); |
| |
| } |
| |
| GPUDynInst::~GPUDynInst() |
| { |
| delete[] d_data; |
| delete[] a_data; |
| delete[] x_data; |
| delete[] scalar_data; |
| delete _staticInst; |
| } |
| |
| void |
| GPUDynInst::execute(GPUDynInstPtr gpuDynInst) |
| { |
| _staticInst->execute(gpuDynInst); |
| } |
| |
| const std::vector<OperandInfo>& |
| GPUDynInst::srcVecRegOperands() const |
| { |
| return _staticInst->srcVecRegOperands(); |
| } |
| |
| const std::vector<OperandInfo>& |
| GPUDynInst::dstVecRegOperands() const |
| { |
| return _staticInst->dstVecRegOperands(); |
| } |
| |
| const std::vector<OperandInfo>& |
| GPUDynInst::srcScalarRegOperands() const |
| { |
| return _staticInst->srcScalarRegOperands(); |
| } |
| |
| const std::vector<OperandInfo>& |
| GPUDynInst::dstScalarRegOperands() const |
| { |
| return _staticInst->dstScalarRegOperands(); |
| } |
| |
| int |
| GPUDynInst::numSrcRegOperands() |
| { |
| return _staticInst->numSrcRegOperands(); |
| } |
| |
| int |
| GPUDynInst::numDstRegOperands() |
| { |
| return _staticInst->numDstRegOperands(); |
| } |
| |
| int |
| GPUDynInst::numSrcVecRegOperands() const |
| { |
| return _staticInst->numSrcVecOperands(); |
| } |
| |
| int |
| GPUDynInst::numDstVecRegOperands() const |
| { |
| return _staticInst->numDstVecOperands(); |
| } |
| |
| int |
| GPUDynInst::maxSrcVecRegOperandSize() |
| { |
| if (maxSrcVecRegOpSize != -1) |
| return maxSrcVecRegOpSize; |
| |
| maxSrcVecRegOpSize = 0; |
| for (const auto& srcVecOp : srcVecRegOperands()) |
| if (srcVecOp.sizeInDWords() > maxSrcVecRegOpSize) |
| maxSrcVecRegOpSize = srcVecOp.sizeInDWords(); |
| |
| return maxSrcVecRegOpSize; |
| } |
| |
| int |
| GPUDynInst::numSrcVecDWords() |
| { |
| return _staticInst->numSrcVecDWords(); |
| } |
| |
| int |
| GPUDynInst::numDstVecDWords() |
| { |
| return _staticInst->numDstVecDWords(); |
| } |
| |
| int |
| GPUDynInst::numSrcScalarRegOperands() const |
| { |
| return _staticInst->numSrcScalarOperands(); |
| } |
| |
| int |
| GPUDynInst::numDstScalarRegOperands() const |
| { |
| return _staticInst->numDstScalarOperands(); |
| } |
| |
| int |
| GPUDynInst::maxSrcScalarRegOperandSize() |
| { |
| if (maxSrcScalarRegOpSize != -1) |
| return maxSrcScalarRegOpSize; |
| |
| maxSrcScalarRegOpSize = 0; |
| for (const auto& srcScOp : srcScalarRegOperands()) |
| if (srcScOp.sizeInDWords() > maxSrcScalarRegOpSize) |
| maxSrcScalarRegOpSize = srcScOp.sizeInDWords(); |
| |
| return maxSrcScalarRegOpSize; |
| } |
| |
| int |
| GPUDynInst::numSrcScalarDWords() |
| { |
| return _staticInst->numSrcScalarDWords(); |
| } |
| |
| int |
| GPUDynInst::numDstScalarDWords() |
| { |
| return _staticInst->numDstScalarDWords(); |
| } |
| |
| int |
| GPUDynInst::maxOperandSize() |
| { |
| return _staticInst->maxOperandSize(); |
| } |
| |
| int |
| GPUDynInst::getNumOperands() const |
| { |
| return _staticInst->getNumOperands(); |
| } |
| |
| bool |
| GPUDynInst::hasSourceVgpr() const |
| { |
| return !srcVecRegOperands().empty(); |
| } |
| |
| bool |
| GPUDynInst::hasDestinationVgpr() const |
| { |
| return !dstVecRegOperands().empty(); |
| } |
| |
| bool |
| GPUDynInst::hasSourceSgpr() const |
| { |
| return !srcScalarRegOperands().empty(); |
| } |
| |
| bool |
| GPUDynInst::hasDestinationSgpr() const |
| { |
| return !dstScalarRegOperands().empty(); |
| } |
| |
| bool |
| GPUDynInst::isOpcode(const std::string& opcodeStr, |
| const std::string& extStr) const |
| { |
| return _staticInst->opcode().find(opcodeStr) != std::string::npos && |
| _staticInst->opcode().find(extStr) != std::string::npos; |
| } |
| |
| bool |
| GPUDynInst::isOpcode(const std::string& opcodeStr) const |
| { |
| return _staticInst->opcode().find(opcodeStr) != std::string::npos; |
| } |
| |
| const std::string& |
| GPUDynInst::disassemble() const |
| { |
| return _staticInst->disassemble(); |
| } |
| |
| InstSeqNum |
| GPUDynInst::seqNum() const |
| { |
| return _seqNum; |
| } |
| |
| enums::StorageClassType |
| GPUDynInst::executedAs() |
| { |
| return _staticInst->executed_as; |
| } |
| |
| // Process a memory instruction and (if necessary) submit timing request |
| void |
| GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector=%#x\n", |
| cu->cu_id, simdId, wfSlotId, exec_mask); |
| |
| _staticInst->initiateAcc(gpuDynInst); |
| } |
| |
| void |
| GPUDynInst::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector=" |
| "%#x\n complete", |
| cu->cu_id, simdId, wfSlotId, exec_mask); |
| |
| _staticInst->completeAcc(gpuDynInst); |
| } |
| |
| /** |
| * accessor methods for the attributes of |
| * the underlying GPU static instruction |
| */ |
| bool |
| GPUDynInst::isALU() const |
| { |
| return _staticInst->isALU(); |
| } |
| |
| bool |
| GPUDynInst::isBranch() const |
| { |
| return _staticInst->isBranch(); |
| } |
| |
| bool |
| GPUDynInst::isCondBranch() const |
| { |
| return _staticInst->isCondBranch(); |
| } |
| |
| bool |
| GPUDynInst::isNop() const |
| { |
| return _staticInst->isNop(); |
| } |
| |
| bool |
| GPUDynInst::isEndOfKernel() const |
| { |
| return _staticInst->isEndOfKernel(); |
| } |
| |
| bool |
| GPUDynInst::isKernelLaunch() const |
| { |
| return _staticInst->isKernelLaunch(); |
| } |
| |
| bool |
| GPUDynInst::isSDWAInst() const |
| { |
| return _staticInst->isSDWAInst(); |
| } |
| |
| bool |
| GPUDynInst::isDPPInst() const |
| { |
| return _staticInst->isDPPInst(); |
| } |
| |
| bool |
| GPUDynInst::isReturn() const |
| { |
| return _staticInst->isReturn(); |
| } |
| |
| bool |
| GPUDynInst::isUnconditionalJump() const |
| { |
| return _staticInst->isUnconditionalJump(); |
| } |
| |
| bool |
| GPUDynInst::isSpecialOp() const |
| { |
| return _staticInst->isSpecialOp(); |
| } |
| |
| bool |
| GPUDynInst::isWaitcnt() const |
| { |
| return _staticInst->isWaitcnt(); |
| } |
| |
| bool |
| GPUDynInst::isSleep() const |
| { |
| return _staticInst->isSleep(); |
| } |
| |
| bool |
| GPUDynInst::isBarrier() const |
| { |
| return _staticInst->isBarrier(); |
| } |
| |
| bool |
| GPUDynInst::isMemSync() const |
| { |
| return _staticInst->isMemSync(); |
| } |
| |
| bool |
| GPUDynInst::isMemRef() const |
| { |
| return _staticInst->isMemRef(); |
| } |
| |
| bool |
| GPUDynInst::isFlat() const |
| { |
| return _staticInst->isFlat(); |
| } |
| |
| bool |
| GPUDynInst::isLoad() const |
| { |
| return _staticInst->isLoad(); |
| } |
| |
| bool |
| GPUDynInst::isStore() const |
| { |
| return _staticInst->isStore(); |
| } |
| |
| bool |
| GPUDynInst::isAtomic() const |
| { |
| return _staticInst->isAtomic(); |
| } |
| |
| bool |
| GPUDynInst::isAtomicNoRet() const |
| { |
| return _staticInst->isAtomicNoRet(); |
| } |
| |
| bool |
| GPUDynInst::isAtomicRet() const |
| { |
| return _staticInst->isAtomicRet(); |
| } |
| |
| bool |
| GPUDynInst::isVector() const |
| { |
| return !_staticInst->isScalar(); |
| } |
| |
| bool |
| GPUDynInst::isScalar() const |
| { |
| return _staticInst->isScalar(); |
| } |
| |
| bool |
| GPUDynInst::readsSCC() const |
| { |
| return _staticInst->readsSCC(); |
| } |
| |
| bool |
| GPUDynInst::writesSCC() const |
| { |
| return _staticInst->writesSCC(); |
| } |
| |
| bool |
| GPUDynInst::readsVCC() const |
| { |
| for (const auto& srcOp : _staticInst->srcOperands()) |
| if (srcOp.isVcc()) |
| return true; |
| |
| return _staticInst->readsVCC(); |
| } |
| |
| bool |
| GPUDynInst::writesVCC() const |
| { |
| for (const auto& dstOp : _staticInst->dstOperands()) |
| if (dstOp.isVcc()) |
| return true; |
| |
| return _staticInst->writesVCC(); |
| } |
| |
| bool |
| GPUDynInst::readsMode() const |
| { |
| return _staticInst->readsMode(); |
| } |
| |
| bool |
| GPUDynInst::writesMode() const |
| { |
| return _staticInst->writesMode(); |
| } |
| |
| bool |
| GPUDynInst::readsExec() const |
| { |
| return _staticInst->readsEXEC(); |
| } |
| |
| bool |
| GPUDynInst::writesExec() const |
| { |
| return _staticInst->writesEXEC(); |
| } |
| |
| bool |
| GPUDynInst::ignoreExec() const |
| { |
| return _staticInst->ignoreExec(); |
| } |
| |
| bool |
| GPUDynInst::writesExecMask() const |
| { |
| for (const auto& dstOp : _staticInst->dstOperands()) |
| if (dstOp.isExec()) |
| return true; |
| |
| return _staticInst->writesEXEC(); |
| } |
| |
| bool |
| GPUDynInst::readsExecMask() const |
| { |
| for (const auto& srcOp : _staticInst->srcOperands()) |
| if (srcOp.isExec()) |
| return true; |
| |
| return _staticInst->readsEXEC(); |
| } |
| |
| bool |
| GPUDynInst::writesFlatScratch() const |
| { |
| for (const auto& dstScalarOp : dstScalarRegOperands()) |
| if (dstScalarOp.isFlatScratch()) |
| return true; |
| |
| return false; |
| } |
| |
| bool |
| GPUDynInst::readsFlatScratch() const |
| { |
| for (const auto& srcScalarOp : srcScalarRegOperands()) |
| if (srcScalarOp.isFlatScratch()) |
| return true; |
| |
| return false; |
| } |
| |
| bool |
| GPUDynInst::isAtomicAnd() const |
| { |
| return _staticInst->isAtomicAnd(); |
| } |
| |
| bool |
| GPUDynInst::isAtomicOr() const |
| { |
| return _staticInst->isAtomicOr(); |
| } |
| |
| bool |
| GPUDynInst::isAtomicXor() const |
| { |
| return _staticInst->isAtomicXor(); |
| } |
| |
| bool |
| GPUDynInst::isAtomicCAS() const |
| { |
| return _staticInst->isAtomicCAS(); |
| } |
| |
| bool GPUDynInst::isAtomicExch() const |
| { |
| return _staticInst->isAtomicExch(); |
| } |
| |
| bool |
| GPUDynInst::isAtomicAdd() const |
| { |
| return _staticInst->isAtomicAdd(); |
| } |
| |
| bool |
| GPUDynInst::isAtomicSub() const |
| { |
| return _staticInst->isAtomicSub(); |
| } |
| |
| bool |
| GPUDynInst::isAtomicInc() const |
| { |
| return _staticInst->isAtomicInc(); |
| } |
| |
| bool |
| GPUDynInst::isAtomicDec() const |
| { |
| return _staticInst->isAtomicDec(); |
| } |
| |
| bool |
| GPUDynInst::isAtomicMax() const |
| { |
| return _staticInst->isAtomicMax(); |
| } |
| |
| bool |
| GPUDynInst::isAtomicMin() const |
| { |
| return _staticInst->isAtomicMin(); |
| } |
| |
| bool |
| GPUDynInst::isArgLoad() const |
| { |
| return _staticInst->isArgLoad(); |
| } |
| |
| bool |
| GPUDynInst::isGlobalMem() const |
| { |
| return _staticInst->isGlobalMem(); |
| } |
| |
| bool |
| GPUDynInst::isLocalMem() const |
| { |
| return _staticInst->isLocalMem(); |
| } |
| |
| bool |
| GPUDynInst::isArgSeg() const |
| { |
| return _staticInst->isArgSeg(); |
| } |
| |
| bool |
| GPUDynInst::isGlobalSeg() const |
| { |
| return _staticInst->isGlobalSeg(); |
| } |
| |
| bool |
| GPUDynInst::isGroupSeg() const |
| { |
| return _staticInst->isGroupSeg(); |
| } |
| |
| bool |
| GPUDynInst::isKernArgSeg() const |
| { |
| return _staticInst->isKernArgSeg(); |
| } |
| |
| bool |
| GPUDynInst::isPrivateSeg() const |
| { |
| return _staticInst->isPrivateSeg(); |
| } |
| |
| bool |
| GPUDynInst::isReadOnlySeg() const |
| { |
| return _staticInst->isReadOnlySeg(); |
| } |
| |
| bool |
| GPUDynInst::isSpillSeg() const |
| { |
| return _staticInst->isSpillSeg(); |
| } |
| |
| bool |
| GPUDynInst::isGloballyCoherent() const |
| { |
| return _staticInst->isGloballyCoherent(); |
| } |
| |
| bool |
| GPUDynInst::isSystemCoherent() const |
| { |
| return _staticInst->isSystemCoherent(); |
| } |
| |
| bool |
| GPUDynInst::isF16() const |
| { |
| return _staticInst->isF16(); |
| } |
| |
| bool |
| GPUDynInst::isF32() const |
| { |
| return _staticInst->isF32(); |
| } |
| |
| bool |
| GPUDynInst::isF64() const |
| { |
| return _staticInst->isF64(); |
| } |
| |
| bool |
| GPUDynInst::isFMA() const |
| { |
| return _staticInst->isFMA(); |
| } |
| |
| bool |
| GPUDynInst::isMAC() const |
| { |
| return _staticInst->isMAC(); |
| } |
| |
| bool |
| GPUDynInst::isMAD() const |
| { |
| return _staticInst->isMAD(); |
| } |
| |
| void |
| GPUDynInst::doApertureCheck(const VectorMask &mask) |
| { |
| assert(mask.any()); |
| // find the segment of the first active address, after |
| // that we check that all other active addresses also |
| // fall within the same APE |
| for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) { |
| if (mask[lane]) { |
| if (computeUnit()->shader->isLdsApe(addr[lane])) { |
| // group segment |
| staticInstruction()->executed_as = enums::SC_GROUP; |
| break; |
| } else if (computeUnit()->shader->isScratchApe(addr[lane])) { |
| // private segment |
| staticInstruction()->executed_as = enums::SC_PRIVATE; |
| break; |
| } else if (computeUnit()->shader->isGpuVmApe(addr[lane])) { |
| // we won't support GPUVM |
| fatal("flat access is in GPUVM APE\n"); |
| } else if (bits(addr[lane], 63, 47) != 0x1FFFF && |
| bits(addr[lane], 63, 47)) { |
| // we are in the "hole", this is a memory violation |
| fatal("flat access at addr %#x has a memory violation\n", |
| addr[lane]); |
| } else { |
| // global memory segment |
| staticInstruction()->executed_as = enums::SC_GLOBAL; |
| break; |
| } |
| } |
| } |
| |
| // we should have found the segment |
| assert(executedAs() != enums::SC_NONE); |
| |
| // flat accesses should not straddle multiple APEs so we |
| // must check that all addresses fall within the same APE |
| if (executedAs() == enums::SC_GROUP) { |
| for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) { |
| if (mask[lane]) { |
| // if the first valid addr we found above was LDS, |
| // all the rest should be |
| assert(computeUnit()->shader->isLdsApe(addr[lane])); |
| } |
| } |
| } else if (executedAs() == enums::SC_PRIVATE) { |
| for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) { |
| if (mask[lane]) { |
| // if the first valid addr we found above was private, |
| // all the rest should be |
| assert(computeUnit()->shader->isScratchApe(addr[lane])); |
| } |
| } |
| } else { |
| for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) { |
| if (mask[lane]) { |
| // if the first valid addr we found above was global, |
| // all the rest should be. because we don't have an |
| // explicit range of the global segment, we just make |
| // sure that the address fall in no other APE and that |
| // it is not a memory violation |
| assert(!computeUnit()->shader->isLdsApe(addr[lane])); |
| assert(!computeUnit()->shader->isScratchApe(addr[lane])); |
| assert(!computeUnit()->shader->isGpuVmApe(addr[lane])); |
| assert(!(bits(addr[lane], 63, 47) != 0x1FFFF |
| && bits(addr[lane], 63, 47))); |
| } |
| } |
| } |
| } |
| |
| void |
| GPUDynInst::resolveFlatSegment(const VectorMask &mask) |
| { |
| doApertureCheck(mask); |
| |
| |
| // Now that we know the aperature, do the following: |
| // 1. Transform the flat address to its segmented equivalent. |
| // 2. Set the execUnitId based an the aperture check. |
| // 3. Decrement any extra resources that were reserved. Other |
| // resources are released as normal, below. |
| if (executedAs() == enums::SC_GLOBAL) { |
| // no transormation for global segment |
| wavefront()->execUnitId = wavefront()->flatGmUnitId; |
| if (isLoad()) { |
| wavefront()->rdLmReqsInPipe--; |
| } else if (isStore()) { |
| wavefront()->wrLmReqsInPipe--; |
| } else if (isAtomic() || isMemSync()) { |
| wavefront()->wrLmReqsInPipe--; |
| wavefront()->rdLmReqsInPipe--; |
| } else { |
| panic("Invalid memory operation!\n"); |
| } |
| } else if (executedAs() == enums::SC_GROUP) { |
| for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) { |
| if (mask[lane]) { |
| // flat address calculation goes here. |
| // addr[lane] = segmented address |
| panic("Flat group memory operation is unimplemented!\n"); |
| } |
| } |
| wavefront()->execUnitId = wavefront()->flatLmUnitId; |
| wavefront()->decVMemInstsIssued(); |
| if (isLoad()) { |
| wavefront()->rdGmReqsInPipe--; |
| } else if (isStore()) { |
| wavefront()->wrGmReqsInPipe--; |
| } else if (isAtomic() || isMemSync()) { |
| wavefront()->rdGmReqsInPipe--; |
| wavefront()->wrGmReqsInPipe--; |
| } else { |
| panic("Invalid memory operation!\n"); |
| } |
| } else if (executedAs() == enums::SC_PRIVATE) { |
| /** |
| * Flat instructions may resolve to the private segment (scratch), |
| * which is backed by main memory and provides per-lane scratch |
| * memory. Flat addressing uses apertures - registers that specify |
| * the address range in the VA space where LDS/private memory is |
| * mapped. The value of which is set by the kernel mode driver. |
| * These apertures use addresses that are not used by x86 CPUs. |
| * When the address of a Flat operation falls into one of the |
| * apertures, the Flat operation is redirected to either LDS or |
| * to the private memory segment. |
| * |
| * For private memory the SW runtime will allocate some space in |
| * the VA space for each AQL queue. The base address of which is |
| * stored in scalar registers per the AMD GPU ABI. The amd_queue_t |
| * scratch_backing_memory_location provides the base address in |
| * memory for the queue's private segment. Various other fields |
| * loaded into register state during kernel launch specify per-WF |
| * and per-work-item offsets so that individual lanes may access |
| * their private segment allocation. |
| * |
| * For more details about flat addressing see: |
| * http://rocm-documentation.readthedocs.io/en/latest/ |
| * ROCm_Compiler_SDK/ROCm-Native-ISA.html#flat-scratch |
| * |
| * https://github.com/ROCm-Developer-Tools/ |
| * ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md |
| * #flat-addressing |
| */ |
| |
| uint32_t numSgprs = wavefront()->maxSgprs; |
| uint32_t physSgprIdx = |
| wavefront()->computeUnit->registerManager->mapSgpr(wavefront(), |
| numSgprs - 3); |
| uint32_t offset = |
| wavefront()->computeUnit->srf[simdId]->read(physSgprIdx); |
| physSgprIdx = |
| wavefront()->computeUnit->registerManager->mapSgpr(wavefront(), |
| numSgprs - 4); |
| uint32_t size = |
| wavefront()->computeUnit->srf[simdId]->read(physSgprIdx); |
| for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) { |
| if (mask[lane]) { |
| addr[lane] = addr[lane] + lane * size + offset + |
| wavefront()->computeUnit->shader->getHiddenPrivateBase() - |
| wavefront()->computeUnit->shader->getScratchBase(); |
| } |
| } |
| wavefront()->execUnitId = wavefront()->flatLmUnitId; |
| wavefront()->decLGKMInstsIssued(); |
| if (isLoad()) { |
| wavefront()->rdGmReqsInPipe--; |
| } else if (isStore()) { |
| wavefront()->wrGmReqsInPipe--; |
| } else if (isAtomic() || isMemSync()) { |
| wavefront()->rdGmReqsInPipe--; |
| wavefront()->wrGmReqsInPipe--; |
| } else { |
| panic("Invalid memory operation!\n"); |
| } |
| } else { |
| for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) { |
| if (mask[lane]) { |
| panic("flat addr %#llx maps to bad segment %d\n", |
| addr[lane], executedAs()); |
| } |
| } |
| } |
| } |
| |
| TheGpuISA::ScalarRegU32 |
| GPUDynInst::srcLiteral() const |
| { |
| return _staticInst->srcLiteral(); |
| } |
| |
| void |
| GPUDynInst::updateStats() |
| { |
| if (_staticInst->isLocalMem()) { |
| // access to LDS (shared) memory |
| cu->stats.dynamicLMemInstrCnt++; |
| } else if (_staticInst->isFlat()) { |
| cu->stats.dynamicFlatMemInstrCnt++; |
| } else { |
| // access to global memory |
| |
| // update PageDivergence histogram |
| int number_pages_touched = cu->pagesTouched.size(); |
| assert(number_pages_touched); |
| cu->stats.pageDivergenceDist.sample(number_pages_touched); |
| |
| std::pair<ComputeUnit::pageDataStruct::iterator, bool> ret; |
| |
| for (auto it : cu->pagesTouched) { |
| // see if this page has been touched before. if not, this also |
| // inserts the page into the table. |
| ret = cu->pageAccesses |
| .insert(ComputeUnit::pageDataStruct::value_type(it.first, |
| std::make_pair(1, it.second))); |
| |
| // if yes, then update the stats |
| if (!ret.second) { |
| ret.first->second.first++; |
| ret.first->second.second += it.second; |
| } |
| } |
| |
| cu->pagesTouched.clear(); |
| |
| // total number of memory instructions (dynamic) |
| // Atomics are counted as a single memory instruction. |
| // this is # memory instructions per wavefronts, not per workitem |
| cu->stats.dynamicGMemInstrCnt++; |
| } |
| } |
| |
| void |
| GPUDynInst::profileRoundTripTime(Tick currentTime, int hopId) |
| { |
| // Only take the first measurement in the case of coalescing |
| if (roundTripTime.size() > hopId) |
| return; |
| |
| roundTripTime.push_back(currentTime); |
| } |
| |
| void |
| GPUDynInst::profileLineAddressTime(Addr addr, Tick currentTime, int hopId) |
| { |
| if (lineAddressTime.count(addr)) { |
| if (lineAddressTime[addr].size() > hopId) { |
| return; |
| } |
| |
| lineAddressTime[addr].push_back(currentTime); |
| } else if (hopId == 0) { |
| auto addressTimeVec = std::vector<Tick> { currentTime }; |
| lineAddressTime.insert(std::make_pair(addr, addressTimeVec)); |
| } |
| } |