| /* |
| * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. |
| * All rights reserved. |
| * |
| * For use for simulation and test purposes only |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright notice, |
| * this list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * |
| * 3. Neither the name of the copyright holder nor the names of its contributors |
| * may be used to endorse or promote products derived from this software |
| * without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| * POSSIBILITY OF SUCH DAMAGE. |
| * |
| * Author: Steve Reinhardt |
| */ |
| |
| #ifndef __ARCH_HSAIL_INSTS_MEM_HH__ |
| #define __ARCH_HSAIL_INSTS_MEM_HH__ |
| |
| #include <type_traits> |
| |
| #include "arch/hsail/insts/decl.hh" |
| #include "arch/hsail/insts/gpu_static_inst.hh" |
| #include "arch/hsail/operand.hh" |
| #include "gpu-compute/compute_unit.hh" |
| |
| namespace HsailISA |
| { |
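| // MemInst is a mixin for memory instructions: it records the access |
| // size (in bytes) implied by the BRIG memory type and keeps a pointer |
| // to the instruction's address operand. |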
| class MemInst |
| { |
| public: |
| MemInst() : size(0), addr_operand(nullptr) { } |
| |
| MemInst(Enums::MemType m_type) |
| { |
| if (m_type == Enums::M_U64 || |
| m_type == Enums::M_S64 || |
| m_type == Enums::M_F64) { |
| size = 8; |
| } else if (m_type == Enums::M_U32 || |
| m_type == Enums::M_S32 || |
| m_type == Enums::M_F32) { |
| size = 4; |
| } else if (m_type == Enums::M_U16 || |
| m_type == Enums::M_S16 || |
| m_type == Enums::M_F16) { |
| size = 2; |
| } else { |
| size = 1; |
| } |
| |
| addr_operand = nullptr; |
| } |
| |
| void |
| init_addr(AddrOperandBase *_addr_operand) |
| { |
| addr_operand = _addr_operand; |
| } |
| |
| private: |
| int size; |
| AddrOperandBase *addr_operand; |
| |
| public: |
| int getMemOperandSize() { return size; } |
| AddrOperandBase *getAddressOperand() { return addr_operand; } |
| }; |
| |
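| // lda (load address): decodes a destination register and an address |
| // operand. The instruction produces the address of its operand rather |
| // than accessing memory, so it is flagged as an ALU op. |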
| template<typename DestOperandType, typename AddrOperandType> |
| class LdaInstBase : public HsailGPUStaticInst |
| { |
| public: |
| typename DestOperandType::DestOperand dest; |
| AddrOperandType addr; |
| |
| LdaInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, |
| const char *_opcode) |
| : HsailGPUStaticInst(obj, _opcode) |
| { |
| using namespace Brig; |
| |
| setFlag(ALU); |
| |
| unsigned op_offs = obj->getOperandPtr(ib->operands, 0); |
| dest.init(op_offs, obj); |
| op_offs = obj->getOperandPtr(ib->operands, 1); |
| addr.init(op_offs, obj); |
| } |
| |
| int numSrcRegOperands() override |
| { return(this->addr.isVectorRegister()); } |
| int numDstRegOperands() override |
| { return dest.isVectorRegister(); } |
| bool isVectorRegister(int operandIndex) override |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| return((operandIndex == 0) ? dest.isVectorRegister() : |
| this->addr.isVectorRegister()); |
| } |
| bool isCondRegister(int operandIndex) override |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| return((operandIndex == 0) ? dest.isCondRegister() : |
| this->addr.isCondRegister()); |
| } |
| bool isScalarRegister(int operandIndex) override |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| return((operandIndex == 0) ? dest.isScalarRegister() : |
| this->addr.isScalarRegister()); |
| } |
| bool isSrcOperand(int operandIndex) override |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| if (operandIndex > 0) |
| return(this->addr.isVectorRegister()); |
| return false; |
| } |
| bool isDstOperand(int operandIndex) override |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| return(operandIndex == 0); |
| } |
| int getOperandSize(int operandIndex) override |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| return((operandIndex == 0) ? dest.opSize() : |
| this->addr.opSize()); |
| } |
| int |
| getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| return((operandIndex == 0) ? dest.regIndex() : |
| this->addr.regIndex()); |
| } |
| int getNumOperands() override |
| { |
| if (this->addr.isVectorRegister()) |
| return 2; |
| return 1; |
| } |
| }; |
| |
| template<typename DestDataType, typename AddrOperandType> |
| class LdaInst : |
| public LdaInstBase<typename DestDataType::OperandType, AddrOperandType>, |
| public MemInst |
| { |
| public: |
| void generateDisassembly(); |
| |
| LdaInst(const Brig::BrigInstBase *ib, const BrigObject *obj, |
| const char *_opcode) |
| : LdaInstBase<typename DestDataType::OperandType, |
| AddrOperandType>(ib, obj, _opcode) |
| { |
| init_addr(&this->addr); |
| } |
| |
| void execute(GPUDynInstPtr gpuDynInst); |
| }; |
| |
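| // Decode an lda instruction: inspect the address operand's kind and |
| // register kind to choose the AddrOperandType specialization. |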
| template<typename DataType> |
| GPUStaticInst* |
| decodeLda(const Brig::BrigInstBase *ib, const BrigObject *obj) |
| { |
| unsigned op_offs = obj->getOperandPtr(ib->operands, 1); |
| BrigRegOperandInfo regDataType = findRegDataType(op_offs, obj); |
| |
| if (regDataType.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) { |
| return new LdaInst<DataType, NoRegAddrOperand>(ib, obj, "ldas"); |
| } else if (regDataType.kind == Brig::BRIG_KIND_OPERAND_REGISTER) { |
| // V2/V4 not allowed |
| switch (regDataType.regKind) { |
| case Brig::BRIG_REGISTER_KIND_SINGLE: |
| return new LdaInst<DataType, SRegAddrOperand>(ib, obj, "ldas"); |
| case Brig::BRIG_REGISTER_KIND_DOUBLE: |
| return new LdaInst<DataType, DRegAddrOperand>(ib, obj, "ldas"); |
| default: |
| fatal("Bad ldas register operand type %d\n", regDataType.type); |
| } |
| } else { |
| fatal("Bad ldas register operand kind %d\n", regDataType.kind); |
| } |
| } |
| |
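| // Base class for HSAIL loads. The constructor handles both ld and |
| // atomic-load encodings, extracting the segment, memory order/scope, |
| // equivalence class, and width, and setting the corresponding |
| // instruction flags. |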
| template<typename MemOperandType, typename DestOperandType, |
| typename AddrOperandType> |
| class LdInstBase : public HsailGPUStaticInst |
| { |
| public: |
| Brig::BrigWidth8_t width; |
| typename DestOperandType::DestOperand dest; |
| AddrOperandType addr; |
| |
| Brig::BrigSegment segment; |
| Brig::BrigMemoryOrder memoryOrder; |
| Brig::BrigMemoryScope memoryScope; |
| unsigned int equivClass; |
| |
| LdInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, |
| const char *_opcode) |
| : HsailGPUStaticInst(obj, _opcode) |
| { |
| using namespace Brig; |
| |
| setFlag(MemoryRef); |
| setFlag(Load); |
| |
| if (ib->opcode == BRIG_OPCODE_LD) { |
| const BrigInstMem *ldst = (const BrigInstMem*)ib; |
| |
| segment = (BrigSegment)ldst->segment; |
| memoryOrder = BRIG_MEMORY_ORDER_NONE; |
| memoryScope = BRIG_MEMORY_SCOPE_NONE; |
| equivClass = ldst->equivClass; |
| |
| width = ldst->width; |
| unsigned op_offs = obj->getOperandPtr(ib->operands, 0); |
| const Brig::BrigOperand *brigOp = obj->getOperand(op_offs); |
| if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER) |
| dest.init(op_offs, obj); |
| |
| op_offs = obj->getOperandPtr(ib->operands, 1); |
| addr.init(op_offs, obj); |
| } else { |
| const BrigInstAtomic *at = (const BrigInstAtomic*)ib; |
| |
| segment = (BrigSegment)at->segment; |
| memoryOrder = (BrigMemoryOrder)at->memoryOrder; |
| memoryScope = (BrigMemoryScope)at->memoryScope; |
| equivClass = 0; |
| |
| width = BRIG_WIDTH_1; |
| unsigned op_offs = obj->getOperandPtr(ib->operands, 0); |
| const Brig::BrigOperand *brigOp = obj->getOperand(op_offs); |
| |
| if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER) |
| dest.init(op_offs, obj); |
| |
| op_offs = obj->getOperandPtr(ib->operands,1); |
| addr.init(op_offs, obj); |
| } |
| |
| switch (memoryOrder) { |
| case BRIG_MEMORY_ORDER_NONE: |
| setFlag(NoOrder); |
| break; |
| case BRIG_MEMORY_ORDER_RELAXED: |
| setFlag(RelaxedOrder); |
| break; |
| case BRIG_MEMORY_ORDER_SC_ACQUIRE: |
| setFlag(Acquire); |
| break; |
| case BRIG_MEMORY_ORDER_SC_RELEASE: |
| setFlag(Release); |
| break; |
| case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE: |
| setFlag(AcquireRelease); |
| break; |
| default: |
| fatal("LdInst has bad memory order type\n"); |
| } |
| |
| switch (memoryScope) { |
| case BRIG_MEMORY_SCOPE_NONE: |
| setFlag(NoScope); |
| break; |
| case BRIG_MEMORY_SCOPE_WORKITEM: |
| setFlag(WorkitemScope); |
| break; |
| case BRIG_MEMORY_SCOPE_WORKGROUP: |
| setFlag(WorkgroupScope); |
| break; |
| case BRIG_MEMORY_SCOPE_AGENT: |
| setFlag(DeviceScope); |
| break; |
| case BRIG_MEMORY_SCOPE_SYSTEM: |
| setFlag(SystemScope); |
| break; |
| default: |
| fatal("LdInst has bad memory scope type\n"); |
| } |
| |
| switch (segment) { |
| case BRIG_SEGMENT_GLOBAL: |
| setFlag(GlobalSegment); |
| break; |
| case BRIG_SEGMENT_GROUP: |
| setFlag(GroupSegment); |
| break; |
| case BRIG_SEGMENT_PRIVATE: |
| setFlag(PrivateSegment); |
| break; |
| case BRIG_SEGMENT_READONLY: |
| setFlag(ReadOnlySegment); |
| break; |
| case BRIG_SEGMENT_SPILL: |
| setFlag(SpillSegment); |
| break; |
| case BRIG_SEGMENT_FLAT: |
| setFlag(Flat); |
| break; |
| case BRIG_SEGMENT_KERNARG: |
| setFlag(KernArgSegment); |
| break; |
| case BRIG_SEGMENT_ARG: |
| setFlag(ArgSegment); |
| break; |
| default: |
| panic("Ld: segment %d not supported\n", segment); |
| } |
| } |
| |
| int numSrcRegOperands() override |
| { return(this->addr.isVectorRegister()); } |
| int numDstRegOperands() override { return dest.isVectorRegister(); } |
| int getNumOperands() override |
| { |
| if (this->addr.isVectorRegister()) |
| return 2; |
| else |
| return 1; |
| } |
| bool isVectorRegister(int operandIndex) override |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| return((operandIndex == 0) ? dest.isVectorRegister() : |
| this->addr.isVectorRegister()); |
| } |
| bool isCondRegister(int operandIndex) override |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| return((operandIndex == 0) ? dest.isCondRegister() : |
| this->addr.isCondRegister()); |
| } |
| bool isScalarRegister(int operandIndex) override |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| return((operandIndex == 0) ? dest.isScalarRegister() : |
| this->addr.isScalarRegister()); |
| } |
| bool isSrcOperand(int operandIndex) override |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| if (operandIndex > 0) |
| return(this->addr.isVectorRegister()); |
| return false; |
| } |
| bool isDstOperand(int operandIndex) override |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| return(operandIndex == 0); |
| } |
| int getOperandSize(int operandIndex) override |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| return((operandIndex == 0) ? dest.opSize() : |
| this->addr.opSize()); |
| } |
| int |
| getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| return((operandIndex == 0) ? dest.regIndex() : |
| this->addr.regIndex()); |
| } |
| }; |
| |
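| // Concrete load instruction. When the destination is a BRIG operand |
| // list (vector load), up to four destination register operands are |
| // decoded into dest_vect. |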
| template<typename MemDataType, typename DestDataType, |
| typename AddrOperandType> |
| class LdInst : |
| public LdInstBase<typename MemDataType::CType, |
| typename DestDataType::OperandType, AddrOperandType>, |
| public MemInst |
| { |
| typename DestDataType::OperandType::DestOperand dest_vect[4]; |
| uint16_t num_dest_operands; |
| void generateDisassembly() override; |
| |
| public: |
| LdInst(const Brig::BrigInstBase *ib, const BrigObject *obj, |
| const char *_opcode) |
| : LdInstBase<typename MemDataType::CType, |
| typename DestDataType::OperandType, |
| AddrOperandType>(ib, obj, _opcode), |
| MemInst(MemDataType::memType) |
| { |
| init_addr(&this->addr); |
| |
| unsigned op_offs = obj->getOperandPtr(ib->operands,0); |
| const Brig::BrigOperand *brigOp = obj->getOperand(op_offs); |
| |
| if (brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) { |
| const Brig::BrigOperandOperandList *brigRegVecOp = |
| (const Brig::BrigOperandOperandList*)brigOp; |
| |
| num_dest_operands = |
| *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4; |
| |
| assert(num_dest_operands <= 4); |
| } else { |
| num_dest_operands = 1; |
| } |
| |
| if (num_dest_operands > 1) { |
| assert(brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST); |
| |
| for (int i = 0; i < num_dest_operands; ++i) { |
| dest_vect[i].init_from_vect(op_offs, obj, i); |
| } |
| } |
| } |
| |
| void |
| initiateAcc(GPUDynInstPtr gpuDynInst) override |
| { |
| typedef typename MemDataType::CType c0; |
| |
| gpuDynInst->statusBitVector = gpuDynInst->exec_mask; |
| |
| if (num_dest_operands > 1) { |
| for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) |
| if (gpuDynInst->exec_mask[i]) |
| gpuDynInst->statusVector.push_back(num_dest_operands); |
| else |
| gpuDynInst->statusVector.push_back(0); |
| } |
| |
| for (int k = 0; k < num_dest_operands; ++k) { |
| |
| c0 *d = &((c0*)gpuDynInst->d_data) |
| [k * gpuDynInst->computeUnit()->wfSize()]; |
| |
| for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) { |
| if (gpuDynInst->exec_mask[i]) { |
| Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0); |
| |
| if (this->isLocalMem()) { |
| // load from shared memory |
| *d = gpuDynInst->wavefront()->ldsChunk-> |
| read<c0>(vaddr); |
| } else { |
| RequestPtr req = std::make_shared<Request>(0, |
| vaddr, sizeof(c0), 0, |
| gpuDynInst->computeUnit()->masterId(), |
| 0, gpuDynInst->wfDynId); |
| |
| gpuDynInst->setRequestFlags(req); |
| PacketPtr pkt = new Packet(req, MemCmd::ReadReq); |
| pkt->dataStatic(d); |
| |
| if (gpuDynInst->computeUnit()->shader-> |
| separate_acquire_release && |
| gpuDynInst->isAcquire()) { |
| // if this load has acquire semantics, |
| // set the response continuation function |
| // to perform an Acquire request |
| gpuDynInst->execContinuation = |
| &GPUStaticInst::execLdAcq; |
| |
| gpuDynInst->useContinuation = true; |
| } else { |
| // the request will be finished when |
| // the load completes |
| gpuDynInst->useContinuation = false; |
| } |
| // translation is performed in sendRequest() |
| gpuDynInst->computeUnit()->sendRequest(gpuDynInst, |
| i, pkt); |
| } |
| } |
| ++d; |
| } |
| } |
| |
| gpuDynInst->updateStats(); |
| } |
| |
| void |
| completeAcc(GPUDynInstPtr gpuDynInst) override |
| { |
| typedef typename MemDataType::CType c1; |
| |
| constexpr bool is_vt_32 = DestDataType::vgprType == VT_32; |
| |
| /** |
| * This code essentially replaces the long if-else chain |
| * that used to be in GlobalMemPipeline::exec() to infer the |
| * size (single/double) and type (floating point/integer) of |
| * the destination register. This is needed for load |
| * instructions because the loaded value and the |
| * destination type can be of different sizes, and we also |
| * need to know if the value we're writing back is floating |
| * point and signed/unsigned, so we can properly cast the |
| * writeback value. |
| */ |
| typedef typename std::conditional<is_vt_32, |
| typename std::conditional<std::is_floating_point<c1>::value, |
| float, typename std::conditional<std::is_signed<c1>::value, |
| int32_t, uint32_t>::type>::type, |
| typename std::conditional<std::is_floating_point<c1>::value, |
| double, typename std::conditional<std::is_signed<c1>::value, |
| int64_t, uint64_t>::type>::type>::type c0; |
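| // For example, loading a u8 value into a 32-bit VGPR gives |
| // c1 = uint8_t and c0 = uint32_t, so the narrow loaded value is |
| // zero-extended when it is written back below. |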
| |
| |
| Wavefront *w = gpuDynInst->wavefront(); |
| |
| std::vector<uint32_t> regVec; |
| // iterate over number of destination register operands since |
| // this is a load |
| for (int k = 0; k < num_dest_operands; ++k) { |
| assert((sizeof(c1) * num_dest_operands) |
| <= MAX_WIDTH_FOR_MEM_INST); |
| |
| int dst = this->dest.regIndex() + k; |
| if (num_dest_operands > MAX_REGS_FOR_NON_VEC_MEM_INST) |
| dst = dest_vect[k].regIndex(); |
| // virtual->physical VGPR mapping |
| int physVgpr = w->remap(dst, sizeof(c0), 1); |
| // save the physical VGPR index |
| regVec.push_back(physVgpr); |
| |
| c1 *p1 = |
| &((c1*)gpuDynInst->d_data)[k * w->computeUnit->wfSize()]; |
| |
| for (int i = 0; i < w->computeUnit->wfSize(); ++i) { |
| if (gpuDynInst->exec_mask[i]) { |
| DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: " |
| "$%s%d <- %d global ld done (src = wavefront " |
| "ld inst)\n", w->computeUnit->cu_id, w->simdId, |
| w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d", |
| dst, *p1); |
| // write the value into the physical VGPR. This is a |
| // purely functional operation. No timing is modeled. |
| w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr, |
| *p1, i); |
| } |
| ++p1; |
| } |
| } |
| |
| // Schedule the write operation of the load data on the VRF. |
| // This simply models the timing aspect of the VRF write operation. |
| // It does not modify the physical VGPR. |
| int loadVrfBankConflictCycles = gpuDynInst->computeUnit()-> |
| vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec, |
| sizeof(c0), gpuDynInst->time); |
| |
| if (this->isGlobalMem()) { |
| gpuDynInst->computeUnit()->globalMemoryPipe |
| .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles); |
| } else { |
| assert(this->isLocalMem()); |
| gpuDynInst->computeUnit()->localMemoryPipe |
| .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles); |
| } |
| } |
| |
| private: |
| void |
| execLdAcq(GPUDynInstPtr gpuDynInst) override |
| { |
| // after the load has completed, and if the load has acquire |
| // semantics, issue an acquire request. |
| if (!this->isLocalMem()) { |
| if (gpuDynInst->computeUnit()->shader->separate_acquire_release |
| && gpuDynInst->isAcquire()) { |
| gpuDynInst->statusBitVector = VectorMask(1); |
| gpuDynInst->useContinuation = false; |
| // create request |
| RequestPtr req = std::make_shared<Request>(0, 0, 0, 0, |
| gpuDynInst->computeUnit()->masterId(), |
| 0, gpuDynInst->wfDynId); |
| req->setFlags(Request::ACQUIRE); |
| gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req); |
| } |
| } |
| } |
| |
| public: |
| bool isVectorRegister(int operandIndex) override |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| if ((num_dest_operands != getNumOperands()) && |
| (operandIndex == (getNumOperands()-1))) |
| return(this->addr.isVectorRegister()); |
| if (num_dest_operands > 1) { |
| return dest_vect[operandIndex].isVectorRegister(); |
| } |
| else if (num_dest_operands == 1) { |
| return LdInstBase<typename MemDataType::CType, |
| typename DestDataType::OperandType, |
| AddrOperandType>::dest.isVectorRegister(); |
| } |
| return false; |
| } |
| bool isCondRegister(int operandIndex) override |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| if ((num_dest_operands != getNumOperands()) && |
| (operandIndex == (getNumOperands()-1))) |
| return(this->addr.isCondRegister()); |
| if (num_dest_operands > 1) |
| return dest_vect[operandIndex].isCondRegister(); |
| else if (num_dest_operands == 1) |
| return LdInstBase<typename MemDataType::CType, |
| typename DestDataType::OperandType, |
| AddrOperandType>::dest.isCondRegister(); |
| return false; |
| } |
| bool isScalarRegister(int operandIndex) override |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| if ((num_dest_operands != getNumOperands()) && |
| (operandIndex == (getNumOperands()-1))) |
| return(this->addr.isScalarRegister()); |
| if (num_dest_operands > 1) |
| return dest_vect[operandIndex].isScalarRegister(); |
| else if (num_dest_operands == 1) |
| return LdInstBase<typename MemDataType::CType, |
| typename DestDataType::OperandType, |
| AddrOperandType>::dest.isScalarRegister(); |
| return false; |
| } |
| bool isSrcOperand(int operandIndex) override |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| if ((num_dest_operands != getNumOperands()) && |
| (operandIndex == (getNumOperands()-1))) |
| return(this->addr.isVectorRegister()); |
| return false; |
| } |
| bool isDstOperand(int operandIndex) override |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| if ((num_dest_operands != getNumOperands()) && |
| (operandIndex == (getNumOperands()-1))) |
| return false; |
| return true; |
| } |
| int getOperandSize(int operandIndex) override |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| if ((num_dest_operands != getNumOperands()) && |
| (operandIndex == (getNumOperands()-1))) |
| return(this->addr.opSize()); |
| if (num_dest_operands > 1) |
| return(dest_vect[operandIndex].opSize()); |
| else if (num_dest_operands == 1) |
| return(LdInstBase<typename MemDataType::CType, |
| typename DestDataType::OperandType, |
| AddrOperandType>::dest.opSize()); |
| return 0; |
| } |
| int |
| getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| if ((num_dest_operands != getNumOperands()) && |
| (operandIndex == (getNumOperands()-1))) |
| return(this->addr.regIndex()); |
| if (num_dest_operands > 1) |
| return(dest_vect[operandIndex].regIndex()); |
| else if (num_dest_operands == 1) |
| return(LdInstBase<typename MemDataType::CType, |
| typename DestDataType::OperandType, |
| AddrOperandType>::dest.regIndex()); |
| return -1; |
| } |
| int getNumOperands() override |
| { |
| if (this->addr.isVectorRegister() || this->addr.isScalarRegister()) |
| return(num_dest_operands+1); |
| else |
| return(num_dest_operands); |
| } |
| void execute(GPUDynInstPtr gpuDynInst) override; |
| }; |
| |
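| // Second stage of load decoding: with the destination data type |
| // known, select the address operand specialization (no register, |
| // single register, or double register). |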
| template<typename MemDT, typename DestDT> |
| GPUStaticInst* |
| decodeLd2(const Brig::BrigInstBase *ib, const BrigObject *obj) |
| { |
| unsigned op_offs = obj->getOperandPtr(ib->operands,1); |
| BrigRegOperandInfo tmp = findRegDataType(op_offs, obj); |
| |
| if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) { |
| return new LdInst<MemDT, DestDT, NoRegAddrOperand>(ib, obj, "ld"); |
| } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER || |
| tmp.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) { |
| switch (tmp.regKind) { |
| case Brig::BRIG_REGISTER_KIND_SINGLE: |
| return new LdInst<MemDT, DestDT, |
| SRegAddrOperand>(ib, obj, "ld"); |
| case Brig::BRIG_REGISTER_KIND_DOUBLE: |
| return new LdInst<MemDT, DestDT, |
| DRegAddrOperand>(ib, obj, "ld"); |
| default: |
| fatal("Bad ld register operand type %d\n", tmp.regKind); |
| } |
| } else { |
| fatal("Bad ld register operand kind %d\n", tmp.kind); |
| } |
| } |
| |
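| // First stage of load decoding: map the destination register kind and |
| // the BRIG element type to a destination data type (floating-point |
| // loads use unsigned containers), then defer to decodeLd2(). |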
| template<typename MemDT> |
| GPUStaticInst* |
| decodeLd(const Brig::BrigInstBase *ib, const BrigObject *obj) |
| { |
| unsigned op_offs = obj->getOperandPtr(ib->operands,0); |
| BrigRegOperandInfo dest = findRegDataType(op_offs, obj); |
| |
| assert(dest.kind == Brig::BRIG_KIND_OPERAND_REGISTER || |
| dest.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST); |
| switch(dest.regKind) { |
| case Brig::BRIG_REGISTER_KIND_SINGLE: |
| switch (ib->type) { |
| case Brig::BRIG_TYPE_B8: |
| case Brig::BRIG_TYPE_B16: |
| case Brig::BRIG_TYPE_B32: |
| return decodeLd2<MemDT, B32>(ib, obj); |
| case Brig::BRIG_TYPE_U8: |
| case Brig::BRIG_TYPE_U16: |
| case Brig::BRIG_TYPE_U32: |
| return decodeLd2<MemDT, U32>(ib, obj); |
| case Brig::BRIG_TYPE_S8: |
| case Brig::BRIG_TYPE_S16: |
| case Brig::BRIG_TYPE_S32: |
| return decodeLd2<MemDT, S32>(ib, obj); |
| case Brig::BRIG_TYPE_F16: |
| case Brig::BRIG_TYPE_F32: |
| return decodeLd2<MemDT, U32>(ib, obj); |
| default: |
| fatal("Bad ld register operand type %d, %d\n", |
| dest.regKind, ib->type); |
| }; |
| case Brig::BRIG_REGISTER_KIND_DOUBLE: |
| switch (ib->type) { |
| case Brig::BRIG_TYPE_B64: |
| return decodeLd2<MemDT, B64>(ib, obj); |
| case Brig::BRIG_TYPE_U64: |
| return decodeLd2<MemDT, U64>(ib, obj); |
| case Brig::BRIG_TYPE_S64: |
| return decodeLd2<MemDT, S64>(ib, obj); |
| case Brig::BRIG_TYPE_F64: |
| return decodeLd2<MemDT, U64>(ib, obj); |
| default: |
| fatal("Bad ld register operand type %d, %d\n", |
| dest.regKind, ib->type); |
| }; |
| default: |
| fatal("Bad ld register operand type %d, %d\n", dest.regKind, |
| ib->type); |
| } |
| } |
| |
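| // Base class for HSAIL stores. The constructor handles both st and |
| // atomic-store encodings, extracting the segment, memory order/scope, |
| // and equivalence class, and initializing the source and address |
| // operands. |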
| template<typename MemDataType, typename SrcOperandType, |
| typename AddrOperandType> |
| class StInstBase : public HsailGPUStaticInst |
| { |
| public: |
| typename SrcOperandType::SrcOperand src; |
| AddrOperandType addr; |
| |
| Brig::BrigSegment segment; |
| Brig::BrigMemoryScope memoryScope; |
| Brig::BrigMemoryOrder memoryOrder; |
| unsigned int equivClass; |
| |
| StInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, |
| const char *_opcode) |
| : HsailGPUStaticInst(obj, _opcode) |
| { |
| using namespace Brig; |
| |
| setFlag(MemoryRef); |
| setFlag(Store); |
| |
| if (ib->opcode == BRIG_OPCODE_ST) { |
| const BrigInstMem *ldst = (const BrigInstMem*)ib; |
| |
| segment = (BrigSegment)ldst->segment; |
| memoryOrder = BRIG_MEMORY_ORDER_NONE; |
| memoryScope = BRIG_MEMORY_SCOPE_NONE; |
| equivClass = ldst->equivClass; |
| |
| unsigned op_offs = obj->getOperandPtr(ib->operands, 0); |
| const BrigOperand *baseOp = obj->getOperand(op_offs); |
| |
| if ((baseOp->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) || |
| (baseOp->kind == BRIG_KIND_OPERAND_REGISTER)) { |
| src.init(op_offs, obj); |
| } |
| |
| op_offs = obj->getOperandPtr(ib->operands, 1); |
| addr.init(op_offs, obj); |
| } else { |
| const BrigInstAtomic *at = (const BrigInstAtomic*)ib; |
| |
| segment = (BrigSegment)at->segment; |
| memoryScope = (BrigMemoryScope)at->memoryScope; |
| memoryOrder = (BrigMemoryOrder)at->memoryOrder; |
| equivClass = 0; |
| |
| unsigned op_offs = obj->getOperandPtr(ib->operands, 0); |
| addr.init(op_offs, obj); |
| |
| op_offs = obj->getOperandPtr(ib->operands, 1); |
| src.init(op_offs, obj); |
| } |
| |
| switch (memoryOrder) { |
| case BRIG_MEMORY_ORDER_NONE: |
| setFlag(NoOrder); |
| break; |
| case BRIG_MEMORY_ORDER_RELAXED: |
| setFlag(RelaxedOrder); |
| break; |
| case BRIG_MEMORY_ORDER_SC_ACQUIRE: |
| setFlag(Acquire); |
| break; |
| case BRIG_MEMORY_ORDER_SC_RELEASE: |
| setFlag(Release); |
| break; |
| case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE: |
| setFlag(AcquireRelease); |
| break; |
| default: |
| fatal("StInst has bad memory order type\n"); |
| } |
| |
| switch (memoryScope) { |
| case BRIG_MEMORY_SCOPE_NONE: |
| setFlag(NoScope); |
| break; |
| case BRIG_MEMORY_SCOPE_WORKITEM: |
| setFlag(WorkitemScope); |
| break; |
| case BRIG_MEMORY_SCOPE_WORKGROUP: |
| setFlag(WorkgroupScope); |
| break; |
| case BRIG_MEMORY_SCOPE_AGENT: |
| setFlag(DeviceScope); |
| break; |
| case BRIG_MEMORY_SCOPE_SYSTEM: |
| setFlag(SystemScope); |
| break; |
| default: |
| fatal("StInst has bad memory scope type\n"); |
| } |
| |
| switch (segment) { |
| case BRIG_SEGMENT_GLOBAL: |
| setFlag(GlobalSegment); |
| break; |
| case BRIG_SEGMENT_GROUP: |
| setFlag(GroupSegment); |
| break; |
| case BRIG_SEGMENT_PRIVATE: |
| setFlag(PrivateSegment); |
| break; |
| case BRIG_SEGMENT_READONLY: |
| setFlag(ReadOnlySegment); |
| break; |
| case BRIG_SEGMENT_SPILL: |
| setFlag(SpillSegment); |
| break; |
| case BRIG_SEGMENT_FLAT: |
| setFlag(Flat); |
| break; |
| case BRIG_SEGMENT_ARG: |
| setFlag(ArgSegment); |
| break; |
| default: |
| panic("St: segment %d not supported\n", segment); |
| } |
| } |
| |
| int numDstRegOperands() override { return 0; } |
| int numSrcRegOperands() override |
| { |
| return src.isVectorRegister() + this->addr.isVectorRegister(); |
| } |
| int getNumOperands() override |
| { |
| if (this->addr.isVectorRegister() || this->addr.isScalarRegister()) |
| return 2; |
| else |
| return 1; |
| } |
| bool isVectorRegister(int operandIndex) override |
| { |
| assert(operandIndex >= 0 && operandIndex < getNumOperands()); |
| return !operandIndex ? src.isVectorRegister() : |
| this->addr.isVectorRegister(); |
| } |
| bool isCondRegister(int operandIndex) override |
| { |
| assert(operandIndex >= 0 && operandIndex < getNumOperands()); |
| return !operandIndex ? src.isCondRegister() : |
| this->addr.isCondRegister(); |
| } |
| bool isScalarRegister(int operandIndex) override |
| { |
| assert(operandIndex >= 0 && operandIndex < getNumOperands()); |
| return !operandIndex ? src.isScalarRegister() : |
| this->addr.isScalarRegister(); |
| } |
| bool isSrcOperand(int operandIndex) override |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| return true; |
| } |
| bool isDstOperand(int operandIndex) override { return false; } |
| int getOperandSize(int operandIndex) override |
| { |
| assert(operandIndex >= 0 && operandIndex < getNumOperands()); |
| return !operandIndex ? src.opSize() : this->addr.opSize(); |
| } |
| int |
| getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override |
| { |
| assert(operandIndex >= 0 && operandIndex < getNumOperands()); |
| return !operandIndex ? src.regIndex() : this->addr.regIndex(); |
| } |
| }; |
| |
| |
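| // Concrete store instruction. When the source is a BRIG operand list |
| // (vector store), up to four source register operands are decoded |
| // into src_vect; srcIdx selects which BRIG operand holds the store |
| // data, since it differs for atomic stores. |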
| template<typename MemDataType, typename SrcDataType, |
| typename AddrOperandType> |
| class StInst : |
| public StInstBase<MemDataType, typename SrcDataType::OperandType, |
| AddrOperandType>, |
| public MemInst |
| { |
| public: |
| typename SrcDataType::OperandType::SrcOperand src_vect[4]; |
| uint16_t num_src_operands; |
| void generateDisassembly() override; |
| |
| StInst(const Brig::BrigInstBase *ib, const BrigObject *obj, |
| const char *_opcode, int srcIdx) |
| : StInstBase<MemDataType, typename SrcDataType::OperandType, |
| AddrOperandType>(ib, obj, _opcode), |
| MemInst(SrcDataType::memType) |
| { |
| init_addr(&this->addr); |
| |
| BrigRegOperandInfo rinfo; |
| unsigned op_offs = obj->getOperandPtr(ib->operands,srcIdx); |
| const Brig::BrigOperand *baseOp = obj->getOperand(op_offs); |
| |
| if (baseOp->kind == Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) { |
| const Brig::BrigOperandConstantBytes *op = |
| (Brig::BrigOperandConstantBytes*)baseOp; |
| |
| rinfo = BrigRegOperandInfo((Brig::BrigKind16_t)op->base.kind, |
| Brig::BRIG_TYPE_NONE); |
| } else { |
| rinfo = findRegDataType(op_offs, obj); |
| } |
| |
| if (baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) { |
| const Brig::BrigOperandOperandList *brigRegVecOp = |
| (const Brig::BrigOperandOperandList*)baseOp; |
| |
| num_src_operands = |
| *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4; |
| |
| assert(num_src_operands <= 4); |
| } else { |
| num_src_operands = 1; |
| } |
| |
| if (num_src_operands > 1) { |
| assert(baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST); |
| |
| for (int i = 0; i < num_src_operands; ++i) { |
| src_vect[i].init_from_vect(op_offs, obj, i); |
| } |
| } |
| } |
| |
| void |
| initiateAcc(GPUDynInstPtr gpuDynInst) override |
| { |
| // before performing a store, check if this store has |
| // release semantics, and if so issue a release first |
| if (!this->isLocalMem()) { |
| if (gpuDynInst->computeUnit()->shader->separate_acquire_release |
| && gpuDynInst->isRelease()) { |
| |
| gpuDynInst->statusBitVector = VectorMask(1); |
| gpuDynInst->execContinuation = &GPUStaticInst::execSt; |
| gpuDynInst->useContinuation = true; |
| // create request |
| RequestPtr req = std::make_shared<Request>(0, 0, 0, 0, |
| gpuDynInst->computeUnit()->masterId(), |
| 0, gpuDynInst->wfDynId); |
| req->setFlags(Request::RELEASE); |
| gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req); |
| |
| return; |
| } |
| } |
| |
| // if there is no release semantic, perform stores immediately |
| execSt(gpuDynInst); |
| } |
| |
| // stores don't write anything back, so there is nothing |
| // to do here. we only override this method to avoid the |
| // fatal in the base class implementation |
| void completeAcc(GPUDynInstPtr gpuDynInst) override { } |
| |
| private: |
| // execSt may be called through a continuation |
| // if the store had release semantics. see comment for |
| // execSt in gpu_static_inst.hh |
| void |
| execSt(GPUDynInstPtr gpuDynInst) override |
| { |
| typedef typename MemDataType::CType c0; |
| |
| gpuDynInst->statusBitVector = gpuDynInst->exec_mask; |
| |
| if (num_src_operands > 1) { |
| for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) |
| if (gpuDynInst->exec_mask[i]) |
| gpuDynInst->statusVector.push_back(num_src_operands); |
| else |
| gpuDynInst->statusVector.push_back(0); |
| } |
| |
| for (int k = 0; k < num_src_operands; ++k) { |
| c0 *d = &((c0*)gpuDynInst->d_data) |
| [k * gpuDynInst->computeUnit()->wfSize()]; |
| |
| for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) { |
| if (gpuDynInst->exec_mask[i]) { |
| Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0); |
| |
| if (this->isLocalMem()) { |
| // store to shared memory |
| gpuDynInst->wavefront()->ldsChunk->write<c0>(vaddr, |
| *d); |
| } else { |
| RequestPtr req = std::make_shared<Request>( |
| 0, vaddr, sizeof(c0), 0, |
| gpuDynInst->computeUnit()->masterId(), |
| 0, gpuDynInst->wfDynId); |
| |
| gpuDynInst->setRequestFlags(req); |
| PacketPtr pkt = new Packet(req, MemCmd::WriteReq); |
| pkt->dataStatic<c0>(d); |
| |
| // translation is performed in sendRequest() |
| // the request will be finished when the store completes |
| gpuDynInst->useContinuation = false; |
| gpuDynInst->computeUnit()->sendRequest(gpuDynInst, |
| i, pkt); |
| |
| } |
| } |
| ++d; |
| } |
| } |
| |
| gpuDynInst->updateStats(); |
| } |
| |
| public: |
| bool isVectorRegister(int operandIndex) override |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| if (operandIndex == num_src_operands) |
| return this->addr.isVectorRegister(); |
| if (num_src_operands > 1) |
| return src_vect[operandIndex].isVectorRegister(); |
| else if (num_src_operands == 1) |
| return StInstBase<MemDataType, |
| typename SrcDataType::OperandType, |
| AddrOperandType>::src.isVectorRegister(); |
| return false; |
| } |
| bool isCondRegister(int operandIndex) override |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| if (operandIndex == num_src_operands) |
| return this->addr.isCondRegister(); |
| if (num_src_operands > 1) |
| return src_vect[operandIndex].isCondRegister(); |
| else if (num_src_operands == 1) |
| return StInstBase<MemDataType, |
| typename SrcDataType::OperandType, |
| AddrOperandType>::src.isCondRegister(); |
| return false; |
| } |
| bool isScalarRegister(int operandIndex) override |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| if (operandIndex == num_src_operands) |
| return this->addr.isScalarRegister(); |
| if (num_src_operands > 1) |
| return src_vect[operandIndex].isScalarRegister(); |
| else if (num_src_operands == 1) |
| return StInstBase<MemDataType, |
| typename SrcDataType::OperandType, |
| AddrOperandType>::src.isScalarRegister(); |
| return false; |
| } |
| bool isSrcOperand(int operandIndex) override |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| return true; |
| } |
| bool isDstOperand(int operandIndex) override { return false; } |
| int getOperandSize(int operandIndex) override |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| if (operandIndex == num_src_operands) |
| return this->addr.opSize(); |
| if (num_src_operands > 1) |
| return src_vect[operandIndex].opSize(); |
| else if (num_src_operands == 1) |
| return StInstBase<MemDataType, |
| typename SrcDataType::OperandType, |
| AddrOperandType>::src.opSize(); |
| return 0; |
| } |
| int |
| getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| if (operandIndex == num_src_operands) |
| return this->addr.regIndex(); |
| if (num_src_operands > 1) |
| return src_vect[operandIndex].regIndex(); |
| else if (num_src_operands == 1) |
| return StInstBase<MemDataType, |
| typename SrcDataType::OperandType, |
| AddrOperandType>::src.regIndex(); |
| return -1; |
| } |
| int getNumOperands() override |
| { |
| if (this->addr.isVectorRegister() || this->addr.isScalarRegister()) |
| return num_src_operands + 1; |
| else |
| return num_src_operands; |
| } |
| void execute(GPUDynInstPtr gpuDynInst) override; |
| }; |
| |
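| // Decode a store: for atomic opcodes the data operand follows the |
| // address operand, so the source/destination operand indices are |
| // swapped before selecting the address operand specialization. |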
| template<typename DataType, typename SrcDataType> |
| GPUStaticInst* |
| decodeSt(const Brig::BrigInstBase *ib, const BrigObject *obj) |
| { |
| int srcIdx = 0; |
| int destIdx = 1; |
| if (ib->opcode == Brig::BRIG_OPCODE_ATOMIC || |
| ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET) { |
| srcIdx = 1; |
| destIdx = 0; |
| } |
| unsigned op_offs = obj->getOperandPtr(ib->operands,destIdx); |
| |
| BrigRegOperandInfo tmp = findRegDataType(op_offs, obj); |
| |
| if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) { |
| return new StInst<DataType, SrcDataType, |
| NoRegAddrOperand>(ib, obj, "st", srcIdx); |
| } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) { |
| // V2/V4 not allowed |
| switch (tmp.regKind) { |
| case Brig::BRIG_REGISTER_KIND_SINGLE: |
| return new StInst<DataType, SrcDataType, |
| SRegAddrOperand>(ib, obj, "st", srcIdx); |
| case Brig::BRIG_REGISTER_KIND_DOUBLE: |
| return new StInst<DataType, SrcDataType, |
| DRegAddrOperand>(ib, obj, "st", srcIdx); |
| default: |
| fatal("Bad st register operand type %d\n", tmp.type); |
| } |
| } else { |
| fatal("Bad st register operand kind %d\n", tmp.kind); |
| } |
| } |
| |
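| // Base class for HSAIL atomics. Decodes the segment, memory |
| // order/scope, and atomic operation, sets the AtomicReturn or |
| // AtomicNoReturn flag, and initializes the destination (if any), |
| // address, and source operands. |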
| template<typename OperandType, typename AddrOperandType, int NumSrcOperands, |
| bool HasDst> |
| class AtomicInstBase : public HsailGPUStaticInst |
| { |
| public: |
| typename OperandType::DestOperand dest; |
| typename OperandType::SrcOperand src[NumSrcOperands]; |
| AddrOperandType addr; |
| |
| Brig::BrigSegment segment; |
| Brig::BrigMemoryOrder memoryOrder; |
| Brig::BrigAtomicOperation atomicOperation; |
| Brig::BrigMemoryScope memoryScope; |
| Brig::BrigOpcode opcode; |
| |
| AtomicInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, |
| const char *_opcode) |
| : HsailGPUStaticInst(obj, _opcode) |
| { |
| using namespace Brig; |
| |
| const BrigInstAtomic *at = (const BrigInstAtomic*)ib; |
| |
| segment = (BrigSegment)at->segment; |
| memoryScope = (BrigMemoryScope)at->memoryScope; |
| memoryOrder = (BrigMemoryOrder)at->memoryOrder; |
| atomicOperation = (BrigAtomicOperation)at->atomicOperation; |
| opcode = (BrigOpcode)ib->opcode; |
| |
| assert(opcode == Brig::BRIG_OPCODE_ATOMICNORET || |
| opcode == Brig::BRIG_OPCODE_ATOMIC); |
| |
| setFlag(MemoryRef); |
| |
| if (opcode == Brig::BRIG_OPCODE_ATOMIC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| |
| switch (memoryOrder) { |
| case BRIG_MEMORY_ORDER_NONE: |
| setFlag(NoOrder); |
| break; |
| case BRIG_MEMORY_ORDER_RELAXED: |
| setFlag(RelaxedOrder); |
| break; |
| case BRIG_MEMORY_ORDER_SC_ACQUIRE: |
| setFlag(Acquire); |
| break; |
| case BRIG_MEMORY_ORDER_SC_RELEASE: |
| setFlag(Release); |
| break; |
| case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE: |
| setFlag(AcquireRelease); |
| break; |
| default: |
| fatal("AtomicInst has bad memory order type\n"); |
| } |
| |
| switch (memoryScope) { |
| case BRIG_MEMORY_SCOPE_NONE: |
| setFlag(NoScope); |
| break; |
| case BRIG_MEMORY_SCOPE_WORKITEM: |
| setFlag(WorkitemScope); |
| break; |
| case BRIG_MEMORY_SCOPE_WORKGROUP: |
| setFlag(WorkgroupScope); |
| break; |
| case BRIG_MEMORY_SCOPE_AGENT: |
| setFlag(DeviceScope); |
| break; |
| case BRIG_MEMORY_SCOPE_SYSTEM: |
| setFlag(SystemScope); |
| break; |
| default: |
| fatal("AtomicInst has bad memory scope type\n"); |
| } |
| |
| switch (atomicOperation) { |
| case Brig::BRIG_ATOMIC_AND: |
| setFlag(AtomicAnd); |
| break; |
| case Brig::BRIG_ATOMIC_OR: |
| setFlag(AtomicOr); |
| break; |
| case Brig::BRIG_ATOMIC_XOR: |
| setFlag(AtomicXor); |
| break; |
| case Brig::BRIG_ATOMIC_CAS: |
| setFlag(AtomicCAS); |
| break; |
| case Brig::BRIG_ATOMIC_EXCH: |
| setFlag(AtomicExch); |
| break; |
| case Brig::BRIG_ATOMIC_ADD: |
| setFlag(AtomicAdd); |
| break; |
| case Brig::BRIG_ATOMIC_WRAPINC: |
| setFlag(AtomicInc); |
| break; |
| case Brig::BRIG_ATOMIC_WRAPDEC: |
| setFlag(AtomicDec); |
| break; |
| case Brig::BRIG_ATOMIC_MIN: |
| setFlag(AtomicMin); |
| break; |
| case Brig::BRIG_ATOMIC_MAX: |
| setFlag(AtomicMax); |
| break; |
| case Brig::BRIG_ATOMIC_SUB: |
| setFlag(AtomicSub); |
| break; |
| default: |
| fatal("Bad BrigAtomicOperation code %d\n", atomicOperation); |
| } |
| |
| switch (segment) { |
| case BRIG_SEGMENT_GLOBAL: |
| setFlag(GlobalSegment); |
| break; |
| case BRIG_SEGMENT_GROUP: |
| setFlag(GroupSegment); |
| break; |
| case BRIG_SEGMENT_FLAT: |
| setFlag(Flat); |
| break; |
| default: |
| panic("Atomic: segment %d not supported\n", segment); |
| } |
| |
| if (HasDst) { |
| unsigned op_offs = obj->getOperandPtr(ib->operands, 0); |
| dest.init(op_offs, obj); |
| |
| op_offs = obj->getOperandPtr(ib->operands, 1); |
| addr.init(op_offs, obj); |
| |
| for (int i = 0; i < NumSrcOperands; ++i) { |
| op_offs = obj->getOperandPtr(ib->operands, i + 2); |
| src[i].init(op_offs, obj); |
| } |
| } else { |
| |
| unsigned op_offs = obj->getOperandPtr(ib->operands, 0); |
| addr.init(op_offs, obj); |
| |
| for (int i = 0; i < NumSrcOperands; ++i) { |
| op_offs = obj->getOperandPtr(ib->operands, i + 1); |
| src[i].init(op_offs, obj); |
| } |
| } |
| } |
| |
| int numSrcRegOperands() |
| { |
| int operands = 0; |
| for (int i = 0; i < NumSrcOperands; i++) { |
| if (src[i].isVectorRegister()) { |
| operands++; |
| } |
| } |
| if (addr.isVectorRegister()) |
| operands++; |
| return operands; |
| } |
| int numDstRegOperands() { return dest.isVectorRegister(); } |
| int getNumOperands() |
| { |
| if (addr.isVectorRegister()) |
| return(NumSrcOperands + 2); |
| return(NumSrcOperands + 1); |
| } |
| bool isVectorRegister(int operandIndex) |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| if (operandIndex < NumSrcOperands) |
| return src[operandIndex].isVectorRegister(); |
| else if (operandIndex == NumSrcOperands) |
| return(addr.isVectorRegister()); |
| else |
| return dest.isVectorRegister(); |
| } |
| bool isCondRegister(int operandIndex) |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| if (operandIndex < NumSrcOperands) |
| return src[operandIndex].isCondRegister(); |
| else if (operandIndex == NumSrcOperands) |
| return(addr.isCondRegister()); |
| else |
| return dest.isCondRegister(); |
| } |
| bool isScalarRegister(int operandIndex) |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| if (operandIndex < NumSrcOperands) |
| return src[operandIndex].isScalarRegister(); |
| else if (operandIndex == NumSrcOperands) |
| return(addr.isScalarRegister()); |
| else |
| return dest.isScalarRegister(); |
| } |
| bool isSrcOperand(int operandIndex) |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| if (operandIndex < NumSrcOperands) |
| return true; |
| else if (operandIndex == NumSrcOperands) |
| return(addr.isVectorRegister()); |
| else |
| return false; |
| } |
| bool isDstOperand(int operandIndex) |
| { |
| if (operandIndex <= NumSrcOperands) |
| return false; |
| else |
| return true; |
| } |
| int getOperandSize(int operandIndex) |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| if (operandIndex < NumSrcOperands) |
| return(src[operandIndex].opSize()); |
| else if (operandIndex == NumSrcOperands) |
| return(addr.opSize()); |
| else |
| return(dest.opSize()); |
| } |
| int |
| getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) |
| { |
| assert((operandIndex >= 0) && (operandIndex < getNumOperands())); |
| if (operandIndex < NumSrcOperands) |
| return(src[operandIndex].regIndex()); |
| else if (operandIndex == NumSrcOperands) |
| return(addr.regIndex()); |
| else |
| return(dest.regIndex()); |
| return -1; |
| } |
| }; |
| |
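| // Concrete atomic (RMW) instruction. Local (LDS) atomics are |
| // performed functionally in execAtomic(); global atomics are issued |
| // to the memory system as SwapReq packets carrying an atomic-op |
| // functor. |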
| template<typename MemDataType, typename AddrOperandType, int NumSrcOperands, |
| bool HasDst> |
| class AtomicInst : |
| public AtomicInstBase<typename MemDataType::OperandType, |
| AddrOperandType, NumSrcOperands, HasDst>, |
| public MemInst |
| { |
| public: |
| void generateDisassembly() override; |
| |
| AtomicInst(const Brig::BrigInstBase *ib, const BrigObject *obj, |
| const char *_opcode) |
| : AtomicInstBase<typename MemDataType::OperandType, AddrOperandType, |
| NumSrcOperands, HasDst> |
| (ib, obj, _opcode), |
| MemInst(MemDataType::memType) |
| { |
| init_addr(&this->addr); |
| } |
| |
| void |
| initiateAcc(GPUDynInstPtr gpuDynInst) override |
| { |
| // before doing the RMW, check if this atomic has |
| // release semantics, and if so issue a release first |
| if (!this->isLocalMem()) { |
| if (gpuDynInst->computeUnit()->shader->separate_acquire_release |
| && (gpuDynInst->isRelease() |
| || gpuDynInst->isAcquireRelease())) { |
| |
| gpuDynInst->statusBitVector = VectorMask(1); |
| |
| gpuDynInst->execContinuation = &GPUStaticInst::execAtomic; |
| gpuDynInst->useContinuation = true; |
| |
| // create request |
| RequestPtr req = std::make_shared<Request>(0, 0, 0, 0, |
| gpuDynInst->computeUnit()->masterId(), |
| 0, gpuDynInst->wfDynId); |
| req->setFlags(Request::RELEASE); |
| gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req); |
| |
| return; |
| } |
| } |
| |
| // if there is no release semantic, execute the RMW immediately |
| execAtomic(gpuDynInst); |
| |
| } |
| |
| void |
| completeAcc(GPUDynInstPtr gpuDynInst) override |
| { |
| // if this is not an atomic return op, then we |
| // have nothing more to do. |
| if (this->isAtomicRet()) { |
| // the size of the src operands and the |
| // memory being operated on must match |
| // for HSAIL atomics - this assumption may |
| // not apply to all ISAs |
| typedef typename MemDataType::CType CType; |
| |
| Wavefront *w = gpuDynInst->wavefront(); |
| int dst = this->dest.regIndex(); |
| std::vector<uint32_t> regVec; |
| // virtual->physical VGPR mapping |
| int physVgpr = w->remap(dst, sizeof(CType), 1); |
| regVec.push_back(physVgpr); |
| CType *p1 = &((CType*)gpuDynInst->d_data)[0]; |
| |
| for (int i = 0; i < w->computeUnit->wfSize(); ++i) { |
| if (gpuDynInst->exec_mask[i]) { |
| DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: " |
| "$%s%d <- %d global ld done (src = wavefront " |
| "ld inst)\n", w->computeUnit->cu_id, w->simdId, |
| w->wfSlotId, i, sizeof(CType) == 4 ? "s" : "d", |
| dst, *p1); |
| // write the value into the physical VGPR. This is a |
| // purely functional operation. No timing is modeled. |
| w->computeUnit->vrf[w->simdId]->write<CType>(physVgpr, *p1, i); |
| } |
| ++p1; |
| } |
| |
| // Schedule the write operation of the load data on the VRF. |
| // This simply models the timing aspect of the VRF write operation. |
| // It does not modify the physical VGPR. |
| int loadVrfBankConflictCycles = gpuDynInst->computeUnit()-> |
| vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec, |
| sizeof(CType), gpuDynInst->time); |
| |
| if (this->isGlobalMem()) { |
| gpuDynInst->computeUnit()->globalMemoryPipe |
| .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles); |
| } else { |
| assert(this->isLocalMem()); |
| gpuDynInst->computeUnit()->localMemoryPipe |
| .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles); |
| } |
| } |
| } |
| |
| void execute(GPUDynInstPtr gpuDynInst) override; |
| |
| private: |
| // execAtomic may be called through a continuation |
| // if the RMW had release semantics. see comment for |
| // execContinuation in gpu_dyn_inst.hh |
| void |
| execAtomic(GPUDynInstPtr gpuDynInst) override |
| { |
| gpuDynInst->statusBitVector = gpuDynInst->exec_mask; |
| |
| typedef typename MemDataType::CType c0; |
| |
| c0 *d = &((c0*) gpuDynInst->d_data)[0]; |
| c0 *e = &((c0*) gpuDynInst->a_data)[0]; |
| c0 *f = &((c0*) gpuDynInst->x_data)[0]; |
| |
| for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) { |
| if (gpuDynInst->exec_mask[i]) { |
| Addr vaddr = gpuDynInst->addr[i]; |
| |
| if (this->isLocalMem()) { |
| Wavefront *wavefront = gpuDynInst->wavefront(); |
| *d = wavefront->ldsChunk->read<c0>(vaddr); |
| |
| if (this->isAtomicAdd()) { |
| wavefront->ldsChunk->write<c0>(vaddr, |
| wavefront->ldsChunk->read<c0>(vaddr) + (*e)); |
| } else if (this->isAtomicSub()) { |
| wavefront->ldsChunk->write<c0>(vaddr, |
| wavefront->ldsChunk->read<c0>(vaddr) - (*e)); |
| } else if (this->isAtomicMax()) { |
| wavefront->ldsChunk->write<c0>(vaddr, |
| std::max(wavefront->ldsChunk->read<c0>(vaddr), |
| (*e))); |
| } else if (this->isAtomicMin()) { |
| wavefront->ldsChunk->write<c0>(vaddr, |
| std::min(wavefront->ldsChunk->read<c0>(vaddr), |
| (*e))); |
| } else if (this->isAtomicAnd()) { |
| wavefront->ldsChunk->write<c0>(vaddr, |
| wavefront->ldsChunk->read<c0>(vaddr) & (*e)); |
| } else if (this->isAtomicOr()) { |
| wavefront->ldsChunk->write<c0>(vaddr, |
| wavefront->ldsChunk->read<c0>(vaddr) | (*e)); |
| } else if (this->isAtomicXor()) { |
| wavefront->ldsChunk->write<c0>(vaddr, |
| wavefront->ldsChunk->read<c0>(vaddr) ^ (*e)); |
| } else if (this->isAtomicInc()) { |
| wavefront->ldsChunk->write<c0>(vaddr, |
| wavefront->ldsChunk->read<c0>(vaddr) + 1); |
| } else if (this->isAtomicDec()) { |
| wavefront->ldsChunk->write<c0>(vaddr, |
| wavefront->ldsChunk->read<c0>(vaddr) - 1); |
| } else if (this->isAtomicExch()) { |
| wavefront->ldsChunk->write<c0>(vaddr, (*e)); |
| } else if (this->isAtomicCAS()) { |
| wavefront->ldsChunk->write<c0>(vaddr, |
| (wavefront->ldsChunk->read<c0>(vaddr) == (*e)) ? |
| (*f) : wavefront->ldsChunk->read<c0>(vaddr)); |
| } else { |
| fatal("Unrecognized or invalid HSAIL atomic op " |
| "type.\n"); |
| } |
| } else { |
| RequestPtr req = |
| std::make_shared<Request>(0, vaddr, sizeof(c0), 0, |
| gpuDynInst->computeUnit()->masterId(), |
| 0, gpuDynInst->wfDynId, |
| gpuDynInst->makeAtomicOpFunctor<c0>(e, |
| f)); |
| |
| gpuDynInst->setRequestFlags(req); |
| PacketPtr pkt = new Packet(req, MemCmd::SwapReq); |
| pkt->dataStatic(d); |
| |
| if (gpuDynInst->computeUnit()->shader-> |
| separate_acquire_release && |
| (gpuDynInst->isAcquire())) { |
| // if this atomic has acquire semantics, |
| // schedule the continuation to perform an |
| // acquire after the RMW completes |
| gpuDynInst->execContinuation = |
| &GPUStaticInst::execAtomicAcq; |
| |
| gpuDynInst->useContinuation = true; |
| } else { |
| // the request will be finished when the RMW completes |
| gpuDynInst->useContinuation = false; |
| } |
| // translation is performed in sendRequest() |
| gpuDynInst->computeUnit()->sendRequest(gpuDynInst, i, |
| pkt); |
| } |
| } |
| |
| ++d; |
| ++e; |
| ++f; |
| } |
| |
| gpuDynInst->updateStats(); |
| } |
| |
| // execAtomicAcq will always be called through a continuation. |
| // see comment for execContinuation in gpu_dyn_inst.hh |
| void |
| execAtomicAcq(GPUDynInstPtr gpuDynInst) override |
| { |
| // after performing the RMW, check to see if this instruction |
| // has acquire semantics, and if so, issue an acquire |
| if (!this->isLocalMem()) { |
| if (gpuDynInst->computeUnit()->shader->separate_acquire_release |
| && gpuDynInst->isAcquire()) { |
| gpuDynInst->statusBitVector = VectorMask(1); |
| |
| // the request will be finished when |
| // the acquire completes |
| gpuDynInst->useContinuation = false; |
| // create request |
| RequestPtr req = std::make_shared<Request>(0, 0, 0, 0, |
| gpuDynInst->computeUnit()->masterId(), |
| 0, gpuDynInst->wfDynId); |
| req->setFlags(Request::ACQUIRE); |
| gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req); |
| } |
| } |
| } |
| }; |
| |
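| // Build the final atomic instruction: atomic loads and stores are |
| // lowered to the ordinary ld/st decoders, everything else becomes an |
| // AtomicInst with or without a destination operand. |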
| template<typename DataType, typename AddrOperandType, int NumSrcOperands> |
| GPUStaticInst* |
| constructAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj) |
| { |
| const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib; |
| |
| if (at->atomicOperation == Brig::BRIG_ATOMIC_LD) { |
| return decodeLd<DataType>(ib, obj); |
| } else if (at->atomicOperation == Brig::BRIG_ATOMIC_ST) { |
| switch (ib->type) { |
| case Brig::BRIG_TYPE_B8: |
| return decodeSt<S8,S8>(ib, obj); |
| case Brig::BRIG_TYPE_B16: |
| return decodeSt<S16,S16>(ib, obj); |
| case Brig::BRIG_TYPE_B32: |
| return decodeSt<S32,S32>(ib, obj); |
| case Brig::BRIG_TYPE_B64: |
| return decodeSt<S64,S64>(ib, obj); |
| default: fatal("AtomicSt: Operand type mismatch %d\n", ib->type); |
| } |
| } else { |
| if ((Brig::BrigOpcode)ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET) |
| return new AtomicInst<DataType, AddrOperandType, |
| NumSrcOperands, false>(ib, obj, "atomicnoret"); |
| else |
| return new AtomicInst<DataType, AddrOperandType, |
| NumSrcOperands, true>(ib, obj, "atomic"); |
| } |
| } |
| |
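| // Locate the address operand (operand 0 for atomicnoret, operand 1 |
| // otherwise) and select the address operand specialization. |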
| template<typename DataType, int NumSrcOperands> |
| GPUStaticInst* |
| decodeAtomicHelper(const Brig::BrigInstBase *ib, const BrigObject *obj) |
| { |
| unsigned addrIndex = (Brig::BrigOpcode)ib->opcode == |
| Brig::BRIG_OPCODE_ATOMICNORET ? 0 : 1; |
| |
| unsigned op_offs = obj->getOperandPtr(ib->operands,addrIndex); |
| |
| BrigRegOperandInfo tmp = findRegDataType(op_offs, obj); |
| |
| if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) { |
| return constructAtomic<DataType, NoRegAddrOperand, |
| NumSrcOperands>(ib, obj); |
| } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) { |
| // V2/V4 not allowed |
| switch (tmp.regKind) { |
| case Brig::BRIG_REGISTER_KIND_SINGLE: |
| return constructAtomic<DataType, SRegAddrOperand, |
| NumSrcOperands>(ib, obj); |
| case Brig::BRIG_REGISTER_KIND_DOUBLE: |
| return constructAtomic<DataType, DRegAddrOperand, |
| NumSrcOperands>(ib, obj); |
| default: |
| fatal("Bad atomic register operand type %d\n", tmp.type); |
| } |
| } else { |
| fatal("Bad atomic register operand kind %d\n", tmp.kind); |
| } |
| } |
| |
| |
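| // Atomics with a return value: CAS takes two source operands, all |
| // other operations take one. |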
| template<typename DataType> |
| GPUStaticInst* |
| decodeAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj) |
| { |
| const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib; |
| |
| if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) { |
| return decodeAtomicHelper<DataType, 2>(ib, obj); |
| } else { |
| return decodeAtomicHelper<DataType, 1>(ib, obj); |
| } |
| } |
| |
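| // Atomics without a return value: same source-operand count rules as |
| // decodeAtomic(). |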
| template<typename DataType> |
| GPUStaticInst* |
| decodeAtomicNoRet(const Brig::BrigInstBase *ib, const BrigObject *obj) |
| { |
| const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib; |
| if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) { |
| return decodeAtomicHelper<DataType, 2>(ib, obj); |
| } else { |
| return decodeAtomicHelper<DataType, 1>(ib, obj); |
| } |
| } |
| } // namespace HsailISA |
| |
| #endif // __ARCH_HSAIL_INSTS_MEM_HH__ |