| /* |
| * Copyright (c) 2015 Advanced Micro Devices, Inc. |
| * All rights reserved. |
| * |
| * For use for simulation and test purposes only |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright notice, |
| * this list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * |
| * 3. Neither the name of the copyright holder nor the names of its contributors |
| * may be used to endorse or promote products derived from this software |
| * without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| * POSSIBILITY OF SUCH DAMAGE. |
| * |
| * Author: Anthony Gutierrez |
| */ |
| |
| #ifndef __GPU_DYN_INST_HH__ |
| #define __GPU_DYN_INST_HH__ |
| |
| #include <cstdint> |
| #include <string> |
| |
| #include "enums/GenericMemoryOrder.hh" |
| #include "enums/GenericMemoryScope.hh" |
| #include "enums/MemOpType.hh" |
| #include "enums/MemType.hh" |
| #include "enums/OpType.hh" |
| #include "enums/StorageClassType.hh" |
| #include "gpu-compute/compute_unit.hh" |
| #include "gpu-compute/gpu_exec_context.hh" |
| |
| class GPUStaticInst; |
| |
| template<typename T> |
| class AtomicOpAnd : public TypedAtomicOpFunctor<T> |
| { |
| public: |
| T a; |
| |
| AtomicOpAnd(T _a) : a(_a) { } |
| void execute(T *b) { *b &= a; } |
| }; |
| |
| template<typename T> |
| class AtomicOpOr : public TypedAtomicOpFunctor<T> |
| { |
| public: |
| T a; |
| AtomicOpOr(T _a) : a(_a) { } |
| void execute(T *b) { *b |= a; } |
| }; |
| |
| template<typename T> |
| class AtomicOpXor : public TypedAtomicOpFunctor<T> |
| { |
| public: |
| T a; |
| AtomicOpXor(T _a) : a(_a) {} |
| void execute(T *b) { *b ^= a; } |
| }; |
| |
| template<typename T> |
| class AtomicOpCAS : public TypedAtomicOpFunctor<T> |
| { |
| public: |
| T c; |
| T s; |
| |
| ComputeUnit *computeUnit; |
| |
| AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit) |
| : c(_c), s(_s), computeUnit(compute_unit) { } |
| |
| void |
| execute(T *b) |
| { |
| computeUnit->numCASOps++; |
| |
| if (*b == c) { |
| *b = s; |
| } else { |
| computeUnit->numFailedCASOps++; |
| } |
| |
| if (computeUnit->xact_cas_mode) { |
| computeUnit->xactCasLoadMap.clear(); |
| } |
| } |
| }; |
| |
| template<typename T> |
| class AtomicOpExch : public TypedAtomicOpFunctor<T> |
| { |
| public: |
| T a; |
| AtomicOpExch(T _a) : a(_a) { } |
| void execute(T *b) { *b = a; } |
| }; |
| |
| template<typename T> |
| class AtomicOpAdd : public TypedAtomicOpFunctor<T> |
| { |
| public: |
| T a; |
| AtomicOpAdd(T _a) : a(_a) { } |
| void execute(T *b) { *b += a; } |
| }; |
| |
| template<typename T> |
| class AtomicOpSub : public TypedAtomicOpFunctor<T> |
| { |
| public: |
| T a; |
| AtomicOpSub(T _a) : a(_a) { } |
| void execute(T *b) { *b -= a; } |
| }; |
| |
| template<typename T> |
| class AtomicOpInc : public TypedAtomicOpFunctor<T> |
| { |
| public: |
| AtomicOpInc() { } |
| void execute(T *b) { *b += 1; } |
| }; |
| |
| template<typename T> |
| class AtomicOpDec : public TypedAtomicOpFunctor<T> |
| { |
| public: |
| AtomicOpDec() {} |
| void execute(T *b) { *b -= 1; } |
| }; |
| |
| template<typename T> |
| class AtomicOpMax : public TypedAtomicOpFunctor<T> |
| { |
| public: |
| T a; |
| AtomicOpMax(T _a) : a(_a) { } |
| |
| void |
| execute(T *b) |
| { |
| if (a > *b) |
| *b = a; |
| } |
| }; |
| |
| template<typename T> |
| class AtomicOpMin : public TypedAtomicOpFunctor<T> |
| { |
| public: |
| T a; |
| AtomicOpMin(T _a) : a(_a) {} |
| |
| void |
| execute(T *b) |
| { |
| if (a < *b) |
| *b = a; |
| } |
| }; |
| |
// Range predicates over Enums::MemOpType. Each one is true when its
// argument lies between the first (..AND) and last (..MIN) enumerator of
// the corresponding MO_A*, MO_ANR* or MO_H* group, bounds included.
// NOTE: the argument is evaluated twice, so avoid side effects in it.
#define MO_A(a) \
    (Enums::MO_AAND <= (a) && (a) <= Enums::MO_AMIN)
#define MO_ANR(a) \
    (Enums::MO_ANRAND <= (a) && (a) <= Enums::MO_ANRMIN)
#define MO_H(a) \
    (Enums::MO_HAND <= (a) && (a) <= Enums::MO_HMIN)
| |
/** Width selector for a vector register: 32-bit or 64-bit. */
enum vgpr_type
{
    VT_32,
    VT_64
};
| |
/** Memory segment an access is directed at. */
enum seg_type
{
    SEG_PRIVATE,
    SEG_SPILL,
    SEG_GLOBAL,
    SEG_SHARED,
    SEG_READONLY,
    SEG_FLAT
};
| |
| class GPUDynInst : public GPUExecContext |
| { |
| public: |
| GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst, |
| uint64_t instSeqNum); |
| |
| void execute(); |
| int numSrcRegOperands(); |
| int numDstRegOperands(); |
| int getNumOperands(); |
| bool isVectorRegister(int operandIdx); |
| bool isScalarRegister(int operandIdx); |
| int getRegisterIndex(int operandIdx); |
| int getOperandSize(int operandIdx); |
| bool isDstOperand(int operandIdx); |
| bool isSrcOperand(int operandIdx); |
| bool isArgLoad(); |
| |
| const std::string &disassemble() const; |
| |
| uint64_t seqNum() const; |
| |
| Enums::OpType opType(); |
| Enums::StorageClassType executedAs(); |
| |
| // The address of the memory operation |
| Addr addr[VSZ]; |
| Addr pAddr; |
| |
| // The data to get written |
| uint8_t d_data[VSZ * 16]; |
| // Additional data (for atomics) |
| uint8_t a_data[VSZ * 8]; |
| // Additional data (for atomics) |
| uint8_t x_data[VSZ * 8]; |
| // The execution mask |
| VectorMask exec_mask; |
| |
| // The memory type (M_U32, M_S32, ...) |
| Enums::MemType m_type; |
| // The memory operation (MO_LD, MO_ST, ...) |
| Enums::MemOpType m_op; |
| Enums::GenericMemoryOrder memoryOrder; |
| |
| // Scope of the request |
| Enums::GenericMemoryScope scope; |
| // The memory segment (SEG_SHARED, SEG_GLOBAL, ...) |
| seg_type s_type; |
| // The equivalency class |
| int equiv; |
| // The return VGPR type (VT_32 or VT_64) |
| vgpr_type v_type; |
| // Number of VGPR's accessed (1, 2, or 4) |
| int n_reg; |
| // The return VGPR index |
| int dst_reg; |
| // There can be max 4 dest regs> |
| int dst_reg_vec[4]; |
| // SIMD where the WF of the memory instruction has been mapped to |
| int simdId; |
| // unique id of the WF where the memory instruction belongs to |
| int wfDynId; |
| // The kernel id of the requesting wf |
| int kern_id; |
| // The CU id of the requesting wf |
| int cu_id; |
| // HW slot id where the WF is mapped to inside a SIMD unit |
| int wfSlotId; |
| // execution pipeline id where the memory instruction has been scheduled |
| int pipeId; |
| // The execution time of this operation |
| Tick time; |
| // The latency of this operation |
| WaitClass latency; |
| // A list of bank conflicts for the 4 cycles. |
| uint32_t bc[4]; |
| |
| // A pointer to ROM |
| uint8_t *rom; |
| // The size of the READONLY segment |
| int sz_rom; |
| |
| // Initiate the specified memory operation, by creating a |
| // memory request and sending it off to the memory system. |
| void initiateAcc(GPUDynInstPtr gpuDynInst); |
| |
| void updateStats(); |
| |
| GPUStaticInst* staticInstruction() { return staticInst; } |
| |
| // Is the instruction a scalar or vector op? |
| bool scalarOp() const; |
| |
| /* |
| * Loads/stores/atomics may have acquire/release semantics associated |
| * withthem. Some protocols want to see the acquire/release as separate |
| * requests from the load/store/atomic. We implement that separation |
| * using continuations (i.e., a function pointer with an object associated |
| * with it). When, for example, the front-end generates a store with |
| * release semantics, we will first issue a normal store and set the |
| * continuation in the GPUDynInst to a function that generate a |
| * release request. That continuation will be called when the normal |
| * store completes (in ComputeUnit::DataPort::recvTimingResponse). The |
| * continuation will be called in the context of the same GPUDynInst |
| * that generated the initial store. |
| */ |
| std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation; |
| |
| // when true, call execContinuation when response arrives |
| bool useContinuation; |
| |
| template<typename c0> AtomicOpFunctor* |
| makeAtomicOpFunctor(c0 *reg0, c0 *reg1, Enums::MemOpType op) |
| { |
| using namespace Enums; |
| |
| switch(op) { |
| case MO_AAND: |
| case MO_ANRAND: |
| return new AtomicOpAnd<c0>(*reg0); |
| case MO_AOR: |
| case MO_ANROR: |
| return new AtomicOpOr<c0>(*reg0); |
| case MO_AXOR: |
| case MO_ANRXOR: |
| return new AtomicOpXor<c0>(*reg0); |
| case MO_ACAS: |
| case MO_ANRCAS: |
| return new AtomicOpCAS<c0>(*reg0, *reg1, cu); |
| case MO_AEXCH: |
| case MO_ANREXCH: |
| return new AtomicOpExch<c0>(*reg0); |
| case MO_AADD: |
| case MO_ANRADD: |
| return new AtomicOpAdd<c0>(*reg0); |
| case MO_ASUB: |
| case MO_ANRSUB: |
| return new AtomicOpSub<c0>(*reg0); |
| case MO_AINC: |
| case MO_ANRINC: |
| return new AtomicOpInc<c0>(); |
| case MO_ADEC: |
| case MO_ANRDEC: |
| return new AtomicOpDec<c0>(); |
| case MO_AMAX: |
| case MO_ANRMAX: |
| return new AtomicOpMax<c0>(*reg0); |
| case MO_AMIN: |
| case MO_ANRMIN: |
| return new AtomicOpMin<c0>(*reg0); |
| default: |
| panic("Unrecognized atomic operation"); |
| } |
| } |
| |
| void |
| setRequestFlags(Request *req, bool setMemOrder=true) |
| { |
| // currently these are the easy scopes to deduce |
| switch (s_type) { |
| case SEG_PRIVATE: |
| req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT); |
| break; |
| case SEG_SPILL: |
| req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT); |
| break; |
| case SEG_GLOBAL: |
| req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT); |
| break; |
| case SEG_READONLY: |
| req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT); |
| break; |
| case SEG_SHARED: |
| req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT); |
| break; |
| case SEG_FLAT: |
| // TODO: translate to correct scope |
| assert(false); |
| default: |
| panic("Bad segment type"); |
| break; |
| } |
| |
| switch (scope) { |
| case Enums::MEMORY_SCOPE_NONE: |
| case Enums::MEMORY_SCOPE_WORKITEM: |
| break; |
| case Enums::MEMORY_SCOPE_WAVEFRONT: |
| req->setMemSpaceConfigFlags(Request::SCOPE_VALID | |
| Request::WAVEFRONT_SCOPE); |
| break; |
| case Enums::MEMORY_SCOPE_WORKGROUP: |
| req->setMemSpaceConfigFlags(Request::SCOPE_VALID | |
| Request::WORKGROUP_SCOPE); |
| break; |
| case Enums::MEMORY_SCOPE_DEVICE: |
| req->setMemSpaceConfigFlags(Request::SCOPE_VALID | |
| Request::DEVICE_SCOPE); |
| break; |
| case Enums::MEMORY_SCOPE_SYSTEM: |
| req->setMemSpaceConfigFlags(Request::SCOPE_VALID | |
| Request::SYSTEM_SCOPE); |
| break; |
| default: |
| panic("Bad scope type"); |
| break; |
| } |
| |
| if (setMemOrder) { |
| // set acquire and release flags |
| switch (memoryOrder){ |
| case Enums::MEMORY_ORDER_SC_ACQUIRE: |
| req->setFlags(Request::ACQUIRE); |
| break; |
| case Enums::MEMORY_ORDER_SC_RELEASE: |
| req->setFlags(Request::RELEASE); |
| break; |
| case Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE: |
| req->setFlags(Request::ACQUIRE | Request::RELEASE); |
| break; |
| default: |
| break; |
| } |
| } |
| |
| // set atomic type |
| // currently, the instruction genenerator only produces atomic return |
| // but a magic instruction can produce atomic no return |
| if (m_op == Enums::MO_AADD || m_op == Enums::MO_ASUB || |
| m_op == Enums::MO_AAND || m_op == Enums::MO_AOR || |
| m_op == Enums::MO_AXOR || m_op == Enums::MO_AMAX || |
| m_op == Enums::MO_AMIN || m_op == Enums::MO_AINC || |
| m_op == Enums::MO_ADEC || m_op == Enums::MO_AEXCH || |
| m_op == Enums::MO_ACAS) { |
| req->setFlags(Request::ATOMIC_RETURN_OP); |
| } else if (m_op == Enums::MO_ANRADD || m_op == Enums::MO_ANRSUB || |
| m_op == Enums::MO_ANRAND || m_op == Enums::MO_ANROR || |
| m_op == Enums::MO_ANRXOR || m_op == Enums::MO_ANRMAX || |
| m_op == Enums::MO_ANRMIN || m_op == Enums::MO_ANRINC || |
| m_op == Enums::MO_ANRDEC || m_op == Enums::MO_ANREXCH || |
| m_op == Enums::MO_ANRCAS) { |
| req->setFlags(Request::ATOMIC_NO_RETURN_OP); |
| } |
| } |
| |
| // Map returned packets and the addresses they satisfy with which lane they |
| // were requested from |
| typedef std::unordered_map<Addr, std::vector<int>> StatusVector; |
| StatusVector memStatusVector; |
| |
| // Track the status of memory requests per lane, a bit per lane |
| VectorMask statusBitVector; |
| // for ld_v# or st_v# |
| std::vector<int> statusVector; |
| std::vector<int> tlbHitLevel; |
| |
| private: |
| GPUStaticInst *staticInst; |
| uint64_t _seqNum; |
| }; |
| |
| #endif // __GPU_DYN_INST_HH__ |