| /* |
| * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. |
| * All rights reserved. |
| * |
| * For use for simulation and test purposes only |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright notice, |
| * this list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * |
| * 3. Neither the name of the copyright holder nor the names of its contributors |
| * may be used to endorse or promote products derived from this software |
| * without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| * POSSIBILITY OF SUCH DAMAGE. |
| * |
| * Author: Lisa Hsu |
| */ |
| |
| #ifndef __WAVEFRONT_HH__ |
| #define __WAVEFRONT_HH__ |
| |
| #include <cassert> |
| #include <deque> |
| #include <memory> |
| #include <stack> |
| #include <vector> |
| |
| #include "arch/gpu_isa.hh" |
| #include "base/logging.hh" |
| #include "base/types.hh" |
| #include "config/the_gpu_isa.hh" |
| #include "gpu-compute/condition_register_state.hh" |
| #include "gpu-compute/lds_state.hh" |
| #include "gpu-compute/misc.hh" |
| #include "gpu-compute/ndrange.hh" |
| #include "params/Wavefront.hh" |
| #include "sim/sim_object.hh" |
| |
// Maximum number of instructions per wavefront, going by the name; the code
// that consumes this limit is not part of this header.
static constexpr int MAX_NUM_INSTS_PER_WF = 12;
| |
| /** |
| * A reconvergence stack entry conveys the necessary state to implement |
| * control flow divergence. |
| */ |
| struct ReconvergenceStackEntry { |
| /** |
| * PC of current instruction. |
| */ |
| uint32_t pc; |
| /** |
| * PC of the immediate post-dominator instruction, i.e., the value of |
| * @a pc for the first instruction that will be executed by the wavefront |
| * when a reconvergence point is reached. |
| */ |
| uint32_t rpc; |
| /** |
| * Execution mask. |
| */ |
| VectorMask execMask; |
| }; |
| |
/*
 * Arguments for the hsail opcode call are user defined and variable length.
 * The hardware/finalizer can support arguments in hardware or use memory to
 * pass arguments. For now, let's assume that an unlimited number of arguments
 * are supported in hardware (the compiler inlines functions whenever it can
 * anyways, so unless someone is interested in the implications of linking/
 * library functions, I think this is a reasonable assumption given the typical
 * size of an OpenCL kernel).
 *
 * Note that call args are different than kernel arguments:
 *   * All work-items in a kernel refer to the same set of kernel arguments
 *   * Each work-item has its own set of call args. So a call argument at
 *     address 0x4 is different for work-item 0 and work-item 1.
 *
 * Ok, the table below shows an example of how we organize the call arguments
 * in the CallArgMem class.
 *
 * int foo(int arg1, double arg2)
 *  ___________________________________________________
 * | 0: return.0   | 4: return.1   | ... | 252: return.63  |
 * |---------------------------------------------------|
 * | 256: arg1.0   | 260: arg1.1   | ... | 508: arg1.63    |
 * |---------------------------------------------------|
 * | 512: arg2.0   | 520: arg2.1   | ... | 1016: arg2.63   |
 *  ___________________________________________________
 *
 * In other words, the value at per-item address `addr` for lane `lane` lives
 * at byte offset `addr * wfSize + sizeof(CType) * lane` (see getLaneOffset());
 * in the example above wfSize is 64, so arg1 (per-item address 4) starts at
 * byte 256.
 *
 * The class owns its buffer, so it is neither copyable nor movable: the
 * implicitly generated copy operations would duplicate the raw pointer and
 * free the same buffer twice.
 */
class CallArgMem
{
  public:
    // pointer to buffer for storing function arguments; allocated with
    // malloc() in the constructor and released with free() in the destructor
    uint8_t *mem;
    // number of lanes the buffer is laid out for
    int wfSize;
    // size (in bytes) of the function args of a single work-item
    int funcArgsSizePerItem;

    /**
     * Byte offset, from the start of the buffer, of the slot that holds the
     * argument at per-item address @a addr for lane @a lane.
     *
     * @tparam CType type of the argument; its size is the per-lane stride.
     * @param lane lane index, expected to be in [0, wfSize).
     * @param addr per-item address of the argument, expected to be >= 0.
     * @return addr * wfSize + sizeof(CType) * lane
     */
    template<typename CType>
    int
    getLaneOffset(int lane, int addr) const
    {
        assert(lane >= 0 && lane < wfSize);
        assert(addr >= 0);
        return addr * wfSize + static_cast<int>(sizeof(CType)) * lane;
    }

    /**
     * Allocate a buffer of func_args_size_per_item * wf_size bytes.
     *
     * @param func_args_size_per_item bytes of call arguments per work-item.
     * @param wf_size number of lanes.
     */
    CallArgMem(int func_args_size_per_item, int wf_size)
        : mem(nullptr), wfSize(wf_size),
          funcArgsSizePerItem(func_args_size_per_item)
    {
        assert(wfSize >= 0 && funcArgsSizePerItem >= 0);
        const std::size_t num_bytes = bufferSize();
        mem = static_cast<uint8_t*>(malloc(num_bytes));
        // malloc(0) is allowed to return a null pointer, so only a failed
        // non-empty allocation is treated as an error
        assert(mem != nullptr || num_bytes == 0);
    }

    // The buffer has a single owner; copying would lead to a double free.
    CallArgMem(const CallArgMem&) = delete;
    CallArgMem& operator=(const CallArgMem&) = delete;

    ~CallArgMem()
    {
        free(mem);
    }

    /**
     * Address of the slot for (@a lane, @a addr); see getLaneOffset().
     * The returned pointer refers into the buffer owned by this object.
     */
    template<typename CType>
    uint8_t*
    getLaneAddr(int lane, int addr)
    {
        return mem + getLaneOffset<CType>(lane, addr);
    }

    /**
     * Store @a val into the slot for (@a lane, @a addr).
     * Note that, despite the name, this writes the value itself and not an
     * address.
     */
    template<typename CType>
    void
    setLaneAddr(int lane, int addr, CType val)
    {
        const int offset = getLaneOffset<CType>(lane, addr);
        // the whole value has to fit inside the allocation
        assert(static_cast<std::size_t>(offset) + sizeof(CType) <=
               bufferSize());
        *reinterpret_cast<CType*>(mem + offset) = val;
    }

  private:
    // total size of the buffer in bytes, computed in size_t so that the
    // multiplication cannot overflow int
    std::size_t
    bufferSize() const
    {
        return static_cast<std::size_t>(funcArgsSizePerItem) *
               static_cast<std::size_t>(wfSize);
    }
};
| |
/**
 * Simulation object that holds the per-wavefront (WF) state: kernel launch
 * geometry, outstanding memory request counters, register bookkeeping,
 * barrier state, HSAIL call argument memory and the reconvergence stack used
 * for control flow divergence. All non-inline member functions are defined
 * outside this header.
 */
class Wavefront : public SimObject
{
  public:
    // instruction categories; this is the argument type taken by ready()
    enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE};
    // values stored in the status member below
    enum status_e {S_STOPPED,S_RETURNING,S_RUNNING};

    // Base pointer for array of instruction pointers
    uint64_t basePtr;

    // barrier bookkeeping
    // NOTE(review): the exact meaning of each counter is defined by the code
    // that updates them, which is not visible in this header
    uint32_t oldBarrierCnt;
    uint32_t barrierCnt;
    uint32_t barrierId;
    uint32_t barrierSlots;
    // current status of the WF (see status_e)
    status_e status;
    // HW slot id where the WF is mapped to inside a SIMD unit
    int wfSlotId;
    int kernId;
    // SIMD unit where the WF has been scheduled
    int simdId;
    // pointer to parent CU (set through setParent())
    ComputeUnit *computeUnit;

    // dynamic instructions buffered for this WF
    std::deque<GPUDynInstPtr> instructionBuffer;

    // fetch state flags
    // NOTE(review): presumably "a fetch is in flight" and "discard the result
    // of the in-flight fetch" -- confirm against the fetch logic
    bool pendingFetch;
    bool dropFetch;

    // Condition Register State (for HSAIL simulations only)
    class ConditionRegisterState *condRegState;
    // number of single precision VGPRs required by WF
    uint32_t maxSpVgprs;
    // number of double precision VGPRs required by WF
    uint32_t maxDpVgprs;
    // map virtual to physical vector register
    uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0);
    void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
    // classification helpers for a given instruction (Gm/Lm presumably
    // stand for global/local memory, matching the counters below)
    bool isGmInstruction(GPUDynInstPtr ii);
    bool isLmInstruction(GPUDynInstPtr ii);
    // classification helpers for the oldest instruction of this WF
    bool isOldestInstGMem();
    bool isOldestInstLMem();
    bool isOldestInstPrivMem();
    bool isOldestInstFlatMem();
    bool isOldestInstALU();
    bool isOldestInstBarrier();
    // used for passing spill address to DDInstGPU
    std::vector<Addr> lastAddr;
    // work-item ids, one vector per dimension
    std::vector<uint32_t> workItemId[3];
    std::vector<uint32_t> workItemFlatId;
    /* kernel launch parameters */
    uint32_t workGroupId[3];
    uint32_t workGroupSz[3];
    uint32_t gridSz[3];
    uint32_t wgId;
    uint32_t wgSz;
    /* the actual WG size can differ from the maximum size */
    uint32_t actualWgSz[3];
    uint32_t actualWgSzTotal;
    void computeActualWgSz(NDRange *ndr);
    // wavefront id within a workgroup
    uint32_t wfId;
    uint32_t maxDynWaveId;
    uint32_t dispatchId;
    // outstanding global+local memory requests
    uint32_t outstandingReqs;
    // memory requests between scoreboard
    // and execute stage not yet executed
    uint32_t memReqsInPipe;
    // outstanding global memory write requests
    uint32_t outstandingReqsWrGm;
    // outstanding local memory write requests
    uint32_t outstandingReqsWrLm;
    // outstanding global memory read requests
    uint32_t outstandingReqsRdGm;
    // outstanding local memory read requests
    uint32_t outstandingReqsRdLm;
    // per-kind breakdown of requests in the pipe (read/write x
    // local/global, going by the names; see memReqsInPipe)
    uint32_t rdLmReqsInPipe;
    uint32_t rdGmReqsInPipe;
    uint32_t wrLmReqsInPipe;
    uint32_t wrGmReqsInPipe;

    int memTraceBusy;
    uint64_t lastTrace;
    // number of vector registers reserved by WF
    int reservedVectorRegs;
    // Index into the Vector Register File's namespace where the WF's registers
    // will live while the WF is executed
    uint32_t startVgprIndex;

    // Old value of destination gpr (for trace)
    std::vector<uint32_t> oldVgpr;
    // Id of destination gpr (for trace)
    uint32_t oldVgprId;
    // Tick count of last old_vgpr copy
    uint64_t oldVgprTcnt;

    // Old value of 64-bit destination gpr (for trace)
    std::vector<uint64_t> oldDgpr;
    // Id of destination gpr (for trace)
    uint32_t oldDgprId;
    // Tick count of last old_dgpr copy
    uint64_t oldDgprTcnt;

    // Execution mask at wavefront start
    VectorMask initMask;

    // number of barriers this WF has joined
    std::vector<int> barCnt;
    int maxBarCnt;
    // Flag to stall a wave on barrier
    bool stalledAtBarrier;

    // a pointer to the fraction of the LDS allocated
    // to this workgroup (thus this wavefront)
    LdsChunk *ldsChunk;

    // A pointer to the spill area
    Addr spillBase;
    // The size of the spill area
    uint32_t spillSizePerItem;
    // The vector width of the spill area
    uint32_t spillWidth;

    // A pointer to the private memory area
    Addr privBase;
    // The size of the private memory area
    uint32_t privSizePerItem;

    // A pointer to the read-only memory area
    Addr roBase;
    // size of the read-only memory area
    uint32_t roSize;

    // pointer to buffer for storing kernel arguments
    uint8_t *kernelArgs;
    // unique WF id over all WFs executed across all CUs
    uint64_t wfDynId;

    // number of times instruction issue for this wavefront is blocked
    // due to VRF port availability
    Stats::Scalar numTimesBlockedDueVrfPortAvail;
    // number of times an instruction of a WF is blocked from being issued
    // due to WAR and WAW dependencies
    Stats::Scalar numTimesBlockedDueWAXDependencies;
    // number of times an instruction of a WF is blocked from being issued
    // due to RAW dependencies
    Stats::Scalar numTimesBlockedDueRAWDependencies;
    // distribution of executed instructions based on their register
    // operands; this is used to highlight the load on the VRF
    Stats::Distribution srcRegOpDist;
    Stats::Distribution dstRegOpDist;

    // Functions to operate on call argument memory
    // argument memory for hsail call instruction
    CallArgMem *callArgMem;
    // Allocate a new CallArgMem and store it in callArgMem. The previous
    // pointer value is overwritten without being deleted here.
    // NOTE(review): confirm that callers never invoke this twice without
    // releasing the old buffer, otherwise the old buffer is leaked.
    void
    initCallArgMem(int func_args_size_per_item, int wf_size)
    {
        callArgMem = new CallArgMem(func_args_size_per_item, wf_size);
    }

    // Read the call argument of type CType stored for the given lane at the
    // given per-item address. callArgMem must have been initialized.
    template<typename CType>
    CType
    readCallArgMem(int lane, int addr)
    {
        return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr)));
    }

    // Write val as the call argument for the given lane at the given
    // per-item address. callArgMem must have been initialized.
    template<typename CType>
    void
    writeCallArgMem(int lane, int addr, CType val)
    {
        callArgMem->setLaneAddr<CType>(lane, addr, val);
    }

    typedef WavefrontParams Params;
    Wavefront(const Params *p);
    ~Wavefront();
    virtual void init();

    // Record the compute unit this WF belongs to.
    void
    setParent(ComputeUnit *cu)
    {
        computeUnit = cu;
    }

    void start(uint64_t _wfDynId, uint64_t _base_ptr);
    void exec();
    void updateResources();
    int ready(itype_e type);
    bool instructionBufferHasBranch();
    void regStats();
    // Current execution mask restricted to the lanes that were active when
    // the wavefront started (execMask() & initMask).
    VectorMask getPred() { return execMask() & initMask; }

    bool waitingAtBarrier(int lane);

    // Reconvergence stack manipulation and accessors.
    // NOTE(review): pc(), rpc() and execMask() presumably refer to the top
    // entry of reconvergenceStack -- confirm in the implementation file.
    void pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
                                  const VectorMask& exec_mask);

    void popFromReconvergenceStack();

    uint32_t pc() const;

    uint32_t rpc() const;

    VectorMask execMask() const;

    bool execMask(int lane) const;

    void pc(uint32_t new_pc);

    void discardFetch();

    /**
     * Returns the size of the static hardware context of a particular wavefront
     * This should be updated every time the context is changed
     */
    uint32_t getStaticContextSize() const;

    /**
     * Returns the hardware context as a stream of bytes
     * This method is designed for HSAIL execution
     *
     * NOTE(review): @a out is described as an output buffer but is declared
     * as pointer-to-const; the definition is not visible here -- confirm
     * whether it casts the constness away.
     */
    void getContext(const void *out);

    /**
     * Sets the hardware context from a stream of bytes
     * This method is designed for HSAIL execution
     */
    void setContext(const void *in);

    // Accessor for the ISA state object embedded in this WF.
    TheGpuISA::GPUISA&
    gpuISA()
    {
        return _gpuISA;
    }

  private:
    TheGpuISA::GPUISA _gpuISA;
    /**
     * Stack containing Control Flow Graph nodes (i.e., kernel instructions)
     * to be visited by the wavefront, and the associated execution masks. The
     * reconvergence stack grows every time the wavefront reaches a divergence
     * point (branch instruction), and shrinks every time the wavefront
     * reaches a reconvergence point (immediate post-dominator instruction).
     */
    std::deque<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack;
};
| |
| #endif // __WAVEFRONT_HH__ |