| /* | 
 |  * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. | 
 |  * All rights reserved. | 
 |  * | 
 |  * For use for simulation and test purposes only | 
 |  * | 
 |  * Redistribution and use in source and binary forms, with or without | 
 |  * modification, are permitted provided that the following conditions are met: | 
 |  * | 
 |  * 1. Redistributions of source code must retain the above copyright notice, | 
 |  * this list of conditions and the following disclaimer. | 
 |  * | 
 |  * 2. Redistributions in binary form must reproduce the above copyright notice, | 
 |  * this list of conditions and the following disclaimer in the documentation | 
 |  * and/or other materials provided with the distribution. | 
 |  * | 
 |  * 3. Neither the name of the copyright holder nor the names of its | 
 |  * contributors may be used to endorse or promote products derived from this | 
 |  * software without specific prior written permission. | 
 |  * | 
 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | 
 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | 
 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE | 
 |  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | 
 |  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | 
 |  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | 
 |  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | 
 |  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | 
 |  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | 
 |  * POSSIBILITY OF SUCH DAMAGE. | 
 |  * | 
 |  * Authors: John Kalamatianos, | 
 |  *          Anthony Gutierrez | 
 |  */ | 
 |  | 
 | #ifndef __COMPUTE_UNIT_HH__ | 
 | #define __COMPUTE_UNIT_HH__ | 
 |  | 
#include <deque>
#include <list>
#include <map>
#include <queue>
#include <unordered_map>
#include <utility>
#include <vector>
 |  | 
 | #include "base/callback.hh" | 
 | #include "base/statistics.hh" | 
 | #include "base/types.hh" | 
 | #include "enums/PrefetchType.hh" | 
 | #include "gpu-compute/exec_stage.hh" | 
 | #include "gpu-compute/fetch_stage.hh" | 
 | #include "gpu-compute/global_memory_pipeline.hh" | 
 | #include "gpu-compute/local_memory_pipeline.hh" | 
 | #include "gpu-compute/qstruct.hh" | 
 | #include "gpu-compute/schedule_stage.hh" | 
 | #include "gpu-compute/scoreboard_check_stage.hh" | 
 | #include "mem/mem_object.hh" | 
 | #include "mem/port.hh" | 
 |  | 
 | static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1; | 
 | static const int MAX_WIDTH_FOR_MEM_INST = 32; | 
 |  | 
 | class NDRange; | 
 | class Shader; | 
 | class VectorRegisterFile; | 
 |  | 
 | struct ComputeUnitParams; | 
 |  | 
 | enum EXEC_POLICY | 
 | { | 
 |     OLDEST = 0, | 
 |     RR | 
 | }; | 
 |  | 
 | // List of execution units | 
 | enum EXEC_UNIT | 
 | { | 
 |     SIMD0 = 0, | 
 |     SIMD1, | 
 |     SIMD2, | 
 |     SIMD3, | 
 |     GLBMEM_PIPE, | 
 |     LDSMEM_PIPE, | 
 |     NUM_UNITS | 
 | }; | 
 |  | 
 | enum TLB_CACHE | 
 | { | 
 |     TLB_MISS_CACHE_MISS = 0, | 
 |     TLB_MISS_CACHE_HIT, | 
 |     TLB_HIT_CACHE_MISS, | 
 |     TLB_HIT_CACHE_HIT | 
 | }; | 
 |  | 
 | class ComputeUnit : public MemObject | 
 | { | 
 |   public: | 
 |     FetchStage fetchStage; | 
 |     ScoreboardCheckStage scoreboardCheckStage; | 
 |     ScheduleStage scheduleStage; | 
 |     ExecStage execStage; | 
 |     GlobalMemPipeline globalMemoryPipe; | 
 |     LocalMemPipeline localMemoryPipe; | 
 |  | 
 |     // Buffers used to communicate between various pipeline stages | 
 |  | 
    // List of waves that are ready to be scheduled.
    // Each execution resource has a ready list. readyList is
    // used to communicate between the scoreboardCheck stage and
    // the schedule stage.
 |     // TODO: make enum to index readyList | 
 |     std::vector<std::vector<Wavefront*>> readyList; | 
 |  | 
    // Stores the status of waves. A READY status implies the
    // wave is ready to be scheduled this cycle and
    // is already present in the readyList. waveStatusList is
    // used to communicate between the scoreboardCheck stage and
    // the schedule stage.
 |     // TODO: convert std::pair to a class to increase readability | 
 |     std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList; | 
 |  | 
    // List of waves that will be dispatched to
    // each execution resource. A FILLED status implies
    // the dispatch list is non-empty and the
    // execution unit has something to execute
    // this cycle. Currently, the dispatch list of
    // an execution resource can hold only one wave because
    // an execution resource can execute only one wave per cycle.
    // dispatchList is used to communicate between the schedule
    // and exec stages.
 |     // TODO: convert std::pair to a class to increase readability | 
 |     std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList; | 
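    //
    // Illustrative sketch (not actual code; the real consumers are the
    // scheduleStage and execStage objects above): each cycle the exec
    // stage conceptually walks dispatchList and runs whatever the
    // schedule stage marked as FILLED, roughly
    //
    //   for (int unitId = 0; unitId < NUM_UNITS; ++unitId) {
    //       if (dispatchList[unitId].second == FILLED)
    //           dispatchList[unitId].first->exec();
    //   }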
 |  | 
    int rrNextMemID; // used by the RR WF exec policy to cycle through WFs
 |     int rrNextALUWp; | 
 |     typedef ComputeUnitParams Params; | 
 |     std::vector<std::vector<Wavefront*>> wfList; | 
 |     int cu_id; | 
 |  | 
 |     // array of vector register files, one per SIMD | 
 |     std::vector<VectorRegisterFile*> vrf; | 
 |     // Number of vector ALU units (SIMDs) in CU | 
 |     int numSIMDs; | 
    // number of pipe stages for bypassing data to the next dependent single
    // precision vector instruction inside the vector ALU pipeline
    int spBypassPipeLength;
    // number of pipe stages for bypassing data to the next dependent double
    // precision vector instruction inside the vector ALU pipeline
    int dpBypassPipeLength;
 |     // number of cycles per issue period | 
 |     int issuePeriod; | 
 |  | 
 |     // Number of global and local memory execution resources in CU | 
 |     int numGlbMemUnits; | 
 |     int numLocMemUnits; | 
 |     // tracks the last cycle a vector instruction was executed on a SIMD | 
 |     std::vector<uint64_t> lastExecCycle; | 
 |  | 
 |     // true if we allow a separate TLB per lane | 
 |     bool perLaneTLB; | 
 |     // if 0, TLB prefetching is off. | 
 |     int prefetchDepth; | 
 |     // if fixed-stride prefetching, this is the stride. | 
 |     int prefetchStride; | 
 |  | 
 |     std::vector<Addr> lastVaddrCU; | 
 |     std::vector<std::vector<Addr>> lastVaddrSimd; | 
 |     std::vector<std::vector<std::vector<Addr>>> lastVaddrWF; | 
 |     Enums::PrefetchType prefetchType; | 
 |     EXEC_POLICY exec_policy; | 
 |  | 
 |     bool xact_cas_mode; | 
 |     bool debugSegFault; | 
 |     bool functionalTLB; | 
 |     bool localMemBarrier; | 
 |  | 
 |     /* | 
 |      * for Counting page accesses | 
 |      * | 
 |      * cuExitCallback inherits from Callback. When you register a callback | 
 |      * function as an exit callback, it will get added to an exit callback | 
 |      * queue, such that on simulation exit, all callbacks in the callback | 
 |      * queue will have their process() function called. | 
 |      */ | 
 |     bool countPages; | 
 |  | 
 |     Shader *shader; | 
 |     uint32_t barrier_id; | 
 |     // vector of Vector ALU (MACC) pipelines | 
 |     std::vector<WaitClass> aluPipe; | 
 |     // minimum issue period per SIMD unit (in cycles) | 
 |     std::vector<WaitClass> wfWait; | 
 |  | 
 |     // Resource control for Vector Register File->Global Memory pipe buses | 
 |     std::vector<WaitClass> vrfToGlobalMemPipeBus; | 
 |     // Resource control for Vector Register File->Local Memory pipe buses | 
 |     std::vector<WaitClass> vrfToLocalMemPipeBus; | 
 |     int nextGlbMemBus; | 
 |     int nextLocMemBus; | 
 |     // Resource control for global memory to VRF data/address bus | 
 |     WaitClass glbMemToVrfBus; | 
 |     // Resource control for local memory to VRF data/address bus | 
 |     WaitClass locMemToVrfBus; | 
 |  | 
 |     uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes | 
 |     uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes | 
 |     uint32_t numCyclesPerStoreTransfer;  // number of cycles per vector store | 
 |     uint32_t numCyclesPerLoadTransfer;  // number of cycles per vector load | 
 |  | 
 |     Tick req_tick_latency; | 
 |     Tick resp_tick_latency; | 
 |  | 
 |     // number of vector registers being reserved for each SIMD unit | 
 |     std::vector<int> vectorRegsReserved; | 
 |     // number of vector registers per SIMD unit | 
 |     uint32_t numVecRegsPerSimd; | 
 |     // Support for scheduling VGPR status update events | 
 |     std::vector<std::pair<uint32_t, uint32_t> > regIdxVec; | 
 |     std::vector<uint64_t> timestampVec; | 
 |     std::vector<uint8_t>  statusVec; | 
 |  | 
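    /**
     * Record a pending VGPR status update: vector register (simdId, regIdx)
     * takes on newStatus at tick 'when' (the queued entries are presumably
     * consumed later by updateEvents()). An operand wider than 4 bytes spans
     * two consecutive 32-bit VGPRs, so a second entry is queued for
     * (regIdx + 1) % numVecRegsPerSimd.
     *
     * Illustrative call (hypothetical argument values and status encoding):
     *   registerEvent(simdId, dstIdx, 8, curTick() + vrfWbDelay, newStatus);
     */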
 |     void | 
 |     registerEvent(uint32_t simdId, | 
 |                   uint32_t regIdx, | 
 |                   uint32_t operandSize, | 
 |                   uint64_t when, | 
 |                   uint8_t newStatus) { | 
 |         regIdxVec.push_back(std::make_pair(simdId, regIdx)); | 
 |         timestampVec.push_back(when); | 
 |         statusVec.push_back(newStatus); | 
 |         if (operandSize > 4) { | 
 |             regIdxVec.push_back(std::make_pair(simdId, | 
 |                                                ((regIdx + 1) % | 
 |                                                 numVecRegsPerSimd))); | 
 |             timestampVec.push_back(when); | 
 |             statusVec.push_back(newStatus); | 
 |         } | 
 |     } | 
 |  | 
 |     void updateEvents(); | 
 |  | 
 |     // this hash map will keep track of page divergence | 
 |     // per memory instruction per wavefront. The hash map | 
 |     // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc. | 
 |     std::map<Addr, int> pagesTouched; | 
 |  | 
 |     ComputeUnit(const Params *p); | 
 |     ~ComputeUnit(); | 
    int spBypassLength() { return spBypassPipeLength; }
    int dpBypassLength() { return dpBypassPipeLength; }
    int storeBusLength() { return numCyclesPerStoreTransfer; }
    int loadBusLength() { return numCyclesPerLoadTransfer; }
    int wfSize() const { return wavefrontSize; }
 |  | 
 |     void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs); | 
 |     void exec(); | 
 |     void initiateFetch(Wavefront *wavefront); | 
 |     void fetch(PacketPtr pkt, Wavefront *wavefront); | 
 |     void fillKernelState(Wavefront *w, NDRange *ndr); | 
 |  | 
 |     void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, | 
 |                         NDRange *ndr); | 
 |  | 
 |     void StartWorkgroup(NDRange *ndr); | 
 |     int ReadyWorkgroup(NDRange *ndr); | 
 |  | 
 |     bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; } | 
 |     bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; } | 
 |     bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; } | 
 |     int GlbMemUnitId() { return GLBMEM_PIPE; } | 
 |     int ShrMemUnitId() { return LDSMEM_PIPE; } | 
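    // round-robin selection of the next VRF->global/local memory pipe bus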
 |     int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; } | 
 |     int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; } | 
    /* This function cycles through all the wavefronts in all the phases to
     * check whether all of the wavefronts that should be associated with one
     * barrier (denoted by _barrier_id) have reached the same barrier in the
     * program (denoted by bcnt). It returns non-zero once the number of
     * wavefronts at the barrier matches bslots.
     */
 |     int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots); | 
 |     bool cedeSIMD(int simdId, int wfSlotId); | 
 |  | 
    template<typename c0, typename c1>
    void doSmReturn(GPUDynInstPtr gpuDynInst);
 |     virtual void init(); | 
 |     void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt); | 
 |     void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt); | 
 |     void injectGlobalMemFence(GPUDynInstPtr gpuDynInst, | 
 |                               bool kernelLaunch=true, | 
 |                               RequestPtr req=nullptr); | 
 |     void handleMemPacket(PacketPtr pkt, int memport_index); | 
 |     bool processTimingPacket(PacketPtr pkt); | 
 |     void processFetchReturn(PacketPtr pkt); | 
 |     void updatePageDivergenceDist(Addr addr); | 
 |  | 
 |     MasterID masterId() { return _masterId; } | 
 |  | 
 |     bool isDone() const; | 
 |     bool isSimdDone(uint32_t) const; | 
 |  | 
 |   protected: | 
 |     MasterID _masterId; | 
 |  | 
 |     LdsState &lds; | 
 |  | 
 |   public: | 
 |     Stats::Scalar vALUInsts; | 
 |     Stats::Formula vALUInstsPerWF; | 
 |     Stats::Scalar sALUInsts; | 
 |     Stats::Formula sALUInstsPerWF; | 
 |     Stats::Scalar instCyclesVALU; | 
 |     Stats::Scalar instCyclesSALU; | 
 |     Stats::Scalar threadCyclesVALU; | 
 |     Stats::Formula vALUUtilization; | 
 |     Stats::Scalar ldsNoFlatInsts; | 
 |     Stats::Formula ldsNoFlatInstsPerWF; | 
 |     Stats::Scalar flatVMemInsts; | 
 |     Stats::Formula flatVMemInstsPerWF; | 
 |     Stats::Scalar flatLDSInsts; | 
 |     Stats::Formula flatLDSInstsPerWF; | 
 |     Stats::Scalar vectorMemWrites; | 
 |     Stats::Formula vectorMemWritesPerWF; | 
 |     Stats::Scalar vectorMemReads; | 
 |     Stats::Formula vectorMemReadsPerWF; | 
 |     Stats::Scalar scalarMemWrites; | 
 |     Stats::Formula scalarMemWritesPerWF; | 
 |     Stats::Scalar scalarMemReads; | 
 |     Stats::Formula scalarMemReadsPerWF; | 
 |  | 
 |     void updateInstStats(GPUDynInstPtr gpuDynInst); | 
 |  | 
    // the following stats compute the avg. TLB access latency per
    // uncoalesced request (only for data)
 |     Stats::Scalar tlbRequests; | 
 |     Stats::Scalar tlbCycles; | 
 |     Stats::Formula tlbLatency; | 
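    // (likely bound in regStats() as tlbLatency = tlbCycles / tlbRequests;
    //  stated here as an assumption for readability)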
 |     // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table. | 
 |     Stats::Vector hitsPerTLBLevel; | 
 |  | 
 |     Stats::Scalar ldsBankAccesses; | 
 |     Stats::Distribution ldsBankConflictDist; | 
 |  | 
    // Distribution, over all memory instructions executed across all
    // wavefronts, of how many touched 0-4 pages, 4-8, ..., 60-64 pages
 |     Stats::Distribution pageDivergenceDist; | 
 |     Stats::Scalar dynamicGMemInstrCnt; | 
 |     Stats::Scalar dynamicLMemInstrCnt; | 
 |  | 
 |     Stats::Scalar wgBlockedDueLdsAllocation; | 
    // Number of instructions executed; incremented by 1 per committed
    // instruction regardless of how many lanes (e.g., 64, 32, or 7) are
    // active at commit
 |     Stats::Scalar numInstrExecuted; | 
    // Number of cycles between successive instruction executions across all
    // wavefronts of the same CU
 |     Stats::Distribution execRateDist; | 
 |     // number of individual vector operations executed | 
 |     Stats::Scalar numVecOpsExecuted; | 
 |     // Total cycles that something is running on the GPU | 
 |     Stats::Scalar totalCycles; | 
 |     Stats::Formula vpc; // vector ops per cycle | 
 |     Stats::Formula ipc; // vector instructions per cycle | 
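    // (likely bound in regStats() as vpc = numVecOpsExecuted / totalCycles
    //  and ipc = numInstrExecuted / totalCycles; stated here as an
    //  assumption for readability)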
 |     Stats::Distribution controlFlowDivergenceDist; | 
 |     Stats::Distribution activeLanesPerGMemInstrDist; | 
 |     Stats::Distribution activeLanesPerLMemInstrDist; | 
 |     // number of vector ALU instructions received | 
 |     Stats::Formula numALUInstsExecuted; | 
    // number of times a WG cannot start due to lack of free VGPRs in SIMDs
 |     Stats::Scalar numTimesWgBlockedDueVgprAlloc; | 
 |     Stats::Scalar numCASOps; | 
 |     Stats::Scalar numFailedCASOps; | 
 |     Stats::Scalar completedWfs; | 
    // flag per vector SIMD unit that is set when there is at least one
    // WV that has a vector ALU instruction as the oldest in its
    // instruction buffer; set in the scoreboardCheck stage and consumed
    // by the exec stage
 |     std::vector<bool> vectorAluInstAvail; | 
 |     // number of available (oldest) LDS instructions that could have | 
 |     // been issued to the LDS at a specific issue slot | 
 |     int shrMemInstAvail; | 
 |     // number of available Global memory instructions that could have | 
 |     // been issued to TCP at a specific issue slot | 
 |     int glbMemInstAvail; | 
 |  | 
 |     void | 
 |     regStats(); | 
 |  | 
 |     LdsState & | 
 |     getLds() const | 
 |     { | 
 |         return lds; | 
 |     } | 
 |  | 
 |     int32_t | 
 |     getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const; | 
 |  | 
 |     int cacheLineSize() const { return _cacheLineSize; } | 
 |  | 
 |     bool | 
 |     sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result)); | 
 |  | 
 |     typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct; | 
 |     pageDataStruct pageAccesses; | 
 |  | 
 |     class CUExitCallback : public Callback | 
 |     { | 
 |       private: | 
 |         ComputeUnit *computeUnit; | 
 |  | 
 |       public: | 
 |         virtual ~CUExitCallback() { } | 
 |  | 
 |         CUExitCallback(ComputeUnit *_cu) | 
 |         { | 
 |             computeUnit = _cu; | 
 |         } | 
 |  | 
 |         virtual void | 
 |         process(); | 
 |     }; | 
 |  | 
 |     CUExitCallback *cuExitCallback; | 
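    // The callback is presumably created and registered in the ComputeUnit
    // constructor, along the lines of (hedged sketch, not verbatim code):
    //   cuExitCallback = new CUExitCallback(this);
    //   registerExitCallback(cuExitCallback);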
 |  | 
 |     /** Data access Port **/ | 
 |     class DataPort : public MasterPort | 
 |     { | 
 |       public: | 
 |         DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index) | 
 |             : MasterPort(_name, _cu), computeUnit(_cu), | 
 |               index(_index) { } | 
 |  | 
 |         bool snoopRangeSent; | 
 |  | 
 |         struct SenderState : public Packet::SenderState | 
 |         { | 
 |             GPUDynInstPtr _gpuDynInst; | 
 |             int port_index; | 
 |             Packet::SenderState *saved; | 
 |  | 
 |             SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index, | 
 |                         Packet::SenderState *sender_state=nullptr) | 
 |                 : _gpuDynInst(gpuDynInst), | 
 |                   port_index(_port_index), | 
 |                   saved(sender_state) { } | 
 |         }; | 
 |  | 
 |         void processMemReqEvent(PacketPtr pkt); | 
 |         EventFunctionWrapper *createMemReqEvent(PacketPtr pkt); | 
 |  | 
 |         void processMemRespEvent(PacketPtr pkt); | 
 |         EventFunctionWrapper *createMemRespEvent(PacketPtr pkt); | 
 |  | 
 |         std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries; | 
 |  | 
 |       protected: | 
 |         ComputeUnit *computeUnit; | 
 |         int index; | 
 |  | 
 |         virtual bool recvTimingResp(PacketPtr pkt); | 
 |         virtual Tick recvAtomic(PacketPtr pkt) { return 0; } | 
 |         virtual void recvFunctional(PacketPtr pkt) { } | 
 |         virtual void recvRangeChange() { } | 
 |         virtual void recvReqRetry(); | 
 |  | 
 |         virtual void | 
 |         getDeviceAddressRanges(AddrRangeList &resp, bool &snoop) | 
 |         { | 
 |             resp.clear(); | 
 |             snoop = true; | 
 |         } | 
 |  | 
 |     }; | 
 |  | 
 |     // Instruction cache access port | 
 |     class SQCPort : public MasterPort | 
 |     { | 
 |       public: | 
 |         SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index) | 
 |             : MasterPort(_name, _cu), computeUnit(_cu), | 
 |               index(_index) { } | 
 |  | 
 |         bool snoopRangeSent; | 
 |  | 
 |         struct SenderState : public Packet::SenderState | 
 |         { | 
 |             Wavefront *wavefront; | 
 |             Packet::SenderState *saved; | 
 |  | 
 |             SenderState(Wavefront *_wavefront, Packet::SenderState | 
 |                     *sender_state=nullptr) | 
 |                 : wavefront(_wavefront), saved(sender_state) { } | 
 |         }; | 
 |  | 
 |         std::deque<std::pair<PacketPtr, Wavefront*>> retries; | 
 |  | 
 |       protected: | 
 |         ComputeUnit *computeUnit; | 
 |         int index; | 
 |  | 
 |         virtual bool recvTimingResp(PacketPtr pkt); | 
 |         virtual Tick recvAtomic(PacketPtr pkt) { return 0; } | 
 |         virtual void recvFunctional(PacketPtr pkt) { } | 
 |         virtual void recvRangeChange() { } | 
 |         virtual void recvReqRetry(); | 
 |  | 
 |         virtual void | 
 |         getDeviceAddressRanges(AddrRangeList &resp, bool &snoop) | 
 |         { | 
 |             resp.clear(); | 
 |             snoop = true; | 
 |         } | 
 |      }; | 
 |  | 
 |     /** Data TLB port **/ | 
 |     class DTLBPort : public MasterPort | 
 |     { | 
 |       public: | 
 |         DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index) | 
 |             : MasterPort(_name, _cu), computeUnit(_cu), | 
 |               index(_index), stalled(false) | 
 |         { } | 
 |  | 
 |         bool isStalled() { return stalled; } | 
 |         void stallPort() { stalled = true; } | 
 |         void unstallPort() { stalled = false; } | 
 |  | 
 |         /** | 
 |          * here we queue all the translation requests that were | 
 |          * not successfully sent. | 
 |          */ | 
 |         std::deque<PacketPtr> retries; | 
 |  | 
 |         /** SenderState is information carried along with the packet | 
 |          * throughout the TLB hierarchy | 
 |          */ | 
 |         struct SenderState: public Packet::SenderState | 
 |         { | 
 |             // the memInst that this is associated with | 
 |             GPUDynInstPtr _gpuDynInst; | 
 |  | 
 |             // the lane in the memInst this is associated with, so we send | 
 |             // the memory request down the right port | 
 |             int portIndex; | 
 |  | 
 |             // constructor used for packets involved in timing accesses | 
 |             SenderState(GPUDynInstPtr gpuDynInst, PortID port_index) | 
 |                 : _gpuDynInst(gpuDynInst), portIndex(port_index) { } | 
 |  | 
 |         }; | 
 |  | 
 |       protected: | 
 |         ComputeUnit *computeUnit; | 
 |         int index; | 
 |         bool stalled; | 
 |  | 
 |         virtual bool recvTimingResp(PacketPtr pkt); | 
 |         virtual Tick recvAtomic(PacketPtr pkt) { return 0; } | 
 |         virtual void recvFunctional(PacketPtr pkt) { } | 
 |         virtual void recvRangeChange() { } | 
 |         virtual void recvReqRetry(); | 
 |     }; | 
 |  | 
 |     class ITLBPort : public MasterPort | 
 |     { | 
 |       public: | 
 |         ITLBPort(const std::string &_name, ComputeUnit *_cu) | 
 |             : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { } | 
 |  | 
 |  | 
 |         bool isStalled() { return stalled; } | 
 |         void stallPort() { stalled = true; } | 
 |         void unstallPort() { stalled = false; } | 
 |  | 
 |         /** | 
 |          * here we queue all the translation requests that were | 
 |          * not successfully sent. | 
 |          */ | 
 |         std::deque<PacketPtr> retries; | 
 |  | 
 |         /** SenderState is information carried along with the packet | 
 |          * throughout the TLB hierarchy | 
 |          */ | 
 |         struct SenderState: public Packet::SenderState | 
 |         { | 
 |             // The wavefront associated with this request | 
 |             Wavefront *wavefront; | 
 |  | 
 |             SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { } | 
 |         }; | 
 |  | 
 |       protected: | 
 |         ComputeUnit *computeUnit; | 
 |         bool stalled; | 
 |  | 
 |         virtual bool recvTimingResp(PacketPtr pkt); | 
 |         virtual Tick recvAtomic(PacketPtr pkt) { return 0; } | 
 |         virtual void recvFunctional(PacketPtr pkt) { } | 
 |         virtual void recvRangeChange() { } | 
 |         virtual void recvReqRetry(); | 
 |     }; | 
 |  | 
 |     /** | 
 |      * the port intended to communicate between the CU and its LDS | 
 |      */ | 
 |     class LDSPort : public MasterPort | 
 |     { | 
 |       public: | 
 |         LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id) | 
 |         : MasterPort(_name, _cu, _id), computeUnit(_cu) | 
 |         { | 
 |         } | 
 |  | 
 |         bool isStalled() const { return stalled; } | 
 |         void stallPort() { stalled = true; } | 
 |         void unstallPort() { stalled = false; } | 
 |  | 
 |         /** | 
 |          * here we queue all the requests that were | 
 |          * not successfully sent. | 
 |          */ | 
 |         std::queue<PacketPtr> retries; | 
 |  | 
 |         /** | 
 |          *  SenderState is information carried along with the packet, esp. the | 
 |          *  GPUDynInstPtr | 
 |          */ | 
 |         class SenderState: public Packet::SenderState | 
 |         { | 
 |           protected: | 
 |             // The actual read/write/atomic request that goes with this command | 
 |             GPUDynInstPtr _gpuDynInst = nullptr; | 
 |  | 
 |           public: | 
 |             SenderState(GPUDynInstPtr gpuDynInst): | 
 |               _gpuDynInst(gpuDynInst) | 
 |             { | 
 |             } | 
 |  | 
 |             GPUDynInstPtr | 
 |             getMemInst() const | 
 |             { | 
 |               return _gpuDynInst; | 
 |             } | 
 |         }; | 
 |  | 
 |         virtual bool | 
 |         sendTimingReq(PacketPtr pkt); | 
 |  | 
 |       protected: | 
 |  | 
 |         bool stalled = false; ///< whether or not it is stalled | 
 |  | 
 |         ComputeUnit *computeUnit; | 
 |  | 
 |         virtual bool | 
 |         recvTimingResp(PacketPtr pkt); | 
 |  | 
 |         virtual Tick | 
 |         recvAtomic(PacketPtr pkt) { return 0; } | 
 |  | 
 |         virtual void | 
 |         recvFunctional(PacketPtr pkt) | 
 |         { | 
 |         } | 
 |  | 
 |         virtual void | 
 |         recvRangeChange() | 
 |         { | 
 |         } | 
 |  | 
 |         virtual void | 
 |         recvReqRetry(); | 
 |     }; | 
 |  | 
    /** The port to access the Local Data Store.
     *  Can be connected to an LDS object.
     */
 |     LDSPort *ldsPort = nullptr; | 
 |  | 
 |     LDSPort * | 
 |     getLdsPort() const | 
 |     { | 
 |         return ldsPort; | 
 |     } | 
 |  | 
    /** The memory port for SIMD data accesses.
     *  Can be connected to PhysMem or Ruby for timing simulations.
     */
 |     std::vector<DataPort*> memPort; | 
 |     // port to the TLB hierarchy (i.e., the L1 TLB) | 
 |     std::vector<DTLBPort*> tlbPort; | 
 |     // port to the SQC (i.e. the I-cache) | 
 |     SQCPort *sqcPort; | 
 |     // port to the SQC TLB (there's a separate TLB for each I-cache) | 
 |     ITLBPort *sqcTLBPort; | 
 |  | 
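    // Elaboration-time port lookup: gem5 calls getMasterPort() when the
    // Python configuration binds a port by name. Vector ports such as
    // memory_port and translation_port use idx to select the lane, and the
    // port objects are allocated lazily on first lookup.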
 |     virtual BaseMasterPort& | 
 |     getMasterPort(const std::string &if_name, PortID idx) | 
 |     { | 
 |         if (if_name == "memory_port") { | 
 |             memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx), | 
 |                                         this, idx); | 
 |             return *memPort[idx]; | 
 |         } else if (if_name == "translation_port") { | 
 |             tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx), | 
 |                                         this, idx); | 
 |             return *tlbPort[idx]; | 
 |         } else if (if_name == "sqc_port") { | 
 |             sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx), | 
 |                                   this, idx); | 
 |             return *sqcPort; | 
 |         } else if (if_name == "sqc_tlb_port") { | 
 |             sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this); | 
 |             return *sqcTLBPort; | 
 |         } else if (if_name == "ldsPort") { | 
 |             if (ldsPort) { | 
 |                 fatal("an LDS port was already allocated"); | 
 |             } | 
 |             ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx); | 
 |             return *ldsPort; | 
 |         } else { | 
 |             panic("incorrect port name"); | 
 |         } | 
 |     } | 
 |  | 
 |     // xact_cas_load() | 
 |     class waveIdentifier | 
 |     { | 
 |       public: | 
 |         waveIdentifier() { } | 
 |         waveIdentifier(int _simdId, int _wfSlotId) | 
 |           : simdId(_simdId), wfSlotId(_wfSlotId) { } | 
 |  | 
 |         int simdId; | 
 |         int wfSlotId; | 
 |     }; | 
 |  | 
 |     class waveQueue | 
 |     { | 
 |       public: | 
 |         std::list<waveIdentifier> waveIDQueue; | 
 |     }; | 
 |     std::map<unsigned, waveQueue> xactCasLoadMap; | 
 |  | 
 |     uint64_t getAndIncSeqNum() { return globalSeqNum++; } | 
 |  | 
 |   private: | 
 |     const int _cacheLineSize; | 
 |     uint64_t globalSeqNum; | 
 |     int wavefrontSize; | 
 |     GPUStaticInst *kernelLaunchInst; | 
 | }; | 
 |  | 
 | #endif // __COMPUTE_UNIT_HH__ |