/*
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __COMPUTE_UNIT_HH__
#define __COMPUTE_UNIT_HH__
#include <deque>
#include <map>
#include <unordered_set>
#include <vector>
#include "base/callback.hh"
#include "base/compiler.hh"
#include "base/statistics.hh"
#include "base/stats/group.hh"
#include "base/types.hh"
#include "config/the_gpu_isa.hh"
#include "enums/PrefetchType.hh"
#include "gpu-compute/comm.hh"
#include "gpu-compute/exec_stage.hh"
#include "gpu-compute/fetch_stage.hh"
#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/local_memory_pipeline.hh"
#include "gpu-compute/register_manager.hh"
#include "gpu-compute/scalar_memory_pipeline.hh"
#include "gpu-compute/schedule_stage.hh"
#include "gpu-compute/scoreboard_check_stage.hh"
#include "mem/port.hh"
#include "mem/token_port.hh"
#include "sim/clocked_object.hh"
namespace gem5
{
class HSAQueueEntry;
class LdsChunk;
class ScalarRegisterFile;
class Shader;
class VectorRegisterFile;
struct ComputeUnitParams;
enum EXEC_POLICY
{
OLDEST = 0,
RR
};
enum TLB_CACHE
{
TLB_MISS_CACHE_MISS = 0,
TLB_MISS_CACHE_HIT,
TLB_HIT_CACHE_MISS,
TLB_HIT_CACHE_HIT
};
/**
* WF barrier slots. This represents the barrier resource for
* WF-level barriers (i.e., barriers to sync WFs within a WG).
*/
class WFBarrier
{
public:
WFBarrier() : _numAtBarrier(0), _maxBarrierCnt(0)
{
}
static const int InvalidID = -1;
int
numAtBarrier() const
{
return _numAtBarrier;
}
/**
* Number of WFs that have not yet reached the barrier.
*/
int
numYetToReachBarrier() const
{
return _maxBarrierCnt - _numAtBarrier;
}
int
maxBarrierCnt() const
{
return _maxBarrierCnt;
}
/**
* Set the maximum barrier count (i.e., the number of WFs that are
* participating in the barrier).
*/
void
setMaxBarrierCnt(int max_barrier_cnt)
{
_maxBarrierCnt = max_barrier_cnt;
}
/**
* Mark that a WF has reached the barrier.
*/
void
incNumAtBarrier()
{
assert(_numAtBarrier < _maxBarrierCnt);
++_numAtBarrier;
}
/**
* Have all WFs participating in this barrier reached the barrier?
* If so, then the barrier is satisfied and WFs may proceed past
* the barrier.
*/
bool
allAtBarrier() const
{
return _numAtBarrier == _maxBarrierCnt;
}
/**
* Decrement the number of WFs that are participating in this barrier.
* This should be called when a WF exits.
*/
void
decMaxBarrierCnt()
{
assert(_maxBarrierCnt > 0);
--_maxBarrierCnt;
}
/**
* Release this barrier resource so it can be used by other WGs. This
* is generally called when a WG has finished.
*/
void
release()
{
_numAtBarrier = 0;
_maxBarrierCnt = 0;
}
/**
* Reset the barrier, typically after a dynamic instance of the barrier
* has been satisfied.
*/
void
reset()
{
_numAtBarrier = 0;
}
private:
/**
* The number of WFs in the WG that have reached the barrier. Once
* the number of WFs that reach a barrier matches the number of WFs
* in the WG, the barrier is satisfied.
*/
int _numAtBarrier;
/**
* The maximum number of WFs that can reach this barrier. This is
* essentially the number of WFs in the WG, and a barrier is satisfied
* when the number of WFs that reach the barrier equal this value. If
* a WF exits early it must decrement this value so that it is no
* longer considered for this barrier.
*/
int _maxBarrierCnt;
};
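/*
 * Illustrative sketch only (not part of the simulator): the intended
 * WFBarrier lifecycle, assuming one slot is allocated per WG. Names such
 * as num_wfs_in_wg are hypothetical placeholders.
 *
 *   WFBarrier bar;
 *   bar.setMaxBarrierCnt(num_wfs_in_wg); // WG dispatch: all WFs participate
 *   bar.incNumAtBarrier();               // a WF reaches the barrier
 *   if (bar.allAtBarrier()) {
 *       bar.reset();                     // this dynamic barrier is satisfied
 *   }
 *   bar.decMaxBarrierCnt();              // a WF exits early
 *   bar.release();                       // WG finishes; slot can be reused
 */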
class ComputeUnit : public ClockedObject
{
public:
// Execution resources
//
// The ordering of units is:
// Vector ALUs
// Scalar ALUs
// GM Pipe
// LM Pipe
// Scalar Mem Pipe
//
// Note: the ordering of units is important and the code assumes the
// above ordering. However, there may be more than one resource of
// each type (e.g., 4 VALUs or 2 SALUs)
int numVectorGlobalMemUnits;
// Resource control for global memory to VRF data/address bus
WaitClass glbMemToVrfBus;
// Resource control for Vector Register File->Global Memory pipe buses
WaitClass vrfToGlobalMemPipeBus;
// Resource control for Vector Global Memory execution unit
WaitClass vectorGlobalMemUnit;
int numVectorSharedMemUnits;
// Resource control for local memory to VRF data/address bus
WaitClass locMemToVrfBus;
// Resource control for Vector Register File->Local Memory pipe buses
WaitClass vrfToLocalMemPipeBus;
// Resource control for Vector Shared/Local Memory execution unit
WaitClass vectorSharedMemUnit;
int numScalarMemUnits;
// Resource control for scalar memory to SRF data/address bus
WaitClass scalarMemToSrfBus;
// Resource control for Scalar Register File->Scalar Memory pipe buses
WaitClass srfToScalarMemPipeBus;
// Resource control for Scalar Memory execution unit
WaitClass scalarMemUnit;
// vector ALU execution resources
int numVectorALUs;
std::vector<WaitClass> vectorALUs;
// scalar ALU execution resources
int numScalarALUs;
std::vector<WaitClass> scalarALUs;
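/*
 * Sketch of how these WaitClass resources are typically consumed
 * (illustration only; it assumes WaitClass exposes rdy()/set(), which is
 * an assumption about its interface, and busyCycles is a placeholder):
 *
 *   if (vectorALUs[unitId].rdy()) {
 *       // ... issue the instruction to this unit ...
 *       vectorALUs[unitId].set(busyCycles);  // mark the unit busy
 *   }
 */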
// Return total number of execution units on this CU
int numExeUnits() const;
// index into readyList of the first memory unit
int firstMemUnit() const;
// index into readyList of the last memory unit
int lastMemUnit() const;
// index into scalarALUs vector of SALU used by the wavefront
int mapWaveToScalarAlu(Wavefront *w) const;
// index into readyList of SALU used by wavefront
int mapWaveToScalarAluGlobalIdx(Wavefront *w) const;
// index into readyList of Global Memory unit used by wavefront
int mapWaveToGlobalMem(Wavefront *w) const;
// index into readyList of Local Memory unit used by wavefront
int mapWaveToLocalMem(Wavefront *w) const;
// index into readyList of Scalar Memory unit used by wavefront
int mapWaveToScalarMem(Wavefront *w) const;
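/*
 * Based on the unit ordering documented above, the global (readyList)
 * indices are laid out roughly as follows (sketch only; numExeUnits(),
 * firstMemUnit(), and lastMemUnit() are the authoritative mapping):
 *
 *   [0, numVectorALUs)                     vector ALUs
 *   [numVectorALUs, +numScalarALUs)        scalar ALUs
 *   next numVectorGlobalMemUnits entries   global memory pipe(s)
 *   next numVectorSharedMemUnits entries   local memory pipe(s)
 *   last numScalarMemUnits entries         scalar memory pipe(s)
 *
 * so firstMemUnit() would be numVectorALUs + numScalarALUs and
 * lastMemUnit() would be numExeUnits() - 1.
 */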
int vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
int coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
int numCyclesPerStoreTransfer; // number of cycles per vector store
int numCyclesPerLoadTransfer; // number of cycles per vector load
// track presence of dynamic instructions in the Schedule pipeline
// stage. This is used to check the readiness of the oldest,
// non-dispatched instruction of every WF in the Scoreboard stage.
std::unordered_set<uint64_t> pipeMap;
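/*
 * Sketch of the intended pipeMap protocol (illustrative only; it assumes
 * the oldest instruction's sequence number is used as the key, see
 * insertInPipeMap()/deleteFromPipeMap() below):
 *
 *   pipeMap.emplace(seq_num);  // instruction enters the Schedule stage
 *   pipeMap.count(seq_num);    // Scoreboard stage readiness check
 *   pipeMap.erase(seq_num);    // instruction leaves the pipeline
 */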
RegisterManager* registerManager;
FetchStage fetchStage;
ScoreboardCheckStage scoreboardCheckStage;
ScheduleStage scheduleStage;
ExecStage execStage;
GlobalMemPipeline globalMemoryPipe;
LocalMemPipeline localMemoryPipe;
ScalarMemPipeline scalarMemoryPipe;
EventFunctionWrapper tickEvent;
typedef ComputeUnitParams Params;
std::vector<std::vector<Wavefront*>> wfList;
int cu_id;
// array of vector register files, one per SIMD
std::vector<VectorRegisterFile*> vrf;
// array of scalar register files, one per SIMD
std::vector<ScalarRegisterFile*> srf;
// Width per VALU/SIMD unit: number of work items that can be executed
// on the vector ALU simultaneously in a SIMD unit
int simdWidth;
// number of pipe stages for bypassing data to next dependent single
// precision vector instruction inside the vector ALU pipeline
int spBypassPipeLength;
// number of pipe stages for bypassing data to next dependent double
// precision vector instruction inside the vector ALU pipeline
int dpBypassPipeLength;
// number of pipe stages for scalar ALU
int scalarPipeStages;
// number of pipe stages for operand collection & distribution network
int operandNetworkLength;
// number of cycles per instruction issue period
Cycles issuePeriod;
// VRF to GM Bus latency
Cycles vrf_gm_bus_latency;
// SRF to Scalar Mem Bus latency
Cycles srf_scm_bus_latency;
// VRF to LM Bus latency
Cycles vrf_lm_bus_latency;
// tracks the last cycle a vector instruction was executed on a SIMD
std::vector<uint64_t> lastExecCycle;
// tracks the number of dyn inst executed per SIMD
std::vector<uint64_t> instExecPerSimd;
// true if we allow a separate TLB per lane
bool perLaneTLB;
// if 0, TLB prefetching is off.
int prefetchDepth;
// if fixed-stride prefetching, this is the stride.
int prefetchStride;
std::vector<Addr> lastVaddrCU;
std::vector<std::vector<Addr>> lastVaddrSimd;
std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
enums::PrefetchType prefetchType;
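/*
 * Sketch of fixed-stride prefetch address generation under these
 * parameters (illustrative; pageBytes and vaddr are placeholders for the
 * ISA page size and the translated address). A prefetchDepth of 0
 * disables prefetching entirely:
 *
 *   for (int pf = 1; pf <= prefetchDepth; ++pf) {
 *       Addr pf_vaddr = vaddr + pf * prefetchStride * pageBytes;
 *       // issue a (functional) translation for pf_vaddr
 *   }
 */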
EXEC_POLICY exec_policy;
bool debugSegFault;
// Idle CU timeout in ticks
Tick idleCUTimeout;
int idleWfs;
bool functionalTLB;
bool localMemBarrier;
/*
* Whether to count page accesses.
*/
bool countPages;
Shader *shader;
Tick req_tick_latency;
Tick resp_tick_latency;
/**
* Number of WFs to schedule to each SIMD. This vector is populated
* by hasDispResources(), and consumed by the subsequent call to
* dispWorkgroup(), to schedule the specified number of WFs to the
* SIMD units. Entry I provides the number of WFs to schedule to SIMD I.
*/
std::vector<int> numWfsToSched;
// number of currently reserved vector registers per SIMD unit
std::vector<int> vectorRegsReserved;
// number of currently reserved scalar registers per SIMD unit
std::vector<int> scalarRegsReserved;
// number of vector registers per SIMD unit
int numVecRegsPerSimd;
// number of available scalar registers per SIMD unit
int numScalarRegsPerSimd;
// this hash map will keep track of page divergence
// per memory instruction per wavefront. The hash map
// is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
std::map<Addr, int> pagesTouched;
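/*
 * Sketch of how pagesTouched feeds the page-divergence statistic
 * (illustrative; pageSize is a placeholder for the ISA page size):
 *
 *   pagesTouched[roundDown(addr, pageSize)]++;             // per lane
 *   ...
 *   stats.pageDivergenceDist.sample(pagesTouched.size());  // per mem inst
 *   pagesTouched.clear();
 */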
void insertInPipeMap(Wavefront *w);
void deleteFromPipeMap(Wavefront *w);
ComputeUnit(const Params &p);
~ComputeUnit();
// Timing Functions
int oprNetPipeLength() const { return operandNetworkLength; }
int simdUnitWidth() const { return simdWidth; }
int spBypassLength() const { return spBypassPipeLength; }
int dpBypassLength() const { return dpBypassPipeLength; }
int scalarPipeLength() const { return scalarPipeStages; }
int storeBusLength() const { return numCyclesPerStoreTransfer; }
int loadBusLength() const { return numCyclesPerLoadTransfer; }
int wfSize() const { return wavefrontSize; }
void exec();
void initiateFetch(Wavefront *wavefront);
void fetch(PacketPtr pkt, Wavefront *wavefront);
void fillKernelState(Wavefront *w, HSAQueueEntry *task);
void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
HSAQueueEntry *task, int bar_id,
bool fetchContext=false);
void doInvalidate(RequestPtr req, int kernId);
void doFlush(GPUDynInstPtr gpuDynInst);
void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg);
bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg);
int cacheLineSize() const { return _cacheLineSize; }
int getCacheLineBits() const { return cacheLineBits; }
void resetRegisterPool();
private:
WFBarrier&
barrierSlot(int bar_id)
{
assert(bar_id > WFBarrier::InvalidID);
return wfBarrierSlots.at(bar_id);
}
int
getFreeBarrierId()
{
assert(freeBarrierIds.size());
auto free_bar_id = freeBarrierIds.begin();
int bar_id = *free_bar_id;
freeBarrierIds.erase(free_bar_id);
return bar_id;
}
public:
int numYetToReachBarrier(int bar_id);
bool allAtBarrier(int bar_id);
void incNumAtBarrier(int bar_id);
int numAtBarrier(int bar_id);
int maxBarrierCnt(int bar_id);
void resetBarrier(int bar_id);
void decMaxBarrierCnt(int bar_id);
void releaseBarrier(int bar_id);
void releaseWFsFromBarrier(int bar_id);
int numBarrierSlots() const { return _numBarrierSlots; }
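/*
 * Typical use of the barrier-slot interface (sketch only; bar_id comes
 * from getFreeBarrierId() at WG dispatch and num_wfs_in_wg is a
 * placeholder for the number of WFs in the WG):
 *
 *   barrierSlot(bar_id).setMaxBarrierCnt(num_wfs_in_wg); // dispatch
 *   incNumAtBarrier(bar_id);              // a WF reaches the barrier
 *   if (allAtBarrier(bar_id)) {
 *       releaseWFsFromBarrier(bar_id);    // wake the waiting WFs
 *       resetBarrier(bar_id);             // ready for the next instance
 *   }
 *   releaseBarrier(bar_id);               // WG completes; free the slot
 */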
template<typename c0, typename c1>
void doSmReturn(GPUDynInstPtr gpuDynInst);
virtual void init() override;
void sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt);
void sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt);
void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
bool kernelMemSync,
RequestPtr req=nullptr);
void handleMemPacket(PacketPtr pkt, int memport_index);
bool processTimingPacket(PacketPtr pkt);
void processFetchReturn(PacketPtr pkt);
void updatePageDivergenceDist(Addr addr);
RequestorID requestorId() { return _requestorId; }
bool isDone() const;
bool isVectorAluIdle(uint32_t simdId) const;
protected:
RequestorID _requestorId;
LdsState &lds;
public:
LdsState &
getLds() const
{
return lds;
}
int32_t
getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;
GEM5_NO_DISCARD bool sendToLds(GPUDynInstPtr gpuDynInst);
typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
pageDataStruct pageAccesses;
void exitCallback();
class GMTokenPort : public TokenRequestPort
{
public:
GMTokenPort(const std::string& name, SimObject *owner,
PortID id = InvalidPortID)
: TokenRequestPort(name, owner, id)
{ }
~GMTokenPort() { }
protected:
bool recvTimingResp(PacketPtr) { return false; }
void recvReqRetry() { }
};
// Manager for the number of tokens available to this compute unit to
// send global memory request packets to the coalescer. This is only used
// between the global memory pipe and the TCP coalescer.
TokenManager *memPortTokens;
GMTokenPort gmTokenPort;
/** Data access Port **/
class DataPort : public RequestPort
{
public:
DataPort(const std::string &_name, ComputeUnit *_cu, PortID id)
: RequestPort(_name, _cu, id), computeUnit(_cu) { }
bool snoopRangeSent;
struct SenderState : public Packet::SenderState
{
GPUDynInstPtr _gpuDynInst;
PortID port_index;
Packet::SenderState *saved;
SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
Packet::SenderState *sender_state=nullptr)
: _gpuDynInst(gpuDynInst),
port_index(_port_index),
saved(sender_state) { }
};
void processMemReqEvent(PacketPtr pkt);
EventFunctionWrapper *createMemReqEvent(PacketPtr pkt);
void processMemRespEvent(PacketPtr pkt);
EventFunctionWrapper *createMemRespEvent(PacketPtr pkt);
std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;
protected:
ComputeUnit *computeUnit;
virtual bool recvTimingResp(PacketPtr pkt);
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
virtual void recvFunctional(PacketPtr pkt) { }
virtual void recvRangeChange() { }
virtual void recvReqRetry();
virtual void
getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
{
resp.clear();
snoop = true;
}
};
// Scalar data cache access port
class ScalarDataPort : public RequestPort
{
public:
ScalarDataPort(const std::string &_name, ComputeUnit *_cu)
: RequestPort(_name, _cu), computeUnit(_cu)
{
}
bool recvTimingResp(PacketPtr pkt) override;
void recvReqRetry() override;
struct SenderState : public Packet::SenderState
{
SenderState(GPUDynInstPtr gpuDynInst,
Packet::SenderState *sender_state=nullptr)
: _gpuDynInst(gpuDynInst), saved(sender_state)
{
}
GPUDynInstPtr _gpuDynInst;
Packet::SenderState *saved;
};
class MemReqEvent : public Event
{
private:
ScalarDataPort &scalarDataPort;
PacketPtr pkt;
public:
MemReqEvent(ScalarDataPort &_scalar_data_port, PacketPtr _pkt)
: Event(), scalarDataPort(_scalar_data_port), pkt(_pkt)
{
setFlags(Event::AutoDelete);
}
void process();
const char *description() const;
};
std::deque<PacketPtr> retries;
private:
ComputeUnit *computeUnit;
};
// Instruction cache access port
class SQCPort : public RequestPort
{
public:
SQCPort(const std::string &_name, ComputeUnit *_cu)
: RequestPort(_name, _cu), computeUnit(_cu) { }
bool snoopRangeSent;
struct SenderState : public Packet::SenderState
{
Wavefront *wavefront;
Packet::SenderState *saved;
// kernel id to be used in handling I-Cache invalidate response
int kernId;
SenderState(Wavefront *_wavefront, Packet::SenderState
*sender_state=nullptr, int _kernId=-1)
: wavefront(_wavefront), saved(sender_state),
kernId(_kernId){ }
};
std::deque<std::pair<PacketPtr, Wavefront*>> retries;
protected:
ComputeUnit *computeUnit;
virtual bool recvTimingResp(PacketPtr pkt);
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
virtual void recvFunctional(PacketPtr pkt) { }
virtual void recvRangeChange() { }
virtual void recvReqRetry();
virtual void
getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
{
resp.clear();
snoop = true;
}
};
/** Data TLB port **/
class DTLBPort : public RequestPort
{
public:
DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID id)
: RequestPort(_name, _cu, id), computeUnit(_cu),
stalled(false)
{ }
bool isStalled() { return stalled; }
void stallPort() { stalled = true; }
void unstallPort() { stalled = false; }
/**
* here we queue all the translation requests that were
* not successfully sent.
*/
std::deque<PacketPtr> retries;
/** SenderState is information carried along with the packet
* throughout the TLB hierarchy
*/
struct SenderState: public Packet::SenderState
{
// the memInst that this is associated with
GPUDynInstPtr _gpuDynInst;
// the lane in the memInst this is associated with, so we send
// the memory request down the right port
PortID portIndex;
// constructor used for packets involved in timing accesses
SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
: _gpuDynInst(gpuDynInst), portIndex(port_index) { }
};
protected:
ComputeUnit *computeUnit;
bool stalled;
virtual bool recvTimingResp(PacketPtr pkt);
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
virtual void recvFunctional(PacketPtr pkt) { }
virtual void recvRangeChange() { }
virtual void recvReqRetry();
};
class ScalarDTLBPort : public RequestPort
{
public:
ScalarDTLBPort(const std::string &_name, ComputeUnit *_cu)
: RequestPort(_name, _cu), computeUnit(_cu), stalled(false)
{
}
struct SenderState : public Packet::SenderState
{
SenderState(GPUDynInstPtr gpuDynInst) : _gpuDynInst(gpuDynInst) { }
GPUDynInstPtr _gpuDynInst;
};
bool recvTimingResp(PacketPtr pkt) override;
void recvReqRetry() override { assert(false); }
bool isStalled() const { return stalled; }
void stallPort() { stalled = true; }
void unstallPort() { stalled = false; }
std::deque<PacketPtr> retries;
private:
ComputeUnit *computeUnit;
bool stalled;
};
class ITLBPort : public RequestPort
{
public:
ITLBPort(const std::string &_name, ComputeUnit *_cu)
: RequestPort(_name, _cu), computeUnit(_cu), stalled(false) { }
bool isStalled() { return stalled; }
void stallPort() { stalled = true; }
void unstallPort() { stalled = false; }
/**
* here we queue all the translation requests that were
* not successfully sent.
*/
std::deque<PacketPtr> retries;
/** SenderState is information carried along with the packet
* throughout the TLB hierarchy
*/
struct SenderState: public Packet::SenderState
{
// The wavefront associated with this request
Wavefront *wavefront;
SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
};
protected:
ComputeUnit *computeUnit;
bool stalled;
virtual bool recvTimingResp(PacketPtr pkt);
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
virtual void recvFunctional(PacketPtr pkt) { }
virtual void recvRangeChange() { }
virtual void recvReqRetry();
};
/**
* the port intended to communicate between the CU and its LDS
*/
class LDSPort : public RequestPort
{
public:
LDSPort(const std::string &_name, ComputeUnit *_cu)
: RequestPort(_name, _cu), computeUnit(_cu)
{
}
bool isStalled() const { return stalled; }
void stallPort() { stalled = true; }
void unstallPort() { stalled = false; }
/**
* here we queue all the requests that were
* not successfully sent.
*/
std::queue<PacketPtr> retries;
/**
* SenderState is information carried along with the packet, esp. the
* GPUDynInstPtr
*/
class SenderState: public Packet::SenderState
{
protected:
// The actual read/write/atomic request that goes with this command
GPUDynInstPtr _gpuDynInst = nullptr;
public:
SenderState(GPUDynInstPtr gpuDynInst):
_gpuDynInst(gpuDynInst)
{
}
GPUDynInstPtr
getMemInst() const
{
return _gpuDynInst;
}
};
virtual bool
sendTimingReq(PacketPtr pkt);
protected:
bool stalled = false; ///< whether or not it is stalled
ComputeUnit *computeUnit;
virtual bool
recvTimingResp(PacketPtr pkt);
virtual Tick
recvAtomic(PacketPtr pkt) { return 0; }
virtual void
recvFunctional(PacketPtr pkt)
{
}
virtual void
recvRangeChange()
{
}
virtual void
recvReqRetry();
};
/** The port to access the Local Data Store
* Can be connected to a LDS object
*/
LDSPort ldsPort;
TokenManager *
getTokenManager()
{
return memPortTokens;
}
/** The memory port for SIMD data accesses.
* Can be connected to PhysMem or Ruby for timing simulations
*/
std::vector<DataPort> memPort;
// port to the TLB hierarchy (i.e., the L1 TLB)
std::vector<DTLBPort> tlbPort;
// port to the scalar data cache
ScalarDataPort scalarDataPort;
// port to the scalar data TLB
ScalarDTLBPort scalarDTLBPort;
// port to the SQC (i.e. the I-cache)
SQCPort sqcPort;
// port to the SQC TLB (there's a separate TLB for each I-cache)
ITLBPort sqcTLBPort;
Port &
getPort(const std::string &if_name, PortID idx) override
{
if (if_name == "memory_port" && idx < memPort.size()) {
return memPort[idx];
} else if (if_name == "translation_port" && idx < tlbPort.size()) {
return tlbPort[idx];
} else if (if_name == "scalar_port") {
return scalarDataPort;
} else if (if_name == "scalar_tlb_port") {
return scalarDTLBPort;
} else if (if_name == "sqc_port") {
return sqcPort;
} else if (if_name == "sqc_tlb_port") {
return sqcTLBPort;
} else if (if_name == "ldsPort") {
return ldsPort;
} else if (if_name == "gmTokenPort") {
return gmTokenPort;
} else {
return ClockedObject::getPort(if_name, idx);
}
}
InstSeqNum getAndIncSeqNum() { return globalSeqNum++; }
private:
const int _cacheLineSize;
const int _numBarrierSlots;
int cacheLineBits;
InstSeqNum globalSeqNum;
int wavefrontSize;
/**
* TODO: Update these comments once the pipe stage interface has
* been fully refactored.
*
* Pipeline stage interfaces.
*
* Buffers used to communicate between the various pipeline stages.
*
* dispatchList: the list of waves that will be dispatched to each
* execution resource. An EXREADY state implies the dispatch list entry
* is non-empty and the execution unit has something to execute this
* cycle. Currently, the dispatch list entry for an execution resource
* can hold only one wave, because an execution resource can execute
* only one wave per cycle. dispatchList is used to communicate between
* the schedule and exec stages.
*
* At a high level, the following intra-/inter-stage communication occurs:
* SCB to SCH: readyList provides per exec resource list of waves that
* passed dependency and readiness checks. If selected by
* scheduler, attempt to add wave to schList conditional on
* RF support.
* SCH: schList holds waves that are gathering operands or waiting
* for execution resource availability. Once ready, waves are
* placed on the dispatchList as candidates for execution. A wave
* may spend multiple cycles in SCH stage, on the schList due to
* RF access conflicts or execution resource contention.
* SCH to EX: dispatchList holds waves that are ready to be executed.
* LM/FLAT arbitration may remove an LM wave and place it
* back on the schList. RF model may also force a wave back
* to the schList if using the detailed model.
*/
ScoreboardCheckToSchedule scoreboardCheckToSchedule;
ScheduleToExecute scheduleToExecute;
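/*
 * Sketch of how these interfaces are exercised each cycle (illustrative;
 * the authoritative sequencing lives in ComputeUnit::exec() and the
 * stage exec() methods). The stages are commonly ticked in reverse
 * pipeline order so each stage consumes the state produced by the
 * earlier stage on the previous cycle:
 *
 *   execStage.exec();            // drains scheduleToExecute (dispatchList)
 *   scheduleStage.exec();        // consumes scoreboardCheckToSchedule
 *                                // (readyList), fills scheduleToExecute
 *   scoreboardCheckStage.exec(); // fills scoreboardCheckToSchedule
 *   fetchStage.exec();
 */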
/**
* The barrier slots for this CU.
*/
std::vector<WFBarrier> wfBarrierSlots;
/**
* A set used to easily retrieve a free barrier ID.
*/
std::unordered_set<int> freeBarrierIds;
// holds the arrival time of the first cache block for a particular
// GPUDynInst. This is used to calculate the difference between the
// first and last cache block arrival times.
std::unordered_map<GPUDynInstPtr, Tick> headTailMap;
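/*
 * Sketch of how headTailMap is used (illustrative; last_block_for_inst
 * is a placeholder condition): record the arrival tick of the first
 * cache block for a GPUDynInst, then, when its last block arrives,
 * sample the difference into stats.headTailLatency:
 *
 *   auto it = headTailMap.find(gpuDynInst);
 *   if (it == headTailMap.end()) {
 *       headTailMap.emplace(gpuDynInst, curTick());   // first block
 *   } else if (last_block_for_inst) {
 *       stats.headTailLatency.sample(curTick() - it->second);
 *       headTailMap.erase(it);                        // last block
 *   }
 */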
public:
void updateInstStats(GPUDynInstPtr gpuDynInst);
int activeWaves;
struct ComputeUnitStats : public statistics::Group
{
ComputeUnitStats(statistics::Group *parent, int n_wf);
statistics::Scalar vALUInsts;
statistics::Formula vALUInstsPerWF;
statistics::Scalar sALUInsts;
statistics::Formula sALUInstsPerWF;
statistics::Scalar instCyclesVALU;
statistics::Scalar instCyclesSALU;
statistics::Scalar threadCyclesVALU;
statistics::Formula vALUUtilization;
statistics::Scalar ldsNoFlatInsts;
statistics::Formula ldsNoFlatInstsPerWF;
statistics::Scalar flatVMemInsts;
statistics::Formula flatVMemInstsPerWF;
statistics::Scalar flatLDSInsts;
statistics::Formula flatLDSInstsPerWF;
statistics::Scalar vectorMemWrites;
statistics::Formula vectorMemWritesPerWF;
statistics::Scalar vectorMemReads;
statistics::Formula vectorMemReadsPerWF;
statistics::Scalar scalarMemWrites;
statistics::Formula scalarMemWritesPerWF;
statistics::Scalar scalarMemReads;
statistics::Formula scalarMemReadsPerWF;
statistics::Formula vectorMemReadsPerKiloInst;
statistics::Formula vectorMemWritesPerKiloInst;
statistics::Formula vectorMemInstsPerKiloInst;
statistics::Formula scalarMemReadsPerKiloInst;
statistics::Formula scalarMemWritesPerKiloInst;
statistics::Formula scalarMemInstsPerKiloInst;
// Cycles required to send register source (addr and data) from
// register files to memory pipeline, per SIMD.
statistics::Vector instCyclesVMemPerSimd;
statistics::Vector instCyclesScMemPerSimd;
statistics::Vector instCyclesLdsPerSimd;
statistics::Scalar globalReads;
statistics::Scalar globalWrites;
statistics::Formula globalMemInsts;
statistics::Scalar argReads;
statistics::Scalar argWrites;
statistics::Formula argMemInsts;
statistics::Scalar spillReads;
statistics::Scalar spillWrites;
statistics::Formula spillMemInsts;
statistics::Scalar groupReads;
statistics::Scalar groupWrites;
statistics::Formula groupMemInsts;
statistics::Scalar privReads;
statistics::Scalar privWrites;
statistics::Formula privMemInsts;
statistics::Scalar readonlyReads;
statistics::Scalar readonlyWrites;
statistics::Formula readonlyMemInsts;
statistics::Scalar kernargReads;
statistics::Scalar kernargWrites;
statistics::Formula kernargMemInsts;
statistics::Distribution waveLevelParallelism;
// the following stats compute the avg. TLB access latency per
// uncoalesced request (only for data)
statistics::Scalar tlbRequests;
statistics::Scalar tlbCycles;
statistics::Formula tlbLatency;
// hitsPerTLBLevel[x] are the hits in Level x TLB.
// x = 0 is the page table.
statistics::Vector hitsPerTLBLevel;
statistics::Scalar ldsBankAccesses;
statistics::Distribution ldsBankConflictDist;
// over all memory instructions executed over all wavefronts
// how many touched 0-4 pages, 4-8, ..., 60-64 pages
statistics::Distribution pageDivergenceDist;
// count of non-flat global memory vector instructions executed
statistics::Scalar dynamicGMemInstrCnt;
// count of flat global memory vector instructions executed
statistics::Scalar dynamicFlatMemInstrCnt;
statistics::Scalar dynamicLMemInstrCnt;
statistics::Scalar wgBlockedDueBarrierAllocation;
statistics::Scalar wgBlockedDueLdsAllocation;
// Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
// active when the instruction is committed, this number is still
// incremented by 1
statistics::Scalar numInstrExecuted;
// Number of cycles among successive instruction executions across all
// wavefronts of the same CU
statistics::Distribution execRateDist;
// number of individual vector operations executed
statistics::Scalar numVecOpsExecuted;
// number of individual f16 vector operations executed
statistics::Scalar numVecOpsExecutedF16;
// number of individual f32 vector operations executed
statistics::Scalar numVecOpsExecutedF32;
// number of individual f64 vector operations executed
statistics::Scalar numVecOpsExecutedF64;
// number of individual FMA 16,32,64 vector operations executed
statistics::Scalar numVecOpsExecutedFMA16;
statistics::Scalar numVecOpsExecutedFMA32;
statistics::Scalar numVecOpsExecutedFMA64;
// number of individual MAC 16,32,64 vector operations executed
statistics::Scalar numVecOpsExecutedMAC16;
statistics::Scalar numVecOpsExecutedMAC32;
statistics::Scalar numVecOpsExecutedMAC64;
// number of individual MAD 16,32,64 vector operations executed
statistics::Scalar numVecOpsExecutedMAD16;
statistics::Scalar numVecOpsExecutedMAD32;
statistics::Scalar numVecOpsExecutedMAD64;
// total number of two op FP vector operations executed
statistics::Scalar numVecOpsExecutedTwoOpFP;
// Total cycles that something is running on the GPU
statistics::Scalar totalCycles;
statistics::Formula vpc; // vector ops per cycle
statistics::Formula vpc_f16; // f16 vector ops per cycle
statistics::Formula vpc_f32; // f32 vector ops per cycle
statistics::Formula vpc_f64; // f64 vector ops per cycle
statistics::Formula ipc; // vector instructions per cycle
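// Sketch of how the throughput/latency formulas are composed from the
// counters above (illustrative, not the authoritative definitions,
// which live in the ComputeUnitStats constructor):
//   vpc        = numVecOpsExecuted / totalCycles
//   ipc        = numInstrExecuted  / totalCycles
//   tlbLatency = tlbCycles / tlbRequests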
statistics::Distribution controlFlowDivergenceDist;
statistics::Distribution activeLanesPerGMemInstrDist;
statistics::Distribution activeLanesPerLMemInstrDist;
// total number of ALU instructions executed (vector + scalar)
statistics::Formula numALUInstsExecuted;
// number of times a WG cannot start due to lack of free VGPRs in SIMDs
statistics::Scalar numTimesWgBlockedDueVgprAlloc;
// number of times a WG cannot start due to lack of free SGPRs in SIMDs
statistics::Scalar numTimesWgBlockedDueSgprAlloc;
statistics::Scalar numCASOps;
statistics::Scalar numFailedCASOps;
statistics::Scalar completedWfs;
statistics::Scalar completedWGs;
// distribution of the latency difference between the first and last
// cache block arrival ticks
statistics::Distribution headTailLatency;
// Track the amount of interleaving between wavefronts on each SIMD.
// This stat is sampled using instExecPerSimd to compute the number
// of instructions that have been executed on a SIMD between a WF
// executing two successive instructions.
statistics::VectorDistribution instInterleave;
} stats;
};
} // namespace gem5
#endif // __COMPUTE_UNIT_HH__