// src/gpu-compute/lds_state.hh - public/gem5 - Git at Google

 /*
  * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * 1. Redistributions of source code must retain the above copyright notice,
  * this list of conditions and the following disclaimer.
  *
  * 2. Redistributions in binary form must reproduce the above copyright notice,
  * this list of conditions and the following disclaimer in the documentation
  * and/or other materials provided with the distribution.
  *
  * 3. Neither the name of the copyright holder nor the names of its
  * contributors may be used to endorse or promote products derived from this
  * software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */

 #ifndef __LDS_STATE_HH__
 #define __LDS_STATE_HH__

 #include <array>
 #include <cstdint>
 #include <cstring>
 #include <queue>
 #include <string>
 #include <unordered_map>
 #include <utility>
 #include <vector>

 #include "gpu-compute/misc.hh"
 #include "mem/port.hh"
 #include "params/LdsState.hh"
 #include "sim/clocked_object.hh"

 namespace gem5
 {

 class ComputeUnit;

 /**
  * A slice of the overall LDS, intended to be associated with an individual
  * workgroup.
  *
  * The slice is a flat array of bytes. read(), write() and atomic() treat the
  * sizeof(T) bytes that start at a byte offset as one value of type T. An
  * access that does not lie completely inside the slice is ignored: reads and
  * atomics return T(0) and writes are dropped.
  */
 class LdsChunk
 {
   public:
     /**
      * create a chunk holding x_size zero-initialized bytes
      */
     LdsChunk(const uint32_t x_size):
         chunk(x_size)
     {
     }

     /**
      * create an empty chunk; every access to it is out of bounds
      */
     LdsChunk() {}

     /**
      * a read operation
      *
      * @param index byte offset of the value within this chunk
      * @return the T stored at index, or T(0) when the bytes
      *         [index, index + sizeof(T)) are not all inside the chunk
      */
     template<class T>
     T
     read(const uint32_t index)
     {
         /**
          * For reads that are outside the bounds of the LDS
          * chunk allocated to this WG we return 0.
          */
         if (!inBounds<T>(index)) {
             return (T)0;
         }

         // copy the bytes out instead of dereferencing a casted pointer so
         // that offsets that are not aligned for T are handled correctly
         T value;
         std::memcpy(&value, &chunk[index], sizeof(T));
         return value;
     }

     /**
      * a write operation
      *
      * @param index byte offset of the destination within this chunk
      * @param value the value whose sizeof(T) bytes are stored at index
      */
     template<class T>
     void
     write(const uint32_t index, const T value)
     {
         /**
          * Writes that are outside the bounds of the LDS
          * chunk allocated to this WG are dropped.
          */
         if (!inBounds<T>(index)) {
             return;
         }

         std::memcpy(&chunk[index], &value, sizeof(T));
     }

     /**
      * an atomic operation
      *
      * @param index byte offset of the operand within this chunk
      * @param amoOp functor that is invoked with a pointer to the bytes at
      *        index (presumably it updates them in place; see its
      *        definition)
      * @return the T that was stored at index before amoOp ran, or T(0)
      *         when the access is out of bounds (amoOp is then not invoked)
      */
     template<class T>
     T
     atomic(const uint32_t index, AtomicOpFunctorPtr amoOp)
     {
         /**
          * Atomics that are outside the bounds of the LDS
          * chunk allocated to this WG are dropped.
          */
         if (!inBounds<T>(index)) {
             return (T)0;
         }

         uint8_t *const target = &chunk[index];

         // capture the old value before the functor touches the bytes
         T previous;
         std::memcpy(&previous, target, sizeof(T));

         (*amoOp)(target);
         return previous;
     }

     /**
      * get the size of this chunk in bytes
      */
     std::vector<uint8_t>::size_type
     size() const
     {
         return chunk.size();
     }

   protected:
     /**
      * check that a whole T fits into the chunk at the given byte offset
      *
      * Written without adding index and sizeof(T) so that the test cannot
      * wrap around for offsets close to the maximum of uint32_t.
      */
     template<class T>
     bool
     inBounds(const uint32_t index) const
     {
         const std::vector<uint8_t>::size_type limit = chunk.size();
         return sizeof(T) <= limit && index <= limit - sizeof(T);
     }

     // the actual data store for this slice of the LDS
     std::vector<uint8_t> chunk;
 };

 // Local Data Share (LDS) State per Wavefront (contents of the LDS region
 // allocated to the WorkGroup of this Wavefront)
 class LdsState: public ClockedObject
 {
   protected:

     /**
      * an event to allow event-driven execution
      */
     class TickEvent: public Event
     {
       protected:

         LdsState *ldsState = nullptr;

         Tick nextTick = 0;

       public:

         TickEvent(LdsState *_ldsState) :
             ldsState(_ldsState)
         {
         }

         virtual void
         process();

         void
         schedule(Tick when)
         {
             mainEventQueue[0]->schedule(this, when);
         }

         void
         deschedule()
         {
             mainEventQueue[0]->deschedule(this);
         }
     };

     /**
      * CuSidePort is the LDS Port closer to the CU side
      */
     class CuSidePort: public ResponsePort
     {
       public:
         CuSidePort(const std::string &_name, LdsState *_ownerLds) :
                 ResponsePort(_name), ownerLds(_ownerLds)
         {
         }

       protected:
         LdsState *ownerLds;

         virtual bool
         recvTimingReq(PacketPtr pkt);

         virtual Tick
         recvAtomic(PacketPtr pkt)
         {
           return 0;
         }

         virtual void
         recvFunctional(PacketPtr pkt);

         virtual void
         recvRangeChange()
         {
         }

         virtual void
         recvRetry();

         virtual void
         recvRespRetry();

         virtual AddrRangeList
         getAddrRanges() const
         {
           AddrRangeList ranges;
           ranges.push_back(ownerLds->getAddrRange());
           return ranges;
         }

         template<typename T>
         void
         loadData(PacketPtr packet);

         template<typename T>
         void
         storeData(PacketPtr packet);

         template<typename T>
         void
         atomicOperation(PacketPtr packet);
     };

   protected:

     /**
      * the lds reference counter
      * The key is the workgroup ID and dispatch ID
      * The value is the number of wavefronts that reference this LDS, as
      * wavefronts are launched, the counter goes up for that workgroup and when
      * they return it decreases, once it reaches 0 then this chunk of the LDS
      * is returned to the available pool. However,it is deallocated on the 1->0
      * transition, not whenever the counter is 0 as it always starts with 0
      * when the workgroup asks for space
      */
     std::unordered_map<uint32_t,
                        std::unordered_map<uint32_t, int32_t>> refCounter;

     // the map that allows workgroups to access their own chunk of the LDS
     std::unordered_map<uint32_t,
                        std::unordered_map<uint32_t, LdsChunk>> chunkMap;

     // an event to allow the LDS to wake up at a specified time
     TickEvent tickEvent;

     // the queue of packets that are going back to the CU after a
     // read/write/atomic op
     // TODO need to make this have a maximum size to create flow control
     std::queue<std::pair<Tick, PacketPtr>> returnQueue;

     // whether or not there are pending responses
     bool retryResp = false;

     bool
     process();

     GPUDynInstPtr
     getDynInstr(PacketPtr packet);

     bool
     processPacket(PacketPtr packet);

     unsigned
     countBankConflicts(PacketPtr packet, unsigned *bankAccesses);

     unsigned
     countBankConflicts(GPUDynInstPtr gpuDynInst,
                        unsigned *numBankAccesses);

   public:
     using Params = LdsStateParams;

     LdsState(const Params &params);

     // prevent copy construction
     LdsState(const LdsState&) = delete;

     ~LdsState()
     {
         parent = nullptr;
     }

     bool
     isRetryResp() const
     {
         return retryResp;
     }

     void
     setRetryResp(const bool value)
     {
         retryResp = value;
     }

     // prevent assignment
     LdsState &
     operator=(const LdsState &) = delete;

     /**
      * use the dynamic wave id to create or just increase the reference count
      */
     int
     increaseRefCounter(const uint32_t dispatchId, const uint32_t wgId)
     {
         int refCount = getRefCounter(dispatchId, wgId);
         fatal_if(refCount < 0,
                  "reference count should not be below zero");
         return ++refCounter[dispatchId][wgId];
     }

     /**
      * decrease the reference count after making sure it is in the list
      * give back this chunk if the ref counter has reached 0
      */
     int
     decreaseRefCounter(const uint32_t dispatchId, const uint32_t wgId)
     {
       int refCount = getRefCounter(dispatchId, wgId);

       fatal_if(refCount <= 0,
               "reference count should not be below zero or at zero to"
               "decrement");

       refCounter[dispatchId][wgId]--;

       if (refCounter[dispatchId][wgId] == 0) {
         releaseSpace(dispatchId, wgId);
         return 0;
       } else {
         return refCounter[dispatchId][wgId];
       }
     }

     /**
      * return the current reference count for this workgroup id
      */
     int
     getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
     {
       auto dispatchIter = chunkMap.find(dispatchId);
       fatal_if(dispatchIter == chunkMap.end(),
                "could not locate this dispatch id [%d]", dispatchId);

       auto workgroup = dispatchIter->second.find(wgId);
       fatal_if(workgroup == dispatchIter->second.end(),
                "could not find this workgroup id within this dispatch id"
                " did[%d] wgid[%d]", dispatchId, wgId);

       auto refCountIter = refCounter.find(dispatchId);
       if (refCountIter == refCounter.end()) {
         fatal("could not locate this dispatch id [%d]", dispatchId);
       } else {
         auto workgroup = refCountIter->second.find(wgId);
         if (workgroup == refCountIter->second.end()) {
           fatal("could not find this workgroup id within this dispatch id"
                   " did[%d] wgid[%d]", dispatchId, wgId);
         } else {
           return refCounter.at(dispatchId).at(wgId);
         }
       }

       fatal("should not reach this point");
       return 0;
     }

     /**
      * assign a parent and request this amount of space be set aside
      * for this wgid
      */
     LdsChunk *
     reserveSpace(const uint32_t dispatchId, const uint32_t wgId,
             const uint32_t size)
     {
         if (chunkMap.find(dispatchId) != chunkMap.end()) {
             panic_if(
                 chunkMap[dispatchId].find(wgId) != chunkMap[dispatchId].end(),
                 "duplicate workgroup ID asking for space in the LDS "
                 "did[%d] wgid[%d]", dispatchId, wgId);
         }

         if (bytesAllocated + size > maximumSize) {
             return nullptr;
         } else {
             bytesAllocated += size;

             auto value = chunkMap[dispatchId].emplace(wgId, LdsChunk(size));
             panic_if(!value.second, "was unable to allocate a new chunkMap");

             // make an entry for this workgroup
             refCounter[dispatchId][wgId] = 0;

             return &chunkMap[dispatchId][wgId];
         }
     }

     /*
      * return pointer to lds chunk for wgid
      */
     LdsChunk *
     getLdsChunk(const uint32_t dispatchId, const uint32_t wgId)
     {
       fatal_if(chunkMap.find(dispatchId) == chunkMap.end(),
           "fetch for unknown dispatch ID did[%d]", dispatchId);

       fatal_if(chunkMap[dispatchId].find(wgId) == chunkMap[dispatchId].end(),
           "fetch for unknown workgroup ID wgid[%d] in dispatch ID did[%d]",
           wgId, dispatchId);

       return &chunkMap[dispatchId][wgId];
     }

     bool
     returnQueuePush(std::pair<Tick, PacketPtr> thePair);

     Tick
     earliestReturnTime() const
     {
         // TODO set to max(lastCommand+1, curTick())
         return returnQueue.empty() ? curTick() : returnQueue.back().first;
     }

     void
     setParent(ComputeUnit *x_parent);

     // accessors
     ComputeUnit *
     getParent() const
     {
         return parent;
     }

     std::string
     getName()
     {
         return _name;
     }

     int
     getBanks() const
     {
         return banks;
     }

     ComputeUnit *
     getComputeUnit() const
     {
         return parent;
     }

     int
     getBankConflictPenalty() const
     {
         return bankConflictPenalty;
     }

     /**
      * get the allocated size for this workgroup
      */
     std::size_t
     ldsSize(const uint32_t x_wgId)
     {
         return chunkMap[x_wgId].size();
     }

     AddrRange
     getAddrRange() const
     {
         return range;
     }

     Port &
     getPort(const std::string &if_name, PortID idx)
     {
         if (if_name == "cuPort") {
             // TODO need to set name dynamically at this point?
             return cuPort;
         } else {
             fatal("cannot resolve the port name " + if_name);
         }
     }

     /**
      * can this much space be reserved for a workgroup?
      */
     bool
     canReserve(uint32_t x_size) const
     {
       return bytesAllocated + x_size <= maximumSize;
     }

   private:
     /**
      * give back the space
      */
     bool
     releaseSpace(const uint32_t x_dispatchId, const uint32_t x_wgId)
     {
         auto dispatchIter = chunkMap.find(x_dispatchId);

         if (dispatchIter == chunkMap.end()) {
           fatal("dispatch id not found [%d]", x_dispatchId);
         } else {
           auto workgroupIter = dispatchIter->second.find(x_wgId);
           if (workgroupIter == dispatchIter->second.end()) {
             fatal("workgroup id [%d] not found in dispatch id [%d]",
                     x_wgId, x_dispatchId);
           }
         }

         fatal_if(bytesAllocated < chunkMap[x_dispatchId][x_wgId].size(),
                  "releasing more space than was allocated");

         bytesAllocated -= chunkMap[x_dispatchId][x_wgId].size();
         chunkMap[x_dispatchId].erase(chunkMap[x_dispatchId].find(x_wgId));
         return true;
     }

     // the port that connects this LDS to its owner CU
     CuSidePort cuPort;

     ComputeUnit* parent = nullptr;

     std::string _name;

     // the number of bytes currently reserved by all workgroups
     int bytesAllocated = 0;

     // the size of the LDS, the most bytes available
     int maximumSize;

     // Address range of this memory
     AddrRange range;

     // the penalty, in cycles, for each LDS bank conflict
     int bankConflictPenalty = 0;

     // the number of banks in the LDS underlying data store
     int banks = 0;
 };

 } // namespace gem5

 #endif // __LDS_STATE_HH__
	/*
	* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
	* All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	*
	* 1. Redistributions of source code must retain the above copyright notice,
	* this list of conditions and the following disclaimer.
	*
	* 2. Redistributions in binary form must reproduce the above copyright notice,
	* this list of conditions and the following disclaimer in the documentation
	* and/or other materials provided with the distribution.
	*
	* 3. Neither the name of the copyright holder nor the names of its
	* contributors may be used to endorse or promote products derived from this
	* software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
	* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	* POSSIBILITY OF SUCH DAMAGE.
	*/

	#ifndef __LDS_STATE_HH__
	#define __LDS_STATE_HH__

	#include <array>
	#include <queue>
	#include <string>
	#include <unordered_map>
	#include <utility>
	#include <vector>

	#include "gpu-compute/misc.hh"
	#include "mem/port.hh"
	#include "params/LdsState.hh"
	#include "sim/clocked_object.hh"

	namespace gem5
	{

	class ComputeUnit;

	/**
	* this represents a slice of the overall LDS, intended to be associated with
	* an individual workgroup
	*/
	class LdsChunk
	{
	public:
	LdsChunk(const uint32_t x_size):
	chunk(x_size)
	{
	}

	LdsChunk() {}

	/**
	* a read operation
	*/
	template<class T>
	T
	read(const uint32_t index)
	{
	/**
	* For reads that are outside the bounds of the LDS
	* chunk allocated to this WG we return 0.
	*/
	if (index >= chunk.size()) {
	return (T)0;
	}

	T p0 = (T ) (&(chunk.at(index)));
	return *p0;
	}

	/**
	* a write operation
	*/
	template<class T>
	void
	write(const uint32_t index, const T value)
	{
	/**
	* Writes that are outside the bounds of the LDS
	* chunk allocated to this WG are dropped.
	*/
	if (index >= chunk.size()) {
	return;
	}

	T p0 = (T ) (&(chunk.at(index)));
	*p0 = value;
	}

	/**
	* an atomic operation
	*/
	template<class T>
	T
	atomic(const uint32_t index, AtomicOpFunctorPtr amoOp)
	{
	/**
	* Atomics that are outside the bounds of the LDS
	* chunk allocated to this WG are dropped.
	*/
	if (index >= chunk.size()) {
	return (T)0;
	}
	T p0 = (T ) (&(chunk.at(index)));
	T tmp = *p0;

	(amoOp)((uint8_t )p0);
	return tmp;
	}

	/**
	* get the size of this chunk
	*/
	std::vector<uint8_t>::size_type
	size() const
	{
	return chunk.size();
	}

	protected:
	// the actual data store for this slice of the LDS
	std::vector<uint8_t> chunk;
	};

	// Local Data Share (LDS) State per Wavefront (contents of the LDS region
	// allocated to the WorkGroup of this Wavefront)
	class LdsState: public ClockedObject
	{
	protected:

	/**
	* an event to allow event-driven execution
	*/
	class TickEvent: public Event
	{
	protected:

	LdsState *ldsState = nullptr;

	Tick nextTick = 0;

	public:

	TickEvent(LdsState *_ldsState) :
	ldsState(_ldsState)
	{
	}

	virtual void
	process();

	void
	schedule(Tick when)
	{
	mainEventQueue[0]->schedule(this, when);
	}

	void
	deschedule()
	{
	mainEventQueue[0]->deschedule(this);
	}
	};

	/**
	* CuSidePort is the LDS Port closer to the CU side
	*/
	class CuSidePort: public ResponsePort
	{
	public:
	CuSidePort(const std::string &_name, LdsState *_ownerLds) :
	ResponsePort(_name), ownerLds(_ownerLds)
	{
	}

	protected:
	LdsState *ownerLds;

	virtual bool
	recvTimingReq(PacketPtr pkt);

	virtual Tick
	recvAtomic(PacketPtr pkt)
	{
	return 0;
	}

	virtual void
	recvFunctional(PacketPtr pkt);

	virtual void
	recvRangeChange()
	{
	}

	virtual void
	recvRetry();

	virtual void
	recvRespRetry();

	virtual AddrRangeList
	getAddrRanges() const
	{
	AddrRangeList ranges;
	ranges.push_back(ownerLds->getAddrRange());
	return ranges;
	}

	template<typename T>
	void
	loadData(PacketPtr packet);

	template<typename T>
	void
	storeData(PacketPtr packet);

	template<typename T>
	void
	atomicOperation(PacketPtr packet);
	};

	protected:

	/**
	* the lds reference counter
	* The key is the workgroup ID and dispatch ID
	* The value is the number of wavefronts that reference this LDS, as
	* wavefronts are launched, the counter goes up for that workgroup and when
	* they return it decreases, once it reaches 0 then this chunk of the LDS
	* is returned to the available pool. However,it is deallocated on the 1->0
	* transition, not whenever the counter is 0 as it always starts with 0
	* when the workgroup asks for space
	*/
	std::unordered_map<uint32_t,
	std::unordered_map<uint32_t, int32_t>> refCounter;

	// the map that allows workgroups to access their own chunk of the LDS
	std::unordered_map<uint32_t,
	std::unordered_map<uint32_t, LdsChunk>> chunkMap;

	// an event to allow the LDS to wake up at a specified time
	TickEvent tickEvent;

	// the queue of packets that are going back to the CU after a
	// read/write/atomic op
	// TODO need to make this have a maximum size to create flow control
	std::queue<std::pair<Tick, PacketPtr>> returnQueue;

	// whether or not there are pending responses
	bool retryResp = false;

	bool
	process();

	GPUDynInstPtr
	getDynInstr(PacketPtr packet);

	bool
	processPacket(PacketPtr packet);

	unsigned
	countBankConflicts(PacketPtr packet, unsigned *bankAccesses);

	unsigned
	countBankConflicts(GPUDynInstPtr gpuDynInst,
	unsigned *numBankAccesses);

	public:
	using Params = LdsStateParams;

	LdsState(const Params &params);

	// prevent copy construction
	LdsState(const LdsState&) = delete;

	~LdsState()
	{
	parent = nullptr;
	}

	bool
	isRetryResp() const
	{
	return retryResp;
	}

	void
	setRetryResp(const bool value)
	{
	retryResp = value;
	}

	// prevent assignment
	LdsState &
	operator=(const LdsState &) = delete;

	/**
	* use the dynamic wave id to create or just increase the reference count
	*/
	int
	increaseRefCounter(const uint32_t dispatchId, const uint32_t wgId)
	{
	int refCount = getRefCounter(dispatchId, wgId);
	fatal_if(refCount < 0,
	"reference count should not be below zero");
	return ++refCounter[dispatchId][wgId];
	}

	/**
	* decrease the reference count after making sure it is in the list
	* give back this chunk if the ref counter has reached 0
	*/
	int
	decreaseRefCounter(const uint32_t dispatchId, const uint32_t wgId)
	{
	int refCount = getRefCounter(dispatchId, wgId);

	fatal_if(refCount <= 0,
	"reference count should not be below zero or at zero to"
	"decrement");

	refCounter[dispatchId][wgId]--;

	if (refCounter[dispatchId][wgId] == 0) {
	releaseSpace(dispatchId, wgId);
	return 0;
	} else {
	return refCounter[dispatchId][wgId];
	}
	}

	/**
	* return the current reference count for this workgroup id
	*/
	int
	getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
	{
	auto dispatchIter = chunkMap.find(dispatchId);
	fatal_if(dispatchIter == chunkMap.end(),
	"could not locate this dispatch id [%d]", dispatchId);

	auto workgroup = dispatchIter->second.find(wgId);
	fatal_if(workgroup == dispatchIter->second.end(),
	"could not find this workgroup id within this dispatch id"
	" did[%d] wgid[%d]", dispatchId, wgId);

	auto refCountIter = refCounter.find(dispatchId);
	if (refCountIter == refCounter.end()) {
	fatal("could not locate this dispatch id [%d]", dispatchId);
	} else {
	auto workgroup = refCountIter->second.find(wgId);
	if (workgroup == refCountIter->second.end()) {
	fatal("could not find this workgroup id within this dispatch id"
	" did[%d] wgid[%d]", dispatchId, wgId);
	} else {
	return refCounter.at(dispatchId).at(wgId);
	}
	}

	fatal("should not reach this point");
	return 0;
	}

	/**
	* assign a parent and request this amount of space be set aside
	* for this wgid
	*/
	LdsChunk *
	reserveSpace(const uint32_t dispatchId, const uint32_t wgId,
	const uint32_t size)
	{
	if (chunkMap.find(dispatchId) != chunkMap.end()) {
	panic_if(
	chunkMap[dispatchId].find(wgId) != chunkMap[dispatchId].end(),
	"duplicate workgroup ID asking for space in the LDS "
	"did[%d] wgid[%d]", dispatchId, wgId);
	}

	if (bytesAllocated + size > maximumSize) {
	return nullptr;
	} else {
	bytesAllocated += size;

	auto value = chunkMap[dispatchId].emplace(wgId, LdsChunk(size));
	panic_if(!value.second, "was unable to allocate a new chunkMap");

	// make an entry for this workgroup
	refCounter[dispatchId][wgId] = 0;

	return &chunkMap[dispatchId][wgId];
	}
	}

	/*
	* return pointer to lds chunk for wgid
	*/
	LdsChunk *
	getLdsChunk(const uint32_t dispatchId, const uint32_t wgId)
	{
	fatal_if(chunkMap.find(dispatchId) == chunkMap.end(),
	"fetch for unknown dispatch ID did[%d]", dispatchId);

	fatal_if(chunkMap[dispatchId].find(wgId) == chunkMap[dispatchId].end(),
	"fetch for unknown workgroup ID wgid[%d] in dispatch ID did[%d]",
	wgId, dispatchId);

	return &chunkMap[dispatchId][wgId];
	}

	bool
	returnQueuePush(std::pair<Tick, PacketPtr> thePair);

	Tick
	earliestReturnTime() const
	{
	// TODO set to max(lastCommand+1, curTick())
	return returnQueue.empty() ? curTick() : returnQueue.back().first;
	}

	void
	setParent(ComputeUnit *x_parent);

	// accessors
	ComputeUnit *
	getParent() const
	{
	return parent;
	}

	std::string
	getName()
	{
	return _name;
	}

	int
	getBanks() const
	{
	return banks;
	}

	ComputeUnit *
	getComputeUnit() const
	{
	return parent;
	}

	int
	getBankConflictPenalty() const
	{
	return bankConflictPenalty;
	}

	/**
	* get the allocated size for this workgroup
	*/
	std::size_t
	ldsSize(const uint32_t x_wgId)
	{
	return chunkMap[x_wgId].size();
	}

	AddrRange
	getAddrRange() const
	{
	return range;
	}

	Port &
	getPort(const std::string &if_name, PortID idx)
	{
	if (if_name == "cuPort") {
	// TODO need to set name dynamically at this point?
	return cuPort;
	} else {
	fatal("cannot resolve the port name " + if_name);
	}
	}

	/**
	* can this much space be reserved for a workgroup?
	*/
	bool
	canReserve(uint32_t x_size) const
	{
	return bytesAllocated + x_size <= maximumSize;
	}

	private:
	/**
	* give back the space
	*/
	bool
	releaseSpace(const uint32_t x_dispatchId, const uint32_t x_wgId)
	{
	auto dispatchIter = chunkMap.find(x_dispatchId);

	if (dispatchIter == chunkMap.end()) {
	fatal("dispatch id not found [%d]", x_dispatchId);
	} else {
	auto workgroupIter = dispatchIter->second.find(x_wgId);
	if (workgroupIter == dispatchIter->second.end()) {
	fatal("workgroup id [%d] not found in dispatch id [%d]",
	x_wgId, x_dispatchId);
	}
	}

	fatal_if(bytesAllocated < chunkMap[x_dispatchId][x_wgId].size(),
	"releasing more space than was allocated");

	bytesAllocated -= chunkMap[x_dispatchId][x_wgId].size();
	chunkMap[x_dispatchId].erase(chunkMap[x_dispatchId].find(x_wgId));
	return true;
	}

	// the port that connects this LDS to its owner CU
	CuSidePort cuPort;

	ComputeUnit* parent = nullptr;

	std::string _name;

	// the number of bytes currently reserved by all workgroups
	int bytesAllocated = 0;

	// the size of the LDS, the most bytes available
	int maximumSize;

	// Address range of this memory
	AddrRange range;

	// the penalty, in cycles, for each LDS bank conflict
	int bankConflictPenalty = 0;

	// the number of banks in the LDS underlying data store
	int banks = 0;
	};

	} // namespace gem5

	#endif // __LDS_STATE_HH__