src/gpu-compute/lds_state.cc - public/gem5 - Git at Google

 /*
  * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
  * All rights reserved.
  *
  * For use for simulation and test purposes only
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * 1. Redistributions of source code must retain the above copyright notice,
  * this list of conditions and the following disclaimer.
  *
  * 2. Redistributions in binary form must reproduce the above copyright notice,
  * this list of conditions and the following disclaimer in the documentation
  * and/or other materials provided with the distribution.
  *
  * 3. Neither the name of the copyright holder nor the names of its
  * contributors may be used to endorse or promote products derived from this
  * software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */

 #include "gpu-compute/lds_state.hh"

 #include <array>
 #include <cstdio>
 #include <cstdlib>

 #include "gpu-compute/compute_unit.hh"
 #include "gpu-compute/gpu_dyn_inst.hh"
 #include "gpu-compute/shader.hh"

 /**
  * Build the LDS model from its parameter object (usable from SWIG).
  *
  * The size, range, bank-conflict penalty and bank count are copied out of
  * params, after which the configuration is validated: the bank count has
  * to be a positive power of two and the size has to be positive and even.
  * Any violation terminates the simulation through fatal_if.
  *
  * @param params the parameter object describing this LDS
  */
 LdsState::LdsState(const Params *params) :
     ClockedObject(params),
     tickEvent(this),
     cuPort(name() + ".port", this),
     maximumSize(params->size),
     range(params->range),
     bankConflictPenalty(params->bankConflictPenalty),
     banks(params->banks)
 {
     const auto bankCount = params->banks;
     const auto ldsSize = params->size;

     fatal_if(bankCount <= 0,
              "Number of LDS banks should be positive number");
     // a power of two has a single bit set, so clearing its lowest set bit
     // must leave nothing behind
     fatal_if((bankCount & (bankCount - 1)) != 0,
              "Number of LDS banks should be a power of 2");

     fatal_if(ldsSize <= 0,
              "cannot allocate an LDS with a size less than 1");
     fatal_if((ldsSize % 2) != 0,
              "the LDS should be an even number");
 }

 /**
  * Factory hook on the parameter class, needed by the SWIG compiler.
  *
  * @return a newly heap-allocated LdsState configured from this
  *         parameter object
  */
 LdsState *
 LdsStateParams::create()
 {
     LdsState *lds = new LdsState(this);
     return lds;
 }

 /**
  * Attach this LDS to its compute unit and derive the object name from it.
  *
  * @param x_parent the owning compute unit; must not be null and must not
  *                 be the compute unit that is already registered
  */
 void
 LdsState::setParent(ComputeUnit *x_parent)
 {
     // a null compute unit can never be a valid owner
     fatal_if(x_parent == nullptr, "x_parent should not be nullptr");
     // registering the compute unit we already hold is treated as an error
     // NOTE(review): a *different* non-null parent is not rejected here and
     // would replace the current one -- confirm that this is intended
     fatal_if(parent == x_parent,
              "should not be setting the parent twice");

     parent = x_parent;
     _name = parent->name() + ".LdsState";
 }

 /**
  * Recover the GPU memory instruction attached to a packet and count the
  * LDS bank conflicts it causes.
  *
  * The sender-state chain is followed through its predecessor links to the
  * oldest entry, which has to be a ComputeUnit::LDSPort::SenderState;
  * anything else (including a packet with no sender state at all) is a
  * fatal error.
  *
  * @param packet the request whose sender state carries the instruction
  * @param bankAccesses passed through to the instruction overload, which
  *                     increments it once per counted access
  * @return the conflict count computed by the instruction overload
  */
 unsigned
 LdsState::countBankConflicts(PacketPtr packet, unsigned *bankAccesses)
 {
     // walk to the root of the chain; a packet without any sender state
     // leaves the pointer null, which the check below rejects instead of
     // it being dereferenced here
     Packet::SenderState *rootState = packet->senderState;
     while (rootState && rootState->predecessor) {
         rootState = rootState->predecessor;
     }

     // dynamic_cast yields null for a null input or a mismatching type
     const ComputeUnit::LDSPort::SenderState *senderState =
         dynamic_cast<ComputeUnit::LDSPort::SenderState *>(rootState);

     fatal_if(!senderState,
              "did not get the right sort of sender state");

     GPUDynInstPtr gpuDynInst = senderState->getMemInst();

     return countBankConflicts(gpuDynInst, bankAccesses);
 }

 /**
  * Count the total number of bank conflicts for a local memory instruction.
  *
  * The work items are examined in passes of min(wavefront size, banks)
  * lanes. Within a pass every active lane contributes its address; for
  * loads and stores repeated addresses are collapsed to a single access.
  * Each remaining address is mapped to bank (address % banks) and the pass
  * contributes the highest per-bank access count to the result.
  *
  * @param gpuDynInst the memory instruction providing exec_mask and addr
  * @param numBankAccesses incremented once for every access that is
  *                        counted against a bank
  * @return the sum over all passes of the worst per-bank access count
  */
 unsigned
 LdsState::countBankConflicts(GPUDynInstPtr gpuDynInst,
                              unsigned *numBankAccesses)
 {
     // marks a lane that is inactive or whose address was already seen
     const Addr unusedLane = std::numeric_limits<Addr>::max();
     const auto laneCount = parent->wfSize();

     // the number of LDS banks being touched by the memory instruction
     const int lanesPerPass = std::min(laneCount, banks);
     // a wavefront wider than the bank count needs several passes to cover
     // all of its work items
     const int passes = (laneCount > lanesPerPass) ?
         (laneCount / lanesPerPass) : 1;

     int totalConflicts = 0;
     // every entry is rewritten at the start of each pass
     std::vector<Addr> laneAddr(lanesPerPass);
     std::vector<int> hitsPerBank;

     for (int pass = 0; pass < passes; ++pass) {
         const int firstLane = pass * lanesPerPass;
         hitsPerBank.assign(banks, 0);
         int worstBank = 0;

         // gather the addresses of the active work items of this pass
         for (int lane = 0; lane < lanesPerPass; ++lane) {
             if (gpuDynInst->exec_mask[firstLane + lane]) {
                 laneAddr[lane] = gpuDynInst->addr[firstLane + lane];
             } else {
                 laneAddr[lane] = unusedLane;
             }
         }

         if (gpuDynInst->isLoad() || gpuDynInst->isStore()) {
             // keep only the first occurrence of every address
             for (int lane = 1; lane < lanesPerPass; ++lane) {
                 if (laneAddr[lane] == unusedLane) {
                     continue;
                 }
                 for (int prev = 0; prev < lane; ++prev) {
                     if (laneAddr[prev] == laneAddr[lane]) {
                         laneAddr[lane] = unusedLane;
                         break;
                     }
                 }
             }
         }

         // tally the accesses per bank and remember the busiest bank
         for (int lane = 0; lane < lanesPerPass; ++lane) {
             if (laneAddr[lane] == unusedLane) {
                 continue;
             }
             const int bankId = laneAddr[lane] % banks;
             const int hits = ++hitsPerBank[bankId];
             worstBank = std::max(worstBank, hits);
             // Count the number of LDS banks accessed.
             // Identical addresses were masked above, so whatever is left
             // has to be serialized when it lands in the same bank.
             ++(*numBankAccesses);
         }
         totalConflicts += worstBank;
     }

     panic_if(totalConflicts > laneCount,
              "Max bank conflicts should match num of work items per instr");
     return totalConflicts;
 }

 /**
  * Timing request arriving from the CU: hand it to the owning LDS.
  *
  * @param packet the incoming request
  * @return whatever LdsState::processPacket reports for the packet
  */
 bool
 LdsState::CuSidePort::recvTimingReq(PacketPtr packet)
 {
     const bool accepted = ownerLds->processPacket(packet);
     return accepted;
 }

 /**
  * Fetch the GPU memory instruction stored in a packet's sender state.
  *
  * Only the packet's immediate sender state is inspected; it has to be a
  * ComputeUnit::LDSPort::SenderState, otherwise the simulation is stopped
  * with a fatal error rather than dereferencing a null pointer.
  *
  * @param packet the packet carrying the sender state
  * @return the memory instruction recorded in that sender state
  */
 GPUDynInstPtr
 LdsState::getDynInstr(PacketPtr packet)
 {
     ComputeUnit::LDSPort::SenderState *ss =
         dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
                      packet->senderState);

     // a failed (or null-input) dynamic_cast returns nullptr
     fatal_if(!ss,
              "did not get the right sort of sender state");

     return ss->getMemInst();
 }

 /**
  * Process an incoming packet: account for its bank behaviour, compute the
  * time at which it completes and add it to the return queue.
  *
  * @param packet the request received from the CU
  * @return the result of returnQueuePush for the packet
  */
 bool
 LdsState::processPacket(PacketPtr packet)
 {
     unsigned bankAccesses = 0;
     // the number of conflicts this packet will have when accessing the LDS
     const unsigned bankConflicts = countBankConflicts(packet, &bankAccesses);
     // count the total number of physical LDS bank accessed
     parent->ldsBankAccesses += bankAccesses;
     // count the LDS bank conflicts. A value of 1 means at most one access
     // per bank, i.e. no conflict, hence the "- 1". The count is 0 when no
     // lane contributed an access; guard that case so the unsigned
     // subtraction cannot wrap around to a huge sample.
     parent->ldsBankConflictDist.sample(
         bankConflicts > 0 ? bankConflicts - 1 : 0);

     GPUDynInstPtr dynInst = getDynInstr(packet);
     // stores use the store bus; loads and everything else use the load bus
     const bool storeOnly = !dynInst->isLoad() && dynInst->isStore();
     int busLength = storeOnly ? parent->storeBusLength() :
         parent->loadBusLength();
     // delay for accessing the LDS: conflict penalty plus bus occupancy
     Tick processingTime =
         parent->shader->ticks(bankConflicts * bankConflictPenalty) +
         parent->shader->ticks(busLength);
     // the delay is added on top of earliestReturnTime()
     // NOTE(review): presumably that is the later of "now" and the last
     // queued packet's return time -- confirm against its definition
     Tick doneAt = earliestReturnTime() + processingTime;
     // then store it for processing
     return returnQueuePush(std::make_pair(doneAt, packet));
 }

 /**
  * Append a (return time, packet) pair to the queue of packets that are
  * waiting to be sent back.
  *
  * @param thePair the completion tick and the packet to return
  * @return always true
  */
 bool
 LdsState::returnQueuePush(std::pair<Tick, PacketPtr> thePair)
 {
     // TODO add time limits (e.g. one packet per cycle) and queue size limits
     // and implement flow control
     returnQueue.push(thePair);

     // with no wakeup pending nobody would look at the queue, so scan it now
     const bool wakeupPending = tickEvent.scheduled();
     if (!wakeupPending) {
         process();
     }

     return true;
 }

 /**
  * Functional-mode access. The LDS does not support it, so any call ends
  * the simulation with a fatal error.
  *
  * @param pkt the functional packet (unused)
  */
 void
 LdsState::CuSidePort::recvFunctional(PacketPtr pkt)
 {
     fatal("not implemented");
 }

 /**
  * The peer asks for a response to be retried: clear the owning LDS's
  * retry flag and let it scan its return queue again.
  */
 void
 LdsState::CuSidePort::recvRespRetry()
 {
     auto *lds = ownerLds;

     // TODO verify that this is the right way to do this
     // a retry is only expected while the LDS has flagged a pending one
     assert(lds->isRetryResp());
     lds->setRetryResp(false);
     lds->process();
 }

 /**
  * Retry notification. Not supported by this port, so any call ends the
  * simulation with a fatal error.
  */
 void
 LdsState::CuSidePort::recvRetry()
 {
     fatal("not implemented");
 }

 /**
  * look for packets to return at this time
  */
 bool
 LdsState::process()
 {
     Tick now = clockEdge();

     // send back completed packets
     while (!returnQueue.empty() && returnQueue.front().first <= now) {
         PacketPtr packet = returnQueue.front().second;

         ComputeUnit::LDSPort::SenderState *ss =
             dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
                             packet->senderState);

         GPUDynInstPtr gpuDynInst = ss->getMemInst();

         gpuDynInst->initiateAcc(gpuDynInst);

         packet->makeTimingResponse();

         returnQueue.pop();

         bool success = cuPort.sendTimingResp(packet);

         if (!success) {
             retryResp = true;
             panic("have not handled timing responses being NACK'd when sent"
                             "back");
         }
     }

     // determine the next wakeup time
     if (!returnQueue.empty()) {

         Tick next = returnQueue.front().first;

         if (tickEvent.scheduled()) {

             if (next < tickEvent.when()) {

                 tickEvent.deschedule();
                 tickEvent.schedule(next);
             }
         } else {
             tickEvent.schedule(next);
         }
     }

     return true;
 }

 /**
  * Event callback: when the scheduled wakeup fires, let the LDS scan its
  * return queue.
  */
 void
 LdsState::TickEvent::process()
 {
     // the result of LdsState::process() is not used here
     ldsState->process();
 }
	/*
	* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
	* All rights reserved.
	*
	* For use for simulation and test purposes only
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	*
	* 1. Redistributions of source code must retain the above copyright notice,
	* this list of conditions and the following disclaimer.
	*
	* 2. Redistributions in binary form must reproduce the above copyright notice,
	* this list of conditions and the following disclaimer in the documentation
	* and/or other materials provided with the distribution.
	*
	* 3. Neither the name of the copyright holder nor the names of its
	* contributors may be used to endorse or promote products derived from this
	* software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
	* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	* POSSIBILITY OF SUCH DAMAGE.
	*/

	#include "gpu-compute/lds_state.hh"

	#include <array>
	#include <cstdio>
	#include <cstdlib>

	#include "gpu-compute/compute_unit.hh"
	#include "gpu-compute/gpu_dyn_inst.hh"
	#include "gpu-compute/shader.hh"

	/**
	 * Build the LDS model from its parameter object (usable from SWIG).
	 *
	 * The size, range, bank-conflict penalty and bank count are copied out of
	 * params, after which the configuration is validated: the bank count has
	 * to be a positive power of two and the size has to be positive and even.
	 * Any violation terminates the simulation through fatal_if.
	 *
	 * @param params the parameter object describing this LDS
	 */
	LdsState::LdsState(const Params *params) :
	    ClockedObject(params),
	    tickEvent(this),
	    cuPort(name() + ".port", this),
	    maximumSize(params->size),
	    range(params->range),
	    bankConflictPenalty(params->bankConflictPenalty),
	    banks(params->banks)
	{
	    const auto bankCount = params->banks;
	    const auto ldsSize = params->size;

	    fatal_if(bankCount <= 0,
	             "Number of LDS banks should be positive number");
	    // a power of two has a single bit set, so clearing its lowest set bit
	    // must leave nothing behind
	    fatal_if((bankCount & (bankCount - 1)) != 0,
	             "Number of LDS banks should be a power of 2");

	    fatal_if(ldsSize <= 0,
	             "cannot allocate an LDS with a size less than 1");
	    fatal_if((ldsSize % 2) != 0,
	             "the LDS should be an even number");
	}

	/**
	 * Factory hook on the parameter class, needed by the SWIG compiler.
	 *
	 * @return a newly heap-allocated LdsState configured from this
	 *         parameter object
	 */
	LdsState *
	LdsStateParams::create()
	{
	    LdsState *lds = new LdsState(this);
	    return lds;
	}

	/**
	 * Attach this LDS to its compute unit and derive the object name from it.
	 *
	 * @param x_parent the owning compute unit; must not be null and must not
	 *                 be the compute unit that is already registered
	 */
	void
	LdsState::setParent(ComputeUnit *x_parent)
	{
	    // a null compute unit can never be a valid owner
	    fatal_if(x_parent == nullptr, "x_parent should not be nullptr");
	    // registering the compute unit we already hold is treated as an error
	    // NOTE(review): a *different* non-null parent is not rejected here and
	    // would replace the current one -- confirm that this is intended
	    fatal_if(parent == x_parent,
	             "should not be setting the parent twice");

	    parent = x_parent;
	    _name = parent->name() + ".LdsState";
	}

	/**
	 * Recover the GPU memory instruction attached to a packet and count the
	 * LDS bank conflicts it causes.
	 *
	 * The sender-state chain is followed through its predecessor links to the
	 * oldest entry, which has to be a ComputeUnit::LDSPort::SenderState;
	 * anything else (including a packet with no sender state at all) is a
	 * fatal error.
	 *
	 * @param packet the request whose sender state carries the instruction
	 * @param bankAccesses passed through to the instruction overload, which
	 *                     increments it once per counted access
	 * @return the conflict count computed by the instruction overload
	 */
	unsigned
	LdsState::countBankConflicts(PacketPtr packet, unsigned *bankAccesses)
	{
	    // walk to the root of the chain; a packet without any sender state
	    // leaves the pointer null, which the check below rejects instead of
	    // it being dereferenced here
	    Packet::SenderState *rootState = packet->senderState;
	    while (rootState && rootState->predecessor) {
	        rootState = rootState->predecessor;
	    }

	    // dynamic_cast yields null for a null input or a mismatching type
	    const ComputeUnit::LDSPort::SenderState *senderState =
	        dynamic_cast<ComputeUnit::LDSPort::SenderState *>(rootState);

	    fatal_if(!senderState,
	             "did not get the right sort of sender state");

	    GPUDynInstPtr gpuDynInst = senderState->getMemInst();

	    return countBankConflicts(gpuDynInst, bankAccesses);
	}

	/**
	 * Count the total number of bank conflicts for a local memory instruction.
	 *
	 * The work items are examined in passes of min(wavefront size, banks)
	 * lanes. Within a pass every active lane contributes its address; for
	 * loads and stores repeated addresses are collapsed to a single access.
	 * Each remaining address is mapped to bank (address % banks) and the pass
	 * contributes the highest per-bank access count to the result.
	 *
	 * @param gpuDynInst the memory instruction providing exec_mask and addr
	 * @param numBankAccesses incremented once for every access that is
	 *                        counted against a bank
	 * @return the sum over all passes of the worst per-bank access count
	 */
	unsigned
	LdsState::countBankConflicts(GPUDynInstPtr gpuDynInst,
	                             unsigned *numBankAccesses)
	{
	    // marks a lane that is inactive or whose address was already seen
	    const Addr unusedLane = std::numeric_limits<Addr>::max();
	    const auto laneCount = parent->wfSize();

	    // the number of LDS banks being touched by the memory instruction
	    const int lanesPerPass = std::min(laneCount, banks);
	    // a wavefront wider than the bank count needs several passes to cover
	    // all of its work items
	    const int passes = (laneCount > lanesPerPass) ?
	        (laneCount / lanesPerPass) : 1;

	    int totalConflicts = 0;
	    // every entry is rewritten at the start of each pass
	    std::vector<Addr> laneAddr(lanesPerPass);
	    std::vector<int> hitsPerBank;

	    for (int pass = 0; pass < passes; ++pass) {
	        const int firstLane = pass * lanesPerPass;
	        hitsPerBank.assign(banks, 0);
	        int worstBank = 0;

	        // gather the addresses of the active work items of this pass
	        for (int lane = 0; lane < lanesPerPass; ++lane) {
	            if (gpuDynInst->exec_mask[firstLane + lane]) {
	                laneAddr[lane] = gpuDynInst->addr[firstLane + lane];
	            } else {
	                laneAddr[lane] = unusedLane;
	            }
	        }

	        // plain logical OR: the operator must not carry escape characters
	        if (gpuDynInst->isLoad() || gpuDynInst->isStore()) {
	            // keep only the first occurrence of every address
	            for (int lane = 1; lane < lanesPerPass; ++lane) {
	                if (laneAddr[lane] == unusedLane) {
	                    continue;
	                }
	                for (int prev = 0; prev < lane; ++prev) {
	                    if (laneAddr[prev] == laneAddr[lane]) {
	                        laneAddr[lane] = unusedLane;
	                        break;
	                    }
	                }
	            }
	        }

	        // tally the accesses per bank and remember the busiest bank
	        for (int lane = 0; lane < lanesPerPass; ++lane) {
	            if (laneAddr[lane] == unusedLane) {
	                continue;
	            }
	            const int bankId = laneAddr[lane] % banks;
	            const int hits = ++hitsPerBank[bankId];
	            worstBank = std::max(worstBank, hits);
	            // Count the number of LDS banks accessed.
	            // Identical addresses were masked above, so whatever is left
	            // has to be serialized when it lands in the same bank.
	            ++(*numBankAccesses);
	        }
	        totalConflicts += worstBank;
	    }

	    panic_if(totalConflicts > laneCount,
	             "Max bank conflicts should match num of work items per instr");
	    return totalConflicts;
	}

	/**
	 * Timing request arriving from the CU: hand it to the owning LDS.
	 *
	 * @param packet the incoming request
	 * @return whatever LdsState::processPacket reports for the packet
	 */
	bool
	LdsState::CuSidePort::recvTimingReq(PacketPtr packet)
	{
	    const bool accepted = ownerLds->processPacket(packet);
	    return accepted;
	}

	/**
	 * Fetch the GPU memory instruction stored in a packet's sender state.
	 *
	 * Only the packet's immediate sender state is inspected; it has to be a
	 * ComputeUnit::LDSPort::SenderState, otherwise the simulation is stopped
	 * with a fatal error rather than dereferencing a null pointer.
	 *
	 * @param packet the packet carrying the sender state
	 * @return the memory instruction recorded in that sender state
	 */
	GPUDynInstPtr
	LdsState::getDynInstr(PacketPtr packet)
	{
	    ComputeUnit::LDSPort::SenderState *ss =
	        dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
	                     packet->senderState);

	    // a failed (or null-input) dynamic_cast returns nullptr
	    fatal_if(!ss,
	             "did not get the right sort of sender state");

	    return ss->getMemInst();
	}

	/**
	 * Process an incoming packet: account for its bank behaviour, compute the
	 * time at which it completes and add it to the return queue.
	 *
	 * @param packet the request received from the CU
	 * @return the result of returnQueuePush for the packet
	 */
	bool
	LdsState::processPacket(PacketPtr packet)
	{
	    unsigned bankAccesses = 0;
	    // the number of conflicts this packet will have when accessing the LDS
	    const unsigned bankConflicts = countBankConflicts(packet, &bankAccesses);
	    // count the total number of physical LDS bank accessed
	    parent->ldsBankAccesses += bankAccesses;
	    // count the LDS bank conflicts. A value of 1 means at most one access
	    // per bank, i.e. no conflict, hence the "- 1". The count is 0 when no
	    // lane contributed an access; guard that case so the unsigned
	    // subtraction cannot wrap around to a huge sample.
	    parent->ldsBankConflictDist.sample(
	        bankConflicts > 0 ? bankConflicts - 1 : 0);

	    GPUDynInstPtr dynInst = getDynInstr(packet);
	    // stores use the store bus; loads and everything else use the load bus
	    const bool storeOnly = !dynInst->isLoad() && dynInst->isStore();
	    int busLength = storeOnly ? parent->storeBusLength() :
	        parent->loadBusLength();
	    // delay for accessing the LDS: conflict penalty plus bus occupancy
	    Tick processingTime =
	        parent->shader->ticks(bankConflicts * bankConflictPenalty) +
	        parent->shader->ticks(busLength);
	    // the delay is added on top of earliestReturnTime()
	    // NOTE(review): presumably that is the later of "now" and the last
	    // queued packet's return time -- confirm against its definition
	    Tick doneAt = earliestReturnTime() + processingTime;
	    // then store it for processing
	    return returnQueuePush(std::make_pair(doneAt, packet));
	}

	/**
	 * Append a (return time, packet) pair to the queue of packets that are
	 * waiting to be sent back.
	 *
	 * @param thePair the completion tick and the packet to return
	 * @return always true
	 */
	bool
	LdsState::returnQueuePush(std::pair<Tick, PacketPtr> thePair)
	{
	    // TODO add time limits (e.g. one packet per cycle) and queue size limits
	    // and implement flow control
	    returnQueue.push(thePair);

	    // with no wakeup pending nobody would look at the queue, so scan it now
	    const bool wakeupPending = tickEvent.scheduled();
	    if (!wakeupPending) {
	        process();
	    }

	    return true;
	}

	/**
	 * Functional-mode access. The LDS does not support it, so any call ends
	 * the simulation with a fatal error.
	 *
	 * @param pkt the functional packet (unused)
	 */
	void
	LdsState::CuSidePort::recvFunctional(PacketPtr pkt)
	{
	    fatal("not implemented");
	}

	/**
	 * The peer asks for a response to be retried: clear the owning LDS's
	 * retry flag and let it scan its return queue again.
	 */
	void
	LdsState::CuSidePort::recvRespRetry()
	{
	    auto *lds = ownerLds;

	    // TODO verify that this is the right way to do this
	    // a retry is only expected while the LDS has flagged a pending one
	    assert(lds->isRetryResp());
	    lds->setRetryResp(false);
	    lds->process();
	}

	/**
	 * Retry notification. Not supported by this port, so any call ends the
	 * simulation with a fatal error.
	 */
	void
	LdsState::CuSidePort::recvRetry()
	{
	    fatal("not implemented");
	}

	/**
	* look for packets to return at this time
	*/
	bool
	LdsState::process()
	{
	Tick now = clockEdge();

	// send back completed packets
	while (!returnQueue.empty() && returnQueue.front().first <= now) {
	PacketPtr packet = returnQueue.front().second;

	ComputeUnit::LDSPort::SenderState *ss =
	dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
	packet->senderState);

	GPUDynInstPtr gpuDynInst = ss->getMemInst();

	gpuDynInst->initiateAcc(gpuDynInst);

	packet->makeTimingResponse();

	returnQueue.pop();

	bool success = cuPort.sendTimingResp(packet);

	if (!success) {
	retryResp = true;
	panic("have not handled timing responses being NACK'd when sent"
	"back");
	}
	}

	// determine the next wakeup time
	if (!returnQueue.empty()) {

	Tick next = returnQueue.front().first;

	if (tickEvent.scheduled()) {

	if (next < tickEvent.when()) {

	tickEvent.deschedule();
	tickEvent.schedule(next);
	}
	} else {
	tickEvent.schedule(next);
	}
	}

	return true;
	}

	/**
	 * Event callback: when the scheduled wakeup fires, let the LDS scan its
	 * return queue.
	 */
	void
	LdsState::TickEvent::process()
	{
	    // the result of LdsState::process() is not used here
	    ldsState->process();
	}