/*
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __SHADER_HH__
#define __SHADER_HH__

#include <functional>
#include <string>

#include "arch/isa.hh"
#include "base/statistics.hh"
#include "base/stats/group.hh"
#include "base/types.hh"
#include "cpu/simple/atomic.hh"
#include "cpu/simple/timing.hh"
#include "cpu/simple_thread.hh"
#include "cpu/thread_context.hh"
#include "cpu/thread_state.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/gpu_tlb.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/lds_state.hh"
#include "mem/page_table.hh"
#include "mem/port.hh"
#include "mem/request.hh"
#include "params/Shader.hh"
#include "sim/faults.hh"
#include "sim/process.hh"
#include "sim/sim_object.hh"
class BaseTLB;
class GPUCommandProcessor;
class GPUDispatcher;
namespace TheISA
{
class GpuTLB;
}

static const int LDS_SIZE = 65536;

// Aperture (APE) registers define the base/limit pair for the
// ATC-mapped memory space. Currently the only APEs we consider are
// for GPUVM/LDS/scratch. The APEs are assigned unique values on a
// per-device basis.
struct ApertureRegister
{
Addr base;
Addr limit;
};
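
// A minimal illustration with hypothetical values (not taken from a
// real device): an APE covers the inclusive range [base, limit],
// which is exactly the containment test the is*Ape() helpers in
// Shader perform.
//
//   ApertureRegister ape{0x100000000, 0x1000fffff};
//   bool hit = addr >= ape.base && addr <= ape.limit;
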
// Class Shader: This describes a single shader instance. Most
// configurations will only have a single shader.
class Shader : public ClockedObject
{
private:
ApertureRegister _gpuVmApe;
ApertureRegister _ldsApe;
ApertureRegister _scratchApe;
Addr shHiddenPrivateBaseVmid;
// Number of active CUs attached to this shader
int _activeCus;
// Last tick that all CUs attached to this shader were inactive
Tick _lastInactiveTick;
public:
typedef ShaderParams Params;
enum hsail_mode_e {SIMT, VECTOR_SCALAR};
GPUDispatcher &dispatcher();
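
// These sampling hooks feed the latency distributions in ShaderStats
// below (per their definitions in shader.cc): sampleLoad and
// sampleStore record per-access latencies, while the round-trip
// variants break a vmem instruction's lifetime into the
// init-to-coalesce / network / enqueue / complete stages.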
void sampleLoad(const Tick accessTime);
void sampleStore(const Tick accessTime);
void sampleInstRoundTrip(std::vector<Tick> roundTripTime);
void sampleLineRoundTrip(const std::map<Addr,
std::vector<Tick>> &roundTripTime);
SimpleThread *cpuThread;
ThreadContext *gpuTc;
BaseCPU *cpuPointer;
const ApertureRegister&
gpuVmApe() const
{
return _gpuVmApe;
}
const ApertureRegister&
ldsApe() const
{
return _ldsApe;
}
const ApertureRegister&
scratchApe() const
{
return _scratchApe;
}
bool
isGpuVmApe(Addr addr) const
{
bool is_gpu_vm = addr >= _gpuVmApe.base && addr <= _gpuVmApe.limit;
return is_gpu_vm;
}
bool
isLdsApe(Addr addr) const
{
bool is_lds = addr >= _ldsApe.base && addr <= _ldsApe.limit;
return is_lds;
}
bool
isScratchApe(Addr addr) const
{
bool is_scratch
= addr >= _scratchApe.base && addr <= _scratchApe.limit;
return is_scratch;
}
Addr
getScratchBase()
{
return _scratchApe.base;
}
Addr
getHiddenPrivateBase()
{
return shHiddenPrivateBaseVmid;
}
void
initShHiddenPrivateBase(Addr queueBase, uint32_t offset)
{
Addr sh_hidden_base_new = queueBase - offset;
// We initialize sh_hidden_private_base_vmid from the AMD queue
// descriptor of the first queue. It is expected to be the same
// for all queues belonging to the same process.
if (shHiddenPrivateBaseVmid != sh_hidden_base_new) {
// Do not panic if shHiddenPrivateBaseVmid == 0, i.e., if it is
// uninitialized. Panic only if the value has been initialized
// and we later see a different base.
panic_if(shHiddenPrivateBaseVmid != 0,
"Currently we support only single process\n");
}
shHiddenPrivateBaseVmid = sh_hidden_base_new;
}
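
// Usage sketch with hypothetical values (not from a real queue
// descriptor): the first registered queue initializes the hidden
// private base; any later queue must compute the same base.
//
//   shader->initShHiddenPrivateBase(0x2000, 0x100);
//   // shHiddenPrivateBaseVmid == 0x1f00
//   shader->initShHiddenPrivateBase(0x3000, 0x100);
//   // panics: 0x2f00 != 0x1f00 and the base is already initialized
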
EventFunctionWrapper tickEvent;
// Is this simulation using timing mode for memory accesses?
bool timingSim;
hsail_mode_e hsail_mode;
// If set, issue an acquire packet at kernel launch
int impl_kern_launch_acq;
// If set, issue a release packet at kernel end
int impl_kern_end_rel;
// If set, fetch returns may be coissued with instructions
int coissue_return;
// If set, always dump all 64 GPRs to the trace
int trace_vgpr_all;
// Number of compute units (CUs) in the shader
int n_cu;
// Number of wavefront slots per SIMD per CU
int n_wf;
// The size of global memory
int globalMemSize;
// Tracks CU that rr dispatcher should attempt scheduling
int nextSchedCu;
// Size of the scheduled-add queue
uint32_t sa_n;
// Pointers to the values to be incremented
std::vector<int*> sa_val;
// Ticks at which to perform each increment
std::vector<uint64_t> sa_when;
// Amounts to increment by
std::vector<int32_t> sa_x;
// List of compute units (CUs)
std::vector<ComputeUnit*> cuList;
GPUCommandProcessor &gpuCmdProc;
GPUDispatcher &_dispatcher;
int64_t max_valu_insts;
int64_t total_valu_insts;
Shader(const Params &p);
~Shader();
virtual void init();
// Run shader scheduled adds
void execScheduledAdds();
// Schedule a 32-bit value to be incremented some time in the future
void ScheduleAdd(int *val, Tick when, int x);
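
// Usage sketch (hypothetical counter variable; the interpretation of
// 'when' as a delay added to curTick() follows the definition in
// shader.cc): schedule *dec_cnt to be decremented by one, applied by
// execScheduledAdds() once the scheduled tick is reached.
//
//   int dec_cnt = 1;
//   shader->ScheduleAdd(&dec_cnt, 1000, -1);
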
bool processTimingPacket(PacketPtr pkt);
void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
MemCmd cmd, bool suppress_func_errors);
void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id);
void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id,
bool suppress_func_errors);
void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id);
void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id,
bool suppress_func_errors);
void doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
bool suppress_func_errors, int cu_id);
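
// Usage sketch (hypothetical address and CU id): functionally read
// four bytes of GPU virtual memory on behalf of CU 0, suppressing
// functional-access errors.
//
//   uint32_t data;
//   shader->ReadMem(vaddr, &data, sizeof(data), 0,
//                   true /* suppress_func_errors */);
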
void
registerCU(int cu_id, ComputeUnit *compute_unit)
{
cuList[cu_id] = compute_unit;
}
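
// Usage sketch (hypothetical CU vector; in practice the Shader's
// constructor wires up the CUs passed in via its params): each
// compute unit is recorded in cuList at its id.
//
//   for (int i = 0; i < n_cu; ++i)
//       shader->registerCU(i, cu_vec[i]);
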
void prepareInvalidate(HSAQueueEntry *task);
void prepareFlush(GPUDynInstPtr gpuDynInst);
bool dispatchWorkgroups(HSAQueueEntry *task);
Addr mmap(int length);
void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode);
void updateContext(int cid);
void notifyCuSleep();
void
incVectorInstSrcOperand(int num_operands)
{
stats.vectorInstSrcOperand[num_operands]++;
}
void
incVectorInstDstOperand(int num_operands)
{
stats.vectorInstDstOperand[num_operands]++;
}
protected:
struct ShaderStats : public Stats::Group
{
ShaderStats(Stats::Group *parent, int wf_size);
// Some stats for measuring latency
Stats::Distribution allLatencyDist;
Stats::Distribution loadLatencyDist;
Stats::Distribution storeLatencyDist;
// Average ticks from a vmem inst's initiateAcc to coalescer issue
Stats::Distribution initToCoalesceLatency;
// Average ticks from coalescer issue to coalescer hit callback
Stats::Distribution rubyNetworkLatency;
// Average ticks from coalescer hit callback to GM pipe enqueue
Stats::Distribution gmEnqueueLatency;
// Average ticks spent in the GM pipe's ordered response buffer
Stats::Distribution gmToCompleteLatency;
// Average number of cache blocks requested by a vmem inst
Stats::Distribution coalsrLineAddresses;
// Average round-trip ticks to main memory for the Nth cache
// block generated by a vmem inst
Stats::Distribution *cacheBlockRoundTrip;
Stats::Scalar shaderActiveTicks;
Stats::Vector vectorInstSrcOperand;
Stats::Vector vectorInstDstOperand;
} stats;
};
#endif // __SHADER_HH__