/*
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __SHADER_HH__
#define __SHADER_HH__

#include <functional>
#include <string>

#include "arch/isa.hh"
#include "base/statistics.hh"
#include "base/stats/group.hh"
#include "base/types.hh"
#include "cpu/simple/atomic.hh"
#include "cpu/simple/timing.hh"
#include "cpu/simple_thread.hh"
#include "cpu/thread_context.hh"
#include "cpu/thread_state.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/gpu_tlb.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/lds_state.hh"
#include "mem/page_table.hh"
#include "mem/port.hh"
#include "mem/request.hh"
#include "params/Shader.hh"
#include "sim/faults.hh"
#include "sim/process.hh"
#include "sim/sim_object.hh"
class BaseTLB;
class GPUCommandProcessor;
class GPUDispatcher;
namespace TheISA
{
class GpuTLB;
}

static const int LDS_SIZE = 65536;

// Aperture (APE) registers define the base/limit pair for the
// ATC-mapped memory space. Currently the only APEs we consider are
// for GPUVM/LDS/scratch. The APEs are assigned unique values on a
// per-device basis.
struct ApertureRegister
{
Addr base;
Addr limit;
};
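
// A minimal illustration with hypothetical values (not taken from a
// real device): an APE covers the inclusive range [base, limit],
// which is exactly the containment test the is*Ape() helpers in
// Shader perform.
//
//   ApertureRegister ape{0x100000000, 0x1000fffff};
//   bool hit = addr >= ape.base && addr <= ape.limit;
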
// Class Shader: This describes a single shader instance. Most
// configurations will only have a single shader.
class Shader : public ClockedObject
{
private:
ApertureRegister _gpuVmApe;
ApertureRegister _ldsApe;
ApertureRegister _scratchApe;
Addr shHiddenPrivateBaseVmid;
// Number of active CUs attached to this shader
int _activeCus;
// Last tick that all CUs attached to this shader were inactive
Tick _lastInactiveTick;
public:
typedef ShaderParams Params;
enum hsail_mode_e {SIMT, VECTOR_SCALAR};
GPUDispatcher &dispatcher();
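
// These sampling hooks feed the latency distributions in ShaderStats
// below (per their definitions in shader.cc): sampleLoad and
// sampleStore record per-access latencies, while the round-trip
// variants break a vmem instruction's lifetime into the
// init-to-coalesce / network / enqueue / complete stages.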
void sampleLoad(const Tick accessTime);
void sampleStore(const Tick accessTime);
void sampleInstRoundTrip(std::vector<Tick> roundTripTime);
void sampleLineRoundTrip(const std::map<Addr,
std::vector<Tick>> &roundTripTime);
SimpleThread *cpuThread;
ThreadContext *gpuTc;
BaseCPU *cpuPointer;
const ApertureRegister&
gpuVmApe() const
{
return _gpuVmApe;
}
const ApertureRegister&
ldsApe() const
{
return _ldsApe;
}
const ApertureRegister&
scratchApe() const
{
return _scratchApe;
}
bool
isGpuVmApe(Addr addr) const
{
bool is_gpu_vm = addr >= _gpuVmApe.base && addr <= _gpuVmApe.limit;
return is_gpu_vm;
}
bool
isLdsApe(Addr addr) const
{
bool is_lds = addr >= _ldsApe.base && addr <= _ldsApe.limit;
return is_lds;
}
bool
isScratchApe(Addr addr) const
{
bool is_scratch
= addr >= _scratchApe.base && addr <= _scratchApe.limit;
return is_scratch;
}
Addr
getScratchBase()
{
return _scratchApe.base;
}
Addr
getHiddenPrivateBase()
{
return shHiddenPrivateBaseVmid;
}
void
initShHiddenPrivateBase(Addr queueBase, uint32_t offset)
{
Addr sh_hidden_base_new = queueBase - offset;
// We initialize sh_hidden_private_base_vmid from the AMD queue
// descriptor of the first queue. It is expected to be the same
// for all queues belonging to the same process.
if (shHiddenPrivateBaseVmid != sh_hidden_base_new) {
// Do not panic if shHiddenPrivateBaseVmid == 0, i.e., if it is
// uninitialized. Panic only if the value has been initialized
// and we later see a different base.
panic_if(shHiddenPrivateBaseVmid != 0,
"Currently we support only single process\n");
}
shHiddenPrivateBaseVmid = sh_hidden_base_new;
}
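
// Usage sketch with hypothetical values (not from a real queue
// descriptor): the first registered queue initializes the hidden
// private base; any later queue must compute the same base.
//
//   shader->initShHiddenPrivateBase(0x2000, 0x100);
//   // shHiddenPrivateBaseVmid == 0x1f00
//   shader->initShHiddenPrivateBase(0x3000, 0x100);
//   // panics: 0x2f00 != 0x1f00 and the base is already initialized
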
EventFunctionWrapper tickEvent;
// Is this simulation using timing mode for memory accesses?
bool timingSim;
hsail_mode_e hsail_mode;
// If set, issue an acquire packet at kernel launch
int impl_kern_launch_acq;
// If set, issue a release packet at kernel end
int impl_kern_end_rel;
// If set, fetch returns may be coissued with instructions
int coissue_return;
// If set, always dump all 64 GPRs to the trace
int trace_vgpr_all;
// Number of compute units (CUs) in the shader
int n_cu;
// Number of wavefront slots per SIMD per CU
int n_wf;
// The size of global memory
int globalMemSize;
// Tracks CU that rr dispatcher should attempt scheduling
int nextSchedCu;
// Size of the scheduled-add queue
uint32_t sa_n;
// Pointers to the values to be incremented
std::vector<int*> sa_val;
// Ticks at which to perform each increment
std::vector<uint64_t> sa_when;
// Amounts to increment by
std::vector<int32_t> sa_x;
// List of compute units (CUs)
std::vector<ComputeUnit*> cuList;
GPUCommandProcessor &gpuCmdProc;
GPUDispatcher &_dispatcher;
int64_t max_valu_insts;
int64_t total_valu_insts;
Shader(const Params &p);
~Shader();
virtual void init();
// Run shader scheduled adds
void execScheduledAdds();
// Schedule a 32-bit value to be incremented some time in the future
void ScheduleAdd(int *val, Tick when, int x);
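
// Usage sketch (hypothetical counter variable; the interpretation of
// 'when' as a delay added to curTick() follows the definition in
// shader.cc): schedule *dec_cnt to be decremented by one, applied by
// execScheduledAdds() once the scheduled tick is reached.
//
//   int dec_cnt = 1;
//   shader->ScheduleAdd(&dec_cnt, 1000, -1);
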
bool processTimingPacket(PacketPtr pkt);
void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
MemCmd cmd, bool suppress_func_errors);
void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id);
void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id,
bool suppress_func_errors);
void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id);
void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id,
bool suppress_func_errors);
void doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
bool suppress_func_errors, int cu_id);
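
// Usage sketch (hypothetical address and CU id): functionally read
// four bytes of GPU virtual memory on behalf of CU 0, suppressing
// functional-access errors.
//
//   uint32_t data;
//   shader->ReadMem(vaddr, &data, sizeof(data), 0,
//                   true /* suppress_func_errors */);
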
void
registerCU(int cu_id, ComputeUnit *compute_unit)
{
cuList[cu_id] = compute_unit;
}
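
// Usage sketch (hypothetical CU vector; in practice the Shader's
// constructor wires up the CUs passed in via its params): each
// compute unit is recorded in cuList at its id.
//
//   for (int i = 0; i < n_cu; ++i)
//       shader->registerCU(i, cu_vec[i]);
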
void prepareInvalidate(HSAQueueEntry *task);
void prepareFlush(GPUDynInstPtr gpuDynInst);
bool dispatchWorkgroups(HSAQueueEntry *task);
Addr mmap(int length);
void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode);
void updateContext(int cid);
void notifyCuSleep();
void
incVectorInstSrcOperand(int num_operands)
{
stats.vectorInstSrcOperand[num_operands]++;
}
void
incVectorInstDstOperand(int num_operands)
{
stats.vectorInstDstOperand[num_operands]++;
}
protected:
struct ShaderStats : public Stats::Group
{
ShaderStats(Stats::Group *parent, int wf_size);
// Some stats for measuring latency
Stats::Distribution allLatencyDist;
Stats::Distribution loadLatencyDist;
Stats::Distribution storeLatencyDist;
// Average ticks from a vmem inst's initiateAcc to coalescer issue
Stats::Distribution initToCoalesceLatency;
// Average ticks from coalescer issue to coalescer hit callback
Stats::Distribution rubyNetworkLatency;
// Average ticks from coalescer hit callback to GM pipe enqueue
Stats::Distribution gmEnqueueLatency;
// Average ticks spent in the GM pipe's ordered response buffer
Stats::Distribution gmToCompleteLatency;
// Average number of cache blocks requested by a vmem inst
Stats::Distribution coalsrLineAddresses;
// Average round-trip ticks to main memory for the Nth cache
// block generated by a vmem inst
Stats::Distribution *cacheBlockRoundTrip;
Stats::Scalar shaderActiveTicks;
Stats::Vector vectorInstSrcOperand;
Stats::Vector vectorInstDstOperand;
} stats;
};
#endif // __SHADER_HH__