# Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from m5.defines import buildEnv
from m5.params import *
from m5.proxy import *
from m5.SimObject import SimObject
from m5.objects.Bridge import Bridge
from m5.objects.ClockedObject import ClockedObject
from m5.objects.Device import DmaVirtDevice
from m5.objects.LdsState import LdsState
from m5.objects.Process import EmulatedDriver
from m5.objects.VegaGPUTLB import VegaPagetableWalker
class PrefetchType(Enum):
vals = ["PF_CU", "PF_PHASE", "PF_WF", "PF_STRIDE", "PF_END"]
class GfxVersion(ScopedEnum):
vals = ["gfx801", "gfx803", "gfx900", "gfx902"]
class PoolManager(SimObject):
type = "PoolManager"
abstract = True
cxx_class = "gem5::PoolManager"
cxx_header = "gpu-compute/pool_manager.hh"
min_alloc = Param.Int(4, "min number of VGPRs allocated per WF")
pool_size = Param.Int(2048, "number of vector registers per SIMD")
# The simple pool manager only allows one workgroup to
# be executing on a CU at any given time.
class SimplePoolManager(PoolManager):
type = "SimplePoolManager"
cxx_class = "gem5::SimplePoolManager"
cxx_header = "gpu-compute/simple_pool_manager.hh"
# This pool manager allows multiple workgroups to execute on one CU.
class DynPoolManager(PoolManager):
type = "DynPoolManager"
cxx_class = "gem5::DynPoolManager"
cxx_header = "gpu-compute/dyn_pool_manager.hh"
class RegisterFile(SimObject):
type = "RegisterFile"
cxx_class = "gem5::RegisterFile"
cxx_header = "gpu-compute/register_file.hh"
simd_id = Param.Int(-1, "SIMD ID associated with this Register File")
num_regs = Param.Int(2048, "number of registers in this RF")
wf_size = Param.Int(64, "Wavefront size (in work items)")
class ScalarRegisterFile(RegisterFile):
type = "ScalarRegisterFile"
cxx_class = "gem5::ScalarRegisterFile"
cxx_header = "gpu-compute/scalar_register_file.hh"
class VectorRegisterFile(RegisterFile):
type = "VectorRegisterFile"
cxx_class = "gem5::VectorRegisterFile"
cxx_header = "gpu-compute/vector_register_file.hh"
class RegisterManager(SimObject):
type = "RegisterManager"
cxx_class = "gem5::RegisterManager"
cxx_header = "gpu-compute/register_manager.hh"
policy = Param.String("static", "Register Manager Policy")
vrf_pool_managers = VectorParam.PoolManager("VRF Pool Managers")
srf_pool_managers = VectorParam.PoolManager("SRF Pool Managers")
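# Illustrative sketch (placeholder values, not defined in this file): a
# configuration script would typically build one pool manager per SIMD
# for each register file and hand them to the RegisterManager, e.g.:
#
#   n_simds = 4
#   register_manager = RegisterManager(
#       policy="static",
#       vrf_pool_managers=[
#           SimplePoolManager(pool_size=2048, min_alloc=4)
#           for _ in range(n_simds)
#       ],
#       srf_pool_managers=[
#           SimplePoolManager(pool_size=2048, min_alloc=4)
#           for _ in range(n_simds)
#       ],
#   )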
class Wavefront(SimObject):
type = "Wavefront"
cxx_class = "gem5::Wavefront"
cxx_header = "gpu-compute/wavefront.hh"
simdId = Param.Int("SIMD id (0-ComputeUnit.num_SIMDs)")
wf_slot_id = Param.Int("wavefront id (0-ComputeUnit.max_wfs)")
wf_size = Param.Int(64, "Wavefront size (in work items)")
max_ib_size = Param.Int(
13,
"Maximum size (in number of insts) of the " "instruction buffer (IB).",
)
# Most of the default values here are obtained from the
# AMD Graphics Core Next (GCN) Architecture whitepaper.
class ComputeUnit(ClockedObject):
type = "ComputeUnit"
cxx_class = "gem5::ComputeUnit"
cxx_header = "gpu-compute/compute_unit.hh"
wavefronts = VectorParam.Wavefront("Number of wavefronts")
# Wavefront size is 64. This is configurable; however, changing
# this value to anything other than 64 will likely cause errors.
wf_size = Param.Int(64, "Wavefront size (in work items)")
num_barrier_slots = Param.Int(4, "Number of barrier slots in a CU")
num_SIMDs = Param.Int(4, "number of SIMD units per CU")
num_scalar_cores = Param.Int(1, "number of Scalar cores per CU")
num_scalar_mem_pipes = Param.Int(
1, "number of Scalar memory pipelines " "per CU"
)
simd_width = Param.Int(16, "width (number of lanes) per SIMD unit")
operand_network_length = Param.Int(
1, "number of pipe stages of operand " "network"
)
spbypass_pipe_length = Param.Int(
4, "vector ALU Single Precision bypass " "latency"
)
dpbypass_pipe_length = Param.Int(
4, "vector ALU Double Precision bypass " "latency"
)
scalar_pipe_length = Param.Int(1, "number of pipe stages per scalar ALU")
issue_period = Param.Int(4, "number of cycles per issue period")
vrf_gm_bus_latency = Param.Int(
1, "number of cycles per use of VRF to " "GM bus"
)
srf_scm_bus_latency = Param.Int(
1, "number of cycles per use of SRF " "to Scalar Mem bus"
)
vrf_lm_bus_latency = Param.Int(
1, "number of cycles per use of VRF to " "LM bus"
)
num_global_mem_pipes = Param.Int(1, "number of global memory pipes per CU")
num_shared_mem_pipes = Param.Int(1, "number of shared memory pipes per CU")
n_wf = Param.Int(10, "Number of wavefront slots per SIMD")
mem_req_latency = Param.Int(
50,
"Latency for request from the cu to ruby. "
"Represents the pipeline to reach the TCP "
"and specified in GPU clock cycles",
)
mem_resp_latency = Param.Int(
50,
"Latency for responses from ruby to the "
"cu. Represents the pipeline between the "
"TCP and cu as well as TCP data array "
"access. Specified in GPU clock cycles",
)
system = Param.System(Parent.any, "system object")
cu_id = Param.Int("CU id")
vrf_to_coalescer_bus_width = Param.Int(
64, "VRF->Coalescer data bus " "width in bytes"
)
coalescer_to_vrf_bus_width = Param.Int(
64, "Coalescer->VRF data bus " "width in bytes"
)
memory_port = VectorRequestPort("Port to the memory system")
translation_port = VectorRequestPort("Port to the TLB hierarchy")
sqc_port = RequestPort("Port to the SQC (I-cache)")
sqc_tlb_port = RequestPort("Port to the TLB for the SQC (I-cache)")
scalar_port = RequestPort("Port to the scalar data cache")
scalar_tlb_port = RequestPort("Port to the TLB for the scalar data cache")
gmTokenPort = RequestPort("Port to the GPU coalescer for sharing tokens")
perLaneTLB = Param.Bool(False, "enable per-lane TLB")
prefetch_depth = Param.Int(
0,
"Number of prefetches triggered at a time" "(0 turns off prefetching)",
)
prefetch_stride = Param.Int(1, "Fixed Prefetch Stride (1 means next-page)")
prefetch_prev_type = Param.PrefetchType(
"PF_PHASE",
"Prefetch the stride "
"from last mem req in lane of "
"CU|Phase|Wavefront",
)
execPolicy = Param.String("OLDEST-FIRST", "WF execution selection policy")
debugSegFault = Param.Bool(False, "enable debugging GPU seg faults")
functionalTLB = Param.Bool(False, "Assume TLB causes no delay")
localMemBarrier = Param.Bool(
False, "Assume Barriers do not wait on " "kernel end"
)
countPages = Param.Bool(
False,
"Generate per-CU file of all pages " "touched and how many times",
)
scalar_mem_queue_size = Param.Int(
32, "Number of entries in scalar " "memory pipeline's queues"
)
global_mem_queue_size = Param.Int(
256, "Number of entries in the global " "memory pipeline's queues"
)
local_mem_queue_size = Param.Int(
256, "Number of entries in the local " "memory pipeline's queues"
)
max_wave_requests = Param.Int(
64, "number of pending vector memory " "requests per wavefront"
)
max_cu_tokens = Param.Int(
4,
"Maximum number of tokens, i.e., the number"
" of instructions that can be sent to coalescer",
)
ldsBus = Bridge() # the bridge between the CU and its LDS
ldsPort = RequestPort("The port that goes to the LDS")
localDataStore = Param.LdsState("the LDS for this CU")
vector_register_file = VectorParam.VectorRegisterFile(
"Vector register " "file"
)
scalar_register_file = VectorParam.ScalarRegisterFile(
"Scalar register " "file"
)
out_of_order_data_delivery = Param.Bool(
False, "enable OoO data delivery" " in the GM pipeline"
)
register_manager = Param.RegisterManager("Register Manager")
fetch_depth = Param.Int(
2, "number of i-cache lines that may be " "buffered in the fetch unit."
)
class Shader(ClockedObject):
type = "Shader"
cxx_class = "gem5::Shader"
cxx_header = "gpu-compute/shader.hh"
CUs = VectorParam.ComputeUnit("Number of compute units")
gpu_cmd_proc = Param.GPUCommandProcessor("Command processor for GPU")
dispatcher = Param.GPUDispatcher("GPU workgroup dispatcher")
system_hub = Param.AMDGPUSystemHub(NULL, "GPU System Hub (FS Mode only)")
n_wf = Param.Int(10, "Number of wavefront slots per SIMD")
impl_kern_launch_acq = Param.Bool(
True,
"""Insert acq packet into
ruby at kernel launch""",
)
impl_kern_end_rel = Param.Bool(
False,
"""Insert rel packet into
ruby at kernel end""",
)
globalmem = Param.MemorySize("64kB", "Memory size")
timing = Param.Bool(False, "timing memory accesses")
cpu_pointer = Param.BaseCPU(NULL, "pointer to base CPU")
translation = Param.Bool(False, "address translation")
timer_period = Param.Clock("10us", "system timer period")
idlecu_timeout = Param.Tick(0, "Idle CU watchdog timeout threshold")
max_valu_insts = Param.Int(0, "Maximum vALU insts before exiting")
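# Illustrative sketch (cu_list, dispatcher, and gpu_cmd_proc are assumed
# to have been built elsewhere in the configuration script):
#
#   shader = Shader(
#       CUs=cu_list,
#       gpu_cmd_proc=gpu_cmd_proc,
#       dispatcher=dispatcher,
#       n_wf=10,
#       timing=True,
#   )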
class GPUComputeDriver(EmulatedDriver):
type = "GPUComputeDriver"
cxx_class = "gem5::GPUComputeDriver"
cxx_header = "gpu-compute/gpu_compute_driver.hh"
device = Param.GPUCommandProcessor("GPU controlled by this driver")
isdGPU = Param.Bool(False, "Driver is for a dGPU")
gfxVersion = Param.GfxVersion("gfx801", "ISA of GPU to model")
dGPUPoolID = Param.Int(0, "Pool ID for dGPU.")
# Default Mtype for caches
# -- 1 1 1 C_RW_S (Cached-ReadWrite-Shared)
# -- 1 1 0 C_RW_US (Cached-ReadWrite-Unshared)
# -- 1 0 1 C_RO_S (Cached-ReadOnly-Shared)
# -- 1 0 0 C_RO_US (Cached-ReadOnly-Unshared)
# -- 0 1 x UC_L2 (Uncached_GL2)
# -- 0 0 x UC_All (Uncached_All_Load)
# default value: 5/C_RO_S (only allow caching in GL2 for reads; shared)
m_type = Param.Int("Default MTYPE for cache. Valid values between 0-7")
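# Illustrative sketch (the driver name and device below are assumptions
# of the example, not requirements of this file): the default of
# 5 (0b101) selects C_RO_S, i.e. cached, read-only, shared.
#
#   driver = GPUComputeDriver(
#       filename="kfd",          # assumed device-node name
#       device=gpu_cmd_proc,     # assumed to exist in the config
#       isdGPU=False,
#       gfxVersion="gfx902",
#       m_type=5,                # 0b101 -> C_RO_S
#   )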
class GPURenderDriver(EmulatedDriver):
type = "GPURenderDriver"
cxx_class = "gem5::GPURenderDriver"
cxx_header = "gpu-compute/gpu_render_driver.hh"
class GPUDispatcher(SimObject):
type = "GPUDispatcher"
cxx_class = "gem5::GPUDispatcher"
cxx_header = "gpu-compute/dispatcher.hh"
class GPUCommandProcessor(DmaVirtDevice):
type = "GPUCommandProcessor"
cxx_class = "gem5::GPUCommandProcessor"
cxx_header = "gpu-compute/gpu_command_processor.hh"
dispatcher = Param.GPUDispatcher("workgroup dispatcher for the GPU")
hsapp = Param.HSAPacketProcessor("HSA packet processor attached to this device")
walker = Param.VegaPagetableWalker(
VegaPagetableWalker(), "Page table walker"
)
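# Illustrative sketch (hsapp is assumed to be an HSAPacketProcessor built
# elsewhere in the configuration):
#
#   gpu_cmd_proc = GPUCommandProcessor(
#       dispatcher=GPUDispatcher(),
#       hsapp=hsapp,
#   )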
class StorageClassType(Enum):
vals = [
"SC_SPILL",
"SC_GLOBAL",
"SC_GROUP",
"SC_PRIVATE",
"SC_READONLY",
"SC_KERNARG",
"SC_ARG",
"SC_NONE",
]