| # Copyright (c) 2015-2018 Advanced Micro Devices, Inc. |
| # All rights reserved. |
| # |
| # Redistribution and use in source and binary forms, with or without |
| # modification, are permitted provided that the following conditions are met: |
| # |
| # 1. Redistributions of source code must retain the above copyright notice, |
| # this list of conditions and the following disclaimer. |
| # |
| # 2. Redistributions in binary form must reproduce the above copyright notice, |
| # this list of conditions and the following disclaimer in the documentation |
| # and/or other materials provided with the distribution. |
| # |
| # 3. Neither the name of the copyright holder nor the names of its |
| # contributors may be used to endorse or promote products derived from this |
| # software without specific prior written permission. |
| # |
| # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
| # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| # POSSIBILITY OF SUCH DAMAGE. |
| |
| from m5.defines import buildEnv |
| from m5.params import * |
| from m5.proxy import * |
| from m5.SimObject import SimObject |
| |
| from m5.objects.Bridge import Bridge |
| from m5.objects.ClockedObject import ClockedObject |
| from m5.objects.Device import DmaVirtDevice |
| from m5.objects.LdsState import LdsState |
| from m5.objects.Process import EmulatedDriver |
| from m5.objects.VegaGPUTLB import VegaPagetableWalker |
| |
| |
| class PrefetchType(Enum): |
| vals = ["PF_CU", "PF_PHASE", "PF_WF", "PF_STRIDE", "PF_END"] |
| |
| |
| class GfxVersion(ScopedEnum): |
| vals = ["gfx801", "gfx803", "gfx900", "gfx902"] |
| |
| |
| class PoolManager(SimObject): |
| type = "PoolManager" |
| abstract = True |
| cxx_class = "gem5::PoolManager" |
| cxx_header = "gpu-compute/pool_manager.hh" |
| |
| min_alloc = Param.Int(4, "min number of VGPRs allocated per WF") |
| pool_size = Param.Int(2048, "number of vector registers per SIMD") |
| |
| |
# The simple pool manager only allows one workgroup to
# execute on a CU at any given time.
| class SimplePoolManager(PoolManager): |
| type = "SimplePoolManager" |
| cxx_class = "gem5::SimplePoolManager" |
| cxx_header = "gpu-compute/simple_pool_manager.hh" |
| |
| |
# The dynamic pool manager allows multiple workgroups to execute on one CU.
| class DynPoolManager(PoolManager): |
| type = "DynPoolManager" |
| cxx_class = "gem5::DynPoolManager" |
| cxx_header = "gpu-compute/dyn_pool_manager.hh" |
| |
| |
| class RegisterFile(SimObject): |
| type = "RegisterFile" |
| cxx_class = "gem5::RegisterFile" |
| cxx_header = "gpu-compute/register_file.hh" |
| |
| simd_id = Param.Int(-1, "SIMD ID associated with this Register File") |
| num_regs = Param.Int(2048, "number of registers in this RF") |
| wf_size = Param.Int(64, "Wavefront size (in work items)") |
| |
| |
| class ScalarRegisterFile(RegisterFile): |
| type = "ScalarRegisterFile" |
| cxx_class = "gem5::ScalarRegisterFile" |
| cxx_header = "gpu-compute/scalar_register_file.hh" |
| |
| |
| class VectorRegisterFile(RegisterFile): |
| type = "VectorRegisterFile" |
| cxx_class = "gem5::VectorRegisterFile" |
| cxx_header = "gpu-compute/vector_register_file.hh" |
| |
| |
| class RegisterManager(SimObject): |
| type = "RegisterManager" |
| cxx_class = "gem5::RegisterManager" |
| cxx_header = "gpu-compute/register_manager.hh" |
| |
| policy = Param.String("static", "Register Manager Policy") |
| vrf_pool_managers = VectorParam.PoolManager("VRF Pool Managers") |
| srf_pool_managers = VectorParam.PoolManager("SRF Pool Managers") |
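
# Illustrative sketch (not part of the SimObject definitions above): a
# configuration script would typically build one VRF and one SRF pool
# manager per SIMD and hand them to a RegisterManager. Pairing the "static"
# policy with SimplePoolManager (and "dynamic" with DynPoolManager) is an
# assumption here; the values below are placeholders, not recommendations.
#
#     vrf_pools = [SimplePoolManager(min_alloc=4, pool_size=2048)
#                  for _ in range(4)]  # one per SIMD
#     srf_pools = [SimplePoolManager(min_alloc=4, pool_size=2048)
#                  for _ in range(4)]
#     reg_mgr = RegisterManager(policy="static",
#                               vrf_pool_managers=vrf_pools,
#                               srf_pool_managers=srf_pools)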
| |
| |
| class Wavefront(SimObject): |
| type = "Wavefront" |
| cxx_class = "gem5::Wavefront" |
| cxx_header = "gpu-compute/wavefront.hh" |
| |
| simdId = Param.Int("SIMD id (0-ComputeUnit.num_SIMDs)") |
| wf_slot_id = Param.Int("wavefront id (0-ComputeUnit.max_wfs)") |
| wf_size = Param.Int(64, "Wavefront size (in work items)") |
| max_ib_size = Param.Int( |
| 13, |
| "Maximum size (in number of insts) of the instruction buffer (IB).", |
| ) |
| |
| |
| # Most of the default values here are obtained from the |
| # AMD Graphics Core Next (GCN) Architecture whitepaper. |
| class ComputeUnit(ClockedObject): |
| type = "ComputeUnit" |
| cxx_class = "gem5::ComputeUnit" |
| cxx_header = "gpu-compute/compute_unit.hh" |
| |
| wavefronts = VectorParam.Wavefront("Number of wavefronts") |
    # Wavefront size is 64. This is configurable; however, changing this
    # value to anything other than 64 will likely cause errors. See the
    # illustrative configuration sketch after this class.
| wf_size = Param.Int(64, "Wavefront size (in work items)") |
| num_barrier_slots = Param.Int(4, "Number of barrier slots in a CU") |
| num_SIMDs = Param.Int(4, "number of SIMD units per CU") |
| num_scalar_cores = Param.Int(1, "number of Scalar cores per CU") |
| num_scalar_mem_pipes = Param.Int( |
| 1, "number of Scalar memory pipelines per CU" |
| ) |
| simd_width = Param.Int(16, "width (number of lanes) per SIMD unit") |
| |
| operand_network_length = Param.Int( |
| 1, "number of pipe stages of operand network" |
| ) |
| |
| spbypass_pipe_length = Param.Int( |
| 4, "vector ALU Single Precision bypass latency" |
| ) |
| |
| dpbypass_pipe_length = Param.Int( |
| 4, "vector ALU Double Precision bypass latency" |
| ) |
| scalar_pipe_length = Param.Int(1, "number of pipe stages per scalar ALU") |
| issue_period = Param.Int(4, "number of cycles per issue period") |
| |
| vrf_gm_bus_latency = Param.Int( |
| 1, "number of cycles per use of VRF to GM bus" |
| ) |
| srf_scm_bus_latency = Param.Int( |
| 1, "number of cycles per use of SRF to Scalar Mem bus" |
| ) |
| vrf_lm_bus_latency = Param.Int( |
| 1, "number of cycles per use of VRF to LM bus" |
| ) |
| |
| num_global_mem_pipes = Param.Int(1, "number of global memory pipes per CU") |
| num_shared_mem_pipes = Param.Int(1, "number of shared memory pipes per CU") |
| n_wf = Param.Int(10, "Number of wavefront slots per SIMD") |
    mem_req_latency = Param.Int(
        50,
        "Latency for requests from the CU to Ruby. "
        "Represents the pipeline to reach the TCP "
        "and is specified in GPU clock cycles",
    )
    mem_resp_latency = Param.Int(
        50,
        "Latency for responses from Ruby to the "
        "CU. Represents the pipeline between the "
        "TCP and the CU as well as the TCP data array "
        "access. Specified in GPU clock cycles",
    )
    scalar_mem_req_latency = Param.Int(
        50,
        "Latency for scalar requests from the CU to Ruby. "
        "Represents the pipeline to reach the TCP "
        "and is specified in GPU clock cycles",
    )
    scalar_mem_resp_latency = Param.Int(
        50,
        "Latency for scalar responses from Ruby to the "
        "CU. Represents the pipeline between the "
        "TCP and the CU as well as the TCP data array "
        "access. Specified in GPU clock cycles",
    )
| system = Param.System(Parent.any, "system object") |
| cu_id = Param.Int("CU id") |
| vrf_to_coalescer_bus_width = Param.Int( |
| 64, "VRF->Coalescer data bus width in bytes" |
| ) |
| coalescer_to_vrf_bus_width = Param.Int( |
| 64, "Coalescer->VRF data bus width in bytes" |
| ) |
| |
| memory_port = VectorRequestPort("Port to the memory system") |
| translation_port = VectorRequestPort("Port to the TLB hierarchy") |
    sqc_port = RequestPort("Port to the SQC (I-cache)")
| sqc_tlb_port = RequestPort("Port to the TLB for the SQC (I-cache)") |
| scalar_port = RequestPort("Port to the scalar data cache") |
| scalar_tlb_port = RequestPort("Port to the TLB for the scalar data cache") |
    gmTokenPort = RequestPort("Port to the GPU coalescer for sharing tokens")
| |
| perLaneTLB = Param.Bool(False, "enable per-lane TLB") |
    prefetch_depth = Param.Int(
        0,
        "Number of prefetches triggered at a time (0 turns off prefetching)",
    )
| prefetch_stride = Param.Int(1, "Fixed Prefetch Stride (1 means next-page)") |
    prefetch_prev_type = Param.PrefetchType(
        "PF_PHASE",
        "Compute the prefetch stride from the last mem req "
        "in the lane, tracked per CU, phase, or wavefront",
    )
| execPolicy = Param.String("OLDEST-FIRST", "WF execution selection policy") |
| debugSegFault = Param.Bool(False, "enable debugging GPU seg faults") |
| functionalTLB = Param.Bool(False, "Assume TLB causes no delay") |
| |
| localMemBarrier = Param.Bool( |
| False, "Assume Barriers do not wait on kernel end" |
| ) |
| |
| countPages = Param.Bool( |
| False, |
| "Generate per-CU file of all pages touched and how many times", |
| ) |
| scalar_mem_queue_size = Param.Int( |
| 32, "Number of entries in scalar memory pipeline's queues" |
| ) |
| global_mem_queue_size = Param.Int( |
| 256, "Number of entries in the global memory pipeline's queues" |
| ) |
| local_mem_queue_size = Param.Int( |
| 256, "Number of entries in the local memory pipeline's queues" |
| ) |
| max_wave_requests = Param.Int( |
| 64, "number of pending vector memory requests per wavefront" |
| ) |
    max_cu_tokens = Param.Int(
        4,
        "Maximum number of tokens, i.e., the number"
        " of instructions that can be sent to the coalescer",
    )
| ldsBus = Bridge() # the bridge between the CU and its LDS |
| ldsPort = RequestPort("The port that goes to the LDS") |
| localDataStore = Param.LdsState("the LDS for this CU") |
| |
| vector_register_file = VectorParam.VectorRegisterFile( |
| "Vector register file" |
| ) |
| |
| scalar_register_file = VectorParam.ScalarRegisterFile( |
| "Scalar register file" |
| ) |
| out_of_order_data_delivery = Param.Bool( |
| False, "enable OoO data delivery in the GM pipeline" |
| ) |
| register_manager = Param.RegisterManager("Register Manager") |
| fetch_depth = Param.Int( |
| 2, "number of i-cache lines that may be buffered in the fetch unit." |
| ) |
| |
| |
| class Shader(ClockedObject): |
| type = "Shader" |
| cxx_class = "gem5::Shader" |
| cxx_header = "gpu-compute/shader.hh" |
| CUs = VectorParam.ComputeUnit("Number of compute units") |
| gpu_cmd_proc = Param.GPUCommandProcessor("Command processor for GPU") |
| dispatcher = Param.GPUDispatcher("GPU workgroup dispatcher") |
| system_hub = Param.AMDGPUSystemHub(NULL, "GPU System Hub (FS Mode only)") |
| n_wf = Param.Int(10, "Number of wavefront slots per SIMD") |
    impl_kern_launch_acq = Param.Bool(
        True, "Insert acquire packet into Ruby at kernel launch"
    )
    impl_kern_end_rel = Param.Bool(
        False, "Insert release packet into Ruby at kernel end"
    )
| globalmem = Param.MemorySize("64kB", "Memory size") |
| timing = Param.Bool(False, "timing memory accesses") |
| |
| cpu_pointer = Param.BaseCPU(NULL, "pointer to base CPU") |
| translation = Param.Bool(False, "address translation") |
| timer_period = Param.Clock("10us", "system timer period") |
| idlecu_timeout = Param.Tick(0, "Idle CU watchdog timeout threshold") |
| max_valu_insts = Param.Int(0, "Maximum vALU insts before exiting") |
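
# Illustrative sketch (assumed configuration-script usage): the Shader is the
# top-level container that owns the compute units, the dispatcher, and the
# command processor. Required sub-objects of GPUCommandProcessor (e.g. its
# HSA packet processor) are omitted here; the names are placeholders.
#
#     dispatcher = GPUDispatcher()
#     shader = Shader(
#         timing=True,
#         n_wf=10,
#         CUs=[cu],  # built as in the sketch above
#         dispatcher=dispatcher,
#     )
#     shader.gpu_cmd_proc = GPUCommandProcessor(dispatcher=dispatcher)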
| |
| |
| class GPUComputeDriver(EmulatedDriver): |
| type = "GPUComputeDriver" |
| cxx_class = "gem5::GPUComputeDriver" |
| cxx_header = "gpu-compute/gpu_compute_driver.hh" |
| device = Param.GPUCommandProcessor("GPU controlled by this driver") |
| isdGPU = Param.Bool(False, "Driver is for a dGPU") |
| gfxVersion = Param.GfxVersion("gfx801", "ISA of gpu to model") |
| dGPUPoolID = Param.Int(0, "Pool ID for dGPU.") |
    # Default MTYPE for caches. The three columns below are the bits of
    # m_type, most-significant first:
    # -- 1 1 1  C_RW_S   (Cached-ReadWrite-Shared)
    # -- 1 1 0  C_RW_US  (Cached-ReadWrite-Unshared)
    # -- 1 0 1  C_RO_S   (Cached-ReadOnly-Shared)
    # -- 1 0 0  C_RO_US  (Cached-ReadOnly-Unshared)
    # -- 0 1 x  UC_L2    (Uncached_GL2)
    # -- 0 0 x  UC_All   (Uncached_All_Load)
    # Default value: 5 (C_RO_S), i.e., only allow read caching in GL2,
    # shared. See the worked example after this class.
    m_type = Param.Int("Default MTYPE for cache. Valid values are 0-7.")
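
# Worked example of the MTYPE encoding documented above (illustrative only):
# m_type = 5 = 0b101 -> cached, read-only, shared, i.e. C_RO_S. A driver
# instance in a config script might then look like the following; the
# filename, gfxVersion, and device values are assumptions for illustration.
#
#     gpu_driver = GPUComputeDriver(
#         filename="kfd",
#         device=gpu_cmd_proc,  # the GPUCommandProcessor instance
#         isdGPU=False,
#         gfxVersion="gfx902",
#         m_type=5,
#     )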
| |
| |
| class GPURenderDriver(EmulatedDriver): |
| type = "GPURenderDriver" |
| cxx_class = "gem5::GPURenderDriver" |
| cxx_header = "gpu-compute/gpu_render_driver.hh" |
| |
| |
| class GPUDispatcher(SimObject): |
| type = "GPUDispatcher" |
| cxx_class = "gem5::GPUDispatcher" |
| cxx_header = "gpu-compute/dispatcher.hh" |
| |
| |
| class GPUCommandProcessor(DmaVirtDevice): |
| type = "GPUCommandProcessor" |
| cxx_class = "gem5::GPUCommandProcessor" |
| cxx_header = "gpu-compute/gpu_command_processor.hh" |
| dispatcher = Param.GPUDispatcher("workgroup dispatcher for the GPU") |
| |
    hsapp = Param.HSAPacketProcessor(
        "HSA packet processor attached to this device"
    )
| walker = Param.VegaPagetableWalker( |
| VegaPagetableWalker(), "Page table walker" |
| ) |
| |
| |
| class StorageClassType(Enum): |
| vals = [ |
| "SC_SPILL", |
| "SC_GLOBAL", |
| "SC_GROUP", |
| "SC_PRIVATE", |
| "SC_READONLY", |
| "SC_KERNARG", |
| "SC_ARG", |
| "SC_NONE", |
| ] |