| # Copyright (c) 2018 Advanced Micro Devices, Inc. |
| # All rights reserved. |
| # |
| # Redistribution and use in source and binary forms, with or without |
| # modification, are permitted provided that the following conditions are met: |
| # |
| # 1. Redistributions of source code must retain the above copyright notice, |
| # this list of conditions and the following disclaimer. |
| # |
| # 2. Redistributions in binary form must reproduce the above copyright notice, |
| # this list of conditions and the following disclaimer in the documentation |
| # and/or other materials provided with the distribution. |
| # |
| # 3. Neither the name of the copyright holder nor the names of its |
| # contributors may be used to endorse or promote products derived from this |
| # software without specific prior written permission. |
| # |
| # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
| # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| # POSSIBILITY OF SUCH DAMAGE. |
| |
| import m5 |
| |
| import operator |
| from os import mkdir, makedirs, getpid, listdir, fsync |
| from os.path import join as joinpath |
| from os.path import isdir |
| from shutil import rmtree, copyfile |
| from m5.util.convert import toFrequency, toMemorySize |
| |
| |
def file_append(path, contents):
    """Append the string form of ``contents`` to a file and sync it to disk.

    ``path`` is a sequence of path components; they are joined into the
    target file name. The file is opened in append mode, ``str(contents)``
    is written, and the data is flushed and fsync'ed before the file closes.
    """
    target = joinpath(*path)
    with open(target, "a") as out:
        out.write(str(contents))
        # Drain Python's buffer first, then ask the OS to commit the file.
        out.flush()
        fsync(out.fileno())
| |
| |
def remake_dir(path):
    """Leave ``path`` as a newly created directory.

    If ``path`` is already a directory, it is removed together with its
    contents first; then it is (re)created, including missing parents.
    """
    if not isdir(path):
        # Nothing to clear out; just create the directory tree.
        makedirs(path)
        return
    rmtree(path)
    makedirs(path)
| |
| |
| # This fakes out a dGPU setup so the runtime operates correctly. The spoofed |
| # system has a single dGPU and a single socket CPU. Note that more complex |
| # topologies (multi-GPU, multi-socket CPUs) need to have a different setup |
| # here or the runtime won't be able to issue Memcpies from one node to another. |
| # |
| # TODO: There is way too much hardcoded here. It doesn't affect anything in |
| # our current ROCm stack (1.6), but it is highly possible that it will in the |
| # future. We might need to scrub through this and extract the appropriate |
| # fields from the simulator in the future. |
def createVegaTopology(options):
    """Write a spoofed two-node (CPU + Vega dGPU) KFD topology tree.

    All files are created below ``m5.options.outdir``. Node 0 is the CPU and
    node 1 is the GPU; each gets ``gpu_id``, ``name``, ``properties``, one
    ``io_links/0`` entry and one ``mem_banks/0`` entry. Most values are
    hardcoded; from ``options`` only ``num_cpus``, ``simds_per_cu`` and
    ``wfs_per_simd`` are read.
    """
    out_root = m5.options.outdir

    def fresh_dir(*parts):
        # Join the pieces, wipe and recreate that directory, return its path.
        location = joinpath(*parts)
        remake_dir(location)
        return location

    def write_kv(directory, filename, entries):
        # Every (key, value) entry becomes one "key value\n" line.
        text = "".join("%s %s\n" % entry for entry in entries)
        file_append((directory, filename), text)

    topo_root = fresh_dir(out_root, "fs/sys/devices/virtual/kfd/kfd/topology")
    driver_params = fresh_dir(out_root, "fs/sys/module/amdgpu/parameters")
    # This directory is only (re)created here; nothing is written into it.
    fresh_dir(out_root, "fs/usr/share/hwdata/")

    # VM size in GB as reported by Vega. Used to reserve an allocation from
    # the CPU to implement SVM (GPUVM64 pointers and x86 pointers agree).
    file_append((driver_params, "vm_size"), 256)

    # Taken from a real Vega platform to satisfy KMT version checks.
    file_append((topo_root, "generation_id"), 2)

    # System-wide properties.
    write_kv(
        topo_root,
        "system_properties",
        [
            ("platform_oem", 35498446626881),
            ("platform_id", 71791775140929),
            ("platform_rev", 2),
        ],
    )

    # ---- Node 0: the CPU -------------------------------------------------
    cpu_node = fresh_dir(topo_root, "nodes/0")
    file_append((cpu_node, "gpu_id"), 0)
    file_append((cpu_node, "name"), "")

    # The one thing the runtime needs from the link: the GPU hangs off CPU
    # socket 0 through PCIe.
    cpu_link_count = 1
    cpu_link = fresh_dir(cpu_node, "io_links/0")
    write_kv(
        cpu_link,
        "properties",
        [
            ("type", 2),
            ("version_major", 0),
            ("version_minor", 0),
            ("node_from", 0),
            ("node_to", 1),
            ("weight", 20),
            ("min_latency", 0),
            ("max_latency", 0),
            ("min_bandwidth", 0),
            ("max_bandwidth", 0),
            ("recommended_transfer_size", 0),
            ("flags", 13),
        ],
    )

    write_kv(
        cpu_node,
        "properties",
        [
            ("cpu_cores_count", options.num_cpus),
            ("simd_count", 0),
            ("mem_banks_count", 1),
            ("caches_count", 0),
            ("io_links_count", cpu_link_count),
            ("cpu_core_id_base", 0),
            ("simd_id_base", 0),
            ("max_waves_per_simd", 0),
            ("lds_size_in_kb", 0),
            ("gds_size_in_kb", 0),
            ("wave_front_size", 64),
            ("array_count", 0),
            ("simd_arrays_per_engine", 0),
            ("cu_per_simd_array", 0),
            ("simd_per_cu", 0),
            ("max_slots_scratch_cu", 0),
            ("vendor_id", 0),
            ("device_id", 0),
            ("location_id", 0),
            ("drm_render_minor", 0),
            ("max_engine_clk_ccompute", 3400),
        ],
    )

    # CPU memory bank. Heap type value taken from a real system; heap types:
    # https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/blob/roc-4.0.x/include/hsakmttypes.h#L317
    cpu_bank = fresh_dir(cpu_node, "mem_banks/0")
    write_kv(
        cpu_bank,
        "properties",
        [
            ("heap_type", 0),
            ("size_in_bytes", 33704329216),
            ("flags", 0),
            ("width", 72),
            ("mem_clk_max", 2400),
        ],
    )

    # ---- Node 1: the Vega GPU --------------------------------------------
    gpu_node = fresh_dir(topo_root, "nodes/1")
    file_append((gpu_node, "gpu_id"), 22124)
    file_append((gpu_node, "name"), "Vega\n")

    # Should match the render driver filename (dri/renderD<minor>).
    render_minor = 128

    # A real Vega reports 96 caches; none are described here.
    cache_count = 0

    # Mirror of the CPU link: GPU connected through PCIe to CPU socket 0.
    gpu_link_count = 1
    gpu_link = fresh_dir(gpu_node, "io_links/0")
    write_kv(
        gpu_link,
        "properties",
        [
            ("type", 2),
            ("version_major", 0),
            ("version_minor", 0),
            ("node_from", 1),
            ("node_to", 0),
            ("weight", 20),
            ("min_latency", 0),
            ("max_latency", 0),
            ("min_bandwidth", 0),
            ("max_bandwidth", 0),
            ("recommended_transfer_size", 0),
            ("flags", 1),
        ],
    )

    scratch_slots = options.simds_per_cu * options.wfs_per_simd
    write_kv(
        gpu_node,
        "properties",
        [
            ("cpu_cores_count", 0),
            ("simd_count", 256),
            ("mem_banks_count", 1),
            ("caches_count", cache_count),
            ("io_links_count", gpu_link_count),
            ("cpu_core_id_base", 0),
            ("simd_id_base", 2147487744),
            ("max_waves_per_simd", 10),
            ("lds_size_in_kb", 64),
            ("gds_size_in_kb", 0),
            ("wave_front_size", 64),
            ("array_count", 4),
            ("simd_arrays_per_engine", 1),
            ("cu_per_simd_array", 16),
            ("simd_per_cu", 4),
            ("max_slots_scratch_cu", scratch_slots),
            ("vendor_id", 4098),
            ("device_id", 26720),
            ("location_id", 1024),
            ("drm_render_minor", render_minor),
            ("hive_id", 0),
            ("num_sdma_engines", 2),
            ("num_sdma_xgmi_engines", 0),
            ("max_engine_clk_fcompute", 1500),
            ("local_mem_size", 17163091968),
            ("fw_version", 421),
            ("capability", 238208),
            ("debug_prop", 32768),
            ("sdma_fw_version", 430),
            ("max_engine_clk_ccompute", 3400),
        ],
    )

    # GPU local memory bank.
    # TODO: Extract size, clk, and width from sim parameters.
    # Heap type value taken from a real system; heap types:
    # https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/blob/roc-4.0.x/include/hsakmttypes.h#L317
    gpu_bank = fresh_dir(gpu_node, "mem_banks/0")
    write_kv(
        gpu_bank,
        "properties",
        [
            ("heap_type", 1),
            ("size_in_bytes", 17163091968),
            ("flags", 0),
            ("width", 2048),
            ("mem_clk_max", 945),
        ],
    )
| |
| |
| # This fakes out a dGPU setup so the runtime operates correctly. The spoofed |
| # system has a single dGPU and a single socket CPU. Note that more complex |
| # topologies (multi-GPU, multi-socket CPUs) need to have a different setup |
| # here or the runtime won't be able to issue Memcpies from one node to another. |
| # |
| # TODO: There is way too much hardcoded here. It doesn't affect anything in |
| # our current ROCm stack (1.6), but it is highly possible that it will in the |
| # future. We might need to scrub through this and extract the appropriate |
| # fields from the simulator in the future. |
def createFijiTopology(options):
    """Write a spoofed two-node (CPU + Fiji dGPU) KFD topology tree.

    All files are created below ``m5.options.outdir``. Node 0 is the CPU and
    node 1 is the GPU; each gets ``gpu_id``, ``name``, ``properties``, one
    ``io_links/0`` entry and one ``mem_banks/0`` entry. The GPU node
    properties are partly derived from ``options`` (compute unit / SIMD
    geometry, LDS size, wavefront size and the GPU and CPU clocks); the rest
    is hardcoded.
    """
    base = m5.options.outdir

    def make_clean(*pieces):
        # Build the path, wipe and recreate the directory, return the path.
        where = joinpath(*pieces)
        remake_dir(where)
        return where

    def emit(directory, filename, rows):
        # One "key value\n" line per (key, value) row, in the given order.
        body = "".join("%s %s\n" % row for row in rows)
        file_append((directory, filename), body)

    topology = make_clean(base, "fs/sys/devices/virtual/kfd/kfd/topology")
    amdgpu_params = make_clean(base, "fs/sys/module/amdgpu/parameters")

    # VM size in GB as reported by Fiji. Used to reserve an allocation from
    # the CPU to implement SVM (GPUVM64 pointers and x86 pointers agree).
    file_append((amdgpu_params, "vm_size"), 256)

    # Taken from a real Fiji platform to satisfy KMT version checks.
    file_append((topology, "generation_id"), 2)

    # System-wide properties.
    emit(
        topology,
        "system_properties",
        [
            ("platform_oem", 35498446626881),
            ("platform_id", 71791775140929),
            ("platform_rev", 2),
        ],
    )

    # ---- Node 0: the CPU -------------------------------------------------
    host = make_clean(topology, "nodes/0")
    file_append((host, "gpu_id"), 0)
    file_append((host, "name"), "")

    # All the runtime needs to learn from the link is that the GPU is
    # attached to CPU socket 0 through PCIe.
    host_links = 1
    host_link_dir = make_clean(host, "io_links/0")
    emit(
        host_link_dir,
        "properties",
        [
            ("type", 2),
            ("version_major", 0),
            ("version_minor", 0),
            ("node_from", 0),
            ("node_to", 1),
            ("weight", 20),
            ("min_latency", 0),
            ("max_latency", 0),
            ("min_bandwidth", 0),
            ("max_bandwidth", 0),
            ("recommended_transfer_size", 0),
            ("flags", 13),
        ],
    )

    emit(
        host,
        "properties",
        [
            ("cpu_cores_count", options.num_cpus),
            ("simd_count", 0),
            ("mem_banks_count", 1),
            ("caches_count", 0),
            ("io_links_count", host_links),
            ("cpu_core_id_base", 0),
            ("simd_id_base", 0),
            ("max_waves_per_simd", 0),
            ("lds_size_in_kb", 0),
            ("gds_size_in_kb", 0),
            ("wave_front_size", 64),
            ("array_count", 0),
            ("simd_arrays_per_engine", 0),
            ("cu_per_simd_array", 0),
            ("simd_per_cu", 0),
            ("max_slots_scratch_cu", 0),
            ("vendor_id", 0),
            ("device_id", 0),
            ("location_id", 0),
            ("drm_render_minor", 0),
            ("max_engine_clk_ccompute", 3400),
        ],
    )

    # CPU memory bank. Heap type value taken from a real system; heap types:
    # https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/blob/roc-4.0.x/include/hsakmttypes.h#L317
    host_bank = make_clean(host, "mem_banks/0")
    emit(
        host_bank,
        "properties",
        [
            ("heap_type", 0),
            ("size_in_bytes", 33704329216),
            ("flags", 0),
            ("width", 72),
            ("mem_clk_max", 2400),
        ],
    )

    # ---- Node 1: the Fiji GPU --------------------------------------------
    device = make_clean(topology, "nodes/1")
    file_append((device, "gpu_id"), 50156)
    file_append((device, "name"), "Fiji\n")

    # Should match the render driver filename (dri/renderD<minor>).
    render_minor = 128

    # Real Fiji shows 96 caches, but building that topology is complex and
    # does not appear to be required for anything.
    cache_total = 0

    # Mirror of the CPU link: GPU connected through PCIe to CPU socket 0.
    device_links = 1
    device_link_dir = make_clean(device, "io_links/0")
    emit(
        device_link_dir,
        "properties",
        [
            ("type", 2),
            ("version_major", 0),
            ("version_minor", 0),
            ("node_from", 1),
            ("node_to", 0),
            ("weight", 20),
            ("min_latency", 0),
            ("max_latency", 0),
            ("min_bandwidth", 0),
            ("max_bandwidth", 0),
            ("recommended_transfer_size", 0),
            ("flags", 1),
        ],
    )

    # Clock values are divided by 1e6 and truncated to an int before being
    # written.
    emit(
        device,
        "properties",
        [
            ("cpu_cores_count", 0),
            ("simd_count", options.num_compute_units * options.simds_per_cu),
            ("mem_banks_count", 1),
            ("caches_count", cache_total),
            ("io_links_count", device_links),
            ("cpu_core_id_base", 0),
            ("simd_id_base", 2147487744),
            ("max_waves_per_simd", options.wfs_per_simd),
            ("lds_size_in_kb", int(options.lds_size / 1024)),
            ("gds_size_in_kb", 0),
            ("wave_front_size", options.wf_size),
            ("array_count", 4),
            ("simd_arrays_per_engine", options.sa_per_complex),
            ("cu_per_simd_array", options.cu_per_sa),
            ("simd_per_cu", options.simds_per_cu),
            ("max_slots_scratch_cu", 32),
            ("vendor_id", 4098),
            ("device_id", 29440),
            ("location_id", 512),
            ("drm_render_minor", render_minor),
            (
                "max_engine_clk_fcompute",
                int(toFrequency(options.gpu_clock) / 1e6),
            ),
            ("local_mem_size", 4294967296),
            ("fw_version", 730),
            ("capability", 4736),
            (
                "max_engine_clk_ccompute",
                int(toFrequency(options.CPUClock) / 1e6),
            ),
        ],
    )

    # Fiji HBM bank.
    # TODO: Extract size, clk, and width from sim parameters.
    # Heap type value taken from a real system; heap types:
    # https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/blob/roc-4.0.x/include/hsakmttypes.h#L317
    device_bank = make_clean(device, "mem_banks/0")
    emit(
        device_bank,
        "properties",
        [
            ("heap_type", 1),
            ("size_in_bytes", 4294967296),
            ("flags", 0),
            ("width", 4096),
            ("mem_clk_max", 500),
        ],
    )
| |
| |
def createCarrizoTopology(options):
    """Write a spoofed single-node APU KFD topology tree.

    All files are created below ``m5.options.outdir``. A single node
    (``nodes/0``) carries both the CPU core count and the GPU SIMD
    description, plus one ``mem_banks/<i>`` entry per memory bank.

    ``options.gfx_version`` selects the marketing name and device id and must
    be one of ``"gfx801"`` or ``"gfx902"``; any other value raises
    ``KeyError``. Other values read from ``options``: ``num_cpus``,
    ``num_compute_units``, ``simds_per_cu``, ``wfs_per_simd``, ``lds_size``,
    ``wf_size``, ``sa_per_complex``, ``cu_per_sa``, ``gpu_clock``,
    ``CPUClock`` and ``mem_size``.
    """
    topology_dir = joinpath(
        m5.options.outdir, "fs/sys/devices/virtual/kfd/kfd/topology"
    )
    remake_dir(topology_dir)

    # Ripped from real Kaveri platform to appease kmt version checks
    # Set up generation_id
    file_append((topology_dir, "generation_id"), 1)

    # Set up system properties
    sys_prop = (
        "platform_oem 2314885673410447169\n"
        + "platform_id 35322352389441\n"
        + "platform_rev 1\n"
    )
    file_append((topology_dir, "system_properties"), sys_prop)

    # Populate the topology tree
    # TODO: Just the bare minimum to pass for now
    node_dir = joinpath(topology_dir, "nodes/0")
    remake_dir(node_dir)

    # must show valid kaveri gpu id or massive meltdown
    file_append((node_dir, "gpu_id"), 2765)

    gfx_dict = {
        "gfx801": {"name": "Carrizo\n", "id": 39028},
        "gfx902": {"name": "Raven\n", "id": 5597},
    }
    # Single lookup; raises KeyError for an unsupported gfx_version.
    gfx_info = gfx_dict[options.gfx_version]

    # must have marketing name
    file_append((node_dir, "name"), gfx_info["name"])

    mem_banks_cnt = 1

    # Should be the same as the render driver filename (dri/renderD<drm_num>)
    drm_num = 128

    device_id = gfx_info["id"]

    # populate global node properties
    # NOTE: SIMD count triggers a valid GPU agent creation
    node_prop = (
        "cpu_cores_count %s\n" % options.num_cpus
        + "simd_count %s\n"
        % (options.num_compute_units * options.simds_per_cu)
        + "mem_banks_count %s\n" % mem_banks_cnt
        + "caches_count 0\n"
        + "io_links_count 0\n"
        + "cpu_core_id_base 16\n"
        + "simd_id_base 2147483648\n"
        + "max_waves_per_simd %s\n" % options.wfs_per_simd
        + "lds_size_in_kb %s\n" % int(options.lds_size / 1024)
        + "gds_size_in_kb 0\n"
        + "wave_front_size %s\n" % options.wf_size
        + "array_count 1\n"
        + "simd_arrays_per_engine %s\n" % options.sa_per_complex
        + "cu_per_simd_array %s\n" % options.cu_per_sa
        + "simd_per_cu %s\n" % options.simds_per_cu
        + "max_slots_scratch_cu 32\n"
        + "vendor_id 4098\n"
        + "device_id %s\n" % device_id
        + "location_id 8\n"
        + "drm_render_minor %s\n" % drm_num
        + "max_engine_clk_fcompute %s\n"
        % int(toFrequency(options.gpu_clock) / 1e6)
        + "local_mem_size 0\n"
        + "fw_version 699\n"
        + "capability 4738\n"
        + "max_engine_clk_ccompute %s\n"
        % int(toFrequency(options.CPUClock) / 1e6)
    )

    file_append((node_dir, "properties"), node_prop)

    for i in range(mem_banks_cnt):
        mem_dir = joinpath(node_dir, f"mem_banks/{i}")
        remake_dir(mem_dir)

        # Heap type value taken from real system, heap type values:
        # https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/blob/roc-4.0.x/include/hsakmttypes.h#L317
        # Every property sits on its own line; the size_in_bytes line used to
        # lack its trailing newline and ran into the "flags" line.
        mem_prop = (
            "heap_type 0\n"
            + f"size_in_bytes {toMemorySize(options.mem_size)}\n"
            + "flags 0\n"
            + "width 64\n"
            + "mem_clk_max 1600\n"
        )
        file_append((mem_dir, "properties"), mem_prop)