#
# Copyright (c) 2018 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Authors: Tuan Ta, Xianwei Zhang
#
import m5
from m5.objects import *
from m5.defines import buildEnv
from m5.util import addToPath
import os, optparse, sys
addToPath('../')
from common import Options
from ruby import Ruby
# Get paths we might need. It's expected this file is in m5/configs/example.
config_path = os.path.dirname(os.path.abspath(__file__))
config_root = os.path.dirname(config_path)
m5_root = os.path.dirname(config_root)
parser = optparse.OptionParser()
Options.addNoISAOptions(parser)
# GPU Ruby tester options
parser.add_option("--cache-size", type="int", default=0,
help="Cache sizes to use. Small encourages races between \
requests and writebacks. Large stresses write-through \
and/or write-back GPU caches. Range [0..1]")
parser.add_option("--system-size", type="int", default=0,
help="This option defines how many CUs, CPUs and cache \
components in the test system. Range[0..2]")
parser.add_option("--address-range", type="int", default=0,
help="This option defines the number of atomic \
locations that affects the working set's size. \
A small number of atomic locations encourage more \
races among threads. The large option stresses cache \
resources. Range [0..1]")
parser.add_option("--episode-length", type="int", default=0,
help="This option defines the number of LDs and \
STs in an episode. The small option encourages races \
between the start and end of an episode. The long \
option encourages races between LDs and STs in the \
same episode. Range [0..2]")
parser.add_option("--test-length", type="int", default=1,
help="The number of episodes to be executed by each \
wavefront. This determines the maximum number, i.e., \
val X #WFs, of episodes to be executed in the test.")
parser.add_option("--debug-tester", action='store_true',
help="This option will turn on DRF checker")
parser.add_option("--random-seed", type="int", default=0,
help="Random seed number. Default value (i.e., 0) means \
using runtime-specific value")
parser.add_option("--log-file", type="string", default="gpu-ruby-test.log")
# GPU configurations
parser.add_option("--wf-size", type="int", default=64, help="wavefront size")
parser.add_option("-w", "--wavefronts-per-cu", type="int", default=1,
help="Number of wavefronts per cu")
parser.add_option("--cu-per-sqc", type="int", default=4,
help="number of CUs sharing an SQC")
parser.add_option("--cu-per-scalar-cache", type="int", default=4,
help="number of CUs sharing an scalar cache")
parser.add_option("--cu-per-sa", type="int", default=4,
help="number of CUs per shader array \
This must be a multiple of options.cu-per-sqc and \
options.cu-per-scalar")
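#
# Example invocation (the binary path and protocol build below are
# assumptions; any gem5 build with a GPU Ruby protocol should work):
#   build/GCN3_X86/gem5.opt <path-to-this-script> \
#       --cache-size=1 --system-size=2 --episode-length=1 --test-length=10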
#
# Add the ruby specific and protocol specific options
#
Ruby.define_options(parser)
exec(open(os.path.join(config_root, "common", "Options.py")).read())
(options, args) = parser.parse_args()
#
# Set the default cache size and associativity to be very small to encourage
# races between requests and writebacks.
#
options.l1d_size="256B"
options.l1i_size="256B"
options.l2_size="512B"
options.l3_size="1kB"
options.l1d_assoc=2
options.l1i_assoc=2
options.l2_assoc=2
options.l3_assoc=2
#
# Set up cache size - 2 options
# 0: small cache
# 1: large cache
#
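# (in gem5's AMD GPU Ruby protocols, TCP is the per-CU L1 vector data cache
# and TCC is the shared L2 cache)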
if (options.cache_size == 0):
    options.tcp_size="256B"
    options.tcp_assoc=2
    options.tcc_size="1kB"
    options.tcc_assoc=2
elif (options.cache_size == 1):
    options.tcp_size="256kB"
    options.tcp_assoc=16
    options.tcc_size="1024kB"
    options.tcc_assoc=16
else:
    print("Error: option cache_size '%s' not recognized" %
          options.cache_size)
    sys.exit(1)
#
# Set up system size - 3 options
#
if (options.system_size == 0):
    # 1 CU, 1 CPU, 1 SQC, 1 scalar cache
    options.wf_size = 1
    options.wavefronts_per_cu = 1
    options.num_cpus = 1
    options.cu_per_sqc = 1
    options.cu_per_scalar_cache = 1
    options.num_compute_units = 1
elif (options.system_size == 1):
    # 4 CUs, 4 CPUs, 1 SQC, 1 scalar cache
    options.wf_size = 16
    options.wavefronts_per_cu = 4
    options.num_cpus = 4
    options.cu_per_sqc = 4
    options.cu_per_scalar_cache = 4
    options.num_compute_units = 4
elif (options.system_size == 2):
    # 8 CUs, 4 CPUs, 2 SQCs, 2 scalar caches
    options.wf_size = 32
    options.wavefronts_per_cu = 4
    options.num_cpus = 4
    options.cu_per_sqc = 4
    options.cu_per_scalar_cache = 4
    options.num_compute_units = 8
else:
    print("Error: option system_size '%s' not recognized" %
          options.system_size)
    sys.exit(1)
#
# set address range - 2 options
# level 0: small
# level 1: large
# each location corresponds to a 4-byte piece of data
#
options.mem_size = '1024MB'
num_atomic_locs = 10
num_regular_locs_per_atomic_loc = 10000
if (options.address_range == 1):
    num_atomic_locs = 100
    num_regular_locs_per_atomic_loc = 100000
elif (options.address_range != 0):
    print("Error: option address_range '%s' not recognized" %
          options.address_range)
    sys.exit(1)
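# At 4 bytes per location, the small working set is roughly
# 10 x 10000 x 4B = ~400KB, and the large one is roughly
# 100 x 100000 x 4B = ~40MB.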
#
# set episode length (# of actions per episode) - 3 options
# 0: 10 actions
# 1: 100 actions
# 2: 500 actions
#
eps_length = 10
if (options.episode_length == 1):
    eps_length = 100
elif (options.episode_length == 2):
    eps_length = 500
elif (options.episode_length != 0):
    print("Error: option episode_length '%s' not recognized" %
          options.episode_length)
    sys.exit(1)
# set Ruby's and the tester's deadlock thresholds
# Ruby's deadlock detection is the primary check for deadlock; the tester's
# deadlock threshold is a secondary check. If there is a bug in RubyPort
# that causes a packet not to return to the tester properly, the tester
# throws a deadlock exception.
# We set cache_deadlock_threshold < tester_deadlock_threshold so that a
# deadlock caused by the Ruby protocol is detected before one caused by the
# coalescer.
options.cache_deadlock_threshold = 100000000
tester_deadlock_threshold = 1000000000
# for now, we test only the GPU protocol, so we set num_cpus to 0
options.num_cpus = 0
# number of CPUs and CUs
n_CPUs = options.num_cpus
n_CUs = options.num_compute_units
# set test length, i.e., number of episodes per wavefront * #WFs
# test length can be 1x#WFs, 10x#WFs, 100x#WFs, ...
n_WFs = n_CUs * options.wavefronts_per_cu
max_episodes = options.test_length * n_WFs
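# e.g., with --system-size=1 (4 CUs x 4 WFs/CU = 16 WFs) and
# --test-length=10, at most 160 episodes are executed in the test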
# number of SQCs and scalar caches
assert(n_CUs % options.cu_per_sqc == 0)
n_SQCs = int(n_CUs/options.cu_per_sqc)
options.num_sqc = n_SQCs
assert(n_CUs % options.cu_per_scalar_cache == 0)
n_Scalars = int(n_CUs/options.cu_per_scalar_cache)
# for now, we only set CUs and SQCs
# TODO: add scalars if necessary
n_Scalars = 0
options.num_scalar_cache = n_Scalars
if n_Scalars == 0:
    options.cu_per_scalar_cache = 0
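# keep cu_per_scalar_cache consistent with having no scalar caches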
if args:
    print("Error: script doesn't take any positional arguments")
    sys.exit(1)
#
# Create GPU Ruby random tester
#
tester = ProtocolTester(cus_per_sqc = options.cu_per_sqc,
                        cus_per_scalar = options.cu_per_scalar_cache,
                        wavefronts_per_cu = options.wavefronts_per_cu,
                        workitems_per_wavefront = options.wf_size,
                        num_atomic_locations = num_atomic_locs,
                        num_normal_locs_per_atomic = \
                            num_regular_locs_per_atomic_loc,
                        max_num_episodes = max_episodes,
                        episode_length = eps_length,
                        debug_tester = options.debug_tester,
                        random_seed = options.random_seed,
                        log_file = options.log_file)
#
# Create the M5 system. Note that the memory object isn't actually
# used by the tester, but is included to support
# the M5 memory size == Ruby memory size checks
#
# The system doesn't have real CPUs or CUs.
# It just has a tester that has physical ports to be connected to Ruby
#
system = System(cpu = tester,
                mem_ranges = [AddrRange(options.mem_size)],
                cache_line_size = options.cacheline_size,
                mem_mode = 'timing')
system.voltage_domain = VoltageDomain(voltage = options.sys_voltage)
system.clk_domain = SrcClockDomain(clock = options.sys_clock,
                                   voltage_domain = system.voltage_domain)
options.num_cp = 0
#
# Create the Ruby system
#
Ruby.create_system(options, False, system)
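# Ruby.create_system attaches the generated sequencer ports to
# system.ruby._cpu_ports, which the tester connects to below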
#
# The tester is most effective when randomization is turned on and
# artificial delay is randomly inserted on messages
#
system.ruby.randomization = True
# assert that we got the right number of Ruby ports
assert(len(system.ruby._cpu_ports) == n_CPUs + n_CUs + n_SQCs + n_Scalars)
#
# attach Ruby ports to the tester
# in the order: cpu_sequencers,
# vector_coalescers,
# sqc_sequencers,
# scalar_sequencers
#
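# index ranges in the loop below, given the ordering above:
#   [0, n_CPUs)                         -> cpu_ports
#   [n_CPUs, n_CPUs+n_CUs)              -> cu_vector_ports
#   [n_CPUs+n_CUs, n_CPUs+n_CUs+n_SQCs) -> cu_sqc_ports
#   remaining ports                     -> cu_scalar_ports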
print("Attaching ruby ports to the tester")
i = 0
for ruby_port in system.ruby._cpu_ports:
    ruby_port.no_retry_on_stall = True
    ruby_port.using_ruby_tester = True
    if i < n_CPUs:
        tester.cpu_ports = ruby_port.slave
    elif i < (n_CPUs + n_CUs):
        tester.cu_vector_ports = ruby_port.slave
    elif i < (n_CPUs + n_CUs + n_SQCs):
        tester.cu_sqc_ports = ruby_port.slave
    else:
        tester.cu_scalar_ports = ruby_port.slave
    i += 1
#
# Create CPU threads
#
thread_clock = SrcClockDomain(clock = '1GHz',
                              voltage_domain = system.voltage_domain)
cpu_threads = []
print("Creating %i CpuThreads" % n_CPUs)
for cpu_idx in range(n_CPUs):
    cpu_threads.append(CpuThread(thread_id = cpu_idx,
                                 num_lanes = 1, # CPU thread is scalar
                                 clk_domain = thread_clock,
                                 deadlock_threshold = \
                                     tester_deadlock_threshold))
tester.cpu_threads = cpu_threads
#
# Create GPU wavefronts
#
wavefronts = []
g_thread_idx = n_CPUs
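# wavefront thread ids continue after the CPU thread ids, so every tester
# thread gets a unique id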
print("Creating %i WFs attached to %i CUs" % \
(n_CUs * tester.wavefronts_per_cu, n_CUs))
for cu_idx in range(n_CUs):
for wf_idx in range(tester.wavefronts_per_cu):
wavefronts.append(GpuWavefront(thread_id = g_thread_idx,
cu_id = cu_idx,
num_lanes = options.wf_size,
clk_domain = thread_clock,
deadlock_threshold = \
tester_deadlock_threshold))
g_thread_idx += 1
tester.wavefronts = wavefronts
# -----------------------
# run simulation
# -----------------------
root = Root( full_system = False, system = system )
# Not much point in this being higher than the L1 latency
m5.ticks.setGlobalFrequency('1ns')
# instantiate configuration
m5.instantiate()
# simulate until program terminates
exit_event = m5.simulate(options.abs_max_tick)
print('Exiting tick: ', m5.curTick())
print('Exiting because ', exit_event.getCause())