blob: 25d5a518923e65daffafb48a3d1753e9b7602a9c [file] [log] [blame]
# Copyright (c) 2018-2021 Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import m5
from m5.objects import *
from m5.defines import buildEnv
from m5.util import addToPath
import os, argparse, sys
addToPath("../")
from common import Options
from ruby import Ruby
#
# Add the ruby specific and protocol specific options
#
parser = argparse.ArgumentParser()
Options.addNoISAOptions(parser)
Ruby.define_options(parser)
# GPU Ruby tester options
parser.add_argument(
"--cache-size",
default="small",
choices=["small", "large"],
help="Cache sizes to use. Small encourages races between \
requests and writebacks. Large stresses write-through \
and/or write-back GPU caches.",
)
parser.add_argument(
"--system-size",
default="small",
choices=["small", "medium", "large"],
help="This option defines how many CUs, CPUs and cache \
components in the test system.",
)
parser.add_argument(
"--address-range",
default="small",
choices=["small", "large"],
help="This option defines the number of atomic \
locations that affects the working set's size. \
A small number of atomic locations encourage more \
races among threads. The large option stresses cache \
resources.",
)
parser.add_argument(
"--episode-length",
default="short",
choices=["short", "medium", "long"],
help="This option defines the number of LDs and \
STs in an episode. The small option encourages races \
between the start and end of an episode. The long \
option encourages races between LDs and STs in the \
same episode.",
)
parser.add_argument(
"--test-length",
type=int,
default=1,
help="The number of episodes to be executed by each \
wavefront. This determines the maximum number, i.e., \
val X #WFs, of episodes to be executed in the test.",
)
parser.add_argument(
"--debug-tester",
action="store_true",
help="This option will turn on DRF checker",
)
parser.add_argument(
"--random-seed",
type=int,
default=0,
help="Random seed number. Default value (i.e., 0) means \
using runtime-specific value",
)
parser.add_argument("--log-file", type=str, default="gpu-ruby-test.log")
parser.add_argument(
"--num-dmas",
type=int,
default=None,
help="The number of DMA engines to use in tester config.",
)
args = parser.parse_args()
#
# Translate the --cache-size preset into concrete TCP/TCC parameters.
#   small: tiny caches that encourage races between requests and writebacks
#   large: big caches that stress write-through and/or write-back behavior
# (argparse `choices` guarantees one of the two branches is taken)
#
if args.cache_size == "small":
    args.tcp_size = "256B"
    args.tcp_assoc = 2
    args.tcc_size = "1kB"
    args.tcc_assoc = 2
elif args.cache_size == "large":
    args.tcp_size = "256kB"
    args.tcp_assoc = 16
    args.tcc_size = "1024kB"
    args.tcc_assoc = 16
#
# Translate the --system-size preset into CU/CPU/cache topology counts.
#
if args.system_size == "small":
    # 1 CU, 1 CPU, 1 SQC, 1 Scalar
    args.wf_size = 1
    args.wavefronts_per_cu = 1
    args.num_cpus = 1
    n_DMAs = 1
    args.cu_per_sqc = 1
    args.cu_per_scalar_cache = 1
    args.num_compute_units = 1
elif args.system_size == "medium":
    # 4 CUs, 4 CPUs, 1 SQCs, 1 Scalars
    args.wf_size = 16
    args.wavefronts_per_cu = 4
    args.num_cpus = 4
    n_DMAs = 2
    args.cu_per_sqc = 4
    args.cu_per_scalar_cache = 4
    args.num_compute_units = 4
elif args.system_size == "large":
    # 8 CUs, 4 CPUs, 1 SQCs, 1 Scalars
    args.wf_size = 32
    args.wavefronts_per_cu = 4
    args.num_cpus = 4
    n_DMAs = 4
    args.cu_per_sqc = 4
    args.cu_per_scalar_cache = 4
    args.num_compute_units = 8
# An explicit --num-dmas overrides the system-size default.
if args.num_dmas is not None:
    n_DMAs = args.num_dmas
# currently the tester does not support requests returned as
# aliased, thus we need num_dmas to be 0 for it
# NOTE(review): this also warns for the default (--num-dmas unset, i.e.
# None != 0), which matches the original behavior — confirm that is intended.
if args.num_dmas != 0:
    print("WARNING: num_dmas != 0 not supported with VIPER")
#
# Working-set configuration. Each location corresponds to a 4-byte piece of
# data. A small pool of atomic locations encourages more races among threads;
# the large option stresses cache resources instead.
#
args.mem_size = "1024MB"
_address_presets = {
    "small": (10, 10000),
    "large": (100, 100000),
}
num_atomic_locs, num_regular_locs_per_atomic_loc = _address_presets[
    args.address_range
]
#
# Episode length = number of LD/ST actions per episode.
# short: 10, medium: 100, long: 500 actions.
#
eps_length = {"short": 10, "medium": 100, "long": 500}[args.episode_length]
#
# Set Ruby and tester deadlock thresholds. Ruby's deadlock detection is the
# primary check for deadlocks. The tester's deadlock threshold detection is
# a secondary check for deadlock. If there is a bug in RubyPort that causes
# a packet not to return to the tester properly, the tester will issue a
# deadlock panic. We set cache_deadlock_threshold < tester_deadlock_threshold
# to detect deadlock caused by Ruby protocol first before one caused by the
# coalescer. Both units are in Ticks.
#
args.cache_deadlock_threshold = 1e8
tester_deadlock_threshold = 1e9
# For now we're testing only GPU protocol, so we force num_cpus to be 0
# (this overrides the value chosen by the system-size preset above).
args.num_cpus = 0
# Number of CUs
n_CUs = args.num_compute_units
# Set test length, i.e., number of episodes per wavefront * #WFs.
# Test length can be 1x#WFs, 10x#WFs, 100x#WFs, ...
n_WFs = n_CUs * args.wavefronts_per_cu
max_episodes = args.test_length * n_WFs
# SQC and scalar cache counts; the CU count must divide evenly into SQCs.
assert n_CUs % args.cu_per_sqc == 0
n_SQCs = n_CUs // args.cu_per_sqc
args.num_sqc = n_SQCs
assert args.cu_per_scalar_cache != 0
n_Scalars = n_CUs // args.cu_per_scalar_cache
args.num_scalar_cache = n_Scalars
#
# Create GPU Ruby random tester
#
tester = ProtocolTester(
    cus_per_sqc=args.cu_per_sqc,
    cus_per_scalar=args.cu_per_scalar_cache,
    wavefronts_per_cu=args.wavefronts_per_cu,
    workitems_per_wavefront=args.wf_size,
    num_atomic_locations=num_atomic_locs,
    num_normal_locs_per_atomic=num_regular_locs_per_atomic_loc,
    max_num_episodes=max_episodes,
    episode_length=eps_length,
    debug_tester=args.debug_tester,
    random_seed=args.random_seed,
    log_file=args.log_file,
)
#
# Create a gem5 system. Note that the memory object isn't actually used by the
# tester, but is included to ensure the gem5 memory size == Ruby memory size
# checks. The system doesn't have real CPUs or CUs. It just has a tester that
# has physical ports to be connected to Ruby
#
system = System(
    cpu=tester,
    mem_ranges=[AddrRange(args.mem_size)],
    cache_line_size=args.cacheline_size,
    mem_mode="timing",
)
system.voltage_domain = VoltageDomain(voltage=args.sys_voltage)
system.clk_domain = SrcClockDomain(
    clock=args.sys_clock, voltage_domain=system.voltage_domain
)
#
# Command processor is not needed for the tester since we don't run real
# kernels. Setting it to zero disables the VIPER protocol from creating
# a command processor and its caches.
#
args.num_cp = 0
#
# Make generic DMA sequencer for Ruby to use
#
if n_DMAs > 0:
    # Build each engine with a comprehension so every element is a distinct
    # SimObject; `[TesterDma()] * n_DMAs` would alias ONE object n_DMAs times.
    dma_devices = [TesterDma() for _ in range(n_DMAs)]
    system.piobus = IOXBar()
    for dma_device in dma_devices:
        dma_device.pio = system.piobus.mem_side_ports
    system.dma_devices = dma_devices
#
# Create the Ruby system
#
# the ruby tester reuses num_cpus to specify the
# number of cpu ports connected to the tester object, which
# is stored in system.cpu. because there is only ever one
# tester object, num_cpus is not necessarily equal to the
# size of system.cpu
cpu_list = [system.cpu] * args.num_cpus
Ruby.create_system(
    args,
    full_system=False,
    system=system,
    dma_ports=system.dma_devices if n_DMAs > 0 else [],
    cpus=cpu_list,
)
#
# The tester is most effective when randomization is turned on and
# artificial delay is randomly inserted on messages
#
system.ruby.randomization = True
# Assert that we got the right number of Ruby ports
assert len(system.ruby._cpu_ports) == n_CUs + n_SQCs + n_Scalars
# Token pool available to the tester's coalescer; loop-invariant, so it is
# set once here instead of being re-assigned on every CU port iteration.
tester.max_cu_tokens = 4 * n_WFs
#
# Attach Ruby ports to the tester in the order:
#               cpu_sequencers,
#               vector_coalescers,
#               sqc_sequencers,
#               scalar_sequencers
#
# Note that this requires the protocol to create sequencers in this order
#
print("Attaching ruby ports to the tester")
for i, ruby_port in enumerate(system.ruby._cpu_ports):
    ruby_port.no_retry_on_stall = True
    ruby_port.using_ruby_tester = True
    # piobus is only created if there are DMAs
    if n_DMAs > 0:
        ruby_port.mem_request_port = system.piobus.cpu_side_ports
    # Vector-port assignments below append one connection per iteration.
    if i < n_CUs:
        tester.cu_vector_ports = ruby_port.in_ports
        tester.cu_token_ports = ruby_port.gmTokenPort
    elif i < (n_CUs + n_SQCs):
        tester.cu_sqc_ports = ruby_port.in_ports
    else:
        tester.cu_scalar_ports = ruby_port.in_ports
    # (A stray manual `i += 1` was removed: enumerate() already advances i,
    # and the rebinding was overwritten on every iteration anyway.)
#
# Attach DMA ports. Since Ruby.py doesn't return these they need to be found.
# Connect tester's request port to each DMA sequencer's in_ports. This assumes
# the protocol names these system.dma_cntrl<#>.
#
tester.dma_ports = [
    getattr(system, "dma_cntrl" + str(dma_idx)).dma_sequencer.in_ports
    for dma_idx in range(n_DMAs)
]
#
# Clock domain shared by every tester thread (DMA threads and wavefronts).
#
thread_clock = SrcClockDomain(
    clock="1GHz", voltage_domain=system.voltage_domain
)
# Thread ids are assigned sequentially: DMA threads first, then wavefronts.
g_thread_idx = 0
#
# The GPU tester drives no CPU threads.
#
tester.cpu_threads = []
#
# Create one DMA thread per DMA engine.
#
print("Creating %i DMAs" % n_DMAs)
tester.dma_threads = [
    DmaThread(
        thread_id=g_thread_idx + dma_idx,
        num_lanes=1,
        clk_domain=thread_clock,
        deadlock_threshold=tester_deadlock_threshold,
    )
    for dma_idx in range(n_DMAs)
]
g_thread_idx += n_DMAs
#
# Create the GPU wavefronts, wavefronts_per_cu of them for each CU.
#
print(
    "Creating %i WFs attached to %i CUs"
    % (n_CUs * tester.wavefronts_per_cu, n_CUs)
)
wavefronts = []
for cu in range(n_CUs):
    for _ in range(tester.wavefronts_per_cu):
        wavefronts.append(
            GpuWavefront(
                thread_id=g_thread_idx,
                cu_id=cu,
                num_lanes=args.wf_size,
                clk_domain=thread_clock,
                deadlock_threshold=tester_deadlock_threshold,
            )
        )
        g_thread_idx += 1
tester.wavefronts = wavefronts
#
# Build the root object and run the simulation to completion.
#
root = Root(full_system=False, system=system)
# Not much point in this being higher than the L1 latency
m5.ticks.setGlobalFrequency("1ns")
# Elaborate the configured object hierarchy.
m5.instantiate()
# Run until the tester (or a deadlock panic) ends the simulation, then
# report where and why it stopped.
sim_event = m5.simulate()
print("Exiting tick: ", m5.curTick())
print("Exiting because ", sim_event.getCause())