configs/dram/lat_mem_rd.py - public/gem5 - Git at Google

 # Copyright (c) 2015-2016 ARM Limited
 # All rights reserved.
 #
 # The license below extends only to copyright in the software and shall
 # not be construed as granting a license to any other intellectual
 # property including but not limited to intellectual property relating
 # to a hardware implementation of the functionality of the software
 # licensed hereunder.  You may use the software subject to the license
 # terms below provided that you ensure that this notice is replicated
 # unmodified and in its entirety in all distributions of the software,
 # modified or unmodified, in source code or in binary form.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are
 # met: redistributions of source code must retain the above copyright
 # notice, this list of conditions and the following disclaimer;
 # redistributions in binary form must reproduce the above copyright
 # notice, this list of conditions and the following disclaimer in the
 # documentation and/or other materials provided with the distribution;
 # neither the name of the copyright holders nor the names of its
 # contributors may be used to endorse or promote products derived from
 # this software without specific prior written permission.
 #
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 import gzip
 import argparse
 import os

 import m5
 from m5.objects import *
 from m5.util import addToPath
 from m5.stats import periodicStatDump

 addToPath("../")
 from common import ObjectList
 from common import MemConfig

 addToPath("../../util")
 import protolib

 # this script is helpful to observe the memory latency for various
 # levels in a cache hierarchy, and various cache and memory
 # configurations, in essence replicating the lmbench lat_mem_rd thrash
 # behaviour

 # import the packet proto definitions, and if they are not found,
 # attempt to generate them automatically
 try:
     import packet_pb2
 except:
     print("Did not find packet proto definitions, attempting to generate")
     from subprocess import call

     error = call(
         [
             "protoc",
             "--python_out=configs/dram",
             "--proto_path=src/proto",
             "src/proto/packet.proto",
         ]
     )
     if not error:
         print("Generated packet proto definitions")

         try:
             import google.protobuf
         except:
             print("Please install the Python protobuf module")
             exit(-1)

         import packet_pb2
     else:
         print("Failed to import packet proto definitions")
         exit(-1)

 parser = argparse.ArgumentParser()

 parser.add_argument(
     "--mem-type",
     default="DDR3_1600_8x8",
     choices=ObjectList.mem_list.get_names(),
     help="type of memory to use",
 )
 parser.add_argument(
     "--mem-size",
     action="store",
     type=str,
     default="16MB",
     help="Specify the memory size",
 )
 parser.add_argument(
     "--reuse-trace",
     action="store_true",
     help="Prevent generation of traces and reuse existing",
 )

 args = parser.parse_args()

 # start by creating the system itself, using a multi-layer 2.0 GHz
 # crossbar, delivering 64 bytes / 3 cycles (one header cycle) which
 # amounts to 42.7 GByte/s per layer and thus per port
 system = System(membus=SystemXBar(width=32))
 system.clk_domain = SrcClockDomain(
     clock="2.0GHz", voltage_domain=VoltageDomain(voltage="1V")
 )

 mem_range = AddrRange(args.mem_size)
 system.mem_ranges = [mem_range]

 # do not worry about reserving space for the backing store
 system.mmap_using_noreserve = True

 # currently not exposed as command-line args, set here for now
 args.mem_channels = 1
 args.mem_ranks = 1
 args.external_memory_system = 0
 args.tlm_memory = 0
 args.elastic_trace_en = 0

 MemConfig.config_mem(args, system)

 # there is no point slowing things down by saving any data
 for ctrl in system.mem_ctrls:
     ctrl.null = True

     # the following assumes that we are using the native DRAM
     # controller, check to be sure
     if isinstance(ctrl, m5.objects.MemCtrl):
         # make the DRAM refresh interval sufficiently infinite to avoid
         # latency spikes
         ctrl.tREFI = "100s"

 # use the same concept as the utilisation sweep, and print the config
 # so that we can later read it in
 cfg_file_name = os.path.join(m5.options.outdir, "lat_mem_rd.cfg")
 cfg_file = open(cfg_file_name, "w")

 # set an appropriate burst length in bytes
 burst_size = 64
 system.cache_line_size = burst_size

 # lazy version to check if an integer is a power of two
 def is_pow2(num):
     return num != 0 and ((num & (num - 1)) == 0)


 # assume we start every range at 0
 max_range = int(mem_range.end)

 # start at a size of 4 kByte, and go up till we hit the max, increase
 # the step every time we hit a power of two
 min_range = 4096
 ranges = [min_range]
 step = 1024

 while ranges[-1] < max_range:
     new_range = ranges[-1] + step
     if is_pow2(new_range):
         step *= 2
     ranges.append(new_range)

 # how many times to repeat the measurement for each data point
 iterations = 2

 # 150 ns in ticks, this is choosen to be high enough that transactions
 # do not pile up in the system, adjust if needed
 itt = 150 * 1000

 # for every data point, we create a trace containing a random address
 # sequence, so that we can play back the same sequence for warming and
 # the actual measurement
 def create_trace(filename, max_addr, burst_size, itt):
     try:
         proto_out = gzip.open(filename, "wb")
     except IOError:
         print("Failed to open ", filename, " for writing")
         exit(-1)

     # write the magic number in 4-byte Little Endian, similar to what
     # is done in src/proto/protoio.cc
     proto_out.write("gem5")

     # add the packet header
     header = packet_pb2.PacketHeader()
     header.obj_id = "lat_mem_rd for range 0:" + str(max_addr)
     # assume the default tick rate (1 ps)
     header.tick_freq = 1000000000000
     protolib.encodeMessage(proto_out, header)

     # create a list of every single address to touch
     addrs = list(range(0, max_addr, burst_size))

     import random

     random.shuffle(addrs)

     tick = 0

     # create a packet we can re-use for all the addresses
     packet = packet_pb2.Packet()
     # ReadReq is 1 in src/mem/packet.hh Command enum
     packet.cmd = 1
     packet.size = int(burst_size)

     for addr in addrs:
         packet.tick = int(tick)
         packet.addr = int(addr)
         protolib.encodeMessage(proto_out, packet)
         tick = tick + itt

     proto_out.close()


 # this will take a while, so keep the user informed
 print("Generating traces, please wait...")

 nxt_range = 0
 nxt_state = 0
 period = int(itt * (max_range / burst_size))

 # now we create the states for each range
 for r in ranges:
     filename = os.path.join(
         m5.options.outdir, "lat_mem_rd%d.trc.gz" % nxt_range
     )

     if not args.reuse_trace:
         # create the actual random trace for this range
         create_trace(filename, r, burst_size, itt)

     # the warming state
     cfg_file.write("STATE %d %d TRACE %s 0\n" % (nxt_state, period, filename))
     nxt_state = nxt_state + 1

     # the measuring states
     for i in range(iterations):
         cfg_file.write(
             "STATE %d %d TRACE %s 0\n" % (nxt_state, period, filename)
         )
         nxt_state = nxt_state + 1

     nxt_range = nxt_range + 1

 cfg_file.write("INIT 0\n")

 # go through the states one by one
 for state in range(1, nxt_state):
     cfg_file.write("TRANSITION %d %d 1\n" % (state - 1, state))

 cfg_file.write("TRANSITION %d %d 1\n" % (nxt_state - 1, nxt_state - 1))

 cfg_file.close()

 # create a traffic generator, and point it to the file we just created
 system.tgen = TrafficGen(config_file=cfg_file_name, progress_check="10s")

 # add a communication monitor
 system.monitor = CommMonitor()
 system.monitor.footprint = MemFootprintProbe()

 # connect the traffic generator to the system
 system.tgen.port = system.monitor.cpu_side_port

 # create the actual cache hierarchy, for now just go with something
 # basic to explore some of the options
 from common.Caches import *

 # a starting point for an L3 cache
 class L3Cache(Cache):
     assoc = 16
     tag_latency = 20
     data_latency = 20
     sequential_access = True
     response_latency = 40
     mshrs = 32
     tgts_per_mshr = 12
     write_buffers = 16


 # note that everything is in the same clock domain, 2.0 GHz as
 # specified above
 system.l1cache = L1_DCache(size="64kB")
 system.monitor.mem_side_port = system.l1cache.cpu_side

 system.l2cache = L2Cache(size="512kB", writeback_clean=True)
 system.l2cache.xbar = L2XBar()
 system.l1cache.mem_side = system.l2cache.xbar.cpu_side_ports
 system.l2cache.cpu_side = system.l2cache.xbar.mem_side_ports

 # make the L3 mostly exclusive, and correspondingly ensure that the L2
 # writes back also clean lines to the L3
 system.l3cache = L3Cache(size="4MB", clusivity="mostly_excl")
 system.l3cache.xbar = L2XBar()
 system.l2cache.mem_side = system.l3cache.xbar.cpu_side_ports
 system.l3cache.cpu_side = system.l3cache.xbar.mem_side_ports
 system.l3cache.mem_side = system.membus.cpu_side_ports

 # connect the system port even if it is not used in this example
 system.system_port = system.membus.cpu_side_ports

 # every period, dump and reset all stats
 periodicStatDump(period)

 # run Forrest, run!
 root = Root(full_system=False, system=system)
 root.system.mem_mode = "timing"

 m5.instantiate()
 m5.simulate(nxt_state * period)

 # print all we need to make sense of the stats output
 print("lat_mem_rd with %d iterations, ranges:" % iterations)
 for r in ranges:
     print(r)
	# Copyright (c) 2015-2016 ARM Limited
	# All rights reserved.
	#
	# The license below extends only to copyright in the software and shall
	# not be construed as granting a license to any other intellectual
	# property including but not limited to intellectual property relating
	# to a hardware implementation of the functionality of the software
	# licensed hereunder. You may use the software subject to the license
	# terms below provided that you ensure that this notice is replicated
	# unmodified and in its entirety in all distributions of the software,
	# modified or unmodified, in source code or in binary form.
	#
	# Redistribution and use in source and binary forms, with or without
	# modification, are permitted provided that the following conditions are
	# met: redistributions of source code must retain the above copyright
	# notice, this list of conditions and the following disclaimer;
	# redistributions in binary form must reproduce the above copyright
	# notice, this list of conditions and the following disclaimer in the
	# documentation and/or other materials provided with the distribution;
	# neither the name of the copyright holders nor the names of its
	# contributors may be used to endorse or promote products derived from
	# this software without specific prior written permission.
	#
	# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	import gzip
	import argparse
	import os

	import m5
	from m5.objects import *
	from m5.util import addToPath
	from m5.stats import periodicStatDump

	addToPath("../")
	from common import ObjectList
	from common import MemConfig

	addToPath("../../util")
	import protolib

	# this script is helpful to observe the memory latency for various
	# levels in a cache hierarchy, and various cache and memory
	# configurations, in essence replicating the lmbench lat_mem_rd thrash
	# behaviour

	# import the packet proto definitions, and if they are not found,
	# attempt to generate them automatically
	try:
	import packet_pb2
	except:
	print("Did not find packet proto definitions, attempting to generate")
	from subprocess import call

	error = call(
	[
	"protoc",
	"--python_out=configs/dram",
	"--proto_path=src/proto",
	"src/proto/packet.proto",
	]
	)
	if not error:
	print("Generated packet proto definitions")

	try:
	import google.protobuf
	except:
	print("Please install the Python protobuf module")
	exit(-1)

	import packet_pb2
	else:
	print("Failed to import packet proto definitions")
	exit(-1)

	parser = argparse.ArgumentParser()

	parser.add_argument(
	"--mem-type",
	default="DDR3_1600_8x8",
	choices=ObjectList.mem_list.get_names(),
	help="type of memory to use",
	)
	parser.add_argument(
	"--mem-size",
	action="store",
	type=str,
	default="16MB",
	help="Specify the memory size",
	)
	parser.add_argument(
	"--reuse-trace",
	action="store_true",
	help="Prevent generation of traces and reuse existing",
	)

	args = parser.parse_args()

	# start by creating the system itself, using a multi-layer 2.0 GHz
	# crossbar, delivering 64 bytes / 3 cycles (one header cycle) which
	# amounts to 42.7 GByte/s per layer and thus per port
	system = System(membus=SystemXBar(width=32))
	system.clk_domain = SrcClockDomain(
	clock="2.0GHz", voltage_domain=VoltageDomain(voltage="1V")
	)

	mem_range = AddrRange(args.mem_size)
	system.mem_ranges = [mem_range]

	# do not worry about reserving space for the backing store
	system.mmap_using_noreserve = True

	# currently not exposed as command-line args, set here for now
	args.mem_channels = 1
	args.mem_ranks = 1
	args.external_memory_system = 0
	args.tlm_memory = 0
	args.elastic_trace_en = 0

	MemConfig.config_mem(args, system)

	# there is no point slowing things down by saving any data
	for ctrl in system.mem_ctrls:
	ctrl.null = True

	# the following assumes that we are using the native DRAM
	# controller, check to be sure
	if isinstance(ctrl, m5.objects.MemCtrl):
	# make the DRAM refresh interval sufficiently infinite to avoid
	# latency spikes
	ctrl.tREFI = "100s"

	# use the same concept as the utilisation sweep, and print the config
	# so that we can later read it in
	cfg_file_name = os.path.join(m5.options.outdir, "lat_mem_rd.cfg")
	cfg_file = open(cfg_file_name, "w")

	# set an appropriate burst length in bytes
	burst_size = 64
	system.cache_line_size = burst_size

	# lazy version to check if an integer is a power of two
	def is_pow2(num):
	return num != 0 and ((num & (num - 1)) == 0)


	# assume we start every range at 0
	max_range = int(mem_range.end)

	# start at a size of 4 kByte, and go up till we hit the max, increase
	# the step every time we hit a power of two
	min_range = 4096
	ranges = [min_range]
	step = 1024

	while ranges[-1] < max_range:
	new_range = ranges[-1] + step
	if is_pow2(new_range):
	step *= 2
	ranges.append(new_range)

	# how many times to repeat the measurement for each data point
	iterations = 2

	# 150 ns in ticks, this is choosen to be high enough that transactions
	# do not pile up in the system, adjust if needed
	itt = 150 * 1000

	# for every data point, we create a trace containing a random address
	# sequence, so that we can play back the same sequence for warming and
	# the actual measurement
	def create_trace(filename, max_addr, burst_size, itt):
	try:
	proto_out = gzip.open(filename, "wb")
	except IOError:
	print("Failed to open ", filename, " for writing")
	exit(-1)

	# write the magic number in 4-byte Little Endian, similar to what
	# is done in src/proto/protoio.cc
	proto_out.write("gem5")

	# add the packet header
	header = packet_pb2.PacketHeader()
	header.obj_id = "lat_mem_rd for range 0:" + str(max_addr)
	# assume the default tick rate (1 ps)
	header.tick_freq = 1000000000000
	protolib.encodeMessage(proto_out, header)

	# create a list of every single address to touch
	addrs = list(range(0, max_addr, burst_size))

	import random

	random.shuffle(addrs)

	tick = 0

	# create a packet we can re-use for all the addresses
	packet = packet_pb2.Packet()
	# ReadReq is 1 in src/mem/packet.hh Command enum
	packet.cmd = 1
	packet.size = int(burst_size)

	for addr in addrs:
	packet.tick = int(tick)
	packet.addr = int(addr)
	protolib.encodeMessage(proto_out, packet)
	tick = tick + itt

	proto_out.close()


	# this will take a while, so keep the user informed
	print("Generating traces, please wait...")

	nxt_range = 0
	nxt_state = 0
	period = int(itt * (max_range / burst_size))

	# now we create the states for each range
	for r in ranges:
	filename = os.path.join(
	m5.options.outdir, "lat_mem_rd%d.trc.gz" % nxt_range
	)

	if not args.reuse_trace:
	# create the actual random trace for this range
	create_trace(filename, r, burst_size, itt)

	# the warming state
	cfg_file.write("STATE %d %d TRACE %s 0\n" % (nxt_state, period, filename))
	nxt_state = nxt_state + 1

	# the measuring states
	for i in range(iterations):
	cfg_file.write(
	"STATE %d %d TRACE %s 0\n" % (nxt_state, period, filename)
	)
	nxt_state = nxt_state + 1

	nxt_range = nxt_range + 1

	cfg_file.write("INIT 0\n")

	# go through the states one by one
	for state in range(1, nxt_state):
	cfg_file.write("TRANSITION %d %d 1\n" % (state - 1, state))

	cfg_file.write("TRANSITION %d %d 1\n" % (nxt_state - 1, nxt_state - 1))

	cfg_file.close()

	# create a traffic generator, and point it to the file we just created
	system.tgen = TrafficGen(config_file=cfg_file_name, progress_check="10s")

	# add a communication monitor
	system.monitor = CommMonitor()
	system.monitor.footprint = MemFootprintProbe()

	# connect the traffic generator to the system
	system.tgen.port = system.monitor.cpu_side_port

	# create the actual cache hierarchy, for now just go with something
	# basic to explore some of the options
	from common.Caches import *

	# a starting point for an L3 cache
	class L3Cache(Cache):
	assoc = 16
	tag_latency = 20
	data_latency = 20
	sequential_access = True
	response_latency = 40
	mshrs = 32
	tgts_per_mshr = 12
	write_buffers = 16


	# note that everything is in the same clock domain, 2.0 GHz as
	# specified above
	system.l1cache = L1_DCache(size="64kB")
	system.monitor.mem_side_port = system.l1cache.cpu_side

	system.l2cache = L2Cache(size="512kB", writeback_clean=True)
	system.l2cache.xbar = L2XBar()
	system.l1cache.mem_side = system.l2cache.xbar.cpu_side_ports
	system.l2cache.cpu_side = system.l2cache.xbar.mem_side_ports

	# make the L3 mostly exclusive, and correspondingly ensure that the L2
	# writes back also clean lines to the L3
	system.l3cache = L3Cache(size="4MB", clusivity="mostly_excl")
	system.l3cache.xbar = L2XBar()
	system.l2cache.mem_side = system.l3cache.xbar.cpu_side_ports
	system.l3cache.cpu_side = system.l3cache.xbar.mem_side_ports
	system.l3cache.mem_side = system.membus.cpu_side_ports

	# connect the system port even if it is not used in this example
	system.system_port = system.membus.cpu_side_ports

	# every period, dump and reset all stats
	periodicStatDump(period)

	# run Forrest, run!
	root = Root(full_system=False, system=system)
	root.system.mem_mode = "timing"

	m5.instantiate()
	m5.simulate(nxt_state * period)

	# print all we need to make sense of the stats output
	print("lat_mem_rd with %d iterations, ranges:" % iterations)
	for r in ranges:
	print(r)