misc: Merge branch 'release-staging-v20.1.0.0' into develop

Change-Id: I3694b251855b969c7bd3807f34e1b4241d47d586
diff --git a/MAINTAINERS b/MAINTAINERS
index 92c4ce8..d248ec1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -68,6 +68,7 @@
 
 gpu-compute:
   Tony Gutierrez <anthony.gutierrez@amd.com>
+  Matt Poremba <matthew.poremba@amd.com>
 
 learning-gem5: The code and configs for the Learning gem5 book (see
                learning.gem5.com)
diff --git a/SConstruct b/SConstruct
index 0d8159b..ce9e9b6 100755
--- a/SConstruct
+++ b/SConstruct
@@ -318,8 +318,8 @@
     # we consistently violate
     main.Append(CCFLAGS=['-Wall', '-Wundef', '-Wextra',
                          '-Wno-sign-compare', '-Wno-unused-parameter'])
-    # We always compile using C++11
-    main.Append(CXXFLAGS=['-std=c++11'])
+    # We always compile using C++14
+    main.Append(CXXFLAGS=['-std=c++14'])
     if sys.platform.startswith('freebsd'):
         main.Append(CCFLAGS=['-I/usr/local/include'])
         main.Append(CXXFLAGS=['-I/usr/local/include'])
@@ -412,8 +412,8 @@
     clang_version_match = clang_version_re.search(CXX_version)
     if (clang_version_match):
         clang_version = clang_version_match.groups()[0]
-        if compareVersions(clang_version, "3.1") < 0:
-            error('clang version 3.1 or newer required.\n'
+        if compareVersions(clang_version, "3.9") < 0:
+            error('clang version 3.9 or newer required.\n'
                   'Installed version:', clang_version)
     else:
         error('Unable to determine clang version.')
diff --git a/configs/dram/low_power_sweep.py b/configs/dram/low_power_sweep.py
index 292b0fa..a9f7057 100644
--- a/configs/dram/low_power_sweep.py
+++ b/configs/dram/low_power_sweep.py
@@ -93,6 +93,8 @@
                                    voltage_domain =
                                    VoltageDomain(voltage = '1V'))
 
+system.workload = SEWorkload()
+
 # We are fine with 256 MB memory for now.
 mem_range = AddrRange('256MB')
 # Start address is 0
diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py
index 03418c3..18640c8 100644
--- a/configs/example/apu_se.py
+++ b/configs/example/apu_se.py
@@ -174,7 +174,7 @@
                   help="number of physical banks per LDS module")
 parser.add_option("--ldsBankConflictPenalty", type="int", default=1,
                   help="number of cycles per LDS bank conflict")
-parser.add_options("--lds-size", type="int", default=65536,
+parser.add_option("--lds-size", type="int", default=65536,
                    help="Size of the LDS in bytes")
 parser.add_option('--fast-forward-pseudo-op', action='store_true',
                   help = 'fast forward using kvm until the m5_switchcpu'
@@ -500,7 +500,8 @@
 system = System(cpu = cpu_list,
                 mem_ranges = [AddrRange(options.mem_size)],
                 cache_line_size = options.cacheline_size,
-                mem_mode = mem_mode)
+                mem_mode = mem_mode,
+                workload = SEWorkload())
 if fast_forward:
     system.future_cpu = future_cpu_list
 system.voltage_domain = VoltageDomain(voltage = options.sys_voltage)
@@ -565,6 +566,16 @@
                - options.num_scalar_cache
 gpu_port_idx = gpu_port_idx - options.num_cp * 2
 
+# Hook up the token ports. The TCP coalescers are not guaranteed to come
+# first in the sequencer list, so walk every sequencer and pick them out;
+# they are the only ones that use a token port for back pressure.
+token_port_idx = 0
+for ruby_port in system.ruby._cpu_ports:
+    if isinstance(ruby_port, VIPERCoalescer):
+        system.cpu[shader_idx].CUs[token_port_idx].gmTokenPort = \
+            ruby_port.gmTokenPort
+        token_port_idx += 1
+
 wavefront_size = options.wf_size
 for i in range(n_cu):
     # The pipeline issues wavefront_size number of uncoalesced requests
@@ -572,8 +583,6 @@
     for j in range(wavefront_size):
         system.cpu[shader_idx].CUs[i].memory_port[j] = \
                   system.ruby._cpu_ports[gpu_port_idx].slave[j]
-    system.cpu[shader_idx].CUs[i].gmTokenPort = \
-            system.ruby._cpu_ports[gpu_port_idx].gmTokenPort
     gpu_port_idx += 1
 
 for i in range(n_cu):
diff --git a/configs/example/arm/starter_se.py b/configs/example/arm/starter_se.py
index 0003ce9..d342420 100644
--- a/configs/example/arm/starter_se.py
+++ b/configs/example/arm/starter_se.py
@@ -171,6 +171,8 @@
               (len(processes), args.num_cores))
         sys.exit(1)
 
+    system.workload = SEWorkload()
+
     # Assign one workload to each CPU
     for cpu, workload in zip(system.cpu_cluster.cpus, processes):
         cpu.workload = workload
diff --git a/configs/example/hmc_hello.py b/configs/example/hmc_hello.py
index a682519..706fc2b 100644
--- a/configs/example/hmc_hello.py
+++ b/configs/example/hmc_hello.py
@@ -50,6 +50,7 @@
 options = parser.parse_args()
 # create the system we are going to simulate
 system = System()
+system.workload = SEWorkload()
 # use timing mode for the interaction between master-slave ports
 system.mem_mode = 'timing'
 # set the clock fequency of the system
diff --git a/configs/example/se.py b/configs/example/se.py
index 200a0de..f3fea61 100644
--- a/configs/example/se.py
+++ b/configs/example/se.py
@@ -173,7 +173,7 @@
                 mem_mode = test_mem_mode,
                 mem_ranges = [AddrRange(options.mem_size)],
                 cache_line_size = options.cacheline_size,
-                workload = NULL)
+                workload = SEWorkload())
 
 if numThreads > 1:
     system.multi_thread = True
diff --git a/configs/learning_gem5/part1/simple.py b/configs/learning_gem5/part1/simple.py
index 22b2cf7..cb785b6 100644
--- a/configs/learning_gem5/part1/simple.py
+++ b/configs/learning_gem5/part1/simple.py
@@ -94,6 +94,8 @@
 binary = os.path.join(thispath, '../../../',
                       'tests/test-progs/hello/bin/', isa, 'linux/hello')
 
+system.workload = SEWorkload()
+
 # Create a process for a simple "Hello World" application
 process = Process()
 # Set the command
diff --git a/configs/learning_gem5/part1/two_level.py b/configs/learning_gem5/part1/two_level.py
index 53e1137..50d1d5f 100644
--- a/configs/learning_gem5/part1/two_level.py
+++ b/configs/learning_gem5/part1/two_level.py
@@ -137,6 +137,8 @@
 system.mem_ctrl.dram.range = system.mem_ranges[0]
 system.mem_ctrl.port = system.membus.master
 
+system.workload = SEWorkload()
+
 # Create a process for a simple "Hello World" application
 process = Process()
 # Set the command
diff --git a/configs/learning_gem5/part2/simple_cache.py b/configs/learning_gem5/part2/simple_cache.py
index 533aa23..391de8b 100644
--- a/configs/learning_gem5/part2/simple_cache.py
+++ b/configs/learning_gem5/part2/simple_cache.py
@@ -84,6 +84,8 @@
 # Connect the system up to the membus
 system.system_port = system.membus.slave
 
+system.workload = SEWorkload()
+
 # Create a process for a simple "Hello World" application
 process = Process()
 # Set the command
diff --git a/configs/learning_gem5/part2/simple_memobj.py b/configs/learning_gem5/part2/simple_memobj.py
index b7d2561..80f6602 100644
--- a/configs/learning_gem5/part2/simple_memobj.py
+++ b/configs/learning_gem5/part2/simple_memobj.py
@@ -82,6 +82,8 @@
 # Connect the system up to the membus
 system.system_port = system.membus.slave
 
+system.workload = SEWorkload()
+
 # Create a process for a simple "Hello World" application
 process = Process()
 # Set the command
diff --git a/configs/learning_gem5/part3/simple_ruby.py b/configs/learning_gem5/part3/simple_ruby.py
index 760a168..f0a9e08 100644
--- a/configs/learning_gem5/part3/simple_ruby.py
+++ b/configs/learning_gem5/part3/simple_ruby.py
@@ -89,6 +89,8 @@
 binary = os.path.join(thispath, '../../../', 'tests/test-progs/threads/bin/',
                       isa, 'linux/threads')
 
+system.workload = SEWorkload()
+
 # Create a process for a simple "multi-threaded" application
 process = Process()
 # Set the command
diff --git a/configs/splash2/cluster.py b/configs/splash2/cluster.py
index 0e92625..2b36c82 100644
--- a/configs/splash2/cluster.py
+++ b/configs/splash2/cluster.py
@@ -216,6 +216,8 @@
 system.toL2bus = L2XBar(clock = busFrequency)
 system.l2 = L2(size = options.l2size, assoc = 8)
 
+system.workload = SEWorkload()
+
 # ----------------------
 # Connect the L2 cache and memory together
 # ----------------------
diff --git a/configs/splash2/run.py b/configs/splash2/run.py
index 7ad2dac..b3b8787 100644
--- a/configs/splash2/run.py
+++ b/configs/splash2/run.py
@@ -195,7 +195,8 @@
 # Create a system, and add system wide objects
 # ----------------------
 system = System(cpu = cpus, physmem = SimpleMemory(),
-                membus = SystemXBar(clock = busFrequency))
+                membus = SystemXBar(clock = busFrequency),
+                workload = SEWorkload())
 system.clock = '1GHz'
 
 system.toL2bus = L2XBar(clock = busFrequency)
diff --git a/ext/testlib/helper.py b/ext/testlib/helper.py
index ff83409..1cb13f0 100644
--- a/ext/testlib/helper.py
+++ b/ext/testlib/helper.py
@@ -1,3 +1,15 @@
+# Copyright (c) 2020 ARM Limited
+# All rights reserved
+#
+# The license below extends only to copyright in the software and shall
+# not be construed as granting a license to any other intellectual
+# property including but not limited to intellectual property relating
+# to a hardware implementation of the functionality of the software
+# licensed hereunder.  You may use the software subject to the license
+# terms below provided that you ensure that this notice is replicated
+# unmodified and in its entirety in all distributions of the software,
+# modified or unmodified, in source code or in binary form.
+#
 # Copyright (c) 2017 Mark D. Hill and David A. Wood
 # All rights reserved.
 #
@@ -29,7 +41,7 @@
 '''
 Helper classes for writing tests with this test library.
 '''
-from collections import MutableSet
+from collections import MutableSet, namedtuple
 
 import difflib
 import errno
@@ -42,8 +54,85 @@
 import threading
 import time
 
+class TimedWaitPID(object):
+    """Utility to monkey-patch os.waitpid() with os.wait4().
+
+    This allows process usage time to be obtained directly from the OS
+    when used with APIs, such as `subprocess`, which use os.waitpid to
+    join child processes.
+
+    The resource usage data from os.wait4() is stored in a functor and
+    can be obtained using the get_time_for_pid() method.
+
+    To avoid unbounded memory usage, the time record is deleted after
+    it is read.
+
+    """
+    TimeRecord = namedtuple( "_TimeRecord", "user_time system_time" )
+
+    class Wrapper(object):  # callable installed in place of os.waitpid
+        def __init__(self):
+            self._time_for_pid = {}  # pid -> TimeRecord, filled by __call__
+            self._access_lock = threading.Lock()  # guards _time_for_pid
+
+        def __call__(self, pid, options):  # same arguments as os.waitpid
+            pid, status, resource_usage = os.wait4(pid, options)
+            with self._access_lock:
+                self._time_for_pid[pid] = (
+                    TimedWaitPID.TimeRecord(
+                        resource_usage.ru_utime,
+                        resource_usage.ru_stime
+                    )
+                )
+            return (pid, status)  # rusage is kept aside, not returned
+
+        def has_time_for_pid(self, pid):
+            with self._access_lock:
+                return pid in self._time_for_pid
+
+        def get_time_for_pid(self, pid):  # one-shot: removes the record
+            with self._access_lock:
+                if pid not in self._time_for_pid:
+                    raise Exception("No resource usage for pid {}".format(pid))
+                time_for_pid = self._time_for_pid[pid]
+                del self._time_for_pid[pid]  # keep the dict from growing
+                return time_for_pid
+
+    _wrapper = None  # shared Wrapper instance, created by install()
+    _wrapper_lock = threading.Lock()  # serializes the static methods below
+    _original_os_waitpid = None  # real os.waitpid while the patch is active
+
+    @staticmethod
+    def install():  # idempotent: patches os.waitpid only once
+        with TimedWaitPID._wrapper_lock:
+            if TimedWaitPID._wrapper is None:
+                TimedWaitPID._wrapper = TimedWaitPID.Wrapper()
+            if TimedWaitPID._original_os_waitpid is None :  # not patched yet
+                TimedWaitPID._original_os_waitpid = os.waitpid
+                os.waitpid = TimedWaitPID._wrapper
+
+    @staticmethod
+    def restore():  # no-op unless install() patched os.waitpid
+        with TimedWaitPID._wrapper_lock:
+            if TimedWaitPID._original_os_waitpid is not None :
+                os.waitpid = TimedWaitPID._original_os_waitpid
+                TimedWaitPID._original_os_waitpid = None
+
+    @staticmethod
+    def has_time_for_pid(pid):  # assumes install() has already run
+        with TimedWaitPID._wrapper_lock:
+            return TimedWaitPID._wrapper.has_time_for_pid(pid)
+
+    @staticmethod
+    def get_time_for_pid(pid):  # assumes install() has already run
+        with TimedWaitPID._wrapper_lock:
+            return TimedWaitPID._wrapper.get_time_for_pid(pid)
+
+# Patch os.waitpid()
+TimedWaitPID.install()
+
 #TODO Tear out duplicate logic from the sandbox IOManager
-def log_call(logger, command, *popenargs, **kwargs):
+def log_call(logger, command, time, *popenargs, **kwargs):
     '''
     Calls the given process and automatically logs the command and output.
 
@@ -97,6 +186,12 @@
     retval = p.wait()
     stdout_thread.join()
     stderr_thread.join()
+
+    if time is not None and TimedWaitPID.has_time_for_pid(p.pid):
+        resource_usage = TimedWaitPID.get_time_for_pid(p.pid)
+        time['user_time'] = resource_usage.user_time
+        time['system_time'] = resource_usage.system_time
+
     # Return the return exit code of the process.
     if retval != 0:
         raise subprocess.CalledProcessError(retval, cmdstr)
@@ -393,7 +488,8 @@
     (_, tfname) = tempfile.mkstemp(dir=os.path.dirname(out_file), text=True)
     with open(tfname, 'r+') as tempfile_:
         try:
-            log_call(logger, ['diff', out_file, ref_file], stdout=tempfile_)
+            log_call(logger, ['diff', out_file, ref_file],
+                time=None, stdout=tempfile_)
         except OSError:
             # Likely signals that diff does not exist on this system. fallback
             # to difflib
diff --git a/ext/testlib/result.py b/ext/testlib/result.py
index 2d2c506..5c60342 100644
--- a/ext/testlib/result.py
+++ b/ext/testlib/result.py
@@ -1,3 +1,15 @@
+# Copyright (c) 2020 ARM Limited
+# All rights reserved
+#
+# The license below extends only to copyright in the software and shall
+# not be construed as granting a license to any other intellectual
+# property including but not limited to intellectual property relating
+# to a hardware implementation of the functionality of the software
+# licensed hereunder.  You may use the software subject to the license
+# terms below provided that you ensure that this notice is replicated
+# unmodified and in its entirety in all distributions of the software,
+# modified or unmodified, in source code or in binary form.
+#
 # Copyright (c) 2017 Mark D. Hill and David A. Wood
 # All rights reserved.
 #
@@ -60,6 +72,10 @@
     def unsuccessful(self):
         return self._metadata.result.value != state.Result.Passed
 
+    @property
+    def time(self):  # read-only view of the time stored in the metadata
+        return self._metadata.time
+
 
 class InternalTestResult(_CommonMetadataMixin):
     def __init__(self, obj, suite, directory):
@@ -258,6 +274,7 @@
              # TODO JUnit expects class of test.. add as test metadata.
             XMLAttribute('classname', str(test_result.uid)),
             XMLAttribute('status', str(test_result.result)),
+            XMLAttribute('time', str(test_result.time["user_time"])),
         ]
 
         # TODO JUnit expects a message for the reason a test was
diff --git a/ext/testlib/runner.py b/ext/testlib/runner.py
index ee658c9..16ff952 100644
--- a/ext/testlib/runner.py
+++ b/ext/testlib/runner.py
@@ -79,6 +79,8 @@
         self.suite = suite
         self.log = log.test_log
         self.log.test = test
+        self.time = {
+            "user_time" : 0, "system_time" : 0}
 
     @helper.cacheresult
     def _fixtures(self):
@@ -152,6 +154,8 @@
         else:
             self.testable.result = Result(Result.Passed)
 
+        self.testable.time = test_params.time
+
 
 class SuiteRunner(RunnerPattern):
     def test(self):
diff --git a/ext/testlib/wrappers.py b/ext/testlib/wrappers.py
index e919702..b2b887b 100644
--- a/ext/testlib/wrappers.py
+++ b/ext/testlib/wrappers.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019 ARM Limited
+# Copyright (c) 2019-2020 ARM Limited
 # All rights reserved
 #
 # The license below extends only to copyright in the software and shall
@@ -124,6 +124,14 @@
     def runner(self):
         return self.obj.runner
 
+    @property
+    def time(self):  # timing value kept on this object's metadata
+        return self.metadata.time
+
+    @time.setter
+    def time(self, value):  # stored on metadata, not on the wrapper itself
+        self.metadata.time = value
+
     # TODO Change log to provide status_update, result_update for all types.
     def log_status(self, status):
         import testlib.log as log
diff --git a/src/arch/arm/faults.cc b/src/arch/arm/faults.cc
index 56e1814..c724a79 100644
--- a/src/arch/arm/faults.cc
+++ b/src/arch/arm/faults.cc
@@ -517,7 +517,7 @@
     saved_cpsr.v = tc->readCCReg(CCREG_V);
     saved_cpsr.ge = tc->readCCReg(CCREG_GE);
 
-    Addr curPc M5_VAR_USED = tc->pcState().pc();
+    M5_VAR_USED Addr curPc = tc->pcState().pc();
     ITSTATE it = tc->pcState().itstate();
     saved_cpsr.it2 = it.top6;
     saved_cpsr.it1 = it.bottom2;
@@ -525,7 +525,7 @@
     // if we have a valid instruction then use it to annotate this fault with
     // extra information. This is used to generate the correct fault syndrome
     // information
-    ArmStaticInst *arm_inst M5_VAR_USED = instrAnnotate(inst);
+    M5_VAR_USED ArmStaticInst *arm_inst = instrAnnotate(inst);
 
     // Ensure Secure state if initially in Monitor mode
     if (have_security && saved_cpsr.mode == MODE_MON) {
@@ -703,7 +703,7 @@
     // If we have a valid instruction then use it to annotate this fault with
     // extra information. This is used to generate the correct fault syndrome
     // information
-    ArmStaticInst *arm_inst M5_VAR_USED = instrAnnotate(inst);
+    M5_VAR_USED ArmStaticInst *arm_inst = instrAnnotate(inst);
 
     // Set PC to start of exception handler
     Addr new_pc = purifyTaggedAddr(vec_address, tc, toEL, true);
@@ -755,7 +755,7 @@
     Addr base;
 
     // Check for invalid modes
-    CPSR M5_VAR_USED cpsr = tc->readMiscRegNoEffect(MISCREG_CPSR);
+    M5_VAR_USED CPSR cpsr = tc->readMiscRegNoEffect(MISCREG_CPSR);
     assert(ArmSystem::haveSecurity(tc) || cpsr.mode != MODE_MON);
     assert(ArmSystem::haveVirtualization(tc) || cpsr.mode != MODE_HYP);
 
@@ -865,7 +865,7 @@
 
     // As of now, there isn't a 32 bit thumb version of this instruction.
     assert(!machInst.bigThumb);
-    tc->syscall();
+    tc->getSystemPtr()->workload->syscall(tc);
 
     // Advance the PC since that won't happen automatically.
     PCState pc = tc->pcState();
@@ -1069,7 +1069,7 @@
             // See ARM ARM B3-1416
             bool override_LPAE = false;
             TTBCR ttbcr_s = tc->readMiscReg(MISCREG_TTBCR_S);
-            TTBCR M5_VAR_USED ttbcr_ns = tc->readMiscReg(MISCREG_TTBCR_NS);
+            M5_VAR_USED TTBCR ttbcr_ns = tc->readMiscReg(MISCREG_TTBCR_NS);
             if (ttbcr_s.eae) {
                 override_LPAE = true;
             } else {
diff --git a/src/arch/arm/insts/macromem.cc b/src/arch/arm/insts/macromem.cc
index ad8be64..1ce9705 100644
--- a/src/arch/arm/insts/macromem.cc
+++ b/src/arch/arm/insts/macromem.cc
@@ -561,7 +561,7 @@
 
     unsigned eBytes = (1 << size);
     unsigned loadSize = eBytes * elems;
-    unsigned loadRegs M5_VAR_USED =
+    M5_VAR_USED unsigned loadRegs =
         (loadSize + sizeof(uint32_t) - 1) / sizeof(uint32_t);
 
     assert(loadRegs > 0 && loadRegs <= 4);
@@ -925,7 +925,7 @@
 
     unsigned eBytes = (1 << size);
     unsigned storeSize = eBytes * elems;
-    unsigned storeRegs M5_VAR_USED =
+    M5_VAR_USED unsigned storeRegs =
         (storeSize + sizeof(uint32_t) - 1) / sizeof(uint32_t);
 
     assert(storeRegs > 0 && storeRegs <= 4);
diff --git a/src/arch/arm/insts/mem64.cc b/src/arch/arm/insts/mem64.cc
index 0ddda95..a12c330 100644
--- a/src/arch/arm/insts/mem64.cc
+++ b/src/arch/arm/insts/mem64.cc
@@ -79,7 +79,6 @@
     else
         memAccessFlags |= ArmISA::TLB::AllowUnaligned;
     if (acrel) {
-        flags[IsMemBarrier] = true;
         flags[IsWriteBarrier] = true;
         flags[IsReadBarrier] = true;
     }
diff --git a/src/arch/arm/insts/tme64.cc b/src/arch/arm/insts/tme64.cc
index 30aff20..3629d34 100644
--- a/src/arch/arm/insts/tme64.cc
+++ b/src/arch/arm/insts/tme64.cc
@@ -83,7 +83,6 @@
     _numVecElemDestRegs = 0;
     _numIntDestRegs = 0;
     _numCCDestRegs = 0;
-    flags[IsMemBarrier] = true;
     flags[IsMicroop] = true;
     flags[IsReadBarrier] = true;
     flags[IsWriteBarrier] = true;
@@ -129,7 +128,6 @@
     flags[IsHtmStart] = true;
     flags[IsInteger] = true;
     flags[IsLoad] = true;
-    flags[IsMemRef] = true;
     flags[IsMicroop] = true;
     flags[IsNonSpeculative] = true;
 }
@@ -170,7 +168,6 @@
     _numIntDestRegs = 0;
     _numCCDestRegs = 0;
     flags[IsLoad] = true;
-    flags[IsMemRef] = true;
     flags[IsMicroop] = true;
     flags[IsNonSpeculative] = true;
     flags[IsHtmCancel] = true;
@@ -213,7 +210,6 @@
     _numCCDestRegs = 0;
     flags[IsHtmStop] = true;
     flags[IsLoad] = true;
-    flags[IsMemRef] = true;
     flags[IsMicroop] = true;
     flags[IsNonSpeculative] = true;
 }
diff --git a/src/arch/arm/isa.cc b/src/arch/arm/isa.cc
index 4ad1125..8ec2dc6 100644
--- a/src/arch/arm/isa.cc
+++ b/src/arch/arm/isa.cc
@@ -1718,9 +1718,8 @@
                 tlbiOp.broadcast(tc);
                 return;
             }
-          // AArch64 TLB Invalidate All, EL2, Inner Shareable
+          // AArch64 TLB Invalidate All, EL2
           case MISCREG_TLBI_ALLE2:
-          case MISCREG_TLBI_ALLE2IS:
             {
                 assert64();
                 scr = readMiscReg(MISCREG_SCR);
@@ -1729,6 +1728,16 @@
                 tlbiOp(tc);
                 return;
             }
+          // AArch64 TLB Invalidate All, EL2, Inner Shareable
+          case MISCREG_TLBI_ALLE2IS:
+            {
+                assert64();
+                scr = readMiscReg(MISCREG_SCR);
+
+                TLBIALL tlbiOp(EL2, haveSecurity && !scr.ns);
+                tlbiOp.broadcast(tc);
+                return;
+            }
           // AArch64 TLB Invalidate All, EL1
           case MISCREG_TLBI_ALLE1:
           case MISCREG_TLBI_VMALLS12E1:
diff --git a/src/arch/arm/isa/insts/amo64.isa b/src/arch/arm/isa/insts/amo64.isa
index 1fe9b7a..51e1f38 100644
--- a/src/arch/arm/isa/insts/amo64.isa
+++ b/src/arch/arm/isa/insts/amo64.isa
@@ -91,7 +91,7 @@
                 self.instFlags.append("IsMicroop")
 
             if self.flavor in ("release", "acquire_release", "acquire"):
-                self.instFlags.append("IsMemBarrier")
+                self.instFlags.extend(["IsReadBarrier", "IsWriteBarrier"])
             if self.flavor in ("release", "acquire_release"):
                 self.instFlags.append("IsWriteBarrier")
             if self.flavor in ("acquire_release", "acquire"):
diff --git a/src/arch/arm/isa/insts/data64.isa b/src/arch/arm/isa/insts/data64.isa
index d5a5869..6ca6aa3 100644
--- a/src/arch/arm/isa/insts/data64.isa
+++ b/src/arch/arm/isa/insts/data64.isa
@@ -83,16 +83,16 @@
         "logic": '0'
     }
 
-    immOp2 = "uint64_t secOp M5_VAR_USED = imm;"
-    sRegOp2 = "uint64_t secOp M5_VAR_USED = " + \
+    immOp2 = "M5_VAR_USED uint64_t secOp = imm;"
+    sRegOp2 = "M5_VAR_USED uint64_t secOp = " + \
               "shiftReg64(Op264, shiftAmt, shiftType, intWidth);"
-    eRegOp2 = "uint64_t secOp M5_VAR_USED = " + \
+    eRegOp2 = "M5_VAR_USED uint64_t secOp = " + \
               "extendReg64(Op264, extendType, shiftAmt, intWidth);"
 
     def buildDataWork(mnem, code, flagType, suffix, buildCc, buildNonCc,
                       base, templateBase):
         code = '''
-        uint64_t resTemp M5_VAR_USED = 0;
+        M5_VAR_USED uint64_t resTemp = 0;
         ''' + code
         ccCode = createCcCode64(carryCode64[flagType], overflowCode64[flagType])
         Name = mnem.capitalize() + suffix
@@ -392,7 +392,7 @@
                                   "use_uops" : 0,
                                   "op_wb" : ";",
                                   "fa_code" : ";"},
-                                ['IsStore', 'IsMemRef']);
+                                ['IsStore']);
     header_output += DCStore64Declare.subst(msrDCZVAIop);
     decoder_output += DCStore64Constructor.subst(msrDCZVAIop);
     exec_output += DCStore64Execute.subst(msrDCZVAIop);
@@ -423,7 +423,7 @@
                                    "use_uops" : 0,
                                    "op_wb" : ";",
                                    "fa_code" : cachem_fa},
-                                 ['IsStore', 'IsMemRef']);
+                                 ['IsStore']);
     header_output += DCStore64Declare.subst(msrDCCVAUIop);
     decoder_output += DCStore64Constructor.subst(msrDCCVAUIop);
     exec_output += DCStore64Execute.subst(msrDCCVAUIop);
@@ -447,7 +447,7 @@
                                    "use_uops" : 0,
                                    "op_wb" : ";",
                                    "fa_code" : cachem_fa},
-                                 ['IsStore', 'IsMemRef']);
+                                 ['IsStore']);
     header_output += DCStore64Declare.subst(msrDCCVACIop);
     decoder_output += DCStore64Constructor.subst(msrDCCVACIop);
     exec_output += DCStore64Execute.subst(msrDCCVACIop);
@@ -472,7 +472,7 @@
                                     "use_uops" : 0,
                                     "op_wb" : ";",
                                     "fa_code" : cachem_fa},
-                                  ['IsStore', 'IsMemRef']);
+                                  ['IsStore']);
     header_output += DCStore64Declare.subst(msrDCCIVACIop);
     decoder_output += DCStore64Constructor.subst(msrDCCIVACIop);
     exec_output += DCStore64Execute.subst(msrDCCIVACIop);
@@ -503,7 +503,7 @@
                                    "use_uops" : 0,
                                    "op_wb" : ";",
                                    "fa_code" : cachem_fa},
-                                 ['IsStore', 'IsMemRef']);
+                                 ['IsStore']);
     header_output += DCStore64Declare.subst(msrDCIVACIop);
     decoder_output += DCStore64Constructor.subst(msrDCIVACIop);
     exec_output += DCStore64Execute.subst(msrDCIVACIop);
@@ -576,9 +576,9 @@
 
     def condCompCode(flagType, op, imm):
         ccCode = createCcCode64(carryCode64[flagType], overflowCode64[flagType])
-        opDecl = "uint64_t secOp M5_VAR_USED = imm;"
+        opDecl = "M5_VAR_USED uint64_t secOp = imm;"
         if not imm:
-            opDecl = "uint64_t secOp M5_VAR_USED = Op264;"
+            opDecl = "M5_VAR_USED uint64_t secOp = Op264;"
         return opDecl + '''
             if (testPredicate(CondCodesNZ, CondCodesC, CondCodesV, condCode)) {
                 uint64_t resTemp = Op164 ''' + op + ''' secOp;
diff --git a/src/arch/arm/isa/insts/fp.isa b/src/arch/arm/isa/insts/fp.isa
index 52b5315..90525c0 100644
--- a/src/arch/arm/isa/insts/fp.isa
+++ b/src/arch/arm/isa/insts/fp.isa
@@ -461,7 +461,7 @@
     exec_output = ""
 
     singleSimpleCode = vfpEnabledCheckCode + '''
-        FPSCR fpscr M5_VAR_USED = (FPSCR) FpscrExc;
+        M5_VAR_USED FPSCR fpscr = (FPSCR) FpscrExc;
         FpDest = %(op)s;
     '''
     singleCode = singleSimpleCode + '''
@@ -482,7 +482,7 @@
                 "%(func)s, fpscr.fz, fpscr.dn, fpscr.rMode)"
     singleUnaryOp = "unaryOp(fpscr, FpOp1, %(func)s, fpscr.fz, fpscr.rMode)"
     doubleCode = vfpEnabledCheckCode + '''
-        FPSCR fpscr M5_VAR_USED = (FPSCR) FpscrExc;
+        M5_VAR_USED FPSCR fpscr = (FPSCR) FpscrExc;
         double dest = %(op)s;
         FpDestP0_uw = dblLow(dest);
         FpDestP1_uw = dblHi(dest);
diff --git a/src/arch/arm/isa/insts/ldr.isa b/src/arch/arm/isa/insts/ldr.isa
index d7e27a4..3be0e3e 100644
--- a/src/arch/arm/isa/insts/ldr.isa
+++ b/src/arch/arm/isa/insts/ldr.isa
@@ -179,9 +179,7 @@
                 self.memFlags.append("Request::LLSC")
 
             if self.flavor in ("acquire", "acex"):
-                self.instFlags.extend(["IsMemBarrier",
-                                       "IsWriteBarrier",
-                                       "IsReadBarrier"])
+                self.instFlags.extend(["IsWriteBarrier", "IsReadBarrier"])
                 self.memFlags.append("Request::ACQUIRE")
 
             # Disambiguate the class name for different flavors of loads
@@ -260,9 +258,7 @@
                 self.Name = "%s_%s" % (self.name.upper(), self.Name)
 
             if self.flavor in ("acquire", "acex"):
-                self.instFlags.extend(["IsMemBarrier",
-                                       "IsWriteBarrier",
-                                       "IsReadBarrier"])
+                self.instFlags.extend(["IsWriteBarrier", "IsReadBarrier"])
                 self.memFlags.append("Request::ACQUIRE")
 
         def emit(self):
diff --git a/src/arch/arm/isa/insts/ldr64.isa b/src/arch/arm/isa/insts/ldr64.isa
index 51f5389..76b0cae 100644
--- a/src/arch/arm/isa/insts/ldr64.isa
+++ b/src/arch/arm/isa/insts/ldr64.isa
@@ -91,9 +91,7 @@
                 self.memFlags.append("ArmISA::TLB::AllowUnaligned")
 
             if self.flavor in ("acquire", "acex", "acexp"):
-                self.instFlags.extend(["IsMemBarrier",
-                                       "IsWriteBarrier",
-                                       "IsReadBarrier"])
+                self.instFlags.extend(["IsWriteBarrier", "IsReadBarrier"])
                 self.memFlags.append("Request::ACQUIRE")
 
             if self.flavor in ("acex", "exclusive", "exp", "acexp"):
@@ -203,7 +201,7 @@
             accEpilogCode = None
             # Code that actually handles the access
             if self.flavor in ("dprefetch", "iprefetch", "mprefetch"):
-                accCode = 'uint64_t temp M5_VAR_USED = Mem%s;'
+                accCode = 'M5_VAR_USED uint64_t temp = Mem%s;'
             elif self.flavor == "fp":
                 accEpilogCode = '''
                     ArmISA::ISA::zeroSveVecRegUpperPart(AA64FpDest,
diff --git a/src/arch/arm/isa/insts/macromem.isa b/src/arch/arm/isa/insts/macromem.isa
index 1b9cdf7..ad0f677 100644
--- a/src/arch/arm/isa/insts/macromem.isa
+++ b/src/arch/arm/isa/insts/macromem.isa
@@ -252,7 +252,7 @@
                                   'memacc_code' : loadMemAccCode,
                                   'ea_code' : simdEnabledCheckCode + eaCode,
                                   'predicate_test' : predicateTest },
-                                [ 'IsMicroop', 'IsMemRef', 'IsLoad' ])
+                                [ 'IsMicroop', 'IsLoad' ])
         storeIop = InstObjParams('strneon%(size)d_uop' % subst,
                                  'MicroStrNeon%(size)dUop' % subst,
                                  'MicroNeonMemOp',
@@ -261,7 +261,7 @@
                                    'memacc_code' : storeMemAccCode,
                                    'ea_code' : simdEnabledCheckCode + eaCode,
                                    'predicate_test' : predicateTest },
-                                 [ 'IsMicroop', 'IsMemRef', 'IsStore' ])
+                                 [ 'IsMicroop', 'IsStore' ])
 
         exec_output += NeonLoadExecute.subst(loadIop) + \
                        NeonLoadInitiateAcc.subst(loadIop) + \
diff --git a/src/arch/arm/isa/insts/misc.isa b/src/arch/arm/isa/insts/misc.isa
index 6a9b048..17c30ff 100644
--- a/src/arch/arm/isa/insts/misc.isa
+++ b/src/arch/arm/isa/insts/misc.isa
@@ -1149,7 +1149,7 @@
                                    "postacc_code": "",
                                    "ea_code": McrDcimvacCode,
                                    "predicate_test": predicateTest},
-                                ['IsMemRef', 'IsStore'])
+                                ['IsStore'])
     header_output += MiscRegRegImmMemOpDeclare.subst(McrDcimvacIop)
     decoder_output += MiscRegRegImmOpConstructor.subst(McrDcimvacIop)
     exec_output += Mcr15Execute.subst(McrDcimvacIop) + \
@@ -1167,7 +1167,7 @@
                                    "postacc_code": "",
                                    "ea_code": McrDccmvacCode,
                                    "predicate_test": predicateTest},
-                                ['IsMemRef', 'IsStore'])
+                                ['IsStore'])
     header_output += MiscRegRegImmMemOpDeclare.subst(McrDccmvacIop)
     decoder_output += MiscRegRegImmOpConstructor.subst(McrDccmvacIop)
     exec_output += Mcr15Execute.subst(McrDccmvacIop) + \
@@ -1185,7 +1185,7 @@
                                    "postacc_code": "",
                                    "ea_code": McrDccmvauCode,
                                    "predicate_test": predicateTest},
-                                ['IsMemRef', 'IsStore'])
+                                ['IsStore'])
     header_output += MiscRegRegImmMemOpDeclare.subst(McrDccmvauIop)
     decoder_output += MiscRegRegImmOpConstructor.subst(McrDccmvauIop)
     exec_output += Mcr15Execute.subst(McrDccmvauIop) + \
@@ -1204,7 +1204,7 @@
                                    "postacc_code": "",
                                    "ea_code": McrDccimvacCode,
                                    "predicate_test": predicateTest},
-                                ['IsMemRef', 'IsStore'])
+                                ['IsStore'])
     header_output += MiscRegRegImmMemOpDeclare.subst(McrDccimvacIop)
     decoder_output += MiscRegRegImmOpConstructor.subst(McrDccimvacIop)
     exec_output += Mcr15Execute.subst(McrDccimvacIop) + \
@@ -1238,7 +1238,8 @@
     dsbIop = InstObjParams("dsb", "Dsb", "ImmOp",
                              {"code": dsbCode,
                                "predicate_test": predicateTest},
-                              ['IsMemBarrier', 'IsSerializeAfter'])
+                              ['IsReadBarrier', 'IsWriteBarrier',
+                               'IsSerializeAfter'])
     header_output += ImmOpDeclare.subst(dsbIop)
     decoder_output += ImmOpConstructor.subst(dsbIop)
     exec_output += PredOpExecute.subst(dsbIop)
@@ -1254,7 +1255,7 @@
     dmbIop = InstObjParams("dmb", "Dmb", "ImmOp",
                              {"code": dmbCode,
                                "predicate_test": predicateTest},
-                               ['IsMemBarrier'])
+                               ['IsReadBarrier', 'IsWriteBarrier'])
     header_output += ImmOpDeclare.subst(dmbIop)
     decoder_output += ImmOpConstructor.subst(dmbIop)
     exec_output += PredOpExecute.subst(dmbIop)
diff --git a/src/arch/arm/isa/insts/misc64.isa b/src/arch/arm/isa/insts/misc64.isa
index 7911ec9..5cc5167 100644
--- a/src/arch/arm/isa/insts/misc64.isa
+++ b/src/arch/arm/isa/insts/misc64.isa
@@ -128,7 +128,7 @@
         bitMask = (bitMask >> imm1) | (bitMask << (intWidth - imm1));
         diff += intWidth;
     }
-    uint64_t topBits M5_VAR_USED = ~mask(diff+1);
+    M5_VAR_USED uint64_t topBits = ~mask(diff+1);
     uint64_t result = imm1 == 0 ? Op164 :
                       (Op164 >> imm1) | (Op164 << (intWidth - imm1));
     result &= bitMask;
@@ -173,13 +173,14 @@
     exec_output += BasicExecute.subst(isbIop)
 
     dsbIop = InstObjParams("dsb", "Dsb64", "ArmStaticInst", "",
-                           ['IsMemBarrier', 'IsSerializeAfter'])
+                           ['IsReadBarrier', 'IsWriteBarrier',
+                            'IsSerializeAfter'])
     header_output += BasicDeclare.subst(dsbIop)
     decoder_output += BasicConstructor64.subst(dsbIop)
     exec_output += BasicExecute.subst(dsbIop)
 
     dmbIop = InstObjParams("dmb", "Dmb64", "ArmStaticInst", "",
-                           ['IsMemBarrier'])
+                           ['IsReadBarrier', 'IsWriteBarrier'])
     header_output += BasicDeclare.subst(dmbIop)
     decoder_output += BasicConstructor64.subst(dmbIop)
     exec_output += BasicExecute.subst(dmbIop)
diff --git a/src/arch/arm/isa/insts/neon64_mem.isa b/src/arch/arm/isa/insts/neon64_mem.isa
index e511f61..80741fb 100644
--- a/src/arch/arm/isa/insts/neon64_mem.isa
+++ b/src/arch/arm/isa/insts/neon64_mem.isa
@@ -146,7 +146,7 @@
                 'memacc_code' : loadMemAccCode,
                 'ea_code' : simd64EnabledCheckCode + eaCode,
             },
-            [ 'IsMicroop', 'IsMemRef', 'IsLoad' ])
+            [ 'IsMicroop', 'IsLoad' ])
         loadIop.snippets["memacc_code"] += zeroSveVecRegUpperPartCode % \
             "AA64FpDest"
         storeIop = InstObjParams(name + 'st',
@@ -156,7 +156,7 @@
                 'memacc_code' : storeMemAccCode,
                 'ea_code' : simd64EnabledCheckCode + eaCode,
             },
-            [ 'IsMicroop', 'IsMemRef', 'IsStore' ])
+            [ 'IsMicroop', 'IsStore' ])
 
         exec_output += NeonLoadExecute64.subst(loadIop) + \
             NeonLoadInitiateAcc64.subst(loadIop) + \
diff --git a/src/arch/arm/isa/insts/str.isa b/src/arch/arm/isa/insts/str.isa
index e99f6ad..48bf153 100644
--- a/src/arch/arm/isa/insts/str.isa
+++ b/src/arch/arm/isa/insts/str.isa
@@ -187,8 +187,7 @@
                 self.memFlags.append("ArmISA::TLB::AllowUnaligned")
 
             if self.flavor in ("release", "relex"):
-                self.instFlags.extend(["IsMemBarrier",
-                                       "IsWriteBarrier",
+                self.instFlags.extend(["IsWriteBarrier",
                                        "IsReadBarrier"])
                 self.memFlags.append("Request::RELEASE")
 
@@ -269,8 +268,7 @@
                 self.memFlags.append("ArmISA::TLB::AlignWord")
 
             if self.flavor in ("release", "relex"):
-                self.instFlags.extend(["IsMemBarrier",
-                                       "IsWriteBarrier",
+                self.instFlags.extend(["IsWriteBarrier",
                                        "IsReadBarrier"])
                 self.memFlags.append("Request::RELEASE")
 
diff --git a/src/arch/arm/isa/insts/str64.isa b/src/arch/arm/isa/insts/str64.isa
index ac84533..ed99064 100644
--- a/src/arch/arm/isa/insts/str64.isa
+++ b/src/arch/arm/isa/insts/str64.isa
@@ -79,8 +79,7 @@
                 self.instFlags.append("IsMicroop")
 
             if self.flavor in ("release", "relex", "relexp"):
-                self.instFlags.extend(["IsMemBarrier",
-                                       "IsWriteBarrier",
+                self.instFlags.extend(["IsWriteBarrier",
                                        "IsReadBarrier"])
                 self.memFlags.append("Request::RELEASE")
 
diff --git a/src/arch/arm/isa/insts/sve.isa b/src/arch/arm/isa/insts/sve.isa
index 4e49e92..242ae34 100644
--- a/src/arch/arm/isa/insts/sve.isa
+++ b/src/arch/arm/isa/insts/sve.isa
@@ -2703,7 +2703,7 @@
         CondCodesC = !destPred.lastActive(GpOp, eCount);
         CondCodesV = 0;'''
         extraPrologCode = '''
-        auto& destPred M5_VAR_USED = PDest;'''
+        M5_VAR_USED auto& destPred = PDest;'''
         baseClass = ('SvePredUnaryWImplicitSrcOp' if predType == PredType.NONE
                      else 'SvePredUnaryWImplicitSrcPredOp')
         iop = InstObjParams(name, 'Sve' + Name, baseClass,
@@ -2722,7 +2722,7 @@
         global header_output, exec_output, decoders
         code = sveEnabledCheckCode + op
         extraPrologCode = '''
-        auto& destPred M5_VAR_USED = Ffr;'''
+        M5_VAR_USED auto& destPred = Ffr;'''
         baseClass = ('SveWImplicitSrcDstOp' if isSetFfr
                      else 'SvePredUnaryWImplicitDstOp')
         iop = InstObjParams(name, 'Sve' + Name, baseClass,
diff --git a/src/arch/arm/isa/insts/sve_mem.isa b/src/arch/arm/isa/insts/sve_mem.isa
index 66bfabb..8599900 100644
--- a/src/arch/arm/isa/insts/sve_mem.isa
+++ b/src/arch/arm/isa/insts/sve_mem.isa
@@ -823,7 +823,7 @@
              'rden_code' : loadRdEnableCode,
              'fault_code' : '',
              'fa_code' : ''},
-            ['IsMemRef', 'IsLoad'])
+            ['IsLoad'])
         storeIop = InstObjParams('str',
             'SveStrPred' if isPred else 'SveStrVec',
             'SveMemPredFillSpill' if isPred else 'SveMemVecFillSpill',
@@ -833,7 +833,7 @@
              'memacc_code': storeMemAccCode,
              'ea_code' : sveEnabledCheckCode + eaCode,
              'fa_code' : ''},
-            ['IsMemRef', 'IsStore'])
+            ['IsStore'])
         header_output += SveMemFillSpillOpDeclare.subst(loadIop)
         header_output += SveMemFillSpillOpDeclare.subst(storeIop)
         exec_output += (
@@ -1007,7 +1007,7 @@
              'ea_code' : sveEnabledCheckCode + eaCode,
              'fault_code' : '',
              'fa_code' : ''},
-            ['IsMemRef', 'IsLoad'])
+            ['IsLoad'])
         storeIop = InstObjParams('st1',
             'SveContigStoreSI' if offsetIsImm else 'SveContigStoreSS',
             'SveContigMemSI' if offsetIsImm else 'SveContigMemSS',
@@ -1017,7 +1017,7 @@
              'memacc_code': storeMemAccCode,
              'ea_code' : sveEnabledCheckCode + eaCode,
              'fa_code' : ''},
-            ['IsMemRef', 'IsStore'])
+            ['IsStore'])
         faultIop = InstObjParams('ldff1' if firstFaulting else 'ldnf1',
             'SveContigFFLoadSS' if firstFaulting else 'SveContigNFLoadSI',
             'SveContigMemSS' if firstFaulting else 'SveContigMemSI',
@@ -1028,7 +1028,7 @@
              'ea_code' : sveEnabledCheckCode + eaCode,
              'fault_code' : faultCode,
              'fa_code' : ''},
-            ['IsMemRef', 'IsLoad'])
+            ['IsLoad'])
         faultIop.snippets['memacc_code'] = (ffrReadBackCode +
                                            faultIop.snippets['memacc_code'])
         if offsetIsImm:
@@ -1091,7 +1091,7 @@
              'memacc_code': memAccCode,
              'ea_code' : sveEnabledCheckCode + eaCode,
              'fa_code' : ''},
-            ['IsMemRef', 'IsLoad'])
+            ['IsLoad'])
         header_output += SveContigMemSIOpDeclare.subst(iop)
         exec_output += (
             SveLoadAndReplExecute.subst(iop) +
@@ -1158,7 +1158,7 @@
              'fault_status_reset_code' : faultStatusResetCode,
              'pred_check_code' : predCheckCode,
              'fa_code' : ''},
-            ['IsMicroop', 'IsMemRef', 'IsLoad'])
+            ['IsMicroop', 'IsLoad'])
         storeIop = InstObjParams('st1',
             ('SveScatterStoreVIMicroop'
              if indexed_addr_form == IndexedAddrForm.VEC_PLUS_IMM
@@ -1170,7 +1170,7 @@
              'ea_code' : sveEnabledCheckCode + eaCode_store,
              'pred_check_code' : predCheckCode,
              'fa_code' : ''},
-            ['IsMicroop', 'IsMemRef', 'IsStore'])
+            ['IsMicroop', 'IsStore'])
         if indexed_addr_form == IndexedAddrForm.VEC_PLUS_IMM:
             header_output += SveIndexedMemVIMicroopDeclare.subst(loadIop)
             header_output += SveIndexedMemVIMicroopDeclare.subst(storeIop)
@@ -1445,7 +1445,7 @@
              'memacc_code': loadMemAccCode,
              'ea_code' : sveEnabledCheckCode + eaCode,
              'fa_code' : ''},
-            ['IsMemRef', 'IsLoad', 'IsMicroop'])
+            ['IsLoad', 'IsMicroop'])
         storeIop = InstObjParams('stxx',
             'SveStoreRegImmMicroop' if offsetIsImm
                                     else 'SveStoreRegRegMicroop',
@@ -1455,7 +1455,7 @@
              'memacc_code': storeMemAccCode,
              'ea_code' : sveEnabledCheckCode + eaCode,
              'fa_code' : ''},
-            ['IsMemRef', 'IsStore', 'IsMicroop'])
+            ['IsStore', 'IsMicroop'])
         if offsetIsImm:
             header_output += SveStructMemSIMicroopDeclare.subst(loadIop)
             header_output += SveStructMemSIMicroopDeclare.subst(storeIop)
@@ -1528,7 +1528,7 @@
                  'ea_code': sveEnabledCheckCode + eaCode,
                  'fault_code': '',
                  'fa_code': ''},
-                ['IsMemRef', 'IsLoad'])
+                ['IsLoad'])
         if offsetIsImm:
             header_output += SveContigMemSIOpDeclare.subst(iop)
         else:
diff --git a/src/arch/arm/isa/operands.isa b/src/arch/arm/isa/operands.isa
index e9ee098..134c51f 100644
--- a/src/arch/arm/isa/operands.isa
+++ b/src/arch/arm/isa/operands.isa
@@ -681,7 +681,7 @@
     'XURc' : intRegX64('urc'),
 
     #Memory Operand
-    'Mem': ('Mem', 'uw', None, ('IsMemRef', 'IsLoad', 'IsStore'), srtNormal),
+    'Mem': ('Mem', 'uw', None, (None, 'IsLoad', 'IsStore'), srtNormal),
 
     #PCState fields
     'RawPC': pcStateReg('pc', srtPC),
diff --git a/src/arch/arm/isa/templates/mem.isa b/src/arch/arm/isa/templates/mem.isa
index 1496fac..b056f48 100644
--- a/src/arch/arm/isa/templates/mem.isa
+++ b/src/arch/arm/isa/templates/mem.isa
@@ -206,7 +206,10 @@
         if (%(predicate_test)s)
         {
             if (fault == NoFault) {
-                fault = xc->readMem(EA, dataPtr, %(size)d, memAccessFlags);
+                const auto size = %(size)d;
+                fault = readMemAtomic(xc, EA, dataPtr,
+                                      size, memAccessFlags,
+                                      std::vector<bool>(size, true));
                 %(memacc_code)s;
             }
 
@@ -277,8 +280,10 @@
             }
 
             if (fault == NoFault) {
-                fault = xc->writeMem(dataPtr, %(size)d, EA,
-                                     memAccessFlags, NULL);
+                const auto size = %(size)d;
+                fault = writeMemAtomic(xc, dataPtr, EA, size,
+                                       memAccessFlags, NULL,
+                                       std::vector<bool>(size, true));
             }
 
             if (fault == NoFault) {
@@ -410,8 +415,10 @@
             }
 
             if (fault == NoFault) {
-                fault = xc->writeMem(memUnion.bytes, %(size)d, EA,
-                                     memAccessFlags, NULL);
+                const auto size = %(size)d;
+                fault = writeMemTiming(xc, memUnion.bytes, EA,
+                                       size, memAccessFlags, nullptr,
+                                       std::vector<bool>(size, true));
             }
         } else {
             xc->setPredicate(false);
@@ -462,7 +469,9 @@
         if (%(predicate_test)s)
         {
             if (fault == NoFault) {
-                fault = xc->initiateMemRead(EA, %(size)d, memAccessFlags);
+                const auto size = %(size)d;
+                fault = initiateMemRead(xc, EA, size, memAccessFlags,
+                                        std::vector<bool>(size, true));
             }
         } else {
             xc->setPredicate(false);
@@ -1117,7 +1126,7 @@
                  (IntRegIndex)_index)
     {
         %(constructor)s;
-        bool conditional M5_VAR_USED = false;
+        M5_VAR_USED bool conditional = false;
         if (!(condCode == COND_AL || condCode == COND_UC)) {
             conditional = true;
             for (int x = 0; x < _numDestRegs; x++) {
@@ -1183,7 +1192,7 @@
                  (IntRegIndex)_dest, (IntRegIndex)_base, _add, _imm)
     {
         %(constructor)s;
-        bool conditional M5_VAR_USED = false;
+        M5_VAR_USED bool conditional = false;
         if (!(condCode == COND_AL || condCode == COND_UC)) {
             conditional = true;
             for (int x = 0; x < _numDestRegs; x++) {
diff --git a/src/arch/arm/isa/templates/mem64.isa b/src/arch/arm/isa/templates/mem64.isa
index ed43cd7..3b90f8c 100644
--- a/src/arch/arm/isa/templates/mem64.isa
+++ b/src/arch/arm/isa/templates/mem64.isa
@@ -322,7 +322,9 @@
         }
 
         if (fault == NoFault) {
-            fault = xc->writeMem(NULL, op_size, EA, memAccessFlags, NULL);
+            fault = writeMemAtomic(xc, NULL, EA,
+                                   op_size, memAccessFlags, NULL,
+                                   std::vector<bool>(op_size, true));
         }
 
         if (fault == NoFault) {
@@ -349,7 +351,9 @@
         }
 
         if (fault == NoFault) {
-            fault = xc->writeMem(NULL, op_size, EA, memAccessFlags, NULL);
+            fault = writeMemTiming(xc, NULL, EA, op_size,
+                                   memAccessFlags, NULL,
+                                   std::vector<bool>(op_size, true));
         }
 
         return fault;
diff --git a/src/arch/arm/isa/templates/misc.isa b/src/arch/arm/isa/templates/misc.isa
index c982e75..51d8337 100644
--- a/src/arch/arm/isa/templates/misc.isa
+++ b/src/arch/arm/isa/templates/misc.isa
@@ -647,7 +647,8 @@
             if (fault == NoFault) {
                 Addr op_size = xc->tcBase()->getSystemPtr()->cacheLineSize();
                 EA &= ~(op_size - 1);
-                fault = xc->writeMem(NULL, op_size, EA, memAccessFlags, NULL);
+                fault = writeMemAtomic(xc, NULL, EA, op_size,
+                    memAccessFlags, NULL, std::vector<bool>(op_size, true));
             }
         } else {
             xc->setPredicate(false);
@@ -676,7 +677,8 @@
             if (fault == NoFault) {
                 Addr op_size = xc->tcBase()->getSystemPtr()->cacheLineSize();
                 EA &= ~(op_size - 1);
-                fault = xc->writeMem(NULL, op_size, EA, memAccessFlags, NULL);
+                fault = writeMemTiming(xc, NULL, EA, op_size,
+                    memAccessFlags, NULL, std::vector<bool>(op_size, true));
             }
         } else {
             xc->setPredicate(false);
diff --git a/src/arch/arm/isa/templates/neon64.isa b/src/arch/arm/isa/templates/neon64.isa
index 5d30107..d281c95 100644
--- a/src/arch/arm/isa/templates/neon64.isa
+++ b/src/arch/arm/isa/templates/neon64.isa
@@ -287,7 +287,9 @@
         uint8_t *dataPtr = memUnion.bytes;
 
         if (fault == NoFault) {
-            fault = xc->readMem(EA, dataPtr, accSize, memAccessFlags);
+            fault = readMemAtomic(xc, EA, dataPtr,
+                                  accSize, memAccessFlags,
+                                  std::vector<bool>(accSize, true));
             %(memacc_code)s;
         }
 
@@ -312,7 +314,9 @@
         %(ea_code)s;
 
         if (fault == NoFault) {
-            fault = xc->initiateMemRead(EA, accSize, memAccessFlags);
+            fault = initiateMemRead(xc, EA, accSize,
+                                    memAccessFlags,
+                                    std::vector<bool>(accSize, true));
         }
 
         return fault;
@@ -364,8 +368,9 @@
         }
 
         if (fault == NoFault) {
-            fault = xc->writeMem(dataPtr, accSize, EA, memAccessFlags,
-                                 NULL);
+            fault = writeMemAtomic(xc, dataPtr, EA, accSize,
+                                   memAccessFlags, nullptr,
+                                   std::vector<bool>(accSize, true));
         }
 
         if (fault == NoFault) {
@@ -394,8 +399,9 @@
         }
 
         if (fault == NoFault) {
-            fault = xc->writeMem(memUnion.bytes, accSize, EA, memAccessFlags,
-                                 NULL);
+            fault = writeMemTiming(xc, memUnion.bytes, EA,
+                                   accSize, memAccessFlags, NULL,
+                                   std::vector<bool>(accSize, true));
         }
 
         return fault;
diff --git a/src/arch/arm/isa/templates/semihost.isa b/src/arch/arm/isa/templates/semihost.isa
index 0ad84c8..c60db17 100644
--- a/src/arch/arm/isa/templates/semihost.isa
+++ b/src/arch/arm/isa/templates/semihost.isa
@@ -38,8 +38,8 @@
 // A new class of Semihosting constructor templates has been added.
 // Their main purpose is to check if the Exception Generation
 // Instructions (HLT, SVC) are actually a semihosting command.
-// If that is the case, the IsMemBarrier flag is raised, so that
-// in the O3 model we perform a coherent memory access during
+// If that is the case, the IsReadBarrier and IsWriteBarrier flags are raised,
+// so that in the O3 model we perform a coherent memory access during
 // the semihosting operation.
 // Please note: since we don't have a thread context pointer in the
 // constructor we cannot check if semihosting is enabled in the
@@ -64,7 +64,8 @@
         auto semihost_imm = machInst.thumb? %(thumb_semihost)s :
                                             %(arm_semihost)s;
         if (_imm == semihost_imm) {
-            flags[IsMemBarrier] = true;
+            flags[IsReadBarrier] = true;
+            flags[IsWriteBarrier] = true;
         }
     }
 }};
@@ -78,7 +79,8 @@
         // In AArch64 there is only one instruction for issuing
         // semhosting commands: HLT #0xF000
         if (_imm == 0xF000) {
-            flags[IsMemBarrier] = true;
+            flags[IsReadBarrier] = true;
+            flags[IsWriteBarrier] = true;
         }
     }
 }};
diff --git a/src/arch/arm/isa/templates/sve_mem.isa b/src/arch/arm/isa/templates/sve_mem.isa
index 46d38c4..7d59908 100644
--- a/src/arch/arm/isa/templates/sve_mem.isa
+++ b/src/arch/arm/isa/templates/sve_mem.isa
@@ -142,7 +142,7 @@
     {
         Addr EA;
         Fault fault = NoFault;
-        bool aarch64 M5_VAR_USED = true;
+        M5_VAR_USED bool aarch64 = true;
         unsigned eCount = ArmStaticInst::getCurSveVecLen<RegElemType>(
             xc->tcBase());
 
@@ -155,8 +155,8 @@
 
         %(rden_code)s;
 
-        fault = xc->readMem(EA, memData.raw_ptr<uint8_t>(), memAccessSize,
-            this->memAccessFlags, rdEn);
+        fault = readMemAtomic(xc, EA, memData.raw_ptr<uint8_t>(),
+            memAccessSize, this->memAccessFlags, rdEn);
 
         %(fault_code)s;
 
@@ -176,7 +176,7 @@
     {
         Addr EA;
         Fault fault = NoFault;
-        bool aarch64 M5_VAR_USED = true;
+        M5_VAR_USED bool aarch64 = true;
         unsigned eCount = ArmStaticInst::getCurSveVecLen<RegElemType>(
             xc->tcBase());
 
@@ -186,8 +186,8 @@
 
         %(rden_code)s;
 
-        fault = xc->initiateMemRead(EA, memAccessSize, this->memAccessFlags,
-            rdEn);
+        fault = initiateMemRead(xc, EA, memAccessSize,
+            this->memAccessFlags, rdEn);
 
         %(fault_code)s;
 
@@ -200,7 +200,7 @@
     Fault %(class_name)s%(tpl_args)s::completeAcc(PacketPtr pkt,
         ExecContext *xc, Trace::InstRecord *traceData) const
     {
-        bool aarch64 M5_VAR_USED = true;
+        M5_VAR_USED bool aarch64 = true;
         unsigned eCount = ArmStaticInst::getCurSveVecLen<RegElemType>(
             xc->tcBase());
 
@@ -229,7 +229,7 @@
     {
         Addr EA;
         Fault fault = NoFault;
-        bool aarch64 M5_VAR_USED = true;
+        M5_VAR_USED bool aarch64 = true;
         unsigned eCount = ArmStaticInst::getCurSveVecLen<RegElemType>(
             xc->tcBase());
 
@@ -247,8 +247,8 @@
         }
 
         if (fault == NoFault) {
-            fault = xc->writeMem(memData.raw_ptr<uint8_t>(), memAccessSize, EA,
-                this->memAccessFlags, NULL, wrEn);
+            fault = writeMemAtomic(xc, memData.raw_ptr<uint8_t>(),
+                EA, memAccessSize, this->memAccessFlags, NULL, wrEn);
         }
 
         if (fault == NoFault) {
@@ -266,7 +266,7 @@
     {
         Addr EA;
         Fault fault = NoFault;
-        bool aarch64 M5_VAR_USED = true;
+        M5_VAR_USED bool aarch64 = true;
         unsigned eCount = ArmStaticInst::getCurSveVecLen<RegElemType>(
             xc->tcBase());
 
@@ -284,8 +284,8 @@
         }
 
         if (fault == NoFault) {
-            fault = xc->writeMem(memData.raw_ptr<uint8_t>(), memAccessSize, EA,
-                this->memAccessFlags, NULL, wrEn);
+            fault = writeMemTiming(xc, memData.raw_ptr<uint8_t>(),
+                EA, memAccessSize, this->memAccessFlags, NULL, wrEn);
         }
 
         return fault;
@@ -308,7 +308,7 @@
     {
         Addr EA;
         Fault fault = NoFault;
-        bool aarch64 M5_VAR_USED = true;
+        M5_VAR_USED bool aarch64 = true;
         unsigned eCount = ArmStaticInst::getCurSveVecLen<RegElemType>(
             xc->tcBase());
 
@@ -339,7 +339,7 @@
     {
         Addr EA;
         Fault fault = NoFault;
-        bool aarch64 M5_VAR_USED = true;
+        M5_VAR_USED bool aarch64 = true;
 
         %(op_src_decl)s;
         %(op_rd)s;
@@ -363,7 +363,7 @@
         ExecContext *xc, Trace::InstRecord *traceData) const
     {
         Fault fault = NoFault;
-        bool aarch64 M5_VAR_USED = true;
+        M5_VAR_USED bool aarch64 = true;
         unsigned eCount = ArmStaticInst::getCurSveVecLen<RegElemType>(
             xc->tcBase());
 
@@ -547,7 +547,7 @@
     {
         Addr EA;
         Fault fault = NoFault;
-        bool aarch64 M5_VAR_USED = true;
+        M5_VAR_USED bool aarch64 = true;
 
         %(op_decl)s;
         %(op_rd)s;
@@ -595,7 +595,7 @@
     {
         Addr EA;
         Fault fault = NoFault;
-        bool aarch64 M5_VAR_USED = true;
+        M5_VAR_USED bool aarch64 = true;
 
         %(op_src_decl)s;
         %(op_rd)s;
@@ -635,7 +635,7 @@
     Fault %(class_name)s%(tpl_args)s::completeAcc(PacketPtr pkt,
         ExecContext *xc, Trace::InstRecord *traceData) const
     {
-        bool aarch64 M5_VAR_USED = true;
+        M5_VAR_USED bool aarch64 = true;
 
         %(op_decl)s;
         %(op_rd)s;
@@ -661,7 +661,7 @@
     {
         Addr EA;
         Fault fault = NoFault;
-        bool aarch64 M5_VAR_USED = true;
+        M5_VAR_USED bool aarch64 = true;
 
         %(op_decl)s;
         %(op_rd)s;
@@ -691,7 +691,7 @@
     {
         Addr EA;
         Fault fault = NoFault;
-        bool aarch64 M5_VAR_USED = true;
+        M5_VAR_USED bool aarch64 = true;
 
         %(op_decl)s;
         %(op_rd)s;
@@ -759,7 +759,7 @@
     Fault %(class_name)s%(tpl_args)s::execute(ExecContext *xc,
         Trace::InstRecord *traceData) const
     {
-        bool aarch64 M5_VAR_USED = true;
+        M5_VAR_USED bool aarch64 = true;
 
         %(op_decl)s;
         %(op_rd)s;
@@ -933,7 +933,7 @@
     {
         Addr EA;
         Fault fault = NoFault;
-        bool aarch64 M5_VAR_USED = true;
+        M5_VAR_USED bool aarch64 = true;
         unsigned eCount = ArmStaticInst::getCurSveVecLen<Element>(
             xc->tcBase());
 
@@ -945,8 +945,9 @@
         auto memDataView = memData.as<Element>();
 
         if (fault == NoFault) {
-            fault = xc->readMem(EA, memData.raw_ptr<uint8_t>(), memAccessSize,
-                this->memAccessFlags);
+            fault = readMemAtomic(xc, EA, memData.raw_ptr<uint8_t>(),
+                memAccessSize, this->memAccessFlags,
+                std::vector<bool>(memAccessSize, true));
             %(memacc_code)s;
         }
 
@@ -965,7 +966,7 @@
     {
         Addr EA;
         Fault fault = NoFault;
-        bool aarch64 M5_VAR_USED = true;
+        M5_VAR_USED bool aarch64 = true;
         unsigned eCount = ArmStaticInst::getCurSveVecLen<Element>(
             xc->tcBase());
 
@@ -975,8 +976,9 @@
         %(ea_code)s;
 
         if (fault == NoFault) {
-            fault = xc->initiateMemRead(EA, memAccessSize,
-                this->memAccessFlags);
+            fault = initiateMemRead(xc, EA,
+                memAccessSize, this->memAccessFlags,
+                std::vector<bool>(memAccessSize, true));
         }
 
         return fault;
@@ -989,7 +991,7 @@
         ExecContext *xc, Trace::InstRecord *traceData) const
     {
         Fault fault = NoFault;
-        bool aarch64 M5_VAR_USED = true;
+        M5_VAR_USED bool aarch64 = true;
         unsigned eCount = ArmStaticInst::getCurSveVecLen<Element>(
             xc->tcBase());
 
@@ -1021,7 +1023,7 @@
     {
         Addr EA;
         Fault fault = NoFault;
-        bool aarch64 M5_VAR_USED = true;
+        M5_VAR_USED bool aarch64 = true;
         unsigned eCount = ArmStaticInst::getCurSveVecLen<Element>(
             xc->tcBase());
 
@@ -1039,8 +1041,8 @@
         }
 
         if (fault == NoFault) {
-            fault = xc->writeMem(memData.raw_ptr<uint8_t>(), memAccessSize, EA,
-                this->memAccessFlags, NULL, wrEn);
+            fault = writeMemAtomic(xc, memData.raw_ptr<uint8_t>(),
+                EA, memAccessSize, this->memAccessFlags, NULL, wrEn);
         }
 
         if (fault == NoFault) {
@@ -1058,7 +1060,7 @@
     {
         Addr EA;
         Fault fault = NoFault;
-        bool aarch64 M5_VAR_USED = true;
+        M5_VAR_USED bool aarch64 = true;
         unsigned eCount = ArmStaticInst::getCurSveVecLen<Element>(
             xc->tcBase());
 
@@ -1076,8 +1078,8 @@
         }
 
         if (fault == NoFault) {
-            fault = xc->writeMem(memData.raw_ptr<uint8_t>(), memAccessSize, EA,
-                this->memAccessFlags, NULL, wrEn);
+            fault = writeMemTiming(xc, memData.raw_ptr<uint8_t>(),
+                EA, memAccessSize, this->memAccessFlags, NULL, wrEn);
         }
 
         return fault;
diff --git a/src/arch/arm/remote_gdb.hh b/src/arch/arm/remote_gdb.hh
index eda5eec..8988c20 100644
--- a/src/arch/arm/remote_gdb.hh
+++ b/src/arch/arm/remote_gdb.hh
@@ -66,12 +66,12 @@
     {
       using BaseGdbRegCache::BaseGdbRegCache;
       private:
-        struct {
+        struct M5_ATTR_PACKED {
           uint32_t gpr[16];
           uint32_t cpsr;
           uint64_t fpr[32];
           uint32_t fpscr;
-        } M5_ATTR_PACKED r;
+        } r;
       public:
         char *data() const { return (char *)&r; }
         size_t size() const { return sizeof(r); }
@@ -88,7 +88,7 @@
     {
       using BaseGdbRegCache::BaseGdbRegCache;
       private:
-        struct {
+        struct M5_ATTR_PACKED {
           uint64_t x[31];
           uint64_t spx;
           uint64_t pc;
@@ -96,7 +96,7 @@
           VecElem v[NumVecV8ArchRegs * NumVecElemPerNeonVecReg];
           uint32_t fpsr;
           uint32_t fpcr;
-        } M5_ATTR_PACKED r;
+        } r;
       public:
         char *data() const { return (char *)&r; }
         size_t size() const { return sizeof(r); }
diff --git a/src/arch/arm/tlb.cc b/src/arch/arm/tlb.cc
index 413a13e..a0f837d 100644
--- a/src/arch/arm/tlb.cc
+++ b/src/arch/arm/tlb.cc
@@ -695,7 +695,7 @@
     // Cache clean operations require read permissions to the specified VA
     bool is_write = !req->isCacheClean() && mode == Write;
     bool is_atomic = req->isAtomic();
-    bool is_priv M5_VAR_USED  = isPriv && !(flags & UserMode);
+    M5_VAR_USED bool is_priv = isPriv && !(flags & UserMode);
 
     updateMiscReg(tc, curTranType);
 
diff --git a/src/arch/arm/tracers/tarmac_record.cc b/src/arch/arm/tracers/tarmac_record.cc
index 3969b6d..b7729b4 100644
--- a/src/arch/arm/tracers/tarmac_record.cc
+++ b/src/arch/arm/tracers/tarmac_record.cc
@@ -37,6 +37,8 @@
 
 #include "arch/arm/tracers/tarmac_record.hh"
 
+#include <memory>
+
 #include "arch/arm/insts/static_inst.hh"
 #include "tarmac_tracer.hh"
 
@@ -291,7 +293,7 @@
     // Generate an instruction entry in the record and
     // add it to the Instruction Queue
     queue.push_back(
-        m5::make_unique<TraceInstEntry>(tarmCtx, predicate)
+        std::make_unique<TraceInstEntry>(tarmCtx, predicate)
     );
 }
 
@@ -304,9 +306,9 @@
     // Memory Queue
     if (getMemValid()) {
         queue.push_back(
-            m5::make_unique<TraceMemEntry>(tarmCtx,
-                                           static_cast<uint8_t>(getSize()),
-                                           getAddr(), getIntData())
+            std::make_unique<TraceMemEntry>(tarmCtx,
+                                            static_cast<uint8_t>(getSize()),
+                                            getAddr(), getIntData())
         );
     }
 }
@@ -326,9 +328,7 @@
 
         // Copying the entry and adding it to the "list"
         // of entries to be dumped to trace.
-        queue.push_back(
-            m5::make_unique<TraceRegEntry>(single_reg)
-        );
+        queue.push_back(std::make_unique<TraceRegEntry>(single_reg));
     }
 
     // Gem5 is treating CPSR flags as separate registers (CC registers),
diff --git a/src/arch/arm/tracers/tarmac_record.hh b/src/arch/arm/tracers/tarmac_record.hh
index bb7a336..e5179ce 100644
--- a/src/arch/arm/tracers/tarmac_record.hh
+++ b/src/arch/arm/tracers/tarmac_record.hh
@@ -43,6 +43,8 @@
 #ifndef __ARCH_ARM_TRACERS_TARMAC_RECORD_HH__
 #define __ARCH_ARM_TRACERS_TARMAC_RECORD_HH__
 
+#include <memory>
+
 #include "arch/arm/tracers/tarmac_base.hh"
 #include "base/printable.hh"
 #include "config/the_isa.hh"
@@ -246,7 +248,7 @@
             if (cpsr_it == queue.end()) {
                 RegId reg(MiscRegClass, ArmISA::MISCREG_CPSR);
                 queue.push_back(
-                    m5::make_unique<RegEntry>(
+                    std::make_unique<RegEntry>(
                         genRegister<RegEntry>(tarmCtx, reg))
                 );
             }
diff --git a/src/arch/arm/tracers/tarmac_record_v8.cc b/src/arch/arm/tracers/tarmac_record_v8.cc
index fa4304f..f4bb7fd 100644
--- a/src/arch/arm/tracers/tarmac_record_v8.cc
+++ b/src/arch/arm/tracers/tarmac_record_v8.cc
@@ -37,6 +37,8 @@
 
 #include "arch/arm/tracers/tarmac_record_v8.hh"
 
+#include <memory>
+
 #include "arch/arm/insts/static_inst.hh"
 #include "arch/arm/tlb.hh"
 #include "arch/arm/tracers/tarmac_tracer.hh"
@@ -185,7 +187,7 @@
     // Generate an instruction entry in the record and
     // add it to the Instruction Queue
     queue.push_back(
-        m5::make_unique<TraceInstEntryV8>(tarmCtx, predicate)
+        std::make_unique<TraceInstEntryV8>(tarmCtx, predicate)
     );
 }
 
@@ -198,9 +200,9 @@
     // Memory Queue
     if (getMemValid()) {
         queue.push_back(
-            m5::make_unique<TraceMemEntryV8>(tarmCtx,
-                                             static_cast<uint8_t>(getSize()),
-                                             getAddr(), getIntData())
+            std::make_unique<TraceMemEntryV8>(tarmCtx,
+                                              static_cast<uint8_t>(getSize()),
+                                              getAddr(), getIntData())
         );
     }
 }
@@ -220,9 +222,7 @@
 
         // Copying the entry and adding it to the "list"
         // of entries to be dumped to trace.
-        queue.push_back(
-            m5::make_unique<TraceRegEntryV8>(single_reg)
-        );
+        queue.push_back(std::make_unique<TraceRegEntryV8>(single_reg));
     }
 
     // Gem5 is treating CPSR flags as separate registers (CC registers),
diff --git a/src/arch/arm/utility.cc b/src/arch/arm/utility.cc
index a189c4a..4d866d0 100644
--- a/src/arch/arm/utility.cc
+++ b/src/arch/arm/utility.cc
@@ -56,13 +56,10 @@
 uint64_t
 getArgument(ThreadContext *tc, int &number, uint16_t size, bool fp)
 {
-    if (!FullSystem) {
-        panic("getArgument() only implemented for full system mode.\n");
-        M5_DUMMY_RETURN
-    }
+    panic_if(!FullSystem,
+            "getArgument() only implemented for full system mode.");
 
-    if (fp)
-        panic("getArgument(): Floating point arguments not implemented\n");
+    panic_if(fp, "getArgument(): Floating point arguments not implemented");
 
     if (inAArch64(tc)) {
         if (size == (uint16_t)(-1))
diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc
index 296dbad..b501167 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -32522,6 +32522,7 @@
     {
         Wavefront *wf = gpuDynInst->wavefront();
         gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->exec_mask = wf->execMask();
         gpuDynInst->latency.init(gpuDynInst->computeUnit());
         gpuDynInst->latency.set(gpuDynInst->computeUnit()
                                 ->cyclesToTicks(Cycles(24)));
@@ -32593,6 +32594,7 @@
     {
         Wavefront *wf = gpuDynInst->wavefront();
         gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->exec_mask = wf->execMask();
         gpuDynInst->latency.init(gpuDynInst->computeUnit());
         gpuDynInst->latency.set(gpuDynInst->computeUnit()
                                 ->cyclesToTicks(Cycles(24)));
diff --git a/src/arch/generic/memhelpers.hh b/src/arch/generic/memhelpers.hh
index 2a5a380..d9adfdc 100644
--- a/src/arch/generic/memhelpers.hh
+++ b/src/arch/generic/memhelpers.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013 ARM Limited
+ * Copyright (c) 2013, 2019 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -47,6 +47,15 @@
 #include "sim/byteswap.hh"
 #include "sim/insttracer.hh"
 
+template <class XC>
+Fault
+initiateMemRead(XC *xc, Addr addr, std::size_t size,
+                Request::Flags flags,
+                const std::vector<bool> &byte_enable)
+{
+    return xc->initiateMemRead(addr, size, flags, byte_enable);
+}
+
 /// Initiate a read from memory in timing mode.  Note that the 'mem'
 /// parameter is unused; only the type of that parameter is used
 /// to determine the size of the access.
@@ -55,7 +64,9 @@
 initiateMemRead(XC *xc, Trace::InstRecord *traceData, Addr addr,
                 MemT &mem, Request::Flags flags)
 {
-    return xc->initiateMemRead(addr, sizeof(MemT), flags);
+    static const std::vector<bool> byte_enable(sizeof(MemT), true);
+    return initiateMemRead(xc, addr, sizeof(MemT),
+                           flags, byte_enable);
 }
 
 /// Extract the data returned from a timing mode read.
@@ -83,13 +94,25 @@
 }
 
 /// Read from memory in atomic mode.
+template <class XC>
+Fault
+readMemAtomic(XC *xc, Addr addr, uint8_t *mem,
+              std::size_t size, Request::Flags flags,
+              const std::vector<bool> &byte_enable)
+{
+    return xc->readMem(addr, mem, size, flags, byte_enable);
+}
+
+/// Read from memory in atomic mode.
 template <ByteOrder Order, class XC, class MemT>
 Fault
 readMemAtomic(XC *xc, Trace::InstRecord *traceData, Addr addr, MemT &mem,
               Request::Flags flags)
 {
     memset(&mem, 0, sizeof(mem));
-    Fault fault = xc->readMem(addr, (uint8_t *)&mem, sizeof(MemT), flags);
+    static const std::vector<bool> byte_enable(sizeof(MemT), true);
+    Fault fault = readMemAtomic(xc, addr, (uint8_t*)&mem,
+                                sizeof(MemT), flags, byte_enable);
     if (fault == NoFault) {
         mem = gtoh(mem, Order);
         if (traceData)
@@ -116,6 +139,15 @@
 }
 
 /// Write to memory in timing mode.
+template <class XC>
+Fault
+writeMemTiming(XC *xc, uint8_t *mem, Addr addr,
+               std::size_t size, Request::Flags flags, uint64_t *res,
+               const std::vector<bool> &byte_enable)
+{
+    return xc->writeMem(mem, size, addr, flags, res, byte_enable);
+}
+
 template <ByteOrder Order, class XC, class MemT>
 Fault
 writeMemTiming(XC *xc, Trace::InstRecord *traceData, MemT mem, Addr addr,
@@ -125,7 +157,9 @@
         traceData->setData(mem);
     }
     mem = htog(mem, Order);
-    return xc->writeMem((uint8_t *)&mem, sizeof(MemT), addr, flags, res);
+    static const std::vector<bool> byte_enable(sizeof(MemT), true);
+    return writeMemTiming(xc, (uint8_t*)&mem, addr,
+                          sizeof(MemT), flags, res, byte_enable);
 }
 
 template <class XC, class MemT>
@@ -147,6 +181,15 @@
 }
 
 /// Write to memory in atomic mode.
+template <class XC>
+Fault
+writeMemAtomic(XC *xc, uint8_t *mem, Addr addr,
+               std::size_t size, Request::Flags flags,
+               uint64_t *res, const std::vector<bool> &byte_enable)
+{
+    return xc->writeMem(mem, size, addr, flags, res, byte_enable);
+}
+
 template <ByteOrder Order, class XC, class MemT>
 Fault
 writeMemAtomic(XC *xc, Trace::InstRecord *traceData, const MemT &mem,
@@ -156,8 +199,9 @@
         traceData->setData(mem);
     }
     MemT host_mem = htog(mem, Order);
-    Fault fault =
-          xc->writeMem((uint8_t *)&host_mem, sizeof(MemT), addr, flags, res);
+    static const std::vector<bool> byte_enable(sizeof(MemT), true);
+    Fault fault = writeMemAtomic(xc, (uint8_t*)&host_mem,
+                                 addr, sizeof(MemT), flags, res, byte_enable);
     if (fault == NoFault && res != NULL) {
         if (flags & Request::MEM_SWAP || flags & Request::MEM_SWAP_COND)
             *(MemT *)res = gtoh(*(MemT *)res, Order);
diff --git a/src/arch/isa_parser.py b/src/arch/isa_parser.py
index 7d8bffd..86f5089 100755
--- a/src/arch/isa_parser.py
+++ b/src/arch/isa_parser.py
@@ -166,7 +166,7 @@
             if operands.predRead:
                 myDict['op_decl'] += 'uint8_t _sourceIndex = 0;\n'
             if operands.predWrite:
-                myDict['op_decl'] += 'uint8_t M5_VAR_USED _destIndex = 0;\n'
+                myDict['op_decl'] += 'M5_VAR_USED uint8_t _destIndex = 0;\n'
 
             is_src = lambda op: op.is_src
             is_dest = lambda op: op.is_dest
@@ -1076,7 +1076,7 @@
 
     def makeDecl(self):
         # Declare memory data variable.
-        return '%s %s;\n' % (self.ctype, self.base_name)
+        return '%s %s = {};\n' % (self.ctype, self.base_name)
 
     def makeRead(self, predRead):
         if self.read_code != None:
@@ -2082,7 +2082,8 @@
     # 'def [signed] bitfield <ID> [<first>:<last>]'
     # This generates a preprocessor macro in the output file.
     def p_def_bitfield_0(self, t):
-        'def_bitfield : DEF opt_signed BITFIELD ID LESS INTLIT COLON INTLIT GREATER SEMI'
+        'def_bitfield : DEF opt_signed ' \
+                'BITFIELD ID LESS INTLIT COLON INTLIT GREATER SEMI'
         expr = 'bits(machInst, %2d, %2d)' % (t[6], t[8])
         if (t[2] == 'signed'):
             expr = 'sext<%d>(%s)' % (t[6] - t[8] + 1, expr)
diff --git a/src/arch/mips/interrupts.cc b/src/arch/mips/interrupts.cc
index 1cecfaf..dfc5f30 100644
--- a/src/arch/mips/interrupts.cc
+++ b/src/arch/mips/interrupts.cc
@@ -145,8 +145,8 @@
 {
     assert(checkInterrupts());
 
-    StatusReg M5_VAR_USED status = tc->readMiscRegNoEffect(MISCREG_STATUS);
-    CauseReg M5_VAR_USED cause = tc->readMiscRegNoEffect(MISCREG_CAUSE);
+    M5_VAR_USED StatusReg status = tc->readMiscRegNoEffect(MISCREG_STATUS);
+    M5_VAR_USED CauseReg cause = tc->readMiscRegNoEffect(MISCREG_CAUSE);
     DPRINTF(Interrupt, "Interrupt! IM[7:0]=%d IP[7:0]=%d \n",
             (unsigned)status.im, (unsigned)cause.ip);
 
diff --git a/src/arch/mips/isa/decoder.isa b/src/arch/mips/isa/decoder.isa
index f62000e..3b2b015 100644
--- a/src/arch/mips/isa/decoder.isa
+++ b/src/arch/mips/isa/decoder.isa
@@ -166,7 +166,7 @@
                             fault = std::make_shared<SystemCallFault>();
                         }});
                     }
-                    0x7: sync({{ ; }}, IsMemBarrier);
+                    0x7: sync({{ ; }}, IsReadBarrier, IsWriteBarrier);
                   0x5: break({{fault = std::make_shared<BreakpointFault>();}});
                 }
 
@@ -174,10 +174,10 @@
 
             0x2: decode FUNCTION_LO {
                 0x0: HiLoRsSelOp::mfhi({{ Rd = HI_RS_SEL; }},
-                             IntMultOp, IsIprAccess);
+                             IntMultOp, IsSerializeBefore);
                 0x1: HiLoRdSelOp::mthi({{ HI_RD_SEL = Rs; }});
                 0x2: HiLoRsSelOp::mflo({{ Rd = LO_RS_SEL; }},
-                             IntMultOp, IsIprAccess);
+                             IntMultOp, IsSerializeBefore);
                 0x3: HiLoRdSelOp::mtlo({{ LO_RD_SEL = Rs; }});
             }
 
@@ -719,7 +719,7 @@
                         LLFlag = 0;
                         Status = status;
                         SRSCtl = srsCtl;
-                    }}, IsReturn, IsSerializing, IsERET);
+                    }}, IsReturn, IsSerializing);
 
                     0x1F: deret({{
                         DebugReg debug = Debug;
@@ -732,7 +732,7 @@
                             // Undefined;
                         }
                         Debug = debug;
-                    }}, IsReturn, IsSerializing, IsERET);
+                    }}, IsReturn, IsSerializing);
                 }
                 format CP0TLB {
                     0x01: tlbr({{
diff --git a/src/arch/mips/isa/formats/branch.isa b/src/arch/mips/isa/formats/branch.isa
index 4975a13..7c2b27c 100644
--- a/src/arch/mips/isa/formats/branch.isa
+++ b/src/arch/mips/isa/formats/branch.isa
@@ -241,7 +241,6 @@
             code += 'R31 = NNPC;\n'
         elif x == 'Likely':
             not_taken_code = 'NNPC = NPC; NPC = PC;'
-            inst_flags += ('IsCondDelaySlot', )
         else:
             inst_flags += (x, )
 
@@ -280,7 +279,6 @@
             code += 'R32 = NNPC;'
         elif x == 'Likely':
             not_taken_code = 'NNPC = NPC, NPC = PC;'
-            inst_flags += ('IsCondDelaySlot', )
         else:
             inst_flags += (x, )
 
diff --git a/src/arch/mips/isa/formats/dsp.isa b/src/arch/mips/isa/formats/dsp.isa
index 9a6d614..12af2d6 100644
--- a/src/arch/mips/isa/formats/dsp.isa
+++ b/src/arch/mips/isa/formats/dsp.isa
@@ -173,8 +173,6 @@
 
     code = decl_code + code + write_code
 
-    opt_flags += ('IsDspOp',)
-
     iop = InstObjParams(name, Name, 'DspIntOp', code, opt_flags)
     header_output = BasicDeclare.subst(iop)
     decoder_output = BasicConstructor.subst(iop)
@@ -204,8 +202,6 @@
 
     code = decl_code + fetch_code + code + write_code
 
-    opt_flags += ('IsDspOp',)
-
     iop = InstObjParams(name, Name, 'DspHiLoOp', code, opt_flags)
     header_output = BasicDeclare.subst(iop)
     decoder_output = BasicConstructor.subst(iop)
diff --git a/src/arch/mips/isa/formats/mem.isa b/src/arch/mips/isa/formats/mem.isa
index 491dd0c..a31485d 100644
--- a/src/arch/mips/isa/formats/mem.isa
+++ b/src/arch/mips/isa/formats/mem.isa
@@ -404,7 +404,7 @@
     Fault %(class_name)s::execute(ExecContext *xc,
                                   Trace::InstRecord *traceData) const
     {
-        Addr EA M5_VAR_USED = 0;
+        M5_VAR_USED Addr EA = 0;
         Fault fault = NoFault;
 
         %(fp_enable_check)s;
@@ -458,7 +458,6 @@
 
 def format LoadIndexedMemory(memacc_code, ea_code = {{ EA = Rs + Rt; }},
                      mem_flags = [], inst_flags = []) {{
-    inst_flags += ['IsIndexed']
     (header_output, decoder_output, decode_block, exec_output) = \
         LoadStoreBase(name, Name, ea_code, memacc_code, mem_flags, inst_flags,
                       decode_template = ImmNopCheckDecode,
@@ -467,7 +466,6 @@
 
 def format StoreIndexedMemory(memacc_code, ea_code = {{ EA = Rs + Rt; }},
                      mem_flags = [], inst_flags = []) {{
-    inst_flags += ['IsIndexed']
     (header_output, decoder_output, decode_block, exec_output) = \
         LoadStoreBase(name, Name, ea_code, memacc_code, mem_flags, inst_flags,
                       exec_template_base = 'Store')
@@ -475,7 +473,7 @@
 
 def format LoadFPIndexedMemory(memacc_code, ea_code = {{ EA = Rs + Rt; }},
                      mem_flags = [], inst_flags = []) {{
-    inst_flags += ['IsIndexed', 'IsFloating']
+    inst_flags += ['IsFloating']
     (header_output, decoder_output, decode_block, exec_output) = \
         LoadStoreBase(name, Name, ea_code, memacc_code, mem_flags, inst_flags,
                       decode_template = ImmNopCheckDecode,
@@ -484,7 +482,7 @@
 
 def format StoreFPIndexedMemory(memacc_code, ea_code = {{ EA = Rs + Rt; }},
                      mem_flags = [], inst_flags = []) {{
-    inst_flags += ['IsIndexed', 'IsFloating']
+    inst_flags += ['IsFloating']
     (header_output, decoder_output, decode_block, exec_output) = \
         LoadStoreBase(name, Name, ea_code, memacc_code, mem_flags, inst_flags,
                       exec_template_base = 'Store')
diff --git a/src/arch/mips/isa/formats/mt.isa b/src/arch/mips/isa/formats/mt.isa
index fd09fad..6e32caa 100644
--- a/src/arch/mips/isa/formats/mt.isa
+++ b/src/arch/mips/isa/formats/mt.isa
@@ -111,7 +111,7 @@
             ExecContext *xc, Trace::InstRecord *traceData) const
         {
             Fault fault = NoFault;
-            int64_t data M5_VAR_USED;
+            M5_VAR_USED int64_t data;
             %(op_decl)s;
             %(op_rd)s;
 
diff --git a/src/arch/mips/isa/operands.isa b/src/arch/mips/isa/operands.isa
index 26c5a54..3cb2d43 100644
--- a/src/arch/mips/isa/operands.isa
+++ b/src/arch/mips/isa/operands.isa
@@ -144,7 +144,7 @@
     'Cause': ('ControlReg','uw', 'MISCREG_CAUSE',None,1),
 
     #Memory Operand
-    'Mem': ('Mem', 'uw', None, ('IsMemRef', 'IsLoad', 'IsStore'), 4),
+    'Mem': ('Mem', 'uw', None, (None, 'IsLoad', 'IsStore'), 4),
 
     #Program Counter Operands
     'PC': ('PCState', 'uw', 'pc', (None, None, 'IsControl'), 4),
diff --git a/src/arch/mips/locked_mem.hh b/src/arch/mips/locked_mem.hh
index 8400ed6..153a991 100644
--- a/src/arch/mips/locked_mem.hh
+++ b/src/arch/mips/locked_mem.hh
@@ -50,6 +50,7 @@
 #include "arch/registers.hh"
 #include "base/logging.hh"
 #include "base/trace.hh"
+#include "cpu/base.hh"
 #include "debug/LLSC.hh"
 #include "mem/packet.hh"
 #include "mem/request.hh"
diff --git a/src/arch/mips/utility.cc b/src/arch/mips/utility.cc
index 7e797b5..930c36b 100644
--- a/src/arch/mips/utility.cc
+++ b/src/arch/mips/utility.cc
@@ -47,7 +47,6 @@
 getArgument(ThreadContext *tc, int &number, uint16_t size, bool fp)
 {
     panic("getArgument() not implemented\n");
-    M5_DUMMY_RETURN
 }
 
 uint64_t
diff --git a/src/arch/null/SConscript b/src/arch/null/SConscript
index 41457e2..3f0b053 100644
--- a/src/arch/null/SConscript
+++ b/src/arch/null/SConscript
@@ -36,6 +36,3 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 Import('*')
-
-if env['TARGET_ISA'] == 'null':
-    Source('cpu_dummy.cc')
diff --git a/src/arch/null/cpu_dummy.hh b/src/arch/null/cpu_dummy.hh
deleted file mode 100644
index 7e183eb..0000000
--- a/src/arch/null/cpu_dummy.hh
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2013 ARM Limited
- * All rights reserved
- *
- * The license below extends only to copyright in the software and shall
- * not be construed as granting a license to any other intellectual
- * property including but not limited to intellectual property relating
- * to a hardware implementation of the functionality of the software
- * licensed hereunder.  You may use the software subject to the license
- * terms below provided that you ensure that this notice is replicated
- * unmodified and in its entirety in all distributions of the software,
- * modified or unmodified, in source code or in binary form.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met: redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer;
- * redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution;
- * neither the name of the copyright holders nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __ARCH_NULL_CPU_DUMMY_HH__
-#define __ARCH_NULL_CPU_DUMMY_HH__
-
-#include "sim/core.hh"
-
-class BaseCPU
-{
-  public:
-    static int numSimulatedInsts() { return 0; }
-    static int numSimulatedOps() { return 0; }
-    static void wakeup(ThreadID tid) { ; }
-};
-
-#endif // __ARCH_NULL_CPU_DUMMY_HH__
diff --git a/src/arch/power/isa/decoder.isa b/src/arch/power/isa/decoder.isa
index b7b9aff..475ddcc 100644
--- a/src/arch/power/isa/decoder.isa
+++ b/src/arch/power/isa/decoder.isa
@@ -343,8 +343,8 @@
         format MiscOp {
             278: dcbt({{ }});
             246: dcbtst({{ }});
-            598: sync({{ }}, [ IsMemBarrier ]);
-            854: eieio({{ }}, [ IsMemBarrier ]);
+            598: sync({{ }}, [ IsReadBarrier, IsWriteBarrier ]);
+            854: eieio({{ }}, [ IsReadBarrier, IsWriteBarrier ]);
         }
     }
 
diff --git a/src/arch/power/isa/formats/mem.isa b/src/arch/power/isa/formats/mem.isa
index d0ce1a5..1f10a5a 100644
--- a/src/arch/power/isa/formats/mem.isa
+++ b/src/arch/power/isa/formats/mem.isa
@@ -109,7 +109,7 @@
                                       ExecContext *xc,
                                       Trace::InstRecord *traceData) const
     {
-        Addr M5_VAR_USED EA;
+        M5_VAR_USED Addr EA;
         Fault fault = NoFault;
 
         %(op_decl)s;
diff --git a/src/arch/power/isa/operands.isa b/src/arch/power/isa/operands.isa
index 397364f..e77fde2 100644
--- a/src/arch/power/isa/operands.isa
+++ b/src/arch/power/isa/operands.isa
@@ -54,7 +54,7 @@
     'Ft': ('FloatReg', 'df', 'FRT', 'IsFloating', 5),
 
     # Memory Operand
-    'Mem': ('Mem', 'uw', None, ('IsMemRef', 'IsLoad', 'IsStore'), 8),
+    'Mem': ('Mem', 'uw', None, (None, 'IsLoad', 'IsStore'), 8),
 
     # Program counter and next
     'CIA': ('PCState', 'uw', 'pc', (None, None, 'IsControl'), 9),
diff --git a/src/arch/riscv/faults.cc b/src/arch/riscv/faults.cc
index 7a1c7bd..ac4c582 100644
--- a/src/arch/riscv/faults.cc
+++ b/src/arch/riscv/faults.cc
@@ -194,7 +194,7 @@
 void
 SyscallFault::invokeSE(ThreadContext *tc, const StaticInstPtr &inst)
 {
-    tc->syscall();
+    tc->getSystemPtr()->workload->syscall(tc);
 }
 
 } // namespace RiscvISA
diff --git a/src/arch/riscv/isa.cc b/src/arch/riscv/isa.cc
index 87c4043..2823618 100644
--- a/src/arch/riscv/isa.cc
+++ b/src/arch/riscv/isa.cc
@@ -49,7 +49,7 @@
 namespace RiscvISA
 {
 
-const std::array<const char *, NumMiscRegs> M5_VAR_USED MiscRegNames = {{
+M5_VAR_USED const std::array<const char *, NumMiscRegs> MiscRegNames = {{
     [MISCREG_PRV]           = "PRV",
     [MISCREG_ISA]           = "ISA",
     [MISCREG_VENDORID]      = "VENDORID",
diff --git a/src/arch/riscv/isa/decoder.isa b/src/arch/riscv/isa/decoder.isa
index 7b19464..b39005f 100644
--- a/src/arch/riscv/isa/decoder.isa
+++ b/src/arch/riscv/isa/decoder.isa
@@ -421,7 +421,7 @@
         0x03: decode FUNCT3 {
             format FenceOp {
                 0x0: fence({{
-                }}, uint64_t, IsMemBarrier, No_OpClass);
+                }}, uint64_t, IsReadBarrier, IsWriteBarrier, No_OpClass);
                 0x1: fence_i({{
                 }}, uint64_t, IsNonSpeculative, IsSerializeAfter, No_OpClass);
             }
diff --git a/src/arch/riscv/isa/formats/amo.isa b/src/arch/riscv/isa/formats/amo.isa
index 8c7a6a5..7d01145 100644
--- a/src/arch/riscv/isa/formats/amo.isa
+++ b/src/arch/riscv/isa/formats/amo.isa
@@ -100,7 +100,8 @@
         if (RL) {
             rel_fence = new MemFenceMicro(machInst, No_OpClass);
             rel_fence->setFlag(IsFirstMicroop);
-            rel_fence->setFlag(IsMemBarrier);
+            rel_fence->setFlag(IsReadBarrier);
+            rel_fence->setFlag(IsWriteBarrier);
             rel_fence->setFlag(IsDelayedCommit);
         }
 
@@ -121,7 +122,8 @@
         if (AQ) {
             acq_fence = new MemFenceMicro(machInst, No_OpClass);
             acq_fence->setFlag(IsLastMicroop);
-            acq_fence->setFlag(IsMemBarrier);
+            acq_fence->setFlag(IsReadBarrier);
+            acq_fence->setFlag(IsWriteBarrier);
         }
 
         if (RL && AQ) {
@@ -159,7 +161,8 @@
         if (RL) {
             rel_fence = new MemFenceMicro(machInst, No_OpClass);
             rel_fence->setFlag(IsFirstMicroop);
-            rel_fence->setFlag(IsMemBarrier);
+            rel_fence->setFlag(IsReadBarrier);
+            rel_fence->setFlag(IsWriteBarrier);
             rel_fence->setFlag(IsDelayedCommit);
         }
 
@@ -180,7 +183,8 @@
         if (AQ) {
             acq_fence = new MemFenceMicro(machInst, No_OpClass);
             acq_fence->setFlag(IsLastMicroop);
-            acq_fence->setFlag(IsMemBarrier);
+            acq_fence->setFlag(IsReadBarrier);
+            acq_fence->setFlag(IsWriteBarrier);
         }
 
         if (RL && AQ) {
@@ -203,7 +207,6 @@
         %(constructor)s;
 
         // overwrite default flags
-        flags[IsMemRef] = true;
         flags[IsLoad] = false;
         flags[IsStore] = false;
         flags[IsAtomic] = true;
diff --git a/src/arch/riscv/isa/operands.isa b/src/arch/riscv/isa/operands.isa
index 12f5577..78cd5f9 100644
--- a/src/arch/riscv/isa/operands.isa
+++ b/src/arch/riscv/isa/operands.isa
@@ -72,7 +72,7 @@
     'Fp2_bits': ('FloatReg', 'ud', 'FP2 + 8', 'IsFloating', 2),
 
 #Memory Operand
-    'Mem': ('Mem', 'ud', None, ('IsMemRef', 'IsLoad', 'IsStore'), 5),
+    'Mem': ('Mem', 'ud', None, (None, 'IsLoad', 'IsStore'), 5),
 
 #Program Counter Operands
     'PC': ('PCState', 'ud', 'pc', (None, None, 'IsControl'), 7),
diff --git a/src/arch/riscv/locked_mem.hh b/src/arch/riscv/locked_mem.hh
index fd45b3f..10d1839 100644
--- a/src/arch/riscv/locked_mem.hh
+++ b/src/arch/riscv/locked_mem.hh
@@ -52,6 +52,7 @@
 #include "arch/registers.hh"
 #include "base/logging.hh"
 #include "base/trace.hh"
+#include "cpu/base.hh"
 #include "debug/LLSC.hh"
 #include "mem/packet.hh"
 #include "mem/request.hh"
diff --git a/src/arch/sparc/isa/decoder.isa b/src/arch/sparc/isa/decoder.isa
index 75a4d75..c89a141 100644
--- a/src/arch/sparc/isa/decoder.isa
+++ b/src/arch/sparc/isa/decoder.isa
@@ -335,7 +335,8 @@
                 // 7-14 should cause an illegal instruction exception
                 0x0F: decode I {
                     0x0: Nop::stbar(IsWriteBarrier, MemWriteOp);
-                    0x1: Nop::membar(IsMemBarrier, MemReadOp);
+                    0x1: Nop::membar(IsReadBarrier, IsWriteBarrier,
+                                     MemReadOp);
                 }
                 0x10: Priv::rdpcr({{Rd = Pcr;}});
                 0x11: Priv::rdpic({{Rd = Pic;}}, {{Pcr<0:>}});
diff --git a/src/arch/sparc/isa/formats/basic.isa b/src/arch/sparc/isa/formats/basic.isa
index a64f66c..9ab7699 100644
--- a/src/arch/sparc/isa/formats/basic.isa
+++ b/src/arch/sparc/isa/formats/basic.isa
@@ -49,7 +49,7 @@
     // Constructor.
     %(class_name)s(ExtMachInst machInst);
     Fault execute(ExecContext *, Trace::InstRecord *) const override;
-    Fault doFpOp(ExecContext *, Trace::InstRecord *) const M5_NO_INLINE;
+    M5_NO_INLINE Fault doFpOp(ExecContext *, Trace::InstRecord *) const;
 };
 }};
 
diff --git a/src/arch/sparc/isa/operands.isa b/src/arch/sparc/isa/operands.isa
index 2c1eec6..7a2da13 100644
--- a/src/arch/sparc/isa/operands.isa
+++ b/src/arch/sparc/isa/operands.isa
@@ -187,6 +187,6 @@
 
     'Fsr':              ('ControlReg', 'udw', 'MISCREG_FSR', (None, None, ['IsSerializeAfter','IsSerializing','IsNonSpeculative']), 80),
     # Mem gets a large number so it's always last
-    'Mem':              ('Mem', 'udw', None, ('IsMemRef', 'IsLoad', 'IsStore'), 100)
+    'Mem':              ('Mem', 'udw', None, (None, 'IsLoad', 'IsStore'), 100)
 
 }};
diff --git a/src/arch/sparc/linux/linux.hh b/src/arch/sparc/linux/linux.hh
index ed50a30..431ec06 100644
--- a/src/arch/sparc/linux/linux.hh
+++ b/src/arch/sparc/linux/linux.hh
@@ -230,6 +230,11 @@
 
         if (stack)
             ctc->setIntReg(SparcISA::StackPointerReg, stack);
+
+        // Set these extra values. Since "clone" doesn't return two values,
+        // we can set these and they won't be clobbered by the syscall ABI.
+        ptc->setIntReg(SparcISA::SyscallPseudoReturnReg, 0);
+        ctc->setIntReg(SparcISA::SyscallPseudoReturnReg, 1);
     }
 };
 
diff --git a/src/arch/sparc/linux/process.cc b/src/arch/sparc/linux/process.cc
index 79bbaee..ce051ba 100644
--- a/src/arch/sparc/linux/process.cc
+++ b/src/arch/sparc/linux/process.cc
@@ -92,7 +92,7 @@
 {
     switch (trapNum) {
       case 0x10: //Linux 32 bit syscall trap
-        tc->syscall();
+        tc->getSystemPtr()->workload->syscall(tc);
         break;
       default:
         SparcProcess::handleTrap(trapNum, tc);
@@ -129,7 +129,7 @@
     switch (trapNum) {
       // case 0x10: // Linux 32 bit syscall trap
       case 0x6d: // Linux 64 bit syscall trap
-        tc->syscall();
+        tc->getSystemPtr()->workload->syscall(tc);
         break;
       case 0x6e: // Linux 64 bit getcontext trap
         getContext(tc);
diff --git a/src/arch/sparc/utility.cc b/src/arch/sparc/utility.cc
index 21fbf93..66fabd1 100644
--- a/src/arch/sparc/utility.cc
+++ b/src/arch/sparc/utility.cc
@@ -43,10 +43,7 @@
 uint64_t
 getArgument(ThreadContext *tc, int &number, uint16_t size, bool fp)
 {
-    if (!FullSystem) {
-        panic("getArgument() only implemented for full system\n");
-        M5_DUMMY_RETURN
-    }
+    panic_if(!FullSystem, "getArgument() only implemented for full system");
 
     const int NumArgumentRegs = 6;
     if (number < NumArgumentRegs) {
diff --git a/src/arch/x86/isa/decoder/two_byte_opcodes.isa b/src/arch/x86/isa/decoder/two_byte_opcodes.isa
index 5d45144..2236c4f 100644
--- a/src/arch/x86/isa/decoder/two_byte_opcodes.isa
+++ b/src/arch/x86/isa/decoder/two_byte_opcodes.isa
@@ -144,98 +144,13 @@
             // to play with so there can be quite a few pseudo
             // instructions.
             //0x04: loadall_or_reset_or_hang();
-            0x4: decode IMMEDIATE {
-                format BasicOperate {
-                    0x00: m5arm({{
-                        PseudoInst::arm(xc->tcBase());
-                    }}, IsNonSpeculative);
-                    0x01: m5quiesce({{
-                        PseudoInst::quiesce(xc->tcBase());
-                    }}, IsNonSpeculative, IsQuiesce);
-                    0x02: m5quiesceNs({{
-                        PseudoInst::quiesceNs(xc->tcBase(), Rdi);
-                    }}, IsNonSpeculative, IsQuiesce);
-                    0x03: m5quiesceCycle({{
-                        PseudoInst::quiesceCycles(xc->tcBase(), Rdi);
-                    }}, IsNonSpeculative, IsQuiesce);
-                    0x04: m5quiesceTime({{
-                        Rax = PseudoInst::quiesceTime(xc->tcBase());
-                    }}, IsNonSpeculative);
-                    0x07: m5rpns({{
-                        Rax = PseudoInst::rpns(xc->tcBase());
-                    }}, IsNonSpeculative);
-                    0x21: m5exit({{
-                        PseudoInst::m5exit(xc->tcBase(), Rdi);
-                    }}, IsNonSpeculative);
-                    0x22: m5fail({{
-                        PseudoInst::m5fail(xc->tcBase(), Rdi, Rsi);
-                    }}, IsNonSpeculative);
-                    0x23: m5sum({{
-                        Rax = PseudoInst::m5sum(xc->tcBase(),
-                                Rdi, Rsi, Rdx, Rcx, R8, R9);
-                    }}, IsNonSpeculative);
-                    0x30: m5initparam({{
-                        Rax = PseudoInst::initParam(xc->tcBase(), Rdi, Rsi);
-                    }}, IsNonSpeculative);
-                    0x31: m5loadsymbol({{
-                        PseudoInst::loadsymbol(xc->tcBase());
-                    }}, IsNonSpeculative);
-                    0x40: m5resetstats({{
-                        PseudoInst::resetstats(xc->tcBase(), Rdi, Rsi);
-                    }}, IsNonSpeculative);
-                    0x41: m5dumpstats({{
-                        PseudoInst::dumpstats(xc->tcBase(), Rdi, Rsi);
-                    }}, IsNonSpeculative);
-                    0x42: m5dumpresetstats({{
-                        PseudoInst::dumpresetstats(xc->tcBase(), Rdi, Rsi);
-                    }}, IsNonSpeculative);
-                    0x43: m5checkpoint({{
-                        PseudoInst::m5checkpoint(xc->tcBase(), Rdi, Rsi);
-                    }}, IsNonSpeculative);
-                    0x50: m5readfile({{
-                        Rax = PseudoInst::readfile(
-                            xc->tcBase(), Rdi, Rsi, Rdx);
-                    }}, IsNonSpeculative);
-                    0x51: m5debugbreak({{
-                        PseudoInst::debugbreak(xc->tcBase());
-                    }}, IsNonSpeculative);
-                    0x52: m5switchcpu({{
-                        PseudoInst::switchcpu(xc->tcBase());
-                    }}, IsNonSpeculative);
-                    0x53: m5addsymbol({{
-                        PseudoInst::addsymbol(xc->tcBase(), Rdi, Rsi);
-                    }}, IsNonSpeculative);
-                    0x54: m5panic({{
-                        panic("M5 panic instruction called at pc = %#x.\n",
-                              RIP);
-                    }}, IsNonSpeculative);
-                    0x55: m5reserved1({{
-                        warn("M5 reserved opcode 1 ignored.\n");
-                    }}, IsNonSpeculative);
-                    0x56: m5reserved2({{
-                        warn("M5 reserved opcode 2 ignored.\n");
-                    }}, IsNonSpeculative);
-                    0x57: m5reserved3({{
-                        warn("M5 reserved opcode 3 ignored.\n");
-                    }}, IsNonSpeculative);
-                    0x58: m5reserved4({{
-                        warn("M5 reserved opcode 4 ignored.\n");
-                    }}, IsNonSpeculative);
-                    0x59: m5reserved5({{
-                        warn("M5 reserved opcode 5 ignored.\n");
-                    }}, IsNonSpeculative);
-                    0x5a: m5_work_begin({{
-                        PseudoInst::workbegin(xc->tcBase(), Rdi, Rsi);
-                    }}, IsNonSpeculative);
-                    0x5b: m5_work_end({{
-                        PseudoInst::workend(xc->tcBase(), Rdi, Rsi);
-                    }}, IsNonSpeculative);
-                    0x62: m5togglesync({{
-                        PseudoInst::togglesync(xc->tcBase());
-                    }}, IsNonSpeculative, IsQuiesce);
-                    default: Inst::UD2();
-                }
-            }
+            0x4: BasicOperate::gem5Op({{
+                uint64_t ret;
+                bool recognized = PseudoInst::pseudoInst<X86PseudoInstABI>(
+                        xc->tcBase(), IMMEDIATE, ret);
+                if (!recognized)
+                    fault = std::make_shared<InvalidOpcode>();
+            }}, IsNonSpeculative);
             0x05: decode FullSystemInt {
                 0: SyscallInst::syscall({{
                     return std::make_shared<SESyscallFault>();
@@ -786,13 +701,12 @@
             //0x6: group15();
             0x6: decode MODRM_MOD {
                 0x3: decode MODRM_REG {
-                    0x5: BasicOperate::LFENCE(
-                                 {{/*Nothing*/}}, IsReadBarrier,
-                                 IsSerializeAfter);
-                    0x6: BasicOperate::MFENCE(
-                                 {{/*Nothing*/}}, IsMemBarrier);
-                    0x7: BasicOperate::SFENCE(
-                                 {{/*Nothing*/}}, IsWriteBarrier);
+                    0x5: BasicOperate::LFENCE({{/*Nothing*/}},
+                                              IsReadBarrier, IsSerializeAfter);
+                    0x6: BasicOperate::MFENCE({{/*Nothing*/}},
+                                              IsReadBarrier, IsWriteBarrier);
+                    0x7: BasicOperate::SFENCE({{/*Nothing*/}},
+                                              IsWriteBarrier);
                     default: Inst::UD2();
                 }
                 default: decode MODRM_REG {
diff --git a/src/arch/x86/isa/formats/monitor_mwait.isa b/src/arch/x86/isa/formats/monitor_mwait.isa
index 809623d..b5fe34c 100644
--- a/src/arch/x86/isa/formats/monitor_mwait.isa
+++ b/src/arch/x86/isa/formats/monitor_mwait.isa
@@ -90,7 +90,6 @@
                 OpClass __opClass) :
             X86ISA::X86StaticInst(_mnemonic, _machInst, __opClass)
         {
-            flags[IsMemRef] = 1;
             flags[IsLoad] = 1;
         }
 
diff --git a/src/arch/x86/isa/includes.isa b/src/arch/x86/isa/includes.isa
index 65b304f..d2098c3 100644
--- a/src/arch/x86/isa/includes.isa
+++ b/src/arch/x86/isa/includes.isa
@@ -112,6 +112,7 @@
 #include "arch/x86/cpuid.hh"
 #include "arch/x86/faults.hh"
 #include "arch/x86/memhelpers.hh"
+#include "arch/x86/pseudo_inst_abi.hh"
 #include "arch/x86/tlb.hh"
 #include "base/compiler.hh"
 #include "base/condcodes.hh"
diff --git a/src/arch/x86/isa/microops/regop.isa b/src/arch/x86/isa/microops/regop.isa
index 227d1cb..c00c6fa 100644
--- a/src/arch/x86/isa/microops/regop.isa
+++ b/src/arch/x86/isa/microops/regop.isa
@@ -49,7 +49,7 @@
             %(op_decl)s;
             %(op_rd)s;
 
-            RegVal result M5_VAR_USED;
+            M5_VAR_USED RegVal result;
 
             if(%(cond_check)s)
             {
@@ -79,7 +79,7 @@
             %(op_decl)s;
             %(op_rd)s;
 
-            RegVal result M5_VAR_USED;
+            M5_VAR_USED RegVal result;
 
             if(%(cond_check)s)
             {
diff --git a/src/arch/x86/isa/microops/specop.isa b/src/arch/x86/isa/microops/specop.isa
index a7dda10..1d50569 100644
--- a/src/arch/x86/isa/microops/specop.isa
+++ b/src/arch/x86/isa/microops/specop.isa
@@ -233,7 +233,8 @@
         def __init__(self):
             self.className = "Mfence"
             self.mnemonic = "mfence"
-            self.instFlags = "| (1ULL << StaticInst::IsMemBarrier)"
+            self.instFlags = "| (1ULL << StaticInst::IsReadBarrier)" + \
+                             "| (1ULL << StaticInst::IsWriteBarrier)"
 
         def getAllocator(self, microFlags):
             allocString = '''
diff --git a/src/arch/x86/isa/operands.isa b/src/arch/x86/isa/operands.isa
index 2cd92dd..504deb7 100644
--- a/src/arch/x86/isa/operands.isa
+++ b/src/arch/x86/isa/operands.isa
@@ -64,7 +64,7 @@
     def floatReg(idx, id):
         return ('FloatReg', 'df', idx, 'IsFloating', id)
     def ccReg(idx, id):
-        return ('CCReg', 'uqw', idx, 'IsCC', id)
+        return ('CCReg', 'uqw', idx, None, id)
     def controlReg(idx, id, ctype = 'uqw'):
         return ('ControlReg', ctype, idx,
                 (None, None, ['IsSerializeAfter',
@@ -147,20 +147,20 @@
         # would be retained, the write predicate checks if any of the bits
         # are being written.
 
-        'PredccFlagBits': ('CCReg', 'uqw', '(CCREG_ZAPS)', 'IsCC',
+        'PredccFlagBits': ('CCReg', 'uqw', '(CCREG_ZAPS)', None,
                 60, None, None, '''(((ext & (PFBit | AFBit | ZFBit | SFBit
                 )) != (PFBit | AFBit | ZFBit | SFBit )) &&
                 ((ext & (PFBit | AFBit | ZFBit | SFBit )) != 0))''',
                 '((ext & (PFBit | AFBit | ZFBit | SFBit )) != 0)'),
-        'PredcfofBits':   ('CCReg', 'uqw', '(CCREG_CFOF)', 'IsCC',
+        'PredcfofBits':   ('CCReg', 'uqw', '(CCREG_CFOF)', None,
                 61, None, None, '''(((ext & CFBit) == 0 ||
                 (ext & OFBit) == 0) && ((ext & (CFBit | OFBit)) != 0))''',
                 '((ext & (CFBit | OFBit)) != 0)'),
-        'PreddfBit':   ('CCReg', 'uqw', '(CCREG_DF)', 'IsCC',
+        'PreddfBit':   ('CCReg', 'uqw', '(CCREG_DF)', None,
                 62, None, None, '(false)', '((ext & DFBit) != 0)'),
-        'PredecfBit':   ('CCReg', 'uqw', '(CCREG_ECF)', 'IsCC',
+        'PredecfBit':   ('CCReg', 'uqw', '(CCREG_ECF)', None,
                 63, None, None, '(false)', '((ext & ECFBit) != 0)'),
-        'PredezfBit':   ('CCReg', 'uqw', '(CCREG_EZF)', 'IsCC',
+        'PredezfBit':   ('CCReg', 'uqw', '(CCREG_EZF)', None,
                 64, None, None, '(false)', '((ext & EZFBit) != 0)'),
 
         # These register should needs to be more protected so that later
@@ -207,5 +207,5 @@
         'TscOp':         controlReg('MISCREG_TSC', 212),
         'M5Reg':         squashCReg('MISCREG_M5_REG', 213),
         'Mem':           ('Mem', 'uqw', None, \
-                          ('IsMemRef', 'IsLoad', 'IsStore'), 300)
+                          (None, 'IsLoad', 'IsStore'), 300)
 }};
diff --git a/src/arch/x86/ldstflags.hh b/src/arch/x86/ldstflags.hh
index 950728f..e8ded0a 100644
--- a/src/arch/x86/ldstflags.hh
+++ b/src/arch/x86/ldstflags.hh
@@ -46,7 +46,7 @@
  */
 namespace X86ISA
 {
-    const Request::FlagsType M5_VAR_USED SegmentFlagMask = mask(4);
+    M5_VAR_USED const Request::FlagsType SegmentFlagMask = mask(4);
     const int FlagShift = 4;
     enum FlagBit {
         CPL0FlagBit = 1,
diff --git a/src/arch/x86/memhelpers.hh b/src/arch/x86/memhelpers.hh
index 9f54954..35dfac6 100644
--- a/src/arch/x86/memhelpers.hh
+++ b/src/arch/x86/memhelpers.hh
@@ -45,7 +45,8 @@
 initiateMemRead(ExecContext *xc, Trace::InstRecord *traceData, Addr addr,
                 unsigned dataSize, Request::Flags flags)
 {
-    return xc->initiateMemRead(addr, dataSize, flags);
+    const std::vector<bool> byte_enable(dataSize, true);
+    return xc->initiateMemRead(addr, dataSize, flags, byte_enable);
 }
 
 static void
@@ -106,7 +107,9 @@
               uint64_t &mem, unsigned dataSize, Request::Flags flags)
 {
     memset(&mem, 0, sizeof(mem));
-    Fault fault = xc->readMem(addr, (uint8_t *)&mem, dataSize, flags);
+    const std::vector<bool> byte_enable(dataSize, true);
+    Fault fault = xc->readMem(addr, (uint8_t *)&mem, dataSize,
+                              flags, byte_enable);
     if (fault == NoFault) {
         // If LE to LE, this is a nop, if LE to BE, the actual data ends up
         // in the right place because the LSBs where at the low addresses on
@@ -124,8 +127,11 @@
                     unsigned flags)
 {
     std::array<T, N> real_mem;
+    // Size is fixed at compilation time. Make a static vector.
+    constexpr auto size = sizeof(T) * N;
+    static const std::vector<bool> byte_enable(size, true);
     Fault fault = xc->readMem(addr, (uint8_t *)&real_mem,
-                              sizeof(T) * N, flags);
+                              size, flags, byte_enable);
     if (fault == NoFault) {
         real_mem = letoh(real_mem);
         for (int i = 0; i < N; i++)
@@ -166,8 +172,11 @@
     for (int i = 0; i < N; i++)
         real_mem[i] = mem[i];
     real_mem = htole(real_mem);
-    return xc->writeMem((uint8_t *)&real_mem, sizeof(T) * N,
-                        addr, flags, res);
+    // Size is fixed at compilation time. Make a static vector.
+    constexpr auto size = sizeof(T) * N;
+    static const std::vector<bool> byte_enable(size, true);
+    return xc->writeMem((uint8_t *)&real_mem, size,
+                        addr, flags, res, byte_enable);
 }
 
 static Fault
@@ -178,7 +187,9 @@
     if (traceData)
         traceData->setData(mem);
     mem = htole(mem);
-    return xc->writeMem((uint8_t *)&mem, dataSize, addr, flags, res);
+    const std::vector<bool> byte_enable(dataSize, true);
+    return xc->writeMem((uint8_t *)&mem, dataSize, addr, flags,
+                        res, byte_enable);
 }
 
 template <size_t N>
@@ -208,8 +219,9 @@
     if (traceData)
         traceData->setData(mem);
     uint64_t host_mem = htole(mem);
-    Fault fault =
-          xc->writeMem((uint8_t *)&host_mem, dataSize, addr, flags, res);
+    const std::vector<bool> byte_enable(dataSize, true);
+    Fault fault = xc->writeMem((uint8_t *)&host_mem, dataSize, addr,
+                               flags, res, byte_enable);
     if (fault == NoFault && res)
         *res = letoh(*res);
     return fault;
diff --git a/src/base/SConscript b/src/base/SConscript
index bd18429..e04d84a 100644
--- a/src/base/SConscript
+++ b/src/base/SConscript
@@ -73,15 +73,7 @@
 GTest('trie.test', 'trie.test.cc')
 Source('types.cc')
 GTest('types.test', 'types.test.cc', 'types.cc')
-
-Source('loader/dtb_file.cc')
-Source('loader/elf_object.cc')
-Source('loader/image_file_data.cc')
-GTest('loader/image_file_data.test', 'loader/image_file_data.test.cc',
-'loader/image_file_data.cc')
-Source('loader/memory_image.cc')
-Source('loader/object_file.cc')
-Source('loader/symtab.cc')
+GTest('uncontended_mutex.test', 'uncontended_mutex.test.cc')
 
 Source('stats/group.cc')
 Source('stats/text.cc')
diff --git a/src/base/bmpwriter.hh b/src/base/bmpwriter.hh
index 7917c15..9d61096 100644
--- a/src/base/bmpwriter.hh
+++ b/src/base/bmpwriter.hh
@@ -75,15 +75,15 @@
     void write(std::ostream &bmp) const override;
 
   private:
-    struct FileHeader {
+    struct M5_ATTR_PACKED FileHeader {
         unsigned char magic_number[2];
         uint32_t size;
         uint16_t reserved1;
         uint16_t reserved2;
         uint32_t offset;
-    } M5_ATTR_PACKED;
+    };
 
-    struct InfoHeaderV1 { /* Aka DIB header */
+    struct M5_ATTR_PACKED InfoHeaderV1 { /* Aka DIB header */
         uint32_t Size;
         uint32_t Width;
         uint32_t Height;
@@ -95,14 +95,14 @@
         uint32_t YPelsPerMeter;
         uint32_t ClrUsed;
         uint32_t ClrImportant;
-    } M5_ATTR_PACKED;
+    };
 
-    struct CompleteV1Header {
+    struct M5_ATTR_PACKED CompleteV1Header {
         FileHeader file;
         InfoHeaderV1 info;
-    } M5_ATTR_PACKED;
+    };
 
-    struct BmpPixel32 {
+    struct M5_ATTR_PACKED BmpPixel32 {
         BmpPixel32 &operator=(const Pixel &rhs) {
             red = rhs.red;
             green = rhs.green;
@@ -115,7 +115,7 @@
         uint8_t green;
         uint8_t red;
         uint8_t padding;
-    } M5_ATTR_PACKED;
+    };
 
     typedef BmpPixel32 PixelType;
 
diff --git a/src/base/chunk_generator.hh b/src/base/chunk_generator.hh
index 994d83a..4c749b2 100644
--- a/src/base/chunk_generator.hh
+++ b/src/base/chunk_generator.hh
@@ -26,8 +26,8 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __BASE__CHUNK_GENERATOR_HH__
-#define __BASE__CHUNK_GENERATOR_HH__
+#ifndef __BASE_CHUNK_GENERATOR_HH__
+#define __BASE_CHUNK_GENERATOR_HH__
 
 /**
  * @file
@@ -60,13 +60,13 @@
     /** The starting address of the next chunk (after the current one). */
     Addr nextAddr;
     /** The size of the current chunk (in bytes). */
-    unsigned  curSize;
+    Addr curSize;
     /** The number of bytes remaining in the region after the current chunk. */
-    unsigned  sizeLeft;
+    Addr sizeLeft;
     /** The start address so we can calculate offset in writing block. */
     const Addr startAddr;
     /** The maximum chunk size, e.g., the cache block size or page size. */
-    const unsigned chunkSize;
+    const Addr chunkSize;
 
   public:
     /**
@@ -78,8 +78,8 @@
      *
      * @ingroup api_chunk_generator
      */
-    ChunkGenerator(Addr _startAddr, unsigned totalSize, unsigned _chunkSize)
-        : startAddr(_startAddr), chunkSize(_chunkSize)
+    ChunkGenerator(Addr _startAddr, Addr totalSize, Addr _chunkSize) :
+        startAddr(_startAddr), chunkSize(_chunkSize)
     {
         // chunkSize must be a power of two
         assert(chunkSize == 0 || isPowerOf2(chunkSize));
@@ -87,13 +87,10 @@
         // set up initial chunk.
         curAddr = startAddr;
 
-        if (chunkSize == 0) //Special Case, if we see 0, assume no chuncking
-        {
+        if (chunkSize == 0) { // Special Case, if we see 0, assume no chunking.
             nextAddr = startAddr + totalSize;
-        }
-        else
-        {
-            // nextAddr should be *next* chunk start
+        } else {
+            // nextAddr should be *next* chunk start.
             nextAddr = roundUp(startAddr, chunkSize);
             if (curAddr == nextAddr) {
                 // ... even if startAddr is already chunk-aligned
@@ -101,8 +98,8 @@
             }
         }
 
-        // how many bytes are left between curAddr and the end of this chunk?
-        unsigned left_in_chunk = nextAddr - curAddr;
+        // How many bytes are left between curAddr and the end of this chunk?
+        Addr left_in_chunk = nextAddr - curAddr;
         curSize = std::min(totalSize, left_in_chunk);
         sizeLeft = totalSize - curSize;
     }
@@ -118,14 +115,14 @@
      *
      * @ingroup api_chunk_generator
      */
-    unsigned size() const { return curSize; }
+    Addr size() const { return curSize; }
 
     /**
      * Number of bytes we have already chunked up.
      *
      * @ingroup api_chunk_generator
      */
-    unsigned complete() const { return curAddr - startAddr; }
+    Addr complete() const { return curAddr - startAddr; }
 
     /**
      * Are we done?  That is, did the last call to next() advance
@@ -134,7 +131,7 @@
      *
      * @ingroup api_chunk_generator
      */
-    bool done() const { return (curSize == 0); }
+    bool done() const { return curSize == 0; }
 
     /**
      * Is this the last chunk?
@@ -142,7 +139,7 @@
      *
      * @ingroup api_chunk_generator
      */
-    bool last() const { return (sizeLeft == 0); }
+    bool last() const { return sizeLeft == 0; }
 
     /**
      * Advance generator to next chunk.
@@ -154,7 +151,7 @@
     bool
     next()
     {
-        if (sizeLeft == 0) {
+        if (last()) {
             curSize = 0;
             return false;
         }
@@ -167,4 +164,4 @@
     }
 };
 
-#endif // __BASE__CHUNK_GENERATOR_HH__
+#endif // __BASE_CHUNK_GENERATOR_HH__
diff --git a/src/base/compiler.hh b/src/base/compiler.hh
index 38736ca..ee9a224 100644
--- a/src/base/compiler.hh
+++ b/src/base/compiler.hh
@@ -45,69 +45,77 @@
 
 // http://gcc.gnu.org/onlinedocs/gcc/Function-Attributes.html
 
-#if defined(__GNUC__) // clang or gcc
-#  define M5_ATTR_NORETURN  __attribute__((noreturn))
-#  define M5_DUMMY_RETURN
-#  define M5_VAR_USED __attribute__((unused))
-#  define M5_ATTR_PACKED __attribute__ ((__packed__))
-#  define M5_NO_INLINE __attribute__ ((__noinline__))
-#  define M5_DEPRECATED __attribute__((deprecated))
-#  define M5_DEPRECATED_MSG(MSG) __attribute__((deprecated(MSG)))
-#  define M5_UNREACHABLE __builtin_unreachable()
-#  define M5_PUBLIC __attribute__ ((visibility ("default")))
-#  define M5_LOCAL __attribute__ ((visibility ("hidden")))
+
+/*
+ * Attributes that become standard in later versions of c++.
+ */
+
+// Use M5_FALLTHROUGH to mark when you're intentionally falling through from
+// one case to another in a switch statement.
+#if __has_cpp_attribute(fallthrough) // Standard in c++17.
+#  define M5_FALLTHROUGH [[fallthrough]]
+#else
+// Not supported, so it's not necessary to avoid warnings.
+#  define M5_FALLTHROUGH
 #endif
 
-#if defined(__clang__)
+// When the return value of a function should not be discarded, mark it with
+// M5_NODISCARD.
+#if __has_cpp_attribute(nodiscard) // Standard in c++17, with message in c++20.
+#  define M5_NODISCARD [[nodiscard]]
+#else
+// Not supported, but it's optional so we can just omit it.
+#  define M5_NODISCARD
+#endif
+
+// When a variable may purposefully not be used, for instance if it's only used
+// in debug statements which might be disabled, mark it with M5_VAR_USED.
+#if __has_cpp_attribute(maybe_unused) // Standard in c++17.
+#  define M5_VAR_USED [[maybe_unused]]
+#elif defined(__GNUC__)
+// gcc and clang support a custom attribute which is essentially the same
+// thing.
+#  define M5_VAR_USED [[gnu::unused]]
+#endif
+
+
+/*
+ * Compiler specific features.
+ */
+
+#if defined(__GNUC__) // clang or gcc.
+// Mark a structure as packed, so that no padding is added to its layout. This
+// padding might be added to, for instance, ensure certain fields have certain
+// alignment.
+#  define M5_ATTR_PACKED [[gnu::packed]]
+
+// Prevent a function from being inlined.
+#  define M5_NO_INLINE [[gnu::noinline]]
+
+// Set the visibility of a symbol.
+#  define M5_PUBLIC [[gnu::visibility("default")]]
+#  define M5_LOCAL [[gnu::visibility("hidden")]]
+
+// Marker for what should be an unreachable point in the code.
+#  define M5_UNREACHABLE __builtin_unreachable()
+
+// To mark a branch condition as likely taken, wrap its condition with
+// M5_LIKELY. To mark it as likely not taken, wrap its condition with
+// M5_UNLIKELY. These can be replaced with the standard attributes [[likely]]
+// and [[unlikely]] in c++20, although the syntax is different enough that
+// we can't do that with direct substitution.
+#  define M5_LIKELY(cond) __builtin_expect(!!(cond), 1)
+#  define M5_UNLIKELY(cond) __builtin_expect(!!(cond), 0)
+#endif
+
+// When a member variable may be unused, mark it with M5_CLASS_VAR_USED. This
+// needs to be limited to clang only since clang warns on these unused
+// variables, and g++ will actually warn if you use this attribute since it
+// won't do anything there.
+#if defined(__clang__) // clang only.
 #  define M5_CLASS_VAR_USED M5_VAR_USED
 #else
 #  define M5_CLASS_VAR_USED
 #endif
 
-// This can be removed once all compilers support C++17
-#if defined __has_cpp_attribute
-    // Note: We must separate this if statement because GCC < 5.0 doesn't
-    //       support the function-like syntax in #if statements.
-    #if __has_cpp_attribute(fallthrough)
-        #define M5_FALLTHROUGH [[fallthrough]]
-    #else
-        #define M5_FALLTHROUGH
-    #endif
-
-    #if __has_cpp_attribute(nodiscard)
-        #define M5_NODISCARD [[nodiscard]]
-    #else
-        #define M5_NODISCARD
-    #endif
-#else
-    // Unsupported (and no warning) on GCC < 7.
-    #define M5_FALLTHROUGH
-
-    #define M5_NODISCARD
-#endif
-
-// std::make_unique redefined for C++11 compilers
-namespace m5
-{
-
-#if __cplusplus >= 201402L // C++14
-
-using std::make_unique;
-
-#else // C++11
-
-/** Defining custom version of make_unique: m5::make_unique<>() */
-template<typename T, typename... Args>
-std::unique_ptr<T>
-make_unique( Args&&... constructor_args )
-{
-    return std::unique_ptr<T>(
-               new T( std::forward<Args>(constructor_args)... )
-           );
-}
-
-#endif // __cplusplus >= 201402L
-
-} //namespace m5
-
 #endif // __BASE_COMPILER_HH__
diff --git a/src/base/fiber.cc b/src/base/fiber.cc
index 3d2e2e9..fe1bad0 100644
--- a/src/base/fiber.cc
+++ b/src/base/fiber.cc
@@ -145,10 +145,12 @@
 
     setStarted();
 
-    // Swap back to the parent context which is still considered "current",
-    // now that we're ready to go.
-    int ret M5_VAR_USED = swapcontext(&ctx, &_currentFiber->ctx);
-    panic_if(ret == -1, strerror(errno));
+    if (_setjmp(jmp) == 0) {
+        // Swap back to the parent context which is still considered "current",
+        // now that we're ready to go.
+        int ret = swapcontext(&ctx, &_currentFiber->ctx);
+        panic_if(ret == -1, strerror(errno));
+    }
 
     // Call main() when we're been reactivated for the first time.
     main();
@@ -175,7 +177,8 @@
     Fiber *prev = _currentFiber;
     Fiber *next = this;
     _currentFiber = next;
-    swapcontext(&prev->ctx, &next->ctx);
+    if (_setjmp(prev->jmp) == 0)
+        _longjmp(next->jmp, 1);
 }
 
 Fiber *Fiber::currentFiber() { return _currentFiber; }
diff --git a/src/base/fiber.hh b/src/base/fiber.hh
index dc7ef01..be8937f 100644
--- a/src/base/fiber.hh
+++ b/src/base/fiber.hh
@@ -39,6 +39,12 @@
 #include <ucontext.h>
 #endif
 
+// Avoid fortify source for longjmp to work between ucontext stacks.
+#pragma push_macro("__USE_FORTIFY_LEVEL")
+#undef __USE_FORTIFY_LEVEL
+#include <setjmp.h>
+#pragma pop_macro("__USE_FORTIFY_LEVEL")
+
 #include <cstddef>
 #include <cstdint>
 
@@ -137,6 +143,10 @@
     void start();
 
     ucontext_t ctx;
+    // swapcontext() is slow because it also saves and restores the signal
+    // mask. Use _setjmp/_longjmp, which skip that step, to switch faster.
+    jmp_buf jmp;
+
     Fiber *link;
 
     // The stack for this context, or a nullptr if allocated elsewhere.
diff --git a/src/base/loader/SConscript b/src/base/loader/SConscript
new file mode 100644
index 0000000..d17875f
--- /dev/null
+++ b/src/base/loader/SConscript
@@ -0,0 +1,37 @@
+# -*- mode:python -*-
+
+# Copyright (c) 2006 The Regents of The University of Michigan
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Import('*')
+
+Source('dtb_file.cc')
+Source('elf_object.cc')
+Source('image_file_data.cc')
+GTest('image_file_data.test', 'image_file_data.test.cc', 'image_file_data.cc')
+Source('memory_image.cc')
+Source('object_file.cc')
+Source('symtab.cc')
diff --git a/src/base/loader/elf_object.cc b/src/base/loader/elf_object.cc
index 49fbd6d..bdcc92b 100644
--- a/src/base/loader/elf_object.cc
+++ b/src/base/loader/elf_object.cc
@@ -141,7 +141,7 @@
             "No loadable segments in '%s'. ELF file corrupted?\n",
             imageData->filename());
 
-    for (auto M5_VAR_USED &seg: image.segments())
+    for (M5_VAR_USED auto &seg: image.segments())
         DPRINTFR(Loader, "%s\n", seg);
 
     // We will actually read the sections when we need to load them
@@ -328,6 +328,11 @@
 {
     auto name = std::to_string(seg_num);
 
+    if (phdr.p_memsz == 0) {
+        warn("Ignoring empty loadable segment %s", name);
+        return;
+    }
+
     image.addSegment({ name, phdr.p_paddr, imageData,
                        phdr.p_offset, phdr.p_filesz });
     Addr uninitialized = phdr.p_memsz - phdr.p_filesz;
diff --git a/src/base/loader/object_file.cc b/src/base/loader/object_file.cc
index 12e5606..6fdf228 100644
--- a/src/base/loader/object_file.cc
+++ b/src/base/loader/object_file.cc
@@ -38,6 +38,60 @@
 
 ObjectFile::ObjectFile(ImageFileDataPtr ifd) : ImageFile(ifd) {}
 
+const char *
+archToString(Arch arch)
+{
+    switch (arch) {
+      case UnknownArch:
+        return "unknown";
+      case SPARC64:
+        return "sparc64";
+      case SPARC32:
+        return "sparc32";
+      case Mips:
+        return "mips";
+      case X86_64:
+        return "x86_64";
+      case I386:
+        return "i386";
+      case Arm64:
+        return "arm64";
+      case Arm:
+        return "arm";
+      case Thumb:
+        return "thumb";
+      case Power:
+        return "power";
+      case Riscv64:
+        return "riscv64";
+      case Riscv32:
+        return "riscv32";
+      default:
+        panic("Unrecognized arch %d.", arch);
+    }
+}
+
+const char *
+opSysToString(OpSys op_sys)
+{
+    switch (op_sys) {
+      case UnknownOpSys:
+        return "unknown";
+      case Tru64:
+        return "tru64";
+      case Linux:
+        return "linux";
+      case Solaris:
+        return "solaris";
+      case LinuxArmOABI:
+        return "linux_arm_OABI";
+      case FreeBSD:
+        return "freebsd";
+      default:
+        panic("Unrecognized operating system %d.", op_sys);
+    }
+}
+
 namespace
 {
 
diff --git a/src/base/loader/object_file.hh b/src/base/loader/object_file.hh
index 9ff9997..0bfd918 100644
--- a/src/base/loader/object_file.hh
+++ b/src/base/loader/object_file.hh
@@ -56,6 +56,8 @@
     Riscv32
 };
 
+const char *archToString(Arch arch);
+
 enum OpSys {
     UnknownOpSys,
     Tru64,
@@ -65,6 +67,8 @@
     FreeBSD
 };
 
+const char *opSysToString(OpSys op_sys);
+
 class SymbolTable;
 
 class ObjectFile : public ImageFile
diff --git a/src/base/logging.hh b/src/base/logging.hh
index 7113af8..29a9563 100644
--- a/src/base/logging.hh
+++ b/src/base/logging.hh
@@ -121,7 +121,7 @@
      * functions, and gcc will get mad if a function calls panic and then
      * doesn't return.
      */
-    void exit_helper() M5_ATTR_NORETURN { exit(); ::abort(); }
+    [[noreturn]] void exit_helper() { exit(); ::abort(); }
 
   protected:
     bool enabled;
@@ -196,7 +196,7 @@
  */
 #define panic_if(cond, ...)                                  \
     do {                                                     \
-        if ((cond)) {                                        \
+        if (M5_UNLIKELY(cond)) {                             \
             panic("panic condition " # cond " occurred: %s", \
                   csprintf(__VA_ARGS__));                    \
         }                                                    \
@@ -218,7 +218,7 @@
  */
 #define fatal_if(cond, ...)                                     \
     do {                                                        \
-        if ((cond)) {                                           \
+        if (M5_UNLIKELY(cond)) {                                \
             fatal("fatal condition " # cond " occurred: %s",    \
                   csprintf(__VA_ARGS__));                       \
         }                                                       \
@@ -262,13 +262,13 @@
  */
 #define warn_if(cond, ...) \
     do { \
-        if ((cond)) \
+        if (M5_UNLIKELY(cond)) \
             warn(__VA_ARGS__); \
     } while (0)
 
 #define warn_if_once(cond, ...) \
     do { \
-        if ((cond)) \
+        if (M5_UNLIKELY(cond)) \
             warn_once(__VA_ARGS__); \
     } while (0)
 /** @} */ // end of api_logger
@@ -291,7 +291,7 @@
 #else //!NDEBUG
 #define chatty_assert(cond, ...)                                        \
     do {                                                                \
-        if (!(cond))                                                    \
+        if (M5_UNLIKELY(!(cond)))                                       \
             panic("assert(" # cond ") failed: %s", csprintf(__VA_ARGS__)); \
     } while (0)
 #endif // NDEBUG
diff --git a/src/base/pngwriter.hh b/src/base/pngwriter.hh
index 2c53ec1..f7b0a2e 100644
--- a/src/base/pngwriter.hh
+++ b/src/base/pngwriter.hh
@@ -76,7 +76,7 @@
     void write(std::ostream &png) const override;
   private:
     /** Png Pixel type: not containing padding */
-    struct PngPixel24 {
+    struct M5_ATTR_PACKED PngPixel24 {
         PngPixel24 &operator=(const Pixel &rhs) {
             red = rhs.red;
             green = rhs.green;
@@ -87,7 +87,7 @@
         uint8_t red;
         uint8_t green;
         uint8_t blue;
-    } M5_ATTR_PACKED;
+    };
 
     /**
      * Handle to resources used by libpng:
diff --git a/src/base/stats/group.cc b/src/base/stats/group.cc
index 06eaa46..a76ad4f 100644
--- a/src/base/stats/group.cc
+++ b/src/base/stats/group.cc
@@ -68,7 +68,7 @@
 
     for (auto &g : statGroups) {
         if (DTRACE(Stats)) {
-            const SimObject M5_VAR_USED *so =
+            M5_VAR_USED const SimObject *so =
                 dynamic_cast<const SimObject *>(this);
             DPRINTF(Stats, "%s: regStats in group %s\n",
                     so ? so->name() : "?",
diff --git a/src/base/trace.hh b/src/base/trace.hh
index 3d8752c..aafb9c8 100644
--- a/src/base/trace.hh
+++ b/src/base/trace.hh
@@ -182,14 +182,14 @@
 
 #define DDUMP(x, data, count) do {               \
     using namespace Debug;                       \
-    if (DTRACE(x))                               \
+    if (M5_UNLIKELY(DTRACE(x)))                  \
         Trace::getDebugLogger()->dump(           \
             curTick(), name(), data, count, #x); \
 } while (0)
 
 #define DPRINTF(x, ...) do {                     \
     using namespace Debug;                       \
-    if (DTRACE(x)) {                             \
+    if (M5_UNLIKELY(DTRACE(x))) {                \
         Trace::getDebugLogger()->dprintf_flag(   \
             curTick(), name(), #x, __VA_ARGS__); \
     }                                            \
@@ -197,7 +197,7 @@
 
 #define DPRINTFS(x, s, ...) do {                        \
     using namespace Debug;                              \
-    if (DTRACE(x)) {                                    \
+    if (M5_UNLIKELY(DTRACE(x))) {                       \
         Trace::getDebugLogger()->dprintf_flag(          \
                 curTick(), s->name(), #x, __VA_ARGS__); \
     }                                                   \
@@ -205,7 +205,7 @@
 
 #define DPRINTFR(x, ...) do {                          \
     using namespace Debug;                             \
-    if (DTRACE(x)) {                                   \
+    if (M5_UNLIKELY(DTRACE(x))) {                      \
         Trace::getDebugLogger()->dprintf_flag(         \
             (Tick)-1, std::string(), #x, __VA_ARGS__); \
     }                                                  \
diff --git a/src/base/uncontended_mutex.hh b/src/base/uncontended_mutex.hh
new file mode 100644
index 0000000..721712f
--- /dev/null
+++ b/src/base/uncontended_mutex.hh
@@ -0,0 +1,118 @@
+/*
+ * Copyright 2020 Google, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __BASE_UNCONTENDED_MUTEX_HH__
+#define __BASE_UNCONTENDED_MUTEX_HH__
+
+#include <atomic>
+#include <condition_variable>
+#include <mutex>
+
+/*
+ * The std::mutex implementation is slower than expected because of its many
+ * mode checks and legacy support.
+ *
+ * The UncontendedMutex uses an atomic flag to check if we really need to
+ * obtain a mutex lock. For most cases without multi-threaded event queues,
+ * e.g. non-KVM simulation, this avoids the usage of the mutex and speeds up
+ * the simulation.
+ */
+class UncontendedMutex
+{
+  private:
+    /*
+     * A flag to record the current status:
+     * 0: no one has the lock
+     * 1: exactly one thread has the lock
+     * >1: one or more threads are waiting for the lock
+     */
+    std::atomic<int> flag;
+    std::mutex m;
+    std::condition_variable cv;
+
+    bool
+    testAndSet(int expected, int desired)
+    {
+        return flag.compare_exchange_strong(expected, desired);
+    }
+
+  public:
+    UncontendedMutex() : flag(0) {}
+
+    void
+    lock()
+    {
+        /*
+         * Here we use 'flag' to check if we are the first thread to get the
+         * lock. If not, we try to obtain the real mutex, and use the condition
+         * variable to wait for the thread that holds the lock to release it.
+         *
+         * The flag will be updated to more than 1, so the thread holding the
+         * lock knows that there is another thread waiting for the lock.
+         */
+        while (!testAndSet(0, 1)) {
+            std::unique_lock<std::mutex> ul(m);
+            /*
+             * It is possible that just before we obtain the mutex lock, the
+             * first thread releases the flag and thus flag becomes zero. In
+             * that case, we shouldn't wait on the condition variable because
+             * there is no other thread to notify us.
+             */
+            if (flag++ == 0)
+                break;
+            cv.wait(ul);
+        }
+    }
+
+    void
+    unlock()
+    {
+        /* In case there are no other threads waiting, we will just clear the
+         * flag and return.
+         */
+        if (testAndSet(1, 0))
+            return;
+
+        /*
+         * Otherwise, clear the flag and notify all the waiting threads. We
+         * need to clear the flag under the mutex here so that no thread can
+         * start waiting after the flag has already been set to 0.
+         */
+        {
+            std::lock_guard<std::mutex> g(m);
+            flag = 0;
+        }
+        /*
+         * It's possible to update the algorithm and use notify_one() here.
+         * However, tests show that notify_one() is much slower than
+         * notify_all() in this case. Here we choose to use notify_all().
+         */
+        cv.notify_all();
+    }
+};
+
+#endif // __BASE_UNCONTENDED_MUTEX_HH__
diff --git a/src/base/uncontended_mutex.test.cc b/src/base/uncontended_mutex.test.cc
new file mode 100644
index 0000000..6ce929b
--- /dev/null
+++ b/src/base/uncontended_mutex.test.cc
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2020 Google, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <gtest/gtest.h>
+#include <mutex>
+#include <thread>
+#include <vector>
+
+#include "base/uncontended_mutex.hh"
+
+TEST(UncontendedMutex, Lock)
+{
+    int data = 0;
+    UncontendedMutex m;
+
+    std::thread t1([&] () {
+        std::lock_guard<UncontendedMutex> g(m);
+        // Simulate += operation with a racing change between read and write.
+        int tmp = data;
+        std::this_thread::sleep_for(std::chrono::milliseconds(200));
+        data = tmp + 1;
+    });
+
+    std::thread t2([&] () {
+        std::this_thread::sleep_for(std::chrono::milliseconds(100));
+        std::lock_guard<UncontendedMutex> g(m);
+        data = data + 1;
+    });
+
+    std::thread t3([&] () {
+        std::this_thread::sleep_for(std::chrono::milliseconds(100));
+        std::lock_guard<UncontendedMutex> g(m);
+        data = data + 1;
+    });
+    t1.join();
+    t2.join();
+    t3.join();
+
+    EXPECT_EQ(data, 3);
+}
+
+TEST(UncontendedMutex, HeavyContention)
+{
+    int num_of_iter = 1000;
+    int num_of_thread = 1000;
+    std::vector<std::thread> threads;
+
+    int data = 0;
+    UncontendedMutex m;
+
+    for (int t = 0 ; t < num_of_thread; ++t) {
+        threads.emplace_back([&] () {
+            for (int k = 0; k < num_of_iter; ++k) {
+                std::lock_guard<UncontendedMutex> g(m);
+                data++;
+            }
+        });
+    }
+
+    for (auto& t : threads) {
+        t.join();
+    }
+    EXPECT_EQ(data, num_of_iter * num_of_thread);
+}
diff --git a/src/base/vnc/vncinput.hh b/src/base/vnc/vncinput.hh
index 6e7bb98..95c4aab 100644
--- a/src/base/vnc/vncinput.hh
+++ b/src/base/vnc/vncinput.hh
@@ -96,7 +96,7 @@
         ClientCutText           = 6
     };
 
-    struct PixelFormat {
+    struct M5_ATTR_PACKED PixelFormat {
         uint8_t bpp;
         uint8_t depth;
         uint8_t bigendian;
@@ -108,48 +108,48 @@
         uint8_t greenshift;
         uint8_t blueshift;
         uint8_t padding[3];
-    } M5_ATTR_PACKED;
+    };
 
-    struct PixelFormatMessage {
+    struct M5_ATTR_PACKED PixelFormatMessage {
         uint8_t type;
         uint8_t padding[3];
         PixelFormat px;
-    } M5_ATTR_PACKED;
+    };
 
-    struct PixelEncodingsMessage {
+    struct M5_ATTR_PACKED PixelEncodingsMessage {
         uint8_t type;
         uint8_t padding;
         uint16_t num_encodings;
-    } M5_ATTR_PACKED;
+    };
 
-    struct FrameBufferUpdateReq {
+    struct M5_ATTR_PACKED FrameBufferUpdateReq {
         uint8_t type;
         uint8_t incremental;
         uint16_t x;
         uint16_t y;
         uint16_t width;
         uint16_t height;
-    } M5_ATTR_PACKED;
+    };
 
-    struct KeyEventMessage {
+    struct M5_ATTR_PACKED KeyEventMessage {
         uint8_t type;
         uint8_t down_flag;
         uint8_t padding[2];
         uint32_t key;
-    } M5_ATTR_PACKED;
+    };
 
-    struct PointerEventMessage {
+    struct M5_ATTR_PACKED PointerEventMessage {
         uint8_t type;
         uint8_t button_mask;
         uint16_t x;
         uint16_t y;
-    } M5_ATTR_PACKED;
+    };
 
-    struct ClientCutTextMessage {
+    struct M5_ATTR_PACKED ClientCutTextMessage {
         uint8_t type;
         uint8_t padding[3];
         uint32_t length;
-    } M5_ATTR_PACKED;
+    };
 
     typedef VncInputParams Params;
     VncInput(const Params *p);
diff --git a/src/base/vnc/vncserver.cc b/src/base/vnc/vncserver.cc
index dbf4d9c..2b34162 100644
--- a/src/base/vnc/vncserver.cc
+++ b/src/base/vnc/vncserver.cc
@@ -378,7 +378,7 @@
 {
     assert(curState == WaitForProtocolVersion);
 
-    size_t len M5_VAR_USED;
+    M5_VAR_USED size_t len;
     char version_string[13];
 
     // Null terminate the message so it's easier to work with
diff --git a/src/base/vnc/vncserver.hh b/src/base/vnc/vncserver.hh
index 929379d..c639af9 100644
--- a/src/base/vnc/vncserver.hh
+++ b/src/base/vnc/vncserver.hh
@@ -106,33 +106,33 @@
         NormalPhase
     };
 
-    struct ServerInitMsg {
+    struct M5_ATTR_PACKED ServerInitMsg {
         uint16_t fbWidth;
         uint16_t fbHeight;
         PixelFormat px;
         uint32_t namelen;
         char name[2]; // just to put M5 in here
-    } M5_ATTR_PACKED;
+    };
 
-    struct FrameBufferUpdate {
+    struct M5_ATTR_PACKED FrameBufferUpdate {
         uint8_t type;
         uint8_t padding;
         uint16_t num_rects;
-    } M5_ATTR_PACKED;
+    };
 
-    struct FrameBufferRect {
+    struct M5_ATTR_PACKED FrameBufferRect {
         uint16_t x;
         uint16_t y;
         uint16_t width;
         uint16_t height;
         int32_t encoding;
-    } M5_ATTR_PACKED;
+    };
 
-    struct ServerCutText {
+    struct M5_ATTR_PACKED ServerCutText {
         uint8_t type;
         uint8_t padding[3];
         uint32_t length;
-    } M5_ATTR_PACKED;
+    };
 
     /** @} */
 
diff --git a/src/cpu/StaticInstFlags.py b/src/cpu/StaticInstFlags.py
index 1c2b63a..4775289 100644
--- a/src/cpu/StaticInstFlags.py
+++ b/src/cpu/StaticInstFlags.py
@@ -36,15 +36,9 @@
 # one of these two flags set, it is possible for an instruction to have
 # neither (e.g., direct unconditional branches, memory barriers) or both
 # (e.g., an FP/int conversion).
-# - If IsMemRef is set, then exactly one of IsLoad or IsStore will be set.
 # - If IsControl is set, then exactly one of IsDirectControl or IsIndirect
 # Control will be set, and exactly one of IsCondControl or IsUncondControl
 # will be set.
-# - IsSerializing, IsMemBarrier, and IsWriteBarrier are implemented as flags
-# since in the current model there's no other way for instructions to inject
-# behavior into the pipeline outside of fetch.  Once we go to an exec-in-exec
-# CPU model we should be able to get rid of these flags and implement this
-# behavior via the execute() methods.
 
 class StaticInstFlags(Enum):
     wrapper_name = 'StaticInstFlags'
@@ -56,17 +50,13 @@
 
         'IsInteger',        # References integer regs.
         'IsFloating',       # References FP regs.
-        'IsCC',             # References CC regs.
         'IsVector',         # References Vector regs.
         'IsVectorElem',     # References Vector reg elems.
 
-        'IsMemRef',         # References memory (load, store, or prefetch)
         'IsLoad',           # Reads from memory (load or prefetch).
         'IsStore',          # Writes to memory.
         'IsAtomic',         # Does atomic RMW to memory.
         'IsStoreConditional',   # Store conditional instruction.
-        'IsIndexed',        # Accesses memory with an indexed address
-                            # computation
         'IsInstPrefetch',   # Instruction-cache prefetch.
         'IsDataPrefetch',   # Data-cache prefetch.
 
@@ -78,23 +68,16 @@
         'IsCall',           # Subroutine call.
         'IsReturn',         # Subroutine return.
 
-        'IsCondDelaySlot',  # Conditional Delay-Slot Instruction
-
-        'IsThreadSync',     # Thread synchronization operation.
-
         'IsSerializing',    # Serializes pipeline: won't execute until all
                             # older instructions have committed.
         'IsSerializeBefore',
         'IsSerializeAfter',
-        'IsMemBarrier',     # Is a memory barrier
         'IsWriteBarrier',   # Is a write barrier
         'IsReadBarrier',    # Is a read barrier
-        'IsERET',           # <- Causes the IFU to stall (MIPS ISA)
 
         'IsNonSpeculative', # Should not be executed speculatively
         'IsQuiesce',        # Is a quiesce instruction
 
-        'IsIprAccess',      # Accesses IPRs
         'IsUnverifiable',   # Can't be verified by a checker
 
         'IsSyscall',        # Causes a system call to be emulated in syscall
@@ -106,11 +89,9 @@
         'IsDelayedCommit',  # This microop doesn't commit right away
         'IsLastMicroop',    # This microop ends a microop sequence
         'IsFirstMicroop',   # This microop begins a microop sequence
-        # This flag doesn't do anything yet
-        'IsMicroBranch',    # This microop branches within the microcode for
-                            # a macroop
-        'IsDspOp',
+
         'IsSquashAfter',     # Squash all uncommitted state after executed
+
         # hardware transactional memory
         'IsHtmStart',       # Starts a HTM transaction
         'IsHtmStop',        # Stops (commits) a HTM transaction
diff --git a/src/cpu/base.cc b/src/cpu/base.cc
index 9ba1b31..ef843d7 100644
--- a/src/cpu/base.cc
+++ b/src/cpu/base.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2012,2016-2017, 2019 ARM Limited
+ * Copyright (c) 2011-2012,2016-2017, 2019-2020 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -63,6 +63,7 @@
 #include "sim/clocked_object.hh"
 #include "sim/full_system.hh"
 #include "sim/process.hh"
+#include "sim/root.hh"
 #include "sim/sim_events.hh"
 #include "sim/sim_exit.hh"
 #include "sim/system.hh"
@@ -72,6 +73,8 @@
 
 using namespace std;
 
+std::unique_ptr<BaseCPU::GlobalStats> BaseCPU::globalStats;
+
 vector<BaseCPU *> BaseCPU::cpuList;
 
 // This variable reflects the max number of threads in any CPU.  Be
@@ -370,6 +373,12 @@
 {
     ClockedObject::regStats();
 
+    if (!globalStats) {
+        /* We need to construct the global CPU stat structure here
+         * since it needs a pointer to the Root object. */
+        globalStats.reset(new GlobalStats(Root::root()));
+    }
+
     using namespace Stats;
 
     numCycles
@@ -754,3 +763,39 @@
 {
     return params()->wait_for_remote_gdb;
 }
+
+
+BaseCPU::GlobalStats::GlobalStats(::Stats::Group *parent)
+    : ::Stats::Group(parent),
+    simInsts(this, "sim_insts", "Number of instructions simulated"),
+    simOps(this, "sim_ops", "Number of ops (including micro ops) simulated"),
+    hostInstRate(this, "host_inst_rate",
+                 "Simulator instruction rate (inst/s)"),
+    hostOpRate(this, "host_op_rate",
+               "Simulator op (including micro ops) rate (op/s)")
+{
+    simInsts
+        .functor(BaseCPU::numSimulatedInsts)
+        .precision(0)
+        .prereq(simInsts)
+        ;
+
+    simOps
+        .functor(BaseCPU::numSimulatedOps)
+        .precision(0)
+        .prereq(simOps)
+        ;
+
+    hostInstRate
+        .precision(0)
+        .prereq(simInsts)
+        ;
+
+    hostOpRate
+        .precision(0)
+        .prereq(simOps)
+        ;
+
+    hostInstRate = simInsts / hostSeconds;
+    hostOpRate = simOps / hostSeconds;
+}
diff --git a/src/cpu/base.hh b/src/cpu/base.hh
index 5320492..9cf4baa 100644
--- a/src/cpu/base.hh
+++ b/src/cpu/base.hh
@@ -48,7 +48,7 @@
 // and if so stop here
 #include "config/the_isa.hh"
 #if THE_ISA == NULL_ISA
-#include "arch/null/cpu_dummy.hh"
+#error Including BaseCPU in a system without CPU support
 #else
 #include "arch/generic/interrupts.hh"
 #include "base/statistics.hh"
@@ -145,6 +145,23 @@
     /** Cache the cache line size that we get from the system */
     const unsigned int _cacheLineSize;
 
+    /** Global CPU statistics that are merged into the Root object. */
+    struct GlobalStats : public Stats::Group {
+        GlobalStats(::Stats::Group *parent);
+
+        ::Stats::Value simInsts;
+        ::Stats::Value simOps;
+
+        ::Stats::Formula hostInstRate;
+        ::Stats::Formula hostOpRate;
+    };
+
+    /**
+     * Pointer to the global stat structure. This needs to be
+     * constructed from regStats since we merge it into the root
+     * group. */
+    static std::unique_ptr<GlobalStats> globalStats;
+
   public:
 
     /**
diff --git a/src/cpu/base_dyn_inst.hh b/src/cpu/base_dyn_inst.hh
index a6c08cc..194d77b 100644
--- a/src/cpu/base_dyn_inst.hh
+++ b/src/cpu/base_dyn_inst.hh
@@ -308,14 +308,13 @@
     }
 
     Fault initiateMemRead(Addr addr, unsigned size, Request::Flags flags,
-            const std::vector<bool> &byte_enable=std::vector<bool>()) override;
+            const std::vector<bool> &byte_enable) override;
 
     Fault initiateHtmCmd(Request::Flags flags) override;
 
     Fault writeMem(uint8_t *data, unsigned size, Addr addr,
                    Request::Flags flags, uint64_t *res,
-                   const std::vector<bool> &byte_enable=std::vector<bool>())
-                   override;
+                   const std::vector<bool> &byte_enable) override;
 
     Fault initiateMemAMO(Addr addr, unsigned size, Request::Flags flags,
                          AtomicOpFunctorPtr amo_op) override;
@@ -541,8 +540,6 @@
     bool isIndirectCtrl() const { return staticInst->isIndirectCtrl(); }
     bool isCondCtrl()     const { return staticInst->isCondCtrl(); }
     bool isUncondCtrl()   const { return staticInst->isUncondCtrl(); }
-    bool isCondDelaySlot() const { return staticInst->isCondDelaySlot(); }
-    bool isThreadSync()   const { return staticInst->isThreadSync(); }
     bool isSerializing()  const { return staticInst->isSerializing(); }
     bool
     isSerializeBefore() const
@@ -555,11 +552,11 @@
         return staticInst->isSerializeAfter() || status[SerializeAfter];
     }
     bool isSquashAfter() const { return staticInst->isSquashAfter(); }
-    bool isMemBarrier()   const { return staticInst->isMemBarrier(); }
+    bool isFullMemBarrier()   const { return staticInst->isFullMemBarrier(); }
+    bool isReadBarrier() const { return staticInst->isReadBarrier(); }
     bool isWriteBarrier() const { return staticInst->isWriteBarrier(); }
     bool isNonSpeculative() const { return staticInst->isNonSpeculative(); }
     bool isQuiesce() const { return staticInst->isQuiesce(); }
-    bool isIprAccess() const { return staticInst->isIprAccess(); }
     bool isUnverifiable() const { return staticInst->isUnverifiable(); }
     bool isSyscall() const { return staticInst->isSyscall(); }
     bool isMacroop() const { return staticInst->isMacroop(); }
@@ -567,7 +564,6 @@
     bool isDelayedCommit() const { return staticInst->isDelayedCommit(); }
     bool isLastMicroop() const { return staticInst->isLastMicroop(); }
     bool isFirstMicroop() const { return staticInst->isFirstMicroop(); }
-    bool isMicroBranch() const { return staticInst->isMicroBranch(); }
     // hardware transactional memory
     bool isHtmStart() const { return staticInst->isHtmStart(); }
     bool isHtmStop() const { return staticInst->isHtmStop(); }
@@ -1072,11 +1068,11 @@
                                    Request::Flags flags,
                                    const std::vector<bool> &byte_enable)
 {
-    assert(byte_enable.empty() || byte_enable.size() == size);
+    assert(byte_enable.size() == size);
     return cpu->pushRequest(
-            dynamic_cast<typename DynInstPtr::PtrType>(this),
-            /* ld */ true, nullptr, size, addr, flags, nullptr, nullptr,
-            byte_enable);
+        dynamic_cast<typename DynInstPtr::PtrType>(this),
+        /* ld */ true, nullptr, size, addr, flags, nullptr, nullptr,
+        byte_enable);
 }
 
 template<class Impl>
@@ -1094,11 +1090,11 @@
                             Request::Flags flags, uint64_t *res,
                             const std::vector<bool> &byte_enable)
 {
-    assert(byte_enable.empty() || byte_enable.size() == size);
+    assert(byte_enable.size() == size);
     return cpu->pushRequest(
-            dynamic_cast<typename DynInstPtr::PtrType>(this),
-            /* st */ false, data, size, addr, flags, res, nullptr,
-            byte_enable);
+        dynamic_cast<typename DynInstPtr::PtrType>(this),
+        /* st */ false, data, size, addr, flags, res, nullptr,
+        byte_enable);
 }
 
 template<class Impl>
@@ -1115,7 +1111,7 @@
     return cpu->pushRequest(
             dynamic_cast<typename DynInstPtr::PtrType>(this),
             /* atomic */ false, nullptr, size, addr, flags, nullptr,
-            std::move(amo_op));
+            std::move(amo_op), std::vector<bool>(size, true));
 }
 
 #endif // __CPU_BASE_DYN_INST_HH__
diff --git a/src/cpu/checker/cpu.cc b/src/cpu/checker/cpu.cc
index fe0300e..8f55870 100644
--- a/src/cpu/checker/cpu.cc
+++ b/src/cpu/checker/cpu.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011,2013,2017-2018 ARM Limited
+ * Copyright (c) 2011,2013,2017-2018, 2020 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -147,21 +147,15 @@
 
     RequestPtr mem_req;
 
-    if (!byte_enable.empty()) {
-        // Set up byte-enable mask for the current fragment
-        auto it_start = byte_enable.cbegin() + (size - (frag_size +
-                                                        size_left));
-        auto it_end = byte_enable.cbegin() + (size - size_left);
-        if (isAnyActiveElement(it_start, it_end)) {
-            mem_req = std::make_shared<Request>(frag_addr, frag_size,
-                    flags, requestorId, thread->pcState().instAddr(),
-                    tc->contextId());
-            mem_req->setByteEnable(std::vector<bool>(it_start, it_end));
-        }
-    } else {
+    // Set up byte-enable mask for the current fragment
+    auto it_start = byte_enable.cbegin() + (size - (frag_size +
+                                                    size_left));
+    auto it_end = byte_enable.cbegin() + (size - size_left);
+    if (isAnyActiveElement(it_start, it_end)) {
         mem_req = std::make_shared<Request>(frag_addr, frag_size,
-                    flags, requestorId, thread->pcState().instAddr(),
-                    tc->contextId());
+                flags, requestorId, thread->pcState().instAddr(),
+                tc->contextId());
+        mem_req->setByteEnable(std::vector<bool>(it_start, it_end));
     }
 
     return mem_req;
@@ -172,7 +166,7 @@
                     Request::Flags flags,
                     const std::vector<bool>& byte_enable)
 {
-    assert(byte_enable.empty() || byte_enable.size() == size);
+    assert(byte_enable.size() == size);
 
     Fault fault = NoFault;
     bool checked_flags = false;
@@ -256,7 +250,7 @@
                      Addr addr, Request::Flags flags, uint64_t *res,
                      const std::vector<bool>& byte_enable)
 {
-    assert(byte_enable.empty() || byte_enable.size() == size);
+    assert(byte_enable.size() == size);
 
     Fault fault = NoFault;
     bool checked_flags = false;
diff --git a/src/cpu/checker/cpu.hh b/src/cpu/checker/cpu.hh
index f2395d7..97203c2 100644
--- a/src/cpu/checker/cpu.hh
+++ b/src/cpu/checker/cpu.hh
@@ -587,12 +587,12 @@
 
     Fault readMem(Addr addr, uint8_t *data, unsigned size,
                   Request::Flags flags,
-                  const std::vector<bool>& byte_enable = std::vector<bool>())
+                  const std::vector<bool>& byte_enable)
         override;
 
     Fault writeMem(uint8_t *data, unsigned size, Addr addr,
                    Request::Flags flags, uint64_t *res,
-                   const std::vector<bool>& byte_enable = std::vector<bool>())
+                   const std::vector<bool>& byte_enable)
         override;
 
     Fault amoMem(Addr addr, uint8_t* data, unsigned size,
@@ -610,9 +610,6 @@
     /////////////////////////////////////////////////////
 
     void wakeup(ThreadID tid) override { }
-    // Assume that the normal CPU's call to syscall was successful.
-    // The checker's state would have already been updated by the syscall.
-    void syscall() override { }
 
     void
     handleError()
diff --git a/src/cpu/checker/thread_context.hh b/src/cpu/checker/thread_context.hh
index b5a974b..d07de62 100644
--- a/src/cpu/checker/thread_context.hh
+++ b/src/cpu/checker/thread_context.hh
@@ -171,9 +171,6 @@
         actualTC->connectMemPorts(tc);
     }
 
-    /** Executes a syscall in SE mode. */
-    void syscall() override { return actualTC->syscall(); }
-
     Status status() const override { return actualTC->status(); }
 
     void
diff --git a/src/cpu/exec_context.hh b/src/cpu/exec_context.hh
index cfef3c3..7c433ad 100644
--- a/src/cpu/exec_context.hh
+++ b/src/cpu/exec_context.hh
@@ -233,7 +233,7 @@
      */
     virtual Fault readMem(Addr addr, uint8_t *data, unsigned int size,
             Request::Flags flags,
-            const std::vector<bool>& byte_enable = std::vector<bool>())
+            const std::vector<bool>& byte_enable)
     {
         panic("ExecContext::readMem() should be overridden\n");
     }
@@ -247,7 +247,7 @@
      */
     virtual Fault initiateMemRead(Addr addr, unsigned int size,
             Request::Flags flags,
-            const std::vector<bool>& byte_enable = std::vector<bool>())
+            const std::vector<bool>& byte_enable)
     {
         panic("ExecContext::initiateMemRead() should be overridden\n");
     }
@@ -263,8 +263,7 @@
      */
     virtual Fault writeMem(uint8_t *data, unsigned int size, Addr addr,
                            Request::Flags flags, uint64_t *res,
-                           const std::vector<bool>& byte_enable =
-                               std::vector<bool>()) = 0;
+                           const std::vector<bool>& byte_enable) = 0;
 
     /**
      * For atomic-mode contexts, perform an atomic AMO (a.k.a., Atomic
@@ -300,18 +299,6 @@
 
     /** @} */
 
-    /**
-     * @{
-     * @name SysCall Emulation Interfaces
-     */
-
-    /**
-     * Executes a syscall.
-     */
-    virtual void syscall() = 0;
-
-    /** @} */
-
     /** Returns a pointer to the ThreadContext. */
     virtual ThreadContext *tcBase() const = 0;
 
diff --git a/src/cpu/exetrace.cc b/src/cpu/exetrace.cc
index ca05041..69ee5cc 100644
--- a/src/cpu/exetrace.cc
+++ b/src/cpu/exetrace.cc
@@ -77,16 +77,15 @@
 
     Addr cur_pc = pc.instAddr();
     Loader::SymbolTable::const_iterator it;
+    ccprintf(outs, "%#x", cur_pc);
     if (Debug::ExecSymbol && (!FullSystem || !inUserMode(thread)) &&
             (it = Loader::debugSymbolTable.findNearest(cur_pc)) !=
                 Loader::debugSymbolTable.end()) {
         Addr delta = cur_pc - it->address;
         if (delta)
-            ccprintf(outs, "@%s+%d", it->name, delta);
+            ccprintf(outs, " @%s+%d", it->name, delta);
         else
-            ccprintf(outs, "@%s", it->name);
-    } else {
-        ccprintf(outs, "%#x", cur_pc);
+            ccprintf(outs, " @%s", it->name);
     }
 
     if (inst->isMicroop()) {
diff --git a/src/cpu/kvm/x86_cpu.cc b/src/cpu/kvm/x86_cpu.cc
index 5a667d4..1fda9a0 100644
--- a/src/cpu/kvm/x86_cpu.cc
+++ b/src/cpu/kvm/x86_cpu.cc
@@ -68,7 +68,7 @@
 // data) is used to indicate that a segment has been accessed.
 #define SEG_TYPE_BIT_ACCESSED 1
 
-struct FXSave
+struct M5_ATTR_PACKED FXSave
 {
     uint16_t fcw;
     uint16_t fsw;
@@ -97,7 +97,7 @@
     uint8_t xmm[16][16];
 
     uint64_t reserved[12];
-} M5_ATTR_PACKED;
+};
 
 static_assert(sizeof(FXSave) == 512, "Unexpected size of FXSave");
 
diff --git a/src/cpu/minor/dyn_inst.cc b/src/cpu/minor/dyn_inst.cc
index af02d9f..1b43fc8 100644
--- a/src/cpu/minor/dyn_inst.cc
+++ b/src/cpu/minor/dyn_inst.cc
@@ -214,10 +214,7 @@
                     regs_str << ',';
             }
 
-#if THE_ISA == ARM_ISA
-            regs_str << " extMachInst=" << std::hex << std::setw(16)
-                << std::setfill('0') << staticInst->machInst << std::dec;
-#endif
+            ccprintf(regs_str, " extMachInst=%016x", staticInst->machInst);
         }
 
         std::ostringstream flags;
diff --git a/src/cpu/minor/exec_context.hh b/src/cpu/minor/exec_context.hh
index 81675e6..4f7c763 100644
--- a/src/cpu/minor/exec_context.hh
+++ b/src/cpu/minor/exec_context.hh
@@ -105,10 +105,9 @@
     Fault
     initiateMemRead(Addr addr, unsigned int size,
                     Request::Flags flags,
-                    const std::vector<bool>& byte_enable =
-                        std::vector<bool>()) override
+                    const std::vector<bool>& byte_enable) override
     {
-        assert(byte_enable.empty() || byte_enable.size() == size);
+        assert(byte_enable.size() == size);
         return execute.getLSQ().pushRequest(inst, true /* load */, nullptr,
             size, addr, flags, nullptr, nullptr, byte_enable);
     }
@@ -123,10 +122,10 @@
     Fault
     writeMem(uint8_t *data, unsigned int size, Addr addr,
              Request::Flags flags, uint64_t *res,
-             const std::vector<bool>& byte_enable = std::vector<bool>())
+             const std::vector<bool>& byte_enable)
         override
     {
-        assert(byte_enable.empty() || byte_enable.size() == size);
+        assert(byte_enable.size() == size);
         return execute.getLSQ().pushRequest(inst, false /* store */, data,
             size, addr, flags, res, nullptr, byte_enable);
     }
@@ -137,7 +136,8 @@
     {
         // AMO requests are pushed through the store path
         return execute.getLSQ().pushRequest(inst, false /* amo */, nullptr,
-            size, addr, flags, nullptr, std::move(amo_op));
+            size, addr, flags, nullptr, std::move(amo_op),
+            std::vector<bool>(size, true));
     }
 
     RegVal
@@ -419,8 +419,6 @@
         return thread.setMiscReg(reg.index(), val);
     }
 
-    void syscall() override { thread.syscall(); }
-
     ThreadContext *tcBase() const override { return thread.getTC(); }
 
     /* @todo, should make stCondFailures persistent somewhere */
diff --git a/src/cpu/minor/execute.cc b/src/cpu/minor/execute.cc
index 45ca002..0833224 100644
--- a/src/cpu/minor/execute.cc
+++ b/src/cpu/minor/execute.cc
@@ -224,8 +224,7 @@
         !inst->isFault() &&
         inst->isLastOpInInst() &&
         (inst->staticInst->isSerializeAfter() ||
-         inst->staticInst->isSquashAfter() ||
-         inst->staticInst->isIprAccess());
+         inst->staticInst->isSquashAfter());
 
     DPRINTF(Branch, "tryToBranch before: %s after: %s%s\n",
         pc_before, target, (force_branch ? " (forcing)" : ""));
@@ -782,7 +781,7 @@
 
             /* Mark up barriers in the LSQ */
             if (!discarded && inst->isInst() &&
-                inst->staticInst->isMemBarrier())
+                inst->staticInst->isFullMemBarrier())
             {
                 DPRINTF(MinorMem, "Issuing memory barrier inst: %s\n", *inst);
                 lsq.issuedMemBarrierInst(inst);
@@ -952,7 +951,7 @@
             completed_inst = completed_mem_inst;
         }
         completed_mem_issue = completed_inst;
-    } else if (inst->isInst() && inst->staticInst->isMemBarrier() &&
+    } else if (inst->isInst() && inst->staticInst->isFullMemBarrier() &&
         !lsq.canPushIntoStoreBuffer())
     {
         DPRINTF(MinorExecute, "Can't commit data barrier inst: %s yet as"
@@ -1369,7 +1368,7 @@
             ex_info.inFlightInsts->pop();
 
             /* Complete barriers in the LSQ/move to store buffer */
-            if (inst->isInst() && inst->staticInst->isMemBarrier()) {
+            if (inst->isInst() && inst->staticInst->isFullMemBarrier()) {
                 DPRINTF(MinorMem, "Completing memory barrier"
                     " inst: %s committed: %d\n", *inst, committed_inst);
                 lsq.completeMemBarrierInst(inst, committed_inst);
diff --git a/src/cpu/minor/fetch1.cc b/src/cpu/minor/fetch1.cc
index 4977e3d..287f520 100644
--- a/src/cpu/minor/fetch1.cc
+++ b/src/cpu/minor/fetch1.cc
@@ -388,7 +388,7 @@
 Fetch1::minorTraceResponseLine(const std::string &name,
     Fetch1::FetchRequestPtr response) const
 {
-    const RequestPtr &request M5_VAR_USED = response->request;
+    M5_VAR_USED const RequestPtr &request = response->request;
 
     if (response->packet && response->packet->isError()) {
         MINORLINE(this, "id=F;%s vaddr=0x%x fault=\"error packet\"\n",
diff --git a/src/cpu/minor/lsq.cc b/src/cpu/minor/lsq.cc
index 106b51b..b4e5493 100644
--- a/src/cpu/minor/lsq.cc
+++ b/src/cpu/minor/lsq.cc
@@ -77,7 +77,7 @@
     SimpleThread &thread = *port.cpu.threads[inst->id.threadId];
     TheISA::PCState old_pc = thread.pcState();
     ExecContext context(port.cpu, thread, port.execute, inst);
-    Fault M5_VAR_USED fault = inst->translationFault;
+    M5_VAR_USED Fault fault = inst->translationFault;
 
     // Give the instruction a chance to suppress a translation fault
     inst->translationFault = inst->staticInst->initiateAcc(&context, nullptr);
@@ -154,7 +154,7 @@
 bool
 LSQ::LSQRequest::isBarrier()
 {
-    return inst->isInst() && inst->staticInst->isMemBarrier();
+    return inst->isInst() && inst->staticInst->isFullMemBarrier();
 }
 
 bool
@@ -301,8 +301,7 @@
         inst->id.threadId);
 
     const auto &byte_enable = request->getByteEnable();
-    if (byte_enable.size() == 0 ||
-        isAnyActiveElement(byte_enable.cbegin(), byte_enable.cend())) {
+    if (isAnyActiveElement(byte_enable.cbegin(), byte_enable.cend())) {
         port.numAccessesInDTLB++;
 
         setState(LSQ::LSQRequest::InTranslation);
@@ -334,7 +333,7 @@
 {
     port.numAccessesInDTLB--;
 
-    unsigned int M5_VAR_USED expected_fragment_index =
+    M5_VAR_USED unsigned int expected_fragment_index =
         numTranslatedFragments;
 
     numInTranslationFragments--;
@@ -475,7 +474,7 @@
     for (unsigned int fragment_index = 0; fragment_index < numFragments;
          fragment_index++)
     {
-        bool M5_VAR_USED is_last_fragment = false;
+        M5_VAR_USED bool is_last_fragment = false;
 
         if (fragment_addr == base_addr) {
             /* First fragment */
@@ -495,24 +494,19 @@
         bool disabled_fragment = false;
 
         fragment->setContext(request->contextId());
-        if (byte_enable.empty()) {
+        // Set up byte-enable mask for the current fragment
+        auto it_start = byte_enable.begin() +
+            (fragment_addr - base_addr);
+        auto it_end = byte_enable.begin() +
+            (fragment_addr - base_addr) + fragment_size;
+        if (isAnyActiveElement(it_start, it_end)) {
             fragment->setVirt(
                 fragment_addr, fragment_size, request->getFlags(),
-                request->requestorId(), request->getPC());
+                request->requestorId(),
+                request->getPC());
+            fragment->setByteEnable(std::vector<bool>(it_start, it_end));
         } else {
-            // Set up byte-enable mask for the current fragment
-            auto it_start = byte_enable.begin() +
-                (fragment_addr - base_addr);
-            auto it_end = byte_enable.begin() +
-                (fragment_addr - base_addr) + fragment_size;
-            if (isAnyActiveElement(it_start, it_end)) {
-                fragment->setVirt(
-                    fragment_addr, fragment_size, request->getFlags(),
-                    request->requestorId(), request->getPC());
-                fragment->setByteEnable(std::vector<bool>(it_start, it_end));
-            } else {
-                disabled_fragment = true;
-            }
+            disabled_fragment = true;
         }
 
         if (!disabled_fragment) {
@@ -1711,7 +1705,7 @@
 void
 LSQ::issuedMemBarrierInst(MinorDynInstPtr inst)
 {
-    assert(inst->isInst() && inst->staticInst->isMemBarrier());
+    assert(inst->isInst() && inst->staticInst->isFullMemBarrier());
     assert(inst->id.execSeqNum > lastMemBarrier[inst->id.threadId]);
 
     /* Remember the barrier.  We only have a notion of one
diff --git a/src/cpu/o3/commit_impl.hh b/src/cpu/o3/commit_impl.hh
index 75d065f..4935f1e 100644
--- a/src/cpu/o3/commit_impl.hh
+++ b/src/cpu/o3/commit_impl.hh
@@ -701,7 +701,7 @@
             // will be active.
             _nextStatus = Active;
 
-            const DynInstPtr &inst M5_VAR_USED = rob->readHeadInst(tid);
+            M5_VAR_USED const DynInstPtr &inst = rob->readHeadInst(tid);
 
             DPRINTF(Commit,"[tid:%i] Instruction [sn:%llu] PC %s is head of"
                     " ROB and ready to commit\n",
@@ -1196,7 +1196,7 @@
         // Make sure we are only trying to commit un-executed instructions we
         // think are possible.
         assert(head_inst->isNonSpeculative() || head_inst->isStoreConditional()
-               || head_inst->isMemBarrier() || head_inst->isWriteBarrier()
+               || head_inst->isReadBarrier() || head_inst->isWriteBarrier()
                || head_inst->isAtomic()
                || (head_inst->isLoad() && head_inst->strictlyOrdered()));
 
@@ -1233,11 +1233,6 @@
         return false;
     }
 
-    if (head_inst->isThreadSync()) {
-        // Not handled for now.
-        panic("Thread sync instructions are not handled yet.\n");
-    }
-
     // Check if the instruction caused a fault.  If so, trap.
     Fault inst_fault = head_inst->getFault();
 
@@ -1467,7 +1462,7 @@
         }
     }
 
-    if (inst->isMemBarrier()) {
+    if (inst->isFullMemBarrier()) {
         stats.membars[tid]++;
     }
 
diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc
index 01938f1..11fac25 100644
--- a/src/cpu/o3/cpu.cc
+++ b/src/cpu/o3/cpu.cc
@@ -914,26 +914,6 @@
 
 template <class Impl>
 void
-FullO3CPU<Impl>::syscall(ThreadID tid)
-{
-    DPRINTF(O3CPU, "[tid:%i] Executing syscall().\n\n", tid);
-
-    DPRINTF(Activity,"Activity: syscall() called.\n");
-
-    // Temporarily increase this by one to account for the syscall
-    // instruction.
-    ++(this->thread[tid]->funcExeInst);
-
-    // Execute the actual syscall.
-    this->thread[tid]->syscall();
-
-    // Decrease funcExeInst by one as the normal commit will handle
-    // incrementing it.
-    --(this->thread[tid]->funcExeInst);
-}
-
-template <class Impl>
-void
 FullO3CPU<Impl>::serializeThread(CheckpointOut &cp, ThreadID tid) const
 {
     thread[tid]->serialize(cp);
diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh
index 0447275..200d343 100644
--- a/src/cpu/o3/cpu.hh
+++ b/src/cpu/o3/cpu.hh
@@ -276,11 +276,6 @@
     void exitThreads();
 
   public:
-    /** Executes a syscall.
-     * @todo: Determine if this needs to be virtual.
-     */
-    void syscall(ThreadID tid);
-
     /** Starts draining the CPU's pipeline of all instructions in
      * order to stop all memory accesses. */
     DrainState drain() override;
diff --git a/src/cpu/o3/dyn_inst.hh b/src/cpu/o3/dyn_inst.hh
index 5f2a588..8172b9a 100644
--- a/src/cpu/o3/dyn_inst.hh
+++ b/src/cpu/o3/dyn_inst.hh
@@ -248,9 +248,6 @@
     /** Traps to handle specified fault. */
     void trap(const Fault &fault);
 
-    /** Emulates a syscall. */
-    void syscall() override;
-
   public:
 
     // The register accessor methods provide the index of the
diff --git a/src/cpu/o3/dyn_inst_impl.hh b/src/cpu/o3/dyn_inst_impl.hh
index 8a6a434..6c6625c 100644
--- a/src/cpu/o3/dyn_inst_impl.hh
+++ b/src/cpu/o3/dyn_inst_impl.hh
@@ -187,19 +187,4 @@
     this->cpu->trap(fault, this->threadNumber, this->staticInst);
 }
 
-template <class Impl>
-void
-BaseO3DynInst<Impl>::syscall()
-{
-    // HACK: check CPU's nextPC before and after syscall. If it
-    // changes, update this instruction's nextPC because the syscall
-    // must have changed the nextPC.
-    TheISA::PCState curPC = this->cpu->pcState(this->threadNumber);
-    this->cpu->syscall(this->threadNumber);
-    TheISA::PCState newPC = this->cpu->pcState(this->threadNumber);
-    if (!(curPC == newPC)) {
-        this->pcState(newPC);
-    }
-}
-
 #endif//__CPU_O3_DYN_INST_IMPL_HH__
diff --git a/src/cpu/o3/iew_impl.hh b/src/cpu/o3/iew_impl.hh
index 497c532..b39001d 100644
--- a/src/cpu/o3/iew_impl.hh
+++ b/src/cpu/o3/iew_impl.hh
@@ -1119,7 +1119,7 @@
             }
 
             toRename->iewInfo[tid].dispatchedToSQ++;
-        } else if (inst->isMemBarrier() || inst->isWriteBarrier()) {
+        } else if (inst->isReadBarrier() || inst->isWriteBarrier()) {
             // Same as non-speculative stores.
             inst->setCanCommit();
             instQueue.insertBarrier(inst);
diff --git a/src/cpu/o3/inst_queue_impl.hh b/src/cpu/o3/inst_queue_impl.hh
index ff5b3be..19ed49a 100644
--- a/src/cpu/o3/inst_queue_impl.hh
+++ b/src/cpu/o3/inst_queue_impl.hh
@@ -1014,7 +1014,7 @@
         ++freeEntries;
         completed_inst->memOpDone(true);
         count[tid]--;
-    } else if (completed_inst->isMemBarrier() ||
+    } else if (completed_inst->isReadBarrier() ||
                completed_inst->isWriteBarrier()) {
         // Completes a non mem ref barrier
         memDepUnit[tid].completeInst(completed_inst);
@@ -1245,7 +1245,7 @@
             DPRINTF(IQ, "[tid:%i] Instruction [sn:%llu] PC %s squashed.\n",
                     tid, squashed_inst->seqNum, squashed_inst->pcState());
 
-            bool is_acq_rel = squashed_inst->isMemBarrier() &&
+            bool is_acq_rel = squashed_inst->isFullMemBarrier() &&
                          (squashed_inst->isLoad() ||
                           (squashed_inst->isStore() &&
                              !squashed_inst->isStoreConditional()));
@@ -1255,7 +1255,7 @@
                 (!squashed_inst->isNonSpeculative() &&
                  !squashed_inst->isStoreConditional() &&
                  !squashed_inst->isAtomic() &&
-                 !squashed_inst->isMemBarrier() &&
+                 !squashed_inst->isReadBarrier() &&
                  !squashed_inst->isWriteBarrier())) {
 
                 for (int src_reg_idx = 0;
diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh
index 6e7d8d7..bec3ac2 100644
--- a/src/cpu/o3/lsq.hh
+++ b/src/cpu/o3/lsq.hh
@@ -406,15 +406,12 @@
         addRequest(Addr addr, unsigned size,
                    const std::vector<bool>& byte_enable)
         {
-            if (byte_enable.empty() ||
-                isAnyActiveElement(byte_enable.begin(), byte_enable.end())) {
+            if (isAnyActiveElement(byte_enable.begin(), byte_enable.end())) {
                 auto request = std::make_shared<Request>(
                         addr, size, _flags, _inst->requestorId(),
                         _inst->instAddr(), _inst->contextId(),
                         std::move(_amo_op));
-                if (!byte_enable.empty()) {
-                    request->setByteEnable(byte_enable);
-                }
+                request->setByteEnable(byte_enable);
                 _requests.push_back(request);
             }
         }
diff --git a/src/cpu/o3/lsq_impl.hh b/src/cpu/o3/lsq_impl.hh
index c4cb45e..e3922ae 100644
--- a/src/cpu/o3/lsq_impl.hh
+++ b/src/cpu/o3/lsq_impl.hh
@@ -682,7 +682,7 @@
     // This comming request can be either load, store or atomic.
     // Atomic request has a corresponding pointer to its atomic memory
     // operation
-    bool isAtomic M5_VAR_USED = !isLoad && amo_op;
+    M5_VAR_USED bool isAtomic = !isLoad && amo_op;
 
     ThreadID tid = cpu->contextToThread(inst->contextId());
     auto cacheLineSize = cpu->cacheLineSize();
@@ -715,9 +715,7 @@
                     size, flags, data, res, std::move(amo_op));
         }
         assert(req);
-        if (!byte_enable.empty()) {
-            req->_byteEnable = byte_enable;
-        }
+        req->_byteEnable = byte_enable;
         inst->setRequest();
         req->taskId(cpu->taskId());
 
@@ -894,9 +892,7 @@
     mainReq = std::make_shared<Request>(base_addr,
                 _size, _flags, _inst->requestorId(),
                 _inst->instAddr(), _inst->contextId());
-    if (!_byteEnable.empty()) {
-        mainReq->setByteEnable(_byteEnable);
-    }
+    mainReq->setByteEnable(_byteEnable);
 
     // Paddr is not used in mainReq. However, we will accumulate the flags
     // from the sub requests into mainReq by calling setFlags() in finish().
@@ -905,41 +901,29 @@
     mainReq->setPaddr(0);
 
     /* Get the pre-fix, possibly unaligned. */
-    if (_byteEnable.empty()) {
-        this->addRequest(base_addr, next_addr - base_addr, _byteEnable);
-    } else {
-        auto it_start = _byteEnable.begin();
-        auto it_end = _byteEnable.begin() + (next_addr - base_addr);
-        this->addRequest(base_addr, next_addr - base_addr,
-                         std::vector<bool>(it_start, it_end));
-    }
+    auto it_start = _byteEnable.begin();
+    auto it_end = _byteEnable.begin() + (next_addr - base_addr);
+    this->addRequest(base_addr, next_addr - base_addr,
+                     std::vector<bool>(it_start, it_end));
     size_so_far = next_addr - base_addr;
 
     /* We are block aligned now, reading whole blocks. */
     base_addr = next_addr;
     while (base_addr != final_addr) {
-        if (_byteEnable.empty()) {
-            this->addRequest(base_addr, cacheLineSize, _byteEnable);
-        } else {
-            auto it_start = _byteEnable.begin() + size_so_far;
-            auto it_end = _byteEnable.begin() + size_so_far + cacheLineSize;
-            this->addRequest(base_addr, cacheLineSize,
-                             std::vector<bool>(it_start, it_end));
-        }
+        auto it_start = _byteEnable.begin() + size_so_far;
+        auto it_end = _byteEnable.begin() + size_so_far + cacheLineSize;
+        this->addRequest(base_addr, cacheLineSize,
+                         std::vector<bool>(it_start, it_end));
         size_so_far += cacheLineSize;
         base_addr += cacheLineSize;
     }
 
     /* Deal with the tail. */
     if (size_so_far < _size) {
-        if (_byteEnable.empty()) {
-            this->addRequest(base_addr, _size - size_so_far, _byteEnable);
-        } else {
-            auto it_start = _byteEnable.begin() + size_so_far;
-            auto it_end = _byteEnable.end();
-            this->addRequest(base_addr, _size - size_so_far,
-                             std::vector<bool>(it_start, it_end));
-        }
+        auto it_start = _byteEnable.begin() + size_so_far;
+        auto it_end = _byteEnable.end();
+        this->addRequest(base_addr, _size - size_so_far,
+                         std::vector<bool>(it_start, it_end));
     }
 
     if (_requests.size() > 0) {
diff --git a/src/cpu/o3/mem_dep_unit_impl.hh b/src/cpu/o3/mem_dep_unit_impl.hh
index 4be98c5..7e1126e 100644
--- a/src/cpu/o3/mem_dep_unit_impl.hh
+++ b/src/cpu/o3/mem_dep_unit_impl.hh
@@ -44,6 +44,7 @@
 #include <map>
 #include <vector>
 
+#include "base/debug.hh"
 #include "cpu/o3/inst_queue.hh"
 #include "cpu/o3/mem_dep_unit.hh"
 #include "debug/MemDepUnit.hh"
@@ -171,24 +172,31 @@
 MemDepUnit<MemDepPred, Impl>::insertBarrierSN(const DynInstPtr &barr_inst)
 {
     InstSeqNum barr_sn = barr_inst->seqNum;
-    // Memory barriers block loads and stores, write barriers only stores.
-    // Required also for hardware transactional memory commands which
-    // can have strict ordering semantics
-    if (barr_inst->isMemBarrier() || barr_inst->isHtmCmd()) {
-        loadBarrierSNs.insert(barr_sn);
-        storeBarrierSNs.insert(barr_sn);
-        DPRINTF(MemDepUnit, "Inserted a memory barrier %s SN:%lli\n",
-                barr_inst->pcState(), barr_sn);
-    } else if (barr_inst->isWriteBarrier()) {
-        storeBarrierSNs.insert(barr_sn);
-        DPRINTF(MemDepUnit, "Inserted a write barrier %s SN:%lli\n",
-                barr_inst->pcState(), barr_sn);
-    }
 
-    if (loadBarrierSNs.size() || storeBarrierSNs.size()) {
-        DPRINTF(MemDepUnit, "Outstanding load barriers = %d; "
-                            "store barriers = %d\n",
-                loadBarrierSNs.size(), storeBarrierSNs.size());
+    if (barr_inst->isReadBarrier() || barr_inst->isHtmCmd())
+        loadBarrierSNs.insert(barr_sn);
+    if (barr_inst->isWriteBarrier() || barr_inst->isHtmCmd())
+        storeBarrierSNs.insert(barr_sn);
+
+    if (DTRACE(MemDepUnit)) {
+        const char *barrier_type = nullptr;
+        if (barr_inst->isReadBarrier() && barr_inst->isWriteBarrier())
+            barrier_type = "memory";
+        else if (barr_inst->isReadBarrier())
+            barrier_type = "read";
+        else if (barr_inst->isWriteBarrier())
+            barrier_type = "write";
+
+        if (barrier_type) {
+            DPRINTF(MemDepUnit, "Inserted a %s barrier %s SN:%lli\n",
+                    barrier_type, barr_inst->pcState(), barr_sn);
+        }
+
+        if (loadBarrierSNs.size() || storeBarrierSNs.size()) {
+            DPRINTF(MemDepUnit, "Outstanding load barriers = %d; "
+                                "store barriers = %d\n",
+                    loadBarrierSNs.size(), storeBarrierSNs.size());
+        }
     }
 }
 
@@ -262,7 +270,7 @@
     } else {
         // Otherwise make the instruction dependent on the store/barrier.
         DPRINTF(MemDepUnit, "Adding to dependency list\n");
-        for (auto M5_VAR_USED producing_store : producing_stores)
+        for (M5_VAR_USED auto producing_store : producing_stores)
             DPRINTF(MemDepUnit, "\tinst PC %s is dependent on [sn:%lli].\n",
                 inst->pcState(), producing_store);
 
@@ -444,18 +452,27 @@
     completed(inst);
     InstSeqNum barr_sn = inst->seqNum;
 
-    if (inst->isMemBarrier() || inst->isHtmCmd()) {
+    if (inst->isWriteBarrier() || inst->isHtmCmd()) {
+        assert(hasStoreBarrier());
+        storeBarrierSNs.erase(barr_sn);
+    }
+    if (inst->isReadBarrier() || inst->isHtmCmd()) {
         assert(hasLoadBarrier());
-        assert(hasStoreBarrier());
         loadBarrierSNs.erase(barr_sn);
-        storeBarrierSNs.erase(barr_sn);
-        DPRINTF(MemDepUnit, "Memory barrier completed: %s SN:%lli\n",
-                            inst->pcState(), inst->seqNum);
-    } else if (inst->isWriteBarrier()) {
-        assert(hasStoreBarrier());
-        storeBarrierSNs.erase(barr_sn);
-        DPRINTF(MemDepUnit, "Write barrier completed: %s SN:%lli\n",
-                            inst->pcState(), inst->seqNum);
+    }
+    if (DTRACE(MemDepUnit)) {
+        const char *barrier_type = nullptr;
+        if (inst->isWriteBarrier() && inst->isReadBarrier())
+            barrier_type = "Memory";
+        else if (inst->isWriteBarrier())
+            barrier_type = "Write";
+        else if (inst->isReadBarrier())
+            barrier_type = "Read";
+
+        if (barrier_type) {
+            DPRINTF(MemDepUnit, "%s barrier completed: %s SN:%lli\n",
+                                barrier_type, inst->pcState(), inst->seqNum);
+        }
     }
 }
 
@@ -463,9 +480,8 @@
 void
 MemDepUnit<MemDepPred, Impl>::wakeDependents(const DynInstPtr &inst)
 {
-    // Only stores, atomics, barriers and
-    // hardware transactional memory commands have dependents.
-    if (!inst->isStore() && !inst->isAtomic() && !inst->isMemBarrier() &&
+    // Only stores, atomics, barriers and HTM commands have dependents.
+    if (!inst->isStore() && !inst->isAtomic() && !inst->isReadBarrier() &&
         !inst->isWriteBarrier() && !inst->isHtmCmd()) {
         return;
     }
diff --git a/src/cpu/o3/rename_impl.hh b/src/cpu/o3/rename_impl.hh
index 1cbe87a..052012e 100644
--- a/src/cpu/o3/rename_impl.hh
+++ b/src/cpu/o3/rename_impl.hh
@@ -684,8 +684,7 @@
         // instructions.  This is mainly due to lack of support for
         // out-of-order operations of either of those classes of
         // instructions.
-        if ((inst->isIprAccess() || inst->isSerializeBefore()) &&
-            !inst->isSerializeHandled()) {
+        if (inst->isSerializeBefore() && !inst->isSerializeHandled()) {
             DPRINTF(Rename, "Serialize before instruction encountered.\n");
 
             if (!inst->isTempSerializeBefore()) {
diff --git a/src/cpu/o3/scoreboard.hh b/src/cpu/o3/scoreboard.hh
index 5573afc..1f3582f 100644
--- a/src/cpu/o3/scoreboard.hh
+++ b/src/cpu/o3/scoreboard.hh
@@ -57,7 +57,7 @@
     std::vector<bool> regScoreBoard;
 
     /** The number of actual physical registers */
-    unsigned M5_CLASS_VAR_USED numPhysRegs;
+    M5_CLASS_VAR_USED unsigned numPhysRegs;
 
   public:
     /** Constructs a scoreboard.
diff --git a/src/cpu/o3/thread_context.hh b/src/cpu/o3/thread_context.hh
index b3eba13..8d6edbf 100644
--- a/src/cpu/o3/thread_context.hh
+++ b/src/cpu/o3/thread_context.hh
@@ -419,13 +419,6 @@
         thread->storeCondFailures = sc_failures;
     }
 
-    /** Executes a syscall in SE mode. */
-    void
-    syscall() override
-    {
-        return cpu->syscall(thread->threadId());
-    }
-
     /** Reads the funcExeInst counter. */
     Counter readFuncExeInst() const override { return thread->funcExeInst; }
 
diff --git a/src/cpu/o3/thread_state.hh b/src/cpu/o3/thread_state.hh
index 3226832..285adea 100644
--- a/src/cpu/o3/thread_state.hh
+++ b/src/cpu/o3/thread_state.hh
@@ -128,9 +128,6 @@
 
     /** Returns a pointer to the TC of this thread. */
     ThreadContext *getTC() { return tc; }
-
-    /** Handles the syscall. */
-    void syscall() { process->syscall(tc); }
 };
 
 #endif // __CPU_O3_THREAD_STATE_HH__
diff --git a/src/cpu/pred/bpred_unit.cc b/src/cpu/pred/bpred_unit.cc
index d0ed71d..d71f464 100644
--- a/src/cpu/pred/bpred_unit.cc
+++ b/src/cpu/pred/bpred_unit.cc
@@ -108,7 +108,7 @@
 {
     // We shouldn't have any outstanding requests when we resume from
     // a drained system.
-    for (const auto& ph M5_VAR_USED : predHist)
+    for (M5_VAR_USED const auto& ph : predHist)
         assert(ph.empty());
 }
 
diff --git a/src/cpu/simple/atomic.cc b/src/cpu/simple/atomic.cc
index 20c6e1c..25248d9 100644
--- a/src/cpu/simple/atomic.cc
+++ b/src/cpu/simple/atomic.cc
@@ -345,21 +345,15 @@
         (Addr) size_left);
     size_left -= frag_size;
 
-    if (!byte_enable.empty()) {
-        // Set up byte-enable mask for the current fragment
-        auto it_start = byte_enable.begin() + (size - (frag_size + size_left));
-        auto it_end = byte_enable.begin() + (size - size_left);
-        if (isAnyActiveElement(it_start, it_end)) {
-            req->setVirt(frag_addr, frag_size, flags, dataRequestorId(),
-                         inst_addr);
-            req->setByteEnable(std::vector<bool>(it_start, it_end));
-        } else {
-            predicate = false;
-        }
-    } else {
+    // Set up byte-enable mask for the current fragment
+    auto it_start = byte_enable.begin() + (size - (frag_size + size_left));
+    auto it_end = byte_enable.begin() + (size - size_left);
+    if (isAnyActiveElement(it_start, it_end)) {
         req->setVirt(frag_addr, frag_size, flags, dataRequestorId(),
                      inst_addr);
-        req->setByteEnable(std::vector<bool>());
+        req->setByteEnable(std::vector<bool>(it_start, it_end));
+    } else {
+        predicate = false;
     }
 
     return predicate;
diff --git a/src/cpu/simple/exec_context.hh b/src/cpu/simple/exec_context.hh
index 2b2afd2..09b3668 100644
--- a/src/cpu/simple/exec_context.hh
+++ b/src/cpu/simple/exec_context.hh
@@ -433,31 +433,32 @@
     Fault
     readMem(Addr addr, uint8_t *data, unsigned int size,
             Request::Flags flags,
-            const std::vector<bool>& byte_enable = std::vector<bool>())
+            const std::vector<bool>& byte_enable)
         override
     {
-        assert(byte_enable.empty() || byte_enable.size() == size);
+        assert(byte_enable.size() == size);
         return cpu->readMem(addr, data, size, flags, byte_enable);
     }
 
     Fault
     initiateMemRead(Addr addr, unsigned int size,
                     Request::Flags flags,
-                    const std::vector<bool>& byte_enable = std::vector<bool>())
+                    const std::vector<bool>& byte_enable)
         override
     {
-        assert(byte_enable.empty() || byte_enable.size() == size);
+        assert(byte_enable.size() == size);
         return cpu->initiateMemRead(addr, size, flags, byte_enable);
     }
 
     Fault
     writeMem(uint8_t *data, unsigned int size, Addr addr,
              Request::Flags flags, uint64_t *res,
-             const std::vector<bool>& byte_enable = std::vector<bool>())
+             const std::vector<bool>& byte_enable)
         override
     {
-        assert(byte_enable.empty() || byte_enable.size() == size);
-        return cpu->writeMem(data, size, addr, flags, res, byte_enable);
+        assert(byte_enable.size() == size);
+        return cpu->writeMem(data, size, addr, flags, res,
+            byte_enable);
     }
 
     Fault amoMem(Addr addr, uint8_t *data, unsigned int size,
@@ -496,11 +497,6 @@
         return thread->readStCondFailures();
     }
 
-    /**
-     * Executes a syscall specified by the callnum.
-     */
-    void syscall() override { thread->syscall(); }
-
     /** Returns a pointer to the ThreadContext. */
     ThreadContext *tcBase() const override { return thread->getTC(); }
 
diff --git a/src/cpu/simple/timing.cc b/src/cpu/simple/timing.cc
index c898d79..1955695 100644
--- a/src/cpu/simple/timing.cc
+++ b/src/cpu/simple/timing.cc
@@ -467,9 +467,7 @@
 
     RequestPtr req = std::make_shared<Request>(
         addr, size, flags, dataRequestorId(), pc, thread->contextId());
-    if (!byte_enable.empty()) {
-        req->setByteEnable(byte_enable);
-    }
+    req->setByteEnable(byte_enable);
 
     req->taskId(taskId());
 
@@ -551,9 +549,7 @@
 
     RequestPtr req = std::make_shared<Request>(
         addr, size, flags, dataRequestorId(), pc, thread->contextId());
-    if (!byte_enable.empty()) {
-        req->setByteEnable(byte_enable);
-    }
+    req->setByteEnable(byte_enable);
 
     req->taskId(taskId());
 
@@ -947,7 +943,7 @@
     // hardware transactional memory
 
     SimpleExecContext *t_info = threadInfo[curThread];
-    const bool is_htm_speculative M5_VAR_USED =
+    M5_VAR_USED const bool is_htm_speculative =
         t_info->inHtmTransactionalState();
 
     // received a response from the dcache: complete the load or store
diff --git a/src/cpu/simple_thread.hh b/src/cpu/simple_thread.hh
index 5fe52cb..255140f 100644
--- a/src/cpu/simple_thread.hh
+++ b/src/cpu/simple_thread.hh
@@ -585,8 +585,6 @@
         return ThreadState::readFuncExeInst();
     }
 
-    void syscall() override { process->syscall(this); }
-
     RegVal readIntRegFlat(RegIndex idx) const override { return intRegs[idx]; }
     void
     setIntRegFlat(RegIndex idx, RegVal val) override
diff --git a/src/cpu/static_inst.cc b/src/cpu/static_inst.cc
index f21d41b..7c6b0bf 100644
--- a/src/cpu/static_inst.cc
+++ b/src/cpu/static_inst.cc
@@ -106,7 +106,6 @@
 {
     panic("StaticInst::branchTarget() called on instruction "
           "that is not a PC-relative branch.");
-    M5_DUMMY_RETURN;
 }
 
 TheISA::PCState
@@ -114,7 +113,6 @@
 {
     panic("StaticInst::branchTarget() called on instruction "
           "that is not an indirect branch.");
-    M5_DUMMY_RETURN;
 }
 
 const string &
diff --git a/src/cpu/static_inst.hh b/src/cpu/static_inst.hh
index 146be8c..6556170 100644
--- a/src/cpu/static_inst.hh
+++ b/src/cpu/static_inst.hh
@@ -157,7 +157,11 @@
 
     bool isNop()          const { return flags[IsNop]; }
 
-    bool isMemRef()       const { return flags[IsMemRef]; }
+    bool
+    isMemRef() const
+    {
+        return flags[IsLoad] || flags[IsStore] || flags[IsAtomic];
+    }
     bool isLoad()         const { return flags[IsLoad]; }
     bool isStore()        const { return flags[IsStore]; }
     bool isAtomic()       const { return flags[IsAtomic]; }
@@ -170,7 +174,6 @@
     bool isInteger()      const { return flags[IsInteger]; }
     bool isFloating()     const { return flags[IsFloating]; }
     bool isVector()       const { return flags[IsVector]; }
-    bool isCC()           const { return flags[IsCC]; }
 
     bool isControl()      const { return flags[IsControl]; }
     bool isCall()         const { return flags[IsCall]; }
@@ -179,20 +182,22 @@
     bool isIndirectCtrl() const { return flags[IsIndirectControl]; }
     bool isCondCtrl()     const { return flags[IsCondControl]; }
     bool isUncondCtrl()   const { return flags[IsUncondControl]; }
-    bool isCondDelaySlot() const { return flags[IsCondDelaySlot]; }
 
-    bool isThreadSync()   const { return flags[IsThreadSync]; }
     bool isSerializing()  const { return flags[IsSerializing] ||
                                       flags[IsSerializeBefore] ||
                                       flags[IsSerializeAfter]; }
     bool isSerializeBefore() const { return flags[IsSerializeBefore]; }
     bool isSerializeAfter() const { return flags[IsSerializeAfter]; }
     bool isSquashAfter() const { return flags[IsSquashAfter]; }
-    bool isMemBarrier()   const { return flags[IsMemBarrier]; }
+    bool
+    isFullMemBarrier() const
+    {
+        return flags[IsReadBarrier] && flags[IsWriteBarrier];
+    }
+    bool isReadBarrier() const { return flags[IsReadBarrier]; }
     bool isWriteBarrier() const { return flags[IsWriteBarrier]; }
     bool isNonSpeculative() const { return flags[IsNonSpeculative]; }
     bool isQuiesce() const { return flags[IsQuiesce]; }
-    bool isIprAccess() const { return flags[IsIprAccess]; }
     bool isUnverifiable() const { return flags[IsUnverifiable]; }
     bool isSyscall() const { return flags[IsSyscall]; }
     bool isMacroop() const { return flags[IsMacroop]; }
@@ -200,8 +205,6 @@
     bool isDelayedCommit() const { return flags[IsDelayedCommit]; }
     bool isLastMicroop() const { return flags[IsLastMicroop]; }
     bool isFirstMicroop() const { return flags[IsFirstMicroop]; }
-    //This flag doesn't do anything yet
-    bool isMicroBranch() const { return flags[IsMicroBranch]; }
     // hardware transactional memory
     // HtmCmds must be identified as such in order
     // to provide them with necessary memory ordering semantics.
diff --git a/src/cpu/testers/memtest/memtest.cc b/src/cpu/testers/memtest/memtest.cc
index 134f0f6..bf99058 100644
--- a/src/cpu/testers/memtest/memtest.cc
+++ b/src/cpu/testers/memtest/memtest.cc
@@ -245,7 +245,7 @@
     if (cmd < percentReads) {
         // start by ensuring there is a reference value if we have not
         // seen this address before
-        uint8_t M5_VAR_USED ref_data = 0;
+        M5_VAR_USED uint8_t ref_data = 0;
         auto ref = referenceData.find(req->getPaddr());
         if (ref == referenceData.end()) {
             referenceData[req->getPaddr()] = 0;
diff --git a/src/cpu/thread_context.hh b/src/cpu/thread_context.hh
index c4fbaf4..a6f7869 100644
--- a/src/cpu/thread_context.hh
+++ b/src/cpu/thread_context.hh
@@ -58,7 +58,6 @@
 // DTB pointers.
 namespace TheISA
 {
-    class ISA;
     class Decoder;
 }
 class BaseCPU;
@@ -88,7 +87,6 @@
 class ThreadContext : public PCEventScope
 {
   protected:
-    typedef TheISA::MachInst MachInst;
     using VecRegContainer = TheISA::VecRegContainer;
     using VecElem = TheISA::VecElem;
     using VecPredRegContainer = TheISA::VecPredRegContainer;
@@ -294,8 +292,6 @@
     // Same with st cond failures.
     virtual Counter readFuncExeInst() const = 0;
 
-    virtual void syscall() = 0;
-
     // This function exits the thread context in the CPU and returns
     // 1 if the CPU has no more active threads (meaning it's OK to exit);
     // Used in syscall-emulation mode when a  thread calls the exit syscall.
diff --git a/src/cpu/trace/trace_cpu.cc b/src/cpu/trace/trace_cpu.cc
index 3ac3207..a978903 100644
--- a/src/cpu/trace/trace_cpu.cc
+++ b/src/cpu/trace/trace_cpu.cc
@@ -811,7 +811,7 @@
     DPRINTF(TraceCPUData, "Printing readyList:\n");
     while (itr != readyList.end()) {
         auto graph_itr = depGraph.find(itr->seqNum);
-        GraphNode* node_ptr M5_VAR_USED = graph_itr->second;
+        M5_VAR_USED GraphNode* node_ptr = graph_itr->second;
         DPRINTFR(TraceCPUData, "\t%lld(%s), %lld\n", itr->seqNum,
             node_ptr->typeToStr(), itr->execTick);
         itr++;
@@ -1341,7 +1341,7 @@
         // If it is not an rob dependency then it must be a register dependency
         // If the register dependency is not found, it violates an assumption
         // and must be caught by assert.
-        bool regdep_found M5_VAR_USED = removeRegDep(done_seq_num);
+        M5_VAR_USED bool regdep_found = removeRegDep(done_seq_num);
         assert(regdep_found);
     }
     // Return true if the node is dependency free
diff --git a/src/dev/arm/SMMUv3.py b/src/dev/arm/SMMUv3.py
index 29c1568..f53b8ec 100644
--- a/src/dev/arm/SMMUv3.py
+++ b/src/dev/arm/SMMUv3.py
@@ -162,9 +162,9 @@
     # [0] S2P = 0b1, Stage 2 translation supported.
     smmu_idr0 = Param.UInt32(0x094C100F, "SMMU_IDR0 register");
 
-    # [25:21] CMDQS = 0b00101, Maximum number of Command queue entries
-    # as log 2 (entries) (0b00101 = 32 entries).
-    smmu_idr1 = Param.UInt32(0x00A00000, "SMMU_IDR1 register");
+    # [25:21] CMDQS = 0b00111, Maximum number of Command queue entries
+    # as log 2 (entries) (0b00111 = 128 entries).
+    smmu_idr1 = Param.UInt32(0x00E00000, "SMMU_IDR1 register");
 
     smmu_idr2 = Param.UInt32(0, "SMMU_IDR2 register");
     smmu_idr3 = Param.UInt32(0, "SMMU_IDR3 register");
diff --git a/src/dev/arm/fvp_base_pwr_ctrl.cc b/src/dev/arm/fvp_base_pwr_ctrl.cc
index fc66e1c..d6b6a59 100644
--- a/src/dev/arm/fvp_base_pwr_ctrl.cc
+++ b/src/dev/arm/fvp_base_pwr_ctrl.cc
@@ -58,13 +58,13 @@
 }
 
 void
-FVPBasePwrCtrl::init()
+FVPBasePwrCtrl::startup()
 {
     // All cores are ON by default (PwrStatus.{l0,l1} = 0b1)
     corePwrStatus.resize(sys->threads.size(), 0x60000000);
     for (const auto &tc : sys->threads)
         poweredCoresPerCluster[tc->socketId()] += 1;
-    BasicPioDevice::init();
+    BasicPioDevice::startup();
 }
 
 void
diff --git a/src/dev/arm/fvp_base_pwr_ctrl.hh b/src/dev/arm/fvp_base_pwr_ctrl.hh
index aa446a8..92c3198 100644
--- a/src/dev/arm/fvp_base_pwr_ctrl.hh
+++ b/src/dev/arm/fvp_base_pwr_ctrl.hh
@@ -88,7 +88,7 @@
      */
     void clearWakeRequest(ThreadContext *const tc);
 
-    void init() override;
+    void startup() override;
 
   protected:
     Tick read(PacketPtr pkt) override;
diff --git a/src/dev/arm/gic_v2.cc b/src/dev/arm/gic_v2.cc
index 1a6954d..a3939d1 100644
--- a/src/dev/arm/gic_v2.cc
+++ b/src/dev/arm/gic_v2.cc
@@ -389,7 +389,7 @@
     const ContextID ctx = pkt->req->contextId();
     const size_t data_sz = pkt->getSize();
 
-    uint32_t pkt_data M5_VAR_USED;
+    M5_VAR_USED uint32_t pkt_data;
     switch (data_sz)
     {
       case 1:
diff --git a/src/dev/arm/gic_v3_distributor.cc b/src/dev/arm/gic_v3_distributor.cc
index 27f404b..27fbe9c 100644
--- a/src/dev/arm/gic_v3_distributor.cc
+++ b/src/dev/arm/gic_v3_distributor.cc
@@ -472,6 +472,9 @@
         //return 0x43b; // ARM JEP106 code (r0p0 GIC-500)
         return 0;
 
+      case GICD_TYPER2: // Interrupt Controller Type Register 2
+        return 0; // RES0
+
       case GICD_STATUSR: // Error Reporting Status Register
         // Optional register, RAZ/WI
         return 0x0;
diff --git a/src/dev/arm/gic_v3_distributor.hh b/src/dev/arm/gic_v3_distributor.hh
index 99b65ed..5e17e2a 100644
--- a/src/dev/arm/gic_v3_distributor.hh
+++ b/src/dev/arm/gic_v3_distributor.hh
@@ -65,6 +65,8 @@
         GICD_TYPER = 0x0004,
         // Implementer Identification Register
         GICD_IIDR = 0x0008,
+        // Interrupt Controller Type Register 2
+        GICD_TYPER2 = 0x000C,
         // Error Reporting Status Register
         GICD_STATUSR = 0x0010,
         // Set Non-secure SPI Pending Register
diff --git a/src/dev/arm/gpu_nomali.hh b/src/dev/arm/gpu_nomali.hh
index 1880ec6..8c3ac26 100644
--- a/src/dev/arm/gpu_nomali.hh
+++ b/src/dev/arm/gpu_nomali.hh
@@ -99,7 +99,7 @@
      * @param err Error code from the NoMali library.
      * @param msg Message to print.
      */
-    static void gpuPanic(nomali_error_t err, const char *msg) M5_ATTR_NORETURN;
+    [[noreturn]] static void gpuPanic(nomali_error_t err, const char *msg);
     /**
      * Panic if the NoMali returned an error, do nothing otherwise.
      *
diff --git a/src/dev/arm/smmu_v3_transl.hh b/src/dev/arm/smmu_v3_transl.hh
index 878addd..bfe6319 100644
--- a/src/dev/arm/smmu_v3_transl.hh
+++ b/src/dev/arm/smmu_v3_transl.hh
@@ -97,7 +97,7 @@
     TranslContext context;
 
     Tick recvTick;
-    Tick M5_CLASS_VAR_USED faultTick;
+    M5_CLASS_VAR_USED Tick faultTick;
 
     virtual void main(Yield &yield);
 
diff --git a/src/dev/baddev.cc b/src/dev/baddev.cc
index 9e28d3b..48cdff3 100644
--- a/src/dev/baddev.cc
+++ b/src/dev/baddev.cc
@@ -49,14 +49,12 @@
 BadDevice::read(PacketPtr pkt)
 {
     panic("Device %s not imlpmented\n", devname);
-    M5_DUMMY_RETURN
 }
 
 Tick
 BadDevice::write(PacketPtr pkt)
 {
     panic("Device %s not imlpmented\n", devname);
-    M5_DUMMY_RETURN
 }
 
 BadDevice *
diff --git a/src/dev/hsa/hsa_packet_processor.cc b/src/dev/hsa/hsa_packet_processor.cc
index c31d9f0..89399ce 100644
--- a/src/dev/hsa/hsa_packet_processor.cc
+++ b/src/dev/hsa/hsa_packet_processor.cc
@@ -126,7 +126,7 @@
     assert(pkt->getAddr() >= pioAddr && pkt->getAddr() < pioAddr + pioSize);
 
     // TODO: How to get pid??
-    Addr M5_VAR_USED daddr = pkt->getAddr() - pioAddr;
+    M5_VAR_USED Addr daddr = pkt->getAddr() - pioAddr;
 
     DPRINTF(HSAPacketProcessor,
           "%s: write of size %d to reg-offset %d (0x%x)\n",
@@ -256,7 +256,7 @@
 HSAPacketProcessor::CmdQueueCmdDmaEvent::process()
 {
     uint32_t rl_idx = series_ctx->rl_idx;
-    AQLRingBuffer *aqlRingBuffer M5_VAR_USED =
+    M5_VAR_USED AQLRingBuffer *aqlRingBuffer =
         hsaPP->regdQList[rl_idx]->qCntxt.aqlBuf;
     HSAQueueDescriptor* qDesc =
         hsaPP->regdQList[rl_idx]->qCntxt.qDesc;
@@ -590,7 +590,7 @@
 void
 HSAPacketProcessor::displayQueueDescriptor(int pid, uint32_t rl_idx)
 {
-    HSAQueueDescriptor* M5_VAR_USED qDesc = regdQList[rl_idx]->qCntxt.qDesc;
+    M5_VAR_USED HSAQueueDescriptor* qDesc = regdQList[rl_idx]->qCntxt.qDesc;
     DPRINTF(HSAPacketProcessor,
             "%s: pid[%d], basePointer[0x%lx], dBPointer[0x%lx], "
             "writeIndex[0x%x], readIndex[0x%x], size(bytes)[0x%x]\n",
diff --git a/src/dev/hsa/hw_scheduler.cc b/src/dev/hsa/hw_scheduler.cc
index f25839d..7d8fb90 100644
--- a/src/dev/hsa/hw_scheduler.cc
+++ b/src/dev/hsa/hw_scheduler.cc
@@ -118,7 +118,7 @@
 
     // Check if this newly created queue can be directly mapped
     // to registered queue list
-    bool M5_VAR_USED register_q = mapQIfSlotAvlbl(queue_id, aql_buf, q_desc);
+    M5_VAR_USED bool register_q = mapQIfSlotAvlbl(queue_id, aql_buf, q_desc);
     schedWakeup();
     DPRINTF(HSAPacketProcessor,
              "%s: offset = %p, qID = %d, is_regd = %s, AL size %d\n",
diff --git a/src/dev/mips/malta.hh b/src/dev/mips/malta.hh
index d2bf584..d424daf 100644
--- a/src/dev/mips/malta.hh
+++ b/src/dev/mips/malta.hh
@@ -108,21 +108,18 @@
     calcPciConfigAddr(int bus, int dev, int func)
     {
         panic("Need implementation\n");
-        M5_DUMMY_RETURN
     }
 
     Addr
     calcPciIOAddr(Addr addr)
     {
         panic("Need implementation\n");
-        M5_DUMMY_RETURN
     }
 
     Addr
     calcPciMemAddr(Addr addr)
     {
         panic("Need implementation\n");
-        M5_DUMMY_RETURN
     }
 
     void serialize(CheckpointOut &cp) const override;
diff --git a/src/dev/net/sinic.cc b/src/dev/net/sinic.cc
index 5d8bee2..bc6fdbd 100644
--- a/src/dev/net/sinic.cc
+++ b/src/dev/net/sinic.cc
@@ -233,7 +233,7 @@
 
     prepareRead(cpu, index);
 
-    uint64_t value M5_VAR_USED = 0;
+    M5_VAR_USED uint64_t value = 0;
     if (pkt->getSize() == 4) {
         uint32_t reg = regData32(raddr);
         pkt->setLE(reg);
diff --git a/src/dev/net/tcp_iface.cc b/src/dev/net/tcp_iface.cc
index cb6fecb..f7591a0 100644
--- a/src/dev/net/tcp_iface.cc
+++ b/src/dev/net/tcp_iface.cc
@@ -253,7 +253,7 @@
 
 TCPIface::~TCPIface()
 {
-    int M5_VAR_USED ret;
+    M5_VAR_USED int ret;
 
     ret = close(sock);
     assert(ret == 0);
diff --git a/src/dev/pci/copy_engine.cc b/src/dev/pci/copy_engine.cc
index 4c66eb0..d526a3e 100644
--- a/src/dev/pci/copy_engine.cc
+++ b/src/dev/pci/copy_engine.cc
@@ -305,19 +305,19 @@
     ///
 
     if (size == sizeof(uint64_t)) {
-        uint64_t val M5_VAR_USED = pkt->getLE<uint64_t>();
+        M5_VAR_USED uint64_t val = pkt->getLE<uint64_t>();
         DPRINTF(DMACopyEngine, "Wrote device register %#X value %#X\n",
                 daddr, val);
     } else if (size == sizeof(uint32_t)) {
-        uint32_t val M5_VAR_USED = pkt->getLE<uint32_t>();
+        M5_VAR_USED uint32_t val = pkt->getLE<uint32_t>();
         DPRINTF(DMACopyEngine, "Wrote device register %#X value %#X\n",
                 daddr, val);
     } else if (size == sizeof(uint16_t)) {
-        uint16_t val M5_VAR_USED = pkt->getLE<uint16_t>();
+        M5_VAR_USED uint16_t val = pkt->getLE<uint16_t>();
         DPRINTF(DMACopyEngine, "Wrote device register %#X value %#X\n",
                 daddr, val);
     } else if (size == sizeof(uint8_t)) {
-        uint8_t val M5_VAR_USED = pkt->getLE<uint8_t>();
+        M5_VAR_USED uint8_t val = pkt->getLE<uint8_t>();
         DPRINTF(DMACopyEngine, "Wrote device register %#X value %#X\n",
                 daddr, val);
     } else {
diff --git a/src/dev/sparc/t1000.cc b/src/dev/sparc/t1000.cc
index 36a1666..6cf716a 100644
--- a/src/dev/sparc/t1000.cc
+++ b/src/dev/sparc/t1000.cc
@@ -75,7 +75,6 @@
 T1000::pciToDma(Addr pciAddr) const
 {
     panic("Need implementation\n");
-    M5_DUMMY_RETURN
 }
 
 
@@ -83,21 +82,18 @@
 T1000::calcPciConfigAddr(int bus, int dev, int func)
 {
     panic("Need implementation\n");
-    M5_DUMMY_RETURN
 }
 
 Addr
 T1000::calcPciIOAddr(Addr addr)
 {
     panic("Need implementation\n");
-    M5_DUMMY_RETURN
 }
 
 Addr
 T1000::calcPciMemAddr(Addr addr)
 {
     panic("Need implementation\n");
-    M5_DUMMY_RETURN
 }
 
 T1000 *
diff --git a/src/dev/storage/ide_ctrl.cc b/src/dev/storage/ide_ctrl.cc
index 47cdd10..5efa42b 100644
--- a/src/dev/storage/ide_ctrl.cc
+++ b/src/dev/storage/ide_ctrl.cc
@@ -120,7 +120,8 @@
             panic("IDE controllers support a maximum "
                   "of 4 devices attached!\n");
         }
-        params()->disks[i]->setController(this, sys->getPageBytes());
+        // Arbitrarily set the chunk size to 4K.
+        params()->disks[i]->setController(this, 4 * 1024);
     }
 
     primary.select(false);
diff --git a/src/dev/storage/ide_disk.cc b/src/dev/storage/ide_disk.cc
index e97e23b..57fa076 100644
--- a/src/dev/storage/ide_disk.cc
+++ b/src/dev/storage/ide_disk.cc
@@ -435,7 +435,7 @@
         // clear out the data buffer
         memset(dataBuffer, 0, MAX_DMA_SIZE);
         dmaReadCG = new ChunkGenerator(curPrd.getBaseAddr(),
-                curPrd.getByteCount(), pageBytes);
+                curPrd.getByteCount(), chunkBytes);
 
     }
     if (ctrl->dmaPending() || ctrl->drainState() != DrainState::Running) {
@@ -447,7 +447,7 @@
                 &dmaReadWaitEvent, dataBuffer + dmaReadCG->complete());
         dmaReadBytes += dmaReadCG->size();
         dmaReadTxs++;
-        if (dmaReadCG->size() == pageBytes)
+        if (dmaReadCG->size() == chunkBytes)
             dmaReadFullPages++;
         dmaReadCG->next();
     } else {
@@ -518,7 +518,7 @@
     if (!dmaWriteCG) {
         // clear out the data buffer
         dmaWriteCG = new ChunkGenerator(curPrd.getBaseAddr(),
-                curPrd.getByteCount(), pageBytes);
+                curPrd.getByteCount(), chunkBytes);
     }
     if (ctrl->dmaPending() || ctrl->drainState() != DrainState::Running) {
         schedule(dmaWriteWaitEvent, curTick() + DMA_BACKOFF_PERIOD);
@@ -532,7 +532,7 @@
                 curPrd.getByteCount(), curPrd.getEOT());
         dmaWriteBytes += dmaWriteCG->size();
         dmaWriteTxs++;
-        if (dmaWriteCG->size() == pageBytes)
+        if (dmaWriteCG->size() == chunkBytes)
             dmaWriteFullPages++;
         dmaWriteCG->next();
     } else {
diff --git a/src/dev/storage/ide_disk.hh b/src/dev/storage/ide_disk.hh
index 9f42941..90cbf57 100644
--- a/src/dev/storage/ide_disk.hh
+++ b/src/dev/storage/ide_disk.hh
@@ -239,8 +239,8 @@
     DmaState_t dmaState;
     /** Dma transaction is a read */
     bool dmaRead;
-    /** Size of OS pages. */
-    Addr pageBytes;
+    /** Size of chunks to DMA. */
+    Addr chunkBytes;
     /** PRD table base address */
     uint32_t curPrdAddr;
     /** PRD entry */
@@ -283,11 +283,11 @@
      * @param c The IDE controller
      */
     void
-    setController(IdeController *c, Addr page_bytes)
+    setController(IdeController *c, Addr chunk_bytes)
     {
         panic_if(ctrl, "Cannot change the controller once set!\n");
         ctrl = c;
-        pageBytes = page_bytes;
+        chunkBytes = chunk_bytes;
     }
 
     // Device register read/write
diff --git a/src/dev/virtio/base.hh b/src/dev/virtio/base.hh
index de6cfd1..e032372 100644
--- a/src/dev/virtio/base.hh
+++ b/src/dev/virtio/base.hh
@@ -451,10 +451,10 @@
         typedef uint16_t Flags;
         typedef uint16_t Index;
 
-        struct Header {
+        struct M5_ATTR_PACKED Header {
             Flags flags;
             Index index;
-        } M5_ATTR_PACKED;
+        };
 
         VirtRing<T>(PortProxy &proxy, ByteOrder bo, uint16_t size) :
             header{0, 0}, ring(size), _proxy(proxy), _base(0), byteOrder(bo)
diff --git a/src/dev/virtio/block.hh b/src/dev/virtio/block.hh
index 4393f2b..f160a18 100644
--- a/src/dev/virtio/block.hh
+++ b/src/dev/virtio/block.hh
@@ -81,9 +81,9 @@
      * @note This needs to be changed if the supported feature set
      * changes!
      */
-    struct Config {
+    struct M5_ATTR_PACKED Config {
         uint64_t capacity;
-    } M5_ATTR_PACKED;
+    };
     Config config;
 
     /** @{
@@ -122,11 +122,11 @@
     /** @} */
 
     /** VirtIO block device request as sent by guest */
-    struct BlkRequest {
+    struct M5_ATTR_PACKED BlkRequest {
         RequestType type;
         uint32_t reserved;
         uint64_t sector;
-    } M5_ATTR_PACKED;
+    };
 
     /**
      * Device read request.
diff --git a/src/dev/virtio/console.hh b/src/dev/virtio/console.hh
index d60bc66..73afb9c 100644
--- a/src/dev/virtio/console.hh
+++ b/src/dev/virtio/console.hh
@@ -77,10 +77,10 @@
      * @note This needs to be changed if the multiport feature is
      * announced!
      */
-    struct Config {
+    struct M5_ATTR_PACKED Config {
         uint16_t cols;
         uint16_t rows;
-    } M5_ATTR_PACKED;
+    };
 
     /** Currently active configuration (host byte order) */
     Config config;
diff --git a/src/dev/virtio/fs9p.hh b/src/dev/virtio/fs9p.hh
index 2eecbc5..7751b36 100644
--- a/src/dev/virtio/fs9p.hh
+++ b/src/dev/virtio/fs9p.hh
@@ -50,14 +50,14 @@
 typedef uint8_t P9MsgType;
 typedef uint16_t P9Tag;
 
-struct P9MsgHeader {
+struct M5_ATTR_PACKED P9MsgHeader {
     /** Length including header */
     uint32_t len;
     /** Message type */
     P9MsgType type;
     /** Message tag */
     P9Tag tag;
-} M5_ATTR_PACKED;
+};
 
 /** Convert p9 byte order (LE) to host byte order */
 template <typename T> inline T
@@ -120,10 +120,10 @@
      * @note The fields in this structure depend on the features
      * exposed to the guest.
      */
-    struct Config {
+    struct M5_ATTR_PACKED Config {
         uint16_t len;
         char tag[];
-    } M5_ATTR_PACKED;
+    };
 
     /** Currently active configuration (host byte order) */
     std::unique_ptr<Config> config;
diff --git a/src/dev/virtio/pci.cc b/src/dev/virtio/pci.cc
index 115136e..fdded20 100644
--- a/src/dev/virtio/pci.cc
+++ b/src/dev/virtio/pci.cc
@@ -65,7 +65,7 @@
 Tick
 PciVirtIO::read(PacketPtr pkt)
 {
-    const unsigned M5_VAR_USED size(pkt->getSize());
+    M5_VAR_USED const unsigned size(pkt->getSize());
     int bar;
     Addr offset;
     if (!getBAR(pkt->getAddr(), bar, offset))
@@ -146,7 +146,7 @@
 Tick
 PciVirtIO::write(PacketPtr pkt)
 {
-    const unsigned M5_VAR_USED size(pkt->getSize());
+    M5_VAR_USED const unsigned size(pkt->getSize());
     int bar;
     Addr offset;
     if (!getBAR(pkt->getAddr(), bar, offset))
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index 33f5c6e..8108029 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -350,7 +350,7 @@
     // set the wavefront context to have a pointer to this section of the LDS
     w->ldsChunk = ldsChunk;
 
-    int32_t refCount M5_VAR_USED =
+    M5_VAR_USED int32_t refCount =
                 lds.increaseRefCounter(w->dispatchId, w->wgId);
     DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
                     cu_id, w->wgId, refCount);
@@ -867,7 +867,7 @@
         // this is for writeComplete callback
         // we simply get decrement write-related wait counters
         assert(gpuDynInst);
-        Wavefront *w M5_VAR_USED =
+        M5_VAR_USED Wavefront *w =
             computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
         assert(w);
         DPRINTF(GPUExec, "WriteCompleteResp: WF[%d][%d] WV%d %s decrementing "
@@ -965,7 +965,7 @@
 
     for (int i = 0; i < len; ++i) {
         PacketPtr pkt = retries.front().first;
-        GPUDynInstPtr gpuDynInst M5_VAR_USED = retries.front().second;
+        M5_VAR_USED GPUDynInstPtr gpuDynInst = retries.front().second;
         DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
                 computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                 pkt->req->getPaddr());
@@ -999,7 +999,7 @@
 
     for (int i = 0; i < len; ++i) {
         PacketPtr pkt = retries.front().first;
-        Wavefront *wavefront M5_VAR_USED = retries.front().second;
+        M5_VAR_USED Wavefront *wavefront = retries.front().second;
         DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
                 computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
                 pkt->req->getPaddr());
@@ -1406,7 +1406,7 @@
         DTLBPort::SenderState *sender_state =
             safe_cast<DTLBPort::SenderState*>(translation_state->saved);
 
-        Wavefront *w M5_VAR_USED =
+        M5_VAR_USED Wavefront *w =
             computeUnit->wfList[sender_state->_gpuDynInst->simdId]
             [sender_state->_gpuDynInst->wfSlotId];
 
@@ -1575,7 +1575,7 @@
 {
     SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
     GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
-    ComputeUnit *compute_unit M5_VAR_USED = computeUnit;
+    M5_VAR_USED ComputeUnit *compute_unit = computeUnit;
 
     if (!(sendTimingReq(pkt))) {
         retries.push_back(std::make_pair(pkt, gpuDynInst));
@@ -1604,7 +1604,7 @@
 {
     SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
     GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
-    ComputeUnit *compute_unit M5_VAR_USED = scalarDataPort.computeUnit;
+    M5_VAR_USED ComputeUnit *compute_unit = scalarDataPort.computeUnit;
 
     if (!(scalarDataPort.sendTimingReq(pkt))) {
         scalarDataPort.retries.push_back(pkt);
@@ -1644,7 +1644,7 @@
 
     for (int i = 0; i < len; ++i) {
         PacketPtr pkt = retries.front();
-        Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
+        M5_VAR_USED Addr vaddr = pkt->req->getVaddr();
         DPRINTF(GPUTLB, "CU%d: retrying D-translaton for address%#x", vaddr);
 
         if (!sendTimingReq(pkt)) {
@@ -1683,7 +1683,7 @@
     GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
     delete pkt->senderState;
 
-    Wavefront *w M5_VAR_USED = gpuDynInst->wavefront();
+    M5_VAR_USED Wavefront *w = gpuDynInst->wavefront();
 
     DPRINTF(GPUTLB, "CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received "
         "translation: PA %#x -> %#x\n", computeUnit->cu_id, w->simdId,
@@ -1722,7 +1722,7 @@
 bool
 ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt)
 {
-    Addr line M5_VAR_USED = pkt->req->getPaddr();
+    M5_VAR_USED Addr line = pkt->req->getPaddr();
     DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
             computeUnit->cu_id, pkt->req->getVaddr(), line);
 
@@ -1788,7 +1788,7 @@
 
     for (int i = 0; i < len; ++i) {
         PacketPtr pkt = retries.front();
-        Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
+        M5_VAR_USED Addr vaddr = pkt->req->getVaddr();
         DPRINTF(GPUTLB, "CU%d: retrying I-translaton for address%#x", vaddr);
 
         if (!sendTimingReq(pkt)) {
@@ -2584,7 +2584,7 @@
             dynamic_cast<ComputeUnit::LDSPort::SenderState*>(pkt->senderState);
     fatal_if(!sender_state, "packet without a valid sender state");
 
-    GPUDynInstPtr gpuDynInst M5_VAR_USED = sender_state->getMemInst();
+    M5_VAR_USED GPUDynInstPtr gpuDynInst = sender_state->getMemInst();
 
     if (isStalled()) {
         fatal_if(retries.empty(), "must have retries waiting to be stalled");
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
index fe2091d..2df4807 100644
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -1015,6 +1015,8 @@
             return sqcTLBPort;
         } else if (if_name == "ldsPort") {
             return ldsPort;
+        } else if (if_name == "gmTokenPort") {
+            return gmTokenPort;
         } else {
             return ClockedObject::getPort(if_name, idx);
         }
diff --git a/src/gpu-compute/fetch_unit.cc b/src/gpu-compute/fetch_unit.cc
index 4e4259e..5d98288 100644
--- a/src/gpu-compute/fetch_unit.cc
+++ b/src/gpu-compute/fetch_unit.cc
@@ -33,6 +33,7 @@
 
 #include "gpu-compute/fetch_unit.hh"
 
+#include "base/bitfield.hh"
 #include "debug/GPUFetch.hh"
 #include "debug/GPUPort.hh"
 #include "debug/GPUTLB.hh"
@@ -240,6 +241,8 @@
      * pending, in the same cycle another instruction is trying to fetch.
      */
     if (!fetchBuf.at(wavefront->wfSlotId).isReserved(pkt->req->getVaddr())) {
+        wavefront->dropFetch = false;
+        wavefront->pendingFetch = false;
         return;
     }
 
@@ -574,7 +577,8 @@
     int num_dwords = sizeof(TheGpuISA::RawMachInst) / dword_size;
 
     for (int i = 0; i < num_dwords; ++i) {
-        ((uint32_t*)(&split_inst))[i] = *reinterpret_cast<uint32_t*>(readPtr);
+        replaceBits(split_inst, 32*(i+1)-1, 32*i,
+            *reinterpret_cast<uint32_t*>(readPtr));
         if (readPtr + dword_size >= bufEnd) {
             readPtr = bufStart;
         }
diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh
index 3d2fa0d..f34eff6 100644
--- a/src/gpu-compute/gpu_dyn_inst.hh
+++ b/src/gpu-compute/gpu_dyn_inst.hh
@@ -35,6 +35,7 @@
 #define __GPU_DYN_INST_HH__
 
 #include <cstdint>
+#include <memory>
 #include <string>
 
 #include "base/amo.hh"
@@ -255,27 +256,27 @@
     makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
     {
         if (isAtomicAnd()) {
-            return m5::make_unique<AtomicOpAnd<c0>>(*reg0);
+            return std::make_unique<AtomicOpAnd<c0>>(*reg0);
         } else if (isAtomicOr()) {
-            return m5::make_unique<AtomicOpOr<c0>>(*reg0);
+            return std::make_unique<AtomicOpOr<c0>>(*reg0);
         } else if (isAtomicXor()) {
-            return m5::make_unique<AtomicOpXor<c0>>(*reg0);
+            return std::make_unique<AtomicOpXor<c0>>(*reg0);
         } else if (isAtomicCAS()) {
-            return m5::make_unique<AtomicOpCAS<c0>>(*reg0, *reg1, cu);
+            return std::make_unique<AtomicOpCAS<c0>>(*reg0, *reg1, cu);
         } else if (isAtomicExch()) {
-            return m5::make_unique<AtomicOpExch<c0>>(*reg0);
+            return std::make_unique<AtomicOpExch<c0>>(*reg0);
         } else if (isAtomicAdd()) {
-            return m5::make_unique<AtomicOpAdd<c0>>(*reg0);
+            return std::make_unique<AtomicOpAdd<c0>>(*reg0);
         } else if (isAtomicSub()) {
-            return m5::make_unique<AtomicOpSub<c0>>(*reg0);
+            return std::make_unique<AtomicOpSub<c0>>(*reg0);
         } else if (isAtomicInc()) {
-            return m5::make_unique<AtomicOpInc<c0>>();
+            return std::make_unique<AtomicOpInc<c0>>();
         } else if (isAtomicDec()) {
-            return m5::make_unique<AtomicOpDec<c0>>();
+            return std::make_unique<AtomicOpDec<c0>>();
         } else if (isAtomicMax()) {
-            return m5::make_unique<AtomicOpMax<c0>>(*reg0);
+            return std::make_unique<AtomicOpMax<c0>>(*reg0);
         } else if (isAtomicMin()) {
-            return m5::make_unique<AtomicOpMin<c0>>(*reg0);
+            return std::make_unique<AtomicOpMin<c0>>(*reg0);
         } else {
             fatal("Unrecognized atomic operation");
         }
diff --git a/src/gpu-compute/gpu_tlb.cc b/src/gpu-compute/gpu_tlb.cc
index 4c35396..54c3729 100644
--- a/src/gpu-compute/gpu_tlb.cc
+++ b/src/gpu-compute/gpu_tlb.cc
@@ -164,7 +164,7 @@
          * vpn holds the virtual page address
          * The least significant bits are simply masked
          */
-        int set = (vpn >> TheISA::PageShift) & setMask;
+        int set = (vpn >> PageShift) & setMask;
 
         if (!freeList[set].empty()) {
             newEntry = freeList[set].front();
@@ -184,7 +184,7 @@
     GpuTLB::EntryList::iterator
     GpuTLB::lookupIt(Addr va, bool update_lru)
     {
-        int set = (va >> TheISA::PageShift) & setMask;
+        int set = (va >> PageShift) & setMask;
 
         if (FA) {
             assert(!set);
@@ -214,7 +214,7 @@
     TlbEntry*
     GpuTLB::lookup(Addr va, bool update_lru)
     {
-        int set = (va >> TheISA::PageShift) & setMask;
+        int set = (va >> PageShift) & setMask;
 
         auto entry = lookupIt(va, update_lru);
 
@@ -266,7 +266,7 @@
     GpuTLB::demapPage(Addr va, uint64_t asn)
     {
 
-        int set = (va >> TheISA::PageShift) & setMask;
+        int set = (va >> PageShift) & setMask;
         auto entry = lookupIt(va, false);
 
         if (entry != entryList[set].end()) {
diff --git a/src/gpu-compute/schedule_stage.cc b/src/gpu-compute/schedule_stage.cc
index 005e6f6..a311f5d 100644
--- a/src/gpu-compute/schedule_stage.cc
+++ b/src/gpu-compute/schedule_stage.cc
@@ -749,7 +749,7 @@
                 // that we've reserved a global and local memory unit. Thus,
                 // we need to mark the latter execution unit as not available.
                 if (execUnitIds.size() > 1) {
-                    int lm_exec_unit M5_VAR_USED = wf->localMem;
+                    M5_VAR_USED int lm_exec_unit = wf->localMem;
                     assert(toExecute.dispatchStatus(lm_exec_unit)
                            == SKIP);
                 }
@@ -758,7 +758,7 @@
                 // Verify the GM pipe for this wave is ready to execute
                 // and the wave in the GM pipe is the same as the wave
                 // in the LM pipe
-                int gm_exec_unit M5_VAR_USED = wf->globalMem;
+                M5_VAR_USED int gm_exec_unit = wf->globalMem;
                 assert(wf->wfDynId == toExecute
                        .readyInst(gm_exec_unit)->wfDynId);
                 assert(toExecute.dispatchStatus(gm_exec_unit)
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
index 0e737db..dd914ca 100644
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -33,6 +33,7 @@
 
 #include "gpu-compute/wavefront.hh"
 
+#include "base/bitfield.hh"
 #include "debug/GPUExec.hh"
 #include "debug/GPUInitAbi.hh"
 #include "debug/WavefrontStack.hh"
@@ -257,23 +258,23 @@
                 physSgprIdx =
                     computeUnit->registerManager->mapSgpr(this, regInitIdx);
                 computeUnit->srf[simdId]->write(physSgprIdx,
-                        ((uint32_t*)&host_disp_pkt_addr)[0]);
+                        bits(host_disp_pkt_addr, 31, 0));
                 ++regInitIdx;
                 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                         "Setting DispatchPtr: s[%d] = %x\n",
                         computeUnit->cu_id, simdId,
                         wfSlotId, wfDynId, physSgprIdx,
-                        ((uint32_t*)&host_disp_pkt_addr)[0]);
+                        bits(host_disp_pkt_addr, 31, 0));
 
                 physSgprIdx =
                     computeUnit->registerManager->mapSgpr(this, regInitIdx);
                 computeUnit->srf[simdId]->write(physSgprIdx,
-                        ((uint32_t*)&host_disp_pkt_addr)[1]);
+                        bits(host_disp_pkt_addr, 63, 32));
                 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                         "Setting DispatchPtr: s[%d] = %x\n",
                         computeUnit->cu_id, simdId,
                         wfSlotId, wfDynId, physSgprIdx,
-                        ((uint32_t*)&host_disp_pkt_addr)[1]);
+                        bits(host_disp_pkt_addr, 63, 32));
 
                 ++regInitIdx;
                 break;
@@ -281,23 +282,23 @@
                 physSgprIdx =
                     computeUnit->registerManager->mapSgpr(this, regInitIdx);
                 computeUnit->srf[simdId]->write(physSgprIdx,
-                        ((uint32_t*)&task->hostAMDQueueAddr)[0]);
+                        bits(task->hostAMDQueueAddr, 31, 0));
                 ++regInitIdx;
                 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                         "Setting QueuePtr: s[%d] = %x\n",
                         computeUnit->cu_id, simdId,
                         wfSlotId, wfDynId, physSgprIdx,
-                       ((uint32_t*)&task->hostAMDQueueAddr)[0]);
+                        bits(task->hostAMDQueueAddr, 31, 0));
 
                 physSgprIdx =
                     computeUnit->registerManager->mapSgpr(this, regInitIdx);
                 computeUnit->srf[simdId]->write(physSgprIdx,
-                        ((uint32_t*)&task->hostAMDQueueAddr)[1]);
+                        bits(task->hostAMDQueueAddr, 63, 32));
                 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                         "Setting QueuePtr: s[%d] = %x\n",
                         computeUnit->cu_id, simdId,
                         wfSlotId, wfDynId, physSgprIdx,
-                       ((uint32_t*)&task->hostAMDQueueAddr)[1]);
+                        bits(task->hostAMDQueueAddr, 63, 32));
 
                 ++regInitIdx;
                 break;
@@ -305,23 +306,23 @@
                 physSgprIdx =
                     computeUnit->registerManager->mapSgpr(this, regInitIdx);
                 computeUnit->srf[simdId]->write(physSgprIdx,
-                        ((uint32_t*)&kernarg_addr)[0]);
+                        bits(kernarg_addr, 31, 0));
                 ++regInitIdx;
                 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                         "Setting KernargSegPtr: s[%d] = %x\n",
                         computeUnit->cu_id, simdId,
                         wfSlotId, wfDynId, physSgprIdx,
-                       ((uint32_t*)kernarg_addr)[0]);
+                        bits(kernarg_addr, 31, 0));
 
                 physSgprIdx =
                     computeUnit->registerManager->mapSgpr(this, regInitIdx);
                 computeUnit->srf[simdId]->write(physSgprIdx,
-                        ((uint32_t*)&kernarg_addr)[1]);
+                        bits(kernarg_addr, 63, 32));
                 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                         "Setting KernargSegPtr: s[%d] = %x\n",
                         computeUnit->cu_id, simdId,
                         wfSlotId, wfDynId, physSgprIdx,
-                       ((uint32_t*)kernarg_addr)[1]);
+                        bits(kernarg_addr, 63, 32));
 
                 ++regInitIdx;
                 break;
diff --git a/src/kern/linux/helpers.cc b/src/kern/linux/helpers.cc
index 9fb2487..5dd4599 100644
--- a/src/kern/linux/helpers.cc
+++ b/src/kern/linux/helpers.cc
@@ -43,14 +43,14 @@
 #include "sim/byteswap.hh"
 #include "sim/system.hh"
 
-struct DmesgEntry {
+struct M5_ATTR_PACKED DmesgEntry {
     uint64_t ts_nsec;
     uint16_t len;
     uint16_t text_len;
     uint16_t dict_len;
     uint8_t facility;
     uint8_t flags;
-} M5_ATTR_PACKED;
+};
 
 static int
 dumpDmesgEntry(const uint8_t *base, const uint8_t *end, const ByteOrder bo,
diff --git a/src/kern/linux/linux.cc b/src/kern/linux/linux.cc
index c27d053..e5b7144 100644
--- a/src/kern/linux/linux.cc
+++ b/src/kern/linux/linux.cc
@@ -73,7 +73,7 @@
     if (matched) {
         FILE *f = tmpfile();
         int fd = fileno(f);
-        size_t ret M5_VAR_USED = fwrite(data.c_str(), 1, data.size(), f);
+        M5_VAR_USED size_t ret = fwrite(data.c_str(), 1, data.size(), f);
         assert(ret == data.size());
         rewind(f);
         return fd;
diff --git a/src/kern/system_events.cc b/src/kern/system_events.cc
index d97b766..a8dfc28 100644
--- a/src/kern/system_events.cc
+++ b/src/kern/system_events.cc
@@ -35,7 +35,7 @@
 void
 SkipFuncBase::process(ThreadContext *tc)
 {
-    TheISA::PCState oldPC M5_VAR_USED = tc->pcState();
+    M5_VAR_USED TheISA::PCState oldPC = tc->pcState();
 
     returnFromFuncIn(tc);
 
diff --git a/src/learning_gem5/part2/simple_cache.cc b/src/learning_gem5/part2/simple_cache.cc
index 3a3cfe6..d91eb3c 100644
--- a/src/learning_gem5/part2/simple_cache.cc
+++ b/src/learning_gem5/part2/simple_cache.cc
@@ -230,7 +230,7 @@
         DPRINTF(SimpleCache, "Copying data from new packet to old\n");
         // We had to upgrade a previous packet. We can functionally deal with
         // the cache access now. It better be a hit.
-        bool hit M5_VAR_USED = accessFunctional(originalPacket);
+        M5_VAR_USED bool hit = accessFunctional(originalPacket);
         panic_if(!hit, "Should always hit after inserting");
         originalPacket->makeResponse();
         delete pkt; // We may need to delay this, I'm not sure.
diff --git a/src/mem/abstract_mem.cc b/src/mem/abstract_mem.cc
index 2de77e9..10aea37 100644
--- a/src/mem/abstract_mem.cc
+++ b/src/mem/abstract_mem.cc
@@ -45,7 +45,6 @@
 #include "arch/locked_mem.hh"
 #include "base/loader/memory_image.hh"
 #include "base/loader/object_file.hh"
-#include "cpu/base.hh"
 #include "cpu/thread_context.hh"
 #include "debug/LLSC.hh"
 #include "debug/MemoryAccess.hh"
@@ -347,17 +346,15 @@
 tracePacket(System *sys, const char *label, PacketPtr pkt)
 {
     int size = pkt->getSize();
-#if THE_ISA != NULL_ISA
     if (size == 1 || size == 2 || size == 4 || size == 8) {
         ByteOrder byte_order = sys->getGuestByteOrder();
-        DPRINTF(MemoryAccess,"%s from %s of size %i on address %#x data "
+        DPRINTF(MemoryAccess, "%s from %s of size %i on address %#x data "
                 "%#x %c\n", label, sys->getRequestorName(pkt->req->
                 requestorId()), size, pkt->getAddr(),
                 size, pkt->getAddr(), pkt->getUintX(byte_order),
                 pkt->req->isUncacheable() ? 'U' : 'C');
         return;
     }
-#endif
     DPRINTF(MemoryAccess, "%s from %s of size %i on address %#x %c\n",
             label, sys->getRequestorName(pkt->req->requestorId()),
             size, pkt->getAddr(), pkt->req->isUncacheable() ? 'U' : 'C');
diff --git a/src/mem/cache/base.cc b/src/mem/cache/base.cc
index c420714..c52d2c6 100644
--- a/src/mem/cache/base.cc
+++ b/src/mem/cache/base.cc
@@ -855,7 +855,7 @@
     // the bigger block
 
     // Get previous compressed size
-    const std::size_t M5_VAR_USED prev_size = compression_blk->getSizeBits();
+    M5_VAR_USED const std::size_t prev_size = compression_blk->getSizeBits();
 
     // Check if new data is co-allocatable
     const bool is_co_allocatable = superblock->isCompressed(compression_blk) &&
@@ -2320,7 +2320,7 @@
     if (cache->system->bypassCaches()) {
         // Just forward the packet if caches are disabled.
         // @todo This should really enqueue the packet rather
-        bool M5_VAR_USED success = cache->memSidePort.sendTimingReq(pkt);
+        M5_VAR_USED bool success = cache->memSidePort.sendTimingReq(pkt);
         assert(success);
         return true;
     } else if (tryTiming(pkt)) {
diff --git a/src/mem/cache/cache.cc b/src/mem/cache/cache.cc
index b4f4238..4ecda83 100644
--- a/src/mem/cache/cache.cc
+++ b/src/mem/cache/cache.cc
@@ -447,7 +447,7 @@
         // this express snoop travels towards the memory, and at
         // every crossbar it is snooped upwards thus reaching
         // every cache in the system
-        bool M5_VAR_USED success = memSidePort.sendTimingReq(snoop_pkt);
+        M5_VAR_USED bool success = memSidePort.sendTimingReq(snoop_pkt);
         // express snoops always succeed
         assert(success);
 
@@ -992,7 +992,7 @@
     // responds in atomic mode, so remember a few things about the
     // original packet up front
     bool invalidate = pkt->isInvalidate();
-    bool M5_VAR_USED needs_writable = pkt->needsWritable();
+    M5_VAR_USED bool needs_writable = pkt->needsWritable();
 
     // at the moment we could get an uncacheable write which does not
     // have the invalidate flag, and we need a suitable way of dealing
@@ -1391,7 +1391,7 @@
         // prefetchSquash first may result in the MSHR being
         // prematurely deallocated.
         if (snoop_pkt.cacheResponding()) {
-            auto M5_VAR_USED r = outstandingSnoop.insert(snoop_pkt.req);
+            M5_VAR_USED auto r = outstandingSnoop.insert(snoop_pkt.req);
             assert(r.second);
 
             // if we are getting a snoop response with no sharers it
diff --git a/src/mem/cache/prefetch/base.cc b/src/mem/cache/prefetch/base.cc
index a35be33..2ff466b 100644
--- a/src/mem/cache/prefetch/base.cc
+++ b/src/mem/cache/prefetch/base.cc
@@ -48,7 +48,6 @@
 #include <cassert>
 
 #include "base/intmath.hh"
-#include "cpu/base.hh"
 #include "mem/cache/base.hh"
 #include "params/BasePrefetcher.hh"
 #include "sim/system.hh"
diff --git a/src/mem/cache/tags/fa_lru.cc b/src/mem/cache/tags/fa_lru.cc
index 9574ec9..962c9fd 100644
--- a/src/mem/cache/tags/fa_lru.cc
+++ b/src/mem/cache/tags/fa_lru.cc
@@ -117,7 +117,7 @@
 FALRU::invalidate(CacheBlk *blk)
 {
     // Erase block entry reference in the hash table
-    auto num_erased M5_VAR_USED =
+    M5_VAR_USED auto num_erased =
         tagHash.erase(std::make_pair(blk->tag, blk->isSecure()));
 
     // Sanity check; only one block reference should be erased
diff --git a/src/mem/coherent_xbar.cc b/src/mem/coherent_xbar.cc
index 037bd32..14392b5 100644
--- a/src/mem/coherent_xbar.cc
+++ b/src/mem/coherent_xbar.cc
@@ -638,7 +638,7 @@
                             *memSidePorts[dest_port_id]);
         }
 
-        bool success M5_VAR_USED =
+        M5_VAR_USED bool success =
             memSidePorts[dest_port_id]->sendTimingSnoopResp(pkt);
         pktCount[cpu_side_port_id][dest_port_id]++;
         pktSize[cpu_side_port_id][dest_port_id] += pkt_size;
@@ -858,7 +858,7 @@
         // if this is the destination of the operation, the xbar
         // sends the responce to the cache clean operation only
         // after having encountered the cache clean request
-        auto M5_VAR_USED ret = outstandingCMO.emplace(pkt->id, nullptr);
+        M5_VAR_USED auto ret = outstandingCMO.emplace(pkt->id, nullptr);
         // in atomic mode we know that the WriteClean packet should
         // precede the clean request
         assert(ret.second);
diff --git a/src/mem/dramsim2_wrapper.cc b/src/mem/dramsim2_wrapper.cc
index 413b390..f8cb4a4 100644
--- a/src/mem/dramsim2_wrapper.cc
+++ b/src/mem/dramsim2_wrapper.cc
@@ -169,7 +169,7 @@
 void
 DRAMSim2Wrapper::enqueue(bool is_write, uint64_t addr)
 {
-    bool success M5_VAR_USED = dramsim->addTransaction(is_write, addr);
+    M5_VAR_USED bool success = dramsim->addTransaction(is_write, addr);
     assert(success);
 }
 
diff --git a/src/mem/dramsim3_wrapper.cc b/src/mem/dramsim3_wrapper.cc
index 07754bc..b37a93c 100644
--- a/src/mem/dramsim3_wrapper.cc
+++ b/src/mem/dramsim3_wrapper.cc
@@ -123,7 +123,7 @@
 void
 DRAMsim3Wrapper::enqueue(uint64_t addr, bool is_write)
 {
-    bool success M5_VAR_USED = dramsim->AddTransaction(addr, is_write);
+    M5_VAR_USED bool success = dramsim->AddTransaction(addr, is_write);
     assert(success);
 }
 
diff --git a/src/mem/external_slave.cc b/src/mem/external_slave.cc
index f130498..a40e559 100644
--- a/src/mem/external_slave.cc
+++ b/src/mem/external_slave.cc
@@ -97,7 +97,7 @@
 StubSlavePort::recvAtomic(PacketPtr packet)
 {
     if (DTRACE(ExternalPort)) {
-        unsigned int M5_VAR_USED size = packet->getSize();
+        M5_VAR_USED unsigned int size = packet->getSize();
 
         DPRINTF(ExternalPort, "StubSlavePort: recvAtomic a: 0x%x size: %d"
             " data: ...\n", packet->getAddr(), size);
diff --git a/src/mem/mem_interface.hh b/src/mem/mem_interface.hh
index f150f77..9f5fbc4 100644
--- a/src/mem/mem_interface.hh
+++ b/src/mem/mem_interface.hh
@@ -140,7 +140,7 @@
     /**
      * General timing requirements
      */
-    const Tick M5_CLASS_VAR_USED tCK;
+    M5_CLASS_VAR_USED const Tick tCK;
     const Tick tCS;
     const Tick tBURST;
     const Tick tRTW;
diff --git a/src/mem/mem_object.hh b/src/mem/mem_object.hh
index 7cce0c9..5220836 100644
--- a/src/mem/mem_object.hh
+++ b/src/mem/mem_object.hh
@@ -55,8 +55,8 @@
 class MemObject : public ClockedObject
 {
   public:
-    M5_DEPRECATED_MSG(
-            "MemObject is deprecated. Use ClockedObject or SimObject instead")
+    [[deprecated(
+        "MemObject is deprecated. Use ClockedObject or SimObject instead")]]
         MemObject(const MemObjectParams *params) : ClockedObject(params)
     {}
 };
diff --git a/src/mem/page_table.cc b/src/mem/page_table.cc
index 601b9c5..5318f35 100644
--- a/src/mem/page_table.cc
+++ b/src/mem/page_table.cc
@@ -78,7 +78,7 @@
             new_vaddr, size);
 
     while (size > 0) {
-        auto new_it M5_VAR_USED = pTable.find(new_vaddr);
+        M5_VAR_USED auto new_it = pTable.find(new_vaddr);
         auto old_it = pTable.find(vaddr);
         assert(old_it != pTable.end() && new_it == pTable.end());
 
diff --git a/src/mem/port.hh b/src/mem/port.hh
index c933af62..357a10e 100644
--- a/src/mem/port.hh
+++ b/src/mem/port.hh
@@ -245,7 +245,7 @@
     }
 };
 
-class M5_DEPRECATED MasterPort : public RequestPort
+class [[deprecated]] MasterPort : public RequestPort
 {
   public:
     MasterPort(const std::string& name, SimObject* _owner,
@@ -449,7 +449,7 @@
     }
 };
 
-class M5_DEPRECATED SlavePort : public ResponsePort
+class [[deprecated]] SlavePort : public ResponsePort
 {
   public:
     SlavePort(const std::string& name, SimObject* _owner,
diff --git a/src/mem/request.hh b/src/mem/request.hh
index 43f54e6..73c823b 100644
--- a/src/mem/request.hh
+++ b/src/mem/request.hh
@@ -432,6 +432,7 @@
     {
         _flags.set(flags);
         privateFlags.set(VALID_PADDR|VALID_SIZE);
+        _byteEnable = std::vector<bool>(size, true);
     }
 
     Request(Addr vaddr, unsigned size, Flags flags,
@@ -440,6 +441,7 @@
     {
         setVirt(vaddr, size, flags, id, pc, std::move(atomic_op));
         setContext(cid);
+        _byteEnable = std::vector<bool>(size, true);
     }
 
     Request(const Request& other)
@@ -541,14 +543,12 @@
         req1->_size = split_addr - _vaddr;
         req2->_vaddr = split_addr;
         req2->_size = _size - req1->_size;
-        if (!_byteEnable.empty()) {
-            req1->_byteEnable = std::vector<bool>(
-                _byteEnable.begin(),
-                _byteEnable.begin() + req1->_size);
-            req2->_byteEnable = std::vector<bool>(
-                _byteEnable.begin() + req1->_size,
-                _byteEnable.end());
-        }
+        req1->_byteEnable = std::vector<bool>(
+            _byteEnable.begin(),
+            _byteEnable.begin() + req1->_size);
+        req2->_byteEnable = std::vector<bool>(
+            _byteEnable.begin() + req1->_size,
+            _byteEnable.end());
     }
 
     /**
@@ -624,7 +624,7 @@
     void
     setByteEnable(const std::vector<bool>& be)
     {
-        assert(be.empty() || be.size() == _size);
+        assert(be.size() == _size);
         _byteEnable = be;
     }
 
diff --git a/src/mem/ruby/network/garnet/GarnetNetwork.cc b/src/mem/ruby/network/garnet/GarnetNetwork.cc
index 8334107..4e3ef1d 100644
--- a/src/mem/ruby/network/garnet/GarnetNetwork.cc
+++ b/src/mem/ruby/network/garnet/GarnetNetwork.cc
@@ -130,7 +130,7 @@
         for (vector<Router*>::const_iterator i= m_routers.begin();
              i != m_routers.end(); ++i) {
             Router* router = safe_cast<Router*>(*i);
-            int router_id M5_VAR_USED =
+            M5_VAR_USED int router_id =
                 fault_model->declare_router(router->get_num_inports(),
                                             router->get_num_outports(),
                                             router->get_vc_per_vnet(),
diff --git a/src/mem/ruby/network/garnet/OutputUnit.hh b/src/mem/ruby/network/garnet/OutputUnit.hh
index 3cb924d..1245269 100644
--- a/src/mem/ruby/network/garnet/OutputUnit.hh
+++ b/src/mem/ruby/network/garnet/OutputUnit.hh
@@ -99,7 +99,7 @@
 
   private:
     Router *m_router;
-    int M5_CLASS_VAR_USED m_id;
+    M5_CLASS_VAR_USED int m_id;
     PortDirection m_direction;
     int m_vc_per_vnet;
     NetworkLink *m_out_link;
diff --git a/src/mem/ruby/network/garnet/RoutingUnit.cc b/src/mem/ruby/network/garnet/RoutingUnit.cc
index 835f052..1a75f65 100644
--- a/src/mem/ruby/network/garnet/RoutingUnit.cc
+++ b/src/mem/ruby/network/garnet/RoutingUnit.cc
@@ -201,7 +201,7 @@
 {
     PortDirection outport_dirn = "Unknown";
 
-    int M5_VAR_USED num_rows = m_router->get_net_ptr()->getNumRows();
+    M5_VAR_USED int num_rows = m_router->get_net_ptr()->getNumRows();
     int num_cols = m_router->get_net_ptr()->getNumCols();
     assert(num_rows > 0 && num_cols > 0);
 
diff --git a/src/mem/ruby/structures/CacheMemory.cc b/src/mem/ruby/structures/CacheMemory.cc
index b3f2c61..dc46002 100644
--- a/src/mem/ruby/structures/CacheMemory.cc
+++ b/src/mem/ruby/structures/CacheMemory.cc
@@ -414,7 +414,7 @@
 CacheMemory::recordCacheContents(int cntrl, CacheRecorder* tr) const
 {
     uint64_t warmedUpBlocks = 0;
-    uint64_t totalBlocks M5_VAR_USED = (uint64_t)m_cache_num_sets *
+    M5_VAR_USED uint64_t totalBlocks = (uint64_t)m_cache_num_sets *
                                        (uint64_t)m_cache_assoc;
 
     for (int i = 0; i < m_cache_num_sets; i++) {
diff --git a/src/mem/ruby/structures/PerfectCacheMemory.hh b/src/mem/ruby/structures/PerfectCacheMemory.hh
index 9898995..a1c8a82 100644
--- a/src/mem/ruby/structures/PerfectCacheMemory.hh
+++ b/src/mem/ruby/structures/PerfectCacheMemory.hh
@@ -150,7 +150,7 @@
 inline void
 PerfectCacheMemory<ENTRY>::deallocate(Addr address)
 {
-    auto num_erased M5_VAR_USED = m_map.erase(makeLineAddress(address));
+    M5_VAR_USED auto num_erased = m_map.erase(makeLineAddress(address));
     assert(num_erased == 1);
 }
 
diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc
index f5d4f02..310ba72 100644
--- a/src/mem/ruby/system/GPUCoalescer.cc
+++ b/src/mem/ruby/system/GPUCoalescer.cc
@@ -77,6 +77,26 @@
     return !instMap.empty();
 }
 
+void
+UncoalescedTable::initPacketsRemaining(InstSeqNum seqNum, int count)
+{
+    if (!instPktsRemaining.count(seqNum)) {
+        instPktsRemaining[seqNum] = count;
+    }
+}
+
+int
+UncoalescedTable::getPacketsRemaining(InstSeqNum seqNum)
+{
+    return instPktsRemaining[seqNum];
+}
+
+void
+UncoalescedTable::setPacketsRemaining(InstSeqNum seqNum, int count)
+{
+    instPktsRemaining[seqNum] = count;
+}
+
 PerInstPackets*
 UncoalescedTable::getInstPackets(int offset)
 {
@@ -94,9 +114,20 @@
 UncoalescedTable::updateResources()
 {
     for (auto iter = instMap.begin(); iter != instMap.end(); ) {
-        if (iter->second.empty()) {
-            DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", iter->first);
+        InstSeqNum seq_num = iter->first;
+        DPRINTF(GPUCoalescer, "%s checking remaining pkts for %d\n",
+                coalescer->name().c_str(), seq_num);
+        assert(instPktsRemaining.count(seq_num));
+
+        if (instPktsRemaining[seq_num] == 0) {
+            assert(iter->second.empty());
+
+            // Remove from both maps
             instMap.erase(iter++);
+            instPktsRemaining.erase(seq_num);
+
+            // Release the token
+            DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", seq_num);
             coalescer->getGMTokenPort().sendTokens(1);
         } else {
             ++iter;
@@ -460,7 +491,7 @@
 {
     PacketPtr pkt = crequest->getFirstPkt();
     Addr request_address = pkt->getAddr();
-    Addr request_line_address M5_VAR_USED = makeLineAddress(request_address);
+    M5_VAR_USED Addr request_line_address = makeLineAddress(request_address);
 
     RubyRequestType type = crequest->getRubyType();
 
@@ -555,16 +586,23 @@
         // otherwise, this must be either read or write command
         assert(pkt->isRead() || pkt->isWrite());
 
+        InstSeqNum seq_num = pkt->req->getReqInstSeqNum();
+        int num_packets = getDynInst(pkt)->exec_mask.count();
+
         // the pkt is temporarily stored in the uncoalesced table until
         // it's picked for coalescing process later in this cycle or in a
-        // future cycle
+        // future cycle. Packets remaining is set to the number of expected
+        // requests from the instruction based on its exec_mask.
         uncoalescedTable.insertPacket(pkt);
+        uncoalescedTable.initPacketsRemaining(seq_num, num_packets);
         DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n",
                 pkt->getAddr());
 
         // we schedule an issue event here to process the uncoalesced table
         // and try to issue Ruby request to cache system
         if (!issueEvent.scheduled()) {
+            DPRINTF(GPUCoalescer, "Scheduled issueEvent for seqNum %d\n",
+                    seq_num);
             schedule(issueEvent, curTick());
         }
     }
@@ -595,6 +633,18 @@
         << "]";
 }
 
+GPUDynInstPtr
+GPUCoalescer::getDynInst(PacketPtr pkt) const
+{
+    RubyPort::SenderState* ss =
+            safe_cast<RubyPort::SenderState*>(pkt->senderState);
+
+    ComputeUnit::DataPort::SenderState* cu_state =
+        safe_cast<ComputeUnit::DataPort::SenderState*>
+            (ss->predecessor);
+
+    return cu_state->_gpuDynInst;
+}
 
 bool
 GPUCoalescer::coalescePacket(PacketPtr pkt)
@@ -674,10 +724,7 @@
                 // CU will use that instruction to decrement wait counters
                 // in the issuing wavefront.
                 // For Ruby tester, gpuDynInst == nullptr
-                ComputeUnit::DataPort::SenderState* cu_state =
-                    safe_cast<ComputeUnit::DataPort::SenderState*>
-                        (ss->predecessor);
-                gpuDynInst = cu_state->_gpuDynInst;
+                gpuDynInst = getDynInst(pkt);
             }
 
             PendingWriteInst& inst = pendingWriteInsts[seqNum];
@@ -698,21 +745,45 @@
     // Iterate over the maximum number of instructions we can coalesce
     // per cycle (coalescingWindow).
     for (int instIdx = 0; instIdx < coalescingWindow; ++instIdx) {
-        PerInstPackets *pktList =
+        PerInstPackets *pkt_list =
             uncoalescedTable.getInstPackets(instIdx);
 
         // getInstPackets will return nullptr if no instruction
         // exists at the current offset.
-        if (!pktList) {
+        if (!pkt_list) {
             break;
+        } else if (pkt_list->empty()) {
+            // Found something, but it has not been cleaned up by update
+            // resources yet. See if there is anything else to coalesce.
+            // Assume we can't check anymore if the coalescing window is 1.
+            continue;
         } else {
+            // All packets in the list have the same seqNum, use first.
+            InstSeqNum seq_num = pkt_list->front()->req->getReqInstSeqNum();
+
+            // The difference in list size before and after tells us the
+            // number of packets which were coalesced.
+            size_t pkt_list_size = pkt_list->size();
+
             // Since we have a pointer to the list of packets in the inst,
             // erase them from the list if coalescing is successful and
             // leave them in the list otherwise. This aggressively attempts
             // to coalesce as many packets as possible from the current inst.
-            pktList->remove_if(
+            pkt_list->remove_if(
                 [&](PacketPtr pkt) { return coalescePacket(pkt); }
             );
+
+            assert(pkt_list_size >= pkt_list->size());
+            size_t pkt_list_diff = pkt_list_size - pkt_list->size();
+
+            int num_remaining = uncoalescedTable.getPacketsRemaining(seq_num);
+            num_remaining -= pkt_list_diff;
+            assert(num_remaining >= 0);
+
+            uncoalescedTable.setPacketsRemaining(seq_num, num_remaining);
+            DPRINTF(GPUCoalescer,
+                    "Coalesced %d pkts for seqNum %d, %d remaining\n",
+                    pkt_list_diff, seq_num, num_remaining);
         }
     }
 
diff --git a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh
index 3b1b7af..2684d51 100644
--- a/src/mem/ruby/system/GPUCoalescer.hh
+++ b/src/mem/ruby/system/GPUCoalescer.hh
@@ -70,12 +70,18 @@
     bool packetAvailable();
     void printRequestTable(std::stringstream& ss);
 
+    // Modify packets remaining map. Init sets the value only if the seqNum
+    // has not been seen before. get/set act as a regular getter and setter.
+    void initPacketsRemaining(InstSeqNum seqNum, int count);
+    int getPacketsRemaining(InstSeqNum seqNum);
+    void setPacketsRemaining(InstSeqNum seqNum, int count);
+
     // Returns a pointer to the list of packets corresponding to an
     // instruction in the instruction map or nullptr if there are no
     // instructions at the offset.
     PerInstPackets* getInstPackets(int offset);
     void updateResources();
-    bool areRequestsDone(const uint64_t instSeqNum);
+    bool areRequestsDone(const InstSeqNum instSeqNum);
 
     // Check if a packet hasn't been removed from instMap in too long.
     // Panics if a deadlock is detected and returns nothing otherwise.
@@ -88,7 +94,9 @@
     // which need responses. This data structure assumes the sequence number
     // is monotonically increasing (which is true for CU class) in order to
     // issue packets in age order.
-    std::map<uint64_t, PerInstPackets> instMap;
+    std::map<InstSeqNum, PerInstPackets> instMap;
+
+    std::map<InstSeqNum, int> instPktsRemaining;
 };
 
 class CoalescedRequest
@@ -389,6 +397,8 @@
 
     virtual RubyRequestType getRequestType(PacketPtr pkt);
 
+    GPUDynInstPtr getDynInst(PacketPtr pkt) const;
+
     // Attempt to remove a packet from the uncoalescedTable and coalesce
     // with a previous request from the same instruction. If there is no
     // previous instruction and the max number of outstanding requests has
diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc
index fc011cc..116f04f 100644
--- a/src/mem/ruby/system/RubyPort.cc
+++ b/src/mem/ruby/system/RubyPort.cc
@@ -204,7 +204,7 @@
             if (it->contains(pkt->getAddr())) {
                 // generally it is not safe to assume success here as
                 // the port could be blocked
-                bool M5_VAR_USED success =
+                M5_VAR_USED bool success =
                     ruby_port->request_ports[i]->sendTimingReq(pkt);
                 assert(success);
                 return true;
@@ -371,7 +371,7 @@
 {
     DPRINTF(RubyPort, "Functional access for address: %#x\n", pkt->getAddr());
 
-    RubyPort *rp M5_VAR_USED = static_cast<RubyPort *>(&owner);
+    M5_VAR_USED RubyPort *rp = static_cast<RubyPort *>(&owner);
     RubySystem *rs = rp->m_ruby_system;
 
     // Check for pio requests and directly send them to the dedicated
@@ -597,7 +597,7 @@
         ranges.splice(ranges.begin(),
                 ruby_port->request_ports[i]->getAddrRanges());
     }
-    for (const auto M5_VAR_USED &r : ranges)
+    for (M5_VAR_USED const auto &r : ranges)
         DPRINTF(RubyPort, "%s\n", r.to_string());
     return ranges;
 }
diff --git a/src/mem/ruby/system/RubySystem.cc b/src/mem/ruby/system/RubySystem.cc
index ac5515f..565b426 100644
--- a/src/mem/ruby/system/RubySystem.cc
+++ b/src/mem/ruby/system/RubySystem.cc
@@ -602,7 +602,7 @@
 
     DPRINTF(RubySystem, "Functional Write request for %#x\n", addr);
 
-    uint32_t M5_VAR_USED num_functional_writes = 0;
+    M5_VAR_USED uint32_t num_functional_writes = 0;
 
     // Only send functional requests within the same network.
     assert(requestorToNetwork.count(pkt->requestorId()));
diff --git a/src/mem/slicc/ast/PeekStatementAST.py b/src/mem/slicc/ast/PeekStatementAST.py
index 6cadb31..20e5140 100644
--- a/src/mem/slicc/ast/PeekStatementAST.py
+++ b/src/mem/slicc/ast/PeekStatementAST.py
@@ -61,7 +61,7 @@
         code('''
 {
     // Declare message
-    const $mtid* in_msg_ptr M5_VAR_USED;
+    M5_VAR_USED const $mtid* in_msg_ptr;
     in_msg_ptr = dynamic_cast<const $mtid *>(($qcode).${{self.method}}());
     if (in_msg_ptr == NULL) {
         // If the cast fails, this is the wrong inport (wrong message type).
diff --git a/src/mem/slicc/symbols/StateMachine.py b/src/mem/slicc/symbols/StateMachine.py
index 7f92d87..1399d00 100644
--- a/src/mem/slicc/symbols/StateMachine.py
+++ b/src/mem/slicc/symbols/StateMachine.py
@@ -605,7 +605,7 @@
 $c_ident::initNetQueues()
 {
     MachineType machine_type = string_to_MachineType("${{self.ident}}");
-    int base M5_VAR_USED = MachineType_base_number(machine_type);
+    M5_VAR_USED int base = MachineType_base_number(machine_type);
 
 ''')
         code.indent()
diff --git a/src/python/SConscript b/src/python/SConscript
index cf52ee1..50f467e 100644
--- a/src/python/SConscript
+++ b/src/python/SConscript
@@ -69,4 +69,5 @@
 Source('pybind11/core.cc', add_tags='python')
 Source('pybind11/debug.cc', add_tags='python')
 Source('pybind11/event.cc', add_tags='python')
+Source('pybind11/object_file.cc', add_tags='python')
 Source('pybind11/stats.cc', add_tags='python')
diff --git a/src/python/pybind11/object_file.cc b/src/python/pybind11/object_file.cc
new file mode 100644
index 0000000..2ad5598
--- /dev/null
+++ b/src/python/pybind11/object_file.cc
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2020 Google, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "base/loader/object_file.hh"
+#include "python/pybind11/pybind.hh"
+#include "sim/init.hh"
+
+namespace py = pybind11;
+
+namespace
+{
+
+void
+objectfile_pybind(py::module &m_internal)
+{
+    py::module m = m_internal.def_submodule("object_file");
+
+    py::class_<Loader::ObjectFile>(m, "ObjectFile")
+        .def("get_arch", [](const Loader::ObjectFile &obj) {
+                return Loader::archToString(obj.getArch());
+                }, py::return_value_policy::reference)
+        .def("get_op_sys", [](const Loader::ObjectFile &obj) {
+                return Loader::opSysToString(obj.getOpSys());
+                }, py::return_value_policy::reference)
+        .def("entry_point", &Loader::ObjectFile::entryPoint)
+        .def("get_interpreter", &Loader::ObjectFile::getInterpreter);
+
+    m.def("create", [](const std::string &fname) {
+            return Loader::createObjectFile(fname); });
+}
+EmbeddedPyBind embed_("object_file", &objectfile_pybind);
+
+} // anonymous namespace
diff --git a/src/sim/SConscript b/src/sim/SConscript
index 0bdf921..6bda828 100644
--- a/src/sim/SConscript
+++ b/src/sim/SConscript
@@ -63,6 +63,7 @@
 Source('root.cc')
 Source('serialize.cc')
 Source('drain.cc')
+Source('se_workload.cc')
 Source('sim_events.cc')
 Source('sim_object.cc')
 Source('sub_system.cc')
diff --git a/src/sim/System.py b/src/sim/System.py
index caf32fb..a2f6056 100644
--- a/src/sim/System.py
+++ b/src/sim/System.py
@@ -111,7 +111,7 @@
     work_cpus_ckpt_count = Param.Counter(0,
         "create checkpoint when active cpu count value is reached")
 
-    workload = Param.Workload(NULL, "Operating system kernel")
+    workload = Param.Workload(NULL, "Workload to run on this system")
     init_param = Param.UInt64(0, "numerical value to pass into simulator")
     readfile = Param.String("", "file to read startup script from")
     symbolfile = Param.String("", "file to get the symbols from")
diff --git a/src/sim/Workload.py b/src/sim/Workload.py
index 1e35abe..f1974bb 100644
--- a/src/sim/Workload.py
+++ b/src/sim/Workload.py
@@ -50,3 +50,7 @@
     load_addr_offset = Param.UInt64(0, "Address to offset the kernel with")
 
     command_line = Param.String("a", "boot flags to pass to the kernel")
+
+class SEWorkload(Workload):
+    type = 'SEWorkload'
+    cxx_header = "sim/se_workload.hh"
diff --git a/src/sim/eventq.cc b/src/sim/eventq.cc
index bc4864c..adce51e 100644
--- a/src/sim/eventq.cc
+++ b/src/sim/eventq.cc
@@ -32,6 +32,7 @@
 
 #include <cassert>
 #include <iostream>
+#include <mutex>
 #include <string>
 #include <unordered_map>
 #include <vector>
diff --git a/src/sim/eventq.hh b/src/sim/eventq.hh
index aa54722..45a5ab8 100644
--- a/src/sim/eventq.hh
+++ b/src/sim/eventq.hh
@@ -41,12 +41,12 @@
 #include <functional>
 #include <iosfwd>
 #include <memory>
-#include <mutex>
 #include <string>
 
 #include "base/debug.hh"
 #include "base/flags.hh"
 #include "base/types.hh"
+#include "base/uncontended_mutex.hh"
 #include "debug/Event.hh"
 #include "sim/serialize.hh"
 
@@ -622,7 +622,7 @@
     Tick _curTick;
 
     //! Mutex to protect async queue.
-    std::mutex async_queue_mutex;
+    UncontendedMutex async_queue_mutex;
 
     //! List of events added by other threads to this event queue.
     std::list<Event*> async_queue;
@@ -647,7 +647,7 @@
      * @see EventQueue::lock()
      * @see EventQueue::unlock()
      */
-    std::mutex service_mutex;
+    UncontendedMutex service_mutex;
 
     //! Insert / remove event from the queue. Should only be called
     //! by thread operating this queue.
diff --git a/src/sim/faults.cc b/src/sim/faults.cc
index d4d3c11..501b5d1 100644
--- a/src/sim/faults.cc
+++ b/src/sim/faults.cc
@@ -67,7 +67,7 @@
 void
 SESyscallFault::invoke(ThreadContext *tc, const StaticInstPtr &inst)
 {
-    tc->syscall();
+    tc->getSystemPtr()->workload->syscall(tc);
     // Move the PC forward since that doesn't happen automatically.
     TheISA::PCState pc = tc->pcState();
     inst->advancePC(pc);
diff --git a/src/sim/probe/probe.hh b/src/sim/probe/probe.hh
index ef53944..bc73eb0 100644
--- a/src/sim/probe/probe.hh
+++ b/src/sim/probe/probe.hh
@@ -151,7 +151,7 @@
 {
   private:
     /** Required for sensible debug messages.*/
-    const M5_CLASS_VAR_USED SimObject *object;
+    M5_CLASS_VAR_USED const SimObject *object;
     /** Vector for name look-up. */
     std::vector<ProbePoint *> points;
 
diff --git a/src/sim/pseudo_inst.cc b/src/sim/pseudo_inst.cc
index 6970120..aacca73 100644
--- a/src/sim/pseudo_inst.cc
+++ b/src/sim/pseudo_inst.cc
@@ -479,7 +479,7 @@
 m5Syscall(ThreadContext *tc)
 {
     DPRINTF(PseudoInst, "PseudoInst::m5Syscall()\n");
-    tc->syscall();
+    tc->getSystemPtr()->workload->syscall(tc);
 }
 
 void
diff --git a/src/arch/null/cpu_dummy.cc b/src/sim/se_workload.cc
similarity index 65%
copy from src/arch/null/cpu_dummy.cc
copy to src/sim/se_workload.cc
index df30b81..dccd7ca 100644
--- a/src/arch/null/cpu_dummy.cc
+++ b/src/sim/se_workload.cc
@@ -1,15 +1,5 @@
 /*
- * Copyright (c) 2013 ARM Limited
- * All rights reserved
- *
- * The license below extends only to copyright in the software and shall
- * not be construed as granting a license to any other intellectual
- * property including but not limited to intellectual property relating
- * to a hardware implementation of the functionality of the software
- * licensed hereunder.  You may use the software subject to the license
- * terms below provided that you ensure that this notice is replicated
- * unmodified and in its entirety in all distributions of the software,
- * modified or unmodified, in source code or in binary form.
+ * Copyright 2020 Google Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
@@ -35,8 +25,23 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/**
- * Provide the actual storage for maxThreadsPerCPU which is declared
- * extern and normally provided by src/cpu/base.cc
- */
-int maxThreadsPerCPU = 1;
+#include "sim/se_workload.hh"
+
+#include "cpu/thread_context.hh"
+#include "params/SEWorkload.hh"
+#include "sim/process.hh"
+
+SEWorkload::SEWorkload(const Params &p) : Workload(&p), _params(p)
+{}
+
+void
+SEWorkload::syscall(ThreadContext *tc)
+{
+    tc->getProcessPtr()->syscall(tc);
+}
+
+SEWorkload *
+SEWorkloadParams::create()
+{
+    return new SEWorkload(*this);
+}
diff --git a/src/sim/se_workload.hh b/src/sim/se_workload.hh
new file mode 100644
index 0000000..8deb03b
--- /dev/null
+++ b/src/sim/se_workload.hh
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2020 Google Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __SIM_SE_WORKLOAD_HH__
+#define __SIM_SE_WORKLOAD_HH__
+
+#include "params/SEWorkload.hh"
+#include "sim/workload.hh"
+
+class SEWorkload : public Workload
+{
+  public:
+    using Params = SEWorkloadParams;
+
+  protected:
+    const Params &_params;
+
+  public:
+    const Params &params() const { return _params; }
+
+    SEWorkload(const Params &p);
+
+    Addr
+    getEntry() const override
+    {
+        // This object represents the OS, not the individual processes running
+        // within it.
+        panic("No workload entry point for syscall emulation mode.");
+    }
+
+    Loader::Arch
+    getArch() const override
+    {
+        // ISA-specific subclasses should implement this method.
+        // This implementation is just to avoid having to implement those for
+        // now, and will be removed in the future.
+        panic("SEWorkload::getArch() not implemented.");
+    }
+
+    const Loader::SymbolTable &
+    symtab(ThreadContext *) override
+    {
+        // This object represents the OS, not the individual processes running
+        // within it.
+        panic("No workload symbol table for syscall emulation mode.");
+    }
+
+    bool
+    insertSymbol(const Loader::Symbol &symbol) override
+    {
+        // This object represents the OS, not the individual processes running
+        // within it.
+        panic("No workload symbol table for syscall emulation mode.");
+    }
+
+    void syscall(ThreadContext *tc) override;
+};
+
+#endif // __SIM_SE_WORKLOAD_HH__
diff --git a/src/sim/stat_control.cc b/src/sim/stat_control.cc
index 9464c0d..5b66786 100644
--- a/src/sim/stat_control.cc
+++ b/src/sim/stat_control.cc
@@ -53,7 +53,6 @@
 #include "base/hostinfo.hh"
 #include "base/statistics.hh"
 #include "base/time.hh"
-#include "cpu/base.hh"
 #include "sim/global_event.hh"
 
 using namespace std;
@@ -62,6 +61,7 @@
 Stats::Value simTicks;
 Stats::Value finalTick;
 Stats::Value simFreq;
+Stats::Value hostSeconds;
 
 namespace Stats {
 
@@ -94,36 +94,14 @@
 
 struct Global
 {
-    Stats::Formula hostInstRate;
-    Stats::Formula hostOpRate;
     Stats::Formula hostTickRate;
     Stats::Value hostMemory;
-    Stats::Value hostSeconds;
-
-    Stats::Value simInsts;
-    Stats::Value simOps;
 
     Global();
 };
 
 Global::Global()
 {
-    simInsts
-        .functor(BaseCPU::numSimulatedInsts)
-        .name("sim_insts")
-        .desc("Number of instructions simulated")
-        .precision(0)
-        .prereq(simInsts)
-        ;
-
-    simOps
-        .functor(BaseCPU::numSimulatedOps)
-        .name("sim_ops")
-        .desc("Number of ops (including micro ops) simulated")
-        .precision(0)
-        .prereq(simOps)
-        ;
-
     simSeconds
         .name("sim_seconds")
         .desc("Number of seconds simulated")
@@ -148,20 +126,6 @@
               "(restored from checkpoints and never reset)")
         ;
 
-    hostInstRate
-        .name("host_inst_rate")
-        .desc("Simulator instruction rate (inst/s)")
-        .precision(0)
-        .prereq(simInsts)
-        ;
-
-    hostOpRate
-        .name("host_op_rate")
-        .desc("Simulator op (including micro ops) rate (op/s)")
-        .precision(0)
-        .prereq(simOps)
-        ;
-
     hostMemory
         .functor(memUsage)
         .name("host_mem_usage")
@@ -183,8 +147,6 @@
         ;
 
     simSeconds = simTicks / simFreq;
-    hostInstRate = simInsts / hostSeconds;
-    hostOpRate = simOps / hostSeconds;
     hostTickRate = simTicks / hostSeconds;
 
     registerResetCallback([]() {
diff --git a/src/sim/stats.hh b/src/sim/stats.hh
index ed68af6..4e17f64 100644
--- a/src/sim/stats.hh
+++ b/src/sim/stats.hh
@@ -34,5 +34,6 @@
 extern Stats::Formula simSeconds;
 extern Stats::Value simTicks;
 extern Stats::Value simFreq;
+extern Stats::Value hostSeconds;
 
 #endif // __SIM_SIM_STATS_HH__
diff --git a/src/sim/syscall_emul.hh b/src/sim/syscall_emul.hh
index 05a29f9..9d1f6e2 100644
--- a/src/sim/syscall_emul.hh
+++ b/src/sim/syscall_emul.hh
@@ -1506,11 +1506,6 @@
 
     desc->returnInto(ctc, 0);
 
-#if THE_ISA == SPARC_ISA
-    tc->setIntReg(TheISA::SyscallPseudoReturnReg, 0);
-    ctc->setIntReg(TheISA::SyscallPseudoReturnReg, 1);
-#endif
-
     TheISA::PCState cpc = tc->pcState();
     if (!p->kvmInSE)
         cpc.advance();
diff --git a/src/sim/system.cc b/src/sim/system.cc
index cb412a8..9011a75 100644
--- a/src/sim/system.cc
+++ b/src/sim/system.cc
@@ -50,12 +50,15 @@
 #include "base/loader/symtab.hh"
 #include "base/str.hh"
 #include "base/trace.hh"
+#include "config/the_isa.hh"
 #include "config/use_kvm.hh"
 #if USE_KVM
 #include "cpu/kvm/base.hh"
 #include "cpu/kvm/vm.hh"
 #endif
+#if THE_ISA != NULL_ISA
 #include "cpu/base.hh"
+#endif
 #include "cpu/thread_context.hh"
 #include "debug/Loader.hh"
 #include "debug/Quiesce.hh"
@@ -180,7 +183,7 @@
 {
     auto &t = thread(id);
 #   if THE_ISA != NULL_ISA
-    BaseCPU M5_VAR_USED *cpu = t.context->getCpuPtr();
+    M5_VAR_USED BaseCPU *cpu = t.context->getCpuPtr();
     DPRINTFS(Quiesce, cpu, "quiesce()\n");
 #   endif
     t.quiesce();
@@ -247,7 +250,7 @@
         warn_once("Cache line size is neither 16, 32, 64 nor 128 bytes.\n");
 
     // Get the generic system requestor IDs
-    RequestorID tmp_id M5_VAR_USED;
+    M5_VAR_USED RequestorID tmp_id;
     tmp_id = getRequestorId(this, "writebacks");
     assert(tmp_id == Request::wbRequestorId);
     tmp_id = getRequestorId(this, "functional");
@@ -270,14 +273,6 @@
 }
 
 void
-System::init()
-{
-    // check that the system port is connected
-    if (!_systemPort.isConnected())
-        panic("System port on %s is not connected.\n", name());
-}
-
-void
 System::startup()
 {
     SimObject::startup();
diff --git a/src/sim/system.hh b/src/sim/system.hh
index 7d77c48..e5d4ec6 100644
--- a/src/sim/system.hh
+++ b/src/sim/system.hh
@@ -52,7 +52,6 @@
 #include "base/loader/symtab.hh"
 #include "base/statistics.hh"
 #include "config/the_isa.hh"
-#include "cpu/base.hh"
 #include "cpu/pc_event.hh"
 #include "enums/MemoryMode.hh"
 #include "mem/mem_requestor.hh"
@@ -222,11 +221,6 @@
         const_iterator end() const { return const_iterator(*this, size()); }
     };
 
-    /**
-     * After all objects have been created and all ports are
-     * connected, check that the system port is connected.
-     */
-    void init() override;
     void startup() override;
 
     /**
diff --git a/src/sim/workload.hh b/src/sim/workload.hh
index 435a24b..7c1b66d 100644
--- a/src/sim/workload.hh
+++ b/src/sim/workload.hh
@@ -69,6 +69,12 @@
     virtual const Loader::SymbolTable &symtab(ThreadContext *tc) = 0;
     virtual bool insertSymbol(const Loader::Symbol &symbol) = 0;
 
+    virtual void
+    syscall(ThreadContext *tc)
+    {
+        panic("syscall() not implemented.");
+    }
+
     /** @{ */
     /**
      * Add a function-based event to the given function, to be looked
diff --git a/src/systemc/core/list.hh b/src/systemc/core/list.hh
index b1c5f55..6ba2825 100644
--- a/src/systemc/core/list.hh
+++ b/src/systemc/core/list.hh
@@ -102,8 +102,13 @@
         prevListNode = t;
     }
 
-    T *getNext() { return dynamic_cast<T *>(nextListNode); }
-    bool empty() { return getNext() == nullptr; }
+    T *
+    getNext()
+    {
+        return empty() ? nullptr : static_cast<T *>(nextListNode);
+    }
+
+    bool empty() { return nextListNode == this; }
 };
 
 } // namespace sc_gem5
diff --git a/src/systemc/core/scheduler.cc b/src/systemc/core/scheduler.cc
index 179bd55..cc0be7c 100644
--- a/src/systemc/core/scheduler.cc
+++ b/src/systemc/core/scheduler.cc
@@ -71,8 +71,7 @@
         deltas.front()->deschedule();
 
     // Timed notifications.
-    for (auto &tsp: timeSlots) {
-        TimeSlot *&ts = tsp.second;
+    for (auto &ts: timeSlots) {
         while (!ts->events.empty())
             ts->events.front()->deschedule();
         deschedule(ts);
@@ -260,6 +259,7 @@
 {
     std::lock_guard<std::mutex> lock(asyncListMutex);
     asyncUpdateList.pushLast(c);
+    hasAsyncUpdate = true;
 }
 
 void
@@ -326,11 +326,12 @@
 Scheduler::runUpdate()
 {
     status(StatusUpdate);
-    {
+    if (hasAsyncUpdate) {
         std::lock_guard<std::mutex> lock(asyncListMutex);
         Channel *channel;
         while ((channel = asyncUpdateList.getNext()) != nullptr)
             updateList.pushLast(channel);
+        hasAsyncUpdate = false;
     }
 
     try {
diff --git a/src/systemc/core/scheduler.hh b/src/systemc/core/scheduler.hh
index c9ca161..13f35ed 100644
--- a/src/systemc/core/scheduler.hh
+++ b/src/systemc/core/scheduler.hh
@@ -28,7 +28,9 @@
 #ifndef __SYSTEMC_CORE_SCHEDULER_HH__
 #define __SYSTEMC_CORE_SCHEDULER_HH__
 
+#include <atomic>
 #include <functional>
+#include <list>
 #include <map>
 #include <mutex>
 #include <set>
@@ -151,13 +153,27 @@
     class TimeSlot : public ::Event
     {
       public:
-        TimeSlot() : ::Event(Default_Pri, AutoDelete) {}
-
+        TimeSlot(Scheduler* scheduler) : ::Event(Default_Pri, AutoDelete),
+                                         parent_scheduler(scheduler) {}
+        // Event::when() is only set once the event is scheduled on an event
+        // queue, but a TimeSlot is not scheduled until init is done. Keep
+        // the intended tick in 'targeted_when' so it is known before then.
+        Tick targeted_when;
+        Scheduler* parent_scheduler;
         ScEvents events;
-        void process();
+        void process() override;
+
+      protected:
+        void
+        releaseImpl() override
+        {
+            if (!scheduled())
+                parent_scheduler->releaseTimeSlot(this);
+        }
+
     };
 
-    typedef std::map<Tick, TimeSlot *> TimeSlots;
+    typedef std::list<TimeSlot *> TimeSlots;
 
     Scheduler();
     ~Scheduler();
@@ -250,12 +266,14 @@
         }
 
         // Timed notification/timeout.
-        TimeSlot *&ts = timeSlots[tick];
-        if (!ts) {
-            ts = new TimeSlot;
-            schedule(ts, tick);
+        auto it = timeSlots.begin();
+        while (it != timeSlots.end() && (*it)->targeted_when < tick)
+            it++;
+        if (it == timeSlots.end() || (*it)->targeted_when != tick) {
+            it = timeSlots.emplace(it, acquireTimeSlot(tick));
+            schedule(*it, tick);
         }
-        event->schedule(ts->events, tick);
+        event->schedule((*it)->events, tick);
     }
 
     // For descheduling delayed/timed notifications/timeouts.
@@ -270,10 +288,15 @@
         }
 
         // Timed notification/timeout.
-        auto tsit = timeSlots.find(event->when());
-        panic_if(tsit == timeSlots.end(),
+        auto tsit = timeSlots.begin();
+        while (tsit != timeSlots.end() &&
+               (*tsit)->targeted_when < event->when())
+            tsit++;
+
+        panic_if(tsit == timeSlots.end() ||
+                 (*tsit)->targeted_when != event->when(),
                 "Descheduling event at time with no events.");
-        TimeSlot *ts = tsit->second;
+        TimeSlot *ts = *tsit;
         ScEvents &events = ts->events;
         assert(on == &events);
         event->deschedule();
@@ -288,7 +311,7 @@
     void
     completeTimeSlot(TimeSlot *ts)
     {
-        assert(ts == timeSlots.begin()->second);
+        assert(ts == timeSlots.front());
         timeSlots.erase(timeSlots.begin());
         if (!runToTime && starved())
             scheduleStarvationEvent();
@@ -324,7 +347,7 @@
         if (pendingCurr())
             return 0;
         if (pendingFuture())
-            return timeSlots.begin()->first - getCurTick();
+            return timeSlots.front()->targeted_when - getCurTick();
         return MaxTick - getCurTick();
     }
 
@@ -374,6 +397,27 @@
     void registerTraceFile(TraceFile *tf) { traceFiles.insert(tf); }
     void unregisterTraceFile(TraceFile *tf) { traceFiles.erase(tf); }
 
+    TimeSlot*
+    acquireTimeSlot(Tick tick)
+    {
+        TimeSlot *ts = nullptr;
+        if (!freeTimeSlots.empty()) {
+            ts = freeTimeSlots.top();
+            freeTimeSlots.pop();
+        } else {
+            ts = new TimeSlot(this);
+        }
+        ts->targeted_when = tick;
+        ts->events.clear();
+        return ts;
+    }
+
+    void
+    releaseTimeSlot(TimeSlot *ts)
+    {
+        freeTimeSlots.push(ts);
+    }
+
   private:
     typedef const EventBase::Priority Priority;
     static Priority DefaultPriority = EventBase::Default_Pri;
@@ -410,6 +454,7 @@
 
     ScEvents deltas;
     TimeSlots timeSlots;
+    std::stack<TimeSlot*> freeTimeSlots;
 
     Process *
     getNextReady()
@@ -434,7 +479,8 @@
     {
         return (readyListMethods.empty() && readyListThreads.empty() &&
                 updateList.empty() && deltas.empty() &&
-                (timeSlots.empty() || timeSlots.begin()->first > maxTick) &&
+                (timeSlots.empty() ||
+                 timeSlots.front()->targeted_when > maxTick) &&
                 initList.empty());
     }
     EventWrapper<Scheduler, &Scheduler::pause> starvationEvent;
@@ -484,6 +530,7 @@
 
     ChannelList asyncUpdateList;
     std::mutex asyncListMutex;
+    std::atomic<bool> hasAsyncUpdate;
 
     std::map<::Event *, Tick> eventsToSchedule;
 
diff --git a/tests/configs/gpu-ruby.py b/tests/configs/gpu-ruby.py
index 155775a..a463fe3 100644
--- a/tests/configs/gpu-ruby.py
+++ b/tests/configs/gpu-ruby.py
@@ -261,7 +261,8 @@
 
 system = System(cpu = cpu_list,
                 mem_ranges = [AddrRange(options.mem_size)],
-                mem_mode = 'timing')
+                mem_mode = 'timing',
+                workload = SEWorkload())
 
 # Dummy voltage domain for all our clock domains
 system.voltage_domain = VoltageDomain(voltage = options.sys_voltage)
diff --git a/tests/gem5/cpu_tests/run.py b/tests/gem5/cpu_tests/run.py
index 4d2daf1..b893b80 100644
--- a/tests/gem5/cpu_tests/run.py
+++ b/tests/gem5/cpu_tests/run.py
@@ -119,6 +119,8 @@
 
 system = System()
 
+system.workload = SEWorkload()
+
 system.clk_domain = SrcClockDomain()
 system.clk_domain.clock = '1GHz'
 system.clk_domain.voltage_domain = VoltageDomain()
diff --git a/tests/gem5/fixture.py b/tests/gem5/fixture.py
index e21cb88..e8a67b8 100644
--- a/tests/gem5/fixture.py
+++ b/tests/gem5/fixture.py
@@ -171,7 +171,7 @@
         command.extend(self.targets)
         if self.options:
             command.extend(self.options)
-        log_call(log.test_log, command, stderr=sys.stderr)
+        log_call(log.test_log, command, time=None, stderr=sys.stderr)
 
 class Gem5Fixture(SConsFixture):
     def __new__(cls, isa, variant, protocol=None):
@@ -209,7 +209,7 @@
         targets = set(self.required_by)
         command = ['make', '-C', self.directory]
         command.extend([target.target for target in targets])
-        log_call(log.test_log, command, stderr=sys.stderr)
+        log_call(log.test_log, command, time=None, stderr=sys.stderr)
 
 
 class MakeTarget(Fixture):
diff --git a/tests/gem5/m5threads_test_atomic/atomic_system.py b/tests/gem5/m5threads_test_atomic/atomic_system.py
index 2d9b129..a08be5c 100644
--- a/tests/gem5/m5threads_test_atomic/atomic_system.py
+++ b/tests/gem5/m5threads_test_atomic/atomic_system.py
@@ -40,6 +40,8 @@
 root = Root(full_system = False)
 root.system = System()
 
+root.system.workload = SEWorkload()
+
 root.system.clk_domain = SrcClockDomain()
 root.system.clk_domain.clock = '3GHz'
 root.system.clk_domain.voltage_domain = VoltageDomain()
diff --git a/tests/gem5/suite.py b/tests/gem5/suite.py
index cba3d43..3b0f1f8 100644
--- a/tests/gem5/suite.py
+++ b/tests/gem5/suite.py
@@ -176,6 +176,7 @@
         command.append(config)
         # Config_args should set up the program args.
         command.extend(config_args)
-        log_call(params.log, command, stdout=sys.stdout, stderr=sys.stderr)
+        log_call(params.log, command, time=params.time,
+            stdout=sys.stdout, stderr=sys.stderr)
 
     return test_run_gem5
diff --git a/tests/test-progs/chdir-print/Makefile b/tests/test-progs/chdir-print/Makefile
deleted file mode 100644
index 6a357d5..0000000
--- a/tests/test-progs/chdir-print/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-
-CPP := g++
-
-TEST_OBJS := chdir-print.o
-TEST_PROGS := $(TEST_OBJS:.o=)
-
-# ==== Rules ==================================================================
-
-.PHONY: default clean
-
-default: $(TEST_PROGS)
-
-clean:
-	$(RM)  $(TEST_OBJS) $(TEST_PROGS)
-
-$(TEST_PROGS): $(TEST_OBJS)
-	$(CPP)  -static -o $@  $@.o
-
-%.o: %.c Makefile
-	$(CPP) -c -o $@ $*.c -msse3
diff --git a/tests/test-progs/chdir-print/README.txt b/tests/test-progs/chdir-print/README.txt
deleted file mode 100644
index b1e9213..0000000
--- a/tests/test-progs/chdir-print/README.txt
+++ /dev/null
@@ -1,67 +0,0 @@
-# example test compile and run parameters
-# Note: the absolute path to the chdir-print binary should be specified
-# in the run command even if running from the same folder. This is needed
-# because chdir is executed before triggering a clone for the file read,
-# and the cloned process won't be able to find the executable if a relative
-# path is provided.
-
-# compile examples
-scons --default=X86 ./build/X86/gem5.opt PROTOCOL=MOESI_hammer
-scons --default=X86 ./build/X86/gem5.opt PROTOCOL=MESI_Three_Level
-
-# run parameters
-<GEM5_ROOT>/build/X86/gem5.opt <GEM5_ROOT>/configs/example/se.py -c <GEM5_ROOT>/tests/test-progs/chdir-print/chdir-print -n2 --ruby
-
-
-# example successful output for MESI_Three_Level:
-
-<...>
-
-**** REAL SIMULATION ****
-info: Entering event queue @ 0.  Starting simulation...
-warn: Replacement policy updates recently became the responsibility of SLICC state machines. Make sure to setMRU() near callbacks in .sm files!
-cwd: /proj/research_simu/users/jalsop/gem5-mem_dif_debug/tests/test-progs/chdir-print/
-cwd: /proc
-
-<...>
-
-processor       : 0
-vendor_id       : Generic
-cpu family      : 0
-model           : 0
-model name      : Generic
-stepping        : 0
-cpu MHz         : 2000
-cache size:     : 2048K
-physical id     : 0
-siblings        : 2
-core id         : 0
-cpu cores       : 2
-fpu             : yes
-fpu exception   : yes
-cpuid level     : 1
-wp              : yes
-flags           : fpu
-cache alignment : 64
-
-processor       : 1
-vendor_id       : Generic
-cpu family      : 0
-model           : 0
-model name      : Generic
-stepping        : 0
-cpu MHz         : 2000
-cache size:     : 2048K
-physical id     : 0
-siblings        : 2
-core id         : 1
-cpu cores       : 2
-fpu             : yes
-fpu exception   : yes
-cpuid level     : 1
-wp              : yes
-flags           : fpu
-cache alignment : 64
-
-SUCCESS
-Exiting @ tick 2694923000 because exiting with last active thread context
diff --git a/tests/test-progs/chdir-print/chdir-print.c b/tests/test-progs/chdir-print/chdir-print.c
deleted file mode 100644
index 71747b6..0000000
--- a/tests/test-progs/chdir-print/chdir-print.c
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * For use for simulation and test purposes only
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from this
- * software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/limits.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-
-const int BUFFER_SIZE = 64;
-
-// Tests the functionality of RegisterFilesystem
-int main(void)
-{
-    char *cwd = getcwd(NULL, PATH_MAX);
-    printf("cwd: %s\n", cwd);
-    free(cwd);
-
-    chdir("/proc");
-
-    cwd = getcwd(NULL, PATH_MAX);
-    printf("cwd: %s\n", cwd);
-    free(cwd);
-
-    FILE *fp;
-    char buffer[BUFFER_SIZE];
-
-    bool found_procline = false;
-    fp = popen("cat cpuinfo", "r");
-    if (fp != NULL) {
-        while (fgets(buffer, BUFFER_SIZE, fp) != NULL) {
-            printf("%s", buffer);
-            if (strstr(buffer, "processor")) {
-                found_procline = true;
-            }
-        }
-        pclose(fp);
-    }
-
-    if (found_procline) {
-        printf("SUCCESS\n");
-        return EXIT_SUCCESS;
-    }
-
-    printf("FAILURE\n");
-    return EXIT_FAILURE;
-}
diff --git a/tests/test-progs/mwait/Makefile b/tests/test-progs/mwait/Makefile
deleted file mode 100644
index 6b88811..0000000
--- a/tests/test-progs/mwait/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-
-CPP := g++
-
-TEST_OBJS := mwait.o
-TEST_PROGS := $(TEST_OBJS:.o=)
-
-# ==== Rules ==================================================================
-
-.PHONY: default clean
-
-default: $(TEST_PROGS) 
-
-clean:
-	$(RM)  $(TEST_OBJS) $(TEST_PROGS)
-
-$(TEST_PROGS): $(TEST_OBJS)
-	$(CPP)  -static -o $@  $@.o pthread.o
-
-%.o: %.c Makefile
-	$(CPP) -c -o $@ $*.c -msse3
diff --git a/tests/test-progs/mwait/mwait.c b/tests/test-progs/mwait/mwait.c
deleted file mode 100644
index e1b2035..0000000
--- a/tests/test-progs/mwait/mwait.c
+++ /dev/null
@@ -1,73 +0,0 @@
-// author: Marc Orr
-
-#include <pthread.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#define NUM_TRIES   1000
-
-// Make sure that flags and wait sit in different cache lines
-volatile int flags[10];
-volatile int wait[10];
-
-pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
-
-void *DoWork1(void *threadid)
-{
-    flags[0] = flags[0] + 1;
-    wait[0] = 0;
-    pthread_exit(0);
-}
-
-void *DoWork2(void *threadid)
-{
-    pthread_mutex_lock (&mutex);
-    flags[0] = flags[0] + 1;
-    pthread_mutex_unlock (&mutex);
-    pthread_exit(0);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Program main
-////////////////////////////////////////////////////////////////////////////////
-int main( int argc, char** argv)
-{
-    // stuff for thread
-    pthread_t threads[1];
-
-    // initialize global variables
-    flags[0] = 0;
-    wait[0] = 1;
-
-    // monitor (via gcc intrinsic)
-    __builtin_ia32_monitor ((void *)&flags, 0, 0);
-
-    // invalidate flags in this cpu's cache
-    pthread_create(&threads[0], NULL, DoWork1, NULL);
-    while (wait[0]);
-
-    // launch thread to invalidate address being monitored
-    pthread_create(&threads[0], NULL, DoWork2, NULL);
-
-    // wait for other thread to modify flags
-    int mwait_cnt = 0;
-    do {
-        pthread_mutex_lock (&mutex);
-        if (flags[0] != 2) {
-            pthread_mutex_unlock (&mutex);
-            __builtin_ia32_mwait(0, 0);
-        } else {
-            pthread_mutex_unlock (&mutex);
-        }
-        mwait_cnt++;
-    } while (flags[0] != 2 && mwait_cnt < NUM_TRIES);
-
-    // test may hang if mwait is not working
-    if (flags[0]==2) {
-        printf("mwait regression PASSED, flags[0] = %d\n", flags[0]);
-    } else {
-        printf("mwait regression FAILED, flags[0] = %d\n", flags[0]);
-    }
-
-    return 0;
-}
diff --git a/tests/test-progs/page-access-wrap/Makefile b/tests/test-progs/page-access-wrap/Makefile
deleted file mode 100644
index 41e7a18..0000000
--- a/tests/test-progs/page-access-wrap/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-
-CPP := g++
-
-TEST_OBJS := page-access-wrap.o
-TEST_PROGS := $(TEST_OBJS:.o=)
-
-# ==== Rules ==================================================================
-
-.PHONY: default clean
-
-default: $(TEST_PROGS)
-
-clean:
-	$(RM)  $(TEST_OBJS) $(TEST_PROGS)
-
-$(TEST_PROGS): $(TEST_OBJS)
-	$(CPP)  -static -o $@  $@.o
-
-%.o: %.c Makefile
-	$(CPP) -c -o $@ $*.c -msse3
diff --git a/tests/test-progs/page-access-wrap/page-access-wrap.cpp b/tests/test-progs/page-access-wrap/page-access-wrap.cpp
deleted file mode 100644
index 6e536aa..0000000
--- a/tests/test-progs/page-access-wrap/page-access-wrap.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2019 Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * For use for simulation and test purposes only
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from this
- * software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <sys/mman.h>
-
-#include <cstdint>
-#include <cstdio>
-#include <cstdlib>
-#include <ctime>
-
-int main(void)
-{
-    uint64_t page_size = 0x1000;
-    uint64_t num_pages = 0x10000;
-    uint64_t length = page_size * num_pages;
-
-    void *raw = mmap(NULL, length, PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0);
-    uint8_t *mem = reinterpret_cast<uint8_t*>(raw);
-
-    srand(0xABCD);
-
-    uint64_t last_byte = page_size - 1;
-    uint64_t page_boundaries = num_pages - 1;
-
-    for (int i = 0; i < 2000; i++) {
-        uint64_t random_boundary = rand() % page_boundaries;
-        uint64_t boundary_offset = random_boundary * page_size;
-        uint64_t boundary_last_byte = boundary_offset + last_byte;
-        uint32_t *poke = reinterpret_cast<uint32_t*>(mem + boundary_last_byte);
-        printf("%p\n", poke);
-        uint32_t value = *poke;
-    }
-
-    return 0;
-}
diff --git a/tests/test-progs/stack-print/bin/x86/linux/stack-print b/tests/test-progs/stack-print/bin/x86/linux/stack-print
deleted file mode 100755
index e40dc48..0000000
--- a/tests/test-progs/stack-print/bin/x86/linux/stack-print
+++ /dev/null
Binary files differ
diff --git a/tests/test-progs/stack-print/src/stack-print.c b/tests/test-progs/stack-print/src/stack-print.c
deleted file mode 100644
index 9fbf962..0000000
--- a/tests/test-progs/stack-print/src/stack-print.c
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * Copyright (c) 2017 Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * For use for simulation and test purposes only
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from this
- * software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * Author: Brandon Potter
- */
-
-#include <elf.h>
-#include <stdio.h>
-
-int main(int argc, char **argv, char **envp)
-{
-    int i;
-
-    printf("%p: argc: [%d]\n", &argc, argc);
-    printf("\n");
-
-    for (i = 0; i < argc; i++)
-        printf("%p: argv[%d]: [%s]\n", &argv[i], i, argv[i]);
-    printf("\n");
-
-    i = 0;
-    while (envp[i] != NULL) {
-        printf("%p: envp[%d]: [%s]\n", &envp[i], i, envp[i]);
-        i++;
-    }
-    printf("\n");
-
-    Elf64_auxv_t *auxv = (Elf64_auxv_t*)&envp[--i];
-    while (auxv++) {
-        char *type;
-        switch(auxv->a_type) {
-            case AT_IGNORE:
-                type = "AT_IGNORE";
-                break;
-            case AT_EXECFD:
-                type = "AT_EXECFD";
-                break;
-            case AT_PHDR:
-                type = "AT_PHDR";
-                break;
-            case AT_PHENT:
-                type = "AT_PHENT";
-                break;
-            case AT_PHNUM:
-                type = "AT_PHNUM";
-                break;
-            case AT_PAGESZ:
-                type = "AT_PAGESZ";
-                break;
-            case AT_BASE:
-                type = "AT_BASE";
-                break;
-            case AT_FLAGS:
-                type = "AT_FLAGS";
-                break;
-            case AT_ENTRY:
-                type = "AT_ENTRY";
-                break;
-            case AT_NOTELF:
-                type = "AT_NOTELF";
-                break;
-            case AT_UID:
-                type = "AT_UID";
-                break;
-            case AT_EUID:
-                type = "AT_EUID";
-                break;
-            case AT_GID:
-                type = "AT_GID";
-                break;
-            case AT_EGID:
-                type = "AT_EGID";
-                break;
-            case AT_CLKTCK:
-                type = "AT_CLKTCK";
-                break;
-            case AT_PLATFORM:
-                type = "AT_PLATFORM";
-                break;
-            case AT_HWCAP:
-                type = "AT_HWCAP";
-                break;
-            case AT_FPUCW:
-                type = "AT_FPUCW";
-                break;
-            case AT_DCACHEBSIZE:
-                type = "AT_DCACHEBSIZE";
-                break;
-            case AT_ICACHEBSIZE:
-                type = "AT_ICACHEBSIZE";
-                break;
-            case AT_UCACHEBSIZE:
-                type = "AT_UCACHEBSIZE";
-                break;
-            case AT_IGNOREPPC:
-                type = "AT_IGNOREPPC";
-                break;
-            case AT_SECURE:
-                type = "AT_SECURE";
-                break;
-            case AT_BASE_PLATFORM:
-                type = "AT_BASE_PLATFORM";
-                break;
-            case AT_RANDOM:
-                type = "AT_RANDOM";
-                break;
-            case AT_EXECFN:
-                type = "AT_EXECFN";
-                break;
-            case AT_SYSINFO:
-                type = "AT_SYSINFO";
-                break;
-            case AT_SYSINFO_EHDR:
-                type = "AT_SYSINFO_EHDR";
-                break;
-            case AT_L1I_CACHESHAPE:
-                type = "AT_L1I_CACHESHAPE";
-                break;
-            case AT_L1D_CACHESHAPE:
-                type = "AT_L1D_CACHESHAPE";
-                break;
-            case AT_L2_CACHESHAPE:
-                type = "AT_L2_CACHESHAPE";
-                break;
-            case AT_L3_CACHESHAPE:
-                type = "AT_L3_CACHESHAPE";
-                break;
-            case AT_NULL:
-            default:
-                printf("\n");
-                return 0;
-        }
-        printf("%p: %s: [%lx]\n", auxv, type, auxv->a_un.a_val);
-    }
-}
-
diff --git a/util/dockerfiles/ubuntu-18.04_all-dependencies/Dockerfile b/util/dockerfiles/ubuntu-18.04_all-dependencies/Dockerfile
index 282805d..1259f2e 100644
--- a/util/dockerfiles/ubuntu-18.04_all-dependencies/Dockerfile
+++ b/util/dockerfiles/ubuntu-18.04_all-dependencies/Dockerfile
@@ -31,4 +31,4 @@
 RUN apt -y install build-essential git m4 scons zlib1g zlib1g-dev \
     libprotobuf-dev protobuf-compiler libprotoc-dev libgoogle-perftools-dev \
     python-dev python python-six doxygen libboost-all-dev libhdf5-serial-dev \
-    python-pydot libpng-dev libelf-dev
+    python-pydot libpng-dev libelf-dev pkg-config
diff --git a/util/dockerfiles/ubuntu-20.04_all-dependencies/Dockerfile b/util/dockerfiles/ubuntu-20.04_all-dependencies/Dockerfile
index 283d356..3facf7e 100644
--- a/util/dockerfiles/ubuntu-20.04_all-dependencies/Dockerfile
+++ b/util/dockerfiles/ubuntu-20.04_all-dependencies/Dockerfile
@@ -32,4 +32,4 @@
 RUN apt -y install build-essential git m4 scons zlib1g zlib1g-dev \
     libprotobuf-dev protobuf-compiler libprotoc-dev libgoogle-perftools-dev \
     python3-dev python3-six python-is-python3 doxygen libboost-all-dev \
-    libhdf5-serial-dev python3-pydot libpng-dev libelf-dev
+    libhdf5-serial-dev python3-pydot libpng-dev libelf-dev pkg-config
diff --git a/util/m5/src/abi/aarch64/SConsopts b/util/m5/src/abi/aarch64/SConsopts
index 47ada02..010b22e 100644
--- a/util/m5/src/abi/aarch64/SConsopts
+++ b/util/m5/src/abi/aarch64/SConsopts
@@ -30,4 +30,4 @@
 
 env['CALL_TYPE']['inst'].impl('m5op.S', 'verify_inst.cc', default=True)
 env['CALL_TYPE']['addr'].impl('m5op_addr.S')
-env['CALL_TYPE']['semi'].impl('m5op_semi.S')
+env['CALL_TYPE']['semi'].impl('m5op_semi.S', 'verify_semi.cc')
diff --git a/src/arch/null/cpu_dummy.cc b/util/m5/src/abi/aarch64/verify_semi.cc
similarity index 65%
rename from src/arch/null/cpu_dummy.cc
rename to util/m5/src/abi/aarch64/verify_semi.cc
index df30b81..c3cc77e 100644
--- a/src/arch/null/cpu_dummy.cc
+++ b/util/m5/src/abi/aarch64/verify_semi.cc
@@ -1,15 +1,5 @@
 /*
- * Copyright (c) 2013 ARM Limited
- * All rights reserved
- *
- * The license below extends only to copyright in the software and shall
- * not be construed as granting a license to any other intellectual
- * property including but not limited to intellectual property relating
- * to a hardware implementation of the functionality of the software
- * licensed hereunder.  You may use the software subject to the license
- * terms below provided that you ensure that this notice is replicated
- * unmodified and in its entirety in all distributions of the software,
- * modified or unmodified, in source code or in binary form.
+ * Copyright 2020 Google Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
@@ -35,8 +25,30 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/**
- * Provide the actual storage for maxThreadsPerCPU which is declared
- * extern and normally provided by src/cpu/base.cc
- */
-int maxThreadsPerCPU = 1;
+#include <gtest/gtest.h>
+
+#include "call_type/verify_semi.hh"
+
+extern uint64_t m5_semi_argument_block[];
+
+void
+abi_verify_semi(const siginfo_t &info, int func,
+        const std::vector<uint64_t> &args)
+{
+    // Extract the instruction that triggered the signal.
+    uint32_t inst = *(uint32_t *)info.si_addr;
+
+    // Get the imm16 field from it.
+    uint32_t imm16 = (inst >> 5) & 0xffff;
+
+    // Verify that it used the gem5 immediate value.
+    EXPECT_EQ(imm16, 0x5d57);
+
+    // Check that the right function was called.
+    EXPECT_EQ(func, (m5_semi_argument_block[0] >> 8) & 0xff);
+
+    // Check that the arguments were correct.
+    uint64_t *arg = &m5_semi_argument_block[1];
+    for (uint64_t expected: args)
+        EXPECT_EQ(*arg++, expected);
+}
diff --git a/util/m5/src/call_type/semi.test.cc b/util/m5/src/call_type/semi.test.cc
index 10f3d0e..6c17a06 100644
--- a/util/m5/src/call_type/semi.test.cc
+++ b/util/m5/src/call_type/semi.test.cc
@@ -27,8 +27,134 @@
 
 #include <gtest/gtest.h>
 
+#include <csetjmp>
+#include <csignal>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+
+#include "args.hh"
 #include "call_type.hh"
+#include "call_type/verify_semi.hh"
+#include "dispatch_table.hh"
+
+class DefaultCallType : public CallType
+{
+  private:
+    DispatchTable dt;
+
+  public:
+    DefaultCallType() : CallType("default") {}
+
+    bool init_called = false;
+    void init() override { init_called = true; }
+
+    bool isDefault() const override { return true; }
+    void printDesc(std::ostream &os) const override {}
+    const DispatchTable &getDispatch() const override { return dt; }
+};
+
+DefaultCallType default_call_type;
+
+TEST(SemiCallType, Detect)
+{
+    CallType *ct;
+
+    // Semi should not be selected if there are no arguments.
+    Args empty({});
+    default_call_type.init_called = false;
+    ct = CallType::detect(empty);
+    EXPECT_EQ(ct, &default_call_type);
+    EXPECT_TRUE(default_call_type.init_called);
+
+    // Semi should not be selected if --semi isn't the first argument.
+    Args one_arg({"one"});
+    default_call_type.init_called = false;
+    ct = CallType::detect(one_arg);
+    EXPECT_EQ(ct, &default_call_type);
+    EXPECT_TRUE(default_call_type.init_called);
+
+    // Semi should be selected if --semi is the first argument.
+    Args selected({"--semi"});
+    default_call_type.init_called = false;
+    ct = CallType::detect(selected);
+    EXPECT_NE(ct, &default_call_type);
+    EXPECT_NE(ct, nullptr);
+    EXPECT_FALSE(default_call_type.init_called);
+
+    Args extra({"--semi", "foo"});
+    default_call_type.init_called = false;
+    ct = CallType::detect(extra);
+    EXPECT_NE(ct, &default_call_type);
+    EXPECT_NE(ct, nullptr);
+    EXPECT_FALSE(default_call_type.init_called);
+
+    // Semi should not be selected if --semi isn't first.
+    Args not_first({"foo", "--semi"});
+    default_call_type.init_called = false;
+    ct = CallType::detect(not_first);
+    EXPECT_EQ(ct, &default_call_type);
+    EXPECT_TRUE(default_call_type.init_called);
+}
+
+sigjmp_buf intercept_env;
+siginfo_t intercept_siginfo;
+
+void
+sigill_handler(int sig, siginfo_t *info, void *ucontext)
+{
+    std::memcpy(&intercept_siginfo, info, sizeof(intercept_siginfo));
+    siglongjmp(intercept_env, 1);
+}
 
 TEST(SemiCallType, Sum)
 {
+    // Get the semi call type, which is in an anonymous namespace.
+    Args args({"--semi"});
+    CallType *semi_call_type = CallType::detect(args);
+    EXPECT_NE(semi_call_type, nullptr);
+
+    // Get the dispatch table associated with it.
+    const auto &dt = semi_call_type->getDispatch();
+
+    // Determine if we're running within gem5 by checking whether a flag is
+    // set in the environment.
+    bool in_gem5 = (std::getenv("RUNNING_IN_GEM5") != nullptr);
+    if (in_gem5)
+        std::cout << "In gem5, m5 ops should work." << std::endl;
+    else
+        std::cout << "Not in gem5, m5 ops won't work." << std::endl;
+
+    // If we are, then we should be able to run the "sum" command.
+    if (in_gem5) {
+        EXPECT_EQ((*dt.m5_sum)(2, 2, 0, 0, 0, 0), 4);
+        return;
+    }
+
+    // If not, then we'll need to try to catch the fallout from trying to run
+    // an m5 op and verify that what we were trying looks correct.
+
+    struct sigaction sigill_action;
+    std::memset(&sigill_action, 0, sizeof(sigill_action));
+    sigill_action.sa_sigaction = &sigill_handler;
+    sigill_action.sa_flags = SA_SIGINFO | SA_RESETHAND;
+
+    struct sigaction old_sigill_action;
+
+    sigaction(SIGILL, &sigill_action, &old_sigill_action);
+
+    if (!sigsetjmp(intercept_env, 1)) {
+        (*dt.m5_sum)(2, 2, 0, 0, 0, 0);
+        sigaction(SIGILL, &old_sigill_action, nullptr);
+        ADD_FAILURE() << "Didn't die when attempting to run \"sum\".";
+        return;
+    }
+
+    // Back from siglongjmp.
+    auto &info = intercept_siginfo;
+
+    EXPECT_EQ(info.si_signo, SIGILL);
+    EXPECT_TRUE(info.si_code == ILL_ILLOPC || info.si_code == ILL_ILLOPN);
+
+    abi_verify_semi(info, M5OP_SUM, {2, 2, 0, 0, 0, 0});
 }
diff --git a/src/arch/null/cpu_dummy.cc b/util/m5/src/call_type/verify_semi.hh
similarity index 65%
copy from src/arch/null/cpu_dummy.cc
copy to util/m5/src/call_type/verify_semi.hh
index df30b81..02aa5cc 100644
--- a/src/arch/null/cpu_dummy.cc
+++ b/util/m5/src/call_type/verify_semi.hh
@@ -1,15 +1,5 @@
 /*
- * Copyright (c) 2013 ARM Limited
- * All rights reserved
- *
- * The license below extends only to copyright in the software and shall
- * not be construed as granting a license to any other intellectual
- * property including but not limited to intellectual property relating
- * to a hardware implementation of the functionality of the software
- * licensed hereunder.  You may use the software subject to the license
- * terms below provided that you ensure that this notice is replicated
- * unmodified and in its entirety in all distributions of the software,
- * modified or unmodified, in source code or in binary form.
+ * Copyright 2020 Google Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
@@ -35,8 +25,13 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/**
- * Provide the actual storage for maxThreadsPerCPU which is declared
- * extern and normally provided by src/cpu/base.cc
- */
-int maxThreadsPerCPU = 1;
+#include <csignal>
+#include <vector>
+
+#ifndef __VERIFY_SEMI_HH__
+#define __VERIFY_SEMI_HH__
+
+void abi_verify_semi(const siginfo_t &info, int func,
+        const std::vector<uint64_t> &args);
+
+#endif // __VERIFY_SEMI_HH__
diff --git a/util/m5/src/m5_mmap.c b/util/m5/src/m5_mmap.c
index d341303..c088e26 100644
--- a/util/m5/src/m5_mmap.c
+++ b/util/m5/src/m5_mmap.c
@@ -46,6 +46,7 @@
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <sys/types.h>
+#include <unistd.h>
 
 #include "m5_mmap.h"
 
@@ -63,6 +64,11 @@
 {
     int fd;
 
+    if (m5_mem) {
+        fprintf(stderr, "m5 mem already mapped.\n");
+        exit(1);
+    }
+
     fd = open(m5_mmap_dev, O_RDWR | O_SYNC);
     if (fd == -1) {
         fprintf(stderr, "Can't open %s: %s\n", m5_mmap_dev, strerror(errno));
@@ -71,8 +77,19 @@
 
     m5_mem = mmap(NULL, 0x10000, PROT_READ | PROT_WRITE, MAP_SHARED, fd,
                   m5op_addr);
+    close(fd);
+
     if (!m5_mem) {
         fprintf(stderr, "Can't map %s: %s\n", m5_mmap_dev, strerror(errno));
         exit(1);
     }
 }
+
+void
+unmap_m5_mem()
+{
+    if (m5_mem) {
+        munmap(m5_mem, 0x10000);
+        m5_mem = NULL;
+    }
+}
diff --git a/util/m5/src/m5_mmap.h b/util/m5/src/m5_mmap.h
index 09cddb2..ce934f0 100644
--- a/util/m5/src/m5_mmap.h
+++ b/util/m5/src/m5_mmap.h
@@ -51,6 +51,7 @@
 extern uint64_t m5op_addr;
 extern const char *m5_mmap_dev;
 void map_m5_mem();
+void unmap_m5_mem();
 
 #ifdef __cplusplus
 }