arch-arm: Memoize computeAddrTop in the MMU code

Profiling gem5 has identified computeAddrTop as one of the main
contributors to AArch64 simulation time.

The utility function is used on gem5's critical path, namely the
memory translation subsystem. The task it performs is rather trivial:
identifying the "real" most significant bit of a virtual address.
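
As a rough illustration, the result is essentially one of three
values. The sketch below is a simplification of the architectural
AddrTop() behaviour, not the actual ArmISA::computeAddrTop code, and
the helper name and parameters are purely illustrative:

    // Simplified sketch only: the real function also inspects the
    // TBI/TBID fields selected by bit 55 of the address and the
    // current translation regime.
    int
    computeAddrTopSketch(bool el_is_aarch32, bool top_byte_ignored)
    {
        if (el_is_aarch32)
            return 31;                      // AArch32: 32-bit VA
        return top_byte_ignored ? 55 : 63;  // AArch64: tagged or not
    }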

This turns out to be quite expensive. Why?

The main cost comes from the AArch32/AArch64 check, which relies on
the ELIs32 helper. That helper sequentially reads several MiscReg
values until it can confirm that an EL is indeed running AArch32 (or
AArch64).

This is functionally accurate, but it is too expensive for the
simulator's critical path.
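
For reference, here is a loose sketch of the kind of chain ELIs32
ends up walking on every call. This is illustrative only: register
and header names are assumed from the gem5 Arm ISA code of this
period, and the real helpers in src/arch/arm/utility.cc consult more
state than shown here:

    #include "arch/arm/regs/misc.hh"   // MISCREG_* indices (assumed path)
    #include "cpu/thread_context.hh"

    // Illustrative only: each readMiscReg() is a virtual call into
    // the thread context, so a chain of them on every single address
    // translation adds up quickly.
    bool
    elIs32Sketch(ThreadContext *tc, ExceptionLevel el)
    {
        SCR scr = tc->readMiscReg(MISCREG_SCR_EL3);
        if (el < EL3 && !scr.rw)
            return true;   // EL3 runs the lower ELs in AArch32
        HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2);
        if (el < EL2 && !hcr.rw)
            return true;   // EL2 runs the lower ELs in AArch32
        return false;
    }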

This patch addresses the issue by adding a Memoizer object for the
computeAddrTop function to the CachedState data structure, which
already holds cached system register values for performance reasons.
Whenever we need to invalidate those cached sysreg values because of
a change in the translation system, we also flush/invalidate the
memoizer cache.
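
For context, the interface the patch relies on looks roughly like the
sketch below. This is only an illustration of the cache-by-arguments
plus flush() behaviour, using a hypothetical SimpleMemoizer name; the
actual implementation is gem5's src/base/memoizer.hh:

    #include <map>
    #include <tuple>
    #include <utility>

    // Hypothetical, minimal memoizer: results are cached keyed on the
    // full argument tuple and can be discarded with flush(). gem5's
    // real Memoizer is stricter about which argument types it accepts
    // as cache keys.
    template <typename Ret, typename... Args>
    class SimpleMemoizer
    {
      public:
        explicit SimpleMemoizer(Ret (*func)(Args...)) : func(func) {}

        Ret
        operator()(Args... args)
        {
            auto key = std::make_tuple(args...);
            if (auto it = cache.find(key); it != cache.end())
                return it->second;            // hit: skip the real call
            Ret res = func(args...);
            cache.emplace(std::move(key), res);
            return res;
        }

        void flush() { cache.clear(); }       // invalidate every entry

      private:
        Ret (*const func)(Args...);
        std::map<std::tuple<Args...>, Ret> cache;
    };

In the diff below, CachedState wraps ArmISA::computeAddrTop in this
fashion, invalidateMiscReg() flushes the stage 1 and stage 2 memoizer
caches alongside clearing miscRegValid, and the hand-written
CachedState::operator= flushes the cache rather than copying it.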

Change-Id: If42e945c650c293ace304fb4c35e709783bb82d4
Signed-off-by: Giacomo Travaglini <giacomo.travaglini@arm.com>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/59151
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
diff --git a/src/arch/arm/mmu.cc b/src/arch/arm/mmu.cc
index 79a5240..0dde0d1 100644
--- a/src/arch/arm/mmu.cc
+++ b/src/arch/arm/mmu.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010-2013, 2016-2021 Arm Limited
+ * Copyright (c) 2010-2013, 2016-2022 Arm Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -197,6 +197,8 @@
 MMU::invalidateMiscReg()
 {
     s1State.miscRegValid = false;
+    s1State.computeAddrTop.flush();
+    s2State.computeAddrTop.flush();
 }
 
 Fault
@@ -237,11 +239,12 @@
     updateMiscReg(tc, NormalTran, state.isStage2);
     Addr vaddr_tainted = req->getVaddr();
     Addr vaddr = 0;
-    if (state.aarch64)
+    if (state.aarch64) {
         vaddr = purifyTaggedAddr(vaddr_tainted, tc, state.aarch64EL,
-                                 (TCR)state.ttbcr, mode==Execute);
-    else
+            static_cast<TCR>(state.ttbcr), mode==Execute, state);
+    } else {
         vaddr = vaddr_tainted;
+    }
     Request::Flags flags = req->getFlags();
 
     bool is_fetch = (mode == Execute);
@@ -480,7 +483,7 @@
 
     Addr vaddr_tainted = req->getVaddr();
     Addr vaddr = purifyTaggedAddr(vaddr_tainted, tc, state.aarch64EL,
-        (TCR)state.ttbcr, mode==Execute);
+        static_cast<TCR>(state.ttbcr), mode==Execute, state);
 
     Request::Flags flags = req->getFlags();
     bool is_fetch  = (mode == Execute);
@@ -789,6 +792,18 @@
     return false;
 }
 
+Addr
+MMU::purifyTaggedAddr(Addr vaddr_tainted, ThreadContext *tc, ExceptionLevel el,
+                      TCR tcr, bool is_inst, CachedState& state)
+{
+    const bool selbit = bits(vaddr_tainted, 55);
+
+    // Call the memoized version of computeAddrTop
+    const auto topbit = state.computeAddrTop(tc, selbit, is_inst, tcr, el);
+
+    return maskTaggedAddr(vaddr_tainted, tc, el, topbit);
+}
+
 Fault
 MMU::translateMmuOff(ThreadContext *tc, const RequestPtr &req, Mode mode,
         ArmTranslationType tran_type, Addr vaddr, bool long_desc_format,
@@ -947,11 +962,12 @@
 
     Addr vaddr_tainted = req->getVaddr();
     Addr vaddr = 0;
-    if (state.aarch64)
+    if (state.aarch64) {
         vaddr = purifyTaggedAddr(vaddr_tainted, tc, state.aarch64EL,
-            (TCR)state.ttbcr, mode==Execute);
-    else
+            static_cast<TCR>(state.ttbcr), mode==Execute, state);
+    } else {
         vaddr = vaddr_tainted;
+    }
     Request::Flags flags = req->getFlags();
 
     bool is_fetch  = (mode == Execute);
@@ -1443,7 +1459,7 @@
     ExceptionLevel target_el = state.aarch64 ? state.aarch64EL : EL1;
     if (state.aarch64) {
         vaddr = purifyTaggedAddr(vaddr_tainted, tc, target_el,
-            (TCR)state.ttbcr, mode==Execute);
+            static_cast<TCR>(state.ttbcr), mode==Execute, state);
     } else {
         vaddr = vaddr_tainted;
     }
diff --git a/src/arch/arm/mmu.hh b/src/arch/arm/mmu.hh
index fc5b307..171dbf5 100644
--- a/src/arch/arm/mmu.hh
+++ b/src/arch/arm/mmu.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010-2013, 2016, 2019-2021 Arm Limited
+ * Copyright (c) 2010-2013, 2016, 2019-2022 Arm Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -43,8 +43,9 @@
 
 #include "arch/arm/page_size.hh"
 #include "arch/arm/tlb.hh"
+#include "arch/arm/utility.hh"
 #include "arch/generic/mmu.hh"
-
+#include "base/memoizer.hh"
 #include "enums/ArmLookupLevel.hh"
 
 #include "params/ArmMMU.hh"
@@ -130,11 +131,44 @@
         S12E1Tran = 0x100
     };
 
-    struct CachedState {
-        explicit CachedState(MMU *_mmu, bool stage2)
-          : mmu(_mmu), isStage2(stage2)
+    struct CachedState
+    {
+        CachedState(MMU *_mmu, bool stage2)
+          : mmu(_mmu), isStage2(stage2),
+            computeAddrTop(ArmISA::computeAddrTop)
         {}
 
+        CachedState&
+        operator=(const CachedState &rhs)
+        {
+            isStage2 = rhs.isStage2;
+            cpsr = rhs.cpsr;
+            aarch64 = rhs.aarch64;
+            aarch64EL = EL0;
+            sctlr = rhs.sctlr;
+            scr = rhs.scr;
+            isPriv = rhs.isPriv;
+            isSecure = rhs.isSecure;
+            isHyp = rhs.isHyp;
+            ttbcr = rhs.ttbcr;
+            asid = rhs.asid;
+            vmid = rhs.vmid;
+            prrr = rhs.prrr;
+            nmrr = rhs.nmrr;
+            hcr = rhs.hcr;
+            dacr = rhs.dacr;
+            miscRegValid = rhs.miscRegValid;
+            curTranType = rhs.curTranType;
+            stage2Req = rhs.stage2Req;
+            stage2DescReq = rhs.stage2DescReq;
+            directToStage2 = rhs.directToStage2;
+
+            // When we copy we just flush the memoizer cache
+            computeAddrTop.flush();
+
+            return *this;
+        }
+
         void updateMiscReg(ThreadContext *tc, ArmTranslationType tran_type);
 
         /** Returns the current VMID
@@ -173,6 +207,9 @@
         // Indicates whether all translation requests should
         // be routed directly to the stage 2 TLB
         bool directToStage2 = false;
+
+        Memoizer<int, ThreadContext*, bool,
+                 bool, TCR, ExceptionLevel> computeAddrTop;
     };
 
     MMU(const ArmMMUParams &p);
@@ -397,7 +434,12 @@
                              ThreadContext *tc, bool stage2);
     Fault checkPermissions64(TlbEntry *te, const RequestPtr &req, Mode mode,
                              ThreadContext *tc, CachedState &state);
+
   protected:
+    Addr purifyTaggedAddr(Addr vaddr_tainted, ThreadContext *tc,
+                          ExceptionLevel el,
+                          TCR tcr, bool is_inst, CachedState& state);
+
     bool checkPAN(ThreadContext *tc, uint8_t ap, const RequestPtr &req,
                   Mode mode, const bool is_priv, CachedState &state);