cpu: Fixed useful counter handling in LTAGE

Increased the useful counter to 2 bits per TAGE entry, as described in the
LTAGE paper (and made its size configurable).

Changed how the useful counters are incremented/decremented, as described
in the LTAGE paper.

Change-Id: I8c692cc7c180d29897cb77781681ff498a1d16c8
Signed-off-by: Pau Cabre <pau.cabre@metempsy.com>
Reviewed-on: https://gem5-review.googlesource.com/c/14215
Reviewed-by: Ilias Vougioukas <ilias.vougioukas@arm.com>
Reviewed-by: Jason Lowe-Power <jason@lowepower.com>
Maintainer: Jason Lowe-Power <jason@lowepower.com>
diff --git a/src/cpu/pred/BranchPredictor.py b/src/cpu/pred/BranchPredictor.py
index a428194..aa6cd4a 100644
--- a/src/cpu/pred/BranchPredictor.py
+++ b/src/cpu/pred/BranchPredictor.py
@@ -100,6 +100,7 @@
     logSizeLoopPred = Param.Unsigned(8, "Log size of the loop predictor")
     nHistoryTables = Param.Unsigned(12, "Number of history tables")
     tagTableCounterBits = Param.Unsigned(3, "Number of tag table counter bits")
+    tagTableUBits = Param.Unsigned(2, "Number of tag table u bits")
     histBufferSize = Param.Unsigned(2097152,
             "A large number to track all branch histories(2MEntries default)")
     minHist = Param.Unsigned(4, "Minimum history size of LTAGE")
diff --git a/src/cpu/pred/ltage.cc b/src/cpu/pred/ltage.cc
index 85ae2b3..b049d4f 100644
--- a/src/cpu/pred/ltage.cc
+++ b/src/cpu/pred/ltage.cc
@@ -55,6 +55,7 @@
     logSizeLoopPred(params->logSizeLoopPred),
     nHistoryTables(params->nHistoryTables),
     tagTableCounterBits(params->tagTableCounterBits),
+    tagTableUBits(params->tagTableUBits),
     histBufferSize(params->histBufferSize),
     minHist(params->minHist),
     maxHist(params->maxHist),
@@ -68,6 +69,11 @@
     loopNumIterMask((1 << loopTableIterBits) - 1),
     threadHistory(params->numThreads)
 {
+    // Current method for periodically resetting the u counter bits only
+    // works for 1 or 2 bits
+    // Also make sure that it is not 0
+    assert(tagTableUBits <= 2 && (tagTableUBits > 0));
+
     // we use uint16_t type for these vales, so they cannot be more than
     // 16 bits
     assert(loopTableTagBits <= 16);
@@ -576,7 +582,7 @@
 
         if (alloc) {
             // is there some "unuseful" entry to allocate
-            int8_t min = 1;
+            uint8_t min = 1;
             for (int i = nHistoryTables; i > bi->hitBank; i--) {
                 if (gtable[i][bi->tableIndices[i]].u < min) {
                     min = gtable[i][bi->tableIndices[i]].u;
@@ -605,7 +611,6 @@
                 if ((gtable[i][bi->tableIndices[i]].u == 0)) {
                     gtable[i][bi->tableIndices[i]].tag = bi->tableTags[i];
                     gtable[i][bi->tableIndices[i]].ctr = (taken) ? 0 : -1;
-                    gtable[i][bi->tableIndices[i]].u = 0; //?
                     break;
                 }
             }
@@ -643,12 +648,9 @@
             }
 
             // update the u counter
-            if (longest_match_pred != bi->altTaken) {
-                if (longest_match_pred == taken) {
-                    if (gtable[bi->hitBank][bi->hitBankIndex].u < 1) {
-                        gtable[bi->hitBank][bi->hitBankIndex].u++;
-                    }
-                }
+            if (bi->tagePred != bi->altTaken) {
+                unsignedCtrUpdate(gtable[bi->hitBank][bi->hitBankIndex].u,
+                                  bi->tagePred == taken, tagTableUBits);
             }
         } else {
             baseUpdate(pc, taken, bi);
diff --git a/src/cpu/pred/ltage.hh b/src/cpu/pred/ltage.hh
index 2119156..68aef1cc 100644
--- a/src/cpu/pred/ltage.hh
+++ b/src/cpu/pred/ltage.hh
@@ -94,7 +94,7 @@
     {
         int8_t ctr;
         uint16_t tag;
-        int8_t u;
+        uint8_t u;
         TageEntry() : ctr(0), tag(0), u(0) { }
     };
 
@@ -360,6 +360,7 @@
     const unsigned logSizeLoopPred;
     const unsigned nHistoryTables;
     const unsigned tagTableCounterBits;
+    const unsigned tagTableUBits;
     const unsigned histBufferSize;
     const unsigned minHist;
     const unsigned maxHist;