mem-ruby: Fixing MESI Three Level

Add back some of the changes made in patch 676ae57827.
The transient state IS_I, the STALE_DATA message class, and the
Data_Stale event are all necessary.

Issue: (cacheline A, initial state for P0 and P1 is I)
|   P0   |   P1   |
|GETX (A)|        |
|        |GETS (A)|
|Inv_All |        |
P1 never sends the ACK - deadlock
P1 should send the ACK, then treat the data as stale when it arrives, and go to I.

Solution:
P1(A):
GETS:    I->IS
Inv_All: IS->IS_I, Send ACK
Data:    IS_I->I, STALE_DATA to L0

Signed-off-by: Pouya Fotouhi <pfotouhi@ucdavis.edu>
Change-Id: I1e7b2c05439d08579c68d8eb444e0f332e75e07f
Reviewed-on: https://gem5-review.googlesource.com/c/15715
Reviewed-by: Jason Lowe-Power <jason@lowepower.com>
Reviewed-by: Nikos Nikoleris <nikos.nikoleris@arm.com>
Maintainer: Jason Lowe-Power <jason@lowepower.com>
diff --git a/src/mem/protocol/MESI_Three_Level-L0cache.sm b/src/mem/protocol/MESI_Three_Level-L0cache.sm
index 4950d63..a87a3d9 100644
--- a/src/mem/protocol/MESI_Three_Level-L0cache.sm
+++ b/src/mem/protocol/MESI_Three_Level-L0cache.sm
@@ -101,6 +101,7 @@
 
     Data,               desc="Data for processor";
     Data_Exclusive,     desc="Data for processor";
+    Data_Stale,         desc="Data for processor, but not for storage";
 
     Ack,        desc="Ack for processor";
     Ack_all,      desc="Last ack for processor";
@@ -268,6 +269,8 @@
             trigger(Event:Data_Exclusive, in_msg.addr, cache_entry, tbe);
         } else if(in_msg.Class == CoherenceClass:DATA) {
             trigger(Event:Data, in_msg.addr, cache_entry, tbe);
+        } else if(in_msg.Class == CoherenceClass:STALE_DATA) {
+            trigger(Event:Data_Stale, in_msg.addr, cache_entry, tbe);
         } else if (in_msg.Class == CoherenceClass:ACK) {
             trigger(Event:Ack, in_msg.addr, cache_entry, tbe);
         } else if (in_msg.Class == CoherenceClass:WB_ACK) {
@@ -732,6 +735,15 @@
     kd_wakeUpDependents;
   }
 
+  transition(IS, Data_Stale, I) {  // Data_Stale is "for processor, but not for storage": end in I
+    u_writeDataToCache;
+    hx_load_hit;
+    s_deallocateTBE;
+    ff_deallocateCacheBlock;  // NOTE(review): presumably frees the entry so the stale block is not kept
+    o_popIncomingResponseQueue;
+    kd_wakeUpDependents;
+  }
+
   transition(Inst_IS, Data, S) {
     u_writeInstToCache;
     hx_ifetch_hit;
@@ -748,6 +760,15 @@
     kd_wakeUpDependents;
   }
 
+  transition(Inst_IS, Data_Stale, I) {  // Data_Stale is "for processor, but not for storage": end in I
+    u_writeInstToCache;
+    hx_ifetch_hit;
+    s_deallocateTBE;
+    ff_deallocateCacheBlock;  // NOTE(review): presumably frees the entry so the stale block is not kept
+    o_popIncomingResponseQueue;
+    kd_wakeUpDependents;
+  }
+
   transition({IM,SM}, Data_Exclusive, M) {
     u_writeDataToCache;
     hhx_store_hit;
diff --git a/src/mem/protocol/MESI_Three_Level-L1cache.sm b/src/mem/protocol/MESI_Three_Level-L1cache.sm
index e960486..6db35ce 100644
--- a/src/mem/protocol/MESI_Three_Level-L1cache.sm
+++ b/src/mem/protocol/MESI_Three_Level-L1cache.sm
@@ -71,6 +71,7 @@
     IS, AccessPermission:Busy, desc="L1 idle, issued GETS, have not seen response yet";
     IM, AccessPermission:Busy, desc="L1 idle, issued GETX, have not seen response yet";
     SM, AccessPermission:Read_Only, desc="L1 idle, issued GETX, have not seen response yet";
+    IS_I, AccessPermission:Busy, desc="L1 idle, issued GETS, saw Inv before data because directory doesn't block on GETS hit";
     M_I, AccessPermission:Busy, desc="L1 replacing, waiting for ACK";
     SINK_WB_ACK, AccessPermission:Busy, desc="This is to sink WB_Acks from L2";
 
@@ -278,7 +279,8 @@
         if(in_msg.Type == CoherenceResponseType:DATA_EXCLUSIVE) {
           trigger(Event:Data_Exclusive, in_msg.addr, cache_entry, tbe);
         } else if(in_msg.Type == CoherenceResponseType:DATA) {
-          if (getState(tbe, cache_entry, in_msg.addr) == State:IS &&
+          if ((getState(tbe, cache_entry, in_msg.addr) == State:IS ||
+               getState(tbe, cache_entry, in_msg.addr) == State:IS_I) &&
               machineIDToMachineType(in_msg.Sender) == MachineType:L1Cache) {
 
               trigger(Event:DataS_fromL1, in_msg.addr, cache_entry, tbe);
@@ -626,6 +628,20 @@
       }
   }
 
+  action(h_stale_data_to_l0, "hs", desc="Send stale data to the L0 cache (for use, not for storage).") {
+      enqueue(bufferToL0_out, CoherenceMsg, l1_response_latency) {
+          assert(is_valid(cache_entry));  // the payload below is read from cache_entry
+
+          out_msg.addr := address;
+          out_msg.Class := CoherenceClass:STALE_DATA;  // tells L0 it may use the data but not store it
+          out_msg.Sender := machineID;
+          out_msg.Dest := createMachineID(MachineType:L0Cache, version);
+          out_msg.DataBlk := cache_entry.DataBlk;
+          out_msg.Dirty := cache_entry.Dirty;
+          out_msg.MessageSize := MessageSizeType:Response_Data;
+      }
+  }
+
   action(i_allocateTBE, "i", desc="Allocate TBE (number of invalidates=0)") {
     check_allocate(TBEs);
     assert(is_valid(cache_entry));
@@ -733,7 +749,7 @@
   //*****************************************************
 
   // Transitions for Load/Store/Replacement/WriteBack from transient states
-  transition({IS, IM, M_I, SM, SINK_WB_ACK, S_IL0, M_IL0, E_IL0, MM_IL0},
+  transition({IS, IM, IS_I, M_I, SM, SINK_WB_ACK, S_IL0, M_IL0, E_IL0, MM_IL0},
              {Load, Store, L1_Replacement}) {
     z0_stallAndWaitL0Queue;
   }
@@ -895,6 +911,11 @@
   }
 
   // Transitions from IS
+  transition({IS,IS_I}, Inv, IS_I) {  // Inv seen before the GETS data: ack it now (no longer stalled) and remember it in IS_I
+    fi_sendInvAck;
+    l_popL2RequestQueue;
+  }
+
   transition(IS, Data_all_Acks, S) {
     u_writeDataFromL2Response;
     h_data_to_l0;
@@ -903,6 +924,15 @@
     kd_wakeUpDependents;
   }
 
+  transition(IS_I, Data_all_Acks, I) {  // data arrived after an Inv: forward it to L0 as STALE_DATA and end in I
+    u_writeDataFromL2Response;
+    h_stale_data_to_l0;
+    s_deallocateTBE;
+    ff_deallocateCacheBlock;  // NOTE(review): presumably frees the entry; permission was already lost
+    o_popL2ResponseQueue;
+    kd_wakeUpDependents;
+  }
+
   transition(IS, DataS_fromL1, S) {
     u_writeDataFromL2Response;
     j_sendUnblock;
@@ -912,8 +942,18 @@
     kd_wakeUpDependents;
   }
 
+  transition(IS_I, DataS_fromL1, I) {  // data from another L1 arrived after an Inv: forward as STALE_DATA and end in I
+    u_writeDataFromL2Response;
+    j_sendUnblock;
+    h_stale_data_to_l0;
+    s_deallocateTBE;
+    ff_deallocateCacheBlock;  // NOTE(review): presumably frees the entry; permission was already lost
+    o_popL2ResponseQueue;
+    kd_wakeUpDependents;
+  }
+
   // directory is blocked when sending exclusive data
-  transition(IS, Data_Exclusive, E) {
+  transition({IS,IS_I}, Data_Exclusive, E) {
     u_writeDataFromL2Response;
     hh_xdata_to_l0;
     jj_sendExclusiveUnblock;
@@ -1012,7 +1052,7 @@
     z2_stallAndWaitL2Queue;
   }
 
-  transition({IS, S_IL0, M_IL0, E_IL0, MM_IL0}, {Inv, Fwd_GETX, Fwd_GETS}) {
+  transition({S_IL0, M_IL0, E_IL0, MM_IL0}, {Inv, Fwd_GETX, Fwd_GETS}) {
     z2_stallAndWaitL2Queue;
   }
 }
diff --git a/src/mem/protocol/MESI_Three_Level-msg.sm b/src/mem/protocol/MESI_Three_Level-msg.sm
index 7fe4add..2a5ecc8 100644
--- a/src/mem/protocol/MESI_Three_Level-msg.sm
+++ b/src/mem/protocol/MESI_Three_Level-msg.sm
@@ -46,6 +46,11 @@
   DATA, desc="Data block for L1 cache in S state";
   DATA_EXCLUSIVE, desc="Data block for L1 cache in M/E state";
   ACK, desc="Generic invalidate ack";
+
+  // This is a special case in which the L1 cache lost permissions to the
+  // shared block before it got the data. So the L0 cache can use the data
+  // but not store it.
+  STALE_DATA, desc="Data the L0 cache may use but must not store";
 }
 
 // Class for messages sent between the L0 and the L1 controllers.