cpu: fixed how O3 CPU executes an exit system call

When a thread executed an exit syscall in SE mode, the thread context
was removed immediately in the same cycle, which left inflight squash
operations and trap event incomplete. The problem happened when a new
thread was assigned to the CPU later. The new thread started with some
incomplete transactions of the previous thread (e.g., squashing). This
problem could cause incorrect execution flow for the new thread (i.e.,
pc was not reset properly at the exit point), deadlock (i.e., some
stage-to-stage signals were not reset) and incorrect rename map between
logical and physical registers.

This patch adds a new state called 'Halting' to the thread context and
defers removing thread context from a CPU until a trap event initiated
by an exit syscall execution is processed. This patch also makes sure
that the removal of a thread context happens after all inflight
transactions of the to-be-removed thread in the pipeline complete.

Change-Id: If7ef1462fb8864e22b45371ee7ae67e2a5ad38b8
Reviewed-on: https://gem5-review.googlesource.com/c/8184
Reviewed-by: Giacomo Gabrielli <giacomo.gabrielli@arm.com>
Maintainer: Jason Lowe-Power <jason@lowepower.com>
diff --git a/src/cpu/o3/commit.hh b/src/cpu/o3/commit.hh
index 2d7a23e..4e32f86 100644
--- a/src/cpu/o3/commit.hh
+++ b/src/cpu/o3/commit.hh
@@ -187,6 +187,9 @@
     /** Initializes stage by sending back the number of free entries. */
     void startupStage();
 
+    /** Clear all thread-specific states */
+    void clearStates(ThreadID tid);
+
     /** Initializes the draining of commit. */
     void drain();
 
diff --git a/src/cpu/o3/commit_impl.hh b/src/cpu/o3/commit_impl.hh
index 879fad2..2891ce3 100644
--- a/src/cpu/o3/commit_impl.hh
+++ b/src/cpu/o3/commit_impl.hh
@@ -366,6 +366,22 @@
 
 template <class Impl>
 void
+DefaultCommit<Impl>::clearStates(ThreadID tid)
+{
+    commitStatus[tid] = Idle;
+    changedROBNumEntries[tid] = false;
+    checkEmptyROB[tid] = false;
+    trapInFlight[tid] = false;
+    committedStores[tid] = false;
+    trapSquash[tid] = false;
+    tcSquash[tid] = false;
+    pc[tid].set(0);
+    lastCommitedSeqNum[tid] = 0;
+    squashAfterInst[tid] = NULL;
+}
+
+template <class Impl>
+void
 DefaultCommit<Impl>::drain()
 {
     drainPending = true;
@@ -813,6 +829,13 @@
         if (trapSquash[tid]) {
             assert(!tcSquash[tid]);
             squashFromTrap(tid);
+
+            // If the thread is trying to exit (i.e., an exit syscall was
+            // executed), this trapSquash was originated by the exit
+            // syscall earlier. In this case, schedule an exit event in
+            // the next cycle to fully terminate this thread
+            if (cpu->isThreadExiting(tid))
+                cpu->scheduleThreadExitEvent(tid);
         } else if (tcSquash[tid]) {
             assert(commitStatus[tid] != TrapPending);
             squashFromTC(tid);
diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc
index f5aa9f7..e50741e 100644
--- a/src/cpu/o3/cpu.cc
+++ b/src/cpu/o3/cpu.cc
@@ -143,6 +143,8 @@
       dtb(params->dtb),
       tickEvent([this]{ tick(); }, "FullO3CPU tick",
                 false, Event::CPU_Tick_Pri),
+      threadExitEvent([this]{ exitThreads(); }, "FullO3CPU exit threads",
+                false, Event::CPU_Exit_Pri),
 #ifndef NDEBUG
       instcount(0),
 #endif
@@ -810,7 +812,7 @@
 FullO3CPU<Impl>::haltContext(ThreadID tid)
 {
     //For now, this is the same as deallocate
-    DPRINTF(O3CPU,"[tid:%i]: Halt Context called. Deallocating", tid);
+    DPRINTF(O3CPU,"[tid:%i]: Halt Context called. Deallocating\n", tid);
     assert(!switchedOut());
 
     deactivateThread(tid);
@@ -886,51 +888,20 @@
     // here to alleviate the case for double-freeing registers
     // in SMT workloads.
 
-    // Unbind Int Regs from Rename Map
-    for (RegId reg_id(IntRegClass, 0); reg_id.index() < TheISA::NumIntRegs;
-         reg_id.index()++) {
-        PhysRegIdPtr phys_reg = renameMap[tid].lookup(reg_id);
-        scoreboard.unsetReg(phys_reg);
-        freeList.addReg(phys_reg);
-    }
+    // clear all thread-specific states in each stage of the pipeline
+    // since this thread is going to be completely removed from the CPU
+    commit.clearStates(tid);
+    fetch.clearStates(tid);
+    decode.clearStates(tid);
+    rename.clearStates(tid);
+    iew.clearStates(tid);
 
-    // Unbind Float Regs from Rename Map
-    for (RegId reg_id(FloatRegClass, 0); reg_id.index() < TheISA::NumFloatRegs;
-         reg_id.index()++) {
-        PhysRegIdPtr phys_reg = renameMap[tid].lookup(reg_id);
-        scoreboard.unsetReg(phys_reg);
-        freeList.addReg(phys_reg);
-    }
-
-    // Unbind Float Regs from Rename Map
-    for (unsigned preg = 0; preg < TheISA::NumVecPredRegs; preg++) {
-        PhysRegIdPtr phys_reg = renameMap[tid].lookup(
-            RegId(VecPredRegClass, preg));
-        scoreboard.unsetReg(phys_reg);
-        freeList.addReg(phys_reg);
-    }
-
-    // Unbind condition-code Regs from Rename Map
-    for (RegId reg_id(CCRegClass, 0); reg_id.index() < TheISA::NumCCRegs;
-         reg_id.index()++) {
-        PhysRegIdPtr phys_reg = renameMap[tid].lookup(reg_id);
-        scoreboard.unsetReg(phys_reg);
-        freeList.addReg(phys_reg);
-    }
-
-    // Squash Throughout Pipeline
-    DynInstPtr inst = commit.rob->readHeadInst(tid);
-    InstSeqNum squash_seq_num = inst->seqNum;
-    fetch.squash(0, squash_seq_num, inst, tid);
-    decode.squash(tid);
-    rename.squash(squash_seq_num, tid);
-    iew.squash(tid);
-    iew.ldstQueue.squash(squash_seq_num, tid);
-    commit.rob->squash(squash_seq_num, tid);
-
-
+    // at this step, all instructions in the pipeline should be already
+    // either committed successfully or squashed. All thread-specific
+    // queues in the pipeline must be empty.
     assert(iew.instQueue.getCount(tid) == 0);
     assert(iew.ldstQueue.getCount(tid) == 0);
+    assert(commit.rob->isEmpty(tid));
 
     // Reset ROB/IQ/LSQ Entries
 
@@ -1884,5 +1855,78 @@
     }
 }
 
+template <class Impl>
+void
+FullO3CPU<Impl>::addThreadToExitingList(ThreadID tid)
+{
+    DPRINTF(O3CPU, "Thread %d is inserted to exitingThreads list\n", tid);
+
+    // make sure the thread is Active
+    assert(std::find(activeThreads.begin(), activeThreads.end(), tid)
+                                              != activeThreads.end());
+
+    // make sure the thread has not been added to the list yet
+    assert(exitingThreads.count(tid) == 0);
+
+    // add the thread to exitingThreads list to mark that this thread is
+    // trying to exit. The boolean value in the pair denotes if a thread is
+    // ready to exit. The thread is not ready to exit until the corresponding
+    // exit trap event is processed in the future. Until then, it'll be still
+    // an active thread that is trying to exit.
+    exitingThreads.emplace(std::make_pair(tid, false));
+}
+
+template <class Impl>
+bool
+FullO3CPU<Impl>::isThreadExiting(ThreadID tid) const
+{
+    return exitingThreads.count(tid) == 1;
+}
+
+template <class Impl>
+void
+FullO3CPU<Impl>::scheduleThreadExitEvent(ThreadID tid)
+{
+    assert(exitingThreads.count(tid) == 1);
+
+    // exit trap event has been processed. Now, the thread is ready to exit
+    // and be removed from the CPU.
+    exitingThreads[tid] = true;
+
+    // we schedule a threadExitEvent in the next cycle to properly clean
+    // up the thread's states in the pipeline. threadExitEvent has lower
+    // priority than tickEvent, so the cleanup will happen at the very end
+    // of the next cycle after all pipeline stages complete their operations.
+    // We want all stages to complete squashing instructions before doing
+    // the cleanup.
+    if (!threadExitEvent.scheduled()) {
+        schedule(threadExitEvent, nextCycle());
+    }
+}
+
+template <class Impl>
+void
+FullO3CPU<Impl>::exitThreads()
+{
+    // there must be at least one thread trying to exit
+    assert(exitingThreads.size() > 0);
+
+    // terminate all threads that are ready to exit
+    auto it = exitingThreads.begin();
+    while (it != exitingThreads.end()) {
+        ThreadID thread_id = it->first;
+        bool readyToExit = it->second;
+
+        if (readyToExit) {
+            DPRINTF(O3CPU, "Exiting thread %d\n", thread_id);
+            haltContext(thread_id);
+            tcBase(thread_id)->setStatus(ThreadContext::Halted);
+            it = exitingThreads.erase(it);
+        } else {
+            it++;
+        }
+    }
+}
+
 // Forward declaration of FullO3CPU.
 template class FullO3CPU<O3CPUImpl>;
diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh
index aabac5f..ec6be65 100644
--- a/src/cpu/o3/cpu.hh
+++ b/src/cpu/o3/cpu.hh
@@ -205,6 +205,9 @@
     /** The tick event used for scheduling CPU ticks. */
     EventFunctionWrapper tickEvent;
 
+    /** The exit event used for terminating all ready-to-exit threads */
+    EventFunctionWrapper threadExitEvent;
+
     /** Schedule tick event, regardless of its current state. */
     void scheduleTickEvent(Cycles delay)
     {
@@ -331,6 +334,21 @@
     void serializeThread(CheckpointOut &cp, ThreadID tid) const override;
     void unserializeThread(CheckpointIn &cp, ThreadID tid) override;
 
+    /** Insert tid to the list of threads trying to exit */
+    void addThreadToExitingList(ThreadID tid);
+
+    /** Is the thread trying to exit? */
+    bool isThreadExiting(ThreadID tid) const;
+
+    /**
+     *  If a thread is trying to exit and its corresponding trap event
+     *  has been completed, schedule an event to terminate the thread.
+     */
+    void scheduleThreadExitEvent(ThreadID tid);
+
+    /** Terminate all threads that are ready to exit */
+    void exitThreads();
+
   public:
     /** Executes a syscall.
      * @todo: Determine if this needs to be virtual.
@@ -648,6 +666,13 @@
     /** Active Threads List */
     std::list<ThreadID> activeThreads;
 
+    /**
+     *  This is a list of threads that are trying to exit. Each thread id
+     *  is mapped to a boolean value denoting whether the thread is ready
+     *  to exit.
+     */
+    std::unordered_map<ThreadID, bool> exitingThreads;
+
     /** Integer Register Scoreboard */
     Scoreboard scoreboard;
 
diff --git a/src/cpu/o3/decode.hh b/src/cpu/o3/decode.hh
index a1e29a6..4cd3184 100644
--- a/src/cpu/o3/decode.hh
+++ b/src/cpu/o3/decode.hh
@@ -102,6 +102,10 @@
     DefaultDecode(O3CPU *_cpu, DerivO3CPUParams *params);
 
     void startupStage();
+
+    /** Clear all thread-specific states */
+    void clearStates(ThreadID tid);
+
     void resetStage();
 
     /** Returns the name of decode. */
diff --git a/src/cpu/o3/decode_impl.hh b/src/cpu/o3/decode_impl.hh
index 63b180e..27b3c30 100644
--- a/src/cpu/o3/decode_impl.hh
+++ b/src/cpu/o3/decode_impl.hh
@@ -93,6 +93,14 @@
 
 template<class Impl>
 void
+DefaultDecode<Impl>::clearStates(ThreadID tid)
+{
+    decodeStatus[tid] = Idle;
+    stalls[tid].rename = false;
+}
+
+template<class Impl>
+void
 DefaultDecode<Impl>::resetStage()
 {
     _status = Inactive;
diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh
index ee1932b..3cf0773 100644
--- a/src/cpu/o3/fetch.hh
+++ b/src/cpu/o3/fetch.hh
@@ -216,6 +216,9 @@
     /** Initialize stage. */
     void startupStage();
 
+    /** Clear all thread-specific states*/
+    void clearStates(ThreadID tid);
+
     /** Handles retrying the fetch access. */
     void recvReqRetry();
 
diff --git a/src/cpu/o3/fetch_impl.hh b/src/cpu/o3/fetch_impl.hh
index 73c1ed1..8afe623 100644
--- a/src/cpu/o3/fetch_impl.hh
+++ b/src/cpu/o3/fetch_impl.hh
@@ -328,6 +328,26 @@
 
 template<class Impl>
 void
+DefaultFetch<Impl>::clearStates(ThreadID tid)
+{
+    fetchStatus[tid] = Running;
+    pc[tid] = cpu->pcState(tid);
+    fetchOffset[tid] = 0;
+    macroop[tid] = NULL;
+    delayedCommit[tid] = false;
+    memReq[tid] = NULL;
+    stalls[tid].decode = false;
+    stalls[tid].drain = false;
+    fetchBufferPC[tid] = 0;
+    fetchBufferValid[tid] = false;
+    fetchQueue[tid].clear();
+
+    // TODO not sure what to do with priorityList for now
+    // priorityList.push_back(tid);
+}
+
+template<class Impl>
+void
 DefaultFetch<Impl>::resetStage()
 {
     numInst = 0;
diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh
index b15521f..363c645 100644
--- a/src/cpu/o3/iew.hh
+++ b/src/cpu/o3/iew.hh
@@ -147,6 +147,9 @@
     /** Initializes stage; sends back the number of free IQ and LSQ entries. */
     void startupStage();
 
+    /** Clear all thread-specific states */
+    void clearStates(ThreadID tid);
+
     /** Sets main time buffer used for backwards communication. */
     void setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr);
 
diff --git a/src/cpu/o3/iew_impl.hh b/src/cpu/o3/iew_impl.hh
index 3d5d848..2513896 100644
--- a/src/cpu/o3/iew_impl.hh
+++ b/src/cpu/o3/iew_impl.hh
@@ -325,6 +325,19 @@
 
 template<class Impl>
 void
+DefaultIEW<Impl>::clearStates(ThreadID tid)
+{
+    toRename->iewInfo[tid].usedIQ = true;
+    toRename->iewInfo[tid].freeIQEntries =
+        instQueue.numFreeEntries(tid);
+
+    toRename->iewInfo[tid].usedLSQ = true;
+    toRename->iewInfo[tid].freeLQEntries = ldstQueue.numFreeLoadEntries(tid);
+    toRename->iewInfo[tid].freeSQEntries = ldstQueue.numFreeStoreEntries(tid);
+}
+
+template<class Impl>
+void
 DefaultIEW<Impl>::setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr)
 {
     timeBuffer = tb_ptr;
diff --git a/src/cpu/o3/rename.hh b/src/cpu/o3/rename.hh
index a091c09..5727909 100644
--- a/src/cpu/o3/rename.hh
+++ b/src/cpu/o3/rename.hh
@@ -169,6 +169,9 @@
     /** Initializes variables for the stage. */
     void startupStage();
 
+    /** Clear all thread-specific states */
+    void clearStates(ThreadID tid);
+
     /** Sets pointer to list of active threads. */
     void setActiveThreads(std::list<ThreadID> *at_ptr);
 
diff --git a/src/cpu/o3/rename_impl.hh b/src/cpu/o3/rename_impl.hh
index b63163f..fd9b09e 100644
--- a/src/cpu/o3/rename_impl.hh
+++ b/src/cpu/o3/rename_impl.hh
@@ -256,6 +256,28 @@
 
 template <class Impl>
 void
+DefaultRename<Impl>::clearStates(ThreadID tid)
+{
+    renameStatus[tid] = Idle;
+
+    freeEntries[tid].iqEntries = iew_ptr->instQueue.numFreeEntries(tid);
+    freeEntries[tid].lqEntries = iew_ptr->ldstQueue.numFreeLoadEntries(tid);
+    freeEntries[tid].sqEntries = iew_ptr->ldstQueue.numFreeStoreEntries(tid);
+    freeEntries[tid].robEntries = commit_ptr->numROBFreeEntries(tid);
+    emptyROB[tid] = true;
+
+    stalls[tid].iew = false;
+    serializeInst[tid] = NULL;
+
+    instsInProgress[tid] = 0;
+    loadsInProgress[tid] = 0;
+    storesInProgress[tid] = 0;
+
+    serializeOnNextInst[tid] = false;
+}
+
+template <class Impl>
+void
 DefaultRename<Impl>::resetStage()
 {
     _status = Inactive;
diff --git a/src/cpu/o3/thread_context_impl.hh b/src/cpu/o3/thread_context_impl.hh
index 473e2e2..57e0a45 100644
--- a/src/cpu/o3/thread_context_impl.hh
+++ b/src/cpu/o3/thread_context_impl.hh
@@ -128,11 +128,18 @@
 {
     DPRINTF(O3CPU, "Calling halt on Thread Context %d\n", threadId());
 
-    if (thread->status() == ThreadContext::Halted)
+    if (thread->status() == ThreadContext::Halting ||
+        thread->status() == ThreadContext::Halted)
         return;
 
-    thread->setStatus(ThreadContext::Halted);
-    cpu->haltContext(thread->threadId());
+    // the thread is not going to halt/terminate immediately in this cycle.
+    // The thread will be removed after an exit trap is processed
+    // (e.g., after trapLatency cycles). Until then, the thread's status
+    // will be Halting.
+    thread->setStatus(ThreadContext::Halting);
+
+    // add this thread to the exiting list to mark that it is trying to exit.
+    cpu->addThreadToExitingList(thread->threadId());
 }
 
 template <class Impl>
diff --git a/src/cpu/thread_context.hh b/src/cpu/thread_context.hh
index a570b9a..6b9ff1a 100644
--- a/src/cpu/thread_context.hh
+++ b/src/cpu/thread_context.hh
@@ -111,6 +111,10 @@
         /// synchronization, etc.
         Suspended,
 
+        /// Trying to exit and waiting for an event to completely exit.
+        /// Entered when target executes an exit syscall.
+        Halting,
+
         /// Permanently shut down.  Entered when target executes
         /// m5exit pseudo-instruction.  When all contexts enter
         /// this state, the simulation will terminate.
diff --git a/src/sim/eventq.hh b/src/sim/eventq.hh
index 6e8e633..895f694 100644
--- a/src/sim/eventq.hh
+++ b/src/sim/eventq.hh
@@ -161,6 +161,9 @@
     /// (such as writebacks).
     static const Priority CPU_Tick_Pri =                50;
 
+    /// If we want to exit a thread in a CPU, it comes after CPU_Tick_Pri
+    static const Priority CPU_Exit_Pri =                64;
+
     /// Statistics events (dump, reset, etc.) come after
     /// everything else, but before exit.
     static const Priority Stat_Event_Pri =              90;