base,sim: implement a faster mutex for single thread case

This change applies an atomic variable to check if we really need to
obtain a mutex, and uses a condition variable to notify.

See about 5% improvement in the simulation speed.

Change-Id: I7e165987dcb587b27fae90978b9b3fde6f5563ef
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/34915
Reviewed-by: Andreas Sandberg <andreas.sandberg@arm.com>
Reviewed-by: Richard Cooper <richard.cooper@arm.com>
Reviewed-by: Daniel Carvalho <odanrc@yahoo.com.br>
Reviewed-by: Giacomo Travaglini <giacomo.travaglini@arm.com>
Maintainer: Gabe Black <gabeblack@google.com>
Tested-by: kokoro <noreply+kokoro@google.com>
diff --git a/src/base/SConscript b/src/base/SConscript
index 6514de0..e04d84a 100644
--- a/src/base/SConscript
+++ b/src/base/SConscript
@@ -73,6 +73,7 @@
 GTest('trie.test', 'trie.test.cc')
 Source('types.cc')
 GTest('types.test', 'types.test.cc', 'types.cc')
+GTest('uncontended_mutex.test', 'uncontended_mutex.test.cc')
 
 Source('stats/group.cc')
 Source('stats/text.cc')
diff --git a/src/base/uncontended_mutex.hh b/src/base/uncontended_mutex.hh
new file mode 100644
index 0000000..721712f
--- /dev/null
+++ b/src/base/uncontended_mutex.hh
@@ -0,0 +1,118 @@
+/*
+ * Copyright 2020 Google, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __BASE_UNCONTENDED_MUTEX_HH__
+#define __BASE_UNCONTENDED_MUTEX_HH__
+
+#include <atomic>
+#include <condition_variable>
+#include <mutex>
+
+/*
+ * The std::mutex implementation is slower than expected because of many mode
+ * checking and legacy support.
+ *
+ * The UncontendedMutex uses an atomic flag to check if we really need to
+ * obtain a mutex lock. For most cases without multi-threads event queues,
+ * e.g. non-KVM simulation, this avoid the usage of mutex and speed up the
+ * simulation.
+ */
+class UncontendedMutex
+{
+  private:
+    /*
+     * A flag to record the current status:
+     * 0: no one has the lock
+     * 1: exactly one thread has the lock
+     * >1: one or more threads are waiting for the lock
+     */
+    std::atomic<int> flag;
+    std::mutex m;
+    std::condition_variable cv;
+
+    bool
+    testAndSet(int expected, int desired)
+    {
+        return flag.compare_exchange_strong(expected, desired);
+    }
+
+  public:
+    UncontendedMutex() : flag(0) {}
+
+    void
+    lock()
+    {
+        /*
+         * Here we use 'flag' to check if we are the first thread to get the
+         * lock. If not, we try to obtain the real mutex, and use the condition
+         * variable to wait for the thread who has the lock to release it.
+         *
+         * The flag will be updated to more than 1, so the thread with lock
+         * knows that there is another thread waiting for the lock.
+         */
+        while (!testAndSet(0, 1)) {
+            std::unique_lock<std::mutex> ul(m);
+            /*
+             * It is possible that just before we obtain the mutex lock, the
+             * first thread releases the flag and thus flag becomes zero. In
+             * such case, we shouldn't wait for the condition variable because
+             * there is no the other thread to notify us.
+             */
+            if (flag++ == 0)
+                break;
+            cv.wait(ul);
+        }
+    }
+
+    void
+    unlock()
+    {
+        /* In case there are no other threads waiting, we will just clear the
+         * flag and return.
+         */
+        if (testAndSet(1, 0))
+            return;
+
+        /*
+         * Otherwise, clear the flag and notify all the waiting threads. We
+         * need to protect the flag by mutex here so that there won't be
+         * another thread waiting but the flag is already set to 0.
+         */
+        {
+            std::lock_guard<std::mutex> g(m);
+            flag = 0;
+        }
+        /*
+         * It's possible to update the algorithm and use notify_one() here.
+         * However, tests show that notify_one() is much slower than
+         * notify_all() in this case. Here we choose to use notify_all().
+         */
+        cv.notify_all();
+    }
+};
+
+#endif // __BASE_UNCONTENDED_MUTEX_HH__
diff --git a/src/base/uncontended_mutex.test.cc b/src/base/uncontended_mutex.test.cc
new file mode 100644
index 0000000..6ce929b
--- /dev/null
+++ b/src/base/uncontended_mutex.test.cc
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2020 Google, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <gtest/gtest.h>
+#include <mutex>
+#include <thread>
+#include <vector>
+
+#include "base/uncontended_mutex.hh"
+
+TEST(UncontendedMutex, Lock)
+{
+    int data = 0;
+    UncontendedMutex m;
+
+    std::thread t1([&] () {
+        std::lock_guard<UncontendedMutex> g(m);
+        // Simulate += operation with a racing change between read and write.
+        int tmp = data;
+        std::this_thread::sleep_for(std::chrono::milliseconds(200));
+        data = tmp + 1;
+    });
+
+    std::thread t2([&] () {
+        std::this_thread::sleep_for(std::chrono::milliseconds(100));
+        std::lock_guard<UncontendedMutex> g(m);
+        data = data + 1;
+    });
+
+    std::thread t3([&] () {
+        std::this_thread::sleep_for(std::chrono::milliseconds(100));
+        std::lock_guard<UncontendedMutex> g(m);
+        data = data + 1;
+    });
+    t1.join();
+    t2.join();
+    t3.join();
+
+    EXPECT_EQ(data, 3);
+}
+
+TEST(UncontendedMutex, HeavyContention)
+{
+    int num_of_iter = 1000;
+    int num_of_thread = 1000;
+    std::vector<std::thread> threads;
+
+    int data = 0;
+    UncontendedMutex m;
+
+    for (int t = 0 ; t < num_of_thread; ++t) {
+        threads.emplace_back([&] () {
+            for (int k = 0; k < num_of_iter; ++k) {
+                std::lock_guard<UncontendedMutex> g(m);
+                data++;
+            }
+        });
+    }
+
+    for (auto& t : threads) {
+        t.join();
+    }
+    EXPECT_EQ(data, num_of_iter * num_of_thread);
+}
diff --git a/src/sim/eventq.cc b/src/sim/eventq.cc
index bc4864c..adce51e 100644
--- a/src/sim/eventq.cc
+++ b/src/sim/eventq.cc
@@ -32,6 +32,7 @@
 
 #include <cassert>
 #include <iostream>
+#include <mutex>
 #include <string>
 #include <unordered_map>
 #include <vector>
diff --git a/src/sim/eventq.hh b/src/sim/eventq.hh
index aa54722..45a5ab8 100644
--- a/src/sim/eventq.hh
+++ b/src/sim/eventq.hh
@@ -41,12 +41,12 @@
 #include <functional>
 #include <iosfwd>
 #include <memory>
-#include <mutex>
 #include <string>
 
 #include "base/debug.hh"
 #include "base/flags.hh"
 #include "base/types.hh"
+#include "base/uncontended_mutex.hh"
 #include "debug/Event.hh"
 #include "sim/serialize.hh"
 
@@ -622,7 +622,7 @@
     Tick _curTick;
 
     //! Mutex to protect async queue.
-    std::mutex async_queue_mutex;
+    UncontendedMutex async_queue_mutex;
 
     //! List of events added by other threads to this event queue.
     std::list<Event*> async_queue;
@@ -647,7 +647,7 @@
      * @see EventQueue::lock()
      * @see EventQueue::unlock()
      */
-    std::mutex service_mutex;
+    UncontendedMutex service_mutex;
 
     //! Insert / remove event from the queue. Should only be called
     //! by thread operating this queue.