kvm: Don't handle IO and execute in the same tick

We currently execute instructions in the guest and then handle any IO
request right after we break out of the virtualized environment. This
has the effect of executing IO requests in the exact same tick as the
first instruction in the sequence that was just run. There seem to be
cases where this simplification upsets some timing-sensitive devices.

This changeset splits execute and IO (and other services) across
multiple ticks. This is implemented by adding a separate
RunningService state to the CPU state machine. When a VM requires
service, it enters into this state and pending IO is then serviced in
the future instead of immediately. The delay between getting the
request and servicing it depends on the number of cycles executed in
the guest, which allows other components to catch up with the CPU.
After an IO or MMIO request has been serviced, the CPU enters a
second new state, RunningServiceCompletion, in which KVM is entered
once more so that the result of the request is reported to the guest.
diff --git a/src/cpu/kvm/arm_cpu.cc b/src/cpu/kvm/arm_cpu.cc
index e131202..b0b43d3 100644
--- a/src/cpu/kvm/arm_cpu.cc
+++ b/src/cpu/kvm/arm_cpu.cc
@@ -266,8 +266,8 @@
     kvmArmVCpuInit(KVM_ARM_TARGET_CORTEX_A15);
 }
 
-void
-ArmKvmCPU::tick()
+Tick
+ArmKvmCPU::kvmRun(Tick ticks)
 {
     bool simFIQ(interrupts->checkRaw(INT_FIQ));
     bool simIRQ(interrupts->checkRaw(INT_IRQ));
@@ -283,7 +283,7 @@
         vm.setIRQLine(INTERRUPT_VCPU_IRQ(vcpuID), simIRQ);
     }
 
-    BaseKvmCPU::tick();
+    return BaseKvmCPU::kvmRun(ticks);
 }
 
 void
diff --git a/src/cpu/kvm/arm_cpu.hh b/src/cpu/kvm/arm_cpu.hh
index 786320c..9d9850f 100644
--- a/src/cpu/kvm/arm_cpu.hh
+++ b/src/cpu/kvm/arm_cpu.hh
@@ -89,7 +89,7 @@
 
     typedef std::vector<uint64_t> RegIndexVector;
 
-    void tick();
+    Tick kvmRun(Tick ticks);
 
     void updateKvmState();
     void updateThreadContext();
diff --git a/src/cpu/kvm/base.cc b/src/cpu/kvm/base.cc
index 6ffad82..3bfe44c 100644
--- a/src/cpu/kvm/base.cc
+++ b/src/cpu/kvm/base.cc
@@ -49,6 +49,7 @@
 #include "arch/utility.hh"
 #include "cpu/kvm/base.hh"
 #include "debug/Checkpoint.hh"
+#include "debug/Drain.hh"
 #include "debug/Kvm.hh"
 #include "debug/KvmIO.hh"
 #include "debug/KvmRun.hh"
@@ -56,6 +57,8 @@
 #include "sim/process.hh"
 #include "sim/system.hh"
 
+#include <signal.h>
+
 /* Used by some KVM macros */
 #define PAGE_SIZE pageSize
 
@@ -81,6 +84,7 @@
       tickEvent(*this),
       perfControlledByTimer(params->usePerfOverflow),
       hostFactor(params->hostFactor),
+      drainManager(NULL),
       ctrInsts(0)
 {
     if (pageSize == -1)
@@ -94,7 +98,6 @@
     threadContexts.push_back(tc);
 
     setupCounters();
-    setupSignalHandler();
 
     if (params->usePerfOverflow)
         runTimer.reset(new PerfKvmTimer(hwCycles,
@@ -151,6 +154,10 @@
     // point. Initialize virtual CPUs here instead.
     vcpuFD = vm.createVCPU(vcpuID);
 
+    // Setup signal handlers. This has to be done after the vCPU is
+    // created since it manipulates the vCPU signal mask.
+    setupSignalHandler();
+
     // Map the KVM run structure */
     vcpuMMapSize = kvm.getVCPUMMapSize();
     _kvmRun = (struct kvm_run *)mmap(0, vcpuMMapSize,
@@ -232,9 +239,6 @@
         dump();
     }
 
-    // Update the thread context so we have something to serialize.
-    syncThreadContext();
-
     assert(tid == 0);
     assert(_status == Idle);
     thread->serialize(os);
@@ -258,15 +262,62 @@
     if (switchedOut())
         return 0;
 
-    DPRINTF(Kvm, "drain\n");
+    DPRINTF(Drain, "BaseKvmCPU::drain\n");
+    switch (_status) {
+      case Running:
+        // The base KVM code is normally ready when it is in the
+        // Running state, but the architecture specific code might be
+        // of a different opinion. This may happen when the CPU has been
+        // notified of an event that hasn't been accepted by the vCPU
+        // yet.
+        if (!archIsDrained()) {
+            drainManager = dm;
+            return 1;
+        }
 
-    // De-schedule the tick event so we don't insert any more MMIOs
-    // into the system while it is draining.
-    if (tickEvent.scheduled())
-        deschedule(tickEvent);
+        // The state of the CPU is consistent, so we don't need to do
+        // anything special to drain it. We simply de-schedule the
+        // tick event and enter the Idle state to prevent nasty things
+        // like MMIOs from happening.
+        if (tickEvent.scheduled())
+            deschedule(tickEvent);
+        _status = Idle;
 
-    _status = Idle;
-    return 0;
+        /** FALLTHROUGH */
+      case Idle:
+        // Idle, no need to drain
+        assert(!tickEvent.scheduled());
+
+        // Sync the thread context here since we'll need it when we
+        // switch CPUs or checkpoint the CPU.
+        syncThreadContext();
+
+        return 0;
+
+      case RunningServiceCompletion:
+        // The CPU has just requested a service that was handled in
+        // the RunningService state, but the results have still not
+        // been reported to the CPU. Now, we /could/ probably just
+        // update the register state ourselves instead of letting KVM
+        // handle it, but that would be tricky. Instead, we enter KVM
+        // and let it do its stuff.
+        drainManager = dm;
+
+        DPRINTF(Drain, "KVM CPU is waiting for service completion, "
+                "requesting drain.\n");
+        return 1;
+
+      case RunningService:
+        // We need to drain since the CPU is waiting for service (e.g., MMIOs)
+        drainManager = dm;
+
+        DPRINTF(Drain, "KVM CPU is waiting for service, requesting drain.\n");
+        return 1;
+
+      default:
+        panic("KVM: Unhandled CPU state in drain()\n");
+        return 0;
+    }
 }
 
 void
@@ -297,10 +348,6 @@
 {
     DPRINTF(Kvm, "switchOut\n");
 
-    // Make sure to update the thread context in case, the new CPU
-    // will need to access it.
-    syncThreadContext();
-
     BaseCPU::switchOut();
 
     // We should have drained prior to executing a switchOut, which
@@ -324,9 +371,12 @@
     assert(_status == Idle);
     assert(threadContexts.size() == 1);
 
-    // The BaseCPU updated the thread context, make sure that we
-    // synchronize next time we enter start the CPU.
-    threadContextDirty = true;
+    // Force an update of the KVM state here instead of flagging the
+    // TC as dirty. This is not ideal from a performance point of
+    // view, but it makes debugging easier as it allows meaningful KVM
+    // state to be dumped before and after a takeover.
+    updateKvmState();
+    threadContextDirty = false;
 }
 
 void
@@ -436,25 +486,73 @@
 void
 BaseKvmCPU::tick()
 {
-    assert(_status == Running);
-
-    DPRINTF(KvmRun, "Entering KVM...\n");
-
-    Tick ticksToExecute(mainEventQueue.nextTick() - curTick());
-    Tick ticksExecuted(kvmRun(ticksToExecute));
-
-    Tick delay(ticksExecuted + handleKvmExit());
+    Tick delay(0);
+    assert(_status != Idle);
 
     switch (_status) {
-      case Running:
-        schedule(tickEvent, clockEdge(ticksToCycles(delay)));
+      case RunningService:
+        // handleKvmExit() will determine the next state of the CPU
+        delay = handleKvmExit();
+
+        if (tryDrain())
+            _status = Idle;
         break;
 
+      case RunningServiceCompletion:
+      case Running: {
+          Tick ticksToExecute(mainEventQueue.nextTick() - curTick());
+
+          // We might need to update the KVM state.
+          syncKvmState();
+
+          DPRINTF(KvmRun, "Entering KVM...\n");
+          if (drainManager) {
+              // Force an immediate exit from KVM after completing
+              // pending operations. The architecture-specific code
+              // takes care to run until it is in a state where it can
+              // safely be drained.
+              delay = kvmRunDrain();
+          } else {
+              delay = kvmRun(ticksToExecute);
+          }
+
+          // Entering into KVM implies that we'll have to reload the thread
+          // context from KVM if we want to access it. Flag the KVM state as
+          // dirty with respect to the cached thread context.
+          kvmStateDirty = true;
+
+          // Enter into the RunningService state unless the
+          // simulation was stopped by a timer.
+          if (_kvmRun->exit_reason !=  KVM_EXIT_INTR)
+              _status = RunningService;
+          else
+              _status = Running;
+
+          if (tryDrain())
+              _status = Idle;
+      } break;
+
       default:
-        /* The CPU is halted or waiting for an interrupt from a
-         * device. Don't start it. */
-        break;
+        panic("BaseKvmCPU entered tick() in an illegal state (%i)\n",
+              _status);
     }
+
+    // Schedule a new tick if we are still running
+    if (_status != Idle)
+        schedule(tickEvent, clockEdge(ticksToCycles(delay)));
+}
+
+Tick
+BaseKvmCPU::kvmRunDrain()
+{
+    // By default, the only thing we need to drain is a pending IO
+    // operation which assumes that we are in the
+    // RunningServiceCompletion state.
+    assert(_status == RunningServiceCompletion);
+
+    // Deliver the data from the pending IO operation and immediately
+    // exit.
+    return kvmRun(0);
 }
 
 uint64_t
@@ -466,68 +564,91 @@
 Tick
 BaseKvmCPU::kvmRun(Tick ticks)
 {
-    // We might need to update the KVM state.
-    syncKvmState();
-    // Entering into KVM implies that we'll have to reload the thread
-    // context from KVM if we want to access it. Flag the KVM state as
-    // dirty with respect to the cached thread context.
-    kvmStateDirty = true;
-
-    if (ticks < runTimer->resolution()) {
-        DPRINTF(KvmRun, "KVM: Adjusting tick count (%i -> %i)\n",
-                ticks, runTimer->resolution());
-        ticks = runTimer->resolution();
-    }
-
+    Tick ticksExecuted;
     DPRINTF(KvmRun, "KVM: Executing for %i ticks\n", ticks);
     timerOverflowed = false;
 
-    // Get hardware statistics after synchronizing contexts. The KVM
-    // state update might affect guest cycle counters.
-    uint64_t baseCycles(getHostCycles());
-    uint64_t baseInstrs(hwInstructions.read());
+    if (ticks == 0) {
+        // Setting ticks == 0 is a special case which causes an entry
+        // into KVM that finishes pending operations (e.g., IO) and
+        // then immediately exits.
+        DPRINTF(KvmRun, "KVM: Delivering IO without full guest entry\n");
 
-    // Arm the run timer and start the cycle timer if it isn't
-    // controlled by the overflow timer. Starting/stopping the cycle
-    // timer automatically starts the other perf timers as they are in
-    // the same counter group.
-    runTimer->arm(ticks);
-    if (!perfControlledByTimer)
-        hwCycles.start();
+        // This signal is always masked while we are executing in gem5
+        // and gets unmasked temporarily as soon as we enter into
+        // KVM. See setSignalMask() and setupSignalHandler().
+        raise(KVM_TIMER_SIGNAL);
 
-    if (ioctl(KVM_RUN) == -1) {
-        if (errno != EINTR)
-            panic("KVM: Failed to start virtual CPU (errno: %i)\n",
-                  errno);
+        // Enter into KVM. KVM will check for signals after completing
+        // pending operations (IO). Since the KVM_TIMER_SIGNAL is
+        // pending, this forces an immediate exit into gem5 again. We
+        // don't bother to setup timers since this shouldn't actually
+        // execute any code in the guest.
+        ioctlRun();
+
+        // We always execute at least one cycle to prevent
+        // BaseKvmCPU::tick() from being rescheduled on the same tick
+        // twice.
+        ticksExecuted = clockPeriod();
+    } else {
+        if (ticks < runTimer->resolution()) {
+            DPRINTF(KvmRun, "KVM: Adjusting tick count (%i -> %i)\n",
+                    ticks, runTimer->resolution());
+            ticks = runTimer->resolution();
+        }
+
+        // Get hardware statistics after synchronizing contexts. The KVM
+        // state update might affect guest cycle counters.
+        uint64_t baseCycles(getHostCycles());
+        uint64_t baseInstrs(hwInstructions.read());
+
+        // Arm the run timer and start the cycle timer if it isn't
+        // controlled by the overflow timer. Starting/stopping the cycle
+        // timer automatically starts the other perf timers as they are in
+        // the same counter group.
+        runTimer->arm(ticks);
+        if (!perfControlledByTimer)
+            hwCycles.start();
+
+        ioctlRun();
+
+        runTimer->disarm();
+        if (!perfControlledByTimer)
+            hwCycles.stop();
+
+        // The timer signal may have been delivered after we exited
+        // from KVM. It will be pending in that case since it is
+        // masked when we aren't executing in KVM. Discard it to make
+        // sure we don't deliver it immediately next time we try to
+        // enter into KVM.
+        discardPendingSignal(KVM_TIMER_SIGNAL);
+
+        const uint64_t hostCyclesExecuted(getHostCycles() - baseCycles);
+        const uint64_t simCyclesExecuted(hostCyclesExecuted * hostFactor);
+        const uint64_t instsExecuted(hwInstructions.read() - baseInstrs);
+        ticksExecuted = runTimer->ticksFromHostCycles(hostCyclesExecuted);
+
+        if (ticksExecuted < ticks &&
+            timerOverflowed &&
+            _kvmRun->exit_reason == KVM_EXIT_INTR) {
+            // TODO: We should probably do something clever here...
+            warn("KVM: Early timer event, requested %i ticks but got %i ticks.\n",
+                 ticks, ticksExecuted);
+        }
+
+        /* Update statistics */
+        numCycles += simCyclesExecuted;;
+        numInsts += instsExecuted;
+        ctrInsts += instsExecuted;
+        system->totalNumInsts += instsExecuted;
+
+        DPRINTF(KvmRun,
+                "KVM: Executed %i instructions in %i cycles "
+                "(%i ticks, sim cycles: %i).\n",
+                instsExecuted, hostCyclesExecuted, ticksExecuted, simCyclesExecuted);
     }
 
-    runTimer->disarm();
-    if (!perfControlledByTimer)
-        hwCycles.stop();
-
-
-    const uint64_t hostCyclesExecuted(getHostCycles() - baseCycles);
-    const uint64_t simCyclesExecuted(hostCyclesExecuted * hostFactor);
-    const uint64_t instsExecuted(hwInstructions.read() - baseInstrs);
-    const Tick ticksExecuted(runTimer->ticksFromHostCycles(hostCyclesExecuted));
-
-    if (ticksExecuted < ticks &&
-        timerOverflowed &&
-        _kvmRun->exit_reason == KVM_EXIT_INTR) {
-        // TODO: We should probably do something clever here...
-        warn("KVM: Early timer event, requested %i ticks but got %i ticks.\n",
-             ticks, ticksExecuted);
-    }
-
-    /* Update statistics */
-    numCycles += simCyclesExecuted;;
     ++numVMExits;
-    numInsts += instsExecuted;
-    ctrInsts += instsExecuted;
-    system->totalNumInsts += instsExecuted;
-
-    DPRINTF(KvmRun, "KVM: Executed %i instructions in %i cycles (%i ticks, sim cycles: %i).\n",
-            instsExecuted, hostCyclesExecuted, ticksExecuted, simCyclesExecuted);
 
     return ticksExecuted + flushCoalescedMMIO();
 }
@@ -700,7 +821,11 @@
 BaseKvmCPU::handleKvmExit()
 {
     DPRINTF(KvmRun, "handleKvmExit (exit_reason: %i)\n", _kvmRun->exit_reason);
+    assert(_status == RunningService);
 
+    // Switch into the running state by default. Individual handlers
+    // can override this.
+    _status = Running;
     switch (_kvmRun->exit_reason) {
       case KVM_EXIT_UNKNOWN:
         return handleKvmExitUnknown();
@@ -709,6 +834,7 @@
         return handleKvmExitException();
 
       case KVM_EXIT_IO:
+        _status = RunningServiceCompletion;
         ++numIO;
         return handleKvmExitIO();
 
@@ -728,6 +854,7 @@
         return 0;
 
       case KVM_EXIT_MMIO:
+        _status = RunningServiceCompletion;
         /* Service memory mapped IO requests */
         DPRINTF(KvmIO, "KVM: Handling MMIO (w: %u, addr: 0x%x, len: %u)\n",
                 _kvmRun->mmio.is_write,
@@ -816,6 +943,27 @@
     return dataPort.sendAtomic(&pkt);
 }
 
+void
+BaseKvmCPU::setSignalMask(const sigset_t *mask)
+{
+    std::unique_ptr<struct kvm_signal_mask> kvm_mask;
+
+    if (mask) {
+        kvm_mask.reset((struct kvm_signal_mask *)operator new(
+                           sizeof(struct kvm_signal_mask) + sizeof(*mask)));
+        // The kernel and the user-space headers have different ideas
+        // about the size of sigset_t. This seems like a massive hack,
+        // but is actually what qemu does.
+        assert(sizeof(*mask) >= 8);
+        kvm_mask->len = 8;
+        memcpy(kvm_mask->sigset, mask, kvm_mask->len);
+    }
+
+    if (ioctl(KVM_SET_SIGNAL_MASK, (void *)kvm_mask.get()) == -1)
+        panic("KVM: Failed to set vCPU signal mask (errno: %i)\n",
+              errno);
+}
+
 int
 BaseKvmCPU::ioctl(int request, long p1) const
 {
@@ -862,6 +1010,50 @@
     sa.sa_flags = SA_SIGINFO | SA_RESTART;
     if (sigaction(KVM_TIMER_SIGNAL, &sa, NULL) == -1)
         panic("KVM: Failed to setup vCPU signal handler\n");
+
+    sigset_t sigset;
+    if (sigprocmask(SIG_BLOCK, NULL, &sigset) == -1)
+        panic("KVM: Failed get signal mask\n");
+
+    // Request KVM to setup the same signal mask as we're currently
+    // running with. We'll sometimes need to mask the KVM_TIMER_SIGNAL
+    // to cause immediate exits from KVM after servicing IO
+    // requests. See kvmRun().
+    setSignalMask(&sigset);
+
+    // Mask the KVM_TIMER_SIGNAL so it isn't delivered unless we're
+    // actually executing inside KVM.
+    sigaddset(&sigset, KVM_TIMER_SIGNAL);
+    if (sigprocmask(SIG_SETMASK, &sigset, NULL) == -1)
+        panic("KVM: Failed mask the KVM timer signal\n");
+}
+
+bool
+BaseKvmCPU::discardPendingSignal(int signum) const
+{
+    int discardedSignal;
+
+    // Setting the timeout to zero causes sigtimedwait to return
+    // immediately.
+    struct timespec timeout;
+    timeout.tv_sec = 0;
+    timeout.tv_nsec = 0;
+
+    sigset_t sigset;
+    sigemptyset(&sigset);
+    sigaddset(&sigset, signum);
+
+    do {
+        discardedSignal = sigtimedwait(&sigset, NULL, &timeout);
+    } while (discardedSignal == -1 && errno == EINTR);
+
+    if (discardedSignal == signum)
+        return true;
+    else if (discardedSignal == -1 && errno == EAGAIN)
+        return false;
+    else
+        panic("Unexpected return value from sigtimedwait: %i (errno: %i)\n",
+              discardedSignal, errno);
 }
 
 void
@@ -893,3 +1085,36 @@
                           0, // TID (0 => currentThread)
                           hwCycles);
 }
+
+bool
+BaseKvmCPU::tryDrain()
+{
+    if (!drainManager)
+        return false;
+
+    if (!archIsDrained()) {
+        DPRINTF(Drain, "tryDrain: Architecture code is not ready.\n");
+        return false;
+    }
+
+    if (_status == Idle || _status == Running) {
+        DPRINTF(Drain,
+                "tryDrain: CPU transitioned into the Idle state, drain done\n");
+        drainManager->signalDrainDone();
+        drainManager = NULL;
+        return true;
+    } else {
+        DPRINTF(Drain, "tryDrain: CPU not ready.\n");
+        return false;
+    }
+}
+
+void
+BaseKvmCPU::ioctlRun()
+{
+    if (ioctl(KVM_RUN) == -1) {
+        if (errno != EINTR)
+            panic("KVM: Failed to start virtual CPU (errno: %i)\n",
+                  errno);
+    }
+}
diff --git a/src/cpu/kvm/base.hh b/src/cpu/kvm/base.hh
index 81b24a3..42a7eca 100644
--- a/src/cpu/kvm/base.hh
+++ b/src/cpu/kvm/base.hh
@@ -41,6 +41,7 @@
 #define __CPU_KVM_BASE_HH__
 
 #include <memory>
+#include <csignal>
 
 #include "base/statistics.hh"
 #include "cpu/kvm/perfevent.hh"
@@ -133,11 +134,67 @@
     KvmVM &vm;
 
   protected:
+    /**
+     *
+     * @dot
+     *   digraph {
+     *     Idle;
+     *     Running;
+     *     RunningService;
+     *     RunningServiceCompletion;
+     *
+     *     Idle -> Idle;
+     *     Idle -> Running [label="activateContext()", URL="\ref activateContext"];
+     *     Running -> Running [label="tick()", URL="\ref tick"];
+     *     Running -> RunningService [label="tick()", URL="\ref tick"];
+     *     Running -> Idle [label="suspendContext()", URL="\ref suspendContext"];
+     *     Running -> Idle [label="drain()", URL="\ref drain"];
+     *     Idle -> Running [label="drainResume()", URL="\ref drainResume"];
+     *     RunningService -> RunningServiceCompletion [label="handleKvmExit()", URL="\ref handleKvmExit"];
+     *     RunningServiceCompletion -> Running [label="tick()", URL="\ref tick"];
+     *     RunningServiceCompletion -> RunningService [label="tick()", URL="\ref tick"];
+     *   }
+     * @enddot
+     */
     enum Status {
-        /** Context not scheduled in KVM */
+        /** Context not scheduled in KVM.
+         *
+         * The CPU generally enters this state when the guest executes
+         * an instruction that halts the CPU (e.g., WFI on ARM or HLT
+         * on X86) if KVM traps this instruction. Ticks are not
+         * scheduled in this state.
+         *
+         * @see suspendContext()
+         */
         Idle,
-        /** Running normally */
+        /** Running normally.
+         *
+         * This is the normal run state of the CPU. KVM will be
+         * entered next time tick() is called.
+         */
         Running,
+        /** Requiring service at the beginning of the next cycle.
+         *
+         * The virtual machine has exited and requires service, tick()
+         * will call handleKvmExit() on the next cycle. The next state
+         * after running service is determined in handleKvmExit() and
+         * depends on what kind of service the guest requested:
+         * <ul>
+         *   <li>IO/MMIO: RunningServiceCompletion
+         *   <li>Halt: Idle
+         *   <li>Others: Running
+         * </ul>
+         */
+        RunningService,
+        /** Service completion in progress.
+         *
+         * The VM has requested service that requires KVM to be
+         * entered once in order to get to a consistent state. This
+         * happens in handleKvmExit() or one of its friends after IO
+         * exits. After executing tick(), the CPU will transition into
+         * the Running or RunningService state.
+         */
+        RunningServiceCompletion,
     };
 
     /** CPU run state */
@@ -146,12 +203,8 @@
     /**
      * Execute the CPU until the next event in the main event queue or
      * until the guest needs service from gem5.
-     *
-     * @note This method is virtual in order to allow implementations
-     * to check for architecture specific events (e.g., interrupts)
-     * before entering the VM.
      */
-    virtual void tick();
+    void tick();
 
     /**
      * Get the value of the hardware cycle counter in the guest.
@@ -177,10 +230,32 @@
      * can, for example, occur when the guest executes MMIO. A larger
      * number is typically due to performance counter inaccuracies.
      *
-     * @param ticks Number of ticks to execute
+     * @note This method is virtual in order to allow implementations
+     * to check for architecture specific events (e.g., interrupts)
+     * before entering the VM.
+     *
+     * @note It is the responsibility of the caller (normally tick()) to
+     * make sure that the KVM state is synchronized and that the TC is
+     * invalidated after entering KVM.
+     *
+     * @param ticks Number of ticks to execute, set to 0 to exit
+     * immediately after finishing pending operations.
      * @return Number of ticks executed (see note)
      */
-    Tick kvmRun(Tick ticks);
+    virtual Tick kvmRun(Tick ticks);
+
+    /**
+     * Request the CPU to run until draining completes.
+     *
+     * This function normally calls kvmRun(0) to make KVM finish
+     * pending MMIO operations. Architectures implementing
+     * archIsDrained() must override this method.
+     *
+     * @see BaseKvmCPU::archIsDrained()
+     *
+     * @return Number of ticks executed
+     */
+    virtual Tick kvmRunDrain();
 
     /**
      * Get a pointer to the kvm_run structure containing all the input
@@ -385,6 +460,24 @@
     /** @} */
 
     /**
+     * Is the architecture specific code in a state that prevents
+     * draining?
+     *
+     * This method should return false if there are any pending events
+     * in the guest vCPU that won't be carried over to the gem5 state
+     * and thus will prevent correct checkpointing or CPU handover. It
+     * might, for example, check for pending interrupts that have been
+     * passed to the vCPU but not acknowledged by the OS. Architectures
+     * implementing this method <i>must</i> override
+     * kvmRunDrain().
+     *
+     * @see BaseKvmCPU::kvmRunDrain()
+     *
+     * @return true if the vCPU is drained, false otherwise.
+     */
+    virtual bool archIsDrained() const { return true; }
+
+    /**
      * Inject a memory mapped IO request into gem5
      *
      * @param paddr Physical address
@@ -395,6 +488,21 @@
      */
     Tick doMMIOAccess(Addr paddr, void *data, int size, bool write);
 
+    /** @{ */
+    /**
+     * Set the signal mask used in kvmRun()
+     *
+     * This method allows the signal mask of the thread executing
+     * kvmRun() to be overridden inside the actual system call. This
+     * allows us to mask timer signals used to force KVM exits while
+     * in gem5.
+     *
+     * The signal mask can be disabled by setting it to NULL.
+     *
+     * @param mask Signals to mask
+     */
+    void setSignalMask(const sigset_t *mask);
+    /** @} */
 
     /**
      * @addtogroup KvmIoctl
@@ -499,9 +607,23 @@
      */
     void setupSignalHandler();
 
+    /**
+     * Discard a (potentially) pending signal.
+     *
+     * @param signum Signal to discard
+     * @return true if the signal was pending, false otherwise.
+     */
+    bool discardPendingSignal(int signum) const;
+
     /** Setup hardware performance counters */
     void setupCounters();
 
+    /** Try to drain the CPU if a drain is pending */
+    bool tryDrain();
+
+    /** Execute the KVM_RUN ioctl */
+    void ioctlRun();
+
     /** KVM vCPU file descriptor */
     int vcpuFD;
     /** Size of MMAPed kvm_run area */
@@ -550,6 +672,13 @@
 
     float hostFactor;
 
+    /**
+     * Drain manager to use when signaling drain completion
+     *
+     * This pointer is non-NULL when draining and NULL otherwise.
+     */
+    DrainManager *drainManager;
+
   public:
     /* @{ */
     Stats::Scalar numInsts;