util-gem5art: Simplify rerunning failing runs

Often, a gem5 experiment will fail for some reason. This happens
non-deterministically when fastforwarding with KVM making it more
difficult to handle.

This change allows the user to specify a function `check_failure` to
check to see if the test has failed. An example would be to open the
terminal and check to see if the kernel has panicked.

Additionally, this change adds a rerun function to rerun a particular
run that has failed.

Change-Id: Ib4a8d47c824254ae89ac9e1593ebd2710e263146
Signed-off-by: Jason Lowe-Power <jason@lowepower.com>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/47464
Reviewed-by: Hoa Nguyen <hoanguyen@ucdavis.edu>
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Bobby R. Bruce <bbruce@ucdavis.edu>
Tested-by: kokoro <noreply+kokoro@google.com>
diff --git a/util/gem5art/run/gem5art/run.py b/util/gem5art/run/gem5art/run.py
index c367f6d..a32d899 100644
--- a/util/gem5art/run/gem5art/run.py
+++ b/util/gem5art/run/gem5art/run.py
@@ -39,7 +39,7 @@
 import signal
 import subprocess
 import time
-from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 from uuid import UUID, uuid4
 import zipfile
 
@@ -64,6 +64,7 @@
     run_script_git_artifact: Artifact
     params: Tuple[str, ...]
     timeout: int
+    check_failure: Callable[["gem5Run"], bool]
 
     gem5_name: str
     script_name: str
@@ -93,6 +94,8 @@
     results: Optional[Artifact]
     artifacts: List[Artifact]
 
+    rerunnable: bool
+
     @classmethod
     def _create(
         cls,
@@ -105,6 +108,7 @@
         run_script_git_artifact: Artifact,
         params: Tuple[str, ...],
         timeout: int,
+        check_failure: Callable[["gem5Run"], bool],
     ) -> "gem5Run":
         """
         Shared code between SE and FS when creating a run object.
@@ -119,6 +123,9 @@
         run.params = params
         run.timeout = timeout
 
+        # Note: Mypy doesn't support monkey patching like this
+        run.check_failure = check_failure  # type: ignore
+
         run._id = uuid4()
 
         run.outdir = outdir.resolve()  # ensure this is absolute
@@ -142,6 +149,8 @@
         # Initially, there are no results
         run.results = None
 
+        run.rerunnable = False
+
         return run
 
     @classmethod
@@ -156,6 +165,7 @@
         run_script_git_artifact: Artifact,
         *params: str,
         timeout: int = 60 * 15,
+        check_failure: Callable[["gem5Run"], bool] = lambda run: False,
     ) -> "gem5Run":
         """
         name is the name of the run. The name is not necessarily unique. The
@@ -186,6 +196,7 @@
             run_script_git_artifact,
             params,
             timeout,
+            check_failure,
         )
 
         run.artifacts = [
@@ -230,6 +241,7 @@
         disk_image_artifact: Artifact,
         *params: str,
         timeout: int = 60 * 15,
+        check_failure: Callable[["gem5Run"], bool] = lambda run: False,
     ) -> "gem5Run":
         """
         name is the name of the run. The name is not necessarily unique. The
@@ -244,6 +256,10 @@
         Further parameters can be passed via extra arguments. These
         parameters will be passed in order to the gem5 run script.
 
+        check_failure is a user-defined function that will be executed
+        periodically (e.g., every 10 seconds) to check the health of the
+        simulation. When it returns True, the simulation will be killed
+
         Note: When instantiating this class for the first time, it will create
         a file `info.json` in the outdir which contains a serialized version
         of this class.
@@ -259,6 +275,7 @@
             run_script_git_artifact,
             params,
             timeout,
+            check_failure,
         )
         run.linux_binary = Path(linux_binary)
         run.disk_image = Path(disk_image)
@@ -397,6 +414,10 @@
         # Remove list of artifacts
         del d["artifacts"]
 
+        # Doesn't make sense to serialize the user-specified fail function
+        if "check_failure" in d.keys():
+            del d["check_failure"]
+
         # Replace the artifacts with their UUIDs
         for k, v in d.items():
             if isinstance(v, Artifact):
@@ -438,7 +459,7 @@
         d = self._convertForJson(self._getSerializable())
         return json.dumps(d)
 
-    def run(self, task: Any = None, cwd: str = ".") -> None:
+    def _run(self, task: Any = None, cwd: str = ".") -> None:
         """Actually run the test.
 
         Calls Popen with the command to fork a new process.
@@ -452,11 +473,8 @@
         process to run in a different directory than the running process. Note
         that only the spawned process runs in the new directory.
         """
-        # Check if the run is already in the database
+        # Connect to the database
         db = artifact.getDBConnection()
-        if self.hash in db:
-            print(f"Error: Have already run {self.command}. Exiting!")
-            return
 
         self.status = "Begin run"
         self.dumpJson("info.json")
@@ -503,6 +521,15 @@
                 proc.kill()
                 self.kill_reason = "kernel panic"
 
+            # Assigning a function/lambda to an object variable does not make
+            # the function/lambda become a bound one. Therefore, the
+            # user-defined function must pass `self` in.
+            # Here, mypy classifies self.check_failure() as a bound function,
+            # so we tell mypy to ignore it./
+            if self.check_failure(self):  # type: ignore
+                proc.kill()
+                self.kill_reason = "User defined kill"
+
             self.dumpJson("info.json")
 
             # Check again in five seconds
@@ -529,6 +556,44 @@
 
         print("Done storing the results of {}".format(" ".join(self.command)))
 
+    def run(self, task: Any = None, cwd: str = ".") -> None:
+        """Actually run the test.
+
+        Calls Popen with the command to fork a new process.
+        Then, this function polls the process every 5 seconds to check if it
+        has finished or not. Each time it checks, it dumps the json info so
+        other applications can poll those files.
+
+        task is the celery task that is running this gem5 instance.
+
+        cwd is the directory to change to before running. This allows a server
+        process to run in a different directory than the running process. Note
+        that only the spawned process runs in the new directory.
+        """
+        # Check if the run is already in the database
+        db = artifact.getDBConnection()
+        if self.hash in db:
+            print(f"Error: Have already run {self.command}. Exiting!")
+            return
+        self._run(task, cwd)
+
+    def rerun(self, task: Any = None, cwd: str = ".") -> None:
+        """Rerun the test.
+
+        Calls Popen with the command to fork a new process.
+        Then, this function polls the process every 5 seconds to check if it
+        has finished or not. Each time it checks, it dumps the json info so
+        other applications can poll those files.
+
+        task is the celery task that is running this gem5 instance.
+
+        cwd is the directory to change to before running. This allows a server
+        process to run in a different directory than the running process. Note
+        that only the spawned process runs in the new directory.
+        """
+        # TODO: remove the old runs?
+        self._run(task, cwd)
+
     def saveResults(self) -> None:
         """Zip up the output directory and store the results in the
         database."""
@@ -616,3 +681,20 @@
 
     for run in fsruns:
         yield gem5Run.loadFromDict(run)
+
+
+def getRerunnableRunsByNameLike(
+    db: ArtifactDB, name: str, fs_only: bool = False, limit: int = 0
+) -> Iterable[gem5Run]:
+
+    """Returns a generator of gem5Run objects having rerunnable as true
+    and the object "name" containing the name parameter as a substring. The
+    parameter is case sensitive.
+
+    If fs_only is True, then only full system runs will be returned.
+    Limit specifies the maximum number of runs to return.
+    """
+
+    for run in getRunsByNameLike(db, name, fs_only, limit):
+        if run.rerunnable:
+            yield run