util-gem5art: Simplify rerunning failing runs
Often, a gem5 experiment will fail for some reason. This happens
non-deterministically when fast-forwarding with KVM, which makes these
failures more difficult to handle.
This change allows the user to specify a function `check_failure` that
checks whether the test has failed. An example would be to open the
terminal output and check whether the kernel has panicked.
Additionally, this change adds a rerun function to rerun a particular
run that has failed.
Change-Id: Ib4a8d47c824254ae89ac9e1593ebd2710e263146
Signed-off-by: Jason Lowe-Power <jason@lowepower.com>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/47464
Reviewed-by: Hoa Nguyen <hoanguyen@ucdavis.edu>
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Bobby R. Bruce <bbruce@ucdavis.edu>
Tested-by: kokoro <noreply+kokoro@google.com>
diff --git a/util/gem5art/run/gem5art/run.py b/util/gem5art/run/gem5art/run.py
index c367f6d..a32d899 100644
--- a/util/gem5art/run/gem5art/run.py
+++ b/util/gem5art/run/gem5art/run.py
@@ -39,7 +39,7 @@
import signal
import subprocess
import time
-from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
from uuid import UUID, uuid4
import zipfile
@@ -64,6 +64,7 @@
run_script_git_artifact: Artifact
params: Tuple[str, ...]
timeout: int
+ check_failure: Callable[["gem5Run"], bool]
gem5_name: str
script_name: str
@@ -93,6 +94,8 @@
results: Optional[Artifact]
artifacts: List[Artifact]
+ rerunnable: bool
+
@classmethod
def _create(
cls,
@@ -105,6 +108,7 @@
run_script_git_artifact: Artifact,
params: Tuple[str, ...],
timeout: int,
+ check_failure: Callable[["gem5Run"], bool],
) -> "gem5Run":
"""
Shared code between SE and FS when creating a run object.
@@ -119,6 +123,9 @@
run.params = params
run.timeout = timeout
+ # Note: Mypy doesn't support monkey patching like this
+ run.check_failure = check_failure # type: ignore
+
run._id = uuid4()
run.outdir = outdir.resolve() # ensure this is absolute
@@ -142,6 +149,8 @@
# Initially, there are no results
run.results = None
+ run.rerunnable = False
+
return run
@classmethod
@@ -156,6 +165,7 @@
run_script_git_artifact: Artifact,
*params: str,
timeout: int = 60 * 15,
+ check_failure: Callable[["gem5Run"], bool] = lambda run: False,
) -> "gem5Run":
"""
name is the name of the run. The name is not necessarily unique. The
@@ -186,6 +196,7 @@
run_script_git_artifact,
params,
timeout,
+ check_failure,
)
run.artifacts = [
@@ -230,6 +241,7 @@
disk_image_artifact: Artifact,
*params: str,
timeout: int = 60 * 15,
+ check_failure: Callable[["gem5Run"], bool] = lambda run: False,
) -> "gem5Run":
"""
name is the name of the run. The name is not necessarily unique. The
@@ -244,6 +256,10 @@
Further parameters can be passed via extra arguments. These
parameters will be passed in order to the gem5 run script.
+ check_failure is a user-defined function that will be executed
+ periodically (e.g., every 5 seconds) to check the health of the
+ simulation. When it returns True, the simulation will be killed.
+
Note: When instantiating this class for the first time, it will create
a file `info.json` in the outdir which contains a serialized version
of this class.
@@ -259,6 +275,7 @@
run_script_git_artifact,
params,
timeout,
+ check_failure,
)
run.linux_binary = Path(linux_binary)
run.disk_image = Path(disk_image)
@@ -397,6 +414,10 @@
# Remove list of artifacts
del d["artifacts"]
+ # Doesn't make sense to serialize the user-specified fail function
+ if "check_failure" in d.keys():
+ del d["check_failure"]
+
# Replace the artifacts with their UUIDs
for k, v in d.items():
if isinstance(v, Artifact):
@@ -438,7 +459,7 @@
d = self._convertForJson(self._getSerializable())
return json.dumps(d)
- def run(self, task: Any = None, cwd: str = ".") -> None:
+ def _run(self, task: Any = None, cwd: str = ".") -> None:
"""Actually run the test.
Calls Popen with the command to fork a new process.
@@ -452,11 +473,8 @@
process to run in a different directory than the running process. Note
that only the spawned process runs in the new directory.
"""
- # Check if the run is already in the database
+ # Connect to the database
db = artifact.getDBConnection()
- if self.hash in db:
- print(f"Error: Have already run {self.command}. Exiting!")
- return
self.status = "Begin run"
self.dumpJson("info.json")
@@ -503,6 +521,15 @@
proc.kill()
self.kill_reason = "kernel panic"
+ # Assigning a function/lambda to an instance attribute does not make
+ # it a bound method. Therefore, `self` must be passed explicitly to
+ # the user-defined function.
+ # Here, mypy classifies self.check_failure() as a bound method,
+ # so we tell mypy to ignore it.
+ if self.check_failure(self): # type: ignore
+ proc.kill()
+ self.kill_reason = "User defined kill"
+
self.dumpJson("info.json")
# Check again in five seconds
@@ -529,6 +556,44 @@
print("Done storing the results of {}".format(" ".join(self.command)))
+ def run(self, task: Any = None, cwd: str = ".") -> None:
+ """Actually run the test.
+
+ Calls Popen with the command to fork a new process.
+ Then, this function polls the process every 5 seconds to check if it
+ has finished or not. Each time it checks, it dumps the json info so
+ other applications can poll those files.
+
+ task is the celery task that is running this gem5 instance.
+
+ cwd is the directory to change to before running. This allows a server
+ process to run in a different directory than the running process. Note
+ that only the spawned process runs in the new directory.
+ """
+ # Check if the run is already in the database
+ db = artifact.getDBConnection()
+ if self.hash in db:
+ print(f"Error: Have already run {self.command}. Exiting!")
+ return
+ self._run(task, cwd)
+
+ def rerun(self, task: Any = None, cwd: str = ".") -> None:
+ """Rerun the test without checking if it is already in the database.
+
+ Calls Popen with the command to fork a new process.
+ Then, this function polls the process every 5 seconds to check if it
+ has finished or not. Each time it checks, it dumps the json info so
+ other applications can poll those files.
+
+ task is the celery task that is running this gem5 instance.
+
+ cwd is the directory to change to before running. This allows a server
+ process to run in a different directory than the running process. Note
+ that only the spawned process runs in the new directory.
+ """
+ # TODO: remove the old runs?
+ self._run(task, cwd)
+
def saveResults(self) -> None:
"""Zip up the output directory and store the results in the
database."""
@@ -616,3 +681,20 @@
for run in fsruns:
yield gem5Run.loadFromDict(run)
+
+
+def getRerunnableRunsByNameLike(
+ db: ArtifactDB, name: str, fs_only: bool = False, limit: int = 0
+) -> Iterable[gem5Run]:
+
+ """Returns a generator of gem5Run objects whose rerunnable attribute is
+ true and whose "name" contains the name parameter as a substring. The
+ name parameter is case sensitive.
+
+ If fs_only is True, then only full system runs will be considered.
+ Limit is passed to getRunsByNameLike, before the rerunnable filter.
+ """
+
+ for run in getRunsByNameLike(db, name, fs_only, limit):
+ if run.rerunnable:
+ yield run