util-gem5art: Decouple artifacts from mongodb
This commit does a few things to decouple the artifacts from mongodb.
- Creates an ArtifactFileDB which stores artifacts in a json file on the
local machine
- Adds tests for the artifact file DB
- Sets the file database to be default if pymongo isn't installed
- Extends the Artifact class to prepare for downloading artifacts from
gem5-resources
Change-Id: I1bceef94dc53c066d1c0475e79c9a1ad1f1a6202
Signed-off-by: Jason Lowe-Power <jason@lowepower.com>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/47463
Reviewed-by: Hoa Nguyen <hoanguyen@ucdavis.edu>
Maintainer: Bobby R. Bruce <bbruce@ucdavis.edu>
Tested-by: kokoro <noreply+kokoro@google.com>
diff --git a/util/gem5art/artifact/gem5art/artifact/_artifactdb.py b/util/gem5art/artifact/gem5art/artifact/_artifactdb.py
index 4ffb0a0..6714c9c 100644
--- a/util/gem5art/artifact/gem5art/artifact/_artifactdb.py
+++ b/util/gem5art/artifact/gem5art/artifact/_artifactdb.py
@@ -36,14 +36,24 @@
from abc import ABC, abstractmethod
-import gridfs # type: ignore
+import copy
+import json
import os
from pathlib import Path
-from pymongo import MongoClient # type: ignore
-from typing import Any, Dict, Iterable, Union, Type
+import shutil
+from typing import Any, Dict, Iterable, Union, Type, List, Tuple
from urllib.parse import urlparse
from uuid import UUID
+try:
+ import gridfs # type: ignore
+ from pymongo import MongoClient # type: ignore
+
+ MONGO_SUPPORT = True
+except ModuleNotFoundError:
+ # If pymongo isn't installed, then disable support for it
+ MONGO_SUPPORT = False
+
class ArtifactDB(ABC):
"""
@@ -51,7 +61,7 @@
"""
@abstractmethod
- def __init__(self, uri: str):
+ def __init__(self, uri: str) -> None:
"""Initialize the database with a URI"""
pass
@@ -205,11 +215,194 @@
yield d
+class ArtifactFileDB(ArtifactDB):
+    """
+    This is a file-based database where Artifacts (as defined in artifacts.py)
+    are stored in a JSON file.
+
+    This database stores a list of serialized artifacts in a JSON file.
+    This database is not thread-safe.
+
+    If the user specifies a valid path in the environment variable
+    GEM5ART_STORAGE then this database will copy all artifacts to that
+    directory named with their UUIDs.
+    """
+
+    class ArtifactEncoder(json.JSONEncoder):
+        """JSON encoder that serializes UUID objects as strings."""
+        def default(self, obj):
+            if isinstance(obj, UUID):
+                return str(obj)
+            # Let the base class raise TypeError for unsupported types.
+            return super().default(obj)
+
+    _json_file: Path
+    _uuid_artifact_map: Dict[str, Dict[str, str]]  # uuid string -> artifact
+    _hash_uuid_map: Dict[str, List[str]]  # hash -> uuid strings
+    _storage_enabled: bool
+    _storage_path: Path
+
+    def __init__(self, uri: str) -> None:
+        """Initialize the file-driven database from a JSON file.
+        If the file doesn't exist, the database starts empty and the file
+        is written on the first insertion.
+        """
+        parsed_uri = urlparse(uri)
+        # using urlparse to parse relative/absolute file path
+        # abs path: urlparse("file:///path/to/file") ->
+        #           (netloc='', path='/path/to/file')
+        # rel path: urlparse("file://path/to/file") ->
+        #           (netloc='path', path='/to/file')
+        # so, the filepath would be netloc+path for both cases. Join them as
+        # strings: Path("path") / Path("/to/file") would drop "path" because
+        # the right-hand side is absolute.
+        self._json_file = Path(parsed_uri.netloc + parsed_uri.path)
+        storage_path = os.environ.get("GEM5ART_STORAGE", "")
+        self._storage_enabled = bool(storage_path)
+        self._storage_path = Path(storage_path)
+        if self._storage_enabled:
+            if self._storage_path.exists() and not self._storage_path.is_dir():
+                raise Exception(
+                    f"GEM5ART_STORAGE={storage_path} "
+                    "exists and is not a directory"
+                )
+            os.makedirs(self._storage_path, exist_ok=True)
+
+        self._uuid_artifact_map, self._hash_uuid_map = self._load_from_file(
+            self._json_file
+        )
+
+    def put(self, key: UUID, artifact: Dict[str, Union[str, UUID]]) -> None:
+        """Insert the artifact into the database with the key."""
+        assert artifact["_id"] == key
+        assert isinstance(artifact["hash"], str)
+        self.insert_artifact(key, artifact["hash"], artifact)
+
+    def upload(self, key: UUID, path: Path) -> None:
+        """Copy the artifact to the folder specified by GEM5ART_STORAGE."""
+        if not self._storage_enabled:
+            return
+        dst_path = self._storage_path / str(key)
+        if not dst_path.exists():
+            shutil.copy2(path, dst_path)
+
+    def __contains__(self, key: Union[UUID, str]) -> bool:
+        """Key can be a UUID or a string. Returns true if item in DB"""
+        if isinstance(key, UUID):
+            return self.has_uuid(key)
+        return self.has_hash(key)
+
+    def get(self, key: Union[UUID, str]) -> Dict[str, str]:
+        """Key can be a UUID or a string (treated as a hash). Returns a
+        dictionary to construct an artifact; IndexError if nothing matches.
+        """
+        if isinstance(key, UUID):
+            matches = list(self.get_artifact_by_uuid(key))
+        else:
+            matches = list(self.get_artifact_by_hash(key))
+        return matches[0]
+
+    def downloadFile(self, key: UUID, path: Path) -> None:
+        """Copy the file from the storage to specified path."""
+        assert path.exists()
+        if not self._storage_enabled:
+            return
+        shutil.copy2(self._storage_path / str(key), path)
+
+    def _load_from_file(
+        self, json_file: Path
+    ) -> Tuple[Dict[str, Dict[str, str]], Dict[str, List[str]]]:
+        """Build the uuid -> artifact and hash -> uuids maps from json_file.
+        A file that does not exist yields two empty maps.
+        """
+        uuid_mapping: Dict[str, Dict[str, str]] = {}
+        hash_mapping: Dict[str, List[str]] = {}
+        if json_file.exists():
+            with open(json_file, "r") as f:
+                j = json.load(f)
+            for an_artifact in j:
+                the_uuid = an_artifact["_id"]
+                the_hash = an_artifact["hash"]
+                uuid_mapping[the_uuid] = an_artifact
+                hash_mapping.setdefault(the_hash, []).append(the_uuid)
+        return uuid_mapping, hash_mapping
+
+    def _save_to_file(self, json_file: Path) -> None:
+        """Write every stored artifact to json_file as a JSON list."""
+        content = list(self._uuid_artifact_map.values())
+        with open(json_file, "w") as f:
+            json.dump(content, f, indent=4, cls=ArtifactFileDB.ArtifactEncoder)
+
+    def has_uuid(self, the_uuid: UUID) -> bool:
+        """Return True if an artifact with this UUID is stored."""
+        return str(the_uuid) in self._uuid_artifact_map
+
+    def has_hash(self, the_hash: str) -> bool:
+        """Return True if at least one stored artifact has this hash."""
+        return the_hash in self._hash_uuid_map
+
+    def get_artifact_by_uuid(self, the_uuid: UUID) -> Iterable[Dict[str, str]]:
+        """Yield the artifact with the given UUID, if it is stored."""
+        uuid_str = str(the_uuid)
+        if uuid_str in self._uuid_artifact_map:
+            yield self._uuid_artifact_map[uuid_str]
+
+    def get_artifact_by_hash(self, the_hash: str) -> Iterable[Dict[str, str]]:
+        """Yield every stored artifact whose hash is the_hash."""
+        for the_uuid in self._hash_uuid_map.get(the_hash, []):
+            yield self._uuid_artifact_map[the_uuid]
+
+    def insert_artifact(
+        self,
+        the_uuid: UUID,
+        the_hash: str,
+        the_artifact: Dict[str, Union[str, UUID]],
+    ) -> bool:
+        """
+        Put the artifact to the database.
+
+        Return True if the artifact uuid does not exist in the database prior
+        to calling this function; return False otherwise.
+        """
+        uuid_str = str(the_uuid)
+        if uuid_str in self._uuid_artifact_map:
+            return False
+        artifact_copy = copy.deepcopy(the_artifact)
+        artifact_copy["_id"] = str(artifact_copy["_id"])
+        self._uuid_artifact_map[uuid_str] = artifact_copy  # type: ignore
+        self._hash_uuid_map.setdefault(the_hash, []).append(uuid_str)
+        self._save_to_file(self._json_file)
+        return True
+
+    def find_exact(
+        self, attr: Dict[str, str], limit: int
+    ) -> Iterable[Dict[str, Any]]:
+        """
+        Return all artifacts such that, for every yielded artifact,
+        and for every (k,v) in attr, the attribute `k` of the artifact has
+        the value of `v`. At most `limit` artifacts are yielded.
+        """
+        matches = (
+            artifact
+            for artifact in self._uuid_artifact_map.values()
+            # dict item views compare like sets, so `<=` is a subset test.
+            if attr.items() <= artifact.items()
+        )
+        # zip() stops pulling from `matches` once range(limit) is exhausted.
+        for _, artifact in zip(range(limit), matches):
+            yield artifact
+
+
_db = None
-_default_uri = "mongodb://localhost:27017"
+if MONGO_SUPPORT:
+ _default_uri = "mongodb://localhost:27017"
+else:
+ _default_uri = "file://db.json"
-_db_schemes: Dict[str, Type[ArtifactDB]] = {"mongodb": ArtifactMongoDB}
+_db_schemes: Dict[str, Type[ArtifactDB]] = {"file": ArtifactFileDB}
+if MONGO_SUPPORT:
+ _db_schemes["mongodb"] = ArtifactMongoDB
def _getDBType(uri: str) -> Type[ArtifactDB]:
@@ -220,6 +413,10 @@
Supported types:
**ArtifactMongoDB**: mongodb://...
See http://dochub.mongodb.org/core/connections for details.
+ **ArtifactFileDB**: file://...
+ A simple flat file database with optional storage for the binary
+ artifacts. The filepath is where the json file is stored and the
+ data storage can be specified with GEM5ART_STORAGE
"""
result = urlparse(uri)
if result.scheme in _db_schemes:
diff --git a/util/gem5art/artifact/gem5art/artifact/artifact.py b/util/gem5art/artifact/gem5art/artifact/artifact.py
index ecab0ee..91ffc64 100644
--- a/util/gem5art/artifact/gem5art/artifact/artifact.py
+++ b/util/gem5art/artifact/gem5art/artifact/artifact.py
@@ -29,12 +29,13 @@
import hashlib
from inspect import cleandoc
-import os
+import json
from pathlib import Path
import subprocess
import time
-from typing import Any, Dict, Iterator, List, Union
+from typing import Any, Dict, List, Union, Optional
from uuid import UUID, uuid4
+import json
from ._artifactdb import getDBConnection
@@ -108,6 +109,19 @@
6) ID: unique identifier of the artifact
7) inputs: list of the input artifacts used to create this artifact stored
as a list of uuids
+
+ Optional fields:
+ a) architecture: name of the ISA (e.g. x86, riscv) ("" by default)
+ b) size: size of the artifact in bytes (None by default)
+ c) is_zipped: True when the artifact must be decompressed before using,
+ False otherwise (False by default)
+ d) md5sum: the md5 checksum of the artifact, used for integrity checking
+ ("" by default)
+ e) url: URL to download the artifact ("" by default)
+ f) supported_gem5_versions: a list of supported gem5 versions that the
+ artifact should be used with (an empty list by default)
+ g) version: version of the artifact, e.g. "v21-0" ("" by default)
+ h) **kwargs: other fields, values must have __str__() defined.
"""
_id: UUID
@@ -122,8 +136,19 @@
cwd: Path
inputs: List["Artifact"]
+ # Optional fields
+ architecture: str
+ size: Optional[int]
+ is_zipped: bool
+ md5sum: str
+ url: str
+ supported_gem5_versions: List[str]
+ version: str
+
+ extra: Dict[str, str]
+
@classmethod
- def registerArtifact(
+ def createArtifact(
cls,
command: str,
name: str,
@@ -132,15 +157,24 @@
path: Union[str, Path],
documentation: str,
inputs: List["Artifact"] = [],
+ architecture: str = "",
+ size: int = None,
+ is_zipped: bool = False,
+ md5sum: str = "",
+ url: str = "",
+ supported_gem5_versions: List[str] = [],
+ version: str = "",
+ **kwargs: str,
) -> "Artifact":
- """Constructs a new artifact.
- This assume either it's not in the database or it is the exact same as
- when it was added to the database
+ """Constructs a new artifact without using the database.
+
+ Different from registerArtifact(), this method won't use database.
+ As a result, this method won't check whether the artifact has
+ already existed in the database, as well as it won't add the artifact
+ to the database.
"""
- _db = getDBConnection()
-
# Dictionary with all of the kwargs for construction.
data: Dict[str, Any] = {}
@@ -180,20 +214,75 @@
data["inputs"] = [i._id for i in inputs]
- if data["hash"] in _db:
- old_artifact = Artifact(_db.get(data["hash"]))
- data["_id"] = old_artifact._id
+ data["architecture"] = architecture
+ data["size"] = size
+ data["is_zipped"] = is_zipped
+ data["md5sum"] = md5sum
+ data["url"] = url
+ data["supported_gem5_versions"] = supported_gem5_versions[:]
+ data["version"] = version
- # Now that we have a complete object, construct it
- self = cls(data)
+ data["extra"] = kwargs
+
+ data["_id"] = uuid4()
+
+ # Now that we have a complete object, construct it
+ self = cls(data)
+
+ return self
+
+ @classmethod
+ def registerArtifact(
+ cls,
+ command: str,
+ name: str,
+ cwd: str,
+ typ: str,
+ path: Union[str, Path],
+ documentation: str,
+ inputs: List["Artifact"] = [],
+ architecture: str = "",
+ size: Optional[int] = None,
+ is_zipped: bool = False,
+ md5sum: str = "",
+ url: str = "",
+ supported_gem5_versions: List[str] = [],
+ version: str = "",
+ **kwargs: str,
+ ) -> "Artifact":
+ """Constructs a new artifact and adds to the database.
+
+ This assume either it's not in the database or it is the exact same as
+ when it was added to the database
+ """
+
+ self = cls.createArtifact(
+ command,
+ name,
+ cwd,
+ typ,
+ path,
+ documentation,
+ inputs,
+ architecture,
+ size,
+ is_zipped,
+ md5sum,
+ url,
+ supported_gem5_versions,
+ version,
+ **kwargs,
+ )
+
+ _db = getDBConnection()
+
+ if self.hash in _db:
+ old_artifact = Artifact(_db.get(self.hash))
+ self._id = old_artifact._id
+
self._checkSimilar(old_artifact)
else:
- data["_id"] = uuid4()
-
- # Now that we have a complete object, construct it
- self = cls(data)
-
# Upload the file if there is one.
if self.path.is_file():
_db.upload(self._id, self.path)
@@ -204,18 +293,23 @@
return self
def __init__(self, other: Union[str, UUID, Dict[str, Any]]) -> None:
- """Constructs the object from the database based on a UUID or
- dictionary from the database
+ """Constructs an artifact object from the database based on a UUID or
+ dictionary from the database. Note that if the variable `other` is of
+ type `Dict[str, Any]`, this function will not try to establish a
+ connection to the database.
"""
- _db = getDBConnection()
- if isinstance(other, str):
- other = UUID(other)
- if isinstance(other, UUID):
- other = _db.get(other)
+ if not isinstance(other, Dict):
+ _db = getDBConnection()
+ if isinstance(other, str):
+ other = UUID(other)
+ if isinstance(other, UUID):
+ other = _db.get(other)
if not other:
raise Exception("Cannot construct artifact")
+ if isinstance(other["_id"], str):
+ other["_id"] = UUID(other["_id"]) # type: ignore
assert isinstance(other["_id"], UUID)
self._id = other["_id"]
self.name = other["name"]
@@ -229,6 +323,35 @@
self.cwd = Path(other["cwd"])
self.inputs = [Artifact(i) for i in other["inputs"]]
+ # Optional fields
+ self.architecture = other.get("architecture", "")
+ if "size" in other:
+ if isinstance(other["size"], int):
+ self.size = other["size"]
+ else:
+ self.size = None
+ self.is_zipped = bool(other.get("is_zipped", False))
+ self.md5sum = other.get("md5sum", "")
+ self.url = other.get("url", "")
+ self.supported_gem5_versions = []
+ if "supported_gem5_versions" in other:
+ if isinstance(other["supported_gem5_versions"], list):
+ self.supported_gem5_versions = other[
+ "supported_gem5_versions"
+ ][:]
+ elif isinstance(other["supported_gem5_versions"], str):
+ self.supported_gem5_versions = json.loads(
+ other["supported_gem5_versions"]
+ )
+ self.version = other.get("version", "")
+
+ self.extra = {}
+ if "extra" in other:
+ if isinstance(other["extra"], dict):
+ self.extra = {k: v for k, v in other["extra"].items()}
+ elif isinstance(other["extra"], str):
+ self.extra = json.loads(other["extra"])
+
def __str__(self) -> str:
inputs = ", ".join([i.name + ":" + str(i._id) for i in self.inputs])
return "\n ".join(
@@ -250,6 +373,10 @@
data["inputs"] = [input._id for input in self.inputs]
data["cwd"] = str(data["cwd"])
data["path"] = str(data["path"])
+ data["supported_gem5_versions"] = json.dumps(
+ self.supported_gem5_versions
+ )
+ data["extra"] = json.dumps(self.extra)
return data
def __eq__(self, other: object) -> bool:
diff --git a/util/gem5art/artifact/tests/test_artifact.py b/util/gem5art/artifact/tests/test_artifact.py
index 897113d..af6f8ae 100644
--- a/util/gem5art/artifact/tests/test_artifact.py
+++ b/util/gem5art/artifact/tests/test_artifact.py
@@ -48,7 +48,6 @@
self.hashes = {}
def put(self, key, metadata):
- print("putting an entry in the mock database")
self.db[key] = metadata
self.hashes[metadata["hash"]] = key
diff --git a/util/gem5art/artifact/tests/test_filedb.py b/util/gem5art/artifact/tests/test_filedb.py
new file mode 100644
index 0000000..9b5cd02
--- /dev/null
+++ b/util/gem5art/artifact/tests/test_filedb.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2021 The Regents of the University of California
+# All Rights Reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""Tests for ArtifactFileDB"""
+
+
+import json
+import os
+from pathlib import Path
+import unittest
+from uuid import UUID
+
+from gem5art.artifact import Artifact
+from gem5art.artifact._artifactdb import getDBConnection
+
+
+class TestArtifactFileDB(unittest.TestCase):
+    """Checks that registering an artifact records it in the JSON file."""
+    def setUp(self):
+        # Open the file-backed DB; the returned handle itself is not needed.
+        getDBConnection("file://test.json")
+        with open("test-file.txt", "w") as f:
+            f.write("This is a test file.")
+        self.artifact = Artifact.registerArtifact(
+            name="test-artifact",
+            typ="text",
+            path="test-file.txt",
+            cwd="./",
+            command='echo "This is a test file" > test-file.txt',
+            inputs=[],
+            documentation="This artifact is made for testing.",
+        )
+
+    def tearDown(self):
+        os.remove("test-file.txt")
+        os.remove("test.json")
+
+    def test_init_function(self):
+        self.assertTrue(Path("test.json").exists())
+
+    def test_json_content(self):
+        with open("test.json", "r") as f:
+            artifacts = json.load(f)
+        self.assertEqual(len(artifacts), 1)
+        artifact = artifacts[0]
+        self.assertEqual(artifact["hash"], self.artifact.hash)
+        self.assertEqual(UUID(artifact["_id"]), self.artifact._id)