util-gem5art: Decouple artifacts from mongodb

This commit does a few things to decouple the artifacts from mongodb.
- Creates an ArtifactFileDB which stores artifacts in a json file on the
local machine
- Adds tests for the artifact file DB
- Sets the file database to be default if pymongo isn't installed
- Extends the Artifact class to prepare for downloading artifacts from
gem5-resources

Change-Id: I1bceef94dc53c066d1c0475e79c9a1ad1f1a6202
Signed-off-by: Jason Lowe-Power <jason@lowepower.com>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/47463
Reviewed-by: Hoa Nguyen <hoanguyen@ucdavis.edu>
Maintainer: Bobby R. Bruce <bbruce@ucdavis.edu>
Tested-by: kokoro <noreply+kokoro@google.com>
diff --git a/util/gem5art/artifact/gem5art/artifact/_artifactdb.py b/util/gem5art/artifact/gem5art/artifact/_artifactdb.py
index 4ffb0a0..6714c9c 100644
--- a/util/gem5art/artifact/gem5art/artifact/_artifactdb.py
+++ b/util/gem5art/artifact/gem5art/artifact/_artifactdb.py
@@ -36,14 +36,24 @@
 
 from abc import ABC, abstractmethod
 
-import gridfs  # type: ignore
+import copy
+import json
 import os
 from pathlib import Path
-from pymongo import MongoClient  # type: ignore
-from typing import Any, Dict, Iterable, Union, Type
+import shutil
+from typing import Any, Dict, Iterable, Union, Type, List, Tuple
 from urllib.parse import urlparse
 from uuid import UUID
 
+try:
+    import gridfs  # type: ignore
+    from pymongo import MongoClient  # type: ignore
+
+    MONGO_SUPPORT = True
+except ModuleNotFoundError:
+    # If pymongo isn't installed, then disable support for it
+    MONGO_SUPPORT = False
+
 
 class ArtifactDB(ABC):
     """
@@ -51,7 +61,7 @@
     """
 
     @abstractmethod
-    def __init__(self, uri: str):
+    def __init__(self, uri: str) -> None:
         """Initialize the database with a URI"""
         pass
 
@@ -205,11 +215,194 @@
             yield d
 
 
+class ArtifactFileDB(ArtifactDB):
+    """
+    This is a file-based database where Artifacts (as defined in artifacts.py)
+    are stored in a JSON file.
+
+    This database stores a list of serialized artifacts in a JSON file.
+    This database is not thread-safe.
+
+    If the user specifies a valid path in the environment variable
+    GEM5ART_STORAGE then this database will copy all artifacts to that
+    directory named with their UUIDs.
+    """
+
+    class ArtifactEncoder(json.JSONEncoder):
+        def default(self, obj):  # JSON has no UUID type; store as a string
+            if isinstance(obj, UUID):
+                return str(obj)
+            return super().default(obj)  # raises TypeError for other types
+
+    _json_file: Path
+    _uuid_artifact_map: Dict[str, Dict[str, str]]
+    _hash_uuid_map: Dict[str, List[str]]
+    _storage_enabled: bool
+    _storage_path: Path
+
+    def __init__(self, uri: str) -> None:
+        """Initialize the file-driven database from a JSON file.
+        If the file doesn't exist, it is created on the first insert.
+        """
+        parsed_uri = urlparse(uri)
+        # using urlparse to parse relative/absolute file path
+        # abs path: urlparse("file:///path/to/file") ->
+        #           (netloc='', path='/path/to/file')
+        # rel path: urlparse("file://path/to/file") ->
+        #           (netloc='path', path='/to/file')
+        # so join as strings; Path(netloc) / Path(path) would drop netloc
+        self._json_file = Path(parsed_uri.netloc + parsed_uri.path)
+        storage_path = os.environ.get("GEM5ART_STORAGE", "")
+        self._storage_enabled = bool(storage_path)
+        self._storage_path = Path(storage_path)
+        if (
+            self._storage_enabled
+            and self._storage_path.exists()
+            and not self._storage_path.is_dir()
+        ):
+            raise Exception(
+                f"GEM5ART_STORAGE={storage_path} exists and is not a directory"
+            )
+        if self._storage_enabled:
+            os.makedirs(self._storage_path, exist_ok=True)
+
+        self._uuid_artifact_map, self._hash_uuid_map = self._load_from_file(
+            self._json_file
+        )
+
+    def put(self, key: UUID, artifact: Dict[str, Union[str, UUID]]) -> None:
+        """Insert the artifact into the database with the key."""
+        assert artifact["_id"] == key
+        assert isinstance(artifact["hash"], str)
+        self.insert_artifact(key, artifact["hash"], artifact)
+
+    def upload(self, key: UUID, path: Path) -> None:
+        """Copy the artifact to the folder specified by GEM5ART_STORAGE."""
+        if not self._storage_enabled:
+            return
+        src_path = path
+        dst_path = self._storage_path / str(key)
+        if not dst_path.exists():
+            shutil.copy2(src_path, dst_path)
+
+    def __contains__(self, key: Union[UUID, str]) -> bool:
+        """Key can be a UUID or a string. Returns true if item in DB"""
+        if isinstance(key, UUID):
+            return self.has_uuid(key)
+        return self.has_hash(key)
+
+    def get(self, key: Union[UUID, str]) -> Dict[str, str]:
+        """Key can be a UUID or a string. Returns a dictionary to construct
+        an artifact.
+        """
+        artifact: List[Dict[str, str]] = []
+        if isinstance(key, UUID):
+            artifact = list(self.get_artifact_by_uuid(key))
+        else:
+            # This is a hash.
+            artifact = list(self.get_artifact_by_hash(key))
+        return artifact[0]
+
+    def downloadFile(self, key: UUID, path: Path) -> None:
+        """Copy the file from the storage to specified path."""
+        assert path.exists()
+        if not self._storage_enabled:
+            return
+        src_path = self._storage_path / str(key)
+        dst_path = path
+        shutil.copy2(src_path, dst_path)
+
+    def _load_from_file(
+        self, json_file: Path
+    ) -> Tuple[Dict[str, Dict[str, str]], Dict[str, List[str]]]:
+        uuid_mapping: Dict[str, Dict[str, str]] = {}
+        hash_mapping: Dict[str, List[str]] = {}
+        if json_file.exists():
+            with open(json_file, "r") as f:
+                j = json.load(f)
+                for an_artifact in j:
+                    the_uuid = an_artifact["_id"]
+                    the_hash = an_artifact["hash"]
+                    uuid_mapping[the_uuid] = an_artifact
+                    if not the_hash in hash_mapping:
+                        hash_mapping[the_hash] = []
+                    hash_mapping[the_hash].append(the_uuid)
+        return uuid_mapping, hash_mapping
+
+    def _save_to_file(self, json_file: Path) -> None:
+        content = list(self._uuid_artifact_map.values())
+        with open(json_file, "w") as f:
+            json.dump(content, f, indent=4, cls=ArtifactFileDB.ArtifactEncoder)
+
+    def has_uuid(self, the_uuid: UUID) -> bool:
+        return str(the_uuid) in self._uuid_artifact_map
+
+    def has_hash(self, the_hash: str) -> bool:
+        return the_hash in self._hash_uuid_map
+
+    def get_artifact_by_uuid(self, the_uuid: UUID) -> Iterable[Dict[str, str]]:
+        uuid_str = str(the_uuid)
+        if not uuid_str in self._uuid_artifact_map:
+            return
+        yield self._uuid_artifact_map[uuid_str]
+
+    def get_artifact_by_hash(self, the_hash: str) -> Iterable[Dict[str, str]]:
+        if not the_hash in self._hash_uuid_map:
+            return
+        for the_uuid in self._hash_uuid_map[the_hash]:
+            yield self._uuid_artifact_map[the_uuid]
+
+    def insert_artifact(
+        self,
+        the_uuid: UUID,
+        the_hash: str,
+        the_artifact: Dict[str, Union[str, UUID]],
+    ) -> bool:
+        """
+        Put the artifact to the database.
+
+        Return True if the artifact uuid does not exist in the database prior
+        to calling this function; return False otherwise.
+        """
+        uuid_str = str(the_uuid)
+        if uuid_str in self._uuid_artifact_map:
+            return False
+        artifact_copy = copy.deepcopy(the_artifact)
+        artifact_copy["_id"] = str(artifact_copy["_id"])
+        self._uuid_artifact_map[uuid_str] = artifact_copy  # type: ignore
+        if not the_hash in self._hash_uuid_map:
+            self._hash_uuid_map[the_hash] = []
+        self._hash_uuid_map[the_hash].append(uuid_str)
+        self._save_to_file(self._json_file)
+        return True
+
+    def find_exact(
+        self, attr: Dict[str, str], limit: int
+    ) -> Iterable[Dict[str, Any]]:
+        """
+        Yield at most `limit` artifacts such that, for every yielded artifact
+        and for every (k,v) in attr, the attribute `k` of the artifact has
+        the value of `v`.
+        """
+        count = 0
+        for artifact in self._uuid_artifact_map.values():
+            if count >= limit:
+                return
+            if attr.items() <= artifact.items():  # set-like subset test
+                count += 1
+                yield artifact
+
+
 _db = None
 
-_default_uri = "mongodb://localhost:27017"
+if MONGO_SUPPORT:
+    _default_uri = "mongodb://localhost:27017"
+else:
+    _default_uri = "file://db.json"
 
-_db_schemes: Dict[str, Type[ArtifactDB]] = {"mongodb": ArtifactMongoDB}
+_db_schemes: Dict[str, Type[ArtifactDB]] = {"file": ArtifactFileDB}
+if MONGO_SUPPORT:
+    _db_schemes["mongodb"] = ArtifactMongoDB
 
 
 def _getDBType(uri: str) -> Type[ArtifactDB]:
@@ -220,6 +413,10 @@
     Supported types:
         **ArtifactMongoDB**: mongodb://...
             See http://dochub.mongodb.org/core/connections for details.
+        **ArtifactFileDB**: file://...
+            A simple flat file database with optional storage for the binary
+            artifacts. The filepath is where the json file is stored and the
+            data storage can be specified with GEM5ART_STORAGE
     """
     result = urlparse(uri)
     if result.scheme in _db_schemes:
diff --git a/util/gem5art/artifact/gem5art/artifact/artifact.py b/util/gem5art/artifact/gem5art/artifact/artifact.py
index ecab0ee..91ffc64 100644
--- a/util/gem5art/artifact/gem5art/artifact/artifact.py
+++ b/util/gem5art/artifact/gem5art/artifact/artifact.py
@@ -29,12 +29,13 @@
 
 import hashlib
 from inspect import cleandoc
-import os
+import json
 from pathlib import Path
 import subprocess
 import time
-from typing import Any, Dict, Iterator, List, Union
+from typing import Any, Dict, List, Union, Optional
 from uuid import UUID, uuid4
+import json
 
 from ._artifactdb import getDBConnection
 
@@ -108,6 +109,19 @@
     6) ID: unique identifier of the artifact
     7) inputs: list of the input artifacts used to create this artifact stored
        as a list of uuids
+
+    Optional fields:
+    a) architecture: name of the ISA (e.g. x86, riscv) ("" by default)
+    b) size: size of the artifact in bytes (None by default)
+    c) is_zipped: True when the artifact must be decompressed before using,
+       False otherwise (False by default)
+    d) md5sum: the md5 checksum of the artifact, used for integrity checking
+       ("" by default)
+    e) url: URL to download the artifact ("" by default)
+    f) supported_gem5_versions: a list of supported gem5 versions that the
+       artifact should be used with (an empty list by default)
+    g) version: version of the artifact, e.g. "v21-0" ("" by default)
+    h) **kwargs: other fields, values must have __str__() defined.
     """
 
     _id: UUID
@@ -122,8 +136,19 @@
     cwd: Path
     inputs: List["Artifact"]
 
+    # Optional fields
+    architecture: str
+    size: Optional[int]
+    is_zipped: bool
+    md5sum: str
+    url: str
+    supported_gem5_versions: List[str]
+    version: str
+
+    extra: Dict[str, str]
+
     @classmethod
-    def registerArtifact(
+    def createArtifact(
         cls,
         command: str,
         name: str,
@@ -132,15 +157,24 @@
         path: Union[str, Path],
         documentation: str,
         inputs: List["Artifact"] = [],
+        architecture: str = "",
+        size: int = None,
+        is_zipped: bool = False,
+        md5sum: str = "",
+        url: str = "",
+        supported_gem5_versions: List[str] = [],
+        version: str = "",
+        **kwargs: str,
     ) -> "Artifact":
-        """Constructs a new artifact.
 
-        This assume either it's not in the database or it is the exact same as
-        when it was added to the database
+        """Constructs a new artifact without using the database.
+
+        Different from registerArtifact(), this method won't use database.
+        As a result, this method won't check whether the artifact has
+        already existed in the database, as well as it won't add the artifact
+        to the database.
         """
 
-        _db = getDBConnection()
-
         # Dictionary with all of the kwargs for construction.
         data: Dict[str, Any] = {}
 
@@ -180,20 +214,75 @@
 
         data["inputs"] = [i._id for i in inputs]
 
-        if data["hash"] in _db:
-            old_artifact = Artifact(_db.get(data["hash"]))
-            data["_id"] = old_artifact._id
+        data["architecture"] = architecture
+        data["size"] = size
+        data["is_zipped"] = is_zipped
+        data["md5sum"] = md5sum
+        data["url"] = url
+        data["supported_gem5_versions"] = supported_gem5_versions[:]
+        data["version"] = version
 
-            # Now that we have a complete object, construct it
-            self = cls(data)
+        data["extra"] = kwargs
+
+        data["_id"] = uuid4()
+
+        # Now that we have a complete object, construct it
+        self = cls(data)
+
+        return self
+
+    @classmethod
+    def registerArtifact(
+        cls,
+        command: str,
+        name: str,
+        cwd: str,
+        typ: str,
+        path: Union[str, Path],
+        documentation: str,
+        inputs: List["Artifact"] = [],
+        architecture: str = "",
+        size: Optional[int] = None,
+        is_zipped: bool = False,
+        md5sum: str = "",
+        url: str = "",
+        supported_gem5_versions: List[str] = [],
+        version: str = "",
+        **kwargs: str,
+    ) -> "Artifact":
+        """Constructs a new artifact and adds to the database.
+
+        This assume either it's not in the database or it is the exact same as
+        when it was added to the database
+        """
+
+        self = cls.createArtifact(
+            command,
+            name,
+            cwd,
+            typ,
+            path,
+            documentation,
+            inputs,
+            architecture,
+            size,
+            is_zipped,
+            md5sum,
+            url,
+            supported_gem5_versions,
+            version,
+            **kwargs,
+        )
+
+        _db = getDBConnection()
+
+        if self.hash in _db:
+            old_artifact = Artifact(_db.get(self.hash))
+            self._id = old_artifact._id
+
             self._checkSimilar(old_artifact)
 
         else:
-            data["_id"] = uuid4()
-
-            # Now that we have a complete object, construct it
-            self = cls(data)
-
             # Upload the file if there is one.
             if self.path.is_file():
                 _db.upload(self._id, self.path)
@@ -204,18 +293,23 @@
         return self
 
     def __init__(self, other: Union[str, UUID, Dict[str, Any]]) -> None:
-        """Constructs the object from the database based on a UUID or
-        dictionary from the database
+        """Constructs an artifact object from the database based on a UUID or
+        dictionary from the database. Note that if the variable `other` is of
+        type `Dict[str, Any]`, this function will not try to establish a
+        connection to the database.
         """
-        _db = getDBConnection()
-        if isinstance(other, str):
-            other = UUID(other)
-        if isinstance(other, UUID):
-            other = _db.get(other)
+        if not isinstance(other, Dict):
+            _db = getDBConnection()
+            if isinstance(other, str):
+                other = UUID(other)
+            if isinstance(other, UUID):
+                other = _db.get(other)
 
         if not other:
             raise Exception("Cannot construct artifact")
 
+        if isinstance(other["_id"], str):
+            other["_id"] = UUID(other["_id"])  # type: ignore
         assert isinstance(other["_id"], UUID)
         self._id = other["_id"]
         self.name = other["name"]
@@ -229,6 +323,35 @@
         self.cwd = Path(other["cwd"])
         self.inputs = [Artifact(i) for i in other["inputs"]]
 
+        # Optional fields
+        self.architecture = other.get("architecture", "")
+        if "size" in other:
+            if isinstance(other["size"], int):
+                self.size = other["size"]
+            else:
+                self.size = None
+        self.is_zipped = bool(other.get("is_zipped", False))
+        self.md5sum = other.get("md5sum", "")
+        self.url = other.get("url", "")
+        self.supported_gem5_versions = []
+        if "supported_gem5_versions" in other:
+            if isinstance(other["supported_gem5_versions"], list):
+                self.supported_gem5_versions = other[
+                    "supported_gem5_versions"
+                ][:]
+            elif isinstance(other["supported_gem5_versions"], str):
+                self.supported_gem5_versions = json.loads(
+                    other["supported_gem5_versions"]
+                )
+        self.version = other.get("version", "")
+
+        self.extra = {}
+        if "extra" in other:
+            if isinstance(other["extra"], dict):
+                self.extra = {k: v for k, v in other["extra"].items()}
+            elif isinstance(other["extra"], str):
+                self.extra = json.loads(other["extra"])
+
     def __str__(self) -> str:
         inputs = ", ".join([i.name + ":" + str(i._id) for i in self.inputs])
         return "\n    ".join(
@@ -250,6 +373,10 @@
         data["inputs"] = [input._id for input in self.inputs]
         data["cwd"] = str(data["cwd"])
         data["path"] = str(data["path"])
+        data["supported_gem5_versions"] = json.dumps(
+            self.supported_gem5_versions
+        )
+        data["extra"] = json.dumps(self.extra)
         return data
 
     def __eq__(self, other: object) -> bool:
diff --git a/util/gem5art/artifact/tests/test_artifact.py b/util/gem5art/artifact/tests/test_artifact.py
index 897113d..af6f8ae 100644
--- a/util/gem5art/artifact/tests/test_artifact.py
+++ b/util/gem5art/artifact/tests/test_artifact.py
@@ -48,7 +48,6 @@
         self.hashes = {}
 
     def put(self, key, metadata):
-        print("putting an entry in the mock database")
         self.db[key] = metadata
         self.hashes[metadata["hash"]] = key
 
diff --git a/util/gem5art/artifact/tests/test_filedb.py b/util/gem5art/artifact/tests/test_filedb.py
new file mode 100644
index 0000000..9b5cd02
--- /dev/null
+++ b/util/gem5art/artifact/tests/test_filedb.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2021 The Regents of the University of California
+# All Rights Reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""Tests for ArtifactFileDB"""
+
+
+import json
+import os
+from pathlib import Path
+import unittest
+from uuid import UUID
+
+from gem5art.artifact import Artifact
+from gem5art.artifact._artifactdb import getDBConnection
+
+
+class TestArtifactFileDB(unittest.TestCase):
+    def setUp(self):  # register one artifact via the "file://" backend
+        getDBConnection("file://test.json")
+
+        with open("test-file.txt", "w") as f:
+            f.write("This is a test file.")
+
+        self.artifact = Artifact.registerArtifact(
+            name="test-artifact",
+            typ="text",
+            path="test-file.txt",
+            cwd="./",
+            command='echo "This is a test file" > test-file.txt',
+            inputs=[],
+            documentation="This artifact is made for testing.",
+        )
+
+    def tearDown(self):  # remove the files created in setUp
+        os.remove("test-file.txt")
+        os.remove("test.json")
+
+    def test_init_function(self):
+        self.assertTrue(Path("test.json").exists())
+
+    def test_json_content(self):
+        with open("test.json", "r") as f:
+            artifacts = json.load(f)
+        self.assertEqual(len(artifacts), 1)  # assertEqual reports both values
+        artifact = artifacts[0]
+        self.assertEqual(artifact["hash"], self.artifact.hash)
+        self.assertEqual(UUID(artifact["_id"]), self.artifact._id)