util/gem5-resources-manager/api/create_resources_json.py - public/gem5 - Git at Google

 # Copyright (c) 2023 The Regents of the University of California
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are
 # met: redistributions of source code must retain the above copyright
 # notice, this list of conditions and the following disclaimer;
 # redistributions in binary form must reproduce the above copyright
 # notice, this list of conditions and the following disclaimer in the
 # documentation and/or other materials provided with the distribution;
 # neither the name of the copyright holders nor the names of its
 # contributors may be used to endorse or promote products derived from
 # this software without specific prior written permission.
 #
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 import json
 import requests
 import base64
 import os
 from jsonschema import validate


 class ResourceJsonCreator:
     """
     This class generates the JSON which is pushed onto MongoDB.
     On a high-level, it does the following:
         - Adds certain fields to the JSON.
         - Populates those fields.
         - Makes sure the JSON follows the schema.
     """

     # Global Variables
     base_url = "https://github.com/gem5/gem5/tree/develop"  # gem5 GitHub URL
     resource_url_map = {
         "dev": (
             "https://gem5.googlesource.com/public/gem5-resources/+/refs/heads/"
             "develop/resources.json?format=TEXT"
         ),
         "22.1": (
             "https://gem5.googlesource.com/public/gem5-resources/+/refs/heads/"
             "stable/resources.json?format=TEXT"
         ),
         "22.0": (
             "http://resources.gem5.org/prev-resources-json/"
             "resources-21-2.json"
         ),
         "21.2": (
             "http://resources.gem5.org/prev-resources-json/"
             "resources-22-0.json"
         ),
     }

     def __init__(self):
         self.schema = {}
         with open("schema/schema.json", "r") as f:
             self.schema = json.load(f)

     def _get_file_data(self, url):
         json_data = None
         try:
             json_data = requests.get(url).text
             json_data = base64.b64decode(json_data).decode("utf-8")
             return json.loads(json_data)
         except:
             json_data = requests.get(url).json()
             return json_data

     def _get_size(self, url):
         """
         Helper function to return the size of a download through its URL.
         Returns 0 if URL has an error.

         :param url: Download URL
         """
         try:
             response = requests.head(url)
             size = int(response.headers.get("content-length", 0))
             return size
         except Exception as e:
             return 0

     def _search_folder(self, folder_path, id):
         """
         Helper function to find the instance of a string in a folder.
         This is recursive, i.e., subfolders will also be searched.

         :param folder_path: Path to the folder to begin searching
         :param id: Phrase to search in the folder

         :returns matching_files: List of file paths to the files containing id
         """
         matching_files = []
         for filename in os.listdir(folder_path):
             file_path = os.path.join(folder_path, filename)
             if os.path.isfile(file_path):
                 with open(
                     file_path, "r", encoding="utf-8", errors="ignore"
                 ) as f:
                     contents = f.read()
                     if id in contents:
                         file_path = file_path.replace("\\", "/")
                         matching_files.append(file_path)
             elif os.path.isdir(file_path):
                 matching_files.extend(self._search_folder(file_path, id))
         return matching_files

     def _change_type(self, resource):
         if resource["type"] == "workload":
             # get the architecture from the name and remove 64 from it
             resource["architecture"] = (
                 resource["name"].split("-")[0].replace("64", "").upper()
             )
             return resource
         if "kernel" in resource["name"]:
             resource["type"] = "kernel"
         elif "bootloader" in resource["name"]:
             resource["type"] = "bootloader"
         elif "benchmark" in resource["documentation"]:
             resource["type"] = "disk-image"
             # if tags not in resource:
             if "tags" not in resource:
                 resource["tags"] = []
             resource["tags"].append("benchmark")
             if (
                 "additional_metadata" in resource
                 and "root_partition" in resource["additional_metadata"]
                 and resource["additional_metadata"]["root_partition"]
                 is not None
             ):
                 resource["root_partition"] = resource["additional_metadata"][
                     "root_partition"
                 ]
             else:
                 resource["root_partition"] = ""
         elif resource["url"] is not None and ".img.gz" in resource["url"]:
             resource["type"] = "disk-image"
             if (
                 "additional_metadata" in resource
                 and "root_partition" in resource["additional_metadata"]
                 and resource["additional_metadata"]["root_partition"]
                 is not None
             ):
                 resource["root_partition"] = resource["additional_metadata"][
                     "root_partition"
                 ]
             else:
                 resource["root_partition"] = ""
         elif "binary" in resource["documentation"]:
             resource["type"] = "binary"
         elif "checkpoint" in resource["documentation"]:
             resource["type"] = "checkpoint"
         elif "simpoint" in resource["documentation"]:
             resource["type"] = "simpoint"
         return resource

     def _extract_code_examples(self, resource, source):
         """
         This function goes by IDs present in the resources DataFrame.
         It finds which files use those IDs in gem5/configs.
         It adds the GitHub URL of those files under "example".
         It finds whether those files are used in gem5/tests/gem5.
         If yes, it marks "tested" as True. If not, it marks "tested" as False.
         "example" and "tested" are made into a JSON for every code example.
         This list of JSONs is assigned to the 'code_examples' field of the
         DataFrame.

         :param resources: A DataFrame containing the current state of
         resources.
         :param source: Path to gem5

         :returns resources: DataFrame with ['code-examples'] populated.
         """
         id = resource["id"]
         # search for files in the folder tree that contain the 'id' value
         matching_files = self._search_folder(
             source + "/configs", '"' + id + '"'
         )
         filenames = [os.path.basename(path) for path in matching_files]
         tested_files = []
         for file in filenames:
             tested_files.append(
                 True
                 if len(self._search_folder(source + "/tests/gem5", file)) > 0
                 else False
             )

         matching_files = [
             file.replace(source, self.base_url) for file in matching_files
         ]

         code_examples = []

         for i in range(len(matching_files)):
             json_obj = {
                 "example": matching_files[i],
                 "tested": tested_files[i],
             }
             code_examples.append(json_obj)
         return code_examples

     def unwrap_resources(self, ver):
         data = self._get_file_data(self.resource_url_map[ver])
         resources = data["resources"]
         new_resources = []
         for resource in resources:
             if resource["type"] == "group":
                 for group in resource["contents"]:
                     new_resources.append(group)
             else:
                 new_resources.append(resource)
         return new_resources

     def _get_example_usage(self, resource):
         if resource["category"] == "workload":
             return f"Workload(\"{resource['id']}\")"
         else:
             return f"obtain_resource(resource_id=\"{resource['id']}\")"

     def _parse_readme(self, url):
         metadata = {
             "tags": [],
             "author": [],
             "license": "",
         }
         try:
             request = requests.get(url)
             content = request.text
             content = content.split("---")[1]
             content = content.split("---")[0]
             if "tags:" in content:
                 tags = content.split("tags:\n")[1]
                 tags = tags.split(":")[0]
                 tags = tags.split("\n")[:-1]
                 tags = [tag.strip().replace("- ", "") for tag in tags]
                 if tags == [""] or tags == None:
                     tags = []
                 metadata["tags"] = tags
             if "author:" in content:
                 author = content.split("author:")[1]
                 author = author.split("\n")[0]
                 author = (
                     author.replace("[", "").replace("]", "").replace('"', "")
                 )
                 author = author.split(",")
                 author = [a.strip() for a in author]
                 metadata["author"] = author
             if "license:" in content:
                 license = content.split("license:")[1].split("\n")[0]
                 metadata["license"] = license
         except:
             pass
         return metadata

     def _add_fields(self, resources, source):
         new_resources = []
         for resource in resources:
             res = self._change_type(resource)
             res["gem5_versions"] = ["23.0"]
             res["resource_version"] = "1.0.0"
             res["category"] = res["type"]
             del res["type"]
             res["id"] = res["name"]
             del res["name"]
             res["description"] = res["documentation"]
             del res["documentation"]
             if "additional_metadata" in res:
                 for k, v in res["additional_metadata"].items():
                     res[k] = v
                 del res["additional_metadata"]
             res["example_usage"] = self._get_example_usage(res)
             if "source" in res:
                 url = (
                     "https://raw.githubusercontent.com/gem5/"
                     "gem5-resources/develop/"
                     + str(res["source"])
                     + "/README.md"
                 )
                 res["source_url"] = (
                     "https://github.com/gem5/gem5-resources/tree/develop/"
                     + str(res["source"])
                 )
             else:
                 url = ""
                 res["source_url"] = ""
             metadata = self._parse_readme(url)
             if "tags" in res:
                 res["tags"].extend(metadata["tags"])
             else:
                 res["tags"] = metadata["tags"]
             res["author"] = metadata["author"]
             res["license"] = metadata["license"]

             res["code_examples"] = self._extract_code_examples(res, source)

             if "url" in resource:
                 download_url = res["url"].replace(
                     "{url_base}", "http://dist.gem5.org/dist/develop"
                 )
                 res["url"] = download_url
                 res["size"] = self._get_size(download_url)
             else:
                 res["size"] = 0

             res = {k: v for k, v in res.items() if v is not None}

             new_resources.append(res)
         return new_resources

     def _validate_schema(self, resources):
         for resource in resources:
             try:
                 validate(resource, schema=self.schema)
             except Exception as e:
                 print(resource)
                 raise e

     def create_json(self, version, source, output):
         resources = self.unwrap_resources(version)
         resources = self._add_fields(resources, source)
         self._validate_schema(resources)
         with open(output, "w") as f:
             json.dump(resources, f, indent=4)
	# Copyright (c) 2023 The Regents of the University of California
	# All rights reserved.
	#
	# Redistribution and use in source and binary forms, with or without
	# modification, are permitted provided that the following conditions are
	# met: redistributions of source code must retain the above copyright
	# notice, this list of conditions and the following disclaimer;
	# redistributions in binary form must reproduce the above copyright
	# notice, this list of conditions and the following disclaimer in the
	# documentation and/or other materials provided with the distribution;
	# neither the name of the copyright holders nor the names of its
	# contributors may be used to endorse or promote products derived from
	# this software without specific prior written permission.
	#
	# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	import json
	import requests
	import base64
	import os
	from jsonschema import validate


	class ResourceJsonCreator:
	"""
	This class generates the JSON which is pushed onto MongoDB.
	On a high-level, it does the following:
	- Adds certain fields to the JSON.
	- Populates those fields.
	- Makes sure the JSON follows the schema.
	"""

	# Global Variables
	base_url = "https://github.com/gem5/gem5/tree/develop" # gem5 GitHub URL
	resource_url_map = {
	"dev": (
	"https://gem5.googlesource.com/public/gem5-resources/+/refs/heads/"
	"develop/resources.json?format=TEXT"
	),
	"22.1": (
	"https://gem5.googlesource.com/public/gem5-resources/+/refs/heads/"
	"stable/resources.json?format=TEXT"
	),
	"22.0": (
	"http://resources.gem5.org/prev-resources-json/"
	"resources-21-2.json"
	),
	"21.2": (
	"http://resources.gem5.org/prev-resources-json/"
	"resources-22-0.json"
	),
	}

	def __init__(self):
	self.schema = {}
	with open("schema/schema.json", "r") as f:
	self.schema = json.load(f)

	def _get_file_data(self, url):
	json_data = None
	try:
	json_data = requests.get(url).text
	json_data = base64.b64decode(json_data).decode("utf-8")
	return json.loads(json_data)
	except:
	json_data = requests.get(url).json()
	return json_data

	def _get_size(self, url):
	"""
	Helper function to return the size of a download through its URL.
	Returns 0 if URL has an error.

	:param url: Download URL
	"""
	try:
	response = requests.head(url)
	size = int(response.headers.get("content-length", 0))
	return size
	except Exception as e:
	return 0

	def _search_folder(self, folder_path, id):
	"""
	Helper function to find the instance of a string in a folder.
	This is recursive, i.e., subfolders will also be searched.

	:param folder_path: Path to the folder to begin searching
	:param id: Phrase to search in the folder

	:returns matching_files: List of file paths to the files containing id
	"""
	matching_files = []
	for filename in os.listdir(folder_path):
	file_path = os.path.join(folder_path, filename)
	if os.path.isfile(file_path):
	with open(
	file_path, "r", encoding="utf-8", errors="ignore"
	) as f:
	contents = f.read()
	if id in contents:
	file_path = file_path.replace("\\", "/")
	matching_files.append(file_path)
	elif os.path.isdir(file_path):
	matching_files.extend(self._search_folder(file_path, id))
	return matching_files

	def _change_type(self, resource):
	if resource["type"] == "workload":
	# get the architecture from the name and remove 64 from it
	resource["architecture"] = (
	resource["name"].split("-")[0].replace("64", "").upper()
	)
	return resource
	if "kernel" in resource["name"]:
	resource["type"] = "kernel"
	elif "bootloader" in resource["name"]:
	resource["type"] = "bootloader"
	elif "benchmark" in resource["documentation"]:
	resource["type"] = "disk-image"
	# if tags not in resource:
	if "tags" not in resource:
	resource["tags"] = []
	resource["tags"].append("benchmark")
	if (
	"additional_metadata" in resource
	and "root_partition" in resource["additional_metadata"]
	and resource["additional_metadata"]["root_partition"]
	is not None
	):
	resource["root_partition"] = resource["additional_metadata"][
	"root_partition"
	]
	else:
	resource["root_partition"] = ""
	elif resource["url"] is not None and ".img.gz" in resource["url"]:
	resource["type"] = "disk-image"
	if (
	"additional_metadata" in resource
	and "root_partition" in resource["additional_metadata"]
	and resource["additional_metadata"]["root_partition"]
	is not None
	):
	resource["root_partition"] = resource["additional_metadata"][
	"root_partition"
	]
	else:
	resource["root_partition"] = ""
	elif "binary" in resource["documentation"]:
	resource["type"] = "binary"
	elif "checkpoint" in resource["documentation"]:
	resource["type"] = "checkpoint"
	elif "simpoint" in resource["documentation"]:
	resource["type"] = "simpoint"
	return resource

	def _extract_code_examples(self, resource, source):
	"""
	This function goes by IDs present in the resources DataFrame.
	It finds which files use those IDs in gem5/configs.
	It adds the GitHub URL of those files under "example".
	It finds whether those files are used in gem5/tests/gem5.
	If yes, it marks "tested" as True. If not, it marks "tested" as False.
	"example" and "tested" are made into a JSON for every code example.
	This list of JSONs is assigned to the 'code_examples' field of the
	DataFrame.

	:param resources: A DataFrame containing the current state of
	resources.
	:param source: Path to gem5

	:returns resources: DataFrame with ['code-examples'] populated.
	"""
	id = resource["id"]
	# search for files in the folder tree that contain the 'id' value
	matching_files = self._search_folder(
	source + "/configs", '"' + id + '"'
	)
	filenames = [os.path.basename(path) for path in matching_files]
	tested_files = []
	for file in filenames:
	tested_files.append(
	True
	if len(self._search_folder(source + "/tests/gem5", file)) > 0
	else False
	)

	matching_files = [
	file.replace(source, self.base_url) for file in matching_files
	]

	code_examples = []

	for i in range(len(matching_files)):
	json_obj = {
	"example": matching_files[i],
	"tested": tested_files[i],
	}
	code_examples.append(json_obj)
	return code_examples

	def unwrap_resources(self, ver):
	data = self._get_file_data(self.resource_url_map[ver])
	resources = data["resources"]
	new_resources = []
	for resource in resources:
	if resource["type"] == "group":
	for group in resource["contents"]:
	new_resources.append(group)
	else:
	new_resources.append(resource)
	return new_resources

	def _get_example_usage(self, resource):
	if resource["category"] == "workload":
	return f"Workload(\"{resource['id']}\")"
	else:
	return f"obtain_resource(resource_id=\"{resource['id']}\")"

	def _parse_readme(self, url):
	metadata = {
	"tags": [],
	"author": [],
	"license": "",
	}
	try:
	request = requests.get(url)
	content = request.text
	content = content.split("---")[1]
	content = content.split("---")[0]
	if "tags:" in content:
	tags = content.split("tags:\n")[1]
	tags = tags.split(":")[0]
	tags = tags.split("\n")[:-1]
	tags = [tag.strip().replace("- ", "") for tag in tags]
	if tags == [""] or tags == None:
	tags = []
	metadata["tags"] = tags
	if "author:" in content:
	author = content.split("author:")[1]
	author = author.split("\n")[0]
	author = (
	author.replace("[", "").replace("]", "").replace('"', "")
	)
	author = author.split(",")
	author = [a.strip() for a in author]
	metadata["author"] = author
	if "license:" in content:
	license = content.split("license:")[1].split("\n")[0]
	metadata["license"] = license
	except:
	pass
	return metadata

	def _add_fields(self, resources, source):
	new_resources = []
	for resource in resources:
	res = self._change_type(resource)
	res["gem5_versions"] = ["23.0"]
	res["resource_version"] = "1.0.0"
	res["category"] = res["type"]
	del res["type"]
	res["id"] = res["name"]
	del res["name"]
	res["description"] = res["documentation"]
	del res["documentation"]
	if "additional_metadata" in res:
	for k, v in res["additional_metadata"].items():
	res[k] = v
	del res["additional_metadata"]
	res["example_usage"] = self._get_example_usage(res)
	if "source" in res:
	url = (
	"https://raw.githubusercontent.com/gem5/"
	"gem5-resources/develop/"
	+ str(res["source"])
	+ "/README.md"
	)
	res["source_url"] = (
	"https://github.com/gem5/gem5-resources/tree/develop/"
	+ str(res["source"])
	)
	else:
	url = ""
	res["source_url"] = ""
	metadata = self._parse_readme(url)
	if "tags" in res:
	res["tags"].extend(metadata["tags"])
	else:
	res["tags"] = metadata["tags"]
	res["author"] = metadata["author"]
	res["license"] = metadata["license"]

	res["code_examples"] = self._extract_code_examples(res, source)

	if "url" in resource:
	download_url = res["url"].replace(
	"{url_base}", "http://dist.gem5.org/dist/develop"
	)
	res["url"] = download_url
	res["size"] = self._get_size(download_url)
	else:
	res["size"] = 0

	res = {k: v for k, v in res.items() if v is not None}

	new_resources.append(res)
	return new_resources

	def _validate_schema(self, resources):
	for resource in resources:
	try:
	validate(resource, schema=self.schema)
	except Exception as e:
	print(resource)
	raise e

	def create_json(self, version, source, output):
	resources = self.unwrap_resources(version)
	resources = self._add_fields(resources, source)
	self._validate_schema(resources)
	with open(output, "w") as f:
	json.dump(resources, f, indent=4)