util/find_copyrights.py - public/gem5 - Git at Google

 #!/usr/bin/env python3

 import os
 import re
 import sys

 from file_types import lang_type, find_files

 # Emacs-style "-*- mode: ... -*-" marker; skipped when it is the first line.
 mode_line = re.compile(r"(-\*- *mode:.* *-\*-)")
 # "#" as the first non-blank character of a line.
 shell_comment = re.compile(r"^\s*#")
 # The remaining patterns are searched anywhere in a line, not only at its start.
 lisp_comment = re.compile(r";")
 cpp_comment = re.compile(r"//")
 c_comment_start = re.compile(r"/\*")
 c_comment_end = re.compile(r"\*/")


 def _line_comment_blocks(lines, comment_re, skip_shebang):
     """Yield (start, end) index pairs for runs of line-comment lines.

     A first line holding a mode line (or, when skip_shebang is true, a
     "#!" line) is ignored. Scanning stops at the first non-blank line that
     neither matches comment_re nor directly ends a comment run.
     """
     start = None
     for i, line in enumerate(lines):
         if i == 0 and (
             (skip_shebang and line.startswith("#!")) or mode_line.search(line)
         ):
             continue

         if comment_re.search(line):
             if start is None:
                 start = i
         elif start is not None:
             # The first non-comment line closes the current run.
             yield start, i - 1
             start = None
         elif line.strip():
             return

     # A run that reaches the end of the input is still a block.
     if start is not None:
         yield start, len(lines) - 1


 def _c_family_blocks(lines):
     """Yield (start, end) index pairs for leading /* */ and // comment blocks."""
     start = None
     mode = None  # None, "C" (inside /* ... */) or "CPP" (inside a // run)
     for i, line in enumerate(lines):
         if i == 0 and mode_line.search(line):
             continue

         if mode == "C":
             assert start is not None, "on line %d" % (i + 1)
             if c_comment_end.search(line):
                 yield start, i
                 mode = None
             continue

         cpp_match = cpp_comment.search(line)
         c_match = c_comment_start.search(line)

         if cpp_match:
             assert not c_match, "on line %d" % (i + 1)
             if line[: cpp_match.start()].strip():
                 # Something other than whitespace precedes the "//".
                 return
             if mode is None:
                 mode = "CPP"
                 start = i
             elif line[cpp_match.end() :].lstrip().startswith("Copyright"):
                 # A new "Copyright" line inside a // run starts a new block.
                 yield start, i - 1
                 start = i
             continue

         if mode == "CPP":
             assert start is not None, "on line %d" % (i + 1)
             if not line.strip():
                 # Blank lines do not end a // run.
                 continue
             yield start, i - 1
             mode = None
             if not c_match:
                 return

         if c_match:
             assert mode is None, "on line %d" % (i + 1)
             mode = "C"
             start = i
             # A comment opened and closed on the same line ends right here.
             if c_comment_end.search(line, c_match.end()):
                 yield start, i
                 mode = None
             continue

         if mode is None and line.strip():
             return

     # A // run that reaches the end of the input is still a block.
     if mode == "CPP":
         yield start, len(lines) - 1


 def find_copyright_block(lines, lang_type):
     """Yield the comment blocks found at the top of a source file.

     Args:
         lines: sequence of the file's lines.
         lang_type: language name selecting the comment syntax to look for.

     Yields:
         Inclusive, zero-based (start, end) index pairs into lines.

     Raises:
         AttributeError: lang_type is not a handled language. Because this is
             a generator, the error surfaces on the first iteration.
         AssertionError: a C-family line breaks a scanner invariant, e.g. it
             contains both "//" and "/*".
     """
     if lang_type in ("python", "make", "shell", "perl", "scons"):
         yield from _line_comment_blocks(lines, shell_comment, True)
     elif lang_type in ("lisp",):
         yield from _line_comment_blocks(lines, lisp_comment, False)
     elif lang_type in (
         "C",
         "C++",
         "swig",
         "isa",
         "asm",
         "slicc",
         "lex",
         "yacc",
     ):
         yield from _c_family_blocks(lines)
     else:
         raise AttributeError("Could not handle language %s" % lang_type)


 date_range_re = re.compile(r"([0-9]{4})\s*-\s*([0-9]{4})")


 def process_dates(dates):
     """Expand a comma-separated list of years and year ranges into a set.

     Each field is either a single integer or a "YYYY-YYYY" range, which
     contributes every year from its first to its last value inclusive.
     Fields that are neither are ignored.

     Args:
         dates: string such as "2002, 2004-2006".

     Returns:
         A set of ints.
     """
     years = set()
     for field in dates.split(","):
         field = field.strip()
         span = date_range_re.match(field)
         if span is None:
             try:
                 years.add(int(field))
             except ValueError:
                 # Not a number (for instance an empty field): skip it.
                 continue
         else:
             low, high = (int(part) for part in span.groups())
             years.update(range(low, high + 1))

     return years


 # Groups: the "(c)" marker, the raw date list, and the owner text.
 # Letters are spelled A-Za-z: the range A-z would also admit the
 # punctuation between "Z" and "a" ([ \ ] ^ _ and backtick).
 copyright_re = re.compile(
     r"Copyright (\([cC]\)) ([-, 0-9]+)[\s*#/]*([A-Za-z,. -]+)", re.DOTALL
 )

 # An "Authors:" line, optionally preceded by comment characters.
 authors_re = re.compile(r"^[\s*#/]*Authors:\s*([A-Za-z .]+)\s*$")
 # A line holding only letters, spaces and dots after the comment characters.
 more_authors_re = re.compile(r"^[\s*#/]*([A-Za-z .]+)\s*$")

 # Every owner string seen so far, across all processed files.
 all_owners = set()


 def _scan_authors(lines, start, end):
     """Collect author names from lines[start..end].

     Returns the list of names together with the block's end index, which is
     pulled in when a non-author line follows the author list.
     """
     authors = []
     for idx in range(start, end + 1):
         text = lines[idx]
         if not authors:
             # Still looking for the "Authors:" line itself.
             hit = authors_re.search(text)
             if hit:
                 authors.append(hit.group(1).strip())
             continue

         hit = more_authors_re.search(text)
         if hit:
             authors.append(hit.group(1).strip())
             continue

         # First line after the author list that is not a name: cut the
         # block at the next blank line or non-empty "//" line.
         for probe in range(idx, end + 1):
             stripped = lines[probe].strip()
             if not stripped:
                 end = probe
                 break
             if stripped.startswith("//") and stripped[2:].lstrip():
                 end = probe - 1
                 break
         break

     return authors, end


 def get_data(lang_type, lines):
     """Extract the copyright records found in a file's comment blocks.

     Every owner encountered is also added to the module-level all_owners
     set.

     Args:
         lang_type: language name passed on to find_copyright_block.
         lines: the file's lines.

     Returns:
         A list of (owner, dates, authors, start, end) tuples, where dates is
         the result of process_dates and start/end are line indices.
     """
     records = []
     for start, end in find_copyright_block(lines, lang_type):
         block_text = "".join(lines[start : end + 1])
         found = copyright_re.search(block_text)
         if found is None:
             continue

         date_text = found.group(2).strip()
         owner = found.group(3).strip()

         all_owners.add(owner)
         try:
             years = process_dates(date_text)
         except Exception:
             # Show what was being parsed before propagating the error.
             print(date_text)
             print(owner)
             raise

         authors, end = _scan_authors(lines, start, end)
         records.append((owner, years, authors, start, end))

     return records


 def datestr(dates):
     """Format a collection of years as a compact comma-separated string.

     Consecutive years are collapsed into "first-last" ranges, e.g.
     {2001, 2002, 2003, 2005} becomes "2001-2003,2005".

     Args:
         dates: iterable of ints; order and duplicates do not matter.

     Returns:
         The formatted string, or "" when dates is empty.
     """
     years = sorted(set(dates))
     if not years:
         # Nothing to format; the original raised IndexError here.
         return ""

     def span(first, last):
         return "%d" % first if first == last else "%d-%d" % (first, last)

     pieces = []
     first = last = years[0]
     for year in years[1:]:
         if year == last + 1:
             last = year
             continue
         pieces.append(span(first, last))
         first = last = year
     pieces.append(span(first, last))

     return ",".join(pieces)


 # Lists every option in the getopt spec "ci:v" used by the script entry
 # point; positional arguments may be files or directories.
 usage_str = "usage:\n%s [-c] [-i <ignore>] [-v] <file-or-directory>..."


 def usage(exitcode):
     """Print the usage message and optionally exit.

     Args:
         exitcode: status passed to sys.exit, or None to return normally.
     """
     print(usage_str % sys.argv[0])
     if exitcode is not None:
         sys.exit(exitcode)


 if __name__ == "__main__":
     # Script entry point: gather files, parse their copyright blocks and
     # print one merged "Copyright (c) <years> <owner>" line per owner.
     import getopt

     show_counts = False
     # Values given with -i are collected here; nothing in this block reads them.
     ignore = set()
     verbose = False
     try:
         opts, args = getopt.getopt(sys.argv[1:], "ci:v")
     except getopt.GetoptError:
         usage(1)

     for o, a in opts:
         if o == "-c":
             show_counts = True
         if o == "-i":
             ignore.add(a)
         if o == "-v":
             verbose = True

     files = []

     for base in args:
         if os.path.isfile(base):
             files += [(base, lang_type(base))]
         elif os.path.isdir(base):
             files += find_files(base)
         else:
             raise AttributeError("can't access '%s'" % base)

     copyrights = {}
     counts = {}

     for filename, _lang in files:
         # open() replaces the Python 2 file() builtin, which does not exist
         # in Python 3; the with-statement closes the handle. Undecodable
         # bytes are replaced so one odd file cannot abort the whole scan.
         with open(filename, "r", errors="replace") as f:
             lines = f.readlines()
         if not lines:
             continue

         lines = [line.rstrip("\r\n") for line in lines]

         # The language is re-detected using the file's first line.
         lt = lang_type(filename, lines[0])
         try:
             data = get_data(lt, lines)
         except Exception as e:
             # Best effort: a file that cannot be parsed is skipped.
             if verbose:
                 if len(e.args) == 1:
                     e.args = ("%s (%s)" % (e, filename),)
                 print("could not parse %s: %s" % (filename, e))
             continue

         for owner, dates, authors, start, end in data:
             if owner not in copyrights:
                 copyrights[owner] = set()
             if owner not in counts:
                 counts[owner] = 0

             copyrights[owner] |= dates
             counts[owner] += 1

     info = [(counts[o], d, o) for o, d in copyrights.items()]

     # Sort on (count, owner) only: comparing the year sets on a count tie
     # would use subset ordering, which is not a total order.
     for count, dates, owner in sorted(
         info, key=lambda entry: (entry[0], entry[2]), reverse=True
     ):
         if show_counts:
             owner = "%s (%s files)" % (owner, count)
         print("Copyright (c) %s %s" % (datestr(dates), owner))
	#!/usr/bin/env python3

	import os
	import re
	import sys

	from file_types import lang_type, find_files

	# Emacs-style "-*- mode: ... -*-" marker; skipped when it is the first line.
	mode_line = re.compile(r"(-\*- *mode:.* *-\*-)")
	# "#" as the first non-blank character of a line.
	shell_comment = re.compile(r"^\s*#")
	# The remaining patterns are searched anywhere in a line, not only at its start.
	lisp_comment = re.compile(r";")
	cpp_comment = re.compile(r"//")
	c_comment_start = re.compile(r"/\*")
	c_comment_end = re.compile(r"\*/")


	def _line_comment_blocks(lines, comment_re, skip_shebang):
	    """Yield (start, end) index pairs for runs of line-comment lines.

	    A first line holding a mode line (or, when skip_shebang is true, a
	    "#!" line) is ignored. Scanning stops at the first non-blank line that
	    neither matches comment_re nor directly ends a comment run.
	    """
	    start = None
	    for i, line in enumerate(lines):
	        if i == 0 and (
	            (skip_shebang and line.startswith("#!")) or mode_line.search(line)
	        ):
	            continue

	        if comment_re.search(line):
	            if start is None:
	                start = i
	        elif start is not None:
	            # The first non-comment line closes the current run.
	            yield start, i - 1
	            start = None
	        elif line.strip():
	            return

	    # A run that reaches the end of the input is still a block.
	    if start is not None:
	        yield start, len(lines) - 1


	def _c_family_blocks(lines):
	    """Yield (start, end) index pairs for leading /* */ and // comment blocks."""
	    start = None
	    mode = None  # None, "C" (inside /* ... */) or "CPP" (inside a // run)
	    for i, line in enumerate(lines):
	        if i == 0 and mode_line.search(line):
	            continue

	        if mode == "C":
	            assert start is not None, "on line %d" % (i + 1)
	            if c_comment_end.search(line):
	                yield start, i
	                mode = None
	            continue

	        cpp_match = cpp_comment.search(line)
	        c_match = c_comment_start.search(line)

	        if cpp_match:
	            assert not c_match, "on line %d" % (i + 1)
	            if line[: cpp_match.start()].strip():
	                # Something other than whitespace precedes the "//".
	                return
	            if mode is None:
	                mode = "CPP"
	                start = i
	            elif line[cpp_match.end() :].lstrip().startswith("Copyright"):
	                # A new "Copyright" line inside a // run starts a new block.
	                yield start, i - 1
	                start = i
	            continue

	        if mode == "CPP":
	            assert start is not None, "on line %d" % (i + 1)
	            if not line.strip():
	                # Blank lines do not end a // run.
	                continue
	            yield start, i - 1
	            mode = None
	            if not c_match:
	                return

	        if c_match:
	            assert mode is None, "on line %d" % (i + 1)
	            mode = "C"
	            start = i
	            # A comment opened and closed on the same line ends right here.
	            if c_comment_end.search(line, c_match.end()):
	                yield start, i
	                mode = None
	            continue

	        if mode is None and line.strip():
	            return

	    # A // run that reaches the end of the input is still a block.
	    if mode == "CPP":
	        yield start, len(lines) - 1


	def find_copyright_block(lines, lang_type):
	    """Yield the comment blocks found at the top of a source file.

	    Args:
	        lines: sequence of the file's lines.
	        lang_type: language name selecting the comment syntax to look for.

	    Yields:
	        Inclusive, zero-based (start, end) index pairs into lines.

	    Raises:
	        AttributeError: lang_type is not a handled language. Because this is
	            a generator, the error surfaces on the first iteration.
	        AssertionError: a C-family line breaks a scanner invariant, e.g. it
	            contains both "//" and "/*".
	    """
	    if lang_type in ("python", "make", "shell", "perl", "scons"):
	        yield from _line_comment_blocks(lines, shell_comment, True)
	    elif lang_type in ("lisp",):
	        yield from _line_comment_blocks(lines, lisp_comment, False)
	    elif lang_type in (
	        "C",
	        "C++",
	        "swig",
	        "isa",
	        "asm",
	        "slicc",
	        "lex",
	        "yacc",
	    ):
	        yield from _c_family_blocks(lines)
	    else:
	        raise AttributeError("Could not handle language %s" % lang_type)


	# Whitespace around the hyphen is optional, so both "2002-2005" and
	# "2002 - 2005" are recognised as ranges.
	date_range_re = re.compile(r"([0-9]{4})\s*-\s*([0-9]{4})")


	def process_dates(dates):
	    """Expand a comma-separated list of years and year ranges into a set.

	    Each field is either a single integer or a "YYYY-YYYY" range, which
	    contributes every year from its first to its last value inclusive.
	    Fields that are neither are ignored.

	    Args:
	        dates: string such as "2002, 2004-2006".

	    Returns:
	        A set of ints.
	    """
	    years = set()
	    for field in dates.split(","):
	        field = field.strip()
	        span = date_range_re.match(field)
	        if span is None:
	            try:
	                years.add(int(field))
	            except ValueError:
	                # Not a number (for instance an empty field): skip it.
	                continue
	        else:
	            low, high = (int(part) for part in span.groups())
	            years.update(range(low, high + 1))

	    return years


	# Groups: the "(c)" marker, the raw date list, and the owner text.
	# Any run of whitespace and comment characters may separate the dates
	# from the owner. Letters are spelled A-Za-z: the range A-z would also
	# admit the punctuation between "Z" and "a" ([ \ ] ^ _ and backtick).
	copyright_re = re.compile(
	    r"Copyright (\([cC]\)) ([-, 0-9]+)[\s*#/]*([A-Za-z,. -]+)", re.DOTALL
	)

	# An "Authors:" line, optionally preceded by comment characters.
	authors_re = re.compile(r"^[\s*#/]*Authors:\s*([A-Za-z .]+)\s*$")
	# A line holding only letters, spaces and dots after the comment characters.
	more_authors_re = re.compile(r"^[\s*#/]*([A-Za-z .]+)\s*$")

	# Every owner string seen so far, across all processed files.
	all_owners = set()


	def _scan_authors(lines, start, end):
	    """Collect author names from lines[start..end].

	    Returns the list of names together with the block's end index, which is
	    pulled in when a non-author line follows the author list.
	    """
	    authors = []
	    for idx in range(start, end + 1):
	        text = lines[idx]
	        if not authors:
	            # Still looking for the "Authors:" line itself.
	            hit = authors_re.search(text)
	            if hit:
	                authors.append(hit.group(1).strip())
	            continue

	        hit = more_authors_re.search(text)
	        if hit:
	            authors.append(hit.group(1).strip())
	            continue

	        # First line after the author list that is not a name: cut the
	        # block at the next blank line or non-empty "//" line.
	        for probe in range(idx, end + 1):
	            stripped = lines[probe].strip()
	            if not stripped:
	                end = probe
	                break
	            if stripped.startswith("//") and stripped[2:].lstrip():
	                end = probe - 1
	                break
	        break

	    return authors, end


	def get_data(lang_type, lines):
	    """Extract the copyright records found in a file's comment blocks.

	    Every owner encountered is also added to the module-level all_owners
	    set.

	    Args:
	        lang_type: language name passed on to find_copyright_block.
	        lines: the file's lines.

	    Returns:
	        A list of (owner, dates, authors, start, end) tuples, where dates is
	        the result of process_dates and start/end are line indices.
	    """
	    records = []
	    for start, end in find_copyright_block(lines, lang_type):
	        block_text = "".join(lines[start : end + 1])
	        found = copyright_re.search(block_text)
	        if found is None:
	            continue

	        date_text = found.group(2).strip()
	        owner = found.group(3).strip()

	        all_owners.add(owner)
	        try:
	            years = process_dates(date_text)
	        except Exception:
	            # Show what was being parsed before propagating the error.
	            print(date_text)
	            print(owner)
	            raise

	        authors, end = _scan_authors(lines, start, end)
	        records.append((owner, years, authors, start, end))

	    return records


	def datestr(dates):
	    """Format a collection of years as a compact comma-separated string.

	    Consecutive years are collapsed into "first-last" ranges, e.g.
	    {2001, 2002, 2003, 2005} becomes "2001-2003,2005".

	    Args:
	        dates: iterable of ints; order and duplicates do not matter.

	    Returns:
	        The formatted string, or "" when dates is empty.
	    """
	    years = sorted(set(dates))
	    if not years:
	        return ""

	    def span(first, last):
	        return "%d" % first if first == last else "%d-%d" % (first, last)

	    pieces = []
	    first = last = years[0]
	    for year in years[1:]:
	        if year == last + 1:
	            last = year
	            continue
	        pieces.append(span(first, last))
	        first = last = year
	    pieces.append(span(first, last))

	    return ",".join(pieces)


	# Lists every option in the getopt spec "ci:v" used by the script entry
	# point; positional arguments may be files or directories.
	usage_str = "usage:\n%s [-c] [-i <ignore>] [-v] <file-or-directory>..."


	def usage(exitcode):
	    """Print the usage message and optionally exit.

	    Args:
	        exitcode: status passed to sys.exit, or None to return normally.
	    """
	    print(usage_str % sys.argv[0])
	    if exitcode is not None:
	        sys.exit(exitcode)


	if __name__ == "__main__":
	    # Script entry point: gather files, parse their copyright blocks and
	    # print one merged "Copyright (c) <years> <owner>" line per owner.
	    import getopt

	    show_counts = False
	    # Values given with -i are collected here; nothing in this block reads them.
	    ignore = set()
	    verbose = False
	    try:
	        opts, args = getopt.getopt(sys.argv[1:], "ci:v")
	    except getopt.GetoptError:
	        usage(1)

	    for o, a in opts:
	        if o == "-c":
	            show_counts = True
	        if o == "-i":
	            ignore.add(a)
	        if o == "-v":
	            verbose = True

	    files = []

	    for base in args:
	        if os.path.isfile(base):
	            files += [(base, lang_type(base))]
	        elif os.path.isdir(base):
	            files += find_files(base)
	        else:
	            raise AttributeError("can't access '%s'" % base)

	    copyrights = {}
	    counts = {}

	    for filename, _lang in files:
	        # open() replaces the Python 2 file() builtin, which does not exist
	        # in Python 3; the with-statement closes the handle. Undecodable
	        # bytes are replaced so one odd file cannot abort the whole scan.
	        with open(filename, "r", errors="replace") as f:
	            lines = f.readlines()
	        if not lines:
	            continue

	        lines = [line.rstrip("\r\n") for line in lines]

	        # The language is re-detected using the file's first line.
	        lt = lang_type(filename, lines[0])
	        try:
	            data = get_data(lt, lines)
	        except Exception as e:
	            # Best effort: a file that cannot be parsed is skipped.
	            if verbose:
	                if len(e.args) == 1:
	                    e.args = ("%s (%s)" % (e, filename),)
	                print("could not parse %s: %s" % (filename, e))
	            continue

	        for owner, dates, authors, start, end in data:
	            if owner not in copyrights:
	                copyrights[owner] = set()
	            if owner not in counts:
	                counts[owner] = 0

	            copyrights[owner] |= dates
	            counts[owner] += 1

	    info = [(counts[o], d, o) for o, d in copyrights.items()]

	    # Sort on (count, owner) only: comparing the year sets on a count tie
	    # would use subset ordering, which is not a total order.
	    for count, dates, owner in sorted(
	        info, key=lambda entry: (entry[0], entry[2]), reverse=True
	    ):
	        if show_counts:
	            owner = "%s (%s files)" % (owner, count)
	        print("Copyright (c) %s %s" % (datestr(dates), owner))