util/find_copyrights.py - public/gem5 - Git at Google

 #!/usr/bin/env python3

 import os
 import re
 import sys

 from file_types import lang_type, find_files

 # Patterns used to recognise editor mode lines and comment delimiters.
 mode_line = re.compile(r'(-\*- *mode:.* *-\*-)')
 shell_comment = re.compile(r'^\s*#')
 lisp_comment = re.compile(r';')
 cpp_comment = re.compile(r'//')
 c_comment_start = re.compile(r'/\*')
 c_comment_end = re.compile(r'\*/')


 def _line_comment_blocks(lines, comment_re, skip_shebang):
     """Yield (start, end) index pairs for leading blocks of line comments.

     A line belongs to a block when comment_re matches it.  Scanning stops
     at the first non-blank, non-comment line that is found while no block
     is open.  If skip_shebang is true, a first line starting with '#!' is
     ignored; a first line containing an editor mode line is always ignored.
     """
     start = None
     for i, line in enumerate(lines):
         if i == 0 and ((skip_shebang and line.startswith('#!'))
                        or mode_line.search(line)):
             continue

         if comment_re.search(line):
             if start is None:
                 start = i
         elif start is not None:
             # First non-comment line after a block closes that block.
             yield start, i - 1
             start = None
         elif line.strip():
             # Real content outside any comment block: the header is over.
             return

     # The file ended while a comment block was still open.
     if start is not None:
         yield start, len(lines) - 1


 def _c_comment_blocks(lines):
     """Yield (start, end) index pairs for leading C and C++ comment blocks.

     mode is None outside a comment, 'C' inside a /* ... */ comment and
     'CPP' inside a run of // lines.  Blank lines do not end a // run, and a
     // line whose text starts with "Copyright" splits the run in two.
     """
     start = None
     mode = None
     for i, line in enumerate(lines):
         if i == 0 and mode_line.search(line):
             continue

         if mode == 'C':
             assert start is not None, 'on line %d' % (i + 1)
             if c_comment_end.search(line):
                 yield start, i
                 mode = None
             continue

         cpp_match = cpp_comment.search(line)
         c_match = c_comment_start.search(line)

         if cpp_match:
             # A line holding both '//' and '/*' is not handled; callers see
             # the AssertionError as a parse failure.
             assert not c_match, 'on line %d' % (i + 1)
             if line[:cpp_match.start()].strip():
                 # Code precedes the comment, so the header has ended.  Emit
                 # any // run that is still pending instead of dropping it.
                 if mode == 'CPP':
                     yield start, i - 1
                 return
             if mode is None:
                 mode = 'CPP'
                 start = i
             elif line[cpp_match.end():].lstrip().startswith("Copyright"):
                 yield start, i - 1
                 start = i
             continue

         if mode == 'CPP':
             assert start is not None, 'on line %d' % (i + 1)
             if not line.strip():
                 continue
             yield start, i - 1
             mode = None
             if not c_match:
                 return

         if c_match:
             assert mode is None, 'on line %d' % (i + 1)
             start = i
             if c_comment_end.search(line, c_match.end()):
                 # The comment opens and closes on this same line.
                 yield i, i
             else:
                 mode = 'C'
             continue

         if line.strip():
             return

     # The file ended in the middle of a run of // comments.
     if mode == 'CPP':
         yield start, len(lines) - 1


 def find_copyright_block(lines, lang_type):
     """Generate the comment blocks found at the top of a source file.

     Args:
         lines: list of the file's lines.
         lang_type: language name selecting the comment syntax.  The
             parameter shadows the imported lang_type() helper inside this
             function.

     Yields:
         (start, end) tuples of 0-based, inclusive line indices, one per
         comment block.

     Raises:
         AttributeError: if lang_type is not a supported language.  Because
             this is a generator, the error surfaces on first iteration.
     """
     if lang_type in ('python', 'make', 'shell', 'perl', 'scons'):
         yield from _line_comment_blocks(lines, shell_comment, True)
     elif lang_type in ('lisp', ):
         yield from _line_comment_blocks(lines, lisp_comment, False)
     elif lang_type in ('C', 'C++', 'swig', 'isa', 'asm', 'slicc',
                        'lex', 'yacc'):
         yield from _c_comment_blocks(lines)
     else:
         raise AttributeError("Could not handle language %s" % lang_type)

 date_range_re = re.compile(r'([0-9]{4})\s*-\s*([0-9]{4})')
 def process_dates(dates):
     """Expand a comma-separated list of years and year ranges.

     Args:
         dates: string such as "2001-2003, 2005".

     Returns:
         A set of int years.  Ranges are inclusive; pieces that are neither
         a range nor an integer are ignored.
     """
     years = set()
     for piece in dates.split(','):
         piece = piece.strip()
         span = date_range_re.match(piece)
         if span is not None:
             first, last = (int(year) for year in span.groups())
             years.update(range(first, last + 1))
             continue

         try:
             years.add(int(piece))
         except ValueError:
             # Not a number (for example an empty piece): skip it.
             continue

     return years

 # Letters are spelled [A-Za-z]; the range [A-z] would also accept the
 # punctuation characters that sit between 'Z' and 'a' in ASCII.
 copyright_re = \
     re.compile(r'Copyright (\([cC]\)) ([-, 0-9]+)[\s*#/]*([A-Za-z-,. ]+)',
                re.DOTALL)

 authors_re = re.compile(r'^[\s*#/]*Authors:\s*([A-Za-z .]+)\s*$')
 more_authors_re = re.compile(r'^[\s*#/]*([A-Za-z .]+)\s*$')

 # Every owner string seen by get_data() so far.
 all_owners = set()
 def get_data(lang_type, lines):
     """Extract copyright notices from the comment blocks of one file.

     Args:
         lang_type: language name handed to find_copyright_block().
         lines: list of the file's lines.

     Returns:
         A list of (owner, dates, authors, start, end) tuples, one per
         comment block that contains a copyright notice.  dates is the set
         returned by process_dates(), authors is a list of strings, and
         start/end are the block's line indices (end may be pulled in once
         the author list stops).

     Side effects:
         Adds each owner to the module-level all_owners set.
     """
     data = []
     for start, end in find_copyright_block(lines, lang_type):
         joined = ''.join(lines[start:end+1])
         match = copyright_re.search(joined)
         if not match:
             continue

         _, dates, owner = match.groups()
         dates = dates.strip()
         owner = owner.strip()

         all_owners.add(owner)
         try:
             dates = process_dates(dates)
         except Exception:
             # Show what was being parsed before propagating the error.
             print(dates)
             print(owner)
             raise

         authors = []
         for i in range(start, end+1):
             line = lines[i]
             if not authors:
                 # Still looking for the "Authors:" line.
                 match = authors_re.search(line)
                 if match:
                     authors.append(match.group(1).strip())
             else:
                 match = more_authors_re.search(line)
                 if not match:
                     # The author list is over.  Move `end` to the first
                     # blank line, or to just before the first '//' line
                     # that carries text, whichever comes first.
                     for j in range(i, end+1):
                         line = lines[j].strip()
                         if not line:
                             end = j
                             break
                         if line.startswith('//'):
                             line = line[2:].lstrip()
                             if line:
                                 end = j - 1
                                 break
                     break
                 authors.append(match.group(1).strip())

         info = (owner, dates, authors, start, end)
         data.append(info)

     return data

 def datestr(dates):
     """Format years as a compact comma-separated list of ranges.

     Args:
         dates: iterable of int years, in any order.

     Returns:
         A string such as "2001-2003,2005", in which runs of consecutive
         years are collapsed to "first-last".  An empty input gives ''.
     """
     years = sorted(dates)
     if not years:
         # Nothing to format; the old pop(0) raised IndexError here.
         return ''

     ranges = []
     first = last = years[0]
     for year in years[1:]:
         if year == last + 1:
             last = year
             continue
         ranges.append((first, last))
         first = last = year
     ranges.append((first, last))

     return ','.join('%d' % lo if lo == hi else '%d-%d' % (lo, hi)
                     for lo, hi in ranges)

 # Lists every option accepted by the getopt string "ci:v" used by the
 # script entry point; any number of files or directories may be given.
 usage_str = "usage:\n%s [-c] [-i <ignore>] [-v] <file-or-directory>..."

 def usage(exitcode):
     """Print the usage message and optionally terminate.

     Args:
         exitcode: status passed to sys.exit(), or None to return normally
             after printing.
     """
     print(usage_str % sys.argv[0])
     if exitcode is not None:
         sys.exit(exitcode)

 # Script entry point: gather copyright notices from the files and
 # directories named on the command line and print one summary line per
 # owner, most frequent owner first.
 if __name__ == '__main__':
     import getopt

     show_counts = False
     # NOTE(review): values given with -i are collected here but are not
     # consulted anywhere in this block.
     ignore = set()
     verbose = False
     try:
         opts, args = getopt.getopt(sys.argv[1:], "ci:v")
     except getopt.GetoptError:
         usage(1)

     for o, a in opts:
         if o == '-c':
             show_counts = True
         elif o == '-i':
             ignore.add(a)
         elif o == '-v':
             verbose = True

     files = []

     for base in args:
         if os.path.isfile(base):
             files += [ (base, lang_type(base)) ]
         elif os.path.isdir(base):
             files += find_files(base)
         else:
             raise AttributeError("can't access '%s'" %  base)

     copyrights = {}
     counts = {}

     for filename, _lang in files:
         # file() does not exist in Python 3.  open() inside a with-block
         # also closes the handle; undecodable bytes are replaced rather
         # than aborting the whole run.
         with open(filename, 'r', errors='replace') as f:
             lines = f.readlines()
         if not lines:
             continue

         lines = [ line.rstrip('\r\n') for line in lines ]

         lt = lang_type(filename, lines[0])
         try:
             data = get_data(lt, lines)
         except Exception as e:
             # Best effort: a file that cannot be parsed is skipped.
             if verbose:
                 if len(e.args) == 1:
                     e.args = ('%s (%s)' % (e, filename), )
                 print("could not parse %s: %s" % (filename, e))
             continue

         for owner, dates, authors, start, end in data:
             copyrights.setdefault(owner, set()).update(dates)
             counts[owner] = counts.get(owner, 0) + 1

     info = [ (counts[o], d, o) for o, d in copyrights.items() ]

     # Order by (count, owner) only.  Comparing the date sets on a tie would
     # use subset comparison, which is not a total order.
     ranked = sorted(info, key=lambda item: (item[0], item[2]), reverse=True)
     for count, dates, owner in ranked:
         if show_counts:
             owner = '%s (%s files)' % (owner, count)
         print('Copyright (c) %s %s' % (datestr(dates), owner))
	#!/usr/bin/env python3

	import os
	import re
	import sys

	from file_types import lang_type, find_files

	# Patterns used to recognise editor mode lines and comment delimiters.
	mode_line = re.compile(r'(-\*- *mode:.* *-\*-)')
	shell_comment = re.compile(r'^\s*#')
	lisp_comment = re.compile(r';')
	cpp_comment = re.compile(r'//')
	c_comment_start = re.compile(r'/\*')
	c_comment_end = re.compile(r'\*/')


	def _line_comment_blocks(lines, comment_re, skip_shebang):
	    """Yield (start, end) index pairs for leading blocks of line comments.

	    A line belongs to a block when comment_re matches it.  Scanning stops
	    at the first non-blank, non-comment line that is found while no block
	    is open.  If skip_shebang is true, a first line starting with '#!' is
	    ignored; a first line containing an editor mode line is always ignored.
	    """
	    start = None
	    for i, line in enumerate(lines):
	        if i == 0 and ((skip_shebang and line.startswith('#!'))
	                       or mode_line.search(line)):
	            continue

	        if comment_re.search(line):
	            if start is None:
	                start = i
	        elif start is not None:
	            # First non-comment line after a block closes that block.
	            yield start, i - 1
	            start = None
	        elif line.strip():
	            # Real content outside any comment block: the header is over.
	            return

	    # The file ended while a comment block was still open.
	    if start is not None:
	        yield start, len(lines) - 1


	def _c_comment_blocks(lines):
	    """Yield (start, end) index pairs for leading C and C++ comment blocks.

	    mode is None outside a comment, 'C' inside a /* ... */ comment and
	    'CPP' inside a run of // lines.  Blank lines do not end a // run, and a
	    // line whose text starts with "Copyright" splits the run in two.
	    """
	    start = None
	    mode = None
	    for i, line in enumerate(lines):
	        if i == 0 and mode_line.search(line):
	            continue

	        if mode == 'C':
	            assert start is not None, 'on line %d' % (i + 1)
	            if c_comment_end.search(line):
	                yield start, i
	                mode = None
	            continue

	        cpp_match = cpp_comment.search(line)
	        c_match = c_comment_start.search(line)

	        if cpp_match:
	            # A line holding both '//' and '/*' is not handled; callers see
	            # the AssertionError as a parse failure.
	            assert not c_match, 'on line %d' % (i + 1)
	            if line[:cpp_match.start()].strip():
	                # Code precedes the comment, so the header has ended.  Emit
	                # any // run that is still pending instead of dropping it.
	                if mode == 'CPP':
	                    yield start, i - 1
	                return
	            if mode is None:
	                mode = 'CPP'
	                start = i
	            elif line[cpp_match.end():].lstrip().startswith("Copyright"):
	                yield start, i - 1
	                start = i
	            continue

	        if mode == 'CPP':
	            assert start is not None, 'on line %d' % (i + 1)
	            if not line.strip():
	                continue
	            yield start, i - 1
	            mode = None
	            if not c_match:
	                return

	        if c_match:
	            assert mode is None, 'on line %d' % (i + 1)
	            start = i
	            if c_comment_end.search(line, c_match.end()):
	                # The comment opens and closes on this same line.
	                yield i, i
	            else:
	                mode = 'C'
	            continue

	        if line.strip():
	            return

	    # The file ended in the middle of a run of // comments.
	    if mode == 'CPP':
	        yield start, len(lines) - 1


	def find_copyright_block(lines, lang_type):
	    """Generate the comment blocks found at the top of a source file.

	    Args:
	        lines: list of the file's lines.
	        lang_type: language name selecting the comment syntax.  The
	            parameter shadows the imported lang_type() helper inside this
	            function.

	    Yields:
	        (start, end) tuples of 0-based, inclusive line indices, one per
	        comment block.

	    Raises:
	        AttributeError: if lang_type is not a supported language.  Because
	            this is a generator, the error surfaces on first iteration.
	    """
	    if lang_type in ('python', 'make', 'shell', 'perl', 'scons'):
	        yield from _line_comment_blocks(lines, shell_comment, True)
	    elif lang_type in ('lisp', ):
	        yield from _line_comment_blocks(lines, lisp_comment, False)
	    elif lang_type in ('C', 'C++', 'swig', 'isa', 'asm', 'slicc',
	                       'lex', 'yacc'):
	        yield from _c_comment_blocks(lines)
	    else:
	        raise AttributeError("Could not handle language %s" % lang_type)

	# The '*' quantifiers after \s allow any amount of space around the dash.
	date_range_re = re.compile(r'([0-9]{4})\s*-\s*([0-9]{4})')
	def process_dates(dates):
	    """Expand a comma-separated list of years and year ranges.

	    Args:
	        dates: string such as "2001-2003, 2005".

	    Returns:
	        A set of int years.  Ranges are inclusive; pieces that are neither
	        a range nor an integer are ignored.
	    """
	    years = set()
	    for piece in dates.split(','):
	        piece = piece.strip()
	        span = date_range_re.match(piece)
	        if span is not None:
	            first, last = (int(year) for year in span.groups())
	            years.update(range(first, last + 1))
	            continue

	        try:
	            years.add(int(piece))
	        except ValueError:
	            # Not a number (for example an empty piece): skip it.
	            continue

	    return years

	# Letters are spelled [A-Za-z]; the range [A-z] would also accept the
	# punctuation characters that sit between 'Z' and 'a' in ASCII.
	copyright_re = \
	    re.compile(r'Copyright (\([cC]\)) ([-, 0-9]+)[\s*#/]*([A-Za-z-,. ]+)',
	               re.DOTALL)

	authors_re = re.compile(r'^[\s*#/]*Authors:\s*([A-Za-z .]+)\s*$')
	more_authors_re = re.compile(r'^[\s*#/]*([A-Za-z .]+)\s*$')

	# Every owner string seen by get_data() so far.
	all_owners = set()
	def get_data(lang_type, lines):
	    """Extract copyright notices from the comment blocks of one file.

	    Args:
	        lang_type: language name handed to find_copyright_block().
	        lines: list of the file's lines.

	    Returns:
	        A list of (owner, dates, authors, start, end) tuples, one per
	        comment block that contains a copyright notice.  dates is the set
	        returned by process_dates(), authors is a list of strings, and
	        start/end are the block's line indices (end may be pulled in once
	        the author list stops).

	    Side effects:
	        Adds each owner to the module-level all_owners set.
	    """
	    data = []
	    for start, end in find_copyright_block(lines, lang_type):
	        joined = ''.join(lines[start:end+1])
	        match = copyright_re.search(joined)
	        if not match:
	            continue

	        _, dates, owner = match.groups()
	        dates = dates.strip()
	        owner = owner.strip()

	        all_owners.add(owner)
	        try:
	            dates = process_dates(dates)
	        except Exception:
	            # Show what was being parsed before propagating the error.
	            print(dates)
	            print(owner)
	            raise

	        authors = []
	        for i in range(start, end+1):
	            line = lines[i]
	            if not authors:
	                # Still looking for the "Authors:" line.
	                match = authors_re.search(line)
	                if match:
	                    authors.append(match.group(1).strip())
	            else:
	                match = more_authors_re.search(line)
	                if not match:
	                    # The author list is over.  Move `end` to the first
	                    # blank line, or to just before the first '//' line
	                    # that carries text, whichever comes first.
	                    for j in range(i, end+1):
	                        line = lines[j].strip()
	                        if not line:
	                            end = j
	                            break
	                        if line.startswith('//'):
	                            line = line[2:].lstrip()
	                            if line:
	                                end = j - 1
	                                break
	                    break
	                authors.append(match.group(1).strip())

	        info = (owner, dates, authors, start, end)
	        data.append(info)

	    return data

	def datestr(dates):
	    """Format years as a compact comma-separated list of ranges.

	    Args:
	        dates: iterable of int years, in any order.

	    Returns:
	        A string such as "2001-2003,2005", in which runs of consecutive
	        years are collapsed to "first-last".  An empty input gives ''.
	    """
	    years = sorted(dates)
	    if not years:
	        # Nothing to format; the old pop(0) raised IndexError here.
	        return ''

	    ranges = []
	    first = last = years[0]
	    for year in years[1:]:
	        if year == last + 1:
	            last = year
	            continue
	        ranges.append((first, last))
	        first = last = year
	    ranges.append((first, last))

	    return ','.join('%d' % lo if lo == hi else '%d-%d' % (lo, hi)
	                    for lo, hi in ranges)

	# Lists every option accepted by the getopt string "ci:v" used by the
	# script entry point; any number of files or directories may be given.
	usage_str = "usage:\n%s [-c] [-i <ignore>] [-v] <file-or-directory>..."

	def usage(exitcode):
	    """Print the usage message and optionally terminate.

	    Args:
	        exitcode: status passed to sys.exit(), or None to return normally
	            after printing.
	    """
	    print(usage_str % sys.argv[0])
	    if exitcode is not None:
	        sys.exit(exitcode)

	# Script entry point: gather copyright notices from the files and
	# directories named on the command line and print one summary line per
	# owner, most frequent owner first.
	if __name__ == '__main__':
	    import getopt

	    show_counts = False
	    # NOTE(review): values given with -i are collected here but are not
	    # consulted anywhere in this block.
	    ignore = set()
	    verbose = False
	    try:
	        opts, args = getopt.getopt(sys.argv[1:], "ci:v")
	    except getopt.GetoptError:
	        usage(1)

	    for o, a in opts:
	        if o == '-c':
	            show_counts = True
	        elif o == '-i':
	            ignore.add(a)
	        elif o == '-v':
	            verbose = True

	    files = []

	    for base in args:
	        if os.path.isfile(base):
	            files += [ (base, lang_type(base)) ]
	        elif os.path.isdir(base):
	            files += find_files(base)
	        else:
	            raise AttributeError("can't access '%s'" % base)

	    copyrights = {}
	    counts = {}

	    for filename, _lang in files:
	        # file() does not exist in Python 3.  open() inside a with-block
	        # also closes the handle; undecodable bytes are replaced rather
	        # than aborting the whole run.
	        with open(filename, 'r', errors='replace') as f:
	            lines = f.readlines()
	        if not lines:
	            continue

	        lines = [ line.rstrip('\r\n') for line in lines ]

	        lt = lang_type(filename, lines[0])
	        try:
	            data = get_data(lt, lines)
	        except Exception as e:
	            # Best effort: a file that cannot be parsed is skipped.
	            if verbose:
	                if len(e.args) == 1:
	                    e.args = ('%s (%s)' % (e, filename), )
	                print("could not parse %s: %s" % (filename, e))
	            continue

	        for owner, dates, authors, start, end in data:
	            copyrights.setdefault(owner, set()).update(dates)
	            counts[owner] = counts.get(owner, 0) + 1

	    info = [ (counts[o], d, o) for o, d in copyrights.items() ]

	    # Order by (count, owner) only.  Comparing the date sets on a tie would
	    # use subset comparison, which is not a total order.
	    ranked = sorted(info, key=lambda item: (item[0], item[2]), reverse=True)
	    for count, dates, owner in ranked:
	        if show_counts:
	            owner = '%s (%s files)' % (owner, count)
	        print('Copyright (c) %s %s' % (datestr(dates), owner))