#!/usr/bin/env python2.7
# find_copyrights.py revision 13540
"""Collect the copyright notices found in the leading comments of source files.

For every file or directory named on the command line, the leading comment
blocks are scanned for ``Copyright (c) <years> <owner>`` notices.  The years
seen for each owner are merged and one summary line per owner is printed.

The syntax used here is valid on both Python 2.7 and Python 3.
"""

import os
import re
import sys

mode_line = re.compile(r'(-\*- *mode:.* *-\*-)')
shell_comment = re.compile(r'^\s*#')
lisp_comment = re.compile(r';')
cpp_comment = re.compile(r'//')
c_comment_start = re.compile(r'/\*')
c_comment_end = re.compile(r'\*/')


def _line_comment_blocks(lines, comment_re, skip_shebang):
    """Yield (start, end) for leading blocks of single-line comments.

    A block is a run of consecutive lines matching ``comment_re``.  Blank
    lines between blocks are skipped; the first non-blank, non-comment line
    found outside a block ends the scan.
    """
    start = None
    for i, line in enumerate(lines):
        if i == 0:
            if skip_shebang and line.startswith('#!'):
                continue
            if mode_line.search(line):
                continue

        if comment_re.search(line):
            if start is None:
                start = i
        elif start is None:
            if line.strip():
                return
        else:
            yield start, i - 1
            start = None

    # The file ended while still inside a comment block; report it instead
    # of dropping it.
    if start is not None:
        yield start, len(lines) - 1


def _c_comment_blocks(lines):
    """Yield (start, end) for leading ``/* */`` and ``//`` comment blocks."""
    mode = None
    start = None
    for i, line in enumerate(lines):
        if i == 0 and mode_line.search(line):
            continue

        if mode == 'C':
            # Inside a /* */ comment: only look for its terminator.
            assert start is not None, 'on line %d' % (i + 1)
            if c_comment_end.search(line):
                yield start, i
                mode = None
            continue

        cpp_match = cpp_comment.search(line)
        c_match = c_comment_start.search(line)

        if cpp_match:
            # Raised explicitly (not via assert) so that the check is still
            # performed when Python runs with -O.
            if c_match:
                raise AssertionError('on line %d' % (i + 1))
            if line[:cpp_match.start()].strip():
                # Code precedes the //, so the header comments are over.
                return
            if mode is None:
                mode = 'CPP'
                start = i
            else:
                # A new "Copyright" line inside a // run starts a new block.
                text = line[cpp_match.end():].lstrip()
                if text.startswith("Copyright"):
                    yield start, i - 1
                    start = i
            continue
        elif mode == 'CPP':
            assert start is not None, 'on line %d' % (i + 1)
            if not line.strip():
                # Blank lines do not terminate a // block.
                continue
            yield start, i - 1
            mode = None
            if not c_match:
                return

        if c_match:
            assert mode is None, 'on line %d' % (i + 1)
            if c_comment_end.search(line, c_match.end()):
                # The comment opens and closes on this one line.
                yield i, i
                continue
            mode = 'C'
            start = i

        if mode is None and line.strip():
            return

    # A // block that runs to the end of the file is still a block.  An
    # unterminated /* comment is not reported.
    if mode == 'CPP':
        yield start, len(lines) - 1


def find_copyright_block(lines, lang_type):
    """Yield (start, end) index pairs of the leading comment blocks in lines.

    lines     -- list of the file's lines, without line terminators
    lang_type -- language name selecting the comment syntax to look for

    Both indices are 0-based and inclusive.  AttributeError is raised, once
    iteration starts, for a language that is not handled.
    """
    if lang_type in ('python', 'make', 'shell', 'perl', 'scons'):
        blocks = _line_comment_blocks(lines, shell_comment, True)
    elif lang_type in ('lisp', ):
        blocks = _line_comment_blocks(lines, lisp_comment, False)
    elif lang_type in ('C', 'C++', 'swig', 'isa', 'asm', 'slicc',
                       'lex', 'yacc'):
        blocks = _c_comment_blocks(lines)
    else:
        raise AttributeError("Could not handle language %s" % lang_type)

    for block in blocks:
        yield block


date_range_re = re.compile(r'([0-9]{4})\s*-\s*([0-9]{4})')


def process_dates(dates):
    """Expand a string such as '2004-2006, 2008' into a set of int years."""
    output = set()
    for date in dates.split(','):
        date = date.strip()
        match = date_range_re.match(date)
        if match:
            first, last = [int(d) for d in match.groups()]
            output.update(range(first, last + 1))
            continue
        try:
            output.add(int(date))
        except ValueError:
            # Deliberately best-effort: tokens that are not a year are
            # skipped.
            pass

    return output


# The letter ranges are spelled A-Za-z; a single A-z range would also accept
# the punctuation characters that sit between 'Z' and 'a' in ASCII.
copyright_re = \
    re.compile(r'Copyright (\([cC]\)) ([-, 0-9]+)[\s*#/]*([-A-Za-z,. ]+)',
               re.DOTALL)

authors_re = re.compile(r'^[\s*#/]*Authors:\s*([A-Za-z .]+)\s*$')
more_authors_re = re.compile(r'^[\s*#/]*([A-Za-z .]+)\s*$')

all_owners = set()


def get_data(lang_type, lines):
    """Return the copyright notices found in the leading comments of lines.

    The result is a list of (owner, dates, authors, start, end) tuples, one
    per comment block containing a copyright notice: owner is a string,
    dates a set of int years, authors a list of strings, and start/end are
    0-based line indices.  Every owner seen is also added to the
    module-level all_owners set.
    """
    data = []
    for start, end in find_copyright_block(lines, lang_type):
        joined = ''.join(lines[start:end + 1])
        match = copyright_re.search(joined)
        if not match:
            continue

        _, dates, owner = match.groups()
        dates = dates.strip()
        owner = owner.strip()

        all_owners.add(owner)
        try:
            dates = process_dates(dates)
        except Exception:
            # Show what was being parsed, then let the error propagate.
            print(dates)
            print(owner)
            raise

        authors = []
        for i in range(start, end + 1):
            line = lines[i]
            if not authors:
                match = authors_re.search(line)
                if match:
                    authors.append(match.group(1).strip())
                continue

            match = more_authors_re.search(line)
            if match:
                authors.append(match.group(1).strip())
                continue

            # The author list is over; look for where the block really ends.
            # NOTE(review): the nesting of the 'if line' test under the '//'
            # test was reconstructed from whitespace-collapsed source --
            # confirm against the original file.
            for j in range(i, end + 1):
                line = lines[j].strip()
                if not line:
                    end = j
                    break
                if line.startswith('//'):
                    line = line[2:].lstrip()
                    if line:
                        end = j - 1
                        break
            break

        data.append((owner, dates, authors, start, end))

    return data


def datestr(dates):
    """Format years compactly, e.g. {2004, 2005, 2006, 2008} -> '2004-2006,2008'.

    Consecutive years are collapsed into 'first-last' ranges.  An empty
    collection gives an empty string.
    """
    years = sorted(dates)
    if not years:
        return ''

    spans = []
    first = last = years[0]
    for year in years[1:]:
        if year == last + 1:
            last = year
            continue
        spans.append((first, last))
        first = last = year
    spans.append((first, last))

    output = []
    for first, last in spans:
        if first == last:
            output.append('%d' % first)
        else:
            output.append('%d-%d' % (first, last))

    return ','.join(output)


usage_str = """usage:
%s [-v] <directory>"""


def usage(exitcode):
    """Print the usage message and exit unless exitcode is None."""
    print(usage_str % sys.argv[0])
    if exitcode is not None:
        sys.exit(exitcode)


def main(argv=None):
    """Scan the files named in argv and print one summary line per owner.

    Options: -c appends the number of notices found to each owner, -v
    reports files that could not be parsed, and -i <value> is accepted but
    its values are not consulted anywhere in this file.
    """
    import getopt

    # Only the command-line driver needs the project's file_types module, so
    # it is imported here; the parsing helpers above work without it.
    from file_types import lang_type, find_files

    if argv is None:
        argv = sys.argv[1:]

    show_counts = False
    ignore = set()
    verbose = False
    try:
        opts, args = getopt.getopt(argv, "ci:v")
    except getopt.GetoptError:
        usage(1)

    for o, a in opts:
        if o == '-c':
            show_counts = True
        if o == '-i':
            ignore.add(a)
        if o == '-v':
            verbose = True

    files = []
    for base in args:
        if os.path.isfile(base):
            files.append((base, lang_type(base)))
        elif os.path.isdir(base):
            # find_files is assumed to produce (filename, language) pairs;
            # the loop below unpacks two values per entry.
            files += find_files(base)
        else:
            raise AttributeError("can't access '%s'" % base)

    copyrights = {}
    counts = {}

    for filename, _lang in files:
        # The context manager closes each file as soon as it has been read.
        with open(filename, 'r') as f:
            lines = [line.rstrip('\r\n') for line in f]
        if not lines:
            continue

        lt = lang_type(filename, lines[0])
        try:
            data = get_data(lt, lines)
        except Exception as e:
            # Best-effort scan: a file that cannot be parsed is skipped.
            if verbose:
                if len(e.args) == 1:
                    e.args = ('%s (%s)' % (e, filename), )
                print("could not parse %s: %s" % (filename, e))
            continue

        for owner, dates, _authors, _start, _end in data:
            copyrights.setdefault(owner, set()).update(dates)
            counts[owner] = counts.get(owner, 0) + 1

    # Order by (count, owner) only; the sets of years have no total order
    # and must not take part in the comparison.
    info = [(counts[o], o, d) for o, d in copyrights.items()]
    for count, owner, dates in sorted(info, key=lambda entry: entry[:2],
                                      reverse=True):
        if show_counts:
            owner = '%s (%s files)' % (owner, count)
        print('Copyright (c) %s %s' % (datestr(dates), owner))


if __name__ == '__main__':
    main()