find_copyrights.py revision 8333
12650Ssaidi@eecs.umich.edu#!/usr/bin/env python
22650Ssaidi@eecs.umich.edu
32650Ssaidi@eecs.umich.eduimport os
42650Ssaidi@eecs.umich.eduimport re
52650Ssaidi@eecs.umich.eduimport sys
62650Ssaidi@eecs.umich.edu
72650Ssaidi@eecs.umich.edufrom file_types import lang_type, find_files
82650Ssaidi@eecs.umich.edu
# Patterns used only by find_copyright_block() to classify source lines.
# An editor mode line such as "-*- mode:python -*-".
mode_line = re.compile(r'(-\*- *mode:.* *-\*-)')
# A line whose first non-blank character is '#'.
shell_comment = re.compile(r'^\s*#')
# NOTE(review): unlike shell_comment this matches a ';' anywhere on the
# line, so a code line with a trailing ';' comment counts as a comment line.
lisp_comment = re.compile(r';')
cpp_comment = re.compile(r'//')
c_comment_start = re.compile(r'/\*')
c_comment_end   = re.compile(r'\*/')

def _find_line_comment_blocks(lines, comment_re, allow_shebang):
    """Yield (start, end) index pairs for runs of line-comment lines.

    lines         -- sequence of source lines
    comment_re    -- compiled pattern that identifies a comment line
    allow_shebang -- if true, a leading '#!' line is skipped like a mode line

    Scanning stops at the first non-blank, non-comment line that is not the
    line that terminated a comment run.
    """
    start = None
    for i, line in enumerate(lines):
        if i == 0 and ((allow_shebang and line.startswith('#!')) or
                       mode_line.search(line)):
            continue

        if comment_re.search(line):
            if start is None:
                start = i
        elif start is None:
            # Real content outside of any comment run: the header is over.
            if line.strip():
                return
        else:
            yield start, i - 1
            start = None

    # A comment run that extends to the last line has no terminating line,
    # so report it here instead of silently dropping it.
    if start is not None:
        yield start, i

def _find_c_comment_blocks(lines):
    """Yield (start, end) index pairs for leading C and C++ comment blocks.

    A '/* ... */' comment is one block.  A run of '//' lines is one block,
    except that a '//' line whose text starts with "Copyright" begins a new
    block.  Scanning stops at the first line of real code.

    Raises AssertionError (via assert) for a line that contains both '//'
    and '/*' outside of a C comment.
    """
    start = None
    mode = None
    for i, line in enumerate(lines):
        if i == 0 and mode_line.search(line):
            continue

        if mode == 'C':
            # Inside /* ... */: only the terminator matters.
            assert start is not None, 'on line %d' % (i + 1)
            if c_comment_end.search(line):
                yield start, i
                mode = None
            continue

        cpp_match = cpp_comment.search(line)
        c_match = c_comment_start.search(line)

        if cpp_match:
            assert not c_match, 'on line %d' % (i + 1)
            # Code in front of the '//' means this is not a header comment.
            if line[:cpp_match.start()].strip():
                return
            if mode is None:
                mode = 'CPP'
                start = i
            else:
                text = line[cpp_match.end():].lstrip()
                if text.startswith("Copyright"):
                    # A new copyright notice starts a new block.
                    yield start, i - 1
                    start = i
            continue
        elif mode == 'CPP':
            assert start is not None, 'on line %d' % (i + 1)
            # Blank lines do not end a run of '//' comments.
            if not line.strip():
                continue
            yield start, i - 1
            mode = None
            if not c_match:
                return

        if c_match:
            assert mode is None, 'on line %d' % (i + 1)
            # A comment opened and closed on the same line is a complete
            # block; do not enter C mode or the following code would be
            # swallowed while looking for a later '*/'.
            if c_comment_end.search(line, c_match.end()):
                yield i, i
                continue
            mode = 'C'
            start = i

        if mode is None and line.strip():
            return

    # A run of '//' comments that extends to the last line.  An unterminated
    # '/*' comment is malformed and is deliberately not reported.
    if mode == 'CPP':
        yield start, i

def find_copyright_block(lines, lang_type):
    """Generate (start, end) line-index pairs of leading comment blocks.

    lines     -- sequence of source lines
    lang_type -- language name selecting the comment syntax to look for

    Both indices are inclusive, zero-based positions in lines.  Only the
    comment blocks at the top of the file are reported; scanning stops at
    the first line of real content.

    Raises AttributeError for an unsupported lang_type.  Because this is a
    generator, the error surfaces when iteration starts.
    """
    if lang_type in ('python', 'make', 'shell', 'perl', 'scons'):
        blocks = _find_line_comment_blocks(lines, shell_comment, True)
    elif lang_type in ('lisp', ):
        blocks = _find_line_comment_blocks(lines, lisp_comment, False)
    elif lang_type in ('C', 'C++', 'swig', 'isa', 'asm', 'slicc',
                       'lex', 'yacc'):
        blocks = _find_c_comment_blocks(lines)
    else:
        raise AttributeError("Could not handle language %s" % lang_type)

    for block in blocks:
        yield block
973804Ssaidi@eecs.umich.edu
# A four-digit year range such as "2002-2005" (whitespace allowed around '-').
date_range_re = re.compile(r'([0-9]{4})\s*-\s*([0-9]{4})')
def process_dates(dates):
    """Expand a comma-separated list of years and year ranges into a set.

    dates -- string such as "2002-2005, 2007"

    Returns a set of ints containing every single year and every year of
    each inclusive range.  Entries that are neither a range nor an integer
    are ignored.
    """
    output = set()
    for date in dates.split(','):
        date = date.strip()
        match = date_range_re.match(date)
        if match:
            first, last = [ int(year) for year in match.groups() ]
            output.update(range(first, last + 1))
            continue

        try:
            output.add(int(date))
        except ValueError:
            # Best effort: the caller's pattern can hand over fragments that
            # hold no year (e.g. an empty entry after a trailing comma).
            pass

    return output
1173804Ssaidi@eecs.umich.edu
# "Copyright (c) <dates> <owner>".  Groups: the "(c)" marker, the raw date
# list, and the owner name.  Comment decoration ('*', '#', '/') between the
# dates and the owner is skipped.  The letter ranges are spelled A-Za-z
# because "A-z" would also admit the punctuation between 'Z' and 'a'.
# NOTE(review): re.DOTALL has no effect here since the pattern contains no
# '.' outside of a character class; it is kept unchanged.
copyright_re = \
    re.compile(r'Copyright (\([cC]\)) ([-, 0-9]+)[\s*#/]*([A-Za-z,. -]+)',
               re.DOTALL)

# First line of an author list: optional comment decoration, "Authors:",
# then a name made of letters, spaces and dots.
authors_re = re.compile(r'^[\s*#/]*Authors:\s*([A-Za-z .]+)\s*$')
# Continuation line of an author list: comment decoration and a name only.
more_authors_re = re.compile(r'^[\s*#/]*([A-Za-z .]+)\s*$')

# Every copyright owner name seen by get_data().
all_owners = set()
def get_data(lang_type, lines):
    """Collect copyright information from the leading comment blocks.

    lang_type -- language name passed on to find_copyright_block()
    lines     -- list of source lines

    Returns a list of (owner, dates, authors, start, end) tuples, one per
    comment block that contains a copyright notice: the owner name, the set
    of years from process_dates(), the author names found in the block, and
    the inclusive line-index range of the notice.  Each owner is also
    recorded in the module-level all_owners set.

    Propagates whatever find_copyright_block() and process_dates() raise.
    """
    data = []
    for start,end in find_copyright_block(lines, lang_type):
        joined = ''.join(lines[start:end+1])
        match = copyright_re.search(joined)
        if not match:
            continue

        _marker,dates,owner = match.groups()
        dates = dates.strip()
        owner = owner.strip()

        all_owners.add(owner)
        try:
            dates = process_dates(dates)
        except Exception:
            # Show which notice could not be processed, then let the
            # caller deal with the error.
            print(dates)
            print(owner)
            raise

        authors = []
        for i in range(start, end+1):
            line = lines[i]
            if not authors:
                # Still looking for the "Authors:" line.
                match = authors_re.search(line)
                if match:
                    authors.append(match.group(1).strip())
                continue

            match = more_authors_re.search(line)
            if match:
                authors.append(match.group(1).strip())
                continue

            # The author list is over.  Pull `end` back to the first blank
            # line, or to just before the first '//' line that has text.
            for j in range(i, end+1):
                text = lines[j].strip()
                if not text:
                    end = j
                    break
                if text.startswith('//'):
                    text = text[2:].lstrip()
                    if text:
                        end = j - 1
                        break
            break

        info = (owner, dates, authors, start, end)
        data.append(info)

    return data
1744070Ssaidi@eecs.umich.edu
def datestr(dates):
    """Format a collection of years as a compact comma-separated string.

    dates -- iterable of ints (e.g. the set built by process_dates())

    Consecutive years are collapsed into "first-last" ranges, so
    {2001, 2002, 2003, 2005} becomes "2001-2003,2005".  An empty collection
    gives an empty string.
    """
    years = sorted(dates)
    if not years:
        # Nothing to format; avoids indexing into an empty list below.
        return ''

    output = []
    def add_output(first, second):
        if first == second:
            output.append('%d' % (first))
        else:
            output.append('%d-%d' % (first, second))

    first = second = years[0]
    for year in years[1:]:
        if year == second + 1:
            # Extends the current run of consecutive years.
            second = year
        else:
            add_output(first, second)
            first = second = year

    add_output(first, second)

    return ','.join(output)
200
# Lists every option the command-line parser accepts ("ci:v") and the fact
# that both files and directories may be given.
usage_str = """usage:
%s [-c] [-i <ignore>] [-v] <file-or-directory>..."""

def usage(exitcode):
    """Print the usage message and optionally exit.

    exitcode -- status passed to sys.exit(); if None, return to the caller
                instead of exiting.
    """
    print(usage_str % sys.argv[0])
    if exitcode is not None:
        sys.exit(exitcode)
208
if __name__ == '__main__':
    # Scan the files and directories named on the command line and print
    # one merged "Copyright (c) <years> <owner>" line per copyright owner.
    import getopt

    show_counts = False
    # NOTE(review): values given with -i are collected here but nothing in
    # this script consults them afterwards.
    ignore = set()
    verbose = False
    try:
        opts, args = getopt.getopt(sys.argv[1:], "ci:v")
    except getopt.GetoptError:
        usage(1)

    for o,a in opts:
        if o == '-c':
            show_counts = True
        if o == '-i':
            ignore.add(a)
        if o == '-v':
            verbose = True

    files = []

    for base in args:
        if os.path.isfile(base):
            files += [ (base, lang_type(base)) ]
        elif os.path.isdir(base):
            files += find_files(base)
        else:
            raise AttributeError("can't access '%s'" % base)

    copyrights = {}
    counts = {}

    for filename, lang in files:
        # Close the file even if reading it fails.
        f = open(filename, 'r')
        try:
            lines = f.readlines()
        finally:
            f.close()
        if not lines:
            continue

        lines = [ line.rstrip('\r\n') for line in lines ]

        # The language is determined again here, this time with the first
        # line of the file available to lang_type().
        lt = lang_type(filename, lines[0])
        try:
            data = get_data(lt, lines)
        except Exception as e:
            # A file that cannot be parsed is skipped, not fatal.
            if verbose:
                if len(e.args) == 1:
                    e.args = ('%s (%s)' % (e, filename), )
                print("could not parse %s: %s" % (filename, e))
            continue

        for owner, dates, authors, start, end in data:
            if owner not in copyrights:
                copyrights[owner] = set()
            if owner not in counts:
                counts[owner] = 0

            copyrights[owner] |= dates
            counts[owner] += 1

    info = [ (counts[o], d, o) for o,d in copyrights.items() ]

    # Most frequent owner first.  Ties are broken by owner name; the date
    # sets are kept out of the key because sets only define a partial order.
    ranked = sorted(info, key=lambda entry: (entry[0], entry[2]),
                    reverse=True)
    for count,dates,owner in ranked:
        if show_counts:
            owner = '%s (%s files)' % (owner, count)
        print('Copyright (c) %s %s' % (datestr(dates), owner))
274