find_copyrights.py revision 13540
1#!/usr/bin/env python2.7
2
3import os
4import re
5import sys
6
7from file_types import lang_type, find_files
8
# Emacs-style "-*- mode: ... -*-" marker; tolerated on the first line of a file.
mode_line = re.compile(r'(-\*- *mode:.* *-\*-)')
# A line whose first non-blank character opens a '#' comment.
shell_comment = re.compile(r'^\s*#')
# A line whose first non-blank character opens a ';' comment.  Anchored like
# shell_comment so that code carrying a trailing comment is not taken for
# header text.
lisp_comment = re.compile(r'^\s*;')
cpp_comment = re.compile(r'//')
c_comment_start = re.compile(r'/\*')
c_comment_end   = re.compile(r'\*/')


def _line_comment_blocks(lines, comment_re, skip_shebang):
    """Yield (start, end) for each run of consecutive comment lines.

    A line is a comment line when comment_re matches it.  Blank lines
    separate runs; the first non-blank, non-comment line ends the scan.
    """
    start = None
    for i, line in enumerate(lines):
        if i == 0 and ((skip_shebang and line.startswith('#!')) or
                       mode_line.search(line)):
            continue

        if comment_re.search(line):
            if start is None:
                start = i
            continue

        if start is not None:
            yield start, i - 1
            start = None

        # Anything that is neither a comment nor blank is code: the header
        # is over, whether or not a comment block ended on this very line.
        if line.strip():
            return

    # The file ended while still inside a comment run.
    if start is not None:
        yield start, len(lines) - 1


def _c_family_blocks(lines):
    """Yield (start, end) for the leading // and /* */ comment blocks."""
    start = None
    mode = None     # None: between comments, 'C': inside /* */, 'CPP': in //
    for i, line in enumerate(lines):
        if i == 0 and mode_line.search(line):
            continue

        if mode == 'C':
            assert start is not None, 'on line %d' % (i + 1)
            if c_comment_end.search(line):
                yield start, i
                mode = None
            continue

        cpp_match = cpp_comment.search(line)
        c_match = c_comment_start.search(line)

        if cpp_match:
            assert not c_match, 'on line %d' % (i + 1)
            if line[:cpp_match.start()].strip():
                # Code precedes the comment, so the header is over.  Hand
                # out the block that was being collected before stopping.
                if mode == 'CPP':
                    yield start, i - 1
                return
            if mode is None:
                mode = 'CPP'
                start = i
            else:
                # A new "Copyright" line splits one // run into two blocks.
                text = line[cpp_match.end():].lstrip()
                if text.startswith("Copyright"):
                    yield start, i - 1
                    start = i
            continue

        if mode == 'CPP':
            assert start is not None, 'on line %d' % (i + 1)
            # Blank lines do not end a // block.
            if not line.strip():
                continue
            yield start, i - 1
            mode = None
            if not c_match:
                return

        if c_match:
            assert mode is None, 'on line %d' % (i + 1)
            if c_comment_end.search(line, c_match.end()):
                # The comment opens and closes on this one line.
                yield i, i
            else:
                mode = 'C'
                start = i
            continue

        if line.strip():
            return

    # The file ended inside a // run.  An unterminated /* */ comment is
    # malformed and is deliberately not reported.
    if mode == 'CPP':
        yield start, len(lines) - 1


def find_copyright_block(lines, lang_type):
    """Generate the comment blocks that make up the header of a file.

    lines     -- the file's lines
    lang_type -- language name selecting the comment syntax: '#' for
                 python/make/shell/perl/scons, ';' for lisp, and '//' plus
                 '/* */' for C, C++, swig, isa, asm, slicc, lex and yacc

    Yields (start, end) pairs of 0-based, inclusive indices into lines.  A
    first line holding a mode line (or, for the '#' languages, a '#!' line)
    is skipped.  Scanning stops at the first line of code.

    Raises AttributeError, once iteration starts, for any other lang_type.
    """
    if lang_type in ('python', 'make', 'shell', 'perl', 'scons'):
        blocks = _line_comment_blocks(lines, shell_comment, True)
    elif lang_type in ('lisp', ):
        blocks = _line_comment_blocks(lines, lisp_comment, False)
    elif lang_type in ('C', 'C++', 'swig', 'isa', 'asm', 'slicc',
                       'lex', 'yacc'):
        blocks = _c_family_blocks(lines)
    else:
        raise AttributeError("Could not handle language %s" % lang_type)

    for block in blocks:
        yield block
97
date_range_re = re.compile(r'([0-9]{4})\s*-\s*([0-9]{4})')
def process_dates(dates):
    """Expand a comma-separated list of years and year ranges into a set.

    Each field is either a range of two four-digit years ("2004-2006",
    inclusive at both ends) or a single integer.  Fields that are neither
    are dropped without complaint.  Returns a set of ints.
    """
    years = set()
    for field in dates.split(','):
        field = field.strip()

        span = date_range_re.match(field)
        if span:
            first, last = int(span.group(1)), int(span.group(2))
            years.update(range(first, last + 1))
            continue

        try:
            years.add(int(field))
        except ValueError:
            # Not a year; copyright lines contain all sorts of text.
            pass

    return years
117
# "Copyright (c) <years> <owner>".  The owner may begin on the line after the
# years, so comment leaders between the two are skipped.  Letter classes are
# spelled A-Za-z: the range A-z would also admit [ \ ] ^ _ and backtick.
copyright_re = \
    re.compile(r'Copyright (\([cC]\)) ([-, 0-9]+)[\s*#/]*([A-Za-z,. -]+)',
               re.DOTALL)

# First line of an author list, and a continuation line holding only a name.
authors_re = re.compile(r'^[\s*#/]*Authors:\s*([A-Za-z .]+)\s*$')
more_authors_re = re.compile(r'^[\s*#/]*([A-Za-z .]+)\s*$')

# Every owner string seen by get_data, across all files.
all_owners = set()
def get_data(lang_type, lines):
    """Extract copyright information from the header comments of a file.

    lang_type -- language name, as understood by find_copyright_block
    lines     -- the file's lines, without line terminators

    Returns a list with one (owner, dates, authors, start, end) tuple per
    comment block that contains a copyright statement: owner is the owner
    text, dates a set of years, authors a list of names, and start/end the
    0-based line indices of the block.  Each owner is also added to the
    module-level all_owners set.
    """
    data = []
    for start, end in find_copyright_block(lines, lang_type):
        # Lines are joined without a separator; the comment leader of the
        # following line is what terminates the owner match.
        joined = ''.join(lines[start:end+1])
        match = copyright_re.search(joined)
        if not match:
            continue

        dates = match.group(2).strip()
        owner = match.group(3).strip()

        all_owners.add(owner)
        try:
            dates = process_dates(dates)
        except Exception:
            # Show what was being parsed, then let the error propagate.
            print(dates)
            print(owner)
            raise

        authors = []
        for i in range(start, end + 1):
            line = lines[i]
            if not authors:
                # Still looking for the "Authors:" line.
                match = authors_re.search(line)
                if match:
                    authors.append(match.group(1).strip())
                continue

            match = more_authors_re.search(line)
            if match:
                authors.append(match.group(1).strip())
                continue

            # The author list is over.  Pull the recorded end of the block
            # back to the first blank line, or to just before the first
            # '//' line that carries text, whichever comes first.
            for j in range(i, end + 1):
                text = lines[j].strip()
                if not text:
                    end = j
                    break
                if text.startswith('//') and text[2:].lstrip():
                    end = j - 1
                    break
            break

        info = (owner, dates, authors, start, end)
        data.append(info)

    return data
174
def datestr(dates):
    """Render a collection of years compactly, e.g. '2001-2003,2005'.

    dates -- any iterable of ints; order and duplicates do not matter

    Consecutive years are collapsed into 'first-last' ranges and the pieces
    are joined with commas.  An empty collection gives an empty string.
    """
    years = sorted(set(dates))
    if not years:
        # Nothing parseable was found for this owner.
        return ''

    def span(first, last):
        if first == last:
            return '%d' % first
        return '%d-%d' % (first, last)

    output = []
    first = last = years[0]
    for year in years[1:]:
        if year == last + 1:
            last = year
            continue
        output.append(span(first, last))
        first = last = year
    output.append(span(first, last))

    return ','.join(output)
200
# Lists every option the command-line parser accepts.
usage_str = """usage:
%s [-c] [-i <arg>] [-v] <file-or-directory>...
    -c        append the number of files to each owner
    -i <arg>  accepted, but currently has no effect
    -v        report files that could not be parsed"""

def usage(exitcode):
    """Print the usage message; exit with exitcode unless it is None."""
    print(usage_str % sys.argv[0])
    if exitcode is not None:
        sys.exit(exitcode)
208
# Command-line driver: collect the copyright owners of the given files and
# directories and print one summary line per owner.
if __name__ == '__main__':
    import getopt

    show_counts = False
    # NOTE(review): values given with -i are collected here, but nothing
    # below reads them -- confirm what the option was meant to filter.
    ignore = set()
    verbose = False
    try:
        opts, args = getopt.getopt(sys.argv[1:], "ci:v")
    except getopt.GetoptError:
        usage(1)

    for o, a in opts:
        if o == '-c':
            show_counts = True
        elif o == '-i':
            ignore.add(a)
        elif o == '-v':
            verbose = True

    # (filename, language) pairs to examine.
    files = []
    for base in args:
        if os.path.isfile(base):
            files.append((base, lang_type(base)))
        elif os.path.isdir(base):
            files += find_files(base)
        else:
            raise AttributeError("can't access '%s'" % base)

    copyrights = {}     # owner -> set of years
    counts = {}         # owner -> number of copyright blocks seen

    for filename, _lang in files:
        # Close the handle promptly; a large tree means many files.
        with open(filename, 'r') as f:
            lines = f.readlines()
        if not lines:
            continue

        lines = [line.rstrip('\r\n') for line in lines]

        # The language is worked out again here, this time with the first
        # line of the file available to lang_type.
        lt = lang_type(filename, lines[0])
        try:
            data = get_data(lt, lines)
        except Exception as e:
            # Best effort: a file that cannot be parsed is skipped.
            if verbose:
                if len(e.args) == 1:
                    e.args = ('%s (%s)' % (e, filename), )
                print("could not parse %s: %s" % (filename, e))
            continue

        for owner, dates, authors, start, end in data:
            copyrights.setdefault(owner, set()).update(dates)
            counts[owner] = counts.get(owner, 0) + 1

    # Most frequent owner first.  Ties are broken on the owner name so the
    # order is reproducible; sets of years have no total order to sort by.
    ranked = sorted(copyrights, key=lambda o: (counts[o], o), reverse=True)

    for owner in ranked:
        label = owner
        if show_counts:
            label = '%s (%s files)' % (owner, counts[owner])
        print('Copyright (c) %s %s' % (datestr(copyrights[owner]), label))
274