mkdoc.py revision 14299:2fbea9df56d2
1#!/usr/bin/env python3
2#
3#  Syntax: mkdoc.py [-I<path> ..] [.. a list of header files ..]
4#
5#  Extract documentation from C++ header files to use it in Python bindings
6#
7
8import os
9import sys
10import platform
11import re
12import textwrap
13
14from clang import cindex
15from clang.cindex import CursorKind
16from collections import OrderedDict
17from glob import glob
18from threading import Thread, Semaphore
19from multiprocessing import cpu_count
20
# Cursor kinds whose children ``extract`` descends into.
RECURSE_LIST = [
    CursorKind.TRANSLATION_UNIT,
    CursorKind.NAMESPACE,
    CursorKind.CLASS_DECL,
    CursorKind.STRUCT_DECL,
    CursorKind.ENUM_DECL,
    CursorKind.CLASS_TEMPLATE
]

# Cursor kinds for which ``extract`` records a (name, filename, comment)
# entry in its output list.
PRINT_LIST = [
    CursorKind.CLASS_DECL,
    CursorKind.STRUCT_DECL,
    CursorKind.ENUM_DECL,
    CursorKind.ENUM_CONSTANT_DECL,
    CursorKind.CLASS_TEMPLATE,
    CursorKind.FUNCTION_DECL,
    CursorKind.FUNCTION_TEMPLATE,
    CursorKind.CONVERSION_FUNCTION,
    CursorKind.CXX_METHOD,
    CursorKind.CONSTRUCTOR,
    CursorKind.FIELD_DECL
]

# Cursor kinds that ``extract`` recurses into without appending their
# spelling to the name prefix handed down to the children.
PREFIX_BLACKLIST = [
    CursorKind.TRANSLATION_UNIT
]
47
# Maps a C++ operator token to the word that replaces it in a generated
# identifier (``operator<=`` becomes ``operator_le``).  The entries are
# ordered longest token first so that ``sanitize_name`` substitutes
# ``<<=`` before ``<<`` and ``<<`` before ``<``.  The sort is stable, so
# tokens of equal length keep the order in which they are listed here.
CPP_OPERATORS = OrderedDict(sorted(
    [
        ('<=', 'le'),
        ('>=', 'ge'),
        ('==', 'eq'),
        ('!=', 'ne'),
        ('[]', 'array'),
        ('+=', 'iadd'),
        ('-=', 'isub'),
        ('*=', 'imul'),
        ('/=', 'idiv'),
        ('%=', 'imod'),
        ('&=', 'iand'),
        ('|=', 'ior'),
        ('^=', 'ixor'),
        ('<<=', 'ilshift'),
        ('>>=', 'irshift'),
        ('++', 'inc'),
        ('--', 'dec'),
        ('<<', 'lshift'),
        ('>>', 'rshift'),
        ('&&', 'land'),
        ('||', 'lor'),
        ('!', 'lnot'),
        ('~', 'bnot'),
        ('&', 'band'),
        ('|', 'bor'),
        ('+', 'add'),
        ('-', 'sub'),
        ('*', 'mul'),
        ('/', 'div'),
        ('%', 'mod'),
        ('<', 'lt'),
        ('>', 'gt'),
        ('=', 'assign'),
        ('()', 'call'),
    ],
    key=lambda pair: len(pair[0]),
    reverse=True,
))
60
# Upper bound on concurrently running extraction threads: the CPU count.
job_count = cpu_count()
# Counting semaphore holding ``job_count`` permits.  ExtractionThread takes
# one permit when it is constructed and returns it when run() finishes.
job_semaphore = Semaphore(job_count)
63
64
class NoFilenamesError(ValueError):
    """Raised by ``read_args`` when the arguments name no input file."""
67
68
def d(s):
    """Return ``s`` as text.

    A ``str`` is returned unchanged; anything else is decoded as UTF-8
    through its ``decode`` method.
    """
    if isinstance(s, str):
        return s
    return s.decode('utf8')
71
72
def sanitize_name(name):
    """Turn a C++ name into the identifier of its docstring variable.

    Steps, in order: ``type-parameter-0-N`` becomes ``TN``; every
    ``operator<token>`` listed in ``CPP_OPERATORS`` becomes
    ``operator_<word>``; anything between the first ``<`` and the last
    ``>`` on a line is dropped; each non-alphanumeric character becomes
    ``_``; runs of ``_`` are collapsed and one trailing ``_`` is removed.
    The result is prefixed with ``__doc_``.
    """
    ident = re.sub(r'type-parameter-0-([0-9]+)', r'T\1', name)
    for token, word in CPP_OPERATORS.items():
        ident = ident.replace('operator%s' % token, 'operator_%s' % word)
    ident = re.sub('<.*>', '', ident)
    ident = ''.join('_' if not c.isalnum() else c for c in ident)
    ident = re.sub('_+', '_', ident)
    ident = re.sub('_$', '', ident)
    return '__doc_' + ident
81
82
def _strip_comment_markers(line):
    """Return ``line`` without C++ comment delimiters and leading ``*``."""
    line = line.strip()
    if len(line) >= 4 and line.startswith('/*') and line.endswith('*/'):
        # One-line block comment (``/** text */``): remove both delimiters,
        # otherwise the closing ``*/`` would end up in the docstring.
        line = line[2:-2].strip('*')
    elif line.startswith('/*'):
        line = line[2:].lstrip('*')
    elif line.endswith('*/'):
        line = line[:-2].rstrip('*')
    elif line.startswith('///'):
        line = line[3:]
    if line.startswith('*'):
        line = line[1:]
    return line


def _strip_comment_syntax(comment):
    """Remove comment markers and the indentation shared by all lines."""
    lines = [_strip_comment_markers(raw)
             for raw in comment.expandtabs(tabsize=4).splitlines()]
    # Only non-empty lines take part in computing the common margin.
    indents = [len(text) - len(text.lstrip()) for text in lines if text]
    margin = min(indents) if indents else 0
    return ''.join(text[margin:] + '\n' for text in lines)


def _translate_markup(s):
    """Rewrite Doxygen commands and HTML/TeX tags into plain-text markup.

    Section headings are emitted as paragraphs starting with ``$``; the
    re-flow step recognises that marker and indents the following text.
    """
    cpp_group = r'([\w:]+)'
    param_group = r'([\[\w:\]]+)'

    s = re.sub(r'\\c\s+%s' % cpp_group, r'``\1``', s)
    s = re.sub(r'\\a\s+%s' % cpp_group, r'*\1*', s)
    s = re.sub(r'\\e\s+%s' % cpp_group, r'*\1*', s)
    s = re.sub(r'\\em\s+%s' % cpp_group, r'*\1*', s)
    s = re.sub(r'\\b\s+%s' % cpp_group, r'**\1**', s)
    s = re.sub(r'\\ingroup\s+%s' % cpp_group, r'', s)
    s = re.sub(r'\\param%s?\s+%s' % (param_group, cpp_group),
               r'\n\n$Parameter ``\2``:\n\n', s)
    s = re.sub(r'\\tparam%s?\s+%s' % (param_group, cpp_group),
               r'\n\n$Template parameter ``\2``:\n\n', s)

    headings = (
        ('returns', 'Returns'),
        ('return', 'Returns'),
        ('authors', 'Authors'),
        ('author', 'Author'),
        ('copyright', 'Copyright'),
        ('date', 'Date'),
        ('remarks', 'Remarks'),
        ('remark', 'Remark'),
        ('sa', 'See also'),
        ('see', 'See also'),
        ('extends', 'Extends'),
        ('throws', 'Throws'),
        ('throw', 'Throws'),
    )
    for tag, heading in headings:
        # ``\b`` stops a tag from matching as a prefix of a longer command:
        # without it ``\throws X`` was consumed by ``\throw`` and rendered
        # as "Throws: s X".
        s = re.sub(r'\\%s\b\s*' % tag, r'\n\n$%s:\n\n' % heading, s)

    s = re.sub(r'\\details\s*', r'\n\n', s)
    s = re.sub(r'\\brief\s*', r'', s)
    s = re.sub(r'\\short\s*', r'', s)
    s = re.sub(r'\\ref\s*', r'', s)

    s = re.sub(r'\\code\s?(.*?)\s?\\endcode',
               r"```\n\1\n```\n", s, flags=re.DOTALL)

    # HTML/TeX tags
    s = re.sub(r'<tt>(.*?)</tt>', r'``\1``', s, flags=re.DOTALL)
    s = re.sub(r'<pre>(.*?)</pre>', r"```\n\1\n```\n", s, flags=re.DOTALL)
    s = re.sub(r'<em>(.*?)</em>', r'*\1*', s, flags=re.DOTALL)
    s = re.sub(r'<b>(.*?)</b>', r'**\1**', s, flags=re.DOTALL)
    s = re.sub(r'\\f\$(.*?)\\f\$', r'$\1$', s, flags=re.DOTALL)
    s = re.sub(r'<li>', r'\n\n* ', s)
    s = re.sub(r'</?ul>', r'', s)
    s = re.sub(r'</li>', r'\n\n', s)

    # Spell the C++ boolean literals the Python way.
    s = s.replace('``true``', '``True``')
    s = s.replace('``false``', '``False``')
    return s


def _reflow(text):
    """Wrap paragraphs at 70 columns, leaving fenced code segments as-is."""
    wrapper = textwrap.TextWrapper(
        width=70, expand_tabs=True, replace_whitespace=True,
        drop_whitespace=True, initial_indent='', subsequent_indent='')

    pieces = []
    in_code_segment = False
    for chunk in re.split(r'(```)', text):
        if chunk == '```':
            pieces.append('\n```\n\n' if in_code_segment else '```\n')
            in_code_segment = not in_code_segment
        elif in_code_segment:
            pieces.append(chunk.strip())
        else:
            # Paragraphs are separated by two or more line breaks.
            for paragraph in re.split(r'(?: *\n *){2,}', chunk):
                wrapped = wrapper.fill(
                    re.sub(r'\s+', ' ', paragraph).strip())
                if wrapped.startswith('$'):
                    # Section heading: drop the marker and indent the
                    # paragraph that follows by four spaces.
                    pieces.append(wrapped[1:] + '\n')
                    wrapper.initial_indent = \
                        wrapper.subsequent_indent = ' ' * 4
                else:
                    if wrapped:
                        pieces.append(wrapped + '\n\n')
                    wrapper.initial_indent = wrapper.subsequent_indent = ''
    return ''.join(pieces).rstrip().lstrip('\n')


def process_comment(comment):
    """Convert a raw C++ documentation comment into docstring text.

    The comment delimiters (``/* */``, ``///``, leading ``*``) and the
    common indentation are removed, Doxygen commands and a few HTML/TeX
    tags are rewritten, and the prose is re-wrapped at 70 columns.  Text
    between triple backticks (produced from ``\\code`` and ``<pre>``) is
    not re-wrapped.

    Parameters:
        comment: the comment text as a ``str``; may be empty.

    Returns:
        The processed text without leading newlines or trailing
        whitespace; an empty string for an empty comment.
    """
    text = _strip_comment_syntax(comment)
    text = _translate_markup(text)
    return _reflow(text)
191
192
def extract(filename, node, prefix, output):
    """Collect docstring entries for ``node`` and its descendants.

    Nodes located in a file other than ``filename`` are skipped (the
    function then returns 0).  For kinds in ``RECURSE_LIST`` the children
    are visited first, with the node's spelling appended to ``prefix``
    unless its kind is in ``PREFIX_BLACKLIST``.  For kinds in
    ``PRINT_LIST`` with a non-empty spelling, a tuple
    ``(sanitized name, filename, processed comment)`` is appended to
    ``output``.
    """
    origin = node.location.file
    if origin is not None and \
            not os.path.samefile(d(origin.name), filename):
        return 0

    kind = node.kind
    if kind in RECURSE_LIST:
        child_prefix = prefix
        if kind not in PREFIX_BLACKLIST:
            if child_prefix:
                child_prefix += '_'
            child_prefix += d(node.spelling)
        for child in node.get_children():
            extract(filename, child, child_prefix, output)

    if kind in PRINT_LIST:
        raw = node.raw_comment
        comment = process_comment('' if raw is None else d(raw))
        if len(node.spelling) > 0:
            own_prefix = prefix + '_' if prefix else prefix
            entry_name = sanitize_name(own_prefix + d(node.spelling))
            output.append((entry_name, filename, comment))
214
215
class ExtractionThread(Thread):
    """Thread that parses one file and appends its entries to ``output``."""

    def __init__(self, filename, parameters, output):
        """Store the job description and take one ``job_semaphore`` permit.

        Construction blocks while no permit is available.
        """
        super().__init__()
        self.filename, self.parameters, self.output = \
            filename, parameters, output
        job_semaphore.acquire()

    def run(self):
        """Parse ``self.filename`` and extract its documentation.

        The permit taken in ``__init__`` is returned even if parsing or
        extraction raises.
        """
        print('Processing "%s" ..' % self.filename, file=sys.stderr)
        try:
            raw_index = cindex.conf.lib.clang_createIndex(False, True)
            unit = cindex.Index(raw_index).parse(
                self.filename, self.parameters)
            extract(self.filename, unit.cursor, '', self.output)
        finally:
            job_semaphore.release()
233
234
def read_args(args):
    """Split command-line arguments into clang parameters and input files.

    Defaults are added first: ``-x c++`` unless ``-x`` is given, and
    ``-std=c++11`` unless a ``-std=`` option is given.  On macOS the Xcode
    libclang and an SDK sysroot are configured when present; on Linux the
    highest-versioned ``/usr/lib*/clang/*/include`` directory is added as a
    system include path when one exists.

    Every argument starting with ``-`` is passed on as a clang parameter;
    if it is an option whose value is a separate argument (``-x c``,
    ``-I dir``, ``-isystem dir`` ...), that value is passed on with it
    instead of being mistaken for an input file.  All other arguments are
    treated as filenames.

    Parameters:
        args: list of argument strings.

    Returns:
        A ``(parameters, filenames)`` tuple of two lists.

    Raises:
        NoFilenamesError: if ``args`` contains no filename.
    """
    # Clang options that take their value from the following argument.
    flags_with_value = {
        '-x', '-I', '-D', '-U', '-isystem', '-isysroot', '-include',
        '-iquote', '-idirafter',
    }

    parameters = []
    filenames = []
    if "-x" not in args:
        parameters.extend(['-x', 'c++'])
    if not any(it.startswith("-std=") for it in args):
        parameters.append('-std=c++11')

    system = platform.system()
    if system == 'Darwin':
        dev_path = '/Applications/Xcode.app/Contents/Developer/'
        lib_dir = dev_path + 'Toolchains/XcodeDefault.xctoolchain/usr/lib/'
        sdk_dir = dev_path + 'Platforms/MacOSX.platform/Developer/SDKs'
        libclang = lib_dir + 'libclang.dylib'

        if os.path.exists(libclang):
            cindex.Config.set_library_path(os.path.dirname(libclang))

        if os.path.exists(sdk_dir):
            # Use the first SDK listed; skip the option entirely when the
            # directory holds no SDK instead of failing on an empty list.
            sdk_names = next(os.walk(sdk_dir), (sdk_dir, [], []))[1]
            if sdk_names:
                parameters.append('-isysroot')
                parameters.append(os.path.join(sdk_dir, sdk_names[0]))
    elif system == 'Linux':
        # clang doesn't find its own base includes by default on Linux,
        # but different distros install them in different paths.
        # Try to autodetect, preferring the highest numbered version.
        def clang_folder_version(path):
            # Numeric components of the path, ignoring the digits that
            # belong to "lib64"/"lib32".
            return [int(ver)
                    for ver in re.findall(r'(?<!lib)(?<!\d)\d+', path)]
        clang_include_dir = max((
            path
            for libdir in ['lib64', 'lib', 'lib32']
            for path in glob('/usr/%s/clang/*/include' % libdir)
            if os.path.isdir(path)
        ), default=None, key=clang_folder_version)
        if clang_include_dir:
            parameters.extend(['-isystem', clang_include_dir])

    remaining = iter(args)
    for item in remaining:
        if not item.startswith('-'):
            filenames.append(item)
            continue
        parameters.append(item)
        if item in flags_with_value:
            # Keep the option's value next to it; a trailing option with
            # no value is passed through unchanged.
            value = next(remaining, None)
            if value is not None:
                parameters.append(value)

    if len(filenames) == 0:
        raise NoFilenamesError("args parameter did not contain any filenames")

    return parameters, filenames
281
282
def extract_all(args):
    """Extract the documentation of every file named in ``args``.

    One ``ExtractionThread`` is started per filename, and all of them are
    joined before returning.

    Joining the threads, rather than draining ``job_semaphore``, leaves the
    semaphore with all of its permits once the threads have finished.  The
    previous wait loop acquired every permit and never returned them, so a
    second call in the same process blocked forever.

    Parameters:
        args: list of argument strings, as accepted by ``read_args``.

    Returns:
        The list of ``(name, filename, comment)`` tuples collected by the
        threads.

    Raises:
        NoFilenamesError: propagated from ``read_args``.
    """
    parameters, filenames = read_args(args)
    output = []
    threads = []
    for filename in filenames:
        thr = ExtractionThread(filename, parameters, output)
        thr.start()
        threads.append(thr)

    print('Waiting for jobs to finish ..', file=sys.stderr)
    for thr in threads:
        thr.join()

    return output
295
296
def write_header(comments, out_file=None):
    """Write the C++ header that defines one docstring constant per entry.

    Entries are written sorted by name, then filename.  When several
    entries share a name, the second and later ones get the suffixes
    ``_2``, ``_3`` ... so that every constant is unique.

    Parameters:
        comments: iterable of ``(name, filename, comment)`` tuples.
        out_file: text stream to write to.  ``None`` (the default) means
            the ``sys.stdout`` in effect at call time, so redirecting
            standard output after this module was imported is honoured.
    """
    print('''/*
  This file contains docstrings for the Python bindings.
  Do not edit! These were automatically extracted by mkdoc.py
 */

#define __EXPAND(x)                                      x
#define __COUNT(_1, _2, _3, _4, _5, _6, _7, COUNT, ...)  COUNT
#define __VA_SIZE(...)                                   __EXPAND(__COUNT(__VA_ARGS__, 7, 6, 5, 4, 3, 2, 1))
#define __CAT1(a, b)                                     a ## b
#define __CAT2(a, b)                                     __CAT1(a, b)
#define __DOC1(n1)                                       __doc_##n1
#define __DOC2(n1, n2)                                   __doc_##n1##_##n2
#define __DOC3(n1, n2, n3)                               __doc_##n1##_##n2##_##n3
#define __DOC4(n1, n2, n3, n4)                           __doc_##n1##_##n2##_##n3##_##n4
#define __DOC5(n1, n2, n3, n4, n5)                       __doc_##n1##_##n2##_##n3##_##n4##_##n5
#define __DOC6(n1, n2, n3, n4, n5, n6)                   __doc_##n1##_##n2##_##n3##_##n4##_##n5##_##n6
#define __DOC7(n1, n2, n3, n4, n5, n6, n7)               __doc_##n1##_##n2##_##n3##_##n4##_##n5##_##n6##_##n7
#define DOC(...)                                         __EXPAND(__EXPAND(__CAT2(__DOC, __VA_SIZE(__VA_ARGS__)))(__VA_ARGS__))

#if defined(__GNUG__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-variable"
#endif
''', file=out_file)

    previous_name = None
    occurrence = 1
    for name, _, comment in sorted(comments, key=lambda x: (x[0], x[1])):
        if name == previous_name:
            # Repeated name (for example an overload): number it.
            occurrence += 1
            name = name + "_%i" % occurrence
        else:
            previous_name = name
            occurrence = 1
        # Multi-line docstrings start on their own line.
        separator = '\n' if '\n' in comment else ' '
        print('\nstatic const char *%s =%sR"doc(%s)doc";' %
              (name, separator, comment), file=out_file)

    print('''
#if defined(__GNUG__)
#pragma GCC diagnostic pop
#endif
''', file=out_file)
341
342
def mkdoc(args):
    """Extract documentation and write the generated header.

    The first argument starting with ``-o`` selects the output file, given
    either joined (``-opath``) or as the next argument (``-o path``); it is
    removed before the remaining arguments are handed to ``extract_all``.
    Without an output path the header is written to standard output.

    Parameters:
        args: iterable of command-line argument strings.

    Raises:
        SystemExit: with code -1 when ``-o`` is the last argument.
        NoFilenamesError: propagated from ``extract_all``.
    """
    args = list(args)
    out_path = None
    for idx, arg in enumerate(args):
        if arg.startswith("-o"):
            del args[idx]
            out_path = arg[2:]
            if not out_path:
                if idx >= len(args):
                    # Diagnostics go to stderr: stdout may carry the header.
                    print("-o flag requires an argument", file=sys.stderr)
                    sys.exit(-1)
                out_path = args.pop(idx)
            break

    comments = extract_all(args)

    if not out_path:
        write_header(comments)
        return

    # Opened outside the cleanup block: if the file cannot be opened at
    # all, nothing was written and an existing file must not be removed.
    out_file = open(out_path, 'w')
    try:
        with out_file:
            write_header(comments, out_file)
    except BaseException:
        # In the event of an error, don't leave a partially-written
        # output file.
        try:
            os.unlink(out_path)
        except OSError:
            pass
        raise
372
373
if __name__ == '__main__':
    try:
        mkdoc(sys.argv[1:])
    except NoFilenamesError:
        # No input file was named.  The usage line goes to stderr because
        # stdout is where the generated header is written by default, and
        # sys.exit is used because the bare ``exit`` name only exists when
        # the ``site`` module has been loaded.
        print('Syntax: %s [.. a list of header files ..]' % sys.argv[0],
              file=sys.stderr)
        sys.exit(-1)
379        exit(-1)
380