#!/usr/bin/env python
#
# Copyright 2007 Neal Norwitz
# Portions Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tokenize C++ source code."""

__author__ = 'nnorwitz@google.com (Neal Norwitz)'


try:
    # Python 3.x
    import builtins
except ImportError:
    # Python 2.x
    import __builtin__ as builtins


import sys

from cpp import utils


if not hasattr(builtins, 'set'):
    # Nominal support for Python 2.3.
    from sets import Set as set


# Add $ as a valid identifier char since so much code uses it.
_letters = 'abcdefghijklmnopqrstuvwxyz'
VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
HEX_DIGITS = set('0123456789abcdefABCDEF')
# Characters GetTokens accepts inside a decimal integer or float literal.
# NOTE(review): '+' and '-' are presumably here for exponents such as 1e-5;
# as a side effect any run of these characters is consumed as one literal.
INT_OR_FLOAT_DIGITS = set('0123456789eE-+')


# C++0x string prefixes.
_STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))


# Token types.
UNKNOWN = 'UNKNOWN'
SYNTAX = 'SYNTAX'
CONSTANT = 'CONSTANT'
NAME = 'NAME'
PREPROCESSOR = 'PREPROCESSOR'

# Where the token originated from.  This can be used for backtracking.
# It is always set to WHENCE_STREAM in this code.
WHENCE_STREAM, WHENCE_QUEUE = range(2)


class Token(object):
    """Data container to represent a C++ token.

    Tokens can be identifiers, syntax char(s), constants, or
    pre-processor directives.

    Attributes:
      token_type: kind of token; GetTokens passes one of the module-level
        constants UNKNOWN, SYNTAX, CONSTANT, NAME or PREPROCESSOR.
      name: the text of the token.
      start: index of the first char of the token in the source.
      end: index just past the last char of the token in the source;
        GetTokens builds tokens so that name == source[start:end].
      whence: where the token came from; always WHENCE_STREAM on creation.
    """

    def __init__(self, token_type, name, start, end):
        self.token_type = token_type
        self.name = name
        self.start = start
        self.end = end
        self.whence = WHENCE_STREAM

    def __str__(self):
        # The source offsets are only shown when debugging is enabled.
        if not utils.DEBUG:
            return 'Token(%r)' % self.name
        return 'Token(%r, %s, %s)' % (self.name, self.start, self.end)

    __repr__ = __str__


8913481Sgiacomo.travaglini@arm.comdef _GetString(source, start, i):
9013481Sgiacomo.travaglini@arm.com    i = source.find('"', i+1)
9113481Sgiacomo.travaglini@arm.com    while source[i-1] == '\\':
9213481Sgiacomo.travaglini@arm.com        # Count the trailing backslashes.
9313481Sgiacomo.travaglini@arm.com        backslash_count = 1
9413481Sgiacomo.travaglini@arm.com        j = i - 2
9513481Sgiacomo.travaglini@arm.com        while source[j] == '\\':
9613481Sgiacomo.travaglini@arm.com            backslash_count += 1
9713481Sgiacomo.travaglini@arm.com            j -= 1
9813481Sgiacomo.travaglini@arm.com        # When trailing backslashes are even, they escape each other.
9913481Sgiacomo.travaglini@arm.com        if (backslash_count % 2) == 0:
10013481Sgiacomo.travaglini@arm.com            break
10113481Sgiacomo.travaglini@arm.com        i = source.find('"', i+1)
10213481Sgiacomo.travaglini@arm.com    return i + 1
10313481Sgiacomo.travaglini@arm.com
10413481Sgiacomo.travaglini@arm.com
10513481Sgiacomo.travaglini@arm.comdef _GetChar(source, start, i):
10613481Sgiacomo.travaglini@arm.com    # NOTE(nnorwitz): may not be quite correct, should be good enough.
10713481Sgiacomo.travaglini@arm.com    i = source.find("'", i+1)
10813481Sgiacomo.travaglini@arm.com    while source[i-1] == '\\':
10913481Sgiacomo.travaglini@arm.com        # Need to special case '\\'.
11013481Sgiacomo.travaglini@arm.com        if (i - 2) > start and source[i-2] == '\\':
11113481Sgiacomo.travaglini@arm.com            break
11213481Sgiacomo.travaglini@arm.com        i = source.find("'", i+1)
11313481Sgiacomo.travaglini@arm.com    # Try to handle unterminated single quotes (in a #if 0 block).
11413481Sgiacomo.travaglini@arm.com    if i < 0:
11513481Sgiacomo.travaglini@arm.com        i = start
11613481Sgiacomo.travaglini@arm.com    return i + 1
11713481Sgiacomo.travaglini@arm.com
11813481Sgiacomo.travaglini@arm.com
def GetTokens(source):
    """Returns a sequence of Tokens.

    Comments and whitespace are skipped.  A lone backslash in code is
    skipped as well.  Iteration stops early, after printing a message, when
    an unterminated string literal is encountered.

    Args:
      source: string of C++ source code.

    Yields:
      Token that represents the next token in the source.

    Raises:
      RuntimeError: if an unexpected character is found while not inside
        an #if 0 block.
      AssertionError: if a preprocessor directive contains an unterminated
        double-quoted string.
    """
    # Cache various valid character sets for speed.
    valid_identifier_chars = VALID_IDENTIFIER_CHARS
    hex_digits = HEX_DIGITS
    int_or_float_digits = INT_OR_FLOAT_DIGITS
    int_or_float_digits2 = int_or_float_digits | set('.')

    # Only ignore errors while in a #if 0 block.
    # NOTE(review): only "#if <space>" increments count_ifs while every
    # "#endif" decrements it, so #ifdef/#ifndef blocks drive it negative.
    ignore_errors = False
    count_ifs = 0

    i = 0
    end = len(source)
    while i < end:
        # Skip whitespace.
        while i < end and source[i].isspace():
            i += 1
        if i >= end:
            return

        token_type = UNKNOWN
        start = i
        c = source[i]
        # One character of lookahead.  It is '' at the end of the input so
        # the comparisons below simply fail instead of raising IndexError.
        next_c = source[i+1:i+2]
        if c.isalpha() or c == '_':              # Find a string token.
            token_type = NAME
            # Accept the same characters the test above accepts so that the
            # first character is always consumed (guarantees progress).
            while i < end and (source[i] in valid_identifier_chars or
                               source[i].isalpha()):
                i += 1
            # String and character constants can look like a name if
            # they are something like L"".
            prefix = source[start:i]
            quote = source[i:i+1]
            if quote == "'" and prefix in ('u8', 'u', 'U', 'L'):
                # u8, u, U, and L are valid C++ character prefixes.
                token_type = CONSTANT
                i = _GetChar(source, start, i)
            elif quote == '"' and prefix in _STR_PREFIXES:
                token_type = CONSTANT
                i = _GetString(source, start, i)
        elif c == '/' and next_c == '/':         # Find // comments.
            i = source.find('\n', i)
            if i == -1:  # Handle EOF.
                i = end
            continue
        elif c == '/' and next_c == '*':         # Find /* comments. */
            # Search past the opener so that "/*/" is not taken as a
            # complete comment.
            i = source.find('*/', i + 2)
            if i == -1:
                # An unterminated comment swallows the rest of the input.
                i = end
            else:
                i += 2
            continue
        elif c in ':+-<>&|*=':                   # : or :: (plus other chars).
            token_type = SYNTAX
            i += 1
            if next_c == c and c != '>':         # Treat ">>" as two tokens.
                i += 1
            elif c == '-' and next_c == '>':
                i += 1
            elif next_c == '=':
                i += 1
        elif c in '()[]{}~!?^%;/.,':             # Handle single char tokens.
            token_type = SYNTAX
            i += 1
            if c == '.' and next_c.isdigit():
                token_type = CONSTANT
                i += 1
                while i < end and source[i] in int_or_float_digits:
                    i += 1
                # Handle float suffixes.
                for suffix in ('l', 'f'):
                    if suffix == source[i:i+1].lower():
                        i += 1
                        break
        elif '0' <= c <= '9':                    # Find integer.
            token_type = CONSTANT
            if c == '0' and next_c in ('x', 'X'):
                # Handle hex digits.
                i += 2
                while i < end and source[i] in hex_digits:
                    i += 1
            else:
                # NOTE(review): the digit set contains '+' and '-', so
                # input such as 1-2 is consumed as a single constant.
                while i < end and source[i] in int_or_float_digits2:
                    i += 1
            # Handle integer (and float) suffixes.
            for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
                size = len(suffix)
                if suffix == source[i:i+size].lower():
                    i += size
                    break
        elif c == '"':                           # Find string.
            token_type = CONSTANT
            i = _GetString(source, start, i)
        elif c == "'":                           # Find char.
            token_type = CONSTANT
            i = _GetChar(source, start, i)
        elif c == '#':                           # Find pre-processor command.
            token_type = PREPROCESSOR
            got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
            if got_if:
                count_ifs += 1
            elif source[i:i+6] == '#endif':
                count_ifs -= 1
                if count_ifs == 0:
                    ignore_errors = False

            # TODO(nnorwitz): handle preprocessor statements (\ continuations).
            while 1:
                i1 = source.find('\n', i)
                i2 = source.find('//', i)
                i3 = source.find('/*', i)
                i4 = source.find('"', i)
                # NOTE(nnorwitz): doesn't handle comments in #define macros.
                # Get the first important symbol (newline, comment, EOF/end).
                i = min([x for x in (i1, i2, i3, i4, end) if x != -1])

                # Handle #include "dir//foo.h" properly.  The slice keeps
                # this safe when the directive runs to the end of the input.
                if source[i:i+1] == '"':
                    i = source.find('"', i+1) + 1
                    if i <= 0:
                        # Raised explicitly so the check survives python -O.
                        raise AssertionError(
                            'unterminated string in preprocessor directive')
                    continue
                # Keep going if end of the line and the line ends with \.
                if not (i == i1 and source[i-1] == '\\'):
                    if got_if:
                        condition = source[start+4:i].lstrip()
                        if (condition.startswith('0') or
                            condition.startswith('(0)')):
                            ignore_errors = True
                    break
                i += 1
        elif c == '\\':                          # Handle \ in code.
            # This is different from the pre-processor \ handling.
            i += 1
            continue
        elif ignore_errors:
            # The tokenizer seems to be in pretty good shape.  This
            # raise is conditionally disabled so that bogus code
            # in an #if 0 block can be handled.  Since we will ignore
            # it anyways, this is probably fine.  So disable the
            # exception and  return the bogus char.
            i += 1
        else:
            # Clamp the context window so it cannot wrap around to the end
            # of the source when the bad character is near the beginning.
            sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
                             ('?', i, c, source[max(0, i-10):i+10]))
            raise RuntimeError('unexpected token')

        if i <= 0:
            # _GetString returns 0 for an unterminated string literal.
            print('Invalid index, exiting now.')
            return
        yield Token(token_type, source[start:i], start, i)


if __name__ == '__main__':
    def main(argv):
        """Driver mostly for testing purposes.

        Prints the type and text of every token of each file named in
        argv[1:], followed by a blank line per file.  Files for which
        utils.ReadFile returns None are skipped.
        """
        for filename in argv[1:]:
            source = utils.ReadFile(filename)
            if source is None:
                continue

            for token in GetTokens(source):
                print('%-12s: %s' % (token.token_type, token.name))
            sys.stdout.write('\n')


    main(sys.argv)