#!/usr/bin/env python
#
# Copyright 2007 Neal Norwitz
# Portions Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tokenize C++ source code."""

__author__ = 'nnorwitz@google.com (Neal Norwitz)'


try:
    # Python 3.x
    import builtins
except ImportError:
    # Python 2.x
    import __builtin__ as builtins


import sys

from cpp import utils


if not hasattr(builtins, 'set'):
    # Nominal support for Python 2.3.
    from sets import Set as set


# Add $ as a valid identifier char since so much code uses it.
_letters = 'abcdefghijklmnopqrstuvwxyz'
VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
HEX_DIGITS = set('0123456789abcdefABCDEF')
INT_OR_FLOAT_DIGITS = set('01234567890eE-+')


# C++0x string prefixes.
_STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))


# Token types.
UNKNOWN = 'UNKNOWN'
SYNTAX = 'SYNTAX'
CONSTANT = 'CONSTANT'
NAME = 'NAME'
PREPROCESSOR = 'PREPROCESSOR'

# Where the token originated from.  This can be used for backtracking.
# It is always set to WHENCE_STREAM in this code.
WHENCE_STREAM, WHENCE_QUEUE = range(2)


class Token(object):
    """Data container to represent a C++ token.

    Tokens can be identifiers, syntax char(s), constants, or
    pre-processor directives.

    start contains the index of the first char of the token in the source
    end contains the index one past the last char of the token in the
        source, i.e. source[start:end] is the token text
    """

    def __init__(self, token_type, name, start, end):
        self.token_type = token_type
        self.name = name
        self.start = start
        self.end = end
        self.whence = WHENCE_STREAM

    def __str__(self):
        # Positions are only included when debugging is enabled.
        if not utils.DEBUG:
            return 'Token(%r)' % self.name
        return 'Token(%r, %s, %s)' % (self.name, self.start, self.end)

    __repr__ = __str__


def _GetString(source, start, i):
    """Finds the end of the string literal whose opening quote is at i.

    Args:
      source: string of C++ source code.
      start: index of the first char of the token.  Not used here; kept so
        the signature parallels _GetChar.
      i: index of the opening double quote.

    Returns:
      The index one past the closing quote, or 0 if there is no closing
      quote (GetTokens treats a non-positive index as invalid).
    """
    i = source.find('"', i+1)
    # The i > 0 test stops the scan once find() reports -1; without it the
    # loop would inspect source[-2] and could restart from the beginning.
    while i > 0 and source[i-1] == '\\':
        # Count the trailing backslashes.
        backslash_count = 1
        j = i - 2
        while source[j] == '\\':
            backslash_count += 1
            j -= 1
        # When trailing backslashes are even, they escape each other.
        if (backslash_count % 2) == 0:
            break
        i = source.find('"', i+1)
    return i + 1


def _GetChar(source, start, i):
    """Finds the end of the character literal whose opening quote is at i.

    Args:
      source: string of C++ source code.
      start: index of the first char of the token (the quote itself, or the
        u/U/L prefix in front of it).
      i: index of the opening single quote.

    Returns:
      The index one past the closing quote, or start + 1 if there is no
      closing quote.
    """
    # NOTE(nnorwitz): may not be quite correct, should be good enough.
    i = source.find("'", i+1)
    # The i > 0 test stops the scan once find() reports -1.
    while i > 0 and source[i-1] == '\\':
        # Need to special case '\\'.
        if (i - 2) > start and source[i-2] == '\\':
            break
        i = source.find("'", i+1)
    # Try to handle unterminated single quotes (in a #if 0 block).
    if i < 0:
        i = start
    return i + 1


def GetTokens(source):
    """Returns a sequence of Tokens.

    Comments and whitespace are skipped.  Generation stops early, after
    printing a message, if a string literal is never terminated.

    Args:
      source: string of C++ source code.

    Yields:
      Token that represents the next token in the source.

    Raises:
      RuntimeError: an unexpected character was found outside a #if 0 block.
      AssertionError: a double quote inside a preprocessor directive is
        never closed.
    """
    # Cache various valid character sets for speed.
    valid_identifier_chars = VALID_IDENTIFIER_CHARS
    hex_digits = HEX_DIGITS
    int_or_float_digits = INT_OR_FLOAT_DIGITS
    int_or_float_digits2 = int_or_float_digits | set('.')

    # Only ignore errors while in a #if 0 block.
    ignore_errors = False
    count_ifs = 0

    i = 0
    end = len(source)
    while i < end:
        # Skip whitespace.
        while i < end and source[i].isspace():
            i += 1
        if i >= end:
            return

        token_type = UNKNOWN
        start = i
        c = source[i]
        if c.isalpha() or c == '_':  # Find a string token.
            token_type = NAME
            # c already qualified as the first char of a name, so consume it
            # unconditionally.  This guarantees progress for letters that
            # are not in VALID_IDENTIFIER_CHARS (e.g. non-ASCII ones), which
            # would otherwise yield empty tokens forever.
            i += 1
            while i < end and source[i] in valid_identifier_chars:
                i += 1
            # String and character constants can look like a name if
            # they are something like L"".
            # Slicing yields '' at the end of the source instead of raising.
            next_ch = source[i:i+1]
            if (next_ch == "'" and (i - start) == 1 and
                source[start:i] in 'uUL'):
                # u, U, and L are valid C++0x character prefixes.
                token_type = CONSTANT
                i = _GetChar(source, start, i)
            elif next_ch == '"' and source[start:i] in _STR_PREFIXES:
                # NOTE: raw string delimiters are not interpreted; the
                # literal is taken to end at the next unescaped quote.
                token_type = CONSTANT
                i = _GetString(source, start, i)
        elif source.startswith('//', i):  # Find // comments.
            i = source.find('\n', i)
            if i == -1:  # Handle EOF.
                i = end
            continue
        elif source.startswith('/*', i):  # Find /* comments. */
            # Search past the opener so that "/*/" is not taken for a
            # complete comment.
            comment_end = source.find('*/', i + 2)
            if comment_end == -1:  # Unterminated comment runs to EOF.
                i = end
            else:
                i = comment_end + 2
            continue
        elif c in ':+-<>&|*=':  # : or :: (plus other chars).
            token_type = SYNTAX
            i += 1
            new_ch = source[i:i+1]  # '' at the end of the source.
            if new_ch == c and c != '>':  # Treat ">>" as two tokens.
                i += 1
            elif c == '-' and new_ch == '>':
                i += 1
            elif new_ch == '=':
                i += 1
        elif c in '()[]{}~!?^%;/.,':  # Handle single char tokens.
            token_type = SYNTAX
            i += 1
            if c == '.' and source[i:i+1].isdigit():
                token_type = CONSTANT
                i += 1
                while i < end and source[i] in int_or_float_digits:
                    i += 1
                # Handle float suffixes.
                for suffix in ('l', 'f'):
                    if suffix == source[i:i+1].lower():
                        i += 1
                        break
        elif c.isdigit():  # Find integer.
            token_type = CONSTANT
            # Compare against a tuple: '' (end of source) is "in" any string.
            if c == '0' and source[i+1:i+2] in ('x', 'X'):
                # Handle hex digits.
                i += 2
                while i < end and source[i] in hex_digits:
                    i += 1
            else:
                # Consume c unconditionally so that digits outside the ASCII
                # set (for which isdigit() is also true) still make progress.
                i += 1
                while i < end and source[i] in int_or_float_digits2:
                    i += 1
            # Handle integer (and float) suffixes.
            for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
                size = len(suffix)
                if suffix == source[i:i+size].lower():
                    i += size
                    break
        elif c == '"':  # Find string.
            token_type = CONSTANT
            i = _GetString(source, start, i)
        elif c == "'":  # Find char.
            token_type = CONSTANT
            i = _GetChar(source, start, i)
        elif c == '#':  # Find pre-processor command.
            token_type = PREPROCESSOR
            got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
            if got_if:
                count_ifs += 1
            elif (source[i:i+6] == '#ifdef' or
                  source[i:i+7] == '#ifndef'):
                # These are closed by #endif as well, so they have to be
                # counted to keep the nesting depth balanced.
                count_ifs += 1
            elif source[i:i+6] == '#endif':
                count_ifs -= 1
                if count_ifs == 0:
                    ignore_errors = False

            # TODO(nnorwitz): handle preprocessor statements (\ continuations).
            while 1:
                i1 = source.find('\n', i)
                i2 = source.find('//', i)
                i3 = source.find('/*', i)
                i4 = source.find('"', i)
                # NOTE(nnorwitz): doesn't handle comments in #define macros.
                # Get the first important symbol (newline, comment, EOF/end).
                i = min([x for x in (i1, i2, i3, i4, end) if x != -1])

                # Handle #include "dir//foo.h" properly.
                if i < end and source[i] == '"':
                    closing = source.find('"', i+1)
                    if closing < 0:
                        # Raised explicitly (rather than via assert) so the
                        # check survives python -O, where a stripped assert
                        # would leave this loop spinning.
                        raise AssertionError(
                            'unterminated quote in preprocessor directive '
                            '@ %d' % start)
                    i = closing + 1
                    continue
                # Keep going if end of the line and the line ends with \.
                if not (i == i1 and source[i-1] == '\\'):
                    if got_if:
                        condition = source[start+4:i].lstrip()
                        if (condition.startswith('0') or
                            condition.startswith('(0)')):
                            ignore_errors = True
                    break
                i += 1
        elif c == '\\':  # Handle \ in code.
            # This is different from the pre-processor \ handling.
            i += 1
            continue
        elif ignore_errors:
            # The tokenizer seems to be in pretty good shape.  This
            # raise is conditionally disabled so that bogus code
            # in an #if 0 block can be handled.  Since we will ignore
            # it anyways, this is probably fine.  So disable the
            # exception and return the bogus char.
            i += 1
        else:
            # Clamp the context window so it cannot wrap around to the end
            # of the source when i < 10.
            sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
                             ('?', i, c, source[max(0, i-10):i+10]))
            raise RuntimeError('unexpected token')

        # _GetString returns 0 for an unterminated string literal.
        if i <= 0:
            print('Invalid index, exiting now.')
            return
        yield Token(token_type, source[start:i], start, i)


if __name__ == '__main__':
    def main(argv):
        """Driver mostly for testing purposes."""
        for filename in argv[1:]:
            source = utils.ReadFile(filename)
            if source is None:
                continue

            for token in GetTokens(source):
                print('%-12s: %s' % (token.token_type, token.name))
                # print('\r%6.2f%%' % (100.0 * index / token.end),)
            sys.stdout.write('\n')


    main(sys.argv)