GardenSnake.py revision 4479
1# GardenSnake - a parser generator demonstration program 2# 3# This implements a modified version of a subset of Python: 4# - only 'def', 'return' and 'if' statements 5# - 'if' only has 'then' clause (no elif nor else) 6# - single-quoted strings only, content in raw format 7# - numbers are decimal.Decimal instances (not integers or floats) 8# - no print statment; use the built-in 'print' function 9# - only < > == + - / * implemented (and unary + -) 10# - assignment and tuple assignment work 11# - no generators of any sort 12# - no ... well, no quite a lot 13 14# Why? I'm thinking about a new indentation-based configuration 15# language for a project and wanted to figure out how to do it. Once 16# I got that working I needed a way to test it out. My original AST 17# was dumb so I decided to target Python's AST and compile it into 18# Python code. Plus, it's pretty cool that it only took a day or so 19# from sitting down with Ply to having working code. 20 21# This uses David Beazley's Ply from http://www.dabeaz.com/ply/ 22 23# This work is hereby released into the Public Domain. To view a copy of 24# the public domain dedication, visit 25# http://creativecommons.org/licenses/publicdomain/ or send a letter to 26# Creative Commons, 543 Howard Street, 5th Floor, San Francisco, 27# California, 94105, USA. 28# 29# Portions of this work are derived from Python's Grammar definition 30# and may be covered under the Python copyright and license 31# 32# Andrew Dalke / Dalke Scientific Software, LLC 33# 30 August 2006 / Cape Town, South Africa 34 35# Changelog: 36# 30 August - added link to CC license; removed the "swapcase" encoding 37 38# Modifications for inclusion in PLY distribution 39import sys 40sys.path.insert(0,"../..") 41from ply import * 42 43##### Lexer ###### 44#import lex 45import decimal 46 47tokens = ( 48 'DEF', 49 'IF', 50 'NAME', 51 'NUMBER', # Python decimals 52 'STRING', # single quoted strings only; syntax of raw strings 53 'LPAR', 54 'RPAR', 55 'COLON', 56 'EQ', 57 'ASSIGN', 58 'LT', 59 'GT', 60 'PLUS', 61 'MINUS', 62 'MULT', 63 'DIV', 64 'RETURN', 65 'WS', 66 'NEWLINE', 67 'COMMA', 68 'SEMICOLON', 69 'INDENT', 70 'DEDENT', 71 'ENDMARKER', 72 ) 73 74#t_NUMBER = r'\d+' 75# taken from decmial.py but without the leading sign 76def t_NUMBER(t): 77 r"""(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?""" 78 t.value = decimal.Decimal(t.value) 79 return t 80 81def t_STRING(t): 82 r"'([^\\']+|\\'|\\\\)*'" # I think this is right ... 83 t.value=t.value[1:-1].decode("string-escape") # .swapcase() # for fun 84 return t 85 86t_COLON = r':' 87t_EQ = r'==' 88t_ASSIGN = r'=' 89t_LT = r'<' 90t_GT = r'>' 91t_PLUS = r'\+' 92t_MINUS = r'-' 93t_MULT = r'\*' 94t_DIV = r'/' 95t_COMMA = r',' 96t_SEMICOLON = r';' 97 98# Ply nicely documented how to do this. 99 100RESERVED = { 101 "def": "DEF", 102 "if": "IF", 103 "return": "RETURN", 104 } 105 106def t_NAME(t): 107 r'[a-zA-Z_][a-zA-Z0-9_]*' 108 t.type = RESERVED.get(t.value, "NAME") 109 return t 110 111# Putting this before t_WS let it consume lines with only comments in 112# them so the latter code never sees the WS part. Not consuming the 113# newline. Needed for "if 1: #comment" 114def t_comment(t): 115 r"[ ]*\043[^\n]*" # \043 is '#' 116 pass 117 118 119# Whitespace 120def t_WS(t): 121 r' [ ]+ ' 122 if t.lexer.at_line_start and t.lexer.paren_count == 0: 123 return t 124 125# Don't generate newline tokens when inside of parenthesis, eg 126# a = (1, 127# 2, 3) 128def t_newline(t): 129 r'\n+' 130 t.lexer.lineno += len(t.value) 131 t.type = "NEWLINE" 132 if t.lexer.paren_count == 0: 133 return t 134 135def t_LPAR(t): 136 r'\(' 137 t.lexer.paren_count += 1 138 return t 139 140def t_RPAR(t): 141 r'\)' 142 # check for underflow? should be the job of the parser 143 t.lexer.paren_count -= 1 144 return t 145 146 147def t_error(t): 148 raise SyntaxError("Unknown symbol %r" % (t.value[0],)) 149 print "Skipping", repr(t.value[0]) 150 t.lexer.skip(1) 151 152## I implemented INDENT / DEDENT generation as a post-processing filter 153 154# The original lex token stream contains WS and NEWLINE characters. 155# WS will only occur before any other tokens on a line. 156 157# I have three filters. One tags tokens by adding two attributes. 158# "must_indent" is True if the token must be indented from the 159# previous code. The other is "at_line_start" which is True for WS 160# and the first non-WS/non-NEWLINE on a line. It flags the check so 161# see if the new line has changed indication level. 162 163# Python's syntax has three INDENT states 164# 0) no colon hence no need to indent 165# 1) "if 1: go()" - simple statements have a COLON but no need for an indent 166# 2) "if 1:\n go()" - complex statements have a COLON NEWLINE and must indent 167NO_INDENT = 0 168MAY_INDENT = 1 169MUST_INDENT = 2 170 171# only care about whitespace at the start of a line 172def track_tokens_filter(lexer, tokens): 173 lexer.at_line_start = at_line_start = True 174 indent = NO_INDENT 175 saw_colon = False 176 for token in tokens: 177 token.at_line_start = at_line_start 178 179 if token.type == "COLON": 180 at_line_start = False 181 indent = MAY_INDENT 182 token.must_indent = False 183 184 elif token.type == "NEWLINE": 185 at_line_start = True 186 if indent == MAY_INDENT: 187 indent = MUST_INDENT 188 token.must_indent = False 189 190 elif token.type == "WS": 191 assert token.at_line_start == True 192 at_line_start = True 193 token.must_indent = False 194 195 else: 196 # A real token; only indent after COLON NEWLINE 197 if indent == MUST_INDENT: 198 token.must_indent = True 199 else: 200 token.must_indent = False 201 at_line_start = False 202 indent = NO_INDENT 203 204 yield token 205 lexer.at_line_start = at_line_start 206 207def _new_token(type, lineno): 208 tok = lex.LexToken() 209 tok.type = type 210 tok.value = None 211 tok.lineno = lineno 212 return tok 213 214# Synthesize a DEDENT tag 215def DEDENT(lineno): 216 return _new_token("DEDENT", lineno) 217 218# Synthesize an INDENT tag 219def INDENT(lineno): 220 return _new_token("INDENT", lineno) 221 222 223# Track the indentation level and emit the right INDENT / DEDENT events. 224def indentation_filter(tokens): 225 # A stack of indentation levels; will never pop item 0 226 levels = [0] 227 token = None 228 depth = 0 229 prev_was_ws = False 230 for token in tokens: 231## if 1: 232## print "Process", token, 233## if token.at_line_start: 234## print "at_line_start", 235## if token.must_indent: 236## print "must_indent", 237## print 238 239 # WS only occurs at the start of the line 240 # There may be WS followed by NEWLINE so 241 # only track the depth here. Don't indent/dedent 242 # until there's something real. 243 if token.type == "WS": 244 assert depth == 0 245 depth = len(token.value) 246 prev_was_ws = True 247 # WS tokens are never passed to the parser 248 continue 249 250 if token.type == "NEWLINE": 251 depth = 0 252 if prev_was_ws or token.at_line_start: 253 # ignore blank lines 254 continue 255 # pass the other cases on through 256 yield token 257 continue 258 259 # then it must be a real token (not WS, not NEWLINE) 260 # which can affect the indentation level 261 262 prev_was_ws = False 263 if token.must_indent: 264 # The current depth must be larger than the previous level 265 if not (depth > levels[-1]): 266 raise IndentationError("expected an indented block") 267 268 levels.append(depth) 269 yield INDENT(token.lineno) 270 271 elif token.at_line_start: 272 # Must be on the same level or one of the previous levels 273 if depth == levels[-1]: 274 # At the same level 275 pass 276 elif depth > levels[-1]: 277 raise IndentationError("indentation increase but not in new block") 278 else: 279 # Back up; but only if it matches a previous level 280 try: 281 i = levels.index(depth) 282 except ValueError: 283 raise IndentationError("inconsistent indentation") 284 for _ in range(i+1, len(levels)): 285 yield DEDENT(token.lineno) 286 levels.pop() 287 288 yield token 289 290 ### Finished processing ### 291 292 # Must dedent any remaining levels 293 if len(levels) > 1: 294 assert token is not None 295 for _ in range(1, len(levels)): 296 yield DEDENT(token.lineno) 297 298 299# The top-level filter adds an ENDMARKER, if requested. 300# Python's grammar uses it. 301def filter(lexer, add_endmarker = True): 302 token = None 303 tokens = iter(lexer.token, None) 304 tokens = track_tokens_filter(lexer, tokens) 305 for token in indentation_filter(tokens): 306 yield token 307 308 if add_endmarker: 309 lineno = 1 310 if token is not None: 311 lineno = token.lineno 312 yield _new_token("ENDMARKER", lineno) 313 314# Combine Ply and my filters into a new lexer 315 316class IndentLexer(object): 317 def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0): 318 self.lexer = lex.lex(debug=debug, optimize=optimize, lextab=lextab, reflags=reflags) 319 self.token_stream = None 320 def input(self, s, add_endmarker=True): 321 self.lexer.paren_count = 0 322 self.lexer.input(s) 323 self.token_stream = filter(self.lexer, add_endmarker) 324 def token(self): 325 try: 326 return self.token_stream.next() 327 except StopIteration: 328 return None 329 330########## Parser (tokens -> AST) ###### 331 332# also part of Ply 333#import yacc 334 335# I use the Python AST 336from compiler import ast 337 338# Helper function 339def Assign(left, right): 340 names = [] 341 if isinstance(left, ast.Name): 342 # Single assignment on left 343 return ast.Assign([ast.AssName(left.name, 'OP_ASSIGN')], right) 344 elif isinstance(left, ast.Tuple): 345 # List of things - make sure they are Name nodes 346 names = [] 347 for child in left.getChildren(): 348 if not isinstance(child, ast.Name): 349 raise SyntaxError("that assignment not supported") 350 names.append(child.name) 351 ass_list = [ast.AssName(name, 'OP_ASSIGN') for name in names] 352 return ast.Assign([ast.AssTuple(ass_list)], right) 353 else: 354 raise SyntaxError("Can't do that yet") 355 356 357# The grammar comments come from Python's Grammar/Grammar file 358 359## NB: compound_stmt in single_input is followed by extra NEWLINE! 360# file_input: (NEWLINE | stmt)* ENDMARKER 361def p_file_input_end(p): 362 """file_input_end : file_input ENDMARKER""" 363 p[0] = ast.Stmt(p[1]) 364def p_file_input(p): 365 """file_input : file_input NEWLINE 366 | file_input stmt 367 | NEWLINE 368 | stmt""" 369 if isinstance(p[len(p)-1], basestring): 370 if len(p) == 3: 371 p[0] = p[1] 372 else: 373 p[0] = [] # p == 2 --> only a blank line 374 else: 375 if len(p) == 3: 376 p[0] = p[1] + p[2] 377 else: 378 p[0] = p[1] 379 380 381# funcdef: [decorators] 'def' NAME parameters ':' suite 382# ignoring decorators 383def p_funcdef(p): 384 "funcdef : DEF NAME parameters COLON suite" 385 p[0] = ast.Function(None, p[2], tuple(p[3]), (), 0, None, p[5]) 386 387# parameters: '(' [varargslist] ')' 388def p_parameters(p): 389 """parameters : LPAR RPAR 390 | LPAR varargslist RPAR""" 391 if len(p) == 3: 392 p[0] = [] 393 else: 394 p[0] = p[2] 395 396 397# varargslist: (fpdef ['=' test] ',')* ('*' NAME [',' '**' NAME] | '**' NAME) | 398# highly simplified 399def p_varargslist(p): 400 """varargslist : varargslist COMMA NAME 401 | NAME""" 402 if len(p) == 4: 403 p[0] = p[1] + p[3] 404 else: 405 p[0] = [p[1]] 406 407# stmt: simple_stmt | compound_stmt 408def p_stmt_simple(p): 409 """stmt : simple_stmt""" 410 # simple_stmt is a list 411 p[0] = p[1] 412 413def p_stmt_compound(p): 414 """stmt : compound_stmt""" 415 p[0] = [p[1]] 416 417# simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE 418def p_simple_stmt(p): 419 """simple_stmt : small_stmts NEWLINE 420 | small_stmts SEMICOLON NEWLINE""" 421 p[0] = p[1] 422 423def p_small_stmts(p): 424 """small_stmts : small_stmts SEMICOLON small_stmt 425 | small_stmt""" 426 if len(p) == 4: 427 p[0] = p[1] + [p[3]] 428 else: 429 p[0] = [p[1]] 430 431# small_stmt: expr_stmt | print_stmt | del_stmt | pass_stmt | flow_stmt | 432# import_stmt | global_stmt | exec_stmt | assert_stmt 433def p_small_stmt(p): 434 """small_stmt : flow_stmt 435 | expr_stmt""" 436 p[0] = p[1] 437 438# expr_stmt: testlist (augassign (yield_expr|testlist) | 439# ('=' (yield_expr|testlist))*) 440# augassign: ('+=' | '-=' | '*=' | '/=' | '%=' | '&=' | '|=' | '^=' | 441# '<<=' | '>>=' | '**=' | '//=') 442def p_expr_stmt(p): 443 """expr_stmt : testlist ASSIGN testlist 444 | testlist """ 445 if len(p) == 2: 446 # a list of expressions 447 p[0] = ast.Discard(p[1]) 448 else: 449 p[0] = Assign(p[1], p[3]) 450 451def p_flow_stmt(p): 452 "flow_stmt : return_stmt" 453 p[0] = p[1] 454 455# return_stmt: 'return' [testlist] 456def p_return_stmt(p): 457 "return_stmt : RETURN testlist" 458 p[0] = ast.Return(p[2]) 459 460 461def p_compound_stmt(p): 462 """compound_stmt : if_stmt 463 | funcdef""" 464 p[0] = p[1] 465 466def p_if_stmt(p): 467 'if_stmt : IF test COLON suite' 468 p[0] = ast.If([(p[2], p[4])], None) 469 470def p_suite(p): 471 """suite : simple_stmt 472 | NEWLINE INDENT stmts DEDENT""" 473 if len(p) == 2: 474 p[0] = ast.Stmt(p[1]) 475 else: 476 p[0] = ast.Stmt(p[3]) 477 478 479def p_stmts(p): 480 """stmts : stmts stmt 481 | stmt""" 482 if len(p) == 3: 483 p[0] = p[1] + p[2] 484 else: 485 p[0] = p[1] 486 487## No using Python's approach because Ply supports precedence 488 489# comparison: expr (comp_op expr)* 490# arith_expr: term (('+'|'-') term)* 491# term: factor (('*'|'/'|'%'|'//') factor)* 492# factor: ('+'|'-'|'~') factor | power 493# comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not' 494 495def make_lt_compare((left, right)): 496 return ast.Compare(left, [('<', right),]) 497def make_gt_compare((left, right)): 498 return ast.Compare(left, [('>', right),]) 499def make_eq_compare((left, right)): 500 return ast.Compare(left, [('==', right),]) 501 502 503binary_ops = { 504 "+": ast.Add, 505 "-": ast.Sub, 506 "*": ast.Mul, 507 "/": ast.Div, 508 "<": make_lt_compare, 509 ">": make_gt_compare, 510 "==": make_eq_compare, 511} 512unary_ops = { 513 "+": ast.UnaryAdd, 514 "-": ast.UnarySub, 515 } 516precedence = ( 517 ("left", "EQ", "GT", "LT"), 518 ("left", "PLUS", "MINUS"), 519 ("left", "MULT", "DIV"), 520 ) 521 522def p_comparison(p): 523 """comparison : comparison PLUS comparison 524 | comparison MINUS comparison 525 | comparison MULT comparison 526 | comparison DIV comparison 527 | comparison LT comparison 528 | comparison EQ comparison 529 | comparison GT comparison 530 | PLUS comparison 531 | MINUS comparison 532 | power""" 533 if len(p) == 4: 534 p[0] = binary_ops[p[2]]((p[1], p[3])) 535 elif len(p) == 3: 536 p[0] = unary_ops[p[1]](p[2]) 537 else: 538 p[0] = p[1] 539 540# power: atom trailer* ['**' factor] 541# trailers enables function calls. I only allow one level of calls 542# so this is 'trailer' 543def p_power(p): 544 """power : atom 545 | atom trailer""" 546 if len(p) == 2: 547 p[0] = p[1] 548 else: 549 if p[2][0] == "CALL": 550 p[0] = ast.CallFunc(p[1], p[2][1], None, None) 551 else: 552 raise AssertionError("not implemented") 553 554def p_atom_name(p): 555 """atom : NAME""" 556 p[0] = ast.Name(p[1]) 557 558def p_atom_number(p): 559 """atom : NUMBER 560 | STRING""" 561 p[0] = ast.Const(p[1]) 562 563def p_atom_tuple(p): 564 """atom : LPAR testlist RPAR""" 565 p[0] = p[2] 566 567# trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME 568def p_trailer(p): 569 "trailer : LPAR arglist RPAR" 570 p[0] = ("CALL", p[2]) 571 572# testlist: test (',' test)* [','] 573# Contains shift/reduce error 574def p_testlist(p): 575 """testlist : testlist_multi COMMA 576 | testlist_multi """ 577 if len(p) == 2: 578 p[0] = p[1] 579 else: 580 # May need to promote singleton to tuple 581 if isinstance(p[1], list): 582 p[0] = p[1] 583 else: 584 p[0] = [p[1]] 585 # Convert into a tuple? 586 if isinstance(p[0], list): 587 p[0] = ast.Tuple(p[0]) 588 589def p_testlist_multi(p): 590 """testlist_multi : testlist_multi COMMA test 591 | test""" 592 if len(p) == 2: 593 # singleton 594 p[0] = p[1] 595 else: 596 if isinstance(p[1], list): 597 p[0] = p[1] + [p[3]] 598 else: 599 # singleton -> tuple 600 p[0] = [p[1], p[3]] 601 602 603# test: or_test ['if' or_test 'else' test] | lambdef 604# as I don't support 'and', 'or', and 'not' this works down to 'comparison' 605def p_test(p): 606 "test : comparison" 607 p[0] = p[1] 608 609 610 611# arglist: (argument ',')* (argument [',']| '*' test [',' '**' test] | '**' test) 612# XXX INCOMPLETE: this doesn't allow the trailing comma 613def p_arglist(p): 614 """arglist : arglist COMMA argument 615 | argument""" 616 if len(p) == 4: 617 p[0] = p[1] + [p[3]] 618 else: 619 p[0] = [p[1]] 620 621# argument: test [gen_for] | test '=' test # Really [keyword '='] test 622def p_argument(p): 623 "argument : test" 624 p[0] = p[1] 625 626def p_error(p): 627 #print "Error!", repr(p) 628 raise SyntaxError(p) 629 630 631class GardenSnakeParser(object): 632 def __init__(self, lexer = None): 633 if lexer is None: 634 lexer = IndentLexer() 635 self.lexer = lexer 636 self.parser = yacc.yacc(start="file_input_end") 637 638 def parse(self, code): 639 self.lexer.input(code) 640 result = self.parser.parse(lexer = self.lexer) 641 return ast.Module(None, result) 642 643 644###### Code generation ###### 645 646from compiler import misc, syntax, pycodegen 647 648class GardenSnakeCompiler(object): 649 def __init__(self): 650 self.parser = GardenSnakeParser() 651 def compile(self, code, filename="<string>"): 652 tree = self.parser.parse(code) 653 #print tree 654 misc.set_filename(filename, tree) 655 syntax.check(tree) 656 gen = pycodegen.ModuleCodeGenerator(tree) 657 code = gen.getCode() 658 return code 659 660####### Test code ####### 661 662compile = GardenSnakeCompiler().compile 663 664code = r""" 665 666print('LET\'S TRY THIS \\OUT') 667 668#Comment here 669def x(a): 670 print('called with',a) 671 if a == 1: 672 return 2 673 if a*2 > 10: return 999 / 4 674 # Another comment here 675 676 return a+2*3 677 678ints = (1, 2, 679 3, 4, 6805) 681print('mutiline-expression', ints) 682 683t = 4+1/3*2+6*(9-5+1) 684print('predence test; should be 34+2/3:', t, t==(34+2/3)) 685 686print('numbers', 1,2,3,4,5) 687if 1: 688 8 689 a=9 690 print(x(a)) 691 692print(x(1)) 693print(x(2)) 694print(x(8),'3') 695print('this is decimal', 1/5) 696print('BIG DECIMAL', 1.234567891234567e12345) 697 698""" 699 700# Set up the GardenSnake run-time environment 701def print_(*args): 702 print "-->", " ".join(map(str,args)) 703 704globals()["print"] = print_ 705 706compiled_code = compile(code) 707 708exec compiled_code in globals() 709print "Done" 710