1# GardenSnake - a parser generator demonstration program
2#
3# This implements a modified version of a subset of Python:
4#  - only 'def', 'return' and 'if' statements
5#  - 'if' only has 'then' clause (no elif nor else)
6#  - single-quoted strings only, content in raw format
7#  - numbers are decimal.Decimal instances (not integers or floats)
8#  - no print statment; use the built-in 'print' function
9#  - only < > == + - / * implemented (and unary + -)
10#  - assignment and tuple assignment work
11#  - no generators of any sort
12#  - no ... well, no quite a lot
13
14# Why?  I'm thinking about a new indentation-based configuration
15# language for a project and wanted to figure out how to do it.  Once
16# I got that working I needed a way to test it out.  My original AST
17# was dumb so I decided to target Python's AST and compile it into
18# Python code.  Plus, it's pretty cool that it only took a day or so
19# from sitting down with Ply to having working code.
20
21# This uses David Beazley's Ply from http://www.dabeaz.com/ply/
22
23# This work is hereby released into the Public Domain. To view a copy of
24# the public domain dedication, visit
25# http://creativecommons.org/licenses/publicdomain/ or send a letter to
26# Creative Commons, 543 Howard Street, 5th Floor, San Francisco,
27# California, 94105, USA.
28#
29# Portions of this work are derived from Python's Grammar definition
30# and may be covered under the Python copyright and license
31#
32#          Andrew Dalke / Dalke Scientific Software, LLC
33#             30 August 2006 / Cape Town, South Africa
34
35# Changelog:
36#  30 August - added link to CC license; removed the "swapcase" encoding
37
38# Modifications for inclusion in PLY distribution
39import sys
40sys.path.insert(0,"../..")
41from ply import *
42
43##### Lexer ######
44#import lex
45import decimal
46
47tokens = (
48    'DEF',
49    'IF',
50    'NAME',
51    'NUMBER',  # Python decimals
52    'STRING',  # single quoted strings only; syntax of raw strings
53    'LPAR',
54    'RPAR',
55    'COLON',
56    'EQ',
57    'ASSIGN',
58    'LT',
59    'GT',
60    'PLUS',
61    'MINUS',
62    'MULT',
63    'DIV',
64    'RETURN',
65    'WS',
66    'NEWLINE',
67    'COMMA',
68    'SEMICOLON',
69    'INDENT',
70    'DEDENT',
71    'ENDMARKER',
72    )
73
74#t_NUMBER = r'\d+'
75# taken from decmial.py but without the leading sign
76def t_NUMBER(t):
77    r"""(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?"""
78    t.value = decimal.Decimal(t.value)
79    return t
80
81def t_STRING(t):
82    r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...
83    t.value=t.value[1:-1].decode("string-escape") # .swapcase() # for fun
84    return t
85
86t_COLON = r':'
87t_EQ = r'=='
88t_ASSIGN = r'='
89t_LT = r'<'
90t_GT = r'>'
91t_PLUS = r'\+'
92t_MINUS = r'-'
93t_MULT = r'\*'
94t_DIV = r'/'
95t_COMMA = r','
96t_SEMICOLON = r';'
97
98# Ply nicely documented how to do this.
99
100RESERVED = {
101  "def": "DEF",
102  "if": "IF",
103  "return": "RETURN",
104  }
105
106def t_NAME(t):
107    r'[a-zA-Z_][a-zA-Z0-9_]*'
108    t.type = RESERVED.get(t.value, "NAME")
109    return t
110
111# Putting this before t_WS let it consume lines with only comments in
112# them so the latter code never sees the WS part.  Not consuming the
113# newline.  Needed for "if 1: #comment"
114def t_comment(t):
115    r"[ ]*\043[^\n]*"  # \043 is '#'
116    pass
117
118
119# Whitespace
120def t_WS(t):
121    r' [ ]+ '
122    if t.lexer.at_line_start and t.lexer.paren_count == 0:
123        return t
124
125# Don't generate newline tokens when inside of parenthesis, eg
126#   a = (1,
127#        2, 3)
128def t_newline(t):
129    r'\n+'
130    t.lexer.lineno += len(t.value)
131    t.type = "NEWLINE"
132    if t.lexer.paren_count == 0:
133        return t
134
135def t_LPAR(t):
136    r'\('
137    t.lexer.paren_count += 1
138    return t
139
140def t_RPAR(t):
141    r'\)'
142    # check for underflow?  should be the job of the parser
143    t.lexer.paren_count -= 1
144    return t
145
146
147def t_error(t):
148    raise SyntaxError("Unknown symbol %r" % (t.value[0],))
149    print "Skipping", repr(t.value[0])
150    t.lexer.skip(1)
151
152## I implemented INDENT / DEDENT generation as a post-processing filter
153
154# The original lex token stream contains WS and NEWLINE characters.
155# WS will only occur before any other tokens on a line.
156
157# I have three filters.  One tags tokens by adding two attributes.
158# "must_indent" is True if the token must be indented from the
159# previous code.  The other is "at_line_start" which is True for WS
160# and the first non-WS/non-NEWLINE on a line.  It flags the check so
161# see if the new line has changed indication level.
162
163# Python's syntax has three INDENT states
164#  0) no colon hence no need to indent
165#  1) "if 1: go()" - simple statements have a COLON but no need for an indent
166#  2) "if 1:\n  go()" - complex statements have a COLON NEWLINE and must indent
167NO_INDENT = 0
168MAY_INDENT = 1
169MUST_INDENT = 2
170
171# only care about whitespace at the start of a line
172def track_tokens_filter(lexer, tokens):
173    lexer.at_line_start = at_line_start = True
174    indent = NO_INDENT
175    saw_colon = False
176    for token in tokens:
177        token.at_line_start = at_line_start
178
179        if token.type == "COLON":
180            at_line_start = False
181            indent = MAY_INDENT
182            token.must_indent = False
183
184        elif token.type == "NEWLINE":
185            at_line_start = True
186            if indent == MAY_INDENT:
187                indent = MUST_INDENT
188            token.must_indent = False
189
190        elif token.type == "WS":
191            assert token.at_line_start == True
192            at_line_start = True
193            token.must_indent = False
194
195        else:
196            # A real token; only indent after COLON NEWLINE
197            if indent == MUST_INDENT:
198                token.must_indent = True
199            else:
200                token.must_indent = False
201            at_line_start = False
202            indent = NO_INDENT
203
204        yield token
205        lexer.at_line_start = at_line_start
206
207def _new_token(type, lineno):
208    tok = lex.LexToken()
209    tok.type = type
210    tok.value = None
211    tok.lineno = lineno
212    return tok
213
214# Synthesize a DEDENT tag
215def DEDENT(lineno):
216    return _new_token("DEDENT", lineno)
217
218# Synthesize an INDENT tag
219def INDENT(lineno):
220    return _new_token("INDENT", lineno)
221
222
223# Track the indentation level and emit the right INDENT / DEDENT events.
224def indentation_filter(tokens):
225    # A stack of indentation levels; will never pop item 0
226    levels = [0]
227    token = None
228    depth = 0
229    prev_was_ws = False
230    for token in tokens:
231##        if 1:
232##            print "Process", token,
233##            if token.at_line_start:
234##                print "at_line_start",
235##            if token.must_indent:
236##                print "must_indent",
237##            print
238
239        # WS only occurs at the start of the line
240        # There may be WS followed by NEWLINE so
241        # only track the depth here.  Don't indent/dedent
242        # until there's something real.
243        if token.type == "WS":
244            assert depth == 0
245            depth = len(token.value)
246            prev_was_ws = True
247            # WS tokens are never passed to the parser
248            continue
249
250        if token.type == "NEWLINE":
251            depth = 0
252            if prev_was_ws or token.at_line_start:
253                # ignore blank lines
254                continue
255            # pass the other cases on through
256            yield token
257            continue
258
259        # then it must be a real token (not WS, not NEWLINE)
260        # which can affect the indentation level
261
262        prev_was_ws = False
263        if token.must_indent:
264            # The current depth must be larger than the previous level
265            if not (depth > levels[-1]):
266                raise IndentationError("expected an indented block")
267
268            levels.append(depth)
269            yield INDENT(token.lineno)
270
271        elif token.at_line_start:
272            # Must be on the same level or one of the previous levels
273            if depth == levels[-1]:
274                # At the same level
275                pass
276            elif depth > levels[-1]:
277                raise IndentationError("indentation increase but not in new block")
278            else:
279                # Back up; but only if it matches a previous level
280                try:
281                    i = levels.index(depth)
282                except ValueError:
283                    raise IndentationError("inconsistent indentation")
284                for _ in range(i+1, len(levels)):
285                    yield DEDENT(token.lineno)
286                    levels.pop()
287
288        yield token
289
290    ### Finished processing ###
291
292    # Must dedent any remaining levels
293    if len(levels) > 1:
294        assert token is not None
295        for _ in range(1, len(levels)):
296            yield DEDENT(token.lineno)
297
298
299# The top-level filter adds an ENDMARKER, if requested.
300# Python's grammar uses it.
301def filter(lexer, add_endmarker = True):
302    token = None
303    tokens = iter(lexer.token, None)
304    tokens = track_tokens_filter(lexer, tokens)
305    for token in indentation_filter(tokens):
306        yield token
307
308    if add_endmarker:
309        lineno = 1
310        if token is not None:
311            lineno = token.lineno
312        yield _new_token("ENDMARKER", lineno)
313
314# Combine Ply and my filters into a new lexer
315
316class IndentLexer(object):
317    def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):
318        self.lexer = lex.lex(debug=debug, optimize=optimize, lextab=lextab, reflags=reflags)
319        self.token_stream = None
320    def input(self, s, add_endmarker=True):
321        self.lexer.paren_count = 0
322        self.lexer.input(s)
323        self.token_stream = filter(self.lexer, add_endmarker)
324    def token(self):
325        try:
326            return self.token_stream.next()
327        except StopIteration:
328            return None
329
330##########   Parser (tokens -> AST) ######
331
332# also part of Ply
333#import yacc
334
335# I use the Python AST
336from compiler import ast
337
338# Helper function
339def Assign(left, right):
340    names = []
341    if isinstance(left, ast.Name):
342        # Single assignment on left
343        return ast.Assign([ast.AssName(left.name, 'OP_ASSIGN')], right)
344    elif isinstance(left, ast.Tuple):
345        # List of things - make sure they are Name nodes
346        names = []
347        for child in left.getChildren():
348            if not isinstance(child, ast.Name):
349                raise SyntaxError("that assignment not supported")
350            names.append(child.name)
351        ass_list = [ast.AssName(name, 'OP_ASSIGN') for name in names]
352        return ast.Assign([ast.AssTuple(ass_list)], right)
353    else:
354        raise SyntaxError("Can't do that yet")
355
356
357# The grammar comments come from Python's Grammar/Grammar file
358
359## NB: compound_stmt in single_input is followed by extra NEWLINE!
360# file_input: (NEWLINE | stmt)* ENDMARKER
361def p_file_input_end(p):
362    """file_input_end : file_input ENDMARKER"""
363    p[0] = ast.Stmt(p[1])
364def p_file_input(p):
365    """file_input : file_input NEWLINE
366                  | file_input stmt
367                  | NEWLINE
368                  | stmt"""
369    if isinstance(p[len(p)-1], basestring):
370        if len(p) == 3:
371            p[0] = p[1]
372        else:
373            p[0] = [] # p == 2 --> only a blank line
374    else:
375        if len(p) == 3:
376            p[0] = p[1] + p[2]
377        else:
378            p[0] = p[1]
379
380
381# funcdef: [decorators] 'def' NAME parameters ':' suite
382# ignoring decorators
383def p_funcdef(p):
384    "funcdef : DEF NAME parameters COLON suite"
385    p[0] = ast.Function(None, p[2], tuple(p[3]), (), 0, None, p[5])
386
387# parameters: '(' [varargslist] ')'
388def p_parameters(p):
389    """parameters : LPAR RPAR
390                  | LPAR varargslist RPAR"""
391    if len(p) == 3:
392        p[0] = []
393    else:
394        p[0] = p[2]
395
396
397# varargslist: (fpdef ['=' test] ',')* ('*' NAME [',' '**' NAME] | '**' NAME) |
398# highly simplified
399def p_varargslist(p):
400    """varargslist : varargslist COMMA NAME
401                   | NAME"""
402    if len(p) == 4:
403        p[0] = p[1] + p[3]
404    else:
405        p[0] = [p[1]]
406
407# stmt: simple_stmt | compound_stmt
408def p_stmt_simple(p):
409    """stmt : simple_stmt"""
410    # simple_stmt is a list
411    p[0] = p[1]
412
413def p_stmt_compound(p):
414    """stmt : compound_stmt"""
415    p[0] = [p[1]]
416
417# simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE
418def p_simple_stmt(p):
419    """simple_stmt : small_stmts NEWLINE
420                   | small_stmts SEMICOLON NEWLINE"""
421    p[0] = p[1]
422
423def p_small_stmts(p):
424    """small_stmts : small_stmts SEMICOLON small_stmt
425                   | small_stmt"""
426    if len(p) == 4:
427        p[0] = p[1] + [p[3]]
428    else:
429        p[0] = [p[1]]
430
431# small_stmt: expr_stmt | print_stmt  | del_stmt | pass_stmt | flow_stmt |
432#    import_stmt | global_stmt | exec_stmt | assert_stmt
433def p_small_stmt(p):
434    """small_stmt : flow_stmt
435                  | expr_stmt"""
436    p[0] = p[1]
437
438# expr_stmt: testlist (augassign (yield_expr|testlist) |
439#                      ('=' (yield_expr|testlist))*)
440# augassign: ('+=' | '-=' | '*=' | '/=' | '%=' | '&=' | '|=' | '^=' |
441#             '<<=' | '>>=' | '**=' | '//=')
442def p_expr_stmt(p):
443    """expr_stmt : testlist ASSIGN testlist
444                 | testlist """
445    if len(p) == 2:
446        # a list of expressions
447        p[0] = ast.Discard(p[1])
448    else:
449        p[0] = Assign(p[1], p[3])
450
451def p_flow_stmt(p):
452    "flow_stmt : return_stmt"
453    p[0] = p[1]
454
455# return_stmt: 'return' [testlist]
456def p_return_stmt(p):
457    "return_stmt : RETURN testlist"
458    p[0] = ast.Return(p[2])
459
460
461def p_compound_stmt(p):
462    """compound_stmt : if_stmt
463                     | funcdef"""
464    p[0] = p[1]
465
466def p_if_stmt(p):
467    'if_stmt : IF test COLON suite'
468    p[0] = ast.If([(p[2], p[4])], None)
469
470def p_suite(p):
471    """suite : simple_stmt
472             | NEWLINE INDENT stmts DEDENT"""
473    if len(p) == 2:
474        p[0] = ast.Stmt(p[1])
475    else:
476        p[0] = ast.Stmt(p[3])
477
478
479def p_stmts(p):
480    """stmts : stmts stmt
481             | stmt"""
482    if len(p) == 3:
483        p[0] = p[1] + p[2]
484    else:
485        p[0] = p[1]
486
487## No using Python's approach because Ply supports precedence
488
489# comparison: expr (comp_op expr)*
490# arith_expr: term (('+'|'-') term)*
491# term: factor (('*'|'/'|'%'|'//') factor)*
492# factor: ('+'|'-'|'~') factor | power
493# comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not'
494
495def make_lt_compare((left, right)):
496    return ast.Compare(left, [('<', right),])
497def make_gt_compare((left, right)):
498    return ast.Compare(left, [('>', right),])
499def make_eq_compare((left, right)):
500    return ast.Compare(left, [('==', right),])
501
502
503binary_ops = {
504    "+": ast.Add,
505    "-": ast.Sub,
506    "*": ast.Mul,
507    "/": ast.Div,
508    "<": make_lt_compare,
509    ">": make_gt_compare,
510    "==": make_eq_compare,
511}
512unary_ops = {
513    "+": ast.UnaryAdd,
514    "-": ast.UnarySub,
515    }
516precedence = (
517    ("left", "EQ", "GT", "LT"),
518    ("left", "PLUS", "MINUS"),
519    ("left", "MULT", "DIV"),
520    )
521
522def p_comparison(p):
523    """comparison : comparison PLUS comparison
524                  | comparison MINUS comparison
525                  | comparison MULT comparison
526                  | comparison DIV comparison
527                  | comparison LT comparison
528                  | comparison EQ comparison
529                  | comparison GT comparison
530                  | PLUS comparison
531                  | MINUS comparison
532                  | power"""
533    if len(p) == 4:
534        p[0] = binary_ops[p[2]]((p[1], p[3]))
535    elif len(p) == 3:
536        p[0] = unary_ops[p[1]](p[2])
537    else:
538        p[0] = p[1]
539
540# power: atom trailer* ['**' factor]
541# trailers enables function calls.  I only allow one level of calls
542# so this is 'trailer'
543def p_power(p):
544    """power : atom
545             | atom trailer"""
546    if len(p) == 2:
547        p[0] = p[1]
548    else:
549        if p[2][0] == "CALL":
550            p[0] = ast.CallFunc(p[1], p[2][1], None, None)
551        else:
552            raise AssertionError("not implemented")
553
554def p_atom_name(p):
555    """atom : NAME"""
556    p[0] = ast.Name(p[1])
557
558def p_atom_number(p):
559    """atom : NUMBER
560            | STRING"""
561    p[0] = ast.Const(p[1])
562
563def p_atom_tuple(p):
564    """atom : LPAR testlist RPAR"""
565    p[0] = p[2]
566
567# trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME
568def p_trailer(p):
569    "trailer : LPAR arglist RPAR"
570    p[0] = ("CALL", p[2])
571
572# testlist: test (',' test)* [',']
573# Contains shift/reduce error
574def p_testlist(p):
575    """testlist : testlist_multi COMMA
576                | testlist_multi """
577    if len(p) == 2:
578        p[0] = p[1]
579    else:
580        # May need to promote singleton to tuple
581        if isinstance(p[1], list):
582            p[0] = p[1]
583        else:
584            p[0] = [p[1]]
585    # Convert into a tuple?
586    if isinstance(p[0], list):
587        p[0] = ast.Tuple(p[0])
588
589def p_testlist_multi(p):
590    """testlist_multi : testlist_multi COMMA test
591                      | test"""
592    if len(p) == 2:
593        # singleton
594        p[0] = p[1]
595    else:
596        if isinstance(p[1], list):
597            p[0] = p[1] + [p[3]]
598        else:
599            # singleton -> tuple
600            p[0] = [p[1], p[3]]
601
602
603# test: or_test ['if' or_test 'else' test] | lambdef
604#  as I don't support 'and', 'or', and 'not' this works down to 'comparison'
605def p_test(p):
606    "test : comparison"
607    p[0] = p[1]
608
609
610
611# arglist: (argument ',')* (argument [',']| '*' test [',' '**' test] | '**' test)
612# XXX INCOMPLETE: this doesn't allow the trailing comma
613def p_arglist(p):
614    """arglist : arglist COMMA argument
615               | argument"""
616    if len(p) == 4:
617        p[0] = p[1] + [p[3]]
618    else:
619        p[0] = [p[1]]
620
621# argument: test [gen_for] | test '=' test  # Really [keyword '='] test
622def p_argument(p):
623    "argument : test"
624    p[0] = p[1]
625
626def p_error(p):
627    #print "Error!", repr(p)
628    raise SyntaxError(p)
629
630
631class GardenSnakeParser(object):
632    def __init__(self, lexer = None):
633        if lexer is None:
634            lexer = IndentLexer()
635        self.lexer = lexer
636        self.parser = yacc.yacc(start="file_input_end")
637
638    def parse(self, code):
639        self.lexer.input(code)
640        result = self.parser.parse(lexer = self.lexer)
641        return ast.Module(None, result)
642
643
644###### Code generation ######
645
646from compiler import misc, syntax, pycodegen
647
648class GardenSnakeCompiler(object):
649    def __init__(self):
650        self.parser = GardenSnakeParser()
651    def compile(self, code, filename="<string>"):
652        tree = self.parser.parse(code)
653        #print  tree
654        misc.set_filename(filename, tree)
655        syntax.check(tree)
656        gen = pycodegen.ModuleCodeGenerator(tree)
657        code = gen.getCode()
658        return code
659
660####### Test code #######
661
662compile = GardenSnakeCompiler().compile
663
664code = r"""
665
666print('LET\'S TRY THIS \\OUT')
667
668#Comment here
669def x(a):
670    print('called with',a)
671    if a == 1:
672        return 2
673    if a*2 > 10: return 999 / 4
674        # Another comment here
675
676    return a+2*3
677
678ints = (1, 2,
679   3, 4,
6805)
681print('mutiline-expression', ints)
682
683t = 4+1/3*2+6*(9-5+1)
684print('predence test; should be 34+2/3:', t, t==(34+2/3))
685
686print('numbers', 1,2,3,4,5)
687if 1:
688 8
689 a=9
690 print(x(a))
691
692print(x(1))
693print(x(2))
694print(x(8),'3')
695print('this is decimal', 1/5)
696print('BIG DECIMAL', 1.234567891234567e12345)
697
698"""
699
700# Set up the GardenSnake run-time environment
701def print_(*args):
702    print "-->", " ".join(map(str,args))
703
704globals()["print"] = print_
705
706compiled_code = compile(code)
707
708exec compiled_code in globals()
709print "Done"
710