ext/ply/example/GardenSnake/GardenSnake.py - public/gem5 - Git at Google

 # GardenSnake - a parser generator demonstration program
 #
 # This implements a modified version of a subset of Python:
 #  - only 'def', 'return' and 'if' statements
 #  - 'if' only has 'then' clause (no elif nor else)
 #  - single-quoted strings only, content in raw format
 #  - numbers are decimal.Decimal instances (not integers or floats)
 #  - no print statment; use the built-in 'print' function
 #  - only < > == + - / * implemented (and unary + -)
 #  - assignment and tuple assignment work
 #  - no generators of any sort
 #  - no ... well, no quite a lot

 # Why?  I'm thinking about a new indentation-based configuration
 # language for a project and wanted to figure out how to do it.  Once
 # I got that working I needed a way to test it out.  My original AST
 # was dumb so I decided to target Python's AST and compile it into
 # Python code.  Plus, it's pretty cool that it only took a day or so
 # from sitting down with Ply to having working code.

 # This uses David Beazley's Ply from http://www.dabeaz.com/ply/

 # This work is hereby released into the Public Domain. To view a copy of
 # the public domain dedication, visit
 # http://creativecommons.org/licenses/publicdomain/ or send a letter to
 # Creative Commons, 543 Howard Street, 5th Floor, San Francisco,
 # California, 94105, USA.
 #
 # Portions of this work are derived from Python's Grammar definition
 # and may be covered under the Python copyright and license
 #
 #          Andrew Dalke / Dalke Scientific Software, LLC
 #             30 August 2006 / Cape Town, South Africa

 # Changelog:
 #  30 August - added link to CC license; removed the "swapcase" encoding

 # Modifications for inclusion in PLY distribution
 import sys
 sys.path.insert(0,"../..")
 from ply import *

 ##### Lexer ######
 #import lex
 import decimal

 tokens = (
     'DEF',
     'IF',
     'NAME',
     'NUMBER',  # Python decimals
     'STRING',  # single quoted strings only; syntax of raw strings
     'LPAR',
     'RPAR',
     'COLON',
     'EQ',
     'ASSIGN',
     'LT',
     'GT',
     'PLUS',
     'MINUS',
     'MULT',
     'DIV',
     'RETURN',
     'WS',
     'NEWLINE',
     'COMMA',
     'SEMICOLON',
     'INDENT',
     'DEDENT',
     'ENDMARKER',
     )

 #t_NUMBER = r'\d+'
 # taken from decmial.py but without the leading sign
 def t_NUMBER(t):
     r"""(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?"""
     t.value = decimal.Decimal(t.value)
     return t

 def t_STRING(t):
     r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...
     t.value=t.value[1:-1].decode("string-escape") # .swapcase() # for fun
     return t

 t_COLON = r':'
 t_EQ = r'=='
 t_ASSIGN = r'='
 t_LT = r'<'
 t_GT = r'>'
 t_PLUS = r'\+'
 t_MINUS = r'-'
 t_MULT = r'\*'
 t_DIV = r'/'
 t_COMMA = r','
 t_SEMICOLON = r';'

 # Ply nicely documented how to do this.

 RESERVED = {
   "def": "DEF",
   "if": "IF",
   "return": "RETURN",
   }

 def t_NAME(t):
     r'[a-zA-Z_][a-zA-Z0-9_]*'
     t.type = RESERVED.get(t.value, "NAME")
     return t

 # Putting this before t_WS let it consume lines with only comments in
 # them so the latter code never sees the WS part.  Not consuming the
 # newline.  Needed for "if 1: #comment"
 def t_comment(t):
     r"[ ]*\043[^\n]*"  # \043 is '#'
     pass


 # Whitespace
 def t_WS(t):
     r' [ ]+ '
     if t.lexer.at_line_start and t.lexer.paren_count == 0:
         return t

 # Don't generate newline tokens when inside of parenthesis, eg
 #   a = (1,
 #        2, 3)
 def t_newline(t):
     r'\n+'
     t.lexer.lineno += len(t.value)
     t.type = "NEWLINE"
     if t.lexer.paren_count == 0:
         return t

 def t_LPAR(t):
     r'\('
     t.lexer.paren_count += 1
     return t

 def t_RPAR(t):
     r'\)'
     # check for underflow?  should be the job of the parser
     t.lexer.paren_count -= 1
     return t


 def t_error(t):
     raise SyntaxError("Unknown symbol %r" % (t.value[0],))
     print "Skipping", repr(t.value[0])
     t.lexer.skip(1)

 ## I implemented INDENT / DEDENT generation as a post-processing filter

 # The original lex token stream contains WS and NEWLINE characters.
 # WS will only occur before any other tokens on a line.

 # I have three filters.  One tags tokens by adding two attributes.
 # "must_indent" is True if the token must be indented from the
 # previous code.  The other is "at_line_start" which is True for WS
 # and the first non-WS/non-NEWLINE on a line.  It flags the check so
 # see if the new line has changed indication level.

 # Python's syntax has three INDENT states
 #  0) no colon hence no need to indent
 #  1) "if 1: go()" - simple statements have a COLON but no need for an indent
 #  2) "if 1:\n  go()" - complex statements have a COLON NEWLINE and must indent
 NO_INDENT = 0
 MAY_INDENT = 1
 MUST_INDENT = 2

 # only care about whitespace at the start of a line
 def track_tokens_filter(lexer, tokens):
     lexer.at_line_start = at_line_start = True
     indent = NO_INDENT
     saw_colon = False
     for token in tokens:
         token.at_line_start = at_line_start

         if token.type == "COLON":
             at_line_start = False
             indent = MAY_INDENT
             token.must_indent = False

         elif token.type == "NEWLINE":
             at_line_start = True
             if indent == MAY_INDENT:
                 indent = MUST_INDENT
             token.must_indent = False

         elif token.type == "WS":
             assert token.at_line_start == True
             at_line_start = True
             token.must_indent = False

         else:
             # A real token; only indent after COLON NEWLINE
             if indent == MUST_INDENT:
                 token.must_indent = True
             else:
                 token.must_indent = False
             at_line_start = False
             indent = NO_INDENT

         yield token
         lexer.at_line_start = at_line_start

 def _new_token(type, lineno):
     tok = lex.LexToken()
     tok.type = type
     tok.value = None
     tok.lineno = lineno
     return tok

 # Synthesize a DEDENT tag
 def DEDENT(lineno):
     return _new_token("DEDENT", lineno)

 # Synthesize an INDENT tag
 def INDENT(lineno):
     return _new_token("INDENT", lineno)


 # Track the indentation level and emit the right INDENT / DEDENT events.
 def indentation_filter(tokens):
     # A stack of indentation levels; will never pop item 0
     levels = [0]
     token = None
     depth = 0
     prev_was_ws = False
     for token in tokens:
 ##        if 1:
 ##            print "Process", token,
 ##            if token.at_line_start:
 ##                print "at_line_start",
 ##            if token.must_indent:
 ##                print "must_indent",
 ##            print

         # WS only occurs at the start of the line
         # There may be WS followed by NEWLINE so
         # only track the depth here.  Don't indent/dedent
         # until there's something real.
         if token.type == "WS":
             assert depth == 0
             depth = len(token.value)
             prev_was_ws = True
             # WS tokens are never passed to the parser
             continue

         if token.type == "NEWLINE":
             depth = 0
             if prev_was_ws or token.at_line_start:
                 # ignore blank lines
                 continue
             # pass the other cases on through
             yield token
             continue

         # then it must be a real token (not WS, not NEWLINE)
         # which can affect the indentation level

         prev_was_ws = False
         if token.must_indent:
             # The current depth must be larger than the previous level
             if not (depth > levels[-1]):
                 raise IndentationError("expected an indented block")

             levels.append(depth)
             yield INDENT(token.lineno)

         elif token.at_line_start:
             # Must be on the same level or one of the previous levels
             if depth == levels[-1]:
                 # At the same level
                 pass
             elif depth > levels[-1]:
                 raise IndentationError("indentation increase but not in new block")
             else:
                 # Back up; but only if it matches a previous level
                 try:
                     i = levels.index(depth)
                 except ValueError:
                     raise IndentationError("inconsistent indentation")
                 for _ in range(i+1, len(levels)):
                     yield DEDENT(token.lineno)
                     levels.pop()

         yield token

     ### Finished processing ###

     # Must dedent any remaining levels
     if len(levels) > 1:
         assert token is not None
         for _ in range(1, len(levels)):
             yield DEDENT(token.lineno)


 # The top-level filter adds an ENDMARKER, if requested.
 # Python's grammar uses it.
 def filter(lexer, add_endmarker = True):
     token = None
     tokens = iter(lexer.token, None)
     tokens = track_tokens_filter(lexer, tokens)
     for token in indentation_filter(tokens):
         yield token

     if add_endmarker:
         lineno = 1
         if token is not None:
             lineno = token.lineno
         yield _new_token("ENDMARKER", lineno)

 # Combine Ply and my filters into a new lexer

 class IndentLexer(object):
     def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):
         self.lexer = lex.lex(debug=debug, optimize=optimize, lextab=lextab, reflags=reflags)
         self.token_stream = None
     def input(self, s, add_endmarker=True):
         self.lexer.paren_count = 0
         self.lexer.input(s)
         self.token_stream = filter(self.lexer, add_endmarker)
     def token(self):
         try:
             return self.token_stream.next()
         except StopIteration:
             return None

 ##########   Parser (tokens -> AST) ######

 # also part of Ply
 #import yacc

 # I use the Python AST
 from compiler import ast

 # Helper function
 def Assign(left, right):
     names = []
     if isinstance(left, ast.Name):
         # Single assignment on left
         return ast.Assign([ast.AssName(left.name, 'OP_ASSIGN')], right)
     elif isinstance(left, ast.Tuple):
         # List of things - make sure they are Name nodes
         names = []
         for child in left.getChildren():
             if not isinstance(child, ast.Name):
                 raise SyntaxError("that assignment not supported")
             names.append(child.name)
         ass_list = [ast.AssName(name, 'OP_ASSIGN') for name in names]
         return ast.Assign([ast.AssTuple(ass_list)], right)
     else:
         raise SyntaxError("Can't do that yet")


 # The grammar comments come from Python's Grammar/Grammar file

 ## NB: compound_stmt in single_input is followed by extra NEWLINE!
 # file_input: (NEWLINE | stmt)* ENDMARKER
 def p_file_input_end(p):
     """file_input_end : file_input ENDMARKER"""
     p[0] = ast.Stmt(p[1])
 def p_file_input(p):
     """file_input : file_input NEWLINE
                   | file_input stmt
                   | NEWLINE
                   | stmt"""
     if isinstance(p[len(p)-1], basestring):
         if len(p) == 3:
             p[0] = p[1]
         else:
             p[0] = [] # p == 2 --> only a blank line
     else:
         if len(p) == 3:
             p[0] = p[1] + p[2]
         else:
             p[0] = p[1]


 # funcdef: [decorators] 'def' NAME parameters ':' suite
 # ignoring decorators
 def p_funcdef(p):
     "funcdef : DEF NAME parameters COLON suite"
     p[0] = ast.Function(None, p[2], tuple(p[3]), (), 0, None, p[5])

 # parameters: '(' [varargslist] ')'
 def p_parameters(p):
     """parameters : LPAR RPAR
                   | LPAR varargslist RPAR"""
     if len(p) == 3:
         p[0] = []
     else:
         p[0] = p[2]


 # varargslist: (fpdef ['=' test] ',')* ('*' NAME [',' '**' NAME] | '**' NAME) |
 # highly simplified
 def p_varargslist(p):
     """varargslist : varargslist COMMA NAME
                    | NAME"""
     if len(p) == 4:
         p[0] = p[1] + p[3]
     else:
         p[0] = [p[1]]

 # stmt: simple_stmt | compound_stmt
 def p_stmt_simple(p):
     """stmt : simple_stmt"""
     # simple_stmt is a list
     p[0] = p[1]

 def p_stmt_compound(p):
     """stmt : compound_stmt"""
     p[0] = [p[1]]

 # simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE
 def p_simple_stmt(p):
     """simple_stmt : small_stmts NEWLINE
                    | small_stmts SEMICOLON NEWLINE"""
     p[0] = p[1]

 def p_small_stmts(p):
     """small_stmts : small_stmts SEMICOLON small_stmt
                    | small_stmt"""
     if len(p) == 4:
         p[0] = p[1] + [p[3]]
     else:
         p[0] = [p[1]]

 # small_stmt: expr_stmt | print_stmt  | del_stmt | pass_stmt | flow_stmt |
 #    import_stmt | global_stmt | exec_stmt | assert_stmt
 def p_small_stmt(p):
     """small_stmt : flow_stmt
                   | expr_stmt"""
     p[0] = p[1]

 # expr_stmt: testlist (augassign (yield_expr|testlist) |
 #                      ('=' (yield_expr|testlist))*)
 # augassign: ('+=' | '-=' | '*=' | '/=' | '%=' | '&=' | '|=' | '^=' |
 #             '<<=' | '>>=' | '**=' | '//=')
 def p_expr_stmt(p):
     """expr_stmt : testlist ASSIGN testlist
                  | testlist """
     if len(p) == 2:
         # a list of expressions
         p[0] = ast.Discard(p[1])
     else:
         p[0] = Assign(p[1], p[3])

 def p_flow_stmt(p):
     "flow_stmt : return_stmt"
     p[0] = p[1]

 # return_stmt: 'return' [testlist]
 def p_return_stmt(p):
     "return_stmt : RETURN testlist"
     p[0] = ast.Return(p[2])


 def p_compound_stmt(p):
     """compound_stmt : if_stmt
                      | funcdef"""
     p[0] = p[1]

 def p_if_stmt(p):
     'if_stmt : IF test COLON suite'
     p[0] = ast.If([(p[2], p[4])], None)

 def p_suite(p):
     """suite : simple_stmt
              | NEWLINE INDENT stmts DEDENT"""
     if len(p) == 2:
         p[0] = ast.Stmt(p[1])
     else:
         p[0] = ast.Stmt(p[3])


 def p_stmts(p):
     """stmts : stmts stmt
              | stmt"""
     if len(p) == 3:
         p[0] = p[1] + p[2]
     else:
         p[0] = p[1]

 ## No using Python's approach because Ply supports precedence

 # comparison: expr (comp_op expr)*
 # arith_expr: term (('+'|'-') term)*
 # term: factor (('*'|'/'|'%'|'//') factor)*
 # factor: ('+'|'-'|'~') factor | power
 # comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not'

 def make_lt_compare((left, right)):
     return ast.Compare(left, [('<', right),])
 def make_gt_compare((left, right)):
     return ast.Compare(left, [('>', right),])
 def make_eq_compare((left, right)):
     return ast.Compare(left, [('==', right),])


 binary_ops = {
     "+": ast.Add,
     "-": ast.Sub,
     "*": ast.Mul,
     "/": ast.Div,
     "<": make_lt_compare,
     ">": make_gt_compare,
     "==": make_eq_compare,
 }
 unary_ops = {
     "+": ast.UnaryAdd,
     "-": ast.UnarySub,
     }
 precedence = (
     ("left", "EQ", "GT", "LT"),
     ("left", "PLUS", "MINUS"),
     ("left", "MULT", "DIV"),
     )

 def p_comparison(p):
     """comparison : comparison PLUS comparison
                   | comparison MINUS comparison
                   | comparison MULT comparison
                   | comparison DIV comparison
                   | comparison LT comparison
                   | comparison EQ comparison
                   | comparison GT comparison
                   | PLUS comparison
                   | MINUS comparison
                   | power"""
     if len(p) == 4:
         p[0] = binary_ops[p[2]]((p[1], p[3]))
     elif len(p) == 3:
         p[0] = unary_ops[p[1]](p[2])
     else:
         p[0] = p[1]

 # power: atom trailer* ['**' factor]
 # trailers enables function calls.  I only allow one level of calls
 # so this is 'trailer'
 def p_power(p):
     """power : atom
              | atom trailer"""
     if len(p) == 2:
         p[0] = p[1]
     else:
         if p[2][0] == "CALL":
             p[0] = ast.CallFunc(p[1], p[2][1], None, None)
         else:
             raise AssertionError("not implemented")

 def p_atom_name(p):
     """atom : NAME"""
     p[0] = ast.Name(p[1])

 def p_atom_number(p):
     """atom : NUMBER
             | STRING"""
     p[0] = ast.Const(p[1])

 def p_atom_tuple(p):
     """atom : LPAR testlist RPAR"""
     p[0] = p[2]

 # trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME
 def p_trailer(p):
     "trailer : LPAR arglist RPAR"
     p[0] = ("CALL", p[2])

 # testlist: test (',' test)* [',']
 # Contains shift/reduce error
 def p_testlist(p):
     """testlist : testlist_multi COMMA
                 | testlist_multi """
     if len(p) == 2:
         p[0] = p[1]
     else:
         # May need to promote singleton to tuple
         if isinstance(p[1], list):
             p[0] = p[1]
         else:
             p[0] = [p[1]]
     # Convert into a tuple?
     if isinstance(p[0], list):
         p[0] = ast.Tuple(p[0])

 def p_testlist_multi(p):
     """testlist_multi : testlist_multi COMMA test
                       | test"""
     if len(p) == 2:
         # singleton
         p[0] = p[1]
     else:
         if isinstance(p[1], list):
             p[0] = p[1] + [p[3]]
         else:
             # singleton -> tuple
             p[0] = [p[1], p[3]]


 # test: or_test ['if' or_test 'else' test] | lambdef
 #  as I don't support 'and', 'or', and 'not' this works down to 'comparison'
 def p_test(p):
     "test : comparison"
     p[0] = p[1]


 # arglist: (argument ',')* (argument [',']| '*' test [',' '**' test] | '**' test)
 # XXX INCOMPLETE: this doesn't allow the trailing comma
 def p_arglist(p):
     """arglist : arglist COMMA argument
                | argument"""
     if len(p) == 4:
         p[0] = p[1] + [p[3]]
     else:
         p[0] = [p[1]]

 # argument: test [gen_for] | test '=' test  # Really [keyword '='] test
 def p_argument(p):
     "argument : test"
     p[0] = p[1]

 def p_error(p):
     #print "Error!", repr(p)
     raise SyntaxError(p)


 class GardenSnakeParser(object):
     def __init__(self, lexer = None):
         if lexer is None:
             lexer = IndentLexer()
         self.lexer = lexer
         self.parser = yacc.yacc(start="file_input_end")

     def parse(self, code):
         self.lexer.input(code)
         result = self.parser.parse(lexer = self.lexer)
         return ast.Module(None, result)


 ###### Code generation ######

 from compiler import misc, syntax, pycodegen

 class GardenSnakeCompiler(object):
     def __init__(self):
         self.parser = GardenSnakeParser()
     def compile(self, code, filename="<string>"):
         tree = self.parser.parse(code)
         #print  tree
         misc.set_filename(filename, tree)
         syntax.check(tree)
         gen = pycodegen.ModuleCodeGenerator(tree)
         code = gen.getCode()
         return code

 ####### Test code #######

 compile = GardenSnakeCompiler().compile

 code = r"""

 print('LET\'S TRY THIS \\OUT')

 #Comment here
 def x(a):
     print('called with',a)
     if a == 1:
         return 2
     if a*2 > 10: return 999 / 4
         # Another comment here

     return a+2*3

 ints = (1, 2,
    3, 4,
 5)
 print('mutiline-expression', ints)

 t = 4+1/3*2+6*(9-5+1)
 print('predence test; should be 34+2/3:', t, t==(34+2/3))

 print('numbers', 1,2,3,4,5)
 if 1:
  8
  a=9
  print(x(a))

 print(x(1))
 print(x(2))
 print(x(8),'3')
 print('this is decimal', 1/5)
 print('BIG DECIMAL', 1.234567891234567e12345)

 """

 # Set up the GardenSnake run-time environment
 def print_(*args):
     print "-->", " ".join(map(str,args))

 globals()["print"] = print_

 compiled_code = compile(code)

 exec compiled_code in globals()
 print "Done"
	# GardenSnake - a parser generator demonstration program
	#
	# This implements a modified version of a subset of Python:
	# - only 'def', 'return' and 'if' statements
	# - 'if' only has 'then' clause (no elif nor else)
	# - single-quoted strings only, content in raw format
	# - numbers are decimal.Decimal instances (not integers or floats)
	# - no print statment; use the built-in 'print' function
	# - only < > == + - / * implemented (and unary + -)
	# - assignment and tuple assignment work
	# - no generators of any sort
	# - no ... well, no quite a lot

	# Why? I'm thinking about a new indentation-based configuration
	# language for a project and wanted to figure out how to do it. Once
	# I got that working I needed a way to test it out. My original AST
	# was dumb so I decided to target Python's AST and compile it into
	# Python code. Plus, it's pretty cool that it only took a day or so
	# from sitting down with Ply to having working code.

	# This uses David Beazley's Ply from http://www.dabeaz.com/ply/

	# This work is hereby released into the Public Domain. To view a copy of
	# the public domain dedication, visit
	# http://creativecommons.org/licenses/publicdomain/ or send a letter to
	# Creative Commons, 543 Howard Street, 5th Floor, San Francisco,
	# California, 94105, USA.
	#
	# Portions of this work are derived from Python's Grammar definition
	# and may be covered under the Python copyright and license
	#
	# Andrew Dalke / Dalke Scientific Software, LLC
	# 30 August 2006 / Cape Town, South Africa

	# Changelog:
	# 30 August - added link to CC license; removed the "swapcase" encoding

	# Modifications for inclusion in PLY distribution
	import sys
	sys.path.insert(0,"../..")
	from ply import *

	##### Lexer ######
	#import lex
	import decimal

	tokens = (
	'DEF',
	'IF',
	'NAME',
	'NUMBER', # Python decimals
	'STRING', # single quoted strings only; syntax of raw strings
	'LPAR',
	'RPAR',
	'COLON',
	'EQ',
	'ASSIGN',
	'LT',
	'GT',
	'PLUS',
	'MINUS',
	'MULT',
	'DIV',
	'RETURN',
	'WS',
	'NEWLINE',
	'COMMA',
	'SEMICOLON',
	'INDENT',
	'DEDENT',
	'ENDMARKER',
	)

	#t_NUMBER = r'\d+'
	# taken from decmial.py but without the leading sign
	def t_NUMBER(t):
	r"""(\d+(\.\d*)?\|\.\d+)([eE][-+]? \d+)?"""
	t.value = decimal.Decimal(t.value)
	return t

	def t_STRING(t):
	r"'([^\\']+\|\\'\|\\\\)*'" # I think this is right ...
	t.value=t.value[1:-1].decode("string-escape") # .swapcase() # for fun
	return t

	t_COLON = r':'
	t_EQ = r'=='
	t_ASSIGN = r'='
	t_LT = r'<'
	t_GT = r'>'
	t_PLUS = r'\+'
	t_MINUS = r'-'
	t_MULT = r'\*'
	t_DIV = r'/'
	t_COMMA = r','
	t_SEMICOLON = r';'

	# Ply nicely documented how to do this.

	RESERVED = {
	"def": "DEF",
	"if": "IF",
	"return": "RETURN",
	}

	def t_NAME(t):
	r'[a-zA-Z_][a-zA-Z0-9_]*'
	t.type = RESERVED.get(t.value, "NAME")
	return t

	# Putting this before t_WS let it consume lines with only comments in
	# them so the latter code never sees the WS part. Not consuming the
	# newline. Needed for "if 1: #comment"
	def t_comment(t):
	r"[ ]\043[^\n]" # \043 is '#'
	pass


	# Whitespace
	def t_WS(t):
	r' [ ]+ '
	if t.lexer.at_line_start and t.lexer.paren_count == 0:
	return t

	# Don't generate newline tokens when inside of parenthesis, eg
	# a = (1,
	# 2, 3)
	def t_newline(t):
	r'\n+'
	t.lexer.lineno += len(t.value)
	t.type = "NEWLINE"
	if t.lexer.paren_count == 0:
	return t

	def t_LPAR(t):
	r'\('
	t.lexer.paren_count += 1
	return t

	def t_RPAR(t):
	r'\)'
	# check for underflow? should be the job of the parser
	t.lexer.paren_count -= 1
	return t


	def t_error(t):
	raise SyntaxError("Unknown symbol %r" % (t.value[0],))
	print "Skipping", repr(t.value[0])
	t.lexer.skip(1)

	## I implemented INDENT / DEDENT generation as a post-processing filter

	# The original lex token stream contains WS and NEWLINE characters.
	# WS will only occur before any other tokens on a line.

	# I have three filters. One tags tokens by adding two attributes.
	# "must_indent" is True if the token must be indented from the
	# previous code. The other is "at_line_start" which is True for WS
	# and the first non-WS/non-NEWLINE on a line. It flags the check so
	# see if the new line has changed indication level.

	# Python's syntax has three INDENT states
	# 0) no colon hence no need to indent
	# 1) "if 1: go()" - simple statements have a COLON but no need for an indent
	# 2) "if 1:\n go()" - complex statements have a COLON NEWLINE and must indent
	NO_INDENT = 0
	MAY_INDENT = 1
	MUST_INDENT = 2

	# only care about whitespace at the start of a line
	def track_tokens_filter(lexer, tokens):
	lexer.at_line_start = at_line_start = True
	indent = NO_INDENT
	saw_colon = False
	for token in tokens:
	token.at_line_start = at_line_start

	if token.type == "COLON":
	at_line_start = False
	indent = MAY_INDENT
	token.must_indent = False

	elif token.type == "NEWLINE":
	at_line_start = True
	if indent == MAY_INDENT:
	indent = MUST_INDENT
	token.must_indent = False

	elif token.type == "WS":
	assert token.at_line_start == True
	at_line_start = True
	token.must_indent = False

	else:
	# A real token; only indent after COLON NEWLINE
	if indent == MUST_INDENT:
	token.must_indent = True
	else:
	token.must_indent = False
	at_line_start = False
	indent = NO_INDENT

	yield token
	lexer.at_line_start = at_line_start

	def _new_token(type, lineno):
	tok = lex.LexToken()
	tok.type = type
	tok.value = None
	tok.lineno = lineno
	return tok

	# Synthesize a DEDENT tag
	def DEDENT(lineno):
	return _new_token("DEDENT", lineno)

	# Synthesize an INDENT tag
	def INDENT(lineno):
	return _new_token("INDENT", lineno)


	# Track the indentation level and emit the right INDENT / DEDENT events.
	def indentation_filter(tokens):
	# A stack of indentation levels; will never pop item 0
	levels = [0]
	token = None
	depth = 0
	prev_was_ws = False
	for token in tokens:
	## if 1:
	## print "Process", token,
	## if token.at_line_start:
	## print "at_line_start",
	## if token.must_indent:
	## print "must_indent",
	## print

	# WS only occurs at the start of the line
	# There may be WS followed by NEWLINE so
	# only track the depth here. Don't indent/dedent
	# until there's something real.
	if token.type == "WS":
	assert depth == 0
	depth = len(token.value)
	prev_was_ws = True
	# WS tokens are never passed to the parser
	continue

	if token.type == "NEWLINE":
	depth = 0
	if prev_was_ws or token.at_line_start:
	# ignore blank lines
	continue
	# pass the other cases on through
	yield token
	continue

	# then it must be a real token (not WS, not NEWLINE)
	# which can affect the indentation level

	prev_was_ws = False
	if token.must_indent:
	# The current depth must be larger than the previous level
	if not (depth > levels[-1]):
	raise IndentationError("expected an indented block")

	levels.append(depth)
	yield INDENT(token.lineno)

	elif token.at_line_start:
	# Must be on the same level or one of the previous levels
	if depth == levels[-1]:
	# At the same level
	pass
	elif depth > levels[-1]:
	raise IndentationError("indentation increase but not in new block")
	else:
	# Back up; but only if it matches a previous level
	try:
	i = levels.index(depth)
	except ValueError:
	raise IndentationError("inconsistent indentation")
	for _ in range(i+1, len(levels)):
	yield DEDENT(token.lineno)
	levels.pop()

	yield token

	### Finished processing ###

	# Must dedent any remaining levels
	if len(levels) > 1:
	assert token is not None
	for _ in range(1, len(levels)):
	yield DEDENT(token.lineno)


	# The top-level filter adds an ENDMARKER, if requested.
	# Python's grammar uses it.
	def filter(lexer, add_endmarker = True):
	token = None
	tokens = iter(lexer.token, None)
	tokens = track_tokens_filter(lexer, tokens)
	for token in indentation_filter(tokens):
	yield token

	if add_endmarker:
	lineno = 1
	if token is not None:
	lineno = token.lineno
	yield _new_token("ENDMARKER", lineno)

	# Combine Ply and my filters into a new lexer

	class IndentLexer(object):
	def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):
	self.lexer = lex.lex(debug=debug, optimize=optimize, lextab=lextab, reflags=reflags)
	self.token_stream = None
	def input(self, s, add_endmarker=True):
	self.lexer.paren_count = 0
	self.lexer.input(s)
	self.token_stream = filter(self.lexer, add_endmarker)
	def token(self):
	try:
	return self.token_stream.next()
	except StopIteration:
	return None

	########## Parser (tokens -> AST) ######

	# also part of Ply
	#import yacc

	# I use the Python AST
	from compiler import ast

	# Helper function
	def Assign(left, right):
	names = []
	if isinstance(left, ast.Name):
	# Single assignment on left
	return ast.Assign([ast.AssName(left.name, 'OP_ASSIGN')], right)
	elif isinstance(left, ast.Tuple):
	# List of things - make sure they are Name nodes
	names = []
	for child in left.getChildren():
	if not isinstance(child, ast.Name):
	raise SyntaxError("that assignment not supported")
	names.append(child.name)
	ass_list = [ast.AssName(name, 'OP_ASSIGN') for name in names]
	return ast.Assign([ast.AssTuple(ass_list)], right)
	else:
	raise SyntaxError("Can't do that yet")


	# The grammar comments come from Python's Grammar/Grammar file

	## NB: compound_stmt in single_input is followed by extra NEWLINE!
	# file_input: (NEWLINE \| stmt)* ENDMARKER
	def p_file_input_end(p):
	"""file_input_end : file_input ENDMARKER"""
	p[0] = ast.Stmt(p[1])
	def p_file_input(p):
	"""file_input : file_input NEWLINE
	\| file_input stmt
	\| NEWLINE
	\| stmt"""
	if isinstance(p[len(p)-1], basestring):
	if len(p) == 3:
	p[0] = p[1]
	else:
	p[0] = [] # p == 2 --> only a blank line
	else:
	if len(p) == 3:
	p[0] = p[1] + p[2]
	else:
	p[0] = p[1]


	# funcdef: [decorators] 'def' NAME parameters ':' suite
	# ignoring decorators
	def p_funcdef(p):
	"funcdef : DEF NAME parameters COLON suite"
	p[0] = ast.Function(None, p[2], tuple(p[3]), (), 0, None, p[5])

	# parameters: '(' [varargslist] ')'
	def p_parameters(p):
	"""parameters : LPAR RPAR
	\| LPAR varargslist RPAR"""
	if len(p) == 3:
	p[0] = []
	else:
	p[0] = p[2]


	# varargslist: (fpdef ['=' test] ',')* ('' NAME [',' '' NAME] \| '*' NAME) \|
	# highly simplified
	def p_varargslist(p):
	"""varargslist : varargslist COMMA NAME
	\| NAME"""
	if len(p) == 4:
	p[0] = p[1] + p[3]
	else:
	p[0] = [p[1]]

	# stmt: simple_stmt \| compound_stmt
	def p_stmt_simple(p):
	"""stmt : simple_stmt"""
	# simple_stmt is a list
	p[0] = p[1]

	def p_stmt_compound(p):
	"""stmt : compound_stmt"""
	p[0] = [p[1]]

	# simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE
	def p_simple_stmt(p):
	"""simple_stmt : small_stmts NEWLINE
	\| small_stmts SEMICOLON NEWLINE"""
	p[0] = p[1]

	def p_small_stmts(p):
	"""small_stmts : small_stmts SEMICOLON small_stmt
	\| small_stmt"""
	if len(p) == 4:
	p[0] = p[1] + [p[3]]
	else:
	p[0] = [p[1]]

	# small_stmt: expr_stmt \| print_stmt \| del_stmt \| pass_stmt \| flow_stmt \|
	# import_stmt \| global_stmt \| exec_stmt \| assert_stmt
	def p_small_stmt(p):
	"""small_stmt : flow_stmt
	\| expr_stmt"""
	p[0] = p[1]

	# expr_stmt: testlist (augassign (yield_expr\|testlist) \|
	# ('=' (yield_expr\|testlist))*)
	# augassign: ('+=' \| '-=' \| '*=' \| '/=' \| '%=' \| '&=' \| '\|=' \| '^=' \|
	# '<<=' \| '>>=' \| '**=' \| '//=')
	def p_expr_stmt(p):
	"""expr_stmt : testlist ASSIGN testlist
	\| testlist """
	if len(p) == 2:
	# a list of expressions
	p[0] = ast.Discard(p[1])
	else:
	p[0] = Assign(p[1], p[3])

	def p_flow_stmt(p):
	"flow_stmt : return_stmt"
	p[0] = p[1]

	# return_stmt: 'return' [testlist]
	def p_return_stmt(p):
	"return_stmt : RETURN testlist"
	p[0] = ast.Return(p[2])


	def p_compound_stmt(p):
	"""compound_stmt : if_stmt
	\| funcdef"""
	p[0] = p[1]

	def p_if_stmt(p):
	'if_stmt : IF test COLON suite'
	p[0] = ast.If([(p[2], p[4])], None)

	def p_suite(p):
	"""suite : simple_stmt
	\| NEWLINE INDENT stmts DEDENT"""
	if len(p) == 2:
	p[0] = ast.Stmt(p[1])
	else:
	p[0] = ast.Stmt(p[3])


	def p_stmts(p):
	"""stmts : stmts stmt
	\| stmt"""
	if len(p) == 3:
	p[0] = p[1] + p[2]
	else:
	p[0] = p[1]

	## No using Python's approach because Ply supports precedence

	# comparison: expr (comp_op expr)*
	# arith_expr: term (('+'\|'-') term)*
	# term: factor ((''\|'/'\|'%'\|'//') factor)
	# factor: ('+'\|'-'\|'~') factor \| power
	# comp_op: '<'\|'>'\|'=='\|'>='\|'<='\|'<>'\|'!='\|'in'\|'not' 'in'\|'is'\|'is' 'not'

	def make_lt_compare((left, right)):
	return ast.Compare(left, [('<', right),])
	def make_gt_compare((left, right)):
	return ast.Compare(left, [('>', right),])
	def make_eq_compare((left, right)):
	return ast.Compare(left, [('==', right),])


	binary_ops = {
	"+": ast.Add,
	"-": ast.Sub,
	"*": ast.Mul,
	"/": ast.Div,
	"<": make_lt_compare,
	">": make_gt_compare,
	"==": make_eq_compare,
	}
	unary_ops = {
	"+": ast.UnaryAdd,
	"-": ast.UnarySub,
	}
	precedence = (
	("left", "EQ", "GT", "LT"),
	("left", "PLUS", "MINUS"),
	("left", "MULT", "DIV"),
	)

	def p_comparison(p):
	"""comparison : comparison PLUS comparison
	\| comparison MINUS comparison
	\| comparison MULT comparison
	\| comparison DIV comparison
	\| comparison LT comparison
	\| comparison EQ comparison
	\| comparison GT comparison
	\| PLUS comparison
	\| MINUS comparison
	\| power"""
	if len(p) == 4:
	p[0] = binary_ops[p[2]]((p[1], p[3]))
	elif len(p) == 3:
	p[0] = unary_ops[p[1]](p[2])
	else:
	p[0] = p[1]

	# power: atom trailer* ['**' factor]
	# trailers enables function calls. I only allow one level of calls
	# so this is 'trailer'
	def p_power(p):
	"""power : atom
	\| atom trailer"""
	if len(p) == 2:
	p[0] = p[1]
	else:
	if p[2][0] == "CALL":
	p[0] = ast.CallFunc(p[1], p[2][1], None, None)
	else:
	raise AssertionError("not implemented")

	def p_atom_name(p):
	"""atom : NAME"""
	p[0] = ast.Name(p[1])

	def p_atom_number(p):
	"""atom : NUMBER
	\| STRING"""
	p[0] = ast.Const(p[1])

	def p_atom_tuple(p):
	"""atom : LPAR testlist RPAR"""
	p[0] = p[2]

	# trailer: '(' [arglist] ')' \| '[' subscriptlist ']' \| '.' NAME
	def p_trailer(p):
	"trailer : LPAR arglist RPAR"
	p[0] = ("CALL", p[2])

	# testlist: test (',' test)* [',']
	# Contains shift/reduce error
	def p_testlist(p):
	"""testlist : testlist_multi COMMA
	\| testlist_multi """
	if len(p) == 2:
	p[0] = p[1]
	else:
	# May need to promote singleton to tuple
	if isinstance(p[1], list):
	p[0] = p[1]
	else:
	p[0] = [p[1]]
	# Convert into a tuple?
	if isinstance(p[0], list):
	p[0] = ast.Tuple(p[0])

	def p_testlist_multi(p):
	"""testlist_multi : testlist_multi COMMA test
	\| test"""
	if len(p) == 2:
	# singleton
	p[0] = p[1]
	else:
	if isinstance(p[1], list):
	p[0] = p[1] + [p[3]]
	else:
	# singleton -> tuple
	p[0] = [p[1], p[3]]


	# test: or_test ['if' or_test 'else' test] \| lambdef
	# as I don't support 'and', 'or', and 'not' this works down to 'comparison'
	def p_test(p):
	"test : comparison"
	p[0] = p[1]



	# arglist: (argument ',')* (argument [',']\| '' test [',' '' test] \| '*' test)
	# XXX INCOMPLETE: this doesn't allow the trailing comma
	def p_arglist(p):
	"""arglist : arglist COMMA argument
	\| argument"""
	if len(p) == 4:
	p[0] = p[1] + [p[3]]
	else:
	p[0] = [p[1]]

	# argument: test [gen_for] \| test '=' test # Really [keyword '='] test
	def p_argument(p):
	"argument : test"
	p[0] = p[1]

	def p_error(p):
	#print "Error!", repr(p)
	raise SyntaxError(p)


	class GardenSnakeParser(object):
	def __init__(self, lexer = None):
	if lexer is None:
	lexer = IndentLexer()
	self.lexer = lexer
	self.parser = yacc.yacc(start="file_input_end")

	def parse(self, code):
	self.lexer.input(code)
	result = self.parser.parse(lexer = self.lexer)
	return ast.Module(None, result)


	###### Code generation ######

	from compiler import misc, syntax, pycodegen

	class GardenSnakeCompiler(object):
	def __init__(self):
	self.parser = GardenSnakeParser()
	def compile(self, code, filename="<string>"):
	tree = self.parser.parse(code)
	#print tree
	misc.set_filename(filename, tree)
	syntax.check(tree)
	gen = pycodegen.ModuleCodeGenerator(tree)
	code = gen.getCode()
	return code

	####### Test code #######

	compile = GardenSnakeCompiler().compile

	code = r"""

	print('LET\'S TRY THIS \\OUT')

	#Comment here
	def x(a):
	print('called with',a)
	if a == 1:
	return 2
	if a*2 > 10: return 999 / 4
	# Another comment here

	return a+2*3

	ints = (1, 2,
	3, 4,
	5)
	print('mutiline-expression', ints)

	t = 4+1/32+6(9-5+1)
	print('predence test; should be 34+2/3:', t, t==(34+2/3))

	print('numbers', 1,2,3,4,5)
	if 1:
	8
	a=9
	print(x(a))

	print(x(1))
	print(x(2))
	print(x(8),'3')
	print('this is decimal', 1/5)
	print('BIG DECIMAL', 1.234567891234567e12345)

	"""

	# Set up the GardenSnake run-time environment
	def print_(*args):
	print "-->", " ".join(map(str,args))

	globals()["print"] = print_

	compiled_code = compile(code)

	exec compiled_code in globals()
	print "Done"