#!/usr/bin/python
# Author: Jeff Terrell
# Date: 2008-01-29
# COMP 524 - Programming Languages
# Program 1: Parser

import re
import sys
import traceback

# time: 20 minutes
# Note: I'm not sure you can get this any shorter and still have it be LL(1)...
#
# The grammar is kept here purely as documentation; nothing reads this string
# at runtime.  Upper-case names are token types produced by scan(); 'eps' is
# the empty production.
grammar = """
program := optDeclList stmtList EOF
optDeclList := decl optDeclList | eps
decl := type ID SEMI
type := INT | FLOAT | STRING
stmtList := stmt optStmtList
optStmtList := stmt optStmtList | eps
stmt := imperative | compound
imperative := ID imp_tail
imp_tail := ASSIGN expr SEMI | OPAREN optArgList CPAREN SEMI
optArgList := expr optArgList_tail | eps
optArgList_tail := COMMA expr optArgList_tail | eps
expr := logical expr_tail
expr_tail := logicop logical expr_tail | eps
logicop := AND | OR
logical := relational logical_tail
logical_tail := relop relational logical_tail | eps
relop := EQ | NOTEQ | LT | LTEQ | GT | GTEQ
relational := addend rel_tail
rel_tail := addop addend rel_tail | eps
addop := PLUS | MINUS
addend := factor addend_tail
addend_tail := multop factor addend_tail | eps
multop := TIMES | DIVIDE
factor := NUMLIT | STRLIT | OPAREN expr CPAREN | NOT OPAREN expr CPAREN | factid
factid := ID factid_tail
factid_tail := OPAREN optArgList CPAREN | eps
compound := while | if | block
block := OBRACE innerblock CBRACE
innerblock := optDeclList stmtList
while := WHILE OPAREN expr CPAREN block
if := IF OPAREN expr CPAREN block optElseifList optElse
optElseifList := ELSEIF OPAREN expr CPAREN block optElseifList | eps
optElse := ELSE block | eps
"""


class Token:
    """A lexical token.

    Attributes:
        type: the token type as an upper-case string ('ID', 'SEMI', 'EOF', ...).
        name: the matched source text for ID, STRLIT and NUMLIT tokens; the
            empty string for every other token type.
    """

    def __init__(self, t, n):
        self.type = t
        self.name = n

    def tostr(self):
        """Return the token formatted as "[TYPE | 'name']"."""
        return "[" + self.type + " | '" + self.name + "']"

    def __repr__(self):
        # Debugging aid only; tostr() remains the format used for output.
        return "Token(%r, %r)" % (self.type, self.name)


# time: 34 minutes
# +18 minutes for a bug fix (see comment below re: int vs. int_var)

# Everything the scanner matches against is built once, at import time,
# instead of being recompiled on every pass through the scanning loop.

# Leading whitespace and //-to-end-of-line comments.  This is applied only at
# the current scan position, so text inside a multi-line string literal is
# never touched.
_SKIP_RE = re.compile(r"(?:\s+|//[^\n]*)*")

# Fixed-text tokens.  Two-character operators come first so that, e.g., "=="
# is never scanned as two ASSIGN tokens.
_OPERATORS = (
    ("&&", 'AND'), ("||", 'OR'), ("==", 'EQ'), ("!=", 'NOTEQ'),
    ("<=", 'LTEQ'), (">=", 'GTEQ'),
    (";", 'SEMI'), ("(", 'OPAREN'), (")", 'CPAREN'), (",", 'COMMA'),
    ("=", 'ASSIGN'), ("!", 'NOT'), ("<", 'LT'), (">", 'GT'),
    ("*", 'TIMES'), ("/", 'DIVIDE'), ("{", 'OBRACE'), ("}", 'CBRACE'),
    ("+", 'PLUS'), ("-", 'MINUS'),
)

# This is tricky.  'intvar' is an ID, but 'int' is a keyword, so we need to
# make sure that there is either space or punctuation after the 'int'--in
# other words, that 'int' is followed by a non-word character.  This is what
# the \b accomplishes, and it is applied to every keyword.
#
# NOTE: I didn't test this subtlety on your parsers.
# NOTE(review): the ID pattern below is [a-z]+ only, so a name containing an
# underscore, digit or capital letter (such as 'int_var') is not a single ID.
_KEYWORD_RE = re.compile(r"(int|float|string|while|if|elseif|else)\b")

# Tokens that carry their matched text.  These are tried after the keywords,
# because otherwise the keywords would be recognized as IDs.
_LEXEME_PATTERNS = (
    ('ID', re.compile(r"[a-z]+")),
    ('STRLIT', re.compile(r"'[^']*'")),
    ('NUMLIT', re.compile(r"(\d+(\.\d*)?|\d*\.\d+)")),
)


def _match_token(text, pos):
    """Return (token, end_pos) for the token starting at text[pos], or None."""
    for lexeme, token_type in _OPERATORS:
        if text.startswith(lexeme, pos):
            return Token(token_type, ''), pos + len(lexeme)
    m = _KEYWORD_RE.match(text, pos)
    if m:
        return Token(m.group(1).upper(), ''), m.end()
    for token_type, pattern in _LEXEME_PATTERNS:
        m = pattern.match(text, pos)
        if m:
            return Token(token_type, m.group(0)), m.end()
    return None


def _tokenize(text):
    """Split source text into a list of Tokens, always terminated by EOF."""
    tokens = []
    pos = 0
    while True:
        # Skip whitespace and comments in front of the next token.
        pos = _SKIP_RE.match(text, pos).end()
        if pos >= len(text):
            break
        found = _match_token(text, pos)
        if found is None:
            # Nothing matches at the head of the remaining input.
            sys.stderr.write('Error: invalid token at head of cstream.\ncstream is:\n"""\n%s\n"""\n' % text[pos:])
            sys.exit(1)
        token, pos = found
        tokens.append(token)
    # Exactly one EOF, whether or not the input ended in whitespace (and even
    # when the input is empty), so the parser always has a token to look at.
    tokens.append(Token('EOF', ''))
    return tokens


def scan(filename):
    """Read the file `filename` and return its contents as a list of Tokens.

    A simple approach: the entire file is read in as a single string and then
    tokenized from front to back.  The returned list always ends with a single
    EOF token.

    Exits the process with status 1 (after writing a message to stderr) if
    the file cannot be read or if the input contains text that is not a valid
    token.
    """
    try:
        with open(filename) as filein:
            cstream = filein.read()
    except IOError as err:
        sys.stderr.write("I/O error(%s): %s\n" % (err.errno, err.strerror))
        sys.exit(1)
    return _tokenize(cstream)


# globals, I have discovered, are kind of a pain in Python...
# They are given real initial values here (a bare module-level `global`
# statement does nothing); parse() resets all three on every call.
current_token = None
token_list = []
# declaredIDs is where we note that IDs have been declared.  This is how we
# check to ensure names have been declared before being used.
declaredIDs = {}

# The initializeName parameter controls whether we are checking that names have
# been declared before use.  It's unset by default.
def match(type, initializeName=0):
    """Consume the current token if it has the expected type and return it.

    Parameters:
        type: the token type string the grammar requires at this point.
        initializeName: when true, an ID token is being declared, so the
            "not declared" warning is suppressed.  Unset by default.

    On success the next token from token_list becomes current_token (an EOF
    token is synthesized once the list is exhausted).  An undeclared ID only
    produces a warning on stderr.  On a type mismatch an error and the
    current call stack (which shows the grammar rules being parsed) are
    written to stderr and the process exits with status 1.
    """
    global current_token
    global token_list
    if type != current_token.type:
        sys.stderr.write('Error: expected %s but got %s\n' % (type, current_token.type))
        # No exception is active here, so print the live call stack rather
        # than an (empty) exception traceback.
        traceback.print_stack(file=sys.stderr)
        sys.exit(1)
    if type == 'ID' and not initializeName and not isNameDeclared(current_token.name):
        sys.stderr.write('Warning: name %s is not declared\n' % current_token.name)
    saved = current_token
    if token_list:
        current_token = token_list.pop(0)
    else:
        current_token = Token('EOF', '')
    return saved


def check(type):
    """Return True if the current token has the given type (no consumption)."""
    return type == current_token.type


# Called when a name is seen to determine if it has been declared yet.
def isNameDeclared(name):
    return name in declaredIDs


# Make a note that a name has been declared, and can now legally be used.
def declareName(name):
    declaredIDs[name] = 1


################################################################################
#####             Begin recursive-descent parse functions                  #####
################################################################################
#
# This is a very straightforward, mechanical translation of the grammar rules.
# There is nothing fancy here at all.
#
# Conventions: for a rule `x`, xPending() reports whether the current token can
# start an `x`, and x() parses one and returns ['x', child, ...] where each
# child is either a nested list or the Token returned by match().  An eps
# (empty) production is returned as [].

# optDeclList := decl optDeclList | eps
def optDeclList():
    if declPending():
        return ['optDeclList', decl(), optDeclList()]
    return []


# decl := type ID SEMI
def declPending():
    return typePending()


def decl():
    # initializeName=1: this ID is being declared, so don't warn about it.
    parse_tree = ['decl', _type(), match('ID', initializeName=1), match('SEMI')]
    declareName(parse_tree[2].name)
    return parse_tree


# type := INT | FLOAT | STRING
def typePending():
    return check('INT') or check('FLOAT') or check('STRING')


# Had to call this "_type()" instead of "type()" to avoid clashing with the
# Python builtin.
def _type():
    if check('INT'):
        return ['type', match('INT')]
    if check('FLOAT'):
        return ['type', match('FLOAT')]
    return ['type', match('STRING')]


# stmtList := stmt optStmtList
def stmtList():
    return ['stmtList', stmt(), optStmtList()]


# optStmtList := stmt optStmtList | eps
def optStmtList():
    if stmtPending():
        return ['optStmtList', stmt(), optStmtList()]
    return []


# stmt := imperative | compound
def stmtPending():
    # NOTE(review): declPending() is not in FIRST(stmt) per the grammar.  With
    # it here, a declaration appearing after a statement is routed into stmt()
    # and rejected there ("expected OBRACE ...") instead of by the caller.
    # Either way the input is rejected; confirm which message is wanted.
    return declPending() or imperativePending() or compoundPending()


def stmt():
    if imperativePending():
        return ['stmt', imperative()]
    return ['stmt', compound()]


# imperative := ID imp_tail
def imperativePending():
    return check('ID')


def imperative():
    return ['imperative', match('ID'), imp_tail()]


# imp_tail := ASSIGN expr SEMI | OPAREN optArgList CPAREN SEMI
def imp_tail():
    if check('ASSIGN'):
        return ['imp_tail', match('ASSIGN'), expr(), match('SEMI')]
    return ['imp_tail', match('OPAREN'), optArgList(), match('CPAREN'), match('SEMI')]


# optArgList := expr optArgList_tail | eps
def optArgList():
    if exprPending():
        return ['optArgList', expr(), optArgList_tail()]
    return []


# optArgList_tail := COMMA expr optArgList_tail | eps
def optArgList_tail():
    if check('COMMA'):
        return ['optArgList_tail', match('COMMA'), expr(), optArgList_tail()]
    return []


# compound := while | if | block
def compoundPending():
    return whilePending() or ifPending() or blockPending()


def compound():
    if whilePending():
        return ['compound', _while()]
    if ifPending():
        return ['compound', _if()]
    return ['compound', block()]


# while := WHILE OPAREN expr CPAREN block
def whilePending():
    return check('WHILE')


# Had to call this "_while()" instead of "while()" to avoid clashing with the
# Python keyword.
def _while():
    return ['while', match('WHILE'), match('OPAREN'), expr(), match('CPAREN'), block()]


# if := IF OPAREN expr CPAREN block optElseifList optElse
def ifPending():
    return check('IF')


# Had to call this "_if()" instead of "if()" to avoid clashing with the Python
# keyword.
def _if():
    return ['if', match('IF'), match('OPAREN'), expr(), match('CPAREN'), block(),
            optElseifList(), optElse()]


# optElseifList := ELSEIF OPAREN expr CPAREN block optElseifList | eps
def optElseifList():
    if check('ELSEIF'):
        return ['optElseifList', match('ELSEIF'), match('OPAREN'), expr(),
                match('CPAREN'), block(), optElseifList()]
    return []


# optElse := ELSE block | eps
def optElse():
    if check('ELSE'):
        return ['optElse', match('ELSE'), block()]
    return []


# block := OBRACE innerblock CBRACE
def blockPending():
    return check('OBRACE')


def block():
    return ['block', match('OBRACE'), innerblock(), match('CBRACE')]


# innerblock := optDeclList stmtList
def innerblock():
    return ['innerblock', optDeclList(), stmtList()]


# expr := logical expr_tail
def exprPending():
    return logicalPending()


def expr():
    return ['expr', logical(), expr_tail()]


# expr_tail := logicop logical expr_tail | eps
def expr_tail():
    if logicopPending():
        return ['expr_tail', logicop(), logical(), expr_tail()]
    return []


# logicop := AND | OR
def logicopPending():
    return check('AND') or check('OR')


def logicop():
    if check('AND'):
        return ['logicop', match('AND')]
    # The OR branch must return its node too; otherwise the operator is
    # silently missing from the parse tree.
    return ['logicop', match('OR')]


# logical := relational logical_tail
def logicalPending():
    return relationalPending()


def logical():
    return ['logical', relational(), logical_tail()]


# logical_tail := relop relational logical_tail | eps
def logical_tail():
    if relopPending():
        return ['logical_tail', relop(), relational(), logical_tail()]
    return []


# relop := EQ | NOTEQ | LT | LTEQ | GT | GTEQ
def relopPending():
    return (check('EQ') or check('NOTEQ') or check('LT') or check('LTEQ')
            or check('GT') or check('GTEQ'))


def relop():
    if check('EQ'):
        return ['relop', match('EQ')]
    if check('NOTEQ'):
        return ['relop', match('NOTEQ')]
    if check('LT'):
        return ['relop', match('LT')]
    if check('LTEQ'):
        return ['relop', match('LTEQ')]
    if check('GT'):
        return ['relop', match('GT')]
    return ['relop', match('GTEQ')]


# relational := addend rel_tail
def relationalPending():
    return addendPending()


def relational():
    return ['relational', addend(), rel_tail()]


# rel_tail := addop addend rel_tail | eps
def rel_tail():
    if addopPending():
        return ['rel_tail', addop(), addend(), rel_tail()]
    return []


# addop := PLUS | MINUS
def addopPending():
    return check('PLUS') or check('MINUS')


def addop():
    if check('PLUS'):
        return ['addop', match('PLUS')]
    return ['addop', match('MINUS')]


# addend := factor addend_tail
def addendPending():
    return factorPending()


def addend():
    return ['addend', factor(), addend_tail()]


# addend_tail := multop factor addend_tail | eps
def addend_tail():
    if multopPending():
        return ['addend_tail', multop(), factor(), addend_tail()]
    return []


# multop := TIMES | DIVIDE
def multopPending():
    return check('TIMES') or check('DIVIDE')


def multop():
    if check('TIMES'):
        return ['multop', match('TIMES')]
    return ['multop', match('DIVIDE')]


# factor := NUMLIT | STRLIT | OPAREN expr CPAREN | NOT OPAREN expr CPAREN | factid
def factorPending():
    return (check('NUMLIT') or check('STRLIT') or check('OPAREN')
            or check('NOT') or factidPending())


def factor():
    if check('NUMLIT'):
        return ['factor', match('NUMLIT')]
    if check('STRLIT'):
        return ['factor', match('STRLIT')]
    if check('OPAREN'):
        return ['factor', match('OPAREN'), expr(), match('CPAREN')]
    if check('NOT'):
        return ['factor', match('NOT'), match('OPAREN'), expr(), match('CPAREN')]
    return ['factor', factid()]


# factid := ID factid_tail
def factidPending():
    return check('ID')


def factid():
    return ['factid', match('ID'), factid_tail()]


# factid_tail := OPAREN optArgList CPAREN | eps
def factid_tail():
    if check('OPAREN'):
        return ['factid_tail', match('OPAREN'), optArgList(), match('CPAREN')]
    return []

################################################################################
#####              End recursive-descent parse functions                   #####
################################################################################


def parse(toks):
    """Parse a token list and return the parse tree for `program`.

    `toks` is consumed in place (tokens are popped off its front).  The names
    'read' and 'print' are treated as already declared.  Syntax errors are
    reported by match(), which exits the process.
    """
    global current_token
    global token_list
    global declaredIDs
    token_list = toks
    # An empty list is treated as an immediate EOF instead of raising
    # IndexError from pop().
    if token_list:
        current_token = token_list.pop(0)
    else:
        current_token = Token('EOF', '')
    declaredIDs = {'read': 1, 'print': 1}
    return ['program', optDeclList(), stmtList(), match('EOF')]


# Retained for compatibility; printParseTree no longer needs it.
import types


def printParseTree(p, tabstop=0):
    """Print a parse tree to stdout, one node per line.

    Each nesting level is indented by one '. ' per level.  A list prints its
    first element (the rule name) followed by its children; an empty list (an
    eps production) prints as 'e'; a Token prints via Token.tostr().  Anything
    else is ignored.
    """
    indent = '. ' * tabstop
    if isinstance(p, list):
        if not p:
            print(indent + 'e')
            return
        print(indent + p[0])
        for el in p[1:]:
            printParseTree(el, tabstop + 1)
    elif isinstance(p, Token):
        print(indent + p.tostr())


def main(argv=None):
    """Scan and parse the file named in argv[1], printing its parse tree.

    `argv` defaults to sys.argv.  Returns 0 on success and 2 when the
    filename argument is missing or extra arguments are given.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) != 2:
        sys.stderr.write('Error: specify the filename to read as input\n')
        return 2
    print('parsing %s' % argv[1])
    printParseTree(parse(scan(argv[1])))
    return 0


# So, main() will only get called if this program is run directly, but not when
# it's imported somewhere else.  Nifty, eh?
if __name__ == "__main__":
    sys.exit(main())