1 | # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
|
---|
2 | # Licensed to PSF under a Contributor Agreement.
|
---|
3 |
|
---|
4 | """Parser engine for the grammar tables generated by pgen.
|
---|
5 |
|
---|
6 | The grammar table must be loaded first.
|
---|
7 |
|
---|
8 | See Parser/parser.c in the Python distribution for additional info on
|
---|
9 | how this parsing engine works.
|
---|
10 |
|
---|
11 | """
|
---|
12 |
|
---|
13 | # Local imports
|
---|
14 | from . import token
|
---|
15 |
|
---|
16 | class ParseError(Exception):
|
---|
17 | """Exception to signal the parser is stuck."""
|
---|
18 |
|
---|
19 | def __init__(self, msg, type, value, context):
|
---|
20 | Exception.__init__(self, "%s: type=%r, value=%r, context=%r" %
|
---|
21 | (msg, type, value, context))
|
---|
22 | self.msg = msg
|
---|
23 | self.type = type
|
---|
24 | self.value = value
|
---|
25 | self.context = context
|
---|
26 |
|
---|
27 | class Parser(object):
|
---|
28 | """Parser engine.
|
---|
29 |
|
---|
30 | The proper usage sequence is:
|
---|
31 |
|
---|
32 | p = Parser(grammar, [converter]) # create instance
|
---|
33 | p.setup([start]) # prepare for parsing
|
---|
34 | <for each input token>:
|
---|
35 | if p.addtoken(...): # parse a token; may raise ParseError
|
---|
36 | break
|
---|
37 | root = p.rootnode # root of abstract syntax tree
|
---|
38 |
|
---|
39 | A Parser instance may be reused by calling setup() repeatedly.
|
---|
40 |
|
---|
41 | A Parser instance contains state pertaining to the current token
|
---|
42 | sequence, and should not be used concurrently by different threads
|
---|
43 | to parse separate token sequences.
|
---|
44 |
|
---|
45 | See driver.py for how to get input tokens by tokenizing a file or
|
---|
46 | string.
|
---|
47 |
|
---|
48 | Parsing is complete when addtoken() returns True; the root of the
|
---|
49 | abstract syntax tree can then be retrieved from the rootnode
|
---|
50 | instance variable. When a syntax error occurs, addtoken() raises
|
---|
51 | the ParseError exception. There is no error recovery; the parser
|
---|
52 | cannot be used after a syntax error was reported (but it can be
|
---|
53 | reinitialized by calling setup()).
|
---|
54 |
|
---|
55 | """
|
---|
56 |
|
---|
57 | def __init__(self, grammar, convert=None):
|
---|
58 | """Constructor.
|
---|
59 |
|
---|
60 | The grammar argument is a grammar.Grammar instance; see the
|
---|
61 | grammar module for more information.
|
---|
62 |
|
---|
63 | The parser is not ready yet for parsing; you must call the
|
---|
64 | setup() method to get it started.
|
---|
65 |
|
---|
66 | The optional convert argument is a function mapping concrete
|
---|
67 | syntax tree nodes to abstract syntax tree nodes. If not
|
---|
68 | given, no conversion is done and the syntax tree produced is
|
---|
69 | the concrete syntax tree. If given, it must be a function of
|
---|
70 | two arguments, the first being the grammar (a grammar.Grammar
|
---|
71 | instance), and the second being the concrete syntax tree node
|
---|
72 | to be converted. The syntax tree is converted from the bottom
|
---|
73 | up.
|
---|
74 |
|
---|
75 | A concrete syntax tree node is a (type, value, context, nodes)
|
---|
76 | tuple, where type is the node type (a token or symbol number),
|
---|
77 | value is None for symbols and a string for tokens, context is
|
---|
78 | None or an opaque value used for error reporting (typically a
|
---|
79 | (lineno, offset) pair), and nodes is a list of children for
|
---|
80 | symbols, and None for tokens.
|
---|
81 |
|
---|
82 | An abstract syntax tree node may be anything; this is entirely
|
---|
83 | up to the converter function.
|
---|
84 |
|
---|
85 | """
|
---|
86 | self.grammar = grammar
|
---|
87 | self.convert = convert or (lambda grammar, node: node)
|
---|
88 |
|
---|
89 | def setup(self, start=None):
|
---|
90 | """Prepare for parsing.
|
---|
91 |
|
---|
92 | This *must* be called before starting to parse.
|
---|
93 |
|
---|
94 | The optional argument is an alternative start symbol; it
|
---|
95 | defaults to the grammar's start symbol.
|
---|
96 |
|
---|
97 | You can use a Parser instance to parse any number of programs;
|
---|
98 | each time you call setup() the parser is reset to an initial
|
---|
99 | state determined by the (implicit or explicit) start symbol.
|
---|
100 |
|
---|
101 | """
|
---|
102 | if start is None:
|
---|
103 | start = self.grammar.start
|
---|
104 | # Each stack entry is a tuple: (dfa, state, node).
|
---|
105 | # A node is a tuple: (type, value, context, children),
|
---|
106 | # where children is a list of nodes or None, and context may be None.
|
---|
107 | newnode = (start, None, None, [])
|
---|
108 | stackentry = (self.grammar.dfas[start], 0, newnode)
|
---|
109 | self.stack = [stackentry]
|
---|
110 | self.rootnode = None
|
---|
111 | self.used_names = set() # Aliased to self.rootnode.used_names in pop()
|
---|
112 |
|
---|
113 | def addtoken(self, type, value, context):
|
---|
114 | """Add a token; return True iff this is the end of the program."""
|
---|
115 | # Map from token to label
|
---|
116 | ilabel = self.classify(type, value, context)
|
---|
117 | # Loop until the token is shifted; may raise exceptions
|
---|
118 | while True:
|
---|
119 | dfa, state, node = self.stack[-1]
|
---|
120 | states, first = dfa
|
---|
121 | arcs = states[state]
|
---|
122 | # Look for a state with this label
|
---|
123 | for i, newstate in arcs:
|
---|
124 | t, v = self.grammar.labels[i]
|
---|
125 | if ilabel == i:
|
---|
126 | # Look it up in the list of labels
|
---|
127 | assert t < 256
|
---|
128 | # Shift a token; we're done with it
|
---|
129 | self.shift(type, value, newstate, context)
|
---|
130 | # Pop while we are in an accept-only state
|
---|
131 | state = newstate
|
---|
132 | while states[state] == [(0, state)]:
|
---|
133 | self.pop()
|
---|
134 | if not self.stack:
|
---|
135 | # Done parsing!
|
---|
136 | return True
|
---|
137 | dfa, state, node = self.stack[-1]
|
---|
138 | states, first = dfa
|
---|
139 | # Done with this token
|
---|
140 | return False
|
---|
141 | elif t >= 256:
|
---|
142 | # See if it's a symbol and if we're in its first set
|
---|
143 | itsdfa = self.grammar.dfas[t]
|
---|
144 | itsstates, itsfirst = itsdfa
|
---|
145 | if ilabel in itsfirst:
|
---|
146 | # Push a symbol
|
---|
147 | self.push(t, self.grammar.dfas[t], newstate, context)
|
---|
148 | break # To continue the outer while loop
|
---|
149 | else:
|
---|
150 | if (0, state) in arcs:
|
---|
151 | # An accepting state, pop it and try something else
|
---|
152 | self.pop()
|
---|
153 | if not self.stack:
|
---|
154 | # Done parsing, but another token is input
|
---|
155 | raise ParseError("too much input",
|
---|
156 | type, value, context)
|
---|
157 | else:
|
---|
158 | # No success finding a transition
|
---|
159 | raise ParseError("bad input", type, value, context)
|
---|
160 |
|
---|
161 | def classify(self, type, value, context):
|
---|
162 | """Turn a token into a label. (Internal)"""
|
---|
163 | if type == token.NAME:
|
---|
164 | # Keep a listing of all used names
|
---|
165 | self.used_names.add(value)
|
---|
166 | # Check for reserved words
|
---|
167 | ilabel = self.grammar.keywords.get(value)
|
---|
168 | if ilabel is not None:
|
---|
169 | return ilabel
|
---|
170 | ilabel = self.grammar.tokens.get(type)
|
---|
171 | if ilabel is None:
|
---|
172 | raise ParseError("bad token", type, value, context)
|
---|
173 | return ilabel
|
---|
174 |
|
---|
175 | def shift(self, type, value, newstate, context):
|
---|
176 | """Shift a token. (Internal)"""
|
---|
177 | dfa, state, node = self.stack[-1]
|
---|
178 | newnode = (type, value, context, None)
|
---|
179 | newnode = self.convert(self.grammar, newnode)
|
---|
180 | if newnode is not None:
|
---|
181 | node[-1].append(newnode)
|
---|
182 | self.stack[-1] = (dfa, newstate, node)
|
---|
183 |
|
---|
184 | def push(self, type, newdfa, newstate, context):
|
---|
185 | """Push a nonterminal. (Internal)"""
|
---|
186 | dfa, state, node = self.stack[-1]
|
---|
187 | newnode = (type, None, context, [])
|
---|
188 | self.stack[-1] = (dfa, newstate, node)
|
---|
189 | self.stack.append((newdfa, 0, newnode))
|
---|
190 |
|
---|
191 | def pop(self):
|
---|
192 | """Pop a nonterminal. (Internal)"""
|
---|
193 | popdfa, popstate, popnode = self.stack.pop()
|
---|
194 | newnode = self.convert(self.grammar, popnode)
|
---|
195 | if newnode is not None:
|
---|
196 | if self.stack:
|
---|
197 | dfa, state, node = self.stack[-1]
|
---|
198 | node[-1].append(newnode)
|
---|
199 | else:
|
---|
200 | self.rootnode = newnode
|
---|
201 | self.rootnode.used_names = self.used_names
|
---|