1 | # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
|
---|
2 | # Licensed to PSF under a Contributor Agreement.
|
---|
3 |
|
---|
4 | """Convert graminit.[ch] spit out by pgen to Python code.
|
---|
5 |
|
---|
6 | Pgen is the Python parser generator. It is useful to quickly create a
|
---|
7 | parser from a grammar file in Python's grammar notation. But I don't
|
---|
8 | want my parsers to be written in C (yet), so I'm translating the
|
---|
9 | parsing tables to Python data structures and writing a Python parse
|
---|
10 | engine.
|
---|
11 |
|
---|
12 | Note that the token numbers are constants determined by the standard
|
---|
13 | Python tokenizer. The standard token module defines these numbers and
|
---|
14 | their names (the names are not used much). The token numbers are
|
---|
15 | hardcoded into the Python tokenizer and into pgen. A Python
|
---|
16 | implementation of the Python tokenizer is also available, in the
|
---|
17 | standard tokenize module.
|
---|
18 |
|
---|
19 | On the other hand, symbol numbers (representing the grammar's
|
---|
20 | non-terminals) are assigned by pgen based on the actual grammar
|
---|
21 | input.
|
---|
22 |
|
---|
23 | Note: this module is pretty much obsolete; the pgen module generates
|
---|
24 | equivalent grammar tables directly from the Grammar.txt input file
|
---|
25 | without having to invoke the Python pgen C program.
|
---|
26 |
|
---|
27 | """
|
---|
28 |
|
---|
29 | # Python imports
|
---|
30 | import re
|
---|
31 |
|
---|
32 | # Local imports
|
---|
33 | from pgen2 import grammar, token
|
---|
34 |
|
---|
35 |
|
---|
36 | class Converter(grammar.Grammar):
|
---|
37 | """Grammar subclass that reads classic pgen output files.
|
---|
38 |
|
---|
39 | The run() method reads the tables as produced by the pgen parser
|
---|
40 | generator, typically contained in two C files, graminit.h and
|
---|
41 | graminit.c. The other methods are for internal use only.
|
---|
42 |
|
---|
43 | See the base class for more documentation.
|
---|
44 |
|
---|
45 | """
|
---|
46 |
|
---|
47 | def run(self, graminit_h, graminit_c):
|
---|
48 | """Load the grammar tables from the text files written by pgen."""
|
---|
49 | self.parse_graminit_h(graminit_h)
|
---|
50 | self.parse_graminit_c(graminit_c)
|
---|
51 | self.finish_off()
|
---|
52 |
|
---|
53 | def parse_graminit_h(self, filename):
|
---|
54 | """Parse the .h file written by pgen. (Internal)
|
---|
55 |
|
---|
56 | This file is a sequence of #define statements defining the
|
---|
57 | nonterminals of the grammar as numbers. We build two tables
|
---|
58 | mapping the numbers to names and back.
|
---|
59 |
|
---|
60 | """
|
---|
61 | try:
|
---|
62 | f = open(filename)
|
---|
63 | except IOError, err:
|
---|
64 | print "Can't open %s: %s" % (filename, err)
|
---|
65 | return False
|
---|
66 | self.symbol2number = {}
|
---|
67 | self.number2symbol = {}
|
---|
68 | lineno = 0
|
---|
69 | for line in f:
|
---|
70 | lineno += 1
|
---|
71 | mo = re.match(r"^#define\s+(\w+)\s+(\d+)$", line)
|
---|
72 | if not mo and line.strip():
|
---|
73 | print "%s(%s): can't parse %s" % (filename, lineno,
|
---|
74 | line.strip())
|
---|
75 | else:
|
---|
76 | symbol, number = mo.groups()
|
---|
77 | number = int(number)
|
---|
78 | assert symbol not in self.symbol2number
|
---|
79 | assert number not in self.number2symbol
|
---|
80 | self.symbol2number[symbol] = number
|
---|
81 | self.number2symbol[number] = symbol
|
---|
82 | return True
|
---|
83 |
|
---|
84 | def parse_graminit_c(self, filename):
|
---|
85 | """Parse the .c file written by pgen. (Internal)
|
---|
86 |
|
---|
87 | The file looks as follows. The first two lines are always this:
|
---|
88 |
|
---|
89 | #include "pgenheaders.h"
|
---|
90 | #include "grammar.h"
|
---|
91 |
|
---|
92 | After that come four blocks:
|
---|
93 |
|
---|
94 | 1) one or more state definitions
|
---|
95 | 2) a table defining dfas
|
---|
96 | 3) a table defining labels
|
---|
97 | 4) a struct defining the grammar
|
---|
98 |
|
---|
99 | A state definition has the following form:
|
---|
100 | - one or more arc arrays, each of the form:
|
---|
101 | static arc arcs_<n>_<m>[<k>] = {
|
---|
102 | {<i>, <j>},
|
---|
103 | ...
|
---|
104 | };
|
---|
105 | - followed by a state array, of the form:
|
---|
106 | static state states_<s>[<t>] = {
|
---|
107 | {<k>, arcs_<n>_<m>},
|
---|
108 | ...
|
---|
109 | };
|
---|
110 |
|
---|
111 | """
|
---|
112 | try:
|
---|
113 | f = open(filename)
|
---|
114 | except IOError, err:
|
---|
115 | print "Can't open %s: %s" % (filename, err)
|
---|
116 | return False
|
---|
117 | # The code below essentially uses f's iterator-ness!
|
---|
118 | lineno = 0
|
---|
119 |
|
---|
120 | # Expect the two #include lines
|
---|
121 | lineno, line = lineno+1, f.next()
|
---|
122 | assert line == '#include "pgenheaders.h"\n', (lineno, line)
|
---|
123 | lineno, line = lineno+1, f.next()
|
---|
124 | assert line == '#include "grammar.h"\n', (lineno, line)
|
---|
125 |
|
---|
126 | # Parse the state definitions
|
---|
127 | lineno, line = lineno+1, f.next()
|
---|
128 | allarcs = {}
|
---|
129 | states = []
|
---|
130 | while line.startswith("static arc "):
|
---|
131 | while line.startswith("static arc "):
|
---|
132 | mo = re.match(r"static arc arcs_(\d+)_(\d+)\[(\d+)\] = {$",
|
---|
133 | line)
|
---|
134 | assert mo, (lineno, line)
|
---|
135 | n, m, k = map(int, mo.groups())
|
---|
136 | arcs = []
|
---|
137 | for _ in range(k):
|
---|
138 | lineno, line = lineno+1, f.next()
|
---|
139 | mo = re.match(r"\s+{(\d+), (\d+)},$", line)
|
---|
140 | assert mo, (lineno, line)
|
---|
141 | i, j = map(int, mo.groups())
|
---|
142 | arcs.append((i, j))
|
---|
143 | lineno, line = lineno+1, f.next()
|
---|
144 | assert line == "};\n", (lineno, line)
|
---|
145 | allarcs[(n, m)] = arcs
|
---|
146 | lineno, line = lineno+1, f.next()
|
---|
147 | mo = re.match(r"static state states_(\d+)\[(\d+)\] = {$", line)
|
---|
148 | assert mo, (lineno, line)
|
---|
149 | s, t = map(int, mo.groups())
|
---|
150 | assert s == len(states), (lineno, line)
|
---|
151 | state = []
|
---|
152 | for _ in range(t):
|
---|
153 | lineno, line = lineno+1, f.next()
|
---|
154 | mo = re.match(r"\s+{(\d+), arcs_(\d+)_(\d+)},$", line)
|
---|
155 | assert mo, (lineno, line)
|
---|
156 | k, n, m = map(int, mo.groups())
|
---|
157 | arcs = allarcs[n, m]
|
---|
158 | assert k == len(arcs), (lineno, line)
|
---|
159 | state.append(arcs)
|
---|
160 | states.append(state)
|
---|
161 | lineno, line = lineno+1, f.next()
|
---|
162 | assert line == "};\n", (lineno, line)
|
---|
163 | lineno, line = lineno+1, f.next()
|
---|
164 | self.states = states
|
---|
165 |
|
---|
166 | # Parse the dfas
|
---|
167 | dfas = {}
|
---|
168 | mo = re.match(r"static dfa dfas\[(\d+)\] = {$", line)
|
---|
169 | assert mo, (lineno, line)
|
---|
170 | ndfas = int(mo.group(1))
|
---|
171 | for i in range(ndfas):
|
---|
172 | lineno, line = lineno+1, f.next()
|
---|
173 | mo = re.match(r'\s+{(\d+), "(\w+)", (\d+), (\d+), states_(\d+),$',
|
---|
174 | line)
|
---|
175 | assert mo, (lineno, line)
|
---|
176 | symbol = mo.group(2)
|
---|
177 | number, x, y, z = map(int, mo.group(1, 3, 4, 5))
|
---|
178 | assert self.symbol2number[symbol] == number, (lineno, line)
|
---|
179 | assert self.number2symbol[number] == symbol, (lineno, line)
|
---|
180 | assert x == 0, (lineno, line)
|
---|
181 | state = states[z]
|
---|
182 | assert y == len(state), (lineno, line)
|
---|
183 | lineno, line = lineno+1, f.next()
|
---|
184 | mo = re.match(r'\s+("(?:\\\d\d\d)*")},$', line)
|
---|
185 | assert mo, (lineno, line)
|
---|
186 | first = {}
|
---|
187 | rawbitset = eval(mo.group(1))
|
---|
188 | for i, c in enumerate(rawbitset):
|
---|
189 | byte = ord(c)
|
---|
190 | for j in range(8):
|
---|
191 | if byte & (1<<j):
|
---|
192 | first[i*8 + j] = 1
|
---|
193 | dfas[number] = (state, first)
|
---|
194 | lineno, line = lineno+1, f.next()
|
---|
195 | assert line == "};\n", (lineno, line)
|
---|
196 | self.dfas = dfas
|
---|
197 |
|
---|
198 | # Parse the labels
|
---|
199 | labels = []
|
---|
200 | lineno, line = lineno+1, f.next()
|
---|
201 | mo = re.match(r"static label labels\[(\d+)\] = {$", line)
|
---|
202 | assert mo, (lineno, line)
|
---|
203 | nlabels = int(mo.group(1))
|
---|
204 | for i in range(nlabels):
|
---|
205 | lineno, line = lineno+1, f.next()
|
---|
206 | mo = re.match(r'\s+{(\d+), (0|"\w+")},$', line)
|
---|
207 | assert mo, (lineno, line)
|
---|
208 | x, y = mo.groups()
|
---|
209 | x = int(x)
|
---|
210 | if y == "0":
|
---|
211 | y = None
|
---|
212 | else:
|
---|
213 | y = eval(y)
|
---|
214 | labels.append((x, y))
|
---|
215 | lineno, line = lineno+1, f.next()
|
---|
216 | assert line == "};\n", (lineno, line)
|
---|
217 | self.labels = labels
|
---|
218 |
|
---|
219 | # Parse the grammar struct
|
---|
220 | lineno, line = lineno+1, f.next()
|
---|
221 | assert line == "grammar _PyParser_Grammar = {\n", (lineno, line)
|
---|
222 | lineno, line = lineno+1, f.next()
|
---|
223 | mo = re.match(r"\s+(\d+),$", line)
|
---|
224 | assert mo, (lineno, line)
|
---|
225 | ndfas = int(mo.group(1))
|
---|
226 | assert ndfas == len(self.dfas)
|
---|
227 | lineno, line = lineno+1, f.next()
|
---|
228 | assert line == "\tdfas,\n", (lineno, line)
|
---|
229 | lineno, line = lineno+1, f.next()
|
---|
230 | mo = re.match(r"\s+{(\d+), labels},$", line)
|
---|
231 | assert mo, (lineno, line)
|
---|
232 | nlabels = int(mo.group(1))
|
---|
233 | assert nlabels == len(self.labels), (lineno, line)
|
---|
234 | lineno, line = lineno+1, f.next()
|
---|
235 | mo = re.match(r"\s+(\d+)$", line)
|
---|
236 | assert mo, (lineno, line)
|
---|
237 | start = int(mo.group(1))
|
---|
238 | assert start in self.number2symbol, (lineno, line)
|
---|
239 | self.start = start
|
---|
240 | lineno, line = lineno+1, f.next()
|
---|
241 | assert line == "};\n", (lineno, line)
|
---|
242 | try:
|
---|
243 | lineno, line = lineno+1, f.next()
|
---|
244 | except StopIteration:
|
---|
245 | pass
|
---|
246 | else:
|
---|
247 | assert 0, (lineno, line)
|
---|
248 |
|
---|
249 | def finish_off(self):
|
---|
250 | """Create additional useful structures. (Internal)."""
|
---|
251 | self.keywords = {} # map from keyword strings to arc labels
|
---|
252 | self.tokens = {} # map from numeric token values to arc labels
|
---|
253 | for ilabel, (type, value) in enumerate(self.labels):
|
---|
254 | if type == token.NAME and value is not None:
|
---|
255 | self.keywords[value] = ilabel
|
---|
256 | elif value is None:
|
---|
257 | self.tokens[type] = ilabel
|
---|