# -*- coding: iso-8859-1 -*-
"""A lexical analyzer class for simple shell-like syntaxes."""

# Module and documentation by Eric S. Raymond, 21 Dec 1998
# Input stacking and error message cleanup added by ESR, March 2000
# push_source() and pop_source() made explicit by ESR, January 2001.
# Posix compliance, split(), string arguments, and
# iterator interface by Gustavo Niemeyer, April 2003.
9 |
|
---|
import os.path
import sys
from collections import deque

# Prefer the C implementation of StringIO and fall back to the pure-Python
# module when it cannot be imported.
try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

# Public names exported by a star-import of this module.
__all__ = ["shlex", "split"]
20 |
|
---|
class shlex:
    """A lexical analyzer class for simple shell-like syntaxes.

    Characters are read one at a time from an input stream and grouped
    into tokens by a small state machine (see read_token()).  Instances
    are iterable: iteration yields tokens until the end-of-file marker
    ``self.eof`` is produced.

    Tunable attributes (all plain strings unless noted):
      commenters       -- characters that start a comment running to end of line
      wordchars        -- characters that accumulate into a word token
      whitespace       -- characters that separate tokens
      whitespace_split -- bool; when true, split only on whitespace
      quotes           -- characters that open/close a quoted section
      escape           -- escape characters (honoured in posix mode only)
      escapedquotes    -- quotes inside which the escape character is active
      source           -- token that triggers file inclusion, or None
      debug            -- integer verbosity of the diagnostic prints

    Internal state:
      state     -- ' ' between tokens, 'a' inside a word, a quote or escape
                   character while inside such a construct, None at EOF
      pushback  -- deque of tokens returned before any new input is read
      filestack -- deque of suspended (infile, instream, lineno) sources
    """

    def __init__(self, instream=None, infile=None, posix=False):
        """Create a lexer.

        instream -- a string (wrapped in a StringIO) or an object offering
                    read(1), readline() and close(); None selects sys.stdin.
        infile   -- name used by error_leader() and by sourcehook() to
                    resolve relative inclusions.  Ignored (reset to None)
                    when instream is None.
        posix    -- true selects the POSIX-like rules: quotes are stripped,
                    escapes are processed and EOF is reported as None
                    instead of ''.
        """
        if isinstance(instream, basestring):
            instream = StringIO(instream)
        if instream is not None:
            self.instream = instream
            self.infile = infile
        else:
            self.instream = sys.stdin
            self.infile = None
        self.posix = posix
        # End-of-file marker returned by get_token().
        if posix:
            self.eof = None
        else:
            self.eof = ''
        self.commenters = '#'
        self.wordchars = ('abcdfeghijklmnopqrstuvwxyz'
                          'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_')
        if self.posix:
            # Accented Latin-1 letters also count as word characters.
            self.wordchars += ('ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ'
                               'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ')
        self.whitespace = ' \t\r\n'
        self.whitespace_split = False
        self.quotes = '\'"'
        self.escape = '\\'
        self.escapedquotes = '"'
        self.state = ' '
        self.pushback = deque()
        self.lineno = 1
        self.debug = 0
        self.token = ''
        self.filestack = deque()
        self.source = None
        # NOTE: debug was assigned 0 just above, so for a plain instance
        # this message is never printed; the check is kept unchanged.
        if self.debug:
            print('shlex: reading from %s, line %d'
                  % (self.instream, self.lineno))

    def push_token(self, tok):
        """Push a token onto the stack popped by the get_token method."""
        if self.debug >= 1:
            print("shlex: pushing token " + repr(tok))
        self.pushback.appendleft(tok)

    def push_source(self, newstream, newfile=None):
        """Push an input source onto the lexer's input source stack.

        newstream may be a string (wrapped in a StringIO) or a stream
        object.  The current (infile, instream, lineno) triple is saved on
        filestack and line counting restarts at 1 for the new source.
        """
        if isinstance(newstream, basestring):
            newstream = StringIO(newstream)
        self.filestack.appendleft((self.infile, self.instream, self.lineno))
        self.infile = newfile
        self.instream = newstream
        self.lineno = 1
        if self.debug:
            if newfile is not None:
                print('shlex: pushing to file %s' % (self.infile,))
            else:
                print('shlex: pushing to stream %s' % (self.instream,))

    def pop_source(self):
        """Pop the input source stack.

        Closes the current stream, restores the most recently suspended
        source and resets the lexer to the between-tokens state.  Raises
        IndexError (from the deque) when no source has been pushed.
        """
        self.instream.close()
        (self.infile, self.instream, self.lineno) = self.filestack.popleft()
        if self.debug:
            print('shlex: popping to %s, line %d'
                  % (self.instream, self.lineno))
        self.state = ' '

    def get_token(self):
        """Get a token from the input stream (or from stack if it's nonempty).

        Pushed-back tokens are returned first.  Otherwise a raw token is
        read; a token equal to self.source triggers an inclusion through
        sourcehook(), and an EOF on an included source resumes the
        suspended one.  Returns self.eof once every source is exhausted.
        """
        if self.pushback:
            tok = self.pushback.popleft()
            if self.debug >= 1:
                print("shlex: popping token " + repr(tok))
            return tok
        # No pushback.  Get a token.
        raw = self.read_token()
        # Handle inclusions
        if self.source is not None:
            while raw == self.source:
                spec = self.sourcehook(self.read_token())
                # A false value from the hook means "do not include".
                if spec:
                    (newfile, newstream) = spec
                    self.push_source(newstream, newfile)
                raw = self.get_token()
        # Maybe we got EOF instead?
        while raw == self.eof:
            if not self.filestack:
                return self.eof
            else:
                self.pop_source()
                raw = self.get_token()
        # Neither inclusion nor EOF
        if self.debug >= 1:
            if raw != self.eof:
                print("shlex: token=" + repr(raw))
            else:
                print("shlex: token=EOF")
        return raw

    def read_token(self):
        """Read one raw token from the current input stream.

        Runs the character-level state machine until a token is complete.
        Returns the token text; at end of input returns '' (or None in
        posix mode when nothing quoted was seen).  Raises ValueError when
        input ends inside a quoted section or right after an escape.
        """
        # True once a quoted section was seen, so that an empty quoted
        # string is still reported as a token in posix mode.
        quoted = False
        # State to return to after the escaped character is consumed.
        escapedstate = ' '
        while True:
            nextchar = self.instream.read(1)
            if nextchar == '\n':
                self.lineno = self.lineno + 1
            if self.debug >= 3:
                print("shlex: in state %r I see character: %r"
                      % (self.state, nextchar))
            if self.state is None:
                self.token = ''        # past end of file
                break
            elif self.state == ' ':
                # Between tokens.
                if not nextchar:
                    self.state = None  # end of file
                    break
                elif nextchar in self.whitespace:
                    if self.debug >= 2:
                        print("shlex: I see whitespace in whitespace state")
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
                elif nextchar in self.commenters:
                    # Discard the rest of the line; readline() swallows the
                    # newline, so the line counter is bumped by hand.
                    self.instream.readline()
                    self.lineno = self.lineno + 1
                elif self.posix and nextchar in self.escape:
                    escapedstate = 'a'
                    self.state = nextchar
                elif nextchar in self.wordchars:
                    self.token = nextchar
                    self.state = 'a'
                elif nextchar in self.quotes:
                    # Non-posix mode keeps the quote characters in the token.
                    if not self.posix:
                        self.token = nextchar
                    self.state = nextchar
                elif self.whitespace_split:
                    self.token = nextchar
                    self.state = 'a'
                else:
                    # Punctuation: a one-character token of its own.
                    self.token = nextchar
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
            elif self.state in self.quotes:
                # Inside a quoted section; self.state is the opening quote.
                quoted = True
                if not nextchar:      # end of file
                    if self.debug >= 2:
                        print("shlex: I see EOF in quotes state")
                    # XXX what error should be raised here?
                    raise ValueError("No closing quotation")
                if nextchar == self.state:
                    if not self.posix:
                        self.token = self.token + nextchar
                        self.state = ' '
                        break
                    else:
                        # posix: the word may continue after the quote.
                        self.state = 'a'
                elif self.posix and nextchar in self.escape and \
                        self.state in self.escapedquotes:
                    escapedstate = self.state
                    self.state = nextchar
                else:
                    self.token = self.token + nextchar
            elif self.state in self.escape:
                # Just after an escape character; self.state holds it.
                if not nextchar:      # end of file
                    if self.debug >= 2:
                        print("shlex: I see EOF in escape state")
                    # XXX what error should be raised here?
                    raise ValueError("No escaped character")
                # In posix shells, only the quote itself or the escape
                # character may be escaped within quotes.
                if escapedstate in self.quotes and \
                        nextchar != self.state and nextchar != escapedstate:
                    self.token = self.token + self.state
                self.token = self.token + nextchar
                self.state = escapedstate
            elif self.state == 'a':
                # Inside a word.
                if not nextchar:
                    self.state = None   # end of file
                    break
                elif nextchar in self.whitespace:
                    if self.debug >= 2:
                        print("shlex: I see whitespace in word state")
                    self.state = ' '
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
                elif nextchar in self.commenters:
                    self.instream.readline()
                    self.lineno = self.lineno + 1
                    if self.posix:
                        self.state = ' '
                        if self.token or (self.posix and quoted):
                            break   # emit current token
                        else:
                            continue
                elif self.posix and nextchar in self.quotes:
                    self.state = nextchar
                elif self.posix and nextchar in self.escape:
                    escapedstate = 'a'
                    self.state = nextchar
                elif nextchar in self.wordchars or nextchar in self.quotes \
                        or self.whitespace_split:
                    self.token = self.token + nextchar
                else:
                    # Punctuation ends the word; it is queued so that the
                    # next get_token() call returns it as its own token.
                    self.pushback.appendleft(nextchar)
                    if self.debug >= 2:
                        print("shlex: I see punctuation in word state")
                    self.state = ' '
                    if self.token:
                        break   # emit current token
                    else:
                        continue
        result = self.token
        self.token = ''
        if self.posix and not quoted and result == '':
            result = None
        if self.debug > 1:
            if result:
                print("shlex: raw token=" + repr(result))
            else:
                print("shlex: raw token=EOF")
        return result

    def sourcehook(self, newfile):
        """Hook called on a filename to be sourced.

        newfile is the raw token that followed the source keyword.  A
        leading double quote is taken to mean the name is quoted and the
        first and last characters are dropped.  A relative name is resolved
        against the directory of the current infile when that is a string.

        Returns a (filename, open file object) pair.  Raises ValueError
        when the token is empty or None, i.e. input ended right after the
        source keyword; errors from open() propagate unchanged.
        """
        if not newfile:
            # read_token() yields '' or None at end of input; indexing it
            # below would fail with an unrelated IndexError/TypeError.
            raise ValueError("No filename after source keyword")
        if newfile[0] == '"':
            newfile = newfile[1:-1]
        # This implements cpp-like semantics for relative-path inclusion.
        if isinstance(self.infile, basestring) and not os.path.isabs(newfile):
            newfile = os.path.join(os.path.dirname(self.infile), newfile)
        return (newfile, open(newfile, "r"))

    def error_leader(self, infile=None, lineno=None):
        """Emit a C-compiler-like, Emacs-friendly error-message leader.

        infile and lineno default to the lexer's current source name and
        line number.  Returns the string '"<infile>", line <lineno>: '.
        """
        if infile is None:
            infile = self.infile
        if lineno is None:
            lineno = self.lineno
        return "\"%s\", line %d: " % (infile, lineno)

    def __iter__(self):
        """Return the lexer itself; it is its own iterator."""
        return self

    def next(self):
        """Return the next token, raising StopIteration at end of input."""
        token = self.get_token()
        if token == self.eof:
            raise StopIteration
        return token

    # Same method under the name used by the Python 3 iterator protocol.
    __next__ = next
273 |
|
---|
def split(s, comments=False, posix=True):
    """Return the list of tokens found in *s*, split on whitespace only.

    s        -- the text to tokenize (handed to the shlex constructor).
    comments -- when false (the default) comment recognition is switched
                off by emptying the lexer's ``commenters``.
    posix    -- forwarded to the shlex constructor.
    """
    tokenizer = shlex(s, posix=posix)
    tokenizer.whitespace_split = True
    if comments:
        # Keep the lexer's default comment characters.
        pass
    else:
        tokenizer.commenters = ''
    return [word for word in tokenizer]
280 |
|
---|
if __name__ == '__main__':
    # Command-line driver: print every token of the file named by the first
    # argument, or of standard input when no argument is given.
    if len(sys.argv) == 1:
        # shlex(None, None) behaves like shlex(): it reads sys.stdin.
        filename = None
        stream = None
    else:
        # Not called "file": that would shadow the Python 2 builtin.
        filename = sys.argv[1]
        stream = open(filename)
    try:
        lexer = shlex(stream, filename)
        while True:
            tt = lexer.get_token()
            # A false token ('' here, since posix is off) marks end of input.
            if not tt:
                break
            print("Token: " + repr(tt))
    finally:
        # Release the file we opened; stdin is left alone.
        if stream is not None:
            stream.close()