1 | # Copyright (C) 2003-2007, 2009, 2010 Nominum, Inc.
|
---|
2 | #
|
---|
3 | # Permission to use, copy, modify, and distribute this software and its
|
---|
4 | # documentation for any purpose with or without fee is hereby granted,
|
---|
5 | # provided that the above copyright notice and this permission notice
|
---|
6 | # appear in all copies.
|
---|
7 | #
|
---|
8 | # THE SOFTWARE IS PROVIDED "AS IS" AND NOMINUM DISCLAIMS ALL WARRANTIES
|
---|
9 | # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
---|
10 | # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL NOMINUM BE LIABLE FOR
|
---|
11 | # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
---|
12 | # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
---|
13 | # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
|
---|
14 | # OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
---|
15 |
|
---|
16 | """Tokenize DNS master file format"""
|
---|
17 |
|
---|
18 | import cStringIO
|
---|
19 | import sys
|
---|
20 |
|
---|
21 | import dns.exception
|
---|
22 | import dns.name
|
---|
23 | import dns.ttl
|
---|
24 |
|
---|
# Characters that end an identifier when the tokenizer is NOT inside a
# quoted string.  Kept as a dict so membership tests are a hash lookup.
_DELIMITERS = dict.fromkeys(' \t\n;()"', True)

# Inside a quoted string only the closing double quote ends the token.
_QUOTING_DELIMITERS = {'"': True}

# Token type codes, numbered consecutively from zero:
#   EOF=0, EOL=1, WHITESPACE=2, IDENTIFIER=3, QUOTED_STRING=4,
#   COMMENT=5, DELIMITER=6
(EOF,
 EOL,
 WHITESPACE,
 IDENTIFIER,
 QUOTED_STRING,
 COMMENT,
 DELIMITER) = range(7)
43 |
|
---|
class UngetBufferFull(dns.exception.DNSException):
    """An unget was attempted while the unget buffer was already
    occupied."""
48 |
|
---|
class Token(object):
    """A DNS master file format token.

    For compatibility with older code that treated tokens as
    (type, value) tuples, a token also supports len(), iteration and
    indexing with 0 (type) and 1 (value).

    @ivar ttype: The token type
    @type ttype: int
    @ivar value: The token value
    @type value: string
    @ivar has_escape: Does the token value contain escapes?
    @type has_escape: bool
    """

    def __init__(self, ttype, value='', has_escape=False):
        """Initialize a token instance.

        @param ttype: The token type
        @type ttype: int
        @param value: The token value
        @type value: string
        @param has_escape: Does the token value contain escapes?
        @type has_escape: bool
        """
        self.ttype = ttype
        self.value = value
        self.has_escape = has_escape

    def is_eof(self):
        """Is this an end-of-file token?
        @rtype: bool"""
        return self.ttype == EOF

    def is_eol(self):
        """Is this an end-of-line token?
        @rtype: bool"""
        return self.ttype == EOL

    def is_whitespace(self):
        """Is this a whitespace token?
        @rtype: bool"""
        return self.ttype == WHITESPACE

    def is_identifier(self):
        """Is this an identifier token?
        @rtype: bool"""
        return self.ttype == IDENTIFIER

    def is_quoted_string(self):
        """Is this a quoted-string token?
        @rtype: bool"""
        return self.ttype == QUOTED_STRING

    def is_comment(self):
        """Is this a comment token?
        @rtype: bool"""
        return self.ttype == COMMENT

    def is_delimiter(self):
        """Is this a delimiter token?
        @rtype: bool"""
        return self.ttype == DELIMITER

    def is_eol_or_eof(self):
        """Is this either an end-of-line or an end-of-file token?
        @rtype: bool"""
        return (self.ttype == EOL or self.ttype == EOF)

    def __eq__(self, other):
        # Tokens compare by type and value only; has_escape is ignored.
        if not isinstance(other, Token):
            return False
        return (self.ttype == other.ttype and
                self.value == other.value)

    def __ne__(self, other):
        # Defined in terms of __eq__ so the two can never disagree.
        return not self.__eq__(other)

    def __str__(self):
        return '%d "%s"' % (self.ttype, self.value)

    def unescape(self):
        """Return a token with backslash escapes in the value resolved.

        A backslash followed by three decimal digits is replaced by the
        character with that decimal code; a backslash followed by any
        other character is replaced by that character.  If the token has
        no escapes, the token itself is returned.

        @raises dns.exception.UnexpectedEnd: the value ends in the middle
        of an escape sequence
        @raises dns.exception.SyntaxError: a decimal escape is malformed
        or names a value greater than 255
        @rtype: Token object
        """
        if not self.has_escape:
            return self
        text = self.value
        length = len(text)
        pieces = []
        i = 0
        while i < length:
            c = text[i]
            i += 1
            if c == '\\':
                if i >= length:
                    raise dns.exception.UnexpectedEnd
                c = text[i]
                i += 1
                if c.isdigit():
                    # A decimal escape needs two more characters.
                    if i >= length:
                        raise dns.exception.UnexpectedEnd
                    c2 = text[i]
                    i += 1
                    if i >= length:
                        raise dns.exception.UnexpectedEnd
                    c3 = text[i]
                    i += 1
                    if not (c2.isdigit() and c3.isdigit()):
                        raise dns.exception.SyntaxError
                    codepoint = int(c) * 100 + int(c2) * 10 + int(c3)
                    # Reject out-of-range values explicitly rather than
                    # letting chr() fail with a non-DNS exception.
                    if codepoint > 255:
                        raise dns.exception.SyntaxError
                    c = chr(codepoint)
            pieces.append(c)
        return Token(self.ttype, ''.join(pieces))

    # compatibility for old-style tuple tokens

    def __len__(self):
        return 2

    def __iter__(self):
        return iter((self.ttype, self.value))

    def __getitem__(self, i):
        if i == 0:
            return self.ttype
        elif i == 1:
            return self.value
        else:
            raise IndexError
157 |
|
---|
158 | class Tokenizer(object):
|
---|
159 | """A DNS master file format tokenizer.
|
---|
160 |
|
---|
161 | A token is a (type, value) tuple, where I{type} is an int, and
|
---|
162 | I{value} is a string. The valid types are EOF, EOL, WHITESPACE,
|
---|
163 | IDENTIFIER, QUOTED_STRING, COMMENT, and DELIMITER.
|
---|
164 |
|
---|
165 | @ivar file: The file to tokenize
|
---|
166 | @type file: file
|
---|
167 | @ivar ungotten_char: The most recently ungotten character, or None.
|
---|
168 | @type ungotten_char: string
|
---|
169 | @ivar ungotten_token: The most recently ungotten token, or None.
|
---|
170 | @type ungotten_token: (int, string) token tuple
|
---|
171 | @ivar multiline: The current multiline level. This value is increased
|
---|
172 | by one every time a '(' delimiter is read, and decreased by one every time
|
---|
173 | a ')' delimiter is read.
|
---|
174 | @type multiline: int
|
---|
175 | @ivar quoting: This variable is true if the tokenizer is currently
|
---|
176 | reading a quoted string.
|
---|
177 | @type quoting: bool
|
---|
178 | @ivar eof: This variable is true if the tokenizer has encountered EOF.
|
---|
179 | @type eof: bool
|
---|
180 | @ivar delimiters: The current delimiter dictionary.
|
---|
181 | @type delimiters: dict
|
---|
182 | @ivar line_number: The current line number
|
---|
183 | @type line_number: int
|
---|
184 | @ivar filename: A filename that will be returned by the L{where} method.
|
---|
185 | @type filename: string
|
---|
186 | """
|
---|
187 |
|
---|
188 | def __init__(self, f=sys.stdin, filename=None):
|
---|
189 | """Initialize a tokenizer instance.
|
---|
190 |
|
---|
191 | @param f: The file to tokenize. The default is sys.stdin.
|
---|
192 | This parameter may also be a string, in which case the tokenizer
|
---|
193 | will take its input from the contents of the string.
|
---|
194 | @type f: file or string
|
---|
195 | @param filename: the name of the filename that the L{where} method
|
---|
196 | will return.
|
---|
197 | @type filename: string
|
---|
198 | """
|
---|
199 |
|
---|
200 | if isinstance(f, str):
|
---|
201 | f = cStringIO.StringIO(f)
|
---|
202 | if filename is None:
|
---|
203 | filename = '<string>'
|
---|
204 | else:
|
---|
205 | if filename is None:
|
---|
206 | if f is sys.stdin:
|
---|
207 | filename = '<stdin>'
|
---|
208 | else:
|
---|
209 | filename = '<file>'
|
---|
210 | self.file = f
|
---|
211 | self.ungotten_char = None
|
---|
212 | self.ungotten_token = None
|
---|
213 | self.multiline = 0
|
---|
214 | self.quoting = False
|
---|
215 | self.eof = False
|
---|
216 | self.delimiters = _DELIMITERS
|
---|
217 | self.line_number = 1
|
---|
218 | self.filename = filename
|
---|
219 |
|
---|
220 | def _get_char(self):
|
---|
221 | """Read a character from input.
|
---|
222 | @rtype: string
|
---|
223 | """
|
---|
224 |
|
---|
225 | if self.ungotten_char is None:
|
---|
226 | if self.eof:
|
---|
227 | c = ''
|
---|
228 | else:
|
---|
229 | c = self.file.read(1)
|
---|
230 | if c == '':
|
---|
231 | self.eof = True
|
---|
232 | elif c == '\n':
|
---|
233 | self.line_number += 1
|
---|
234 | else:
|
---|
235 | c = self.ungotten_char
|
---|
236 | self.ungotten_char = None
|
---|
237 | return c
|
---|
238 |
|
---|
239 | def where(self):
|
---|
240 | """Return the current location in the input.
|
---|
241 |
|
---|
242 | @rtype: (string, int) tuple. The first item is the filename of
|
---|
243 | the input, the second is the current line number.
|
---|
244 | """
|
---|
245 |
|
---|
246 | return (self.filename, self.line_number)
|
---|
247 |
|
---|
248 | def _unget_char(self, c):
|
---|
249 | """Unget a character.
|
---|
250 |
|
---|
251 | The unget buffer for characters is only one character large; it is
|
---|
252 | an error to try to unget a character when the unget buffer is not
|
---|
253 | empty.
|
---|
254 |
|
---|
255 | @param c: the character to unget
|
---|
256 | @type c: string
|
---|
257 | @raises UngetBufferFull: there is already an ungotten char
|
---|
258 | """
|
---|
259 |
|
---|
260 | if not self.ungotten_char is None:
|
---|
261 | raise UngetBufferFull
|
---|
262 | self.ungotten_char = c
|
---|
263 |
|
---|
264 | def skip_whitespace(self):
|
---|
265 | """Consume input until a non-whitespace character is encountered.
|
---|
266 |
|
---|
267 | The non-whitespace character is then ungotten, and the number of
|
---|
268 | whitespace characters consumed is returned.
|
---|
269 |
|
---|
270 | If the tokenizer is in multiline mode, then newlines are whitespace.
|
---|
271 |
|
---|
272 | @rtype: int
|
---|
273 | """
|
---|
274 |
|
---|
275 | skipped = 0
|
---|
276 | while True:
|
---|
277 | c = self._get_char()
|
---|
278 | if c != ' ' and c != '\t':
|
---|
279 | if (c != '\n') or not self.multiline:
|
---|
280 | self._unget_char(c)
|
---|
281 | return skipped
|
---|
282 | skipped += 1
|
---|
283 |
|
---|
284 | def get(self, want_leading = False, want_comment = False):
|
---|
285 | """Get the next token.
|
---|
286 |
|
---|
287 | @param want_leading: If True, return a WHITESPACE token if the
|
---|
288 | first character read is whitespace. The default is False.
|
---|
289 | @type want_leading: bool
|
---|
290 | @param want_comment: If True, return a COMMENT token if the
|
---|
291 | first token read is a comment. The default is False.
|
---|
292 | @type want_comment: bool
|
---|
293 | @rtype: Token object
|
---|
294 | @raises dns.exception.UnexpectedEnd: input ended prematurely
|
---|
295 | @raises dns.exception.SyntaxError: input was badly formed
|
---|
296 | """
|
---|
297 |
|
---|
298 | if not self.ungotten_token is None:
|
---|
299 | token = self.ungotten_token
|
---|
300 | self.ungotten_token = None
|
---|
301 | if token.is_whitespace():
|
---|
302 | if want_leading:
|
---|
303 | return token
|
---|
304 | elif token.is_comment():
|
---|
305 | if want_comment:
|
---|
306 | return token
|
---|
307 | else:
|
---|
308 | return token
|
---|
309 | skipped = self.skip_whitespace()
|
---|
310 | if want_leading and skipped > 0:
|
---|
311 | return Token(WHITESPACE, ' ')
|
---|
312 | token = ''
|
---|
313 | ttype = IDENTIFIER
|
---|
314 | has_escape = False
|
---|
315 | while True:
|
---|
316 | c = self._get_char()
|
---|
317 | if c == '' or c in self.delimiters:
|
---|
318 | if c == '' and self.quoting:
|
---|
319 | raise dns.exception.UnexpectedEnd
|
---|
320 | if token == '' and ttype != QUOTED_STRING:
|
---|
321 | if c == '(':
|
---|
322 | self.multiline += 1
|
---|
323 | self.skip_whitespace()
|
---|
324 | continue
|
---|
325 | elif c == ')':
|
---|
326 | if not self.multiline > 0:
|
---|
327 | raise dns.exception.SyntaxError
|
---|
328 | self.multiline -= 1
|
---|
329 | self.skip_whitespace()
|
---|
330 | continue
|
---|
331 | elif c == '"':
|
---|
332 | if not self.quoting:
|
---|
333 | self.quoting = True
|
---|
334 | self.delimiters = _QUOTING_DELIMITERS
|
---|
335 | ttype = QUOTED_STRING
|
---|
336 | continue
|
---|
337 | else:
|
---|
338 | self.quoting = False
|
---|
339 | self.delimiters = _DELIMITERS
|
---|
340 | self.skip_whitespace()
|
---|
341 | continue
|
---|
342 | elif c == '\n':
|
---|
343 | return Token(EOL, '\n')
|
---|
344 | elif c == ';':
|
---|
345 | while 1:
|
---|
346 | c = self._get_char()
|
---|
347 | if c == '\n' or c == '':
|
---|
348 | break
|
---|
349 | token += c
|
---|
350 | if want_comment:
|
---|
351 | self._unget_char(c)
|
---|
352 | return Token(COMMENT, token)
|
---|
353 | elif c == '':
|
---|
354 | if self.multiline:
|
---|
355 | raise dns.exception.SyntaxError('unbalanced parentheses')
|
---|
356 | return Token(EOF)
|
---|
357 | elif self.multiline:
|
---|
358 | self.skip_whitespace()
|
---|
359 | token = ''
|
---|
360 | continue
|
---|
361 | else:
|
---|
362 | return Token(EOL, '\n')
|
---|
363 | else:
|
---|
364 | # This code exists in case we ever want a
|
---|
365 | # delimiter to be returned. It never produces
|
---|
366 | # a token currently.
|
---|
367 | token = c
|
---|
368 | ttype = DELIMITER
|
---|
369 | else:
|
---|
370 | self._unget_char(c)
|
---|
371 | break
|
---|
372 | elif self.quoting:
|
---|
373 | if c == '\\':
|
---|
374 | c = self._get_char()
|
---|
375 | if c == '':
|
---|
376 | raise dns.exception.UnexpectedEnd
|
---|
377 | if c.isdigit():
|
---|
378 | c2 = self._get_char()
|
---|
379 | if c2 == '':
|
---|
380 | raise dns.exception.UnexpectedEnd
|
---|
381 | c3 = self._get_char()
|
---|
382 | if c == '':
|
---|
383 | raise dns.exception.UnexpectedEnd
|
---|
384 | if not (c2.isdigit() and c3.isdigit()):
|
---|
385 | raise dns.exception.SyntaxError
|
---|
386 | c = chr(int(c) * 100 + int(c2) * 10 + int(c3))
|
---|
387 | elif c == '\n':
|
---|
388 | raise dns.exception.SyntaxError('newline in quoted string')
|
---|
389 | elif c == '\\':
|
---|
390 | #
|
---|
391 | # It's an escape. Put it and the next character into
|
---|
392 | # the token; it will be checked later for goodness.
|
---|
393 | #
|
---|
394 | token += c
|
---|
395 | has_escape = True
|
---|
396 | c = self._get_char()
|
---|
397 | if c == '' or c == '\n':
|
---|
398 | raise dns.exception.UnexpectedEnd
|
---|
399 | token += c
|
---|
400 | if token == '' and ttype != QUOTED_STRING:
|
---|
401 | if self.multiline:
|
---|
402 | raise dns.exception.SyntaxError('unbalanced parentheses')
|
---|
403 | ttype = EOF
|
---|
404 | return Token(ttype, token, has_escape)
|
---|
405 |
|
---|
406 | def unget(self, token):
|
---|
407 | """Unget a token.
|
---|
408 |
|
---|
409 | The unget buffer for tokens is only one token large; it is
|
---|
410 | an error to try to unget a token when the unget buffer is not
|
---|
411 | empty.
|
---|
412 |
|
---|
413 | @param token: the token to unget
|
---|
414 | @type token: Token object
|
---|
415 | @raises UngetBufferFull: there is already an ungotten token
|
---|
416 | """
|
---|
417 |
|
---|
418 | if not self.ungotten_token is None:
|
---|
419 | raise UngetBufferFull
|
---|
420 | self.ungotten_token = token
|
---|
421 |
|
---|
422 | def next(self):
|
---|
423 | """Return the next item in an iteration.
|
---|
424 | @rtype: (int, string)
|
---|
425 | """
|
---|
426 |
|
---|
427 | token = self.get()
|
---|
428 | if token.is_eof():
|
---|
429 | raise StopIteration
|
---|
430 | return token
|
---|
431 |
|
---|
432 | def __iter__(self):
|
---|
433 | return self
|
---|
434 |
|
---|
435 | # Helpers
|
---|
436 |
|
---|
437 | def get_int(self):
|
---|
438 | """Read the next token and interpret it as an integer.
|
---|
439 |
|
---|
440 | @raises dns.exception.SyntaxError:
|
---|
441 | @rtype: int
|
---|
442 | """
|
---|
443 |
|
---|
444 | token = self.get().unescape()
|
---|
445 | if not token.is_identifier():
|
---|
446 | raise dns.exception.SyntaxError('expecting an identifier')
|
---|
447 | if not token.value.isdigit():
|
---|
448 | raise dns.exception.SyntaxError('expecting an integer')
|
---|
449 | return int(token.value)
|
---|
450 |
|
---|
451 | def get_uint8(self):
|
---|
452 | """Read the next token and interpret it as an 8-bit unsigned
|
---|
453 | integer.
|
---|
454 |
|
---|
455 | @raises dns.exception.SyntaxError:
|
---|
456 | @rtype: int
|
---|
457 | """
|
---|
458 |
|
---|
459 | value = self.get_int()
|
---|
460 | if value < 0 or value > 255:
|
---|
461 | raise dns.exception.SyntaxError('%d is not an unsigned 8-bit integer' % value)
|
---|
462 | return value
|
---|
463 |
|
---|
464 | def get_uint16(self):
|
---|
465 | """Read the next token and interpret it as a 16-bit unsigned
|
---|
466 | integer.
|
---|
467 |
|
---|
468 | @raises dns.exception.SyntaxError:
|
---|
469 | @rtype: int
|
---|
470 | """
|
---|
471 |
|
---|
472 | value = self.get_int()
|
---|
473 | if value < 0 or value > 65535:
|
---|
474 | raise dns.exception.SyntaxError('%d is not an unsigned 16-bit integer' % value)
|
---|
475 | return value
|
---|
476 |
|
---|
477 | def get_uint32(self):
|
---|
478 | """Read the next token and interpret it as a 32-bit unsigned
|
---|
479 | integer.
|
---|
480 |
|
---|
481 | @raises dns.exception.SyntaxError:
|
---|
482 | @rtype: int
|
---|
483 | """
|
---|
484 |
|
---|
485 | token = self.get().unescape()
|
---|
486 | if not token.is_identifier():
|
---|
487 | raise dns.exception.SyntaxError('expecting an identifier')
|
---|
488 | if not token.value.isdigit():
|
---|
489 | raise dns.exception.SyntaxError('expecting an integer')
|
---|
490 | value = long(token.value)
|
---|
491 | if value < 0 or value > 4294967296L:
|
---|
492 | raise dns.exception.SyntaxError('%d is not an unsigned 32-bit integer' % value)
|
---|
493 | return value
|
---|
494 |
|
---|
495 | def get_string(self, origin=None):
|
---|
496 | """Read the next token and interpret it as a string.
|
---|
497 |
|
---|
498 | @raises dns.exception.SyntaxError:
|
---|
499 | @rtype: string
|
---|
500 | """
|
---|
501 |
|
---|
502 | token = self.get().unescape()
|
---|
503 | if not (token.is_identifier() or token.is_quoted_string()):
|
---|
504 | raise dns.exception.SyntaxError('expecting a string')
|
---|
505 | return token.value
|
---|
506 |
|
---|
507 | def get_identifier(self, origin=None):
|
---|
508 | """Read the next token and raise an exception if it is not an identifier.
|
---|
509 |
|
---|
510 | @raises dns.exception.SyntaxError:
|
---|
511 | @rtype: string
|
---|
512 | """
|
---|
513 |
|
---|
514 | token = self.get().unescape()
|
---|
515 | if not token.is_identifier():
|
---|
516 | raise dns.exception.SyntaxError('expecting an identifier')
|
---|
517 | return token.value
|
---|
518 |
|
---|
519 | def get_name(self, origin=None):
|
---|
520 | """Read the next token and interpret it as a DNS name.
|
---|
521 |
|
---|
522 | @raises dns.exception.SyntaxError:
|
---|
523 | @rtype: dns.name.Name object"""
|
---|
524 |
|
---|
525 | token = self.get()
|
---|
526 | if not token.is_identifier():
|
---|
527 | raise dns.exception.SyntaxError('expecting an identifier')
|
---|
528 | return dns.name.from_text(token.value, origin)
|
---|
529 |
|
---|
530 | def get_eol(self):
|
---|
531 | """Read the next token and raise an exception if it isn't EOL or
|
---|
532 | EOF.
|
---|
533 |
|
---|
534 | @raises dns.exception.SyntaxError:
|
---|
535 | @rtype: string
|
---|
536 | """
|
---|
537 |
|
---|
538 | token = self.get()
|
---|
539 | if not token.is_eol_or_eof():
|
---|
540 | raise dns.exception.SyntaxError('expected EOL or EOF, got %d "%s"' % (token.ttype, token.value))
|
---|
541 | return token.value
|
---|
542 |
|
---|
543 | def get_ttl(self):
|
---|
544 | token = self.get().unescape()
|
---|
545 | if not token.is_identifier():
|
---|
546 | raise dns.exception.SyntaxError('expecting an identifier')
|
---|
547 | return dns.ttl.from_text(token.value)
|
---|