Context Navigation

tokenizer.py

Visit:

Last change on this file was 745, checked in by Silvan Scherrer, 13 years ago
Samba Server: updated trunk to 3.6.0
File size: 18.1 KB

Line
1	# Copyright (C) 2003-2007, 2009, 2010 Nominum, Inc.
2	#
3	# Permission to use, copy, modify, and distribute this software and its
4	# documentation for any purpose with or without fee is hereby granted,
5	# provided that the above copyright notice and this permission notice
6	# appear in all copies.
7	#
8	# THE SOFTWARE IS PROVIDED "AS IS" AND NOMINUM DISCLAIMS ALL WARRANTIES
9	# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10	# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL NOMINUM BE LIABLE FOR
11	# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12	# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13	# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
14	# OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15
16	"""Tokenize DNS master file format"""
17
18	import cStringIO
19	import sys
20
21	import dns.exception
22	import dns.name
23	import dns.ttl
24
# Characters that terminate a token outside of a quoted string.  The
# tokenizer only tests membership, so the mapped values carry no meaning.
_DELIMITERS = dict.fromkeys(' \t\n;()"', True)

# Inside a quoted string only the closing double quote terminates the token.
_QUOTING_DELIMITERS = dict.fromkeys('"', True)

# Token type codes stored in Token.ttype.
(EOF,
 EOL,
 WHITESPACE,
 IDENTIFIER,
 QUOTED_STRING,
 COMMENT,
 DELIMITER) = range(7)
43
class UngetBufferFull(dns.exception.DNSException):
    """An unget was attempted while the single-slot unget buffer was
    already occupied."""
48
class Token(object):
    """A DNS master file format token.

    @ivar ttype: The token type
    @type ttype: int
    @ivar value: The token value
    @type value: string
    @ivar has_escape: Does the token value contain escapes?
    @type has_escape: bool
    """

    def __init__(self, ttype, value='', has_escape=False):
        """Initialize a token instance.

        @param ttype: The token type
        @type ttype: int
        @param value: The token value
        @type value: string
        @param has_escape: Does the token value contain escapes?
        @type has_escape: bool
        """
        self.ttype = ttype
        self.value = value
        self.has_escape = has_escape

    def is_eof(self):
        """Is this an EOF token?"""
        return self.ttype == EOF

    def is_eol(self):
        """Is this an EOL token?"""
        return self.ttype == EOL

    def is_whitespace(self):
        """Is this a WHITESPACE token?"""
        return self.ttype == WHITESPACE

    def is_identifier(self):
        """Is this an IDENTIFIER token?"""
        return self.ttype == IDENTIFIER

    def is_quoted_string(self):
        """Is this a QUOTED_STRING token?"""
        return self.ttype == QUOTED_STRING

    def is_comment(self):
        """Is this a COMMENT token?"""
        return self.ttype == COMMENT

    def is_delimiter(self):
        """Is this a DELIMITER token?"""
        return self.ttype == DELIMITER

    def is_eol_or_eof(self):
        """Is this either an EOL or an EOF token?"""
        return (self.ttype == EOL or self.ttype == EOF)

    def __eq__(self, other):
        """Tokens are equal when both type and value match; has_escape
        is not part of the comparison."""
        if not isinstance(other, Token):
            return False
        return (self.ttype == other.ttype and
                self.value == other.value)

    def __ne__(self, other):
        """Exact negation of L{__eq__}."""
        return not self.__eq__(other)

    def __str__(self):
        return '%d "%s"' % (self.ttype, self.value)

    def unescape(self):
        """Return a token whose value has its backslash escapes resolved.

        A backslash followed by three decimal digits is replaced by the
        character with that decimal code; a backslash followed by any
        other character is replaced by that character.  If the token has
        no escapes, the token itself is returned.

        @rtype: Token object
        @raises dns.exception.UnexpectedEnd: the value ends in the middle
        of an escape
        @raises dns.exception.SyntaxError: a decimal escape is not three
        digits long or is greater than 255
        """
        if not self.has_escape:
            return self
        value = self.value
        length = len(value)
        pieces = []
        i = 0
        while i < length:
            c = value[i]
            i += 1
            if c == '\\':
                if i >= length:
                    raise dns.exception.UnexpectedEnd
                c = value[i]
                i += 1
                if c.isdigit():
                    if i >= length:
                        raise dns.exception.UnexpectedEnd
                    c2 = value[i]
                    i += 1
                    if i >= length:
                        raise dns.exception.UnexpectedEnd
                    c3 = value[i]
                    i += 1
                    if not (c2.isdigit() and c3.isdigit()):
                        raise dns.exception.SyntaxError
                    codepoint = int(c) * 100 + int(c2) * 10 + int(c3)
                    # \DDD denotes a single octet; reject larger values
                    # here rather than letting chr() fail (or succeed
                    # with a non-octet character).
                    if codepoint > 255:
                        raise dns.exception.SyntaxError
                    c = chr(codepoint)
            pieces.append(c)
        return Token(self.ttype, ''.join(pieces))

    # compatibility for old-style tuple tokens

    def __len__(self):
        return 2

    def __iter__(self):
        return iter((self.ttype, self.value))

    def __getitem__(self, i):
        if i == 0:
            return self.ttype
        elif i == 1:
            return self.value
        else:
            raise IndexError
157
class Tokenizer(object):
    """A DNS master file format tokenizer.

    Tokens are returned as L{Token} objects.  The valid token types are
    EOF, EOL, WHITESPACE, IDENTIFIER, QUOTED_STRING, COMMENT, and
    DELIMITER.

    @ivar file: The file to tokenize
    @type file: file
    @ivar ungotten_char: The most recently ungotten character, or None.
    @type ungotten_char: string
    @ivar ungotten_token: The most recently ungotten token, or None.
    @type ungotten_token: Token object
    @ivar multiline: The current multiline level.  This value is increased
    by one every time a '(' delimiter is read, and decreased by one every
    time a ')' delimiter is read.
    @type multiline: int
    @ivar quoting: This variable is true if the tokenizer is currently
    reading a quoted string.
    @type quoting: bool
    @ivar eof: This variable is true if the tokenizer has encountered EOF.
    @type eof: bool
    @ivar delimiters: The current delimiter dictionary.
    @type delimiters: dict
    @ivar line_number: The current line number
    @type line_number: int
    @ivar filename: A filename that will be returned by the L{where} method.
    @type filename: string
    """

    def __init__(self, f=sys.stdin, filename=None):
        """Initialize a tokenizer instance.

        @param f: The file to tokenize.  The default is sys.stdin.
        This parameter may also be a string, in which case the tokenizer
        will take its input from the contents of the string.
        @type f: file or string
        @param filename: the name of the filename that the L{where} method
        will return.
        @type filename: string
        """

        if isinstance(f, str):
            f = cStringIO.StringIO(f)
            if filename is None:
                filename = '<string>'
        else:
            if filename is None:
                if f is sys.stdin:
                    filename = '<stdin>'
                else:
                    filename = '<file>'
        self.file = f
        self.ungotten_char = None
        self.ungotten_token = None
        self.multiline = 0
        self.quoting = False
        self.eof = False
        self.delimiters = _DELIMITERS
        self.line_number = 1
        self.filename = filename

    def _get_char(self):
        """Read a character from input.

        An ungotten character is returned first if there is one.  At end
        of input the empty string is returned.  The line number is
        advanced when a newline is read from the file (not when it is
        re-read from the unget buffer).

        @rtype: string
        """

        if self.ungotten_char is None:
            if self.eof:
                c = ''
            else:
                c = self.file.read(1)
                if c == '':
                    self.eof = True
                elif c == '\n':
                    self.line_number += 1
        else:
            c = self.ungotten_char
            self.ungotten_char = None
        return c

    def where(self):
        """Return the current location in the input.

        @rtype: (string, int) tuple.  The first item is the filename of
        the input, the second is the current line number.
        """

        return (self.filename, self.line_number)

    def _unget_char(self, c):
        """Unget a character.

        The unget buffer for characters is only one character large; it is
        an error to try to unget a character when the unget buffer is not
        empty.

        @param c: the character to unget
        @type c: string
        @raises UngetBufferFull: there is already an ungotten char
        """

        if self.ungotten_char is not None:
            raise UngetBufferFull
        self.ungotten_char = c

    def skip_whitespace(self):
        """Consume input until a non-whitespace character is encountered.

        The non-whitespace character is then ungotten, and the number of
        whitespace characters consumed is returned.

        If the tokenizer is in multiline mode, then newlines are whitespace.

        @rtype: int
        """

        skipped = 0
        while True:
            c = self._get_char()
            if c != ' ' and c != '\t':
                if (c != '\n') or not self.multiline:
                    self._unget_char(c)
                    return skipped
            skipped += 1

    def get(self, want_leading = False, want_comment = False):
        """Get the next token.

        @param want_leading: If True, return a WHITESPACE token if the
        first character read is whitespace.  The default is False.
        @type want_leading: bool
        @param want_comment: If True, return a COMMENT token if the
        first token read is a comment.  The default is False.
        @type want_comment: bool
        @rtype: Token object
        @raises dns.exception.UnexpectedEnd: input ended prematurely
        @raises dns.exception.SyntaxError: input was badly formed
        """

        if self.ungotten_token is not None:
            token = self.ungotten_token
            self.ungotten_token = None
            # An ungotten whitespace or comment token is only handed back
            # if the caller asked for that kind; otherwise it is dropped
            # and the next token is read from the input.
            if token.is_whitespace():
                if want_leading:
                    return token
            elif token.is_comment():
                if want_comment:
                    return token
            else:
                return token
        skipped = self.skip_whitespace()
        if want_leading and skipped > 0:
            return Token(WHITESPACE, ' ')
        token = ''
        ttype = IDENTIFIER
        has_escape = False
        while True:
            c = self._get_char()
            if c == '' or c in self.delimiters:
                if c == '' and self.quoting:
                    raise dns.exception.UnexpectedEnd
                if token == '' and ttype != QUOTED_STRING:
                    if c == '(':
                        self.multiline += 1
                        self.skip_whitespace()
                        continue
                    elif c == ')':
                        if not self.multiline > 0:
                            raise dns.exception.SyntaxError
                        self.multiline -= 1
                        self.skip_whitespace()
                        continue
                    elif c == '"':
                        if not self.quoting:
                            # Opening quote: switch to quoted-string mode.
                            self.quoting = True
                            self.delimiters = _QUOTING_DELIMITERS
                            ttype = QUOTED_STRING
                            continue
                        else:
                            # Closing quote left over from the previous
                            # call (it was ungotten when the quoted
                            # string token was returned).
                            self.quoting = False
                            self.delimiters = _DELIMITERS
                            self.skip_whitespace()
                            continue
                    elif c == '\n':
                        return Token(EOL, '\n')
                    elif c == ';':
                        while 1:
                            c = self._get_char()
                            if c == '\n' or c == '':
                                break
                            token += c
                        if want_comment:
                            self._unget_char(c)
                            return Token(COMMENT, token)
                        elif c == '':
                            if self.multiline:
                                raise dns.exception.SyntaxError('unbalanced parentheses')
                            return Token(EOF)
                        elif self.multiline:
                            self.skip_whitespace()
                            token = ''
                            continue
                        else:
                            return Token(EOL, '\n')
                    else:
                        # This code exists in case we ever want a
                        # delimiter to be returned.  It never produces
                        # a token currently.
                        token = c
                        ttype = DELIMITER
                else:
                    # The delimiter ends the token being built; leave it
                    # for the next call.
                    self._unget_char(c)
                break
            elif self.quoting:
                if c == '\\':
                    c = self._get_char()
                    if c == '':
                        raise dns.exception.UnexpectedEnd
                    if c.isdigit():
                        c2 = self._get_char()
                        if c2 == '':
                            raise dns.exception.UnexpectedEnd
                        c3 = self._get_char()
                        # Check the character just read (c3); c is
                        # already known to be a digit at this point.
                        if c3 == '':
                            raise dns.exception.UnexpectedEnd
                        if not (c2.isdigit() and c3.isdigit()):
                            raise dns.exception.SyntaxError
                        codepoint = int(c) * 100 + int(c2) * 10 + int(c3)
                        # \DDD denotes a single octet; reject larger
                        # values instead of passing them to chr().
                        if codepoint > 255:
                            raise dns.exception.SyntaxError
                        c = chr(codepoint)
                elif c == '\n':
                    raise dns.exception.SyntaxError('newline in quoted string')
            elif c == '\\':
                #
                # It's an escape.  Put it and the next character into
                # the token; it will be checked later for goodness.
                #
                token += c
                has_escape = True
                c = self._get_char()
                if c == '' or c == '\n':
                    raise dns.exception.UnexpectedEnd
            token += c
        if token == '' and ttype != QUOTED_STRING:
            if self.multiline:
                raise dns.exception.SyntaxError('unbalanced parentheses')
            ttype = EOF
        return Token(ttype, token, has_escape)

    def unget(self, token):
        """Unget a token.

        The unget buffer for tokens is only one token large; it is
        an error to try to unget a token when the unget buffer is not
        empty.

        @param token: the token to unget
        @type token: Token object
        @raises UngetBufferFull: there is already an ungotten token
        """

        if self.ungotten_token is not None:
            raise UngetBufferFull
        self.ungotten_token = token

    def next(self):
        """Return the next item in an iteration.

        Iteration stops when an EOF token is read.

        @rtype: Token object
        """

        token = self.get()
        if token.is_eof():
            raise StopIteration
        return token

    def __iter__(self):
        return self

    # Helpers

    def get_int(self):
        """Read the next token and interpret it as an integer.

        @raises dns.exception.SyntaxError: the token is not an identifier
        consisting only of digits
        @rtype: int
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        if not token.value.isdigit():
            raise dns.exception.SyntaxError('expecting an integer')
        return int(token.value)

    def get_uint8(self):
        """Read the next token and interpret it as an 8-bit unsigned
        integer.

        @raises dns.exception.SyntaxError: the token is not an integer in
        the range 0..255
        @rtype: int
        """

        value = self.get_int()
        if value < 0 or value > 255:
            raise dns.exception.SyntaxError('%d is not an unsigned 8-bit integer' % value)
        return value

    def get_uint16(self):
        """Read the next token and interpret it as a 16-bit unsigned
        integer.

        @raises dns.exception.SyntaxError: the token is not an integer in
        the range 0..65535
        @rtype: int
        """

        value = self.get_int()
        if value < 0 or value > 65535:
            raise dns.exception.SyntaxError('%d is not an unsigned 16-bit integer' % value)
        return value

    def get_uint32(self):
        """Read the next token and interpret it as a 32-bit unsigned
        integer.

        @raises dns.exception.SyntaxError: the token is not an integer in
        the range 0..4294967295
        @rtype: int
        """

        value = self.get_int()
        # The largest unsigned 32-bit value is 2**32 - 1.
        if value < 0 or value > 4294967295:
            raise dns.exception.SyntaxError('%d is not an unsigned 32-bit integer' % value)
        return value

    def get_string(self, origin=None):
        """Read the next token and interpret it as a string.

        @param origin: unused by this method
        @raises dns.exception.SyntaxError: the token is neither an
        identifier nor a quoted string
        @rtype: string
        """

        token = self.get().unescape()
        if not (token.is_identifier() or token.is_quoted_string()):
            raise dns.exception.SyntaxError('expecting a string')
        return token.value

    def get_identifier(self, origin=None):
        """Read the next token and raise an exception if it is not an
        identifier.

        @param origin: unused by this method
        @raises dns.exception.SyntaxError: the token is not an identifier
        @rtype: string
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        return token.value

    def get_name(self, origin=None):
        """Read the next token and interpret it as a DNS name.

        The token text is passed, still escaped, to dns.name.from_text
        together with I{origin}.

        @raises dns.exception.SyntaxError: the token is not an identifier
        @rtype: dns.name.Name object"""

        token = self.get()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        return dns.name.from_text(token.value, origin)

    def get_eol(self):
        """Read the next token and raise an exception if it isn't EOL or
        EOF.

        @raises dns.exception.SyntaxError: the token is neither EOL nor EOF
        @rtype: string
        """

        token = self.get()
        if not token.is_eol_or_eof():
            raise dns.exception.SyntaxError('expected EOL or EOF, got %d "%s"' % (token.ttype, token.value))
        return token.value

    def get_ttl(self):
        """Read the next token and convert it with dns.ttl.from_text.

        @raises dns.exception.SyntaxError: the token is not an identifier
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        return dns.ttl.from_text(token.value)

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/server/lib/dnspython/dns/tokenizer.py

Download in other formats: