Context Navigation

feedparser.py

Last change on this file was 2, checked in by Yuri Dario, 15 years ago
Initial import for vendor code.
Property svn:eol-style set to `native`
File size: 19.9 KB

Line
1	# Copyright (C) 2004-2006 Python Software Foundation
2	# Authors: Baxter, Wouters and Warsaw
3	# Contact: email-sig@python.org
4
5	"""FeedParser - An email feed parser.
6
7	The feed parser implements an interface for incrementally parsing an email
8	message, line by line. This has advantages for certain applications, such as
9	those reading email messages off a socket.
10
11	FeedParser.feed() is the primary interface for pushing new data into the
12	parser. It returns when there's nothing more it can do with the available
13	data. When you have no more data to push into the parser, call .close().
14	This completes the parsing and returns the root message object.
15
16	The other advantage of this parser is that it will never throw a parsing
17	exception. Instead, when it finds something unexpected, it adds a 'defect' to
18	the current message. Defects are just instances that live on the message
19	object's .defects attribute.
20	"""
21
22	__all__ = ['FeedParser']
23
24	import re
25
26	from email import errors
27	from email import message
28
# Line-ending patterns.  Alternation order matters: '\r\n' must be tried
# before a lone '\r' so that a CRLF pair is treated as one terminator.
NLCRE = re.compile(r'\r\n|\r|\n')
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
NLCRE_eol = re.compile(r'(\r\n|\r|\n)$')
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
# RFC 2822 $3.6.8 Optional fields. ftext is %d33-57 / %d59-126, Any character
# except controls, SP, and ":".
headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Sentinel returned by BufferedSubFile.readline() when no complete line is
# buffered and the file has not been closed yet.
NeedMoreData = object()



class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack. When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead. This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """
    def __init__(self):
        # The last partial line pushed into this object.
        self._partial = ''
        # The list of full, pushed lines, in reverse order (the next line to
        # be read is at the end of the list).
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push a predicate; a line for which it is true reads as EOF."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Mark the file closed, flushing any trailing partial line."""
        # Don't forget any trailing partial line.  It is the *last* line of
        # the input, so it must be queued behind every line that is still
        # waiting to be read rather than in front of them.
        if self._partial:
            self.pushlines([self._partial])
        self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData."""
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting. Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in self._eofstack[::-1]:
            if ateof(line):
                # We're at the false EOF. But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Push a line back so that the next readline() returns it."""
        # Let the consumer push a line back into the buffer.
        assert line is not NeedMoreData
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Handle any previous leftovers
        data, self._partial = self._partial + data, ''
        # Crack into lines, but preserve the newlines on the end of each
        parts = NLCRE_crack.split(data)
        # The ahem interesting behaviour of re.split when supplied grouping
        # parentheses is that the last element of the resulting list is the
        # data after the final RE. In the case of a NL/CR terminated string,
        # this is the empty string.
        self._partial = parts.pop()
        # If the chunk ended in a bare '\r' we cannot yet tell whether it is
        # a complete line ending or the first half of a '\r\n' pair split
        # across two push() calls.  Hold that line back as the partial until
        # more data (or close()) settles the question.
        if not self._partial and parts and parts[-1] == '\r':
            self._partial = parts.pop(-2) + parts.pop()
        # parts is a list of strings, alternating between the line contents
        # and the eol character(s). Gather up a list of lines after
        # re-attaching the newlines.
        lines = [parts[i] + parts[i + 1] for i in range(0, len(parts), 2)]
        self.pushlines(lines)

    def pushlines(self, lines):
        """Queue complete lines behind those already waiting to be read."""
        # Reverse and insert at the front of the lines.
        self._lines[:0] = lines[::-1]

    def is_closed(self):
        """Return True once close() has been called."""
        return self._closed

    def __iter__(self):
        return self

    def next(self):
        # Python 2 iterator protocol.  NeedMoreData is passed through to the
        # caller; only the empty string (EOF or false EOF) ends iteration.
        line = self.readline()
        if line == '':
            raise StopIteration
        return line
131
132
133
134
class FeedParser:
    """A feed-style parser of email."""

    def __init__(self, _factory=message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        self._input = BufferedSubFile()
        # Stack of messages currently being parsed; the root is at index 0.
        self._msgstack = []
        # Bound .next of the parsing generator (Python 2 generator protocol).
        # The generator body does not start running until the first call.
        self._parse = self._parsegen().next
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Advance the parsing generator by one step."""
        try:
            self._parse()
        except StopIteration:
            # The generator has finished; there is nothing left to parse.
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            root.defects.append(errors.MultipartInvariantViolationDefect())
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        msg = self._factory()
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the innermost message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message (recursing for subparts).

        Yields NeedMoreData whenever the input buffer runs dry and finishes
        when the message's (possibly false) EOF has been reached.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could throw errors. All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line. We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages. A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF. We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary. That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(errors.NoBoundaryInMultipartDefect())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary. Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart. If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary. Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts. Consume any
                    # multiple boundary lines that may be following. Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, basestring):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary. If we're still
            # capturing the preamble, we never saw the start boundary. Note
            # that as a defect and store the captured text as the payload.
            # Everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(errors.StartBoundaryNotFoundDefect())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                epilogue = []
                # The remaining lines are consumed but not stored, so the
                # epilogue ends up as the empty string.
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue. Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the given raw header lines on the current message.

        Folded continuation lines are joined onto the header they belong to;
        malformed or misplaced lines are recorded as defects on the message.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation. This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                # XXX reconsider the joining of folded lines
                lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n')
                self._cur[lastheader] = lhdr
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line. Note this as a defect
                    # and ignore it.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = errors.MalformedHeaderDefect(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/vendor/Python-2.6.5/Lib/email/feedparser.py

Download in other formats: