Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

feedparser.py

Last change on this file was 388, checked in by dmik, 11 years ago
python: Update vendor to 2.7.6.
Property svn:eol-style set to `native`
File size: 20.1 KB

Line
1	# Copyright (C) 2004-2006 Python Software Foundation
2	# Authors: Baxter, Wouters and Warsaw
3	# Contact: email-sig@python.org
4
5	"""FeedParser - An email feed parser.
6
7	The feed parser implements an interface for incrementally parsing an email
8	message, line by line. This has advantages for certain applications, such as
9	those reading email messages off a socket.
10
11	FeedParser.feed() is the primary interface for pushing new data into the
12	parser. It returns when there's nothing more it can do with the available
13	data. When you have no more data to push into the parser, call .close().
14	This completes the parsing and returns the root message object.
15
16	The other advantage of this parser is that it will never raise a parsing
17	exception. Instead, when it finds something unexpected, it adds a 'defect' to
18	the current message. Defects are just instances that live on the message
19	object's .defects attribute.
20	"""
21
22	__all__ = ['FeedParser']
23
24	import re
25
26	from email import errors
27	from email import message
28
# Line-ending patterns.  Each one accepts CRLF, a lone CR, or a lone LF.
# NLCRE matches a line ending at the current position, NLCRE_bol and
# NLCRE_crack capture it as group 1 (so re.split keeps the separators), and
# NLCRE_eol only matches a line ending at the very end of the string.
NLCRE = re.compile(r'\r\n|\r|\n')
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
# RFC 2822 $3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, Any character
# except controls, SP, and ":".
# The pattern accepts three kinds of line: a unix-from line ("From "), a
# "name:" header field, or a continuation line starting with TAB or SP.
headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Unique sentinel meaning "no complete line is available yet"; it is compared
# by identity ("is NeedMoreData"), never by value.
NeedMoreData = object()
40
41
42
43
class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """
    def __init__(self):
        # The last partial line pushed into this object.
        self._partial = ''
        # The list of full, pushed lines, in reverse order
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push `pred`, a callable taking a line, onto the false-EOF stack."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Mark the input as complete, flushing any trailing partial line."""
        # Don't forget any trailing partial line.  It is the newest data, so
        # it must be queued behind every line that is still unread; an empty
        # partial is skipped because an empty string reads as EOF.
        if self._partial:
            self.pushlines([self._partial])
        self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next line, '' at (real or false) EOF, or NeedMoreData.

        NeedMoreData is returned when no line is buffered and the object has
        not been closed yet.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in self._eofstack[::-1]:
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Push `line` back so that it is the next line readline() sees."""
        # Let the consumer push a line back into the buffer.
        assert line is not NeedMoreData
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Handle any previous leftovers
        data, self._partial = self._partial + data, ''
        # Crack into lines, but preserve the newlines on the end of each
        parts = NLCRE_crack.split(data)
        # The *ahem* interesting behaviour of re.split when supplied grouping
        # parentheses is that the last element of the resulting list is the
        # data after the final RE.  In the case of a NL/CR terminated string,
        # this is the empty string.
        self._partial = parts.pop()
        #GAN 29Mar09  bugs 1555570, 1721862  Confusion at 8K boundary ending with \r:
        # is there a \n to follow later?
        if not self._partial and parts and parts[-1].endswith('\r'):
            # Hold the whole last line back as the partial (content first,
            # then its '\r') until more data shows whether a '\n' follows.
            self._partial = parts.pop(-2)+parts.pop()
        # parts is a list of strings, alternating between the line contents
        # and the eol character(s).  Gather up a list of lines after
        # re-attaching the newlines.
        lines = [parts[i * 2] + parts[i * 2 + 1]
                 for i in range(len(parts) // 2)]
        self.pushlines(lines)

    def pushlines(self, lines):
        """Queue `lines` (oldest first) behind all currently buffered lines."""
        # Reverse and insert at the front of the lines.
        self._lines[:0] = lines[::-1]

    def is_closed(self):
        """Return True once close() has been called."""
        return self._closed

    def __iter__(self):
        return self

    def next(self):
        """Iterator step: return readline()'s result, stopping on ''.

        NeedMoreData is passed through to the caller as an ordinary item.
        """
        line = self.readline()
        if line == '':
            raise StopIteration
        return line
135
136
137
138
class FeedParser:
    """A feed-style parser of email."""

    def __init__(self, _factory=message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        # Line buffer the parser generator reads from.
        self._input = BufferedSubFile()
        # Stack of messages being built; the root message is at index 0.
        self._msgstack = []
        # Bound .next of the parser generator; each call runs the parser
        # until it needs more data or finishes.
        self._parse = self._parsegen().next
        # Message on top of the stack, and the most recently created message.
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Advance the parser generator, ignoring its normal exhaustion."""
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            root.defects.append(errors.MultipartInvariantViolationDefect())
        return root

    def _new_message(self):
        """Create a message, attach it to the current one, and make it current."""
        msg = self._factory()
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the current message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message (headers, then body) from input.

        It yields NeedMoreData whenever the input buffer runs dry and is
        resumed after more data has been fed.  It calls itself recursively
        for nested messages and multipart subparts.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could raise errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                msg = self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(errors.NoBoundaryInMultipartDefect())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, basestring):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            # Everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(errors.StartBoundaryNotFoundDefect())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                epilogue = []
                # NOTE(review): this loop drains the input without collecting
                # it, so the epilogue stored below is always the empty string.
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header `lines` on the current message.

        Continuation lines are folded into the preceding header's value.
        Malformed lines are recorded on the message's defects list instead of
        raising.  A trailing unix-from-looking line is pushed back onto the
        input as probable body text.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                # XXX reconsider the joining of folded lines
                lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n')
                self._cur[lastheader] = lhdr
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = errors.MalformedHeaderDefect(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/vendor/Python-2.7.6/Lib/email/feedparser.py

Download in other formats: