Context Navigation

← Previous Revision
Next Revision →
Normal
Revision Log

feedparser.py

Last change on this file was 391, checked in by dmik, 11 years ago
python: Merge vendor 2.7.6 to trunk.
Property svn:eol-style set to `native`
File size: 20.1 KB

Rev	Line
[2]	1	# Copyright (C) 2004-2006 Python Software Foundation
	2	# Authors: Baxter, Wouters and Warsaw
	3	# Contact: email-sig@python.org
	4
	5	"""FeedParser - An email feed parser.
	6
	7	The feed parser implements an interface for incrementally parsing an email
	8	message, line by line. This has advantages for certain applications, such as
	9	those reading email messages off a socket.
	10
	11	FeedParser.feed() is the primary interface for pushing new data into the
	12	parser. It returns when there's nothing more it can do with the available
	13	data. When you have no more data to push into the parser, call .close().
	14	This completes the parsing and returns the root message object.
	15
[391]	16	The other advantage of this parser is that it will never raise a parsing
[2]	17	exception. Instead, when it finds something unexpected, it adds a 'defect' to
	18	the current message. Defects are just instances that live on the message
	19	object's .defects attribute.
	20	"""
	21
	22	__all__ = ['FeedParser']
	23
	24	import re
	25
	26	from email import errors
	27	from email import message
	28
	29	NLCRE = re.compile('\r\n\|\r\|\n')
	30	NLCRE_bol = re.compile('(\r\n\|\r\|\n)')
[391]	31	NLCRE_eol = re.compile('(\r\n\|\r\|\n)\Z')
[2]	32	NLCRE_crack = re.compile('(\r\n\|\r\|\n)')
	33	# RFC 2822 $3.6.8 Optional fields. ftext is %d33-57 / %d59-126, Any character
	34	# except controls, SP, and ":".
	35	headerRE = re.compile(r'^(From \|[\041-\071\073-\176]{1,}:\|[\t ])')
	36	EMPTYSTRING = ''
	37	NL = '\n'
	38
	39	NeedMoreData = object()
	40
	41
	42
	43
	44	class BufferedSubFile(object):
	45	"""A file-ish object that can have new data loaded into it.
	46
	47	You can also push and pop line-matching predicates onto a stack. When the
	48	current predicate matches the current line, a false EOF response
	49	(i.e. empty string) is returned instead. This lets the parser adhere to a
	50	simple abstraction -- it parses until EOF closes the current message.
	51	"""
	52	def __init__(self):
	53	# The last partial line pushed into this object.
	54	self._partial = ''
	55	# The list of full, pushed lines, in reverse order
	56	self._lines = []
	57	# The stack of false-EOF checking predicates.
	58	self._eofstack = []
	59	# A flag indicating whether the file has been closed or not.
	60	self._closed = False
	61
	62	def push_eof_matcher(self, pred):
	63	self._eofstack.append(pred)
	64
	65	def pop_eof_matcher(self):
	66	return self._eofstack.pop()
	67
	68	def close(self):
	69	# Don't forget any trailing partial line.
	70	self._lines.append(self._partial)
	71	self._partial = ''
	72	self._closed = True
	73
	74	def readline(self):
	75	if not self._lines:
	76	if self._closed:
	77	return ''
	78	return NeedMoreData
	79	# Pop the line off the stack and see if it matches the current
	80	# false-EOF predicate.
	81	line = self._lines.pop()
	82	# RFC 2046, section 5.1.2 requires us to recognize outer level
	83	# boundaries at any level of inner nesting. Do this, but be sure it's
	84	# in the order of most to least nested.
	85	for ateof in self._eofstack[::-1]:
	86	if ateof(line):
	87	# We're at the false EOF. But push the last line back first.
	88	self._lines.append(line)
	89	return ''
	90	return line
	91
	92	def unreadline(self, line):
	93	# Let the consumer push a line back into the buffer.
	94	assert line is not NeedMoreData
	95	self._lines.append(line)
	96
	97	def push(self, data):
	98	"""Push some new data into this object."""
	99	# Handle any previous leftovers
	100	data, self._partial = self._partial + data, ''
	101	# Crack into lines, but preserve the newlines on the end of each
	102	parts = NLCRE_crack.split(data)
	103	# The ahem interesting behaviour of re.split when supplied grouping
	104	# parentheses is that the last element of the resulting list is the
	105	# data after the final RE. In the case of a NL/CR terminated string,
	106	# this is the empty string.
[391]	107	self._partial = parts.pop()
	108	#GAN 29Mar09 bugs 1555570, 1721862 Confusion at 8K boundary ending with \r:
	109	# is there a \n to follow later?
	110	if not self._partial and parts and parts[-1].endswith('\r'):
[2]	111	self._partial = parts.pop(-2)+parts.pop()
	112	# parts is a list of strings, alternating between the line contents
	113	# and the eol character(s). Gather up a list of lines after
	114	# re-attaching the newlines.
	115	lines = []
	116	for i in range(len(parts) // 2):
	117	lines.append(parts[i2] + parts[i2+1])
	118	self.pushlines(lines)
	119
	120	def pushlines(self, lines):
	121	# Reverse and insert at the front of the lines.
	122	self._lines[:0] = lines[::-1]
	123
	124	def is_closed(self):
	125	return self._closed
	126
	127	def __iter__(self):
	128	return self
	129
	130	def next(self):
	131	line = self.readline()
	132	if line == '':
	133	raise StopIteration
	134	return line
	135
	136
	137
	138
	139	class FeedParser:
	140	"""A feed-style parser of email."""
	141
	142	def __init__(self, _factory=message.Message):
	143	"""_factory is called with no arguments to create a new message obj"""
	144	self._factory = _factory
	145	self._input = BufferedSubFile()
	146	self._msgstack = []
	147	self._parse = self._parsegen().next
	148	self._cur = None
	149	self._last = None
	150	self._headersonly = False
	151
	152	# Non-public interface for supporting Parser's headersonly flag
	153	def _set_headersonly(self):
	154	self._headersonly = True
	155
	156	def feed(self, data):
	157	"""Push more data into the parser."""
	158	self._input.push(data)
	159	self._call_parse()
	160
	161	def _call_parse(self):
	162	try:
	163	self._parse()
	164	except StopIteration:
	165	pass
	166
	167	def close(self):
	168	"""Parse all remaining data and return the root message object."""
	169	self._input.close()
	170	self._call_parse()
	171	root = self._pop_message()
	172	assert not self._msgstack
	173	# Look for final set of defects
	174	if root.get_content_maintype() == 'multipart' \
	175	and not root.is_multipart():
	176	root.defects.append(errors.MultipartInvariantViolationDefect())
	177	return root
	178
	179	def _new_message(self):
	180	msg = self._factory()
	181	if self._cur and self._cur.get_content_type() == 'multipart/digest':
	182	msg.set_default_type('message/rfc822')
	183	if self._msgstack:
	184	self._msgstack[-1].attach(msg)
	185	self._msgstack.append(msg)
	186	self._cur = msg
	187	self._last = msg
	188
	189	def _pop_message(self):
	190	retval = self._msgstack.pop()
	191	if self._msgstack:
	192	self._cur = self._msgstack[-1]
	193	else:
	194	self._cur = None
	195	return retval
	196
	197	def _parsegen(self):
	198	# Create a new message and start by parsing headers.
	199	self._new_message()
	200	headers = []
	201	# Collect the headers, searching for a line that doesn't match the RFC
	202	# 2822 header or continuation pattern (including an empty line).
	203	for line in self._input:
	204	if line is NeedMoreData:
	205	yield NeedMoreData
	206	continue
	207	if not headerRE.match(line):
	208	# If we saw the RFC defined header/body separator
	209	# (i.e. newline), just throw it away. Otherwise the line is
	210	# part of the body so push it back.
	211	if not NLCRE.match(line):
	212	self._input.unreadline(line)
	213	break
	214	headers.append(line)
	215	# Done with the headers, so parse them and figure out what we're
	216	# supposed to see in the body of the message.
[391]	217	self._parse_headers(headers)
[2]	218	# Headers-only parsing is a backwards compatibility hack, which was
	219	# necessary in the older parser, which could raise errors. All
	220	# remaining lines in the input are thrown into the message body.
	221	if self._headersonly:
	222	lines = []
	223	while True:
	224	line = self._input.readline()
	225	if line is NeedMoreData:
	226	yield NeedMoreData
	227	continue
	228	if line == '':
	229	break
	230	lines.append(line)
	231	self._cur.set_payload(EMPTYSTRING.join(lines))
	232	return
	233	if self._cur.get_content_type() == 'message/delivery-status':
	234	# message/delivery-status contains blocks of headers separated by
	235	# a blank line. We'll represent each header block as a separate
	236	# nested message object, but the processing is a bit different
	237	# than standard message/* types because there is no body for the
	238	# nested messages. A blank line separates the subparts.
	239	while True:
	240	self._input.push_eof_matcher(NLCRE.match)
	241	for retval in self._parsegen():
	242	if retval is NeedMoreData:
	243	yield NeedMoreData
	244	continue
	245	break
	246	msg = self._pop_message()
	247	# We need to pop the EOF matcher in order to tell if we're at
	248	# the end of the current file, not the end of the last block
	249	# of message headers.
	250	self._input.pop_eof_matcher()
	251	# The input stream must be sitting at the newline or at the
	252	# EOF. We want to see if we're at the end of this subpart, so
	253	# first consume the blank line, then test the next line to see
	254	# if we're at this subpart's EOF.
	255	while True:
	256	line = self._input.readline()
	257	if line is NeedMoreData:
	258	yield NeedMoreData
	259	continue
	260	break
	261	while True:
	262	line = self._input.readline()
	263	if line is NeedMoreData:
	264	yield NeedMoreData
	265	continue
	266	break
	267	if line == '':
	268	break
	269	# Not at EOF so this is a line we're going to need.
	270	self._input.unreadline(line)
	271	return
	272	if self._cur.get_content_maintype() == 'message':
	273	# The message claims to be a message/* type, then what follows is
	274	# another RFC 2822 message.
	275	for retval in self._parsegen():
	276	if retval is NeedMoreData:
	277	yield NeedMoreData
	278	continue
	279	break
	280	self._pop_message()
	281	return
	282	if self._cur.get_content_maintype() == 'multipart':
	283	boundary = self._cur.get_boundary()
	284	if boundary is None:
	285	# The message /claims/ to be a multipart but it has not
	286	# defined a boundary. That's a problem which we'll handle by
	287	# reading everything until the EOF and marking the message as
	288	# defective.
	289	self._cur.defects.append(errors.NoBoundaryInMultipartDefect())
	290	lines = []
	291	for line in self._input:
	292	if line is NeedMoreData:
	293	yield NeedMoreData
	294	continue
	295	lines.append(line)
	296	self._cur.set_payload(EMPTYSTRING.join(lines))
	297	return
	298	# Create a line match predicate which matches the inter-part
	299	# boundary as well as the end-of-multipart boundary. Don't push
	300	# this onto the input stream until we've scanned past the
	301	# preamble.
	302	separator = '--' + boundary
	303	boundaryre = re.compile(
	304	'(?P<sep>' + re.escape(separator) +
	305	r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n\|\r\|\n)?$')
	306	capturing_preamble = True
	307	preamble = []
	308	linesep = False
	309	while True:
	310	line = self._input.readline()
	311	if line is NeedMoreData:
	312	yield NeedMoreData
	313	continue
	314	if line == '':
	315	break
	316	mo = boundaryre.match(line)
	317	if mo:
	318	# If we're looking at the end boundary, we're done with
	319	# this multipart. If there was a newline at the end of
	320	# the closing boundary, then we need to initialize the
	321	# epilogue with the empty string (see below).
	322	if mo.group('end'):
	323	linesep = mo.group('linesep')
	324	break
	325	# We saw an inter-part boundary. Were we in the preamble?
	326	if capturing_preamble:
	327	if preamble:
	328	# According to RFC 2046, the last newline belongs
	329	# to the boundary.
	330	lastline = preamble[-1]
	331	eolmo = NLCRE_eol.search(lastline)
	332	if eolmo:
	333	preamble[-1] = lastline[:-len(eolmo.group(0))]
	334	self._cur.preamble = EMPTYSTRING.join(preamble)
	335	capturing_preamble = False
	336	self._input.unreadline(line)
	337	continue
	338	# We saw a boundary separating two parts. Consume any
	339	# multiple boundary lines that may be following. Our
	340	# interpretation of RFC 2046 BNF grammar does not produce
	341	# body parts within such double boundaries.
	342	while True:
	343	line = self._input.readline()
	344	if line is NeedMoreData:
	345	yield NeedMoreData
	346	continue
	347	mo = boundaryre.match(line)
	348	if not mo:
	349	self._input.unreadline(line)
	350	break
	351	# Recurse to parse this subpart; the input stream points
	352	# at the subpart's first line.
	353	self._input.push_eof_matcher(boundaryre.match)
	354	for retval in self._parsegen():
	355	if retval is NeedMoreData:
	356	yield NeedMoreData
	357	continue
	358	break
	359	# Because of RFC 2046, the newline preceding the boundary
	360	# separator actually belongs to the boundary, not the
	361	# previous subpart's payload (or epilogue if the previous
	362	# part is a multipart).
	363	if self._last.get_content_maintype() == 'multipart':
	364	epilogue = self._last.epilogue
	365	if epilogue == '':
	366	self._last.epilogue = None
	367	elif epilogue is not None:
	368	mo = NLCRE_eol.search(epilogue)
	369	if mo:
	370	end = len(mo.group(0))
	371	self._last.epilogue = epilogue[:-end]
	372	else:
	373	payload = self._last.get_payload()
	374	if isinstance(payload, basestring):
	375	mo = NLCRE_eol.search(payload)
	376	if mo:
	377	payload = payload[:-len(mo.group(0))]
	378	self._last.set_payload(payload)
	379	self._input.pop_eof_matcher()
	380	self._pop_message()
	381	# Set the multipart up for newline cleansing, which will
	382	# happen if we're in a nested multipart.
	383	self._last = self._cur
	384	else:
	385	# I think we must be in the preamble
	386	assert capturing_preamble
	387	preamble.append(line)
	388	# We've seen either the EOF or the end boundary. If we're still
	389	# capturing the preamble, we never saw the start boundary. Note
	390	# that as a defect and store the captured text as the payload.
	391	# Everything from here to the EOF is epilogue.
	392	if capturing_preamble:
	393	self._cur.defects.append(errors.StartBoundaryNotFoundDefect())
	394	self._cur.set_payload(EMPTYSTRING.join(preamble))
	395	epilogue = []
	396	for line in self._input:
	397	if line is NeedMoreData:
	398	yield NeedMoreData
	399	continue
	400	self._cur.epilogue = EMPTYSTRING.join(epilogue)
	401	return
	402	# If the end boundary ended in a newline, we'll need to make sure
	403	# the epilogue isn't None
	404	if linesep:
	405	epilogue = ['']
	406	else:
	407	epilogue = []
	408	for line in self._input:
	409	if line is NeedMoreData:
	410	yield NeedMoreData
	411	continue
	412	epilogue.append(line)
	413	# Any CRLF at the front of the epilogue is not technically part of
	414	# the epilogue. Also, watch out for an empty string epilogue,
	415	# which means a single newline.
	416	if epilogue:
	417	firstline = epilogue[0]
	418	bolmo = NLCRE_bol.match(firstline)
	419	if bolmo:
	420	epilogue[0] = firstline[len(bolmo.group(0)):]
	421	self._cur.epilogue = EMPTYSTRING.join(epilogue)
	422	return
	423	# Otherwise, it's some non-multipart type, so the entire rest of the
	424	# file contents becomes the payload.
	425	lines = []
	426	for line in self._input:
	427	if line is NeedMoreData:
	428	yield NeedMoreData
	429	continue
	430	lines.append(line)
	431	self._cur.set_payload(EMPTYSTRING.join(lines))
	432
	433	def _parse_headers(self, lines):
	434	# Passed a list of lines that make up the headers for the current msg
	435	lastheader = ''
	436	lastvalue = []
	437	for lineno, line in enumerate(lines):
	438	# Check for continuation
	439	if line[0] in ' \t':
	440	if not lastheader:
	441	# The first line of the headers was a continuation. This
	442	# is illegal, so let's note the defect, store the illegal
	443	# line, and ignore it for purposes of headers.
	444	defect = errors.FirstHeaderLineIsContinuationDefect(line)
	445	self._cur.defects.append(defect)
	446	continue
	447	lastvalue.append(line)
	448	continue
	449	if lastheader:
	450	# XXX reconsider the joining of folded lines
	451	lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n')
	452	self._cur[lastheader] = lhdr
	453	lastheader, lastvalue = '', []
	454	# Check for envelope header, i.e. unix-from
	455	if line.startswith('From '):
	456	if lineno == 0:
	457	# Strip off the trailing newline
	458	mo = NLCRE_eol.search(line)
	459	if mo:
	460	line = line[:-len(mo.group(0))]
	461	self._cur.set_unixfrom(line)
	462	continue
	463	elif lineno == len(lines) - 1:
	464	# Something looking like a unix-from at the end - it's
	465	# probably the first line of the body, so push back the
	466	# line and stop.
	467	self._input.unreadline(line)
	468	return
	469	else:
	470	# Weirdly placed unix-from line. Note this as a defect
	471	# and ignore it.
	472	defect = errors.MisplacedEnvelopeHeaderDefect(line)
	473	self._cur.defects.append(defect)
	474	continue
	475	# Split the line on the colon separating field name from value.
	476	i = line.find(':')
	477	if i < 0:
	478	defect = errors.MalformedHeaderDefect(line)
	479	self._cur.defects.append(defect)
	480	continue
	481	lastheader = line[:i]
	482	lastvalue = [line[i+1:].lstrip()]
	483	# Done with all the lines, so handle the last header.
	484	if lastheader:
	485	# XXX reconsider the joining of folded lines
	486	self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Lib/email/feedparser.py

Download in other formats: