# Source: python/trunk/Lib/email/feedparser.py @ 1538
# Last change on this file since 1538 was 391, checked in by dmik, 12 years ago:
#     python: Merge vendor 2.7.6 to trunk.
# Property svn:eol-style set to native.
# File size: 20.1 KB
Line 
# Copyright (C) 2004-2006 Python Software Foundation
# Authors: Baxter, Wouters and Warsaw
# Contact: email-sig@python.org

"""FeedParser - An email feed parser.

The feed parser implements an interface for incrementally parsing an email
message, line by line.  This has advantages for certain applications, such as
those reading email messages off a socket.

FeedParser.feed() is the primary interface for pushing new data into the
parser.  It returns when there's nothing more it can do with the available
data.  When you have no more data to push into the parser, call .close().
This completes the parsing and returns the root message object.

The other advantage of this parser is that it will never raise a parsing
exception.  Instead, when it finds something unexpected, it adds a 'defect' to
the current message.  Defects are just instances that live on the message
object's .defects attribute.
"""
21
22__all__ = ['FeedParser']
23
24import re
25
26from email import errors
27from email import message
28
# Line-ending patterns.  All of them recognise the three conventions
# CRLF, bare CR and bare LF.  Raw strings are used so that regex escapes
# such as \Z reach the re module untouched instead of depending on Python
# passing unknown string escapes through.
#
# Matches a line ending at the current position (used with .match()).
NLCRE = re.compile(r'\r\n|\r|\n')
# Same, but capturing; used with .match() to strip a leading line ending.
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
# A line ending that is the very last thing in the string.
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
# Capturing form used with .split() so the separators are kept in the result.
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
# RFC 2822 section 3.6.8, Optional fields: ftext is %d33-57 / %d59-126, i.e.
# any character except controls, SP, and ":".  The octal ranges below are
# exactly those (041-071 == 33-57, 073-176 == 59-126).  The pattern also
# accepts a unix-from line ("From ") and a continuation line (leading
# tab or space).
headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Unique sentinel meaning "no complete line is available yet".
NeedMoreData = object()
40
41
42
43
44class BufferedSubFile(object):
45 """A file-ish object that can have new data loaded into it.
46
47 You can also push and pop line-matching predicates onto a stack. When the
48 current predicate matches the current line, a false EOF response
49 (i.e. empty string) is returned instead. This lets the parser adhere to a
50 simple abstraction -- it parses until EOF closes the current message.
51 """
52 def __init__(self):
53 # The last partial line pushed into this object.
54 self._partial = ''
55 # The list of full, pushed lines, in reverse order
56 self._lines = []
57 # The stack of false-EOF checking predicates.
58 self._eofstack = []
59 # A flag indicating whether the file has been closed or not.
60 self._closed = False
61
62 def push_eof_matcher(self, pred):
63 self._eofstack.append(pred)
64
65 def pop_eof_matcher(self):
66 return self._eofstack.pop()
67
68 def close(self):
69 # Don't forget any trailing partial line.
70 self._lines.append(self._partial)
71 self._partial = ''
72 self._closed = True
73
74 def readline(self):
75 if not self._lines:
76 if self._closed:
77 return ''
78 return NeedMoreData
79 # Pop the line off the stack and see if it matches the current
80 # false-EOF predicate.
81 line = self._lines.pop()
82 # RFC 2046, section 5.1.2 requires us to recognize outer level
83 # boundaries at any level of inner nesting. Do this, but be sure it's
84 # in the order of most to least nested.
85 for ateof in self._eofstack[::-1]:
86 if ateof(line):
87 # We're at the false EOF. But push the last line back first.
88 self._lines.append(line)
89 return ''
90 return line
91
92 def unreadline(self, line):
93 # Let the consumer push a line back into the buffer.
94 assert line is not NeedMoreData
95 self._lines.append(line)
96
97 def push(self, data):
98 """Push some new data into this object."""
99 # Handle any previous leftovers
100 data, self._partial = self._partial + data, ''
101 # Crack into lines, but preserve the newlines on the end of each
102 parts = NLCRE_crack.split(data)
103 # The *ahem* interesting behaviour of re.split when supplied grouping
104 # parentheses is that the last element of the resulting list is the
105 # data after the final RE. In the case of a NL/CR terminated string,
106 # this is the empty string.
107 self._partial = parts.pop()
108 #GAN 29Mar09 bugs 1555570, 1721862 Confusion at 8K boundary ending with \r:
109 # is there a \n to follow later?
110 if not self._partial and parts and parts[-1].endswith('\r'):
111 self._partial = parts.pop(-2)+parts.pop()
112 # parts is a list of strings, alternating between the line contents
113 # and the eol character(s). Gather up a list of lines after
114 # re-attaching the newlines.
115 lines = []
116 for i in range(len(parts) // 2):
117 lines.append(parts[i*2] + parts[i*2+1])
118 self.pushlines(lines)
119
120 def pushlines(self, lines):
121 # Reverse and insert at the front of the lines.
122 self._lines[:0] = lines[::-1]
123
124 def is_closed(self):
125 return self._closed
126
127 def __iter__(self):
128 return self
129
130 def next(self):
131 line = self.readline()
132 if line == '':
133 raise StopIteration
134 return line
135
136
137
138
class FeedParser:
    """A feed-style parser of email.

    Data is supplied incrementally with feed(); close() finishes parsing and
    returns the root message object.  Problems found while parsing are
    recorded on the affected message's ``defects`` list.
    """

    def __init__(self, _factory=message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        # Line buffer shared by every nesting level of the parse.
        self._input = BufferedSubFile()
        # Stack of messages currently being parsed; the root is at index 0.
        self._msgstack = []
        # Calling self._parse() advances the parsing generator by one step.
        self._parse = self._parsegen().next
        # Message at the top of the stack (None when the stack is empty).
        self._cur = None
        # Message whose trailing newline may still need to be trimmed.
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        """Make the parser store everything after the headers as payload."""
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Advance the parsing generator, ignoring its exhaustion."""
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            root.defects.append(errors.MultipartInvariantViolationDefect())
        return root

    def _new_message(self):
        """Create a message with the factory and make it the current one.

        The new message is attached to the message on top of the stack, if
        there is one, and then pushed onto the stack itself.
        """
        msg = self._factory()
        # Parts of a multipart/digest default to message/rfc822.
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the top message, updating the current message."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message, recursing for nested parts.

        A new message is created, its headers are read and parsed, and the
        body is then handled according to the message's content type.
        NeedMoreData is yielded whenever the input buffer hands it back; the
        generator is resumed by the next _call_parse().
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away.  Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could raise errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                # A blank line acts as EOF for the nested header block.
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                # NOTE(review): the popped message is bound to `msg` but not
                # used again in this branch.
                msg = self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(errors.NoBoundaryInMultipartDefect())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            # Becomes the end boundary's line ending (or None) once seen.
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        # Re-read the boundary line on the next iteration,
                        # this time as a part separator.
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, basestring):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            # Everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(errors.StartBoundaryNotFoundDefect())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                epilogue = []
                # NOTE(review): this loop consumes the remaining lines but
                # never appends them to `epilogue`, so the epilogue stored
                # below is always the empty string -- confirm this is
                # intended.
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the headers found in *lines* on the current message.

        lines is the list of raw header lines collected by _parsegen().
        Lines starting with a space or tab continue the previous header.
        Malformed or misplaced lines are recorded as defects on the current
        message instead of raising.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            # A new header starts here, so flush the one collected so far.
            if lastheader:
                # XXX reconsider the joining of folded lines
                # Drop the final character of the joined value, then strip
                # any CR/LF still left at the end.
                lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n')
                self._cur[lastheader] = lhdr
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = errors.MalformedHeaderDefect(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')
# Note: See TracBrowser for help on using the repository browser.