source: python/vendor/Python-2.6.5/Lib/email/feedparser.py

Last change on this file was 2, checked in by Yuri Dario, 15 years ago

Initial import for vendor code.

  • Property svn:eol-style set to native
File size: 19.9 KB
Line 
1# Copyright (C) 2004-2006 Python Software Foundation
2# Authors: Baxter, Wouters and Warsaw
3# Contact: email-sig@python.org
4
5"""FeedParser - An email feed parser.
6
7The feed parser implements an interface for incrementally parsing an email
8message, line by line. This has advantages for certain applications, such as
9those reading email messages off a socket.
10
11FeedParser.feed() is the primary interface for pushing new data into the
12parser. It returns when there's nothing more it can do with the available
13data. When you have no more data to push into the parser, call .close().
14This completes the parsing and returns the root message object.
15
16The other advantage of this parser is that it will never throw a parsing
17exception. Instead, when it finds something unexpected, it adds a 'defect' to
18the current message. Defects are just instances that live on the message
19object's .defects attribute.
20"""
21
22__all__ = ['FeedParser']
23
24import re
25
26from email import errors
27from email import message
28
# Line-ending patterns.  All of them recognise the three conventions
# CRLF, bare CR and bare LF, trying CRLF first so it is never split in two.
#
# NLCRE matches a line ending at the start of a string (no capture group).
NLCRE = re.compile('\r\n|\r|\n')
# NLCRE_bol captures a line ending at the beginning of a string.
NLCRE_bol = re.compile('(\r\n|\r|\n)')
# NLCRE_eol captures a line ending at the very end of a string.
NLCRE_eol = re.compile('(\r\n|\r|\n)$')
# NLCRE_crack is used with split(); the capture group makes split() keep the
# line endings in its result so that they can be re-attached to each line.
NLCRE_crack = re.compile('(\r\n|\r|\n)')
# RFC 2822 section 3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, i.e.
# any character except controls, SP, and ":".  The pattern accepts a Unix
# "From " envelope line, a "name:" header start, or a continuation line that
# begins with a tab or a space.
headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Unique sentinel returned/yielded when the buffered input has run dry but
# has not been closed yet.  Always compare against it with "is".
NeedMoreData = object()
40
41
42
43
class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.

    Complete lines are kept in self._lines in reverse order: the next line to
    be read is the last element, so readline() is a plain list.pop().
    """
    def __init__(self):
        # The last partial line pushed into this object.
        self._partial = ''
        # The list of full, pushed lines, in reverse order
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push a predicate; a line for which it is true reads as EOF."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Mark the input as complete and queue any trailing partial line."""
        # Don't forget any trailing partial line.  It is the newest data, so
        # it has to be read after every line that is still pending.
        # pushlines() queues it at the far end of the reversed list; a plain
        # append() would make it the next line readline() returns.
        self.pushlines([self._partial])
        self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData.

        NeedMoreData is returned when no complete line is buffered and the
        object has not been closed.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in self._eofstack[::-1]:
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Give back a line so that the next readline() returns it again."""
        # Let the consumer push a line back into the buffer.
        assert line is not NeedMoreData
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Handle any previous leftovers
        data, self._partial = self._partial + data, ''
        # Crack into lines, but preserve the newlines on the end of each
        parts = NLCRE_crack.split(data)
        # The *ahem* interesting behaviour of re.split when supplied grouping
        # parentheses is that the last element of the resulting list is the
        # data after the final RE.  In the case of a NL/CR terminated string,
        # this is the empty string.
        self._partial = parts.pop()
        # If the data ends in a lone CR we cannot know yet whether it is a
        # complete line ending or the first half of a CRLF whose LF arrives
        # with the next push().  Hold that last line back as the partial line
        # so that a CRLF is never split into "...\r" plus a bare "\n" line.
        if not self._partial and parts and parts[-1].endswith('\r'):
            self._partial = parts.pop(-2) + parts.pop()
        # parts is a list of strings, alternating between the line contents
        # and the eol character(s).  Gather up a list of lines after
        # re-attaching the newlines.
        lines = []
        for i in range(len(parts) // 2):
            lines.append(parts[i*2] + parts[i*2+1])
        self.pushlines(lines)

    def pushlines(self, lines):
        """Queue complete lines behind those that are already pending."""
        # Reverse and insert at the front of the lines.
        self._lines[:0] = lines[::-1]

    def is_closed(self):
        """Return True once close() has been called."""
        return self._closed

    def __iter__(self):
        return self

    def next(self):
        # Iteration stops at EOF ('').  NeedMoreData is passed through to the
        # caller, which has to check for it.
        line = self.readline()
        if line == '':
            raise StopIteration
        return line
131
132
133
134
class FeedParser:
    """A feed-style parser of email.

    Data is handed over with feed(); close() finishes the parse and returns
    the root message object.  The parsing itself is done by the generator
    _parsegen(), which is resumed every time new data arrives.
    """

    def __init__(self, _factory=message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        self._input = BufferedSubFile()
        # Stack of the messages currently being parsed, outermost first.
        self._msgstack = []
        # Calling self._parse() resumes the parsing generator.
        self._parse = self._parsegen().next
        # The message currently being parsed (top of the stack) ...
        self._cur = None
        # ... and the message that was created most recently.
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Resume the parsing generator; ignore that it may have finished."""
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            root.defects.append(errors.MultipartInvariantViolationDefect())
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        msg = self._factory()
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the current message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message, recursing for nested parts.

        It yields NeedMoreData whenever the input has run dry and finishes
        once the current message has been parsed completely.  The message
        it creates is left on the message stack; the caller pops it.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away.  Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could throw errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                # The nested message was already attached to its parent by
                # _new_message(), so the popped object itself is not needed.
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(errors.NoBoundaryInMultipartDefect())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        # Re-read the boundary line; with the preamble done it
                        # is handled as a separator on the next iteration.
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, basestring):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            # Everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(errors.StartBoundaryNotFoundDefect())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                epilogue = []
                # NOTE(review): the remaining lines are consumed here but not
                # collected, so the epilogue set below is always the empty
                # string.  Kept as is; confirm before changing.
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header lines on the current message.

        lines is the list of raw header lines collected by _parsegen().
        Malformed input is recorded in the message's defects list instead
        of raising an exception.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                # XXX reconsider the joining of folded lines
                lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n')
                self._cur[lastheader] = lhdr
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = errors.MalformedHeaderDefect(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')
Note: See TracBrowser for help on using the repository browser.