Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

rfc822.py@ 6

Last change on this file since 6 was 2, checked in by Yuri Dario, 15 years ago
Initial import for vendor code.
Property svn:eol-style set to `native`
File size: 32.5 KB

Line
1	"""RFC 2822 message manipulation.
2
3	Note: This is only a very rough sketch of a full RFC-822 parser; in particular
4	the tokenizing of addresses does not adhere to all the quoting rules.
5
6	Note: RFC 2822 is a long awaited update to RFC 822. This module should
7	conform to RFC 2822, and is thus mis-named (it's not worth renaming it). Some
8	effort at RFC 2822 updates has been made, but a thorough audit has not been
9	performed. Consider any RFC 2822 non-conformance to be a bug.
10
11	RFC 2822: http://www.faqs.org/rfcs/rfc2822.html
12	RFC 822 : http://www.faqs.org/rfcs/rfc822.html (obsolete)
13
14	Directions for use:
15
16	To create a Message object: first open a file, e.g.:
17
18	fp = open(file, 'r')
19
20	You can use any other legal way of getting an open file object, e.g. use
21	sys.stdin or call os.popen(). Then pass the open file object to the Message()
22	constructor:
23
24	m = Message(fp)
25
26	This class can work with any input object that supports a readline method. If
27	the input object has seek and tell capability, the rewindbody method will
28	work; also illegal lines will be pushed back onto the input stream. If the
29	input object lacks seek but has an `unread' method that can push back a line
30	of input, Message will use that to push back illegal lines. Thus this class
31	can be used to parse messages coming from a buffered stream.
32
33	The optional `seekable' argument is provided as a workaround for certain stdio
34	libraries in which tell() discards buffered data before discovering that the
35	lseek() system call doesn't work. For maximum portability, you should set the
36	seekable argument to zero to prevent that initial tell() call when passing in
37	an unseekable object such as a file object created from a socket object. If
38	it is 1 on entry -- which it is by default -- the tell() method of the open
39	file object is called once; if this raises an exception, seekable is reset to
40	0. For other nonzero values of seekable, this test is not made.
41
42	To get the text of a particular header there are several methods:
43
44	str = m.getheader(name)
45	str = m.getrawheader(name)
46
47	where name is the name of the header, e.g. 'Subject'. The difference is that
48	getheader() strips the leading and trailing whitespace, while getrawheader()
49	doesn't. Both functions retain embedded whitespace (including newlines)
50	exactly as they are specified in the header, and leave the case of the text
51	unchanged.
52
53	For addresses and address lists there are functions
54
55	realname, mailaddress = m.getaddr(name)
56	list = m.getaddrlist(name)
57
58	where the latter returns a list of (realname, mailaddr) tuples.
59
60	There is also a method
61
62	time = m.getdate(name)
63
64	which parses a Date-like field and returns a time-compatible tuple,
65	i.e. a tuple such as returned by time.localtime() or accepted by
66	time.mktime().
67
68	See the class definition for lower level access methods.
69
70	There are also some utility functions here.
71	"""
72	# Cleanup and extensions by Eric S. Raymond <esr@thyrsus.com>
73
import time

# Emit the module's py3k deprecation notice as soon as it is imported.
from warnings import warnpy3k
warnpy3k("in 3.x, rfc822 has been removed in favor of the email package",
         stacklevel=2)

# Public names exported by "from rfc822 import *".
__all__ = ["Message","AddressList","parsedate","parsedate_tz","mktime_tz"]

# The two line values that Message.islast() accepts as the blank line
# terminating a header block (CRLF and bare LF).
_blanklines = ('\r\n', '\n')    # Optimization for islast()
83
84
class Message:
    """Represents a single RFC 2822-compliant message.

    The headers are read from a file-like object when the instance is
    created.  They are kept in two forms:

    self.headers -- the raw header lines, in file order, uninterpreted
    self.dict    -- lower-cased header name -> stripped value (when a header
                    is repeated, the last occurrence wins)

    The instance also supports a dictionary-like interface on top of
    self.dict (len(), indexing, `in', iteration, keys(), values(), items()).
    """

    def __init__(self, fp, seekable = 1):
        """Initialize the class instance and read the headers.

        fp is any object with a readline() method.  seekable tells whether
        fp supports tell()/seek(): when it is exactly 1, fp.tell() is tried
        once and seekable is reset to 0 if that raises AttributeError or
        IOError; other non-zero values skip that probe; 0 disables all use of
        tell()/seek().
        """
        if seekable == 1:
            # Exercise tell() to make sure it works
            # (and then assume seek() works, too)
            try:
                fp.tell()
            except (AttributeError, IOError):
                seekable = 0
        self.fp = fp
        self.seekable = seekable
        self.startofheaders = None
        self.startofbody = None
        # Remember where the headers start (if the stream can tell us).
        if self.seekable:
            try:
                self.startofheaders = self.fp.tell()
            except IOError:
                self.seekable = 0
        #
        self.readheaders()
        # Remember where the body starts, for rewindbody().
        if self.seekable:
            try:
                self.startofbody = self.fp.tell()
            except IOError:
                self.seekable = 0

    def rewindbody(self):
        """Rewind the file to the start of the body (if seekable).

        Raises IOError if the message was read from an unseekable stream.
        """
        if not self.seekable:
            # Call form of raise: valid on both Python 2 and Python 3.
            raise IOError("unseekable file")
        self.fp.seek(self.startofbody)

    def readheaders(self):
        """Read header lines.

        Read header lines up to the entirely blank line that terminates them.
        The (normally blank) line that ends the headers is skipped, but not
        included in the returned list.  If a non-header line ends the headers,
        (which is an error), an attempt is made to backspace over it; it is
        never included in the returned list.

        The variable self.status is set to the empty string if all went well,
        otherwise it is an error message.  The variable self.headers is a
        completely uninterpreted list of lines contained in the header (so
        printing them will reproduce the header exactly as it appears in the
        file).
        """
        self.dict = {}
        self.unixfrom = ''
        self.headers = lst = []
        self.status = ''
        headerseen = ""
        firstline = 1
        startofline = unread = tell = None
        # Pick a push-back strategy: prefer fp.unread(), else tell()/seek().
        if hasattr(self.fp, 'unread'):
            unread = self.fp.unread
        elif self.seekable:
            tell = self.fp.tell
        while 1:
            if tell:
                try:
                    startofline = tell()
                except IOError:
                    startofline = tell = None
                    self.seekable = 0
            line = self.fp.readline()
            if not line:
                self.status = 'EOF in headers'
                break
            # Skip unix From name time lines
            if firstline and line.startswith('From '):
                self.unixfrom = self.unixfrom + line
                continue
            firstline = 0
            if headerseen and line[0] in ' \t':
                # It's a continuation line.
                lst.append(line)
                x = (self.dict[headerseen] + "\n " + line.strip())
                self.dict[headerseen] = x.strip()
                continue
            elif self.iscomment(line):
                # It's a comment.  Ignore it.
                continue
            elif self.islast(line):
                # Note! No pushback here!  The delimiter line gets eaten.
                break
            headerseen = self.isheader(line)
            if headerseen:
                # It's a legal header line, save it.
                lst.append(line)
                self.dict[headerseen] = line[len(headerseen)+1:].strip()
                continue
            else:
                # It's not a header line; throw it back and stop here.
                if not self.dict:
                    self.status = 'No headers'
                else:
                    self.status = 'Non-header line where header expected'
                # Try to undo the read.
                if unread:
                    unread(line)
                elif tell:
                    self.fp.seek(startofline)
                else:
                    self.status = self.status + '; bad seek'
                break

    def isheader(self, line):
        """Determine whether a given line is a legal header.

        This method should return the header name, suitably canonicalized.
        You may override this method in order to use Message parsing on tagged
        data in RFC 2822-like formats with special header formats.

        The default returns the lower-cased text before the first colon, or
        None when the line has no colon or starts with one.
        """
        i = line.find(':')
        if i > 0:
            return line[:i].lower()
        return None

    def islast(self, line):
        """Determine whether a line is a legal end of RFC 2822 headers.

        You may override this method if your application wants to bend the
        rules, e.g. to strip trailing whitespace, or to recognize MH template
        separators ('--------').  For convenience (e.g. for code reading from
        sockets) a line consisting of \\r\\n also matches.
        """
        return line in _blanklines

    def iscomment(self, line):
        """Determine whether a line should be skipped entirely.

        You may override this method in order to use Message parsing on tagged
        data in RFC 2822-like formats that support embedded comments or
        free-text data.  The default never treats a line as a comment.
        """
        return False

    def getallmatchingheaders(self, name):
        """Find all header lines matching a given header name.

        Look through the list of headers and find all lines matching a given
        header name (and their continuation lines).  A list of the lines is
        returned, without interpretation.  If the header does not occur, an
        empty list is returned.  If the header occurs multiple times, all
        occurrences are returned.  Case is not important in the header name.
        """
        name = name.lower() + ':'
        n = len(name)
        lst = []
        hit = 0
        for line in self.headers:
            if line[:n].lower() == name:
                hit = 1
            elif not line[:1].isspace():
                # A different header starts here; continuation lines (which
                # begin with whitespace) leave `hit' unchanged.
                hit = 0
            if hit:
                lst.append(line)
        return lst

    def getfirstmatchingheader(self, name):
        """Get the first header line matching name.

        This is similar to getallmatchingheaders, but it returns only the
        first matching header (and its continuation lines).
        """
        name = name.lower() + ':'
        n = len(name)
        lst = []
        hit = 0
        for line in self.headers:
            if hit:
                if not line[:1].isspace():
                    break
            elif line[:n].lower() == name:
                hit = 1
            if hit:
                lst.append(line)
        return lst

    def getrawheader(self, name):
        """A higher-level interface to getfirstmatchingheader().

        Return a string containing the literal text of the header but with the
        keyword stripped.  All leading, trailing and embedded whitespace is
        kept in the string, however.  Return None if the header does not
        occur.
        """

        lst = self.getfirstmatchingheader(name)
        if not lst:
            return None
        # Drop "name:" from the first line only.
        lst[0] = lst[0][len(name) + 1:]
        return ''.join(lst)

    def getheader(self, name, default=None):
        """Get the header value for a name.

        This is the normal interface: it returns a stripped version of the
        header value for a given header name, or `default' (None unless given)
        if it doesn't exist.  This uses the dictionary version which finds the
        last such header.
        """
        return self.dict.get(name.lower(), default)
    get = getheader

    def getheaders(self, name):
        """Get all values for a header.

        This returns a list of values for headers given more than once; each
        value in the result list is stripped in the same way as the result of
        getheader().  If the header is not given, return an empty list.
        """
        result = []
        current = ''
        have_header = 0
        for s in self.getallmatchingheaders(name):
            if s[0].isspace():
                # Continuation line: fold it into the value being built.
                if current:
                    current = "%s\n %s" % (current, s.strip())
                else:
                    current = s.strip()
            else:
                if have_header:
                    result.append(current)
                current = s[s.find(":") + 1:].strip()
                have_header = 1
        if have_header:
            result.append(current)
        return result

    def getaddr(self, name):
        """Get a single address from a header, as a tuple.

        An example return value:
        ('Guido van Rossum', 'guido@cwi.nl')

        Returns (None, None) when the header yields no address.
        """
        # New, by Ben Escoto
        alist = self.getaddrlist(name)
        if alist:
            return alist[0]
        else:
            return (None, None)

    def getaddrlist(self, name):
        """Get a list of addresses from a header.

        Retrieves a list of addresses from a header, where each address is a
        tuple as returned by getaddr().  Scans all named headers, so it works
        properly with multiple To: or Cc: headers for example.
        """
        raw = []
        for h in self.getallmatchingheaders(name):
            if h[0] in ' \t':
                raw.append(h)
            else:
                # Separate the values of repeated headers with a comma so they
                # parse as one address list.
                if raw:
                    raw.append(', ')
                i = h.find(':')
                if i > 0:
                    addr = h[i+1:]
                raw.append(addr)
        alladdrs = ''.join(raw)
        a = AddressList(alladdrs)
        return a.addresslist

    def getdate(self, name):
        """Retrieve a date field from a header.

        Retrieves a date field from the named header, returning a tuple
        compatible with time.mktime().  Returns None if the header is absent.
        """
        try:
            data = self[name]
        except KeyError:
            return None
        return parsedate(data)

    def getdate_tz(self, name):
        """Retrieve a date field from a header as a 10-tuple.

        The first 9 elements make up a tuple compatible with time.mktime(),
        and the 10th is the offset of the poster's time zone from GMT/UTC.
        Returns None if the header is absent.
        """
        try:
            data = self[name]
        except KeyError:
            return None
        return parsedate_tz(data)


    # Access as a dictionary (only finds last header of each type):

    def __len__(self):
        """Get the number of headers in a message."""
        return len(self.dict)

    def __getitem__(self, name):
        """Get a specific header, as from a dictionary."""
        return self.dict[name.lower()]

    def __setitem__(self, name, value):
        """Set the value of a header.

        Note: This is not a perfect inversion of __getitem__, because any
        changed headers get stuck at the end of the raw-headers list rather
        than where the altered header was.
        """
        del self[name] # Won't fail if it doesn't exist
        self.dict[name.lower()] = value
        text = name + ": " + value
        for line in text.split("\n"):
            self.headers.append(line + "\n")

    def __delitem__(self, name):
        """Delete all occurrences of a specific header, if it is present."""
        name = name.lower()
        if name not in self.dict:
            return
        del self.dict[name]
        name = name + ':'
        n = len(name)
        doomed = []
        hit = 0
        for i, line in enumerate(self.headers):
            if line[:n].lower() == name:
                hit = 1
            elif not line[:1].isspace():
                hit = 0
            if hit:
                doomed.append(i)
        # Delete from the end so the remaining indexes stay valid.
        for i in reversed(doomed):
            del self.headers[i]

    def setdefault(self, name, default=""):
        """Return the value of header `name', adding it as `default' if absent."""
        lowername = name.lower()
        if lowername in self.dict:
            return self.dict[lowername]
        else:
            text = name + ": " + default
            for line in text.split("\n"):
                self.headers.append(line + "\n")
            self.dict[lowername] = default
            return default

    def has_key(self, name):
        """Determine whether a message contains the named header."""
        return name.lower() in self.dict

    def __contains__(self, name):
        """Determine whether a message contains the named header."""
        return name.lower() in self.dict

    def __iter__(self):
        """Iterate over the (lower-cased) header field names."""
        return iter(self.dict)

    def keys(self):
        """Get all of a message's header field names."""
        return self.dict.keys()

    def values(self):
        """Get all of a message's header field values."""
        return self.dict.values()

    def items(self):
        """Get all of a message's headers.

        Returns a list of name, value tuples.
        """
        return self.dict.items()

    def __str__(self):
        """Return the raw header lines joined back into one string."""
        return ''.join(self.headers)
463
464
465	# Utility functions
466	# -----------------
467
468	# XXX Should fix unquote() and quote() to be really conformant.
469	# XXX The inverses of the parse functions may also be useful.
470
471
def unquote(s):
    """Strip one layer of surrounding "..." or <...> from a string.

    For a double-quoted string the backslash escapes \\\\ and \\" inside it
    are also undone.  Strings shorter than two characters, or without a
    matching pair of delimiters, are returned unchanged.
    """
    if len(s) <= 1:
        return s
    first, last = s[:1], s[-1:]
    inner = s[1:-1]
    if first == '"' and last == '"':
        unescaped = inner.replace('\\\\', '\\')
        return unescaped.replace('\\"', '"')
    if first == '<' and last == '>':
        return inner
    return s
480
481
def quote(s):
    """Escape a string for use inside a double-quoted string.

    Backslashes are doubled and double quotes are backslash-escaped.  The
    surrounding quote characters themselves are not added.
    """
    with_backslashes_doubled = s.replace('\\', '\\\\')
    return with_backslashes_doubled.replace('"', '\\"')
485
486
def parseaddr(address):
    """Parse an address into a (realname, mailaddr) tuple.

    Only the first address found is returned; (None, None) is returned when
    nothing could be parsed.
    """
    parsed = AddressList(address).addresslist
    return parsed[0] if parsed else (None, None)
494
495
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of
    RFC 2822 in front of you.

    http://www.faqs.org/rfcs/rfc2822.html

    The parser walks self.field with a cursor, self.pos; every get*() method
    consumes input from the cursor onwards.  Comments met along the way are
    collected in self.commentlist.

    Note: this class interface is deprecated and may be removed in the future.
    Use rfc822.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing one or more
        addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Parse up to the start of the next address.

        Skips whitespace and line breaks; comments are consumed and appended
        to self.commentlist.
        """
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                self.pos = self.pos + 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else: break

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, as (realname,
        mailaddr) tuples.  Entries that yield no address (for example the
        empty slot in "a@b, , c@d") are skipped rather than ending the parse,
        so addresses that follow them are still returned.
        """
        result = []
        while self.pos < len(self.field):
            start = self.pos
            result += self.getaddress()
            if self.pos <= start:
                # Defensive: never loop without consuming input.
                self.pos = start + 1
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (realname, mailaddr) tuples: empty when nothing
        usable was found, and possibly several entries for a group.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # NOTE: this is the same list object as self.commentlist, not a copy.
        oldcl = self.commentlist
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else: returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Step over a stray special so the caller makes progress.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the cursor is not on a '<'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = 0
        self.pos += 1
        self.gotonext()
        adlist = ""
        while self.pos < len(self.field):
            if expectroute:
                # Domain of a source route ("@domain"): parsed and discarded.
                self.getdomain()
                expectroute = 0
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = 1
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                # Step over the character that ended the addr-spec.
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns the local part, followed by '@' and the domain when an '@' is
        present.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            if self.field[self.pos] == '.':
                aslist.append('.')
                self.pos += 1
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % self.getquote())
            elif self.field[self.pos] in self.atomends:
                break
            else: aslist.append(self.getatom())
            self.gotonext()

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return ''.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        return ''.join(aslist) + self.getdomain()

    def getdomain(self):
        """Get the complete domain name from an address."""
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] in self.atomends:
                break
            else: sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments = 1):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.  If self is not
        looking at an instance of `beginchar' then getdelimited returns the
        empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.

        The delimiters are not part of the returned text.  A backslash makes
        the following character literal; the backslash itself is dropped.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        quote = 0
        self.pos += 1
        while self.pos < len(self.field):
            if quote == 1:
                slist.append(self.field[self.pos])
                quote = 0
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif self.field[self.pos] == '\\':
                quote = 1
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', 0)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', 1)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', 0)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else: atomlist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.

        Returns the list of words; comments are moved to self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist
769
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses.

    The parsed (realname, mailaddr) tuples live in self.addresslist.  The
    +, -, += and -= operators treat two AddressLists as sets of addresses.
    """
    def __init__(self, field):
        """Parse `field'; an empty or None field gives an empty list."""
        AddrlistClass.__init__(self, field)
        self.addresslist = self.getaddrlist() if field else []

    def __len__(self):
        """Number of addresses held."""
        return len(self.addresslist)

    def __str__(self):
        """Render the addresses, comma-separated, via dump_address_pair()."""
        return ", ".join([dump_address_pair(pair) for pair in self.addresslist])

    def __add__(self, other):
        """Set union: self's addresses plus those of other not in self."""
        union = AddressList(None)
        extras = [addr for addr in other.addresslist
                  if addr not in self.addresslist]
        union.addresslist = self.addresslist[:] + extras
        return union

    def __iadd__(self, other):
        """Set union, in-place."""
        mine = self.addresslist
        for addr in other.addresslist:
            if addr in mine:
                continue
            mine.append(addr)
        return self

    def __sub__(self, other):
        """Set difference: self's addresses that do not occur in other."""
        difference = AddressList(None)
        for addr in self.addresslist:
            if addr in other.addresslist:
                continue
            difference.addresslist.append(addr)
        return difference

    def __isub__(self, other):
        """Set difference, in-place."""
        mine = self.addresslist
        for addr in other.addresslist:
            if addr in mine:
                mine.remove(addr)
        return self

    def __getitem__(self, index):
        """Delegate to the underlying list.

        Makes indexing, slices, and 'in' work.
        """
        return self.addresslist[index]
819
def dump_address_pair(pair):
    """Dump a (name, address) pair in a canonicalized form.

    With a non-empty name the result is '"name" <address>'; otherwise it is
    just the address.
    """
    name = pair[0]
    if not name:
        return pair[1]
    return '"' + name + '" <' + pair[1] + '>'
826
# Parse a date field

# Abbreviated month names first, then the full names; parsedate_tz() maps an
# index above 12 back onto 1..12.
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']
_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

# Values are offsets from UTC written as signed hhmm numbers.
_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }


def parsedate_tz(data):
    """Convert a date string to a time tuple.

    Returns a 10-tuple (year, month, day, hour, minute, second, 0, 1, 0,
    tzoffset), or None if the string cannot be parsed.  tzoffset is the
    zone's offset from UTC in seconds, or None when the string carries no
    recognizable zone.  Zones may be numeric (+hhmm / -hhmm) or one of the
    names in _timezones; military zones other than Z are not recognized.
    """
    if not data:
        return None
    data = data.split()
    if not data:
        # Whitespace-only input: nothing to parse.
        return None
    if data[0][-1] in (',', '.') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    else:
        # no space after the "weekday,"?
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3: # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued onto the time ("13:32:02+0100").
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            # Keep the sign with the zone so negative offsets survive.
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('') # Dummy tz
    if len(data) < 5:
        return None
    data = data[:5]
    [dd, mm, yy, tm, tz] = data
    mm = mm.lower()
    if not mm in _monthnames:
        # Maybe the day and month are swapped ("Nov 16").
        dd, mm = mm, dd.lower()
        if not mm in _monthnames:
            return None
    mm = _monthnames.index(mm)+1
    if mm > 12: mm = mm - 12
    # endswith()/slicing rather than [-1]/[0]: a token can be empty after the
    # RFC 850 split above, and must lead to None, not an IndexError.
    if dd.endswith(','):
        dd = dd[:-1]
    i = yy.find(':')
    if i > 0:
        # Year and time are swapped (asctime()-style ordering).
        yy, tm = tm, yy
    if yy.endswith(','):
        yy = yy[:-1]
    if not yy[:1].isdigit():
        yy, tz = tz, yy
    if tm.endswith(','):
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        [thh, tmm] = tm
        tss = '0'
    elif len(tm) == 3:
        [thh, tmm, tss] = tm
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            pass
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ( (tzoffset//100)*3600 + (tzoffset % 100)*60)
    return (yy, mm, dd, thh, tmm, tss, 0, 1, 0, tzoffset)
933
934
def parsedate(data):
    """Convert a time string to a time tuple.

    Same as parsedate_tz() but without the trailing timezone offset: returns
    the first nine elements, or None if the string cannot be parsed.
    """
    with_zone = parsedate_tz(data)
    return None if with_zone is None else with_zone[:9]
941
942
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp.

    data[9] is the zone offset from UTC in seconds, or None.  With None the
    first eight fields are interpreted as local time by time.mktime();
    otherwise they are interpreted in the given zone and the result does not
    depend on the local timezone.  Returns a float.
    """
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        return time.mktime(data[:8] + (-1,))
    # Imported here so the block brings its own dependency into scope.
    import calendar
    # timegm() reads the fields as UTC, so subtracting the zone offset is all
    # that is needed; no detour through the local zone's mktime()/timezone.
    return float(calendar.timegm(data)) - data[9]
951
def formatdate(timeval=None):
    """Returns time format preferred for Internet standards.

    Sun, 06 Nov 1994 08:49:37 GMT  ; RFC 822, updated by RFC 1123

    timeval is a timestamp in seconds; the current time is used when it is
    None.  The result is always expressed in GMT.

    According to RFC 1123, day and month names must always be in
    English.  If not for that, this code could use strftime().  It
    can't because strftime() honors the locale and could generate
    non-English names.
    """
    if timeval is None:
        timeval = time.time()
    year, month, day, hour, minute, second, weekday = time.gmtime(timeval)[:7]
    day_names = ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
    month_names = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                   "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
    return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
        day_names[weekday], day, month_names[month - 1],
        year, hour, minute, second)
971
972
973	# When used as script, run a small test program.
974	# The first command line argument must be a filename containing one
975	# message in RFC-822 format.
976
977	if __name__ == '__main__':
978	import sys, os
979	file = os.path.join(os.environ['HOME'], 'Mail/inbox/1')
980	if sys.argv[1:]: file = sys.argv[1]
981	f = open(file, 'r')
982	m = Message(f)
983	print 'From:', m.getaddr('from')
984	print 'To:', m.getaddrlist('to')
985	print 'Subject:', m.getheader('subject')
986	print 'Date:', m.getheader('date')
987	date = m.getdate_tz('date')
988	tz = date[-1]
989	date = time.localtime(mktime_tz(date))
990	if date:
991	print 'ParsedDate:', time.asctime(date),
992	hhmmss = tz
993	hhmm, ss = divmod(hhmmss, 60)
994	hh, mm = divmod(hhmm, 60)
995	print "%+03d%02d" % (hh, mm),
996	if ss: print ".%02d" % ss,
997	print
998	else:
999	print 'ParsedDate:', None
1000	m.rewindbody()
1001	n = 0
1002	while f.readline():
1003	n += 1
1004	print 'Lines:', n
1005	print '-'*70
1006	print 'len =', len(m)
1007	if 'Date' in m: print 'Date =', m['Date']
1008	if 'X-Nonsense' in m: pass
1009	print 'keys =', m.keys()
1010	print 'values =', m.values()
1011	print 'items =', m.items()

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Lib/rfc822.py@ 6

Download in other formats: