Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

_parseaddr.py

Last change on this file was 391, checked in by dmik, 11 years ago
python: Merge vendor 2.7.6 to trunk.
Property svn:eol-style set to `native`
File size: 15.4 KB

Line
1	# Copyright (C) 2002-2007 Python Software Foundation
2	# Contact: email-sig@python.org
3
4	"""Email address parsing code.
5
6	Lifted directly from rfc822.py. This should eventually be rewritten.
7	"""
8
# Names exported by ``from <this module> import *``.
__all__ = [
    'mktime_tz',
    'parsedate',
    'parsedate_tz',
    'quote',
]
15
16	import time, calendar
17
# Common separator strings.
SPACE = ' '
EMPTYSTRING = ''
COMMASPACE = ', '
21
# Parse a date field
# Abbreviated names come first, then the full names in the same order, so
# index + 1 (minus 12 for a full name) is the month number.
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']

_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

# Values are in the same +/-HHMM form as a numeric zone (e.g. -500 == -0500).
_timezones = {'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }


def parsedate_tz(data):
    """Convert a date string to a 10-tuple, or return None if unparseable.

    The result is ``(year, month, day, hour, minute, second, 0, 1, -1,
    tzoffset)``.  The weekday and yearday slots are fixed placeholders
    (0 and 1) and the DST flag is always -1 (unknown).  ``tzoffset`` is the
    zone offset in seconds (``-0500`` -> ``-18000``), or None when the zone
    is missing or not recognized.

    Zone names are looked up in ``_timezones``; military zone letters other
    than Z are not supported.  Empty or malformed input yields None rather
    than raising.
    """
    if not data:
        return None
    data = data.split()
    if not data:
        # Whitespace-only input.
        return None
    # The FWS after the comma after the day-of-week is optional, so search and
    # adjust for this.
    if data[0].endswith(',') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    else:
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3:  # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued to the time with no whitespace
        # ("14:58:26+0100" or "14:58:26-0800"); split it off.
        s = data[3]
        plus = s.find('+')
        # Only look for '-' when there is no '+' at all.
        minus = s.find('-') if plus == -1 else -1
        if plus > 0:
            data[3:] = [s[:plus], s[plus+1:]]
        elif minus > 0:
            # Keep the sign so int() yields a negative offset.
            data[3:] = [s[:minus], s[minus:]]
        else:
            data.append('')  # Dummy tz
    if len(data) < 5:
        return None
    data = data[:5]
    [dd, mm, yy, tm, tz] = data
    mm = mm.lower()
    if mm not in _monthnames:
        # Tolerate "Month Day" ordering by swapping the two fields.
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        # Matched a full month name from the second half of the table.
        mm -= 12
    if dd.endswith(','):
        dd = dd[:-1]
    if yy.find(':') > 0:
        # Year and time fields are in the opposite order.
        yy, tm = tm, yy
    if yy.endswith(','):
        yy = yy[:-1]
    if not yy:
        return None
    if not yy[0].isdigit():
        # Year and zone fields are in the opposite order.
        yy, tz = tz, yy
    if tm.endswith(','):
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        [thh, tmm] = tm
        tss = '0'
    elif len(tm) == 3:
        [thh, tmm, tss] = tm
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    # Check for a yy specified in two-digit format, then convert it to the
    # appropriate four-digit format, according to the POSIX standard. RFC 822
    # calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
    # mandates a 4-digit yy. For more information, see the documentation for
    # the time module.
    if yy < 100:
        # The year is between 1969 and 1999 (inclusive).
        if yy > 68:
            yy += 1900
        # The year is between 2000 and 2068 (inclusive).
        else:
            yy += 2000
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            pass
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ((tzoffset // 100) * 3600 + (tzoffset % 100) * 60)
    # Daylight Saving Time flag is set to -1, since DST is unknown.
    return yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset
141
142
def parsedate(data):
    """Convert a time string to a 9-item time tuple.

    Calls parsedate_tz() and returns the first nine items of its result,
    dropping the trailing timezone offset.  If parsedate_tz() does not
    return a tuple (it returns None for unparseable input), that value is
    passed through unchanged.
    """
    t = parsedate_tz(data)
    if isinstance(t, tuple):
        return t[:9]
    return t
150
151
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp.

    `data' is a 10-item sequence whose last item is the timezone offset in
    seconds, or None.  With an offset, the fields are interpreted via
    calendar.timegm() and the offset is subtracted.  With None, the first
    eight fields are handed to time.mktime() (local time) with the DST flag
    forced to -1.
    """
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        # tuple() lets list input work too (list + tuple would raise).
        return time.mktime(tuple(data[:8]) + (-1,))
    t = calendar.timegm(data)
    return t - data[9]
160
161
def quote(str):
    """Prepare string to be used in a quoted string.

    Turns backslash and double quote characters into quoted pairs.  These
    are the only characters that need to be quoted inside a quoted string.
    Does not add the surrounding double quotes.
    """
    # Backslashes must be doubled first, otherwise the backslashes added
    # for the double quotes would themselves get doubled.
    return str.replace('\\', '\\\\').replace('"', '\\"')
170
171
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    The parser keeps a cursor (`pos') into `field' and every get*() method
    consumes text from the cursor onwards.  Comments encountered while
    parsing are collected in `commentlist'.

    Note: this class interface is deprecated and may be removed in the future.
    Use rfc822.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.FWS = self.LWS + self.CR
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Parse up to the start of the next address.

        Skips whitespace and collects any comments into `commentlist'.
        """
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else:
                break

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, as (name, address)
        pairs.  A position where nothing could be parsed contributes an
        ('', '') placeholder.
        """
        result = []
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
            else:
                result.append(('', ''))
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (name, address) pairs: usually one, several for a
        group, or an empty list when nothing usable was found.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Snapshot (not alias) the comments seen so far: the addr-spec branch
        # below rewinds and re-parses, and would otherwise collect the same
        # comments a second time.
        oldcl = self.commentlist[:]
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Skip a stray special so the caller's loop makes progress.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the cursor is not on a `<'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            if expectroute:
                # Domain of a source route; parsed only to skip over it.
                self.getdomain()
                expectroute = False
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = True
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                # Step over the character following the addr-spec
                # (normally the closing `>').
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns the local part alone when no `@' follows it, and the empty
        string when the domain is followed by another `@'.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            if self.field[self.pos] == '.':
                aslist.append('.')
                self.pos += 1
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % quote(self.getquote()))
            elif self.field[self.pos] in self.atomends:
                break
            else:
                aslist.append(self.getatom())
            self.gotonext()

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return ''.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        domain = self.getdomain()
        if self.pos < len(self.field) and self.field[self.pos] == '@':
            # A second `@' right after the domain (as in
            # "a@malicious.org@important.com") is ambiguous; report an empty
            # address instead of silently picking the first domain.
            return ''
        return ''.join(aslist) + domain

    def getdomain(self):
        """Get the complete domain name from an address."""
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        # True when the previous character was a backslash, so the current
        # character is taken literally.
        escaped = False
        self.pos += 1
        while self.pos < len(self.field):
            if escaped:
                slist.append(self.field[self.pos])
                escaped = False
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif self.field[self.pos] == '\\':
                escaped = True
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            atomlist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.FWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist
451
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses.

    The parsed addresses are kept in `addresslist'.  The arithmetic
    operators treat two AddressLists as sets of those entries.
    """

    def __init__(self, field):
        """Parse `field' into `addresslist'; a false `field' gives an empty list."""
        AddrlistClass.__init__(self, field)
        if field:
            self.addresslist = self.getaddrlist()
        else:
            self.addresslist = []

    def __len__(self):
        return len(self.addresslist)

    def __add__(self, other):
        # Set union
        newaddr = AddressList(None)
        newaddr.addresslist = self.addresslist[:]
        for x in other.addresslist:
            if x not in self.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __iadd__(self, other):
        # Set union, in-place
        # Iterate over a snapshot so that ``a += a`` cannot mutate the list
        # being iterated.
        for x in other.addresslist[:]:
            if x not in self.addresslist:
                self.addresslist.append(x)
        return self

    def __sub__(self, other):
        # Set difference
        newaddr = AddressList(None)
        for x in self.addresslist:
            if x not in other.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __isub__(self, other):
        # Set difference, in-place
        # Iterate over a snapshot: when `other' is `self', removing from the
        # list being iterated would skip every second entry.
        for x in other.addresslist[:]:
            if x in self.addresslist:
                self.addresslist.remove(x)
        return self

    def __getitem__(self, index):
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Lib/email/_parseaddr.py

Download in other formats: