source: python/trunk/Lib/email/header.py@ 383

Last change on this file since 383 was 2, checked in by Yuri Dario, 15 years ago

Initial import for vendor code.

# Copyright (C) 2002-2006 Python Software Foundation
# Author: Ben Gertzfield, Barry Warsaw
# Contact: email-sig@python.org

"""Header encoding and decoding functionality."""

__all__ = [
    'Header',
    'decode_header',
    'make_header',
    ]

import re
import binascii

import email.quoprimime
import email.base64mime

from email.errors import HeaderParseError
from email.charset import Charset

NL = '\n'
SPACE = ' '
USPACE = u' '
SPACE8 = ' ' * 8
UEMPTYSTRING = u''

MAXLINELEN = 76

USASCII = Charset('us-ascii')
UTF8 = Charset('utf-8')

# Match encoded-word strings in the form =?charset?q?Hello_World?=
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  (?=[ \t]|$)           # whitespace or the end of the string
  ''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)
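
# A rough illustration (not part of the original module) of how ecre is used
# below.  Because the pattern has capturing groups, re.split() keeps the
# charset, encoding and encoded-text groups in its result, so a header value
# such as '=?iso-8859-1?q?hello?= world' should split into something like
#
#     ['', 'iso-8859-1', 'q', 'hello', ' world']
#
# which decode_header() then consumes in steps of three after each unencoded
# piece.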

# Field name regexp, including trailing colon, but not separating whitespace,
# according to RFC 2822.  Character range is from tilde to exclamation mark.
# For use with .match()
fcre = re.compile(r'[\041-\176]+:$')




# Helpers
_max_append = email.quoprimime._max_append




def decode_header(header):
    """Decode a message header value without converting charset.

    Returns a list of (decoded_string, charset) pairs containing each of the
    decoded parts of the header.  Charset is None for non-encoded parts of the
    header, otherwise a lower-case string containing the name of the character
    set specified in the encoded string.

    An email.errors.HeaderParseError may be raised when certain decoding
    errors occur (e.g. a base64 decoding exception).
    """
    # If no encoding, just return the header
    header = str(header)
    if not ecre.search(header):
        return [(header, None)]
    decoded = []
    dec = ''
    for line in header.splitlines():
        # This line might not have an encoding in it
        if not ecre.search(line):
            decoded.append((line, None))
            continue
        parts = ecre.split(line)
        while parts:
            unenc = parts.pop(0).strip()
            if unenc:
                # Should we continue a long line?
                if decoded and decoded[-1][1] is None:
                    decoded[-1] = (decoded[-1][0] + SPACE + unenc, None)
                else:
                    decoded.append((unenc, None))
            if parts:
                charset, encoding = [s.lower() for s in parts[0:2]]
                encoded = parts[2]
                dec = None
                if encoding == 'q':
                    dec = email.quoprimime.header_decode(encoded)
                elif encoding == 'b':
                    try:
                        dec = email.base64mime.decode(encoded)
                    except binascii.Error:
                        # Turn this into a higher level exception.  BAW: Right
                        # now we throw the lower level exception away but
                        # when/if we get exception chaining, we'll preserve it.
                        raise HeaderParseError
                if dec is None:
                    dec = encoded

                if decoded and decoded[-1][1] == charset:
                    decoded[-1] = (decoded[-1][0] + dec, decoded[-1][1])
                else:
                    decoded.append((dec, charset))
            del parts[0:3]
    return decoded
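
# A minimal, illustrative sketch (not from this module) of what
# decode_header() returns; the header values are made up:
#
#     >>> decode_header('=?iso-8859-1?q?p=F6stal?=')
#     [('p\xf6stal', 'iso-8859-1')]
#     >>> decode_header('=?iso-8859-1?q?p=F6stal?= (post)')
#     [('p\xf6stal', 'iso-8859-1'), ('(post)', None)]
#
# Plain text that was never RFC 2047 encoded comes back as a single
# (string, None) pair.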




def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Create a Header from a sequence of pairs as returned by decode_header().

    decode_header() takes a header value string and returns a sequence of
    pairs of the format (decoded_string, charset) where charset is the string
    name of the character set.

    This function takes one of those sequences of pairs and returns a Header
    instance.  Optional maxlinelen, header_name, and continuation_ws are as in
    the Header constructor.
    """
    h = Header(maxlinelen=maxlinelen, header_name=header_name,
               continuation_ws=continuation_ws)
    for s, charset in decoded_seq:
        # None means us-ascii but we can simply pass it on to h.append()
        if charset is not None and not isinstance(charset, Charset):
            charset = Charset(charset)
        h.append(s, charset)
    return h
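
# A rough round-trip sketch (illustrative only): decode an existing header
# value and rebuild an equivalent Header object from the resulting pairs.
#
#     >>> pairs = decode_header('=?utf-8?q?hello?= world')
#     >>> h = make_header(pairs, header_name='Subject')
#     >>> print h
#
# The printed value should be an RFC 2047 encoded, folded rendition of the
# same text, though the exact encoded form may differ from the input.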




class Header:
    def __init__(self, s=None, charset=None,
                 maxlinelen=None, header_name=None,
                 continuation_ws=' ', errors='strict'):
        """Create a MIME-compliant header that can contain many character sets.

        Optional s is the initial header value.  If None, the initial header
        value is not set.  You can later append to the header with .append()
        method calls.  s may be a byte string or a Unicode string, but see the
        .append() documentation for semantics.

        Optional charset serves two purposes: it has the same meaning as the
        charset argument to the .append() method.  It also sets the default
        character set for all subsequent .append() calls that omit the charset
        argument.  If charset is not provided in the constructor, the us-ascii
        charset is used both as s's initial charset and as the default for
        subsequent .append() calls.

        The maximum line length can be specified explicitly via maxlinelen.
        For splitting the first line to a shorter value (to account for the
        field header which isn't included in s, e.g. `Subject') pass in the
        name of the field in header_name.  The default maxlinelen is 76.

        continuation_ws must be RFC 2822 compliant folding whitespace (usually
        either a space or a hard tab) which will be prepended to continuation
        lines.

        errors is passed through to the .append() call.
        """
        if charset is None:
            charset = USASCII
        if not isinstance(charset, Charset):
            charset = Charset(charset)
        self._charset = charset
        self._continuation_ws = continuation_ws
        cws_expanded_len = len(continuation_ws.replace('\t', SPACE8))
        # BAW: I believe `chunks' and `maxlinelen' should be non-public.
        self._chunks = []
        if s is not None:
            self.append(s, charset, errors)
        if maxlinelen is None:
            maxlinelen = MAXLINELEN
        if header_name is None:
            # We don't know anything about the field header so the first line
            # is the same length as subsequent lines.
            self._firstlinelen = maxlinelen
        else:
            # The first line should be shorter to take into account the field
            # header.  Also subtract off 2 extra for the colon and space.
            self._firstlinelen = maxlinelen - len(header_name) - 2
        # Second and subsequent lines should subtract off the length in
        # columns of the continuation whitespace prefix.
        self._maxlinelen = maxlinelen - cws_expanded_len
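
    # A worked example of the line-length bookkeeping above (an added
    # illustration, not original commentary).  With the defaults and
    #
    #     h = Header(header_name='Subject')
    #
    # the first physical line gets a budget of 76 - len('Subject') - 2 = 67
    # columns (the 2 covers the ': ' separator), while continuation lines get
    # 76 - 1 = 75 columns, since the default continuation whitespace is a
    # single space.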

    def __str__(self):
        """A synonym for self.encode()."""
        return self.encode()

    def __unicode__(self):
        """Helper for the built-in unicode function."""
        uchunks = []
        lastcs = None
        for s, charset in self._chunks:
            # We must preserve spaces between encoded and non-encoded word
            # boundaries, which means for us we need to add a space when we go
            # from a charset to None/us-ascii, or from None/us-ascii to a
            # charset.  Only do this for the second and subsequent chunks.
            nextcs = charset
            if uchunks:
                if lastcs not in (None, 'us-ascii'):
                    if nextcs in (None, 'us-ascii'):
                        uchunks.append(USPACE)
                        nextcs = None
                elif nextcs not in (None, 'us-ascii'):
                    uchunks.append(USPACE)
            lastcs = nextcs
            uchunks.append(unicode(s, str(charset)))
        return UEMPTYSTRING.join(uchunks)
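
    # For illustration (values are made up): with chunks such as
    # [('Hello', us-ascii Charset), ('W\xf6rld', iso-8859-1 Charset)],
    # unicode(h) decodes each chunk and re-inserts a single space at the
    # boundary between the ASCII and non-ASCII words, giving roughly
    # u'Hello W\xf6rld'.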

    # Rich comparison operators for equality only.  BAW: does it make sense to
    # have or explicitly disable <, <=, >, >= operators?
    def __eq__(self, other):
        # other may be a Header or a string.  Both are fine so coerce
        # ourselves to a string, swap the args and do another comparison.
        return other == self.encode()

    def __ne__(self, other):
        return not self == other

    def append(self, s, charset=None, errors='strict'):
        """Append a string to the MIME header.

        Optional charset, if given, should be a Charset instance or the name
        of a character set (which will be converted to a Charset instance).  A
        value of None (the default) means that the charset given in the
        constructor is used.

        s may be a byte string or a Unicode string.  If it is a byte string
        (i.e. isinstance(s, str) is true), then charset is the encoding of
        that byte string, and a UnicodeError will be raised if the string
        cannot be decoded with that charset.  If s is a Unicode string, then
        charset is a hint specifying the character set of the characters in
        the string.  In this case, when producing an RFC 2822 compliant header
        using RFC 2047 rules, the Unicode string will be encoded using the
        following charsets in order: us-ascii, the charset hint, utf-8.  The
        first character set not to provoke a UnicodeError is used.

        Optional `errors' is passed as the third argument to any unicode() or
        ustr.encode() call.
        """
        if charset is None:
            charset = self._charset
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        # If the charset is our faux 8bit charset, leave the string unchanged
        if charset != '8bit':
            # We need to test that the string can be converted to unicode and
            # back to a byte string, given the input and output codecs of the
            # charset.
            if isinstance(s, str):
                # Possibly raise UnicodeError if the byte string can't be
                # converted to a unicode with the input codec of the charset.
                incodec = charset.input_codec or 'us-ascii'
                ustr = unicode(s, incodec, errors)
                # Now make sure that the unicode could be converted back to a
                # byte string with the output codec, which may be different
                # than the input codec.  Still, use the original byte string.
                outcodec = charset.output_codec or 'us-ascii'
                ustr.encode(outcodec, errors)
            elif isinstance(s, unicode):
                # Now we have to be sure the unicode string can be converted
                # to a byte string with a reasonable output codec.  We want to
                # use the byte string in the chunk.
                for charset in USASCII, charset, UTF8:
                    try:
                        outcodec = charset.output_codec or 'us-ascii'
                        s = s.encode(outcodec, errors)
                        break
                    except UnicodeError:
                        pass
                else:
                    assert False, 'utf-8 conversion failed'
        self._chunks.append((s, charset))
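
    # A small sketch of the fallback order described in append()'s docstring
    # (illustrative values):
    #
    #     h = Header()
    #     h.append(u'Gr\xfc\xdfe', 'iso-8859-1')
    #
    # us-ascii cannot encode the non-ASCII characters, iso-8859-1 can, so the
    # chunk is stored as an iso-8859-1 byte string.  A pure-ASCII Unicode
    # string appended with the same hint would instead be stored with the
    # us-ascii charset, since that is tried first.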

    def _split(self, s, charset, maxlinelen, splitchars):
        # Split up a header safely for use with encode_chunks.
        splittable = charset.to_splittable(s)
        encoded = charset.from_splittable(splittable, True)
        elen = charset.encoded_header_len(encoded)
        # If the line's encoded length fits, just return it
        if elen <= maxlinelen:
            return [(encoded, charset)]
        # If we have undetermined raw 8bit characters sitting in a byte
        # string, we really don't know what the right thing to do is.  We
        # can't really split it because it might be multibyte data which we
        # could break if we split it between pairs.  The least harm seems to
        # be to not split the header at all, but that means they could go out
        # longer than maxlinelen.
        if charset == '8bit':
            return [(s, charset)]
        # BAW: I'm not sure what the right test here is.  What we're trying to
        # do is be faithful to RFC 2822's recommendation that (section 2.2.3):
        #
        # "Note: Though structured field bodies are defined in such a way that
        #  folding can take place between many of the lexical tokens (and even
        #  within some of the lexical tokens), folding SHOULD be limited to
        #  placing the CRLF at higher-level syntactic breaks."
        #
        # For now, I can only imagine doing this when the charset is us-ascii,
        # although it's possible that other charsets may also benefit from the
        # higher-level syntactic breaks.
        elif charset == 'us-ascii':
            return self._split_ascii(s, charset, maxlinelen, splitchars)
        # BAW: should we use encoded?
        elif elen == len(s):
            # We can split on _maxlinelen boundaries because we know that the
            # encoding won't change the size of the string
            splitpnt = maxlinelen
            first = charset.from_splittable(splittable[:splitpnt], False)
            last = charset.from_splittable(splittable[splitpnt:], False)
        else:
            # Binary search for split point
            first, last = _binsplit(splittable, charset, maxlinelen)
        # first is of the proper length so just wrap it in the appropriate
        # chrome.  last must be recursively split.
        fsplittable = charset.to_splittable(first)
        fencoded = charset.from_splittable(fsplittable, True)
        chunk = [(fencoded, charset)]
        return chunk + self._split(last, charset, self._maxlinelen, splitchars)
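
    # Rough intuition for the branches above (an added illustration, not
    # original commentary): when the encoded length equals the raw length, a
    # prefix of maxlinelen characters is guaranteed to fit, so the string is
    # cut there directly; otherwise _binsplit() searches for the longest
    # prefix whose encoded form still fits.  Either way the remainder is
    # re-split recursively against self._maxlinelen, the budget for
    # continuation lines.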

    def _split_ascii(self, s, charset, firstlen, splitchars):
        chunks = _split_ascii(s, firstlen, self._maxlinelen,
                              self._continuation_ws, splitchars)
        return zip(chunks, [charset]*len(chunks))

    def _encode_chunks(self, newchunks, maxlinelen):
        # MIME-encode a header with many different charsets and/or encodings.
        #
        # Given a list of pairs (string, charset), return a MIME-encoded
        # string suitable for use in a header field.  Each pair may have
        # different charsets and/or encodings, and the resulting header will
        # accurately reflect each setting.
        #
        # Each encoding can be email.utils.QP (quoted-printable, for
        # ASCII-like character sets like iso-8859-1), email.utils.BASE64
        # (Base64, for non-ASCII like character sets like KOI8-R and
        # iso-2022-jp), or None (no encoding).
        #
        # Each pair will be represented on a separate line; the resulting
        # string will be in the format:
        #
        # =?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
        #  =?charset2?b?SvxyZ2VuIEL2aW5n?=
        chunks = []
        for header, charset in newchunks:
            if not header:
                continue
            if charset is None or charset.header_encoding is None:
                s = header
            else:
                s = charset.header_encode(header)
            # Don't add more folding whitespace than necessary
            if chunks and chunks[-1].endswith(' '):
                extra = ''
            else:
                extra = ' '
            _max_append(chunks, s, maxlinelen, extra)
        joiner = NL + self._continuation_ws
        return joiner.join(chunks)

    def encode(self, splitchars=';, '):
        """Encode a message header into an RFC-compliant format.

        There are many issues involved in converting a given string for use in
        an email header.  Only certain character sets are readable in most
        email clients, and as header strings can only contain a subset of
        7-bit ASCII, care must be taken to properly convert and encode (with
        Base64 or quoted-printable) header strings.  In addition, there is a
        75-character length limit on any given encoded header field, so
        line-wrapping must be performed, even with double-byte character sets.

        This method will do its best to convert the string to the correct
        character set used in email, and encode and line wrap it safely with
        the appropriate scheme for that character set.

        If the given charset is not known or an error occurs during
        conversion, this function will return the header untouched.

        Optional splitchars is a string containing characters to split long
        ASCII lines on, in rough support of RFC 2822's `highest level
        syntactic breaks'.  This doesn't affect RFC 2047 encoded lines.
        """
        newchunks = []
        maxlinelen = self._firstlinelen
        lastlen = 0
        for s, charset in self._chunks:
            # The first bit of the next chunk should be just long enough to
            # fill the next line.  Don't forget the space separating the
            # encoded words.
            targetlen = maxlinelen - lastlen - 1
            if targetlen < charset.encoded_header_len(''):
                # Stick it on the next line
                targetlen = maxlinelen
            newchunks += self._split(s, charset, targetlen, splitchars)
            lastchunk, lastcharset = newchunks[-1]
            lastlen = lastcharset.encoded_header_len(lastchunk)
        return self._encode_chunks(newchunks, maxlinelen)
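
# An end-to-end sketch (illustrative, not taken from this module's tests):
#
#     h = Header('Hello World!', header_name='Subject')
#     h.append(u'Gr\xfc\xdfe aus M\xfcnchen', 'iso-8859-1')
#     print 'Subject: %s' % h.encode()
#
# The ASCII chunk stays readable, the iso-8859-1 chunk comes out as an
# RFC 2047 encoded word (something like '=?iso-8859-1?q?...?='), and any
# physical line that would exceed the 76-column default is folded onto a
# continuation line starting with the continuation whitespace given to the
# constructor.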




def _split_ascii(s, firstlen, restlen, continuation_ws, splitchars):
    lines = []
    maxlen = firstlen
    for line in s.splitlines():
        # Ignore any leading whitespace (i.e. continuation whitespace) already
        # on the line, since we'll be adding our own.
        line = line.lstrip()
        if len(line) < maxlen:
            lines.append(line)
            maxlen = restlen
            continue
        # Attempt to split the line at the highest-level syntactic break
        # possible.  Note that we don't have a lot of smarts about field
        # syntax; we just try to break on semi-colons, then commas, then
        # whitespace.
        for ch in splitchars:
            if ch in line:
                break
        else:
            # There's nothing useful to split the line on, not even spaces, so
            # just append this line unchanged
            lines.append(line)
            maxlen = restlen
            continue
        # Now split the line on the character plus trailing whitespace
        cre = re.compile(r'%s\s*' % ch)
        if ch in ';,':
            eol = ch
        else:
            eol = ''
        joiner = eol + ' '
        joinlen = len(joiner)
        wslen = len(continuation_ws.replace('\t', SPACE8))
        this = []
        linelen = 0
        for part in cre.split(line):
            curlen = linelen + max(0, len(this)-1) * joinlen
            partlen = len(part)
            onfirstline = not lines
            # We don't want to split after the field name, if we're on the
            # first line and the field name is present in the header string.
            if ch == ' ' and onfirstline and \
                   len(this) == 1 and fcre.match(this[0]):
                this.append(part)
                linelen += partlen
            elif curlen + partlen > maxlen:
                if this:
                    lines.append(joiner.join(this) + eol)
                # If this part is longer than maxlen and we aren't already
                # splitting on whitespace, try to recursively split this line
                # on whitespace.
                if partlen > maxlen and ch != ' ':
                    subl = _split_ascii(part, maxlen, restlen,
                                        continuation_ws, ' ')
                    lines.extend(subl[:-1])
                    this = [subl[-1]]
                else:
                    this = [part]
                linelen = wslen + len(this[-1])
                maxlen = restlen
            else:
                this.append(part)
                linelen += partlen
        # Put any left over parts on a line by themselves
        if this:
            lines.append(joiner.join(this))
    return lines
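
# A hedged sketch of the split hierarchy above (made-up values): with the
# default splitchars of ';, ', a long ASCII value such as
#
#     'foo bar; baz quux; a very long tail that will not fit on one line'
#
# is first broken at semi-colons, the surviving pieces are re-joined with
# '; ', and only a piece that is still longer than the line budget is
# recursively re-split on whitespace.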




def _binsplit(splittable, charset, maxlinelen):
    i = 0
    j = len(splittable)
    while i < j:
        # Invariants:
        # 1. splittable[:k] fits for all k <= i (note that we *assume*,
        #    at the start, that splittable[:0] fits).
        # 2. splittable[:k] does not fit for any k > j (at the start,
        #    this means we shouldn't look at any k > len(splittable)).
        # 3. We don't know about splittable[:k] for k in i+1..j.
        # 4. We want to set i to the largest k that fits, with i <= k <= j.
        #
        m = (i+j+1) >> 1  # ceiling((i+j)/2); i < m <= j
        chunk = charset.from_splittable(splittable[:m], True)
        chunklen = charset.encoded_header_len(chunk)
        if chunklen <= maxlinelen:
            # m is acceptable, so is a new lower bound.
            i = m
        else:
            # m is not acceptable, so final i must be < m.
            j = m - 1
    # i == j.  Invariant #1 implies that splittable[:i] fits, and
    # invariant #2 implies that splittable[:i+1] does not fit, so i
    # is what we're looking for.
    first = charset.from_splittable(splittable[:i], False)
    last = charset.from_splittable(splittable[i:], False)
    return first, last
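
# A worked trace of the binary search (illustrative, using a deliberately
# artificial model in which every character costs exactly two encoded
# columns).  With maxlinelen = 10, prefixes of up to 5 characters fit; for a
# 9-character splittable the loop proceeds roughly as:
#
#     i=0, j=9  ->  m=5, fits      ->  i=5
#     i=5, j=9  ->  m=7, too long  ->  j=6
#     i=5, j=6  ->  m=6, too long  ->  j=5
#
# leaving i == j == 5, the longest prefix whose encoded form still fits.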