Context Navigation

← Previous Revision
Next Revision →
Normal
Revision Log

codecs.py

Last change on this file was 391, checked in by dmik, 11 years ago
python: Merge vendor 2.7.6 to trunk.
Property svn:eol-style set to `native`
File size: 34.4 KB

Rev	Line
[2]	1	""" codecs -- Python Codec Registry, API and helpers.
	2
	3
	4	Written by Marc-Andre Lemburg (mal@lemburg.com).
	5
	6	(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
	7
	8	"""#"
	9
	10	import __builtin__, sys
	11
	12	### Registry and builtin stateless codec functions
	13
	14	try:
	15	from _codecs import *
	16	except ImportError, why:
	17	raise SystemError('Failed to load the builtin codecs: %s' % why)
	18
	19	__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
	20	"BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
	21	"BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
	22	"BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
	23	"strict_errors", "ignore_errors", "replace_errors",
	24	"xmlcharrefreplace_errors",
	25	"register_error", "lookup_error"]
	26
	27	### Constants
	28
	29	#
	30	# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
	31	# and its possible byte string values
	32	# for UTF8/UTF16/UTF32 output and little/big endian machines
	33	#
	34
	35	# UTF-8
	36	BOM_UTF8 = '\xef\xbb\xbf'
	37
	38	# UTF-16, little endian
	39	BOM_LE = BOM_UTF16_LE = '\xff\xfe'
	40
	41	# UTF-16, big endian
	42	BOM_BE = BOM_UTF16_BE = '\xfe\xff'
	43
	44	# UTF-32, little endian
	45	BOM_UTF32_LE = '\xff\xfe\x00\x00'
	46
	47	# UTF-32, big endian
	48	BOM_UTF32_BE = '\x00\x00\xfe\xff'
	49
	50	if sys.byteorder == 'little':
	51
	52	# UTF-16, native endianness
	53	BOM = BOM_UTF16 = BOM_UTF16_LE
	54
	55	# UTF-32, native endianness
	56	BOM_UTF32 = BOM_UTF32_LE
	57
	58	else:
	59
	60	# UTF-16, native endianness
	61	BOM = BOM_UTF16 = BOM_UTF16_BE
	62
	63	# UTF-32, native endianness
	64	BOM_UTF32 = BOM_UTF32_BE
	65
	66	# Old broken names (don't use in new code)
	67	BOM32_LE = BOM_UTF16_LE
	68	BOM32_BE = BOM_UTF16_BE
	69	BOM64_LE = BOM_UTF32_LE
	70	BOM64_BE = BOM_UTF32_BE
	71
	72
	73	### Codec base classes (defining the API)
	74
	75	class CodecInfo(tuple):
	76
	77	def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
	78	incrementalencoder=None, incrementaldecoder=None, name=None):
	79	self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
	80	self.name = name
	81	self.encode = encode
	82	self.decode = decode
	83	self.incrementalencoder = incrementalencoder
	84	self.incrementaldecoder = incrementaldecoder
	85	self.streamwriter = streamwriter
	86	self.streamreader = streamreader
	87	return self
	88
	89	def __repr__(self):
	90	return "<%s.%s object for encoding %s at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, id(self))
	91
	92	class Codec:
	93
	94	""" Defines the interface for stateless encoders/decoders.
	95
	96	The .encode()/.decode() methods may use different error
	97	handling schemes by providing the errors argument. These
	98	string values are predefined:
	99
	100	'strict' - raise a ValueError error (or a subclass)
	101	'ignore' - ignore the character and continue with the next
	102	'replace' - replace with a suitable replacement character;
	103	Python will use the official U+FFFD REPLACEMENT
	104	CHARACTER for the builtin Unicode codecs on
	105	decoding and '?' on encoding.
	106	'xmlcharrefreplace' - Replace with the appropriate XML
	107	character reference (only for encoding).
	108	'backslashreplace' - Replace with backslashed escape sequences
	109	(only for encoding).
	110
	111	The set of allowed values can be extended via register_error.
	112
	113	"""
	114	def encode(self, input, errors='strict'):
	115
	116	""" Encodes the object input and returns a tuple (output
	117	object, length consumed).
	118
	119	errors defines the error handling to apply. It defaults to
	120	'strict' handling.
	121
	122	The method may not store state in the Codec instance. Use
	123	StreamCodec for codecs which have to keep state in order to
	124	make encoding/decoding efficient.
	125
	126	The encoder must be able to handle zero length input and
	127	return an empty object of the output object type in this
	128	situation.
	129
	130	"""
	131	raise NotImplementedError
	132
	133	def decode(self, input, errors='strict'):
	134
	135	""" Decodes the object input and returns a tuple (output
	136	object, length consumed).
	137
	138	input must be an object which provides the bf_getreadbuf
	139	buffer slot. Python strings, buffer objects and memory
	140	mapped files are examples of objects providing this slot.
	141
	142	errors defines the error handling to apply. It defaults to
	143	'strict' handling.
	144
	145	The method may not store state in the Codec instance. Use
	146	StreamCodec for codecs which have to keep state in order to
	147	make encoding/decoding efficient.
	148
	149	The decoder must be able to handle zero length input and
	150	return an empty object of the output object type in this
	151	situation.
	152
	153	"""
	154	raise NotImplementedError
	155
	156	class IncrementalEncoder(object):
	157	"""
	158	An IncrementalEncoder encodes an input in multiple steps. The input can be
	159	passed piece by piece to the encode() method. The IncrementalEncoder remembers
	160	the state of the Encoding process between calls to encode().
	161	"""
	162	def __init__(self, errors='strict'):
	163	"""
	164	Creates an IncrementalEncoder instance.
	165
	166	The IncrementalEncoder may use different error handling schemes by
	167	providing the errors keyword argument. See the module docstring
	168	for a list of possible values.
	169	"""
	170	self.errors = errors
	171	self.buffer = ""
	172
	173	def encode(self, input, final=False):
	174	"""
	175	Encodes input and returns the resulting object.
	176	"""
	177	raise NotImplementedError
	178
	179	def reset(self):
	180	"""
	181	Resets the encoder to the initial state.
	182	"""
	183
	184	def getstate(self):
	185	"""
	186	Return the current state of the encoder.
	187	"""
	188	return 0
	189
	190	def setstate(self, state):
	191	"""
	192	Set the current state of the encoder. state must have been
	193	returned by getstate().
	194	"""
	195
	196	class BufferedIncrementalEncoder(IncrementalEncoder):
	197	"""
	198	This subclass of IncrementalEncoder can be used as the baseclass for an
	199	incremental encoder if the encoder must keep some of the output in a
	200	buffer between calls to encode().
	201	"""
	202	def __init__(self, errors='strict'):
	203	IncrementalEncoder.__init__(self, errors)
	204	self.buffer = "" # unencoded input that is kept between calls to encode()
	205
	206	def _buffer_encode(self, input, errors, final):
	207	# Overwrite this method in subclasses: It must encode input
	208	# and return an (output, length consumed) tuple
	209	raise NotImplementedError
	210
	211	def encode(self, input, final=False):
	212	# encode input (taking the buffer into account)
	213	data = self.buffer + input
	214	(result, consumed) = self._buffer_encode(data, self.errors, final)
	215	# keep unencoded input until the next call
	216	self.buffer = data[consumed:]
	217	return result
	218
	219	def reset(self):
	220	IncrementalEncoder.reset(self)
	221	self.buffer = ""
	222
	223	def getstate(self):
	224	return self.buffer or 0
	225
	226	def setstate(self, state):
	227	self.buffer = state or ""
	228
	229	class IncrementalDecoder(object):
	230	"""
	231	An IncrementalDecoder decodes an input in multiple steps. The input can be
	232	passed piece by piece to the decode() method. The IncrementalDecoder
	233	remembers the state of the decoding process between calls to decode().
	234	"""
	235	def __init__(self, errors='strict'):
	236	"""
	237	Creates a IncrementalDecoder instance.
	238
	239	The IncrementalDecoder may use different error handling schemes by
	240	providing the errors keyword argument. See the module docstring
	241	for a list of possible values.
	242	"""
	243	self.errors = errors
	244
	245	def decode(self, input, final=False):
	246	"""
	247	Decodes input and returns the resulting object.
	248	"""
	249	raise NotImplementedError
	250
	251	def reset(self):
	252	"""
	253	Resets the decoder to the initial state.
	254	"""
	255
	256	def getstate(self):
	257	"""
	258	Return the current state of the decoder.
	259
	260	This must be a (buffered_input, additional_state_info) tuple.
	261	buffered_input must be a bytes object containing bytes that
	262	were passed to decode() that have not yet been converted.
	263	additional_state_info must be a non-negative integer
	264	representing the state of the decoder WITHOUT yet having
	265	processed the contents of buffered_input. In the initial state
	266	and after reset(), getstate() must return (b"", 0).
	267	"""
	268	return (b"", 0)
	269
	270	def setstate(self, state):
	271	"""
	272	Set the current state of the decoder.
	273
	274	state must have been returned by getstate(). The effect of
	275	setstate((b"", 0)) must be equivalent to reset().
	276	"""
	277
	278	class BufferedIncrementalDecoder(IncrementalDecoder):
	279	"""
	280	This subclass of IncrementalDecoder can be used as the baseclass for an
	281	incremental decoder if the decoder must be able to handle incomplete byte
	282	sequences.
	283	"""
	284	def __init__(self, errors='strict'):
	285	IncrementalDecoder.__init__(self, errors)
	286	self.buffer = "" # undecoded input that is kept between calls to decode()
	287
	288	def _buffer_decode(self, input, errors, final):
	289	# Overwrite this method in subclasses: It must decode input
	290	# and return an (output, length consumed) tuple
	291	raise NotImplementedError
	292
	293	def decode(self, input, final=False):
	294	# decode input (taking the buffer into account)
	295	data = self.buffer + input
	296	(result, consumed) = self._buffer_decode(data, self.errors, final)
	297	# keep undecoded input until the next call
	298	self.buffer = data[consumed:]
	299	return result
	300
	301	def reset(self):
	302	IncrementalDecoder.reset(self)
	303	self.buffer = ""
	304
	305	def getstate(self):
	306	# additional state info is always 0
	307	return (self.buffer, 0)
	308
	309	def setstate(self, state):
	310	# ignore additional state info
	311	self.buffer = state[0]
	312
	313	#
	314	# The StreamWriter and StreamReader class provide generic working
	315	# interfaces which can be used to implement new encoding submodules
	316	# very easily. See encodings/utf_8.py for an example on how this is
	317	# done.
	318	#
	319
	320	class StreamWriter(Codec):
	321
	322	def __init__(self, stream, errors='strict'):
	323
	324	""" Creates a StreamWriter instance.
	325
	326	stream must be a file-like object open for writing
	327	(binary) data.
	328
	329	The StreamWriter may use different error handling
	330	schemes by providing the errors keyword argument. These
	331	parameters are predefined:
	332
	333	'strict' - raise a ValueError (or a subclass)
	334	'ignore' - ignore the character and continue with the next
	335	'replace'- replace with a suitable replacement character
	336	'xmlcharrefreplace' - Replace with the appropriate XML
	337	character reference.
	338	'backslashreplace' - Replace with backslashed escape
	339	sequences (only for encoding).
	340
	341	The set of allowed parameter values can be extended via
	342	register_error.
	343	"""
	344	self.stream = stream
	345	self.errors = errors
	346
	347	def write(self, object):
	348
	349	""" Writes the object's contents encoded to self.stream.
	350	"""
	351	data, consumed = self.encode(object, self.errors)
	352	self.stream.write(data)
	353
	354	def writelines(self, list):
	355
	356	""" Writes the concatenated list of strings to the stream
	357	using .write().
	358	"""
	359	self.write(''.join(list))
	360
	361	def reset(self):
	362
	363	""" Flushes and resets the codec buffers used for keeping state.
	364
	365	Calling this method should ensure that the data on the
	366	output is put into a clean state, that allows appending
	367	of new fresh data without having to rescan the whole
	368	stream to recover state.
	369
	370	"""
	371	pass
	372
[391]	373	def seek(self, offset, whence=0):
	374	self.stream.seek(offset, whence)
	375	if whence == 0 and offset == 0:
	376	self.reset()
	377
[2]	378	def __getattr__(self, name,
	379	getattr=getattr):
	380
	381	""" Inherit all other methods from the underlying stream.
	382	"""
	383	return getattr(self.stream, name)
	384
	385	def __enter__(self):
	386	return self
	387
	388	def __exit__(self, type, value, tb):
	389	self.stream.close()
	390
	391	###
	392
	393	class StreamReader(Codec):
	394
	395	def __init__(self, stream, errors='strict'):
	396
	397	""" Creates a StreamReader instance.
	398
	399	stream must be a file-like object open for reading
	400	(binary) data.
	401
	402	The StreamReader may use different error handling
	403	schemes by providing the errors keyword argument. These
	404	parameters are predefined:
	405
	406	'strict' - raise a ValueError (or a subclass)
	407	'ignore' - ignore the character and continue with the next
	408	'replace'- replace with a suitable replacement character;
	409
	410	The set of allowed parameter values can be extended via
	411	register_error.
	412	"""
	413	self.stream = stream
	414	self.errors = errors
	415	self.bytebuffer = ""
	416	# For str->str decoding this will stay a str
	417	# For str->unicode decoding the first read will promote it to unicode
	418	self.charbuffer = ""
	419	self.linebuffer = None
	420
	421	def decode(self, input, errors='strict'):
	422	raise NotImplementedError
	423
	424	def read(self, size=-1, chars=-1, firstline=False):
	425
	426	""" Decodes data from the stream self.stream and returns the
	427	resulting object.
	428
	429	chars indicates the number of characters to read from the
	430	stream. read() will never return more than chars
	431	characters, but it might return less, if there are not enough
	432	characters available.
	433
	434	size indicates the approximate maximum number of bytes to
	435	read from the stream for decoding purposes. The decoder
	436	can modify this setting as appropriate. The default value
	437	-1 indicates to read and decode as much as possible. size
	438	is intended to prevent having to decode huge files in one
	439	step.
	440
	441	If firstline is true, and a UnicodeDecodeError happens
	442	after the first line terminator in the input only the first line
	443	will be returned, the rest of the input will be kept until the
	444	next call to read().
	445
	446	The method should use a greedy read strategy meaning that
	447	it should read as much data as is allowed within the
	448	definition of the encoding and the given size, e.g. if
	449	optional encoding endings or state markers are available
	450	on the stream, these should be read too.
	451	"""
	452	# If we have lines cached, first merge them back into characters
	453	if self.linebuffer:
	454	self.charbuffer = "".join(self.linebuffer)
	455	self.linebuffer = None
	456
	457	# read until we get the required number of characters (if available)
	458	while True:
	459	# can the request can be satisfied from the character buffer?
	460	if chars < 0:
	461	if size < 0:
	462	if self.charbuffer:
	463	break
	464	elif len(self.charbuffer) >= size:
	465	break
	466	else:
	467	if len(self.charbuffer) >= chars:
	468	break
	469	# we need more data
	470	if size < 0:
	471	newdata = self.stream.read()
	472	else:
	473	newdata = self.stream.read(size)
	474	# decode bytes (those remaining from the last call included)
	475	data = self.bytebuffer + newdata
	476	try:
	477	newchars, decodedbytes = self.decode(data, self.errors)
	478	except UnicodeDecodeError, exc:
	479	if firstline:
	480	newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
	481	lines = newchars.splitlines(True)
	482	if len(lines)<=1:
	483	raise
	484	else:
	485	raise
	486	# keep undecoded bytes until the next call
	487	self.bytebuffer = data[decodedbytes:]
	488	# put new characters in the character buffer
	489	self.charbuffer += newchars
	490	# there was no data available
	491	if not newdata:
	492	break
	493	if chars < 0:
	494	# Return everything we've got
	495	result = self.charbuffer
	496	self.charbuffer = ""
	497	else:
	498	# Return the first chars characters
	499	result = self.charbuffer[:chars]
	500	self.charbuffer = self.charbuffer[chars:]
	501	return result
	502
	503	def readline(self, size=None, keepends=True):
	504
	505	""" Read one line from the input stream and return the
	506	decoded data.
	507
	508	size, if given, is passed as size argument to the
	509	read() method.
	510
	511	"""
	512	# If we have lines cached from an earlier read, return
	513	# them unconditionally
	514	if self.linebuffer:
	515	line = self.linebuffer[0]
	516	del self.linebuffer[0]
	517	if len(self.linebuffer) == 1:
	518	# revert to charbuffer mode; we might need more data
	519	# next time
	520	self.charbuffer = self.linebuffer[0]
	521	self.linebuffer = None
	522	if not keepends:
	523	line = line.splitlines(False)[0]
	524	return line
	525
	526	readsize = size or 72
	527	line = ""
	528	# If size is given, we call read() only once
	529	while True:
	530	data = self.read(readsize, firstline=True)
	531	if data:
	532	# If we're at a "\r" read one extra character (which might
	533	# be a "\n") to get a proper line ending. If the stream is
	534	# temporarily exhausted we return the wrong line ending.
	535	if data.endswith("\r"):
	536	data += self.read(size=1, chars=1)
	537
	538	line += data
	539	lines = line.splitlines(True)
	540	if lines:
	541	if len(lines) > 1:
	542	# More than one line result; the first line is a full line
	543	# to return
	544	line = lines[0]
	545	del lines[0]
	546	if len(lines) > 1:
	547	# cache the remaining lines
	548	lines[-1] += self.charbuffer
	549	self.linebuffer = lines
	550	self.charbuffer = None
	551	else:
	552	# only one remaining line, put it back into charbuffer
	553	self.charbuffer = lines[0] + self.charbuffer
	554	if not keepends:
	555	line = line.splitlines(False)[0]
	556	break
	557	line0withend = lines[0]
	558	line0withoutend = lines[0].splitlines(False)[0]
	559	if line0withend != line0withoutend: # We really have a line end
	560	# Put the rest back together and keep it until the next call
	561	self.charbuffer = "".join(lines[1:]) + self.charbuffer
	562	if keepends:
	563	line = line0withend
	564	else:
	565	line = line0withoutend
	566	break
	567	# we didn't get anything or this was our only try
	568	if not data or size is not None:
	569	if line and not keepends:
	570	line = line.splitlines(False)[0]
	571	break
	572	if readsize<8000:
	573	readsize *= 2
	574	return line
	575
	576	def readlines(self, sizehint=None, keepends=True):
	577
	578	""" Read all lines available on the input stream
	579	and return them as list of lines.
	580
	581	Line breaks are implemented using the codec's decoder
	582	method and are included in the list entries.
	583
	584	sizehint, if given, is ignored since there is no efficient
	585	way to finding the true end-of-line.
	586
	587	"""
	588	data = self.read()
	589	return data.splitlines(keepends)
	590
	591	def reset(self):
	592
	593	""" Resets the codec buffers used for keeping state.
	594
	595	Note that no stream repositioning should take place.
	596	This method is primarily intended to be able to recover
	597	from decoding errors.
	598
	599	"""
	600	self.bytebuffer = ""
	601	self.charbuffer = u""
	602	self.linebuffer = None
	603
	604	def seek(self, offset, whence=0):
	605	""" Set the input stream's current position.
	606
	607	Resets the codec buffers used for keeping state.
	608	"""
[391]	609	self.stream.seek(offset, whence)
[2]	610	self.reset()
	611
	612	def next(self):
	613
	614	""" Return the next decoded line from the input stream."""
	615	line = self.readline()
	616	if line:
	617	return line
	618	raise StopIteration
	619
	620	def __iter__(self):
	621	return self
	622
	623	def __getattr__(self, name,
	624	getattr=getattr):
	625
	626	""" Inherit all other methods from the underlying stream.
	627	"""
	628	return getattr(self.stream, name)
	629
	630	def __enter__(self):
	631	return self
	632
	633	def __exit__(self, type, value, tb):
	634	self.stream.close()
	635
	636	###
	637
	638	class StreamReaderWriter:
	639
	640	""" StreamReaderWriter instances allow wrapping streams which
	641	work in both read and write modes.
	642
	643	The design is such that one can use the factory functions
	644	returned by the codec.lookup() function to construct the
	645	instance.
	646
	647	"""
	648	# Optional attributes set by the file wrappers below
	649	encoding = 'unknown'
	650
	651	def __init__(self, stream, Reader, Writer, errors='strict'):
	652
	653	""" Creates a StreamReaderWriter instance.
	654
	655	stream must be a Stream-like object.
	656
	657	Reader, Writer must be factory functions or classes
	658	providing the StreamReader, StreamWriter interface resp.
	659
	660	Error handling is done in the same way as defined for the
	661	StreamWriter/Readers.
	662
	663	"""
	664	self.stream = stream
	665	self.reader = Reader(stream, errors)
	666	self.writer = Writer(stream, errors)
	667	self.errors = errors
	668
	669	def read(self, size=-1):
	670
	671	return self.reader.read(size)
	672
	673	def readline(self, size=None):
	674
	675	return self.reader.readline(size)
	676
	677	def readlines(self, sizehint=None):
	678
	679	return self.reader.readlines(sizehint)
	680
	681	def next(self):
	682
	683	""" Return the next decoded line from the input stream."""
	684	return self.reader.next()
	685
	686	def __iter__(self):
	687	return self
	688
	689	def write(self, data):
	690
	691	return self.writer.write(data)
	692
	693	def writelines(self, list):
	694
	695	return self.writer.writelines(list)
	696
	697	def reset(self):
	698
	699	self.reader.reset()
	700	self.writer.reset()
	701
[391]	702	def seek(self, offset, whence=0):
	703	self.stream.seek(offset, whence)
	704	self.reader.reset()
	705	if whence == 0 and offset == 0:
	706	self.writer.reset()
	707
[2]	708	def __getattr__(self, name,
	709	getattr=getattr):
	710
	711	""" Inherit all other methods from the underlying stream.
	712	"""
	713	return getattr(self.stream, name)
	714
	715	# these are needed to make "with codecs.open(...)" work properly
	716
	717	def __enter__(self):
	718	return self
	719
	720	def __exit__(self, type, value, tb):
	721	self.stream.close()
	722
	723	###
	724
	725	class StreamRecoder:
	726
	727	""" StreamRecoder instances provide a frontend - backend
	728	view of encoding data.
	729
	730	They use the complete set of APIs returned by the
	731	codecs.lookup() function to implement their task.
	732
	733	Data written to the stream is first decoded into an
	734	intermediate format (which is dependent on the given codec
	735	combination) and then written to the stream using an instance
	736	of the provided Writer class.
	737
	738	In the other direction, data is read from the stream using a
	739	Reader instance and then return encoded data to the caller.
	740
	741	"""
	742	# Optional attributes set by the file wrappers below
	743	data_encoding = 'unknown'
	744	file_encoding = 'unknown'
	745
	746	def __init__(self, stream, encode, decode, Reader, Writer,
	747	errors='strict'):
	748
	749	""" Creates a StreamRecoder instance which implements a two-way
	750	conversion: encode and decode work on the frontend (the
	751	input to .read() and output of .write()) while
	752	Reader and Writer work on the backend (reading and
	753	writing to the stream).
	754
	755	You can use these objects to do transparent direct
	756	recodings from e.g. latin-1 to utf-8 and back.
	757
	758	stream must be a file-like object.
	759
	760	encode, decode must adhere to the Codec interface, Reader,
	761	Writer must be factory functions or classes providing the
	762	StreamReader, StreamWriter interface resp.
	763
	764	encode and decode are needed for the frontend translation,
	765	Reader and Writer for the backend translation. Unicode is
	766	used as intermediate encoding.
	767
	768	Error handling is done in the same way as defined for the
	769	StreamWriter/Readers.
	770
	771	"""
	772	self.stream = stream
	773	self.encode = encode
	774	self.decode = decode
	775	self.reader = Reader(stream, errors)
	776	self.writer = Writer(stream, errors)
	777	self.errors = errors
	778
	779	def read(self, size=-1):
	780
	781	data = self.reader.read(size)
	782	data, bytesencoded = self.encode(data, self.errors)
	783	return data
	784
	785	def readline(self, size=None):
	786
	787	if size is None:
	788	data = self.reader.readline()
	789	else:
	790	data = self.reader.readline(size)
	791	data, bytesencoded = self.encode(data, self.errors)
	792	return data
	793
	794	def readlines(self, sizehint=None):
	795
	796	data = self.reader.read()
	797	data, bytesencoded = self.encode(data, self.errors)
	798	return data.splitlines(1)
	799
	800	def next(self):
	801
	802	""" Return the next decoded line from the input stream."""
	803	data = self.reader.next()
	804	data, bytesencoded = self.encode(data, self.errors)
	805	return data
	806
	807	def __iter__(self):
	808	return self
	809
	810	def write(self, data):
	811
	812	data, bytesdecoded = self.decode(data, self.errors)
	813	return self.writer.write(data)
	814
	815	def writelines(self, list):
	816
	817	data = ''.join(list)
	818	data, bytesdecoded = self.decode(data, self.errors)
	819	return self.writer.write(data)
	820
	821	def reset(self):
	822
	823	self.reader.reset()
	824	self.writer.reset()
	825
	826	def __getattr__(self, name,
	827	getattr=getattr):
	828
	829	""" Inherit all other methods from the underlying stream.
	830	"""
	831	return getattr(self.stream, name)
	832
	833	def __enter__(self):
	834	return self
	835
	836	def __exit__(self, type, value, tb):
	837	self.stream.close()
	838
	839	### Shortcuts
	840
	841	def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):
	842
	843	""" Open an encoded file using the given mode and return
	844	a wrapped version providing transparent encoding/decoding.
	845
	846	Note: The wrapped version will only accept the object format
	847	defined by the codecs, i.e. Unicode objects for most builtin
	848	codecs. Output is also codec dependent and will usually be
	849	Unicode as well.
	850
	851	Files are always opened in binary mode, even if no binary mode
	852	was specified. This is done to avoid data loss due to encodings
	853	using 8-bit values. The default file mode is 'rb' meaning to
	854	open the file in binary read mode.
	855
	856	encoding specifies the encoding which is to be used for the
	857	file.
	858
	859	errors may be given to define the error handling. It defaults
	860	to 'strict' which causes ValueErrors to be raised in case an
	861	encoding error occurs.
	862
	863	buffering has the same meaning as for the builtin open() API.
	864	It defaults to line buffered.
	865
	866	The returned wrapped file object provides an extra attribute
	867	.encoding which allows querying the used encoding. This
	868	attribute is only available if an encoding was specified as
	869	parameter.
	870
	871	"""
	872	if encoding is not None:
	873	if 'U' in mode:
	874	# No automatic conversion of '\n' is done on reading and writing
	875	mode = mode.strip().replace('U', '')
	876	if mode[:1] not in set('rwa'):
	877	mode = 'r' + mode
	878	if 'b' not in mode:
	879	# Force opening of the file in binary mode
	880	mode = mode + 'b'
	881	file = __builtin__.open(filename, mode, buffering)
	882	if encoding is None:
	883	return file
	884	info = lookup(encoding)
	885	srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
	886	# Add attributes to simplify introspection
	887	srw.encoding = encoding
	888	return srw
	889
	890	def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):
	891
	892	""" Return a wrapped version of file which provides transparent
	893	encoding translation.
	894
	895	Strings written to the wrapped file are interpreted according
	896	to the given data_encoding and then written to the original
	897	file as string using file_encoding. The intermediate encoding
	898	will usually be Unicode but depends on the specified codecs.
	899
	900	Strings are read from the file using file_encoding and then
	901	passed back to the caller as string using data_encoding.
	902
	903	If file_encoding is not given, it defaults to data_encoding.
	904
	905	errors may be given to define the error handling. It defaults
	906	to 'strict' which causes ValueErrors to be raised in case an
	907	encoding error occurs.
	908
	909	The returned wrapped file object provides two extra attributes
	910	.data_encoding and .file_encoding which reflect the given
	911	parameters of the same name. The attributes can be used for
	912	introspection by Python programs.
	913
	914	"""
	915	if file_encoding is None:
	916	file_encoding = data_encoding
	917	data_info = lookup(data_encoding)
	918	file_info = lookup(file_encoding)
	919	sr = StreamRecoder(file, data_info.encode, data_info.decode,
	920	file_info.streamreader, file_info.streamwriter, errors)
	921	# Add attributes to simplify introspection
	922	sr.data_encoding = data_encoding
	923	sr.file_encoding = file_encoding
	924	return sr
	925
	926	### Helpers for codec lookup
	927
	928	def getencoder(encoding):
	929
	930	""" Lookup up the codec for the given encoding and return
	931	its encoder function.
	932
	933	Raises a LookupError in case the encoding cannot be found.
	934
	935	"""
	936	return lookup(encoding).encode
	937
	938	def getdecoder(encoding):
	939
	940	""" Lookup up the codec for the given encoding and return
	941	its decoder function.
	942
	943	Raises a LookupError in case the encoding cannot be found.
	944
	945	"""
	946	return lookup(encoding).decode
	947
	948	def getincrementalencoder(encoding):
	949
	950	""" Lookup up the codec for the given encoding and return
	951	its IncrementalEncoder class or factory function.
	952
	953	Raises a LookupError in case the encoding cannot be found
	954	or the codecs doesn't provide an incremental encoder.
	955
	956	"""
	957	encoder = lookup(encoding).incrementalencoder
	958	if encoder is None:
	959	raise LookupError(encoding)
	960	return encoder
	961
	962	def getincrementaldecoder(encoding):
	963
	964	""" Lookup up the codec for the given encoding and return
	965	its IncrementalDecoder class or factory function.
	966
	967	Raises a LookupError in case the encoding cannot be found
	968	or the codecs doesn't provide an incremental decoder.
	969
	970	"""
	971	decoder = lookup(encoding).incrementaldecoder
	972	if decoder is None:
	973	raise LookupError(encoding)
	974	return decoder
	975
	976	def getreader(encoding):
	977
	978	""" Lookup up the codec for the given encoding and return
	979	its StreamReader class or factory function.
	980
	981	Raises a LookupError in case the encoding cannot be found.
	982
	983	"""
	984	return lookup(encoding).streamreader
	985
	986	def getwriter(encoding):
	987
	988	""" Lookup up the codec for the given encoding and return
	989	its StreamWriter class or factory function.
	990
	991	Raises a LookupError in case the encoding cannot be found.
	992
	993	"""
	994	return lookup(encoding).streamwriter
	995
	996	def iterencode(iterator, encoding, errors='strict', **kwargs):
	997	"""
	998	Encoding iterator.
	999
	1000	Encodes the input strings from the iterator using a IncrementalEncoder.
	1001
	1002	errors and kwargs are passed through to the IncrementalEncoder
	1003	constructor.
	1004	"""
	1005	encoder = getincrementalencoder(encoding)(errors, **kwargs)
	1006	for input in iterator:
	1007	output = encoder.encode(input)
	1008	if output:
	1009	yield output
	1010	output = encoder.encode("", True)
	1011	if output:
	1012	yield output
	1013
	1014	def iterdecode(iterator, encoding, errors='strict', **kwargs):
	1015	"""
	1016	Decoding iterator.
	1017
	1018	Decodes the input strings from the iterator using a IncrementalDecoder.
	1019
	1020	errors and kwargs are passed through to the IncrementalDecoder
	1021	constructor.
	1022	"""
	1023	decoder = getincrementaldecoder(encoding)(errors, **kwargs)
	1024	for input in iterator:
	1025	output = decoder.decode(input)
	1026	if output:
	1027	yield output
	1028	output = decoder.decode("", True)
	1029	if output:
	1030	yield output
	1031
	1032	### Helpers for charmap-based codecs
	1033
	1034	def make_identity_dict(rng):
	1035
	1036	""" make_identity_dict(rng) -> dict
	1037
	1038	Return a dictionary where elements of the rng sequence are
	1039	mapped to themselves.
	1040
	1041	"""
	1042	res = {}
	1043	for i in rng:
	1044	res[i]=i
	1045	return res
	1046
	1047	def make_encoding_map(decoding_map):
	1048
	1049	""" Creates an encoding map from a decoding map.
	1050
	1051	If a target mapping in the decoding map occurs multiple
	1052	times, then that target is mapped to None (undefined mapping),
	1053	causing an exception when encountered by the charmap codec
	1054	during translation.
	1055
	1056	One example where this happens is cp875.py which decodes
	1057	multiple character to \u001a.
	1058
	1059	"""
	1060	m = {}
	1061	for k,v in decoding_map.items():
	1062	if not v in m:
	1063	m[v] = k
	1064	else:
	1065	m[v] = None
	1066	return m
	1067
	1068	### error handlers
	1069
	1070	try:
	1071	strict_errors = lookup_error("strict")
	1072	ignore_errors = lookup_error("ignore")
	1073	replace_errors = lookup_error("replace")
	1074	xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
	1075	backslashreplace_errors = lookup_error("backslashreplace")
	1076	except LookupError:
	1077	# In --disable-unicode builds, these error handler are missing
	1078	strict_errors = None
	1079	ignore_errors = None
	1080	replace_errors = None
	1081	xmlcharrefreplace_errors = None
	1082	backslashreplace_errors = None
	1083
# Tell modulefinder that using codecs probably needs the encodings
# package.
#
# The import below is never executed because _false is 0.
# NOTE(review): the condition is presumably spelled with a variable
# (not a literal 0/False) so that the import statement is not dropped
# as constant-false dead code and stays visible to tools that scan for
# imports -- confirm before "simplifying" this.
_false = 0
if _false:
    import encodings
	1089
	1090	### Tests
	1091
# Small demonstration, only run when the module is executed as a
# script: sys.stdout and sys.stdin are replaced by recoding wrappers.
if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output:
    # strings written are interpreted as Latin-1 (data_encoding) and
    # written to the real stdout as UTF-8 (file_encoding)
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input:
    # data is read from the real stdin as Latin-1 (file_encoding) and
    # handed to the caller as UTF-8 (data_encoding)
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Lib/codecs.py

Download in other formats: