1 | """ codecs -- Python Codec Registry, API and helpers.
|
---|
2 |
|
---|
3 |
|
---|
4 | Written by Marc-Andre Lemburg (mal@lemburg.com).
|
---|
5 |
|
---|
6 | (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
|
---|
7 |
|
---|
8 | """#"
|
---|
9 |
|
---|
10 | import __builtin__, sys
|
---|
11 |
|
---|
12 | ### Registry and builtin stateless codec functions
|
---|
13 |
|
---|
14 | try:
|
---|
15 | from _codecs import *
|
---|
16 | except ImportError, why:
|
---|
17 | raise SystemError('Failed to load the builtin codecs: %s' % why)
|
---|
18 |
|
---|
19 | __all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
|
---|
20 | "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
|
---|
21 | "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
|
---|
22 | "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
|
---|
23 | "strict_errors", "ignore_errors", "replace_errors",
|
---|
24 | "xmlcharrefreplace_errors",
|
---|
25 | "register_error", "lookup_error"]
|
---|
26 |
|
---|
### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = '\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = '\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = '\x00\x00\xfe\xff'

if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE

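# Illustrative note (not part of the original module): the BOM constants
# above are plain byte strings, so they can be matched against the start
# of encoded data, e.g. to sniff a UTF-8 signature:
#
#     >>> data = '\xef\xbb\xbfhello'
#     >>> data.startswith(BOM_UTF8)
#     True
#     >>> data[len(BOM_UTF8):]
#     'hello'
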
### Codec base classes (defining the API)

class CodecInfo(tuple):

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None):
        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        self.name = name
        self.encode = encode
        self.decode = decode
        self.incrementalencoder = incrementalencoder
        self.incrementaldecoder = incrementaldecoder
        self.streamwriter = streamwriter
        self.streamreader = streamreader
        return self

    def __repr__(self):
        return "<%s.%s object for encoding %s at 0x%x>" % \
               (self.__class__.__module__, self.__class__.__name__,
                self.name, id(self))

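# Illustrative sketch (not part of the original module): codec search
# functions passed to register() are expected to return a CodecInfo
# instance (or None).  The codec name 'myrot13' and the rot13_encode /
# rot13_decode helpers below are hypothetical:
#
#     >>> import codecs
#     >>> def search(name):
#     ...     if name == 'myrot13':
#     ...         return codecs.CodecInfo(rot13_encode, rot13_decode,
#     ...                                 name='myrot13')
#     ...     return None
#     >>> codecs.register(search)
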
class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                     Python will use the official U+FFFD REPLACEMENT
                     CHARACTER for the builtin Unicode codecs on
                     decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace' - Replace with backslashed escape sequences
                              (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

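# Illustrative note (assumes the builtin 'utf-8' codec): stateless encoders
# and decoders return an (output, length consumed) pair rather than just
# the converted object:
#
#     >>> import codecs
#     >>> codecs.getencoder('utf-8')(u'abc')
#     ('abc', 3)
#     >>> codecs.getdecoder('utf-8')('abc')
#     (u'abc', 3)
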
class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can be
    passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the encoder.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().
        """

class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    This subclass of IncrementalEncoder can be used as the baseclass for an
    incremental encoder if the encoder must keep some of the input in a
    buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        self.buffer = ""  # unencoded input that is kept between calls to encode()

    def _buffer_encode(self, input, errors, final):
        # Override this method in subclasses: it must encode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        # encode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_encode(data, self.errors, final)
        # keep unencoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        return self.buffer or 0

    def setstate(self, state):
        self.buffer = state or ""

class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can be
    passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decodes input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the decoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input.  In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  The effect of
        setstate((b"", 0)) must be equivalent to reset().
        """

class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the baseclass for an
    incremental decoder if the decoder must be able to handle incomplete byte
    sequences.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        self.buffer = ""  # undecoded input that is kept between calls to decode()

    def _buffer_decode(self, input, errors, final):
        # Override this method in subclasses: it must decode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        # decode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_decode(data, self.errors, final)
        # keep undecoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # ignore additional state info
        self.buffer = state[0]

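# Illustrative sketch (not part of the original module): a subclass only
# needs to supply _buffer_decode(); decode() above takes care of buffering
# whatever was not consumed.  The two-byte "codec" is made up purely to
# show the (output, length consumed) contract; a real codec would also
# check `final` and report trailing garbage:
#
#     class PairDecoder(BufferedIncrementalDecoder):
#         def _buffer_decode(self, input, errors, final):
#             # only consume complete two-byte units, leave the rest buffered
#             usable = len(input) - (len(input) % 2)
#             return input[:usable].decode('latin-1'), usable
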
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#

class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace' - replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace' - Replace with backslashed escape
                                  sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

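# Illustrative note (assumes the builtin 'utf-8' codec and a writable file):
# StreamWriter subclasses are normally obtained through getwriter(), defined
# further below, rather than instantiated directly:
#
#     >>> import codecs
#     >>> writer = codecs.getwriter('utf-8')(open('out.txt', 'wb'))
#     >>> writer.write(u'\u20ac')   # stored as the three UTF-8 bytes
#     >>> writer.close()
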
###

class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace' - replace with a suitable replacement character

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        self.bytebuffer = ""
        # For str->str decoding this will stay a str
        # For str->unicode decoding the first read will promote it to unicode
        self.charbuffer = ""
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return fewer, if there are not
            enough characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible. size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input, only the first
            line will be returned; the rest of the input will be kept
            until the next call to read().

            The method should use a greedy read strategy, meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g. if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars < 0:
                if size < 0:
                    if self.charbuffer:
                        break
                elif len(self.charbuffer) >= size:
                    break
            else:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError, exc:
                if firstline:
                    newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines) <= 1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as the size argument to the
            read() method.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend:  # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as a list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way to find the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = ""
        self.charbuffer = u""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.reset()
        self.stream.seek(offset, whence)

    def next(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

###

class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codecs.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        return self.reader.read(size)

    def readline(self, size=None):

        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        return self.reader.readlines(sizehint)

    def next(self):

        """ Return the next decoded line from the input stream."""
        return self.reader.next()

    def __iter__(self):
        return self

    def write(self, data):

        return self.writer.write(data)

    def writelines(self, list):

        return self.writer.writelines(list)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

###

class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller as encoded data.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface; Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(1)

    def next(self):

        """ Return the next decoded line from the input stream."""
        data = self.reader.next()
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

### Shortcuts

def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        Files are always opened in binary mode, even if no binary mode
        was specified. This is done to avoid data loss due to encodings
        using 8-bit values. The default file mode is 'rb' meaning to
        open the file in binary read mode.

        encoding specifies the encoding which is to be used for the
        file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

    """
    if encoding is not None:
        if 'U' in mode:
            # No automatic conversion of '\n' is done on reading and writing
            mode = mode.strip().replace('U', '')
            if mode[:1] not in set('rwa'):
                mode = 'r' + mode
        if 'b' not in mode:
            # Force opening of the file in binary mode
            mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    info = lookup(encoding)
    srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw

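# Illustrative note (assumes the builtin 'utf-8' codec): open() returns a
# StreamReaderWriter, so reads yield Unicode objects and writes expect them:
#
#     >>> import codecs
#     >>> f = codecs.open('example.txt', 'w', encoding='utf-8')
#     >>> f.write(u'caf\xe9\n')
#     >>> f.close()
#     >>> codecs.open('example.txt', 'r', encoding='utf-8').read()
#     u'caf\xe9\n'
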
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    sr = StreamRecoder(file, data_info.encode, data_info.decode,
                       file_info.streamreader, file_info.streamwriter, errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr

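# Illustrative note (uses the standard StringIO module): EncodedFile() can
# recode between two byte encodings; the __main__ block at the end of this
# module wraps sys.stdout/sys.stdin in the same way:
#
#     >>> import codecs, StringIO
#     >>> buf = StringIO.StringIO()
#     >>> wrapped = codecs.EncodedFile(buf, 'latin-1', 'utf-8')
#     >>> wrapped.write('caf\xe9')   # Latin-1 bytes in ...
#     >>> buf.getvalue()             # ... UTF-8 bytes out
#     'caf\xc3\xa9'
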
### Helpers for codec lookup

def getencoder(encoding):

    """ Looks up the codec for the given encoding and returns
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).encode

def getdecoder(encoding):

    """ Looks up the codec for the given encoding and returns
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).decode

def getincrementalencoder(encoding):

    """ Looks up the codec for the given encoding and returns
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    encoder = lookup(encoding).incrementalencoder
    if encoder is None:
        raise LookupError(encoding)
    return encoder

def getincrementaldecoder(encoding):

    """ Looks up the codec for the given encoding and returns
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    decoder = lookup(encoding).incrementaldecoder
    if decoder is None:
        raise LookupError(encoding)
    return decoder

def getreader(encoding):

    """ Looks up the codec for the given encoding and returns
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamreader

def getwriter(encoding):

    """ Looks up the codec for the given encoding and returns
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamwriter

def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an IncrementalEncoder.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = encoder.encode(input)
        if output:
            yield output
    output = encoder.encode("", True)
    if output:
        yield output

def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input strings from the iterator using an IncrementalDecoder.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = decoder.decode(input)
        if output:
            yield output
    output = decoder.decode("", True)
    if output:
        yield output

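# Illustrative note (assumes the builtin 'utf-8' codec): iterencode() and
# iterdecode() are lazy wrappers around the incremental codec classes, so
# multi-byte sequences may be split across input chunks:
#
#     >>> import codecs
#     >>> list(codecs.iterencode([u'ab', u'\u20ac'], 'utf-8'))
#     ['ab', '\xe2\x82\xac']
#     >>> u''.join(codecs.iterdecode(['\xe2\x82', '\xac'], 'utf-8'))
#     u'\u20ac'
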
### Helpers for charmap-based codecs

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    res = {}
    for i in rng:
        res[i] = i
    return res

def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to \u001a.

    """
    m = {}
    for k, v in decoding_map.items():
        if v not in m:
            m[v] = k
        else:
            m[v] = None
    return m

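# Illustrative sketch (made-up three-entry map): make_encoding_map() inverts
# a decoding map and marks duplicate targets as undefined:
#
#     >>> decoding_map = {0x00: u'a', 0x01: u'?', 0x02: u'?'}
#     >>> m = make_encoding_map(decoding_map)
#     >>> m[u'a'], m[u'?']
#     (0, None)
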
### error handlers

try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None

# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
if _false:
    import encodings

### Tests

if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')