1 | """ codecs -- Python Codec Registry, API and helpers.
|
---|
2 |
|
---|
3 |
|
---|
4 | Written by Marc-Andre Lemburg (mal@lemburg.com).
|
---|
5 |
|
---|
6 | (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
|
---|
7 |
|
---|
8 | """#"
|
---|
9 |
|
---|
10 | import __builtin__, sys
|
---|
11 |
|
---|
12 | ### Registry and builtin stateless codec functions
|
---|
13 |
|
---|
14 | try:
|
---|
15 | from _codecs import *
|
---|
16 | except ImportError, why:
|
---|
17 | raise SystemError('Failed to load the builtin codecs: %s' % why)
|
---|
18 |
|
---|
19 | __all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
|
---|
20 | "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
|
---|
21 | "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
|
---|
22 | "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
|
---|
23 | "strict_errors", "ignore_errors", "replace_errors",
|
---|
24 | "xmlcharrefreplace_errors",
|
---|
25 | "register_error", "lookup_error"]
|
---|
26 |
|
---|
### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = '\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = '\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = '\x00\x00\xfe\xff'

if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE

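# Illustrative note (not part of the original module): the BOM constants
# above are plain byte strings, so they can be matched against the start
# of encoded data, e.g. to sniff a UTF-8 signature:
#
#     >>> data = '\xef\xbb\xbfhello'
#     >>> data.startswith(BOM_UTF8)
#     True
#     >>> data[len(BOM_UTF8):]
#     'hello'
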
### Codec base classes (defining the API)

class CodecInfo(tuple):

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None):
        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        self.name = name
        self.encode = encode
        self.decode = decode
        self.incrementalencoder = incrementalencoder
        self.incrementaldecoder = incrementaldecoder
        self.streamwriter = streamwriter
        self.streamreader = streamreader
        return self

    def __repr__(self):
        return "<%s.%s object for encoding %s at 0x%x>" % \
               (self.__class__.__module__, self.__class__.__name__,
                self.name, id(self))

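# Illustrative sketch (not part of the original module): codec search
# functions passed to register() are expected to return a CodecInfo
# instance (or None).  The codec name 'myrot13' and the rot13_encode /
# rot13_decode helpers below are hypothetical:
#
#     >>> import codecs
#     >>> def search(name):
#     ...     if name == 'myrot13':
#     ...         return codecs.CodecInfo(rot13_encode, rot13_decode,
#     ...                                 name='myrot13')
#     ...     return None
#     >>> codecs.register(search)
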
class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                     Python will use the official U+FFFD REPLACEMENT
                     CHARACTER for the builtin Unicode codecs on
                     decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace' - Replace with backslashed escape sequences
                              (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

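# Illustrative note (assumes the builtin 'utf-8' codec): stateless encoders
# and decoders return an (output, length consumed) pair rather than just
# the converted object:
#
#     >>> import codecs
#     >>> codecs.getencoder('utf-8')(u'abc')
#     ('abc', 3)
#     >>> codecs.getdecoder('utf-8')('abc')
#     (u'abc', 3)
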
class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can be
    passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the encoder.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().
        """

class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    This subclass of IncrementalEncoder can be used as the baseclass for an
    incremental encoder if the encoder must keep some of the input in a
    buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        self.buffer = ""  # unencoded input that is kept between calls to encode()

    def _buffer_encode(self, input, errors, final):
        # Override this method in subclasses: it must encode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        # encode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_encode(data, self.errors, final)
        # keep unencoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        return self.buffer or 0

    def setstate(self, state):
        self.buffer = state or ""

class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can be
    passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decodes input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the decoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input.  In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  The effect of
        setstate((b"", 0)) must be equivalent to reset().
        """

class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the baseclass for an
    incremental decoder if the decoder must be able to handle incomplete byte
    sequences.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        self.buffer = ""  # undecoded input that is kept between calls to decode()

    def _buffer_decode(self, input, errors, final):
        # Override this method in subclasses: it must decode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        # decode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_decode(data, self.errors, final)
        # keep undecoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # ignore additional state info
        self.buffer = state[0]

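# Illustrative sketch (not part of the original module): a subclass only
# needs to supply _buffer_decode(); decode() above takes care of buffering
# whatever was not consumed.  The two-byte "codec" is made up purely to
# show the (output, length consumed) contract; a real codec would also
# check `final` and report trailing garbage:
#
#     class PairDecoder(BufferedIncrementalDecoder):
#         def _buffer_decode(self, input, errors, final):
#             # only consume complete two-byte units, leave the rest buffered
#             usable = len(input) - (len(input) % 2)
#             return input[:usable].decode('latin-1'), usable
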
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#

class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace' - replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace' - Replace with backslashed escape
                                  sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

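# Illustrative note (assumes the builtin 'utf-8' codec and a writable file):
# StreamWriter subclasses are normally obtained through getwriter(), defined
# further below, rather than instantiated directly:
#
#     >>> import codecs
#     >>> writer = codecs.getwriter('utf-8')(open('out.txt', 'wb'))
#     >>> writer.write(u'\u20ac')   # stored as the three UTF-8 bytes
#     >>> writer.close()
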
###

class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace' - replace with a suitable replacement character

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        self.bytebuffer = ""
        # For str->str decoding this will stay a str
        # For str->unicode decoding the first read will promote it to unicode
        self.charbuffer = ""
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return fewer, if there are not
            enough characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible. size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input, only the first
            line will be returned; the rest of the input will be kept
            until the next call to read().

            The method should use a greedy read strategy, meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g. if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars < 0:
                if size < 0:
                    if self.charbuffer:
                        break
                elif len(self.charbuffer) >= size:
                    break
            else:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError, exc:
                if firstline:
                    newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines) <= 1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as the size argument to the
            read() method.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend:  # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as a list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way to find the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = ""
        self.charbuffer = u""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.reset()
        self.stream.seek(offset, whence)

    def next(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

###

class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codecs.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        return self.reader.read(size)

    def readline(self, size=None):

        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        return self.reader.readlines(sizehint)

    def next(self):

        """ Return the next decoded line from the input stream."""
        return self.reader.next()

    def __iter__(self):
        return self

    def write(self, data):

        return self.writer.write(data)

    def writelines(self, list):

        return self.writer.writelines(list)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

###

class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller as encoded data.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface; Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(1)

    def next(self):

        """ Return the next decoded line from the input stream."""
        data = self.reader.next()
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

### Shortcuts

def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        Files are always opened in binary mode, even if no binary mode
        was specified. This is done to avoid data loss due to encodings
        using 8-bit values. The default file mode is 'rb' meaning to
        open the file in binary read mode.

        encoding specifies the encoding which is to be used for the
        file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

    """
    if encoding is not None:
        if 'U' in mode:
            # No automatic conversion of '\n' is done on reading and writing
            mode = mode.strip().replace('U', '')
            if mode[:1] not in set('rwa'):
                mode = 'r' + mode
        if 'b' not in mode:
            # Force opening of the file in binary mode
            mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    info = lookup(encoding)
    srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw

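# Illustrative note (assumes the builtin 'utf-8' codec): open() returns a
# StreamReaderWriter, so reads yield Unicode objects and writes expect them:
#
#     >>> import codecs
#     >>> f = codecs.open('example.txt', 'w', encoding='utf-8')
#     >>> f.write(u'caf\xe9\n')
#     >>> f.close()
#     >>> codecs.open('example.txt', 'r', encoding='utf-8').read()
#     u'caf\xe9\n'
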
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    sr = StreamRecoder(file, data_info.encode, data_info.decode,
                       file_info.streamreader, file_info.streamwriter, errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr

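# Illustrative note (uses the standard StringIO module): EncodedFile() can
# recode between two byte encodings; the __main__ block at the end of this
# module wraps sys.stdout/sys.stdin in the same way:
#
#     >>> import codecs, StringIO
#     >>> buf = StringIO.StringIO()
#     >>> wrapped = codecs.EncodedFile(buf, 'latin-1', 'utf-8')
#     >>> wrapped.write('caf\xe9')   # Latin-1 bytes in ...
#     >>> buf.getvalue()             # ... UTF-8 bytes out
#     'caf\xc3\xa9'
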
### Helpers for codec lookup

def getencoder(encoding):

    """ Looks up the codec for the given encoding and returns
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).encode

def getdecoder(encoding):

    """ Looks up the codec for the given encoding and returns
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).decode

def getincrementalencoder(encoding):

    """ Looks up the codec for the given encoding and returns
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    encoder = lookup(encoding).incrementalencoder
    if encoder is None:
        raise LookupError(encoding)
    return encoder

def getincrementaldecoder(encoding):

    """ Looks up the codec for the given encoding and returns
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    decoder = lookup(encoding).incrementaldecoder
    if decoder is None:
        raise LookupError(encoding)
    return decoder

def getreader(encoding):

    """ Looks up the codec for the given encoding and returns
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamreader

def getwriter(encoding):

    """ Looks up the codec for the given encoding and returns
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamwriter

def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an IncrementalEncoder.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = encoder.encode(input)
        if output:
            yield output
    output = encoder.encode("", True)
    if output:
        yield output

def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input strings from the iterator using an IncrementalDecoder.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = decoder.decode(input)
        if output:
            yield output
    output = decoder.decode("", True)
    if output:
        yield output

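# Illustrative note (assumes the builtin 'utf-8' codec): iterencode() and
# iterdecode() are lazy wrappers around the incremental codec classes, so
# multi-byte sequences may be split across input chunks:
#
#     >>> import codecs
#     >>> list(codecs.iterencode([u'ab', u'\u20ac'], 'utf-8'))
#     ['ab', '\xe2\x82\xac']
#     >>> u''.join(codecs.iterdecode(['\xe2\x82', '\xac'], 'utf-8'))
#     u'\u20ac'
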
### Helpers for charmap-based codecs

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    res = {}
    for i in rng:
        res[i] = i
    return res

def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to \u001a.

    """
    m = {}
    for k, v in decoding_map.items():
        if v not in m:
            m[v] = k
        else:
            m[v] = None
    return m

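# Illustrative sketch (made-up three-entry map): make_encoding_map() inverts
# a decoding map and marks duplicate targets as undefined:
#
#     >>> decoding_map = {0x00: u'a', 0x01: u'?', 0x02: u'?'}
#     >>> m = make_encoding_map(decoding_map)
#     >>> m[u'a'], m[u'?']
#     (0, None)
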
### error handlers

try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None

# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
if _false:
    import encodings

### Tests

if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')