source: python/trunk/Lib/encodings/utf_8_sig.py

Last change on this file was 391, checked in by dmik, 11 years ago

python: Merge vendor 2.7.6 to trunk.

  • Property svn:eol-style set to native
File size: 3.6 KB
Line 
1""" Python 'utf-8-sig' Codec
2This works similarly to UTF-8, with the following changes:
3
4* On encoding/writing a UTF-8 encoded BOM will be prepended/written as the
5 first three bytes.
6
7* On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these
8 bytes will be skipped.
9"""
10import codecs
11
12### Codec APIs
13
def encode(input, errors='strict'):
    """Stateless encoder: UTF-8 encode *input* and prepend the UTF-8 BOM.

    Returns a ``(data, length)`` tuple where ``data`` is the BOM followed
    by the UTF-8 encoding of *input* and ``length`` is ``len(input)``.
    *errors* is passed through to ``codecs.utf_8_encode``.
    """
    encoded = codecs.utf_8_encode(input, errors)[0]
    return (codecs.BOM_UTF8 + encoded, len(input))
16
def decode(input, errors='strict'):
    """Stateless decoder: UTF-8 decode *input*, skipping a leading BOM.

    If the first three bytes of *input* are the UTF-8 BOM they are dropped
    before decoding. Returns ``(text, consumed)``; ``consumed`` includes
    the three BOM bytes when a BOM was skipped. The input is always treated
    as complete (the underlying decoder is called with ``final=True``).
    """
    has_bom = input[:3] == codecs.BOM_UTF8
    skipped = 3 if has_bom else 0
    payload = input[3:] if has_bom else input
    text, used = codecs.utf_8_decode(payload, errors, True)
    return (text, used + skipped)
24
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder that emits the UTF-8 BOM before the first chunk.

    ``self.first`` is 1 while the BOM is still pending and 0 once it has
    been written; it is also the value exposed by getstate()/setstate().
    """

    def __init__(self, errors='strict'):
        codecs.IncrementalEncoder.__init__(self, errors)
        self.first = 1

    def encode(self, input, final=False):
        """Return the UTF-8 bytes for *input*, BOM-prefixed on first use.

        *final* is accepted for interface compatibility and is not used.
        """
        bom_pending = self.first
        if bom_pending:
            # Clear the flag before encoding, exactly as a BOM is about
            # to be written for this chunk.
            self.first = 0
        encoded = codecs.utf_8_encode(input, self.errors)[0]
        if bom_pending:
            return codecs.BOM_UTF8 + encoded
        return encoded

    def reset(self):
        """Reset the encoder so the next chunk is BOM-prefixed again."""
        codecs.IncrementalEncoder.reset(self)
        self.first = 1

    def getstate(self):
        """Return the BOM-pending flag as the encoder state."""
        return self.first

    def setstate(self, state):
        """Restore the BOM-pending flag from *state*."""
        self.first = state
46
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental UTF-8 decoder that skips a leading UTF-8 BOM.

    ``self.first`` is True until it has been decided whether the stream
    starts with a BOM; afterwards it is None and data is passed straight
    to the UTF-8 decoder.
    """

    def __init__(self, errors='strict'):
        codecs.BufferedIncrementalDecoder.__init__(self, errors)
        self.first = True

    def _buffer_decode(self, input, errors, final):
        """Decode buffered *input*, returning ``(text, consumed)``.

        While the BOM decision is pending and fewer than three bytes are
        available, nothing is consumed as long as the bytes could still
        turn out to be a BOM. When *final* is true no more data can
        arrive, so a non-empty partial BOM is handed to the UTF-8 decoder,
        which reports it according to *errors*, instead of being silently
        dropped. This matches the stateless decode() of this codec.
        """
        if self.first:
            if len(input) < 3:
                could_be_bom = codecs.BOM_UTF8.startswith(input)
                # Keep waiting only if more data may come, or if there is
                # nothing to decode at all (empty input stays undecided).
                if could_be_bom and not (final and input):
                    # not enough data to decide if this really is a BOM
                    # => try again on the next call
                    return (u"", 0)
                self.first = None
            else:
                self.first = None
                if input[:3] == codecs.BOM_UTF8:
                    (output, consumed) = codecs.utf_8_decode(
                        input[3:], errors, final)
                    return (output, consumed+3)
        return codecs.utf_8_decode(input, errors, final)

    def reset(self):
        """Reset the decoder so a leading BOM is looked for again."""
        codecs.BufferedIncrementalDecoder.reset(self)
        self.first = True
71
class StreamWriter(codecs.StreamWriter):
    """Stream writer that prefixes the first encoded chunk with a BOM.

    The first call to encode() replaces itself with the plain UTF-8
    encoder by storing it as an instance attribute; reset() removes that
    attribute so the BOM-writing method is used again.
    """

    def reset(self):
        """Reset the writer and re-arm BOM output for the next write."""
        codecs.StreamWriter.reset(self)
        # Drop the per-instance override (if any) so that the class-level
        # encode() below becomes visible again.
        vars(self).pop('encode', None)

    def encode(self, input, errors='strict'):
        """Encode *input* with a leading BOM (first call only)."""
        # Install the plain UTF-8 encoder for all subsequent calls before
        # producing the BOM-prefixed result for this one.
        self.encode = codecs.utf_8_encode
        result = encode(input, errors)
        return result
83
class StreamReader(codecs.StreamReader):
    """Stream reader that skips a leading UTF-8 BOM.

    Once the BOM question is settled, decode() replaces itself with the
    plain UTF-8 decoder by storing it as an instance attribute; reset()
    removes that attribute so BOM detection happens again.
    """

    def reset(self):
        """Reset the reader and re-enable BOM detection."""
        codecs.StreamReader.reset(self)
        # Remove the per-instance override (if any) so the class-level
        # decode() below is used again.
        vars(self).pop('decode', None)

    def decode(self, input, errors='strict'):
        """Decode *input*, skipping a leading BOM; return (text, consumed)."""
        enough_data = len(input) >= 3
        if enough_data and input[:3] == codecs.BOM_UTF8:
            # BOM found: switch to plain UTF-8 and account for the three
            # skipped bytes in the consumed count.
            self.decode = codecs.utf_8_decode
            text, used = codecs.utf_8_decode(input[3:], errors)
            return (text, used + 3)
        if not enough_data and codecs.BOM_UTF8.startswith(input):
            # not enough data to decide if this is a BOM
            # => try again on the next call
            return (u"", 0)
        # no BOM present
        self.decode = codecs.utf_8_decode
        return codecs.utf_8_decode(input, errors)
105
106### encodings module API
107
def getregentry():
    """Return the CodecInfo entry that registers the 'utf-8-sig' codec."""
    codec_parts = dict(
        name='utf-8-sig',
        encode=encode,
        decode=decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
    return codecs.CodecInfo(**codec_parts)
Note: See TracBrowser for help on using the repository browser.