1 | """ Python 'utf-8-sig' Codec
|
---|
2 | This works similarly to UTF-8 with the following changes:
|
---|
3 |
|
---|
4 | * On encoding/writing a UTF-8 encoded BOM will be prepended/written as the
|
---|
5 | first three bytes.
|
---|
6 |
|
---|
7 | * On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these
|
---|
8 | bytes will be skipped.
|
---|
9 | """
|
---|
10 | import codecs
|
---|
11 |
|
---|
12 | ### Codec APIs
|
---|
13 |
|
---|
def encode(input, errors='strict'):
    """Encode *input* as UTF-8 and put the UTF-8 BOM in front of it.

    Returns a ``(data, length)`` tuple: *data* is ``codecs.BOM_UTF8``
    followed by the UTF-8 encoding of *input*; *length* is ``len(input)``.
    *errors* is handed unchanged to ``codecs.utf_8_encode``.
    """
    payload, _ = codecs.utf_8_encode(input, errors)
    return (codecs.BOM_UTF8 + payload, len(input))
|
---|
16 |
|
---|
def decode(input, errors='strict'):
    """Decode a complete UTF-8 byte string, skipping one leading BOM.

    Returns a ``(text, consumed)`` tuple.  When *input* starts with
    ``codecs.BOM_UTF8`` those three bytes are dropped before decoding and
    are included in *consumed*.  The payload is always decoded as final
    data (``final=True``).
    """
    has_bom = input[:3] == codecs.BOM_UTF8
    if has_bom:
        payload, skipped = input[3:], 3
    else:
        payload, skipped = input, 0
    text, used = codecs.utf_8_decode(payload, errors, True)
    return (text, used + skipped)
|
---|
24 |
|
---|
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental UTF-8 encoder that emits the BOM before the first chunk."""

    def __init__(self, errors='strict'):
        codecs.IncrementalEncoder.__init__(self, errors)
        # 1 while the BOM still has to be written, 0 afterwards.
        self.first = 1

    def encode(self, input, final=False):
        """Return the UTF-8 bytes for *input*, BOM-prefixed on the first call.

        *final* is accepted for interface compatibility and is not used.
        """
        bom_pending = self.first
        if bom_pending:
            # Clear the flag before encoding, so it stays cleared even if
            # the encode call below raises.
            self.first = 0
        encoded = codecs.utf_8_encode(input, self.errors)[0]
        if bom_pending:
            return codecs.BOM_UTF8 + encoded
        return encoded

    def reset(self):
        """Return to the initial state: the next chunk gets a BOM again."""
        codecs.IncrementalEncoder.reset(self)
        self.first = 1

    def getstate(self):
        """Return the BOM-pending flag as the encoder state."""
        return self.first

    def setstate(self, state):
        """Restore a BOM-pending flag previously returned by getstate()."""
        self.first = state
|
---|
46 |
|
---|
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental UTF-8 decoder that skips a BOM at the start of the data.

    Besides the byte buffer kept by the base class, the decoder tracks
    whether it is still at the start of the data (``self.first``).  That
    flag is included in getstate()/setstate(), so a saved state restored
    later does not make the decoder strip a BOM-looking sequence from the
    middle of the data.
    """

    def __init__(self, errors='strict'):
        codecs.BufferedIncrementalDecoder.__init__(self, errors)
        # True until it has been decided whether the data starts with a BOM.
        self.first = True

    def _buffer_decode(self, input, errors, final):
        """Decode *input*, dropping a leading BOM if it starts the data.

        Returns ``(text, consumed)``.  While fewer than three bytes are
        available and they could still be the start of a BOM, nothing is
        consumed so the decision is retried on the next call.
        """
        if self.first:
            if len(input) < 3:
                if codecs.BOM_UTF8.startswith(input):
                    # not enough data to decide if this really is a BOM
                    # => try again on the next call
                    return (u"", 0)
                # Too short, but already known not to be a BOM.
                self.first = False
            else:
                self.first = False
                if input[:3] == codecs.BOM_UTF8:
                    (output, consumed) = codecs.utf_8_decode(
                        input[3:], errors, final)
                    # Count the three skipped BOM bytes as consumed.
                    return (output, consumed + 3)
        return codecs.utf_8_decode(input, errors, final)

    def reset(self):
        """Return to the initial state: a leading BOM is skipped again."""
        codecs.BufferedIncrementalDecoder.reset(self)
        self.first = True

    def getstate(self):
        """Return ``(buffered_bytes, flag)``; flag is 1 while a BOM may come."""
        buffered = codecs.BufferedIncrementalDecoder.getstate(self)[0]
        return (buffered, 1 if self.first else 0)

    def setstate(self, state):
        """Restore a ``(buffered_bytes, flag)`` state from getstate()."""
        codecs.BufferedIncrementalDecoder.setstate(self, state)
        self.first = bool(state[1])
|
---|
71 |
|
---|
class StreamWriter(codecs.StreamWriter):
    """Stream writer whose first encode() call after a reset emits the BOM."""

    def reset(self):
        """Reset the writer so the next encode() call emits the BOM again."""
        codecs.StreamWriter.reset(self)
        try:
            # Remove the instance-level override installed by encode(),
            # which re-exposes the BOM-writing method defined on the class.
            del self.encode
        except AttributeError:
            # No override is installed: encode() has not run since the
            # last reset, so there is nothing to undo.
            pass

    def encode(self, input, errors='strict'):
        """Encode *input* with a leading BOM; later calls skip the BOM.

        Replaces itself on the instance with ``codecs.utf_8_encode`` before
        delegating to the module-level ``encode``.
        """
        # Install the override first, so it is in place even if the
        # encode call below raises.
        self.encode = codecs.utf_8_encode
        return encode(input, errors)
|
---|
83 |
|
---|
class StreamReader(codecs.StreamReader):
    """Stream reader that skips a BOM at the start of the data."""

    def reset(self):
        """Reset the reader so the next decode() call checks for a BOM again."""
        codecs.StreamReader.reset(self)
        try:
            # Remove the instance-level override installed by decode(),
            # which re-exposes the BOM-checking method defined on the class.
            del self.decode
        except AttributeError:
            # No override is installed, so there is nothing to undo.
            pass

    def decode(self, input, errors='strict'):
        """Decode *input*, dropping a leading BOM if one is present.

        Returns ``(text, consumed)``.  Once it is known whether the data
        starts with a BOM, this method replaces itself on the instance with
        ``codecs.utf_8_decode``.
        """
        bom = codecs.BOM_UTF8
        too_short = len(input) < 3
        if too_short and bom.startswith(input):
            # not enough data to decide if this is a BOM
            # => try again on the next call
            return (u"", 0)
        has_bom = (not too_short) and input[:3] == bom
        # The BOM question is settled either way: switch to plain UTF-8.
        self.decode = codecs.utf_8_decode
        if has_bom:
            text, used = codecs.utf_8_decode(input[3:], errors)
            # Count the three skipped BOM bytes as consumed.
            return (text, used + 3)
        return codecs.utf_8_decode(input, errors)
|
---|
105 |
|
---|
106 | ### encodings module API
|
---|
107 |
|
---|
def getregentry():
    """Return the ``codecs.CodecInfo`` entry describing the utf-8-sig codec.

    The entry names the codec ``'utf-8-sig'`` and points at this module's
    stateless functions, incremental classes and stream classes.
    """
    registration = {
        'name': 'utf-8-sig',
        'encode': encode,
        'decode': decode,
        'incrementalencoder': IncrementalEncoder,
        'incrementaldecoder': IncrementalDecoder,
        'streamreader': StreamReader,
        'streamwriter': StreamWriter,
    }
    return codecs.CodecInfo(**registration)
|
---|