1 | import re, unicodedata, sys
|
---|
2 |
|
---|
# The script walks every code point up to 0x10FFFF through unichr(), so it
# refuses to run on an interpreter whose sys.maxunicode is only 0xFFFF.
if sys.maxunicode == 65535:
    # Call form of raise: accepted by every Python version, unlike the
    # "raise Class, value" statement form.
    raise RuntimeError("need UCS-4 Python")
|
---|
5 |
|
---|
def gen_category(cats):
    """Yield, in ascending order, every code point whose general category
    (as reported by unicodedata.category) is contained in *cats*.

    The whole range 0 .. 0x10FFFF is scanned.
    """
    category_of = unicodedata.category
    for codepoint in range(0x110000):
        if category_of(unichr(codepoint)) not in cats:
            continue
        yield codepoint
|
---|
10 |
|
---|
def gen_bidirectional(cats):
    """Yield, in ascending order, every code point whose bidirectional
    class (as reported by unicodedata.bidirectional) is contained in *cats*.

    The whole range 0 .. 0x10FFFF is scanned.
    """
    bidi_class_of = unicodedata.bidirectional
    for codepoint in range(0x110000):
        if bidi_class_of(unichr(codepoint)) not in cats:
            continue
        yield codepoint
|
---|
15 |
|
---|
def compact_set(l):
    """Return Python source text for a set holding the integers in *l*.

    *l* must yield integers in ascending order without duplicates.  Runs of
    consecutive values are written as ``range(start,stop)`` terms, all other
    values go into one list literal, and the pieces are joined with ``+``,
    e.g. ``set([10] + range(1,5))``.  The emitted expression concatenates
    range() results, so it relies on range() returning a list.

    An empty *l* yields ``set()``.
    """
    singles = []        # values written out individually
    ranges = []         # (start, stop) pairs, stop exclusive
    start = None        # first value of the run being collected
    span = 0            # number of values following `start` in that run
    for e in l:
        if start is None:
            start = e
            span = 0
            continue
        if start + span + 1 == e:
            # e extends the current run
            span += 1
            continue
        # e breaks the run: flush it.  Only runs of four or more values
        # are worth a range() term; shorter ones are listed one by one.
        if span > 2:
            ranges.append((start, start + span + 1))
        else:
            singles.extend(range(start, start + span + 1))
        start = e
        span = 0
    # Flush the final run -- but only if there was any input at all;
    # otherwise None would end up in the emitted list.
    if start is not None:
        if span:
            ranges.append((start, start + span + 1))
        else:
            singles.append(start)
    range_expr = " + ".join(["range(%d,%d)" % r for r in ranges])
    if not singles:
        return "set(%s)" % range_expr
    if not range_expr:
        return "set(%s)" % repr(singles)
    return "set(%s + %s)" % (repr(singles), range_expr)
|
---|
46 |
|
---|
47 | ############## Read the tables in the RFC #######################
|
---|
48 |
|
---|
# Load the RFC text.  The handle is closed explicitly (try/finally rather
# than "with", to stay within the language level used by the rest of this
# script).
rfc = open("rfc3454.txt")
try:
    data = rfc.readlines()
finally:
    rfc.close()

# tables collects (name, dict) pairs in the order the tables appear in the
# RFC; the generation code further down consumes them in that order.
tables = []
# Name of the table currently being read, or None between tables.
curname = None
for l in data:
    l = l.strip()
    if not l:
        continue
    # Skip RFC page breaks
    if l.startswith("Hoffman & Blanchet") or\
       l.startswith("RFC 3454"):
        continue
    # Find start/end lines
    m = re.match("----- (Start|End) Table ([A-Z](.[0-9])+) -----", l)
    if m:
        if m.group(1) == "Start":
            if curname:
                # Real exception objects: string exceptions are not
                # raisable on Python 2.6 and later.
                raise RuntimeError("Double Start", (curname, l))
            curname = m.group(2)
            table = {}
            tables.append((curname, table))
            continue
        else:
            if not curname:
                raise RuntimeError("End without start", l)
            curname = None
            continue
    if not curname:
        # Text outside any table is ignored.
        continue
    # Now we are in a table
    fields = l.split(";")
    if len(fields) > 1:
        # Drop comment field
        fields = fields[:-1]
    if len(fields) == 1:
        # Set-style entry: a single code point or a "start-end" range.
        # Every member is stored mapped to itself.
        fields = fields[0].split("-")
        if len(fields) > 1:
            # range
            try:
                start, end = fields
            except ValueError:
                raise RuntimeError("Unpacking problem", l)
        else:
            start = end = fields[0]
        start = int(start, 16)
        end = int(end, 16)
        for i in range(start, end+1):
            table[i] = i
    else:
        # Mapping-style entry: code point, then space-separated
        # replacement code points (possibly none).
        code, value = fields
        value = value.strip()
        if value:
            value = [int(v, 16) for v in value.split(" ")]
        else:
            # table B.1
            value = None
        table[int(code, 16)] = value
|
---|
106 |
|
---|
107 | ########### Generate compact Python versions of the tables #############
|
---|
108 |
|
---|
# Emit the fixed preamble of the generated module: the DO-NOT-EDIT banner,
# the module docstring (its triple quotes are backslash-escaped so they can
# sit inside this literal) and the import the emitted functions rely on.
print """# This file is generated by mkstringprep.py. DO NOT EDIT.
\"\"\"Library that exposes various tables found in the StringPrep RFC 3454.

There are two kinds of tables: sets, for which a member test is provided,
and mappings, for which a mapping function is provided.
\"\"\"

import unicodedata
"""

# Emit an assertion pinning the generated module to the Unicode database
# version of the interpreter that is running this script.
print "assert unicodedata.unidata_version == %s" % repr(unicodedata.unidata_version)
|
---|
120 |
|
---|
# A.1 is the table of unassigned characters
# XXX Plane 15 PUA is listed as unassigned in Python.
# The parsed tables are consumed strictly in order: take the head of the
# list, remove it, and assert that it is the table this section expects.
name, table = tables[0]
del tables[0]
assert name == "A.1"
table = set(table.keys())
# All code points the local Unicode database puts in category Cn.
Cn = set(gen_category(["Cn"]))

# FDD0..FDEF are process internal codes
Cn -= set(range(0xFDD0, 0xFDF0))
# not a character: code points xxFFFE and xxFFFF, stepping by 0x10000
Cn -= set(range(0xFFFE, 0x110000, 0x10000))
Cn -= set(range(0xFFFF, 0x110000, 0x10000))

# The cross-check against the RFC table is left disabled.
# NOTE(review): presumably because of the Plane 15 PUA discrepancy flagged
# above -- confirm before re-enabling.
# assert table == Cn

# The emitted predicate applies the same three rules as the Cn computation
# above instead of embedding the RFC table.
print """
def in_table_a1(code):
    if unicodedata.category(code) != 'Cn': return False
    c = ord(code)
    if 0xFDD0 <= c < 0xFDF0: return False
    return (c & 0xFFFF) not in (0xFFFE, 0xFFFF)
"""
|
---|
144 |
|
---|
# B.1 cannot easily be derived
# so its code points are embedded in the generated module as a literal set.
name, table = tables[0]
del tables[0]
assert name == "B.1"
# Only the keys matter here (the parser stored None as the value of B.1
# entries).  compact_set detects runs of consecutive values, so the code
# points are sorted first.
table = table.keys()
table.sort()
print """
b1_set = """ + compact_set(table) + """
def in_table_b1(code):
    return ord(code) in b1_set
"""
|
---|
156 |
|
---|
# B.2 and B.3 are the case-folding tables.
# They take CaseFolding.txt into account, which is
# not available in the Python database. Since
# B.2 is derived from B.3, we process B.3 first.
# B.3 supposedly *is* CaseFolding-3.2.0.txt.

# Both tables are pulled off the list here, in RFC order (B.2, then B.3).
name, table_b2 = tables[0]
del tables[0]
assert name == "B.2"

name, table_b3 = tables[0]
del tables[0]
assert name == "B.3"

# B.3 is mostly Python's .lower, except for a number
# of special cases, e.g. considering canonical forms.

# Maps a code point to its replacement string for every entry whose table
# value differs from what unicode.lower() produces.
b3_exceptions = {}

# NOTE(review): this loop reads table_b2 although the comment above talks
# about B.3; table_b3 is not read anywhere in the visible code -- confirm
# that this is intended.
for k,v in table_b2.items():
    if map(ord, unichr(k).lower()) != v:
        b3_exceptions[k] = u"".join(map(unichr,v))

# Sorted (code point, replacement) pairs, so the emitted dict is ordered.
b3 = b3_exceptions.items()
b3.sort()

# Emit the exception dict.  The trailing comma on the print statement keeps
# entries on the same output line; the bare print ends the line after every
# fourth entry.
print """
b3_exceptions = {"""
for i,(k,v) in enumerate(b3):
    print "0x%x:%s," % (k, repr(v)),
    if i % 4 == 3:
        print
print "}"

# Emit the B.3 mapping function: exception table first, .lower() otherwise.
print """
def map_table_b3(code):
    r = b3_exceptions.get(ord(code))
    if r is not None: return r
    return code.lower()
"""
|
---|
197 |
|
---|
def map_table_b3(code):
    """Return the B.3 mapping of the single character *code*.

    An entry in b3_exceptions (keyed by code point) takes precedence;
    without one, the result is code.lower().
    """
    folded = b3_exceptions.get(ord(code))
    if folded is None:
        folded = code.lower()
    return folded
|
---|
202 |
|
---|
203 | # B.2 is case folding for NFKC. This is the same as B.3,
|
---|
204 | # except where NormalizeWithKC(Fold(a)) !=
|
---|
205 | # NormalizeWithKC(Fold(NormalizeWithKC(Fold(a))))
|
---|
206 |
|
---|
def map_table_b2(a):
    """Return the B.2 mapping of the single character *a*.

    The character is folded with map_table_b3 and NFKC-normalized; that
    result is folded character by character and normalized once more.  If
    the second round changed anything, its result is returned; otherwise
    the plain B.3 folding of *a* is returned.
    """
    folded = map_table_b3(a)
    normalized = unicodedata.normalize("NFKC", folded)
    refolded = u"".join(map(map_table_b3, normalized))
    renormalized = unicodedata.normalize("NFKC", refolded)
    if normalized == renormalized:
        return folded
    return renormalized
|
---|
216 |
|
---|
# Self-check: run the local map_table_b2 over every B.2 entry and collect
# the code points for which it disagrees with the table from the RFC.
specials = {}
for k,v in table_b2.items():
    if map(ord, map_table_b2(unichr(k))) != v:
        specials[k] = v

# B.3 should not add any additional special cases
assert specials == {}

# Emit map_table_b2 into the generated module; the text is a copy of the
# function defined in this script.
print """
def map_table_b2(a):
    al = map_table_b3(a)
    b = unicodedata.normalize("NFKC", al)
    bl = u"".join([map_table_b3(ch) for ch in b])
    c = unicodedata.normalize("NFKC", bl)
    if b != c:
        return c
    else:
        return al
"""
|
---|
236 |
|
---|
# C.1.1 is a table with a single character
name, table = tables[0]
del tables[0]
assert name == "C.1.1"
# The only entry is U+0020, stored mapped to itself by the parser.
assert table == {0x20:0x20}

print """
def in_table_c11(code):
    return code == u" "
"""

# C.1.2 is the rest of all space characters
name, table = tables[0]
del tables[0]
assert name == "C.1.2"

# Disabled cross-check of the RFC table against category Zs minus U+0020.
# NOTE(review): the reason it is commented out is not visible here.
# table = set(table.keys())
# Zs = set(gen_category(["Zs"])) - set([0x20])
# assert Zs == table

# The emitted predicates use the Zs category directly; in_table_c11_c12 is
# the combined test for C.1.1 and C.1.2.
print """
def in_table_c12(code):
    return unicodedata.category(code) == "Zs" and code != u" "

def in_table_c11_c12(code):
    return unicodedata.category(code) == "Zs"
"""
|
---|
264 |
|
---|
# C.2.1 ASCII control characters
name, table_c21 = tables[0]
del tables[0]
assert name == "C.2.1"

# All Cc code points, and the subset below 128; the RFC table must equal
# that subset exactly.
Cc = set(gen_category(["Cc"]))
Cc_ascii = Cc & set(range(128))
table_c21 = set(table_c21.keys())
assert Cc_ascii == table_c21

print """
def in_table_c21(code):
    return ord(code) < 128 and unicodedata.category(code) == "Cc"
"""

# C.2.2 Non-ASCII control characters. It also includes
# a number of characters in category Cf.
name, table_c22 = tables[0]
del tables[0]
assert name == "C.2.2"

# Every non-ASCII Cc code point must be listed in the RFC table ...
Cc_nonascii = Cc - Cc_ascii
table_c22 = set(table_c22.keys())
assert len(Cc_nonascii - table_c22) == 0

# ... and whatever the table lists beyond those is emitted as an explicit
# set (sorted, because compact_set looks for consecutive runs).
specials = list(table_c22 - Cc_nonascii)
specials.sort()

# The doubled backslash below is written to the output as a single
# backslash, i.e. a line continuation in the generated source.
print """c22_specials = """ + compact_set(specials) + """
def in_table_c22(code):
    c = ord(code)
    if c < 128: return False
    if unicodedata.category(code) == "Cc": return True
    return c in c22_specials

def in_table_c21_c22(code):
    return unicodedata.category(code) == "Cc" or \\
           ord(code) in c22_specials
"""
|
---|
304 |
|
---|
# C.3 Private use
name, table = tables[0]
del tables[0]
assert name == "C.3"

# The RFC table must coincide with category Co of the local database.
Co = set(gen_category(["Co"]))
assert set(table.keys()) == Co

print """
def in_table_c3(code):
    return unicodedata.category(code) == "Co"
"""

# C.4 Non-character code points, xFFFE, xFFFF
# plus process internal codes
name, table = tables[0]
del tables[0]
assert name == "C.4"

# Expected content: FDD0..FDEF plus xxFFFE and xxFFFF stepping by 0x10000.
# The three range() results are concatenated with +, i.e. as lists.
nonchar = set(range(0xFDD0,0xFDF0) +
              range(0xFFFE,0x110000,0x10000) +
              range(0xFFFF,0x110000,0x10000))
table = set(table.keys())
assert table == nonchar

print """
def in_table_c4(code):
    c = ord(code)
    if c < 0xFDD0: return False
    if c < 0xFDF0: return True
    return (ord(code) & 0xFFFF) in (0xFFFE, 0xFFFF)
"""

# C.5 Surrogate codes
name, table = tables[0]
del tables[0]
assert name == "C.5"

# The RFC table must coincide with category Cs of the local database.
Cs = set(gen_category(["Cs"]))
assert set(table.keys()) == Cs

print """
def in_table_c5(code):
    return unicodedata.category(code) == "Cs"
"""
|
---|
350 |
|
---|
# Tables C.6 to C.9 are all handled the same way: the code points are
# taken from the RFC table, sorted (compact_set looks for consecutive
# runs) and embedded in the generated module as a literal set together
# with a membership test.

# C.6 Inappropriate for plain text
name, table = tables[0]
del tables[0]
assert name == "C.6"

table = table.keys()
table.sort()

print """
c6_set = """ + compact_set(table) + """
def in_table_c6(code):
    return ord(code) in c6_set
"""

# C.7 Inappropriate for canonical representation
name, table = tables[0]
del tables[0]
assert name == "C.7"

table = table.keys()
table.sort()

print """
c7_set = """ + compact_set(table) + """
def in_table_c7(code):
    return ord(code) in c7_set
"""

# C.8 Change display properties or are deprecated
name, table = tables[0]
del tables[0]
assert name == "C.8"

table = table.keys()
table.sort()

print """
c8_set = """ + compact_set(table) + """
def in_table_c8(code):
    return ord(code) in c8_set
"""

# C.9 Tagging characters
name, table = tables[0]
del tables[0]
assert name == "C.9"

table = table.keys()
table.sort()

print """
c9_set = """ + compact_set(table) + """
def in_table_c9(code):
    return ord(code) in c9_set
"""
|
---|
406 |
|
---|
# D.1 Characters with bidirectional property "R" or "AL"
name, table = tables[0]
del tables[0]
assert name == "D.1"

# The RFC table must coincide with what the local database reports, so the
# emitted predicate can query unicodedata instead of embedding the table.
RandAL = set(gen_bidirectional(["R","AL"]))
assert set(table.keys()) == RandAL

print """
def in_table_d1(code):
    return unicodedata.bidirectional(code) in ("R","AL")
"""

# D.2 Characters with bidirectional property "L"
name, table = tables[0]
del tables[0]
assert name == "D.2"

# Same cross-check as for D.1, for bidirectional class "L".
L = set(gen_bidirectional(["L"]))
assert set(table.keys()) == L

print """
def in_table_d2(code):
    return unicodedata.bidirectional(code) == "L"
"""
|
---|