| 1 | import re, unicodedata, sys
|
|---|
| 2 |
|
|---|
# The helpers below walk every code point up to 0x10FFFF, so refuse to run
# on an interpreter whose largest code point is only 0xFFFF.
if sys.maxunicode == 0xFFFF:
    raise RuntimeError("need UCS-4 Python")
|
|---|
| 5 |
|
|---|
def gen_category(cats):
    """Yield each code point whose Unicode general category is in *cats*.

    Every integer from 0 up to (but excluding) 0x110000 is examined in
    ascending order; the yielded values are the integers themselves, not
    the characters.
    """
    for code_point in range(0x110000):
        category = unicodedata.category(unichr(code_point))
        if category not in cats:
            continue
        yield code_point
|
|---|
| 10 |
|
|---|
def gen_bidirectional(cats):
    """Yield each code point whose bidirectional class is in *cats*.

    Every integer from 0 up to (but excluding) 0x110000 is examined in
    ascending order; the yielded values are the integers themselves, not
    the characters.
    """
    for code_point in range(0x110000):
        bidi_class = unicodedata.bidirectional(unichr(code_point))
        if bidi_class not in cats:
            continue
        yield code_point
|
|---|
| 15 |
|
|---|
def compact_set(l):
    """Return Python source text for a set containing the integers in *l*.

    Consecutive integers are collapsed into ``range(a,b)`` terms; the
    remaining values are emitted as a list literal.  Runs are detected only
    between adjacent elements, so *l* is expected to be in ascending order
    (the callers in this script pass sorted lists).

    An empty *l* yields ``"set([])"``.
    """
    singles = []
    ranges = []
    start = None    # first value of the run currently being collected
    span = 0        # number of values following `start` in that run
    for e in l:
        if start is None:
            start = e
            span = 0
            continue
        if start + span + 1 == e:
            # e extends the current run.
            span += 1
            continue
        # The run is over: flush it, then begin a new run at e.
        if span > 2:
            ranges.append((start, start + span + 1))
        else:
            singles.extend(range(start, start + span + 1))
        start = e
        span = 0

    if start is None:
        # Nothing was seen at all; without this guard None would be
        # appended below and the result would be "set([None])".
        return "set(%s)" % repr(singles)

    # NOTE: the trailing run becomes a range as soon as it holds two values,
    # whereas interior runs need at least four.  Kept as-is so that the
    # generated text does not change.
    if span:
        ranges.append((start, start + span + 1))
    else:
        singles.append(start)

    range_expr = " + ".join(["range(%d,%d)" % r for r in ranges])
    if not singles:
        return "set(%s)" % range_expr
    if not range_expr:
        return "set(%s)" % repr(singles)
    return "set(%s + %s)" % (repr(singles), range_expr)
|
|---|
| 46 |
|
|---|
############## Read the tables in the RFC #######################

# Close the file explicitly instead of leaving the handle to the garbage
# collector.  try/finally is used so this works on every Python 2 release.
rfc_file = open("rfc3454.txt")
try:
    data = rfc_file.readlines()
finally:
    rfc_file.close()
|
|---|
| 50 |
|
|---|
# `tables` collects (name, mapping) pairs in the order the tables appear in
# the RFC text.  Each mapping goes from a code point either to itself (plain
# and range entries), to a list of replacement code points, or to None when
# the replacement field is empty.
tables = []
curname = None
for line in data:
    line = line.strip()
    if not line:
        continue
    # Skip RFC page breaks
    if line.startswith("Hoffman & Blanchet") or \
       line.startswith("RFC 3454"):
        continue
    # Find start/end lines.  The dot separating the components of a table
    # name is escaped so that it only matches a literal "." (the unescaped
    # form accepted any character there).
    m = re.match(r"----- (Start|End) Table ([A-Z](\.[0-9])+) -----", line)
    if m:
        if m.group(1) == "Start":
            if curname:
                raise RuntimeError("Double Start", (curname, line))
            curname = m.group(2)
            table = {}
            tables.append((curname, table))
        else:
            if not curname:
                raise RuntimeError("End without start", line)
            curname = None
        continue
    if not curname:
        # Text outside of any table is ignored.
        continue
    # Now we are in a table
    fields = line.split(";")
    if len(fields) > 1:
        # Drop comment field
        fields = fields[:-1]
    if len(fields) == 1:
        # A single code point or a "start-end" range; every member maps to
        # itself.
        fields = fields[0].split("-")
        if len(fields) > 1:
            # range
            try:
                start, end = fields
            except ValueError:
                raise RuntimeError("Unpacking problem", line)
        else:
            start = end = fields[0]
        start = int(start, 16)
        end = int(end, 16)
        for i in range(start, end+1):
            table[i] = i
    else:
        # "code; replacement" entry.
        code, value = fields
        value = value.strip()
        if value:
            value = [int(v, 16) for v in value.split(" ")]
        else:
            # table B.1
            value = None
        table[int(code, 16)] = value
|
|---|
| 106 |
|
|---|
| 107 | ########### Generate compact Python versions of the tables #############
|
|---|
| 108 |
|
|---|
| 109 | print """# This file is generated by mkstringprep.py. DO NOT EDIT.
|
|---|
| 110 | \"\"\"Library that exposes various tables found in the StringPrep RFC 3454.
|
|---|
| 111 |
|
|---|
| 112 | There are two kinds of tables: sets, for which a member test is provided,
|
|---|
| 113 | and mappings, for which a mapping function is provided.
|
|---|
| 114 | \"\"\"
|
|---|
| 115 |
|
|---|
| 116 | import unicodedata
|
|---|
| 117 | """
|
|---|
| 118 |
|
|---|
| 119 | print "assert unicodedata.unidata_version == %s" % repr(unicodedata.unidata_version)
|
|---|
| 120 |
|
|---|
| 121 | # A.1 is the table of unassigned characters
|
|---|
| 122 | # XXX Plane 15 PUA is listed as unassigned in Python.
|
|---|
| 123 | name, table = tables[0]
|
|---|
| 124 | del tables[0]
|
|---|
| 125 | assert name == "A.1"
|
|---|
| 126 | table = set(table.keys())
|
|---|
| 127 | Cn = set(gen_category(["Cn"]))
|
|---|
| 128 |
|
|---|
| 129 | # FDD0..FDEF are process internal codes
|
|---|
| 130 | Cn -= set(range(0xFDD0, 0xFDF0))
|
|---|
| 131 | # not a character
|
|---|
| 132 | Cn -= set(range(0xFFFE, 0x110000, 0x10000))
|
|---|
| 133 | Cn -= set(range(0xFFFF, 0x110000, 0x10000))
|
|---|
| 134 |
|
|---|
| 135 | # assert table == Cn
|
|---|
| 136 |
|
|---|
| 137 | print """
|
|---|
| 138 | def in_table_a1(code):
|
|---|
| 139 | if unicodedata.category(code) != 'Cn': return False
|
|---|
| 140 | c = ord(code)
|
|---|
| 141 | if 0xFDD0 <= c < 0xFDF0: return False
|
|---|
| 142 | return (c & 0xFFFF) not in (0xFFFE, 0xFFFF)
|
|---|
| 143 | """
|
|---|
| 144 |
|
|---|
| 145 | # B.1 cannot easily be derived
|
|---|
| 146 | name, table = tables[0]
|
|---|
| 147 | del tables[0]
|
|---|
| 148 | assert name == "B.1"
|
|---|
| 149 | table = sorted(table.keys())
|
|---|
| 150 | print """
|
|---|
| 151 | b1_set = """ + compact_set(table) + """
|
|---|
| 152 | def in_table_b1(code):
|
|---|
| 153 | return ord(code) in b1_set
|
|---|
| 154 | """
|
|---|
| 155 |
|
|---|
| 156 | # B.2 and B.3 is case folding.
|
|---|
| 157 | # It takes CaseFolding.txt into account, which is
|
|---|
| 158 | # not available in the Python database. Since
|
|---|
| 159 | # B.2 is derived from B.3, we process B.3 first.
|
|---|
| 160 | # B.3 supposedly *is* CaseFolding-3.2.0.txt.
|
|---|
| 161 |
|
|---|
| 162 | name, table_b2 = tables[0]
|
|---|
| 163 | del tables[0]
|
|---|
| 164 | assert name == "B.2"
|
|---|
| 165 |
|
|---|
| 166 | name, table_b3 = tables[0]
|
|---|
| 167 | del tables[0]
|
|---|
| 168 | assert name == "B.3"
|
|---|
| 169 |
|
|---|
| 170 | # B.3 is mostly Python's .lower, except for a number
|
|---|
| 171 | # of special cases, e.g. considering canonical forms.
|
|---|
| 172 |
|
|---|
| 173 | b3_exceptions = {}
|
|---|
| 174 |
|
|---|
| 175 | for k,v in table_b2.items():
|
|---|
| 176 | if map(ord, unichr(k).lower()) != v:
|
|---|
| 177 | b3_exceptions[k] = u"".join(map(unichr,v))
|
|---|
| 178 |
|
|---|
| 179 | b3 = sorted(b3_exceptions.items())
|
|---|
| 180 |
|
|---|
| 181 | print """
|
|---|
| 182 | b3_exceptions = {"""
|
|---|
| 183 | for i,(k,v) in enumerate(b3):
|
|---|
| 184 | print "0x%x:%s," % (k, repr(v)),
|
|---|
| 185 | if i % 4 == 3:
|
|---|
| 186 | print
|
|---|
| 187 | print "}"
|
|---|
| 188 |
|
|---|
| 189 | print """
|
|---|
| 190 | def map_table_b3(code):
|
|---|
| 191 | r = b3_exceptions.get(ord(code))
|
|---|
| 192 | if r is not None: return r
|
|---|
| 193 | return code.lower()
|
|---|
| 194 | """
|
|---|
| 195 |
|
|---|
def map_table_b3(code):
    """Return the mapping of the character *code*.

    The entry stored in the module-level ``b3_exceptions`` dictionary under
    ``ord(code)`` is returned when there is one (and it is not None);
    otherwise the result is ``code.lower()``.
    """
    override = b3_exceptions.get(ord(code))
    return code.lower() if override is None else override
|
|---|
| 200 |
|
|---|
# B.2 is case folding for NFKC. This is the same as B.3,
# except where NormalizeWithKC(Fold(a)) !=
# NormalizeWithKC(Fold(NormalizeWithKC(Fold(a))))

def map_table_b2(a):
    """Return the mapping of the character *a* for use with NFKC.

    *a* is folded with map_table_b3 and NFKC-normalized, then the result is
    folded and normalized once more.  If the second round changes the
    normalized string, that second result is returned; otherwise the plain
    map_table_b3 folding of *a* is returned.
    """
    folded = map_table_b3(a)
    normalized = unicodedata.normalize("NFKC", folded)
    refolded = u"".join([map_table_b3(ch) for ch in normalized])
    renormalized = unicodedata.normalize("NFKC", refolded)
    if normalized == renormalized:
        return folded
    return renormalized
|
|---|
| 214 |
|
|---|
| 215 | specials = {}
|
|---|
| 216 | for k,v in table_b2.items():
|
|---|
| 217 | if map(ord, map_table_b2(unichr(k))) != v:
|
|---|
| 218 | specials[k] = v
|
|---|
| 219 |
|
|---|
| 220 | # B.3 should not add any additional special cases
|
|---|
| 221 | assert specials == {}
|
|---|
| 222 |
|
|---|
| 223 | print """
|
|---|
| 224 | def map_table_b2(a):
|
|---|
| 225 | al = map_table_b3(a)
|
|---|
| 226 | b = unicodedata.normalize("NFKC", al)
|
|---|
| 227 | bl = u"".join([map_table_b3(ch) for ch in b])
|
|---|
| 228 | c = unicodedata.normalize("NFKC", bl)
|
|---|
| 229 | if b != c:
|
|---|
| 230 | return c
|
|---|
| 231 | else:
|
|---|
| 232 | return al
|
|---|
| 233 | """
|
|---|
| 234 |
|
|---|
| 235 | # C.1.1 is a table with a single character
|
|---|
| 236 | name, table = tables[0]
|
|---|
| 237 | del tables[0]
|
|---|
| 238 | assert name == "C.1.1"
|
|---|
| 239 | assert table == {0x20:0x20}
|
|---|
| 240 |
|
|---|
| 241 | print """
|
|---|
| 242 | def in_table_c11(code):
|
|---|
| 243 | return code == u" "
|
|---|
| 244 | """
|
|---|
| 245 |
|
|---|
| 246 | # C.1.2 is the rest of all space characters
|
|---|
| 247 | name, table = tables[0]
|
|---|
| 248 | del tables[0]
|
|---|
| 249 | assert name == "C.1.2"
|
|---|
| 250 |
|
|---|
| 251 | # table = set(table.keys())
|
|---|
| 252 | # Zs = set(gen_category(["Zs"])) - set([0x20])
|
|---|
| 253 | # assert Zs == table
|
|---|
| 254 |
|
|---|
| 255 | print """
|
|---|
| 256 | def in_table_c12(code):
|
|---|
| 257 | return unicodedata.category(code) == "Zs" and code != u" "
|
|---|
| 258 |
|
|---|
| 259 | def in_table_c11_c12(code):
|
|---|
| 260 | return unicodedata.category(code) == "Zs"
|
|---|
| 261 | """
|
|---|
| 262 |
|
|---|
| 263 | # C.2.1 ASCII control characters
|
|---|
| 264 | name, table_c21 = tables[0]
|
|---|
| 265 | del tables[0]
|
|---|
| 266 | assert name == "C.2.1"
|
|---|
| 267 |
|
|---|
| 268 | Cc = set(gen_category(["Cc"]))
|
|---|
| 269 | Cc_ascii = Cc & set(range(128))
|
|---|
| 270 | table_c21 = set(table_c21.keys())
|
|---|
| 271 | assert Cc_ascii == table_c21
|
|---|
| 272 |
|
|---|
| 273 | print """
|
|---|
| 274 | def in_table_c21(code):
|
|---|
| 275 | return ord(code) < 128 and unicodedata.category(code) == "Cc"
|
|---|
| 276 | """
|
|---|
| 277 |
|
|---|
| 278 | # C.2.2 Non-ASCII control characters. It also includes
|
|---|
| 279 | # a number of characters in category Cf.
|
|---|
| 280 | name, table_c22 = tables[0]
|
|---|
| 281 | del tables[0]
|
|---|
| 282 | assert name == "C.2.2"
|
|---|
| 283 |
|
|---|
| 284 | Cc_nonascii = Cc - Cc_ascii
|
|---|
| 285 | table_c22 = set(table_c22.keys())
|
|---|
| 286 | assert len(Cc_nonascii - table_c22) == 0
|
|---|
| 287 |
|
|---|
| 288 | specials = list(table_c22 - Cc_nonascii)
|
|---|
| 289 | specials.sort()
|
|---|
| 290 |
|
|---|
| 291 | print """c22_specials = """ + compact_set(specials) + """
|
|---|
| 292 | def in_table_c22(code):
|
|---|
| 293 | c = ord(code)
|
|---|
| 294 | if c < 128: return False
|
|---|
| 295 | if unicodedata.category(code) == "Cc": return True
|
|---|
| 296 | return c in c22_specials
|
|---|
| 297 |
|
|---|
| 298 | def in_table_c21_c22(code):
|
|---|
| 299 | return unicodedata.category(code) == "Cc" or \\
|
|---|
| 300 | ord(code) in c22_specials
|
|---|
| 301 | """
|
|---|
| 302 |
|
|---|
| 303 | # C.3 Private use
|
|---|
| 304 | name, table = tables[0]
|
|---|
| 305 | del tables[0]
|
|---|
| 306 | assert name == "C.3"
|
|---|
| 307 |
|
|---|
| 308 | Co = set(gen_category(["Co"]))
|
|---|
| 309 | assert set(table.keys()) == Co
|
|---|
| 310 |
|
|---|
| 311 | print """
|
|---|
| 312 | def in_table_c3(code):
|
|---|
| 313 | return unicodedata.category(code) == "Co"
|
|---|
| 314 | """
|
|---|
| 315 |
|
|---|
| 316 | # C.4 Non-character code points, xFFFE, xFFFF
|
|---|
| 317 | # plus process internal codes
|
|---|
| 318 | name, table = tables[0]
|
|---|
| 319 | del tables[0]
|
|---|
| 320 | assert name == "C.4"
|
|---|
| 321 |
|
|---|
| 322 | nonchar = set(range(0xFDD0,0xFDF0) +
|
|---|
| 323 | range(0xFFFE,0x110000,0x10000) +
|
|---|
| 324 | range(0xFFFF,0x110000,0x10000))
|
|---|
| 325 | table = set(table.keys())
|
|---|
| 326 | assert table == nonchar
|
|---|
| 327 |
|
|---|
| 328 | print """
|
|---|
| 329 | def in_table_c4(code):
|
|---|
| 330 | c = ord(code)
|
|---|
| 331 | if c < 0xFDD0: return False
|
|---|
| 332 | if c < 0xFDF0: return True
|
|---|
| 333 | return (ord(code) & 0xFFFF) in (0xFFFE, 0xFFFF)
|
|---|
| 334 | """
|
|---|
| 335 |
|
|---|
| 336 | # C.5 Surrogate codes
|
|---|
| 337 | name, table = tables[0]
|
|---|
| 338 | del tables[0]
|
|---|
| 339 | assert name == "C.5"
|
|---|
| 340 |
|
|---|
| 341 | Cs = set(gen_category(["Cs"]))
|
|---|
| 342 | assert set(table.keys()) == Cs
|
|---|
| 343 |
|
|---|
| 344 | print """
|
|---|
| 345 | def in_table_c5(code):
|
|---|
| 346 | return unicodedata.category(code) == "Cs"
|
|---|
| 347 | """
|
|---|
| 348 |
|
|---|
| 349 | # C.6 Inappropriate for plain text
|
|---|
| 350 | name, table = tables[0]
|
|---|
| 351 | del tables[0]
|
|---|
| 352 | assert name == "C.6"
|
|---|
| 353 |
|
|---|
| 354 | table = sorted(table.keys())
|
|---|
| 355 |
|
|---|
| 356 | print """
|
|---|
| 357 | c6_set = """ + compact_set(table) + """
|
|---|
| 358 | def in_table_c6(code):
|
|---|
| 359 | return ord(code) in c6_set
|
|---|
| 360 | """
|
|---|
| 361 |
|
|---|
| 362 | # C.7 Inappropriate for canonical representation
|
|---|
| 363 | name, table = tables[0]
|
|---|
| 364 | del tables[0]
|
|---|
| 365 | assert name == "C.7"
|
|---|
| 366 |
|
|---|
| 367 | table = sorted(table.keys())
|
|---|
| 368 |
|
|---|
| 369 | print """
|
|---|
| 370 | c7_set = """ + compact_set(table) + """
|
|---|
| 371 | def in_table_c7(code):
|
|---|
| 372 | return ord(code) in c7_set
|
|---|
| 373 | """
|
|---|
| 374 |
|
|---|
| 375 | # C.8 Change display properties or are deprecated
|
|---|
| 376 | name, table = tables[0]
|
|---|
| 377 | del tables[0]
|
|---|
| 378 | assert name == "C.8"
|
|---|
| 379 |
|
|---|
| 380 | table = sorted(table.keys())
|
|---|
| 381 |
|
|---|
| 382 | print """
|
|---|
| 383 | c8_set = """ + compact_set(table) + """
|
|---|
| 384 | def in_table_c8(code):
|
|---|
| 385 | return ord(code) in c8_set
|
|---|
| 386 | """
|
|---|
| 387 |
|
|---|
| 388 | # C.9 Tagging characters
|
|---|
| 389 | name, table = tables[0]
|
|---|
| 390 | del tables[0]
|
|---|
| 391 | assert name == "C.9"
|
|---|
| 392 |
|
|---|
| 393 | table = sorted(table.keys())
|
|---|
| 394 |
|
|---|
| 395 | print """
|
|---|
| 396 | c9_set = """ + compact_set(table) + """
|
|---|
| 397 | def in_table_c9(code):
|
|---|
| 398 | return ord(code) in c9_set
|
|---|
| 399 | """
|
|---|
| 400 |
|
|---|
| 401 | # D.1 Characters with bidirectional property "R" or "AL"
|
|---|
| 402 | name, table = tables[0]
|
|---|
| 403 | del tables[0]
|
|---|
| 404 | assert name == "D.1"
|
|---|
| 405 |
|
|---|
| 406 | RandAL = set(gen_bidirectional(["R","AL"]))
|
|---|
| 407 | assert set(table.keys()) == RandAL
|
|---|
| 408 |
|
|---|
| 409 | print """
|
|---|
| 410 | def in_table_d1(code):
|
|---|
| 411 | return unicodedata.bidirectional(code) in ("R","AL")
|
|---|
| 412 | """
|
|---|
| 413 |
|
|---|
| 414 | # D.2 Characters with bidirectional property "L"
|
|---|
| 415 | name, table = tables[0]
|
|---|
| 416 | del tables[0]
|
|---|
| 417 | assert name == "D.2"
|
|---|
| 418 |
|
|---|
| 419 | L = set(gen_bidirectional(["L"]))
|
|---|
| 420 | assert set(table.keys()) == L
|
|---|
| 421 |
|
|---|
| 422 | print """
|
|---|
| 423 | def in_table_d2(code):
|
|---|
| 424 | return unicodedata.bidirectional(code) == "L"
|
|---|
| 425 | """
|
|---|