1 | #!/usr/bin/env python
|
---|
""" Utility for parsing HTML entity definitions available from
    http://www.w3.org/, e.g.

    http://www.w3.org/TR/REC-html40/HTMLlat1.ent

    Input is read from stdin (or from the file named by the first
    command line argument), output is written to stdout (or to the
    file named by the second command line argument) in the form of a
    Python snippet defining a dictionary "entitydefs" mapping literal
    entity name to character or numeric entity.

    Marc-Andre Lemburg, mal@lemburg.com, 1999.
    Use as you like. NO WARRANTIES.

"""
|
---|
15 | import re,sys
|
---|
16 | import TextTools
|
---|
17 |
|
---|
# Matches one declaration of the form
#   <!ENTITY name CDATA "value" -- comment -->
# capturing the name, the quoted value and the comment text.  The comment
# group is non-greedy and may span several lines.
entityRE = re.compile(
    r'<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->')

def parse(text,pos=0,endpos=None):

    """ Collect all entity declarations found in text[pos:endpos].

        text is searched with entityRE starting at offset pos and
        stopping at offset endpos (defaults to len(text)).

        Returns a dictionary mapping each entity name to a tuple
        (charcode, comment), both taken verbatim from the declaration.
        If a name is declared more than once, the last declaration
        wins.

    """
    if endpos is None:
        endpos = len(text)
    d = {}
    # NOTE: the start offset passed by the caller is honoured; it must
    # not be reset here, otherwise the pos parameter is meaningless.
    for m in entityRE.finditer(text,pos,endpos):
        name,charcode,comment = m.groups()
        d[name] = charcode,comment
    return d
|
---|
34 |
|
---|
def writefile(f,defs):

    """ Write defs to f as a Python dictionary literal "entitydefs".

        f is any object with a write() method.  defs maps entity names
        to (charcode, comment) tuples, as returned by parse().

        Entries are written sorted by name, one per line.  A charcode
        of the form "&#NNN;" with NNN < 256 is written as a one
        character string literal using an octal escape; every other
        charcode is written as the repr() of the original string.  The
        comment is appended as a trailing "#" comment.

        Raises ValueError if a charcode starts with "&#" but the text
        between "&#" and the final character is not a decimal number.

    """
    f.write("entitydefs = {\n")
    # sorted() instead of items().sort(): dict.items() is not a list
    # on Python 3 and has no sort() method there.
    for name,(charcode,comment) in sorted(defs.items()):
        literal = repr(charcode)
        if charcode[:2] == '&#':
            # Strip the leading "&#" and the trailing character
            # (expected to be ";") to get the decimal code.
            code = int(charcode[2:-1])
            if code < 256:
                # Emits e.g. '\240' -- backslash followed by octal digits.
                literal = "'\\%o'" % code
        # NOTE(review): TextTools.collapse presumably folds the (possibly
        # multi-line) comment onto a single line -- confirm against the
        # TextTools documentation.
        comment = TextTools.collapse(comment)
        f.write(" '%s':\t%s, \t# %s\n" % (name,literal,comment))
    f.write('\n}\n')
|
---|
52 |
|
---|
if __name__ == '__main__':
    # Usage: script [infile [outfile]]
    # Without arguments the script reads stdin and writes stdout.
    infile = sys.stdin
    outfile = sys.stdout
    try:
        if len(sys.argv) > 1:
            infile = open(sys.argv[1])
        if len(sys.argv) > 2:
            outfile = open(sys.argv[2],'w')
        text = infile.read()
        defs = parse(text)
        writefile(outfile,defs)
    finally:
        # Close only the files opened here; leave the standard streams
        # alone.
        if infile is not sys.stdin:
            infile.close()
        if outfile is not sys.stdout:
            outfile.close()
|
---|