| 1 | #!/usr/bin/env python
|
|---|
| 2 | """ Utility for parsing HTML entity definitions available from:
|
|---|
| 3 |
|
|---|
| 4 | http://www.w3.org/ as e.g.
|
|---|
| 5 | http://www.w3.org/TR/REC-html40/HTMLlat1.ent
|
|---|
| 6 |
|
|---|
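| 7 | Input is read from the file named by the first command-line argument
|
|---|
| 8 | (default: stdin) and output is written to the file named by the second
|
|---|
| 9 | argument (default: stdout) as a Python snippet defining a dictionary "entitydefs" that maps each literal entity name to a character or numeric entity.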
|
|---|
| 10 |
|
|---|
| 11 | Marc-Andre Lemburg, mal@lemburg.com, 1999.
|
|---|
| 12 | Use as you like. NO WARRANTIES.
|
|---|
| 13 |
|
|---|
| 14 | """
|
|---|
| 15 | import re,sys
|
|---|
| 16 | import TextTools
|
|---|
| 17 |
|
|---|
# Matches one SGML entity declaration of the form
#   <!ENTITY name CDATA "value" -- comment -->
# capturing the name, the quoted value and the (possibly multi-line) comment.
# A raw string is used so that the backslash sequences reach the regex engine
# unchanged.
entityRE = re.compile(r'<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->')


def parse(text,pos=0,endpos=None):
    """ Collect all entity declarations found in text[pos:endpos].

        text   -- string holding the entity definitions
        pos    -- index at which scanning starts (default: 0)
        endpos -- index at which scanning stops (default: len(text))

        Returns a dictionary mapping each entity name to a tuple
        (charcode, comment). If a name is declared more than once, the
        last declaration wins.

    """
    if endpos is None:
        endpos = len(text)
    d = {}
    # finditer() resumes after the end of each match, which is what the
    # declaration-by-declaration scan needs; pos is honoured as passed in.
    for m in entityRE.finditer(text,pos,endpos):
        name,charcode,comment = m.groups()
        d[name] = charcode,comment
    return d
|
|---|
| 34 |
|
|---|
def writefile(f,defs):
    """ Write defs to f as a Python dictionary literal named "entitydefs".

        f    -- object with a write() method accepting strings
        defs -- dictionary mapping entity name to (charcode, comment),
                as returned by parse()

        Entries are written sorted by entity name, one per line, with the
        comment appended as a Python comment. A charcode of the form
        '&#NNN;' with NNN < 256 is written as a one-character string
        literal using an octal escape; every other charcode is written
        as the repr() of its text.

        Raises ValueError if a charcode starts with '&#' but the text
        between '&#' and the final character is not a decimal integer.

    """
    f.write("entitydefs = {\n")
    # sorted() works on both list and view results of items().
    for name,(charcode,comment) in sorted(defs.items()):
        literal = repr(charcode)
        if charcode[:2] == '&#':
            # Strip the leading '&#' and the trailing ';'.
            code = int(charcode[2:-1])
            if code < 256:
                literal = r"'\%o'" % code
        # Fold newlines and runs of whitespace into single spaces so the
        # comment stays on one line of the generated source.
        comment = ' '.join(comment.split())
        f.write(" '%s':\t%s, \t# %s\n" % (name,literal,comment))
    f.write('\n}\n')
|
|---|
| 52 |
|
|---|
if __name__ == '__main__':
    # Optional arguments: argv[1] names the input file (default: stdin),
    # argv[2] names the output file (default: stdout).
    opened = []
    try:
        if len(sys.argv) > 1:
            infile = open(sys.argv[1])
            opened.append(infile)
        else:
            infile = sys.stdin
        if len(sys.argv) > 2:
            outfile = open(sys.argv[2],'w')
            opened.append(outfile)
        else:
            outfile = sys.stdout
        text = infile.read()
        defs = parse(text)
        writefile(outfile,defs)
    finally:
        # Close only the files opened here, so the output is flushed to
        # disk; stdin and stdout are left untouched.
        for stream in opened:
            stream.close()
|
|---|