source: vendor/python/2.5/Doc/lib/libsgmllib.tex

Last change on this file was 3225, checked in by bird, 18 years ago

Python 2.5

File size: 10.8 KB
Line 
1\section{\module{sgmllib} ---
2 Simple SGML parser}
3
4\declaremodule{standard}{sgmllib}
5\modulesynopsis{Only as much of an SGML parser as needed to parse HTML.}
6
7\index{SGML}
8
9This module defines a class \class{SGMLParser} which serves as the
10basis for parsing text files formatted in SGML (Standard Generalized
11Mark-up Language). In fact, it does not provide a full SGML parser
12--- it only parses SGML insofar as it is used by HTML, and the module
13only exists as a base for the \refmodule{htmllib} module. Another
14HTML parser which supports XHTML and offers a somewhat different
15interface is available in the \refmodule{HTMLParser} module.
16
17\begin{classdesc}{SGMLParser}{}
18The \class{SGMLParser} class is instantiated without arguments.
19The parser is hardcoded to recognize the following
20constructs:
21
22\begin{itemize}
23\item
24Opening and closing tags of the form
25\samp{<\var{tag} \var{attr}="\var{value}" ...>} and
26\samp{</\var{tag}>}, respectively.
27
28\item
29Numeric character references of the form \samp{\&\#\var{name};}.
30
31\item
32Entity references of the form \samp{\&\var{name};}.
33
34\item
35SGML comments of the form \samp{<!--\var{text}-->}. Note that
36spaces, tabs, and newlines are allowed between the trailing
37\samp{>} and the immediately preceding \samp{--}.
38
39\end{itemize}
40\end{classdesc}
41
42A single exception is defined as well:
43
44\begin{excdesc}{SGMLParseError}
45Exception raised by the \class{SGMLParser} class when it encounters an
46error while parsing.
47\versionadded{2.1}
48\end{excdesc}
49
50
51\class{SGMLParser} instances have the following methods:
52
53
54\begin{methoddesc}{reset}{}
55Reset the instance. Loses all unprocessed data. This is called
56implicitly at instantiation time.
57\end{methoddesc}
58
59\begin{methoddesc}{setnomoretags}{}
60Stop processing tags. Treat all following input as literal input
61(CDATA). (This is only provided so the HTML tag
62\code{<PLAINTEXT>} can be implemented.)
63\end{methoddesc}
64
65\begin{methoddesc}{setliteral}{}
66Enter literal mode (CDATA mode).
67\end{methoddesc}
68
69\begin{methoddesc}{feed}{data}
70Feed some text to the parser. It is processed insofar as it consists
71of complete elements; incomplete data is buffered until more data is
72fed or \method{close()} is called.
73\end{methoddesc}
74
75\begin{methoddesc}{close}{}
76Force processing of all buffered data as if it were followed by an
77end-of-file mark. This method may be redefined by a derived class to
78define additional processing at the end of the input, but the
79redefined version should always call the base class's \method{close()}.
80\end{methoddesc}
81
82\begin{methoddesc}{get_starttag_text}{}
83Return the text of the most recently opened start tag. This should
84not normally be needed for structured processing, but may be useful in
85dealing with HTML ``as deployed'' or for re-generating input with
86minimal changes (whitespace between attributes can be preserved,
87etc.).
88\end{methoddesc}
89
90\begin{methoddesc}{handle_starttag}{tag, method, attributes}
91This method is called to handle start tags for which either a
92\method{start_\var{tag}()} or \method{do_\var{tag}()} method has been
93defined. The \var{tag} argument is the name of the tag converted to
94lower case, and the \var{method} argument is the bound method which
95should be used to support semantic interpretation of the start tag.
96The \var{attributes} argument is a list of \code{(\var{name},
97\var{value})} pairs containing the attributes found inside the tag's
98\code{<>} brackets.
99
100The \var{name} has been translated to lower case.
101Double quotes and backslashes in the \var{value} have been interpreted,
102as well as known character references and known entity references
103terminated by a semicolon (normally, entity references can be terminated
104by any non-alphanumerical character, but this would break the very
105common case of \code{<A HREF="url?spam=1\&eggs=2">} when \code{eggs}
106is a valid entity name).
107
108For instance, for the tag \code{<A HREF="http://www.cwi.nl/">}, this
109method would be called as \samp{handle_starttag('a', \var{method}, [('href',
110'http://www.cwi.nl/')])}. The base implementation simply calls
111\var{method} with \var{attributes} as the only argument.
112\versionadded[Handling of entity and character references within
113 attribute values]{2.5}
114\end{methoddesc}
115
116\begin{methoddesc}{handle_endtag}{tag, method}
117This method is called to handle endtags for which an
118\method{end_\var{tag}()} method has been defined. The
119\var{tag} argument is the name of the tag converted to lower case, and
120the \var{method} argument is the bound method which should be used to
121support semantic interpretation of the end tag. If no
122\method{end_\var{tag}()} method is defined for the closing element,
123this handler is not called. The base implementation simply calls
124\var{method}.
125\end{methoddesc}
126
127\begin{methoddesc}{handle_data}{data}
128This method is called to process arbitrary data. It is intended to be
129overridden by a derived class; the base class implementation does
130nothing.
131\end{methoddesc}
132
133\begin{methoddesc}{handle_charref}{ref}
134This method is called to process a character reference of the form
135\samp{\&\#\var{ref};}. The base implementation uses
136\method{convert_charref()} to convert the reference to a string. If
137that method returns a string, it is passed to \method{handle_data()},
138otherwise \method{unknown_charref(\var{ref})} is called to handle the
139error.
140\versionchanged[Use \method{convert_charref()} instead of hard-coding
141the conversion]{2.5}
142\end{methoddesc}
143
144\begin{methoddesc}{convert_charref}{ref}
145Convert a character reference to a string, or \code{None}. \var{ref}
146is the reference passed in as a string. In the base implementation,
147\var{ref} must be a decimal number in the range 0--255. It converts
148the code point found using the \method{convert_codepoint()} method.
149If \var{ref} is invalid or out of range, this method returns
150\code{None}. This method is called by the default
151\method{handle_charref()} implementation and by the attribute value
152parser.
153\versionadded{2.5}
154\end{methoddesc}
155
156\begin{methoddesc}{convert_codepoint}{codepoint}
157Convert a codepoint to a \class{str} value. Encodings can be handled
158here if appropriate, though the rest of \module{sgmllib} is oblivious
159to this matter.
160\versionadded{2.5}
161\end{methoddesc}
162
163\begin{methoddesc}{handle_entityref}{ref}
164This method is called to process a general entity reference of the
165form \samp{\&\var{ref};} where \var{ref} is a general entity
166reference. It converts \var{ref} by passing it to
167\method{convert_entityref()}. If a translation is returned, it
168calls the method \method{handle_data()} with the translation;
169otherwise, it calls the method \code{unknown_entityref(\var{ref})}.
170The default \member{entitydefs} defines translations for
171\code{\&amp;}, \code{\&apos;}, \code{\&gt;}, \code{\&lt;}, and
172\code{\&quot;}.
173\versionchanged[Use \method{convert_entityref()} instead of hard-coding
174the conversion]{2.5}
175\end{methoddesc}
176
177\begin{methoddesc}{convert_entityref}{ref}
178Convert a named entity reference to a \class{str} value, or
179\code{None}. The resulting value will not be parsed. \var{ref} will
180be only the name of the entity. The default implementation looks for
181\var{ref} in the instance (or class) variable \member{entitydefs}
182which should be a mapping from entity names to corresponding
183translations. If no translation is available for \var{ref}, this
184method returns \code{None}. This method is called by the default
185\method{handle_entityref()} implementation and by the attribute value
186parser.
187\versionadded{2.5}
188\end{methoddesc}
189
190\begin{methoddesc}{handle_comment}{comment}
191This method is called when a comment is encountered. The
192\var{comment} argument is a string containing the text between the
193\samp{<!--} and \samp{-->} delimiters, but not the delimiters
194themselves. For example, the comment \samp{<!--text-->} will
195cause this method to be called with the argument \code{'text'}. The
196default method does nothing.
197\end{methoddesc}
198
199\begin{methoddesc}{handle_decl}{data}
200Method called when an SGML declaration is read by the parser. In
201practice, the \code{DOCTYPE} declaration is the only thing observed in
202HTML, but the parser does not discriminate among different (or broken)
203declarations. Internal subsets in a \code{DOCTYPE} declaration are
204not supported. The \var{data} parameter will be the entire contents
205of the declaration inside the \code{<!}...\code{>} markup. The
206default implementation does nothing.
207\end{methoddesc}
208
209\begin{methoddesc}{report_unbalanced}{tag}
210This method is called when an end tag is found which does not
211correspond to any open element.
212\end{methoddesc}
213
214\begin{methoddesc}{unknown_starttag}{tag, attributes}
215This method is called to process an unknown start tag. It is intended
216to be overridden by a derived class; the base class implementation
217does nothing.
218\end{methoddesc}
219
220\begin{methoddesc}{unknown_endtag}{tag}
221This method is called to process an unknown end tag. It is intended
222to be overridden by a derived class; the base class implementation
223does nothing.
224\end{methoddesc}
225
226\begin{methoddesc}{unknown_charref}{ref}
227This method is called to process unresolvable numeric character
228references. Refer to \method{handle_charref()} to determine what is
229handled by default. It is intended to be overridden by a derived
230class; the base class implementation does nothing.
231\end{methoddesc}
232
233\begin{methoddesc}{unknown_entityref}{ref}
234This method is called to process an unknown entity reference. It is
235intended to be overridden by a derived class; the base class
236implementation does nothing.
237\end{methoddesc}
238
239Apart from overriding or extending the methods listed above, derived
240classes may also define methods of the following form to define
241processing of specific tags. Tag names in the input stream are case
242independent; the \var{tag} occurring in method names must be in lower
243case:
244
245\begin{methoddescni}{start_\var{tag}}{attributes}
246This method is called to process an opening tag \var{tag}. It has
247precedence over \method{do_\var{tag}()}. The
248\var{attributes} argument has the same meaning as described for
249\method{handle_starttag()} above.
250\end{methoddescni}
251
252\begin{methoddescni}{do_\var{tag}}{attributes}
253This method is called to process an opening tag \var{tag}
254for which no \method{start_\var{tag}} method is defined.
255The \var{attributes} argument
256has the same meaning as described for \method{handle_starttag()} above.
257\end{methoddescni}
258
259\begin{methoddescni}{end_\var{tag}}{}
260This method is called to process a closing tag \var{tag}.
261\end{methoddescni}
262
263Note that the parser maintains a stack of open elements for which no
264end tag has been found yet. Only tags processed by
265\method{start_\var{tag}()} are pushed on this stack. Definition of an
266\method{end_\var{tag}()} method is optional for these tags. For tags
267processed by \method{do_\var{tag}()} or by \method{unknown_starttag()}, no
268\method{end_\var{tag}()} method need be defined; if defined, it will
269not be used. If both \method{start_\var{tag}()} and
270\method{do_\var{tag}()} methods exist for a tag, the
271\method{start_\var{tag}()} method takes precedence.
Note: See TracBrowser for help on using the repository browser.