source: vendor/python/2.5/Doc/lib/libshlex.tex

Last change on this file was 3225, checked in by bird, 18 years ago

Python 2.5

File size: 11.0 KB
Line 
1\section{\module{shlex} ---
2 Simple lexical analysis}
3
4\declaremodule{standard}{shlex}
5\modulesynopsis{Simple lexical analysis for \UNIX\ shell-like languages.}
6\moduleauthor{Eric S. Raymond}{esr@snark.thyrsus.com}
7\moduleauthor{Gustavo Niemeyer}{niemeyer@conectiva.com}
8\sectionauthor{Eric S. Raymond}{esr@snark.thyrsus.com}
9\sectionauthor{Gustavo Niemeyer}{niemeyer@conectiva.com}
10
11\versionadded{1.5.2}
12
13The \class{shlex} class makes it easy to write lexical analyzers for
14simple syntaxes resembling that of the \UNIX{} shell. This will often
15be useful for writing minilanguages (for example, in run control
16files for Python applications), or for parsing quoted strings.
17
18\note{The \module{shlex} module currently does not support Unicode input.}
19
20The \module{shlex} module defines the following functions:
21
22\begin{funcdesc}{split}{s\optional{, comments}}
23Split the string \var{s} using shell-like syntax. If \var{comments} is
24\constant{False} (the default), the parsing of comments in the given
25string will be disabled (setting the \member{commenters} member of the
26\class{shlex} instance to the empty string). This function operates
27in \POSIX{} mode.
28\versionadded{2.3}
29\end{funcdesc}
30
31The \module{shlex} module defines the following class:
32
33\begin{classdesc}{shlex}{\optional{instream\optional{,
34 infile\optional{, posix}}}}
35A \class{shlex} instance or subclass instance is a lexical analyzer
36object. The initialization argument, if present, specifies where to
37read characters from. It must be a file-/stream-like object with
38\method{read()} and \method{readline()} methods, or a string (strings
39are accepted since Python 2.3). If no argument is given, input will be
40taken from \code{sys.stdin}. The second optional argument is a filename
41string, which sets the initial value of the \member{infile} member. If
42the \var{instream} argument is omitted or equal to \code{sys.stdin},
43this second argument defaults to ``stdin''. The \var{posix} argument
44was introduced in Python 2.3, and defines the operational mode. When
45\var{posix} is not true (default), the \class{shlex} instance will
46operate in compatibility mode. When operating in \POSIX{} mode,
47\class{shlex} will try to be as close as possible to the \POSIX{} shell
48parsing rules. See section~\ref{shlex-parsing-rules}.
49\end{classdesc}
50
51\begin{seealso}
52 \seemodule{ConfigParser}{Parser for configuration files similar to the
53 Windows \file{.ini} files.}
54\end{seealso}
55
56
57\subsection{shlex Objects \label{shlex-objects}}
58
59A \class{shlex} instance has the following methods:
60
61\begin{methoddesc}{get_token}{}
62Return a token. If tokens have been stacked using
63\method{push_token()}, pop a token off the stack. Otherwise, read one
64from the input stream. If reading encounters an immediate
65end-of-file, \member{self.eof} is returned (the empty string (\code{''})
66in non-\POSIX{} mode, and \code{None} in \POSIX{} mode).
67\end{methoddesc}
68
69\begin{methoddesc}{push_token}{str}
70Push the argument onto the token stack.
71\end{methoddesc}
72
73\begin{methoddesc}{read_token}{}
74Read a raw token. Ignore the pushback stack, and do not interpret source
75requests. (This is not ordinarily a useful entry point, and is
76documented here only for the sake of completeness.)
77\end{methoddesc}
78
79\begin{methoddesc}{sourcehook}{filename}
80When \class{shlex} detects a source request (see
81\member{source} below), this method is given the following token as
82argument, and is expected to return a tuple consisting of a filename and
83an open file-like object.
84
85Normally, this method first strips any quotes off the argument. If
86the result is an absolute pathname, or there was no previous source
87request in effect, or the previous source was a stream
88(such as \code{sys.stdin}), the result is left alone. Otherwise, if the
89result is a relative pathname, the directory part of the name of the
90file immediately before it on the source inclusion stack is prepended
91(this behavior is like the way the C preprocessor handles
92\code{\#include "file.h"}).
93
94The result of the manipulations is treated as a filename, and returned
95as the first component of the tuple, with
96\function{open()} called on it to yield the second component. (Note:
97this is the reverse of the order of arguments in instance initialization!)
98
99This hook is exposed so that you can use it to implement directory
100search paths, addition of file extensions, and other namespace hacks.
101There is no corresponding `close' hook, but a shlex instance will call
102the \method{close()} method of the sourced input stream when it
103returns \EOF.
104
105For more explicit control of source stacking, use the
106\method{push_source()} and \method{pop_source()} methods.
107\end{methoddesc}
108
109\begin{methoddesc}{push_source}{stream\optional{, filename}}
110Push an input source stream onto the input stack. If the filename
111argument is specified it will later be available for use in error
112messages. This is the same method used internally by the
113\method{sourcehook} method.
114\versionadded{2.1}
115\end{methoddesc}
116
117\begin{methoddesc}{pop_source}{}
118Pop the last-pushed input source from the input stack.
119This is the same method used internally when the lexer reaches
120\EOF{} on a stacked input stream.
121\versionadded{2.1}
122\end{methoddesc}
123
124\begin{methoddesc}{error_leader}{\optional{file\optional{, line}}}
125This method generates an error message leader in the format of a
126\UNIX{} C compiler error label; the format is \code{'"\%s", line \%d: '},
127where the \samp{\%s} is replaced with the name of the current source
128file and the \samp{\%d} with the current input line number (the
129optional arguments can be used to override these).
130
131This convenience is provided to encourage \module{shlex} users to
132generate error messages in the standard, parseable format understood
133by Emacs and other \UNIX{} tools.
134\end{methoddesc}
135
136Instances of \class{shlex} subclasses have some public instance
137variables which either control lexical analysis or can be used for
138debugging:
139
140\begin{memberdesc}{commenters}
141The string of characters that are recognized as comment beginners.
142All characters from the comment beginner to end of line are ignored.
143Includes just \character{\#} by default.
144\end{memberdesc}
145
146\begin{memberdesc}{wordchars}
147The string of characters that will accumulate into multi-character
148tokens. By default, includes all \ASCII{} alphanumerics and
149underscore.
150\end{memberdesc}
151
152\begin{memberdesc}{whitespace}
153Characters that will be considered whitespace and skipped. Whitespace
154bounds tokens. By default, includes space, tab, linefeed and
155carriage-return.
156\end{memberdesc}
157
158\begin{memberdesc}{escape}
159Characters that will be considered escape characters. This is only used
160in \POSIX{} mode, and includes just \character{\textbackslash} by default.
161\versionadded{2.3}
162\end{memberdesc}
163
164\begin{memberdesc}{quotes}
165Characters that will be considered string quotes. The token
166accumulates until the same quote is encountered again (thus, different
167quote types protect each other as in the shell). By default, includes
168\ASCII{} single and double quotes.
169\end{memberdesc}
170
171\begin{memberdesc}{escapedquotes}
172Characters in \member{quotes} that will interpret escape characters
173defined in \member{escape}. This is only used in \POSIX{} mode, and
174includes just \character{"} by default.
175\versionadded{2.3}
176\end{memberdesc}
177
178\begin{memberdesc}{whitespace_split}
179If \code{True}, tokens will only be split on whitespace. This is useful, for
180example, for parsing command lines with \class{shlex}, getting tokens
181in a similar way to shell arguments.
182\versionadded{2.3}
183\end{memberdesc}
184
185\begin{memberdesc}{infile}
186The name of the current input file, as initially set at class
187instantiation time or stacked by later source requests. It may
188be useful to examine this when constructing error messages.
189\end{memberdesc}
190
191\begin{memberdesc}{instream}
192The input stream from which this \class{shlex} instance is reading
193characters.
194\end{memberdesc}
195
196\begin{memberdesc}{source}
197This member is \code{None} by default. If you assign a string to it,
198that string will be recognized as a lexical-level inclusion request
199similar to the \samp{source} keyword in various shells. That is, the
200immediately following token will be opened as a filename and input taken
201from that stream until \EOF, at which point the \method{close()}
202method of that stream will be called and the input source will again
203become the original input stream. Source requests may be stacked any
204number of levels deep.
205\end{memberdesc}
206
207\begin{memberdesc}{debug}
208If this member is numeric and \code{1} or more, a \class{shlex}
209instance will print verbose progress output on its behavior. If you
210need to use this, you can read the module source code to learn the
211details.
212\end{memberdesc}
213
214\begin{memberdesc}{lineno}
215Source line number (count of newlines seen so far plus one).
216\end{memberdesc}
217
218\begin{memberdesc}{token}
219The token buffer. It may be useful to examine this when catching
220exceptions.
221\end{memberdesc}
222
223\begin{memberdesc}{eof}
224Token used to determine end of file. This will be set to the empty
225string (\code{''}) in non-\POSIX{} mode, and to \code{None} in
226\POSIX{} mode.
227\versionadded{2.3}
228\end{memberdesc}
229
230\subsection{Parsing Rules\label{shlex-parsing-rules}}
231
232When operating in non-\POSIX{} mode, \class{shlex} will try to obey
233the following rules.
234
235\begin{itemize}
236\item Quote characters are not recognized within words
237 (\code{Do"Not"Separate} is parsed as the single word
238 \code{Do"Not"Separate});
239\item Escape characters are not recognized;
240\item Enclosing characters in quotes preserve the literal value of
241 all characters within the quotes;
242\item Closing quotes separate words (\code{"Do"Separate} is parsed
243 as \code{"Do"} and \code{Separate});
244\item If \member{whitespace_split} is \code{False}, any character not
245 declared to be a word character, whitespace, or a quote will be
246 returned as a single-character token. If it is \code{True},
247 \class{shlex} will only split words on whitespace;
248\item \EOF{} is signaled with an empty string (\code{''});
249\item It's not possible to parse empty strings, even if quoted.
250\end{itemize}
251
252When operating in \POSIX{} mode, \class{shlex} will try to obey the
253following parsing rules.
254
255\begin{itemize}
256\item Quotes are stripped out, and do not separate words
257 (\code{"Do"Not"Separate"} is parsed as the single word
258 \code{DoNotSeparate});
259\item Non-quoted escape characters (e.g.\ \character{\textbackslash})
260 preserve the literal value of the next character that follows;
261\item Enclosing characters in quotes which are not part of
262 \member{escapedquotes} (e.g.\ \character{'}) preserve the literal
263 value of all characters within the quotes;
264\item Enclosing characters in quotes which are part of
265 \member{escapedquotes} (e.g.\ \character{"}) preserve the literal
266 value of all characters within the quotes, with the exception of
267 the characters mentioned in \member{escape}. The escape characters
268 retain their special meaning only when followed by the quote in use,
269 or the escape character itself. Otherwise the escape character
270 will be considered a normal character.
271\item \EOF{} is signaled with a \constant{None} value;
272\item Quoted empty strings (\code{''}) are allowed.
273\end{itemize}
274
Note: See TracBrowser for help on using the repository browser.