\section{\module{robotparser} ---
         Parser for robots.txt}

\declaremodule{standard}{robotparser}
\modulesynopsis{Loads a \protect\file{robots.txt} file and
                answers questions about fetchability of other URLs.}
\sectionauthor{Skip Montanaro}{skip@mojam.com}

\index{WWW}
\index{World Wide Web}
\index{URL}
\index{robots.txt}

This module provides a single class, \class{RobotFileParser}, which answers
questions about whether or not a particular user agent can fetch a URL on
the Web site that published the \file{robots.txt} file. For more details on
the structure of \file{robots.txt} files, see
\url{http://www.robotstxt.org/wc/norobots.html}.
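
A \file{robots.txt} file names, for one or more user agents, the paths on
the site that those agents should not fetch. A purely illustrative sketch
(the robot name \code{ExampleBot} and the paths are hypothetical) might look
like this:

\begin{verbatim}
# Lines starting with "#" are comments.
User-agent: ExampleBot
Disallow: /

User-agent: *
Disallow: /cgi-bin/
Disallow: /private/
\end{verbatim}

Here the hypothetical \code{ExampleBot} is asked to stay away from the whole
site, while all other robots are asked to avoid only \file{/cgi-bin/} and
\file{/private/}.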
\begin{classdesc}{RobotFileParser}{}

This class provides a set of methods to read, parse and answer questions
about a single \file{robots.txt} file.

\begin{methoddesc}{set_url}{url}
Sets the URL referring to a \file{robots.txt} file.
\end{methoddesc}

\begin{methoddesc}{read}{}
Reads the \file{robots.txt} URL and feeds it to the parser.
\end{methoddesc}

\begin{methoddesc}{parse}{lines}
Parses the \var{lines} argument, which should be a list of lines read from
a \file{robots.txt} file.
\end{methoddesc}
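
\method{parse()} is useful when the contents of the \file{robots.txt} file
have already been retrieved by some means other than \method{read()}; the
rules and URLs below are purely illustrative:

\begin{verbatim}
>>> import robotparser
>>> rp = robotparser.RobotFileParser()
>>> rp.parse(["User-agent: *", "Disallow: /private/"])
>>> rp.can_fetch("*", "http://example.com/private/index.html")
False
>>> rp.can_fetch("*", "http://example.com/index.html")
True
\end{verbatim}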

\begin{methoddesc}{can_fetch}{useragent, url}
Returns \code{True} if the \var{useragent} is allowed to fetch the \var{url}
according to the rules contained in the parsed \file{robots.txt} file.
\end{methoddesc}

\begin{methoddesc}{mtime}{}
Returns the time the \file{robots.txt} file was last fetched. This is
useful for long-running web spiders that need to check for new
\file{robots.txt} files periodically.
\end{methoddesc}

\begin{methoddesc}{modified}{}
Sets the time the \file{robots.txt} file was last fetched to the current
time.
\end{methoddesc}
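
A long-running spider might use these two methods together, calling
\method{modified()} after each fetch and later comparing \method{mtime()}
with the current time to decide when to fetch the file again; the one-hour
threshold below is purely illustrative:

\begin{verbatim}
>>> import time
>>> import robotparser
>>> rp = robotparser.RobotFileParser()
>>> rp.set_url("http://www.musi-cal.com/robots.txt")
>>> rp.read()
>>> rp.modified()                 # record when this copy was fetched
>>> if time.time() - rp.mtime() > 3600:   # illustrative one-hour limit
...     rp.read()                 # fetch and parse a fresh copy
...     rp.modified()
...
\end{verbatim}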

\end{classdesc}

The following example demonstrates basic use of the \class{RobotFileParser}
class.

\begin{verbatim}
>>> import robotparser
>>> rp = robotparser.RobotFileParser()
>>> rp.set_url("http://www.musi-cal.com/robots.txt")
>>> rp.read()
>>> rp.can_fetch("*", "http://www.musi-cal.com/cgi-bin/search?city=San+Francisco")
False
>>> rp.can_fetch("*", "http://www.musi-cal.com/")
True
\end{verbatim}