source: python/trunk/Lib/robotparser.py@394

Last change on this file since 394 was 391, checked in by dmik, 11 years ago

python: Merge vendor 2.7.6 to trunk.

  • Property svn:eol-style set to native
File size: 7.1 KB
1""" robotparser.py
2
3 Copyright (C) 2000 Bastian Kleineidam
4
5 You can choose between two licenses when using this package:
6 1) GNU GPLv2
7 2) PSF license for Python 2.2
8
9 The robots.txt Exclusion Protocol is implemented as specified in
10 http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
11"""
12import urlparse
13import urllib
14
15__all__ = ["RobotFileParser"]
16
17
18class RobotFileParser:
19 """ This class provides a set of methods to read, parse and answer
20 questions about a single robots.txt file.
21
22 """
23
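    # Illustrative usage sketch (the example.com URLs are placeholders, not
    # part of this module):
    #
    #   rp = RobotFileParser("http://www.example.com/robots.txt")
    #   rp.read()
    #   if rp.can_fetch("MyCrawler/1.0", "http://www.example.com/some/page.html"):
    #       pass  # the crawler may download the page
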
    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = [line.strip() for line in f]
        f.close()
        self.errcode = opener.errcode
        if self.errcode in (401, 403):
            self.disallow_all = True
        elif self.errcode >= 400:
            self.allow_all = True
        elif self.errcode == 200 and lines:
            self.parse(lines)

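    # Illustrative sketch of the status-code policy above (placeholder URL):
    #
    #   rp = RobotFileParser("http://www.example.com/robots.txt")
    #   rp.read()
    #   # a 401/403 response sets disallow_all, so every can_fetch() is False;
    #   # any other status >= 400 sets allow_all, so every can_fetch() is True;
    #   # only a 200 response with content is actually parsed.
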
    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """parse the input lines from a robots.txt file.
           We allow that a user-agent: line is not preceded by
           one or more blank lines."""
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            linenumber += 1
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
        if state == 2:
            self._add_entry(entry)

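    # Illustrative parse sketch (made-up rules, no network access needed):
    #
    #   rp = RobotFileParser()
    #   rp.parse([
    #       "User-agent: *",
    #       "Allow: /private/public.html",
    #       "Disallow: /private/",
    #   ])
    #   # the "*" group becomes rp.default_entry; named groups go to rp.entries
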

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
        parsed_url = urlparse.urlparse(urllib.unquote(url))
        url = urlparse.urlunparse(('', '', parsed_url.path,
            parsed_url.params, parsed_url.query, parsed_url.fragment))
        url = urllib.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

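    # Continuing the parse sketch above (rule lines are checked first match
    # wins, which is why the more specific Allow line was listed before the
    # Disallow line):
    #
    #   rp.can_fetch("MyCrawler", "http://www.example.com/private/secret.html")
    #   # -> False (matches "Disallow: /private/")
    #   rp.can_fetch("MyCrawler", "http://www.example.com/private/public.html")
    #   # -> True (matches "Allow: /private/public.html" first)
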

    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        path = urlparse.urlunparse(urlparse.urlparse(path))
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

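    # Illustrative prefix checks (made-up paths):
    #
    #   RuleLine("/private/", False).applies_to("/private/page.html")  # -> True
    #   RuleLine("/private/", False).applies_to("/public/page.html")   # -> False
    #   RuleLine("", False).applies_to("/anything")                    # -> True
    #   # (an empty path matches every URL; an empty Disallow value is also
    #   # turned into an Allow in __init__, so it permits everything)
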
    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

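    # Illustrative match (made-up agent name):
    #
    #   e = Entry()
    #   e.useragents.append("mycrawler")
    #   e.applies_to("MyCrawler/2.1")  # -> True: the name token before the
    #                                  # first "/" is lower-cased and compared
    #   e.applies_to("OtherBot/0.9")   # -> False
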
    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True

class URLopener(urllib.FancyURLopener):
    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200

    def prompt_user_passwd(self, host, realm):
        ## If robots.txt file is accessible only with a password,
        ## we act as if the file wasn't there.
        return None, None

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)