source: python/trunk/Lib/robotparser.py@394

Last change on this file since 394 was 391, checked in by dmik, 11 years ago

python: Merge vendor 2.7.6 to trunk.

  • Property svn:eol-style set to native
File size: 7.1 KB
1""" robotparser.py
2
3 Copyright (C) 2000 Bastian Kleineidam
4
5 You can choose between two licenses when using this package:
6 1) GNU GPLv2
7 2) PSF license for Python 2.2
8
9 The robots.txt Exclusion Protocol is implemented as specified in
10 http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
11"""
12import urlparse
13import urllib
14
15__all__ = ["RobotFileParser"]
16
17
18class RobotFileParser:
19 """ This class provides a set of methods to read, parse and answer
20 questions about a single robots.txt file.
21
22 """
23
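    # Illustrative usage sketch (the example.com URLs are placeholders, not
    # part of this module):
    #
    #   rp = RobotFileParser("http://www.example.com/robots.txt")
    #   rp.read()
    #   if rp.can_fetch("MyCrawler/1.0", "http://www.example.com/some/page.html"):
    #       pass  # the crawler may download the page
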
    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = [line.strip() for line in f]
        f.close()
        self.errcode = opener.errcode
        if self.errcode in (401, 403):
            self.disallow_all = True
        elif self.errcode >= 400:
            self.allow_all = True
        elif self.errcode == 200 and lines:
            self.parse(lines)

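    # Illustrative sketch of the status-code policy above (placeholder URL):
    #
    #   rp = RobotFileParser("http://www.example.com/robots.txt")
    #   rp.read()
    #   # a 401/403 response sets disallow_all, so every can_fetch() is False;
    #   # any other status >= 400 sets allow_all, so every can_fetch() is True;
    #   # only a 200 response with content is actually parsed.
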
    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """parse the input lines from a robots.txt file.
           We allow that a user-agent: line is not preceded by
           one or more blank lines."""
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            linenumber += 1
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
        if state == 2:
            self._add_entry(entry)

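    # Illustrative parse sketch (made-up rules, no network access needed):
    #
    #   rp = RobotFileParser()
    #   rp.parse([
    #       "User-agent: *",
    #       "Allow: /private/public.html",
    #       "Disallow: /private/",
    #   ])
    #   # the "*" group becomes rp.default_entry; named groups go to rp.entries
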

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
        parsed_url = urlparse.urlparse(urllib.unquote(url))
        url = urlparse.urlunparse(('', '', parsed_url.path,
            parsed_url.params, parsed_url.query, parsed_url.fragment))
        url = urllib.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

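    # Continuing the parse sketch above (rule lines are checked first match
    # wins, which is why the more specific Allow line was listed before the
    # Disallow line):
    #
    #   rp.can_fetch("MyCrawler", "http://www.example.com/private/secret.html")
    #   # -> False (matches "Disallow: /private/")
    #   rp.can_fetch("MyCrawler", "http://www.example.com/private/public.html")
    #   # -> True (matches "Allow: /private/public.html" first)
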

    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        path = urlparse.urlunparse(urlparse.urlparse(path))
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

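    # Illustrative prefix checks (made-up paths):
    #
    #   RuleLine("/private/", False).applies_to("/private/page.html")  # -> True
    #   RuleLine("/private/", False).applies_to("/public/page.html")   # -> False
    #   RuleLine("", False).applies_to("/anything")                    # -> True
    #   # (an empty path matches every URL; an empty Disallow value is also
    #   # turned into an Allow in __init__, so it permits everything)
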
    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

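    # Illustrative match (made-up agent name):
    #
    #   e = Entry()
    #   e.useragents.append("mycrawler")
    #   e.applies_to("MyCrawler/2.1")  # -> True: the name token before the
    #                                  # first "/" is lower-cased and compared
    #   e.applies_to("OtherBot/0.9")   # -> False
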
    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True

class URLopener(urllib.FancyURLopener):
    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200

    def prompt_user_passwd(self, host, realm):
        ## If robots.txt file is accessible only with a password,
        ## we act as if the file wasn't there.
        return None, None

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)