1 | """ robotparser.py
|
---|
2 |
|
---|
3 | Copyright (C) 2000 Bastian Kleineidam
|
---|
4 |
|
---|
5 | You can choose between two licenses when using this package:
|
---|
6 | 1) GNU GPLv2
|
---|
7 | 2) PSF license for Python 2.2
|
---|
8 |
|
---|
9 | The robots.txt Exclusion Protocol is implemented as specified in
|
---|
10 | http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
|
---|
11 | """
|
---|
12 | import urlparse
|
---|
13 | import urllib
|
---|
14 |
|
---|
15 | __all__ = ["RobotFileParser"]
|
---|
16 |
|
---|
17 |
|
---|
18 | class RobotFileParser:
|
---|
19 | """ This class provides a set of methods to read, parse and answer
|
---|
20 | questions about a single robots.txt file.
|
---|
21 |
|
---|
22 | """
|
---|
23 |
|
---|
24 | def __init__(self, url=''):
|
---|
25 | self.entries = []
|
---|
26 | self.default_entry = None
|
---|
27 | self.disallow_all = False
|
---|
28 | self.allow_all = False
|
---|
29 | self.set_url(url)
|
---|
30 | self.last_checked = 0
|
---|
31 |
|
---|
32 | def mtime(self):
|
---|
33 | """Returns the time the robots.txt file was last fetched.
|
---|
34 |
|
---|
35 | This is useful for long-running web spiders that need to
|
---|
36 | check for new robots.txt files periodically.
|
---|
37 |
|
---|
38 | """
|
---|
39 | return self.last_checked
|
---|
40 |
|
---|
41 | def modified(self):
|
---|
42 | """Sets the time the robots.txt file was last fetched to the
|
---|
43 | current time.
|
---|
44 |
|
---|
45 | """
|
---|
46 | import time
|
---|
47 | self.last_checked = time.time()
|
---|
48 |
|
---|
49 | def set_url(self, url):
|
---|
50 | """Sets the URL referring to a robots.txt file."""
|
---|
51 | self.url = url
|
---|
52 | self.host, self.path = urlparse.urlparse(url)[1:3]
|
---|
53 |
|
---|
    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = [line.strip() for line in f]
        f.close()
        self.errcode = opener.errcode
        if self.errcode in (401, 403):
            # access was denied: be conservative and disallow everything
            self.disallow_all = True
        elif self.errcode >= 400:
            # any other error: treat robots.txt as absent and allow everything
            self.allow_all = True
        elif self.errcode == 200 and lines:
            self.parse(lines)

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """parse the input lines from a robots.txt file.
           We allow that a user-agent: line is not preceded by
           one or more blank lines."""
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            linenumber += 1
            if not line:
                # a blank line ends the current record
                if state == 1:
                    # user-agent lines without any rules: discard the entry
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
        if state == 2:
            # the input ended inside a record: keep what was collected so far
            self._add_entry(entry)

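    # Illustrative sketch of what parse() builds; the input below is a
    # made-up robots.txt, not anything shipped with this module:
    #
    #     User-agent: figtree
    #     Disallow: /tmp
    #
    #     User-agent: *
    #     Disallow: /private
    #
    # This yields one Entry in self.entries (useragents=["figtree"] with a
    # single Disallow rule for "/tmp") and, because of the "*" record, a
    # default_entry that can_fetch() consults only when no named entry
    # matches the requesting agent.
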
    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
        # normalize to the quoted path component of the URL; an empty path
        # is treated as "/"
        url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
    (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path

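# For example, RuleLine("/cyberworld/map/", False) applies to the path
# "/cyberworld/map/index.html" because matching is a simple prefix test,
# while RuleLine("", False) is normalized in __init__ above into an
# allow-everything rule (an empty Disallow value).
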
class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True

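# For example, an Entry whose useragents list contains "figtree" applies to
# the agent string "figtree/1.0": only the token before the "/" is kept,
# both sides are lowercased, and a substring test is used, so the shorter
# declared name "fig" would match "figtree" as well.
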
class URLopener(urllib.FancyURLopener):
    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200

    def prompt_user_passwd(self, host, realm):
        ## If robots.txt file is accessible only with a password,
        ## we act as if the file wasn't there.
        return None, None

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)
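

# A minimal usage sketch of the parser above.  The host, robots.txt URL and
# agent name are placeholders chosen for illustration only; substitute the
# site you actually intend to crawl.
if __name__ == '__main__':
    rp = RobotFileParser()
    rp.set_url('http://www.example.com/robots.txt')  # placeholder URL
    rp.read()      # fetch the file and feed it to parse()
    rp.modified()  # record when it was fetched, for later mtime() checks
    # can_fetch() applies the parsed rules; if the file could not be fetched,
    # the allow_all/disallow_all flags set in read() decide the answer.
    print rp.can_fetch('ExampleBot/1.0',
                       'http://www.example.com/private/page.html')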