[2] | 1 | """ robotparser.py
|
---|
| 2 |
|
---|
| 3 | Copyright (C) 2000 Bastian Kleineidam
|
---|
| 4 |
|
---|
| 5 | You can choose between two licenses when using this package:
|
---|
| 6 | 1) GNU GPLv2
|
---|
| 7 | 2) PSF license for Python 2.2
|
---|
| 8 |
|
---|
| 9 | The robots.txt Exclusion Protocol is implemented as specified in
|
---|
| 10 | http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
|
---|
| 11 | """
|
---|
| 12 | import urlparse
|
---|
| 13 | import urllib
|
---|
| 14 |
|
---|
| 15 | __all__ = ["RobotFileParser"]
|
---|
| 16 |
|
---|
| 17 |
|
---|
| 18 | class RobotFileParser:
|
---|
| 19 | """ This class provides a set of methods to read, parse and answer
|
---|
| 20 | questions about a single robots.txt file.
|
---|
| 21 |
|
---|
| 22 | """
|
---|
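
    # A minimal usage sketch (the URL and agent name below are illustrative
    # only, not part of the module):
    #
    #   rp = RobotFileParser("http://www.example.com/robots.txt")
    #   rp.read()
    #   if rp.can_fetch("MyCrawler", "http://www.example.com/some/page.html"):
    #       ...  # fetch the page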

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()
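
    # A possible re-fetch pattern for a long-running spider (sketch only;
    # REFETCH_INTERVAL is a hypothetical application-level constant):
    #
    #   if time.time() - rp.mtime() > REFETCH_INTERVAL:
    #       rp.read()
    #       rp.modified()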

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = [line.strip() for line in f]
        f.close()
        self.errcode = opener.errcode
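        # Status-code policy: 401/403 are treated as a blanket disallow,
        # any other error code >= 400 as "allow all", and only a 200
        # response with a non-empty body is actually parsed.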
        if self.errcode in (401, 403):
            self.disallow_all = True
        elif self.errcode >= 400:
            self.allow_all = True
        elif self.errcode == 200 and lines:
            self.parse(lines)

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """parse the input lines from a robots.txt file.
           We allow that a user-agent: line is not preceded by
           one or more blank lines."""
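        # For reference, a robots.txt fragment of the kind this parser
        # handles (illustrative example):
        #
        #   User-agent: ExampleBot
        #   Disallow: /tmp/
        #
        #   User-agent: *
        #   Disallow: /cyberworld/map/
        #   Allow: /cyberworld/map/index.html
        #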
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            linenumber += 1
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
        if state == 2:
            self._add_entry(entry)

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
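        # Normalize the URL: drop the scheme and netloc and keep only the
        # quoted path (plus params/query/fragment), so it can be compared
        # against the rule paths, which are stored quoted as well.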
        parsed_url = urlparse.urlparse(urllib.unquote(url))
        url = urlparse.urlunparse(('', '', parsed_url.path,
            parsed_url.params, parsed_url.query, parsed_url.fragment))
        url = urllib.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        path = urlparse.urlunparse(urlparse.urlparse(path))
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)
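
    # Note: applies_to is a plain prefix match; it does not check for a
    # path-segment boundary. Illustrative behaviour:
    #   RuleLine("/tmp", False).applies_to("/tmp/a.html")  -> True
    #   RuleLine("/tmp", False).applies_to("/tmpfoo.html") -> True as well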

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False
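
    # Matching is a case-insensitive substring test on the name token,
    # e.g. (illustrative) an entry for "example" applies to the agent
    # string "SuperExampleBot/2.1", since "example" is contained in
    # "superexamplebot".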

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True


class URLopener(urllib.FancyURLopener):
    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200

    def prompt_user_passwd(self, host, realm):
        ## If robots.txt file is accessible only with a password,
        ## we act as if the file wasn't there.
        return None, None

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)