Context Navigation

res.c

Visit:

Last change on this file was 3440, checked in by bird, 18 years ago
wget 1.10.2
File size: 15.7 KB

Line
1	/* Support for Robot Exclusion Standard (RES).
2	Copyright (C) 2001 Free Software Foundation, Inc.
3
4	This file is part of Wget.
5
6	This program is free software; you can redistribute it and/or modify
7	it under the terms of the GNU General Public License as published by
8	the Free Software Foundation; either version 2 of the License, or (at
9	your option) any later version.
10
11	This program is distributed in the hope that it will be useful, but
12	WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	General Public License for more details.
15
16	You should have received a copy of the GNU General Public License
17	along with this program; if not, write to the Free Software
18	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19
20	In addition, as a special exception, the Free Software Foundation
21	gives permission to link the code of its release of Wget with the
22	OpenSSL project's "OpenSSL" library (or with modified versions of it
23	that use the same license as the "OpenSSL" library), and distribute
24	the linked executables. You must obey the GNU General Public License
25	in all respects for all of the code used other than "OpenSSL". If you
26	modify this file, you may extend this exception to your version of the
27	file, but you are not obligated to do so. If you do not wish to do
28	so, delete this exception statement from your version. */
29
30	/* This file implements the Robot Exclusion Standard (RES).
31
32	RES is a simple protocol that enables site admins to signalize to
33	the web crawlers that certain parts of the site should not be
34	accessed. All the admin needs to do is create a "robots.txt" file
35	in the web server root, and use simple commands to allow or
36	disallow access to certain parts of the site.
37
38	The first specification was written by Martijn Koster in 1994, and
39	is still available at <http://www.robotstxt.org/wc/norobots.html>.
40	In 1996, Martijn wrote an Internet Draft specifying an improved RES
41	specification; however, that work was apparently abandoned since
42	the draft has expired in 1997 and hasn't been replaced since. The
43	draft is available at
44	<http://www.robotstxt.org/wc/norobots-rfc.html>.
45
46	This file implements RES as specified by the draft. Note that this
47	only handles the "robots.txt" support. The META tag that controls
48	whether the links should be followed is handled in `html-url.c'.
49
50	Known deviations:
51
52	* The end-of-line comment recognition is more in the spirit of the
53	Bourne Shell (as specified by RES-1994). That means that
54	"foo#bar" is taken literally, whereas "foo #bar" is interpreted
55	as "foo". The Draft apparently specifies that both should be
56	interpreted as "foo".
57
58	* We don't recognize sole CR as the line ending.
59
60	* We don't implement expiry mechanism for /robots.txt specs. I
61	consider it non-necessary for a relatively short-lived
62	application such as Wget. Besides, it is highly questionable
63	whether anyone deploys the recommended expiry scheme for
64	robots.txt.
65
66	Entry points are functions res_parse, res_parse_from_file,
67	res_match_path, res_register_specs, res_get_specs, and
68	res_retrieve_file. */
69
70	#ifdef HAVE_CONFIG_H
71	# include <config.h>
72	#endif
73
74	#include <stdio.h>
75	#include <stdlib.h>
76	#ifdef HAVE_STRING_H
77	# include <string.h>
78	#else
79	# include <strings.h>
80	#endif /* HAVE_STRING_H */
81	#include <errno.h>
82	#include <assert.h>
83
84	#include "wget.h"
85	#include "utils.h"
86	#include "hash.h"
87	#include "url.h"
88	#include "retr.h"
89	#include "res.h"
90
/* One Allow/Disallow rule read from robots.txt.  */
struct path_info {
  /* Rule path, heap-allocated.  add_path stores it without the
     leading slash.  */
  char *path;
  /* Non-zero if a URL matching PATH may be retrieved, zero if it is
     rejected.  */
  int allowedp;
  /* Non-zero if the rule was read under a `User-Agent' line that named
     "wget" itself rather than "*".  */
  int user_agent_exact_p;
};
96
/* The set of rules applicable to Wget, as parsed from one robots.txt.
   PATHS is a growable array managed by add_path.  */
struct robot_specs {
  /* Number of entries of PATHS that are in use.  */
  int count;
  /* Number of entries PATHS has room for.  */
  int size;
  /* The rules, in the order they were read.  free_specs releases this
     with xfree_null, so it is presumably NULL while no rule has been
     added.  */
  struct path_info *paths;
};
102
103
104	/* Parsing the robot spec. */
105
/* Check whether AGENT (a string of length LENGTH, delimited by length
   rather than by a terminating NUL) equals "wget" or "*".

   If it is either of them, *MATCHES is set to one.  If it is "wget"
   (compared with BOUNDED_EQUAL_NO_CASE), *EXACT_MATCH is set to one as
   well.  In every other case the corresponding output is set to zero,
   so both outputs are always written.  */

static void
match_user_agent (const char *agent, int length,
                  int *matches, int *exact_match)
{
  /* The wildcard record applies to every robot, but is not specific
     to us.  */
  int is_wildcard = (length == 1 && *agent == '*');
  int is_wget = (!is_wildcard
                 && BOUNDED_EQUAL_NO_CASE (agent, agent + length, "wget"));

  *matches = (is_wildcard || is_wget);
  *exact_match = is_wget;
}
130
131	/* Add a path specification between PATH_B and PATH_E as one of the
132	paths in SPECS. */
133
134	static void
135	add_path (struct robot_specs specs, const char path_b, const char *path_e,
136	int allowedp, int exactp)
137	{
138	struct path_info pp;
139	if (path_b < path_e && *path_b == '/')
140	/* Our path representation doesn't use a leading slash, so remove
141	one from theirs. */
142	++path_b;
143	pp.path = strdupdelim (path_b, path_e);
144	pp.allowedp = allowedp;
145	pp.user_agent_exact_p = exactp;
146	++specs->count;
147	if (specs->count > specs->size)
148	{
149	if (specs->size == 0)
150	specs->size = 1;
151	else
152	specs->size <<= 1;
153	specs->paths = xrealloc (specs->paths,
154	specs->size * sizeof (struct path_info));
155	}
156	specs->paths[specs->count - 1] = pp;
157	}
158
159	/* Recreate SPECS->paths with only those paths that have non-zero
160	user_agent_exact_p. */
161
162	static void
163	prune_non_exact (struct robot_specs *specs)
164	{
165	struct path_info *newpaths;
166	int i, j, cnt;
167	cnt = 0;
168	for (i = 0; i < specs->count; i++)
169	if (specs->paths[i].user_agent_exact_p)
170	++cnt;
171	newpaths = xnew_array (struct path_info, cnt);
172	for (i = 0, j = 0; i < specs->count; i++)
173	if (specs->paths[i].user_agent_exact_p)
174	newpaths[j++] = specs->paths[i];
175	assert (j == cnt);
176	xfree (specs->paths);
177	specs->paths = newpaths;
178	specs->count = cnt;
179	specs->size = cnt;
180	}
181
/* Line-scanning helpers for res_parse.  They refer to locals of that
   function by name: `lineend' (end of the significant part of the
   current line) and `field_b'/`field_e' (bounds of the field name).  */

/* Non-zero once P has reached the end of the current line.  */
#define EOL(p) ((p) >= lineend)

/* Advance P over whitespace, stopping at the end of the line.  */
#define SKIP_SPACE(p) do {                      \
  while (!EOL (p) && ISSPACE (*(p)))            \
    ++(p);                                      \
} while (0)

/* Non-zero if the current field name equals STRING_LITERAL, as judged
   by BOUNDED_EQUAL_NO_CASE.  */
#define FIELD_IS(string_literal)                                \
  BOUNDED_EQUAL_NO_CASE (field_b, field_e, string_literal)
191
192	/* Parse textual RES specs beginning with SOURCE of length LENGTH.
193	Return a specs objects ready to be fed to res_match_path.
194
195	The parsing itself is trivial, but creating a correct SPECS object
196	is trickier than it seems, because RES is surprisingly byzantine if
197	you attempt to implement it correctly.
198
199	A "record" is a block of one or more `User-Agent' lines followed by
200	one or more `Allow' or `Disallow' lines. Record is accepted by
201	Wget if one of the `User-Agent' lines was "wget", or if the user
202	agent line was "*".
203
204	After all the lines have been read, we examine whether an exact
205	("wget") user-agent field was specified. If so, we delete all the
206	lines read under "User-Agent: *" blocks because we have our own
207	Wget-specific blocks. This enables the admin to say:
208
209	User-Agent: *
210	Disallow: /
211
212	User-Agent: google
213	User-Agent: wget
214	Disallow: /cgi-bin
215
216	This means that to Wget and to Google, /cgi-bin is disallowed,
217	whereas for all other crawlers, everything is disallowed.
218	res_parse is implemented so that the order of records doesn't
219	matter. In the case above, the "User-Agent: *" could have come
220	after the other one. */
221
222	struct robot_specs *
223	res_parse (const char *source, int length)
224	{
225	int line_count = 1;
226
227	const char *p = source;
228	const char *end = source + length;
229
230	/* non-zero if last applicable user-agent field matches Wget. */
231	int user_agent_applies = 0;
232
233	/* non-zero if last applicable user-agent field exactly matches
234	Wget. */
235	int user_agent_exact = 0;
236
237	/* whether we ever encountered exact user agent. */
238	int found_exact = 0;
239
240	/* count of allow/disallow lines in the current "record", i.e. after
241	the last `user-agent' instructions. */
242	int record_count = 0;
243
244	struct robot_specs *specs = xnew0 (struct robot_specs);
245
246	while (1)
247	{
248	const char lineend, lineend_real;
249	const char field_b, field_e;
250	const char value_b, value_e;
251
252	if (p == end)
253	break;
254	lineend_real = memchr (p, '\n', end - p);
255	if (lineend_real)
256	++lineend_real;
257	else
258	lineend_real = end;
259	lineend = lineend_real;
260
261	/* Before doing anything else, check whether the line is empty
262	or comment-only. */
263	SKIP_SPACE (p);
264	if (EOL (p) \|\| *p == '#')
265	goto next;
266
267	/* Make sure the end-of-line comments are respected by setting
268	lineend to a location preceding the first comment. Real line
269	ending remains in lineend_real. */
270	for (lineend = p; lineend < lineend_real; lineend++)
271	if ((lineend == p \|\| ISSPACE (*(lineend - 1)))
272	&& *lineend == '#')
273	break;
274
275	/* Ignore trailing whitespace in the same way. */
276	while (lineend > p && ISSPACE (*(lineend - 1)))
277	--lineend;
278
279	assert (!EOL (p));
280
281	field_b = p;
282	while (!EOL (p) && (ISALNUM (p) \|\| p == '-'))
283	++p;
284	field_e = p;
285
286	SKIP_SPACE (p);
287	if (field_b == field_e \|\| EOL (p) \|\| *p != ':')
288	{
289	DEBUGP (("Ignoring malformed line %d", line_count));
290	goto next;
291	}
292	++p; /* skip ':' */
293	SKIP_SPACE (p);
294
295	value_b = p;
296	while (!EOL (p))
297	++p;
298	value_e = p;
299
300	/* Finally, we have a syntactically valid line. */
301	if (FIELD_IS ("user-agent"))
302	{
303	/* We have to support several cases:
304
305	--previous records--
306
307	User-Agent: foo
308	User-Agent: Wget
309	User-Agent: bar
310	... matching record ...
311
312	User-Agent: baz
313	User-Agent: qux
314	... non-matching record ...
315
316	User-Agent: *
317	... matching record, but will be pruned later ...
318
319	We have to respect `User-Agent' at the beginning of each
320	new record simply because we don't know if we're going to
321	encounter "Wget" among the agents or not. Hence,
322	match_user_agent is called when record_count != 0.
323
324	But if record_count is 0, we have to keep calling it
325	until it matches, and if that happens, we must not call
326	it any more, until the next record. Hence the other part
327	of the condition. */
328	if (record_count != 0 \|\| user_agent_applies == 0)
329	match_user_agent (value_b, value_e - value_b,
330	&user_agent_applies, &user_agent_exact);
331	if (user_agent_exact)
332	found_exact = 1;
333	record_count = 0;
334	}
335	else if (FIELD_IS ("allow"))
336	{
337	if (user_agent_applies)
338	{
339	add_path (specs, value_b, value_e, 1, user_agent_exact);
340	}
341	++record_count;
342	}
343	else if (FIELD_IS ("disallow"))
344	{
345	if (user_agent_applies)
346	{
347	int allowed = 0;
348	if (value_b == value_e)
349	/* Empty "disallow" line means everything is
350	allowed! */
351	allowed = 1;
352	add_path (specs, value_b, value_e, allowed, user_agent_exact);
353	}
354	++record_count;
355	}
356	else
357	{
358	DEBUGP (("Ignoring unknown field at line %d", line_count));
359	goto next;
360	}
361
362	next:
363	p = lineend_real;
364	++line_count;
365	}
366
367	if (found_exact)
368	{
369	/* We've encountered an exactly matching user-agent. Throw out
370	all the stuff with user-agent: . /
371	prune_non_exact (specs);
372	}
373	else if (specs->size > specs->count)
374	{
375	/* add_path normally over-allocates specs->paths. Reallocate it
376	to the correct size in order to conserve some memory. */
377	specs->paths = xrealloc (specs->paths,
378	specs->count * sizeof (struct path_info));
379	specs->size = specs->count;
380	}
381
382	return specs;
383	}
384
385	/* The same like res_parse, but first map the FILENAME into memory,
386	and then parse it. */
387
388	struct robot_specs *
389	res_parse_from_file (const char *filename)
390	{
391	struct robot_specs *specs;
392	struct file_memory *fm = read_file (filename);
393	if (!fm)
394	{
395	logprintf (LOG_NOTQUIET, _("Cannot open %s: %s"),
396	filename, strerror (errno));
397	return NULL;
398	}
399	specs = res_parse (fm->content, fm->length);
400	read_file_free (fm);
401	return specs;
402	}
403
404	static void
405	free_specs (struct robot_specs *specs)
406	{
407	int i;
408	for (i = 0; i < specs->count; i++)
409	xfree (specs->paths[i].path);
410	xfree_null (specs->paths);
411	xfree (specs);
412	}
413
414
415	/* Matching of a path according to the specs. */
416
/* Return the value of the hexadecimal digit C (0-15), or -1 if C is
   not a hexadecimal digit.  Only the digits and the letters a-f/A-F
   are recognized.  */

static int
hex_digit_value (char c)
{
  if (c >= '0' && c <= '9')
    return c - '0';
  if (c >= 'a' && c <= 'f')
    return c - 'a' + 10;
  if (c >= 'A' && c <= 'F')
    return c - 'A' + 10;
  return -1;
}

/* Return the character at *PTR as it should be compared.

   If that character is '%' and the two characters after it form a
   hexadecimal number, and if that number is not a numerical
   representation of '/', return the decoded character and advance
   *PTR by two, so that the caller's usual single increment steps over
   the whole escape.  Otherwise return the character as is and leave
   *PTR alone.  */

static char
decode_maybe (const char **ptr)
{
  const char *p = *ptr;
  char c = *p;

  if (c == '%')
    {
      /* p[2] is looked at only when p[1] is a hex digit, i.e. not the
         terminating NUL, so this never reads past the string.  */
      int hi = hex_digit_value (p[1]);
      int lo = hi < 0 ? -1 : hex_digit_value (p[2]);
      if (lo >= 0)
        {
          char decoded = (char) ((hi << 4) + lo);
          if (decoded != '/')
            {
              c = decoded;
              *ptr = p + 2;
            }
        }
    }
  return c;
}

/* The inner matching engine: return non-zero if RECORD_PATH matches
   URL_PATH.  The rules for matching are described at
   <http://www.robotstxt.org/wc/norobots-rfc.txt>, section 3.2.2.

   This is a prefix match: RECORD_PATH matches when URL_PATH begins
   with it, comparing character by character after %XX escapes other
   than an encoded '/' have been decoded on both sides.  An empty
   RECORD_PATH therefore matches everything.  */

static int
matches (const char *record_path, const char *url_path)
{
  const char *rp = record_path;
  const char *up = url_path;

  for (; ; ++rp, ++up)
    {
      char rc, uc;

      if (!*rp)
        /* The whole record path was consumed: it is a prefix.  */
        return 1;
      if (!*up)
        /* The URL ended before the record path did.  */
        return 0;

      rc = decode_maybe (&rp);
      uc = decode_maybe (&up);
      if (rc != uc)
        return 0;
    }
}
457
458	/* Iterate through all paths in SPECS. For the first one that
459	matches, return its allow/reject status. If none matches,
460	retrieval is by default allowed. */
461
462	int
463	res_match_path (const struct robot_specs specs, const char path)
464	{
465	int i;
466	if (!specs)
467	return 1;
468	for (i = 0; i < specs->count; i++)
469	if (matches (specs->paths[i].path, path))
470	{
471	int allowedp = specs->paths[i].allowedp;
472	DEBUGP (("%s path %s because of rule `%s'.\n",
473	allowedp ? "Allowing" : "Rejecting",
474	path, specs->paths[i].path));
475	return allowedp;
476	}
477	return 1;
478	}
479
480
481	/* Registering the specs. */
482
/* Maps a "host:port" key to the `struct robot_specs *' registered for
   that server.  Created lazily by res_register_specs; NULL until
   then and again after res_cleanup.  */
static struct hash_table *registered_specs;

/* Stolen from cookies.c.

   Build the "HOST:PORT" key in RESULT.  The storage comes from
   alloca, so RESULT is only valid until the function that used the
   macro returns; copy it (as res_register_specs does with xstrdup)
   if it must live longer.  number_to_string is presumably what
   terminates the string -- the last byte reserved here is for the
   NUL.  */
#define SET_HOSTPORT(host, port, result) do {                   \
  int HP_len = strlen (host);                                   \
  (result) = alloca (HP_len + 1 + numdigit (port) + 1);         \
  memcpy ((result), (host), HP_len);                            \
  (result)[HP_len] = ':';                                       \
  number_to_string ((result) + HP_len + 1, (port));             \
} while (0)
493
494	/* Register RES specs that below to server on HOST:PORT. They will
495	later be retrievable using res_get_specs. */
496
497	void
498	res_register_specs (const char host, int port, struct robot_specs specs)
499	{
500	struct robot_specs *old;
501	char hp, hp_old;
502	SET_HOSTPORT (host, port, hp);
503
504	if (!registered_specs)
505	registered_specs = make_nocase_string_hash_table (0);
506
507	if (hash_table_get_pair (registered_specs, hp, &hp_old, &old))
508	{
509	if (old)
510	free_specs (old);
511	hash_table_put (registered_specs, hp_old, specs);
512	}
513	else
514	{
515	hash_table_put (registered_specs, xstrdup (hp), specs);
516	}
517	}
518
519	/* Get the specs that belong to HOST:PORT. */
520
521	struct robot_specs *
522	res_get_specs (const char *host, int port)
523	{
524	char *hp;
525	SET_HOSTPORT (host, port, hp);
526	if (!registered_specs)
527	return NULL;
528	return hash_table_get (registered_specs, hp);
529	}
530
531
532	/* Loading the robots file. */
533
534	#define RES_SPECS_LOCATION "/robots.txt"
535
536	/* Retrieve the robots.txt from the server root of the server that
537	serves URL. The file will be named according to the currently
538	active rules, and the file name will be returned in *file.
539
540	Return non-zero if robots were retrieved OK, zero otherwise. */
541
542	int
543	res_retrieve_file (const char url, char *file)
544	{
545	uerr_t err;
546	char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
547
548	logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
549	*file = NULL;
550	err = retrieve_url (robots_url, file, NULL, NULL, NULL);
551	xfree (robots_url);
552
553	if (err != RETROK && *file != NULL)
554	{
555	/* If the file is not retrieved correctly, but retrieve_url
556	allocated the file name, deallocate is here so that the
557	caller doesn't have to worry about it. */
558	xfree (*file);
559	*file = NULL;
560	}
561	return err == RETROK;
562	}
563
564
/* Callback passed to hash_table_map by res_cleanup: free one entry of
   registered_specs.  KEY is the "host:port" string stored by
   res_register_specs and VALUE is the `struct robot_specs *'
   registered for it.  ARG_IGNORED is unused.  Always returns 0.  */

static int
cleanup_hash_table_mapper (void *key, void *value, void *arg_ignored)
{
  xfree (key);
  /* A NULL value can have been registered (res_register_specs allows
     for one when replacing an entry), so don't hand it to
     free_specs.  */
  if (value)
    free_specs (value);
  return 0;
}
572
573	void
574	res_cleanup (void)
575	{
576	if (registered_specs)
577	{
578	hash_table_map (registered_specs, cleanup_hash_table_mapper, NULL);
579	hash_table_destroy (registered_specs);
580	registered_specs = NULL;
581	}
582	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/essentials/net-misc/wget/src/res.c

Download in other formats: