source: trunk/essentials/net-misc/wget/src/res.c

Last change on this file was 3440, checked in by bird, 18 years ago

wget 1.10.2

File size: 15.7 KB
Line 
1/* Support for Robot Exclusion Standard (RES).
2 Copyright (C) 2001 Free Software Foundation, Inc.
3
4This file is part of Wget.
5
6This program is free software; you can redistribute it and/or modify
7it under the terms of the GNU General Public License as published by
8the Free Software Foundation; either version 2 of the License, or (at
9your option) any later version.
10
11This program is distributed in the hope that it will be useful, but
12WITHOUT ANY WARRANTY; without even the implied warranty of
13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14General Public License for more details.
15
16You should have received a copy of the GNU General Public License
17along with this program; if not, write to the Free Software
18Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19
20In addition, as a special exception, the Free Software Foundation
21gives permission to link the code of its release of Wget with the
22OpenSSL project's "OpenSSL" library (or with modified versions of it
23that use the same license as the "OpenSSL" library), and distribute
24the linked executables. You must obey the GNU General Public License
25in all respects for all of the code used other than "OpenSSL". If you
26modify this file, you may extend this exception to your version of the
27file, but you are not obligated to do so. If you do not wish to do
28so, delete this exception statement from your version. */
29
30/* This file implements the Robot Exclusion Standard (RES).
31
   RES is a simple protocol that enables site admins to signal to web
   crawlers that certain parts of the site should not be accessed.
   All the admin needs to do is create a "robots.txt" file in the web
   server root, and use simple commands to allow or disallow access
   to certain parts of the site.
37
38 The first specification was written by Martijn Koster in 1994, and
39 is still available at <http://www.robotstxt.org/wc/norobots.html>.
40 In 1996, Martijn wrote an Internet Draft specifying an improved RES
41 specification; however, that work was apparently abandoned since
42 the draft has expired in 1997 and hasn't been replaced since. The
43 draft is available at
44 <http://www.robotstxt.org/wc/norobots-rfc.html>.
45
46 This file implements RES as specified by the draft. Note that this
47 only handles the "robots.txt" support. The META tag that controls
48 whether the links should be followed is handled in `html-url.c'.
49
50 Known deviations:
51
52 * The end-of-line comment recognition is more in the spirit of the
53 Bourne Shell (as specified by RES-1994). That means that
54 "foo#bar" is taken literally, whereas "foo #bar" is interpreted
55 as "foo". The Draft apparently specifies that both should be
56 interpreted as "foo".
57
58 * We don't recognize sole CR as the line ending.
59
60 * We don't implement expiry mechanism for /robots.txt specs. I
61 consider it non-necessary for a relatively short-lived
62 application such as Wget. Besides, it is highly questionable
63 whether anyone deploys the recommended expiry scheme for
64 robots.txt.
65
   Entry points are functions res_parse, res_parse_from_file,
   res_match_path, res_register_specs, res_get_specs,
   res_retrieve_file, and res_cleanup.  */
69
70#ifdef HAVE_CONFIG_H
71# include <config.h>
72#endif
73
74#include <stdio.h>
75#include <stdlib.h>
76#ifdef HAVE_STRING_H
77# include <string.h>
78#else
79# include <strings.h>
80#endif /* HAVE_STRING_H */
81#include <errno.h>
82#include <assert.h>
83
84#include "wget.h"
85#include "utils.h"
86#include "hash.h"
87#include "url.h"
88#include "retr.h"
89#include "res.h"
90
/* One Allow/Disallow rule collected by res_parse.  */
struct path_info {
  /* Rule path as stored by add_path: a heap copy of the value text,
     with one leading slash removed if the value had one.  */
  char *path;
  /* Non-zero if a matching URL path may be retrieved, zero if it must
     be rejected.  */
  int allowedp;
  /* Value of res_parse's user_agent_exact flag when the rule was read,
     i.e. non-zero if the governing User-Agent line named "wget" rather
     than "*".  */
  int user_agent_exact_p;
};
96
/* The set of rules that apply to Wget for one robots.txt file.  */
struct robot_specs {
  /* Number of entries of PATHS that are in use.  */
  int count;
  /* Number of entries PATHS has room for; add_path doubles it whenever
     COUNT would exceed it.  */
  int size;
  /* Array of rules, consulted in order by res_match_path.  It is NULL
     until the first rule is added.  */
  struct path_info *paths;
};
102
103
104/* Parsing the robot spec. */
105
/* Classify the user-agent name AGENT, which is LENGTH characters long.

   *MATCHES is set to 1 if the name is the single character "*" or if
   BOUNDED_EQUAL_NO_CASE reports it equal to "wget"; otherwise it is
   set to 0.  *EXACT_MATCH is set to 1 only in the "wget" case and to
   0 in every other case.  Both outputs are always written.  */

static void
match_user_agent (const char *agent, int length,
                  int *matches, int *exact_match)
{
  /* The wildcard test comes first, so "wget" is only compared when the
     name is not "*".  */
  int is_wildcard = (length == 1 && *agent == '*');
  int is_wget = (!is_wildcard
                 && BOUNDED_EQUAL_NO_CASE (agent, agent + length, "wget"));

  *matches = (is_wildcard || is_wget);
  *exact_match = is_wget;
}
130
131/* Add a path specification between PATH_B and PATH_E as one of the
132 paths in SPECS. */
133
134static void
135add_path (struct robot_specs *specs, const char *path_b, const char *path_e,
136 int allowedp, int exactp)
137{
138 struct path_info pp;
139 if (path_b < path_e && *path_b == '/')
140 /* Our path representation doesn't use a leading slash, so remove
141 one from theirs. */
142 ++path_b;
143 pp.path = strdupdelim (path_b, path_e);
144 pp.allowedp = allowedp;
145 pp.user_agent_exact_p = exactp;
146 ++specs->count;
147 if (specs->count > specs->size)
148 {
149 if (specs->size == 0)
150 specs->size = 1;
151 else
152 specs->size <<= 1;
153 specs->paths = xrealloc (specs->paths,
154 specs->size * sizeof (struct path_info));
155 }
156 specs->paths[specs->count - 1] = pp;
157}
158
159/* Recreate SPECS->paths with only those paths that have non-zero
160 user_agent_exact_p. */
161
162static void
163prune_non_exact (struct robot_specs *specs)
164{
165 struct path_info *newpaths;
166 int i, j, cnt;
167 cnt = 0;
168 for (i = 0; i < specs->count; i++)
169 if (specs->paths[i].user_agent_exact_p)
170 ++cnt;
171 newpaths = xnew_array (struct path_info, cnt);
172 for (i = 0, j = 0; i < specs->count; i++)
173 if (specs->paths[i].user_agent_exact_p)
174 newpaths[j++] = specs->paths[i];
175 assert (j == cnt);
176 xfree (specs->paths);
177 specs->paths = newpaths;
178 specs->count = cnt;
179 specs->size = cnt;
180}
181
/* Helpers for res_parse.  They refer to its local variables lineend,
   field_b and field_e, so they can only be used where those are in
   scope.  */

/* Non-zero if P is at or beyond the logical end of the current line.  */
#define EOL(p) ((p) >= lineend)

/* Advance P over whitespace, stopping at the logical end of the line.
   P is evaluated more than once, so it must be a plain lvalue.  */
#define SKIP_SPACE(p) do {                      \
  while (!EOL (p) && ISSPACE (*(p)))            \
    ++(p);                                      \
} while (0)

/* Non-zero if BOUNDED_EQUAL_NO_CASE reports the field name delimited
   by field_b and field_e equal to STRING_LITERAL.  */
#define FIELD_IS(string_literal)                                \
  BOUNDED_EQUAL_NO_CASE (field_b, field_e, string_literal)
191
192/* Parse textual RES specs beginning with SOURCE of length LENGTH.
193 Return a specs objects ready to be fed to res_match_path.
194
195 The parsing itself is trivial, but creating a correct SPECS object
196 is trickier than it seems, because RES is surprisingly byzantine if
197 you attempt to implement it correctly.
198
199 A "record" is a block of one or more `User-Agent' lines followed by
200 one or more `Allow' or `Disallow' lines. Record is accepted by
201 Wget if one of the `User-Agent' lines was "wget", or if the user
202 agent line was "*".
203
204 After all the lines have been read, we examine whether an exact
205 ("wget") user-agent field was specified. If so, we delete all the
206 lines read under "User-Agent: *" blocks because we have our own
207 Wget-specific blocks. This enables the admin to say:
208
209 User-Agent: *
210 Disallow: /
211
212 User-Agent: google
213 User-Agent: wget
214 Disallow: /cgi-bin
215
216 This means that to Wget and to Google, /cgi-bin is disallowed,
217 whereas for all other crawlers, everything is disallowed.
218 res_parse is implemented so that the order of records doesn't
219 matter. In the case above, the "User-Agent: *" could have come
220 after the other one. */
221
222struct robot_specs *
223res_parse (const char *source, int length)
224{
225 int line_count = 1;
226
227 const char *p = source;
228 const char *end = source + length;
229
230 /* non-zero if last applicable user-agent field matches Wget. */
231 int user_agent_applies = 0;
232
233 /* non-zero if last applicable user-agent field *exactly* matches
234 Wget. */
235 int user_agent_exact = 0;
236
237 /* whether we ever encountered exact user agent. */
238 int found_exact = 0;
239
240 /* count of allow/disallow lines in the current "record", i.e. after
241 the last `user-agent' instructions. */
242 int record_count = 0;
243
244 struct robot_specs *specs = xnew0 (struct robot_specs);
245
246 while (1)
247 {
248 const char *lineend, *lineend_real;
249 const char *field_b, *field_e;
250 const char *value_b, *value_e;
251
252 if (p == end)
253 break;
254 lineend_real = memchr (p, '\n', end - p);
255 if (lineend_real)
256 ++lineend_real;
257 else
258 lineend_real = end;
259 lineend = lineend_real;
260
261 /* Before doing anything else, check whether the line is empty
262 or comment-only. */
263 SKIP_SPACE (p);
264 if (EOL (p) || *p == '#')
265 goto next;
266
267 /* Make sure the end-of-line comments are respected by setting
268 lineend to a location preceding the first comment. Real line
269 ending remains in lineend_real. */
270 for (lineend = p; lineend < lineend_real; lineend++)
271 if ((lineend == p || ISSPACE (*(lineend - 1)))
272 && *lineend == '#')
273 break;
274
275 /* Ignore trailing whitespace in the same way. */
276 while (lineend > p && ISSPACE (*(lineend - 1)))
277 --lineend;
278
279 assert (!EOL (p));
280
281 field_b = p;
282 while (!EOL (p) && (ISALNUM (*p) || *p == '-'))
283 ++p;
284 field_e = p;
285
286 SKIP_SPACE (p);
287 if (field_b == field_e || EOL (p) || *p != ':')
288 {
289 DEBUGP (("Ignoring malformed line %d", line_count));
290 goto next;
291 }
292 ++p; /* skip ':' */
293 SKIP_SPACE (p);
294
295 value_b = p;
296 while (!EOL (p))
297 ++p;
298 value_e = p;
299
300 /* Finally, we have a syntactically valid line. */
301 if (FIELD_IS ("user-agent"))
302 {
303 /* We have to support several cases:
304
305 --previous records--
306
307 User-Agent: foo
308 User-Agent: Wget
309 User-Agent: bar
310 ... matching record ...
311
312 User-Agent: baz
313 User-Agent: qux
314 ... non-matching record ...
315
316 User-Agent: *
317 ... matching record, but will be pruned later ...
318
319 We have to respect `User-Agent' at the beginning of each
320 new record simply because we don't know if we're going to
321 encounter "Wget" among the agents or not. Hence,
322 match_user_agent is called when record_count != 0.
323
324 But if record_count is 0, we have to keep calling it
325 until it matches, and if that happens, we must not call
326 it any more, until the next record. Hence the other part
327 of the condition. */
328 if (record_count != 0 || user_agent_applies == 0)
329 match_user_agent (value_b, value_e - value_b,
330 &user_agent_applies, &user_agent_exact);
331 if (user_agent_exact)
332 found_exact = 1;
333 record_count = 0;
334 }
335 else if (FIELD_IS ("allow"))
336 {
337 if (user_agent_applies)
338 {
339 add_path (specs, value_b, value_e, 1, user_agent_exact);
340 }
341 ++record_count;
342 }
343 else if (FIELD_IS ("disallow"))
344 {
345 if (user_agent_applies)
346 {
347 int allowed = 0;
348 if (value_b == value_e)
349 /* Empty "disallow" line means everything is
350 *allowed*! */
351 allowed = 1;
352 add_path (specs, value_b, value_e, allowed, user_agent_exact);
353 }
354 ++record_count;
355 }
356 else
357 {
358 DEBUGP (("Ignoring unknown field at line %d", line_count));
359 goto next;
360 }
361
362 next:
363 p = lineend_real;
364 ++line_count;
365 }
366
367 if (found_exact)
368 {
369 /* We've encountered an exactly matching user-agent. Throw out
370 all the stuff with user-agent: *. */
371 prune_non_exact (specs);
372 }
373 else if (specs->size > specs->count)
374 {
375 /* add_path normally over-allocates specs->paths. Reallocate it
376 to the correct size in order to conserve some memory. */
377 specs->paths = xrealloc (specs->paths,
378 specs->count * sizeof (struct path_info));
379 specs->size = specs->count;
380 }
381
382 return specs;
383}
384
385/* The same like res_parse, but first map the FILENAME into memory,
386 and then parse it. */
387
388struct robot_specs *
389res_parse_from_file (const char *filename)
390{
391 struct robot_specs *specs;
392 struct file_memory *fm = read_file (filename);
393 if (!fm)
394 {
395 logprintf (LOG_NOTQUIET, _("Cannot open %s: %s"),
396 filename, strerror (errno));
397 return NULL;
398 }
399 specs = res_parse (fm->content, fm->length);
400 read_file_free (fm);
401 return specs;
402}
403
404static void
405free_specs (struct robot_specs *specs)
406{
407 int i;
408 for (i = 0; i < specs->count; i++)
409 xfree (specs->paths[i].path);
410 xfree_null (specs->paths);
411 xfree (specs);
412}
413
414
415/* Matching of a path according to the specs. */
416
/* If C is '%' and (PTR[1], PTR[2]) form a hexadecimal number, and if
   that number is not a numerical representation of '/', replace C
   with the decoded character and advance PTR by two, which leaves it
   on the second hex digit.  Otherwise C and PTR are left untouched.

   C and PTR are evaluated several times and are assigned to, so both
   must be plain lvalues.  PTR[2] is only examined when PTR[1] is a
   hex digit, so a string that ends right after the '%' is not read
   past its terminator.  */

#define DECODE_MAYBE(c, ptr) do {                                       \
  if ((c) == '%' && ISXDIGIT ((ptr)[1]) && ISXDIGIT ((ptr)[2]))         \
    {                                                                   \
      char decoded = X2DIGITS_TO_NUM ((ptr)[1], (ptr)[2]);              \
      if (decoded != '/')                                               \
        {                                                               \
          (c) = decoded;                                                \
          (ptr) += 2;                                                   \
        }                                                               \
    }                                                                   \
} while (0)
432
/* The inner matching engine: return non-zero if RECORD_PATH matches
   URL_PATH, i.e. if RECORD_PATH is a prefix of URL_PATH once both
   have been passed, character by character, through DECODE_MAYBE.
   An empty RECORD_PATH therefore matches every URL_PATH.  The rules
   for matching are described at
   <http://www.robotstxt.org/wc/norobots-rfc.txt>, section 3.2.2.  */

static int
matches (const char *record_path, const char *url_path)
{
  const char *rule = record_path;
  const char *url = url_path;

  /* Running out of rule characters means the whole rule matched.  */
  while (*rule != '\0')
    {
      char rule_ch = *rule;
      char url_ch = *url;

      /* The URL ended before the rule did.  */
      if (url_ch == '\0')
        return 0;

      DECODE_MAYBE (rule_ch, rule);
      DECODE_MAYBE (url_ch, url);
      if (rule_ch != url_ch)
        return 0;

      ++rule;
      ++url;
    }
  return 1;
}
457
458/* Iterate through all paths in SPECS. For the first one that
459 matches, return its allow/reject status. If none matches,
460 retrieval is by default allowed. */
461
462int
463res_match_path (const struct robot_specs *specs, const char *path)
464{
465 int i;
466 if (!specs)
467 return 1;
468 for (i = 0; i < specs->count; i++)
469 if (matches (specs->paths[i].path, path))
470 {
471 int allowedp = specs->paths[i].allowedp;
472 DEBUGP (("%s path %s because of rule `%s'.\n",
473 allowedp ? "Allowing" : "Rejecting",
474 path, specs->paths[i].path));
475 return allowedp;
476 }
477 return 1;
478}
479
480
481/* Registering the specs. */
482
/* Table of registered specs, looked up by a "HOST:PORT" string.  It
   stays NULL until res_register_specs creates it.  */
static struct hash_table *registered_specs;

/* Stolen from cookies.c.

   Build the string "HOST:PORT" in memory obtained from alloca and
   store its address in RESULT.  Because of alloca, the string is only
   valid until the function that used the macro returns, so it must be
   copied if it is to be kept.  The buffer reserves one byte beyond
   the digits of PORT; presumably number_to_string stores the
   terminating NUL there -- confirm against its definition.

   RESULT is evaluated several times and must be a plain lvalue.  */
#define SET_HOSTPORT(host, port, result) do {                   \
  int HP_len = strlen (host);                                   \
  (result) = alloca (HP_len + 1 + numdigit (port) + 1);         \
  memcpy ((result), (host), HP_len);                            \
  (result)[HP_len] = ':';                                       \
  number_to_string ((result) + HP_len + 1, (port));             \
} while (0)
493
494/* Register RES specs that below to server on HOST:PORT. They will
495 later be retrievable using res_get_specs. */
496
497void
498res_register_specs (const char *host, int port, struct robot_specs *specs)
499{
500 struct robot_specs *old;
501 char *hp, *hp_old;
502 SET_HOSTPORT (host, port, hp);
503
504 if (!registered_specs)
505 registered_specs = make_nocase_string_hash_table (0);
506
507 if (hash_table_get_pair (registered_specs, hp, &hp_old, &old))
508 {
509 if (old)
510 free_specs (old);
511 hash_table_put (registered_specs, hp_old, specs);
512 }
513 else
514 {
515 hash_table_put (registered_specs, xstrdup (hp), specs);
516 }
517}
518
519/* Get the specs that belong to HOST:PORT. */
520
521struct robot_specs *
522res_get_specs (const char *host, int port)
523{
524 char *hp;
525 SET_HOSTPORT (host, port, hp);
526 if (!registered_specs)
527 return NULL;
528 return hash_table_get (registered_specs, hp);
529}
530
531
532/* Loading the robots file. */
533
534#define RES_SPECS_LOCATION "/robots.txt"
535
536/* Retrieve the robots.txt from the server root of the server that
537 serves URL. The file will be named according to the currently
538 active rules, and the file name will be returned in *file.
539
540 Return non-zero if robots were retrieved OK, zero otherwise. */
541
542int
543res_retrieve_file (const char *url, char **file)
544{
545 uerr_t err;
546 char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
547
548 logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
549 *file = NULL;
550 err = retrieve_url (robots_url, file, NULL, NULL, NULL);
551 xfree (robots_url);
552
553 if (err != RETROK && *file != NULL)
554 {
555 /* If the file is not retrieved correctly, but retrieve_url
556 allocated the file name, deallocate is here so that the
557 caller doesn't have to worry about it. */
558 xfree (*file);
559 *file = NULL;
560 }
561 return err == RETROK;
562}
563
564
/* Callback handed to hash_table_map by res_cleanup: free one table
   entry, i.e. the key string KEY and the specs object VALUE.  The
   third argument is unused.  Always returns 0.  */

static int
cleanup_hash_table_mapper (void *key, void *value, void *arg_ignored)
{
  struct robot_specs *entry_specs = value;

  xfree (key);
  free_specs (entry_specs);
  return 0;
}
572
573void
574res_cleanup (void)
575{
576 if (registered_specs)
577 {
578 hash_table_map (registered_specs, cleanup_hash_table_mapper, NULL);
579 hash_table_destroy (registered_specs);
580 registered_specs = NULL;
581 }
582}
Note: See TracBrowser for help on using the repository browser.