/* Handling of recursive HTTP retrieving.
   Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

In addition, as a special exception, the Free Software Foundation
gives permission to link the code of its release of Wget with the
OpenSSL project's "OpenSSL" library (or with modified versions of it
that use the same license as the "OpenSSL" library), and distribute
the linked executables.  You must obey the GNU General Public License
in all respects for all of the code used other than "OpenSSL".  If you
modify this file, you may extend this exception to your version of the
file, but you are not obligated to do so.  If you do not wish to do
so, delete this exception statement from your version.  */

#include <config.h>

#include <stdio.h>
#include <stdlib.h>
#ifdef HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <errno.h>
#include <assert.h>
#include <sys/types.h>

#include "wget.h"
#include "url.h"
#include "recur.h"
#include "utils.h"
#include "retr.h"
#include "ftp.h"
#include "host.h"
#include "hash.h"
#include "res.h"
#include "convert.h"

#ifndef errno
extern int errno;
#endif

extern char *version_string;
extern SUM_SIZE_INT total_downloaded_bytes;

extern struct hash_table *dl_url_file_map;
extern struct hash_table *downloaded_html_set;

/* Functions for maintaining the URL queue.  */

struct queue_element {
  const char *url;              /* the URL to download */
  const char *referer;          /* the referring document */
  int depth;                    /* the depth */
  unsigned int html_allowed :1; /* whether the document is allowed to
                                   be treated as HTML. */

  struct queue_element *next;   /* next element in queue */
};

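/* A URL queue.  COUNT is the current number of elements in the queue;
   MAXCOUNT tracks the largest size the queue has reached, which is
   reported in the debug output.  */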
struct url_queue {
  struct queue_element *head;
  struct queue_element *tail;
  int count, maxcount;
};

/* Create a URL queue. */

static struct url_queue *
url_queue_new (void)
{
  struct url_queue *queue = xnew0 (struct url_queue);
  return queue;
}

/* Delete a URL queue. */

static void
url_queue_delete (struct url_queue *queue)
{
  xfree (queue);
}

/* Enqueue a URL in the queue.  The queue is FIFO: the items will be
   retrieved ("dequeued") from the queue in the order they were placed
   into it.  */
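
/* Note that the queue stores the URL and referer pointers it is given
   rather than copying them; the caller passes strings allocated with
   xstrdup and releases them after url_dequeue hands them back.  */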

static void
url_enqueue (struct url_queue *queue,
             const char *url, const char *referer, int depth, int html_allowed)
{
  struct queue_element *qel = xnew (struct queue_element);
  qel->url = url;
  qel->referer = referer;
  qel->depth = depth;
  qel->html_allowed = html_allowed;
  qel->next = NULL;

  ++queue->count;
  if (queue->count > queue->maxcount)
    queue->maxcount = queue->count;

  DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  if (queue->tail)
    queue->tail->next = qel;
  queue->tail = qel;

  if (!queue->head)
    queue->head = queue->tail;
}

/* Take a URL out of the queue.  Return 1 if this operation succeeded,
   or 0 if the queue is empty.  */

static int
url_dequeue (struct url_queue *queue,
             const char **url, const char **referer, int *depth,
             int *html_allowed)
{
  struct queue_element *qel = queue->head;

  if (!qel)
    return 0;

  queue->head = queue->head->next;
  if (!queue->head)
    queue->tail = NULL;

  *url = qel->url;
  *referer = qel->referer;
  *depth = qel->depth;
  *html_allowed = qel->html_allowed;

  --queue->count;

  DEBUGP (("Dequeuing %s at depth %d\n", qel->url, qel->depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  xfree (qel);
  return 1;
}

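/* Forward declarations of the helpers defined below: download_child_p
   decides whether a single link should be enqueued, and
   descend_redirect_p applies the same tests to a redirection target.  */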
static int download_child_p PARAMS ((const struct urlpos *, struct url *, int,
                                     struct url *, struct hash_table *));
static int descend_redirect_p PARAMS ((const char *, const char *, int,
                                       struct url *, struct hash_table *));


/* Retrieve a part of the web beginning with START_URL.  This used to
   be called "recursive retrieval", because the old function was
   recursive and implemented depth-first search.  retrieve_tree, on
   the other hand, implements breadth-first traversal of the tree,
   which results in much nicer ordering of downloads.

   The algorithm this function uses is simple:

   1. put START_URL in the queue.
   2. while there are URLs in the queue:

     3. get next URL from the queue.
     4. download it.
     5. if the URL is HTML and its depth does not exceed maximum depth,
        get the list of URLs embedded therein.
     6. for each of those URLs do the following:

       7. if the URL is not one of those downloaded before, and if it
          satisfies the criteria specified by the various command-line
          options, add it to the queue. */

uerr_t
retrieve_tree (const char *start_url)
{
  uerr_t status = RETROK;

  /* The queue of URLs we need to load. */
  struct url_queue *queue;

  /* The URLs we do not wish to enqueue, because they are already in
     the queue, but haven't been downloaded yet.  */
  struct hash_table *blacklist;

  int up_error_code;
  struct url *start_url_parsed = url_parse (start_url, &up_error_code);

  if (!start_url_parsed)
    {
      logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
                 url_error (up_error_code));
      return URLERROR;
    }

  queue = url_queue_new ();
  blacklist = make_string_hash_table (0);

  /* Enqueue the starting URL.  Use start_url_parsed->url rather than
     just URL so we enqueue the canonical form of the URL.  */
  url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, 1);
  string_set_add (blacklist, start_url_parsed->url);

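  /* The main loop: keep dequeuing and downloading URLs until the queue
     is empty, the download quota is exceeded, or a write error occurs.  */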
  while (1)
    {
      int descend = 0;
      char *url, *referer, *file = NULL;
      int depth, html_allowed;
      int dash_p_leaf_HTML = 0;

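      /* Bail out early if the download quota has been exceeded or a
         previous download failed with a write error.  */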
      if (opt.quota && total_downloaded_bytes > opt.quota)
        break;
      if (status == FWRITEERR)
        break;

      /* Get the next URL from the queue... */

      if (!url_dequeue (queue,
                        (const char **)&url, (const char **)&referer,
                        &depth, &html_allowed))
        break;

      /* ...and download it.  Note that this download is in most cases
         unconditional, as download_child_p already makes sure a file
         doesn't get enqueued twice -- and yet this check is here, and
         not in download_child_p.  This is so that if you run `wget -r
         URL1 URL2', and a random URL is encountered once under URL1
         and again under URL2, but at a different (possibly smaller)
         depth, we want the URL's children to be taken into account
         the second time.  */
      if (dl_url_file_map && hash_table_contains (dl_url_file_map, url))
        {
          file = xstrdup (hash_table_get (dl_url_file_map, url));

          DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
                   url, file));

          if (html_allowed
              && downloaded_html_set
              && string_set_contains (downloaded_html_set, file))
            descend = 1;
        }
      else
        {
          int dt = 0;
          char *redirected = NULL;
          int oldrec = opt.recursive;

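          /* Temporarily clear opt.recursive so that retrieve_url performs
             this download as a plain, single-document retrieval; the
             recursion itself is driven by the loop in this function.  */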
          opt.recursive = 0;
          status = retrieve_url (url, &file, &redirected, referer, &dt);
          opt.recursive = oldrec;

          if (html_allowed && file && status == RETROK
              && (dt & RETROKF) && (dt & TEXTHTML))
            descend = 1;

          if (redirected)
            {
              /* We have been redirected, possibly to another host, or
                 different path, or wherever.  Check whether we really
                 want to follow it.  */
              if (descend)
                {
                  if (!descend_redirect_p (redirected, url, depth,
                                           start_url_parsed, blacklist))
                    descend = 0;
                  else
                    /* Make sure that the old pre-redirect form gets
                       blacklisted. */
                    string_set_add (blacklist, url);
                }

              xfree (url);
              url = redirected;
            }
        }

      if (descend
          && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
        {
          if (opt.page_requisites
              && (depth == opt.reclevel || depth == opt.reclevel + 1))
            {
              /* When -p is specified, we are allowed to exceed the
                 maximum depth, but only for the "inline" links,
                 i.e. those that are needed to display the page.
                 Originally this could exceed the depth at most by
                 one, but we allow one more level so that the leaf
                 pages that contain frames can be loaded
                 correctly.  */
              dash_p_leaf_HTML = 1;
            }
          else
            {
              /* Either -p wasn't specified or it was and we've
                 already spent the two extra (pseudo-)levels that it
                 affords us, so we need to bail out. */
              DEBUGP (("Not descending further; at depth %d, max. %d.\n",
                       depth, opt.reclevel));
              descend = 0;
            }
        }

      /* If the downloaded document was HTML, parse it and enqueue the
         links it contains. */

      if (descend)
        {
          int meta_disallow_follow = 0;
          struct urlpos *children
            = get_urls_html (file, url, &meta_disallow_follow);

          if (opt.use_robots && meta_disallow_follow)
            {
              free_urlpos (children);
              children = NULL;
            }

          if (children)
            {
              struct urlpos *child = children;
              struct url *url_parsed = url_parse (url, NULL);
              assert (url_parsed != NULL);

              for (; child; child = child->next)
                {
                  if (child->ignore_when_downloading)
                    continue;
                  if (dash_p_leaf_HTML && !child->link_inline_p)
                    continue;
                  if (download_child_p (child, url_parsed, depth, start_url_parsed,
                                        blacklist))
                    {
                      url_enqueue (queue, xstrdup (child->url->url),
                                   xstrdup (url), depth + 1,
                                   child->link_expect_html);
                      /* We blacklist the URL we have enqueued, because we
                         don't want to enqueue (and hence download) the
                         same URL twice.  */
                      string_set_add (blacklist, child->url->url);
                    }
                }

              url_free (url_parsed);
              free_urlpos (children);
            }
        }

      if (opt.delete_after || (file && !acceptable (file)))
        {
          /* Either --delete-after was specified, or we loaded this
             otherwise rejected (e.g. by -R) HTML file just so we
             could harvest its hyperlinks -- in either case, delete
             the local file. */
          DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
                   opt.delete_after ? "--delete-after" :
                   "recursive rejection criteria"));
          logprintf (LOG_VERBOSE,
                     (opt.delete_after
                      ? _("Removing %s.\n")
                      : _("Removing %s since it should be rejected.\n")),
                     file);
          if (unlink (file))
            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
          register_delete_file (file);
        }

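      /* URL and REFERER were strdup'd when they were enqueued, so they
         are released here along with the downloaded file's name.  */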
      xfree (url);
      xfree_null (referer);
      xfree_null (file);
    }

  /* If anything is left of the queue due to a premature exit, free it
     now.  */
  {
    char *d1, *d2;
    int d3, d4;
    while (url_dequeue (queue,
                        (const char **)&d1, (const char **)&d2, &d3, &d4))
      {
        xfree (d1);
        xfree_null (d2);
      }
  }
  url_queue_delete (queue);

  if (start_url_parsed)
    url_free (start_url_parsed);
  string_set_free (blacklist);

  if (opt.quota && total_downloaded_bytes > opt.quota)
    return QUOTEXC;
  else if (status == FWRITEERR)
    return FWRITEERR;
  else
    return RETROK;
}

/* Based on the context provided by retrieve_tree, decide whether a
   URL is to be descended to.  This is only ever called from
   retrieve_tree, but is in a separate function for clarity.

   The most expensive checks (such as those for robots) are memoized
   by storing these URLs in BLACKLIST.  This may or may not help.  It
   will help if those URLs are encountered many times.  */

static int
download_child_p (const struct urlpos *upos, struct url *parent, int depth,
                  struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *u = upos->url;
  const char *url = u->url;
  int u_scheme_like_http;

  DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));

  if (string_set_contains (blacklist, url))
    {
      DEBUGP (("Already on the black list.\n"));
      goto out;
    }

  /* Several things to check for:
     1. if the scheme is not HTTP, and we are not configured to follow it
     2. check for relative links (if relative_only is set)
     3. check for domain
     4. check for no-parent
     5. check for excludes && includes
     6. check for suffix
     7. check for same host (if spanhost is unset), with possible
        gethostbyname baggage
     8. check for robots.txt

     Addendum: If the URL is FTP, and it is to be loaded, only the
     domain and suffix settings are "stronger".

     Note that .html files will get loaded regardless of suffix rules
     (but that is remedied later with unlink) unless the depth equals
     the maximum depth.

     More time- and memory-consuming tests should be put later on
     the list.  */

  /* Determine whether the URL under consideration has an HTTP-like scheme. */
  u_scheme_like_http = schemes_are_similar_p (u->scheme, SCHEME_HTTP);

  /* 1. Schemes other than HTTP are normally not recursed into. */
  if (!u_scheme_like_http && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
    {
      DEBUGP (("Not following non-HTTP schemes.\n"));
      goto out;
    }

  /* 2. If it is an absolute link and they are not followed, throw it
     out.  */
  if (u_scheme_like_http)
    if (opt.relative_only && !upos->link_relative_p)
      {
        DEBUGP (("It doesn't really look like a relative link.\n"));
        goto out;
      }

  /* 3. If its domain is not to be accepted/looked-up, chuck it
     out.  */
  if (!accept_domain (u))
    {
      DEBUGP (("The domain was not accepted.\n"));
      goto out;
    }

  /* 4. Check for parent directory.

     If we descended to a different host or changed the scheme, ignore
     opt.no_parent.  Also ignore it for documents needed to display
     the parent page when in -p mode.  */
  if (opt.no_parent
      && schemes_are_similar_p (u->scheme, start_url_parsed->scheme)
      && 0 == strcasecmp (u->host, start_url_parsed->host)
      && u->port == start_url_parsed->port
      && !(opt.page_requisites && upos->link_inline_p))
    {
      if (!frontcmp (start_url_parsed->dir, u->dir))
        {
          DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
                   u->dir, start_url_parsed->dir));
          goto out;
        }
    }

  /* 5. If the file does not match the acceptance list, or is on the
     rejection list, chuck it out.  The same goes for the directory
     exclusion and inclusion lists.  */
  if (opt.includes || opt.excludes)
    {
      if (!accdir (u->dir, ALLABS))
        {
          DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
          goto out;
        }
    }

  /* 6. Check for acceptance/rejection rules.  We ignore these rules
     for directories (no file name to match) and for non-leaf HTMLs,
     which can lead to other files that do need to be downloaded.  (-p
     automatically implies non-leaf because with -p we can, if
     necessary, overstep the maximum depth to get the page requisites.)  */
  if (u->file[0] != '\0'
      && !(has_html_suffix_p (u->file)
           /* The exception only applies to non-leaf HTMLs (but -p
              always implies non-leaf because we can overstep the
              maximum depth to get the requisites): */
           && (/* non-leaf */
               opt.reclevel == INFINITE_RECURSION
               /* also non-leaf */
               || depth < opt.reclevel - 1
               /* -p, which implies non-leaf (see above) */
               || opt.page_requisites)))
    {
      if (!acceptable (u->file))
        {
          DEBUGP (("%s (%s) does not match acc/rej rules.\n",
                   url, u->file));
          goto out;
        }
    }

  /* 7. Unless host spanning (-H) was requested, stay on the same host
     as the parent document.  */
  if (schemes_are_similar_p (u->scheme, parent->scheme))
    if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
      {
        DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
                 u->host, parent->host));
        goto out;
      }

  /* 8. Consult robots.txt if its use hasn't been disabled, fetching
     and caching the specs for this host the first time around.  */
  if (opt.use_robots && u_scheme_like_http)
    {
      struct robot_specs *specs = res_get_specs (u->host, u->port);
      if (!specs)
        {
          char *rfile;
          if (res_retrieve_file (url, &rfile))
            {
              specs = res_parse_from_file (rfile);
              xfree (rfile);
            }
          else
            {
              /* If we cannot get real specs, at least produce
                 dummy ones so that we can register them and stop
                 trying to retrieve them.  */
              specs = res_parse ("", 0);
            }
          res_register_specs (u->host, u->port, specs);
        }

      /* Now that we have (or don't have) robots.txt specs, we can
         check what they say.  */
      if (!res_match_path (specs, u->path))
        {
          DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
          string_set_add (blacklist, url);
          goto out;
        }
    }

  /* The URL has passed all the tests.  It can be placed in the
     download queue. */
  DEBUGP (("Decided to load it.\n"));

  return 1;

 out:
  DEBUGP (("Decided NOT to load it.\n"));

  return 0;
}

/* This function determines whether we will consider downloading the
   children of a URL whose download resulted in a redirection,
   possibly to another host, etc.  It is needed very rarely, and thus
   it is merely a simple-minded wrapper around download_child_p.  */

static int
descend_redirect_p (const char *redirected, const char *original, int depth,
                    struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *orig_parsed, *new_parsed;
  struct urlpos *upos;
  int success;

  orig_parsed = url_parse (original, NULL);
  assert (orig_parsed != NULL);

  new_parsed = url_parse (redirected, NULL);
  assert (new_parsed != NULL);

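  /* Wrap the redirected URL in a dummy urlpos so that it can be run
     through the same checks download_child_p applies to ordinary links.  */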
  upos = xnew0 (struct urlpos);
  upos->url = new_parsed;

  success = download_child_p (upos, orig_parsed, depth,
                              start_url_parsed, blacklist);

  url_free (orig_parsed);
  url_free (new_parsed);
  xfree (upos);

  if (!success)
    DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));

  return success;
}