source: trunk/essentials/net-misc/wget/src/recur.c

Last change on this file was 3440, checked in by bird, 18 years ago

wget 1.10.2

File size: 17.8 KB
/* Handling of recursive HTTP retrieving.
   Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

In addition, as a special exception, the Free Software Foundation
gives permission to link the code of its release of Wget with the
OpenSSL project's "OpenSSL" library (or with modified versions of it
that use the same license as the "OpenSSL" library), and distribute
the linked executables. You must obey the GNU General Public License
in all respects for all of the code used other than "OpenSSL". If you
modify this file, you may extend this exception to your version of the
file, but you are not obligated to do so. If you do not wish to do
so, delete this exception statement from your version. */

#include <config.h>

#include <stdio.h>
#include <stdlib.h>
#ifdef HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <errno.h>
#include <assert.h>
#include <sys/types.h>

#include "wget.h"
#include "url.h"
#include "recur.h"
#include "utils.h"
#include "retr.h"
#include "ftp.h"
#include "host.h"
#include "hash.h"
#include "res.h"
#include "convert.h"

#ifndef errno
extern int errno;
#endif

extern char *version_string;
extern SUM_SIZE_INT total_downloaded_bytes;

extern struct hash_table *dl_url_file_map;
extern struct hash_table *downloaded_html_set;

/* Functions for maintaining the URL queue. */

struct queue_element {
  const char *url;              /* the URL to download */
  const char *referer;          /* the referring document */
  int depth;                    /* the depth */
  unsigned int html_allowed :1; /* whether the document is allowed to
                                   be treated as HTML. */

  struct queue_element *next;   /* next element in queue */
};

struct url_queue {
  struct queue_element *head;
  struct queue_element *tail;
  int count, maxcount;
};

/* Create a URL queue. */

static struct url_queue *
url_queue_new (void)
{
  struct url_queue *queue = xnew0 (struct url_queue);
  return queue;
}

/* Delete a URL queue. */

static void
url_queue_delete (struct url_queue *queue)
{
  xfree (queue);
}

/* Enqueue a URL in the queue. The queue is FIFO: the items will be
   retrieved ("dequeued") from the queue in the order they were placed
   into it. */

static void
url_enqueue (struct url_queue *queue,
             const char *url, const char *referer, int depth, int html_allowed)
{
  struct queue_element *qel = xnew (struct queue_element);
  qel->url = url;
  qel->referer = referer;
  qel->depth = depth;
  qel->html_allowed = html_allowed;
  qel->next = NULL;

  ++queue->count;
  if (queue->count > queue->maxcount)
    queue->maxcount = queue->count;

  DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  if (queue->tail)
    queue->tail->next = qel;
  queue->tail = qel;

  if (!queue->head)
    queue->head = queue->tail;
}

/* Take a URL out of the queue. Return 1 if this operation succeeded,
   or 0 if the queue is empty. */

static int
url_dequeue (struct url_queue *queue,
             const char **url, const char **referer, int *depth,
             int *html_allowed)
{
  struct queue_element *qel = queue->head;

  if (!qel)
    return 0;

  queue->head = queue->head->next;
  if (!queue->head)
    queue->tail = NULL;

  *url = qel->url;
  *referer = qel->referer;
  *depth = qel->depth;
  *html_allowed = qel->html_allowed;

  --queue->count;

  DEBUGP (("Dequeuing %s at depth %d\n", qel->url, qel->depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  xfree (qel);
  return 1;
}

static int download_child_p PARAMS ((const struct urlpos *, struct url *, int,
                                     struct url *, struct hash_table *));
static int descend_redirect_p PARAMS ((const char *, const char *, int,
                                       struct url *, struct hash_table *));

/* Retrieve a part of the web beginning with START_URL. This used to
   be called "recursive retrieval", because the old function was
   recursive and implemented depth-first search. retrieve_tree, on the
   other hand, implements breadth-first traversal of the tree, which
   results in much nicer ordering of downloads.

   The algorithm this function uses is simple:

   1. put START_URL in the queue.
   2. while there are URLs in the queue:

     3. get next URL from the queue.
     4. download it.
     5. if the URL is HTML and its depth does not exceed maximum depth,
        get the list of URLs embedded therein.
     6. for each of those URLs do the following:

       7. if the URL is not one of those downloaded before, and if it
          satisfies the criteria specified by the various command-line
          options, add it to the queue. */

uerr_t
retrieve_tree (const char *start_url)
{
  uerr_t status = RETROK;

  /* The queue of URLs we need to load. */
  struct url_queue *queue;

  /* The URLs we do not wish to enqueue, because they are already in
     the queue, but haven't been downloaded yet. */
  struct hash_table *blacklist;

  int up_error_code;
  struct url *start_url_parsed = url_parse (start_url, &up_error_code);

  if (!start_url_parsed)
    {
      logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
                 url_error (up_error_code));
      return URLERROR;
    }

  queue = url_queue_new ();
  blacklist = make_string_hash_table (0);

  /* Enqueue the starting URL. Use start_url_parsed->url rather than
     just URL so we enqueue the canonical form of the URL. */
  url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, 1);
  string_set_add (blacklist, start_url_parsed->url);

  while (1)
    {
      int descend = 0;
      char *url, *referer, *file = NULL;
      int depth, html_allowed;
      int dash_p_leaf_HTML = 0;

      if (opt.quota && total_downloaded_bytes > opt.quota)
        break;
      if (status == FWRITEERR)
        break;

      /* Get the next URL from the queue... */

      if (!url_dequeue (queue,
                        (const char **)&url, (const char **)&referer,
                        &depth, &html_allowed))
        break;

      /* ...and download it. Note that this download is in most cases
         unconditional, as download_child_p already makes sure a file
         doesn't get enqueued twice -- and yet this check is here, and
         not in download_child_p. This is so that if you run `wget -r
         URL1 URL2', and a random URL is encountered once under URL1
         and again under URL2, but at a different (possibly smaller)
         depth, we want the URL's children to be taken into account
         the second time. */
      if (dl_url_file_map && hash_table_contains (dl_url_file_map, url))
        {
          file = xstrdup (hash_table_get (dl_url_file_map, url));

          DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
                   url, file));

          if (html_allowed
              && downloaded_html_set
              && string_set_contains (downloaded_html_set, file))
            descend = 1;
        }
      else
        {
          int dt = 0;
          char *redirected = NULL;
          int oldrec = opt.recursive;

          opt.recursive = 0;
          status = retrieve_url (url, &file, &redirected, referer, &dt);
          opt.recursive = oldrec;

          if (html_allowed && file && status == RETROK
              && (dt & RETROKF) && (dt & TEXTHTML))
            descend = 1;

          if (redirected)
            {
              /* We have been redirected, possibly to another host, or
                 different path, or wherever. Check whether we really
                 want to follow it. */
              if (descend)
                {
                  if (!descend_redirect_p (redirected, url, depth,
                                           start_url_parsed, blacklist))
                    descend = 0;
                  else
                    /* Make sure that the old pre-redirect form gets
                       blacklisted. */
                    string_set_add (blacklist, url);
                }

              xfree (url);
              url = redirected;
            }
        }

      if (descend
          && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
        {
          if (opt.page_requisites
              && (depth == opt.reclevel || depth == opt.reclevel + 1))
            {
              /* When -p is specified, we are allowed to exceed the
                 maximum depth, but only for the "inline" links,
                 i.e. those that are needed to display the page.
                 Originally this could exceed the depth at most by
                 one, but we allow one more level so that the leaf
                 pages that contain frames can be loaded
                 correctly. */
              dash_p_leaf_HTML = 1;
            }
          else
            {
              /* Either -p wasn't specified or it was and we've
                 already spent the two extra (pseudo-)levels that it
                 affords us, so we need to bail out. */
              DEBUGP (("Not descending further; at depth %d, max. %d.\n",
                       depth, opt.reclevel));
              descend = 0;
            }
        }

      /* If the downloaded document was HTML, parse it and enqueue the
         links it contains. */

      if (descend)
        {
          int meta_disallow_follow = 0;
          struct urlpos *children
            = get_urls_html (file, url, &meta_disallow_follow);

          if (opt.use_robots && meta_disallow_follow)
            {
              free_urlpos (children);
              children = NULL;
            }

          if (children)
            {
              struct urlpos *child = children;
              struct url *url_parsed = url_parse (url, NULL);
              assert (url_parsed != NULL);

              for (; child; child = child->next)
                {
                  if (child->ignore_when_downloading)
                    continue;
                  if (dash_p_leaf_HTML && !child->link_inline_p)
                    continue;
                  if (download_child_p (child, url_parsed, depth, start_url_parsed,
                                        blacklist))
                    {
                      url_enqueue (queue, xstrdup (child->url->url),
                                   xstrdup (url), depth + 1,
                                   child->link_expect_html);
                      /* We blacklist the URL we have enqueued, because we
                         don't want to enqueue (and hence download) the
                         same URL twice. */
                      string_set_add (blacklist, child->url->url);
                    }
                }

              url_free (url_parsed);
              free_urlpos (children);
            }
        }

      if (opt.delete_after || (file && !acceptable (file)))
        {
          /* Either --delete-after was specified, or we loaded this
             otherwise rejected (e.g. by -R) HTML file just so we
             could harvest its hyperlinks -- in either case, delete
             the local file. */
          DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
                   opt.delete_after ? "--delete-after" :
                   "recursive rejection criteria"));
          logprintf (LOG_VERBOSE,
                     (opt.delete_after
                      ? _("Removing %s.\n")
                      : _("Removing %s since it should be rejected.\n")),
                     file);
          if (unlink (file))
            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
          register_delete_file (file);
        }

      xfree (url);
      xfree_null (referer);
      xfree_null (file);
    }

  /* If anything is left of the queue due to a premature exit, free it
     now. */
  {
    char *d1, *d2;
    int d3, d4;
    while (url_dequeue (queue,
                        (const char **)&d1, (const char **)&d2, &d3, &d4))
      {
        xfree (d1);
        xfree_null (d2);
      }
  }
  url_queue_delete (queue);

  if (start_url_parsed)
    url_free (start_url_parsed);
  string_set_free (blacklist);

  if (opt.quota && total_downloaded_bytes > opt.quota)
    return QUOTEXC;
  else if (status == FWRITEERR)
    return FWRITEERR;
  else
    return RETROK;
}

/* Based on the context provided by retrieve_tree, decide whether a
   URL is to be descended to. This is only ever called from
   retrieve_tree, but is in a separate function for clarity.

   The most expensive checks (such as those for robots) are memoized
   by storing these URLs to BLACKLIST. This may or may not help. It
   will help if those URLs are encountered many times. */

static int
download_child_p (const struct urlpos *upos, struct url *parent, int depth,
                  struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *u = upos->url;
  const char *url = u->url;
  int u_scheme_like_http;

  DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));

  if (string_set_contains (blacklist, url))
    {
      DEBUGP (("Already on the black list.\n"));
      goto out;
    }

  /* Several things to check for:
     1. if scheme is not http, and we don't load it
     2. check for relative links (if relative_only is set)
     3. check for domain
     4. check for no-parent
     5. check for excludes && includes
     6. check for suffix
     7. check for same host (if spanhost is unset), with possible
        gethostbyname baggage
     8. check for robots.txt

     Addendum: If the URL is FTP, and it is to be loaded, only the
     domain and suffix settings are "stronger".

     Note that .html files will get loaded regardless of suffix rules
     (but that is remedied later with unlink) unless the depth equals
     the maximum depth.

     More time- and memory- consuming tests should be put later on
     the list. */

  /* Determine whether URL under consideration has a HTTP-like scheme. */
  u_scheme_like_http = schemes_are_similar_p (u->scheme, SCHEME_HTTP);

  /* 1. Schemes other than HTTP are normally not recursed into. */
  if (!u_scheme_like_http && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
    {
      DEBUGP (("Not following non-HTTP schemes.\n"));
      goto out;
    }

  /* 2. If it is an absolute link and they are not followed, throw it
     out. */
  if (u_scheme_like_http)
    if (opt.relative_only && !upos->link_relative_p)
      {
        DEBUGP (("It doesn't really look like a relative link.\n"));
        goto out;
      }

  /* 3. If its domain is not to be accepted/looked-up, chuck it
     out. */
  if (!accept_domain (u))
    {
      DEBUGP (("The domain was not accepted.\n"));
      goto out;
    }

  /* 4. Check for parent directory.

     If we descended to a different host or changed the scheme, ignore
     opt.no_parent. Also ignore it for documents needed to display
     the parent page when in -p mode. */
  if (opt.no_parent
      && schemes_are_similar_p (u->scheme, start_url_parsed->scheme)
      && 0 == strcasecmp (u->host, start_url_parsed->host)
      && u->port == start_url_parsed->port
      && !(opt.page_requisites && upos->link_inline_p))
    {
      if (!frontcmp (start_url_parsed->dir, u->dir))
        {
          DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
                   u->dir, start_url_parsed->dir));
          goto out;
        }
    }

  /* 5. If the file does not match the acceptance list, or is on the
     rejection list, chuck it out. The same goes for the directory
     exclusion and inclusion lists. */
  if (opt.includes || opt.excludes)
    {
      if (!accdir (u->dir, ALLABS))
        {
          DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
          goto out;
        }
    }

  /* 6. Check for acceptance/rejection rules. We ignore these rules
     for directories (no file name to match) and for non-leaf HTMLs,
     which can lead to other files that do need to be downloaded. (-p
     automatically implies non-leaf because with -p we can, if
     necessary, overstep the maximum depth to get the page requisites.) */
  if (u->file[0] != '\0'
      && !(has_html_suffix_p (u->file)
           /* The exception only applies to non-leaf HTMLs (but -p
              always implies non-leaf because we can overstep the
              maximum depth to get the requisites): */
           && (/* non-leaf */
               opt.reclevel == INFINITE_RECURSION
               /* also non-leaf */
               || depth < opt.reclevel - 1
               /* -p, which implies non-leaf (see above) */
               || opt.page_requisites)))
    {
      if (!acceptable (u->file))
        {
          DEBUGP (("%s (%s) does not match acc/rej rules.\n",
                   url, u->file));
          goto out;
        }
    }

  /* 7. */
  if (schemes_are_similar_p (u->scheme, parent->scheme))
    if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
      {
        DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
                 u->host, parent->host));
        goto out;
      }

  /* 8. */
  if (opt.use_robots && u_scheme_like_http)
    {
      struct robot_specs *specs = res_get_specs (u->host, u->port);
      if (!specs)
        {
          char *rfile;
          if (res_retrieve_file (url, &rfile))
            {
              specs = res_parse_from_file (rfile);
              xfree (rfile);
            }
          else
            {
              /* If we cannot get real specs, at least produce
                 dummy ones so that we can register them and stop
                 trying to retrieve them. */
              specs = res_parse ("", 0);
            }
          res_register_specs (u->host, u->port, specs);
        }

      /* Now that we have (or don't have) robots.txt specs, we can
         check what they say. */
      if (!res_match_path (specs, u->path))
        {
          DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
          string_set_add (blacklist, url);
          goto out;
        }
    }

  /* The URL has passed all the tests. It can be placed in the
     download queue. */
  DEBUGP (("Decided to load it.\n"));

  return 1;

 out:
  DEBUGP (("Decided NOT to load it.\n"));

  return 0;
}

/* This function determines whether we will consider downloading the
   children of a URL whose download resulted in a redirection,
   possibly to another host, etc. It is needed very rarely, and thus
   it is merely a simple-minded wrapper around download_child_p. */

static int
descend_redirect_p (const char *redirected, const char *original, int depth,
                    struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *orig_parsed, *new_parsed;
  struct urlpos *upos;
  int success;

  orig_parsed = url_parse (original, NULL);
  assert (orig_parsed != NULL);

  new_parsed = url_parse (redirected, NULL);
  assert (new_parsed != NULL);

  upos = xnew0 (struct urlpos);
  upos->url = new_parsed;

  success = download_child_p (upos, orig_parsed, depth,
                              start_url_parsed, blacklist);

  url_free (orig_parsed);
  url_free (new_parsed);
  xfree (upos);

  if (!success)
    DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));

  return success;
}