Context Navigation

html-url.c

Visit:

Last change on this file was 3440, checked in by bird, 18 years ago
wget 1.10.2
File size: 21.0 KB

Line
1	/* Collect URLs from HTML source.
2	Copyright (C) 1998, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
3
4	This file is part of GNU Wget.
5
6	GNU Wget is free software; you can redistribute it and/or modify
7	it under the terms of the GNU General Public License as published by
8	the Free Software Foundation; either version 2 of the License, or
9	(at your option) any later version.
10
11	GNU Wget is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	GNU General Public License for more details.
15
16	You should have received a copy of the GNU General Public License
17	along with Wget; if not, write to the Free Software
18	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19
20	In addition, as a special exception, the Free Software Foundation
21	gives permission to link the code of its release of Wget with the
22	OpenSSL project's "OpenSSL" library (or with modified versions of it
23	that use the same license as the "OpenSSL" library), and distribute
24	the linked executables. You must obey the GNU General Public License
25	in all respects for all of the code used other than "OpenSSL". If you
26	modify this file, you may extend this exception to your version of the
27	file, but you are not obligated to do so. If you do not wish to do
28	so, delete this exception statement from your version. */
29
30	#include <config.h>
31
32	#include <stdio.h>
33	#ifdef HAVE_STRING_H
34	# include <string.h>
35	#else
36	# include <strings.h>
37	#endif
38	#include <stdlib.h>
39	#include <errno.h>
40	#include <assert.h>
41
42	#include "wget.h"
43	#include "html-parse.h"
44	#include "url.h"
45	#include "utils.h"
46	#include "hash.h"
47	#include "convert.h"
48	#include "recur.h" /* declaration of get_urls_html */
49
50	#ifndef errno
51	extern int errno;
52	#endif
53
54	struct map_context;
55
56	typedef void (tag_handler_t) PARAMS ((int, struct taginfo ,
57	struct map_context *));
58
59	#define DECLARE_TAG_HANDLER(fun) \
60	static void fun PARAMS ((int, struct taginfo , struct map_context ))
61
62	DECLARE_TAG_HANDLER (tag_find_urls);
63	DECLARE_TAG_HANDLER (tag_handle_base);
64	DECLARE_TAG_HANDLER (tag_handle_form);
65	DECLARE_TAG_HANDLER (tag_handle_link);
66	DECLARE_TAG_HANDLER (tag_handle_meta);
67
/* Numeric identifiers for the HTML tags handled in this file.  The
   values are consecutive, starting at zero.  */
enum {
  TAG_A = 0,
  TAG_APPLET,
  TAG_AREA,
  TAG_BASE,
  TAG_BGSOUND,
  TAG_BODY,
  TAG_EMBED,
  TAG_FIG,
  TAG_FORM,
  TAG_FRAME,
  TAG_IFRAME,
  TAG_IMG,
  TAG_INPUT,
  TAG_LAYER,
  TAG_LINK,
  TAG_META,
  TAG_OBJECT,
  TAG_OVERLAY,
  TAG_SCRIPT,
  TAG_TABLE,
  TAG_TD,
  TAG_TH
};
92
93	/* The list of known tags and functions used for handling them. Most
94	tags are simply harvested for URLs. */
95	static struct known_tag {
96	int tagid;
97	const char *name;
98	tag_handler_t handler;
99	} known_tags[] = {
100	{ TAG_A, "a", tag_find_urls },
101	{ TAG_APPLET, "applet", tag_find_urls },
102	{ TAG_AREA, "area", tag_find_urls },
103	{ TAG_BASE, "base", tag_handle_base },
104	{ TAG_BGSOUND, "bgsound", tag_find_urls },
105	{ TAG_BODY, "body", tag_find_urls },
106	{ TAG_EMBED, "embed", tag_find_urls },
107	{ TAG_FIG, "fig", tag_find_urls },
108	{ TAG_FORM, "form", tag_handle_form },
109	{ TAG_FRAME, "frame", tag_find_urls },
110	{ TAG_IFRAME, "iframe", tag_find_urls },
111	{ TAG_IMG, "img", tag_find_urls },
112	{ TAG_INPUT, "input", tag_find_urls },
113	{ TAG_LAYER, "layer", tag_find_urls },
114	{ TAG_LINK, "link", tag_handle_link },
115	{ TAG_META, "meta", tag_handle_meta },
116	{ TAG_OBJECT, "object", tag_find_urls },
117	{ TAG_OVERLAY, "overlay", tag_find_urls },
118	{ TAG_SCRIPT, "script", tag_find_urls },
119	{ TAG_TABLE, "table", tag_find_urls },
120	{ TAG_TD, "td", tag_find_urls },
121	{ TAG_TH, "th", tag_find_urls }
122	};
123
124	/* tag_url_attributes documents which attributes of which tags contain
125	URLs to harvest. It is used by tag_find_urls. */
126
/* Bit flags stored in the FLAGS field of tag_url_attributes.  */

/* ATTR_INLINE: the link is "inline", i.e. it must be retrieved for
   this document to be rendered correctly.  Inline links include
   inlined images, stylesheets, children frames, etc.  */
#define ATTR_INLINE 0x1

/* ATTR_HTML: the link is expected to yield HTML contents.  It is
   important not to try to follow HTML obtained by following e.g.
   <img src="..."> regardless of content-type.  Doing so causes
   infinite loops for "images" that return non-404 error pages with
   links to the same image.  */
#define ATTR_HTML 0x2
140
141	/* For tags handled by tag_find_urls: attributes that contain URLs to
142	download. */
143	static struct {
144	int tagid;
145	const char *attr_name;
146	int flags;
147	} tag_url_attributes[] = {
148	{ TAG_A, "href", ATTR_HTML },
149	{ TAG_APPLET, "code", ATTR_INLINE },
150	{ TAG_AREA, "href", ATTR_HTML },
151	{ TAG_BGSOUND, "src", ATTR_INLINE },
152	{ TAG_BODY, "background", ATTR_INLINE },
153	{ TAG_EMBED, "href", ATTR_HTML },
154	{ TAG_EMBED, "src", ATTR_INLINE \| ATTR_HTML },
155	{ TAG_FIG, "src", ATTR_INLINE },
156	{ TAG_FRAME, "src", ATTR_INLINE \| ATTR_HTML },
157	{ TAG_IFRAME, "src", ATTR_INLINE \| ATTR_HTML },
158	{ TAG_IMG, "href", ATTR_INLINE },
159	{ TAG_IMG, "lowsrc", ATTR_INLINE },
160	{ TAG_IMG, "src", ATTR_INLINE },
161	{ TAG_INPUT, "src", ATTR_INLINE },
162	{ TAG_LAYER, "src", ATTR_INLINE \| ATTR_HTML },
163	{ TAG_OBJECT, "data", ATTR_INLINE },
164	{ TAG_OVERLAY, "src", ATTR_INLINE \| ATTR_HTML },
165	{ TAG_SCRIPT, "src", ATTR_INLINE },
166	{ TAG_TABLE, "background", ATTR_INLINE },
167	{ TAG_TD, "background", ATTR_INLINE },
168	{ TAG_TH, "background", ATTR_INLINE }
169	};
170
/* The sets of interesting tags and attributes are built at run time
   from the tables above.  Some handlers, however, look at attributes
   that do not appear in those tables; they are listed here by hand so
   that they are registered as interesting too.  */
static const char *additional_attributes[] = {
  "rel",                        /* used by tag_handle_link */
  "http-equiv",                 /* used by tag_handle_meta */
  "name",                       /* used by tag_handle_meta */
  "content",                    /* used by tag_handle_meta */
  "action"                      /* used by tag_handle_form */
};

/* Hash tables of the tag names and attribute names we care about.
   Both are filled in by init_interesting.  */
struct hash_table *interesting_tags;
struct hash_table *interesting_attributes;
184
185	static void
186	init_interesting (void)
187	{
188	/* Init the variables interesting_tags and interesting_attributes
189	that are used by the HTML parser to know which tags and
190	attributes we're interested in. We initialize this only once,
191	for performance reasons.
192
193	Here we also make sure that what we put in interesting_tags
194	matches the user's preferences as specified through --ignore-tags
195	and --follow-tags. */
196
197	int i;
198	interesting_tags = make_nocase_string_hash_table (countof (known_tags));
199
200	/* First, add all the tags we know hot to handle, mapped to their
201	respective entries in known_tags. */
202	for (i = 0; i < countof (known_tags); i++)
203	hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
204
205	/* Then remove the tags ignored through --ignore-tags. */
206	if (opt.ignore_tags)
207	{
208	char **ignored;
209	for (ignored = opt.ignore_tags; *ignored; ignored++)
210	hash_table_remove (interesting_tags, *ignored);
211	}
212
213	/* If --follow-tags is specified, use only those tags. */
214	if (opt.follow_tags)
215	{
216	/* Create a new table intersecting --follow-tags and known_tags,
217	and use it as interesting_tags. */
218	struct hash_table *intersect = make_nocase_string_hash_table (0);
219	char **followed;
220	for (followed = opt.follow_tags; *followed; followed++)
221	{
222	struct known_tag t = hash_table_get (interesting_tags, followed);
223	if (!t)
224	continue; /* ignore unknown --follow-tags entries. */
225	hash_table_put (intersect, *followed, t);
226	}
227	hash_table_destroy (interesting_tags);
228	interesting_tags = intersect;
229	}
230
231	/* Add the attributes we care about. */
232	interesting_attributes = make_nocase_string_hash_table (10);
233	for (i = 0; i < countof (additional_attributes); i++)
234	hash_table_put (interesting_attributes, additional_attributes[i], "1");
235	for (i = 0; i < countof (tag_url_attributes); i++)
236	hash_table_put (interesting_attributes,
237	tag_url_attributes[i].attr_name, "1");
238	}
239
240	/* Find the value of attribute named NAME in the taginfo TAG. If the
241	attribute is not present, return NULL. If ATTRIND is non-NULL, the
242	index of the attribute in TAG will be stored there. */
243
244	static char *
245	find_attr (struct taginfo tag, const char name, int *attrind)
246	{
247	int i;
248	for (i = 0; i < tag->nattrs; i++)
249	if (!strcasecmp (tag->attrs[i].name, name))
250	{
251	if (attrind)
252	*attrind = i;
253	return tag->attrs[i].value;
254	}
255	return NULL;
256	}
257
/* State shared by the tag handlers while the tags of one document are
   being mapped over.  */
struct map_context {
  char *text;                   /* HTML text. */
  char *base;                   /* Base URI of the document, possibly
                                   changed through <base href=...>. */
  const char *parent_base;      /* Base of the current document. */
  const char *document_file;    /* File name of this document. */
  int nofollow;                 /* whether NOFOLLOW was specified in a
                                   <meta name=robots> tag. */

  struct urlpos *head, *tail;   /* List of URLs that is being
                                   built. */
};
270
271	/* Append LINK_URI to the urlpos structure that is being built.
272
273	LINK_URI will be merged with the current document base. TAG and
274	ATTRIND are the necessary context to store the position and
275	size. */
276
277	static struct urlpos *
278	append_url (const char *link_uri,
279	struct taginfo tag, int attrind, struct map_context ctx)
280	{
281	int link_has_scheme = url_has_scheme (link_uri);
282	struct urlpos *newel;
283	const char *base = ctx->base ? ctx->base : ctx->parent_base;
284	struct url *url;
285
286	if (!base)
287	{
288	DEBUGP (("%s: no base, merge will use \"%s\".\n",
289	ctx->document_file, link_uri));
290
291	if (!link_has_scheme)
292	{
293	/* Base URL is unavailable, and the link does not have a
294	location attached to it -- we have to give up. Since
295	this can only happen when using `--force-html -i', print
296	a warning. */
297	logprintf (LOG_NOTQUIET,
298	_("%s: Cannot resolve incomplete link %s.\n"),
299	ctx->document_file, link_uri);
300	return NULL;
301	}
302
303	url = url_parse (link_uri, NULL);
304	if (!url)
305	{
306	DEBUGP (("%s: link \"%s\" doesn't parse.\n",
307	ctx->document_file, link_uri));
308	return NULL;
309	}
310	}
311	else
312	{
313	/* Merge BASE with LINK_URI, but also make sure the result is
314	canonicalized, i.e. that "../" have been resolved.
315	(parse_url will do that for us.) */
316
317	char *complete_uri = uri_merge (base, link_uri);
318
319	DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
320	ctx->document_file, base, link_uri, complete_uri));
321
322	url = url_parse (complete_uri, NULL);
323	if (!url)
324	{
325	DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
326	ctx->document_file, complete_uri));
327	xfree (complete_uri);
328	return NULL;
329	}
330	xfree (complete_uri);
331	}
332
333	DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
334
335	newel = xnew0 (struct urlpos);
336	newel->url = url;
337	newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
338	newel->size = tag->attrs[attrind].value_raw_size;
339
340	/* A URL is relative if the host is not named, and the name does not
341	start with `/'. */
342	if (!link_has_scheme && *link_uri != '/')
343	newel->link_relative_p = 1;
344	else if (link_has_scheme)
345	newel->link_complete_p = 1;
346
347	if (ctx->tail)
348	{
349	ctx->tail->next = newel;
350	ctx->tail = newel;
351	}
352	else
353	ctx->tail = ctx->head = newel;
354
355	return newel;
356	}
357
358
359	/* All the tag_* functions are called from collect_tags_mapper, as
360	specified by KNOWN_TAGS. */
361
362	/* Default tag handler: collect URLs from attributes specified for
363	this tag by tag_url_attributes. */
364
365	static void
366	tag_find_urls (int tagid, struct taginfo tag, struct map_context ctx)
367	{
368	int i, attrind;
369	int first = -1;
370
371	for (i = 0; i < countof (tag_url_attributes); i++)
372	if (tag_url_attributes[i].tagid == tagid)
373	{
374	/* We've found the index of tag_url_attributes where the
375	attributes of our tag begin. */
376	first = i;
377	break;
378	}
379	assert (first != -1);
380
381	/* Loop over the "interesting" attributes of this tag. In this
382	example, it will loop over "src" and "lowsrc".
383
384	<img src="foo.png" lowsrc="bar.png">
385
386	This has to be done in the outer loop so that the attributes are
387	processed in the same order in which they appear in the page.
388	This is required when converting links. */
389
390	for (attrind = 0; attrind < tag->nattrs; attrind++)
391	{
392	/* Find whether TAG/ATTRIND is a combination that contains a
393	URL. */
394	char *link = tag->attrs[attrind].value;
395	const int size = countof (tag_url_attributes);
396
397	/* If you're cringing at the inefficiency of the nested loops,
398	remember that they both iterate over a very small number of
399	items. The worst-case inner loop is for the IMG tag, which
400	has three attributes. */
401	for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
402	{
403	if (0 == strcasecmp (tag->attrs[attrind].name,
404	tag_url_attributes[i].attr_name))
405	{
406	struct urlpos *up = append_url (link, tag, attrind, ctx);
407	if (up)
408	{
409	int flags = tag_url_attributes[i].flags;
410	if (flags & ATTR_INLINE)
411	up->link_inline_p = 1;
412	if (flags & ATTR_HTML)
413	up->link_expect_html = 1;
414	}
415	}
416	}
417	}
418	}
419
420	/* Handle the BASE tag, for <base href=...>. */
421
422	static void
423	tag_handle_base (int tagid, struct taginfo tag, struct map_context ctx)
424	{
425	struct urlpos *base_urlpos;
426	int attrind;
427	char *newbase = find_attr (tag, "href", &attrind);
428	if (!newbase)
429	return;
430
431	base_urlpos = append_url (newbase, tag, attrind, ctx);
432	if (!base_urlpos)
433	return;
434	base_urlpos->ignore_when_downloading = 1;
435	base_urlpos->link_base_p = 1;
436
437	if (ctx->base)
438	xfree (ctx->base);
439	if (ctx->parent_base)
440	ctx->base = uri_merge (ctx->parent_base, newbase);
441	else
442	ctx->base = xstrdup (newbase);
443	}
444
445	/* Mark the URL found in <form action=...> for conversion. */
446
447	static void
448	tag_handle_form (int tagid, struct taginfo tag, struct map_context ctx)
449	{
450	int attrind;
451	char *action = find_attr (tag, "action", &attrind);
452	if (action)
453	{
454	struct urlpos *up = append_url (action, tag, attrind, ctx);
455	if (up)
456	up->ignore_when_downloading = 1;
457	}
458	}
459
460	/* Handle the LINK tag. It requires special handling because how its
461	links will be followed in -p mode depends on the REL attribute. */
462
463	static void
464	tag_handle_link (int tagid, struct taginfo tag, struct map_context ctx)
465	{
466	int attrind;
467	char *href = find_attr (tag, "href", &attrind);
468
469	/* All <link href="..."> link references are external, except those
470	known not to be, such as style sheet and shortcut icon:
471
472	<link rel="stylesheet" href="...">
473	<link rel="shortcut icon" href="...">
474	*/
475	if (href)
476	{
477	struct urlpos *up = append_url (href, tag, attrind, ctx);
478	if (up)
479	{
480	char *rel = find_attr (tag, "rel", NULL);
481	if (rel
482	&& (0 == strcasecmp (rel, "stylesheet")
483	\|\| 0 == strcasecmp (rel, "shortcut icon")))
484	up->link_inline_p = 1;
485	else
486	/* The external ones usually point to HTML pages, such as
487	<link rel="next" href="..."> */
488	up->link_expect_html = 1;
489	}
490	}
491	}
492
493	/* Handle the META tag. This requires special handling because of the
494	refresh feature and because of robot exclusion. */
495
496	static void
497	tag_handle_meta (int tagid, struct taginfo tag, struct map_context ctx)
498	{
499	char *name = find_attr (tag, "name", NULL);
500	char *http_equiv = find_attr (tag, "http-equiv", NULL);
501
502	if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
503	{
504	/* Some pages use a META tag to specify that the page be
505	refreshed by a new page after a given number of seconds. The
506	general format for this is:
507
508	<meta http-equiv=Refresh content="NUMBER; URL=index2.html">
509
510	So we just need to skip past the "NUMBER; URL=" garbage to
511	get to the URL. */
512
513	struct urlpos *entry;
514	int attrind;
515	int timeout = 0;
516	char *p;
517
518	char *refresh = find_attr (tag, "content", &attrind);
519	if (!refresh)
520	return;
521
522	for (p = refresh; ISDIGIT (*p); p++)
523	timeout = 10 * timeout + *p - '0';
524	if (*p++ != ';')
525	return;
526
527	while (ISSPACE (*p))
528	++p;
529	if (!( TOUPPER (*p) == 'U'
530	&& TOUPPER (*(p + 1)) == 'R'
531	&& TOUPPER (*(p + 2)) == 'L'
532	&& *(p + 3) == '='))
533	return;
534	p += 4;
535	while (ISSPACE (*p))
536	++p;
537
538	entry = append_url (p, tag, attrind, ctx);
539	if (entry)
540	{
541	entry->link_refresh_p = 1;
542	entry->refresh_timeout = timeout;
543	entry->link_expect_html = 1;
544	}
545	}
546	else if (name && 0 == strcasecmp (name, "robots"))
547	{
548	/* Handle stuff like:
549	<meta name="robots" content="index,nofollow"> */
550	char *content = find_attr (tag, "content", NULL);
551	if (!content)
552	return;
553	if (!strcasecmp (content, "none"))
554	ctx->nofollow = 1;
555	else
556	{
557	while (*content)
558	{
559	/* Find the next occurrence of ',' or the end of
560	the string. */
561	char *end = strchr (content, ',');
562	if (end)
563	++end;
564	else
565	end = content + strlen (content);
566	if (!strncasecmp (content, "nofollow", end - content))
567	ctx->nofollow = 1;
568	content = end;
569	}
570	}
571	}
572	}
573
574	/* Dispatch the tag handler appropriate for the tag we're mapping
575	over. See known_tags[] for definition of tag handlers. */
576
577	static void
578	collect_tags_mapper (struct taginfo tag, void arg)
579	{
580	struct map_context ctx = (struct map_context )arg;
581
582	/* Find the tag in our table of tags. This must not fail because
583	map_html_tags only returns tags found in interesting_tags. */
584	struct known_tag *t = hash_table_get (interesting_tags, tag->name);
585	assert (t != NULL);
586
587	t->handler (t->tagid, tag, ctx);
588	}
589
590
591	/* Analyze HTML tags FILE and construct a list of URLs referenced from
592	it. It merges relative links in FILE with URL. It is aware of
593	<base href=...> and does the right thing. */
594
595	struct urlpos *
596	get_urls_html (const char file, const char url, int *meta_disallow_follow)
597	{
598	struct file_memory *fm;
599	struct map_context ctx;
600	int flags;
601
602	/* Load the file. */
603	fm = read_file (file);
604	if (!fm)
605	{
606	logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
607	return NULL;
608	}
609	DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
610
611	ctx.text = fm->content;
612	ctx.head = ctx.tail = NULL;
613	ctx.base = NULL;
614	ctx.parent_base = url ? url : opt.base_href;
615	ctx.document_file = file;
616	ctx.nofollow = 0;
617
618	if (!interesting_tags)
619	init_interesting ();
620
621	/* Specify MHT_TRIM_VALUES because of buggy HTML generators that
622	generate <a href=" foo"> instead of <a href="foo"> (browsers
623	ignore spaces as well.) If you really mean space, use &32; or
624	%20. MHT_TRIM_VALUES also causes squashing of embedded newlines,
625	e.g. in <img src="foo.[newline]html">. Such newlines are also
626	ignored by IE and Mozilla and are presumably introduced by
627	writing HTML with editors that force word wrap. */
628	flags = MHT_TRIM_VALUES;
629	if (opt.strict_comments)
630	flags \|= MHT_STRICT_COMMENTS;
631
632	map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
633	interesting_tags, interesting_attributes);
634
635	DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
636	if (meta_disallow_follow)
637	*meta_disallow_follow = ctx.nofollow;
638
639	xfree_null (ctx.base);
640	read_file_free (fm);
641	return ctx.head;
642	}
643
644	/* This doesn't really have anything to do with HTML, but it's similar
645	to get_urls_html, so we put it here. */
646
647	struct urlpos *
648	get_urls_file (const char *file)
649	{
650	struct file_memory *fm;
651	struct urlpos head, tail;
652	const char text, text_end;
653
654	/* Load the file. */
655	fm = read_file (file);
656	if (!fm)
657	{
658	logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
659	return NULL;
660	}
661	DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
662
663	head = tail = NULL;
664	text = fm->content;
665	text_end = fm->content + fm->length;
666	while (text < text_end)
667	{
668	int up_error_code;
669	char *url_text;
670	struct urlpos *entry;
671	struct url *url;
672
673	const char *line_beg = text;
674	const char *line_end = memchr (text, '\n', text_end - text);
675	if (!line_end)
676	line_end = text_end;
677	else
678	++line_end;
679	text = line_end;
680
681	/* Strip whitespace from the beginning and end of line. */
682	while (line_beg < line_end && ISSPACE (*line_beg))
683	++line_beg;
684	while (line_end > line_beg && ISSPACE (*(line_end - 1)))
685	--line_end;
686
687	if (line_beg == line_end)
688	continue;
689
690	/* The URL is in the [line_beg, line_end) region. */
691
692	/* We must copy the URL to a zero-terminated string, and we
693	can't use alloca because we're in a loop. sigh. */
694	url_text = strdupdelim (line_beg, line_end);
695
696	if (opt.base_href)
697	{
698	/* Merge opt.base_href with URL. */
699	char *merged = uri_merge (opt.base_href, url_text);
700	xfree (url_text);
701	url_text = merged;
702	}
703
704	url = url_parse (url_text, &up_error_code);
705	if (!url)
706	{
707	logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
708	file, url_text, url_error (up_error_code));
709	xfree (url_text);
710	continue;
711	}
712	xfree (url_text);
713
714	entry = xnew0 (struct urlpos);
715	entry->next = NULL;
716	entry->url = url;
717
718	if (!head)
719	head = entry;
720	else
721	tail->next = entry;
722	tail = entry;
723	}
724	read_file_free (fm);
725	return head;
726	}
727
728	void
729	cleanup_html_url (void)
730	{
731	/* Destroy the hash tables. The hash table keys and values are not
732	allocated by this code, so we don't need to free them here. */
733	if (interesting_tags)
734	hash_table_destroy (interesting_tags);
735	if (interesting_attributes)
736	hash_table_destroy (interesting_attributes);
737	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/essentials/net-misc/wget/src/html-url.c

Download in other formats: