Context Navigation

convert.c

Visit:

Last change on this file was 3440, checked in by bird, 18 years ago
wget 1.10.2
File size: 29.8 KB

Line
1	/* Conversion of links to local files.
2	Copyright (C) 2005 Free Software Foundation, Inc.
3
4	This file is part of GNU Wget.
5
6	GNU Wget is free software; you can redistribute it and/or modify
7	it under the terms of the GNU General Public License as published by
8	the Free Software Foundation; either version 2 of the License, or
9	(at your option) any later version.
10
11	GNU Wget is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	GNU General Public License for more details.
15
16	You should have received a copy of the GNU General Public License
17	along with Wget; if not, write to the Free Software
18	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19
20	In addition, as a special exception, the Free Software Foundation
21	gives permission to link the code of its release of Wget with the
22	OpenSSL project's "OpenSSL" library (or with modified versions of it
23	that use the same license as the "OpenSSL" library), and distribute
24	the linked executables. You must obey the GNU General Public License
25	in all respects for all of the code used other than "OpenSSL". If you
26	modify this file, you may extend this exception to your version of the
27	file, but you are not obligated to do so. If you do not wish to do
28	so, delete this exception statement from your version. */
29
30	#include <config.h>
31
32	#include <stdio.h>
33	#include <stdlib.h>
34	#ifdef HAVE_STRING_H
35	# include <string.h>
36	#else
37	# include <strings.h>
38	#endif /* HAVE_STRING_H */
39	#ifdef HAVE_UNISTD_H
40	# include <unistd.h>
41	#endif /* HAVE_UNISTD_H */
42	#include <errno.h>
43	#include <assert.h>
44	#include <sys/types.h>
45
46	#include "wget.h"
47	#include "convert.h"
48	#include "url.h"
49	#include "recur.h"
50	#include "utils.h"
51	#include "hash.h"
52	#include "ptimer.h"
53
54	static struct hash_table *dl_file_url_map;
55	struct hash_table *dl_url_file_map;
56
57	/* Set of HTML files downloaded in this Wget run, used for link
58	conversion after Wget is done. */
59	struct hash_table *downloaded_html_set;
60
61	static void convert_links PARAMS ((const char , struct urlpos ));
62
63	/* This function is called when the retrieval is done to convert the
64	links that have been downloaded. It has to be called at the end of
65	the retrieval, because only then does Wget know conclusively which
66	URLs have been downloaded, and which not, so it can tell which
67	direction to convert to.
68
69	The "direction" means that the URLs to the files that have been
70	downloaded get converted to the relative URL which will point to
71	that file. And the other URLs get converted to the remote URL on
72	the server.
73
74	All the downloaded HTMLs are kept in downloaded_html_files, and
75	downloaded URLs in urls_downloaded. All the information is
76	extracted from these two lists. */
77
78	void
79	convert_all_links (void)
80	{
81	int i;
82	double secs;
83	int file_count = 0;
84
85	struct ptimer *timer = ptimer_new ();
86
87	int cnt;
88	char **file_array;
89
90	cnt = 0;
91	if (downloaded_html_set)
92	cnt = hash_table_count (downloaded_html_set);
93	if (cnt == 0)
94	return;
95	file_array = alloca_array (char *, cnt);
96	string_set_to_array (downloaded_html_set, file_array);
97
98	for (i = 0; i < cnt; i++)
99	{
100	struct urlpos urls, cur_url;
101	char *url;
102	char *file = file_array[i];
103
104	/* Determine the URL of the HTML file. get_urls_html will need
105	it. */
106	url = hash_table_get (dl_file_url_map, file);
107	if (!url)
108	{
109	DEBUGP (("Apparently %s has been removed.\n", file));
110	continue;
111	}
112
113	DEBUGP (("Scanning %s (from %s)\n", file, url));
114
115	/* Parse the HTML file... */
116	urls = get_urls_html (file, url, NULL);
117
118	/* We don't respect meta_disallow_follow here because, even if
119	the file is not followed, we might still want to convert the
120	links that have been followed from other files. */
121
122	for (cur_url = urls; cur_url; cur_url = cur_url->next)
123	{
124	char *local_name;
125	struct url *u = cur_url->url;
126
127	if (cur_url->link_base_p)
128	{
129	/* Base references have been resolved by our parser, so
130	we turn the base URL into an empty string. (Perhaps
131	we should remove the tag entirely?) */
132	cur_url->convert = CO_NULLIFY_BASE;
133	continue;
134	}
135
136	/* We decide the direction of conversion according to whether
137	a URL was downloaded. Downloaded URLs will be converted
138	ABS2REL, whereas non-downloaded will be converted REL2ABS. */
139	local_name = hash_table_get (dl_url_file_map, u->url);
140
141	/* Decide on the conversion type. */
142	if (local_name)
143	{
144	/* We've downloaded this URL. Convert it to relative
145	form. We do this even if the URL already is in
146	relative form, because our directory structure may
147	not be identical to that on the server (think `-nd',
148	`--cut-dirs', etc.) */
149	cur_url->convert = CO_CONVERT_TO_RELATIVE;
150	cur_url->local_name = xstrdup (local_name);
151	DEBUGP (("will convert url %s to local %s\n", u->url, local_name));
152	}
153	else
154	{
155	/* We haven't downloaded this URL. If it's not already
156	complete (including a full host name), convert it to
157	that form, so it can be reached while browsing this
158	HTML locally. */
159	if (!cur_url->link_complete_p)
160	cur_url->convert = CO_CONVERT_TO_COMPLETE;
161	cur_url->local_name = NULL;
162	DEBUGP (("will convert url %s to complete\n", u->url));
163	}
164	}
165
166	/* Convert the links in the file. */
167	convert_links (file, urls);
168	++file_count;
169
170	/* Free the data. */
171	free_urlpos (urls);
172	}
173
174	secs = ptimer_measure (timer) / 1000;
175	ptimer_destroy (timer);
176	logprintf (LOG_VERBOSE, _("Converted %d files in %.*f seconds.\n"),
177	file_count, secs < 10 ? 3 : 1, secs);
178	}
179
180	static void write_backup_file PARAMS ((const char *, downloaded_file_t));
181	static const char replace_attr PARAMS ((const char , int, FILE *,
182	const char *));
183	static const char replace_attr_refresh_hack PARAMS ((const char , int, FILE *,
184	const char *, int));
185	static char local_quote_string PARAMS ((const char ));
186	static char construct_relative PARAMS ((const char , const char *));
187
188	/* Change the links in one HTML file. LINKS is a list of links in the
189	document, along with their positions and the desired direction of
190	the conversion. */
191	static void
192	convert_links (const char file, struct urlpos links)
193	{
194	struct file_memory *fm;
195	FILE *fp;
196	const char *p;
197	downloaded_file_t downloaded_file_return;
198
199	struct urlpos *link;
200	int to_url_count = 0, to_file_count = 0;
201
202	logprintf (LOG_VERBOSE, _("Converting %s... "), file);
203
204	{
205	/* First we do a "dry run": go through the list L and see whether
206	any URL needs to be converted in the first place. If not, just
207	leave the file alone. */
208	int dry_count = 0;
209	struct urlpos *dry;
210	for (dry = links; dry; dry = dry->next)
211	if (dry->convert != CO_NOCONVERT)
212	++dry_count;
213	if (!dry_count)
214	{
215	logputs (LOG_VERBOSE, _("nothing to do.\n"));
216	return;
217	}
218	}
219
220	fm = read_file (file);
221	if (!fm)
222	{
223	logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
224	file, strerror (errno));
225	return;
226	}
227
228	downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
229	if (opt.backup_converted && downloaded_file_return)
230	write_backup_file (file, downloaded_file_return);
231
232	/* Before opening the file for writing, unlink the file. This is
233	important if the data in FM is mmaped. In such case, nulling the
234	file, which is what fopen() below does, would make us read all
235	zeroes from the mmaped region. */
236	if (unlink (file) < 0 && errno != ENOENT)
237	{
238	logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
239	file, strerror (errno));
240	read_file_free (fm);
241	return;
242	}
243	/* Now open the file for writing. */
244	fp = fopen (file, "wb");
245	if (!fp)
246	{
247	logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
248	file, strerror (errno));
249	read_file_free (fm);
250	return;
251	}
252
253	/* Here we loop through all the URLs in file, replacing those of
254	them that are downloaded with relative references. */
255	p = fm->content;
256	for (link = links; link; link = link->next)
257	{
258	char *url_start = fm->content + link->pos;
259
260	if (link->pos >= fm->length)
261	{
262	DEBUGP (("Something strange is going on. Please investigate."));
263	break;
264	}
265	/* If the URL is not to be converted, skip it. */
266	if (link->convert == CO_NOCONVERT)
267	{
268	DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
269	continue;
270	}
271
272	/* Echo the file contents, up to the offending URL's opening
273	quote, to the outfile. */
274	fwrite (p, 1, url_start - p, fp);
275	p = url_start;
276
277	switch (link->convert)
278	{
279	case CO_CONVERT_TO_RELATIVE:
280	/* Convert absolute URL to relative. */
281	{
282	char *newname = construct_relative (file, link->local_name);
283	char *quoted_newname = local_quote_string (newname);
284
285	if (!link->link_refresh_p)
286	p = replace_attr (p, link->size, fp, quoted_newname);
287	else
288	p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
289	link->refresh_timeout);
290
291	DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
292	link->url->url, newname, link->pos, file));
293	xfree (newname);
294	xfree (quoted_newname);
295	++to_file_count;
296	break;
297	}
298	case CO_CONVERT_TO_COMPLETE:
299	/* Convert the link to absolute URL. */
300	{
301	char *newlink = link->url->url;
302	char *quoted_newlink = html_quote_string (newlink);
303
304	if (!link->link_refresh_p)
305	p = replace_attr (p, link->size, fp, quoted_newlink);
306	else
307	p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
308	link->refresh_timeout);
309
310	DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
311	newlink, link->pos, file));
312	xfree (quoted_newlink);
313	++to_url_count;
314	break;
315	}
316	case CO_NULLIFY_BASE:
317	/* Change the base href to "". */
318	p = replace_attr (p, link->size, fp, "");
319	break;
320	case CO_NOCONVERT:
321	abort ();
322	break;
323	}
324	}
325
326	/* Output the rest of the file. */
327	if (p - fm->content < fm->length)
328	fwrite (p, 1, fm->length - (p - fm->content), fp);
329	fclose (fp);
330	read_file_free (fm);
331
332	logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
333	}
334
/* Construct and return a link that points from BASEFILE to LINKFILE.
   Both files should be local file names, BASEFILE of the referring
   file, and LINKFILE of the referred file.

   Examples:

   cr("foo", "bar")         -> "bar"
   cr("A/foo", "A/bar")     -> "bar"
   cr("A/foo", "A/B/bar")   -> "B/bar"
   cr("A/X/foo", "A/Y/bar") -> "../Y/bar"
   cr("X/", "Y/bar")        -> "../Y/bar" (trailing slash does matter in BASE)

   Both files should be absolute or relative, otherwise strange
   results might ensue.  The function makes no special efforts to
   handle "." and ".." in links, so make sure they're not there
   (e.g. using path_simplify).

   The result is allocated with xmalloc; the caller frees it.  */

static char *
construct_relative (const char *basefile, const char *linkfile)
{
  char *link;
  int basedirs;
  const char *b, *l;
  int i, start;

  /* First, skip the initial directory components common to both
     files.  START is the offset just past the last '/' inside the
     common prefix.  */
  start = 0;
  for (b = basefile, l = linkfile; *b == *l && *b != '\0'; ++b, ++l)
    {
      if (*b == '/')
        start = (b - basefile) + 1;
    }
  basefile += start;
  linkfile += start;

  /* With common directories out of the way, the situation we have is
     as follows:
         b - b1/b2/[...]/bfile
         l - l1/l2/[...]/lfile

     The link we're constructing needs to be:
       lnk - ../../l1/l2/[...]/lfile

     Where the number of ".."'s equals the number of bN directory
     components in B.  */

  /* Count the directory components in B.  */
  basedirs = 0;
  for (b = basefile; *b; b++)
    {
      if (*b == '/')
        ++basedirs;
    }

  /* Construct LINK as explained above: BASEDIRS copies of "../"
     followed by what is left of LINKFILE and the terminating NUL.  */
  link = (char *)xmalloc (3 * basedirs + strlen (linkfile) + 1);
  for (i = 0; i < basedirs; i++)
    memcpy (link + 3 * i, "../", 3);
  strcpy (link + 3 * i, linkfile);
  return link;
}
397
398	/* Used by write_backup_file to remember which files have been
399	written. */
400	static struct hash_table *converted_files;
401
402	static void
403	write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
404	{
405	/* Rather than just writing over the original .html file with the
406	converted version, save the former to *.orig. Note we only do
407	this for files we've _successfully_ downloaded, so we don't
408	clobber .orig files sitting around from previous invocations. */
409
410	/* Construct the backup filename as the original name plus ".orig". */
411	size_t filename_len = strlen (file);
412	char* filename_plus_orig_suffix;
413
414	if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
415	{
416	/* Just write "orig" over "html". We need to do it this way
417	because when we're checking to see if we've downloaded the
418	file before (to see if we can skip downloading it), we don't
419	know if it's a text/html file. Therefore we don't know yet
420	at that stage that -E is going to cause us to tack on
421	".html", so we need to compare vs. the original URL plus
422	".orig", not the original URL plus ".html.orig". */
423	filename_plus_orig_suffix = alloca (filename_len + 1);
424	strcpy (filename_plus_orig_suffix, file);
425	strcpy ((filename_plus_orig_suffix + filename_len) - 4, "orig");
426	}
427	else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
428	{
429	/* Append ".orig" to the name. */
430	filename_plus_orig_suffix = alloca (filename_len + sizeof (".orig"));
431	strcpy (filename_plus_orig_suffix, file);
432	strcpy (filename_plus_orig_suffix + filename_len, ".orig");
433	}
434
435	if (!converted_files)
436	converted_files = make_string_hash_table (0);
437
438	/* We can get called twice on the same URL thanks to the
439	convert_all_links() call in main(). If we write the .orig file
440	each time in such a case, it'll end up containing the first-pass
441	conversion, not the original file. So, see if we've already been
442	called on this file. */
443	if (!string_set_contains (converted_files, file))
444	{
445	/* Rename <file> to <file>.orig before former gets written over. */
446	if (rename (file, filename_plus_orig_suffix) != 0)
447	logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
448	file, filename_plus_orig_suffix, strerror (errno));
449
450	/* Remember that we've already written a .orig backup for this file.
451	Note that we never free this memory since we need it till the
452	convert_all_links() call, which is one of the last things the
453	program does before terminating. BTW, I'm not sure if it would be
454	safe to just set 'converted_file_ptr->string' to 'file' below,
455	rather than making a copy of the string... Another note is that I
456	thought I could just add a field to the urlpos structure saying
457	that we'd written a .orig file for this URL, but that didn't work,
458	so I had to make this separate list.
459	-- Dan Harkless <wget@harkless.org>
460
461	This [adding a field to the urlpos structure] didn't work
462	because convert_file() is called from convert_all_links at
463	the end of the retrieval with a freshly built new urlpos
464	list.
465	-- Hrvoje Niksic <hniksic@xemacs.org>
466	*/
467	string_set_add (converted_files, file);
468	}
469	}
470
471	static int find_fragment PARAMS ((const char , int, const char *,
472	const char **));
473
474	/* Replace an attribute's original text with NEW_TEXT. */
475
476	static const char *
477	replace_attr (const char p, int size, FILE fp, const char *new_text)
478	{
479	int quote_flag = 0;
480	char quote_char = '\"'; /* use "..." for quoting, unless the
481	original value is quoted, in which
482	case reuse its quoting char. */
483	const char frag_beg, frag_end;
484
485	/* Structure of our string is:
486	"...old-contents..."
487	<--- size ---> (with quotes)
488	OR:
489	...old-contents...
490	<--- size --> (no quotes) */
491
492	if (p == '\"' \|\| p == '\'')
493	{
494	quote_char = *p;
495	quote_flag = 1;
496	++p;
497	size -= 2; /* disregard opening and closing quote */
498	}
499	putc (quote_char, fp);
500	fputs (new_text, fp);
501
502	/* Look for fragment identifier, if any. */
503	if (find_fragment (p, size, &frag_beg, &frag_end))
504	fwrite (frag_beg, 1, frag_end - frag_beg, fp);
505	p += size;
506	if (quote_flag)
507	++p;
508	putc (quote_char, fp);
509
510	return p;
511	}
512
/* The same as REPLACE_ATTR, but used when replacing
   <meta http-equiv=refresh content="new_text"> because we need to
   prepend "timeout_value; URL=" to NEW_TEXT.

   P, SIZE and FP are passed through to replace_attr unchanged;
   TIMEOUT is the refresh delay written in front of the URL.  Returns
   whatever replace_attr returns.  */

static const char *
replace_attr_refresh_hack (const char *p, int size, FILE *fp,
                           const char *new_text, int timeout)
{
  /* "0; URL=..." */
  char *new_with_timeout = (char *)alloca (numdigit (timeout)
                                           + 6 /* "; URL=" */
                                           + strlen (new_text)
                                           + 1);
  sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);

  return replace_attr (p, size, fp, new_with_timeout);
}
530
/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
   immediately preceded by '&'.  If the character is not found, return
   zero.  If the character is found, return 1 and set *BP to point to
   the '#' and *EP to point to the end of the region (BEG+SIZE).

   A '#' right after '&' is skipped because it starts a numeric
   character reference such as "&#38;", not a fragment.

   This is used for finding the fragment identifiers in URLs.  */

static int
find_fragment (const char *beg, int size, const char **bp, const char **ep)
{
  const char *end = beg + size;
  int saw_amp = 0;              /* non-zero iff the previous char was '&' */
  for (; beg < end; beg++)
    {
      switch (*beg)
        {
        case '&':
          saw_amp = 1;
          break;
        case '#':
          if (!saw_amp)
            {
              *bp = beg;
              *ep = end;
              return 1;
            }
          /* fallthrough */
        default:
          saw_amp = 0;
        }
    }
  return 0;
}
564
565	/* Quote FILE for use as local reference to an HTML file.
566
567	We quote ? as %3F to avoid passing part of the file name as the
568	parameter when browsing the converted file through HTTP. However,
569	it is safe to do this only when `--html-extension' is turned on.
570	This is because converting "index.html?foo=bar" to
571	"index.html%3Ffoo=bar" would break local browsing, as the latter
572	isn't even recognized as an HTML file! However, converting
573	"index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
574	safe for both local and HTTP-served browsing.
575
576	We always quote "#" as "%23" and "%" as "%25" because those
577	characters have special meanings in URLs. */
578
579	static char *
580	local_quote_string (const char *file)
581	{
582	const char *from;
583	char newname, to;
584
585	char *any = strpbrk (file, "?#%");
586	if (!any)
587	return html_quote_string (file);
588
589	/* Allocate space assuming the worst-case scenario, each character
590	having to be quoted. */
591	to = newname = (char )alloca (3 strlen (file) + 1);
592	for (from = file; *from; from++)
593	switch (*from)
594	{
595	case '%':
596	*to++ = '%';
597	*to++ = '2';
598	*to++ = '5';
599	break;
600	case '#':
601	*to++ = '%';
602	*to++ = '2';
603	*to++ = '3';
604	break;
605	case '?':
606	if (opt.html_extension)
607	{
608	*to++ = '%';
609	*to++ = '3';
610	*to++ = 'F';
611	break;
612	}
613	/* fallthrough */
614	default:
615	to++ = from;
616	}
617	*to = '\0';
618
619	return html_quote_string (newname);
620	}
621
622
/* Book-keeping code for dl_file_url_map, dl_url_file_map and
   downloaded_html_set.  Other code calls these functions to let us
   know that a file has been downloaded.  */

/* Create dl_file_url_map and dl_url_file_map on first use.  Expands
   to a single statement, so it can be followed by a semicolon.  */
#define ENSURE_TABLES_EXIST do {                        \
  if (dl_file_url_map == NULL)                          \
    dl_file_url_map = make_string_hash_table (0);       \
  if (dl_url_file_map == NULL)                          \
    dl_url_file_map = make_string_hash_table (0);       \
} while (0)
633
/* Return 1 if S1 and S2 are the same, except for a trailing "/" or
   "/index.html".  The cases in which it returns one are (substitute
   any non-empty substring for "foo"):

   m("foo/index.html", "foo/")  ==> 1
   m("foo/", "foo/index.html")  ==> 1
   m("foo", "foo/index.html")   ==> 1
   m("foo", "foo/")             ==> 1
   m("foo", "foo")              ==> 1

   Returns 0 otherwise, including when the strings have no common
   prefix at all.  */

static int
match_except_index (const char *s1, const char *s2)
{
  int i;
  const char *lng;

  /* Skip common substring.  */
  for (i = 0; *s1 && *s2 && *s1 == *s2; s1++, s2++, i++)
    ;
  if (i == 0)
    /* Strings differ at the very beginning -- bail out.  We need to
       check this explicitly to avoid `lng - 1' reading outside the
       array.  */
    return 0;

  if (!*s1 && !*s2)
    /* Both strings hit EOF -- strings are equal. */
    return 1;
  else if (*s1 && *s2)
    /* Strings are randomly different, e.g. "/foo/bar" and "/foo/qux". */
    return 0;
  else if (*s1)
    /* S1 is the longer one. */
    lng = s1;
  else
    /* S2 is the longer one. */
    lng = s2;

  /* LNG now points at the part of the longer string that follows the
     common prefix.  Either the prefix stopped right before a '/':

         foo
         foo/index.html
            ^

     or the '/' was part of the common prefix:

         foo/
         foo/index.html
             ^                                                        */

  if (*lng != '/')
    /* The second case: back up onto the '/'.  */
    --lng;

  if (*lng == '/' && *(lng + 1) == '\0')
    /* foo  */
    /* foo/ */
    return 1;

  return 0 == strcmp (lng, "/index.html");
}
687
688	static int
689	dissociate_urls_from_file_mapper (void key, void value, void *arg)
690	{
691	char mapping_url = (char )key;
692	char mapping_file = (char )value;
693	char file = (char )arg;
694
695	if (0 == strcmp (mapping_file, file))
696	{
697	hash_table_remove (dl_url_file_map, mapping_url);
698	xfree (mapping_url);
699	xfree (mapping_file);
700	}
701
702	/* Continue mapping. */
703	return 0;
704	}
705
706	/* Remove all associations from various URLs to FILE from dl_url_file_map. */
707
708	static void
709	dissociate_urls_from_file (const char *file)
710	{
711	hash_table_map (dl_url_file_map, dissociate_urls_from_file_mapper,
712	(char *)file);
713	}
714
715	/* Register that URL has been successfully downloaded to FILE. This
716	is used by the link conversion code to convert references to URLs
717	to references to local files. It is also being used to check if a
718	URL has already been downloaded. */
719
720	void
721	register_download (const char url, const char file)
722	{
723	char old_file, old_url;
724
725	ENSURE_TABLES_EXIST;
726
727	/* With some forms of retrieval, it is possible, although not likely
728	or particularly desirable. If both are downloaded, the second
729	download will override the first one. When that happens,
730	dissociate the old file name from the URL. */
731
732	if (hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
733	{
734	if (0 == strcmp (url, old_url))
735	/* We have somehow managed to download the same URL twice.
736	Nothing to do. */
737	return;
738
739	if (match_except_index (url, old_url)
740	&& !hash_table_contains (dl_url_file_map, url))
741	/* The two URLs differ only in the "index.html" ending. For
742	example, one is "http://www.server.com/", and the other is
743	"http://www.server.com/index.html". Don't remove the old
744	one, just add the new one as a non-canonical entry. */
745	goto url_only;
746
747	hash_table_remove (dl_file_url_map, file);
748	xfree (old_file);
749	xfree (old_url);
750
751	/* Remove all the URLs that point to this file. Yes, there can
752	be more than one such URL, because we store redirections as
753	multiple entries in dl_url_file_map. For example, if URL1
754	redirects to URL2 which gets downloaded to FILE, we map both
755	URL1 and URL2 to FILE in dl_url_file_map. (dl_file_url_map
756	only points to URL2.) When another URL gets loaded to FILE,
757	we want both URL1 and URL2 dissociated from it.
758
759	This is a relatively expensive operation because it performs
760	a linear search of the whole hash table, but it should be
761	called very rarely, only when two URLs resolve to the same
762	file name, and the "<file>.1" extensions are turned off.
763	In other words, almost never. */
764	dissociate_urls_from_file (file);
765	}
766
767	hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
768
769	url_only:
770	/* A URL->FILE mapping is not possible without a FILE->URL mapping.
771	If the latter were present, it should have been removed by the
772	above `if'. So we could write:
773
774	assert (!hash_table_contains (dl_url_file_map, url));
775
776	The above is correct when running in recursive mode where the
777	same URL always resolves to the same file. But if you do
778	something like:
779
780	wget URL URL
781
782	then the first URL will resolve to "FILE", and the other to
783	"FILE.1". In that case, FILE.1 will not be found in
784	dl_file_url_map, but URL will still point to FILE in
785	dl_url_file_map. */
786	if (hash_table_get_pair (dl_url_file_map, url, &old_url, &old_file))
787	{
788	hash_table_remove (dl_url_file_map, url);
789	xfree (old_url);
790	xfree (old_file);
791	}
792
793	hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
794	}
795
796	/* Register that FROM has been redirected to TO. This assumes that TO
797	is successfully downloaded and already registered using
798	register_download() above. */
799
800	void
801	register_redirection (const char from, const char to)
802	{
803	char *file;
804
805	ENSURE_TABLES_EXIST;
806
807	file = hash_table_get (dl_url_file_map, to);
808	assert (file != NULL);
809	if (!hash_table_contains (dl_url_file_map, from))
810	hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
811	}
812
813	/* Register that the file has been deleted. */
814
815	void
816	register_delete_file (const char *file)
817	{
818	char old_url, old_file;
819
820	ENSURE_TABLES_EXIST;
821
822	if (!hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
823	return;
824
825	hash_table_remove (dl_file_url_map, file);
826	xfree (old_file);
827	xfree (old_url);
828	dissociate_urls_from_file (file);
829	}
830
831	/* Register that FILE is an HTML file that has been downloaded. */
832
833	void
834	register_html (const char url, const char file)
835	{
836	if (!downloaded_html_set)
837	downloaded_html_set = make_string_hash_table (0);
838	string_set_add (downloaded_html_set, file);
839	}
840
841	static void downloaded_files_free PARAMS ((void));
842
843	/* Cleanup the data structures associated with this file. */
844
845	void
846	convert_cleanup (void)
847	{
848	if (dl_file_url_map)
849	{
850	free_keys_and_values (dl_file_url_map);
851	hash_table_destroy (dl_file_url_map);
852	dl_file_url_map = NULL;
853	}
854	if (dl_url_file_map)
855	{
856	free_keys_and_values (dl_url_file_map);
857	hash_table_destroy (dl_url_file_map);
858	dl_url_file_map = NULL;
859	}
860	if (downloaded_html_set)
861	string_set_free (downloaded_html_set);
862	downloaded_files_free ();
863	if (converted_files)
864	string_set_free (converted_files);
865	}
866
867
868	/* Book-keeping code for downloaded files that enables extension
869	hacks. */
870
871	/* This table should really be merged with dl_file_url_map and
872	downloaded_html_files. This was originally a list, but I changed
873	it to a hash table beause it was actually taking a lot of time to
874	find things in it. */
875
876	static struct hash_table *downloaded_files_hash;
877
878	/* We're storing "modes" of type downloaded_file_t in the hash table.
879	However, our hash tables only accept pointers for keys and values.
880	So when we need a pointer, we use the address of a
881	downloaded_file_t variable of static storage. */
882
883	static downloaded_file_t *
884	downloaded_mode_to_ptr (downloaded_file_t mode)
885	{
886	static downloaded_file_t
887	v1 = FILE_NOT_ALREADY_DOWNLOADED,
888	v2 = FILE_DOWNLOADED_NORMALLY,
889	v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
890	v4 = CHECK_FOR_FILE;
891
892	switch (mode)
893	{
894	case FILE_NOT_ALREADY_DOWNLOADED:
895	return &v1;
896	case FILE_DOWNLOADED_NORMALLY:
897	return &v2;
898	case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
899	return &v3;
900	case CHECK_FOR_FILE:
901	return &v4;
902	}
903	return NULL;
904	}
905
906	/* Remembers which files have been downloaded. In the standard case,
907	should be called with mode == FILE_DOWNLOADED_NORMALLY for each
908	file we actually download successfully (i.e. not for ones we have
909	failures on or that we skip due to -N).
910
911	When we've downloaded a file and tacked on a ".html" extension due
912	to -E, call this function with
913	FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
914	FILE_DOWNLOADED_NORMALLY.
915
916	If you just want to check if a file has been previously added
917	without adding it, call with mode == CHECK_FOR_FILE. Please be
918	sure to call this function with local filenames, not remote
919	URLs. */
920
921	downloaded_file_t
922	downloaded_file (downloaded_file_t mode, const char *file)
923	{
924	downloaded_file_t *ptr;
925
926	if (mode == CHECK_FOR_FILE)
927	{
928	if (!downloaded_files_hash)
929	return FILE_NOT_ALREADY_DOWNLOADED;
930	ptr = hash_table_get (downloaded_files_hash, file);
931	if (!ptr)
932	return FILE_NOT_ALREADY_DOWNLOADED;
933	return *ptr;
934	}
935
936	if (!downloaded_files_hash)
937	downloaded_files_hash = make_string_hash_table (0);
938
939	ptr = hash_table_get (downloaded_files_hash, file);
940	if (ptr)
941	return *ptr;
942
943	ptr = downloaded_mode_to_ptr (mode);
944	hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
945
946	return FILE_NOT_ALREADY_DOWNLOADED;
947	}
948
/* hash_table_map callback for downloaded_files_free: free the KEY
   (the copied file name) of one entry.  VALUE points to static
   storage and must not be freed.  Returns 0 to continue mapping.  */
static int
df_free_mapper (void *key, void *value, void *ignored)
{
  xfree (key);
  return 0;
}
955
956	static void
957	downloaded_files_free (void)
958	{
959	if (downloaded_files_hash)
960	{
961	hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
962	hash_table_destroy (downloaded_files_hash);
963	downloaded_files_hash = NULL;
964	}
965	}
966
967
/* The function returns the pointer to the malloc-ed quoted version of
   string s.  It will recognize and quote numeric and special graphic
   entities, as per RFC1866:

   `&' -> `&amp;'
   `<' -> `&lt;'
   `>' -> `&gt;'
   `"' -> `&quot;'
   SP  -> `&#32;'

   No other entities are recognized or replaced.  The caller frees
   the result.  */
char *
html_quote_string (const char *s)
{
  const char *b = s;
  char *p, *res;
  int i;

  /* Pass through the string, and count the new size.  Each character
     counts once through the loop increment; the additions below are
     the extra bytes its entity needs.  */
  for (i = 0; *s; s++, i++)
    {
      if (*s == '&')
        i += 4;                 /* `amp;' */
      else if (*s == '<' || *s == '>')
        i += 3;                 /* `lt;' and `gt;' */
      else if (*s == '\"')
        i += 5;                 /* `quot;' */
      else if (*s == ' ')
        i += 4;                 /* #32; */
    }
  res = (char *)xmalloc (i + 1);
  s = b;
  for (p = res; *s; s++)
    {
      switch (*s)
        {
        case '&':
          *p++ = '&';
          *p++ = 'a';
          *p++ = 'm';
          *p++ = 'p';
          *p++ = ';';
          break;
        case '<': case '>':
          *p++ = '&';
          *p++ = (*s == '<' ? 'l' : 'g');
          *p++ = 't';
          *p++ = ';';
          break;
        case '\"':
          *p++ = '&';
          *p++ = 'q';
          *p++ = 'u';
          *p++ = 'o';
          *p++ = 't';
          *p++ = ';';
          break;
        case ' ':
          *p++ = '&';
          *p++ = '#';
          *p++ = '3';
          *p++ = '2';
          *p++ = ';';
          break;
        default:
          *p++ = *s;
        }
    }
  *p = '\0';
  return res;
}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: vendor/wget/current/src/convert.c

Download in other formats: