Context Navigation

url.c

Visit:

Last change on this file was 3440, checked in by bird, 18 years ago
wget 1.10.2
File size: 55.6 KB

Line
1	/* URL handling.
2	Copyright (C) 2005 Free Software Foundation, Inc.
3
4	This file is part of GNU Wget.
5
6	GNU Wget is free software; you can redistribute it and/or modify
7	it under the terms of the GNU General Public License as published by
8	the Free Software Foundation; either version 2 of the License, or (at
9	your option) any later version.
10
11	GNU Wget is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	GNU General Public License for more details.
15
16	You should have received a copy of the GNU General Public License
17	along with Wget; if not, write to the Free Software
18	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19
20	In addition, as a special exception, the Free Software Foundation
21	gives permission to link the code of its release of Wget with the
22	OpenSSL project's "OpenSSL" library (or with modified versions of it
23	that use the same license as the "OpenSSL" library), and distribute
24	the linked executables. You must obey the GNU General Public License
25	in all respects for all of the code used other than "OpenSSL". If you
26	modify this file, you may extend this exception to your version of the
27	file, but you are not obligated to do so. If you do not wish to do
28	so, delete this exception statement from your version. */
29
30	#include <config.h>
31
32	#include <stdio.h>
33	#include <stdlib.h>
34	#ifdef HAVE_STRING_H
35	# include <string.h>
36	#else
37	# include <strings.h>
38	#endif
39	#include <sys/types.h>
40	#ifdef HAVE_UNISTD_H
41	# include <unistd.h>
42	#endif
43	#include <errno.h>
44	#include <assert.h>
45
46	#include "wget.h"
47	#include "utils.h"
48	#include "url.h"
49	#include "host.h" /* for is_valid_ipv6_address */
50
51	#ifndef errno
52	extern int errno;
53	#endif
54
/* Static description of one URL scheme handled by this file.  */
struct scheme_data
{
  /* Bare scheme name, e.g. "http".  */
  const char *name;

  /* Prefix that url_scheme matches against the start of a URL,
     e.g. "http://".  */
  const char *leading_string;

  /* Port url_parse assumes when the URL does not name one.  */
  int default_port;

  /* Non-zero while the scheme is accepted; cleared by
     scheme_disable.  */
  int enabled;
};
62
63	/* Supported schemes: */
64	static struct scheme_data supported_schemes[] =
65	{
66	{ "http", "http://", DEFAULT_HTTP_PORT, 1 },
67	#ifdef HAVE_SSL
68	{ "https", "https://", DEFAULT_HTTPS_PORT, 1 },
69	#endif
70	{ "ftp", "ftp://", DEFAULT_FTP_PORT, 1 },
71
72	/* SCHEME_INVALID */
73	{ NULL, NULL, -1, 0 }
74	};
75
76	/* Forward declarations: */
77
78	static int path_simplify PARAMS ((char *));
79
80
81	/* Support for escaping and unescaping of URL strings. */
82
83	/* Table of "reserved" and "unsafe" characters. Those terms are
84	rfc1738-speak, as such largely obsoleted by rfc2396 and later
85	specs, but the general idea remains.
86
87	A reserved character is the one that you can't decode without
88	changing the meaning of the URL. For example, you can't decode
89	"/foo/%2f/bar" into "/foo///bar" because the number and contents of
90	path components is different. Non-reserved characters can be
91	changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar". The
92	unsafe characters are loosely based on rfc1738, plus "$" and ",",
93	as recommended by rfc2396, and minus "~", which is very frequently
94	used (and sometimes unrecognized as %7E by broken servers).
95
96	An unsafe character is the one that should be encoded when URLs are
97	placed in foreign environments. E.g. space and newline are unsafe
98	in HTTP contexts because HTTP uses them as separator and line
99	terminator, so they must be encoded to %20 and %0A respectively.
100	"*" is unsafe in shell context, etc.
101
102	We determine whether a character is unsafe through static table
103	lookup. This code assumes ASCII character set and 8-bit chars. */
104
/* Bit flags stored per character in urlchr_table.  */
enum {
  urlchr_reserved = 1 << 0,	/* rfc1738 reserved chars + "$" and ",". */
  urlchr_unsafe   = 1 << 1	/* rfc1738 unsafe chars, plus non-printables. */
};
112
/* Non-zero if character C has any of the MASK bits set in
   urlchr_table.  C is converted to unsigned char before indexing so
   that chars with the high bit set do not yield a negative index.  */
#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))

/* Convenience wrappers testing a single flag.  */
#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
116
117	/* Shorthands for the table: */
118	#define R urlchr_reserved
119	#define U urlchr_unsafe
120	#define RU R\|U
121
122	static const unsigned char urlchr_table[256] =
123	{
124	U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
125	U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
126	U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
127	U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
128	U, 0, U, RU, R, U, R, 0, /* SP ! " # $ % & ' */
129	0, 0, 0, R, R, 0, 0, R, /* ( ) * + , - . / */
130	0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
131	0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */
132	RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
133	0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
134	0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
135	0, 0, 0, RU, U, RU, U, 0, /* X Y Z [ \ ] ^ _ */
136	U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
137	0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
138	0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
139	0, 0, 0, U, U, U, 0, U, /* x y z { \| } ~ DEL */
140
141	U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
142	U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
143	U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
144	U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
145
146	U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
147	U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
148	U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
149	U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
150	};
151	#undef R
152	#undef U
153	#undef RU
154
/* URL-unescape the string S.

   This is done by transforming the sequences "%HH" to the character
   represented by the hexadecimal digits HH.  If % is not followed by
   two hexadecimal digits, it is inserted literally.  "%00" is also
   left as-is.

   The transformation is done in place.  If you need the original
   string intact, make a copy before calling this function.  */

static void
url_unescape (char *s)
{
  char *t = s;			/* t - tortoise: write position */
  char *h = s;			/* h - hare: read position */

  for (; *h; h++, t++)
    {
      if (*h != '%')
	{
	copychar:
	  *t = *h;
	}
      else
	{
	  char c;
	  /* Do nothing if '%' is not followed by two hex digits.  The
	     NUL checks come first so we never read past the end.  */
	  if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2])))
	    goto copychar;
	  c = X2DIGITS_TO_NUM (h[1], h[2]);
	  /* Don't unescape %00 because there is no way to insert it
	     into a C string without effectively truncating it. */
	  if (c == '\0')
	    goto copychar;
	  *t = c;
	  /* Skip the two hex digits; the loop increment skips the '%'. */
	  h += 2;
	}
    }
  *t = '\0';
}
194
/* The core of url_escape_* functions.  Escapes the characters that
   match the provided MASK in urlchr_table, writing each as "%XX".

   If ALLOW_PASSTHROUGH is non-zero, a string with no matching chars
   will be returned unchanged (the very pointer S, with const cast
   away).  If ALLOW_PASSTHROUGH is zero, a freshly allocated string
   will be returned in all cases.  */

static char *
url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
{
  const char *p1;
  char *p2, *newstr;
  int newlen;
  int addition = 0;

  /* First pass: count how much longer the result will be.  */
  for (p1 = s; *p1; p1++)
    if (urlchr_test (*p1, mask))
      addition += 2;		/* Two more characters (hex digits) */

  if (!addition)
    return allow_passthrough ? (char *)s : xstrdup (s);

  /* p1 now points at the terminating NUL, so p1 - s is strlen (s).  */
  newlen = (p1 - s) + addition;
  newstr = (char *)xmalloc (newlen + 1);

  /* Second pass: copy, expanding the matching characters.  */
  p1 = s;
  p2 = newstr;
  while (*p1)
    {
      /* Quote the characters that match the test mask. */
      if (urlchr_test (*p1, mask))
	{
	  unsigned char c = *p1++;
	  *p2++ = '%';
	  *p2++ = XNUM_TO_DIGIT (c >> 4);
	  *p2++ = XNUM_TO_DIGIT (c & 0xf);
	}
      else
	*p2++ = *p1++;
    }
  assert (p2 - newstr == newlen);
  *p2 = '\0';

  return newstr;
}
240
241	/* URL-escape the unsafe characters (see urlchr_table) in a given
242	string, returning a freshly allocated string. */
243
244	char *
245	url_escape (const char *s)
246	{
247	return url_escape_1 (s, urlchr_unsafe, 0);
248	}
249
250	/* URL-escape the unsafe characters (see urlchr_table) in a given
251	string. If no characters are unsafe, S is returned. */
252
253	static char *
254	url_escape_allow_passthrough (const char *s)
255	{
256	return url_escape_1 (s, urlchr_unsafe, 1);
257	}
258
259
/* Decide whether the char at position P needs to be encoded.  (It is
   not enough to pass a single char *P because the function may need
   to inspect the surrounding context.)

   Return 1 if the char should be escaped as %XX, 0 otherwise.  */

static inline int
char_needs_escaping (const char *p)
{
  if (*p == '%')
    {
      /* A '%' that already starts a well-formed %XX escape is kept.
	 The && stops at the first non-hex-digit, so the terminating
	 NUL is never stepped over.  */
      if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
	return 0;
      else
	/* Garbled %.. sequence: encode `%'. */
	return 1;
    }
  else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
    /* Unsafe, and not reserved: escaping does not change meaning.  */
    return 1;
  else
    return 0;
}
282
283	/* Translate a %-escaped (but possibly non-conformant) input string S
284	into a %-escaped (and conformant) output string. If no characters
285	are encoded or decoded, return the same string S; otherwise, return
286	a freshly allocated string with the new contents.
287
288	After a URL has been run through this function, the protocols that
289	use `%' as the quote character can use the resulting string as-is,
290	while those that don't can use url_unescape to get to the intended
291	data. This function is stable: once the input is transformed,
292	further transformations of the result yield the same output.
293
294	Let's discuss why this function is needed.
295
296	Imagine Wget is asked to retrieve `http://abc.xyz/abc def'. Since
297	a raw space character would mess up the HTTP request, it needs to
298	be quoted, like this:
299
300	GET /abc%20def HTTP/1.0
301
302	It would appear that the unsafe chars need to be quoted, for
303	example with url_escape. But what if we're requested to download
304	`abc%20def'? url_escape transforms "%" to "%25", which would leave
305	us with `abc%2520def'. This is incorrect -- since %-escapes are
306	part of URL syntax, "%20" is the correct way to denote a literal
307	space on the Wget command line. This leads to the conclusion that
308	in that case Wget should not call url_escape, but leave the `%20'
309	as is. This is clearly contradictory, but it only gets worse.
310
311	What if the requested URI is `abc%20 def'? If we call url_escape,
312	we end up with `/abc%2520%20def', which is almost certainly not
313	intended. If we don't call url_escape, we are left with the
314	embedded space and cannot complete the request. What the user
315	meant was for Wget to request `/abc%20%20def', and this is where
316	reencode_escapes kicks in.
317
318	Wget used to solve this by first decoding %-quotes, and then
319	encoding all the "unsafe" characters found in the resulting string.
320	This was wrong because it didn't preserve certain URL special
321	(reserved) characters. For instance, URI containing "a%2B+b" (0x2b
322	== '+') would get translated to "a%2B%2Bb" or "a++b" depending on
323	whether we considered `+' reserved (it is). One of these results
324	is inevitable because by the second step we would lose information
325	on whether the `+' was originally encoded or not. Both results
326	were wrong because in CGI parameters + means space, while %2B means
327	literal plus. reencode_escapes correctly translates the above to
328	"a%2B+b", i.e. returns the original string.
329
330	This function uses a modified version of the algorithm originally
331	proposed by Anon Sricharoenchai:
332
333	* Encode all "unsafe" characters, except those that are also
334	"reserved", to %XX. See urlchr_table for which characters are
335	unsafe and reserved.
336
337	* Encode the "%" characters not followed by two hex digits to
338	"%25".
339
340	* Pass through all other characters and %XX escapes as-is. (Up to
341	Wget 1.10 this decoded %XX escapes corresponding to "safe"
342	characters, but that was obtrusive and broke some servers.)
343
344	Anon's test case:
345
346	"http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
347	->
348	"http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"
349
350	Simpler test cases:
351
352	"foo bar" -> "foo%20bar"
353	"foo%20bar" -> "foo%20bar"
354	"foo %20bar" -> "foo%20%20bar"
355	"foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
356	"foo%25%20bar" -> "foo%25%20bar"
357	"foo%2%20bar" -> "foo%252%20bar"
358	"foo+bar" -> "foo+bar" (plus is reserved!)
359	"foo%2b+bar" -> "foo%2b+bar" */
360
static char *
reencode_escapes (const char *s)
{
  /* Returns S itself when no character needs escaping; otherwise a
     freshly allocated string the caller must free.  Callers tell the
     two cases apart by comparing the result with S.  */
  const char *p1;
  char *newstr, *p2;
  int oldlen, newlen;

  int encode_count = 0;

  /* First pass: inspect the string to see if there's anything to do,
     and to calculate the new length.  */
  for (p1 = s; *p1; p1++)
    if (char_needs_escaping (p1))
      ++encode_count;

  if (!encode_count)
    /* The string is good as it is. */
    return (char *) s;		/* C const model sucks. */

  oldlen = p1 - s;
  /* Each encoding adds two characters (hex digits).  */
  newlen = oldlen + 2 * encode_count;
  newstr = xmalloc (newlen + 1);

  /* Second pass: copy the string to the destination address, encoding
     chars when needed.  */
  p1 = s;
  p2 = newstr;

  while (*p1)
    if (char_needs_escaping (p1))
      {
	unsigned char c = *p1++;
	*p2++ = '%';
	*p2++ = XNUM_TO_DIGIT (c >> 4);
	*p2++ = XNUM_TO_DIGIT (c & 0xf);
      }
    else
      *p2++ = *p1++;

  *p2 = '\0';
  assert (p2 - newstr == newlen);
  return newstr;
}
405
406
407	/* Returns the scheme type if the scheme is supported, or
408	SCHEME_INVALID if not. */
409
410	enum url_scheme
411	url_scheme (const char *url)
412	{
413	int i;
414
415	for (i = 0; supported_schemes[i].leading_string; i++)
416	if (0 == strncasecmp (url, supported_schemes[i].leading_string,
417	strlen (supported_schemes[i].leading_string)))
418	{
419	if (supported_schemes[i].enabled)
420	return (enum url_scheme) i;
421	else
422	return SCHEME_INVALID;
423	}
424
425	return SCHEME_INVALID;
426	}
427
/* Non-zero if CH may appear in a scheme name.  */
#define SCHEME_CHAR(ch) (ISALNUM (ch) || (ch) == '-' || (ch) == '+')

/* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
   currently implemented, it returns true if URL begins with
   [-+a-zA-Z0-9]+: .  The scheme need not be one Wget supports.  */

int
url_has_scheme (const char *url)
{
  const char *p = url;

  /* The first char must be a scheme char. */
  if (!*p || !SCHEME_CHAR (*p))
    return 0;
  ++p;
  /* Followed by 0 or more scheme chars. */
  while (*p && SCHEME_CHAR (*p))
    ++p;
  /* Terminated by ':'. */
  return *p == ':';
}
449
450	int
451	scheme_default_port (enum url_scheme scheme)
452	{
453	return supported_schemes[scheme].default_port;
454	}
455
456	void
457	scheme_disable (enum url_scheme scheme)
458	{
459	supported_schemes[scheme].enabled = 0;
460	}
461
/* Skip the username and password, if present in the URL.  The
   function should not be called with the complete URL, but with the
   portion after the scheme.

   Returns a pointer just past the '@' that ends the credentials.  If
   no username and password are found, return URL.  */

static const char *
url_skip_credentials (const char *url)
{
  /* Look for '@' that comes before terminators, such as '/', '?',
     '#', or ';'.  An '@' that appears after one of those belongs to
     the path/query/fragment, not to the credentials.  */
  const char *p = strpbrk (url, "@/?#;");
  if (!p || *p != '@')
    return url;
  return p + 1;
}
478
/* Parse credentials contained in [BEG, END).  The region is expected
   to have come from a URL and is unescaped.

   On success, store freshly allocated, URL-unescaped copies of the
   user name in *USER and of the password in *PASSWD (NULL when there
   is no ':' in the region) and return 1.  Return 0, leaving *USER and
   *PASSWD untouched, if the user name is empty.  */

static int
parse_credentials (const char *beg, const char *end, char **user, char **passwd)
{
  const char *colon;
  const char *userend;

  if (beg == end)
    return 0;			/* empty user name */

  colon = memchr (beg, ':', end - beg);
  if (colon == beg)
    return 0;			/* again empty user name */

  if (colon)
    {
      /* Everything after the first ':' is the password.  */
      *passwd = strdupdelim (colon + 1, end);
      userend = colon;
      url_unescape (*passwd);
    }
  else
    {
      *passwd = NULL;
      userend = end;
    }
  *user = strdupdelim (beg, userend);
  url_unescape (*user);
  return 1;
}
510
511	/* Used by main.c: detect URLs written using the "shorthand" URL forms
512	popularized by Netscape and NcFTP. HTTP shorthands look like this:
513
514	www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
515	www.foo.com[:port] -> http://www.foo.com[:port]
516
517	FTP shorthands look like this:
518
519	foo.bar.com:dir/file -> ftp://foo.bar.com/dir/file
520	foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file
521
522	If the URL needs not or cannot be rewritten, return NULL. */
523
524	char *
525	rewrite_shorthand_url (const char *url)
526	{
527	const char *p;
528
529	if (url_scheme (url) != SCHEME_INVALID)
530	return NULL;
531
532	/* Look for a ':' or '/'. The former signifies NcFTP syntax, the
533	latter Netscape. */
534	for (p = url; p && p != ':' && *p != '/'; p++)
535	;
536
537	if (p == url)
538	return NULL;
539
540	/* If we're looking at "://", it means the URL uses a scheme we
541	don't support, which may include "https" when compiled without
542	SSL support. Don't bogusly rewrite such URLs. */
543	if (p[0] == ':' && p[1] == '/' && p[2] == '/')
544	return NULL;
545
546	if (*p == ':')
547	{
548	const char *pp;
549	char *res;
550	/* If the characters after the colon and before the next slash
551	or end of string are all digits, it's HTTP. */
552	int digits = 0;
553	for (pp = p + 1; ISDIGIT (*pp); pp++)
554	++digits;
555	if (digits > 0 && (pp == '/' \|\| pp == '\0'))
556	goto http;
557
558	/* Prepend "ftp://" to the entire URL... */
559	res = xmalloc (6 + strlen (url) + 1);
560	sprintf (res, "ftp://%s", url);
561	/* ...and replace ':' with '/'. */
562	res[6 + (p - url)] = '/';
563	return res;
564	}
565	else
566	{
567	char *res;
568	http:
569	/* Just prepend "http://" to what we have. */
570	res = xmalloc (7 + strlen (url) + 1);
571	sprintf (res, "http://%s", url);
572	return res;
573	}
574	}
575
576
577	static void split_path PARAMS ((const char , char , char *));
578
579	/* Like strpbrk, with the exception that it returns the pointer to the
580	terminating zero (end-of-string aka "eos") if no matching character
581	is found.
582
583	Although I normally balk at Gcc-specific optimizations, it probably
584	makes sense here: glibc has optimizations that detect strpbrk being
585	called with literal string as ACCEPT and inline the search. That
586	optimization is defeated if strpbrk is hidden within the call to
587	another function. (And no, making strpbrk_or_eos inline doesn't
588	help because the check for literal accept is in the
589	preprocessor.) */
590
#if defined(__GNUC__) && __GNUC__ >= 3

/* Statement-expression form: keeps the strpbrk call, with its literal
   ACCEPT argument, visible at the call site.  Evaluates S twice when
   no character from ACCEPT is found.  */
#define strpbrk_or_eos(s, accept) ({		\
  char *SOE_p = strpbrk (s, accept);		\
  if (!SOE_p)					\
    SOE_p = strchr (s, '\0');			\
  SOE_p;					\
})

#else  /* not __GNUC__ or old gcc */

/* Portable form: return the first character of S that occurs in
   ACCEPT, or a pointer to S's terminating NUL if there is none.  */
static inline char *
strpbrk_or_eos (const char *s, const char *accept)
{
  char *p = strpbrk (s, accept);
  if (!p)
    p = strchr (s, '\0');
  return p;
}
#endif /* not __GNUC__ or old gcc */
611
/* Turn STR into lowercase, in place; return non-zero if a character
   was actually changed.  */

static int
lowercase_str (char *str)
{
  int change = 0;
  for (; *str; str++)
    if (ISUPPER (*str))
      {
	change = 1;
	*str = TOLOWER (*str);
      }
  return change;
}
627
628	static const char *parse_errors[] = {
629	#define PE_NO_ERROR 0
630	N_("No error"),
631	#define PE_UNSUPPORTED_SCHEME 1
632	N_("Unsupported scheme"),
633	#define PE_INVALID_HOST_NAME 2
634	N_("Invalid host name"),
635	#define PE_BAD_PORT_NUMBER 3
636	N_("Bad port number"),
637	#define PE_INVALID_USER_NAME 4
638	N_("Invalid user name"),
639	#define PE_UNTERMINATED_IPV6_ADDRESS 5
640	N_("Unterminated IPv6 numeric address"),
641	#define PE_IPV6_NOT_SUPPORTED 6
642	N_("IPv6 addresses not supported"),
643	#define PE_INVALID_IPV6_ADDRESS 7
644	N_("Invalid IPv6 numeric address")
645	};
646
647	/* Parse a URL.
648
649	Return a new struct url if successful, NULL on error. In case of
650	error, and if ERROR is not NULL, also set *ERROR to the appropriate
651	error code. */
652	struct url *
653	url_parse (const char url, int error)
654	{
655	struct url *u;
656	const char *p;
657	int path_modified, host_modified;
658
659	enum url_scheme scheme;
660
661	const char uname_b, uname_e;
662	const char host_b, host_e;
663	const char path_b, path_e;
664	const char params_b, params_e;
665	const char query_b, query_e;
666	const char fragment_b, fragment_e;
667
668	int port;
669	char user = NULL, passwd = NULL;
670
671	char *url_encoded = NULL;
672
673	int error_code;
674
675	scheme = url_scheme (url);
676	if (scheme == SCHEME_INVALID)
677	{
678	error_code = PE_UNSUPPORTED_SCHEME;
679	goto err;
680	}
681
682	url_encoded = reencode_escapes (url);
683	p = url_encoded;
684
685	p += strlen (supported_schemes[scheme].leading_string);
686	uname_b = p;
687	p = url_skip_credentials (p);
688	uname_e = p;
689
690	/* scheme://user:pass@host[:port]... */
691	/* ^ */
692
693	/* We attempt to break down the URL into the components path,
694	params, query, and fragment. They are ordered like this:
695
696	scheme://host[:port][/path][;params][?query][#fragment] */
697
698	params_b = params_e = NULL;
699	query_b = query_e = NULL;
700	fragment_b = fragment_e = NULL;
701
702	host_b = p;
703
704	if (*p == '[')
705	{
706	/* Handle IPv6 address inside square brackets. Ideally we'd
707	just look for the terminating ']', but rfc2732 mandates
708	rejecting invalid IPv6 addresses. */
709
710	/* The address begins after '['. */
711	host_b = p + 1;
712	host_e = strchr (host_b, ']');
713
714	if (!host_e)
715	{
716	error_code = PE_UNTERMINATED_IPV6_ADDRESS;
717	goto err;
718	}
719
720	#ifdef ENABLE_IPV6
721	/* Check if the IPv6 address is valid. */
722	if (!is_valid_ipv6_address(host_b, host_e))
723	{
724	error_code = PE_INVALID_IPV6_ADDRESS;
725	goto err;
726	}
727
728	/* Continue parsing after the closing ']'. */
729	p = host_e + 1;
730	#else
731	error_code = PE_IPV6_NOT_SUPPORTED;
732	goto err;
733	#endif
734
735	/* The closing bracket must be followed by a separator or by the
736	null char. */
737	/* http://[::1]... */
738	/* ^ */
739	if (!strchr (":/;?#", *p))
740	{
741	/* Trailing garbage after []-delimited IPv6 address. */
742	error_code = PE_INVALID_HOST_NAME;
743	goto err;
744	}
745	}
746	else
747	{
748	p = strpbrk_or_eos (p, ":/;?#");
749	host_e = p;
750	}
751
752	if (host_b == host_e)
753	{
754	error_code = PE_INVALID_HOST_NAME;
755	goto err;
756	}
757
758	port = scheme_default_port (scheme);
759	if (*p == ':')
760	{
761	const char port_b, port_e, *pp;
762
763	/* scheme://host:port/tralala */
764	/* ^ */
765	++p;
766	port_b = p;
767	p = strpbrk_or_eos (p, "/;?#");
768	port_e = p;
769
770	/* Allow empty port, as per rfc2396. */
771	if (port_b != port_e)
772	{
773	for (port = 0, pp = port_b; pp < port_e; pp++)
774	{
775	if (!ISDIGIT (*pp))
776	{
777	/* http://host:12randomgarbage/blah */
778	/* ^ */
779	error_code = PE_BAD_PORT_NUMBER;
780	goto err;
781	}
782	port = 10 * port + (*pp - '0');
783	/* Check for too large port numbers here, before we have
784	a chance to overflow on bogus port values. */
785	if (port > 65535)
786	{
787	error_code = PE_BAD_PORT_NUMBER;
788	goto err;
789	}
790	}
791	}
792	}
793
794	if (*p == '/')
795	{
796	++p;
797	path_b = p;
798	p = strpbrk_or_eos (p, ";?#");
799	path_e = p;
800	}
801	else
802	{
803	/* Path is not allowed not to exist. */
804	path_b = path_e = p;
805	}
806
807	if (*p == ';')
808	{
809	++p;
810	params_b = p;
811	p = strpbrk_or_eos (p, "?#");
812	params_e = p;
813	}
814	if (*p == '?')
815	{
816	++p;
817	query_b = p;
818	p = strpbrk_or_eos (p, "#");
819	query_e = p;
820
821	/* Hack that allows users to use '?' (a wildcard character) in
822	FTP URLs without it being interpreted as a query string
823	delimiter. */
824	if (scheme == SCHEME_FTP)
825	{
826	query_b = query_e = NULL;
827	path_e = p;
828	}
829	}
830	if (*p == '#')
831	{
832	++p;
833	fragment_b = p;
834	p += strlen (p);
835	fragment_e = p;
836	}
837	assert (*p == 0);
838
839	if (uname_b != uname_e)
840	{
841	/* http://user:pass@host */
842	/* ^ ^ */
843	/* uname_b uname_e */
844	if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
845	{
846	error_code = PE_INVALID_USER_NAME;
847	goto err;
848	}
849	}
850
851	u = xnew0 (struct url);
852	u->scheme = scheme;
853	u->host = strdupdelim (host_b, host_e);
854	u->port = port;
855	u->user = user;
856	u->passwd = passwd;
857
858	u->path = strdupdelim (path_b, path_e);
859	path_modified = path_simplify (u->path);
860	split_path (u->path, &u->dir, &u->file);
861
862	host_modified = lowercase_str (u->host);
863
864	/* Decode %HH sequences in host name. This is important not so much
865	to support %HH sequences in host names (which other browser
866	don't), but to support binary characters (which will have been
867	converted to %HH by reencode_escapes). */
868	if (strchr (u->host, '%'))
869	{
870	url_unescape (u->host);
871	host_modified = 1;
872	}
873
874	if (params_b)
875	u->params = strdupdelim (params_b, params_e);
876	if (query_b)
877	u->query = strdupdelim (query_b, query_e);
878	if (fragment_b)
879	u->fragment = strdupdelim (fragment_b, fragment_e);
880
881	if (path_modified \|\| u->fragment \|\| host_modified \|\| path_b == path_e)
882	{
883	/* If we suspect that a transformation has rendered what
884	url_string might return different from URL_ENCODED, rebuild
885	u->url using url_string. */
886	u->url = url_string (u, 0);
887
888	if (url_encoded != url)
889	xfree ((char *) url_encoded);
890	}
891	else
892	{
893	if (url_encoded == url)
894	u->url = xstrdup (url);
895	else
896	u->url = url_encoded;
897	}
898
899	return u;
900
901	err:
902	/* Cleanup in case of error: */
903	if (url_encoded && url_encoded != url)
904	xfree (url_encoded);
905
906	/* Transmit the error code to the caller, if the caller wants to
907	know. */
908	if (error)
909	*error = error_code;
910	return NULL;
911	}
912
913	/* Return the error message string from ERROR_CODE, which should have
914	been retrieved from url_parse. The error message is translated. */
915
916	const char *
917	url_error (int error_code)
918	{
919	assert (error_code >= 0 && error_code < countof (parse_errors));
920	return _(parse_errors[error_code]);
921	}
922
923	/* Split PATH into DIR and FILE. PATH comes from the URL and is
924	expected to be URL-escaped.
925
926	The path is split into directory (the part up to the last slash)
927	and file (the part after the last slash), which are subsequently
928	unescaped. Examples:
929
930	PATH DIR FILE
931	"foo/bar/baz" "foo/bar" "baz"
932	"foo/bar/" "foo/bar" ""
933	"foo" "" "foo"
934	"foo/bar/baz%2fqux" "foo/bar" "baz/qux" (!)
935
936	DIR and FILE are freshly allocated. */
937
static void
split_path (const char *path, char **dir, char **file)
{
  /* Stores freshly allocated, URL-unescaped strings in *DIR and
     *FILE.  The split happens before unescaping, so a %2f in PATH
     never acts as a separator.  */
  char *last_slash = strrchr (path, '/');
  if (!last_slash)
    {
      /* No directory part at all.  */
      *dir = xstrdup ("");
      *file = xstrdup (path);
    }
  else
    {
      *dir = strdupdelim (path, last_slash);
      *file = xstrdup (last_slash + 1);
    }
  url_unescape (*dir);
  url_unescape (*file);
}
955
956	/* Note: URL's "full path" is the path with the query string and
957	params appended. The "fragment" (#foo) is intentionally ignored,
958	but that might be changed. For example, if the original URL was
959	"http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
960	the full path will be "/foo/bar/baz;bullshit?querystring". */
961
962	/* Return the length of the full path, without the terminating
963	zero. */
964
965	static int
966	full_path_length (const struct url *url)
967	{
968	int len = 0;
969
970	#define FROB(el) if (url->el) len += 1 + strlen (url->el)
971
972	FROB (path);
973	FROB (params);
974	FROB (query);
975
976	#undef FROB
977
978	return len;
979	}
980
981	/* Write out the full path. */
982
983	static void
984	full_path_write (const struct url url, char where)
985	{
986	#define FROB(el, chr) do { \
987	char *f_el = url->el; \
988	if (f_el) { \
989	int l = strlen (f_el); \
990	*where++ = chr; \
991	memcpy (where, f_el, l); \
992	where += l; \
993	} \
994	} while (0)
995
996	FROB (path, '/');
997	FROB (params, ';');
998	FROB (query, '?');
999
1000	#undef FROB
1001	}
1002
/* Public function for getting the "full path".  E.g. if u->path is
   "foo/bar" and u->query is "param=value", full_path will be
   "/foo/bar?param=value".  The result is freshly allocated; the
   caller is responsible for freeing it.  */

char *
url_full_path (const struct url *url)
{
  int length = full_path_length (url);
  char *full_path = (char *) xmalloc (length + 1);

  full_path_write (url, full_path);
  /* full_path_write does not terminate the string.  */
  full_path[length] = '\0';

  return full_path;
}
1018
/* Unescape CHR in an otherwise escaped STR.  Used to selectively
   undo the escaping of certain characters, such as "/" and ":".  The
   replacement is done in place and nothing is returned.

   Only escapes spelled with the hex digits that XNUM_TO_DIGIT
   produces are recognized.  */

static void
unescape_single_char (char *str, char chr)
{
  /* Go through unsigned char so that a CHR with the high bit set
     cannot yield a negative digit value.  */
  const unsigned char uchr = (unsigned char) chr;
  const char c1 = XNUM_TO_DIGIT (uchr >> 4);
  const char c2 = XNUM_TO_DIGIT (uchr & 0xf);
  char *h = str;		/* hare: read position */
  char *t = str;		/* tortoise: write position */
  for (; *h; h++, t++)
    {
      /* The && chain stops at the terminating NUL, so h[2] is only
	 read when h[1] matched C1.  */
      if (h[0] == '%' && h[1] == c1 && h[2] == c2)
	{
	  *t = chr;
	  h += 2;
	}
      else
	*t = *h;
    }
  *t = '\0';
}
1042
1043	/* Escape unsafe and reserved characters, except for the slash
1044	characters. */
1045
1046	static char *
1047	url_escape_dir (const char *dir)
1048	{
1049	char *newdir = url_escape_1 (dir, urlchr_unsafe \| urlchr_reserved, 1);
1050	if (newdir == dir)
1051	return (char *)dir;
1052
1053	unescape_single_char (newdir, '/');
1054	return newdir;
1055	}
1056
1057	/* Sync u->path and u->url with u->dir and u->file. Called after
1058	u->file or u->dir have been changed, typically by the FTP code. */
1059
1060	static void
1061	sync_path (struct url *u)
1062	{
1063	char newpath, efile, *edir;
1064
1065	xfree (u->path);
1066
1067	/* u->dir and u->file are not escaped. URL-escape them before
1068	reassembling them into u->path. That way, if they contain
1069	separators like '?' or even if u->file contains slashes, the
1070	path will be correctly assembled. (u->file can contain slashes
1071	if the URL specifies it with %2f, or if an FTP server returns
1072	it.) */
1073	edir = url_escape_dir (u->dir);
1074	efile = url_escape_1 (u->file, urlchr_unsafe \| urlchr_reserved, 1);
1075
1076	if (!*edir)
1077	newpath = xstrdup (efile);
1078	else
1079	{
1080	int dirlen = strlen (edir);
1081	int filelen = strlen (efile);
1082
1083	/* Copy "DIR/FILE" to newpath. */
1084	char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1085	memcpy (p, edir, dirlen);
1086	p += dirlen;
1087	*p++ = '/';
1088	memcpy (p, efile, filelen);
1089	p += filelen;
1090	*p = '\0';
1091	}
1092
1093	u->path = newpath;
1094
1095	if (edir != u->dir)
1096	xfree (edir);
1097	if (efile != u->file)
1098	xfree (efile);
1099
1100	/* Regenerate u->url as well. */
1101	xfree (u->url);
1102	u->url = url_string (u, 0);
1103	}
1104
1105	/* Mutators. Code in ftp.c insists on changing u->dir and u->file.
1106	This way we can sync u->path and u->url when they get changed. */
1107
1108	void
1109	url_set_dir (struct url url, const char newdir)
1110	{
1111	xfree (url->dir);
1112	url->dir = xstrdup (newdir);
1113	sync_path (url);
1114	}
1115
1116	void
1117	url_set_file (struct url url, const char newfile)
1118	{
1119	xfree (url->file);
1120	url->file = xstrdup (newfile);
1121	sync_path (url);
1122	}
1123
1124	void
1125	url_free (struct url *url)
1126	{
1127	xfree (url->host);
1128	xfree (url->path);
1129	xfree (url->url);
1130
1131	xfree_null (url->params);
1132	xfree_null (url->query);
1133	xfree_null (url->fragment);
1134	xfree_null (url->user);
1135	xfree_null (url->passwd);
1136
1137	xfree (url->dir);
1138	xfree (url->file);
1139
1140	xfree (url);
1141	}
1142
1143
1144	/* Create all the necessary directories for PATH (a file). Calls
1145	make_directory internally. */
1146	int
1147	mkalldirs (const char *path)
1148	{
1149	const char *p;
1150	char *t;
1151	struct_stat st;
1152	int res;
1153
1154	p = path + strlen (path);
1155	for (; *p != '/' && p != path; p--)
1156	;
1157
1158	/* Don't create if it's just a file. */
1159	if ((p == path) && (*p != '/'))
1160	return 0;
1161	t = strdupdelim (path, p);
1162
1163	/* Check whether the directory exists. */
1164	if ((stat (t, &st) == 0))
1165	{
1166	if (S_ISDIR (st.st_mode))
1167	{
1168	xfree (t);
1169	return 0;
1170	}
1171	else
1172	{
1173	/* If the dir exists as a file name, remove it first. This
1174	is only for Wget to work with buggy old CERN http
1175	servers. Here is the scenario: When Wget tries to
1176	retrieve a directory without a slash, e.g.
1177	http://foo/bar (bar being a directory), CERN server will
1178	not redirect it too http://foo/bar/ -- it will generate a
1179	directory listing containing links to bar/file1,
1180	bar/file2, etc. Wget will lose because it saves this
1181	HTML listing to a file `bar', so it cannot create the
1182	directory. To work around this, if the file of the same
1183	name exists, we just remove it and create the directory
1184	anyway. */
1185	DEBUGP (("Removing %s because of directory danger!\n", t));
1186	unlink (t);
1187	}
1188	}
1189	res = make_directory (t);
1190	if (res != 0)
1191	logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1192	xfree (t);
1193	return res;
1194	}
1195
1196
1197	/* Functions for constructing the file name out of URL components. */
1198
/* A growable string, used by url_file_name and its helpers.  (It
   should perhaps live in utils.c.)

   It gives the helpers a convenient and efficient way to build a
   string by appending to it: instead of handing the base pointer,
   the allocated size and the tail position to every function, a
   single pointer to this structure is passed around.  */

struct growable {
  char *base;                   /* start of the buffer */
  int size;                     /* allocated size, maintained by GROW */
  int tail;                     /* offset at which the next append lands */
};
1212
/* Ensure that the string can accept APPEND_SIZE more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size.  If the string is already large
   enough to take TAIL+APPEND_SIZE characters, this does nothing.

   Both arguments are parenthesized in the expansion so that an
   argument such as `a ? b : c' is added to the tail as a whole.  */
#define GROW(g, append_size) do {					\
  struct growable *G_ = (g);						\
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);	\
} while (0)

/* Return the tail position of the string, i.e. a pointer to the
   first character past the data appended so far.  */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters.  */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1227
/* Append the string STR to DEST.  NOTICE: the string in DEST is not
   zero-terminated by this function; only the strlen (STR) characters
   of STR are copied.  */

static void
append_string (const char *str, struct growable *dest)
{
  int l = strlen (str);
  GROW (dest, l);               /* make room for L more characters */
  memcpy (TAIL (dest), str, l);
  TAIL_INCR (dest, l);
}
1239
1240	/* Append CH to DEST. For example, append_char (0, DEST)
1241	zero-terminates DEST. */
1242
1243	static void
1244	append_char (char ch, struct growable *dest)
1245	{
1246	GROW (dest, 1);
1247	*TAIL (dest) = ch;
1248	TAIL_INCR (dest, 1);
1249	}
1250
/* Bit flags describing why a character is unsuitable in a file name.
   They are combined into a mask and looked up with FILE_CHAR_TEST.  */
enum {
  filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
  filechr_not_windows = 2,      /* unusable on Windows, one of \|/<>?:" */
  filechr_control     = 4       /* a control character, e.g. 0-31 */
};

/* Non-zero if character C has any of the flags in MASK set in
   filechr_table.  C is converted to unsigned char before indexing.  */
#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))

/* Shorthands for the table: */
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control

#define UW U|W
#define UWC U|W|C

/* Table of characters unsafe under various conditions (see above).

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.

   `|' is flagged as unusable on Windows, in agreement with the list
   in the comment on filechr_not_windows.

   NOTE(review): DEL (127) is not flagged as a control character --
   confirm whether that is intended.  */

static const unsigned char filechr_table[256] =
{
UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* x   y   z   {    |   }   ~   DEL */

  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 160-175 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 176-191 */

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 192-207 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 208-223 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 224-239 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 240-255 */
};
#undef U
#undef W
#undef C
#undef UW
#undef UWC
1308
/* FN_PORT_SEP separates host and port in file names generated for
   non-standard port numbers.  It is '+' when file names are
   restricted for Windows, which can't handle ':' in file names, and
   ':' otherwise, as in "www.xemacs.org:4001/index.html".  */
#define FN_PORT_SEP (opt.restrict_files_os == restrict_windows ? '+' : ':')

/* FN_QUERY_SEP separates the file name from the URL query.  It is
   '@' when file names are restricted for Windows, which cannot
   handle '?' as part of a file name, and '?' otherwise.  */
#define FN_QUERY_SEP (opt.restrict_files_os == restrict_windows ? '@' : '?')
1319
1320	/* Quote path element, characters in [b, e), as file name, and append
1321	the quoted string to DEST. Each character is quoted as per
1322	file_unsafe_char and the corresponding table.
1323
1324	If ESCAPED_P is non-zero, the path element is considered to be
1325	URL-escaped and will be unescaped prior to inspection. */
1326
1327	static void
1328	append_uri_pathel (const char b, const char e, int escaped_p,
1329	struct growable *dest)
1330	{
1331	const char *p;
1332	int quoted, outlen;
1333
1334	int mask;
1335	if (opt.restrict_files_os == restrict_unix)
1336	mask = filechr_not_unix;
1337	else
1338	mask = filechr_not_windows;
1339	if (opt.restrict_files_ctrl)
1340	mask \|= filechr_control;
1341
1342	/* Copy [b, e) to PATHEL and URL-unescape it. */
1343	if (escaped_p)
1344	{
1345	char *unescaped;
1346	BOUNDED_TO_ALLOCA (b, e, unescaped);
1347	url_unescape (unescaped);
1348	b = unescaped;
1349	e = unescaped + strlen (unescaped);
1350	}
1351
1352	/* Defang ".." when found as component of path. Remember that path
1353	comes from the URL and might contain malicious input. */
1354	if (e - b == 2 && b[0] == '.' && b[1] == '.')
1355	{
1356	b = "%2E%2E";
1357	e = b + 6;
1358	}
1359
1360	/* Walk the PATHEL string and check how many characters we'll need
1361	to quote. */
1362	quoted = 0;
1363	for (p = b; p < e; p++)
1364	if (FILE_CHAR_TEST (*p, mask))
1365	++quoted;
1366
1367	/* Calculate the length of the output string. e-b is the input
1368	string length. Each quoted char introduces two additional
1369	characters in the string, hence 2quoted. /
1370	outlen = (e - b) + (2 * quoted);
1371	GROW (dest, outlen);
1372
1373	if (!quoted)
1374	{
1375	/* If there's nothing to quote, we can simply append the string
1376	without processing it again. */
1377	memcpy (TAIL (dest), b, outlen);
1378	}
1379	else
1380	{
1381	char *q = TAIL (dest);
1382	for (p = b; p < e; p++)
1383	{
1384	if (!FILE_CHAR_TEST (*p, mask))
1385	q++ = p;
1386	else
1387	{
1388	unsigned char ch = *p;
1389	*q++ = '%';
1390	*q++ = XNUM_TO_DIGIT (ch >> 4);
1391	*q++ = XNUM_TO_DIGIT (ch & 0xf);
1392	}
1393	}
1394	assert (q - TAIL (dest) == outlen);
1395	}
1396	TAIL_INCR (dest, outlen);
1397	}
1398
1399	/* Append to DEST the directory structure that corresponds the
1400	directory part of URL's path. For example, if the URL is
1401	http://server/dir1/dir2/file, this appends "/dir1/dir2".
1402
1403	Each path element ("dir1" and "dir2" in the above example) is
1404	examined, url-unescaped, and re-escaped as file name element.
1405
1406	Additionally, it cuts as many directories from the path as
1407	specified by opt.cut_dirs. For example, if opt.cut_dirs is 1, it
1408	will produce "bar" for the above example. For 2 or more, it will
1409	produce "".
1410
1411	Each component of the path is quoted for use as file name. */
1412
1413	static void
1414	append_dir_structure (const struct url u, struct growable dest)
1415	{
1416	char pathel, next;
1417	int cut = opt.cut_dirs;
1418
1419	/* Go through the path components, de-URL-quote them, and quote them
1420	(if necessary) as file names. */
1421
1422	pathel = u->path;
1423	for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1424	{
1425	if (cut-- > 0)
1426	continue;
1427	if (pathel == next)
1428	/* Ignore empty pathels. */
1429	continue;
1430
1431	if (dest->tail)
1432	append_char ('/', dest);
1433	append_uri_pathel (pathel, next, 1, dest);
1434	}
1435	}
1436
1437	/* Return a unique file name that matches the given URL as good as
1438	possible. Does not create directories on the file system. */
1439
1440	char *
1441	url_file_name (const struct url *u)
1442	{
1443	struct growable fnres; /* stands for "file name result" */
1444
1445	const char u_file, u_query;
1446	char fname, unique;
1447
1448	fnres.base = NULL;
1449	fnres.size = 0;
1450	fnres.tail = 0;
1451
1452	/* Start with the directory prefix, if specified. */
1453	if (opt.dir_prefix)
1454	append_string (opt.dir_prefix, &fnres);
1455
1456	/* If "dirstruct" is turned on (typically the case with -r), add
1457	the host and port (unless those have been turned off) and
1458	directory structure. */
1459	if (opt.dirstruct)
1460	{
1461	if (opt.protocol_directories)
1462	{
1463	if (fnres.tail)
1464	append_char ('/', &fnres);
1465	append_string (supported_schemes[u->scheme].name, &fnres);
1466	}
1467	if (opt.add_hostdir)
1468	{
1469	if (fnres.tail)
1470	append_char ('/', &fnres);
1471	if (0 != strcmp (u->host, ".."))
1472	append_string (u->host, &fnres);
1473	else
1474	/* Host name can come from the network; malicious DNS may
1475	allow ".." to be resolved, causing us to write to
1476	"../<file>". Defang such host names. */
1477	append_string ("%2E%2E", &fnres);
1478	if (u->port != scheme_default_port (u->scheme))
1479	{
1480	char portstr[24];
1481	number_to_string (portstr, u->port);
1482	append_char (FN_PORT_SEP, &fnres);
1483	append_string (portstr, &fnres);
1484	}
1485	}
1486
1487	append_dir_structure (u, &fnres);
1488	}
1489
1490	/* Add the file name. */
1491	if (fnres.tail)
1492	append_char ('/', &fnres);
1493	u_file = *u->file ? u->file : "index.html";
1494	append_uri_pathel (u_file, u_file + strlen (u_file), 0, &fnres);
1495
1496	/* Append "?query" to the file name. */
1497	u_query = u->query && *u->query ? u->query : NULL;
1498	if (u_query)
1499	{
1500	append_char (FN_QUERY_SEP, &fnres);
1501	append_uri_pathel (u_query, u_query + strlen (u_query), 1, &fnres);
1502	}
1503
1504	/* Zero-terminate the file name. */
1505	append_char ('\0', &fnres);
1506
1507	fname = fnres.base;
1508
1509	/* Check the cases in which the unique extensions are not used:
1510	1) Clobbering is turned off (-nc).
1511	2) Retrieval with regetting.
1512	3) Timestamping is used.
1513	4) Hierarchy is built.
1514
1515	The exception is the case when file does exist and is a
1516	directory (see `mkalldirs' for explanation). */
1517
1518	if ((opt.noclobber \|\| opt.always_rest \|\| opt.timestamping \|\| opt.dirstruct)
1519	&& !(file_exists_p (fname) && !file_non_directory_p (fname)))
1520	return fname;
1521
1522	unique = unique_name (fname, 1);
1523	if (unique != fname)
1524	xfree (fname);
1525	return unique;
1526	}
1527
1528
/* Resolve "." and ".." elements of PATH by destructively modifying
   PATH and return non-zero if PATH has been modified, zero otherwise.

   The algorithm is in spirit similar to the one described in rfc1808,
   although implemented differently, in one pass.  To recap, path
   elements containing only "." are removed, and ".." is taken to mean
   "back up one element".  Single leading and trailing slashes are
   preserved.

   For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
   test examples are provided below.  If you change anything in this
   function, run test_path_simplify to make sure you haven't broken a
   test case.  */

static int
path_simplify (char *path)
{
  char *h = path;               /* hare: read position */
  char *t = path;               /* tortoise: write position */
  char *beg = path;             /* boundary for backing the tortoise */
  char *end = path + strlen (path);

  while (h < end)
    {
      /* Hare should be at the beginning of a path element. */

      if (h[0] == '.' && (h[1] == '/' || h[1] == '\0'))
        {
          /* Ignore "./". */
          h += 2;
        }
      else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0'))
        {
          /* Handle "../" by retreating the tortoise by one path
             element -- but not past beginning.  */
          if (t > beg)
            {
              /* Move backwards until T hits the beginning of the
                 previous path element or the beginning of path. */
              for (--t; t > beg && t[-1] != '/'; t--)
                ;
            }
          else
            {
              /* If we're at the beginning, copy the "../" literally
                 and move the beginning so a later ".." doesn't
                 remove it.  */
              beg = t + 3;
              goto regular;
            }
          h += 3;
        }
      else
        {
        regular:
          /* A regular path element.  If H hasn't advanced past T,
             simply skip to the next path element.  Otherwise, copy
             the path element until the next slash.  */
          if (t == h)
            {
              /* Skip the path element, including the slash.  */
              while (h < end && *h != '/')
                t++, h++;
              if (h < end)
                t++, h++;
            }
          else
            {
              /* Copy the path element, including the final slash.  */
              while (h < end && *h != '/')
                *t++ = *h++;
              if (h < end)
                *t++ = *h++;
            }
        }
    }

  /* T trails H only if something was dropped; terminate the
     shortened string in that case.  */
  if (t != h)
    *t = '\0';

  return t != h;
}
1611
1612
1613	/* Return the length of URL's path. Path is considered to be
1614	terminated by one of '?', ';', '#', or by the end of the
1615	string. */
1616
1617	static int
1618	path_length (const char *url)
1619	{
1620	const char *q = strpbrk_or_eos (url, "?;#");
1621	return q - url;
1622	}
1623
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  We might want to use memrchr (a GNU
   extension) under GNU libc.

   The scan decrements E before each comparison, so *B is examined
   and *E is not -- the half-open range stated above.  */

static const char *
find_last_char (const char *b, const char *e, char c)
{
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}
1636
/* Merge BASE with LINK and return the resulting URI as a newly
   allocated string.

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to reasonably handle all
   foreseeable cases.  It only employs minimal URL parsing, without
   knowledge of the specifics of schemes.

   I briefly considered making this function call path_simplify after
   the merging process, as rfc1738 seems to suggest.  This is a bad
   idea for several reasons: 1) it complexifies the code, and 2)
   url_parse has to simplify path anyway, so it's wasteful to boot.  */

char *
uri_merge (const char *base, const char *link)
{
  int linklength;
  const char *end;
  char *merge;

  /* A LINK that carries its own scheme replaces BASE entirely.  */
  if (url_has_scheme (link))
    return xstrdup (link);

  /* We may not examine BASE past END. */
  end = base + path_length (base);
  linklength = strlen (link);

  if (!*link)
    {
      /* Empty LINK points back to BASE, query string and all. */
      return xstrdup (base);
    }
  else if (*link == '?')
    {
      /* LINK points to the same location, but changes the query
         string.  Examples: */
      /* uri_merge("path",         "?new") -> "path?new"     */
      /* uri_merge("path?foo",     "?new") -> "path?new"     */
      /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
      /* uri_merge("path#foo",     "?new") -> "path?new"     */
      int baselength = end - base;
      merge = xmalloc (baselength + linklength + 1);
      memcpy (merge, base, baselength);
      memcpy (merge + baselength, link, linklength);
      merge[baselength + linklength] = '\0';
    }
  else if (*link == '#')
    {
      /* uri_merge("path",         "#new") -> "path#new"     */
      /* uri_merge("path#foo",     "#new") -> "path#new"     */
      /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
      /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
      int baselength;
      const char *end1 = strchr (base, '#');
      if (!end1)
        end1 = base + strlen (base);
      baselength = end1 - base;
      merge = xmalloc (baselength + linklength + 1);
      memcpy (merge, base, baselength);
      memcpy (merge + baselength, link, linklength);
      merge[baselength + linklength] = '\0';
    }
  else if (*link == '/' && *(link + 1) == '/')
    {
      /* LINK begins with "//" and so is a net path: we need to
         replace everything after (and including) the double slash
         with LINK. */

      /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
      /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
      /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */

      int span;
      const char *slash;
      const char *start_insert;

      /* Look for first slash. */
      slash = memchr (base, '/', end - base);
      /* If found slash and it is a double slash, then replace
         from this point, else default to replacing from the
         beginning.  */
      if (slash && *(slash + 1) == '/')
        start_insert = slash;
      else
        start_insert = base;

      span = start_insert - base;
      merge = (char *)xmalloc (span + linklength + 1);
      if (span)
        memcpy (merge, base, span);
      memcpy (merge + span, link, linklength);
      merge[span + linklength] = '\0';
    }
  else if (*link == '/')
    {
      /* LINK is an absolute path: we need to replace everything
         after (and including) the FIRST slash with LINK.

         So, if BASE is "http://host/whatever/foo/bar", and LINK is
         "/qux/xyzzy", our result should be
         "http://host/qux/xyzzy".  */
      int span;
      const char *slash;
      const char *start_insert = NULL; /* for gcc to shut up. */
      const char *pos = base;
      int seen_slash_slash = 0;
      /* We're looking for the first slash, but want to ignore
         double slash. */
    again:
      slash = memchr (pos, '/', end - pos);
      if (slash && !seen_slash_slash)
        if (*(slash + 1) == '/')
          {
            pos = slash + 2;
            seen_slash_slash = 1;
            goto again;
          }

      /* At this point, SLASH is the location of the first / after
         "//", or the first slash altogether.  START_INSERT is the
         pointer to the location where LINK will be inserted.  When
         examining the last two examples, keep in mind that LINK
         begins with '/'. */

      if (!slash && !seen_slash_slash)
        /* example: "foo" */
        /*           ^    */
        start_insert = base;
      else if (!slash && seen_slash_slash)
        /* example: "http://foo" */
        /*                     ^ */
        start_insert = end;
      else if (slash && !seen_slash_slash)
        /* example: "foo/bar" */
        /*           ^        */
        start_insert = base;
      else if (slash && seen_slash_slash)
        /* example: "http://something/" */
        /*                           ^  */
        start_insert = slash;

      span = start_insert - base;
      merge = (char *)xmalloc (span + linklength + 1);
      if (span)
        memcpy (merge, base, span);
      memcpy (merge + span, link, linklength);
      merge[span + linklength] = '\0';
    }
  else
    {
      /* LINK is a relative URL: we need to replace everything
         after last slash (possibly empty) with LINK.

         So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
         our result should be "whatever/foo/qux/xyzzy".  */
      int need_explicit_slash = 0;
      int span;
      const char *start_insert;
      const char *last_slash = find_last_char (base, end, '/');
      if (!last_slash)
        {
          /* No slash found at all.  Replace what we have with LINK. */
          start_insert = base;
        }
      else if (last_slash && last_slash >= base + 2
               && last_slash[-2] == ':' && last_slash[-1] == '/')
        {
          /* example: http://host"  */
          /*                      ^ */
          /* The last slash belongs to "://", so BASE has no path.
             Reserve one extra position past END for the '/' that is
             written below.  */
          start_insert = end + 1;
          need_explicit_slash = 1;
        }
      else
        {
          /* example: "whatever/foo/bar" */
          /*                        ^    */
          start_insert = last_slash + 1;
        }

      span = start_insert - base;
      merge = (char *)xmalloc (span + linklength + 1);
      if (span)
        memcpy (merge, base, span);
      if (need_explicit_slash)
        merge[span - 1] = '/';
      memcpy (merge + span, link, linklength);
      merge[span + linklength] = '\0';
    }

  return merge;
}
1827
1828
/* Copy the characters of string S (without its terminating zero) to
   the position P points at, then advance P past the copied text.  P
   must be an assignable pointer.  */
#define APPEND(p, s) do {			\
  int A_len = strlen (s);			\
  memcpy (p, s, A_len);				\
  p += A_len;					\
} while (0)
1834
/* Use this instead of password when the actual password is supposed
   to be hidden.  We intentionally use a generic string without giving
   away the number of characters in the password, like previous
   versions did.  The surrounding asterisks are part of the string.  */
#define HIDDEN_PASSWORD "*password*"
1840
1841	/* Recreate the URL string from the data in URL.
1842
1843	If HIDE is non-zero (as it is when we're calling this on a URL we
1844	plan to print, but not when calling it to canonicalize a URL for
1845	use within the program), password will be hidden. Unsafe
1846	characters in the URL will be quoted. */
1847
1848	char *
1849	url_string (const struct url *url, int hide_password)
1850	{
1851	int size;
1852	char result, p;
1853	char quoted_host, quoted_user = NULL, *quoted_passwd = NULL;
1854
1855	int scheme_port = supported_schemes[url->scheme].default_port;
1856	const char *scheme_str = supported_schemes[url->scheme].leading_string;
1857	int fplen = full_path_length (url);
1858
1859	int brackets_around_host;
1860
1861	assert (scheme_str != NULL);
1862
1863	/* Make sure the user name and password are quoted. */
1864	if (url->user)
1865	{
1866	quoted_user = url_escape_allow_passthrough (url->user);
1867	if (url->passwd)
1868	{
1869	if (hide_password)
1870	quoted_passwd = HIDDEN_PASSWORD;
1871	else
1872	quoted_passwd = url_escape_allow_passthrough (url->passwd);
1873	}
1874	}
1875
1876	/* In the unlikely event that the host name contains non-printable
1877	characters, quote it for displaying to the user. */
1878	quoted_host = url_escape_allow_passthrough (url->host);
1879
1880	/* Undo the quoting of colons that URL escaping performs. IPv6
1881	addresses may legally contain colons, and in that case must be
1882	placed in square brackets. */
1883	if (quoted_host != url->host)
1884	unescape_single_char (quoted_host, ':');
1885	brackets_around_host = strchr (quoted_host, ':') != NULL;
1886
1887	size = (strlen (scheme_str)
1888	+ strlen (quoted_host)
1889	+ (brackets_around_host ? 2 : 0)
1890	+ fplen
1891	+ 1);
1892	if (url->port != scheme_port)
1893	size += 1 + numdigit (url->port);
1894	if (quoted_user)
1895	{
1896	size += 1 + strlen (quoted_user);
1897	if (quoted_passwd)
1898	size += 1 + strlen (quoted_passwd);
1899	}
1900
1901	p = result = xmalloc (size);
1902
1903	APPEND (p, scheme_str);
1904	if (quoted_user)
1905	{
1906	APPEND (p, quoted_user);
1907	if (quoted_passwd)
1908	{
1909	*p++ = ':';
1910	APPEND (p, quoted_passwd);
1911	}
1912	*p++ = '@';
1913	}
1914
1915	if (brackets_around_host)
1916	*p++ = '[';
1917	APPEND (p, quoted_host);
1918	if (brackets_around_host)
1919	*p++ = ']';
1920	if (url->port != scheme_port)
1921	{
1922	*p++ = ':';
1923	p = number_to_string (p, url->port);
1924	}
1925
1926	full_path_write (url, p);
1927	p += fplen;
1928	*p++ = '\0';
1929
1930	assert (p - result == size);
1931
1932	if (quoted_user && quoted_user != url->user)
1933	xfree (quoted_user);
1934	if (quoted_passwd && !hide_password && quoted_passwd != url->passwd)
1935	xfree (quoted_passwd);
1936	if (quoted_host != url->host)
1937	xfree (quoted_host);
1938
1939	return result;
1940	}
1941
1942
1943	/* Return non-zero if scheme a is similar to scheme b.
1944
1945	Schemes are similar if they are equal. If SSL is supported, schemes
1946	are also similar if one is http (SCHEME_HTTP) and the other is https
1947	(SCHEME_HTTPS). */
1948	int
1949	schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
1950	{
1951	if (a == b)
1952	return 1;
1953	#ifdef HAVE_SSL
1954	if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
1955	\|\| (a == SCHEME_HTTPS && b == SCHEME_HTTP))
1956	return 1;
1957	#endif
1958	return 0;
1959	}
1960
1961
1962	#if 0
1963	/* Debugging and testing support for path_simplify. */
1964
/* Debug helper: simplify a copy of PATH with path_simplify and return
   that copy, leaving PATH itself untouched.  Handy for calling from
   the debugger.  The result comes from xstrdup.  */
static char *
ps (char *path)
{
  char *simplified = xstrdup (path);

  path_simplify (simplified);
  return simplified;
}
1974
/* Run path_simplify on a copy of TEST and print a message if the
   simplified string differs from EXPECTED_RESULT, or if the returned
   "modified" flag differs from EXPECTED_CHANGE.  Prints nothing when
   both match.  */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *test_copy = xstrdup (test);
  int modified = path_simplify (test_copy);

  if (0 != strcmp (test_copy, expected_result))
    {
      printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
              test, expected_result, test_copy);
    }
  if (modified != expected_change)
    {
      if (expected_change == 1)
        printf ("Expected modification with path_simplify(\"%s\").\n",
                test);
      else
        printf ("Expected no modification with path_simplify(\"%s\").\n",
                test);
    }
  xfree (test_copy);
}
1997
/* Feed a fixed table of inputs through run_test.  Each entry gives
   the input path, the expected simplified path, and whether
   path_simplify is expected to report a modification.  */
static void
test_path_simplify (void)
{
  static struct {
    char *test, *result;
    int should_modify;
  } tests[] = {
    { "",                       "",             0 },
    { ".",                      "",             1 },
    { "./",                     "",             1 },
    { "..",                     "..",           0 },
    { "../",                    "../",          0 },
    { "foo",                    "foo",          0 },
    { "foo/bar",                "foo/bar",      0 },
    { "foo///bar",              "foo///bar",    0 },
    { "foo/.",                  "foo/",         1 },
    { "foo/./",                 "foo/",         1 },
    { "foo./",                  "foo./",        0 },
    { "foo/../bar",             "bar",          1 },
    { "foo/../bar/",            "bar/",         1 },
    { "foo/bar/..",             "foo/",         1 },
    { "foo/bar/../x",           "foo/x",        1 },
    { "foo/bar/../x/",          "foo/x/",       1 },
    { "foo/..",                 "",             1 },
    { "foo/../..",              "..",           1 },
    { "foo/../../..",           "../..",        1 },
    { "foo/../../bar/../../baz", "../../baz",   1 },
    { "a/b/../../c",            "c",            1 },
    { "./a/../b",               "b",            1 }
  };
  int i;

  for (i = 0; i < countof (tests); i++)
    {
      char *test = tests[i].test;
      char *expected_result = tests[i].result;
      int   expected_change = tests[i].should_modify;
      run_test (test, expected_result, expected_change);
    }
}
2038	#endif

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/essentials/net-misc/wget/src/url.c

Download in other formats: