source: trunk/essentials/net-misc/wget/src/url.c

Last change on this file was 3440, checked in by bird, 18 years ago

wget 1.10.2

File size: 55.6 KB
Line 
1/* URL handling.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3
4This file is part of GNU Wget.
5
6GNU Wget is free software; you can redistribute it and/or modify
7it under the terms of the GNU General Public License as published by
8the Free Software Foundation; either version 2 of the License, or (at
9your option) any later version.
10
11GNU Wget is distributed in the hope that it will be useful,
12but WITHOUT ANY WARRANTY; without even the implied warranty of
13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14GNU General Public License for more details.
15
16You should have received a copy of the GNU General Public License
17along with Wget; if not, write to the Free Software
18Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19
20In addition, as a special exception, the Free Software Foundation
21gives permission to link the code of its release of Wget with the
22OpenSSL project's "OpenSSL" library (or with modified versions of it
23that use the same license as the "OpenSSL" library), and distribute
24the linked executables. You must obey the GNU General Public License
25in all respects for all of the code used other than "OpenSSL". If you
26modify this file, you may extend this exception to your version of the
27file, but you are not obligated to do so. If you do not wish to do
28so, delete this exception statement from your version. */
29
30#include <config.h>
31
32#include <stdio.h>
33#include <stdlib.h>
34#ifdef HAVE_STRING_H
35# include <string.h>
36#else
37# include <strings.h>
38#endif
39#include <sys/types.h>
40#ifdef HAVE_UNISTD_H
41# include <unistd.h>
42#endif
43#include <errno.h>
44#include <assert.h>
45
46#include "wget.h"
47#include "utils.h"
48#include "url.h"
49#include "host.h" /* for is_valid_ipv6_address */
50
51#ifndef errno
52extern int errno;
53#endif
54
/* Static description of one URL scheme; see supported_schemes.  */
struct scheme_data
{
  const char *name;		/* bare scheme name, e.g. "http" */
  const char *leading_string;	/* URL prefix compared by url_scheme,
				   e.g. "http://" */
  int default_port;		/* value returned by scheme_default_port */
  int enabled;			/* cleared by scheme_disable; url_scheme
				   reports a matching but disabled scheme
				   as SCHEME_INVALID */
};
62
/* Supported schemes.  url_scheme returns an index into this table
   cast to enum url_scheme, so the entries presumably have to stay in
   the same order as that enum (it is declared outside this file --
   verify against url.h before reordering).  */
static struct scheme_data supported_schemes[] =
{
  { "http",	"http://",  DEFAULT_HTTP_PORT,  1 },
#ifdef HAVE_SSL
  { "https",	"https://", DEFAULT_HTTPS_PORT, 1 },
#endif
  { "ftp",	"ftp://",   DEFAULT_FTP_PORT,   1 },

  /* SCHEME_INVALID.  The NULL leading_string also terminates the
     scan in url_scheme.  */
  { NULL,	NULL,       -1,                 0 }
};
75
/* Forward declarations: */

/* Called by url_parse on the freshly copied path; its non-zero
   return is treated there as "the path was changed".  The definition
   is outside this part of the file.  */
static int path_simplify PARAMS ((char *));
79
80
81/* Support for escaping and unescaping of URL strings. */
82
83/* Table of "reserved" and "unsafe" characters. Those terms are
84 rfc1738-speak, as such largely obsoleted by rfc2396 and later
85 specs, but the general idea remains.
86
87 A reserved character is the one that you can't decode without
88 changing the meaning of the URL. For example, you can't decode
89 "/foo/%2f/bar" into "/foo///bar" because the number and contents of
90 path components is different. Non-reserved characters can be
91 changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar". The
92 unsafe characters are loosely based on rfc1738, plus "$" and ",",
93 as recommended by rfc2396, and minus "~", which is very frequently
94 used (and sometimes unrecognized as %7E by broken servers).
95
96 An unsafe character is the one that should be encoded when URLs are
97 placed in foreign environments. E.g. space and newline are unsafe
98 in HTTP contexts because HTTP uses them as separator and line
99 terminator, so they must be encoded to %20 and %0A respectively.
100 "*" is unsafe in shell context, etc.
101
102 We determine whether a character is unsafe through static table
103 lookup. This code assumes ASCII character set and 8-bit chars. */
104
/* Bit flags stored in urlchr_table.  They are distinct bits so that
   one table entry can carry both (see the RU shorthand) and so that
   callers can test for either or both with a single mask.  */
enum {
  /* rfc1738 reserved chars + "$" and ",". */
  urlchr_reserved = 1,

  /* rfc1738 unsafe chars, plus non-printables. */
  urlchr_unsafe   = 2
};
112
/* Non-zero if the urlchr_table entry for C has any bit of MASK set.
   C is converted to unsigned char first, so a negative plain char
   cannot index outside the 256-entry table.  */
#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
/* Convenience tests for a single flag.  */
#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
116
/* Shorthands for the table; they are #undef'd right after it so the
   one-letter names do not leak into the rest of the file.  */
#define R  urlchr_reserved
#define U  urlchr_unsafe
#define RU R|U

/* Classification of every byte value, indexed by the character
   converted to unsigned char (see urlchr_test).  The first 128
   entries are annotated with the ASCII characters they describe; all
   128 entries for bytes 0x80-0xff are marked unsafe.  */
static const unsigned char urlchr_table[256] =
{
  U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  U,   U,  U,  0,  U,   /* x   y   z   {    |   }   ~   DEL */

  /* 0x80 - 0xbf */
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,

  /* 0xc0 - 0xff */
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
};
#undef R
#undef U
#undef RU
154
/* Decode the %HH escapes of S in place.

   Each '%' followed by two hexadecimal digits is replaced with the
   byte those digits denote.  A '%' that is not followed by two hex
   digits is kept literally, and so is "%00", because a decoded NUL
   would cut the C string short.

   The result is never longer than the input, so it is written over
   S itself; copy the string first if the original is still
   needed.  */

static void
url_unescape (char *s)
{
  char *dst = s;		/* write position, never ahead of SRC */
  char *src = s;		/* read position */

  while (*src)
    {
      char decoded = '\0';

      /* Only look at src[2] once src[1] is known not to be the
	 terminator.  */
      if (src[0] == '%' && src[1] && src[2]
	  && ISXDIGIT (src[1]) && ISXDIGIT (src[2]))
	decoded = X2DIGITS_TO_NUM (src[1], src[2]);

      if (decoded != '\0')
	{
	  /* Valid escape for a non-NUL byte: emit it and skip "%HH".  */
	  *dst++ = decoded;
	  src += 3;
	}
      else
	{
	  /* Ordinary character, malformed escape or "%00": copy the
	     current character unchanged.  */
	  *dst++ = *src++;
	}
    }
  *dst = '\0';
}
194
/* Worker behind the url_escape_* functions: replace every character
   of S whose urlchr_table entry matches MASK with its "%XX" form.

   When nothing needs escaping, S itself is returned if
   ALLOW_PASSTHROUGH is non-zero, and a fresh copy of S otherwise.
   Whenever something is escaped the result is freshly allocated.  */

static char *
url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
{
  const char *src;
  char *dst, *escaped;
  int extra = 0;
  int escaped_len;

  /* Pass 1: each escaped character grows from one byte to three.  */
  for (src = s; *src; src++)
    if (urlchr_test (*src, mask))
      extra += 2;

  if (extra == 0)
    {
      if (allow_passthrough)
	return (char *) s;
      return xstrdup (s);
    }

  /* SRC now rests on the terminator, so SRC - S is the input length.  */
  escaped_len = (src - s) + extra;
  escaped = (char *) xmalloc (escaped_len + 1);

  /* Pass 2: copy, expanding the matching characters.  */
  dst = escaped;
  for (src = s; *src; src++)
    {
      if (urlchr_test (*src, mask))
	{
	  unsigned char c = *src;
	  *dst++ = '%';
	  *dst++ = XNUM_TO_DIGIT (c >> 4);
	  *dst++ = XNUM_TO_DIGIT (c & 0xf);
	}
      else
	*dst++ = *src;
    }
  assert (dst - escaped == escaped_len);
  *dst = '\0';

  return escaped;
}
240
/* URL-escape the unsafe characters (see urlchr_table) in a given
   string, returning a freshly allocated string.  Passthrough is
   disabled, so the result is a new allocation even when S contains
   nothing to escape.  */

char *
url_escape (const char *s)
{
  return url_escape_1 (s, urlchr_unsafe, 0);
}
249
/* URL-escape the unsafe characters (see urlchr_table) in a given
   string.  If no characters are unsafe, S itself is returned (with
   const cast away); otherwise the result is freshly allocated.  A
   caller can therefore compare the result with S to learn whether
   there is anything to free.  */

static char *
url_escape_allow_passthrough (const char *s)
{
  return url_escape_1 (s, urlchr_unsafe, 1);
}
258
259
/* Tell whether the character P points at has to be written as %XX.
   A pointer is taken rather than a character because a '%' can only
   be judged by looking at the two characters that follow it.

   Return 1 if the character must be escaped, 0 if it can be copied
   as-is.  */

static inline int
char_needs_escaping (const char *p)
{
  if (*p != '%')
    /* Escape what is unsafe, unless it is also reserved.  */
    return (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p)) ? 1 : 0;

  /* A '%' that begins a well-formed %XX escape stays; a stray '%'
     must itself be encoded.  */
  return !(ISXDIGIT (p[1]) && ISXDIGIT (p[2]));
}
282
283/* Translate a %-escaped (but possibly non-conformant) input string S
284 into a %-escaped (and conformant) output string. If no characters
285 are encoded or decoded, return the same string S; otherwise, return
286 a freshly allocated string with the new contents.
287
288 After a URL has been run through this function, the protocols that
289 use `%' as the quote character can use the resulting string as-is,
290 while those that don't can use url_unescape to get to the intended
291 data. This function is stable: once the input is transformed,
292 further transformations of the result yield the same output.
293
294 Let's discuss why this function is needed.
295
296 Imagine Wget is asked to retrieve `http://abc.xyz/abc def'. Since
297 a raw space character would mess up the HTTP request, it needs to
298 be quoted, like this:
299
300 GET /abc%20def HTTP/1.0
301
302 It would appear that the unsafe chars need to be quoted, for
303 example with url_escape. But what if we're requested to download
304 `abc%20def'? url_escape transforms "%" to "%25", which would leave
305 us with `abc%2520def'. This is incorrect -- since %-escapes are
306 part of URL syntax, "%20" is the correct way to denote a literal
307 space on the Wget command line. This leads to the conclusion that
308 in that case Wget should not call url_escape, but leave the `%20'
309 as is. This is clearly contradictory, but it only gets worse.
310
311 What if the requested URI is `abc%20 def'? If we call url_escape,
312 we end up with `/abc%2520%20def', which is almost certainly not
313 intended. If we don't call url_escape, we are left with the
314 embedded space and cannot complete the request. What the user
315 meant was for Wget to request `/abc%20%20def', and this is where
316 reencode_escapes kicks in.
317
318 Wget used to solve this by first decoding %-quotes, and then
319 encoding all the "unsafe" characters found in the resulting string.
320 This was wrong because it didn't preserve certain URL special
321 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
322 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
323 whether we considered `+' reserved (it is). One of these results
324 is inevitable because by the second step we would lose information
325 on whether the `+' was originally encoded or not. Both results
326 were wrong because in CGI parameters + means space, while %2B means
327 literal plus. reencode_escapes correctly translates the above to
328 "a%2B+b", i.e. returns the original string.
329
330 This function uses a modified version of the algorithm originally
331 proposed by Anon Sricharoenchai:
332
333 * Encode all "unsafe" characters, except those that are also
334 "reserved", to %XX. See urlchr_table for which characters are
335 unsafe and reserved.
336
337 * Encode the "%" characters not followed by two hex digits to
338 "%25".
339
340 * Pass through all other characters and %XX escapes as-is. (Up to
341 Wget 1.10 this decoded %XX escapes corresponding to "safe"
342 characters, but that was obtrusive and broke some servers.)
343
344 Anon's test case:
345
346 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
347 ->
348 "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"
349
350 Simpler test cases:
351
352 "foo bar" -> "foo%20bar"
353 "foo%20bar" -> "foo%20bar"
354 "foo %20bar" -> "foo%20%20bar"
355 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
356 "foo%25%20bar" -> "foo%25%20bar"
357 "foo%2%20bar" -> "foo%252%20bar"
358 "foo+bar" -> "foo+bar" (plus is reserved!)
359 "foo%2b+bar" -> "foo%2b+bar" */
360
static char *
reencode_escapes (const char *s)
{
  const char *src;
  char *out, *dst;
  int in_len, out_len;
  int n_escapes = 0;

  /* Pass 1: count the characters that must become %XX.  SRC ends up
     on the terminating NUL, which gives us the input length.  */
  for (src = s; *src; src++)
    if (char_needs_escaping (src))
      n_escapes++;

  /* Nothing to encode: hand back the caller's own string (const is
     cast away because the return type is shared with the allocating
     case).  */
  if (n_escapes == 0)
    return (char *) s;

  /* One byte turns into three for every encoded character.  */
  in_len = src - s;
  out_len = in_len + 2 * n_escapes;
  out = xmalloc (out_len + 1);

  /* Pass 2: copy, expanding as we go.  */
  dst = out;
  for (src = s; *src; src++)
    {
      if (char_needs_escaping (src))
	{
	  unsigned char c = *src;
	  *dst++ = '%';
	  *dst++ = XNUM_TO_DIGIT (c >> 4);
	  *dst++ = XNUM_TO_DIGIT (c & 0xf);
	}
      else
	*dst++ = *src;
    }

  assert (dst - out == out_len);
  *dst = '\0';
  return out;
}
405
406
407/* Returns the scheme type if the scheme is supported, or
408 SCHEME_INVALID if not. */
409
410enum url_scheme
411url_scheme (const char *url)
412{
413 int i;
414
415 for (i = 0; supported_schemes[i].leading_string; i++)
416 if (0 == strncasecmp (url, supported_schemes[i].leading_string,
417 strlen (supported_schemes[i].leading_string)))
418 {
419 if (supported_schemes[i].enabled)
420 return (enum url_scheme) i;
421 else
422 return SCHEME_INVALID;
423 }
424
425 return SCHEME_INVALID;
426}
427
/* Non-zero if CH may appear in a scheme name as recognized by
   url_has_scheme: an alphanumeric character, '-' or '+'.  CH is
   evaluated more than once, so it must be free of side effects.  */
#define SCHEME_CHAR(ch) (ISALNUM (ch) || (ch) == '-' || (ch) == '+')
429
/* Return 1 if URL starts with something shaped like a scheme, i.e.
   matches [-+a-zA-Z0-9]+: , and 0 otherwise.  The scheme does not
   have to be one that Wget supports.  */

int
url_has_scheme (const char *url)
{
  const char *p;

  /* Skip the leading run of scheme characters, if any.  */
  for (p = url; *p && SCHEME_CHAR (*p); p++)
    ;

  /* The run must be non-empty and must be terminated by ':'.  */
  return p != url && *p == ':';
}
449
/* Return the default port recorded for SCHEME in supported_schemes.
   SCHEME is used directly as the table index, without a range check;
   for the terminating SCHEME_INVALID entry the stored port is -1.  */
int
scheme_default_port (enum url_scheme scheme)
{
  return supported_schemes[scheme].default_port;
}
455
/* Mark SCHEME as disabled.  From then on url_scheme returns
   SCHEME_INVALID for URLs that use it.  SCHEME is used directly as
   the table index, without a range check.  */
void
scheme_disable (enum url_scheme scheme)
{
  supported_schemes[scheme].enabled = 0;
}
461
/* Step over a leading "user[:password]@" part.  URL must be the
   portion that follows the scheme, not the complete URL.

   Credentials are considered present only when an '@' occurs before
   any of the terminators '/', '?', '#' and ';'.  Return a pointer to
   the character after that '@', or URL itself when there are no
   credentials.  */

static const char *
url_skip_credentials (const char *url)
{
  /* Whichever of these characters comes first decides.  */
  const char *stop = (const char *) strpbrk (url, "@/?#;");

  return (stop != NULL && *stop == '@') ? stop + 1 : url;
}
478
/* Split the credentials found in [BEG, END) into user name and
   password.  The region comes straight from a URL, so both parts are
   URL-unescaped after being copied.

   On success return 1 and store freshly allocated strings in *USER
   and *PASSWD; *PASSWD is set to NULL when the region contains no
   ':'.  Return 0, leaving both untouched, when the user name is
   empty.  */

static int
parse_credentials (const char *beg, const char *end, char **user, char **passwd)
{
  const char *user_end;

  if (beg == end)
    return 0;			/* nothing at all before the '@' */

  /* After this, USER_END is either the first ':' or NULL.  */
  user_end = memchr (beg, ':', end - beg);
  if (user_end == beg)
    return 0;			/* ":password" with no user name */

  if (user_end == NULL)
    {
      /* No password given; the whole region is the user name.  */
      *passwd = NULL;
      user_end = end;
    }
  else
    {
      *passwd = strdupdelim (user_end + 1, end);
      url_unescape (*passwd);
    }

  *user = strdupdelim (beg, user_end);
  url_unescape (*user);
  return 1;
}
510
511/* Used by main.c: detect URLs written using the "shorthand" URL forms
512 popularized by Netscape and NcFTP. HTTP shorthands look like this:
513
514 www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
515 www.foo.com[:port] -> http://www.foo.com[:port]
516
517 FTP shorthands look like this:
518
519 foo.bar.com:dir/file -> ftp://foo.bar.com/dir/file
520 foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file
521
522 If the URL needs not or cannot be rewritten, return NULL. */
523
524char *
525rewrite_shorthand_url (const char *url)
526{
527 const char *p;
528
529 if (url_scheme (url) != SCHEME_INVALID)
530 return NULL;
531
532 /* Look for a ':' or '/'. The former signifies NcFTP syntax, the
533 latter Netscape. */
534 for (p = url; *p && *p != ':' && *p != '/'; p++)
535 ;
536
537 if (p == url)
538 return NULL;
539
540 /* If we're looking at "://", it means the URL uses a scheme we
541 don't support, which may include "https" when compiled without
542 SSL support. Don't bogusly rewrite such URLs. */
543 if (p[0] == ':' && p[1] == '/' && p[2] == '/')
544 return NULL;
545
546 if (*p == ':')
547 {
548 const char *pp;
549 char *res;
550 /* If the characters after the colon and before the next slash
551 or end of string are all digits, it's HTTP. */
552 int digits = 0;
553 for (pp = p + 1; ISDIGIT (*pp); pp++)
554 ++digits;
555 if (digits > 0 && (*pp == '/' || *pp == '\0'))
556 goto http;
557
558 /* Prepend "ftp://" to the entire URL... */
559 res = xmalloc (6 + strlen (url) + 1);
560 sprintf (res, "ftp://%s", url);
561 /* ...and replace ':' with '/'. */
562 res[6 + (p - url)] = '/';
563 return res;
564 }
565 else
566 {
567 char *res;
568 http:
569 /* Just prepend "http://" to what we have. */
570 res = xmalloc (7 + strlen (url) + 1);
571 sprintf (res, "http://%s", url);
572 return res;
573 }
574}
575
576
/* Declared ahead of url_parse, which calls it; the definition comes
   later in the file.  */
static void split_path PARAMS ((const char *, char **, char **));
578
/* strpbrk variant that never returns NULL: when S contains none of
   the characters in ACCEPT, the result points at the terminating
   zero (end-of-string aka "eos") of S instead.

   Although Gcc-specific optimizations are normally best avoided, one
   probably makes sense here: glibc has optimizations that detect
   strpbrk being called with a literal string as ACCEPT and inline
   the search.  That optimization is defeated if strpbrk is hidden
   within the call to another function.  (And no, making
   strpbrk_or_eos inline doesn't help because the check for literal
   accept is in the preprocessor.)  Hence the macro form, which
   evaluates S a second time when nothing matches.  */

#if defined(__GNUC__) && __GNUC__ >= 3

#define strpbrk_or_eos(s, accept) ({		\
  char *SOE_hit = strpbrk (s, accept);		\
  SOE_hit ? SOE_hit : strchr (s, '\0');		\
})

#else  /* not __GNUC__ or old gcc */

static inline char *
strpbrk_or_eos (const char *s, const char *accept)
{
  char *hit = strpbrk (s, accept);
  return hit ? hit : strchr (s, '\0');
}
#endif /* not __GNUC__ or old gcc */
611
/* Convert the upper-case characters of STR to lower case, in place.
   Return non-zero if at least one character was changed, zero if STR
   was left as it was.  */

static int
lowercase_str (char *str)
{
  int changed = 0;
  char *p = str;

  while (*p)
    {
      if (ISUPPER (*p))
	{
	  *p = TOLOWER (*p);
	  changed = 1;
	}
      ++p;
    }
  return changed;
}
627
/* Untranslated messages for the error codes url_parse can report.
   Each PE_* code is defined next to its message and is that
   message's index in the array, so the two must be kept in step
   when an entry is added.  url_error looks the message up and
   translates it.  */
static const char *parse_errors[] = {
#define PE_NO_ERROR			0
  N_("No error"),
#define PE_UNSUPPORTED_SCHEME		1
  N_("Unsupported scheme"),
#define PE_INVALID_HOST_NAME		2
  N_("Invalid host name"),
#define PE_BAD_PORT_NUMBER		3
  N_("Bad port number"),
#define PE_INVALID_USER_NAME		4
  N_("Invalid user name"),
#define PE_UNTERMINATED_IPV6_ADDRESS	5
  N_("Unterminated IPv6 numeric address"),
#define PE_IPV6_NOT_SUPPORTED		6
  N_("IPv6 addresses not supported"),
#define PE_INVALID_IPV6_ADDRESS		7
  N_("Invalid IPv6 numeric address")
};
646
/* Parse a URL.

   URL is classified with url_scheme and run through
   reencode_escapes; the result is then scanned left to right and
   split into credentials, host, port, path, params, query and
   fragment.

   Return a new struct url if successful, NULL on error.  In case of
   error, and if ERROR is not NULL, also set *ERROR to the appropriate
   error code (one of the PE_* values; see url_error).  *ERROR is not
   written on success.  */
struct url *
url_parse (const char *url, int *error)
{
  struct url *u;
  const char *p;
  int path_modified, host_modified;

  enum url_scheme scheme;

  /* Each component is remembered as a [begin, end) pair of pointers
     into URL_ENCODED.  */
  const char *uname_b,     *uname_e;
  const char *host_b,      *host_e;
  const char *path_b,      *path_e;
  const char *params_b,    *params_e;
  const char *query_b,     *query_e;
  const char *fragment_b,  *fragment_e;

  int port;
  char *user = NULL, *passwd = NULL;

  /* Either URL itself or a fresh allocation; see reencode_escapes.
     It stays NULL until then so the error path knows there is
     nothing to free.  */
  char *url_encoded = NULL;

  int error_code;

  scheme = url_scheme (url);
  if (scheme == SCHEME_INVALID)
    {
      error_code = PE_UNSUPPORTED_SCHEME;
      goto err;
    }

  url_encoded = reencode_escapes (url);
  p = url_encoded;

  /* Step over "scheme://"; url_scheme has already matched it.  */
  p += strlen (supported_schemes[scheme].leading_string);
  uname_b = p;
  p = url_skip_credentials (p);
  uname_e = p;

  /* scheme://user:pass@host[:port]... */
  /*                    ^              */

  /* We attempt to break down the URL into the components path,
     params, query, and fragment.  They are ordered like this:

       scheme://host[:port][/path][;params][?query][#fragment]  */

  /* The optional components stay NULL unless found below.  */
  params_b   = params_e   = NULL;
  query_b    = query_e    = NULL;
  fragment_b = fragment_e = NULL;

  host_b = p;

  if (*p == '[')
    {
      /* Handle IPv6 address inside square brackets.  Ideally we'd
	 just look for the terminating ']', but rfc2732 mandates
	 rejecting invalid IPv6 addresses.  */

      /* The address begins after '['. */
      host_b = p + 1;
      host_e = strchr (host_b, ']');

      if (!host_e)
	{
	  error_code = PE_UNTERMINATED_IPV6_ADDRESS;
	  goto err;
	}

#ifdef ENABLE_IPV6
      /* Check if the IPv6 address is valid. */
      if (!is_valid_ipv6_address(host_b, host_e))
	{
	  error_code = PE_INVALID_IPV6_ADDRESS;
	  goto err;
	}

      /* Continue parsing after the closing ']'. */
      p = host_e + 1;
#else
      /* Without IPv6 support every bracketed host is rejected; the
	 check that follows is never reached in this configuration.  */
      error_code = PE_IPV6_NOT_SUPPORTED;
      goto err;
#endif

      /* The closing bracket must be followed by a separator or by the
	 null char.  (strchr also matches the terminator of its first
	 argument, so *p == '\0' passes this test.)  */
      /* http://[::1]... */
      /*             ^   */
      if (!strchr (":/;?#", *p))
	{
	  /* Trailing garbage after []-delimited IPv6 address. */
	  error_code = PE_INVALID_HOST_NAME;
	  goto err;
	}
    }
  else
    {
      /* Ordinary host: it runs up to the first separator or to the
	 end of the string.  */
      p = strpbrk_or_eos (p, ":/;?#");
      host_e = p;
    }

  /* An empty host name is never valid.  */
  if (host_b == host_e)
    {
      error_code = PE_INVALID_HOST_NAME;
      goto err;
    }

  /* Start from the scheme's default; an explicit port overrides it.  */
  port = scheme_default_port (scheme);
  if (*p == ':')
    {
      const char *port_b, *port_e, *pp;

      /* scheme://host:port/tralala */
      /*              ^             */
      ++p;
      port_b = p;
      p = strpbrk_or_eos (p, "/;?#");
      port_e = p;

      /* Allow empty port, as per rfc2396. */
      if (port_b != port_e)
	{
	  for (port = 0, pp = port_b; pp < port_e; pp++)
	    {
	      if (!ISDIGIT (*pp))
		{
		  /* http://host:12randomgarbage/blah */
		  /*               ^                  */
		  error_code = PE_BAD_PORT_NUMBER;
		  goto err;
		}
	      port = 10 * port + (*pp - '0');
	      /* Check for too large port numbers here, before we have
		 a chance to overflow on bogus port values.  */
	      if (port > 65535)
		{
		  error_code = PE_BAD_PORT_NUMBER;
		  goto err;
		}
	    }
	}
    }

  if (*p == '/')
    {
      /* The stored path does not include the leading slash.  */
      ++p;
      path_b = p;
      p = strpbrk_or_eos (p, ";?#");
      path_e = p;
    }
  else
    {
      /* No '/' after the host.  A path must still exist, so record
	 an empty one; the path_b == path_e test below then forces
	 u->url to be rebuilt.  */
      path_b = path_e = p;
    }

  if (*p == ';')
    {
      ++p;
      params_b = p;
      p = strpbrk_or_eos (p, "?#");
      params_e = p;
    }
  if (*p == '?')
    {
      ++p;
      query_b = p;
      p = strpbrk_or_eos (p, "#");
      query_e = p;

      /* Hack that allows users to use '?' (a wildcard character) in
	 FTP URLs without it being interpreted as a query string
	 delimiter.  The path is simply extended to cover what would
	 have been the query.  */
      if (scheme == SCHEME_FTP)
	{
	  query_b = query_e = NULL;
	  path_e = p;
	}
    }
  if (*p == '#')
    {
      /* The fragment is everything up to the end of the string.  */
      ++p;
      fragment_b = p;
      p += strlen (p);
      fragment_e = p;
    }
  /* Every branch above consumes up to a separator that the next one
     handles, so by now the whole string has been read.  */
  assert (*p == 0);

  if (uname_b != uname_e)
    {
      /* http://user:pass@host */
      /*        ^         ^    */
      /*     uname_b   uname_e */
      /* uname_e points just past the '@', hence the - 1.  */
      if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
	{
	  error_code = PE_INVALID_USER_NAME;
	  goto err;
	}
    }

  /* No failure is possible past this point, so it is safe to start
     allocating the result.  */
  u = xnew0 (struct url);
  u->scheme = scheme;
  u->host   = strdupdelim (host_b, host_e);
  u->port   = port;
  u->user   = user;
  u->passwd = passwd;

  u->path = strdupdelim (path_b, path_e);
  path_modified = path_simplify (u->path);
  split_path (u->path, &u->dir, &u->file);

  host_modified = lowercase_str (u->host);

  /* Decode %HH sequences in host name.  This is important not so much
     to support %HH sequences in host names (which other browsers
     don't), but to support binary characters (which will have been
     converted to %HH by reencode_escapes).  */
  if (strchr (u->host, '%'))
    {
      url_unescape (u->host);
      host_modified = 1;
    }

  if (params_b)
    u->params = strdupdelim (params_b, params_e);
  if (query_b)
    u->query = strdupdelim (query_b, query_e);
  if (fragment_b)
    u->fragment = strdupdelim (fragment_b, fragment_e);

  if (path_modified || u->fragment || host_modified || path_b == path_e)
    {
      /* If we suspect that a transformation has rendered what
	 url_string might return different from URL_ENCODED, rebuild
	 u->url using url_string.  */
      u->url = url_string (u, 0);

      /* URL_ENCODED is no longer needed; free it unless it is the
	 caller's own string.  */
      if (url_encoded != url)
	xfree ((char *) url_encoded);
    }
  else
    {
      /* URL_ENCODED is usable as u->url.  Copy it when it is the
	 caller's string, otherwise hand the allocation over to U.  */
      if (url_encoded == url)
	u->url = xstrdup (url);
      else
	u->url = url_encoded;
    }

  return u;

 err:
  /* Cleanup in case of error: */
  if (url_encoded && url_encoded != url)
    xfree (url_encoded);

  /* Transmit the error code to the caller, if the caller wants to
     know.  */
  if (error)
    *error = error_code;
  return NULL;
}
912
/* Return the error message string from ERROR_CODE, which should have
   been retrieved from url_parse.  The error message is translated.
   ERROR_CODE must be a valid index into parse_errors; this is only
   checked by an assertion.  */

const char *
url_error (int error_code)
{
  assert (error_code >= 0 && error_code < countof (parse_errors));
  return _(parse_errors[error_code]);
}
922
/* Break PATH, a URL-escaped path taken from a URL, into its
   directory and file parts.

   The directory is everything before the last slash and the file is
   everything after it; a path without a slash has an empty
   directory.  Both parts are unescaped after the split, so an
   escaped slash ends up inside the file name.  Examples:

   PATH                 DIR           FILE
   "foo/bar/baz"        "foo/bar"     "baz"
   "foo/bar/"           "foo/bar"     ""
   "foo"                ""            "foo"
   "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux"    (!)

   *DIR and *FILE receive freshly allocated strings.  */

static void
split_path (const char *path, char **dir, char **file)
{
  char *slash = strrchr (path, '/');

  if (slash)
    {
      *dir = strdupdelim (path, slash);
      *file = xstrdup (slash + 1);
    }
  else
    {
      *dir = xstrdup ("");
      *file = xstrdup (path);
    }

  /* Unescape only now, so that %2f cannot influence the split.  */
  url_unescape (*dir);
  url_unescape (*file);
}
955
956/* Note: URL's "full path" is the path with the query string and
957 params appended. The "fragment" (#foo) is intentionally ignored,
958 but that might be changed. For example, if the original URL was
959 "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
960 the full path will be "/foo/bar/baz;bullshit?querystring". */
961
962/* Return the length of the full path, without the terminating
963 zero. */
964
965static int
966full_path_length (const struct url *url)
967{
968 int len = 0;
969
970#define FROB(el) if (url->el) len += 1 + strlen (url->el)
971
972 FROB (path);
973 FROB (params);
974 FROB (query);
975
976#undef FROB
977
978 return len;
979}
980
981/* Write out the full path. */
982
983static void
984full_path_write (const struct url *url, char *where)
985{
986#define FROB(el, chr) do { \
987 char *f_el = url->el; \
988 if (f_el) { \
989 int l = strlen (f_el); \
990 *where++ = chr; \
991 memcpy (where, f_el, l); \
992 where += l; \
993 } \
994} while (0)
995
996 FROB (path, '/');
997 FROB (params, ';');
998 FROB (query, '?');
999
1000#undef FROB
1001}
1002
/* Return the "full path" of URL as a freshly allocated string.  For
   example, with u->path "foo/bar" and u->query "param=value" the
   result is "/foo/bar?param=value".  */

char *
url_full_path (const struct url *url)
{
  int len = full_path_length (url);
  char *buf = (char *) xmalloc (len + 1);

  /* full_path_write fills exactly LEN bytes and does not terminate
     the string, so the terminator is placed here.  */
  buf[len] = '\0';
  full_path_write (url, buf);

  return buf;
}
1018
/* Undo the %XX escaping of the single character CHR in STR, in
   place, leaving all other escapes alone.  Used to selectively
   unescape certain characters, such as "/" and ":".

   Only the hex digits produced by XNUM_TO_DIGIT are recognized, i.e.
   the spelling url_escape_1 itself emits.  Nothing is returned.  */

static void
unescape_single_char (char *str, char chr)
{
  const char hi = XNUM_TO_DIGIT (chr >> 4);
  const char lo = XNUM_TO_DIGIT (chr & 0xf);
  char *src = str;		/* read position */
  char *dst = str;		/* write position, never ahead of SRC */

  while (*src)
    {
      /* HI is never '\0', so src[2] is only examined when src[1] is
	 inside the string.  */
      if (src[0] == '%' && src[1] == hi && src[2] == lo)
	{
	  *dst++ = chr;
	  src += 3;
	}
      else
	*dst++ = *src++;
    }
  *dst = '\0';
}
1042
1043/* Escape unsafe and reserved characters, except for the slash
1044 characters. */
1045
1046static char *
1047url_escape_dir (const char *dir)
1048{
1049 char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1050 if (newdir == dir)
1051 return (char *)dir;
1052
1053 unescape_single_char (newdir, '/');
1054 return newdir;
1055}
1056
1057/* Sync u->path and u->url with u->dir and u->file. Called after
1058 u->file or u->dir have been changed, typically by the FTP code. */
1059
1060static void
1061sync_path (struct url *u)
1062{
1063 char *newpath, *efile, *edir;
1064
1065 xfree (u->path);
1066
1067 /* u->dir and u->file are not escaped. URL-escape them before
1068 reassembling them into u->path. That way, if they contain
1069 separators like '?' or even if u->file contains slashes, the
1070 path will be correctly assembled. (u->file can contain slashes
1071 if the URL specifies it with %2f, or if an FTP server returns
1072 it.) */
1073 edir = url_escape_dir (u->dir);
1074 efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1075
1076 if (!*edir)
1077 newpath = xstrdup (efile);
1078 else
1079 {
1080 int dirlen = strlen (edir);
1081 int filelen = strlen (efile);
1082
1083 /* Copy "DIR/FILE" to newpath. */
1084 char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1085 memcpy (p, edir, dirlen);
1086 p += dirlen;
1087 *p++ = '/';
1088 memcpy (p, efile, filelen);
1089 p += filelen;
1090 *p = '\0';
1091 }
1092
1093 u->path = newpath;
1094
1095 if (edir != u->dir)
1096 xfree (edir);
1097 if (efile != u->file)
1098 xfree (efile);
1099
1100 /* Regenerate u->url as well. */
1101 xfree (u->url);
1102 u->url = url_string (u, 0);
1103}
1104
1105/* Mutators. Code in ftp.c insists on changing u->dir and u->file.
1106 This way we can sync u->path and u->url when they get changed. */
1107
1108void
1109url_set_dir (struct url *url, const char *newdir)
1110{
1111 xfree (url->dir);
1112 url->dir = xstrdup (newdir);
1113 sync_path (url);
1114}
1115
1116void
1117url_set_file (struct url *url, const char *newfile)
1118{
1119 xfree (url->file);
1120 url->file = xstrdup (newfile);
1121 sync_path (url);
1122}
1123
1124void
1125url_free (struct url *url)
1126{
1127 xfree (url->host);
1128 xfree (url->path);
1129 xfree (url->url);
1130
1131 xfree_null (url->params);
1132 xfree_null (url->query);
1133 xfree_null (url->fragment);
1134 xfree_null (url->user);
1135 xfree_null (url->passwd);
1136
1137 xfree (url->dir);
1138 xfree (url->file);
1139
1140 xfree (url);
1141}
1142
1143
1144/* Create all the necessary directories for PATH (a file). Calls
1145 make_directory internally. */
1146int
1147mkalldirs (const char *path)
1148{
1149 const char *p;
1150 char *t;
1151 struct_stat st;
1152 int res;
1153
1154 p = path + strlen (path);
1155 for (; *p != '/' && p != path; p--)
1156 ;
1157
1158 /* Don't create if it's just a file. */
1159 if ((p == path) && (*p != '/'))
1160 return 0;
1161 t = strdupdelim (path, p);
1162
1163 /* Check whether the directory exists. */
1164 if ((stat (t, &st) == 0))
1165 {
1166 if (S_ISDIR (st.st_mode))
1167 {
1168 xfree (t);
1169 return 0;
1170 }
1171 else
1172 {
1173 /* If the dir exists as a file name, remove it first. This
1174 is *only* for Wget to work with buggy old CERN http
1175 servers. Here is the scenario: When Wget tries to
1176 retrieve a directory without a slash, e.g.
1177 http://foo/bar (bar being a directory), CERN server will
1178 not redirect it too http://foo/bar/ -- it will generate a
1179 directory listing containing links to bar/file1,
1180 bar/file2, etc. Wget will lose because it saves this
1181 HTML listing to a file `bar', so it cannot create the
1182 directory. To work around this, if the file of the same
1183 name exists, we just remove it and create the directory
1184 anyway. */
1185 DEBUGP (("Removing %s because of directory danger!\n", t));
1186 unlink (t);
1187 }
1188 }
1189 res = make_directory (t);
1190 if (res != 0)
1191 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1192 xfree (t);
1193 return res;
1194}
1195
1196
1197/* Functions for constructing the file name out of URL components. */
1198
/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have a convenient and efficient way to construct a
   string by having various functions append data to it.  Instead of
   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
   functions in question, we pass the pointer to this struct.  */

struct growable {
  char *base;			/* start of the buffer (NULL when empty) */
  int size;			/* size value maintained by DO_REALLOC */
  int tail;			/* offset of the next character to write */
};

/* Ensure that the string can accept APPEND_SIZE more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size.  If the string is already large
   enough to take TAIL+APPEND_SIZE characters, this does nothing.

   Both arguments are parenthesized in the expansion so that any
   expression (e.g. a conditional expression) can be passed safely.  */
#define GROW(g, append_size) do {					\
  struct growable *G_ = (g);						\
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);	\
} while (0)

/* Return the tail position of the string.  */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters.  */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1227
/* Copy the characters of STR (without its terminating NUL) to the
   tail of DEST, growing DEST first.  NOTICE: DEST is left without a
   terminator.  */

static void
append_string (const char *str, struct growable *dest)
{
  const int count = strlen (str);
  char *out;

  GROW (dest, count);
  out = TAIL (dest);
  memcpy (out, str, count);
  TAIL_INCR (dest, count);
}
1239
1240/* Append CH to DEST. For example, append_char (0, DEST)
1241 zero-terminates DEST. */
1242
1243static void
1244append_char (char ch, struct growable *dest)
1245{
1246 GROW (dest, 1);
1247 *TAIL (dest) = ch;
1248 TAIL_INCR (dest, 1);
1249}
1250
/* Reasons why a character may be unusable in a file name.  The values
   are bit flags and can be or-ed together into a mask.  */
enum {
  filechr_not_unix    = 1,	/* unusable on Unix, / and \0 */
  filechr_not_windows = 2,	/* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 4	/* a control character: 0-31, 127 (DEL)
				   and 128-159 */
};

/* Non-zero if character C carries any of the flags in MASK.  C is
   converted to unsigned char so that negative `char' values index the
   table correctly.  */
#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))

/* Shorthands for the table: */
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control

#define UW (U|W)
#define UWC (U|W|C)

/* Table of characters unsafe under various conditions (see above).

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.

   `|' is flagged as unusable on Windows and DEL as a control
   character, in agreement with the flag descriptions above.  */

static const unsigned char filechr_table[256] =
{
UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */

  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 160-175 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 176-191 */

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 192-207 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 208-223 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 224-239 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 240-255 */
};
#undef U
#undef W
#undef C
#undef UW
#undef UWC
1308
/* FN_PORT_SEP separates host and port in file names when the port is
   not the scheme's default.  It is '+' when file names are restricted
   for Windows, which can't handle ':' in file names, and ':'
   otherwise, as in "www.xemacs.org:4001/index.html".  */
#define FN_PORT_SEP (opt.restrict_files_os == restrict_windows ? '+' : ':')

/* FN_QUERY_SEP separates the file name from the URL query.  It is '@'
   when file names are restricted for Windows, which cannot handle '?'
   as part of a file name, and '?' otherwise.  */
#define FN_QUERY_SEP (opt.restrict_files_os == restrict_windows ? '@' : '?')
1319
1320/* Quote path element, characters in [b, e), as file name, and append
1321 the quoted string to DEST. Each character is quoted as per
1322 file_unsafe_char and the corresponding table.
1323
1324 If ESCAPED_P is non-zero, the path element is considered to be
1325 URL-escaped and will be unescaped prior to inspection. */
1326
1327static void
1328append_uri_pathel (const char *b, const char *e, int escaped_p,
1329 struct growable *dest)
1330{
1331 const char *p;
1332 int quoted, outlen;
1333
1334 int mask;
1335 if (opt.restrict_files_os == restrict_unix)
1336 mask = filechr_not_unix;
1337 else
1338 mask = filechr_not_windows;
1339 if (opt.restrict_files_ctrl)
1340 mask |= filechr_control;
1341
1342 /* Copy [b, e) to PATHEL and URL-unescape it. */
1343 if (escaped_p)
1344 {
1345 char *unescaped;
1346 BOUNDED_TO_ALLOCA (b, e, unescaped);
1347 url_unescape (unescaped);
1348 b = unescaped;
1349 e = unescaped + strlen (unescaped);
1350 }
1351
1352 /* Defang ".." when found as component of path. Remember that path
1353 comes from the URL and might contain malicious input. */
1354 if (e - b == 2 && b[0] == '.' && b[1] == '.')
1355 {
1356 b = "%2E%2E";
1357 e = b + 6;
1358 }
1359
1360 /* Walk the PATHEL string and check how many characters we'll need
1361 to quote. */
1362 quoted = 0;
1363 for (p = b; p < e; p++)
1364 if (FILE_CHAR_TEST (*p, mask))
1365 ++quoted;
1366
1367 /* Calculate the length of the output string. e-b is the input
1368 string length. Each quoted char introduces two additional
1369 characters in the string, hence 2*quoted. */
1370 outlen = (e - b) + (2 * quoted);
1371 GROW (dest, outlen);
1372
1373 if (!quoted)
1374 {
1375 /* If there's nothing to quote, we can simply append the string
1376 without processing it again. */
1377 memcpy (TAIL (dest), b, outlen);
1378 }
1379 else
1380 {
1381 char *q = TAIL (dest);
1382 for (p = b; p < e; p++)
1383 {
1384 if (!FILE_CHAR_TEST (*p, mask))
1385 *q++ = *p;
1386 else
1387 {
1388 unsigned char ch = *p;
1389 *q++ = '%';
1390 *q++ = XNUM_TO_DIGIT (ch >> 4);
1391 *q++ = XNUM_TO_DIGIT (ch & 0xf);
1392 }
1393 }
1394 assert (q - TAIL (dest) == outlen);
1395 }
1396 TAIL_INCR (dest, outlen);
1397}
1398
1399/* Append to DEST the directory structure that corresponds the
1400 directory part of URL's path. For example, if the URL is
1401 http://server/dir1/dir2/file, this appends "/dir1/dir2".
1402
1403 Each path element ("dir1" and "dir2" in the above example) is
1404 examined, url-unescaped, and re-escaped as file name element.
1405
1406 Additionally, it cuts as many directories from the path as
1407 specified by opt.cut_dirs. For example, if opt.cut_dirs is 1, it
1408 will produce "bar" for the above example. For 2 or more, it will
1409 produce "".
1410
1411 Each component of the path is quoted for use as file name. */
1412
1413static void
1414append_dir_structure (const struct url *u, struct growable *dest)
1415{
1416 char *pathel, *next;
1417 int cut = opt.cut_dirs;
1418
1419 /* Go through the path components, de-URL-quote them, and quote them
1420 (if necessary) as file names. */
1421
1422 pathel = u->path;
1423 for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1424 {
1425 if (cut-- > 0)
1426 continue;
1427 if (pathel == next)
1428 /* Ignore empty pathels. */
1429 continue;
1430
1431 if (dest->tail)
1432 append_char ('/', dest);
1433 append_uri_pathel (pathel, next, 1, dest);
1434 }
1435}
1436
1437/* Return a unique file name that matches the given URL as good as
1438 possible. Does not create directories on the file system. */
1439
1440char *
1441url_file_name (const struct url *u)
1442{
1443 struct growable fnres; /* stands for "file name result" */
1444
1445 const char *u_file, *u_query;
1446 char *fname, *unique;
1447
1448 fnres.base = NULL;
1449 fnres.size = 0;
1450 fnres.tail = 0;
1451
1452 /* Start with the directory prefix, if specified. */
1453 if (opt.dir_prefix)
1454 append_string (opt.dir_prefix, &fnres);
1455
1456 /* If "dirstruct" is turned on (typically the case with -r), add
1457 the host and port (unless those have been turned off) and
1458 directory structure. */
1459 if (opt.dirstruct)
1460 {
1461 if (opt.protocol_directories)
1462 {
1463 if (fnres.tail)
1464 append_char ('/', &fnres);
1465 append_string (supported_schemes[u->scheme].name, &fnres);
1466 }
1467 if (opt.add_hostdir)
1468 {
1469 if (fnres.tail)
1470 append_char ('/', &fnres);
1471 if (0 != strcmp (u->host, ".."))
1472 append_string (u->host, &fnres);
1473 else
1474 /* Host name can come from the network; malicious DNS may
1475 allow ".." to be resolved, causing us to write to
1476 "../<file>". Defang such host names. */
1477 append_string ("%2E%2E", &fnres);
1478 if (u->port != scheme_default_port (u->scheme))
1479 {
1480 char portstr[24];
1481 number_to_string (portstr, u->port);
1482 append_char (FN_PORT_SEP, &fnres);
1483 append_string (portstr, &fnres);
1484 }
1485 }
1486
1487 append_dir_structure (u, &fnres);
1488 }
1489
1490 /* Add the file name. */
1491 if (fnres.tail)
1492 append_char ('/', &fnres);
1493 u_file = *u->file ? u->file : "index.html";
1494 append_uri_pathel (u_file, u_file + strlen (u_file), 0, &fnres);
1495
1496 /* Append "?query" to the file name. */
1497 u_query = u->query && *u->query ? u->query : NULL;
1498 if (u_query)
1499 {
1500 append_char (FN_QUERY_SEP, &fnres);
1501 append_uri_pathel (u_query, u_query + strlen (u_query), 1, &fnres);
1502 }
1503
1504 /* Zero-terminate the file name. */
1505 append_char ('\0', &fnres);
1506
1507 fname = fnres.base;
1508
1509 /* Check the cases in which the unique extensions are not used:
1510 1) Clobbering is turned off (-nc).
1511 2) Retrieval with regetting.
1512 3) Timestamping is used.
1513 4) Hierarchy is built.
1514
1515 The exception is the case when file does exist and is a
1516 directory (see `mkalldirs' for explanation). */
1517
1518 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1519 && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1520 return fname;
1521
1522 unique = unique_name (fname, 1);
1523 if (unique != fname)
1524 xfree (fname);
1525 return unique;
1526}
1527
1528
/* Resolve "." and ".." elements of PATH in place.  Returns 1 if PATH
   was modified and 0 if it was left as it was.

   The algorithm is in spirit similar to the one described in rfc1808,
   but works in a single pass with a read pointer and a write pointer.
   Elements consisting only of "." are dropped, and ".." means "back
   up one element".  A ".." that has nothing left to back over is kept
   literally.  Single leading and trailing slashes are preserved.

   For example, "a/b/c/./../d/.." yields "a/b/".  If you change
   anything here, run test_path_simplify to make sure no test case
   has been broken.  */

static int
path_simplify (char *path)
{
  char *src = path;		/* read position */
  char *dst = path;		/* write position */
  char *floor = path;		/* DST may not retreat below this */
  char *const limit = path + strlen (path);

  while (src < limit)
    {
      /* SRC is at the beginning of a path element here.  */
      int keep_element = 1;

      if (src[0] == '.' && (src[1] == '/' || src[1] == '\0'))
	{
	  /* "." -- drop it together with its slash.  */
	  src += 2;
	  keep_element = 0;
	}
      else if (src[0] == '.' && src[1] == '.'
	       && (src[2] == '/' || src[2] == '\0'))
	{
	  if (dst > floor)
	    {
	      /* ".." -- move DST back to the beginning of the previous
		 element, or to FLOOR if that comes first.  */
	      do
		--dst;
	      while (dst > floor && dst[-1] != '/');
	      src += 3;
	      keep_element = 0;
	    }
	  else
	    {
	      /* Nothing to back over.  The ".." is kept like a regular
		 element, and FLOOR is raised so that a later ".."
		 doesn't remove it.  */
	      floor = dst + 3;
	    }
	}

      if (!keep_element)
	continue;

      if (dst == src)
	{
	  /* Nothing has been removed so far: step over the element
	     and its slash without copying.  */
	  while (src < limit && *src != '/')
	    src++;
	  if (src < limit)
	    src++;
	  dst = src;
	}
      else
	{
	  /* Copy the element, including its slash, down to DST.  */
	  while (src < limit && *src != '/')
	    *dst++ = *src++;
	  if (src < limit)
	    *dst++ = *src++;
	}
    }

  if (dst == src)
    return 0;

  *dst = '\0';
  return 1;
}
1611
1612
/* Return the length of URL's path.  Path is considered to be
   terminated by one of '?', ';', '#', or by the end of the
   string.

   strcspn yields exactly that count: the number of leading characters
   that are none of the terminators.  */

static int
path_length (const char *url)
{
  return (int) strcspn (url, "?;#");
}
1623
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  We might want to use memrchr (a GNU
   extension) under GNU libc.

   The end pointer is decremented before each comparison, so the
   characters examined are exactly e[-1] down to b[0]: *e itself is
   never read and *b is not skipped.  An empty range yields NULL.  */

static const char *
find_last_char (const char *b, const char *e, char c)
{
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}
1636
/* Merge BASE with LINK and return the resulting URI as a newly
   allocated string.

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to reasonably handle all
   foreseeable cases.  It only employs minimal URL parsing, without
   knowledge of the specifics of schemes.

   Every case boils down to "keep the first N characters of BASE and
   put LINK after them", so each branch below merely determines N; the
   concatenation itself is done once at the end.

   The result is deliberately not passed through path_simplify:
   1) it complexifies the code, and 2) url_parse has to simplify the
   path anyway, so it would be wasteful to boot.  */

char *
uri_merge (const char *base, const char *link)
{
  const char *path_end;		/* BASE may not be examined past this */
  int link_len;
  int prefix_len;		/* how much of BASE is kept */
  int force_slash = 0;		/* overwrite last kept char with '/' */
  char *result;

  /* A LINK with a scheme is complete by itself.  */
  if (url_has_scheme (link))
    return xstrdup (link);

  /* Empty LINK points back to BASE, query string and all.  */
  if (*link == '\0')
    return xstrdup (base);

  path_end = base + path_length (base);
  link_len = strlen (link);

  if (link[0] == '?')
    {
      /* LINK points to the same location, but changes the query
	 string.  Examples:
	   uri_merge("path",         "?new") -> "path?new"
	   uri_merge("path?foo",     "?new") -> "path?new"
	   uri_merge("path?foo#bar", "?new") -> "path?new"
	   uri_merge("path#foo",     "?new") -> "path?new"  */
      prefix_len = path_end - base;
    }
  else if (link[0] == '#')
    {
      /* LINK only changes the fragment.  Examples:
	   uri_merge("path",         "#new") -> "path#new"
	   uri_merge("path#foo",     "#new") -> "path#new"
	   uri_merge("path?foo",     "#new") -> "path?foo#new"
	   uri_merge("path?foo#bar", "#new") -> "path?foo#new"  */
      const char *hash = strchr (base, '#');
      if (hash)
	prefix_len = hash - base;
      else
	prefix_len = strlen (base);
    }
  else if (link[0] == '/' && link[1] == '/')
    {
      /* LINK begins with "//" and so is a net path: everything from
	 BASE's double slash on is replaced with LINK.  If the first
	 slash in BASE is not a double slash, all of BASE is replaced.
	 Examples:
	   uri_merge("foo",            "//new/bar") -> "//new/bar"
	   uri_merge("//old/foo",      "//new/bar") -> "//new/bar"
	   uri_merge("http://old/foo", "//new/bar") -> "http://new/bar"  */
      const char *first = memchr (base, '/', path_end - base);
      if (first && first[1] == '/')
	prefix_len = first - base;
      else
	prefix_len = 0;
    }
  else if (link[0] == '/')
    {
      /* LINK is an absolute path: everything from the first slash
	 that follows BASE's "//" is replaced with LINK.

	 So, if BASE is "http://host/whatever/foo/bar", and LINK is
	 "/qux/xyzzy", the result is "http://host/qux/xyzzy".  */
      const char *first = memchr (base, '/', path_end - base);
      int after_double_slash = 0;

      if (first && first[1] == '/')
	{
	  /* Step over the "//" and look for the next slash.  Only
	     the first slash found is checked for being double.  */
	  const char *rest = first + 2;
	  after_double_slash = 1;
	  first = memchr (rest, '/', path_end - rest);
	}

      if (!after_double_slash)
	/* examples: "foo", "foo/bar" -- all of BASE is replaced.  */
	prefix_len = 0;
      else if (first)
	/* example: "http://something/" -- keep up to that slash.  */
	prefix_len = first - base;
      else
	/* example: "http://foo" -- keep the whole of BASE's path.  */
	prefix_len = path_end - base;
    }
  else
    {
      /* LINK is a relative URL: everything after the last slash
	 (possibly nothing) is replaced with LINK.

	 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
	 the result is "whatever/foo/qux/xyzzy".  */
      const char *last = find_last_char (base, path_end, '/');

      if (!last)
	{
	  /* No slash found at all.  Replace what we have with LINK.  */
	  prefix_len = 0;
	}
      else if (last >= base + 2 && last[-2] == ':' && last[-1] == '/')
	{
	  /* example: "http://host" -- the slash found belongs to
	     "://".  One character more than BASE's path is kept and
	     then overwritten with the '/' that separates host from
	     LINK.  */
	  prefix_len = (path_end + 1) - base;
	  force_slash = 1;
	}
      else
	{
	  /* example: "whatever/foo/bar" -- keep through the slash.  */
	  prefix_len = (last + 1) - base;
	}
    }

  /* Put the pieces together: kept part of BASE, then LINK.  */
  result = xmalloc (prefix_len + link_len + 1);
  if (prefix_len)
    memcpy (result, base, prefix_len);
  if (force_slash)
    result[prefix_len - 1] = '/';
  memcpy (result + prefix_len, link, link_len);
  result[prefix_len + link_len] = '\0';

  return result;
}
1827
1828
/* Copy the string S (without its terminating NUL) to the location P
   points to, and advance P past the copied characters.  P must be a
   modifiable pointer lvalue.

   S is evaluated exactly once, and the temporaries carry an APPEND_
   prefix so they cannot collide with identifiers used in the
   arguments.  */
#define APPEND(p, s) do {				\
  const char *APPEND_src_ = (s);			\
  size_t APPEND_len_ = strlen (APPEND_src_);		\
  memcpy ((p), APPEND_src_, APPEND_len_);		\
  (p) += APPEND_len_;					\
} while (0)

/* Use this instead of password when the actual password is supposed
   to be hidden.  We intentionally use a generic string without giving
   away the number of characters in the password, like previous
   versions did.  */
#define HIDDEN_PASSWORD "*password*"
1840
1841/* Recreate the URL string from the data in URL.
1842
1843 If HIDE is non-zero (as it is when we're calling this on a URL we
1844 plan to print, but not when calling it to canonicalize a URL for
1845 use within the program), password will be hidden. Unsafe
1846 characters in the URL will be quoted. */
1847
1848char *
1849url_string (const struct url *url, int hide_password)
1850{
1851 int size;
1852 char *result, *p;
1853 char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
1854
1855 int scheme_port = supported_schemes[url->scheme].default_port;
1856 const char *scheme_str = supported_schemes[url->scheme].leading_string;
1857 int fplen = full_path_length (url);
1858
1859 int brackets_around_host;
1860
1861 assert (scheme_str != NULL);
1862
1863 /* Make sure the user name and password are quoted. */
1864 if (url->user)
1865 {
1866 quoted_user = url_escape_allow_passthrough (url->user);
1867 if (url->passwd)
1868 {
1869 if (hide_password)
1870 quoted_passwd = HIDDEN_PASSWORD;
1871 else
1872 quoted_passwd = url_escape_allow_passthrough (url->passwd);
1873 }
1874 }
1875
1876 /* In the unlikely event that the host name contains non-printable
1877 characters, quote it for displaying to the user. */
1878 quoted_host = url_escape_allow_passthrough (url->host);
1879
1880 /* Undo the quoting of colons that URL escaping performs. IPv6
1881 addresses may legally contain colons, and in that case must be
1882 placed in square brackets. */
1883 if (quoted_host != url->host)
1884 unescape_single_char (quoted_host, ':');
1885 brackets_around_host = strchr (quoted_host, ':') != NULL;
1886
1887 size = (strlen (scheme_str)
1888 + strlen (quoted_host)
1889 + (brackets_around_host ? 2 : 0)
1890 + fplen
1891 + 1);
1892 if (url->port != scheme_port)
1893 size += 1 + numdigit (url->port);
1894 if (quoted_user)
1895 {
1896 size += 1 + strlen (quoted_user);
1897 if (quoted_passwd)
1898 size += 1 + strlen (quoted_passwd);
1899 }
1900
1901 p = result = xmalloc (size);
1902
1903 APPEND (p, scheme_str);
1904 if (quoted_user)
1905 {
1906 APPEND (p, quoted_user);
1907 if (quoted_passwd)
1908 {
1909 *p++ = ':';
1910 APPEND (p, quoted_passwd);
1911 }
1912 *p++ = '@';
1913 }
1914
1915 if (brackets_around_host)
1916 *p++ = '[';
1917 APPEND (p, quoted_host);
1918 if (brackets_around_host)
1919 *p++ = ']';
1920 if (url->port != scheme_port)
1921 {
1922 *p++ = ':';
1923 p = number_to_string (p, url->port);
1924 }
1925
1926 full_path_write (url, p);
1927 p += fplen;
1928 *p++ = '\0';
1929
1930 assert (p - result == size);
1931
1932 if (quoted_user && quoted_user != url->user)
1933 xfree (quoted_user);
1934 if (quoted_passwd && !hide_password && quoted_passwd != url->passwd)
1935 xfree (quoted_passwd);
1936 if (quoted_host != url->host)
1937 xfree (quoted_host);
1938
1939 return result;
1940}
1941
1942
1943/* Return non-zero if scheme a is similar to scheme b.
1944
1945 Schemes are similar if they are equal. If SSL is supported, schemes
1946 are also similar if one is http (SCHEME_HTTP) and the other is https
1947 (SCHEME_HTTPS). */
1948int
1949schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
1950{
1951 if (a == b)
1952 return 1;
1953#ifdef HAVE_SSL
1954 if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
1955 || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
1956 return 1;
1957#endif
1958 return 0;
1959}
1960
1961
1962#if 0
1963/* Debugging and testing support for path_simplify. */
1964
/* Debug helper: return a newly allocated copy of PATH that has been
   run through path_simplify.  PATH itself is not modified.  Useful
   for calling from the debugger.  */
static char *
ps (char *path)
{
  char *simplified = xstrdup (path);

  path_simplify (simplified);
  return simplified;
}
1974
/* Run path_simplify on a copy of TEST and print a message for each
   expectation that is not met: the simplified string must equal
   EXPECTED_RESULT, and the return value must equal EXPECTED_CHANGE.
   Nothing is printed when both hold.  */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *work = xstrdup (test);
  int changed = path_simplify (work);

  if (strcmp (work, expected_result) != 0)
    printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
	    test, expected_result, work);

  if (changed != expected_change)
    {
      if (expected_change != 1)
	printf ("Expected no modification with path_simplify(\"%s\").\n",
		test);
      else
	printf ("Expected modification with path_simplify(\"%s\").\n",
		test);
    }

  xfree (work);
}
1997
/* Feed a fixed table of inputs to run_test.  Each entry gives the
   input path, the expected simplified path, and whether
   path_simplify is expected to report a modification.  */
static void
test_path_simplify (void)
{
  static struct {
    char *test, *result;
    int should_modify;
  } tests[] = {
    { "",			"",		0 },
    { ".",			"",		1 },
    { "./",			"",		1 },
    { "..",			"..",		0 },
    { "../",			"../",		0 },
    { "foo",			"foo",		0 },
    { "foo/bar",		"foo/bar",	0 },
    { "foo///bar",		"foo///bar",	0 },
    { "foo/.",			"foo/",		1 },
    { "foo/./",			"foo/",		1 },
    { "foo./",			"foo./",	0 },
    { "foo/../bar",		"bar",		1 },
    { "foo/../bar/",		"bar/",		1 },
    { "foo/bar/..",		"foo/",		1 },
    { "foo/bar/../x",		"foo/x",	1 },
    { "foo/bar/../x/",		"foo/x/",	1 },
    { "foo/..",			"",		1 },
    { "foo/../..",		"..",		1 },
    { "foo/../../..",		"../..",	1 },
    { "foo/../../bar/../../baz", "../../baz",	1 },
    { "a/b/../../c",		"c",		1 },
    { "./a/../b",		"b",		1 }
  };
  int i;

  for (i = 0; i < countof (tests); i++)
    run_test (tests[i].test, tests[i].result, tests[i].should_modify);
}
2038#endif
Note: See TracBrowser for help on using the repository browser.