| 1 | /* Conversion of links to local files.
|
|---|
| 2 | Copyright (C) 2005 Free Software Foundation, Inc.
|
|---|
| 3 |
|
|---|
| 4 | This file is part of GNU Wget.
|
|---|
| 5 |
|
|---|
| 6 | GNU Wget is free software; you can redistribute it and/or modify
|
|---|
| 7 | it under the terms of the GNU General Public License as published by
|
|---|
| 8 | the Free Software Foundation; either version 2 of the License, or
|
|---|
| 9 | (at your option) any later version.
|
|---|
| 10 |
|
|---|
| 11 | GNU Wget is distributed in the hope that it will be useful,
|
|---|
| 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|---|
| 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|---|
| 14 | GNU General Public License for more details.
|
|---|
| 15 |
|
|---|
| 16 | You should have received a copy of the GNU General Public License
|
|---|
| 17 | along with Wget; if not, write to the Free Software
|
|---|
| 18 | Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
|---|
| 19 |
|
|---|
| 20 | In addition, as a special exception, the Free Software Foundation
|
|---|
| 21 | gives permission to link the code of its release of Wget with the
|
|---|
| 22 | OpenSSL project's "OpenSSL" library (or with modified versions of it
|
|---|
| 23 | that use the same license as the "OpenSSL" library), and distribute
|
|---|
| 24 | the linked executables. You must obey the GNU General Public License
|
|---|
| 25 | in all respects for all of the code used other than "OpenSSL". If you
|
|---|
| 26 | modify this file, you may extend this exception to your version of the
|
|---|
| 27 | file, but you are not obligated to do so. If you do not wish to do
|
|---|
| 28 | so, delete this exception statement from your version. */
|
|---|
| 29 |
|
|---|
| 30 | #include <config.h>
|
|---|
| 31 |
|
|---|
| 32 | #include <stdio.h>
|
|---|
| 33 | #include <stdlib.h>
|
|---|
| 34 | #ifdef HAVE_STRING_H
|
|---|
| 35 | # include <string.h>
|
|---|
| 36 | #else
|
|---|
| 37 | # include <strings.h>
|
|---|
| 38 | #endif /* HAVE_STRING_H */
|
|---|
| 39 | #ifdef HAVE_UNISTD_H
|
|---|
| 40 | # include <unistd.h>
|
|---|
| 41 | #endif /* HAVE_UNISTD_H */
|
|---|
| 42 | #include <errno.h>
|
|---|
| 43 | #include <assert.h>
|
|---|
| 44 | #include <sys/types.h>
|
|---|
| 45 |
|
|---|
| 46 | #include "wget.h"
|
|---|
| 47 | #include "convert.h"
|
|---|
| 48 | #include "url.h"
|
|---|
| 49 | #include "recur.h"
|
|---|
| 50 | #include "utils.h"
|
|---|
| 51 | #include "hash.h"
|
|---|
| 52 | #include "ptimer.h"
|
|---|
| 53 |
|
|---|
| 54 | static struct hash_table *dl_file_url_map;
|
|---|
| 55 | struct hash_table *dl_url_file_map;
|
|---|
| 56 |
|
|---|
| 57 | /* Set of HTML files downloaded in this Wget run, used for link
|
|---|
| 58 | conversion after Wget is done. */
|
|---|
| 59 | struct hash_table *downloaded_html_set;
|
|---|
| 60 |
|
|---|
| 61 | static void convert_links PARAMS ((const char *, struct urlpos *));
|
|---|
| 62 |
|
|---|
| 63 | /* This function is called when the retrieval is done to convert the
|
|---|
| 64 | links that have been downloaded. It has to be called at the end of
|
|---|
| 65 | the retrieval, because only then does Wget know conclusively which
|
|---|
| 66 | URLs have been downloaded, and which not, so it can tell which
|
|---|
| 67 | direction to convert to.
|
|---|
| 68 |
|
|---|
| 69 | The "direction" means that the URLs to the files that have been
|
|---|
| 70 | downloaded get converted to the relative URL which will point to
|
|---|
| 71 | that file. And the other URLs get converted to the remote URL on
|
|---|
| 72 | the server.
|
|---|
| 73 |
|
|---|
| 74 | All the downloaded HTMLs are kept in downloaded_html_files, and
|
|---|
| 75 | downloaded URLs in urls_downloaded. All the information is
|
|---|
| 76 | extracted from these two lists. */
|
|---|
| 77 |
|
|---|
| 78 | void
|
|---|
| 79 | convert_all_links (void)
|
|---|
| 80 | {
|
|---|
| 81 | int i;
|
|---|
| 82 | double secs;
|
|---|
| 83 | int file_count = 0;
|
|---|
| 84 |
|
|---|
| 85 | struct ptimer *timer = ptimer_new ();
|
|---|
| 86 |
|
|---|
| 87 | int cnt;
|
|---|
| 88 | char **file_array;
|
|---|
| 89 |
|
|---|
| 90 | cnt = 0;
|
|---|
| 91 | if (downloaded_html_set)
|
|---|
| 92 | cnt = hash_table_count (downloaded_html_set);
|
|---|
| 93 | if (cnt == 0)
|
|---|
| 94 | return;
|
|---|
| 95 | file_array = alloca_array (char *, cnt);
|
|---|
| 96 | string_set_to_array (downloaded_html_set, file_array);
|
|---|
| 97 |
|
|---|
| 98 | for (i = 0; i < cnt; i++)
|
|---|
| 99 | {
|
|---|
| 100 | struct urlpos *urls, *cur_url;
|
|---|
| 101 | char *url;
|
|---|
| 102 | char *file = file_array[i];
|
|---|
| 103 |
|
|---|
| 104 | /* Determine the URL of the HTML file. get_urls_html will need
|
|---|
| 105 | it. */
|
|---|
| 106 | url = hash_table_get (dl_file_url_map, file);
|
|---|
| 107 | if (!url)
|
|---|
| 108 | {
|
|---|
| 109 | DEBUGP (("Apparently %s has been removed.\n", file));
|
|---|
| 110 | continue;
|
|---|
| 111 | }
|
|---|
| 112 |
|
|---|
| 113 | DEBUGP (("Scanning %s (from %s)\n", file, url));
|
|---|
| 114 |
|
|---|
| 115 | /* Parse the HTML file... */
|
|---|
| 116 | urls = get_urls_html (file, url, NULL);
|
|---|
| 117 |
|
|---|
| 118 | /* We don't respect meta_disallow_follow here because, even if
|
|---|
| 119 | the file is not followed, we might still want to convert the
|
|---|
| 120 | links that have been followed from other files. */
|
|---|
| 121 |
|
|---|
| 122 | for (cur_url = urls; cur_url; cur_url = cur_url->next)
|
|---|
| 123 | {
|
|---|
| 124 | char *local_name;
|
|---|
| 125 | struct url *u = cur_url->url;
|
|---|
| 126 |
|
|---|
| 127 | if (cur_url->link_base_p)
|
|---|
| 128 | {
|
|---|
| 129 | /* Base references have been resolved by our parser, so
|
|---|
| 130 | we turn the base URL into an empty string. (Perhaps
|
|---|
| 131 | we should remove the tag entirely?) */
|
|---|
| 132 | cur_url->convert = CO_NULLIFY_BASE;
|
|---|
| 133 | continue;
|
|---|
| 134 | }
|
|---|
| 135 |
|
|---|
| 136 | /* We decide the direction of conversion according to whether
|
|---|
| 137 | a URL was downloaded. Downloaded URLs will be converted
|
|---|
| 138 | ABS2REL, whereas non-downloaded will be converted REL2ABS. */
|
|---|
| 139 | local_name = hash_table_get (dl_url_file_map, u->url);
|
|---|
| 140 |
|
|---|
| 141 | /* Decide on the conversion type. */
|
|---|
| 142 | if (local_name)
|
|---|
| 143 | {
|
|---|
| 144 | /* We've downloaded this URL. Convert it to relative
|
|---|
| 145 | form. We do this even if the URL already is in
|
|---|
| 146 | relative form, because our directory structure may
|
|---|
| 147 | not be identical to that on the server (think `-nd',
|
|---|
| 148 | `--cut-dirs', etc.) */
|
|---|
| 149 | cur_url->convert = CO_CONVERT_TO_RELATIVE;
|
|---|
| 150 | cur_url->local_name = xstrdup (local_name);
|
|---|
| 151 | DEBUGP (("will convert url %s to local %s\n", u->url, local_name));
|
|---|
| 152 | }
|
|---|
| 153 | else
|
|---|
| 154 | {
|
|---|
| 155 | /* We haven't downloaded this URL. If it's not already
|
|---|
| 156 | complete (including a full host name), convert it to
|
|---|
| 157 | that form, so it can be reached while browsing this
|
|---|
| 158 | HTML locally. */
|
|---|
| 159 | if (!cur_url->link_complete_p)
|
|---|
| 160 | cur_url->convert = CO_CONVERT_TO_COMPLETE;
|
|---|
| 161 | cur_url->local_name = NULL;
|
|---|
| 162 | DEBUGP (("will convert url %s to complete\n", u->url));
|
|---|
| 163 | }
|
|---|
| 164 | }
|
|---|
| 165 |
|
|---|
| 166 | /* Convert the links in the file. */
|
|---|
| 167 | convert_links (file, urls);
|
|---|
| 168 | ++file_count;
|
|---|
| 169 |
|
|---|
| 170 | /* Free the data. */
|
|---|
| 171 | free_urlpos (urls);
|
|---|
| 172 | }
|
|---|
| 173 |
|
|---|
| 174 | secs = ptimer_measure (timer) / 1000;
|
|---|
| 175 | ptimer_destroy (timer);
|
|---|
| 176 | logprintf (LOG_VERBOSE, _("Converted %d files in %.*f seconds.\n"),
|
|---|
| 177 | file_count, secs < 10 ? 3 : 1, secs);
|
|---|
| 178 | }
|
|---|
| 179 |
|
|---|
| 180 | static void write_backup_file PARAMS ((const char *, downloaded_file_t));
|
|---|
| 181 | static const char *replace_attr PARAMS ((const char *, int, FILE *,
|
|---|
| 182 | const char *));
|
|---|
| 183 | static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
|
|---|
| 184 | const char *, int));
|
|---|
| 185 | static char *local_quote_string PARAMS ((const char *));
|
|---|
| 186 | static char *construct_relative PARAMS ((const char *, const char *));
|
|---|
| 187 |
|
|---|
| 188 | /* Change the links in one HTML file. LINKS is a list of links in the
|
|---|
| 189 | document, along with their positions and the desired direction of
|
|---|
| 190 | the conversion. */
|
|---|
static void
convert_links (const char *file, struct urlpos *links)
{
  struct file_memory *fm;	/* FILE read into memory (possibly mmaped) */
  FILE *fp;			/* output stream for the rewritten file */
  const char *p;		/* read cursor into fm->content */
  downloaded_file_t downloaded_file_return;

  struct urlpos *link;
  int to_url_count = 0, to_file_count = 0;

  logprintf (LOG_VERBOSE, _("Converting %s... "), file);

  {
    /* First we do a "dry run": go through the list L and see whether
       any URL needs to be converted in the first place.  If not, just
       leave the file alone.  */
    int dry_count = 0;
    struct urlpos *dry;
    for (dry = links; dry; dry = dry->next)
      if (dry->convert != CO_NOCONVERT)
	++dry_count;
    if (!dry_count)
      {
	logputs (LOG_VERBOSE, _("nothing to do.\n"));
	return;
      }
  }

  /* Load the entire file into memory so we can rewrite it in place.  */
  fm = read_file (file);
  if (!fm)
    {
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
		 file, strerror (errno));
      return;
    }

  /* Optionally save the pristine copy as FILE.orig before we clobber
     FILE below (-K / --backup-converted).  */
  downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
  if (opt.backup_converted && downloaded_file_return)
    write_backup_file (file, downloaded_file_return);

  /* Before opening the file for writing, unlink the file.  This is
     important if the data in FM is mmaped.  In such case, nulling the
     file, which is what fopen() below does, would make us read all
     zeroes from the mmaped region.  */
  if (unlink (file) < 0 && errno != ENOENT)
    {
      logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
		 file, strerror (errno));
      read_file_free (fm);
      return;
    }
  /* Now open the file for writing.  */
  fp = fopen (file, "wb");
  if (!fp)
    {
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
		 file, strerror (errno));
      read_file_free (fm);
      return;
    }

  /* Here we loop through all the URLs in file, replacing those of
     them that are downloaded with relative references.  The cursor P
     only ever moves forward, so this assumes LINKS is ordered by
     ascending pos — TODO confirm the parser guarantees that.  */
  p = fm->content;
  for (link = links; link; link = link->next)
    {
      /* NOTE(review): url_start is computed before the bounds check
	 below; if link->pos ever exceeded fm->length this pointer
	 arithmetic would already be out of range — confirm pos is
	 validated upstream.  */
      char *url_start = fm->content + link->pos;

      if (link->pos >= fm->length)
	{
	  DEBUGP (("Something strange is going on.  Please investigate."));
	  break;
	}
      /* If the URL is not to be converted, skip it.  */
      if (link->convert == CO_NOCONVERT)
	{
	  DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
	  continue;
	}

      /* Echo the file contents, up to the offending URL's opening
         quote, to the outfile.  */
      fwrite (p, 1, url_start - p, fp);
      p = url_start;

      switch (link->convert)
	{
	case CO_CONVERT_TO_RELATIVE:
	  /* Convert absolute URL to relative. */
	  {
	    char *newname = construct_relative (file, link->local_name);
	    char *quoted_newname = local_quote_string (newname);

	    /* <meta http-equiv=refresh> attributes need the
	       "timeout; URL=" prefix, hence the separate helper.  */
	    if (!link->link_refresh_p)
	      p = replace_attr (p, link->size, fp, quoted_newname);
	    else
	      p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
					     link->refresh_timeout);

	    DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
		     link->url->url, newname, link->pos, file));
	    xfree (newname);
	    xfree (quoted_newname);
	    ++to_file_count;
	    break;
	  }
	case CO_CONVERT_TO_COMPLETE:
	  /* Convert the link to absolute URL. */
	  {
	    char *newlink = link->url->url;
	    char *quoted_newlink = html_quote_string (newlink);

	    if (!link->link_refresh_p)
	      p = replace_attr (p, link->size, fp, quoted_newlink);
	    else
	      p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
					     link->refresh_timeout);

	    DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
		     newlink, link->pos, file));
	    xfree (quoted_newlink);
	    ++to_url_count;
	    break;
	  }
	case CO_NULLIFY_BASE:
	  /* Change the base href to "". */
	  p = replace_attr (p, link->size, fp, "");
	  break;
	case CO_NOCONVERT:
	  /* Filtered out by the check at the top of the loop.  */
	  abort ();
	  break;
	}
    }

  /* Output the rest of the file. */
  if (p - fm->content < fm->length)
    fwrite (p, 1, fm->length - (p - fm->content), fp);
  fclose (fp);
  read_file_free (fm);

  /* E.g. "Converting foo.html... 3-1" — counts of links converted to
     local files vs. to complete URLs.  */
  logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
}
|
|---|
| 334 |
|
|---|
| 335 | /* Construct and return a link that points from BASEFILE to LINKFILE.
|
|---|
| 336 | Both files should be local file names, BASEFILE of the referrering
|
|---|
| 337 | file, and LINKFILE of the referred file.
|
|---|
| 338 |
|
|---|
| 339 | Examples:
|
|---|
| 340 |
|
|---|
| 341 | cr("foo", "bar") -> "bar"
|
|---|
| 342 | cr("A/foo", "A/bar") -> "bar"
|
|---|
| 343 | cr("A/foo", "A/B/bar") -> "B/bar"
|
|---|
| 344 | cr("A/X/foo", "A/Y/bar") -> "../Y/bar"
|
|---|
| 345 | cr("X/", "Y/bar") -> "../Y/bar" (trailing slash does matter in BASE)
|
|---|
| 346 |
|
|---|
| 347 | Both files should be absolute or relative, otherwise strange
|
|---|
| 348 | results might ensue. The function makes no special efforts to
|
|---|
| 349 | handle "." and ".." in links, so make sure they're not there
|
|---|
| 350 | (e.g. using path_simplify). */
|
|---|
| 351 |
|
|---|
static char *
construct_relative (const char *basefile, const char *linkfile)
{
  char *result;
  const char *p, *q;
  int shared, updirs, i;

  /* Find the length of the longest shared directory prefix: the
     position just past the last '/' up to which both names agree.  */
  shared = 0;
  p = basefile;
  q = linkfile;
  while (*p != '\0' && *p == *q)
    {
      if (*p == '/')
	shared = (p - basefile) + 1;
      ++p;
      ++q;
    }
  basefile += shared;
  linkfile += shared;

  /* Each remaining directory component of BASEFILE costs one "../"
     hop on the way to LINKFILE.  E.g. with the common prefix gone,
     "b1/b2/bfile" needs two hops to reach "l1/lfile" -> "../../l1/lfile".  */
  updirs = 0;
  for (p = basefile; *p; p++)
    if (*p == '/')
      ++updirs;

  /* Assemble "../" repeated UPDIRS times, followed by the remainder
     of LINKFILE.  Caller frees the returned string.  */
  result = (char *)xmalloc (3 * updirs + strlen (linkfile) + 1);
  for (i = 0; i < updirs; i++)
    memcpy (result + 3 * i, "../", 3);
  strcpy (result + 3 * updirs, linkfile);
  return result;
}
|
|---|
| 397 |
|
|---|
| 398 | /* Used by write_backup_file to remember which files have been
|
|---|
| 399 | written. */
|
|---|
| 400 | static struct hash_table *converted_files;
|
|---|
| 401 |
|
|---|
static void
write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
{
  /* Rather than just writing over the original .html file with the
     converted version, save the former to *.orig.  Note we only do
     this for files we've _successfully_ downloaded, so we don't
     clobber .orig files sitting around from previous invocations. */

  /* Construct the backup filename as the original name plus ".orig". */
  size_t filename_len = strlen (file);
  char* filename_plus_orig_suffix;

  if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
    {
      /* Just write "orig" over "html".  We need to do it this way
         because when we're checking to see if we've downloaded the
         file before (to see if we can skip downloading it), we don't
         know if it's a text/html file.  Therefore we don't know yet
         at that stage that -E is going to cause us to tack on
         ".html", so we need to compare vs. the original URL plus
         ".orig", not the original URL plus ".html.orig". */
      /* NOTE(review): this overwrites the final four characters of
	 FILE, relying on the name ending in "html" (and thus being at
	 least 4 bytes long) whenever this enum value is set — confirm
	 that invariant holds for all callers.  */
      filename_plus_orig_suffix = alloca (filename_len + 1);
      strcpy (filename_plus_orig_suffix, file);
      strcpy ((filename_plus_orig_suffix + filename_len) - 4, "orig");
    }
  else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
    {
      /* Append ".orig" to the name. */
      filename_plus_orig_suffix = alloca (filename_len + sizeof (".orig"));
      strcpy (filename_plus_orig_suffix, file);
      strcpy (filename_plus_orig_suffix + filename_len, ".orig");
    }

  /* Lazily create the set of files we have already backed up.  */
  if (!converted_files)
    converted_files = make_string_hash_table (0);

  /* We can get called twice on the same URL thanks to the
     convert_all_links() call in main().  If we write the .orig file
     each time in such a case, it'll end up containing the first-pass
     conversion, not the original file.  So, see if we've already been
     called on this file. */
  if (!string_set_contains (converted_files, file))
    {
      /* Rename <file> to <file>.orig before former gets written over. */
      if (rename (file, filename_plus_orig_suffix) != 0)
	logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
		   file, filename_plus_orig_suffix, strerror (errno));

      /* Remember that we've already written a .orig backup for this file.
	 Note that we never free this memory since we need it till the
	 convert_all_links() call, which is one of the last things the
	 program does before terminating.  BTW, I'm not sure if it would be
	 safe to just set 'converted_file_ptr->string' to 'file' below,
	 rather than making a copy of the string...  Another note is that I
	 thought I could just add a field to the urlpos structure saying
	 that we'd written a .orig file for this URL, but that didn't work,
	 so I had to make this separate list.
	 -- Dan Harkless <wget@harkless.org>

	 This [adding a field to the urlpos structure] didn't work
	 because convert_file() is called from convert_all_links at
	 the end of the retrieval with a freshly built new urlpos
	 list.
	 -- Hrvoje Niksic <hniksic@xemacs.org>
      */
      string_set_add (converted_files, file);
    }
}
|
|---|
| 470 |
|
|---|
| 471 | static int find_fragment PARAMS ((const char *, int, const char **,
|
|---|
| 472 | const char **));
|
|---|
| 473 |
|
|---|
| 474 | /* Replace an attribute's original text with NEW_TEXT. */
|
|---|
| 475 |
|
|---|
static const char *
replace_attr (const char *p, int size, FILE *fp, const char *new_text)
{
  /* Replace the attribute value at P (SIZE bytes, possibly quoted)
     with NEW_TEXT, writing the result to FP.  A fragment identifier
     present in the old value is carried over.  Returns the position
     in the input just past the replaced region.

     Structure of the input is either:
       "...old-contents..."   <--- size ---> (with quotes)
     or:
       ...old-contents...     <--- size --> (no quotes)  */
  int quoted = 0;
  char quote = '\"';		/* quote with "..." unless the original
				   value was quoted, in which case its
				   quoting character is reused */
  const char *frag_start, *frag_limit;

  if (*p == '\"' || *p == '\'')
    {
      quote = *p;
      quoted = 1;
      ++p;
      size -= 2;		/* disregard opening and closing quote */
    }

  putc (quote, fp);
  fputs (new_text, fp);

  /* Preserve the fragment identifier from the old value, if any.  */
  if (find_fragment (p, size, &frag_start, &frag_limit))
    fwrite (frag_start, 1, frag_limit - frag_start, fp);

  p += size;
  if (quoted)
    ++p;			/* skip the closing quote as well */
  putc (quote, fp);

  return p;
}
|
|---|
| 512 |
|
|---|
| 513 | /* The same as REPLACE_ATTR, but used when replacing
|
|---|
| 514 | <meta http-equiv=refresh content="new_text"> because we need to
|
|---|
| 515 | append "timeout_value; URL=" before the next_text. */
|
|---|
| 516 |
|
|---|
static const char *
replace_attr_refresh_hack (const char *p, int size, FILE *fp,
			   const char *new_text, int timeout)
{
  /* Like replace_attr, but for <meta http-equiv=refresh content=...>:
     the replacement value must carry a "TIMEOUT; URL=" prefix before
     NEW_TEXT, e.g. "0; URL=...".  */
  int prefix_room = numdigit (timeout) + 6 /* "; URL=" */;
  char *with_timeout = (char *)alloca (prefix_room + strlen (new_text) + 1);

  sprintf (with_timeout, "%d; URL=%s", timeout, new_text);
  return replace_attr (p, size, fp, with_timeout);
}
|
|---|
| 530 |
|
|---|
| 531 | /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
|
|---|
| 532 | preceded by '&'. If the character is not found, return zero. If
|
|---|
| 533 | the character is found, return 1 and set BP and EP to point to the
|
|---|
| 534 | beginning and end of the region.
|
|---|
| 535 |
|
|---|
| 536 | This is used for finding the fragment indentifiers in URLs. */
|
|---|
| 537 |
|
|---|
static int
find_fragment (const char *beg, int size, const char **bp, const char **ep)
{
  /* Scan [BEG, BEG+SIZE) for the first '#' whose preceding character
     is not '&' (which would indicate a character reference rather
     than a fragment).  On success store the fragment's bounds in
     *BP/*EP and return 1; otherwise return 0.  */
  const char *limit = beg + size;
  const char *scan;
  int prev_was_amp = 0;

  for (scan = beg; scan < limit; scan++)
    {
      if (*scan == '&')
	prev_was_amp = 1;
      else if (*scan == '#' && !prev_was_amp)
	{
	  *bp = scan;
	  *ep = limit;
	  return 1;
	}
      else
	prev_was_amp = 0;
    }
  return 0;
}
|
|---|
| 564 |
|
|---|
| 565 | /* Quote FILE for use as local reference to an HTML file.
|
|---|
| 566 |
|
|---|
| 567 | We quote ? as %3F to avoid passing part of the file name as the
|
|---|
| 568 | parameter when browsing the converted file through HTTP. However,
|
|---|
| 569 | it is safe to do this only when `--html-extension' is turned on.
|
|---|
| 570 | This is because converting "index.html?foo=bar" to
|
|---|
| 571 | "index.html%3Ffoo=bar" would break local browsing, as the latter
|
|---|
| 572 | isn't even recognized as an HTML file! However, converting
|
|---|
| 573 | "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
|
|---|
| 574 | safe for both local and HTTP-served browsing.
|
|---|
| 575 |
|
|---|
| 576 | We always quote "#" as "%23" and "%" as "%25" because those
|
|---|
| 577 | characters have special meanings in URLs. */
|
|---|
| 578 |
|
|---|
| 579 | static char *
|
|---|
| 580 | local_quote_string (const char *file)
|
|---|
| 581 | {
|
|---|
| 582 | const char *from;
|
|---|
| 583 | char *newname, *to;
|
|---|
| 584 |
|
|---|
| 585 | char *any = strpbrk (file, "?#%");
|
|---|
| 586 | if (!any)
|
|---|
| 587 | return html_quote_string (file);
|
|---|
| 588 |
|
|---|
| 589 | /* Allocate space assuming the worst-case scenario, each character
|
|---|
| 590 | having to be quoted. */
|
|---|
| 591 | to = newname = (char *)alloca (3 * strlen (file) + 1);
|
|---|
| 592 | for (from = file; *from; from++)
|
|---|
| 593 | switch (*from)
|
|---|
| 594 | {
|
|---|
| 595 | case '%':
|
|---|
| 596 | *to++ = '%';
|
|---|
| 597 | *to++ = '2';
|
|---|
| 598 | *to++ = '5';
|
|---|
| 599 | break;
|
|---|
| 600 | case '#':
|
|---|
| 601 | *to++ = '%';
|
|---|
| 602 | *to++ = '2';
|
|---|
| 603 | *to++ = '3';
|
|---|
| 604 | break;
|
|---|
| 605 | case '?':
|
|---|
| 606 | if (opt.html_extension)
|
|---|
| 607 | {
|
|---|
| 608 | *to++ = '%';
|
|---|
| 609 | *to++ = '3';
|
|---|
| 610 | *to++ = 'F';
|
|---|
| 611 | break;
|
|---|
| 612 | }
|
|---|
| 613 | /* fallthrough */
|
|---|
| 614 | default:
|
|---|
| 615 | *to++ = *from;
|
|---|
| 616 | }
|
|---|
| 617 | *to = '\0';
|
|---|
| 618 |
|
|---|
| 619 | return html_quote_string (newname);
|
|---|
| 620 | }
|
|---|
| 621 | |
|---|
| 622 |
|
|---|
| 623 | /* Book-keeping code for dl_file_url_map, dl_url_file_map,
|
|---|
| 624 | downloaded_html_list, and downloaded_html_set. Other code calls
|
|---|
| 625 | these functions to let us know that a file has been downloaded. */
|
|---|
| 626 |
|
|---|
/* Lazily create the FILE->URL and URL->FILE hash tables.  Invoked at
   the top of the registration functions below so callers need not
   care about initialization order.  */
#define ENSURE_TABLES_EXIST do {			\
  if (!dl_file_url_map)					\
    dl_file_url_map = make_string_hash_table (0);	\
  if (!dl_url_file_map)					\
    dl_url_file_map = make_string_hash_table (0);	\
} while (0)
|
|---|
| 633 |
|
|---|
| 634 | /* Return 1 if S1 and S2 are the same, except for "/index.html". The
|
|---|
| 635 | three cases in which it returns one are (substitute any substring
|
|---|
| 636 | for "foo"):
|
|---|
| 637 |
|
|---|
| 638 | m("foo/index.html", "foo/") ==> 1
|
|---|
| 639 | m("foo/", "foo/index.html") ==> 1
|
|---|
| 640 | m("foo", "foo/index.html") ==> 1
|
|---|
| 641 | m("foo", "foo/" ==> 1
|
|---|
| 642 | m("foo", "foo") ==> 1 */
|
|---|
| 643 |
|
|---|
static int
match_except_index (const char *s1, const char *s2)
{
  /* Return 1 if S1 and S2 name the same resource modulo a trailing
     "/" or "/index.html" (see the comment above for the exact
     cases), 0 otherwise.  */
  int common = 0;
  const char *rest;

  /* Advance both strings past their common prefix.  */
  while (*s1 && *s2 && *s1 == *s2)
    {
      ++s1;
      ++s2;
      ++common;
    }

  if (common == 0)
    /* Nothing in common at all; also guards the --rest below, which
       would otherwise read before the start of the array.  */
    return 0;

  if (!*s1 && !*s2)
    /* Identical strings.  */
    return 1;
  if (*s1 && *s2)
    /* Both have distinct non-empty suffixes, e.g. "/foo/bar" vs
       "/foo/qux".  */
    return 0;

  /* Exactly one string has a leftover suffix.  */
  rest = *s1 ? s1 : s2;

  /* If the suffix does not begin with '/', back up one character to
     include the '/' that ended the common prefix, e.g.
     "foo|/index.html" -> "/index.html".  */
  if (*rest != '/')
    --rest;

  if (rest[0] == '/' && rest[1] == '\0')
    /* "foo" vs "foo/".  */
    return 1;

  return 0 == strcmp (rest, "/index.html");
}
|
|---|
| 687 |
|
|---|
| 688 | static int
|
|---|
| 689 | dissociate_urls_from_file_mapper (void *key, void *value, void *arg)
|
|---|
| 690 | {
|
|---|
| 691 | char *mapping_url = (char *)key;
|
|---|
| 692 | char *mapping_file = (char *)value;
|
|---|
| 693 | char *file = (char *)arg;
|
|---|
| 694 |
|
|---|
| 695 | if (0 == strcmp (mapping_file, file))
|
|---|
| 696 | {
|
|---|
| 697 | hash_table_remove (dl_url_file_map, mapping_url);
|
|---|
| 698 | xfree (mapping_url);
|
|---|
| 699 | xfree (mapping_file);
|
|---|
| 700 | }
|
|---|
| 701 |
|
|---|
| 702 | /* Continue mapping. */
|
|---|
| 703 | return 0;
|
|---|
| 704 | }
|
|---|
| 705 |
|
|---|
| 706 | /* Remove all associations from various URLs to FILE from dl_url_file_map. */
|
|---|
| 707 |
|
|---|
| 708 | static void
|
|---|
| 709 | dissociate_urls_from_file (const char *file)
|
|---|
| 710 | {
|
|---|
| 711 | hash_table_map (dl_url_file_map, dissociate_urls_from_file_mapper,
|
|---|
| 712 | (char *)file);
|
|---|
| 713 | }
|
|---|
| 714 |
|
|---|
| 715 | /* Register that URL has been successfully downloaded to FILE. This
|
|---|
| 716 | is used by the link conversion code to convert references to URLs
|
|---|
| 717 | to references to local files. It is also being used to check if a
|
|---|
| 718 | URL has already been downloaded. */
|
|---|
| 719 |
|
|---|
void
register_download (const char *url, const char *file)
{
  char *old_file, *old_url;

  ENSURE_TABLES_EXIST;

  /* With some forms of retrieval it is possible for two different
     URLs to end up in the same local file name, although that is
     neither likely nor particularly desirable.  If both are
     downloaded, the second download overrides the first one; when
     that happens, dissociate the old file name from the URL.  */

  if (hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
    {
      if (0 == strcmp (url, old_url))
	/* We have somehow managed to download the same URL twice.
	   Nothing to do.  */
	return;

      if (match_except_index (url, old_url)
	  && !hash_table_contains (dl_url_file_map, url))
	/* The two URLs differ only in the "index.html" ending.  For
	   example, one is "http://www.server.com/", and the other is
	   "http://www.server.com/index.html".  Don't remove the old
	   one, just add the new one as a non-canonical entry.  */
	goto url_only;

      /* Drop the stale FILE->URL mapping and free the strings that
	 were stored as its key and value.  */
      hash_table_remove (dl_file_url_map, file);
      xfree (old_file);
      xfree (old_url);

      /* Remove all the URLs that point to this file.  Yes, there can
	 be more than one such URL, because we store redirections as
	 multiple entries in dl_url_file_map.  For example, if URL1
	 redirects to URL2 which gets downloaded to FILE, we map both
	 URL1 and URL2 to FILE in dl_url_file_map.  (dl_file_url_map
	 only points to URL2.)  When another URL gets loaded to FILE,
	 we want both URL1 and URL2 dissociated from it.

	 This is a relatively expensive operation because it performs
	 a linear search of the whole hash table, but it should be
	 called very rarely, only when two URLs resolve to the same
	 file name, *and* the "<file>.1" extensions are turned off.
	 In other words, almost never.  */
      dissociate_urls_from_file (file);
    }

  hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));

 url_only:
  /* A URL->FILE mapping is not possible without a FILE->URL mapping.
     If the latter were present, it should have been removed by the
     above `if'.  So we could write:

	 assert (!hash_table_contains (dl_url_file_map, url));

     The above is correct when running in recursive mode where the
     same URL always resolves to the same file.  But if you do
     something like:

	 wget URL URL

     then the first URL will resolve to "FILE", and the other to
     "FILE.1".  In that case, FILE.1 will not be found in
     dl_file_url_map, but URL will still point to FILE in
     dl_url_file_map.  */
  if (hash_table_get_pair (dl_url_file_map, url, &old_url, &old_file))
    {
      hash_table_remove (dl_url_file_map, url);
      xfree (old_url);
      xfree (old_file);
    }

  hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
}
|
|---|
| 795 |
|
|---|
| 796 | /* Register that FROM has been redirected to TO. This assumes that TO
|
|---|
| 797 | is successfully downloaded and already registered using
|
|---|
| 798 | register_download() above. */
|
|---|
| 799 |
|
|---|
| 800 | void
|
|---|
| 801 | register_redirection (const char *from, const char *to)
|
|---|
| 802 | {
|
|---|
| 803 | char *file;
|
|---|
| 804 |
|
|---|
| 805 | ENSURE_TABLES_EXIST;
|
|---|
| 806 |
|
|---|
| 807 | file = hash_table_get (dl_url_file_map, to);
|
|---|
| 808 | assert (file != NULL);
|
|---|
| 809 | if (!hash_table_contains (dl_url_file_map, from))
|
|---|
| 810 | hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
|
|---|
| 811 | }
|
|---|
| 812 |
|
|---|
| 813 | /* Register that the file has been deleted. */
|
|---|
| 814 |
|
|---|
| 815 | void
|
|---|
| 816 | register_delete_file (const char *file)
|
|---|
| 817 | {
|
|---|
| 818 | char *old_url, *old_file;
|
|---|
| 819 |
|
|---|
| 820 | ENSURE_TABLES_EXIST;
|
|---|
| 821 |
|
|---|
| 822 | if (!hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
|
|---|
| 823 | return;
|
|---|
| 824 |
|
|---|
| 825 | hash_table_remove (dl_file_url_map, file);
|
|---|
| 826 | xfree (old_file);
|
|---|
| 827 | xfree (old_url);
|
|---|
| 828 | dissociate_urls_from_file (file);
|
|---|
| 829 | }
|
|---|
| 830 |
|
|---|
| 831 | /* Register that FILE is an HTML file that has been downloaded. */
|
|---|
| 832 |
|
|---|
| 833 | void
|
|---|
| 834 | register_html (const char *url, const char *file)
|
|---|
| 835 | {
|
|---|
| 836 | if (!downloaded_html_set)
|
|---|
| 837 | downloaded_html_set = make_string_hash_table (0);
|
|---|
| 838 | string_set_add (downloaded_html_set, file);
|
|---|
| 839 | }
|
|---|
| 840 |
|
|---|
| 841 | static void downloaded_files_free PARAMS ((void));
|
|---|
| 842 |
|
|---|
| 843 | /* Cleanup the data structures associated with this file. */
|
|---|
| 844 |
|
|---|
| 845 | void
|
|---|
| 846 | convert_cleanup (void)
|
|---|
| 847 | {
|
|---|
| 848 | if (dl_file_url_map)
|
|---|
| 849 | {
|
|---|
| 850 | free_keys_and_values (dl_file_url_map);
|
|---|
| 851 | hash_table_destroy (dl_file_url_map);
|
|---|
| 852 | dl_file_url_map = NULL;
|
|---|
| 853 | }
|
|---|
| 854 | if (dl_url_file_map)
|
|---|
| 855 | {
|
|---|
| 856 | free_keys_and_values (dl_url_file_map);
|
|---|
| 857 | hash_table_destroy (dl_url_file_map);
|
|---|
| 858 | dl_url_file_map = NULL;
|
|---|
| 859 | }
|
|---|
| 860 | if (downloaded_html_set)
|
|---|
| 861 | string_set_free (downloaded_html_set);
|
|---|
| 862 | downloaded_files_free ();
|
|---|
| 863 | if (converted_files)
|
|---|
| 864 | string_set_free (converted_files);
|
|---|
| 865 | }
|
|---|
| 866 | |
|---|
| 867 |
|
|---|
| 868 | /* Book-keeping code for downloaded files that enables extension
|
|---|
| 869 | hacks. */
|
|---|
| 870 |
|
|---|
| 871 | /* This table should really be merged with dl_file_url_map and
|
|---|
| 872 | downloaded_html_files. This was originally a list, but I changed
|
|---|
| 873 | it to a hash table because it was actually taking a lot of time to
|
|---|
| 874 | find things in it. */
|
|---|
| 875 |
|
|---|
| 876 | static struct hash_table *downloaded_files_hash;
|
|---|
| 877 |
|
|---|
| 878 | /* We're storing "modes" of type downloaded_file_t in the hash table.
|
|---|
| 879 | However, our hash tables only accept pointers for keys and values.
|
|---|
| 880 | So when we need a pointer, we use the address of a
|
|---|
| 881 | downloaded_file_t variable of static storage. */
|
|---|
| 882 |
|
|---|
| 883 | static downloaded_file_t *
|
|---|
| 884 | downloaded_mode_to_ptr (downloaded_file_t mode)
|
|---|
| 885 | {
|
|---|
| 886 | static downloaded_file_t
|
|---|
| 887 | v1 = FILE_NOT_ALREADY_DOWNLOADED,
|
|---|
| 888 | v2 = FILE_DOWNLOADED_NORMALLY,
|
|---|
| 889 | v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
|
|---|
| 890 | v4 = CHECK_FOR_FILE;
|
|---|
| 891 |
|
|---|
| 892 | switch (mode)
|
|---|
| 893 | {
|
|---|
| 894 | case FILE_NOT_ALREADY_DOWNLOADED:
|
|---|
| 895 | return &v1;
|
|---|
| 896 | case FILE_DOWNLOADED_NORMALLY:
|
|---|
| 897 | return &v2;
|
|---|
| 898 | case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
|
|---|
| 899 | return &v3;
|
|---|
| 900 | case CHECK_FOR_FILE:
|
|---|
| 901 | return &v4;
|
|---|
| 902 | }
|
|---|
| 903 | return NULL;
|
|---|
| 904 | }
|
|---|
| 905 |
|
|---|
| 906 | /* Remembers which files have been downloaded. In the standard case,
|
|---|
| 907 | should be called with mode == FILE_DOWNLOADED_NORMALLY for each
|
|---|
| 908 | file we actually download successfully (i.e. not for ones we have
|
|---|
| 909 | failures on or that we skip due to -N).
|
|---|
| 910 |
|
|---|
| 911 | When we've downloaded a file and tacked on a ".html" extension due
|
|---|
| 912 | to -E, call this function with
|
|---|
| 913 | FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
|
|---|
| 914 | FILE_DOWNLOADED_NORMALLY.
|
|---|
| 915 |
|
|---|
| 916 | If you just want to check if a file has been previously added
|
|---|
| 917 | without adding it, call with mode == CHECK_FOR_FILE. Please be
|
|---|
| 918 | sure to call this function with local filenames, not remote
|
|---|
| 919 | URLs. */
|
|---|
| 920 |
|
|---|
| 921 | downloaded_file_t
|
|---|
| 922 | downloaded_file (downloaded_file_t mode, const char *file)
|
|---|
| 923 | {
|
|---|
| 924 | downloaded_file_t *ptr;
|
|---|
| 925 |
|
|---|
| 926 | if (mode == CHECK_FOR_FILE)
|
|---|
| 927 | {
|
|---|
| 928 | if (!downloaded_files_hash)
|
|---|
| 929 | return FILE_NOT_ALREADY_DOWNLOADED;
|
|---|
| 930 | ptr = hash_table_get (downloaded_files_hash, file);
|
|---|
| 931 | if (!ptr)
|
|---|
| 932 | return FILE_NOT_ALREADY_DOWNLOADED;
|
|---|
| 933 | return *ptr;
|
|---|
| 934 | }
|
|---|
| 935 |
|
|---|
| 936 | if (!downloaded_files_hash)
|
|---|
| 937 | downloaded_files_hash = make_string_hash_table (0);
|
|---|
| 938 |
|
|---|
| 939 | ptr = hash_table_get (downloaded_files_hash, file);
|
|---|
| 940 | if (ptr)
|
|---|
| 941 | return *ptr;
|
|---|
| 942 |
|
|---|
| 943 | ptr = downloaded_mode_to_ptr (mode);
|
|---|
| 944 | hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
|
|---|
| 945 |
|
|---|
| 946 | return FILE_NOT_ALREADY_DOWNLOADED;
|
|---|
| 947 | }
|
|---|
| 948 |
|
|---|
/* hash_table_map callback: free one downloaded_files_hash entry.
   Only the key (a strdup'ed file name) is heap-allocated; the value
   points at static storage inside downloaded_mode_to_ptr and must
   not be freed.  */

static int
df_free_mapper (void *file_name, void *mode_ptr, void *unused)
{
  xfree (file_name);
  return 0;
}
|
|---|
| 955 |
|
|---|
| 956 | static void
|
|---|
| 957 | downloaded_files_free (void)
|
|---|
| 958 | {
|
|---|
| 959 | if (downloaded_files_hash)
|
|---|
| 960 | {
|
|---|
| 961 | hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
|
|---|
| 962 | hash_table_destroy (downloaded_files_hash);
|
|---|
| 963 | downloaded_files_hash = NULL;
|
|---|
| 964 | }
|
|---|
| 965 | }
|
|---|
| 966 | |
|---|
| 967 |
|
|---|
/* The function returns the pointer to the malloc-ed quoted version of
   string s.  It will recognize and quote numeric and special graphic
   entities, as per RFC1866:

   `&' -> `&amp;'
   `<' -> `&lt;'
   `>' -> `&gt;'
   `"' -> `&quot;'
   SP -> `&#32;'

   No other entities are recognized or replaced.  */
|
|---|
char *
html_quote_string (const char *s)
{
  const char *scan;
  char *quoted, *w;
  int quoted_len;

  /* First pass: compute the length of the quoted string.  */
  quoted_len = 0;
  for (scan = s; *scan; scan++)
    switch (*scan)
      {
      case '&':
	quoted_len += 5;	/* "&amp;" */
	break;
      case '<':
      case '>':
	quoted_len += 4;	/* "&lt;" or "&gt;" */
	break;
      case '\"':
	quoted_len += 6;	/* "&quot;" */
	break;
      case ' ':
	quoted_len += 5;	/* "&#32;" */
	break;
      default:
	quoted_len += 1;
      }

  quoted = (char *)xmalloc (quoted_len + 1);

  /* Second pass: copy S into QUOTED, expanding the entities.  */
  w = quoted;
  for (scan = s; *scan; scan++)
    {
      const char *rep;

      switch (*scan)
	{
	case '&':
	  rep = "&amp;";
	  break;
	case '<':
	  rep = "&lt;";
	  break;
	case '>':
	  rep = "&gt;";
	  break;
	case '\"':
	  rep = "&quot;";
	  break;
	case ' ':
	  rep = "&#32;";
	  break;
	default:
	  *w++ = *scan;
	  continue;
	}
      while (*rep)
	*w++ = *rep++;
    }
  *w = '\0';
  return quoted;
}
|
|---|