| 1 | /* Collect URLs from HTML source.
|
|---|
| 2 | Copyright (C) 1998, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
|
|---|
| 3 |
|
|---|
| 4 | This file is part of GNU Wget.
|
|---|
| 5 |
|
|---|
| 6 | GNU Wget is free software; you can redistribute it and/or modify
|
|---|
| 7 | it under the terms of the GNU General Public License as published by
|
|---|
| 8 | the Free Software Foundation; either version 2 of the License, or
|
|---|
| 9 | (at your option) any later version.
|
|---|
| 10 |
|
|---|
| 11 | GNU Wget is distributed in the hope that it will be useful,
|
|---|
| 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|---|
| 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|---|
| 14 | GNU General Public License for more details.
|
|---|
| 15 |
|
|---|
| 16 | You should have received a copy of the GNU General Public License
|
|---|
| 17 | along with Wget; if not, write to the Free Software
|
|---|
| 18 | Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
|---|
| 19 |
|
|---|
| 20 | In addition, as a special exception, the Free Software Foundation
|
|---|
| 21 | gives permission to link the code of its release of Wget with the
|
|---|
| 22 | OpenSSL project's "OpenSSL" library (or with modified versions of it
|
|---|
| 23 | that use the same license as the "OpenSSL" library), and distribute
|
|---|
| 24 | the linked executables. You must obey the GNU General Public License
|
|---|
| 25 | in all respects for all of the code used other than "OpenSSL". If you
|
|---|
| 26 | modify this file, you may extend this exception to your version of the
|
|---|
| 27 | file, but you are not obligated to do so. If you do not wish to do
|
|---|
| 28 | so, delete this exception statement from your version. */
|
|---|
| 29 |
|
|---|
| 30 | #include <config.h>
|
|---|
| 31 |
|
|---|
| 32 | #include <stdio.h>
|
|---|
| 33 | #ifdef HAVE_STRING_H
|
|---|
| 34 | # include <string.h>
|
|---|
| 35 | #else
|
|---|
| 36 | # include <strings.h>
|
|---|
| 37 | #endif
|
|---|
| 38 | #include <stdlib.h>
|
|---|
| 39 | #include <errno.h>
|
|---|
| 40 | #include <assert.h>
|
|---|
| 41 |
|
|---|
| 42 | #include "wget.h"
|
|---|
| 43 | #include "html-parse.h"
|
|---|
| 44 | #include "url.h"
|
|---|
| 45 | #include "utils.h"
|
|---|
| 46 | #include "hash.h"
|
|---|
| 47 | #include "convert.h"
|
|---|
| 48 | #include "recur.h" /* declaration of get_urls_html */
|
|---|
| 49 |
|
|---|
| 50 | #ifndef errno
|
|---|
| 51 | extern int errno;
|
|---|
| 52 | #endif
|
|---|
| 53 |
|
|---|
| 54 | struct map_context;
|
|---|
| 55 |
|
|---|
| 56 | typedef void (*tag_handler_t) PARAMS ((int, struct taginfo *,
|
|---|
| 57 | struct map_context *));
|
|---|
| 58 |
|
|---|
| 59 | #define DECLARE_TAG_HANDLER(fun) \
|
|---|
| 60 | static void fun PARAMS ((int, struct taginfo *, struct map_context *))
|
|---|
| 61 |
|
|---|
| 62 | DECLARE_TAG_HANDLER (tag_find_urls);
|
|---|
| 63 | DECLARE_TAG_HANDLER (tag_handle_base);
|
|---|
| 64 | DECLARE_TAG_HANDLER (tag_handle_form);
|
|---|
| 65 | DECLARE_TAG_HANDLER (tag_handle_link);
|
|---|
| 66 | DECLARE_TAG_HANDLER (tag_handle_meta);
|
|---|
| 67 |
|
|---|
/* Numeric ids of the tags handled in this file.  The ids tie rows of
   known_tags[] to rows of tag_url_attributes[]; the order is
   alphabetical by tag name.  */
enum {
  TAG_A, TAG_APPLET, TAG_AREA,
  TAG_BASE, TAG_BGSOUND, TAG_BODY,
  TAG_EMBED,
  TAG_FIG, TAG_FORM, TAG_FRAME,
  TAG_IFRAME, TAG_IMG, TAG_INPUT,
  TAG_LAYER, TAG_LINK,
  TAG_META,
  TAG_OBJECT, TAG_OVERLAY,
  TAG_SCRIPT,
  TAG_TABLE, TAG_TD, TAG_TH
};
|
|---|
| 92 |
|
|---|
| 93 | /* The list of known tags and functions used for handling them. Most
|
|---|
| 94 | tags are simply harvested for URLs. */
|
|---|
| 95 | static struct known_tag {
|
|---|
| 96 | int tagid;
|
|---|
| 97 | const char *name;
|
|---|
| 98 | tag_handler_t handler;
|
|---|
| 99 | } known_tags[] = {
|
|---|
| 100 | { TAG_A, "a", tag_find_urls },
|
|---|
| 101 | { TAG_APPLET, "applet", tag_find_urls },
|
|---|
| 102 | { TAG_AREA, "area", tag_find_urls },
|
|---|
| 103 | { TAG_BASE, "base", tag_handle_base },
|
|---|
| 104 | { TAG_BGSOUND, "bgsound", tag_find_urls },
|
|---|
| 105 | { TAG_BODY, "body", tag_find_urls },
|
|---|
| 106 | { TAG_EMBED, "embed", tag_find_urls },
|
|---|
| 107 | { TAG_FIG, "fig", tag_find_urls },
|
|---|
| 108 | { TAG_FORM, "form", tag_handle_form },
|
|---|
| 109 | { TAG_FRAME, "frame", tag_find_urls },
|
|---|
| 110 | { TAG_IFRAME, "iframe", tag_find_urls },
|
|---|
| 111 | { TAG_IMG, "img", tag_find_urls },
|
|---|
| 112 | { TAG_INPUT, "input", tag_find_urls },
|
|---|
| 113 | { TAG_LAYER, "layer", tag_find_urls },
|
|---|
| 114 | { TAG_LINK, "link", tag_handle_link },
|
|---|
| 115 | { TAG_META, "meta", tag_handle_meta },
|
|---|
| 116 | { TAG_OBJECT, "object", tag_find_urls },
|
|---|
| 117 | { TAG_OVERLAY, "overlay", tag_find_urls },
|
|---|
| 118 | { TAG_SCRIPT, "script", tag_find_urls },
|
|---|
| 119 | { TAG_TABLE, "table", tag_find_urls },
|
|---|
| 120 | { TAG_TD, "td", tag_find_urls },
|
|---|
| 121 | { TAG_TH, "th", tag_find_urls }
|
|---|
| 122 | };
|
|---|
| 123 |
|
|---|
/* The tag_url_attributes table below records, for each tag, which
   attributes hold URLs worth harvesting; tag_find_urls consults it.
   The flag bits used in that table are defined first.  */

/* Flag bit: the link is "inline", meaning it has to be fetched for
   the document to render correctly -- inlined images, style sheets,
   child frames and the like.  */
#define ATTR_INLINE (1 << 0)

/* Flag bit: the link is expected to produce HTML.  Without this
   distinction we might parse as HTML whatever e.g. <img src="...">
   returns, regardless of content-type, which loops forever on
   "images" that answer with a non-404 error page linking back to the
   same image.  */
#define ATTR_HTML (1 << 1)
|
|---|
| 140 |
|
|---|
| 141 | /* For tags handled by tag_find_urls: attributes that contain URLs to
|
|---|
| 142 | download. */
|
|---|
| 143 | static struct {
|
|---|
| 144 | int tagid;
|
|---|
| 145 | const char *attr_name;
|
|---|
| 146 | int flags;
|
|---|
| 147 | } tag_url_attributes[] = {
|
|---|
| 148 | { TAG_A, "href", ATTR_HTML },
|
|---|
| 149 | { TAG_APPLET, "code", ATTR_INLINE },
|
|---|
| 150 | { TAG_AREA, "href", ATTR_HTML },
|
|---|
| 151 | { TAG_BGSOUND, "src", ATTR_INLINE },
|
|---|
| 152 | { TAG_BODY, "background", ATTR_INLINE },
|
|---|
| 153 | { TAG_EMBED, "href", ATTR_HTML },
|
|---|
| 154 | { TAG_EMBED, "src", ATTR_INLINE | ATTR_HTML },
|
|---|
| 155 | { TAG_FIG, "src", ATTR_INLINE },
|
|---|
| 156 | { TAG_FRAME, "src", ATTR_INLINE | ATTR_HTML },
|
|---|
| 157 | { TAG_IFRAME, "src", ATTR_INLINE | ATTR_HTML },
|
|---|
| 158 | { TAG_IMG, "href", ATTR_INLINE },
|
|---|
| 159 | { TAG_IMG, "lowsrc", ATTR_INLINE },
|
|---|
| 160 | { TAG_IMG, "src", ATTR_INLINE },
|
|---|
| 161 | { TAG_INPUT, "src", ATTR_INLINE },
|
|---|
| 162 | { TAG_LAYER, "src", ATTR_INLINE | ATTR_HTML },
|
|---|
| 163 | { TAG_OBJECT, "data", ATTR_INLINE },
|
|---|
| 164 | { TAG_OVERLAY, "src", ATTR_INLINE | ATTR_HTML },
|
|---|
| 165 | { TAG_SCRIPT, "src", ATTR_INLINE },
|
|---|
| 166 | { TAG_TABLE, "background", ATTR_INLINE },
|
|---|
| 167 | { TAG_TD, "background", ATTR_INLINE },
|
|---|
| 168 | { TAG_TH, "background", ATTR_INLINE }
|
|---|
| 169 | };
|
|---|
| 170 |
|
|---|
/* The sets of interesting tags and attributes are assembled at run
   time from the tables above.  The special-purpose handlers look up
   a few attributes that those tables do not mention, so they are
   listed here by hand.  */
static const char *additional_attributes[] = {
  "rel",                        /* read by tag_handle_link */
  "http-equiv",                 /* read by tag_handle_meta */
  "name",                       /* read by tag_handle_meta */
  "content",                    /* read by tag_handle_meta */
  "action"                      /* read by tag_handle_form */
};
|
|---|
| 181 |
|
|---|
| 182 | struct hash_table *interesting_tags;
|
|---|
| 183 | struct hash_table *interesting_attributes;
|
|---|
| 184 |
|
|---|
| 185 | static void
|
|---|
| 186 | init_interesting (void)
|
|---|
| 187 | {
|
|---|
| 188 | /* Init the variables interesting_tags and interesting_attributes
|
|---|
| 189 | that are used by the HTML parser to know which tags and
|
|---|
| 190 | attributes we're interested in. We initialize this only once,
|
|---|
| 191 | for performance reasons.
|
|---|
| 192 |
|
|---|
| 193 | Here we also make sure that what we put in interesting_tags
|
|---|
| 194 | matches the user's preferences as specified through --ignore-tags
|
|---|
| 195 | and --follow-tags. */
|
|---|
| 196 |
|
|---|
| 197 | int i;
|
|---|
| 198 | interesting_tags = make_nocase_string_hash_table (countof (known_tags));
|
|---|
| 199 |
|
|---|
| 200 | /* First, add all the tags we know hot to handle, mapped to their
|
|---|
| 201 | respective entries in known_tags. */
|
|---|
| 202 | for (i = 0; i < countof (known_tags); i++)
|
|---|
| 203 | hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
|
|---|
| 204 |
|
|---|
| 205 | /* Then remove the tags ignored through --ignore-tags. */
|
|---|
| 206 | if (opt.ignore_tags)
|
|---|
| 207 | {
|
|---|
| 208 | char **ignored;
|
|---|
| 209 | for (ignored = opt.ignore_tags; *ignored; ignored++)
|
|---|
| 210 | hash_table_remove (interesting_tags, *ignored);
|
|---|
| 211 | }
|
|---|
| 212 |
|
|---|
| 213 | /* If --follow-tags is specified, use only those tags. */
|
|---|
| 214 | if (opt.follow_tags)
|
|---|
| 215 | {
|
|---|
| 216 | /* Create a new table intersecting --follow-tags and known_tags,
|
|---|
| 217 | and use it as interesting_tags. */
|
|---|
| 218 | struct hash_table *intersect = make_nocase_string_hash_table (0);
|
|---|
| 219 | char **followed;
|
|---|
| 220 | for (followed = opt.follow_tags; *followed; followed++)
|
|---|
| 221 | {
|
|---|
| 222 | struct known_tag *t = hash_table_get (interesting_tags, *followed);
|
|---|
| 223 | if (!t)
|
|---|
| 224 | continue; /* ignore unknown --follow-tags entries. */
|
|---|
| 225 | hash_table_put (intersect, *followed, t);
|
|---|
| 226 | }
|
|---|
| 227 | hash_table_destroy (interesting_tags);
|
|---|
| 228 | interesting_tags = intersect;
|
|---|
| 229 | }
|
|---|
| 230 |
|
|---|
| 231 | /* Add the attributes we care about. */
|
|---|
| 232 | interesting_attributes = make_nocase_string_hash_table (10);
|
|---|
| 233 | for (i = 0; i < countof (additional_attributes); i++)
|
|---|
| 234 | hash_table_put (interesting_attributes, additional_attributes[i], "1");
|
|---|
| 235 | for (i = 0; i < countof (tag_url_attributes); i++)
|
|---|
| 236 | hash_table_put (interesting_attributes,
|
|---|
| 237 | tag_url_attributes[i].attr_name, "1");
|
|---|
| 238 | }
|
|---|
| 239 |
|
|---|
| 240 | /* Find the value of attribute named NAME in the taginfo TAG. If the
|
|---|
| 241 | attribute is not present, return NULL. If ATTRIND is non-NULL, the
|
|---|
| 242 | index of the attribute in TAG will be stored there. */
|
|---|
| 243 |
|
|---|
| 244 | static char *
|
|---|
| 245 | find_attr (struct taginfo *tag, const char *name, int *attrind)
|
|---|
| 246 | {
|
|---|
| 247 | int i;
|
|---|
| 248 | for (i = 0; i < tag->nattrs; i++)
|
|---|
| 249 | if (!strcasecmp (tag->attrs[i].name, name))
|
|---|
| 250 | {
|
|---|
| 251 | if (attrind)
|
|---|
| 252 | *attrind = i;
|
|---|
| 253 | return tag->attrs[i].value;
|
|---|
| 254 | }
|
|---|
| 255 | return NULL;
|
|---|
| 256 | }
|
|---|
| 257 |
|
|---|
/* State carried through one pass of map_html_tags over a document.  */
struct map_context {
  char *text;                   /* Start of the HTML text; URL
                                   positions are offsets from it.  */
  char *base;                   /* Base URI in effect, set once a
                                   <base href=...> has been seen;
                                   NULL until then.  */
  const char *parent_base;      /* Base the document started with.  */
  const char *document_file;    /* Name of the file being scanned.  */
  int nofollow;                 /* Non-zero once a <meta name=robots>
                                   tag requested NOFOLLOW.  */

  struct urlpos *head;          /* First element of the URL list
                                   under construction.  */
  struct urlpos *tail;          /* Last element of that list.  */
};
|
|---|
| 270 |
|
|---|
| 271 | /* Append LINK_URI to the urlpos structure that is being built.
|
|---|
| 272 |
|
|---|
| 273 | LINK_URI will be merged with the current document base. TAG and
|
|---|
| 274 | ATTRIND are the necessary context to store the position and
|
|---|
| 275 | size. */
|
|---|
| 276 |
|
|---|
| 277 | static struct urlpos *
|
|---|
| 278 | append_url (const char *link_uri,
|
|---|
| 279 | struct taginfo *tag, int attrind, struct map_context *ctx)
|
|---|
| 280 | {
|
|---|
| 281 | int link_has_scheme = url_has_scheme (link_uri);
|
|---|
| 282 | struct urlpos *newel;
|
|---|
| 283 | const char *base = ctx->base ? ctx->base : ctx->parent_base;
|
|---|
| 284 | struct url *url;
|
|---|
| 285 |
|
|---|
| 286 | if (!base)
|
|---|
| 287 | {
|
|---|
| 288 | DEBUGP (("%s: no base, merge will use \"%s\".\n",
|
|---|
| 289 | ctx->document_file, link_uri));
|
|---|
| 290 |
|
|---|
| 291 | if (!link_has_scheme)
|
|---|
| 292 | {
|
|---|
| 293 | /* Base URL is unavailable, and the link does not have a
|
|---|
| 294 | location attached to it -- we have to give up. Since
|
|---|
| 295 | this can only happen when using `--force-html -i', print
|
|---|
| 296 | a warning. */
|
|---|
| 297 | logprintf (LOG_NOTQUIET,
|
|---|
| 298 | _("%s: Cannot resolve incomplete link %s.\n"),
|
|---|
| 299 | ctx->document_file, link_uri);
|
|---|
| 300 | return NULL;
|
|---|
| 301 | }
|
|---|
| 302 |
|
|---|
| 303 | url = url_parse (link_uri, NULL);
|
|---|
| 304 | if (!url)
|
|---|
| 305 | {
|
|---|
| 306 | DEBUGP (("%s: link \"%s\" doesn't parse.\n",
|
|---|
| 307 | ctx->document_file, link_uri));
|
|---|
| 308 | return NULL;
|
|---|
| 309 | }
|
|---|
| 310 | }
|
|---|
| 311 | else
|
|---|
| 312 | {
|
|---|
| 313 | /* Merge BASE with LINK_URI, but also make sure the result is
|
|---|
| 314 | canonicalized, i.e. that "../" have been resolved.
|
|---|
| 315 | (parse_url will do that for us.) */
|
|---|
| 316 |
|
|---|
| 317 | char *complete_uri = uri_merge (base, link_uri);
|
|---|
| 318 |
|
|---|
| 319 | DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
|
|---|
| 320 | ctx->document_file, base, link_uri, complete_uri));
|
|---|
| 321 |
|
|---|
| 322 | url = url_parse (complete_uri, NULL);
|
|---|
| 323 | if (!url)
|
|---|
| 324 | {
|
|---|
| 325 | DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
|
|---|
| 326 | ctx->document_file, complete_uri));
|
|---|
| 327 | xfree (complete_uri);
|
|---|
| 328 | return NULL;
|
|---|
| 329 | }
|
|---|
| 330 | xfree (complete_uri);
|
|---|
| 331 | }
|
|---|
| 332 |
|
|---|
| 333 | DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
|
|---|
| 334 |
|
|---|
| 335 | newel = xnew0 (struct urlpos);
|
|---|
| 336 | newel->url = url;
|
|---|
| 337 | newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
|
|---|
| 338 | newel->size = tag->attrs[attrind].value_raw_size;
|
|---|
| 339 |
|
|---|
| 340 | /* A URL is relative if the host is not named, and the name does not
|
|---|
| 341 | start with `/'. */
|
|---|
| 342 | if (!link_has_scheme && *link_uri != '/')
|
|---|
| 343 | newel->link_relative_p = 1;
|
|---|
| 344 | else if (link_has_scheme)
|
|---|
| 345 | newel->link_complete_p = 1;
|
|---|
| 346 |
|
|---|
| 347 | if (ctx->tail)
|
|---|
| 348 | {
|
|---|
| 349 | ctx->tail->next = newel;
|
|---|
| 350 | ctx->tail = newel;
|
|---|
| 351 | }
|
|---|
| 352 | else
|
|---|
| 353 | ctx->tail = ctx->head = newel;
|
|---|
| 354 |
|
|---|
| 355 | return newel;
|
|---|
| 356 | }
|
|---|
| 357 | |
|---|
| 358 |
|
|---|
| 359 | /* All the tag_* functions are called from collect_tags_mapper, as
|
|---|
| 360 | specified by KNOWN_TAGS. */
|
|---|
| 361 |
|
|---|
| 362 | /* Default tag handler: collect URLs from attributes specified for
|
|---|
| 363 | this tag by tag_url_attributes. */
|
|---|
| 364 |
|
|---|
| 365 | static void
|
|---|
| 366 | tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
|
|---|
| 367 | {
|
|---|
| 368 | int i, attrind;
|
|---|
| 369 | int first = -1;
|
|---|
| 370 |
|
|---|
| 371 | for (i = 0; i < countof (tag_url_attributes); i++)
|
|---|
| 372 | if (tag_url_attributes[i].tagid == tagid)
|
|---|
| 373 | {
|
|---|
| 374 | /* We've found the index of tag_url_attributes where the
|
|---|
| 375 | attributes of our tag begin. */
|
|---|
| 376 | first = i;
|
|---|
| 377 | break;
|
|---|
| 378 | }
|
|---|
| 379 | assert (first != -1);
|
|---|
| 380 |
|
|---|
| 381 | /* Loop over the "interesting" attributes of this tag. In this
|
|---|
| 382 | example, it will loop over "src" and "lowsrc".
|
|---|
| 383 |
|
|---|
| 384 | <img src="foo.png" lowsrc="bar.png">
|
|---|
| 385 |
|
|---|
| 386 | This has to be done in the outer loop so that the attributes are
|
|---|
| 387 | processed in the same order in which they appear in the page.
|
|---|
| 388 | This is required when converting links. */
|
|---|
| 389 |
|
|---|
| 390 | for (attrind = 0; attrind < tag->nattrs; attrind++)
|
|---|
| 391 | {
|
|---|
| 392 | /* Find whether TAG/ATTRIND is a combination that contains a
|
|---|
| 393 | URL. */
|
|---|
| 394 | char *link = tag->attrs[attrind].value;
|
|---|
| 395 | const int size = countof (tag_url_attributes);
|
|---|
| 396 |
|
|---|
| 397 | /* If you're cringing at the inefficiency of the nested loops,
|
|---|
| 398 | remember that they both iterate over a very small number of
|
|---|
| 399 | items. The worst-case inner loop is for the IMG tag, which
|
|---|
| 400 | has three attributes. */
|
|---|
| 401 | for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
|
|---|
| 402 | {
|
|---|
| 403 | if (0 == strcasecmp (tag->attrs[attrind].name,
|
|---|
| 404 | tag_url_attributes[i].attr_name))
|
|---|
| 405 | {
|
|---|
| 406 | struct urlpos *up = append_url (link, tag, attrind, ctx);
|
|---|
| 407 | if (up)
|
|---|
| 408 | {
|
|---|
| 409 | int flags = tag_url_attributes[i].flags;
|
|---|
| 410 | if (flags & ATTR_INLINE)
|
|---|
| 411 | up->link_inline_p = 1;
|
|---|
| 412 | if (flags & ATTR_HTML)
|
|---|
| 413 | up->link_expect_html = 1;
|
|---|
| 414 | }
|
|---|
| 415 | }
|
|---|
| 416 | }
|
|---|
| 417 | }
|
|---|
| 418 | }
|
|---|
| 419 |
|
|---|
| 420 | /* Handle the BASE tag, for <base href=...>. */
|
|---|
| 421 |
|
|---|
| 422 | static void
|
|---|
| 423 | tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
|
|---|
| 424 | {
|
|---|
| 425 | struct urlpos *base_urlpos;
|
|---|
| 426 | int attrind;
|
|---|
| 427 | char *newbase = find_attr (tag, "href", &attrind);
|
|---|
| 428 | if (!newbase)
|
|---|
| 429 | return;
|
|---|
| 430 |
|
|---|
| 431 | base_urlpos = append_url (newbase, tag, attrind, ctx);
|
|---|
| 432 | if (!base_urlpos)
|
|---|
| 433 | return;
|
|---|
| 434 | base_urlpos->ignore_when_downloading = 1;
|
|---|
| 435 | base_urlpos->link_base_p = 1;
|
|---|
| 436 |
|
|---|
| 437 | if (ctx->base)
|
|---|
| 438 | xfree (ctx->base);
|
|---|
| 439 | if (ctx->parent_base)
|
|---|
| 440 | ctx->base = uri_merge (ctx->parent_base, newbase);
|
|---|
| 441 | else
|
|---|
| 442 | ctx->base = xstrdup (newbase);
|
|---|
| 443 | }
|
|---|
| 444 |
|
|---|
| 445 | /* Mark the URL found in <form action=...> for conversion. */
|
|---|
| 446 |
|
|---|
| 447 | static void
|
|---|
| 448 | tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
|
|---|
| 449 | {
|
|---|
| 450 | int attrind;
|
|---|
| 451 | char *action = find_attr (tag, "action", &attrind);
|
|---|
| 452 | if (action)
|
|---|
| 453 | {
|
|---|
| 454 | struct urlpos *up = append_url (action, tag, attrind, ctx);
|
|---|
| 455 | if (up)
|
|---|
| 456 | up->ignore_when_downloading = 1;
|
|---|
| 457 | }
|
|---|
| 458 | }
|
|---|
| 459 |
|
|---|
| 460 | /* Handle the LINK tag. It requires special handling because how its
|
|---|
| 461 | links will be followed in -p mode depends on the REL attribute. */
|
|---|
| 462 |
|
|---|
| 463 | static void
|
|---|
| 464 | tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
|
|---|
| 465 | {
|
|---|
| 466 | int attrind;
|
|---|
| 467 | char *href = find_attr (tag, "href", &attrind);
|
|---|
| 468 |
|
|---|
| 469 | /* All <link href="..."> link references are external, except those
|
|---|
| 470 | known not to be, such as style sheet and shortcut icon:
|
|---|
| 471 |
|
|---|
| 472 | <link rel="stylesheet" href="...">
|
|---|
| 473 | <link rel="shortcut icon" href="...">
|
|---|
| 474 | */
|
|---|
| 475 | if (href)
|
|---|
| 476 | {
|
|---|
| 477 | struct urlpos *up = append_url (href, tag, attrind, ctx);
|
|---|
| 478 | if (up)
|
|---|
| 479 | {
|
|---|
| 480 | char *rel = find_attr (tag, "rel", NULL);
|
|---|
| 481 | if (rel
|
|---|
| 482 | && (0 == strcasecmp (rel, "stylesheet")
|
|---|
| 483 | || 0 == strcasecmp (rel, "shortcut icon")))
|
|---|
| 484 | up->link_inline_p = 1;
|
|---|
| 485 | else
|
|---|
| 486 | /* The external ones usually point to HTML pages, such as
|
|---|
| 487 | <link rel="next" href="..."> */
|
|---|
| 488 | up->link_expect_html = 1;
|
|---|
| 489 | }
|
|---|
| 490 | }
|
|---|
| 491 | }
|
|---|
| 492 |
|
|---|
| 493 | /* Handle the META tag. This requires special handling because of the
|
|---|
| 494 | refresh feature and because of robot exclusion. */
|
|---|
| 495 |
|
|---|
| 496 | static void
|
|---|
| 497 | tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
|
|---|
| 498 | {
|
|---|
| 499 | char *name = find_attr (tag, "name", NULL);
|
|---|
| 500 | char *http_equiv = find_attr (tag, "http-equiv", NULL);
|
|---|
| 501 |
|
|---|
| 502 | if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
|
|---|
| 503 | {
|
|---|
| 504 | /* Some pages use a META tag to specify that the page be
|
|---|
| 505 | refreshed by a new page after a given number of seconds. The
|
|---|
| 506 | general format for this is:
|
|---|
| 507 |
|
|---|
| 508 | <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
|
|---|
| 509 |
|
|---|
| 510 | So we just need to skip past the "NUMBER; URL=" garbage to
|
|---|
| 511 | get to the URL. */
|
|---|
| 512 |
|
|---|
| 513 | struct urlpos *entry;
|
|---|
| 514 | int attrind;
|
|---|
| 515 | int timeout = 0;
|
|---|
| 516 | char *p;
|
|---|
| 517 |
|
|---|
| 518 | char *refresh = find_attr (tag, "content", &attrind);
|
|---|
| 519 | if (!refresh)
|
|---|
| 520 | return;
|
|---|
| 521 |
|
|---|
| 522 | for (p = refresh; ISDIGIT (*p); p++)
|
|---|
| 523 | timeout = 10 * timeout + *p - '0';
|
|---|
| 524 | if (*p++ != ';')
|
|---|
| 525 | return;
|
|---|
| 526 |
|
|---|
| 527 | while (ISSPACE (*p))
|
|---|
| 528 | ++p;
|
|---|
| 529 | if (!( TOUPPER (*p) == 'U'
|
|---|
| 530 | && TOUPPER (*(p + 1)) == 'R'
|
|---|
| 531 | && TOUPPER (*(p + 2)) == 'L'
|
|---|
| 532 | && *(p + 3) == '='))
|
|---|
| 533 | return;
|
|---|
| 534 | p += 4;
|
|---|
| 535 | while (ISSPACE (*p))
|
|---|
| 536 | ++p;
|
|---|
| 537 |
|
|---|
| 538 | entry = append_url (p, tag, attrind, ctx);
|
|---|
| 539 | if (entry)
|
|---|
| 540 | {
|
|---|
| 541 | entry->link_refresh_p = 1;
|
|---|
| 542 | entry->refresh_timeout = timeout;
|
|---|
| 543 | entry->link_expect_html = 1;
|
|---|
| 544 | }
|
|---|
| 545 | }
|
|---|
| 546 | else if (name && 0 == strcasecmp (name, "robots"))
|
|---|
| 547 | {
|
|---|
| 548 | /* Handle stuff like:
|
|---|
| 549 | <meta name="robots" content="index,nofollow"> */
|
|---|
| 550 | char *content = find_attr (tag, "content", NULL);
|
|---|
| 551 | if (!content)
|
|---|
| 552 | return;
|
|---|
| 553 | if (!strcasecmp (content, "none"))
|
|---|
| 554 | ctx->nofollow = 1;
|
|---|
| 555 | else
|
|---|
| 556 | {
|
|---|
| 557 | while (*content)
|
|---|
| 558 | {
|
|---|
| 559 | /* Find the next occurrence of ',' or the end of
|
|---|
| 560 | the string. */
|
|---|
| 561 | char *end = strchr (content, ',');
|
|---|
| 562 | if (end)
|
|---|
| 563 | ++end;
|
|---|
| 564 | else
|
|---|
| 565 | end = content + strlen (content);
|
|---|
| 566 | if (!strncasecmp (content, "nofollow", end - content))
|
|---|
| 567 | ctx->nofollow = 1;
|
|---|
| 568 | content = end;
|
|---|
| 569 | }
|
|---|
| 570 | }
|
|---|
| 571 | }
|
|---|
| 572 | }
|
|---|
| 573 |
|
|---|
| 574 | /* Dispatch the tag handler appropriate for the tag we're mapping
|
|---|
| 575 | over. See known_tags[] for definition of tag handlers. */
|
|---|
| 576 |
|
|---|
| 577 | static void
|
|---|
| 578 | collect_tags_mapper (struct taginfo *tag, void *arg)
|
|---|
| 579 | {
|
|---|
| 580 | struct map_context *ctx = (struct map_context *)arg;
|
|---|
| 581 |
|
|---|
| 582 | /* Find the tag in our table of tags. This must not fail because
|
|---|
| 583 | map_html_tags only returns tags found in interesting_tags. */
|
|---|
| 584 | struct known_tag *t = hash_table_get (interesting_tags, tag->name);
|
|---|
| 585 | assert (t != NULL);
|
|---|
| 586 |
|
|---|
| 587 | t->handler (t->tagid, tag, ctx);
|
|---|
| 588 | }
|
|---|
| 589 | |
|---|
| 590 |
|
|---|
| 591 | /* Analyze HTML tags FILE and construct a list of URLs referenced from
|
|---|
| 592 | it. It merges relative links in FILE with URL. It is aware of
|
|---|
| 593 | <base href=...> and does the right thing. */
|
|---|
| 594 |
|
|---|
| 595 | struct urlpos *
|
|---|
| 596 | get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
|
|---|
| 597 | {
|
|---|
| 598 | struct file_memory *fm;
|
|---|
| 599 | struct map_context ctx;
|
|---|
| 600 | int flags;
|
|---|
| 601 |
|
|---|
| 602 | /* Load the file. */
|
|---|
| 603 | fm = read_file (file);
|
|---|
| 604 | if (!fm)
|
|---|
| 605 | {
|
|---|
| 606 | logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
|
|---|
| 607 | return NULL;
|
|---|
| 608 | }
|
|---|
| 609 | DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
|
|---|
| 610 |
|
|---|
| 611 | ctx.text = fm->content;
|
|---|
| 612 | ctx.head = ctx.tail = NULL;
|
|---|
| 613 | ctx.base = NULL;
|
|---|
| 614 | ctx.parent_base = url ? url : opt.base_href;
|
|---|
| 615 | ctx.document_file = file;
|
|---|
| 616 | ctx.nofollow = 0;
|
|---|
| 617 |
|
|---|
| 618 | if (!interesting_tags)
|
|---|
| 619 | init_interesting ();
|
|---|
| 620 |
|
|---|
| 621 | /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
|
|---|
| 622 | generate <a href=" foo"> instead of <a href="foo"> (browsers
|
|---|
| 623 | ignore spaces as well.) If you really mean space, use &32; or
|
|---|
| 624 | %20. MHT_TRIM_VALUES also causes squashing of embedded newlines,
|
|---|
| 625 | e.g. in <img src="foo.[newline]html">. Such newlines are also
|
|---|
| 626 | ignored by IE and Mozilla and are presumably introduced by
|
|---|
| 627 | writing HTML with editors that force word wrap. */
|
|---|
| 628 | flags = MHT_TRIM_VALUES;
|
|---|
| 629 | if (opt.strict_comments)
|
|---|
| 630 | flags |= MHT_STRICT_COMMENTS;
|
|---|
| 631 |
|
|---|
| 632 | map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
|
|---|
| 633 | interesting_tags, interesting_attributes);
|
|---|
| 634 |
|
|---|
| 635 | DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
|
|---|
| 636 | if (meta_disallow_follow)
|
|---|
| 637 | *meta_disallow_follow = ctx.nofollow;
|
|---|
| 638 |
|
|---|
| 639 | xfree_null (ctx.base);
|
|---|
| 640 | read_file_free (fm);
|
|---|
| 641 | return ctx.head;
|
|---|
| 642 | }
|
|---|
| 643 |
|
|---|
| 644 | /* This doesn't really have anything to do with HTML, but it's similar
|
|---|
| 645 | to get_urls_html, so we put it here. */
|
|---|
| 646 |
|
|---|
| 647 | struct urlpos *
|
|---|
| 648 | get_urls_file (const char *file)
|
|---|
| 649 | {
|
|---|
| 650 | struct file_memory *fm;
|
|---|
| 651 | struct urlpos *head, *tail;
|
|---|
| 652 | const char *text, *text_end;
|
|---|
| 653 |
|
|---|
| 654 | /* Load the file. */
|
|---|
| 655 | fm = read_file (file);
|
|---|
| 656 | if (!fm)
|
|---|
| 657 | {
|
|---|
| 658 | logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
|
|---|
| 659 | return NULL;
|
|---|
| 660 | }
|
|---|
| 661 | DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
|
|---|
| 662 |
|
|---|
| 663 | head = tail = NULL;
|
|---|
| 664 | text = fm->content;
|
|---|
| 665 | text_end = fm->content + fm->length;
|
|---|
| 666 | while (text < text_end)
|
|---|
| 667 | {
|
|---|
| 668 | int up_error_code;
|
|---|
| 669 | char *url_text;
|
|---|
| 670 | struct urlpos *entry;
|
|---|
| 671 | struct url *url;
|
|---|
| 672 |
|
|---|
| 673 | const char *line_beg = text;
|
|---|
| 674 | const char *line_end = memchr (text, '\n', text_end - text);
|
|---|
| 675 | if (!line_end)
|
|---|
| 676 | line_end = text_end;
|
|---|
| 677 | else
|
|---|
| 678 | ++line_end;
|
|---|
| 679 | text = line_end;
|
|---|
| 680 |
|
|---|
| 681 | /* Strip whitespace from the beginning and end of line. */
|
|---|
| 682 | while (line_beg < line_end && ISSPACE (*line_beg))
|
|---|
| 683 | ++line_beg;
|
|---|
| 684 | while (line_end > line_beg && ISSPACE (*(line_end - 1)))
|
|---|
| 685 | --line_end;
|
|---|
| 686 |
|
|---|
| 687 | if (line_beg == line_end)
|
|---|
| 688 | continue;
|
|---|
| 689 |
|
|---|
| 690 | /* The URL is in the [line_beg, line_end) region. */
|
|---|
| 691 |
|
|---|
| 692 | /* We must copy the URL to a zero-terminated string, and we
|
|---|
| 693 | can't use alloca because we're in a loop. *sigh*. */
|
|---|
| 694 | url_text = strdupdelim (line_beg, line_end);
|
|---|
| 695 |
|
|---|
| 696 | if (opt.base_href)
|
|---|
| 697 | {
|
|---|
| 698 | /* Merge opt.base_href with URL. */
|
|---|
| 699 | char *merged = uri_merge (opt.base_href, url_text);
|
|---|
| 700 | xfree (url_text);
|
|---|
| 701 | url_text = merged;
|
|---|
| 702 | }
|
|---|
| 703 |
|
|---|
| 704 | url = url_parse (url_text, &up_error_code);
|
|---|
| 705 | if (!url)
|
|---|
| 706 | {
|
|---|
| 707 | logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
|
|---|
| 708 | file, url_text, url_error (up_error_code));
|
|---|
| 709 | xfree (url_text);
|
|---|
| 710 | continue;
|
|---|
| 711 | }
|
|---|
| 712 | xfree (url_text);
|
|---|
| 713 |
|
|---|
| 714 | entry = xnew0 (struct urlpos);
|
|---|
| 715 | entry->next = NULL;
|
|---|
| 716 | entry->url = url;
|
|---|
| 717 |
|
|---|
| 718 | if (!head)
|
|---|
| 719 | head = entry;
|
|---|
| 720 | else
|
|---|
| 721 | tail->next = entry;
|
|---|
| 722 | tail = entry;
|
|---|
| 723 | }
|
|---|
| 724 | read_file_free (fm);
|
|---|
| 725 | return head;
|
|---|
| 726 | }
|
|---|
| 727 |
|
|---|
| 728 | void
|
|---|
| 729 | cleanup_html_url (void)
|
|---|
| 730 | {
|
|---|
| 731 | /* Destroy the hash tables. The hash table keys and values are not
|
|---|
| 732 | allocated by this code, so we don't need to free them here. */
|
|---|
| 733 | if (interesting_tags)
|
|---|
| 734 | hash_table_destroy (interesting_tags);
|
|---|
| 735 | if (interesting_attributes)
|
|---|
| 736 | hash_table_destroy (interesting_attributes);
|
|---|
| 737 | }
|
|---|