1 | /* Collect URLs from HTML source.
|
---|
2 | Copyright (C) 1998, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
|
---|
3 |
|
---|
4 | This file is part of GNU Wget.
|
---|
5 |
|
---|
6 | GNU Wget is free software; you can redistribute it and/or modify
|
---|
7 | it under the terms of the GNU General Public License as published by
|
---|
8 | the Free Software Foundation; either version 2 of the License, or
|
---|
9 | (at your option) any later version.
|
---|
10 |
|
---|
11 | GNU Wget is distributed in the hope that it will be useful,
|
---|
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
14 | GNU General Public License for more details.
|
---|
15 |
|
---|
16 | You should have received a copy of the GNU General Public License
|
---|
17 | along with Wget; if not, write to the Free Software
|
---|
18 | Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
19 |
|
---|
20 | In addition, as a special exception, the Free Software Foundation
|
---|
21 | gives permission to link the code of its release of Wget with the
|
---|
22 | OpenSSL project's "OpenSSL" library (or with modified versions of it
|
---|
23 | that use the same license as the "OpenSSL" library), and distribute
|
---|
24 | the linked executables. You must obey the GNU General Public License
|
---|
25 | in all respects for all of the code used other than "OpenSSL". If you
|
---|
26 | modify this file, you may extend this exception to your version of the
|
---|
27 | file, but you are not obligated to do so. If you do not wish to do
|
---|
28 | so, delete this exception statement from your version. */
|
---|
29 |
|
---|
#include <config.h>

#include <stdio.h>
#ifdef HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif
#include <stdlib.h>
#include <errno.h>
#include <assert.h>
#include <limits.h>

#include "wget.h"
#include "html-parse.h"
#include "url.h"
#include "utils.h"
#include "hash.h"
#include "convert.h"
#include "recur.h"		/* declaration of get_urls_html */
|
---|
49 |
|
---|
50 | #ifndef errno
|
---|
51 | extern int errno;
|
---|
52 | #endif
|
---|
53 |
|
---|
/* Forward declaration; the structure itself is defined further down,
   but the handler prototypes below need the name.  */
struct map_context;

/* Signature shared by every tag handler: the numeric tag id, the
   parsed tag, and the mapping context.  */
typedef void (*tag_handler_t) PARAMS ((int, struct taginfo *,
				       struct map_context *));

/* Declare FUN as a static function with the tag handler signature.  */
#define DECLARE_TAG_HANDLER(fun)				\
  static void fun PARAMS ((int, struct taginfo *, struct map_context *))

/* The handlers referenced from the known_tags table.  */
DECLARE_TAG_HANDLER (tag_find_urls);
DECLARE_TAG_HANDLER (tag_handle_base);
DECLARE_TAG_HANDLER (tag_handle_form);
DECLARE_TAG_HANDLER (tag_handle_link);
DECLARE_TAG_HANDLER (tag_handle_meta);
|
---|
67 |
|
---|
/* Numeric ids of the HTML tags this file knows about.  They are used
   as the `tagid' member of the known_tags and tag_url_attributes
   tables.  */
enum {
  TAG_A,
  TAG_APPLET,
  TAG_AREA,
  TAG_BASE,
  TAG_BGSOUND,
  TAG_BODY,
  TAG_EMBED,
  TAG_FIG,
  TAG_FORM,
  TAG_FRAME,
  TAG_IFRAME,
  TAG_IMG,
  TAG_INPUT,
  TAG_LAYER,
  TAG_LINK,
  TAG_META,
  TAG_OBJECT,
  TAG_OVERLAY,
  TAG_SCRIPT,
  TAG_TABLE,
  TAG_TD,
  TAG_TH
};
|
---|
92 |
|
---|
/* The list of known tags and functions used for handling them.  Most
   tags are simply harvested for URLs.  */
static struct known_tag {
  int tagid;			/* one of the TAG_* constants */
  const char *name;		/* tag name; used as the key in
				   interesting_tags */
  tag_handler_t handler;	/* function invoked by
				   collect_tags_mapper for this tag */
} known_tags[] = {
  { TAG_A,	 "a",		tag_find_urls },
  { TAG_APPLET,	 "applet",	tag_find_urls },
  { TAG_AREA,	 "area",	tag_find_urls },
  { TAG_BASE,	 "base",	tag_handle_base },
  { TAG_BGSOUND, "bgsound",	tag_find_urls },
  { TAG_BODY,	 "body",	tag_find_urls },
  { TAG_EMBED,	 "embed",	tag_find_urls },
  { TAG_FIG,	 "fig",		tag_find_urls },
  { TAG_FORM,	 "form",	tag_handle_form },
  { TAG_FRAME,	 "frame",	tag_find_urls },
  { TAG_IFRAME,	 "iframe",	tag_find_urls },
  { TAG_IMG,	 "img",		tag_find_urls },
  { TAG_INPUT,	 "input",	tag_find_urls },
  { TAG_LAYER,	 "layer",	tag_find_urls },
  { TAG_LINK,	 "link",	tag_handle_link },
  { TAG_META,	 "meta",	tag_handle_meta },
  { TAG_OBJECT,  "object",	tag_find_urls },
  { TAG_OVERLAY, "overlay",	tag_find_urls },
  { TAG_SCRIPT,	 "script",	tag_find_urls },
  { TAG_TABLE,	 "table",	tag_find_urls },
  { TAG_TD,	 "td",		tag_find_urls },
  { TAG_TH,	 "th",		tag_find_urls }
};
|
---|
123 |
|
---|
/* tag_url_attributes documents which attributes of which tags contain
   URLs to harvest.  It is used by tag_find_urls.  */

/* Defines for the FLAGS. */

/* The link is "inline", i.e. needs to be retrieved for this document
   to be correctly rendered.  Inline links include inlined images,
   stylesheets, children frames, etc.  */
#define ATTR_INLINE	1

/* The link is expected to yield HTML contents.  It's important not to
   try to follow HTML obtained by following e.g. <img src="...">
   regardless of content-type.  Doing this causes infinite loops for
   "images" that return non-404 error pages with links to the same
   image.  */
#define ATTR_HTML	2

/* For tags handled by tag_find_urls: attributes that contain URLs to
   download.

   NOTE: all entries for a given tag must be adjacent.  tag_find_urls
   locates the first entry with a matching tagid and then scans forward
   only while the tagid keeps matching.  */
static struct {
  int tagid;			/* one of the TAG_* constants */
  const char *attr_name;	/* attribute whose value is a URL */
  int flags;			/* bitwise OR of ATTR_INLINE/ATTR_HTML */
} tag_url_attributes[] = {
  { TAG_A,		"href",		ATTR_HTML },
  { TAG_APPLET,		"code",		ATTR_INLINE },
  { TAG_AREA,		"href",		ATTR_HTML },
  { TAG_BGSOUND,	"src",		ATTR_INLINE },
  { TAG_BODY,		"background",	ATTR_INLINE },
  { TAG_EMBED,		"href",		ATTR_HTML },
  { TAG_EMBED,		"src",		ATTR_INLINE | ATTR_HTML },
  { TAG_FIG,		"src",		ATTR_INLINE },
  { TAG_FRAME,		"src",		ATTR_INLINE | ATTR_HTML },
  { TAG_IFRAME,		"src",		ATTR_INLINE | ATTR_HTML },
  { TAG_IMG,		"href",		ATTR_INLINE },
  { TAG_IMG,		"lowsrc",	ATTR_INLINE },
  { TAG_IMG,		"src",		ATTR_INLINE },
  { TAG_INPUT,		"src",		ATTR_INLINE },
  { TAG_LAYER,		"src",		ATTR_INLINE | ATTR_HTML },
  { TAG_OBJECT,		"data",		ATTR_INLINE },
  { TAG_OVERLAY,	"src",		ATTR_INLINE | ATTR_HTML },
  { TAG_SCRIPT,		"src",		ATTR_INLINE },
  { TAG_TABLE,		"background",	ATTR_INLINE },
  { TAG_TD,		"background",	ATTR_INLINE },
  { TAG_TH,		"background",	ATTR_INLINE }
};
|
---|
170 |
|
---|
/* The lists of interesting tags and attributes are built dynamically,
   from the information above.  However, some places in the code refer
   to the attributes not mentioned here.  We add them manually.  */
static const char *additional_attributes[] = {
  "rel",			/* used by tag_handle_link */
  "http-equiv",			/* used by tag_handle_meta */
  "name",			/* used by tag_handle_meta */
  "content",			/* used by tag_handle_meta */
  "action"			/* used by tag_handle_form */
};

/* Hash tables handed to map_html_tags.  Both are created lazily by
   init_interesting and destroyed by cleanup_html_url.
   interesting_tags maps a tag name to its known_tags entry;
   interesting_attributes is used as a set of attribute names.  */
struct hash_table *interesting_tags;
struct hash_table *interesting_attributes;
|
---|
184 |
|
---|
185 | static void
|
---|
186 | init_interesting (void)
|
---|
187 | {
|
---|
188 | /* Init the variables interesting_tags and interesting_attributes
|
---|
189 | that are used by the HTML parser to know which tags and
|
---|
190 | attributes we're interested in. We initialize this only once,
|
---|
191 | for performance reasons.
|
---|
192 |
|
---|
193 | Here we also make sure that what we put in interesting_tags
|
---|
194 | matches the user's preferences as specified through --ignore-tags
|
---|
195 | and --follow-tags. */
|
---|
196 |
|
---|
197 | int i;
|
---|
198 | interesting_tags = make_nocase_string_hash_table (countof (known_tags));
|
---|
199 |
|
---|
200 | /* First, add all the tags we know hot to handle, mapped to their
|
---|
201 | respective entries in known_tags. */
|
---|
202 | for (i = 0; i < countof (known_tags); i++)
|
---|
203 | hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
|
---|
204 |
|
---|
205 | /* Then remove the tags ignored through --ignore-tags. */
|
---|
206 | if (opt.ignore_tags)
|
---|
207 | {
|
---|
208 | char **ignored;
|
---|
209 | for (ignored = opt.ignore_tags; *ignored; ignored++)
|
---|
210 | hash_table_remove (interesting_tags, *ignored);
|
---|
211 | }
|
---|
212 |
|
---|
213 | /* If --follow-tags is specified, use only those tags. */
|
---|
214 | if (opt.follow_tags)
|
---|
215 | {
|
---|
216 | /* Create a new table intersecting --follow-tags and known_tags,
|
---|
217 | and use it as interesting_tags. */
|
---|
218 | struct hash_table *intersect = make_nocase_string_hash_table (0);
|
---|
219 | char **followed;
|
---|
220 | for (followed = opt.follow_tags; *followed; followed++)
|
---|
221 | {
|
---|
222 | struct known_tag *t = hash_table_get (interesting_tags, *followed);
|
---|
223 | if (!t)
|
---|
224 | continue; /* ignore unknown --follow-tags entries. */
|
---|
225 | hash_table_put (intersect, *followed, t);
|
---|
226 | }
|
---|
227 | hash_table_destroy (interesting_tags);
|
---|
228 | interesting_tags = intersect;
|
---|
229 | }
|
---|
230 |
|
---|
231 | /* Add the attributes we care about. */
|
---|
232 | interesting_attributes = make_nocase_string_hash_table (10);
|
---|
233 | for (i = 0; i < countof (additional_attributes); i++)
|
---|
234 | hash_table_put (interesting_attributes, additional_attributes[i], "1");
|
---|
235 | for (i = 0; i < countof (tag_url_attributes); i++)
|
---|
236 | hash_table_put (interesting_attributes,
|
---|
237 | tag_url_attributes[i].attr_name, "1");
|
---|
238 | }
|
---|
239 |
|
---|
240 | /* Find the value of attribute named NAME in the taginfo TAG. If the
|
---|
241 | attribute is not present, return NULL. If ATTRIND is non-NULL, the
|
---|
242 | index of the attribute in TAG will be stored there. */
|
---|
243 |
|
---|
244 | static char *
|
---|
245 | find_attr (struct taginfo *tag, const char *name, int *attrind)
|
---|
246 | {
|
---|
247 | int i;
|
---|
248 | for (i = 0; i < tag->nattrs; i++)
|
---|
249 | if (!strcasecmp (tag->attrs[i].name, name))
|
---|
250 | {
|
---|
251 | if (attrind)
|
---|
252 | *attrind = i;
|
---|
253 | return tag->attrs[i].value;
|
---|
254 | }
|
---|
255 | return NULL;
|
---|
256 | }
|
---|
257 |
|
---|
/* State shared by the tag handlers while a single document is being
   mapped.  get_urls_html fills it in and passes it to map_html_tags,
   which hands it back to collect_tags_mapper for every tag.  */
struct map_context {
  char *text;			/* HTML text. */
  char *base;			/* Base URI of the document, possibly
				   changed through <base href=...>. */
  const char *parent_base;	/* Base of the current document. */
  const char *document_file;	/* File name of this document. */
  int nofollow;			/* whether NOFOLLOW was specified in a
                                   <meta name=robots> tag. */

  struct urlpos *head, *tail;	/* List of URLs that is being
				   built. */
};
|
---|
270 |
|
---|
271 | /* Append LINK_URI to the urlpos structure that is being built.
|
---|
272 |
|
---|
273 | LINK_URI will be merged with the current document base. TAG and
|
---|
274 | ATTRIND are the necessary context to store the position and
|
---|
275 | size. */
|
---|
276 |
|
---|
277 | static struct urlpos *
|
---|
278 | append_url (const char *link_uri,
|
---|
279 | struct taginfo *tag, int attrind, struct map_context *ctx)
|
---|
280 | {
|
---|
281 | int link_has_scheme = url_has_scheme (link_uri);
|
---|
282 | struct urlpos *newel;
|
---|
283 | const char *base = ctx->base ? ctx->base : ctx->parent_base;
|
---|
284 | struct url *url;
|
---|
285 |
|
---|
286 | if (!base)
|
---|
287 | {
|
---|
288 | DEBUGP (("%s: no base, merge will use \"%s\".\n",
|
---|
289 | ctx->document_file, link_uri));
|
---|
290 |
|
---|
291 | if (!link_has_scheme)
|
---|
292 | {
|
---|
293 | /* Base URL is unavailable, and the link does not have a
|
---|
294 | location attached to it -- we have to give up. Since
|
---|
295 | this can only happen when using `--force-html -i', print
|
---|
296 | a warning. */
|
---|
297 | logprintf (LOG_NOTQUIET,
|
---|
298 | _("%s: Cannot resolve incomplete link %s.\n"),
|
---|
299 | ctx->document_file, link_uri);
|
---|
300 | return NULL;
|
---|
301 | }
|
---|
302 |
|
---|
303 | url = url_parse (link_uri, NULL);
|
---|
304 | if (!url)
|
---|
305 | {
|
---|
306 | DEBUGP (("%s: link \"%s\" doesn't parse.\n",
|
---|
307 | ctx->document_file, link_uri));
|
---|
308 | return NULL;
|
---|
309 | }
|
---|
310 | }
|
---|
311 | else
|
---|
312 | {
|
---|
313 | /* Merge BASE with LINK_URI, but also make sure the result is
|
---|
314 | canonicalized, i.e. that "../" have been resolved.
|
---|
315 | (parse_url will do that for us.) */
|
---|
316 |
|
---|
317 | char *complete_uri = uri_merge (base, link_uri);
|
---|
318 |
|
---|
319 | DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
|
---|
320 | ctx->document_file, base, link_uri, complete_uri));
|
---|
321 |
|
---|
322 | url = url_parse (complete_uri, NULL);
|
---|
323 | if (!url)
|
---|
324 | {
|
---|
325 | DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
|
---|
326 | ctx->document_file, complete_uri));
|
---|
327 | xfree (complete_uri);
|
---|
328 | return NULL;
|
---|
329 | }
|
---|
330 | xfree (complete_uri);
|
---|
331 | }
|
---|
332 |
|
---|
333 | DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
|
---|
334 |
|
---|
335 | newel = xnew0 (struct urlpos);
|
---|
336 | newel->url = url;
|
---|
337 | newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
|
---|
338 | newel->size = tag->attrs[attrind].value_raw_size;
|
---|
339 |
|
---|
340 | /* A URL is relative if the host is not named, and the name does not
|
---|
341 | start with `/'. */
|
---|
342 | if (!link_has_scheme && *link_uri != '/')
|
---|
343 | newel->link_relative_p = 1;
|
---|
344 | else if (link_has_scheme)
|
---|
345 | newel->link_complete_p = 1;
|
---|
346 |
|
---|
347 | if (ctx->tail)
|
---|
348 | {
|
---|
349 | ctx->tail->next = newel;
|
---|
350 | ctx->tail = newel;
|
---|
351 | }
|
---|
352 | else
|
---|
353 | ctx->tail = ctx->head = newel;
|
---|
354 |
|
---|
355 | return newel;
|
---|
356 | }
|
---|
357 | |
---|
358 |
|
---|
359 | /* All the tag_* functions are called from collect_tags_mapper, as
|
---|
360 | specified by KNOWN_TAGS. */
|
---|
361 |
|
---|
362 | /* Default tag handler: collect URLs from attributes specified for
|
---|
363 | this tag by tag_url_attributes. */
|
---|
364 |
|
---|
365 | static void
|
---|
366 | tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
|
---|
367 | {
|
---|
368 | int i, attrind;
|
---|
369 | int first = -1;
|
---|
370 |
|
---|
371 | for (i = 0; i < countof (tag_url_attributes); i++)
|
---|
372 | if (tag_url_attributes[i].tagid == tagid)
|
---|
373 | {
|
---|
374 | /* We've found the index of tag_url_attributes where the
|
---|
375 | attributes of our tag begin. */
|
---|
376 | first = i;
|
---|
377 | break;
|
---|
378 | }
|
---|
379 | assert (first != -1);
|
---|
380 |
|
---|
381 | /* Loop over the "interesting" attributes of this tag. In this
|
---|
382 | example, it will loop over "src" and "lowsrc".
|
---|
383 |
|
---|
384 | <img src="foo.png" lowsrc="bar.png">
|
---|
385 |
|
---|
386 | This has to be done in the outer loop so that the attributes are
|
---|
387 | processed in the same order in which they appear in the page.
|
---|
388 | This is required when converting links. */
|
---|
389 |
|
---|
390 | for (attrind = 0; attrind < tag->nattrs; attrind++)
|
---|
391 | {
|
---|
392 | /* Find whether TAG/ATTRIND is a combination that contains a
|
---|
393 | URL. */
|
---|
394 | char *link = tag->attrs[attrind].value;
|
---|
395 | const int size = countof (tag_url_attributes);
|
---|
396 |
|
---|
397 | /* If you're cringing at the inefficiency of the nested loops,
|
---|
398 | remember that they both iterate over a very small number of
|
---|
399 | items. The worst-case inner loop is for the IMG tag, which
|
---|
400 | has three attributes. */
|
---|
401 | for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
|
---|
402 | {
|
---|
403 | if (0 == strcasecmp (tag->attrs[attrind].name,
|
---|
404 | tag_url_attributes[i].attr_name))
|
---|
405 | {
|
---|
406 | struct urlpos *up = append_url (link, tag, attrind, ctx);
|
---|
407 | if (up)
|
---|
408 | {
|
---|
409 | int flags = tag_url_attributes[i].flags;
|
---|
410 | if (flags & ATTR_INLINE)
|
---|
411 | up->link_inline_p = 1;
|
---|
412 | if (flags & ATTR_HTML)
|
---|
413 | up->link_expect_html = 1;
|
---|
414 | }
|
---|
415 | }
|
---|
416 | }
|
---|
417 | }
|
---|
418 | }
|
---|
419 |
|
---|
420 | /* Handle the BASE tag, for <base href=...>. */
|
---|
421 |
|
---|
422 | static void
|
---|
423 | tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
|
---|
424 | {
|
---|
425 | struct urlpos *base_urlpos;
|
---|
426 | int attrind;
|
---|
427 | char *newbase = find_attr (tag, "href", &attrind);
|
---|
428 | if (!newbase)
|
---|
429 | return;
|
---|
430 |
|
---|
431 | base_urlpos = append_url (newbase, tag, attrind, ctx);
|
---|
432 | if (!base_urlpos)
|
---|
433 | return;
|
---|
434 | base_urlpos->ignore_when_downloading = 1;
|
---|
435 | base_urlpos->link_base_p = 1;
|
---|
436 |
|
---|
437 | if (ctx->base)
|
---|
438 | xfree (ctx->base);
|
---|
439 | if (ctx->parent_base)
|
---|
440 | ctx->base = uri_merge (ctx->parent_base, newbase);
|
---|
441 | else
|
---|
442 | ctx->base = xstrdup (newbase);
|
---|
443 | }
|
---|
444 |
|
---|
445 | /* Mark the URL found in <form action=...> for conversion. */
|
---|
446 |
|
---|
447 | static void
|
---|
448 | tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
|
---|
449 | {
|
---|
450 | int attrind;
|
---|
451 | char *action = find_attr (tag, "action", &attrind);
|
---|
452 | if (action)
|
---|
453 | {
|
---|
454 | struct urlpos *up = append_url (action, tag, attrind, ctx);
|
---|
455 | if (up)
|
---|
456 | up->ignore_when_downloading = 1;
|
---|
457 | }
|
---|
458 | }
|
---|
459 |
|
---|
460 | /* Handle the LINK tag. It requires special handling because how its
|
---|
461 | links will be followed in -p mode depends on the REL attribute. */
|
---|
462 |
|
---|
463 | static void
|
---|
464 | tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
|
---|
465 | {
|
---|
466 | int attrind;
|
---|
467 | char *href = find_attr (tag, "href", &attrind);
|
---|
468 |
|
---|
469 | /* All <link href="..."> link references are external, except those
|
---|
470 | known not to be, such as style sheet and shortcut icon:
|
---|
471 |
|
---|
472 | <link rel="stylesheet" href="...">
|
---|
473 | <link rel="shortcut icon" href="...">
|
---|
474 | */
|
---|
475 | if (href)
|
---|
476 | {
|
---|
477 | struct urlpos *up = append_url (href, tag, attrind, ctx);
|
---|
478 | if (up)
|
---|
479 | {
|
---|
480 | char *rel = find_attr (tag, "rel", NULL);
|
---|
481 | if (rel
|
---|
482 | && (0 == strcasecmp (rel, "stylesheet")
|
---|
483 | || 0 == strcasecmp (rel, "shortcut icon")))
|
---|
484 | up->link_inline_p = 1;
|
---|
485 | else
|
---|
486 | /* The external ones usually point to HTML pages, such as
|
---|
487 | <link rel="next" href="..."> */
|
---|
488 | up->link_expect_html = 1;
|
---|
489 | }
|
---|
490 | }
|
---|
491 | }
|
---|
492 |
|
---|
493 | /* Handle the META tag. This requires special handling because of the
|
---|
494 | refresh feature and because of robot exclusion. */
|
---|
495 |
|
---|
496 | static void
|
---|
497 | tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
|
---|
498 | {
|
---|
499 | char *name = find_attr (tag, "name", NULL);
|
---|
500 | char *http_equiv = find_attr (tag, "http-equiv", NULL);
|
---|
501 |
|
---|
502 | if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
|
---|
503 | {
|
---|
504 | /* Some pages use a META tag to specify that the page be
|
---|
505 | refreshed by a new page after a given number of seconds. The
|
---|
506 | general format for this is:
|
---|
507 |
|
---|
508 | <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
|
---|
509 |
|
---|
510 | So we just need to skip past the "NUMBER; URL=" garbage to
|
---|
511 | get to the URL. */
|
---|
512 |
|
---|
513 | struct urlpos *entry;
|
---|
514 | int attrind;
|
---|
515 | int timeout = 0;
|
---|
516 | char *p;
|
---|
517 |
|
---|
518 | char *refresh = find_attr (tag, "content", &attrind);
|
---|
519 | if (!refresh)
|
---|
520 | return;
|
---|
521 |
|
---|
522 | for (p = refresh; ISDIGIT (*p); p++)
|
---|
523 | timeout = 10 * timeout + *p - '0';
|
---|
524 | if (*p++ != ';')
|
---|
525 | return;
|
---|
526 |
|
---|
527 | while (ISSPACE (*p))
|
---|
528 | ++p;
|
---|
529 | if (!( TOUPPER (*p) == 'U'
|
---|
530 | && TOUPPER (*(p + 1)) == 'R'
|
---|
531 | && TOUPPER (*(p + 2)) == 'L'
|
---|
532 | && *(p + 3) == '='))
|
---|
533 | return;
|
---|
534 | p += 4;
|
---|
535 | while (ISSPACE (*p))
|
---|
536 | ++p;
|
---|
537 |
|
---|
538 | entry = append_url (p, tag, attrind, ctx);
|
---|
539 | if (entry)
|
---|
540 | {
|
---|
541 | entry->link_refresh_p = 1;
|
---|
542 | entry->refresh_timeout = timeout;
|
---|
543 | entry->link_expect_html = 1;
|
---|
544 | }
|
---|
545 | }
|
---|
546 | else if (name && 0 == strcasecmp (name, "robots"))
|
---|
547 | {
|
---|
548 | /* Handle stuff like:
|
---|
549 | <meta name="robots" content="index,nofollow"> */
|
---|
550 | char *content = find_attr (tag, "content", NULL);
|
---|
551 | if (!content)
|
---|
552 | return;
|
---|
553 | if (!strcasecmp (content, "none"))
|
---|
554 | ctx->nofollow = 1;
|
---|
555 | else
|
---|
556 | {
|
---|
557 | while (*content)
|
---|
558 | {
|
---|
559 | /* Find the next occurrence of ',' or the end of
|
---|
560 | the string. */
|
---|
561 | char *end = strchr (content, ',');
|
---|
562 | if (end)
|
---|
563 | ++end;
|
---|
564 | else
|
---|
565 | end = content + strlen (content);
|
---|
566 | if (!strncasecmp (content, "nofollow", end - content))
|
---|
567 | ctx->nofollow = 1;
|
---|
568 | content = end;
|
---|
569 | }
|
---|
570 | }
|
---|
571 | }
|
---|
572 | }
|
---|
573 |
|
---|
574 | /* Dispatch the tag handler appropriate for the tag we're mapping
|
---|
575 | over. See known_tags[] for definition of tag handlers. */
|
---|
576 |
|
---|
577 | static void
|
---|
578 | collect_tags_mapper (struct taginfo *tag, void *arg)
|
---|
579 | {
|
---|
580 | struct map_context *ctx = (struct map_context *)arg;
|
---|
581 |
|
---|
582 | /* Find the tag in our table of tags. This must not fail because
|
---|
583 | map_html_tags only returns tags found in interesting_tags. */
|
---|
584 | struct known_tag *t = hash_table_get (interesting_tags, tag->name);
|
---|
585 | assert (t != NULL);
|
---|
586 |
|
---|
587 | t->handler (t->tagid, tag, ctx);
|
---|
588 | }
|
---|
589 | |
---|
590 |
|
---|
591 | /* Analyze HTML tags FILE and construct a list of URLs referenced from
|
---|
592 | it. It merges relative links in FILE with URL. It is aware of
|
---|
593 | <base href=...> and does the right thing. */
|
---|
594 |
|
---|
595 | struct urlpos *
|
---|
596 | get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
|
---|
597 | {
|
---|
598 | struct file_memory *fm;
|
---|
599 | struct map_context ctx;
|
---|
600 | int flags;
|
---|
601 |
|
---|
602 | /* Load the file. */
|
---|
603 | fm = read_file (file);
|
---|
604 | if (!fm)
|
---|
605 | {
|
---|
606 | logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
|
---|
607 | return NULL;
|
---|
608 | }
|
---|
609 | DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
|
---|
610 |
|
---|
611 | ctx.text = fm->content;
|
---|
612 | ctx.head = ctx.tail = NULL;
|
---|
613 | ctx.base = NULL;
|
---|
614 | ctx.parent_base = url ? url : opt.base_href;
|
---|
615 | ctx.document_file = file;
|
---|
616 | ctx.nofollow = 0;
|
---|
617 |
|
---|
618 | if (!interesting_tags)
|
---|
619 | init_interesting ();
|
---|
620 |
|
---|
621 | /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
|
---|
622 | generate <a href=" foo"> instead of <a href="foo"> (browsers
|
---|
623 | ignore spaces as well.) If you really mean space, use &32; or
|
---|
624 | %20. MHT_TRIM_VALUES also causes squashing of embedded newlines,
|
---|
625 | e.g. in <img src="foo.[newline]html">. Such newlines are also
|
---|
626 | ignored by IE and Mozilla and are presumably introduced by
|
---|
627 | writing HTML with editors that force word wrap. */
|
---|
628 | flags = MHT_TRIM_VALUES;
|
---|
629 | if (opt.strict_comments)
|
---|
630 | flags |= MHT_STRICT_COMMENTS;
|
---|
631 |
|
---|
632 | map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
|
---|
633 | interesting_tags, interesting_attributes);
|
---|
634 |
|
---|
635 | DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
|
---|
636 | if (meta_disallow_follow)
|
---|
637 | *meta_disallow_follow = ctx.nofollow;
|
---|
638 |
|
---|
639 | xfree_null (ctx.base);
|
---|
640 | read_file_free (fm);
|
---|
641 | return ctx.head;
|
---|
642 | }
|
---|
643 |
|
---|
644 | /* This doesn't really have anything to do with HTML, but it's similar
|
---|
645 | to get_urls_html, so we put it here. */
|
---|
646 |
|
---|
647 | struct urlpos *
|
---|
648 | get_urls_file (const char *file)
|
---|
649 | {
|
---|
650 | struct file_memory *fm;
|
---|
651 | struct urlpos *head, *tail;
|
---|
652 | const char *text, *text_end;
|
---|
653 |
|
---|
654 | /* Load the file. */
|
---|
655 | fm = read_file (file);
|
---|
656 | if (!fm)
|
---|
657 | {
|
---|
658 | logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
|
---|
659 | return NULL;
|
---|
660 | }
|
---|
661 | DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
|
---|
662 |
|
---|
663 | head = tail = NULL;
|
---|
664 | text = fm->content;
|
---|
665 | text_end = fm->content + fm->length;
|
---|
666 | while (text < text_end)
|
---|
667 | {
|
---|
668 | int up_error_code;
|
---|
669 | char *url_text;
|
---|
670 | struct urlpos *entry;
|
---|
671 | struct url *url;
|
---|
672 |
|
---|
673 | const char *line_beg = text;
|
---|
674 | const char *line_end = memchr (text, '\n', text_end - text);
|
---|
675 | if (!line_end)
|
---|
676 | line_end = text_end;
|
---|
677 | else
|
---|
678 | ++line_end;
|
---|
679 | text = line_end;
|
---|
680 |
|
---|
681 | /* Strip whitespace from the beginning and end of line. */
|
---|
682 | while (line_beg < line_end && ISSPACE (*line_beg))
|
---|
683 | ++line_beg;
|
---|
684 | while (line_end > line_beg && ISSPACE (*(line_end - 1)))
|
---|
685 | --line_end;
|
---|
686 |
|
---|
687 | if (line_beg == line_end)
|
---|
688 | continue;
|
---|
689 |
|
---|
690 | /* The URL is in the [line_beg, line_end) region. */
|
---|
691 |
|
---|
692 | /* We must copy the URL to a zero-terminated string, and we
|
---|
693 | can't use alloca because we're in a loop. *sigh*. */
|
---|
694 | url_text = strdupdelim (line_beg, line_end);
|
---|
695 |
|
---|
696 | if (opt.base_href)
|
---|
697 | {
|
---|
698 | /* Merge opt.base_href with URL. */
|
---|
699 | char *merged = uri_merge (opt.base_href, url_text);
|
---|
700 | xfree (url_text);
|
---|
701 | url_text = merged;
|
---|
702 | }
|
---|
703 |
|
---|
704 | url = url_parse (url_text, &up_error_code);
|
---|
705 | if (!url)
|
---|
706 | {
|
---|
707 | logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
|
---|
708 | file, url_text, url_error (up_error_code));
|
---|
709 | xfree (url_text);
|
---|
710 | continue;
|
---|
711 | }
|
---|
712 | xfree (url_text);
|
---|
713 |
|
---|
714 | entry = xnew0 (struct urlpos);
|
---|
715 | entry->next = NULL;
|
---|
716 | entry->url = url;
|
---|
717 |
|
---|
718 | if (!head)
|
---|
719 | head = entry;
|
---|
720 | else
|
---|
721 | tail->next = entry;
|
---|
722 | tail = entry;
|
---|
723 | }
|
---|
724 | read_file_free (fm);
|
---|
725 | return head;
|
---|
726 | }
|
---|
727 |
|
---|
728 | void
|
---|
729 | cleanup_html_url (void)
|
---|
730 | {
|
---|
731 | /* Destroy the hash tables. The hash table keys and values are not
|
---|
732 | allocated by this code, so we don't need to free them here. */
|
---|
733 | if (interesting_tags)
|
---|
734 | hash_table_destroy (interesting_tags);
|
---|
735 | if (interesting_attributes)
|
---|
736 | hash_table_destroy (interesting_attributes);
|
---|
737 | }
|
---|