source: trunk/essentials/net-misc/wget/src/html-parse.c

Last change on this file was 3440, checked in by bird, 18 years ago

wget 1.10.2

File size: 30.1 KB
Line 
1/* HTML parser for Wget.
2 Copyright (C) 1998, 2000, 2003 Free Software Foundation, Inc.
3
4This file is part of GNU Wget.
5
6GNU Wget is free software; you can redistribute it and/or modify
7it under the terms of the GNU General Public License as published by
8the Free Software Foundation; either version 2 of the License, or (at
9your option) any later version.
10
11GNU Wget is distributed in the hope that it will be useful,
12but WITHOUT ANY WARRANTY; without even the implied warranty of
13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14GNU General Public License for more details.
15
16You should have received a copy of the GNU General Public License
17along with Wget; if not, write to the Free Software
18Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19
20In addition, as a special exception, the Free Software Foundation
21gives permission to link the code of its release of Wget with the
22OpenSSL project's "OpenSSL" library (or with modified versions of it
23that use the same license as the "OpenSSL" library), and distribute
24the linked executables. You must obey the GNU General Public License
25in all respects for all of the code used other than "OpenSSL". If you
26modify this file, you may extend this exception to your version of the
27file, but you are not obligated to do so. If you do not wish to do
28so, delete this exception statement from your version. */
29
30/* The only entry point to this module is map_html_tags(), which see. */
31
32/* TODO:
33
34 - Allow hooks for callers to process contents outside tags. This
35 is needed to implement handling <style> and <script>. The
36 taginfo structure already carries the information about where the
37 tags are, but this is not enough, because one would also want to
38 skip the comments. (The funny thing is that for <style> and
39 <script> you *don't* want to skip comments!)
40
41 - Create a test suite for regression testing. */
42
43/* HISTORY:
44
45 This is the third HTML parser written for Wget. The first one was
46 written some time during the Geturl 1.0 beta cycle, and was very
47 inefficient and buggy. It also contained some very complex code to
48 remember a list of parser states, because it was supposed to be
49 reentrant.
50
51 The second HTML parser was written for Wget 1.4 (the first version
52 by the name `Wget'), and was a complete rewrite. Although the new
53 parser behaved much better and made no claims of reentrancy, it
54 still shared many of the fundamental flaws of the old version -- it
55 only regarded HTML in terms of tag-attribute pairs, where the
56 attribute's value was a URL to be returned. Any other property of
57 HTML, such as <base href=...>, or strange way to specify a URL,
58 such as <meta http-equiv=Refresh content="0; URL=..."> had to be
59 crudely hacked in -- and the caller had to be aware of these hacks.
60 Like its predecessor, this parser did not support HTML comments.
61
62 After Wget 1.5.1 was released, I set out to write a third HTML
63 parser. The objectives of the new parser were to: (1) provide a
64 clean way to analyze HTML lexically, (2) separate interpretation of
65 the markup from the parsing process, (3) be as correct as possible,
66 e.g. correctly skipping comments and other SGML declarations, (4)
67 understand the most common errors in markup and skip them or be
68 relaxed towards them, and (5) be reasonably efficient (no regexps,
69 minimum copying and minimum or no heap allocation).
70
71 I believe this parser meets all of the above goals. It is
72 reasonably well structured, and could be relatively easily
73 separated from Wget and used elsewhere. While some of its
74 intrinsic properties limit its value as a general-purpose HTML
75 parser, I believe that, with minimum modifications, it could serve
76 as a backend for one.
77
78 Due to time and other constraints, this parser was not integrated
79 into Wget until the version 1.7. */
80
81/* DESCRIPTION:
82
83 The single entry point of this parser is map_html_tags(), which
84 works by calling a function you specify for each tag. The function
85 gets called with the pointer to a structure describing the tag and
86 its attributes. */
87
88/* To test as standalone, compile with `-DSTANDALONE -I.'. You'll
89 still need Wget headers to compile. */
90
91#include <config.h>
92
93#ifdef STANDALONE
94# define I_REALLY_WANT_CTYPE_MACROS
95#endif
96
97#include <stdio.h>
98#include <stdlib.h>
99#ifdef HAVE_STRING_H
100# include <string.h>
101#else
102# include <strings.h>
103#endif
104#include <assert.h>
105
106#include "wget.h"
107#include "html-parse.h"
108
109#ifdef STANDALONE
110# undef xmalloc
111# undef xrealloc
112# undef xfree
113# define xmalloc malloc
114# define xrealloc realloc
115# define xfree free
116
117# undef ISSPACE
118# undef ISDIGIT
119# undef ISXDIGIT
120# undef ISALPHA
121# undef ISALNUM
122# undef TOLOWER
123# undef TOUPPER
124
125# define ISSPACE(x) isspace (x)
126# define ISDIGIT(x) isdigit (x)
127# define ISXDIGIT(x) isxdigit (x)
128# define ISALPHA(x) isalpha (x)
129# define ISALNUM(x) isalnum (x)
130# define TOLOWER(x) tolower (x)
131# define TOUPPER(x) toupper (x)
132
/* Minimal stand-in for the real hash table type, used only in
   STANDALONE builds, where "hash.h" is not included.  */
struct hash_table {
  int dummy;
};

/* Stub lookup for STANDALONE builds.  The table argument is ignored
   and the key pointer itself is handed back, so any non-NULL key
   counts as "present".  */
static void *
hash_table_get (const struct hash_table *ht, void *ptr)
{
  (void) ht;
  return ptr;
}
141#else /* not STANDALONE */
142# include "hash.h"
143#endif
144
145/* Pool support. A pool is a resizable chunk of memory. It is first
146 allocated on the stack, and moved to the heap if it needs to be
147 larger than originally expected. map_html_tags() uses it to store
148 the zero-terminated names and values of tags and attributes.
149
150 Thus taginfo->name, and attr->name and attr->value for each
151 attribute, do not point into separately allocated areas, but into
152 different parts of the pool, separated only by terminating zeros.
153 This ensures minimum amount of allocation and, for most tags, no
154 allocation because the entire pool is kept on the stack. */
155
/* Bookkeeping for one pool.  The pool starts out in caller-provided
   storage and switches to malloc'ed storage the first time it has to
   grow.  */
struct pool {
  char *contents;               /* current storage.  */
  int size;                     /* capacity of CONTENTS, in bytes.  */
  int tail;                     /* index of the first unused byte.  */
  int resized;                  /* non-zero once CONTENTS has been
                                   moved to malloc'ed memory.  */

  char *orig_contents;          /* storage given to POOL_INIT; put
                                   back in place by POOL_FREE.  */
  int orig_size;                /* capacity of ORIG_CONTENTS.  */
};
169
/* Prepare pool P for use.  INITIAL_STORAGE is the buffer the pool
   starts out in and INITIAL_SIZE is its capacity in bytes.  Both are
   also remembered as the "original" storage so that POOL_FREE can
   restore them later.  */

#define POOL_INIT(p, initial_storage, initial_size) do {                \
  struct pool *PI_pool = (p);                                           \
  PI_pool->orig_contents = PI_pool->contents = (initial_storage);       \
  PI_pool->orig_size = PI_pool->size = (initial_size);                  \
  PI_pool->tail = 0;                                                    \
  PI_pool->resized = 0;                                                 \
} while (0)
181
/* Make sure pool P can hold INCREASE more bytes past its current
   tail.  The work is delegated to GROW_ARRAY, so nothing happens when
   the pool is already large enough.  The tail itself is not moved.  */

#define POOL_GROW(p, increase)                                  \
  GROW_ARRAY ((p)->contents, (p)->size,                         \
              (p)->tail + (increase),                           \
              (p)->resized, char)
188
/* Copy the bytes in [BEG, END) to the tail of pool P, growing the
   pool first if needed, and move the tail past them.  No terminating
   zero is written.  */

#define POOL_APPEND(p, beg, end) do {                           \
  const char *PA_src = (beg);                                   \
  int PA_len = (end) - PA_src;                                  \
  POOL_GROW (p, PA_len);                                        \
  memcpy ((p)->contents + (p)->tail, PA_src, PA_len);           \
  (p)->tail += PA_len;                                          \
} while (0)
199
/* Store the single character CH at the tail of pool P, growing the
   pool first if needed.  Passing '\0' zero-terminates the string
   currently being built.  */

#define POOL_APPEND_CHR(p, ch) do {                             \
  char PAC_byte = (ch);                                         \
  POOL_GROW (p, 1);                                             \
  (p)->contents[(p)->tail] = PAC_byte;                          \
  (p)->tail++;                                                  \
} while (0)
208
/* Discard the contents of pool P by resetting its tail to zero.  The
   storage is kept, so the pool can be refilled without allocating.
   The expansion is fully parenthesized so that it behaves as a single
   expression wherever it is used.  */
#define POOL_REWIND(p) ((p)->tail = 0)
211
/* Release any heap memory held by pool P and return the pool to the
   state POOL_INIT left it in: if the contents were moved to malloc'ed
   memory they are passed to xfree(), and `contents' and `size' are
   reset to the original storage.  Afterwards the pool is empty and
   immediately usable again.

   P is evaluated exactly once.  The temporary uses a prefixed name,
   like the ones in POOL_APPEND, so that it cannot capture a caller
   variable.  */

#define POOL_FREE(p) do {                               \
  struct pool *PF_pool = (p);                           \
  if (PF_pool->resized)                                 \
    xfree (PF_pool->contents);                          \
  PF_pool->contents = PF_pool->orig_contents;           \
  PF_pool->size = PF_pool->orig_size;                   \
  PF_pool->tail = 0;                                    \
  PF_pool->resized = 0;                                 \
} while (0)
227
/* Grow an array that may have started life on the stack.

   BASEVAR is the array pointer, SIZEVAR its current capacity in
   elements of TYPE, NEEDED_SIZE the capacity that is required, and
   RESIZED a flag that is non-zero once BASEVAR points to malloc'ed
   memory.  BASEVAR, SIZEVAR and RESIZED must be lvalues; they are
   updated in place.

   If SIZEVAR is already at least NEEDED_SIZE nothing happens.
   Otherwise the capacity is doubled until it suffices.  An empty
   array (capacity 0) is seeded with capacity 1 first, because
   doubling zero would loop forever.

   The first growth uses xmalloc and copies the old elements, leaving
   the original (typically stack) storage untouched.  Later growths
   use xrealloc.  The copy is skipped when there are no old elements,
   so BASEVAR may be NULL while SIZEVAR is 0.  */

#define GROW_ARRAY(basevar, sizevar, needed_size, resized, type) do {   \
  long ga_needed_size = (needed_size);                                  \
  long ga_newsize = (sizevar);                                          \
  while (ga_newsize < ga_needed_size)                                   \
    ga_newsize = ga_newsize > 0 ? ga_newsize << 1 : 1;                  \
  if (ga_newsize != (sizevar))                                          \
    {                                                                   \
      if (resized)                                                      \
        (basevar) = (type *)xrealloc ((basevar),                        \
                                      ga_newsize * sizeof (type));      \
      else                                                              \
        {                                                               \
          void *ga_new = xmalloc (ga_newsize * sizeof (type));          \
          if ((sizevar) > 0)                                            \
            memcpy (ga_new, (basevar), (sizevar) * sizeof (type));      \
          (basevar) = ga_new;                                           \
          (resized) = 1;                                                \
        }                                                               \
      (sizevar) = ga_newsize;                                           \
    }                                                                   \
} while (0)
259
260
/* Helper macros for decode_entity().  All of them expect a variable
   named `end' (one past the last readable character) to be in scope.

   FITS (p, n) is true when the N characters starting at P can be a
   complete entity name: either they run exactly to END, or they are
   followed by a character that is not alphanumeric.  IE-style
   unterminated entities such as "&ltfoo" (meaning "<foo") are
   therefore rejected, while "&lt;foo", "&lt!foo" and a trailing "&lt"
   are accepted.  */
#define FITS(p, n) \
  ((p) + (n) == end || ((p) + (n) < end && !ISALNUM ((p)[(n)])))

/* ENTn (p, c0, ...) is true when P starts an N-character entity name
   spelled by the given characters.  */
#define ENT1(p, c0) \
  (FITS (p, 1) && (p)[0] == (c0))
#define ENT2(p, c0, c1) \
  (FITS (p, 2) && (p)[0] == (c0) && (p)[1] == (c1))
#define ENT3(p, c0, c1, c2) \
  (FITS (p, 3) && (p)[0] == (c0) && (p)[1] == (c1) && (p)[2] == (c2))

/* Advance P by INC characters, then step over one ';' if P now rests
   on it.  The value of the expression is the updated P.  So "&lt;foo"
   becomes "<foo", whereas "&lt,foo" becomes "<,foo".  */
#define SKIP_SEMI(p, inc) \
  ((p) += (inc), (p) < end && *(p) == ';' ? ++(p) : (p))
278
/* Decode the HTML character entity that starts at *PTR, with END
   marking the end of the buffer.  *PTR must point at the '&' that
   introduces the entity.

   If a recognized entity that stands for a non-NUL ASCII character is
   found, that character is returned and *PTR is moved just past the
   entity, including a terminating ';' if there is one.  Otherwise -1
   is returned and *PTR is left unmodified.

   Recognized forms are the decimal and hexadecimal numeric entities
   "&#DDD" and "&#xHH" with a value in 1..127, and the named entities
   &lt, &gt, &amp, &apos and &quot.  */

static int
decode_entity (const char **ptr, const char *end)
{
  const char *p = *ptr;
  int value = -1;

  /* We need the '&' plus at least one more character.  */
  if (end - p < 2)
    return -1;
  ++p;

  switch (*p++)
    {
    case '#':
      /* Process numeric entities "&#DDD;" and "&#xHH;".  */
      {
        int digits = 0;
        value = 0;
        /* P may already be at END here (input ending in "&#"), so the
           bound has to be checked before looking for the 'x'.  */
        if (p < end && *p == 'x')
          for (++p; value < 256 && p < end && ISXDIGIT (*p); p++, digits++)
            value = (value << 4) + XDIGIT_TO_NUM (*p);
        else
          for (; value < 256 && p < end && ISDIGIT (*p); p++, digits++)
            value = (value * 10) + (*p - '0');
        if (!digits)
          return -1;
        /* Don't interpret 128+ codes and NUL because we cannot
           portably reinsert them into HTML.  */
        if (!value || (value & ~0x7f))
          return -1;
        *ptr = SKIP_SEMI (p, 0);
        return value;
      }
      /* Process named ASCII entities.  */
    case 'g':
      if (ENT1 (p, 't'))
        value = '>', *ptr = SKIP_SEMI (p, 1);
      break;
    case 'l':
      if (ENT1 (p, 't'))
        value = '<', *ptr = SKIP_SEMI (p, 1);
      break;
    case 'a':
      if (ENT2 (p, 'm', 'p'))
        value = '&', *ptr = SKIP_SEMI (p, 2);
      else if (ENT3 (p, 'p', 'o', 's'))
        /* handle &apos for the sake of the XML/XHTML crowd.  */
        value = '\'', *ptr = SKIP_SEMI (p, 3);
      break;
    case 'q':
      if (ENT3 (p, 'u', 'o', 't'))
        value = '\"', *ptr = SKIP_SEMI (p, 3);
      break;
    default:
      /* Not an entity we know about; VALUE is still -1.  */
      break;
    }
  return value;
}
#undef ENT1
#undef ENT2
#undef ENT3
#undef FITS
#undef SKIP_SEMI
346
/* Bit flags accepted by convert_and_copy(); they may be OR-ed
   together.  */
enum {
  AP_DOWNCASE        = 1 << 0,
  AP_DECODE_ENTITIES = 1 << 1,
  AP_TRIM_BLANKS     = 1 << 2
};
352
353/* Copy the text in the range [BEG, END) to POOL, optionally
354 performing operations specified by FLAGS. FLAGS may be any
355 combination of AP_DOWNCASE, AP_DECODE_ENTITIES and AP_TRIM_BLANKS
356 with the following meaning:
357
358 * AP_DOWNCASE -- downcase all the letters;
359
360 * AP_DECODE_ENTITIES -- decode the named and numeric entities in
361 the ASCII range when copying the string.
362
363 * AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end
364 of text, as well as embedded newlines. */
365
366static void
367convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
368{
369 int old_tail = pool->tail;
370
371 /* Skip blanks if required. We must do this before entities are
372 processed, so that blanks can still be inserted as, for instance,
373 `&#32;'. */
374 if (flags & AP_TRIM_BLANKS)
375 {
376 while (beg < end && ISSPACE (*beg))
377 ++beg;
378 while (end > beg && ISSPACE (end[-1]))
379 --end;
380 }
381
382 if (flags & AP_DECODE_ENTITIES)
383 {
384 /* Grow the pool, then copy the text to the pool character by
385 character, processing the encountered entities as we go
386 along.
387
388 It's safe (and necessary) to grow the pool in advance because
389 processing the entities can only *shorten* the string, it can
390 never lengthen it. */
391 const char *from = beg;
392 char *to;
393 int squash_newlines = flags & AP_TRIM_BLANKS;
394
395 POOL_GROW (pool, end - beg);
396 to = pool->contents + pool->tail;
397
398 while (from < end)
399 {
400 if (*from == '&')
401 {
402 int entity = decode_entity (&from, end);
403 if (entity != -1)
404 *to++ = entity;
405 else
406 *to++ = *from++;
407 }
408 else if ((*from == '\n' || *from == '\r') && squash_newlines)
409 ++from;
410 else
411 *to++ = *from++;
412 }
413 /* Verify that we haven't exceeded the original size. (It
414 shouldn't happen, hence the assert.) */
415 assert (to - (pool->contents + pool->tail) <= end - beg);
416
417 /* Make POOL's tail point to the position following the string
418 we've written. */
419 pool->tail = to - pool->contents;
420 POOL_APPEND_CHR (pool, '\0');
421 }
422 else
423 {
424 /* Just copy the text to the pool. */
425 POOL_APPEND (pool, beg, end);
426 POOL_APPEND_CHR (pool, '\0');
427 }
428
429 if (flags & AP_DOWNCASE)
430 {
431 char *p = pool->contents + old_tail;
432 for (; *p; p++)
433 *p = TOLOWER (*p);
434 }
435}
436
437
/* Test whether X may appear in the name of a tag or an attribute.

   RFC 1866 permits only letters, digits, periods and hyphens, but
   enforcing that broke too many real pages that use proprietary or
   odd attribute names, e.g. <img src="a.gif" v:shapes="whatever">.
   So a character is accepted unless it is:

     * whitespace or a control character (code 32 or below);
     * DEL or an 8-bit character (code 127 or above);
     * one of '=', '>' and '/', which clearly cannot be part of a
       name.

   Only names are affected; attribute values accept an even wider
   range of characters.  Note that X is evaluated more than once.  */

#define NAME_CHAR_P(x) (!((x) <= 32 || (x) >= 127               \
                          || (x) == '=' || (x) == '>' || (x) == '/'))
455
#ifdef STANDALONE
/* Number of times advance_declaration() had to back out.  */
static int comment_backout_count;
#endif

/* Advance over an SGML declaration such as <!DOCTYPE ...>.  In strict
   comments mode the same routine is used to skip comments.

   BEG must point at the '!' that follows the opening '<'.  On success
   the return value points just past the closing '>'.  If the text
   does not parse as a declaration, or the buffer runs out first, the
   function "backs out" and returns BEG + 1.  An empty range (BEG ==
   END) yields BEG.

   Background: any SGML declaration may carry comments, e.g.
       <!MY-DECL -- isn't this fun? -- foo bar>

   An HTML comment is just an empty declaration (<!>) with a comment
   attached:
       <!-- some stuff here -->

   and one declaration may hold several comments:
       <!-- have -- -- fun -->

   Whitespace may appear between and after the comments, but not
   before the first one.  Quoted strings inside a declaration are
   skipped as a unit, so a '>' or '--' inside quotes is not special.

   NOTE: the scanner reads one character ahead, and it backs out as
   soon as the read-ahead pointer reaches END.  A declaration whose
   closing '>' is the very last byte of the buffer is therefore
   treated as a backout.  */

static const char *
advance_declaration (const char *beg, const char *end)
{
  enum {
    AC_S_DONE,
    AC_S_BACKOUT,
    AC_S_BANG,
    AC_S_DEFAULT,
    AC_S_DCLNAME,
    AC_S_DASH1,
    AC_S_DASH2,
    AC_S_COMMENT,
    AC_S_DASH3,
    AC_S_DASH4,
    AC_S_QUOTE1,
    AC_S_IN_QUOTE,
    AC_S_QUOTE2
  } state = AC_S_BANG;

  const char *next = beg;       /* next character to be fetched.  */
  char quote_char = '\0';       /* quote that opened the current string.  */
  char ch;                      /* character currently being examined.  */

  if (beg == end)
    return beg;
  ch = *next++;

  /* Each state either consumes CH, by fetching the next character, or
     hands CH unchanged to another state.  */
  while (state != AC_S_DONE && state != AC_S_BACKOUT)
    {
      /* Nothing left to fetch: give up.  */
      if (next == end)
        state = AC_S_BACKOUT;

      switch (state)
        {
        case AC_S_DONE:
        case AC_S_BACKOUT:
          break;

        case AC_S_BANG:
          /* The declaration must open with '!'.  */
          if (ch != '!')
            state = AC_S_BACKOUT;
          else
            {
              ch = *next++;
              state = AC_S_DEFAULT;
            }
          break;

        case AC_S_DEFAULT:
          /* Between tokens of the declaration.  */
          if (ch == '-')
            state = AC_S_DASH1;
          else if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n')
            ch = *next++;
          else if (ch == '>')
            state = AC_S_DONE;
          else if (ch == '\'' || ch == '\"')
            state = AC_S_QUOTE1;
          else if (NAME_CHAR_P (ch))
            state = AC_S_DCLNAME;
          else
            state = AC_S_BACKOUT;
          break;

        case AC_S_DCLNAME:
          /* Inside a name token; a dash may start a comment.  */
          if (ch == '-')
            state = AC_S_DASH1;
          else if (NAME_CHAR_P (ch))
            ch = *next++;
          else
            state = AC_S_DEFAULT;
          break;

        case AC_S_QUOTE1:
          /* We must use 0x22 because broken assert macros choke on
             '"' and '\"'.  */
          assert (ch == '\'' || ch == 0x22);
          /* Remember which quote opened the string rather than
             duplicating the states for each kind of quote.  */
          quote_char = ch;
          ch = *next++;
          state = AC_S_IN_QUOTE;
          break;

        case AC_S_IN_QUOTE:
          if (ch == quote_char)
            state = AC_S_QUOTE2;
          else
            ch = *next++;
          break;

        case AC_S_QUOTE2:
          assert (ch == quote_char);
          ch = *next++;
          state = AC_S_DEFAULT;
          break;

        case AC_S_DASH1:
          /* First dash of a comment opener.  */
          assert (ch == '-');
          ch = *next++;
          state = AC_S_DASH2;
          break;

        case AC_S_DASH2:
          /* A lone dash is not allowed; we need "--".  */
          if (ch == '-')
            {
              ch = *next++;
              state = AC_S_COMMENT;
            }
          else
            state = AC_S_BACKOUT;
          break;

        case AC_S_COMMENT:
          /* Comment text; watch for the closing "--".  */
          if (ch == '-')
            state = AC_S_DASH3;
          else
            ch = *next++;
          break;

        case AC_S_DASH3:
          assert (ch == '-');
          ch = *next++;
          state = AC_S_DASH4;
          break;

        case AC_S_DASH4:
          /* A second dash closes the comment; anything else means
             the first dash was just comment text.  */
          if (ch == '-')
            {
              ch = *next++;
              state = AC_S_DEFAULT;
            }
          else
            state = AC_S_COMMENT;
          break;
        }
    }

  if (state == AC_S_BACKOUT)
    {
#ifdef STANDALONE
      ++comment_backout_count;
#endif
      return beg + 1;
    }
  return next;
}
639
/* Locate the first occurrence of "-->" in [BEG, END).  Return a
   pointer to the character that follows it, or NULL if the sequence
   does not occur.  */

static const char *
find_comment_end (const char *beg, const char *end)
{
  const char *gt;

  /* The terminator is three characters long.  */
  if (end - beg < 3)
    return NULL;

  /* Every match ends in '>', so hop from one '>' to the next and
     check the two characters in front of it.  The scan starts at
     BEG + 2 because an earlier '>' cannot have two characters of the
     range before it.  */
  for (gt = beg + 2; gt < end; gt++)
    {
      gt = memchr (gt, '>', end - gt);
      if (!gt)
        break;
      if (gt[-1] == '-' && gt[-2] == '-')
        return gt + 1;
    }
  return NULL;
}
688
689
/* Return non-zero if the name in [B, E) is a key of hash table HT.
   A NULL table means "no filtering", so every name is allowed.  */

static int
name_allowed (const struct hash_table *ht, const char *b, const char *e)
{
  char *key;

  if (ht == NULL)
    return 1;

  /* BOUNDED_TO_ALLOCA is defined elsewhere; presumably it leaves a
     stack-allocated, zero-terminated copy of [B, E) in KEY -- confirm
     against its definition.  */
  BOUNDED_TO_ALLOCA (b, e, key);
  return hash_table_get (ht, key) ? 1 : 0;
}
702
/* Move the character pointer P forward by one with the intent of
   reading *P next.  If that would put P at or beyond `end', jump to
   the `finish' label instead.  Both `end' and `finish' must exist in
   the function that uses this macro.  */

#define ADVANCE(p) do {                         \
  if (++(p) >= end)                             \
    goto finish;                                \
} while (0)
711
/* Step P over any whitespace it currently points at.  Uses ADVANCE,
   so control jumps to `finish' if the buffer ends first.  */

#define SKIP_WS(p) do {                         \
  while (ISSPACE (*(p)))                        \
    ADVANCE (p);                                \
} while (0)
719
/* Step P over any non-whitespace characters it currently points at.
   Uses ADVANCE, so control jumps to `finish' if the buffer ends
   first.  */

#define SKIP_NON_WS(p) do {                     \
  while (!ISSPACE (*(p)))                       \
    ADVANCE (p);                                \
} while (0)
727
728#ifdef STANDALONE
729static int tag_backout_count;
730#endif
731
732/* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long.
733 MAPFUN will be called with two arguments: pointer to an initialized
734 struct taginfo, and MAPARG.
735
736 ALLOWED_TAGS and ALLOWED_ATTRIBUTES are hash tables the keys of
737 which are the tags and attribute names that this function should
738 use. If ALLOWED_TAGS is NULL, all tags are processed; if
739 ALLOWED_ATTRIBUTES is NULL, all attributes are returned.
740
741 (Obviously, the caller can filter out unwanted tags and attributes
742 just as well, but this is just an optimization designed to avoid
743 unnecessary copying of tags/attributes which the caller doesn't
744 care about.) */
745
746void
747map_html_tags (const char *text, int size,
748 void (*mapfun) (struct taginfo *, void *), void *maparg,
749 int flags,
750 const struct hash_table *allowed_tags,
751 const struct hash_table *allowed_attributes)
752{
753 /* storage for strings passed to MAPFUN callback; if 256 bytes is
754 too little, POOL_APPEND allocates more with malloc. */
755 char pool_initial_storage[256];
756 struct pool pool;
757
758 const char *p = text;
759 const char *end = text + size;
760
761 struct attr_pair attr_pair_initial_storage[8];
762 int attr_pair_size = countof (attr_pair_initial_storage);
763 int attr_pair_resized = 0;
764 struct attr_pair *pairs = attr_pair_initial_storage;
765
766 if (!size)
767 return;
768
769 POOL_INIT (&pool, pool_initial_storage, countof (pool_initial_storage));
770
771 {
772 int nattrs, end_tag;
773 const char *tag_name_begin, *tag_name_end;
774 const char *tag_start_position;
775 int uninteresting_tag;
776
777 look_for_tag:
778 POOL_REWIND (&pool);
779
780 nattrs = 0;
781 end_tag = 0;
782
783 /* Find beginning of tag. We use memchr() instead of the usual
784 looping with ADVANCE() for speed. */
785 p = memchr (p, '<', end - p);
786 if (!p)
787 goto finish;
788
789 tag_start_position = p;
790 ADVANCE (p);
791
792 /* Establish the type of the tag (start-tag, end-tag or
793 declaration). */
794 if (*p == '!')
795 {
796 if (!(flags & MHT_STRICT_COMMENTS)
797 && p < end + 3 && p[1] == '-' && p[2] == '-')
798 {
799 /* If strict comments are not enforced and if we know
800 we're looking at a comment, simply look for the
801 terminating "-->". Non-strict is the default because
802 it works in other browsers and most HTML writers can't
803 be bothered with getting the comments right. */
804 const char *comment_end = find_comment_end (p + 3, end);
805 if (comment_end)
806 p = comment_end;
807 }
808 else
809 {
810 /* Either in strict comment mode or looking at a non-empty
811 declaration. Real declarations are much less likely to
812 be misused the way comments are, so advance over them
813 properly regardless of strictness. */
814 p = advance_declaration (p, end);
815 }
816 if (p == end)
817 goto finish;
818 goto look_for_tag;
819 }
820 else if (*p == '/')
821 {
822 end_tag = 1;
823 ADVANCE (p);
824 }
825 tag_name_begin = p;
826 while (NAME_CHAR_P (*p))
827 ADVANCE (p);
828 if (p == tag_name_begin)
829 goto look_for_tag;
830 tag_name_end = p;
831 SKIP_WS (p);
832 if (end_tag && *p != '>')
833 goto backout_tag;
834
835 if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end))
836 /* We can't just say "goto look_for_tag" here because we need
837 the loop below to properly advance over the tag's attributes. */
838 uninteresting_tag = 1;
839 else
840 {
841 uninteresting_tag = 0;
842 convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE);
843 }
844
845 /* Find the attributes. */
846 while (1)
847 {
848 const char *attr_name_begin, *attr_name_end;
849 const char *attr_value_begin, *attr_value_end;
850 const char *attr_raw_value_begin, *attr_raw_value_end;
851 int operation = AP_DOWNCASE; /* stupid compiler. */
852
853 SKIP_WS (p);
854
855 if (*p == '/')
856 {
857 /* A slash at this point means the tag is about to be
858 closed. This is legal in XML and has been popularized
859 in HTML via XHTML. */
860 /* <foo a=b c=d /> */
861 /* ^ */
862 ADVANCE (p);
863 SKIP_WS (p);
864 if (*p != '>')
865 goto backout_tag;
866 }
867
868 /* Check for end of tag definition. */
869 if (*p == '>')
870 break;
871
872 /* Establish bounds of attribute name. */
873 attr_name_begin = p; /* <foo bar ...> */
874 /* ^ */
875 while (NAME_CHAR_P (*p))
876 ADVANCE (p);
877 attr_name_end = p; /* <foo bar ...> */
878 /* ^ */
879 if (attr_name_begin == attr_name_end)
880 goto backout_tag;
881
882 /* Establish bounds of attribute value. */
883 SKIP_WS (p);
884 if (NAME_CHAR_P (*p) || *p == '/' || *p == '>')
885 {
886 /* Minimized attribute syntax allows `=' to be omitted.
887 For example, <UL COMPACT> is a valid shorthand for <UL
888 COMPACT="compact">. Even if such attributes are not
889 useful to Wget, we need to support them, so that the
890 tags containing them can be parsed correctly. */
891 attr_raw_value_begin = attr_value_begin = attr_name_begin;
892 attr_raw_value_end = attr_value_end = attr_name_end;
893 }
894 else if (*p == '=')
895 {
896 ADVANCE (p);
897 SKIP_WS (p);
898 if (*p == '\"' || *p == '\'')
899 {
900 int newline_seen = 0;
901 char quote_char = *p;
902 attr_raw_value_begin = p;
903 ADVANCE (p);
904 attr_value_begin = p; /* <foo bar="baz"> */
905 /* ^ */
906 while (*p != quote_char)
907 {
908 if (!newline_seen && *p == '\n')
909 {
910 /* If a newline is seen within the quotes, it
911 is most likely that someone forgot to close
912 the quote. In that case, we back out to
913 the value beginning, and terminate the tag
914 at either `>' or the delimiter, whichever
915 comes first. Such a tag terminated at `>'
916 is discarded. */
917 p = attr_value_begin;
918 newline_seen = 1;
919 continue;
920 }
921 else if (newline_seen && *p == '>')
922 break;
923 ADVANCE (p);
924 }
925 attr_value_end = p; /* <foo bar="baz"> */
926 /* ^ */
927 if (*p == quote_char)
928 ADVANCE (p);
929 else
930 goto look_for_tag;
931 attr_raw_value_end = p; /* <foo bar="baz"> */
932 /* ^ */
933 operation = AP_DECODE_ENTITIES;
934 if (flags & MHT_TRIM_VALUES)
935 operation |= AP_TRIM_BLANKS;
936 }
937 else
938 {
939 attr_value_begin = p; /* <foo bar=baz> */
940 /* ^ */
941 /* According to SGML, a name token should consist only
942 of alphanumerics, . and -. However, this is often
943 violated by, for instance, `%' in `width=75%'.
944 We'll be liberal and allow just about anything as
945 an attribute value. */
946 while (!ISSPACE (*p) && *p != '>')
947 ADVANCE (p);
948 attr_value_end = p; /* <foo bar=baz qux=quix> */
949 /* ^ */
950 if (attr_value_begin == attr_value_end)
951 /* <foo bar=> */
952 /* ^ */
953 goto backout_tag;
954 attr_raw_value_begin = attr_value_begin;
955 attr_raw_value_end = attr_value_end;
956 operation = AP_DECODE_ENTITIES;
957 }
958 }
959 else
960 {
961 /* We skipped the whitespace and found something that is
962 neither `=' nor the beginning of the next attribute's
963 name. Back out. */
964 goto backout_tag; /* <foo bar [... */
965 /* ^ */
966 }
967
968 /* If we're not interested in the tag, don't bother with any
969 of the attributes. */
970 if (uninteresting_tag)
971 continue;
972
973 /* If we aren't interested in the attribute, skip it. We
974 cannot do this test any sooner, because our text pointer
975 needs to correctly advance over the attribute. */
976 if (!name_allowed (allowed_attributes, attr_name_begin, attr_name_end))
977 continue;
978
979 GROW_ARRAY (pairs, attr_pair_size, nattrs + 1, attr_pair_resized,
980 struct attr_pair);
981
982 pairs[nattrs].name_pool_index = pool.tail;
983 convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE);
984
985 pairs[nattrs].value_pool_index = pool.tail;
986 convert_and_copy (&pool, attr_value_begin, attr_value_end, operation);
987 pairs[nattrs].value_raw_beginning = attr_raw_value_begin;
988 pairs[nattrs].value_raw_size = (attr_raw_value_end
989 - attr_raw_value_begin);
990 ++nattrs;
991 }
992
993 if (uninteresting_tag)
994 {
995 ADVANCE (p);
996 goto look_for_tag;
997 }
998
999 /* By now, we have a valid tag with a name and zero or more
1000 attributes. Fill in the data and call the mapper function. */
1001 {
1002 int i;
1003 struct taginfo taginfo;
1004
1005 taginfo.name = pool.contents;
1006 taginfo.end_tag_p = end_tag;
1007 taginfo.nattrs = nattrs;
1008 /* We fill in the char pointers only now, when pool can no
1009 longer get realloc'ed. If we did that above, we could get
1010 hosed by reallocation. Obviously, after this point, the pool
1011 may no longer be grown. */
1012 for (i = 0; i < nattrs; i++)
1013 {
1014 pairs[i].name = pool.contents + pairs[i].name_pool_index;
1015 pairs[i].value = pool.contents + pairs[i].value_pool_index;
1016 }
1017 taginfo.attrs = pairs;
1018 taginfo.start_position = tag_start_position;
1019 taginfo.end_position = p + 1;
1020 /* Ta-dam! */
1021 (*mapfun) (&taginfo, maparg);
1022 ADVANCE (p);
1023 }
1024 goto look_for_tag;
1025
1026 backout_tag:
1027#ifdef STANDALONE
1028 ++tag_backout_count;
1029#endif
1030 /* The tag wasn't really a tag. Treat its contents as ordinary
1031 data characters. */
1032 p = tag_start_position + 1;
1033 goto look_for_tag;
1034 }
1035
1036 finish:
1037 POOL_FREE (&pool);
1038 if (attr_pair_resized)
1039 xfree (pairs);
1040}
1041
1042#undef ADVANCE
1043#undef SKIP_WS
1044#undef SKIP_NON_WS
1045
1046
1047#ifdef STANDALONE
1048static void
1049test_mapper (struct taginfo *taginfo, void *arg)
1050{
1051 int i;
1052
1053 printf ("%s%s", taginfo->end_tag_p ? "/" : "", taginfo->name);
1054 for (i = 0; i < taginfo->nattrs; i++)
1055 printf (" %s=%s", taginfo->attrs[i].name, taginfo->attrs[i].value);
1056 putchar ('\n');
1057 ++*(int *)arg;
1058}
1059
1060int main ()
1061{
1062 int size = 256;
1063 char *x = (char *)xmalloc (size);
1064 int length = 0;
1065 int read_count;
1066 int tag_counter = 0;
1067
1068 while ((read_count = fread (x + length, 1, size - length, stdin)))
1069 {
1070 length += read_count;
1071 size <<= 1;
1072 x = (char *)xrealloc (x, size);
1073 }
1074
1075 map_html_tags (x, length, test_mapper, &tag_counter, 0, NULL, NULL);
1076 printf ("TAGS: %d\n", tag_counter);
1077 printf ("Tag backouts: %d\n", tag_backout_count);
1078 printf ("Comment backouts: %d\n", comment_backout_count);
1079 return 0;
1080}
1081#endif /* STANDALONE */
Note: See TracBrowser for help on using the repository browser.