Context Navigation

html-parse.c

Visit:

Last change on this file was 3440, checked in by bird, 18 years ago
wget 1.10.2
File size: 30.1 KB

Line
1	/* HTML parser for Wget.
2	Copyright (C) 1998, 2000, 2003 Free Software Foundation, Inc.
3
4	This file is part of GNU Wget.
5
6	GNU Wget is free software; you can redistribute it and/or modify
7	it under the terms of the GNU General Public License as published by
8	the Free Software Foundation; either version 2 of the License, or (at
9	your option) any later version.
10
11	GNU Wget is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	GNU General Public License for more details.
15
16	You should have received a copy of the GNU General Public License
17	along with Wget; if not, write to the Free Software
18	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19
20	In addition, as a special exception, the Free Software Foundation
21	gives permission to link the code of its release of Wget with the
22	OpenSSL project's "OpenSSL" library (or with modified versions of it
23	that use the same license as the "OpenSSL" library), and distribute
24	the linked executables. You must obey the GNU General Public License
25	in all respects for all of the code used other than "OpenSSL". If you
26	modify this file, you may extend this exception to your version of the
27	file, but you are not obligated to do so. If you do not wish to do
28	so, delete this exception statement from your version. */
29
30	/* The only entry point to this module is map_html_tags(), which see. */
31
32	/* TODO:
33
34	- Allow hooks for callers to process contents outside tags. This
35	is needed to implement handling <style> and <script>. The
36	taginfo structure already carries the information about where the
37	tags are, but this is not enough, because one would also want to
38	skip the comments. (The funny thing is that for <style> and
39	<script> you don't want to skip comments!)
40
41	- Create a test suite for regression testing. */
42
43	/* HISTORY:
44
45	This is the third HTML parser written for Wget. The first one was
46	written some time during the Geturl 1.0 beta cycle, and was very
47	inefficient and buggy. It also contained some very complex code to
48	remember a list of parser states, because it was supposed to be
49	reentrant.
50
51	The second HTML parser was written for Wget 1.4 (the first version
52	by the name `Wget'), and was a complete rewrite. Although the new
53	parser behaved much better and made no claims of reentrancy, it
54	still shared many of the fundamental flaws of the old version -- it
55	only regarded HTML in terms of tag-attribute pairs, where the
56	attribute's value was a URL to be returned. Any other property of
57	HTML, such as <base href=...>, or strange way to specify a URL,
58	such as <meta http-equiv=Refresh content="0; URL=..."> had to be
59	crudely hacked in -- and the caller had to be aware of these hacks.
60	Like its predecessor, this parser did not support HTML comments.
61
62	After Wget 1.5.1 was released, I set out to write a third HTML
63	parser. The objectives of the new parser were to: (1) provide a
64	clean way to analyze HTML lexically, (2) separate interpretation of
65	the markup from the parsing process, (3) be as correct as possible,
66	e.g. correctly skipping comments and other SGML declarations, (4)
67	understand the most common errors in markup and skip them or be
68	relaxed towards them, and (5) be reasonably efficient (no regexps,
69	minimum copying and minimum or no heap allocation).
70
71	I believe this parser meets all of the above goals. It is
72	reasonably well structured, and could be relatively easily
73	separated from Wget and used elsewhere. While some of its
74	intrinsic properties limit its value as a general-purpose HTML
75	parser, I believe that, with minimum modifications, it could serve
76	as a backend for one.
77
78	Due to time and other constraints, this parser was not integrated
79	into Wget until the version 1.7. */
80
81	/* DESCRIPTION:
82
83	The single entry point of this parser is map_html_tags(), which
84	works by calling a function you specify for each tag. The function
85	gets called with the pointer to a structure describing the tag and
86	its attributes. */
87
88	/* To test as standalone, compile with `-DSTANDALONE -I.'. You'll
89	still need Wget headers to compile. */
90
91	#include <config.h>
92
93	#ifdef STANDALONE
94	# define I_REALLY_WANT_CTYPE_MACROS
95	#endif
96
97	#include <stdio.h>
98	#include <stdlib.h>
99	#ifdef HAVE_STRING_H
100	# include <string.h>
101	#else
102	# include <strings.h>
103	#endif
104	#include <assert.h>
105
106	#include "wget.h"
107	#include "html-parse.h"
108
109	#ifdef STANDALONE
/* Standalone build: map Wget's allocation wrappers directly onto the
   C library.  Note that plain malloc/realloc may return NULL, so
   callers in standalone mode get no out-of-memory handling from
   these names.  */
# undef xmalloc
# undef xrealloc
# undef xfree
# define xmalloc malloc
# define xrealloc realloc
# define xfree free

/* Replace the character-class macros with the <ctype.h> functions.
   The argument is converted to unsigned char first, because passing
   a negative plain char to these functions is undefined behavior.  */
# undef ISSPACE
# undef ISDIGIT
# undef ISXDIGIT
# undef ISALPHA
# undef ISALNUM
# undef TOLOWER
# undef TOUPPER

# define ISSPACE(x) isspace ((unsigned char) (x))
# define ISDIGIT(x) isdigit ((unsigned char) (x))
# define ISXDIGIT(x) isxdigit ((unsigned char) (x))
# define ISALPHA(x) isalpha ((unsigned char) (x))
# define ISALNUM(x) isalnum ((unsigned char) (x))
# define TOLOWER(x) tolower ((unsigned char) (x))
# define TOUPPER(x) toupper ((unsigned char) (x))
132
/* Minimal stand-in for the real hash table type, used only in the
   standalone build.  */
struct hash_table {
  int dummy;
};

/* Stub lookup for the standalone build: HT is ignored and PTR is
   returned unchanged, so every non-NULL key is reported as present.  */
static void *
hash_table_get (const struct hash_table *ht, void *ptr)
{
  (void) ht;
  return ptr;
}
141	#else /* not STANDALONE */
142	# include "hash.h"
143	#endif
144
145	/* Pool support. A pool is a resizable chunk of memory. It is first
146	allocated on the stack, and moved to the heap if it needs to be
147	larger than originally expected. map_html_tags() uses it to store
148	the zero-terminated names and values of tags and attributes.
149
150	Thus taginfo->name, and attr->name and attr->value for each
151	attribute, do not point into separately allocated areas, but into
152	different parts of the pool, separated only by terminating zeros.
153	This ensures minimum amount of allocation and, for most tags, no
154	allocation because the entire pool is kept on the stack. */
155
struct pool {
  char *contents;               /* pointer to the contents. */
  int size;                     /* size of the pool. */
  int tail;                     /* next available position index. */
  int resized;                  /* whether the pool has been resized
                                   using malloc. */

  char *orig_contents;          /* original pool contents, usually
                                   stack-allocated.  used by POOL_FREE
                                   to restore the pool to the initial
                                   state. */
  int orig_size;                /* original value of `size'; restored
                                   by POOL_FREE. */
};
169
/* Initialize the pool P to use INITIAL_STORAGE, a buffer of
   INITIAL_SIZE bytes supplied by the caller, as its contents.  The
   buffer and its size are also remembered so that POOL_FREE can
   restore them later.  */

#define POOL_INIT(p, initial_storage, initial_size) do {        \
  struct pool *P = (p);                                         \
  P->contents = (initial_storage);                              \
  P->size = (initial_size);                                     \
  P->tail = 0;                                                  \
  P->resized = 0;                                               \
  P->orig_contents = P->contents;                               \
  P->orig_size = P->size;                                       \
} while (0)

/* Grow the pool to accommodate at least INCREASE new bytes after the
   current tail.  If the pool already has room for them, this is a
   no-op.  */

#define POOL_GROW(p, increase)                                  \
  GROW_ARRAY ((p)->contents, (p)->size, (p)->tail + (increase), \
              (p)->resized, char)

/* Append text in the range [beg, end) to POOL.  No zero-termination
   is done.  */

#define POOL_APPEND(p, beg, end) do {                   \
  const char *PA_beg = (beg);                           \
  int PA_size = (end) - PA_beg;                         \
  POOL_GROW (p, PA_size);                               \
  memcpy ((p)->contents + (p)->tail, PA_beg, PA_size);  \
  (p)->tail += PA_size;                                 \
} while (0)

/* Append one character to the pool.  Can be used to zero-terminate
   pool strings.  */

#define POOL_APPEND_CHR(p, ch) do {             \
  char PAC_char = (ch);                         \
  POOL_GROW (p, 1);                             \
  (p)->contents[(p)->tail++] = PAC_char;        \
} while (0)

/* Forget old pool contents.  The allocated memory is not freed.  */
#define POOL_REWIND(p) ((p)->tail = 0)

/* Free heap-allocated memory for contents of POOL.  This calls
   xfree() if the memory was allocated through malloc.  It also
   restores `contents' and `size' to their original, pre-malloc
   values.  That way after POOL_FREE, the pool is fully usable, just
   as if it were freshly initialized with POOL_INIT.  */

#define POOL_FREE(p) do {                       \
  struct pool *P = (p);                         \
  if (P->resized)                               \
    xfree (P->contents);                        \
  P->contents = P->orig_contents;               \
  P->size = P->orig_size;                       \
  P->tail = 0;                                  \
  P->resized = 0;                               \
} while (0)
227
/* Used for small stack-allocated memory chunks that might grow.  Like
   DO_REALLOC, this macro grows BASEVAR as necessary to take
   NEEDED_SIZE items of TYPE.

   The difference is that on the first resize, it will use
   malloc+memcpy rather than realloc.  That way you can stack-allocate
   the initial chunk, and only resort to heap allocation if you
   stumble upon large data.

   After the first resize, subsequent ones are performed with realloc,
   just like DO_REALLOC.

   BASEVAR, SIZEVAR and RESIZED must be lvalues; they are updated in
   place.  SIZEVAR must be positive on entry: the new size is found by
   repeated doubling, which never terminates from zero.  */

#define GROW_ARRAY(basevar, sizevar, needed_size, resized, type) do {   \
  long ga_needed_size = (needed_size);                                  \
  long ga_newsize = (sizevar);                                          \
  while (ga_newsize < ga_needed_size)                                   \
    ga_newsize <<= 1;                                                   \
  if (ga_newsize != (sizevar))                                          \
    {                                                                   \
      if (resized)                                                      \
        (basevar) = (type *) xrealloc ((basevar),                       \
                                       ga_newsize * sizeof (type));     \
      else                                                              \
        {                                                               \
          /* First growth: the old storage is not ours to realloc. */   \
          void *ga_new = xmalloc (ga_newsize * sizeof (type));          \
          memcpy (ga_new, (basevar), (sizevar) * sizeof (type));        \
          (basevar) = ga_new;                                           \
          (resized) = 1;                                                \
        }                                                               \
      (sizevar) = ga_newsize;                                           \
    }                                                                   \
} while (0)
259
260
/* Test whether n+1-sized entity name fits in P.  We don't support
   IE-style non-terminated entities, e.g. "&ltfoo" -> "<foo".
   However, "&lt;foo" will work, as will "&lt!foo", "&lt", etc.  In
   other words an entity needs to be terminated by either a
   non-alphanumeric or the end of string.  */
#define FITS(p, n) (p + n == end || (p + n < end && !ISALNUM (p[n])))

/* Macros that test entity names by returning true if P is followed by
   the specified characters.  */
#define ENT1(p, c0) (FITS (p, 1) && p[0] == c0)
#define ENT2(p, c0, c1) (FITS (p, 2) && p[0] == c0 && p[1] == c1)
#define ENT3(p, c0, c1, c2) (FITS (p, 3) && p[0]==c0 && p[1]==c1 && p[2]==c2)

/* Increment P by INC chars.  If P lands at a semicolon, increment it
   past the semicolon.  This ensures that e.g. "&lt;foo" is converted
   to "<foo", but "&lt,foo" to "<,foo".  */
#define SKIP_SEMI(p, inc) (p += inc, p < end && *p == ';' ? ++p : p)

/* Decode the HTML character entity at *PTR, considering END to be end
   of buffer.  *PTR must point at the "&" character that marks the
   beginning of the entity.  If a recognized ASCII entity is seen, its
   character value is returned, and *PTR is moved to the end of the
   entity (past the terminating semicolon, if there is one).
   Otherwise, -1 is returned and *PTR left unmodified.

   The recognized entities are: &lt, &gt, &amp, &apos, and &quot, plus
   the numeric forms "&#DDD;" and "&#xHH;" for values 1 through 127.  */

static int
decode_entity (const char **ptr, const char *end)
{
  const char *p = *ptr;
  int value = -1;

  /* Step over the '&'; nothing to decode if it ends the buffer.  */
  if (++p == end)
    return -1;

  switch (*p++)
    {
    case '#':
      /* Process numeric entities "&#DDD;" and "&#xHH;".  */
      {
        int digits = 0;
        value = 0;
        /* P may already be at END here (input ending in "&#"), so it
           must be bounds-checked before it is dereferenced.  */
        if (p < end && *p == 'x')
          for (++p; value < 256 && p < end && ISXDIGIT (*p); p++, digits++)
            value = (value << 4) + XDIGIT_TO_NUM (*p);
        else
          for (; value < 256 && p < end && ISDIGIT (*p); p++, digits++)
            value = (value * 10) + (*p - '0');
        if (!digits)
          return -1;
        /* Don't interpret 128+ codes and NUL because we cannot
           portably reinsert them into HTML.  */
        if (!value || (value & ~0x7f))
          return -1;
        *ptr = SKIP_SEMI (p, 0);
        return value;
      }
      /* Process named ASCII entities.  */
    case 'g':
      if (ENT1 (p, 't'))
        value = '>', *ptr = SKIP_SEMI (p, 1);
      break;
    case 'l':
      if (ENT1 (p, 't'))
        value = '<', *ptr = SKIP_SEMI (p, 1);
      break;
    case 'a':
      if (ENT2 (p, 'm', 'p'))
        value = '&', *ptr = SKIP_SEMI (p, 2);
      else if (ENT3 (p, 'p', 'o', 's'))
        /* handle &apos for the sake of the XML/XHTML crowd. */
        value = '\'', *ptr = SKIP_SEMI (p, 3);
      break;
    case 'q':
      if (ENT3 (p, 'u', 'o', 't'))
        value = '\"', *ptr = SKIP_SEMI (p, 3);
      break;
    default:
      /* Not an entity we know; VALUE stays -1.  */
      break;
    }
  return value;
}
#undef ENT1
#undef ENT2
#undef ENT3
#undef FITS
#undef SKIP_SEMI
346
347	enum {
348	AP_DOWNCASE = 1,
349	AP_DECODE_ENTITIES = 2,
350	AP_TRIM_BLANKS = 4
351	};
352
353	/* Copy the text in the range [BEG, END) to POOL, optionally
354	performing operations specified by FLAGS. FLAGS may be any
355	combination of AP_DOWNCASE, AP_DECODE_ENTITIES and AP_TRIM_BLANKS
356	with the following meaning:
357
358	* AP_DOWNCASE -- downcase all the letters;
359
360	* AP_DECODE_ENTITIES -- decode the named and numeric entities in
361	the ASCII range when copying the string.
362
363	* AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end
364	of text, as well as embedded newlines. */
365
366	static void
367	convert_and_copy (struct pool pool, const char beg, const char *end, int flags)
368	{
369	int old_tail = pool->tail;
370
371	/* Skip blanks if required. We must do this before entities are
372	processed, so that blanks can still be inserted as, for instance,
373	` '. */
374	if (flags & AP_TRIM_BLANKS)
375	{
376	while (beg < end && ISSPACE (*beg))
377	++beg;
378	while (end > beg && ISSPACE (end[-1]))
379	--end;
380	}
381
382	if (flags & AP_DECODE_ENTITIES)
383	{
384	/* Grow the pool, then copy the text to the pool character by
385	character, processing the encountered entities as we go
386	along.
387
388	It's safe (and necessary) to grow the pool in advance because
389	processing the entities can only shorten the string, it can
390	never lengthen it. */
391	const char *from = beg;
392	char *to;
393	int squash_newlines = flags & AP_TRIM_BLANKS;
394
395	POOL_GROW (pool, end - beg);
396	to = pool->contents + pool->tail;
397
398	while (from < end)
399	{
400	if (*from == '&')
401	{
402	int entity = decode_entity (&from, end);
403	if (entity != -1)
404	*to++ = entity;
405	else
406	to++ = from++;
407	}
408	else if ((from == '\n' \|\| from == '\r') && squash_newlines)
409	++from;
410	else
411	to++ = from++;
412	}
413	/* Verify that we haven't exceeded the original size. (It
414	shouldn't happen, hence the assert.) */
415	assert (to - (pool->contents + pool->tail) <= end - beg);
416
417	/* Make POOL's tail point to the position following the string
418	we've written. */
419	pool->tail = to - pool->contents;
420	POOL_APPEND_CHR (pool, '\0');
421	}
422	else
423	{
424	/* Just copy the text to the pool. */
425	POOL_APPEND (pool, beg, end);
426	POOL_APPEND_CHR (pool, '\0');
427	}
428
429	if (flags & AP_DOWNCASE)
430	{
431	char *p = pool->contents + old_tail;
432	for (; *p; p++)
433	p = TOLOWER (p);
434	}
435	}
436
437
/* Originally we used to adhere to rfc 1866 here, and allowed only
   letters, digits, periods, and hyphens as names (of tags or
   attributes).  However, this broke too many pages which used
   proprietary or strange attributes, e.g. <img src="a.gif"
   v:shapes="whatever">.

   So now we allow any character except:
     * whitespace
     * 8-bit and control chars
     * characters that clearly cannot be part of name:
       '=', '>', '/'.

   This only affects attribute and tag names; attribute values allow
   an even greater variety of characters.

   Note that X is evaluated several times, so it must not have side
   effects.  */

#define NAME_CHAR_P(x) ((x) > 32 && (x) < 127                           \
                        && (x) != '=' && (x) != '>' && (x) != '/')

#ifdef STANDALONE
static int comment_backout_count;
#endif

/* Advance over an SGML declaration, such as <!DOCTYPE ...>.  In
   strict comments mode, this is used for skipping over comments as
   well.

   To recap: any SGML declaration may have comments associated with
   it, e.g.
       <!MY-DECL -- isn't this fun? -- foo bar>

   An HTML comment is merely an empty declaration (<!>) with a comment
   attached, like this:
       <!-- some stuff here -->

   Several comments may be embedded in one comment declaration:
       <!-- have -- -- fun -->

   Whitespace is allowed between and after the comments, but not
   before the first comment.  Additionally, this function attempts to
   handle double quotes in SGML declarations correctly.

   BEG must point at the `!' that follows the opening `<'; END is the
   end of the buffer.  On success the return value points just past
   the closing `>'.  If the text cannot be parsed as a declaration
   ("backout"), BEG + 1 is returned so that the caller resumes right
   after the `!'.  If BEG equals END, BEG is returned.

   Because the end-of-buffer test runs before the current character is
   examined, a declaration whose closing `>' is the very last byte of
   the buffer is backed out as well.  */

static const char *
advance_declaration (const char *beg, const char *end)
{
  const char *p = beg;
  char quote_char = '\0';       /* shut up, gcc! */
  char ch;

  enum {
    AC_S_DONE,
    AC_S_BACKOUT,
    AC_S_BANG,
    AC_S_DEFAULT,
    AC_S_DCLNAME,
    AC_S_DASH1,
    AC_S_DASH2,
    AC_S_COMMENT,
    AC_S_DASH3,
    AC_S_DASH4,
    AC_S_QUOTE1,
    AC_S_IN_QUOTE,
    AC_S_QUOTE2
  } state = AC_S_BANG;

  if (beg == end)
    return beg;
  ch = *p++;

  /* It looked like a good idea to write this as a state machine, but
     now I wonder...

     Invariant: CH is the character being examined and P points just
     past it.  Every `ch = *p++' below is safe because the loop gives
     up as soon as P reaches END.  */

  while (state != AC_S_DONE && state != AC_S_BACKOUT)
    {
      if (p == end)
        state = AC_S_BACKOUT;
      switch (state)
        {
        case AC_S_DONE:
        case AC_S_BACKOUT:
          break;
        case AC_S_BANG:
          if (ch == '!')
            {
              ch = *p++;
              state = AC_S_DEFAULT;
            }
          else
            state = AC_S_BACKOUT;
          break;
        case AC_S_DEFAULT:
          switch (ch)
            {
            case '-':
              state = AC_S_DASH1;
              break;
            case ' ':
            case '\t':
            case '\r':
            case '\n':
              ch = *p++;
              break;
            case '>':
              state = AC_S_DONE;
              break;
            case '\'':
            case '\"':
              state = AC_S_QUOTE1;
              break;
            default:
              if (NAME_CHAR_P (ch))
                state = AC_S_DCLNAME;
              else
                state = AC_S_BACKOUT;
              break;
            }
          break;
        case AC_S_DCLNAME:
          if (ch == '-')
            state = AC_S_DASH1;
          else if (NAME_CHAR_P (ch))
            ch = *p++;
          else
            state = AC_S_DEFAULT;
          break;
        case AC_S_QUOTE1:
          /* We must use 0x22 because broken assert macros choke on
             '"' and '\"'.  */
          assert (ch == '\'' || ch == 0x22);
          quote_char = ch;      /* cheating -- I really don't feel like
                                   introducing more different states for
                                   different quote characters. */
          ch = *p++;
          state = AC_S_IN_QUOTE;
          break;
        case AC_S_IN_QUOTE:
          if (ch == quote_char)
            state = AC_S_QUOTE2;
          else
            ch = *p++;
          break;
        case AC_S_QUOTE2:
          assert (ch == quote_char);
          ch = *p++;
          state = AC_S_DEFAULT;
          break;
        case AC_S_DASH1:
          assert (ch == '-');
          ch = *p++;
          state = AC_S_DASH2;
          break;
        case AC_S_DASH2:
          switch (ch)
            {
            case '-':
              ch = *p++;
              state = AC_S_COMMENT;
              break;
            default:
              /* A single dash is not a comment opener.  */
              state = AC_S_BACKOUT;
            }
          break;
        case AC_S_COMMENT:
          switch (ch)
            {
            case '-':
              state = AC_S_DASH3;
              break;
            default:
              ch = *p++;
              break;
            }
          break;
        case AC_S_DASH3:
          assert (ch == '-');
          ch = *p++;
          state = AC_S_DASH4;
          break;
        case AC_S_DASH4:
          switch (ch)
            {
            case '-':
              /* "--" seen: the comment is closed.  */
              ch = *p++;
              state = AC_S_DEFAULT;
              break;
            default:
              /* Lone dash inside a comment; keep scanning it.  */
              state = AC_S_COMMENT;
              break;
            }
          break;
        }
    }

  if (state == AC_S_BACKOUT)
    {
#ifdef STANDALONE
      ++comment_backout_count;
#endif
      return beg + 1;
    }
  return p;
}
639
/* Find the first occurrence of the substring "-->" in [BEG, END) and
   return the pointer to the character after the substring.  If the
   substring is not found, return NULL.  */

static const char *
find_comment_end (const char *beg, const char *end)
{
  /* Open-coded Boyer-Moore search for "-->".  Examine the third char;
     if it's not '>' or '-', advance by three characters.  Otherwise,
     look at the preceding characters and try to find a match.

     All bounds tests are written as distances (END - P) so that no
     pointer outside [BEG, END] is ever formed.  */

  const char *p;

  /* Anything shorter than three bytes cannot contain "-->".  */
  if (end - beg < 3)
    return NULL;

  for (p = beg + 2; ; p += 3)
    {
      switch (p[0])
        {
        case '>':
          if (p[-1] == '-' && p[-2] == '-')
            return p + 1;
          break;
        case '-':
        at_dash:
          if (p[-1] == '-')
            {
              /* "--" seen; any further dashes keep us in this state
                 until a '>' or some other character shows up.  */
            at_dash_dash:
              if (++p == end) return NULL;
              switch (p[0])
                {
                case '>': return p + 1;
                case '-': goto at_dash_dash;
                }
            }
          else
            {
              /* Lone dash: if it starts "-->", the '>' is two ahead.  */
              if (end - p <= 2) return NULL;
              p += 2;
              switch (p[0])
                {
                case '>':
                  if (p[-1] == '-')
                    return p + 1;
                  break;
                case '-':
                  goto at_dash;
                }
            }
        }
      /* Stop once fewer than three more bytes remain past P.  */
      if (end - p <= 3)
        return NULL;
    }
}
688
689
/* Return non-zero if the string inside [b, e) is present in hash
   table HT.  A NULL HT means "no filtering" and allows every name.  */

static int
name_allowed (const struct hash_table *ht, const char *b, const char *e)
{
  char *copy;
  if (!ht)
    return 1;
  /* NOTE(review): BOUNDED_TO_ALLOCA is defined outside this file; it
     presumably leaves in COPY a zero-terminated, stack-allocated copy
     of [b, e) -- confirm, since the length comes from the document
     being parsed.  */
  BOUNDED_TO_ALLOCA (b, e, copy);
  return hash_table_get (ht, copy) != NULL;
}
702
703	/* Advance P (a char pointer), with the explicit intent of being able
704	to read the next character. If this is not possible, go to finish. */
705
706	#define ADVANCE(p) do { \
707	++p; \
708	if (p >= end) \
709	goto finish; \
710	} while (0)
711
712	/* Skip whitespace, if any. */
713
714	#define SKIP_WS(p) do { \
715	while (ISSPACE (*p)) { \
716	ADVANCE (p); \
717	} \
718	} while (0)
719
720	/* Skip non-whitespace, if any. */
721
722	#define SKIP_NON_WS(p) do { \
723	while (!ISSPACE (*p)) { \
724	ADVANCE (p); \
725	} \
726	} while (0)
727
728	#ifdef STANDALONE
729	static int tag_backout_count;
730	#endif
731
732	/* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long.
733	MAPFUN will be called with two arguments: pointer to an initialized
734	struct taginfo, and MAPARG.
735
736	ALLOWED_TAGS and ALLOWED_ATTRIBUTES are hash tables the keys of
737	which are the tags and attribute names that this function should
738	use. If ALLOWED_TAGS is NULL, all tags are processed; if
739	ALLOWED_ATTRIBUTES is NULL, all attributes are returned.
740
741	(Obviously, the caller can filter out unwanted tags and attributes
742	just as well, but this is just an optimization designed to avoid
743	unnecessary copying of tags/attributes which the caller doesn't
744	care about.) */
745
746	void
747	map_html_tags (const char *text, int size,
748	void (mapfun) (struct taginfo , void ), void maparg,
749	int flags,
750	const struct hash_table *allowed_tags,
751	const struct hash_table *allowed_attributes)
752	{
753	/* storage for strings passed to MAPFUN callback; if 256 bytes is
754	too little, POOL_APPEND allocates more with malloc. */
755	char pool_initial_storage[256];
756	struct pool pool;
757
758	const char *p = text;
759	const char *end = text + size;
760
761	struct attr_pair attr_pair_initial_storage[8];
762	int attr_pair_size = countof (attr_pair_initial_storage);
763	int attr_pair_resized = 0;
764	struct attr_pair *pairs = attr_pair_initial_storage;
765
766	if (!size)
767	return;
768
769	POOL_INIT (&pool, pool_initial_storage, countof (pool_initial_storage));
770
771	{
772	int nattrs, end_tag;
773	const char tag_name_begin, tag_name_end;
774	const char *tag_start_position;
775	int uninteresting_tag;
776
777	look_for_tag:
778	POOL_REWIND (&pool);
779
780	nattrs = 0;
781	end_tag = 0;
782
783	/* Find beginning of tag. We use memchr() instead of the usual
784	looping with ADVANCE() for speed. */
785	p = memchr (p, '<', end - p);
786	if (!p)
787	goto finish;
788
789	tag_start_position = p;
790	ADVANCE (p);
791
792	/* Establish the type of the tag (start-tag, end-tag or
793	declaration). */
794	if (*p == '!')
795	{
796	if (!(flags & MHT_STRICT_COMMENTS)
797	&& p < end + 3 && p[1] == '-' && p[2] == '-')
798	{
799	/* If strict comments are not enforced and if we know
800	we're looking at a comment, simply look for the
801	terminating "-->". Non-strict is the default because
802	it works in other browsers and most HTML writers can't
803	be bothered with getting the comments right. */
804	const char *comment_end = find_comment_end (p + 3, end);
805	if (comment_end)
806	p = comment_end;
807	}
808	else
809	{
810	/* Either in strict comment mode or looking at a non-empty
811	declaration. Real declarations are much less likely to
812	be misused the way comments are, so advance over them
813	properly regardless of strictness. */
814	p = advance_declaration (p, end);
815	}
816	if (p == end)
817	goto finish;
818	goto look_for_tag;
819	}
820	else if (*p == '/')
821	{
822	end_tag = 1;
823	ADVANCE (p);
824	}
825	tag_name_begin = p;
826	while (NAME_CHAR_P (*p))
827	ADVANCE (p);
828	if (p == tag_name_begin)
829	goto look_for_tag;
830	tag_name_end = p;
831	SKIP_WS (p);
832	if (end_tag && *p != '>')
833	goto backout_tag;
834
835	if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end))
836	/* We can't just say "goto look_for_tag" here because we need
837	the loop below to properly advance over the tag's attributes. */
838	uninteresting_tag = 1;
839	else
840	{
841	uninteresting_tag = 0;
842	convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE);
843	}
844
845	/* Find the attributes. */
846	while (1)
847	{
848	const char attr_name_begin, attr_name_end;
849	const char attr_value_begin, attr_value_end;
850	const char attr_raw_value_begin, attr_raw_value_end;
851	int operation = AP_DOWNCASE; /* stupid compiler. */
852
853	SKIP_WS (p);
854
855	if (*p == '/')
856	{
857	/* A slash at this point means the tag is about to be
858	closed. This is legal in XML and has been popularized
859	in HTML via XHTML. */
860	/* <foo a=b c=d /> */
861	/* ^ */
862	ADVANCE (p);
863	SKIP_WS (p);
864	if (*p != '>')
865	goto backout_tag;
866	}
867
868	/* Check for end of tag definition. */
869	if (*p == '>')
870	break;
871
872	/* Establish bounds of attribute name. */
873	attr_name_begin = p; /* <foo bar ...> */
874	/* ^ */
875	while (NAME_CHAR_P (*p))
876	ADVANCE (p);
877	attr_name_end = p; /* <foo bar ...> */
878	/* ^ */
879	if (attr_name_begin == attr_name_end)
880	goto backout_tag;
881
882	/* Establish bounds of attribute value. */
883	SKIP_WS (p);
884	if (NAME_CHAR_P (p) \|\| p == '/' \|\| *p == '>')
885	{
886	/* Minimized attribute syntax allows `=' to be omitted.
887	For example, <UL COMPACT> is a valid shorthand for <UL
888	COMPACT="compact">. Even if such attributes are not
889	useful to Wget, we need to support them, so that the
890	tags containing them can be parsed correctly. */
891	attr_raw_value_begin = attr_value_begin = attr_name_begin;
892	attr_raw_value_end = attr_value_end = attr_name_end;
893	}
894	else if (*p == '=')
895	{
896	ADVANCE (p);
897	SKIP_WS (p);
898	if (p == '\"' \|\| p == '\'')
899	{
900	int newline_seen = 0;
901	char quote_char = *p;
902	attr_raw_value_begin = p;
903	ADVANCE (p);
904	attr_value_begin = p; /* <foo bar="baz"> */
905	/* ^ */
906	while (*p != quote_char)
907	{
908	if (!newline_seen && *p == '\n')
909	{
910	/* If a newline is seen within the quotes, it
911	is most likely that someone forgot to close
912	the quote. In that case, we back out to
913	the value beginning, and terminate the tag
914	at either `>' or the delimiter, whichever
915	comes first. Such a tag terminated at `>'
916	is discarded. */
917	p = attr_value_begin;
918	newline_seen = 1;
919	continue;
920	}
921	else if (newline_seen && *p == '>')
922	break;
923	ADVANCE (p);
924	}
925	attr_value_end = p; /* <foo bar="baz"> */
926	/* ^ */
927	if (*p == quote_char)
928	ADVANCE (p);
929	else
930	goto look_for_tag;
931	attr_raw_value_end = p; /* <foo bar="baz"> */
932	/* ^ */
933	operation = AP_DECODE_ENTITIES;
934	if (flags & MHT_TRIM_VALUES)
935	operation \|= AP_TRIM_BLANKS;
936	}
937	else
938	{
939	attr_value_begin = p; /* <foo bar=baz> */
940	/* ^ */
941	/* According to SGML, a name token should consist only
942	of alphanumerics, . and -. However, this is often
943	violated by, for instance, `%' in `width=75%'.
944	We'll be liberal and allow just about anything as
945	an attribute value. */
946	while (!ISSPACE (p) && p != '>')
947	ADVANCE (p);
948	attr_value_end = p; /* <foo bar=baz qux=quix> */
949	/* ^ */
950	if (attr_value_begin == attr_value_end)
951	/* <foo bar=> */
952	/* ^ */
953	goto backout_tag;
954	attr_raw_value_begin = attr_value_begin;
955	attr_raw_value_end = attr_value_end;
956	operation = AP_DECODE_ENTITIES;
957	}
958	}
959	else
960	{
961	/* We skipped the whitespace and found something that is
962	neither `=' nor the beginning of the next attribute's
963	name. Back out. */
964	goto backout_tag; /* <foo bar [... */
965	/* ^ */
966	}
967
968	/* If we're not interested in the tag, don't bother with any
969	of the attributes. */
970	if (uninteresting_tag)
971	continue;
972
973	/* If we aren't interested in the attribute, skip it. We
974	cannot do this test any sooner, because our text pointer
975	needs to correctly advance over the attribute. */
976	if (!name_allowed (allowed_attributes, attr_name_begin, attr_name_end))
977	continue;
978
979	GROW_ARRAY (pairs, attr_pair_size, nattrs + 1, attr_pair_resized,
980	struct attr_pair);
981
982	pairs[nattrs].name_pool_index = pool.tail;
983	convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE);
984
985	pairs[nattrs].value_pool_index = pool.tail;
986	convert_and_copy (&pool, attr_value_begin, attr_value_end, operation);
987	pairs[nattrs].value_raw_beginning = attr_raw_value_begin;
988	pairs[nattrs].value_raw_size = (attr_raw_value_end
989	- attr_raw_value_begin);
990	++nattrs;
991	}
992
993	if (uninteresting_tag)
994	{
995	ADVANCE (p);
996	goto look_for_tag;
997	}
998
999	/* By now, we have a valid tag with a name and zero or more
1000	attributes. Fill in the data and call the mapper function. */
1001	{
1002	int i;
1003	struct taginfo taginfo;
1004
1005	taginfo.name = pool.contents;
1006	taginfo.end_tag_p = end_tag;
1007	taginfo.nattrs = nattrs;
1008	/* We fill in the char pointers only now, when pool can no
1009	longer get realloc'ed. If we did that above, we could get
1010	hosed by reallocation. Obviously, after this point, the pool
1011	may no longer be grown. */
1012	for (i = 0; i < nattrs; i++)
1013	{
1014	pairs[i].name = pool.contents + pairs[i].name_pool_index;
1015	pairs[i].value = pool.contents + pairs[i].value_pool_index;
1016	}
1017	taginfo.attrs = pairs;
1018	taginfo.start_position = tag_start_position;
1019	taginfo.end_position = p + 1;
1020	/* Ta-dam! */
1021	(*mapfun) (&taginfo, maparg);
1022	ADVANCE (p);
1023	}
1024	goto look_for_tag;
1025
1026	backout_tag:
1027	#ifdef STANDALONE
1028	++tag_backout_count;
1029	#endif
1030	/* The tag wasn't really a tag. Treat its contents as ordinary
1031	data characters. */
1032	p = tag_start_position + 1;
1033	goto look_for_tag;
1034	}
1035
1036	finish:
1037	POOL_FREE (&pool);
1038	if (attr_pair_resized)
1039	xfree (pairs);
1040	}
1041
1042	#undef ADVANCE
1043	#undef SKIP_WS
1044	#undef SKIP_NON_WS
1045
1046
1047	#ifdef STANDALONE
1048	static void
1049	test_mapper (struct taginfo taginfo, void arg)
1050	{
1051	int i;
1052
1053	printf ("%s%s", taginfo->end_tag_p ? "/" : "", taginfo->name);
1054	for (i = 0; i < taginfo->nattrs; i++)
1055	printf (" %s=%s", taginfo->attrs[i].name, taginfo->attrs[i].value);
1056	putchar ('\n');
1057	++(int )arg;
1058	}
1059
1060	int main ()
1061	{
1062	int size = 256;
1063	char x = (char )xmalloc (size);
1064	int length = 0;
1065	int read_count;
1066	int tag_counter = 0;
1067
1068	while ((read_count = fread (x + length, 1, size - length, stdin)))
1069	{
1070	length += read_count;
1071	size <<= 1;
1072	x = (char *)xrealloc (x, size);
1073	}
1074
1075	map_html_tags (x, length, test_mapper, &tag_counter, 0, NULL, NULL);
1076	printf ("TAGS: %d\n", tag_counter);
1077	printf ("Tag backouts: %d\n", tag_backout_count);
1078	printf ("Comment backouts: %d\n", comment_backout_count);
1079	return 0;
1080	}
1081	#endif /* STANDALONE */

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/essentials/net-misc/wget/src/html-parse.c

Download in other formats: