Context Navigation

source: trunk/src/helpers/textv_html.c@ 163

Visit:

Last change on this file since 163 was 142, checked in by umoeller, 24 years ago
misc. updates
Property svn:eol-style set to `CRLF` Property svn:keywords set to `Author Date Id Revision`
File size: 62.4 KB

Line
1
2	/*
3	*@@sourcefile textv_html.c:
4	* this code converts HTML code to escape sequences for the
5	* XTextView control (textview.c).
6	*
7	* This code is in part ugly spaghetti, but this is intentional to
8	* make this HTML parser FAST. In general, you get about double or
9	* triple the speed compared to Netscape 4.6 on OS/2. This code
10	* doesn't understand all of HTML though, but you get most of HTML 2.
11	* There are no tables or frames at this point.
12	*
13	* The entry point into this mess is txvConvertFromHTML, which
14	* is easy to use.
15	*
16	* Note: Version numbering in this file relates to XWorkplace version
17	* numbering.
18	*
19	*@@header "helpers\textv_html.h"
20	*
21	*@@added V0.9.3 (2000-05-10) [umoeller]
22	*/
23
24	/*
25	* Copyright (C) 2000 Ulrich Möller.
26	* This program is part of the XWorkplace package.
27	* This program is free software; you can redistribute it and/or modify
28	* it under the terms of the GNU General Public License as published by
29	* the Free Software Foundation, in version 2 as it comes in the COPYING
30	* file of the XWorkplace main distribution.
31	* This program is distributed in the hope that it will be useful,
32	* but WITHOUT ANY WARRANTY; without even the implied warranty of
33	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
34	* GNU General Public License for more details.
35	*/
36
37	#define OS2EMX_PLAIN_CHAR
38	// this is needed for "os2emx.h"; if this is defined,
39	// emx will define PSZ as _signed_ char, otherwise
40	// as unsigned char
41
42	#include <os2.h>
43
44	#include <stdlib.h>
45	#include <stdio.h>
46	#include <string.h>
47
48	#include "setup.h" // code generation and debugging options
49
50	#include "helpers\linklist.h"
51	#include "helpers\stringh.h"
52	#include "helpers\textview.h"
53
54	#include "helpers\textv_html.h"
55
56	/*
57	*@@category: Helpers\PM helpers\Window classes\XTextView control\HTML conversion
58	* see textv_html.c.
59	*/
60
61	/* ******************************************************************
62	*
63	* Declarations
64	*
65	********************************************************************/
66
67	/*
68	*@@ LISTDESC:
69	* structure stored in COPYTARGET to
70	* hold list information (UL, OL, ... tags).
71	*
72	*@@added V0.9.3 (2000-05-07) [umoeller]
73	*/
74
75	typedef struct _LISTDESC
76	{
77	ULONG ulListType; // 0: unordered (UL)
78	// 1: ordered (OL)
79	// 2: definition lists (DL)
80	ULONG ulItem; // list enumeration; 1 on first item,
81	// 2 on next, ...
82	} LISTDESC, *PLISTDESC;
83
84	/*
85	*@@ COPYTARGET:
86	* monster structure which holds the current
87	* status of the HTML converter while conversion
88	* is taking place. This stores input/output pointers
89	* and various flags to avoid duplicate line breaks
90	* and such.
91	*
92	* One instance of this is created in txvConvertFromHTML
93	* on the stack and then passed to all the sub-function
94	* calls.
95	*
96	*@@added V0.9.3 (2000-05-06) [umoeller]
97	*/
98
99	typedef struct _COPYTARGET
100	{
101	PSZ pSource; // ptr into source string;
102	// valid ONLY while we're in a tag handler
103	PSZ pNewSource; // can be set by tag handler to skip characters;
104	// this is set to NULL before calling a tag
105	// handler; if this is still NULL, default
106	// processing occurs
107
108	// new string:
109	PSZ pszNew; // memory buffer
110	ULONG cbNew; // size of buffer (reallocated)
111	PSZ pTarget; // current char ptr into pszNew
112
113	// saved character while tag handler is being called
114	CHAR cSaved;
115
116	PXHTMLDATA pxhtml; // ptr to XHTMLDATA passed to txvConvertFromHTML
117
118	// formatting flags while going through the text
119	BOOL fSkipNextSpace;
120	// if TRUE, subsequent spaces are skipped
121	BOOL fNeedsLinebreak;
122	// if TRUE, \n is inserted before any other character
123	BOOL fSkipNextLinebreak;
124	// if TRUE, subsequent linebreaks are skipped
125	BOOL fPRE;
126	// are we currently in a PRE tag?
127	BOOL fInLink;
128	// are we currently in a A HREF= tag?
129
130	// arguments (attributes) for tag handlers
131	PSZ pszAttributes; // != NULL while a tag handler is being called
132	// and attributes exist for the tag
133
134	// anchors count
135	USHORT usAnchorIndex; // start with 1
136
137	// list maintenance
138	ULONG ulListLevel; // if > 0, we're in a UL or OL block;
139	// raised for each block
140	ULONG ulUnorderedListLevel; // raised with each UL block to keep track
141	// of bullets
142	ULONG ulOrderedListLevel; // raised with each UL block to keep track
143	// of 1), 2), a), b)... numbering
144	ULONG ulCurrentListType; // current list type (from highest LISTDESC)
145	BOOL fInDT; // TRUE if we're currently in a DT tag
146	LINKLIST llLists; // stack of LISTDESC items
147	} COPYTARGET, *PCOPYTARGET;
148
149	typedef VOID FNPROCESSTAG(PCOPYTARGET pct);
150	typedef FNPROCESSTAG *PFNPROCESSTAG;
151
152	/* ******************************************************************
153	*
154	* Global variables
155	*
156	********************************************************************/
157
158	/* ******************************************************************
159	*
160	* Append-char helpers
161	*
162	********************************************************************/
163
// number of bytes by which AppendChar and AppendString
// enlarge the output buffer whenever it runs full
#define COPYTARGETALLOC 100000
165
166	/*
167	*@@ AppendChar:
168	* helper for txvConvertFromHTML to
169	* append a char to the target string
170	* in COPYTARGET.
171	* This performs a few additional checks
172	* and manages memory.
173	*
174	*@@added V0.9.3 (2000-05-06) [umoeller]
175	*/
176
177	static VOID AppendChar(PCOPYTARGET pct, // in/out: formatting buffer
178	unsigned char c)
179	{
180	// calculate ofs where to store next char
181	ULONG cbOfsNext = pct->pTarget - pct->pszNew;
182	if (cbOfsNext >= pct->cbNew) // have we reached the buffer size yet?
183	{
184	// more mem needed:
185	pct->cbNew += COPYTARGETALLOC;
186	pct->pszNew = (PSZ)realloc(pct->pszNew, pct->cbNew);
187	// if first call, pszNew is NULL, and realloc
188	// behaves just like malloc
189	// adjust target, because ptr might have changed
190	pct->pTarget = pct->pszNew + cbOfsNext;
191	}
192
193	// append character
194	*pct->pTarget++ = c;
195	}
196
197	/*
198	*@@ AppendString:
199	* appends the characters in *ach,
200	* which must be null-terminated.
201	* Does NOT append a null character though.
202	*
203	*@@added V0.9.3 (2000-05-06) [umoeller]
204	*/
205
206	static VOID AppendString(PCOPYTARGET pct, // in/out: formatting buffer
207	char *ach)
208	{
209	ULONG cbAppend = strlen(ach);
210	ULONG ul;
211	PSZ pSource;
212
213	// calculate ofs where to store next char
214	ULONG cbOfsNext = pct->pTarget - pct->pszNew;
215	while (cbOfsNext + cbAppend >= pct->cbNew)
216	{
217	// more mem needed:
218	pct->cbNew += COPYTARGETALLOC;
219	pct->pszNew = (PSZ)realloc(pct->pszNew, pct->cbNew);
220	// if first call, pszNew is NULL, and realloc
221	// behaves just like malloc
222	// adjust target, because ptr might have changed
223	pct->pTarget = pct->pszNew + cbOfsNext;
224	}
225
226	// append characters
227	pSource = ach;
228	for (ul = 0;
229	ul < cbAppend;
230	ul++)
231	pct->pTarget++ = pSource++;
232	}
233
234	/*
235	*@@ AppendLinebreakCheck:
236	* checks if a linebreak is needed and
237	* inserts one if so.
238	*
239	*@@added V0.9.3 (2000-05-17) [umoeller]
240	*/
241
242	static VOID AppendLinebreakCheck(PCOPYTARGET pct)
243	{
244	if ((!pct->fPRE) && (pct->fNeedsLinebreak))
245	{
246	// yes: insert linebreak; this resets pct->fNeedsLinebreak
247	if (!pct->fSkipNextLinebreak)
248	{
249	AppendChar(pct, '\n');
250
251	if ((pct->ulListLevel) && (!pct->fInDT))
252	// if we're in a list, add a tab also,
253	// because we'll have a negative first-line margin
254	AppendString(pct, TXVESC_TAB);
255	}
256	pct->fNeedsLinebreak = FALSE;
257	}
258	}
259
260	/*
261	*@@ AppendEscapeWithDecimal:
262	* appends the specified escape code
263	* with a three-digit decimal parameter.
264	* Calls AppendString in turn.
265	*
266	*@@added V0.9.3 (2000-05-07) [umoeller]
267	*/
268
269	static VOID AppendEscapeWith3Decimals(PCOPYTARGET pct, // in/out: formatting buffer
270	char *ach,
271	USHORT us)
272	{
273	CHAR szDecimal[10];
274	if (us > 999)
275	us = 999;
276	sprintf(szDecimal, "%03d", us);
277	// append escape
278	AppendString(pct, ach);
279	AppendString(pct, szDecimal);
280	}
281
282	/*
283	*@@ AppendEscapeWith4Decimals:
284	*
285	*@@added V0.9.3 (2000-05-07) [umoeller]
286	*/
287
288	static VOID AppendEscapeWith4Decimals(PCOPYTARGET pct, // in/out: formatting buffer
289	char *ach,
290	USHORT us)
291	{
292	CHAR szDecimal[10];
293	if (us > 9999)
294	us = 9999;
295	sprintf(szDecimal, "%04d", us);
296	// append escape
297	AppendString(pct, ach);
298	AppendString(pct, szDecimal);
299	}
300
301	/* ******************************************************************
302	*
303	* Tag converter functions
304	*
305	********************************************************************/
306
307	/*
308	*@@ StartList:
309	* starts a list (UL or OL).
310	* This uses a linked list in COPYTARGET
311	* to keep a pseudo-stack for nested lists.
312	*
313	*@@added V0.9.3 (2000-05-08) [umoeller]
314	*/
315
316	static VOID StartList(PCOPYTARGET pct, // in/out: formatting buffer
317	ULONG ulListType) // list type:
318	// 0: unordered (UL)
319	// 1: ordered (OL)
320	// 2: definition lists (DL)
321	{
322	PLISTDESC pListDesc;
323
324	// raise list level
325	pct->ulListLevel++;
326
327	if (ulListType == 0)
328	// unordered:
329	pct->ulUnorderedListLevel++;
330	else if (ulListType == 1)
331	// ordered:
332	pct->ulOrderedListLevel++;
333
334	// create LISTDESC and store it on stack
335	pListDesc = (PLISTDESC)malloc(sizeof(LISTDESC));
336	pListDesc->ulListType
337	= pct->ulCurrentListType
338	= ulListType;
339	pListDesc->ulItem = 1;
340
341	lstAppendItem(&pct->llLists,
342	pListDesc);
343
344	AppendEscapeWith4Decimals(pct,
345	TXVESC_LEFTMARGIN,
346	pct->ulListLevel * 5);
347	AppendEscapeWith3Decimals(pct,
348	TXVESC_FIRSTLINEMARGIN_LEFT,
349	(ulListType == 2)
350	? 5 // for definition lists
351	: 3); // negative!
352	// add \n before any other character
353	pct->fNeedsLinebreak = TRUE;
354	}
355
356	/*
357	*@@ StopList:
358	* stops a list (UL or OL).
359	*
360	*@@added V0.9.3 (2000-05-07) [umoeller]
361	*/
362
363	static VOID StopList(PCOPYTARGET pct)
364	{
365	if (pct->ulListLevel)
366	{
367	PLISTNODE pNode;
368
369	// lower list level
370	pct->ulListLevel--;
371	AppendEscapeWith4Decimals(pct,
372	TXVESC_LEFTMARGIN,
373	pct->ulListLevel * 5);
374	AppendEscapeWith3Decimals(pct,
375	TXVESC_FIRSTLINEMARGIN_LEFT,
376	(pct->ulListLevel)
377	? 3 // we still have a list level (nested)
378	: 0);
379	pct->fNeedsLinebreak = TRUE;
380
381	// remove the LISTDESC from the stack
382	pNode = lstNodeFromIndex(&pct->llLists,
383	pct->ulListLevel); // this has been lowered already
384	if (pNode)
385	{
386	PLISTDESC pListDesc = (PLISTDESC)pNode->pItemData;
387	if (pListDesc->ulListType == 0)
388	// was unordered:
389	pct->ulUnorderedListLevel--;
390	else if (pListDesc->ulListType == 1)
391	// was ordered:
392	pct->ulOrderedListLevel--;
393
394	lstRemoveNode(&pct->llLists, pNode);
395
396	// update COPYTARGET with previous list level
397	if (pct->ulListLevel)
398	{
399	// we're still in a list (nested lists):
400	PLISTDESC pListDesc2 = (PLISTDESC)lstItemFromIndex(&pct->llLists,
401	pct->ulListLevel - 1);
402	if (pListDesc2)
403	pct->ulCurrentListType = pListDesc2->ulListType;
404	}
405	}
406	}
407	// else: buggy HTML code, ignore
408	}
409
410	/*
411	*@@ TagTITLE:
412	*
413	*@@added V0.9.3 (2000-05-19) [umoeller]
414	*/
415
416	static VOID TagTITLE(PCOPYTARGET pct)
417	{
418	// pSource currently points to <TITLE tag
419	PSZ pSource = pct->pSource + strlen(pct->pSource);
420	// points to temporary null byte in main buffer now
421	*pSource = pct->cSaved;
422
423	pSource = strchr(pct->pSource, '>');
424	if (pSource)
425	{
426	PSZ pNextOpen = strchr(pSource, '<');
427	if (pNextOpen)
428	{
429	// extract title
430	pct->pxhtml->pszTitle = strhSubstr(pSource + 1, pNextOpen);
431
432	if (strnicmp(pNextOpen + 1, "/TITLE", 6) == 0)
433	{
434	// closing /TITLE tag found:
435	// search on after that
436	pct->pNewSource = strchr(pNextOpen, '>');
437	if (pct->pNewSource)
438	pct->pNewSource++;
439	}
440	}
441	}
442	}
443
444	/*
445	*@@ TagP:
446	*
447	*/
448
449	static VOID TagP(PCOPYTARGET pct)
450	{
451	// append newline:
452	// add \n before any other character
453	pct->fNeedsLinebreak = TRUE;
454
455	/* if (pct->ulListLevel)
456	{
457	// if we are currently in a list, we must also
458	// add a tab escape, because we have set
459	// the first line margin to the left of the
460	// left margin
461	AppendString(pct,
462	TXVESC_TAB);
463	} */
464	}
465
466	static VOID TagBR(PCOPYTARGET pct)
467	{
468	AppendChar(pct,
469	'\r');
470
471	if (pct->ulListLevel)
472	{
473	// if we are currently in a list, we must also
474	// add a tab escape, because we have set
475	// the first line margin to the left of the
476	// left margin
477	AppendString(pct,
478	TXVESC_TAB);
479	}
480	if (!pct->fPRE)
481	pct->fSkipNextSpace = TRUE;
482	}
483
484	static VOID TagPRE(PCOPYTARGET pct)
485	{
486	// start of PRE tag:
487	// add \n before any other character
488	// pct->fNeedsLinebreak = TRUE;
489	AppendChar(pct, '\n');
490	pct->fNeedsLinebreak = FALSE;
491	/* AppendString(pct,
492	TXVESC_PRE_BEGIN); */
493	AppendEscapeWith3Decimals(pct,
494	TXVESC_SET_FONT,
495	1); // monospaced font
496	AppendEscapeWith4Decimals(pct,
497	TXVESC_SPACEBEFORE,
498	0); // no spacing before
499	AppendEscapeWith4Decimals(pct,
500	TXVESC_SPACEAFTER,
501	0); // no spacing after
502	// disable word-wrapping
503	AppendString(pct,
504	TXVESC_WORDWRAP "0");
505	pct->fPRE = TRUE;
506	pct->fSkipNextSpace = FALSE;
507	}
508
509	static VOID TagXPRE(PCOPYTARGET pct)
510	{
511	pct->fPRE = FALSE;
512	AppendEscapeWith3Decimals(pct,
513	TXVESC_SET_FONT,
514	0); // standard font
515	AppendString(pct, TXVESC_SPACEBEFORE);
516	AppendString(pct, "####"); // reset to default
517	AppendString(pct, TXVESC_SPACEAFTER);
518	AppendString(pct, "####"); // reset to default
519	// re-enable word-wrapping
520	AppendString(pct,
521	TXVESC_WORDWRAP "1"
522	"\n"); // force line break
523	pct->fNeedsLinebreak = FALSE;
524	// refuse to add \n even if we have another "p" coming up
525	pct->fSkipNextLinebreak = TRUE;
526	pct->fSkipNextSpace = TRUE;
527	}
528
529	static VOID TagH1(PCOPYTARGET pct)
530	{
531	pct->fNeedsLinebreak = TRUE;
532	AppendEscapeWith3Decimals(pct,
533	TXVESC_POINTSIZE_REL,
534	200); // double size
535	AppendString(pct,
536	TXVESC_BOLD_BEGIN);
537	}
538
539	static VOID TagXH1(PCOPYTARGET pct)
540	{
541	AppendString(pct,
542	TXVESC_BOLD_END);
543	AppendEscapeWith3Decimals(pct,
544	TXVESC_POINTSIZE_REL,
545	100); // regular size
546	// add \n before any other character
547	pct->fNeedsLinebreak = TRUE;
548	}
549
550	static VOID TagH2(PCOPYTARGET pct)
551	{
552	pct->fNeedsLinebreak = TRUE;
553	AppendEscapeWith3Decimals(pct,
554	TXVESC_POINTSIZE_REL,
555	175); // size in percent of regular point size
556	AppendString(pct,
557	TXVESC_BOLD_BEGIN);
558	}
559
560	static VOID TagXH2(PCOPYTARGET pct)
561	{
562	AppendString(pct,
563	TXVESC_BOLD_END);
564	AppendEscapeWith3Decimals(pct,
565	TXVESC_POINTSIZE_REL,
566	100); // regular size
567	// add \n before any other character
568	pct->fNeedsLinebreak = TRUE;
569	}
570
571	static VOID TagH3(PCOPYTARGET pct)
572	{
573	pct->fNeedsLinebreak = TRUE;
574	AppendEscapeWith3Decimals(pct,
575	TXVESC_POINTSIZE_REL,
576	150); // size in percent of regular point size
577	AppendString(pct,
578	TXVESC_BOLD_BEGIN);
579	}
580
581	static VOID TagXH3(PCOPYTARGET pct)
582	{
583	AppendString(pct,
584	TXVESC_BOLD_END);
585	AppendEscapeWith3Decimals(pct,
586	TXVESC_POINTSIZE_REL,
587	100); // size in percent of regular point size
588	// add \n before any other character
589	pct->fNeedsLinebreak = TRUE;
590	}
591
592	static VOID TagH4(PCOPYTARGET pct)
593	{
594	pct->fNeedsLinebreak = TRUE;
595	AppendEscapeWith3Decimals(pct,
596	TXVESC_POINTSIZE_REL,
597	125); // size in percent of regular point size
598	AppendString(pct,
599	TXVESC_BOLD_BEGIN);
600	}
601
602	static VOID TagXH4(PCOPYTARGET pct)
603	{
604	AppendString(pct,
605	TXVESC_BOLD_END);
606	AppendEscapeWith3Decimals(pct,
607	TXVESC_POINTSIZE_REL,
608	100); // regular size
609	// add \n before any other character
610	pct->fNeedsLinebreak = TRUE;
611	}
612
613	static VOID TagH5(PCOPYTARGET pct)
614	{
615	pct->fNeedsLinebreak = TRUE;
616	AppendEscapeWith3Decimals(pct,
617	TXVESC_POINTSIZE_REL,
618	100); // size in percent of regular point size
619	AppendString(pct,
620	TXVESC_BOLD_BEGIN);
621	}
622
623	static VOID TagXH5(PCOPYTARGET pct)
624	{
625	AppendString(pct,
626	TXVESC_BOLD_END);
627	AppendEscapeWith3Decimals(pct,
628	TXVESC_POINTSIZE_REL,
629	100); // regular size
630	// add \n before any other character
631	pct->fNeedsLinebreak = TRUE;
632	}
633
634	static VOID TagH6(PCOPYTARGET pct)
635	{
636	pct->fNeedsLinebreak = TRUE;
637	AppendEscapeWith3Decimals(pct,
638	TXVESC_POINTSIZE_REL,
639	80 ); // size in percent of regular point size
640	AppendString(pct,
641	TXVESC_BOLD_BEGIN);
642	}
643
644	static VOID TagXH6(PCOPYTARGET pct)
645	{
646	AppendString(pct,
647	TXVESC_BOLD_END);
648	AppendEscapeWith3Decimals(pct,
649	TXVESC_POINTSIZE_REL,
650	100); // regular size
651	// add \n before any other character
652	pct->fNeedsLinebreak = TRUE;
653	}
654
655	static VOID TagUL(PCOPYTARGET pct)
656	{
657	StartList(pct,
658	0); // unordered
659	}
660
661	static VOID TagXUL(PCOPYTARGET pct)
662	{
663	StopList(pct);
664	}
665
666	static VOID TagOL(PCOPYTARGET pct)
667	{
668	StartList(pct,
669	1); // ordered
670	}
671
672	static VOID TagXOL(PCOPYTARGET pct)
673	{
674	StopList(pct);
675	}
676
677	static VOID TagLI(PCOPYTARGET pct)
678	{
679	PLISTDESC pListDesc;
680	CHAR szMarker[20] = TXVESC_MARKER "\x01";
681
682	if (pct->ulListLevel)
683	{
684	// we're in a list:
685	pListDesc = (PLISTDESC)lstItemFromIndex(&pct->llLists,
686	pct->ulListLevel - 1);
687	if (pListDesc)
688	{
689	if (pListDesc->ulListType == 1)
690	// is ordered list:
691	sprintf(szMarker, "%lu.", (pListDesc->ulItem)++);
692	else if (pListDesc->ulListType == 0)
693	// is unordered list:
694	// set bullet type according to unordered nesting
695	szMarker[2] = pct->ulUnorderedListLevel;
696	}
697	}
698
699	// add \n before any other character
700	// pct->fNeedsLinebreak = TRUE;
701	// if (pct->fNeedsLinebreak)
702	{
703	AppendChar(pct, '\n');
704	pct->fNeedsLinebreak = FALSE;
705	}
706
707	AppendString(pct, szMarker);
708	AppendString(pct, TXVESC_TAB);
709	}
710
711	static VOID TagDL(PCOPYTARGET pct)
712	{
713	StartList(pct,
714	2); // definition list
715	}
716
717	static VOID TagXDL(PCOPYTARGET pct)
718	{
719	StopList(pct);
720	pct->fInDT = FALSE;
721	}
722
723	static VOID TagDT(PCOPYTARGET pct)
724	{
725	pct->fNeedsLinebreak = TRUE;
726	pct->fInDT = TRUE;
727	}
728
729	static VOID TagDD(PCOPYTARGET pct)
730	{
731	pct->fNeedsLinebreak = TRUE;
732	AppendString(pct, TXVESC_TAB);
733	if (!pct->fPRE)
734	pct->fSkipNextSpace = TRUE;
735	pct->fInDT = FALSE;
736	}
737
738	static VOID TagTR(PCOPYTARGET pct)
739	{
740	pct->fNeedsLinebreak = TRUE;
741	}
742
743	static VOID TagB(PCOPYTARGET pct)
744	{
745	AppendString(pct,
746	TXVESC_BOLD_BEGIN);
747	}
748
749	static VOID TagXB(PCOPYTARGET pct)
750	{
751	AppendString(pct,
752	TXVESC_BOLD_END);
753	}
754
755	static VOID TagI(PCOPYTARGET pct)
756	{
757	AppendString(pct,
758	TXVESC_ITALICS_BEGIN);
759	}
760
761	static VOID TagXI(PCOPYTARGET pct)
762	{
763	AppendString(pct,
764	TXVESC_ITALICS_END);
765	}
766
767	static VOID TagU(PCOPYTARGET pct)
768	{
769	AppendString(pct,
770	TXVESC_UNDERLINE_BEGIN);
771	}
772
773	static VOID TagXU(PCOPYTARGET pct)
774	{
775	AppendString(pct,
776	TXVESC_UNDERLINE_END);
777	}
778
779	static VOID TagSTRIKE(PCOPYTARGET pct)
780	{
781	AppendString(pct,
782	TXVESC_STRIKE_BEGIN);
783	}
784
785	static VOID TagXSTRIKE(PCOPYTARGET pct)
786	{
787	AppendString(pct,
788	TXVESC_STRIKE_END);
789	}
790
791	static VOID TagCODE(PCOPYTARGET pct)
792	{
793	AppendEscapeWith3Decimals(pct,
794	TXVESC_SET_FONT,
795	1); // monospaced font
796	}
797
798	static VOID TagXCODE(PCOPYTARGET pct)
799	{
800	AppendEscapeWith3Decimals(pct,
801	TXVESC_SET_FONT,
802	0); // regular font
803	}
804
805	static VOID TagA(PCOPYTARGET pct)
806	{
807	CHAR szAnchor[10];
808
809	pct->fInLink = FALSE;
810
811	if ((pct->pszAttributes) && (pct->pxhtml)) // points into main source buffer
812	{
813	// we have attributes:
814	PSZ pszClosingTag = strchr(pct->pszAttributes, '>');
815	if (pszClosingTag)
816	{
817	ULONG ulOfs = 0;
818
819	/*
820	* HREF attribute:
821	*
822	*/
823
824	PSZ pHREF = strhGetTextAttr(pct->pszAttributes, "HREF", &ulOfs),
825	pNAME = 0;
826
827	// replace '>' with null char to mark end of search
828	*pszClosingTag = 0;
829
830	if (pHREF)
831	{
832	// OK, we got a link target:
833	// create a link item and append it to the output list
834	PXHTMLLINK pNewLink = (PXHTMLLINK)malloc(sizeof(XHTMLLINK));
835	memset(pNewLink, 0, sizeof(XHTMLLINK));
836
837	pct->fInLink = TRUE;
838
839	// this starts with anchor 1
840	pNewLink->usLinkIndex = ++pct->usAnchorIndex;
841	pNewLink->pszTargetFile = pHREF;
842	// do not free
843	lstAppendItem(&pct->pxhtml->llLinks, pNewLink);
844	}
845
846	/*
847	* NAME attribute:
848	*
849	*/
850
851	pNAME = strhGetTextAttr(pct->pszAttributes, "NAME", &ulOfs);
852	if (pNAME)
853	{
854	AppendString(pct,
855	TXVESC_ANCHORNAME);
856	AppendString(pct,
857	pNAME);
858	// must be terminated with 0xFF
859	AppendChar(pct, 0xFF);
860	free(pNAME);
861	}
862	// restore '>'
863	*pszClosingTag = '>';
864	}
865	}
866
867	if (pct->fInLink)
868	{
869	sprintf(szAnchor, "%04hX", pct->usAnchorIndex);
870	AppendString(pct,
871	TXVESC_LINK);
872	AppendString(pct,
873	szAnchor);
874	}
875	}
876
877	static VOID TagXA(PCOPYTARGET pct)
878	{
879	if (pct->fInLink)
880	{
881	AppendString(pct,
882	TXVESC_LINK "####");
883	pct->fInLink = FALSE;
884	}
885	}
886
887	/* ******************************************************************
888	*
889	* Tag helpers
890	*
891	********************************************************************/
892
893	/*
894	*@@ FindTagProcessor:
895	* returns the Tag* function which handles the
896	* given tag or NULL if there's none.
897	*
898	*@@added V0.9.4 (2000-06-10) [umoeller]
899	*/
900
901	static PFNPROCESSTAG FindTagProcessor(PSZ pszTag)
902	{
903	PFNPROCESSTAG pProcessor = NULL;
904
905	CHAR c0,
906	c1;
907
908	BOOL fEndOfTag = FALSE;
909
910	PSZ pCheck = pszTag,
911	p2;
912	if (*pCheck == '/')
913	{
914	// end of tag:
915	fEndOfTag = TRUE;
916	pCheck++;
917	}
918
919	c0 = *pCheck;
920	c1 = *(pCheck + 1);
921
922	p2 = pCheck + 2;
923
924	switch (c0)
925	{
926	case 'A':
927	case 'a':
928	switch (c1)
929	{
930	case 0: // A
931	if (!fEndOfTag)
932	return TagA;
933	else
934	return TagXA;
935	case 'D': // ADDRESS
936	case 'd': // ADDRESS
937	if (stricmp(p2, "DRESS") == 0)
938	{
939	if (!fEndOfTag)
940	return TagI;
941	else
942	return TagXI;
943	}
944	}
945	break;
946
947	case 'B':
948	case 'b':
949	switch (c1)
950	{
951	case 0:
952	if (!fEndOfTag)
953	return TagB;
954	else
955	return TagXB;
956
957	case 'R': // BR
958	case 'r': // BR
959	if (*p2 == 0)
960	if (!fEndOfTag)
961	return TagBR;
962	}
963	break;
964
965	case 'C':
966	case 'c':
967	switch (c1)
968	{
969	case 'I': // CITE
970	case 'i': // CITE
971	if (stricmp(p2, "TE") == 0)
972	{
973	if (!fEndOfTag)
974	return TagI;
975	else
976	return TagXI;
977	}
978	break;
979
980	case 'O':
981	case 'o':
982	if (stricmp(p2, "DE") == 0)
983	{
984	if (!fEndOfTag)
985	return TagCODE;
986	else
987	return TagXCODE;
988	}
989	break;
990	}
991	break;
992
993	case 'D':
994	case 'd':
995	switch (c1)
996	{
997	case 'D': // DD
998	case 'd': // DD
999	if ((*p2 == 0) && (!fEndOfTag))
1000	return (TagDD);
1001	break;
1002
1003	case 'I': // DIR
1004	case 'i': // DIR
1005	if (*p2 == 'R')
1006	if (*(pCheck + 3) == 0)
1007	{
1008	if (!fEndOfTag)
1009	return TagUL;
1010	else
1011	return TagXUL;
1012	}
1013	break;
1014
1015	case 'L': // DL
1016	case 'l': // DL
1017	if (*p2 == 0)
1018	{
1019	if (!fEndOfTag)
1020	return TagDL;
1021	else
1022	return TagXDL;
1023	}
1024	break;
1025
1026	case 'T': // DT
1027	case 't': // DT
1028	if ((*p2 == 0) && (!fEndOfTag))
1029	return TagDT;
1030	break;
1031	}
1032	break;
1033
1034	case 'E':
1035	case 'e':
1036	if ( (c1 == 'M') \|\| (c1 == 'm') ) // EM
1037	if (*p2 == 0)
1038	{
1039	if (!fEndOfTag)
1040	return TagI;
1041	else
1042	return TagXI;
1043	}
1044	break;
1045
1046	case 'H':
1047	case 'h':
1048	if (c1)
1049	if (*p2 == 0)
1050	switch (c1)
1051	{
1052	case '1':
1053	if (!fEndOfTag)
1054	return TagH1;
1055	else
1056	return TagXH1;
1057	case '2':
1058	if (!fEndOfTag)
1059	return TagH2;
1060	else
1061	return TagXH2;
1062	case '3':
1063	if (!fEndOfTag)
1064	return TagH3;
1065	else
1066	return TagXH3;
1067	case '4':
1068	if (!fEndOfTag)
1069	return TagH4;
1070	else
1071	return TagXH4;
1072	case '5':
1073	if (!fEndOfTag)
1074	return TagH5;
1075	else
1076	return TagXH5;
1077	case '6':
1078	if (!fEndOfTag)
1079	return TagH6;
1080	else
1081	return TagXH6;
1082	}
1083	break;
1084
1085	case 'I':
1086	case 'i':
1087	if (c1 == 0)
1088	{
1089	if (!fEndOfTag)
1090	return TagI;
1091	else
1092	return TagXI;
1093	}
1094	break;
1095
1096	case 'L':
1097	case 'l':
1098	if ((c1 == 'I') \|\| (c1 == 'i'))
1099	if (*p2 == 0)
1100	return TagLI;
1101	break;
1102
1103	case 'M':
1104	case 'm':
1105	if (stricmp(p2, "NU") == 0)
1106	{
1107	if (!fEndOfTag)
1108	return TagUL;
1109	else
1110	return TagXUL;
1111	}
1112	break;
1113
1114	case 'O':
1115	case 'o':
1116	if ((c1 == 'L') \|\| (c1 == 'l'))
1117	if (*p2 == 0)
1118	{
1119	if (!fEndOfTag)
1120	return TagOL;
1121	else
1122	return TagXOL;
1123	}
1124	break;
1125
1126	case 'P':
1127	case 'p':
1128	switch (c1)
1129	{
1130	case 0:
1131	if (!fEndOfTag)
1132	return TagP;
1133	break;
1134
1135	case 'R': // PRE
1136	case 'r': // PRE
1137	if ((p2 == 'E') \|\| (p2 == 'e'))
1138	if (*(pCheck + 3) == 0)
1139	{
1140	if (!fEndOfTag)
1141	return TagPRE;
1142	else
1143	return TagXPRE;
1144	}
1145	break;
1146	}
1147	break;
1148
1149	case 'S':
1150	case 's':
1151	switch (c1)
1152	{
1153	case 'T': // STRONG
1154	case 't': // STRONG
1155	if (stricmp(p2, "RONG") == 0)
1156	{
1157	if (!fEndOfTag)
1158	return TagB;
1159	else
1160	return TagXB;
1161	}
1162	else if (stricmp(p2, "RIKE") == 0)
1163	{
1164	if (!fEndOfTag)
1165	return TagSTRIKE;
1166	else
1167	return TagXSTRIKE;
1168	}
1169	break;
1170
1171	case 'A':
1172	case 'a':
1173	if (stricmp(p2, "MP") == 0)
1174	{
1175	if (!fEndOfTag)
1176	return TagCODE;
1177	else
1178	return TagXCODE;
1179	}
1180	break;
1181	}
1182	break;
1183
1184	case 'T':
1185	case 't':
1186	switch (c1)
1187	{
1188	case 'R':
1189	case 'r':
1190	if (*p2 == 0)
1191	return TagTR;
1192	break;
1193
1194	case 'I':
1195	case 'i':
1196	if (stricmp(p2, "TLE") == 0)
1197	return TagTITLE;
1198	break;
1199
1200	case 'T': // TT
1201	case 't':
1202	if (*p2 == 0)
1203	{
1204	if (!fEndOfTag)
1205	return TagCODE;
1206	else
1207	return TagXCODE;
1208	}
1209	break;
1210	}
1211	break;
1212
1213	case 'U':
1214	case 'u':
1215	switch (c1)
1216	{
1217	case 0:
1218	if (!fEndOfTag)
1219	return TagU;
1220	else
1221	return TagXU;
1222
1223	case 'L':
1224	case 'l':
1225	if (*p2 == 0)
1226	{
1227	if (!fEndOfTag)
1228	return TagUL;
1229	else
1230	return TagXUL;
1231	}
1232	break;
1233	}
1234	break;
1235
1236	case 'V':
1237	case 'v':
1238	if (stricmp(p2, "R") == 0)
1239	{
1240	if (!fEndOfTag)
1241	return TagI;
1242	else
1243	return TagXI;
1244	}
1245	break;
1246
1247	case 'X':
1248	case 'x':
1249	if (stricmp(p2, "MP") == 0) // XMP
1250	{
1251	if (!fEndOfTag)
1252	return TagPRE;
1253	else
1254	return TagXPRE;
1255	}
1256	break;
1257	}
1258
1259	return (pProcessor);
1260	}
1261
1262	/*
1263	*@@ HandleTag:
1264	* called by txvConvertFromHTML when a "<" character
1265	* is found in the source buffer. This calls
1266	* FindTagProcessor in turn to find the Tag*
1267	* function which handles the tag.
1268	*
1269	*@@added V0.9.3 (2000-05-18) [umoeller]
1270	*/
1271
1272	static VOID HandleTag(PCOPYTARGET pct)
1273	{
1274	PSZ pStartOfTag = pct->pSource;
1275	// '<' == begin of tag:
1276
1277	// is it a comment? <!-- ... -->
1278	if (strncmp(pStartOfTag + 1, "!--", 3) == 0)
1279	{
1280	// start of comment:
1281	// find end of comment
1282	PSZ pEnd = strstr(pStartOfTag, "-->");
1283	if (pEnd)
1284	// found:
1285	// search on after end of comment
1286	pct->pSource = pEnd + 3;
1287	else
1288	{
1289	// end of comment not found:
1290	// stop formatting...
1291	pct->pSource++;
1292	return;
1293	}
1294	}
1295	else
1296	{
1297	// no comment:
1298	// find end of tag
1299	PSZ p2 = pStartOfTag + 1,
1300	pNextClose = 0, // receives first '>' after '<'
1301	pNextSpace = 0; // receives first ' ' after '<'
1302	BOOL fCont = TRUE;
1303	while (fCont)
1304	{
1305	switch (*p2)
1306	{
1307	case ' ':
1308	case '\r':
1309	case '\n':
1310	// store first space after '<'
1311	if (!pNextSpace)
1312	pNextSpace = p2;
1313	// overwrite line breaks with spaces;
1314	// otherwise we cannot handle tags which go across
1315	// several lines, which is valid HTML
1316	*p2 = ' ';
1317	break;
1318
1319	case '>': // end of tag found:
1320	pNextClose = p2;
1321	fCont = FALSE;
1322	break;
1323
1324	case '<':
1325	// another opening tag:
1326	// that's an HTML error
1327	AppendChar(pct,
1328	*pct->pSource++);
1329	fCont = FALSE;
1330	break;
1331
1332	case 0:
1333	fCont = FALSE;
1334	break;
1335	}
1336	p2++;
1337	}
1338
1339	if (pNextClose)
1340	{
1341	// end of tag found:
1342	ULONG cbTag;
1343	// PSZ pStartOfAttrs = 0;
1344
1345	if ((pNextSpace) && (pNextSpace < pNextClose))
1346	{
1347	// we have attributes:
1348	cbTag = pNextSpace - (pStartOfTag + 1);
1349	// pStartOfAttrs = pNextSpace;
1350	}
1351	else
1352	cbTag = pNextClose - (pStartOfTag + 1);
1353
1354	if (!cbTag)
1355	{
1356	// happens if we have a "<>" in the text:
1357	// just insert the '<>' and go on, we have no tag here
1358	AppendChar(pct,
1359	*pct->pSource++);
1360	AppendChar(pct,
1361	*pct->pSource++);
1362	}
1363	else
1364	{
1365	PFNPROCESSTAG pTagProcessor;
1366
1367	pct->cSaved = *(pStartOfTag + cbTag + 1);
1368	// add a null terminator
1369	*(pStartOfTag + cbTag + 1) = 0;
1370
1371	// find corresponding tag converter function
1372	// from G_TagProcessors map
1373	pTagProcessor = FindTagProcessor(pStartOfTag + 1); // pszTag);
1374
1375	// restore char under null terminator
1376	*(pStartOfTag + cbTag + 1) = pct->cSaved;
1377
1378	// reset new source ptr; the tag handler
1379	// can modify this
1380	pct->pNewSource = NULL;
1381
1382	if (pTagProcessor)
1383	{
1384	// tag understood:
1385
1386	// terminate string after closing tag
1387	pct->cSaved = *(pNextClose + 1); // can be null byte!
1388	*(pNextClose + 1) = 0;
1389
1390	// did we have attributes?
1391	if (pNextSpace)
1392	pct->pszAttributes = pNextSpace;
1393
1394	// finally, call the tag handler
1395	(pTagProcessor) // function
1396	(pct); // argument
1397
1398	*(pNextClose + 1) = pct->cSaved;
1399	}
1400
1401	if (pct->pNewSource == NULL)
1402	// tag handler needs no special processing:
1403	// skip '>' too
1404	pct->pSource = pNextClose + 1;
1405	else
1406	// tag handler has skipped something:
1407	pct->pSource = pct->pNewSource;
1408	}
1409	}
1410	}
1411	}
1412
1413	/*
1414	*@@ ConvertEscape:
1415	* called by HandleEscape to find the ANSI (CP 1004)
1416	* character for the given escape sequence (pszTag).
1417	*
1418	* pszTag must be null-terminated and contain only
1419	* the stuff between "&" and ";".
1420	*
1421	* This is really ugly spaghetti, but it's the fastest
1422	* way to do it.
1423	*
1424	*@@added V0.9.4 (2000-06-10) [umoeller]
1425	*/
1426
1427	static unsigned char ConvertEscape(PSZ pszTag)
1428	{
1429	CHAR c0, c1;
1430	CHAR crc = 0;
1431
1432	PSZ p2 = pszTag + 2;
1433
1434	c0 = *pszTag;
1435	c1 = *(pszTag + 1);
1436
1437	switch (c0)
1438	{
1439	case 'a':
1440	switch (c1)
1441	{
1442	case 'a':
1443	if (strcmp(p2, "cute") == 0)
1444	return 225;
1445	break;
1446
1447	case 'c':
1448	if (strcmp(p2, "irc") == 0)
1449	return 226;
1450	else if (strcmp(p2, "ute") == 0)
1451	return 180;
1452	break;
1453
1454	case 'e':
1455	if (strcmp(p2, "lig") == 0)
1456	return 230;
1457	break;
1458
1459	case 'g':
1460	if (strcmp(p2, "rave") == 0)
1461	return 224;
1462	break;
1463
1464	case 'm':
1465	if (strcmp(p2, "p") == 0)
1466	return '&';
1467	break;
1468
1469	case 'r':
1470	if (strcmp(p2, "ing") == 0)
1471	return 229;
1472	break;
1473
1474	case 't':
1475	if (strcmp(p2, "ilde") == 0)
1476	return 227;
1477	break;
1478
1479	case 'u':
1480	if (strcmp(p2, "ml") == 0)
1481	return 228;
1482	break;
1483	}
1484	break;
1485
1486	case 'b':
1487	if (strcmp(pszTag + 1, "rvbar") == 0)
1488	return 166;
1489	break;
1490
1491	case 'c':
1492	switch (c1)
1493	{
1494	case 'c':
1495	if (strcmp(p2, "edil") == 0)
1496	return 231;
1497	break;
1498
1499	case 'e':
1500	if (strcmp(p2, "dil") == 0)
1501	return 184;
1502	else if (strcmp(p2, "nt") == 0)
1503	return 162;
1504	break;
1505
1506	case 'o':
1507	if (strcmp(p2, "py") == 0)
1508	return 169;
1509	break;
1510
1511	case 'u':
1512	if (strcmp(p2, "rren") == 0)
1513	return 164;
1514	}
1515	break;
1516
1517	case 'd':
1518	switch (c1)
1519	{
1520	case 'e':
1521	if (strcmp(p2, "g") == 0) return 176;
1522	break;
1523
1524	case 'i':
1525	if (strcmp(p2, "vide") == 0) return 247;
1526	break;
1527	}
1528	break;
1529
1530	case 'e':
1531	switch (c1)
1532	{
1533	case 'a':
1534	if (strcmp(p2, "cute") == 0) return 233;
1535	break;
1536
1537	case 'c':
1538	if (strcmp(p2, "irc") == 0) return 234;
1539	break;
1540
1541	case 'g':
1542	if (strcmp(p2, "rave") == 0) return 232;
1543	break;
1544
1545	case 't':
1546	if (strcmp(p2, "h") == 0) return 240;
1547	break;
1548
1549	case 'u':
1550	if (strcmp(p2, "ml") == 0) return 235;
1551	break;
1552	}
1553	break;
1554
1555	case 'f':
1556	switch (c1)
1557	{
1558	case 'r':
1559	if (strcmp(p2, "ac14") == 0) return 188;
1560	if (strcmp(p2, "ac12") == 0) return 189;
1561	if (strcmp(p2, "ac34") == 0) return 190;
1562	break;
1563	}
1564	break;
1565
1566	case 'g':
1567	switch (c1)
1568	{
1569	case 't':
1570	if (*p2 == 0) return '>';
1571	}
1572	break;
1573
1574	case 'i':
1575	switch (c1)
1576	{
1577	case 'a':
1578	if (strcmp(p2, "cute") == 0) return 237;
1579	break;
1580
1581	case 'c':
1582	if (strcmp(p2, "irc") == 0) return 238;
1583	break;
1584
1585	case 'g':
1586	if (strcmp(p2, "rave") == 0) return 236;
1587	break;
1588
1589	case 'e':
1590	if (strcmp(p2, "xcl") == 0) return 161;
1591	break;
1592
1593	case 'q':
1594	if (strcmp(p2, "uest") == 0) return 191;
1595	break;
1596
1597	case 'u':
1598	if (strcmp(p2, "ml") == 0) return 239;
1599	}
1600	break;
1601
1602	case 'l':
1603	switch (c1)
1604	{
1605	case 't':
1606	if (*p2 == 0)
1607	return '<';
1608	break;
1609
1610	case 'a':
1611	if (strcmp(p2, "quo") == 0) return 171;
1612	}
1613	break;
1614
1615	case 'm':
1616	switch (c1)
1617	{
1618	case 'a':
1619	if (strcmp(p2, "cr") == 0) return 175;
1620	break;
1621
1622	case 'i':
1623	if (strcmp(p2, "cro") == 0) return 181;
1624	if (strcmp(p2, "ddot") == 0) return 183;
1625	break;
1626	}
1627	break;
1628
1629	case 'n':
1630	switch (c1)
1631	{
1632	case 'b':
1633	if (strcmp(p2, "sp") == 0) return 160;
1634	break;
1635
1636	case 'o':
1637	if (strcmp(p2, "t") == 0) return 172;
1638	break;
1639
1640	case 't':
1641	if (strcmp(p2, "ilde") == 0) return 241;
1642	}
1643	break;
1644
1645	case 'o':
1646	switch (c1)
1647	{
1648	case 'a':
1649	if (strcmp(p2, "cute") == 0) return 243;
1650	break;
1651
1652	case 'c':
1653	if (strcmp(p2, "irc") == 0) return 244;
1654	break;
1655
1656	case 'g':
1657	if (strcmp(p2, "rave") == 0) return 242;
1658	break;
1659
1660	case 'r':
1661	if (strcmp(p2, "df") == 0) return 170;
1662	if (strcmp(p2, "dm") == 0) return 186;
1663	break;
1664
1665	case 's':
1666	if (strcmp(p2, "lash") == 0) return 248;
1667	break;
1668
1669	case 't':
1670	if (strcmp(p2, "ilde") == 0) return 245;
1671	break;
1672
1673	case 'u':
1674	if (strcmp(p2, "ml") == 0) return 246;
1675	}
1676	break;
1677
1678	case 'p':
1679	switch (c1)
1680	{
1681	case 'a':
1682	if (strcmp(p2, "ra") == 0) return 182;
1683	break;
1684
1685	case 'l':
1686	if (strcmp(p2, "usmn") == 0) return 177;
1687	break;
1688
1689	case 'o':
1690	if (strcmp(p2, "und") == 0) return 163;
1691	}
1692	break;
1693
1694	case 'q':
1695	if (strcmp(pszTag, "quot") == 0) return '"';
1696	break;
1697
1698	case 'r':
1699	if (strcmp(pszTag, "raquo") == 0) return 187;
1700	if (strcmp(pszTag, "reg") == 0) return 174;
1701	break;
1702
1703	case 's':
1704	switch (c1)
1705	{
1706	case 'z':
1707	if (strcmp(p2, "lig") == 0) return 223;
1708	break;
1709
1710	case 'e':
1711	if (strcmp(p2, "ct") == 0) return 167;
1712	break;
1713
1714	case 'h':
1715	if (strcmp(p2, "y") == 0) return 173;
1716	break;
1717
1718	case 'u':
1719	if (strcmp(p2, "p1") == 0) return 185;
1720	if (strcmp(p2, "p2") == 0) return 178;
1721	if (strcmp(p2, "p3") == 0) return 179;
1722	}
1723	break;
1724
1725	case 't':
1726	if (strcmp(pszTag, "thorn") == 0) return 254;
1727	if (strcmp(pszTag, "times") == 0) return 215;
1728	break;
1729
1730	case 'u':
1731	switch (c1)
1732	{
1733	case 'a':
1734	if (strcmp(p2, "cute") == 0) return 250;
1735	break;
1736
1737	case 'c':
1738	if (strcmp(p2, "irc") == 0) return 251;
1739	break;
1740
1741	case 'g':
1742	if (strcmp(p2, "rave") == 0) return 249;
1743	break;
1744
1745	case 'm':
1746	if (strcmp(p2, "l") == 0) return 168;
1747	break;
1748
1749	case 'u':
1750	if (strcmp(p2, "ml") == 0) return 252;
1751	}
1752	break;
1753
1754	case 'y':
1755	if (strcmp(pszTag, "yacute") == 0) return 253;
1756	if (strcmp(pszTag, "yen") == 0) return 165;
1757	if (strcmp(pszTag, "yuml") == 0) return 255;
1758	break;
1759
1760	case 'A':
1761	switch (c1)
1762	{
1763	case 'u':
1764	if (strcmp(p2, "ml") == 0) return 196;
1765	break;
1766
1767	case 'a':
1768	if (strcmp(p2, "cute") == 0) return 193;
1769	break;
1770
1771	case 'c':
1772	if (strcmp(p2, "irc") == 0) return 194;
1773	break;
1774
1775	case 'E':
1776	if (strcmp(p2, "lig") == 0) return 198;
1777	break;
1778
1779	case 'g':
1780	if (strcmp(p2, "rave") == 0) return 192;
1781	break;
1782
1783	case 'r':
1784	if (strcmp(p2, "ing") == 0) return 197;
1785	break;
1786
1787	case 't':
1788	if (strcmp(p2, "ilde") == 0) return 195;
1789	}
1790	break;
1791
1792	case 'C':
1793	if (strcmp(pszTag, "Ccedil") == 0) return 199;
1794	break;
1795
1796	case 'E':
1797	if (strcmp(pszTag, "Ecirc") == 0) return 202;
1798	if (strcmp(pszTag, "Eacute") == 0) return 201;
1799	if (strcmp(pszTag, "Egrave") == 0) return 200;
1800	if (strcmp(pszTag, "ETH") == 0) return 208;
1801	if (strcmp(pszTag, "Euml") == 0) return 203;
1802	break;
1803
1804	case 'I':
1805	if (strcmp(pszTag, "Icirc") == 0) return 206;
1806	if (strcmp(pszTag, "Iacute") == 0) return 205;
1807	if (strcmp(pszTag, "Igrave") == 0) return 204;
1808	if (strcmp(pszTag, "Iuml") == 0) return 207;
1809	break;
1810
1811	case 'N':
1812	if (strcmp(pszTag, "Ntilde") == 0) return 209;
1813	break;
1814
1815	case 'O':
1816	switch (c1)
1817	{
1818	case 'u':
1819	if (strcmp(p2, "ml") == 0) return 214;
1820	break;
1821
1822	case 'a':
1823	if (strcmp(p2, "cute") == 0) return 211;
1824	break;
1825
1826	case 'c':
1827	if (strcmp(p2, "irc") == 0) return 212;
1828	break;
1829
1830	case 'g':
1831	if (strcmp(p2, "rave") == 0) return 210;
1832	break;
1833
1834	case 't':
1835	if (strcmp(p2, "ilde") == 0) return 213;
1836	break;
1837
1838	case 's':
1839	if (strcmp(p2, "lash") == 0) return 216;
1840	}
1841	break;
1842
1843	case 'U':
1844	switch (c1)
1845	{
1846	case 'a':
1847	if (strcmp(p2, "cute") == 0) return 218;
1848	break;
1849
1850	case 'c':
1851	if (strcmp(p2, "irc") == 0) return 219;
1852	break;
1853
1854	case 'g':
1855	if (strcmp(p2, "rave") == 0) return 217;
1856	break;
1857
1858	case 'u':
1859	if (strcmp(p2, "ml") == 0) return 220;
1860	}
1861	break;
1862
1863	case 'T':
1864	if (strcmp(pszTag, "THORN") == 0) return 222;
1865	break;
1866
1867	case 'Y':
1868	if (strcmp(pszTag, "Yacute") == 0) return 221;
1869	break;
1870	}
1871
1872	return (crc);
1873	}
1874
1875	/*
1876	*@@ HandleEscape:
1877	* called by txvConvertFromHTML when a "&" character
1878	* is found in the source buffer. This calls
1879	* ConvertEscape in turn.
1880	*
1881	*@@added V0.9.3 (2000-05-18) [umoeller]
1882	*/
1883
1884	static VOID HandleEscape(PCOPYTARGET pct)
1885	{
1886	// ampersand:
1887	// replace special characters
1888	PSZ pStartOfTag = pct->pSource;
1889	// find end of tag
1890	PSZ p2 = pStartOfTag,
1891	pNextClose = 0,
1892	pNextSpace = 0;
1893	BOOL fCont = TRUE;
1894	while (fCont)
1895	{
1896	switch (*p2)
1897	{
1898	case 0:
1899	fCont = FALSE;
1900	break;
1901
1902	case ';':
1903	pNextClose = p2;
1904	fCont = FALSE;
1905	break;
1906
1907	case ' ':
1908	if (!pNextSpace)
1909	pNextSpace = p2;
1910	break;
1911	}
1912	p2++;
1913	}
1914
1915	if (!pNextClose)
1916	// no closing tag found:
1917	// just insert the '&' and go on, we have no tag here
1918	AppendChar(pct,
1919	*pct->pSource++);
1920	else
1921	{
1922	if ((pNextSpace) && (pNextSpace < pNextClose))
1923	// space before ';':
1924	// just insert the '&' and go on, we have no tag here
1925	AppendChar(pct,
1926	*pct->pSource++);
1927	else if ((!pNextClose) \|\| (pNextClose <= pStartOfTag + 1))
1928	AppendChar(pct,
1929	*pct->pSource++);
1930	else
1931	{
1932	ULONG ulCode = 0;
1933
1934	// create substring with tag
1935	PSZ pszTag = pStartOfTag + 1;
1936	*pNextClose = 0;
1937
1938	if (*pszTag == '#')
1939	{
1940	// latin-1 or Unicode encoding ()
1941	ulCode = atoi(pszTag + 1);
1942
1943	// next input: char after ';'
1944	pct->pSource = pNextClose + 1;
1945	}
1946	else
1947	{
1948	// named entity:
1949	// find char code corresponding to escape
1950	// from G_EscapeProcessors map
1951	ulCode = ConvertEscape(pszTag);
1952	if (ulCode)
1953	// tag supported:
1954	pct->pSource = pNextClose + 1;
1955	else
1956	// tag not supported:
1957	ulCode = *pct->pSource++;
1958	}
1959
1960	// restore closing tag which we overwrote
1961	*pNextClose = ';';
1962
1963	if (ulCode)
1964	{
1965	AppendLinebreakCheck(pct);
1966
1967	AppendChar(pct,
1968	(CHAR)ulCode);
1969	pct->fSkipNextSpace = FALSE;
1970	}
1971	}
1972	}
1973	}
1974
1975	/* ******************************************************************
1976	*
1977	* Entry points
1978	*
1979	********************************************************************/
1980
1981	/*
1982	*@@ txvConvertFromHTML:
1983	* this modifies the given text string (which should
1984	* be the complete BODY block of any HTML file) so
1985	* that all HTML tags are removed and replaced with
1986	* escape sequences that the XTextView control understands.
1987	*
1988	* The buffer gets reallocated by this function, so it
1989	* must be free()'able.
1990	*
1991	* So, to have the XTextView control display an HTML file,
1992	* do this:
1993	*
1994	* 1) Load an HTML file into a buffer allocated by malloc().
1995	*
1996	* 2) Call txvConvertFromHTML.
1997	*
1998	* 3) Call WinSetWindowText on the XTextView control with
1999	* the modified buffer.
2000	*
2001	* This understands the following limited subset of HTML:
2002	*
2003	* Paragraph tags:
2004	*
2005	* -- P, BR
2006	* -- PRE, /PRE
2007	* -- UL, /UL, OL, /OL, LI
2008	* -- DL, /DL, DT, DD
2009	* -- H1, /H1 thru H6, /H6
2010	* -- Comments (<!-- .... -->)
2011	*
2012	* Character tags:
2013	*
2014	* -- B, /B, STRONG, /STRONG
2015	* -- I, /I, EM, /EM, VAR, /VAR, CITE, /CITE
2016	* -- CODE, /CODE, SAMP, /SAMP, KBD, /KBD, TT, /TT
2017	* -- U, /U
2018	* -- STRIKE, /STRIKE
2019	* -- CODE, /CODE
2020	*
2021	* The most obvious limitation is that neither tables
2022	* nor frames are supported. Also forget about CSS
2023	* and JavaScript, of course.
2024	*
2025	* All the ampersand (& something) sequences defined
2026	* in HTML 3 are properly translated.
2027	*
2028	* Note: Those are translated to the ANSI (MS-Windows,
2029	* OS/2 codepage 1004) character set. This has the
2030	* following characteristics:
2031	*
2032	* -- Codes 0-127 are identical to ASCII and thus
2033	* ISO 8859-1 ("Latin 1") also.
2034	*
2035	* -- Codes 160-255 are identical to ISO 8859-1 ("Latin 1").
2036	*
2037	* -- Codes 128-159 are NOT defined in ISO 8859-1, but
2038	* Netscape treats those as ANSI as well, so we do too.
2039	*
2040	* As a result, consider the output to be in OS/2 codepage
2041	* 1004. Either set your codepage to that (WinSetCp)
2042	* or translate the output (WinCpTranslateString).
2043	*
2044	* &#xxx; tags (with xxx being a decimal) are considered
2045	* ANSI codes as well. Even though HTML 4.0 allows Unicode
2046	* characters > 255 to be inserted this way, we ignore
2047	* those. Unicode chars from 0 to 255 are identical to
2048	* ANSI, so for &#0; to &#255;, we are HTML-compliant.
2049	*
2050	* All other tags are completely thrown out.
2051	*
2052	*@@added V0.9.3 (2000-05-06) [umoeller]
2053	*/
2054
2055	BOOL txvConvertFromHTML(char **ppszText,
2056	PVOID pxhtml, // out: various config data (PXHTMLDATA)
2057	PULONG pulProgress, // out: progress (ptr can be NULL)
2058	PBOOL pfCancel) // in: cancel flag (ptr can be NULL)
2059	{
2060	BOOL brc = TRUE;
2061
2062	ULONG cbSource = strlen(*ppszText);
2063
2064	XHTMLDATA xhtmlTemp = {0};
2065	BOOL fUsingTemp = FALSE;
2066	COPYTARGET ct = {0};
2067
2068	lstInit(&ct.llLists,
2069	TRUE); // free items
2070
2071	ct.pSource = *ppszText;
2072	// skip leading spaces
2073	ct.fSkipNextSpace = TRUE;
2074	ct.pxhtml = (PXHTMLDATA)pxhtml;
2075	if (ct.pxhtml == NULL) // not specified:
2076	{
2077	ct.pxhtml = &xhtmlTemp;
2078	fUsingTemp = TRUE;
2079	}
2080
2081	lstInit(&ct.pxhtml->llLinks, TRUE); // auto-free
2082
2083	// step 2:
2084	// actual tags formatting
2085
2086	while (TRUE)
2087	{
2088	CHAR c = *ct.pSource;
2089
2090	if (pfCancel)
2091	if (*pfCancel)
2092	{
2093	brc = FALSE;
2094	break;
2095	}
2096
2097	if (!c)
2098	// null terminator reached:
2099	break;
2100
2101	// calculate progress
2102	if (pulProgress)
2103	pulProgress = ((ct.pSource - ppszText) // characters done
2104	* 100
2105	/ cbSource); // characters total
2106
2107	switch (c)
2108	{
2109	case '<':
2110	HandleTag(&ct);
2111	break;
2112
2113	case '&':
2114	HandleEscape(&ct);
2115	break;
2116
2117	case '\r':
2118	// skip
2119	if (!ct.fSkipNextSpace)
2120	{
2121	AppendChar(&ct,
2122	' ');
2123	// ct.fNeedsLinebreak = FALSE;
2124	// but skip leading spaces which might follow
2125	if (!ct.fPRE)
2126	ct.fSkipNextSpace = TRUE;
2127	}
2128	ct.pSource++;
2129	break;
2130
2131	case '\t':
2132	{
2133	if (ct.fPRE)
2134	{
2135	ULONG ul;
2136	for (ul = 0;
2137	ul < 8;
2138	ul++)
2139	AppendChar(&ct,
2140	' ');
2141	}
2142	else
2143	{
2144	// not in PRE block:
2145	if ( (!ct.fSkipNextSpace)
2146	// && (!ct.fNeedsLinebreak)
2147	)
2148	// last was not space: copy
2149	AppendChar(&ct,
2150	' ');
2151
2152	ct.fSkipNextSpace = TRUE;
2153	}
2154
2155	// skip the tab
2156	ct.pSource++;
2157	break; }
2158
2159	case '\n':
2160	{
2161	// newline char:
2162	if (!ct.fPRE)
2163	{
2164	// if not in PRE mode, replace with space
2165	if (!ct.fSkipNextSpace)
2166	{
2167	AppendChar(&ct,
2168	' ');
2169	// ct.fNeedsLinebreak = FALSE;
2170	// but skip leading spaces which might follow
2171	ct.fSkipNextSpace = TRUE;
2172	}
2173	}
2174	else
2175	// in PRE mode, preserve line breaks
2176	AppendChar(&ct, '\n'); // ct.fNeedsLinebreak = TRUE;
2177
2178	ct.pSource++;
2179	break; }
2180
2181	case '\xFF':
2182	{
2183	AppendChar(&ct,
2184	' ');
2185	ct.pSource++;
2186	break; }
2187
2188	case ' ':
2189	if (!ct.fPRE)
2190	{
2191	// is space, and not in PRE block:
2192	if ( (!ct.fSkipNextSpace)
2193	// && (!ct.fNeedsLinebreak)
2194	)
2195	// last was not space: copy
2196	AppendChar(&ct,
2197	' ');
2198
2199	ct.fSkipNextSpace = TRUE;
2200	}
2201	else
2202	// in PRE, always add all spaces
2203	AppendChar(&ct,
2204	' ');
2205	ct.pSource++;
2206	break;
2207
2208	default:
2209	// if we're not inserting escapes or anything,
2210	// check if a linebreak is needed
2211	AppendLinebreakCheck(&ct);
2212
2213	AppendChar(&ct,
2214	*ct.pSource++);
2215	ct.fSkipNextSpace = FALSE;
2216	ct.fSkipNextLinebreak = FALSE;
2217
2218	} // end switch (*pSource);
2219	} // end while (*pSource)
2220	AppendChar(&ct,
2221	'\n');
2222	// append null-terminator
2223	AppendChar(&ct,
2224	0);
2225
2226	free(*ppszText);
2227	*ppszText = ct.pszNew;
2228
2229	lstClear(&ct.llLists);
2230
2231	if (fUsingTemp)
2232	{
2233	if (xhtmlTemp.pszTitle)
2234	free(xhtmlTemp.pszTitle);
2235	lstClear(&xhtmlTemp.llLinks);
2236	// ### better really clear this... there are PSZ's inside
2237	}
2238
2239	return (brc);
2240	}
2241
2242

Note: See TracBrowser for help on using the repository browser.

Download in other formats: