Context Navigation

source: branches/branch-1-0/src/helpers/textv_html.c

Visit:

Last change on this file was 222, checked in by umoeller, 23 years ago
Minor adjustments for new static handling.
Property svn:eol-style set to `CRLF` Property svn:keywords set to `Author Date Id Revision`
File size: 61.8 KB

Line
1
2	/*
3	*@@sourcefile textv_html.c:
4	* this code converts HTML code to escape sequences for the
5	* XTextView control (textview.c).
6	*
7	* This code is in part ugly spaghetti, but this is intentional to
8	* make this HTML parser FAST. In general, you get about double or
9	* triple the speed compared to Netscape 4.6 on OS/2. This code
10	* doesn't understand all of HTML, though; most of HTML 2 is supported.
11	* There are no tables or frames at this point.
12	*
13	* The entry point into this mess is txvConvertFromHTML, which
14	* is easy to use.
15	*
16	* Note: Version numbering in this file relates to XWorkplace version
17	* numbering.
18	*
19	*@@header "helpers\textv_html.h"
20	*
21	*@@added V0.9.3 (2000-05-10) [umoeller]
22	*/
23
24	/*
25	* Copyright (C) 2000 Ulrich Möller.
26	* This program is part of the XWorkplace package.
27	* This program is free software; you can redistribute it and/or modify
28	* it under the terms of the GNU General Public License as published by
29	* the Free Software Foundation, in version 2 as it comes in the COPYING
30	* file of the XWorkplace main distribution.
31	* This program is distributed in the hope that it will be useful,
32	* but WITHOUT ANY WARRANTY; without even the implied warranty of
33	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
34	* GNU General Public License for more details.
35	*/
36
37	#define OS2EMX_PLAIN_CHAR
38	// this is needed for "os2emx.h"; if this is defined,
39	// emx will define PSZ as _signed_ char, otherwise
40	// as unsigned char
41
42	#include <os2.h>
43
44	#include <stdlib.h>
45	#include <stdio.h>
46	#include <string.h>
47
48	#include "setup.h" // code generation and debugging options
49
50	#include "helpers\linklist.h"
51	#include "helpers\stringh.h"
52	#include "helpers\textview.h"
53
54	#include "helpers\textv_html.h"
55
56	/*
57	*@@category: Helpers\PM helpers\Window classes\XTextView control\HTML conversion
58	* see textv_html.c.
59	*/
60
61	/* ******************************************************************
62	*
63	* Declarations
64	*
65	********************************************************************/
66
67	/*
68	*@@ LISTDESC:
69	* structure stored in COPYTARGET to
70	* hold list information (UL, OL, ... tags).
71	*
72	*@@added V0.9.3 (2000-05-07) [umoeller]
73	*/
74
75	typedef struct _LISTDESC
76	{
77	ULONG ulListType; // 0: unordered (UL)
78	// 1: ordered (OL)
79	// 2: definition lists (DL)
80	ULONG ulItem; // list enumeration; 1 on first item,
81	// 2 on next, ...
82	} LISTDESC, *PLISTDESC;
83
84	/*
85	*@@ COPYTARGET:
86	* monster structure which holds the current
87	* status of the HTML converter while conversion
88	* is taking place. This stores input/output pointers
89	* and various flags to avoid duplicate line breaks
90	* and such.
91	*
92	* One instance of this is created in txvConvertFromHTML
93	* on the stack and then passed to all the sub-function
94	* calls.
95	*
96	*@@added V0.9.3 (2000-05-06) [umoeller]
97	*/
98
99	typedef struct _COPYTARGET
100	{
101	PSZ pSource; // ptr into source string;
102	// valid ONLY while we're in a tag handler
103	PSZ pNewSource; // can be set by tag handler to skip characters;
104	// this is set to NULL before calling a tag
105	// handler; if this is still NULL, default
106	// processing occurs
107
108	// new string:
109	PSZ pszNew; // memory buffer
110	ULONG cbNew; // size of buffer (reallocated)
111	PSZ pTarget; // current char ptr into pszNew
112
113	// saved character while tag handler is being called
114	CHAR cSaved;
115
116	PSZ *ppszTitle; // out: title (ptr can be NULL)
117	// V0.9.20 (2002-08-10) [umoeller]
118
119	// formatting flags while going through the text
120	BOOL fSkipNextSpace;
121	// if TRUE, subsequent spaces are skipped
122	BOOL fNeedsLinebreak;
123	// if TRUE, \n is inserted before any other character
124	BOOL fSkipNextLinebreak;
125	// if TRUE, subsequent linebreaks are skipped
126	BOOL fPRE;
127	// are we currently in a PRE tag?
128	BOOL fInLink;
129	// are we currently in a A HREF= tag?
130
131	// arguments (attributes) for tag handlers
132	PSZ pszAttributes; // != NULL while a tag handler is being called
133	// and attributes exist for the tag
134
135	// anchors count
136	// USHORT usAnchorIndex; // start with 1 removed V0.9.20 (2002-08-10) [umoeller]
137
138	// list maintenance
139	ULONG ulListLevel; // if > 0, we're in a UL or OL block;
140	// raised for each block
141	ULONG ulUnorderedListLevel; // raised with each UL block to keep track
142	// of bullets
143	ULONG ulOrderedListLevel; // raised with each UL block to keep track
144	// of 1), 2), a), b)... numbering
145	ULONG ulCurrentListType; // current list type (from highest LISTDESC)
146	BOOL fInDT; // TRUE if we're currently in a DT tag
147	LINKLIST llLists; // stack of LISTDESC items
148	} COPYTARGET, *PCOPYTARGET;
149
150	typedef VOID FNPROCESSTAG(PCOPYTARGET pct);
151	typedef FNPROCESSTAG *PFNPROCESSTAG;
152
153	/* ******************************************************************
154	*
155	* Global variables
156	*
157	********************************************************************/
158
159	/* ******************************************************************
160	*
161	* Append-char helpers
162	*
163	********************************************************************/
164
165	#define COPYTARGETALLOC 100000
166
167	/*
168	*@@ AppendChar:
169	* helper for txvConvertFromHTML to
170	* append a char to the target string
171	* in COPYTARGET.
172	* This performs a few additional checks
173	* and manages memory.
174	*
175	*@@added V0.9.3 (2000-05-06) [umoeller]
176	*/
177
178	STATIC VOID AppendChar(PCOPYTARGET pct, // in/out: formatting buffer
179	unsigned char c)
180	{
181	// calculate ofs where to store next char
182	ULONG cbOfsNext = pct->pTarget - pct->pszNew;
183	if (cbOfsNext >= pct->cbNew) // have we reached the buffer size yet?
184	{
185	// more mem needed:
186	pct->cbNew += COPYTARGETALLOC;
187	pct->pszNew = (PSZ)realloc(pct->pszNew, pct->cbNew);
188	// if first call, pszNew is NULL, and realloc
189	// behaves just like malloc
190	// adjust target, because ptr might have changed
191	pct->pTarget = pct->pszNew + cbOfsNext;
192	}
193
194	// append character
195	*pct->pTarget++ = c;
196	}
197
198	/*
199	*@@ AppendString:
200	* appends the characters in *ach,
201	* which must be null-terminated.
202	* Does NOT append a null character though.
203	*
204	*@@added V0.9.3 (2000-05-06) [umoeller]
205	*/
206
207	STATIC VOID AppendString(PCOPYTARGET pct, // in/out: formatting buffer
208	char *ach)
209	{
210	ULONG cbAppend = strlen(ach);
211	ULONG ul;
212	PSZ pSource;
213
214	// calculate ofs where to store next char
215	ULONG cbOfsNext = pct->pTarget - pct->pszNew;
216	while (cbOfsNext + cbAppend >= pct->cbNew)
217	{
218	// more mem needed:
219	pct->cbNew += COPYTARGETALLOC;
220	pct->pszNew = (PSZ)realloc(pct->pszNew, pct->cbNew);
221	// if first call, pszNew is NULL, and realloc
222	// behaves just like malloc
223	// adjust target, because ptr might have changed
224	pct->pTarget = pct->pszNew + cbOfsNext;
225	}
226
227	// append characters
228	pSource = ach;
229	for (ul = 0;
230	ul < cbAppend;
231	ul++)
232	pct->pTarget++ = pSource++;
233	}
234
235	/*
236	*@@ AppendLinebreakCheck:
237	* checks if a linebreak is needed and
238	* inserts one if so.
239	*
240	*@@added V0.9.3 (2000-05-17) [umoeller]
241	*/
242
243	STATIC VOID AppendLinebreakCheck(PCOPYTARGET pct)
244	{
245	if ((!pct->fPRE) && (pct->fNeedsLinebreak))
246	{
247	// yes: insert linebreak; this resets pct->fNeedsLinebreak
248	if (!pct->fSkipNextLinebreak)
249	{
250	AppendChar(pct, '\n');
251
252	if ((pct->ulListLevel) && (!pct->fInDT))
253	// if we're in a list, add a tab also,
254	// because we'll have a negative first-line margin
255	AppendString(pct, TXVESC_TAB);
256	}
257	pct->fNeedsLinebreak = FALSE;
258	}
259	}
260
261	/*
262	*@@ AppendEscapeWithDecimal:
263	* appends the specified escape code
264	* with a three-digit decimal parameter.
265	* Calls AppendString in turn.
266	*
267	*@@added V0.9.3 (2000-05-07) [umoeller]
268	*/
269
270	STATIC VOID AppendEscapeWith3Decimals(PCOPYTARGET pct, // in/out: formatting buffer
271	char *ach,
272	USHORT us)
273	{
274	CHAR szDecimal[10];
275	if (us > 999)
276	us = 999;
277	sprintf(szDecimal, "%03d", us);
278	// append escape
279	AppendString(pct, ach);
280	AppendString(pct, szDecimal);
281	}
282
283	/*
284	*@@ AppendEscapeWith4Decimals:
285	*
286	*@@added V0.9.3 (2000-05-07) [umoeller]
287	*/
288
289	STATIC VOID AppendEscapeWith4Decimals(PCOPYTARGET pct, // in/out: formatting buffer
290	char *ach,
291	USHORT us)
292	{
293	CHAR szDecimal[10];
294	if (us > 9999)
295	us = 9999;
296	sprintf(szDecimal, "%04d", us);
297	// append escape
298	AppendString(pct, ach);
299	AppendString(pct, szDecimal);
300	}
301
302	/* ******************************************************************
303	*
304	* Tag converter functions
305	*
306	********************************************************************/
307
308	/*
309	*@@ StartList:
310	* starts a list (UL or OL).
311	* This uses a linked list in COPYTARGET
312	* to keep a pseudo-stack for nested lists.
313	*
314	*@@added V0.9.3 (2000-05-08) [umoeller]
315	*/
316
317	STATIC VOID StartList(PCOPYTARGET pct, // in/out: formatting buffer
318	ULONG ulListType) // list type:
319	// 0: unordered (UL)
320	// 1: ordered (OL)
321	// 2: definition lists (DL)
322	{
323	PLISTDESC pListDesc;
324
325	// raise list level
326	pct->ulListLevel++;
327
328	if (ulListType == 0)
329	// unordered:
330	pct->ulUnorderedListLevel++;
331	else if (ulListType == 1)
332	// ordered:
333	pct->ulOrderedListLevel++;
334
335	// create LISTDESC and store it on stack
336	pListDesc = (PLISTDESC)malloc(sizeof(LISTDESC));
337	pListDesc->ulListType
338	= pct->ulCurrentListType
339	= ulListType;
340	pListDesc->ulItem = 1;
341
342	lstAppendItem(&pct->llLists,
343	pListDesc);
344
345	AppendEscapeWith4Decimals(pct,
346	TXVESC_LEFTMARGIN,
347	pct->ulListLevel * 5);
348	AppendEscapeWith3Decimals(pct,
349	TXVESC_FIRSTLINEMARGIN_LEFT,
350	(ulListType == 2)
351	? 5 // for definition lists
352	: 3); // negative!
353	// add \n before any other character
354	pct->fNeedsLinebreak = TRUE;
355	}
356
357	/*
358	*@@ StopList:
359	* stops a list (UL or OL).
360	*
361	*@@added V0.9.3 (2000-05-07) [umoeller]
362	*/
363
364	STATIC VOID StopList(PCOPYTARGET pct)
365	{
366	if (pct->ulListLevel)
367	{
368	PLISTNODE pNode;
369
370	// lower list level
371	pct->ulListLevel--;
372	AppendEscapeWith4Decimals(pct,
373	TXVESC_LEFTMARGIN,
374	pct->ulListLevel * 5);
375	AppendEscapeWith3Decimals(pct,
376	TXVESC_FIRSTLINEMARGIN_LEFT,
377	(pct->ulListLevel)
378	? 3 // we still have a list level (nested)
379	: 0);
380	pct->fNeedsLinebreak = TRUE;
381
382	// remove the LISTDESC from the stack
383	pNode = lstNodeFromIndex(&pct->llLists,
384	pct->ulListLevel); // this has been lowered already
385	if (pNode)
386	{
387	PLISTDESC pListDesc = (PLISTDESC)pNode->pItemData;
388	if (pListDesc->ulListType == 0)
389	// was unordered:
390	pct->ulUnorderedListLevel--;
391	else if (pListDesc->ulListType == 1)
392	// was ordered:
393	pct->ulOrderedListLevel--;
394
395	lstRemoveNode(&pct->llLists, pNode);
396
397	// update COPYTARGET with previous list level
398	if (pct->ulListLevel)
399	{
400	// we're still in a list (nested lists):
401	PLISTDESC pListDesc2 = (PLISTDESC)lstItemFromIndex(&pct->llLists,
402	pct->ulListLevel - 1);
403	if (pListDesc2)
404	pct->ulCurrentListType = pListDesc2->ulListType;
405	}
406	}
407	}
408	// else: buggy HTML code, ignore
409	}
410
411	/*
412	*@@ TagTITLE:
413	*
414	*@@added V0.9.3 (2000-05-19) [umoeller]
415	*/
416
417	STATIC VOID TagTITLE(PCOPYTARGET pct)
418	{
419	// pSource currently points to <TITLE tag
420	PSZ pSource = pct->pSource + strlen(pct->pSource);
421	// points to temporary null byte in main buffer now
422	*pSource = pct->cSaved;
423
424	if (pSource = strchr(pct->pSource, '>'))
425	{
426	PSZ pNextOpen;
427	if (pNextOpen = strchr(pSource, '<'))
428	{
429	// extract title
430	if (pct->ppszTitle)
431	*(pct->ppszTitle) = strhSubstr(pSource + 1, pNextOpen);
432	// adjusted V0.9.20 (2002-08-10) [umoeller]
433
434	if (strnicmp(pNextOpen + 1, "/TITLE", 6) == 0)
435	{
436	// closing /TITLE tag found:
437	// search on after that
438	if (pct->pNewSource = strchr(pNextOpen, '>'))
439	pct->pNewSource++;
440	}
441	}
442	}
443	}
444
445	/*
446	*@@ TagP:
447	*
448	*/
449
450	STATIC VOID TagP(PCOPYTARGET pct)
451	{
452	// append newline:
453	// add \n before any other character
454	pct->fNeedsLinebreak = TRUE;
455
456	/* if (pct->ulListLevel)
457	{
458	// if we are currently in a list, we must also
459	// add a tab escape, because we have set
460	// the first line margin to the left of the
461	// left margin
462	AppendString(pct,
463	TXVESC_TAB);
464	} */
465	}
466
467	STATIC VOID TagBR(PCOPYTARGET pct)
468	{
469	AppendChar(pct,
470	'\r');
471
472	if (pct->ulListLevel)
473	{
474	// if we are currently in a list, we must also
475	// add a tab escape, because we have set
476	// the first line margin to the left of the
477	// left margin
478	AppendString(pct,
479	TXVESC_TAB);
480	}
481	if (!pct->fPRE)
482	pct->fSkipNextSpace = TRUE;
483	}
484
485	STATIC VOID TagPRE(PCOPYTARGET pct)
486	{
487	// start of PRE tag:
488	// add \n before any other character
489	// pct->fNeedsLinebreak = TRUE;
490	AppendChar(pct, '\n');
491	pct->fNeedsLinebreak = FALSE;
492	/* AppendString(pct,
493	TXVESC_PRE_BEGIN); */
494	AppendEscapeWith3Decimals(pct,
495	TXVESC_SET_FONT,
496	1); // monospaced font
497	AppendEscapeWith4Decimals(pct,
498	TXVESC_SPACEBEFORE,
499	0); // no spacing before
500	AppendEscapeWith4Decimals(pct,
501	TXVESC_SPACEAFTER,
502	0); // no spacing after
503	// disable word-wrapping
504	AppendString(pct,
505	TXVESC_WORDWRAP "0");
506	pct->fPRE = TRUE;
507	pct->fSkipNextSpace = FALSE;
508	}
509
510	STATIC VOID TagXPRE(PCOPYTARGET pct)
511	{
512	pct->fPRE = FALSE;
513	AppendEscapeWith3Decimals(pct,
514	TXVESC_SET_FONT,
515	0); // standard font
516	AppendString(pct, TXVESC_SPACEBEFORE);
517	AppendString(pct, "####"); // reset to default
518	AppendString(pct, TXVESC_SPACEAFTER);
519	AppendString(pct, "####"); // reset to default
520	// re-enable word-wrapping
521	AppendString(pct,
522	TXVESC_WORDWRAP "1"
523	"\n"); // force line break
524	pct->fNeedsLinebreak = FALSE;
525	// refuse to add \n even if we have another "p" coming up
526	pct->fSkipNextLinebreak = TRUE;
527	pct->fSkipNextSpace = TRUE;
528	}
529
530	STATIC VOID TagH1(PCOPYTARGET pct)
531	{
532	pct->fNeedsLinebreak = TRUE;
533	AppendEscapeWith3Decimals(pct,
534	TXVESC_POINTSIZE_REL,
535	200); // double size
536	AppendString(pct,
537	TXVESC_BOLD_BEGIN);
538	}
539
540	STATIC VOID TagXH1(PCOPYTARGET pct)
541	{
542	AppendString(pct,
543	TXVESC_BOLD_END);
544	AppendEscapeWith3Decimals(pct,
545	TXVESC_POINTSIZE_REL,
546	100); // regular size
547	// add \n before any other character
548	pct->fNeedsLinebreak = TRUE;
549	}
550
551	STATIC VOID TagH2(PCOPYTARGET pct)
552	{
553	pct->fNeedsLinebreak = TRUE;
554	AppendEscapeWith3Decimals(pct,
555	TXVESC_POINTSIZE_REL,
556	175); // size in percent of regular point size
557	AppendString(pct,
558	TXVESC_BOLD_BEGIN);
559	}
560
561	STATIC VOID TagXH2(PCOPYTARGET pct)
562	{
563	AppendString(pct,
564	TXVESC_BOLD_END);
565	AppendEscapeWith3Decimals(pct,
566	TXVESC_POINTSIZE_REL,
567	100); // regular size
568	// add \n before any other character
569	pct->fNeedsLinebreak = TRUE;
570	}
571
572	STATIC VOID TagH3(PCOPYTARGET pct)
573	{
574	pct->fNeedsLinebreak = TRUE;
575	AppendEscapeWith3Decimals(pct,
576	TXVESC_POINTSIZE_REL,
577	150); // size in percent of regular point size
578	AppendString(pct,
579	TXVESC_BOLD_BEGIN);
580	}
581
582	STATIC VOID TagXH3(PCOPYTARGET pct)
583	{
584	AppendString(pct,
585	TXVESC_BOLD_END);
586	AppendEscapeWith3Decimals(pct,
587	TXVESC_POINTSIZE_REL,
588	100); // size in percent of regular point size
589	// add \n before any other character
590	pct->fNeedsLinebreak = TRUE;
591	}
592
593	STATIC VOID TagH4(PCOPYTARGET pct)
594	{
595	pct->fNeedsLinebreak = TRUE;
596	AppendEscapeWith3Decimals(pct,
597	TXVESC_POINTSIZE_REL,
598	125); // size in percent of regular point size
599	AppendString(pct,
600	TXVESC_BOLD_BEGIN);
601	}
602
603	STATIC VOID TagXH4(PCOPYTARGET pct)
604	{
605	AppendString(pct,
606	TXVESC_BOLD_END);
607	AppendEscapeWith3Decimals(pct,
608	TXVESC_POINTSIZE_REL,
609	100); // regular size
610	// add \n before any other character
611	pct->fNeedsLinebreak = TRUE;
612	}
613
614	STATIC VOID TagH5(PCOPYTARGET pct)
615	{
616	pct->fNeedsLinebreak = TRUE;
617	AppendEscapeWith3Decimals(pct,
618	TXVESC_POINTSIZE_REL,
619	100); // size in percent of regular point size
620	AppendString(pct,
621	TXVESC_BOLD_BEGIN);
622	}
623
624	STATIC VOID TagXH5(PCOPYTARGET pct)
625	{
626	AppendString(pct,
627	TXVESC_BOLD_END);
628	AppendEscapeWith3Decimals(pct,
629	TXVESC_POINTSIZE_REL,
630	100); // regular size
631	// add \n before any other character
632	pct->fNeedsLinebreak = TRUE;
633	}
634
635	STATIC VOID TagH6(PCOPYTARGET pct)
636	{
637	pct->fNeedsLinebreak = TRUE;
638	AppendEscapeWith3Decimals(pct,
639	TXVESC_POINTSIZE_REL,
640	80 ); // size in percent of regular point size
641	AppendString(pct,
642	TXVESC_BOLD_BEGIN);
643	}
644
645	STATIC VOID TagXH6(PCOPYTARGET pct)
646	{
647	AppendString(pct,
648	TXVESC_BOLD_END);
649	AppendEscapeWith3Decimals(pct,
650	TXVESC_POINTSIZE_REL,
651	100); // regular size
652	// add \n before any other character
653	pct->fNeedsLinebreak = TRUE;
654	}
655
656	STATIC VOID TagUL(PCOPYTARGET pct)
657	{
658	StartList(pct,
659	0); // unordered
660	}
661
662	STATIC VOID TagXUL(PCOPYTARGET pct)
663	{
664	StopList(pct);
665	}
666
667	STATIC VOID TagOL(PCOPYTARGET pct)
668	{
669	StartList(pct,
670	1); // ordered
671	}
672
673	STATIC VOID TagXOL(PCOPYTARGET pct)
674	{
675	StopList(pct);
676	}
677
678	STATIC VOID TagLI(PCOPYTARGET pct)
679	{
680	PLISTDESC pListDesc;
681	CHAR szMarker[20] = TXVESC_MARKER "\x01";
682
683	if (pct->ulListLevel)
684	{
685	// we're in a list:
686	pListDesc = (PLISTDESC)lstItemFromIndex(&pct->llLists,
687	pct->ulListLevel - 1);
688	if (pListDesc)
689	{
690	if (pListDesc->ulListType == 1)
691	// is ordered list:
692	sprintf(szMarker, "%lu.", (pListDesc->ulItem)++);
693	else if (pListDesc->ulListType == 0)
694	// is unordered list:
695	// set bullet type according to unordered nesting
696	szMarker[2] = pct->ulUnorderedListLevel;
697	}
698	}
699
700	// add \n before any other character
701	// pct->fNeedsLinebreak = TRUE;
702	// if (pct->fNeedsLinebreak)
703	{
704	AppendChar(pct, '\n');
705	pct->fNeedsLinebreak = FALSE;
706	}
707
708	AppendString(pct, szMarker);
709	AppendString(pct, TXVESC_TAB);
710	}
711
712	STATIC VOID TagDL(PCOPYTARGET pct)
713	{
714	StartList(pct,
715	2); // definition list
716	}
717
718	STATIC VOID TagXDL(PCOPYTARGET pct)
719	{
720	StopList(pct);
721	pct->fInDT = FALSE;
722	}
723
724	STATIC VOID TagDT(PCOPYTARGET pct)
725	{
726	pct->fNeedsLinebreak = TRUE;
727	pct->fInDT = TRUE;
728	}
729
730	STATIC VOID TagDD(PCOPYTARGET pct)
731	{
732	pct->fNeedsLinebreak = TRUE;
733	AppendString(pct, TXVESC_TAB);
734	if (!pct->fPRE)
735	pct->fSkipNextSpace = TRUE;
736	pct->fInDT = FALSE;
737	}
738
739	STATIC VOID TagTR(PCOPYTARGET pct)
740	{
741	pct->fNeedsLinebreak = TRUE;
742	}
743
744	STATIC VOID TagB(PCOPYTARGET pct)
745	{
746	AppendString(pct,
747	TXVESC_BOLD_BEGIN);
748	}
749
750	STATIC VOID TagXB(PCOPYTARGET pct)
751	{
752	AppendString(pct,
753	TXVESC_BOLD_END);
754	}
755
756	STATIC VOID TagI(PCOPYTARGET pct)
757	{
758	AppendString(pct,
759	TXVESC_ITALICS_BEGIN);
760	}
761
762	STATIC VOID TagXI(PCOPYTARGET pct)
763	{
764	AppendString(pct,
765	TXVESC_ITALICS_END);
766	}
767
768	STATIC VOID TagU(PCOPYTARGET pct)
769	{
770	AppendString(pct,
771	TXVESC_UNDERLINE_BEGIN);
772	}
773
774	STATIC VOID TagXU(PCOPYTARGET pct)
775	{
776	AppendString(pct,
777	TXVESC_UNDERLINE_END);
778	}
779
780	STATIC VOID TagSTRIKE(PCOPYTARGET pct)
781	{
782	AppendString(pct,
783	TXVESC_STRIKE_BEGIN);
784	}
785
786	STATIC VOID TagXSTRIKE(PCOPYTARGET pct)
787	{
788	AppendString(pct,
789	TXVESC_STRIKE_END);
790	}
791
792	STATIC VOID TagCODE(PCOPYTARGET pct)
793	{
794	AppendEscapeWith3Decimals(pct,
795	TXVESC_SET_FONT,
796	1); // monospaced font
797	}
798
799	STATIC VOID TagXCODE(PCOPYTARGET pct)
800	{
801	AppendEscapeWith3Decimals(pct,
802	TXVESC_SET_FONT,
803	0); // regular font
804	}
805
806	STATIC VOID TagA(PCOPYTARGET pct)
807	{
808	CHAR szAnchor[10];
809	PSZ pHREF = NULL;
810
811	pct->fInLink = FALSE;
812
813	if (pct->pszAttributes)
814	{
815	// we have attributes:
816	PSZ pszClosingTag;
817	if (pszClosingTag = strchr(pct->pszAttributes, '>'))
818	{
819	ULONG ulOfs = 0;
820
821	/*
822	* HREF attribute:
823	*
824	*/
825
826	PSZ pNAME = 0;
827
828	// replace '>' with null char to mark end of search
829	*pszClosingTag = 0;
830
831	if (pHREF = strhGetTextAttr(pct->pszAttributes, "HREF", &ulOfs))
832	// OK, we got a link target:
833	pct->fInLink = TRUE;
834	// do not free
835
836	/*
837	* NAME attribute:
838	*
839	*/
840
841	if (pNAME = strhGetTextAttr(pct->pszAttributes, "NAME", &ulOfs))
842	{
843	AppendString(pct,
844	TXVESC_ANCHORNAME);
845	AppendString(pct,
846	pNAME);
847	// must be terminated with 0xFF
848	AppendChar(pct, 0xFF);
849	free(pNAME);
850	}
851
852	// restore '>'
853	*pszClosingTag = '>';
854	}
855	}
856
857	if (pHREF)
858	{
859	AppendString(pct,
860	TXVESC_LINK_BEGIN);
861	AppendString(pct,
862	pHREF);
863	// must be terminated with 0xFF
864	AppendChar(pct, 0xFF);
865
866	free(pHREF);
867	}
868	}
869
870	STATIC VOID TagXA(PCOPYTARGET pct)
871	{
872	if (pct->fInLink)
873	{
874	AppendString(pct,
875	TXVESC_LINK_END);
876	pct->fInLink = FALSE;
877	}
878	}
879
880	/* ******************************************************************
881	*
882	* Tag helpers
883	*
884	********************************************************************/
885
886	/*
887	*@@ FindTagProcessor:
888	* returns the Tag* function which handles the
889	* given tag or NULL if there's none.
890	*
891	*@@added V0.9.4 (2000-06-10) [umoeller]
892	*/
893
894	STATIC PFNPROCESSTAG FindTagProcessor(PSZ pszTag)
895	{
896	PFNPROCESSTAG pProcessor = NULL;
897
898	CHAR c0,
899	c1;
900
901	BOOL fEndOfTag = FALSE;
902
903	PSZ pCheck = pszTag,
904	p2;
905	if (*pCheck == '/')
906	{
907	// end of tag:
908	fEndOfTag = TRUE;
909	pCheck++;
910	}
911
912	c0 = *pCheck;
913	c1 = *(pCheck + 1);
914
915	p2 = pCheck + 2;
916
917	switch (c0)
918	{
919	case 'A':
920	case 'a':
921	switch (c1)
922	{
923	case 0: // A
924	if (!fEndOfTag)
925	return TagA;
926	else
927	return TagXA;
928	case 'D': // ADDRESS
929	case 'd': // ADDRESS
930	if (stricmp(p2, "DRESS") == 0)
931	{
932	if (!fEndOfTag)
933	return TagI;
934	else
935	return TagXI;
936	}
937	}
938	break;
939
940	case 'B':
941	case 'b':
942	switch (c1)
943	{
944	case 0:
945	if (!fEndOfTag)
946	return TagB;
947	else
948	return TagXB;
949
950	case 'R': // BR
951	case 'r': // BR
952	if (*p2 == 0)
953	if (!fEndOfTag)
954	return TagBR;
955	}
956	break;
957
958	case 'C':
959	case 'c':
960	switch (c1)
961	{
962	case 'I': // CITE
963	case 'i': // CITE
964	if (stricmp(p2, "TE") == 0)
965	{
966	if (!fEndOfTag)
967	return TagI;
968	else
969	return TagXI;
970	}
971	break;
972
973	case 'O':
974	case 'o':
975	if (stricmp(p2, "DE") == 0)
976	{
977	if (!fEndOfTag)
978	return TagCODE;
979	else
980	return TagXCODE;
981	}
982	break;
983	}
984	break;
985
986	case 'D':
987	case 'd':
988	switch (c1)
989	{
990	case 'D': // DD
991	case 'd': // DD
992	if ((*p2 == 0) && (!fEndOfTag))
993	return (TagDD);
994	break;
995
996	case 'I': // DIR
997	case 'i': // DIR
998	if (*p2 == 'R')
999	if (*(pCheck + 3) == 0)
1000	{
1001	if (!fEndOfTag)
1002	return TagUL;
1003	else
1004	return TagXUL;
1005	}
1006	break;
1007
1008	case 'L': // DL
1009	case 'l': // DL
1010	if (*p2 == 0)
1011	{
1012	if (!fEndOfTag)
1013	return TagDL;
1014	else
1015	return TagXDL;
1016	}
1017	break;
1018
1019	case 'T': // DT
1020	case 't': // DT
1021	if ((*p2 == 0) && (!fEndOfTag))
1022	return TagDT;
1023	break;
1024	}
1025	break;
1026
1027	case 'E':
1028	case 'e':
1029	if ( (c1 == 'M') \|\| (c1 == 'm') ) // EM
1030	if (*p2 == 0)
1031	{
1032	if (!fEndOfTag)
1033	return TagI;
1034	else
1035	return TagXI;
1036	}
1037	break;
1038
1039	case 'H':
1040	case 'h':
1041	if (c1)
1042	if (*p2 == 0)
1043	switch (c1)
1044	{
1045	case '1':
1046	if (!fEndOfTag)
1047	return TagH1;
1048	else
1049	return TagXH1;
1050	case '2':
1051	if (!fEndOfTag)
1052	return TagH2;
1053	else
1054	return TagXH2;
1055	case '3':
1056	if (!fEndOfTag)
1057	return TagH3;
1058	else
1059	return TagXH3;
1060	case '4':
1061	if (!fEndOfTag)
1062	return TagH4;
1063	else
1064	return TagXH4;
1065	case '5':
1066	if (!fEndOfTag)
1067	return TagH5;
1068	else
1069	return TagXH5;
1070	case '6':
1071	if (!fEndOfTag)
1072	return TagH6;
1073	else
1074	return TagXH6;
1075	}
1076	break;
1077
1078	case 'I':
1079	case 'i':
1080	if (c1 == 0)
1081	{
1082	if (!fEndOfTag)
1083	return TagI;
1084	else
1085	return TagXI;
1086	}
1087	break;
1088
1089	case 'L':
1090	case 'l':
1091	if ((c1 == 'I') \|\| (c1 == 'i'))
1092	if (*p2 == 0)
1093	return TagLI;
1094	break;
1095
1096	case 'M':
1097	case 'm':
1098	if (stricmp(p2, "NU") == 0)
1099	{
1100	if (!fEndOfTag)
1101	return TagUL;
1102	else
1103	return TagXUL;
1104	}
1105	break;
1106
1107	case 'O':
1108	case 'o':
1109	if ((c1 == 'L') \|\| (c1 == 'l'))
1110	if (*p2 == 0)
1111	{
1112	if (!fEndOfTag)
1113	return TagOL;
1114	else
1115	return TagXOL;
1116	}
1117	break;
1118
1119	case 'P':
1120	case 'p':
1121	switch (c1)
1122	{
1123	case 0:
1124	if (!fEndOfTag)
1125	return TagP;
1126	break;
1127
1128	case 'R': // PRE
1129	case 'r': // PRE
1130	if ((p2 == 'E') \|\| (p2 == 'e'))
1131	if (*(pCheck + 3) == 0)
1132	{
1133	if (!fEndOfTag)
1134	return TagPRE;
1135	else
1136	return TagXPRE;
1137	}
1138	break;
1139	}
1140	break;
1141
1142	case 'S':
1143	case 's':
1144	switch (c1)
1145	{
1146	case 'T': // STRONG
1147	case 't': // STRONG
1148	if (stricmp(p2, "RONG") == 0)
1149	{
1150	if (!fEndOfTag)
1151	return TagB;
1152	else
1153	return TagXB;
1154	}
1155	else if (stricmp(p2, "RIKE") == 0)
1156	{
1157	if (!fEndOfTag)
1158	return TagSTRIKE;
1159	else
1160	return TagXSTRIKE;
1161	}
1162	break;
1163
1164	case 'A':
1165	case 'a':
1166	if (stricmp(p2, "MP") == 0)
1167	{
1168	if (!fEndOfTag)
1169	return TagCODE;
1170	else
1171	return TagXCODE;
1172	}
1173	break;
1174	}
1175	break;
1176
1177	case 'T':
1178	case 't':
1179	switch (c1)
1180	{
1181	case 'R':
1182	case 'r':
1183	if (*p2 == 0)
1184	return TagTR;
1185	break;
1186
1187	case 'I':
1188	case 'i':
1189	if (stricmp(p2, "TLE") == 0)
1190	return TagTITLE;
1191	break;
1192
1193	case 'T': // TT
1194	case 't':
1195	if (*p2 == 0)
1196	{
1197	if (!fEndOfTag)
1198	return TagCODE;
1199	else
1200	return TagXCODE;
1201	}
1202	break;
1203	}
1204	break;
1205
1206	case 'U':
1207	case 'u':
1208	switch (c1)
1209	{
1210	case 0:
1211	if (!fEndOfTag)
1212	return TagU;
1213	else
1214	return TagXU;
1215
1216	case 'L':
1217	case 'l':
1218	if (*p2 == 0)
1219	{
1220	if (!fEndOfTag)
1221	return TagUL;
1222	else
1223	return TagXUL;
1224	}
1225	break;
1226	}
1227	break;
1228
1229	case 'V':
1230	case 'v':
1231	if (stricmp(p2, "R") == 0)
1232	{
1233	if (!fEndOfTag)
1234	return TagI;
1235	else
1236	return TagXI;
1237	}
1238	break;
1239
1240	case 'X':
1241	case 'x':
1242	if (stricmp(p2, "MP") == 0) // XMP
1243	{
1244	if (!fEndOfTag)
1245	return TagPRE;
1246	else
1247	return TagXPRE;
1248	}
1249	break;
1250	}
1251
1252	return (pProcessor);
1253	}
1254
1255	/*
1256	*@@ HandleTag:
1257	* called by txvConvertFromHTML when a "<" character
1258	* is found in the source buffer. This calls
1259	* FindTagProcessor in turn to find the Tag*
1260	* function which handles the tag.
1261	*
1262	*@@added V0.9.3 (2000-05-18) [umoeller]
1263	*/
1264
1265	STATIC VOID HandleTag(PCOPYTARGET pct)
1266	{
1267	PSZ pStartOfTag = pct->pSource;
1268	// '<' == begin of tag:
1269
1270	// is it a comment? <!-- ... -->
1271	if (strncmp(pStartOfTag + 1, "!--", 3) == 0)
1272	{
1273	// start of comment:
1274	// find end of comment
1275	PSZ pEnd = strstr(pStartOfTag, "-->");
1276	if (pEnd)
1277	// found:
1278	// search on after end of comment
1279	pct->pSource = pEnd + 3;
1280	else
1281	{
1282	// end of comment not found:
1283	// stop formatting...
1284	pct->pSource++;
1285	return;
1286	}
1287	}
1288	else
1289	{
1290	// no comment:
1291	// find end of tag
1292	PSZ p2 = pStartOfTag + 1,
1293	pNextClose = 0, // receives first '>' after '<'
1294	pNextSpace = 0; // receives first ' ' after '<'
1295	BOOL fCont = TRUE;
1296	while (fCont)
1297	{
1298	switch (*p2)
1299	{
1300	case ' ':
1301	case '\r':
1302	case '\n':
1303	// store first space after '<'
1304	if (!pNextSpace)
1305	pNextSpace = p2;
1306	// overwrite line breaks with spaces;
1307	// otherwise we cannot handle tags which go across
1308	// several lines, which is valid HTML
1309	*p2 = ' ';
1310	break;
1311
1312	case '>': // end of tag found:
1313	pNextClose = p2;
1314	fCont = FALSE;
1315	break;
1316
1317	case '<':
1318	// another opening tag:
1319	// that's an HTML error
1320	AppendChar(pct,
1321	*pct->pSource++);
1322	fCont = FALSE;
1323	break;
1324
1325	case 0:
1326	fCont = FALSE;
1327	break;
1328	}
1329	p2++;
1330	}
1331
1332	if (pNextClose)
1333	{
1334	// end of tag found:
1335	ULONG cbTag;
1336	// PSZ pStartOfAttrs = 0;
1337
1338	if ((pNextSpace) && (pNextSpace < pNextClose))
1339	{
1340	// we have attributes:
1341	cbTag = pNextSpace - (pStartOfTag + 1);
1342	// pStartOfAttrs = pNextSpace;
1343	}
1344	else
1345	cbTag = pNextClose - (pStartOfTag + 1);
1346
1347	if (!cbTag)
1348	{
1349	// happens if we have a "<>" in the text:
1350	// just insert the '<>' and go on, we have no tag here
1351	AppendChar(pct,
1352	*pct->pSource++);
1353	AppendChar(pct,
1354	*pct->pSource++);
1355	}
1356	else
1357	{
1358	PFNPROCESSTAG pTagProcessor;
1359
1360	pct->cSaved = *(pStartOfTag + cbTag + 1);
1361	// add a null terminator
1362	*(pStartOfTag + cbTag + 1) = 0;
1363
1364	// find corresponding tag converter function
1365	// from G_TagProcessors map
1366	pTagProcessor = FindTagProcessor(pStartOfTag + 1); // pszTag);
1367
1368	// restore char under null terminator
1369	*(pStartOfTag + cbTag + 1) = pct->cSaved;
1370
1371	// reset new source ptr; the tag handler
1372	// can modify this
1373	pct->pNewSource = NULL;
1374
1375	if (pTagProcessor)
1376	{
1377	// tag understood:
1378
1379	// terminate string after closing tag
1380	pct->cSaved = *(pNextClose + 1); // can be null byte!
1381	*(pNextClose + 1) = 0;
1382
1383	// did we have attributes?
1384	if (pNextSpace)
1385	pct->pszAttributes = pNextSpace;
1386
1387	// finally, call the tag handler
1388	(pTagProcessor) // function
1389	(pct); // argument
1390
1391	*(pNextClose + 1) = pct->cSaved;
1392	}
1393
1394	if (pct->pNewSource == NULL)
1395	// tag handler needs no special processing:
1396	// skip '>' too
1397	pct->pSource = pNextClose + 1;
1398	else
1399	// tag handler has skipped something:
1400	pct->pSource = pct->pNewSource;
1401	}
1402	}
1403	}
1404	}
1405
1406	/*
1407	*@@ ConvertEscape:
1408	* called by HandleEscape to find the ANSI (CP 1004)
1409	* character for the given escape sequence (pszTag).
1410	*
1411	* pszTag must be null-terminated and contain only
1412	* the stuff between "&" and ";".
1413	*
1414	* This is really ugly spaghetti, but it's the fastest
1415	* way to do it.
1416	*
1417	*@@added V0.9.4 (2000-06-10) [umoeller]
1418	*/
1419
1420	STATIC unsigned char ConvertEscape(PSZ pszTag)
1421	{
1422	CHAR c0, c1;
1423	CHAR crc = 0;
1424
1425	PSZ p2 = pszTag + 2;
1426
1427	c0 = *pszTag;
1428	c1 = *(pszTag + 1);
1429
1430	switch (c0)
1431	{
1432	case 'a':
1433	switch (c1)
1434	{
1435	case 'a':
1436	if (strcmp(p2, "cute") == 0)
1437	return 225;
1438	break;
1439
1440	case 'c':
1441	if (strcmp(p2, "irc") == 0)
1442	return 226;
1443	else if (strcmp(p2, "ute") == 0)
1444	return 180;
1445	break;
1446
1447	case 'e':
1448	if (strcmp(p2, "lig") == 0)
1449	return 230;
1450	break;
1451
1452	case 'g':
1453	if (strcmp(p2, "rave") == 0)
1454	return 224;
1455	break;
1456
1457	case 'm':
1458	if (strcmp(p2, "p") == 0)
1459	return '&';
1460	break;
1461
1462	case 'r':
1463	if (strcmp(p2, "ing") == 0)
1464	return 229;
1465	break;
1466
1467	case 't':
1468	if (strcmp(p2, "ilde") == 0)
1469	return 227;
1470	break;
1471
1472	case 'u':
1473	if (strcmp(p2, "ml") == 0)
1474	return 228;
1475	break;
1476	}
1477	break;
1478
1479	case 'b':
1480	if (strcmp(pszTag + 1, "rvbar") == 0)
1481	return 166;
1482	break;
1483
1484	case 'c':
1485	switch (c1)
1486	{
1487	case 'c':
1488	if (strcmp(p2, "edil") == 0)
1489	return 231;
1490	break;
1491
1492	case 'e':
1493	if (strcmp(p2, "dil") == 0)
1494	return 184;
1495	else if (strcmp(p2, "nt") == 0)
1496	return 162;
1497	break;
1498
1499	case 'o':
1500	if (strcmp(p2, "py") == 0)
1501	return 169;
1502	break;
1503
1504	case 'u':
1505	if (strcmp(p2, "rren") == 0)
1506	return 164;
1507	}
1508	break;
1509
1510	case 'd':
1511	switch (c1)
1512	{
1513	case 'e':
1514	if (strcmp(p2, "g") == 0) return 176;
1515	break;
1516
1517	case 'i':
1518	if (strcmp(p2, "vide") == 0) return 247;
1519	break;
1520	}
1521	break;
1522
1523	case 'e':
1524	switch (c1)
1525	{
1526	case 'a':
1527	if (strcmp(p2, "cute") == 0) return 233;
1528	break;
1529
1530	case 'c':
1531	if (strcmp(p2, "irc") == 0) return 234;
1532	break;
1533
1534	case 'g':
1535	if (strcmp(p2, "rave") == 0) return 232;
1536	break;
1537
1538	case 't':
1539	if (strcmp(p2, "h") == 0) return 240;
1540	break;
1541
1542	case 'u':
1543	if (strcmp(p2, "ml") == 0) return 235;
1544	break;
1545	}
1546	break;
1547
1548	case 'f':
1549	switch (c1)
1550	{
1551	case 'r':
1552	if (strcmp(p2, "ac14") == 0) return 188;
1553	if (strcmp(p2, "ac12") == 0) return 189;
1554	if (strcmp(p2, "ac34") == 0) return 190;
1555	break;
1556	}
1557	break;
1558
1559	case 'g':
1560	switch (c1)
1561	{
1562	case 't':
1563	if (*p2 == 0) return '>';
1564	}
1565	break;
1566
1567	case 'i':
1568	switch (c1)
1569	{
1570	case 'a':
1571	if (strcmp(p2, "cute") == 0) return 237;
1572	break;
1573
1574	case 'c':
1575	if (strcmp(p2, "irc") == 0) return 238;
1576	break;
1577
1578	case 'g':
1579	if (strcmp(p2, "rave") == 0) return 236;
1580	break;
1581
1582	case 'e':
1583	if (strcmp(p2, "xcl") == 0) return 161;
1584	break;
1585
1586	case 'q':
1587	if (strcmp(p2, "uest") == 0) return 191;
1588	break;
1589
1590	case 'u':
1591	if (strcmp(p2, "ml") == 0) return 239;
1592	}
1593	break;
1594
1595	case 'l':
1596	switch (c1)
1597	{
1598	case 't':
1599	if (*p2 == 0)
1600	return '<';
1601	break;
1602
1603	case 'a':
1604	if (strcmp(p2, "quo") == 0) return 171;
1605	}
1606	break;
1607
1608	case 'm':
1609	switch (c1)
1610	{
1611	case 'a':
1612	if (strcmp(p2, "cr") == 0) return 175;
1613	break;
1614
1615	case 'i':
1616	if (strcmp(p2, "cro") == 0) return 181;
1617	if (strcmp(p2, "ddot") == 0) return 183;
1618	break;
1619	}
1620	break;
1621
1622	case 'n':
1623	switch (c1)
1624	{
1625	case 'b':
1626	if (strcmp(p2, "sp") == 0) return 160;
1627	break;
1628
1629	case 'o':
1630	if (strcmp(p2, "t") == 0) return 172;
1631	break;
1632
1633	case 't':
1634	if (strcmp(p2, "ilde") == 0) return 241;
1635	}
1636	break;
1637
1638	case 'o':
1639	switch (c1)
1640	{
1641	case 'a':
1642	if (strcmp(p2, "cute") == 0) return 243;
1643	break;
1644
1645	case 'c':
1646	if (strcmp(p2, "irc") == 0) return 244;
1647	break;
1648
1649	case 'g':
1650	if (strcmp(p2, "rave") == 0) return 242;
1651	break;
1652
1653	case 'r':
1654	if (strcmp(p2, "df") == 0) return 170;
1655	if (strcmp(p2, "dm") == 0) return 186;
1656	break;
1657
1658	case 's':
1659	if (strcmp(p2, "lash") == 0) return 248;
1660	break;
1661
1662	case 't':
1663	if (strcmp(p2, "ilde") == 0) return 245;
1664	break;
1665
1666	case 'u':
1667	if (strcmp(p2, "ml") == 0) return 246;
1668	}
1669	break;
1670
1671	case 'p':
1672	switch (c1)
1673	{
1674	case 'a':
1675	if (strcmp(p2, "ra") == 0) return 182;
1676	break;
1677
1678	case 'l':
1679	if (strcmp(p2, "usmn") == 0) return 177;
1680	break;
1681
1682	case 'o':
1683	if (strcmp(p2, "und") == 0) return 163;
1684	}
1685	break;
1686
1687	case 'q':
1688	if (strcmp(pszTag, "quot") == 0) return '"';
1689	break;
1690
1691	case 'r':
1692	if (strcmp(pszTag, "raquo") == 0) return 187;
1693	if (strcmp(pszTag, "reg") == 0) return 174;
1694	break;
1695
1696	case 's':
1697	switch (c1)
1698	{
1699	case 'z':
1700	if (strcmp(p2, "lig") == 0) return 223;
1701	break;
1702
1703	case 'e':
1704	if (strcmp(p2, "ct") == 0) return 167;
1705	break;
1706
1707	case 'h':
1708	if (strcmp(p2, "y") == 0) return 173;
1709	break;
1710
1711	case 'u':
1712	if (strcmp(p2, "p1") == 0) return 185;
1713	if (strcmp(p2, "p2") == 0) return 178;
1714	if (strcmp(p2, "p3") == 0) return 179;
1715	}
1716	break;
1717
1718	case 't':
1719	if (strcmp(pszTag, "thorn") == 0) return 254;
1720	if (strcmp(pszTag, "times") == 0) return 215;
1721	break;
1722
1723	case 'u':
1724	switch (c1)
1725	{
1726	case 'a':
1727	if (strcmp(p2, "cute") == 0) return 250;
1728	break;
1729
1730	case 'c':
1731	if (strcmp(p2, "irc") == 0) return 251;
1732	break;
1733
1734	case 'g':
1735	if (strcmp(p2, "rave") == 0) return 249;
1736	break;
1737
1738	case 'm':
1739	if (strcmp(p2, "l") == 0) return 168;
1740	break;
1741
1742	case 'u':
1743	if (strcmp(p2, "ml") == 0) return 252;
1744	}
1745	break;
1746
1747	case 'y':
1748	if (strcmp(pszTag, "yacute") == 0) return 253;
1749	if (strcmp(pszTag, "yen") == 0) return 165;
1750	if (strcmp(pszTag, "yuml") == 0) return 255;
1751	break;
1752
1753	case 'A':
1754	switch (c1)
1755	{
1756	case 'u':
1757	if (strcmp(p2, "ml") == 0) return 196;
1758	break;
1759
1760	case 'a':
1761	if (strcmp(p2, "cute") == 0) return 193;
1762	break;
1763
1764	case 'c':
1765	if (strcmp(p2, "irc") == 0) return 194;
1766	break;
1767
1768	case 'E':
1769	if (strcmp(p2, "lig") == 0) return 198;
1770	break;
1771
1772	case 'g':
1773	if (strcmp(p2, "rave") == 0) return 192;
1774	break;
1775
1776	case 'r':
1777	if (strcmp(p2, "ing") == 0) return 197;
1778	break;
1779
1780	case 't':
1781	if (strcmp(p2, "ilde") == 0) return 195;
1782	}
1783	break;
1784
1785	case 'C':
1786	if (strcmp(pszTag, "Ccedil") == 0) return 199;
1787	break;
1788
1789	case 'E':
1790	if (strcmp(pszTag, "Ecirc") == 0) return 202;
1791	if (strcmp(pszTag, "Eacute") == 0) return 201;
1792	if (strcmp(pszTag, "Egrave") == 0) return 200;
1793	if (strcmp(pszTag, "ETH") == 0) return 208;
1794	if (strcmp(pszTag, "Euml") == 0) return 203;
1795	break;
1796
1797	case 'I':
1798	if (strcmp(pszTag, "Icirc") == 0) return 206;
1799	if (strcmp(pszTag, "Iacute") == 0) return 205;
1800	if (strcmp(pszTag, "Igrave") == 0) return 204;
1801	if (strcmp(pszTag, "Iuml") == 0) return 207;
1802	break;
1803
1804	case 'N':
1805	if (strcmp(pszTag, "Ntilde") == 0) return 209;
1806	break;
1807
1808	case 'O':
1809	switch (c1)
1810	{
1811	case 'u':
1812	if (strcmp(p2, "ml") == 0) return 214;
1813	break;
1814
1815	case 'a':
1816	if (strcmp(p2, "cute") == 0) return 211;
1817	break;
1818
1819	case 'c':
1820	if (strcmp(p2, "irc") == 0) return 212;
1821	break;
1822
1823	case 'g':
1824	if (strcmp(p2, "rave") == 0) return 210;
1825	break;
1826
1827	case 't':
1828	if (strcmp(p2, "ilde") == 0) return 213;
1829	break;
1830
1831	case 's':
1832	if (strcmp(p2, "lash") == 0) return 216;
1833	}
1834	break;
1835
1836	case 'U':
1837	switch (c1)
1838	{
1839	case 'a':
1840	if (strcmp(p2, "cute") == 0) return 218;
1841	break;
1842
1843	case 'c':
1844	if (strcmp(p2, "irc") == 0) return 219;
1845	break;
1846
1847	case 'g':
1848	if (strcmp(p2, "rave") == 0) return 217;
1849	break;
1850
1851	case 'u':
1852	if (strcmp(p2, "ml") == 0) return 220;
1853	}
1854	break;
1855
1856	case 'T':
1857	if (strcmp(pszTag, "THORN") == 0) return 222;
1858	break;
1859
1860	case 'Y':
1861	if (strcmp(pszTag, "Yacute") == 0) return 221;
1862	break;
1863	}
1864
1865	return (crc);
1866	}
1867
1868	/*
1869	*@@ HandleEscape:
1870	* called by txvConvertFromHTML when a "&" character
1871	* is found in the source buffer. This calls
1872	* ConvertEscape in turn.
1873	*
1874	*@@added V0.9.3 (2000-05-18) [umoeller]
1875	*/
1876
1877	STATIC VOID HandleEscape(PCOPYTARGET pct)
1878	{
1879	// ampersand:
1880	// replace special characters
1881	PSZ pStartOfTag = pct->pSource;
1882	// find end of tag
1883	PSZ p2 = pStartOfTag,
1884	pNextClose = 0,
1885	pNextSpace = 0;
1886	BOOL fCont = TRUE;
1887	while (fCont)
1888	{
1889	switch (*p2)
1890	{
1891	case 0:
1892	fCont = FALSE;
1893	break;
1894
1895	case ';':
1896	pNextClose = p2;
1897	fCont = FALSE;
1898	break;
1899
1900	case ' ':
1901	if (!pNextSpace)
1902	pNextSpace = p2;
1903	break;
1904	}
1905	p2++;
1906	}
1907
1908	if (!pNextClose)
1909	// no closing tag found:
1910	// just insert the '&' and go on, we have no tag here
1911	AppendChar(pct,
1912	*pct->pSource++);
1913	else
1914	{
1915	if ((pNextSpace) && (pNextSpace < pNextClose))
1916	// space before ';':
1917	// just insert the '&' and go on, we have no tag here
1918	AppendChar(pct,
1919	*pct->pSource++);
1920	else if ((!pNextClose) \|\| (pNextClose <= pStartOfTag + 1))
1921	AppendChar(pct,
1922	*pct->pSource++);
1923	else
1924	{
1925	ULONG ulCode = 0;
1926
1927	// create substring with tag
1928	PSZ pszTag = pStartOfTag + 1;
1929	*pNextClose = 0;
1930
1931	if (*pszTag == '#')
1932	{
1933	// latin-1 or Unicode encoding ()
1934	ulCode = atoi(pszTag + 1);
1935
1936	// next input: char after ';'
1937	pct->pSource = pNextClose + 1;
1938	}
1939	else
1940	{
1941	// named entity:
1942	// find char code corresponding to escape
1943	// from G_EscapeProcessors map
1944	ulCode = ConvertEscape(pszTag);
1945	if (ulCode)
1946	// tag supported:
1947	pct->pSource = pNextClose + 1;
1948	else
1949	// tag not supported:
1950	ulCode = *pct->pSource++;
1951	}
1952
1953	// restore closing tag which we overwrote
1954	*pNextClose = ';';
1955
1956	if (ulCode)
1957	{
1958	AppendLinebreakCheck(pct);
1959
1960	AppendChar(pct,
1961	(CHAR)ulCode);
1962	pct->fSkipNextSpace = FALSE;
1963	}
1964	}
1965	}
1966	}
1967
1968	/* ******************************************************************
1969	*
1970	* Entry points
1971	*
1972	********************************************************************/
1973
1974	/*
1975	*@@ txvConvertFromHTML:
1976	* this modifies the given text string (which should
1977	* be the complete BODY block of any HTML file) so
1978	* that all HTML tags are removed and replaced with
1979	* escape sequences that the XTextView control understands.
1980	*
1981	* The buffer gets reallocated by this function, so it
1982	* must be free()'able.
1983	*
1984	* So, to have the XTextView control display an HTML file,
1985	* do this:
1986	*
1987	* 1) Load an HTML file into a buffer allocated by malloc().
1988	*
1989	* 2) Call txvConvertFromHTML.
1990	*
1991	* 3) Call WinSetWindowText on the XTextView control with
1992	* the modified buffer.
1993	*
1994	* This understands the following limited subset of HTML:
1995	*
1996	* Paragraph tags:
1997	*
1998	* -- P, BR
1999	* -- PRE, /PRE
2000	* -- UL, /UL, OL, /OL, LI
2001	* -- DL, /DL, DT, DD
2002	* -- H1, /H1 thru H6, /H6
2003	* -- Comments (<!-- .... -->)
2004	*
2005	* Character tags:
2006	*
2007	* -- B, /B, STRONG, /STRONG
2008	* -- I, /I, EM, /EM, VAR, /VAR, CITE, /CITE
2009	* -- CODE, /CODE, SAMP, /SAMP, KBD, /KBD, TT, /TT
2010	* -- U, /U
2011	* -- STRIKE, /STRIKE
2012	* -- CODE, /CODE
2013	*
2014	* The most obvious limitation is that neither tables
2015	* nor frames are supported. Also forget about CSS
2016	* and JavaScript, of course.
2017	*
2018	* All the ampersand (& something) sequences defined
2019	* in HTML 3 are properly translated.
2020	*
2021	* Note: Those are translated to the ANSI (MS-Windows,
2022	* OS/2 codepage 1004) character set. This has the
2023	* following characteristics:
2024	*
2025	* -- Codes 0-127 are identical to ASCII and thus
2026	* ISO 8559-1 ("Latin 1") also.
2027	*
2028	* -- Codes 160-255 are identical to ISO 8559-1 ("Latin 1").
2029	*
2030	* -- Codes 128-159 are NOT defined in ISO 8559-1, but
2031	* Netscape treats those as ANSI as well, so we do too.
2032	*
2033	* As a result, consider the output to be in OS/2 codepage
2034	* 1004. Either set your codepage to that (WinSetCp)
2035	* or translate the output (WinCpTranslateString).
2036	*
2037	* &#xxx; tags (with xxx being a decimal) are considered
2038	* ANSI codes as well. Even though HTML 4.0 allows Unicode
2039	* characters > 255 to be inserted this way, we ignore
2040	* those. Unicode chars from 0 to 255 are identical to
2041	* ANSI, so for to ÿ, we are HTML-compliant.
2042	*
2043	* All other tags are completely thrown out.
2044	*
2045	*@@added V0.9.3 (2000-05-06) [umoeller]
2046	*@@changed V0.9.20 (2002-08-10) [umoeller]: changed prototype
2047	*/
2048
2049	BOOL txvConvertFromHTML(PSZ *ppszText, // in/out: text (gets reallocated)
2050	PSZ *ppszTitle, // out: if != NULL, receives malloc'd buffer with HTML title
2051	PULONG pulProgress, // out: progress (ptr can be NULL)
2052	PBOOL pfCancel) // in: cancel flag (ptr can be NULL)
2053	{
2054	BOOL brc = TRUE;
2055
2056	ULONG cbSource = strlen(*ppszText);
2057
2058	COPYTARGET ct = {0};
2059
2060	lstInit(&ct.llLists,
2061	TRUE); // free items
2062
2063	ct.ppszTitle = ppszTitle; // V0.9.20 (2002-08-10) [umoeller]
2064	// can be NULL
2065
2066	ct.pSource = *ppszText;
2067	// skip leading spaces
2068	ct.fSkipNextSpace = TRUE;
2069
2070	// step 2:
2071	// actual tags formatting
2072
2073	while (TRUE)
2074	{
2075	CHAR c = *ct.pSource;
2076
2077	if (pfCancel)
2078	if (*pfCancel)
2079	{
2080	brc = FALSE;
2081	break;
2082	}
2083
2084	if (!c)
2085	// null terminator reached:
2086	break;
2087
2088	// calculate progress
2089	if (pulProgress)
2090	pulProgress = ((ct.pSource - ppszText) // characters done
2091	* 100
2092	/ cbSource); // characters total
2093
2094	switch (c)
2095	{
2096	case '<':
2097	HandleTag(&ct);
2098	break;
2099
2100	case '&':
2101	HandleEscape(&ct);
2102	break;
2103
2104	case '\r':
2105	// skip
2106	if (!ct.fSkipNextSpace)
2107	{
2108	AppendChar(&ct,
2109	' ');
2110	// ct.fNeedsLinebreak = FALSE;
2111	// but skip leading spaces which might follow
2112	if (!ct.fPRE)
2113	ct.fSkipNextSpace = TRUE;
2114	}
2115	ct.pSource++;
2116	break;
2117
2118	case '\t':
2119	{
2120	if (ct.fPRE)
2121	{
2122	ULONG ul;
2123	for (ul = 0;
2124	ul < 8;
2125	ul++)
2126	AppendChar(&ct,
2127	' ');
2128	}
2129	else
2130	{
2131	// not in PRE block:
2132	if ( (!ct.fSkipNextSpace)
2133	// && (!ct.fNeedsLinebreak)
2134	)
2135	// last was not space: copy
2136	AppendChar(&ct,
2137	' ');
2138
2139	ct.fSkipNextSpace = TRUE;
2140	}
2141
2142	// skip the tab
2143	ct.pSource++;
2144	break; }
2145
2146	case '\n':
2147	{
2148	// newline char:
2149	if (!ct.fPRE)
2150	{
2151	// if not in PRE mode, replace with space
2152	if (!ct.fSkipNextSpace)
2153	{
2154	AppendChar(&ct,
2155	' ');
2156	// ct.fNeedsLinebreak = FALSE;
2157	// but skip leading spaces which might follow
2158	ct.fSkipNextSpace = TRUE;
2159	}
2160	}
2161	else
2162	// in PRE mode, preserve line breaks
2163	AppendChar(&ct, '\n'); // ct.fNeedsLinebreak = TRUE;
2164
2165	ct.pSource++;
2166	break; }
2167
2168	case '\xFF':
2169	{
2170	AppendChar(&ct,
2171	' ');
2172	ct.pSource++;
2173	break; }
2174
2175	case ' ':
2176	if (!ct.fPRE)
2177	{
2178	// is space, and not in PRE block:
2179	if ( (!ct.fSkipNextSpace)
2180	// && (!ct.fNeedsLinebreak)
2181	)
2182	// last was not space: copy
2183	AppendChar(&ct,
2184	' ');
2185
2186	ct.fSkipNextSpace = TRUE;
2187	}
2188	else
2189	// in PRE, always add all spaces
2190	AppendChar(&ct,
2191	' ');
2192	ct.pSource++;
2193	break;
2194
2195	default:
2196	// if we're not inserting escapes or anything,
2197	// check if a linebreak is needed
2198	AppendLinebreakCheck(&ct);
2199
2200	AppendChar(&ct,
2201	*ct.pSource++);
2202	ct.fSkipNextSpace = FALSE;
2203	ct.fSkipNextLinebreak = FALSE;
2204
2205	} // end switch (*pSource);
2206	} // end while (*pSource)
2207	AppendChar(&ct,
2208	'\n');
2209	// append null-terminator
2210	AppendChar(&ct,
2211	0);
2212
2213	free(*ppszText);
2214	*ppszText = ct.pszNew;
2215
2216	lstClear(&ct.llLists);
2217
2218	return brc;
2219	}
2220
2221

Note: See TracBrowser for help on using the repository browser.

Download in other formats: