Context Navigation

source: trunk/src/helpers/textv_html.c@ 19

Visit:

Last change on this file since 19 was 14, checked in by umoeller, 25 years ago
Major updates; timers, LVM, miscellaneous.
Property svn:eol-style set to `CRLF` Property svn:keywords set to `Author Date Id Revision`
File size: 61.5 KB

Line
1
2	/*
3	*@@sourcefile textv_html.c:
4	* this code converts HTML code to escape sequences for the
5	* XTextView control (textview.c).
6	*
7	* This code is in part ugly spaghetti, but this is intentional to
8	* make this HTML parser FAST. In general, you get about double or
9	* triple the speed compared to Netscape 4.6 on OS/2. This code
10	* doesn't understand all of HTML though, but you get most of HTML 2.
11	* There's no tables or frames at this point.
12	*
13	* The entry point into this mess is txvConvertFromHTML, which
14	* is easy to use.
15	*
16	* Note: Version numbering in this file relates to XWorkplace version
17	* numbering.
18	*
19	*@@header "helpers\textv_html.h"
20	*
21	*@@added V0.9.3 (2000-05-10) [umoeller]
22	*/
23
24	/*
25	* Copyright (C) 2000 Ulrich Möller.
26	* This program is part of the XWorkplace package.
27	* This program is free software; you can redistribute it and/or modify
28	* it under the terms of the GNU General Public License as published by
29	* the Free Software Foundation, in version 2 as it comes in the COPYING
30	* file of the XWorkplace main distribution.
31	* This program is distributed in the hope that it will be useful,
32	* but WITHOUT ANY WARRANTY; without even the implied warranty of
33	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
34	* GNU General Public License for more details.
35	*/
36
37	#define OS2EMX_PLAIN_CHAR
38	// this is needed for "os2emx.h"; if this is defined,
39	// emx will define PSZ as _signed_ char, otherwise
40	// as unsigned char
41
42	#include <os2.h>
43
44	#include <stdlib.h>
45	#include <stdio.h>
46	#include <string.h>
47
48	#include "setup.h" // code generation and debugging options
49
50	#include "helpers\linklist.h"
51	#include "helpers\stringh.h"
52	#include "helpers\textview.h"
53
54	#include "helpers\textv_html.h"
55
56	/*
57	*@@category: Helpers\PM helpers\Window classes\XTextView control\HTML conversion
58	*/
59
60	/* ******************************************************************
61	*
62	* Declarations
63	*
64	********************************************************************/
65
66	/*
67	*@@ LISTDESC:
68	* structure stored in COPYTARGET to
69	* hold list information (UL, OL, ... tags).
70	*
71	*@@added V0.9.3 (2000-05-07) [umoeller]
72	*/
73
74	typedef struct _LISTDESC
75	{
76	ULONG ulListType; // 0: unordered (UL)
77	// 1: ordered (OL)
78	// 2: definition lists (DL)
79	ULONG ulItem; // list enumeration; 1 on first item,
80	// 2 on next, ...
81	} LISTDESC, *PLISTDESC;
82
83	/*
84	*@@ COPYTARGET:
85	* monster structure which holds the current
86	* status of the HTML converter while conversion
87	* is taking place. This stores input/output pointers
88	* and various flags to avoid duplicate line breaks
89	* and such.
90	*
91	* One instance of this is created in txvConvertFromHTML
92	* on the stack and then passed to all the sub-function
93	* calls.
94	*
95	*@@added V0.9.3 (2000-05-06) [umoeller]
96	*/
97
98	typedef struct _COPYTARGET
99	{
100	PSZ pSource; // ptr into source string;
101	// valid ONLY while we're in a tag handler
102	PSZ pNewSource; // can be set by tag handler to skip characters;
103	// this is set to NULL before calling a tag
104	// handler; if this is still NULL, default
105	// processing occurs
106
107	// new string:
108	PSZ pszNew; // memory buffer
109	ULONG cbNew; // size of buffer (reallocated)
110	PSZ pTarget; // current char ptr into pszNew
111
112	// saved character while tag handler is being called
113	CHAR cSaved;
114
115	PXHTMLDATA pxhtml; // ptr to XHTMLDATA passed to txvConvertFromHTML
116
117	// formatting flags while going through the text
118	BOOL fSkipNextSpace;
119	// if TRUE, subsequent spaces are skipped
120	BOOL fNeedsLinebreak;
121	// if TRUE, \n is inserted before any other character
122	BOOL fSkipNextLinebreak;
123	// if TRUE, subsequent linebreaks are skipped
124	BOOL fPRE;
125	// are we currently in a PRE tag?
126	BOOL fInLink;
127	// are we currently in a A HREF= tag?
128
129	// arguments (attributes) for tag handlers
130	PSZ pszAttributes; // != NULL while a tag handler is being called
131	// and attributes exist for the tag
132
133	// anchors count
134	USHORT usAnchorIndex; // start with 1
135
136	// list maintenance
137	ULONG ulListLevel; // if > 0, we're in a UL or OL block;
138	// raised for each block
139	ULONG ulUnorderedListLevel; // raised with each UL block to keep track
140	// of bullets
141	ULONG ulOrderedListLevel; // raised with each UL block to keep track
142	// of 1), 2), a), b)... numbering
143	ULONG ulCurrentListType; // current list type (from highest LISTDESC)
144	BOOL fInDT; // TRUE if we're currently in a DT tag
145	LINKLIST llLists; // stack of LISTDESC items
146	} COPYTARGET, *PCOPYTARGET;
147
// Prototype shared by all Tag* handler functions: each one receives
// the converter state. FindTagProcessor returns a pointer of this
// type, and HandleTag calls through it.
typedef VOID FNPROCESSTAG(PCOPYTARGET pct);
typedef FNPROCESSTAG *PFNPROCESSTAG;
150
151	/* ******************************************************************
152	*
153	* Global variables
154	*
155	********************************************************************/
156
157	/* ******************************************************************
158	*
159	* Append-char helpers
160	*
161	********************************************************************/
162
// number of bytes by which AppendChar and AppendString enlarge the
// output buffer in COPYTARGET each time it runs full
#define COPYTARGETALLOC 100000
164
165	/*
166	*@@ AppendChar:
167	* helper for txvConvertFromHTML to
168	* append a char to the target string
169	* in COPYTARGET.
170	* This performs a few additional checks
171	* and manages memory.
172	*
173	*@@added V0.9.3 (2000-05-06) [umoeller]
174	*/
175
176	VOID AppendChar(PCOPYTARGET pct, // in/out: formatting buffer
177	unsigned char c)
178	{
179	// calculate ofs where to store next char
180	ULONG cbOfsNext = pct->pTarget - pct->pszNew;
181	if (cbOfsNext >= pct->cbNew) // have we reached the buffer size yet?
182	{
183	// more mem needed:
184	pct->cbNew += COPYTARGETALLOC;
185	pct->pszNew = (PSZ)realloc(pct->pszNew, pct->cbNew);
186	// if first call, pszNew is NULL, and realloc
187	// behaves just like malloc
188	// adjust target, because ptr might have changed
189	pct->pTarget = pct->pszNew + cbOfsNext;
190	}
191
192	// append character
193	*pct->pTarget++ = c;
194	}
195
196	/*
197	*@@ AppendString:
198	* appends the characters in *ach,
199	* which must be null-terminated.
200	* Does NOT append a null character though.
201	*
202	*@@added V0.9.3 (2000-05-06) [umoeller]
203	*/
204
205	VOID AppendString(PCOPYTARGET pct, // in/out: formatting buffer
206	char *ach)
207	{
208	ULONG cbAppend = strlen(ach);
209	ULONG ul;
210	PSZ pSource;
211
212	// calculate ofs where to store next char
213	ULONG cbOfsNext = pct->pTarget - pct->pszNew;
214	while (cbOfsNext + cbAppend >= pct->cbNew)
215	{
216	// more mem needed:
217	pct->cbNew += COPYTARGETALLOC;
218	pct->pszNew = (PSZ)realloc(pct->pszNew, pct->cbNew);
219	// if first call, pszNew is NULL, and realloc
220	// behaves just like malloc
221	// adjust target, because ptr might have changed
222	pct->pTarget = pct->pszNew + cbOfsNext;
223	}
224
225	// append characters
226	pSource = ach;
227	for (ul = 0;
228	ul < cbAppend;
229	ul++)
230	pct->pTarget++ = pSource++;
231	}
232
233	/*
234	*@@ AppendLinebreakCheck:
235	* checks if a linebreak is needed and
236	* inserts one if so.
237	*
238	*@@added V0.9.3 (2000-05-17) [umoeller]
239	*/
240
241	VOID AppendLinebreakCheck(PCOPYTARGET pct)
242	{
243	if ((!pct->fPRE) && (pct->fNeedsLinebreak))
244	{
245	// yes: insert linebreak; this resets pct->fNeedsLinebreak
246	if (!pct->fSkipNextLinebreak)
247	{
248	AppendChar(pct, '\n');
249
250	if ((pct->ulListLevel) && (!pct->fInDT))
251	// if we're in a list, add a tab also,
252	// because we'll have a negative first-line margin
253	AppendString(pct, TXVESC_TAB);
254	}
255	pct->fNeedsLinebreak = FALSE;
256	}
257	}
258
259	/*
260	*@@ AppendEscapeWithDecimal:
261	* appends the specified escape code
262	* with a three-digit decimal parameter.
263	* Calls AppendString in turn.
264	*
265	*@@added V0.9.3 (2000-05-07) [umoeller]
266	*/
267
268	VOID AppendEscapeWith3Decimals(PCOPYTARGET pct, // in/out: formatting buffer
269	char *ach,
270	USHORT us)
271	{
272	CHAR szDecimal[10];
273	if (us > 999)
274	us = 999;
275	sprintf(szDecimal, "%03d", us);
276	// append escape
277	AppendString(pct, ach);
278	AppendString(pct, szDecimal);
279	}
280
281	/*
282	*@@ AppendEscapeWith4Decimals:
283	*
284	*@@added V0.9.3 (2000-05-07) [umoeller]
285	*/
286
287	VOID AppendEscapeWith4Decimals(PCOPYTARGET pct, // in/out: formatting buffer
288	char *ach,
289	USHORT us)
290	{
291	CHAR szDecimal[10];
292	if (us > 9999)
293	us = 9999;
294	sprintf(szDecimal, "%04d", us);
295	// append escape
296	AppendString(pct, ach);
297	AppendString(pct, szDecimal);
298	}
299
300	/* ******************************************************************
301	*
302	* Tag converter functions
303	*
304	********************************************************************/
305
306	/*
307	*@@ StartList:
308	* starts a list (UL or OL).
309	* This uses a linked list in COPYTARGET
310	* to keep a pseudo-stack for nested lists.
311	*
312	*@@added V0.9.3 (2000-05-08) [umoeller]
313	*/
314
315	VOID StartList(PCOPYTARGET pct, // in/out: formatting buffer
316	ULONG ulListType) // list type:
317	// 0: unordered (UL)
318	// 1: ordered (OL)
319	// 2: definition lists (DL)
320	{
321	PLISTDESC pListDesc;
322
323	// raise list level
324	pct->ulListLevel++;
325
326	if (ulListType == 0)
327	// unordered:
328	pct->ulUnorderedListLevel++;
329	else if (ulListType == 1)
330	// ordered:
331	pct->ulOrderedListLevel++;
332
333	// create LISTDESC and store it on stack
334	pListDesc = (PLISTDESC)malloc(sizeof(LISTDESC));
335	pListDesc->ulListType
336	= pct->ulCurrentListType
337	= ulListType;
338	pListDesc->ulItem = 1;
339
340	lstAppendItem(&pct->llLists,
341	pListDesc);
342
343	AppendEscapeWith4Decimals(pct,
344	TXVESC_LEFTMARGIN,
345	pct->ulListLevel * 5);
346	AppendEscapeWith3Decimals(pct,
347	TXVESC_FIRSTLINEMARGIN_LEFT,
348	(ulListType == 2)
349	? 5 // for definition lists
350	: 3); // negative!
351	// add \n before any other character
352	pct->fNeedsLinebreak = TRUE;
353	}
354
355	/*
356	*@@ StopList:
357	* stops a list (UL or OL).
358	*
359	*@@added V0.9.3 (2000-05-07) [umoeller]
360	*/
361
362	VOID StopList(PCOPYTARGET pct)
363	{
364	if (pct->ulListLevel)
365	{
366	PLISTNODE pNode;
367
368	// lower list level
369	pct->ulListLevel--;
370	AppendEscapeWith4Decimals(pct,
371	TXVESC_LEFTMARGIN,
372	pct->ulListLevel * 5);
373	AppendEscapeWith3Decimals(pct,
374	TXVESC_FIRSTLINEMARGIN_LEFT,
375	(pct->ulListLevel)
376	? 3 // we still have a list level (nested)
377	: 0);
378	pct->fNeedsLinebreak = TRUE;
379
380	// remove the LISTDESC from the stack
381	pNode = lstNodeFromIndex(&pct->llLists,
382	pct->ulListLevel); // this has been lowered already
383	if (pNode)
384	{
385	PLISTDESC pListDesc = (PLISTDESC)pNode->pItemData;
386	if (pListDesc->ulListType == 0)
387	// was unordered:
388	pct->ulUnorderedListLevel--;
389	else if (pListDesc->ulListType == 1)
390	// was ordered:
391	pct->ulOrderedListLevel--;
392
393	lstRemoveNode(&pct->llLists, pNode);
394
395	// update COPYTARGET with previous list level
396	if (pct->ulListLevel)
397	{
398	// we're still in a list (nested lists):
399	PLISTDESC pListDesc2 = (PLISTDESC)lstItemFromIndex(&pct->llLists,
400	pct->ulListLevel - 1);
401	if (pListDesc2)
402	pct->ulCurrentListType = pListDesc2->ulListType;
403	}
404	}
405	}
406	// else: buggy HTML code, ignore
407	}
408
409	/*
410	*@@ TagTITLE:
411	*
412	*@@added V0.9.3 (2000-05-19) [umoeller]
413	*/
414
415	VOID TagTITLE(PCOPYTARGET pct)
416	{
417	// pSource currently points to <TITLE tag
418	PSZ pSource = pct->pSource + strlen(pct->pSource);
419	// points to temporary null byte in main buffer now
420	*pSource = pct->cSaved;
421
422	pSource = strchr(pct->pSource, '>');
423	if (pSource)
424	{
425	PSZ pNextOpen = strchr(pSource, '<');
426	if (pNextOpen)
427	{
428	// extract title
429	pct->pxhtml->pszTitle = strhSubstr(pSource + 1, pNextOpen);
430
431	if (strnicmp(pNextOpen + 1, "/TITLE", 6) == 0)
432	{
433	// closing /TITLE tag found:
434	// search on after that
435	pct->pNewSource = strchr(pNextOpen, '>');
436	if (pct->pNewSource)
437	pct->pNewSource++;
438	}
439	}
440	}
441	}
442
443	/*
444	*@@ TagP:
445	*
446	*/
447
448	VOID TagP(PCOPYTARGET pct)
449	{
450	// append newline:
451	// add \n before any other character
452	pct->fNeedsLinebreak = TRUE;
453
454	/* if (pct->ulListLevel)
455	{
456	// if we are currently in a list, we must also
457	// add a tab escape, because we have set
458	// the first line margin to the left of the
459	// left margin
460	AppendString(pct,
461	TXVESC_TAB);
462	} */
463	}
464
465	VOID TagBR(PCOPYTARGET pct)
466	{
467	AppendChar(pct,
468	'\r');
469
470	if (pct->ulListLevel)
471	{
472	// if we are currently in a list, we must also
473	// add a tab escape, because we have set
474	// the first line margin to the left of the
475	// left margin
476	AppendString(pct,
477	TXVESC_TAB);
478	}
479	if (!pct->fPRE)
480	pct->fSkipNextSpace = TRUE;
481	}
482
483	VOID TagPRE(PCOPYTARGET pct)
484	{
485	// start of PRE tag:
486	// add \n before any other character
487	// pct->fNeedsLinebreak = TRUE;
488	AppendChar(pct, '\n');
489	pct->fNeedsLinebreak = FALSE;
490	/* AppendString(pct,
491	TXVESC_PRE_BEGIN); */
492	AppendEscapeWith3Decimals(pct,
493	TXVESC_SET_FONT,
494	1); // monospaced font
495	AppendEscapeWith4Decimals(pct,
496	TXVESC_SPACEBEFORE,
497	0); // no spacing before
498	AppendEscapeWith4Decimals(pct,
499	TXVESC_SPACEAFTER,
500	0); // no spacing after
501	// disable word-wrapping
502	AppendString(pct,
503	TXVESC_WORDWRAP "0");
504	pct->fPRE = TRUE;
505	pct->fSkipNextSpace = FALSE;
506	}
507
508	VOID TagXPRE(PCOPYTARGET pct)
509	{
510	pct->fPRE = FALSE;
511	AppendEscapeWith3Decimals(pct,
512	TXVESC_SET_FONT,
513	0); // standard font
514	AppendString(pct, TXVESC_SPACEBEFORE);
515	AppendString(pct, "####"); // reset to default
516	AppendString(pct, TXVESC_SPACEAFTER);
517	AppendString(pct, "####"); // reset to default
518	// re-enable word-wrapping
519	AppendString(pct,
520	TXVESC_WORDWRAP "1"
521	"\n"); // force line break
522	pct->fNeedsLinebreak = FALSE;
523	// refuse to add \n even if we have another "p" coming up
524	pct->fSkipNextLinebreak = TRUE;
525	pct->fSkipNextSpace = TRUE;
526	}
527
528	VOID TagH1(PCOPYTARGET pct)
529	{
530	pct->fNeedsLinebreak = TRUE;
531	AppendEscapeWith3Decimals(pct,
532	TXVESC_POINTSIZE_REL,
533	200); // double size
534	AppendString(pct,
535	TXVESC_BOLD_BEGIN);
536	}
537
538	VOID TagXH1(PCOPYTARGET pct)
539	{
540	AppendString(pct,
541	TXVESC_BOLD_END);
542	AppendEscapeWith3Decimals(pct,
543	TXVESC_POINTSIZE_REL,
544	100); // regular size
545	// add \n before any other character
546	pct->fNeedsLinebreak = TRUE;
547	}
548
549	VOID TagH2(PCOPYTARGET pct)
550	{
551	pct->fNeedsLinebreak = TRUE;
552	AppendEscapeWith3Decimals(pct,
553	TXVESC_POINTSIZE_REL,
554	175); // size in percent of regular point size
555	AppendString(pct,
556	TXVESC_BOLD_BEGIN);
557	}
558
559	VOID TagXH2(PCOPYTARGET pct)
560	{
561	AppendString(pct,
562	TXVESC_BOLD_END);
563	AppendEscapeWith3Decimals(pct,
564	TXVESC_POINTSIZE_REL,
565	100); // regular size
566	// add \n before any other character
567	pct->fNeedsLinebreak = TRUE;
568	}
569
570	VOID TagH3(PCOPYTARGET pct)
571	{
572	pct->fNeedsLinebreak = TRUE;
573	AppendEscapeWith3Decimals(pct,
574	TXVESC_POINTSIZE_REL,
575	150); // size in percent of regular point size
576	AppendString(pct,
577	TXVESC_BOLD_BEGIN);
578	}
579
580	VOID TagXH3(PCOPYTARGET pct)
581	{
582	AppendString(pct,
583	TXVESC_BOLD_END);
584	AppendEscapeWith3Decimals(pct,
585	TXVESC_POINTSIZE_REL,
586	100); // size in percent of regular point size
587	// add \n before any other character
588	pct->fNeedsLinebreak = TRUE;
589	}
590
591	VOID TagH4(PCOPYTARGET pct)
592	{
593	pct->fNeedsLinebreak = TRUE;
594	AppendEscapeWith3Decimals(pct,
595	TXVESC_POINTSIZE_REL,
596	125); // size in percent of regular point size
597	AppendString(pct,
598	TXVESC_BOLD_BEGIN);
599	}
600
601	VOID TagXH4(PCOPYTARGET pct)
602	{
603	AppendString(pct,
604	TXVESC_BOLD_END);
605	AppendEscapeWith3Decimals(pct,
606	TXVESC_POINTSIZE_REL,
607	100); // regular size
608	// add \n before any other character
609	pct->fNeedsLinebreak = TRUE;
610	}
611
612	VOID TagH5(PCOPYTARGET pct)
613	{
614	pct->fNeedsLinebreak = TRUE;
615	AppendEscapeWith3Decimals(pct,
616	TXVESC_POINTSIZE_REL,
617	100); // size in percent of regular point size
618	AppendString(pct,
619	TXVESC_BOLD_BEGIN);
620	}
621
622	VOID TagXH5(PCOPYTARGET pct)
623	{
624	AppendString(pct,
625	TXVESC_BOLD_END);
626	AppendEscapeWith3Decimals(pct,
627	TXVESC_POINTSIZE_REL,
628	100); // regular size
629	// add \n before any other character
630	pct->fNeedsLinebreak = TRUE;
631	}
632
633	VOID TagH6(PCOPYTARGET pct)
634	{
635	pct->fNeedsLinebreak = TRUE;
636	AppendEscapeWith3Decimals(pct,
637	TXVESC_POINTSIZE_REL,
638	80 ); // size in percent of regular point size
639	AppendString(pct,
640	TXVESC_BOLD_BEGIN);
641	}
642
643	VOID TagXH6(PCOPYTARGET pct)
644	{
645	AppendString(pct,
646	TXVESC_BOLD_END);
647	AppendEscapeWith3Decimals(pct,
648	TXVESC_POINTSIZE_REL,
649	100); // regular size
650	// add \n before any other character
651	pct->fNeedsLinebreak = TRUE;
652	}
653
654	VOID TagUL(PCOPYTARGET pct)
655	{
656	StartList(pct,
657	0); // unordered
658	}
659
660	VOID TagXUL(PCOPYTARGET pct)
661	{
662	StopList(pct);
663	}
664
665	VOID TagOL(PCOPYTARGET pct)
666	{
667	StartList(pct,
668	1); // ordered
669	}
670
671	VOID TagXOL(PCOPYTARGET pct)
672	{
673	StopList(pct);
674	}
675
676	VOID TagLI(PCOPYTARGET pct)
677	{
678	PLISTDESC pListDesc;
679	CHAR szMarker[20] = TXVESC_MARKER "\x01";
680
681	if (pct->ulListLevel)
682	{
683	// we're in a list:
684	pListDesc = (PLISTDESC)lstItemFromIndex(&pct->llLists,
685	pct->ulListLevel - 1);
686	if (pListDesc)
687	{
688	if (pListDesc->ulListType == 1)
689	// is ordered list:
690	sprintf(szMarker, "%lu.", (pListDesc->ulItem)++);
691	else if (pListDesc->ulListType == 0)
692	// is unordered list:
693	// set bullet type according to unordered nesting
694	szMarker[2] = pct->ulUnorderedListLevel;
695	}
696	}
697
698	// add \n before any other character
699	// pct->fNeedsLinebreak = TRUE;
700	// if (pct->fNeedsLinebreak)
701	{
702	AppendChar(pct, '\n');
703	pct->fNeedsLinebreak = FALSE;
704	}
705
706	AppendString(pct, szMarker);
707	AppendString(pct, TXVESC_TAB);
708	}
709
710	VOID TagDL(PCOPYTARGET pct)
711	{
712	StartList(pct,
713	2); // definition list
714	}
715
716	VOID TagXDL(PCOPYTARGET pct)
717	{
718	StopList(pct);
719	pct->fInDT = FALSE;
720	}
721
722	VOID TagDT(PCOPYTARGET pct)
723	{
724	pct->fNeedsLinebreak = TRUE;
725	pct->fInDT = TRUE;
726	}
727
728	VOID TagDD(PCOPYTARGET pct)
729	{
730	pct->fNeedsLinebreak = TRUE;
731	AppendString(pct, TXVESC_TAB);
732	if (!pct->fPRE)
733	pct->fSkipNextSpace = TRUE;
734	pct->fInDT = FALSE;
735	}
736
737	VOID TagTR(PCOPYTARGET pct)
738	{
739	pct->fNeedsLinebreak = TRUE;
740	}
741
742	VOID TagB(PCOPYTARGET pct)
743	{
744	AppendString(pct,
745	TXVESC_BOLD_BEGIN);
746	}
747
748	VOID TagXB(PCOPYTARGET pct)
749	{
750	AppendString(pct,
751	TXVESC_BOLD_END);
752	}
753
754	VOID TagI(PCOPYTARGET pct)
755	{
756	AppendString(pct,
757	TXVESC_ITALICS_BEGIN);
758	}
759
760	VOID TagXI(PCOPYTARGET pct)
761	{
762	AppendString(pct,
763	TXVESC_ITALICS_END);
764	}
765
766	VOID TagU(PCOPYTARGET pct)
767	{
768	AppendString(pct,
769	TXVESC_UNDERLINE_BEGIN);
770	}
771
772	VOID TagXU(PCOPYTARGET pct)
773	{
774	AppendString(pct,
775	TXVESC_UNDERLINE_END);
776	}
777
778	VOID TagSTRIKE(PCOPYTARGET pct)
779	{
780	AppendString(pct,
781	TXVESC_STRIKE_BEGIN);
782	}
783
784	VOID TagXSTRIKE(PCOPYTARGET pct)
785	{
786	AppendString(pct,
787	TXVESC_STRIKE_END);
788	}
789
790	VOID TagCODE(PCOPYTARGET pct)
791	{
792	AppendEscapeWith3Decimals(pct,
793	TXVESC_SET_FONT,
794	1); // monospaced font
795	}
796
797	VOID TagXCODE(PCOPYTARGET pct)
798	{
799	AppendEscapeWith3Decimals(pct,
800	TXVESC_SET_FONT,
801	0); // regular font
802	}
803
804	VOID TagA(PCOPYTARGET pct)
805	{
806	CHAR szAnchor[10];
807
808	pct->fInLink = FALSE;
809
810	if ((pct->pszAttributes) && (pct->pxhtml)) // points into main source buffer
811	{
812	// we have attributes:
813	PSZ pszClosingTag = strchr(pct->pszAttributes, '>');
814	if (pszClosingTag)
815	{
816	ULONG ulOfs = 0;
817
818	/*
819	* HREF attribute:
820	*
821	*/
822
823	PSZ pHREF = strhGetTextAttr(pct->pszAttributes, "HREF", &ulOfs),
824	pNAME = 0;
825
826	// replace '>' with null char to mark end of search
827	*pszClosingTag = 0;
828
829	if (pHREF)
830	{
831	// OK, we got a link target:
832	// create a link item and append it to the output list
833	PXHTMLLINK pNewLink = (PXHTMLLINK)malloc(sizeof(XHTMLLINK));
834	memset(pNewLink, 0, sizeof(XHTMLLINK));
835
836	pct->fInLink = TRUE;
837
838	// this starts with anchor 1
839	pNewLink->usLinkIndex = ++pct->usAnchorIndex;
840	pNewLink->pszTargetFile = pHREF;
841	// do not free
842	lstAppendItem(&pct->pxhtml->llLinks, pNewLink);
843	}
844
845	/*
846	* NAME attribute:
847	*
848	*/
849
850	pNAME = strhGetTextAttr(pct->pszAttributes, "NAME", &ulOfs);
851	if (pNAME)
852	{
853	AppendString(pct,
854	TXVESC_ANCHORNAME);
855	AppendString(pct,
856	pNAME);
857	// must be terminated with 0xFF
858	AppendChar(pct, 0xFF);
859	free(pNAME);
860	}
861	// restore '>'
862	*pszClosingTag = '>';
863	}
864	}
865
866	if (pct->fInLink)
867	{
868	sprintf(szAnchor, "%04hX", pct->usAnchorIndex);
869	AppendString(pct,
870	TXVESC_LINK);
871	AppendString(pct,
872	szAnchor);
873	}
874	}
875
876	VOID TagXA(PCOPYTARGET pct)
877	{
878	if (pct->fInLink)
879	{
880	AppendString(pct,
881	TXVESC_LINK "####");
882	pct->fInLink = FALSE;
883	}
884	}
885
886	/* ******************************************************************
887	*
888	* Tag helpers
889	*
890	********************************************************************/
891
892	/*
893	*@@ FindTagProcessor:
894	* returns the Tag* function which handles the
895	* given tag or NULL if there's none.
896	*
897	*@@added V0.9.4 (2000-06-10) [umoeller]
898	*/
899
900	PFNPROCESSTAG FindTagProcessor(PSZ pszTag)
901	{
902	PFNPROCESSTAG pProcessor = NULL;
903
904	CHAR c0,
905	c1;
906
907	BOOL fEndOfTag = FALSE;
908
909	PSZ pCheck = pszTag,
910	p2;
911	if (*pCheck == '/')
912	{
913	// end of tag:
914	fEndOfTag = TRUE;
915	pCheck++;
916	}
917
918	c0 = *pCheck;
919	c1 = *(pCheck + 1);
920
921	p2 = pCheck + 2;
922
923	switch (c0)
924	{
925	case 'A':
926	case 'a':
927	switch (c1)
928	{
929	case 0: // A
930	if (!fEndOfTag)
931	return TagA;
932	else
933	return TagXA;
934	case 'D': // ADDRESS
935	case 'd': // ADDRESS
936	if (stricmp(p2, "DRESS") == 0)
937	{
938	if (!fEndOfTag)
939	return TagI;
940	else
941	return TagXI;
942	}
943	}
944	break;
945
946	case 'B':
947	case 'b':
948	switch (c1)
949	{
950	case 0:
951	if (!fEndOfTag)
952	return TagB;
953	else
954	return TagXB;
955
956	case 'R': // BR
957	case 'r': // BR
958	if (*p2 == 0)
959	if (!fEndOfTag)
960	return TagBR;
961	}
962	break;
963
964	case 'C':
965	case 'c':
966	switch (c1)
967	{
968	case 'I': // CITE
969	case 'i': // CITE
970	if (stricmp(p2, "TE") == 0)
971	{
972	if (!fEndOfTag)
973	return TagI;
974	else
975	return TagXI;
976	}
977	break;
978
979	case 'O':
980	case 'o':
981	if (stricmp(p2, "DE") == 0)
982	{
983	if (!fEndOfTag)
984	return TagCODE;
985	else
986	return TagXCODE;
987	}
988	break;
989	}
990	break;
991
992	case 'D':
993	case 'd':
994	switch (c1)
995	{
996	case 'D': // DD
997	case 'd': // DD
998	if ((*p2 == 0) && (!fEndOfTag))
999	return (TagDD);
1000	break;
1001
1002	case 'I': // DIR
1003	case 'i': // DIR
1004	if (*p2 == 'R')
1005	if (*(pCheck + 3) == 0)
1006	{
1007	if (!fEndOfTag)
1008	return TagUL;
1009	else
1010	return TagXUL;
1011	}
1012	break;
1013
1014	case 'L': // DL
1015	case 'l': // DL
1016	if (*p2 == 0)
1017	{
1018	if (!fEndOfTag)
1019	return TagDL;
1020	else
1021	return TagXDL;
1022	}
1023	break;
1024
1025	case 'T': // DT
1026	case 't': // DT
1027	if ((*p2 == 0) && (!fEndOfTag))
1028	return TagDT;
1029	break;
1030	}
1031	break;
1032
1033	case 'E':
1034	case 'e':
1035	if ( (c1 == 'M') \|\| (c1 == 'm') ) // EM
1036	if (*p2 == 0)
1037	{
1038	if (!fEndOfTag)
1039	return TagI;
1040	else
1041	return TagXI;
1042	}
1043	break;
1044
1045	case 'H':
1046	case 'h':
1047	if (c1)
1048	if (*p2 == 0)
1049	switch (c1)
1050	{
1051	case '1':
1052	if (!fEndOfTag)
1053	return TagH1;
1054	else
1055	return TagXH1;
1056	case '2':
1057	if (!fEndOfTag)
1058	return TagH2;
1059	else
1060	return TagXH2;
1061	case '3':
1062	if (!fEndOfTag)
1063	return TagH3;
1064	else
1065	return TagXH3;
1066	case '4':
1067	if (!fEndOfTag)
1068	return TagH4;
1069	else
1070	return TagXH4;
1071	case '5':
1072	if (!fEndOfTag)
1073	return TagH5;
1074	else
1075	return TagXH5;
1076	case '6':
1077	if (!fEndOfTag)
1078	return TagH6;
1079	else
1080	return TagXH6;
1081	}
1082	break;
1083
1084	case 'I':
1085	case 'i':
1086	if (c1 == 0)
1087	{
1088	if (!fEndOfTag)
1089	return TagI;
1090	else
1091	return TagXI;
1092	}
1093	break;
1094
1095	case 'L':
1096	case 'l':
1097	if ((c1 == 'I') \|\| (c1 == 'i'))
1098	if (*p2 == 0)
1099	return TagLI;
1100	break;
1101
1102	case 'M':
1103	case 'm':
1104	if (stricmp(p2, "NU") == 0)
1105	{
1106	if (!fEndOfTag)
1107	return TagUL;
1108	else
1109	return TagXUL;
1110	}
1111	break;
1112
1113	case 'O':
1114	case 'o':
1115	if ((c1 == 'L') \|\| (c1 == 'l'))
1116	if (*p2 == 0)
1117	{
1118	if (!fEndOfTag)
1119	return TagOL;
1120	else
1121	return TagXOL;
1122	}
1123	break;
1124
1125	case 'P':
1126	case 'p':
1127	switch (c1)
1128	{
1129	case 0:
1130	if (!fEndOfTag)
1131	return TagP;
1132	break;
1133
1134	case 'R': // PRE
1135	case 'r': // PRE
1136	if ((p2 == 'E') \|\| (p2 == 'e'))
1137	if (*(pCheck + 3) == 0)
1138	{
1139	if (!fEndOfTag)
1140	return TagPRE;
1141	else
1142	return TagXPRE;
1143	}
1144	break;
1145	}
1146	break;
1147
1148	case 'S':
1149	case 's':
1150	switch (c1)
1151	{
1152	case 'T': // STRONG
1153	case 't': // STRONG
1154	if (stricmp(p2, "RONG") == 0)
1155	{
1156	if (!fEndOfTag)
1157	return TagB;
1158	else
1159	return TagXB;
1160	}
1161	else if (stricmp(p2, "RIKE") == 0)
1162	{
1163	if (!fEndOfTag)
1164	return TagSTRIKE;
1165	else
1166	return TagXSTRIKE;
1167	}
1168	break;
1169
1170	case 'A':
1171	case 'a':
1172	if (stricmp(p2, "MP") == 0)
1173	{
1174	if (!fEndOfTag)
1175	return TagCODE;
1176	else
1177	return TagXCODE;
1178	}
1179	break;
1180	}
1181	break;
1182
1183	case 'T':
1184	case 't':
1185	switch (c1)
1186	{
1187	case 'R':
1188	case 'r':
1189	if (*p2 == 0)
1190	return TagTR;
1191	break;
1192
1193	case 'I':
1194	case 'i':
1195	if (stricmp(p2, "TLE") == 0)
1196	return TagTITLE;
1197	break;
1198
1199	case 'T': // TT
1200	case 't':
1201	if (*p2 == 0)
1202	{
1203	if (!fEndOfTag)
1204	return TagCODE;
1205	else
1206	return TagXCODE;
1207	}
1208	break;
1209	}
1210	break;
1211
1212	case 'U':
1213	case 'u':
1214	switch (c1)
1215	{
1216	case 0:
1217	if (!fEndOfTag)
1218	return TagU;
1219	else
1220	return TagXU;
1221
1222	case 'L':
1223	case 'l':
1224	if (*p2 == 0)
1225	{
1226	if (!fEndOfTag)
1227	return TagUL;
1228	else
1229	return TagXUL;
1230	}
1231	break;
1232	}
1233	break;
1234
1235	case 'V':
1236	case 'v':
1237	if (stricmp(p2, "R") == 0)
1238	{
1239	if (!fEndOfTag)
1240	return TagI;
1241	else
1242	return TagXI;
1243	}
1244	break;
1245
1246	case 'X':
1247	case 'x':
1248	if (stricmp(p2, "MP") == 0) // XMP
1249	{
1250	if (!fEndOfTag)
1251	return TagPRE;
1252	else
1253	return TagXPRE;
1254	}
1255	break;
1256	}
1257
1258	return (pProcessor);
1259	}
1260
1261	/*
1262	*@@ HandleTag:
1263	* called by txvConvertFromHTML when a "<" character
1264	* is found in the source buffer. This calls
1265	* FindTagProcessor in turn to find the Tag*
1266	* function which handles the tag.
1267	*
1268	*@@added V0.9.3 (2000-05-18) [umoeller]
1269	*/
1270
1271	VOID HandleTag(PCOPYTARGET pct)
1272	{
1273	PSZ pStartOfTag = pct->pSource;
1274	// '<' == begin of tag:
1275
1276	// is it a comment? <!-- ... -->
1277	if (strncmp(pStartOfTag + 1, "!--", 3) == 0)
1278	{
1279	// start of comment:
1280	// find end of comment
1281	PSZ pEnd = strstr(pStartOfTag, "-->");
1282	if (pEnd)
1283	// found:
1284	// search on after end of comment
1285	pct->pSource = pEnd + 3;
1286	else
1287	{
1288	// end of comment not found:
1289	// stop formatting...
1290	pct->pSource++;
1291	return;
1292	}
1293	}
1294	else
1295	{
1296	// no comment:
1297	// find end of tag
1298	PSZ p2 = pStartOfTag + 1,
1299	pNextClose = 0, // receives first '>' after '<'
1300	pNextSpace = 0; // receives first ' ' after '<'
1301	BOOL fCont = TRUE;
1302	while (fCont)
1303	{
1304	switch (*p2)
1305	{
1306	case ' ':
1307	case '\r':
1308	case '\n':
1309	// store first space after '<'
1310	if (!pNextSpace)
1311	pNextSpace = p2;
1312	// overwrite line breaks with spaces;
1313	// otherwise we cannot handle tags which go across
1314	// several lines, which is valid HTML
1315	*p2 = ' ';
1316	break;
1317
1318	case '>': // end of tag found:
1319	pNextClose = p2;
1320	fCont = FALSE;
1321	break;
1322
1323	case '<':
1324	// another opening tag:
1325	// that's an HTML error
1326	AppendChar(pct,
1327	*pct->pSource++);
1328	fCont = FALSE;
1329	break;
1330
1331	case 0:
1332	fCont = FALSE;
1333	break;
1334	}
1335	p2++;
1336	}
1337
1338	if (pNextClose)
1339	{
1340	// end of tag found:
1341	ULONG cbTag;
1342	PSZ pStartOfAttrs = 0;
1343
1344	if ((pNextSpace) && (pNextSpace < pNextClose))
1345	{
1346	// we have attributes:
1347	cbTag = pNextSpace - (pStartOfTag + 1);
1348	pStartOfAttrs = pNextSpace;
1349	}
1350	else
1351	cbTag = pNextClose - (pStartOfTag + 1);
1352
1353	if (!cbTag)
1354	{
1355	// happens if we have a "<>" in the text:
1356	// just insert the '<>' and go on, we have no tag here
1357	AppendChar(pct,
1358	*pct->pSource++);
1359	AppendChar(pct,
1360	*pct->pSource++);
1361	}
1362	else
1363	{
1364	PFNPROCESSTAG pTagProcessor;
1365
1366	pct->cSaved = *(pStartOfTag + cbTag + 1);
1367	// add a null terminator
1368	*(pStartOfTag + cbTag + 1) = 0;
1369
1370	// find corresponding tag converter function
1371	// from G_TagProcessors map
1372	pTagProcessor = FindTagProcessor(pStartOfTag + 1); // pszTag);
1373
1374	// restore char under null terminator
1375	*(pStartOfTag + cbTag + 1) = pct->cSaved;
1376
1377	// reset new source ptr; the tag handler
1378	// can modify this
1379	pct->pNewSource = NULL;
1380
1381	if (pTagProcessor)
1382	{
1383	// tag understood:
1384
1385	// terminate string after closing tag
1386	pct->cSaved = *(pNextClose + 1); // can be null byte!
1387	*(pNextClose + 1) = 0;
1388
1389	// did we have attributes?
1390	if (pNextSpace)
1391	pct->pszAttributes = pNextSpace;
1392
1393	// finally, call the tag handler
1394	(pTagProcessor) // function
1395	(pct); // argument
1396
1397	*(pNextClose + 1) = pct->cSaved;
1398	}
1399
1400	if (pct->pNewSource == NULL)
1401	// tag handler needs no special processing:
1402	// skip '>' too
1403	pct->pSource = pNextClose + 1;
1404	else
1405	// tag handler has skipped something:
1406	pct->pSource = pct->pNewSource;
1407	}
1408	}
1409	}
1410	}
1411
1412	/*
1413	*@@ ConvertEscape:
1414	* called by HandleEscape to find the ANSI (CP 1004)
1415	* character for the given escape sequence (pszTag).
1416	*
1417	* pszTag contains the stuff between "&" and ";".
1418	*
1419	* This is really ugly spaghetti, but it's the fastest
1420	* way to do it.
1421	*
1422	*@@added V0.9.4 (2000-06-10) [umoeller]
1423	*/
1424
1425	unsigned char ConvertEscape(PSZ pszTag)
1426	{
1427	CHAR c0, c1;
1428	CHAR crc = 0;
1429
1430	PSZ p2 = pszTag + 2;
1431
1432	c0 = *pszTag;
1433	c1 = *(pszTag + 1);
1434
1435	switch (c0)
1436	{
1437	case 'a':
1438	switch (c1)
1439	{
1440	case 'a':
1441	if (strcmp(p2, "cute") == 0)
1442	return 225;
1443	break;
1444
1445	case 'c':
1446	if (strcmp(p2, "irc") == 0)
1447	return 226;
1448	else if (strcmp(p2, "ute") == 0)
1449	return 180;
1450	break;
1451
1452	case 'e':
1453	if (strcmp(p2, "lig") == 0)
1454	return 230;
1455	break;
1456
1457	case 'g':
1458	if (strcmp(p2, "rave") == 0)
1459	return 224;
1460	break;
1461
1462	case 'm':
1463	if (strcmp(p2, "p") == 0)
1464	return '&';
1465	break;
1466
1467	case 'r':
1468	if (strcmp(p2, "ing") == 0)
1469	return 229;
1470	break;
1471
1472	case 't':
1473	if (strcmp(p2, "ilde") == 0)
1474	return 227;
1475	break;
1476
1477	case 'u':
1478	if (strcmp(p2, "ml") == 0)
1479	return 228;
1480	break;
1481	}
1482	break;
1483
1484	case 'b':
1485	if (strcmp(pszTag + 1, "rvbar") == 0)
1486	return 166;
1487	break;
1488
1489	case 'c':
1490	switch (c1)
1491	{
1492	case 'c':
1493	if (strcmp(p2, "edil") == 0)
1494	return 231;
1495	break;
1496
1497	case 'e':
1498	if (strcmp(p2, "dil") == 0)
1499	return 184;
1500	else if (strcmp(p2, "nt") == 0)
1501	return 162;
1502	break;
1503
1504	case 'o':
1505	if (strcmp(p2, "py") == 0)
1506	return 169;
1507	break;
1508
1509	case 'u':
1510	if (strcmp(p2, "rren") == 0)
1511	return 164;
1512	}
1513	break;
1514
1515	case 'd':
1516	switch (c1)
1517	{
1518	case 'e':
1519	if (strcmp(p2, "g") == 0) return 176;
1520	break;
1521
1522	case 'i':
1523	if (strcmp(p2, "vide") == 0) return 247;
1524	break;
1525	}
1526	break;
1527
1528	case 'e':
1529	switch (c1)
1530	{
1531	case 'a':
1532	if (strcmp(p2, "cute") == 0) return 233;
1533	break;
1534
1535	case 'c':
1536	if (strcmp(p2, "irc") == 0) return 234;
1537	break;
1538
1539	case 'g':
1540	if (strcmp(p2, "rave") == 0) return 232;
1541	break;
1542
1543	case 't':
1544	if (strcmp(p2, "h") == 0) return 240;
1545	break;
1546
1547	case 'u':
1548	if (strcmp(p2, "ml") == 0) return 235;
1549	break;
1550	}
1551	break;
1552
1553	case 'f':
1554	switch (c1)
1555	{
1556	case 'r':
1557	if (strcmp(p2, "ac14") == 0) return 188;
1558	if (strcmp(p2, "ac12") == 0) return 189;
1559	if (strcmp(p2, "ac34") == 0) return 190;
1560	break;
1561	}
1562	break;
1563
1564	case 'g':
1565	switch (c1)
1566	{
1567	case 't':
1568	if (*p2 == 0) return '>';
1569	}
1570	break;
1571
1572	case 'i':
1573	switch (c1)
1574	{
1575	case 'a':
1576	if (strcmp(p2, "cute") == 0) return 237;
1577	break;
1578
1579	case 'c':
1580	if (strcmp(p2, "irc") == 0) return 238;
1581	break;
1582
1583	case 'g':
1584	if (strcmp(p2, "rave") == 0) return 236;
1585	break;
1586
1587	case 'e':
1588	if (strcmp(p2, "xcl") == 0) return 161;
1589	break;
1590
1591	case 'q':
1592	if (strcmp(p2, "uest") == 0) return 191;
1593	break;
1594
1595	case 'u':
1596	if (strcmp(p2, "ml") == 0) return 239;
1597	}
1598	break;
1599
1600	case 'l':
1601	switch (c1)
1602	{
1603	case 't':
1604	if (*p2 == 0)
1605	return '<';
1606	break;
1607
1608	case 'a':
1609	if (strcmp(p2, "quo") == 0) return 171;
1610	}
1611	break;
1612
1613	case 'm':
1614	switch (c1)
1615	{
1616	case 'a':
1617	if (strcmp(p2, "cr") == 0) return 175;
1618	break;
1619
1620	case 'i':
1621	if (strcmp(p2, "cro") == 0) return 181;
1622	if (strcmp(p2, "ddot") == 0) return 183;
1623	break;
1624	}
1625	break;
1626
1627	case 'n':
1628	switch (c1)
1629	{
1630	case 'b':
1631	if (strcmp(p2, "sp") == 0) return 160;
1632	break;
1633
1634	case 'o':
1635	if (strcmp(p2, "t") == 0) return 172;
1636	break;
1637
1638	case 't':
1639	if (strcmp(p2, "ilde") == 0) return 241;
1640	}
1641	break;
1642
1643	case 'o':
1644	switch (c1)
1645	{
1646	case 'a':
1647	if (strcmp(p2, "cute") == 0) return 243;
1648	break;
1649
1650	case 'c':
1651	if (strcmp(p2, "irc") == 0) return 244;
1652	break;
1653
1654	case 'g':
1655	if (strcmp(p2, "rave") == 0) return 242;
1656	break;
1657
1658	case 'r':
1659	if (strcmp(p2, "df") == 0) return 170;
1660	if (strcmp(p2, "dm") == 0) return 186;
1661	break;
1662
1663	case 's':
1664	if (strcmp(p2, "lash") == 0) return 248;
1665	break;
1666
1667	case 't':
1668	if (strcmp(p2, "ilde") == 0) return 245;
1669	break;
1670
1671	case 'u':
1672	if (strcmp(p2, "ml") == 0) return 246;
1673	}
1674	break;
1675
1676	case 'p':
1677	switch (c1)
1678	{
1679	case 'a':
1680	if (strcmp(p2, "ra") == 0) return 182;
1681	break;
1682
1683	case 'l':
1684	if (strcmp(p2, "usmn") == 0) return 177;
1685	break;
1686
1687	case 'o':
1688	if (strcmp(p2, "und") == 0) return 163;
1689	}
1690	break;
1691
1692	case 'q':
1693	if (strcmp(pszTag, "quot") == 0) return '"';
1694	break;
1695
1696	case 'r':
1697	if (strcmp(pszTag, "raquo") == 0) return 187;
1698	if (strcmp(pszTag, "reg") == 0) return 174;
1699	break;
1700
1701	case 's':
1702	switch (c1)
1703	{
1704	case 'z':
1705	if (strcmp(p2, "lig") == 0) return 223;
1706	break;
1707
1708	case 'e':
1709	if (strcmp(p2, "ct") == 0) return 167;
1710	break;
1711
1712	case 'h':
1713	if (strcmp(p2, "y") == 0) return 173;
1714	break;
1715
1716	case 'u':
1717	if (strcmp(p2, "p1") == 0) return 185;
1718	if (strcmp(p2, "p2") == 0) return 178;
1719	if (strcmp(p2, "p3") == 0) return 179;
1720	}
1721	break;
1722
1723	case 't':
1724	if (strcmp(pszTag, "thorn") == 0) return 254;
1725	if (strcmp(pszTag, "times") == 0) return 215;
1726	break;
1727
1728	case 'u':
1729	switch (c1)
1730	{
1731	case 'a':
1732	if (strcmp(p2, "cute") == 0) return 250;
1733	break;
1734
1735	case 'c':
1736	if (strcmp(p2, "irc") == 0) return 251;
1737	break;
1738
1739	case 'g':
1740	if (strcmp(p2, "rave") == 0) return 249;
1741	break;
1742
1743	case 'm':
1744	if (strcmp(p2, "l") == 0) return 168;
1745	break;
1746
1747	case 'u':
1748	if (strcmp(p2, "ml") == 0) return 252;
1749	}
1750	break;
1751
1752	case 'y':
1753	if (strcmp(pszTag, "yacute") == 0) return 253;
1754	if (strcmp(pszTag, "yen") == 0) return 165;
1755	if (strcmp(pszTag, "yuml") == 0) return 255;
1756	break;
1757
1758	case 'A':
1759	switch (c1)
1760	{
1761	case 'u':
1762	if (strcmp(p2, "ml") == 0) return 196;
1763	break;
1764
1765	case 'a':
1766	if (strcmp(p2, "cute") == 0) return 193;
1767	break;
1768
1769	case 'c':
1770	if (strcmp(p2, "irc") == 0) return 194;
1771	break;
1772
1773	case 'E':
1774	if (strcmp(p2, "lig") == 0) return 198;
1775	break;
1776
1777	case 'g':
1778	if (strcmp(p2, "rave") == 0) return 192;
1779	break;
1780
1781	case 'r':
1782	if (strcmp(p2, "ing") == 0) return 197;
1783	break;
1784
1785	case 't':
1786	if (strcmp(p2, "ilde") == 0) return 195;
1787	}
1788	break;
1789
1790	case 'C':
1791	if (strcmp(pszTag, "Ccedil") == 0) return 199;
1792	break;
1793
1794	case 'E':
1795	if (strcmp(pszTag, "Ecirc") == 0) return 202;
1796	if (strcmp(pszTag, "Eacute") == 0) return 201;
1797	if (strcmp(pszTag, "Egrave") == 0) return 200;
1798	if (strcmp(pszTag, "ETH") == 0) return 208;
1799	if (strcmp(pszTag, "Euml") == 0) return 203;
1800	break;
1801
1802	case 'I':
1803	if (strcmp(pszTag, "Icirc") == 0) return 206;
1804	if (strcmp(pszTag, "Iacute") == 0) return 205;
1805	if (strcmp(pszTag, "Igrave") == 0) return 204;
1806	if (strcmp(pszTag, "Iuml") == 0) return 207;
1807	break;
1808
1809	case 'N':
1810	if (strcmp(pszTag, "Ntilde") == 0) return 209;
1811	break;
1812
1813	case 'O':
1814	switch (c1)
1815	{
1816	case 'u':
1817	if (strcmp(p2, "ml") == 0) return 214;
1818	break;
1819
1820	case 'a':
1821	if (strcmp(p2, "cute") == 0) return 211;
1822	break;
1823
1824	case 'c':
1825	if (strcmp(p2, "irc") == 0) return 212;
1826	break;
1827
1828	case 'g':
1829	if (strcmp(p2, "rave") == 0) return 210;
1830	break;
1831
1832	case 't':
1833	if (strcmp(p2, "ilde") == 0) return 213;
1834	break;
1835
1836	case 's':
1837	if (strcmp(p2, "lash") == 0) return 216;
1838	}
1839	break;
1840
1841	case 'U':
1842	switch (c1)
1843	{
1844	case 'a':
1845	if (strcmp(p2, "cute") == 0) return 218;
1846	break;
1847
1848	case 'c':
1849	if (strcmp(p2, "irc") == 0) return 219;
1850	break;
1851
1852	case 'g':
1853	if (strcmp(p2, "rave") == 0) return 217;
1854	break;
1855
1856	case 'u':
1857	if (strcmp(p2, "ml") == 0) return 220;
1858	}
1859	break;
1860
1861	case 'T':
1862	if (strcmp(pszTag, "THORN") == 0) return 222;
1863	break;
1864
1865	case 'Y':
1866	if (strcmp(pszTag, "Yacute") == 0) return 221;
1867	break;
1868	}
1869
1870	return (crc);
1871	}
1872
1873	/*
1874	*@@ HandleEscape:
1875	* called by txvConvertFromHTML when a "&" character
1876	* is found in the source buffer. This calls
1877	* ConvertEscape in turn.
1878	*
1879	*@@added V0.9.3 (2000-05-18) [umoeller]
1880	*/
1881
1882	VOID HandleEscape(PCOPYTARGET pct)
1883	{
1884	// ampersand:
1885	// replace special characters
1886	PSZ pStartOfTag = pct->pSource;
1887	// find end of tag
1888	PSZ p2 = pStartOfTag,
1889	pNextClose = 0,
1890	pNextSpace = 0;
1891	BOOL fCont = TRUE;
1892	while (fCont)
1893	{
1894	switch (*p2)
1895	{
1896	case 0:
1897	fCont = FALSE;
1898	break;
1899
1900	case ';':
1901	pNextClose = p2;
1902	fCont = FALSE;
1903	break;
1904
1905	case ' ':
1906	if (!pNextSpace)
1907	pNextSpace = p2;
1908	break;
1909	}
1910	p2++;
1911	}
1912
1913	if (!pNextClose)
1914	// no closing tag found:
1915	// just insert the '&' and go on, we have no tag here
1916	AppendChar(pct,
1917	*pct->pSource++);
1918	else
1919	{
1920	if ((pNextSpace) && (pNextSpace < pNextClose))
1921	// space before ';':
1922	// just insert the '&' and go on, we have no tag here
1923	AppendChar(pct,
1924	*pct->pSource++);
1925	else if ((!pNextClose) \|\| (pNextClose <= pStartOfTag + 1))
1926	AppendChar(pct,
1927	*pct->pSource++);
1928	else
1929	{
1930	ULONG ulCode = 0;
1931
1932	// create substring with tag
1933	PSZ pszTag = pStartOfTag + 1;
1934	*pNextClose = 0;
1935
1936	if (*pszTag == '#')
1937	{
1938	// latin-1 or Unicode encoding ()
1939	ulCode = atoi(pszTag + 1);
1940
1941	// next input: char after ';'
1942	pct->pSource = pNextClose + 1;
1943	}
1944	else
1945	{
1946	// named entity:
1947	// find char code corresponding to escape
1948	// from G_EscapeProcessors map
1949	ulCode = ConvertEscape(pszTag);
1950	if (ulCode)
1951	// tag supported:
1952	pct->pSource = pNextClose + 1;
1953	else
1954	// tag not supported:
1955	ulCode = *pct->pSource++;
1956	}
1957
1958	// restore closing tag which we overwrote
1959	*pNextClose = ';';
1960
1961	if (ulCode)
1962	{
1963	AppendLinebreakCheck(pct);
1964
1965	AppendChar(pct,
1966	(CHAR)ulCode);
1967	pct->fSkipNextSpace = FALSE;
1968	}
1969	}
1970	}
1971	}
1972
1973	/* ******************************************************************
1974	*
1975	* Entry points
1976	*
1977	********************************************************************/
1978
1979	/*
1980	*@@ txvConvertFromHTML:
1981	* this modifies the given text string (which should
1982	* be the complete BODY block of any HTML file) so
1983	* that all HTML tags are removed and replaced with
1984	* escape sequences that the XTextView control understands.
1985	*
1986	* The buffer gets reallocated by this function, so it
1987	* must be free()'able.
1988	*
1989	* So, to have the XTextView control display an HTML file,
1990	* do this:
1991	*
1992	* 1) Load an HTML file into a buffer allocated by malloc().
1993	*
1994	* 2) Call txvConvertFromHTML.
1995	*
1996	* 3) Call WinSetWindowText on the XTextView control with
1997	* the modified buffer.
1998	*
1999	* This understands the following limited subset of HTML:
2000	*
2001	* Paragraph tags:
2002	*
2003	* -- P, BR
2004	* -- PRE, /PRE
2005	* -- UL, /UL, OL, /OL, LI
2006	* -- DL, /DL, DT, DD
2007	* -- H1, /H1 thru H6, /H6
2008	* -- Comments (<!-- .... -->)
2009	*
2010	* Character tags:
2011	*
2012	* -- B, /B, STRONG, /STRONG
2013	* -- I, /I, EM, /EM, VAR, /VAR, CITE, /CITE
2014	* -- CODE, /CODE, SAMP, /SAMP, KBD, /KBD, TT, /TT
2015	* -- U, /U
2016	* -- STRIKE, /STRIKE
2017	* -- CODE, /CODE
2018	*
2019	* The most obvious limitation is that neither tables
2020	* nor frames are supported. Also forget about CSS
2021	* and JavaScript, of course.
2022	*
2023	* All the ampersand (& something) sequences defined
2024	* in HTML 3 are properly translated.
2025	*
2026	* Note: Those are translated to the ANSI (MS-Windows,
2027	* OS/2 codepage 1004) character set. This has the
2028	* following characteristics:
2029	*
2030	* -- Codes 0-127 are identical to ASCII and thus
2031	* ISO 8559-1 ("Latin 1") also.
2032	*
2033	* -- Codes 160-255 are identical to ISO 8559-1 ("Latin 1").
2034	*
2035	* -- Codes 128-159 are NOT defined in ISO 8559-1, but
2036	* Netscape treats those as ANSI as well, so we do too.
2037	*
2038	* As a result, consider the output to be in OS/2 codepage
2039	* 1004. Either set your codepage to that (WinSetCp)
2040	* or translate the output (WinCpTranslateString).
2041	*
2042	* &#xxx; tags (with xxx being a decimal) are considered
2043	* ANSI codes as well. Even though HTML 4.0 allows Unicode
2044	* characters > 255 to be inserted this way, we ignore
2045	* those. Unicode chars from 0 to 255 are identical to
2046	* ANSI, so for to ÿ, we are HTML-compliant.
2047	*
2048	* All other tags are completely thrown out.
2049	*
2050	*@@added V0.9.3 (2000-05-06) [umoeller]
2051	*/
2052
2053	BOOL txvConvertFromHTML(char **ppszText,
2054	PVOID pxhtml, // out: various config data (PXHTMLDATA)
2055	PULONG pulProgress, // out: progress (ptr can be NULL)
2056	PBOOL pfCancel) // in: cancel flag (ptr can be NULL)
2057	{
2058	BOOL brc = TRUE;
2059
2060	ULONG cbSource = strlen(*ppszText);
2061
2062	COPYTARGET ct = {0};
2063
2064	lstInit(&ct.llLists,
2065	TRUE); // free items
2066
2067	ct.pSource = *ppszText;
2068	// skip leading spaces
2069	ct.fSkipNextSpace = TRUE;
2070	ct.pxhtml = (PXHTMLDATA)pxhtml;
2071
2072	// step 2:
2073	// actual tags formatting
2074
2075	while (TRUE)
2076	{
2077	CHAR c = *ct.pSource;
2078
2079	if (pfCancel)
2080	if (*pfCancel)
2081	{
2082	brc = FALSE;
2083	break;
2084	}
2085
2086	if (!c)
2087	// null terminator reached:
2088	break;
2089
2090	// calculate progress
2091	if (pulProgress)
2092	pulProgress = ((ct.pSource - ppszText) // characters done
2093	* 100
2094	/ cbSource); // characters total
2095
2096	switch (c)
2097	{
2098	case '<':
2099	HandleTag(&ct);
2100	break;
2101
2102	case '&':
2103	HandleEscape(&ct);
2104	break;
2105
2106	case '\r':
2107	// skip
2108	if (!ct.fSkipNextSpace)
2109	{
2110	AppendChar(&ct,
2111	' ');
2112	// ct.fNeedsLinebreak = FALSE;
2113	// but skip leading spaces which might follow
2114	if (!ct.fPRE)
2115	ct.fSkipNextSpace = TRUE;
2116	}
2117	ct.pSource++;
2118	break;
2119
2120	case '\t':
2121	{
2122	if (ct.fPRE)
2123	{
2124	ULONG ul;
2125	for (ul = 0;
2126	ul < 8;
2127	ul++)
2128	AppendChar(&ct,
2129	' ');
2130	}
2131	else
2132	{
2133	// not in PRE block:
2134	if ( (!ct.fSkipNextSpace)
2135	// && (!ct.fNeedsLinebreak)
2136	)
2137	// last was not space: copy
2138	AppendChar(&ct,
2139	' ');
2140
2141	ct.fSkipNextSpace = TRUE;
2142	}
2143
2144	// skip the tab
2145	ct.pSource++;
2146	break; }
2147
2148	case '\n':
2149	{
2150	// newline char:
2151	if (!ct.fPRE)
2152	{
2153	// if not in PRE mode, replace with space
2154	if (!ct.fSkipNextSpace)
2155	{
2156	AppendChar(&ct,
2157	' ');
2158	// ct.fNeedsLinebreak = FALSE;
2159	// but skip leading spaces which might follow
2160	ct.fSkipNextSpace = TRUE;
2161	}
2162	}
2163	else
2164	// in PRE mode, preserve line breaks
2165	AppendChar(&ct, '\n'); // ct.fNeedsLinebreak = TRUE;
2166
2167	ct.pSource++;
2168	break; }
2169
2170	case '\xFF':
2171	{
2172	AppendChar(&ct,
2173	' ');
2174	ct.pSource++;
2175	break; }
2176
2177	case ' ':
2178	if (!ct.fPRE)
2179	{
2180	// is space, and not in PRE block:
2181	if ( (!ct.fSkipNextSpace)
2182	// && (!ct.fNeedsLinebreak)
2183	)
2184	// last was not space: copy
2185	AppendChar(&ct,
2186	' ');
2187
2188	ct.fSkipNextSpace = TRUE;
2189	}
2190	else
2191	// in PRE, always add all spaces
2192	AppendChar(&ct,
2193	' ');
2194	ct.pSource++;
2195	break;
2196
2197	default:
2198	// if we're not inserting escapes or anything,
2199	// check if a linebreak is needed
2200	AppendLinebreakCheck(&ct);
2201
2202	AppendChar(&ct,
2203	*ct.pSource++);
2204	ct.fSkipNextSpace = FALSE;
2205	ct.fSkipNextLinebreak = FALSE;
2206
2207	} // end switch (*pSource);
2208	} // end while (*pSource)
2209	AppendChar(&ct,
2210	'\n');
2211	// append null-terminator
2212	AppendChar(&ct,
2213	0);
2214
2215	free(*ppszText);
2216	*ppszText = ct.pszNew;
2217
2218	lstClear(&ct.llLists);
2219
2220	return (brc);
2221	}
2222
2223

Note: See TracBrowser for help on using the repository browser.

Download in other formats: