Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/src/helpers/textv_html.c@ 9

Visit:

Last change on this file since 9 was 8, checked in by umoeller, 25 years ago
Initial checkin of helpers code which used to be in WarpIN.
Property svn:eol-style set to `CRLF` Property svn:keywords set to `Author Date Id Revision`
File size: 62.0 KB

Line
1
2	/*
3	*@@sourcefile textv_html.c:
4	* this code converts HTML code to escape sequences for the
5	* XTextView control (textview.c).
6	*
7	* This code is in part ugly spaghetti, but this is intentional to
8	* make this HTML parser FAST. In general, you get about double or
9	* triple the speed compared to Netscape 4.6 on OS/2. This code
10	* doesn't understand all of HTML though, but you get most of HTML 2.
11	* There's no tables or frames at this point.
12	*
13	* The entry point into this mess is txvConvertFromHTML, which
14	* is easy to use.
15	*
16	* Note: Version numbering in this file relates to XWorkplace version
17	* numbering.
18	*
19	*@@header "helpers\textv_html.h"
20	*
21	*@@added V0.9.3 (2000-05-10) [umoeller]
22	*/
23
24	/*
25	* Copyright (C) 2000 Ulrich Möller.
26	* This program is part of the XWorkplace package.
27	* This program is free software; you can redistribute it and/or modify
28	* it under the terms of the GNU General Public License as published by
29	* the Free Software Foundation, in version 2 as it comes in the COPYING
30	* file of the XWorkplace main distribution.
31	* This program is distributed in the hope that it will be useful,
32	* but WITHOUT ANY WARRANTY; without even the implied warranty of
33	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
34	* GNU General Public License for more details.
35	*/
36
37	#define OS2EMX_PLAIN_CHAR
38	// this is needed for "os2emx.h"; if this is defined,
39	// emx will define PSZ as _signed_ char, otherwise
40	// as unsigned char
41
42	#include <os2.h>
43
44	#include <stdlib.h>
45	#include <stdio.h>
46	#include <string.h>
47
48	#include "setup.h" // code generation and debugging options
49
50	#include "helpers\linklist.h"
51	#include "helpers\stringh.h"
52	#include "helpers\textview.h"
53
54	#include "helpers\textv_html.h"
55
56	/*
57	*@@category: Helpers\PM helpers\Window classes\XTextView control\HTML conversion
58	*/
59
60	/* ******************************************************************
61	* *
62	* Declarations *
63	* *
64	********************************************************************/
65
66	/*
67	*@@ LISTDESC:
68	* structure stored in COPYTARGET to
69	* hold list information (UL, OL, ... tags).
70	*
71	*@@added V0.9.3 (2000-05-07) [umoeller]
72	*/
73
74	typedef struct _LISTDESC
75	{
76	ULONG ulListType; // 0: unordered (UL)
77	// 1: ordered (OL)
78	// 2: definition lists (DL)
79	ULONG ulItem; // list enumeration; 1 on first item,
80	// 2 on next, ...
81	} LISTDESC, *PLISTDESC;
82
83	/*
84	*@@ COPYTARGET:
85	* monster structure which holds the current
86	* status of the HTML converter while conversion
87	* is taking place. This stores input/output pointers
88	* and various flags to avoid duplicate line breaks
89	* and such.
90	*
91	* One instance of this is created in txvConvertFromHTML
92	* on the stack and then passed to all the sub-function
93	* calls.
94	*
95	*@@added V0.9.3 (2000-05-06) [umoeller]
96	*/
97
98	typedef struct _COPYTARGET
99	{
100	PSZ pSource; // ptr into source string;
101	// valid ONLY while we're in a tag handler
102	PSZ pNewSource; // can be set by tag handler to skip characters;
103	// this is set to NULL before calling a tag
104	// handler; if this is still NULL, default
105	// processing occurs
106
107	// new string:
108	PSZ pszNew; // memory buffer
109	ULONG cbNew; // size of buffer (reallocated)
110	PSZ pTarget; // current char ptr into pszNew
111
112	// saved character while tag handler is being called
113	CHAR cSaved;
114
115	PXHTMLDATA pxhtml; // ptr to XHTMLDATA passed to txvConvertFromHTML
116
117	// formatting flags while going through the text
118	BOOL fSkipNextSpace;
119	// if TRUE, subsequent spaces are skipped
120	BOOL fNeedsLinebreak;
121	// if TRUE, \n is inserted before any other character
122	BOOL fSkipNextLinebreak;
123	// if TRUE, subsequent linebreaks are skipped
124	BOOL fPRE;
125	// are we currently in a PRE tag?
126	BOOL fInLink;
127	// are we currently in a A HREF= tag?
128
129	// arguments (attributes) for tag handlers
130	PSZ pszAttributes; // != NULL while a tag handler is being called
131	// and attributes exist for the tag
132
133	// anchors count
134	USHORT usAnchorIndex; // start with 1
135
136	// list maintenance
137	ULONG ulListLevel; // if > 0, we're in a UL or OL block;
138	// raised for each block
139	ULONG ulUnorderedListLevel; // raised with each UL block to keep track
140	// of bullets
141	ULONG ulOrderedListLevel; // raised with each UL block to keep track
142	// of 1), 2), a), b)... numbering
143	ULONG ulCurrentListType; // current list type (from highest LISTDESC)
144	BOOL fInDT; // TRUE if we're currently in a DT tag
145	LINKLIST llLists; // stack of LISTDESC items
146	} COPYTARGET, *PCOPYTARGET;
147
148	typedef VOID FNPROCESSTAG(PCOPYTARGET pct);
149	typedef FNPROCESSTAG *PFNPROCESSTAG;
150
151	/* ******************************************************************
152	* *
153	* Global variables *
154	* *
155	********************************************************************/
156
157	/* ******************************************************************
158	* *
159	* Append-char helpers *
160	* *
161	********************************************************************/
162
// number of bytes by which AppendChar and AppendString
// enlarge the output buffer each time it runs full
#define COPYTARGETALLOC 100000
164
165	/*
166	*@@ AppendChar:
167	* helper for txvConvertFromHTML to
168	* append a char to the target string
169	* in COPYTARGET.
170	* This performs a few additional checks
171	* and manages memory.
172	*
173	*@@added V0.9.3 (2000-05-06) [umoeller]
174	*/
175
176	VOID AppendChar(PCOPYTARGET pct, // in/out: formatting buffer
177	CHAR c)
178	{
179	// calculate ofs where to store next char
180	ULONG cbOfsNext = pct->pTarget - pct->pszNew;
181	if (cbOfsNext >= pct->cbNew) // have we reached the buffer size yet?
182	{
183	// more mem needed:
184	pct->cbNew += COPYTARGETALLOC;
185	pct->pszNew = (PSZ)realloc(pct->pszNew, pct->cbNew);
186	// if first call, pszNew is NULL, and realloc
187	// behaves just like malloc
188	// adjust target, because ptr might have changed
189	pct->pTarget = pct->pszNew + cbOfsNext;
190	}
191
192	// append character
193	*pct->pTarget++ = c;
194	}
195
196	/*
197	*@@ AppendString:
198	* appends the characters in *ach,
199	* which must be null-terminated.
200	* Does NOT append a null character though.
201	*
202	*@@added V0.9.3 (2000-05-06) [umoeller]
203	*/
204
205	VOID AppendString(PCOPYTARGET pct, // in/out: formatting buffer
206	char *ach)
207	{
208	ULONG cbAppend = strlen(ach);
209	ULONG ul;
210	PSZ pSource;
211
212	// calculate ofs where to store next char
213	ULONG cbOfsNext = pct->pTarget - pct->pszNew;
214	while (cbOfsNext + cbAppend >= pct->cbNew)
215	{
216	// more mem needed:
217	pct->cbNew += COPYTARGETALLOC;
218	pct->pszNew = (PSZ)realloc(pct->pszNew, pct->cbNew);
219	// if first call, pszNew is NULL, and realloc
220	// behaves just like malloc
221	// adjust target, because ptr might have changed
222	pct->pTarget = pct->pszNew + cbOfsNext;
223	}
224
225	// append characters
226	pSource = ach;
227	for (ul = 0;
228	ul < cbAppend;
229	ul++)
230	pct->pTarget++ = pSource++;
231	}
232
233	/*
234	*@@ AppendLinebreakCheck:
235	* checks if a linebreak is needed and
236	* inserts one if so.
237	*
238	*@@added V0.9.3 (2000-05-17) [umoeller]
239	*/
240
241	VOID AppendLinebreakCheck(PCOPYTARGET pct)
242	{
243	if ((!pct->fPRE) && (pct->fNeedsLinebreak))
244	{
245	// yes: insert linebreak; this resets pct->fNeedsLinebreak
246	if (!pct->fSkipNextLinebreak)
247	{
248	AppendChar(pct, '\n');
249
250	if ((pct->ulListLevel) && (!pct->fInDT))
251	// if we're in a list, add a tab also,
252	// because we'll have a negative first-line margin
253	AppendString(pct, TXVESC_TAB);
254	}
255	pct->fNeedsLinebreak = FALSE;
256	}
257	}
258
259	/*
260	*@@ AppendEscapeWithDecimal:
261	* appends the specified escape code
262	* with a three-digit decimal parameter.
263	* Calls AppendString in turn.
264	*
265	*@@added V0.9.3 (2000-05-07) [umoeller]
266	*/
267
268	VOID AppendEscapeWith3Decimals(PCOPYTARGET pct, // in/out: formatting buffer
269	char *ach,
270	USHORT us)
271	{
272	CHAR szDecimal[10];
273	if (us > 999)
274	us = 999;
275	sprintf(szDecimal, "%03d", us);
276	// append escape
277	AppendString(pct, ach);
278	AppendString(pct, szDecimal);
279	}
280
281	/*
282	*@@ AppendEscapeWith4Decimals:
283	*
284	*@@added V0.9.3 (2000-05-07) [umoeller]
285	*/
286
287	VOID AppendEscapeWith4Decimals(PCOPYTARGET pct, // in/out: formatting buffer
288	char *ach,
289	USHORT us)
290	{
291	CHAR szDecimal[10];
292	if (us > 9999)
293	us = 9999;
294	sprintf(szDecimal, "%04d", us);
295	// append escape
296	AppendString(pct, ach);
297	AppendString(pct, szDecimal);
298	}
299
300	/* ******************************************************************
301	* *
302	* Tag converter functions *
303	* *
304	********************************************************************/
305
306	/*
307	*@@ StartList:
308	* starts a list (UL or OL).
309	* This uses a linked list in COPYTARGET
310	* to keep a pseudo-stack for nested lists.
311	*
312	*@@added V0.9.3 (2000-05-08) [umoeller]
313	*/
314
315	VOID StartList(PCOPYTARGET pct, // in/out: formatting buffer
316	ULONG ulListType) // list type:
317	// 0: unordered (UL)
318	// 1: ordered (OL)
319	// 2: definition lists (DL)
320	{
321	PLISTDESC pListDesc;
322
323	// raise list level
324	pct->ulListLevel++;
325
326	if (ulListType == 0)
327	// unordered:
328	pct->ulUnorderedListLevel++;
329	else if (ulListType == 1)
330	// ordered:
331	pct->ulOrderedListLevel++;
332
333	// create LISTDESC and store it on stack
334	pListDesc = (PLISTDESC)malloc(sizeof(LISTDESC));
335	pListDesc->ulListType
336	= pct->ulCurrentListType
337	= ulListType;
338	pListDesc->ulItem = 1;
339
340	lstAppendItem(&pct->llLists,
341	pListDesc);
342
343	AppendEscapeWith4Decimals(pct,
344	TXVESC_LEFTMARGIN,
345	pct->ulListLevel * 5);
346	AppendEscapeWith3Decimals(pct,
347	TXVESC_FIRSTLINEMARGIN_LEFT,
348	(ulListType == 2)
349	? 5 // for definition lists
350	: 3); // negative!
351	// add \n before any other character
352	pct->fNeedsLinebreak = TRUE;
353	}
354
355	/*
356	*@@ StopList:
357	* stops a list (UL or OL).
358	*
359	*@@added V0.9.3 (2000-05-07) [umoeller]
360	*/
361
362	VOID StopList(PCOPYTARGET pct)
363	{
364	if (pct->ulListLevel)
365	{
366	PLISTNODE pNode;
367
368	// lower list level
369	pct->ulListLevel--;
370	AppendEscapeWith4Decimals(pct,
371	TXVESC_LEFTMARGIN,
372	pct->ulListLevel * 5);
373	AppendEscapeWith3Decimals(pct,
374	TXVESC_FIRSTLINEMARGIN_LEFT,
375	(pct->ulListLevel)
376	? 3 // we still have a list level (nested)
377	: 0);
378	pct->fNeedsLinebreak = TRUE;
379
380	// remove the LISTDESC from the stack
381	pNode = lstNodeFromIndex(&pct->llLists,
382	pct->ulListLevel); // this has been lowered already
383	if (pNode)
384	{
385	PLISTDESC pListDesc = (PLISTDESC)pNode->pItemData;
386	if (pListDesc->ulListType == 0)
387	// was unordered:
388	pct->ulUnorderedListLevel--;
389	else if (pListDesc->ulListType == 1)
390	// was ordered:
391	pct->ulOrderedListLevel--;
392
393	lstRemoveNode(&pct->llLists, pNode);
394
395	// update COPYTARGET with previous list level
396	if (pct->ulListLevel)
397	{
398	// we're still in a list (nested lists):
399	PLISTDESC pListDesc2 = (PLISTDESC)lstItemFromIndex(&pct->llLists,
400	pct->ulListLevel - 1);
401	if (pListDesc2)
402	pct->ulCurrentListType = pListDesc2->ulListType;
403	}
404	}
405	}
406	// else: buggy HTML code, ignore
407	}
408
409	/*
410	*@@ TagTITLE:
411	*
412	*@@added V0.9.3 (2000-05-19) [umoeller]
413	*/
414
415	VOID TagTITLE(PCOPYTARGET pct)
416	{
417	// pSource currently points to <TITLE tag
418	PSZ pSource = pct->pSource + strlen(pct->pSource);
419	// points to temporary null byte in main buffer now
420	*pSource = pct->cSaved;
421
422	pSource = strchr(pct->pSource, '>');
423	if (pSource)
424	{
425	PSZ pNextOpen = strchr(pSource, '<');
426	if (pNextOpen)
427	{
428	// extract title
429	pct->pxhtml->pszTitle = strhSubstr(pSource + 1, pNextOpen);
430
431	if (strnicmp(pNextOpen + 1, "/TITLE", 6) == 0)
432	{
433	// closing /TITLE tag found:
434	// search on after that
435	pct->pNewSource = strchr(pNextOpen, '>');
436	if (pct->pNewSource)
437	pct->pNewSource++;
438	}
439	}
440	}
441	}
442
443	/*
444	*@@ TagP:
445	*
446	*/
447
448	VOID TagP(PCOPYTARGET pct)
449	{
450	// append newline:
451	// add \n before any other character
452	pct->fNeedsLinebreak = TRUE;
453
454	/* if (pct->ulListLevel)
455	{
456	// if we are currently in a list, we must also
457	// add a tab escape, because we have set
458	// the first line margin to the left of the
459	// left margin
460	AppendString(pct,
461	TXVESC_TAB);
462	} */
463	}
464
465	VOID TagBR(PCOPYTARGET pct)
466	{
467	AppendChar(pct,
468	'\r');
469
470	if (pct->ulListLevel)
471	{
472	// if we are currently in a list, we must also
473	// add a tab escape, because we have set
474	// the first line margin to the left of the
475	// left margin
476	AppendString(pct,
477	TXVESC_TAB);
478	}
479	if (!pct->fPRE)
480	pct->fSkipNextSpace = TRUE;
481	}
482
483	VOID TagPRE(PCOPYTARGET pct)
484	{
485	// start of PRE tag:
486	// add \n before any other character
487	// pct->fNeedsLinebreak = TRUE;
488	AppendChar(pct, '\n');
489	pct->fNeedsLinebreak = FALSE;
490	/* AppendString(pct,
491	TXVESC_PRE_BEGIN); */
492	AppendEscapeWith3Decimals(pct,
493	TXVESC_SET_FONT,
494	1); // monospaced font
495	AppendEscapeWith4Decimals(pct,
496	TXVESC_SPACEBEFORE,
497	0); // no spacing before
498	AppendEscapeWith4Decimals(pct,
499	TXVESC_SPACEAFTER,
500	0); // no spacing after
501	// disable word-wrapping
502	AppendString(pct,
503	TXVESC_WORDWRAP "0");
504	pct->fPRE = TRUE;
505	pct->fSkipNextSpace = FALSE;
506	}
507
508	VOID TagXPRE(PCOPYTARGET pct)
509	{
510	pct->fPRE = FALSE;
511	AppendEscapeWith3Decimals(pct,
512	TXVESC_SET_FONT,
513	0); // standard font
514	AppendString(pct, TXVESC_SPACEBEFORE);
515	AppendString(pct, "####"); // reset to default
516	AppendString(pct, TXVESC_SPACEAFTER);
517	AppendString(pct, "####"); // reset to default
518	// re-enable word-wrapping
519	AppendString(pct,
520	TXVESC_WORDWRAP "1"
521	"\n"); // force line break
522	pct->fNeedsLinebreak = FALSE;
523	// refuse to add \n even if we have another "p" coming up
524	pct->fSkipNextLinebreak = TRUE;
525	pct->fSkipNextSpace = TRUE;
526	}
527
528	VOID TagH1(PCOPYTARGET pct)
529	{
530	pct->fNeedsLinebreak = TRUE;
531	AppendEscapeWith3Decimals(pct,
532	TXVESC_POINTSIZE_REL,
533	200); // double size
534	AppendString(pct,
535	TXVESC_BOLD_BEGIN);
536	}
537
538	VOID TagXH1(PCOPYTARGET pct)
539	{
540	AppendString(pct,
541	TXVESC_BOLD_END);
542	AppendEscapeWith3Decimals(pct,
543	TXVESC_POINTSIZE_REL,
544	100); // regular size
545	// add \n before any other character
546	pct->fNeedsLinebreak = TRUE;
547	}
548
549	VOID TagH2(PCOPYTARGET pct)
550	{
551	pct->fNeedsLinebreak = TRUE;
552	AppendEscapeWith3Decimals(pct,
553	TXVESC_POINTSIZE_REL,
554	175); // size in percent of regular point size
555	AppendString(pct,
556	TXVESC_BOLD_BEGIN);
557	}
558
559	VOID TagXH2(PCOPYTARGET pct)
560	{
561	AppendString(pct,
562	TXVESC_BOLD_END);
563	AppendEscapeWith3Decimals(pct,
564	TXVESC_POINTSIZE_REL,
565	100); // regular size
566	// add \n before any other character
567	pct->fNeedsLinebreak = TRUE;
568	}
569
570	VOID TagH3(PCOPYTARGET pct)
571	{
572	pct->fNeedsLinebreak = TRUE;
573	AppendEscapeWith3Decimals(pct,
574	TXVESC_POINTSIZE_REL,
575	150); // size in percent of regular point size
576	AppendString(pct,
577	TXVESC_BOLD_BEGIN);
578	}
579
580	VOID TagXH3(PCOPYTARGET pct)
581	{
582	AppendString(pct,
583	TXVESC_BOLD_END);
584	AppendEscapeWith3Decimals(pct,
585	TXVESC_POINTSIZE_REL,
586	100); // size in percent of regular point size
587	// add \n before any other character
588	pct->fNeedsLinebreak = TRUE;
589	}
590
591	VOID TagH4(PCOPYTARGET pct)
592	{
593	pct->fNeedsLinebreak = TRUE;
594	AppendEscapeWith3Decimals(pct,
595	TXVESC_POINTSIZE_REL,
596	125); // size in percent of regular point size
597	AppendString(pct,
598	TXVESC_BOLD_BEGIN);
599	}
600
601	VOID TagXH4(PCOPYTARGET pct)
602	{
603	AppendString(pct,
604	TXVESC_BOLD_END);
605	AppendEscapeWith3Decimals(pct,
606	TXVESC_POINTSIZE_REL,
607	100); // regular size
608	// add \n before any other character
609	pct->fNeedsLinebreak = TRUE;
610	}
611
612	VOID TagH5(PCOPYTARGET pct)
613	{
614	pct->fNeedsLinebreak = TRUE;
615	AppendEscapeWith3Decimals(pct,
616	TXVESC_POINTSIZE_REL,
617	100); // size in percent of regular point size
618	AppendString(pct,
619	TXVESC_BOLD_BEGIN);
620	}
621
622	VOID TagXH5(PCOPYTARGET pct)
623	{
624	AppendString(pct,
625	TXVESC_BOLD_END);
626	AppendEscapeWith3Decimals(pct,
627	TXVESC_POINTSIZE_REL,
628	100); // regular size
629	// add \n before any other character
630	pct->fNeedsLinebreak = TRUE;
631	}
632
633	VOID TagH6(PCOPYTARGET pct)
634	{
635	pct->fNeedsLinebreak = TRUE;
636	AppendEscapeWith3Decimals(pct,
637	TXVESC_POINTSIZE_REL,
638	80 ); // size in percent of regular point size
639	AppendString(pct,
640	TXVESC_BOLD_BEGIN);
641	}
642
643	VOID TagXH6(PCOPYTARGET pct)
644	{
645	AppendString(pct,
646	TXVESC_BOLD_END);
647	AppendEscapeWith3Decimals(pct,
648	TXVESC_POINTSIZE_REL,
649	100); // regular size
650	// add \n before any other character
651	pct->fNeedsLinebreak = TRUE;
652	}
653
654	VOID TagUL(PCOPYTARGET pct)
655	{
656	StartList(pct,
657	0); // unordered
658	}
659
660	VOID TagXUL(PCOPYTARGET pct)
661	{
662	StopList(pct);
663	}
664
665	VOID TagOL(PCOPYTARGET pct)
666	{
667	StartList(pct,
668	1); // ordered
669	}
670
671	VOID TagXOL(PCOPYTARGET pct)
672	{
673	StopList(pct);
674	}
675
676	VOID TagLI(PCOPYTARGET pct)
677	{
678	PLISTDESC pListDesc;
679	CHAR szMarker[20] = TXVESC_MARKER "\x01";
680
681	if (pct->ulListLevel)
682	{
683	// we're in a list:
684	pListDesc = (PLISTDESC)lstItemFromIndex(&pct->llLists,
685	pct->ulListLevel - 1);
686	if (pListDesc)
687	if (pListDesc->ulListType == 1)
688	// is ordered list:
689	sprintf(szMarker, "%d.", (pListDesc->ulItem)++);
690	else if (pListDesc->ulListType == 0)
691	// is unordered list:
692	// set bullet type according to unordered nesting
693	szMarker[2] = pct->ulUnorderedListLevel;
694	}
695
696	// add \n before any other character
697	// pct->fNeedsLinebreak = TRUE;
698	// if (pct->fNeedsLinebreak)
699	{
700	AppendChar(pct, '\n');
701	pct->fNeedsLinebreak = FALSE;
702	}
703
704	AppendString(pct, szMarker);
705	AppendString(pct, TXVESC_TAB);
706	}
707
708	VOID TagDL(PCOPYTARGET pct)
709	{
710	StartList(pct,
711	2); // definition list
712	}
713
714	VOID TagXDL(PCOPYTARGET pct)
715	{
716	StopList(pct);
717	pct->fInDT = FALSE;
718	}
719
720	VOID TagDT(PCOPYTARGET pct)
721	{
722	pct->fNeedsLinebreak = TRUE;
723	pct->fInDT = TRUE;
724	}
725
726	VOID TagDD(PCOPYTARGET pct)
727	{
728	pct->fNeedsLinebreak = TRUE;
729	AppendString(pct, TXVESC_TAB);
730	if (!pct->fPRE)
731	pct->fSkipNextSpace = TRUE;
732	pct->fInDT = FALSE;
733	}
734
735	VOID TagTR(PCOPYTARGET pct)
736	{
737	pct->fNeedsLinebreak = TRUE;
738	}
739
740	VOID TagB(PCOPYTARGET pct)
741	{
742	AppendString(pct,
743	TXVESC_BOLD_BEGIN);
744	}
745
746	VOID TagXB(PCOPYTARGET pct)
747	{
748	AppendString(pct,
749	TXVESC_BOLD_END);
750	}
751
752	VOID TagI(PCOPYTARGET pct)
753	{
754	AppendString(pct,
755	TXVESC_ITALICS_BEGIN);
756	}
757
758	VOID TagXI(PCOPYTARGET pct)
759	{
760	AppendString(pct,
761	TXVESC_ITALICS_END);
762	}
763
764	VOID TagU(PCOPYTARGET pct)
765	{
766	AppendString(pct,
767	TXVESC_UNDERLINE_BEGIN);
768	}
769
770	VOID TagXU(PCOPYTARGET pct)
771	{
772	AppendString(pct,
773	TXVESC_UNDERLINE_END);
774	}
775
776	VOID TagSTRIKE(PCOPYTARGET pct)
777	{
778	AppendString(pct,
779	TXVESC_STRIKE_BEGIN);
780	}
781
782	VOID TagXSTRIKE(PCOPYTARGET pct)
783	{
784	AppendString(pct,
785	TXVESC_STRIKE_END);
786	}
787
788	VOID TagCODE(PCOPYTARGET pct)
789	{
790	AppendEscapeWith3Decimals(pct,
791	TXVESC_SET_FONT,
792	1); // monospaced font
793	}
794
795	VOID TagXCODE(PCOPYTARGET pct)
796	{
797	AppendEscapeWith3Decimals(pct,
798	TXVESC_SET_FONT,
799	0); // regular font
800	}
801
802	VOID TagA(PCOPYTARGET pct)
803	{
804	CHAR szAnchor[10];
805
806	pct->fInLink = FALSE;
807
808	if ((pct->pszAttributes) && (pct->pxhtml)) // points into main source buffer
809	{
810	// we have attributes:
811	PSZ pszClosingTag = strchr(pct->pszAttributes, '>');
812	if (pszClosingTag)
813	{
814	ULONG ulOfs = 0;
815
816	/*
817	* HREF attribute:
818	*
819	*/
820
821	PSZ pHREF = strhGetTextAttr(pct->pszAttributes, "HREF", &ulOfs),
822	pNAME = 0;
823
824	// replace '>' with null char to mark end of search
825	*pszClosingTag = 0;
826
827	if (pHREF)
828	{
829	// OK, we got a link target:
830	// create a link item and append it to the output list
831	PXHTMLLINK pNewLink = (PXHTMLLINK)malloc(sizeof(XHTMLLINK));
832	memset(pNewLink, 0, sizeof(XHTMLLINK));
833
834	pct->fInLink = TRUE;
835
836	// this starts with anchor 1
837	pNewLink->usLinkIndex = ++pct->usAnchorIndex;
838	pNewLink->pszTargetFile = pHREF;
839	// do not free
840	lstAppendItem(&pct->pxhtml->llLinks, pNewLink);
841	}
842
843	/*
844	* NAME attribute:
845	*
846	*/
847
848	pNAME = strhGetTextAttr(pct->pszAttributes, "NAME", &ulOfs);
849	if (pNAME)
850	{
851	AppendString(pct,
852	TXVESC_ANCHORNAME);
853	AppendString(pct,
854	pNAME);
855	// must be terminated with 0xFF
856	AppendChar(pct, 0xFF);
857	free(pNAME);
858	}
859	// restore '>'
860	*pszClosingTag = '>';
861	}
862	}
863
864	if (pct->fInLink)
865	{
866	sprintf(szAnchor, "%04lX", pct->usAnchorIndex);
867	AppendString(pct,
868	TXVESC_LINK);
869	AppendString(pct,
870	szAnchor);
871	}
872	}
873
874	VOID TagXA(PCOPYTARGET pct)
875	{
876	if (pct->fInLink)
877	{
878	AppendString(pct,
879	TXVESC_LINK "####");
880	pct->fInLink = FALSE;
881	}
882	}
883
884	/* ******************************************************************
885	* *
886	* Tag helpers *
887	* *
888	********************************************************************/
889
890	/*
891	*@@ FindTagProcessor:
892	* returns the Tag* function which handles the
893	* given tag or NULL if there's none.
894	*
895	*@@added V0.9.4 (2000-06-10) [umoeller]
896	*/
897
898	PFNPROCESSTAG FindTagProcessor(PSZ pszTag)
899	{
900	PFNPROCESSTAG pProcessor = NULL;
901
902	CHAR c0,
903	c1;
904
905	BOOL fEndOfTag = FALSE;
906
907	PSZ pCheck = pszTag,
908	p2;
909	if (*pCheck == '/')
910	{
911	// end of tag:
912	fEndOfTag = TRUE;
913	pCheck++;
914	}
915
916	c0 = *pCheck;
917	c1 = *(pCheck + 1);
918
919	p2 = pCheck + 2;
920
921	switch (c0)
922	{
923	case 'A':
924	case 'a':
925	switch (c1)
926	{
927	case 0: // A
928	if (!fEndOfTag)
929	return TagA;
930	else
931	return TagXA;
932	case 'D': // ADDRESS
933	case 'd': // ADDRESS
934	if (stricmp(p2, "DRESS") == 0)
935	if (!fEndOfTag)
936	return TagI;
937	else
938	return TagXI;
939	}
940	break;
941
942	case 'B':
943	case 'b':
944	switch (c1)
945	{
946	case 0:
947	if (!fEndOfTag)
948	return TagB;
949	else
950	return TagXB;
951
952	case 'R': // BR
953	case 'r': // BR
954	if (*p2 == 0)
955	if (!fEndOfTag)
956	return TagBR;
957	}
958	break;
959
960	case 'C':
961	case 'c':
962	switch (c1)
963	{
964	case 'I': // CITE
965	case 'i': // CITE
966	if (stricmp(p2, "TE") == 0)
967	{
968	if (!fEndOfTag)
969	return TagI;
970	else
971	return TagXI;
972	}
973	break;
974
975	case 'O':
976	case 'o':
977	if (stricmp(p2, "DE") == 0)
978	if (!fEndOfTag)
979	return TagCODE;
980	else
981	return TagXCODE;
982	break;
983	}
984	break;
985
986	case 'D':
987	case 'd':
988	switch (c1)
989	{
990	case 'D': // DD
991	case 'd': // DD
992	if ((*p2 == 0) && (!fEndOfTag))
993	return (TagDD);
994	break;
995
996	case 'I': // DIR
997	case 'i': // DIR
998	if (*p2 == 'R')
999	if (*(pCheck + 3) == 0)
1000	if (!fEndOfTag)
1001	return TagUL;
1002	else
1003	return TagXUL;
1004	break;
1005
1006	case 'L': // DL
1007	case 'l': // DL
1008	if (*p2 == 0)
1009	if (!fEndOfTag)
1010	return TagDL;
1011	else
1012	return TagXDL;
1013	break;
1014
1015	case 'T': // DT
1016	case 't': // DT
1017	if ((*p2 == 0) && (!fEndOfTag))
1018	return TagDT;
1019	break;
1020	}
1021	break;
1022
1023	case 'E':
1024	case 'e':
1025	if ( (c1 == 'M') \|\| (c1 == 'm') ) // EM
1026	if (*p2 == 0)
1027	if (!fEndOfTag)
1028	return TagI;
1029	else
1030	return TagXI;
1031	break;
1032
1033	case 'H':
1034	case 'h':
1035	if (c1)
1036	if (*p2 == 0)
1037	switch (c1)
1038	{
1039	case '1':
1040	if (!fEndOfTag)
1041	return TagH1;
1042	else
1043	return TagXH1;
1044	case '2':
1045	if (!fEndOfTag)
1046	return TagH2;
1047	else
1048	return TagXH2;
1049	case '3':
1050	if (!fEndOfTag)
1051	return TagH3;
1052	else
1053	return TagXH3;
1054	case '4':
1055	if (!fEndOfTag)
1056	return TagH4;
1057	else
1058	return TagXH4;
1059	case '5':
1060	if (!fEndOfTag)
1061	return TagH5;
1062	else
1063	return TagXH5;
1064	case '6':
1065	if (!fEndOfTag)
1066	return TagH6;
1067	else
1068	return TagXH6;
1069	}
1070	break;
1071
1072	case 'I':
1073	case 'i':
1074	if (c1 == 0)
1075	if (!fEndOfTag)
1076	return TagI;
1077	else
1078	return TagXI;
1079	break;
1080
1081	case 'L':
1082	case 'l':
1083	if ((c1 == 'I') \|\| (c1 == 'i'))
1084	if (*p2 == 0)
1085	return TagLI;
1086	break;
1087
1088	case 'M':
1089	case 'm':
1090	if (stricmp(p2, "NU") == 0)
1091	if (!fEndOfTag)
1092	return TagUL;
1093	else
1094	return TagXUL;
1095	break;
1096
1097	case 'O':
1098	case 'o':
1099	if ((c1 == 'L') \|\| (c1 == 'l'))
1100	if (*p2 == 0)
1101	if (!fEndOfTag)
1102	return TagOL;
1103	else
1104	return TagXOL;
1105	break;
1106
1107	case 'P':
1108	case 'p':
1109	switch (c1)
1110	{
1111	case 0:
1112	if (!fEndOfTag)
1113	return TagP;
1114	break;
1115
1116	case 'R': // PRE
1117	case 'r': // PRE
1118	if ((p2 == 'E') \|\| (p2 == 'e'))
1119	if (*(pCheck + 3) == 0)
1120	if (!fEndOfTag)
1121	return TagPRE;
1122	else
1123	return TagXPRE;
1124	break;
1125	}
1126	break;
1127
1128	case 'S':
1129	case 's':
1130	switch (c1)
1131	{
1132	case 'T': // STRONG
1133	case 't': // STRONG
1134	if (stricmp(p2, "RONG") == 0)
1135	if (!fEndOfTag)
1136	return TagB;
1137	else
1138	return TagXB;
1139	else if (stricmp(p2, "RIKE") == 0)
1140	if (!fEndOfTag)
1141	return TagSTRIKE;
1142	else
1143	return TagXSTRIKE;
1144	break;
1145
1146	case 'A':
1147	case 'a':
1148	if (stricmp(p2, "MP") == 0)
1149	if (!fEndOfTag)
1150	return TagCODE;
1151	else
1152	return TagXCODE;
1153	break;
1154	}
1155	break;
1156
1157	case 'T':
1158	case 't':
1159	switch (c1)
1160	{
1161	case 'R':
1162	case 'r':
1163	if (*p2 == 0)
1164	return TagTR;
1165	break;
1166
1167	case 'I':
1168	case 'i':
1169	if (stricmp(p2, "TLE") == 0)
1170	return TagTITLE;
1171	break;
1172
1173	case 'T': // TT
1174	case 't':
1175	if (*p2 == 0)
1176	if (!fEndOfTag)
1177	return TagCODE;
1178	else
1179	return TagXCODE;
1180	break;
1181	}
1182	break;
1183
1184	case 'U':
1185	case 'u':
1186	switch (c1)
1187	{
1188	case 0:
1189	if (!fEndOfTag)
1190	return TagU;
1191	else
1192	return TagXU;
1193
1194	case 'L':
1195	case 'l':
1196	if (*p2 == 0)
1197	if (!fEndOfTag)
1198	return TagUL;
1199	else
1200	return TagXUL;
1201	break;
1202	}
1203	break;
1204
1205	case 'V':
1206	case 'v':
1207	if (stricmp(p2, "R") == 0)
1208	{
1209	if (!fEndOfTag)
1210	return TagI;
1211	else
1212	return TagXI;
1213	}
1214	break;
1215
1216	case 'X':
1217	case 'x':
1218	if (stricmp(p2, "MP") == 0) // XMP
1219	{
1220	if (!fEndOfTag)
1221	return TagPRE;
1222	else
1223	return TagXPRE;
1224	}
1225	break;
1226	}
1227
1228	return (pProcessor);
1229	}
1230
1231	/*
1232	*@@ HandleTag:
1233	* called by txvConvertFromHTML when a "<" character
1234	* is found in the source buffer. This calls
1235	* FindTagProcessor in turn to find the Tag*
1236	* function which handles the tag.
1237	*
1238	*@@added V0.9.3 (2000-05-18) [umoeller]
1239	*/
1240
1241	VOID HandleTag(PCOPYTARGET pct)
1242	{
1243	PSZ pStartOfTag = pct->pSource;
1244	// '<' == begin of tag:
1245
1246	// is it a comment? <!-- ... -->
1247	if (strncmp(pStartOfTag + 1, "!--", 3) == 0)
1248	{
1249	// start of comment:
1250	// find end of comment
1251	PSZ pEnd = strstr(pStartOfTag, "-->");
1252	if (pEnd)
1253	// found:
1254	// search on after end of comment
1255	pct->pSource = pEnd + 3;
1256	else
1257	{
1258	// end of comment not found:
1259	// stop formatting...
1260	pct->pSource++;
1261	return;
1262	}
1263	}
1264	else
1265	{
1266	// no comment:
1267	// find end of tag
1268	PSZ p2 = pStartOfTag + 1,
1269	pNextClose = 0, // receives first '>' after '<'
1270	pNextSpace = 0; // receives first ' ' after '<'
1271	BOOL fCont = TRUE;
1272	while (fCont)
1273	{
1274	switch (*p2)
1275	{
1276	case ' ':
1277	case '\r':
1278	case '\n':
1279	// store first space after '<'
1280	if (!pNextSpace)
1281	pNextSpace = p2;
1282	// overwrite line breaks with spaces;
1283	// otherwise we cannot handle tags which go across
1284	// several lines, which is valid HTML
1285	*p2 = ' ';
1286	break;
1287
1288	case '>': // end of tag found:
1289	pNextClose = p2;
1290	fCont = FALSE;
1291	break;
1292
1293	case '<':
1294	// another opening tag:
1295	// that's an HTML error
1296	AppendChar(pct,
1297	*pct->pSource++);
1298	fCont = FALSE;
1299	break;
1300
1301	case 0:
1302	fCont = FALSE;
1303	break;
1304	}
1305	p2++;
1306	}
1307
1308	if (pNextClose)
1309	{
1310	// end of tag found:
1311	ULONG cbTag;
1312	PSZ pStartOfAttrs = 0;
1313
1314	if ((pNextSpace) && (pNextSpace < pNextClose))
1315	{
1316	// we have attributes:
1317	cbTag = pNextSpace - (pStartOfTag + 1);
1318	pStartOfAttrs = pNextSpace;
1319	}
1320	else
1321	cbTag = pNextClose - (pStartOfTag + 1);
1322
1323	if (!cbTag)
1324	{
1325	// happens if we have a "<>" in the text:
1326	// just insert the '<>' and go on, we have no tag here
1327	AppendChar(pct,
1328	*pct->pSource++);
1329	AppendChar(pct,
1330	*pct->pSource++);
1331	}
1332	else
1333	{
1334	PFNPROCESSTAG pTagProcessor;
1335
1336	pct->cSaved = *(pStartOfTag + cbTag + 1);
1337	// add a null terminator
1338	*(pStartOfTag + cbTag + 1) = 0;
1339
1340	// find corresponding tag converter function
1341	// from G_TagProcessors map
1342	pTagProcessor = FindTagProcessor(pStartOfTag + 1); // pszTag);
1343
1344	// restore char under null terminator
1345	*(pStartOfTag + cbTag + 1) = pct->cSaved;
1346
1347	// reset new source ptr; the tag handler
1348	// can modify this
1349	pct->pNewSource = NULL;
1350
1351	if (pTagProcessor)
1352	{
1353	// tag understood:
1354
1355	// terminate string after closing tag
1356	pct->cSaved = *(pNextClose + 1); // can be null byte!
1357	*(pNextClose + 1) = 0;
1358
1359	// did we have attributes?
1360	if (pNextSpace)
1361	pct->pszAttributes = pNextSpace;
1362
1363	// finally, call the tag handler
1364	(pTagProcessor) // function
1365	(pct); // argument
1366
1367	*(pNextClose + 1) = pct->cSaved;
1368	}
1369
1370	if (pct->pNewSource == NULL)
1371	// tag handler needs no special processing:
1372	// skip '>' too
1373	pct->pSource = pNextClose + 1;
1374	else
1375	// tag handler has skipped something:
1376	pct->pSource = pct->pNewSource;
1377	}
1378	}
1379	}
1380	}
1381
1382	/*
1383	*@@ ConvertEscape:
1384	* called by HandleEscape to find the ANSI (CP 1004)
1385	* character for the given escape sequence (pszTag).
1386	*
1387	* pszTag contains the stuff between "&" and ";".
1388	*
1389	* This is really ugly spaghetti, but it's the fastest
1390	* way to do it.
1391	*
1392	*@@added V0.9.4 (2000-06-10) [umoeller]
1393	*/
1394
1395	CHAR ConvertEscape(PSZ pszTag)
1396	{
1397	CHAR c0, c1;
1398	CHAR crc = 0;
1399
1400	PSZ p2 = pszTag + 2;
1401
1402	c0 = *pszTag;
1403	c1 = *(pszTag + 1);
1404
1405	switch (c0)
1406	{
1407	case 'a':
1408	switch (c1)
1409	{
1410	case 'a':
1411	if (strcmp(p2, "cute") == 0)
1412	return 225;
1413	break;
1414
1415	case 'c':
1416	if (strcmp(p2, "irc") == 0)
1417	return 226;
1418	else if (strcmp(p2, "ute") == 0)
1419	return 180;
1420	break;
1421
1422	case 'e':
1423	if (strcmp(p2, "lig") == 0)
1424	return 230;
1425	break;
1426
1427	case 'g':
1428	if (strcmp(p2, "rave") == 0)
1429	return 224;
1430	break;
1431
1432	case 'm':
1433	if (strcmp(p2, "p") == 0)
1434	return '&';
1435	break;
1436
1437	case 'r':
1438	if (strcmp(p2, "ing") == 0)
1439	return 229;
1440	break;
1441
1442	case 't':
1443	if (strcmp(p2, "ilde") == 0)
1444	return 227;
1445	break;
1446
1447	case 'u':
1448	if (strcmp(p2, "ml") == 0)
1449	return 228;
1450	break;
1451	}
1452	break;
1453
1454	case 'b':
1455	if (strcmp(pszTag + 1, "rvbar") == 0)
1456	return 166;
1457	break;
1458
1459	case 'c':
1460	switch (c1)
1461	{
1462	case 'c':
1463	if (strcmp(p2, "edil") == 0)
1464	return 231;
1465	break;
1466
1467	case 'e':
1468	if (strcmp(p2, "dil") == 0)
1469	return 184;
1470	else if (strcmp(p2, "nt") == 0)
1471	return 162;
1472	break;
1473
1474	case 'o':
1475	if (strcmp(p2, "py") == 0)
1476	return 169;
1477	break;
1478
1479	case 'u':
1480	if (strcmp(p2, "rren") == 0)
1481	return 164;
1482	}
1483	break;
1484
1485	case 'd':
1486	switch (c1)
1487	{
1488	case 'e':
1489	if (strcmp(p2, "g") == 0) return 176;
1490	break;
1491
1492	case 'i':
1493	if (strcmp(p2, "vide") == 0) return 247;
1494	break;
1495	}
1496	break;
1497
1498	case 'e':
1499	switch (c1)
1500	{
1501	case 'a':
1502	if (strcmp(p2, "cute") == 0) return 233;
1503	break;
1504
1505	case 'c':
1506	if (strcmp(p2, "irc") == 0) return 234;
1507	break;
1508
1509	case 'g':
1510	if (strcmp(p2, "rave") == 0) return 232;
1511	break;
1512
1513	case 't':
1514	if (strcmp(p2, "h") == 0) return 240;
1515	break;
1516
1517	case 'u':
1518	if (strcmp(p2, "ml") == 0) return 235;
1519	break;
1520	}
1521	break;
1522
1523	case 'f':
1524	switch (c1)
1525	{
1526	case 'r':
1527	if (strcmp(p2, "ac14") == 0) return 188;
1528	if (strcmp(p2, "ac12") == 0) return 189;
1529	if (strcmp(p2, "ac34") == 0) return 190;
1530	break;
1531	}
1532	break;
1533
1534	case 'g':
1535	switch (c1)
1536	{
1537	case 't':
1538	if (*p2 == 0) return '>';
1539	}
1540	break;
1541
1542	case 'i':
1543	switch (c1)
1544	{
1545	case 'a':
1546	if (strcmp(p2, "cute") == 0) return 237;
1547	break;
1548
1549	case 'c':
1550	if (strcmp(p2, "irc") == 0) return 238;
1551	break;
1552
1553	case 'g':
1554	if (strcmp(p2, "rave") == 0) return 236;
1555	break;
1556
1557	case 'e':
1558	if (strcmp(p2, "xcl") == 0) return 161;
1559	break;
1560
1561	case 'q':
1562	if (strcmp(p2, "uest") == 0) return 191;
1563	break;
1564
1565	case 'u':
1566	if (strcmp(p2, "ml") == 0) return 239;
1567	}
1568	break;
1569
1570	case 'l':
1571	switch (c1)
1572	{
1573	case 't':
1574	if (*p2 == 0)
1575	return '<';
1576	break;
1577
1578	case 'a':
1579	if (strcmp(p2, "quo") == 0) return 171;
1580	}
1581	break;
1582
1583	case 'm':
1584	switch (c1)
1585	{
1586	case 'a':
1587	if (strcmp(p2, "cr") == 0) return 175;
1588	break;
1589
1590	case 'i':
1591	if (strcmp(p2, "cro") == 0) return 181;
1592	if (strcmp(p2, "ddot") == 0) return 183;
1593	break;
1594	}
1595	break;
1596
1597	case 'n':
1598	switch (c1)
1599	{
1600	case 'b':
1601	if (strcmp(p2, "sp") == 0) return 160;
1602	break;
1603
1604	case 'o':
1605	if (strcmp(p2, "t") == 0) return 172;
1606	break;
1607
1608	case 't':
1609	if (strcmp(p2, "ilde") == 0) return 241;
1610	}
1611	break;
1612
1613	case 'o':
1614	switch (c1)
1615	{
1616	case 'a':
1617	if (strcmp(p2, "cute") == 0) return 243;
1618	break;
1619
1620	case 'c':
1621	if (strcmp(p2, "irc") == 0) return 244;
1622	break;
1623
1624	case 'g':
1625	if (strcmp(p2, "rave") == 0) return 242;
1626	break;
1627
1628	case 'r':
1629	if (strcmp(p2, "df") == 0) return 170;
1630	if (strcmp(p2, "dm") == 0) return 186;
1631	break;
1632
1633	case 's':
1634	if (strcmp(p2, "lash") == 0) return 248;
1635	break;
1636
1637	case 't':
1638	if (strcmp(p2, "ilde") == 0) return 245;
1639	break;
1640
1641	case 'u':
1642	if (strcmp(p2, "ml") == 0) return 246;
1643	}
1644	break;
1645
1646	case 'p':
1647	switch (c1)
1648	{
1649	case 'a':
1650	if (strcmp(p2, "ra") == 0) return 182;
1651	break;
1652
1653	case 'l':
1654	if (strcmp(p2, "usmn") == 0) return 177;
1655	break;
1656
1657	case 'o':
1658	if (strcmp(p2, "und") == 0) return 163;
1659	}
1660	break;
1661
1662	case 'q':
1663	if (strcmp(pszTag, "quot") == 0) return '"';
1664	break;
1665
1666	case 'r':
1667	if (strcmp(pszTag, "raquo") == 0) return 187;
1668	if (strcmp(pszTag, "reg") == 0) return 174;
1669	break;
1670
1671	case 's':
1672	switch (c1)
1673	{
1674	case 'z':
1675	if (strcmp(p2, "lig") == 0) return 223;
1676	break;
1677
1678	case 'e':
1679	if (strcmp(p2, "ct") == 0) return 167;
1680	break;
1681
1682	case 'h':
1683	if (strcmp(p2, "y") == 0) return 173;
1684	break;
1685
1686	case 'u':
1687	if (strcmp(p2, "p1") == 0) return 185;
1688	if (strcmp(p2, "p2") == 0) return 178;
1689	if (strcmp(p2, "p3") == 0) return 179;
1690	}
1691	break;
1692
1693	case 't':
1694	if (strcmp(pszTag, "thorn") == 0) return 254;
1695	if (strcmp(pszTag, "times") == 0) return 215;
1696	break;
1697
1698	case 'u':
1699	switch (c1)
1700	{
1701	case 'a':
1702	if (strcmp(p2, "cute") == 0) return 250;
1703	break;
1704
1705	case 'c':
1706	if (strcmp(p2, "irc") == 0) return 251;
1707	break;
1708
1709	case 'g':
1710	if (strcmp(p2, "rave") == 0) return 249;
1711	break;
1712
1713	case 'm':
1714	if (strcmp(p2, "l") == 0) return 168;
1715	break;
1716
1717	case 'u':
1718	if (strcmp(p2, "ml") == 0) return 252;
1719	}
1720	break;
1721
1722	case 'y':
1723	if (strcmp(pszTag, "yacute") == 0) return 253;
1724	if (strcmp(pszTag, "yen") == 0) return 165;
1725	if (strcmp(pszTag, "yuml") == 0) return 255;
1726	break;
1727
1728	case 'A':
1729	switch (c1)
1730	{
1731	case 'u':
1732	if (strcmp(p2, "ml") == 0) return 196;
1733	break;
1734
1735	case 'a':
1736	if (strcmp(p2, "cute") == 0) return 193;
1737	break;
1738
1739	case 'c':
1740	if (strcmp(p2, "irc") == 0) return 194;
1741	break;
1742
1743	case 'E':
1744	if (strcmp(p2, "lig") == 0) return 198;
1745	break;
1746
1747	case 'g':
1748	if (strcmp(p2, "rave") == 0) return 192;
1749	break;
1750
1751	case 'r':
1752	if (strcmp(p2, "ing") == 0) return 197;
1753	break;
1754
1755	case 't':
1756	if (strcmp(p2, "ilde") == 0) return 195;
1757	}
1758	break;
1759
1760	case 'C':
1761	if (strcmp(pszTag, "Ccedil") == 0) return 199;
1762	break;
1763
1764	case 'E':
1765	if (strcmp(pszTag, "Ecirc") == 0) return 202;
1766	if (strcmp(pszTag, "Eacute") == 0) return 201;
1767	if (strcmp(pszTag, "Egrave") == 0) return 200;
1768	if (strcmp(pszTag, "ETH") == 0) return 208;
1769	if (strcmp(pszTag, "Euml") == 0) return 203;
1770	break;
1771
1772	case 'I':
1773	if (strcmp(pszTag, "Icirc") == 0) return 206;
1774	if (strcmp(pszTag, "Iacute") == 0) return 205;
1775	if (strcmp(pszTag, "Igrave") == 0) return 204;
1776	if (strcmp(pszTag, "Iuml") == 0) return 207;
1777	break;
1778
1779	case 'N':
1780	if (strcmp(pszTag, "Ntilde") == 0) return 209;
1781	break;
1782
1783	case 'O':
1784	switch (c1)
1785	{
1786	case 'u':
1787	if (strcmp(p2, "ml") == 0) return 214;
1788	break;
1789
1790	case 'a':
1791	if (strcmp(p2, "cute") == 0) return 211;
1792	break;
1793
1794	case 'c':
1795	if (strcmp(p2, "irc") == 0) return 212;
1796	break;
1797
1798	case 'g':
1799	if (strcmp(p2, "rave") == 0) return 210;
1800	break;
1801
1802	case 't':
1803	if (strcmp(p2, "ilde") == 0) return 213;
1804	break;
1805
1806	case 's':
1807	if (strcmp(p2, "lash") == 0) return 216;
1808	}
1809	break;
1810
1811	case 'U':
1812	switch (c1)
1813	{
1814	case 'a':
1815	if (strcmp(p2, "cute") == 0) return 218;
1816	break;
1817
1818	case 'c':
1819	if (strcmp(p2, "irc") == 0) return 219;
1820	break;
1821
1822	case 'g':
1823	if (strcmp(p2, "rave") == 0) return 217;
1824	break;
1825
1826	case 'u':
1827	if (strcmp(p2, "ml") == 0) return 220;
1828	}
1829	break;
1830
1831	case 'T':
1832	if (strcmp(pszTag, "THORN") == 0) return 222;
1833	break;
1834
1835	case 'Y':
1836	if (strcmp(pszTag, "Yacute") == 0) return 221;
1837	break;
1838	}
1839
1840	return (crc);
1841	}
1842
1843	/*
1844	*@@ HandleEscape:
1845	* called by txvConvertFromHTML when a "&" character
1846	* is found in the source buffer. This calls
1847	* ConvertEscape in turn.
1848	*
1849	*@@added V0.9.3 (2000-05-18) [umoeller]
1850	*/
1851
1852	VOID HandleEscape(PCOPYTARGET pct)
1853	{
1854	// ampersand:
1855	// replace special characters
1856	PSZ pStartOfTag = pct->pSource;
1857	// find end of tag
1858	PSZ p2 = pStartOfTag,
1859	pNextClose = 0,
1860	pNextSpace = 0;
1861	BOOL fCont = TRUE;
1862	while (fCont)
1863	{
1864	switch (*p2)
1865	{
1866	case 0:
1867	fCont = FALSE;
1868	break;
1869
1870	case ';':
1871	pNextClose = p2;
1872	fCont = FALSE;
1873	break;
1874
1875	case ' ':
1876	if (!pNextSpace)
1877	pNextSpace = p2;
1878	break;
1879	}
1880	p2++;
1881	}
1882
1883	if (!pNextClose)
1884	// no closing tag found:
1885	// just insert the '&' and go on, we have no tag here
1886	AppendChar(pct,
1887	*pct->pSource++);
1888	else
1889	{
1890	if ((pNextSpace) && (pNextSpace < pNextClose))
1891	// space before ';':
1892	// just insert the '&' and go on, we have no tag here
1893	AppendChar(pct,
1894	*pct->pSource++);
1895	else if ((!pNextClose) \|\| (pNextClose <= pStartOfTag + 1))
1896	AppendChar(pct,
1897	*pct->pSource++);
1898	else
1899	{
1900	ULONG ulCode = 0;
1901
1902	// create substring with tag
1903	PSZ pszTag = pStartOfTag + 1;
1904	*pNextClose = 0;
1905
1906	if (*pszTag == '#')
1907	{
1908	// latin-1 or Unicode encoding ()
1909	ulCode = atoi(pszTag + 1);
1910
1911	// next input: char after ';'
1912	pct->pSource = pNextClose + 1;
1913	}
1914	else
1915	{
1916	// named entity:
1917	// find char code corresponding to escape
1918	// from G_EscapeProcessors map
1919	ulCode = ConvertEscape(pszTag);
1920	if (ulCode)
1921	// tag supported:
1922	pct->pSource = pNextClose + 1;
1923	else
1924	// tag not supported:
1925	ulCode = *pct->pSource++;
1926	}
1927
1928	// restore closing tag which we overwrote
1929	*pNextClose = ';';
1930
1931	if (ulCode)
1932	{
1933	AppendLinebreakCheck(pct);
1934
1935	AppendChar(pct,
1936	(CHAR)ulCode);
1937	pct->fSkipNextSpace = FALSE;
1938	}
1939	}
1940	}
1941	}
1942
1943	/* ******************************************************************
1944	* *
1945	* Entry points *
1946	* *
1947	********************************************************************/
1948
1949	/*
1950	*@@ txvConvertFromHTML:
1951	* this modifies the given text string (which should
1952	* be the complete BODY block of any HTML file) so
1953	* that all HTML tags are removed and replaced with
1954	* escape sequences that the XTextView control understands.
1955	*
1956	* The buffer gets reallocated by this function, so it
1957	* must be free()'able.
1958	*
1959	* So, to have the XTextView control display an HTML file,
1960	* do this:
1961	*
1962	* 1) Load an HTML file into a buffer allocated by malloc().
1963	*
1964	* 2) Call txvConvertFromHTML.
1965	*
1966	* 3) Call WinSetWindowText on the XTextView control with
1967	* the modified buffer.
1968	*
1969	* This understands the following limited subset of HTML:
1970	*
1971	* Paragraph tags:
1972	*
1973	* -- P, BR
1974	* -- PRE, /PRE
1975	* -- UL, /UL, OL, /OL, LI
1976	* -- DL, /DL, DT, DD
1977	* -- H1, /H1 thru H6, /H6
1978	* -- Comments (<!-- .... -->)
1979	*
1980	* Character tags:
1981	*
1982	* -- B, /B, STRONG, /STRONG
1983	* -- I, /I, EM, /EM, VAR, /VAR, CITE, /CITE
1984	* -- CODE, /CODE, SAMP, /SAMP, KBD, /KBD, TT, /TT
1985	* -- U, /U
1986	* -- STRIKE, /STRIKE
1987	* -- CODE, /CODE
1988	*
1989	* The most obvious limitation is that neither tables
1990	* nor frames are supported. Also forget about CSS
1991	* and JavaScript, of course.
1992	*
1993	* All the ampersand (& something) sequences defined
1994	* in HTML 3 are properly translated.
1995	*
1996	* Note: Those are translated to the ANSI (MS-Windows,
1997	* OS/2 codepage 1004) character set. This has the
1998	* following characteristics:
1999	*
2000	* -- Codes 0-127 are identical to ASCII and thus
2001	* ISO 8559-1 ("Latin 1") also.
2002	*
2003	* -- Codes 160-255 are identical to ISO 8559-1 ("Latin 1").
2004	*
2005	* -- Codes 128-159 are NOT defined in ISO 8559-1, but
2006	* Netscape treats those as ANSI as well, so we do too.
2007	*
2008	* As a result, consider the output to be in OS/2 codepage
2009	* 1004. Either set your codepage to that (WinSetCp)
2010	* or translate the output (WinCpTranslateString).
2011	*
2012	* &#xxx; tags (with xxx being a decimal) are considered
2013	* ANSI codes as well. Even though HTML 4.0 allows Unicode
2014	* characters > 255 to be inserted this way, we ignore
2015	* those. Unicode chars from 0 to 255 are identical to
2016	* ANSI, so for to ÿ, we are HTML-compliant.
2017	*
2018	* All other tags are completely thrown out.
2019	*
2020	*@@added V0.9.3 (2000-05-06) [umoeller]
2021	*/
2022
2023	BOOL txvConvertFromHTML(char **ppszText,
2024	PVOID pxhtml, // out: various config data (PXHTMLDATA)
2025	PULONG pulProgress, // out: progress (ptr can be NULL)
2026	PBOOL pfCancel) // in: cancel flag (ptr can be NULL)
2027	{
2028	BOOL brc = TRUE;
2029
2030	PSZ pszNew,
2031	pTarget;
2032	ULONG cbSource = strlen(*ppszText);
2033
2034	COPYTARGET ct = {0};
2035
2036	lstInit(&ct.llLists,
2037	TRUE); // free items
2038
2039	ct.pSource = *ppszText;
2040	// skip leading spaces
2041	ct.fSkipNextSpace = TRUE;
2042	ct.pxhtml = (PXHTMLDATA)pxhtml;
2043
2044	// step 2:
2045	// actual tags formatting
2046
2047	while (TRUE)
2048	{
2049	CHAR c = *ct.pSource;
2050
2051	if (pfCancel)
2052	if (*pfCancel)
2053	{
2054	brc = FALSE;
2055	break;
2056	}
2057
2058	if (!c)
2059	// null terminator reached:
2060	break;
2061
2062	// calculate progress
2063	if (pulProgress)
2064	pulProgress = ((ct.pSource - ppszText) // characters done
2065	* 100
2066	/ cbSource); // characters total
2067
2068	switch (c)
2069	{
2070	case '<':
2071	HandleTag(&ct);
2072	break;
2073
2074	case '&':
2075	HandleEscape(&ct);
2076	break;
2077
2078	case '\r':
2079	// skip
2080	if (!ct.fSkipNextSpace)
2081	{
2082	AppendChar(&ct,
2083	' ');
2084	// ct.fNeedsLinebreak = FALSE;
2085	// but skip leading spaces which might follow
2086	if (!ct.fPRE)
2087	ct.fSkipNextSpace = TRUE;
2088	}
2089	ct.pSource++;
2090	break;
2091
2092	case '\t':
2093	{
2094	if (ct.fPRE)
2095	{
2096	ULONG ul;
2097	for (ul = 0;
2098	ul < 8;
2099	ul++)
2100	AppendChar(&ct,
2101	' ');
2102	}
2103	else
2104	{
2105	// not in PRE block:
2106	if ( (!ct.fSkipNextSpace)
2107	// && (!ct.fNeedsLinebreak)
2108	)
2109	// last was not space: copy
2110	AppendChar(&ct,
2111	' ');
2112
2113	ct.fSkipNextSpace = TRUE;
2114	}
2115
2116	// skip the tab
2117	ct.pSource++;
2118	break; }
2119
2120	case '\n':
2121	{
2122	// newline char:
2123	if (!ct.fPRE)
2124	{
2125	// if not in PRE mode, replace with space
2126	if (!ct.fSkipNextSpace)
2127	{
2128	AppendChar(&ct,
2129	' ');
2130	// ct.fNeedsLinebreak = FALSE;
2131	// but skip leading spaces which might follow
2132	ct.fSkipNextSpace = TRUE;
2133	}
2134	}
2135	else
2136	// in PRE mode, preserve line breaks
2137	AppendChar(&ct, '\n'); // ct.fNeedsLinebreak = TRUE;
2138
2139	ct.pSource++;
2140	break; }
2141
2142	case '\xFF':
2143	{
2144	AppendChar(&ct,
2145	' ');
2146	ct.pSource++;
2147	break; }
2148
2149	case ' ':
2150	if (!ct.fPRE)
2151	{
2152	// is space, and not in PRE block:
2153	if ( (!ct.fSkipNextSpace)
2154	// && (!ct.fNeedsLinebreak)
2155	)
2156	// last was not space: copy
2157	AppendChar(&ct,
2158	' ');
2159
2160	ct.fSkipNextSpace = TRUE;
2161	}
2162	else
2163	// in PRE, always add all spaces
2164	AppendChar(&ct,
2165	' ');
2166	ct.pSource++;
2167	break;
2168
2169	default:
2170	// if we're not inserting escapes or anything,
2171	// check if a linebreak is needed
2172	AppendLinebreakCheck(&ct);
2173
2174	AppendChar(&ct,
2175	*ct.pSource++);
2176	ct.fSkipNextSpace = FALSE;
2177	ct.fSkipNextLinebreak = FALSE;
2178
2179	} // end switch (*pSource);
2180	} // end while (*pSource)
2181	AppendChar(&ct,
2182	'\n');
2183	// append null-terminator
2184	AppendChar(&ct,
2185	0);
2186
2187	free(*ppszText);
2188	*ppszText = ct.pszNew;
2189
2190	lstClear(&ct.llLists);
2191
2192	return (brc);
2193	}
2194
2195

Note: See TracBrowser for help on using the repository browser.

Download in other formats: