source: trunk/src/helpers/textv_html.c@ 116

Last change on this file since 116 was 91, checked in by umoeller, 24 years ago

Misc changes

  • Property svn:eol-style set to CRLF
  • Property svn:keywords set to Author Date Id Revision
File size: 62.0 KB
Line 
1
2/*
3 *@@sourcefile textv_html.c:
4 * this code converts HTML code to escape sequences for the
5 * XTextView control (textview.c).
6 *
7 * This code is in part ugly spaghetti, but this is intentional to
8 * make this HTML parser FAST. In general, you get about double or
9 * triple the speed compared to Netscape 4.6 on OS/2. This code
10 * doesn't understand all of HTML though, but you get most of HTML 2.
11 * There's no tables or frames at this point.
12 *
13 * The entry point into this mess is txvConvertFromHTML, which
14 * is easy to use.
15 *
16 * Note: Version numbering in this file relates to XWorkplace version
17 * numbering.
18 *
19 *@@header "helpers\textv_html.h"
20 *
21 *@@added V0.9.3 (2000-05-10) [umoeller]
22 */
23
24/*
25 * Copyright (C) 2000 Ulrich Möller.
26 * This program is part of the XWorkplace package.
27 * This program is free software; you can redistribute it and/or modify
28 * it under the terms of the GNU General Public License as published by
29 * the Free Software Foundation, in version 2 as it comes in the COPYING
30 * file of the XWorkplace main distribution.
31 * This program is distributed in the hope that it will be useful,
32 * but WITHOUT ANY WARRANTY; without even the implied warranty of
33 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
34 * GNU General Public License for more details.
35 */
36
37#define OS2EMX_PLAIN_CHAR
38 // this is needed for "os2emx.h"; if this is defined,
39 // emx will define PSZ as _signed_ char, otherwise
40 // as unsigned char
41
42#include <os2.h>
43
44#include <stdlib.h>
45#include <stdio.h>
46#include <string.h>
47
48#include "setup.h" // code generation and debugging options
49
50#include "helpers\linklist.h"
51#include "helpers\stringh.h"
52#include "helpers\textview.h"
53
54#include "helpers\textv_html.h"
55
56/*
57 *@@category: Helpers\PM helpers\Window classes\XTextView control\HTML conversion
58 * see textv_html.c.
59 */
60
61/* ******************************************************************
62 *
63 * Declarations
64 *
65 ********************************************************************/
66
67/*
68 *@@ LISTDESC:
69 * structure stored in COPYTARGET to
70 * hold list information (UL, OL, ... tags).
71 *
72 *@@added V0.9.3 (2000-05-07) [umoeller]
73 */
74
75typedef struct _LISTDESC
76{
77 ULONG ulListType; // 0: unordered (UL)
78 // 1: ordered (OL)
79 // 2: definition lists (DL)
80 ULONG ulItem; // list enumeration; 1 on first item,
81 // 2 on next, ...
82} LISTDESC, *PLISTDESC;
83
84/*
85 *@@ COPYTARGET:
86 * monster structure which holds the current
87 * status of the HTML converter while conversion
88 * is taking place. This stores input/output pointers
89 * and various flags to avoid duplicate line breaks
90 * and such.
91 *
92 * One instance of this is created in txvConvertFromHTML
93 * on the stack and then passed to all the sub-function
94 * calls.
95 *
96 *@@added V0.9.3 (2000-05-06) [umoeller]
97 */
98
99typedef struct _COPYTARGET
100{
101 PSZ pSource; // ptr into source string;
102 // valid ONLY while we're in a tag handler
103 PSZ pNewSource; // can be set by tag handler to skip characters;
104 // this is set to NULL before calling a tag
105 // handler; if this is still NULL, default
106 // processing occurs
107
108 // new string:
109 PSZ pszNew; // memory buffer
110 ULONG cbNew; // size of buffer (reallocated)
111 PSZ pTarget; // current char ptr into pszNew
112
113 // saved character while tag handler is being called
114 CHAR cSaved;
115
116 PXHTMLDATA pxhtml; // ptr to XHTMLDATA passed to txvConvertFromHTML
117
118 // formatting flags while going through the text
119 BOOL fSkipNextSpace;
120 // if TRUE, subsequent spaces are skipped
121 BOOL fNeedsLinebreak;
122 // if TRUE, \n is inserted before any other character
123 BOOL fSkipNextLinebreak;
124 // if TRUE, subsequent linebreaks are skipped
125 BOOL fPRE;
126 // are we currently in a PRE tag?
127 BOOL fInLink;
128 // are we currently in a A HREF= tag?
129
130 // arguments (attributes) for tag handlers
131 PSZ pszAttributes; // != NULL while a tag handler is being called
132 // and attributes exist for the tag
133
134 // anchors count
135 USHORT usAnchorIndex; // start with 1
136
137 // list maintenance
138 ULONG ulListLevel; // if > 0, we're in a UL or OL block;
139 // raised for each block
140 ULONG ulUnorderedListLevel; // raised with each UL block to keep track
141 // of bullets
142 ULONG ulOrderedListLevel; // raised with each UL block to keep track
143 // of 1), 2), a), b)... numbering
144 ULONG ulCurrentListType; // current list type (from highest LISTDESC)
145 BOOL fInDT; // TRUE if we're currently in a DT tag
146 LINKLIST llLists; // stack of LISTDESC items
147} COPYTARGET, *PCOPYTARGET;
148
149typedef VOID FNPROCESSTAG(PCOPYTARGET pct);
150typedef FNPROCESSTAG *PFNPROCESSTAG;
151
152/* ******************************************************************
153 *
154 * Global variables
155 *
156 ********************************************************************/
157
158/* ******************************************************************
159 *
160 * Append-char helpers
161 *
162 ********************************************************************/
163
164#define COPYTARGETALLOC 100000
165
166/*
167 *@@ AppendChar:
168 * helper for txvConvertFromHTML to
169 * append a char to the target string
170 * in COPYTARGET.
171 * This performs a few additional checks
172 * and manages memory.
173 *
174 *@@added V0.9.3 (2000-05-06) [umoeller]
175 */
176
177VOID AppendChar(PCOPYTARGET pct, // in/out: formatting buffer
178 unsigned char c)
179{
180 // calculate ofs where to store next char
181 ULONG cbOfsNext = pct->pTarget - pct->pszNew;
182 if (cbOfsNext >= pct->cbNew) // have we reached the buffer size yet?
183 {
184 // more mem needed:
185 pct->cbNew += COPYTARGETALLOC;
186 pct->pszNew = (PSZ)realloc(pct->pszNew, pct->cbNew);
187 // if first call, pszNew is NULL, and realloc
188 // behaves just like malloc
189 // adjust target, because ptr might have changed
190 pct->pTarget = pct->pszNew + cbOfsNext;
191 }
192
193 // append character
194 *pct->pTarget++ = c;
195}
196
197/*
198 *@@ AppendString:
199 * appends the characters in *ach,
200 * which must be null-terminated.
201 * Does NOT append a null character though.
202 *
203 *@@added V0.9.3 (2000-05-06) [umoeller]
204 */
205
206VOID AppendString(PCOPYTARGET pct, // in/out: formatting buffer
207 char *ach)
208{
209 ULONG cbAppend = strlen(ach);
210 ULONG ul;
211 PSZ pSource;
212
213 // calculate ofs where to store next char
214 ULONG cbOfsNext = pct->pTarget - pct->pszNew;
215 while (cbOfsNext + cbAppend >= pct->cbNew)
216 {
217 // more mem needed:
218 pct->cbNew += COPYTARGETALLOC;
219 pct->pszNew = (PSZ)realloc(pct->pszNew, pct->cbNew);
220 // if first call, pszNew is NULL, and realloc
221 // behaves just like malloc
222 // adjust target, because ptr might have changed
223 pct->pTarget = pct->pszNew + cbOfsNext;
224 }
225
226 // append characters
227 pSource = ach;
228 for (ul = 0;
229 ul < cbAppend;
230 ul++)
231 *pct->pTarget++ = *pSource++;
232}
233
234/*
235 *@@ AppendLinebreakCheck:
236 * checks if a linebreak is needed and
237 * inserts one if so.
238 *
239 *@@added V0.9.3 (2000-05-17) [umoeller]
240 */
241
242VOID AppendLinebreakCheck(PCOPYTARGET pct)
243{
244 if ((!pct->fPRE) && (pct->fNeedsLinebreak))
245 {
246 // yes: insert linebreak; this resets pct->fNeedsLinebreak
247 if (!pct->fSkipNextLinebreak)
248 {
249 AppendChar(pct, '\n');
250
251 if ((pct->ulListLevel) && (!pct->fInDT))
252 // if we're in a list, add a tab also,
253 // because we'll have a negative first-line margin
254 AppendString(pct, TXVESC_TAB);
255 }
256 pct->fNeedsLinebreak = FALSE;
257 }
258}
259
260/*
261 *@@ AppendEscapeWithDecimal:
262 * appends the specified escape code
263 * with a three-digit decimal parameter.
264 * Calls AppendString in turn.
265 *
266 *@@added V0.9.3 (2000-05-07) [umoeller]
267 */
268
269VOID AppendEscapeWith3Decimals(PCOPYTARGET pct, // in/out: formatting buffer
270 char *ach,
271 USHORT us)
272{
273 CHAR szDecimal[10];
274 if (us > 999)
275 us = 999;
276 sprintf(szDecimal, "%03d", us);
277 // append escape
278 AppendString(pct, ach);
279 AppendString(pct, szDecimal);
280}
281
282/*
283 *@@ AppendEscapeWith4Decimals:
284 *
285 *@@added V0.9.3 (2000-05-07) [umoeller]
286 */
287
288VOID AppendEscapeWith4Decimals(PCOPYTARGET pct, // in/out: formatting buffer
289 char *ach,
290 USHORT us)
291{
292 CHAR szDecimal[10];
293 if (us > 9999)
294 us = 9999;
295 sprintf(szDecimal, "%04d", us);
296 // append escape
297 AppendString(pct, ach);
298 AppendString(pct, szDecimal);
299}
300
301/* ******************************************************************
302 *
303 * Tag converter functions
304 *
305 ********************************************************************/
306
307/*
308 *@@ StartList:
309 * starts a list (UL or OL).
310 * This uses a linked list in COPYTARGET
311 * to keep a pseudo-stack for nested lists.
312 *
313 *@@added V0.9.3 (2000-05-08) [umoeller]
314 */
315
316VOID StartList(PCOPYTARGET pct, // in/out: formatting buffer
317 ULONG ulListType) // list type:
318 // 0: unordered (UL)
319 // 1: ordered (OL)
320 // 2: definition lists (DL)
321{
322 PLISTDESC pListDesc;
323
324 // raise list level
325 pct->ulListLevel++;
326
327 if (ulListType == 0)
328 // unordered:
329 pct->ulUnorderedListLevel++;
330 else if (ulListType == 1)
331 // ordered:
332 pct->ulOrderedListLevel++;
333
334 // create LISTDESC and store it on stack
335 pListDesc = (PLISTDESC)malloc(sizeof(LISTDESC));
336 pListDesc->ulListType
337 = pct->ulCurrentListType
338 = ulListType;
339 pListDesc->ulItem = 1;
340
341 lstAppendItem(&pct->llLists,
342 pListDesc);
343
344 AppendEscapeWith4Decimals(pct,
345 TXVESC_LEFTMARGIN,
346 pct->ulListLevel * 5);
347 AppendEscapeWith3Decimals(pct,
348 TXVESC_FIRSTLINEMARGIN_LEFT,
349 (ulListType == 2)
350 ? 5 // for definition lists
351 : 3); // negative!
352 // add \n before any other character
353 pct->fNeedsLinebreak = TRUE;
354}
355
356/*
357 *@@ StopList:
358 * stops a list (UL or OL).
359 *
360 *@@added V0.9.3 (2000-05-07) [umoeller]
361 */
362
363VOID StopList(PCOPYTARGET pct)
364{
365 if (pct->ulListLevel)
366 {
367 PLISTNODE pNode;
368
369 // lower list level
370 pct->ulListLevel--;
371 AppendEscapeWith4Decimals(pct,
372 TXVESC_LEFTMARGIN,
373 pct->ulListLevel * 5);
374 AppendEscapeWith3Decimals(pct,
375 TXVESC_FIRSTLINEMARGIN_LEFT,
376 (pct->ulListLevel)
377 ? 3 // we still have a list level (nested)
378 : 0);
379 pct->fNeedsLinebreak = TRUE;
380
381 // remove the LISTDESC from the stack
382 pNode = lstNodeFromIndex(&pct->llLists,
383 pct->ulListLevel); // this has been lowered already
384 if (pNode)
385 {
386 PLISTDESC pListDesc = (PLISTDESC)pNode->pItemData;
387 if (pListDesc->ulListType == 0)
388 // was unordered:
389 pct->ulUnorderedListLevel--;
390 else if (pListDesc->ulListType == 1)
391 // was ordered:
392 pct->ulOrderedListLevel--;
393
394 lstRemoveNode(&pct->llLists, pNode);
395
396 // update COPYTARGET with previous list level
397 if (pct->ulListLevel)
398 {
399 // we're still in a list (nested lists):
400 PLISTDESC pListDesc2 = (PLISTDESC)lstItemFromIndex(&pct->llLists,
401 pct->ulListLevel - 1);
402 if (pListDesc2)
403 pct->ulCurrentListType = pListDesc2->ulListType;
404 }
405 }
406 }
407 // else: buggy HTML code, ignore
408}
409
410/*
411 *@@ TagTITLE:
412 *
413 *@@added V0.9.3 (2000-05-19) [umoeller]
414 */
415
416VOID TagTITLE(PCOPYTARGET pct)
417{
418 // pSource currently points to <TITLE tag
419 PSZ pSource = pct->pSource + strlen(pct->pSource);
420 // points to temporary null byte in main buffer now
421 *pSource = pct->cSaved;
422
423 pSource = strchr(pct->pSource, '>');
424 if (pSource)
425 {
426 PSZ pNextOpen = strchr(pSource, '<');
427 if (pNextOpen)
428 {
429 // extract title
430 pct->pxhtml->pszTitle = strhSubstr(pSource + 1, pNextOpen);
431
432 if (strnicmp(pNextOpen + 1, "/TITLE", 6) == 0)
433 {
434 // closing /TITLE tag found:
435 // search on after that
436 pct->pNewSource = strchr(pNextOpen, '>');
437 if (pct->pNewSource)
438 pct->pNewSource++;
439 }
440 }
441 }
442}
443
444/*
445 *@@ TagP:
446 *
447 */
448
449VOID TagP(PCOPYTARGET pct)
450{
451 // append newline:
452 // add \n before any other character
453 pct->fNeedsLinebreak = TRUE;
454
455 /* if (pct->ulListLevel)
456 {
457 // if we are currently in a list, we must also
458 // add a tab escape, because we have set
459 // the first line margin to the left of the
460 // left margin
461 AppendString(pct,
462 TXVESC_TAB);
463 } */
464}
465
466VOID TagBR(PCOPYTARGET pct)
467{
468 AppendChar(pct,
469 '\r');
470
471 if (pct->ulListLevel)
472 {
473 // if we are currently in a list, we must also
474 // add a tab escape, because we have set
475 // the first line margin to the left of the
476 // left margin
477 AppendString(pct,
478 TXVESC_TAB);
479 }
480 if (!pct->fPRE)
481 pct->fSkipNextSpace = TRUE;
482}
483
484VOID TagPRE(PCOPYTARGET pct)
485{
486 // start of PRE tag:
487 // add \n before any other character
488 // pct->fNeedsLinebreak = TRUE;
489 AppendChar(pct, '\n');
490 pct->fNeedsLinebreak = FALSE;
491 /* AppendString(pct,
492 TXVESC_PRE_BEGIN); */
493 AppendEscapeWith3Decimals(pct,
494 TXVESC_SET_FONT,
495 1); // monospaced font
496 AppendEscapeWith4Decimals(pct,
497 TXVESC_SPACEBEFORE,
498 0); // no spacing before
499 AppendEscapeWith4Decimals(pct,
500 TXVESC_SPACEAFTER,
501 0); // no spacing after
502 // disable word-wrapping
503 AppendString(pct,
504 TXVESC_WORDWRAP "0");
505 pct->fPRE = TRUE;
506 pct->fSkipNextSpace = FALSE;
507}
508
509VOID TagXPRE(PCOPYTARGET pct)
510{
511 pct->fPRE = FALSE;
512 AppendEscapeWith3Decimals(pct,
513 TXVESC_SET_FONT,
514 0); // standard font
515 AppendString(pct, TXVESC_SPACEBEFORE);
516 AppendString(pct, "####"); // reset to default
517 AppendString(pct, TXVESC_SPACEAFTER);
518 AppendString(pct, "####"); // reset to default
519 // re-enable word-wrapping
520 AppendString(pct,
521 TXVESC_WORDWRAP "1"
522 "\n"); // force line break
523 pct->fNeedsLinebreak = FALSE;
524 // refuse to add \n even if we have another "p" coming up
525 pct->fSkipNextLinebreak = TRUE;
526 pct->fSkipNextSpace = TRUE;
527}
528
529VOID TagH1(PCOPYTARGET pct)
530{
531 pct->fNeedsLinebreak = TRUE;
532 AppendEscapeWith3Decimals(pct,
533 TXVESC_POINTSIZE_REL,
534 200); // double size
535 AppendString(pct,
536 TXVESC_BOLD_BEGIN);
537}
538
539VOID TagXH1(PCOPYTARGET pct)
540{
541 AppendString(pct,
542 TXVESC_BOLD_END);
543 AppendEscapeWith3Decimals(pct,
544 TXVESC_POINTSIZE_REL,
545 100); // regular size
546 // add \n before any other character
547 pct->fNeedsLinebreak = TRUE;
548}
549
550VOID TagH2(PCOPYTARGET pct)
551{
552 pct->fNeedsLinebreak = TRUE;
553 AppendEscapeWith3Decimals(pct,
554 TXVESC_POINTSIZE_REL,
555 175); // size in percent of regular point size
556 AppendString(pct,
557 TXVESC_BOLD_BEGIN);
558}
559
560VOID TagXH2(PCOPYTARGET pct)
561{
562 AppendString(pct,
563 TXVESC_BOLD_END);
564 AppendEscapeWith3Decimals(pct,
565 TXVESC_POINTSIZE_REL,
566 100); // regular size
567 // add \n before any other character
568 pct->fNeedsLinebreak = TRUE;
569}
570
571VOID TagH3(PCOPYTARGET pct)
572{
573 pct->fNeedsLinebreak = TRUE;
574 AppendEscapeWith3Decimals(pct,
575 TXVESC_POINTSIZE_REL,
576 150); // size in percent of regular point size
577 AppendString(pct,
578 TXVESC_BOLD_BEGIN);
579}
580
581VOID TagXH3(PCOPYTARGET pct)
582{
583 AppendString(pct,
584 TXVESC_BOLD_END);
585 AppendEscapeWith3Decimals(pct,
586 TXVESC_POINTSIZE_REL,
587 100); // size in percent of regular point size
588 // add \n before any other character
589 pct->fNeedsLinebreak = TRUE;
590}
591
592VOID TagH4(PCOPYTARGET pct)
593{
594 pct->fNeedsLinebreak = TRUE;
595 AppendEscapeWith3Decimals(pct,
596 TXVESC_POINTSIZE_REL,
597 125); // size in percent of regular point size
598 AppendString(pct,
599 TXVESC_BOLD_BEGIN);
600}
601
602VOID TagXH4(PCOPYTARGET pct)
603{
604 AppendString(pct,
605 TXVESC_BOLD_END);
606 AppendEscapeWith3Decimals(pct,
607 TXVESC_POINTSIZE_REL,
608 100); // regular size
609 // add \n before any other character
610 pct->fNeedsLinebreak = TRUE;
611}
612
613VOID TagH5(PCOPYTARGET pct)
614{
615 pct->fNeedsLinebreak = TRUE;
616 AppendEscapeWith3Decimals(pct,
617 TXVESC_POINTSIZE_REL,
618 100); // size in percent of regular point size
619 AppendString(pct,
620 TXVESC_BOLD_BEGIN);
621}
622
623VOID TagXH5(PCOPYTARGET pct)
624{
625 AppendString(pct,
626 TXVESC_BOLD_END);
627 AppendEscapeWith3Decimals(pct,
628 TXVESC_POINTSIZE_REL,
629 100); // regular size
630 // add \n before any other character
631 pct->fNeedsLinebreak = TRUE;
632}
633
634VOID TagH6(PCOPYTARGET pct)
635{
636 pct->fNeedsLinebreak = TRUE;
637 AppendEscapeWith3Decimals(pct,
638 TXVESC_POINTSIZE_REL,
639 80 ); // size in percent of regular point size
640 AppendString(pct,
641 TXVESC_BOLD_BEGIN);
642}
643
644VOID TagXH6(PCOPYTARGET pct)
645{
646 AppendString(pct,
647 TXVESC_BOLD_END);
648 AppendEscapeWith3Decimals(pct,
649 TXVESC_POINTSIZE_REL,
650 100); // regular size
651 // add \n before any other character
652 pct->fNeedsLinebreak = TRUE;
653}
654
655VOID TagUL(PCOPYTARGET pct)
656{
657 StartList(pct,
658 0); // unordered
659}
660
661VOID TagXUL(PCOPYTARGET pct)
662{
663 StopList(pct);
664}
665
666VOID TagOL(PCOPYTARGET pct)
667{
668 StartList(pct,
669 1); // ordered
670}
671
672VOID TagXOL(PCOPYTARGET pct)
673{
674 StopList(pct);
675}
676
677VOID TagLI(PCOPYTARGET pct)
678{
679 PLISTDESC pListDesc;
680 CHAR szMarker[20] = TXVESC_MARKER "\x01";
681
682 if (pct->ulListLevel)
683 {
684 // we're in a list:
685 pListDesc = (PLISTDESC)lstItemFromIndex(&pct->llLists,
686 pct->ulListLevel - 1);
687 if (pListDesc)
688 {
689 if (pListDesc->ulListType == 1)
690 // is ordered list:
691 sprintf(szMarker, "%lu.", (pListDesc->ulItem)++);
692 else if (pListDesc->ulListType == 0)
693 // is unordered list:
694 // set bullet type according to unordered nesting
695 szMarker[2] = pct->ulUnorderedListLevel;
696 }
697 }
698
699 // add \n before any other character
700 // pct->fNeedsLinebreak = TRUE;
701 // if (pct->fNeedsLinebreak)
702 {
703 AppendChar(pct, '\n');
704 pct->fNeedsLinebreak = FALSE;
705 }
706
707 AppendString(pct, szMarker);
708 AppendString(pct, TXVESC_TAB);
709}
710
711VOID TagDL(PCOPYTARGET pct)
712{
713 StartList(pct,
714 2); // definition list
715}
716
717VOID TagXDL(PCOPYTARGET pct)
718{
719 StopList(pct);
720 pct->fInDT = FALSE;
721}
722
723VOID TagDT(PCOPYTARGET pct)
724{
725 pct->fNeedsLinebreak = TRUE;
726 pct->fInDT = TRUE;
727}
728
729VOID TagDD(PCOPYTARGET pct)
730{
731 pct->fNeedsLinebreak = TRUE;
732 AppendString(pct, TXVESC_TAB);
733 if (!pct->fPRE)
734 pct->fSkipNextSpace = TRUE;
735 pct->fInDT = FALSE;
736}
737
738VOID TagTR(PCOPYTARGET pct)
739{
740 pct->fNeedsLinebreak = TRUE;
741}
742
743VOID TagB(PCOPYTARGET pct)
744{
745 AppendString(pct,
746 TXVESC_BOLD_BEGIN);
747}
748
749VOID TagXB(PCOPYTARGET pct)
750{
751 AppendString(pct,
752 TXVESC_BOLD_END);
753}
754
755VOID TagI(PCOPYTARGET pct)
756{
757 AppendString(pct,
758 TXVESC_ITALICS_BEGIN);
759}
760
761VOID TagXI(PCOPYTARGET pct)
762{
763 AppendString(pct,
764 TXVESC_ITALICS_END);
765}
766
767VOID TagU(PCOPYTARGET pct)
768{
769 AppendString(pct,
770 TXVESC_UNDERLINE_BEGIN);
771}
772
773VOID TagXU(PCOPYTARGET pct)
774{
775 AppendString(pct,
776 TXVESC_UNDERLINE_END);
777}
778
779VOID TagSTRIKE(PCOPYTARGET pct)
780{
781 AppendString(pct,
782 TXVESC_STRIKE_BEGIN);
783}
784
785VOID TagXSTRIKE(PCOPYTARGET pct)
786{
787 AppendString(pct,
788 TXVESC_STRIKE_END);
789}
790
791VOID TagCODE(PCOPYTARGET pct)
792{
793 AppendEscapeWith3Decimals(pct,
794 TXVESC_SET_FONT,
795 1); // monospaced font
796}
797
798VOID TagXCODE(PCOPYTARGET pct)
799{
800 AppendEscapeWith3Decimals(pct,
801 TXVESC_SET_FONT,
802 0); // regular font
803}
804
805VOID TagA(PCOPYTARGET pct)
806{
807 CHAR szAnchor[10];
808
809 pct->fInLink = FALSE;
810
811 if ((pct->pszAttributes) && (pct->pxhtml)) // points into main source buffer
812 {
813 // we have attributes:
814 PSZ pszClosingTag = strchr(pct->pszAttributes, '>');
815 if (pszClosingTag)
816 {
817 ULONG ulOfs = 0;
818
819 /*
820 * HREF attribute:
821 *
822 */
823
824 PSZ pHREF = strhGetTextAttr(pct->pszAttributes, "HREF", &ulOfs),
825 pNAME = 0;
826
827 // replace '>' with null char to mark end of search
828 *pszClosingTag = 0;
829
830 if (pHREF)
831 {
832 // OK, we got a link target:
833 // create a link item and append it to the output list
834 PXHTMLLINK pNewLink = (PXHTMLLINK)malloc(sizeof(XHTMLLINK));
835 memset(pNewLink, 0, sizeof(XHTMLLINK));
836
837 pct->fInLink = TRUE;
838
839 // this starts with anchor 1
840 pNewLink->usLinkIndex = ++pct->usAnchorIndex;
841 pNewLink->pszTargetFile = pHREF;
842 // do not free
843 lstAppendItem(&pct->pxhtml->llLinks, pNewLink);
844 }
845
846 /*
847 * NAME attribute:
848 *
849 */
850
851 pNAME = strhGetTextAttr(pct->pszAttributes, "NAME", &ulOfs);
852 if (pNAME)
853 {
854 AppendString(pct,
855 TXVESC_ANCHORNAME);
856 AppendString(pct,
857 pNAME);
858 // must be terminated with 0xFF
859 AppendChar(pct, 0xFF);
860 free(pNAME);
861 }
862 // restore '>'
863 *pszClosingTag = '>';
864 }
865 }
866
867 if (pct->fInLink)
868 {
869 sprintf(szAnchor, "%04hX", pct->usAnchorIndex);
870 AppendString(pct,
871 TXVESC_LINK);
872 AppendString(pct,
873 szAnchor);
874 }
875}
876
877VOID TagXA(PCOPYTARGET pct)
878{
879 if (pct->fInLink)
880 {
881 AppendString(pct,
882 TXVESC_LINK "####");
883 pct->fInLink = FALSE;
884 }
885}
886
887/* ******************************************************************
888 *
889 * Tag helpers
890 *
891 ********************************************************************/
892
893/*
894 *@@ FindTagProcessor:
895 * returns the Tag* function which handles the
896 * given tag or NULL if there's none.
897 *
898 *@@added V0.9.4 (2000-06-10) [umoeller]
899 */
900
901PFNPROCESSTAG FindTagProcessor(PSZ pszTag)
902{
903 PFNPROCESSTAG pProcessor = NULL;
904
905 CHAR c0,
906 c1;
907
908 BOOL fEndOfTag = FALSE;
909
910 PSZ pCheck = pszTag,
911 p2;
912 if (*pCheck == '/')
913 {
914 // end of tag:
915 fEndOfTag = TRUE;
916 pCheck++;
917 }
918
919 c0 = *pCheck;
920 c1 = *(pCheck + 1);
921
922 p2 = pCheck + 2;
923
924 switch (c0)
925 {
926 case 'A':
927 case 'a':
928 switch (c1)
929 {
930 case 0: // A
931 if (!fEndOfTag)
932 return TagA;
933 else
934 return TagXA;
935 case 'D': // ADDRESS
936 case 'd': // ADDRESS
937 if (stricmp(p2, "DRESS") == 0)
938 {
939 if (!fEndOfTag)
940 return TagI;
941 else
942 return TagXI;
943 }
944 }
945 break;
946
947 case 'B':
948 case 'b':
949 switch (c1)
950 {
951 case 0:
952 if (!fEndOfTag)
953 return TagB;
954 else
955 return TagXB;
956
957 case 'R': // BR
958 case 'r': // BR
959 if (*p2 == 0)
960 if (!fEndOfTag)
961 return TagBR;
962 }
963 break;
964
965 case 'C':
966 case 'c':
967 switch (c1)
968 {
969 case 'I': // CITE
970 case 'i': // CITE
971 if (stricmp(p2, "TE") == 0)
972 {
973 if (!fEndOfTag)
974 return TagI;
975 else
976 return TagXI;
977 }
978 break;
979
980 case 'O':
981 case 'o':
982 if (stricmp(p2, "DE") == 0)
983 {
984 if (!fEndOfTag)
985 return TagCODE;
986 else
987 return TagXCODE;
988 }
989 break;
990 }
991 break;
992
993 case 'D':
994 case 'd':
995 switch (c1)
996 {
997 case 'D': // DD
998 case 'd': // DD
999 if ((*p2 == 0) && (!fEndOfTag))
1000 return (TagDD);
1001 break;
1002
1003 case 'I': // DIR
1004 case 'i': // DIR
1005 if (*p2 == 'R')
1006 if (*(pCheck + 3) == 0)
1007 {
1008 if (!fEndOfTag)
1009 return TagUL;
1010 else
1011 return TagXUL;
1012 }
1013 break;
1014
1015 case 'L': // DL
1016 case 'l': // DL
1017 if (*p2 == 0)
1018 {
1019 if (!fEndOfTag)
1020 return TagDL;
1021 else
1022 return TagXDL;
1023 }
1024 break;
1025
1026 case 'T': // DT
1027 case 't': // DT
1028 if ((*p2 == 0) && (!fEndOfTag))
1029 return TagDT;
1030 break;
1031 }
1032 break;
1033
1034 case 'E':
1035 case 'e':
1036 if ( (c1 == 'M') || (c1 == 'm') ) // EM
1037 if (*p2 == 0)
1038 {
1039 if (!fEndOfTag)
1040 return TagI;
1041 else
1042 return TagXI;
1043 }
1044 break;
1045
1046 case 'H':
1047 case 'h':
1048 if (c1)
1049 if (*p2 == 0)
1050 switch (c1)
1051 {
1052 case '1':
1053 if (!fEndOfTag)
1054 return TagH1;
1055 else
1056 return TagXH1;
1057 case '2':
1058 if (!fEndOfTag)
1059 return TagH2;
1060 else
1061 return TagXH2;
1062 case '3':
1063 if (!fEndOfTag)
1064 return TagH3;
1065 else
1066 return TagXH3;
1067 case '4':
1068 if (!fEndOfTag)
1069 return TagH4;
1070 else
1071 return TagXH4;
1072 case '5':
1073 if (!fEndOfTag)
1074 return TagH5;
1075 else
1076 return TagXH5;
1077 case '6':
1078 if (!fEndOfTag)
1079 return TagH6;
1080 else
1081 return TagXH6;
1082 }
1083 break;
1084
1085 case 'I':
1086 case 'i':
1087 if (c1 == 0)
1088 {
1089 if (!fEndOfTag)
1090 return TagI;
1091 else
1092 return TagXI;
1093 }
1094 break;
1095
1096 case 'L':
1097 case 'l':
1098 if ((c1 == 'I') || (c1 == 'i'))
1099 if (*p2 == 0)
1100 return TagLI;
1101 break;
1102
1103 case 'M':
1104 case 'm':
1105 if (stricmp(p2, "NU") == 0)
1106 {
1107 if (!fEndOfTag)
1108 return TagUL;
1109 else
1110 return TagXUL;
1111 }
1112 break;
1113
1114 case 'O':
1115 case 'o':
1116 if ((c1 == 'L') || (c1 == 'l'))
1117 if (*p2 == 0)
1118 {
1119 if (!fEndOfTag)
1120 return TagOL;
1121 else
1122 return TagXOL;
1123 }
1124 break;
1125
1126 case 'P':
1127 case 'p':
1128 switch (c1)
1129 {
1130 case 0:
1131 if (!fEndOfTag)
1132 return TagP;
1133 break;
1134
1135 case 'R': // PRE
1136 case 'r': // PRE
1137 if ((*p2 == 'E') || (*p2 == 'e'))
1138 if (*(pCheck + 3) == 0)
1139 {
1140 if (!fEndOfTag)
1141 return TagPRE;
1142 else
1143 return TagXPRE;
1144 }
1145 break;
1146 }
1147 break;
1148
1149 case 'S':
1150 case 's':
1151 switch (c1)
1152 {
1153 case 'T': // STRONG
1154 case 't': // STRONG
1155 if (stricmp(p2, "RONG") == 0)
1156 {
1157 if (!fEndOfTag)
1158 return TagB;
1159 else
1160 return TagXB;
1161 }
1162 else if (stricmp(p2, "RIKE") == 0)
1163 {
1164 if (!fEndOfTag)
1165 return TagSTRIKE;
1166 else
1167 return TagXSTRIKE;
1168 }
1169 break;
1170
1171 case 'A':
1172 case 'a':
1173 if (stricmp(p2, "MP") == 0)
1174 {
1175 if (!fEndOfTag)
1176 return TagCODE;
1177 else
1178 return TagXCODE;
1179 }
1180 break;
1181 }
1182 break;
1183
1184 case 'T':
1185 case 't':
1186 switch (c1)
1187 {
1188 case 'R':
1189 case 'r':
1190 if (*p2 == 0)
1191 return TagTR;
1192 break;
1193
1194 case 'I':
1195 case 'i':
1196 if (stricmp(p2, "TLE") == 0)
1197 return TagTITLE;
1198 break;
1199
1200 case 'T': // TT
1201 case 't':
1202 if (*p2 == 0)
1203 {
1204 if (!fEndOfTag)
1205 return TagCODE;
1206 else
1207 return TagXCODE;
1208 }
1209 break;
1210 }
1211 break;
1212
1213 case 'U':
1214 case 'u':
1215 switch (c1)
1216 {
1217 case 0:
1218 if (!fEndOfTag)
1219 return TagU;
1220 else
1221 return TagXU;
1222
1223 case 'L':
1224 case 'l':
1225 if (*p2 == 0)
1226 {
1227 if (!fEndOfTag)
1228 return TagUL;
1229 else
1230 return TagXUL;
1231 }
1232 break;
1233 }
1234 break;
1235
1236 case 'V':
1237 case 'v':
1238 if (stricmp(p2, "R") == 0)
1239 {
1240 if (!fEndOfTag)
1241 return TagI;
1242 else
1243 return TagXI;
1244 }
1245 break;
1246
1247 case 'X':
1248 case 'x':
1249 if (stricmp(p2, "MP") == 0) // XMP
1250 {
1251 if (!fEndOfTag)
1252 return TagPRE;
1253 else
1254 return TagXPRE;
1255 }
1256 break;
1257 }
1258
1259 return (pProcessor);
1260}
1261
1262/*
1263 *@@ HandleTag:
1264 * called by txvConvertFromHTML when a "<" character
1265 * is found in the source buffer. This calls
1266 * FindTagProcessor in turn to find the Tag*
1267 * function which handles the tag.
1268 *
1269 *@@added V0.9.3 (2000-05-18) [umoeller]
1270 */
1271
1272VOID HandleTag(PCOPYTARGET pct)
1273{
1274 PSZ pStartOfTag = pct->pSource;
1275 // '<' == begin of tag:
1276
1277 // is it a comment? <!-- ... -->
1278 if (strncmp(pStartOfTag + 1, "!--", 3) == 0)
1279 {
1280 // start of comment:
1281 // find end of comment
1282 PSZ pEnd = strstr(pStartOfTag, "-->");
1283 if (pEnd)
1284 // found:
1285 // search on after end of comment
1286 pct->pSource = pEnd + 3;
1287 else
1288 {
1289 // end of comment not found:
1290 // stop formatting...
1291 pct->pSource++;
1292 return;
1293 }
1294 }
1295 else
1296 {
1297 // no comment:
1298 // find end of tag
1299 PSZ p2 = pStartOfTag + 1,
1300 pNextClose = 0, // receives first '>' after '<'
1301 pNextSpace = 0; // receives first ' ' after '<'
1302 BOOL fCont = TRUE;
1303 while (fCont)
1304 {
1305 switch (*p2)
1306 {
1307 case ' ':
1308 case '\r':
1309 case '\n':
1310 // store first space after '<'
1311 if (!pNextSpace)
1312 pNextSpace = p2;
1313 // overwrite line breaks with spaces;
1314 // otherwise we cannot handle tags which go across
1315 // several lines, which is valid HTML
1316 *p2 = ' ';
1317 break;
1318
1319 case '>': // end of tag found:
1320 pNextClose = p2;
1321 fCont = FALSE;
1322 break;
1323
1324 case '<':
1325 // another opening tag:
1326 // that's an HTML error
1327 AppendChar(pct,
1328 *pct->pSource++);
1329 fCont = FALSE;
1330 break;
1331
1332 case 0:
1333 fCont = FALSE;
1334 break;
1335 }
1336 p2++;
1337 }
1338
1339 if (pNextClose)
1340 {
1341 // end of tag found:
1342 ULONG cbTag;
1343 // PSZ pStartOfAttrs = 0;
1344
1345 if ((pNextSpace) && (pNextSpace < pNextClose))
1346 {
1347 // we have attributes:
1348 cbTag = pNextSpace - (pStartOfTag + 1);
1349 // pStartOfAttrs = pNextSpace;
1350 }
1351 else
1352 cbTag = pNextClose - (pStartOfTag + 1);
1353
1354 if (!cbTag)
1355 {
1356 // happens if we have a "<>" in the text:
1357 // just insert the '<>' and go on, we have no tag here
1358 AppendChar(pct,
1359 *pct->pSource++);
1360 AppendChar(pct,
1361 *pct->pSource++);
1362 }
1363 else
1364 {
1365 PFNPROCESSTAG pTagProcessor;
1366
1367 pct->cSaved = *(pStartOfTag + cbTag + 1);
1368 // add a null terminator
1369 *(pStartOfTag + cbTag + 1) = 0;
1370
1371 // find corresponding tag converter function
1372 // from G_TagProcessors map
1373 pTagProcessor = FindTagProcessor(pStartOfTag + 1); // pszTag);
1374
1375 // restore char under null terminator
1376 *(pStartOfTag + cbTag + 1) = pct->cSaved;
1377
1378 // reset new source ptr; the tag handler
1379 // can modify this
1380 pct->pNewSource = NULL;
1381
1382 if (pTagProcessor)
1383 {
1384 // tag understood:
1385
1386 // terminate string after closing tag
1387 pct->cSaved = *(pNextClose + 1); // can be null byte!
1388 *(pNextClose + 1) = 0;
1389
1390 // did we have attributes?
1391 if (pNextSpace)
1392 pct->pszAttributes = pNextSpace;
1393
1394 // finally, call the tag handler
1395 (pTagProcessor) // function
1396 (pct); // argument
1397
1398 *(pNextClose + 1) = pct->cSaved;
1399 }
1400
1401 if (pct->pNewSource == NULL)
1402 // tag handler needs no special processing:
1403 // skip '>' too
1404 pct->pSource = pNextClose + 1;
1405 else
1406 // tag handler has skipped something:
1407 pct->pSource = pct->pNewSource;
1408 }
1409 }
1410 }
1411}
1412
1413/*
1414 *@@ ConvertEscape:
1415 * called by HandleEscape to find the ANSI (CP 1004)
1416 * character for the given escape sequence (pszTag).
1417 *
1418 * pszTag must be null-terminated and contain only
1419 * the stuff between "&" and ";".
1420 *
1421 * This is really ugly spaghetti, but it's the fastest
1422 * way to do it.
1423 *
1424 *@@added V0.9.4 (2000-06-10) [umoeller]
1425 */
1426
1427unsigned char ConvertEscape(PSZ pszTag)
1428{
1429 CHAR c0, c1;
1430 CHAR crc = 0;
1431
1432 PSZ p2 = pszTag + 2;
1433
1434 c0 = *pszTag;
1435 c1 = *(pszTag + 1);
1436
1437 switch (c0)
1438 {
1439 case 'a':
1440 switch (c1)
1441 {
1442 case 'a':
1443 if (strcmp(p2, "cute") == 0)
1444 return 225;
1445 break;
1446
1447 case 'c':
1448 if (strcmp(p2, "irc") == 0)
1449 return 226;
1450 else if (strcmp(p2, "ute") == 0)
1451 return 180;
1452 break;
1453
1454 case 'e':
1455 if (strcmp(p2, "lig") == 0)
1456 return 230;
1457 break;
1458
1459 case 'g':
1460 if (strcmp(p2, "rave") == 0)
1461 return 224;
1462 break;
1463
1464 case 'm':
1465 if (strcmp(p2, "p") == 0)
1466 return '&';
1467 break;
1468
1469 case 'r':
1470 if (strcmp(p2, "ing") == 0)
1471 return 229;
1472 break;
1473
1474 case 't':
1475 if (strcmp(p2, "ilde") == 0)
1476 return 227;
1477 break;
1478
1479 case 'u':
1480 if (strcmp(p2, "ml") == 0)
1481 return 228;
1482 break;
1483 }
1484 break;
1485
1486 case 'b':
1487 if (strcmp(pszTag + 1, "rvbar") == 0)
1488 return 166;
1489 break;
1490
1491 case 'c':
1492 switch (c1)
1493 {
1494 case 'c':
1495 if (strcmp(p2, "edil") == 0)
1496 return 231;
1497 break;
1498
1499 case 'e':
1500 if (strcmp(p2, "dil") == 0)
1501 return 184;
1502 else if (strcmp(p2, "nt") == 0)
1503 return 162;
1504 break;
1505
1506 case 'o':
1507 if (strcmp(p2, "py") == 0)
1508 return 169;
1509 break;
1510
1511 case 'u':
1512 if (strcmp(p2, "rren") == 0)
1513 return 164;
1514 }
1515 break;
1516
1517 case 'd':
1518 switch (c1)
1519 {
1520 case 'e':
1521 if (strcmp(p2, "g") == 0) return 176;
1522 break;
1523
1524 case 'i':
1525 if (strcmp(p2, "vide") == 0) return 247;
1526 break;
1527 }
1528 break;
1529
1530 case 'e':
1531 switch (c1)
1532 {
1533 case 'a':
1534 if (strcmp(p2, "cute") == 0) return 233;
1535 break;
1536
1537 case 'c':
1538 if (strcmp(p2, "irc") == 0) return 234;
1539 break;
1540
1541 case 'g':
1542 if (strcmp(p2, "rave") == 0) return 232;
1543 break;
1544
1545 case 't':
1546 if (strcmp(p2, "h") == 0) return 240;
1547 break;
1548
1549 case 'u':
1550 if (strcmp(p2, "ml") == 0) return 235;
1551 break;
1552 }
1553 break;
1554
1555 case 'f':
1556 switch (c1)
1557 {
1558 case 'r':
1559 if (strcmp(p2, "ac14") == 0) return 188;
1560 if (strcmp(p2, "ac12") == 0) return 189;
1561 if (strcmp(p2, "ac34") == 0) return 190;
1562 break;
1563 }
1564 break;
1565
1566 case 'g':
1567 switch (c1)
1568 {
1569 case 't':
1570 if (*p2 == 0) return '>';
1571 }
1572 break;
1573
1574 case 'i':
1575 switch (c1)
1576 {
1577 case 'a':
1578 if (strcmp(p2, "cute") == 0) return 237;
1579 break;
1580
1581 case 'c':
1582 if (strcmp(p2, "irc") == 0) return 238;
1583 break;
1584
1585 case 'g':
1586 if (strcmp(p2, "rave") == 0) return 236;
1587 break;
1588
1589 case 'e':
1590 if (strcmp(p2, "xcl") == 0) return 161;
1591 break;
1592
1593 case 'q':
1594 if (strcmp(p2, "uest") == 0) return 191;
1595 break;
1596
1597 case 'u':
1598 if (strcmp(p2, "ml") == 0) return 239;
1599 }
1600 break;
1601
1602 case 'l':
1603 switch (c1)
1604 {
1605 case 't':
1606 if (*p2 == 0)
1607 return '<';
1608 break;
1609
1610 case 'a':
1611 if (strcmp(p2, "quo") == 0) return 171;
1612 }
1613 break;
1614
1615 case 'm':
1616 switch (c1)
1617 {
1618 case 'a':
1619 if (strcmp(p2, "cr") == 0) return 175;
1620 break;
1621
1622 case 'i':
1623 if (strcmp(p2, "cro") == 0) return 181;
1624 if (strcmp(p2, "ddot") == 0) return 183;
1625 break;
1626 }
1627 break;
1628
1629 case 'n':
1630 switch (c1)
1631 {
1632 case 'b':
1633 if (strcmp(p2, "sp") == 0) return 160;
1634 break;
1635
1636 case 'o':
1637 if (strcmp(p2, "t") == 0) return 172;
1638 break;
1639
1640 case 't':
1641 if (strcmp(p2, "ilde") == 0) return 241;
1642 }
1643 break;
1644
1645 case 'o':
1646 switch (c1)
1647 {
1648 case 'a':
1649 if (strcmp(p2, "cute") == 0) return 243;
1650 break;
1651
1652 case 'c':
1653 if (strcmp(p2, "irc") == 0) return 244;
1654 break;
1655
1656 case 'g':
1657 if (strcmp(p2, "rave") == 0) return 242;
1658 break;
1659
1660 case 'r':
1661 if (strcmp(p2, "df") == 0) return 170;
1662 if (strcmp(p2, "dm") == 0) return 186;
1663 break;
1664
1665 case 's':
1666 if (strcmp(p2, "lash") == 0) return 248;
1667 break;
1668
1669 case 't':
1670 if (strcmp(p2, "ilde") == 0) return 245;
1671 break;
1672
1673 case 'u':
1674 if (strcmp(p2, "ml") == 0) return 246;
1675 }
1676 break;
1677
1678 case 'p':
1679 switch (c1)
1680 {
1681 case 'a':
1682 if (strcmp(p2, "ra") == 0) return 182;
1683 break;
1684
1685 case 'l':
1686 if (strcmp(p2, "usmn") == 0) return 177;
1687 break;
1688
1689 case 'o':
1690 if (strcmp(p2, "und") == 0) return 163;
1691 }
1692 break;
1693
1694 case 'q':
1695 if (strcmp(pszTag, "quot") == 0) return '"';
1696 break;
1697
1698 case 'r':
1699 if (strcmp(pszTag, "raquo") == 0) return 187;
1700 if (strcmp(pszTag, "reg") == 0) return 174;
1701 break;
1702
1703 case 's':
1704 switch (c1)
1705 {
1706 case 'z':
1707 if (strcmp(p2, "lig") == 0) return 223;
1708 break;
1709
1710 case 'e':
1711 if (strcmp(p2, "ct") == 0) return 167;
1712 break;
1713
1714 case 'h':
1715 if (strcmp(p2, "y") == 0) return 173;
1716 break;
1717
1718 case 'u':
1719 if (strcmp(p2, "p1") == 0) return 185;
1720 if (strcmp(p2, "p2") == 0) return 178;
1721 if (strcmp(p2, "p3") == 0) return 179;
1722 }
1723 break;
1724
1725 case 't':
1726 if (strcmp(pszTag, "thorn") == 0) return 254;
1727 if (strcmp(pszTag, "times") == 0) return 215;
1728 break;
1729
1730 case 'u':
1731 switch (c1)
1732 {
1733 case 'a':
1734 if (strcmp(p2, "cute") == 0) return 250;
1735 break;
1736
1737 case 'c':
1738 if (strcmp(p2, "irc") == 0) return 251;
1739 break;
1740
1741 case 'g':
1742 if (strcmp(p2, "rave") == 0) return 249;
1743 break;
1744
1745 case 'm':
1746 if (strcmp(p2, "l") == 0) return 168;
1747 break;
1748
1749 case 'u':
1750 if (strcmp(p2, "ml") == 0) return 252;
1751 }
1752 break;
1753
1754 case 'y':
1755 if (strcmp(pszTag, "yacute") == 0) return 253;
1756 if (strcmp(pszTag, "yen") == 0) return 165;
1757 if (strcmp(pszTag, "yuml") == 0) return 255;
1758 break;
1759
1760 case 'A':
1761 switch (c1)
1762 {
1763 case 'u':
1764 if (strcmp(p2, "ml") == 0) return 196;
1765 break;
1766
1767 case 'a':
1768 if (strcmp(p2, "cute") == 0) return 193;
1769 break;
1770
1771 case 'c':
1772 if (strcmp(p2, "irc") == 0) return 194;
1773 break;
1774
1775 case 'E':
1776 if (strcmp(p2, "lig") == 0) return 198;
1777 break;
1778
1779 case 'g':
1780 if (strcmp(p2, "rave") == 0) return 192;
1781 break;
1782
1783 case 'r':
1784 if (strcmp(p2, "ing") == 0) return 197;
1785 break;
1786
1787 case 't':
1788 if (strcmp(p2, "ilde") == 0) return 195;
1789 }
1790 break;
1791
1792 case 'C':
1793 if (strcmp(pszTag, "Ccedil") == 0) return 199;
1794 break;
1795
1796 case 'E':
1797 if (strcmp(pszTag, "Ecirc") == 0) return 202;
1798 if (strcmp(pszTag, "Eacute") == 0) return 201;
1799 if (strcmp(pszTag, "Egrave") == 0) return 200;
1800 if (strcmp(pszTag, "ETH") == 0) return 208;
1801 if (strcmp(pszTag, "Euml") == 0) return 203;
1802 break;
1803
1804 case 'I':
1805 if (strcmp(pszTag, "Icirc") == 0) return 206;
1806 if (strcmp(pszTag, "Iacute") == 0) return 205;
1807 if (strcmp(pszTag, "Igrave") == 0) return 204;
1808 if (strcmp(pszTag, "Iuml") == 0) return 207;
1809 break;
1810
1811 case 'N':
1812 if (strcmp(pszTag, "Ntilde") == 0) return 209;
1813 break;
1814
1815 case 'O':
1816 switch (c1)
1817 {
1818 case 'u':
1819 if (strcmp(p2, "ml") == 0) return 214;
1820 break;
1821
1822 case 'a':
1823 if (strcmp(p2, "cute") == 0) return 211;
1824 break;
1825
1826 case 'c':
1827 if (strcmp(p2, "irc") == 0) return 212;
1828 break;
1829
1830 case 'g':
1831 if (strcmp(p2, "rave") == 0) return 210;
1832 break;
1833
1834 case 't':
1835 if (strcmp(p2, "ilde") == 0) return 213;
1836 break;
1837
1838 case 's':
1839 if (strcmp(p2, "lash") == 0) return 216;
1840 }
1841 break;
1842
1843 case 'U':
1844 switch (c1)
1845 {
1846 case 'a':
1847 if (strcmp(p2, "cute") == 0) return 218;
1848 break;
1849
1850 case 'c':
1851 if (strcmp(p2, "irc") == 0) return 219;
1852 break;
1853
1854 case 'g':
1855 if (strcmp(p2, "rave") == 0) return 217;
1856 break;
1857
1858 case 'u':
1859 if (strcmp(p2, "ml") == 0) return 220;
1860 }
1861 break;
1862
1863 case 'T':
1864 if (strcmp(pszTag, "THORN") == 0) return 222;
1865 break;
1866
1867 case 'Y':
1868 if (strcmp(pszTag, "Yacute") == 0) return 221;
1869 break;
1870 }
1871
1872 return (crc);
1873}
1874
1875/*
1876 *@@ HandleEscape:
1877 * called by txvConvertFromHTML when a "&" character
1878 * is found in the source buffer. This calls
1879 * ConvertEscape in turn.
1880 *
1881 *@@added V0.9.3 (2000-05-18) [umoeller]
1882 */
1883
1884VOID HandleEscape(PCOPYTARGET pct)
1885{
1886 // ampersand:
1887 // replace special characters
1888 PSZ pStartOfTag = pct->pSource;
1889 // find end of tag
1890 PSZ p2 = pStartOfTag,
1891 pNextClose = 0,
1892 pNextSpace = 0;
1893 BOOL fCont = TRUE;
1894 while (fCont)
1895 {
1896 switch (*p2)
1897 {
1898 case 0:
1899 fCont = FALSE;
1900 break;
1901
1902 case ';':
1903 pNextClose = p2;
1904 fCont = FALSE;
1905 break;
1906
1907 case ' ':
1908 if (!pNextSpace)
1909 pNextSpace = p2;
1910 break;
1911 }
1912 p2++;
1913 }
1914
1915 if (!pNextClose)
1916 // no closing tag found:
1917 // just insert the '&' and go on, we have no tag here
1918 AppendChar(pct,
1919 *pct->pSource++);
1920 else
1921 {
1922 if ((pNextSpace) && (pNextSpace < pNextClose))
1923 // space before ';':
1924 // just insert the '&' and go on, we have no tag here
1925 AppendChar(pct,
1926 *pct->pSource++);
1927 else if ((!pNextClose) || (pNextClose <= pStartOfTag + 1))
1928 AppendChar(pct,
1929 *pct->pSource++);
1930 else
1931 {
1932 ULONG ulCode = 0;
1933
1934 // create substring with tag
1935 PSZ pszTag = pStartOfTag + 1;
1936 *pNextClose = 0;
1937
1938 if (*pszTag == '#')
1939 {
1940 // latin-1 or Unicode encoding (&#000;)
1941 ulCode = atoi(pszTag + 1);
1942
1943 // next input: char after ';'
1944 pct->pSource = pNextClose + 1;
1945 }
1946 else
1947 {
1948 // named entity:
1949 // find char code corresponding to escape
1950 // from G_EscapeProcessors map
1951 ulCode = ConvertEscape(pszTag);
1952 if (ulCode)
1953 // tag supported:
1954 pct->pSource = pNextClose + 1;
1955 else
1956 // tag not supported:
1957 ulCode = *pct->pSource++;
1958 }
1959
1960 // restore closing tag which we overwrote
1961 *pNextClose = ';';
1962
1963 if (ulCode)
1964 {
1965 AppendLinebreakCheck(pct);
1966
1967 AppendChar(pct,
1968 (CHAR)ulCode);
1969 pct->fSkipNextSpace = FALSE;
1970 }
1971 }
1972 }
1973}
1974
1975/* ******************************************************************
1976 *
1977 * Entry points
1978 *
1979 ********************************************************************/
1980
1981/*
1982 *@@ txvConvertFromHTML:
1983 * this modifies the given text string (which should
1984 * be the complete BODY block of any HTML file) so
1985 * that all HTML tags are removed and replaced with
1986 * escape sequences that the XTextView control understands.
1987 *
1988 * The buffer gets reallocated by this function, so it
1989 * must be free()'able.
1990 *
1991 * So, to have the XTextView control display an HTML file,
1992 * do this:
1993 *
1994 * 1) Load an HTML file into a buffer allocated by malloc().
1995 *
1996 * 2) Call txvConvertFromHTML.
1997 *
1998 * 3) Call WinSetWindowText on the XTextView control with
1999 * the modified buffer.
2000 *
2001 * This understands the following limited subset of HTML:
2002 *
2003 * Paragraph tags:
2004 *
2005 * -- P, BR
2006 * -- PRE, /PRE
2007 * -- UL, /UL, OL, /OL, LI
2008 * -- DL, /DL, DT, DD
2009 * -- H1, /H1 thru H6, /H6
2010 * -- Comments (<!-- .... -->)
2011 *
2012 * Character tags:
2013 *
2014 * -- B, /B, STRONG, /STRONG
2015 * -- I, /I, EM, /EM, VAR, /VAR, CITE, /CITE
2016 * -- CODE, /CODE, SAMP, /SAMP, KBD, /KBD, TT, /TT
2017 * -- U, /U
2018 * -- STRIKE, /STRIKE
2019 * -- CODE, /CODE
2020 *
2021 * The most obvious limitation is that neither tables
2022 * nor frames are supported. Also forget about CSS
2023 * and JavaScript, of course.
2024 *
2025 * All the ampersand (&amp; something) sequences defined
2026 * in HTML 3 are properly translated.
2027 *
2028 * Note: Those are translated to the ANSI (MS-Windows,
2029 * OS/2 codepage 1004) character set. This has the
2030 * following characteristics:
2031 *
2032 * -- Codes 0-127 are identical to ASCII and thus
2033 * ISO 8559-1 ("Latin 1") also.
2034 *
2035 * -- Codes 160-255 are identical to ISO 8559-1 ("Latin 1").
2036 *
2037 * -- Codes 128-159 are NOT defined in ISO 8559-1, but
2038 * Netscape treats those as ANSI as well, so we do too.
2039 *
2040 * As a result, consider the output to be in OS/2 codepage
2041 * 1004. Either set your codepage to that (WinSetCp)
2042 * or translate the output (WinCpTranslateString).
2043 *
2044 * &#xxx; tags (with xxx being a decimal) are considered
2045 * ANSI codes as well. Even though HTML 4.0 allows Unicode
2046 * characters > 255 to be inserted this way, we ignore
2047 * those. Unicode chars from 0 to 255 are identical to
2048 * ANSI, so for &#000; to &#255;, we are HTML-compliant.
2049 *
2050 * All other tags are completely thrown out.
2051 *
2052 *@@added V0.9.3 (2000-05-06) [umoeller]
2053 */
2054
2055BOOL txvConvertFromHTML(char **ppszText,
2056 PVOID pxhtml, // out: various config data (PXHTMLDATA)
2057 PULONG pulProgress, // out: progress (ptr can be NULL)
2058 PBOOL pfCancel) // in: cancel flag (ptr can be NULL)
2059{
2060 BOOL brc = TRUE;
2061
2062 ULONG cbSource = strlen(*ppszText);
2063
2064 XHTMLDATA xhtmlTemp = {0};
2065 BOOL fUsingTemp = FALSE;
2066 COPYTARGET ct = {0};
2067
2068 lstInit(&ct.llLists,
2069 TRUE); // free items
2070
2071 ct.pSource = *ppszText;
2072 // skip leading spaces
2073 ct.fSkipNextSpace = TRUE;
2074 ct.pxhtml = (PXHTMLDATA)pxhtml;
2075 if (ct.pxhtml == NULL) // not specified:
2076 {
2077 ct.pxhtml = &xhtmlTemp;
2078 fUsingTemp = TRUE;
2079 }
2080
2081 lstInit(&ct.pxhtml->llLinks, TRUE); // auto-free
2082
2083 // step 2:
2084 // actual tags formatting
2085
2086 while (TRUE)
2087 {
2088 CHAR c = *ct.pSource;
2089
2090 if (pfCancel)
2091 if (*pfCancel)
2092 {
2093 brc = FALSE;
2094 break;
2095 }
2096
2097 if (!c)
2098 // null terminator reached:
2099 break;
2100
2101 // calculate progress
2102 if (pulProgress)
2103 *pulProgress = ((ct.pSource - *ppszText) // characters done
2104 * 100
2105 / cbSource); // characters total
2106
2107 switch (c)
2108 {
2109 case '<':
2110 HandleTag(&ct);
2111 break;
2112
2113 case '&':
2114 HandleEscape(&ct);
2115 break;
2116
2117 case '\r':
2118 // skip
2119 if (!ct.fSkipNextSpace)
2120 {
2121 AppendChar(&ct,
2122 ' ');
2123 // ct.fNeedsLinebreak = FALSE;
2124 // but skip leading spaces which might follow
2125 if (!ct.fPRE)
2126 ct.fSkipNextSpace = TRUE;
2127 }
2128 ct.pSource++;
2129 break;
2130
2131 case '\t':
2132 {
2133 if (ct.fPRE)
2134 {
2135 ULONG ul;
2136 for (ul = 0;
2137 ul < 8;
2138 ul++)
2139 AppendChar(&ct,
2140 ' ');
2141 }
2142 else
2143 {
2144 // not in PRE block:
2145 if ( (!ct.fSkipNextSpace)
2146 // && (!ct.fNeedsLinebreak)
2147 )
2148 // last was not space: copy
2149 AppendChar(&ct,
2150 ' ');
2151
2152 ct.fSkipNextSpace = TRUE;
2153 }
2154
2155 // skip the tab
2156 ct.pSource++;
2157 break; }
2158
2159 case '\n':
2160 {
2161 // newline char:
2162 if (!ct.fPRE)
2163 {
2164 // if not in PRE mode, replace with space
2165 if (!ct.fSkipNextSpace)
2166 {
2167 AppendChar(&ct,
2168 ' ');
2169 // ct.fNeedsLinebreak = FALSE;
2170 // but skip leading spaces which might follow
2171 ct.fSkipNextSpace = TRUE;
2172 }
2173 }
2174 else
2175 // in PRE mode, preserve line breaks
2176 AppendChar(&ct, '\n'); // ct.fNeedsLinebreak = TRUE;
2177
2178 ct.pSource++;
2179 break; }
2180
2181 case '\xFF':
2182 {
2183 AppendChar(&ct,
2184 ' ');
2185 ct.pSource++;
2186 break; }
2187
2188 case ' ':
2189 if (!ct.fPRE)
2190 {
2191 // is space, and not in PRE block:
2192 if ( (!ct.fSkipNextSpace)
2193 // && (!ct.fNeedsLinebreak)
2194 )
2195 // last was not space: copy
2196 AppendChar(&ct,
2197 ' ');
2198
2199 ct.fSkipNextSpace = TRUE;
2200 }
2201 else
2202 // in PRE, always add all spaces
2203 AppendChar(&ct,
2204 ' ');
2205 ct.pSource++;
2206 break;
2207
2208 default:
2209 // if we're not inserting escapes or anything,
2210 // check if a linebreak is needed
2211 AppendLinebreakCheck(&ct);
2212
2213 AppendChar(&ct,
2214 *ct.pSource++);
2215 ct.fSkipNextSpace = FALSE;
2216 ct.fSkipNextLinebreak = FALSE;
2217
2218 } // end switch (*pSource);
2219 } // end while (*pSource)
2220 AppendChar(&ct,
2221 '\n');
2222 // append null-terminator
2223 AppendChar(&ct,
2224 0);
2225
2226 free(*ppszText);
2227 *ppszText = ct.pszNew;
2228
2229 lstClear(&ct.llLists);
2230
2231 if (fUsingTemp)
2232 {
2233 if (xhtmlTemp.pszTitle)
2234 free(xhtmlTemp.pszTitle);
2235 lstClear(&xhtmlTemp.llLinks);
2236 // ### better really clear this... there are PSZ's inside
2237 }
2238
2239 return (brc);
2240}
2241
2242
Note: See TracBrowser for help on using the repository browser.