source: branches/branch-1-0/src/helpers/textv_html.c

Last change on this file was 222, checked in by umoeller, 23 years ago

Minor adjustments for new static handling.

  • Property svn:eol-style set to CRLF
  • Property svn:keywords set to Author Date Id Revision
File size: 61.8 KB
Line 
1
2/*
3 *@@sourcefile textv_html.c:
4 * this code converts HTML code to escape sequences for the
5 * XTextView control (textview.c).
6 *
7 * This code is in part ugly spaghetti, but this is intentional to
8 * make this HTML parser FAST. In general, you get about double or
9 * triple the speed compared to Netscape 4.6 on OS/2. This code
10 * doesn't understand all of HTML though, but you get most of HTML 2.
11 * There's no tables or frames at this point.
12 *
13 * The entry point into this mess is txvConvertFromHTML, which
14 * is easy to use.
15 *
16 * Note: Version numbering in this file relates to XWorkplace version
17 * numbering.
18 *
19 *@@header "helpers\textv_html.h"
20 *
21 *@@added V0.9.3 (2000-05-10) [umoeller]
22 */
23
24/*
25 * Copyright (C) 2000 Ulrich Möller.
26 * This program is part of the XWorkplace package.
27 * This program is free software; you can redistribute it and/or modify
28 * it under the terms of the GNU General Public License as published by
29 * the Free Software Foundation, in version 2 as it comes in the COPYING
30 * file of the XWorkplace main distribution.
31 * This program is distributed in the hope that it will be useful,
32 * but WITHOUT ANY WARRANTY; without even the implied warranty of
33 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
34 * GNU General Public License for more details.
35 */
36
37#define OS2EMX_PLAIN_CHAR
38 // this is needed for "os2emx.h"; if this is defined,
39 // emx will define PSZ as _signed_ char, otherwise
40 // as unsigned char
41
42#include <os2.h>
43
44#include <stdlib.h>
45#include <stdio.h>
46#include <string.h>
47
48#include "setup.h" // code generation and debugging options
49
50#include "helpers\linklist.h"
51#include "helpers\stringh.h"
52#include "helpers\textview.h"
53
54#include "helpers\textv_html.h"
55
56/*
57 *@@category: Helpers\PM helpers\Window classes\XTextView control\HTML conversion
58 * see textv_html.c.
59 */
60
61/* ******************************************************************
62 *
63 * Declarations
64 *
65 ********************************************************************/
66
67/*
68 *@@ LISTDESC:
69 * structure stored in COPYTARGET to
70 * hold list information (UL, OL, ... tags).
71 *
72 *@@added V0.9.3 (2000-05-07) [umoeller]
73 */
74
75typedef struct _LISTDESC
76{
77 ULONG ulListType; // 0: unordered (UL)
78 // 1: ordered (OL)
79 // 2: definition lists (DL)
80 ULONG ulItem; // list enumeration; 1 on first item,
81 // 2 on next, ...
82} LISTDESC, *PLISTDESC;
83
84/*
85 *@@ COPYTARGET:
86 * monster structure which holds the current
87 * status of the HTML converter while conversion
88 * is taking place. This stores input/output pointers
89 * and various flags to avoid duplicate line breaks
90 * and such.
91 *
92 * One instance of this is created in txvConvertFromHTML
93 * on the stack and then passed to all the sub-function
94 * calls.
95 *
96 *@@added V0.9.3 (2000-05-06) [umoeller]
97 */
98
99typedef struct _COPYTARGET
100{
101 PSZ pSource; // ptr into source string;
102 // valid ONLY while we're in a tag handler
103 PSZ pNewSource; // can be set by tag handler to skip characters;
104 // this is set to NULL before calling a tag
105 // handler; if this is still NULL, default
106 // processing occurs
107
108 // new string:
109 PSZ pszNew; // memory buffer
110 ULONG cbNew; // size of buffer (reallocated)
111 PSZ pTarget; // current char ptr into pszNew
112
113 // saved character while tag handler is being called
114 CHAR cSaved;
115
116 PSZ *ppszTitle; // out: title (ptr can be NULL)
117 // V0.9.20 (2002-08-10) [umoeller]
118
119 // formatting flags while going through the text
120 BOOL fSkipNextSpace;
121 // if TRUE, subsequent spaces are skipped
122 BOOL fNeedsLinebreak;
123 // if TRUE, \n is inserted before any other character
124 BOOL fSkipNextLinebreak;
125 // if TRUE, subsequent linebreaks are skipped
126 BOOL fPRE;
127 // are we currently in a PRE tag?
128 BOOL fInLink;
129 // are we currently in a A HREF= tag?
130
131 // arguments (attributes) for tag handlers
132 PSZ pszAttributes; // != NULL while a tag handler is being called
133 // and attributes exist for the tag
134
135 // anchors count
136 // USHORT usAnchorIndex; // start with 1 removed V0.9.20 (2002-08-10) [umoeller]
137
138 // list maintenance
139 ULONG ulListLevel; // if > 0, we're in a UL or OL block;
140 // raised for each block
141 ULONG ulUnorderedListLevel; // raised with each UL block to keep track
142 // of bullets
143 ULONG ulOrderedListLevel; // raised with each UL block to keep track
144 // of 1), 2), a), b)... numbering
145 ULONG ulCurrentListType; // current list type (from highest LISTDESC)
146 BOOL fInDT; // TRUE if we're currently in a DT tag
147 LINKLIST llLists; // stack of LISTDESC items
148} COPYTARGET, *PCOPYTARGET;
149
150typedef VOID FNPROCESSTAG(PCOPYTARGET pct);
151typedef FNPROCESSTAG *PFNPROCESSTAG;
152
153/* ******************************************************************
154 *
155 * Global variables
156 *
157 ********************************************************************/
158
159/* ******************************************************************
160 *
161 * Append-char helpers
162 *
163 ********************************************************************/
164
// number of bytes by which AppendChar and AppendString
// enlarge the output buffer whenever it runs full
#define COPYTARGETALLOC     (100000)
166
167/*
168 *@@ AppendChar:
169 * helper for txvConvertFromHTML to
170 * append a char to the target string
171 * in COPYTARGET.
172 * This performs a few additional checks
173 * and manages memory.
174 *
175 *@@added V0.9.3 (2000-05-06) [umoeller]
176 */
177
178STATIC VOID AppendChar(PCOPYTARGET pct, // in/out: formatting buffer
179 unsigned char c)
180{
181 // calculate ofs where to store next char
182 ULONG cbOfsNext = pct->pTarget - pct->pszNew;
183 if (cbOfsNext >= pct->cbNew) // have we reached the buffer size yet?
184 {
185 // more mem needed:
186 pct->cbNew += COPYTARGETALLOC;
187 pct->pszNew = (PSZ)realloc(pct->pszNew, pct->cbNew);
188 // if first call, pszNew is NULL, and realloc
189 // behaves just like malloc
190 // adjust target, because ptr might have changed
191 pct->pTarget = pct->pszNew + cbOfsNext;
192 }
193
194 // append character
195 *pct->pTarget++ = c;
196}
197
198/*
199 *@@ AppendString:
200 * appends the characters in *ach,
201 * which must be null-terminated.
202 * Does NOT append a null character though.
203 *
204 *@@added V0.9.3 (2000-05-06) [umoeller]
205 */
206
207STATIC VOID AppendString(PCOPYTARGET pct, // in/out: formatting buffer
208 char *ach)
209{
210 ULONG cbAppend = strlen(ach);
211 ULONG ul;
212 PSZ pSource;
213
214 // calculate ofs where to store next char
215 ULONG cbOfsNext = pct->pTarget - pct->pszNew;
216 while (cbOfsNext + cbAppend >= pct->cbNew)
217 {
218 // more mem needed:
219 pct->cbNew += COPYTARGETALLOC;
220 pct->pszNew = (PSZ)realloc(pct->pszNew, pct->cbNew);
221 // if first call, pszNew is NULL, and realloc
222 // behaves just like malloc
223 // adjust target, because ptr might have changed
224 pct->pTarget = pct->pszNew + cbOfsNext;
225 }
226
227 // append characters
228 pSource = ach;
229 for (ul = 0;
230 ul < cbAppend;
231 ul++)
232 *pct->pTarget++ = *pSource++;
233}
234
235/*
236 *@@ AppendLinebreakCheck:
237 * checks if a linebreak is needed and
238 * inserts one if so.
239 *
240 *@@added V0.9.3 (2000-05-17) [umoeller]
241 */
242
243STATIC VOID AppendLinebreakCheck(PCOPYTARGET pct)
244{
245 if ((!pct->fPRE) && (pct->fNeedsLinebreak))
246 {
247 // yes: insert linebreak; this resets pct->fNeedsLinebreak
248 if (!pct->fSkipNextLinebreak)
249 {
250 AppendChar(pct, '\n');
251
252 if ((pct->ulListLevel) && (!pct->fInDT))
253 // if we're in a list, add a tab also,
254 // because we'll have a negative first-line margin
255 AppendString(pct, TXVESC_TAB);
256 }
257 pct->fNeedsLinebreak = FALSE;
258 }
259}
260
261/*
262 *@@ AppendEscapeWithDecimal:
263 * appends the specified escape code
264 * with a three-digit decimal parameter.
265 * Calls AppendString in turn.
266 *
267 *@@added V0.9.3 (2000-05-07) [umoeller]
268 */
269
270STATIC VOID AppendEscapeWith3Decimals(PCOPYTARGET pct, // in/out: formatting buffer
271 char *ach,
272 USHORT us)
273{
274 CHAR szDecimal[10];
275 if (us > 999)
276 us = 999;
277 sprintf(szDecimal, "%03d", us);
278 // append escape
279 AppendString(pct, ach);
280 AppendString(pct, szDecimal);
281}
282
283/*
284 *@@ AppendEscapeWith4Decimals:
285 *
286 *@@added V0.9.3 (2000-05-07) [umoeller]
287 */
288
289STATIC VOID AppendEscapeWith4Decimals(PCOPYTARGET pct, // in/out: formatting buffer
290 char *ach,
291 USHORT us)
292{
293 CHAR szDecimal[10];
294 if (us > 9999)
295 us = 9999;
296 sprintf(szDecimal, "%04d", us);
297 // append escape
298 AppendString(pct, ach);
299 AppendString(pct, szDecimal);
300}
301
302/* ******************************************************************
303 *
304 * Tag converter functions
305 *
306 ********************************************************************/
307
308/*
309 *@@ StartList:
310 * starts a list (UL or OL).
311 * This uses a linked list in COPYTARGET
312 * to keep a pseudo-stack for nested lists.
313 *
314 *@@added V0.9.3 (2000-05-08) [umoeller]
315 */
316
317STATIC VOID StartList(PCOPYTARGET pct, // in/out: formatting buffer
318 ULONG ulListType) // list type:
319 // 0: unordered (UL)
320 // 1: ordered (OL)
321 // 2: definition lists (DL)
322{
323 PLISTDESC pListDesc;
324
325 // raise list level
326 pct->ulListLevel++;
327
328 if (ulListType == 0)
329 // unordered:
330 pct->ulUnorderedListLevel++;
331 else if (ulListType == 1)
332 // ordered:
333 pct->ulOrderedListLevel++;
334
335 // create LISTDESC and store it on stack
336 pListDesc = (PLISTDESC)malloc(sizeof(LISTDESC));
337 pListDesc->ulListType
338 = pct->ulCurrentListType
339 = ulListType;
340 pListDesc->ulItem = 1;
341
342 lstAppendItem(&pct->llLists,
343 pListDesc);
344
345 AppendEscapeWith4Decimals(pct,
346 TXVESC_LEFTMARGIN,
347 pct->ulListLevel * 5);
348 AppendEscapeWith3Decimals(pct,
349 TXVESC_FIRSTLINEMARGIN_LEFT,
350 (ulListType == 2)
351 ? 5 // for definition lists
352 : 3); // negative!
353 // add \n before any other character
354 pct->fNeedsLinebreak = TRUE;
355}
356
357/*
358 *@@ StopList:
359 * stops a list (UL or OL).
360 *
361 *@@added V0.9.3 (2000-05-07) [umoeller]
362 */
363
364STATIC VOID StopList(PCOPYTARGET pct)
365{
366 if (pct->ulListLevel)
367 {
368 PLISTNODE pNode;
369
370 // lower list level
371 pct->ulListLevel--;
372 AppendEscapeWith4Decimals(pct,
373 TXVESC_LEFTMARGIN,
374 pct->ulListLevel * 5);
375 AppendEscapeWith3Decimals(pct,
376 TXVESC_FIRSTLINEMARGIN_LEFT,
377 (pct->ulListLevel)
378 ? 3 // we still have a list level (nested)
379 : 0);
380 pct->fNeedsLinebreak = TRUE;
381
382 // remove the LISTDESC from the stack
383 pNode = lstNodeFromIndex(&pct->llLists,
384 pct->ulListLevel); // this has been lowered already
385 if (pNode)
386 {
387 PLISTDESC pListDesc = (PLISTDESC)pNode->pItemData;
388 if (pListDesc->ulListType == 0)
389 // was unordered:
390 pct->ulUnorderedListLevel--;
391 else if (pListDesc->ulListType == 1)
392 // was ordered:
393 pct->ulOrderedListLevel--;
394
395 lstRemoveNode(&pct->llLists, pNode);
396
397 // update COPYTARGET with previous list level
398 if (pct->ulListLevel)
399 {
400 // we're still in a list (nested lists):
401 PLISTDESC pListDesc2 = (PLISTDESC)lstItemFromIndex(&pct->llLists,
402 pct->ulListLevel - 1);
403 if (pListDesc2)
404 pct->ulCurrentListType = pListDesc2->ulListType;
405 }
406 }
407 }
408 // else: buggy HTML code, ignore
409}
410
411/*
412 *@@ TagTITLE:
413 *
414 *@@added V0.9.3 (2000-05-19) [umoeller]
415 */
416
417STATIC VOID TagTITLE(PCOPYTARGET pct)
418{
419 // pSource currently points to <TITLE tag
420 PSZ pSource = pct->pSource + strlen(pct->pSource);
421 // points to temporary null byte in main buffer now
422 *pSource = pct->cSaved;
423
424 if (pSource = strchr(pct->pSource, '>'))
425 {
426 PSZ pNextOpen;
427 if (pNextOpen = strchr(pSource, '<'))
428 {
429 // extract title
430 if (pct->ppszTitle)
431 *(pct->ppszTitle) = strhSubstr(pSource + 1, pNextOpen);
432 // adjusted V0.9.20 (2002-08-10) [umoeller]
433
434 if (strnicmp(pNextOpen + 1, "/TITLE", 6) == 0)
435 {
436 // closing /TITLE tag found:
437 // search on after that
438 if (pct->pNewSource = strchr(pNextOpen, '>'))
439 pct->pNewSource++;
440 }
441 }
442 }
443}
444
445/*
446 *@@ TagP:
447 *
448 */
449
450STATIC VOID TagP(PCOPYTARGET pct)
451{
452 // append newline:
453 // add \n before any other character
454 pct->fNeedsLinebreak = TRUE;
455
456 /* if (pct->ulListLevel)
457 {
458 // if we are currently in a list, we must also
459 // add a tab escape, because we have set
460 // the first line margin to the left of the
461 // left margin
462 AppendString(pct,
463 TXVESC_TAB);
464 } */
465}
466
467STATIC VOID TagBR(PCOPYTARGET pct)
468{
469 AppendChar(pct,
470 '\r');
471
472 if (pct->ulListLevel)
473 {
474 // if we are currently in a list, we must also
475 // add a tab escape, because we have set
476 // the first line margin to the left of the
477 // left margin
478 AppendString(pct,
479 TXVESC_TAB);
480 }
481 if (!pct->fPRE)
482 pct->fSkipNextSpace = TRUE;
483}
484
485STATIC VOID TagPRE(PCOPYTARGET pct)
486{
487 // start of PRE tag:
488 // add \n before any other character
489 // pct->fNeedsLinebreak = TRUE;
490 AppendChar(pct, '\n');
491 pct->fNeedsLinebreak = FALSE;
492 /* AppendString(pct,
493 TXVESC_PRE_BEGIN); */
494 AppendEscapeWith3Decimals(pct,
495 TXVESC_SET_FONT,
496 1); // monospaced font
497 AppendEscapeWith4Decimals(pct,
498 TXVESC_SPACEBEFORE,
499 0); // no spacing before
500 AppendEscapeWith4Decimals(pct,
501 TXVESC_SPACEAFTER,
502 0); // no spacing after
503 // disable word-wrapping
504 AppendString(pct,
505 TXVESC_WORDWRAP "0");
506 pct->fPRE = TRUE;
507 pct->fSkipNextSpace = FALSE;
508}
509
510STATIC VOID TagXPRE(PCOPYTARGET pct)
511{
512 pct->fPRE = FALSE;
513 AppendEscapeWith3Decimals(pct,
514 TXVESC_SET_FONT,
515 0); // standard font
516 AppendString(pct, TXVESC_SPACEBEFORE);
517 AppendString(pct, "####"); // reset to default
518 AppendString(pct, TXVESC_SPACEAFTER);
519 AppendString(pct, "####"); // reset to default
520 // re-enable word-wrapping
521 AppendString(pct,
522 TXVESC_WORDWRAP "1"
523 "\n"); // force line break
524 pct->fNeedsLinebreak = FALSE;
525 // refuse to add \n even if we have another "p" coming up
526 pct->fSkipNextLinebreak = TRUE;
527 pct->fSkipNextSpace = TRUE;
528}
529
530STATIC VOID TagH1(PCOPYTARGET pct)
531{
532 pct->fNeedsLinebreak = TRUE;
533 AppendEscapeWith3Decimals(pct,
534 TXVESC_POINTSIZE_REL,
535 200); // double size
536 AppendString(pct,
537 TXVESC_BOLD_BEGIN);
538}
539
540STATIC VOID TagXH1(PCOPYTARGET pct)
541{
542 AppendString(pct,
543 TXVESC_BOLD_END);
544 AppendEscapeWith3Decimals(pct,
545 TXVESC_POINTSIZE_REL,
546 100); // regular size
547 // add \n before any other character
548 pct->fNeedsLinebreak = TRUE;
549}
550
551STATIC VOID TagH2(PCOPYTARGET pct)
552{
553 pct->fNeedsLinebreak = TRUE;
554 AppendEscapeWith3Decimals(pct,
555 TXVESC_POINTSIZE_REL,
556 175); // size in percent of regular point size
557 AppendString(pct,
558 TXVESC_BOLD_BEGIN);
559}
560
561STATIC VOID TagXH2(PCOPYTARGET pct)
562{
563 AppendString(pct,
564 TXVESC_BOLD_END);
565 AppendEscapeWith3Decimals(pct,
566 TXVESC_POINTSIZE_REL,
567 100); // regular size
568 // add \n before any other character
569 pct->fNeedsLinebreak = TRUE;
570}
571
572STATIC VOID TagH3(PCOPYTARGET pct)
573{
574 pct->fNeedsLinebreak = TRUE;
575 AppendEscapeWith3Decimals(pct,
576 TXVESC_POINTSIZE_REL,
577 150); // size in percent of regular point size
578 AppendString(pct,
579 TXVESC_BOLD_BEGIN);
580}
581
582STATIC VOID TagXH3(PCOPYTARGET pct)
583{
584 AppendString(pct,
585 TXVESC_BOLD_END);
586 AppendEscapeWith3Decimals(pct,
587 TXVESC_POINTSIZE_REL,
588 100); // size in percent of regular point size
589 // add \n before any other character
590 pct->fNeedsLinebreak = TRUE;
591}
592
593STATIC VOID TagH4(PCOPYTARGET pct)
594{
595 pct->fNeedsLinebreak = TRUE;
596 AppendEscapeWith3Decimals(pct,
597 TXVESC_POINTSIZE_REL,
598 125); // size in percent of regular point size
599 AppendString(pct,
600 TXVESC_BOLD_BEGIN);
601}
602
603STATIC VOID TagXH4(PCOPYTARGET pct)
604{
605 AppendString(pct,
606 TXVESC_BOLD_END);
607 AppendEscapeWith3Decimals(pct,
608 TXVESC_POINTSIZE_REL,
609 100); // regular size
610 // add \n before any other character
611 pct->fNeedsLinebreak = TRUE;
612}
613
614STATIC VOID TagH5(PCOPYTARGET pct)
615{
616 pct->fNeedsLinebreak = TRUE;
617 AppendEscapeWith3Decimals(pct,
618 TXVESC_POINTSIZE_REL,
619 100); // size in percent of regular point size
620 AppendString(pct,
621 TXVESC_BOLD_BEGIN);
622}
623
624STATIC VOID TagXH5(PCOPYTARGET pct)
625{
626 AppendString(pct,
627 TXVESC_BOLD_END);
628 AppendEscapeWith3Decimals(pct,
629 TXVESC_POINTSIZE_REL,
630 100); // regular size
631 // add \n before any other character
632 pct->fNeedsLinebreak = TRUE;
633}
634
635STATIC VOID TagH6(PCOPYTARGET pct)
636{
637 pct->fNeedsLinebreak = TRUE;
638 AppendEscapeWith3Decimals(pct,
639 TXVESC_POINTSIZE_REL,
640 80 ); // size in percent of regular point size
641 AppendString(pct,
642 TXVESC_BOLD_BEGIN);
643}
644
645STATIC VOID TagXH6(PCOPYTARGET pct)
646{
647 AppendString(pct,
648 TXVESC_BOLD_END);
649 AppendEscapeWith3Decimals(pct,
650 TXVESC_POINTSIZE_REL,
651 100); // regular size
652 // add \n before any other character
653 pct->fNeedsLinebreak = TRUE;
654}
655
656STATIC VOID TagUL(PCOPYTARGET pct)
657{
658 StartList(pct,
659 0); // unordered
660}
661
662STATIC VOID TagXUL(PCOPYTARGET pct)
663{
664 StopList(pct);
665}
666
667STATIC VOID TagOL(PCOPYTARGET pct)
668{
669 StartList(pct,
670 1); // ordered
671}
672
673STATIC VOID TagXOL(PCOPYTARGET pct)
674{
675 StopList(pct);
676}
677
678STATIC VOID TagLI(PCOPYTARGET pct)
679{
680 PLISTDESC pListDesc;
681 CHAR szMarker[20] = TXVESC_MARKER "\x01";
682
683 if (pct->ulListLevel)
684 {
685 // we're in a list:
686 pListDesc = (PLISTDESC)lstItemFromIndex(&pct->llLists,
687 pct->ulListLevel - 1);
688 if (pListDesc)
689 {
690 if (pListDesc->ulListType == 1)
691 // is ordered list:
692 sprintf(szMarker, "%lu.", (pListDesc->ulItem)++);
693 else if (pListDesc->ulListType == 0)
694 // is unordered list:
695 // set bullet type according to unordered nesting
696 szMarker[2] = pct->ulUnorderedListLevel;
697 }
698 }
699
700 // add \n before any other character
701 // pct->fNeedsLinebreak = TRUE;
702 // if (pct->fNeedsLinebreak)
703 {
704 AppendChar(pct, '\n');
705 pct->fNeedsLinebreak = FALSE;
706 }
707
708 AppendString(pct, szMarker);
709 AppendString(pct, TXVESC_TAB);
710}
711
712STATIC VOID TagDL(PCOPYTARGET pct)
713{
714 StartList(pct,
715 2); // definition list
716}
717
718STATIC VOID TagXDL(PCOPYTARGET pct)
719{
720 StopList(pct);
721 pct->fInDT = FALSE;
722}
723
724STATIC VOID TagDT(PCOPYTARGET pct)
725{
726 pct->fNeedsLinebreak = TRUE;
727 pct->fInDT = TRUE;
728}
729
730STATIC VOID TagDD(PCOPYTARGET pct)
731{
732 pct->fNeedsLinebreak = TRUE;
733 AppendString(pct, TXVESC_TAB);
734 if (!pct->fPRE)
735 pct->fSkipNextSpace = TRUE;
736 pct->fInDT = FALSE;
737}
738
739STATIC VOID TagTR(PCOPYTARGET pct)
740{
741 pct->fNeedsLinebreak = TRUE;
742}
743
744STATIC VOID TagB(PCOPYTARGET pct)
745{
746 AppendString(pct,
747 TXVESC_BOLD_BEGIN);
748}
749
750STATIC VOID TagXB(PCOPYTARGET pct)
751{
752 AppendString(pct,
753 TXVESC_BOLD_END);
754}
755
756STATIC VOID TagI(PCOPYTARGET pct)
757{
758 AppendString(pct,
759 TXVESC_ITALICS_BEGIN);
760}
761
762STATIC VOID TagXI(PCOPYTARGET pct)
763{
764 AppendString(pct,
765 TXVESC_ITALICS_END);
766}
767
768STATIC VOID TagU(PCOPYTARGET pct)
769{
770 AppendString(pct,
771 TXVESC_UNDERLINE_BEGIN);
772}
773
774STATIC VOID TagXU(PCOPYTARGET pct)
775{
776 AppendString(pct,
777 TXVESC_UNDERLINE_END);
778}
779
780STATIC VOID TagSTRIKE(PCOPYTARGET pct)
781{
782 AppendString(pct,
783 TXVESC_STRIKE_BEGIN);
784}
785
786STATIC VOID TagXSTRIKE(PCOPYTARGET pct)
787{
788 AppendString(pct,
789 TXVESC_STRIKE_END);
790}
791
792STATIC VOID TagCODE(PCOPYTARGET pct)
793{
794 AppendEscapeWith3Decimals(pct,
795 TXVESC_SET_FONT,
796 1); // monospaced font
797}
798
799STATIC VOID TagXCODE(PCOPYTARGET pct)
800{
801 AppendEscapeWith3Decimals(pct,
802 TXVESC_SET_FONT,
803 0); // regular font
804}
805
806STATIC VOID TagA(PCOPYTARGET pct)
807{
808 CHAR szAnchor[10];
809 PSZ pHREF = NULL;
810
811 pct->fInLink = FALSE;
812
813 if (pct->pszAttributes)
814 {
815 // we have attributes:
816 PSZ pszClosingTag;
817 if (pszClosingTag = strchr(pct->pszAttributes, '>'))
818 {
819 ULONG ulOfs = 0;
820
821 /*
822 * HREF attribute:
823 *
824 */
825
826 PSZ pNAME = 0;
827
828 // replace '>' with null char to mark end of search
829 *pszClosingTag = 0;
830
831 if (pHREF = strhGetTextAttr(pct->pszAttributes, "HREF", &ulOfs))
832 // OK, we got a link target:
833 pct->fInLink = TRUE;
834 // do not free
835
836 /*
837 * NAME attribute:
838 *
839 */
840
841 if (pNAME = strhGetTextAttr(pct->pszAttributes, "NAME", &ulOfs))
842 {
843 AppendString(pct,
844 TXVESC_ANCHORNAME);
845 AppendString(pct,
846 pNAME);
847 // must be terminated with 0xFF
848 AppendChar(pct, 0xFF);
849 free(pNAME);
850 }
851
852 // restore '>'
853 *pszClosingTag = '>';
854 }
855 }
856
857 if (pHREF)
858 {
859 AppendString(pct,
860 TXVESC_LINK_BEGIN);
861 AppendString(pct,
862 pHREF);
863 // must be terminated with 0xFF
864 AppendChar(pct, 0xFF);
865
866 free(pHREF);
867 }
868}
869
870STATIC VOID TagXA(PCOPYTARGET pct)
871{
872 if (pct->fInLink)
873 {
874 AppendString(pct,
875 TXVESC_LINK_END);
876 pct->fInLink = FALSE;
877 }
878}
879
880/* ******************************************************************
881 *
882 * Tag helpers
883 *
884 ********************************************************************/
885
886/*
887 *@@ FindTagProcessor:
888 * returns the Tag* function which handles the
889 * given tag or NULL if there's none.
890 *
891 *@@added V0.9.4 (2000-06-10) [umoeller]
892 */
893
894STATIC PFNPROCESSTAG FindTagProcessor(PSZ pszTag)
895{
896 PFNPROCESSTAG pProcessor = NULL;
897
898 CHAR c0,
899 c1;
900
901 BOOL fEndOfTag = FALSE;
902
903 PSZ pCheck = pszTag,
904 p2;
905 if (*pCheck == '/')
906 {
907 // end of tag:
908 fEndOfTag = TRUE;
909 pCheck++;
910 }
911
912 c0 = *pCheck;
913 c1 = *(pCheck + 1);
914
915 p2 = pCheck + 2;
916
917 switch (c0)
918 {
919 case 'A':
920 case 'a':
921 switch (c1)
922 {
923 case 0: // A
924 if (!fEndOfTag)
925 return TagA;
926 else
927 return TagXA;
928 case 'D': // ADDRESS
929 case 'd': // ADDRESS
930 if (stricmp(p2, "DRESS") == 0)
931 {
932 if (!fEndOfTag)
933 return TagI;
934 else
935 return TagXI;
936 }
937 }
938 break;
939
940 case 'B':
941 case 'b':
942 switch (c1)
943 {
944 case 0:
945 if (!fEndOfTag)
946 return TagB;
947 else
948 return TagXB;
949
950 case 'R': // BR
951 case 'r': // BR
952 if (*p2 == 0)
953 if (!fEndOfTag)
954 return TagBR;
955 }
956 break;
957
958 case 'C':
959 case 'c':
960 switch (c1)
961 {
962 case 'I': // CITE
963 case 'i': // CITE
964 if (stricmp(p2, "TE") == 0)
965 {
966 if (!fEndOfTag)
967 return TagI;
968 else
969 return TagXI;
970 }
971 break;
972
973 case 'O':
974 case 'o':
975 if (stricmp(p2, "DE") == 0)
976 {
977 if (!fEndOfTag)
978 return TagCODE;
979 else
980 return TagXCODE;
981 }
982 break;
983 }
984 break;
985
986 case 'D':
987 case 'd':
988 switch (c1)
989 {
990 case 'D': // DD
991 case 'd': // DD
992 if ((*p2 == 0) && (!fEndOfTag))
993 return (TagDD);
994 break;
995
996 case 'I': // DIR
997 case 'i': // DIR
998 if (*p2 == 'R')
999 if (*(pCheck + 3) == 0)
1000 {
1001 if (!fEndOfTag)
1002 return TagUL;
1003 else
1004 return TagXUL;
1005 }
1006 break;
1007
1008 case 'L': // DL
1009 case 'l': // DL
1010 if (*p2 == 0)
1011 {
1012 if (!fEndOfTag)
1013 return TagDL;
1014 else
1015 return TagXDL;
1016 }
1017 break;
1018
1019 case 'T': // DT
1020 case 't': // DT
1021 if ((*p2 == 0) && (!fEndOfTag))
1022 return TagDT;
1023 break;
1024 }
1025 break;
1026
1027 case 'E':
1028 case 'e':
1029 if ( (c1 == 'M') || (c1 == 'm') ) // EM
1030 if (*p2 == 0)
1031 {
1032 if (!fEndOfTag)
1033 return TagI;
1034 else
1035 return TagXI;
1036 }
1037 break;
1038
1039 case 'H':
1040 case 'h':
1041 if (c1)
1042 if (*p2 == 0)
1043 switch (c1)
1044 {
1045 case '1':
1046 if (!fEndOfTag)
1047 return TagH1;
1048 else
1049 return TagXH1;
1050 case '2':
1051 if (!fEndOfTag)
1052 return TagH2;
1053 else
1054 return TagXH2;
1055 case '3':
1056 if (!fEndOfTag)
1057 return TagH3;
1058 else
1059 return TagXH3;
1060 case '4':
1061 if (!fEndOfTag)
1062 return TagH4;
1063 else
1064 return TagXH4;
1065 case '5':
1066 if (!fEndOfTag)
1067 return TagH5;
1068 else
1069 return TagXH5;
1070 case '6':
1071 if (!fEndOfTag)
1072 return TagH6;
1073 else
1074 return TagXH6;
1075 }
1076 break;
1077
1078 case 'I':
1079 case 'i':
1080 if (c1 == 0)
1081 {
1082 if (!fEndOfTag)
1083 return TagI;
1084 else
1085 return TagXI;
1086 }
1087 break;
1088
1089 case 'L':
1090 case 'l':
1091 if ((c1 == 'I') || (c1 == 'i'))
1092 if (*p2 == 0)
1093 return TagLI;
1094 break;
1095
1096 case 'M':
1097 case 'm':
1098 if (stricmp(p2, "NU") == 0)
1099 {
1100 if (!fEndOfTag)
1101 return TagUL;
1102 else
1103 return TagXUL;
1104 }
1105 break;
1106
1107 case 'O':
1108 case 'o':
1109 if ((c1 == 'L') || (c1 == 'l'))
1110 if (*p2 == 0)
1111 {
1112 if (!fEndOfTag)
1113 return TagOL;
1114 else
1115 return TagXOL;
1116 }
1117 break;
1118
1119 case 'P':
1120 case 'p':
1121 switch (c1)
1122 {
1123 case 0:
1124 if (!fEndOfTag)
1125 return TagP;
1126 break;
1127
1128 case 'R': // PRE
1129 case 'r': // PRE
1130 if ((*p2 == 'E') || (*p2 == 'e'))
1131 if (*(pCheck + 3) == 0)
1132 {
1133 if (!fEndOfTag)
1134 return TagPRE;
1135 else
1136 return TagXPRE;
1137 }
1138 break;
1139 }
1140 break;
1141
1142 case 'S':
1143 case 's':
1144 switch (c1)
1145 {
1146 case 'T': // STRONG
1147 case 't': // STRONG
1148 if (stricmp(p2, "RONG") == 0)
1149 {
1150 if (!fEndOfTag)
1151 return TagB;
1152 else
1153 return TagXB;
1154 }
1155 else if (stricmp(p2, "RIKE") == 0)
1156 {
1157 if (!fEndOfTag)
1158 return TagSTRIKE;
1159 else
1160 return TagXSTRIKE;
1161 }
1162 break;
1163
1164 case 'A':
1165 case 'a':
1166 if (stricmp(p2, "MP") == 0)
1167 {
1168 if (!fEndOfTag)
1169 return TagCODE;
1170 else
1171 return TagXCODE;
1172 }
1173 break;
1174 }
1175 break;
1176
1177 case 'T':
1178 case 't':
1179 switch (c1)
1180 {
1181 case 'R':
1182 case 'r':
1183 if (*p2 == 0)
1184 return TagTR;
1185 break;
1186
1187 case 'I':
1188 case 'i':
1189 if (stricmp(p2, "TLE") == 0)
1190 return TagTITLE;
1191 break;
1192
1193 case 'T': // TT
1194 case 't':
1195 if (*p2 == 0)
1196 {
1197 if (!fEndOfTag)
1198 return TagCODE;
1199 else
1200 return TagXCODE;
1201 }
1202 break;
1203 }
1204 break;
1205
1206 case 'U':
1207 case 'u':
1208 switch (c1)
1209 {
1210 case 0:
1211 if (!fEndOfTag)
1212 return TagU;
1213 else
1214 return TagXU;
1215
1216 case 'L':
1217 case 'l':
1218 if (*p2 == 0)
1219 {
1220 if (!fEndOfTag)
1221 return TagUL;
1222 else
1223 return TagXUL;
1224 }
1225 break;
1226 }
1227 break;
1228
1229 case 'V':
1230 case 'v':
1231 if (stricmp(p2, "R") == 0)
1232 {
1233 if (!fEndOfTag)
1234 return TagI;
1235 else
1236 return TagXI;
1237 }
1238 break;
1239
1240 case 'X':
1241 case 'x':
1242 if (stricmp(p2, "MP") == 0) // XMP
1243 {
1244 if (!fEndOfTag)
1245 return TagPRE;
1246 else
1247 return TagXPRE;
1248 }
1249 break;
1250 }
1251
1252 return (pProcessor);
1253}
1254
1255/*
1256 *@@ HandleTag:
1257 * called by txvConvertFromHTML when a "<" character
1258 * is found in the source buffer. This calls
1259 * FindTagProcessor in turn to find the Tag*
1260 * function which handles the tag.
1261 *
1262 *@@added V0.9.3 (2000-05-18) [umoeller]
1263 */
1264
1265STATIC VOID HandleTag(PCOPYTARGET pct)
1266{
1267 PSZ pStartOfTag = pct->pSource;
1268 // '<' == begin of tag:
1269
1270 // is it a comment? <!-- ... -->
1271 if (strncmp(pStartOfTag + 1, "!--", 3) == 0)
1272 {
1273 // start of comment:
1274 // find end of comment
1275 PSZ pEnd = strstr(pStartOfTag, "-->");
1276 if (pEnd)
1277 // found:
1278 // search on after end of comment
1279 pct->pSource = pEnd + 3;
1280 else
1281 {
1282 // end of comment not found:
1283 // stop formatting...
1284 pct->pSource++;
1285 return;
1286 }
1287 }
1288 else
1289 {
1290 // no comment:
1291 // find end of tag
1292 PSZ p2 = pStartOfTag + 1,
1293 pNextClose = 0, // receives first '>' after '<'
1294 pNextSpace = 0; // receives first ' ' after '<'
1295 BOOL fCont = TRUE;
1296 while (fCont)
1297 {
1298 switch (*p2)
1299 {
1300 case ' ':
1301 case '\r':
1302 case '\n':
1303 // store first space after '<'
1304 if (!pNextSpace)
1305 pNextSpace = p2;
1306 // overwrite line breaks with spaces;
1307 // otherwise we cannot handle tags which go across
1308 // several lines, which is valid HTML
1309 *p2 = ' ';
1310 break;
1311
1312 case '>': // end of tag found:
1313 pNextClose = p2;
1314 fCont = FALSE;
1315 break;
1316
1317 case '<':
1318 // another opening tag:
1319 // that's an HTML error
1320 AppendChar(pct,
1321 *pct->pSource++);
1322 fCont = FALSE;
1323 break;
1324
1325 case 0:
1326 fCont = FALSE;
1327 break;
1328 }
1329 p2++;
1330 }
1331
1332 if (pNextClose)
1333 {
1334 // end of tag found:
1335 ULONG cbTag;
1336 // PSZ pStartOfAttrs = 0;
1337
1338 if ((pNextSpace) && (pNextSpace < pNextClose))
1339 {
1340 // we have attributes:
1341 cbTag = pNextSpace - (pStartOfTag + 1);
1342 // pStartOfAttrs = pNextSpace;
1343 }
1344 else
1345 cbTag = pNextClose - (pStartOfTag + 1);
1346
1347 if (!cbTag)
1348 {
1349 // happens if we have a "<>" in the text:
1350 // just insert the '<>' and go on, we have no tag here
1351 AppendChar(pct,
1352 *pct->pSource++);
1353 AppendChar(pct,
1354 *pct->pSource++);
1355 }
1356 else
1357 {
1358 PFNPROCESSTAG pTagProcessor;
1359
1360 pct->cSaved = *(pStartOfTag + cbTag + 1);
1361 // add a null terminator
1362 *(pStartOfTag + cbTag + 1) = 0;
1363
1364 // find corresponding tag converter function
1365 // from G_TagProcessors map
1366 pTagProcessor = FindTagProcessor(pStartOfTag + 1); // pszTag);
1367
1368 // restore char under null terminator
1369 *(pStartOfTag + cbTag + 1) = pct->cSaved;
1370
1371 // reset new source ptr; the tag handler
1372 // can modify this
1373 pct->pNewSource = NULL;
1374
1375 if (pTagProcessor)
1376 {
1377 // tag understood:
1378
1379 // terminate string after closing tag
1380 pct->cSaved = *(pNextClose + 1); // can be null byte!
1381 *(pNextClose + 1) = 0;
1382
1383 // did we have attributes?
1384 if (pNextSpace)
1385 pct->pszAttributes = pNextSpace;
1386
1387 // finally, call the tag handler
1388 (pTagProcessor) // function
1389 (pct); // argument
1390
1391 *(pNextClose + 1) = pct->cSaved;
1392 }
1393
1394 if (pct->pNewSource == NULL)
1395 // tag handler needs no special processing:
1396 // skip '>' too
1397 pct->pSource = pNextClose + 1;
1398 else
1399 // tag handler has skipped something:
1400 pct->pSource = pct->pNewSource;
1401 }
1402 }
1403 }
1404}
1405
1406/*
1407 *@@ ConvertEscape:
1408 * called by HandleEscape to find the ANSI (CP 1004)
1409 * character for the given escape sequence (pszTag).
1410 *
1411 * pszTag must be null-terminated and contain only
1412 * the stuff between "&" and ";".
1413 *
1414 * This is really ugly spaghetti, but it's the fastest
1415 * way to do it.
1416 *
1417 *@@added V0.9.4 (2000-06-10) [umoeller]
1418 */
1419
1420STATIC unsigned char ConvertEscape(PSZ pszTag)
1421{
1422 CHAR c0, c1;
1423 CHAR crc = 0;
1424
1425 PSZ p2 = pszTag + 2;
1426
1427 c0 = *pszTag;
1428 c1 = *(pszTag + 1);
1429
1430 switch (c0)
1431 {
1432 case 'a':
1433 switch (c1)
1434 {
1435 case 'a':
1436 if (strcmp(p2, "cute") == 0)
1437 return 225;
1438 break;
1439
1440 case 'c':
1441 if (strcmp(p2, "irc") == 0)
1442 return 226;
1443 else if (strcmp(p2, "ute") == 0)
1444 return 180;
1445 break;
1446
1447 case 'e':
1448 if (strcmp(p2, "lig") == 0)
1449 return 230;
1450 break;
1451
1452 case 'g':
1453 if (strcmp(p2, "rave") == 0)
1454 return 224;
1455 break;
1456
1457 case 'm':
1458 if (strcmp(p2, "p") == 0)
1459 return '&';
1460 break;
1461
1462 case 'r':
1463 if (strcmp(p2, "ing") == 0)
1464 return 229;
1465 break;
1466
1467 case 't':
1468 if (strcmp(p2, "ilde") == 0)
1469 return 227;
1470 break;
1471
1472 case 'u':
1473 if (strcmp(p2, "ml") == 0)
1474 return 228;
1475 break;
1476 }
1477 break;
1478
1479 case 'b':
1480 if (strcmp(pszTag + 1, "rvbar") == 0)
1481 return 166;
1482 break;
1483
1484 case 'c':
1485 switch (c1)
1486 {
1487 case 'c':
1488 if (strcmp(p2, "edil") == 0)
1489 return 231;
1490 break;
1491
1492 case 'e':
1493 if (strcmp(p2, "dil") == 0)
1494 return 184;
1495 else if (strcmp(p2, "nt") == 0)
1496 return 162;
1497 break;
1498
1499 case 'o':
1500 if (strcmp(p2, "py") == 0)
1501 return 169;
1502 break;
1503
1504 case 'u':
1505 if (strcmp(p2, "rren") == 0)
1506 return 164;
1507 }
1508 break;
1509
1510 case 'd':
1511 switch (c1)
1512 {
1513 case 'e':
1514 if (strcmp(p2, "g") == 0) return 176;
1515 break;
1516
1517 case 'i':
1518 if (strcmp(p2, "vide") == 0) return 247;
1519 break;
1520 }
1521 break;
1522
1523 case 'e':
1524 switch (c1)
1525 {
1526 case 'a':
1527 if (strcmp(p2, "cute") == 0) return 233;
1528 break;
1529
1530 case 'c':
1531 if (strcmp(p2, "irc") == 0) return 234;
1532 break;
1533
1534 case 'g':
1535 if (strcmp(p2, "rave") == 0) return 232;
1536 break;
1537
1538 case 't':
1539 if (strcmp(p2, "h") == 0) return 240;
1540 break;
1541
1542 case 'u':
1543 if (strcmp(p2, "ml") == 0) return 235;
1544 break;
1545 }
1546 break;
1547
1548 case 'f':
1549 switch (c1)
1550 {
1551 case 'r':
1552 if (strcmp(p2, "ac14") == 0) return 188;
1553 if (strcmp(p2, "ac12") == 0) return 189;
1554 if (strcmp(p2, "ac34") == 0) return 190;
1555 break;
1556 }
1557 break;
1558
1559 case 'g':
1560 switch (c1)
1561 {
1562 case 't':
1563 if (*p2 == 0) return '>';
1564 }
1565 break;
1566
1567 case 'i':
1568 switch (c1)
1569 {
1570 case 'a':
1571 if (strcmp(p2, "cute") == 0) return 237;
1572 break;
1573
1574 case 'c':
1575 if (strcmp(p2, "irc") == 0) return 238;
1576 break;
1577
1578 case 'g':
1579 if (strcmp(p2, "rave") == 0) return 236;
1580 break;
1581
1582 case 'e':
1583 if (strcmp(p2, "xcl") == 0) return 161;
1584 break;
1585
1586 case 'q':
1587 if (strcmp(p2, "uest") == 0) return 191;
1588 break;
1589
1590 case 'u':
1591 if (strcmp(p2, "ml") == 0) return 239;
1592 }
1593 break;
1594
1595 case 'l':
1596 switch (c1)
1597 {
1598 case 't':
1599 if (*p2 == 0)
1600 return '<';
1601 break;
1602
1603 case 'a':
1604 if (strcmp(p2, "quo") == 0) return 171;
1605 }
1606 break;
1607
1608 case 'm':
1609 switch (c1)
1610 {
1611 case 'a':
1612 if (strcmp(p2, "cr") == 0) return 175;
1613 break;
1614
1615 case 'i':
1616 if (strcmp(p2, "cro") == 0) return 181;
1617 if (strcmp(p2, "ddot") == 0) return 183;
1618 break;
1619 }
1620 break;
1621
1622 case 'n':
1623 switch (c1)
1624 {
1625 case 'b':
1626 if (strcmp(p2, "sp") == 0) return 160;
1627 break;
1628
1629 case 'o':
1630 if (strcmp(p2, "t") == 0) return 172;
1631 break;
1632
1633 case 't':
1634 if (strcmp(p2, "ilde") == 0) return 241;
1635 }
1636 break;
1637
1638 case 'o':
1639 switch (c1)
1640 {
1641 case 'a':
1642 if (strcmp(p2, "cute") == 0) return 243;
1643 break;
1644
1645 case 'c':
1646 if (strcmp(p2, "irc") == 0) return 244;
1647 break;
1648
1649 case 'g':
1650 if (strcmp(p2, "rave") == 0) return 242;
1651 break;
1652
1653 case 'r':
1654 if (strcmp(p2, "df") == 0) return 170;
1655 if (strcmp(p2, "dm") == 0) return 186;
1656 break;
1657
1658 case 's':
1659 if (strcmp(p2, "lash") == 0) return 248;
1660 break;
1661
1662 case 't':
1663 if (strcmp(p2, "ilde") == 0) return 245;
1664 break;
1665
1666 case 'u':
1667 if (strcmp(p2, "ml") == 0) return 246;
1668 }
1669 break;
1670
1671 case 'p':
1672 switch (c1)
1673 {
1674 case 'a':
1675 if (strcmp(p2, "ra") == 0) return 182;
1676 break;
1677
1678 case 'l':
1679 if (strcmp(p2, "usmn") == 0) return 177;
1680 break;
1681
1682 case 'o':
1683 if (strcmp(p2, "und") == 0) return 163;
1684 }
1685 break;
1686
1687 case 'q':
1688 if (strcmp(pszTag, "quot") == 0) return '"';
1689 break;
1690
1691 case 'r':
1692 if (strcmp(pszTag, "raquo") == 0) return 187;
1693 if (strcmp(pszTag, "reg") == 0) return 174;
1694 break;
1695
1696 case 's':
1697 switch (c1)
1698 {
1699 case 'z':
1700 if (strcmp(p2, "lig") == 0) return 223;
1701 break;
1702
1703 case 'e':
1704 if (strcmp(p2, "ct") == 0) return 167;
1705 break;
1706
1707 case 'h':
1708 if (strcmp(p2, "y") == 0) return 173;
1709 break;
1710
1711 case 'u':
1712 if (strcmp(p2, "p1") == 0) return 185;
1713 if (strcmp(p2, "p2") == 0) return 178;
1714 if (strcmp(p2, "p3") == 0) return 179;
1715 }
1716 break;
1717
1718 case 't':
1719 if (strcmp(pszTag, "thorn") == 0) return 254;
1720 if (strcmp(pszTag, "times") == 0) return 215;
1721 break;
1722
1723 case 'u':
1724 switch (c1)
1725 {
1726 case 'a':
1727 if (strcmp(p2, "cute") == 0) return 250;
1728 break;
1729
1730 case 'c':
1731 if (strcmp(p2, "irc") == 0) return 251;
1732 break;
1733
1734 case 'g':
1735 if (strcmp(p2, "rave") == 0) return 249;
1736 break;
1737
1738 case 'm':
1739 if (strcmp(p2, "l") == 0) return 168;
1740 break;
1741
1742 case 'u':
1743 if (strcmp(p2, "ml") == 0) return 252;
1744 }
1745 break;
1746
1747 case 'y':
1748 if (strcmp(pszTag, "yacute") == 0) return 253;
1749 if (strcmp(pszTag, "yen") == 0) return 165;
1750 if (strcmp(pszTag, "yuml") == 0) return 255;
1751 break;
1752
1753 case 'A':
1754 switch (c1)
1755 {
1756 case 'u':
1757 if (strcmp(p2, "ml") == 0) return 196;
1758 break;
1759
1760 case 'a':
1761 if (strcmp(p2, "cute") == 0) return 193;
1762 break;
1763
1764 case 'c':
1765 if (strcmp(p2, "irc") == 0) return 194;
1766 break;
1767
1768 case 'E':
1769 if (strcmp(p2, "lig") == 0) return 198;
1770 break;
1771
1772 case 'g':
1773 if (strcmp(p2, "rave") == 0) return 192;
1774 break;
1775
1776 case 'r':
1777 if (strcmp(p2, "ing") == 0) return 197;
1778 break;
1779
1780 case 't':
1781 if (strcmp(p2, "ilde") == 0) return 195;
1782 }
1783 break;
1784
1785 case 'C':
1786 if (strcmp(pszTag, "Ccedil") == 0) return 199;
1787 break;
1788
1789 case 'E':
1790 if (strcmp(pszTag, "Ecirc") == 0) return 202;
1791 if (strcmp(pszTag, "Eacute") == 0) return 201;
1792 if (strcmp(pszTag, "Egrave") == 0) return 200;
1793 if (strcmp(pszTag, "ETH") == 0) return 208;
1794 if (strcmp(pszTag, "Euml") == 0) return 203;
1795 break;
1796
1797 case 'I':
1798 if (strcmp(pszTag, "Icirc") == 0) return 206;
1799 if (strcmp(pszTag, "Iacute") == 0) return 205;
1800 if (strcmp(pszTag, "Igrave") == 0) return 204;
1801 if (strcmp(pszTag, "Iuml") == 0) return 207;
1802 break;
1803
1804 case 'N':
1805 if (strcmp(pszTag, "Ntilde") == 0) return 209;
1806 break;
1807
1808 case 'O':
1809 switch (c1)
1810 {
1811 case 'u':
1812 if (strcmp(p2, "ml") == 0) return 214;
1813 break;
1814
1815 case 'a':
1816 if (strcmp(p2, "cute") == 0) return 211;
1817 break;
1818
1819 case 'c':
1820 if (strcmp(p2, "irc") == 0) return 212;
1821 break;
1822
1823 case 'g':
1824 if (strcmp(p2, "rave") == 0) return 210;
1825 break;
1826
1827 case 't':
1828 if (strcmp(p2, "ilde") == 0) return 213;
1829 break;
1830
1831 case 's':
1832 if (strcmp(p2, "lash") == 0) return 216;
1833 }
1834 break;
1835
1836 case 'U':
1837 switch (c1)
1838 {
1839 case 'a':
1840 if (strcmp(p2, "cute") == 0) return 218;
1841 break;
1842
1843 case 'c':
1844 if (strcmp(p2, "irc") == 0) return 219;
1845 break;
1846
1847 case 'g':
1848 if (strcmp(p2, "rave") == 0) return 217;
1849 break;
1850
1851 case 'u':
1852 if (strcmp(p2, "ml") == 0) return 220;
1853 }
1854 break;
1855
1856 case 'T':
1857 if (strcmp(pszTag, "THORN") == 0) return 222;
1858 break;
1859
1860 case 'Y':
1861 if (strcmp(pszTag, "Yacute") == 0) return 221;
1862 break;
1863 }
1864
1865 return (crc);
1866}
1867
1868/*
1869 *@@ HandleEscape:
1870 * called by txvConvertFromHTML when a "&" character
1871 * is found in the source buffer. This calls
1872 * ConvertEscape in turn.
1873 *
1874 *@@added V0.9.3 (2000-05-18) [umoeller]
1875 */
1876
1877STATIC VOID HandleEscape(PCOPYTARGET pct)
1878{
1879 // ampersand:
1880 // replace special characters
1881 PSZ pStartOfTag = pct->pSource;
1882 // find end of tag
1883 PSZ p2 = pStartOfTag,
1884 pNextClose = 0,
1885 pNextSpace = 0;
1886 BOOL fCont = TRUE;
1887 while (fCont)
1888 {
1889 switch (*p2)
1890 {
1891 case 0:
1892 fCont = FALSE;
1893 break;
1894
1895 case ';':
1896 pNextClose = p2;
1897 fCont = FALSE;
1898 break;
1899
1900 case ' ':
1901 if (!pNextSpace)
1902 pNextSpace = p2;
1903 break;
1904 }
1905 p2++;
1906 }
1907
1908 if (!pNextClose)
1909 // no closing tag found:
1910 // just insert the '&' and go on, we have no tag here
1911 AppendChar(pct,
1912 *pct->pSource++);
1913 else
1914 {
1915 if ((pNextSpace) && (pNextSpace < pNextClose))
1916 // space before ';':
1917 // just insert the '&' and go on, we have no tag here
1918 AppendChar(pct,
1919 *pct->pSource++);
1920 else if ((!pNextClose) || (pNextClose <= pStartOfTag + 1))
1921 AppendChar(pct,
1922 *pct->pSource++);
1923 else
1924 {
1925 ULONG ulCode = 0;
1926
1927 // create substring with tag
1928 PSZ pszTag = pStartOfTag + 1;
1929 *pNextClose = 0;
1930
1931 if (*pszTag == '#')
1932 {
1933 // latin-1 or Unicode encoding (&#000;)
1934 ulCode = atoi(pszTag + 1);
1935
1936 // next input: char after ';'
1937 pct->pSource = pNextClose + 1;
1938 }
1939 else
1940 {
1941 // named entity:
1942 // find char code corresponding to escape
1943 // from G_EscapeProcessors map
1944 ulCode = ConvertEscape(pszTag);
1945 if (ulCode)
1946 // tag supported:
1947 pct->pSource = pNextClose + 1;
1948 else
1949 // tag not supported:
1950 ulCode = *pct->pSource++;
1951 }
1952
1953 // restore closing tag which we overwrote
1954 *pNextClose = ';';
1955
1956 if (ulCode)
1957 {
1958 AppendLinebreakCheck(pct);
1959
1960 AppendChar(pct,
1961 (CHAR)ulCode);
1962 pct->fSkipNextSpace = FALSE;
1963 }
1964 }
1965 }
1966}
1967
1968/* ******************************************************************
1969 *
1970 * Entry points
1971 *
1972 ********************************************************************/
1973
1974/*
1975 *@@ txvConvertFromHTML:
1976 * this modifies the given text string (which should
1977 * be the complete BODY block of any HTML file) so
1978 * that all HTML tags are removed and replaced with
1979 * escape sequences that the XTextView control understands.
1980 *
1981 * The buffer gets reallocated by this function, so it
1982 * must be free()'able.
1983 *
1984 * So, to have the XTextView control display an HTML file,
1985 * do this:
1986 *
1987 * 1) Load an HTML file into a buffer allocated by malloc().
1988 *
1989 * 2) Call txvConvertFromHTML.
1990 *
1991 * 3) Call WinSetWindowText on the XTextView control with
1992 * the modified buffer.
1993 *
1994 * This understands the following limited subset of HTML:
1995 *
1996 * Paragraph tags:
1997 *
1998 * -- P, BR
1999 * -- PRE, /PRE
2000 * -- UL, /UL, OL, /OL, LI
2001 * -- DL, /DL, DT, DD
2002 * -- H1, /H1 thru H6, /H6
2003 * -- Comments (<!-- .... -->)
2004 *
2005 * Character tags:
2006 *
2007 * -- B, /B, STRONG, /STRONG
2008 * -- I, /I, EM, /EM, VAR, /VAR, CITE, /CITE
2009 * -- CODE, /CODE, SAMP, /SAMP, KBD, /KBD, TT, /TT
2010 * -- U, /U
2011 * -- STRIKE, /STRIKE
2012 * -- CODE, /CODE
2013 *
2014 * The most obvious limitation is that neither tables
2015 * nor frames are supported. Also forget about CSS
2016 * and JavaScript, of course.
2017 *
2018 * All the ampersand (&amp; something) sequences defined
2019 * in HTML 3 are properly translated.
2020 *
2021 * Note: Those are translated to the ANSI (MS-Windows,
2022 * OS/2 codepage 1004) character set. This has the
2023 * following characteristics:
2024 *
2025 * -- Codes 0-127 are identical to ASCII and thus
2026 * ISO 8559-1 ("Latin 1") also.
2027 *
2028 * -- Codes 160-255 are identical to ISO 8559-1 ("Latin 1").
2029 *
2030 * -- Codes 128-159 are NOT defined in ISO 8559-1, but
2031 * Netscape treats those as ANSI as well, so we do too.
2032 *
2033 * As a result, consider the output to be in OS/2 codepage
2034 * 1004. Either set your codepage to that (WinSetCp)
2035 * or translate the output (WinCpTranslateString).
2036 *
2037 * &#xxx; tags (with xxx being a decimal) are considered
2038 * ANSI codes as well. Even though HTML 4.0 allows Unicode
2039 * characters > 255 to be inserted this way, we ignore
2040 * those. Unicode chars from 0 to 255 are identical to
2041 * ANSI, so for &#000; to &#255;, we are HTML-compliant.
2042 *
2043 * All other tags are completely thrown out.
2044 *
2045 *@@added V0.9.3 (2000-05-06) [umoeller]
2046 *@@changed V0.9.20 (2002-08-10) [umoeller]: changed prototype
2047 */
2048
2049BOOL txvConvertFromHTML(PSZ *ppszText, // in/out: text (gets reallocated)
2050 PSZ *ppszTitle, // out: if != NULL, receives malloc'd buffer with HTML title
2051 PULONG pulProgress, // out: progress (ptr can be NULL)
2052 PBOOL pfCancel) // in: cancel flag (ptr can be NULL)
2053{
2054 BOOL brc = TRUE;
2055
2056 ULONG cbSource = strlen(*ppszText);
2057
2058 COPYTARGET ct = {0};
2059
2060 lstInit(&ct.llLists,
2061 TRUE); // free items
2062
2063 ct.ppszTitle = ppszTitle; // V0.9.20 (2002-08-10) [umoeller]
2064 // can be NULL
2065
2066 ct.pSource = *ppszText;
2067 // skip leading spaces
2068 ct.fSkipNextSpace = TRUE;
2069
2070 // step 2:
2071 // actual tags formatting
2072
2073 while (TRUE)
2074 {
2075 CHAR c = *ct.pSource;
2076
2077 if (pfCancel)
2078 if (*pfCancel)
2079 {
2080 brc = FALSE;
2081 break;
2082 }
2083
2084 if (!c)
2085 // null terminator reached:
2086 break;
2087
2088 // calculate progress
2089 if (pulProgress)
2090 *pulProgress = ((ct.pSource - *ppszText) // characters done
2091 * 100
2092 / cbSource); // characters total
2093
2094 switch (c)
2095 {
2096 case '<':
2097 HandleTag(&ct);
2098 break;
2099
2100 case '&':
2101 HandleEscape(&ct);
2102 break;
2103
2104 case '\r':
2105 // skip
2106 if (!ct.fSkipNextSpace)
2107 {
2108 AppendChar(&ct,
2109 ' ');
2110 // ct.fNeedsLinebreak = FALSE;
2111 // but skip leading spaces which might follow
2112 if (!ct.fPRE)
2113 ct.fSkipNextSpace = TRUE;
2114 }
2115 ct.pSource++;
2116 break;
2117
2118 case '\t':
2119 {
2120 if (ct.fPRE)
2121 {
2122 ULONG ul;
2123 for (ul = 0;
2124 ul < 8;
2125 ul++)
2126 AppendChar(&ct,
2127 ' ');
2128 }
2129 else
2130 {
2131 // not in PRE block:
2132 if ( (!ct.fSkipNextSpace)
2133 // && (!ct.fNeedsLinebreak)
2134 )
2135 // last was not space: copy
2136 AppendChar(&ct,
2137 ' ');
2138
2139 ct.fSkipNextSpace = TRUE;
2140 }
2141
2142 // skip the tab
2143 ct.pSource++;
2144 break; }
2145
2146 case '\n':
2147 {
2148 // newline char:
2149 if (!ct.fPRE)
2150 {
2151 // if not in PRE mode, replace with space
2152 if (!ct.fSkipNextSpace)
2153 {
2154 AppendChar(&ct,
2155 ' ');
2156 // ct.fNeedsLinebreak = FALSE;
2157 // but skip leading spaces which might follow
2158 ct.fSkipNextSpace = TRUE;
2159 }
2160 }
2161 else
2162 // in PRE mode, preserve line breaks
2163 AppendChar(&ct, '\n'); // ct.fNeedsLinebreak = TRUE;
2164
2165 ct.pSource++;
2166 break; }
2167
2168 case '\xFF':
2169 {
2170 AppendChar(&ct,
2171 ' ');
2172 ct.pSource++;
2173 break; }
2174
2175 case ' ':
2176 if (!ct.fPRE)
2177 {
2178 // is space, and not in PRE block:
2179 if ( (!ct.fSkipNextSpace)
2180 // && (!ct.fNeedsLinebreak)
2181 )
2182 // last was not space: copy
2183 AppendChar(&ct,
2184 ' ');
2185
2186 ct.fSkipNextSpace = TRUE;
2187 }
2188 else
2189 // in PRE, always add all spaces
2190 AppendChar(&ct,
2191 ' ');
2192 ct.pSource++;
2193 break;
2194
2195 default:
2196 // if we're not inserting escapes or anything,
2197 // check if a linebreak is needed
2198 AppendLinebreakCheck(&ct);
2199
2200 AppendChar(&ct,
2201 *ct.pSource++);
2202 ct.fSkipNextSpace = FALSE;
2203 ct.fSkipNextLinebreak = FALSE;
2204
2205 } // end switch (*pSource);
2206 } // end while (*pSource)
2207 AppendChar(&ct,
2208 '\n');
2209 // append null-terminator
2210 AppendChar(&ct,
2211 0);
2212
2213 free(*ppszText);
2214 *ppszText = ct.pszNew;
2215
2216 lstClear(&ct.llLists);
2217
2218 return brc;
2219}
2220
2221
Note: See TracBrowser for help on using the repository browser.